diff mbox series

[v5,16/36] spapr: add hcalls support for the XIVE exploitation interrupt mode

Message ID 20181116105729.23240-17-clg@kaod.org
State New
Headers show
Series ppc: support for the XIVE interrupt controller (POWER9) | expand

Commit Message

Cédric Le Goater Nov. 16, 2018, 10:57 a.m. UTC
The different XIVE virtualization structures (sources and event queues)
are configured with a set of Hypervisor calls :

 - H_INT_GET_SOURCE_INFO

   used to obtain the address of the MMIO page of the Event State
   Buffer (ESB) entry associated with the source.

 - H_INT_SET_SOURCE_CONFIG

   assigns a source to a "target".

 - H_INT_GET_SOURCE_CONFIG

   determines which "target" and "priority" is assigned to a source

 - H_INT_GET_QUEUE_INFO

   returns the address of the notification management page associated
   with the specified "target" and "priority".

 - H_INT_SET_QUEUE_CONFIG

   sets or resets the event queue for a given "target" and "priority".
   It is also used to set the notification configuration associated
   with the queue, only unconditional notification is supported for
   the moment. Reset is performed with a queue size of 0 and queueing
   is disabled in that case.

 - H_INT_GET_QUEUE_CONFIG

   returns the queue settings for a given "target" and "priority".

 - H_INT_RESET

   resets all of the guest's internal interrupt structures to their
   initial state, losing all configuration set via the hcalls
   H_INT_SET_SOURCE_CONFIG and H_INT_SET_QUEUE_CONFIG.

 - H_INT_SYNC

   issue a synchronisation on a source to make sure all notifications
   have reached their queue.

Calls that still need to be addressed :

   H_INT_SET_OS_REPORTING_LINE
   H_INT_GET_OS_REPORTING_LINE

See the code for more documentation on each hcall.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 include/hw/ppc/spapr.h      |  15 +-
 include/hw/ppc/spapr_xive.h |   6 +
 hw/intc/spapr_xive_hcall.c  | 892 ++++++++++++++++++++++++++++++++++++
 hw/ppc/spapr_irq.c          |   2 +
 hw/intc/Makefile.objs       |   2 +-
 5 files changed, 915 insertions(+), 2 deletions(-)
 create mode 100644 hw/intc/spapr_xive_hcall.c

Comments

David Gibson Nov. 28, 2018, 4:25 a.m. UTC | #1
On Fri, Nov 16, 2018 at 11:57:09AM +0100, Cédric Le Goater wrote:
> The different XIVE virtualization structures (sources and event queues)
> are configured with a set of Hypervisor calls :
> 
>  - H_INT_GET_SOURCE_INFO
> 
>    used to obtain the address of the MMIO page of the Event State
>    Buffer (ESB) entry associated with the source.
> 
>  - H_INT_SET_SOURCE_CONFIG
> 
>    assigns a source to a "target".
> 
>  - H_INT_GET_SOURCE_CONFIG
> 
>    determines which "target" and "priority" is assigned to a source
> 
>  - H_INT_GET_QUEUE_INFO
> 
>    returns the address of the notification management page associated
>    with the specified "target" and "priority".
> 
>  - H_INT_SET_QUEUE_CONFIG
> 
>    sets or resets the event queue for a given "target" and "priority".
>    It is also used to set the notification configuration associated
>    with the queue, only unconditional notification is supported for
>    the moment. Reset is performed with a queue size of 0 and queueing
>    is disabled in that case.
> 
>  - H_INT_GET_QUEUE_CONFIG
> 
>    returns the queue settings for a given "target" and "priority".
> 
>  - H_INT_RESET
> 
>    resets all of the guest's internal interrupt structures to their
>    initial state, losing all configuration set via the hcalls
>    H_INT_SET_SOURCE_CONFIG and H_INT_SET_QUEUE_CONFIG.
> 
>  - H_INT_SYNC
> 
>    issue a synchronisation on a source to make sure all notifications
>    have reached their queue.
> 
> Calls that still need to be addressed :
> 
>    H_INT_SET_OS_REPORTING_LINE
>    H_INT_GET_OS_REPORTING_LINE
> 
> See the code for more documentation on each hcall.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  include/hw/ppc/spapr.h      |  15 +-
>  include/hw/ppc/spapr_xive.h |   6 +
>  hw/intc/spapr_xive_hcall.c  | 892 ++++++++++++++++++++++++++++++++++++
>  hw/ppc/spapr_irq.c          |   2 +
>  hw/intc/Makefile.objs       |   2 +-
>  5 files changed, 915 insertions(+), 2 deletions(-)
>  create mode 100644 hw/intc/spapr_xive_hcall.c
> 
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 1fbc2663e06c..8415faea7b82 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -452,7 +452,20 @@ struct sPAPRMachineState {
>  #define H_INVALIDATE_PID        0x378
>  #define H_REGISTER_PROC_TBL     0x37C
>  #define H_SIGNAL_SYS_RESET      0x380
> -#define MAX_HCALL_OPCODE        H_SIGNAL_SYS_RESET
> +
> +#define H_INT_GET_SOURCE_INFO   0x3A8
> +#define H_INT_SET_SOURCE_CONFIG 0x3AC
> +#define H_INT_GET_SOURCE_CONFIG 0x3B0
> +#define H_INT_GET_QUEUE_INFO    0x3B4
> +#define H_INT_SET_QUEUE_CONFIG  0x3B8
> +#define H_INT_GET_QUEUE_CONFIG  0x3BC
> +#define H_INT_SET_OS_REPORTING_LINE 0x3C0
> +#define H_INT_GET_OS_REPORTING_LINE 0x3C4
> +#define H_INT_ESB               0x3C8
> +#define H_INT_SYNC              0x3CC
> +#define H_INT_RESET             0x3D0
> +
> +#define MAX_HCALL_OPCODE        H_INT_RESET
>  
>  /* The hcalls above are standardized in PAPR and implemented by pHyp
>   * as well.
> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
> index 3f65b8f485fd..418511f3dc10 100644
> --- a/include/hw/ppc/spapr_xive.h
> +++ b/include/hw/ppc/spapr_xive.h
> @@ -60,4 +60,10 @@ int spapr_xive_target_to_end(sPAPRXive *xive, uint32_t target, uint8_t prio,
>  int spapr_xive_cpu_to_end(sPAPRXive *xive, PowerPCCPU *cpu, uint8_t prio,
>                            uint8_t *out_end_blk, uint32_t *out_end_idx);
>  
> +bool spapr_xive_priority_is_valid(uint8_t priority);

AFAICT this could be a local function.

> +
> +typedef struct sPAPRMachineState sPAPRMachineState;
> +
> +void spapr_xive_hcall_init(sPAPRMachineState *spapr);
> +
>  #endif /* PPC_SPAPR_XIVE_H */
> diff --git a/hw/intc/spapr_xive_hcall.c b/hw/intc/spapr_xive_hcall.c
> new file mode 100644
> index 000000000000..52e4e23995f5
> --- /dev/null
> +++ b/hw/intc/spapr_xive_hcall.c
> @@ -0,0 +1,892 @@
> +/*
> + * QEMU PowerPC sPAPR XIVE interrupt controller model
> + *
> + * Copyright (c) 2017-2018, IBM Corporation.
> + *
> + * This code is licensed under the GPL version 2 or later. See the
> + * COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu/log.h"
> +#include "qapi/error.h"
> +#include "cpu.h"
> +#include "hw/ppc/fdt.h"
> +#include "hw/ppc/spapr.h"
> +#include "hw/ppc/spapr_xive.h"
> +#include "hw/ppc/xive_regs.h"
> +#include "monitor/monitor.h"

Fwiw, I don't think it's particularly necessary to split the hcall
handling out into a separate .c file.

> +/*
> + * OPAL uses the priority 7 EQ to automatically escalate interrupts
> + * for all other queues (DD2.X POWER9). So only priorities [0..6] are
> + * available for the guest.

Referencing OPAL behaviour doesn't really make sense in the context of
PAPR.  What I think you're getting at is that the PAPR spec only
allows a PAPR guest to use priorities 0..6 (or at least it will if the
XIVE updated spec ever gets published).  The fact that this allows the
host use 7 for escalations is a design rationale but not really
relevant to the guest device itself.

> + */
> +bool spapr_xive_priority_is_valid(uint8_t priority)
> +{
> +    switch (priority) {
> +    case 0 ... 6:
> +        return true;
> +    case 7: /* OPAL escalation queue */
> +    default:
> +        return false;
> +    }
> +}
> +
> +/*
> + * The H_INT_GET_SOURCE_INFO hcall() is used to obtain the logical
> + * real address of the MMIO page through which the Event State Buffer
> + * entry associated with the value of the "lisn" parameter is managed.
> + *
> + * Parameters:
> + * Input
> + * - "flags"
> + *       Bits 0-63 reserved
> + * - "lisn" is per "interrupts", "interrupt-map", or
> + *       "ibm,xive-lisn-ranges" properties, or as returned by the
> + *       ibm,query-interrupt-source-number RTAS call, or as returned
> + *       by the H_ALLOCATE_VAS_WINDOW hcall

I've not heard of H_ALLOCATE_VAS_WINDOW.  Is that something we intend
to implement in kvm/qemu, or is it only of interest for PowerVM?

Also, putting the register numbers on the inputs as well as the
outputs would be helpful.

> + *
> + * Output
> + * - R4: "flags"
> + *       Bits 0-59: Reserved
> + *       Bit 60: H_INT_ESB must be used for Event State Buffer
> + *               management
> + *       Bit 61: 1 == LSI  0 == MSI
> + *       Bit 62: the full function page supports trigger
> + *       Bit 63: Store EOI Supported
> + * - R5: Logical Real address of full function Event State Buffer
> + *       management page, -1 if ESB hcall flag is set to 1.

You've defined what H_INT_ESB means above, so it will be clearer if
you reference that by name here.

> + * - R6: Logical Real Address of trigger only Event State Buffer
> + *       management page or -1.
> + * - R7: Power of 2 page size for the ESB management pages returned in
> + *       R5 and R6.
> + */
> +
> +#define SPAPR_XIVE_SRC_H_INT_ESB     PPC_BIT(60) /* ESB manage with H_INT_ESB */
> +#define SPAPR_XIVE_SRC_LSI           PPC_BIT(61) /* Virtual LSI type */
> +#define SPAPR_XIVE_SRC_TRIGGER       PPC_BIT(62) /* Trigger and management
> +                                                    on same page */
> +#define SPAPR_XIVE_SRC_STORE_EOI     PPC_BIT(63) /* Store EOI support */

Probably makes sense to put these #defines in spapr.h since they form
part of the PAPR interface definition.

> +static target_ulong h_int_get_source_info(PowerPCCPU *cpu,
> +                                          sPAPRMachineState *spapr,
> +                                          target_ulong opcode,
> +                                          target_ulong *args)
> +{
> +    sPAPRXive *xive = spapr->xive;
> +    XiveSource *xsrc = &xive->source;
> +    XiveEAS eas;
> +    target_ulong flags  = args[0];
> +    target_ulong lisn   = args[1];
> +
> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> +        return H_FUNCTION;
> +    }
> +
> +    if (flags) {
> +        return H_PARAMETER;
> +    }
> +
> +    if (xive_router_get_eas(XIVE_ROUTER(xive), lisn, &eas)) {
> +        return H_P2;
> +    }
> +
> +    if (!(eas.w & EAS_VALID)) {
> +        return H_P2;
> +    }
> +
> +    /* All sources are emulated under the main XIVE object and share
> +     * the same characteristics.
> +     */
> +    args[0] = 0;
> +    if (!xive_source_esb_has_2page(xsrc)) {
> +        args[0] |= SPAPR_XIVE_SRC_TRIGGER;
> +    }
> +    if (xsrc->esb_flags & XIVE_SRC_STORE_EOI) {
> +        args[0] |= SPAPR_XIVE_SRC_STORE_EOI;
> +    }
> +
> +    /*
> +     * Force the use of the H_INT_ESB hcall in case of an LSI
> +     * interrupt. This is necessary under KVM to re-trigger the
> +     * interrupt if the level is still asserted
> +     */
> +    if (xive_source_irq_is_lsi(xsrc, lisn)) {
> +        args[0] |= SPAPR_XIVE_SRC_H_INT_ESB | SPAPR_XIVE_SRC_LSI;
> +    }
> +
> +    if (!(args[0] & SPAPR_XIVE_SRC_H_INT_ESB)) {
> +        args[1] = xive->vc_base + xive_source_esb_mgmt(xsrc, lisn);
> +    } else {
> +        args[1] = -1;
> +    }
> +
> +    if (xive_source_esb_has_2page(xsrc)) {
> +        args[2] = xive->vc_base + xive_source_esb_page(xsrc, lisn);
> +    } else {
> +        args[2] = -1;
> +    }

Do we also need to keep this address clear in the H_INT_ESB case?

> +    args[3] = TARGET_PAGE_SIZE;

That seems wrong.  TARGET_PAGE_SIZE is generally 4kiB, but won't these
usually actually be 64kiB?

> +
> +    return H_SUCCESS;
> +}
> +
> +/*
> + * The H_INT_SET_SOURCE_CONFIG hcall() is used to assign a Logical
> + * Interrupt Source to a target. The Logical Interrupt Source is
> + * designated with the "lisn" parameter and the target is designated
> + * with the "target" and "priority" parameters.  Upon return from the
> + * hcall(), no additional interrupts will be directed to the old EQ.
> + *
> + * TODO: The old EQ should be investigated for interrupts that
> + * occurred prior to or during the hcall().

Isn't that the responsibility of the guest?

> + *
> + * Parameters:
> + * Input:
> + * - "flags"
> + *      Bits 0-61: Reserved
> + *      Bit 62: set the "eisn" in the EA

What's the "EA"?  Do you mean the EAS?

> + *      Bit 63: masks the interrupt source in the hardware interrupt
> + *      control structure. An interrupt masked by this mechanism will
> + *      be dropped, but it's source state bits will still be
> + *      set. There is no race-free way of unmasking and restoring the
> + *      source. Thus this should only be used in interrupts that are
> + *      also masked at the source, and only in cases where the
> + *      interrupt is not meant to be used for a large amount of time
> + *      because no valid target exists for it for example
> + * - "lisn" is per "interrupts", "interrupt-map", or
> + *      "ibm,xive-lisn-ranges" properties, or as returned by the
> + *      ibm,query-interrupt-source-number RTAS call, or as returned by
> + *      the H_ALLOCATE_VAS_WINDOW hcall
> + * - "target" is per "ibm,ppc-interrupt-server#s" or
> + *      "ibm,ppc-interrupt-gserver#s"
> + * - "priority" is a valid priority not in
> + *      "ibm,plat-res-int-priorities"
> + * - "eisn" is the guest EISN associated with the "lisn"

I don't think the EISN term has been used before in the series.  I'm
guessing this is the guest-assigned global interrupt number?

> + *
> + * Output:
> + * - None
> + */
> +
> +#define SPAPR_XIVE_SRC_SET_EISN PPC_BIT(62)
> +#define SPAPR_XIVE_SRC_MASK     PPC_BIT(63)
> +
> +static target_ulong h_int_set_source_config(PowerPCCPU *cpu,
> +                                            sPAPRMachineState *spapr,
> +                                            target_ulong opcode,
> +                                            target_ulong *args)
> +{
> +    sPAPRXive *xive = spapr->xive;
> +    XiveRouter *xrtr = XIVE_ROUTER(xive);
> +    XiveEAS eas, new_eas;
> +    target_ulong flags    = args[0];
> +    target_ulong lisn     = args[1];
> +    target_ulong target   = args[2];
> +    target_ulong priority = args[3];
> +    target_ulong eisn     = args[4];
> +    uint8_t end_blk;
> +    uint32_t end_idx;
> +
> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> +        return H_FUNCTION;
> +    }
> +
> +    if (flags & ~(SPAPR_XIVE_SRC_SET_EISN | SPAPR_XIVE_SRC_MASK)) {
> +        return H_PARAMETER;
> +    }
> +
> +    if (xive_router_get_eas(xrtr, lisn, &eas)) {
> +        return H_P2;
> +    }
> +
> +    if (!(eas.w & EAS_VALID)) {
> +        return H_P2;
> +    }
> +
> +    /* priority 0xff is used to reset the EAS */
> +    if (priority == 0xff) {
> +        new_eas.w = EAS_VALID | EAS_MASKED;
> +        goto out;
> +    }
> +
> +    if (flags & SPAPR_XIVE_SRC_MASK) {
> +        new_eas.w = eas.w | EAS_MASKED;
> +    } else {
> +        new_eas.w = eas.w & ~EAS_MASKED;
> +    }
> +
> +    if (!spapr_xive_priority_is_valid(priority)) {
> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid priority %ld requested\n",
> +                      priority);
> +        return H_P4;
> +    }
> +
> +    /* Validate that "target" is part of the list of threads allocated
> +     * to the partition. For that, find the END corresponding to the
> +     * target.
> +     */
> +    if (spapr_xive_target_to_end(xive, target, priority, &end_blk, &end_idx)) {
> +        return H_P3;
> +    }
> +
> +    new_eas.w = SETFIELD(EAS_END_BLOCK, new_eas.w, end_blk);
> +    new_eas.w = SETFIELD(EAS_END_INDEX, new_eas.w, end_idx);
> +
> +    if (flags & SPAPR_XIVE_SRC_SET_EISN) {
> +        new_eas.w = SETFIELD(EAS_END_DATA, new_eas.w, eisn);
> +    }
> +
> +out:
> +    if (xive_router_set_eas(xrtr, lisn, &new_eas)) {
> +        return H_HARDWARE;
> +    }

As noted earlier in the series, the spapr specific code owns the
memory backing the EAT, so you can just access it directly rather than
using a method here.

> +
> +    return H_SUCCESS;
> +}
> +
> +/*
> + * The H_INT_GET_SOURCE_CONFIG hcall() is used to determine to which
> + * target/priority pair is assigned to the specified Logical Interrupt
> + * Source.
> + *
> + * Parameters:
> + * Input:
> + * - "flags"
> + *      Bits 0-63 Reserved
> + * - "lisn" is per "interrupts", "interrupt-map", or
> + *      "ibm,xive-lisn-ranges" properties, or as returned by the
> + *      ibm,query-interrupt-source-number RTAS call, or as
> + *      returned by the H_ALLOCATE_VAS_WINDOW hcall
> + *
> + * Output:
> + * - R4: Target to which the specified Logical Interrupt Source is
> + *       assigned
> + * - R5: Priority to which the specified Logical Interrupt Source is
> + *       assigned
> + * - R6: EISN for the specified Logical Interrupt Source (this will be
> + *       equivalent to the LISN if not changed by H_INT_SET_SOURCE_CONFIG)
> + */
> +static target_ulong h_int_get_source_config(PowerPCCPU *cpu,
> +                                            sPAPRMachineState *spapr,
> +                                            target_ulong opcode,
> +                                            target_ulong *args)
> +{
> +    sPAPRXive *xive = spapr->xive;
> +    XiveRouter *xrtr = XIVE_ROUTER(xive);
> +    target_ulong flags = args[0];
> +    target_ulong lisn = args[1];
> +    XiveEAS eas;
> +    XiveEND end;
> +    uint8_t end_blk, nvt_blk;
> +    uint32_t end_idx, nvt_idx;
> +
> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> +        return H_FUNCTION;
> +    }
> +
> +    if (flags) {
> +        return H_PARAMETER;
> +    }
> +
> +    if (xive_router_get_eas(xrtr, lisn, &eas)) {
> +        return H_P2;
> +    }
> +
> +    if (!(eas.w & EAS_VALID)) {
> +        return H_P2;
> +    }
> +
> +    end_blk = GETFIELD(EAS_END_BLOCK, eas.w);
> +    end_idx = GETFIELD(EAS_END_INDEX, eas.w);
> +    if (xive_router_get_end(xrtr, end_blk, end_idx, &end)) {
> +        /* Not sure what to return here */
> +        return H_HARDWARE;

IIUC this indicates a bug in the PAPR specific code, not the guest, so
an assert() is probably the right answer.

> +    }
> +
> +    nvt_blk = GETFIELD(END_W6_NVT_BLOCK, end.w6);
> +    nvt_idx = GETFIELD(END_W6_NVT_INDEX, end.w6);
> +    args[0] = spapr_xive_nvt_to_target(xive, nvt_blk, nvt_idx);

AIUI there's a specific END for each target & priority, so you could
avoid this second level lookup, although I guess this might be
valuable if we do more complicated internal routing in the future.

> +    if (eas.w & EAS_MASKED) {
> +        args[1] = 0xff;
> +    } else {
> +        args[1] = GETFIELD(END_W7_F0_PRIORITY, end.w7);
> +    }
> +
> +    args[2] = GETFIELD(EAS_END_DATA, eas.w);
> +
> +    return H_SUCCESS;
> +}
> +
> +/*
> + * The H_INT_GET_QUEUE_INFO hcall() is used to get the logical real
> + * address of the notification management page associated with the
> + * specified target and priority.
> + *
> + * Parameters:
> + * Input:
> + * - "flags"
> + *       Bits 0-63 Reserved
> + * - "target" is per "ibm,ppc-interrupt-server#s" or
> + *       "ibm,ppc-interrupt-gserver#s"
> + * - "priority" is a valid priority not in
> + *       "ibm,plat-res-int-priorities"
> + *
> + * Output:
> + * - R4: Logical real address of notification page
> + * - R5: Power of 2 page size of the notification page
> + */
> +static target_ulong h_int_get_queue_info(PowerPCCPU *cpu,
> +                                         sPAPRMachineState *spapr,
> +                                         target_ulong opcode,
> +                                         target_ulong *args)
> +{
> +    sPAPRXive *xive = spapr->xive;
> +    XiveENDSource *end_xsrc = &xive->end_source;
> +    target_ulong flags = args[0];
> +    target_ulong target = args[1];
> +    target_ulong priority = args[2];
> +    XiveEND end;
> +    uint8_t end_blk;
> +    uint32_t end_idx;
> +
> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> +        return H_FUNCTION;
> +    }
> +
> +    if (flags) {
> +        return H_PARAMETER;
> +    }
> +
> +    /*
> +     * H_STATE should be returned if a H_INT_RESET is in progress.
> +     * This is not needed when running the emulation under QEMU
> +     */
> +
> +    if (!spapr_xive_priority_is_valid(priority)) {
> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid priority %ld requested\n",
> +                      priority);
> +        return H_P3;
> +    }
> +
> +    /* Validate that "target" is part of the list of threads allocated
> +     * to the partition. For that, find the END corresponding to the
> +     * target.
> +     */
> +    if (spapr_xive_target_to_end(xive, target, priority, &end_blk, &end_idx)) {
> +        return H_P2;
> +    }
> +
> +    if (xive_router_get_end(XIVE_ROUTER(xive), end_blk, end_idx, &end)) {
> +        return H_HARDWARE;
> +    }
> +
> +    args[0] = xive->end_base + (1ull << (end_xsrc->esb_shift + 1)) * end_idx;
> +    if (end.w0 & END_W0_ENQUEUE) {
> +        args[1] = GETFIELD(END_W0_QSIZE, end.w0) + 12;
> +    } else {
> +        args[1] = 0;
> +    }
> +    return H_SUCCESS;
> +}
> +
> +/*
> + * The H_INT_SET_QUEUE_CONFIG hcall() is used to set or reset a EQ for
> + * a given "target" and "priority".  It is also used to set the
> + * notification config associated with the EQ.  An EQ size of 0 is
> + * used to reset the EQ config for a given target and priority. If
> + * resetting the EQ config, the END associated with the given "target"
> + * and "priority" will be changed to disable queueing.
> + *
> + * Upon return from the hcall(), no additional interrupts will be
> + * directed to the old EQ (if one was set). The old EQ (if one was
> + * set) should be investigated for interrupts that occurred prior to
> + * or during the hcall().
> + *
> + * Parameters:
> + * Input:
> + * - "flags"
> + *      Bits 0-62: Reserved
> + *      Bit 63: Unconditional Notify (n) per the XIVE spec
> + * - "target" is per "ibm,ppc-interrupt-server#s" or
> + *       "ibm,ppc-interrupt-gserver#s"
> + * - "priority" is a valid priority not in
> + *       "ibm,plat-res-int-priorities"
> + * - "eventQueue": The logical real address of the start of the EQ
> + * - "eventQueueSize": The power of 2 EQ size per "ibm,xive-eq-sizes"
> + *
> + * Output:
> + * - None
> + */
> +
> +#define SPAPR_XIVE_END_ALWAYS_NOTIFY PPC_BIT(63)
> +
> +static target_ulong h_int_set_queue_config(PowerPCCPU *cpu,
> +                                           sPAPRMachineState *spapr,
> +                                           target_ulong opcode,
> +                                           target_ulong *args)
> +{
> +    sPAPRXive *xive = spapr->xive;
> +    XiveRouter *xrtr = XIVE_ROUTER(xive);
> +    target_ulong flags = args[0];
> +    target_ulong target = args[1];
> +    target_ulong priority = args[2];
> +    target_ulong qpage = args[3];
> +    target_ulong qsize = args[4];
> +    XiveEND end;
> +    uint8_t end_blk, nvt_blk;
> +    uint32_t end_idx, nvt_idx;
> +    uint32_t qdata;
> +
> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> +        return H_FUNCTION;
> +    }
> +
> +    if (flags & ~SPAPR_XIVE_END_ALWAYS_NOTIFY) {
> +        return H_PARAMETER;
> +    }
> +
> +    /*
> +     * H_STATE should be returned if a H_INT_RESET is in progress.
> +     * This is not needed when running the emulation under QEMU
> +     */
> +
> +    if (!spapr_xive_priority_is_valid(priority)) {
> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid priority %ld requested\n",
> +                      priority);
> +        return H_P3;
> +    }
> +
> +    /* Validate that "target" is part of the list of threads allocated
> +     * to the partition. For that, find the END corresponding to the
> +     * target.
> +     */
> +
> +    if (spapr_xive_target_to_end(xive, target, priority, &end_blk, &end_idx)) {
> +        return H_P2;
> +    }
> +
> +    if (xive_router_get_end(xrtr, end_blk, end_idx, &end)) {
> +        return H_HARDWARE;

Again, I think this indicates a qemu (spapr) code bug, so could be an assert().

> +    }
> +
> +    switch (qsize) {
> +    case 12:
> +    case 16:
> +    case 21:
> +    case 24:
> +        end.w3 = ((uint64_t)qpage) & 0xffffffff;

It just occurred to me that I haven't been looking for this across any
of these reviews.  Don't you need byteswaps when accessing these
in-memory structures?

> +        end.w2 = (((uint64_t)qpage)) >> 32 & 0x0fffffff;
> +        end.w0 |= END_W0_ENQUEUE;
> +        end.w0 = SETFIELD(END_W0_QSIZE, end.w0, qsize - 12);
> +        break;
> +    case 0:
> +        /* reset queue and disable queueing */
> +        xive_end_reset(&end);
> +        goto out;
> +
> +    default:
> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid EQ size %"PRIx64"\n",
> +                      qsize);
> +        return H_P5;
> +    }
> +
> +    if (qsize) {
> +        /*
> +         * Let's validate the EQ address with a read of the first EQ
> +         * entry. We could also check that the full queue has been
> +         * zeroed by the OS.
> +         */
> +        if (address_space_read(&address_space_memory, qpage,
> +                               MEMTXATTRS_UNSPECIFIED,
> +                               (uint8_t *) &qdata, sizeof(qdata))) {
> +            qemu_log_mask(LOG_GUEST_ERROR, "XIVE: failed to read EQ data @0x%"
> +                          HWADDR_PRIx "\n", qpage);
> +            return H_P4;

Just checking the first entry doesn't seem entirely safe.  Using
address_space_map() and making sure the returned plen doesn't get
reduced below the queue size might be a better option.

> +        }
> +    }
> +
> +    if (spapr_xive_target_to_nvt(xive, target, &nvt_blk, &nvt_idx)) {
> +        return H_HARDWARE;

That could be caused by a bogus 'target' value, couldn't it?  In which
case it a) should probably be checked earlier and b) should be
H_PARAMETER or similar, not H_HARDWARE, yes?

> +    }
> +
> +    /* Ensure the priority and target are correctly set (they will not
> +     * be right after allocation)

AIUI there's a static association from END to target in the PAPR
model.  So it seems to make more sense to get that set up right at
initialization / reset, rather than doing it lazily when the queue is configured.

> +     */
> +    end.w6 = SETFIELD(END_W6_NVT_BLOCK, 0ul, nvt_blk) |
> +        SETFIELD(END_W6_NVT_INDEX, 0ul, nvt_idx);
> +    end.w7 = SETFIELD(END_W7_F0_PRIORITY, 0ul, priority);
> +
> +    if (flags & SPAPR_XIVE_END_ALWAYS_NOTIFY) {
> +        end.w0 |= END_W0_UCOND_NOTIFY;
> +    } else {
> +        end.w0 &= ~END_W0_UCOND_NOTIFY;
> +    }
> +
> +    /* The generation bit for the END starts at 1 and The END page
> +     * offset counter starts at 0.
> +     */
> +    end.w1 = END_W1_GENERATION | SETFIELD(END_W1_PAGE_OFF, 0ul, 0ul);
> +    end.w0 |= END_W0_VALID;
> +
> +    /* TODO: issue syncs required to ensure all in-flight interrupts
> +     * are complete on the old END */
> +out:
> +    /* Update END */
> +    if (xive_router_set_end(xrtr, end_blk, end_idx, &end)) {
> +        return H_HARDWARE;
> +    }

Again the PAPR code owns the ENDs, so it can update them directly
rather than going through an abstraction.

> +
> +    return H_SUCCESS;
> +}
> +
> +/*
> + * The H_INT_GET_QUEUE_CONFIG hcall() is used to get a EQ for a given
> + * target and priority.
> + *
> + * Parameters:
> + * Input:
> + * - "flags"
> + *      Bits 0-62: Reserved
> + *      Bit 63: Debug: Return debug data
> + * - "target" is per "ibm,ppc-interrupt-server#s" or
> + *       "ibm,ppc-interrupt-gserver#s"
> + * - "priority" is a valid priority not in
> + *       "ibm,plat-res-int-priorities"
> + *
> + * Output:
> + * - R4: "flags":
> + *       Bits 0-61: Reserved
> + *       Bit 62: The value of Event Queue Generation Number (g) per
> + *              the XIVE spec if "Debug" = 1
> + *       Bit 63: The value of Unconditional Notify (n) per the XIVE spec
> + * - R5: The logical real address of the start of the EQ
> + * - R6: The power of 2 EQ size per "ibm,xive-eq-sizes"
> + * - R7: The value of Event Queue Offset Counter per XIVE spec
> + *       if "Debug" = 1, else 0
> + *
> + */
> +
> +#define SPAPR_XIVE_END_DEBUG     PPC_BIT(63)
> +
> +static target_ulong h_int_get_queue_config(PowerPCCPU *cpu,
> +                                           sPAPRMachineState *spapr,
> +                                           target_ulong opcode,
> +                                           target_ulong *args)
> +{
> +    sPAPRXive *xive = spapr->xive;
> +    target_ulong flags = args[0];
> +    target_ulong target = args[1];
> +    target_ulong priority = args[2];
> +    XiveEND end;
> +    uint8_t end_blk;
> +    uint32_t end_idx;
> +
> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> +        return H_FUNCTION;
> +    }
> +
> +    if (flags & ~SPAPR_XIVE_END_DEBUG) {
> +        return H_PARAMETER;
> +    }
> +
> +    /*
> +     * H_STATE should be returned if a H_INT_RESET is in progress.
> +     * This is not needed when running the emulation under QEMU
> +     */
> +
> +    if (!spapr_xive_priority_is_valid(priority)) {
> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid priority %ld requested\n",
> +                      priority);
> +        return H_P3;
> +    }
> +
> +    /* Validate that "target" is part of the list of threads allocated
> +     * to the partition. For that, find the END corresponding to the
> +     * target.
> +     */
> +    if (spapr_xive_target_to_end(xive, target, priority, &end_blk, &end_idx)) {
> +        return H_P2;
> +    }
> +
> +    if (xive_router_get_end(XIVE_ROUTER(xive), end_blk, end_idx, &end)) {
> +        return H_HARDWARE;

Again, assert() seems appropriate here.

> +    }
> +
> +    args[0] = 0;
> +    if (end.w0 & END_W0_UCOND_NOTIFY) {
> +        args[0] |= SPAPR_XIVE_END_ALWAYS_NOTIFY;
> +    }
> +
> +    if (end.w0 & END_W0_ENQUEUE) {
> +        args[1] =
> +            (((uint64_t)(end.w2 & 0x0fffffff)) << 32) | end.w3;
> +        args[2] = GETFIELD(END_W0_QSIZE, end.w0) + 12;
> +    } else {
> +        args[1] = 0;
> +        args[2] = 0;
> +    }
> +
> +    /* TODO: do we need any locking on the END ? */
> +    if (flags & SPAPR_XIVE_END_DEBUG) {
> +        /* Load the event queue generation number into the return flags */
> +        args[0] |= (uint64_t)GETFIELD(END_W1_GENERATION, end.w1) << 62;
> +
> +        /* Load R7 with the event queue offset counter */
> +        args[3] = GETFIELD(END_W1_PAGE_OFF, end.w1);
> +    } else {
> +        args[3] = 0;
> +    }
> +
> +    return H_SUCCESS;
> +}
> +
> +/*
> + * The H_INT_SET_OS_REPORTING_LINE hcall() is used to set the
> + * reporting cache line pair for the calling thread.  The reporting
> + * cache lines will contain the OS interrupt context when the OS
> + * issues a CI store byte to @TIMA+0xC10 to acknowledge the OS
> + * interrupt. The reporting cache lines can be reset by inputting -1
> + * in "reportingLine".  Issuing the CI store byte without reporting
> + * cache lines registered will result in the data not being accessible
> + * to the OS.
> + *
> + * Parameters:
> + * Input:
> + * - "flags"
> + *      Bits 0-63: Reserved
> + * - "reportingLine": The logical real address of the reporting cache
> + *    line pair
> + *
> + * Output:
> + * - None
> + */
> +static target_ulong h_int_set_os_reporting_line(PowerPCCPU *cpu,
> +                                                sPAPRMachineState *spapr,
> +                                                target_ulong opcode,
> +                                                target_ulong *args)
> +{
> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> +        return H_FUNCTION;
> +    }
> +
> +    /*
> +     * H_STATE should be returned if a H_INT_RESET is in progress.
> +     * This is not needed when running the emulation under QEMU
> +     */
> +
> +    /* TODO: H_INT_SET_OS_REPORTING_LINE */
> +    return H_FUNCTION;
> +}
> +
> +/*
> + * The H_INT_GET_OS_REPORTING_LINE hcall() is used to get the logical
> + * real address of the reporting cache line pair set for the input
> + * "target".  If no reporting cache line pair has been set, -1 is
> + * returned.
> + *
> + * Parameters:
> + * Input:
> + * - "flags"
> + *      Bits 0-63: Reserved
> + * - "target" is per "ibm,ppc-interrupt-server#s" or
> + *       "ibm,ppc-interrupt-gserver#s"
> + * - "reportingLine": The logical real address of the reporting cache
> + *   line pair
> + *
> + * Output:
> + * - R4: The logical real address of the reporting line if set, else -1
> + */
> +static target_ulong h_int_get_os_reporting_line(PowerPCCPU *cpu,
> +                                                sPAPRMachineState *spapr,
> +                                                target_ulong opcode,
> +                                                target_ulong *args)
> +{
> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> +        return H_FUNCTION;
> +    }
> +
> +    /*
> +     * H_STATE should be returned if a H_INT_RESET is in progress.
> +     * This is not needed when running the emulation under QEMU
> +     */
> +
> +    /* TODO: H_INT_GET_OS_REPORTING_LINE */
> +    return H_FUNCTION;
> +}
> +
> +/*
> + * The H_INT_ESB hcall() is used to issue a load or store to the ESB
> + * page for the input "lisn".  This hcall is only supported for LISNs
> + * that have the ESB hcall flag set to 1 when returned from hcall()
> + * H_INT_GET_SOURCE_INFO.

Is there a reason for specifically restricting this to LISNs which
advertise it, rather than allowing it for anything?  Obviously using
the direct MMIOs will generally be a faster option when possible, but
I could see occasions where it might be simpler for the guest to
always use H_INT_ESB (e.g. for micro-guests like kvm-unit-tests).

> + * Parameters:
> + * Input:
> + * - "flags"
> + *      Bits 0-62: Reserved
> + *      bit 63: Store: Store=1, store operation, else load operation
> + * - "lisn" is per "interrupts", "interrupt-map", or
> + *      "ibm,xive-lisn-ranges" properties, or as returned by the
> + *      ibm,query-interrupt-source-number RTAS call, or as
> + *      returned by the H_ALLOCATE_VAS_WINDOW hcall
> + * - "esbOffset" is the offset into the ESB page for the load or store operation
> + * - "storeData" is the data to write for a store operation
> + *
> + * Output:
> + * - R4: R4: The value of the load if load operation, else -1
> + */
> +
> +#define SPAPR_XIVE_ESB_STORE PPC_BIT(63)
> +
> +static target_ulong h_int_esb(PowerPCCPU *cpu,
> +                              sPAPRMachineState *spapr,
> +                              target_ulong opcode,
> +                              target_ulong *args)
> +{
> +    sPAPRXive *xive = spapr->xive;
> +    XiveEAS eas;
> +    target_ulong flags  = args[0];
> +    target_ulong lisn   = args[1];
> +    target_ulong offset = args[2];
> +    target_ulong data   = args[3];
> +    hwaddr mmio_addr;
> +    XiveSource *xsrc = &xive->source;
> +
> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> +        return H_FUNCTION;
> +    }
> +
> +    if (flags & ~SPAPR_XIVE_ESB_STORE) {
> +        return H_PARAMETER;
> +    }
> +
> +    if (xive_router_get_eas(XIVE_ROUTER(xive), lisn, &eas)) {
> +        return H_P2;
> +    }
> +
> +    if (!(eas.w & EAS_VALID)) {
> +        return H_P2;
> +    }
> +
> +    if (offset > (1ull << xsrc->esb_shift)) {
> +        return H_P3;
> +    }
> +
> +    mmio_addr = xive->vc_base + xive_source_esb_mgmt(xsrc, lisn) + offset;
> +
> +    if (dma_memory_rw(&address_space_memory, mmio_addr, &data, 8,
> +                      (flags & SPAPR_XIVE_ESB_STORE))) {
> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: failed to access ESB @0x%"
> +                      HWADDR_PRIx "\n", mmio_addr);
> +        return H_HARDWARE;
> +    }
> +    args[0] = (flags & SPAPR_XIVE_ESB_STORE) ? -1 : data;
> +    return H_SUCCESS;
> +}
> +
> +/*
> + * The H_INT_SYNC hcall() is used to issue hardware syncs that will
> + * ensure any in flight events for the input lisn are in the event
> + * queue.
> + *
> + * Parameters:
> + * Input:
> + * - "flags"
> + *      Bits 0-63: Reserved
> + * - "lisn" is per "interrupts", "interrupt-map", or
> + *      "ibm,xive-lisn-ranges" properties, or as returned by the
> + *      ibm,query-interrupt-source-number RTAS call, or as
> + *      returned by the H_ALLOCATE_VAS_WINDOW hcall
> + *
> + * Output:
> + * - None
> + */
> +static target_ulong h_int_sync(PowerPCCPU *cpu,
> +                               sPAPRMachineState *spapr,
> +                               target_ulong opcode,
> +                               target_ulong *args)
> +{
> +    sPAPRXive *xive = spapr->xive;
> +    XiveEAS eas;
> +    target_ulong flags = args[0];
> +    target_ulong lisn = args[1];
> +
> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> +        return H_FUNCTION;
> +    }
> +
> +    if (flags) {
> +        return H_PARAMETER;
> +    }
> +
> +    if (xive_router_get_eas(XIVE_ROUTER(xive), lisn, &eas)) {
> +        return H_P2;
> +    }
> +
> +    if (!(eas.w & EAS_VALID)) {
> +        return H_P2;
> +    }
> +
> +    /*
> +     * H_STATE should be returned if a H_INT_RESET is in progress.
> +     * This is not needed when running the emulation under QEMU
> +     */
> +
> +    /* This is not real hardware. Nothing to be done */

At least, not as long as all the XIVE operations are under the BQL.

> +    return H_SUCCESS;
> +}
> +
> +/*
> + * The H_INT_RESET hcall() is used to reset all of the partition's
> + * interrupt exploitation structures to their initial state.  This
> + * means losing all previously set interrupt state set via
> + * H_INT_SET_SOURCE_CONFIG and H_INT_SET_QUEUE_CONFIG.
> + *
> + * Parameters:
> + * Input:
> + * - "flags"
> + *      Bits 0-63: Reserved
> + *
> + * Output:
> + * - None
> + */
> +static target_ulong h_int_reset(PowerPCCPU *cpu,
> +                                sPAPRMachineState *spapr,
> +                                target_ulong opcode,
> +                                target_ulong *args)
> +{
> +    sPAPRXive *xive = spapr->xive;
> +    target_ulong flags   = args[0];
> +
> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> +        return H_FUNCTION;
> +    }
> +
> +    if (flags) {
> +        return H_PARAMETER;
> +    }
> +
> +    device_reset(DEVICE(xive));
> +    return H_SUCCESS;
> +}
> +
> +void spapr_xive_hcall_init(sPAPRMachineState *spapr)
> +{
> +    spapr_register_hypercall(H_INT_GET_SOURCE_INFO, h_int_get_source_info);
> +    spapr_register_hypercall(H_INT_SET_SOURCE_CONFIG, h_int_set_source_config);
> +    spapr_register_hypercall(H_INT_GET_SOURCE_CONFIG, h_int_get_source_config);
> +    spapr_register_hypercall(H_INT_GET_QUEUE_INFO, h_int_get_queue_info);
> +    spapr_register_hypercall(H_INT_SET_QUEUE_CONFIG, h_int_set_queue_config);
> +    spapr_register_hypercall(H_INT_GET_QUEUE_CONFIG, h_int_get_queue_config);
> +    spapr_register_hypercall(H_INT_SET_OS_REPORTING_LINE,
> +                             h_int_set_os_reporting_line);
> +    spapr_register_hypercall(H_INT_GET_OS_REPORTING_LINE,
> +                             h_int_get_os_reporting_line);
> +    spapr_register_hypercall(H_INT_ESB, h_int_esb);
> +    spapr_register_hypercall(H_INT_SYNC, h_int_sync);
> +    spapr_register_hypercall(H_INT_RESET, h_int_reset);
> +}
> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
> index 2569ae1bc7f8..da6fcfaa3c52 100644
> --- a/hw/ppc/spapr_irq.c
> +++ b/hw/ppc/spapr_irq.c
> @@ -258,6 +258,8 @@ static void spapr_irq_init_xive(sPAPRMachineState *spapr, int nr_irqs,
>          error_propagate(errp, local_err);
>          return;
>      }
> +
> +    spapr_xive_hcall_init(spapr);
>  }
>  
>  static int spapr_irq_claim_xive(sPAPRMachineState *spapr, int irq, bool lsi,
> diff --git a/hw/intc/Makefile.objs b/hw/intc/Makefile.objs
> index 301a8e972d91..eacd26836ebf 100644
> --- a/hw/intc/Makefile.objs
> +++ b/hw/intc/Makefile.objs
> @@ -38,7 +38,7 @@ obj-$(CONFIG_XICS) += xics.o
>  obj-$(CONFIG_XICS_SPAPR) += xics_spapr.o
>  obj-$(CONFIG_XICS_KVM) += xics_kvm.o
>  obj-$(CONFIG_XIVE) += xive.o
> -obj-$(CONFIG_XIVE_SPAPR) += spapr_xive.o
> +obj-$(CONFIG_XIVE_SPAPR) += spapr_xive.o spapr_xive_hcall.o
>  obj-$(CONFIG_POWERNV) += xics_pnv.o
>  obj-$(CONFIG_ALLWINNER_A10_PIC) += allwinner-a10-pic.o
>  obj-$(CONFIG_S390_FLIC) += s390_flic.o
Cédric Le Goater Nov. 28, 2018, 10:21 p.m. UTC | #2
On 11/28/18 5:25 AM, David Gibson wrote:
> On Fri, Nov 16, 2018 at 11:57:09AM +0100, Cédric Le Goater wrote:
>> The different XIVE virtualization structures (sources and event queues)
>> are configured with a set of Hypervisor calls :
>>
>>  - H_INT_GET_SOURCE_INFO
>>
>>    used to obtain the address of the MMIO page of the Event State
>>    Buffer (ESB) entry associated with the source.
>>
>>  - H_INT_SET_SOURCE_CONFIG
>>
>>    assigns a source to a "target".
>>
>>  - H_INT_GET_SOURCE_CONFIG
>>
>>    determines which "target" and "priority" is assigned to a source
>>
>>  - H_INT_GET_QUEUE_INFO
>>
>>    returns the address of the notification management page associated
>>    with the specified "target" and "priority".
>>
>>  - H_INT_SET_QUEUE_CONFIG
>>
>>    sets or resets the event queue for a given "target" and "priority".
>>    It is also used to set the notification configuration associated
>>    with the queue, only unconditional notification is supported for
>>    the moment. Reset is performed with a queue size of 0 and queueing
>>    is disabled in that case.
>>
>>  - H_INT_GET_QUEUE_CONFIG
>>
>>    returns the queue settings for a given "target" and "priority".
>>
>>  - H_INT_RESET
>>
>>    resets all of the guest's internal interrupt structures to their
>>    initial state, losing all configuration set via the hcalls
>>    H_INT_SET_SOURCE_CONFIG and H_INT_SET_QUEUE_CONFIG.
>>
>>  - H_INT_SYNC
>>
>>    issue a synchronisation on a source to make sure all notifications
>>    have reached their queue.
>>
>> Calls that still need to be addressed :
>>
>>    H_INT_SET_OS_REPORTING_LINE
>>    H_INT_GET_OS_REPORTING_LINE
>>
>> See the code for more documentation on each hcall.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>  include/hw/ppc/spapr.h      |  15 +-
>>  include/hw/ppc/spapr_xive.h |   6 +
>>  hw/intc/spapr_xive_hcall.c  | 892 ++++++++++++++++++++++++++++++++++++
>>  hw/ppc/spapr_irq.c          |   2 +
>>  hw/intc/Makefile.objs       |   2 +-
>>  5 files changed, 915 insertions(+), 2 deletions(-)
>>  create mode 100644 hw/intc/spapr_xive_hcall.c
>>
>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>> index 1fbc2663e06c..8415faea7b82 100644
>> --- a/include/hw/ppc/spapr.h
>> +++ b/include/hw/ppc/spapr.h
>> @@ -452,7 +452,20 @@ struct sPAPRMachineState {
>>  #define H_INVALIDATE_PID        0x378
>>  #define H_REGISTER_PROC_TBL     0x37C
>>  #define H_SIGNAL_SYS_RESET      0x380
>> -#define MAX_HCALL_OPCODE        H_SIGNAL_SYS_RESET
>> +
>> +#define H_INT_GET_SOURCE_INFO   0x3A8
>> +#define H_INT_SET_SOURCE_CONFIG 0x3AC
>> +#define H_INT_GET_SOURCE_CONFIG 0x3B0
>> +#define H_INT_GET_QUEUE_INFO    0x3B4
>> +#define H_INT_SET_QUEUE_CONFIG  0x3B8
>> +#define H_INT_GET_QUEUE_CONFIG  0x3BC
>> +#define H_INT_SET_OS_REPORTING_LINE 0x3C0
>> +#define H_INT_GET_OS_REPORTING_LINE 0x3C4
>> +#define H_INT_ESB               0x3C8
>> +#define H_INT_SYNC              0x3CC
>> +#define H_INT_RESET             0x3D0
>> +
>> +#define MAX_HCALL_OPCODE        H_INT_RESET
>>  
>>  /* The hcalls above are standardized in PAPR and implemented by pHyp
>>   * as well.
>> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
>> index 3f65b8f485fd..418511f3dc10 100644
>> --- a/include/hw/ppc/spapr_xive.h
>> +++ b/include/hw/ppc/spapr_xive.h
>> @@ -60,4 +60,10 @@ int spapr_xive_target_to_end(sPAPRXive *xive, uint32_t target, uint8_t prio,
>>  int spapr_xive_cpu_to_end(sPAPRXive *xive, PowerPCCPU *cpu, uint8_t prio,
>>                            uint8_t *out_end_blk, uint32_t *out_end_idx);
>>  
>> +bool spapr_xive_priority_is_valid(uint8_t priority);
> 
> AFAICT this could be a local function.

the KVM model uses it also, when collecting state from the KVM device 
to build the QEMU ENDT.

>> +
>> +typedef struct sPAPRMachineState sPAPRMachineState;
>> +
>> +void spapr_xive_hcall_init(sPAPRMachineState *spapr);
>> +
>>  #endif /* PPC_SPAPR_XIVE_H */
>> diff --git a/hw/intc/spapr_xive_hcall.c b/hw/intc/spapr_xive_hcall.c
>> new file mode 100644
>> index 000000000000..52e4e23995f5
>> --- /dev/null
>> +++ b/hw/intc/spapr_xive_hcall.c
>> @@ -0,0 +1,892 @@
>> +/*
>> + * QEMU PowerPC sPAPR XIVE interrupt controller model
>> + *
>> + * Copyright (c) 2017-2018, IBM Corporation.
>> + *
>> + * This code is licensed under the GPL version 2 or later. See the
>> + * COPYING file in the top-level directory.
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "qemu/log.h"
>> +#include "qapi/error.h"
>> +#include "cpu.h"
>> +#include "hw/ppc/fdt.h"
>> +#include "hw/ppc/spapr.h"
>> +#include "hw/ppc/spapr_xive.h"
>> +#include "hw/ppc/xive_regs.h"
>> +#include "monitor/monitor.h"
> 
> Fwiw, I don't think it's particularly necessary to split the hcall
> handling out into a separate .c file.

ok. let's move it to spapr_xive then ? It might help in reducing the 
exported funtions. 

>> +/*
>> + * OPAL uses the priority 7 EQ to automatically escalate interrupts
>> + * for all other queues (DD2.X POWER9). So only priorities [0..6] are
>> + * available for the guest.
> 
> Referencing OPAL behaviour doesn't really make sense in the context of
> PAPR.  

It's an OPAL constraint which pHyp doesn't have. So its a QEMU/KVM 
constraint also.

> What I think you're getting at is that the PAPR spec only
> allows a PAPR guest to use priorities 0..6 (or at least it will if the
> XIVE updated spec ever gets published).  

It's not in the spec. the XIVE sPAPR spec should be frozen soon btw. 
 
> The fact that this allows the
> host use 7 for escalations is a design rationale 
> but not really relevant to the guest device itself. 

The guest should be aware of which priorities are reserved for
the hypervisor though.

>> + */
>> +bool spapr_xive_priority_is_valid(uint8_t priority)
>> +{
>> +    switch (priority) {
>> +    case 0 ... 6:
>> +        return true;
>> +    case 7: /* OPAL escalation queue */
>> +    default:
>> +        return false;
>> +    }
>> +}
>> +
>> +/*
>> + * The H_INT_GET_SOURCE_INFO hcall() is used to obtain the logical
>> + * real address of the MMIO page through which the Event State Buffer
>> + * entry associated with the value of the "lisn" parameter is managed.
>> + *
>> + * Parameters:
>> + * Input
>> + * - "flags"
>> + *       Bits 0-63 reserved
>> + * - "lisn" is per "interrupts", "interrupt-map", or
>> + *       "ibm,xive-lisn-ranges" properties, or as returned by the
>> + *       ibm,query-interrupt-source-number RTAS call, or as returned
>> + *       by the H_ALLOCATE_VAS_WINDOW hcall
> 
> I've not heard of H_ALLOCATE_VAS_WINDOW.  Is that something we intend
> to implement in kvm/qemu, or is it only of interest for PowerVM?

The hcall is part of the PAPR NX Interfaces and it returns interrupt
numbers. I don't know if any work has been done on the topic.  
 
> Also, putting the register numbers on the inputs as well as the
> outputs would be helpful.

yes. I will add them.

>> + *
>> + * Output
>> + * - R4: "flags"
>> + *       Bits 0-59: Reserved
>> + *       Bit 60: H_INT_ESB must be used for Event State Buffer
>> + *               management
>> + *       Bit 61: 1 == LSI  0 == MSI
>> + *       Bit 62: the full function page supports trigger
>> + *       Bit 63: Store EOI Supported
>> + * - R5: Logical Real address of full function Event State Buffer
>> + *       management page, -1 if ESB hcall flag is set to 1.
> 
> You've defined what H_INT_ESB means above, so it will be clearer if
> you reference that by name here.

yes. 

>> + * - R6: Logical Real Address of trigger only Event State Buffer
>> + *       management page or -1.
>> + * - R7: Power of 2 page size for the ESB management pages returned in
>> + *       R5 and R6.
>> + */
>> +
>> +#define SPAPR_XIVE_SRC_H_INT_ESB     PPC_BIT(60) /* ESB manage with H_INT_ESB */
>> +#define SPAPR_XIVE_SRC_LSI           PPC_BIT(61) /* Virtual LSI type */
>> +#define SPAPR_XIVE_SRC_TRIGGER       PPC_BIT(62) /* Trigger and management
>> +                                                    on same page */
>> +#define SPAPR_XIVE_SRC_STORE_EOI     PPC_BIT(63) /* Store EOI support */
> 
> Probably makes sense to put these #defines in spapr.h since they form
> part of the PAPR interface definition.

ok.

> 
>> +static target_ulong h_int_get_source_info(PowerPCCPU *cpu,
>> +                                          sPAPRMachineState *spapr,
>> +                                          target_ulong opcode,
>> +                                          target_ulong *args)
>> +{
>> +    sPAPRXive *xive = spapr->xive;
>> +    XiveSource *xsrc = &xive->source;
>> +    XiveEAS eas;
>> +    target_ulong flags  = args[0];
>> +    target_ulong lisn   = args[1];
>> +
>> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>> +        return H_FUNCTION;
>> +    }
>> +
>> +    if (flags) {
>> +        return H_PARAMETER;
>> +    }
>> +
>> +    if (xive_router_get_eas(XIVE_ROUTER(xive), lisn, &eas)) {
>> +        return H_P2;
>> +    }
>> +
>> +    if (!(eas.w & EAS_VALID)) {
>> +        return H_P2;
>> +    }
>> +
>> +    /* All sources are emulated under the main XIVE object and share
>> +     * the same characteristics.
>> +     */
>> +    args[0] = 0;
>> +    if (!xive_source_esb_has_2page(xsrc)) {
>> +        args[0] |= SPAPR_XIVE_SRC_TRIGGER;
>> +    }
>> +    if (xsrc->esb_flags & XIVE_SRC_STORE_EOI) {
>> +        args[0] |= SPAPR_XIVE_SRC_STORE_EOI;
>> +    }
>> +
>> +    /*
>> +     * Force the use of the H_INT_ESB hcall in case of an LSI
>> +     * interrupt. This is necessary under KVM to re-trigger the
>> +     * interrupt if the level is still asserted
>> +     */
>> +    if (xive_source_irq_is_lsi(xsrc, lisn)) {
>> +        args[0] |= SPAPR_XIVE_SRC_H_INT_ESB | SPAPR_XIVE_SRC_LSI;
>> +    }
>> +
>> +    if (!(args[0] & SPAPR_XIVE_SRC_H_INT_ESB)) {
>> +        args[1] = xive->vc_base + xive_source_esb_mgmt(xsrc, lisn);
>> +    } else {
>> +        args[1] = -1;
>> +    }
>> +
>> +    if (xive_source_esb_has_2page(xsrc)) {
>> +        args[2] = xive->vc_base + xive_source_esb_page(xsrc, lisn);
>> +    } else {
>> +        args[2] = -1;
>> +    }
> 
> Do we also need to keep this address clear in the H_INT_ESB case?

I think not, but the specs are not very clear on that topic. I will
ask for clarification and use a -1 for now. We can not do loads on
the trigger page so it can not be used by the H_INT_ESB hcall.

> 
>> +    args[3] = TARGET_PAGE_SIZE;
> 
> That seems wrong.  

This is utterly wrong. it should be a power of 2 number ... I got
it right under KVM though. I guess that ioremap() under Linux rounds 
up the size to the page size in use, so, that's why it didn't blow
up under TCG.

> TARGET_PAGE_SIZE is generally 4kiB, but won't these usually
> actually be 64kiB?

yes. So what should I use to get a PAGE_SHIFT instead ? 
 
>> +
>> +    return H_SUCCESS;
>> +}
>> +
>> +/*
>> + * The H_INT_SET_SOURCE_CONFIG hcall() is used to assign a Logical
>> + * Interrupt Source to a target. The Logical Interrupt Source is
>> + * designated with the "lisn" parameter and the target is designated
>> + * with the "target" and "priority" parameters.  Upon return from the
>> + * hcall(), no additional interrupts will be directed to the old EQ.
>> + *
>> + * TODO: The old EQ should be investigated for interrupts that
>> + * occurred prior to or during the hcall().
> 
> Isn't that the responsibility of the guest?

It should yes.

> 
>> + *
>> + * Parameters:
>> + * Input:
>> + * - "flags"
>> + *      Bits 0-61: Reserved
>> + *      Bit 62: set the "eisn" in the EA
> 
> What's the "EA"?  Do you mean the EAS?

Another XIVE acronym, EA for Event Assignment. I think we can forget
this one and just use EAS.
 
> 
>> + *      Bit 63: masks the interrupt source in the hardware interrupt
>> + *      control structure. An interrupt masked by this mechanism will
>> + *      be dropped, but it's source state bits will still be
>> + *      set. There is no race-free way of unmasking and restoring the
>> + *      source. Thus this should only be used in interrupts that are
>> + *      also masked at the source, and only in cases where the
>> + *      interrupt is not meant to be used for a large amount of time
>> + *      because no valid target exists for it for example
>> + * - "lisn" is per "interrupts", "interrupt-map", or
>> + *      "ibm,xive-lisn-ranges" properties, or as returned by the
>> + *      ibm,query-interrupt-source-number RTAS call, or as returned by
>> + *      the H_ALLOCATE_VAS_WINDOW hcall
>> + * - "target" is per "ibm,ppc-interrupt-server#s" or
>> + *      "ibm,ppc-interrupt-gserver#s"
>> + * - "priority" is a valid priority not in
>> + *      "ibm,plat-res-int-priorities"
>> + * - "eisn" is the guest EISN associated with the "lisn"
> 
> I don't think the EISN term has been used before in the series.  

Effective Interrupt Source Number, which is the event data enqueued
in the OS EQ.

I'm planning on adding some more acronyms used by the sPAPR hcalls
in this file. There are only a couple.
 
> I'm guessing this is the guest-assigned global interrupt number?

yes 

>> + *
>> + * Output:
>> + * - None
>> + */
>> +
>> +#define SPAPR_XIVE_SRC_SET_EISN PPC_BIT(62)
>> +#define SPAPR_XIVE_SRC_MASK     PPC_BIT(63)
>> +
>> +static target_ulong h_int_set_source_config(PowerPCCPU *cpu,
>> +                                            sPAPRMachineState *spapr,
>> +                                            target_ulong opcode,
>> +                                            target_ulong *args)
>> +{
>> +    sPAPRXive *xive = spapr->xive;
>> +    XiveRouter *xrtr = XIVE_ROUTER(xive);
>> +    XiveEAS eas, new_eas;
>> +    target_ulong flags    = args[0];
>> +    target_ulong lisn     = args[1];
>> +    target_ulong target   = args[2];
>> +    target_ulong priority = args[3];
>> +    target_ulong eisn     = args[4];
>> +    uint8_t end_blk;
>> +    uint32_t end_idx;
>> +
>> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>> +        return H_FUNCTION;
>> +    }
>> +
>> +    if (flags & ~(SPAPR_XIVE_SRC_SET_EISN | SPAPR_XIVE_SRC_MASK)) {
>> +        return H_PARAMETER;
>> +    }
>> +
>> +    if (xive_router_get_eas(xrtr, lisn, &eas)) {
>> +        return H_P2;
>> +    }
>> +
>> +    if (!(eas.w & EAS_VALID)) {
>> +        return H_P2;
>> +    }
>> +
>> +    /* priority 0xff is used to reset the EAS */
>> +    if (priority == 0xff) {
>> +        new_eas.w = EAS_VALID | EAS_MASKED;
>> +        goto out;
>> +    }
>> +
>> +    if (flags & SPAPR_XIVE_SRC_MASK) {
>> +        new_eas.w = eas.w | EAS_MASKED;
>> +    } else {
>> +        new_eas.w = eas.w & ~EAS_MASKED;
>> +    }
>> +
>> +    if (!spapr_xive_priority_is_valid(priority)) {
>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid priority %ld requested\n",
>> +                      priority);
>> +        return H_P4;
>> +    }
>> +
>> +    /* Validate that "target" is part of the list of threads allocated
>> +     * to the partition. For that, find the END corresponding to the
>> +     * target.
>> +     */
>> +    if (spapr_xive_target_to_end(xive, target, priority, &end_blk, &end_idx)) {
>> +        return H_P3;
>> +    }
>> +
>> +    new_eas.w = SETFIELD(EAS_END_BLOCK, new_eas.w, end_blk);
>> +    new_eas.w = SETFIELD(EAS_END_INDEX, new_eas.w, end_idx);
>> +
>> +    if (flags & SPAPR_XIVE_SRC_SET_EISN) {
>> +        new_eas.w = SETFIELD(EAS_END_DATA, new_eas.w, eisn);
>> +    }
>> +
>> +out:
>> +    if (xive_router_set_eas(xrtr, lisn, &new_eas)) {
>> +        return H_HARDWARE;
>> +    }
> 
> As noted earlier in the series, the spapr specific code owns the
> memory backing the EAT, so you can just access it directly rather than
> using a method here.

Yes. I will give a try. I wonder if I need accessors for the tables ? 

> 
>> +
>> +    return H_SUCCESS;
>> +}
>> +
>> +/*
>> + * The H_INT_GET_SOURCE_CONFIG hcall() is used to determine to which
>> + * target/priority pair is assigned to the specified Logical Interrupt
>> + * Source.
>> + *
>> + * Parameters:
>> + * Input:
>> + * - "flags"
>> + *      Bits 0-63 Reserved
>> + * - "lisn" is per "interrupts", "interrupt-map", or
>> + *      "ibm,xive-lisn-ranges" properties, or as returned by the
>> + *      ibm,query-interrupt-source-number RTAS call, or as
>> + *      returned by the H_ALLOCATE_VAS_WINDOW hcall
>> + *
>> + * Output:
>> + * - R4: Target to which the specified Logical Interrupt Source is
>> + *       assigned
>> + * - R5: Priority to which the specified Logical Interrupt Source is
>> + *       assigned
>> + * - R6: EISN for the specified Logical Interrupt Source (this will be
>> + *       equivalent to the LISN if not changed by H_INT_SET_SOURCE_CONFIG)
>> + */
>> +static target_ulong h_int_get_source_config(PowerPCCPU *cpu,
>> +                                            sPAPRMachineState *spapr,
>> +                                            target_ulong opcode,
>> +                                            target_ulong *args)
>> +{
>> +    sPAPRXive *xive = spapr->xive;
>> +    XiveRouter *xrtr = XIVE_ROUTER(xive);
>> +    target_ulong flags = args[0];
>> +    target_ulong lisn = args[1];
>> +    XiveEAS eas;
>> +    XiveEND end;
>> +    uint8_t end_blk, nvt_blk;
>> +    uint32_t end_idx, nvt_idx;
>> +
>> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>> +        return H_FUNCTION;
>> +    }
>> +
>> +    if (flags) {
>> +        return H_PARAMETER;
>> +    }
>> +
>> +    if (xive_router_get_eas(xrtr, lisn, &eas)) {
>> +        return H_P2;
>> +    }
>> +
>> +    if (!(eas.w & EAS_VALID)) {
>> +        return H_P2;
>> +    }
>> +
>> +    end_blk = GETFIELD(EAS_END_BLOCK, eas.w);
>> +    end_idx = GETFIELD(EAS_END_INDEX, eas.w);
>> +    if (xive_router_get_end(xrtr, end_blk, end_idx, &end)) {
>> +        /* Not sure what to return here */
>> +        return H_HARDWARE;
> 
> IIUC this indicates a bug in the PAPR specific code, not the guest, so
> an assert() is probably the right answer.

ok

>> +    }
>> +
>> +    nvt_blk = GETFIELD(END_W6_NVT_BLOCK, end.w6);
>> +    nvt_idx = GETFIELD(END_W6_NVT_INDEX, end.w6);
>> +    args[0] = spapr_xive_nvt_to_target(xive, nvt_blk, nvt_idx);
> 
> AIUI there's a specific END for each target & priority, so you could
> avoid this second level lookup, 

yes 

> although I guess this might be
> valuable if we do more complicated internal routing in the future.

I am not sure of that but I'd rather keep these converting helpers
for the moment.
 
>> +    if (eas.w & EAS_MASKED) {
>> +        args[1] = 0xff;
>> +    } else {
>> +        args[1] = GETFIELD(END_W7_F0_PRIORITY, end.w7);
>> +    }
>> +
>> +    args[2] = GETFIELD(EAS_END_DATA, eas.w);
>> +
>> +    return H_SUCCESS;
>> +}
>> +
>> +/*
>> + * The H_INT_GET_QUEUE_INFO hcall() is used to get the logical real
>> + * address of the notification management page associated with the
>> + * specified target and priority.
>> + *
>> + * Parameters:
>> + * Input:
>> + * - "flags"
>> + *       Bits 0-63 Reserved
>> + * - "target" is per "ibm,ppc-interrupt-server#s" or
>> + *       "ibm,ppc-interrupt-gserver#s"
>> + * - "priority" is a valid priority not in
>> + *       "ibm,plat-res-int-priorities"
>> + *
>> + * Output:
>> + * - R4: Logical real address of notification page
>> + * - R5: Power of 2 page size of the notification page
>> + */
>> +static target_ulong h_int_get_queue_info(PowerPCCPU *cpu,
>> +                                         sPAPRMachineState *spapr,
>> +                                         target_ulong opcode,
>> +                                         target_ulong *args)
>> +{
>> +    sPAPRXive *xive = spapr->xive;
>> +    XiveENDSource *end_xsrc = &xive->end_source;
>> +    target_ulong flags = args[0];
>> +    target_ulong target = args[1];
>> +    target_ulong priority = args[2];
>> +    XiveEND end;
>> +    uint8_t end_blk;
>> +    uint32_t end_idx;
>> +
>> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>> +        return H_FUNCTION;
>> +    }
>> +
>> +    if (flags) {
>> +        return H_PARAMETER;
>> +    }
>> +
>> +    /*
>> +     * H_STATE should be returned if a H_INT_RESET is in progress.
>> +     * This is not needed when running the emulation under QEMU
>> +     */
>> +
>> +    if (!spapr_xive_priority_is_valid(priority)) {
>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid priority %ld requested\n",
>> +                      priority);
>> +        return H_P3;
>> +    }
>> +
>> +    /* Validate that "target" is part of the list of threads allocated
>> +     * to the partition. For that, find the END corresponding to the
>> +     * target.
>> +     */
>> +    if (spapr_xive_target_to_end(xive, target, priority, &end_blk, &end_idx)) {
>> +        return H_P2;
>> +    }
>> +
>> +    if (xive_router_get_end(XIVE_ROUTER(xive), end_blk, end_idx, &end)) {
>> +        return H_HARDWARE;
>> +    }
>> +
>> +    args[0] = xive->end_base + (1ull << (end_xsrc->esb_shift + 1)) * end_idx;
>> +    if (end.w0 & END_W0_ENQUEUE) {
>> +        args[1] = GETFIELD(END_W0_QSIZE, end.w0) + 12;
>> +    } else {
>> +        args[1] = 0;
>> +    }
>> +    return H_SUCCESS;
>> +}
>> +
>> +/*
>> + * The H_INT_SET_QUEUE_CONFIG hcall() is used to set or reset a EQ for
>> + * a given "target" and "priority".  It is also used to set the
>> + * notification config associated with the EQ.  An EQ size of 0 is
>> + * used to reset the EQ config for a given target and priority. If
>> + * resetting the EQ config, the END associated with the given "target"
>> + * and "priority" will be changed to disable queueing.
>> + *
>> + * Upon return from the hcall(), no additional interrupts will be
>> + * directed to the old EQ (if one was set). The old EQ (if one was
>> + * set) should be investigated for interrupts that occurred prior to
>> + * or during the hcall().
>> + *
>> + * Parameters:
>> + * Input:
>> + * - "flags"
>> + *      Bits 0-62: Reserved
>> + *      Bit 63: Unconditional Notify (n) per the XIVE spec
>> + * - "target" is per "ibm,ppc-interrupt-server#s" or
>> + *       "ibm,ppc-interrupt-gserver#s"
>> + * - "priority" is a valid priority not in
>> + *       "ibm,plat-res-int-priorities"
>> + * - "eventQueue": The logical real address of the start of the EQ
>> + * - "eventQueueSize": The power of 2 EQ size per "ibm,xive-eq-sizes"
>> + *
>> + * Output:
>> + * - None
>> + */
>> +
>> +#define SPAPR_XIVE_END_ALWAYS_NOTIFY PPC_BIT(63)
>> +
>> +static target_ulong h_int_set_queue_config(PowerPCCPU *cpu,
>> +                                           sPAPRMachineState *spapr,
>> +                                           target_ulong opcode,
>> +                                           target_ulong *args)
>> +{
>> +    sPAPRXive *xive = spapr->xive;
>> +    XiveRouter *xrtr = XIVE_ROUTER(xive);
>> +    target_ulong flags = args[0];
>> +    target_ulong target = args[1];
>> +    target_ulong priority = args[2];
>> +    target_ulong qpage = args[3];
>> +    target_ulong qsize = args[4];
>> +    XiveEND end;
>> +    uint8_t end_blk, nvt_blk;
>> +    uint32_t end_idx, nvt_idx;
>> +    uint32_t qdata;
>> +
>> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>> +        return H_FUNCTION;
>> +    }
>> +
>> +    if (flags & ~SPAPR_XIVE_END_ALWAYS_NOTIFY) {
>> +        return H_PARAMETER;
>> +    }
>> +
>> +    /*
>> +     * H_STATE should be returned if a H_INT_RESET is in progress.
>> +     * This is not needed when running the emulation under QEMU
>> +     */
>> +
>> +    if (!spapr_xive_priority_is_valid(priority)) {
>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid priority %ld requested\n",
>> +                      priority);
>> +        return H_P3;
>> +    }
>> +
>> +    /* Validate that "target" is part of the list of threads allocated
>> +     * to the partition. For that, find the END corresponding to the
>> +     * target.
>> +     */
>> +
>> +    if (spapr_xive_target_to_end(xive, target, priority, &end_blk, &end_idx)) {
>> +        return H_P2;
>> +    }
>> +
>> +    if (xive_router_get_end(xrtr, end_blk, end_idx, &end)) {
>> +        return H_HARDWARE;
> 
> Again, I think this indicates a qemu (spapr) code bug, so could be an assert().

ok

> 
>> +    }
>> +
>> +    switch (qsize) {
>> +    case 12:
>> +    case 16:
>> +    case 21:
>> +    case 24:
>> +        end.w3 = ((uint64_t)qpage) & 0xffffffff;
> 
> It just occurred to me that I haven't been looking for this across any
> of these reviews.  Don't you need byteswaps when accessing these
> in-memory structures?

yes this is done when some event data is enqueued in the EQ.

> 
>> +        end.w2 = (((uint64_t)qpage)) >> 32 & 0x0fffffff;
>> +        end.w0 |= END_W0_ENQUEUE;
>> +        end.w0 = SETFIELD(END_W0_QSIZE, end.w0, qsize - 12);
>> +        break;
>> +    case 0:
>> +        /* reset queue and disable queueing */
>> +        xive_end_reset(&end);
>> +        goto out;
>> +
>> +    default:
>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid EQ size %"PRIx64"\n",
>> +                      qsize);
>> +        return H_P5;
>> +    }
>> +
>> +    if (qsize) {
>> +        /*
>> +         * Let's validate the EQ address with a read of the first EQ
>> +         * entry. We could also check that the full queue has been
>> +         * zeroed by the OS.
>> +         */
>> +        if (address_space_read(&address_space_memory, qpage,
>> +                               MEMTXATTRS_UNSPECIFIED,
>> +                               (uint8_t *) &qdata, sizeof(qdata))) {
>> +            qemu_log_mask(LOG_GUEST_ERROR, "XIVE: failed to read EQ data @0x%"
>> +                          HWADDR_PRIx "\n", qpage);
>> +            return H_P4;
> 
> Just checking the first entry doesn't seem entirely safe.  Using
> address_space_map() and making sure the returned plen doesn't get
> reduced below the queue size might be a better option.

ok. That was on my todo list.

> 
>> +        }
>> +    }
>> +
>> +    if (spapr_xive_target_to_nvt(xive, target, &nvt_blk, &nvt_idx)) {
>> +        return H_HARDWARE;
> 
> That could be caused by a bogus 'target' value, couldn't it?  

yes. It should have returned H_P2 above when spapr_xive_target_to_end() 
is called.

> In which
> case it a) should probably be checked earlier and b) should be
> H_PARAMETER or similar, not H_HARDWARE, yes?

H_P2 may be again. It should be checked earlier

> 
>> +    }
>> +
>> +    /* Ensure the priority and target are correctly set (they will not
>> +     * be right after allocation)
> 
> AIUI there's a static association from END to target in the PAPR
> model. 

yes. 8 priorities per cpu.

> So it seems to make more sense to get that set up right at
> initialization / reset, rather than doing it lazily when the 
> queue is configured.

Ah. You would preconfigure the word6 and word7 then. Yes, it would
save us some of the conversion fuss. I will look at it.

>> +     */
>> +    end.w6 = SETFIELD(END_W6_NVT_BLOCK, 0ul, nvt_blk) |
>> +        SETFIELD(END_W6_NVT_INDEX, 0ul, nvt_idx);
>> +    end.w7 = SETFIELD(END_W7_F0_PRIORITY, 0ul, priority);
>> +
>> +    if (flags & SPAPR_XIVE_END_ALWAYS_NOTIFY) {
>> +        end.w0 |= END_W0_UCOND_NOTIFY;
>> +    } else {
>> +        end.w0 &= ~END_W0_UCOND_NOTIFY;
>> +    }
>> +
>> +    /* The generation bit for the END starts at 1 and The END page
>> +     * offset counter starts at 0.
>> +     */
>> +    end.w1 = END_W1_GENERATION | SETFIELD(END_W1_PAGE_OFF, 0ul, 0ul);
>> +    end.w0 |= END_W0_VALID;
>> +
>> +    /* TODO: issue syncs required to ensure all in-flight interrupts
>> +     * are complete on the old END */
>> +out:
>> +    /* Update END */
>> +    if (xive_router_set_end(xrtr, end_blk, end_idx, &end)) {
>> +        return H_HARDWARE;
>> +    }
> 
> Again the PAPR code owns the ENDs, so it can update them directly
> rather than going through an abstraction.

ok.

> 
>> +
>> +    return H_SUCCESS;
>> +}
>> +
>> +/*
>> + * The H_INT_GET_QUEUE_CONFIG hcall() is used to get a EQ for a given
>> + * target and priority.
>> + *
>> + * Parameters:
>> + * Input:
>> + * - "flags"
>> + *      Bits 0-62: Reserved
>> + *      Bit 63: Debug: Return debug data
>> + * - "target" is per "ibm,ppc-interrupt-server#s" or
>> + *       "ibm,ppc-interrupt-gserver#s"
>> + * - "priority" is a valid priority not in
>> + *       "ibm,plat-res-int-priorities"
>> + *
>> + * Output:
>> + * - R4: "flags":
>> + *       Bits 0-61: Reserved
>> + *       Bit 62: The value of Event Queue Generation Number (g) per
>> + *              the XIVE spec if "Debug" = 1
>> + *       Bit 63: The value of Unconditional Notify (n) per the XIVE spec
>> + * - R5: The logical real address of the start of the EQ
>> + * - R6: The power of 2 EQ size per "ibm,xive-eq-sizes"
>> + * - R7: The value of Event Queue Offset Counter per XIVE spec
>> + *       if "Debug" = 1, else 0
>> + *
>> + */
>> +
>> +#define SPAPR_XIVE_END_DEBUG     PPC_BIT(63)
>> +
>> +static target_ulong h_int_get_queue_config(PowerPCCPU *cpu,
>> +                                           sPAPRMachineState *spapr,
>> +                                           target_ulong opcode,
>> +                                           target_ulong *args)
>> +{
>> +    sPAPRXive *xive = spapr->xive;
>> +    target_ulong flags = args[0];
>> +    target_ulong target = args[1];
>> +    target_ulong priority = args[2];
>> +    XiveEND end;
>> +    uint8_t end_blk;
>> +    uint32_t end_idx;
>> +
>> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>> +        return H_FUNCTION;
>> +    }
>> +
>> +    if (flags & ~SPAPR_XIVE_END_DEBUG) {
>> +        return H_PARAMETER;
>> +    }
>> +
>> +    /*
>> +     * H_STATE should be returned if a H_INT_RESET is in progress.
>> +     * This is not needed when running the emulation under QEMU
>> +     */
>> +
>> +    if (!spapr_xive_priority_is_valid(priority)) {
>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid priority %ld requested\n",
>> +                      priority);
>> +        return H_P3;
>> +    }
>> +
>> +    /* Validate that "target" is part of the list of threads allocated
>> +     * to the partition. For that, find the END corresponding to the
>> +     * target.
>> +     */
>> +    if (spapr_xive_target_to_end(xive, target, priority, &end_blk, &end_idx)) {
>> +        return H_P2;
>> +    }
>> +
>> +    if (xive_router_get_end(XIVE_ROUTER(xive), end_blk, end_idx, &end)) {
>> +        return H_HARDWARE;
> 
> Again, assert() seems appropriate here.

ok

> 
>> +    }
>> +
>> +    args[0] = 0;
>> +    if (end.w0 & END_W0_UCOND_NOTIFY) {
>> +        args[0] |= SPAPR_XIVE_END_ALWAYS_NOTIFY;
>> +    }
>> +
>> +    if (end.w0 & END_W0_ENQUEUE) {
>> +        args[1] =
>> +            (((uint64_t)(end.w2 & 0x0fffffff)) << 32) | end.w3;
>> +        args[2] = GETFIELD(END_W0_QSIZE, end.w0) + 12;
>> +    } else {
>> +        args[1] = 0;
>> +        args[2] = 0;
>> +    }
>> +
>> +    /* TODO: do we need any locking on the END ? */
>> +    if (flags & SPAPR_XIVE_END_DEBUG) {
>> +        /* Load the event queue generation number into the return flags */
>> +        args[0] |= (uint64_t)GETFIELD(END_W1_GENERATION, end.w1) << 62;
>> +
>> +        /* Load R7 with the event queue offset counter */
>> +        args[3] = GETFIELD(END_W1_PAGE_OFF, end.w1);
>> +    } else {
>> +        args[3] = 0;
>> +    }
>> +
>> +    return H_SUCCESS;
>> +}
>> +
>> +/*
>> + * The H_INT_SET_OS_REPORTING_LINE hcall() is used to set the
>> + * reporting cache line pair for the calling thread.  The reporting
>> + * cache lines will contain the OS interrupt context when the OS
>> + * issues a CI store byte to @TIMA+0xC10 to acknowledge the OS
>> + * interrupt. The reporting cache lines can be reset by inputting -1
>> + * in "reportingLine".  Issuing the CI store byte without reporting
>> + * cache lines registered will result in the data not being accessible
>> + * to the OS.
>> + *
>> + * Parameters:
>> + * Input:
>> + * - "flags"
>> + *      Bits 0-63: Reserved
>> + * - "reportingLine": The logical real address of the reporting cache
>> + *    line pair
>> + *
>> + * Output:
>> + * - None
>> + */
>> +static target_ulong h_int_set_os_reporting_line(PowerPCCPU *cpu,
>> +                                                sPAPRMachineState *spapr,
>> +                                                target_ulong opcode,
>> +                                                target_ulong *args)
>> +{
>> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>> +        return H_FUNCTION;
>> +    }
>> +
>> +    /*
>> +     * H_STATE should be returned if a H_INT_RESET is in progress.
>> +     * This is not needed when running the emulation under QEMU
>> +     */
>> +
>> +    /* TODO: H_INT_SET_OS_REPORTING_LINE */
>> +    return H_FUNCTION;
>> +}
>> +
>> +/*
>> + * The H_INT_GET_OS_REPORTING_LINE hcall() is used to get the logical
>> + * real address of the reporting cache line pair set for the input
>> + * "target".  If no reporting cache line pair has been set, -1 is
>> + * returned.
>> + *
>> + * Parameters:
>> + * Input:
>> + * - "flags"
>> + *      Bits 0-63: Reserved
>> + * - "target" is per "ibm,ppc-interrupt-server#s" or
>> + *       "ibm,ppc-interrupt-gserver#s"
>> + * - "reportingLine": The logical real address of the reporting cache
>> + *   line pair
>> + *
>> + * Output:
>> + * - R4: The logical real address of the reporting line if set, else -1
>> + */
>> +static target_ulong h_int_get_os_reporting_line(PowerPCCPU *cpu,
>> +                                                sPAPRMachineState *spapr,
>> +                                                target_ulong opcode,
>> +                                                target_ulong *args)
>> +{
>> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>> +        return H_FUNCTION;
>> +    }
>> +
>> +    /*
>> +     * H_STATE should be returned if a H_INT_RESET is in progress.
>> +     * This is not needed when running the emulation under QEMU
>> +     */
>> +
>> +    /* TODO: H_INT_GET_OS_REPORTING_LINE */
>> +    return H_FUNCTION;
>> +}
>> +
>> +/*
>> + * The H_INT_ESB hcall() is used to issue a load or store to the ESB
>> + * page for the input "lisn".  This hcall is only supported for LISNs
>> + * that have the ESB hcall flag set to 1 when returned from hcall()
>> + * H_INT_GET_SOURCE_INFO.
> 
> Is there a reason for specifically restricting this to LISNs which
> advertise it, rather than allowing it for anything? 

It's in the specs but I did not implement the check. So H_INT_ESB can be 
used today by the OS for any interrupt number. Same under KVM.

But I should say so somewhere.

> Obviously using
> the direct MMIOs will generally be a faster option when possible, but
> I could see occasions where it might be simpler for the guest to
> always use H_INT_ESB (e.g. for micro-guests like kvm-unit-tests).

can not you use direct load and stores in these guests ? I haven't 
looked at how they are implemented.

> 
>> + * Parameters:
>> + * Input:
>> + * - "flags"
>> + *      Bits 0-62: Reserved
>> + *      bit 63: Store: Store=1, store operation, else load operation
>> + * - "lisn" is per "interrupts", "interrupt-map", or
>> + *      "ibm,xive-lisn-ranges" properties, or as returned by the
>> + *      ibm,query-interrupt-source-number RTAS call, or as
>> + *      returned by the H_ALLOCATE_VAS_WINDOW hcall
>> + * - "esbOffset" is the offset into the ESB page for the load or store operation
>> + * - "storeData" is the data to write for a store operation
>> + *
>> + * Output:
>> + * - R4: R4: The value of the load if load operation, else -1
>> + */
>> +
>> +#define SPAPR_XIVE_ESB_STORE PPC_BIT(63)
>> +
>> +static target_ulong h_int_esb(PowerPCCPU *cpu,
>> +                              sPAPRMachineState *spapr,
>> +                              target_ulong opcode,
>> +                              target_ulong *args)
>> +{
>> +    sPAPRXive *xive = spapr->xive;
>> +    XiveEAS eas;
>> +    target_ulong flags  = args[0];
>> +    target_ulong lisn   = args[1];
>> +    target_ulong offset = args[2];
>> +    target_ulong data   = args[3];
>> +    hwaddr mmio_addr;
>> +    XiveSource *xsrc = &xive->source;
>> +
>> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>> +        return H_FUNCTION;
>> +    }
>> +
>> +    if (flags & ~SPAPR_XIVE_ESB_STORE) {
>> +        return H_PARAMETER;
>> +    }
>> +
>> +    if (xive_router_get_eas(XIVE_ROUTER(xive), lisn, &eas)) {
>> +        return H_P2;
>> +    }
>> +
>> +    if (!(eas.w & EAS_VALID)) {
>> +        return H_P2;
>> +    }
>> +
>> +    if (offset > (1ull << xsrc->esb_shift)) {
>> +        return H_P3;
>> +    }
>> +
>> +    mmio_addr = xive->vc_base + xive_source_esb_mgmt(xsrc, lisn) + offset;
>> +
>> +    if (dma_memory_rw(&address_space_memory, mmio_addr, &data, 8,
>> +                      (flags & SPAPR_XIVE_ESB_STORE))) {
>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: failed to access ESB @0x%"
>> +                      HWADDR_PRIx "\n", mmio_addr);
>> +        return H_HARDWARE;
>> +    }
>> +    args[0] = (flags & SPAPR_XIVE_ESB_STORE) ? -1 : data;
>> +    return H_SUCCESS;
>> +}
>> +
>> +/*
>> + * The H_INT_SYNC hcall() is used to issue hardware syncs that will
>> + * ensure any in flight events for the input lisn are in the event
>> + * queue.
>> + *
>> + * Parameters:
>> + * Input:
>> + * - "flags"
>> + *      Bits 0-63: Reserved
>> + * - "lisn" is per "interrupts", "interrupt-map", or
>> + *      "ibm,xive-lisn-ranges" properties, or as returned by the
>> + *      ibm,query-interrupt-source-number RTAS call, or as
>> + *      returned by the H_ALLOCATE_VAS_WINDOW hcall
>> + *
>> + * Output:
>> + * - None
>> + */
>> +static target_ulong h_int_sync(PowerPCCPU *cpu,
>> +                               sPAPRMachineState *spapr,
>> +                               target_ulong opcode,
>> +                               target_ulong *args)
>> +{
>> +    sPAPRXive *xive = spapr->xive;
>> +    XiveEAS eas;
>> +    target_ulong flags = args[0];
>> +    target_ulong lisn = args[1];
>> +
>> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>> +        return H_FUNCTION;
>> +    }
>> +
>> +    if (flags) {
>> +        return H_PARAMETER;
>> +    }
>> +
>> +    if (xive_router_get_eas(XIVE_ROUTER(xive), lisn, &eas)) {
>> +        return H_P2;
>> +    }
>> +
>> +    if (!(eas.w & EAS_VALID)) {
>> +        return H_P2;
>> +    }
>> +
>> +    /*
>> +     * H_STATE should be returned if a H_INT_RESET is in progress.
>> +     * This is not needed when running the emulation under QEMU
>> +     */
>> +
>> +    /* This is not real hardware. Nothing to be done */
> 
> At least, not as long as all the XIVE operations are under the BQL.

yes.

> 
>> +    return H_SUCCESS;
>> +}
>> +
>> +/*
>> + * The H_INT_RESET hcall() is used to reset all of the partition's
>> + * interrupt exploitation structures to their initial state.  This
>> + * means losing all previously set interrupt state set via
>> + * H_INT_SET_SOURCE_CONFIG and H_INT_SET_QUEUE_CONFIG.
>> + *
>> + * Parameters:
>> + * Input:
>> + * - "flags"
>> + *      Bits 0-63: Reserved
>> + *
>> + * Output:
>> + * - None
>> + */
>> +static target_ulong h_int_reset(PowerPCCPU *cpu,
>> +                                sPAPRMachineState *spapr,
>> +                                target_ulong opcode,
>> +                                target_ulong *args)
>> +{
>> +    sPAPRXive *xive = spapr->xive;
>> +    target_ulong flags   = args[0];
>> +
>> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>> +        return H_FUNCTION;
>> +    }
>> +
>> +    if (flags) {
>> +        return H_PARAMETER;
>> +    }
>> +
>> +    device_reset(DEVICE(xive));
>> +    return H_SUCCESS;
>> +}
>> +
>> +void spapr_xive_hcall_init(sPAPRMachineState *spapr)
>> +{
>> +    spapr_register_hypercall(H_INT_GET_SOURCE_INFO, h_int_get_source_info);
>> +    spapr_register_hypercall(H_INT_SET_SOURCE_CONFIG, h_int_set_source_config);
>> +    spapr_register_hypercall(H_INT_GET_SOURCE_CONFIG, h_int_get_source_config);
>> +    spapr_register_hypercall(H_INT_GET_QUEUE_INFO, h_int_get_queue_info);
>> +    spapr_register_hypercall(H_INT_SET_QUEUE_CONFIG, h_int_set_queue_config);
>> +    spapr_register_hypercall(H_INT_GET_QUEUE_CONFIG, h_int_get_queue_config);
>> +    spapr_register_hypercall(H_INT_SET_OS_REPORTING_LINE,
>> +                             h_int_set_os_reporting_line);
>> +    spapr_register_hypercall(H_INT_GET_OS_REPORTING_LINE,
>> +                             h_int_get_os_reporting_line);
>> +    spapr_register_hypercall(H_INT_ESB, h_int_esb);
>> +    spapr_register_hypercall(H_INT_SYNC, h_int_sync);
>> +    spapr_register_hypercall(H_INT_RESET, h_int_reset);
>> +}
>> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
>> index 2569ae1bc7f8..da6fcfaa3c52 100644
>> --- a/hw/ppc/spapr_irq.c
>> +++ b/hw/ppc/spapr_irq.c
>> @@ -258,6 +258,8 @@ static void spapr_irq_init_xive(sPAPRMachineState *spapr, int nr_irqs,
>>          error_propagate(errp, local_err);
>>          return;
>>      }
>> +
>> +    spapr_xive_hcall_init(spapr);
>>  }
>>  
>>  static int spapr_irq_claim_xive(sPAPRMachineState *spapr, int irq, bool lsi,
>> diff --git a/hw/intc/Makefile.objs b/hw/intc/Makefile.objs
>> index 301a8e972d91..eacd26836ebf 100644
>> --- a/hw/intc/Makefile.objs
>> +++ b/hw/intc/Makefile.objs
>> @@ -38,7 +38,7 @@ obj-$(CONFIG_XICS) += xics.o
>>  obj-$(CONFIG_XICS_SPAPR) += xics_spapr.o
>>  obj-$(CONFIG_XICS_KVM) += xics_kvm.o
>>  obj-$(CONFIG_XIVE) += xive.o
>> -obj-$(CONFIG_XIVE_SPAPR) += spapr_xive.o
>> +obj-$(CONFIG_XIVE_SPAPR) += spapr_xive.o spapr_xive_hcall.o
>>  obj-$(CONFIG_POWERNV) += xics_pnv.o
>>  obj-$(CONFIG_ALLWINNER_A10_PIC) += allwinner-a10-pic.o
>>  obj-$(CONFIG_S390_FLIC) += s390_flic.o
>
David Gibson Nov. 29, 2018, 1:23 a.m. UTC | #3
On Wed, Nov 28, 2018 at 11:21:37PM +0100, Cédric Le Goater wrote:
> On 11/28/18 5:25 AM, David Gibson wrote:
> > On Fri, Nov 16, 2018 at 11:57:09AM +0100, Cédric Le Goater wrote:
> >> The different XIVE virtualization structures (sources and event queues)
> >> are configured with a set of Hypervisor calls :
> >>
> >>  - H_INT_GET_SOURCE_INFO
> >>
> >>    used to obtain the address of the MMIO page of the Event State
> >>    Buffer (ESB) entry associated with the source.
> >>
> >>  - H_INT_SET_SOURCE_CONFIG
> >>
> >>    assigns a source to a "target".
> >>
> >>  - H_INT_GET_SOURCE_CONFIG
> >>
> >>    determines which "target" and "priority" is assigned to a source
> >>
> >>  - H_INT_GET_QUEUE_INFO
> >>
> >>    returns the address of the notification management page associated
> >>    with the specified "target" and "priority".
> >>
> >>  - H_INT_SET_QUEUE_CONFIG
> >>
> >>    sets or resets the event queue for a given "target" and "priority".
> >>    It is also used to set the notification configuration associated
> >>    with the queue, only unconditional notification is supported for
> >>    the moment. Reset is performed with a queue size of 0 and queueing
> >>    is disabled in that case.
> >>
> >>  - H_INT_GET_QUEUE_CONFIG
> >>
> >>    returns the queue settings for a given "target" and "priority".
> >>
> >>  - H_INT_RESET
> >>
> >>    resets all of the guest's internal interrupt structures to their
> >>    initial state, losing all configuration set via the hcalls
> >>    H_INT_SET_SOURCE_CONFIG and H_INT_SET_QUEUE_CONFIG.
> >>
> >>  - H_INT_SYNC
> >>
> >>    issue a synchronisation on a source to make sure all notifications
> >>    have reached their queue.
> >>
> >> Calls that still need to be addressed :
> >>
> >>    H_INT_SET_OS_REPORTING_LINE
> >>    H_INT_GET_OS_REPORTING_LINE
> >>
> >> See the code for more documentation on each hcall.
> >>
> >> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >> ---
> >>  include/hw/ppc/spapr.h      |  15 +-
> >>  include/hw/ppc/spapr_xive.h |   6 +
> >>  hw/intc/spapr_xive_hcall.c  | 892 ++++++++++++++++++++++++++++++++++++
> >>  hw/ppc/spapr_irq.c          |   2 +
> >>  hw/intc/Makefile.objs       |   2 +-
> >>  5 files changed, 915 insertions(+), 2 deletions(-)
> >>  create mode 100644 hw/intc/spapr_xive_hcall.c
> >>
> >> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> >> index 1fbc2663e06c..8415faea7b82 100644
> >> --- a/include/hw/ppc/spapr.h
> >> +++ b/include/hw/ppc/spapr.h
> >> @@ -452,7 +452,20 @@ struct sPAPRMachineState {
> >>  #define H_INVALIDATE_PID        0x378
> >>  #define H_REGISTER_PROC_TBL     0x37C
> >>  #define H_SIGNAL_SYS_RESET      0x380
> >> -#define MAX_HCALL_OPCODE        H_SIGNAL_SYS_RESET
> >> +
> >> +#define H_INT_GET_SOURCE_INFO   0x3A8
> >> +#define H_INT_SET_SOURCE_CONFIG 0x3AC
> >> +#define H_INT_GET_SOURCE_CONFIG 0x3B0
> >> +#define H_INT_GET_QUEUE_INFO    0x3B4
> >> +#define H_INT_SET_QUEUE_CONFIG  0x3B8
> >> +#define H_INT_GET_QUEUE_CONFIG  0x3BC
> >> +#define H_INT_SET_OS_REPORTING_LINE 0x3C0
> >> +#define H_INT_GET_OS_REPORTING_LINE 0x3C4
> >> +#define H_INT_ESB               0x3C8
> >> +#define H_INT_SYNC              0x3CC
> >> +#define H_INT_RESET             0x3D0
> >> +
> >> +#define MAX_HCALL_OPCODE        H_INT_RESET
> >>  
> >>  /* The hcalls above are standardized in PAPR and implemented by pHyp
> >>   * as well.
> >> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
> >> index 3f65b8f485fd..418511f3dc10 100644
> >> --- a/include/hw/ppc/spapr_xive.h
> >> +++ b/include/hw/ppc/spapr_xive.h
> >> @@ -60,4 +60,10 @@ int spapr_xive_target_to_end(sPAPRXive *xive, uint32_t target, uint8_t prio,
> >>  int spapr_xive_cpu_to_end(sPAPRXive *xive, PowerPCCPU *cpu, uint8_t prio,
> >>                            uint8_t *out_end_blk, uint32_t *out_end_idx);
> >>  
> >> +bool spapr_xive_priority_is_valid(uint8_t priority);
> > 
> > AFAICT this could be a local function.
> 
> the KVM model uses it also, when collecting state from the KVM device 
> to build the QEMU ENDT.
> 
> >> +
> >> +typedef struct sPAPRMachineState sPAPRMachineState;
> >> +
> >> +void spapr_xive_hcall_init(sPAPRMachineState *spapr);
> >> +
> >>  #endif /* PPC_SPAPR_XIVE_H */
> >> diff --git a/hw/intc/spapr_xive_hcall.c b/hw/intc/spapr_xive_hcall.c
> >> new file mode 100644
> >> index 000000000000..52e4e23995f5
> >> --- /dev/null
> >> +++ b/hw/intc/spapr_xive_hcall.c
> >> @@ -0,0 +1,892 @@
> >> +/*
> >> + * QEMU PowerPC sPAPR XIVE interrupt controller model
> >> + *
> >> + * Copyright (c) 2017-2018, IBM Corporation.
> >> + *
> >> + * This code is licensed under the GPL version 2 or later. See the
> >> + * COPYING file in the top-level directory.
> >> + */
> >> +
> >> +#include "qemu/osdep.h"
> >> +#include "qemu/log.h"
> >> +#include "qapi/error.h"
> >> +#include "cpu.h"
> >> +#include "hw/ppc/fdt.h"
> >> +#include "hw/ppc/spapr.h"
> >> +#include "hw/ppc/spapr_xive.h"
> >> +#include "hw/ppc/xive_regs.h"
> >> +#include "monitor/monitor.h"
> > 
> > Fwiw, I don't think it's particularly necessary to split the hcall
> > handling out into a separate .c file.
> 
> ok. let's move it to spapr_xive then ? It might help in reducing the 
> exported funtions. 

Yes, I think so.

> >> +/*
> >> + * OPAL uses the priority 7 EQ to automatically escalate interrupts
> >> + * for all other queues (DD2.X POWER9). So only priorities [0..6] are
> >> + * available for the guest.
> > 
> > Referencing OPAL behaviour doesn't really make sense in the context of
> > PAPR.  
> 
> It's an OPAL constraint which pHyp doesn't have. So its a QEMU/KVM 
> constraint also.

Right, I realized that a few patches on.  Maybe rephrase this to

   Linux hosts under OPAL reserve priority 7 for their own escalation
   interrupts.  So we only allow the guest to use priorities [0..6].

The point here is that we're emphasizing that this is a design
decision to make the host implementation easier, rather than a
fundamental constraint.

> > What I think you're getting at is that the PAPR spec only
> > allows a PAPR guest to use priorities 0..6 (or at least it will if the
> > XIVE updated spec ever gets published).  
> 
> It's not in the spec. the XIVE sPAPR spec should be frozen soon btw. 
>  
> > The fact that this allows the
> > host use 7 for escalations is a design rationale 
> > but not really relevant to the guest device itself. 
> 
> The guest should be aware of which priorities are reserved for
> the hypervisor though.
> 
> >> + */
> >> +bool spapr_xive_priority_is_valid(uint8_t priority)
> >> +{
> >> +    switch (priority) {
> >> +    case 0 ... 6:
> >> +        return true;
> >> +    case 7: /* OPAL escalation queue */
> >> +    default:
> >> +        return false;
> >> +    }
> >> +}
> >> +
> >> +/*
> >> + * The H_INT_GET_SOURCE_INFO hcall() is used to obtain the logical
> >> + * real address of the MMIO page through which the Event State Buffer
> >> + * entry associated with the value of the "lisn" parameter is managed.
> >> + *
> >> + * Parameters:
> >> + * Input
> >> + * - "flags"
> >> + *       Bits 0-63 reserved
> >> + * - "lisn" is per "interrupts", "interrupt-map", or
> >> + *       "ibm,xive-lisn-ranges" properties, or as returned by the
> >> + *       ibm,query-interrupt-source-number RTAS call, or as returned
> >> + *       by the H_ALLOCATE_VAS_WINDOW hcall
> > 
> > I've not heard of H_ALLOCATE_VAS_WINDOW.  Is that something we intend
> > to implement in kvm/qemu, or is it only of interest for PowerVM?
> 
> The hcall is part of the PAPR NX Interfaces and it returns interrupt
> numbers. I don't know if any work has been done on the topic.  

What's a "PAPR NX"?

> > Also, putting the register numbers on the inputs as well as the
> > outputs would be helpful.
> 
> yes. I will add them.
> 
> >> + *
> >> + * Output
> >> + * - R4: "flags"
> >> + *       Bits 0-59: Reserved
> >> + *       Bit 60: H_INT_ESB must be used for Event State Buffer
> >> + *               management
> >> + *       Bit 61: 1 == LSI  0 == MSI
> >> + *       Bit 62: the full function page supports trigger
> >> + *       Bit 63: Store EOI Supported
> >> + * - R5: Logical Real address of full function Event State Buffer
> >> + *       management page, -1 if ESB hcall flag is set to 1.
> > 
> > You've defined what H_INT_ESB means above, so it will be clearer if
> > you reference that by name here.
> 
> yes. 
> 
> >> + * - R6: Logical Real Address of trigger only Event State Buffer
> >> + *       management page or -1.
> >> + * - R7: Power of 2 page size for the ESB management pages returned in
> >> + *       R5 and R6.
> >> + */
> >> +
> >> +#define SPAPR_XIVE_SRC_H_INT_ESB     PPC_BIT(60) /* ESB manage with H_INT_ESB */
> >> +#define SPAPR_XIVE_SRC_LSI           PPC_BIT(61) /* Virtual LSI type */
> >> +#define SPAPR_XIVE_SRC_TRIGGER       PPC_BIT(62) /* Trigger and management
> >> +                                                    on same page */
> >> +#define SPAPR_XIVE_SRC_STORE_EOI     PPC_BIT(63) /* Store EOI support */
> > 
> > Probably makes sense to put these #defines in spapr.h since they form
> > part of the PAPR interface definition.
> 
> ok.
> 
> > 
> >> +static target_ulong h_int_get_source_info(PowerPCCPU *cpu,
> >> +                                          sPAPRMachineState *spapr,
> >> +                                          target_ulong opcode,
> >> +                                          target_ulong *args)
> >> +{
> >> +    sPAPRXive *xive = spapr->xive;
> >> +    XiveSource *xsrc = &xive->source;
> >> +    XiveEAS eas;
> >> +    target_ulong flags  = args[0];
> >> +    target_ulong lisn   = args[1];
> >> +
> >> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> >> +        return H_FUNCTION;
> >> +    }
> >> +
> >> +    if (flags) {
> >> +        return H_PARAMETER;
> >> +    }
> >> +
> >> +    if (xive_router_get_eas(XIVE_ROUTER(xive), lisn, &eas)) {
> >> +        return H_P2;
> >> +    }
> >> +
> >> +    if (!(eas.w & EAS_VALID)) {
> >> +        return H_P2;
> >> +    }
> >> +
> >> +    /* All sources are emulated under the main XIVE object and share
> >> +     * the same characteristics.
> >> +     */
> >> +    args[0] = 0;
> >> +    if (!xive_source_esb_has_2page(xsrc)) {
> >> +        args[0] |= SPAPR_XIVE_SRC_TRIGGER;
> >> +    }
> >> +    if (xsrc->esb_flags & XIVE_SRC_STORE_EOI) {
> >> +        args[0] |= SPAPR_XIVE_SRC_STORE_EOI;
> >> +    }
> >> +
> >> +    /*
> >> +     * Force the use of the H_INT_ESB hcall in case of an LSI
> >> +     * interrupt. This is necessary under KVM to re-trigger the
> >> +     * interrupt if the level is still asserted
> >> +     */
> >> +    if (xive_source_irq_is_lsi(xsrc, lisn)) {
> >> +        args[0] |= SPAPR_XIVE_SRC_H_INT_ESB | SPAPR_XIVE_SRC_LSI;
> >> +    }
> >> +
> >> +    if (!(args[0] & SPAPR_XIVE_SRC_H_INT_ESB)) {
> >> +        args[1] = xive->vc_base + xive_source_esb_mgmt(xsrc, lisn);
> >> +    } else {
> >> +        args[1] = -1;
> >> +    }
> >> +
> >> +    if (xive_source_esb_has_2page(xsrc)) {
> >> +        args[2] = xive->vc_base + xive_source_esb_page(xsrc, lisn);
> >> +    } else {
> >> +        args[2] = -1;
> >> +    }
> > 
> > Do we also need to keep this address clear in the H_INT_ESB case?
> 
> I think not, but the specs are not very clear on that topic. I will
> ask for clarification and use a -1 for now. We can not do loads on
> the trigger page so it can not be used by the H_INT_ESB hcall.
> 
> > 
> >> +    args[3] = TARGET_PAGE_SIZE;
> > 
> > That seems wrong.  
> 
> This is utterly wrong. it should be a power of 2 number ... I got
> it right under KVM though. I guess that ioremap() under Linux rounds 
> up the size to the page size in use, so, that's why it didn't blow
> up under TCG.
> 
> > TARGET_PAGE_SIZE is generally 4kiB, but won't these usually
> > actually be 64kiB?
> 
> yes. So what should I use to get a PAGE_SHIFT instead ? 

Erm, that gets a bit tricky, since qemu in a sense doesn't know the
guest's page size.

But.. don't you actually want the esb_shift here, not PAGE_SHIFT - it
could matter for the 2 page * 64kiB variant, yes?

> >> +
> >> +    return H_SUCCESS;
> >> +}
> >> +
> >> +/*
> >> + * The H_INT_SET_SOURCE_CONFIG hcall() is used to assign a Logical
> >> + * Interrupt Source to a target. The Logical Interrupt Source is
> >> + * designated with the "lisn" parameter and the target is designated
> >> + * with the "target" and "priority" parameters.  Upon return from the
> >> + * hcall(), no additional interrupts will be directed to the old EQ.
> >> + *
> >> + * TODO: The old EQ should be investigated for interrupts that
> >> + * occurred prior to or during the hcall().
> > 
> > Isn't that the responsibility of the guest?
> 
> It should yes.

Right, so not a TODO for the qemu code.

> 
> > 
> >> + *
> >> + * Parameters:
> >> + * Input:
> >> + * - "flags"
> >> + *      Bits 0-61: Reserved
> >> + *      Bit 62: set the "eisn" in the EA
> > 
> > What's the "EA"?  Do you mean the EAS?
> 
> Another XIVE acronym, EA for Event Assignment. I think we can forget
> this one and just use EAS.
>  
> > 
> >> + *      Bit 63: masks the interrupt source in the hardware interrupt
> >> + *      control structure. An interrupt masked by this mechanism will
> >> + *      be dropped, but it's source state bits will still be
> >> + *      set. There is no race-free way of unmasking and restoring the
> >> + *      source. Thus this should only be used in interrupts that are
> >> + *      also masked at the source, and only in cases where the
> >> + *      interrupt is not meant to be used for a large amount of time
> >> + *      because no valid target exists for it for example
> >> + * - "lisn" is per "interrupts", "interrupt-map", or
> >> + *      "ibm,xive-lisn-ranges" properties, or as returned by the
> >> + *      ibm,query-interrupt-source-number RTAS call, or as returned by
> >> + *      the H_ALLOCATE_VAS_WINDOW hcall
> >> + * - "target" is per "ibm,ppc-interrupt-server#s" or
> >> + *      "ibm,ppc-interrupt-gserver#s"
> >> + * - "priority" is a valid priority not in
> >> + *      "ibm,plat-res-int-priorities"
> >> + * - "eisn" is the guest EISN associated with the "lisn"
> > 
> > I don't think the EISN term has been used before in the series.  
> 
> Effective Interrupt Source Number, which is the event data enqueued
> in the OS EQ.
> 
> I'm planning on adding some more acronyms used by the sPAPR hcalls
> in this file. There are only a couple.

That would be helpful.

> > I'm guessing this is the guest-assigned global interrupt number?
> 
> yes 
> 
> >> + *
> >> + * Output:
> >> + * - None
> >> + */
> >> +
> >> +#define SPAPR_XIVE_SRC_SET_EISN PPC_BIT(62)
> >> +#define SPAPR_XIVE_SRC_MASK     PPC_BIT(63)
> >> +
> >> +static target_ulong h_int_set_source_config(PowerPCCPU *cpu,
> >> +                                            sPAPRMachineState *spapr,
> >> +                                            target_ulong opcode,
> >> +                                            target_ulong *args)
> >> +{
> >> +    sPAPRXive *xive = spapr->xive;
> >> +    XiveRouter *xrtr = XIVE_ROUTER(xive);
> >> +    XiveEAS eas, new_eas;
> >> +    target_ulong flags    = args[0];
> >> +    target_ulong lisn     = args[1];
> >> +    target_ulong target   = args[2];
> >> +    target_ulong priority = args[3];
> >> +    target_ulong eisn     = args[4];
> >> +    uint8_t end_blk;
> >> +    uint32_t end_idx;
> >> +
> >> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> >> +        return H_FUNCTION;
> >> +    }
> >> +
> >> +    if (flags & ~(SPAPR_XIVE_SRC_SET_EISN | SPAPR_XIVE_SRC_MASK)) {
> >> +        return H_PARAMETER;
> >> +    }
> >> +
> >> +    if (xive_router_get_eas(xrtr, lisn, &eas)) {
> >> +        return H_P2;
> >> +    }
> >> +
> >> +    if (!(eas.w & EAS_VALID)) {
> >> +        return H_P2;
> >> +    }
> >> +
> >> +    /* priority 0xff is used to reset the EAS */
> >> +    if (priority == 0xff) {
> >> +        new_eas.w = EAS_VALID | EAS_MASKED;
> >> +        goto out;
> >> +    }
> >> +
> >> +    if (flags & SPAPR_XIVE_SRC_MASK) {
> >> +        new_eas.w = eas.w | EAS_MASKED;
> >> +    } else {
> >> +        new_eas.w = eas.w & ~EAS_MASKED;
> >> +    }
> >> +
> >> +    if (!spapr_xive_priority_is_valid(priority)) {
> >> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid priority %ld requested\n",
> >> +                      priority);
> >> +        return H_P4;
> >> +    }
> >> +
> >> +    /* Validate that "target" is part of the list of threads allocated
> >> +     * to the partition. For that, find the END corresponding to the
> >> +     * target.
> >> +     */
> >> +    if (spapr_xive_target_to_end(xive, target, priority, &end_blk, &end_idx)) {
> >> +        return H_P3;
> >> +    }
> >> +
> >> +    new_eas.w = SETFIELD(EAS_END_BLOCK, new_eas.w, end_blk);
> >> +    new_eas.w = SETFIELD(EAS_END_INDEX, new_eas.w, end_idx);
> >> +
> >> +    if (flags & SPAPR_XIVE_SRC_SET_EISN) {
> >> +        new_eas.w = SETFIELD(EAS_END_DATA, new_eas.w, eisn);
> >> +    }
> >> +
> >> +out:
> >> +    if (xive_router_set_eas(xrtr, lisn, &new_eas)) {
> >> +        return H_HARDWARE;
> >> +    }
> > 
> > As noted earlier in the series, the spapr specific code owns the
> > memory backing the EAT, so you can just access it directly rather than
> > using a method here.
> 
> Yes. I will give a try. I wonder if I need accessors for the tables
> ?

You'll still need the read accessor since the routing core uses that.
I don't think you need a write accessor though.

> 
> > 
> >> +
> >> +    return H_SUCCESS;
> >> +}
> >> +
> >> +/*
> >> + * The H_INT_GET_SOURCE_CONFIG hcall() is used to determine to which
> >> + * target/priority pair is assigned to the specified Logical Interrupt
> >> + * Source.
> >> + *
> >> + * Parameters:
> >> + * Input:
> >> + * - "flags"
> >> + *      Bits 0-63 Reserved
> >> + * - "lisn" is per "interrupts", "interrupt-map", or
> >> + *      "ibm,xive-lisn-ranges" properties, or as returned by the
> >> + *      ibm,query-interrupt-source-number RTAS call, or as
> >> + *      returned by the H_ALLOCATE_VAS_WINDOW hcall
> >> + *
> >> + * Output:
> >> + * - R4: Target to which the specified Logical Interrupt Source is
> >> + *       assigned
> >> + * - R5: Priority to which the specified Logical Interrupt Source is
> >> + *       assigned
> >> + * - R6: EISN for the specified Logical Interrupt Source (this will be
> >> + *       equivalent to the LISN if not changed by H_INT_SET_SOURCE_CONFIG)
> >> + */
> >> +static target_ulong h_int_get_source_config(PowerPCCPU *cpu,
> >> +                                            sPAPRMachineState *spapr,
> >> +                                            target_ulong opcode,
> >> +                                            target_ulong *args)
> >> +{
> >> +    sPAPRXive *xive = spapr->xive;
> >> +    XiveRouter *xrtr = XIVE_ROUTER(xive);
> >> +    target_ulong flags = args[0];
> >> +    target_ulong lisn = args[1];
> >> +    XiveEAS eas;
> >> +    XiveEND end;
> >> +    uint8_t end_blk, nvt_blk;
> >> +    uint32_t end_idx, nvt_idx;
> >> +
> >> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> >> +        return H_FUNCTION;
> >> +    }
> >> +
> >> +    if (flags) {
> >> +        return H_PARAMETER;
> >> +    }
> >> +
> >> +    if (xive_router_get_eas(xrtr, lisn, &eas)) {
> >> +        return H_P2;
> >> +    }
> >> +
> >> +    if (!(eas.w & EAS_VALID)) {
> >> +        return H_P2;
> >> +    }
> >> +
> >> +    end_blk = GETFIELD(EAS_END_BLOCK, eas.w);
> >> +    end_idx = GETFIELD(EAS_END_INDEX, eas.w);
> >> +    if (xive_router_get_end(xrtr, end_blk, end_idx, &end)) {
> >> +        /* Not sure what to return here */
> >> +        return H_HARDWARE;
> > 
> > IIUC this indicates a bug in the PAPR specific code, not the guest, so
> > an assert() is probably the right answer.
> 
> ok
> 
> >> +    }
> >> +
> >> +    nvt_blk = GETFIELD(END_W6_NVT_BLOCK, end.w6);
> >> +    nvt_idx = GETFIELD(END_W6_NVT_INDEX, end.w6);
> >> +    args[0] = spapr_xive_nvt_to_target(xive, nvt_blk, nvt_idx);
> > 
> > AIUI there's a specific END for each target & priority, so you could
> > avoid this second level lookup, 
> 
> yes 
> 
> > although I guess this might be
> > valuable if we do more complicated internal routing in the future.
> 
> I am not sure of that but I'd rather keep these converting helpers
> for the moment.

Ok.

> >> +    if (eas.w & EAS_MASKED) {
> >> +        args[1] = 0xff;
> >> +    } else {
> >> +        args[1] = GETFIELD(END_W7_F0_PRIORITY, end.w7);
> >> +    }
> >> +
> >> +    args[2] = GETFIELD(EAS_END_DATA, eas.w);
> >> +
> >> +    return H_SUCCESS;
> >> +}
> >> +
> >> +/*
> >> + * The H_INT_GET_QUEUE_INFO hcall() is used to get the logical real
> >> + * address of the notification management page associated with the
> >> + * specified target and priority.
> >> + *
> >> + * Parameters:
> >> + * Input:
> >> + * - "flags"
> >> + *       Bits 0-63 Reserved
> >> + * - "target" is per "ibm,ppc-interrupt-server#s" or
> >> + *       "ibm,ppc-interrupt-gserver#s"
> >> + * - "priority" is a valid priority not in
> >> + *       "ibm,plat-res-int-priorities"
> >> + *
> >> + * Output:
> >> + * - R4: Logical real address of notification page
> >> + * - R5: Power of 2 page size of the notification page
> >> + */
> >> +static target_ulong h_int_get_queue_info(PowerPCCPU *cpu,
> >> +                                         sPAPRMachineState *spapr,
> >> +                                         target_ulong opcode,
> >> +                                         target_ulong *args)
> >> +{
> >> +    sPAPRXive *xive = spapr->xive;
> >> +    XiveENDSource *end_xsrc = &xive->end_source;
> >> +    target_ulong flags = args[0];
> >> +    target_ulong target = args[1];
> >> +    target_ulong priority = args[2];
> >> +    XiveEND end;
> >> +    uint8_t end_blk;
> >> +    uint32_t end_idx;
> >> +
> >> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> >> +        return H_FUNCTION;
> >> +    }
> >> +
> >> +    if (flags) {
> >> +        return H_PARAMETER;
> >> +    }
> >> +
> >> +    /*
> >> +     * H_STATE should be returned if a H_INT_RESET is in progress.
> >> +     * This is not needed when running the emulation under QEMU
> >> +     */
> >> +
> >> +    if (!spapr_xive_priority_is_valid(priority)) {
> >> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid priority %ld requested\n",
> >> +                      priority);
> >> +        return H_P3;
> >> +    }
> >> +
> >> +    /* Validate that "target" is part of the list of threads allocated
> >> +     * to the partition. For that, find the END corresponding to the
> >> +     * target.
> >> +     */
> >> +    if (spapr_xive_target_to_end(xive, target, priority, &end_blk, &end_idx)) {
> >> +        return H_P2;
> >> +    }
> >> +
> >> +    if (xive_router_get_end(XIVE_ROUTER(xive), end_blk, end_idx, &end)) {
> >> +        return H_HARDWARE;
> >> +    }
> >> +
> >> +    args[0] = xive->end_base + (1ull << (end_xsrc->esb_shift + 1)) * end_idx;
> >> +    if (end.w0 & END_W0_ENQUEUE) {
> >> +        args[1] = GETFIELD(END_W0_QSIZE, end.w0) + 12;
> >> +    } else {
> >> +        args[1] = 0;
> >> +    }
> >> +    return H_SUCCESS;
> >> +}
> >> +
> >> +/*
> >> + * The H_INT_SET_QUEUE_CONFIG hcall() is used to set or reset a EQ for
> >> + * a given "target" and "priority".  It is also used to set the
> >> + * notification config associated with the EQ.  An EQ size of 0 is
> >> + * used to reset the EQ config for a given target and priority. If
> >> + * resetting the EQ config, the END associated with the given "target"
> >> + * and "priority" will be changed to disable queueing.
> >> + *
> >> + * Upon return from the hcall(), no additional interrupts will be
> >> + * directed to the old EQ (if one was set). The old EQ (if one was
> >> + * set) should be investigated for interrupts that occurred prior to
> >> + * or during the hcall().
> >> + *
> >> + * Parameters:
> >> + * Input:
> >> + * - "flags"
> >> + *      Bits 0-62: Reserved
> >> + *      Bit 63: Unconditional Notify (n) per the XIVE spec
> >> + * - "target" is per "ibm,ppc-interrupt-server#s" or
> >> + *       "ibm,ppc-interrupt-gserver#s"
> >> + * - "priority" is a valid priority not in
> >> + *       "ibm,plat-res-int-priorities"
> >> + * - "eventQueue": The logical real address of the start of the EQ
> >> + * - "eventQueueSize": The power of 2 EQ size per "ibm,xive-eq-sizes"
> >> + *
> >> + * Output:
> >> + * - None
> >> + */
> >> +
> >> +#define SPAPR_XIVE_END_ALWAYS_NOTIFY PPC_BIT(63)
> >> +
> >> +static target_ulong h_int_set_queue_config(PowerPCCPU *cpu,
> >> +                                           sPAPRMachineState *spapr,
> >> +                                           target_ulong opcode,
> >> +                                           target_ulong *args)
> >> +{
> >> +    sPAPRXive *xive = spapr->xive;
> >> +    XiveRouter *xrtr = XIVE_ROUTER(xive);
> >> +    target_ulong flags = args[0];
> >> +    target_ulong target = args[1];
> >> +    target_ulong priority = args[2];
> >> +    target_ulong qpage = args[3];
> >> +    target_ulong qsize = args[4];
> >> +    XiveEND end;
> >> +    uint8_t end_blk, nvt_blk;
> >> +    uint32_t end_idx, nvt_idx;
> >> +    uint32_t qdata;
> >> +
> >> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> >> +        return H_FUNCTION;
> >> +    }
> >> +
> >> +    if (flags & ~SPAPR_XIVE_END_ALWAYS_NOTIFY) {
> >> +        return H_PARAMETER;
> >> +    }
> >> +
> >> +    /*
> >> +     * H_STATE should be returned if a H_INT_RESET is in progress.
> >> +     * This is not needed when running the emulation under QEMU
> >> +     */
> >> +
> >> +    if (!spapr_xive_priority_is_valid(priority)) {
> >> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid priority %ld requested\n",
> >> +                      priority);
> >> +        return H_P3;
> >> +    }
> >> +
> >> +    /* Validate that "target" is part of the list of threads allocated
> >> +     * to the partition. For that, find the END corresponding to the
> >> +     * target.
> >> +     */
> >> +
> >> +    if (spapr_xive_target_to_end(xive, target, priority, &end_blk, &end_idx)) {
> >> +        return H_P2;
> >> +    }
> >> +
> >> +    if (xive_router_get_end(xrtr, end_blk, end_idx, &end)) {
> >> +        return H_HARDWARE;
> > 
> > Again, I think this indicates a qemu (spapr) code bug, so could be an assert().
> 
> ok
> 
> > 
> >> +    }
> >> +
> >> +    switch (qsize) {
> >> +    case 12:
> >> +    case 16:
> >> +    case 21:
> >> +    case 24:
> >> +        end.w3 = ((uint64_t)qpage) & 0xffffffff;
> > 
> > It just occurred to me that I haven't been looking for this across any
> > of these reviews.  Don't you need byteswaps when accessing these
> > in-memory structures?
> 
> yes this is done when some event data is enqueued in the EQ.

I'm not talking about the data in the EQ itself, but the fields in the
END (and the NVT).

> 
> > 
> >> +        end.w2 = (((uint64_t)qpage)) >> 32 & 0x0fffffff;
> >> +        end.w0 |= END_W0_ENQUEUE;
> >> +        end.w0 = SETFIELD(END_W0_QSIZE, end.w0, qsize - 12);
> >> +        break;
> >> +    case 0:
> >> +        /* reset queue and disable queueing */
> >> +        xive_end_reset(&end);
> >> +        goto out;
> >> +
> >> +    default:
> >> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid EQ size %"PRIx64"\n",
> >> +                      qsize);
> >> +        return H_P5;
> >> +    }
> >> +
> >> +    if (qsize) {
> >> +        /*
> >> +         * Let's validate the EQ address with a read of the first EQ
> >> +         * entry. We could also check that the full queue has been
> >> +         * zeroed by the OS.
> >> +         */
> >> +        if (address_space_read(&address_space_memory, qpage,
> >> +                               MEMTXATTRS_UNSPECIFIED,
> >> +                               (uint8_t *) &qdata, sizeof(qdata))) {
> >> +            qemu_log_mask(LOG_GUEST_ERROR, "XIVE: failed to read EQ data @0x%"
> >> +                          HWADDR_PRIx "\n", qpage);
> >> +            return H_P4;
> > 
> > Just checking the first entry doesn't seem entirely safe.  Using
> > address_space_map() and making sure the returned plen doesn't get
> > reduced below the queue size might be a better option.
> 
> ok. That was on my todo list.
> 
> > 
> >> +        }
> >> +    }
> >> +
> >> +    if (spapr_xive_target_to_nvt(xive, target, &nvt_blk, &nvt_idx)) {
> >> +        return H_HARDWARE;
> > 
> > That could be caused by a bogus 'target' value, couldn't it?  
> 
> yes. It should have returned H_P2 above when spapr_xive_target_to_end() 
> is called.
> 
> > In which
> > case it a) should probably be checked earlier and b) should be
> > H_PARAMETER or similar, not H_HARDWARE, yes?
> 
> H_P2 may be again. It should be checked earlier
> 
> > 
> >> +    }
> >> +
> >> +    /* Ensure the priority and target are correctly set (they will not
> >> +     * be right after allocation)
> > 
> > AIUI there's a static association from END to target in the PAPR
> > model. 
> 
> yes. 8 priorities per cpu.
> 
> > So it seems to make more sense to get that set up right at
> > initialization / reset, rather than doing it lazily when the 
> > queue is configured.
> 
> Ah. You would preconfigure the word6 and word7 then. Yes, it would
> save us some of the conversion fuss. I will look at it.
> 
> >> +     */
> >> +    end.w6 = SETFIELD(END_W6_NVT_BLOCK, 0ul, nvt_blk) |
> >> +        SETFIELD(END_W6_NVT_INDEX, 0ul, nvt_idx);
> >> +    end.w7 = SETFIELD(END_W7_F0_PRIORITY, 0ul, priority);
> >> +
> >> +    if (flags & SPAPR_XIVE_END_ALWAYS_NOTIFY) {
> >> +        end.w0 |= END_W0_UCOND_NOTIFY;
> >> +    } else {
> >> +        end.w0 &= ~END_W0_UCOND_NOTIFY;
> >> +    }
> >> +
> >> +    /* The generation bit for the END starts at 1 and The END page
> >> +     * offset counter starts at 0.
> >> +     */
> >> +    end.w1 = END_W1_GENERATION | SETFIELD(END_W1_PAGE_OFF, 0ul, 0ul);
> >> +    end.w0 |= END_W0_VALID;
> >> +
> >> +    /* TODO: issue syncs required to ensure all in-flight interrupts
> >> +     * are complete on the old END */
> >> +out:
> >> +    /* Update END */
> >> +    if (xive_router_set_end(xrtr, end_blk, end_idx, &end)) {
> >> +        return H_HARDWARE;
> >> +    }
> > 
> > Again the PAPR code owns the ENDs, so it can update them directly
> > rather than going through an abstraction.
> 
> ok.
> 
> > 
> >> +
> >> +    return H_SUCCESS;
> >> +}
> >> +
> >> +/*
> >> + * The H_INT_GET_QUEUE_CONFIG hcall() is used to get a EQ for a given
> >> + * target and priority.
> >> + *
> >> + * Parameters:
> >> + * Input:
> >> + * - "flags"
> >> + *      Bits 0-62: Reserved
> >> + *      Bit 63: Debug: Return debug data
> >> + * - "target" is per "ibm,ppc-interrupt-server#s" or
> >> + *       "ibm,ppc-interrupt-gserver#s"
> >> + * - "priority" is a valid priority not in
> >> + *       "ibm,plat-res-int-priorities"
> >> + *
> >> + * Output:
> >> + * - R4: "flags":
> >> + *       Bits 0-61: Reserved
> >> + *       Bit 62: The value of Event Queue Generation Number (g) per
> >> + *              the XIVE spec if "Debug" = 1
> >> + *       Bit 63: The value of Unconditional Notify (n) per the XIVE spec
> >> + * - R5: The logical real address of the start of the EQ
> >> + * - R6: The power of 2 EQ size per "ibm,xive-eq-sizes"
> >> + * - R7: The value of Event Queue Offset Counter per XIVE spec
> >> + *       if "Debug" = 1, else 0
> >> + *
> >> + */
> >> +
> >> +#define SPAPR_XIVE_END_DEBUG     PPC_BIT(63)
> >> +
> >> +static target_ulong h_int_get_queue_config(PowerPCCPU *cpu,
> >> +                                           sPAPRMachineState *spapr,
> >> +                                           target_ulong opcode,
> >> +                                           target_ulong *args)
> >> +{
> >> +    sPAPRXive *xive = spapr->xive;
> >> +    target_ulong flags = args[0];
> >> +    target_ulong target = args[1];
> >> +    target_ulong priority = args[2];
> >> +    XiveEND end;
> >> +    uint8_t end_blk;
> >> +    uint32_t end_idx;
> >> +
> >> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> >> +        return H_FUNCTION;
> >> +    }
> >> +
> >> +    if (flags & ~SPAPR_XIVE_END_DEBUG) {
> >> +        return H_PARAMETER;
> >> +    }
> >> +
> >> +    /*
> >> +     * H_STATE should be returned if a H_INT_RESET is in progress.
> >> +     * This is not needed when running the emulation under QEMU
> >> +     */
> >> +
> >> +    if (!spapr_xive_priority_is_valid(priority)) {
> >> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid priority %ld requested\n",
> >> +                      priority);
> >> +        return H_P3;
> >> +    }
> >> +
> >> +    /* Validate that "target" is part of the list of threads allocated
> >> +     * to the partition. For that, find the END corresponding to the
> >> +     * target.
> >> +     */
> >> +    if (spapr_xive_target_to_end(xive, target, priority, &end_blk, &end_idx)) {
> >> +        return H_P2;
> >> +    }
> >> +
> >> +    if (xive_router_get_end(XIVE_ROUTER(xive), end_blk, end_idx, &end)) {
> >> +        return H_HARDWARE;
> > 
> > Again, assert() seems appropriate here.
> 
> ok
> 
> > 
> >> +    }
> >> +
> >> +    args[0] = 0;
> >> +    if (end.w0 & END_W0_UCOND_NOTIFY) {
> >> +        args[0] |= SPAPR_XIVE_END_ALWAYS_NOTIFY;
> >> +    }
> >> +
> >> +    if (end.w0 & END_W0_ENQUEUE) {
> >> +        args[1] =
> >> +            (((uint64_t)(end.w2 & 0x0fffffff)) << 32) | end.w3;
> >> +        args[2] = GETFIELD(END_W0_QSIZE, end.w0) + 12;
> >> +    } else {
> >> +        args[1] = 0;
> >> +        args[2] = 0;
> >> +    }
> >> +
> >> +    /* TODO: do we need any locking on the END ? */
> >> +    if (flags & SPAPR_XIVE_END_DEBUG) {
> >> +        /* Load the event queue generation number into the return flags */
> >> +        args[0] |= (uint64_t)GETFIELD(END_W1_GENERATION, end.w1) << 62;
> >> +
> >> +        /* Load R7 with the event queue offset counter */
> >> +        args[3] = GETFIELD(END_W1_PAGE_OFF, end.w1);
> >> +    } else {
> >> +        args[3] = 0;
> >> +    }
> >> +
> >> +    return H_SUCCESS;
> >> +}
> >> +
> >> +/*
> >> + * The H_INT_SET_OS_REPORTING_LINE hcall() is used to set the
> >> + * reporting cache line pair for the calling thread.  The reporting
> >> + * cache lines will contain the OS interrupt context when the OS
> >> + * issues a CI store byte to @TIMA+0xC10 to acknowledge the OS
> >> + * interrupt. The reporting cache lines can be reset by inputting -1
> >> + * in "reportingLine".  Issuing the CI store byte without reporting
> >> + * cache lines registered will result in the data not being accessible
> >> + * to the OS.
> >> + *
> >> + * Parameters:
> >> + * Input:
> >> + * - "flags"
> >> + *      Bits 0-63: Reserved
> >> + * - "reportingLine": The logical real address of the reporting cache
> >> + *    line pair
> >> + *
> >> + * Output:
> >> + * - None
> >> + */
> >> +static target_ulong h_int_set_os_reporting_line(PowerPCCPU *cpu,
> >> +                                                sPAPRMachineState *spapr,
> >> +                                                target_ulong opcode,
> >> +                                                target_ulong *args)
> >> +{
> >> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> >> +        return H_FUNCTION;
> >> +    }
> >> +
> >> +    /*
> >> +     * H_STATE should be returned if a H_INT_RESET is in progress.
> >> +     * This is not needed when running the emulation under QEMU
> >> +     */
> >> +
> >> +    /* TODO: H_INT_SET_OS_REPORTING_LINE */
> >> +    return H_FUNCTION;
> >> +}
> >> +
> >> +/*
> >> + * The H_INT_GET_OS_REPORTING_LINE hcall() is used to get the logical
> >> + * real address of the reporting cache line pair set for the input
> >> + * "target".  If no reporting cache line pair has been set, -1 is
> >> + * returned.
> >> + *
> >> + * Parameters:
> >> + * Input:
> >> + * - "flags"
> >> + *      Bits 0-63: Reserved
> >> + * - "target" is per "ibm,ppc-interrupt-server#s" or
> >> + *       "ibm,ppc-interrupt-gserver#s"
> >> + * - "reportingLine": The logical real address of the reporting cache
> >> + *   line pair
> >> + *
> >> + * Output:
> >> + * - R4: The logical real address of the reporting line if set, else -1
> >> + */
> >> +static target_ulong h_int_get_os_reporting_line(PowerPCCPU *cpu,
> >> +                                                sPAPRMachineState *spapr,
> >> +                                                target_ulong opcode,
> >> +                                                target_ulong *args)
> >> +{
> >> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> >> +        return H_FUNCTION;
> >> +    }
> >> +
> >> +    /*
> >> +     * H_STATE should be returned if a H_INT_RESET is in progress.
> >> +     * This is not needed when running the emulation under QEMU
> >> +     */
> >> +
> >> +    /* TODO: H_INT_GET_OS_REPORTING_LINE */
> >> +    return H_FUNCTION;
> >> +}
> >> +
> >> +/*
> >> + * The H_INT_ESB hcall() is used to issue a load or store to the ESB
> >> + * page for the input "lisn".  This hcall is only supported for LISNs
> >> + * that have the ESB hcall flag set to 1 when returned from hcall()
> >> + * H_INT_GET_SOURCE_INFO.
> > 
> > Is there a reason for specifically restricting this to LISNs which
> > advertise it, rather than allowing it for anything? 
> 
> It's in the specs but I did not implement the check. So H_INT_ESB can be 
> used today by the OS for any interrupt number. Same under KVM.
> 
> But I should say so somewhere.
> 
> > Obviously using
> > the direct MMIOs will generally be a faster option when possible, but
> > I could see occasions where it might be simpler for the guest to
> > always use H_INT_ESB (e.g. for micro-guests like kvm-unit-tests).
> 
> can not you use direct load and stores in these guests ? I haven't 
> looked at how they are implemented.

It's not that you can't, but that might involve setting up mappings
and so forth which could be more trouble than using an hcall.  At the
very least they'll also need H_INT_ESB support for the irqs that
require it, so allowing it for everything avoids one code variant.

> >> + * Parameters:
> >> + * Input:
> >> + * - "flags"
> >> + *      Bits 0-62: Reserved
> >> + *      bit 63: Store: Store=1, store operation, else load operation
> >> + * - "lisn" is per "interrupts", "interrupt-map", or
> >> + *      "ibm,xive-lisn-ranges" properties, or as returned by the
> >> + *      ibm,query-interrupt-source-number RTAS call, or as
> >> + *      returned by the H_ALLOCATE_VAS_WINDOW hcall
> >> + * - "esbOffset" is the offset into the ESB page for the load or store operation
> >> + * - "storeData" is the data to write for a store operation
> >> + *
> >> + * Output:
> >> + * - R4: R4: The value of the load if load operation, else -1
> >> + */
> >> +
> >> +#define SPAPR_XIVE_ESB_STORE PPC_BIT(63)
> >> +
> >> +static target_ulong h_int_esb(PowerPCCPU *cpu,
> >> +                              sPAPRMachineState *spapr,
> >> +                              target_ulong opcode,
> >> +                              target_ulong *args)
> >> +{
> >> +    sPAPRXive *xive = spapr->xive;
> >> +    XiveEAS eas;
> >> +    target_ulong flags  = args[0];
> >> +    target_ulong lisn   = args[1];
> >> +    target_ulong offset = args[2];
> >> +    target_ulong data   = args[3];
> >> +    hwaddr mmio_addr;
> >> +    XiveSource *xsrc = &xive->source;
> >> +
> >> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> >> +        return H_FUNCTION;
> >> +    }
> >> +
> >> +    if (flags & ~SPAPR_XIVE_ESB_STORE) {
> >> +        return H_PARAMETER;
> >> +    }
> >> +
> >> +    if (xive_router_get_eas(XIVE_ROUTER(xive), lisn, &eas)) {
> >> +        return H_P2;
> >> +    }
> >> +
> >> +    if (!(eas.w & EAS_VALID)) {
> >> +        return H_P2;
> >> +    }
> >> +
> >> +    if (offset > (1ull << xsrc->esb_shift)) {
> >> +        return H_P3;
> >> +    }
> >> +
> >> +    mmio_addr = xive->vc_base + xive_source_esb_mgmt(xsrc, lisn) + offset;
> >> +
> >> +    if (dma_memory_rw(&address_space_memory, mmio_addr, &data, 8,
> >> +                      (flags & SPAPR_XIVE_ESB_STORE))) {
> >> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: failed to access ESB @0x%"
> >> +                      HWADDR_PRIx "\n", mmio_addr);
> >> +        return H_HARDWARE;
> >> +    }
> >> +    args[0] = (flags & SPAPR_XIVE_ESB_STORE) ? -1 : data;
> >> +    return H_SUCCESS;
> >> +}
> >> +
> >> +/*
> >> + * The H_INT_SYNC hcall() is used to issue hardware syncs that will
> >> + * ensure any in flight events for the input lisn are in the event
> >> + * queue.
> >> + *
> >> + * Parameters:
> >> + * Input:
> >> + * - "flags"
> >> + *      Bits 0-63: Reserved
> >> + * - "lisn" is per "interrupts", "interrupt-map", or
> >> + *      "ibm,xive-lisn-ranges" properties, or as returned by the
> >> + *      ibm,query-interrupt-source-number RTAS call, or as
> >> + *      returned by the H_ALLOCATE_VAS_WINDOW hcall
> >> + *
> >> + * Output:
> >> + * - None
> >> + */
> >> +static target_ulong h_int_sync(PowerPCCPU *cpu,
> >> +                               sPAPRMachineState *spapr,
> >> +                               target_ulong opcode,
> >> +                               target_ulong *args)
> >> +{
> >> +    sPAPRXive *xive = spapr->xive;
> >> +    XiveEAS eas;
> >> +    target_ulong flags = args[0];
> >> +    target_ulong lisn = args[1];
> >> +
> >> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> >> +        return H_FUNCTION;
> >> +    }
> >> +
> >> +    if (flags) {
> >> +        return H_PARAMETER;
> >> +    }
> >> +
> >> +    if (xive_router_get_eas(XIVE_ROUTER(xive), lisn, &eas)) {
> >> +        return H_P2;
> >> +    }
> >> +
> >> +    if (!(eas.w & EAS_VALID)) {
> >> +        return H_P2;
> >> +    }
> >> +
> >> +    /*
> >> +     * H_STATE should be returned if a H_INT_RESET is in progress.
> >> +     * This is not needed when running the emulation under QEMU
> >> +     */
> >> +
> >> +    /* This is not real hardware. Nothing to be done */
> > 
> > At least, not as long as all the XIVE operations are under the BQL.
> 
> yes.
> 
> > 
> >> +    return H_SUCCESS;
> >> +}
> >> +
> >> +/*
> >> + * The H_INT_RESET hcall() is used to reset all of the partition's
> >> + * interrupt exploitation structures to their initial state.  This
> >> + * means losing all previously set interrupt state set via
> >> + * H_INT_SET_SOURCE_CONFIG and H_INT_SET_QUEUE_CONFIG.
> >> + *
> >> + * Parameters:
> >> + * Input:
> >> + * - "flags"
> >> + *      Bits 0-63: Reserved
> >> + *
> >> + * Output:
> >> + * - None
> >> + */
> >> +static target_ulong h_int_reset(PowerPCCPU *cpu,
> >> +                                sPAPRMachineState *spapr,
> >> +                                target_ulong opcode,
> >> +                                target_ulong *args)
> >> +{
> >> +    sPAPRXive *xive = spapr->xive;
> >> +    target_ulong flags   = args[0];
> >> +
> >> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> >> +        return H_FUNCTION;
> >> +    }
> >> +
> >> +    if (flags) {
> >> +        return H_PARAMETER;
> >> +    }
> >> +
> >> +    device_reset(DEVICE(xive));
> >> +    return H_SUCCESS;
> >> +}
> >> +
> >> +void spapr_xive_hcall_init(sPAPRMachineState *spapr)
> >> +{
> >> +    spapr_register_hypercall(H_INT_GET_SOURCE_INFO, h_int_get_source_info);
> >> +    spapr_register_hypercall(H_INT_SET_SOURCE_CONFIG, h_int_set_source_config);
> >> +    spapr_register_hypercall(H_INT_GET_SOURCE_CONFIG, h_int_get_source_config);
> >> +    spapr_register_hypercall(H_INT_GET_QUEUE_INFO, h_int_get_queue_info);
> >> +    spapr_register_hypercall(H_INT_SET_QUEUE_CONFIG, h_int_set_queue_config);
> >> +    spapr_register_hypercall(H_INT_GET_QUEUE_CONFIG, h_int_get_queue_config);
> >> +    spapr_register_hypercall(H_INT_SET_OS_REPORTING_LINE,
> >> +                             h_int_set_os_reporting_line);
> >> +    spapr_register_hypercall(H_INT_GET_OS_REPORTING_LINE,
> >> +                             h_int_get_os_reporting_line);
> >> +    spapr_register_hypercall(H_INT_ESB, h_int_esb);
> >> +    spapr_register_hypercall(H_INT_SYNC, h_int_sync);
> >> +    spapr_register_hypercall(H_INT_RESET, h_int_reset);
> >> +}
> >> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
> >> index 2569ae1bc7f8..da6fcfaa3c52 100644
> >> --- a/hw/ppc/spapr_irq.c
> >> +++ b/hw/ppc/spapr_irq.c
> >> @@ -258,6 +258,8 @@ static void spapr_irq_init_xive(sPAPRMachineState *spapr, int nr_irqs,
> >>          error_propagate(errp, local_err);
> >>          return;
> >>      }
> >> +
> >> +    spapr_xive_hcall_init(spapr);
> >>  }
> >>  
> >>  static int spapr_irq_claim_xive(sPAPRMachineState *spapr, int irq, bool lsi,
> >> diff --git a/hw/intc/Makefile.objs b/hw/intc/Makefile.objs
> >> index 301a8e972d91..eacd26836ebf 100644
> >> --- a/hw/intc/Makefile.objs
> >> +++ b/hw/intc/Makefile.objs
> >> @@ -38,7 +38,7 @@ obj-$(CONFIG_XICS) += xics.o
> >>  obj-$(CONFIG_XICS_SPAPR) += xics_spapr.o
> >>  obj-$(CONFIG_XICS_KVM) += xics_kvm.o
> >>  obj-$(CONFIG_XIVE) += xive.o
> >> -obj-$(CONFIG_XIVE_SPAPR) += spapr_xive.o
> >> +obj-$(CONFIG_XIVE_SPAPR) += spapr_xive.o spapr_xive_hcall.o
> >>  obj-$(CONFIG_POWERNV) += xics_pnv.o
> >>  obj-$(CONFIG_ALLWINNER_A10_PIC) += allwinner-a10-pic.o
> >>  obj-$(CONFIG_S390_FLIC) += s390_flic.o
> > 
>
Cédric Le Goater Nov. 29, 2018, 4:04 p.m. UTC | #4
On 11/29/18 2:23 AM, David Gibson wrote:
> On Wed, Nov 28, 2018 at 11:21:37PM +0100, Cédric Le Goater wrote:
>> On 11/28/18 5:25 AM, David Gibson wrote:
>>> On Fri, Nov 16, 2018 at 11:57:09AM +0100, Cédric Le Goater wrote:
>>>> The different XIVE virtualization structures (sources and event queues)
>>>> are configured with a set of Hypervisor calls :
>>>>
>>>>  - H_INT_GET_SOURCE_INFO
>>>>
>>>>    used to obtain the address of the MMIO page of the Event State
>>>>    Buffer (ESB) entry associated with the source.
>>>>
>>>>  - H_INT_SET_SOURCE_CONFIG
>>>>
>>>>    assigns a source to a "target".
>>>>
>>>>  - H_INT_GET_SOURCE_CONFIG
>>>>
>>>>    determines which "target" and "priority" is assigned to a source
>>>>
>>>>  - H_INT_GET_QUEUE_INFO
>>>>
>>>>    returns the address of the notification management page associated
>>>>    with the specified "target" and "priority".
>>>>
>>>>  - H_INT_SET_QUEUE_CONFIG
>>>>
>>>>    sets or resets the event queue for a given "target" and "priority".
>>>>    It is also used to set the notification configuration associated
>>>>    with the queue, only unconditional notification is supported for
>>>>    the moment. Reset is performed with a queue size of 0 and queueing
>>>>    is disabled in that case.
>>>>
>>>>  - H_INT_GET_QUEUE_CONFIG
>>>>
>>>>    returns the queue settings for a given "target" and "priority".
>>>>
>>>>  - H_INT_RESET
>>>>
>>>>    resets all of the guest's internal interrupt structures to their
>>>>    initial state, losing all configuration set via the hcalls
>>>>    H_INT_SET_SOURCE_CONFIG and H_INT_SET_QUEUE_CONFIG.
>>>>
>>>>  - H_INT_SYNC
>>>>
>>>>    issue a synchronisation on a source to make sure all notifications
>>>>    have reached their queue.
>>>>
>>>> Calls that still need to be addressed :
>>>>
>>>>    H_INT_SET_OS_REPORTING_LINE
>>>>    H_INT_GET_OS_REPORTING_LINE
>>>>
>>>> See the code for more documentation on each hcall.
>>>>
>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>> ---
>>>>  include/hw/ppc/spapr.h      |  15 +-
>>>>  include/hw/ppc/spapr_xive.h |   6 +
>>>>  hw/intc/spapr_xive_hcall.c  | 892 ++++++++++++++++++++++++++++++++++++
>>>>  hw/ppc/spapr_irq.c          |   2 +
>>>>  hw/intc/Makefile.objs       |   2 +-
>>>>  5 files changed, 915 insertions(+), 2 deletions(-)
>>>>  create mode 100644 hw/intc/spapr_xive_hcall.c
>>>>
>>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>>>> index 1fbc2663e06c..8415faea7b82 100644
>>>> --- a/include/hw/ppc/spapr.h
>>>> +++ b/include/hw/ppc/spapr.h
>>>> @@ -452,7 +452,20 @@ struct sPAPRMachineState {
>>>>  #define H_INVALIDATE_PID        0x378
>>>>  #define H_REGISTER_PROC_TBL     0x37C
>>>>  #define H_SIGNAL_SYS_RESET      0x380
>>>> -#define MAX_HCALL_OPCODE        H_SIGNAL_SYS_RESET
>>>> +
>>>> +#define H_INT_GET_SOURCE_INFO   0x3A8
>>>> +#define H_INT_SET_SOURCE_CONFIG 0x3AC
>>>> +#define H_INT_GET_SOURCE_CONFIG 0x3B0
>>>> +#define H_INT_GET_QUEUE_INFO    0x3B4
>>>> +#define H_INT_SET_QUEUE_CONFIG  0x3B8
>>>> +#define H_INT_GET_QUEUE_CONFIG  0x3BC
>>>> +#define H_INT_SET_OS_REPORTING_LINE 0x3C0
>>>> +#define H_INT_GET_OS_REPORTING_LINE 0x3C4
>>>> +#define H_INT_ESB               0x3C8
>>>> +#define H_INT_SYNC              0x3CC
>>>> +#define H_INT_RESET             0x3D0
>>>> +
>>>> +#define MAX_HCALL_OPCODE        H_INT_RESET
>>>>  
>>>>  /* The hcalls above are standardized in PAPR and implemented by pHyp
>>>>   * as well.
>>>> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
>>>> index 3f65b8f485fd..418511f3dc10 100644
>>>> --- a/include/hw/ppc/spapr_xive.h
>>>> +++ b/include/hw/ppc/spapr_xive.h
>>>> @@ -60,4 +60,10 @@ int spapr_xive_target_to_end(sPAPRXive *xive, uint32_t target, uint8_t prio,
>>>>  int spapr_xive_cpu_to_end(sPAPRXive *xive, PowerPCCPU *cpu, uint8_t prio,
>>>>                            uint8_t *out_end_blk, uint32_t *out_end_idx);
>>>>  
>>>> +bool spapr_xive_priority_is_valid(uint8_t priority);
>>>
>>> AFAICT this could be a local function.
>>
>> the KVM model uses it also, when collecting state from the KVM device 
>> to build the QEMU ENDT.
>>
>>>> +
>>>> +typedef struct sPAPRMachineState sPAPRMachineState;
>>>> +
>>>> +void spapr_xive_hcall_init(sPAPRMachineState *spapr);
>>>> +
>>>>  #endif /* PPC_SPAPR_XIVE_H */
>>>> diff --git a/hw/intc/spapr_xive_hcall.c b/hw/intc/spapr_xive_hcall.c
>>>> new file mode 100644
>>>> index 000000000000..52e4e23995f5
>>>> --- /dev/null
>>>> +++ b/hw/intc/spapr_xive_hcall.c
>>>> @@ -0,0 +1,892 @@
>>>> +/*
>>>> + * QEMU PowerPC sPAPR XIVE interrupt controller model
>>>> + *
>>>> + * Copyright (c) 2017-2018, IBM Corporation.
>>>> + *
>>>> + * This code is licensed under the GPL version 2 or later. See the
>>>> + * COPYING file in the top-level directory.
>>>> + */
>>>> +
>>>> +#include "qemu/osdep.h"
>>>> +#include "qemu/log.h"
>>>> +#include "qapi/error.h"
>>>> +#include "cpu.h"
>>>> +#include "hw/ppc/fdt.h"
>>>> +#include "hw/ppc/spapr.h"
>>>> +#include "hw/ppc/spapr_xive.h"
>>>> +#include "hw/ppc/xive_regs.h"
>>>> +#include "monitor/monitor.h"
>>>
>>> Fwiw, I don't think it's particularly necessary to split the hcall
>>> handling out into a separate .c file.
>>
>> ok. let's move it to spapr_xive then ? It might help in reducing the 
>> exported funtions. 
> 
> Yes, I think so.
> 
>>>> +/*
>>>> + * OPAL uses the priority 7 EQ to automatically escalate interrupts
>>>> + * for all other queues (DD2.X POWER9). So only priorities [0..6] are
>>>> + * available for the guest.
>>>
>>> Referencing OPAL behaviour doesn't really make sense in the context of
>>> PAPR.  
>>
>> It's an OPAL constraint which pHyp doesn't have. So its a QEMU/KVM 
>> constraint also.
> 
> Right, I realized that a few patches on.  Maybe rephrase this to
> 
>    Linux hosts under OPAL reserve priority 7 for their own escalation
>    interrupts.  So we only allow the guest to use priorities [0..6].

OK.

> The point here is that we're emphasizing that this is a design
> decision to make the host implementation easier, rather than a
> fundamental constraint.
> 
>>> What I think you're getting at is that the PAPR spec only
>>> allows a PAPR guest to use priorities 0..6 (or at least it will if the
>>> XIVE updated spec ever gets published).  
>>
>> It's not in the spec. the XIVE sPAPR spec should be frozen soon btw. 
>>  
>>> The fact that this allows the
>>> host use 7 for escalations is a design rationale 
>>> but not really relevant to the guest device itself. 
>>
>> The guest should be aware of which priorities are reserved for
>> the hypervisor though.
>>
>>>> + */
>>>> +bool spapr_xive_priority_is_valid(uint8_t priority)
>>>> +{
>>>> +    switch (priority) {
>>>> +    case 0 ... 6:
>>>> +        return true;
>>>> +    case 7: /* OPAL escalation queue */
>>>> +    default:
>>>> +        return false;
>>>> +    }
>>>> +}
>>>> +
>>>> +/*
>>>> + * The H_INT_GET_SOURCE_INFO hcall() is used to obtain the logical
>>>> + * real address of the MMIO page through which the Event State Buffer
>>>> + * entry associated with the value of the "lisn" parameter is managed.
>>>> + *
>>>> + * Parameters:
>>>> + * Input
>>>> + * - "flags"
>>>> + *       Bits 0-63 reserved
>>>> + * - "lisn" is per "interrupts", "interrupt-map", or
>>>> + *       "ibm,xive-lisn-ranges" properties, or as returned by the
>>>> + *       ibm,query-interrupt-source-number RTAS call, or as returned
>>>> + *       by the H_ALLOCATE_VAS_WINDOW hcall
>>>
>>> I've not heard of H_ALLOCATE_VAS_WINDOW.  Is that something we intend
>>> to implement in kvm/qemu, or is it only of interest for PowerVM?
>>
>> The hcall is part of the PAPR NX Interfaces and it returns interrupt
>> numbers. I don't know if any work has been done on the topic.  
> 
> What's a "PAPR NX"?

A way for the PAPR guests to access the POWER coprocessors doing 
compression and encryption. I really don't know much about this.

>>> Also, putting the register numbers on the inputs as well as the
>>> outputs would be helpful.
>>
>> yes. I will add them.
>>
>>>> + *
>>>> + * Output
>>>> + * - R4: "flags"
>>>> + *       Bits 0-59: Reserved
>>>> + *       Bit 60: H_INT_ESB must be used for Event State Buffer
>>>> + *               management
>>>> + *       Bit 61: 1 == LSI  0 == MSI
>>>> + *       Bit 62: the full function page supports trigger
>>>> + *       Bit 63: Store EOI Supported
>>>> + * - R5: Logical Real address of full function Event State Buffer
>>>> + *       management page, -1 if ESB hcall flag is set to 1.
>>>
>>> You've defined what H_INT_ESB means above, so it will be clearer if
>>> you reference that by name here.
>>
>> yes. 
>>
>>>> + * - R6: Logical Real Address of trigger only Event State Buffer
>>>> + *       management page or -1.
>>>> + * - R7: Power of 2 page size for the ESB management pages returned in
>>>> + *       R5 and R6.
>>>> + */
>>>> +
>>>> +#define SPAPR_XIVE_SRC_H_INT_ESB     PPC_BIT(60) /* ESB manage with H_INT_ESB */
>>>> +#define SPAPR_XIVE_SRC_LSI           PPC_BIT(61) /* Virtual LSI type */
>>>> +#define SPAPR_XIVE_SRC_TRIGGER       PPC_BIT(62) /* Trigger and management
>>>> +                                                    on same page */
>>>> +#define SPAPR_XIVE_SRC_STORE_EOI     PPC_BIT(63) /* Store EOI support */
>>>
>>> Probably makes sense to put these #defines in spapr.h since they form
>>> part of the PAPR interface definition.
>>
>> ok.
>>
>>>
>>>> +static target_ulong h_int_get_source_info(PowerPCCPU *cpu,
>>>> +                                          sPAPRMachineState *spapr,
>>>> +                                          target_ulong opcode,
>>>> +                                          target_ulong *args)
>>>> +{
>>>> +    sPAPRXive *xive = spapr->xive;
>>>> +    XiveSource *xsrc = &xive->source;
>>>> +    XiveEAS eas;
>>>> +    target_ulong flags  = args[0];
>>>> +    target_ulong lisn   = args[1];
>>>> +
>>>> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>>>> +        return H_FUNCTION;
>>>> +    }
>>>> +
>>>> +    if (flags) {
>>>> +        return H_PARAMETER;
>>>> +    }
>>>> +
>>>> +    if (xive_router_get_eas(XIVE_ROUTER(xive), lisn, &eas)) {
>>>> +        return H_P2;
>>>> +    }
>>>> +
>>>> +    if (!(eas.w & EAS_VALID)) {
>>>> +        return H_P2;
>>>> +    }
>>>> +
>>>> +    /* All sources are emulated under the main XIVE object and share
>>>> +     * the same characteristics.
>>>> +     */
>>>> +    args[0] = 0;
>>>> +    if (!xive_source_esb_has_2page(xsrc)) {
>>>> +        args[0] |= SPAPR_XIVE_SRC_TRIGGER;
>>>> +    }
>>>> +    if (xsrc->esb_flags & XIVE_SRC_STORE_EOI) {
>>>> +        args[0] |= SPAPR_XIVE_SRC_STORE_EOI;
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * Force the use of the H_INT_ESB hcall in case of an LSI
>>>> +     * interrupt. This is necessary under KVM to re-trigger the
>>>> +     * interrupt if the level is still asserted
>>>> +     */
>>>> +    if (xive_source_irq_is_lsi(xsrc, lisn)) {
>>>> +        args[0] |= SPAPR_XIVE_SRC_H_INT_ESB | SPAPR_XIVE_SRC_LSI;
>>>> +    }
>>>> +
>>>> +    if (!(args[0] & SPAPR_XIVE_SRC_H_INT_ESB)) {
>>>> +        args[1] = xive->vc_base + xive_source_esb_mgmt(xsrc, lisn);
>>>> +    } else {
>>>> +        args[1] = -1;
>>>> +    }
>>>> +
>>>> +    if (xive_source_esb_has_2page(xsrc)) {
>>>> +        args[2] = xive->vc_base + xive_source_esb_page(xsrc, lisn);
>>>> +    } else {
>>>> +        args[2] = -1;
>>>> +    }
>>>
>>> Do we also need to keep this address clear in the H_INT_ESB case?
>>
>> I think not, but the specs are not very clear on that topic. I will
>> ask for clarification and use a -1 for now. We can not do loads on
>> the trigger page so it can not be used by the H_INT_ESB hcall.
>>
>>>
>>>> +    args[3] = TARGET_PAGE_SIZE;
>>>
>>> That seems wrong.  
>>
>> This is utterly wrong. it should be a power of 2 number ... I got
>> it right under KVM though. I guess that ioremap() under Linux rounds 
>> up the size to the page size in use, so, that's why it didn't blow
>> up under TCG.
>>
>>> TARGET_PAGE_SIZE is generally 4kiB, but won't these usually
>>> actually be 64kiB?
>>
>> yes. So what should I use to get a PAGE_SHIFT instead ? 
> 
> Erm, that gets a bit tricky, since qemu in a sense doesn't know the
> guest's page size.
> 
> But.. don't you actually want the esb_shift here, not PAGE_SHIFT - it
> could matter for the 2 page * 64kiB variant, yes?

Yes. we just want the page_shift of the ESB page, whether it's one or
two pages. The other registers inform the guest if there are one or 
two ESB page in use. 


>>>> +
>>>> +    return H_SUCCESS;
>>>> +}
>>>> +
>>>> +/*
>>>> + * The H_INT_SET_SOURCE_CONFIG hcall() is used to assign a Logical
>>>> + * Interrupt Source to a target. The Logical Interrupt Source is
>>>> + * designated with the "lisn" parameter and the target is designated
>>>> + * with the "target" and "priority" parameters.  Upon return from the
>>>> + * hcall(), no additional interrupts will be directed to the old EQ.
>>>> + *
>>>> + * TODO: The old EQ should be investigated for interrupts that
>>>> + * occurred prior to or during the hcall().
>>>
>>> Isn't that the responsibility of the guest?
>>
>> It should yes.
> 
> Right, so not a TODO for the qemu code.

yes

> 
>>
>>>
>>>> + *
>>>> + * Parameters:
>>>> + * Input:
>>>> + * - "flags"
>>>> + *      Bits 0-61: Reserved
>>>> + *      Bit 62: set the "eisn" in the EA
>>>
>>> What's the "EA"?  Do you mean the EAS?
>>
>> Another XIVE acronym, EA for Event Assignment. I think we can forget
>> this one and just use EAS.
>>  
>>>
>>>> + *      Bit 63: masks the interrupt source in the hardware interrupt
>>>> + *      control structure. An interrupt masked by this mechanism will
>>>> + *      be dropped, but it's source state bits will still be
>>>> + *      set. There is no race-free way of unmasking and restoring the
>>>> + *      source. Thus this should only be used in interrupts that are
>>>> + *      also masked at the source, and only in cases where the
>>>> + *      interrupt is not meant to be used for a large amount of time
>>>> + *      because no valid target exists for it for example
>>>> + * - "lisn" is per "interrupts", "interrupt-map", or
>>>> + *      "ibm,xive-lisn-ranges" properties, or as returned by the
>>>> + *      ibm,query-interrupt-source-number RTAS call, or as returned by
>>>> + *      the H_ALLOCATE_VAS_WINDOW hcall
>>>> + * - "target" is per "ibm,ppc-interrupt-server#s" or
>>>> + *      "ibm,ppc-interrupt-gserver#s"
>>>> + * - "priority" is a valid priority not in
>>>> + *      "ibm,plat-res-int-priorities"
>>>> + * - "eisn" is the guest EISN associated with the "lisn"
>>>
>>> I don't think the EISN term has been used before in the series.  
>>
>> Effective Interrupt Source Number, which is the event data enqueued
>> in the OS EQ.
>>
>> I'm planning on adding some more acronyms used by the sPAPR hcalls
>> in this file. There are only a couple.
> 
> That would be helpful.
> 
>>> I'm guessing this is the guest-assigned global interrupt number?
>>
>> yes 
>>
>>>> + *
>>>> + * Output:
>>>> + * - None
>>>> + */
>>>> +
>>>> +#define SPAPR_XIVE_SRC_SET_EISN PPC_BIT(62)
>>>> +#define SPAPR_XIVE_SRC_MASK     PPC_BIT(63)
>>>> +
>>>> +static target_ulong h_int_set_source_config(PowerPCCPU *cpu,
>>>> +                                            sPAPRMachineState *spapr,
>>>> +                                            target_ulong opcode,
>>>> +                                            target_ulong *args)
>>>> +{
>>>> +    sPAPRXive *xive = spapr->xive;
>>>> +    XiveRouter *xrtr = XIVE_ROUTER(xive);
>>>> +    XiveEAS eas, new_eas;
>>>> +    target_ulong flags    = args[0];
>>>> +    target_ulong lisn     = args[1];
>>>> +    target_ulong target   = args[2];
>>>> +    target_ulong priority = args[3];
>>>> +    target_ulong eisn     = args[4];
>>>> +    uint8_t end_blk;
>>>> +    uint32_t end_idx;
>>>> +
>>>> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>>>> +        return H_FUNCTION;
>>>> +    }
>>>> +
>>>> +    if (flags & ~(SPAPR_XIVE_SRC_SET_EISN | SPAPR_XIVE_SRC_MASK)) {
>>>> +        return H_PARAMETER;
>>>> +    }
>>>> +
>>>> +    if (xive_router_get_eas(xrtr, lisn, &eas)) {
>>>> +        return H_P2;
>>>> +    }
>>>> +
>>>> +    if (!(eas.w & EAS_VALID)) {
>>>> +        return H_P2;
>>>> +    }
>>>> +
>>>> +    /* priority 0xff is used to reset the EAS */
>>>> +    if (priority == 0xff) {
>>>> +        new_eas.w = EAS_VALID | EAS_MASKED;
>>>> +        goto out;
>>>> +    }
>>>> +
>>>> +    if (flags & SPAPR_XIVE_SRC_MASK) {
>>>> +        new_eas.w = eas.w | EAS_MASKED;
>>>> +    } else {
>>>> +        new_eas.w = eas.w & ~EAS_MASKED;
>>>> +    }
>>>> +
>>>> +    if (!spapr_xive_priority_is_valid(priority)) {
>>>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid priority %ld requested\n",
>>>> +                      priority);
>>>> +        return H_P4;
>>>> +    }
>>>> +
>>>> +    /* Validate that "target" is part of the list of threads allocated
>>>> +     * to the partition. For that, find the END corresponding to the
>>>> +     * target.
>>>> +     */
>>>> +    if (spapr_xive_target_to_end(xive, target, priority, &end_blk, &end_idx)) {
>>>> +        return H_P3;
>>>> +    }
>>>> +
>>>> +    new_eas.w = SETFIELD(EAS_END_BLOCK, new_eas.w, end_blk);
>>>> +    new_eas.w = SETFIELD(EAS_END_INDEX, new_eas.w, end_idx);
>>>> +
>>>> +    if (flags & SPAPR_XIVE_SRC_SET_EISN) {
>>>> +        new_eas.w = SETFIELD(EAS_END_DATA, new_eas.w, eisn);
>>>> +    }
>>>> +
>>>> +out:
>>>> +    if (xive_router_set_eas(xrtr, lisn, &new_eas)) {
>>>> +        return H_HARDWARE;
>>>> +    }
>>>
>>> As noted earlier in the series, the spapr specific code owns the
>>> memory backing the EAT, so you can just access it directly rather than
>>> using a method here.
>>
>> Yes. I will give a try. I wonder if I need accessors for the tables
>> ?
> 
> You'll still need the read accessor since the routing core uses that.
> I don't think you need a write accessor though.
> 
>>
>>>
>>>> +
>>>> +    return H_SUCCESS;
>>>> +}
>>>> +
>>>> +/*
>>>> + * The H_INT_GET_SOURCE_CONFIG hcall() is used to determine to which
>>>> + * target/priority pair is assigned to the specified Logical Interrupt
>>>> + * Source.
>>>> + *
>>>> + * Parameters:
>>>> + * Input:
>>>> + * - "flags"
>>>> + *      Bits 0-63 Reserved
>>>> + * - "lisn" is per "interrupts", "interrupt-map", or
>>>> + *      "ibm,xive-lisn-ranges" properties, or as returned by the
>>>> + *      ibm,query-interrupt-source-number RTAS call, or as
>>>> + *      returned by the H_ALLOCATE_VAS_WINDOW hcall
>>>> + *
>>>> + * Output:
>>>> + * - R4: Target to which the specified Logical Interrupt Source is
>>>> + *       assigned
>>>> + * - R5: Priority to which the specified Logical Interrupt Source is
>>>> + *       assigned
>>>> + * - R6: EISN for the specified Logical Interrupt Source (this will be
>>>> + *       equivalent to the LISN if not changed by H_INT_SET_SOURCE_CONFIG)
>>>> + */
>>>> +static target_ulong h_int_get_source_config(PowerPCCPU *cpu,
>>>> +                                            sPAPRMachineState *spapr,
>>>> +                                            target_ulong opcode,
>>>> +                                            target_ulong *args)
>>>> +{
>>>> +    sPAPRXive *xive = spapr->xive;
>>>> +    XiveRouter *xrtr = XIVE_ROUTER(xive);
>>>> +    target_ulong flags = args[0];
>>>> +    target_ulong lisn = args[1];
>>>> +    XiveEAS eas;
>>>> +    XiveEND end;
>>>> +    uint8_t end_blk, nvt_blk;
>>>> +    uint32_t end_idx, nvt_idx;
>>>> +
>>>> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>>>> +        return H_FUNCTION;
>>>> +    }
>>>> +
>>>> +    if (flags) {
>>>> +        return H_PARAMETER;
>>>> +    }
>>>> +
>>>> +    if (xive_router_get_eas(xrtr, lisn, &eas)) {
>>>> +        return H_P2;
>>>> +    }
>>>> +
>>>> +    if (!(eas.w & EAS_VALID)) {
>>>> +        return H_P2;
>>>> +    }
>>>> +
>>>> +    end_blk = GETFIELD(EAS_END_BLOCK, eas.w);
>>>> +    end_idx = GETFIELD(EAS_END_INDEX, eas.w);
>>>> +    if (xive_router_get_end(xrtr, end_blk, end_idx, &end)) {
>>>> +        /* Not sure what to return here */
>>>> +        return H_HARDWARE;
>>>
>>> IIUC this indicates a bug in the PAPR specific code, not the guest, so
>>> an assert() is probably the right answer.
>>
>> ok
>>
>>>> +    }
>>>> +
>>>> +    nvt_blk = GETFIELD(END_W6_NVT_BLOCK, end.w6);
>>>> +    nvt_idx = GETFIELD(END_W6_NVT_INDEX, end.w6);
>>>> +    args[0] = spapr_xive_nvt_to_target(xive, nvt_blk, nvt_idx);
>>>
>>> AIUI there's a specific END for each target & priority, so you could
>>> avoid this second level lookup, 
>>
>> yes 
>>
>>> although I guess this might be
>>> valuable if we do more complicated internal routing in the future.
>>
>> I am not sure of that but I'd rather keep these converting helpers
>> for the moment.
> 
> Ok.
> 
>>>> +    if (eas.w & EAS_MASKED) {
>>>> +        args[1] = 0xff;
>>>> +    } else {
>>>> +        args[1] = GETFIELD(END_W7_F0_PRIORITY, end.w7);
>>>> +    }
>>>> +
>>>> +    args[2] = GETFIELD(EAS_END_DATA, eas.w);
>>>> +
>>>> +    return H_SUCCESS;
>>>> +}
>>>> +
>>>> +/*
>>>> + * The H_INT_GET_QUEUE_INFO hcall() is used to get the logical real
>>>> + * address of the notification management page associated with the
>>>> + * specified target and priority.
>>>> + *
>>>> + * Parameters:
>>>> + * Input:
>>>> + * - "flags"
>>>> + *       Bits 0-63 Reserved
>>>> + * - "target" is per "ibm,ppc-interrupt-server#s" or
>>>> + *       "ibm,ppc-interrupt-gserver#s"
>>>> + * - "priority" is a valid priority not in
>>>> + *       "ibm,plat-res-int-priorities"
>>>> + *
>>>> + * Output:
>>>> + * - R4: Logical real address of notification page
>>>> + * - R5: Power of 2 page size of the notification page
>>>> + */
>>>> +static target_ulong h_int_get_queue_info(PowerPCCPU *cpu,
>>>> +                                         sPAPRMachineState *spapr,
>>>> +                                         target_ulong opcode,
>>>> +                                         target_ulong *args)
>>>> +{
>>>> +    sPAPRXive *xive = spapr->xive;
>>>> +    XiveENDSource *end_xsrc = &xive->end_source;
>>>> +    target_ulong flags = args[0];
>>>> +    target_ulong target = args[1];
>>>> +    target_ulong priority = args[2];
>>>> +    XiveEND end;
>>>> +    uint8_t end_blk;
>>>> +    uint32_t end_idx;
>>>> +
>>>> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>>>> +        return H_FUNCTION;
>>>> +    }
>>>> +
>>>> +    if (flags) {
>>>> +        return H_PARAMETER;
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * H_STATE should be returned if a H_INT_RESET is in progress.
>>>> +     * This is not needed when running the emulation under QEMU
>>>> +     */
>>>> +
>>>> +    if (!spapr_xive_priority_is_valid(priority)) {
>>>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid priority %ld requested\n",
>>>> +                      priority);
>>>> +        return H_P3;
>>>> +    }
>>>> +
>>>> +    /* Validate that "target" is part of the list of threads allocated
>>>> +     * to the partition. For that, find the END corresponding to the
>>>> +     * target.
>>>> +     */
>>>> +    if (spapr_xive_target_to_end(xive, target, priority, &end_blk, &end_idx)) {
>>>> +        return H_P2;
>>>> +    }
>>>> +
>>>> +    if (xive_router_get_end(XIVE_ROUTER(xive), end_blk, end_idx, &end)) {
>>>> +        return H_HARDWARE;
>>>> +    }
>>>> +
>>>> +    args[0] = xive->end_base + (1ull << (end_xsrc->esb_shift + 1)) * end_idx;
>>>> +    if (end.w0 & END_W0_ENQUEUE) {
>>>> +        args[1] = GETFIELD(END_W0_QSIZE, end.w0) + 12;
>>>> +    } else {
>>>> +        args[1] = 0;
>>>> +    }
>>>> +    return H_SUCCESS;
>>>> +}
>>>> +
>>>> +/*
>>>> + * The H_INT_SET_QUEUE_CONFIG hcall() is used to set or reset a EQ for
>>>> + * a given "target" and "priority".  It is also used to set the
>>>> + * notification config associated with the EQ.  An EQ size of 0 is
>>>> + * used to reset the EQ config for a given target and priority. If
>>>> + * resetting the EQ config, the END associated with the given "target"
>>>> + * and "priority" will be changed to disable queueing.
>>>> + *
>>>> + * Upon return from the hcall(), no additional interrupts will be
>>>> + * directed to the old EQ (if one was set). The old EQ (if one was
>>>> + * set) should be investigated for interrupts that occurred prior to
>>>> + * or during the hcall().
>>>> + *
>>>> + * Parameters:
>>>> + * Input:
>>>> + * - "flags"
>>>> + *      Bits 0-62: Reserved
>>>> + *      Bit 63: Unconditional Notify (n) per the XIVE spec
>>>> + * - "target" is per "ibm,ppc-interrupt-server#s" or
>>>> + *       "ibm,ppc-interrupt-gserver#s"
>>>> + * - "priority" is a valid priority not in
>>>> + *       "ibm,plat-res-int-priorities"
>>>> + * - "eventQueue": The logical real address of the start of the EQ
>>>> + * - "eventQueueSize": The power of 2 EQ size per "ibm,xive-eq-sizes"
>>>> + *
>>>> + * Output:
>>>> + * - None
>>>> + */
>>>> +
>>>> +#define SPAPR_XIVE_END_ALWAYS_NOTIFY PPC_BIT(63)
>>>> +
>>>> +static target_ulong h_int_set_queue_config(PowerPCCPU *cpu,
>>>> +                                           sPAPRMachineState *spapr,
>>>> +                                           target_ulong opcode,
>>>> +                                           target_ulong *args)
>>>> +{
>>>> +    sPAPRXive *xive = spapr->xive;
>>>> +    XiveRouter *xrtr = XIVE_ROUTER(xive);
>>>> +    target_ulong flags = args[0];
>>>> +    target_ulong target = args[1];
>>>> +    target_ulong priority = args[2];
>>>> +    target_ulong qpage = args[3];
>>>> +    target_ulong qsize = args[4];
>>>> +    XiveEND end;
>>>> +    uint8_t end_blk, nvt_blk;
>>>> +    uint32_t end_idx, nvt_idx;
>>>> +    uint32_t qdata;
>>>> +
>>>> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>>>> +        return H_FUNCTION;
>>>> +    }
>>>> +
>>>> +    if (flags & ~SPAPR_XIVE_END_ALWAYS_NOTIFY) {
>>>> +        return H_PARAMETER;
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * H_STATE should be returned if a H_INT_RESET is in progress.
>>>> +     * This is not needed when running the emulation under QEMU
>>>> +     */
>>>> +
>>>> +    if (!spapr_xive_priority_is_valid(priority)) {
>>>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid priority %ld requested\n",
>>>> +                      priority);
>>>> +        return H_P3;
>>>> +    }
>>>> +
>>>> +    /* Validate that "target" is part of the list of threads allocated
>>>> +     * to the partition. For that, find the END corresponding to the
>>>> +     * target.
>>>> +     */
>>>> +
>>>> +    if (spapr_xive_target_to_end(xive, target, priority, &end_blk, &end_idx)) {
>>>> +        return H_P2;
>>>> +    }
>>>> +
>>>> +    if (xive_router_get_end(xrtr, end_blk, end_idx, &end)) {
>>>> +        return H_HARDWARE;
>>>
>>> Again, I think this indicates a qemu (spapr) code bug, so could be an assert().
>>
>> ok
>>
>>>
>>>> +    }
>>>> +
>>>> +    switch (qsize) {
>>>> +    case 12:
>>>> +    case 16:
>>>> +    case 21:
>>>> +    case 24:
>>>> +        end.w3 = ((uint64_t)qpage) & 0xffffffff;
>>>
>>> It just occurred to me that I haven't been looking for this across any
>>> of these reviews.  Don't you need byteswaps when accessing these
>>> in-memory structures?
>>
>> yes this is done when some event data is enqueued in the EQ.
> 
> I'm not talking about the data in the EQ itself, but the fields in the
> END (and the NVT).

XIVE is all BE.

> 
>>
>>>
>>>> +        end.w2 = (((uint64_t)qpage)) >> 32 & 0x0fffffff;
>>>> +        end.w0 |= END_W0_ENQUEUE;
>>>> +        end.w0 = SETFIELD(END_W0_QSIZE, end.w0, qsize - 12);
>>>> +        break;
>>>> +    case 0:
>>>> +        /* reset queue and disable queueing */
>>>> +        xive_end_reset(&end);
>>>> +        goto out;
>>>> +
>>>> +    default:
>>>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid EQ size %"PRIx64"\n",
>>>> +                      qsize);
>>>> +        return H_P5;
>>>> +    }
>>>> +
>>>> +    if (qsize) {
>>>> +        /*
>>>> +         * Let's validate the EQ address with a read of the first EQ
>>>> +         * entry. We could also check that the full queue has been
>>>> +         * zeroed by the OS.
>>>> +         */
>>>> +        if (address_space_read(&address_space_memory, qpage,
>>>> +                               MEMTXATTRS_UNSPECIFIED,
>>>> +                               (uint8_t *) &qdata, sizeof(qdata))) {
>>>> +            qemu_log_mask(LOG_GUEST_ERROR, "XIVE: failed to read EQ data @0x%"
>>>> +                          HWADDR_PRIx "\n", qpage);
>>>> +            return H_P4;
>>>
>>> Just checking the first entry doesn't seem entirely safe.  Using
>>> address_space_map() and making sure the returned plen doesn't get
>>> reduced below the queue size might be a better option.
>>
>> ok. That was on my todo list.
>>
>>>
>>>> +        }
>>>> +    }
>>>> +
>>>> +    if (spapr_xive_target_to_nvt(xive, target, &nvt_blk, &nvt_idx)) {
>>>> +        return H_HARDWARE;
>>>
>>> That could be caused by a bogus 'target' value, couldn't it?  
>>
>> yes. It should have returned H_P2 above when spapr_xive_target_to_end() 
>> is called.
>>
>>> In which
>>> case it a) should probably be checked earlier and b) should be
>>> H_PARAMETER or similar, not H_HARDWARE, yes?
>>
>> H_P2 may be again. It should be checked earlier
>>
>>>
>>>> +    }
>>>> +
>>>> +    /* Ensure the priority and target are correctly set (they will not
>>>> +     * be right after allocation)
>>>
>>> AIUI there's a static association from END to target in the PAPR
>>> model. 
>>
>> yes. 8 priorities per cpu.
>>
>>> So it seems to make more sense to get that set up right at
>>> initialization / reset, rather than doing it lazily when the 
>>> queue is configured.
>>
>> Ah. You would preconfigure the word6 and word7 then. Yes, it would
>> save us some of the conversion fuss. I will look at it.
>>
>>>> +     */
>>>> +    end.w6 = SETFIELD(END_W6_NVT_BLOCK, 0ul, nvt_blk) |
>>>> +        SETFIELD(END_W6_NVT_INDEX, 0ul, nvt_idx);
>>>> +    end.w7 = SETFIELD(END_W7_F0_PRIORITY, 0ul, priority);
>>>> +
>>>> +    if (flags & SPAPR_XIVE_END_ALWAYS_NOTIFY) {
>>>> +        end.w0 |= END_W0_UCOND_NOTIFY;
>>>> +    } else {
>>>> +        end.w0 &= ~END_W0_UCOND_NOTIFY;
>>>> +    }
>>>> +
>>>> +    /* The generation bit for the END starts at 1 and The END page
>>>> +     * offset counter starts at 0.
>>>> +     */
>>>> +    end.w1 = END_W1_GENERATION | SETFIELD(END_W1_PAGE_OFF, 0ul, 0ul);
>>>> +    end.w0 |= END_W0_VALID;
>>>> +
>>>> +    /* TODO: issue syncs required to ensure all in-flight interrupts
>>>> +     * are complete on the old END */
>>>> +out:
>>>> +    /* Update END */
>>>> +    if (xive_router_set_end(xrtr, end_blk, end_idx, &end)) {
>>>> +        return H_HARDWARE;
>>>> +    }
>>>
>>> Again the PAPR code owns the ENDs, so it can update them directly
>>> rather than going through an abstraction.
>>
>> ok.
>>
>>>
>>>> +
>>>> +    return H_SUCCESS;
>>>> +}
>>>> +
>>>> +/*
>>>> + * The H_INT_GET_QUEUE_CONFIG hcall() is used to get a EQ for a given
>>>> + * target and priority.
>>>> + *
>>>> + * Parameters:
>>>> + * Input:
>>>> + * - "flags"
>>>> + *      Bits 0-62: Reserved
>>>> + *      Bit 63: Debug: Return debug data
>>>> + * - "target" is per "ibm,ppc-interrupt-server#s" or
>>>> + *       "ibm,ppc-interrupt-gserver#s"
>>>> + * - "priority" is a valid priority not in
>>>> + *       "ibm,plat-res-int-priorities"
>>>> + *
>>>> + * Output:
>>>> + * - R4: "flags":
>>>> + *       Bits 0-61: Reserved
>>>> + *       Bit 62: The value of Event Queue Generation Number (g) per
>>>> + *              the XIVE spec if "Debug" = 1
>>>> + *       Bit 63: The value of Unconditional Notify (n) per the XIVE spec
>>>> + * - R5: The logical real address of the start of the EQ
>>>> + * - R6: The power of 2 EQ size per "ibm,xive-eq-sizes"
>>>> + * - R7: The value of Event Queue Offset Counter per XIVE spec
>>>> + *       if "Debug" = 1, else 0
>>>> + *
>>>> + */
>>>> +
>>>> +#define SPAPR_XIVE_END_DEBUG     PPC_BIT(63)
>>>> +
>>>> +static target_ulong h_int_get_queue_config(PowerPCCPU *cpu,
>>>> +                                           sPAPRMachineState *spapr,
>>>> +                                           target_ulong opcode,
>>>> +                                           target_ulong *args)
>>>> +{
>>>> +    sPAPRXive *xive = spapr->xive;
>>>> +    target_ulong flags = args[0];
>>>> +    target_ulong target = args[1];
>>>> +    target_ulong priority = args[2];
>>>> +    XiveEND end;
>>>> +    uint8_t end_blk;
>>>> +    uint32_t end_idx;
>>>> +
>>>> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>>>> +        return H_FUNCTION;
>>>> +    }
>>>> +
>>>> +    if (flags & ~SPAPR_XIVE_END_DEBUG) {
>>>> +        return H_PARAMETER;
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * H_STATE should be returned if a H_INT_RESET is in progress.
>>>> +     * This is not needed when running the emulation under QEMU
>>>> +     */
>>>> +
>>>> +    if (!spapr_xive_priority_is_valid(priority)) {
>>>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid priority %ld requested\n",
>>>> +                      priority);
>>>> +        return H_P3;
>>>> +    }
>>>> +
>>>> +    /* Validate that "target" is part of the list of threads allocated
>>>> +     * to the partition. For that, find the END corresponding to the
>>>> +     * target.
>>>> +     */
>>>> +    if (spapr_xive_target_to_end(xive, target, priority, &end_blk, &end_idx)) {
>>>> +        return H_P2;
>>>> +    }
>>>> +
>>>> +    if (xive_router_get_end(XIVE_ROUTER(xive), end_blk, end_idx, &end)) {
>>>> +        return H_HARDWARE;
>>>
>>> Again, assert() seems appropriate here.
>>
>> ok
>>
>>>
>>>> +    }
>>>> +
>>>> +    args[0] = 0;
>>>> +    if (end.w0 & END_W0_UCOND_NOTIFY) {
>>>> +        args[0] |= SPAPR_XIVE_END_ALWAYS_NOTIFY;
>>>> +    }
>>>> +
>>>> +    if (end.w0 & END_W0_ENQUEUE) {
>>>> +        args[1] =
>>>> +            (((uint64_t)(end.w2 & 0x0fffffff)) << 32) | end.w3;
>>>> +        args[2] = GETFIELD(END_W0_QSIZE, end.w0) + 12;
>>>> +    } else {
>>>> +        args[1] = 0;
>>>> +        args[2] = 0;
>>>> +    }
>>>> +
>>>> +    /* TODO: do we need any locking on the END ? */
>>>> +    if (flags & SPAPR_XIVE_END_DEBUG) {
>>>> +        /* Load the event queue generation number into the return flags */
>>>> +        args[0] |= (uint64_t)GETFIELD(END_W1_GENERATION, end.w1) << 62;
>>>> +
>>>> +        /* Load R7 with the event queue offset counter */
>>>> +        args[3] = GETFIELD(END_W1_PAGE_OFF, end.w1);
>>>> +    } else {
>>>> +        args[3] = 0;
>>>> +    }
>>>> +
>>>> +    return H_SUCCESS;
>>>> +}
>>>> +
>>>> +/*
>>>> + * The H_INT_SET_OS_REPORTING_LINE hcall() is used to set the
>>>> + * reporting cache line pair for the calling thread.  The reporting
>>>> + * cache lines will contain the OS interrupt context when the OS
>>>> + * issues a CI store byte to @TIMA+0xC10 to acknowledge the OS
>>>> + * interrupt. The reporting cache lines can be reset by inputting -1
>>>> + * in "reportingLine".  Issuing the CI store byte without reporting
>>>> + * cache lines registered will result in the data not being accessible
>>>> + * to the OS.
>>>> + *
>>>> + * Parameters:
>>>> + * Input:
>>>> + * - "flags"
>>>> + *      Bits 0-63: Reserved
>>>> + * - "reportingLine": The logical real address of the reporting cache
>>>> + *    line pair
>>>> + *
>>>> + * Output:
>>>> + * - None
>>>> + */
>>>> +static target_ulong h_int_set_os_reporting_line(PowerPCCPU *cpu,
>>>> +                                                sPAPRMachineState *spapr,
>>>> +                                                target_ulong opcode,
>>>> +                                                target_ulong *args)
>>>> +{
>>>> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>>>> +        return H_FUNCTION;
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * H_STATE should be returned if a H_INT_RESET is in progress.
>>>> +     * This is not needed when running the emulation under QEMU
>>>> +     */
>>>> +
>>>> +    /* TODO: H_INT_SET_OS_REPORTING_LINE */
>>>> +    return H_FUNCTION;
>>>> +}
>>>> +
>>>> +/*
>>>> + * The H_INT_GET_OS_REPORTING_LINE hcall() is used to get the logical
>>>> + * real address of the reporting cache line pair set for the input
>>>> + * "target".  If no reporting cache line pair has been set, -1 is
>>>> + * returned.
>>>> + *
>>>> + * Parameters:
>>>> + * Input:
>>>> + * - "flags"
>>>> + *      Bits 0-63: Reserved
>>>> + * - "target" is per "ibm,ppc-interrupt-server#s" or
>>>> + *       "ibm,ppc-interrupt-gserver#s"
>>>> + * - "reportingLine": The logical real address of the reporting cache
>>>> + *   line pair
>>>> + *
>>>> + * Output:
>>>> + * - R4: The logical real address of the reporting line if set, else -1
>>>> + */
>>>> +static target_ulong h_int_get_os_reporting_line(PowerPCCPU *cpu,
>>>> +                                                sPAPRMachineState *spapr,
>>>> +                                                target_ulong opcode,
>>>> +                                                target_ulong *args)
>>>> +{
>>>> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>>>> +        return H_FUNCTION;
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * H_STATE should be returned if a H_INT_RESET is in progress.
>>>> +     * This is not needed when running the emulation under QEMU
>>>> +     */
>>>> +
>>>> +    /* TODO: H_INT_GET_OS_REPORTING_LINE */
>>>> +    return H_FUNCTION;
>>>> +}
>>>> +
>>>> +/*
>>>> + * The H_INT_ESB hcall() is used to issue a load or store to the ESB
>>>> + * page for the input "lisn".  This hcall is only supported for LISNs
>>>> + * that have the ESB hcall flag set to 1 when returned from hcall()
>>>> + * H_INT_GET_SOURCE_INFO.
>>>
>>> Is there a reason for specifically restricting this to LISNs which
>>> advertise it, rather than allowing it for anything? 
>>
>> It's in the specs but I did not implement the check. So H_INT_ESB can be 
>> used today by the OS for any interrupt number. Same under KVM.
>>
>> But I should say so somewhere.
>>
>>> Obviously using
>>> the direct MMIOs will generally be a faster option when possible, but
>>> I could see occasions where it might be simpler for the guest to
>>> always use H_INT_ESB (e.g. for micro-guests like kvm-unit-tests).
>>
>> can not you use direct load and stores in these guests ? I haven't 
>> looked at how they are implemented.
> 
> It's not that you can't, but that might involve setting up mappings
> and so forth which could be more trouble than using an hcall.  At the
> very least they'll also need H_INT_ESB support for the irqs that
> require it, so allowing it for everything avoids one code variant.

ok. All good then.

Thanks,

C.

> 
>>>> + * Parameters:
>>>> + * Input:
>>>> + * - "flags"
>>>> + *      Bits 0-62: Reserved
>>>> + *      bit 63: Store: Store=1, store operation, else load operation
>>>> + * - "lisn" is per "interrupts", "interrupt-map", or
>>>> + *      "ibm,xive-lisn-ranges" properties, or as returned by the
>>>> + *      ibm,query-interrupt-source-number RTAS call, or as
>>>> + *      returned by the H_ALLOCATE_VAS_WINDOW hcall
>>>> + * - "esbOffset" is the offset into the ESB page for the load or store operation
>>>> + * - "storeData" is the data to write for a store operation
>>>> + *
>>>> + * Output:
>>>> + * - R4: R4: The value of the load if load operation, else -1
>>>> + */
>>>> +
>>>> +#define SPAPR_XIVE_ESB_STORE PPC_BIT(63)
>>>> +
>>>> +static target_ulong h_int_esb(PowerPCCPU *cpu,
>>>> +                              sPAPRMachineState *spapr,
>>>> +                              target_ulong opcode,
>>>> +                              target_ulong *args)
>>>> +{
>>>> +    sPAPRXive *xive = spapr->xive;
>>>> +    XiveEAS eas;
>>>> +    target_ulong flags  = args[0];
>>>> +    target_ulong lisn   = args[1];
>>>> +    target_ulong offset = args[2];
>>>> +    target_ulong data   = args[3];
>>>> +    hwaddr mmio_addr;
>>>> +    XiveSource *xsrc = &xive->source;
>>>> +
>>>> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>>>> +        return H_FUNCTION;
>>>> +    }
>>>> +
>>>> +    if (flags & ~SPAPR_XIVE_ESB_STORE) {
>>>> +        return H_PARAMETER;
>>>> +    }
>>>> +
>>>> +    if (xive_router_get_eas(XIVE_ROUTER(xive), lisn, &eas)) {
>>>> +        return H_P2;
>>>> +    }
>>>> +
>>>> +    if (!(eas.w & EAS_VALID)) {
>>>> +        return H_P2;
>>>> +    }
>>>> +
>>>> +    if (offset > (1ull << xsrc->esb_shift)) {
>>>> +        return H_P3;
>>>> +    }
>>>> +
>>>> +    mmio_addr = xive->vc_base + xive_source_esb_mgmt(xsrc, lisn) + offset;
>>>> +
>>>> +    if (dma_memory_rw(&address_space_memory, mmio_addr, &data, 8,
>>>> +                      (flags & SPAPR_XIVE_ESB_STORE))) {
>>>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: failed to access ESB @0x%"
>>>> +                      HWADDR_PRIx "\n", mmio_addr);
>>>> +        return H_HARDWARE;
>>>> +    }
>>>> +    args[0] = (flags & SPAPR_XIVE_ESB_STORE) ? -1 : data;
>>>> +    return H_SUCCESS;
>>>> +}
>>>> +
>>>> +/*
>>>> + * The H_INT_SYNC hcall() is used to issue hardware syncs that will
>>>> + * ensure any in flight events for the input lisn are in the event
>>>> + * queue.
>>>> + *
>>>> + * Parameters:
>>>> + * Input:
>>>> + * - "flags"
>>>> + *      Bits 0-63: Reserved
>>>> + * - "lisn" is per "interrupts", "interrupt-map", or
>>>> + *      "ibm,xive-lisn-ranges" properties, or as returned by the
>>>> + *      ibm,query-interrupt-source-number RTAS call, or as
>>>> + *      returned by the H_ALLOCATE_VAS_WINDOW hcall
>>>> + *
>>>> + * Output:
>>>> + * - None
>>>> + */
>>>> +static target_ulong h_int_sync(PowerPCCPU *cpu,
>>>> +                               sPAPRMachineState *spapr,
>>>> +                               target_ulong opcode,
>>>> +                               target_ulong *args)
>>>> +{
>>>> +    sPAPRXive *xive = spapr->xive;
>>>> +    XiveEAS eas;
>>>> +    target_ulong flags = args[0];
>>>> +    target_ulong lisn = args[1];
>>>> +
>>>> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>>>> +        return H_FUNCTION;
>>>> +    }
>>>> +
>>>> +    if (flags) {
>>>> +        return H_PARAMETER;
>>>> +    }
>>>> +
>>>> +    if (xive_router_get_eas(XIVE_ROUTER(xive), lisn, &eas)) {
>>>> +        return H_P2;
>>>> +    }
>>>> +
>>>> +    if (!(eas.w & EAS_VALID)) {
>>>> +        return H_P2;
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * H_STATE should be returned if a H_INT_RESET is in progress.
>>>> +     * This is not needed when running the emulation under QEMU
>>>> +     */
>>>> +
>>>> +    /* This is not real hardware. Nothing to be done */
>>>
>>> At least, not as long as all the XIVE operations are under the BQL.
>>
>> yes.
>>
>>>
>>>> +    return H_SUCCESS;
>>>> +}
>>>> +
>>>> +/*
>>>> + * The H_INT_RESET hcall() is used to reset all of the partition's
>>>> + * interrupt exploitation structures to their initial state.  This
>>>> + * means losing all previously set interrupt state set via
>>>> + * H_INT_SET_SOURCE_CONFIG and H_INT_SET_QUEUE_CONFIG.
>>>> + *
>>>> + * Parameters:
>>>> + * Input:
>>>> + * - "flags"
>>>> + *      Bits 0-63: Reserved
>>>> + *
>>>> + * Output:
>>>> + * - None
>>>> + */
>>>> +static target_ulong h_int_reset(PowerPCCPU *cpu,
>>>> +                                sPAPRMachineState *spapr,
>>>> +                                target_ulong opcode,
>>>> +                                target_ulong *args)
>>>> +{
>>>> +    sPAPRXive *xive = spapr->xive;
>>>> +    target_ulong flags   = args[0];
>>>> +
>>>> +    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>>>> +        return H_FUNCTION;
>>>> +    }
>>>> +
>>>> +    if (flags) {
>>>> +        return H_PARAMETER;
>>>> +    }
>>>> +
>>>> +    device_reset(DEVICE(xive));
>>>> +    return H_SUCCESS;
>>>> +}
>>>> +
>>>> +void spapr_xive_hcall_init(sPAPRMachineState *spapr)
>>>> +{
>>>> +    spapr_register_hypercall(H_INT_GET_SOURCE_INFO, h_int_get_source_info);
>>>> +    spapr_register_hypercall(H_INT_SET_SOURCE_CONFIG, h_int_set_source_config);
>>>> +    spapr_register_hypercall(H_INT_GET_SOURCE_CONFIG, h_int_get_source_config);
>>>> +    spapr_register_hypercall(H_INT_GET_QUEUE_INFO, h_int_get_queue_info);
>>>> +    spapr_register_hypercall(H_INT_SET_QUEUE_CONFIG, h_int_set_queue_config);
>>>> +    spapr_register_hypercall(H_INT_GET_QUEUE_CONFIG, h_int_get_queue_config);
>>>> +    spapr_register_hypercall(H_INT_SET_OS_REPORTING_LINE,
>>>> +                             h_int_set_os_reporting_line);
>>>> +    spapr_register_hypercall(H_INT_GET_OS_REPORTING_LINE,
>>>> +                             h_int_get_os_reporting_line);
>>>> +    spapr_register_hypercall(H_INT_ESB, h_int_esb);
>>>> +    spapr_register_hypercall(H_INT_SYNC, h_int_sync);
>>>> +    spapr_register_hypercall(H_INT_RESET, h_int_reset);
>>>> +}
>>>> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
>>>> index 2569ae1bc7f8..da6fcfaa3c52 100644
>>>> --- a/hw/ppc/spapr_irq.c
>>>> +++ b/hw/ppc/spapr_irq.c
>>>> @@ -258,6 +258,8 @@ static void spapr_irq_init_xive(sPAPRMachineState *spapr, int nr_irqs,
>>>>          error_propagate(errp, local_err);
>>>>          return;
>>>>      }
>>>> +
>>>> +    spapr_xive_hcall_init(spapr);
>>>>  }
>>>>  
>>>>  static int spapr_irq_claim_xive(sPAPRMachineState *spapr, int irq, bool lsi,
>>>> diff --git a/hw/intc/Makefile.objs b/hw/intc/Makefile.objs
>>>> index 301a8e972d91..eacd26836ebf 100644
>>>> --- a/hw/intc/Makefile.objs
>>>> +++ b/hw/intc/Makefile.objs
>>>> @@ -38,7 +38,7 @@ obj-$(CONFIG_XICS) += xics.o
>>>>  obj-$(CONFIG_XICS_SPAPR) += xics_spapr.o
>>>>  obj-$(CONFIG_XICS_KVM) += xics_kvm.o
>>>>  obj-$(CONFIG_XIVE) += xive.o
>>>> -obj-$(CONFIG_XIVE_SPAPR) += spapr_xive.o
>>>> +obj-$(CONFIG_XIVE_SPAPR) += spapr_xive.o spapr_xive_hcall.o
>>>>  obj-$(CONFIG_POWERNV) += xics_pnv.o
>>>>  obj-$(CONFIG_ALLWINNER_A10_PIC) += allwinner-a10-pic.o
>>>>  obj-$(CONFIG_S390_FLIC) += s390_flic.o
>>>
>>
>
David Gibson Nov. 30, 2018, 1:23 a.m. UTC | #5
On Thu, Nov 29, 2018 at 05:04:50PM +0100, Cédric Le Goater wrote:
> On 11/29/18 2:23 AM, David Gibson wrote:
> > On Wed, Nov 28, 2018 at 11:21:37PM +0100, Cédric Le Goater wrote:
> >> On 11/28/18 5:25 AM, David Gibson wrote:
> >>> On Fri, Nov 16, 2018 at 11:57:09AM +0100, Cédric Le Goater wrote:
> >>>> The different XIVE virtualization structures (sources and event queues)
> >>>> are configured with a set of Hypervisor calls :
> >>>>
> >>>>  - H_INT_GET_SOURCE_INFO
> >>>>
> >>>>    used to obtain the address of the MMIO page of the Event State
> >>>>    Buffer (ESB) entry associated with the source.
> >>>>
> >>>>  - H_INT_SET_SOURCE_CONFIG
> >>>>
> >>>>    assigns a source to a "target".
> >>>>
> >>>>  - H_INT_GET_SOURCE_CONFIG
> >>>>
> >>>>    determines which "target" and "priority" is assigned to a source
> >>>>
> >>>>  - H_INT_GET_QUEUE_INFO
> >>>>
> >>>>    returns the address of the notification management page associated
> >>>>    with the specified "target" and "priority".
> >>>>
> >>>>  - H_INT_SET_QUEUE_CONFIG
> >>>>
> >>>>    sets or resets the event queue for a given "target" and "priority".
> >>>>    It is also used to set the notification configuration associated
> >>>>    with the queue, only unconditional notification is supported for
> >>>>    the moment. Reset is performed with a queue size of 0 and queueing
> >>>>    is disabled in that case.
> >>>>
> >>>>  - H_INT_GET_QUEUE_CONFIG
> >>>>
> >>>>    returns the queue settings for a given "target" and "priority".
> >>>>
> >>>>  - H_INT_RESET
> >>>>
> >>>>    resets all of the guest's internal interrupt structures to their
> >>>>    initial state, losing all configuration set via the hcalls
> >>>>    H_INT_SET_SOURCE_CONFIG and H_INT_SET_QUEUE_CONFIG.
> >>>>
> >>>>  - H_INT_SYNC
> >>>>
> >>>>    issue a synchronisation on a source to make sure all notifications
> >>>>    have reached their queue.
> >>>>
> >>>> Calls that still need to be addressed :
> >>>>
> >>>>    H_INT_SET_OS_REPORTING_LINE
> >>>>    H_INT_GET_OS_REPORTING_LINE
> >>>>
> >>>> See the code for more documentation on each hcall.
> >>>>
> >>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >>>> ---
> >>>>  include/hw/ppc/spapr.h      |  15 +-
> >>>>  include/hw/ppc/spapr_xive.h |   6 +
> >>>>  hw/intc/spapr_xive_hcall.c  | 892 ++++++++++++++++++++++++++++++++++++
> >>>>  hw/ppc/spapr_irq.c          |   2 +
> >>>>  hw/intc/Makefile.objs       |   2 +-
> >>>>  5 files changed, 915 insertions(+), 2 deletions(-)
> >>>>  create mode 100644 hw/intc/spapr_xive_hcall.c
> >>>>
> >>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> >>>> index 1fbc2663e06c..8415faea7b82 100644
> >>>> --- a/include/hw/ppc/spapr.h
> >>>> +++ b/include/hw/ppc/spapr.h
> >>>> @@ -452,7 +452,20 @@ struct sPAPRMachineState {
> >>>>  #define H_INVALIDATE_PID        0x378
> >>>>  #define H_REGISTER_PROC_TBL     0x37C
> >>>>  #define H_SIGNAL_SYS_RESET      0x380
> >>>> -#define MAX_HCALL_OPCODE        H_SIGNAL_SYS_RESET
> >>>> +
> >>>> +#define H_INT_GET_SOURCE_INFO   0x3A8
> >>>> +#define H_INT_SET_SOURCE_CONFIG 0x3AC
> >>>> +#define H_INT_GET_SOURCE_CONFIG 0x3B0
> >>>> +#define H_INT_GET_QUEUE_INFO    0x3B4
> >>>> +#define H_INT_SET_QUEUE_CONFIG  0x3B8
> >>>> +#define H_INT_GET_QUEUE_CONFIG  0x3BC
> >>>> +#define H_INT_SET_OS_REPORTING_LINE 0x3C0
> >>>> +#define H_INT_GET_OS_REPORTING_LINE 0x3C4
> >>>> +#define H_INT_ESB               0x3C8
> >>>> +#define H_INT_SYNC              0x3CC
> >>>> +#define H_INT_RESET             0x3D0
> >>>> +
> >>>> +#define MAX_HCALL_OPCODE        H_INT_RESET
> >>>>  
> >>>>  /* The hcalls above are standardized in PAPR and implemented by pHyp
> >>>>   * as well.
> >>>> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
> >>>> index 3f65b8f485fd..418511f3dc10 100644
> >>>> --- a/include/hw/ppc/spapr_xive.h
> >>>> +++ b/include/hw/ppc/spapr_xive.h
> >>>> @@ -60,4 +60,10 @@ int spapr_xive_target_to_end(sPAPRXive *xive, uint32_t target, uint8_t prio,
> >>>>  int spapr_xive_cpu_to_end(sPAPRXive *xive, PowerPCCPU *cpu, uint8_t prio,
> >>>>                            uint8_t *out_end_blk, uint32_t *out_end_idx);
> >>>>  
> >>>> +bool spapr_xive_priority_is_valid(uint8_t priority);
> >>>
> >>> AFAICT this could be a local function.
> >>
> >> the KVM model uses it also, when collecting state from the KVM device 
> >> to build the QEMU ENDT.
> >>
> >>>> +
> >>>> +typedef struct sPAPRMachineState sPAPRMachineState;
> >>>> +
> >>>> +void spapr_xive_hcall_init(sPAPRMachineState *spapr);
> >>>> +
> >>>>  #endif /* PPC_SPAPR_XIVE_H */
> >>>> diff --git a/hw/intc/spapr_xive_hcall.c b/hw/intc/spapr_xive_hcall.c
> >>>> new file mode 100644
> >>>> index 000000000000..52e4e23995f5
> >>>> --- /dev/null
> >>>> +++ b/hw/intc/spapr_xive_hcall.c
> >>>> @@ -0,0 +1,892 @@
> >>>> +/*
> >>>> + * QEMU PowerPC sPAPR XIVE interrupt controller model
> >>>> + *
> >>>> + * Copyright (c) 2017-2018, IBM Corporation.
> >>>> + *
> >>>> + * This code is licensed under the GPL version 2 or later. See the
> >>>> + * COPYING file in the top-level directory.
> >>>> + */
> >>>> +
> >>>> +#include "qemu/osdep.h"
> >>>> +#include "qemu/log.h"
> >>>> +#include "qapi/error.h"
> >>>> +#include "cpu.h"
> >>>> +#include "hw/ppc/fdt.h"
> >>>> +#include "hw/ppc/spapr.h"
> >>>> +#include "hw/ppc/spapr_xive.h"
> >>>> +#include "hw/ppc/xive_regs.h"
> >>>> +#include "monitor/monitor.h"
> >>>
> >>> Fwiw, I don't think it's particularly necessary to split the hcall
> >>> handling out into a separate .c file.
> >>
> >> ok. let's move it to spapr_xive then ? It might help in reducing the 
> >> exported funtions. 
> > 
> > Yes, I think so.
> > 
> >>>> +/*
> >>>> + * OPAL uses the priority 7 EQ to automatically escalate interrupts
> >>>> + * for all other queues (DD2.X POWER9). So only priorities [0..6] are
> >>>> + * available for the guest.
> >>>
> >>> Referencing OPAL behaviour doesn't really make sense in the context of
> >>> PAPR.  
> >>
> >> It's an OPAL constraint which pHyp doesn't have. So its a QEMU/KVM 
> >> constraint also.
> > 
> > Right, I realized that a few patches on.  Maybe rephrase this to
> > 
> >    Linux hosts under OPAL reserve priority 7 for their own escalation
> >    interrupts.  So we only allow the guest to use priorities [0..6].
> 
> OK.
> 
> > The point here is that we're emphasizing that this is a design
> > decision to make the host implementation easier, rather than a
> > fundamental constraint.
> > 
> >>> What I think you're getting at is that the PAPR spec only
> >>> allows a PAPR guest to use priorities 0..6 (or at least it will if the
> >>> XIVE updated spec ever gets published).  
> >>
> >> It's not in the spec. the XIVE sPAPR spec should be frozen soon btw. 
> >>  
> >>> The fact that this allows the
> >>> host use 7 for escalations is a design rationale 
> >>> but not really relevant to the guest device itself. 
> >>
> >> The guest should be aware of which priorities are reserved for
> >> the hypervisor though.
> >>
> >>>> + */
> >>>> +bool spapr_xive_priority_is_valid(uint8_t priority)
> >>>> +{
> >>>> +    switch (priority) {
> >>>> +    case 0 ... 6:
> >>>> +        return true;
> >>>> +    case 7: /* OPAL escalation queue */
> >>>> +    default:
> >>>> +        return false;
> >>>> +    }
> >>>> +}
> >>>> +
> >>>> +/*
> >>>> + * The H_INT_GET_SOURCE_INFO hcall() is used to obtain the logical
> >>>> + * real address of the MMIO page through which the Event State Buffer
> >>>> + * entry associated with the value of the "lisn" parameter is managed.
> >>>> + *
> >>>> + * Parameters:
> >>>> + * Input
> >>>> + * - "flags"
> >>>> + *       Bits 0-63 reserved
> >>>> + * - "lisn" is per "interrupts", "interrupt-map", or
> >>>> + *       "ibm,xive-lisn-ranges" properties, or as returned by the
> >>>> + *       ibm,query-interrupt-source-number RTAS call, or as returned
> >>>> + *       by the H_ALLOCATE_VAS_WINDOW hcall
> >>>
> >>> I've not heard of H_ALLOCATE_VAS_WINDOW.  Is that something we intend
> >>> to implement in kvm/qemu, or is it only of interest for PowerVM?
> >>
> >> The hcall is part of the PAPR NX Interfaces and it returns interrupt
> >> numbers. I don't know if any work has been done on the topic.  
> > 
> > What's a "PAPR NX"?
> 
> A way for the PAPR guests to access the POWER coprocessors doing 
> compression and encryption. I really don't know much about this.

Ah, ok.

[snip]
> >> I think not, but the specs are not very clear on that topic. I will
> >> ask for clarification and use a -1 for now. We can not do loads on
> >> the trigger page so it can not be used by the H_INT_ESB hcall.
> >>
> >>>
> >>>> +    args[3] = TARGET_PAGE_SIZE;
> >>>
> >>> That seems wrong.  
> >>
> >> This is utterly wrong. it should be a power of 2 number ... I got
> >> it right under KVM though. I guess that ioremap() under Linux rounds 
> >> up the size to the page size in use, so, that's why it didn't blow
> >> up under TCG.
> >>
> >>> TARGET_PAGE_SIZE is generally 4kiB, but won't these usually
> >>> actually be 64kiB?
> >>
> >> yes. So what should I use to get a PAGE_SHIFT instead ? 
> > 
> > Erm, that gets a bit tricky, since qemu in a sense doesn't know the
> > guest's page size.
> > 
> > But.. don't you actually want the esb_shift here, not PAGE_SHIFT - it
> > could matter for the 2 page * 64kiB variant, yes?
> 
> Yes. we just want the page_shift of the ESB page, whether it's one or
> two pages. The other registers inform the guest if there are one or 
> two ESB page in use. 

Ok, still sounds like you should base it on esb_shift, just adjust for
the two page case.

> >>>> +    }
> >>>> +
> >>>> +    switch (qsize) {
> >>>> +    case 12:
> >>>> +    case 16:
> >>>> +    case 21:
> >>>> +    case 24:
> >>>> +        end.w3 = ((uint64_t)qpage) & 0xffffffff;
> >>>
> >>> It just occurred to me that I haven't been looking for this across any
> >>> of these reviews.  Don't you need byteswaps when accessing these
> >>> in-memory structures?
> >>
> >> yes this is done when some event data is enqueued in the EQ.
> > 
> > I'm not talking about the data in the EQ itself, but the fields in the
> > END (and the NVT).
> 
> XIVE is all BE.

Yes... the qemu host might not be, which is why you need byteswaps.

I realized eventually you have the swaps in your pnv get/set
accessors.  I don't like that at all for a couple of reasons:

1) Although the END structure is made up of word-sized fields because
that's convenient, the END really is made of a bunch of subfields of
different sizes.  Knowing that it wouldn't be unreasonable for people
to expect they can look into the XIVE by byte offsets; that will break
if you're working with a copy that has already been byte-swapped on
word-sized units.

2) At different points in the code you're storing both BE and
native-endian data in the same struct.  That's both confusing to
someone reading the code (if they see that struct they don't know if
it's byteswapped already) and also means you can't use sparse
annotations to make sure you have it right.
Cédric Le Goater Nov. 30, 2018, 8:07 a.m. UTC | #6
On 11/30/18 2:23 AM, David Gibson wrote:
> On Thu, Nov 29, 2018 at 05:04:50PM +0100, Cédric Le Goater wrote:
>> On 11/29/18 2:23 AM, David Gibson wrote:
>>> On Wed, Nov 28, 2018 at 11:21:37PM +0100, Cédric Le Goater wrote:
>>>> On 11/28/18 5:25 AM, David Gibson wrote:
>>>>> On Fri, Nov 16, 2018 at 11:57:09AM +0100, Cédric Le Goater wrote:
>>>>>> The different XIVE virtualization structures (sources and event queues)
>>>>>> are configured with a set of Hypervisor calls :
>>>>>>
>>>>>>  - H_INT_GET_SOURCE_INFO
>>>>>>
>>>>>>    used to obtain the address of the MMIO page of the Event State
>>>>>>    Buffer (ESB) entry associated with the source.
>>>>>>
>>>>>>  - H_INT_SET_SOURCE_CONFIG
>>>>>>
>>>>>>    assigns a source to a "target".
>>>>>>
>>>>>>  - H_INT_GET_SOURCE_CONFIG
>>>>>>
>>>>>>    determines which "target" and "priority" is assigned to a source
>>>>>>
>>>>>>  - H_INT_GET_QUEUE_INFO
>>>>>>
>>>>>>    returns the address of the notification management page associated
>>>>>>    with the specified "target" and "priority".
>>>>>>
>>>>>>  - H_INT_SET_QUEUE_CONFIG
>>>>>>
>>>>>>    sets or resets the event queue for a given "target" and "priority".
>>>>>>    It is also used to set the notification configuration associated
>>>>>>    with the queue, only unconditional notification is supported for
>>>>>>    the moment. Reset is performed with a queue size of 0 and queueing
>>>>>>    is disabled in that case.
>>>>>>
>>>>>>  - H_INT_GET_QUEUE_CONFIG
>>>>>>
>>>>>>    returns the queue settings for a given "target" and "priority".
>>>>>>
>>>>>>  - H_INT_RESET
>>>>>>
>>>>>>    resets all of the guest's internal interrupt structures to their
>>>>>>    initial state, losing all configuration set via the hcalls
>>>>>>    H_INT_SET_SOURCE_CONFIG and H_INT_SET_QUEUE_CONFIG.
>>>>>>
>>>>>>  - H_INT_SYNC
>>>>>>
>>>>>>    issue a synchronisation on a source to make sure all notifications
>>>>>>    have reached their queue.
>>>>>>
>>>>>> Calls that still need to be addressed :
>>>>>>
>>>>>>    H_INT_SET_OS_REPORTING_LINE
>>>>>>    H_INT_GET_OS_REPORTING_LINE
>>>>>>
>>>>>> See the code for more documentation on each hcall.
>>>>>>
>>>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>>>> ---
>>>>>>  include/hw/ppc/spapr.h      |  15 +-
>>>>>>  include/hw/ppc/spapr_xive.h |   6 +
>>>>>>  hw/intc/spapr_xive_hcall.c  | 892 ++++++++++++++++++++++++++++++++++++
>>>>>>  hw/ppc/spapr_irq.c          |   2 +
>>>>>>  hw/intc/Makefile.objs       |   2 +-
>>>>>>  5 files changed, 915 insertions(+), 2 deletions(-)
>>>>>>  create mode 100644 hw/intc/spapr_xive_hcall.c
>>>>>>
>>>>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>>>>>> index 1fbc2663e06c..8415faea7b82 100644
>>>>>> --- a/include/hw/ppc/spapr.h
>>>>>> +++ b/include/hw/ppc/spapr.h
>>>>>> @@ -452,7 +452,20 @@ struct sPAPRMachineState {
>>>>>>  #define H_INVALIDATE_PID        0x378
>>>>>>  #define H_REGISTER_PROC_TBL     0x37C
>>>>>>  #define H_SIGNAL_SYS_RESET      0x380
>>>>>> -#define MAX_HCALL_OPCODE        H_SIGNAL_SYS_RESET
>>>>>> +
>>>>>> +#define H_INT_GET_SOURCE_INFO   0x3A8
>>>>>> +#define H_INT_SET_SOURCE_CONFIG 0x3AC
>>>>>> +#define H_INT_GET_SOURCE_CONFIG 0x3B0
>>>>>> +#define H_INT_GET_QUEUE_INFO    0x3B4
>>>>>> +#define H_INT_SET_QUEUE_CONFIG  0x3B8
>>>>>> +#define H_INT_GET_QUEUE_CONFIG  0x3BC
>>>>>> +#define H_INT_SET_OS_REPORTING_LINE 0x3C0
>>>>>> +#define H_INT_GET_OS_REPORTING_LINE 0x3C4
>>>>>> +#define H_INT_ESB               0x3C8
>>>>>> +#define H_INT_SYNC              0x3CC
>>>>>> +#define H_INT_RESET             0x3D0
>>>>>> +
>>>>>> +#define MAX_HCALL_OPCODE        H_INT_RESET
>>>>>>  
>>>>>>  /* The hcalls above are standardized in PAPR and implemented by pHyp
>>>>>>   * as well.
>>>>>> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
>>>>>> index 3f65b8f485fd..418511f3dc10 100644
>>>>>> --- a/include/hw/ppc/spapr_xive.h
>>>>>> +++ b/include/hw/ppc/spapr_xive.h
>>>>>> @@ -60,4 +60,10 @@ int spapr_xive_target_to_end(sPAPRXive *xive, uint32_t target, uint8_t prio,
>>>>>>  int spapr_xive_cpu_to_end(sPAPRXive *xive, PowerPCCPU *cpu, uint8_t prio,
>>>>>>                            uint8_t *out_end_blk, uint32_t *out_end_idx);
>>>>>>  
>>>>>> +bool spapr_xive_priority_is_valid(uint8_t priority);
>>>>>
>>>>> AFAICT this could be a local function.
>>>>
>>>> the KVM model uses it also, when collecting state from the KVM device 
>>>> to build the QEMU ENDT.
>>>>
>>>>>> +
>>>>>> +typedef struct sPAPRMachineState sPAPRMachineState;
>>>>>> +
>>>>>> +void spapr_xive_hcall_init(sPAPRMachineState *spapr);
>>>>>> +
>>>>>>  #endif /* PPC_SPAPR_XIVE_H */
>>>>>> diff --git a/hw/intc/spapr_xive_hcall.c b/hw/intc/spapr_xive_hcall.c
>>>>>> new file mode 100644
>>>>>> index 000000000000..52e4e23995f5
>>>>>> --- /dev/null
>>>>>> +++ b/hw/intc/spapr_xive_hcall.c
>>>>>> @@ -0,0 +1,892 @@
>>>>>> +/*
>>>>>> + * QEMU PowerPC sPAPR XIVE interrupt controller model
>>>>>> + *
>>>>>> + * Copyright (c) 2017-2018, IBM Corporation.
>>>>>> + *
>>>>>> + * This code is licensed under the GPL version 2 or later. See the
>>>>>> + * COPYING file in the top-level directory.
>>>>>> + */
>>>>>> +
>>>>>> +#include "qemu/osdep.h"
>>>>>> +#include "qemu/log.h"
>>>>>> +#include "qapi/error.h"
>>>>>> +#include "cpu.h"
>>>>>> +#include "hw/ppc/fdt.h"
>>>>>> +#include "hw/ppc/spapr.h"
>>>>>> +#include "hw/ppc/spapr_xive.h"
>>>>>> +#include "hw/ppc/xive_regs.h"
>>>>>> +#include "monitor/monitor.h"
>>>>>
>>>>> Fwiw, I don't think it's particularly necessary to split the hcall
>>>>> handling out into a separate .c file.
>>>>
>>>> ok. let's move it to spapr_xive then ? It might help in reducing the 
>>>> exported funtions. 
>>>
>>> Yes, I think so.
>>>
>>>>>> +/*
>>>>>> + * OPAL uses the priority 7 EQ to automatically escalate interrupts
>>>>>> + * for all other queues (DD2.X POWER9). So only priorities [0..6] are
>>>>>> + * available for the guest.
>>>>>
>>>>> Referencing OPAL behaviour doesn't really make sense in the context of
>>>>> PAPR.  
>>>>
>>>> It's an OPAL constraint which pHyp doesn't have. So its a QEMU/KVM 
>>>> constraint also.
>>>
>>> Right, I realized that a few patches on.  Maybe rephrase this to
>>>
>>>    Linux hosts under OPAL reserve priority 7 for their own escalation
>>>    interrupts.  So we only allow the guest to use priorities [0..6].
>>
>> OK.
>>
>>> The point here is that we're emphasizing that this is a design
>>> decision to make the host implementation easier, rather than a
>>> fundamental constraint.
>>>
>>>>> What I think you're getting at is that the PAPR spec only
>>>>> allows a PAPR guest to use priorities 0..6 (or at least it will if the
>>>>> XIVE updated spec ever gets published).  
>>>>
>>>> It's not in the spec. the XIVE sPAPR spec should be frozen soon btw. 
>>>>  
>>>>> The fact that this allows the
>>>>> host use 7 for escalations is a design rationale 
>>>>> but not really relevant to the guest device itself. 
>>>>
>>>> The guest should be aware of which priorities are reserved for
>>>> the hypervisor though.
>>>>
>>>>>> + */
>>>>>> +bool spapr_xive_priority_is_valid(uint8_t priority)
>>>>>> +{
>>>>>> +    switch (priority) {
>>>>>> +    case 0 ... 6:
>>>>>> +        return true;
>>>>>> +    case 7: /* OPAL escalation queue */
>>>>>> +    default:
>>>>>> +        return false;
>>>>>> +    }
>>>>>> +}
>>>>>> +
>>>>>> +/*
>>>>>> + * The H_INT_GET_SOURCE_INFO hcall() is used to obtain the logical
>>>>>> + * real address of the MMIO page through which the Event State Buffer
>>>>>> + * entry associated with the value of the "lisn" parameter is managed.
>>>>>> + *
>>>>>> + * Parameters:
>>>>>> + * Input
>>>>>> + * - "flags"
>>>>>> + *       Bits 0-63 reserved
>>>>>> + * - "lisn" is per "interrupts", "interrupt-map", or
>>>>>> + *       "ibm,xive-lisn-ranges" properties, or as returned by the
>>>>>> + *       ibm,query-interrupt-source-number RTAS call, or as returned
>>>>>> + *       by the H_ALLOCATE_VAS_WINDOW hcall
>>>>>
>>>>> I've not heard of H_ALLOCATE_VAS_WINDOW.  Is that something we intend
>>>>> to implement in kvm/qemu, or is it only of interest for PowerVM?
>>>>
>>>> The hcall is part of the PAPR NX Interfaces and it returns interrupt
>>>> numbers. I don't know if any work has been done on the topic.  
>>>
>>> What's a "PAPR NX"?
>>
>> A way for the PAPR guests to access the POWER coprocessors doing 
>> compression and encryption. I really don't know much about this.
> 
> Ah, ok.
> 
> [snip]
>>>> I think not, but the specs are not very clear on that topic. I will
>>>> ask for clarification and use a -1 for now. We can not do loads on
>>>> the trigger page so it can not be used by the H_INT_ESB hcall.
>>>>
>>>>>
>>>>>> +    args[3] = TARGET_PAGE_SIZE;
>>>>>
>>>>> That seems wrong.  
>>>>
>>>> This is utterly wrong. it should be a power of 2 number ... I got
>>>> it right under KVM though. I guess that ioremap() under Linux rounds 
>>>> up the size to the page size in use, so, that's why it didn't blow
>>>> up under TCG.
>>>>
>>>>> TARGET_PAGE_SIZE is generally 4kiB, but won't these usually
>>>>> actually be 64kiB?
>>>>
>>>> yes. So what should I use to get a PAGE_SHIFT instead ? 
>>>
>>> Erm, that gets a bit tricky, since qemu in a sense doesn't know the
>>> guest's page size.
>>>
>>> But.. don't you actually want the esb_shift here, not PAGE_SHIFT - it
>>> could matter for the 2 page * 64kiB variant, yes?
>>
>> Yes. we just want the page_shift of the ESB page, whether it's one or
>> two pages. The other registers inform the guest if there are one or 
>> two ESB page in use. 
> 
> Ok, still sounds like you should base it on esb_shift, just adjust for
> the two page case.

yes.


>>>>>> +    }
>>>>>> +
>>>>>> +    switch (qsize) {
>>>>>> +    case 12:
>>>>>> +    case 16:
>>>>>> +    case 21:
>>>>>> +    case 24:
>>>>>> +        end.w3 = ((uint64_t)qpage) & 0xffffffff;
>>>>>
>>>>> It just occurred to me that I haven't been looking for this across any
>>>>> of these reviews.  Don't you need byteswaps when accessing these
>>>>> in-memory structures?
>>>>
>>>> yes this is done when some event data is enqueued in the EQ.
>>>
>>> I'm not talking about the data in the EQ itself, but the fields in the
>>> END (and the NVT).
>>
>> XIVE is all BE.
> 
> Yes... the qemu host might not be, which is why you need byteswaps.

ok. I understand.

> I realized eventually you have the swaps in your pnv get/set
> accessors.  

Yes. because skiboot is BE, like the XIVE structures.  

> I don't like that at all for a couple of reasons:
> 
> 1) Although the END structure is made up of word-sized fields because
> that's convenient, the END really is made of a bunch of subfields of
> different sizes.  Knowing that it wouldn't be unreasonable for people
> to expect they can look into the XIVE by byte offsets; 

These structures should be accessed with GETFIELD and SETFIELD macros
using the XIVE definitions in the xive_regs.h header file. I would want 
to keep that common with skiboot  for sure.

Are you suggesting we should define each field of the XIVE structures 
with C attributes ? That would be very unfortunate. 
 
> that will break
> if you're working with a copy that has already been byte-swapped on
> word-sized units.

I am not sure I understand the last sentence. 

the code working with a copy would necessarily know that the structure 
has been byteswapped and use correct offsets for the expected endianess. 
no ? why would it break ?  

> 2) At different points in the code you're storing both BE and
> native-endian data in the same struct. 

on sPAPR, it's all native (which is a violation I agree). TIMA is BE.

> That's both confusing to
> someone reading the code (if they see that struct they don't know if
> it's byteswapped already) and also means you can't use sparse
> annotations to make sure you have it right.

XIVE structures are architected to be BE. That's immutable.

It's a not problem for skiboot which is BE. The PnvXIVE model for the 
QEMU PowerNV machine reads these VSTs (Virtual Structure Tables) from 
the guest RAM and byteswaps the structure before using it. I think
that's fine. Isn't it ? 

It becomes a problem with the sPAPR model which is using the XIVE structures 
in native endianess and not BE anymore. But the guest OS never manipulates 
these structures, so under the hood, I think we are free to use them in 
native and keep the common definitions.

Except that the event data entries in the OS EQs are BE. So the only place 
where we convert is when an event data is enqueued. 

What would you put in place if you think this is a too strong violation 
of the architecture ? I am afraid of something too complex to manipulate
to be honest. May be we can drop the map/unmap access methods only keep 
the very basic ones.  

C.
David Gibson Dec. 3, 2018, 1:36 a.m. UTC | #7
On Fri, Nov 30, 2018 at 09:07:19AM +0100, Cédric Le Goater wrote:
> On 11/30/18 2:23 AM, David Gibson wrote:
> > On Thu, Nov 29, 2018 at 05:04:50PM +0100, Cédric Le Goater wrote:
> >> On 11/29/18 2:23 AM, David Gibson wrote:
> >>> On Wed, Nov 28, 2018 at 11:21:37PM +0100, Cédric Le Goater wrote:
> >>>> On 11/28/18 5:25 AM, David Gibson wrote:
> >>>>> On Fri, Nov 16, 2018 at 11:57:09AM +0100, Cédric Le Goater wrote:
> >>>>>> The different XIVE virtualization structures (sources and event queues)
> >>>>>> are configured with a set of Hypervisor calls :
> >>>>>>
> >>>>>>  - H_INT_GET_SOURCE_INFO
> >>>>>>
> >>>>>>    used to obtain the address of the MMIO page of the Event State
> >>>>>>    Buffer (ESB) entry associated with the source.
> >>>>>>
> >>>>>>  - H_INT_SET_SOURCE_CONFIG
> >>>>>>
> >>>>>>    assigns a source to a "target".
> >>>>>>
> >>>>>>  - H_INT_GET_SOURCE_CONFIG
> >>>>>>
> >>>>>>    determines which "target" and "priority" is assigned to a source
> >>>>>>
> >>>>>>  - H_INT_GET_QUEUE_INFO
> >>>>>>
> >>>>>>    returns the address of the notification management page associated
> >>>>>>    with the specified "target" and "priority".
> >>>>>>
> >>>>>>  - H_INT_SET_QUEUE_CONFIG
> >>>>>>
> >>>>>>    sets or resets the event queue for a given "target" and "priority".
> >>>>>>    It is also used to set the notification configuration associated
> >>>>>>    with the queue, only unconditional notification is supported for
> >>>>>>    the moment. Reset is performed with a queue size of 0 and queueing
> >>>>>>    is disabled in that case.
> >>>>>>
> >>>>>>  - H_INT_GET_QUEUE_CONFIG
> >>>>>>
> >>>>>>    returns the queue settings for a given "target" and "priority".
> >>>>>>
> >>>>>>  - H_INT_RESET
> >>>>>>
> >>>>>>    resets all of the guest's internal interrupt structures to their
> >>>>>>    initial state, losing all configuration set via the hcalls
> >>>>>>    H_INT_SET_SOURCE_CONFIG and H_INT_SET_QUEUE_CONFIG.
> >>>>>>
> >>>>>>  - H_INT_SYNC
> >>>>>>
> >>>>>>    issue a synchronisation on a source to make sure all notifications
> >>>>>>    have reached their queue.
> >>>>>>
> >>>>>> Calls that still need to be addressed :
> >>>>>>
> >>>>>>    H_INT_SET_OS_REPORTING_LINE
> >>>>>>    H_INT_GET_OS_REPORTING_LINE
> >>>>>>
> >>>>>> See the code for more documentation on each hcall.
> >>>>>>
> >>>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >>>>>> ---
> >>>>>>  include/hw/ppc/spapr.h      |  15 +-
> >>>>>>  include/hw/ppc/spapr_xive.h |   6 +
> >>>>>>  hw/intc/spapr_xive_hcall.c  | 892 ++++++++++++++++++++++++++++++++++++
> >>>>>>  hw/ppc/spapr_irq.c          |   2 +
> >>>>>>  hw/intc/Makefile.objs       |   2 +-
> >>>>>>  5 files changed, 915 insertions(+), 2 deletions(-)
> >>>>>>  create mode 100644 hw/intc/spapr_xive_hcall.c
> >>>>>>
> >>>>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> >>>>>> index 1fbc2663e06c..8415faea7b82 100644
> >>>>>> --- a/include/hw/ppc/spapr.h
> >>>>>> +++ b/include/hw/ppc/spapr.h
> >>>>>> @@ -452,7 +452,20 @@ struct sPAPRMachineState {
> >>>>>>  #define H_INVALIDATE_PID        0x378
> >>>>>>  #define H_REGISTER_PROC_TBL     0x37C
> >>>>>>  #define H_SIGNAL_SYS_RESET      0x380
> >>>>>> -#define MAX_HCALL_OPCODE        H_SIGNAL_SYS_RESET
> >>>>>> +
> >>>>>> +#define H_INT_GET_SOURCE_INFO   0x3A8
> >>>>>> +#define H_INT_SET_SOURCE_CONFIG 0x3AC
> >>>>>> +#define H_INT_GET_SOURCE_CONFIG 0x3B0
> >>>>>> +#define H_INT_GET_QUEUE_INFO    0x3B4
> >>>>>> +#define H_INT_SET_QUEUE_CONFIG  0x3B8
> >>>>>> +#define H_INT_GET_QUEUE_CONFIG  0x3BC
> >>>>>> +#define H_INT_SET_OS_REPORTING_LINE 0x3C0
> >>>>>> +#define H_INT_GET_OS_REPORTING_LINE 0x3C4
> >>>>>> +#define H_INT_ESB               0x3C8
> >>>>>> +#define H_INT_SYNC              0x3CC
> >>>>>> +#define H_INT_RESET             0x3D0
> >>>>>> +
> >>>>>> +#define MAX_HCALL_OPCODE        H_INT_RESET
> >>>>>>  
> >>>>>>  /* The hcalls above are standardized in PAPR and implemented by pHyp
> >>>>>>   * as well.
> >>>>>> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
> >>>>>> index 3f65b8f485fd..418511f3dc10 100644
> >>>>>> --- a/include/hw/ppc/spapr_xive.h
> >>>>>> +++ b/include/hw/ppc/spapr_xive.h
> >>>>>> @@ -60,4 +60,10 @@ int spapr_xive_target_to_end(sPAPRXive *xive, uint32_t target, uint8_t prio,
> >>>>>>  int spapr_xive_cpu_to_end(sPAPRXive *xive, PowerPCCPU *cpu, uint8_t prio,
> >>>>>>                            uint8_t *out_end_blk, uint32_t *out_end_idx);
> >>>>>>  
> >>>>>> +bool spapr_xive_priority_is_valid(uint8_t priority);
> >>>>>
> >>>>> AFAICT this could be a local function.
> >>>>
> >>>> the KVM model uses it also, when collecting state from the KVM device 
> >>>> to build the QEMU ENDT.
> >>>>
> >>>>>> +
> >>>>>> +typedef struct sPAPRMachineState sPAPRMachineState;
> >>>>>> +
> >>>>>> +void spapr_xive_hcall_init(sPAPRMachineState *spapr);
> >>>>>> +
> >>>>>>  #endif /* PPC_SPAPR_XIVE_H */
> >>>>>> diff --git a/hw/intc/spapr_xive_hcall.c b/hw/intc/spapr_xive_hcall.c
> >>>>>> new file mode 100644
> >>>>>> index 000000000000..52e4e23995f5
> >>>>>> --- /dev/null
> >>>>>> +++ b/hw/intc/spapr_xive_hcall.c
> >>>>>> @@ -0,0 +1,892 @@
> >>>>>> +/*
> >>>>>> + * QEMU PowerPC sPAPR XIVE interrupt controller model
> >>>>>> + *
> >>>>>> + * Copyright (c) 2017-2018, IBM Corporation.
> >>>>>> + *
> >>>>>> + * This code is licensed under the GPL version 2 or later. See the
> >>>>>> + * COPYING file in the top-level directory.
> >>>>>> + */
> >>>>>> +
> >>>>>> +#include "qemu/osdep.h"
> >>>>>> +#include "qemu/log.h"
> >>>>>> +#include "qapi/error.h"
> >>>>>> +#include "cpu.h"
> >>>>>> +#include "hw/ppc/fdt.h"
> >>>>>> +#include "hw/ppc/spapr.h"
> >>>>>> +#include "hw/ppc/spapr_xive.h"
> >>>>>> +#include "hw/ppc/xive_regs.h"
> >>>>>> +#include "monitor/monitor.h"
> >>>>>
> >>>>> Fwiw, I don't think it's particularly necessary to split the hcall
> >>>>> handling out into a separate .c file.
> >>>>
> >>>> ok. let's move it to spapr_xive then ? It might help in reducing the 
> >>>> exported funtions. 
> >>>
> >>> Yes, I think so.
> >>>
> >>>>>> +/*
> >>>>>> + * OPAL uses the priority 7 EQ to automatically escalate interrupts
> >>>>>> + * for all other queues (DD2.X POWER9). So only priorities [0..6] are
> >>>>>> + * available for the guest.
> >>>>>
> >>>>> Referencing OPAL behaviour doesn't really make sense in the context of
> >>>>> PAPR.  
> >>>>
> >>>> It's an OPAL constraint which pHyp doesn't have. So its a QEMU/KVM 
> >>>> constraint also.
> >>>
> >>> Right, I realized that a few patches on.  Maybe rephrase this to
> >>>
> >>>    Linux hosts under OPAL reserve priority 7 for their own escalation
> >>>    interrupts.  So we only allow the guest to use priorities [0..6].
> >>
> >> OK.
> >>
> >>> The point here is that we're emphasizing that this is a design
> >>> decision to make the host implementation easier, rather than a
> >>> fundamental constraint.
> >>>
> >>>>> What I think you're getting at is that the PAPR spec only
> >>>>> allows a PAPR guest to use priorities 0..6 (or at least it will if the
> >>>>> XIVE updated spec ever gets published).  
> >>>>
> >>>> It's not in the spec. the XIVE sPAPR spec should be frozen soon btw. 
> >>>>  
> >>>>> The fact that this allows the
> >>>>> host use 7 for escalations is a design rationale 
> >>>>> but not really relevant to the guest device itself. 
> >>>>
> >>>> The guest should be aware of which priorities are reserved for
> >>>> the hypervisor though.
> >>>>
> >>>>>> + */
> >>>>>> +bool spapr_xive_priority_is_valid(uint8_t priority)
> >>>>>> +{
> >>>>>> +    switch (priority) {
> >>>>>> +    case 0 ... 6:
> >>>>>> +        return true;
> >>>>>> +    case 7: /* OPAL escalation queue */
> >>>>>> +    default:
> >>>>>> +        return false;
> >>>>>> +    }
> >>>>>> +}
> >>>>>> +
> >>>>>> +/*
> >>>>>> + * The H_INT_GET_SOURCE_INFO hcall() is used to obtain the logical
> >>>>>> + * real address of the MMIO page through which the Event State Buffer
> >>>>>> + * entry associated with the value of the "lisn" parameter is managed.
> >>>>>> + *
> >>>>>> + * Parameters:
> >>>>>> + * Input
> >>>>>> + * - "flags"
> >>>>>> + *       Bits 0-63 reserved
> >>>>>> + * - "lisn" is per "interrupts", "interrupt-map", or
> >>>>>> + *       "ibm,xive-lisn-ranges" properties, or as returned by the
> >>>>>> + *       ibm,query-interrupt-source-number RTAS call, or as returned
> >>>>>> + *       by the H_ALLOCATE_VAS_WINDOW hcall
> >>>>>
> >>>>> I've not heard of H_ALLOCATE_VAS_WINDOW.  Is that something we intend
> >>>>> to implement in kvm/qemu, or is it only of interest for PowerVM?
> >>>>
> >>>> The hcall is part of the PAPR NX Interfaces and it returns interrupt
> >>>> numbers. I don't know if any work has been done on the topic.  
> >>>
> >>> What's a "PAPR NX"?
> >>
> >> A way for the PAPR guests to access the POWER coprocessors doing 
> >> compression and encryption. I really don't know much about this.
> > 
> > Ah, ok.
> > 
> > [snip]
> >>>> I think not, but the specs are not very clear on that topic. I will
> >>>> ask for clarification and use a -1 for now. We can not do loads on
> >>>> the trigger page so it can not be used by the H_INT_ESB hcall.
> >>>>
> >>>>>
> >>>>>> +    args[3] = TARGET_PAGE_SIZE;
> >>>>>
> >>>>> That seems wrong.  
> >>>>
> >>>> This is utterly wrong. it should be a power of 2 number ... I got
> >>>> it right under KVM though. I guess that ioremap() under Linux rounds 
> >>>> up the size to the page size in use, so, that's why it didn't blow
> >>>> up under TCG.
> >>>>
> >>>>> TARGET_PAGE_SIZE is generally 4kiB, but won't these usually
> >>>>> actually be 64kiB?
> >>>>
> >>>> yes. So what should I use to get a PAGE_SHIFT instead ? 
> >>>
> >>> Erm, that gets a bit tricky, since qemu in a sense doesn't know the
> >>> guest's page size.
> >>>
> >>> But.. don't you actually want the esb_shift here, not PAGE_SHIFT - it
> >>> could matter for the 2 page * 64kiB variant, yes?
> >>
> >> Yes. we just want the page_shift of the ESB page, whether it's one or
> >> two pages. The other registers inform the guest if there are one or 
> >> two ESB page in use. 
> > 
> > Ok, still sounds like you should base it on esb_shift, just adjust for
> > the two page case.
> 
> yes.
> 
> 
> >>>>>> +    }
> >>>>>> +
> >>>>>> +    switch (qsize) {
> >>>>>> +    case 12:
> >>>>>> +    case 16:
> >>>>>> +    case 21:
> >>>>>> +    case 24:
> >>>>>> +        end.w3 = ((uint64_t)qpage) & 0xffffffff;
> >>>>>
> >>>>> It just occurred to me that I haven't been looking for this across any
> >>>>> of these reviews.  Don't you need byteswaps when accessing these
> >>>>> in-memory structures?
> >>>>
> >>>> yes this is done when some event data is enqueued in the EQ.
> >>>
> >>> I'm not talking about the data in the EQ itself, but the fields in the
> >>> END (and the NVT).
> >>
> >> XIVE is all BE.
> > 
> > Yes... the qemu host might not be, which is why you need byteswaps.
> 
> ok. I understand.
> 
> > I realized eventually you have the swaps in your pnv get/set
> > accessors.  
> 
> Yes. because skiboot is BE, like the XIVE structures.

skiboot's endiannness isn't really relevant, because we're modelling
below that level.

> > I don't like that at all for a couple of reasons:
> > 
> > 1) Although the END structure is made up of word-sized fields because
> > that's convenient, the END really is made of a bunch of subfields of
> > different sizes.  Knowing that it wouldn't be unreasonable for people
> > to expect they can look into the XIVE by byte offsets; 
> 
> These structures should be accessed with GETFIELD and SETFIELD macros
> using the XIVE definitions in the xive_regs.h header file. I would want 
> to keep that common with skiboot  for sure.

Right.  It might make sense to make some helper macros or inlines that
include both the GETFIELD/SETFIELD and the byteswap.

> Are you suggesting we should define each field of the XIVE structures 
> with C attributes ? That would be very unfortunate.

Oh no, bitfields are a complete mess.

> > that will break
> > if you're working with a copy that has already been byte-swapped on
> > word-sized units.
> 
> I am not sure I understand the last sentence.

I mean that GETFIELD/SETFIELD only work on values that are already
native endian, but using byte offsets would only work on values that
are still in BE.

> the code working with a copy would necessarily know that the structure 
> has been byteswapped and use correct offsets for the expected endianess. 
> no ? why would it break ?  
> 
> > 2) At different points in the code you're storing both BE and
> > native-endian data in the same struct. 
> 
> on sPAPR, it's all native (which is a violation I agree).

Don't do that.  Having the same structure be BE in some situations and
native endian in other situations is a sure path to madness.

> TIMA is BE.
> 
> > That's both confusing to
> > someone reading the code (if they see that struct they don't know if
> > it's byteswapped already) and also means you can't use sparse
> > annotations to make sure you have it right.
> 
> XIVE structures are architected to be BE. That's immutable.

Yes, absolutely.  So don't represent them in C structs that are in
native endian.  Ever, even temporarily.

> It's a not problem for skiboot which is BE. The PnvXIVE model for the 
> QEMU PowerNV machine reads these VSTs (Virtual Structure Tables) from 
> the guest RAM and byteswaps the structure before using it. I think
> that's fine. Isn't it ?

Byteswapping structures - rather than individual fields as you use
them - is almost always a bad idea.  It's insanely easy to lose track
of whether this particular instance of the structure is swapped yet or
not, and you can't use sparse (or whatever) to check it for you.

Stick to one endianness for a struct, and do the byteswaps when you
access the fields (using helpers if that's, well, helpful).

> It becomes a problem with the sPAPR model which is using the XIVE structures 
> in native endianess and not BE anymore. But the guest OS never manipulates 
> these structures, so under the hood, I think we are free to use them in 
> native and keep the common definitions.

Free to in the sense that it can theoretically work, yes.  But there's
no upside (byteswaps are essentially free on POWER, and of trivial
cost compared to memory access basically everywhere).  The downside is
that having the same variables / structures have data in different
endianness in different situations makes it exceedingly easy to forget
which one you're dealing with right now and therefore forget some
swaps or put in extra ones.

> Except that the event data entries in the OS EQs are BE. So the only place 
> where we convert is when an event data is enqueued. 
> 
> What would you put in place if you think this is a too strong violation 
> of the architecture ? I am afraid of something too complex to manipulate
> to be honest. May be we can drop the map/unmap access methods only keep 
> the very basic ones.

THe complexity of having extra swaps is almost always less than having
the complexity of having those swaps not be in a consistent place.
Especially if you use helpers (including the swaps) to access your
structure.
Cédric Le Goater Dec. 3, 2018, 4:49 p.m. UTC | #8
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    switch (qsize) {
>>>>>>>> +    case 12:
>>>>>>>> +    case 16:
>>>>>>>> +    case 21:
>>>>>>>> +    case 24:
>>>>>>>> +        end.w3 = ((uint64_t)qpage) & 0xffffffff;
>>>>>>>
>>>>>>> It just occurred to me that I haven't been looking for this across any
>>>>>>> of these reviews.  Don't you need byteswaps when accessing these
>>>>>>> in-memory structures?
>>>>>>
>>>>>> yes this is done when some event data is enqueued in the EQ.
>>>>>
>>>>> I'm not talking about the data in the EQ itself, but the fields in the
>>>>> END (and the NVT).
>>>>
>>>> XIVE is all BE.
>>>
>>> Yes... the qemu host might not be, which is why you need byteswaps.
>>
>> ok. I understand.
>>
>>> I realized eventually you have the swaps in your pnv get/set
>>> accessors.  
>>
>> Yes. because skiboot is BE, like the XIVE structures.
> 
> skiboot's endiannness isn't really relevant, because we're modelling
> below that level.
> 
>>> I don't like that at all for a couple of reasons:
>>>
>>> 1) Although the END structure is made up of word-sized fields because
>>> that's convenient, the END really is made of a bunch of subfields of
>>> different sizes.  Knowing that it wouldn't be unreasonable for people
>>> to expect they can look into the XIVE by byte offsets; 
>>
>> These structures should be accessed with GETFIELD and SETFIELD macros
>> using the XIVE definitions in the xive_regs.h header file. I would want 
>> to keep that common with skiboot  for sure.
> 
> Right.  It might make sense to make some helper macros or inlines that
> include both the GETFIELD/SETFIELD and the byteswap.

ah. I have to evaluate the added complexity, because we don't really
have a struct. it's just an array of BE words. 

So for each field or bit we are interested in, we would have an helper 
routine picking the correct word from the XIVE structure, doing the 
byteswap and extracting the value ?  

sigh. 

C.

>> Are you suggesting we should define each field of the XIVE structures 
>> with C attributes ? That would be very unfortunate.
> 
> Oh no, bitfields are a complete mess.
>
>>> that will break
>>> if you're working with a copy that has already been byte-swapped on
>>> word-sized units.
>>
>> I am not sure I understand the last sentence.
> 
> I mean that GETFIELD/SETFIELD only work on values that are already
> native endian, but using byte offsets would only work on values that
> are still in BE.
> 
>> the code working with a copy would necessarily know that the structure 
>> has been byteswapped and use correct offsets for the expected endianess. 
>> no ? why would it break ?  
>>
>>> 2) At different points in the code you're storing both BE and
>>> native-endian data in the same struct. 
>>
>> on sPAPR, it's all native (which is a violation I agree).
> 
> Don't do that.  Having the same structure be BE in some situations and
> native endian in other situations is a sure path to madness.
> 
>> TIMA is BE.
>>
>>> That's both confusing to
>>> someone reading the code (if they see that struct they don't know if
>>> it's byteswapped already) and also means you can't use sparse
>>> annotations to make sure you have it right.
>>
>> XIVE structures are architected to be BE. That's immutable.
> 
> Yes, absolutely.  So don't represent them in C structs that are in
> native endian.  Ever, even temporarily.
> 
>> It's a not problem for skiboot which is BE. The PnvXIVE model for the 
>> QEMU PowerNV machine reads these VSTs (Virtual Structure Tables) from 
>> the guest RAM and byteswaps the structure before using it. I think
>> that's fine. Isn't it ?
> 
> Byteswapping structures - rather than individual fields as you use
> them - is almost always a bad idea.  It's insanely easy to lose track
> of whether this particular instance of the structure is swapped yet or
> not, and you can't use sparse (or whatever) to check it for you.
> 
> Stick to one endianness for a struct, and do the byteswaps when you
> access the fields (using helpers if that's, well, helpful).
> 
>> It becomes a problem with the sPAPR model which is using the XIVE structures 
>> in native endianess and not BE anymore. But the guest OS never manipulates 
>> these structures, so under the hood, I think we are free to use them in 
>> native and keep the common definitions.
> 
> Free to in the sense that it can theoretically work, yes.  But there's
> no upside (byteswaps are essentially free on POWER, and of trivial
> cost compared to memory access basically everywhere).  The downside is
> that having the same variables / structures have data in different
> endianness in different situations makes it exceedingly easy to forget
> which one you're dealing with right now and therefore forget some
> swaps or put in extra ones.
> 
>> Except that the event data entries in the OS EQs are BE. So the only place 
>> where we convert is when an event data is enqueued. 
>>
>> What would you put in place if you think this is a too strong violation 
>> of the architecture ? I am afraid of something too complex to manipulate
>> to be honest. May be we can drop the map/unmap access methods only keep 
>> the very basic ones.
> 
> THe complexity of having extra swaps is almost always less than having
> the complexity of having those swaps not be in a consistent place.
> Especially if you use helpers (including the swaps) to access your
> structure.
>
David Gibson Dec. 4, 2018, 1:56 a.m. UTC | #9
On Mon, Dec 03, 2018 at 05:49:37PM +0100, Cédric Le Goater wrote:
> >>>>>>>> +    }
> >>>>>>>> +
> >>>>>>>> +    switch (qsize) {
> >>>>>>>> +    case 12:
> >>>>>>>> +    case 16:
> >>>>>>>> +    case 21:
> >>>>>>>> +    case 24:
> >>>>>>>> +        end.w3 = ((uint64_t)qpage) & 0xffffffff;
> >>>>>>>
> >>>>>>> It just occurred to me that I haven't been looking for this across any
> >>>>>>> of these reviews.  Don't you need byteswaps when accessing these
> >>>>>>> in-memory structures?
> >>>>>>
> >>>>>> yes this is done when some event data is enqueued in the EQ.
> >>>>>
> >>>>> I'm not talking about the data in the EQ itself, but the fields in the
> >>>>> END (and the NVT).
> >>>>
> >>>> XIVE is all BE.
> >>>
> >>> Yes... the qemu host might not be, which is why you need byteswaps.
> >>
> >> ok. I understand.
> >>
> >>> I realized eventually you have the swaps in your pnv get/set
> >>> accessors.  
> >>
> >> Yes. because skiboot is BE, like the XIVE structures.
> > 
> > skiboot's endiannness isn't really relevant, because we're modelling
> > below that level.
> > 
> >>> I don't like that at all for a couple of reasons:
> >>>
> >>> 1) Although the END structure is made up of word-sized fields because
> >>> that's convenient, the END really is made of a bunch of subfields of
> >>> different sizes.  Knowing that it wouldn't be unreasonable for people
> >>> to expect they can look into the XIVE by byte offsets; 
> >>
> >> These structures should be accessed with GETFIELD and SETFIELD macros
> >> using the XIVE definitions in the xive_regs.h header file. I would want 
> >> to keep that common with skiboot  for sure.
> > 
> > Right.  It might make sense to make some helper macros or inlines that
> > include both the GETFIELD/SETFIELD and the byteswap.
> 
> ah. I have to evaluate the added complexity, because we don't really
> have a struct. it's just an array of BE words.

You're still treating as a struct which reflects an in-memory layout,
even if all the fields are words of the same size.

> So for each field or bit we are interested in, we would have an helper 
> routine picking the correct word from the XIVE structure, doing the 
> byteswap and extracting the value ?
> 
> sigh.

Oh, no, I was just thinking a version of GETFIELD that byteswaps the
value before extracting the given field.  Likewise a SETFIELD variant
that swaps, deposits the field, then swaps back.
diff mbox series

Patch

diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 1fbc2663e06c..8415faea7b82 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -452,7 +452,20 @@  struct sPAPRMachineState {
 #define H_INVALIDATE_PID        0x378
 #define H_REGISTER_PROC_TBL     0x37C
 #define H_SIGNAL_SYS_RESET      0x380
-#define MAX_HCALL_OPCODE        H_SIGNAL_SYS_RESET
+
+#define H_INT_GET_SOURCE_INFO   0x3A8
+#define H_INT_SET_SOURCE_CONFIG 0x3AC
+#define H_INT_GET_SOURCE_CONFIG 0x3B0
+#define H_INT_GET_QUEUE_INFO    0x3B4
+#define H_INT_SET_QUEUE_CONFIG  0x3B8
+#define H_INT_GET_QUEUE_CONFIG  0x3BC
+#define H_INT_SET_OS_REPORTING_LINE 0x3C0
+#define H_INT_GET_OS_REPORTING_LINE 0x3C4
+#define H_INT_ESB               0x3C8
+#define H_INT_SYNC              0x3CC
+#define H_INT_RESET             0x3D0
+
+#define MAX_HCALL_OPCODE        H_INT_RESET
 
 /* The hcalls above are standardized in PAPR and implemented by pHyp
  * as well.
diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
index 3f65b8f485fd..418511f3dc10 100644
--- a/include/hw/ppc/spapr_xive.h
+++ b/include/hw/ppc/spapr_xive.h
@@ -60,4 +60,10 @@  int spapr_xive_target_to_end(sPAPRXive *xive, uint32_t target, uint8_t prio,
 int spapr_xive_cpu_to_end(sPAPRXive *xive, PowerPCCPU *cpu, uint8_t prio,
                           uint8_t *out_end_blk, uint32_t *out_end_idx);
 
+bool spapr_xive_priority_is_valid(uint8_t priority);
+
+typedef struct sPAPRMachineState sPAPRMachineState;
+
+void spapr_xive_hcall_init(sPAPRMachineState *spapr);
+
 #endif /* PPC_SPAPR_XIVE_H */
diff --git a/hw/intc/spapr_xive_hcall.c b/hw/intc/spapr_xive_hcall.c
new file mode 100644
index 000000000000..52e4e23995f5
--- /dev/null
+++ b/hw/intc/spapr_xive_hcall.c
@@ -0,0 +1,892 @@ 
+/*
+ * QEMU PowerPC sPAPR XIVE interrupt controller model
+ *
+ * Copyright (c) 2017-2018, IBM Corporation.
+ *
+ * This code is licensed under the GPL version 2 or later. See the
+ * COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "qapi/error.h"
+#include "cpu.h"
+#include "hw/ppc/fdt.h"
+#include "hw/ppc/spapr.h"
+#include "hw/ppc/spapr_xive.h"
+#include "hw/ppc/xive_regs.h"
+#include "monitor/monitor.h"
+
+/*
+ * OPAL uses the priority 7 EQ to automatically escalate interrupts
+ * for all other queues (DD2.X POWER9). So only priorities [0..6] are
+ * available for the guest.
+ */
+bool spapr_xive_priority_is_valid(uint8_t priority)
+{
+    switch (priority) {
+    case 0 ... 6:
+        return true;
+    case 7: /* OPAL escalation queue */
+    default:
+        return false;
+    }
+}
+
+/*
+ * The H_INT_GET_SOURCE_INFO hcall() is used to obtain the logical
+ * real address of the MMIO page through which the Event State Buffer
+ * entry associated with the value of the "lisn" parameter is managed.
+ *
+ * Parameters:
+ * Input
+ * - "flags"
+ *       Bits 0-63 reserved
+ * - "lisn" is per "interrupts", "interrupt-map", or
+ *       "ibm,xive-lisn-ranges" properties, or as returned by the
+ *       ibm,query-interrupt-source-number RTAS call, or as returned
+ *       by the H_ALLOCATE_VAS_WINDOW hcall
+ *
+ * Output
+ * - R4: "flags"
+ *       Bits 0-59: Reserved
+ *       Bit 60: H_INT_ESB must be used for Event State Buffer
+ *               management
+ *       Bit 61: 1 == LSI  0 == MSI
+ *       Bit 62: the full function page supports trigger
+ *       Bit 63: Store EOI Supported
+ * - R5: Logical Real address of full function Event State Buffer
+ *       management page, -1 if ESB hcall flag is set to 1.
+ * - R6: Logical Real Address of trigger only Event State Buffer
+ *       management page or -1.
+ * - R7: Power of 2 page size for the ESB management pages returned in
+ *       R5 and R6.
+ */
+
+#define SPAPR_XIVE_SRC_H_INT_ESB     PPC_BIT(60) /* ESB manage with H_INT_ESB */
+#define SPAPR_XIVE_SRC_LSI           PPC_BIT(61) /* Virtual LSI type */
+#define SPAPR_XIVE_SRC_TRIGGER       PPC_BIT(62) /* Trigger and management
+                                                    on same page */
+#define SPAPR_XIVE_SRC_STORE_EOI     PPC_BIT(63) /* Store EOI support */
+
+static target_ulong h_int_get_source_info(PowerPCCPU *cpu,
+                                          sPAPRMachineState *spapr,
+                                          target_ulong opcode,
+                                          target_ulong *args)
+{
+    sPAPRXive *xive = spapr->xive;
+    XiveSource *xsrc = &xive->source;
+    XiveEAS eas;
+    target_ulong flags  = args[0];
+    target_ulong lisn   = args[1];
+
+    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
+        return H_FUNCTION;
+    }
+
+    if (flags) {
+        return H_PARAMETER;
+    }
+
+    if (xive_router_get_eas(XIVE_ROUTER(xive), lisn, &eas)) {
+        return H_P2;
+    }
+
+    if (!(eas.w & EAS_VALID)) {
+        return H_P2;
+    }
+
+    /* All sources are emulated under the main XIVE object and share
+     * the same characteristics.
+     */
+    args[0] = 0;
+    if (!xive_source_esb_has_2page(xsrc)) {
+        args[0] |= SPAPR_XIVE_SRC_TRIGGER;
+    }
+    if (xsrc->esb_flags & XIVE_SRC_STORE_EOI) {
+        args[0] |= SPAPR_XIVE_SRC_STORE_EOI;
+    }
+
+    /*
+     * Force the use of the H_INT_ESB hcall in case of an LSI
+     * interrupt. This is necessary under KVM to re-trigger the
+     * interrupt if the level is still asserted
+     */
+    if (xive_source_irq_is_lsi(xsrc, lisn)) {
+        args[0] |= SPAPR_XIVE_SRC_H_INT_ESB | SPAPR_XIVE_SRC_LSI;
+    }
+
+    if (!(args[0] & SPAPR_XIVE_SRC_H_INT_ESB)) {
+        args[1] = xive->vc_base + xive_source_esb_mgmt(xsrc, lisn);
+    } else {
+        args[1] = -1;
+    }
+
+    if (xive_source_esb_has_2page(xsrc)) {
+        args[2] = xive->vc_base + xive_source_esb_page(xsrc, lisn);
+    } else {
+        args[2] = -1;
+    }
+
+    args[3] = TARGET_PAGE_SIZE;
+
+    return H_SUCCESS;
+}
+
+/*
+ * The H_INT_SET_SOURCE_CONFIG hcall() is used to assign a Logical
+ * Interrupt Source to a target. The Logical Interrupt Source is
+ * designated with the "lisn" parameter and the target is designated
+ * with the "target" and "priority" parameters.  Upon return from the
+ * hcall(), no additional interrupts will be directed to the old EQ.
+ *
+ * TODO: The old EQ should be investigated for interrupts that
+ * occurred prior to or during the hcall().
+ *
+ * Parameters:
+ * Input:
+ * - "flags"
+ *      Bits 0-61: Reserved
+ *      Bit 62: set the "eisn" in the EA
+ *      Bit 63: masks the interrupt source in the hardware interrupt
+ *      control structure. An interrupt masked by this mechanism will
+ *      be dropped, but it's source state bits will still be
+ *      set. There is no race-free way of unmasking and restoring the
+ *      source. Thus this should only be used in interrupts that are
+ *      also masked at the source, and only in cases where the
+ *      interrupt is not meant to be used for a large amount of time
+ *      because no valid target exists for it for example
+ * - "lisn" is per "interrupts", "interrupt-map", or
+ *      "ibm,xive-lisn-ranges" properties, or as returned by the
+ *      ibm,query-interrupt-source-number RTAS call, or as returned by
+ *      the H_ALLOCATE_VAS_WINDOW hcall
+ * - "target" is per "ibm,ppc-interrupt-server#s" or
+ *      "ibm,ppc-interrupt-gserver#s"
+ * - "priority" is a valid priority not in
+ *      "ibm,plat-res-int-priorities"
+ * - "eisn" is the guest EISN associated with the "lisn"
+ *
+ * Output:
+ * - None
+ */
+
+#define SPAPR_XIVE_SRC_SET_EISN PPC_BIT(62)
+#define SPAPR_XIVE_SRC_MASK     PPC_BIT(63)
+
+static target_ulong h_int_set_source_config(PowerPCCPU *cpu,
+                                            sPAPRMachineState *spapr,
+                                            target_ulong opcode,
+                                            target_ulong *args)
+{
+    sPAPRXive *xive = spapr->xive;
+    XiveRouter *xrtr = XIVE_ROUTER(xive);
+    XiveEAS eas, new_eas;
+    target_ulong flags    = args[0];
+    target_ulong lisn     = args[1];
+    target_ulong target   = args[2];
+    target_ulong priority = args[3];
+    target_ulong eisn     = args[4];
+    uint8_t end_blk;
+    uint32_t end_idx;
+
+    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
+        return H_FUNCTION;
+    }
+
+    if (flags & ~(SPAPR_XIVE_SRC_SET_EISN | SPAPR_XIVE_SRC_MASK)) {
+        return H_PARAMETER;
+    }
+
+    if (xive_router_get_eas(xrtr, lisn, &eas)) {
+        return H_P2;
+    }
+
+    if (!(eas.w & EAS_VALID)) {
+        return H_P2;
+    }
+
+    /* priority 0xff is used to reset the EAS */
+    if (priority == 0xff) {
+        new_eas.w = EAS_VALID | EAS_MASKED;
+        goto out;
+    }
+
+    if (flags & SPAPR_XIVE_SRC_MASK) {
+        new_eas.w = eas.w | EAS_MASKED;
+    } else {
+        new_eas.w = eas.w & ~EAS_MASKED;
+    }
+
+    if (!spapr_xive_priority_is_valid(priority)) {
+        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid priority %ld requested\n",
+                      priority);
+        return H_P4;
+    }
+
+    /* Validate that "target" is part of the list of threads allocated
+     * to the partition. For that, find the END corresponding to the
+     * target.
+     */
+    if (spapr_xive_target_to_end(xive, target, priority, &end_blk, &end_idx)) {
+        return H_P3;
+    }
+
+    new_eas.w = SETFIELD(EAS_END_BLOCK, new_eas.w, end_blk);
+    new_eas.w = SETFIELD(EAS_END_INDEX, new_eas.w, end_idx);
+
+    if (flags & SPAPR_XIVE_SRC_SET_EISN) {
+        new_eas.w = SETFIELD(EAS_END_DATA, new_eas.w, eisn);
+    }
+
+out:
+    if (xive_router_set_eas(xrtr, lisn, &new_eas)) {
+        return H_HARDWARE;
+    }
+
+    return H_SUCCESS;
+}
+
+/*
+ * The H_INT_GET_SOURCE_CONFIG hcall() is used to determine to which
+ * target/priority pair is assigned to the specified Logical Interrupt
+ * Source.
+ *
+ * Parameters:
+ * Input:
+ * - "flags"
+ *      Bits 0-63 Reserved
+ * - "lisn" is per "interrupts", "interrupt-map", or
+ *      "ibm,xive-lisn-ranges" properties, or as returned by the
+ *      ibm,query-interrupt-source-number RTAS call, or as
+ *      returned by the H_ALLOCATE_VAS_WINDOW hcall
+ *
+ * Output:
+ * - R4: Target to which the specified Logical Interrupt Source is
+ *       assigned
+ * - R5: Priority to which the specified Logical Interrupt Source is
+ *       assigned
+ * - R6: EISN for the specified Logical Interrupt Source (this will be
+ *       equivalent to the LISN if not changed by H_INT_SET_SOURCE_CONFIG)
+ */
+static target_ulong h_int_get_source_config(PowerPCCPU *cpu,
+                                            sPAPRMachineState *spapr,
+                                            target_ulong opcode,
+                                            target_ulong *args)
+{
+    sPAPRXive *xive = spapr->xive;
+    XiveRouter *xrtr = XIVE_ROUTER(xive);
+    target_ulong flags = args[0];
+    target_ulong lisn = args[1];
+    XiveEAS eas;
+    XiveEND end;
+    uint8_t end_blk, nvt_blk;
+    uint32_t end_idx, nvt_idx;
+
+    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
+        return H_FUNCTION;
+    }
+
+    if (flags) {
+        return H_PARAMETER;
+    }
+
+    if (xive_router_get_eas(xrtr, lisn, &eas)) {
+        return H_P2;
+    }
+
+    if (!(eas.w & EAS_VALID)) {
+        return H_P2;
+    }
+
+    end_blk = GETFIELD(EAS_END_BLOCK, eas.w);
+    end_idx = GETFIELD(EAS_END_INDEX, eas.w);
+    if (xive_router_get_end(xrtr, end_blk, end_idx, &end)) {
+        /* Not sure what to return here */
+        return H_HARDWARE;
+    }
+
+    nvt_blk = GETFIELD(END_W6_NVT_BLOCK, end.w6);
+    nvt_idx = GETFIELD(END_W6_NVT_INDEX, end.w6);
+    args[0] = spapr_xive_nvt_to_target(xive, nvt_blk, nvt_idx);
+
+    if (eas.w & EAS_MASKED) {
+        args[1] = 0xff;
+    } else {
+        args[1] = GETFIELD(END_W7_F0_PRIORITY, end.w7);
+    }
+
+    args[2] = GETFIELD(EAS_END_DATA, eas.w);
+
+    return H_SUCCESS;
+}
+
+/*
+ * The H_INT_GET_QUEUE_INFO hcall() is used to get the logical real
+ * address of the notification management page associated with the
+ * specified target and priority.
+ *
+ * Parameters:
+ * Input:
+ * - "flags"
+ *       Bits 0-63 Reserved
+ * - "target" is per "ibm,ppc-interrupt-server#s" or
+ *       "ibm,ppc-interrupt-gserver#s"
+ * - "priority" is a valid priority not in
+ *       "ibm,plat-res-int-priorities"
+ *
+ * Output:
+ * - R4: Logical real address of notification page
+ * - R5: Power of 2 page size of the notification page
+ */
+static target_ulong h_int_get_queue_info(PowerPCCPU *cpu,
+                                         sPAPRMachineState *spapr,
+                                         target_ulong opcode,
+                                         target_ulong *args)
+{
+    sPAPRXive *xive = spapr->xive;
+    XiveENDSource *end_xsrc = &xive->end_source;
+    target_ulong flags = args[0];
+    target_ulong target = args[1];
+    target_ulong priority = args[2];
+    XiveEND end;
+    uint8_t end_blk;
+    uint32_t end_idx;
+
+    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
+        return H_FUNCTION;
+    }
+
+    if (flags) {
+        return H_PARAMETER;
+    }
+
+    /*
+     * H_STATE should be returned if a H_INT_RESET is in progress.
+     * This is not needed when running the emulation under QEMU
+     */
+
+    if (!spapr_xive_priority_is_valid(priority)) {
+        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid priority %ld requested\n",
+                      priority);
+        return H_P3;
+    }
+
+    /* Validate that "target" is part of the list of threads allocated
+     * to the partition. For that, find the END corresponding to the
+     * target.
+     */
+    if (spapr_xive_target_to_end(xive, target, priority, &end_blk, &end_idx)) {
+        return H_P2;
+    }
+
+    if (xive_router_get_end(XIVE_ROUTER(xive), end_blk, end_idx, &end)) {
+        return H_HARDWARE;
+    }
+
+    args[0] = xive->end_base + (1ull << (end_xsrc->esb_shift + 1)) * end_idx;
+    if (end.w0 & END_W0_ENQUEUE) {
+        args[1] = GETFIELD(END_W0_QSIZE, end.w0) + 12;
+    } else {
+        args[1] = 0;
+    }
+
+    return H_SUCCESS;
+}
+
+/*
+ * The H_INT_SET_QUEUE_CONFIG hcall() is used to set or reset a EQ for
+ * a given "target" and "priority".  It is also used to set the
+ * notification config associated with the EQ.  An EQ size of 0 is
+ * used to reset the EQ config for a given target and priority. If
+ * resetting the EQ config, the END associated with the given "target"
+ * and "priority" will be changed to disable queueing.
+ *
+ * Upon return from the hcall(), no additional interrupts will be
+ * directed to the old EQ (if one was set). The old EQ (if one was
+ * set) should be investigated for interrupts that occurred prior to
+ * or during the hcall().
+ *
+ * Parameters:
+ * Input:
+ * - "flags"
+ *      Bits 0-62: Reserved
+ *      Bit 63: Unconditional Notify (n) per the XIVE spec
+ * - "target" is per "ibm,ppc-interrupt-server#s" or
+ *       "ibm,ppc-interrupt-gserver#s"
+ * - "priority" is a valid priority not in
+ *       "ibm,plat-res-int-priorities"
+ * - "eventQueue": The logical real address of the start of the EQ
+ * - "eventQueueSize": The power of 2 EQ size per "ibm,xive-eq-sizes"
+ *
+ * Output:
+ * - None
+ */
+
+#define SPAPR_XIVE_END_ALWAYS_NOTIFY PPC_BIT(63)
+
+static target_ulong h_int_set_queue_config(PowerPCCPU *cpu,
+                                           sPAPRMachineState *spapr,
+                                           target_ulong opcode,
+                                           target_ulong *args)
+{
+    sPAPRXive *xive = spapr->xive;
+    XiveRouter *xrtr = XIVE_ROUTER(xive);
+    target_ulong flags = args[0];
+    target_ulong target = args[1];
+    target_ulong priority = args[2];
+    target_ulong qpage = args[3];
+    target_ulong qsize = args[4];
+    XiveEND end;
+    uint8_t end_blk, nvt_blk;
+    uint32_t end_idx, nvt_idx;
+    uint32_t qdata;
+
+    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
+        return H_FUNCTION;
+    }
+
+    if (flags & ~SPAPR_XIVE_END_ALWAYS_NOTIFY) {
+        return H_PARAMETER;
+    }
+
+    /*
+     * H_STATE should be returned if a H_INT_RESET is in progress.
+     * This is not needed when running the emulation under QEMU
+     */
+
+    if (!spapr_xive_priority_is_valid(priority)) {
+        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid priority %ld requested\n",
+                      priority);
+        return H_P3;
+    }
+
+    /* Validate that "target" is part of the list of threads allocated
+     * to the partition. For that, find the END corresponding to the
+     * target.
+     */
+
+    if (spapr_xive_target_to_end(xive, target, priority, &end_blk, &end_idx)) {
+        return H_P2;
+    }
+
+    if (xive_router_get_end(xrtr, end_blk, end_idx, &end)) {
+        return H_HARDWARE;
+    }
+
+    switch (qsize) {
+    case 12:
+    case 16:
+    case 21:
+    case 24:
+        end.w3 = ((uint64_t)qpage) & 0xffffffff;
+        end.w2 = (((uint64_t)qpage)) >> 32 & 0x0fffffff;
+        end.w0 |= END_W0_ENQUEUE;
+        end.w0 = SETFIELD(END_W0_QSIZE, end.w0, qsize - 12);
+        break;
+    case 0:
+        /* reset queue and disable queueing */
+        xive_end_reset(&end);
+        goto out;
+
+    default:
+        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid EQ size %"PRIx64"\n",
+                      qsize);
+        return H_P5;
+    }
+
+    if (qsize) {
+        /*
+         * Let's validate the EQ address with a read of the first EQ
+         * entry. We could also check that the full queue has been
+         * zeroed by the OS.
+         */
+        if (address_space_read(&address_space_memory, qpage,
+                               MEMTXATTRS_UNSPECIFIED,
+                               (uint8_t *) &qdata, sizeof(qdata))) {
+            qemu_log_mask(LOG_GUEST_ERROR, "XIVE: failed to read EQ data @0x%"
+                          HWADDR_PRIx "\n", qpage);
+            return H_P4;
+        }
+    }
+
+    if (spapr_xive_target_to_nvt(xive, target, &nvt_blk, &nvt_idx)) {
+        return H_HARDWARE;
+    }
+
+    /* Ensure the priority and target are correctly set (they will not
+     * be right after allocation)
+     */
+    end.w6 = SETFIELD(END_W6_NVT_BLOCK, 0ul, nvt_blk) |
+        SETFIELD(END_W6_NVT_INDEX, 0ul, nvt_idx);
+    end.w7 = SETFIELD(END_W7_F0_PRIORITY, 0ul, priority);
+
+    if (flags & SPAPR_XIVE_END_ALWAYS_NOTIFY) {
+        end.w0 |= END_W0_UCOND_NOTIFY;
+    } else {
+        end.w0 &= ~END_W0_UCOND_NOTIFY;
+    }
+
+    /* The generation bit for the END starts at 1 and The END page
+     * offset counter starts at 0.
+     */
+    end.w1 = END_W1_GENERATION | SETFIELD(END_W1_PAGE_OFF, 0ul, 0ul);
+    end.w0 |= END_W0_VALID;
+
+    /* TODO: issue syncs required to ensure all in-flight interrupts
+     * are complete on the old END */
+out:
+    /* Update END */
+    if (xive_router_set_end(xrtr, end_blk, end_idx, &end)) {
+        return H_HARDWARE;
+    }
+
+    return H_SUCCESS;
+}
+
+/*
+ * The H_INT_GET_QUEUE_CONFIG hcall() is used to get a EQ for a given
+ * target and priority.
+ *
+ * Parameters:
+ * Input:
+ * - "flags"
+ *      Bits 0-62: Reserved
+ *      Bit 63: Debug: Return debug data
+ * - "target" is per "ibm,ppc-interrupt-server#s" or
+ *       "ibm,ppc-interrupt-gserver#s"
+ * - "priority" is a valid priority not in
+ *       "ibm,plat-res-int-priorities"
+ *
+ * Output:
+ * - R4: "flags":
+ *       Bits 0-61: Reserved
+ *       Bit 62: The value of Event Queue Generation Number (g) per
+ *              the XIVE spec if "Debug" = 1
+ *       Bit 63: The value of Unconditional Notify (n) per the XIVE spec
+ * - R5: The logical real address of the start of the EQ
+ * - R6: The power of 2 EQ size per "ibm,xive-eq-sizes"
+ * - R7: The value of Event Queue Offset Counter per XIVE spec
+ *       if "Debug" = 1, else 0
+ *
+ */
+
+#define SPAPR_XIVE_END_DEBUG     PPC_BIT(63)
+
+static target_ulong h_int_get_queue_config(PowerPCCPU *cpu,
+                                           sPAPRMachineState *spapr,
+                                           target_ulong opcode,
+                                           target_ulong *args)
+{
+    sPAPRXive *xive = spapr->xive;
+    target_ulong flags = args[0];
+    target_ulong target = args[1];
+    target_ulong priority = args[2];
+    XiveEND end;
+    uint8_t end_blk;
+    uint32_t end_idx;
+
+    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
+        return H_FUNCTION;
+    }
+
+    if (flags & ~SPAPR_XIVE_END_DEBUG) {
+        return H_PARAMETER;
+    }
+
+    /*
+     * H_STATE should be returned if a H_INT_RESET is in progress.
+     * This is not needed when running the emulation under QEMU
+     */
+
+    if (!spapr_xive_priority_is_valid(priority)) {
+        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid priority %ld requested\n",
+                      priority);
+        return H_P3;
+    }
+
+    /* Validate that "target" is part of the list of threads allocated
+     * to the partition. For that, find the END corresponding to the
+     * target.
+     */
+    if (spapr_xive_target_to_end(xive, target, priority, &end_blk, &end_idx)) {
+        return H_P2;
+    }
+
+    if (xive_router_get_end(XIVE_ROUTER(xive), end_blk, end_idx, &end)) {
+        return H_HARDWARE;
+    }
+
+    args[0] = 0;
+    if (end.w0 & END_W0_UCOND_NOTIFY) {
+        args[0] |= SPAPR_XIVE_END_ALWAYS_NOTIFY;
+    }
+
+    if (end.w0 & END_W0_ENQUEUE) {
+        args[1] =
+            (((uint64_t)(end.w2 & 0x0fffffff)) << 32) | end.w3;
+        args[2] = GETFIELD(END_W0_QSIZE, end.w0) + 12;
+    } else {
+        args[1] = 0;
+        args[2] = 0;
+    }
+
+    /* TODO: do we need any locking on the END ? */
+    if (flags & SPAPR_XIVE_END_DEBUG) {
+        /* Load the event queue generation number into the return flags */
+        args[0] |= (uint64_t)GETFIELD(END_W1_GENERATION, end.w1) << 62;
+
+        /* Load R7 with the event queue offset counter */
+        args[3] = GETFIELD(END_W1_PAGE_OFF, end.w1);
+    } else {
+        args[3] = 0;
+    }
+
+    return H_SUCCESS;
+}
+
+/*
+ * The H_INT_SET_OS_REPORTING_LINE hcall() is used to set the
+ * reporting cache line pair for the calling thread.  The reporting
+ * cache lines will contain the OS interrupt context when the OS
+ * issues a CI store byte to @TIMA+0xC10 to acknowledge the OS
+ * interrupt. The reporting cache lines can be reset by inputting -1
+ * in "reportingLine".  Issuing the CI store byte without reporting
+ * cache lines registered will result in the data not being accessible
+ * to the OS.
+ *
+ * Parameters:
+ * Input:
+ * - "flags"
+ *      Bits 0-63: Reserved
+ * - "reportingLine": The logical real address of the reporting cache
+ *    line pair
+ *
+ * Output:
+ * - None
+ */
+static target_ulong h_int_set_os_reporting_line(PowerPCCPU *cpu,
+                                                sPAPRMachineState *spapr,
+                                                target_ulong opcode,
+                                                target_ulong *args)
+{
+    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
+        return H_FUNCTION;
+    }
+
+    /*
+     * H_STATE should be returned if a H_INT_RESET is in progress.
+     * This is not needed when running the emulation under QEMU
+     */
+
+    /* TODO: H_INT_SET_OS_REPORTING_LINE */
+    return H_FUNCTION;
+}
+
+/*
+ * The H_INT_GET_OS_REPORTING_LINE hcall() is used to get the logical
+ * real address of the reporting cache line pair set for the input
+ * "target".  If no reporting cache line pair has been set, -1 is
+ * returned.
+ *
+ * Parameters:
+ * Input:
+ * - "flags"
+ *      Bits 0-63: Reserved
+ * - "target" is per "ibm,ppc-interrupt-server#s" or
+ *       "ibm,ppc-interrupt-gserver#s"
+ * - "reportingLine": The logical real address of the reporting cache
+ *   line pair
+ *
+ * Output:
+ * - R4: The logical real address of the reporting line if set, else -1
+ */
+static target_ulong h_int_get_os_reporting_line(PowerPCCPU *cpu,
+                                                sPAPRMachineState *spapr,
+                                                target_ulong opcode,
+                                                target_ulong *args)
+{
+    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
+        return H_FUNCTION;
+    }
+
+    /*
+     * H_STATE should be returned if a H_INT_RESET is in progress.
+     * This is not needed when running the emulation under QEMU
+     */
+
+    /* TODO: H_INT_GET_OS_REPORTING_LINE */
+    return H_FUNCTION;
+}
+
+/*
+ * The H_INT_ESB hcall() is used to issue a load or store to the ESB
+ * page for the input "lisn".  This hcall is only supported for LISNs
+ * that have the ESB hcall flag set to 1 when returned from hcall()
+ * H_INT_GET_SOURCE_INFO.
+ *
+ * Parameters:
+ * Input:
+ * - "flags"
+ *      Bits 0-62: Reserved
+ *      bit 63: Store: Store=1, store operation, else load operation
+ * - "lisn" is per "interrupts", "interrupt-map", or
+ *      "ibm,xive-lisn-ranges" properties, or as returned by the
+ *      ibm,query-interrupt-source-number RTAS call, or as
+ *      returned by the H_ALLOCATE_VAS_WINDOW hcall
+ * - "esbOffset" is the offset into the ESB page for the load or store operation
+ * - "storeData" is the data to write for a store operation
+ *
+ * Output:
+ * - R4: R4: The value of the load if load operation, else -1
+ */
+
+#define SPAPR_XIVE_ESB_STORE PPC_BIT(63)
+
+static target_ulong h_int_esb(PowerPCCPU *cpu,
+                              sPAPRMachineState *spapr,
+                              target_ulong opcode,
+                              target_ulong *args)
+{
+    sPAPRXive *xive = spapr->xive;
+    XiveEAS eas;
+    target_ulong flags  = args[0];
+    target_ulong lisn   = args[1];
+    target_ulong offset = args[2];
+    target_ulong data   = args[3];
+    hwaddr mmio_addr;
+    XiveSource *xsrc = &xive->source;
+
+    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
+        return H_FUNCTION;
+    }
+
+    if (flags & ~SPAPR_XIVE_ESB_STORE) {
+        return H_PARAMETER;
+    }
+
+    if (xive_router_get_eas(XIVE_ROUTER(xive), lisn, &eas)) {
+        return H_P2;
+    }
+
+    if (!(eas.w & EAS_VALID)) {
+        return H_P2;
+    }
+
+    if (offset > (1ull << xsrc->esb_shift)) {
+        return H_P3;
+    }
+
+    mmio_addr = xive->vc_base + xive_source_esb_mgmt(xsrc, lisn) + offset;
+
+    if (dma_memory_rw(&address_space_memory, mmio_addr, &data, 8,
+                      (flags & SPAPR_XIVE_ESB_STORE))) {
+        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: failed to access ESB @0x%"
+                      HWADDR_PRIx "\n", mmio_addr);
+        return H_HARDWARE;
+    }
+    args[0] = (flags & SPAPR_XIVE_ESB_STORE) ? -1 : data;
+    return H_SUCCESS;
+}
+
+/*
+ * The H_INT_SYNC hcall() is used to issue hardware syncs that will
+ * ensure any in flight events for the input lisn are in the event
+ * queue.
+ *
+ * Parameters:
+ * Input:
+ * - "flags"
+ *      Bits 0-63: Reserved
+ * - "lisn" is per "interrupts", "interrupt-map", or
+ *      "ibm,xive-lisn-ranges" properties, or as returned by the
+ *      ibm,query-interrupt-source-number RTAS call, or as
+ *      returned by the H_ALLOCATE_VAS_WINDOW hcall
+ *
+ * Output:
+ * - None
+ */
+static target_ulong h_int_sync(PowerPCCPU *cpu,
+                               sPAPRMachineState *spapr,
+                               target_ulong opcode,
+                               target_ulong *args)
+{
+    sPAPRXive *xive = spapr->xive;
+    XiveEAS eas;
+    target_ulong flags = args[0];
+    target_ulong lisn = args[1];
+
+    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
+        return H_FUNCTION;
+    }
+
+    if (flags) {
+        return H_PARAMETER;
+    }
+
+    if (xive_router_get_eas(XIVE_ROUTER(xive), lisn, &eas)) {
+        return H_P2;
+    }
+
+    if (!(eas.w & EAS_VALID)) {
+        return H_P2;
+    }
+
+    /*
+     * H_STATE should be returned if a H_INT_RESET is in progress.
+     * This is not needed when running the emulation under QEMU
+     */
+
+    /* This is not real hardware. Nothing to be done */
+    return H_SUCCESS;
+}
+
+/*
+ * The H_INT_RESET hcall() is used to reset all of the partition's
+ * interrupt exploitation structures to their initial state.  This
+ * means losing all previously set interrupt state set via
+ * H_INT_SET_SOURCE_CONFIG and H_INT_SET_QUEUE_CONFIG.
+ *
+ * Parameters:
+ * Input:
+ * - "flags"
+ *      Bits 0-63: Reserved
+ *
+ * Output:
+ * - None
+ */
+static target_ulong h_int_reset(PowerPCCPU *cpu,
+                                sPAPRMachineState *spapr,
+                                target_ulong opcode,
+                                target_ulong *args)
+{
+    sPAPRXive *xive = spapr->xive;
+    target_ulong flags   = args[0];
+
+    if (!spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
+        return H_FUNCTION;
+    }
+
+    if (flags) {
+        return H_PARAMETER;
+    }
+
+    device_reset(DEVICE(xive));
+    return H_SUCCESS;
+}
+
+void spapr_xive_hcall_init(sPAPRMachineState *spapr)
+{
+    spapr_register_hypercall(H_INT_GET_SOURCE_INFO, h_int_get_source_info);
+    spapr_register_hypercall(H_INT_SET_SOURCE_CONFIG, h_int_set_source_config);
+    spapr_register_hypercall(H_INT_GET_SOURCE_CONFIG, h_int_get_source_config);
+    spapr_register_hypercall(H_INT_GET_QUEUE_INFO, h_int_get_queue_info);
+    spapr_register_hypercall(H_INT_SET_QUEUE_CONFIG, h_int_set_queue_config);
+    spapr_register_hypercall(H_INT_GET_QUEUE_CONFIG, h_int_get_queue_config);
+    spapr_register_hypercall(H_INT_SET_OS_REPORTING_LINE,
+                             h_int_set_os_reporting_line);
+    spapr_register_hypercall(H_INT_GET_OS_REPORTING_LINE,
+                             h_int_get_os_reporting_line);
+    spapr_register_hypercall(H_INT_ESB, h_int_esb);
+    spapr_register_hypercall(H_INT_SYNC, h_int_sync);
+    spapr_register_hypercall(H_INT_RESET, h_int_reset);
+}
diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
index 2569ae1bc7f8..da6fcfaa3c52 100644
--- a/hw/ppc/spapr_irq.c
+++ b/hw/ppc/spapr_irq.c
@@ -258,6 +258,8 @@  static void spapr_irq_init_xive(sPAPRMachineState *spapr, int nr_irqs,
         error_propagate(errp, local_err);
         return;
     }
+
+    spapr_xive_hcall_init(spapr);
 }
 
 static int spapr_irq_claim_xive(sPAPRMachineState *spapr, int irq, bool lsi,
diff --git a/hw/intc/Makefile.objs b/hw/intc/Makefile.objs
index 301a8e972d91..eacd26836ebf 100644
--- a/hw/intc/Makefile.objs
+++ b/hw/intc/Makefile.objs
@@ -38,7 +38,7 @@  obj-$(CONFIG_XICS) += xics.o
 obj-$(CONFIG_XICS_SPAPR) += xics_spapr.o
 obj-$(CONFIG_XICS_KVM) += xics_kvm.o
 obj-$(CONFIG_XIVE) += xive.o
-obj-$(CONFIG_XIVE_SPAPR) += spapr_xive.o
+obj-$(CONFIG_XIVE_SPAPR) += spapr_xive.o spapr_xive_hcall.o
 obj-$(CONFIG_POWERNV) += xics_pnv.o
 obj-$(CONFIG_ALLWINNER_A10_PIC) += allwinner-a10-pic.o
 obj-$(CONFIG_S390_FLIC) += s390_flic.o