diff mbox series

ppc/spapr: Advertise StoreEOI for POWER10 compat guests

Message ID 20220214141157.3800212-1-clg@kaod.org
State New
Headers show
Series ppc/spapr: Advertise StoreEOI for POWER10 compat guests | expand

Commit Message

Cédric Le Goater Feb. 14, 2022, 2:11 p.m. UTC
When an interrupt has been handled, the OS notifies the interrupt
controller with a EOI sequence. On a POWER9 and POWER10 systems using
the XIVE interrupt controller, this can be done with a load or a store
operation on the ESB interrupt management page of the interrupt. The
StoreEOI operation has less latency and improves interrupt handling
performance but it was deactivated during the POWER9 DD2.0 timeframe
because of ordering issues. POWER9 systems use the LoadEOI instead.
POWER10 compat guests should have fixed the issue with
Load-after-Store ordering and StoreEOI can be activated for them
again.

To maintain performance, this ordering is only enforced for the
XIVE_ESB_SET_PQ_10 load operation. This operation can be used to
disable temporarily an interrupt source. If StoreEOI is active, a
source could be left enabled if the load and store operations come
out of order.

Add a check in our XIVE emulation model for Load-after-Store when
StoreEOI is active. It should catch unreliable sequences. Other load
operations should be fine without it.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 include/hw/ppc/spapr_xive.h |  1 +
 include/hw/ppc/xive.h       |  8 ++++++++
 hw/intc/spapr_xive.c        | 15 +++++++++++++++
 hw/intc/spapr_xive_kvm.c    | 15 +++++++++++++++
 hw/intc/xive.c              |  6 ++++++
 hw/ppc/spapr_hcall.c        |  7 +++++++
 6 files changed, 52 insertions(+)

Comments

Daniel Henrique Barboza Feb. 16, 2022, 10:17 p.m. UTC | #1
On 2/14/22 11:11, Cédric Le Goater wrote:
> When an interrupt has been handled, the OS notifies the interrupt
> controller with a EOI sequence. On a POWER9 and POWER10 systems using

nit:  s/a EOI sequence/an EOI sequence


> the XIVE interrupt controller, this can be done with a load or a store
> operation on the ESB interrupt management page of the interrupt. The
> StoreEOI operation has less latency and improves interrupt handling
> performance but it was deactivated during the POWER9 DD2.0 timeframe
> because of ordering issues. POWER9 systems use the LoadEOI instead.
> POWER10 compat guests should have fixed the issue with
> Load-after-Store ordering and StoreEOI can be activated for them
> again.
> 
> To maintain performance, this ordering is only enforced for the
> XIVE_ESB_SET_PQ_10 load operation. This operation can be used to
> disable temporarily an interrupt source. If StoreEOI is active, a
> source could be left enabled if the load and store operations come
> out of order.
> 
> Add a check in our XIVE emulation model for Load-after-Store when
> StoreEOI is active. It should catch unreliable sequences. Other load
> operations should be fine without it.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>   include/hw/ppc/spapr_xive.h |  1 +
>   include/hw/ppc/xive.h       |  8 ++++++++
>   hw/intc/spapr_xive.c        | 15 +++++++++++++++
>   hw/intc/spapr_xive_kvm.c    | 15 +++++++++++++++
>   hw/intc/xive.c              |  6 ++++++
>   hw/ppc/spapr_hcall.c        |  7 +++++++
>   6 files changed, 52 insertions(+)
> 
> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
> index b282960ad90d..9c247d8bf57d 100644
> --- a/include/hw/ppc/spapr_xive.h
> +++ b/include/hw/ppc/spapr_xive.h
> @@ -73,6 +73,7 @@ void spapr_xive_map_mmio(SpaprXive *xive);
>   
>   int spapr_xive_end_to_target(uint8_t end_blk, uint32_t end_idx,
>                                uint32_t *out_server, uint8_t *out_prio);
> +void spapr_xive_enable_store_eoi(SpaprXive *xive, bool enable);
>   
>   /*
>    * KVM XIVE device helpers
> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
> index 126e4e2c3a17..133f308c2792 100644
> --- a/include/hw/ppc/xive.h
> +++ b/include/hw/ppc/xive.h
> @@ -285,6 +285,14 @@ uint8_t xive_esb_set(uint8_t *pq, uint8_t value);
>   #define XIVE_ESB_SET_PQ_10      0xe00 /* Load */
>   #define XIVE_ESB_SET_PQ_11      0xf00 /* Load */
>   
> +/*
> + * Load-after-store ordering
> + *
> + * Adding this offset to the load address will enforce
> + * load-after-store ordering. This is required to use with StoreEOI.
> + */
> +#define XIVE_ESB_LD_ST_MO       0x40 /* Load-after-store ordering */
> +
>   uint8_t xive_source_esb_get(XiveSource *xsrc, uint32_t srcno);
>   uint8_t xive_source_esb_set(XiveSource *xsrc, uint32_t srcno, uint8_t pq);
>   
> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> index dc641cc604bf..0b8a246ad594 100644
> --- a/hw/intc/spapr_xive.c
> +++ b/hw/intc/spapr_xive.c
> @@ -25,6 +25,7 @@
>   #include "hw/ppc/xive_regs.h"
>   #include "hw/qdev-properties.h"
>   #include "trace.h"
> +#include "cpu-models.h"
>   
>   /*
>    * XIVE Virtualization Controller BAR and Thread Managment BAR that we
> @@ -1854,3 +1855,17 @@ void spapr_xive_hcall_init(SpaprMachineState *spapr)
>       spapr_register_hypercall(H_INT_SYNC, h_int_sync);
>       spapr_register_hypercall(H_INT_RESET, h_int_reset);
>   }
> +
> +/*
> + * Advertise StoreEOI for a P10 compat guest. OS is required to
> + * enforce load-after-store ordering.
> + */
> +void spapr_xive_enable_store_eoi(SpaprXive *xive, bool enable)
> +{
> +    if (enable) {
> +        xive->source.esb_flags |= XIVE_SRC_STORE_EOI;
> +    } else {
> +        xive->source.esb_flags &= ~XIVE_SRC_STORE_EOI;
> +    }
> +
> +}
> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> index 61fe7bd2d322..bd023407bd7f 100644
> --- a/hw/intc/spapr_xive_kvm.c
> +++ b/hw/intc/spapr_xive_kvm.c
> @@ -296,6 +296,21 @@ static uint64_t xive_esb_rw(XiveSource *xsrc, int srcno, uint32_t offset,
>   
>   static uint8_t xive_esb_read(XiveSource *xsrc, int srcno, uint32_t offset)
>   {
> +    /*
> +     * The XIVE_ESB_SET_PQ_10 load operation is used to disable
> +     * temporarily an interrupt source. If StoreEOI is active, a
> +     * source could be left enabled if the load and store operations
> +     * come out of order.
> +     *
> +     * As we don't know the characteristics of the host source
> +     * interrupts (StoreEOI or not), enforce the load-after-store
> +     * ordering always. The performance penalty will be very small for
> +     * QEMU.
> +     */
> +    if (offset == XIVE_ESB_SET_PQ_10) {
> +        offset |= XIVE_ESB_LD_ST_MO;
> +    }
> +
>       return xive_esb_rw(xsrc, srcno, offset, 0, 0) & 0x3;
>   }
>   
> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> index b8e4c7294d59..d62881873b1b 100644
> --- a/hw/intc/xive.c
> +++ b/hw/intc/xive.c
> @@ -1024,6 +1024,12 @@ static uint64_t xive_source_esb_read(void *opaque, hwaddr addr, unsigned size)
>       case XIVE_ESB_SET_PQ_01 ... XIVE_ESB_SET_PQ_01 + 0x0FF:
>       case XIVE_ESB_SET_PQ_10 ... XIVE_ESB_SET_PQ_10 + 0x0FF:
>       case XIVE_ESB_SET_PQ_11 ... XIVE_ESB_SET_PQ_11 + 0x0FF:
> +        if (offset == XIVE_ESB_SET_PQ_10 &&
> +            xsrc->esb_flags & XIVE_SRC_STORE_EOI) {
> +            qemu_log_mask(LOG_GUEST_ERROR, "XIVE: load-after-store ordering "
> +                          "not enforced with Store EOI active for IRQ %d\n",
> +                          srcno);
> +        }

I'm afraid I have a lack of understanding of what is happening here but I'll try. Up
there, in xive_esb_read(), you forced the load-after-store ordering for the Load
operation that can left the source enabled if we have an out of order store-after-load
situation. Given that it's a function in spapr_xive_kvm.c I understand that this is
a wrapper for the kvmppc() calls into the host kernel XIVE irqchip, so that piece
of code is forcing the load-after-store at all times for that Load operation since
the performance penalty is not relevant.

Assuming that what I said is mostly correct, I don't understand why we can't do a
similar thing in the hw/intc/xive.c case, where I suppose it's the XIVE emulation
controller implementation that we're going to use if we don't have an  irqchip to
use with KVM. Can't we force the load-after-store ordering in the emulated
XIVE_ESB_SET_PQ_10 instead of throwing an error?


Everything else in the patch looks fine to me.


Thanks,


Daniel

>           ret = xive_source_esb_set(xsrc, srcno, (offset >> 8) & 0x3);
>           break;
>       default:
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index 8ffb13ada08e..6b888c963ac4 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -1210,11 +1210,18 @@ target_ulong do_client_architecture_support(PowerPCCPU *cpu,
>        * otherwise terminate the boot.
>        */
>       if (guest_xive) {
> +        bool enable;
> +
>           if (!spapr->irq->xive) {
>               error_report(
>   "Guest requested unavailable interrupt mode (XIVE), try the ic-mode=xive or ic-mode=dual machine property");
>               exit(EXIT_FAILURE);
>           }
> +
> +        /* Advertise StoreEOI for a P10 compat guest. */
> +        enable = ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_10, 0,
> +                                  cpu->compat_pvr);
> +        spapr_xive_enable_store_eoi(spapr->xive, enable);
>       } else {
>           if (!spapr->irq->xics) {
>               error_report(
Cédric Le Goater Feb. 17, 2022, 7:51 a.m. UTC | #2
[ adding a few people for the comments ]

On 2/16/22 23:17, Daniel Henrique Barboza wrote:
> 
> 
> On 2/14/22 11:11, Cédric Le Goater wrote:
>> When an interrupt has been handled, the OS notifies the interrupt
>> controller with a EOI sequence. On a POWER9 and POWER10 systems using
> 
> nit:  s/a EOI sequence/an EOI sequence
> 
> 
>> the XIVE interrupt controller, this can be done with a load or a store
>> operation on the ESB interrupt management page of the interrupt. The
>> StoreEOI operation has less latency and improves interrupt handling
>> performance but it was deactivated during the POWER9 DD2.0 timeframe
>> because of ordering issues. POWER9 systems use the LoadEOI instead.
>> POWER10 compat guests should have fixed the issue with
>> Load-after-Store ordering and StoreEOI can be activated for them
>> again.
>>
>> To maintain performance, this ordering is only enforced for the
>> XIVE_ESB_SET_PQ_10 load operation. This operation can be used to
>> disable temporarily an interrupt source. If StoreEOI is active, a
>> source could be left enabled if the load and store operations come
>> out of order.
>>
>> Add a check in our XIVE emulation model for Load-after-Store when
>> StoreEOI is active. It should catch unreliable sequences. Other load
>> operations should be fine without it.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>   include/hw/ppc/spapr_xive.h |  1 +
>>   include/hw/ppc/xive.h       |  8 ++++++++
>>   hw/intc/spapr_xive.c        | 15 +++++++++++++++
>>   hw/intc/spapr_xive_kvm.c    | 15 +++++++++++++++
>>   hw/intc/xive.c              |  6 ++++++
>>   hw/ppc/spapr_hcall.c        |  7 +++++++
>>   6 files changed, 52 insertions(+)
>>
>> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
>> index b282960ad90d..9c247d8bf57d 100644
>> --- a/include/hw/ppc/spapr_xive.h
>> +++ b/include/hw/ppc/spapr_xive.h
>> @@ -73,6 +73,7 @@ void spapr_xive_map_mmio(SpaprXive *xive);
>>   int spapr_xive_end_to_target(uint8_t end_blk, uint32_t end_idx,
>>                                uint32_t *out_server, uint8_t *out_prio);
>> +void spapr_xive_enable_store_eoi(SpaprXive *xive, bool enable);
>>   /*
>>    * KVM XIVE device helpers
>> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
>> index 126e4e2c3a17..133f308c2792 100644
>> --- a/include/hw/ppc/xive.h
>> +++ b/include/hw/ppc/xive.h
>> @@ -285,6 +285,14 @@ uint8_t xive_esb_set(uint8_t *pq, uint8_t value);
>>   #define XIVE_ESB_SET_PQ_10      0xe00 /* Load */
>>   #define XIVE_ESB_SET_PQ_11      0xf00 /* Load */
>> +/*
>> + * Load-after-store ordering
>> + *
>> + * Adding this offset to the load address will enforce
>> + * load-after-store ordering. This is required to use with StoreEOI.
>> + */
>> +#define XIVE_ESB_LD_ST_MO       0x40 /* Load-after-store ordering */
>> +
>>   uint8_t xive_source_esb_get(XiveSource *xsrc, uint32_t srcno);
>>   uint8_t xive_source_esb_set(XiveSource *xsrc, uint32_t srcno, uint8_t pq);
>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
>> index dc641cc604bf..0b8a246ad594 100644
>> --- a/hw/intc/spapr_xive.c
>> +++ b/hw/intc/spapr_xive.c
>> @@ -25,6 +25,7 @@
>>   #include "hw/ppc/xive_regs.h"
>>   #include "hw/qdev-properties.h"
>>   #include "trace.h"
>> +#include "cpu-models.h"
>>   /*
>>    * XIVE Virtualization Controller BAR and Thread Managment BAR that we
>> @@ -1854,3 +1855,17 @@ void spapr_xive_hcall_init(SpaprMachineState *spapr)
>>       spapr_register_hypercall(H_INT_SYNC, h_int_sync);
>>       spapr_register_hypercall(H_INT_RESET, h_int_reset);
>>   }
>> +
>> +/*
>> + * Advertise StoreEOI for a P10 compat guest. OS is required to
>> + * enforce load-after-store ordering.
>> + */
>> +void spapr_xive_enable_store_eoi(SpaprXive *xive, bool enable)
>> +{
>> +    if (enable) {
>> +        xive->source.esb_flags |= XIVE_SRC_STORE_EOI;
>> +    } else {
>> +        xive->source.esb_flags &= ~XIVE_SRC_STORE_EOI;
>> +    }
>> +
>> +}
>> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
>> index 61fe7bd2d322..bd023407bd7f 100644
>> --- a/hw/intc/spapr_xive_kvm.c
>> +++ b/hw/intc/spapr_xive_kvm.c
>> @@ -296,6 +296,21 @@ static uint64_t xive_esb_rw(XiveSource *xsrc, int srcno, uint32_t offset,
>>   static uint8_t xive_esb_read(XiveSource *xsrc, int srcno, uint32_t offset)
>>   {
>> +    /*
>> +     * The XIVE_ESB_SET_PQ_10 load operation is used to disable
>> +     * temporarily an interrupt source. If StoreEOI is active, a
>> +     * source could be left enabled if the load and store operations
>> +     * come out of order.
>> +     *
>> +     * As we don't know the characteristics of the host source
>> +     * interrupts (StoreEOI or not), enforce the load-after-store
>> +     * ordering always. The performance penalty will be very small for
>> +     * QEMU.
>> +     */
>> +    if (offset == XIVE_ESB_SET_PQ_10) {
>> +        offset |= XIVE_ESB_LD_ST_MO;
>> +    }
>> +
>>       return xive_esb_rw(xsrc, srcno, offset, 0, 0) & 0x3;
>>   }
>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
>> index b8e4c7294d59..d62881873b1b 100644
>> --- a/hw/intc/xive.c
>> +++ b/hw/intc/xive.c
>> @@ -1024,6 +1024,12 @@ static uint64_t xive_source_esb_read(void *opaque, hwaddr addr, unsigned size)
>>       case XIVE_ESB_SET_PQ_01 ... XIVE_ESB_SET_PQ_01 + 0x0FF:
>>       case XIVE_ESB_SET_PQ_10 ... XIVE_ESB_SET_PQ_10 + 0x0FF:
>>       case XIVE_ESB_SET_PQ_11 ... XIVE_ESB_SET_PQ_11 + 0x0FF:
>> +        if (offset == XIVE_ESB_SET_PQ_10 &&
>> +            xsrc->esb_flags & XIVE_SRC_STORE_EOI) {
>> +            qemu_log_mask(LOG_GUEST_ERROR, "XIVE: load-after-store ordering "
>> +                          "not enforced with Store EOI active for IRQ %d\n",
>> +                          srcno);
>> +        }
> 
> I'm afraid I have a lack of understanding of what is happening here but I'll try. Up
> there, in xive_esb_read(), you forced the load-after-store ordering for the Load
> operation that can left the source enabled if we have an out of order store-after-load
> situation. Given that it's a function in spapr_xive_kvm.c 

Yes, everything done in spapr_xive_kvm.c is related to the KVM support.

This means that the memory load/store operations are done directly on
the HW MMIO page of the XIVE controller and this from the QEMU user
space program. And so ...


> I understand that this is
> a wrapper for the kvmppc() calls into the host kernel XIVE irqchip, 

the host kernel XIVE irqchip is not involded. QEMU manipulates HW
directly bypassing the host kernel. This is important to understand.
QEMU "pokes and peeks" HW from user space.

> so that piece
> of code is forcing the load-after-store at all times for that Load operation since
> the performance penalty is not relevant.

yes. In that part, we don't know if StoreEOI has been enabled, so we
use the magic offset in any case.

This would be a no-op on P9 and would enforce load-after-store always
on P10. on P10, all sources should have StoreEOI, so it is a correct
thing to do.

xive_esb_read() is used in many places:

  1. in the machine change state handler, for migration and to catch
     inflight interrupts.
  2. to query state, for migration and for the monitor
  3. for LSI handling

We care about 1.

> Assuming that what I said is mostly correct, I don't understand why we can't do a
> similar thing in the hw/intc/xive.c case, where I suppose it's the XIVE emulation
> controller implementation that we're going to use if we don't have an  irqchip to
> use with KVM.

yes. This part of the code is always used for emulated sources. When
the KVM device is in use, it is not involved.

> Can't we force the load-after-store ordering in the emulated
> XIVE_ESB_SET_PQ_10 instead of throwing an error?

There are no load-after-store issues when under emulation. This is a
HW-only problem on high end systems. I think you need more than 4sockets
to show up. I have never seen it on 2 sockets.

But the HW does not tell if something is wrong in the code. At some point,
the OS freezes and possibly complains because an IRQ is (wrongly) pending.
The check done in xive_source_esb_read() is here to catch a misbehaving
OS which would use in some code path the racy XIVE_ESB_SET_PQ_10 without
adding the load-after-store ordering offset XIVE_ESB_LD_ST_MO. That's
how I tracked all the possible paths using XIVE_ESB_SET_PQ_10 under Linux.

Hope this is mostly clear.

Thanks,

C.
Daniel Henrique Barboza Feb. 17, 2022, 11:28 a.m. UTC | #3
On 2/17/22 04:51, Cédric Le Goater wrote:
> [ adding a few people for the comments ]
> 
> On 2/16/22 23:17, Daniel Henrique Barboza wrote:
>>
>>
>> On 2/14/22 11:11, Cédric Le Goater wrote:
>>> When an interrupt has been handled, the OS notifies the interrupt
>>> controller with a EOI sequence. On a POWER9 and POWER10 systems using
>>
>> nit:  s/a EOI sequence/an EOI sequence
>>
>>
>>> the XIVE interrupt controller, this can be done with a load or a store
>>> operation on the ESB interrupt management page of the interrupt. The
>>> StoreEOI operation has less latency and improves interrupt handling
>>> performance but it was deactivated during the POWER9 DD2.0 timeframe
>>> because of ordering issues. POWER9 systems use the LoadEOI instead.
>>> POWER10 compat guests should have fixed the issue with
>>> Load-after-Store ordering and StoreEOI can be activated for them
>>> again.
>>>
>>> To maintain performance, this ordering is only enforced for the
>>> XIVE_ESB_SET_PQ_10 load operation. This operation can be used to
>>> disable temporarily an interrupt source. If StoreEOI is active, a
>>> source could be left enabled if the load and store operations come
>>> out of order.
>>>
>>> Add a check in our XIVE emulation model for Load-after-Store when
>>> StoreEOI is active. It should catch unreliable sequences. Other load
>>> operations should be fine without it.
>>>
>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>> ---
>>>   include/hw/ppc/spapr_xive.h |  1 +
>>>   include/hw/ppc/xive.h       |  8 ++++++++
>>>   hw/intc/spapr_xive.c        | 15 +++++++++++++++
>>>   hw/intc/spapr_xive_kvm.c    | 15 +++++++++++++++
>>>   hw/intc/xive.c              |  6 ++++++
>>>   hw/ppc/spapr_hcall.c        |  7 +++++++
>>>   6 files changed, 52 insertions(+)
>>>
>>> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
>>> index b282960ad90d..9c247d8bf57d 100644
>>> --- a/include/hw/ppc/spapr_xive.h
>>> +++ b/include/hw/ppc/spapr_xive.h
>>> @@ -73,6 +73,7 @@ void spapr_xive_map_mmio(SpaprXive *xive);
>>>   int spapr_xive_end_to_target(uint8_t end_blk, uint32_t end_idx,
>>>                                uint32_t *out_server, uint8_t *out_prio);
>>> +void spapr_xive_enable_store_eoi(SpaprXive *xive, bool enable);
>>>   /*
>>>    * KVM XIVE device helpers
>>> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
>>> index 126e4e2c3a17..133f308c2792 100644
>>> --- a/include/hw/ppc/xive.h
>>> +++ b/include/hw/ppc/xive.h
>>> @@ -285,6 +285,14 @@ uint8_t xive_esb_set(uint8_t *pq, uint8_t value);
>>>   #define XIVE_ESB_SET_PQ_10      0xe00 /* Load */
>>>   #define XIVE_ESB_SET_PQ_11      0xf00 /* Load */
>>> +/*
>>> + * Load-after-store ordering
>>> + *
>>> + * Adding this offset to the load address will enforce
>>> + * load-after-store ordering. This is required to use with StoreEOI.
>>> + */
>>> +#define XIVE_ESB_LD_ST_MO       0x40 /* Load-after-store ordering */
>>> +
>>>   uint8_t xive_source_esb_get(XiveSource *xsrc, uint32_t srcno);
>>>   uint8_t xive_source_esb_set(XiveSource *xsrc, uint32_t srcno, uint8_t pq);
>>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
>>> index dc641cc604bf..0b8a246ad594 100644
>>> --- a/hw/intc/spapr_xive.c
>>> +++ b/hw/intc/spapr_xive.c
>>> @@ -25,6 +25,7 @@
>>>   #include "hw/ppc/xive_regs.h"
>>>   #include "hw/qdev-properties.h"
>>>   #include "trace.h"
>>> +#include "cpu-models.h"
>>>   /*
>>>    * XIVE Virtualization Controller BAR and Thread Managment BAR that we
>>> @@ -1854,3 +1855,17 @@ void spapr_xive_hcall_init(SpaprMachineState *spapr)
>>>       spapr_register_hypercall(H_INT_SYNC, h_int_sync);
>>>       spapr_register_hypercall(H_INT_RESET, h_int_reset);
>>>   }
>>> +
>>> +/*
>>> + * Advertise StoreEOI for a P10 compat guest. OS is required to
>>> + * enforce load-after-store ordering.
>>> + */
>>> +void spapr_xive_enable_store_eoi(SpaprXive *xive, bool enable)
>>> +{
>>> +    if (enable) {
>>> +        xive->source.esb_flags |= XIVE_SRC_STORE_EOI;
>>> +    } else {
>>> +        xive->source.esb_flags &= ~XIVE_SRC_STORE_EOI;
>>> +    }
>>> +
>>> +}
>>> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
>>> index 61fe7bd2d322..bd023407bd7f 100644
>>> --- a/hw/intc/spapr_xive_kvm.c
>>> +++ b/hw/intc/spapr_xive_kvm.c
>>> @@ -296,6 +296,21 @@ static uint64_t xive_esb_rw(XiveSource *xsrc, int srcno, uint32_t offset,
>>>   static uint8_t xive_esb_read(XiveSource *xsrc, int srcno, uint32_t offset)
>>>   {
>>> +    /*
>>> +     * The XIVE_ESB_SET_PQ_10 load operation is used to disable
>>> +     * temporarily an interrupt source. If StoreEOI is active, a
>>> +     * source could be left enabled if the load and store operations
>>> +     * come out of order.
>>> +     *
>>> +     * As we don't know the characteristics of the host source
>>> +     * interrupts (StoreEOI or not), enforce the load-after-store
>>> +     * ordering always. The performance penalty will be very small for
>>> +     * QEMU.
>>> +     */
>>> +    if (offset == XIVE_ESB_SET_PQ_10) {
>>> +        offset |= XIVE_ESB_LD_ST_MO;
>>> +    }
>>> +
>>>       return xive_esb_rw(xsrc, srcno, offset, 0, 0) & 0x3;
>>>   }
>>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
>>> index b8e4c7294d59..d62881873b1b 100644
>>> --- a/hw/intc/xive.c
>>> +++ b/hw/intc/xive.c
>>> @@ -1024,6 +1024,12 @@ static uint64_t xive_source_esb_read(void *opaque, hwaddr addr, unsigned size)
>>>       case XIVE_ESB_SET_PQ_01 ... XIVE_ESB_SET_PQ_01 + 0x0FF:
>>>       case XIVE_ESB_SET_PQ_10 ... XIVE_ESB_SET_PQ_10 + 0x0FF:
>>>       case XIVE_ESB_SET_PQ_11 ... XIVE_ESB_SET_PQ_11 + 0x0FF:
>>> +        if (offset == XIVE_ESB_SET_PQ_10 &&
>>> +            xsrc->esb_flags & XIVE_SRC_STORE_EOI) {
>>> +            qemu_log_mask(LOG_GUEST_ERROR, "XIVE: load-after-store ordering "
>>> +                          "not enforced with Store EOI active for IRQ %d\n",
>>> +                          srcno);
>>> +        }
>>
>> I'm afraid I have a lack of understanding of what is happening here but I'll try. Up
>> there, in xive_esb_read(), you forced the load-after-store ordering for the Load
>> operation that can left the source enabled if we have an out of order store-after-load
>> situation. Given that it's a function in spapr_xive_kvm.c 
> 
> Yes, everything done in spapr_xive_kvm.c is related to the KVM support.
> 
> This means that the memory load/store operations are done directly on
> the HW MMIO page of the XIVE controller and this from the QEMU user
> space program. And so ...
> 
> 
>> I understand that this is
>> a wrapper for the kvmppc() calls into the host kernel XIVE irqchip, 
> 
> the host kernel XIVE irqchip is not involded. QEMU manipulates HW
> directly bypassing the host kernel. This is important to understand.
> QEMU "pokes and peeks" HW from user space.

Oh, I thought the host XIVE chip was involved in this case. Good to know.

> 
>> so that piece
>> of code is forcing the load-after-store at all times for that Load operation since
>> the performance penalty is not relevant.
> 
> yes. In that part, we don't know if StoreEOI has been enabled, so we
> use the magic offset in any case.
> 
> This would be a no-op on P9 and would enforce load-after-store always
> on P10. on P10, all sources should have StoreEOI, so it is a correct
> thing to do.
> 
> xive_esb_read() is used in many places:
> 
>   1. in the machine change state handler, for migration and to catch
>      inflight interrupts.
>   2. to query state, for migration and for the monitor
>   3. for LSI handling
> 
> We care about 1.
> 
>> Assuming that what I said is mostly correct, I don't understand why we can't do a
>> similar thing in the hw/intc/xive.c case, where I suppose it's the XIVE emulation
>> controller implementation that we're going to use if we don't have an  irqchip to
>> use with KVM.
> 
> yes. This part of the code is always used for emulated sources. When
> the KVM device is in use, it is not involved.
> 
>> Can't we force the load-after-store ordering in the emulated
>> XIVE_ESB_SET_PQ_10 instead of throwing an error?
> 
> There are no load-after-store issues when under emulation. This is a
> HW-only problem on high end systems. I think you need more than 4sockets
> to show up. I have never seen it on 2 sockets.
> 
> But the HW does not tell if something is wrong in the code. At some point,
> the OS freezes and possibly complains because an IRQ is (wrongly) pending.
> The check done in xive_source_esb_read() is here to catch a misbehaving
> OS which would use in some code path the racy XIVE_ESB_SET_PQ_10 without
> adding the load-after-store ordering offset XIVE_ESB_LD_ST_MO. That's
> how I tracked all the possible paths using XIVE_ESB_SET_PQ_10 under Linux.
> 
> Hope this is mostly clear.

Yes, much clear. Thanks!


Daniel

> 
> Thanks,
> 
> C.
Daniel Henrique Barboza Feb. 17, 2022, 11:28 a.m. UTC | #4
On 2/14/22 11:11, Cédric Le Goater wrote:
> When an interrupt has been handled, the OS notifies the interrupt
> controller with a EOI sequence. On a POWER9 and POWER10 systems using
> the XIVE interrupt controller, this can be done with a load or a store
> operation on the ESB interrupt management page of the interrupt. The
> StoreEOI operation has less latency and improves interrupt handling
> performance but it was deactivated during the POWER9 DD2.0 timeframe
> because of ordering issues. POWER9 systems use the LoadEOI instead.
> POWER10 compat guests should have fixed the issue with
> Load-after-Store ordering and StoreEOI can be activated for them
> again.
> 
> To maintain performance, this ordering is only enforced for the
> XIVE_ESB_SET_PQ_10 load operation. This operation can be used to
> disable temporarily an interrupt source. If StoreEOI is active, a
> source could be left enabled if the load and store operations come
> out of order.
> 
> Add a check in our XIVE emulation model for Load-after-Store when
> StoreEOI is active. It should catch unreliable sequences. Other load
> operations should be fine without it.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---

Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>

>   include/hw/ppc/spapr_xive.h |  1 +
>   include/hw/ppc/xive.h       |  8 ++++++++
>   hw/intc/spapr_xive.c        | 15 +++++++++++++++
>   hw/intc/spapr_xive_kvm.c    | 15 +++++++++++++++
>   hw/intc/xive.c              |  6 ++++++
>   hw/ppc/spapr_hcall.c        |  7 +++++++
>   6 files changed, 52 insertions(+)
> 
> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
> index b282960ad90d..9c247d8bf57d 100644
> --- a/include/hw/ppc/spapr_xive.h
> +++ b/include/hw/ppc/spapr_xive.h
> @@ -73,6 +73,7 @@ void spapr_xive_map_mmio(SpaprXive *xive);
>   
>   int spapr_xive_end_to_target(uint8_t end_blk, uint32_t end_idx,
>                                uint32_t *out_server, uint8_t *out_prio);
> +void spapr_xive_enable_store_eoi(SpaprXive *xive, bool enable);
>   
>   /*
>    * KVM XIVE device helpers
> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
> index 126e4e2c3a17..133f308c2792 100644
> --- a/include/hw/ppc/xive.h
> +++ b/include/hw/ppc/xive.h
> @@ -285,6 +285,14 @@ uint8_t xive_esb_set(uint8_t *pq, uint8_t value);
>   #define XIVE_ESB_SET_PQ_10      0xe00 /* Load */
>   #define XIVE_ESB_SET_PQ_11      0xf00 /* Load */
>   
> +/*
> + * Load-after-store ordering
> + *
> + * Adding this offset to the load address will enforce
> + * load-after-store ordering. This is required to use with StoreEOI.
> + */
> +#define XIVE_ESB_LD_ST_MO       0x40 /* Load-after-store ordering */
> +
>   uint8_t xive_source_esb_get(XiveSource *xsrc, uint32_t srcno);
>   uint8_t xive_source_esb_set(XiveSource *xsrc, uint32_t srcno, uint8_t pq);
>   
> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> index dc641cc604bf..0b8a246ad594 100644
> --- a/hw/intc/spapr_xive.c
> +++ b/hw/intc/spapr_xive.c
> @@ -25,6 +25,7 @@
>   #include "hw/ppc/xive_regs.h"
>   #include "hw/qdev-properties.h"
>   #include "trace.h"
> +#include "cpu-models.h"
>   
>   /*
>    * XIVE Virtualization Controller BAR and Thread Managment BAR that we
> @@ -1854,3 +1855,17 @@ void spapr_xive_hcall_init(SpaprMachineState *spapr)
>       spapr_register_hypercall(H_INT_SYNC, h_int_sync);
>       spapr_register_hypercall(H_INT_RESET, h_int_reset);
>   }
> +
> +/*
> + * Advertise StoreEOI for a P10 compat guest. OS is required to
> + * enforce load-after-store ordering.
> + */
> +void spapr_xive_enable_store_eoi(SpaprXive *xive, bool enable)
> +{
> +    if (enable) {
> +        xive->source.esb_flags |= XIVE_SRC_STORE_EOI;
> +    } else {
> +        xive->source.esb_flags &= ~XIVE_SRC_STORE_EOI;
> +    }
> +
> +}
> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> index 61fe7bd2d322..bd023407bd7f 100644
> --- a/hw/intc/spapr_xive_kvm.c
> +++ b/hw/intc/spapr_xive_kvm.c
> @@ -296,6 +296,21 @@ static uint64_t xive_esb_rw(XiveSource *xsrc, int srcno, uint32_t offset,
>   
>   static uint8_t xive_esb_read(XiveSource *xsrc, int srcno, uint32_t offset)
>   {
> +    /*
> +     * The XIVE_ESB_SET_PQ_10 load operation is used to disable
> +     * temporarily an interrupt source. If StoreEOI is active, a
> +     * source could be left enabled if the load and store operations
> +     * come out of order.
> +     *
> +     * As we don't know the characteristics of the host source
> +     * interrupts (StoreEOI or not), enforce the load-after-store
> +     * ordering always. The performance penalty will be very small for
> +     * QEMU.
> +     */
> +    if (offset == XIVE_ESB_SET_PQ_10) {
> +        offset |= XIVE_ESB_LD_ST_MO;
> +    }
> +
>       return xive_esb_rw(xsrc, srcno, offset, 0, 0) & 0x3;
>   }
>   
> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> index b8e4c7294d59..d62881873b1b 100644
> --- a/hw/intc/xive.c
> +++ b/hw/intc/xive.c
> @@ -1024,6 +1024,12 @@ static uint64_t xive_source_esb_read(void *opaque, hwaddr addr, unsigned size)
>       case XIVE_ESB_SET_PQ_01 ... XIVE_ESB_SET_PQ_01 + 0x0FF:
>       case XIVE_ESB_SET_PQ_10 ... XIVE_ESB_SET_PQ_10 + 0x0FF:
>       case XIVE_ESB_SET_PQ_11 ... XIVE_ESB_SET_PQ_11 + 0x0FF:
> +        if (offset == XIVE_ESB_SET_PQ_10 &&
> +            xsrc->esb_flags & XIVE_SRC_STORE_EOI) {
> +            qemu_log_mask(LOG_GUEST_ERROR, "XIVE: load-after-store ordering "
> +                          "not enforced with Store EOI active for IRQ %d\n",
> +                          srcno);
> +        }
>           ret = xive_source_esb_set(xsrc, srcno, (offset >> 8) & 0x3);
>           break;
>       default:
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index 8ffb13ada08e..6b888c963ac4 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -1210,11 +1210,18 @@ target_ulong do_client_architecture_support(PowerPCCPU *cpu,
>        * otherwise terminate the boot.
>        */
>       if (guest_xive) {
> +        bool enable;
> +
>           if (!spapr->irq->xive) {
>               error_report(
>   "Guest requested unavailable interrupt mode (XIVE), try the ic-mode=xive or ic-mode=dual machine property");
>               exit(EXIT_FAILURE);
>           }
> +
> +        /* Advertise StoreEOI for a P10 compat guest. */
> +        enable = ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_10, 0,
> +                                  cpu->compat_pvr);
> +        spapr_xive_enable_store_eoi(spapr->xive, enable);
>       } else {
>           if (!spapr->irq->xics) {
>               error_report(
Cédric Le Goater Feb. 17, 2022, 1:23 p.m. UTC | #5
On 2/17/22 12:28, Daniel Henrique Barboza wrote:
> 
> 
> On 2/14/22 11:11, Cédric Le Goater wrote:
>> When an interrupt has been handled, the OS notifies the interrupt
>> controller with a EOI sequence. On a POWER9 and POWER10 systems using
>> the XIVE interrupt controller, this can be done with a load or a store
>> operation on the ESB interrupt management page of the interrupt. The
>> StoreEOI operation has less latency and improves interrupt handling
>> performance but it was deactivated during the POWER9 DD2.0 timeframe
>> because of ordering issues. POWER9 systems use the LoadEOI instead.
>> POWER10 compat guests should have fixed the issue with
>> Load-after-Store ordering and StoreEOI can be activated for them
>> again.
>>
>> To maintain performance, this ordering is only enforced for the
>> XIVE_ESB_SET_PQ_10 load operation. This operation can be used to
>> disable temporarily an interrupt source. If StoreEOI is active, a
>> source could be left enabled if the load and store operations come
>> out of order.
>>
>> Add a check in our XIVE emulation model for Load-after-Store when
>> StoreEOI is active. It should catch unreliable sequences. Other load
>> operations should be fine without it.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
> 
> Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>


Unfortunetaly, this patch breaks migration under TCG because the XIVE
source flag is not updated on the target side. KVM is not impacted
because the emulated sources are not used. This needs to be addressed
in a v2.

That said, even without this patch, TCG migration is broken. some CPUs
on the receive side are stalled on CPU Hard LOCKUPs. QEMU 6.2 is impacted.
So it has been a while :/

See below.

C.



[   24.113608] watchdog: CPU 0 detected hard LOCKUP on other CPUs 1,3
[   24.116534] watchdog: CPU 0 TB:15585461459, last SMP heartbeat TB:7394335409 (15998ms ago)
[   24.117840] watchdog: CPU 1 Hard LOCKUP
[   24.117956] watchdog: CPU 1 TB:15587843000, last heartbeat TB:5355690415 (19984ms ago)
[   24.117999] Modules linked in:
[   24.118387] irq event stamp: 341399
[   24.118399] hardirqs last  enabled at (341399): [<c000000000caea2c>] snooze_loop+0x9c/0x290
[   24.118900] hardirqs last disabled at (341398): [<c000000000208b9c>] do_idle+0x12c/0x450
[   24.118943] softirqs last  enabled at (9798): [<c000000000f97dfc>] __do_softirq+0x60c/0x678
[   24.118971] softirqs last disabled at (9789): [<c0000000001b06f8>] __irq_exit_rcu+0x158/0x1c0
[   24.119127] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.17.0-rc4-dirty #984
[   24.119293] NIP:  c000000000caea78 LR: c000000000caea38 CTR: c000000000cae990
[   24.119315] REGS: c0000000fff43d60 TRAP: 0100   Not tainted  (5.17.0-rc4-dirty)
[   24.119352] MSR:  800000000280b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE>  CR: 28000228  XER: 00000006
[   24.119554] CFAR: c000000000caea98 IRQMASK: 0
[   24.119554] GPR00: c000000000caea2c c000000002bbbd80 c000000001c30b00 0000000000000000
[   24.119554] GPR04: 0000000000000006 0000000000000000 000000000000c800 c000000001c7dc38
[   24.119554] GPR08: c000000002b5d500 0000000000000000 00000003a115ef39 36d551ed00000000
[   24.119554] GPR12: c000000000cae990 c0000000fffff300 0000000000000000 0000000000000000
[   24.119554] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
[   24.119554] GPR20: 0000000000000000 0000000000000000 0000000000000000 c000000001b3a660
[   24.119554] GPR24: c0000000ffa4fb48 000000059d7c5070 c000000001c78e48 0000000000000000
[   24.119554] GPR28: c000000001b3a660 c0000000015422e0 c0000000015422e8 0000000000000000
[   24.119845] NIP [c000000000caea78] snooze_loop+0xe8/0x290
[   24.119866] LR [c000000000caea38] snooze_loop+0xa8/0x290
[   24.119998] Call Trace:
[   24.120029] [c000000002bbbd80] [c000000000caea2c] snooze_loop+0x9c/0x290 (unreliable)
[   24.120097] [c000000002bbbdc0] [c000000000cab730] cpuidle_enter_state+0x300/0x730
[   24.120119] [c000000002bbbe30] [c000000000cabbfc] cpuidle_enter+0x4c/0x70
[   24.120131] [c000000002bbbe70] [c000000000208d98] do_idle+0x328/0x450
[   24.120141] [c000000002bbbf00] [c00000000020926c] cpu_startup_entry+0x3c/0x40
[   24.120150] [c000000002bbbf30] [c00000000005e144] start_secondary+0x2a4/0x2b0
[   24.120161] [c000000002bbbf90] [c00000000000d054] start_secondary_prolog+0x10/0x14
[   24.120238] Instruction dump:
[   24.120320] e9280080 e8c7d148 3ce20005 71290004 38e7d138 7d4a3214 4082003c 60000000
[   24.120357] 60000000 60420000 7c210b78 7ffffb78 <8927000c> 2c090000 41820010 7d2c42a6
Daniel Henrique Barboza Feb. 17, 2022, 1:30 p.m. UTC | #6
On 2/17/22 10:23, Cédric Le Goater wrote:
> On 2/17/22 12:28, Daniel Henrique Barboza wrote:
>>
>>
>> On 2/14/22 11:11, Cédric Le Goater wrote:
>>> When an interrupt has been handled, the OS notifies the interrupt
>>> controller with a EOI sequence. On a POWER9 and POWER10 systems using
>>> the XIVE interrupt controller, this can be done with a load or a store
>>> operation on the ESB interrupt management page of the interrupt. The
>>> StoreEOI operation has less latency and improves interrupt handling
>>> performance but it was deactivated during the POWER9 DD2.0 timeframe
>>> because of ordering issues. POWER9 systems use the LoadEOI instead.
>>> POWER10 compat guests should have fixed the issue with
>>> Load-after-Store ordering and StoreEOI can be activated for them
>>> again.
>>>
>>> To maintain performance, this ordering is only enforced for the
>>> XIVE_ESB_SET_PQ_10 load operation. This operation can be used to
>>> disable temporarily an interrupt source. If StoreEOI is active, a
>>> source could be left enabled if the load and store operations come
>>> out of order.
>>>
>>> Add a check in our XIVE emulation model for Load-after-Store when
>>> StoreEOI is active. It should catch unreliable sequences. Other load
>>> operations should be fine without it.
>>>
>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>> ---
>>
>> Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>
> 
> 
> Unfortunetaly, this patch breaks migration under TCG because the XIVE
> source flag is not updated on the target side. KVM is not impacted
> because the emulated sources are not used. This needs to be addressed
> in a v2.
> 
> That said, even without this patch, TCG migration is broken. some CPUs
> on the receive side are stalled on CPU Hard LOCKUPs. QEMU 6.2 is impacted.
> So it has been a while :/

Ouch. Guess we need to add TCG migration tests in the test workflow ...



Daniel

> 
> See below.
> 
> C.
> 
> 
> 
> [   24.113608] watchdog: CPU 0 detected hard LOCKUP on other CPUs 1,3
> [   24.116534] watchdog: CPU 0 TB:15585461459, last SMP heartbeat TB:7394335409 (15998ms ago)
> [   24.117840] watchdog: CPU 1 Hard LOCKUP
> [   24.117956] watchdog: CPU 1 TB:15587843000, last heartbeat TB:5355690415 (19984ms ago)
> [   24.117999] Modules linked in:
> [   24.118387] irq event stamp: 341399
> [   24.118399] hardirqs last  enabled at (341399): [<c000000000caea2c>] snooze_loop+0x9c/0x290
> [   24.118900] hardirqs last disabled at (341398): [<c000000000208b9c>] do_idle+0x12c/0x450
> [   24.118943] softirqs last  enabled at (9798): [<c000000000f97dfc>] __do_softirq+0x60c/0x678
> [   24.118971] softirqs last disabled at (9789): [<c0000000001b06f8>] __irq_exit_rcu+0x158/0x1c0
> [   24.119127] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.17.0-rc4-dirty #984
> [   24.119293] NIP:  c000000000caea78 LR: c000000000caea38 CTR: c000000000cae990
> [   24.119315] REGS: c0000000fff43d60 TRAP: 0100   Not tainted  (5.17.0-rc4-dirty)
> [   24.119352] MSR:  800000000280b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE>  CR: 28000228  XER: 00000006
> [   24.119554] CFAR: c000000000caea98 IRQMASK: 0
> [   24.119554] GPR00: c000000000caea2c c000000002bbbd80 c000000001c30b00 0000000000000000
> [   24.119554] GPR04: 0000000000000006 0000000000000000 000000000000c800 c000000001c7dc38
> [   24.119554] GPR08: c000000002b5d500 0000000000000000 00000003a115ef39 36d551ed00000000
> [   24.119554] GPR12: c000000000cae990 c0000000fffff300 0000000000000000 0000000000000000
> [   24.119554] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> [   24.119554] GPR20: 0000000000000000 0000000000000000 0000000000000000 c000000001b3a660
> [   24.119554] GPR24: c0000000ffa4fb48 000000059d7c5070 c000000001c78e48 0000000000000000
> [   24.119554] GPR28: c000000001b3a660 c0000000015422e0 c0000000015422e8 0000000000000000
> [   24.119845] NIP [c000000000caea78] snooze_loop+0xe8/0x290
> [   24.119866] LR [c000000000caea38] snooze_loop+0xa8/0x290
> [   24.119998] Call Trace:
> [   24.120029] [c000000002bbbd80] [c000000000caea2c] snooze_loop+0x9c/0x290 (unreliable)
> [   24.120097] [c000000002bbbdc0] [c000000000cab730] cpuidle_enter_state+0x300/0x730
> [   24.120119] [c000000002bbbe30] [c000000000cabbfc] cpuidle_enter+0x4c/0x70
> [   24.120131] [c000000002bbbe70] [c000000000208d98] do_idle+0x328/0x450
> [   24.120141] [c000000002bbbf00] [c00000000020926c] cpu_startup_entry+0x3c/0x40
> [   24.120150] [c000000002bbbf30] [c00000000005e144] start_secondary+0x2a4/0x2b0
> [   24.120161] [c000000002bbbf90] [c00000000000d054] start_secondary_prolog+0x10/0x14
> [   24.120238] Instruction dump:
> [   24.120320] e9280080 e8c7d148 3ce20005 71290004 38e7d138 7d4a3214 4082003c 60000000
> [   24.120357] 60000000 60420000 7c210b78 7ffffb78 <8927000c> 2c090000 41820010 7d2c42a6
>
Cédric Le Goater Feb. 17, 2022, 5:42 p.m. UTC | #7
>> Unfortunately, this patch breaks migration under TCG because the XIVE
>> source flag is not updated on the target side. KVM is not impacted
>> because the emulated sources are not used. This needs to be addressed
>> in a v2.
>>
>> That said, even without this patch, TCG migration is broken. some CPUs
>> on the receive side are stalled on CPU Hard LOCKUPs. QEMU 6.2 is impacted.
>> So it has been a while :/
> 
> Ouch. Guess we need to add TCG migration tests in the test workflow ...

Regarding the first issue with the new XIVE source flag, this routine
changes an object property after realize which is a no-no for migration :

     void spapr_xive_enable_store_eoi(SpaprXive *xive, bool enable)
     {
         if (enable) {
             xive->source.esb_flags |= XIVE_SRC_STORE_EOI;
         } else {
             xive->source.esb_flags &= ~XIVE_SRC_STORE_EOI;
         }
     }

I think we need a new SpaprXive state to represent the characteristic
of the source indirectly negotiated by CAS when the CPU is a POWER10.
we would use it to update xive->source.esb_flags at post_load time
after migration.

Or simply mimick CAS :

@@ -531,6 +531,14 @@ static int spapr_xive_post_load(SpaprInt
          return kvmppc_xive_post_load(xive, version_id);
      }
  
+    PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
+    bool enable = ppc_check_compat(first_ppc_cpu, CPU_POWERPC_LOGICAL_3_10, 0,
+                                   first_ppc_cpu->compat_pvr);
+    spapr_xive_enable_store_eoi(xive, enable);
+
      return 0;
  }
  
which has the benefit of being stateless.

Ideas ?

Thanks,

C.
Daniel Henrique Barboza Feb. 17, 2022, 9:03 p.m. UTC | #8
On 2/17/22 10:23, Cédric Le Goater wrote:
> On 2/17/22 12:28, Daniel Henrique Barboza wrote:
>>
>>
>> On 2/14/22 11:11, Cédric Le Goater wrote:
>>> When an interrupt has been handled, the OS notifies the interrupt
>>> controller with a EOI sequence. On a POWER9 and POWER10 systems using
>>> the XIVE interrupt controller, this can be done with a load or a store
>>> operation on the ESB interrupt management page of the interrupt. The
>>> StoreEOI operation has less latency and improves interrupt handling
>>> performance but it was deactivated during the POWER9 DD2.0 timeframe
>>> because of ordering issues. POWER9 systems use the LoadEOI instead.
>>> POWER10 compat guests should have fixed the issue with
>>> Load-after-Store ordering and StoreEOI can be activated for them
>>> again.
>>>
>>> To maintain performance, this ordering is only enforced for the
>>> XIVE_ESB_SET_PQ_10 load operation. This operation can be used to
>>> disable temporarily an interrupt source. If StoreEOI is active, a
>>> source could be left enabled if the load and store operations come
>>> out of order.
>>>
>>> Add a check in our XIVE emulation model for Load-after-Store when
>>> StoreEOI is active. It should catch unreliable sequences. Other load
>>> operations should be fine without it.
>>>
>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>> ---
>>
>> Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>
> 
> 
> Unfortunetaly, this patch breaks migration under TCG because the XIVE
> source flag is not updated on the target side. KVM is not impacted
> because the emulated sources are not used. This needs to be addressed
> in a v2.
> 
> That said, even without this patch, TCG migration is broken. some CPUs
> on the receive side are stalled on CPU Hard LOCKUPs. QEMU 6.2 is impacted.
> So it has been a while :/



I've done a few tests and I can see Hard Lockups with TCG pseries migration, when using
multiples CPUs (I used -smp 4 like you suggested in private), since at least QEMU
v6.0.0.

This is hardly surprising since TCG migration isn't something that we ever supported in
a product or even in the community*. It would be good to understand why and get it fixed,
but for now we can take a bit comfort in knowing that:

- it has been broken for awhile (if ever worked). If this was a recent 7.0 regression
we would need to solve it for this upcoming release;

- single CPU TCG migration seems to be working fine, so we can count with this TCG
migration scenario for testing.



* I'm hoping David and Greg can push back on this if my assumption is wrong.



Thanks,



Daniel






> 
> See below.
> 
> C.
> 
> 
> 
> [   24.113608] watchdog: CPU 0 detected hard LOCKUP on other CPUs 1,3
> [   24.116534] watchdog: CPU 0 TB:15585461459, last SMP heartbeat TB:7394335409 (15998ms ago)
> [   24.117840] watchdog: CPU 1 Hard LOCKUP
> [   24.117956] watchdog: CPU 1 TB:15587843000, last heartbeat TB:5355690415 (19984ms ago)
> [   24.117999] Modules linked in:
> [   24.118387] irq event stamp: 341399
> [   24.118399] hardirqs last  enabled at (341399): [<c000000000caea2c>] snooze_loop+0x9c/0x290
> [   24.118900] hardirqs last disabled at (341398): [<c000000000208b9c>] do_idle+0x12c/0x450
> [   24.118943] softirqs last  enabled at (9798): [<c000000000f97dfc>] __do_softirq+0x60c/0x678
> [   24.118971] softirqs last disabled at (9789): [<c0000000001b06f8>] __irq_exit_rcu+0x158/0x1c0
> [   24.119127] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.17.0-rc4-dirty #984
> [   24.119293] NIP:  c000000000caea78 LR: c000000000caea38 CTR: c000000000cae990
> [   24.119315] REGS: c0000000fff43d60 TRAP: 0100   Not tainted  (5.17.0-rc4-dirty)
> [   24.119352] MSR:  800000000280b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE>  CR: 28000228  XER: 00000006
> [   24.119554] CFAR: c000000000caea98 IRQMASK: 0
> [   24.119554] GPR00: c000000000caea2c c000000002bbbd80 c000000001c30b00 0000000000000000
> [   24.119554] GPR04: 0000000000000006 0000000000000000 000000000000c800 c000000001c7dc38
> [   24.119554] GPR08: c000000002b5d500 0000000000000000 00000003a115ef39 36d551ed00000000
> [   24.119554] GPR12: c000000000cae990 c0000000fffff300 0000000000000000 0000000000000000
> [   24.119554] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> [   24.119554] GPR20: 0000000000000000 0000000000000000 0000000000000000 c000000001b3a660
> [   24.119554] GPR24: c0000000ffa4fb48 000000059d7c5070 c000000001c78e48 0000000000000000
> [   24.119554] GPR28: c000000001b3a660 c0000000015422e0 c0000000015422e8 0000000000000000
> [   24.119845] NIP [c000000000caea78] snooze_loop+0xe8/0x290
> [   24.119866] LR [c000000000caea38] snooze_loop+0xa8/0x290
> [   24.119998] Call Trace:
> [   24.120029] [c000000002bbbd80] [c000000000caea2c] snooze_loop+0x9c/0x290 (unreliable)
> [   24.120097] [c000000002bbbdc0] [c000000000cab730] cpuidle_enter_state+0x300/0x730
> [   24.120119] [c000000002bbbe30] [c000000000cabbfc] cpuidle_enter+0x4c/0x70
> [   24.120131] [c000000002bbbe70] [c000000000208d98] do_idle+0x328/0x450
> [   24.120141] [c000000002bbbf00] [c00000000020926c] cpu_startup_entry+0x3c/0x40
> [   24.120150] [c000000002bbbf30] [c00000000005e144] start_secondary+0x2a4/0x2b0
> [   24.120161] [c000000002bbbf90] [c00000000000d054] start_secondary_prolog+0x10/0x14
> [   24.120238] Instruction dump:
> [   24.120320] e9280080 e8c7d148 3ce20005 71290004 38e7d138 7d4a3214 4082003c 60000000
> [   24.120357] 60000000 60420000 7c210b78 7ffffb78 <8927000c> 2c090000 41820010 7d2c42a6
>
diff mbox series

Patch

diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
index b282960ad90d..9c247d8bf57d 100644
--- a/include/hw/ppc/spapr_xive.h
+++ b/include/hw/ppc/spapr_xive.h
@@ -73,6 +73,7 @@  void spapr_xive_map_mmio(SpaprXive *xive);
 
 int spapr_xive_end_to_target(uint8_t end_blk, uint32_t end_idx,
                              uint32_t *out_server, uint8_t *out_prio);
+void spapr_xive_enable_store_eoi(SpaprXive *xive, bool enable);
 
 /*
  * KVM XIVE device helpers
diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
index 126e4e2c3a17..133f308c2792 100644
--- a/include/hw/ppc/xive.h
+++ b/include/hw/ppc/xive.h
@@ -285,6 +285,14 @@  uint8_t xive_esb_set(uint8_t *pq, uint8_t value);
 #define XIVE_ESB_SET_PQ_10      0xe00 /* Load */
 #define XIVE_ESB_SET_PQ_11      0xf00 /* Load */
 
+/*
+ * Load-after-store ordering
+ *
+ * Adding this offset to the load address will enforce
+ * load-after-store ordering. This is required to use with StoreEOI.
+ */
+#define XIVE_ESB_LD_ST_MO       0x40 /* Load-after-store ordering */
+
 uint8_t xive_source_esb_get(XiveSource *xsrc, uint32_t srcno);
 uint8_t xive_source_esb_set(XiveSource *xsrc, uint32_t srcno, uint8_t pq);
 
diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
index dc641cc604bf..0b8a246ad594 100644
--- a/hw/intc/spapr_xive.c
+++ b/hw/intc/spapr_xive.c
@@ -25,6 +25,7 @@ 
 #include "hw/ppc/xive_regs.h"
 #include "hw/qdev-properties.h"
 #include "trace.h"
+#include "cpu-models.h"
 
 /*
  * XIVE Virtualization Controller BAR and Thread Managment BAR that we
@@ -1854,3 +1855,17 @@  void spapr_xive_hcall_init(SpaprMachineState *spapr)
     spapr_register_hypercall(H_INT_SYNC, h_int_sync);
     spapr_register_hypercall(H_INT_RESET, h_int_reset);
 }
+
+/*
+ * Advertise StoreEOI for a P10 compat guest. OS is required to
+ * enforce load-after-store ordering.
+ */
+void spapr_xive_enable_store_eoi(SpaprXive *xive, bool enable)
+{
+    if (enable) {
+        xive->source.esb_flags |= XIVE_SRC_STORE_EOI;
+    } else {
+        xive->source.esb_flags &= ~XIVE_SRC_STORE_EOI;
+    }
+
+}
diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
index 61fe7bd2d322..bd023407bd7f 100644
--- a/hw/intc/spapr_xive_kvm.c
+++ b/hw/intc/spapr_xive_kvm.c
@@ -296,6 +296,21 @@  static uint64_t xive_esb_rw(XiveSource *xsrc, int srcno, uint32_t offset,
 
 static uint8_t xive_esb_read(XiveSource *xsrc, int srcno, uint32_t offset)
 {
+    /*
+     * The XIVE_ESB_SET_PQ_10 load operation is used to disable
+     * temporarily an interrupt source. If StoreEOI is active, a
+     * source could be left enabled if the load and store operations
+     * come out of order.
+     *
+     * As we don't know the characteristics of the host source
+     * interrupts (StoreEOI or not), enforce the load-after-store
+     * ordering always. The performance penalty will be very small for
+     * QEMU.
+     */
+    if (offset == XIVE_ESB_SET_PQ_10) {
+        offset |= XIVE_ESB_LD_ST_MO;
+    }
+
     return xive_esb_rw(xsrc, srcno, offset, 0, 0) & 0x3;
 }
 
diff --git a/hw/intc/xive.c b/hw/intc/xive.c
index b8e4c7294d59..d62881873b1b 100644
--- a/hw/intc/xive.c
+++ b/hw/intc/xive.c
@@ -1024,6 +1024,12 @@  static uint64_t xive_source_esb_read(void *opaque, hwaddr addr, unsigned size)
     case XIVE_ESB_SET_PQ_01 ... XIVE_ESB_SET_PQ_01 + 0x0FF:
     case XIVE_ESB_SET_PQ_10 ... XIVE_ESB_SET_PQ_10 + 0x0FF:
     case XIVE_ESB_SET_PQ_11 ... XIVE_ESB_SET_PQ_11 + 0x0FF:
+        if (offset == XIVE_ESB_SET_PQ_10 &&
+            xsrc->esb_flags & XIVE_SRC_STORE_EOI) {
+            qemu_log_mask(LOG_GUEST_ERROR, "XIVE: load-after-store ordering "
+                          "not enforced with Store EOI active for IRQ %d\n",
+                          srcno);
+        }
         ret = xive_source_esb_set(xsrc, srcno, (offset >> 8) & 0x3);
         break;
     default:
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index 8ffb13ada08e..6b888c963ac4 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -1210,11 +1210,18 @@  target_ulong do_client_architecture_support(PowerPCCPU *cpu,
      * otherwise terminate the boot.
      */
     if (guest_xive) {
+        bool enable;
+
         if (!spapr->irq->xive) {
             error_report(
 "Guest requested unavailable interrupt mode (XIVE), try the ic-mode=xive or ic-mode=dual machine property");
             exit(EXIT_FAILURE);
         }
+
+        /* Advertise StoreEOI for a P10 compat guest. */
+        enable = ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_10, 0,
+                                  cpu->compat_pvr);
+        spapr_xive_enable_store_eoi(spapr->xive, enable);
     } else {
         if (!spapr->irq->xics) {
             error_report(