
[for-6.0,v2,2/3] spapr/xive: Fix size of END table and number of claimed IPIs

Message ID 20201130165258.744611-3-groug@kaod.org
State: New
Series: spapr: Address the confusion between IPI numbers and vCPU ids

Commit Message

Greg Kurz Nov. 30, 2020, 4:52 p.m. UTC
The sPAPR XIVE device has an internal ENDT table the size of
which is configurable by the machine. This table is supposed
to contain END structures for all possible vCPUs that may
enter the guest. The machine must also claim IPIs for all
possible vCPUs since this is expected by the guest.

spapr_irq_init() takes care of that under the assumption that
spapr_max_vcpu_ids() returns the number of possible vCPUs.
This happens to be the case when the VSMT mode is set to match
the number of threads per core in the guest (default behavior).
With non-default VSMT settings, this limit is greater than the
number of vCPUs. In the worst case, we can end up allocating an
ENDT that is 8 times bigger and claiming 8 times more IPIs than
needed. But more importantly, this creates confusion between the
number of vCPUs and vCPU ids, which can lead to subtle bugs
like [1].
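
To illustrate the arithmetic, here is a rough sketch (the helper
below is illustrative only, not the actual QEMU code; it assumes the
usual sPAPR vCPU id spacing, vcpu_id = (cpu_index / threads) * vsmt
+ cpu_index % threads, and that max_cpus is a multiple of threads):

#include <stdio.h>

static unsigned int example_max_vcpu_ids(unsigned int max_cpus,
                                         unsigned int threads,
                                         unsigned int vsmt)
{
    /* upper bound on vCPU ids: one vsmt-sized slot per guest core */
    return (max_cpus / threads) * vsmt;
}

int main(void)
{
    /* default case: vsmt == threads, the bound equals max_cpus */
    printf("%u\n", example_max_vcpu_ids(16, 8, 8));  /* 16 */
    /* worst case: threads=1, vsmt=8, the bound is 8 * max_cpus */
    printf("%u\n", example_max_vcpu_ids(16, 1, 8));  /* 128 */
    return 0;
}

With the worst-case values, nr-ends (8 ENDs per vCPU) and the number
of claimed IPIs both end up 8 times larger than what smp.max_cpus
actually requires.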

Use smp.max_cpus instead of spapr_max_vcpu_ids() in
spapr_irq_init() for the latest machine type. Older machine
types continue to use spapr_max_vcpu_ids() since the size of
the ENDT is migration visible.

[1] https://bugs.launchpad.net/qemu/+bug/1900241

Signed-off-by: Greg Kurz <groug@kaod.org>
---
 include/hw/ppc/spapr.h |  1 +
 hw/ppc/spapr.c         |  3 +++
 hw/ppc/spapr_irq.c     | 16 +++++++++++++---
 3 files changed, 17 insertions(+), 3 deletions(-)

Comments

Cédric Le Goater Nov. 30, 2020, 6:07 p.m. UTC | #1
On 11/30/20 5:52 PM, Greg Kurz wrote:
> The sPAPR XIVE device has an internal ENDT table the size of
> which is configurable by the machine. This table is supposed
> to contain END structures for all possible vCPUs that may
> enter the guest. The machine must also claim IPIs for all
> possible vCPUs since this is expected by the guest.
> 
> spapr_irq_init() takes care of that under the assumption that
> spapr_max_vcpu_ids() returns the number of possible vCPUs.
> This happens to be the case when the VSMT mode is set to match
> the number of threads per core in the guest (default behavior).
> With non-default VSMT settings, this limit is > to the number
> of vCPUs. In the worst case, we can end up allocating an 8 times
> bigger ENDT and claiming 8 times more IPIs than needed. But more
> importantly, this creates a confusion between number of vCPUs and
> vCPU ids, which can lead to subtle bugs like [1].
> 
> Use smp.max_cpus instead of spapr_max_vcpu_ids() in
> spapr_irq_init() for the latest machine type. Older machine
> types continue to use spapr_max_vcpu_ids() since the size of
> the ENDT is migration visible.
> 
> [1] https://bugs.launchpad.net/qemu/+bug/1900241
> 
> Signed-off-by: Greg Kurz <groug@kaod.org>


Reviewed-by: Cédric Le Goater <clg@kaod.org>

Thanks,

C.

> ---
>  include/hw/ppc/spapr.h |  1 +
>  hw/ppc/spapr.c         |  3 +++
>  hw/ppc/spapr_irq.c     | 16 +++++++++++++---
>  3 files changed, 17 insertions(+), 3 deletions(-)
> 
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index dc99d45e2852..95bf210d0bf6 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -139,6 +139,7 @@ struct SpaprMachineClass {
>      hwaddr rma_limit;          /* clamp the RMA to this size */
>      bool pre_5_1_assoc_refpoints;
>      bool pre_5_2_numa_associativity;
> +    bool pre_6_0_xive_max_cpus;
>  
>      bool (*phb_placement)(SpaprMachineState *spapr, uint32_t index,
>                            uint64_t *buid, hwaddr *pio, 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index ab59bfe941d0..227a926dfd48 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -4530,8 +4530,11 @@ DEFINE_SPAPR_MACHINE(6_0, "6.0", true);
>   */
>  static void spapr_machine_5_2_class_options(MachineClass *mc)
>  {
> +    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
> +
>      spapr_machine_6_0_class_options(mc);
>      compat_props_add(mc->compat_props, hw_compat_5_2, hw_compat_5_2_len);
> +    smc->pre_6_0_xive_max_cpus = true;
>  }
>  
>  DEFINE_SPAPR_MACHINE(5_2, "5.2", false);
> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
> index 552e30e93036..4d9ecd5d0af8 100644
> --- a/hw/ppc/spapr_irq.c
> +++ b/hw/ppc/spapr_irq.c
> @@ -324,17 +324,27 @@ void spapr_irq_init(SpaprMachineState *spapr, Error **errp)
>      }
>  
>      if (spapr->irq->xive) {
> -        uint32_t max_vcpu_ids = spapr_max_vcpu_ids(spapr);
> +        uint32_t max_cpus = MACHINE(spapr)->smp.max_cpus;
>          DeviceState *dev;
>          int i;
>  
> +        /*
> +         * Older machine types used to size the ENDT and IPI range
> +         * according to the upper limit of vCPU ids, which can be
> +         * higher than smp.max_cpus with custom VSMT settings. Keep
> +         * the previous behavior for compatibility with such setups.
> +         */
> +        if (smc->pre_6_0_xive_max_cpus) {
> +            max_cpus = spapr_max_vcpu_ids(spapr);
> +        }
> +
>          dev = qdev_new(TYPE_SPAPR_XIVE);
>          qdev_prop_set_uint32(dev, "nr-irqs", smc->nr_xirqs + SPAPR_XIRQ_BASE);
>          /*
>           * 8 XIVE END structures per CPU. One for each available
>           * priority
>           */
> -        qdev_prop_set_uint32(dev, "nr-ends", max_vcpu_ids << 3);
> +        qdev_prop_set_uint32(dev, "nr-ends", max_cpus << 3);
>          object_property_set_link(OBJECT(dev), "xive-fabric", OBJECT(spapr),
>                                   &error_abort);
>          sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
> @@ -342,7 +352,7 @@ void spapr_irq_init(SpaprMachineState *spapr, Error **errp)
>          spapr->xive = SPAPR_XIVE(dev);
>  
>          /* Enable the CPU IPIs */
> -        for (i = 0; i < max_vcpu_ids; ++i) {
> +        for (i = 0; i < max_cpus; ++i) {
>              SpaprInterruptControllerClass *sicc
>                  = SPAPR_INTC_GET_CLASS(spapr->xive);
>  
>
Cédric Le Goater Dec. 3, 2020, 9:51 a.m. UTC | #2
On 11/30/20 7:07 PM, Cédric Le Goater wrote:
> On 11/30/20 5:52 PM, Greg Kurz wrote:
>> The sPAPR XIVE device has an internal ENDT table the size of
>> which is configurable by the machine. This table is supposed
>> to contain END structures for all possible vCPUs that may
>> enter the guest. The machine must also claim IPIs for all
>> possible vCPUs since this is expected by the guest.
>>
>> spapr_irq_init() takes care of that under the assumption that
>> spapr_max_vcpu_ids() returns the number of possible vCPUs.
>> This happens to be the case when the VSMT mode is set to match
>> the number of threads per core in the guest (default behavior).
>> With non-default VSMT settings, this limit is > to the number
>> of vCPUs. In the worst case, we can end up allocating an 8 times
>> bigger ENDT and claiming 8 times more IPIs than needed. But more
>> importantly, this creates a confusion between number of vCPUs and
>> vCPU ids, which can lead to subtle bugs like [1].
>>
>> Use smp.max_cpus instead of spapr_max_vcpu_ids() in
>> spapr_irq_init() for the latest machine type. Older machine
>> types continue to use spapr_max_vcpu_ids() since the size of
>> the ENDT is migration visible.
>>
>> [1] https://bugs.launchpad.net/qemu/+bug/1900241
>>
>> Signed-off-by: Greg Kurz <groug@kaod.org>
> 
> 
> Reviewed-by: Cédric Le Goater <clg@kaod.org>


I gave patches 2 and 3 a little more thought.

I don't think we need much more than patch 1, which clarifies the
nature of the values being manipulated: quantities vs. numbering.

The last 2 patches add complexity to try to optimize the
XIVE VP space in a scenario which is not very common (vSMT).
Maybe it's not worth it.

Today, we can start 4K (-2) KVM guests with 16 vCPUs each on
a witherspoon (2-socket P9) and we are far from reaching the
limits of the VP space. Available RAM is more of a problem.

VP space is even bigger on P10. The width was increased to 24 bits
per chip.

C.
Greg Kurz Dec. 3, 2020, 10:50 a.m. UTC | #3
On Thu, 3 Dec 2020 10:51:10 +0100
Cédric Le Goater <clg@kaod.org> wrote:

> On 11/30/20 7:07 PM, Cédric Le Goater wrote:
> > On 11/30/20 5:52 PM, Greg Kurz wrote:
> >> The sPAPR XIVE device has an internal ENDT table the size of
> >> which is configurable by the machine. This table is supposed
> >> to contain END structures for all possible vCPUs that may
> >> enter the guest. The machine must also claim IPIs for all
> >> possible vCPUs since this is expected by the guest.
> >>
> >> spapr_irq_init() takes care of that under the assumption that
> >> spapr_max_vcpu_ids() returns the number of possible vCPUs.
> >> This happens to be the case when the VSMT mode is set to match
> >> the number of threads per core in the guest (default behavior).
> >> With non-default VSMT settings, this limit is > to the number
> >> of vCPUs. In the worst case, we can end up allocating an 8 times
> >> bigger ENDT and claiming 8 times more IPIs than needed. But more
> >> importantly, this creates a confusion between number of vCPUs and
> >> vCPU ids, which can lead to subtle bugs like [1].
> >>
> >> Use smp.max_cpus instead of spapr_max_vcpu_ids() in
> >> spapr_irq_init() for the latest machine type. Older machine
> >> types continue to use spapr_max_vcpu_ids() since the size of
> >> the ENDT is migration visible.
> >>
> >> [1] https://bugs.launchpad.net/qemu/+bug/1900241
> >>
> >> Signed-off-by: Greg Kurz <groug@kaod.org>
> > 
> > 
> > Reviewed-by: Cédric Le Goater <clg@kaod.org>
> 
> 
> I gave patch 2 and 3 a little more thinking. 
> 
> I don't think we need much more than patch 1 which clarifies the 
> nature of the values being manipulated, quantities vs. numbering.
> 
> The last 2 patches are adding complexity to try to optimize the 
> XIVE VP space in a case scenario which is not very common (vSMT). 
> May be it's not worth it. 
> 

Well, the motivation isn't really about optimization, since
a non-default vSMT setting already wastes VP space because
of the vCPU spacing. This is more about not using values
with the wrong semantics in the code, to avoid confusion in
future changes.

I agree though that the extra complexity, especially the
compat cruft, might be excessive. So maybe I should just
add comments in the code to clarify when we're using the
wrong semantics?

> Today, we can start 4K (-2) KVM guests with 16 vCPUs each on 
> a witherspoon (2 socket P9) and we are far from reaching the 
> limits of the VP space. Available RAM is more a problem. 
> 
> VP space is even bigger on P10. The width was increased to 24bit 
> per chip.
> 
> C.
Cédric Le Goater Dec. 4, 2020, 8:46 a.m. UTC | #4
>> I don't think we need much more than patch 1 which clarifies the 
>> nature of the values being manipulated, quantities vs. numbering.
>>
>> The last 2 patches are adding complexity to try to optimize the 
>> XIVE VP space in a case scenario which is not very common (vSMT). 
>> May be it's not worth it. 
>>
> 
> Well, the motivation isn't about optimization really since
> a non-default vSMT setting already wastes VP space because
> of the vCPU spacing. 

I don't see any VPs being wasted when not using vSMT. What's
your command line?

> This is more about not using values
> with wrong semantics in the code to avoid confusion in
> future changes.

yes.

> I agree though that the extra complexity, especially the
> compat crust, might be excessive. 

It's nice and correct but it seems a bit like extra noise 
if the default case is not wasting VPs. Let's check that 
first. 

> So maybe I should just
> add comments in the code to clarify when we're using the
> wrong semantics ?

yes. I think this is enough.

Thanks,

C.
Greg Kurz Dec. 4, 2020, 9:11 a.m. UTC | #5
On Fri, 4 Dec 2020 09:46:31 +0100
Cédric Le Goater <clg@kaod.org> wrote:

> >> I don't think we need much more than patch 1 which clarifies the 
> >> nature of the values being manipulated, quantities vs. numbering.
> >>
> >> The last 2 patches are adding complexity to try to optimize the 
> >> XIVE VP space in a case scenario which is not very common (vSMT). 
> >> May be it's not worth it. 
> >>
> > 
> > Well, the motivation isn't about optimization really since
> > a non-default vSMT setting already wastes VP space because
> > of the vCPU spacing. 
> 
> I don't see any VPs being wasted when not using vSMT. What's
> your command line ?
> 

I think there's confusion here. VSMT is always being used.
When you don't specify it on the command line, the machine
code internally sets it for you to the number of threads per
core of the guest. Thanks to that, you get consecutive vCPU
ids and no VP waste. Of course, you get the same result if
you do:

-M pseries,vsmt=N -smp threads=N

If you pass different values to vsmt and threads, though,
you get the spacing and the VP waste.
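
For example, here is a quick illustrative sketch of the spacing
(a hypothetical helper that just mimics the
(cpu_index / threads) * vsmt + cpu_index % threads layout, not the
actual QEMU code):

#include <stdio.h>

static int example_vcpu_id(int cpu_index, int threads, int vsmt)
{
    /* spread each guest core over a vsmt-sized range of ids */
    return (cpu_index / threads) * vsmt + cpu_index % threads;
}

int main(void)
{
    int i;

    /* e.g. -M pseries,vsmt=8 -smp 8,threads=2 */
    for (i = 0; i < 8; i++) {
        printf("vcpu %d -> id %d\n", i, example_vcpu_id(i, 2, 8));
    }
    /* prints ids 0,1,8,9,16,17,24,25: 6 out of every 8 ids unused */
    return 0;
}

With vsmt == threads the ids come out consecutive and nothing is
wasted.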

> > This is more about not using values
> > with wrong semantics in the code to avoid confusion in
> > future changes.
> 
> yes.
> 
> > I agree though that the extra complexity, especially the
> > compat crust, might be excessive. 
> 
> It's nice and correct but it seems a bit like extra noise 
> if the default case is not wasting VPs. Let's check that 
> first. 
> 
> > So maybe I should just
> > add comments in the code to clarify when we're using the
> > wrong semantics ?
> 
> yes. I think this is enough.
> 

I'll do this in v3 then.

> Thanks,
> 
> C.

Patch

diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index dc99d45e2852..95bf210d0bf6 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -139,6 +139,7 @@  struct SpaprMachineClass {
     hwaddr rma_limit;          /* clamp the RMA to this size */
     bool pre_5_1_assoc_refpoints;
     bool pre_5_2_numa_associativity;
+    bool pre_6_0_xive_max_cpus;
 
     bool (*phb_placement)(SpaprMachineState *spapr, uint32_t index,
                           uint64_t *buid, hwaddr *pio, 
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index ab59bfe941d0..227a926dfd48 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -4530,8 +4530,11 @@  DEFINE_SPAPR_MACHINE(6_0, "6.0", true);
  */
 static void spapr_machine_5_2_class_options(MachineClass *mc)
 {
+    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
+
     spapr_machine_6_0_class_options(mc);
     compat_props_add(mc->compat_props, hw_compat_5_2, hw_compat_5_2_len);
+    smc->pre_6_0_xive_max_cpus = true;
 }
 
 DEFINE_SPAPR_MACHINE(5_2, "5.2", false);
diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
index 552e30e93036..4d9ecd5d0af8 100644
--- a/hw/ppc/spapr_irq.c
+++ b/hw/ppc/spapr_irq.c
@@ -324,17 +324,27 @@  void spapr_irq_init(SpaprMachineState *spapr, Error **errp)
     }
 
     if (spapr->irq->xive) {
-        uint32_t max_vcpu_ids = spapr_max_vcpu_ids(spapr);
+        uint32_t max_cpus = MACHINE(spapr)->smp.max_cpus;
         DeviceState *dev;
         int i;
 
+        /*
+         * Older machine types used to size the ENDT and IPI range
+         * according to the upper limit of vCPU ids, which can be
+         * higher than smp.max_cpus with custom VSMT settings. Keep
+         * the previous behavior for compatibility with such setups.
+         */
+        if (smc->pre_6_0_xive_max_cpus) {
+            max_cpus = spapr_max_vcpu_ids(spapr);
+        }
+
         dev = qdev_new(TYPE_SPAPR_XIVE);
         qdev_prop_set_uint32(dev, "nr-irqs", smc->nr_xirqs + SPAPR_XIRQ_BASE);
         /*
          * 8 XIVE END structures per CPU. One for each available
          * priority
          */
-        qdev_prop_set_uint32(dev, "nr-ends", max_vcpu_ids << 3);
+        qdev_prop_set_uint32(dev, "nr-ends", max_cpus << 3);
         object_property_set_link(OBJECT(dev), "xive-fabric", OBJECT(spapr),
                                  &error_abort);
         sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
@@ -342,7 +352,7 @@  void spapr_irq_init(SpaprMachineState *spapr, Error **errp)
         spapr->xive = SPAPR_XIVE(dev);
 
         /* Enable the CPU IPIs */
-        for (i = 0; i < max_vcpu_ids; ++i) {
+        for (i = 0; i < max_cpus; ++i) {
             SpaprInterruptControllerClass *sicc
                 = SPAPR_INTC_GET_CLASS(spapr->xive);