diff mbox series

[1/1] spapr: Check SMT based on KVM_CAP_PPC_SMT_POSSIBLE

Message ID 20180106004722.1152-2-joserz@linux.vnet.ibm.com
State New
Headers show
Series Check SMT based on KVM_CAP_PPC_SMT_POSSIBLE | expand

Commit Message

Jose Ricardo Ziviani Jan. 6, 2018, 12:47 a.m. UTC
Power9 supports 4 HW threads/core but it's possible to emulate
doorbells to implement virtual SMT. KVM has the KVM_CAP_PPC_SMT_POSSIBLE
which returns a bitmap with all SMT modes supported by the host.

Today, QEMU forces the SMT mode based on PVR compat table, this is
silently done in spapr_fixup_cpu_dt. Then, if user passes thread=8 the
guest will end up with 4 threads/core without any feedback to the user.
It is confusing and will crash QEMU if a cpu is hotplugged in that
guest.

This patch makes use of KVM_CAP_PPC_SMT_POSSIBLE to check if the host
supports the SMT mode so it allows Power9 guests to have 8 threads/core
if desired.

Reported-by: Satheesh Rajendran <satheera@in.ibm.com>
Signed-off-by: Jose Ricardo Ziviani <joserz@linux.vnet.ibm.com>
---
 hw/ppc/spapr.c       | 14 +++++++++++++-
 hw/ppc/trace-events  |  1 +
 target/ppc/kvm.c     |  5 +++++
 target/ppc/kvm_ppc.h |  6 ++++++
 4 files changed, 25 insertions(+), 1 deletion(-)

Comments

Greg Kurz Jan. 9, 2018, 12:48 p.m. UTC | #1
On Fri,  5 Jan 2018 22:47:22 -0200
Jose Ricardo Ziviani <joserz@linux.vnet.ibm.com> wrote:

> Power9 supports 4 HW threads/core but it's possible to emulate
> doorbells to implement virtual SMT. KVM has the KVM_CAP_PPC_SMT_POSSIBLE
> which returns a bitmap with all SMT modes supported by the host.
> 
> Today, QEMU forces the SMT mode based on PVR compat table, this is
> silently done in spapr_fixup_cpu_dt. Then, if user passes thread=8 the
> guest will end up with 4 threads/core without any feedback to the user.
> It is confusing and will crash QEMU if a cpu is hotplugged in that
> guest.
> 
> This patch makes use of KVM_CAP_PPC_SMT_POSSIBLE to check if the host
> supports the SMT mode so it allows Power9 guests to have 8 threads/core
> if desired.
> 
> Reported-by: Satheesh Rajendran <satheera@in.ibm.com>
> Signed-off-by: Jose Ricardo Ziviani <joserz@linux.vnet.ibm.com>
> ---

Hi,

I agree with the general idea but I have a few questions.

The MIN(smp_threads, ppc_compat_max_threads(cpu)) computation is
performed in spapr_fixup_cpu_dt() at CAS, but it is also performed
in spapr_populate_cpu_dt() at machine reset or when a CPU is added.

Shouldn't your patch address the latter as well ?

>  hw/ppc/spapr.c       | 14 +++++++++++++-
>  hw/ppc/trace-events  |  1 +
>  target/ppc/kvm.c     |  5 +++++
>  target/ppc/kvm_ppc.h |  6 ++++++
>  4 files changed, 25 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index d1acfe8858..ea2503cd2f 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -345,7 +345,19 @@ static int spapr_fixup_cpu_dt(void *fdt, sPAPRMachineState *spapr)
>          PowerPCCPU *cpu = POWERPC_CPU(cs);
>          DeviceClass *dc = DEVICE_GET_CLASS(cs);
>          int index = spapr_vcpu_id(cpu);
> -        int compat_smt = MIN(smp_threads, ppc_compat_max_threads(cpu));

Considering that we have:

int ppc_compat_max_threads(PowerPCCPU *cpu)
{
    const CompatInfo *compat = compat_by_pvr(cpu->compat_pvr);
    int n_threads = CPU(cpu)->nr_threads;

    if (cpu->compat_pvr) {
        g_assert(compat);
        n_threads = MIN(n_threads, compat->max_threads);
    }

    return n_threads;
}

and

void qemu_init_vcpu(CPUState *cpu)
{
    cpu->nr_cores = smp_cores;
    cpu->nr_threads = smp_threads;
...
}

ppc_compat_max_threads() already returns the smaller value of
smp_threads and the maximum number of HW threads for the PVR.

I don't quite understand why we had this compat_smt calculation
in the first place...

> +
> +        /* set smt to maximum for this current pvr if the number
> +         * passed is higher than defined by PVR compat mode AND
> +         * if KVM cannot emulate it.*/
> +        int compat_smt = smp_threads;
> +        if ((kvmppc_cap_smt_possible() & smp_threads) != smp_threads &&
> +                smp_threads > ppc_compat_max_threads(cpu)) {
> +            compat_smt = ppc_compat_max_threads(cpu);
> +
> +            trace_spapr_fixup_cpu_smt(index, smp_threads,
> +                    kvmppc_cap_smt_possible(),
> +                    ppc_compat_max_threads(cpu));
> +        }

... so I'm wondering if the above shouldn't be performed in
ppc_compat_max_threads() directly ? 

>  
>          if ((index % smt) != 0) {
>              continue;
> diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events
> index b7c3e64b5e..a8e29d7ab1 100644
> --- a/hw/ppc/trace-events
> +++ b/hw/ppc/trace-events
> @@ -16,6 +16,7 @@ spapr_irq_alloc(int irq) "irq %d"
>  spapr_irq_alloc_block(int first, int num, bool lsi, int align) "first irq %d, %d irqs, lsi=%d, alignnum %d"
>  spapr_irq_free(int src, int irq, int num) "Source#%d, first irq %d, %d irqs"
>  spapr_irq_free_warn(int src, int irq) "Source#%d, irq %d is already free"
> +spapr_fixup_cpu_smt(int idx, int smpt, int kvmt, int pvrt) "CPU(%d): expected smt %d, kvm support %d, max smt pvr %d"
>  
>  # hw/ppc/spapr_hcall.c
>  spapr_cas_pvr_try(uint32_t pvr) "0x%x"
> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
> index 518dd06e98..aac5667bf4 100644
> --- a/target/ppc/kvm.c
> +++ b/target/ppc/kvm.c
> @@ -2456,6 +2456,11 @@ bool kvmppc_has_cap_mmu_hash_v3(void)
>      return cap_mmu_hash_v3;
>  }
>  
> +int kvmppc_cap_smt_possible(void)
> +{
> +    return cap_ppc_smt_possible;
> +}
> +
>  PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void)
>  {
>      uint32_t host_pvr = mfpvr();
> diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
> index ecb55493cc..6ac33d2b4a 100644
> --- a/target/ppc/kvm_ppc.h
> +++ b/target/ppc/kvm_ppc.h
> @@ -59,6 +59,7 @@ bool kvmppc_has_cap_fixup_hcalls(void);
>  bool kvmppc_has_cap_htm(void);
>  bool kvmppc_has_cap_mmu_radix(void);
>  bool kvmppc_has_cap_mmu_hash_v3(void);
> +int kvmppc_cap_smt_possible(void);
>  int kvmppc_enable_hwrng(void);
>  int kvmppc_put_books_sregs(PowerPCCPU *cpu);
>  PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void);
> @@ -290,6 +291,11 @@ static inline bool kvmppc_has_cap_mmu_hash_v3(void)
>      return false;
>  }
>  
> +static inline int kvmppc_cap_smt_possible(void)
> +{
> +    return -1;

When CONFIG_KVM is set, the semantics of kvmppc_cap_smt_possible() is:
- a bitmap with supported SMT modes if KVM has KVM_CAP_PPC_SMT_POSSIBLE
- 0 if KVM doesn't have KVM_CAP_PPC_SMT_POSSIBLE or we're running in
  TCG mode

so it looks a bit weird to return -1 when CONFIG_KVM isn't set (when
running in TCG mode, we would get different values depending on how
the QEMU binary was compiled).

Shouldn't this stub return 0 instead ?

Cheers,

--
Greg

> +}
> +
>  static inline int kvmppc_enable_hwrng(void)
>  {
>      return -1;
Jose Ricardo Ziviani Jan. 11, 2018, 12:14 a.m. UTC | #2
On Tue, Jan 09, 2018 at 01:48:13PM +0100, Greg Kurz wrote:
> On Fri,  5 Jan 2018 22:47:22 -0200
> Jose Ricardo Ziviani <joserz@linux.vnet.ibm.com> wrote:
> 
> > Power9 supports 4 HW threads/core but it's possible to emulate
> > doorbells to implement virtual SMT. KVM has the KVM_CAP_PPC_SMT_POSSIBLE
> > which returns a bitmap with all SMT modes supported by the host.
> > 
> > Today, QEMU forces the SMT mode based on PVR compat table, this is
> > silently done in spapr_fixup_cpu_dt. Then, if user passes thread=8 the
> > guest will end up with 4 threads/core without any feedback to the user.
> > It is confusing and will crash QEMU if a cpu is hotplugged in that
> > guest.
> > 
> > This patch makes use of KVM_CAP_PPC_SMT_POSSIBLE to check if the host
> > supports the SMT mode so it allows Power9 guests to have 8 threads/core
> > if desired.
> > 
> > Reported-by: Satheesh Rajendran <satheera@in.ibm.com>
> > Signed-off-by: Jose Ricardo Ziviani <joserz@linux.vnet.ibm.com>
> > ---
> 
> Hi,
> 
> I agree with the general idea but I have a few questions.

Hello!!!! Thank you for your detailed review. :)

I'm copying David too because I've seen other bugs related with (v)smt
topic (specially migration) that it could address.

> 
> The MIN(smp_threads, ppc_compat_max_threads(cpu)) computation is
> performed in spapr_fixup_cpu_dt() at CAS, but it is also performed
> in spapr_populate_cpu_dt() at machine reset or when a CPU is added.
> 
> Shouldn't your patch address the latter as well ?

As far as I investigated, I found out that ppc_compat_max_threads() is
called several times, but it always returns the number of threads from
the argument line. Only in spapr_fixup_cpu_dt(), that happens during
the guest kernel initialization when it's realizing the CPUS, is that
ppc_compat_max_threads() will return that MIN(n_threads, compat->max_threads).

Until them, if(cpu->compat_pvr) is zeroed and QEMU doesn't know the
max_threads yet.

That's the reason that I added the code only in spapr_fixup_cpu_dt()
because this is where the change really happens.

> 
> >  hw/ppc/spapr.c       | 14 +++++++++++++-
> >  hw/ppc/trace-events  |  1 +
> >  target/ppc/kvm.c     |  5 +++++
> >  target/ppc/kvm_ppc.h |  6 ++++++
> >  4 files changed, 25 insertions(+), 1 deletion(-)
> > 
> > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > index d1acfe8858..ea2503cd2f 100644
> > --- a/hw/ppc/spapr.c
> > +++ b/hw/ppc/spapr.c
> > @@ -345,7 +345,19 @@ static int spapr_fixup_cpu_dt(void *fdt, sPAPRMachineState *spapr)
> >          PowerPCCPU *cpu = POWERPC_CPU(cs);
> >          DeviceClass *dc = DEVICE_GET_CLASS(cs);
> >          int index = spapr_vcpu_id(cpu);
> > -        int compat_smt = MIN(smp_threads, ppc_compat_max_threads(cpu));
> 
> Considering that we have:
> 
> int ppc_compat_max_threads(PowerPCCPU *cpu)
> {
>     const CompatInfo *compat = compat_by_pvr(cpu->compat_pvr);
>     int n_threads = CPU(cpu)->nr_threads;
> 
>     if (cpu->compat_pvr) {
>         g_assert(compat);
>         n_threads = MIN(n_threads, compat->max_threads);
>     }
> 
>     return n_threads;
> }
> 
> and
> 
> void qemu_init_vcpu(CPUState *cpu)
> {
>     cpu->nr_cores = smp_cores;
>     cpu->nr_threads = smp_threads;
> ...
> }
> 
> ppc_compat_max_threads() already returns the smaller value of
> smp_threads and the maximum number of HW threads for the PVR.
> 
> I don't quite understand why we had this compat_smt calculation
> in the first place...

Mostly it only returns "n_threads = CPU(cpu)->nr_threads" because until
the guest kernel initialization cpu->compat_pvr is false so that
MIN() macro is not executed.

So, until late, QEMU thinks its guest will have 8 threads/core. During
the guest kernel init., that fixup code calls ppc_compat_max_threads
that will now have cpu->compat_pvr true and will change the number
of threads to 4. Example:

qemu-system-ppc64 -smp sockets=1,cores=1,threads=8
 +-> qemu_init_vcpu, spapr_populate_cpu_dt: 8 threads/core
 +-> guest kernel is running and asks about CPUs, spapr_fixup_cpu_dt()
     runs, sets threads to 4, set ibm,ppc-interrupt-server#s and done.
 +-> guest now believes it only has 4 threads.

> 
> > +
> > +        /* set smt to maximum for this current pvr if the number
> > +         * passed is higher than defined by PVR compat mode AND
> > +         * if KVM cannot emulate it.*/
> > +        int compat_smt = smp_threads;
> > +        if ((kvmppc_cap_smt_possible() & smp_threads) != smp_threads &&
> > +                smp_threads > ppc_compat_max_threads(cpu)) {
> > +            compat_smt = ppc_compat_max_threads(cpu);
> > +
> > +            trace_spapr_fixup_cpu_smt(index, smp_threads,
> > +                    kvmppc_cap_smt_possible(),
> > +                    ppc_compat_max_threads(cpu));
> > +        }
> 
> ... so I'm wondering if the above shouldn't be performed in
> ppc_compat_max_threads() directly ? 

Hmm, now I'm believe that the whole code could rely on that
kvmppc_cap_smt_possible() since it will always return the number of
threads supported by the underlying HW. We could have a check in the
very beginning:

if ((kvmppc_cap_smt_possible() & smp_threads) != smp_threads) {
    // explain the user that such setup is wrong and quit.
}

and that part in fixup code could be unecessary.

> 
> >  
> >          if ((index % smt) != 0) {
> >              continue;
> > diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events
> > index b7c3e64b5e..a8e29d7ab1 100644
> > --- a/hw/ppc/trace-events
> > +++ b/hw/ppc/trace-events
> > @@ -16,6 +16,7 @@ spapr_irq_alloc(int irq) "irq %d"
> >  spapr_irq_alloc_block(int first, int num, bool lsi, int align) "first irq %d, %d irqs, lsi=%d, alignnum %d"
> >  spapr_irq_free(int src, int irq, int num) "Source#%d, first irq %d, %d irqs"
> >  spapr_irq_free_warn(int src, int irq) "Source#%d, irq %d is already free"
> > +spapr_fixup_cpu_smt(int idx, int smpt, int kvmt, int pvrt) "CPU(%d): expected smt %d, kvm support %d, max smt pvr %d"
> >  
> >  # hw/ppc/spapr_hcall.c
> >  spapr_cas_pvr_try(uint32_t pvr) "0x%x"
> > diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
> > index 518dd06e98..aac5667bf4 100644
> > --- a/target/ppc/kvm.c
> > +++ b/target/ppc/kvm.c
> > @@ -2456,6 +2456,11 @@ bool kvmppc_has_cap_mmu_hash_v3(void)
> >      return cap_mmu_hash_v3;
> >  }
> >  
> > +int kvmppc_cap_smt_possible(void)
> > +{
> > +    return cap_ppc_smt_possible;
> > +}
> > +
> >  PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void)
> >  {
> >      uint32_t host_pvr = mfpvr();
> > diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
> > index ecb55493cc..6ac33d2b4a 100644
> > --- a/target/ppc/kvm_ppc.h
> > +++ b/target/ppc/kvm_ppc.h
> > @@ -59,6 +59,7 @@ bool kvmppc_has_cap_fixup_hcalls(void);
> >  bool kvmppc_has_cap_htm(void);
> >  bool kvmppc_has_cap_mmu_radix(void);
> >  bool kvmppc_has_cap_mmu_hash_v3(void);
> > +int kvmppc_cap_smt_possible(void);
> >  int kvmppc_enable_hwrng(void);
> >  int kvmppc_put_books_sregs(PowerPCCPU *cpu);
> >  PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void);
> > @@ -290,6 +291,11 @@ static inline bool kvmppc_has_cap_mmu_hash_v3(void)
> >      return false;
> >  }
> >  
> > +static inline int kvmppc_cap_smt_possible(void)
> > +{
> > +    return -1;
> 
> When CONFIG_KVM is set, the semantics of kvmppc_cap_smt_possible() is:
> - a bitmap with supported SMT modes if KVM has KVM_CAP_PPC_SMT_POSSIBLE
> - 0 if KVM doesn't have KVM_CAP_PPC_SMT_POSSIBLE or we're running in
>   TCG mode
> 
> so it looks a bit weird to return -1 when CONFIG_KVM isn't set (when
> running in TCG mode, we would get different values depending on how
> the QEMU binary was compiled).
> 
> Shouldn't this stub return 0 instead ?

YES! it *must* be otherwise TCG would accept any smt mode,
I'll change it.

Thanks :-)

> 
> Cheers,
> 
> --
> Greg
> 
> > +}
> > +
> >  static inline int kvmppc_enable_hwrng(void)
> >  {
> >      return -1;
>
Laurent Vivier Jan. 11, 2018, 11:29 a.m. UTC | #3
On 06/01/2018 01:47, Jose Ricardo Ziviani wrote:
> Power9 supports 4 HW threads/core but it's possible to emulate
> doorbells to implement virtual SMT. KVM has the KVM_CAP_PPC_SMT_POSSIBLE
> which returns a bitmap with all SMT modes supported by the host.
> 
> Today, QEMU forces the SMT mode based on PVR compat table, this is
> silently done in spapr_fixup_cpu_dt. Then, if user passes thread=8 the
> guest will end up with 4 threads/core without any feedback to the user.
> It is confusing and will crash QEMU if a cpu is hotplugged in that
> guest.
> 
> This patch makes use of KVM_CAP_PPC_SMT_POSSIBLE to check if the host
> supports the SMT mode so it allows Power9 guests to have 8 threads/core
> if desired.
> 
> Reported-by: Satheesh Rajendran <satheera@in.ibm.com>
> Signed-off-by: Jose Ricardo Ziviani <joserz@linux.vnet.ibm.com>
> ---
>  hw/ppc/spapr.c       | 14 +++++++++++++-
>  hw/ppc/trace-events  |  1 +
>  target/ppc/kvm.c     |  5 +++++
>  target/ppc/kvm_ppc.h |  6 ++++++
>  4 files changed, 25 insertions(+), 1 deletion(-)

According to the tests I have done on P9 and P8, you should also set
vsmt to the max value found in KVM_CAP_PPC_SMT_POSSIBLE to keep the same
vcpu_id on a P8 and P9 with the same configuration "-smp
W,sockets=X,cores=Y,threads=Z".

This is required for migration.

The formula is in spapr_cpu_core_realize():

cpu->vcpu_id = (cc->core_id * spapr->vsmt / smp_threads) + i);

Thanks,
Laurent
Laurent Vivier Jan. 11, 2018, 1:10 p.m. UTC | #4
On 11/01/2018 12:29, Laurent Vivier wrote:
> On 06/01/2018 01:47, Jose Ricardo Ziviani wrote:
>> Power9 supports 4 HW threads/core but it's possible to emulate
>> doorbells to implement virtual SMT. KVM has the KVM_CAP_PPC_SMT_POSSIBLE
>> which returns a bitmap with all SMT modes supported by the host.
>>
>> Today, QEMU forces the SMT mode based on PVR compat table, this is
>> silently done in spapr_fixup_cpu_dt. Then, if user passes thread=8 the
>> guest will end up with 4 threads/core without any feedback to the user.
>> It is confusing and will crash QEMU if a cpu is hotplugged in that
>> guest.
>>
>> This patch makes use of KVM_CAP_PPC_SMT_POSSIBLE to check if the host
>> supports the SMT mode so it allows Power9 guests to have 8 threads/core
>> if desired.
>>
>> Reported-by: Satheesh Rajendran <satheera@in.ibm.com>
>> Signed-off-by: Jose Ricardo Ziviani <joserz@linux.vnet.ibm.com>
>> ---
>>  hw/ppc/spapr.c       | 14 +++++++++++++-
>>  hw/ppc/trace-events  |  1 +
>>  target/ppc/kvm.c     |  5 +++++
>>  target/ppc/kvm_ppc.h |  6 ++++++
>>  4 files changed, 25 insertions(+), 1 deletion(-)
> 
> According to the tests I have done on P9 and P8, you should also set
> vsmt to the max value found in KVM_CAP_PPC_SMT_POSSIBLE to keep the same
> vcpu_id on a P8 and P9 with the same configuration "-smp
> W,sockets=X,cores=Y,threads=Z".
> 
> This is required for migration.
> 
> The formula is in spapr_cpu_core_realize():
> 
> cpu->vcpu_id = (cc->core_id * spapr->vsmt / smp_threads) + i);
> 

I think spapr->vsmt should be set to something like
"ppc_compat_max_threads(cpu) & kvmppc_cap_smt_possible()" to set vsmt to
the maximum value for the compat mode but it doesn't seem easy to do as
"cpu" is not available in spapr_set_vsmt_mode().

Thanks,
Laurent
David Gibson Jan. 11, 2018, 2:16 p.m. UTC | #5
On Fri, Jan 05, 2018 at 10:47:22PM -0200, Jose Ricardo Ziviani wrote:
> Power9 supports 4 HW threads/core but it's possible to emulate
> doorbells to implement virtual SMT. KVM has the KVM_CAP_PPC_SMT_POSSIBLE
> which returns a bitmap with all SMT modes supported by the host.
> 
> Today, QEMU forces the SMT mode based on PVR compat table, this is
> silently done in spapr_fixup_cpu_dt. Then, if user passes thread=8 the
> guest will end up with 4 threads/core without any feedback to the user.
> It is confusing and will crash QEMU if a cpu is hotplugged in that
> guest.
> 
> This patch makes use of KVM_CAP_PPC_SMT_POSSIBLE to check if the host
> supports the SMT mode so it allows Power9 guests to have 8 threads/core
> if desired.
> 
> Reported-by: Satheesh Rajendran <satheera@in.ibm.com>
> Signed-off-by: Jose Ricardo Ziviani <joserz@linux.vnet.ibm.com>
> ---
>  hw/ppc/spapr.c       | 14 +++++++++++++-
>  hw/ppc/trace-events  |  1 +
>  target/ppc/kvm.c     |  5 +++++
>  target/ppc/kvm_ppc.h |  6 ++++++
>  4 files changed, 25 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index d1acfe8858..ea2503cd2f 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -345,7 +345,19 @@ static int spapr_fixup_cpu_dt(void *fdt, sPAPRMachineState *spapr)
>          PowerPCCPU *cpu = POWERPC_CPU(cs);
>          DeviceClass *dc = DEVICE_GET_CLASS(cs);
>          int index = spapr_vcpu_id(cpu);
> -        int compat_smt = MIN(smp_threads, ppc_compat_max_threads(cpu));
> +
> +        /* set smt to maximum for this current pvr if the number
> +         * passed is higher than defined by PVR compat mode AND
> +         * if KVM cannot emulate it.*/
> +        int compat_smt = smp_threads;
> +        if ((kvmppc_cap_smt_possible() & smp_threads) != smp_threads &&
> +                smp_threads > ppc_compat_max_threads(cpu)) {
> +            compat_smt = ppc_compat_max_threads(cpu);

I don't think this is the right approach.  We've been trying to remove
places where host properties (such as those read from KVM
capabilities) affect guest visible properties of the VM - like vsmt.
Places like that break migration and often libvirt expectations as
well.

This is putting one back in, and so a step in the wrong direction.
Greg Kurz Jan. 11, 2018, 3 p.m. UTC | #6
On Wed, 10 Jan 2018 22:14:37 -0200
joserz@linux.vnet.ibm.com wrote:

> On Tue, Jan 09, 2018 at 01:48:13PM +0100, Greg Kurz wrote:
> > On Fri,  5 Jan 2018 22:47:22 -0200
> > Jose Ricardo Ziviani <joserz@linux.vnet.ibm.com> wrote:
> >   
> > > Power9 supports 4 HW threads/core but it's possible to emulate
> > > doorbells to implement virtual SMT. KVM has the KVM_CAP_PPC_SMT_POSSIBLE
> > > which returns a bitmap with all SMT modes supported by the host.
> > > 
> > > Today, QEMU forces the SMT mode based on PVR compat table, this is
> > > silently done in spapr_fixup_cpu_dt. Then, if user passes thread=8 the
> > > guest will end up with 4 threads/core without any feedback to the user.
> > > It is confusing and will crash QEMU if a cpu is hotplugged in that
> > > guest.
> > > 
> > > This patch makes use of KVM_CAP_PPC_SMT_POSSIBLE to check if the host
> > > supports the SMT mode so it allows Power9 guests to have 8 threads/core
> > > if desired.
> > > 
> > > Reported-by: Satheesh Rajendran <satheera@in.ibm.com>
> > > Signed-off-by: Jose Ricardo Ziviani <joserz@linux.vnet.ibm.com>
> > > ---  
> > 
> > Hi,
> > 
> > I agree with the general idea but I have a few questions.  
> 
> Hello!!!! Thank you for your detailed review. :)
> 
> I'm copying David too because I've seen other bugs related with (v)smt
> topic (specially migration) that it could address.
> 

David was already in the Cc list, as expected for all ppc patches :)

> > 
> > The MIN(smp_threads, ppc_compat_max_threads(cpu)) computation is
> > performed in spapr_fixup_cpu_dt() at CAS, but it is also performed
> > in spapr_populate_cpu_dt() at machine reset or when a CPU is added.
> > 
> > Shouldn't your patch address the latter as well ?  
> 
> As far as I investigated, I found out that ppc_compat_max_threads() is
> called several times, but it always returns the number of threads from
> the argument line. Only in spapr_fixup_cpu_dt(), that happens during
> the guest kernel initialization when it's realizing the CPUS, is that

I guess you mean 'onlining the CPUS' ?

> ppc_compat_max_threads() will return that MIN(n_threads, compat->max_threads).
> 
> Until them, if(cpu->compat_pvr) is zeroed and QEMU doesn't know the
> max_threads yet.
> 

Indeed, compat mode is negotiated during CAS which happens before secondary
cpus onlining... but ppc_compat_max_threads() is also called when we hot-plug
cpus, at a time QEMU should know about compat->max_threads.

Speaking of that, it looks like we have another bug in this case: hot-plugged
cpus are exposed with a real PVR in the DT...

[root@localhost ~]# dtc -f -I fs -O dts /proc/device-tree | grep cpu-version
                        cpu-version = <0x4e1200>;
                        cpu-version = <0xf000005>;
                        cpu-version = <0xf000005>;

Should ppc_set_compat() should be called before we populate the FDT nodes of
hot-plugged cpus, instead of of calling it from rtas_start_cpu() ?

> That's the reason that I added the code only in spapr_fixup_cpu_dt()
> because this is where the change really happens.
> 
> >   
> > >  hw/ppc/spapr.c       | 14 +++++++++++++-
> > >  hw/ppc/trace-events  |  1 +
> > >  target/ppc/kvm.c     |  5 +++++
> > >  target/ppc/kvm_ppc.h |  6 ++++++
> > >  4 files changed, 25 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > > index d1acfe8858..ea2503cd2f 100644
> > > --- a/hw/ppc/spapr.c
> > > +++ b/hw/ppc/spapr.c
> > > @@ -345,7 +345,19 @@ static int spapr_fixup_cpu_dt(void *fdt, sPAPRMachineState *spapr)
> > >          PowerPCCPU *cpu = POWERPC_CPU(cs);
> > >          DeviceClass *dc = DEVICE_GET_CLASS(cs);
> > >          int index = spapr_vcpu_id(cpu);
> > > -        int compat_smt = MIN(smp_threads, ppc_compat_max_threads(cpu));  
> > 
> > Considering that we have:
> > 
> > int ppc_compat_max_threads(PowerPCCPU *cpu)
> > {
> >     const CompatInfo *compat = compat_by_pvr(cpu->compat_pvr);
> >     int n_threads = CPU(cpu)->nr_threads;
> > 
> >     if (cpu->compat_pvr) {
> >         g_assert(compat);
> >         n_threads = MIN(n_threads, compat->max_threads);
> >     }
> > 
> >     return n_threads;
> > }
> > 
> > and
> > 
> > void qemu_init_vcpu(CPUState *cpu)
> > {
> >     cpu->nr_cores = smp_cores;
> >     cpu->nr_threads = smp_threads;
> > ...
> > }
> > 
> > ppc_compat_max_threads() already returns the smaller value of
> > smp_threads and the maximum number of HW threads for the PVR.
> > 
> > I don't quite understand why we had this compat_smt calculation
> > in the first place...  
> 
> Mostly it only returns "n_threads = CPU(cpu)->nr_threads" because until
> the guest kernel initialization cpu->compat_pvr is false so that
> MIN() macro is not executed.
> 

I'm referring to the computation in spapr_fixup_cpu_dt(), which basically
does:

compat_smt = MIN(smp_threads, MIN(smp_threads, compat->max_threads));

> So, until late, QEMU thinks its guest will have 8 threads/core. During
> the guest kernel init., that fixup code calls ppc_compat_max_threads
> that will now have cpu->compat_pvr true and will change the number
> of threads to 4. Example:
> 
> qemu-system-ppc64 -smp sockets=1,cores=1,threads=8
>  +-> qemu_init_vcpu, spapr_populate_cpu_dt: 8 threads/core
>  +-> guest kernel is running and asks about CPUs, spapr_fixup_cpu_dt()
>      runs, sets threads to 4, set ibm,ppc-interrupt-server#s and done.
>  +-> guest now believes it only has 4 threads.
> 
> >   
> > > +
> > > +        /* set smt to maximum for this current pvr if the number
> > > +         * passed is higher than defined by PVR compat mode AND
> > > +         * if KVM cannot emulate it.*/
> > > +        int compat_smt = smp_threads;
> > > +        if ((kvmppc_cap_smt_possible() & smp_threads) != smp_threads &&
> > > +                smp_threads > ppc_compat_max_threads(cpu)) {
> > > +            compat_smt = ppc_compat_max_threads(cpu);
> > > +
> > > +            trace_spapr_fixup_cpu_smt(index, smp_threads,
> > > +                    kvmppc_cap_smt_possible(),
> > > +                    ppc_compat_max_threads(cpu));
> > > +        }  
> > 
> > ... so I'm wondering if the above shouldn't be performed in
> > ppc_compat_max_threads() directly ?   
> 
> Hmm, now I'm believe that the whole code could rely on that
> kvmppc_cap_smt_possible() since it will always return the number of
> threads supported by the underlying HW. We could have a check in the

Not always, it may return 0 with older KVM or TCG...

> very beginning:
> 
> if ((kvmppc_cap_smt_possible() & smp_threads) != smp_threads) {

... in which case, the check ^^ would be wrong.

>     // explain the user that such setup is wrong and quit.
> }
> 
> and that part in fixup code could be unecessary.
> 
> >   
> > >  
> > >          if ((index % smt) != 0) {
> > >              continue;
> > > diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events
> > > index b7c3e64b5e..a8e29d7ab1 100644
> > > --- a/hw/ppc/trace-events
> > > +++ b/hw/ppc/trace-events
> > > @@ -16,6 +16,7 @@ spapr_irq_alloc(int irq) "irq %d"
> > >  spapr_irq_alloc_block(int first, int num, bool lsi, int align) "first irq %d, %d irqs, lsi=%d, alignnum %d"
> > >  spapr_irq_free(int src, int irq, int num) "Source#%d, first irq %d, %d irqs"
> > >  spapr_irq_free_warn(int src, int irq) "Source#%d, irq %d is already free"
> > > +spapr_fixup_cpu_smt(int idx, int smpt, int kvmt, int pvrt) "CPU(%d): expected smt %d, kvm support %d, max smt pvr %d"
> > >  
> > >  # hw/ppc/spapr_hcall.c
> > >  spapr_cas_pvr_try(uint32_t pvr) "0x%x"
> > > diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
> > > index 518dd06e98..aac5667bf4 100644
> > > --- a/target/ppc/kvm.c
> > > +++ b/target/ppc/kvm.c
> > > @@ -2456,6 +2456,11 @@ bool kvmppc_has_cap_mmu_hash_v3(void)
> > >      return cap_mmu_hash_v3;
> > >  }
> > >  
> > > +int kvmppc_cap_smt_possible(void)
> > > +{
> > > +    return cap_ppc_smt_possible;
> > > +}
> > > +
> > >  PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void)
> > >  {
> > >      uint32_t host_pvr = mfpvr();
> > > diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
> > > index ecb55493cc..6ac33d2b4a 100644
> > > --- a/target/ppc/kvm_ppc.h
> > > +++ b/target/ppc/kvm_ppc.h
> > > @@ -59,6 +59,7 @@ bool kvmppc_has_cap_fixup_hcalls(void);
> > >  bool kvmppc_has_cap_htm(void);
> > >  bool kvmppc_has_cap_mmu_radix(void);
> > >  bool kvmppc_has_cap_mmu_hash_v3(void);
> > > +int kvmppc_cap_smt_possible(void);
> > >  int kvmppc_enable_hwrng(void);
> > >  int kvmppc_put_books_sregs(PowerPCCPU *cpu);
> > >  PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void);
> > > @@ -290,6 +291,11 @@ static inline bool kvmppc_has_cap_mmu_hash_v3(void)
> > >      return false;
> > >  }
> > >  
> > > +static inline int kvmppc_cap_smt_possible(void)
> > > +{
> > > +    return -1;  
> > 
> > When CONFIG_KVM is set, the semantics of kvmppc_cap_smt_possible() is:
> > - a bitmap with supported SMT modes if KVM has KVM_CAP_PPC_SMT_POSSIBLE
> > - 0 if KVM doesn't have KVM_CAP_PPC_SMT_POSSIBLE or we're running in
> >   TCG mode
> > 
> > so it looks a bit weird to return -1 when CONFIG_KVM isn't set (when
> > running in TCG mode, we would get different values depending on how
> > the QEMU binary was compiled).
> > 
> > Shouldn't this stub return 0 instead ?  
> 
> YES! it *must* be otherwise TCG would accept any smt mode,
> I'll change it.
> 
> Thanks :-)
> 
> > 
> > Cheers,
> > 
> > --
> > Greg
> >   
> > > +}
> > > +
> > >  static inline int kvmppc_enable_hwrng(void)
> > >  {
> > >      return -1;  
> >   
>
David Gibson Jan. 12, 2018, 3:46 a.m. UTC | #7
On Fri, Jan 12, 2018 at 01:16:22AM +1100, David Gibson wrote:
> On Fri, Jan 05, 2018 at 10:47:22PM -0200, Jose Ricardo Ziviani wrote:
> > Power9 supports 4 HW threads/core but it's possible to emulate
> > doorbells to implement virtual SMT. KVM has the KVM_CAP_PPC_SMT_POSSIBLE
> > which returns a bitmap with all SMT modes supported by the host.
> > 
> > Today, QEMU forces the SMT mode based on PVR compat table, this is
> > silently done in spapr_fixup_cpu_dt. Then, if user passes thread=8 the
> > guest will end up with 4 threads/core without any feedback to the user.
> > It is confusing and will crash QEMU if a cpu is hotplugged in that
> > guest.
> > 
> > This patch makes use of KVM_CAP_PPC_SMT_POSSIBLE to check if the host
> > supports the SMT mode so it allows Power9 guests to have 8 threads/core
> > if desired.
> > 
> > Reported-by: Satheesh Rajendran <satheera@in.ibm.com>
> > Signed-off-by: Jose Ricardo Ziviani <joserz@linux.vnet.ibm.com>
> > ---
> >  hw/ppc/spapr.c       | 14 +++++++++++++-
> >  hw/ppc/trace-events  |  1 +
> >  target/ppc/kvm.c     |  5 +++++
> >  target/ppc/kvm_ppc.h |  6 ++++++
> >  4 files changed, 25 insertions(+), 1 deletion(-)
> > 
> > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > index d1acfe8858..ea2503cd2f 100644
> > --- a/hw/ppc/spapr.c
> > +++ b/hw/ppc/spapr.c
> > @@ -345,7 +345,19 @@ static int spapr_fixup_cpu_dt(void *fdt, sPAPRMachineState *spapr)
> >          PowerPCCPU *cpu = POWERPC_CPU(cs);
> >          DeviceClass *dc = DEVICE_GET_CLASS(cs);
> >          int index = spapr_vcpu_id(cpu);
> > -        int compat_smt = MIN(smp_threads, ppc_compat_max_threads(cpu));
> > +
> > +        /* set smt to maximum for this current pvr if the number
> > +         * passed is higher than defined by PVR compat mode AND
> > +         * if KVM cannot emulate it.*/
> > +        int compat_smt = smp_threads;
> > +        if ((kvmppc_cap_smt_possible() & smp_threads) != smp_threads &&
> > +                smp_threads > ppc_compat_max_threads(cpu)) {
> > +            compat_smt = ppc_compat_max_threads(cpu);
> 
> I don't think this is the right approach.  We've been trying to remove
> places where host properties (such as those read from KVM
> capabilities) affect guest visible properties of the VM - like vsmt.
> Places like that break migration and often libvirt expectations as
> well.
> 
> This is putting one back in, and so a step in the wrong direction.

Following on from this with some more investigation.

I think the discussion is going off the rails because the reason for
this compat threads limit has been forgotten.

*It has nothing to do with the host*.  It's been recoded a bunch, but
the logic was originally introduced by 2a48d993 "spapr: Limit threads
per core according to current compatibility mode".  It states that it
was to avoid confusing the *guest* by presenting more threads than it
expects on an compatible-with-old CPU.

This also explains why it's done in this otherwise weird way - just
truncating the presented threads in the device tree, while the other
threads still "exist" in the qemu and KVM model:

It's because this happens at CAS time, for the benefit of guests which
don't understand newer CPUs, at which point it's really too late to
report an error to the user or renumber the CPUs.

So I think what we actually want to do here is just *remove* the
compat limit for POWER9 cpus.  Strictly it's to do with the host
kernel rather than the cpu, but in practice we can count on POWER9
guests not being confused by >4 threads (since they POWER8 compat
guests running on POWER9 have to anyway).


As a separate matter, we need to validate guest threads-per-core
against what the host's capable of, but that doesn't belong here.
Jose Ricardo Ziviani Jan. 12, 2018, 9:24 a.m. UTC | #8
On Fri, Jan 12, 2018 at 02:46:19PM +1100, David Gibson wrote:
> On Fri, Jan 12, 2018 at 01:16:22AM +1100, David Gibson wrote:
> > On Fri, Jan 05, 2018 at 10:47:22PM -0200, Jose Ricardo Ziviani wrote:
> > > Power9 supports 4 HW threads/core but it's possible to emulate
> > > doorbells to implement virtual SMT. KVM has the KVM_CAP_PPC_SMT_POSSIBLE
> > > which returns a bitmap with all SMT modes supported by the host.
> > > 
> > > Today, QEMU forces the SMT mode based on PVR compat table, this is
> > > silently done in spapr_fixup_cpu_dt. Then, if user passes thread=8 the
> > > guest will end up with 4 threads/core without any feedback to the user.
> > > It is confusing and will crash QEMU if a cpu is hotplugged in that
> > > guest.
> > > 
> > > This patch makes use of KVM_CAP_PPC_SMT_POSSIBLE to check if the host
> > > supports the SMT mode so it allows Power9 guests to have 8 threads/core
> > > if desired.
> > > 
> > > Reported-by: Satheesh Rajendran <satheera@in.ibm.com>
> > > Signed-off-by: Jose Ricardo Ziviani <joserz@linux.vnet.ibm.com>
> > > ---
> > >  hw/ppc/spapr.c       | 14 +++++++++++++-
> > >  hw/ppc/trace-events  |  1 +
> > >  target/ppc/kvm.c     |  5 +++++
> > >  target/ppc/kvm_ppc.h |  6 ++++++
> > >  4 files changed, 25 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > > index d1acfe8858..ea2503cd2f 100644
> > > --- a/hw/ppc/spapr.c
> > > +++ b/hw/ppc/spapr.c
> > > @@ -345,7 +345,19 @@ static int spapr_fixup_cpu_dt(void *fdt, sPAPRMachineState *spapr)
> > >          PowerPCCPU *cpu = POWERPC_CPU(cs);
> > >          DeviceClass *dc = DEVICE_GET_CLASS(cs);
> > >          int index = spapr_vcpu_id(cpu);
> > > -        int compat_smt = MIN(smp_threads, ppc_compat_max_threads(cpu));
> > > +
> > > +        /* set smt to maximum for this current pvr if the number
> > > +         * passed is higher than defined by PVR compat mode AND
> > > +         * if KVM cannot emulate it.*/
> > > +        int compat_smt = smp_threads;
> > > +        if ((kvmppc_cap_smt_possible() & smp_threads) != smp_threads &&
> > > +                smp_threads > ppc_compat_max_threads(cpu)) {
> > > +            compat_smt = ppc_compat_max_threads(cpu);
> > 
> > I don't think this is the right approach.  We've been trying to remove
> > places where host properties (such as those read from KVM
> > capabilities) affect guest visible properties of the VM - like vsmt.
> > Places like that break migration and often libvirt expectations as
> > well.
> > 
> > This is putting one back in, and so a step in the wrong direction.
> 
> Following on from this with some more investigation.
> 
> I think the discussion is going off the rails because the reason for
> this compat threads limit has been forgotten.
> 
> *It has nothing to do with the host*.  It's been recoded a bunch, but
> the logic was originally introduced by 2a48d993 "spapr: Limit threads
> per core according to current compatibility mode".  It states that it
> was to avoid confusing the *guest* by presenting more threads than it
> expects on an compatible-with-old CPU.
> 
> This also explains why it's done in this otherwise weird way - just
> truncating the presented threads in the device tree, while the other
> threads still "exist" in the qemu and KVM model:
> 
> It's because this happens at CAS time, for the benefit of guests which
> don't understand newer CPUs, at which point it's really too late to
> report an error to the user or renumber the CPUs.
> 
> So I think what we actually want to do here is just *remove* the
> compat limit for POWER9 cpus.  Strictly it's to do with the host
> kernel rather than the cpu, but in practice we can count on POWER9
> guests not being confused by >4 threads (since they POWER8 compat
> guests running on POWER9 have to anyway).
> 
> 
> As a separate matter, we need to validate guest threads-per-core
> against what the host's capable of, but that doesn't belong here.

Awesome explanation

I'm going to make these changes, perform a lot of more tests and send
the patches.

Thank you

> 
> -- 
> David Gibson			| I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
> 				| _way_ _around_!
> http://www.ozlabs.org/~dgibson
diff mbox series

Patch

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index d1acfe8858..ea2503cd2f 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -345,7 +345,19 @@  static int spapr_fixup_cpu_dt(void *fdt, sPAPRMachineState *spapr)
         PowerPCCPU *cpu = POWERPC_CPU(cs);
         DeviceClass *dc = DEVICE_GET_CLASS(cs);
         int index = spapr_vcpu_id(cpu);
-        int compat_smt = MIN(smp_threads, ppc_compat_max_threads(cpu));
+
+        /* set smt to maximum for this current pvr if the number
+         * passed is higher than defined by PVR compat mode AND
+         * if KVM cannot emulate it.*/
+        int compat_smt = smp_threads;
+        if ((kvmppc_cap_smt_possible() & smp_threads) != smp_threads &&
+                smp_threads > ppc_compat_max_threads(cpu)) {
+            compat_smt = ppc_compat_max_threads(cpu);
+
+            trace_spapr_fixup_cpu_smt(index, smp_threads,
+                    kvmppc_cap_smt_possible(),
+                    ppc_compat_max_threads(cpu));
+        }
 
         if ((index % smt) != 0) {
             continue;
diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events
index b7c3e64b5e..a8e29d7ab1 100644
--- a/hw/ppc/trace-events
+++ b/hw/ppc/trace-events
@@ -16,6 +16,7 @@  spapr_irq_alloc(int irq) "irq %d"
 spapr_irq_alloc_block(int first, int num, bool lsi, int align) "first irq %d, %d irqs, lsi=%d, alignnum %d"
 spapr_irq_free(int src, int irq, int num) "Source#%d, first irq %d, %d irqs"
 spapr_irq_free_warn(int src, int irq) "Source#%d, irq %d is already free"
+spapr_fixup_cpu_smt(int idx, int smpt, int kvmt, int pvrt) "CPU(%d): expected smt %d, kvm support %d, max smt pvr %d"
 
 # hw/ppc/spapr_hcall.c
 spapr_cas_pvr_try(uint32_t pvr) "0x%x"
diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index 518dd06e98..aac5667bf4 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -2456,6 +2456,11 @@  bool kvmppc_has_cap_mmu_hash_v3(void)
     return cap_mmu_hash_v3;
 }
 
+int kvmppc_cap_smt_possible(void)
+{
+    return cap_ppc_smt_possible;
+}
+
 PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void)
 {
     uint32_t host_pvr = mfpvr();
diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
index ecb55493cc..6ac33d2b4a 100644
--- a/target/ppc/kvm_ppc.h
+++ b/target/ppc/kvm_ppc.h
@@ -59,6 +59,7 @@  bool kvmppc_has_cap_fixup_hcalls(void);
 bool kvmppc_has_cap_htm(void);
 bool kvmppc_has_cap_mmu_radix(void);
 bool kvmppc_has_cap_mmu_hash_v3(void);
+int kvmppc_cap_smt_possible(void);
 int kvmppc_enable_hwrng(void);
 int kvmppc_put_books_sregs(PowerPCCPU *cpu);
 PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void);
@@ -290,6 +291,11 @@  static inline bool kvmppc_has_cap_mmu_hash_v3(void)
     return false;
 }
 
+static inline int kvmppc_cap_smt_possible(void)
+{
+    return -1;
+}
+
 static inline int kvmppc_enable_hwrng(void)
 {
     return -1;