diff mbox series

[RFC] spapr: Add SPAPR_CAP_AIL_MODES for supported AIL modes for H_SET_MODE hcall

Message ID 20220129065007.103681-1-npiggin@gmail.com
State New
Headers show
Series [RFC] spapr: Add SPAPR_CAP_AIL_MODES for supported AIL modes for H_SET_MODE hcall | expand

Commit Message

Nicholas Piggin Jan. 29, 2022, 6:50 a.m. UTC
The behaviour of the Address Translation Mode on Interrupt resource is
not consistently supported by all CPU versions or all KVM versions.  In
particular KVM HV only supports mode 0 on POWER7 processors, and does
not support mode 2 on any processors. KVM PR only supports mode 0. TCG
can support all modes (0,2,3).

This leads to inconsistencies in guest behaviour and could cause
problems migrating guests.

This was not too noticable for Linux guests for a long time because the
kernel only used mode 0 or 3, and it used to consider AIL to be somewhat
advisory (KVM would not always honor it either) and it kept both sets of
interrupt vectors around.

Recent Linux guests depend on the AIL mode working as defined by the ISA
to support the SCV facility interrupt. If AIL mode 3 can not be provided,
then Linux must be given an error so it can disable the SCV facility.

Add the ail-modes capability which is a bitmap of the supported values
for the H_SET_MODE Address Translation Mode on Interrupt resource. Add
a new KVM CAP that exports the same thing, and provide defaults for PR
and HV KVM that predate the cap.
---

I just wanted to get some feedback on the approach before submitting a
patch for the KVM cap.

The reason I don't make that a boolean cap for AIL=3 is that future
processors might implement new modes a guest would like to use even
though it's not the nicest interface.

Thanks,
Nick

 hw/ppc/spapr.c            |  1 +
 hw/ppc/spapr_caps.c       | 81 +++++++++++++++++++++++++++++++++++++++
 hw/ppc/spapr_hcall.c      | 16 ++------
 include/hw/ppc/spapr.h    | 10 ++++-
 linux-headers/linux/kvm.h |  1 +
 target/ppc/kvm.c          | 25 ++++++++++++
 target/ppc/kvm_ppc.h      |  6 +++
 7 files changed, 126 insertions(+), 14 deletions(-)

Comments

Fabiano Rosas Jan. 31, 2022, 3:51 p.m. UTC | #1
Nicholas Piggin <npiggin@gmail.com> writes:

> The behaviour of the Address Translation Mode on Interrupt resource is
> not consistently supported by all CPU versions or all KVM versions.  In
> particular KVM HV only supports mode 0 on POWER7 processors, and does
> not support mode 2 on any processors. KVM PR only supports mode 0. TCG
> can support all modes (0,2,3).
>
> This leads to inconsistencies in guest behaviour and could cause
> problems migrating guests.
>
> This was not too noticable for Linux guests for a long time because the
> kernel only used mode 0 or 3, and it used to consider AIL to be somewhat
> advisory (KVM would not always honor it either) and it kept both sets of
> interrupt vectors around.
>
> Recent Linux guests depend on the AIL mode working as defined by the ISA
> to support the SCV facility interrupt. If AIL mode 3 can not be provided,
> then Linux must be given an error so it can disable the SCV facility.
>
> Add the ail-modes capability which is a bitmap of the supported values
> for the H_SET_MODE Address Translation Mode on Interrupt resource. Add
> a new KVM CAP that exports the same thing, and provide defaults for PR
> and HV KVM that predate the cap.
> ---
>
> I just wanted to get some feedback on the approach before submitting a
> patch for the KVM cap.

Could you expand a bit on what is the use case for setting this in the
QEMU cmdline? I looks to me we already have all the information we need
with just the KVM cap.

> The reason I don't make that a boolean cap for AIL=3 is that future
> processors might implement new modes a guest would like to use even
> though it's not the nicest interface.
>
> Thanks,
> Nick
>
>  hw/ppc/spapr.c            |  1 +
>  hw/ppc/spapr_caps.c       | 81 +++++++++++++++++++++++++++++++++++++++
>  hw/ppc/spapr_hcall.c      | 16 ++------
>  include/hw/ppc/spapr.h    | 10 ++++-
>  linux-headers/linux/kvm.h |  1 +
>  target/ppc/kvm.c          | 25 ++++++++++++
>  target/ppc/kvm_ppc.h      |  6 +++
>  7 files changed, 126 insertions(+), 14 deletions(-)
>
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 163c90388a..2a7a510aa7 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -4604,6 +4604,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
>      smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_ON;
>      smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_ON;
>      smc->default_caps.caps[SPAPR_CAP_RPT_INVALIDATE] = SPAPR_CAP_OFF;
> +    smc->default_caps.caps[SPAPR_CAP_AIL_MODES] = SPAPR_CAP_AIL_MODES_DEFAULT;
>      spapr_caps_add_properties(smc);
>      smc->irq = &spapr_irq_dual;
>      smc->dr_phb_enabled = true;
> diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c
> index ed7c077a0d..21cc93af86 100644
> --- a/hw/ppc/spapr_caps.c
> +++ b/hw/ppc/spapr_caps.c
> @@ -94,6 +94,30 @@ static void spapr_cap_set_bool(Object *obj, Visitor *v, const char *name,
>      spapr->eff.caps[cap->index] = value ? SPAPR_CAP_ON : SPAPR_CAP_OFF;
>  }
>  
> +static void spapr_cap_get_uint8(Object *obj, Visitor *v, const char *name,
> +                               void *opaque, Error **errp)
> +{
> +    SpaprCapabilityInfo *cap = opaque;
> +    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
> +    uint8_t value = spapr_get_cap(spapr, cap->index) == SPAPR_CAP_ON;
> +
> +    visit_type_uint8(v, name, &value, errp);
> +}
> +
> +static void spapr_cap_set_uint8(Object *obj, Visitor *v, const char *name,
> +                               void *opaque, Error **errp)
> +{
> +    SpaprCapabilityInfo *cap = opaque;
> +    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
> +    uint8_t value;
> +
> +    if (!visit_type_uint8(v, name, &value, errp)) {
> +        return;
> +    }
> +
> +    spapr->cmd_line_caps[cap->index] = true;
> +    spapr->eff.caps[cap->index] = value;
> +}
>  
>  static void  spapr_cap_get_string(Object *obj, Visitor *v, const char *name,
>                                    void *opaque, Error **errp)
> @@ -613,6 +637,54 @@ static void cap_rpt_invalidate_apply(SpaprMachineState *spapr,
>      }
>  }
>  
> +static void cap_ail_apply(SpaprMachineState *spapr,
> +                                     uint8_t val, Error **errp)
> +{
> +    ERRP_GUARD();
> +
> +    if (!(val & (0x01 << 0))) {
> +        error_setg(errp, "cap-ail-modes mode must include AIL=0");
> +        error_append_hint(errp,
> +                          "Ensure bit 0 (value 1) is set in cap-ail-modes\n");
> +        return;
> +    }
> +
> +    if (val & ~((0x01 << 0) | (0x01 << 1) | (0x01 << 2) | (0x01 << 3))) {
> +        error_setg(errp, "Unknown cap-ail-modes value");
> +        error_append_hint(errp,
> +                          "Ensure only bits between 0 and 3 are set in cap-ail-modes\n");
> +        return;
> +    }
> +
> +    if (val & (0x01 << 1)) {
> +        error_setg(errp, "Unsupported cap-ail-modes mode AIL=1");
> +        error_append_hint(errp,
> +                          "Ensure bit 1 (value 2) is clear in cap-ail-modes\n");
> +        return;
> +    }
> +
> +    if (kvm_enabled()) {
> +        if (val & (0x01 << 2)) {
> +            error_setg(errp, "KVM does not support cap-ail-modes mode AIL=2");

Isn't this something KVM should tell us via the capability?

> +            error_append_hint(errp,
> +                              "Ensure bit 2 (value 4) is clear in cap-ail-modes\n");
> +            if (kvmppc_has_cap_ail_3()) {
> +                error_append_hint(errp, "Try appending -machine cap-ail-modes=9\n");
> +            } else {
> +                error_append_hint(errp, "Try appending -machine cap-ail-modes=1\n");
> +            }
> +            return;
> +        }
> +        if ((val & (0x01 << 3)) && !kvmppc_has_cap_ail_3()) {
> +            error_setg(errp, "KVM implementation does not support cap-ail-modes AIL=3");
> +            error_append_hint(errp,
> +                              "Ensure bit 3 (value 8) is clear in cap-ail-modes\n");
> +            error_append_hint(errp, "Try appending -machine cap-ail-modes=1\n");
> +            return;
> +        }
> +    }
> +}

I think the error reporting here is too complex. A user who just wants
to make their guest start will not bother thinking about binary
representation. There's also some room for confusion in having three
numbers present in the error message (bit #, decimal value and AIL
mode). Imagine dealing with this in a bug report, for instance.

I would just tell outright what the supported values are. Perhaps in a
little table:

Supported AIL modes:
 AIL = 0   | cap-ail-modes=1
 AIL = 2   | cap-ail-modes=5
 AIL = 3   | cap-ail-modes=9
 AIL = 2&3 | cap-ail-modes=13

We could then make the code a bit more generic. Roughly:

---
if (kvm_enabled())
    ail_supported = kvmppc_cap_ail();
else
    ail_supported = ((0x01 << 0) | (0x01 << 2) | (0x01 << 3));

if ((val & ail_supported) != val)
   print_ail_hint(ail_supported);

void print_ail_hint(int ail) {
    printf("Supported AIL modes:\n");

    if (ail & 0x1)
        printf(" AIL = 0   | cap-ail-modes=1\n");
    
    if (ail & 0x5)
        printf(" AIL = 2   | cap-ail-modes=5\n");
    
    if (ail & 0x9)
        printf(" AIL = 3   | cap-ail-modes=9\n");
    
    if (ail & 0xd)
        printf(" AIL = 2&3 | cap-ail-modes=13\n");
}
---

> +
>  SpaprCapabilityInfo capability_table[SPAPR_CAP_NUM] = {
>      [SPAPR_CAP_HTM] = {
>          .name = "htm",
> @@ -730,6 +802,15 @@ SpaprCapabilityInfo capability_table[SPAPR_CAP_NUM] = {
>          .type = "bool",
>          .apply = cap_rpt_invalidate_apply,
>      },
> +    [SPAPR_CAP_AIL_MODES] = {
> +        .name = "ail-modes",
> +        .description = "Bitmap of AIL (alternate interrupt location) mode support",
> +        .index = SPAPR_CAP_AIL_MODES,
> +        .get = spapr_cap_get_uint8,
> +        .set = spapr_cap_set_uint8,
> +        .type = "uint8",
> +        .apply = cap_ail_apply,
> +    },
>  };
>  
>  static SpaprCapabilities default_caps_with_cpu(SpaprMachineState *spapr,
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index 222c1b6bbd..3de4553b35 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -811,15 +811,11 @@ static target_ulong h_set_mode_resource_le(PowerPCCPU *cpu,
>  }
>  
>  static target_ulong h_set_mode_resource_addr_trans_mode(PowerPCCPU *cpu,
> +                                                        SpaprMachineState *spapr,
>                                                          target_ulong mflags,
>                                                          target_ulong value1,
>                                                          target_ulong value2)
>  {
> -    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
> -
> -    if (!(pcc->insns_flags2 & PPC2_ISA207S)) {
> -        return H_P2;
> -    }
>      if (value1) {
>          return H_P3;
>      }
> @@ -827,13 +823,7 @@ static target_ulong h_set_mode_resource_addr_trans_mode(PowerPCCPU *cpu,
>          return H_P4;
>      }
>  
> -    if (mflags == 1) {
> -        /* AIL=1 is reserved in POWER8/POWER9/POWER10 */
> -        return H_UNSUPPORTED_FLAG;
> -    }
> -
> -    if (mflags == 2 && (pcc->insns_flags2 & PPC2_ISA310)) {
> -        /* AIL=2 is reserved in POWER10 (ISA v3.1) */
> +    if (!(spapr_get_cap(spapr, SPAPR_CAP_AIL_MODES) & (0x01 << mflags))) {
>          return H_UNSUPPORTED_FLAG;
>      }
>  
> @@ -853,7 +843,7 @@ static target_ulong h_set_mode(PowerPCCPU *cpu, SpaprMachineState *spapr,
>          ret = h_set_mode_resource_le(cpu, spapr, args[0], args[2], args[3]);
>          break;
>      case H_SET_MODE_RESOURCE_ADDR_TRANS_MODE:
> -        ret = h_set_mode_resource_addr_trans_mode(cpu, args[0],
> +        ret = h_set_mode_resource_addr_trans_mode(cpu, spapr, args[0],
>                                                    args[2], args[3]);
>          break;
>      }
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index ee7504b976..307b89adb7 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -49,6 +49,12 @@ typedef enum {
>      SPAPR_RESIZE_HPT_REQUIRED,
>  } SpaprResizeHpt;
>  
> +/**
> + * This cap is a bitmask of supported modes. Default to always supporting
> + * 0 and 3 (PR KVM and POWER7 and earlier only support 0).
> + */
> +#define SPAPR_CAP_AIL_MODES_DEFAULT     ((0x01 << 0) | (0x01 << 3))
> +
>  /**
>   * Capabilities
>   */
> @@ -77,8 +83,10 @@ typedef enum {
>  #define SPAPR_CAP_FWNMI                 0x0A
>  /* Support H_RPT_INVALIDATE */
>  #define SPAPR_CAP_RPT_INVALIDATE        0x0B
> +/* Support for AIL modes */
> +#define SPAPR_CAP_AIL_MODES             0x0C
>  /* Num Caps */
> -#define SPAPR_CAP_NUM                   (SPAPR_CAP_RPT_INVALIDATE + 1)
> +#define SPAPR_CAP_NUM                   (SPAPR_CAP_AIL_MODES + 1)
>  
>  /*
>   * Capability Values
> diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
> index bcaf66cc4d..5e94ecaa8e 100644
> --- a/linux-headers/linux/kvm.h
> +++ b/linux-headers/linux/kvm.h
> @@ -1112,6 +1112,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_BINARY_STATS_FD 203
>  #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204
>  #define KVM_CAP_ARM_MTE 205
> +#define KVM_CAP_PPC_AIL_MODES 206
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
> index dc93b99189..c6a5a8e6e5 100644
> --- a/target/ppc/kvm.c
> +++ b/target/ppc/kvm.c
> @@ -90,6 +90,7 @@ static int cap_ppc_nested_kvm_hv;
>  static int cap_large_decr;
>  static int cap_fwnmi;
>  static int cap_rpt_invalidate;
> +static int cap_ail_modes;
>  
>  static uint32_t debug_inst_opcode;
>  
> @@ -154,6 +155,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>      }
>  
>      cap_rpt_invalidate = kvm_vm_check_extension(s, KVM_CAP_PPC_RPT_INVALIDATE);
> +    cap_ail_modes = kvm_vm_check_extension(s, KVM_CAP_PPC_AIL_MODES);
>      kvm_ppc_register_host_cpu_type();
>  
>      return 0;
> @@ -2563,6 +2565,29 @@ int kvmppc_has_cap_rpt_invalidate(void)
>      return cap_rpt_invalidate;
>  }
>  
> +int kvmppc_has_cap_ail_3(void)
> +{
> +    if (!cap_ail_modes) {
> +        PowerPCCPUClass *pcc = kvm_ppc_get_host_cpu_class();
> +
> +        /*
> +         * KVM HV hosts which do not support CAP_AIL_MODES capability still
> +         * support AIL=3 on POWER8 and above. Special-case this so as not to
> +         * lose performance on those hosts.
> +         */
> +        if (!(pcc->insns_flags2 & PPC2_ISA207S)) {
> +            return 0;
> +        }
> +
> +        if (kvmppc_is_pr(kvm_state)) {
> +            return 0;
> +        } else {
> +            return 1;
> +        }
> +    }
> +    return !!(cap_ail_modes & (1 << 3));
> +}
> +
>  PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void)
>  {
>      uint32_t host_pvr = mfpvr();
> diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
> index ee9325bf9a..efafa67b83 100644
> --- a/target/ppc/kvm_ppc.h
> +++ b/target/ppc/kvm_ppc.h
> @@ -73,6 +73,7 @@ int kvmppc_set_cap_nested_kvm_hv(int enable);
>  int kvmppc_get_cap_large_decr(void);
>  int kvmppc_enable_cap_large_decr(PowerPCCPU *cpu, int enable);
>  int kvmppc_has_cap_rpt_invalidate(void);
> +int kvmppc_has_cap_ail_3(void);
>  int kvmppc_enable_hwrng(void);
>  int kvmppc_put_books_sregs(PowerPCCPU *cpu);
>  PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void);
> @@ -393,6 +394,11 @@ static inline int kvmppc_has_cap_rpt_invalidate(void)
>      return false;
>  }
>  
> +static inline int kvmppc_has_cap_ail_3(void)
> +{
> +    return false;
> +}
> +
>  static inline int kvmppc_enable_hwrng(void)
>  {
>      return -1;
Daniel Henrique Barboza Jan. 31, 2022, 7:10 p.m. UTC | #2
On 1/29/22 03:50, Nicholas Piggin wrote:
> The behaviour of the Address Translation Mode on Interrupt resource is
> not consistently supported by all CPU versions or all KVM versions.  In
> particular KVM HV only supports mode 0 on POWER7 processors, and does
> not support mode 2 on any processors. KVM PR only supports mode 0. TCG
> can support all modes (0,2,3).
> 
> This leads to inconsistencies in guest behaviour and could cause
> problems migrating guests.
> 
> This was not too noticable for Linux guests for a long time because the
> kernel only used mode 0 or 3, and it used to consider AIL to be somewhat
> advisory (KVM would not always honor it either) and it kept both sets of
> interrupt vectors around.
> 
> Recent Linux guests depend on the AIL mode working as defined by the ISA
> to support the SCV facility interrupt. If AIL mode 3 can not be provided,
> then Linux must be given an error so it can disable the SCV facility.

Is this the scenario where migration failures can occur? I don't understand
what are the migration problems you cited that were possible to happen.

> 
> Add the ail-modes capability which is a bitmap of the supported values
> for the H_SET_MODE Address Translation Mode on Interrupt resource. Add
> a new KVM CAP that exports the same thing, and provide defaults for PR
> and HV KVM that predate the cap.

Why add a new machine cap in this case? Isn't something that the KVM capability
should be able to handle by itself, where we always assume that we should have
the best AIL value possible?

Besides, the way it is coded here, we're adding an user-visible capability that
mimics the exact behavior we want from h_set_mode_resource_addr_trans_mode(),
meaning that only bits 0,1,2 and 3 of cap-ail-modes can be set, but:

- bit 0 must always be set
- bit 1 must always be cleared
- if kvm_enabled():
    * bit 2 must always be cleared
    * bit 3 can be cleared or not depending on kvmppc_has_cap_ail_3(), which translates
to not allowed if running with KVM_PR and allowing it if it we're running with Power8
and newer

i.e. bit 0 is always set, bit 1 is always cleared, bit 2 can be set or not for TCG but
always cleared for KVM, and bit 3 can be set depending on the circunstances.

Note that this would allow an user to set this guest in a Power9/10 machine:

-machine pseries,accel=kvm,cap-ail-modes=1

And the guest will end up having degraded performance because AIL=3 is being disabled.

If we want to avoid this and force AIL=3 to be used in this case, then this capability
would be used just to set or clear AIL=2 when running with TCG.


I believe the chunks in which we check for kvm_pr and allow only AIL=0 are improvements
of h_set_mode_resource_addr_trans_mode(), but other than that I'm afraid that exposing
this cap to users is a bit overkill.


Thanks,


Daniel


> ---
> 
> I just wanted to get some feedback on the approach before submitting a
> patch for the KVM cap.
> 
> The reason I don't make that a boolean cap for AIL=3 is that future
> processors might implement new modes a guest would like to use even
> though it's not the nicest interface.
> 
> Thanks,
> Nick
> 
>   hw/ppc/spapr.c            |  1 +
>   hw/ppc/spapr_caps.c       | 81 +++++++++++++++++++++++++++++++++++++++
>   hw/ppc/spapr_hcall.c      | 16 ++------
>   include/hw/ppc/spapr.h    | 10 ++++-
>   linux-headers/linux/kvm.h |  1 +
>   target/ppc/kvm.c          | 25 ++++++++++++
>   target/ppc/kvm_ppc.h      |  6 +++
>   7 files changed, 126 insertions(+), 14 deletions(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 163c90388a..2a7a510aa7 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -4604,6 +4604,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
>       smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_ON;
>       smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_ON;
>       smc->default_caps.caps[SPAPR_CAP_RPT_INVALIDATE] = SPAPR_CAP_OFF;
> +    smc->default_caps.caps[SPAPR_CAP_AIL_MODES] = SPAPR_CAP_AIL_MODES_DEFAULT;
>       spapr_caps_add_properties(smc);
>       smc->irq = &spapr_irq_dual;
>       smc->dr_phb_enabled = true;
> diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c
> index ed7c077a0d..21cc93af86 100644
> --- a/hw/ppc/spapr_caps.c
> +++ b/hw/ppc/spapr_caps.c
> @@ -94,6 +94,30 @@ static void spapr_cap_set_bool(Object *obj, Visitor *v, const char *name,
>       spapr->eff.caps[cap->index] = value ? SPAPR_CAP_ON : SPAPR_CAP_OFF;
>   }
>   
> +static void spapr_cap_get_uint8(Object *obj, Visitor *v, const char *name,
> +                               void *opaque, Error **errp)
> +{
> +    SpaprCapabilityInfo *cap = opaque;
> +    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
> +    uint8_t value = spapr_get_cap(spapr, cap->index) == SPAPR_CAP_ON;
> +
> +    visit_type_uint8(v, name, &value, errp);
> +}
> +
> +static void spapr_cap_set_uint8(Object *obj, Visitor *v, const char *name,
> +                               void *opaque, Error **errp)
> +{
> +    SpaprCapabilityInfo *cap = opaque;
> +    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
> +    uint8_t value;
> +
> +    if (!visit_type_uint8(v, name, &value, errp)) {
> +        return;
> +    }
> +
> +    spapr->cmd_line_caps[cap->index] = true;
> +    spapr->eff.caps[cap->index] = value;
> +}
>   
>   static void  spapr_cap_get_string(Object *obj, Visitor *v, const char *name,
>                                     void *opaque, Error **errp)
> @@ -613,6 +637,54 @@ static void cap_rpt_invalidate_apply(SpaprMachineState *spapr,
>       }
>   }
>   
> +static void cap_ail_apply(SpaprMachineState *spapr,
> +                                     uint8_t val, Error **errp)
> +{
> +    ERRP_GUARD();
> +
> +    if (!(val & (0x01 << 0))) {
> +        error_setg(errp, "cap-ail-modes mode must include AIL=0");
> +        error_append_hint(errp,
> +                          "Ensure bit 0 (value 1) is set in cap-ail-modes\n");
> +        return;
> +    }
> +
> +    if (val & ~((0x01 << 0) | (0x01 << 1) | (0x01 << 2) | (0x01 << 3))) {
> +        error_setg(errp, "Unknown cap-ail-modes value");
> +        error_append_hint(errp,
> +                          "Ensure only bits between 0 and 3 are set in cap-ail-modes\n");
> +        return;
> +    }
> +
> +    if (val & (0x01 << 1)) {
> +        error_setg(errp, "Unsupported cap-ail-modes mode AIL=1");
> +        error_append_hint(errp,
> +                          "Ensure bit 1 (value 2) is clear in cap-ail-modes\n");
> +        return;
> +    }
> +
> +    if (kvm_enabled()) {
> +        if (val & (0x01 << 2)) {
> +            error_setg(errp, "KVM does not support cap-ail-modes mode AIL=2");
> +            error_append_hint(errp,
> +                              "Ensure bit 2 (value 4) is clear in cap-ail-modes\n");
> +            if (kvmppc_has_cap_ail_3()) {
> +                error_append_hint(errp, "Try appending -machine cap-ail-modes=9\n");
> +            } else {
> +                error_append_hint(errp, "Try appending -machine cap-ail-modes=1\n");
> +            }
> +            return;
> +        }
> +        if ((val & (0x01 << 3)) && !kvmppc_has_cap_ail_3()) {
> +            error_setg(errp, "KVM implementation does not support cap-ail-modes AIL=3");
> +            error_append_hint(errp,
> +                              "Ensure bit 3 (value 8) is clear in cap-ail-modes\n");
> +            error_append_hint(errp, "Try appending -machine cap-ail-modes=1\n");
> +            return;
> +        }
> +    }
> +}
> +
>   SpaprCapabilityInfo capability_table[SPAPR_CAP_NUM] = {
>       [SPAPR_CAP_HTM] = {
>           .name = "htm",
> @@ -730,6 +802,15 @@ SpaprCapabilityInfo capability_table[SPAPR_CAP_NUM] = {
>           .type = "bool",
>           .apply = cap_rpt_invalidate_apply,
>       },
> +    [SPAPR_CAP_AIL_MODES] = {
> +        .name = "ail-modes",
> +        .description = "Bitmap of AIL (alternate interrupt location) mode support",
> +        .index = SPAPR_CAP_AIL_MODES,
> +        .get = spapr_cap_get_uint8,
> +        .set = spapr_cap_set_uint8,
> +        .type = "uint8",
> +        .apply = cap_ail_apply,
> +    },
>   };
>   
>   static SpaprCapabilities default_caps_with_cpu(SpaprMachineState *spapr,
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index 222c1b6bbd..3de4553b35 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -811,15 +811,11 @@ static target_ulong h_set_mode_resource_le(PowerPCCPU *cpu,
>   }
>   
>   static target_ulong h_set_mode_resource_addr_trans_mode(PowerPCCPU *cpu,
> +                                                        SpaprMachineState *spapr,
>                                                           target_ulong mflags,
>                                                           target_ulong value1,
>                                                           target_ulong value2)
>   {
> -    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
> -
> -    if (!(pcc->insns_flags2 & PPC2_ISA207S)) {
> -        return H_P2;
> -    }
>       if (value1) {
>           return H_P3;
>       }
> @@ -827,13 +823,7 @@ static target_ulong h_set_mode_resource_addr_trans_mode(PowerPCCPU *cpu,
>           return H_P4;
>       }
>   
> -    if (mflags == 1) {
> -        /* AIL=1 is reserved in POWER8/POWER9/POWER10 */
> -        return H_UNSUPPORTED_FLAG;
> -    }
> -
> -    if (mflags == 2 && (pcc->insns_flags2 & PPC2_ISA310)) {
> -        /* AIL=2 is reserved in POWER10 (ISA v3.1) */
> +    if (!(spapr_get_cap(spapr, SPAPR_CAP_AIL_MODES) & (0x01 << mflags))) {
>           return H_UNSUPPORTED_FLAG;
>       }
>   
> @@ -853,7 +843,7 @@ static target_ulong h_set_mode(PowerPCCPU *cpu, SpaprMachineState *spapr,
>           ret = h_set_mode_resource_le(cpu, spapr, args[0], args[2], args[3]);
>           break;
>       case H_SET_MODE_RESOURCE_ADDR_TRANS_MODE:
> -        ret = h_set_mode_resource_addr_trans_mode(cpu, args[0],
> +        ret = h_set_mode_resource_addr_trans_mode(cpu, spapr, args[0],
>                                                     args[2], args[3]);
>           break;
>       }
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index ee7504b976..307b89adb7 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -49,6 +49,12 @@ typedef enum {
>       SPAPR_RESIZE_HPT_REQUIRED,
>   } SpaprResizeHpt;
>   
> +/**
> + * This cap is a bitmask of supported modes. Default to always supporting
> + * 0 and 3 (PR KVM and POWER7 and earlier only support 0).
> + */
> +#define SPAPR_CAP_AIL_MODES_DEFAULT     ((0x01 << 0) | (0x01 << 3))
> +
>   /**
>    * Capabilities
>    */
> @@ -77,8 +83,10 @@ typedef enum {
>   #define SPAPR_CAP_FWNMI                 0x0A
>   /* Support H_RPT_INVALIDATE */
>   #define SPAPR_CAP_RPT_INVALIDATE        0x0B
> +/* Support for AIL modes */
> +#define SPAPR_CAP_AIL_MODES             0x0C
>   /* Num Caps */
> -#define SPAPR_CAP_NUM                   (SPAPR_CAP_RPT_INVALIDATE + 1)
> +#define SPAPR_CAP_NUM                   (SPAPR_CAP_AIL_MODES + 1)
>   
>   /*
>    * Capability Values
> diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
> index bcaf66cc4d..5e94ecaa8e 100644
> --- a/linux-headers/linux/kvm.h
> +++ b/linux-headers/linux/kvm.h
> @@ -1112,6 +1112,7 @@ struct kvm_ppc_resize_hpt {
>   #define KVM_CAP_BINARY_STATS_FD 203
>   #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204
>   #define KVM_CAP_ARM_MTE 205
> +#define KVM_CAP_PPC_AIL_MODES 206
>   
>   #ifdef KVM_CAP_IRQ_ROUTING
>   
> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
> index dc93b99189..c6a5a8e6e5 100644
> --- a/target/ppc/kvm.c
> +++ b/target/ppc/kvm.c
> @@ -90,6 +90,7 @@ static int cap_ppc_nested_kvm_hv;
>   static int cap_large_decr;
>   static int cap_fwnmi;
>   static int cap_rpt_invalidate;
> +static int cap_ail_modes;
>   
>   static uint32_t debug_inst_opcode;
>   
> @@ -154,6 +155,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>       }
>   
>       cap_rpt_invalidate = kvm_vm_check_extension(s, KVM_CAP_PPC_RPT_INVALIDATE);
> +    cap_ail_modes = kvm_vm_check_extension(s, KVM_CAP_PPC_AIL_MODES);
>       kvm_ppc_register_host_cpu_type();
>   
>       return 0;
> @@ -2563,6 +2565,29 @@ int kvmppc_has_cap_rpt_invalidate(void)
>       return cap_rpt_invalidate;
>   }
>   
> +int kvmppc_has_cap_ail_3(void)
> +{
> +    if (!cap_ail_modes) {
> +        PowerPCCPUClass *pcc = kvm_ppc_get_host_cpu_class();
> +
> +        /*
> +         * KVM HV hosts which do not support CAP_AIL_MODES capability still
> +         * support AIL=3 on POWER8 and above. Special-case this so as not to
> +         * lose performance on those hosts.
> +         */
> +        if (!(pcc->insns_flags2 & PPC2_ISA207S)) {
> +            return 0;
> +        }
> +
> +        if (kvmppc_is_pr(kvm_state)) {
> +            return 0;
> +        } else {
> +            return 1;
> +        }
> +    }
> +    return !!(cap_ail_modes & (1 << 3));
> +}
> +
>   PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void)
>   {
>       uint32_t host_pvr = mfpvr();
> diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
> index ee9325bf9a..efafa67b83 100644
> --- a/target/ppc/kvm_ppc.h
> +++ b/target/ppc/kvm_ppc.h
> @@ -73,6 +73,7 @@ int kvmppc_set_cap_nested_kvm_hv(int enable);
>   int kvmppc_get_cap_large_decr(void);
>   int kvmppc_enable_cap_large_decr(PowerPCCPU *cpu, int enable);
>   int kvmppc_has_cap_rpt_invalidate(void);
> +int kvmppc_has_cap_ail_3(void);
>   int kvmppc_enable_hwrng(void);
>   int kvmppc_put_books_sregs(PowerPCCPU *cpu);
>   PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void);
> @@ -393,6 +394,11 @@ static inline int kvmppc_has_cap_rpt_invalidate(void)
>       return false;
>   }
>   
> +static inline int kvmppc_has_cap_ail_3(void)
> +{
> +    return false;
> +}
> +
>   static inline int kvmppc_enable_hwrng(void)
>   {
>       return -1;
Nicholas Piggin Feb. 1, 2022, 6:02 a.m. UTC | #3
Excerpts from Fabiano Rosas's message of February 1, 2022 1:51 am:
> Nicholas Piggin <npiggin@gmail.com> writes:
> 
>> The behaviour of the Address Translation Mode on Interrupt resource is
>> not consistently supported by all CPU versions or all KVM versions.  In
>> particular KVM HV only supports mode 0 on POWER7 processors, and does
>> not support mode 2 on any processors. KVM PR only supports mode 0. TCG
>> can support all modes (0,2,3).
>>
>> This leads to inconsistencies in guest behaviour and could cause
>> problems migrating guests.
>>
>> This was not too noticable for Linux guests for a long time because the
>> kernel only used mode 0 or 3, and it used to consider AIL to be somewhat
>> advisory (KVM would not always honor it either) and it kept both sets of
>> interrupt vectors around.
>>
>> Recent Linux guests depend on the AIL mode working as defined by the ISA
>> to support the SCV facility interrupt. If AIL mode 3 can not be provided,
>> then Linux must be given an error so it can disable the SCV facility.
>>
>> Add the ail-modes capability which is a bitmap of the supported values
>> for the H_SET_MODE Address Translation Mode on Interrupt resource. Add
>> a new KVM CAP that exports the same thing, and provide defaults for PR
>> and HV KVM that predate the cap.
>> ---
>>
>> I just wanted to get some feedback on the approach before submitting a
>> patch for the KVM cap.
> 
> Could you expand a bit on what is the use case for setting this in the
> QEMU cmdline? I looks to me we already have all the information we need
> with just the KVM cap.

To be able to match TCG with KVM HV or PR behaviour here.
I guess I'm not sure how much that is actually needed though.

>> +    if (kvm_enabled()) {
>> +        if (val & (0x01 << 2)) {
>> +            error_setg(errp, "KVM does not support cap-ail-modes mode AIL=2");
> 
> Isn't this something KVM should tell us via the capability?

Yeah, might as well do that. I changed some of the interfaces halfway
through and didn't clean this up.

>> +            error_append_hint(errp,
>> +                              "Ensure bit 2 (value 4) is clear in cap-ail-modes\n");
>> +            if (kvmppc_has_cap_ail_3()) {
>> +                error_append_hint(errp, "Try appending -machine cap-ail-modes=9\n");
>> +            } else {
>> +                error_append_hint(errp, "Try appending -machine cap-ail-modes=1\n");
>> +            }
>> +            return;
>> +        }
>> +        if ((val & (0x01 << 3)) && !kvmppc_has_cap_ail_3()) {
>> +            error_setg(errp, "KVM implementation does not support cap-ail-modes AIL=3");
>> +            error_append_hint(errp,
>> +                              "Ensure bit 3 (value 8) is clear in cap-ail-modes\n");
>> +            error_append_hint(errp, "Try appending -machine cap-ail-modes=1\n");
>> +            return;
>> +        }
>> +    }
>> +}
> 
> I think the error reporting here is too complex. A user who just wants
> to make their guest start will not bother thinking about binary
> representation. There's also some room for confusion in having three
> numbers present in the error message (bit #, decimal value and AIL
> mode). Imagine dealing with this in a bug report, for instance.
> 
> I would just tell outright what the supported values are. Perhaps in a
> little table:
> 
> Supported AIL modes:
>  AIL = 0   | cap-ail-modes=1
>  AIL = 2   | cap-ail-modes=5
>  AIL = 3   | cap-ail-modes=9
>  AIL = 2&3 | cap-ail-modes=13
> 
> We could then make the code a bit more generic. Roughly:

Yeah I didn't like the interface either :P

The nicest option I guess is to be able to give it a list

cap-ail-modes=0,2,3

Maybe there's already some parsing to be able to do that. I'll
look a bit harder.

Thanks,
Nick
Nicholas Piggin Feb. 1, 2022, 6:18 a.m. UTC | #4
Excerpts from Daniel Henrique Barboza's message of February 1, 2022 5:10 am:
> 
> 
> On 1/29/22 03:50, Nicholas Piggin wrote:
>> The behaviour of the Address Translation Mode on Interrupt resource is
>> not consistently supported by all CPU versions or all KVM versions.  In
>> particular KVM HV only supports mode 0 on POWER7 processors, and does
>> not support mode 2 on any processors. KVM PR only supports mode 0. TCG
>> can support all modes (0,2,3).
>> 
>> This leads to inconsistencies in guest behaviour and could cause
>> problems migrating guests.
>> 
>> This was not too noticable for Linux guests for a long time because the
>> kernel only used mode 0 or 3, and it used to consider AIL to be somewhat
>> advisory (KVM would not always honor it either) and it kept both sets of
>> interrupt vectors around.
>> 
>> Recent Linux guests depend on the AIL mode working as defined by the ISA
>> to support the SCV facility interrupt. If AIL mode 3 can not be provided,
>> then Linux must be given an error so it can disable the SCV facility.
> 
> Is this the scenario where migration failures can occur? I don't understand
> what are the migration problems you cited that were possible to happen.

Maybe I'm overly concerned and nothing would practically use it (beyond 
testing which we could just hack around). I was thinking of if we 
implemented AIL=2 in KVM HV, or AIL=3 in PR.

> 
>> 
>> Add the ail-modes capability which is a bitmap of the supported values
>> for the H_SET_MODE Address Translation Mode on Interrupt resource. Add
>> a new KVM CAP that exports the same thing, and provide defaults for PR
>> and HV KVM that predate the cap.
> 
> Why add a new machine cap in this case? Isn't something that the KVM capability
> should be able to handle by itself, where we always assume that we should have
> the best AIL value possible?
> 
> Besides, the way it is coded here, we're adding an user-visible capability that
> mimics the exact behavior we want from h_set_mode_resource_addr_trans_mode(),
> meaning that only bits 0,1,2 and 3 of cap-ail-modes can be set, but:
> 
> - bit 0 must always be set
> - bit 1 must always be cleared
> - if kvm_enabled():
>     * bit 2 must always be cleared
>     * bit 3 can be cleared or not depending on kvmppc_has_cap_ail_3(), which translates
> to not allowed if running with KVM_PR and allowing it if it we're running with Power8
> and newer
> 
> i.e. bit 0 is always set, bit 1 is always cleared, bit 2 can be set or not for TCG but
> always cleared for KVM, and bit 3 can be set depending on the circunstances.
> 
> Note that this would allow an user to set this guest in a Power9/10 machine:
> 
> -machine pseries,accel=kvm,cap-ail-modes=1
> 
> And the guest will end up having degraded performance because AIL=3 is being disabled.
> 
> If we want to avoid this and force AIL=3 to be used in this case, then this capability
> would be used just to set or clear AIL=2 when running with TCG.

I was thinking how it could be more flexible with maybe possibly future 
AIL modes and things we don't foresee. In theory AIL=0 could go away
(although unlikely in practice).

> I believe the chunks in which we check for kvm_pr and allow only AIL=0 are improvements
> of h_set_mode_resource_addr_trans_mode(), but other than that I'm afraid that exposing
> this cap to users is a bit overkill.

That said, maybe you are right and it's overkill until a real need comes 
up.

I will split and submit the KVM cap part of it, at least.

Thanks,
Nick
David Gibson Feb. 7, 2022, 1:41 a.m. UTC | #5
On Sat, Jan 29, 2022 at 04:50:07PM +1000, Nicholas Piggin wrote:
> The behaviour of the Address Translation Mode on Interrupt resource is
> not consistently supported by all CPU versions or all KVM versions.  In
> particular KVM HV only supports mode 0 on POWER7 processors, and does
> not support mode 2 on any processors. KVM PR only supports mode 0. TCG
> can support all modes (0,2,3).
> 
> This leads to inconsistencies in guest behaviour and could cause
> problems migrating guests.
> 
> This was not too noticable for Linux guests for a long time because the
> kernel only used mode 0 or 3, and it used to consider AIL to be somewhat
> advisory (KVM would not always honor it either) and it kept both sets of
> interrupt vectors around.
> 
> Recent Linux guests depend on the AIL mode working as defined by the ISA
> to support the SCV facility interrupt. If AIL mode 3 can not be provided,
> then Linux must be given an error so it can disable the SCV facility.
> 
> Add the ail-modes capability which is a bitmap of the supported values
> for the H_SET_MODE Address Translation Mode on Interrupt resource. Add
> a new KVM CAP that exports the same thing, and provide defaults for PR
> and HV KVM that predate the cap.
> ---
> 
> I just wanted to get some feedback on the approach before submitting a
> patch for the KVM cap.
> 
> The reason I don't make that a boolean cap for AIL=3 is that future
> processors might implement new modes a guest would like to use even
> though it's not the nicest interface.

[snip]
>  SpaprCapabilityInfo capability_table[SPAPR_CAP_NUM] = {
>      [SPAPR_CAP_HTM] = {
>          .name = "htm",
> @@ -730,6 +802,15 @@ SpaprCapabilityInfo capability_table[SPAPR_CAP_NUM] = {
>          .type = "bool",
>          .apply = cap_rpt_invalidate_apply,
>      },
> +    [SPAPR_CAP_AIL_MODES] = {
> +        .name = "ail-modes",
> +        .description = "Bitmap of AIL (alternate interrupt location) mode support",

A bitmap doesn't quite work as an spapr cap.  The general caps code
assumes that bigger is always better, or more precisely that migrating
from an instance that has a lower value to one which has a higher
value is "good enough" to be compatible.  That's obviously not the
case for a bitmap.

I think to handle this properly within the limitations of papr caps,
you instead want a separate boolean cap for each supported AIL mode
(or at least for each AIL mode you want to have control over).
David Gibson Feb. 7, 2022, 1:50 a.m. UTC | #6
On Mon, Jan 31, 2022 at 12:51:00PM -0300, Fabiano Rosas wrote:
> Nicholas Piggin <npiggin@gmail.com> writes:
> 
> > The behaviour of the Address Translation Mode on Interrupt resource is
> > not consistently supported by all CPU versions or all KVM versions.  In
> > particular KVM HV only supports mode 0 on POWER7 processors, and does
> > not support mode 2 on any processors. KVM PR only supports mode 0. TCG
> > can support all modes (0,2,3).
> >
> > This leads to inconsistencies in guest behaviour and could cause
> > problems migrating guests.
> >
> > This was not too noticable for Linux guests for a long time because the
> > kernel only used mode 0 or 3, and it used to consider AIL to be somewhat
> > advisory (KVM would not always honor it either) and it kept both sets of
> > interrupt vectors around.
> >
> > Recent Linux guests depend on the AIL mode working as defined by the ISA
> > to support the SCV facility interrupt. If AIL mode 3 can not be provided,
> > then Linux must be given an error so it can disable the SCV facility.
> >
> > Add the ail-modes capability which is a bitmap of the supported values
> > for the H_SET_MODE Address Translation Mode on Interrupt resource. Add
> > a new KVM CAP that exports the same thing, and provide defaults for PR
> > and HV KVM that predate the cap.
> > ---
> >
> > I just wanted to get some feedback on the approach before submitting a
> > patch for the KVM cap.
> 
> Could you expand a bit on what is the use case for setting this in the
> QEMU cmdline? I looks to me we already have all the information we need
> with just the KVM cap.

The problem is that changing guest visible behaviour based on a KVM
cap is a no-no.  It causes grief with migration, because the basic
assumption of qemu's migration model is that identically configured
machines are migration compatible, but that's no longer the case if
the machine's behaviour can be altered by host properties like a KVM
cap.

In this specific case, I can see one approach that *might* avoid the
need for an spapr cap.  That would be to declare that we were always
(at least since the AIL setting hypercall was introduced) "supposed"
to support AIL modes 0 and 3 and the fact that we didn't on certain
hosts was simply a bug.  Then we could start enforcing this and simply
error outrun at all on host where KVM doesn't support those modes.
The trade-off, obviously, is that we'd now fail on certain setups
which previously worked (albeit in a fragile way).

You'd also need to enforce that the guest not select AIL mode 2, of
course.  That's technically an incompatible breaking change, but I
think it's one we could get away with, since the vast majority of
setups in practice never worked with it anyway.  If you did want to
add support for AIL mode 2, then you'd need an spapr cap.
David Gibson Feb. 7, 2022, 1:54 a.m. UTC | #7
On Mon, Jan 31, 2022 at 04:10:34PM -0300, Daniel Henrique Barboza wrote:
> 
> 
> On 1/29/22 03:50, Nicholas Piggin wrote:
> > The behaviour of the Address Translation Mode on Interrupt resource is
> > not consistently supported by all CPU versions or all KVM versions.  In
> > particular KVM HV only supports mode 0 on POWER7 processors, and does
> > not support mode 2 on any processors. KVM PR only supports mode 0. TCG
> > can support all modes (0,2,3).
> > 
> > This leads to inconsistencies in guest behaviour and could cause
> > problems migrating guests.
> > 
> > This was not too noticable for Linux guests for a long time because the
> > kernel only used mode 0 or 3, and it used to consider AIL to be somewhat
> > advisory (KVM would not always honor it either) and it kept both sets of
> > interrupt vectors around.
> > 
> > Recent Linux guests depend on the AIL mode working as defined by the ISA
> > to support the SCV facility interrupt. If AIL mode 3 can not be provided,
> > then Linux must be given an error so it can disable the SCV facility.
> 
> Is this the scenario where migration failures can occur? I don't understand
> what are the migration problems you cited that were possible to
> happen.

The problem case (well, the main one) is migrating from qemu on a
recent KVM running with AIL==3 to qemu on an older KVM (or PR) where
AIL==3 doesn't work properly.

Theoretically, a qemu running with AIL==2 on TCG to a qemu running on
KVM is also a problem, though it's not going to arise in practice,
since AFAIK no guests we care about use AIL==2.

> > Add the ail-modes capability which is a bitmap of the supported values
> > for the H_SET_MODE Address Translation Mode on Interrupt resource. Add
> > a new KVM CAP that exports the same thing, and provide defaults for PR
> > and HV KVM that predate the cap.
> 
> Why add a new machine cap in this case? Isn't something that the KVM capability
> should be able to handle by itself, where we always assume that we should have
> the best AIL value possible?

Because the "best AIL possible" might change across a migration.
Daniel Henrique Barboza Feb. 7, 2022, 11:33 a.m. UTC | #8
On 2/6/22 22:54, David Gibson wrote:
> On Mon, Jan 31, 2022 at 04:10:34PM -0300, Daniel Henrique Barboza wrote:
>>
>>
>> On 1/29/22 03:50, Nicholas Piggin wrote:
>>> The behaviour of the Address Translation Mode on Interrupt resource is
>>> not consistently supported by all CPU versions or all KVM versions.  In
>>> particular KVM HV only supports mode 0 on POWER7 processors, and does
>>> not support mode 2 on any processors. KVM PR only supports mode 0. TCG
>>> can support all modes (0,2,3).
>>>
>>> This leads to inconsistencies in guest behaviour and could cause
>>> problems migrating guests.
>>>
>>> This was not too noticable for Linux guests for a long time because the
>>> kernel only used mode 0 or 3, and it used to consider AIL to be somewhat
>>> advisory (KVM would not always honor it either) and it kept both sets of
>>> interrupt vectors around.
>>>
>>> Recent Linux guests depend on the AIL mode working as defined by the ISA
>>> to support the SCV facility interrupt. If AIL mode 3 can not be provided,
>>> then Linux must be given an error so it can disable the SCV facility.
>>
>> Is this the scenario where migration failures can occur? I don't understand
>> what are the migration problems you cited that were possible to
>> happen.
> 
> The problem case (well, the main one) is migrating from qemu on a
> recent KVM running with AIL==3 to qemu on an older KVM (or PR) where
> AIL==3 doesn't work properly.
> 
> Theoretically, a qemu running with AIL==2 on TCG to a qemu running on
> KVM is also a problem, though it's not going to arise in practice,
> since AFAIK no guests we care about use AIL==2.
> 
>>> Add the ail-modes capability which is a bitmap of the supported values
>>> for the H_SET_MODE Address Translation Mode on Interrupt resource. Add
>>> a new KVM CAP that exports the same thing, and provide defaults for PR
>>> and HV KVM that predate the cap.
>>
>> Why add a new machine cap in this case? Isn't something that the KVM capability
>> should be able to handle by itself, where we always assume that we should have
>> the best AIL value possible?
> 
> Because the "best AIL possible" might change across a migration.

Well, since there's a possible migration breakage scenario I believe we'll need to
consider adding this new cap then.

Given that Nick is using a sensible default and most users won't be affected by this
cap I think it's ok.


Thanks,


Daniel

>
Nicholas Piggin Feb. 14, 2022, 7:54 a.m. UTC | #9
Excerpts from David Gibson's message of February 7, 2022 11:41 am:
> On Sat, Jan 29, 2022 at 04:50:07PM +1000, Nicholas Piggin wrote:
>> The behaviour of the Address Translation Mode on Interrupt resource is
>> not consistently supported by all CPU versions or all KVM versions.  In
>> particular KVM HV only supports mode 0 on POWER7 processors, and does
>> not support mode 2 on any processors. KVM PR only supports mode 0. TCG
>> can support all modes (0,2,3).
>> 
>> This leads to inconsistencies in guest behaviour and could cause
>> problems migrating guests.
>> 
>> This was not too noticable for Linux guests for a long time because the
>> kernel only used mode 0 or 3, and it used to consider AIL to be somewhat
>> advisory (KVM would not always honor it either) and it kept both sets of
>> interrupt vectors around.
>> 
>> Recent Linux guests depend on the AIL mode working as defined by the ISA
>> to support the SCV facility interrupt. If AIL mode 3 can not be provided,
>> then Linux must be given an error so it can disable the SCV facility.
>> 
>> Add the ail-modes capability which is a bitmap of the supported values
>> for the H_SET_MODE Address Translation Mode on Interrupt resource. Add
>> a new KVM CAP that exports the same thing, and provide defaults for PR
>> and HV KVM that predate the cap.
>> ---
>> 
>> I just wanted to get some feedback on the approach before submitting a
>> patch for the KVM cap.
>> 
>> The reason I don't make that a boolean cap for AIL=3 is that future
>> processors might implement new modes a guest would like to use even
>> though it's not the nicest interface.
> 
> [snip]
>>  SpaprCapabilityInfo capability_table[SPAPR_CAP_NUM] = {
>>      [SPAPR_CAP_HTM] = {
>>          .name = "htm",
>> @@ -730,6 +802,15 @@ SpaprCapabilityInfo capability_table[SPAPR_CAP_NUM] = {
>>          .type = "bool",
>>          .apply = cap_rpt_invalidate_apply,
>>      },
>> +    [SPAPR_CAP_AIL_MODES] = {
>> +        .name = "ail-modes",
>> +        .description = "Bitmap of AIL (alternate interrupt location) mode support",
> 
> A bitmap doesn't quite work as an spapr cap.  The general caps code
> assumes that bigger is always better, or more precisely that migrating
> from an instance that has a lower value to one which has a higher
> value is "good enough" to be compatible.  That's obviously not the
> case for a bitmap.

Yeah, it was clearly a nasty wart :P

> I think to handle this properly within the limitations of papr caps,
> you instead want a separate boolean cap for each supported AIL mode
> (or at least for each AIL mode you want to have control over).

Oh that's a good idea. We could just do ail-mode-3 for now.

2 is supported by previous processors, but is now deprecated. I don't 
think AIX ever used it although it (or something else) may have. Even
KVM never really implemented it correctly. Although I think TCG does.

So in theory we could be causing a regression if we leave out mode 2,
although it should be easy to re-add (we can leave the support in TCG
for a while and it's not much work anyway).

I'll try with just 3 as the optional cap. Should make it a lot cleaner.

Thanks,
Nick
diff mbox series

Patch

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 163c90388a..2a7a510aa7 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -4604,6 +4604,7 @@  static void spapr_machine_class_init(ObjectClass *oc, void *data)
     smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_ON;
     smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_ON;
     smc->default_caps.caps[SPAPR_CAP_RPT_INVALIDATE] = SPAPR_CAP_OFF;
+    smc->default_caps.caps[SPAPR_CAP_AIL_MODES] = SPAPR_CAP_AIL_MODES_DEFAULT;
     spapr_caps_add_properties(smc);
     smc->irq = &spapr_irq_dual;
     smc->dr_phb_enabled = true;
diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c
index ed7c077a0d..21cc93af86 100644
--- a/hw/ppc/spapr_caps.c
+++ b/hw/ppc/spapr_caps.c
@@ -94,6 +94,30 @@  static void spapr_cap_set_bool(Object *obj, Visitor *v, const char *name,
     spapr->eff.caps[cap->index] = value ? SPAPR_CAP_ON : SPAPR_CAP_OFF;
 }
 
+static void spapr_cap_get_uint8(Object *obj, Visitor *v, const char *name,
+                               void *opaque, Error **errp)
+{
+    SpaprCapabilityInfo *cap = opaque;
+    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
+    uint8_t value = spapr_get_cap(spapr, cap->index) == SPAPR_CAP_ON;
+
+    visit_type_uint8(v, name, &value, errp);
+}
+
+static void spapr_cap_set_uint8(Object *obj, Visitor *v, const char *name,
+                               void *opaque, Error **errp)
+{
+    SpaprCapabilityInfo *cap = opaque;
+    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
+    uint8_t value;
+
+    if (!visit_type_uint8(v, name, &value, errp)) {
+        return;
+    }
+
+    spapr->cmd_line_caps[cap->index] = true;
+    spapr->eff.caps[cap->index] = value;
+}
 
 static void  spapr_cap_get_string(Object *obj, Visitor *v, const char *name,
                                   void *opaque, Error **errp)
@@ -613,6 +637,54 @@  static void cap_rpt_invalidate_apply(SpaprMachineState *spapr,
     }
 }
 
+static void cap_ail_apply(SpaprMachineState *spapr,
+                                     uint8_t val, Error **errp)
+{
+    ERRP_GUARD();
+
+    if (!(val & (0x01 << 0))) {
+        error_setg(errp, "cap-ail-modes mode must include AIL=0");
+        error_append_hint(errp,
+                          "Ensure bit 0 (value 1) is set in cap-ail-modes\n");
+        return;
+    }
+
+    if (val & ~((0x01 << 0) | (0x01 << 1) | (0x01 << 2) | (0x01 << 3))) {
+        error_setg(errp, "Unknown cap-ail-modes value");
+        error_append_hint(errp,
+                          "Ensure only bits between 0 and 3 are set in cap-ail-modes\n");
+        return;
+    }
+
+    if (val & (0x01 << 1)) {
+        error_setg(errp, "Unsupported cap-ail-modes mode AIL=1");
+        error_append_hint(errp,
+                          "Ensure bit 1 (value 2) is clear in cap-ail-modes\n");
+        return;
+    }
+
+    if (kvm_enabled()) {
+        if (val & (0x01 << 2)) {
+            error_setg(errp, "KVM does not support cap-ail-modes mode AIL=2");
+            error_append_hint(errp,
+                              "Ensure bit 2 (value 4) is clear in cap-ail-modes\n");
+            if (kvmppc_has_cap_ail_3()) {
+                error_append_hint(errp, "Try appending -machine cap-ail-modes=9\n");
+            } else {
+                error_append_hint(errp, "Try appending -machine cap-ail-modes=1\n");
+            }
+            return;
+        }
+        if ((val & (0x01 << 3)) && !kvmppc_has_cap_ail_3()) {
+            error_setg(errp, "KVM implementation does not support cap-ail-modes AIL=3");
+            error_append_hint(errp,
+                              "Ensure bit 3 (value 8) is clear in cap-ail-modes\n");
+            error_append_hint(errp, "Try appending -machine cap-ail-modes=1\n");
+            return;
+        }
+    }
+}
+
 SpaprCapabilityInfo capability_table[SPAPR_CAP_NUM] = {
     [SPAPR_CAP_HTM] = {
         .name = "htm",
@@ -730,6 +802,15 @@  SpaprCapabilityInfo capability_table[SPAPR_CAP_NUM] = {
         .type = "bool",
         .apply = cap_rpt_invalidate_apply,
     },
+    [SPAPR_CAP_AIL_MODES] = {
+        .name = "ail-modes",
+        .description = "Bitmap of AIL (alternate interrupt location) mode support",
+        .index = SPAPR_CAP_AIL_MODES,
+        .get = spapr_cap_get_uint8,
+        .set = spapr_cap_set_uint8,
+        .type = "uint8",
+        .apply = cap_ail_apply,
+    },
 };
 
 static SpaprCapabilities default_caps_with_cpu(SpaprMachineState *spapr,
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index 222c1b6bbd..3de4553b35 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -811,15 +811,11 @@  static target_ulong h_set_mode_resource_le(PowerPCCPU *cpu,
 }
 
 static target_ulong h_set_mode_resource_addr_trans_mode(PowerPCCPU *cpu,
+                                                        SpaprMachineState *spapr,
                                                         target_ulong mflags,
                                                         target_ulong value1,
                                                         target_ulong value2)
 {
-    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
-
-    if (!(pcc->insns_flags2 & PPC2_ISA207S)) {
-        return H_P2;
-    }
     if (value1) {
         return H_P3;
     }
@@ -827,13 +823,7 @@  static target_ulong h_set_mode_resource_addr_trans_mode(PowerPCCPU *cpu,
         return H_P4;
     }
 
-    if (mflags == 1) {
-        /* AIL=1 is reserved in POWER8/POWER9/POWER10 */
-        return H_UNSUPPORTED_FLAG;
-    }
-
-    if (mflags == 2 && (pcc->insns_flags2 & PPC2_ISA310)) {
-        /* AIL=2 is reserved in POWER10 (ISA v3.1) */
+    if (!(spapr_get_cap(spapr, SPAPR_CAP_AIL_MODES) & (0x01 << mflags))) {
         return H_UNSUPPORTED_FLAG;
     }
 
@@ -853,7 +843,7 @@  static target_ulong h_set_mode(PowerPCCPU *cpu, SpaprMachineState *spapr,
         ret = h_set_mode_resource_le(cpu, spapr, args[0], args[2], args[3]);
         break;
     case H_SET_MODE_RESOURCE_ADDR_TRANS_MODE:
-        ret = h_set_mode_resource_addr_trans_mode(cpu, args[0],
+        ret = h_set_mode_resource_addr_trans_mode(cpu, spapr, args[0],
                                                   args[2], args[3]);
         break;
     }
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index ee7504b976..307b89adb7 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -49,6 +49,12 @@  typedef enum {
     SPAPR_RESIZE_HPT_REQUIRED,
 } SpaprResizeHpt;
 
+/**
+ * This cap is a bitmask of supported modes. Default to always supporting
+ * 0 and 3 (PR KVM and POWER7 and earlier only support 0).
+ */
+#define SPAPR_CAP_AIL_MODES_DEFAULT     ((0x01 << 0) | (0x01 << 3))
+
 /**
  * Capabilities
  */
@@ -77,8 +83,10 @@  typedef enum {
 #define SPAPR_CAP_FWNMI                 0x0A
 /* Support H_RPT_INVALIDATE */
 #define SPAPR_CAP_RPT_INVALIDATE        0x0B
+/* Support for AIL modes */
+#define SPAPR_CAP_AIL_MODES             0x0C
 /* Num Caps */
-#define SPAPR_CAP_NUM                   (SPAPR_CAP_RPT_INVALIDATE + 1)
+#define SPAPR_CAP_NUM                   (SPAPR_CAP_AIL_MODES + 1)
 
 /*
  * Capability Values
diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index bcaf66cc4d..5e94ecaa8e 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -1112,6 +1112,7 @@  struct kvm_ppc_resize_hpt {
 #define KVM_CAP_BINARY_STATS_FD 203
 #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204
 #define KVM_CAP_ARM_MTE 205
+#define KVM_CAP_PPC_AIL_MODES 206
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index dc93b99189..c6a5a8e6e5 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -90,6 +90,7 @@  static int cap_ppc_nested_kvm_hv;
 static int cap_large_decr;
 static int cap_fwnmi;
 static int cap_rpt_invalidate;
+static int cap_ail_modes;
 
 static uint32_t debug_inst_opcode;
 
@@ -154,6 +155,7 @@  int kvm_arch_init(MachineState *ms, KVMState *s)
     }
 
     cap_rpt_invalidate = kvm_vm_check_extension(s, KVM_CAP_PPC_RPT_INVALIDATE);
+    cap_ail_modes = kvm_vm_check_extension(s, KVM_CAP_PPC_AIL_MODES);
     kvm_ppc_register_host_cpu_type();
 
     return 0;
@@ -2563,6 +2565,29 @@  int kvmppc_has_cap_rpt_invalidate(void)
     return cap_rpt_invalidate;
 }
 
+int kvmppc_has_cap_ail_3(void)
+{
+    if (!cap_ail_modes) {
+        PowerPCCPUClass *pcc = kvm_ppc_get_host_cpu_class();
+
+        /*
+         * KVM HV hosts which do not support CAP_AIL_MODES capability still
+         * support AIL=3 on POWER8 and above. Special-case this so as not to
+         * lose performance on those hosts.
+         */
+        if (!(pcc->insns_flags2 & PPC2_ISA207S)) {
+            return 0;
+        }
+
+        if (kvmppc_is_pr(kvm_state)) {
+            return 0;
+        } else {
+            return 1;
+        }
+    }
+    return !!(cap_ail_modes & (1 << 3));
+}
+
 PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void)
 {
     uint32_t host_pvr = mfpvr();
diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
index ee9325bf9a..efafa67b83 100644
--- a/target/ppc/kvm_ppc.h
+++ b/target/ppc/kvm_ppc.h
@@ -73,6 +73,7 @@  int kvmppc_set_cap_nested_kvm_hv(int enable);
 int kvmppc_get_cap_large_decr(void);
 int kvmppc_enable_cap_large_decr(PowerPCCPU *cpu, int enable);
 int kvmppc_has_cap_rpt_invalidate(void);
+int kvmppc_has_cap_ail_3(void);
 int kvmppc_enable_hwrng(void);
 int kvmppc_put_books_sregs(PowerPCCPU *cpu);
 PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void);
@@ -393,6 +394,11 @@  static inline int kvmppc_has_cap_rpt_invalidate(void)
     return false;
 }
 
+static inline int kvmppc_has_cap_ail_3(void)
+{
+    return false;
+}
+
 static inline int kvmppc_enable_hwrng(void)
 {
     return -1;