Patchwork powerpc iommu: enable multiple TCE requests

login
register
mail settings
Submitter Alexey Kardashevskiy
Date Aug. 7, 2013, 8:08 a.m.
Message ID <1375862885-12132-1-git-send-email-aik@ozlabs.ru>
Download mbox | patch
Permalink /patch/265388/
State New
Headers show

Comments

Alexey Kardashevskiy - Aug. 7, 2013, 8:08 a.m.
Currently only single TCE entry per requiest is supported (H_PUT_TCE).
However PAPR+ specification allows multiple entry requests such as
H_PUT_TCE_INDIRECT and H_STUFFF_TCE. Having less transitions to the host
kernel via ioctls, support of these calls can accelerate IOMMU operations.

This also removes some leftovers in debug output of the H_PUT_TCE handler.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---

This patch requires a KVM_CAP_SPAPR_MULTITCE capability from kernel headers
which is not there yet.
However it still would be nice to have "Reviewed-by" from someone when
the capability will make it to the upstream. Thanks.

---
 hw/ppc/spapr.c       | 16 ++++++++++--
 hw/ppc/spapr_iommu.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++---
 target-ppc/kvm.c     |  7 +++++
 target-ppc/kvm_ppc.h |  7 +++++
 4 files changed, 99 insertions(+), 5 deletions(-)
Alexey Kardashevskiy - Aug. 16, 2013, 9:49 a.m.
On 08/07/2013 06:08 PM, Alexey Kardashevskiy wrote:
> Currently only single TCE entry per requiest is supported (H_PUT_TCE).
> However PAPR+ specification allows multiple entry requests such as
> H_PUT_TCE_INDIRECT and H_STUFFF_TCE. Having less transitions to the host
> kernel via ioctls, support of these calls can accelerate IOMMU operations.
> 
> This also removes some leftovers in debug output of the H_PUT_TCE handler.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> ---
> 
> This patch requires a KVM_CAP_SPAPR_MULTITCE capability from kernel headers
> which is not there yet.
> However it still would be nice to have "Reviewed-by" from someone when
> the capability will make it to the upstream. Thanks.


Ping, anyone?

Is it good as it is (but I am sure it is not)?

Thank you all.


> 
> ---
>  hw/ppc/spapr.c       | 16 ++++++++++--
>  hw/ppc/spapr_iommu.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++---
>  target-ppc/kvm.c     |  7 +++++
>  target-ppc/kvm_ppc.h |  7 +++++
>  4 files changed, 99 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 9494915..a6b1f54 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -301,6 +301,8 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
>      CPUState *cs;
>      uint32_t start_prop = cpu_to_be32(initrd_base);
>      uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
> +    char hypertas_propm[] = "hcall-pft\0hcall-term\0hcall-dabr\0hcall-interrupt"
> +        "\0hcall-tce\0hcall-vio\0hcall-splpar\0hcall-bulk\0hcall-multi-tce";
>      char hypertas_prop[] = "hcall-pft\0hcall-term\0hcall-dabr\0hcall-interrupt"
>          "\0hcall-tce\0hcall-vio\0hcall-splpar\0hcall-bulk";
>      char qemu_hypertas_prop[] = "hcall-memop1";
> @@ -480,8 +482,18 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
>      /* RTAS */
>      _FDT((fdt_begin_node(fdt, "rtas")));
>  
> -    _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop,
> -                       sizeof(hypertas_prop))));
> +    /* In TCG mode, the multitce functions, which we implement are a
> +     * win.  With KVM, we could fall back to the qemu implementation
> +     * when KVM doesn't support them, but that would be much slower
> +     * than just using the KVM implementations of the single TCE
> +     * hypercalls. */
> +    if (kvmppc_spapr_use_multitce()) {
> +        _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_propm,
> +                           sizeof(hypertas_propm))));
> +    } else {
> +        _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop,
> +                           sizeof(hypertas_prop))));
> +    }
>      _FDT((fdt_property(fdt, "qemu,hypertas-functions", qemu_hypertas_prop,
>                         sizeof(qemu_hypertas_prop))));
>  
> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
> index 3d4a1fc..22b09be 100644
> --- a/hw/ppc/spapr_iommu.c
> +++ b/hw/ppc/spapr_iommu.c
> @@ -244,6 +244,71 @@ static target_ulong put_tce_emu(sPAPRTCETable *tcet, target_ulong ioba,
>      return H_SUCCESS;
>  }
>  
> +static target_ulong h_put_tce_indirect(PowerPCCPU *cpu,
> +                                       sPAPREnvironment *spapr,
> +                                       target_ulong opcode, target_ulong *args)
> +{
> +    int i;
> +    target_ulong liobn = args[0];
> +    target_ulong ioba = args[1];
> +    target_ulong tce_list = args[2];
> +    target_ulong npages = args[3];
> +    target_ulong ret = 0;
> +    sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
> +
> +    if (tcet) {
> +        for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) {
> +            target_ulong tce = ldq_phys((tce_list & ~SPAPR_TCE_PAGE_MASK) +
> +                                        i * sizeof(target_ulong));
> +            ret = put_tce_emu(tcet, ioba, tce);
> +            if (ret) {
> +                break;
> +            }
> +        }
> +        return ret;
> +    }
> +#ifdef DEBUG_TCE
> +    fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
> +            "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx
> +            " ret = " TARGET_FMT_ld "\n",
> +            __func__, liobn, ioba, tce_list, ret);
> +#endif
> +
> +    return H_PARAMETER;
> +}
> +
> +static target_ulong h_stuff_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
> +                              target_ulong opcode, target_ulong *args)
> +{
> +    int i;
> +    target_ulong liobn = args[0];
> +    target_ulong ioba = args[1];
> +    target_ulong tce_value = args[2];
> +    target_ulong npages = args[3];
> +    target_ulong ret = 0;
> +    sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
> +
> +    ioba &= ~(SPAPR_TCE_PAGE_SIZE - 1);
> +
> +    if (tcet) {
> +        for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) {
> +            ret = put_tce_emu(tcet, ioba, tce_value);
> +            if (ret) {
> +                break;
> +            }
> +        }
> +        return ret;
> +    }
> +#ifdef DEBUG_TCE
> +    fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
> +            "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx
> +            " ret = " TARGET_FMT_ld "\n",
> +            __func__, liobn, ioba, tce_value, ret);
> +#endif
> +
> +    return H_PARAMETER;
> +}
> +
>  static target_ulong h_put_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>                                target_ulong opcode, target_ulong *args)
>  {
> @@ -258,9 +323,10 @@ static target_ulong h_put_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>          return put_tce_emu(tcet, ioba, tce);
>      }
>  #ifdef DEBUG_TCE
> -    fprintf(stderr, "%s on liobn=" TARGET_FMT_lx /*%s*/
> -            "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx "\n",
> -            __func__, liobn, /*dev->qdev.id, */ioba, tce);
> +    fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
> +            "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx
> +            " ret = " TARGET_FMT_ld "\n",
> +            __func__, liobn, ioba, tce, ret);
>  #endif
>  
>      return H_PARAMETER;
> @@ -318,6 +384,8 @@ static void spapr_tce_table_class_init(ObjectClass *klass, void *data)
>  
>      /* hcall-tce */
>      spapr_register_hypercall(H_PUT_TCE, h_put_tce);
> +    spapr_register_hypercall(H_PUT_TCE_INDIRECT, h_put_tce_indirect);
> +    spapr_register_hypercall(H_STUFF_TCE, h_stuff_tce);
>  }
>  
>  static TypeInfo spapr_tce_table_info = {
> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
> index 8afa7eb..97ac670 100644
> --- a/target-ppc/kvm.c
> +++ b/target-ppc/kvm.c
> @@ -60,6 +60,7 @@ static int cap_booke_sregs;
>  static int cap_ppc_smt;
>  static int cap_ppc_rma;
>  static int cap_spapr_tce;
> +static int cap_spapr_multitce;
>  static int cap_hior;
>  static int cap_one_reg;
>  static int cap_epr;
> @@ -96,6 +97,7 @@ int kvm_arch_init(KVMState *s)
>      cap_ppc_smt = kvm_check_extension(s, KVM_CAP_PPC_SMT);
>      cap_ppc_rma = kvm_check_extension(s, KVM_CAP_PPC_RMA);
>      cap_spapr_tce = kvm_check_extension(s, KVM_CAP_SPAPR_TCE);
> +    cap_spapr_multitce = kvm_check_extension(s, KVM_CAP_SPAPR_MULTITCE);
>      cap_one_reg = kvm_check_extension(s, KVM_CAP_ONE_REG);
>      cap_hior = kvm_check_extension(s, KVM_CAP_PPC_HIOR);
>      cap_epr = kvm_check_extension(s, KVM_CAP_PPC_EPR);
> @@ -1603,6 +1605,11 @@ uint64_t kvmppc_rma_size(uint64_t current_size, unsigned int hash_shift)
>  }
>  #endif
>  
> +bool kvmppc_spapr_use_multitce(void)
> +{
> +    return !kvm_enabled() || cap_spapr_multitce;
> +}
> +
>  void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd)
>  {
>      struct kvm_create_spapr_tce args = {
> diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h
> index 12564ef..a2a903f 100644
> --- a/target-ppc/kvm_ppc.h
> +++ b/target-ppc/kvm_ppc.h
> @@ -31,6 +31,7 @@ int kvmppc_set_tcr(PowerPCCPU *cpu);
>  int kvmppc_booke_watchdog_enable(PowerPCCPU *cpu);
>  #ifndef CONFIG_USER_ONLY
>  off_t kvmppc_alloc_rma(const char *name, MemoryRegion *sysmem);
> +bool kvmppc_spapr_use_multitce(void);
>  void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd);
>  int kvmppc_remove_spapr_tce(void *table, int pfd, uint32_t window_size);
>  int kvmppc_reset_htab(int shift_hint);
> @@ -125,6 +126,12 @@ static inline off_t kvmppc_alloc_rma(const char *name, MemoryRegion *sysmem)
>      return 0;
>  }
>  
> +static inline bool kvmppc_spapr_use_multitce(void)
> +{
> +    /* No KVM, so always use the qemu multitce implementations */
> +    return true;
> +}
> +
>  static inline void *kvmppc_create_spapr_tce(uint32_t liobn,
>                                              uint32_t window_size, int *fd)
>  {
>
Alexander Graf - Aug. 16, 2013, 1:15 p.m.
On 07.08.2013, at 10:08, Alexey Kardashevskiy wrote:

> Currently only single TCE entry per requiest is supported (H_PUT_TCE).
> However PAPR+ specification allows multiple entry requests such as
> H_PUT_TCE_INDIRECT and H_STUFFF_TCE. Having less transitions to the host
> kernel via ioctls, support of these calls can accelerate IOMMU operations.
> 
> This also removes some leftovers in debug output of the H_PUT_TCE handler.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> ---
> 
> This patch requires a KVM_CAP_SPAPR_MULTITCE capability from kernel headers
> which is not there yet.
> However it still would be nice to have "Reviewed-by" from someone when
> the capability will make it to the upstream. Thanks.
> 
> ---
> hw/ppc/spapr.c       | 16 ++++++++++--
> hw/ppc/spapr_iommu.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++---
> target-ppc/kvm.c     |  7 +++++
> target-ppc/kvm_ppc.h |  7 +++++
> 4 files changed, 99 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 9494915..a6b1f54 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -301,6 +301,8 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
>     CPUState *cs;
>     uint32_t start_prop = cpu_to_be32(initrd_base);
>     uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
> +    char hypertas_propm[] = "hcall-pft\0hcall-term\0hcall-dabr\0hcall-interrupt"
> +        "\0hcall-tce\0hcall-vio\0hcall-splpar\0hcall-bulk\0hcall-multi-tce";
>     char hypertas_prop[] = "hcall-pft\0hcall-term\0hcall-dabr\0hcall-interrupt"
>         "\0hcall-tce\0hcall-vio\0hcall-splpar\0hcall-bulk";
>     char qemu_hypertas_prop[] = "hcall-memop1";
> @@ -480,8 +482,18 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
>     /* RTAS */
>     _FDT((fdt_begin_node(fdt, "rtas")));
> 
> -    _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop,
> -                       sizeof(hypertas_prop))));
> +    /* In TCG mode, the multitce functions, which we implement are a
> +     * win.  With KVM, we could fall back to the qemu implementation
> +     * when KVM doesn't support them, but that would be much slower
> +     * than just using the KVM implementations of the single TCE
> +     * hypercalls. */
> +    if (kvmppc_spapr_use_multitce()) {
> +        _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_propm,
> +                           sizeof(hypertas_propm))));
> +    } else {
> +        _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop,
> +                           sizeof(hypertas_prop))));
> +    }
>     _FDT((fdt_property(fdt, "qemu,hypertas-functions", qemu_hypertas_prop,
>                        sizeof(qemu_hypertas_prop))));
> 
> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
> index 3d4a1fc..22b09be 100644
> --- a/hw/ppc/spapr_iommu.c
> +++ b/hw/ppc/spapr_iommu.c
> @@ -244,6 +244,71 @@ static target_ulong put_tce_emu(sPAPRTCETable *tcet, target_ulong ioba,
>     return H_SUCCESS;
> }
> 
> +static target_ulong h_put_tce_indirect(PowerPCCPU *cpu,
> +                                       sPAPREnvironment *spapr,
> +                                       target_ulong opcode, target_ulong *args)
> +{
> +    int i;
> +    target_ulong liobn = args[0];
> +    target_ulong ioba = args[1];
> +    target_ulong tce_list = args[2];
> +    target_ulong npages = args[3];
> +    target_ulong ret = 0;
> +    sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
> +
> +    if (tcet) {
> +        for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) {

i++

> +            target_ulong tce = ldq_phys((tce_list & ~SPAPR_TCE_PAGE_MASK) +

I think it makes sense to do the masking when you assign the variable - makes it easier to read.

> +                                        i * sizeof(target_ulong));
> +            ret = put_tce_emu(tcet, ioba, tce);
> +            if (ret) {
> +                break;
> +            }
> +        }
> +        return ret;
> +    }
> +#ifdef DEBUG_TCE
> +    fprintf(stderr, "%s on liobn=" TARGET_FMT_lx

Could you please convert this into something that doesn't bitrot? Either a DPRINTF style macro that gets format checking done even when unused or a trace point.

> +            "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx
> +            " ret = " TARGET_FMT_ld "\n",
> +            __func__, liobn, ioba, tce_list, ret);
> +#endif
> +
> +    return H_PARAMETER;
> +}
> +
> +static target_ulong h_stuff_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,

Same comments as above in this function.

> +                              target_ulong opcode, target_ulong *args)
> +{
> +    int i;
> +    target_ulong liobn = args[0];
> +    target_ulong ioba = args[1];
> +    target_ulong tce_value = args[2];
> +    target_ulong npages = args[3];
> +    target_ulong ret = 0;
> +    sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
> +
> +    ioba &= ~(SPAPR_TCE_PAGE_SIZE - 1);

Heh - here you actually do the mask separately. This is good.

> +
> +    if (tcet) {
> +        for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) {
> +            ret = put_tce_emu(tcet, ioba, tce_value);
> +            if (ret) {
> +                break;
> +            }
> +        }
> +        return ret;
> +    }
> +#ifdef DEBUG_TCE
> +    fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
> +            "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx
> +            " ret = " TARGET_FMT_ld "\n",
> +            __func__, liobn, ioba, tce_value, ret);
> +#endif
> +
> +    return H_PARAMETER;

Hrm. These 2 functions look very similar. Does it make sense to merge them into one with a bool indirect flag?

> +}
> +
> static target_ulong h_put_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>                               target_ulong opcode, target_ulong *args)
> {
> @@ -258,9 +323,10 @@ static target_ulong h_put_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>         return put_tce_emu(tcet, ioba, tce);
>     }
> #ifdef DEBUG_TCE
> -    fprintf(stderr, "%s on liobn=" TARGET_FMT_lx /*%s*/
> -            "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx "\n",
> -            __func__, liobn, /*dev->qdev.id, */ioba, tce);
> +    fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
> +            "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx
> +            " ret = " TARGET_FMT_ld "\n",
> +            __func__, liobn, ioba, tce, ret);
> #endif
> 
>     return H_PARAMETER;
> @@ -318,6 +384,8 @@ static void spapr_tce_table_class_init(ObjectClass *klass, void *data)
> 
>     /* hcall-tce */
>     spapr_register_hypercall(H_PUT_TCE, h_put_tce);
> +    spapr_register_hypercall(H_PUT_TCE_INDIRECT, h_put_tce_indirect);
> +    spapr_register_hypercall(H_STUFF_TCE, h_stuff_tce);
> }
> 
> static TypeInfo spapr_tce_table_info = {
> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
> index 8afa7eb..97ac670 100644
> --- a/target-ppc/kvm.c
> +++ b/target-ppc/kvm.c
> @@ -60,6 +60,7 @@ static int cap_booke_sregs;
> static int cap_ppc_smt;
> static int cap_ppc_rma;
> static int cap_spapr_tce;
> +static int cap_spapr_multitce;
> static int cap_hior;
> static int cap_one_reg;
> static int cap_epr;
> @@ -96,6 +97,7 @@ int kvm_arch_init(KVMState *s)
>     cap_ppc_smt = kvm_check_extension(s, KVM_CAP_PPC_SMT);
>     cap_ppc_rma = kvm_check_extension(s, KVM_CAP_PPC_RMA);
>     cap_spapr_tce = kvm_check_extension(s, KVM_CAP_SPAPR_TCE);
> +    cap_spapr_multitce = kvm_check_extension(s, KVM_CAP_SPAPR_MULTITCE);

This isn't defined yet, no? Just make it "yes on TCG, no on KVM" for now and add the multitce cap bit later when the kernel patches are in.

Rest looks good.


Alex

>     cap_one_reg = kvm_check_extension(s, KVM_CAP_ONE_REG);
>     cap_hior = kvm_check_extension(s, KVM_CAP_PPC_HIOR);
>     cap_epr = kvm_check_extension(s, KVM_CAP_PPC_EPR);
> @@ -1603,6 +1605,11 @@ uint64_t kvmppc_rma_size(uint64_t current_size, unsigned int hash_shift)
> }
> #endif
> 
> +bool kvmppc_spapr_use_multitce(void)
> +{
> +    return !kvm_enabled() || cap_spapr_multitce;
> +}
> +
> void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd)
> {
>     struct kvm_create_spapr_tce args = {
> diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h
> index 12564ef..a2a903f 100644
> --- a/target-ppc/kvm_ppc.h
> +++ b/target-ppc/kvm_ppc.h
> @@ -31,6 +31,7 @@ int kvmppc_set_tcr(PowerPCCPU *cpu);
> int kvmppc_booke_watchdog_enable(PowerPCCPU *cpu);
> #ifndef CONFIG_USER_ONLY
> off_t kvmppc_alloc_rma(const char *name, MemoryRegion *sysmem);
> +bool kvmppc_spapr_use_multitce(void);
> void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd);
> int kvmppc_remove_spapr_tce(void *table, int pfd, uint32_t window_size);
> int kvmppc_reset_htab(int shift_hint);
> @@ -125,6 +126,12 @@ static inline off_t kvmppc_alloc_rma(const char *name, MemoryRegion *sysmem)
>     return 0;
> }
> 
> +static inline bool kvmppc_spapr_use_multitce(void)
> +{
> +    /* No KVM, so always use the qemu multitce implementations */
> +    return true;
> +}
> +
> static inline void *kvmppc_create_spapr_tce(uint32_t liobn,
>                                             uint32_t window_size, int *fd)
> {
> -- 
> 1.8.3.2
>
Paolo Bonzini - Aug. 18, 2013, 3:22 p.m.
Il 16/08/2013 11:49, Alexey Kardashevskiy ha scritto:
> With KVM, we could fall back to the qemu implementation
>> +     * when KVM doesn't support them, but that would be much slower
>> +     * than just using the KVM implementations of the single TCE
>> +     * hypercalls. */
>> +    if (kvmppc_spapr_use_multitce()) {
>> +        _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_propm,
>> +                           sizeof(hypertas_propm))));
>> +    } else {
>> +        _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop,
>> +                           sizeof(hypertas_prop))));
>> +    }

This prevents migration from newer kernel to older kernel.  Can you
ensure that the fallback to the QEMU implementation works, even though
it is not used in practice?

Paolo
Alexey Kardashevskiy - Aug. 19, 2013, 7:30 a.m.
On 08/19/2013 01:22 AM, Paolo Bonzini wrote:
> Il 16/08/2013 11:49, Alexey Kardashevskiy ha scritto:
>> With KVM, we could fall back to the qemu implementation
>>> +     * when KVM doesn't support them, but that would be much slower
>>> +     * than just using the KVM implementations of the single TCE
>>> +     * hypercalls. */
>>> +    if (kvmppc_spapr_use_multitce()) {
>>> +        _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_propm,
>>> +                           sizeof(hypertas_propm))));
>>> +    } else {
>>> +        _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop,
>>> +                           sizeof(hypertas_prop))));
>>> +    }
> 
> This prevents migration from newer kernel to older kernel.  Can you
> ensure that the fallback to the QEMU implementation works, even though
> it is not used in practice?

How would it break? By having a device tree with "multi-tce" in it and not
having KVM PPC capability for that?

If this is the case, it will not prevent from migration as the "multi-tce"
feature is supported anyway by this patch. The only reason for not
advertising it to the guest is that the host kernel already has
acceleration for H_PUT_TCE (single page map/unmap) and advertising
"multi-tce" without having it in the host kernel (but only in QEMU) would
slow things down (but it still will work).
Alexander Graf - Aug. 19, 2013, 8:01 a.m.
Am 19.08.2013 um 09:30 schrieb Alexey Kardashevskiy <aik@ozlabs.ru>:

> On 08/19/2013 01:22 AM, Paolo Bonzini wrote:
>> Il 16/08/2013 11:49, Alexey Kardashevskiy ha scritto:
>>> With KVM, we could fall back to the qemu implementation
>>>> +     * when KVM doesn't support them, but that would be much slower
>>>> +     * than just using the KVM implementations of the single TCE
>>>> +     * hypercalls. */
>>>> +    if (kvmppc_spapr_use_multitce()) {
>>>> +        _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_propm,
>>>> +                           sizeof(hypertas_propm))));
>>>> +    } else {
>>>> +        _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop,
>>>> +                           sizeof(hypertas_prop))));
>>>> +    }
>> 
>> This prevents migration from newer kernel to older kernel.  Can you
>> ensure that the fallback to the QEMU implementation works, even though
>> it is not used in practice?
> 
> How would it break? By having a device tree with "multi-tce" in it and not
> having KVM PPC capability for that?
> 
> If this is the case, it will not prevent from migration as the "multi-tce"
> feature is supported anyway by this patch. The only reason for not
> advertising it to the guest is that the host kernel already has
> acceleration for H_PUT_TCE (single page map/unmap) and advertising
> "multi-tce" without having it in the host kernel (but only in QEMU) would
> slow things down (but it still will work).

It means that if you use the same QEMU version with the same command line on a different kernel version, your guest looks different because we generate the dtb differently.

The usual way to avoid this is to have a command line option to at least make it possible for a management tool to nail down feature flags regardless of the host configuration.

Considering that IIRC we haven't actually flagged -M pseries as backwards compatible (avoid breaking migration, etc) we can probably get away with enabling multi-tce always and live with the performance penalty on older host kernels.


Alex
Alexey Kardashevskiy - Aug. 19, 2013, 8:44 a.m.
On 08/19/2013 06:01 PM, Alexander Graf wrote:
> 
> 
> Am 19.08.2013 um 09:30 schrieb Alexey Kardashevskiy <aik@ozlabs.ru>:
> 
>> On 08/19/2013 01:22 AM, Paolo Bonzini wrote:
>>> Il 16/08/2013 11:49, Alexey Kardashevskiy ha scritto:
>>>> With KVM, we could fall back to the qemu implementation
>>>>> +     * when KVM doesn't support them, but that would be much slower
>>>>> +     * than just using the KVM implementations of the single TCE
>>>>> +     * hypercalls. */
>>>>> +    if (kvmppc_spapr_use_multitce()) {
>>>>> +        _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_propm,
>>>>> +                           sizeof(hypertas_propm))));
>>>>> +    } else {
>>>>> +        _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop,
>>>>> +                           sizeof(hypertas_prop))));
>>>>> +    }
>>>
>>> This prevents migration from newer kernel to older kernel.  Can you
>>> ensure that the fallback to the QEMU implementation works, even though
>>> it is not used in practice?
>>
>> How would it break? By having a device tree with "multi-tce" in it and not
>> having KVM PPC capability for that?
>>
>> If this is the case, it will not prevent from migration as the "multi-tce"
>> feature is supported anyway by this patch. The only reason for not
>> advertising it to the guest is that the host kernel already has
>> acceleration for H_PUT_TCE (single page map/unmap) and advertising
>> "multi-tce" without having it in the host kernel (but only in QEMU) would
>> slow things down (but it still will work).
> 

> It means that if you use the same QEMU version with the same command
> line on a different kernel version, your guest looks different because
> we generate the dtb differently.

Oh. Sorry for my ignorance again, I am not playing dump or anything like
that - I do not understand how the device tree (which we cook in QEMU) on
the destination can possibly survive migration and not to be overwritten by
the one from the source. What was in the destination RAM before migration
does not matter at all (including dt), QEMU device tree is what matters but
this does not change. As it is "the same QEMU version", hypercalls are
supported anyway, the only difference where they will be handled - in the
host kernel or QEMU. What do I miss?


> The usual way to avoid this is to have a command line option to at least
> make it possible for a management tool to nail down feature flags
> regardless of the host configuration.


> Considering that IIRC we haven't actually flagged -M pseries as
> backwards compatible (avoid breaking migration, etc) we can probably get
> away with enabling multi-tce always and live with the performance
> penalty on older host kernels.

We have H_PUT_TCE accelerated in older kernel for quite a while and we do
not want guests running on older hosts become slower for no good reason,
this is why we added this capability at the first place.
Alexey Kardashevskiy - Aug. 19, 2013, 9:01 a.m.
On 08/16/2013 11:15 PM, Alexander Graf wrote:
> 
> On 07.08.2013, at 10:08, Alexey Kardashevskiy wrote:
> 
>> Currently only single TCE entry per requiest is supported (H_PUT_TCE).
>> However PAPR+ specification allows multiple entry requests such as
>> H_PUT_TCE_INDIRECT and H_STUFFF_TCE. Having less transitions to the host
>> kernel via ioctls, support of these calls can accelerate IOMMU operations.
>>
>> This also removes some leftovers in debug output of the H_PUT_TCE handler.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
>> ---
>>
>> This patch requires a KVM_CAP_SPAPR_MULTITCE capability from kernel headers
>> which is not there yet.
>> However it still would be nice to have "Reviewed-by" from someone when
>> the capability will make it to the upstream. Thanks.
>>
>> ---
>> hw/ppc/spapr.c       | 16 ++++++++++--
>> hw/ppc/spapr_iommu.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++---
>> target-ppc/kvm.c     |  7 +++++
>> target-ppc/kvm_ppc.h |  7 +++++
>> 4 files changed, 99 insertions(+), 5 deletions(-)
>>
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index 9494915..a6b1f54 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -301,6 +301,8 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
>>     CPUState *cs;
>>     uint32_t start_prop = cpu_to_be32(initrd_base);
>>     uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
>> +    char hypertas_propm[] = "hcall-pft\0hcall-term\0hcall-dabr\0hcall-interrupt"
>> +        "\0hcall-tce\0hcall-vio\0hcall-splpar\0hcall-bulk\0hcall-multi-tce";
>>     char hypertas_prop[] = "hcall-pft\0hcall-term\0hcall-dabr\0hcall-interrupt"
>>         "\0hcall-tce\0hcall-vio\0hcall-splpar\0hcall-bulk";
>>     char qemu_hypertas_prop[] = "hcall-memop1";
>> @@ -480,8 +482,18 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
>>     /* RTAS */
>>     _FDT((fdt_begin_node(fdt, "rtas")));
>>
>> -    _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop,
>> -                       sizeof(hypertas_prop))));
>> +    /* In TCG mode, the multitce functions, which we implement are a
>> +     * win.  With KVM, we could fall back to the qemu implementation
>> +     * when KVM doesn't support them, but that would be much slower
>> +     * than just using the KVM implementations of the single TCE
>> +     * hypercalls. */
>> +    if (kvmppc_spapr_use_multitce()) {
>> +        _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_propm,
>> +                           sizeof(hypertas_propm))));
>> +    } else {
>> +        _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop,
>> +                           sizeof(hypertas_prop))));
>> +    }
>>     _FDT((fdt_property(fdt, "qemu,hypertas-functions", qemu_hypertas_prop,
>>                        sizeof(qemu_hypertas_prop))));
>>
>> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
>> index 3d4a1fc..22b09be 100644
>> --- a/hw/ppc/spapr_iommu.c
>> +++ b/hw/ppc/spapr_iommu.c
>> @@ -244,6 +244,71 @@ static target_ulong put_tce_emu(sPAPRTCETable *tcet, target_ulong ioba,
>>     return H_SUCCESS;
>> }
>>
>> +static target_ulong h_put_tce_indirect(PowerPCCPU *cpu,
>> +                                       sPAPREnvironment *spapr,
>> +                                       target_ulong opcode, target_ulong *args)
>> +{
>> +    int i;
>> +    target_ulong liobn = args[0];
>> +    target_ulong ioba = args[1];
>> +    target_ulong tce_list = args[2];
>> +    target_ulong npages = args[3];
>> +    target_ulong ret = 0;
>> +    sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
>> +
>> +    if (tcet) {
>> +        for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) {
> 
> i++
> 
>> +            target_ulong tce = ldq_phys((tce_list & ~SPAPR_TCE_PAGE_MASK) +
> 
> I think it makes sense to do the masking when you assign the variable - makes it easier to read.
> 
>> +                                        i * sizeof(target_ulong));
>> +            ret = put_tce_emu(tcet, ioba, tce);
>> +            if (ret) {
>> +                break;
>> +            }
>> +        }
>> +        return ret;
>> +    }
>> +#ifdef DEBUG_TCE
>> +    fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
> 
> Could you please convert this into something that doesn't bitrot? Either a DPRINTF style macro that gets format checking done even when unused or a trace point.


This file does not have DPRINTF and every time David or I tried to add one,
we were loudly shouted not to do this. So - tracepoints. But the file uses
#ifdef DEBUG_TCE heavily, and there is already a "bitrot" trace in
H_PUT_TCE (the only hcall in the group which is already in the file).

So what do I do now with traces? 2 patches - first converts everything to
tracepoints and second patch which actually adds multi-tce? Or remove old
"bitrot" traces from this patch and repost (I can survive without any debug
code in upstream)? I am fine with both ways.



>> +            "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx
>> +            " ret = " TARGET_FMT_ld "\n",
>> +            __func__, liobn, ioba, tce_list, ret);
>> +#endif
>> +
>> +    return H_PARAMETER;
>> +}
>> +
>> +static target_ulong h_stuff_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
> 
> Same comments as above in this function.
> 
>> +                              target_ulong opcode, target_ulong *args)
>> +{
>> +    int i;
>> +    target_ulong liobn = args[0];
>> +    target_ulong ioba = args[1];
>> +    target_ulong tce_value = args[2];
>> +    target_ulong npages = args[3];
>> +    target_ulong ret = 0;
>> +    sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
>> +
>> +    ioba &= ~(SPAPR_TCE_PAGE_SIZE - 1);
> 
> Heh - here you actually do the mask separately. This is good.
> 
>> +
>> +    if (tcet) {
>> +        for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) {
>> +            ret = put_tce_emu(tcet, ioba, tce_value);
>> +            if (ret) {
>> +                break;
>> +            }
>> +        }
>> +        return ret;
>> +    }
>> +#ifdef DEBUG_TCE
>> +    fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
>> +            "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx
>> +            " ret = " TARGET_FMT_ld "\n",
>> +            __func__, liobn, ioba, tce_value, ret);
>> +#endif
>> +
>> +    return H_PARAMETER;
> 
> Hrm. These 2 functions look very similar. Does it make sense to merge them into one with a bool indirect flag?


It does not for me. Yes, they look pretty similar but they do opposite
things AND args[2] has completely different meaning - value vs. address. I
do not see how the function which accepts both tce_value and tce_list will
still look easy/nice to read. I can change it though if you insist.

The rest I'll fix, thanks for review.


>> +}
>> +
>> static target_ulong h_put_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>>                               target_ulong opcode, target_ulong *args)
>> {
>> @@ -258,9 +323,10 @@ static target_ulong h_put_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>>         return put_tce_emu(tcet, ioba, tce);
>>     }
>> #ifdef DEBUG_TCE
>> -    fprintf(stderr, "%s on liobn=" TARGET_FMT_lx /*%s*/
>> -            "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx "\n",
>> -            __func__, liobn, /*dev->qdev.id, */ioba, tce);
>> +    fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
>> +            "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx
>> +            " ret = " TARGET_FMT_ld "\n",
>> +            __func__, liobn, ioba, tce, ret);
>> #endif
>>
>>     return H_PARAMETER;
>> @@ -318,6 +384,8 @@ static void spapr_tce_table_class_init(ObjectClass *klass, void *data)
>>
>>     /* hcall-tce */
>>     spapr_register_hypercall(H_PUT_TCE, h_put_tce);
>> +    spapr_register_hypercall(H_PUT_TCE_INDIRECT, h_put_tce_indirect);
>> +    spapr_register_hypercall(H_STUFF_TCE, h_stuff_tce);
>> }
>>
>> static TypeInfo spapr_tce_table_info = {
>> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
>> index 8afa7eb..97ac670 100644
>> --- a/target-ppc/kvm.c
>> +++ b/target-ppc/kvm.c
>> @@ -60,6 +60,7 @@ static int cap_booke_sregs;
>> static int cap_ppc_smt;
>> static int cap_ppc_rma;
>> static int cap_spapr_tce;
>> +static int cap_spapr_multitce;
>> static int cap_hior;
>> static int cap_one_reg;
>> static int cap_epr;
>> @@ -96,6 +97,7 @@ int kvm_arch_init(KVMState *s)
>>     cap_ppc_smt = kvm_check_extension(s, KVM_CAP_PPC_SMT);
>>     cap_ppc_rma = kvm_check_extension(s, KVM_CAP_PPC_RMA);
>>     cap_spapr_tce = kvm_check_extension(s, KVM_CAP_SPAPR_TCE);
>> +    cap_spapr_multitce = kvm_check_extension(s, KVM_CAP_SPAPR_MULTITCE);
> 

> This isn't defined yet, no? Just make it "yes on TCG, no on KVM" for now
> and add the multitce cap bit later when the kernel patches are in.

> Rest looks good.
> 
> 
> Alex
> 
>>     cap_one_reg = kvm_check_extension(s, KVM_CAP_ONE_REG);
>>     cap_hior = kvm_check_extension(s, KVM_CAP_PPC_HIOR);
>>     cap_epr = kvm_check_extension(s, KVM_CAP_PPC_EPR);
>> @@ -1603,6 +1605,11 @@ uint64_t kvmppc_rma_size(uint64_t current_size, unsigned int hash_shift)
>> }
>> #endif
>>
>> +bool kvmppc_spapr_use_multitce(void)
>> +{
>> +    return !kvm_enabled() || cap_spapr_multitce;
>> +}
>> +
>> void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd)
>> {
>>     struct kvm_create_spapr_tce args = {
>> diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h
>> index 12564ef..a2a903f 100644
>> --- a/target-ppc/kvm_ppc.h
>> +++ b/target-ppc/kvm_ppc.h
>> @@ -31,6 +31,7 @@ int kvmppc_set_tcr(PowerPCCPU *cpu);
>> int kvmppc_booke_watchdog_enable(PowerPCCPU *cpu);
>> #ifndef CONFIG_USER_ONLY
>> off_t kvmppc_alloc_rma(const char *name, MemoryRegion *sysmem);
>> +bool kvmppc_spapr_use_multitce(void);
>> void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd);
>> int kvmppc_remove_spapr_tce(void *table, int pfd, uint32_t window_size);
>> int kvmppc_reset_htab(int shift_hint);
>> @@ -125,6 +126,12 @@ static inline off_t kvmppc_alloc_rma(const char *name, MemoryRegion *sysmem)
>>     return 0;
>> }
>>
>> +static inline bool kvmppc_spapr_use_multitce(void)
>> +{
>> +    /* No KVM, so always use the qemu multitce implementations */
>> +    return true;
>> +}
>> +
>> static inline void *kvmppc_create_spapr_tce(uint32_t liobn,
>>                                             uint32_t window_size, int *fd)
>> {
>> -- 
>> 1.8.3.2
>>
>
Paolo Bonzini - Aug. 19, 2013, 9:47 a.m.
Il 19/08/2013 10:44, Alexey Kardashevskiy ha scritto:
>> > It means that if you use the same QEMU version with the same command
>> > line on a different kernel version, your guest looks different because
>> > we generate the dtb differently.
> Oh. Sorry for my ignorance again, I am not playing dump or anything like
> that - I do not understand how the device tree (which we cook in QEMU) on
> the destination can possibly survive migration and not to be overwritten by
> the one from the source. What was in the destination RAM before migration
> does not matter at all (including dt), QEMU device tree is what matters but
> this does not change. As it is "the same QEMU version", hypercalls are
> supported anyway, the only difference where they will be handled - in the
> host kernel or QEMU. What do I miss?

Nothing, I just asked to test that handling the hypercall in QEMU works.

On x86 we have a similar problem, though with cpuid bits instead of the
device tree.  An older kernel might not support some cpuid bits, thus
"-cpu SandyBridge" might have different cpuid bits depending on the host
processor and kernel version.  This is handled by having an "enforce"
mode where "-cpu SandyBridge,enforce" will fail to start if the host
processor or the kernel is not new enough.

But in this case, you do not need this because the hypercall works if
emulated by QEMU.  I like Alex's solution of making it universally
available in the dtb.

Paolo
Alexey Kardashevskiy - Aug. 20, 2013, 1:36 a.m.
On 08/19/2013 07:47 PM, Paolo Bonzini wrote:
> Il 19/08/2013 10:44, Alexey Kardashevskiy ha scritto:
>>>> It means that if you use the same QEMU version with the same command
>>>> line on a different kernel version, your guest looks different because
>>>> we generate the dtb differently.
>> Oh. Sorry for my ignorance again, I am not playing dump or anything like
>> that - I do not understand how the device tree (which we cook in QEMU) on
>> the destination can possibly survive migration and not to be overwritten by
>> the one from the source. What was in the destination RAM before migration
>> does not matter at all (including dt), QEMU device tree is what matters but
>> this does not change. As it is "the same QEMU version", hypercalls are
>> supported anyway, the only difference where they will be handled - in the
>> host kernel or QEMU. What do I miss?
> 
> Nothing, I just asked to test that handling the hypercall in QEMU works.

Well, I was asking rather Alex :)

> On x86 we have a similar problem, though with cpuid bits instead of the
> device tree.  An older kernel might not support some cpuid bits, thus
> "-cpu SandyBridge" might have different cpuid bits depending on the host
> processor and kernel version.  This is handled by having an "enforce"
> mode where "-cpu SandyBridge,enforce" will fail to start if the host
> processor or the kernel is not new enough.

Hm. Here we might have a problem like this is we decide to migrate from
QEMU with this patch running on modern kernel to QEMU without this patch
running on old kernel - for this we might want to be able to disable
"multi-tce" via machine options on newer kernels. Do we care enough to add
such a parameter or we just disable migration and that's it?

This SandyBridge,enforce - what if the destination host running on old
kernel was run without this option - will the migration fail? What is the
mechanism? Do machine options migrate? I looked at target-i386/cpu.c but
did not see the quick answer.


> But in this case, you do not need this because the hypercall works if
> emulated by QEMU.  I like Alex's solution of making it universally
> available in the dtb.

The solution would be good if we did not already have H_PUT_TCE accelerated
for emulated devices in the host kernel but we do have it.
Alexander Graf - Aug. 20, 2013, 6:55 a.m.
On 20.08.2013, at 02:36, Alexey Kardashevskiy wrote:

> On 08/19/2013 07:47 PM, Paolo Bonzini wrote:
>> Il 19/08/2013 10:44, Alexey Kardashevskiy ha scritto:
>>>>> It means that if you use the same QEMU version with the same command
>>>>> line on a different kernel version, your guest looks different because
>>>>> we generate the dtb differently.
>>> Oh. Sorry for my ignorance again, I am not playing dump or anything like
>>> that - I do not understand how the device tree (which we cook in QEMU) on
>>> the destination can possibly survive migration and not to be overwritten by
>>> the one from the source. What was in the destination RAM before migration
>>> does not matter at all (including dt), QEMU device tree is what matters but
>>> this does not change. As it is "the same QEMU version", hypercalls are
>>> supported anyway, the only difference where they will be handled - in the
>>> host kernel or QEMU. What do I miss?
>> 
>> Nothing, I just asked to test that handling the hypercall in QEMU works.
> 
> Well, I was asking rather Alex :)
> 
>> On x86 we have a similar problem, though with cpuid bits instead of the
>> device tree.  An older kernel might not support some cpuid bits, thus
>> "-cpu SandyBridge" might have different cpuid bits depending on the host
>> processor and kernel version.  This is handled by having an "enforce"
>> mode where "-cpu SandyBridge,enforce" will fail to start if the host
>> processor or the kernel is not new enough.
> 
> Hm. Here we might have a problem like this is we decide to migrate from
> QEMU with this patch running on modern kernel to QEMU without this patch
> running on old kernel - for this we might want to be able to disable
> "multi-tce" via machine options on newer kernels. Do we care enough to add
> such a parameter or we just disable migration and that's it?

The problem is not only contained to migration. Even if you just start up your VM with a different host kernel you get a different guest environment, so you potentially get change lurking in that can kill reproducability. Imagine you're Amazon EC2: You don't want people to get any idea what host they're running on.

If the pseries target was a mature, well established and used target, I'd add a machine option "multi-tce" with 3 possibilities:

  on - force multi-tce exposing on
  off - force multi-tce exposing off
  unset - use your current detection code

That way libvirt for example can decide that it wants to nail down TCE support throughout a cluster. It's really the same as the cpu,enforce mode, just on a machine level rather than for cpuid bits.

However, considering the current user base of KVM on pseries I think it's fine to just declare newer QEMU on older KVM as slower because it doesn't use the in-kernel multi-tce support and call it a day. It makes everyone's life a _lot_ easier.

Or are you aware of any products using older kernels that are going to run QEMU 1.7 and above and won't change the kernel as well as they bump up the version?

> This SandyBridge,enforce - what if the destination host running on old
> kernel was run without this option - will the migration fail? What is the

Migration "requires" you to use the same command line on both ends of the migration. Unfortunately in only enforces it implicitly - one of the protocol's shortcomings - but that's the idea.


Alex

> mechanism? Do machine options migrate? I looked at target-i386/cpu.c but
> did not see the quick answer.
> 
> 
>> But in this case, you do not need this because the hypercall works if
>> emulated by QEMU.  I like Alex's solution of making it universally
>> available in the dtb.
> 
> The solution would be good if we did not already have H_PUT_TCE accelerated
> for emulated devices in the host kernel but we do have it.
> 
> 
> -- 
> Alexey
Paolo Bonzini - Aug. 20, 2013, 8:39 a.m.
Il 20/08/2013 03:36, Alexey Kardashevskiy ha scritto:
> Hm. Here we might have a problem like this is we decide to migrate from
> QEMU with this patch running on modern kernel to QEMU without this patch
> running on old kernel - for this we might want to be able to disable
> "multi-tce" via machine options on newer kernels. Do we care enough to add
> such a parameter or we just disable migration and that's it?

Upstream doesn't support migration to older QEMU.

> This SandyBridge,enforce - what if the destination host running on old
> kernel was run without this option - will the migration fail?

The destination machine will not even start.

>> But in this case, you do not need this because the hypercall works if
>> emulated by QEMU.  I like Alex's solution of making it universally
>> available in the dtb.
> 
> The solution would be good if we did not already have H_PUT_TCE accelerated
> for emulated devices in the host kernel but we do have it.

The question is also whether you consider pSeries support complete
enough to be production ready---and until you have versioned machine
types I would say you don't.

If you still consider it somewhat experimental, I would do as Alex said:
make newer QEMU on older KVM as slower, and that's it.

Paolo
Alexey Kardashevskiy - Aug. 20, 2013, 8:49 a.m.
On 08/20/2013 06:39 PM, Paolo Bonzini wrote:
> Il 20/08/2013 03:36, Alexey Kardashevskiy ha scritto:
>> Hm. Here we might have a problem like this is we decide to migrate from
>> QEMU with this patch running on modern kernel to QEMU without this patch
>> running on old kernel - for this we might want to be able to disable
>> "multi-tce" via machine options on newer kernels. Do we care enough to add
>> such a parameter or we just disable migration and that's it?
> 
> Upstream doesn't support migration to older QEMU.


Hm. That makes things simpler. Then I do not understand why we need
migration protocol versions as QEMU version in stronger version for
migration. Ah, offtopic.

>> This SandyBridge,enforce - what if the destination host running on old
>> kernel was run without this option - will the migration fail?
> 
> The destination machine will not even start.
>
>>> But in this case, you do not need this because the hypercall works if
>>> emulated by QEMU.  I like Alex's solution of making it universally
>>> available in the dtb.
>>
>> The solution would be good if we did not already have H_PUT_TCE accelerated
>> for emulated devices in the host kernel but we do have it.
> 
> The question is also whether you consider pSeries support complete
> enough to be production ready---and until you have versioned machine
> types I would say you don't.
> 
> If you still consider it somewhat experimental, I would do as Alex said:
> make newer QEMU on older KVM as slower, and that's it.


Sorry if I miss anything, but is not it what the patch already does? :)
If so, I'll repost this patch + traces rework tomorrow or so.
Paolo Bonzini - Aug. 20, 2013, 9:09 a.m.
Il 20/08/2013 10:49, Alexey Kardashevskiy ha scritto:
> On 08/20/2013 06:39 PM, Paolo Bonzini wrote:
>> Il 20/08/2013 03:36, Alexey Kardashevskiy ha scritto:
>>> Hm. Here we might have a problem like this is we decide to migrate from
>>> QEMU with this patch running on modern kernel to QEMU without this patch
>>> running on old kernel - for this we might want to be able to disable
>>> "multi-tce" via machine options on newer kernels. Do we care enough to add
>>> such a parameter or we just disable migration and that's it?
>>
>> Upstream doesn't support migration to older QEMU.
> 
> Hm. That makes things simpler. Then I do not understand why we need
> migration protocol versions as QEMU version in stronger version for
> migration. Ah, offtopic.

Because you have multiple downstreams that can backport arbitrary crap
on top of the same base QEMU version.

In RHEL for example we do some effort to force usage of older version
IDs (or to forcibly leave out subsections) depending on the machine type
you're using.

So even though upstream is limited in the kind of migrations we support,
we provide the infrastructure for downstreams to give stronger guarantees.

>>> This SandyBridge,enforce - what if the destination host running on old
>>> kernel was run without this option - will the migration fail?
>>
>> The destination machine will not even start.
>>
>>>> But in this case, you do not need this because the hypercall works if
>>>> emulated by QEMU.  I like Alex's solution of making it universally
>>>> available in the dtb.
>>>
>>> The solution would be good if we did not already have H_PUT_TCE accelerated
>>> for emulated devices in the host kernel but we do have it.
>>
>> The question is also whether you consider pSeries support complete
>> enough to be production ready---and until you have versioned machine
>> types I would say you don't.
>>
>> If you still consider it somewhat experimental, I would do as Alex said:
>> make newer QEMU on older KVM as slower, and that's it.
> 
> Sorry if I miss anything, but is not it what the patch already does? :)

No, you need to expose multitce unconditionally in the device tree.

Paolo

> If so, I'll repost this patch + traces rework tomorrow or so.
Benjamin Herrenschmidt - Aug. 20, 2013, 9:13 a.m.
On Tue, 2013-08-20 at 11:09 +0200, Paolo Bonzini wrote:
> > Sorry if I miss anything, but is not it what the patch already
> does? :)
> 
> No, you need to expose multitce unconditionally in the device tree.

If I'm not mistaken the multitce kernel side patches are still not
upstream so I disagree. Exposing it will make Linux use it which means
that anything running on 3.10 or 3.11 will become very slow.

So no, multitce should not be exposed if KVM doesn't support it.

Cheers,
Ben.
Paolo Bonzini - Aug. 20, 2013, 9:15 a.m.
Il 20/08/2013 11:13, Benjamin Herrenschmidt ha scritto:
> On Tue, 2013-08-20 at 11:09 +0200, Paolo Bonzini wrote:
>>> Sorry if I miss anything, but is not it what the patch already
>> does? :)
>>
>> No, you need to expose multitce unconditionally in the device tree.
> 
> If I'm not mistaken the multitce kernel side patches are still not
> upstream so I disagree. Exposing it will make Linux use it which means
> that anything running on 3.10 or 3.11 will become very slow.
> 
> So no, multitce should not be exposed if KVM doesn't support it.

Then you have to do it right and:

- provide the infrastructure to enable/disable it from the command line
(which will be enough design effort alone);

- add pseries-1.6 as a synonym of pseries in 1.6.1;

- add pseries-1.7 a synonym of pseries in master;

- add a pseries-1.6 machine type in master that always disables it.

Paolo
Benjamin Herrenschmidt - Aug. 20, 2013, 9:20 a.m.
On Tue, 2013-08-20 at 11:15 +0200, Paolo Bonzini wrote:
> 
> - provide the infrastructure to enable/disable it from the command
> line
> (which will be enough design effort alone);

<sight> ... why do things simply when we can come up with a cathedral
instead ?

> - add pseries-1.6 as a synonym of pseries in 1.6.1;
> 
> - add pseries-1.7 a synonym of pseries in master;
> 
> - add a pseries-1.6 machine type in master that always disables it.
Paolo Bonzini - Aug. 20, 2013, 9:22 a.m.
Il 20/08/2013 11:20, Benjamin Herrenschmidt ha scritto:
> On Tue, 2013-08-20 at 11:15 +0200, Paolo Bonzini wrote:
>>
>> - provide the infrastructure to enable/disable it from the command
>> line
>> (which will be enough design effort alone);
> 
> <sight> ... why do things simply when we can come up with a cathedral
> instead ?

Uhm... I thought Alex and I were the one who proposed the simple things.
 You _will_ need to do the complicated stuff sooner or later, but it's
probably early enough now to ignore it.

And I'm not saying this out of love for big cathedrals, but out of
lessons we learned the hard way for x86 (where we haven't gotten
everything right yet, either).

Paolo

>> - add pseries-1.6 as a synonym of pseries in 1.6.1;
>>
>> - add pseries-1.7 a synonym of pseries in master;
>>
>> - add a pseries-1.6 machine type in master that always disables it.
> 
> 
>
Benjamin Herrenschmidt - Aug. 20, 2013, 9:26 a.m.
On Tue, 2013-08-20 at 11:22 +0200, Paolo Bonzini wrote:
> 
> Uhm... I thought Alex and I were the one who proposed the simple things.
>  You _will_ need to do the complicated stuff sooner or later, but it's
> probably early enough now to ignore it.
> 
> And I'm not saying this out of love for big cathedrals, but out of
> lessons we learned the hard way for x86 (where we haven't gotten
> everything right yet, either).

I suppose if RH is going to deploy 3.10 and we aren't going to backport
the multitce patches then there *might* be a case for supporting that
combo specifically... but it's going to be that bad every time we add
a new feature with a kernel counter part or start adding the gazillion
little bits of PAPR that we are still missing ?

Ben.
Paolo Bonzini - Aug. 20, 2013, 10:14 a.m.
Il 20/08/2013 11:26, Benjamin Herrenschmidt ha scritto:
> On Tue, 2013-08-20 at 11:22 +0200, Paolo Bonzini wrote:
>>
>> Uhm... I thought Alex and I were the one who proposed the simple things.
>>  You _will_ need to do the complicated stuff sooner or later, but it's
>> probably early enough now to ignore it.
>>
>> And I'm not saying this out of love for big cathedrals, but out of
>> lessons we learned the hard way for x86 (where we haven't gotten
>> everything right yet, either).
> 
> I suppose if RH is going to deploy 3.10 and we aren't going to backport
> the multitce patches then there *might* be a case for supporting that
> combo specifically... but it's going to be that bad every time we add
> a new feature with a kernel counter part or start adding the gazillion
> little bits of PAPR that we are still missing ?

Yes. :(

Unless you consider pSeries KVM not mature enough to provide a guest ABI
(basically supporting live migration only between identical kernels and
QEMUs).  It would make some sense, for example (mutatis mutandis) Red
Hat has a kernel ABI for the "regular" RHEL kernel, but not for the
"real-time" RHEL kernel.

Paolo
Benjamin Herrenschmidt - Aug. 20, 2013, 11:38 a.m.
On Tue, 2013-08-20 at 12:14 +0200, Paolo Bonzini wrote:
> > I suppose if RH is going to deploy 3.10 and we aren't going to backport
> > the multitce patches then there *might* be a case for supporting that
> > combo specifically... but it's going to be that bad every time we add
> > a new feature with a kernel counter part or start adding the gazillion
> > little bits of PAPR that we are still missing ?
> 
> Yes. :(
> 
> Unless you consider pSeries KVM not mature enough to provide a guest ABI
> (basically supporting live migration only between identical kernels and
> QEMUs).  It would make some sense, for example (mutatis mutandis) Red
> Hat has a kernel ABI for the "regular" RHEL kernel, but not for the
> "real-time" RHEL kernel.

Migration is somewhat less of an issue, and there is to consider what "products"
will actually support KVM on Power. So at this early stage of the game, I would
still play it by ear and stay flexible. When we have something we really need to
have long term support for around the corner (or whatever we haven't announced
yet :-) we'll backport what's needed.

But yes, the minute we have that out, I think the problem will present itself,
at which point we need to probably start considering features in "batches" to
limit the explosion of individual options ...

Cheers,
Ben.
Alexander Graf - Aug. 21, 2013, 7:11 a.m.
On 20.08.2013, at 12:38, Benjamin Herrenschmidt wrote:

> On Tue, 2013-08-20 at 12:14 +0200, Paolo Bonzini wrote:
>>> I suppose if RH is going to deploy 3.10 and we aren't going to backport
>>> the multitce patches then there *might* be a case for supporting that
>>> combo specifically... but it's going to be that bad every time we add
>>> a new feature with a kernel counter part or start adding the gazillion
>>> little bits of PAPR that we are still missing ?
>> 
>> Yes. :(
>> 
>> Unless you consider pSeries KVM not mature enough to provide a guest ABI
>> (basically supporting live migration only between identical kernels and
>> QEMUs).  It would make some sense, for example (mutatis mutandis) Red
>> Hat has a kernel ABI for the "regular" RHEL kernel, but not for the
>> "real-time" RHEL kernel.
> 
> Migration is somewhat less of an issue, and there is to consider what "products"
> will actually support KVM on Power. So at this early stage of the game, I would
> still play it by ear and stay flexible. When we have something we really need to
> have long term support for around the corner (or whatever we haven't announced
> yet :-) we'll backport what's needed.
> 
> But yes, the minute we have that out, I think the problem will present itself,
> at which point we need to probably start considering features in "batches" to
> limit the explosion of individual options ...

Yes, I completely agree. I am very happy to postpone that "always stay compatible" mode as far to the future as possible ;). So instead of making multi-tce support runtime switchable (which is a guarantee for exposing things differently to the guest), the easy way out is to always expose it. That way when we pull the trigger later to have a stable interface, we don't have to worry about this piece of code.

If you like, add an if () on the multi-tce cap and warn the user that his system will be slow and that he should update his kernel. That way you get no headaches on compatibility and people won't get confused why they're being slow because you warned them.


Alex
Alexander Graf - Aug. 21, 2013, 7:29 a.m.
On 21.08.2013, at 08:11, Alexander Graf wrote:

> 
> On 20.08.2013, at 12:38, Benjamin Herrenschmidt wrote:
> 
>> On Tue, 2013-08-20 at 12:14 +0200, Paolo Bonzini wrote:
>>>> I suppose if RH is going to deploy 3.10 and we aren't going to backport
>>>> the multitce patches then there *might* be a case for supporting that
>>>> combo specifically... but it's going to be that bad every time we add
>>>> a new feature with a kernel counter part or start adding the gazillion
>>>> little bits of PAPR that we are still missing ?
>>> 
>>> Yes. :(
>>> 
>>> Unless you consider pSeries KVM not mature enough to provide a guest ABI
>>> (basically supporting live migration only between identical kernels and
>>> QEMUs).  It would make some sense, for example (mutatis mutandis) Red
>>> Hat has a kernel ABI for the "regular" RHEL kernel, but not for the
>>> "real-time" RHEL kernel.
>> 
>> Migration is somewhat less of an issue, and there is to consider what "products"
>> will actually support KVM on Power. So at this early stage of the game, I would
>> still play it by ear and stay flexible. When we have something we really need to
>> have long term support for around the corner (or whatever we haven't announced
>> yet :-) we'll backport what's needed.
>> 
>> But yes, the minute we have that out, I think the problem will present itself,
>> at which point we need to probably start considering features in "batches" to
>> limit the explosion of individual options ...
> 
> Yes, I completely agree. I am very happy to postpone that "always stay compatible" mode as far to the future as possible ;). So instead of making multi-tce support runtime switchable (which is a guarantee for exposing things differently to the guest), the easy way out is to always expose it. That way when we pull the trigger later to have a stable interface, we don't have to worry about this piece of code.
> 
> If you like, add an if () on the multi-tce cap and warn the user that his system will be slow and that he should update his kernel. That way you get no headaches on compatibility and people won't get confused why they're being slow because you warned them.

Or keep the code as is, but print a warning to the user on the "kernel does not expose multi-tce" case so that he knows he uses an outdated version of KVM. That way the guest visible difference is at least visible in the logs.


Alex

Patch

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 9494915..a6b1f54 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -301,6 +301,8 @@  static void *spapr_create_fdt_skel(const char *cpu_model,
     CPUState *cs;
     uint32_t start_prop = cpu_to_be32(initrd_base);
     uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
+    char hypertas_propm[] = "hcall-pft\0hcall-term\0hcall-dabr\0hcall-interrupt"
+        "\0hcall-tce\0hcall-vio\0hcall-splpar\0hcall-bulk\0hcall-multi-tce";
     char hypertas_prop[] = "hcall-pft\0hcall-term\0hcall-dabr\0hcall-interrupt"
         "\0hcall-tce\0hcall-vio\0hcall-splpar\0hcall-bulk";
     char qemu_hypertas_prop[] = "hcall-memop1";
@@ -480,8 +482,18 @@  static void *spapr_create_fdt_skel(const char *cpu_model,
     /* RTAS */
     _FDT((fdt_begin_node(fdt, "rtas")));
 
-    _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop,
-                       sizeof(hypertas_prop))));
+    /* In TCG mode, the multitce functions, which we implement are a
+     * win.  With KVM, we could fall back to the qemu implementation
+     * when KVM doesn't support them, but that would be much slower
+     * than just using the KVM implementations of the single TCE
+     * hypercalls. */
+    if (kvmppc_spapr_use_multitce()) {
+        _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_propm,
+                           sizeof(hypertas_propm))));
+    } else {
+        _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop,
+                           sizeof(hypertas_prop))));
+    }
     _FDT((fdt_property(fdt, "qemu,hypertas-functions", qemu_hypertas_prop,
                        sizeof(qemu_hypertas_prop))));
 
diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
index 3d4a1fc..22b09be 100644
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -244,6 +244,71 @@  static target_ulong put_tce_emu(sPAPRTCETable *tcet, target_ulong ioba,
     return H_SUCCESS;
 }
 
+static target_ulong h_put_tce_indirect(PowerPCCPU *cpu,
+                                       sPAPREnvironment *spapr,
+                                       target_ulong opcode, target_ulong *args)
+{
+    int i;
+    target_ulong liobn = args[0];
+    target_ulong ioba = args[1];
+    target_ulong tce_list = args[2];
+    target_ulong npages = args[3];
+    target_ulong ret = 0;
+    sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
+
+    if (tcet) {
+        for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) {
+            target_ulong tce = ldq_phys((tce_list & ~SPAPR_TCE_PAGE_MASK) +
+                                        i * sizeof(target_ulong));
+            ret = put_tce_emu(tcet, ioba, tce);
+            if (ret) {
+                break;
+            }
+        }
+        return ret;
+    }
+#ifdef DEBUG_TCE
+    fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
+            "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx
+            " ret = " TARGET_FMT_ld "\n",
+            __func__, liobn, ioba, tce_list, ret);
+#endif
+
+    return H_PARAMETER;
+}
+
+static target_ulong h_stuff_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
+                              target_ulong opcode, target_ulong *args)
+{
+    int i;
+    target_ulong liobn = args[0];
+    target_ulong ioba = args[1];
+    target_ulong tce_value = args[2];
+    target_ulong npages = args[3];
+    target_ulong ret = 0;
+    sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
+
+    ioba &= ~(SPAPR_TCE_PAGE_SIZE - 1);
+
+    if (tcet) {
+        for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) {
+            ret = put_tce_emu(tcet, ioba, tce_value);
+            if (ret) {
+                break;
+            }
+        }
+        return ret;
+    }
+#ifdef DEBUG_TCE
+    fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
+            "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx
+            " ret = " TARGET_FMT_ld "\n",
+            __func__, liobn, ioba, tce_value, ret);
+#endif
+
+    return H_PARAMETER;
+}
+
 static target_ulong h_put_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
                               target_ulong opcode, target_ulong *args)
 {
@@ -258,9 +323,10 @@  static target_ulong h_put_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
         return put_tce_emu(tcet, ioba, tce);
     }
 #ifdef DEBUG_TCE
-    fprintf(stderr, "%s on liobn=" TARGET_FMT_lx /*%s*/
-            "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx "\n",
-            __func__, liobn, /*dev->qdev.id, */ioba, tce);
+    fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
+            "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx
+            " ret = " TARGET_FMT_ld "\n",
+            __func__, liobn, ioba, tce, ret);
 #endif
 
     return H_PARAMETER;
@@ -318,6 +384,8 @@  static void spapr_tce_table_class_init(ObjectClass *klass, void *data)
 
     /* hcall-tce */
     spapr_register_hypercall(H_PUT_TCE, h_put_tce);
+    spapr_register_hypercall(H_PUT_TCE_INDIRECT, h_put_tce_indirect);
+    spapr_register_hypercall(H_STUFF_TCE, h_stuff_tce);
 }
 
 static TypeInfo spapr_tce_table_info = {
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index 8afa7eb..97ac670 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -60,6 +60,7 @@  static int cap_booke_sregs;
 static int cap_ppc_smt;
 static int cap_ppc_rma;
 static int cap_spapr_tce;
+static int cap_spapr_multitce;
 static int cap_hior;
 static int cap_one_reg;
 static int cap_epr;
@@ -96,6 +97,7 @@  int kvm_arch_init(KVMState *s)
     cap_ppc_smt = kvm_check_extension(s, KVM_CAP_PPC_SMT);
     cap_ppc_rma = kvm_check_extension(s, KVM_CAP_PPC_RMA);
     cap_spapr_tce = kvm_check_extension(s, KVM_CAP_SPAPR_TCE);
+    cap_spapr_multitce = kvm_check_extension(s, KVM_CAP_SPAPR_MULTITCE);
     cap_one_reg = kvm_check_extension(s, KVM_CAP_ONE_REG);
     cap_hior = kvm_check_extension(s, KVM_CAP_PPC_HIOR);
     cap_epr = kvm_check_extension(s, KVM_CAP_PPC_EPR);
@@ -1603,6 +1605,11 @@  uint64_t kvmppc_rma_size(uint64_t current_size, unsigned int hash_shift)
 }
 #endif
 
+bool kvmppc_spapr_use_multitce(void)
+{
+    return !kvm_enabled() || cap_spapr_multitce;
+}
+
 void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd)
 {
     struct kvm_create_spapr_tce args = {
diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h
index 12564ef..a2a903f 100644
--- a/target-ppc/kvm_ppc.h
+++ b/target-ppc/kvm_ppc.h
@@ -31,6 +31,7 @@  int kvmppc_set_tcr(PowerPCCPU *cpu);
 int kvmppc_booke_watchdog_enable(PowerPCCPU *cpu);
 #ifndef CONFIG_USER_ONLY
 off_t kvmppc_alloc_rma(const char *name, MemoryRegion *sysmem);
+bool kvmppc_spapr_use_multitce(void);
 void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd);
 int kvmppc_remove_spapr_tce(void *table, int pfd, uint32_t window_size);
 int kvmppc_reset_htab(int shift_hint);
@@ -125,6 +126,12 @@  static inline off_t kvmppc_alloc_rma(const char *name, MemoryRegion *sysmem)
     return 0;
 }
 
+static inline bool kvmppc_spapr_use_multitce(void)
+{
+    /* No KVM, so always use the qemu multitce implementations */
+    return true;
+}
+
 static inline void *kvmppc_create_spapr_tce(uint32_t liobn,
                                             uint32_t window_size, int *fd)
 {