diff mbox series

[3/3] target/ppc: Increment PMC5 with inline insns

Message ID 20221021170112.151393-4-leandro.lupori@eldorado.org.br
State New
Headers show
Series Performance optimizations for PPC64 | expand

Commit Message

Leandro Lupori Oct. 21, 2022, 5:01 p.m. UTC
Profiling QEMU during Fedora 35 for PPC64 boot revealed that
6.39% of total time was being spent in helper_insns_inc(), on a
POWER9 machine. To avoid calling this helper every time PMCs had
to be incremented, an inline implementation of PMC5 increment and
check for overflow was developed. This led to a reduction of
about 12% in Fedora's boot time.

Signed-off-by: Leandro Lupori <leandro.lupori@eldorado.org.br>
---
 target/ppc/helper.h     |  1 +
 target/ppc/power8-pmu.c | 74 +++++++++++++++++++++--------------------
 target/ppc/power8-pmu.h |  3 ++
 target/ppc/translate.c  | 28 ++++++++++++++--
 4 files changed, 67 insertions(+), 39 deletions(-)

Comments

Daniel Henrique Barboza Oct. 25, 2022, 7:29 p.m. UTC | #1
On 10/21/22 14:01, Leandro Lupori wrote:
> Profiling QEMU during Fedora 35 for PPC64 boot revealed that
> 6.39% of total time was being spent in helper_insns_inc(), on a
> POWER9 machine. To avoid calling this helper every time PMCs had
> to be incremented, an inline implementation of PMC5 increment and
> check for overflow was developed. This led to a reduction of
> about 12% in Fedora's boot time.
> 
> Signed-off-by: Leandro Lupori <leandro.lupori@eldorado.org.br>
> ---

Given that PMC5 is the counter that is most likely to be active, yeah,
isolating the case where PMC5 is incremented standalone makes sense.

Still, 12% performance gain is not too shaby. Not too shaby at all.


Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>


>   target/ppc/helper.h     |  1 +
>   target/ppc/power8-pmu.c | 74 +++++++++++++++++++++--------------------
>   target/ppc/power8-pmu.h |  3 ++
>   target/ppc/translate.c  | 28 ++++++++++++++--
>   4 files changed, 67 insertions(+), 39 deletions(-)
> 
> diff --git a/target/ppc/helper.h b/target/ppc/helper.h
> index 57eee07256..f8cd00c976 100644
> --- a/target/ppc/helper.h
> +++ b/target/ppc/helper.h
> @@ -29,6 +29,7 @@ DEF_HELPER_2(store_mmcr1, void, env, tl)
>   DEF_HELPER_3(store_pmc, void, env, i32, i64)
>   DEF_HELPER_2(read_pmc, tl, env, i32)
>   DEF_HELPER_2(insns_inc, void, env, i32)
> +DEF_HELPER_1(handle_pmc5_overflow, void, env)
>   #endif
>   DEF_HELPER_1(check_tlb_flush_local, void, env)
>   DEF_HELPER_1(check_tlb_flush_global, void, env)
> diff --git a/target/ppc/power8-pmu.c b/target/ppc/power8-pmu.c
> index beeab5c494..1381072b9e 100644
> --- a/target/ppc/power8-pmu.c
> +++ b/target/ppc/power8-pmu.c
> @@ -22,8 +22,6 @@
>   
>   #if defined(TARGET_PPC64) && !defined(CONFIG_USER_ONLY)
>   
> -#define PMC_COUNTER_NEGATIVE_VAL 0x80000000UL
> -
>   static bool pmc_has_overflow_enabled(CPUPPCState *env, int sprn)
>   {
>       if (sprn == SPR_POWER_PMC1) {
> @@ -88,49 +86,47 @@ static bool pmu_increment_insns(CPUPPCState *env, uint32_t num_insns)
>       bool overflow_triggered = false;
>       target_ulong tmp;
>   
> -    if (unlikely(ins_cnt & 0x1e)) {
> -        if (ins_cnt & (1 << 1)) {
> -            tmp = env->spr[SPR_POWER_PMC1];
> -            tmp += num_insns;
> -            if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMC1CE)) {
> -                tmp = PMC_COUNTER_NEGATIVE_VAL;
> -                overflow_triggered = true;
> -            }
> -            env->spr[SPR_POWER_PMC1] = tmp;
> +    if (ins_cnt & (1 << 1)) {
> +        tmp = env->spr[SPR_POWER_PMC1];
> +        tmp += num_insns;
> +        if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMC1CE)) {
> +            tmp = PMC_COUNTER_NEGATIVE_VAL;
> +            overflow_triggered = true;
>           }
> +        env->spr[SPR_POWER_PMC1] = tmp;
> +    }
>   
> -        if (ins_cnt & (1 << 2)) {
> -            tmp = env->spr[SPR_POWER_PMC2];
> -            tmp += num_insns;
> -            if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMCjCE)) {
> -                tmp = PMC_COUNTER_NEGATIVE_VAL;
> -                overflow_triggered = true;
> -            }
> -            env->spr[SPR_POWER_PMC2] = tmp;
> +    if (ins_cnt & (1 << 2)) {
> +        tmp = env->spr[SPR_POWER_PMC2];
> +        tmp += num_insns;
> +        if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMCjCE)) {
> +            tmp = PMC_COUNTER_NEGATIVE_VAL;
> +            overflow_triggered = true;
> +        }
> +        env->spr[SPR_POWER_PMC2] = tmp;
> +    }
> +
> +    if (ins_cnt & (1 << 3)) {
> +        tmp = env->spr[SPR_POWER_PMC3];
> +        tmp += num_insns;
> +        if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMCjCE)) {
> +            tmp = PMC_COUNTER_NEGATIVE_VAL;
> +            overflow_triggered = true;
>           }
> +        env->spr[SPR_POWER_PMC3] = tmp;
> +    }
>   
> -        if (ins_cnt & (1 << 3)) {
> -            tmp = env->spr[SPR_POWER_PMC3];
> +    if (ins_cnt & (1 << 4)) {
> +        target_ulong mmcr1 = env->spr[SPR_POWER_MMCR1];
> +        int sel = extract64(mmcr1, MMCR1_PMC4EVT_EXTR, MMCR1_EVT_SIZE);
> +        if (sel == 0x02 || (env->spr[SPR_CTRL] & CTRL_RUN)) {
> +            tmp = env->spr[SPR_POWER_PMC4];
>               tmp += num_insns;
>               if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMCjCE)) {
>                   tmp = PMC_COUNTER_NEGATIVE_VAL;
>                   overflow_triggered = true;
>               }
> -            env->spr[SPR_POWER_PMC3] = tmp;
> -        }
> -
> -        if (ins_cnt & (1 << 4)) {
> -            target_ulong mmcr1 = env->spr[SPR_POWER_MMCR1];
> -            int sel = extract64(mmcr1, MMCR1_PMC4EVT_EXTR, MMCR1_EVT_SIZE);
> -            if (sel == 0x02 || (env->spr[SPR_CTRL] & CTRL_RUN)) {
> -                tmp = env->spr[SPR_POWER_PMC4];
> -                tmp += num_insns;
> -                if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMCjCE)) {
> -                    tmp = PMC_COUNTER_NEGATIVE_VAL;
> -                    overflow_triggered = true;
> -                }
> -                env->spr[SPR_POWER_PMC4] = tmp;
> -            }
> +            env->spr[SPR_POWER_PMC4] = tmp;
>           }
>       }
>   
> @@ -310,6 +306,12 @@ static void fire_PMC_interrupt(PowerPCCPU *cpu)
>       raise_ebb_perfm_exception(env);
>   }
>   
> +void helper_handle_pmc5_overflow(CPUPPCState *env)
> +{
> +    env->spr[SPR_POWER_PMC5] = PMC_COUNTER_NEGATIVE_VAL;
> +    fire_PMC_interrupt(env_archcpu(env));
> +}
> +
>   /* This helper assumes that the PMC is running. */
>   void helper_insns_inc(CPUPPCState *env, uint32_t num_insns)
>   {
> diff --git a/target/ppc/power8-pmu.h b/target/ppc/power8-pmu.h
> index 9692dd765e..c0093e2219 100644
> --- a/target/ppc/power8-pmu.h
> +++ b/target/ppc/power8-pmu.h
> @@ -14,6 +14,9 @@
>   #define POWER8_PMU_H
>   
>   #if defined(TARGET_PPC64) && !defined(CONFIG_USER_ONLY)
> +
> +#define PMC_COUNTER_NEGATIVE_VAL 0x80000000UL
> +
>   void cpu_ppc_pmu_init(CPUPPCState *env);
>   void pmu_update_summaries(CPUPPCState *env);
>   #else
> diff --git a/target/ppc/translate.c b/target/ppc/translate.c
> index 8fda2cf836..5c74684eee 100644
> --- a/target/ppc/translate.c
> +++ b/target/ppc/translate.c
> @@ -36,6 +36,7 @@
>   #include "exec/log.h"
>   #include "qemu/atomic128.h"
>   #include "spr_common.h"
> +#include "power8-pmu.h"
>   
>   #include "qemu/qemu-print.h"
>   #include "qapi/error.h"
> @@ -4263,6 +4264,9 @@ static void pmu_count_insns(DisasContext *ctx)
>       }
>   
>    #if !defined(CONFIG_USER_ONLY)
> +    TCGLabel *l;
> +    TCGv t0;
> +
>       /*
>        * The PMU insns_inc() helper stops the internal PMU timer if a
>        * counter overflows happens. In that case, if the guest is
> @@ -4271,8 +4275,26 @@ static void pmu_count_insns(DisasContext *ctx)
>        */
>       gen_icount_io_start(ctx);
>   
> -    gen_helper_insns_inc(cpu_env, tcg_constant_i32(ctx->base.num_insns));
> -#else
> +    /* Avoid helper calls when only PMC5-6 are enabled. */
> +    if (!ctx->pmc_other) {
> +        l = gen_new_label();
> +        t0 = tcg_temp_new();
> +
> +        gen_load_spr(t0, SPR_POWER_PMC5);
> +        tcg_gen_addi_tl(t0, t0, ctx->base.num_insns);
> +        gen_store_spr(SPR_POWER_PMC5, t0);
> +        /* Check for overflow, if it's enabled */
> +        if (ctx->mmcr0_pmcjce) {
> +            tcg_gen_brcondi_tl(TCG_COND_LT, t0, PMC_COUNTER_NEGATIVE_VAL, l);
> +            gen_helper_handle_pmc5_overflow(cpu_env);
> +        }
> +
> +        gen_set_label(l);
> +        tcg_temp_free(t0);
> +    } else {
> +        gen_helper_insns_inc(cpu_env, tcg_constant_i32(ctx->base.num_insns));
> +    }
> +  #else
>       /*
>        * User mode can read (but not write) PMC5 and start/stop
>        * the PMU via MMCR0_FC. In this case just increment
> @@ -4285,7 +4307,7 @@ static void pmu_count_insns(DisasContext *ctx)
>       gen_store_spr(SPR_POWER_PMC5, t0);
>   
>       tcg_temp_free(t0);
> -#endif /* #if !defined(CONFIG_USER_ONLY) */
> +  #endif /* #if !defined(CONFIG_USER_ONLY) */
>   }
>   #else
>   static void pmu_count_insns(DisasContext *ctx)
Leandro Lupori Oct. 25, 2022, 8:39 p.m. UTC | #2
On 10/25/22 16:29, Daniel Henrique Barboza wrote:

> On 10/21/22 14:01, Leandro Lupori wrote:
>> Profiling QEMU during Fedora 35 for PPC64 boot revealed that
>> 6.39% of total time was being spent in helper_insns_inc(), on a
>> POWER9 machine. To avoid calling this helper every time PMCs had
>> to be incremented, an inline implementation of PMC5 increment and
>> check for overflow was developed. This led to a reduction of
>> about 12% in Fedora's boot time.
>>
>> Signed-off-by: Leandro Lupori <leandro.lupori@eldorado.org.br>
>> ---
> 
> Given that PMC5 is the counter that is most likely to be active, yeah,
> isolating the case where PMC5 is incremented standalone makes sense.
> 
> Still, 12% performance gain is not too shaby. Not too shaby at all.
> 

I've tried to move more of helper_insns_inc() to the inline 
implementation, but then performance started to decrease.

Initially I found this strange, but perf revealed a considerable 
increase of time spent in functions such as tcg_gen_code and 
liveness_pass_1.

So as this code has to be generated and optimized for most TBs, it seems 
it makes code generation slower if it's too big.

> 
> Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>
> 
>
diff mbox series

Patch

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 57eee07256..f8cd00c976 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -29,6 +29,7 @@  DEF_HELPER_2(store_mmcr1, void, env, tl)
 DEF_HELPER_3(store_pmc, void, env, i32, i64)
 DEF_HELPER_2(read_pmc, tl, env, i32)
 DEF_HELPER_2(insns_inc, void, env, i32)
+DEF_HELPER_1(handle_pmc5_overflow, void, env)
 #endif
 DEF_HELPER_1(check_tlb_flush_local, void, env)
 DEF_HELPER_1(check_tlb_flush_global, void, env)
diff --git a/target/ppc/power8-pmu.c b/target/ppc/power8-pmu.c
index beeab5c494..1381072b9e 100644
--- a/target/ppc/power8-pmu.c
+++ b/target/ppc/power8-pmu.c
@@ -22,8 +22,6 @@ 
 
 #if defined(TARGET_PPC64) && !defined(CONFIG_USER_ONLY)
 
-#define PMC_COUNTER_NEGATIVE_VAL 0x80000000UL
-
 static bool pmc_has_overflow_enabled(CPUPPCState *env, int sprn)
 {
     if (sprn == SPR_POWER_PMC1) {
@@ -88,49 +86,47 @@  static bool pmu_increment_insns(CPUPPCState *env, uint32_t num_insns)
     bool overflow_triggered = false;
     target_ulong tmp;
 
-    if (unlikely(ins_cnt & 0x1e)) {
-        if (ins_cnt & (1 << 1)) {
-            tmp = env->spr[SPR_POWER_PMC1];
-            tmp += num_insns;
-            if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMC1CE)) {
-                tmp = PMC_COUNTER_NEGATIVE_VAL;
-                overflow_triggered = true;
-            }
-            env->spr[SPR_POWER_PMC1] = tmp;
+    if (ins_cnt & (1 << 1)) {
+        tmp = env->spr[SPR_POWER_PMC1];
+        tmp += num_insns;
+        if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMC1CE)) {
+            tmp = PMC_COUNTER_NEGATIVE_VAL;
+            overflow_triggered = true;
         }
+        env->spr[SPR_POWER_PMC1] = tmp;
+    }
 
-        if (ins_cnt & (1 << 2)) {
-            tmp = env->spr[SPR_POWER_PMC2];
-            tmp += num_insns;
-            if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMCjCE)) {
-                tmp = PMC_COUNTER_NEGATIVE_VAL;
-                overflow_triggered = true;
-            }
-            env->spr[SPR_POWER_PMC2] = tmp;
+    if (ins_cnt & (1 << 2)) {
+        tmp = env->spr[SPR_POWER_PMC2];
+        tmp += num_insns;
+        if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMCjCE)) {
+            tmp = PMC_COUNTER_NEGATIVE_VAL;
+            overflow_triggered = true;
+        }
+        env->spr[SPR_POWER_PMC2] = tmp;
+    }
+
+    if (ins_cnt & (1 << 3)) {
+        tmp = env->spr[SPR_POWER_PMC3];
+        tmp += num_insns;
+        if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMCjCE)) {
+            tmp = PMC_COUNTER_NEGATIVE_VAL;
+            overflow_triggered = true;
         }
+        env->spr[SPR_POWER_PMC3] = tmp;
+    }
 
-        if (ins_cnt & (1 << 3)) {
-            tmp = env->spr[SPR_POWER_PMC3];
+    if (ins_cnt & (1 << 4)) {
+        target_ulong mmcr1 = env->spr[SPR_POWER_MMCR1];
+        int sel = extract64(mmcr1, MMCR1_PMC4EVT_EXTR, MMCR1_EVT_SIZE);
+        if (sel == 0x02 || (env->spr[SPR_CTRL] & CTRL_RUN)) {
+            tmp = env->spr[SPR_POWER_PMC4];
             tmp += num_insns;
             if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMCjCE)) {
                 tmp = PMC_COUNTER_NEGATIVE_VAL;
                 overflow_triggered = true;
             }
-            env->spr[SPR_POWER_PMC3] = tmp;
-        }
-
-        if (ins_cnt & (1 << 4)) {
-            target_ulong mmcr1 = env->spr[SPR_POWER_MMCR1];
-            int sel = extract64(mmcr1, MMCR1_PMC4EVT_EXTR, MMCR1_EVT_SIZE);
-            if (sel == 0x02 || (env->spr[SPR_CTRL] & CTRL_RUN)) {
-                tmp = env->spr[SPR_POWER_PMC4];
-                tmp += num_insns;
-                if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMCjCE)) {
-                    tmp = PMC_COUNTER_NEGATIVE_VAL;
-                    overflow_triggered = true;
-                }
-                env->spr[SPR_POWER_PMC4] = tmp;
-            }
+            env->spr[SPR_POWER_PMC4] = tmp;
         }
     }
 
@@ -310,6 +306,12 @@  static void fire_PMC_interrupt(PowerPCCPU *cpu)
     raise_ebb_perfm_exception(env);
 }
 
+void helper_handle_pmc5_overflow(CPUPPCState *env)
+{
+    env->spr[SPR_POWER_PMC5] = PMC_COUNTER_NEGATIVE_VAL;
+    fire_PMC_interrupt(env_archcpu(env));
+}
+
 /* This helper assumes that the PMC is running. */
 void helper_insns_inc(CPUPPCState *env, uint32_t num_insns)
 {
diff --git a/target/ppc/power8-pmu.h b/target/ppc/power8-pmu.h
index 9692dd765e..c0093e2219 100644
--- a/target/ppc/power8-pmu.h
+++ b/target/ppc/power8-pmu.h
@@ -14,6 +14,9 @@ 
 #define POWER8_PMU_H
 
 #if defined(TARGET_PPC64) && !defined(CONFIG_USER_ONLY)
+
+#define PMC_COUNTER_NEGATIVE_VAL 0x80000000UL
+
 void cpu_ppc_pmu_init(CPUPPCState *env);
 void pmu_update_summaries(CPUPPCState *env);
 #else
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index 8fda2cf836..5c74684eee 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -36,6 +36,7 @@ 
 #include "exec/log.h"
 #include "qemu/atomic128.h"
 #include "spr_common.h"
+#include "power8-pmu.h"
 
 #include "qemu/qemu-print.h"
 #include "qapi/error.h"
@@ -4263,6 +4264,9 @@  static void pmu_count_insns(DisasContext *ctx)
     }
 
  #if !defined(CONFIG_USER_ONLY)
+    TCGLabel *l;
+    TCGv t0;
+
     /*
      * The PMU insns_inc() helper stops the internal PMU timer if a
      * counter overflows happens. In that case, if the guest is
@@ -4271,8 +4275,26 @@  static void pmu_count_insns(DisasContext *ctx)
      */
     gen_icount_io_start(ctx);
 
-    gen_helper_insns_inc(cpu_env, tcg_constant_i32(ctx->base.num_insns));
-#else
+    /* Avoid helper calls when only PMC5-6 are enabled. */
+    if (!ctx->pmc_other) {
+        l = gen_new_label();
+        t0 = tcg_temp_new();
+
+        gen_load_spr(t0, SPR_POWER_PMC5);
+        tcg_gen_addi_tl(t0, t0, ctx->base.num_insns);
+        gen_store_spr(SPR_POWER_PMC5, t0);
+        /* Check for overflow, if it's enabled */
+        if (ctx->mmcr0_pmcjce) {
+            tcg_gen_brcondi_tl(TCG_COND_LT, t0, PMC_COUNTER_NEGATIVE_VAL, l);
+            gen_helper_handle_pmc5_overflow(cpu_env);
+        }
+
+        gen_set_label(l);
+        tcg_temp_free(t0);
+    } else {
+        gen_helper_insns_inc(cpu_env, tcg_constant_i32(ctx->base.num_insns));
+    }
+  #else
     /*
      * User mode can read (but not write) PMC5 and start/stop
      * the PMU via MMCR0_FC. In this case just increment
@@ -4285,7 +4307,7 @@  static void pmu_count_insns(DisasContext *ctx)
     gen_store_spr(SPR_POWER_PMC5, t0);
 
     tcg_temp_free(t0);
-#endif /* #if !defined(CONFIG_USER_ONLY) */
+  #endif /* #if !defined(CONFIG_USER_ONLY) */
 }
 #else
 static void pmu_count_insns(DisasContext *ctx)