diff mbox series

target-ppc: optimize cmp translation

Message ID 20171217054953.28500-1-pbonzini@redhat.com
State New
Headers show
Series target-ppc: optimize cmp translation | expand

Commit Message

Paolo Bonzini Dec. 17, 2017, 5:49 a.m. UTC
We know that only one bit (in addition to SO) is going to be set in
the condition register, so do two movconds instead of three setconds,
three shifts and two ORs.

For ppc64-linux-user, the code size reduction is around 5% and the
performance improvement slightly less than 10%.  For softmmu, the
improvement is around 5%.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/ppc/translate.c | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

Comments

David Gibson Dec. 18, 2017, 2:39 a.m. UTC | #1
On Sun, Dec 17, 2017 at 06:49:53AM +0100, Paolo Bonzini wrote:
> We know that only one bit (in addition to SO) is going to be set in
> the condition register, so do two movconds instead of three setconds,
> three shifts and two ORs.
> 
> For ppc64-linux-user, the code size reduction is around 5% and the
> performance improvement slightly less than 10%.  For softmmu, the
> improvement is around 5%.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

Applied to ppc-for-2.12, thanks.


> ---
>  target/ppc/translate.c | 29 ++++++++++++-----------------
>  1 file changed, 12 insertions(+), 17 deletions(-)
> 
> diff --git a/target/ppc/translate.c b/target/ppc/translate.c
> index 998fbed848..0e9e6823a3 100644
> --- a/target/ppc/translate.c
> +++ b/target/ppc/translate.c
> @@ -605,27 +605,22 @@ static opc_handler_t invalid_handler = {
>  static inline void gen_op_cmp(TCGv arg0, TCGv arg1, int s, int crf)
>  {
>      TCGv t0 = tcg_temp_new();
> -    TCGv_i32 t1 = tcg_temp_new_i32();
> -
> -    tcg_gen_trunc_tl_i32(cpu_crf[crf], cpu_so);
> -
> -    tcg_gen_setcond_tl((s ? TCG_COND_LT: TCG_COND_LTU), t0, arg0, arg1);
> -    tcg_gen_trunc_tl_i32(t1, t0);
> -    tcg_gen_shli_i32(t1, t1, CRF_LT_BIT);
> -    tcg_gen_or_i32(cpu_crf[crf], cpu_crf[crf], t1);
> +    TCGv t1 = tcg_temp_new();
> +    TCGv_i32 t = tcg_temp_new_i32();
>  
> -    tcg_gen_setcond_tl((s ? TCG_COND_GT: TCG_COND_GTU), t0, arg0, arg1);
> -    tcg_gen_trunc_tl_i32(t1, t0);
> -    tcg_gen_shli_i32(t1, t1, CRF_GT_BIT);
> -    tcg_gen_or_i32(cpu_crf[crf], cpu_crf[crf], t1);
> +    tcg_gen_movi_tl(t0, CRF_EQ);
> +    tcg_gen_movi_tl(t1, CRF_LT);
> +    tcg_gen_movcond_tl((s ? TCG_COND_LT : TCG_COND_LTU), t0, arg0, arg1, t1, t0);
> +    tcg_gen_movi_tl(t1, CRF_GT);
> +    tcg_gen_movcond_tl((s ? TCG_COND_GT : TCG_COND_GTU), t0, arg0, arg1, t1, t0);
>  
> -    tcg_gen_setcond_tl(TCG_COND_EQ, t0, arg0, arg1);
> -    tcg_gen_trunc_tl_i32(t1, t0);
> -    tcg_gen_shli_i32(t1, t1, CRF_EQ_BIT);
> -    tcg_gen_or_i32(cpu_crf[crf], cpu_crf[crf], t1);
> +    tcg_gen_trunc_tl_i32(t, t0);
> +    tcg_gen_trunc_tl_i32(cpu_crf[crf], cpu_so);
> +    tcg_gen_or_i32(cpu_crf[crf], cpu_crf[crf], t);
>  
>      tcg_temp_free(t0);
> -    tcg_temp_free_i32(t1);
> +    tcg_temp_free(t1);
> +    tcg_temp_free_i32(t);
>  }
>  
>  static inline void gen_op_cmpi(TCGv arg0, target_ulong arg1, int s, int crf)
Richard Henderson Dec. 18, 2017, 7:01 p.m. UTC | #2
On 12/16/2017 09:49 PM, Paolo Bonzini wrote:
> We know that only one bit (in addition to SO) is going to be set in
> the condition register, so do two movconds instead of three setconds,
> three shifts and two ORs.
> 
> For ppc64-linux-user, the code size reduction is around 5% and the
> performance improvement slightly less than 10%.  For softmmu, the
> improvement is around 5%.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  target/ppc/translate.c | 29 ++++++++++++-----------------
>  1 file changed, 12 insertions(+), 17 deletions(-)

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>


r~
diff mbox series

Patch

diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index 998fbed848..0e9e6823a3 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -605,27 +605,22 @@  static opc_handler_t invalid_handler = {
 static inline void gen_op_cmp(TCGv arg0, TCGv arg1, int s, int crf)
 {
     TCGv t0 = tcg_temp_new();
-    TCGv_i32 t1 = tcg_temp_new_i32();
-
-    tcg_gen_trunc_tl_i32(cpu_crf[crf], cpu_so);
-
-    tcg_gen_setcond_tl((s ? TCG_COND_LT: TCG_COND_LTU), t0, arg0, arg1);
-    tcg_gen_trunc_tl_i32(t1, t0);
-    tcg_gen_shli_i32(t1, t1, CRF_LT_BIT);
-    tcg_gen_or_i32(cpu_crf[crf], cpu_crf[crf], t1);
+    TCGv t1 = tcg_temp_new();
+    TCGv_i32 t = tcg_temp_new_i32();
 
-    tcg_gen_setcond_tl((s ? TCG_COND_GT: TCG_COND_GTU), t0, arg0, arg1);
-    tcg_gen_trunc_tl_i32(t1, t0);
-    tcg_gen_shli_i32(t1, t1, CRF_GT_BIT);
-    tcg_gen_or_i32(cpu_crf[crf], cpu_crf[crf], t1);
+    tcg_gen_movi_tl(t0, CRF_EQ);
+    tcg_gen_movi_tl(t1, CRF_LT);
+    tcg_gen_movcond_tl((s ? TCG_COND_LT : TCG_COND_LTU), t0, arg0, arg1, t1, t0);
+    tcg_gen_movi_tl(t1, CRF_GT);
+    tcg_gen_movcond_tl((s ? TCG_COND_GT : TCG_COND_GTU), t0, arg0, arg1, t1, t0);
 
-    tcg_gen_setcond_tl(TCG_COND_EQ, t0, arg0, arg1);
-    tcg_gen_trunc_tl_i32(t1, t0);
-    tcg_gen_shli_i32(t1, t1, CRF_EQ_BIT);
-    tcg_gen_or_i32(cpu_crf[crf], cpu_crf[crf], t1);
+    tcg_gen_trunc_tl_i32(t, t0);
+    tcg_gen_trunc_tl_i32(cpu_crf[crf], cpu_so);
+    tcg_gen_or_i32(cpu_crf[crf], cpu_crf[crf], t);
 
     tcg_temp_free(t0);
-    tcg_temp_free_i32(t1);
+    tcg_temp_free(t1);
+    tcg_temp_free_i32(t);
 }
 
 static inline void gen_op_cmpi(TCGv arg0, target_ulong arg1, int s, int crf)