Patchwork [10/10] tcg: Optimize mulu2

login
register
mail settings
Submitter Richard Henderson
Date Oct. 2, 2012, 6:32 p.m.
Message ID <1349202750-16815-11-git-send-email-rth@twiddle.net>
Download mbox | patch
Permalink /patch/188611/
State New
Headers show

Comments

Richard Henderson - Oct. 2, 2012, 6:32 p.m.
Like add2, do operand ordering, constant folding, and dead operand
elimination.  The latter applies to about 15% of all mulu2 ops during an
x86_64 BIOS boot.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/optimize.c | 26 ++++++++++++++++++++++++++
 tcg/tcg-op.h   |  2 ++
 tcg/tcg.c      | 19 +++++++++++++++++++
 3 files changed, 47 insertions(+)
Aurelien Jarno - Oct. 16, 2012, 11:25 p.m.
On Tue, Oct 02, 2012 at 11:32:30AM -0700, Richard Henderson wrote:
> Like add2, do operand ordering, constant folding, and dead operand
> elimination.  The latter happens about 15% of all mulu2 during an
> x86_64 bios boot.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/optimize.c | 26 ++++++++++++++++++++++++++
>  tcg/tcg-op.h   |  2 ++
>  tcg/tcg.c      | 19 +++++++++++++++++++
>  3 files changed, 47 insertions(+)
> 
> diff --git a/tcg/optimize.c b/tcg/optimize.c
> index 05891ef..a06c8eb 100644
> --- a/tcg/optimize.c
> +++ b/tcg/optimize.c
> @@ -543,6 +543,9 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
>              swap_commutative(args[0], &args[2], &args[4]);
>              swap_commutative(args[1], &args[3], &args[5]);
>              break;
> +        case INDEX_op_mulu2_i32:
> +            swap_commutative(args[0], &args[2], &args[3]);
> +            break;
>          case INDEX_op_brcond2_i32:
>              if (swap_commutative2(&args[0], &args[2])) {
>                  args[4] = tcg_swap_cond(args[4]);
> @@ -831,6 +834,29 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
>              }
>              goto do_default;
>  
> +        case INDEX_op_mulu2_i32:
> +            if (temps[args[2]].state == TCG_TEMP_CONST
> +                && temps[args[3]].state == TCG_TEMP_CONST) {
> +                uint32_t a = temps[args[2]].val;
> +                uint32_t b = temps[args[3]].val;
> +                uint64_t r = (uint64_t)a * b;
> +                TCGArg rl, rh;
> +
> +                /* We emit the extra nop when we emit the mulu2.  */
> +                assert(gen_opc_buf[op_index + 1] == INDEX_op_nop);
> +
> +                rl = args[0];
> +                rh = args[1];
> +                gen_opc_buf[op_index] = INDEX_op_movi_i32;
> +                gen_opc_buf[++op_index] = INDEX_op_movi_i32;
> +                tcg_opt_gen_movi(&gen_args[0], rl, (uint32_t)r);
> +                tcg_opt_gen_movi(&gen_args[2], rh, (uint32_t)(r >> 32));
> +                gen_args += 4;
> +                args += 4;
> +                break;
> +            }
> +            goto do_default;
> +
>          case INDEX_op_brcond2_i32:
>              tmp = do_constant_folding_cond2(&args[0], &args[2], args[4]);
>              if (tmp != 2) {
> diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
> index 1f5a021..044e648 100644
> --- a/tcg/tcg-op.h
> +++ b/tcg/tcg-op.h
> @@ -997,6 +997,8 @@ static inline void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
>  
>      tcg_gen_op4_i32(INDEX_op_mulu2_i32, TCGV_LOW(t0), TCGV_HIGH(t0),
>                      TCGV_LOW(arg1), TCGV_LOW(arg2));
> +    /* Allow the optimizer room to replace mulu2 with two moves.  */
> +    tcg_gen_op0(INDEX_op_nop);
>  
>      tcg_gen_mul_i32(t1, TCGV_LOW(arg1), TCGV_HIGH(arg2));
>      tcg_gen_add_i32(TCGV_HIGH(t0), TCGV_HIGH(t0), t1);
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index 21c1074..8280489 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -1337,6 +1337,25 @@ static void tcg_liveness_analysis(TCGContext *s)
>              }
>              goto do_not_remove;
>  
> +        case INDEX_op_mulu2_i32:
> +            args -= 4;
> +            nb_iargs = 2;
> +            nb_oargs = 2;
> +            /* Likewise, test for the high part of the operation dead.  */
> +            if (dead_temps[args[1]]) {
> +                if (dead_temps[args[0]]) {
> +                    goto do_remove;
> +                }
> +                gen_opc_buf[op_index] = op = INDEX_op_mul_i32;

Very minor nitpick: you probably don't need to set op there.

> +                args[1] = args[2];
> +                args[2] = args[3];
> +                assert(gen_opc_buf[op_index + 1] == INDEX_op_nop);
> +                tcg_set_nop(s, gen_opc_buf + op_index + 1, args + 3, 1);
> +                /* Fall through and mark the single-word operation live.  */
> +                nb_oargs = 1;
> +            }
> +            goto do_not_remove;
> +
>          default:
>              /* XXX: optimize by hardcoding common cases (e.g. triadic ops) */
>              args -= def->nb_args;

Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
Richard Henderson - Oct. 17, 2012, 1:09 a.m.
On 2012-10-17 09:25, Aurelien Jarno wrote:
>> > +                gen_opc_buf[op_index] = op = INDEX_op_mul_i32;
> Very minor nitpick: you probably don't need to set op there.
> 

Perhaps not, but I prefer to keep the variables in sync as we
drop into common code...


r~
Avi Kivity - Oct. 17, 2012, 10:58 a.m.
On 10/17/2012 03:09 AM, Richard Henderson wrote:
> On 2012-10-17 09:25, Aurelien Jarno wrote:
>>> > +                gen_opc_buf[op_index] = op = INDEX_op_mul_i32;
>> Very minor nitpick: you probably don't need to set op there.
>> 
> 
> Perhaps not, but I prefer to keep the variables in sync as we
> drop into common code...

The compiler should recognize the dead variable anyway.  How very meta.

Patch

diff --git a/tcg/optimize.c b/tcg/optimize.c
index 05891ef..a06c8eb 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -543,6 +543,9 @@  static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
             swap_commutative(args[0], &args[2], &args[4]);
             swap_commutative(args[1], &args[3], &args[5]);
             break;
+        case INDEX_op_mulu2_i32:
+            swap_commutative(args[0], &args[2], &args[3]);
+            break;
         case INDEX_op_brcond2_i32:
             if (swap_commutative2(&args[0], &args[2])) {
                 args[4] = tcg_swap_cond(args[4]);
@@ -831,6 +834,29 @@  static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
             }
             goto do_default;
 
+        case INDEX_op_mulu2_i32:
+            if (temps[args[2]].state == TCG_TEMP_CONST
+                && temps[args[3]].state == TCG_TEMP_CONST) {
+                uint32_t a = temps[args[2]].val;
+                uint32_t b = temps[args[3]].val;
+                uint64_t r = (uint64_t)a * b;
+                TCGArg rl, rh;
+
+                /* We emit the extra nop when we emit the mulu2.  */
+                assert(gen_opc_buf[op_index + 1] == INDEX_op_nop);
+
+                rl = args[0];
+                rh = args[1];
+                gen_opc_buf[op_index] = INDEX_op_movi_i32;
+                gen_opc_buf[++op_index] = INDEX_op_movi_i32;
+                tcg_opt_gen_movi(&gen_args[0], rl, (uint32_t)r);
+                tcg_opt_gen_movi(&gen_args[2], rh, (uint32_t)(r >> 32));
+                gen_args += 4;
+                args += 4;
+                break;
+            }
+            goto do_default;
+
         case INDEX_op_brcond2_i32:
             tmp = do_constant_folding_cond2(&args[0], &args[2], args[4]);
             if (tmp != 2) {
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 1f5a021..044e648 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -997,6 +997,8 @@  static inline void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 
     tcg_gen_op4_i32(INDEX_op_mulu2_i32, TCGV_LOW(t0), TCGV_HIGH(t0),
                     TCGV_LOW(arg1), TCGV_LOW(arg2));
+    /* Allow the optimizer room to replace mulu2 with two moves.  */
+    tcg_gen_op0(INDEX_op_nop);
 
     tcg_gen_mul_i32(t1, TCGV_LOW(arg1), TCGV_HIGH(arg2));
     tcg_gen_add_i32(TCGV_HIGH(t0), TCGV_HIGH(t0), t1);
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 21c1074..8280489 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1337,6 +1337,25 @@  static void tcg_liveness_analysis(TCGContext *s)
             }
             goto do_not_remove;
 
+        case INDEX_op_mulu2_i32:
+            args -= 4;
+            nb_iargs = 2;
+            nb_oargs = 2;
+            /* Likewise, test for the high part of the operation dead.  */
+            if (dead_temps[args[1]]) {
+                if (dead_temps[args[0]]) {
+                    goto do_remove;
+                }
+                gen_opc_buf[op_index] = op = INDEX_op_mul_i32;
+                args[1] = args[2];
+                args[2] = args[3];
+                assert(gen_opc_buf[op_index + 1] == INDEX_op_nop);
+                tcg_set_nop(s, gen_opc_buf + op_index + 1, args + 3, 1);
+                /* Fall through and mark the single-word operation live.  */
+                nb_oargs = 1;
+            }
+            goto do_not_remove;
+
         default:
             /* XXX: optimize by hardcoding common cases (e.g. triadic ops) */
             args -= def->nb_args;