Patchwork [01/10] tcg: Split out swap_commutative as a subroutine

login
register
mail settings
Submitter Richard Henderson
Date Oct. 2, 2012, 6:32 p.m.
Message ID <1349202750-16815-2-git-send-email-rth@twiddle.net>
Download mbox | patch
Permalink /patch/188605/
State New
Headers show

Comments

Richard Henderson - Oct. 2, 2012, 6:32 p.m.
Reduces code duplication and prefers

  movcond d, c1, c2, const, s
to
  movcond d, c1, c2, s, const

It also prefers

  add r, r, c
over
  add r, c, r

when both inputs are known constants.  This doesn't matter for true add, as
we will fully constant fold that.  But it matters for a follow-on patch using
this routine for add2 which may not be fully foldable.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/optimize.c | 56 ++++++++++++++++++++++++--------------------------------
 1 file changed, 24 insertions(+), 32 deletions(-)
Aurelien Jarno - Oct. 9, 2012, 3:13 p.m.
On Tue, Oct 02, 2012 at 11:32:21AM -0700, Richard Henderson wrote:
> Reduces code duplication and prefers
> 
>   movcond d, c1, c2, const, s
> to
>   movcond d, c1, c2, s, const
> 
> It also prefers
> 
>   add r, r, c
> over
>   add r, c, r
> 
> when both inputs are known constants.  This doesn't matter for true add, as
> we will fully constant fold that.  But it matters for a follow-on patch using
> this routine for add2 which may not be fully foldable.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/optimize.c | 56 ++++++++++++++++++++++++--------------------------------
>  1 file changed, 24 insertions(+), 32 deletions(-)
> 
> diff --git a/tcg/optimize.c b/tcg/optimize.c
> index 35532a1..5e0504a 100644
> --- a/tcg/optimize.c
> +++ b/tcg/optimize.c
> @@ -382,6 +382,23 @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
>      tcg_abort();
>  }
>  
> +static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
> +{
> +    TCGArg a1 = *p1, a2 = *p2;
> +    int sum = 0;
> +    sum += temps[a1].state == TCG_TEMP_CONST;
> +    sum -= temps[a2].state == TCG_TEMP_CONST;
> +
> +    /* Prefer the constant in second argument, and then the form
> +       op a, a, b, which is better handled on non-RISC hosts. */
> +    if (sum > 0 || (sum == 0 && dest == a2)) {
> +        *p1 = a2;
> +        *p2 = a1;
> +        return true;
> +    }
> +    return false;
> +}
> +

Does this sum += and -= actually generate better code than the previous
one? It's not obvious to read (fortunately the comment helps), so if it
doesn't bring any optimization, it's better to keep the previous form.

>  /* Propagate constants and copies, fold constant expressions. */
>  static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
>                                      TCGArg *args, TCGOpDef *tcg_op_defs)
> @@ -391,7 +408,6 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
>      const TCGOpDef *def;
>      TCGArg *gen_args;
>      TCGArg tmp;
> -    TCGCond cond;
>  
>      /* Array VALS has an element for each temp.
>         If this temp holds a constant then its value is kept in VALS' element.
> @@ -434,52 +450,28 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
>          CASE_OP_32_64(eqv):
>          CASE_OP_32_64(nand):
>          CASE_OP_32_64(nor):
> -            /* Prefer the constant in second argument, and then the form
> -               op a, a, b, which is better handled on non-RISC hosts. */
> -            if (temps[args[1]].state == TCG_TEMP_CONST || (args[0] == args[2]
> -                && temps[args[2]].state != TCG_TEMP_CONST)) {
> -                tmp = args[1];
> -                args[1] = args[2];
> -                args[2] = tmp;
> -            }
> +            swap_commutative(args[0], &args[1], &args[2]);
>              break;
>          CASE_OP_32_64(brcond):
> -            if (temps[args[0]].state == TCG_TEMP_CONST
> -                && temps[args[1]].state != TCG_TEMP_CONST) {
> -                tmp = args[0];
> -                args[0] = args[1];
> -                args[1] = tmp;
> +            if (swap_commutative(-1, &args[0], &args[1])) {
>                  args[2] = tcg_swap_cond(args[2]);
>              }
>              break;
>          CASE_OP_32_64(setcond):
> -            if (temps[args[1]].state == TCG_TEMP_CONST
> -                && temps[args[2]].state != TCG_TEMP_CONST) {
> -                tmp = args[1];
> -                args[1] = args[2];
> -                args[2] = tmp;
> +            if (swap_commutative(args[0], &args[1], &args[2])) {
>                  args[3] = tcg_swap_cond(args[3]);
>              }
>              break;
>          CASE_OP_32_64(movcond):
> -            cond = args[5];
> -            if (temps[args[1]].state == TCG_TEMP_CONST
> -                && temps[args[2]].state != TCG_TEMP_CONST) {
> -                tmp = args[1];
> -                args[1] = args[2];
> -                args[2] = tmp;
> -                cond = tcg_swap_cond(cond);
> +            if (swap_commutative(-1, &args[1], &args[2])) {
> +                args[5] = tcg_swap_cond(args[5]);
>              }
>              /* For movcond, we canonicalize the "false" input reg to match
>                 the destination reg so that the tcg backend can implement
>                 a "move if true" operation.  */
> -            if (args[0] == args[3]) {
> -                tmp = args[3];
> -                args[3] = args[4];
> -                args[4] = tmp;
> -                cond = tcg_invert_cond(cond);
> +            if (swap_commutative(args[0], &args[4], &args[3])) {
> +                args[5] = tcg_invert_cond(args[5]);
>              }
> -            args[5] = cond;
>          default:
>              break;
>          }

Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
Richard Henderson - Oct. 9, 2012, 3:23 p.m.
On 10/09/2012 08:13 AM, Aurelien Jarno wrote:
>> > It also prefers
>> > 
>> >   add r, r, c
>> > over
>> >   add r, c, r
>> > 
>> > when both inputs are known constants.  This doesn't matter for true add, as
>> > we will fully constant fold that.  But it matters for a follow-on patch using
>> > this routine for add2 which may not be fully foldable.
...
> Does this sum += and -= actually generates better code than the previous
> one? It's not something obvious to read (fortunately there is the
> comment for helping), so if it doesn't bring any optimization, it's
> better to keep the previous form.

Yes.  See the comment within the log above.


r~
Aurelien Jarno - Oct. 9, 2012, 3:31 p.m.
On Tue, Oct 09, 2012 at 08:23:27AM -0700, Richard Henderson wrote:
> On 10/09/2012 08:13 AM, Aurelien Jarno wrote:
> >> > It also prefers
> >> > 
> >> >   add r, r, c
> >> > over
> >> >   add r, c, r
> >> > 
> >> > when both inputs are known constants.  This doesn't matter for true add, as
> >> > we will fully constant fold that.  But it matters for a follow-on patch using
> >> > this routine for add2 which may not be fully foldable.
> ...
> > Does this sum += and -= actually generates better code than the previous
> > one? It's not something obvious to read (fortunately there is the
> > comment for helping), so if it doesn't bring any optimization, it's
> > better to keep the previous form.
> 
> Yes.  See the comment within the log above.

I am not talking about the code generated by TCG, but rather about the
code generated by GCC. Does using sum += and sum -= bring a gain
compared to the equivalent if-based form?
Richard Henderson - Oct. 9, 2012, 4:40 p.m.
On 10/09/2012 08:31 AM, Aurelien Jarno wrote:
> I am not talking about the code generated by TCG, but rather by the code
> generated by GCC. Does using sum += and sum -= brings a gain to compare
> to the equivalent if function?

It's hard to tell.  My guess is that it's about a wash.  Adding an
artificial __attribute__((noinline)) to make it easier to see:

SUM version

0000000000000190 <swap_commutative>:
     190:       48 83 ec 18             sub    $0x18,%rsp
     194:       4c 8b 06                mov    (%rsi),%r8
     197:       48 8b 0a                mov    (%rdx),%rcx
     19a:       64 48 8b 04 25 28 00    mov    %fs:0x28,%rax
     1a1:       00 00 
     1a3:       48 89 44 24 08          mov    %rax,0x8(%rsp)
     1a8:       31 c0                   xor    %eax,%eax
     1aa:       4c 89 c0                mov    %r8,%rax
     1ad:       49 89 c9                mov    %rcx,%r9
     1b0:       48 c1 e0 04             shl    $0x4,%rax
     1b4:       83 b8 00 00 00 00 01    cmpl   $0x1,0x0(%rax)
                        1b6: R_X86_64_32S       .bss
     1bb:       0f 94 c0                sete   %al
     1be:       49 c1 e1 04             shl    $0x4,%r9
     1c2:       41 83 b9 00 00 00 00    cmpl   $0x1,0x0(%r9)
     1c9:       01 
                        1c5: R_X86_64_32S       .bss
     1ca:       0f b6 c0                movzbl %al,%eax
     1cd:       41 0f 94 c1             sete   %r9b
     1d1:       45 0f b6 c9             movzbl %r9b,%r9d
     1d5:       44 29 c8                sub    %r9d,%eax
     1d8:       83 f8 01                cmp    $0x1,%eax
     1db:       75 23                   jne    200 <swap_commutative+0x70>
     1dd:       48 89 0e                mov    %rcx,(%rsi)
     1e0:       b8 01 00 00 00          mov    $0x1,%eax
     1e5:       4c 89 02                mov    %r8,(%rdx)
     1e8:       48 8b 54 24 08          mov    0x8(%rsp),%rdx
     1ed:       64 48 33 14 25 28 00    xor    %fs:0x28,%rdx
     1f4:       00 00 
     1f6:       75 15                   jne    20d <swap_commutative+0x7d>
     1f8:       48 83 c4 18             add    $0x18,%rsp
     1fc:       c3                      retq   
     1fd:       0f 1f 00                nopl   (%rax)
     200:       48 39 cf                cmp    %rcx,%rdi
     203:       75 04                   jne    209 <swap_commutative+0x79>
     205:       85 c0                   test   %eax,%eax
     207:       74 d4                   je     1dd <swap_commutative+0x4d>
     209:       31 c0                   xor    %eax,%eax
     20b:       eb db                   jmp    1e8 <swap_commutative+0x58>
     20d:       0f 1f 00                nopl   (%rax)
     210:       e8 00 00 00 00          callq  215 <swap_commutative+0x85>
                        211: R_X86_64_PC32      __stack_chk_fail-0x4

=======

    if ((temps[a1].state == TCG_TEMP_CONST
         && temps[a2].state != TCG_TEMP_CONST)
        || (dest == a2
            && ((temps[a1].state == TCG_TEMP_CONST
                 && temps[a2].state == TCG_TEMP_CONST)
                || (temps[a1].state != TCG_TEMP_CONST
                     && temps[a2].state != TCG_TEMP_CONST)))) {

0000000000000190 <swap_commutative>:
     190:       48 83 ec 18             sub    $0x18,%rsp
     194:       4c 8b 02                mov    (%rdx),%r8
     197:       64 48 8b 04 25 28 00    mov    %fs:0x28,%rax
     19e:       00 00 
     1a0:       48 89 44 24 08          mov    %rax,0x8(%rsp)
     1a5:       31 c0                   xor    %eax,%eax
     1a7:       48 8b 06                mov    (%rsi),%rax
     1aa:       48 89 c1                mov    %rax,%rcx
     1ad:       48 c1 e1 04             shl    $0x4,%rcx
     1b1:       83 b9 00 00 00 00 01    cmpl   $0x1,0x0(%rcx)
                        1b3: R_X86_64_32S       .bss
     1b8:       74 0e                   je     1c8 <swap_commutative+0x38>
     1ba:       4c 39 c7                cmp    %r8,%rdi
     1bd:       74 39                   je     1f8 <swap_commutative+0x68>
     1bf:       31 c0                   xor    %eax,%eax
     1c1:       eb 20                   jmp    1e3 <swap_commutative+0x53>
     1c3:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
     1c8:       4c 89 c1                mov    %r8,%rcx
     1cb:       48 c1 e1 04             shl    $0x4,%rcx
     1cf:       83 b9 00 00 00 00 01    cmpl   $0x1,0x0(%rcx)
                        1d1: R_X86_64_32S       .bss
     1d6:       74 36                   je     20e <swap_commutative+0x7e>
     1d8:       4c 89 06                mov    %r8,(%rsi)
     1db:       48 89 02                mov    %rax,(%rdx)
     1de:       b8 01 00 00 00          mov    $0x1,%eax
     1e3:       48 8b 54 24 08          mov    0x8(%rsp),%rdx
     1e8:       64 48 33 14 25 28 00    xor    %fs:0x28,%rdx
     1ef:       00 00 
     1f1:       75 16                   jne    209 <swap_commutative+0x79>
     1f3:       48 83 c4 18             add    $0x18,%rsp
     1f7:       c3                      retq   
     1f8:       48 c1 e7 04             shl    $0x4,%rdi
     1fc:       83 bf 00 00 00 00 01    cmpl   $0x1,0x0(%rdi)
                        1fe: R_X86_64_32S       .bss
     203:       75 d3                   jne    1d8 <swap_commutative+0x48>
     205:       31 c0                   xor    %eax,%eax
     207:       eb da                   jmp    1e3 <swap_commutative+0x53>
     209:       e8 00 00 00 00          callq  20e <swap_commutative+0x7e>
                        20a: R_X86_64_PC32      __stack_chk_fail-0x4
     20e:       4c 39 c7                cmp    %r8,%rdi
     211:       75 ac                   jne    1bf <swap_commutative+0x2f>
     213:       eb c3                   jmp    1d8 <swap_commutative+0x48>


r~

Patch

diff --git a/tcg/optimize.c b/tcg/optimize.c
index 35532a1..5e0504a 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -382,6 +382,23 @@  static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
     tcg_abort();
 }
 
+static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
+{
+    TCGArg a1 = *p1, a2 = *p2;
+    int sum = 0;
+    sum += temps[a1].state == TCG_TEMP_CONST;
+    sum -= temps[a2].state == TCG_TEMP_CONST;
+
+    /* Prefer the constant in second argument, and then the form
+       op a, a, b, which is better handled on non-RISC hosts. */
+    if (sum > 0 || (sum == 0 && dest == a2)) {
+        *p1 = a2;
+        *p2 = a1;
+        return true;
+    }
+    return false;
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
                                     TCGArg *args, TCGOpDef *tcg_op_defs)
@@ -391,7 +408,6 @@  static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
     const TCGOpDef *def;
     TCGArg *gen_args;
     TCGArg tmp;
-    TCGCond cond;
 
     /* Array VALS has an element for each temp.
        If this temp holds a constant then its value is kept in VALS' element.
@@ -434,52 +450,28 @@  static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         CASE_OP_32_64(eqv):
         CASE_OP_32_64(nand):
         CASE_OP_32_64(nor):
-            /* Prefer the constant in second argument, and then the form
-               op a, a, b, which is better handled on non-RISC hosts. */
-            if (temps[args[1]].state == TCG_TEMP_CONST || (args[0] == args[2]
-                && temps[args[2]].state != TCG_TEMP_CONST)) {
-                tmp = args[1];
-                args[1] = args[2];
-                args[2] = tmp;
-            }
+            swap_commutative(args[0], &args[1], &args[2]);
             break;
         CASE_OP_32_64(brcond):
-            if (temps[args[0]].state == TCG_TEMP_CONST
-                && temps[args[1]].state != TCG_TEMP_CONST) {
-                tmp = args[0];
-                args[0] = args[1];
-                args[1] = tmp;
+            if (swap_commutative(-1, &args[0], &args[1])) {
                 args[2] = tcg_swap_cond(args[2]);
             }
             break;
         CASE_OP_32_64(setcond):
-            if (temps[args[1]].state == TCG_TEMP_CONST
-                && temps[args[2]].state != TCG_TEMP_CONST) {
-                tmp = args[1];
-                args[1] = args[2];
-                args[2] = tmp;
+            if (swap_commutative(args[0], &args[1], &args[2])) {
                 args[3] = tcg_swap_cond(args[3]);
             }
             break;
         CASE_OP_32_64(movcond):
-            cond = args[5];
-            if (temps[args[1]].state == TCG_TEMP_CONST
-                && temps[args[2]].state != TCG_TEMP_CONST) {
-                tmp = args[1];
-                args[1] = args[2];
-                args[2] = tmp;
-                cond = tcg_swap_cond(cond);
+            if (swap_commutative(-1, &args[1], &args[2])) {
+                args[5] = tcg_swap_cond(args[5]);
             }
             /* For movcond, we canonicalize the "false" input reg to match
                the destination reg so that the tcg backend can implement
                a "move if true" operation.  */
-            if (args[0] == args[3]) {
-                tmp = args[3];
-                args[3] = args[4];
-                args[4] = tmp;
-                cond = tcg_invert_cond(cond);
+            if (swap_commutative(args[0], &args[4], &args[3])) {
+                args[5] = tcg_invert_cond(args[5]);
             }
-            args[5] = cond;
         default:
             break;
         }