
[RFA,PR,tree-optimization/45685]

Message ID 52A80C3C.1090805@redhat.com
State New

Commit Message

Jeff Law Dec. 11, 2013, 6:54 a.m. UTC
So for this source, compiled for x86_64 with -O3:

typedef unsigned long int uint64_t;
typedef long int int64_t;
int summation_helper_1(int64_t* products, uint64_t count)
{
         int s = 0;
         uint64_t i;
         for(i=0; i<count; i++)
         {
                 int64_t val = (products[i]>0) ? 1 : -1;
                 products[i] *= val;
                 if(products[i] != i)
                         val = -val;
                 products[i] = val;
                 s += val;
         }
         return s;
}


int summation_helper_2(int64_t* products, uint64_t count)
{
         int s = 0;
         uint64_t i;
         for(i=0; i<count; i++)
         {
                 int val = (products[i]>0) ? 1 : -1;
                 products[i] *= val;
                 if(products[i] != i)
                         val = -val;
                 products[i] = val;
                 s += val;
         }
         return s;
}


The loops we generate are pretty bad and have regressed relative to 
older versions of GCC.

For the first loop, we have the following .optimized output for the loop:

   <bb 4>:
   # s_28 = PHI <s_20(5), 0(3)>
   # i_27 = PHI <i_21(5), 0(3)>
   _11 = MEM[base: products_9(D), index: i_27, step: 8, offset: 0B];
   val_4 = _11 > 0 ? 1 : -1;
   prephitmp_38 = _11 > 0 ? -1 : 1;
   prephitmp_39 = _11 > 0 ? 4294967295 : 1;
   prephitmp_41 = _11 > 0 ? 1 : 4294967295;
   _12 = val_4 * _11;
   _14 = (long unsigned int) _12;
   val_3 = _14 != i_27 ? prephitmp_38 : val_4;
   prephitmp_44 = _14 != i_27 ? prephitmp_39 : prephitmp_41;
   MEM[base: products_9(D), index: i_27, step: 8, offset: 0B] = val_3;
   s.1_18 = (unsigned int) s_28;
   _19 = prephitmp_44 + s.1_18;
   s_20 = (int) _19;
   i_21 = i_27 + 1;
   if (i_21 != count_7(D))
     goto <bb 5>;
   else
     goto <bb 6>;

   <bb 5>:
   goto <bb 4>;


Note the series of COND_EXPRs.  A couple of them are just conditional
negations, which can be implemented with a straight-line code sequence.
Using that straight-line sequence results in:

   <bb 4>:
   # s_31 = PHI <s_20(5), 0(3)>
   # i_32 = PHI <i_21(5), 0(3)>
   _11 = MEM[base: products_9(D), index: i_32, step: 8, offset: 0B];
   val_4 = _11 > 0 ? 1 : -1;
   _12 = val_4 * _11;
   _14 = (long unsigned int) _12;
   _24 = _14 != i_32;
   _25 = (int64_t) _24;
   _29 = -_25;
   _28 = _29 ^ val_4;
   _27 = _28 + _25;
   MEM[base: products_9(D), index: i_32, step: 8, offset: 0B] = _27;
   _17 = (unsigned int) _27;
   s.1_18 = (unsigned int) s_31;
   _19 = _17 + s.1_18;
   s_20 = (int) _19;
   i_21 = i_32 + 1;
   if (i_21 != count_7(D))
     goto <bb 5>;
   else
     goto <bb 6>;

   <bb 5>:
   goto <bb 4>;

This *appears* worse.  However, that code can be handled much more
easily by the RTL optimizers.  When we look at what the trunk generates
at the assembly level, we have:

.L3:
         movq    (%rdi,%rcx,8), %rdx
         testq   %rdx, %rdx
         setg    %r8b
         movzbl  %r8b, %r10d
         movzbl  %r8b, %r8d
         leaq    -1(%r10,%r10), %r10
         leal    -1(%r8,%r8), %r8d
         movq    %r10, %r11
         imulq   %rdx, %r11
         testq   %rdx, %rdx
         setle   %dl
         movzbl  %dl, %r9d
         movzbl  %dl, %edx
         leaq    -1(%r9,%r9), %r9
         leal    -1(%rdx,%rdx), %edx
         cmpq    %rcx, %r11
         cmove   %r10, %r9
         cmove   %r8d, %edx
         movq    %r9, (%rdi,%rcx,8)
         addq    $1, %rcx
         addl    %edx, %eax
         cmpq    %rsi, %rcx
         jne     .L3
(Ick)

With the conditional negation patch that turns into:

.L3:
         movq    (%rdi,%rcx,8), %r8
         xorl    %edx, %edx
         testq   %r8, %r8
         setg    %dl
         leaq    -1(%rdx,%rdx), %rdx
         imulq   %rdx, %r8
         cmpq    %rcx, %r8
         setne   %r8b
         movzbl  %r8b, %r8d
         movq    %r8, %r9
         negq    %r9
         xorq    %r9, %rdx
         addq    %r8, %rdx
         movq    %rdx, (%rdi,%rcx,8)
         addq    $1, %rcx
         addl    %edx, %eax
         cmpq    %rsi, %rcx
         jne     .L3

No branches within the loop, and no conditional moves either.  In all,
it's 5 instructions shorter.
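
For reference, the straight-line sequence is the standard two's-complement
conditional-negation identity.  A minimal C sketch (purely illustrative,
not part of the patch; the helper name is made up):

/* Branchless conditional negation: returns cond ? -x : x, for cond in {0,1}.
   -cond is all ones when cond is 1 and zero when cond is 0, so the XOR
   yields ~x or x, and adding cond completes the two's-complement negation.  */
static inline long
cond_neg (long x, int cond)
{
  long mask = -(long) cond;
  return (x ^ mask) + cond;   /* i.e. result = (rhs ^ -cond) + cond */
}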


The second loop shows similar effects, though they're not as dramatic.

Before:
   <bb 4>:
   # s_27 = PHI <s_19(5), 0(3)>
   # i_26 = PHI <i_20(5), 0(3)>
   _11 = MEM[base: products_9(D), index: i_26, step: 8, offset: 0B];
   val_4 = _11 > 0 ? 1 : -1;
   prephitmp_32 = _11 > 0 ? 1 : -1;
   prephitmp_33 = _11 > 0 ? -1 : 1;
   prephitmp_34 = _11 > 0 ? -1 : 1;
   _13 = _11 * prephitmp_32;
   _15 = (long unsigned int) _13;
   val_3 = _15 != i_26 ? prephitmp_33 : val_4;
   prephitmp_36 = _15 != i_26 ? prephitmp_34 : prephitmp_32;
   MEM[base: products_9(D), index: i_26, step: 8, offset: 0B] = prephitmp_36;
   s_19 = val_3 + s_27;
   i_20 = i_26 + 1;
   if (i_20 != count_7(D))
     goto <bb 5>;
   else
     goto <bb 6>;

  <bb 5>:
   goto <bb 4>;


Which results in the following assembly:

.L8:
         movq    (%rdi,%r8,8), %rdx
         testq   %rdx, %rdx
         movq    %rdx, %r11
         setg    %cl
         movzbl  %cl, %r10d
         movzbl  %cl, %ecx
         leaq    -1(%rcx,%rcx), %rcx
         leal    -1(%r10,%r10), %r10d
         imulq   %rcx, %r11
         testq   %rdx, %rdx
         setle   %dl
         movzbl  %dl, %r9d
         movzbl  %dl, %edx
         leaq    -1(%r9,%r9), %r9
         leal    -1(%rdx,%rdx), %edx
         cmpq    %r8, %r11
         cmovne  %r9, %rcx
         cmove   %r10d, %edx
         movq    %rcx, (%rdi,%r8,8)
         addq    $1, %r8
         addl    %edx, %eax
         cmpq    %rsi, %r8
         jne     .L8



With the conditional negation patch:

  <bb 4>:
   # s_31 = PHI <s_20(5), 0(3)>
   # i_32 = PHI <i_21(5), 0(3)>
   _11 = MEM[base: products_9(D), index: i_32, step: 8, offset: 0B];
   val_4 = _11 > 0 ? 1 : -1;
   _12 = val_4 * _11;
   _14 = (long unsigned int) _12;
   _24 = _14 != i_32;
   _25 = (int64_t) _24;
   _29 = -_25;
   _28 = _29 ^ val_4;
   _27 = _28 + _25;
   MEM[base: products_9(D), index: i_32, step: 8, offset: 0B] = _27;
   _17 = (unsigned int) _27;
   s.1_18 = (unsigned int) s_31;
   _19 = _17 + s.1_18;
   s_20 = (int) _19;
   i_21 = i_32 + 1;
   if (i_21 != count_7(D))
     goto <bb 5>;
   else
     goto <bb 6>;

  <bb 5>:
   goto <bb 4>;

This again looks worse than the original, but optimizes well into:

.L8:
         movq    (%rdi,%r8,8), %r9
         testq   %r9, %r9
         setg    %cl
         movzbl  %cl, %edx
         leaq    -1(%rdx,%rdx), %rdx
         imulq   %r9, %rdx
         xorl    %r9d, %r9d
         cmpq    %r8, %rdx
         movzbl  %cl, %edx
         setne   %r9b
         leal    -1(%rdx,%rdx), %edx
         movl    %r9d, %r10d
         negl    %r10d
         xorl    %r10d, %edx
         addl    %r9d, %edx
         movslq  %edx, %rcx
         addl    %edx, %eax
         movq    %rcx, (%rdi,%r8,8)
         addq    $1, %r8
         cmpq    %rsi, %r8
         jne     .L8


Bootstrapped and regression tested on x86_64-unknown-linux-gnu.  OK for
the trunk?

	PR tree-optimization/45685
	* tree-ssa-phiopt.c (neg_replacement): New function.
	(tree_ssa_phiopt_worker): Call it.

	PR tree-optimization/45685
	* gcc.dg/tree-ssa/pr45685.c: New test.

Comments

Richard Biener Dec. 11, 2013, 9:51 a.m. UTC | #1
On Wed, Dec 11, 2013 at 7:54 AM, Jeff Law <law@redhat.com> wrote:
>
> So for this source, compiled for x86_64 with -O3:
>
> typedef unsigned long int uint64_t;
> typedef long int int64_t;
> int summation_helper_1(int64_t* products, uint64_t count)
> {
>         int s = 0;
>         uint64_t i;
>         for(i=0; i<count; i++)
>         {
>                 int64_t val = (products[i]>0) ? 1 : -1;
>                 products[i] *= val;
>                 if(products[i] != i)
>                         val = -val;
>                 products[i] = val;
>                 s += val;
>         }
>         return s;
> }
>
>
> int summation_helper_2(int64_t* products, uint64_t count)
> {
>         int s = 0;
>         uint64_t i;
>         for(i=0; i<count; i++)
>         {
>                 int val = (products[i]>0) ? 1 : -1;
>                 products[i] *= val;
>                 if(products[i] != i)
>                         val = -val;
>                 products[i] = val;
>                 s += val;
>         }
>         return s;
> }
>
>
> The loops we generate are pretty bad and have regressed relative to older
> versions of GCC.
>
> For the first loop, we have the following .optimized output for the loop:
>
>   <bb 4>:
>   # s_28 = PHI <s_20(5), 0(3)>
>   # i_27 = PHI <i_21(5), 0(3)>
>   _11 = MEM[base: products_9(D), index: i_27, step: 8, offset: 0B];
>   val_4 = _11 > 0 ? 1 : -1;
>   prephitmp_38 = _11 > 0 ? -1 : 1;
>   prephitmp_39 = _11 > 0 ? 4294967295 : 1;
>   prephitmp_41 = _11 > 0 ? 1 : 4294967295;
>   _12 = val_4 * _11;
>   _14 = (long unsigned int) _12;
>   val_3 = _14 != i_27 ? prephitmp_38 : val_4;
>   prephitmp_44 = _14 != i_27 ? prephitmp_39 : prephitmp_41;
>   MEM[base: products_9(D), index: i_27, step: 8, offset: 0B] = val_3;
>   s.1_18 = (unsigned int) s_28;
>   _19 = prephitmp_44 + s.1_18;
>   s_20 = (int) _19;
>   i_21 = i_27 + 1;
>   if (i_21 != count_7(D))
>     goto <bb 5>;
>   else
>     goto <bb 6>;
>
>   <bb 5>:
>   goto <bb 4>;
>
>
> Note the series of COND_EXPRs.  A couple are just conditional negation which
> can be implemented with a straight-line code sequence.   Using that
> straight-line sequence results in:
>
>   <bb 4>:
>   # s_31 = PHI <s_20(5), 0(3)>
>   # i_32 = PHI <i_21(5), 0(3)>
>   _11 = MEM[base: products_9(D), index: i_32, step: 8, offset: 0B];
>   val_4 = _11 > 0 ? 1 : -1;
>   _12 = val_4 * _11;
>   _14 = (long unsigned int) _12;
>   _24 = _14 != i_32;
>   _25 = (int64_t) _24;
>   _29 = -_25;
>   _28 = _29 ^ val_4;
>   _27 = _28 + _25;
>   MEM[base: products_9(D), index: i_32, step: 8, offset: 0B] = _27;
>   _17 = (unsigned int) _27;
>   s.1_18 = (unsigned int) s_31;
>   _19 = _17 + s.1_18;
>   s_20 = (int) _19;
>   i_21 = i_32 + 1;
>   if (i_21 != count_7(D))
>     goto <bb 5>;
>   else
>     goto <bb 6>;
>
>   <bb 5>:
>   goto <bb 4>;
>
> Which *appears* worse.  However, that code can much more easily be handled
> by the RTL optimizers.    When we look at what the trunk generates at the
> assembly level we have:
>
> .L3:
>         movq    (%rdi,%rcx,8), %rdx
>         testq   %rdx, %rdx
>         setg    %r8b
>         movzbl  %r8b, %r10d
>         movzbl  %r8b, %r8d
>         leaq    -1(%r10,%r10), %r10
>         leal    -1(%r8,%r8), %r8d
>         movq    %r10, %r11
>         imulq   %rdx, %r11
>         testq   %rdx, %rdx
>         setle   %dl
>         movzbl  %dl, %r9d
>         movzbl  %dl, %edx
>         leaq    -1(%r9,%r9), %r9
>         leal    -1(%rdx,%rdx), %edx
>         cmpq    %rcx, %r11
>         cmove   %r10, %r9
>         cmove   %r8d, %edx
>         movq    %r9, (%rdi,%rcx,8)
>         addq    $1, %rcx
>         addl    %edx, %eax
>         cmpq    %rsi, %rcx
>         jne     .L3
> (Ick)
>
> With the conditional negation patch that turns into:
>
> L3:
>         movq    (%rdi,%rcx,8), %r8
>         xorl    %edx, %edx
>         testq   %r8, %r8
>         setg    %dl
>         leaq    -1(%rdx,%rdx), %rdx
>         imulq   %rdx, %r8
>         cmpq    %rcx, %r8
>         setne   %r8b
>         movzbl  %r8b, %r8d
>         movq    %r8, %r9
>         negq    %r9
>         xorq    %r9, %rdx
>         addq    %r8, %rdx
>         movq    %rdx, (%rdi,%rcx,8)
>         addq    $1, %rcx
>         addl    %edx, %eax
>         cmpq    %rsi, %rcx
>         jne     .L3
>
> No branches within the loop, no conditional moves either.  In all it's 5
> instructions shorter.
>
>
> The second loop shows similar effects, though they're not as dramatic.
>
> Before:
>   <bb 4>:
>   # s_27 = PHI <s_19(5), 0(3)>
>   # i_26 = PHI <i_20(5), 0(3)>
>   _11 = MEM[base: products_9(D), index: i_26, step: 8, offset: 0B];
>   val_4 = _11 > 0 ? 1 : -1;
>   prephitmp_32 = _11 > 0 ? 1 : -1;
>   prephitmp_33 = _11 > 0 ? -1 : 1;
>   prephitmp_34 = _11 > 0 ? -1 : 1;
>   _13 = _11 * prephitmp_32;
>   _15 = (long unsigned int) _13;
>   val_3 = _15 != i_26 ? prephitmp_33 : val_4;
>   prephitmp_36 = _15 != i_26 ? prephitmp_34 : prephitmp_32;
>   MEM[base: products_9(D), index: i_26, step: 8, offset: 0B] = prephitmp_36;
>   s_19 = val_3 + s_27;
>   i_20 = i_26 + 1;
>   if (i_20 != count_7(D))
>     goto <bb 5>;
>   else
>     goto <bb 6>;
>
>  <bb 5>:
>   goto <bb 4>;
>
>
> Which results in the following assembly:
>
> .L8:
>         movq    (%rdi,%r8,8), %rdx
>         testq   %rdx, %rdx
>         movq    %rdx, %r11
>         setg    %cl
>         movzbl  %cl, %r10d
>         movzbl  %cl, %ecx
>         leaq    -1(%rcx,%rcx), %rcx
>         leal    -1(%r10,%r10), %r10d
>         imulq   %rcx, %r11
>         testq   %rdx, %rdx
>         setle   %dl
>         movzbl  %dl, %r9d
>         movzbl  %dl, %edx
>         leaq    -1(%r9,%r9), %r9
>         leal    -1(%rdx,%rdx), %edx
>         cmpq    %r8, %r11
>         cmovne  %r9, %rcx
>         cmove   %r10d, %edx
>         movq    %rcx, (%rdi,%r8,8)
>         addq    $1, %r8
>         addl    %edx, %eax
>         cmpq    %rsi, %r8
>         jne     .L8
>
>
>
> With the conditional negation patch:
>
>  <bb 4>:
>   # s_31 = PHI <s_20(5), 0(3)>
>   # i_32 = PHI <i_21(5), 0(3)>
>   _11 = MEM[base: products_9(D), index: i_32, step: 8, offset: 0B];
>   val_4 = _11 > 0 ? 1 : -1;
>   _12 = val_4 * _11;
>   _14 = (long unsigned int) _12;
>   _24 = _14 != i_32;
>   _25 = (int64_t) _24;
>   _29 = -_25;
>   _28 = _29 ^ val_4;
>   _27 = _28 + _25;
>   MEM[base: products_9(D), index: i_32, step: 8, offset: 0B] = _27;
>   _17 = (unsigned int) _27;
>   s.1_18 = (unsigned int) s_31;
>   _19 = _17 + s.1_18;
>   s_20 = (int) _19;
>   i_21 = i_32 + 1;
>   if (i_21 != count_7(D))
>     goto <bb 5>;
>   else
>     goto <bb 6>;
>
>  <bb 5>:
>   goto <bb 4>;
>
> Which again looks worse than the original, but optimizes well into:
>
> .L8:
>         movq    (%rdi,%r8,8), %r9
>         testq   %r9, %r9
>         setg    %cl
>         movzbl  %cl, %edx
>         leaq    -1(%rdx,%rdx), %rdx
>         imulq   %r9, %rdx
>         xorl    %r9d, %r9d
>         cmpq    %r8, %rdx
>         movzbl  %cl, %edx
>         setne   %r9b
>         leal    -1(%rdx,%rdx), %edx
>         movl    %r9d, %r10d
>         negl    %r10d
>         xorl    %r10d, %edx
>         addl    %r9d, %edx
>         movslq  %edx, %rcx
>         addl    %edx, %eax
>         movq    %rcx, (%rdi,%r8,8)
>         addq    $1, %r8
>         cmpq    %rsi, %r8
>         jne     .L8
>
>
> Bootstrapped and regression tested on x86_64-unknown-linux-gnu.  OK for the
> trunk?

First of all, phiopt runs unconditionally for -On with n > 0, but the
conversion is clearly not suitable for non-speed optimizations.  Thus
I'd guard it with at least !optimize_size && optimize >= 2.  As you are
targeting a worse transformation done by if-conversion, you may want to
add || (((flag_tree_loop_vectorize || cfun->has_force_vect_loops)
           && flag_tree_loop_if_convert != 0)
          || flag_tree_loop_if_convert == 1
          || flag_tree_loop_if_convert_stores == 1)
(ugh).
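
Concretely, that gating might look something like the following at the
top of neg_replacement (just a sketch assembled from the flags named
above, untested; the local variable name is made up):

  bool want_neg_replacement
    = ((optimize >= 2 && !optimize_size)
       || ((flag_tree_loop_vectorize || cfun->has_force_vect_loops)
           && flag_tree_loop_if_convert != 0)
       || flag_tree_loop_if_convert == 1
       || flag_tree_loop_if_convert_stores == 1);
  if (!want_neg_replacement)
    return false;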

More comments below.

>
>         PR tree-optimization/45685
>         * tree-ssa-phiopt.c (neg_replacement): New function.
>         (tree_ssa_phiopt_worker): Call it.
>
>         PR tree-optimization/45685
>         * gcc.dg/tree-ssa/pr45685.c: New test.
>
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr45685.c
> b/gcc/testsuite/gcc.dg/tree-ssa/pr45685.c
> new file mode 100644
> index 0000000..0628943
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr45685.c
> @@ -0,0 +1,41 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fdump-tree-phiopt1-details" } */
> +
> +typedef unsigned long int uint64_t;
> +typedef long int int64_t;
> +int summation_helper_1(int64_t* products, uint64_t count)
> +{
> +       int s = 0;
> +       uint64_t i;
> +       for(i=0; i<count; i++)
> +       {
> +               int64_t val = (products[i]>0) ? 1 : -1;
> +               products[i] *= val;
> +               if(products[i] != i)
> +                       val = -val;
> +               products[i] = val;
> +               s += val;
> +       }
> +       return s;
> +}
> +
> +
> +int summation_helper_2(int64_t* products, uint64_t count)
> +{
> +       int s = 0;
> +       uint64_t i;
> +       for(i=0; i<count; i++)
> +       {
> +               int val = (products[i]>0) ? 1 : -1;
> +               products[i] *= val;
> +               if(products[i] != i)
> +                       val = -val;
> +               products[i] = val;
> +               s += val;
> +       }
> +       return s;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "converted to straightline code" 2
> "phiopt1" } } */
> +/* { dg-final { cleanup-tree-dump "phiopt1" } } */
> +
> diff --git a/gcc/tree-ssa-phiopt.c b/gcc/tree-ssa-phiopt.c
> index 11e565f..2522255 100644
> --- a/gcc/tree-ssa-phiopt.c
> +++ b/gcc/tree-ssa-phiopt.c
> @@ -69,6 +69,8 @@ static bool minmax_replacement (basic_block, basic_block,
>                                 edge, edge, gimple, tree, tree);
>  static bool abs_replacement (basic_block, basic_block,
>                              edge, edge, gimple, tree, tree);
> +static bool neg_replacement (basic_block, basic_block,
> +                            edge, edge, gimple, tree, tree);
>  static bool cond_store_replacement (basic_block, basic_block, edge, edge,
>                                     struct pointer_set_t *);
>  static bool cond_if_else_store_replacement (basic_block, basic_block,
> basic_block);
> @@ -489,6 +491,8 @@ tree_ssa_phiopt_worker (bool do_store_elim, bool
> do_hoist_loads)
>             cfgchanged = true;
>           else if (abs_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
>             cfgchanged = true;
> +         else if (neg_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
> +           cfgchanged = true;
>           else if (minmax_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
>             cfgchanged = true;
>         }
> @@ -1285,6 +1289,143 @@ abs_replacement (basic_block cond_bb, basic_block
> middle_bb,
>    return true;
>  }
>
> +/*  The function neg_replacement replaces conditional negation with
> +    equivalent straight line code.  Returns TRUE if replacement is done,
> +    otherwise returns FALSE.
> +
> +    COND_BB branches around negation occurring in MIDDLE_BB.
> +
> +    E0 and E1 are edges out of COND_BB.  E0 reaches MIDDLE_BB and
> +    E1 reaches the other successor which should contain PHI with
> +    arguments ARG0 and ARG1.
> +
> +    Assuming negation is to occur when the condition is true,
> +    then the non-branching sequence is:
> +
> +       result = (rhs ^ -cond) + cond
> +
> +    Inverting the condition or its result gives us negation
> +    when the original condition is false.  */
> +
> +static bool
> +neg_replacement (basic_block cond_bb, basic_block middle_bb,
> +                edge e0 ATTRIBUTE_UNUSED, edge e1,
> +                gimple phi, tree arg0, tree arg1)
> +{
> +  gimple new_stmt, cond;
> +  gimple_stmt_iterator gsi;
> +  gimple assign;
> +  edge true_edge, false_edge;
> +  tree rhs, lhs;
> +  enum tree_code cond_code;
> +  bool invert = false;
> +
> +  /* This transformation performs logical operations on the
> +     incoming arguments.  So force them to be integral types.   */
> +  if (!INTEGRAL_TYPE_P (TREE_TYPE (arg0)))
> +    return false;
> +  /* OTHER_BLOCK must have only one executable statement which must have
> the
> +     form arg0 = -arg1 or arg1 = -arg0.  */
> +
> +  assign = last_and_only_stmt (middle_bb);
> +  /* If we did not find the proper negation assignment, then we can not
> +     optimize.  */
> +  if (assign == NULL)
> +    return false;
> +
> +  /* If we got here, then we have found the only executable statement
> +     in OTHER_BLOCK.  If it is anything other than arg0 = -arg1 or
> +     arg1 = -arg0, then we can not optimize.  */
> +  if (gimple_code (assign) != GIMPLE_ASSIGN)
> +    return false;
> +
> +  lhs = gimple_assign_lhs (assign);
> +
> +  if (gimple_assign_rhs_code (assign) != NEGATE_EXPR)
> +    return false;
> +
> +  rhs = gimple_assign_rhs1 (assign);
> +
> +  /* The assignment has to be arg0 = -arg1 or arg1 = -arg0.  */
> +  if (!(lhs == arg0 && rhs == arg1)
> +      && !(lhs == arg1 && rhs == arg0))
> +    return false;
> +
> +  /* The basic sequence assumes we negate when the condition is true.
> +     If we need the opposite, then we will either need to invert the
> +     condition or its result.  */
> +  extract_true_false_edges_from_block (cond_bb, &true_edge, &false_edge);
> +  invert = false_edge->dest == middle_bb;
> +
> +  /* Unlike abs_replacement, we can handle arbitrary conditionals here.  */
> +  cond = last_stmt (cond_bb);
> +  cond_code = gimple_cond_code (cond);
> +
> +  /* If inversion is needed, first try to invert the test since
> +     that's cheapest.  */
> +  if (invert)
> +    {
> +      enum tree_code new_code
> +       = invert_tree_comparison (cond_code,
> +                                 HONOR_NANS (TYPE_MODE (TREE_TYPE (rhs))));

That looks wrong - you want to look at HONOR_NANS on the mode
of one of the comparison operands, not of the actual value you want
to negate (it's integer and thus never has NaNs).

The rest of the patch looks ok to me.

Thanks,
Richard.

> +
> +      /* If invert_tree_comparison was successful, then use its return
> +        value as the new code and note that inversion is no longer
> +        needed.  */
> +      if (new_code != ERROR_MARK)
> +       {
> +         cond_code = new_code;
> +         invert = false;
> +       }
> +    }
> +
> +  tree cond_val = make_ssa_name (boolean_type_node, NULL);
> +  new_stmt = gimple_build_assign_with_ops (cond_code, cond_val,
> +                                          gimple_cond_lhs (cond),
> +                                          gimple_cond_rhs (cond));
> +  gsi = gsi_last_bb (cond_bb);
> +  gsi_insert_before (&gsi, new_stmt, GSI_NEW_STMT);
> +
> +  /* If we still need inversion, then invert the result of the
> +     condition.  */
> +  if (invert)
> +    {
> +      tree tmp = make_ssa_name (boolean_type_node, NULL);
> +      new_stmt = gimple_build_assign_with_ops (BIT_XOR_EXPR, tmp,
> +                                              cond_val, boolean_true_node);
> +      gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
> +      cond_val = tmp;
> +    }
> +
> +  /* Get the condition in the right type so that we can perform
> +     logical and arithmetic operations on it.  */
> +  tree cond_val_converted = make_ssa_name (TREE_TYPE (rhs), NULL);
> +  new_stmt = gimple_build_assign_with_ops (NOP_EXPR, cond_val_converted,
> +                                          cond_val, NULL_TREE);
> +  gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
> +
> +  tree neg_cond_val_converted = make_ssa_name (TREE_TYPE (rhs), NULL);
> +  new_stmt = gimple_build_assign_with_ops (NEGATE_EXPR,
> neg_cond_val_converted,
> +                                          cond_val_converted, NULL_TREE);
> +  gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
> +
> +  tree tmp = make_ssa_name (TREE_TYPE (rhs), NULL);
> +  new_stmt = gimple_build_assign_with_ops (BIT_XOR_EXPR, tmp,
> +                                          rhs, neg_cond_val_converted);
> +  gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
> +
> +  tree new_lhs = make_ssa_name (TREE_TYPE (rhs), NULL);
> +  new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, new_lhs,
> +                                          tmp, cond_val_converted);
> +  gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
> +
> +  replace_phi_edge_with_variable (cond_bb, e1, phi, new_lhs);
> +
> +  /* Note that we optimized this PHI.  */
> +  return true;
> +}
> +
>  /* Auxiliary functions to determine the set of memory accesses which
>     can't trap because they are preceded by accesses to the same memory
>     portion.  We do that for MEM_REFs, so we only need to track
>
Jeff Law Dec. 11, 2013, 2:51 p.m. UTC | #2
On 12/11/13 02:51, Richard Biener wrote:
>
> First of all phiopt runs unconditionally for -On with n > 0 but the conversion
> is clearly not suitable for non-speed optimizations.  Thus I'd guard it
> with at least !optimize_size && optimize >= 2.  As you are targeting
> a worse transformation done by if-conversion you may want to
> add || (((flag_tree_loop_vectorize || cfun->has_force_vect_loops)
>             && flag_tree_loop_if_convert != 0)
>            || flag_tree_loop_if_convert == 1
>            || flag_tree_loop_if_convert_stores == 1)
> (ugh).
That's a hell of a condition to guard a transformation.  But, yea, I agree.

>> +
>> +  /* If inversion is needed, first try to invert the test since
>> +     that's cheapest.  */
>> +  if (invert)
>> +    {
>> +      enum tree_code new_code
>> +       = invert_tree_comparison (cond_code,
>> +                                 HONOR_NANS (TYPE_MODE (TREE_TYPE (rhs))));
>
> That looks wrong - you want to look at HONOR_NANS on the mode
> of one of the comparison operands, not of the actual value you want
> to negate (it's integer and thus never has NaNs).
Bah.  That was supposed to be HONOR_SIGNED_ZEROS, which as far as I can
tell is a property of the value being tested.

So it seems to me it should be

HONOR_SIGNED_ZEROS (TYPE_MODE (TREE_TYPE (one of the conditional's arguments)))

much like is done in abs_replacement.  The difference is that we want to
look at the comparison (which may have different arguments than the PHI
we're converting) and that we can still apply the optimization, just in
a slightly different way.

jeff
Richard Biener Dec. 11, 2013, 3:11 p.m. UTC | #3
On Wed, Dec 11, 2013 at 3:51 PM, Jeff Law <law@redhat.com> wrote:
> On 12/11/13 02:51, Richard Biener wrote:
>>
>>
>> First of all phiopt runs unconditionally for -On with n > 0 but the
>> conversion
>> is clearly not suitable for non-speed optimizations.  Thus I'd guard it
>> with at least !optimize_size && optimize >= 2.  As you are targeting
>> a worse transformation done by if-conversion you may want to
>> add || (((flag_tree_loop_vectorize || cfun->has_force_vect_loops)
>>             && flag_tree_loop_if_convert != 0)
>>            || flag_tree_loop_if_convert == 1
>>            || flag_tree_loop_if_convert_stores == 1)
>> (ugh).
>
> That's a hell of a condition to guard a transformation.  But, yea, I agree.
>
>
>>> +
>>> +  /* If inversion is needed, first try to invert the test since
>>> +     that's cheapest.  */
>>> +  if (invert)
>>> +    {
>>> +      enum tree_code new_code
>>> +       = invert_tree_comparison (cond_code,
>>> +                                 HONOR_NANS (TYPE_MODE (TREE_TYPE
>>> (rhs))));
>>
>>
>> That looks wrong - you want to look at HONOR_NANS on the mode
>> of one of the comparison operands, not of the actual value you want
>> to negate (it's integer and thus never has NaNs).
>
> Bah.  That was supposed to be HONOR_SIGNED_ZEROS.  Which as far as I can
> tell is a property of the value being tested.

No, it's

invert_tree_comparison (enum tree_code code, bool honor_nans)

so indeed HONOR_NANS.  And yes, on an argument of the conditional
(it can be an FP comparison but an integer negate).

Richard.

> So it seems to me it should be
>
> HONOR_SIGNED_ZEROS (TYPE_MODE (TREE_TYPE (one of the conditional's
> arguments)))
>
> much like is done in abs_replacement.  With the difference we want to look
> at the comparison (which may have different arguments than the PHI we're
> converting) and that we can still apply the optimization, just in a slightly
> different way.
>
> jeff
>
Jeff Law Dec. 11, 2013, 4:32 p.m. UTC | #4
On 12/11/13 08:11, Richard Biener wrote:
>> Bah.  That was supposed to be HONOR_SIGNED_ZEROS.  Which as far as I can
>> tell is a property of the value being tested.
>
> No, it's
>
> invert_tree_comparison (enum tree_code code, bool honor_nans)
>
> so indeed HONOR_NANS.  And yes, on a conditional argument
> (it can be a FP comparison but a integer negate).
I realized I was wrong after I went downstairs for breakfast :-)  Ignore
my last message WRT HONOR_NANS and HONOR_SIGNED_ZEROS.

Jeff
Jeff Law Dec. 11, 2013, 4:53 p.m. UTC | #5
On 12/11/13 02:51, Richard Biener wrote:
>> +  /* If inversion is needed, first try to invert the test since
>> +     that's cheapest.  */
>> +  if (invert)
>> +    {
>> +      enum tree_code new_code
>> +       = invert_tree_comparison (cond_code,
>> +                                 HONOR_NANS (TYPE_MODE (TREE_TYPE (rhs))));
>
> That looks wrong - you want to look at HONOR_NANS on the mode
> of one of the comparison operands, not of the actual value you want
> to negate (it's integer and thus never has NaNs).
HONOR_NANS (TYPE_MODE (TREE_TYPE (gimple_cond_lhs (cond))))

Right?

Jeff
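
With that, the inversion block in neg_replacement would read roughly as
follows (a sketch of the fix under discussion, not the committed change):

  /* If inversion is needed, first try to invert the test since
     that's cheapest.  Query HONOR_NANS on a comparison operand,
     not on the value being negated.  */
  if (invert)
    {
      bool honor_nans
        = HONOR_NANS (TYPE_MODE (TREE_TYPE (gimple_cond_lhs (cond))));
      enum tree_code new_code = invert_tree_comparison (cond_code, honor_nans);

      /* If the comparison could be inverted directly, no separate
         inversion of the result is needed.  */
      if (new_code != ERROR_MARK)
        {
          cond_code = new_code;
          invert = false;
        }
    }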

Patch

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr45685.c b/gcc/testsuite/gcc.dg/tree-ssa/pr45685.c
new file mode 100644
index 0000000..0628943
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr45685.c
@@ -0,0 +1,41 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-tree-phiopt1-details" } */
+
+typedef unsigned long int uint64_t;
+typedef long int int64_t;
+int summation_helper_1(int64_t* products, uint64_t count)
+{
+	int s = 0;
+	uint64_t i;
+	for(i=0; i<count; i++)
+	{	
+		int64_t val = (products[i]>0) ? 1 : -1;
+		products[i] *= val;
+		if(products[i] != i)
+			val = -val;
+		products[i] = val;
+		s += val;
+	}
+	return s;
+}
+
+
+int summation_helper_2(int64_t* products, uint64_t count)
+{
+	int s = 0;
+	uint64_t i;
+	for(i=0; i<count; i++)
+	{	
+		int val = (products[i]>0) ? 1 : -1;
+		products[i] *= val;
+		if(products[i] != i)
+			val = -val;
+		products[i] = val;
+		s += val;
+	}
+	return s;
+}
+
+/* { dg-final { scan-tree-dump-times "converted to straightline code" 2 "phiopt1" } } */
+/* { dg-final { cleanup-tree-dump "phiopt1" } } */
+
diff --git a/gcc/tree-ssa-phiopt.c b/gcc/tree-ssa-phiopt.c
index 11e565f..2522255 100644
--- a/gcc/tree-ssa-phiopt.c
+++ b/gcc/tree-ssa-phiopt.c
@@ -69,6 +69,8 @@  static bool minmax_replacement (basic_block, basic_block,
 				edge, edge, gimple, tree, tree);
 static bool abs_replacement (basic_block, basic_block,
 			     edge, edge, gimple, tree, tree);
+static bool neg_replacement (basic_block, basic_block,
+			     edge, edge, gimple, tree, tree);
 static bool cond_store_replacement (basic_block, basic_block, edge, edge,
 				    struct pointer_set_t *);
 static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block);
@@ -489,6 +491,8 @@  tree_ssa_phiopt_worker (bool do_store_elim, bool do_hoist_loads)
 	    cfgchanged = true;
 	  else if (abs_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
 	    cfgchanged = true;
+	  else if (neg_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
+	    cfgchanged = true;
 	  else if (minmax_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
 	    cfgchanged = true;
 	}
@@ -1285,6 +1289,143 @@  abs_replacement (basic_block cond_bb, basic_block middle_bb,
   return true;
 }
 
+/*  The function neg_replacement replaces conditional negation with
+    equivalent straight line code.  Returns TRUE if replacement is done,
+    otherwise returns FALSE.
+
+    COND_BB branches around negation occurring in MIDDLE_BB.
+
+    E0 and E1 are edges out of COND_BB.  E0 reaches MIDDLE_BB and
+    E1 reaches the other successor which should contain PHI with
+    arguments ARG0 and ARG1.
+
+    Assuming negation is to occur when the condition is true,
+    then the non-branching sequence is:
+
+       result = (rhs ^ -cond) + cond
+
+    Inverting the condition or its result gives us negation
+    when the original condition is false.  */
+
+static bool
+neg_replacement (basic_block cond_bb, basic_block middle_bb,
+		 edge e0 ATTRIBUTE_UNUSED, edge e1,
+		 gimple phi, tree arg0, tree arg1)
+{
+  gimple new_stmt, cond;
+  gimple_stmt_iterator gsi;
+  gimple assign;
+  edge true_edge, false_edge;
+  tree rhs, lhs;
+  enum tree_code cond_code;
+  bool invert = false;
+
+  /* This transformation performs logical operations on the
+     incoming arguments.  So force them to be integral types.   */
+  if (!INTEGRAL_TYPE_P (TREE_TYPE (arg0)))
+    return false;
+
+  /* OTHER_BLOCK must have only one executable statement which must have the
+     form arg0 = -arg1 or arg1 = -arg0.  */
+
+  assign = last_and_only_stmt (middle_bb);
+  /* If we did not find the proper negation assignment, then we can not
+     optimize.  */
+  if (assign == NULL)
+    return false;
+
+  /* If we got here, then we have found the only executable statement
+     in OTHER_BLOCK.  If it is anything other than arg0 = -arg1 or
+     arg1 = -arg0, then we can not optimize.  */
+  if (gimple_code (assign) != GIMPLE_ASSIGN)
+    return false;
+
+  lhs = gimple_assign_lhs (assign);
+
+  if (gimple_assign_rhs_code (assign) != NEGATE_EXPR)
+    return false;
+
+  rhs = gimple_assign_rhs1 (assign);
+
+  /* The assignment has to be arg0 = -arg1 or arg1 = -arg0.  */
+  if (!(lhs == arg0 && rhs == arg1)
+      && !(lhs == arg1 && rhs == arg0))
+    return false;
+
+  /* The basic sequence assumes we negate when the condition is true.
+     If we need the opposite, then we will either need to invert the
+     condition or its result.  */
+  extract_true_false_edges_from_block (cond_bb, &true_edge, &false_edge);
+  invert = false_edge->dest == middle_bb;
+
+  /* Unlike abs_replacement, we can handle arbitrary conditionals here.  */
+  cond = last_stmt (cond_bb);
+  cond_code = gimple_cond_code (cond);
+
+  /* If inversion is needed, first try to invert the test since
+     that's cheapest.  */
+  if (invert)
+    {
+      enum tree_code new_code
+	= invert_tree_comparison (cond_code,
+				  HONOR_NANS (TYPE_MODE (TREE_TYPE (rhs))));
+
+      /* If invert_tree_comparison was successful, then use its return
+	 value as the new code and note that inversion is no longer
+	 needed.  */
+      if (new_code != ERROR_MARK)
+	{
+	  cond_code = new_code;
+	  invert = false;
+	}
+    }
+
+  tree cond_val = make_ssa_name (boolean_type_node, NULL);
+  new_stmt = gimple_build_assign_with_ops (cond_code, cond_val,
+					   gimple_cond_lhs (cond),
+					   gimple_cond_rhs (cond));
+  gsi = gsi_last_bb (cond_bb);
+  gsi_insert_before (&gsi, new_stmt, GSI_NEW_STMT);
+
+  /* If we still need inversion, then invert the result of the
+     condition.  */
+  if (invert)
+    {
+      tree tmp = make_ssa_name (boolean_type_node, NULL);
+      new_stmt = gimple_build_assign_with_ops (BIT_XOR_EXPR, tmp,
+					       cond_val, boolean_true_node);
+      gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
+      cond_val = tmp;
+    }
+
+  /* Get the condition in the right type so that we can perform
+     logical and arithmetic operations on it.  */
+  tree cond_val_converted = make_ssa_name (TREE_TYPE (rhs), NULL);
+  new_stmt = gimple_build_assign_with_ops (NOP_EXPR, cond_val_converted,
+					   cond_val, NULL_TREE);
+  gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
+
+  tree neg_cond_val_converted = make_ssa_name (TREE_TYPE (rhs), NULL);
+  new_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, neg_cond_val_converted,
+					   cond_val_converted, NULL_TREE);
+  gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
+
+  tree tmp = make_ssa_name (TREE_TYPE (rhs), NULL);
+  new_stmt = gimple_build_assign_with_ops (BIT_XOR_EXPR, tmp,
+					   rhs, neg_cond_val_converted);
+  gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
+
+  tree new_lhs = make_ssa_name (TREE_TYPE (rhs), NULL);
+  new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, new_lhs,
+					   tmp, cond_val_converted);
+  gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
+
+  replace_phi_edge_with_variable (cond_bb, e1, phi, new_lhs);
+
+  /* Note that we optimized this PHI.  */
+  return true;
+}
+
 /* Auxiliary functions to determine the set of memory accesses which
    can't trap because they are preceded by accesses to the same memory
    portion.  We do that for MEM_REFs, so we only need to track