
[RFA,PR,tree-optimization/45685]

Message ID 52A80C3C.1090805@redhat.com
State New

Commit Message

Jeff Law Dec. 11, 2013, 6:54 a.m. UTC
So for this source, compiled for x86_64 with -O3:

typedef unsigned long int uint64_t;
typedef long int int64_t;
int summation_helper_1(int64_t* products, uint64_t count)
{
         int s = 0;
         uint64_t i;
         for(i=0; i<count; i++)
         {
                 int64_t val = (products[i]>0) ? 1 : -1;
                 products[i] *= val;
                 if(products[i] != i)
                         val = -val;
                 products[i] = val;
                 s += val;
         }
         return s;
}


int summation_helper_2(int64_t* products, uint64_t count)
{
         int s = 0;
         uint64_t i;
         for(i=0; i<count; i++)
         {
                 int val = (products[i]>0) ? 1 : -1;
                 products[i] *= val;
                 if(products[i] != i)
                         val = -val;
                 products[i] = val;
                 s += val;
         }
         return s;
}


The loops we generate are pretty bad and have regressed relative to 
older versions of GCC.

For the first loop, we have the following .optimized output for the loop:

   <bb 4>:
   # s_28 = PHI <s_20(5), 0(3)>
   # i_27 = PHI <i_21(5), 0(3)>
   _11 = MEM[base: products_9(D), index: i_27, step: 8, offset: 0B];
   val_4 = _11 > 0 ? 1 : -1;
   prephitmp_38 = _11 > 0 ? -1 : 1;
   prephitmp_39 = _11 > 0 ? 4294967295 : 1;
   prephitmp_41 = _11 > 0 ? 1 : 4294967295;
   _12 = val_4 * _11;
   _14 = (long unsigned int) _12;
   val_3 = _14 != i_27 ? prephitmp_38 : val_4;
   prephitmp_44 = _14 != i_27 ? prephitmp_39 : prephitmp_41;
   MEM[base: products_9(D), index: i_27, step: 8, offset: 0B] = val_3;
   s.1_18 = (unsigned int) s_28;
   _19 = prephitmp_44 + s.1_18;
   s_20 = (int) _19;
   i_21 = i_27 + 1;
   if (i_21 != count_7(D))
     goto <bb 5>;
   else
     goto <bb 6>;

   <bb 5>:
   goto <bb 4>;


Note the series of COND_EXPRs.  A couple of them are just conditional
negations, which can be implemented with a straight-line code sequence.
Using that straight-line sequence results in:

   <bb 4>:
   # s_31 = PHI <s_20(5), 0(3)>
   # i_32 = PHI <i_21(5), 0(3)>
   _11 = MEM[base: products_9(D), index: i_32, step: 8, offset: 0B];
   val_4 = _11 > 0 ? 1 : -1;
   _12 = val_4 * _11;
   _14 = (long unsigned int) _12;
   _24 = _14 != i_32;
   _25 = (int64_t) _24;
   _29 = -_25;
   _28 = _29 ^ val_4;
   _27 = _28 + _25;
   MEM[base: products_9(D), index: i_32, step: 8, offset: 0B] = _27;
   _17 = (unsigned int) _27;
   s.1_18 = (unsigned int) s_31;
   _19 = _17 + s.1_18;
   s_20 = (int) _19;
   i_21 = i_32 + 1;
   if (i_21 != count_7(D))
     goto <bb 5>;
   else
     goto <bb 6>;

   <bb 5>:
   goto <bb 4>;

This *appears* worse.  However, that code can be handled much more
easily by the RTL optimizers.  When we look at what the trunk generates
at the assembly level, we have:

.L3:
         movq    (%rdi,%rcx,8), %rdx
         testq   %rdx, %rdx
         setg    %r8b
         movzbl  %r8b, %r10d
         movzbl  %r8b, %r8d
         leaq    -1(%r10,%r10), %r10
         leal    -1(%r8,%r8), %r8d
         movq    %r10, %r11
         imulq   %rdx, %r11
         testq   %rdx, %rdx
         setle   %dl
         movzbl  %dl, %r9d
         movzbl  %dl, %edx
         leaq    -1(%r9,%r9), %r9
         leal    -1(%rdx,%rdx), %edx
         cmpq    %rcx, %r11
         cmove   %r10, %r9
         cmove   %r8d, %edx
         movq    %r9, (%rdi,%rcx,8)
         addq    $1, %rcx
         addl    %edx, %eax
         cmpq    %rsi, %rcx
         jne     .L3
(Ick)

With the conditional negation patch that turns into:

.L3:
         movq    (%rdi,%rcx,8), %r8
         xorl    %edx, %edx
         testq   %r8, %r8
         setg    %dl
         leaq    -1(%rdx,%rdx), %rdx
         imulq   %rdx, %r8
         cmpq    %rcx, %r8
         setne   %r8b
         movzbl  %r8b, %r8d
         movq    %r8, %r9
         negq    %r9
         xorq    %r9, %rdx
         addq    %r8, %rdx
         movq    %rdx, (%rdi,%rcx,8)
         addq    $1, %rcx
         addl    %edx, %eax
         cmpq    %rsi, %rcx
         jne     .L3

No branches within the loop, and no conditional moves either.  In all,
it's 5 instructions shorter.
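
For reference, the straight-line sequence is the standard two's-complement
conditional-negation identity.  A minimal C sketch (purely illustrative,
not part of the patch; the helper name is made up):

/* Branchless conditional negation: returns cond ? -x : x, for cond in {0,1}.
   -cond is all ones when cond is 1 and zero when cond is 0, so the XOR
   yields ~x or x, and adding cond completes the two's-complement negation.  */
static inline long
cond_neg (long x, int cond)
{
  long mask = -(long) cond;
  return (x ^ mask) + cond;   /* i.e. result = (rhs ^ -cond) + cond */
}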


The second loop shows similar effects, though they're not as dramatic.

Before:
   <bb 4>:
   # s_27 = PHI <s_19(5), 0(3)>
   # i_26 = PHI <i_20(5), 0(3)>
   _11 = MEM[base: products_9(D), index: i_26, step: 8, offset: 0B];
   val_4 = _11 > 0 ? 1 : -1;
   prephitmp_32 = _11 > 0 ? 1 : -1;
   prephitmp_33 = _11 > 0 ? -1 : 1;
   prephitmp_34 = _11 > 0 ? -1 : 1;
   _13 = _11 * prephitmp_32;
   _15 = (long unsigned int) _13;
   val_3 = _15 != i_26 ? prephitmp_33 : val_4;
   prephitmp_36 = _15 != i_26 ? prephitmp_34 : prephitmp_32;
   MEM[base: products_9(D), index: i_26, step: 8, offset: 0B] = prephitmp_36;
   s_19 = val_3 + s_27;
   i_20 = i_26 + 1;
   if (i_20 != count_7(D))
     goto <bb 5>;
   else
     goto <bb 6>;

  <bb 5>:
   goto <bb 4>;


Which results in the following assembly:

.L8:
         movq    (%rdi,%r8,8), %rdx
         testq   %rdx, %rdx
         movq    %rdx, %r11
         setg    %cl
         movzbl  %cl, %r10d
         movzbl  %cl, %ecx
         leaq    -1(%rcx,%rcx), %rcx
         leal    -1(%r10,%r10), %r10d
         imulq   %rcx, %r11
         testq   %rdx, %rdx
         setle   %dl
         movzbl  %dl, %r9d
         movzbl  %dl, %edx
         leaq    -1(%r9,%r9), %r9
         leal    -1(%rdx,%rdx), %edx
         cmpq    %r8, %r11
         cmovne  %r9, %rcx
         cmove   %r10d, %edx
         movq    %rcx, (%rdi,%r8,8)
         addq    $1, %r8
         addl    %edx, %eax
         cmpq    %rsi, %r8
         jne     .L8



With the conditional negation patch:

  <bb 4>:
   # s_31 = PHI <s_20(5), 0(3)>
   # i_32 = PHI <i_21(5), 0(3)>
   _11 = MEM[base: products_9(D), index: i_32, step: 8, offset: 0B];
   val_4 = _11 > 0 ? 1 : -1;
   _12 = val_4 * _11;
   _14 = (long unsigned int) _12;
   _24 = _14 != i_32;
   _25 = (int64_t) _24;
   _29 = -_25;
   _28 = _29 ^ val_4;
   _27 = _28 + _25;
   MEM[base: products_9(D), index: i_32, step: 8, offset: 0B] = _27;
   _17 = (unsigned int) _27;
   s.1_18 = (unsigned int) s_31;
   _19 = _17 + s.1_18;
   s_20 = (int) _19;
   i_21 = i_32 + 1;
   if (i_21 != count_7(D))
     goto <bb 5>;
   else
     goto <bb 6>;

  <bb 5>:
   goto <bb 4>;

This again looks worse than the original, but optimizes well into:

.L8:
         movq    (%rdi,%r8,8), %r9
         testq   %r9, %r9
         setg    %cl
         movzbl  %cl, %edx
         leaq    -1(%rdx,%rdx), %rdx
         imulq   %r9, %rdx
         xorl    %r9d, %r9d
         cmpq    %r8, %rdx
         movzbl  %cl, %edx
         setne   %r9b
         leal    -1(%rdx,%rdx), %edx
         movl    %r9d, %r10d
         negl    %r10d
         xorl    %r10d, %edx
         addl    %r9d, %edx
         movslq  %edx, %rcx
         addl    %edx, %eax
         movq    %rcx, (%rdi,%r8,8)
         addq    $1, %r8
         cmpq    %rsi, %r8
         jne     .L8


Bootstrapped and regression tested on x86_64-unknown-linux-gnu.  OK for
the trunk?

	PR tree-optimization/45685
	* tree-ssa-phiopt.c (neg_replacement): New function.
	(tree_ssa_phiopt_worker): Call it.

	PR tree-optimization/45685
	* gcc.dg/tree-ssa/pr45685.c: New test.

Comments

Richard Biener Dec. 11, 2013, 9:51 a.m. UTC | #1
On Wed, Dec 11, 2013 at 7:54 AM, Jeff Law <law@redhat.com> wrote:
>
> So for this source, compiled for x86_64 with -O3:
>
> typedef unsigned long int uint64_t;
> typedef long int int64_t;
> int summation_helper_1(int64_t* products, uint64_t count)
> {
>         int s = 0;
>         uint64_t i;
>         for(i=0; i<count; i++)
>         {
>                 int64_t val = (products[i]>0) ? 1 : -1;
>                 products[i] *= val;
>                 if(products[i] != i)
>                         val = -val;
>                 products[i] = val;
>                 s += val;
>         }
>         return s;
> }
>
>
> int summation_helper_2(int64_t* products, uint64_t count)
> {
>         int s = 0;
>         uint64_t i;
>         for(i=0; i<count; i++)
>         {
>                 int val = (products[i]>0) ? 1 : -1;
>                 products[i] *= val;
>                 if(products[i] != i)
>                         val = -val;
>                 products[i] = val;
>                 s += val;
>         }
>         return s;
> }
>
>
> The loops we generate are pretty bad and have regressed relative to older
> versions of GCC.
>
> For the first loop, we have the following .optimized output for the loop:
>
>   <bb 4>:
>   # s_28 = PHI <s_20(5), 0(3)>
>   # i_27 = PHI <i_21(5), 0(3)>
>   _11 = MEM[base: products_9(D), index: i_27, step: 8, offset: 0B];
>   val_4 = _11 > 0 ? 1 : -1;
>   prephitmp_38 = _11 > 0 ? -1 : 1;
>   prephitmp_39 = _11 > 0 ? 4294967295 : 1;
>   prephitmp_41 = _11 > 0 ? 1 : 4294967295;
>   _12 = val_4 * _11;
>   _14 = (long unsigned int) _12;
>   val_3 = _14 != i_27 ? prephitmp_38 : val_4;
>   prephitmp_44 = _14 != i_27 ? prephitmp_39 : prephitmp_41;
>   MEM[base: products_9(D), index: i_27, step: 8, offset: 0B] = val_3;
>   s.1_18 = (unsigned int) s_28;
>   _19 = prephitmp_44 + s.1_18;
>   s_20 = (int) _19;
>   i_21 = i_27 + 1;
>   if (i_21 != count_7(D))
>     goto <bb 5>;
>   else
>     goto <bb 6>;
>
>   <bb 5>:
>   goto <bb 4>;
>
>
> Note the series of COND_EXPRs.  A couple are just conditional negation which
> can be implemented with a straight-line code sequence.   Using that
> straight-line sequence results in:
>
>   <bb 4>:
>   # s_31 = PHI <s_20(5), 0(3)>
>   # i_32 = PHI <i_21(5), 0(3)>
>   _11 = MEM[base: products_9(D), index: i_32, step: 8, offset: 0B];
>   val_4 = _11 > 0 ? 1 : -1;
>   _12 = val_4 * _11;
>   _14 = (long unsigned int) _12;
>   _24 = _14 != i_32;
>   _25 = (int64_t) _24;
>   _29 = -_25;
>   _28 = _29 ^ val_4;
>   _27 = _28 + _25;
>   MEM[base: products_9(D), index: i_32, step: 8, offset: 0B] = _27;
>   _17 = (unsigned int) _27;
>   s.1_18 = (unsigned int) s_31;
>   _19 = _17 + s.1_18;
>   s_20 = (int) _19;
>   i_21 = i_32 + 1;
>   if (i_21 != count_7(D))
>     goto <bb 5>;
>   else
>     goto <bb 6>;
>
>   <bb 5>:
>   goto <bb 4>;
>
> Which *appears* worse.  However, that code can much more easily be handled
> by the RTL optimizers.    When we look at what the trunk generates at the
> assembly level we have:
>
> .L3:
>         movq    (%rdi,%rcx,8), %rdx
>         testq   %rdx, %rdx
>         setg    %r8b
>         movzbl  %r8b, %r10d
>         movzbl  %r8b, %r8d
>         leaq    -1(%r10,%r10), %r10
>         leal    -1(%r8,%r8), %r8d
>         movq    %r10, %r11
>         imulq   %rdx, %r11
>         testq   %rdx, %rdx
>         setle   %dl
>         movzbl  %dl, %r9d
>         movzbl  %dl, %edx
>         leaq    -1(%r9,%r9), %r9
>         leal    -1(%rdx,%rdx), %edx
>         cmpq    %rcx, %r11
>         cmove   %r10, %r9
>         cmove   %r8d, %edx
>         movq    %r9, (%rdi,%rcx,8)
>         addq    $1, %rcx
>         addl    %edx, %eax
>         cmpq    %rsi, %rcx
>         jne     .L3
> (Ick)
>
> With the conditional negation patch that turns into:
>
> L3:
>         movq    (%rdi,%rcx,8), %r8
>         xorl    %edx, %edx
>         testq   %r8, %r8
>         setg    %dl
>         leaq    -1(%rdx,%rdx), %rdx
>         imulq   %rdx, %r8
>         cmpq    %rcx, %r8
>         setne   %r8b
>         movzbl  %r8b, %r8d
>         movq    %r8, %r9
>         negq    %r9
>         xorq    %r9, %rdx
>         addq    %r8, %rdx
>         movq    %rdx, (%rdi,%rcx,8)
>         addq    $1, %rcx
>         addl    %edx, %eax
>         cmpq    %rsi, %rcx
>         jne     .L3
>
> No branches within the loop, no conditional moves either.  In all it's 5
> instructions shorter.
>
>
> The second loop shows similar effects, though they're not as dramatic.
>
> Before:
>   <bb 4>:
>   # s_27 = PHI <s_19(5), 0(3)>
>   # i_26 = PHI <i_20(5), 0(3)>
>   _11 = MEM[base: products_9(D), index: i_26, step: 8, offset: 0B];
>   val_4 = _11 > 0 ? 1 : -1;
>   prephitmp_32 = _11 > 0 ? 1 : -1;
>   prephitmp_33 = _11 > 0 ? -1 : 1;
>   prephitmp_34 = _11 > 0 ? -1 : 1;
>   _13 = _11 * prephitmp_32;
>   _15 = (long unsigned int) _13;
>   val_3 = _15 != i_26 ? prephitmp_33 : val_4;
>   prephitmp_36 = _15 != i_26 ? prephitmp_34 : prephitmp_32;
>   MEM[base: products_9(D), index: i_26, step: 8, offset: 0B] = prephitmp_36;
>   s_19 = val_3 + s_27;
>   i_20 = i_26 + 1;
>   if (i_20 != count_7(D))
>     goto <bb 5>;
>   else
>     goto <bb 6>;
>
>  <bb 5>:
>   goto <bb 4>;
>
>
> Which results in the following assembly:
>
> .L8:
>         movq    (%rdi,%r8,8), %rdx
>         testq   %rdx, %rdx
>         movq    %rdx, %r11
>         setg    %cl
>         movzbl  %cl, %r10d
>         movzbl  %cl, %ecx
>         leaq    -1(%rcx,%rcx), %rcx
>         leal    -1(%r10,%r10), %r10d
>         imulq   %rcx, %r11
>         testq   %rdx, %rdx
>         setle   %dl
>         movzbl  %dl, %r9d
>         movzbl  %dl, %edx
>         leaq    -1(%r9,%r9), %r9
>         leal    -1(%rdx,%rdx), %edx
>         cmpq    %r8, %r11
>         cmovne  %r9, %rcx
>         cmove   %r10d, %edx
>         movq    %rcx, (%rdi,%r8,8)
>         addq    $1, %r8
>         addl    %edx, %eax
>         cmpq    %rsi, %r8
>         jne     .L8
>
>
>
> With the conditional negation patch:
>
>  <bb 4>:
>   # s_31 = PHI <s_20(5), 0(3)>
>   # i_32 = PHI <i_21(5), 0(3)>
>   _11 = MEM[base: products_9(D), index: i_32, step: 8, offset: 0B];
>   val_4 = _11 > 0 ? 1 : -1;
>   _12 = val_4 * _11;
>   _14 = (long unsigned int) _12;
>   _24 = _14 != i_32;
>   _25 = (int64_t) _24;
>   _29 = -_25;
>   _28 = _29 ^ val_4;
>   _27 = _28 + _25;
>   MEM[base: products_9(D), index: i_32, step: 8, offset: 0B] = _27;
>   _17 = (unsigned int) _27;
>   s.1_18 = (unsigned int) s_31;
>   _19 = _17 + s.1_18;
>   s_20 = (int) _19;
>   i_21 = i_32 + 1;
>   if (i_21 != count_7(D))
>     goto <bb 5>;
>   else
>     goto <bb 6>;
>
>  <bb 5>:
>   goto <bb 4>;
>
> Which again looks worse than the original, but optimizes well into:
>
> .L8:
>         movq    (%rdi,%r8,8), %r9
>         testq   %r9, %r9
>         setg    %cl
>         movzbl  %cl, %edx
>         leaq    -1(%rdx,%rdx), %rdx
>         imulq   %r9, %rdx
>         xorl    %r9d, %r9d
>         cmpq    %r8, %rdx
>         movzbl  %cl, %edx
>         setne   %r9b
>         leal    -1(%rdx,%rdx), %edx
>         movl    %r9d, %r10d
>         negl    %r10d
>         xorl    %r10d, %edx
>         addl    %r9d, %edx
>         movslq  %edx, %rcx
>         addl    %edx, %eax
>         movq    %rcx, (%rdi,%r8,8)
>         addq    $1, %r8
>         cmpq    %rsi, %r8
>         jne     .L8
>
>
> Bootstrapped and regression tested on x86_64-unknown-linux-gnu.  OK for the
> trunk?

First of all, phiopt runs unconditionally for -On with n > 0, but the
conversion is clearly not suitable for non-speed optimizations.  Thus
I'd guard it with at least !optimize_size && optimize >= 2.  As you are
targeting a worse transformation done by if-conversion, you may want to
add || (((flag_tree_loop_vectorize || cfun->has_force_vect_loops)
           && flag_tree_loop_if_convert != 0)
          || flag_tree_loop_if_convert == 1
          || flag_tree_loop_if_convert_stores == 1)
(ugh).
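
Concretely, that gating might look something like the following at the
top of neg_replacement (just a sketch assembled from the flags named
above, untested; the local variable name is made up):

  bool want_neg_replacement
    = ((optimize >= 2 && !optimize_size)
       || ((flag_tree_loop_vectorize || cfun->has_force_vect_loops)
           && flag_tree_loop_if_convert != 0)
       || flag_tree_loop_if_convert == 1
       || flag_tree_loop_if_convert_stores == 1);
  if (!want_neg_replacement)
    return false;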

More comments below.

>
>         PR tree-optimization/45685
>         * tree-ssa-phiopt.c (neg_replacement): New function.
>         (tree_ssa_phiopt_worker): Call it.
>
>         PR tree-optimization/45685
>         * gcc.dg/tree-ssa/pr45685.c: New test.
>
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr45685.c
> b/gcc/testsuite/gcc.dg/tree-ssa/pr45685.c
> new file mode 100644
> index 0000000..0628943
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr45685.c
> @@ -0,0 +1,41 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fdump-tree-phiopt1-details" } */
> +
> +typedef unsigned long int uint64_t;
> +typedef long int int64_t;
> +int summation_helper_1(int64_t* products, uint64_t count)
> +{
> +       int s = 0;
> +       uint64_t i;
> +       for(i=0; i<count; i++)
> +       {
> +               int64_t val = (products[i]>0) ? 1 : -1;
> +               products[i] *= val;
> +               if(products[i] != i)
> +                       val = -val;
> +               products[i] = val;
> +               s += val;
> +       }
> +       return s;
> +}
> +
> +
> +int summation_helper_2(int64_t* products, uint64_t count)
> +{
> +       int s = 0;
> +       uint64_t i;
> +       for(i=0; i<count; i++)
> +       {
> +               int val = (products[i]>0) ? 1 : -1;
> +               products[i] *= val;
> +               if(products[i] != i)
> +                       val = -val;
> +               products[i] = val;
> +               s += val;
> +       }
> +       return s;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "converted to straightline code" 2
> "phiopt1" } } */
> +/* { dg-final { cleanup-tree-dump "phiopt1" } } */
> +
> diff --git a/gcc/tree-ssa-phiopt.c b/gcc/tree-ssa-phiopt.c
> index 11e565f..2522255 100644
> --- a/gcc/tree-ssa-phiopt.c
> +++ b/gcc/tree-ssa-phiopt.c
> @@ -69,6 +69,8 @@ static bool minmax_replacement (basic_block, basic_block,
>                                 edge, edge, gimple, tree, tree);
>  static bool abs_replacement (basic_block, basic_block,
>                              edge, edge, gimple, tree, tree);
> +static bool neg_replacement (basic_block, basic_block,
> +                            edge, edge, gimple, tree, tree);
>  static bool cond_store_replacement (basic_block, basic_block, edge, edge,
>                                     struct pointer_set_t *);
>  static bool cond_if_else_store_replacement (basic_block, basic_block,
> basic_block);
> @@ -489,6 +491,8 @@ tree_ssa_phiopt_worker (bool do_store_elim, bool
> do_hoist_loads)
>             cfgchanged = true;
>           else if (abs_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
>             cfgchanged = true;
> +         else if (neg_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
> +           cfgchanged = true;
>           else if (minmax_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
>             cfgchanged = true;
>         }
> @@ -1285,6 +1289,143 @@ abs_replacement (basic_block cond_bb, basic_block
> middle_bb,
>    return true;
>  }
>
> +/*  The function neg_replacement replaces conditional negation with
> +    equivalent straight line code.  Returns TRUE if replacement is done,
> +    otherwise returns FALSE.
> +
> +    COND_BB branches around negation occurring in MIDDLE_BB.
> +
> +    E0 and E1 are edges out of COND_BB.  E0 reaches MIDDLE_BB and
> +    E1 reaches the other successor which should contain PHI with
> +    arguments ARG0 and ARG1.
> +
> +    Assuming negation is to occur when the condition is true,
> +    then the non-branching sequence is:
> +
> +       result = (rhs ^ -cond) + cond
> +
> +    Inverting the condition or its result gives us negation
> +    when the original condition is false.  */
> +
> +static bool
> +neg_replacement (basic_block cond_bb, basic_block middle_bb,
> +                edge e0 ATTRIBUTE_UNUSED, edge e1,
> +                gimple phi, tree arg0, tree arg1)
> +{
> +  gimple new_stmt, cond;
> +  gimple_stmt_iterator gsi;
> +  gimple assign;
> +  edge true_edge, false_edge;
> +  tree rhs, lhs;
> +  enum tree_code cond_code;
> +  bool invert = false;
> +
> +  /* This transformation performs logical operations on the
> +     incoming arguments.  So force them to be integral types.   */
> +  if (!INTEGRAL_TYPE_P (TREE_TYPE (arg0)))
> +    return false;
> +  /* OTHER_BLOCK must have only one executable statement which must have
> the
> +     form arg0 = -arg1 or arg1 = -arg0.  */
> +
> +  assign = last_and_only_stmt (middle_bb);
> +  /* If we did not find the proper negation assignment, then we can not
> +     optimize.  */
> +  if (assign == NULL)
> +    return false;
> +
> +  /* If we got here, then we have found the only executable statement
> +     in OTHER_BLOCK.  If it is anything other than arg0 = -arg1 or
> +     arg1 = -arg0, then we can not optimize.  */
> +  if (gimple_code (assign) != GIMPLE_ASSIGN)
> +    return false;
> +
> +  lhs = gimple_assign_lhs (assign);
> +
> +  if (gimple_assign_rhs_code (assign) != NEGATE_EXPR)
> +    return false;
> +
> +  rhs = gimple_assign_rhs1 (assign);
> +
> +  /* The assignment has to be arg0 = -arg1 or arg1 = -arg0.  */
> +  if (!(lhs == arg0 && rhs == arg1)
> +      && !(lhs == arg1 && rhs == arg0))
> +    return false;
> +
> +  /* The basic sequence assumes we negate when the condition is true.
> +     If we need the opposite, then we will either need to invert the
> +     condition or its result.  */
> +  extract_true_false_edges_from_block (cond_bb, &true_edge, &false_edge);
> +  invert = false_edge->dest == middle_bb;
> +
> +  /* Unlike abs_replacement, we can handle arbitrary conditionals here.  */
> +  cond = last_stmt (cond_bb);
> +  cond_code = gimple_cond_code (cond);
> +
> +  /* If inversion is needed, first try to invert the test since
> +     that's cheapest.  */
> +  if (invert)
> +    {
> +      enum tree_code new_code
> +       = invert_tree_comparison (cond_code,
> +                                 HONOR_NANS (TYPE_MODE (TREE_TYPE (rhs))));

That looks wrong - you want to look at HONOR_NANS on the mode
of one of the comparison operands, not of the actual value you want
to negate (it's integer and thus never has NaNs).

The rest of the patch looks ok to me.

Thanks,
Richard.

> +
> +      /* If invert_tree_comparison was successful, then use its return
> +        value as the new code and note that inversion is no longer
> +        needed.  */
> +      if (new_code != ERROR_MARK)
> +       {
> +         cond_code = new_code;
> +         invert = false;
> +       }
> +    }
> +
> +  tree cond_val = make_ssa_name (boolean_type_node, NULL);
> +  new_stmt = gimple_build_assign_with_ops (cond_code, cond_val,
> +                                          gimple_cond_lhs (cond),
> +                                          gimple_cond_rhs (cond));
> +  gsi = gsi_last_bb (cond_bb);
> +  gsi_insert_before (&gsi, new_stmt, GSI_NEW_STMT);
> +
> +  /* If we still need inversion, then invert the result of the
> +     condition.  */
> +  if (invert)
> +    {
> +      tree tmp = make_ssa_name (boolean_type_node, NULL);
> +      new_stmt = gimple_build_assign_with_ops (BIT_XOR_EXPR, tmp,
> +                                              cond_val, boolean_true_node);
> +      gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
> +      cond_val = tmp;
> +    }
> +
> +  /* Get the condition in the right type so that we can perform
> +     logical and arithmetic operations on it.  */
> +  tree cond_val_converted = make_ssa_name (TREE_TYPE (rhs), NULL);
> +  new_stmt = gimple_build_assign_with_ops (NOP_EXPR, cond_val_converted,
> +                                          cond_val, NULL_TREE);
> +  gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
> +
> +  tree neg_cond_val_converted = make_ssa_name (TREE_TYPE (rhs), NULL);
> +  new_stmt = gimple_build_assign_with_ops (NEGATE_EXPR,
> neg_cond_val_converted,
> +                                          cond_val_converted, NULL_TREE);
> +  gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
> +
> +  tree tmp = make_ssa_name (TREE_TYPE (rhs), NULL);
> +  new_stmt = gimple_build_assign_with_ops (BIT_XOR_EXPR, tmp,
> +                                          rhs, neg_cond_val_converted);
> +  gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
> +
> +  tree new_lhs = make_ssa_name (TREE_TYPE (rhs), NULL);
> +  new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, new_lhs,
> +                                          tmp, cond_val_converted);
> +  gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
> +
> +  replace_phi_edge_with_variable (cond_bb, e1, phi, new_lhs);
> +
> +  /* Note that we optimized this PHI.  */
> +  return true;
> +}
> +
>  /* Auxiliary functions to determine the set of memory accesses which
>     can't trap because they are preceded by accesses to the same memory
>     portion.  We do that for MEM_REFs, so we only need to track
>
Jeff Law Dec. 11, 2013, 2:51 p.m. UTC | #2
On 12/11/13 02:51, Richard Biener wrote:
>
> First of all phiopt runs unconditionally for -On with n > 0 but the conversion
> is clearly not suitable for non-speed optimizations.  Thus I'd guard it
> with at least !optimize_size && optimize >= 2.  As you are targeting
> a worse transformation done by if-conversion you may want to
> add || (((flag_tree_loop_vectorize || cfun->has_force_vect_loops)
>             && flag_tree_loop_if_convert != 0)
>            || flag_tree_loop_if_convert == 1
>            || flag_tree_loop_if_convert_stores == 1)
> (ugh).
That's a hell of a condition to guard a transformation.  But, yea, I agree.

>> +
>> +  /* If inversion is needed, first try to invert the test since
>> +     that's cheapest.  */
>> +  if (invert)
>> +    {
>> +      enum tree_code new_code
>> +       = invert_tree_comparison (cond_code,
>> +                                 HONOR_NANS (TYPE_MODE (TREE_TYPE (rhs))));
>
> That looks wrong - you want to look at HONOR_NANS on the mode
> of one of the comparison operands, not of the actual value you want
> to negate (it's integer and thus never has NaNs).
Bah.  That was supposed to be HONOR_SIGNED_ZEROS, which as far as I can
tell is a property of the value being tested.

So it seems to me it should be

HONOR_SIGNED_ZEROS (TYPE_MODE (TREE_TYPE (one of the conditional's arguments)))

much like is done in abs_replacement.  The difference is that we want to
look at the comparison (which may have different arguments than the PHI
we're converting) and that we can still apply the optimization, just in
a slightly different way.

jeff
Richard Biener Dec. 11, 2013, 3:11 p.m. UTC | #3
On Wed, Dec 11, 2013 at 3:51 PM, Jeff Law <law@redhat.com> wrote:
> On 12/11/13 02:51, Richard Biener wrote:
>>
>>
>> First of all phiopt runs unconditionally for -On with n > 0 but the
>> conversion
>> is clearly not suitable for non-speed optimizations.  Thus I'd guard it
>> with at least !optimize_size && optimize >= 2.  As you are targeting
>> a worse transformation done by if-conversion you may want to
>> add || (((flag_tree_loop_vectorize || cfun->has_force_vect_loops)
>>             && flag_tree_loop_if_convert != 0)
>>            || flag_tree_loop_if_convert == 1
>>            || flag_tree_loop_if_convert_stores == 1)
>> (ugh).
>
> That's a hell of a condition to guard a transformation.  But, yea, I agree.
>
>
>>> +
>>> +  /* If inversion is needed, first try to invert the test since
>>> +     that's cheapest.  */
>>> +  if (invert)
>>> +    {
>>> +      enum tree_code new_code
>>> +       = invert_tree_comparison (cond_code,
>>> +                                 HONOR_NANS (TYPE_MODE (TREE_TYPE
>>> (rhs))));
>>
>>
>> That looks wrong - you want to look at HONOR_NANS on the mode
>> of one of the comparison operands, not of the actual value you want
>> to negate (it's integer and thus never has NaNs).
>
> Bah.  That was supposed to be HONOR_SIGNED_ZEROS.  Which as far as I can
> tell is a property of the value being tested.

No, it's

invert_tree_comparison (enum tree_code code, bool honor_nans)

so indeed HONOR_NANS.  And yes, on an argument of the conditional
(it can be an FP comparison but an integer negate).

Richard.

> So it seems to me it should be
>
> HONOR_SIGNED_ZEROS (TYPE_MODE (TREE_TYPE (one of the conditional's
> arguments)))
>
> much like is done in abs_replacement.  With the difference we want to look
> at the comparison (which may have different arguments than the PHI we're
> converting) and that we can still apply the optimization, just in a slightly
> different way.
>
> jeff
>
Jeff Law Dec. 11, 2013, 4:32 p.m. UTC | #4
On 12/11/13 08:11, Richard Biener wrote:
>> Bah.  That was supposed to be HONOR_SIGNED_ZEROS.  Which as far as I can
>> tell is a property of the value being tested.
>
> No, it's
>
> invert_tree_comparison (enum tree_code code, bool honor_nans)
>
> so indeed HONOR_NANS.  And yes, on a conditional argument
> (it can be a FP comparison but a integer negate).
I realized I was wrong after I went downstairs for breakfast :-)  Ignore
my last message WRT HONOR_NANS and HONOR_SIGNED_ZEROS.

Jeff
Jeff Law Dec. 11, 2013, 4:53 p.m. UTC | #5
On 12/11/13 02:51, Richard Biener wrote:
>> +  /* If inversion is needed, first try to invert the test since
>> +     that's cheapest.  */
>> +  if (invert)
>> +    {
>> +      enum tree_code new_code
>> +       = invert_tree_comparison (cond_code,
>> +                                 HONOR_NANS (TYPE_MODE (TREE_TYPE (rhs))));
>
> That looks wrong - you want to look at HONOR_NANS on the mode
> of one of the comparison operands, not of the actual value you want
> to negate (it's integer and thus never has NaNs).
HONOR_NANS (TYPE_MODE (TREE_TYPE (gimple_cond_lhs (cond))))

Right?

Jeff
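
With that, the inversion block in neg_replacement would read roughly as
follows (a sketch of the fix under discussion, not the committed change):

  /* If inversion is needed, first try to invert the test since
     that's cheapest.  Query HONOR_NANS on a comparison operand,
     not on the value being negated.  */
  if (invert)
    {
      bool honor_nans
        = HONOR_NANS (TYPE_MODE (TREE_TYPE (gimple_cond_lhs (cond))));
      enum tree_code new_code = invert_tree_comparison (cond_code, honor_nans);

      /* If the comparison could be inverted directly, no separate
         inversion of the result is needed.  */
      if (new_code != ERROR_MARK)
        {
          cond_code = new_code;
          invert = false;
        }
    }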

Patch

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr45685.c b/gcc/testsuite/gcc.dg/tree-ssa/pr45685.c
new file mode 100644
index 0000000..0628943
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr45685.c
@@ -0,0 +1,41 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-tree-phiopt1-details" } */
+
+typedef unsigned long int uint64_t;
+typedef long int int64_t;
+int summation_helper_1(int64_t* products, uint64_t count)
+{
+	int s = 0;
+	uint64_t i;
+	for(i=0; i<count; i++)
+	{	
+		int64_t val = (products[i]>0) ? 1 : -1;
+		products[i] *= val;
+		if(products[i] != i)
+			val = -val;
+		products[i] = val;
+		s += val;
+	}
+	return s;
+}
+
+
+int summation_helper_2(int64_t* products, uint64_t count)
+{
+	int s = 0;
+	uint64_t i;
+	for(i=0; i<count; i++)
+	{	
+		int val = (products[i]>0) ? 1 : -1;
+		products[i] *= val;
+		if(products[i] != i)
+			val = -val;
+		products[i] = val;
+		s += val;
+	}
+	return s;
+}
+
+/* { dg-final { scan-tree-dump-times "converted to straightline code" 2 "phiopt1" } } */
+/* { dg-final { cleanup-tree-dump "phiopt1" } } */
+
diff --git a/gcc/tree-ssa-phiopt.c b/gcc/tree-ssa-phiopt.c
index 11e565f..2522255 100644
--- a/gcc/tree-ssa-phiopt.c
+++ b/gcc/tree-ssa-phiopt.c
@@ -69,6 +69,8 @@  static bool minmax_replacement (basic_block, basic_block,
 				edge, edge, gimple, tree, tree);
 static bool abs_replacement (basic_block, basic_block,
 			     edge, edge, gimple, tree, tree);
+static bool neg_replacement (basic_block, basic_block,
+			     edge, edge, gimple, tree, tree);
 static bool cond_store_replacement (basic_block, basic_block, edge, edge,
 				    struct pointer_set_t *);
 static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block);
@@ -489,6 +491,8 @@  tree_ssa_phiopt_worker (bool do_store_elim, bool do_hoist_loads)
 	    cfgchanged = true;
 	  else if (abs_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
 	    cfgchanged = true;
+	  else if (neg_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
+	    cfgchanged = true;
 	  else if (minmax_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
 	    cfgchanged = true;
 	}
@@ -1285,6 +1289,143 @@  abs_replacement (basic_block cond_bb, basic_block middle_bb,
   return true;
 }
 
+/*  The function neg_replacement replaces conditional negation with
+    equivalent straight line code.  Returns TRUE if replacement is done,
+    otherwise returns FALSE.
+
+    COND_BB branches around negation occurring in MIDDLE_BB.
+
+    E0 and E1 are edges out of COND_BB.  E0 reaches MIDDLE_BB and
+    E1 reaches the other successor which should contain PHI with
+    arguments ARG0 and ARG1.
+
+    Assuming negation is to occur when the condition is true,
+    then the non-branching sequence is:
+
+       result = (rhs ^ -cond) + cond
+
+    Inverting the condition or its result gives us negation
+    when the original condition is false.  */
+
+static bool
+neg_replacement (basic_block cond_bb, basic_block middle_bb,
+		 edge e0 ATTRIBUTE_UNUSED, edge e1,
+		 gimple phi, tree arg0, tree arg1)
+{
+  gimple new_stmt, cond;
+  gimple_stmt_iterator gsi;
+  gimple assign;
+  edge true_edge, false_edge;
+  tree rhs, lhs;
+  enum tree_code cond_code;
+  bool invert = false;
+
+  /* This transformation performs logical operations on the
+     incoming arguments.  So force them to be integral types.   */
+  if (!INTEGRAL_TYPE_P (TREE_TYPE (arg0)))
+    return false;
+
+  /* OTHER_BLOCK must have only one executable statement which must have the
+     form arg0 = -arg1 or arg1 = -arg0.  */
+
+  assign = last_and_only_stmt (middle_bb);
+  /* If we did not find the proper negation assignment, then we can not
+     optimize.  */
+  if (assign == NULL)
+    return false;
+
+  /* If we got here, then we have found the only executable statement
+     in OTHER_BLOCK.  If it is anything other than arg0 = -arg1 or
+     arg1 = -arg0, then we can not optimize.  */
+  if (gimple_code (assign) != GIMPLE_ASSIGN)
+    return false;
+
+  lhs = gimple_assign_lhs (assign);
+
+  if (gimple_assign_rhs_code (assign) != NEGATE_EXPR)
+    return false;
+
+  rhs = gimple_assign_rhs1 (assign);
+
+  /* The assignment has to be arg0 = -arg1 or arg1 = -arg0.  */
+  if (!(lhs == arg0 && rhs == arg1)
+      && !(lhs == arg1 && rhs == arg0))
+    return false;
+
+  /* The basic sequence assumes we negate when the condition is true.
+     If we need the opposite, then we will either need to invert the
+     condition or its result.  */
+  extract_true_false_edges_from_block (cond_bb, &true_edge, &false_edge);
+  invert = false_edge->dest == middle_bb;
+
+  /* Unlike abs_replacement, we can handle arbitrary conditionals here.  */
+  cond = last_stmt (cond_bb);
+  cond_code = gimple_cond_code (cond);
+
+  /* If inversion is needed, first try to invert the test since
+     that's cheapest.  */
+  if (invert)
+    {
+      enum tree_code new_code
+	= invert_tree_comparison (cond_code,
+				  HONOR_NANS (TYPE_MODE (TREE_TYPE (rhs))));
+
+      /* If invert_tree_comparison was successful, then use its return
+	 value as the new code and note that inversion is no longer
+	 needed.  */
+      if (new_code != ERROR_MARK)
+	{
+	  cond_code = new_code;
+	  invert = false;
+	}
+    }
+
+  tree cond_val = make_ssa_name (boolean_type_node, NULL);
+  new_stmt = gimple_build_assign_with_ops (cond_code, cond_val,
+					   gimple_cond_lhs (cond),
+					   gimple_cond_rhs (cond));
+  gsi = gsi_last_bb (cond_bb);
+  gsi_insert_before (&gsi, new_stmt, GSI_NEW_STMT);
+
+  /* If we still need inversion, then invert the result of the
+     condition.  */
+  if (invert)
+    {
+      tree tmp = make_ssa_name (boolean_type_node, NULL);
+      new_stmt = gimple_build_assign_with_ops (BIT_XOR_EXPR, tmp,
+					       cond_val, boolean_true_node);
+      gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
+      cond_val = tmp;
+    }
+
+  /* Get the condition in the right type so that we can perform
+     logical and arithmetic operations on it.  */
+  tree cond_val_converted = make_ssa_name (TREE_TYPE (rhs), NULL);
+  new_stmt = gimple_build_assign_with_ops (NOP_EXPR, cond_val_converted,
+					   cond_val, NULL_TREE);
+  gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
+
+  tree neg_cond_val_converted = make_ssa_name (TREE_TYPE (rhs), NULL);
+  new_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, neg_cond_val_converted,
+					   cond_val_converted, NULL_TREE);
+  gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
+
+  tree tmp = make_ssa_name (TREE_TYPE (rhs), NULL);
+  new_stmt = gimple_build_assign_with_ops (BIT_XOR_EXPR, tmp,
+					   rhs, neg_cond_val_converted);
+  gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
+
+  tree new_lhs = make_ssa_name (TREE_TYPE (rhs), NULL);
+  new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, new_lhs,
+					   tmp, cond_val_converted);
+  gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
+
+  replace_phi_edge_with_variable (cond_bb, e1, phi, new_lhs);
+
+  /* Note that we optimized this PHI.  */
+  return true;
+}
+
 /* Auxiliary functions to determine the set of memory accesses which
    can't trap because they are preceded by accesses to the same memory
    portion.  We do that for MEM_REFs, so we only need to track