diff mbox

[10/16] Add pass_oacc_kernels pass group in passes.def

Message ID alpine.LSU.2.11.1511231217040.4884@t29.fhfr.qr
State New
Headers show

Commit Message

Richard Biener Nov. 23, 2015, 11:31 a.m. UTC
On Sat, 21 Nov 2015, Tom de Vries wrote:

> On 20/11/15 11:28, Richard Biener wrote:
> > On Thu, 19 Nov 2015, Tom de Vries wrote:
> > 
> > > >On 17/11/15 15:53, Tom de Vries wrote:
> > > > > > > >And the above LIM example
> > > > > > > >is none for why you need two LIM passes...
> > > > > >
> > > > > >Indeed. I'm planning a separate reply to explain in more detail the
> > > > need
> > > > > >for the two pass_lims.
> > > >
> > > >I.
> > > >
> > > >I managed to get rid of the two pass_lims for the motivating example that
> > > I
> > > >used until now (goacc/kernels-double-reduction.c). I found that by adding
> > > a
> > > >pass_dominator instance after pass_ch, I could get rid of the second
> > > pass_lim
> > > >(and pass_copyprop as well).
> > > >
> > > >But... then I wrote a counter example
> > > (goacc/kernels-double-reduction-n.c),
> > > >and I'm back at two pass_lims (and two pass_dominators).
> > > >Also I've split the pass group into a bit before and after pass_fre.
> > > >
> > > >So, the current pass group looks like:
> > > >...
> > > >NEXT_PASS (pass_build_ealias);
> > > >
> > > >/* Pass group that runs when the function is an offloaded function
> > > >    containing oacc kernels loops.  Part 1.  */
> > > >NEXT_PASS (pass_oacc_kernels);
> > > >PUSH_INSERT_PASSES_WITHIN (pass_oacc_kernels)
> > > >     /* We need pass_ch here, because pass_lim has no effect on
> > > >        exit-first loops (PR65442).  Ideally we want to remove both
> > > >        this pass instantiation, and the reverse transformation
> > > >        transform_to_exit_first_loop_alt, which is done in
> > > >        pass_parallelize_loops_oacc_kernels. */
> > > >     NEXT_PASS (pass_ch);
> > > >POP_INSERT_PASSES ()
> > > >
> > > >NEXT_PASS (pass_fre);
> > > >
> > > >/* Pass group that runs when the function is an offloaded function
> > > >    containing oacc kernels loops.  Part 2.  */
> > > >NEXT_PASS (pass_oacc_kernels2);
> > > >PUSH_INSERT_PASSES_WITHIN (pass_oacc_kernels2)
> > > >     /* We use pass_lim to rewrite in-memory iteration and reduction
> > > >        variable accesses in loops into local variables accesses.  */
> > > >     NEXT_PASS (pass_lim);
> > > >     NEXT_PASS (pass_dominator, false /* may_peel_loop_headers_p */);
> > > >     NEXT_PASS (pass_lim);
> > > >     NEXT_PASS (pass_dominator, false /* may_peel_loop_headers_p */);
> > > >     NEXT_PASS (pass_dce);
> > > >     NEXT_PASS (pass_parallelize_loops_oacc_kernels);
> > > >     NEXT_PASS (pass_expand_omp_ssa);
> > > >POP_INSERT_PASSES ()
> > > >NEXT_PASS (pass_merge_phi);
> > > >...
> > > >
> > > >
> > > >II.
> > > >
> > > >The motivating test-case kernels-double-reduction-n.c:
> > > >...
> > > >#include <stdlib.h>
> > > >
> > > >#define N 500
> > > >
> > > >unsigned int a[N][N];
> > > >
> > > >void  __attribute__((noinline,noclone))
> > > >foo (unsigned int n)
> > > >{
> > > >   int i, j;
> > > >   unsigned int sum = 1;
> > > >
> > > >#pragma acc kernels copyin (a[0:n]) copy (sum)
> > > >   {
> > > >     for (i = 0; i < n; ++i)
> > > >       for (j = 0; j < n; ++j)
> > > >         sum += a[i][j];
> > > >   }
> > > >
> > > >   if (sum != 5001)
> > > >     abort ();
> > > >}
> > > >...
> > > >
> > > >
> > > >III.
> > > >
> > > >Before first pass_lim. Note no phis on inner or outer loop header for
> > > >iteration varables or reduction variable:
> > > >...
> > > >   <bb 2>:
> > > >   _5 = *.omp_data_i_4(D).i;
> > > >   *_5 = 0;
> > > >   _44 = *.omp_data_i_4(D).n;
> > > >   _45 = *_44;
> > > >   if (_45 != 0)
> > > >     goto <bb 4>;
> > > >   else
> > > >     goto <bb 3>;
> > > >
> > > >   <bb 4>: outer loop header
> > > >   _12 = *.omp_data_i_4(D).j;
> > > >   *_12 = 0;
> > > >   if (_45 != 0)
> > > >     goto <bb 6>;
> > > >   else
> > > >     goto <bb 5>;
> > > >
> > > >   <bb 6>: inner loop header, latch
> > > >   _19 = *.omp_data_i_4(D).a;
> > > >   _21 = *_5;
> > > >   _23 = *_12;
> > > >   _24 = *_19[_21][_23];
> > > >   _25 = *.omp_data_i_4(D).sum;
> > > >   sum.0_26 = *_25;
> > > >   sum.1_27 = _24 + sum.0_26;
> > > >   *_25 = sum.1_27;
> > > >   _33 = _23 + 1;
> > > >   *_12 = _33;
> > > >   j.2_16 = (unsigned int) _33;
> > > >   if (j.2_16 < _45)
> > > >     goto <bb 6>;
> > > >   else
> > > >     goto <bb 5>;
> > > >
> > > >   <bb 5>: outer loop latch
> > > >   _36 = *_5;
> > > >   _38 = _36 + 1;
> > > >   *_5 = _38;
> > > >   i.3_9 = (unsigned int) _38;
> > > >   if (i.3_9 < _45)
> > > >     goto <bb 4>;
> > > >   else
> > > >     goto <bb 3>;
> > > >
> > > >   <bb 3>:
> > > >   return;
> > > >...
> > > >
> > > >
> > > >IV.
> > > >
> > > >After first pass_lim/pass_dom pair. Note there are phis on the inner loop
> > > >header for the reduction and the iteration variable, but not on the outer
> > > loop
> > > >header:
> > > >...
> > > >   <bb 2>:
> > > >   _5 = *.omp_data_i_4(D).i;
> > > >   *_5 = 0;
> > > >   _44 = *.omp_data_i_4(D).n;
> > > >   _45 = *_44;
> > > >   if (_45 != 0)
> > > >     goto <bb 4>;
> > > >   else
> > > >     goto <bb 3>;
> > > >
> > > >   <bb 4>:
> > > >   _12 = *.omp_data_i_4(D).j;
> > > >   _19 = *.omp_data_i_4(D).a;
> > > >   D__lsm.10_50 = *_12;
> > > >   D__lsm.11_51 = 0;
> > > >   _25 = *.omp_data_i_4(D).sum;
> > > >
> > > >   <bb 5>: outer loop header
> > > >   D__lsm.10_20 = 0;
> > > >   D__lsm.11_22 = 1;
> > > >   _21 = *_5;
> > > >   D__lsm.12_28 = *_25;
> > > >   D__lsm.13_30 = 0;
> > > >   goto <bb 7>;
> > > >
> > > >   <bb 7>: inner loop header, latch
> > > >   # D__lsm.10_47 = PHI <0(5), _33(7)>
> > > >   # D__lsm.12_49 = PHI <D__lsm.12_28(5), sum.1_27(7)>
> > > >   _23 = D__lsm.10_47;
> > > >   _24 = *_19[_21][D__lsm.10_47];
> > > >   sum.0_26 = D__lsm.12_49;
> > > >   sum.1_27 = _24 + D__lsm.12_49;
> > > >   D__lsm.12_31 = sum.1_27;
> > > >   D__lsm.13_32 = 1;
> > > >   _33 = D__lsm.10_47 + 1;
> > > >   D__lsm.10_14 = _33;
> > > >   D__lsm.11_15 = 1;
> > > >   j.2_16 = (unsigned int) _33;
> > > >   if (j.2_16 < _45)
> > > >     goto <bb 7>;
> > > >   else
> > > >     goto <bb 8>;
> > > >
> > > >   <bb 8>: outer loop latch
> > > >   # D__lsm.10_35 = PHI <_33(7)>
> > > >   # D__lsm.11_37 = PHI <1(7)>
> > > >   # D__lsm.12_7 = PHI <sum.1_27(7)>
> > > >   # D__lsm.13_8 = PHI <1(7)>
> > > >   *_25 = sum.1_27;
> > > >   _36 = *_5;
> > > >   _38 = _36 + 1;
> > > >   *_5 = _38;
> > > >   i.3_9 = (unsigned int) _38;
> > > >   if (i.3_9 < _45)
> > > >     goto <bb 5>;
> > > >   else
> > > >     goto <bb 6>;
> > > >
> > > >   <bb 6>:
> > > >   # D__lsm.10_10 = PHI <_33(8)>
> > > >   # D__lsm.11_11 = PHI <1(8)>
> > > >   *_12 = _33;
> > > >   goto <bb 3>;
> > > >
> > > >   <bb 3>:
> > > >   return;
> > > >...
> > > >
> > > >
> > > >V.
> > > >
> > > >After second pass_lim/pass_dom pair. Note there are phis on the inner and
> > > >outer loop header for the reduction and the iteration variables:
> > > >...
> > > >   <bb 2>:
> > > >   _5 = *.omp_data_i_4(D).i;
> > > >   *_5 = 0;
> > > >   _44 = *.omp_data_i_4(D).n;
> > > >   _45 = *_44;
> > > >   if (_45 != 0)
> > > >     goto <bb 4>;
> > > >   else
> > > >     goto <bb 3>;
> > > >
> > > >   <bb 4>:
> > > >   _12 = *.omp_data_i_4(D).j;
> > > >   _19 = *.omp_data_i_4(D).a;
> > > >   D__lsm.10_50 = *_12;
> > > >   D__lsm.11_51 = 0;
> > > >   _25 = *.omp_data_i_4(D).sum;
> > > >   D__lsm.14_40 = 0;
> > > >   D__lsm.15_2 = 0;
> > > >   D__lsm.16_1 = *_25;
> > > >   D__lsm.17_46 = 0;
> > > >
> > > >   <bb 5>: outer loop header
> > > >   # D__lsm.14_13 = PHI <0(4), _38(8)>
> > > >   # D__lsm.16_34 = PHI <D__lsm.16_1(4), sum.1_27(8)>
> > > >   D__lsm.10_20 = 0;
> > > >   D__lsm.11_22 = 1;
> > > >   _21 = D__lsm.14_13;
> > > >   D__lsm.12_28 = D__lsm.16_34;
> > > >   D__lsm.13_30 = 0;
> > > >   goto <bb 7>;
> > > >
> > > >   <bb 7>: inner loop header, latch
> > > >   # D__lsm.10_47 = PHI <0(5), _33(7)>
> > > >   # D__lsm.12_49 = PHI <D__lsm.16_34(5), sum.1_27(7)>
> > > >   _23 = D__lsm.10_47;
> > > >   _24 = *_19[D__lsm.14_13][D__lsm.10_47];
> > > >   sum.0_26 = D__lsm.12_49;
> > > >   sum.1_27 = _24 + D__lsm.12_49;
> > > >   D__lsm.12_31 = sum.1_27;
> > > >   D__lsm.13_32 = 1;
> > > >   _33 = D__lsm.10_47 + 1;
> > > >   D__lsm.10_14 = _33;
> > > >   D__lsm.11_15 = 1;
> > > >   j.2_16 = (unsigned int) _33;
> > > >   if (j.2_16 < _45)
> > > >     goto <bb 7>;
> > > >   else
> > > >     goto <bb 8>;
> > > >
> > > >   <bb 8>: outer loop latch
> > > >   # D__lsm.10_35 = PHI <_33(7)>
> > > >   # D__lsm.11_37 = PHI <1(7)>
> > > >   # D__lsm.12_7 = PHI <sum.1_27(7)>
> > > >   # D__lsm.13_8 = PHI <1(7)>
> > > >   # sum.1_48 = PHI <sum.1_27(7)>
> > > >   # _53 = PHI <_33(7)>
> > > >   D__lsm.16_56 = sum.1_27;
> > > >   D__lsm.17_57 = 1;
> > > >   _36 = D__lsm.14_13;
> > > >   _38 = D__lsm.14_13 + 1;
> > > >   D__lsm.14_58 = _38;
> > > >   D__lsm.15_59 = 1;
> > > >   i.3_9 = (unsigned int) _38;
> > > >   if (i.3_9 < _45)
> > > >     goto <bb 5>;
> > > >   else
> > > >     goto <bb 6>;
> > > >
> > > >   <bb 6>:
> > > >   # D__lsm.10_10 = PHI <_33(8)>
> > > >   # D__lsm.11_11 = PHI <1(8)>
> > > >   # _43 = PHI <_33(8)>
> > > >   # D__lsm.16_62 = PHI <sum.1_27(8)>
> > > >   # D__lsm.17_63 = PHI <1(8)>
> > > >   # D__lsm.14_64 = PHI <_38(8)>
> > > >   # D__lsm.15_65 = PHI <1(8)>
> > > >   *_5 = _38;
> > > >   *_25 = sum.1_27;
> > > >   *_12 = _33;
> > > >   goto <bb 3>;
> > > >
> > > >   <bb 3>:
> > > >   return;
> > > >...
> > Sorry but staring at dumps doesn't make me understand the issue you
> > run into.  Where can I reproduce this if I have time to look at this?
> 
> I've posted the state of the patch series that reproduces this problem at
> https://github.com/vries/gcc/commits/vries/master-port-kernels-test-rb , run
> goacc.exp, testcase kernels-double-reduction-n.c.
> 
> > From the dump below I understand you want no memory references in
> > the outer loop?
> > So the issue seems to be that store motion fails
> > to insert the preheader load / exit store to the outermost loop
> > possible and thus another LIM pass is needed to "store motion" those
> > again?
> 
> Yep.
> 
> >  But a simple testcase
> > 
> > int a;
> > int *p = &a;
> > int foo (int n)
> > {
> >    for (int i = 0; i < n; ++i)
> >      for (int j = 0; j < 100; ++j)
> >        *p += j + i;
> >    return a;
> > }
> > 
> > shows that LIM can do this in one step.
> 
> I've filed a FTR PR68465 - "pass_lim doesn't detect identical loop entry
> conditions" for a test-case where that doesn't happen (when using
> -fno-tree-dominator-opts).
> 
> > Which means it should
> > be investigated why it doesn't do this properly for your testcase
> > (store motion of *_25).
> 
> There seems to be two related problems:
> 1. the store has tree_could_trap_p (ref->mem.ref) true, which should be
>    false. I'll work on a fix for this.
> 2. Give that the store can trap, I  was running into PR68465. I managed
>    to eliminate the 2nd pass_lim by moving the pass_dominator instance
>    before the pass_lim instance.
> 
> Attached patch shows the pass group with only one pass_lim. I hope to be able
> to eliminate the first pass_dominator instance before pass_lim once I fix 1.
> 
> > Simply adding two LIM passes either papers over a wrong-code
> > bug (in LIM or in DOM) or over a missed-optimization in LIM.
> 
> AFAIU now, it's PR68465, a missed optimization in LIM.

Ok, it's not really LIMs job to cleanup loop header copying that way.

DOM performs jump-threading for this but FRE should also be able
to handle this just fine.  Ah, it doesn't because the outer loop
header directly contains the condition


fixes this for me (for a small testcase).  Does it help yours?

Otherwise untested of course (I hope EDGE_DFS_BACK is good enough,
it's supposed to match edges that have the src dominated by the dest).
Testing the above now.

Thanks,
Richard.

Comments

Tom de Vries Nov. 23, 2015, 3:37 p.m. UTC | #1
On 23/11/15 12:31, Richard Biener wrote:
>>>  From the dump below I understand you want no memory references in
>>> > >the outer loop?
>>> > >So the issue seems to be that store motion fails
>>> > >to insert the preheader load / exit store to the outermost loop
>>> > >possible and thus another LIM pass is needed to "store motion" those
>>> > >again?
>> >
>> >Yep.
>> >
>>> > >  But a simple testcase
>>> > >
>>> > >int a;
>>> > >int *p = &a;
>>> > >int foo (int n)
>>> > >{
>>> > >    for (int i = 0; i < n; ++i)
>>> > >      for (int j = 0; j < 100; ++j)
>>> > >        *p += j + i;
>>> > >    return a;
>>> > >}
>>> > >
>>> > >shows that LIM can do this in one step.
>> >
>> >I've filed a FTR PR68465 - "pass_lim doesn't detect identical loop entry
>> >conditions" for a test-case where that doesn't happen (when using
>> >-fno-tree-dominator-opts).
>> >
>>> > >Which means it should
>>> > >be investigated why it doesn't do this properly for your testcase
>>> > >(store motion of *_25).
>> >
>> >There seems to be two related problems:
>> >1. the store has tree_could_trap_p (ref->mem.ref) true, which should be
>> >    false. I'll work on a fix for this.
>> >2. Give that the store can trap, I  was running into PR68465. I managed
>> >    to eliminate the 2nd pass_lim by moving the pass_dominator instance
>> >    before the pass_lim instance.
>> >
>> >Attached patch shows the pass group with only one pass_lim. I hope to be able
>> >to eliminate the first pass_dominator instance before pass_lim once I fix 1.
>> >
>>> > >Simply adding two LIM passes either papers over a wrong-code
>>> > >bug (in LIM or in DOM) or over a missed-optimization in LIM.
>> >
>> >AFAIU now, it's PR68465, a missed optimization in LIM.
> Ok, it's not really LIMs job to cleanup loop header copying that way.
>
> DOM performs jump-threading for this but FRE should also be able
> to handle this just fine.  Ah, it doesn't because the outer loop
> header directly contains the condition
>
> Index: gcc/tree-ssa-sccvn.c
> ===================================================================
> --- gcc/tree-ssa-sccvn.c        (revision 230737)
> +++ gcc/tree-ssa-sccvn.c        (working copy)
> @@ -4357,20 +4402,32 @@ sccvn_dom_walker::before_dom_children (b
>
>     /* If we have a single predecessor record the equivalence from a
>        possible condition on the predecessor edge.  */
> -  if (single_pred_p (bb))
> +  edge pred_e = NULL;
> +  FOR_EACH_EDGE (e, ei, bb->preds)
> +    {
> +      if (e->flags & EDGE_DFS_BACK)
> +       continue;
> +      if (! pred_e)
> +       pred_e = e;
> +      else
> +       {
> +         pred_e = NULL;
> +         break;
> +       }
> +    }
> +  if (pred_e)
>       {
> -      edge e = single_pred_edge (bb);
>         /* Check if there are multiple executable successor edges in
>           the source block.  Otherwise there is no additional info
>           to be recorded.  */
>         edge e2;
> -      FOR_EACH_EDGE (e2, ei, e->src->succs)
> -       if (e2 != e
> +      FOR_EACH_EDGE (e2, ei, pred_e->src->succs)
> +       if (e2 != pred_e
>              && e2->flags & EDGE_EXECUTABLE)
>            break;
>         if (e2 && (e2->flags & EDGE_EXECUTABLE))
>          {
> -         gimple *stmt = last_stmt (e->src);
> +         gimple *stmt = last_stmt (pred_e->src);
>            if (stmt
>                && gimple_code (stmt) == GIMPLE_COND)
>              {
> @@ -4378,11 +4435,11 @@ sccvn_dom_walker::before_dom_children (b
>                tree lhs = gimple_cond_lhs (stmt);
>                tree rhs = gimple_cond_rhs (stmt);
>                record_conds (bb, code, lhs, rhs,
> -                           (e->flags & EDGE_TRUE_VALUE) != 0);
> +                           (pred_e->flags & EDGE_TRUE_VALUE) != 0);
>                code = invert_tree_comparison (code, HONOR_NANS (lhs));
>                if (code != ERROR_MARK)
>                  record_conds (bb, code, lhs, rhs,
> -                             (e->flags & EDGE_TRUE_VALUE) == 0);
> +                             (pred_e->flags & EDGE_TRUE_VALUE) == 0);
>              }
>          }
>       }
>
> fixes this for me (for a small testcase).  Does it help yours?
>

Yes, it has the desired effect (of not needing pass_dominator before 
pass_lim) . But, patch "Mark by_ref mem_ref in build_receiver_ref as 
non-trapping" committed as r230738, also has that effect, so AFAIU I 
don't require this tree-ssa-sccvn.c fix.

Thanks,
- Tom

> Otherwise untested of course (I hope EDGE_DFS_BACK is good enough,
> it's supposed to match edges that have the src dominated by the dest).
> Testing the above now.
Richard Biener Nov. 23, 2015, 4:36 p.m. UTC | #2
On November 23, 2015 4:37:18 PM GMT+01:00, Tom de Vries <Tom_deVries@mentor.com> wrote:
>On 23/11/15 12:31, Richard Biener wrote:
>>>>  From the dump below I understand you want no memory references in
>>>> > >the outer loop?
>>>> > >So the issue seems to be that store motion fails
>>>> > >to insert the preheader load / exit store to the outermost loop
>>>> > >possible and thus another LIM pass is needed to "store motion"
>those
>>>> > >again?
>>> >
>>> >Yep.
>>> >
>>>> > >  But a simple testcase
>>>> > >
>>>> > >int a;
>>>> > >int *p = &a;
>>>> > >int foo (int n)
>>>> > >{
>>>> > >    for (int i = 0; i < n; ++i)
>>>> > >      for (int j = 0; j < 100; ++j)
>>>> > >        *p += j + i;
>>>> > >    return a;
>>>> > >}
>>>> > >
>>>> > >shows that LIM can do this in one step.
>>> >
>>> >I've filed a FTR PR68465 - "pass_lim doesn't detect identical loop
>entry
>>> >conditions" for a test-case where that doesn't happen (when using
>>> >-fno-tree-dominator-opts).
>>> >
>>>> > >Which means it should
>>>> > >be investigated why it doesn't do this properly for your
>testcase
>>>> > >(store motion of *_25).
>>> >
>>> >There seems to be two related problems:
>>> >1. the store has tree_could_trap_p (ref->mem.ref) true, which
>should be
>>> >    false. I'll work on a fix for this.
>>> >2. Give that the store can trap, I  was running into PR68465. I
>managed
>>> >    to eliminate the 2nd pass_lim by moving the pass_dominator
>instance
>>> >    before the pass_lim instance.
>>> >
>>> >Attached patch shows the pass group with only one pass_lim. I hope
>to be able
>>> >to eliminate the first pass_dominator instance before pass_lim once
>I fix 1.
>>> >
>>>> > >Simply adding two LIM passes either papers over a wrong-code
>>>> > >bug (in LIM or in DOM) or over a missed-optimization in LIM.
>>> >
>>> >AFAIU now, it's PR68465, a missed optimization in LIM.
>> Ok, it's not really LIMs job to cleanup loop header copying that way.
>>
>> DOM performs jump-threading for this but FRE should also be able
>> to handle this just fine.  Ah, it doesn't because the outer loop
>> header directly contains the condition
>>
>> Index: gcc/tree-ssa-sccvn.c
>> ===================================================================
>> --- gcc/tree-ssa-sccvn.c        (revision 230737)
>> +++ gcc/tree-ssa-sccvn.c        (working copy)
>> @@ -4357,20 +4402,32 @@ sccvn_dom_walker::before_dom_children (b
>>
>>     /* If we have a single predecessor record the equivalence from a
>>        possible condition on the predecessor edge.  */
>> -  if (single_pred_p (bb))
>> +  edge pred_e = NULL;
>> +  FOR_EACH_EDGE (e, ei, bb->preds)
>> +    {
>> +      if (e->flags & EDGE_DFS_BACK)
>> +       continue;
>> +      if (! pred_e)
>> +       pred_e = e;
>> +      else
>> +       {
>> +         pred_e = NULL;
>> +         break;
>> +       }
>> +    }
>> +  if (pred_e)
>>       {
>> -      edge e = single_pred_edge (bb);
>>         /* Check if there are multiple executable successor edges in
>>           the source block.  Otherwise there is no additional info
>>           to be recorded.  */
>>         edge e2;
>> -      FOR_EACH_EDGE (e2, ei, e->src->succs)
>> -       if (e2 != e
>> +      FOR_EACH_EDGE (e2, ei, pred_e->src->succs)
>> +       if (e2 != pred_e
>>              && e2->flags & EDGE_EXECUTABLE)
>>            break;
>>         if (e2 && (e2->flags & EDGE_EXECUTABLE))
>>          {
>> -         gimple *stmt = last_stmt (e->src);
>> +         gimple *stmt = last_stmt (pred_e->src);
>>            if (stmt
>>                && gimple_code (stmt) == GIMPLE_COND)
>>              {
>> @@ -4378,11 +4435,11 @@ sccvn_dom_walker::before_dom_children (b
>>                tree lhs = gimple_cond_lhs (stmt);
>>                tree rhs = gimple_cond_rhs (stmt);
>>                record_conds (bb, code, lhs, rhs,
>> -                           (e->flags & EDGE_TRUE_VALUE) != 0);
>> +                           (pred_e->flags & EDGE_TRUE_VALUE) != 0);
>>                code = invert_tree_comparison (code, HONOR_NANS
>(lhs));
>>                if (code != ERROR_MARK)
>>                  record_conds (bb, code, lhs, rhs,
>> -                             (e->flags & EDGE_TRUE_VALUE) == 0);
>> +                             (pred_e->flags & EDGE_TRUE_VALUE) ==
>0);
>>              }
>>          }
>>       }
>>
>> fixes this for me (for a small testcase).  Does it help yours?
>>
>
>Yes, it has the desired effect (of not needing pass_dominator before 
>pass_lim) . But, patch "Mark by_ref mem_ref in build_receiver_ref as 
>non-trapping" committed as r230738, also has that effect, so AFAIU I 
>don't require this tree-ssa-sccvn.c fix.

OK, I committed it anyway already.

Richard.

>Thanks,
>- Tom
>
>> Otherwise untested of course (I hope EDGE_DFS_BACK is good enough,
>> it's supposed to match edges that have the src dominated by the
>dest).
>> Testing the above now.
diff mbox

Patch

Index: gcc/tree-ssa-sccvn.c
===================================================================
--- gcc/tree-ssa-sccvn.c        (revision 230737)
+++ gcc/tree-ssa-sccvn.c        (working copy)
@@ -4357,20 +4402,32 @@  sccvn_dom_walker::before_dom_children (b
 
   /* If we have a single predecessor record the equivalence from a
      possible condition on the predecessor edge.  */
-  if (single_pred_p (bb))
+  edge pred_e = NULL;
+  FOR_EACH_EDGE (e, ei, bb->preds)
+    {
+      if (e->flags & EDGE_DFS_BACK)
+       continue;
+      if (! pred_e)
+       pred_e = e;
+      else
+       {
+         pred_e = NULL;
+         break;
+       }
+    }
+  if (pred_e)
     {
-      edge e = single_pred_edge (bb);
       /* Check if there are multiple executable successor edges in
         the source block.  Otherwise there is no additional info
         to be recorded.  */
       edge e2;
-      FOR_EACH_EDGE (e2, ei, e->src->succs)
-       if (e2 != e
+      FOR_EACH_EDGE (e2, ei, pred_e->src->succs)
+       if (e2 != pred_e
            && e2->flags & EDGE_EXECUTABLE)
          break;
       if (e2 && (e2->flags & EDGE_EXECUTABLE))
        {
-         gimple *stmt = last_stmt (e->src);
+         gimple *stmt = last_stmt (pred_e->src);
          if (stmt
              && gimple_code (stmt) == GIMPLE_COND)
            {
@@ -4378,11 +4435,11 @@  sccvn_dom_walker::before_dom_children (b
              tree lhs = gimple_cond_lhs (stmt);
              tree rhs = gimple_cond_rhs (stmt);
              record_conds (bb, code, lhs, rhs,
-                           (e->flags & EDGE_TRUE_VALUE) != 0);
+                           (pred_e->flags & EDGE_TRUE_VALUE) != 0);
              code = invert_tree_comparison (code, HONOR_NANS (lhs));
              if (code != ERROR_MARK)
                record_conds (bb, code, lhs, rhs,
-                             (e->flags & EDGE_TRUE_VALUE) == 0);
+                             (pred_e->flags & EDGE_TRUE_VALUE) == 0);
            }
        }
     }