Patchwork [PR41488] Recognize more induction variables by simplifying PEELED chrec in scalar evolution

login
register
mail settings
Submitter Bin Cheng
Date Dec. 6, 2013, 10:29 a.m.
Message ID <000501cef26e$19db9b70$4d92d250$@arm.com>
Download mbox | patch
Permalink /patch/297703/
State New
Headers show

Comments

Bin Cheng - Dec. 6, 2013, 10:29 a.m.
Hi,
Entry pr41488 shows a case in which induction variable can't be recognized
or coalesced.  As noted in the bug entry, one possible fix is to teach PRE
not to do some specific transformation.  However, it's in nature a scalar
evolution issue.  Considering below snippet:

   # i_17 = PHI <i_13(5), 0(3)>
   # _20 = PHI <_5(5), start_4(D)(3)>
   ...
   i_13 = i_17 + 1;
   _5 = start_4(D) + i_13;

Variable _20 appears in the form of PEELED chrec like (start_4, _5)_LOOP,
meaning it has value "start_4" in the 1st iteration, then changes to _5 in
following iterations.  PEELED chrec is not implemented by GCC right now, it
can be simplified into either POLYNOMIAL or PERIOD one.  The POLYNOMIAL
chrec is processed by GCC after being recognized, as in the examle, _20 is
actually {start_4, 1}_LOOP.

This patch modifies scalar evolution by trying to simplify PEELED chrec.
Without this patch, generated code for pr41488.c is like:
foo:
	@ args = 0, pretend = 0, frame = 0
	@ frame_needed = 0, uses_anonymous_args = 0
	@ link register save eliminated.
	cmp	r1, r2
	bge	.L7
	push	{r4}
	mov	r3, r1
	ldr	r4, [r0]
	adds	r2, r2, #1
	adds	r1, r1, #1
	movs	r0, #0
.L3:
	str	r0, [r4, r3, lsl #2]
	mov	r3, r1
	adds	r1, r1, #1
	cmp	r1, r2
	bne	.L3
	ldr	r4, [sp], #4
.L7:
	bx	lr
	.size	foo, .-foo

Which is improved to:
foo:
	@ args = 0, pretend = 0, frame = 0
	@ frame_needed = 0, uses_anonymous_args = 0
	@ link register save eliminated.
	cmp	r1, r2
	bge	.L1
	ldr	r3, [r0]
	movs	r0, #0
	add	r1, r3, r1, lsl #2
	add	r2, r3, r2, lsl #2
.L3:
	str	r0, [r1], #4
	cmp	r1, r2
	bne	.L3
.L1:
	bx	lr
	.size	foo, .-foo

One point needs to be clarified that I use tree_to_aff_combination_expand in
the patch.  Rational is the number of the specific PEELED_CHRECs should be
moderate, I also check the equality literally firstly and resort to affine
facility if that fails.  By measuring bootstrap of gcc and spec2kint, I
collected the number of opportunities caught by this patch like below:
                            literal comparison/affine comparison
GCC                          ~1200/~250
Spec2Kint              93/34

I could back trace the ssa chain instead of using affine functions, but that
would miss some cases.

Bootstrap and test on x86/x86_64/arm.  Is it OK?

Thanks,
bin

2013-12-06  Bin Cheng  <bin.cheng@arm.com>

	PR tree-optimization/41488
	* tree-ssa-loop-ivopts.c (add_old_iv_candidates): Don't add cand
	for PEELED_CHREC kind of IV.
	* tree-scalar-evolution.c: Include necessary header files.
	(peeled_chrec_map, simplify_peeled_chrec): New.
	(analyze_evolution_in_loop): New static variable.
	Call simplify_peeled_chrec.
	(scev_initialize): Initialize peeled_chrec_map.
	(scev_reset, scev_finalize): Reset and release peeled_chrec_map.

gcc/testsuite/ChangeLog
2013-12-06  Bin Cheng  <bin.cheng@arm.com>

	* gcc.dg/tree-ssa/scev-7.c: New test.
	* gcc.dg/pr41488.c: New test.
Richard Guenther - Dec. 6, 2013, 11:18 a.m.
On Fri, Dec 6, 2013 at 11:29 AM, bin.cheng <bin.cheng@arm.com> wrote:
> Hi,
> Entry pr41488 shows a case in which induction variable can't be recognized
> or coalesced.  As noted in the bug entry, one possible fix is to teach PRE
> not to do some specific transformation.  However, it's in nature a scalar
> evolution issue.  Considering below snippet:
>
>    # i_17 = PHI <i_13(5), 0(3)>
>    # _20 = PHI <_5(5), start_4(D)(3)>
>    ...
>    i_13 = i_17 + 1;
>    _5 = start_4(D) + i_13;
>
> Variable _20 appears in the form of PEELED chrec like (start_4, _5)_LOOP,
> meaning it has value "start_4" in the 1st iteration, then changes to _5 in
> following iterations.  PEELED chrec is not implemented by GCC right now, it
> can be simplified into either POLYNOMIAL or PERIOD one.  The POLYNOMIAL
> chrec is processed by GCC after being recognized, as in the examle, _20 is
> actually {start_4, 1}_LOOP.
>
> This patch modifies scalar evolution by trying to simplify PEELED chrec.
> Without this patch, generated code for pr41488.c is like:
> foo:
>         @ args = 0, pretend = 0, frame = 0
>         @ frame_needed = 0, uses_anonymous_args = 0
>         @ link register save eliminated.
>         cmp     r1, r2
>         bge     .L7
>         push    {r4}
>         mov     r3, r1
>         ldr     r4, [r0]
>         adds    r2, r2, #1
>         adds    r1, r1, #1
>         movs    r0, #0
> .L3:
>         str     r0, [r4, r3, lsl #2]
>         mov     r3, r1
>         adds    r1, r1, #1
>         cmp     r1, r2
>         bne     .L3
>         ldr     r4, [sp], #4
> .L7:
>         bx      lr
>         .size   foo, .-foo
>
> Which is improved to:
> foo:
>         @ args = 0, pretend = 0, frame = 0
>         @ frame_needed = 0, uses_anonymous_args = 0
>         @ link register save eliminated.
>         cmp     r1, r2
>         bge     .L1
>         ldr     r3, [r0]
>         movs    r0, #0
>         add     r1, r3, r1, lsl #2
>         add     r2, r3, r2, lsl #2
> .L3:
>         str     r0, [r1], #4
>         cmp     r1, r2
>         bne     .L3
> .L1:
>         bx      lr
>         .size   foo, .-foo
>
> One point needs to be clarified that I use tree_to_aff_combination_expand in
> the patch.  Rational is the number of the specific PEELED_CHRECs should be
> moderate, I also check the equality literally firstly and resort to affine
> facility if that fails.  By measuring bootstrap of gcc and spec2kint, I
> collected the number of opportunities caught by this patch like below:
>                             literal comparison/affine comparison
> GCC                          ~1200/~250
> Spec2Kint              93/34
>
> I could back trace the ssa chain instead of using affine functions, but that
> would miss some cases.
>
> Bootstrap and test on x86/x86_64/arm.  Is it OK?

I'm not too much into the SCEV theory (CCed Sebastian for this), but at least
tree-affine allows you to compare without going back to trees by computing
the difference and comparing that to zero.

The patch looks sensible otherwise but let's see if Sebastian has anything
to say.

Thanks,
Richard.

> Thanks,
> bin
>
> 2013-12-06  Bin Cheng  <bin.cheng@arm.com>
>
>         PR tree-optimization/41488
>         * tree-ssa-loop-ivopts.c (add_old_iv_candidates): Don't add cand
>         for PEELED_CHREC kind of IV.
>         * tree-scalar-evolution.c: Include necessary header files.
>         (peeled_chrec_map, simplify_peeled_chrec): New.
>         (analyze_evolution_in_loop): New static variable.
>         Call simplify_peeled_chrec.
>         (scev_initialize): Initialize peeled_chrec_map.
>         (scev_reset, scev_finalize): Reset and release peeled_chrec_map.
>
> gcc/testsuite/ChangeLog
> 2013-12-06  Bin Cheng  <bin.cheng@arm.com>
>
>         * gcc.dg/tree-ssa/scev-7.c: New test.
>         * gcc.dg/pr41488.c: New test.
Jeff Law - Dec. 6, 2013, 7:20 p.m.
On 12/06/13 03:29, bin.cheng wrote:
> Hi,
> Entry pr41488 shows a case in which induction variable can't be recognized
> or coalesced.  As noted in the bug entry, one possible fix is to teach PRE
> not to do some specific transformation.  However, it's in nature a scalar
> evolution issue.  Considering below snippet:
>
>     # i_17 = PHI <i_13(5), 0(3)>
>     # _20 = PHI <_5(5), start_4(D)(3)>
>     ...
>     i_13 = i_17 + 1;
>     _5 = start_4(D) + i_13;
>
> Variable _20 appears in the form of PEELED chrec like (start_4, _5)_LOOP,
> meaning it has value "start_4" in the 1st iteration, then changes to _5 in
> following iterations.  PEELED chrec is not implemented by GCC right now, it
> can be simplified into either POLYNOMIAL or PERIOD one.
Right.  PEELED_CHREC was removed from the LNO branch back in 2004, 
presumably before the LNO branch was merged into the mainline.  No 
reason was given.

But what you're really discovering here is that _20 is just another 
standard looking IV that appears in the form of a peeled chrec, right?


  The POLYNOMIAL
> chrec is processed by GCC after being recognized, as in the examle, _20 is
> actually {start_4, 1}_LOOP.
Right.  Based on reading the archives, it looks like this stuff is/was 
generated by PRE.  I also suspect jump threading can create them.  There 
was talk of throttling PRE to leave things in a form that the IV 
analysis could more easily digest, but I'm not sure if that was ever 
completed.

[ snip ]

>
> One point needs to be clarified that I use tree_to_aff_combination_expand in
> the patch.  Rational is the number of the specific PEELED_CHRECs should be
> moderate, I also check the equality literally firstly and resort to affine
> facility if that fails.  By measuring bootstrap of gcc and spec2kint, I
> collected the number of opportunities caught by this patch like below:
>                              literal comparison/affine comparison
> GCC                          ~1200/~250
> Spec2Kint              93/34
>
> I could back trace the ssa chain instead of using affine functions, but that
> would miss some cases.
I assume tree_to_aff_combination_expand is relatively expensive, thus 
the two approaches, one which avoids tree_to_aff_combination_expand.


In add_old_iv_candidates, is it the case that the only time 
SSA_NAME_DEF_STMT (def) is another PHI node is when it's one of these 
affine ivs appearing in the form of a PEELED_CHREC?  And in that case, 
we do not want to record the possibility of leaving the original IV 
untouched?  -- Just trying to make sure I understand the code before 
giving a final approval.

jeff
Bin.Cheng - Dec. 7, 2013, 2:44 a.m.
On Sat, Dec 7, 2013 at 3:20 AM, Jeff Law <law@redhat.com> wrote:
> On 12/06/13 03:29, bin.cheng wrote:
>>
>> Hi,
>> Entry pr41488 shows a case in which induction variable can't be recognized
>> or coalesced.  As noted in the bug entry, one possible fix is to teach PRE
>> not to do some specific transformation.  However, it's in nature a scalar
>> evolution issue.  Considering below snippet:
>>
>>     # i_17 = PHI <i_13(5), 0(3)>
>>     # _20 = PHI <_5(5), start_4(D)(3)>
>>     ...
>>     i_13 = i_17 + 1;
>>     _5 = start_4(D) + i_13;
>>
>> Variable _20 appears in the form of PEELED chrec like (start_4, _5)_LOOP,
>> meaning it has value "start_4" in the 1st iteration, then changes to _5 in
>> following iterations.  PEELED chrec is not implemented by GCC right now,
>> it
>> can be simplified into either POLYNOMIAL or PERIOD one.
>
> Right.  PEELED_CHREC was removed from the LNO branch back in 2004,
> presumably before the LNO branch was merged into the mainline.  No reason
> was given.

Maybe we don't have any user for such CHRECs in GCC now, but is's just guessing.

>
> But what you're really discovering here is that _20 is just another standard
> looking IV that appears in the form of a peeled chrec, right?

Exactly, IVOPT handles only polynomial ones, and _20 is one such chrec
only appearing in peeled form.

>
>
>
>  The POLYNOMIAL
>>
>> chrec is processed by GCC after being recognized, as in the examle, _20 is
>> actually {start_4, 1}_LOOP.
>
> Right.  Based on reading the archives, it looks like this stuff is/was
> generated by PRE.  I also suspect jump threading can create them.  There was
> talk of throttling PRE to leave things in a form that the IV analysis could
> more easily digest, but I'm not sure if that was ever completed.

Also could just because it is coded in that way, just as the case I
added in patch.  I found real examples from ggc-page.c in GCC.
But it's always good to cleanup input of an optimization, I presume
that's why Richard tried to move IVOPT later before.

>
> [ snip ]
>
>
>>
>> One point needs to be clarified that I use tree_to_aff_combination_expand
>> in
>> the patch.  Rational is the number of the specific PEELED_CHRECs should be
>> moderate, I also check the equality literally firstly and resort to affine
>> facility if that fails.  By measuring bootstrap of gcc and spec2kint, I
>> collected the number of opportunities caught by this patch like below:
>>                              literal comparison/affine comparison
>> GCC                          ~1200/~250
>> Spec2Kint              93/34
>>
>> I could back trace the ssa chain instead of using affine functions, but
>> that
>> would miss some cases.
>
> I assume tree_to_aff_combination_expand is relatively expensive, thus the
> two approaches, one which avoids tree_to_aff_combination_expand.
Considering the dump of case in the patch:

  <bb 2>:
  _16 = start_4(D) + 1000;
  if (end_6(D) < _16)
    goto <bb 3>;
  else
    goto <bb 6>;

  <bb 3>:
  pretmp_22 = sp_7(D)->data;

  <bb 4>:
  # i_17 = PHI <i_13(5), 1000(3)>
  # _20 = PHI <_5(5), _16(3)>
  _9 = (unsigned int) _20;
  _10 = _9 * 4;
  _11 = pretmp_22 + _10;
  *_11 = 0;
  i_13 = i_17 + -1;
  _5 = start_4(D) + i_13;
  if (_5 > end_6(D))
    goto <bb 5>;
  else
    goto <bb 6>;

  <bb 5>:
  goto <bb 4>;

I have to prove (_16 + -1) equals to (start_4 + 999) for _20, so
either using affine function to back trace the definition of _16, or I
have to back trace ssa_chain manually.  Here I use affine function
because the number of such cases should be moderate and there are more
complicated case in which simple back trace will lose.

Another question, is it acceptable to add an parameter for
tree_to_aff_combination_expand to limit the number of recursive call
for it?  Thus we don't need to expand to leaf node every time.

>
>
> In add_old_iv_candidates, is it the case that the only time
> SSA_NAME_DEF_STMT (def) is another PHI node is when it's one of these affine

I suppose.  Actually IVOPT make an assert on IP_ORIGINAL candidates in
function rewrite_use_nonlinear_expr, like:

  /* An important special case -- if we are asked to express value of
     the original iv by itself, just exit; there is no need to
     introduce a new computation (that might also need casting the
     variable to unsigned and back).  */
  if (cand->pos == IP_ORIGINAL
      && cand->incremented_at == use->stmt)
    {
      enum tree_code stmt_code;

      gcc_assert (is_gimple_assign (use->stmt));
      ......

The only case I can think about involving other kind of phi node is
for merging phi in conditional code like:

LOOP:
    x_1 = phi <x_2, start>
    ....
    if (cond)
       x_3 = x_1 + 1;
    else
       x_4 = x_1 + 1;
    x_2 = phi <x_3, x_4>

Though SCEV knows x_3/x_4 are ssa_names for an iv in different
branches, IVOPT don't handle them this way (not treated as an original
iv).  This is one defect of current implementation of IVOPT, I think.

> ivs appearing in the form of a PEELED_CHREC?  And in that case, we do not
> want to record the possibility of leaving the original IV untouched?  --

IVOPT adds original cand and tries to keep it the original IV is
because it does have an updating statement.  For IVs picked up by this
patch, it doesn't have stepping statement, so no need/way to leaving
it untouched.  It will be represented by candidates for IVs from which
it is peeled.

> Just trying to make sure I understand the code before giving a final
> approval.

I hope this can explain the patch a little bit, and thanks for your reviewing.

Thanks,
bin

>
> jeff
Jeff Law - Dec. 9, 2013, 4 a.m.
On 12/06/13 19:44, Bin.Cheng wrote:
>> Right.  Based on reading the archives, it looks like this stuff is/was
>> generated by PRE.  I also suspect jump threading can create them.  There was
>> talk of throttling PRE to leave things in a form that the IV analysis could
>> more easily digest, but I'm not sure if that was ever completed.
>
> Also could just because it is coded in that way, just as the case I
> added in patch.  I found real examples from ggc-page.c in GCC.
> But it's always good to cleanup input of an optimization, I presume
> that's why Richard tried to move IVOPT later before.
Certainly.  That possibility was mentioned as well in either 41488 or 
elsewhere in my research.

>>
>> I assume tree_to_aff_combination_expand is relatively expensive, thus the
>> two approaches, one which avoids tree_to_aff_combination_expand.
> Considering the dump of case in the patch:
[ snip ]
So it's also a case using the affine stuff gets you get a simpler 
interface to express the value in terms you may be able match.

>
> Another question, is it acceptable to add an parameter for
> tree_to_aff_combination_expand to limit the number of recursive call
> for it?  Thus we don't need to expand to leaf node every time.
If there's a compelling reason to limit the recursion, then I'd think 
such a parameter would be reasonable.


>
>>
>>
>> In add_old_iv_candidates, is it the case that the only time
>> SSA_NAME_DEF_STMT (def) is another PHI node is when it's one of these affine
>
> I suppose.  Actually IVOPT make an assert on IP_ORIGINAL candidates in
> function rewrite_use_nonlinear_expr, like:
Well, the reason I ask is after your patch we'd be ignoring anything 
where SSA_NAME_DEF_STMT (def) is a PHI node and not a PEELED_CHREC.  I 
want to make sure there aren't other expected cases where 
SSA_NAME_DEF_STMT (def) is a PHI that we'd want to call add_candidate_1 for.

It sounds like there aren't other cases you'd expect to see here where 
SSA_NAME_DEF_STMT (defFFF) is a PHI.

> IVOPT adds original cand and tries to keep it the original IV is
> because it does have an updating statement.  For IVs picked up by this
> patch, it doesn't have stepping statement, so no need/way to leaving
> it untouched.  It will be represented by candidates for IVs from which
> it is peeled.
Understood.  Thanks for clarifying.  The patch is fine for the trunk.

jeff
Bin.Cheng - Dec. 9, 2013, 5:15 a.m.
On Mon, Dec 9, 2013 at 12:00 PM, Jeff Law <law@redhat.com> wrote:
> On 12/06/13 19:44, Bin.Cheng wrote:
>>>
>>> Right.  Based on reading the archives, it looks like this stuff is/was
>>> generated by PRE.  I also suspect jump threading can create them.  There
>>> was
>>> talk of throttling PRE to leave things in a form that the IV analysis
>>> could
>>> more easily digest, but I'm not sure if that was ever completed.
>>
>>
>> Also could just because it is coded in that way, just as the case I
>> added in patch.  I found real examples from ggc-page.c in GCC.
>> But it's always good to cleanup input of an optimization, I presume
>> that's why Richard tried to move IVOPT later before.
>
> Certainly.  That possibility was mentioned as well in either 41488 or
> elsewhere in my research.
>
>
>>>
>>> I assume tree_to_aff_combination_expand is relatively expensive, thus the
>>> two approaches, one which avoids tree_to_aff_combination_expand.
>>
>> Considering the dump of case in the patch:
>
> [ snip ]
> So it's also a case using the affine stuff gets you get a simpler interface
> to express the value in terms you may be able match.
Yeah.

>
>
>>
>> Another question, is it acceptable to add an parameter for
>> tree_to_aff_combination_expand to limit the number of recursive call
>> for it?  Thus we don't need to expand to leaf node every time.
>
> If there's a compelling reason to limit the recursion, then I'd think such a
> parameter would be reasonable.
Not for now.  Just come up this idea given that some recent patches
also use the interface to get back trace information and it's
expensive.  Maybe Richard have some comment here too.

>
>
>
>>
>>>
>>>
>>> In add_old_iv_candidates, is it the case that the only time
>>> SSA_NAME_DEF_STMT (def) is another PHI node is when it's one of these
>>> affine
>>
>>
>> I suppose.  Actually IVOPT make an assert on IP_ORIGINAL candidates in
>> function rewrite_use_nonlinear_expr, like:
>
> Well, the reason I ask is after your patch we'd be ignoring anything where
> SSA_NAME_DEF_STMT (def) is a PHI node and not a PEELED_CHREC.  I want to
> make sure there aren't other expected cases where SSA_NAME_DEF_STMT (def) is
> a PHI that we'd want to call add_candidate_1 for.
>
> It sounds like there aren't other cases you'd expect to see here where
> SSA_NAME_DEF_STMT (defFFF) is a PHI.
Yes.

>
>
>> IVOPT adds original cand and tries to keep it the original IV is
>> because it does have an updating statement.  For IVs picked up by this
>> patch, it doesn't have stepping statement, so no need/way to leaving
>> it untouched.  It will be represented by candidates for IVs from which
>> it is peeled.
>
> Understood.  Thanks for clarifying.  The patch is fine for the trunk.
Thanks very much for reviewing.  Since Sebastian hasn't added any
comments, I will change the patch as suggested by Richard before, then
retest it, and check in the new version if it's ok.

Thanks,
bin

>
> jeff
Richard Guenther - Dec. 9, 2013, 9:53 a.m.
On Mon, Dec 9, 2013 at 6:15 AM, Bin.Cheng <amker.cheng@gmail.com> wrote:
> On Mon, Dec 9, 2013 at 12:00 PM, Jeff Law <law@redhat.com> wrote:
>> On 12/06/13 19:44, Bin.Cheng wrote:
>>>>
>>>> Right.  Based on reading the archives, it looks like this stuff is/was
>>>> generated by PRE.  I also suspect jump threading can create them.  There
>>>> was
>>>> talk of throttling PRE to leave things in a form that the IV analysis
>>>> could
>>>> more easily digest, but I'm not sure if that was ever completed.
>>>
>>>
>>> Also could just because it is coded in that way, just as the case I
>>> added in patch.  I found real examples from ggc-page.c in GCC.
>>> But it's always good to cleanup input of an optimization, I presume
>>> that's why Richard tried to move IVOPT later before.
>>
>> Certainly.  That possibility was mentioned as well in either 41488 or
>> elsewhere in my research.
>>
>>
>>>>
>>>> I assume tree_to_aff_combination_expand is relatively expensive, thus the
>>>> two approaches, one which avoids tree_to_aff_combination_expand.
>>>
>>> Considering the dump of case in the patch:
>>
>> [ snip ]
>> So it's also a case using the affine stuff gets you get a simpler interface
>> to express the value in terms you may be able match.
> Yeah.
>
>>
>>
>>>
>>> Another question, is it acceptable to add an parameter for
>>> tree_to_aff_combination_expand to limit the number of recursive call
>>> for it?  Thus we don't need to expand to leaf node every time.
>>
>> If there's a compelling reason to limit the recursion, then I'd think such a
>> parameter would be reasonable.
> Not for now.  Just come up this idea given that some recent patches
> also use the interface to get back trace information and it's
> expensive.  Maybe Richard have some comment here too.

IMHO the natural limit should be MAX_AFF_ELTS - it shouldn't ever
try to fill up more slots than that, as if the aff_combination_add
at the end will end up populating comb->rest the whole thing was
pretty pointless.

Also memory management of the cache it uses can be improved
by using an obstack alongside the pointer-map to avoid calling
malloc/free per element.  Also the pointer-map should rather use
the SSA name it expands as base which would make formulating
the cache as a lattice possible.

Richard.

>>
>>
>>
>>>
>>>>
>>>>
>>>> In add_old_iv_candidates, is it the case that the only time
>>>> SSA_NAME_DEF_STMT (def) is another PHI node is when it's one of these
>>>> affine
>>>
>>>
>>> I suppose.  Actually IVOPT make an assert on IP_ORIGINAL candidates in
>>> function rewrite_use_nonlinear_expr, like:
>>
>> Well, the reason I ask is after your patch we'd be ignoring anything where
>> SSA_NAME_DEF_STMT (def) is a PHI node and not a PEELED_CHREC.  I want to
>> make sure there aren't other expected cases where SSA_NAME_DEF_STMT (def) is
>> a PHI that we'd want to call add_candidate_1 for.
>>
>> It sounds like there aren't other cases you'd expect to see here where
>> SSA_NAME_DEF_STMT (defFFF) is a PHI.
> Yes.
>
>>
>>
>>> IVOPT adds original cand and tries to keep it the original IV is
>>> because it does have an updating statement.  For IVs picked up by this
>>> patch, it doesn't have stepping statement, so no need/way to leaving
>>> it untouched.  It will be represented by candidates for IVs from which
>>> it is peeled.
>>
>> Understood.  Thanks for clarifying.  The patch is fine for the trunk.
> Thanks very much for reviewing.  Since Sebastian hasn't added any
> comments, I will change the patch as suggested by Richard before, then
> retest it, and check in the new version if it's ok.
>
> Thanks,
> bin
>
>>
>> jeff
>
>
>
> --
> Best Regards.

Patch

Index: gcc/tree-scalar-evolution.c
===================================================================
--- gcc/tree-scalar-evolution.c	(revision 205688)
+++ gcc/tree-scalar-evolution.c	(working copy)
@@ -280,6 +280,8 @@  along with GCC; see the file COPYING3.  If not see
 #include "tree-ssa.h"
 #include "cfgloop.h"
 #include "tree-chrec.h"
+#include "pointer-set.h"
+#include "tree-affine.h"
 #include "tree-scalar-evolution.h"
 #include "dumpfile.h"
 #include "params.h"
@@ -1380,7 +1382,66 @@  follow_ssa_edge (struct loop *loop, gimple def, gi
 }
 
 
+/* Pointer map used when simplifying PEELED_CHREC into POLYNOMIAL_CHREC.  */
+static pointer_map_t *peeled_chrec_map;
 
+/* Simplify PEELED_CHREC represented by (init_cond, arg) in LOOP.
+   Handle below case and return the corresponding POLYNOMIAL_CHREC:
+
+   # i_17 = PHI <i_13(5), 0(3)>
+   # _20 = PHI <_5(5), start_4(D)(3)>
+   ...
+   i_13 = i_17 + 1;
+   _5 = start_4(D) + i_13;
+
+   Though variable _20 appears as a PEELED_CHREC in the form of
+   (start_4, _5)_LOOP, it's a POLYNOMIAL_CHREC like {start_4, 1}_LOOP.
+
+   See PR41488.  */
+
+static tree
+simplify_peeled_chrec (struct loop *loop, tree arg, tree init_cond)
+{
+  aff_tree aff1, aff2;
+  tree ev, left, right, type, step_val;
+
+  ev = instantiate_parameters (loop, analyze_scalar_evolution (loop, arg));
+  if (ev == NULL_TREE || TREE_CODE (ev) != POLYNOMIAL_CHREC)
+    return chrec_dont_know;
+
+  left = CHREC_LEFT (ev);
+  right = CHREC_RIGHT (ev);
+  type = TREE_TYPE (left);
+  step_val = chrec_fold_plus (type, init_cond, right);
+
+  /* Transform (init, {left, right}_LOOP)_LOOP to {init, right}_LOOP
+     if "left" equals to "init + right".  */
+  if (operand_equal_p (left, step_val, 0))
+    {
+      if (dump_file && (dump_flags & TDF_SCEV))
+	fprintf (dump_file, "Simplify PEELED_CHREC into POLYNOMIAL_CHREC.\n");
+
+      return build_polynomial_chrec (loop->num, init_cond, right);
+    }
+
+  /* Try harder to check if they are equal.  */
+  tree_to_aff_combination_expand (left, type, &aff1, &peeled_chrec_map);
+  tree_to_aff_combination_expand (step_val, type, &aff2, &peeled_chrec_map);
+  left = fold_convert (type, aff_combination_to_tree (&aff1));
+  step_val = fold_convert (type, aff_combination_to_tree (&aff2));
+
+  /* Transform (init, {left, right}_LOOP)_LOOP to {init, right}_LOOP
+     if "left" equals to "init + right".  */
+  if (operand_equal_p (left, step_val, 0))
+    {
+      if (dump_file && (dump_flags & TDF_SCEV))
+	fprintf (dump_file, "Simplify PEELED_CHREC into POLYNOMIAL_CHREC.\n");
+
+      return build_polynomial_chrec (loop->num, init_cond, right);
+    }
+  return chrec_dont_know;
+}
+
 /* Given a LOOP_PHI_NODE, this function determines the evolution
    function from LOOP_PHI_NODE to LOOP_PHI_NODE in the loop.  */
 
@@ -1392,6 +1453,7 @@  analyze_evolution_in_loop (gimple loop_phi_node,
   tree evolution_function = chrec_not_analyzed_yet;
   struct loop *loop = loop_containing_stmt (loop_phi_node);
   basic_block bb;
+  static bool simplify_peeled_chrec_p = true;
 
   if (dump_file && (dump_flags & TDF_SCEV))
     {
@@ -1442,7 +1504,19 @@  analyze_evolution_in_loop (gimple loop_phi_node,
 	 all the other iterations it has the value of ARG.
 	 For the moment, PEELED_CHREC nodes are not built.  */
       if (res != t_true)
-	ev_fn = chrec_dont_know;
+	{
+	  ev_fn = chrec_dont_know;
+	  /* Try to recognize POLYNOMIAL_CHREC which appears in
+	     the form of PEELED_CHREC, but guard the process with
+	     a bool variable to keep the analyzer from infinite
+	     recurrence for real PEELED_RECs.  */
+	  if (simplify_peeled_chrec_p && TREE_CODE (arg) == SSA_NAME)
+	    {
+	      simplify_peeled_chrec_p = false;
+	      ev_fn = simplify_peeled_chrec (loop, arg, init_cond);
+	      simplify_peeled_chrec_p = true;
+	    }
+	}
 
       /* When there are multiple back edges of the loop (which in fact never
 	 happens currently, but nevertheless), merge their evolutions.  */
@@ -3086,6 +3160,8 @@  scev_initialize (void)
 
   initialize_scalar_evolutions_analyzer ();
 
+  peeled_chrec_map = pointer_map_create ();
+
   FOR_EACH_LOOP (loop, 0)
     {
       loop->nb_iterations = NULL_TREE;
@@ -3122,6 +3198,12 @@  scev_reset (void)
 
   scev_reset_htab ();
 
+  if (peeled_chrec_map)
+    {
+      pointer_map_destroy (peeled_chrec_map);
+      peeled_chrec_map = NULL;
+    }
+
   if (!current_loops)
     return;
 
@@ -3209,6 +3291,11 @@  scev_finalize (void)
     return;
   htab_delete (scalar_evolution_info);
   scalar_evolution_info = NULL;
+  if (peeled_chrec_map)
+    {
+      pointer_map_destroy (peeled_chrec_map);
+      peeled_chrec_map = NULL;
+    }
 }
 
 /* Returns true if the expression EXPR is considered to be too expensive
Index: gcc/testsuite/gcc.dg/tree-ssa/scev-7.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/scev-7.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/scev-7.c	(revision 0)
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-sccp-scev" } */
+
+struct struct_t
+{
+  int* data;
+};
+
+void foo (struct struct_t* sp, int start, int end)
+{
+  int i;
+
+  for (i = 1000; i+start > end; i--)
+    sp->data[i+start] = 0;
+}
+
+/* { dg-final { scan-tree-dump-times "Simplify PEELED_CHREC into POLYNOMIAL_CHREC" 1 "sccp" } } */
+/* { dg-final { cleanup-tree-dump "sccp" } } */
Index: gcc/testsuite/gcc.dg/pr41488.c
===================================================================
--- gcc/testsuite/gcc.dg/pr41488.c	(revision 0)
+++ gcc/testsuite/gcc.dg/pr41488.c	(revision 0)
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-sccp-scev" } */
+
+struct struct_t
+{
+  int* data;
+};
+
+void foo (struct struct_t* sp, int start, int end)
+{
+  int i;
+
+  for (i = 0; i+start < end; i++)
+    sp->data[i+start] = 0;
+}
+
+/* { dg-final { scan-tree-dump-times "Simplify PEELED_CHREC into POLYNOMIAL_CHREC" 1 "sccp" } } */
+/* { dg-final { cleanup-tree-dump "sccp" } } */
Index: gcc/tree-ssa-loop-ivopts.c
===================================================================
--- gcc/tree-ssa-loop-ivopts.c	(revision 205688)
+++ gcc/tree-ssa-loop-ivopts.c	(working copy)
@@ -2526,11 +2526,19 @@  add_old_iv_candidates (struct ivopts_data *data, s
       /* Additionally record the possibility of leaving the original iv
 	 untouched.  */
       def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (data->current_loop));
-      cand = add_candidate_1 (data,
-			      iv->base, iv->step, true, IP_ORIGINAL, NULL,
-			      SSA_NAME_DEF_STMT (def));
-      cand->var_before = iv->ssa_name;
-      cand->var_after = def;
+      /* Don't add candidate if it's from another PHI node because
+	  it's an affine iv appearing in the form of PEELED_CHREC.  */
+      phi = SSA_NAME_DEF_STMT (def);
+      if (gimple_code (phi) != GIMPLE_PHI)
+	{
+	  cand = add_candidate_1 (data,
+				  iv->base, iv->step, true, IP_ORIGINAL, NULL,
+				  SSA_NAME_DEF_STMT (def));
+	  cand->var_before = iv->ssa_name;
+	  cand->var_after = def;
+	}
+      else
+	gcc_assert (gimple_bb (phi) == data->current_loop->header);
     }
 }