diff mbox series

[v2,3/3] Consider doloop cmp use in ivopts

Message ID 1557803406-123657-1-git-send-email-linkw@linux.ibm.com
State New
Headers show
Series [v2,1/3] Move prepare_decl_rtl to expr.[ch] as extern | expand

Commit Message

Kewen.Lin May 14, 2019, 3:10 a.m. UTC
From: Kewen Lin <linkw@linux.ibm.com>

Previous version link for background:
https://gcc.gnu.org/ml/gcc-patches/2019-04/msg00912.html

Firstly, it's to call predict_doloop_p hook to check this
loop will be transformed to doloop or not, if yes, find
the expected comp iv use and its dependent original iv,
set the iv candidate as bind_cand of the group.
In following candidate selection process, we will bypass
the group with bind_cand, since we don't want to affect
global decision making for an iv use which will be
eliminated eventually.  At the time of iv candidate set
finalization, we will fill the cost pair for the group
with bind_cand.  If the bind_cand is already in the final
set, then just use it. Otherwise, we can check whether one
of existing final set is better and fill with that if so.

Bootstrapped and regression testing passed on powerpc64le.

Is it ok for trunk?

gcc/ChangeLog

2019-05-14  Kewen Lin  <linkw@gcc.gnu.org>

	PR middle-end/80791
	* tree-ssa-loop-ivopts.c (tree_ssa_iv_optimize_loop): Call
	predict_doloop_p hook and bind_cand_for_doloop_uses.
	(bind_cand_for_doloop_uses): New function.
	(find_optimal_iv_set): Call handle_groups_with_bind_cand.
	(handle_groups_with_bind_cand): New function.
	(record_group): Init bind_cand.
	(set_group_iv_cost): Consider bind_cand group.
	(iv_ca_dump): Add dump for bind_cand.
	(try_add_cand_for): Bypass bind_cand group.
	(iv_ca_extend): Likewise.
	(iv_ca_narrow): Likewise.
	(iv_ca_replace): Likewise.

gcc/testsuite/ChangeLog

2019-05-14  Kewen Lin  <linkw@gcc.gnu.org>

	PR middle-end/80791
	* gcc.dg/tree-ssa/ivopts-lt.c : Adjust.

---
 gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c |   7 +-
 gcc/tree-ssa-loop-ivopts.c                | 155 +++++++++++++++++++++++++++++-
 2 files changed, 156 insertions(+), 6 deletions(-)

Comments

Richard Biener May 14, 2019, 7:26 a.m. UTC | #1
On Tue, May 14, 2019 at 5:10 AM <linkw@linux.ibm.com> wrote:
>
> From: Kewen Lin <linkw@linux.ibm.com>
>
> Previous version link for background:
> https://gcc.gnu.org/ml/gcc-patches/2019-04/msg00912.html
>
> Firstly, it's to call predict_doloop_p hook to check this
> loop will be transformed to doloop or not, if yes, find
> the expected comp iv use and its dependent original iv,
> set the iv candidate as bind_cand of the group.
> In following candidate selection process, we will bypass
> the group with bind_cand, since we don't want to affect
> global decision making for an iv use which will be
> eliminated eventually.  At the time of iv candidate set
> finalization, we will fill the cost pair for the group
> with bind_cand.  If the bind_cand is already in the final
> set, then just use it. Otherwise, we can check whether one
> of existing final set is better and fill with that if so.
>
> Bootstrapped and regression testing passed on powerpc64le.
>
> Is it ok for trunk?

I wonder what prevents IVOPTs to consider a counter IV
(eventually such candidate needs to be added if that's not
already done) to be the most profitable variant w/o any
of the other changes?  I guess that would be costing of
the IV adjust plus branch which we would need to lower
in case there's nothing inside the loop that would make
later doloop transform fail?

Richard.

> gcc/ChangeLog
>
> 2019-05-14  Kewen Lin  <linkw@gcc.gnu.org>
>
>         PR middle-end/80791
>         * tree-ssa-loop-ivopts.c (tree_ssa_iv_optimize_loop): Call
>         predict_doloop_p hook and bind_cand_for_doloop_uses.
>         (bind_cand_for_doloop_uses): New function.
>         (find_optimal_iv_set): Call handle_groups_with_bind_cand.
>         (handle_groups_with_bind_cand): New function.
>         (record_group): Init bind_cand.
>         (set_group_iv_cost): Consider bind_cand group.
>         (iv_ca_dump): Add dump for bind_cand.
>         (try_add_cand_for): Bypass bind_cand group.
>         (iv_ca_extend): Likewise.
>         (iv_ca_narrow): Likewise.
>         (iv_ca_replace): Likewise.
>
> gcc/testsuite/ChangeLog
>
> 2019-05-14  Kewen Lin  <linkw@gcc.gnu.org>
>
>         PR middle-end/80791
>         * gcc.dg/tree-ssa/ivopts-lt.c : Adjust.
>
> ---
>  gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c |   7 +-
>  gcc/tree-ssa-loop-ivopts.c                | 155 +++++++++++++++++++++++++++++-
>  2 files changed, 156 insertions(+), 6 deletions(-)
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
> index 171c85a..f61288c 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
> @@ -17,6 +17,7 @@ f1 (char *p, uintptr_t i, uintptr_t n)
>    while (i < n);
>  }
>
> -/* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts" } } */
> -/* { dg-final { scan-tree-dump-times "PHI <p_" 1 "ivopts"} } */
> -/* { dg-final { scan-tree-dump-times "p_\[0-9\]* <" 1 "ivopts" } } */
> +/* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
> +/* { dg-final { scan-tree-dump-times "PHI" 2 "ivopts" { target { powerpc*-*-* } } } } */
> +/* { dg-final { scan-tree-dump-times "PHI <p_" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
> +/* { dg-final { scan-tree-dump-times "p_\[0-9\]* <" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
> diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
> index 885c8e8..50b5900 100644
> --- a/gcc/tree-ssa-loop-ivopts.c
> +++ b/gcc/tree-ssa-loop-ivopts.c
> @@ -393,6 +393,8 @@ struct iv_group
>    struct cost_pair *cost_map;
>    /* The selected candidate for the group.  */
>    struct iv_cand *selected;
> +  /* The bind candidate for this group, for doloop only so far.  */
> +  struct iv_cand *bind_cand;
>    /* Uses in the group.  */
>    vec<struct iv_use *> vuses;
>  };
> @@ -1592,6 +1594,7 @@ record_group (struct ivopts_data *data, enum use_type type)
>    group->type = type;
>    group->related_cands = BITMAP_ALLOC (NULL);
>    group->vuses.create (1);
> +  group->bind_cand = NULL;
>
>    data->vgroups.safe_push (group);
>    return group;
> @@ -3612,7 +3615,9 @@ set_group_iv_cost (struct ivopts_data *data,
>  {
>    unsigned i, s;
>
> -  if (cost.infinite_cost_p ())
> +  gcc_assert (cand);
> +  /* For the group with bind_cand, make it always have cost pair.  */
> +  if (cost.infinite_cost_p () && group->bind_cand != cand)
>      {
>        BITMAP_FREE (inv_vars);
>        BITMAP_FREE (inv_exprs);
> @@ -6067,7 +6072,8 @@ iv_ca_dump (struct ivopts_data *data, FILE *file, struct iv_ca *ivs)
>                  group->id, cp->cand->id, cp->cost.cost,
>                  cp->cost.complexity);
>        else
> -       fprintf (file, "   group:%d --> ??\n", group->id);
> +       fprintf (file, "   group:%d --> ?? %s\n", group->id,
> +                group->bind_cand ? "(bind)" : "");
>      }
>
>    const char *pref = "";
> @@ -6110,6 +6116,9 @@ iv_ca_extend (struct ivopts_data *data, struct iv_ca *ivs,
>    for (i = 0; i < ivs->upto; i++)
>      {
>        group = data->vgroups[i];
> +      /* Ignore groups binded with some cand.  */
> +      if (group->bind_cand)
> +       continue;
>        old_cp = iv_ca_cand_for_group (ivs, group);
>
>        if (old_cp
> @@ -6165,7 +6174,9 @@ iv_ca_narrow (struct ivopts_data *data, struct iv_ca *ivs,
>    for (i = 0; i < data->vgroups.length (); i++)
>      {
>        group = data->vgroups[i];
> -
> +      /* Ignore groups binded with some cand.  */
> +      if (group->bind_cand)
> +       continue;
>        old_cp = iv_ca_cand_for_group (ivs, group);
>        if (old_cp->cand != cand)
>         continue;
> @@ -6348,6 +6359,9 @@ iv_ca_replace (struct ivopts_data *data, struct iv_ca *ivs,
>        for (j = 0; j < ivs->upto; j++)
>         {
>           struct iv_group *group = data->vgroups[j];
> +         /* Ignore groups binded with some cand.  */
> +         if (group->bind_cand)
> +           continue;
>           old_cp = iv_ca_cand_for_group (ivs, group);
>
>           if (old_cp->cand != cand)
> @@ -6406,6 +6420,15 @@ try_add_cand_for (struct ivopts_data *data, struct iv_ca *ivs,
>    struct iv_ca_delta *best_delta = NULL, *act_delta;
>    struct cost_pair *cp;
>
> +  /* Bypass the candidate selection for the groups with bind_cand, but need
> +     to keep upto up to date, to avoid the count of visited groups becomes
> +     inconsistent in futher handlings.  */
> +  if (group->bind_cand)
> +    {
> +      ivs->upto++;
> +      return true;
> +    }
> +
>    iv_ca_add_group (data, ivs, group);
>    best_cost = iv_ca_cost (ivs);
>    cp = iv_ca_cand_for_group (ivs, group);
> @@ -6635,6 +6658,59 @@ find_optimal_iv_set_1 (struct ivopts_data *data, bool originalp)
>    return set;
>  }
>
> +/* Since we bypass the candidate determination process for the groups with
> +   bind_cand previously, now we want to fill the cost pair for them.  The
> +   simplest way is to fill the bind_cand directly, but for some cases it
> +   exposes more opportunities for downstream optimization if we rewrite the
> +   cmp use with one candidate in the final set.  So the idea is:
> +     1) if the bind_cand is already in final set, use bind_cand.
> +     2) otherwise, check whether final set has better candidate,
> +       fill with it if yes; or still go with bind_cand.  */
> +
> +static void
> +handle_groups_with_bind_cand (struct ivopts_data *data, struct iv_ca *set)
> +{
> +  for (unsigned i = 0; i < data->vgroups.length (); i++)
> +    {
> +      struct iv_group *group = data->vgroups[i];
> +      if (group->bind_cand)
> +       {
> +         /* Since we always bypass it before.  */
> +         gcc_assert (!iv_ca_cand_for_group (set, group));
> +
> +         struct cost_pair *best_cp
> +           = get_group_iv_cost (data, group, group->bind_cand);
> +         gcc_assert (best_cp);
> +
> +         if (!bitmap_bit_p (set->cands, group->bind_cand->id))
> +           {
> +             /* Count in cost of bind_cand.  */
> +             best_cp->cost.cost += best_cp->cand->cost;
> +             unsigned j;
> +             bitmap_iterator bi;
> +             EXECUTE_IF_SET_IN_BITMAP (set->cands, 0, j, bi)
> +             {
> +               struct iv_cand *cand = data->vcands[j];
> +               /* Since the purpose of rewrite here is to expose more
> +                  opportunities to downstream, the cost saving isn't
> +                  critical because this cmp use gets elimintated
> +                  finally.  So far we can't see any gains to replace
> +                  original non memory base iv cand with memory based
> +                  iv cand, but the rewrite could cause doloop unable
> +                  to find it's finite.  */
> +               if (group->bind_cand->iv->base_object == NULL_TREE
> +                   && cand->iv->base_object != NULL_TREE)
> +                 continue;
> +               struct cost_pair *cp = get_group_iv_cost (data, group, cand);
> +               if (cp && cp->cost < best_cp->cost)
> +                 best_cp = cp;
> +             }
> +           }
> +         iv_ca_set_cp (data, set, group, best_cp);
> +       }
> +    }
> +}
> +
>  static struct iv_ca *
>  find_optimal_iv_set (struct ivopts_data *data)
>  {
> @@ -6672,6 +6748,8 @@ find_optimal_iv_set (struct ivopts_data *data)
>    else if (origset)
>      iv_ca_free (&origset);
>
> +  handle_groups_with_bind_cand (data, set);
> +
>    for (i = 0; i < data->vgroups.length (); i++)
>      {
>        struct iv_group *group = data->vgroups[i];
> @@ -7442,12 +7520,69 @@ loop_body_includes_call (basic_block *body, unsigned num_nodes)
>    return false;
>  }
>
> +/* Doloop optimization RTL pass can make the related comparison computation
> +   become dead and get eliminated, then these comparison IV uses should NOT
> +   be considered in optimal IVs determination, set bind_cand for this kind
> +   of group, bypass them in later candidate determination algorithm.  */
> +
> +static void
> +bind_cand_for_doloop_uses (struct ivopts_data *data)
> +{
> +  struct loop *loop = data->current_loop;
> +
> +  for (unsigned i = 0; i < data->vgroups.length (); i++)
> +    {
> +      struct iv_group *group = data->vgroups[i];
> +      if (group->type == USE_COMPARE)
> +       {
> +         gcc_assert (group->vuses.length () == 1);
> +         struct iv_use *use = group->vuses[0];
> +         gimple *stmt = use->stmt;
> +         if (gimple_code (stmt) == GIMPLE_COND)
> +           {
> +             basic_block bb = gimple_bb (stmt);
> +             edge true_edge, false_edge;
> +             extract_true_false_edges_from_block (bb, &true_edge, &false_edge);
> +
> +             /* This comparison is used for loop latch.  Require latch is empty
> +                for now.  */
> +             if ((loop->latch == true_edge->dest
> +                  || loop->latch == false_edge->dest)
> +                 && empty_block_p (loop->latch))
> +               {
> +                 for (unsigned j = 0; j < data->vcands.length (); j++)
> +                   {
> +                     if (bitmap_bit_p (group->related_cands, j))
> +                       {
> +                         struct iv_cand *cand = data->vcands[j];
> +                         tree op = use->iv->ssa_name;
> +                         if (op == cand->var_before || op == cand->var_after)
> +                           {
> +                             group->bind_cand = cand;
> +                             if (dump_file && (dump_flags & TDF_DETAILS))
> +                               {
> +                                 fprintf (dump_file, "Doloop cmp iv use: ");
> +                                 print_gimple_stmt (dump_file, stmt,
> +                                                    TDF_DETAILS);
> +                                 dump_cand (dump_file, cand);
> +                               }
> +                             break;
> +                           }
> +                       }
> +                   }
> +               }
> +           }
> +       }
> +    }
> +}
> +
>  /* Optimizes the LOOP.  Returns true if anything changed.  */
>
>  static bool
>  tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop)
>  {
>    bool changed = false;
> +  bool bind_for_doloop_p = false;
>    struct iv_ca *iv_ca;
>    edge exit = single_dom_exit (loop);
>    basic_block *body;
> @@ -7496,6 +7631,20 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop)
>    /* Finds candidates for the induction variables (item 2).  */
>    find_iv_candidates (data);
>
> +  bind_for_doloop_p = targetm.predict_doloop_p (loop);
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +    {
> +      fprintf (dump_file,
> +              "Predict loop %d can %sperform doloop optimization later.\n",
> +              loop->num, bind_for_doloop_p ? "" : "not ");
> +      flow_loop_dump (loop, dump_file, NULL, 1);
> +    }
> +
> +  /* Some compare iv_use is probably useless once the doloop optimization
> +     performs.  Set bind_cand for the use (group).  */
> +  if (bind_for_doloop_p)
> +    bind_cand_for_doloop_uses (data);
> +
>    /* Calculates the costs (item 3, part 1).  */
>    determine_iv_costs (data);
>    determine_group_iv_costs (data);
> --
> 2.7.4
>
Kewen.Lin May 15, 2019, 5:03 a.m. UTC | #2
on 2019/5/14 下午3:26, Richard Biener wrote:
> On Tue, May 14, 2019 at 5:10 AM <linkw@linux.ibm.com> wrote:
>>
>> From: Kewen Lin <linkw@linux.ibm.com>
>>
>> Previous version link for background:
>> https://gcc.gnu.org/ml/gcc-patches/2019-04/msg00912.html
>>
>> Firstly, it's to call predict_doloop_p hook to check this
>> loop will be transformed to doloop or not, if yes, find
>> the expected comp iv use and its dependent original iv,
>> set the iv candidate as bind_cand of the group.
>> In following candidate selection process, we will bypass
>> the group with bind_cand, since we don't want to affect
>> global decision making for an iv use which will be
>> eliminated eventually.  At the time of iv candidate set
>> finalization, we will fill the cost pair for the group
>> with bind_cand.  If the bind_cand is already in the final
>> set, then just use it. Otherwise, we can check whether one
>> of existing final set is better and fill with that if so.
>>
>> Bootstrapped and regression testing passed on powerpc64le.
>>
>> Is it ok for trunk?
> 
> I wonder what prevents IVOPTs to consider a counter IV
> (eventually such candidate needs to be added if that's not
> already done) to be the most profitable variant w/o any
> of the other changes?  I guess that would be costing of
> the IV adjust plus branch which we would need to lower
> in case there's nothing inside the loop that would make
> later doloop transform fail?
> 
> Richard.
> 

If the question is for "w/o this patch", I think IVOPTs
can find counter IV as the most profitable one for the cmp
use in most time.  But the key issue is that it may stop
us to bring in more iv cands.  We have to add on iv cost
of new cand desirable for some iv use, it's probably more
than the cost of just using counter IV for the interest
use.  

If the question is for "w/i this patch", since we bypass
the doloop cmp use in candidate determination algorithm, 
it's possible that some other iv cands are preferred for 
the remaining uses rather than the counter IV. For example,
for some address type iv use, iv cand with memory based is
mostly better.


Thanks,
Kewen
Richard Biener May 15, 2019, 8:47 a.m. UTC | #3
On Wed, 15 May 2019, Kewen.Lin wrote:

> on 2019/5/14 下午3:26, Richard Biener wrote:
> > On Tue, May 14, 2019 at 5:10 AM <linkw@linux.ibm.com> wrote:
> >>
> >> From: Kewen Lin <linkw@linux.ibm.com>
> >>
> >> Previous version link for background:
> >> https://gcc.gnu.org/ml/gcc-patches/2019-04/msg00912.html
> >>
> >> Firstly, it's to call predict_doloop_p hook to check this
> >> loop will be transformed to doloop or not, if yes, find
> >> the expected comp iv use and its dependent original iv,
> >> set the iv candidate as bind_cand of the group.
> >> In following candidate selection process, we will bypass
> >> the group with bind_cand, since we don't want to affect
> >> global decision making for an iv use which will be
> >> eliminated eventually.  At the time of iv candidate set
> >> finalization, we will fill the cost pair for the group
> >> with bind_cand.  If the bind_cand is already in the final
> >> set, then just use it. Otherwise, we can check whether one
> >> of existing final set is better and fill with that if so.
> >>
> >> Bootstrapped and regression testing passed on powerpc64le.
> >>
> >> Is it ok for trunk?
> > 
> > I wonder what prevents IVOPTs to consider a counter IV
> > (eventually such candidate needs to be added if that's not
> > already done) to be the most profitable variant w/o any
> > of the other changes?  I guess that would be costing of
> > the IV adjust plus branch which we would need to lower
> > in case there's nothing inside the loop that would make
> > later doloop transform fail?
> > 
> > Richard.
> > 
> 
> If the question is for "w/o this patch", I think IVOPTs
> can find counter IV as the most profitable one for the cmp
> use in most time.  But the key issue is that it may stop
> us to bring in more iv cands.  We have to add on iv cost
> of new cand desirable for some iv use, it's probably more
> than the cost of just using counter IV for the interest
> use.  
> 
> If the question is for "w/i this patch", since we bypass
> the doloop cmp use in candidate determination algorithm, 
> it's possible that some other iv cands are preferred for 
> the remaining uses rather than the counter IV. For example,
> for some address type iv use, iv cand with memory based is
> mostly better.

Ah, so the key issue is that the doloop IV is "free"?  That
is, it doesn't consume a general register and whatnot?  That
is allocating this IV doesn't really interfere with other IVs?
But can other uses be based on the doloop IV easily (if the
IV doesn't reside in a general reg?)?

Otherwise I understand that IVOPTs doesn't properly cost
the doloop IV update and conditional branch.  That's clearly
something we should fix (maybe even indepenently on other
changes).  One important thing is that we need to base costs
on a common base to not compare apples and oranges, didn't
dig into your patch in detail enough to see whether it
fits into the general cost model or whether it is a hack
ontop of everything.

Richard.
Segher Boessenkool May 15, 2019, 4:17 p.m. UTC | #4
On Wed, May 15, 2019 at 10:47:31AM +0200, Richard Biener wrote:
> Ah, so the key issue is that the doloop IV is "free"?  That
> is, it doesn't consume a general register and whatnot?  That
> is allocating this IV doesn't really interfere with other IVs?

That is one half of it, yes.

> But can other uses be based on the doloop IV easily (if the
> IV doesn't reside in a general reg?)?

Getting the value of the count reg can be expensive, that is the
other half of it.

> Otherwise I understand that IVOPTs doesn't properly cost
> the doloop IV update and conditional branch.

Currently it doesn't even *know* something is or isn't a doloop.
And yeah that matters a lot for proper costing, on all targets that
have a doloop.

> That's clearly
> something we should fix (maybe even indepenently on other
> changes).  One important thing is that we need to base costs
> on a common base to not compare apples and oranges, didn't
> dig into your patch in detail enough to see whether it
> fits into the general cost model or whether it is a hack
> ontop of everything.

The different cost for a doloop is pretty easy...  Might have to
be a target hook though; on Power the decrement + compare-to-zero
are "free", while on some other targets only the "compare" is.
The cost for using the IV...  For us we could just disallow it
being used at all (except for the looping itself of course), but
not sure what is optimal in general.  Another hook?


Segher
Kewen.Lin May 16, 2019, 3:53 a.m. UTC | #5
on 2019/5/15 下午4:47, Richard Biener wrote:
> On Wed, 15 May 2019, Kewen.Lin wrote:
> 
>> on 2019/5/14 下午3:26, Richard Biener wrote:
>>> On Tue, May 14, 2019 at 5:10 AM <linkw@linux.ibm.com> wrote:
>>>>
>>>> From: Kewen Lin <linkw@linux.ibm.com>
>>>>
>>>> Previous version link for background:
>>>> https://gcc.gnu.org/ml/gcc-patches/2019-04/msg00912.html
>>>>
>>>> Firstly, it's to call predict_doloop_p hook to check this
>>>> loop will be transformed to doloop or not, if yes, find
>>>> the expected comp iv use and its dependent original iv,
>>>> set the iv candidate as bind_cand of the group.
>>>> In following candidate selection process, we will bypass
>>>> the group with bind_cand, since we don't want to affect
>>>> global decision making for an iv use which will be
>>>> eliminated eventually.  At the time of iv candidate set
>>>> finalization, we will fill the cost pair for the group
>>>> with bind_cand.  If the bind_cand is already in the final
>>>> set, then just use it. Otherwise, we can check whether one
>>>> of existing final set is better and fill with that if so.
>>>>
>>>> Bootstrapped and regression testing passed on powerpc64le.
>>>>
>>>> Is it ok for trunk?
>>>
>>> I wonder what prevents IVOPTs to consider a counter IV
>>> (eventually such candidate needs to be added if that's not
>>> already done) to be the most profitable variant w/o any
>>> of the other changes?  I guess that would be costing of
>>> the IV adjust plus branch which we would need to lower
>>> in case there's nothing inside the loop that would make
>>> later doloop transform fail?
>>>
>>> Richard.
>>>
>>
>> If the question is for "w/o this patch", I think IVOPTs
>> can find counter IV as the most profitable one for the cmp
>> use in most time.  But the key issue is that it may stop
>> us to bring in more iv cands.  We have to add on iv cost
>> of new cand desirable for some iv use, it's probably more
>> than the cost of just using counter IV for the interest
>> use.  
>>
>> If the question is for "w/i this patch", since we bypass
>> the doloop cmp use in candidate determination algorithm, 
>> it's possible that some other iv cands are preferred for 
>> the remaining uses rather than the counter IV. For example,
>> for some address type iv use, iv cand with memory based is
>> mostly better.
> 
> Ah, so the key issue is that the doloop IV is "free"?  That
> is, it doesn't consume a general register and whatnot?  That
> is allocating this IV doesn't really interfere with other IVs?
> But can other uses be based on the doloop IV easily (if the
> IV doesn't reside in a general reg?)?

Yes, it takes one special hardware register "counter 
register" on Power.  For other uses based on doloop
IV, if there are no more suitable IVs, we still need
one general reg for update and use.  In the current
patch, although we bypass this doloop cmp use, it's
still allowed to have other uses to choose this
doloop IV candidate.  The costing is as usual. Since
the doloop cmp use is actually free, we don't want
ivopts to consider it and affect optimal IV set.

> 
> Otherwise I understand that IVOPTs doesn't properly cost
> the doloop IV update and conditional branch.  That's clearly
> something we should fix (maybe even indepenently on other
> changes).  One important thing is that we need to base costs
> on a common base to not compare apples and oranges, didn't
> dig into your patch in detail enough to see whether it
> fits into the general cost model or whether it is a hack
> ontop of everything.
> 

In the previous version of patch, it's to make this doloop 
cmp use as zero cost with any iv cands (like it's invisible),
sounds better fit in general cost model? But it requires us
to preserve the doloop IV incase it's not chosen.
The current version is to bind the doloop IV at the first
place, bypass the choosing process and fill it directly if 
no better found later.  For Power, either zero cost or
bypass can coexist with the cost model framework.


Thanks,
Kewen

> Richard.
>
Richard Biener May 16, 2019, 7:25 a.m. UTC | #6
On Wed, 15 May 2019, Segher Boessenkool wrote:

> On Wed, May 15, 2019 at 10:47:31AM +0200, Richard Biener wrote:
> > Ah, so the key issue is that the doloop IV is "free"?  That
> > is, it doesn't consume a general register and whatnot?  That
> > is allocating this IV doesn't really interfere with other IVs?
> 
> That is one half of it, yes.
> 
> > But can other uses be based on the doloop IV easily (if the
> > IV doesn't reside in a general reg?)?
> 
> Getting the value of the count reg can be expensive, that is the
> other half of it.
> 
> > Otherwise I understand that IVOPTs doesn't properly cost
> > the doloop IV update and conditional branch.
> 
> Currently it doesn't even *know* something is or isn't a doloop.
> And yeah that matters a lot for proper costing, on all targets that
> have a doloop.

Ah, OK.  So for general handling IVOPTs would add a new
candidate kind (doloop kind) which is costed differently
at the various uses.  The "guessed" RTL we create for
costing also needs to properly create a proper counter reg
(IIRC it always creates pseudos right now, but here it would
need to be a hard reg so costing can properly pessimize uses
in addresses/memory?).

> > That's clearly
> > something we should fix (maybe even indepenently on other
> > changes).  One important thing is that we need to base costs
> > on a common base to not compare apples and oranges, didn't
> > dig into your patch in detail enough to see whether it
> > fits into the general cost model or whether it is a hack
> > ontop of everything.
> 
> The different cost for a doloop is pretty easy...  Might have to
> be a target hook though; on Power the decrement + compare-to-zero
> are "free", while on some other targets only the "compare" is.
> The cost for using the IV...  For us we could just disallow it
> being used at all (except for the looping itself of course), but
> not sure what is optimal in general.  Another hook?

Indeed the easiest thing is to simply disallow uses of the doloop
IV outside of the increment, compare and branch (maybe have a
target hook that says whether a particular IV may be used for
a particular USE).

We'd still need to cost the spilling thing around calls of course,
but this can maybe be done incrementally.  It's still RTL doloop
that ultimatively decides on the doloop use.

Richard.
Segher Boessenkool May 16, 2019, 5:34 p.m. UTC | #7
On Thu, May 16, 2019 at 09:25:49AM +0200, Richard Biener wrote:
> On Wed, 15 May 2019, Segher Boessenkool wrote:
> > > Otherwise I understand that IVOPTs doesn't properly cost
> > > the doloop IV update and conditional branch.
> > 
> > Currently it doesn't even *know* something is or isn't a doloop.
> > And yeah that matters a lot for proper costing, on all targets that
> > have a doloop.
> 
> Ah, OK.  So for general handling IVOPTs would add a new
> candidate kind (doloop kind) which is costed differently
> at the various uses.

That sounds good.

> The "guessed" RTL we create for
> costing also needs to properly create a proper counter reg
> (IIRC it always creates pseudos right now, but here it would
> need to be a hard reg so costing can properly pessimize uses
> in addresses/memory?).

We always use a pseudo currently; it is not turned into a hard register
until after RA.  Expanding as hard registers directly works really well,
*if* you can put *all* uses of that hard reg into the RTl at expand time
already.  Indirect jumps and switch tables want to use the count
register as well; this complicates things enormously.  Also, sometimes
a loop is mangled enough (in RTL) that it is better to use a GPR as IV.

> > The different cost for a doloop is pretty easy...  Might have to
> > be a target hook though; on Power the decrement + compare-to-zero
> > are "free", while on some other targets only the "compare" is.
> > The cost for using the IV...  For us we could just disallow it
> > being used at all (except for the looping itself of course), but
> > not sure what is optimal in general.  Another hook?
> 
> Indeed the easiest thing is to simply disallow uses of the doloop
> IV outside of the increment, compare and branch (maybe have a
> target hook that says whether a particular IV may be used for
> a particular USE).

But is that generic enough?

> We'd still need to cost the spilling thing around calls of course,
> but this can maybe be done incrementally.  It's still RTL doloop
> that ultimatively decides on the doloop use.

We cannot have doloops with calls, on rs6000.  This differs per target
of course.

We really need to get a good overview of what our various targets need.


Segher
Jeff Law May 16, 2019, 6:41 p.m. UTC | #8
On 5/15/19 2:47 AM, Richard Biener wrote:
> On Wed, 15 May 2019, Kewen.Lin wrote:
> 
>> on 2019/5/14 下午3:26, Richard Biener wrote:
>>> On Tue, May 14, 2019 at 5:10 AM <linkw@linux.ibm.com> wrote:
>>>>
>>>> From: Kewen Lin <linkw@linux.ibm.com>
>>>>
>>>> Previous version link for background:
>>>> https://gcc.gnu.org/ml/gcc-patches/2019-04/msg00912.html
>>>>
>>>> Firstly, it's to call predict_doloop_p hook to check this
>>>> loop will be transformed to doloop or not, if yes, find
>>>> the expected comp iv use and its dependent original iv,
>>>> set the iv candidate as bind_cand of the group.
>>>> In following candidate selection process, we will bypass
>>>> the group with bind_cand, since we don't want to affect
>>>> global decision making for an iv use which will be
>>>> eliminated eventually.  At the time of iv candidate set
>>>> finalization, we will fill the cost pair for the group
>>>> with bind_cand.  If the bind_cand is already in the final
>>>> set, then just use it. Otherwise, we can check whether one
>>>> of existing final set is better and fill with that if so.
>>>>
>>>> Bootstrapped and regression testing passed on powerpc64le.
>>>>
>>>> Is it ok for trunk?
>>>
>>> I wonder what prevents IVOPTs to consider a counter IV
>>> (eventually such candidate needs to be added if that's not
>>> already done) to be the most profitable variant w/o any
>>> of the other changes?  I guess that would be costing of
>>> the IV adjust plus branch which we would need to lower
>>> in case there's nothing inside the loop that would make
>>> later doloop transform fail?
>>>
>>> Richard.
>>>
>>
>> If the question is for "w/o this patch", I think IVOPTs
>> can find counter IV as the most profitable one for the cmp
>> use in most time.  But the key issue is that it may stop
>> us to bring in more iv cands.  We have to add on iv cost
>> of new cand desirable for some iv use, it's probably more
>> than the cost of just using counter IV for the interest
>> use.  
>>
>> If the question is for "w/i this patch", since we bypass
>> the doloop cmp use in candidate determination algorithm, 
>> it's possible that some other iv cands are preferred for 
>> the remaining uses rather than the counter IV. For example,
>> for some address type iv use, iv cand with memory based is
>> mostly better.
> 
> Ah, so the key issue is that the doloop IV is "free"?  That
> is, it doesn't consume a general register and whatnot?  That
> is allocating this IV doesn't really interfere with other IVs?
> But can other uses be based on the doloop IV easily (if the
> IV doesn't reside in a general reg?)?
That's my understanding of how at least some of the low overhead looping
instructions work on some ISAs (ppc included).  There's a special loop
count register and the low overhead looping insns handle the decrement
and branch.

This is similar, but different than something like m68k dbcc where the
counter is a GPR.

For architectures like PPC, we probably don't want to use the loop count
for anything else as it's likely expensive to get data in/out of the the
loop count register.

For architectures where the counter is stored in a GPR, then we have
more flexibility in how we use it.

So at least part of the problem is cost modeling of this.  It's all
pretty low level, so not really a good match for the goals of gimple.
But we may ultimately have no choice here but to be pragmatic like we've
done with stuff like vector widths and allow some target properties to
bleed in.

The only saving grace is the existence of low overhead loops is static
-- the target either has them or it doesn't.  Similarly whether or not
the counter is a GPR or not is a static property of the target.

> Otherwise I understand that IVOPTs doesn't properly cost
> the doloop IV update and conditional branch.  That's clearly
> something we should fix (maybe even indepenently on other
> changes). 
It feels independent to me.

 One important thing is that we need to base costs
> on a common base to not compare apples and oranges, didn't
> dig into your patch in detail enough to see whether it
> fits into the general cost model or whether it is a hack
> ontop of everything.
Agreed.

jeff
Segher Boessenkool May 16, 2019, 9:42 p.m. UTC | #9
Hi Jeff,

On Thu, May 16, 2019 at 12:41:16PM -0600, Jeff Law wrote:
> For architectures like PPC, we probably don't want to use the loop count
> for anything else as it's likely expensive to get data in/out of the the
> loop count register.

That is part of it.  Another part is that it costs extra code, negating
one of the advantages of using these instructions.  And a third reason
we do not want this is that on some implementations you have to load the
count register early enough to get the loop predicted correctly.

> So at least part of the problem is cost modeling of this.  It's all
> pretty low level, so not really a good match for the goals of gimple.
> But we may ultimately have no choice here but to be pragmatic like we've
> done with stuff like vector widths and allow some target properties to
> bleed in.

*All* of ivopts is low level in this sense: *all* of it is about finding
out what IVs to use such that it is lowest cost on the target.

Other than costs it doesn't use many target attributes.  For doloop it
would also ask the target whether some loop can be a doloop at all.  So
everything it does is quite high level still, but it *does* have to know
about some very machine-specific things.

Maybe two hooks for that: one, taking a struct loop, to decide if that
loop should be considered for a doloop at all; and another taking a
gimple statement, and returning whether that statement prevents the
loop it is in from being a doloop.  That way we do not have to pass
a lot of gimple data and work to the backends.  Most can just look at
some of the simple loop properties ("is this an inner loop?"), and
allow all statements or just disallow some particular types.

> > Otherwise I understand that IVOPTs doesn't properly cost
> > the doloop IV update and conditional branch.  That's clearly
> > something we should fix (maybe even indepenently on other
> > changes). 
> It feels independent to me.

It cannot cost things properly if nothing has yet decided whether some
loop could (or should) be a doloop :-)


Segher
diff mbox series

Patch

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
index 171c85a..f61288c 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
@@ -17,6 +17,7 @@  f1 (char *p, uintptr_t i, uintptr_t n)
   while (i < n);
 }
 
-/* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts" } } */
-/* { dg-final { scan-tree-dump-times "PHI <p_" 1 "ivopts"} } */
-/* { dg-final { scan-tree-dump-times "p_\[0-9\]* <" 1 "ivopts" } } */
+/* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "PHI" 2 "ivopts" { target { powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "PHI <p_" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "p_\[0-9\]* <" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index 885c8e8..50b5900 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -393,6 +393,8 @@  struct iv_group
   struct cost_pair *cost_map;
   /* The selected candidate for the group.  */
   struct iv_cand *selected;
+  /* The bind candidate for this group, for doloop only so far.  */
+  struct iv_cand *bind_cand;
   /* Uses in the group.  */
   vec<struct iv_use *> vuses;
 };
@@ -1592,6 +1594,7 @@  record_group (struct ivopts_data *data, enum use_type type)
   group->type = type;
   group->related_cands = BITMAP_ALLOC (NULL);
   group->vuses.create (1);
+  group->bind_cand = NULL;
 
   data->vgroups.safe_push (group);
   return group;
@@ -3612,7 +3615,9 @@  set_group_iv_cost (struct ivopts_data *data,
 {
   unsigned i, s;
 
-  if (cost.infinite_cost_p ())
+  gcc_assert (cand);
+  /* For the group with bind_cand, make it always have cost pair.  */
+  if (cost.infinite_cost_p () && group->bind_cand != cand)
     {
       BITMAP_FREE (inv_vars);
       BITMAP_FREE (inv_exprs);
@@ -6067,7 +6072,8 @@  iv_ca_dump (struct ivopts_data *data, FILE *file, struct iv_ca *ivs)
 		 group->id, cp->cand->id, cp->cost.cost,
 		 cp->cost.complexity);
       else
-	fprintf (file, "   group:%d --> ??\n", group->id);
+	fprintf (file, "   group:%d --> ?? %s\n", group->id,
+		 group->bind_cand ? "(bind)" : "");
     }
 
   const char *pref = "";
@@ -6110,6 +6116,9 @@  iv_ca_extend (struct ivopts_data *data, struct iv_ca *ivs,
   for (i = 0; i < ivs->upto; i++)
     {
       group = data->vgroups[i];
+      /* Ignore groups binded with some cand.  */
+      if (group->bind_cand)
+	continue;
       old_cp = iv_ca_cand_for_group (ivs, group);
 
       if (old_cp
@@ -6165,7 +6174,9 @@  iv_ca_narrow (struct ivopts_data *data, struct iv_ca *ivs,
   for (i = 0; i < data->vgroups.length (); i++)
     {
       group = data->vgroups[i];
-
+      /* Ignore groups binded with some cand.  */
+      if (group->bind_cand)
+	continue;
       old_cp = iv_ca_cand_for_group (ivs, group);
       if (old_cp->cand != cand)
 	continue;
@@ -6348,6 +6359,9 @@  iv_ca_replace (struct ivopts_data *data, struct iv_ca *ivs,
       for (j = 0; j < ivs->upto; j++)
 	{
 	  struct iv_group *group = data->vgroups[j];
+	  /* Ignore groups binded with some cand.  */
+	  if (group->bind_cand)
+	    continue;
 	  old_cp = iv_ca_cand_for_group (ivs, group);
 
 	  if (old_cp->cand != cand)
@@ -6406,6 +6420,15 @@  try_add_cand_for (struct ivopts_data *data, struct iv_ca *ivs,
   struct iv_ca_delta *best_delta = NULL, *act_delta;
   struct cost_pair *cp;
 
+  /* Bypass the candidate selection for the groups with bind_cand, but need
+     to keep upto up to date, to avoid the count of visited groups becomes
+     inconsistent in futher handlings.  */
+  if (group->bind_cand)
+    {
+      ivs->upto++;
+      return true;
+    }
+
   iv_ca_add_group (data, ivs, group);
   best_cost = iv_ca_cost (ivs);
   cp = iv_ca_cand_for_group (ivs, group);
@@ -6635,6 +6658,59 @@  find_optimal_iv_set_1 (struct ivopts_data *data, bool originalp)
   return set;
 }
 
+/* Since we bypass the candidate determination process for the groups with
+   bind_cand previously, now we want to fill the cost pair for them.  The
+   simplest way is to fill the bind_cand directly, but for some cases it
+   exposes more opportunities for downstream optimization if we rewrite the
+   cmp use with one candidate in the final set.  So the idea is:
+     1) if the bind_cand is already in final set, use bind_cand.
+     2) otherwise, check whether final set has better candidate,
+	fill with it if yes; or still go with bind_cand.  */
+
+static void
+handle_groups_with_bind_cand (struct ivopts_data *data, struct iv_ca *set)
+{
+  for (unsigned i = 0; i < data->vgroups.length (); i++)
+    {
+      struct iv_group *group = data->vgroups[i];
+      if (group->bind_cand)
+	{
+	  /* Since we always bypass it before.  */
+	  gcc_assert (!iv_ca_cand_for_group (set, group));
+
+	  struct cost_pair *best_cp
+	    = get_group_iv_cost (data, group, group->bind_cand);
+	  gcc_assert (best_cp);
+
+	  if (!bitmap_bit_p (set->cands, group->bind_cand->id))
+	    {
+	      /* Count in cost of bind_cand.  */
+	      best_cp->cost.cost += best_cp->cand->cost;
+	      unsigned j;
+	      bitmap_iterator bi;
+	      EXECUTE_IF_SET_IN_BITMAP (set->cands, 0, j, bi)
+	      {
+		struct iv_cand *cand = data->vcands[j];
+		/* Since the purpose of rewrite here is to expose more
+		   opportunities to downstream, the cost saving isn't
+		   critical because this cmp use gets elimintated
+		   finally.  So far we can't see any gains to replace
+		   original non memory base iv cand with memory based
+		   iv cand, but the rewrite could cause doloop unable
+		   to find it's finite.  */
+		if (group->bind_cand->iv->base_object == NULL_TREE
+		    && cand->iv->base_object != NULL_TREE)
+		  continue;
+		struct cost_pair *cp = get_group_iv_cost (data, group, cand);
+		if (cp && cp->cost < best_cp->cost)
+		  best_cp = cp;
+	      }
+	    }
+	  iv_ca_set_cp (data, set, group, best_cp);
+	}
+    }
+}
+
 static struct iv_ca *
 find_optimal_iv_set (struct ivopts_data *data)
 {
@@ -6672,6 +6748,8 @@  find_optimal_iv_set (struct ivopts_data *data)
   else if (origset)
     iv_ca_free (&origset);
 
+  handle_groups_with_bind_cand (data, set);
+
   for (i = 0; i < data->vgroups.length (); i++)
     {
       struct iv_group *group = data->vgroups[i];
@@ -7442,12 +7520,69 @@  loop_body_includes_call (basic_block *body, unsigned num_nodes)
   return false;
 }
 
+/* Doloop optimization RTL pass can make the related comparison computation
+   become dead and get eliminated, then these comparison IV uses should NOT
+   be considered in optimal IVs determination, set bind_cand for this kind
+   of group, bypass them in later candidate determination algorithm.  */
+
+static void
+bind_cand_for_doloop_uses (struct ivopts_data *data)
+{
+  struct loop *loop = data->current_loop;
+
+  for (unsigned i = 0; i < data->vgroups.length (); i++)
+    {
+      struct iv_group *group = data->vgroups[i];
+      if (group->type == USE_COMPARE)
+	{
+	  gcc_assert (group->vuses.length () == 1);
+	  struct iv_use *use = group->vuses[0];
+	  gimple *stmt = use->stmt;
+	  if (gimple_code (stmt) == GIMPLE_COND)
+	    {
+	      basic_block bb = gimple_bb (stmt);
+	      edge true_edge, false_edge;
+	      extract_true_false_edges_from_block (bb, &true_edge, &false_edge);
+
+	      /* This comparison is used for loop latch.  Require latch is empty
+		 for now.  */
+	      if ((loop->latch == true_edge->dest
+		   || loop->latch == false_edge->dest)
+		  && empty_block_p (loop->latch))
+		{
+		  for (unsigned j = 0; j < data->vcands.length (); j++)
+		    {
+		      if (bitmap_bit_p (group->related_cands, j))
+			{
+			  struct iv_cand *cand = data->vcands[j];
+			  tree op = use->iv->ssa_name;
+			  if (op == cand->var_before || op == cand->var_after)
+			    {
+			      group->bind_cand = cand;
+			      if (dump_file && (dump_flags & TDF_DETAILS))
+				{
+				  fprintf (dump_file, "Doloop cmp iv use: ");
+				  print_gimple_stmt (dump_file, stmt,
+						     TDF_DETAILS);
+				  dump_cand (dump_file, cand);
+				}
+			      break;
+			    }
+			}
+		    }
+		}
+	    }
+	}
+    }
+}
+
 /* Optimizes the LOOP.  Returns true if anything changed.  */
 
 static bool
 tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop)
 {
   bool changed = false;
+  bool bind_for_doloop_p = false;
   struct iv_ca *iv_ca;
   edge exit = single_dom_exit (loop);
   basic_block *body;
@@ -7496,6 +7631,20 @@  tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop)
   /* Finds candidates for the induction variables (item 2).  */
   find_iv_candidates (data);
 
+  bind_for_doloop_p = targetm.predict_doloop_p (loop);
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file,
+	       "Predict loop %d can %sperform doloop optimization later.\n",
+	       loop->num, bind_for_doloop_p ? "" : "not ");
+      flow_loop_dump (loop, dump_file, NULL, 1);
+    }
+
+  /* Some compare iv_use is probably useless once the doloop optimization
+     performs.  Set bind_cand for the use (group).  */
+  if (bind_for_doloop_p)
+    bind_cand_for_doloop_uses (data);
+
   /* Calculates the costs (item 3, part 1).  */
   determine_iv_costs (data);
   determine_group_iv_costs (data);