diff mbox series

[v2,2/3] Add predict_doloop_p target hook

Message ID 1557803395-117707-1-git-send-email-linkw@linux.ibm.com
State New
Headers show
Series [v2,1/3] Move prepare_decl_rtl to expr.[ch] as extern | expand

Commit Message

Kewen.Lin May 14, 2019, 3:09 a.m. UTC
From: Kewen Lin <linkw@linux.ibm.com>

Previous version link for background:
https://gcc.gnu.org/ml/gcc-patches/2019-04/msg00912.html

This hook is to predict whether one loop in gimple will
be transformed to low-overhead loop later in RTL, and
designed to be called in ivopts.

"Since the low-overhead loop optimize transformation is
based on RTL, some of those checks are hard to be imitated
on gimple, so it's not possible to predict the current
loop will be transformed exactly in middle-end.  But if we
can have most loop predicted precisely, it would be helpful.
It highly depends on target hook fine tuning. It's
acceptable to have some loops which can be transformed to
low-overhead loop but we don't catch.  But we should try
our best to avoid to predict some loop as low-overhead loop
but which isn't."

Bootstrapped and regression testing passed on powerpc64le.

Is it ok for trunk?

gcc/ChangeLog

2019-05-13  Kewen Lin  <linkw@gcc.gnu.org>

	PR middle-end/80791
	* target.def (predict_doloop_p): New hook.
	* targhooks.h (default_predict_doloop_p): New declaration.
	* targhooks.c (default_predict_doloop_p): New function.
	* doc/tm.texi.in (TARGET_PREDICT_DOLOOP_P): New hook.
	* doc/tm.texi: Regenerate.
	* config/rs6000/rs6000.c (invalid_insn_for_doloop_p): New function.
	(costly_iter_for_doloop_p): Likewise.
	(rs6000_predict_doloop_p): Likewise.
	(TARGET_PREDICT_DOLOOP_P): New macro.

---
 gcc/config/rs6000/rs6000.c | 174 ++++++++++++++++++++++++++++++++++++++++++++-
 gcc/doc/tm.texi            |   8 +++
 gcc/doc/tm.texi.in         |   2 +
 gcc/target.def             |   9 +++
 gcc/targhooks.c            |  13 ++++
 gcc/targhooks.h            |   1 +
 6 files changed, 206 insertions(+), 1 deletion(-)

Comments

Richard Biener May 14, 2019, 7:24 a.m. UTC | #1
On Tue, May 14, 2019 at 5:10 AM <linkw@linux.ibm.com> wrote:
>
> From: Kewen Lin <linkw@linux.ibm.com>
>
> Previous version link for background:
> https://gcc.gnu.org/ml/gcc-patches/2019-04/msg00912.html
>
> This hook is to predict whether one loop in gimple will
> be transformed to low-overhead loop later in RTL, and
> designed to be called in ivopts.
>
> "Since the low-overhead loop optimize transformation is
> based on RTL, some of those checks are hard to be imitated
> on gimple, so it's not possible to predict the current
> loop will be transformed exactly in middle-end.  But if we
> can have most loop predicted precisely, it would be helpful.
> It highly depends on target hook fine tuning. It's
> acceptable to have some loops which can be transformed to
> low-overhead loop but we don't catch.  But we should try
> our best to avoid to predict some loop as low-overhead loop
> but which isn't."
>
> Bootstrapped and regression testing passed on powerpc64le.
>
> Is it ok for trunk?

Most of the rs6000 target hook checks look general
(can we compute niter, etc.), IVOPTs would have a hard
time adding a counter IV w/o being able to compute niters.
Likewise IVOPTs should already consider (does it?) costs
of invariant computations (the niter computation).

That leaves the CTR invalidation checks which makes
me wonder if the hook shouldn't be one working on
a stmt, like stmt_precludes_doloop_p?

Richard.

> gcc/ChangeLog
>
> 2019-05-13  Kewen Lin  <linkw@gcc.gnu.org>
>
>         PR middle-end/80791
>         * target.def (predict_doloop_p): New hook.
>         * targhooks.h (default_predict_doloop_p): New declaration.
>         * targhooks.c (default_predict_doloop_p): New function.
>         * doc/tm.texi.in (TARGET_PREDICT_DOLOOP_P): New hook.
>         * doc/tm.texi: Regenerate.
>         * config/rs6000/rs6000.c (invalid_insn_for_doloop_p): New function.
>         (costly_iter_for_doloop_p): Likewise.
>         (rs6000_predict_doloop_p): Likewise.
>         (TARGET_PREDICT_DOLOOP_P): New macro.
>
> ---
>  gcc/config/rs6000/rs6000.c | 174 ++++++++++++++++++++++++++++++++++++++++++++-
>  gcc/doc/tm.texi            |   8 +++
>  gcc/doc/tm.texi.in         |   2 +
>  gcc/target.def             |   9 +++
>  gcc/targhooks.c            |  13 ++++
>  gcc/targhooks.h            |   1 +
>  6 files changed, 206 insertions(+), 1 deletion(-)
>
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index a21f4f7..1c1c8eb 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -83,6 +83,9 @@
>  #include "tree-ssa-propagate.h"
>  #include "tree-vrp.h"
>  #include "tree-ssanames.h"
> +#include "tree-ssa-loop-niter.h"
> +#include "tree-cfg.h"
> +#include "tree-scalar-evolution.h"
>
>  /* This file should be included last.  */
>  #include "target-def.h"
> @@ -1914,6 +1917,9 @@ static const struct attribute_spec rs6000_attribute_table[] =
>  #undef TARGET_CAN_USE_DOLOOP_P
>  #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
>
> +#undef TARGET_PREDICT_DOLOOP_P
> +#define TARGET_PREDICT_DOLOOP_P rs6000_predict_doloop_p
> +
>  #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
>  #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV rs6000_atomic_assign_expand_fenv
>
> @@ -39436,7 +39442,173 @@ rs6000_mangle_decl_assembler_name (tree decl, tree id)
>    return id;
>  }
>
> -
> +/* Check whether there are some instructions preventing doloop transformation
> +   inside loop body, mainly for instructions which are possible to kill CTR.
> +
> +   Return true if some invalid insn exits, otherwise return false.  */
> +
> +static bool
> +invalid_insn_for_doloop_p (struct loop *loop)
> +{
> +  basic_block *body = get_loop_body (loop);
> +  gimple_stmt_iterator gsi;
> +
> +  for (unsigned i = 0; i < loop->num_nodes; i++)
> +    for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
> +      {
> +       gimple *stmt = gsi_stmt (gsi);
> +       if (is_gimple_call (stmt) && !gimple_call_internal_p (stmt)
> +           && !is_inexpensive_builtin (gimple_call_fndecl (stmt)))
> +         {
> +           if (dump_file && (dump_flags & TDF_DETAILS))
> +             fprintf (dump_file,
> +                      "predict doloop failure due to finding call.\n");
> +           return true;
> +         }
> +       if (computed_goto_p (stmt))
> +         {
> +           if (dump_file && (dump_flags & TDF_DETAILS))
> +             fprintf (dump_file, "predict doloop failure due to"
> +                                 "finding computed jump.\n");
> +           return true;
> +         }
> +       /* TODO: Now this hook is expected to be called in ivopts, which is
> +          before switchlower1/switchlower2.  Checking for SWITCH at this point
> +       will eliminate some good candidates.  But since there are only a few
> +       cases having _a_ switch statement in the innermost loop, it can be a low
> +          priority enhancement.  */
> +
> +       if (gimple_code (stmt) == GIMPLE_SWITCH)
> +         {
> +           if (dump_file && (dump_flags & TDF_DETAILS))
> +             fprintf (dump_file,
> +                      "predict doloop failure due to finding switch.\n");
> +           return true;
> +         }
> +      }
> +
> +  return false;
> +}
> +
> +/* Check whether number of iteration computation is too costly for doloop
> +   transformation.  It expands the gimple sequence to equivalent RTL insn
> +   sequence, then evaluate the cost.
> +
> +   Return true if it's costly, otherwise return false.  */
> +
> +static bool
> +costly_iter_for_doloop_p (struct loop *loop, tree niters)
> +{
> +  tree type = TREE_TYPE (niters);
> +  unsigned cost = 0, i;
> +  tree obj;
> +  bool speed = optimize_loop_for_speed_p (loop);
> +  int regno = LAST_VIRTUAL_REGISTER + 1;
> +  vec<tree> tvec;
> +  tvec.create (20);
> +  decl_rtl_data tdata;
> +  tdata.regno = &regno;
> +  tdata.treevec = &tvec;
> +  walk_tree (&niters, prepare_decl_rtl, &tdata, NULL);
> +  start_sequence ();
> +  expand_expr (niters, NULL_RTX, TYPE_MODE (type), EXPAND_NORMAL);
> +  rtx_insn *seq = get_insns ();
> +  end_sequence ();
> +  FOR_EACH_VEC_ELT (tvec, i, obj)
> +    SET_DECL_RTL (obj, NULL_RTX);
> +  tvec.release ();
> +
> +  for (; seq; seq = NEXT_INSN (seq))
> +    {
> +      if (!INSN_P (seq))
> +       continue;
> +      rtx body = PATTERN (seq);
> +      if (GET_CODE (body) == SET)
> +       {
> +         rtx set_val = XEXP (body, 1);
> +         enum rtx_code code = GET_CODE (set_val);
> +         enum rtx_class cls = GET_RTX_CLASS (code);
> +         /* For now, we only consider these two RTX classes, to match what we
> +        get in doloop_optimize, excluding operations like zero/sign extend.  */
> +         if (cls == RTX_BIN_ARITH || cls == RTX_COMM_ARITH)
> +           cost += set_src_cost (set_val, GET_MODE (set_val), speed);
> +       }
> +    }
> +  unsigned max_cost
> +    = COSTS_N_INSNS (PARAM_VALUE (PARAM_MAX_ITERATIONS_COMPUTATION_COST));
> +  if (cost > max_cost)
> +    return true;
> +
> +  return false;
> +}
> +
> +/*
> +   Predict whether the given loop will be transformed in the RTL
> +   doloop_optimize pass.  This is for use by the IVOPTS middle-end pass.
> +   Attempt to duplicate as many doloop_optimize checks as possible.
> +
> +   Some RTL specific checks seems unable to be checked here, if any
> +   new checks or easy checks _are_ missing here.  */
> +
> +static bool
> +rs6000_predict_doloop_p (struct loop *loop)
> +{
> +  gcc_assert (loop);
> +
> +  /* On rs6000, targetm.can_use_doloop_p is actually
> +     can_use_doloop_if_innermost.  Just ensure it's innermost.  */
> +  if (loop->inner != NULL)
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +       fprintf (dump_file, "predict doloop failure due to"
> +                           "no innermost.\n");
> +      return false;
> +    }
> +
> +  /* number_of_latch_executions is not so costly, so we don't use
> +     number_of_iterations_exit for iteration description.  */
> +  tree niters = number_of_latch_executions (loop);
> +  if (niters == NULL_TREE || niters == chrec_dont_know)
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +       fprintf (dump_file, "predict doloop failure due to"
> +                           "unexpected niters.\n");
> +      return false;
> +    }
> +
> +  /* Similar to doloop_optimize, check whether iteration count too small
> +     and not profitable.  */
> +  HOST_WIDE_INT est_niter = get_estimated_loop_iterations_int (loop);
> +  if (est_niter == -1)
> +    est_niter = get_likely_max_loop_iterations_int (loop);
> +  if (est_niter >= 0 && est_niter < 3)
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +       fprintf (dump_file,
> +                "predict doloop failure due to"
> +                "too few iterations (%d).\n",
> +                (unsigned int) est_niter);
> +      return false;
> +    }
> +
> +  /* Similar to doloop_optimize, check whether number of iterations too costly
> +     to compute.  */
> +  if (costly_iter_for_doloop_p (loop, niters))
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +       fprintf (dump_file, "predict doloop failure due to"
> +                           "costly niter computation.\n");
> +      return false;
> +    }
> +
> +  /* Similar to doloop_optimize targetm.invalid_within_doloop, check for
> +     CALL, tablejump, computed_jump.  */
> +  if (invalid_insn_for_doloop_p (loop))
> +    return false;
> +
> +  return true;
> +}
> +
>  struct gcc_target targetm = TARGET_INITIALIZER;
>
>  #include "gt-rs6000.h"
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index 8c8978b..e595b8b 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -11603,6 +11603,14 @@ function version at run-time for a given set of function versions.
>  body must be generated.
>  @end deftypefn
>
> +@deftypefn {Target Hook} bool TARGET_PREDICT_DOLOOP_P (struct loop *@var{loop})
> +Return true if we can predict it is possible to use low-overhead loops
> +for a particular loop.  The parameter @var{loop} is a pointer to the loop
> +which is going to be checked.  This target hook is required only when the
> +target supports low-overhead loops, and will help some earlier middle-end
> +passes to make some decisions.
> +@end deftypefn
> +
>  @deftypefn {Target Hook} bool TARGET_CAN_USE_DOLOOP_P (const widest_int @var{&iterations}, const widest_int @var{&iterations_max}, unsigned int @var{loop_depth}, bool @var{entered_at_top})
>  Return true if it is possible to use low-overhead loops (@code{doloop_end}
>  and @code{doloop_begin}) for a particular loop.  @var{iterations} gives the
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index fe1194e..e047734 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -7942,6 +7942,8 @@ to by @var{ce_info}.
>
>  @hook TARGET_GENERATE_VERSION_DISPATCHER_BODY
>
> +@hook TARGET_PREDICT_DOLOOP_P
> +
>  @hook TARGET_CAN_USE_DOLOOP_P
>
>  @hook TARGET_INVALID_WITHIN_DOLOOP
> diff --git a/gcc/target.def b/gcc/target.def
> index 66cee07..0a80de3 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -4227,6 +4227,15 @@ DEFHOOK
>  rtx, (machine_mode mode, rtx result, rtx val, rtx failval),
>   default_speculation_safe_value)
>
> +DEFHOOK
> +(predict_doloop_p,
> + "Return true if we can predict it is possible to use low-overhead loops\n\
> +for a particular loop.  The parameter @var{loop} is a pointer to the loop\n\
> +which is going to be checked.  This target hook is required only when the\n\
> +target supports low-overhead loops, and will help some earlier middle-end\n\
> +passes to make some decisions.",
> + bool, (struct loop *loop),
> + default_predict_doloop_p)
>
>  DEFHOOK
>  (can_use_doloop_p,
> diff --git a/gcc/targhooks.c b/gcc/targhooks.c
> index 318f7e9..56ed4cc 100644
> --- a/gcc/targhooks.c
> +++ b/gcc/targhooks.c
> @@ -643,6 +643,19 @@ default_has_ifunc_p (void)
>    return HAVE_GNU_INDIRECT_FUNCTION;
>  }
>
> +/* True if we can predict this loop is possible to be transformed to a
> +   low-overhead loop, otherwise returns false.
> +
> +   By default, false is returned, as this hook's applicability should be
> +   verified for each target.  Target maintainers should re-define the hook
> +   if the target can take advantage of it.  */
> +
> +bool
> +default_predict_doloop_p (struct loop *loop ATTRIBUTE_UNUSED)
> +{
> +  return false;
> +}
> +
>  /* NULL if INSN insn is valid within a low-overhead loop, otherwise returns
>     an error message.
>
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> index 5943627..70bfb17 100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -85,6 +85,7 @@ extern bool default_fixed_point_supported_p (void);
>
>  extern bool default_has_ifunc_p (void);
>
> +extern bool default_predict_doloop_p (struct loop *);
>  extern const char * default_invalid_within_doloop (const rtx_insn *);
>
>  extern tree default_builtin_vectorized_function (unsigned int, tree, tree);
> --
> 2.7.4
>
Jeff Law May 14, 2019, 7:13 p.m. UTC | #2
On 5/13/19 9:09 PM, linkw@linux.ibm.com wrote:
> From: Kewen Lin <linkw@linux.ibm.com>
> 
> Previous version link for background:
> https://gcc.gnu.org/ml/gcc-patches/2019-04/msg00912.html
> 
> This hook is to predict whether one loop in gimple will
> be transformed to low-overhead loop later in RTL, and
> designed to be called in ivopts.
> 
> "Since the low-overhead loop optimize transformation is
> based on RTL, some of those checks are hard to be imitated
> on gimple, so it's not possible to predict the current
> loop will be transformed exactly in middle-end.  But if we
> can have most loop predicted precisely, it would be helpful.
> It highly depends on target hook fine tuning. It's
> acceptable to have some loops which can be transformed to
> low-overhead loop but we don't catch.  But we should try
> our best to avoid to predict some loop as low-overhead loop
> but which isn't."
> 
> Bootstrapped and regression testing passed on powerpc64le.
> 
> Is it ok for trunk?
> 
> gcc/ChangeLog
> 
> 2019-05-13  Kewen Lin  <linkw@gcc.gnu.org>
> 
> 	PR middle-end/80791
> 	* target.def (predict_doloop_p): New hook.
> 	* targhooks.h (default_predict_doloop_p): New declaration.
> 	* targhooks.c (default_predict_doloop_p): New function.
> 	* doc/tm.texi.in (TARGET_PREDICT_DOLOOP_P): New hook.
> 	* doc/tm.texi: Regenerate.
> 	* config/rs6000/rs6000.c (invalid_insn_for_doloop_p): New function.
> 	(costly_iter_for_doloop_p): Likewise.
> 	(rs6000_predict_doloop_p): Likewise.
> 	(TARGET_PREDICT_DOLOOP_P): New macro.
Trying to guess what the target is going to do, then changing the
structure of the loops in gimple based on that guess -- is that really a
good idea.  It's fairly counter to many of the design goals around gimple.

jeff
Bill Schmidt May 14, 2019, 7:35 p.m. UTC | #3
On 5/14/19 2:13 PM, Jeff Law wrote:
> On 5/13/19 9:09 PM, linkw@linux.ibm.com wrote:
>> From: Kewen Lin <linkw@linux.ibm.com>
>>
>> Previous version link for background:
>> https://gcc.gnu.org/ml/gcc-patches/2019-04/msg00912.html
>>
>> This hook is to predict whether one loop in gimple will
>> be transformed to low-overhead loop later in RTL, and
>> designed to be called in ivopts.
>>
>> "Since the low-overhead loop optimize transformation is
>> based on RTL, some of those checks are hard to be imitated
>> on gimple, so it's not possible to predict the current
>> loop will be transformed exactly in middle-end.  But if we
>> can have most loop predicted precisely, it would be helpful.
>> It highly depends on target hook fine tuning. It's
>> acceptable to have some loops which can be transformed to
>> low-overhead loop but we don't catch.  But we should try
>> our best to avoid to predict some loop as low-overhead loop
>> but which isn't."
>>
>> Bootstrapped and regression testing passed on powerpc64le.
>>
>> Is it ok for trunk?
>>
>> gcc/ChangeLog
>>
>> 2019-05-13  Kewen Lin  <linkw@gcc.gnu.org>
>>
>> 	PR middle-end/80791
>> 	* target.def (predict_doloop_p): New hook.
>> 	* targhooks.h (default_predict_doloop_p): New declaration.
>> 	* targhooks.c (default_predict_doloop_p): New function.
>> 	* doc/tm.texi.in (TARGET_PREDICT_DOLOOP_P): New hook.
>> 	* doc/tm.texi: Regenerate.
>> 	* config/rs6000/rs6000.c (invalid_insn_for_doloop_p): New function.
>> 	(costly_iter_for_doloop_p): Likewise.
>> 	(rs6000_predict_doloop_p): Likewise.
>> 	(TARGET_PREDICT_DOLOOP_P): New macro.
> Trying to guess what the target is going to do, then changing the
> structure of the loops in gimple based on that guess -- is that really a
> good idea.  It's fairly counter to many of the design goals around gimple.

Understood -- but incorrect selection of ivars has a heavily detrimental effect.
We have run into this problem many times on Power, where the cost model wrongly
takes into effect the cost of the compare that will be absorbed into the bdnz
instruction.  Hot loops can end up 15-20% worse.  I don't see any reasonable
alternative to a target hook with the present IVOPTS cost model.

Thanks,
Bill

> jeff
>
Jeff Law May 14, 2019, 9:20 p.m. UTC | #4
On 5/14/19 1:35 PM, Bill Schmidt wrote:
> On 5/14/19 2:13 PM, Jeff Law wrote:
>> On 5/13/19 9:09 PM, linkw@linux.ibm.com wrote:
>>> From: Kewen Lin <linkw@linux.ibm.com>
>>>
>>> Previous version link for background:
>>> https://gcc.gnu.org/ml/gcc-patches/2019-04/msg00912.html
>>>
>>> This hook is to predict whether one loop in gimple will
>>> be transformed to low-overhead loop later in RTL, and
>>> designed to be called in ivopts.
>>>
>>> "Since the low-overhead loop optimize transformation is
>>> based on RTL, some of those checks are hard to be imitated
>>> on gimple, so it's not possible to predict the current
>>> loop will be transformed exactly in middle-end.  But if we
>>> can have most loop predicted precisely, it would be helpful.
>>> It highly depends on target hook fine tuning. It's
>>> acceptable to have some loops which can be transformed to
>>> low-overhead loop but we don't catch.  But we should try
>>> our best to avoid to predict some loop as low-overhead loop
>>> but which isn't."
>>>
>>> Bootstrapped and regression testing passed on powerpc64le.
>>>
>>> Is it ok for trunk?
>>>
>>> gcc/ChangeLog
>>>
>>> 2019-05-13  Kewen Lin  <linkw@gcc.gnu.org>
>>>
>>> 	PR middle-end/80791
>>> 	* target.def (predict_doloop_p): New hook.
>>> 	* targhooks.h (default_predict_doloop_p): New declaration.
>>> 	* targhooks.c (default_predict_doloop_p): New function.
>>> 	* doc/tm.texi.in (TARGET_PREDICT_DOLOOP_P): New hook.
>>> 	* doc/tm.texi: Regenerate.
>>> 	* config/rs6000/rs6000.c (invalid_insn_for_doloop_p): New function.
>>> 	(costly_iter_for_doloop_p): Likewise.
>>> 	(rs6000_predict_doloop_p): Likewise.
>>> 	(TARGET_PREDICT_DOLOOP_P): New macro.
>> Trying to guess what the target is going to do, then changing the
>> structure of the loops in gimple based on that guess -- is that really a
>> good idea.  It's fairly counter to many of the design goals around gimple.
> 
> Understood -- but incorrect selection of ivars has a heavily detrimental effect.
> We have run into this problem many times on Power, where the cost model wrongly
> takes into effect the cost of the compare that will be absorbed into the bdnz
> instruction.  Hot loops can end up 15-20% worse.  I don't see any reasonable
> alternative to a target hook with the present IVOPTS cost model.
I won't claim to know what the solution is here -- IVOPTS is certainly a
pain point and not just with low overhead looping constructs.

Of course we had similar problems when we did this on RTL even with
access to the low level costing information -- often the right choice
would depends on some factor external to the current loop.

In a perfect world we'd probably keep the current stuff as-is, then
lower a bit to expose addressing modes and some other target
dependencies in a lowered gimple.  But we're certainly not there.




jeff
Segher Boessenkool May 14, 2019, 9:53 p.m. UTC | #5
On Tue, May 14, 2019 at 01:13:34PM -0600, Jeff Law wrote:
> Trying to guess what the target is going to do, then changing the
> structure of the loops in gimple based on that guess -- is that really a
> good idea.  It's fairly counter to many of the design goals around gimple.

That is exactly what ivopts does in the first place?  It tries to find
what ivs result in cheaper code, and it looks at a lot of target-specific
stuff for that (like available addressing modes and the cost to use those).

I think doloops fit perfectly well in that scheme.  This patch is not
radically changing anything.


Segher
Bin.Cheng May 15, 2019, 2:40 a.m. UTC | #6
I wonder if you can factor out generic part into GIMPLE and leave
target hook as specific as possible?

On Tue, May 14, 2019 at 11:10 AM <linkw@linux.ibm.com> wrote:
>
> From: Kewen Lin <linkw@linux.ibm.com>
>
> Previous version link for background:
> https://gcc.gnu.org/ml/gcc-patches/2019-04/msg00912.html
>
> This hook is to predict whether one loop in gimple will
> be transformed to low-overhead loop later in RTL, and
> designed to be called in ivopts.
>
> "Since the low-overhead loop optimize transformation is
> based on RTL, some of those checks are hard to be imitated
> on gimple, so it's not possible to predict the current
> loop will be transformed exactly in middle-end.  But if we
> can have most loop predicted precisely, it would be helpful.
> It highly depends on target hook fine tuning. It's
> acceptable to have some loops which can be transformed to
> low-overhead loop but we don't catch.  But we should try
> our best to avoid to predict some loop as low-overhead loop
> but which isn't."
>
> Bootstrapped and regression testing passed on powerpc64le.
>
> Is it ok for trunk?
>
> gcc/ChangeLog
>
> 2019-05-13  Kewen Lin  <linkw@gcc.gnu.org>
>
>         PR middle-end/80791
>         * target.def (predict_doloop_p): New hook.
>         * targhooks.h (default_predict_doloop_p): New declaration.
>         * targhooks.c (default_predict_doloop_p): New function.
>         * doc/tm.texi.in (TARGET_PREDICT_DOLOOP_P): New hook.
>         * doc/tm.texi: Regenerate.
>         * config/rs6000/rs6000.c (invalid_insn_for_doloop_p): New function.
>         (costly_iter_for_doloop_p): Likewise.
>         (rs6000_predict_doloop_p): Likewise.
>         (TARGET_PREDICT_DOLOOP_P): New macro.
>
> ---
>  gcc/config/rs6000/rs6000.c | 174 ++++++++++++++++++++++++++++++++++++++++++++-
>  gcc/doc/tm.texi            |   8 +++
>  gcc/doc/tm.texi.in         |   2 +
>  gcc/target.def             |   9 +++
>  gcc/targhooks.c            |  13 ++++
>  gcc/targhooks.h            |   1 +
>  6 files changed, 206 insertions(+), 1 deletion(-)
>
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index a21f4f7..1c1c8eb 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -83,6 +83,9 @@
>  #include "tree-ssa-propagate.h"
>  #include "tree-vrp.h"
>  #include "tree-ssanames.h"
> +#include "tree-ssa-loop-niter.h"
> +#include "tree-cfg.h"
> +#include "tree-scalar-evolution.h"
>
>  /* This file should be included last.  */
>  #include "target-def.h"
> @@ -1914,6 +1917,9 @@ static const struct attribute_spec rs6000_attribute_table[] =
>  #undef TARGET_CAN_USE_DOLOOP_P
>  #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
>
> +#undef TARGET_PREDICT_DOLOOP_P
> +#define TARGET_PREDICT_DOLOOP_P rs6000_predict_doloop_p
> +
>  #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
>  #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV rs6000_atomic_assign_expand_fenv
>
> @@ -39436,7 +39442,173 @@ rs6000_mangle_decl_assembler_name (tree decl, tree id)
>    return id;
>  }
>
> -
> +/* Check whether there are some instructions preventing doloop transformation
> +   inside loop body, mainly for instructions which are possible to kill CTR.
> +
> +   Return true if some invalid insn exits, otherwise return false.  */
> +
> +static bool
> +invalid_insn_for_doloop_p (struct loop *loop)
> +{
> +  basic_block *body = get_loop_body (loop);
> +  gimple_stmt_iterator gsi;
> +
> +  for (unsigned i = 0; i < loop->num_nodes; i++)
> +    for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
The loops on bbs/stmts seem general and can be put in GIMPLE.  So a
target hook taking gimple_stmt parameter and returning if the stmt
blocks doloop is enough?

> +      {
> +       gimple *stmt = gsi_stmt (gsi);
> +       if (is_gimple_call (stmt) && !gimple_call_internal_p (stmt)
> +           && !is_inexpensive_builtin (gimple_call_fndecl (stmt)))
> +         {
> +           if (dump_file && (dump_flags & TDF_DETAILS))
> +             fprintf (dump_file,
> +                      "predict doloop failure due to finding call.\n");
> +           return true;
> +         }
> +       if (computed_goto_p (stmt))
> +         {
> +           if (dump_file && (dump_flags & TDF_DETAILS))
> +             fprintf (dump_file, "predict doloop failure due to"
> +                                 "finding computed jump.\n");
> +           return true;
> +         }
> +       /* TODO: Now this hook is expected to be called in ivopts, which is
> +          before switchlower1/switchlower2.  Checking for SWITCH at this point
> +       will eliminate some good candidates.  But since there are only a few
> +       cases having _a_ switch statement in the innermost loop, it can be a low
> +          priority enhancement.  */
> +
> +       if (gimple_code (stmt) == GIMPLE_SWITCH)
> +         {
> +           if (dump_file && (dump_flags & TDF_DETAILS))
> +             fprintf (dump_file,
> +                      "predict doloop failure due to finding switch.\n");
> +           return true;
> +         }
> +      }
> +
> +  return false;
> +}
> +
> +/* Check whether number of iteration computation is too costly for doloop
> +   transformation.  It expands the gimple sequence to equivalent RTL insn
> +   sequence, then evaluate the cost.
> +
> +   Return true if it's costly, otherwise return false.  */
> +
> +static bool
> +costly_iter_for_doloop_p (struct loop *loop, tree niters)
> +{
> +  tree type = TREE_TYPE (niters);
> +  unsigned cost = 0, i;
> +  tree obj;
> +  bool speed = optimize_loop_for_speed_p (loop);
> +  int regno = LAST_VIRTUAL_REGISTER + 1;
> +  vec<tree> tvec;
> +  tvec.create (20);
> +  decl_rtl_data tdata;
> +  tdata.regno = &regno;
> +  tdata.treevec = &tvec;
> +  walk_tree (&niters, prepare_decl_rtl, &tdata, NULL);
> +  start_sequence ();
> +  expand_expr (niters, NULL_RTX, TYPE_MODE (type), EXPAND_NORMAL);
> +  rtx_insn *seq = get_insns ();
> +  end_sequence ();
> +  FOR_EACH_VEC_ELT (tvec, i, obj)
> +    SET_DECL_RTL (obj, NULL_RTX);
> +  tvec.release ();
> +
> +  for (; seq; seq = NEXT_INSN (seq))
> +    {
> +      if (!INSN_P (seq))
> +       continue;
> +      rtx body = PATTERN (seq);
> +      if (GET_CODE (body) == SET)
> +       {
> +         rtx set_val = XEXP (body, 1);
> +         enum rtx_code code = GET_CODE (set_val);
> +         enum rtx_class cls = GET_RTX_CLASS (code);
> +         /* For now, we only consider these two RTX classes, to match what we
> +        get in doloop_optimize, excluding operations like zero/sign extend.  */
> +         if (cls == RTX_BIN_ARITH || cls == RTX_COMM_ARITH)
> +           cost += set_src_cost (set_val, GET_MODE (set_val), speed);
> +       }
> +    }
> +  unsigned max_cost
> +    = COSTS_N_INSNS (PARAM_VALUE (PARAM_MAX_ITERATIONS_COMPUTATION_COST));
> +  if (cost > max_cost)
> +    return true;
> +
> +  return false;
> +}
Though this function works on RTL, it's generic too, especially the
code is already in ivopts (and moved by your previous patch).
> +
> +/*
> +   Predict whether the given loop will be transformed in the RTL
> +   doloop_optimize pass.  This is for use by the IVOPTS middle-end pass.
> +   Attempt to duplicate as many doloop_optimize checks as possible.
> +
> +   Some RTL specific checks seems unable to be checked here, if any
> +   new checks or easy checks _are_ missing here.  */
> +
> +static bool
> +rs6000_predict_doloop_p (struct loop *loop)
> +{
> +  gcc_assert (loop);
> +
> +  /* On rs6000, targetm.can_use_doloop_p is actually
> +     can_use_doloop_if_innermost.  Just ensure it's innermost.  */
> +  if (loop->inner != NULL)
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +       fprintf (dump_file, "predict doloop failure due to"
> +                           "no innermost.\n");
> +      return false;
> +    }
> +
> +  /* number_of_latch_executions is not so costly, so we don't use
> +     number_of_iterations_exit for iteration description.  */
> +  tree niters = number_of_latch_executions (loop);
> +  if (niters == NULL_TREE || niters == chrec_dont_know)
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +       fprintf (dump_file, "predict doloop failure due to"
> +                           "unexpected niters.\n");
> +      return false;
> +    }
The checks on niters are generic too.
> +
> +  /* Similar to doloop_optimize, check whether iteration count too small
> +     and not profitable.  */
> +  HOST_WIDE_INT est_niter = get_estimated_loop_iterations_int (loop);
> +  if (est_niter == -1)
> +    est_niter = get_likely_max_loop_iterations_int (loop);
> +  if (est_niter >= 0 && est_niter < 3)
The only probably target dependent is the magic number 3 here.  After
moving all generic part to ivopts, we can use a macro for the moment,
and improve it as a parameter if there are more doloop targets.
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +       fprintf (dump_file,
> +                "predict doloop failure due to"
> +                "too few iterations (%d).\n",
> +                (unsigned int) est_niter);
> +      return false;
> +    }
> +
> +  /* Similar to doloop_optimize, check whether number of iterations too costly
> +     to compute.  */
> +  if (costly_iter_for_doloop_p (loop, niters))
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +       fprintf (dump_file, "predict doloop failure due to"
> +                           "costly niter computation.\n");
> +      return false;
> +    }
> +
> +  /* Similar to doloop_optimize targetm.invalid_within_doloop, check for
> +     CALL, tablejump, computed_jump.  */
> +  if (invalid_insn_for_doloop_p (loop))
> +    return false;
> +
> +  return true;
> +}
> +
Overall most of above checks can be moved out of backend, leaving only
more specific target hook checking on gimple_stmt?  And even that can
be made generic to some extend.

Thanks,
bin
>  struct gcc_target targetm = TARGET_INITIALIZER;
>
>  #include "gt-rs6000.h"
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index 8c8978b..e595b8b 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -11603,6 +11603,14 @@ function version at run-time for a given set of function versions.
>  body must be generated.
>  @end deftypefn
>
> +@deftypefn {Target Hook} bool TARGET_PREDICT_DOLOOP_P (struct loop *@var{loop})
> +Return true if we can predict it is possible to use low-overhead loops
> +for a particular loop.  The parameter @var{loop} is a pointer to the loop
> +which is going to be checked.  This target hook is required only when the
> +target supports low-overhead loops, and will help some earlier middle-end
> +passes to make some decisions.
> +@end deftypefn
> +
>  @deftypefn {Target Hook} bool TARGET_CAN_USE_DOLOOP_P (const widest_int @var{&iterations}, const widest_int @var{&iterations_max}, unsigned int @var{loop_depth}, bool @var{entered_at_top})
>  Return true if it is possible to use low-overhead loops (@code{doloop_end}
>  and @code{doloop_begin}) for a particular loop.  @var{iterations} gives the
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index fe1194e..e047734 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -7942,6 +7942,8 @@ to by @var{ce_info}.
>
>  @hook TARGET_GENERATE_VERSION_DISPATCHER_BODY
>
> +@hook TARGET_PREDICT_DOLOOP_P
> +
>  @hook TARGET_CAN_USE_DOLOOP_P
>
>  @hook TARGET_INVALID_WITHIN_DOLOOP
> diff --git a/gcc/target.def b/gcc/target.def
> index 66cee07..0a80de3 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -4227,6 +4227,15 @@ DEFHOOK
>  rtx, (machine_mode mode, rtx result, rtx val, rtx failval),
>   default_speculation_safe_value)
>
> +DEFHOOK
> +(predict_doloop_p,
> + "Return true if we can predict it is possible to use low-overhead loops\n\
> +for a particular loop.  The parameter @var{loop} is a pointer to the loop\n\
> +which is going to be checked.  This target hook is required only when the\n\
> +target supports low-overhead loops, and will help some earlier middle-end\n\
> +passes to make some decisions.",
> + bool, (struct loop *loop),
> + default_predict_doloop_p)
>
>  DEFHOOK
>  (can_use_doloop_p,
> diff --git a/gcc/targhooks.c b/gcc/targhooks.c
> index 318f7e9..56ed4cc 100644
> --- a/gcc/targhooks.c
> +++ b/gcc/targhooks.c
> @@ -643,6 +643,19 @@ default_has_ifunc_p (void)
>    return HAVE_GNU_INDIRECT_FUNCTION;
>  }
>
> +/* True if we can predict this loop is possible to be transformed to a
> +   low-overhead loop, otherwise returns false.
> +
> +   By default, false is returned, as this hook's applicability should be
> +   verified for each target.  Target maintainers should re-define the hook
> +   if the target can take advantage of it.  */
> +
> +bool
> +default_predict_doloop_p (struct loop *loop ATTRIBUTE_UNUSED)
> +{
> +  return false;
> +}
> +
>  /* NULL if INSN insn is valid within a low-overhead loop, otherwise returns
>     an error message.
>
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> index 5943627..70bfb17 100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -85,6 +85,7 @@ extern bool default_fixed_point_supported_p (void);
>
>  extern bool default_has_ifunc_p (void);
>
> +extern bool default_predict_doloop_p (struct loop *);
>  extern const char * default_invalid_within_doloop (const rtx_insn *);
>
>  extern tree default_builtin_vectorized_function (unsigned int, tree, tree);
> --
> 2.7.4
>
Kewen.Lin May 15, 2019, 2:46 a.m. UTC | #7
on 2019/5/14 下午3:24, Richard Biener wrote:> 
> Most of the rs6000 target hook checks look general
> (can we compute niter, etc.), IVOPTs would have a hard
> time adding a counter IV w/o being able to compute niters.

Do you mean to reuse them?

Yes, IVOPTs has already checked niter.  At the initial 
version of this patch, I tried to pass more parameters
for this hook, but IMHO it looks not elegant.  In future,
if this hook is called in other places, it may require
others to construct the required parameters.  

number_of_latch_executions uses the cache way, it should
not cost much to get the niter.  For the estimated niter
counts, I don't think IVOPTs also checks it.

> Likewise IVOPTs should already consider (does it?) costs
> of invariant computations (the niter computation).
> 

I did check the computation_cost in IVOPTs, but I found
it can not be reused directly for doloop cost check.  So
I had to refactor the prepare_decl_rtl part.  It's 
possible to hack the IVOPTs code to get what we want, 
but it seems too hacky.


> That leaves the CTR invalidation checks which makes
> me wonder if the hook shouldn't be one working on
> a stmt, like stmt_precludes_doloop_p?
> 

This sounds like a better name instead of 
invalid_insn_for_doloop_p.  :)


Thanks,
Kewen

> Richard.
>
Kewen.Lin May 15, 2019, 3:34 a.m. UTC | #8
on 2019/5/15 上午10:40, Bin.Cheng wrote:
> I wonder if you can factor out generic part into GIMPLE and leave
> target hook as specific as possible?
> 

Good suggestion! Yes, most of the checks are common as you
pointed out.  I hope the other targets won't have more
customization needs excepting for that GIMPLE stmt hook
check. 

I am thinking IVOPTs is a best place to factor to? Or 
somewhere to put common GIMPLE query interface? 

>> +
>> +  /* Similar to doloop_optimize, check whether iteration count too small
>> +     and not profitable.  */
>> +  HOST_WIDE_INT est_niter = get_estimated_loop_iterations_int (loop);
>> +  if (est_niter == -1)
>> +    est_niter = get_likely_max_loop_iterations_int (loop);
>> +  if (est_niter >= 0 && est_niter < 3)
> The only probably target dependent is the magic number 3 here.  After
> moving all generic part to ivopts, we can use a macro for the moment,
> and improve it as a parameter if there are more doloop targets.

The magic number 3 is from function doloop_optimize, not a
target dependent value.  But thanks for your tips with 
macro mechanism!


Thanks,
Kewen

> Overall most of above checks can be moved out of backend, leaving only
> more specific target hook checking on gimple_stmt?  And even that can
> be made generic to some extend.
> 
> Thanks,
> bin
Kewen.Lin May 15, 2019, 5:20 a.m. UTC | #9
on 2019/5/15 上午11:34, Kewen.Lin wrote:
> 
> on 2019/5/15 上午10:40, Bin.Cheng wrote:
>> I wonder if you can factor out generic part into GIMPLE and leave
>> target hook as specific as possible?
>>
> 
> Good suggestion! Yes, most of the checks are common as you
> pointed out.  I hope the other targets won't have more
> customization needs excepting for that GIMPLE stmt hook
> check. 
> 
> I am thinking IVOPTs is a best place to factor to? Or 
> somewhere to put common GIMPLE query interface? 
> 

Or move it into targhooks.cpp as a possible base process 
of [target]_predict_doloop_p?   The target owner can
decide whether to use generic_predict_doloop_p in its 
own target hook.
It seems more flexible and allow them to have a new 
implementation for their own targets.  Like:

rs6000_predict_doloop_p:
	....
	if (generic_predict_doloop_p(loop))
	...

special_target_predict_doloop_p:
	....
	....


Thanks,
Kewen

>>> +
>>> +  /* Similar to doloop_optimize, check whether iteration count too small
>>> +     and not profitable.  */
>>> +  HOST_WIDE_INT est_niter = get_estimated_loop_iterations_int (loop);
>>> +  if (est_niter == -1)
>>> +    est_niter = get_likely_max_loop_iterations_int (loop);
>>> +  if (est_niter >= 0 && est_niter < 3)
>> The only probably target dependent is the magic number 3 here.  After
>> moving all generic part to ivopts, we can use a macro for the moment,
>> and improve it as a parameter if there are more doloop targets.
> 
> The magic number 3 is from function doloop_optimize, not a
> target dependent value.  But thanks for your tips with 
> macro mechanism!
> 
> 
> Thanks,
> Kewen
> 
>> Overall most of above checks can be moved out of backend, leaving only
>> more specific target hook checking on gimple_stmt?  And even that can
>> be made generic to some extend.
>>
>> Thanks,
>> bin
>
Bin.Cheng May 15, 2019, 6:18 a.m. UTC | #10
On Wed, May 15, 2019 at 1:20 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
>
> on 2019/5/15 上午11:34, Kewen.Lin wrote:
> >
> > on 2019/5/15 上午10:40, Bin.Cheng wrote:
> >> I wonder if you can factor out generic part into GIMPLE and leave
> >> target hook as specific as possible?
> >>
> >
> > Good suggestion! Yes, most of the checks are common as you
> > pointed out.  I hope the other targets won't have more
> > customization needs excepting for that GIMPLE stmt hook
> > check.
> >
> > I am thinking IVOPTs is a best place to factor to? Or
> > somewhere to put common GIMPLE query interface?
> >
>
> Or move it into targhooks.cpp as a possible base process
> of [target]_predict_doloop_p?   The target owner can
> decide whether to use generic_predict_doloop_p in its
> own target hook.
> It seems more flexible and allow them to have a new
> implementation for their own targets.  Like:
>
> rs6000_predict_doloop_p:
>         ....
>         if (generic_predict_doloop_p(loop))
>         ...
>
> special_target_predict_doloop_p:
Hmm, I thought the target hook would not need to take loop parameter
after moving generic part into ivopts?  Also the moved part would be a
local function (calling the new hook on stmt) in ivopts?

Thanks,
bin
>         ....
>         ....
>
>
> Thanks,
> Kewen
>
> >>> +
> >>> +  /* Similar to doloop_optimize, check whether iteration count too small
> >>> +     and not profitable.  */
> >>> +  HOST_WIDE_INT est_niter = get_estimated_loop_iterations_int (loop);
> >>> +  if (est_niter == -1)
> >>> +    est_niter = get_likely_max_loop_iterations_int (loop);
> >>> +  if (est_niter >= 0 && est_niter < 3)
> >> The only probably target dependent is the magic number 3 here.  After
> >> moving all generic part to ivopts, we can use a macro for the moment,
> >> and improve it as a parameter if there are more doloop targets.
> >
> > The magic number 3 is from function doloop_optimize, not a
> > target dependent value.  But thanks for your tips with
> > macro mechanism!
> >
> >
> > Thanks,
> > Kewen
> >
> >> Overall most of above checks can be moved out of backend, leaving only
> >> more specific target hook checking on gimple_stmt?  And even that can
> >> be made generic to some extend.
> >>
> >> Thanks,
> >> bin
> >
>
Segher Boessenkool May 15, 2019, 7:03 a.m. UTC | #11
On Wed, May 15, 2019 at 10:40:09AM +0800, Bin.Cheng wrote:
> I wonder if you can factor out generic part into GIMPLE and leave
> target hook as specific as possible?

Less GIMPLE handling code in the backend would probably be good, yes.  Less
of the mechanics at least.

> > +static bool
> > +invalid_insn_for_doloop_p (struct loop *loop)
> > +{
> > +  basic_block *body = get_loop_body (loop);
> > +  gimple_stmt_iterator gsi;
> > +
> > +  for (unsigned i = 0; i < loop->num_nodes; i++)
> > +    for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
> The loops on bbs/stmts seem general and can be put in GIMPLE.  So a
> target hook taking gimple_stmt parameter and returning if the stmt
> blocks doloop is enough?

I can't think of any arch where that will not work, for most things.  The
other big "can we make this loop a doloop" factor is the loop (nest) itself:
on some archs there can be more than one doloop at once.  Maybe GCC doesn't
target any such target?

> > +  /* Similar to doloop_optimize, check whether iteration count too small
> > +     and not profitable.  */
> > +  HOST_WIDE_INT est_niter = get_estimated_loop_iterations_int (loop);
> > +  if (est_niter == -1)
> > +    est_niter = get_likely_max_loop_iterations_int (loop);
> > +  if (est_niter >= 0 && est_niter < 3)
> The only probably target dependent is the magic number 3 here.  After
> moving all generic part to ivopts, we can use a macro for the moment,
> and improve it as a parameter if there are more doloop targets.

It is a constant 3 in current RTL code, already; making it a param would
be welcome, indeed :-)

> > +    {
> > +      if (dump_file && (dump_flags & TDF_DETAILS))
> > +       fprintf (dump_file,
> > +                "predict doloop failure due to"
> > +                "too few iterations (%d).\n",
> > +                (unsigned int) est_niter);

%d isn't unsigned, btw.  Just use (int), or use %u?

> Overall most of above checks can be moved out of backend, leaving only
> more specific target hook checking on gimple_stmt?  And even that can
> be made generic to some extend.

But will that help, will it make the code more readable / maintainable?

Most of these things aren't generic...  On Power we cannot allow function
calls inside the loop because the count register is call-clobbered, and
we cannot allow indirect jumps (like in many switch statements) because
those use the count register as well.  There can of course be other reasons
why we do not want calls or switches inside a doloop anyway, but that then
is just a lucky coincidence ;-)


Segher
Richard Biener May 15, 2019, 8:35 a.m. UTC | #12
On Tue, 14 May 2019, Bill Schmidt wrote:

> On 5/14/19 2:13 PM, Jeff Law wrote:
> > On 5/13/19 9:09 PM, linkw@linux.ibm.com wrote:
> >> From: Kewen Lin <linkw@linux.ibm.com>
> >>
> >> Previous version link for background:
> >> https://gcc.gnu.org/ml/gcc-patches/2019-04/msg00912.html
> >>
> >> This hook is to predict whether one loop in gimple will
> >> be transformed to low-overhead loop later in RTL, and
> >> designed to be called in ivopts.
> >>
> >> "Since the low-overhead loop optimize transformation is
> >> based on RTL, some of those checks are hard to be imitated
> >> on gimple, so it's not possible to predict the current
> >> loop will be transformed exactly in middle-end.  But if we
> >> can have most loop predicted precisely, it would be helpful.
> >> It highly depends on target hook fine tuning. It's
> >> acceptable to have some loops which can be transformed to
> >> low-overhead loop but we don't catch.  But we should try
> >> our best to avoid to predict some loop as low-overhead loop
> >> but which isn't."
> >>
> >> Bootstrapped and regression testing passed on powerpc64le.
> >>
> >> Is it ok for trunk?
> >>
> >> gcc/ChangeLog
> >>
> >> 2019-05-13  Kewen Lin  <linkw@gcc.gnu.org>
> >>
> >> 	PR middle-end/80791
> >> 	* target.def (predict_doloop_p): New hook.
> >> 	* targhooks.h (default_predict_doloop_p): New declaration.
> >> 	* targhooks.c (default_predict_doloop_p): New function.
> >> 	* doc/tm.texi.in (TARGET_PREDICT_DOLOOP_P): New hook.
> >> 	* doc/tm.texi: Regenerate.
> >> 	* config/rs6000/rs6000.c (invalid_insn_for_doloop_p): New function.
> >> 	(costly_iter_for_doloop_p): Likewise.
> >> 	(rs6000_predict_doloop_p): Likewise.
> >> 	(TARGET_PREDICT_DOLOOP_P): New macro.
> > Trying to guess what the target is going to do, then changing the
> > structure of the loops in gimple based on that guess -- is that really a
> > good idea.  It's fairly counter to many of the design goals around gimple.
> 
> Understood -- but incorrect selection of ivars has a heavily detrimental effect.
> We have run into this problem many times on Power, where the cost model wrongly
> takes into effect the cost of the compare that will be absorbed into the bdnz
> instruction.  Hot loops can end up 15-20% worse.  I don't see any reasonable
> alternative to a target hook with the present IVOPTS cost model.

Note I would go with GIMPLE level detection of doloop style
decrement and branch and just have a special path in its costing,
for example not costing the compare at all in case there are
doloop patterns?

Richard.
Richard Biener May 15, 2019, 8:53 a.m. UTC | #13
On Wed, 15 May 2019, Segher Boessenkool wrote:

> On Wed, May 15, 2019 at 10:40:09AM +0800, Bin.Cheng wrote:
> > I wonder if you can factor out generic part into GIMPLE and leave
> > target hook as specific as possible?
> 
> Less GIMPLE handling code in the backend would probably be good, yes.  Less
> of the mechanics at least.
> 
> > > +static bool
> > > +invalid_insn_for_doloop_p (struct loop *loop)
> > > +{
> > > +  basic_block *body = get_loop_body (loop);
> > > +  gimple_stmt_iterator gsi;
> > > +
> > > +  for (unsigned i = 0; i < loop->num_nodes; i++)
> > > +    for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
> > The loops on bbs/stmts seem general and can be put in GIMPLE.  So a
> > target hook taking gimple_stmt parameter and returning if the stmt
> > blocks doloop is enough?
> 
> I can't think of any arch where that will not work, for most things.  The
> other big "can we make this loop a doloop" factor is the loop (nest) itself:
> on some archs there can be more than one doloop at once.  Maybe GCC doesn't
> target any such target?
> 
> > > +  /* Similar to doloop_optimize, check whether iteration count too small
> > > +     and not profitable.  */
> > > +  HOST_WIDE_INT est_niter = get_estimated_loop_iterations_int (loop);
> > > +  if (est_niter == -1)
> > > +    est_niter = get_likely_max_loop_iterations_int (loop);
> > > +  if (est_niter >= 0 && est_niter < 3)
> > The only probably target dependent is the magic number 3 here.  After
> > moving all generic part to ivopts, we can use a macro for the moment,
> > and improve it as a parameter if there are more doloop targets.
> 
> It is a constant 3 in current RTL code, already; making it a param would
> be welcome, indeed :-)
> 
> > > +    {
> > > +      if (dump_file && (dump_flags & TDF_DETAILS))
> > > +       fprintf (dump_file,
> > > +                "predict doloop failure due to"
> > > +                "too few iterations (%d).\n",
> > > +                (unsigned int) est_niter);
> 
> %d isn't unsigned, btw.  Just use (int), or use %u?
> 
> > Overall most of above checks can be moved out of backend, leaving only
> > more specific target hook checking on gimple_stmt?  And even that can
> > be made generic to some extend.
> 
> But will that help, will it make the code more readable / maintainable?
> 
> Most of these things aren't generic...  On Power we cannot allow function
> calls inside the loop because the count register is call-clobbered, and
> we cannot allow indirect jumps (like in many switch statements) because
> those use the count register as well.  There can of course be other reasons
> why we do not want calls or switches inside a doloop anyway, but that then
> is just a lucky coincidence ;-)

I wonder if making the doloop patterns (tried to find them in rs6000.md,
but I only see define_expands with no predicates/alternatives...)
accept any counter register, just have a preference on that special
counter reg and have the define_insn deal with RA allocating another
one by emitting a regular update & branch-on-zero?

That is, the penalty of doing that shouldn't be too big and thus
we can more optimistically cost & handle "doloops"?  I guess
the doloop.c checks are quite too strict because we have to
rely on RA being able to allocate that reg and as soon as we
need to spill it using a general reg with update & branch-on-zero
will be cheaper anyways?

Richard.
Segher Boessenkool May 15, 2019, 4:44 p.m. UTC | #14
On Wed, May 15, 2019 at 10:53:43AM +0200, Richard Biener wrote:
> I wonder if making the doloop patterns (tried to find them in rs6000.md,
> but I only see define_expands with no predicates/alternatives...)

"doloop_end" --> "ctr<mode>" --> "<bd>_<mode>"
(all consecutive in rs6000.md btw.)  Alternative 0 in "<bd>_<mode>"
are the actual looping instructions; the other alternatives are for
the uncommon case where we ended up not being able to use this insn
after all.

> accept any counter register, just have a preference on that special
> counter reg and have the define_insn deal with RA allocating another
> one by emitting a regular update & branch-on-zero?

That is what those other alternatives do.  It is expensive, and cannot
even *work* in all cases.

We have no generic "branch on (not) zero" in Power, btw.  Archs that do
can use that as a doloop, if they choose IVs that end the loop at 0.

> That is, the penalty of doing that shouldn't be too big and thus
> we can more optimistically cost & handle "doloops"?

We have done that for ages, in the RTL level doloop handling.  With
newer hardware it becomes more and more expensive to guess wrong.

> I guess
> the doloop.c checks are quite too strict because we have to
> rely on RA being able to allocate that reg and as soon as we
> need to spill it using a general reg with update & branch-on-zero
> will be cheaper anyways?

(Update, compare, branch, for us).

We can predict quite well where the count register will be unavailable
to our doloops.  The cost if we are allocated a GPR isn't so bad: it
costs an insn or maybe two more than if we made optimal code (without
doloop).

But we can be allocated a floating point register, or memory, instead.
That is heavily discouraged (by making it more expensive), but it can
still happen.  This is a jump_insn so it cannot get any reloads, either;
but even if it could, that is an *expensive* thing to do.


Segher
Jeff Law May 20, 2019, 7:31 p.m. UTC | #15
On 5/15/19 10:44 AM, Segher Boessenkool wrote:
> On Wed, May 15, 2019 at 10:53:43AM +0200, Richard Biener wrote:
>> I wonder if making the doloop patterns (tried to find them in rs6000.md,
>> but I only see define_expands with no predicates/alternatives...)
> 
> "doloop_end" --> "ctr<mode>" --> "<bd>_<mode>"
> (all consecutive in rs6000.md btw.)  Alternative 0 in "<bd>_<mode>"
> are the actual looping instructions; the other alternatives are for
> the uncommon case where we ended up not being able to use this insn
> after all.
> 
>> accept any counter register, just have a preference on that special
>> counter reg and have the define_insn deal with RA allocating another
>> one by emitting a regular update & branch-on-zero?
> 
> That is what those other alternatives do.  It is expensive, and cannot
> even *work* in all cases.
> 
> We have no generic "branch on (not) zero" in Power, btw.  Archs that do
> can use that as a doloop, if they choose IVs that end the loop at 0.
> 
>> That is, the penalty of doing that shouldn't be too big and thus
>> we can more optimistically cost & handle "doloops"?
> 
> We have done that for ages, in the RTL level doloop handling.  With
> newer hardware it becomes more and more expensive to guess wrong.
> 
>> I guess
>> the doloop.c checks are quite too strict because we have to
>> rely on RA being able to allocate that reg and as soon as we
>> need to spill it using a general reg with update & branch-on-zero
>> will be cheaper anyways?
> 
> (Update, compare, branch, for us).
> 
> We can predict quite well where the count register will be unavailable
> to our doloops.  The cost if we are allocated a GPR isn't so bad: it
> costs an insn or maybe two more than if we made optimal code (without
> doloop).
> 
> But we can be allocated a floating point register, or memory, instead.
> That is heavily discouraged (by making it more expensive), but it can
> still happen.  This is a jump_insn so it cannot get any reloads, either;
> but even if it could, that is an *expensive* thing to do.
RIght.  ANd that's consistent with what other architectures have needed
to do.  I can't describe the pain of what happens on the PA when you
find out that the loop counter got allocated to the shift amount
register or a floating point register.  It's rare, but you had to handle
it.  Ugh.

jeff
Segher Boessenkool May 21, 2019, 10:02 a.m. UTC | #16
On Mon, May 20, 2019 at 01:31:10PM -0600, Jeff Law wrote:
> On 5/15/19 10:44 AM, Segher Boessenkool wrote:
> > But we can be allocated a floating point register, or memory, instead.
> > That is heavily discouraged (by making it more expensive), but it can
> > still happen.  This is a jump_insn so it cannot get any reloads, either;
> > but even if it could, that is an *expensive* thing to do.
> RIght.  ANd that's consistent with what other architectures have needed
> to do.  I can't describe the pain of what happens on the PA when you
> find out that the loop counter got allocated to the shift amount
> register or a floating point register.  It's rare, but you had to handle
> it.  Ugh.

Maybe it's time to finally allow output reloads on jump insns.  In LRA
only, if that helps?


Segher
diff mbox series

Patch

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index a21f4f7..1c1c8eb 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -83,6 +83,9 @@ 
 #include "tree-ssa-propagate.h"
 #include "tree-vrp.h"
 #include "tree-ssanames.h"
+#include "tree-ssa-loop-niter.h"
+#include "tree-cfg.h"
+#include "tree-scalar-evolution.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -1914,6 +1917,9 @@  static const struct attribute_spec rs6000_attribute_table[] =
 #undef TARGET_CAN_USE_DOLOOP_P
 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
 
+#undef TARGET_PREDICT_DOLOOP_P
+#define TARGET_PREDICT_DOLOOP_P rs6000_predict_doloop_p
+
 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV rs6000_atomic_assign_expand_fenv
 
@@ -39436,7 +39442,173 @@  rs6000_mangle_decl_assembler_name (tree decl, tree id)
   return id;
 }
 
-
+/* Check whether there are some instructions preventing doloop transformation
+   inside loop body, mainly for instructions which are possible to kill CTR.
+
+   Return true if some invalid insn exits, otherwise return false.  */
+
+static bool
+invalid_insn_for_doloop_p (struct loop *loop)
+{
+  basic_block *body = get_loop_body (loop);
+  gimple_stmt_iterator gsi;
+
+  for (unsigned i = 0; i < loop->num_nodes; i++)
+    for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
+      {
+	gimple *stmt = gsi_stmt (gsi);
+	if (is_gimple_call (stmt) && !gimple_call_internal_p (stmt)
+	    && !is_inexpensive_builtin (gimple_call_fndecl (stmt)))
+	  {
+	    if (dump_file && (dump_flags & TDF_DETAILS))
+	      fprintf (dump_file,
+		       "predict doloop failure due to finding call.\n");
+	    return true;
+	  }
+	if (computed_goto_p (stmt))
+	  {
+	    if (dump_file && (dump_flags & TDF_DETAILS))
+	      fprintf (dump_file, "predict doloop failure due to"
+				  "finding computed jump.\n");
+	    return true;
+	  }
+	/* TODO: Now this hook is expected to be called in ivopts, which is
+	   before switchlower1/switchlower2.  Checking for SWITCH at this point
+       will eliminate some good candidates.  But since there are only a few
+       cases having _a_ switch statement in the innermost loop, it can be a low
+	   priority enhancement.  */
+
+	if (gimple_code (stmt) == GIMPLE_SWITCH)
+	  {
+	    if (dump_file && (dump_flags & TDF_DETAILS))
+	      fprintf (dump_file,
+		       "predict doloop failure due to finding switch.\n");
+	    return true;
+	  }
+      }
+
+  return false;
+}
+
+/* Check whether number of iteration computation is too costly for doloop
+   transformation.  It expands the gimple sequence to equivalent RTL insn
+   sequence, then evaluate the cost.
+
+   Return true if it's costly, otherwise return false.  */
+
+static bool
+costly_iter_for_doloop_p (struct loop *loop, tree niters)
+{
+  tree type = TREE_TYPE (niters);
+  unsigned cost = 0, i;
+  tree obj;
+  bool speed = optimize_loop_for_speed_p (loop);
+  int regno = LAST_VIRTUAL_REGISTER + 1;
+  vec<tree> tvec;
+  tvec.create (20);
+  decl_rtl_data tdata;
+  tdata.regno = &regno;
+  tdata.treevec = &tvec;
+  walk_tree (&niters, prepare_decl_rtl, &tdata, NULL);
+  start_sequence ();
+  expand_expr (niters, NULL_RTX, TYPE_MODE (type), EXPAND_NORMAL);
+  rtx_insn *seq = get_insns ();
+  end_sequence ();
+  FOR_EACH_VEC_ELT (tvec, i, obj)
+    SET_DECL_RTL (obj, NULL_RTX);
+  tvec.release ();
+
+  for (; seq; seq = NEXT_INSN (seq))
+    {
+      if (!INSN_P (seq))
+	continue;
+      rtx body = PATTERN (seq);
+      if (GET_CODE (body) == SET)
+	{
+	  rtx set_val = XEXP (body, 1);
+	  enum rtx_code code = GET_CODE (set_val);
+	  enum rtx_class cls = GET_RTX_CLASS (code);
+	  /* For now, we only consider these two RTX classes, to match what we
+	 get in doloop_optimize, excluding operations like zero/sign extend.  */
+	  if (cls == RTX_BIN_ARITH || cls == RTX_COMM_ARITH)
+	    cost += set_src_cost (set_val, GET_MODE (set_val), speed);
+	}
+    }
+  unsigned max_cost
+    = COSTS_N_INSNS (PARAM_VALUE (PARAM_MAX_ITERATIONS_COMPUTATION_COST));
+  if (cost > max_cost)
+    return true;
+
+  return false;
+}
+
+/*
+   Predict whether the given loop will be transformed in the RTL
+   doloop_optimize pass.  This is for use by the IVOPTS middle-end pass.
+   Attempt to duplicate as many doloop_optimize checks as possible.
+
+   Some RTL specific checks seems unable to be checked here, if any
+   new checks or easy checks _are_ missing here.  */
+
+static bool
+rs6000_predict_doloop_p (struct loop *loop)
+{
+  gcc_assert (loop);
+
+  /* On rs6000, targetm.can_use_doloop_p is actually
+     can_use_doloop_if_innermost.  Just ensure it's innermost.  */
+  if (loop->inner != NULL)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "predict doloop failure due to"
+			    "no innermost.\n");
+      return false;
+    }
+
+  /* number_of_latch_executions is not so costly, so we don't use
+     number_of_iterations_exit for iteration description.  */
+  tree niters = number_of_latch_executions (loop);
+  if (niters == NULL_TREE || niters == chrec_dont_know)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "predict doloop failure due to"
+			    "unexpected niters.\n");
+      return false;
+    }
+
+  /* Similar to doloop_optimize, check whether iteration count too small
+     and not profitable.  */
+  HOST_WIDE_INT est_niter = get_estimated_loop_iterations_int (loop);
+  if (est_niter == -1)
+    est_niter = get_likely_max_loop_iterations_int (loop);
+  if (est_niter >= 0 && est_niter < 3)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file,
+		 "predict doloop failure due to"
+		 "too few iterations (%d).\n",
+		 (unsigned int) est_niter);
+      return false;
+    }
+
+  /* Similar to doloop_optimize, check whether number of iterations too costly
+     to compute.  */
+  if (costly_iter_for_doloop_p (loop, niters))
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "predict doloop failure due to"
+			    "costly niter computation.\n");
+      return false;
+    }
+
+  /* Similar to doloop_optimize targetm.invalid_within_doloop, check for
+     CALL, tablejump, computed_jump.  */
+  if (invalid_insn_for_doloop_p (loop))
+    return false;
+
+  return true;
+}
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-rs6000.h"
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 8c8978b..e595b8b 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -11603,6 +11603,14 @@  function version at run-time for a given set of function versions.
 body must be generated.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_PREDICT_DOLOOP_P (struct loop *@var{loop})
+Return true if we can predict it is possible to use low-overhead loops
+for a particular loop.  The parameter @var{loop} is a pointer to the loop
+which is going to be checked.  This target hook is required only when the
+target supports low-overhead loops, and will help some earlier middle-end
+passes to make some decisions.
+@end deftypefn
+
 @deftypefn {Target Hook} bool TARGET_CAN_USE_DOLOOP_P (const widest_int @var{&iterations}, const widest_int @var{&iterations_max}, unsigned int @var{loop_depth}, bool @var{entered_at_top})
 Return true if it is possible to use low-overhead loops (@code{doloop_end}
 and @code{doloop_begin}) for a particular loop.  @var{iterations} gives the
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index fe1194e..e047734 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -7942,6 +7942,8 @@  to by @var{ce_info}.
 
 @hook TARGET_GENERATE_VERSION_DISPATCHER_BODY
 
+@hook TARGET_PREDICT_DOLOOP_P
+
 @hook TARGET_CAN_USE_DOLOOP_P
 
 @hook TARGET_INVALID_WITHIN_DOLOOP
diff --git a/gcc/target.def b/gcc/target.def
index 66cee07..0a80de3 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -4227,6 +4227,15 @@  DEFHOOK
 rtx, (machine_mode mode, rtx result, rtx val, rtx failval),
  default_speculation_safe_value)
  
+DEFHOOK
+(predict_doloop_p,
+ "Return true if we can predict it is possible to use low-overhead loops\n\
+for a particular loop.  The parameter @var{loop} is a pointer to the loop\n\
+which is going to be checked.  This target hook is required only when the\n\
+target supports low-overhead loops, and will help some earlier middle-end\n\
+passes to make some decisions.",
+ bool, (struct loop *loop),
+ default_predict_doloop_p)
 
 DEFHOOK
 (can_use_doloop_p,
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index 318f7e9..56ed4cc 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -643,6 +643,19 @@  default_has_ifunc_p (void)
   return HAVE_GNU_INDIRECT_FUNCTION;
 }
 
+/* True if we can predict this loop is possible to be transformed to a
+   low-overhead loop, otherwise returns false.
+
+   By default, false is returned, as this hook's applicability should be
+   verified for each target.  Target maintainers should re-define the hook
+   if the target can take advantage of it.  */
+
+bool
+default_predict_doloop_p (struct loop *loop ATTRIBUTE_UNUSED)
+{
+  return false;
+}
+
 /* NULL if INSN insn is valid within a low-overhead loop, otherwise returns
    an error message.
 
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 5943627..70bfb17 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -85,6 +85,7 @@  extern bool default_fixed_point_supported_p (void);
 
 extern bool default_has_ifunc_p (void);
 
+extern bool default_predict_doloop_p (struct loop *);
 extern const char * default_invalid_within_doloop (const rtx_insn *);
 
 extern tree default_builtin_vectorized_function (unsigned int, tree, tree);