diff mbox series

[v3,2/3] Add predict_doloop_p target hook

Message ID 1558064130-111037-1-git-send-email-linkw@linux.ibm.com
State New
Headers show
Series None | expand

Commit Message

Kewen.Lin May 17, 2019, 3:35 a.m. UTC
From: Kewen Lin <linkw@linux.ibm.com>

Hi,

Previous version link:
https://gcc.gnu.org/ml/gcc-patches/2019-05/msg00654.html

Comparing with the previous version, I moved the generic
parts of rs6000 target hook to IVOPTs.  But I still kept
the target hook as previous which checks some target
specific criteria like innermost, max iteration counts
etc, and checks for invalid stmt in loop.  The reason
I decided not to move this part to generic is they are
not generic enough.

1) For the part of target specific criteria, if we want
to put it in generic, we can call the hook
targetm.can_use_doloop_p, which requires us to
prepare those four parameters, but each target only needs
one or two parameters, it means we will evaluate some
things which aren't required for that target.  So I'd like
to leave this part to target hook.
2) For the other part of target invalid stmt check, as the
hook invalid_within_doloop grep data shows, no all targets
need to check whether invalid instructions exist in doloop.
If we scan all stmts as generic, it can waste time for those
targets which don't need to check.  Besides, the scope of
the current check on SWITCH in rs6000 hook is wide, later
if we want it more exact, we may need to check more stmts
instead of single.  To let target hook scan the BBs/stmts
by itself is also more flexible.

Bootstrapped and regression testing ongoing on powerpc64le.

Any more comments?

gcc/ChangeLog

2019-05-17  Kewen Lin  <linkw@gcc.gnu.org>

	PR middle-end/80791
	* target.def (predict_doloop_p): New hook.
	* targhooks.h (default_predict_doloop_p): New declaration.
	* targhooks.c (default_predict_doloop_p): New function.
	* doc/tm.texi.in (TARGET_PREDICT_DOLOOP_P): New hook.
	* doc/tm.texi: Regenerate.
	* config/rs6000/rs6000.c (invalid_insn_for_doloop_p): New function.
	(rs6000_predict_doloop_p): Likewise.
	(TARGET_PREDICT_DOLOOP_P): New macro.
	* tree-ssa-loop-ivopts.c (generic_predict_doloop_p): New function. 
	(costly_iter_for_doloop_p): Likewise.

---
 gcc/config/rs6000/rs6000.c |  79 +++++++++++++++++++++++++++++++++-
 gcc/doc/tm.texi            |   8 ++++
 gcc/doc/tm.texi.in         |   2 +
 gcc/target.def             |   9 ++++
 gcc/targhooks.c            |  13 ++++++
 gcc/targhooks.h            |   1 +
 gcc/tree-ssa-loop-ivopts.c | 105 +++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 216 insertions(+), 1 deletion(-)

Comments

Kugan Vivekanandarajah May 17, 2019, 5:30 a.m. UTC | #1
Hi,

On Fri, 17 May 2019 at 13:37, <linkw@linux.ibm.com> wrote:
>
> From: Kewen Lin <linkw@linux.ibm.com>
>
> Hi,
>
> Previous version link:
> https://gcc.gnu.org/ml/gcc-patches/2019-05/msg00654.html
>
> Comparing with the previous version, I moved the generic
> parts of rs6000 target hook to IVOPTs.  But I still kept
> the target hook as previous which checks some target
> specific criteria like innermost, max iteration counts
> etc, and checks for invalid stmt in loop.  The reason
> I decided not to move this part to generic is they are
> not generic enough.
>
> 1) For the part of target specific criteria, if we want
> to put it in generic, we can call the hook
> targetm.can_use_doloop_p, which requires us to
> prepare those four parameters, but each target only needs
> one or two parameters, it means we will evaluate some
> things which aren't required for that target.  So I'd like
> to leave this part to target hook.
> 2) For the other part of target invalid stmt check, as the
> hook invalid_within_doloop grep data shows, no all targets
> need to check whether invalid instructions exist in doloop.
> If we scan all stmts as generic, it can waste time for those
> targets which don't need to check.  Besides, the scope of
> the current check on SWITCH in rs6000 hook is wide, later
> if we want it more exact, we may need to check more stmts
> instead of single.  To let target hook scan the BBs/stmts
> by itself is also more flexible.
>
> Bootstrapped and regression testing ongoing on powerpc64le.
>
> Any more comments?
>
> gcc/ChangeLog
>
> 2019-05-17  Kewen Lin  <linkw@gcc.gnu.org>
>
>         PR middle-end/80791
>         * target.def (predict_doloop_p): New hook.
>         * targhooks.h (default_predict_doloop_p): New declaration.
>         * targhooks.c (default_predict_doloop_p): New function.
>         * doc/tm.texi.in (TARGET_PREDICT_DOLOOP_P): New hook.
>         * doc/tm.texi: Regenerate.
>         * config/rs6000/rs6000.c (invalid_insn_for_doloop_p): New function.
>         (rs6000_predict_doloop_p): Likewise.
>         (TARGET_PREDICT_DOLOOP_P): New macro.
>         * tree-ssa-loop-ivopts.c (generic_predict_doloop_p): New function.
>         (costly_iter_for_doloop_p): Likewise.
>
> ---
>  gcc/config/rs6000/rs6000.c |  79 +++++++++++++++++++++++++++++++++-
>  gcc/doc/tm.texi            |   8 ++++
>  gcc/doc/tm.texi.in         |   2 +
>  gcc/target.def             |   9 ++++
>  gcc/targhooks.c            |  13 ++++++
>  gcc/targhooks.h            |   1 +
>  gcc/tree-ssa-loop-ivopts.c | 105 +++++++++++++++++++++++++++++++++++++++++++++
>  7 files changed, 216 insertions(+), 1 deletion(-)
>
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index a21f4f7..2fd52d7 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -83,6 +83,7 @@
>  #include "tree-ssa-propagate.h"
>  #include "tree-vrp.h"
>  #include "tree-ssanames.h"
> +#include "tree-cfg.h"
>
>  /* This file should be included last.  */
>  #include "target-def.h"
> @@ -1914,6 +1915,9 @@ static const struct attribute_spec rs6000_attribute_table[] =
>  #undef TARGET_CAN_USE_DOLOOP_P
>  #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
>
> +#undef TARGET_PREDICT_DOLOOP_P
> +#define TARGET_PREDICT_DOLOOP_P rs6000_predict_doloop_p
> +
>  #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
>  #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV rs6000_atomic_assign_expand_fenv
>
> @@ -39436,7 +39440,80 @@ rs6000_mangle_decl_assembler_name (tree decl, tree id)
>    return id;
>  }
>
> -
> +/* Check whether there are some instructions preventing doloop transformation
> +   inside loop body, mainly for instructions which are possible to kill CTR.
> +
> +   Return true if some invalid insn exits, otherwise return false.  */
> +
> +static bool
> +invalid_insn_for_doloop_p (struct loop *loop)
> +{
> +  basic_block *body = get_loop_body (loop);
> +  gimple_stmt_iterator gsi;
> +
> +  for (unsigned i = 0; i < loop->num_nodes; i++)
> +    for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
> +      {
> +       gimple *stmt = gsi_stmt (gsi);
> +       if (is_gimple_call (stmt) && !gimple_call_internal_p (stmt)
> +           && !is_inexpensive_builtin (gimple_call_fndecl (stmt)))
> +         {
> +           if (dump_file && (dump_flags & TDF_DETAILS))
> +             fprintf (dump_file,
> +                      "predict doloop failure due to finding call.\n");
> +           return true;
> +         }
> +       if (computed_goto_p (stmt))
> +         {
> +           if (dump_file && (dump_flags & TDF_DETAILS))
> +             fprintf (dump_file, "predict doloop failure due to"
> +                                 "finding computed jump.\n");
> +           return true;
> +         }
> +
> +       /* TODO: Now this hook is expected to be called in ivopts, which is
> +          before switchlower1/switchlower2.  Checking for SWITCH at this point
> +          will eliminate some good candidates.  But since there are only a few
> +          cases having _a_ switch statement in the innermost loop, it can be a
> +          low priority enhancement.  */
> +       if (gimple_code (stmt) == GIMPLE_SWITCH)
> +         {
> +           if (dump_file && (dump_flags & TDF_DETAILS))
> +             fprintf (dump_file,
> +                      "predict doloop failure due to finding switch.\n");
> +           return true;
> +         }
> +      }
> +
> +  return false;
> +}
> +
> +/* Predict whether the given loop in gimple will be transformed in the RTL
> +   doloop_optimize pass.  This is for rs6000 target specific.  */
> +
> +static bool
> +rs6000_predict_doloop_p (struct loop *loop)
> +{
> +  gcc_assert (loop);
> +
> +  /* On rs6000, targetm.can_use_doloop_p is actually
> +     can_use_doloop_if_innermost.  Just ensure it's innermost.  */
> +  if (loop->inner != NULL)
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +       fprintf (dump_file, "predict doloop failure due to"
> +                           "no innermost.\n");
> +      return false;
> +    }
> +
> +  /* Similar to doloop_optimize targetm.invalid_within_doloop, check for
> +     CALL, tablejump, computed_jump.  */
> +  if (invalid_insn_for_doloop_p (loop))
> +    return false;
> +
> +  return true;
> +}
> +
>  struct gcc_target targetm = TARGET_INITIALIZER;
>
>  #include "gt-rs6000.h"
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index 8c8978b..e595b8b 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -11603,6 +11603,14 @@ function version at run-time for a given set of function versions.
>  body must be generated.
>  @end deftypefn
>
> +@deftypefn {Target Hook} bool TARGET_PREDICT_DOLOOP_P (struct loop *@var{loop})
> +Return true if we can predict it is possible to use low-overhead loops
> +for a particular loop.  The parameter @var{loop} is a pointer to the loop
> +which is going to be checked.  This target hook is required only when the
> +target supports low-overhead loops, and will help some earlier middle-end
> +passes to make some decisions.
> +@end deftypefn
> +
>  @deftypefn {Target Hook} bool TARGET_CAN_USE_DOLOOP_P (const widest_int @var{&iterations}, const widest_int @var{&iterations_max}, unsigned int @var{loop_depth}, bool @var{entered_at_top})
>  Return true if it is possible to use low-overhead loops (@code{doloop_end}
>  and @code{doloop_begin}) for a particular loop.  @var{iterations} gives the
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index fe1194e..e047734 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -7942,6 +7942,8 @@ to by @var{ce_info}.
>
>  @hook TARGET_GENERATE_VERSION_DISPATCHER_BODY
>
> +@hook TARGET_PREDICT_DOLOOP_P
> +
>  @hook TARGET_CAN_USE_DOLOOP_P
>
>  @hook TARGET_INVALID_WITHIN_DOLOOP
> diff --git a/gcc/target.def b/gcc/target.def
> index 66cee07..0a80de3 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -4227,6 +4227,15 @@ DEFHOOK
>  rtx, (machine_mode mode, rtx result, rtx val, rtx failval),
>   default_speculation_safe_value)
>
> +DEFHOOK
> +(predict_doloop_p,
> + "Return true if we can predict it is possible to use low-overhead loops\n\
> +for a particular loop.  The parameter @var{loop} is a pointer to the loop\n\
> +which is going to be checked.  This target hook is required only when the\n\
> +target supports low-overhead loops, and will help some earlier middle-end\n\
> +passes to make some decisions.",
> + bool, (struct loop *loop),
> + default_predict_doloop_p)
>
>  DEFHOOK
>  (can_use_doloop_p,
> diff --git a/gcc/targhooks.c b/gcc/targhooks.c
> index 318f7e9..56ed4cc 100644
> --- a/gcc/targhooks.c
> +++ b/gcc/targhooks.c
> @@ -643,6 +643,19 @@ default_has_ifunc_p (void)
>    return HAVE_GNU_INDIRECT_FUNCTION;
>  }
>
> +/* True if we can predict this loop is possible to be transformed to a
> +   low-overhead loop, otherwise returns false.
> +
> +   By default, false is returned, as this hook's applicability should be
> +   verified for each target.  Target maintainers should re-define the hook
> +   if the target can take advantage of it.  */
> +
> +bool
> +default_predict_doloop_p (struct loop *loop ATTRIBUTE_UNUSED)
> +{
> +  return false;
> +}
> +
>  /* NULL if INSN insn is valid within a low-overhead loop, otherwise returns
>     an error message.
>
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> index 5943627..70bfb17 100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -85,6 +85,7 @@ extern bool default_fixed_point_supported_p (void);
>
>  extern bool default_has_ifunc_p (void);
>
> +extern bool default_predict_doloop_p (struct loop *);
>  extern const char * default_invalid_within_doloop (const rtx_insn *);
>
>  extern tree default_builtin_vectorized_function (unsigned int, tree, tree);
> diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
> index a44b4cb..ed7f2a5 100644
> --- a/gcc/tree-ssa-loop-ivopts.c
> +++ b/gcc/tree-ssa-loop-ivopts.c
> @@ -3776,6 +3776,111 @@ prepare_decl_rtl (tree *expr_p, int *ws, void *data)
>    return NULL_TREE;
>  }
>
> +/* Check whether number of iteration computation is too costly for doloop
> +   transformation.  It expands the gimple sequence to equivalent RTL insn
> +   sequence, then evaluate the cost.
> +
> +   Return true if it's costly, otherwise return false.  */
> +
> +static bool
> +costly_iter_for_doloop_p (struct loop *loop, tree niters)
> +{
> +  tree type = TREE_TYPE (niters);
> +  unsigned cost = 0;
> +  bool speed = optimize_loop_for_speed_p (loop);
> +  int regno = LAST_VIRTUAL_REGISTER + 1;
> +  walk_tree (&niters, prepare_decl_rtl, &regno, NULL);
> +  start_sequence ();
> +  expand_expr (niters, NULL_RTX, TYPE_MODE (type), EXPAND_NORMAL);
> +  rtx_insn *seq = get_insns ();
> +  end_sequence ();
> +
> +  for (; seq; seq = NEXT_INSN (seq))
> +    {
> +      if (!INSN_P (seq))
> +       continue;
> +      rtx body = PATTERN (seq);
> +      if (GET_CODE (body) == SET)
> +       {
> +         rtx set_val = XEXP (body, 1);
> +         enum rtx_code code = GET_CODE (set_val);
> +         enum rtx_class cls = GET_RTX_CLASS (code);
> +         /* For now, we only consider these two RTX classes, to match what we
> +        get in doloop_optimize, excluding operations like zero/sign extend.  */
> +         if (cls == RTX_BIN_ARITH || cls == RTX_COMM_ARITH)
> +           cost += set_src_cost (set_val, GET_MODE (set_val), speed);
Cant you have PARALLEL with SET here?

> +       }
> +    }
> +  unsigned max_cost
> +    = COSTS_N_INSNS (PARAM_VALUE (PARAM_MAX_ITERATIONS_COMPUTATION_COST));
> +  if (cost > max_cost)
> +    return true;
Maybe it is better to bailout early if the limit is reached instead of
doing it outside the loop?

Thanks,
Kugan

> +
> +  return false;
> +}
> +
> +/* Predict whether the given loop will be transformed in the RTL
> +   doloop_optimize pass.  Attempt to duplicate as many doloop_optimize checks
> +   as possible.  This is only for target independent checks, see
> +   targetm.predict_doloop_p for the target dependent ones.
> +
> +   Some RTL specific checks seems unable to be checked in gimple, if any new
> +   checks or easy checks _are_ missing here, please add them.  */
> +
> +static bool
> +generic_predict_doloop_p (struct ivopts_data *data)
> +{
> +  struct loop *loop = data->current_loop;
> +
> +  /* Call target hook for target dependent checks.  */
> +  if (!targetm.predict_doloop_p (loop))
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +       fprintf (dump_file, "predict doloop failure due to"
> +                           "target specific checks.\n");
> +      return false;
> +    }
> +
> +  /* Similar to doloop_optimize, check iteration description to know it's
> +     suitable or not.  */
> +  edge exit = loop_latch_edge (loop);
> +  struct tree_niter_desc *niter_desc = niter_for_exit (data, exit);
> +  if (niter_desc == NULL)
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +       fprintf (dump_file, "predict doloop failure due to"
> +                           "unexpected niters.\n");
> +      return false;
> +    }
> +
> +  /* Similar to doloop_optimize, check whether iteration count too small
> +     and not profitable.  */
> +  HOST_WIDE_INT est_niter = get_estimated_loop_iterations_int (loop);
> +  if (est_niter == -1)
> +    est_niter = get_likely_max_loop_iterations_int (loop);
> +  if (est_niter >= 0 && est_niter < 3)
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +       fprintf (dump_file,
> +                "predict doloop failure due to"
> +                "too few iterations (%u).\n",
> +                (unsigned int) est_niter);
> +      return false;
> +    }
> +
> +  /* Similar to doloop_optimize, check whether number of iterations too costly
> +     to compute.  */
> +  if (costly_iter_for_doloop_p (loop, niter_desc->niter))
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +       fprintf (dump_file, "predict doloop failure due to"
> +                           "costly niter computation.\n");
> +      return false;
> +    }
> +
> +  return true;
> +}
> +
>  /* Determines cost of the computation of EXPR.  */
>
>  static unsigned
> --
> 2.7.4
>
Kewen.Lin May 17, 2019, 6:15 a.m. UTC | #2
on 2019/5/17 下午1:30, Kugan Vivekanandarajah wrote:
> Hi,
> 
> On Fri, 17 May 2019 at 13:37, <linkw@linux.ibm.com> wrote:
>>
>> From: Kewen Lin <linkw@linux.ibm.com>
>>
>> +/* Check whether number of iteration computation is too costly for doloop
>> +   transformation.  It expands the gimple sequence to equivalent RTL insn
>> +   sequence, then evaluate the cost.
>> +
>> +   Return true if it's costly, otherwise return false.  */
>> +
>> +static bool
>> +costly_iter_for_doloop_p (struct loop *loop, tree niters)
>> +{
>> +  tree type = TREE_TYPE (niters);
>> +  unsigned cost = 0;
>> +  bool speed = optimize_loop_for_speed_p (loop);
>> +  int regno = LAST_VIRTUAL_REGISTER + 1;
>> +  walk_tree (&niters, prepare_decl_rtl, &regno, NULL);
>> +  start_sequence ();
>> +  expand_expr (niters, NULL_RTX, TYPE_MODE (type), EXPAND_NORMAL);
>> +  rtx_insn *seq = get_insns ();
>> +  end_sequence ();
>> +
>> +  for (; seq; seq = NEXT_INSN (seq))
>> +    {
>> +      if (!INSN_P (seq))
>> +       continue;
>> +      rtx body = PATTERN (seq);
>> +      if (GET_CODE (body) == SET)
>> +       {
>> +         rtx set_val = XEXP (body, 1);
>> +         enum rtx_code code = GET_CODE (set_val);
>> +         enum rtx_class cls = GET_RTX_CLASS (code);
>> +         /* For now, we only consider these two RTX classes, to match what we
>> +        get in doloop_optimize, excluding operations like zero/sign extend.  */
>> +         if (cls == RTX_BIN_ARITH || cls == RTX_COMM_ARITH)
>> +           cost += set_src_cost (set_val, GET_MODE (set_val), speed);
> Cant you have PARALLEL with SET here?
> 

Thanks for catching, updated it with single_set for PARALLEL.

-      if (!INSN_P (seq))
-       continue;
-      rtx body = PATTERN (seq);
-      if (GET_CODE (body) == SET)
+      rtx set = single_set (seq);
+      if (set != NULL_RTX)
        {
-         rtx set_val = XEXP (body, 1);
+         rtx set_val = XEXP (set, 1);


>> +       }
>> +    }
>> +  unsigned max_cost
>> +    = COSTS_N_INSNS (PARAM_VALUE (PARAM_MAX_ITERATIONS_COMPUTATION_COST));
>> +  if (cost > max_cost)
>> +    return true;
> Maybe it is better to bailout early if the limit is reached instead of
> doing it outside the loop?
> 

Good point.  Based on those cases I've checked so far, most of them are less
than max cost, it looks most cases won't return early.  Too many early checks
seem inefficient to some extent.  Does it make sense?
And we have to collect some statistics for sure. :)


Thanks,
Kewen

> Thanks,
> Kugan
> 
>> +
>> +  return false;
>> +}
>> +
>> +/* Predict whether the given loop will be transformed in the RTL
>> +   doloop_optimize pass.  Attempt to duplicate as many doloop_optimize checks
>> +   as possible.  This is only for target independent checks, see
>> +   targetm.predict_doloop_p for the target dependent ones.
>> +
>> +   Some RTL specific checks seems unable to be checked in gimple, if any new
>> +   checks or easy checks _are_ missing here, please add them.  */
>> +
>> +static bool
>> +generic_predict_doloop_p (struct ivopts_data *data)
>> +{
>> +  struct loop *loop = data->current_loop;
>> +
>> +  /* Call target hook for target dependent checks.  */
>> +  if (!targetm.predict_doloop_p (loop))
>> +    {
>> +      if (dump_file && (dump_flags & TDF_DETAILS))
>> +       fprintf (dump_file, "predict doloop failure due to"
>> +                           "target specific checks.\n");
>> +      return false;
>> +    }
>> +
>> +  /* Similar to doloop_optimize, check iteration description to know it's
>> +     suitable or not.  */
>> +  edge exit = loop_latch_edge (loop);
>> +  struct tree_niter_desc *niter_desc = niter_for_exit (data, exit);
>> +  if (niter_desc == NULL)
>> +    {
>> +      if (dump_file && (dump_flags & TDF_DETAILS))
>> +       fprintf (dump_file, "predict doloop failure due to"
>> +                           "unexpected niters.\n");
>> +      return false;
>> +    }
>> +
>> +  /* Similar to doloop_optimize, check whether iteration count too small
>> +     and not profitable.  */
>> +  HOST_WIDE_INT est_niter = get_estimated_loop_iterations_int (loop);
>> +  if (est_niter == -1)
>> +    est_niter = get_likely_max_loop_iterations_int (loop);
>> +  if (est_niter >= 0 && est_niter < 3)
>> +    {
>> +      if (dump_file && (dump_flags & TDF_DETAILS))
>> +       fprintf (dump_file,
>> +                "predict doloop failure due to"
>> +                "too few iterations (%u).\n",
>> +                (unsigned int) est_niter);
>> +      return false;
>> +    }
>> +
>> +  /* Similar to doloop_optimize, check whether number of iterations too costly
>> +     to compute.  */
>> +  if (costly_iter_for_doloop_p (loop, niter_desc->niter))
>> +    {
>> +      if (dump_file && (dump_flags & TDF_DETAILS))
>> +       fprintf (dump_file, "predict doloop failure due to"
>> +                           "costly niter computation.\n");
>> +      return false;
>> +    }
>> +
>> +  return true;
>> +}
>> +
>>  /* Determines cost of the computation of EXPR.  */
>>
>>  static unsigned
>> --
>> 2.7.4
>>
>
Segher Boessenkool May 17, 2019, 6:49 a.m. UTC | #3
Hi Kewen,

On Thu, May 16, 2019 at 10:35:30PM -0500, linkw@linux.ibm.com wrote:
> 2) For the other part of target invalid stmt check, as the
> hook invalid_within_doloop grep data shows, no all targets
> need to check whether invalid instructions exist in doloop.
> If we scan all stmts as generic, it can waste time for those
> targets which don't need to check.

So make the default version of the hook NULL, and only run the hook if
non-null?  There are many examples of this.

>  Besides, the scope of
> the current check on SWITCH in rs6000 hook is wide, later
> if we want it more exact, we may need to check more stmts
> instead of single.  To let target hook scan the BBs/stmts
> by itself is also more flexible.

If we'll need that flexibility, okay.

> +static bool
> +invalid_insn_for_doloop_p (struct loop *loop)
> +{
> +  basic_block *body = get_loop_body (loop);
> +  gimple_stmt_iterator gsi;
> +
> +  for (unsigned i = 0; i < loop->num_nodes; i++)
> +    for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
> +      {
> +	gimple *stmt = gsi_stmt (gsi);
> +	if (is_gimple_call (stmt) && !gimple_call_internal_p (stmt)
> +	    && !is_inexpensive_builtin (gimple_call_fndecl (stmt)))
> +	  {
> +	    if (dump_file && (dump_flags & TDF_DETAILS))
> +	      fprintf (dump_file,
> +		       "predict doloop failure due to finding call.\n");

Should this really be for -all dumps only?  "X failed because Y" is often
very interesting info -- and it is not much output.

(Please start the line with a capital if you end it with a period :-) )

> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +	fprintf (dump_file, "predict doloop failure due to"
> +			    "no innermost.\n");

If you paste strings (which is fine for debug output), you still need a
space between words ;-)

> +@deftypefn {Target Hook} bool TARGET_PREDICT_DOLOOP_P (struct loop *@var{loop})
> +Return true if we can predict it is possible to use low-overhead loops
> +for a particular loop.  The parameter @var{loop} is a pointer to the loop

"... use a low-overhead loop ..."

> +which is going to be checked.  This target hook is required only when the

Just remove the whole "which is going to be checked" part?

> +target supports low-overhead loops, and will help some earlier middle-end
> +passes to make some decisions.

Is it *required* when the target has doloops?  And what will happen if you
do not define this hook, either if or you have doloops or if you don't?

Hook documentation often ends "The default version of this hook returns..."
which neatly answers all this.

> +	  /* For now, we only consider these two RTX classes, to match what we
> +	 get in doloop_optimize, excluding operations like zero/sign extend.  */

The indentation is broken here.

> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +	fprintf (dump_file, "predict doloop failure due to"
> +			    "target specific checks.\n");

Missing space as well (and more later, please check all).


Segher
Segher Boessenkool May 17, 2019, 6:57 a.m. UTC | #4
On Fri, May 17, 2019 at 03:30:41PM +1000, Kugan Vivekanandarajah wrote:
> On Fri, 17 May 2019 at 13:37, <linkw@linux.ibm.com> wrote:
> > +      if (GET_CODE (body) == SET)
> > +       {
> > +         rtx set_val = XEXP (body, 1);
> > +         enum rtx_code code = GET_CODE (set_val);
> > +         enum rtx_class cls = GET_RTX_CLASS (code);
> > +         /* For now, we only consider these two RTX classes, to match what we
> > +        get in doloop_optimize, excluding operations like zero/sign extend.  */
> > +         if (cls == RTX_BIN_ARITH || cls == RTX_COMM_ARITH)
> > +           cost += set_src_cost (set_val, GET_MODE (set_val), speed);
> Cant you have PARALLEL with SET here?

So it should use single_set and SET_SRC?  Yeah I guess.

For Power, we don't have many PARALLELs in freshly expanded code, so it
doesn't make much difference for us.

> > +  if (cost > max_cost)
> > +    return true;
> Maybe it is better to bailout early if the limit is reached instead of
> doing it outside the loop?

That won't be more complicated code here, so yes let's do that.


Segher
Kewen.Lin May 17, 2019, 7:20 a.m. UTC | #5
Hi Segher,

on 2019/5/17 下午2:49, Segher Boessenkool wrote:
> Hi Kewen,
> 
> On Thu, May 16, 2019 at 10:35:30PM -0500, linkw@linux.ibm.com wrote:
>> 2) For the other part of target invalid stmt check, as the
>> hook invalid_within_doloop grep data shows, no all targets
>> need to check whether invalid instructions exist in doloop.
>> If we scan all stmts as generic, it can waste time for those
>> targets which don't need to check.
> 
> So make the default version of the hook NULL, and only run the hook if
> non-null?  There are many examples of this.
> 

Good tips!

>>  Besides, the scope of
>> the current check on SWITCH in rs6000 hook is wide, later
>> if we want it more exact, we may need to check more stmts
>> instead of single.  To let target hook scan the BBs/stmts
>> by itself is also more flexible.
> 
> If we'll need that flexibility, okay.
> 
>> +static bool
>> +invalid_insn_for_doloop_p (struct loop *loop)
>> +{
>> +  basic_block *body = get_loop_body (loop);
>> +  gimple_stmt_iterator gsi;
>> +
>> +  for (unsigned i = 0; i < loop->num_nodes; i++)
>> +    for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
>> +      {
>> +	gimple *stmt = gsi_stmt (gsi);
>> +	if (is_gimple_call (stmt) && !gimple_call_internal_p (stmt)
>> +	    && !is_inexpensive_builtin (gimple_call_fndecl (stmt)))
>> +	  {
>> +	    if (dump_file && (dump_flags & TDF_DETAILS))
>> +	      fprintf (dump_file,
>> +		       "predict doloop failure due to finding call.\n");
> 
> Should this really be for -all dumps only?  "X failed because Y" is often
> very interesting info -- and it is not much output.
> 

OK, I thought users can firstly check the predict_doloop result whether
meet theirs' expectation and use -details dump for further check why.
And IVOPTs also uses "TDF_DETAILS" for dumping much.

Just remove the flags check? or which TDF level you suggested?

> (Please start the line with a capital if you end it with a period :-) )
> 

Some contexts seems missing for this?  Or it's for dumping string?

>> +      if (dump_file && (dump_flags & TDF_DETAILS))
>> +	fprintf (dump_file, "predict doloop failure due to"
>> +			    "no innermost.\n");
> 
> If you paste strings (which is fine for debug output), you still need a
> space between words ;-)
> 

Good catches here and later!  :)

>> +@deftypefn {Target Hook} bool TARGET_PREDICT_DOLOOP_P (struct loop *@var{loop})
>> +Return true if we can predict it is possible to use low-overhead loops
>> +for a particular loop.  The parameter @var{loop} is a pointer to the loop
> 
> "... use a low-overhead loop ..."
> 
>> +which is going to be checked.  This target hook is required only when the
> 
> Just remove the whole "which is going to be checked" part?
> 
>> +target supports low-overhead loops, and will help some earlier middle-end
>> +passes to make some decisions.
> 
> Is it *required* when the target has doloops?  And what will happen if you
> do not define this hook, either if or you have doloops or if you don't?
> 
> Hook documentation often ends "The default version of this hook returns..."
> which neatly answers all this.
> 

OK, will update it with "the default version of this hook return false."

>> +	  /* For now, we only consider these two RTX classes, to match what we
>> +	 get in doloop_optimize, excluding operations like zero/sign extend.  */
> 
> The indentation is broken here.
> 

Good catch, contrib/check_GNU_style.sh did NOT catch this.  :(


Thanks,
Kewen

>> +      if (dump_file && (dump_flags & TDF_DETAILS))
>> +	fprintf (dump_file, "predict doloop failure due to"
>> +			    "target specific checks.\n");
> 
> Missing space as well (and more later, please check all).
>
> 
> Segher
>
Richard Biener May 20, 2019, 9:28 a.m. UTC | #6
On Thu, 16 May 2019, linkw@linux.ibm.com wrote:

> From: Kewen Lin <linkw@linux.ibm.com>
> 
> Hi,
> 
> Previous version link:
> https://gcc.gnu.org/ml/gcc-patches/2019-05/msg00654.html
> 
> Comparing with the previous version, I moved the generic
> parts of rs6000 target hook to IVOPTs.  But I still kept
> the target hook as previous which checks some target
> specific criteria like innermost, max iteration counts
> etc, and checks for invalid stmt in loop.  The reason
> I decided not to move this part to generic is they are
> not generic enough.
> 
> 1) For the part of target specific criteria, if we want
> to put it in generic, we can call the hook
> targetm.can_use_doloop_p, which requires us to
> prepare those four parameters, but each target only needs
> one or two parameters, it means we will evaluate some
> things which aren't required for that target.  So I'd like
> to leave this part to target hook.
> 2) For the other part of target invalid stmt check, as the
> hook invalid_within_doloop grep data shows, no all targets
> need to check whether invalid instructions exist in doloop.
> If we scan all stmts as generic, it can waste time for those
> targets which don't need to check.  Besides, the scope of
> the current check on SWITCH in rs6000 hook is wide, later
> if we want it more exact, we may need to check more stmts
> instead of single.  To let target hook scan the BBs/stmts
> by itself is also more flexible.
> 
> Bootstrapped and regression testing ongoing on powerpc64le.
> 
> Any more comments?
> 
> gcc/ChangeLog
> 
> 2019-05-17  Kewen Lin  <linkw@gcc.gnu.org>
> 
> 	PR middle-end/80791
> 	* target.def (predict_doloop_p): New hook.
> 	* targhooks.h (default_predict_doloop_p): New declaration.
> 	* targhooks.c (default_predict_doloop_p): New function.
> 	* doc/tm.texi.in (TARGET_PREDICT_DOLOOP_P): New hook.
> 	* doc/tm.texi: Regenerate.
> 	* config/rs6000/rs6000.c (invalid_insn_for_doloop_p): New function.
> 	(rs6000_predict_doloop_p): Likewise.
> 	(TARGET_PREDICT_DOLOOP_P): New macro.
> 	* tree-ssa-loop-ivopts.c (generic_predict_doloop_p): New function. 
> 	(costly_iter_for_doloop_p): Likewise.
> 
> ---
>  gcc/config/rs6000/rs6000.c |  79 +++++++++++++++++++++++++++++++++-
>  gcc/doc/tm.texi            |   8 ++++
>  gcc/doc/tm.texi.in         |   2 +
>  gcc/target.def             |   9 ++++
>  gcc/targhooks.c            |  13 ++++++
>  gcc/targhooks.h            |   1 +
>  gcc/tree-ssa-loop-ivopts.c | 105 +++++++++++++++++++++++++++++++++++++++++++++
>  7 files changed, 216 insertions(+), 1 deletion(-)
> 
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index a21f4f7..2fd52d7 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -83,6 +83,7 @@
>  #include "tree-ssa-propagate.h"
>  #include "tree-vrp.h"
>  #include "tree-ssanames.h"
> +#include "tree-cfg.h"
>  
>  /* This file should be included last.  */
>  #include "target-def.h"
> @@ -1914,6 +1915,9 @@ static const struct attribute_spec rs6000_attribute_table[] =
>  #undef TARGET_CAN_USE_DOLOOP_P
>  #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
>  
> +#undef TARGET_PREDICT_DOLOOP_P
> +#define TARGET_PREDICT_DOLOOP_P rs6000_predict_doloop_p
> +
>  #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
>  #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV rs6000_atomic_assign_expand_fenv
>  
> @@ -39436,7 +39440,80 @@ rs6000_mangle_decl_assembler_name (tree decl, tree id)
>    return id;
>  }
>  
> -
> +/* Check whether there are some instructions preventing doloop transformation
> +   inside loop body, mainly for instructions which are possible to kill CTR.
> +
> +   Return true if some invalid insn exits, otherwise return false.  */
> +
> +static bool
> +invalid_insn_for_doloop_p (struct loop *loop)
> +{
> +  basic_block *body = get_loop_body (loop);

You leak memory here.  I still think all of this function should
be in generic code for now.

> +  gimple_stmt_iterator gsi;
> +
> +  for (unsigned i = 0; i < loop->num_nodes; i++)
> +    for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
> +      {
> +	gimple *stmt = gsi_stmt (gsi);
> +	if (is_gimple_call (stmt) && !gimple_call_internal_p (stmt)
> +	    && !is_inexpensive_builtin (gimple_call_fndecl (stmt)))
> +	  {
> +	    if (dump_file && (dump_flags & TDF_DETAILS))
> +	      fprintf (dump_file,
> +		       "predict doloop failure due to finding call.\n");
> +	    return true;
> +	  }
> +	if (computed_goto_p (stmt))
> +	  {
> +	    if (dump_file && (dump_flags & TDF_DETAILS))
> +	      fprintf (dump_file, "predict doloop failure due to"
> +				  "finding computed jump.\n");
> +	    return true;
> +	  }
> +
> +	/* TODO: Now this hook is expected to be called in ivopts, which is
> +	   before switchlower1/switchlower2.  Checking for SWITCH at this point
> +	   will eliminate some good candidates.  But since there are only a few
> +	   cases having _a_ switch statement in the innermost loop, it can be a
> +	   low priority enhancement.  */
> +	if (gimple_code (stmt) == GIMPLE_SWITCH)
> +	  {
> +	    if (dump_file && (dump_flags & TDF_DETAILS))
> +	      fprintf (dump_file,
> +		       "predict doloop failure due to finding switch.\n");
> +	    return true;
> +	  }
> +      }
> +
> +  return false;
> +}
> +
> +/* Predict whether the given loop in gimple will be transformed in the RTL
> +   doloop_optimize pass.  This is for rs6000 target specific.  */
> +
> +static bool
> +rs6000_predict_doloop_p (struct loop *loop)
> +{
> +  gcc_assert (loop);
> +
> +  /* On rs6000, targetm.can_use_doloop_p is actually
> +     can_use_doloop_if_innermost.  Just ensure it's innermost.  */

So the point is that ppc only has a single counter register, right?

So the better way would be to expose that via a target hook somehow.
Or simply restrict IVOPTs processing to innermost loops for now.

> +  if (loop->inner != NULL)
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +	fprintf (dump_file, "predict doloop failure due to"
> +			    "no innermost.\n");
> +      return false;
> +    }
> +
> +  /* Similar to doloop_optimize targetm.invalid_within_doloop, check for
> +     CALL, tablejump, computed_jump.  */
> +  if (invalid_insn_for_doloop_p (loop))
> +    return false;
> +
> +  return true;
> +}
> +
>  struct gcc_target targetm = TARGET_INITIALIZER;
>  
>  #include "gt-rs6000.h"
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index 8c8978b..e595b8b 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -11603,6 +11603,14 @@ function version at run-time for a given set of function versions.
>  body must be generated.
>  @end deftypefn
>  
> +@deftypefn {Target Hook} bool TARGET_PREDICT_DOLOOP_P (struct loop *@var{loop})
> +Return true if we can predict it is possible to use low-overhead loops
> +for a particular loop.  The parameter @var{loop} is a pointer to the loop
> +which is going to be checked.  This target hook is required only when the
> +target supports low-overhead loops, and will help some earlier middle-end
> +passes to make some decisions.
> +@end deftypefn
> +
>  @deftypefn {Target Hook} bool TARGET_CAN_USE_DOLOOP_P (const widest_int @var{&iterations}, const widest_int @var{&iterations_max}, unsigned int @var{loop_depth}, bool @var{entered_at_top})
>  Return true if it is possible to use low-overhead loops (@code{doloop_end}
>  and @code{doloop_begin}) for a particular loop.  @var{iterations} gives the
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index fe1194e..e047734 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -7942,6 +7942,8 @@ to by @var{ce_info}.
>  
>  @hook TARGET_GENERATE_VERSION_DISPATCHER_BODY
>  
> +@hook TARGET_PREDICT_DOLOOP_P
> +
>  @hook TARGET_CAN_USE_DOLOOP_P
>  
>  @hook TARGET_INVALID_WITHIN_DOLOOP
> diff --git a/gcc/target.def b/gcc/target.def
> index 66cee07..0a80de3 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -4227,6 +4227,15 @@ DEFHOOK
>  rtx, (machine_mode mode, rtx result, rtx val, rtx failval),
>   default_speculation_safe_value)
>   
> +DEFHOOK
> +(predict_doloop_p,
> + "Return true if we can predict it is possible to use low-overhead loops\n\
> +for a particular loop.  The parameter @var{loop} is a pointer to the loop\n\
> +which is going to be checked.  This target hook is required only when the\n\
> +target supports low-overhead loops, and will help some earlier middle-end\n\
> +passes to make some decisions.",
> + bool, (struct loop *loop),
> + default_predict_doloop_p)
>  
>  DEFHOOK
>  (can_use_doloop_p,
> diff --git a/gcc/targhooks.c b/gcc/targhooks.c
> index 318f7e9..56ed4cc 100644
> --- a/gcc/targhooks.c
> +++ b/gcc/targhooks.c
> @@ -643,6 +643,19 @@ default_has_ifunc_p (void)
>    return HAVE_GNU_INDIRECT_FUNCTION;
>  }
>  
> +/* True if we can predict this loop is possible to be transformed to a
> +   low-overhead loop, otherwise returns false.
> +
> +   By default, false is returned, as this hook's applicability should be
> +   verified for each target.  Target maintainers should re-define the hook
> +   if the target can take advantage of it.  */
> +
> +bool
> +default_predict_doloop_p (struct loop *loop ATTRIBUTE_UNUSED)
> +{
> +  return false;
> +}
> +
>  /* NULL if INSN insn is valid within a low-overhead loop, otherwise returns
>     an error message.
>  
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> index 5943627..70bfb17 100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -85,6 +85,7 @@ extern bool default_fixed_point_supported_p (void);
>  
>  extern bool default_has_ifunc_p (void);
>  
> +extern bool default_predict_doloop_p (struct loop *);
>  extern const char * default_invalid_within_doloop (const rtx_insn *);
>  
>  extern tree default_builtin_vectorized_function (unsigned int, tree, tree);
> diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
> index a44b4cb..ed7f2a5 100644
> --- a/gcc/tree-ssa-loop-ivopts.c
> +++ b/gcc/tree-ssa-loop-ivopts.c
> @@ -3776,6 +3776,111 @@ prepare_decl_rtl (tree *expr_p, int *ws, void *data)
>    return NULL_TREE;
>  }
>  
> +/* Check whether number of iteration computation is too costly for doloop
> +   transformation.  It expands the gimple sequence to equivalent RTL insn
> +   sequence, then evaluate the cost.
> +
> +   Return true if it's costly, otherwise return false.  */
> +
> +static bool
> +costly_iter_for_doloop_p (struct loop *loop, tree niters)
> +{

I already said this but I expect IVOPTs to already cost initial
value computation of new IVs.  I also doubt it's worth
going into RTL details here, estimate_num_insns should be
enough, like via using expression_expensive_p from SCEV analysis
which is already used by IVOPTs.

You try to be perfect here but end up adding more
magic numbers (PARAM_MAX_ITERATIONS_COMPUTATION_COST).

> +  tree type = TREE_TYPE (niters);
> +  unsigned cost = 0;
> +  bool speed = optimize_loop_for_speed_p (loop);
> +  int regno = LAST_VIRTUAL_REGISTER + 1;
> +  walk_tree (&niters, prepare_decl_rtl, &regno, NULL);
> +  start_sequence ();
> +  expand_expr (niters, NULL_RTX, TYPE_MODE (type), EXPAND_NORMAL);
> +  rtx_insn *seq = get_insns ();
> +  end_sequence ();
> +
> +  for (; seq; seq = NEXT_INSN (seq))
> +    {
> +      if (!INSN_P (seq))
> +	continue;
> +      rtx body = PATTERN (seq);
> +      if (GET_CODE (body) == SET)
> +	{
> +	  rtx set_val = XEXP (body, 1);
> +	  enum rtx_code code = GET_CODE (set_val);
> +	  enum rtx_class cls = GET_RTX_CLASS (code);
> +	  /* For now, we only consider these two RTX classes, to match what we
> +	 get in doloop_optimize, excluding operations like zero/sign extend.  */
> +	  if (cls == RTX_BIN_ARITH || cls == RTX_COMM_ARITH)
> +	    cost += set_src_cost (set_val, GET_MODE (set_val), speed);
> +	}
> +    }
> +  unsigned max_cost
> +    = COSTS_N_INSNS (PARAM_VALUE (PARAM_MAX_ITERATIONS_COMPUTATION_COST));
> +  if (cost > max_cost)
> +    return true;
> +
> +  return false;
> +}
> +
> +/* Predict whether the given loop will be transformed in the RTL
> +   doloop_optimize pass.  Attempt to duplicate as many doloop_optimize checks
> +   as possible.  This is only for target independent checks, see
> +   targetm.predict_doloop_p for the target dependent ones.
> +
> +   Some RTL specific checks seems unable to be checked in gimple, if any new
> +   checks or easy checks _are_ missing here, please add them.  */
> +
> +static bool
> +generic_predict_doloop_p (struct ivopts_data *data)
> +{
> +  struct loop *loop = data->current_loop;
> +
> +  /* Call target hook for target dependent checks.  */
> +  if (!targetm.predict_doloop_p (loop))
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +	fprintf (dump_file, "predict doloop failure due to"
> +			    "target specific checks.\n");
> +      return false;
> +    }
> +
> +  /* Similar to doloop_optimize, check iteration description to know it's
> +     suitable or not.  */
> +  edge exit = loop_latch_edge (loop);
> +  struct tree_niter_desc *niter_desc = niter_for_exit (data, exit);
> +  if (niter_desc == NULL)
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +	fprintf (dump_file, "predict doloop failure due to"
> +			    "unexpected niters.\n");
> +      return false;
> +    }
> +
> +  /* Similar to doloop_optimize, check whether iteration count too small
> +     and not profitable.  */
> +  HOST_WIDE_INT est_niter = get_estimated_loop_iterations_int (loop);
> +  if (est_niter == -1)
> +    est_niter = get_likely_max_loop_iterations_int (loop);
> +  if (est_niter >= 0 && est_niter < 3)
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +	fprintf (dump_file,
> +		 "predict doloop failure due to"
> +		 "too few iterations (%u).\n",
> +		 (unsigned int) est_niter);
> +      return false;
> +    }
> +
> +  /* Similar to doloop_optimize, check whether number of iterations too costly
> +     to compute.  */
> +  if (costly_iter_for_doloop_p (loop, niter_desc->niter))
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +	fprintf (dump_file, "predict doloop failure due to"
> +			    "costly niter computation.\n");
> +      return false;
> +    }
> +
> +  return true;
> +}
> +
>  /* Determines cost of the computation of EXPR.  */
>  
>  static unsigned
>
Segher Boessenkool May 20, 2019, 10:24 a.m. UTC | #7
Let me try to answer a bit here...

On Mon, May 20, 2019 at 11:28:26AM +0200, Richard Biener wrote:
> On Thu, 16 May 2019, linkw@linux.ibm.com wrote:
> > +/* Predict whether the given loop in gimple will be transformed in the RTL
> > +   doloop_optimize pass.  This is for rs6000 target specific.  */
> > +
> > +static bool
> > +rs6000_predict_doloop_p (struct loop *loop)
> > +{
> > +  gcc_assert (loop);
> > +
> > +  /* On rs6000, targetm.can_use_doloop_p is actually
> > +     can_use_doloop_if_innermost.  Just ensure it's innermost.  */
> 
> So the point is that ppc only has a single counter register, right?

That, and the register is call-clobbered, and some RTL patterns use it
for other purposes.  So we want doloop only for inner loops, and not if
certain constructs are used in the loop (switch statements...)

It may be good enough if we only look at the loop structure?  That would
simplify things a lot ;-)

> So the better way would be to expose that via a target hook somehow.
> Or simply restrict IVOPTs processing to innermost loops for now.

I think we should have two hooks: one is called with the struct loop as
parameter; and the other is called for every statement in the loop, if
the hook isn't null anyway.  Or perhaps we do not need that second one.


Segher
Jeff Law May 20, 2019, 2:43 p.m. UTC | #8
On 5/20/19 4:24 AM, Segher Boessenkool wrote:
> Let me try to answer a bit here...
> 
> On Mon, May 20, 2019 at 11:28:26AM +0200, Richard Biener wrote:
>> On Thu, 16 May 2019, linkw@linux.ibm.com wrote:
>>> +/* Predict whether the given loop in gimple will be transformed in the RTL
>>> +   doloop_optimize pass.  This is for rs6000 target specific.  */
>>> +
>>> +static bool
>>> +rs6000_predict_doloop_p (struct loop *loop)
>>> +{
>>> +  gcc_assert (loop);
>>> +
>>> +  /* On rs6000, targetm.can_use_doloop_p is actually
>>> +     can_use_doloop_if_innermost.  Just ensure it's innermost.  */
>>
>> So the point is that ppc only has a single counter register, right?
> 
> That, and the register is call-clobbered, and some RTL patterns use it
> for other purposes.  So we want doloop only for inner loops, and not if
> certain constructs are used in the loop (switch statements...)
> 
> It may be good enough if we only look at the loop structure?  That would
> simplify things a lot ;-)
That seems reasonable to me.  My recollection is that most other targets
with low overhead loops have a single counter register.  THe most
obvious counter example would be m68k, but that's a don't care IMHO.


> 
>> So the better way would be to expose that via a target hook somehow.
>> Or simply restrict IVOPTs processing to innermost loops for now.
> 
> I think we should have two hooks: one is called with the struct loop as
> parameter; and the other is called for every statement in the loop, if
> the hook isn't null anyway.  Or perhaps we do not need that second one.
I'd wait to see a compelling example from real world code where we need
to scan the statements.  Otherwise we're just dragging in more target
specific decisions which in fact we want to minimize target stuff.

jeff
Segher Boessenkool May 20, 2019, 4:37 p.m. UTC | #9
On Mon, May 20, 2019 at 08:43:59AM -0600, Jeff Law wrote:
> > I think we should have two hooks: one is called with the struct loop as
> > parameter; and the other is called for every statement in the loop, if
> > the hook isn't null anyway.  Or perhaps we do not need that second one.
> I'd wait to see a compelling example from real world code where we need
> to scan the statements.  Otherwise we're just dragging in more target
> specific decisions which in fact we want to minimize target stuff.

The ivopts pass will be too optimistic about what loops will end up as a
doloop, and cost things accordingly.  The cases where we cannot later
actually use a doloop are doing pretty much per iteration, so I think
ivopts will still make good decisions.  We'll need to make the rtl part
not actually do a doloop then, but we probably still need that logic
anyway.

Kewen, Bin, will that work satisfactorily do you think?


Segher
Kewen.Lin May 21, 2019, 3:48 a.m. UTC | #10
on 2019/5/20 下午5:28, Richard Biener wrote:
> On Thu, 16 May 2019, linkw@linux.ibm.com wrote:
> 
>> From: Kewen Lin <linkw@linux.ibm.com>
>>
>> Hi,
>>
>> Previous version link:
>> https://gcc.gnu.org/ml/gcc-patches/2019-05/msg00654.html
>>
>> Comparing with the previous version, I moved the generic
>> parts of rs6000 target hook to IVOPTs.  But I still kept
>> the target hook as previous which checks some target
>> specific criteria like innermost, max iteration counts
>> etc, and checks for invalid stmt in loop.  The reason
>> I decided not to move this part to generic is they are
>> not generic enough.
>>
>> 1) For the part of target specific criteria, if we want
>> to put it in generic, we can call the hook
>> targetm.can_use_doloop_p, which requires us to
>> prepare those four parameters, but each target only needs
>> one or two parameters, it means we will evaluate some
>> things which aren't required for that target.  So I'd like
>> to leave this part to target hook.
>> 2) For the other part of target invalid stmt check, as the
>> hook invalid_within_doloop grep data shows, no all targets
>> need to check whether invalid instructions exist in doloop.
>> If we scan all stmts as generic, it can waste time for those
>> targets which don't need to check.  Besides, the scope of
>> the current check on SWITCH in rs6000 hook is wide, later
>> if we want it more exact, we may need to check more stmts
>> instead of single.  To let target hook scan the BBs/stmts
>> by itself is also more flexible.
>>
>> Bootstrapped and regression testing ongoing on powerpc64le.
>>
>> Any more comments?
>>
>>  
>> -
>> +/* Check whether there are some instructions preventing doloop transformation
>> +   inside loop body, mainly for instructions which are possible to kill CTR.
>> +
>> +   Return true if some invalid insn exits, otherwise return false.  */
>> +
>> +static bool
>> +invalid_insn_for_doloop_p (struct loop *loop)
>> +{
>> +  basic_block *body = get_loop_body (loop);
> 
> You leak memory here.  I still think all of this function should
> be in generic code for now.
> 

Thanks a lot for catching this!  I didn't notice I should free it by myself.

>> +  gimple_stmt_iterator gsi;
>> +
>> +  for (unsigned i = 0; i < loop->num_nodes; i++)
>> +    for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
>> +      {
>> +	gimple *stmt = gsi_stmt (gsi);
>> +	if (is_gimple_call (stmt) && !gimple_call_internal_p (stmt)
>> +	    && !is_inexpensive_builtin (gimple_call_fndecl (stmt)))
>> +	  {
>> +	    if (dump_file && (dump_flags & TDF_DETAILS))
>> +	      fprintf (dump_file,
>> +		       "predict doloop failure due to finding call.\n");
>> +	    return true;
>> +	  }
>> +	if (computed_goto_p (stmt))
>> +	  {
>> +	    if (dump_file && (dump_flags & TDF_DETAILS))
>> +	      fprintf (dump_file, "predict doloop failure due to"
>> +				  "finding computed jump.\n");
>> +	    return true;
>> +	  }
>> +
>> +	/* TODO: Now this hook is expected to be called in ivopts, which is
>> +	   before switchlower1/switchlower2.  Checking for SWITCH at this point
>> +	   will eliminate some good candidates.  But since there are only a few
>> +	   cases having _a_ switch statement in the innermost loop, it can be a
>> +	   low priority enhancement.  */
>> +	if (gimple_code (stmt) == GIMPLE_SWITCH)
>> +	  {
>> +	    if (dump_file && (dump_flags & TDF_DETAILS))
>> +	      fprintf (dump_file,
>> +		       "predict doloop failure due to finding switch.\n");
>> +	    return true;
>> +	  }
>> +      }
>> +
>> +  return false;
>> +}
>> +
>> +/* Predict whether the given loop in gimple will be transformed in the RTL
>> +   doloop_optimize pass.  This is for rs6000 target specific.  */
>> +
>> +static bool
>> +rs6000_predict_doloop_p (struct loop *loop)
>> +{
>> +  gcc_assert (loop);
>> +
>> +  /* On rs6000, targetm.can_use_doloop_p is actually
>> +     can_use_doloop_if_innermost.  Just ensure it's innermost.  */
> 
> So the point is that ppc only has a single counter register, right?
> 
> So the better way would be to expose that via a target hook somehow.
> Or simply restrict IVOPTs processing to innermost loops for now.
> 
It's to answer whether the target supports doloop or not.  I guess
to check innermost is stricter and simpler.  There is another target 
hook invalid_within_doloop to check counter register clobber.

>> +  if (loop->inner != NULL)
>> +    {
>> +      if (dump_file && (dump_flags & TDF_DETAILS))
>> +	fprintf (dump_file, "predict doloop failure due to"
>> +			    "no innermost.\n");
>> +      return false;
>> +    }
>> +
>> +  /* Similar to doloop_optimize targetm.invalid_within_doloop, check for
>> +     CALL, tablejump, computed_jump.  */
>> +  if (invalid_insn_for_doloop_p (loop))
>> +    return false;
>> +
>> +  return true;
>> +}
>> +
>>  struct gcc_target targetm = TARGET_INITIALIZER;
>>  
>>  #include "gt-rs6000.h"
>> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
>> index 8c8978b..e595b8b 100644
>> --- a/gcc/doc/tm.texi
>> +++ b/gcc/doc/tm.texi
>> @@ -11603,6 +11603,14 @@ function version at run-time for a given set of function versions.
>>  body must be generated.
>>  @end deftypefn
>>  
>> +@deftypefn {Target Hook} bool TARGET_PREDICT_DOLOOP_P (struct loop *@var{loop})
>> +Return true if we can predict it is possible to use low-overhead loops
>> +for a particular loop.  The parameter @var{loop} is a pointer to the loop
>> +which is going to be checked.  This target hook is required only when the
>> +target supports low-overhead loops, and will help some earlier middle-end
>> +passes to make some decisions.
>> +@end deftypefn
>> +
>>  @deftypefn {Target Hook} bool TARGET_CAN_USE_DOLOOP_P (const widest_int @var{&iterations}, const widest_int @var{&iterations_max}, unsigned int @var{loop_depth}, bool @var{entered_at_top})
>>  Return true if it is possible to use low-overhead loops (@code{doloop_end}
>>  and @code{doloop_begin}) for a particular loop.  @var{iterations} gives the
>> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
>> index fe1194e..e047734 100644
>> --- a/gcc/doc/tm.texi.in
>> +++ b/gcc/doc/tm.texi.in
>> @@ -7942,6 +7942,8 @@ to by @var{ce_info}.
>>  
>>  @hook TARGET_GENERATE_VERSION_DISPATCHER_BODY
>>  
>> +@hook TARGET_PREDICT_DOLOOP_P
>> +
>>  @hook TARGET_CAN_USE_DOLOOP_P
>>  
>>  @hook TARGET_INVALID_WITHIN_DOLOOP
>> diff --git a/gcc/target.def b/gcc/target.def
>> index 66cee07..0a80de3 100644
>> --- a/gcc/target.def
>> +++ b/gcc/target.def
>> @@ -4227,6 +4227,15 @@ DEFHOOK
>>  rtx, (machine_mode mode, rtx result, rtx val, rtx failval),
>>   default_speculation_safe_value)
>>   
>> +DEFHOOK
>> +(predict_doloop_p,
>> + "Return true if we can predict it is possible to use low-overhead loops\n\
>> +for a particular loop.  The parameter @var{loop} is a pointer to the loop\n\
>> +which is going to be checked.  This target hook is required only when the\n\
>> +target supports low-overhead loops, and will help some earlier middle-end\n\
>> +passes to make some decisions.",
>> + bool, (struct loop *loop),
>> + default_predict_doloop_p)
>>  
>>  DEFHOOK
>>  (can_use_doloop_p,
>> diff --git a/gcc/targhooks.c b/gcc/targhooks.c
>> index 318f7e9..56ed4cc 100644
>> --- a/gcc/targhooks.c
>> +++ b/gcc/targhooks.c
>> @@ -643,6 +643,19 @@ default_has_ifunc_p (void)
>>    return HAVE_GNU_INDIRECT_FUNCTION;
>>  }
>>  
>> +/* True if we can predict this loop is possible to be transformed to a
>> +   low-overhead loop, otherwise returns false.
>> +
>> +   By default, false is returned, as this hook's applicability should be
>> +   verified for each target.  Target maintainers should re-define the hook
>> +   if the target can take advantage of it.  */
>> +
>> +bool
>> +default_predict_doloop_p (struct loop *loop ATTRIBUTE_UNUSED)
>> +{
>> +  return false;
>> +}
>> +
>>  /* NULL if INSN insn is valid within a low-overhead loop, otherwise returns
>>     an error message.
>>  
>> diff --git a/gcc/targhooks.h b/gcc/targhooks.h
>> index 5943627..70bfb17 100644
>> --- a/gcc/targhooks.h
>> +++ b/gcc/targhooks.h
>> @@ -85,6 +85,7 @@ extern bool default_fixed_point_supported_p (void);
>>  
>>  extern bool default_has_ifunc_p (void);
>>  
>> +extern bool default_predict_doloop_p (struct loop *);
>>  extern const char * default_invalid_within_doloop (const rtx_insn *);
>>  
>>  extern tree default_builtin_vectorized_function (unsigned int, tree, tree);
>> diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
>> index a44b4cb..ed7f2a5 100644
>> --- a/gcc/tree-ssa-loop-ivopts.c
>> +++ b/gcc/tree-ssa-loop-ivopts.c
>> @@ -3776,6 +3776,111 @@ prepare_decl_rtl (tree *expr_p, int *ws, void *data)
>>    return NULL_TREE;
>>  }
>>  
>> +/* Check whether number of iteration computation is too costly for doloop
>> +   transformation.  It expands the gimple sequence to equivalent RTL insn
>> +   sequence, then evaluate the cost.
>> +
>> +   Return true if it's costly, otherwise return false.  */
>> +
>> +static bool
>> +costly_iter_for_doloop_p (struct loop *loop, tree niters)
>> +{
> 
> I already said this but I expect IVOPTs to already cost initial
> value computation of new IVs.  I also doubt it's worth
> going into RTL details here, estimate_num_insns should be
> enough, like via using expression_expensive_p from SCEV analysis
> which is already used by IVOPTs.
> 
> You try to be perfect here but end up adding more
> magic numbers (PARAM_MAX_ITERATIONS_COMPUTATION_COST).
> 

Yes, I noticed there is something used for cost computation.  But it can not 
be used for this niter cost directly.  I tried to make similar thing like 
what we have in doloop_optimize.  The magic number is also from 
doloop_optimize.  I'm fine to use those existing ways, I don't have data for
whether it's not important to be exact.


Thanks
Kewen

>> +  tree type = TREE_TYPE (niters);
>> +  unsigned cost = 0;
>> +  bool speed = optimize_loop_for_speed_p (loop);
>> +  int regno = LAST_VIRTUAL_REGISTER + 1;
>> +  walk_tree (&niters, prepare_decl_rtl, &regno, NULL);
>> +  start_sequence ();
>> +  expand_expr (niters, NULL_RTX, TYPE_MODE (type), EXPAND_NORMAL);
>> +  rtx_insn *seq = get_insns ();
>> +  end_sequence ();
>> +
>> +  for (; seq; seq = NEXT_INSN (seq))
>> +    {
>> +      if (!INSN_P (seq))
>> +	continue;
>> +      rtx body = PATTERN (seq);
>> +      if (GET_CODE (body) == SET)
>> +	{
>> +	  rtx set_val = XEXP (body, 1);
>> +	  enum rtx_code code = GET_CODE (set_val);
>> +	  enum rtx_class cls = GET_RTX_CLASS (code);
>> +	  /* For now, we only consider these two RTX classes, to match what we
>> +	 get in doloop_optimize, excluding operations like zero/sign extend.  */
>> +	  if (cls == RTX_BIN_ARITH || cls == RTX_COMM_ARITH)
>> +	    cost += set_src_cost (set_val, GET_MODE (set_val), speed);
>> +	}
>> +    }
>> +  unsigned max_cost
>> +    = COSTS_N_INSNS (PARAM_VALUE (PARAM_MAX_ITERATIONS_COMPUTATION_COST));
>> +  if (cost > max_cost)
>> +    return true;
>> +
>> +  return false;
>> +}
>> +
>> +/* Predict whether the given loop will be transformed in the RTL
>> +   doloop_optimize pass.  Attempt to duplicate as many doloop_optimize checks
>> +   as possible.  This is only for target independent checks, see
>> +   targetm.predict_doloop_p for the target dependent ones.
>> +
>> +   Some RTL specific checks seems unable to be checked in gimple, if any new
>> +   checks or easy checks _are_ missing here, please add them.  */
>> +
>> +static bool
>> +generic_predict_doloop_p (struct ivopts_data *data)
>> +{
>> +  struct loop *loop = data->current_loop;
>> +
>> +  /* Call target hook for target dependent checks.  */
>> +  if (!targetm.predict_doloop_p (loop))
>> +    {
>> +      if (dump_file && (dump_flags & TDF_DETAILS))
>> +	fprintf (dump_file, "predict doloop failure due to"
>> +			    "target specific checks.\n");
>> +      return false;
>> +    }
>> +
>> +  /* Similar to doloop_optimize, check iteration description to know it's
>> +     suitable or not.  */
>> +  edge exit = loop_latch_edge (loop);
>> +  struct tree_niter_desc *niter_desc = niter_for_exit (data, exit);
>> +  if (niter_desc == NULL)
>> +    {
>> +      if (dump_file && (dump_flags & TDF_DETAILS))
>> +	fprintf (dump_file, "predict doloop failure due to"
>> +			    "unexpected niters.\n");
>> +      return false;
>> +    }
>> +
>> +  /* Similar to doloop_optimize, check whether iteration count too small
>> +     and not profitable.  */
>> +  HOST_WIDE_INT est_niter = get_estimated_loop_iterations_int (loop);
>> +  if (est_niter == -1)
>> +    est_niter = get_likely_max_loop_iterations_int (loop);
>> +  if (est_niter >= 0 && est_niter < 3)
>> +    {
>> +      if (dump_file && (dump_flags & TDF_DETAILS))
>> +	fprintf (dump_file,
>> +		 "predict doloop failure due to"
>> +		 "too few iterations (%u).\n",
>> +		 (unsigned int) est_niter);
>> +      return false;
>> +    }
>> +
>> +  /* Similar to doloop_optimize, check whether number of iterations too costly
>> +     to compute.  */
>> +  if (costly_iter_for_doloop_p (loop, niter_desc->niter))
>> +    {
>> +      if (dump_file && (dump_flags & TDF_DETAILS))
>> +	fprintf (dump_file, "predict doloop failure due to"
>> +			    "costly niter computation.\n");
>> +      return false;
>> +    }
>> +
>> +  return true;
>> +}
>> +
>>  /* Determines cost of the computation of EXPR.  */
>>  
>>  static unsigned
>>
>
Kewen.Lin May 21, 2019, 5:28 a.m. UTC | #11
on 2019/5/20 下午5:28, Richard Biener wrote:
> On Thu, 16 May 2019, linkw@linux.ibm.com wrote:
> 
>> From: Kewen Lin <linkw@linux.ibm.com>
>>
>> Hi,
>>
>> Previous version link:
>> https://gcc.gnu.org/ml/gcc-patches/2019-05/msg00654.html
>>
>> Comparing with the previous version, I moved the generic
>> parts of rs6000 target hook to IVOPTs.  But I still kept
>> the target hook as previous which checks some target
>> specific criteria like innermost, max iteration counts
>> etc, and checks for invalid stmt in loop.  The reason
>> I decided not to move this part to generic is they are
>> not generic enough.
>>
>> 1) For the part of target specific criteria, if we want
>> to put it in generic, we can call the hook
>> targetm.can_use_doloop_p, which requires us to
>> prepare those four parameters, but each target only needs
>> one or two parameters, it means we will evaluate some
>> things which aren't required for that target.  So I'd like
>> to leave this part to target hook.
>> 2) For the other part of target invalid stmt check, as the
>> hook invalid_within_doloop grep data shows, no all targets
>> need to check whether invalid instructions exist in doloop.
>> If we scan all stmts as generic, it can waste time for those
>> targets which don't need to check.  Besides, the scope of
>> the current check on SWITCH in rs6000 hook is wide, later
>> if we want it more exact, we may need to check more stmts
>> instead of single.  To let target hook scan the BBs/stmts
>> by itself is also more flexible.
>>
>> Bootstrapped and regression testing ongoing on powerpc64le.
>>
>> Any more comments?
>>
>> -
>> +/* Check whether there are some instructions preventing doloop transformation
>> +   inside loop body, mainly for instructions which are possible to kill CTR.
>> +
>> +   Return true if some invalid insn exits, otherwise return false.  */
>> +
>> +static bool
>> +invalid_insn_for_doloop_p (struct loop *loop)
>> +{
>> +  basic_block *body = get_loop_body (loop);
> 
> You leak memory here.  I still think all of this function should
> be in generic code for now.
> 

Sorry, forgot to reply the part on "this function should be in generic code".
I'll update this, as Segher suggested, make this part as another hook, call 
this hook in generic code.  Any targets which don't require it can disable
it with null re-defintion.


Thanks,
Kewen 

>> +  gimple_stmt_iterator gsi;
>> +
>> +  for (unsigned i = 0; i < loop->num_nodes; i++)
>> +    for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
>> +      {
>> +	gimple *stmt = gsi_stmt (gsi);
>> +	if (is_gimple_call (stmt) && !gimple_call_internal_p (stmt)
>> +	    && !is_inexpensive_builtin (gimple_call_fndecl (stmt)))
>> +	  {
>> +	    if (dump_file && (dump_flags & TDF_DETAILS))
>> +	      fprintf (dump_file,
>> +		       "predict doloop failure due to finding call.\n");
>> +	    return true;
>> +	  }
>> +	if (computed_goto_p (stmt))
>> +	  {
>> +	    if (dump_file && (dump_flags & TDF_DETAILS))
>> +	      fprintf (dump_file, "predict doloop failure due to"
>> +				  "finding computed jump.\n");
>> +	    return true;
>> +	  }
>> +
>> +	/* TODO: Now this hook is expected to be called in ivopts, which is
>> +	   before switchlower1/switchlower2.  Checking for SWITCH at this point
>> +	   will eliminate some good candidates.  But since there are only a few
>> +	   cases having _a_ switch statement in the innermost loop, it can be a
>> +	   low priority enhancement.  */
>> +	if (gimple_code (stmt) == GIMPLE_SWITCH)
>> +	  {
>> +	    if (dump_file && (dump_flags & TDF_DETAILS))
>> +	      fprintf (dump_file,
>> +		       "predict doloop failure due to finding switch.\n");
>> +	    return true;
>> +	  }
>> +      }
>> +
>> +  return false;
>> +}
>> +
>> +/* Predict whether the given loop in gimple will be transformed in the RTL
>> +   doloop_optimize pass.  This is for rs6000 target specific.  */
>> +
>> +static bool
>> +rs6000_predict_doloop_p (struct loop *loop)
>> +{
>> +  gcc_assert (loop);
>> +
>> +  /* On rs6000, targetm.can_use_doloop_p is actually
>> +     can_use_doloop_if_innermost.  Just ensure it's innermost.  */
> 
> So the point is that ppc only has a single counter register, right?
> 
> So the better way would be to expose that via a target hook somehow.
> Or simply restrict IVOPTs processing to innermost loops for now.
> 
>> +  if (loop->inner != NULL)
>> +    {
>> +      if (dump_file && (dump_flags & TDF_DETAILS))
>> +	fprintf (dump_file, "predict doloop failure due to"
>> +			    "no innermost.\n");
>> +      return false;
>> +    }
>> +
>> +  /* Similar to doloop_optimize targetm.invalid_within_doloop, check for
>> +     CALL, tablejump, computed_jump.  */
>> +  if (invalid_insn_for_doloop_p (loop))
>> +    return false;
>> +
>> +  return true;
>> +}
>> +
>>  struct gcc_target targetm = TARGET_INITIALIZER;
>>  
>>  #include "gt-rs6000.h"
>> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
>> index 8c8978b..e595b8b 100644
>> --- a/gcc/doc/tm.texi
>> +++ b/gcc/doc/tm.texi
>> @@ -11603,6 +11603,14 @@ function version at run-time for a given set of function versions.
>>  body must be generated.
>>  @end deftypefn
>>  
>> +@deftypefn {Target Hook} bool TARGET_PREDICT_DOLOOP_P (struct loop *@var{loop})
>> +Return true if we can predict it is possible to use low-overhead loops
>> +for a particular loop.  The parameter @var{loop} is a pointer to the loop
>> +which is going to be checked.  This target hook is required only when the
>> +target supports low-overhead loops, and will help some earlier middle-end
>> +passes to make some decisions.
>> +@end deftypefn
>> +
>>  @deftypefn {Target Hook} bool TARGET_CAN_USE_DOLOOP_P (const widest_int @var{&iterations}, const widest_int @var{&iterations_max}, unsigned int @var{loop_depth}, bool @var{entered_at_top})
>>  Return true if it is possible to use low-overhead loops (@code{doloop_end}
>>  and @code{doloop_begin}) for a particular loop.  @var{iterations} gives the
>> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
>> index fe1194e..e047734 100644
>> --- a/gcc/doc/tm.texi.in
>> +++ b/gcc/doc/tm.texi.in
>> @@ -7942,6 +7942,8 @@ to by @var{ce_info}.
>>  
>>  @hook TARGET_GENERATE_VERSION_DISPATCHER_BODY
>>  
>> +@hook TARGET_PREDICT_DOLOOP_P
>> +
>>  @hook TARGET_CAN_USE_DOLOOP_P
>>  
>>  @hook TARGET_INVALID_WITHIN_DOLOOP
>> diff --git a/gcc/target.def b/gcc/target.def
>> index 66cee07..0a80de3 100644
>> --- a/gcc/target.def
>> +++ b/gcc/target.def
>> @@ -4227,6 +4227,15 @@ DEFHOOK
>>  rtx, (machine_mode mode, rtx result, rtx val, rtx failval),
>>   default_speculation_safe_value)
>>   
>> +DEFHOOK
>> +(predict_doloop_p,
>> + "Return true if we can predict it is possible to use low-overhead loops\n\
>> +for a particular loop.  The parameter @var{loop} is a pointer to the loop\n\
>> +which is going to be checked.  This target hook is required only when the\n\
>> +target supports low-overhead loops, and will help some earlier middle-end\n\
>> +passes to make some decisions.",
>> + bool, (struct loop *loop),
>> + default_predict_doloop_p)
>>  
>>  DEFHOOK
>>  (can_use_doloop_p,
>> diff --git a/gcc/targhooks.c b/gcc/targhooks.c
>> index 318f7e9..56ed4cc 100644
>> --- a/gcc/targhooks.c
>> +++ b/gcc/targhooks.c
>> @@ -643,6 +643,19 @@ default_has_ifunc_p (void)
>>    return HAVE_GNU_INDIRECT_FUNCTION;
>>  }
>>  
>> +/* True if we can predict this loop is possible to be transformed to a
>> +   low-overhead loop, otherwise returns false.
>> +
>> +   By default, false is returned, as this hook's applicability should be
>> +   verified for each target.  Target maintainers should re-define the hook
>> +   if the target can take advantage of it.  */
>> +
>> +bool
>> +default_predict_doloop_p (struct loop *loop ATTRIBUTE_UNUSED)
>> +{
>> +  return false;
>> +}
>> +
>>  /* NULL if INSN insn is valid within a low-overhead loop, otherwise returns
>>     an error message.
>>  
>> diff --git a/gcc/targhooks.h b/gcc/targhooks.h
>> index 5943627..70bfb17 100644
>> --- a/gcc/targhooks.h
>> +++ b/gcc/targhooks.h
>> @@ -85,6 +85,7 @@ extern bool default_fixed_point_supported_p (void);
>>  
>>  extern bool default_has_ifunc_p (void);
>>  
>> +extern bool default_predict_doloop_p (struct loop *);
>>  extern const char * default_invalid_within_doloop (const rtx_insn *);
>>  
>>  extern tree default_builtin_vectorized_function (unsigned int, tree, tree);
>> diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
>> index a44b4cb..ed7f2a5 100644
>> --- a/gcc/tree-ssa-loop-ivopts.c
>> +++ b/gcc/tree-ssa-loop-ivopts.c
>> @@ -3776,6 +3776,111 @@ prepare_decl_rtl (tree *expr_p, int *ws, void *data)
>>    return NULL_TREE;
>>  }
>>  
>> +/* Check whether number of iteration computation is too costly for doloop
>> +   transformation.  It expands the gimple sequence to equivalent RTL insn
>> +   sequence, then evaluate the cost.
>> +
>> +   Return true if it's costly, otherwise return false.  */
>> +
>> +static bool
>> +costly_iter_for_doloop_p (struct loop *loop, tree niters)
>> +{
> 
> I already said this but I expect IVOPTs to already cost initial
> value computation of new IVs.  I also doubt it's worth
> going into RTL details here, estimate_num_insns should be
> enough, like via using expression_expensive_p from SCEV analysis
> which is already used by IVOPTs.
> 
> You try to be perfect here but end up adding more
> magic numbers (PARAM_MAX_ITERATIONS_COMPUTATION_COST).
> 
>> +  tree type = TREE_TYPE (niters);
>> +  unsigned cost = 0;
>> +  bool speed = optimize_loop_for_speed_p (loop);
>> +  int regno = LAST_VIRTUAL_REGISTER + 1;
>> +  walk_tree (&niters, prepare_decl_rtl, &regno, NULL);
>> +  start_sequence ();
>> +  expand_expr (niters, NULL_RTX, TYPE_MODE (type), EXPAND_NORMAL);
>> +  rtx_insn *seq = get_insns ();
>> +  end_sequence ();
>> +
>> +  for (; seq; seq = NEXT_INSN (seq))
>> +    {
>> +      if (!INSN_P (seq))
>> +	continue;
>> +      rtx body = PATTERN (seq);
>> +      if (GET_CODE (body) == SET)
>> +	{
>> +	  rtx set_val = XEXP (body, 1);
>> +	  enum rtx_code code = GET_CODE (set_val);
>> +	  enum rtx_class cls = GET_RTX_CLASS (code);
>> +	  /* For now, we only consider these two RTX classes, to match what we
>> +	 get in doloop_optimize, excluding operations like zero/sign extend.  */
>> +	  if (cls == RTX_BIN_ARITH || cls == RTX_COMM_ARITH)
>> +	    cost += set_src_cost (set_val, GET_MODE (set_val), speed);
>> +	}
>> +    }
>> +  unsigned max_cost
>> +    = COSTS_N_INSNS (PARAM_VALUE (PARAM_MAX_ITERATIONS_COMPUTATION_COST));
>> +  if (cost > max_cost)
>> +    return true;
>> +
>> +  return false;
>> +}
>> +
>> +/* Predict whether the given loop will be transformed in the RTL
>> +   doloop_optimize pass.  Attempt to duplicate as many doloop_optimize checks
>> +   as possible.  This is only for target independent checks, see
>> +   targetm.predict_doloop_p for the target dependent ones.
>> +
>> +   Some RTL specific checks seems unable to be checked in gimple, if any new
>> +   checks or easy checks _are_ missing here, please add them.  */
>> +
>> +static bool
>> +generic_predict_doloop_p (struct ivopts_data *data)
>> +{
>> +  struct loop *loop = data->current_loop;
>> +
>> +  /* Call target hook for target dependent checks.  */
>> +  if (!targetm.predict_doloop_p (loop))
>> +    {
>> +      if (dump_file && (dump_flags & TDF_DETAILS))
>> +	fprintf (dump_file, "predict doloop failure due to"
>> +			    "target specific checks.\n");
>> +      return false;
>> +    }
>> +
>> +  /* Similar to doloop_optimize, check iteration description to know it's
>> +     suitable or not.  */
>> +  edge exit = loop_latch_edge (loop);
>> +  struct tree_niter_desc *niter_desc = niter_for_exit (data, exit);
>> +  if (niter_desc == NULL)
>> +    {
>> +      if (dump_file && (dump_flags & TDF_DETAILS))
>> +	fprintf (dump_file, "predict doloop failure due to"
>> +			    "unexpected niters.\n");
>> +      return false;
>> +    }
>> +
>> +  /* Similar to doloop_optimize, check whether iteration count too small
>> +     and not profitable.  */
>> +  HOST_WIDE_INT est_niter = get_estimated_loop_iterations_int (loop);
>> +  if (est_niter == -1)
>> +    est_niter = get_likely_max_loop_iterations_int (loop);
>> +  if (est_niter >= 0 && est_niter < 3)
>> +    {
>> +      if (dump_file && (dump_flags & TDF_DETAILS))
>> +	fprintf (dump_file,
>> +		 "predict doloop failure due to"
>> +		 "too few iterations (%u).\n",
>> +		 (unsigned int) est_niter);
>> +      return false;
>> +    }
>> +
>> +  /* Similar to doloop_optimize, check whether number of iterations too costly
>> +     to compute.  */
>> +  if (costly_iter_for_doloop_p (loop, niter_desc->niter))
>> +    {
>> +      if (dump_file && (dump_flags & TDF_DETAILS))
>> +	fprintf (dump_file, "predict doloop failure due to"
>> +			    "costly niter computation.\n");
>> +      return false;
>> +    }
>> +
>> +  return true;
>> +}
>> +
>>  /* Determines cost of the computation of EXPR.  */
>>  
>>  static unsigned
>>
>
Kewen.Lin May 21, 2019, 5:50 a.m. UTC | #12
on 2019/5/20 下午10:43, Jeff Law wrote:
> On 5/20/19 4:24 AM, Segher Boessenkool wrote:
>> Let me try to answer a bit here...
>>
>> On Mon, May 20, 2019 at 11:28:26AM +0200, Richard Biener wrote:
>>> On Thu, 16 May 2019, linkw@linux.ibm.com wrote: 
>>
>>> So the better way would be to expose that via a target hook somehow.
>>> Or simply restrict IVOPTs processing to innermost loops for now.
>>
>> I think we should have two hooks: one is called with the struct loop as
>> parameter; and the other is called for every statement in the loop, if
>> the hook isn't null anyway.  Or perhaps we do not need that second one.
> I'd wait to see a compelling example from real world code where we need
> to scan the statements.  Otherwise we're just dragging in more target
> specific decisions which in fact we want to minimize target stuff.

The scan is trying to do similar thing like default_invalid_within_doloop.
It scans for hardware counter register clobbering.  I think it's important
and valuable to scan especially for call since it's common.

  if (CALL_P (insn))
    return "Function call in loop.";

  if (tablejump_p (insn, NULL, NULL) || computed_jump_p (insn))
    return "Computed branch in the loop.";

But it's a question whether to make it as part of generic.  I double checked
that most of the doloop targets use this default behavior, only 5 targets are
using their own TARGET_INVALID_WITHIN_DOLOOP, so it might be a good thing to 
make it common to share.


Thanks, 
Kewen
Kewen.Lin May 21, 2019, 6:03 a.m. UTC | #13
on 2019/5/21 上午12:37, Segher Boessenkool wrote:
> On Mon, May 20, 2019 at 08:43:59AM -0600, Jeff Law wrote:
>>> I think we should have two hooks: one is called with the struct loop as
>>> parameter; and the other is called for every statement in the loop, if
>>> the hook isn't null anyway.  Or perhaps we do not need that second one.
>> I'd wait to see a compelling example from real world code where we need
>> to scan the statements.  Otherwise we're just dragging in more target
>> specific decisions which in fact we want to minimize target stuff.
> 
> The ivopts pass will be too optimistic about what loops will end up as a
> doloop, and cost things accordingly.  The cases where we cannot later
> actually use a doloop are doing pretty much per iteration, so I think
> ivopts will still make good decisions.  We'll need to make the rtl part
> not actually do a doloop then, but we probably still need that logic
> anyway.
> 
> Kewen, Bin, will that work satisfactorily do you think?
> 

If my understanding on this question is correct, IMHO we should try to make
IVOPTs conservative than optimistic, since once the predict is wrong from
too optimistic decision, the costing on the doloop use is wrong, it's very
possible to affect the global optimal set.  It looks we don't have any ways
to recover it in RTL then?  (otherwise, there should be better place to fix
the PR).  Although it's also possible to miss some good cases, it's at least
as good as before, I'm inclined to make it conservative.


Thanks,
Kewen
Bin.Cheng May 21, 2019, 6:32 a.m. UTC | #14
On Tue, May 21, 2019 at 1:50 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
>
> on 2019/5/20 下午10:43, Jeff Law wrote:
> > On 5/20/19 4:24 AM, Segher Boessenkool wrote:
> >> Let me try to answer a bit here...
> >>
> >> On Mon, May 20, 2019 at 11:28:26AM +0200, Richard Biener wrote:
> >>> On Thu, 16 May 2019, linkw@linux.ibm.com wrote:
> >>
> >>> So the better way would be to expose that via a target hook somehow.
> >>> Or simply restrict IVOPTs processing to innermost loops for now.
> >>
> >> I think we should have two hooks: one is called with the struct loop as
> >> parameter; and the other is called for every statement in the loop, if
> >> the hook isn't null anyway.  Or perhaps we do not need that second one.
> > I'd wait to see a compelling example from real world code where we need
> > to scan the statements.  Otherwise we're just dragging in more target
> > specific decisions which in fact we want to minimize target stuff.
>
> The scan is trying to do similar thing like default_invalid_within_doloop.
> It scans for hardware counter register clobbering.  I think it's important
> and valuable to scan especially for call since it's common.
>
>   if (CALL_P (insn))
>     return "Function call in loop.";
>
>   if (tablejump_p (insn, NULL, NULL) || computed_jump_p (insn))
>     return "Computed branch in the loop.";
>
> But it's a question whether to make it as part of generic.  I double checked
> that most of the doloop targets use this default behavior, only 5 targets are
> using their own TARGET_INVALID_WITHIN_DOLOOP, so it might be a good thing to
> make it common to share.
I was not following previous rounds of discussion.  But again, is it
possible to make all code(including scanning every stmt in every bb in
loop, as well as niter checks) generic, export a target hook checking
on gimple_stmt.  In this case, backends could still make their own
decision?

Thanks,
bin
>
>
> Thanks,
> Kewen
>
Richard Biener May 21, 2019, 9:58 a.m. UTC | #15
On Mon, 20 May 2019, Segher Boessenkool wrote:

> On Mon, May 20, 2019 at 08:43:59AM -0600, Jeff Law wrote:
> > > I think we should have two hooks: one is called with the struct loop as
> > > parameter; and the other is called for every statement in the loop, if
> > > the hook isn't null anyway.  Or perhaps we do not need that second one.
> > I'd wait to see a compelling example from real world code where we need
> > to scan the statements.  Otherwise we're just dragging in more target
> > specific decisions which in fact we want to minimize target stuff.
> 
> The ivopts pass will be too optimistic about what loops will end up as a
> doloop, and cost things accordingly.  The cases where we cannot later
> actually use a doloop are doing pretty much per iteration, so I think
> ivopts will still make good decisions.  We'll need to make the rtl part
> not actually do a doloop then, but we probably still need that logic
> anyway.

Yes - my thinking was that if IVOPTs _not_ choose a doloop IV then
the RTL side has to give up, so that's bad.  But if it chooses
a doloop IV but we end up failing to do a doloop that's not too
bad.  After all we indeed cannot remove RTL doloop analysis
at this point.

So the important thing is to make IVOPTs create a separate
counter IV that is only used in the update/compare/branch
and cost that appropriately (not even sure if _that_ is
actually required!).

A slight complication might be that if IVOPTs decides
to use a doloop IV but creates another equivalent IV
for other uses then later CSE might end up unifying them
again.  We should probably make IVOPTs aware of this.

> Kewen, Bin, will that work satisfactorily do you think?
> 
> 
> Segher
>
Richard Biener May 21, 2019, 10:20 a.m. UTC | #16
On Tue, 21 May 2019, Kewen.Lin wrote:

> on 2019/5/21 上午12:37, Segher Boessenkool wrote:
> > On Mon, May 20, 2019 at 08:43:59AM -0600, Jeff Law wrote:
> >>> I think we should have two hooks: one is called with the struct loop as
> >>> parameter; and the other is called for every statement in the loop, if
> >>> the hook isn't null anyway.  Or perhaps we do not need that second one.
> >> I'd wait to see a compelling example from real world code where we need
> >> to scan the statements.  Otherwise we're just dragging in more target
> >> specific decisions which in fact we want to minimize target stuff.
> > 
> > The ivopts pass will be too optimistic about what loops will end up as a
> > doloop, and cost things accordingly.  The cases where we cannot later
> > actually use a doloop are doing pretty much per iteration, so I think
> > ivopts will still make good decisions.  We'll need to make the rtl part
> > not actually do a doloop then, but we probably still need that logic
> > anyway.
> > 
> > Kewen, Bin, will that work satisfactorily do you think?
> > 
> 
> If my understanding on this question is correct, IMHO we should try to make
> IVOPTs conservative than optimistic, since once the predict is wrong from
> too optimistic decision, the costing on the doloop use is wrong, it's very
> possible to affect the global optimal set.  It looks we don't have any ways
> to recover it in RTL then?  (otherwise, there should be better place to fix
> the PR).  Although it's also possible to miss some good cases, it's at least
> as good as before, I'm inclined to make it conservative.

I wonder if you could simply benchmark what happens if you make
IVOPTs _always_ create a doloop IV (if possible)?  I doubt the
cases where a doloop IV is bad (calls, etc.) are too common and
that in those cases the extra simple IV hurts.

Richard.
Segher Boessenkool May 21, 2019, 10:36 a.m. UTC | #17
On Tue, May 21, 2019 at 01:50:31PM +0800, Kewen.Lin wrote:
> on 2019/5/20 下午10:43, Jeff Law wrote:
> > On 5/20/19 4:24 AM, Segher Boessenkool wrote:
> >> Let me try to answer a bit here...
> >>
> >> On Mon, May 20, 2019 at 11:28:26AM +0200, Richard Biener wrote:
> >>> On Thu, 16 May 2019, linkw@linux.ibm.com wrote: 
> >>
> >>> So the better way would be to expose that via a target hook somehow.
> >>> Or simply restrict IVOPTs processing to innermost loops for now.
> >>
> >> I think we should have two hooks: one is called with the struct loop as
> >> parameter; and the other is called for every statement in the loop, if
> >> the hook isn't null anyway.  Or perhaps we do not need that second one.
> > I'd wait to see a compelling example from real world code where we need
> > to scan the statements.  Otherwise we're just dragging in more target
> > specific decisions which in fact we want to minimize target stuff.
> 
> The scan is trying to do similar thing like default_invalid_within_doloop.
> It scans for hardware counter register clobbering.  I think it's important
> and valuable to scan especially for call since it's common.

Ah, right, without this check we would say many more loops can be doloop
than can in fact be.

>   if (CALL_P (insn))
>     return "Function call in loop.";
> 
>   if (tablejump_p (insn, NULL, NULL) || computed_jump_p (insn))
>     return "Computed branch in the loop.";
> 
> But it's a question whether to make it as part of generic.  I double checked
> that most of the doloop targets use this default behavior, only 5 targets are
> using their own TARGET_INVALID_WITHIN_DOLOOP, so it might be a good thing to 
> make it common to share.

Yeah, and have the default hook for gimple be similar to the rtl one.


Segher
Segher Boessenkool May 21, 2019, 11:09 a.m. UTC | #18
On Tue, May 21, 2019 at 02:03:04PM +0800, Kewen.Lin wrote:
> on 2019/5/21 上午12:37, Segher Boessenkool wrote:
> > On Mon, May 20, 2019 at 08:43:59AM -0600, Jeff Law wrote:
> >>> I think we should have two hooks: one is called with the struct loop as
> >>> parameter; and the other is called for every statement in the loop, if
> >>> the hook isn't null anyway.  Or perhaps we do not need that second one.
> >> I'd wait to see a compelling example from real world code where we need
> >> to scan the statements.  Otherwise we're just dragging in more target
> >> specific decisions which in fact we want to minimize target stuff.
> > 
> > The ivopts pass will be too optimistic about what loops will end up as a
> > doloop, and cost things accordingly.  The cases where we cannot later
> > actually use a doloop are doing pretty much per iteration, so I think
> > ivopts will still make good decisions.  We'll need to make the rtl part
> > not actually do a doloop then, but we probably still need that logic
> > anyway.
> > 
> > Kewen, Bin, will that work satisfactorily do you think?
> 
> If my understanding on this question is correct, IMHO we should try to make
> IVOPTs conservative than optimistic, since once the predict is wrong from
> too optimistic decision, the costing on the doloop use is wrong, it's very
> possible to affect the global optimal set.

Ah, it does change what is chosen, and it happens a lot as well...  So
never mind, this is one simplification too far :-)

> It looks we don't have any ways to recover it in RTL then?

We don't.  We'd have to redo everything that happened in between...

> (otherwise, there should be better place to fix
> the PR).  Although it's also possible to miss some good cases, it's at least
> as good as before, I'm inclined to make it conservative.


Segher
Segher Boessenkool May 21, 2019, 12:42 p.m. UTC | #19
On Tue, May 21, 2019 at 12:20:50PM +0200, Richard Biener wrote:
> On Tue, 21 May 2019, Kewen.Lin wrote:
> 
> > on 2019/5/21 上午12:37, Segher Boessenkool wrote:
> > > On Mon, May 20, 2019 at 08:43:59AM -0600, Jeff Law wrote:
> > >>> I think we should have two hooks: one is called with the struct loop as
> > >>> parameter; and the other is called for every statement in the loop, if
> > >>> the hook isn't null anyway.  Or perhaps we do not need that second one.
> > >> I'd wait to see a compelling example from real world code where we need
> > >> to scan the statements.  Otherwise we're just dragging in more target
> > >> specific decisions which in fact we want to minimize target stuff.
> > > 
> > > The ivopts pass will be too optimistic about what loops will end up as a
> > > doloop, and cost things accordingly.  The cases where we cannot later
> > > actually use a doloop are doing pretty much per iteration, so I think
> > > ivopts will still make good decisions.  We'll need to make the rtl part
> > > not actually do a doloop then, but we probably still need that logic
> > > anyway.
> > > 
> > > Kewen, Bin, will that work satisfactorily do you think?
> > 
> > If my understanding on this question is correct, IMHO we should try to make
> > IVOPTs conservative than optimistic, since once the predict is wrong from
> > too optimistic decision, the costing on the doloop use is wrong, it's very
> > possible to affect the global optimal set.  It looks we don't have any ways
> > to recover it in RTL then?  (otherwise, there should be better place to fix
> > the PR).  Although it's also possible to miss some good cases, it's at least
> > as good as before, I'm inclined to make it conservative.
> 
> I wonder if you could simply benchmark what happens if you make
> IVOPTs _always_ create a doloop IV (if possible)?

That would help yes.

> I doubt the
> cases where a doloop IV is bad (calls, etc.) are too common and

They are quite common I think :-(  Like, more than the number of valid
doloops for us.  But let's see numbers :-)

> that in those cases the extra simple IV hurts.

That, I don't know.  We do not usually need an IV that counts down, but
will it be worse performance?


Segher
Jeff Law May 21, 2019, 3:22 p.m. UTC | #20
On 5/21/19 4:20 AM, Richard Biener wrote:
> On Tue, 21 May 2019, Kewen.Lin wrote:
> 
>> on 2019/5/21 上午12:37, Segher Boessenkool wrote:
>>> On Mon, May 20, 2019 at 08:43:59AM -0600, Jeff Law wrote:
>>>>> I think we should have two hooks: one is called with the struct loop as
>>>>> parameter; and the other is called for every statement in the loop, if
>>>>> the hook isn't null anyway.  Or perhaps we do not need that second one.
>>>> I'd wait to see a compelling example from real world code where we need
>>>> to scan the statements.  Otherwise we're just dragging in more target
>>>> specific decisions which in fact we want to minimize target stuff.
>>>
>>> The ivopts pass will be too optimistic about what loops will end up as a
>>> doloop, and cost things accordingly.  The cases where we cannot later
>>> actually use a doloop are doing pretty much per iteration, so I think
>>> ivopts will still make good decisions.  We'll need to make the rtl part
>>> not actually do a doloop then, but we probably still need that logic
>>> anyway.
>>>
>>> Kewen, Bin, will that work satisfactorily do you think?
>>>
>>
>> If my understanding on this question is correct, IMHO we should try to make
>> IVOPTs conservative than optimistic, since once the predict is wrong from
>> too optimistic decision, the costing on the doloop use is wrong, it's very
>> possible to affect the global optimal set.  It looks we don't have any ways
>> to recover it in RTL then?  (otherwise, there should be better place to fix
>> the PR).  Although it's also possible to miss some good cases, it's at least
>> as good as before, I'm inclined to make it conservative.
> 
> I wonder if you could simply benchmark what happens if you make
> IVOPTs _always_ create a doloop IV (if possible)?  I doubt the
> cases where a doloop IV is bad (calls, etc.) are too common and
> that in those cases the extra simple IV hurts.
This had been in the back of my mind as well.  I wonder how the RTL IV
bits would respond to that.

Jeff
Kewen.Lin May 22, 2019, 2:07 a.m. UTC | #21
on 2019/5/21 下午6:20, Richard Biener wrote:
> On Tue, 21 May 2019, Kewen.Lin wrote:
> 
>> on 2019/5/21 上午12:37, Segher Boessenkool wrote:
>>> On Mon, May 20, 2019 at 08:43:59AM -0600, Jeff Law wrote:
>>>>> I think we should have two hooks: one is called with the struct loop as
>>>>> parameter; and the other is called for every statement in the loop, if
>>>>> the hook isn't null anyway.  Or perhaps we do not need that second one.
>>>> I'd wait to see a compelling example from real world code where we need
>>>> to scan the statements.  Otherwise we're just dragging in more target
>>>> specific decisions which in fact we want to minimize target stuff.
>>>
>>> The ivopts pass will be too optimistic about what loops will end up as a
>>> doloop, and cost things accordingly.  The cases where we cannot later
>>> actually use a doloop are doing pretty much per iteration, so I think
>>> ivopts will still make good decisions.  We'll need to make the rtl part
>>> not actually do a doloop then, but we probably still need that logic
>>> anyway.
>>>
>>> Kewen, Bin, will that work satisfactorily do you think?
>>>
>>
>> If my understanding on this question is correct, IMHO we should try to make
>> IVOPTs conservative than optimistic, since once the predict is wrong from
>> too optimistic decision, the costing on the doloop use is wrong, it's very
>> possible to affect the global optimal set.  It looks we don't have any ways
>> to recover it in RTL then?  (otherwise, there should be better place to fix
>> the PR).  Although it's also possible to miss some good cases, it's at least
>> as good as before, I'm inclined to make it conservative.
> 
> I wonder if you could simply benchmark what happens if you make
> IVOPTs _always_ create a doloop IV (if possible)?  I doubt the
> cases where a doloop IV is bad (calls, etc.) are too common and
> that in those cases the extra simple IV hurts.
> 

OK.  I'll do the changes and measure with SPEC2017.  Maybe also 
extend it to two other checks niter cost check and estimated
niter range check.  It may take some days.  You might have to
expect late response.  :)


Thanks
Kewen


> Richard.
>
Kewen.Lin June 11, 2019, 2:38 a.m. UTC | #22
>> If my understanding on this question is correct, IMHO we should try to make
>> IVOPTs conservative than optimistic, since once the predict is wrong from
>> too optimistic decision, the costing on the doloop use is wrong, it's very
>> possible to affect the global optimal set.  It looks we don't have any ways
>> to recover it in RTL then?  (otherwise, there should be better place to fix
>> the PR).  Although it's also possible to miss some good cases, it's at least
>> as good as before, I'm inclined to make it conservative.
> 
> I wonder if you could simply benchmark what happens if you make
> IVOPTs _always_ create a doloop IV (if possible)?  I doubt the
> cases where a doloop IV is bad (calls, etc.) are too common and
> that in those cases the extra simple IV hurts.
> 

Hi Richard and all,

With these different settings:
	Base) without any changes
	A) having predict_doloop and enable all checks
	B) A + disable check on "too few iterations" (0 <= est_niter < 3)
	C) A + disable costly niter check
	D) A + disable invalid stmt check (call/computed_goto/switch)

I collected some runtime performance data with SPEC2017 as following:

			Avs.Base   Bvs.A  Cvs.A  Dvs.A
500.perlbench_r		0.00%	0.38%	0.00%	-0.19%
502.gcc_r		0.00%	0.38%	0.00%	0.00%
505.mcf_r		0.89%	0.00%	0.00%	0.00%
520.omnetpp_r		-0.41%	-1.25%	0.00%	0.00%
523.xalancbmk_r		-0.36%	-0.36%	-0.36%	-0.73%
525.x264_r		1.14%	0.00%	0.00%	0.00%
531.deepsjeng_r		-0.26%	0.26%	0.00%	0.00%
541.leela_r		0.00%	0.37%	0.00%	0.00%
548.exchange2_r		0.85%	-0.21%	0.00%	0.00%
557.xz_r		-0.77%	0.52%	-0.26%	0.00%
503.bwaves_r		0.00%	0.00%	0.36%	0.00%
507.cactuBSSN_r		-0.57%	0.00%	0.00%	0.00%
508.namd_r		-0.69%	0.35%	0.00%	0.35%
510.parest_r		0.17%	-0.17%	0.00%	-0.17%
511.povray_r		-1.31%	-0.44%	0.15%	0.15%
519.lbm_r		0.00%	0.00%	0.00%	0.00%
521.wrf_r		0.33%	-0.44%	-0.33%	-0.33%
526.blender_r		0.26%	0.26%	0.00%	0.00%
527.cam4_r		0.59%	-0.59%	0.00%	-0.39%
538.imagick_r		0.45%	0.00%	0.00%	0.00%
544.nab_r		0.23%	0.00%	0.00%	0.00%
549.fotonik3d_r		1.80%	-0.29%	0.00%	0.00%
554.roms_r		0.00%	0.00%	0.00%	0.00%
geomean			0.10%	-0.05%	-0.02%	-0.06%

As above, the difference is very small, looks like caused by noise and can 
be ignored. 

I also ran partial of test suite with some explicit statistics dumping 
(on gcc/g++/gfortran etc.). No regressions found. The unique files number 
with predicted doloop found are:

  A) 3297
  B) 3416
  C) 3297
  D) 3858

Some observations:
  * Based on A) and C), we can see the checking on costly niter is useless, 
    I plan to give the check up or replace it with one existing interface 
    expression_expensive_p as Richard mentioned.  (Correct me if you have 
    any concerns.)
  * B) does filter some cases, I checked a few different cases, they are 
    written with small iteration count indeed.
  * The delta number isn't small between A) and D).

I ran some filtering by compiling C/C++ files at -O2 with A) and D) (-O2 is 
probably not the actual option used in each testing, may not cause the 
difference, but just for simplicity), obtained dumping assembly file and did
further comparison, then I got 60 files finally.

I looked into all 60 cases (I assumed these are typical enough, don't need
to go through the Fortran etc.).  Most of wi/wo differences are expected, 
55 of them use more insns to update biv, but 5 of them are trivial since they
have to use original biv later so it's kept wi/wo the scanning.  Some of 55 
takes one more register in prologue/epilogue, some of them have fewer setup 
insns (which are to calculate the original value and bound for selected iv).

Some typical case looks like:

 Replacing exit test: if (ivtmp_2 != 0)                                        \  Replacing exit test: if (ivtmp_2 != 0)
  xxx ()                                                                       \  xxx ()
  {                                                                            \  {
    unsigned long ivtmp.9;                                                     \    unsigned long ivtmp.9;
    int iter;                                                                  \    int iter;
    int _1;                                                                    \    int _1;
  -----------------------------------------------------------------------------\    unsigned int ivtmp_2;
  -----------------------------------------------------------------------------\    unsigned int ivtmp_3;
    struct bla[100] * _13;                                                     \    struct bla[100] * _13;
    void * _14;                                                                \    void * _14;
    unsigned long _15;                                                         \  -----------------------------------------------------------------------------
    unsigned long _16;                                                         \  -----------------------------------------------------------------------------
                                                                               \
    <bb 2> [local count: 10737418]:                                            \    <bb 2> [local count: 10737418]:
    _13 = &arr_base + 188;                                                     \    _13 = &arr_base + 188;
    ivtmp.9_8 = (unsigned long) _13;                                           \    ivtmp.9_8 = (unsigned long) _13;
    _15 = (unsigned long) &arr_base;                                           \  -----------------------------------------------------------------------------
    _16 = _15 + 44988;                                                         \  -----------------------------------------------------------------------------
                                                                               \
    <bb 3> [local count: 1063004407]:                                          \    <bb 3> [local count: 1063004407]:
  -----------------------------------------------------------------------------\    # ivtmp_3 = PHI <100(2), ivtmp_2(5)>
    # ivtmp.9_10 = PHI <ivtmp.9_8(2), ivtmp.9_9(5)>                            \    # ivtmp.9_10 = PHI <ivtmp.9_8(2), ivtmp.9_9(5)>
    _1 = foo ();                                                               \    _1 = foo ();
    _14 = (void *) ivtmp.9_10;                                                 \    _14 = (void *) ivtmp.9_10;
    MEM[base: _14, offset: 0B] = _1;                                           \    MEM[base: _14, offset: 0B] = _1;
  -----------------------------------------------------------------------------\    ivtmp_2 = ivtmp_3 - 1;
    ivtmp.9_9 = ivtmp.9_10 + 448;                                              \    ivtmp.9_9 = ivtmp.9_10 + 448;
    if (ivtmp.9_9 != _16)                                                      \    if (ivtmp_2 != 0)
      goto <bb 5>; [98.99%]                                                    \      goto <bb 5>; [98.99%]
    else                                                                       \    else
      goto <bb 4>; [1.01%]                                                     \      goto <bb 4>; [1.01%]


  >-------addis 31,2,.LC0@toc@ha                                               \  >-------li 31,100
  >-------ld 31,.LC0@toc@l(31)                                                 \  >-------addis 30,2,.LC0@toc@ha
  >-------addis 30,31,0x1                                                      \  >-------ld 30,.LC0@toc@l(30)
  >-------std 0,16(1)                                                          \  >-------std 0,16(1)
  >-------stdu 1,-48(1)                                                        \  >-------stdu 1,-48(1)
  >-------.cfi_def_cfa_offset 48                                               \  >-------.cfi_def_cfa_offset 48
  >-------.cfi_offset 65, 16                                                   \  -----------------------------------------------------------------------------
  >-------addi 30,30,-20736                                                    \  >-------.cfi_offset 65, 16
  >-------.p2align 5                                                           \  >-------.p2align 5
  .L2:                                                                         \  .L2:
  >-------bl foo                                                               \  >-------bl foo
  >-------nop                                                                  \  >-------nop
  >-------addi 31,31,448                                                       \  >-------addi 9,31,-1
  >-------stw 3,-448(31)                                                       \  >-------addi 30,30,448
  >-------cmpld 0,31,30                                                        \  >-------rldicl. 31,9,0,32
  -----------------------------------------------------------------------------\  >-------stw 3,-448(30)
  >-------bne 0,.L2                                                            \  >-------bne 0,.L2
  >-------addi 1,1,48                                                          \  >-------addi 1,1,48
  >-------.cfi_def_cfa_offset 0                                                \  >-------.cfi_def_cfa_offset 0
  >-------ld 0,16(1)                                                           \  >-------ld 0,16(1)
  >-------ld 30,-16(1)                                                         \  >-------ld 30,-16(1)
  >-------ld 31,-8(1)                                                          \  >-------ld 31,-8(1)


As shown above, P8 SPEC2017 performance evaluation shows to disable 
invalid_stmt scanning doesn't have any impacts.  It looks it's fine
not to force it.  The test suite small cases show it's possible to
have sub optimal instruction sequence with eliminated BIV and its update. 

I guess the concern on the scanning is compilation time cost, is it
worth to doing according to current data?

What do you think of this? 

Thanks a lot in advance!


Kewen
Richard Biener June 11, 2019, 12:17 p.m. UTC | #23
On Tue, 11 Jun 2019, Kewen.Lin wrote:

> >> If my understanding on this question is correct, IMHO we should try to make
> >> IVOPTs conservative than optimistic, since once the predict is wrong from
> >> too optimistic decision, the costing on the doloop use is wrong, it's very
> >> possible to affect the global optimal set.  It looks we don't have any ways
> >> to recover it in RTL then?  (otherwise, there should be better place to fix
> >> the PR).  Although it's also possible to miss some good cases, it's at least
> >> as good as before, I'm inclined to make it conservative.
> > 
> > I wonder if you could simply benchmark what happens if you make
> > IVOPTs _always_ create a doloop IV (if possible)?  I doubt the
> > cases where a doloop IV is bad (calls, etc.) are too common and
> > that in those cases the extra simple IV hurts.
> > 
> 
> Hi Richard and all,
> 
> With these different settings:
> 	Base) without any changes
> 	A) having predict_doloop and enable all checks
> 	B) A + disable check on "too few iterations" (0 <= est_niter < 3)
> 	C) A + disable costly niter check
> 	D) A + disable invalid stmt check (call/computed_goto/switch)
> 
> I collected some runtime performance data with SPEC2017 as following:
> 
> 			Avs.Base   Bvs.A  Cvs.A  Dvs.A
> 500.perlbench_r		0.00%	0.38%	0.00%	-0.19%
> 502.gcc_r		0.00%	0.38%	0.00%	0.00%
> 505.mcf_r		0.89%	0.00%	0.00%	0.00%
> 520.omnetpp_r		-0.41%	-1.25%	0.00%	0.00%
> 523.xalancbmk_r		-0.36%	-0.36%	-0.36%	-0.73%
> 525.x264_r		1.14%	0.00%	0.00%	0.00%
> 531.deepsjeng_r		-0.26%	0.26%	0.00%	0.00%
> 541.leela_r		0.00%	0.37%	0.00%	0.00%
> 548.exchange2_r		0.85%	-0.21%	0.00%	0.00%
> 557.xz_r		-0.77%	0.52%	-0.26%	0.00%
> 503.bwaves_r		0.00%	0.00%	0.36%	0.00%
> 507.cactuBSSN_r		-0.57%	0.00%	0.00%	0.00%
> 508.namd_r		-0.69%	0.35%	0.00%	0.35%
> 510.parest_r		0.17%	-0.17%	0.00%	-0.17%
> 511.povray_r		-1.31%	-0.44%	0.15%	0.15%
> 519.lbm_r		0.00%	0.00%	0.00%	0.00%
> 521.wrf_r		0.33%	-0.44%	-0.33%	-0.33%
> 526.blender_r		0.26%	0.26%	0.00%	0.00%
> 527.cam4_r		0.59%	-0.59%	0.00%	-0.39%
> 538.imagick_r		0.45%	0.00%	0.00%	0.00%
> 544.nab_r		0.23%	0.00%	0.00%	0.00%
> 549.fotonik3d_r		1.80%	-0.29%	0.00%	0.00%
> 554.roms_r		0.00%	0.00%	0.00%	0.00%
> geomean			0.10%	-0.05%	-0.02%	-0.06%
> 
> As above, the difference is very small, looks like caused by noise and can 
> be ignored. 
> 
> I also ran partial of test suite with some explicit statistics dumping 
> (on gcc/g++/gfortran etc.). No regressions found. The unique files number 
> with predicted doloop found are:
> 
>   A) 3297
>   B) 3416
>   C) 3297
>   D) 3858
> 
> Some observations:
>   * Based on A) and C), we can see the checking on costly niter is useless, 
>     I plan to give the check up or replace it with one existing interface 
>     expression_expensive_p as Richard mentioned.  (Correct me if you have 
>     any concerns.)
>   * B) does filter some cases, I checked a few different cases, they are 
>     written with small iteration count indeed.
>   * The delta number isn't small between A) and D).
> 
> I ran some filtering by compiling C/C++ files at -O2 with A) and D) (-O2 is 
> probably not the actual option used in each testing, may not cause the 
> difference, but just for simplicity), obtained dumping assembly file and did
> further comparison, then I got 60 files finally.
> 
> I looked into all 60 cases (I assumed these are typical enough, don't need
> to go through the Fortran etc.).  Most of wi/wo differences are expected, 
> 55 of them use more insns to update biv, but 5 of them are trivial since they
> have to use original biv later so it's kept wi/wo the scanning.  Some of 55 
> takes one more register in prologue/epilogue, some of them have fewer setup 
> insns (which are to calculate the original value and bound for selected iv).
> 
> Some typical case looks like:
> 
>  Replacing exit test: if (ivtmp_2 != 0)                                        \  Replacing exit test: if (ivtmp_2 != 0)
>   xxx ()                                                                       \  xxx ()
>   {                                                                            \  {
>     unsigned long ivtmp.9;                                                     \    unsigned long ivtmp.9;
>     int iter;                                                                  \    int iter;
>     int _1;                                                                    \    int _1;
>   -----------------------------------------------------------------------------\    unsigned int ivtmp_2;
>   -----------------------------------------------------------------------------\    unsigned int ivtmp_3;
>     struct bla[100] * _13;                                                     \    struct bla[100] * _13;
>     void * _14;                                                                \    void * _14;
>     unsigned long _15;                                                         \  -----------------------------------------------------------------------------
>     unsigned long _16;                                                         \  -----------------------------------------------------------------------------
>                                                                                \
>     <bb 2> [local count: 10737418]:                                            \    <bb 2> [local count: 10737418]:
>     _13 = &arr_base + 188;                                                     \    _13 = &arr_base + 188;
>     ivtmp.9_8 = (unsigned long) _13;                                           \    ivtmp.9_8 = (unsigned long) _13;
>     _15 = (unsigned long) &arr_base;                                           \  -----------------------------------------------------------------------------
>     _16 = _15 + 44988;                                                         \  -----------------------------------------------------------------------------
>                                                                                \
>     <bb 3> [local count: 1063004407]:                                          \    <bb 3> [local count: 1063004407]:
>   -----------------------------------------------------------------------------\    # ivtmp_3 = PHI <100(2), ivtmp_2(5)>
>     # ivtmp.9_10 = PHI <ivtmp.9_8(2), ivtmp.9_9(5)>                            \    # ivtmp.9_10 = PHI <ivtmp.9_8(2), ivtmp.9_9(5)>
>     _1 = foo ();                                                               \    _1 = foo ();
>     _14 = (void *) ivtmp.9_10;                                                 \    _14 = (void *) ivtmp.9_10;
>     MEM[base: _14, offset: 0B] = _1;                                           \    MEM[base: _14, offset: 0B] = _1;
>   -----------------------------------------------------------------------------\    ivtmp_2 = ivtmp_3 - 1;
>     ivtmp.9_9 = ivtmp.9_10 + 448;                                              \    ivtmp.9_9 = ivtmp.9_10 + 448;
>     if (ivtmp.9_9 != _16)                                                      \    if (ivtmp_2 != 0)
>       goto <bb 5>; [98.99%]                                                    \      goto <bb 5>; [98.99%]
>     else                                                                       \    else
>       goto <bb 4>; [1.01%]                                                     \      goto <bb 4>; [1.01%]
> 
> 
>   >-------addis 31,2,.LC0@toc@ha                                               \  >-------li 31,100
>   >-------ld 31,.LC0@toc@l(31)                                                 \  >-------addis 30,2,.LC0@toc@ha
>   >-------addis 30,31,0x1                                                      \  >-------ld 30,.LC0@toc@l(30)
>   >-------std 0,16(1)                                                          \  >-------std 0,16(1)
>   >-------stdu 1,-48(1)                                                        \  >-------stdu 1,-48(1)
>   >-------.cfi_def_cfa_offset 48                                               \  >-------.cfi_def_cfa_offset 48
>   >-------.cfi_offset 65, 16                                                   \  -----------------------------------------------------------------------------
>   >-------addi 30,30,-20736                                                    \  >-------.cfi_offset 65, 16
>   >-------.p2align 5                                                           \  >-------.p2align 5
>   .L2:                                                                         \  .L2:
>   >-------bl foo                                                               \  >-------bl foo
>   >-------nop                                                                  \  >-------nop
>   >-------addi 31,31,448                                                       \  >-------addi 9,31,-1
>   >-------stw 3,-448(31)                                                       \  >-------addi 30,30,448
>   >-------cmpld 0,31,30                                                        \  >-------rldicl. 31,9,0,32
>   -----------------------------------------------------------------------------\  >-------stw 3,-448(30)
>   >-------bne 0,.L2                                                            \  >-------bne 0,.L2
>   >-------addi 1,1,48                                                          \  >-------addi 1,1,48
>   >-------.cfi_def_cfa_offset 0                                                \  >-------.cfi_def_cfa_offset 0
>   >-------ld 0,16(1)                                                           \  >-------ld 0,16(1)
>   >-------ld 30,-16(1)                                                         \  >-------ld 30,-16(1)
>   >-------ld 31,-8(1)                                                          \  >-------ld 31,-8(1)
> 
> 
> As shown above, P8 SPEC2017 performance evaluation shows to disable 
> invalid_stmt scanning doesn't have any impacts.  It looks it's fine
> not to force it.  The test suite small cases show it's possible to
> have sub optimal instruction sequence with eliminated BIV and its update. 
> 
> I guess the concern on the scanning is compilation time cost, is it
> worth to doing according to current data?
> 
> What do you think of this? 

According to the data I'd say keep the code as simple as possible
at start and add tuning or extra checks only later together with
actual testcases where it matters.

Richard.
diff mbox series

Patch

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index a21f4f7..2fd52d7 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -83,6 +83,7 @@ 
 #include "tree-ssa-propagate.h"
 #include "tree-vrp.h"
 #include "tree-ssanames.h"
+#include "tree-cfg.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -1914,6 +1915,9 @@  static const struct attribute_spec rs6000_attribute_table[] =
 #undef TARGET_CAN_USE_DOLOOP_P
 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
 
+#undef TARGET_PREDICT_DOLOOP_P
+#define TARGET_PREDICT_DOLOOP_P rs6000_predict_doloop_p
+
 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV rs6000_atomic_assign_expand_fenv
 
@@ -39436,7 +39440,80 @@  rs6000_mangle_decl_assembler_name (tree decl, tree id)
   return id;
 }
 
-
+/* Check whether there are some instructions preventing doloop transformation
+   inside loop body, mainly for instructions which are possible to kill CTR.
+
+   Return true if some invalid insn exits, otherwise return false.  */
+
+static bool
+invalid_insn_for_doloop_p (struct loop *loop)
+{
+  basic_block *body = get_loop_body (loop);
+  gimple_stmt_iterator gsi;
+
+  for (unsigned i = 0; i < loop->num_nodes; i++)
+    for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
+      {
+	gimple *stmt = gsi_stmt (gsi);
+	if (is_gimple_call (stmt) && !gimple_call_internal_p (stmt)
+	    && !is_inexpensive_builtin (gimple_call_fndecl (stmt)))
+	  {
+	    if (dump_file && (dump_flags & TDF_DETAILS))
+	      fprintf (dump_file,
+		       "predict doloop failure due to finding call.\n");
+	    return true;
+	  }
+	if (computed_goto_p (stmt))
+	  {
+	    if (dump_file && (dump_flags & TDF_DETAILS))
+	      fprintf (dump_file, "predict doloop failure due to"
+				  "finding computed jump.\n");
+	    return true;
+	  }
+
+	/* TODO: Now this hook is expected to be called in ivopts, which is
+	   before switchlower1/switchlower2.  Checking for SWITCH at this point
+	   will eliminate some good candidates.  But since there are only a few
+	   cases having _a_ switch statement in the innermost loop, it can be a
+	   low priority enhancement.  */
+	if (gimple_code (stmt) == GIMPLE_SWITCH)
+	  {
+	    if (dump_file && (dump_flags & TDF_DETAILS))
+	      fprintf (dump_file,
+		       "predict doloop failure due to finding switch.\n");
+	    return true;
+	  }
+      }
+
+  return false;
+}
+
+/* Predict whether the given loop in gimple will be transformed in the RTL
+   doloop_optimize pass.  This is for rs6000 target specific.  */
+
+static bool
+rs6000_predict_doloop_p (struct loop *loop)
+{
+  gcc_assert (loop);
+
+  /* On rs6000, targetm.can_use_doloop_p is actually
+     can_use_doloop_if_innermost.  Just ensure it's innermost.  */
+  if (loop->inner != NULL)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "predict doloop failure due to"
+			    "no innermost.\n");
+      return false;
+    }
+
+  /* Similar to doloop_optimize targetm.invalid_within_doloop, check for
+     CALL, tablejump, computed_jump.  */
+  if (invalid_insn_for_doloop_p (loop))
+    return false;
+
+  return true;
+}
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-rs6000.h"
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 8c8978b..e595b8b 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -11603,6 +11603,14 @@  function version at run-time for a given set of function versions.
 body must be generated.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_PREDICT_DOLOOP_P (struct loop *@var{loop})
+Return true if we can predict it is possible to use low-overhead loops
+for a particular loop.  The parameter @var{loop} is a pointer to the loop
+which is going to be checked.  This target hook is required only when the
+target supports low-overhead loops, and will help some earlier middle-end
+passes to make some decisions.
+@end deftypefn
+
 @deftypefn {Target Hook} bool TARGET_CAN_USE_DOLOOP_P (const widest_int @var{&iterations}, const widest_int @var{&iterations_max}, unsigned int @var{loop_depth}, bool @var{entered_at_top})
 Return true if it is possible to use low-overhead loops (@code{doloop_end}
 and @code{doloop_begin}) for a particular loop.  @var{iterations} gives the
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index fe1194e..e047734 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -7942,6 +7942,8 @@  to by @var{ce_info}.
 
 @hook TARGET_GENERATE_VERSION_DISPATCHER_BODY
 
+@hook TARGET_PREDICT_DOLOOP_P
+
 @hook TARGET_CAN_USE_DOLOOP_P
 
 @hook TARGET_INVALID_WITHIN_DOLOOP
diff --git a/gcc/target.def b/gcc/target.def
index 66cee07..0a80de3 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -4227,6 +4227,15 @@  DEFHOOK
 rtx, (machine_mode mode, rtx result, rtx val, rtx failval),
  default_speculation_safe_value)
  
+DEFHOOK
+(predict_doloop_p,
+ "Return true if we can predict it is possible to use low-overhead loops\n\
+for a particular loop.  The parameter @var{loop} is a pointer to the loop\n\
+which is going to be checked.  This target hook is required only when the\n\
+target supports low-overhead loops, and will help some earlier middle-end\n\
+passes to make some decisions.",
+ bool, (struct loop *loop),
+ default_predict_doloop_p)
 
 DEFHOOK
 (can_use_doloop_p,
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index 318f7e9..56ed4cc 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -643,6 +643,19 @@  default_has_ifunc_p (void)
   return HAVE_GNU_INDIRECT_FUNCTION;
 }
 
+/* True if we can predict this loop is possible to be transformed to a
+   low-overhead loop, otherwise returns false.
+
+   By default, false is returned, as this hook's applicability should be
+   verified for each target.  Target maintainers should re-define the hook
+   if the target can take advantage of it.  */
+
+bool
+default_predict_doloop_p (struct loop *loop ATTRIBUTE_UNUSED)
+{
+  return false;
+}
+
 /* NULL if INSN insn is valid within a low-overhead loop, otherwise returns
    an error message.
 
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 5943627..70bfb17 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -85,6 +85,7 @@  extern bool default_fixed_point_supported_p (void);
 
 extern bool default_has_ifunc_p (void);
 
+extern bool default_predict_doloop_p (struct loop *);
 extern const char * default_invalid_within_doloop (const rtx_insn *);
 
 extern tree default_builtin_vectorized_function (unsigned int, tree, tree);
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index a44b4cb..ed7f2a5 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -3776,6 +3776,111 @@  prepare_decl_rtl (tree *expr_p, int *ws, void *data)
   return NULL_TREE;
 }
 
+/* Check whether number of iteration computation is too costly for doloop
+   transformation.  It expands the gimple sequence to equivalent RTL insn
+   sequence, then evaluate the cost.
+
+   Return true if it's costly, otherwise return false.  */
+
+static bool
+costly_iter_for_doloop_p (struct loop *loop, tree niters)
+{
+  tree type = TREE_TYPE (niters);
+  unsigned cost = 0;
+  bool speed = optimize_loop_for_speed_p (loop);
+  int regno = LAST_VIRTUAL_REGISTER + 1;
+  walk_tree (&niters, prepare_decl_rtl, &regno, NULL);
+  start_sequence ();
+  expand_expr (niters, NULL_RTX, TYPE_MODE (type), EXPAND_NORMAL);
+  rtx_insn *seq = get_insns ();
+  end_sequence ();
+
+  for (; seq; seq = NEXT_INSN (seq))
+    {
+      if (!INSN_P (seq))
+	continue;
+      rtx body = PATTERN (seq);
+      if (GET_CODE (body) == SET)
+	{
+	  rtx set_val = XEXP (body, 1);
+	  enum rtx_code code = GET_CODE (set_val);
+	  enum rtx_class cls = GET_RTX_CLASS (code);
+	  /* For now, we only consider these two RTX classes, to match what we
+	 get in doloop_optimize, excluding operations like zero/sign extend.  */
+	  if (cls == RTX_BIN_ARITH || cls == RTX_COMM_ARITH)
+	    cost += set_src_cost (set_val, GET_MODE (set_val), speed);
+	}
+    }
+  unsigned max_cost
+    = COSTS_N_INSNS (PARAM_VALUE (PARAM_MAX_ITERATIONS_COMPUTATION_COST));
+  if (cost > max_cost)
+    return true;
+
+  return false;
+}
+
+/* Predict whether the given loop will be transformed in the RTL
+   doloop_optimize pass.  Attempt to duplicate as many doloop_optimize checks
+   as possible.  This is only for target independent checks, see
+   targetm.predict_doloop_p for the target dependent ones.
+
+   Some RTL specific checks seems unable to be checked in gimple, if any new
+   checks or easy checks _are_ missing here, please add them.  */
+
+static bool
+generic_predict_doloop_p (struct ivopts_data *data)
+{
+  struct loop *loop = data->current_loop;
+
+  /* Call target hook for target dependent checks.  */
+  if (!targetm.predict_doloop_p (loop))
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "predict doloop failure due to"
+			    "target specific checks.\n");
+      return false;
+    }
+
+  /* Similar to doloop_optimize, check iteration description to know it's
+     suitable or not.  */
+  edge exit = loop_latch_edge (loop);
+  struct tree_niter_desc *niter_desc = niter_for_exit (data, exit);
+  if (niter_desc == NULL)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "predict doloop failure due to"
+			    "unexpected niters.\n");
+      return false;
+    }
+
+  /* Similar to doloop_optimize, check whether iteration count too small
+     and not profitable.  */
+  HOST_WIDE_INT est_niter = get_estimated_loop_iterations_int (loop);
+  if (est_niter == -1)
+    est_niter = get_likely_max_loop_iterations_int (loop);
+  if (est_niter >= 0 && est_niter < 3)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file,
+		 "predict doloop failure due to"
+		 "too few iterations (%u).\n",
+		 (unsigned int) est_niter);
+      return false;
+    }
+
+  /* Similar to doloop_optimize, check whether number of iterations too costly
+     to compute.  */
+  if (costly_iter_for_doloop_p (loop, niter_desc->niter))
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "predict doloop failure due to"
+			    "costly niter computation.\n");
+      return false;
+    }
+
+  return true;
+}
+
 /* Determines cost of the computation of EXPR.  */
 
 static unsigned