diff mbox series

[V3] Split loop for NE condition.

Message ID 20210604080419.119328-1-guojiufu@linux.ibm.com
State New
Headers show
Series [V3] Split loop for NE condition. | expand

Commit Message

Jiufu Guo June 4, 2021, 8:04 a.m. UTC
Update the patch since v2:
. Check index and bound from gcond before checking if wrap.
. Update test case, and add an executable case.
. Refine code comments.
. Enhance the checking for i++/++i in the loop header.
. Enhance code to handle equal condition on exit

Bootstrap and regtest pass on powerpc64le, and also pass regtest
on bootstrap-O3. Is this ok for trunk?

BR.
Jiufu Guo.


When there is the possibility that wrap may happen on the loop index,
a few optimizations would not happen. For example code:

foo (int *a, int *b, unsigned k, unsigned n)
{
  while (++k != n)
    a[k] = b[k]  + 1;
}

For this code, if "k > n", k would wrap.  if "k < n" at begining,
it could be optimized (e.g. vectorization).

We can split the loop into two loops:

  while (++k > n)
    a[k] = b[k]  + 1;
  while (k++ < n)
    a[k] = b[k]  + 1;

This patch splits this kind of loop to achieve better performance.

gcc/ChangeLog:

2021-06-04  Jiufu Guo  <guojiufu@linux.ibm.com>

	* tree-ssa-loop-split.c (connect_loop_phis): Add new param.
	(get_ne_cond_branch): New function.
	(split_ne_loop): New function.
	(split_loop_on_ne_cond): New function.
	(tree_ssa_split_loops): Use split_loop_on_ne_cond.

gcc/testsuite/ChangeLog:

2021-06-04  Jiufu Guo  <guojiufu@linux.ibm.com>

	* gcc.dg/loop-split1.c: New test.
	* gcc.dg/loop-split2.c: New test.
	* g++.dg/vect/pr98064.cc: Suppress warning.

---
 gcc/testsuite/g++.dg/vect/pr98064.cc |   4 +-
 gcc/testsuite/gcc.dg/loop-split1.c   | 101 +++++++++++
 gcc/testsuite/gcc.dg/loop-split2.c   |  54 ++++++
 gcc/tree-ssa-loop-split.c            | 251 ++++++++++++++++++++++++++-
 4 files changed, 404 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/loop-split1.c
 create mode 100644 gcc/testsuite/gcc.dg/loop-split2.c

Comments

Richard Biener June 8, 2021, 10:13 a.m. UTC | #1
On Fri, 4 Jun 2021, Jiufu Guo wrote:

> Update the patch since v2:
> . Check index and bound from gcond before checking if wrap.
> . Update test case, and add an executable case.
> . Refine code comments.
> . Enhance the checking for i++/++i in the loop header.
> . Enhance code to handle equal condition on exit
> 
> Bootstrap and regtest pass on powerpc64le, and also pass regtest
> on bootstrap-O3. Is this ok for trunk?
> 
> BR.
> Jiufu Guo.
> 
> 
> When there is the possibility that wrap may happen on the loop index,
> a few optimizations would not happen. For example code:
> 
> foo (int *a, int *b, unsigned k, unsigned n)
> {
>   while (++k != n)
>     a[k] = b[k]  + 1;
> }
> 
> For this code, if "k > n", k would wrap.  if "k < n" at begining,
> it could be optimized (e.g. vectorization).
> 
> We can split the loop into two loops:
> 
>   while (++k > n)
>     a[k] = b[k]  + 1;
>   while (k++ < n)
>     a[k] = b[k]  + 1;
> 
> This patch splits this kind of loop to achieve better performance.
> 
> gcc/ChangeLog:
> 
> 2021-06-04  Jiufu Guo  <guojiufu@linux.ibm.com>
> 
> 	* tree-ssa-loop-split.c (connect_loop_phis): Add new param.
> 	(get_ne_cond_branch): New function.
> 	(split_ne_loop): New function.
> 	(split_loop_on_ne_cond): New function.
> 	(tree_ssa_split_loops): Use split_loop_on_ne_cond.
> 
> gcc/testsuite/ChangeLog:
> 
> 2021-06-04  Jiufu Guo  <guojiufu@linux.ibm.com>
> 
> 	* gcc.dg/loop-split1.c: New test.
> 	* gcc.dg/loop-split2.c: New test.
> 	* g++.dg/vect/pr98064.cc: Suppress warning.
> 
> ---
>  gcc/testsuite/g++.dg/vect/pr98064.cc |   4 +-
>  gcc/testsuite/gcc.dg/loop-split1.c   | 101 +++++++++++
>  gcc/testsuite/gcc.dg/loop-split2.c   |  54 ++++++
>  gcc/tree-ssa-loop-split.c            | 251 ++++++++++++++++++++++++++-
>  4 files changed, 404 insertions(+), 6 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/loop-split1.c
>  create mode 100644 gcc/testsuite/gcc.dg/loop-split2.c
> 
> diff --git a/gcc/testsuite/g++.dg/vect/pr98064.cc b/gcc/testsuite/g++.dg/vect/pr98064.cc
> index 74043ce7725..dcb2985d05a 100644
> --- a/gcc/testsuite/g++.dg/vect/pr98064.cc
> +++ b/gcc/testsuite/g++.dg/vect/pr98064.cc
> @@ -1,5 +1,7 @@
>  // { dg-do compile }
> -// { dg-additional-options "-O3" }
> +// { dg-additional-options "-O3 -Wno-stringop-overflow" }
> +/* There is warning message when "short g = var_8; g; g++"
> +   is optimized/analyzed as string operation,e.g. memset.  */
>  
>  const long long &min(const long long &__a, long long &__b) {
>    if (__b < __a)
> diff --git a/gcc/testsuite/gcc.dg/loop-split1.c b/gcc/testsuite/gcc.dg/loop-split1.c
> new file mode 100644
> index 00000000000..dd2d03a7b96
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/loop-split1.c
> @@ -0,0 +1,101 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fsplit-loops -fdump-tree-lsplit-details" } */
> +
> +void
> +foo (int *a, int *b, unsigned l, unsigned n)
> +{
> +  while (++l != n)
> +    a[l] = b[l] + 1;
> +}
> +void
> +foo_1 (int *a, int *b, unsigned n)
> +{
> +  unsigned l = 0;
> +  while (++l != n)
> +    a[l] = b[l] + 1;
> +}
> +
> +void
> +foo1 (int *a, int *b, unsigned l, unsigned n)
> +{
> +  while (l++ != n)
> +    a[l] = b[l] + 1;
> +}
> +
> +/* No wrap.  */
> +void
> +foo1_1 (int *a, int *b, unsigned n)
> +{
> +  unsigned l = 0;
> +  while (l++ != n)
> +    a[l] = b[l] + 1;
> +}
> +
> +unsigned
> +foo2 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  while (++l != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +unsigned
> +foo2_1 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  l = 0;
> +  while (++l != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +unsigned
> +foo3 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  while (l++ != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +/* No wrap.  */
> +unsigned
> +foo3_1 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  l = 0;
> +  while (l++ != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +void
> +bar ();
> +void
> +foo4 (unsigned n, unsigned i)
> +{
> +  do
> +    {
> +      if (i == n)
> +	return;
> +      bar ();
> +      ++i;
> +    }
> +  while (1);
> +}
> +
> +unsigned
> +find_skip_diff (char *p, char *q, unsigned n, unsigned i)
> +{
> +  while (p[i] == q[i] && ++i != n)
> +    p++, q++;
> +
> +  return i;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "Loop split" 8 "lsplit" } } */
> diff --git a/gcc/testsuite/gcc.dg/loop-split2.c b/gcc/testsuite/gcc.dg/loop-split2.c
> new file mode 100644
> index 00000000000..0d3fded3f61
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/loop-split2.c
> @@ -0,0 +1,54 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3" } */
> +
> +extern void abort (void);
> +extern void exit (int);
> +
> +#define NI __attribute__ ((noinline))
> +
> +void NI
> +foo (int *a, int *b, unsigned char l, unsigned char n)
> +{
> +  while (++l != n)
> +    a[l] = b[l] + 1;
> +}
> +
> +unsigned NI
> +bar (int *a, int *b, unsigned char l, unsigned char n)
> +{
> +  while (l++ != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +int a[258];
> +int b[258];
> +
> +int main()
> +{
> +  __builtin_memcpy (b, a, sizeof (a));
> +
> +  if (bar (a, b, 3, 8) != 9)
> +    abort ();
> +
> +  if (bar (a, b, 8, 3) != 4)
> +    abort ();
> +
> +  b[100] += 1;
> +  if (bar (a, b, 90, 110) != 100)
> +    abort ();
> +
> +  if (bar (a, b, 110, 105) != 100)
> +    abort ();
> +
> +  foo (a, b, 99, 99);
> +  a[99] = b[99] + 1;
> +  for (int i = 0; i < 256; i++)
> +    if (a[i] != b[i] + 1)
> +      abort();
> +
> +  exit (0);
> +}
> +
> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
> index 3a09bbc39e5..9c0de66e288 100644
> --- a/gcc/tree-ssa-loop-split.c
> +++ b/gcc/tree-ssa-loop-split.c
> @@ -41,6 +41,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "cfghooks.h"
>  #include "gimple-fold.h"
>  #include "gimplify-me.h"
> +#include "tree-ssa-loop-ivopts.h"
>  
>  /* This file implements two kinds of loop splitting.
>  
> @@ -229,11 +230,14 @@ easy_exit_values (class loop *loop)
>     conditional).  I.e. the second loop can now be entered either
>     via the original entry or via NEW_E, so the entry values of LOOP2
>     phi nodes are either the original ones or those at the exit
> -   of LOOP1.  Insert new phi nodes in LOOP2 pre-header reflecting
> -   this.  The loops need to fulfill easy_exit_values().  */
> +   of LOOP1.  Selecting the previous value instead next value as the
> +   exit value of LOOP1 if USE_PREV is true.  Insert new phi nodes in
> +   LOOP2 pre-header reflecting this.  The loops need to fulfill
> +   easy_exit_values().  */
>  
>  static void
> -connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e)
> +connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e,
> +		   bool use_prev = false)
>  {
>    basic_block rest = loop_preheader_edge (loop2)->src;
>    gcc_assert (new_e->dest == rest);
> @@ -279,7 +283,8 @@ connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e)
>  
>        gphi * newphi = create_phi_node (new_init, rest);
>        add_phi_arg (newphi, init, skip_first, UNKNOWN_LOCATION);
> -      add_phi_arg (newphi, next, new_e, UNKNOWN_LOCATION);
> +      add_phi_arg (newphi, use_prev ? PHI_RESULT (phi_first) : next, new_e,
> +		   UNKNOWN_LOCATION);
>        SET_USE (op, new_init);
>      }
>  }
> @@ -1593,6 +1598,241 @@ split_loop_on_cond (struct loop *loop)
>    return do_split;
>  }
>  
> +/* Check if the LOOP exit branch is like "if (idx != bound)",
> +   Return the branch edge which exit loop, if wrap may happen on "idx".  */
> +
> +static edge
> +get_ne_cond_branch (struct loop *loop)
> +{
> +  int i;
> +  edge e;
> +
> +  auto_vec<edge> edges = get_loop_exit_edges (loop);
> +  FOR_EACH_VEC_ELT (edges, i, e)
> +    {
> +      basic_block bb = e->src;
> +
> +      /* Check if exit at gcond.  */
> +      gimple *last = last_stmt (bb);
> +      if (!last || gimple_code (last) != GIMPLE_COND)
> +	continue;
> +      gcond *cond = as_a<gcond *> (last);
> +      enum tree_code code = gimple_cond_code (cond);
> +      if (!(code == NE_EXPR
> +	    || (code == EQ_EXPR && (e->flags & EDGE_TRUE_VALUE))))

The NE_EXPR check misses a corresponding && (e->flags & EDGE_FALSE_VALUE)
check.

> +	continue;
> +
> +      /* Check if bound is invarant.  */
> +      tree idx = gimple_cond_lhs (cond);
> +      tree bnd = gimple_cond_rhs (cond);
> +      if (expr_invariant_in_loop_p (loop, idx))
> +	std::swap (idx, bnd);
> +      else if (!expr_invariant_in_loop_p (loop, bnd))
> +	continue;
> +
> +      /* Only unsigned type conversion could cause wrap.  */
> +      tree type = TREE_TYPE (idx);
> +      if (!INTEGRAL_TYPE_P (type) || TREE_CODE (idx) != SSA_NAME
> +	  || !TYPE_UNSIGNED (type))
> +	continue;
> +
> +      /* Avoid to split if bound is MAX/MIN val.  */
> +      tree bound_type = TREE_TYPE (bnd);
> +      if (TREE_CODE (bnd) == INTEGER_CST && INTEGRAL_TYPE_P (bound_type)
> +	  && (tree_int_cst_equal (bnd, TYPE_MAX_VALUE (bound_type))
> +	      || tree_int_cst_equal (bnd, TYPE_MIN_VALUE (bound_type))))
> +	continue;

Note you do not require 'bnd' to be constant and thus at runtime those
cases still need to be handled correctly.

> +      /* Check if there is possible wrap.  */
> +      class tree_niter_desc niter;
> +      if (!number_of_iterations_exit (loop, e, &niter, false, false))
> +	continue;
> +      if (niter.control.no_overflow)
> +	return NULL;
> +      if (niter.cmp != NE_EXPR)
> +	continue;
> +
> +      /* If exit edge is just before the empty latch, it is easy to link
> +	 the split loops: just jump from the exit edge of one loop to the
> +	 header of new loop.  */
> +      if (single_pred_p (loop->latch)
> +	  && single_pred_edge (loop->latch)->src == bb
> +	  && empty_block_p (loop->latch))
> +	return e;
> +
> +      /* If exit edge is at end of header, and header contains i++ or ++i,
> +	 only, it is simple to link the split loops:  jump from the end of
> +	 one loop header to the new loop header, and use unchanged PHI
> +	 result of first loop as the entry PHI value of the second loop.  */
> +      if (bb == loop->header)
> +	{
> +	  /* Only one phi.  */
> +	  gphi_iterator psi = gsi_start_phis (bb);
> +	  if (gsi_end_p (psi))
> +	    continue;
> +	  gphi *phi = psi.phi ();
> +	  gsi_next (&psi);
> +	  if (!gsi_end_p (psi))
> +	    continue;
> +
> +	  /* Check it is ++i or ++i */
> +	  tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
> +	  tree prev = PHI_RESULT (phi);
> +	  if (idx != prev && idx != next)
> +	    continue;
> +
> +	  gimple_stmt_iterator gsi = gsi_start_nondebug_after_labels_bb (bb);
> +	  if (gsi_end_p (gsi))
> +	    continue;
> +	  gimple *s1 = gsi_stmt (gsi);
> +	  if (!is_gimple_assign (s1) || gimple_assign_lhs (s1) != next
> +	      || gimple_assign_rhs1 (s1) != prev)
> +	    continue;
> +
> +	  gsi_next_nondebug (&gsi);
> +	  if (!gsi_end_p (gsi) && gsi_stmt (gsi) == cond)
> +	    return e;
> +	}
> +    }
> +
> +  return NULL;
> +}
> +
> +/* Split the LOOP with NE_EXPR into two loops with GT_EXPR and LT_EXPR.  */
> +
> +static bool
> +split_ne_loop (struct loop *loop, edge cond_e)
> +{
> +  initialize_original_copy_tables ();
> +
> +  struct loop *loop2 = loop_version (loop, boolean_true_node, NULL,
> +				     profile_probability::always (),
> +				     profile_probability::never (),
> +				     profile_probability::always (),
> +				     profile_probability::always (), true);
> +
> +  gcc_assert (loop2);
> +  update_ssa (TODO_update_ssa);
> +
> +  basic_block loop2_cond_exit_bb = get_bb_copy (cond_e->src);
> +  free_original_copy_tables ();
> +
> +  gcond *gc = as_a<gcond *> (last_stmt (cond_e->src));
> +  gcond *dup_gc = as_a<gcond *> (last_stmt (loop2_cond_exit_bb));
> +
> +  /* Invert edges for gcond.  */
> +  if (gimple_cond_code (gc) == EQ_EXPR)
> +    {
> +      auto invert_edge = [](basic_block bb) {
> +	edge out = EDGE_SUCC (bb, 0);
> +	edge in = EDGE_SUCC (bb, 1);
> +	if (in->flags & EDGE_TRUE_VALUE)
> +	  std::swap (in, out);
> +	in->flags |= EDGE_TRUE_VALUE;
> +	in->flags &= ~EDGE_FALSE_VALUE;
> +	out->flags |= EDGE_FALSE_VALUE;
> +	out->flags &= ~EDGE_TRUE_VALUE;
> +      };
> +
> +      invert_edge (gimple_bb (gc));
> +      invert_edge (gimple_bb (dup_gc));
> +    }
> +
> +  /* Change if (i != n) to LOOP1:if (i > n) and LOOP2:if (i < n) */

It now occurs to me that we nowhere check the evolution of IDX
(split_at_bb_p uses simple_iv for this for example).  The transform
assumes that we will actually hit i == n and that i increments, but
while you check the control IV from number_of_iterations_exit
for NE_EXPR that does not guarantee a positive evolution.

Your testcases do not include any negative step examples, but I guess
the conditions need to be swapped in this case?

I think you also have to consider the order we split, say with

  for (i = start; i != end; ++i)
    {
      push (i);
      if (a[i] != b[i])
        break;
    }

push (i) calls need to be in the same order for all cases of
start < end, start == end and start > end (and also cover
runtime testcases with end == 0 or end == UINT_MAX, likewise
for start).

> +  bool inv = expr_invariant_in_loop_p (loop, gimple_cond_lhs (gc));
> +  enum tree_code up_code = inv ? LT_EXPR : GT_EXPR;
> +  enum tree_code down_code = inv ? GT_EXPR : LT_EXPR;
> +
> +  gimple_cond_set_code (gc, up_code);
> +  gimple_cond_set_code (dup_gc, down_code);
> +
> +  /* Link the exit cond edge to new loop.  */
> +  gcond *break_cond = as_a<gcond *> (gimple_copy (gc));
> +  edge pred_e = single_pred_edge (loop->latch);
> +  bool simple_loop
> +    = pred_e && pred_e->src == cond_e->src && empty_block_p (loop->latch);
> +  if (simple_loop)
> +    gimple_cond_set_code (break_cond, down_code);
> +  else
> +    gimple_cond_make_true (break_cond);
> +
> +  basic_block break_bb = split_edge (cond_e);
> +  gimple_stmt_iterator gsi = gsi_last_bb (break_bb);
> +  gsi_insert_after (&gsi, break_cond, GSI_NEW_STMT);
> +
> +  edge to_exit = single_succ_edge (break_bb);
> +  edge to_new_loop = make_edge (break_bb, loop_preheader_edge (loop2)->src, 0);
> +  to_new_loop->flags |= EDGE_TRUE_VALUE;
> +  to_exit->flags |= EDGE_FALSE_VALUE;
> +  to_exit->flags &= ~EDGE_FALLTHRU;
> +  to_exit->probability = cond_e->probability;
> +  to_new_loop->probability = to_exit->probability.invert ();
> +
> +  update_ssa (TODO_update_ssa);
> +
> +  connect_loop_phis (loop, loop2, to_new_loop, !simple_loop);
> +
> +  rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +    fprintf (dump_file, ";; Loop split on wrap index.\n");
> +
> +  return true;
> +}
> +
> +/* Checks if LOOP contains a suitable NE_EXPR conditional block to split.
> +L_H:
> + if (i!=N)
> +   S;
> + else
> +   goto EXIT;
> + i++;
> + goto L_H;
> +
> +The "i!=N" is like "i>N || i<N", then it could be transform to:
> +
> +L_H:
> + if (i>N)
> +   S;
> + else
> +   goto EXIT;
> + i++;
> + goto L_H;
> +L1_H:
> + if (i<N)
> +   S;
> + else
> +   goto EXIT;
> + i++;
> + goto L1_H;
> +
> +The loop with "i<N" is in favor both GIMPLE and RTL passes.  */
> +
> +static bool
> +split_loop_on_ne_cond (class loop *loop)
> +{
> +  int num = 0;
> +  basic_block *bbs = get_loop_body (loop);
> +
> +  if (!can_copy_bbs_p (bbs, loop->num_nodes))
> +    {
> +      free (bbs);
> +      return false;
> +    }
> +
> +  for (unsigned i = 0; i < loop->num_nodes; i++)
> +    num += estimate_num_insns_seq (bb_seq (bbs[i]), &eni_size_weights);
> +  free (bbs);
> +
> +  if (num > param_max_peeled_insns)
> +    return false;
> +
> +  edge branch_edge = get_ne_cond_branch (loop);
> +  if (branch_edge && split_ne_loop (loop, branch_edge))

please check get_ne_cond_branch first, the can_copy_bbs_p and size
estimates are of linear complexity in the loop size.  Please also
test the size estimate before can_copy_bbs and terminate the loop
when you reach param_max_peeled_insns.

> +    return true;
> +
> +  return false;
> +}
> +
>  /* Main entry point.  Perform loop splitting on all suitable loops.  */
>  
>  static unsigned int
> @@ -1622,7 +1862,8 @@ tree_ssa_split_loops (void)
>        if (optimize_loop_for_size_p (loop))
>  	continue;
>  
> -      if (split_loop (loop) || split_loop_on_cond (loop))
> +      if (split_loop (loop) || split_loop_on_cond (loop)
> +	  || split_loop_on_ne_cond (loop))
>  	{
>  	  /* Mark our containing loop as having had some split inner loops.  */
>  	  loop_outer (loop)->aux = loop;
>
Jiufu Guo June 9, 2021, 9:42 a.m. UTC | #2
On 2021-06-08 18:13, Richard Biener wrote:
> On Fri, 4 Jun 2021, Jiufu Guo wrote:
> 
cut...
>> +      gcond *cond = as_a<gcond *> (last);
>> +      enum tree_code code = gimple_cond_code (cond);
>> +      if (!(code == NE_EXPR
>> +	    || (code == EQ_EXPR && (e->flags & EDGE_TRUE_VALUE))))
> 
> The NE_EXPR check misses a corresponding && (e->flags & 
> EDGE_FALSE_VALUE)
> check.
> 
Thanks, check (e->flags & EDGE_FALSE_VALUE) would be safer.

>> +	continue;
>> +
>> +      /* Check if bound is invarant.  */
>> +      tree idx = gimple_cond_lhs (cond);
>> +      tree bnd = gimple_cond_rhs (cond);
>> +      if (expr_invariant_in_loop_p (loop, idx))
>> +	std::swap (idx, bnd);
>> +      else if (!expr_invariant_in_loop_p (loop, bnd))
>> +	continue;
>> +
>> +      /* Only unsigned type conversion could cause wrap.  */
>> +      tree type = TREE_TYPE (idx);
>> +      if (!INTEGRAL_TYPE_P (type) || TREE_CODE (idx) != SSA_NAME
>> +	  || !TYPE_UNSIGNED (type))
>> +	continue;
>> +
>> +      /* Avoid to split if bound is MAX/MIN val.  */
>> +      tree bound_type = TREE_TYPE (bnd);
>> +      if (TREE_CODE (bnd) == INTEGER_CST && INTEGRAL_TYPE_P 
>> (bound_type)
>> +	  && (tree_int_cst_equal (bnd, TYPE_MAX_VALUE (bound_type))
>> +	      || tree_int_cst_equal (bnd, TYPE_MIN_VALUE (bound_type))))
>> +	continue;
> 
> Note you do not require 'bnd' to be constant and thus at runtime those
> cases still need to be handled correctly.
Yes, bnd is not required to be constant.  The above code is filtering 
the case
where bnd is const max/min value of the type.  So, the code could be 
updated as:
       if (tree_int_cst_equal (bnd, TYPE_MAX_VALUE (bound_type))
           || tree_int_cst_equal (bnd, TYPE_MIN_VALUE (bound_type)))

> 
>> +      /* Check if there is possible wrap.  */
>> +      class tree_niter_desc niter;
>> +      if (!number_of_iterations_exit (loop, e, &niter, false, false))
cut...
>> +
>> +  /* Change if (i != n) to LOOP1:if (i > n) and LOOP2:if (i < n) */
> 
> It now occurs to me that we nowhere check the evolution of IDX
> (split_at_bb_p uses simple_iv for this for example).  The transform
> assumes that we will actually hit i == n and that i increments, but
> while you check the control IV from number_of_iterations_exit
> for NE_EXPR that does not guarantee a positive evolution.
> 
If I do not correctly reply your question, please point out:
number_of_iterations_exit is similar with simple_iv to invoke 
simple_iv_with_niters
which check the evolution, and number_of_iterations_exit check 
number_of_iterations_cond
which check no_overflow more accurate, this is one reason I use this 
function.

This transform assumes that the last run hits i==n.
Otherwise, the loop may run infinitely wrap after wrap.
For safe, if the step is 1 or -1,  this assumption would be true.  I 
would add this check.

Thanks so much for pointing out I missed the negative step!

> Your testcases do not include any negative step examples, but I guess
> the conditions need to be swapped in this case?

I would add cases and code to support step 1/-1.

> 
> I think you also have to consider the order we split, say with
> 
>   for (i = start; i != end; ++i)
>     {
>       push (i);
>       if (a[i] != b[i])
>         break;
>     }
> 
> push (i) calls need to be in the same order for all cases of
> start < end, start == end and start > end (and also cover
> runtime testcases with end == 0 or end == UINT_MAX, likewise
> for start).
I add tests for the above cases. If missing sth, please point out, 
thanks!

> 
>> +  bool inv = expr_invariant_in_loop_p (loop, gimple_cond_lhs (gc));
>> +  enum tree_code up_code = inv ? LT_EXPR : GT_EXPR;
>> +  enum tree_code down_code = inv ? GT_EXPR : LT_EXPR;
cut....

Thanks again for the very helpful review!

BR,
Jiufu Guo.
Jiufu Guo June 9, 2021, 11:18 a.m. UTC | #3
On 2021-06-09 17:42, guojiufu via Gcc-patches wrote:
> On 2021-06-08 18:13, Richard Biener wrote:
>> On Fri, 4 Jun 2021, Jiufu Guo wrote:
>> 
> cut...
>>> +      gcond *cond = as_a<gcond *> (last);
>>> +      enum tree_code code = gimple_cond_code (cond);
>>> +      if (!(code == NE_EXPR
>>> +	    || (code == EQ_EXPR && (e->flags & EDGE_TRUE_VALUE))))
>> 
>> The NE_EXPR check misses a corresponding && (e->flags & 
>> EDGE_FALSE_VALUE)
>> check.
>> 
> Thanks, check (e->flags & EDGE_FALSE_VALUE) would be safer.
> 
>>> +	continue;
>>> +
>>> +      /* Check if bound is invarant.  */
>>> +      tree idx = gimple_cond_lhs (cond);
>>> +      tree bnd = gimple_cond_rhs (cond);
>>> +      if (expr_invariant_in_loop_p (loop, idx))
>>> +	std::swap (idx, bnd);
>>> +      else if (!expr_invariant_in_loop_p (loop, bnd))
>>> +	continue;
>>> +
>>> +      /* Only unsigned type conversion could cause wrap.  */
>>> +      tree type = TREE_TYPE (idx);
>>> +      if (!INTEGRAL_TYPE_P (type) || TREE_CODE (idx) != SSA_NAME
>>> +	  || !TYPE_UNSIGNED (type))
>>> +	continue;
>>> +
>>> +      /* Avoid to split if bound is MAX/MIN val.  */
>>> +      tree bound_type = TREE_TYPE (bnd);
>>> +      if (TREE_CODE (bnd) == INTEGER_CST && INTEGRAL_TYPE_P 
>>> (bound_type)
>>> +	  && (tree_int_cst_equal (bnd, TYPE_MAX_VALUE (bound_type))
>>> +	      || tree_int_cst_equal (bnd, TYPE_MIN_VALUE (bound_type))))
>>> +	continue;
>> 
>> Note you do not require 'bnd' to be constant and thus at runtime those
>> cases still need to be handled correctly.
> Yes, bnd is not required to be constant.  The above code is filtering 
> the case
> where bnd is const max/min value of the type.  So, the code could be 
> updated as:
>       if (tree_int_cst_equal (bnd, TYPE_MAX_VALUE (bound_type))
>           || tree_int_cst_equal (bnd, TYPE_MIN_VALUE (bound_type)))
> 
>> 
>>> +      /* Check if there is possible wrap.  */
>>> +      class tree_niter_desc niter;
>>> +      if (!number_of_iterations_exit (loop, e, &niter, false, 
>>> false))
> cut...
>>> +
>>> +  /* Change if (i != n) to LOOP1:if (i > n) and LOOP2:if (i < n) */
>> 
>> It now occurs to me that we nowhere check the evolution of IDX
>> (split_at_bb_p uses simple_iv for this for example).  The transform
>> assumes that we will actually hit i == n and that i increments, but
>> while you check the control IV from number_of_iterations_exit
>> for NE_EXPR that does not guarantee a positive evolution.
>> 
> If I do not correctly reply your question, please point out:
> number_of_iterations_exit is similar with simple_iv to invoke
> simple_iv_with_niters
> which check the evolution, and number_of_iterations_exit check
> number_of_iterations_cond
> which check no_overflow more accurate, this is one reason I use this 
> function.
> 
> This transform assumes that the last run hits i==n.
> Otherwise, the loop may run infinitely wrap after wrap.
> For safe, if the step is 1 or -1,  this assumption would be true.  I
> would add this check.
> 
> Thanks so much for pointing out I missed the negative step!
> 
>> Your testcases do not include any negative step examples, but I guess
>> the conditions need to be swapped in this case?
> 
> I would add cases and code to support step 1/-1.
> 
>> 
>> I think you also have to consider the order we split, say with
>> 
>>   for (i = start; i != end; ++i)
>>     {
>>       push (i);
>>       if (a[i] != b[i])
>>         break;
>>     }
>> 
>> push (i) calls need to be in the same order for all cases of
>> start < end, start == end and start > end (and also cover
>> runtime testcases with end == 0 or end == UINT_MAX, likewise
>> for start).
> I add tests for the above cases. If missing sth, please point out, 
> thanks!
> 
>> 
>>> +  bool inv = expr_invariant_in_loop_p (loop, gimple_cond_lhs (gc));
>>> +  enum tree_code up_code = inv ? LT_EXPR : GT_EXPR;
>>> +  enum tree_code down_code = inv ? GT_EXPR : LT_EXPR;
> cut....
> 
> Thanks again for the very helpful review!
> 
> BR,
> Jiufu Guo.

Here is the updated patch, thanks for your time!

diff --git a/gcc/testsuite/gcc.dg/loop-split1.c 
b/gcc/testsuite/gcc.dg/loop-split1.c
new file mode 100644
index 00000000000..dd2d03a7b96
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-split1.c
@@ -0,0 +1,101 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fsplit-loops -fdump-tree-lsplit-details" } */
+
+void
+foo (int *a, int *b, unsigned l, unsigned n)
+{
+  while (++l != n)
+    a[l] = b[l] + 1;
+}
+void
+foo_1 (int *a, int *b, unsigned n)
+{
+  unsigned l = 0;
+  while (++l != n)
+    a[l] = b[l] + 1;
+}
+
+void
+foo1 (int *a, int *b, unsigned l, unsigned n)
+{
+  while (l++ != n)
+    a[l] = b[l] + 1;
+}
+
+/* No wrap.  */
+void
+foo1_1 (int *a, int *b, unsigned n)
+{
+  unsigned l = 0;
+  while (l++ != n)
+    a[l] = b[l] + 1;
+}
+
+unsigned
+foo2 (char *a, char *b, unsigned l, unsigned n)
+{
+  while (++l != n)
+    if (a[l] != b[l])
+      break;
+
+  return l;
+}
+
+unsigned
+foo2_1 (char *a, char *b, unsigned l, unsigned n)
+{
+  l = 0;
+  while (++l != n)
+    if (a[l] != b[l])
+      break;
+
+  return l;
+}
+
+unsigned
+foo3 (char *a, char *b, unsigned l, unsigned n)
+{
+  while (l++ != n)
+    if (a[l] != b[l])
+      break;
+
+  return l;
+}
+
+/* No wrap.  */
+unsigned
+foo3_1 (char *a, char *b, unsigned l, unsigned n)
+{
+  l = 0;
+  while (l++ != n)
+    if (a[l] != b[l])
+      break;
+
+  return l;
+}
+
+void
+bar ();
+void
+foo4 (unsigned n, unsigned i)
+{
+  do
+    {
+      if (i == n)
+	return;
+      bar ();
+      ++i;
+    }
+  while (1);
+}
+
+unsigned
+find_skip_diff (char *p, char *q, unsigned n, unsigned i)
+{
+  while (p[i] == q[i] && ++i != n)
+    p++, q++;
+
+  return i;
+}
+
+/* { dg-final { scan-tree-dump-times "Loop split" 8 "lsplit" } } */
diff --git a/gcc/testsuite/gcc.dg/loop-split2.c 
b/gcc/testsuite/gcc.dg/loop-split2.c
new file mode 100644
index 00000000000..56377e2f2f5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-split2.c
@@ -0,0 +1,155 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+extern void
+abort (void);
+extern void
+exit (int);
+void
+push (int);
+
+#define NI __attribute__ ((noinline))
+
+void NI
+foo (int *a, int *b, unsigned char l, unsigned char n)
+{
+  while (++l != n)
+    a[l] = b[l] + 1;
+}
+
+unsigned NI
+bar (int *a, int *b, unsigned char l, unsigned char n)
+{
+  while (l++ != n)
+    {
+      push (l);
+      if (a[l] != b[l])
+	break;
+      push (l + 1);
+    }
+  return l;
+}
+
+void NI
+foo_1 (int *a, int *b, unsigned char l, unsigned char n)
+{
+  while (--l != n)
+    a[l] = b[l] + 1;
+}
+
+unsigned NI
+bar_1 (int *a, int *b, unsigned char l, unsigned char n)
+{
+  while (l-- != n)
+    {
+      push (l);
+      if (a[l] != b[l])
+	break;
+      push (l + 1);
+    }
+
+  return l;
+}
+
+int a[258];
+int b[258];
+int c[1024];
+static int top = 0;
+void
+push (int e)
+{
+  c[top++] = e;
+}
+
+void
+reset ()
+{
+  top = 0;
+  __builtin_memset (c, 0, sizeof (c));
+}
+
+#define check(a, b) (a == b)
+
+int
+check_c (int *c, int a0, int a1, int a2, int a3, int a4, int a5)
+{
+  return check (c[0], a0) && check (c[1], a1) && check (c[2], a2)
+	 && check (c[3], a3) && check (c[4], a4) && check (c[5], a5);
+}
+
+int
+main ()
+{
+  __builtin_memcpy (b, a, sizeof (a));
+  reset ();
+  if (bar (a, b, 6, 8) != 9 || !check_c (c, 7, 8, 8, 9, 0, 0))
+    abort ();
+
+  reset ();
+  if (bar (a, b, 5, 3) != 4 || !check_c (c, 6, 7, 7, 8, 8, 9)
+      || !check_c (c + 496, 254, 255, 255, 256, 0, 1))
+    abort ();
+
+  reset ();
+  if (bar (a, b, 6, 6) != 7 || !check_c (c, 0, 0, 0, 0, 0, 0))
+    abort ();
+
+  reset ();
+  if (bar (a, b, 253, 255) != 0 || !check_c (c, 254, 255, 255, 256, 0, 
0))
+    abort ();
+
+  reset ();
+  if (bar (a, b, 253, 0) != 1 || !check_c (c, 254, 255, 255, 256, 0, 
1))
+    abort ();
+
+  reset ();
+  if (bar_1 (a, b, 6, 8) != 7 || !check_c (c, 5, 6, 4, 5, 3, 4))
+    abort ();
+
+  reset ();
+  if (bar_1 (a, b, 5, 3) != 2 || !check_c (c, 4, 5, 3, 4, 0, 0))
+    abort ();
+
+  reset ();
+  if (bar_1 (a, b, 6, 6) != 5)
+    abort ();
+
+  reset ();
+  if (bar_1 (a, b, 2, 255) != 254 || !check_c (c, 1, 2, 0, 1, 255, 
256))
+    abort ();
+
+  reset ();
+  if (bar_1 (a, b, 2, 0) != 255 || !check_c (c, 1, 2, 0, 1, 0, 0))
+    abort ();
+
+  b[100] += 1;
+  reset ();
+  if (bar (a, b, 90, 110) != 100)
+    abort ();
+
+  reset ();
+  if (bar (a, b, 110, 105) != 100)
+    abort ();
+
+  reset ();
+  if (bar_1 (a, b, 90, 110) != 109)
+    abort ();
+
+  reset ();
+  if (bar_1 (a, b, 2, 90) != 100)
+    abort ();
+
+  foo (a, b, 99, 99);
+  a[99] = b[99] + 1;
+  for (int i = 0; i < 256; i++)
+    if (a[i] != b[i] + 1)
+      abort ();
+
+  foo_1 (a, b, 99, 99);
+  a[99] = b[99] + 1;
+  for (int i = 0; i < 256; i++)
+    if (a[i] != b[i] + 1)
+      abort ();
+
+  exit (0);
+}
diff --git a/gcc/testsuite/gcc.dg/loop-split3.c 
b/gcc/testsuite/gcc.dg/loop-split3.c
new file mode 100644
index 00000000000..ec93ee8bd12
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-split3.c
@@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fsplit-loops -fdump-tree-lsplit-details" } */
+
+void
+foo (int *a, int *b, unsigned l, unsigned n)
+{
+  while (--l != n)
+    a[l] = b[l] + 1;
+}
+
+void
+foo1 (int *a, int *b, unsigned l, unsigned n)
+{
+  while (l-- != n)
+    a[l] = b[l] + 1;
+}
+
+unsigned
+foo2 (char *a, char *b, unsigned l, unsigned n)
+{
+  while (--l != n)
+    if (a[l] != b[l])
+      break;
+
+  return l;
+}
+
+unsigned
+foo3 (char *a, char *b, unsigned l, unsigned n)
+{
+  while (l-- != n)
+    if (a[l] != b[l])
+      break;
+
+  return l;
+}
+
+void
+bar ();
+void
+foo4 (unsigned n, unsigned i)
+{
+  do
+    {
+      if (i == n)
+	return;
+      bar ();
+      --i;
+    }
+  while (1);
+}
+
+unsigned
+find_skip_diff (char *p, char *q, unsigned n, unsigned i)
+{
+  while (p[i] == q[i] && --i != n)
+    p--, q--;
+
+  return i;
+}
+
+/* { dg-final { scan-tree-dump-times "Loop split" 6 "lsplit" } } */
diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
index 3a09bbc39e5..e9f23b32186 100644
--- a/gcc/tree-ssa-loop-split.c
+++ b/gcc/tree-ssa-loop-split.c
@@ -41,6 +41,7 @@ along with GCC; see the file COPYING3.  If not see
  #include "cfghooks.h"
  #include "gimple-fold.h"
  #include "gimplify-me.h"
+#include "tree-ssa-loop-ivopts.h"

  /* This file implements two kinds of loop splitting.

@@ -229,11 +230,14 @@ easy_exit_values (class loop *loop)
     conditional).  I.e. the second loop can now be entered either
     via the original entry or via NEW_E, so the entry values of LOOP2
     phi nodes are either the original ones or those at the exit
-   of LOOP1.  Insert new phi nodes in LOOP2 pre-header reflecting
-   this.  The loops need to fulfill easy_exit_values().  */
+   of LOOP1.  Selecting the previous value instead next value as the
+   exit value of LOOP1 if USE_PREV is true.  Insert new phi nodes in
+   LOOP2 pre-header reflecting this.  The loops need to fulfill
+   easy_exit_values().  */

  static void
-connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e)
+connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e,
+		   bool use_prev = false)
  {
    basic_block rest = loop_preheader_edge (loop2)->src;
    gcc_assert (new_e->dest == rest);
@@ -279,7 +283,8 @@ connect_loop_phis (class loop *loop1, class loop 
*loop2, edge new_e)

        gphi * newphi = create_phi_node (new_init, rest);
        add_phi_arg (newphi, init, skip_first, UNKNOWN_LOCATION);
-      add_phi_arg (newphi, next, new_e, UNKNOWN_LOCATION);
+      add_phi_arg (newphi, use_prev ? PHI_RESULT (phi_first) : next, 
new_e,
+		   UNKNOWN_LOCATION);
        SET_USE (op, new_init);
      }
  }
@@ -1593,6 +1598,252 @@ split_loop_on_cond (struct loop *loop)
    return do_split;
  }

+/* Check if the LOOP exit branch is like "if (idx != bound)",
+   Return the branch edge which exit loop, if wrap may happen on "idx". 
  */
+
+static edge
+get_ne_cond_branch (struct loop *loop, tree *step)
+{
+  int i;
+  edge e;
+
+  auto_vec<edge> edges = get_loop_exit_edges (loop);
+  FOR_EACH_VEC_ELT (edges, i, e)
+    {
+      basic_block bb = e->src;
+
+      /* Check if exit at gcond.  */
+      gimple *last = last_stmt (bb);
+      if (!last || gimple_code (last) != GIMPLE_COND)
+	continue;
+      gcond *cond = as_a<gcond *> (last);
+      enum tree_code code = gimple_cond_code (cond);
+      if (!((code == NE_EXPR && (e->flags & EDGE_FALSE_VALUE))
+	    || (code == EQ_EXPR && (e->flags & EDGE_TRUE_VALUE))))
+	continue;
+
+      /* Check if bound is invarant.  */
+      tree idx = gimple_cond_lhs (cond);
+      tree bnd = gimple_cond_rhs (cond);
+      if (expr_invariant_in_loop_p (loop, idx))
+	std::swap (idx, bnd);
+      else if (!expr_invariant_in_loop_p (loop, bnd))
+	continue;
+
+      /* Only unsigned type conversion could cause wrap.  */
+      tree type = TREE_TYPE (idx);
+      if (!INTEGRAL_TYPE_P (type) || TREE_CODE (idx) != SSA_NAME
+	  || !TYPE_UNSIGNED (type))
+	continue;
+
+      /* Avoid to split if bound is MAX/MIN val.  */
+      tree bound_type = TREE_TYPE (bnd);
+      if (tree_int_cst_equal (bnd, TYPE_MAX_VALUE (bound_type))
+	  || tree_int_cst_equal (bnd, TYPE_MIN_VALUE (bound_type)))
+	continue;
+
+      /* Check if there is possible wrap.  */
+      class tree_niter_desc niter;
+      if (!number_of_iterations_exit (loop, e, &niter, false, false))
+	continue;
+      if (niter.control.no_overflow)
+	return NULL;
+      if (niter.cmp != NE_EXPR)
+	continue;
+      if (!integer_onep (niter.control.step)
+	  && !integer_minus_onep (niter.control.step))
+	continue;
+      *step = niter.control.step;
+
+      /* If exit edge is just before the empty latch, it is easy to 
link
+	 the split loops: just jump from the exit edge of one loop to the
+	 header of new loop.  */
+      if (single_pred_p (loop->latch)
+	  && single_pred_edge (loop->latch)->src == bb
+	  && empty_block_p (loop->latch))
+	return e;
+
+      /* If exit edge is at end of header, and header contains i++ or 
++i,
+	 only, it is simple to link the split loops: jump from the end of
+	 one loop header to the new loop header, and use unchanged PHI result
+	 of the first loop as the entry PHI value of the second loop.  */
+      if (bb == loop->header)
+	{
+	  /* Only one phi.  */
+	  gphi_iterator psi = gsi_start_phis (bb);
+	  if (gsi_end_p (psi))
+	    continue;
+	  gphi *phi = psi.phi ();
+	  gsi_next (&psi);
+	  if (!gsi_end_p (psi))
+	    continue;
+
+	  /* Check it is ++i or ++i */
+	  tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
+	  tree prev = PHI_RESULT (phi);
+	  if (idx != prev && idx != next)
+	    continue;
+
+	  gimple_stmt_iterator gsi = gsi_start_nondebug_after_labels_bb (bb);
+	  if (gsi_end_p (gsi))
+	    continue;
+	  gimple *s1 = gsi_stmt (gsi);
+	  if (!is_gimple_assign (s1) || gimple_assign_lhs (s1) != next
+	      || gimple_assign_rhs1 (s1) != prev)
+	    continue;
+
+	  gsi_next_nondebug (&gsi);
+	  if (!gsi_end_p (gsi) && gsi_stmt (gsi) == cond)
+	    return e;
+	}
+    }
+
+  return NULL;
+}
+
+/* Split the LOOP with NE_EXPR into two loops with GT_EXPR and LT_EXPR. 
  */
+
+static bool
+split_ne_loop (struct loop *loop, edge cond_e, tree step)
+{
+  initialize_original_copy_tables ();
+
+  struct loop *loop2 = loop_version (loop, boolean_true_node, NULL,
+				     profile_probability::always (),
+				     profile_probability::never (),
+				     profile_probability::always (),
+				     profile_probability::always (), true);
+
+  gcc_assert (loop2);
+  update_ssa (TODO_update_ssa);
+
+  basic_block loop2_cond_exit_bb = get_bb_copy (cond_e->src);
+  free_original_copy_tables ();
+
+  gcond *gc = as_a<gcond *> (last_stmt (cond_e->src));
+  gcond *dup_gc = as_a<gcond *> (last_stmt (loop2_cond_exit_bb));
+
+  /* Invert edges for gcond.  */
+  if (gimple_cond_code (gc) == EQ_EXPR)
+    {
+      auto invert_edge = [](basic_block bb) {
+	edge out = EDGE_SUCC (bb, 0);
+	edge in = EDGE_SUCC (bb, 1);
+	if (in->flags & EDGE_TRUE_VALUE)
+	  std::swap (in, out);
+	in->flags |= EDGE_TRUE_VALUE;
+	in->flags &= ~EDGE_FALSE_VALUE;
+	out->flags |= EDGE_FALSE_VALUE;
+	out->flags &= ~EDGE_TRUE_VALUE;
+      };
+
+      invert_edge (gimple_bb (gc));
+      invert_edge (gimple_bb (dup_gc));
+    }
+
+  /* Change if (i != n) to LOOP1:if (i > n) and LOOP2:if (i < n) */
+  bool inv = expr_invariant_in_loop_p (loop, gimple_cond_lhs (gc));
+  if (tree_int_cst_sign_bit (step))
+    inv = !inv;
+  enum tree_code first_loop_code = inv ? LT_EXPR : GT_EXPR;
+  enum tree_code second_loop_code = inv ? GT_EXPR : LT_EXPR;
+
+  gimple_cond_set_code (gc, first_loop_code);
+  gimple_cond_set_code (dup_gc, second_loop_code);
+
+  /* Link the exit cond edge to new loop.  */
+  gcond *break_cond = as_a<gcond *> (gimple_copy (gc));
+  edge pred_e = single_pred_edge (loop->latch);
+  bool simple_loop
+    = pred_e && pred_e->src == cond_e->src && empty_block_p 
(loop->latch);
+  if (simple_loop)
+    gimple_cond_set_code (break_cond, second_loop_code);
+  else
+    gimple_cond_make_true (break_cond);
+
+  basic_block break_bb = split_edge (cond_e);
+  gimple_stmt_iterator gsi = gsi_last_bb (break_bb);
+  gsi_insert_after (&gsi, break_cond, GSI_NEW_STMT);
+
+  edge to_exit = single_succ_edge (break_bb);
+  edge to_new_loop = make_edge (break_bb, loop_preheader_edge 
(loop2)->src, 0);
+  to_new_loop->flags |= EDGE_TRUE_VALUE;
+  to_exit->flags |= EDGE_FALSE_VALUE;
+  to_exit->flags &= ~EDGE_FALLTHRU;
+  to_exit->probability = cond_e->probability;
+  to_new_loop->probability = to_exit->probability.invert ();
+
+  update_ssa (TODO_update_ssa);
+
+  connect_loop_phis (loop, loop2, to_new_loop, !simple_loop);
+
+  rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, ";; Loop split on wrap index.\n");
+
+  return true;
+}
+
+/* Checks if LOOP contains a suitable NE_EXPR conditional block to 
split.
+L_H:
+ if (i!=N)
+   S;
+ else
+   goto EXIT;
+ i++;
+ goto L_H;
+
+The "i!=N" is like "i>N || i<N", then it could be transformed to:
+
+L_H:
+ if (i>N)
+   S;
+ else
+   goto EXIT;
+ i++;
+ goto L_H;
+L1_H:
+ if (i<N)
+   S;
+ else
+   goto EXIT;
+ i++;
+ goto L1_H;
+
+The loop with "i<N" is in favor of both GIMPLE and RTL passes.  */
+
+static bool
+split_loop_on_ne_cond (class loop *loop)
+{
+  tree step;
+  edge branch_edge = get_ne_cond_branch (loop, &step);
+  if (!branch_edge)
+    return false;
+
+  int num = 0;
+  basic_block *bbs = get_loop_body (loop);
+  for (unsigned i = 0; i < loop->num_nodes; i++)
+    num += estimate_num_insns_seq (bb_seq (bbs[i]), &eni_size_weights);
+
+  if (num > param_max_peeled_insns)
+    {
+      free (bbs);
+      return false;
+    }
+
+  if (!can_copy_bbs_p (bbs, loop->num_nodes))
+    {
+      free (bbs);
+      return false;
+    }
+  free (bbs);
+
+  if (split_ne_loop (loop, branch_edge, step))
+    return true;
+
+  return false;
+}
+
  /* Main entry point.  Perform loop splitting on all suitable loops.  */

  static unsigned int
@@ -1622,7 +1873,8 @@ tree_ssa_split_loops (void)
        if (optimize_loop_for_size_p (loop))
  	continue;

-      if (split_loop (loop) || split_loop_on_cond (loop))
+      if (split_loop (loop) || split_loop_on_cond (loop)
+	  || split_loop_on_ne_cond (loop))
  	{
  	  /* Mark our containing loop as having had some split inner loops.  
*/
  	  loop_outer (loop)->aux = loop;
Jiufu Guo June 21, 2021, 6:19 a.m. UTC | #4
On 2021-06-09 19:18, guojiufu wrote:
> On 2021-06-09 17:42, guojiufu via Gcc-patches wrote:
>> On 2021-06-08 18:13, Richard Biener wrote:
>>> On Fri, 4 Jun 2021, Jiufu Guo wrote:
>>> 
>> cut...
cut...
> 
> Here is the updated patch, thanks for your time!

Updates:
. Enhance code to support negative step.
. Check step +-1 to make sure it hits loop condition !=
. Enhance runtime cases to check more boundary cases and run order 
cases.
. Refine for compiling time: check loop num of insns and can_copy_bbs_p 
later

> 
> diff --git a/gcc/testsuite/gcc.dg/loop-split1.c
> b/gcc/testsuite/gcc.dg/loop-split1.c
> new file mode 100644
> index 00000000000..dd2d03a7b96
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/loop-split1.c
> @@ -0,0 +1,101 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fsplit-loops -fdump-tree-lsplit-details" } */
> +
> +void
> +foo (int *a, int *b, unsigned l, unsigned n)
> +{
> +  while (++l != n)
> +    a[l] = b[l] + 1;
> +}
> +void
> +foo_1 (int *a, int *b, unsigned n)
> +{
> +  unsigned l = 0;
> +  while (++l != n)
> +    a[l] = b[l] + 1;
> +}
> +
> +void
> +foo1 (int *a, int *b, unsigned l, unsigned n)
> +{
> +  while (l++ != n)
> +    a[l] = b[l] + 1;
> +}
> +
> +/* No wrap.  */
> +void
> +foo1_1 (int *a, int *b, unsigned n)
> +{
> +  unsigned l = 0;
> +  while (l++ != n)
> +    a[l] = b[l] + 1;
> +}
> +
> +unsigned
> +foo2 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  while (++l != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +unsigned
> +foo2_1 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  l = 0;
> +  while (++l != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +unsigned
> +foo3 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  while (l++ != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +/* No wrap.  */
> +unsigned
> +foo3_1 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  l = 0;
> +  while (l++ != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +void
> +bar ();
> +void
> +foo4 (unsigned n, unsigned i)
> +{
> +  do
> +    {
> +      if (i == n)
> +	return;
> +      bar ();
> +      ++i;
> +    }
> +  while (1);
> +}
> +
> +unsigned
> +find_skip_diff (char *p, char *q, unsigned n, unsigned i)
> +{
> +  while (p[i] == q[i] && ++i != n)
> +    p++, q++;
> +
> +  return i;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "Loop split" 8 "lsplit" } } */
> diff --git a/gcc/testsuite/gcc.dg/loop-split2.c
> b/gcc/testsuite/gcc.dg/loop-split2.c
> new file mode 100644
> index 00000000000..56377e2f2f5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/loop-split2.c
> @@ -0,0 +1,155 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3" } */
> +
> +extern void
> +abort (void);
> +extern void
> +exit (int);
> +void
> +push (int);
> +
> +#define NI __attribute__ ((noinline))
> +
> +void NI
> +foo (int *a, int *b, unsigned char l, unsigned char n)
> +{
> +  while (++l != n)
> +    a[l] = b[l] + 1;
> +}
> +
> +unsigned NI
> +bar (int *a, int *b, unsigned char l, unsigned char n)
> +{
> +  while (l++ != n)
> +    {
> +      push (l);
> +      if (a[l] != b[l])
> +	break;
> +      push (l + 1);
> +    }
> +  return l;
> +}
> +
> +void NI
> +foo_1 (int *a, int *b, unsigned char l, unsigned char n)
> +{
> +  while (--l != n)
> +    a[l] = b[l] + 1;
> +}
> +
> +unsigned NI
> +bar_1 (int *a, int *b, unsigned char l, unsigned char n)
> +{
> +  while (l-- != n)
> +    {
> +      push (l);
> +      if (a[l] != b[l])
> +	break;
> +      push (l + 1);
> +    }
> +
> +  return l;
> +}
> +
> +int a[258];
> +int b[258];
> +int c[1024];
> +static int top = 0;
> +void
> +push (int e)
> +{
> +  c[top++] = e;
> +}
> +
> +void
> +reset ()
> +{
> +  top = 0;
> +  __builtin_memset (c, 0, sizeof (c));
> +}
> +
> +#define check(a, b) (a == b)
> +
> +int
> +check_c (int *c, int a0, int a1, int a2, int a3, int a4, int a5)
> +{
> +  return check (c[0], a0) && check (c[1], a1) && check (c[2], a2)
> +	 && check (c[3], a3) && check (c[4], a4) && check (c[5], a5);
> +}
> +
> +int
> +main ()
> +{
> +  __builtin_memcpy (b, a, sizeof (a));
> +  reset ();
> +  if (bar (a, b, 6, 8) != 9 || !check_c (c, 7, 8, 8, 9, 0, 0))
> +    abort ();
> +
> +  reset ();
> +  if (bar (a, b, 5, 3) != 4 || !check_c (c, 6, 7, 7, 8, 8, 9)
> +      || !check_c (c + 496, 254, 255, 255, 256, 0, 1))
> +    abort ();
> +
> +  reset ();
> +  if (bar (a, b, 6, 6) != 7 || !check_c (c, 0, 0, 0, 0, 0, 0))
> +    abort ();
> +
> +  reset ();
> +  if (bar (a, b, 253, 255) != 0 || !check_c (c, 254, 255, 255, 256, 0, 
> 0))
> +    abort ();
> +
> +  reset ();
> +  if (bar (a, b, 253, 0) != 1 || !check_c (c, 254, 255, 255, 256, 0, 
> 1))
> +    abort ();
> +
> +  reset ();
> +  if (bar_1 (a, b, 6, 8) != 7 || !check_c (c, 5, 6, 4, 5, 3, 4))
> +    abort ();
> +
> +  reset ();
> +  if (bar_1 (a, b, 5, 3) != 2 || !check_c (c, 4, 5, 3, 4, 0, 0))
> +    abort ();
> +
> +  reset ();
> +  if (bar_1 (a, b, 6, 6) != 5)
> +    abort ();
> +
> +  reset ();
> +  if (bar_1 (a, b, 2, 255) != 254 || !check_c (c, 1, 2, 0, 1, 255, 
> 256))
> +    abort ();
> +
> +  reset ();
> +  if (bar_1 (a, b, 2, 0) != 255 || !check_c (c, 1, 2, 0, 1, 0, 0))
> +    abort ();
> +
> +  b[100] += 1;
> +  reset ();
> +  if (bar (a, b, 90, 110) != 100)
> +    abort ();
> +
> +  reset ();
> +  if (bar (a, b, 110, 105) != 100)
> +    abort ();
> +
> +  reset ();
> +  if (bar_1 (a, b, 90, 110) != 109)
> +    abort ();
> +
> +  reset ();
> +  if (bar_1 (a, b, 2, 90) != 100)
> +    abort ();
> +
> +  foo (a, b, 99, 99);
> +  a[99] = b[99] + 1;
> +  for (int i = 0; i < 256; i++)
> +    if (a[i] != b[i] + 1)
> +      abort ();
> +
> +  foo_1 (a, b, 99, 99);
> +  a[99] = b[99] + 1;
> +  for (int i = 0; i < 256; i++)
> +    if (a[i] != b[i] + 1)
> +      abort ();
> +
> +  exit (0);
> +}
> diff --git a/gcc/testsuite/gcc.dg/loop-split3.c
> b/gcc/testsuite/gcc.dg/loop-split3.c
> new file mode 100644
> index 00000000000..ec93ee8bd12
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/loop-split3.c
> @@ -0,0 +1,62 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fsplit-loops -fdump-tree-lsplit-details" } */
> +
> +void
> +foo (int *a, int *b, unsigned l, unsigned n)
> +{
> +  while (--l != n)
> +    a[l] = b[l] + 1;
> +}
> +
> +void
> +foo1 (int *a, int *b, unsigned l, unsigned n)
> +{
> +  while (l-- != n)
> +    a[l] = b[l] + 1;
> +}
> +
> +unsigned
> +foo2 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  while (--l != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +unsigned
> +foo3 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  while (l-- != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +void
> +bar ();
> +void
> +foo4 (unsigned n, unsigned i)
> +{
> +  do
> +    {
> +      if (i == n)
> +	return;
> +      bar ();
> +      --i;
> +    }
> +  while (1);
> +}
> +
> +unsigned
> +find_skip_diff (char *p, char *q, unsigned n, unsigned i)
> +{
> +  while (p[i] == q[i] && --i != n)
> +    p--, q--;
> +
> +  return i;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "Loop split" 6 "lsplit" } } */
> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
> index 3a09bbc39e5..e9f23b32186 100644
> --- a/gcc/tree-ssa-loop-split.c
> +++ b/gcc/tree-ssa-loop-split.c
> @@ -41,6 +41,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "cfghooks.h"
>  #include "gimple-fold.h"
>  #include "gimplify-me.h"
> +#include "tree-ssa-loop-ivopts.h"
> 
>  /* This file implements two kinds of loop splitting.
> 
> @@ -229,11 +230,14 @@ easy_exit_values (class loop *loop)
>     conditional).  I.e. the second loop can now be entered either
>     via the original entry or via NEW_E, so the entry values of LOOP2
>     phi nodes are either the original ones or those at the exit
> -   of LOOP1.  Insert new phi nodes in LOOP2 pre-header reflecting
> -   this.  The loops need to fulfill easy_exit_values().  */
> +   of LOOP1.  Selecting the previous value instead next value as the
> +   exit value of LOOP1 if USE_PREV is true.  Insert new phi nodes in
> +   LOOP2 pre-header reflecting this.  The loops need to fulfill
> +   easy_exit_values().  */
> 
>  static void
> -connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e)
> +connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e,
> +		   bool use_prev = false)
>  {
>    basic_block rest = loop_preheader_edge (loop2)->src;
>    gcc_assert (new_e->dest == rest);
> @@ -279,7 +283,8 @@ connect_loop_phis (class loop *loop1, class loop
> *loop2, edge new_e)
> 
>        gphi * newphi = create_phi_node (new_init, rest);
>        add_phi_arg (newphi, init, skip_first, UNKNOWN_LOCATION);
> -      add_phi_arg (newphi, next, new_e, UNKNOWN_LOCATION);
> +      add_phi_arg (newphi, use_prev ? PHI_RESULT (phi_first) : next, 
> new_e,
> +		   UNKNOWN_LOCATION);
>        SET_USE (op, new_init);
>      }
>  }
> @@ -1593,6 +1598,252 @@ split_loop_on_cond (struct loop *loop)
>    return do_split;
>  }
> 
> +/* Check if the LOOP exit branch is like "if (idx != bound)",
> +   Return the branch edge which exit loop, if wrap may happen on 
> "idx".  */
> +
> +static edge
> +get_ne_cond_branch (struct loop *loop, tree *step)
> +{
> +  int i;
> +  edge e;
> +
> +  auto_vec<edge> edges = get_loop_exit_edges (loop);
> +  FOR_EACH_VEC_ELT (edges, i, e)
> +    {
> +      basic_block bb = e->src;
> +
> +      /* Check if exit at gcond.  */
> +      gimple *last = last_stmt (bb);
> +      if (!last || gimple_code (last) != GIMPLE_COND)
> +	continue;
> +      gcond *cond = as_a<gcond *> (last);
> +      enum tree_code code = gimple_cond_code (cond);
> +      if (!((code == NE_EXPR && (e->flags & EDGE_FALSE_VALUE))
> +	    || (code == EQ_EXPR && (e->flags & EDGE_TRUE_VALUE))))
> +	continue;
> +
> +      /* Check if bound is invarant.  */
> +      tree idx = gimple_cond_lhs (cond);
> +      tree bnd = gimple_cond_rhs (cond);
> +      if (expr_invariant_in_loop_p (loop, idx))
> +	std::swap (idx, bnd);
> +      else if (!expr_invariant_in_loop_p (loop, bnd))
> +	continue;
> +
> +      /* Only unsigned type conversion could cause wrap.  */
> +      tree type = TREE_TYPE (idx);
> +      if (!INTEGRAL_TYPE_P (type) || TREE_CODE (idx) != SSA_NAME
> +	  || !TYPE_UNSIGNED (type))
> +	continue;
> +
> +      /* Avoid to split if bound is MAX/MIN val.  */
> +      tree bound_type = TREE_TYPE (bnd);
> +      if (tree_int_cst_equal (bnd, TYPE_MAX_VALUE (bound_type))
> +	  || tree_int_cst_equal (bnd, TYPE_MIN_VALUE (bound_type)))
> +	continue;
> +
> +      /* Check if there is possible wrap.  */
> +      class tree_niter_desc niter;
> +      if (!number_of_iterations_exit (loop, e, &niter, false, false))
> +	continue;
> +      if (niter.control.no_overflow)
> +	return NULL;
> +      if (niter.cmp != NE_EXPR)
> +	continue;
> +      if (!integer_onep (niter.control.step)
> +	  && !integer_minus_onep (niter.control.step))
> +	continue;
> +      *step = niter.control.step;
> +
> +      /* If exit edge is just before the empty latch, it is easy to 
> link
> +	 the split loops: just jump from the exit edge of one loop to the
> +	 header of new loop.  */
> +      if (single_pred_p (loop->latch)
> +	  && single_pred_edge (loop->latch)->src == bb
> +	  && empty_block_p (loop->latch))
> +	return e;
> +
> +      /* If exit edge is at end of header, and header contains i++ or 
> ++i,
> +	 only, it is simple to link the split loops: jump from the end of
> +	 one loop header to the new loop header, and use unchanged PHI result
> +	 of the first loop as the entry PHI value of the second loop.  */
> +      if (bb == loop->header)
> +	{
> +	  /* Only one phi.  */
> +	  gphi_iterator psi = gsi_start_phis (bb);
> +	  if (gsi_end_p (psi))
> +	    continue;
> +	  gphi *phi = psi.phi ();
> +	  gsi_next (&psi);
> +	  if (!gsi_end_p (psi))
> +	    continue;
> +
> +	  /* Check it is ++i or ++i */
> +	  tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
> +	  tree prev = PHI_RESULT (phi);
> +	  if (idx != prev && idx != next)
> +	    continue;
> +
> +	  gimple_stmt_iterator gsi = gsi_start_nondebug_after_labels_bb (bb);
> +	  if (gsi_end_p (gsi))
> +	    continue;
> +	  gimple *s1 = gsi_stmt (gsi);
> +	  if (!is_gimple_assign (s1) || gimple_assign_lhs (s1) != next
> +	      || gimple_assign_rhs1 (s1) != prev)
> +	    continue;
> +
> +	  gsi_next_nondebug (&gsi);
> +	  if (!gsi_end_p (gsi) && gsi_stmt (gsi) == cond)
> +	    return e;
> +	}
> +    }
> +
> +  return NULL;
> +}
> +
> +/* Split the LOOP with NE_EXPR into two loops with GT_EXPR and 
> LT_EXPR.  */
> +
> +static bool
> +split_ne_loop (struct loop *loop, edge cond_e, tree step)
> +{
> +  initialize_original_copy_tables ();
> +
> +  struct loop *loop2 = loop_version (loop, boolean_true_node, NULL,
> +				     profile_probability::always (),
> +				     profile_probability::never (),
> +				     profile_probability::always (),
> +				     profile_probability::always (), true);
> +
> +  gcc_assert (loop2);
> +  update_ssa (TODO_update_ssa);
> +
> +  basic_block loop2_cond_exit_bb = get_bb_copy (cond_e->src);
> +  free_original_copy_tables ();
> +
> +  gcond *gc = as_a<gcond *> (last_stmt (cond_e->src));
> +  gcond *dup_gc = as_a<gcond *> (last_stmt (loop2_cond_exit_bb));
> +
> +  /* Invert edges for gcond.  */
> +  if (gimple_cond_code (gc) == EQ_EXPR)
> +    {
> +      auto invert_edge = [](basic_block bb) {
> +	edge out = EDGE_SUCC (bb, 0);
> +	edge in = EDGE_SUCC (bb, 1);
> +	if (in->flags & EDGE_TRUE_VALUE)
> +	  std::swap (in, out);
> +	in->flags |= EDGE_TRUE_VALUE;
> +	in->flags &= ~EDGE_FALSE_VALUE;
> +	out->flags |= EDGE_FALSE_VALUE;
> +	out->flags &= ~EDGE_TRUE_VALUE;
> +      };
> +
> +      invert_edge (gimple_bb (gc));
> +      invert_edge (gimple_bb (dup_gc));
> +    }
> +
> +  /* Change if (i != n) to LOOP1:if (i > n) and LOOP2:if (i < n) */
> +  bool inv = expr_invariant_in_loop_p (loop, gimple_cond_lhs (gc));
> +  if (tree_int_cst_sign_bit (step))
> +    inv = !inv;
> +  enum tree_code first_loop_code = inv ? LT_EXPR : GT_EXPR;
> +  enum tree_code second_loop_code = inv ? GT_EXPR : LT_EXPR;
> +
> +  gimple_cond_set_code (gc, first_loop_code);
> +  gimple_cond_set_code (dup_gc, second_loop_code);
> +
> +  /* Link the exit cond edge to new loop.  */
> +  gcond *break_cond = as_a<gcond *> (gimple_copy (gc));
> +  edge pred_e = single_pred_edge (loop->latch);
> +  bool simple_loop
> +    = pred_e && pred_e->src == cond_e->src && empty_block_p 
> (loop->latch);
> +  if (simple_loop)
> +    gimple_cond_set_code (break_cond, second_loop_code);
> +  else
> +    gimple_cond_make_true (break_cond);
> +
> +  basic_block break_bb = split_edge (cond_e);
> +  gimple_stmt_iterator gsi = gsi_last_bb (break_bb);
> +  gsi_insert_after (&gsi, break_cond, GSI_NEW_STMT);
> +
> +  edge to_exit = single_succ_edge (break_bb);
> +  edge to_new_loop = make_edge (break_bb, loop_preheader_edge 
> (loop2)->src, 0);
> +  to_new_loop->flags |= EDGE_TRUE_VALUE;
> +  to_exit->flags |= EDGE_FALSE_VALUE;
> +  to_exit->flags &= ~EDGE_FALLTHRU;
> +  to_exit->probability = cond_e->probability;
> +  to_new_loop->probability = to_exit->probability.invert ();
> +
> +  update_ssa (TODO_update_ssa);
> +
> +  connect_loop_phis (loop, loop2, to_new_loop, !simple_loop);
> +
> +  rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +    fprintf (dump_file, ";; Loop split on wrap index.\n");
> +
> +  return true;
> +}
> +
> +/* Checks if LOOP contains a suitable NE_EXPR conditional block to 
> split.
> +L_H:
> + if (i!=N)
> +   S;
> + else
> +   goto EXIT;
> + i++;
> + goto L_H;
> +
> +The "i!=N" is like "i>N || i<N", then it could be transformed to:
> +
> +L_H:
> + if (i>N)
> +   S;
> + else
> +   goto EXIT;
> + i++;
> + goto L_H;
> +L1_H:
> + if (i<N)
> +   S;
> + else
> +   goto EXIT;
> + i++;
> + goto L1_H;
> +
> +The loop with "i<N" is in favor of both GIMPLE and RTL passes.  */
> +
> +static bool
> +split_loop_on_ne_cond (class loop *loop)
> +{
> +  tree step;
> +  edge branch_edge = get_ne_cond_branch (loop, &step);
> +  if (!branch_edge)
> +    return false;
> +
> +  int num = 0;
> +  basic_block *bbs = get_loop_body (loop);
> +  for (unsigned i = 0; i < loop->num_nodes; i++)
> +    num += estimate_num_insns_seq (bb_seq (bbs[i]), 
> &eni_size_weights);
> +
> +  if (num > param_max_peeled_insns)
> +    {
> +      free (bbs);
> +      return false;
> +    }
> +
> +  if (!can_copy_bbs_p (bbs, loop->num_nodes))
> +    {
> +      free (bbs);
> +      return false;
> +    }
> +  free (bbs);
> +
> +  if (split_ne_loop (loop, branch_edge, step))
> +    return true;
> +
> +  return false;
> +}
> +
>  /* Main entry point.  Perform loop splitting on all suitable loops.  
> */
> 
>  static unsigned int
> @@ -1622,7 +1873,8 @@ tree_ssa_split_loops (void)
>        if (optimize_loop_for_size_p (loop))
>  	continue;
> 
> -      if (split_loop (loop) || split_loop_on_cond (loop))
> +      if (split_loop (loop) || split_loop_on_cond (loop)
> +	  || split_loop_on_ne_cond (loop))
>  	{
>  	  /* Mark our containing loop as having had some split inner loops.  
> */
>  	  loop_outer (loop)->aux = loop;
Richard Biener June 21, 2021, 8:51 a.m. UTC | #5
On Wed, 9 Jun 2021, guojiufu wrote:

> On 2021-06-09 17:42, guojiufu via Gcc-patches wrote:
> > On 2021-06-08 18:13, Richard Biener wrote:
> >> On Fri, 4 Jun 2021, Jiufu Guo wrote:
> >> 
> > cut...
> >>> +      gcond *cond = as_a<gcond *> (last);
> >>> +      enum tree_code code = gimple_cond_code (cond);
> >>> +      if (!(code == NE_EXPR
> >>> +	    || (code == EQ_EXPR && (e->flags & EDGE_TRUE_VALUE))))
> >> 
> >> The NE_EXPR check misses a corresponding && (e->flags & EDGE_FALSE_VALUE)
> >> check.
> >> 
> > Thanks, check (e->flags & EDGE_FALSE_VALUE) would be safer.
> > 
> >>> +	continue;
> >>> +
> >>> +      /* Check if bound is invarant.  */
> >>> +      tree idx = gimple_cond_lhs (cond);
> >>> +      tree bnd = gimple_cond_rhs (cond);
> >>> +      if (expr_invariant_in_loop_p (loop, idx))
> >>> +	std::swap (idx, bnd);
> >>> +      else if (!expr_invariant_in_loop_p (loop, bnd))
> >>> +	continue;
> >>> +
> >>> +      /* Only unsigned type conversion could cause wrap.  */
> >>> +      tree type = TREE_TYPE (idx);
> >>> +      if (!INTEGRAL_TYPE_P (type) || TREE_CODE (idx) != SSA_NAME
> >>> +	  || !TYPE_UNSIGNED (type))
> >>> +	continue;
> >>> +
> >>> +      /* Avoid to split if bound is MAX/MIN val.  */
> >>> +      tree bound_type = TREE_TYPE (bnd);
> >>> +      if (TREE_CODE (bnd) == INTEGER_CST && INTEGRAL_TYPE_P (bound_type)
> >>> +	  && (tree_int_cst_equal (bnd, TYPE_MAX_VALUE (bound_type))
> >>> +	      || tree_int_cst_equal (bnd, TYPE_MIN_VALUE (bound_type))))
> >>> +	continue;
> >> 
> >> Note you do not require 'bnd' to be constant and thus at runtime those
> >> cases still need to be handled correctly.
> > Yes, bnd is not required to be constant.  The above code is filtering the
> > case
> > where bnd is const max/min value of the type.  So, the code could be updated
> > as:
> >       if (tree_int_cst_equal (bnd, TYPE_MAX_VALUE (bound_type))
> >           || tree_int_cst_equal (bnd, TYPE_MIN_VALUE (bound_type)))

Yes, and the comment adjusted to "if bound is known to be MAX/MIN val."

> >> 
> >>> +      /* Check if there is possible wrap.  */
> >>> +      class tree_niter_desc niter;
> >>> +      if (!number_of_iterations_exit (loop, e, &niter, false, false))
> > cut...
> >>> +
> >>> +  /* Change if (i != n) to LOOP1:if (i > n) and LOOP2:if (i < n) */
> >> 
> >> It now occurs to me that we nowhere check the evolution of IDX
> >> (split_at_bb_p uses simple_iv for this for example).  The transform
> >> assumes that we will actually hit i == n and that i increments, but
> >> while you check the control IV from number_of_iterations_exit
> >> for NE_EXPR that does not guarantee a positive evolution.
> >> 
> > If I do not correctly reply your question, please point out:
> > number_of_iterations_exit is similar with simple_iv to invoke
> > simple_iv_with_niters
> > which check the evolution, and number_of_iterations_exit check
> > number_of_iterations_cond
> > which check no_overflow more accurate, this is one reason I use this
> > function.
> >
> > This transform assumes that the last run hits i==n.
> > Otherwise, the loop may run infinitely wrap after wrap.
> > For safe, if the step is 1 or -1,  this assumption would be true.  I
> > would add this check.

OK.

> > Thanks so much for pointing out I missed the negative step!
> > 
> >> Your testcases do not include any negative step examples, but I guess
> >> the conditions need to be swapped in this case?
> > 
> > I would add cases and code to support step 1/-1.
> > 
> >> 
> >> I think you also have to consider the order we split, say with
> >> 
> >>   for (i = start; i != end; ++i)
> >>     {
> >>       push (i);
> >>       if (a[i] != b[i])
> >>         break;
> >>     }
> >> 
> >> push (i) calls need to be in the same order for all cases of
> >> start < end, start == end and start > end (and also cover
> >> runtime testcases with end == 0 or end == UINT_MAX, likewise
> >> for start).
> > I add tests for the above cases. If missing sth, please point out, thanks!
> > 
> >> 
> >>> +  bool inv = expr_invariant_in_loop_p (loop, gimple_cond_lhs (gc));
> >>> +  enum tree_code up_code = inv ? LT_EXPR : GT_EXPR;
> >>> +  enum tree_code down_code = inv ? GT_EXPR : LT_EXPR;
> > cut....
> > 
> > Thanks again for the very helpful review!
> > 
> > BR,
> > Jiufu Guo.
> 
> Here is the updated patch, thanks for your time!
> 
> diff --git a/gcc/testsuite/gcc.dg/loop-split1.c
> b/gcc/testsuite/gcc.dg/loop-split1.c
> new file mode 100644
> index 00000000000..dd2d03a7b96
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/loop-split1.c
> @@ -0,0 +1,101 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fsplit-loops -fdump-tree-lsplit-details" } */
> +
> +void
> +foo (int *a, int *b, unsigned l, unsigned n)
> +{
> +  while (++l != n)
> +    a[l] = b[l] + 1;
> +}
> +void
> +foo_1 (int *a, int *b, unsigned n)
> +{
> +  unsigned l = 0;
> +  while (++l != n)
> +    a[l] = b[l] + 1;
> +}
> +
> +void
> +foo1 (int *a, int *b, unsigned l, unsigned n)
> +{
> +  while (l++ != n)
> +    a[l] = b[l] + 1;
> +}
> +
> +/* No wrap.  */
> +void
> +foo1_1 (int *a, int *b, unsigned n)
> +{
> +  unsigned l = 0;
> +  while (l++ != n)
> +    a[l] = b[l] + 1;
> +}
> +
> +unsigned
> +foo2 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  while (++l != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +unsigned
> +foo2_1 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  l = 0;
> +  while (++l != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +unsigned
> +foo3 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  while (l++ != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +/* No wrap.  */
> +unsigned
> +foo3_1 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  l = 0;
> +  while (l++ != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +void
> +bar ();
> +void
> +foo4 (unsigned n, unsigned i)
> +{
> +  do
> +    {
> +      if (i == n)
> +	return;
> +      bar ();
> +      ++i;
> +    }
> +  while (1);
> +}
> +
> +unsigned
> +find_skip_diff (char *p, char *q, unsigned n, unsigned i)
> +{
> +  while (p[i] == q[i] && ++i != n)
> +    p++, q++;
> +
> +  return i;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "Loop split" 8 "lsplit" } } */
> diff --git a/gcc/testsuite/gcc.dg/loop-split2.c
> b/gcc/testsuite/gcc.dg/loop-split2.c
> new file mode 100644
> index 00000000000..56377e2f2f5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/loop-split2.c
> @@ -0,0 +1,155 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3" } */
> +
> +extern void
> +abort (void);
> +extern void
> +exit (int);
> +void
> +push (int);
> +
> +#define NI __attribute__ ((noinline))
> +
> +void NI
> +foo (int *a, int *b, unsigned char l, unsigned char n)
> +{
> +  while (++l != n)
> +    a[l] = b[l] + 1;
> +}
> +
> +unsigned NI
> +bar (int *a, int *b, unsigned char l, unsigned char n)
> +{
> +  while (l++ != n)
> +    {
> +      push (l);
> +      if (a[l] != b[l])
> +	break;
> +      push (l + 1);
> +    }
> +  return l;
> +}
> +
> +void NI
> +foo_1 (int *a, int *b, unsigned char l, unsigned char n)
> +{
> +  while (--l != n)
> +    a[l] = b[l] + 1;
> +}
> +
> +unsigned NI
> +bar_1 (int *a, int *b, unsigned char l, unsigned char n)
> +{
> +  while (l-- != n)
> +    {
> +      push (l);
> +      if (a[l] != b[l])
> +	break;
> +      push (l + 1);
> +    }
> +
> +  return l;
> +}
> +
> +int a[258];
> +int b[258];
> +int c[1024];
> +static int top = 0;
> +void
> +push (int e)
> +{
> +  c[top++] = e;
> +}
> +
> +void
> +reset ()
> +{
> +  top = 0;
> +  __builtin_memset (c, 0, sizeof (c));
> +}
> +
> +#define check(a, b) (a == b)
> +
> +int
> +check_c (int *c, int a0, int a1, int a2, int a3, int a4, int a5)
> +{
> +  return check (c[0], a0) && check (c[1], a1) && check (c[2], a2)
> +	 && check (c[3], a3) && check (c[4], a4) && check (c[5], a5);
> +}
> +
> +int
> +main ()
> +{
> +  __builtin_memcpy (b, a, sizeof (a));
> +  reset ();
> +  if (bar (a, b, 6, 8) != 9 || !check_c (c, 7, 8, 8, 9, 0, 0))
> +    abort ();
> +
> +  reset ();
> +  if (bar (a, b, 5, 3) != 4 || !check_c (c, 6, 7, 7, 8, 8, 9)
> +      || !check_c (c + 496, 254, 255, 255, 256, 0, 1))
> +    abort ();
> +
> +  reset ();
> +  if (bar (a, b, 6, 6) != 7 || !check_c (c, 0, 0, 0, 0, 0, 0))
> +    abort ();
> +
> +  reset ();
> +  if (bar (a, b, 253, 255) != 0 || !check_c (c, 254, 255, 255, 256, 0, 0))
> +    abort ();
> +
> +  reset ();
> +  if (bar (a, b, 253, 0) != 1 || !check_c (c, 254, 255, 255, 256, 0, 1))
> +    abort ();
> +
> +  reset ();
> +  if (bar_1 (a, b, 6, 8) != 7 || !check_c (c, 5, 6, 4, 5, 3, 4))
> +    abort ();
> +
> +  reset ();
> +  if (bar_1 (a, b, 5, 3) != 2 || !check_c (c, 4, 5, 3, 4, 0, 0))
> +    abort ();
> +
> +  reset ();
> +  if (bar_1 (a, b, 6, 6) != 5)
> +    abort ();
> +
> +  reset ();
> +  if (bar_1 (a, b, 2, 255) != 254 || !check_c (c, 1, 2, 0, 1, 255, 256))
> +    abort ();
> +
> +  reset ();
> +  if (bar_1 (a, b, 2, 0) != 255 || !check_c (c, 1, 2, 0, 1, 0, 0))
> +    abort ();
> +
> +  b[100] += 1;
> +  reset ();
> +  if (bar (a, b, 90, 110) != 100)
> +    abort ();
> +
> +  reset ();
> +  if (bar (a, b, 110, 105) != 100)
> +    abort ();
> +
> +  reset ();
> +  if (bar_1 (a, b, 90, 110) != 109)
> +    abort ();
> +
> +  reset ();
> +  if (bar_1 (a, b, 2, 90) != 100)
> +    abort ();
> +
> +  foo (a, b, 99, 99);
> +  a[99] = b[99] + 1;
> +  for (int i = 0; i < 256; i++)
> +    if (a[i] != b[i] + 1)
> +      abort ();
> +
> +  foo_1 (a, b, 99, 99);
> +  a[99] = b[99] + 1;
> +  for (int i = 0; i < 256; i++)
> +    if (a[i] != b[i] + 1)
> +      abort ();
> +
> +  exit (0);
> +}
> diff --git a/gcc/testsuite/gcc.dg/loop-split3.c
> b/gcc/testsuite/gcc.dg/loop-split3.c
> new file mode 100644
> index 00000000000..ec93ee8bd12
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/loop-split3.c
> @@ -0,0 +1,62 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fsplit-loops -fdump-tree-lsplit-details" } */
> +
> +void
> +foo (int *a, int *b, unsigned l, unsigned n)
> +{
> +  while (--l != n)
> +    a[l] = b[l] + 1;
> +}
> +
> +void
> +foo1 (int *a, int *b, unsigned l, unsigned n)
> +{
> +  while (l-- != n)
> +    a[l] = b[l] + 1;
> +}
> +
> +unsigned
> +foo2 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  while (--l != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +unsigned
> +foo3 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  while (l-- != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +void
> +bar ();
> +void
> +foo4 (unsigned n, unsigned i)
> +{
> +  do
> +    {
> +      if (i == n)
> +	return;
> +      bar ();
> +      --i;
> +    }
> +  while (1);
> +}
> +
> +unsigned
> +find_skip_diff (char *p, char *q, unsigned n, unsigned i)
> +{
> +  while (p[i] == q[i] && --i != n)
> +    p--, q--;
> +
> +  return i;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "Loop split" 6 "lsplit" } } */
> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
> index 3a09bbc39e5..e9f23b32186 100644
> --- a/gcc/tree-ssa-loop-split.c
> +++ b/gcc/tree-ssa-loop-split.c
> @@ -41,6 +41,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "cfghooks.h"
>  #include "gimple-fold.h"
>  #include "gimplify-me.h"
> +#include "tree-ssa-loop-ivopts.h"
> 
>  /* This file implements two kinds of loop splitting.
> 
> @@ -229,11 +230,14 @@ easy_exit_values (class loop *loop)
>     conditional).  I.e. the second loop can now be entered either
>     via the original entry or via NEW_E, so the entry values of LOOP2
>     phi nodes are either the original ones or those at the exit
> -   of LOOP1.  Insert new phi nodes in LOOP2 pre-header reflecting
> -   this.  The loops need to fulfill easy_exit_values().  */
> +   of LOOP1.  Selecting the previous value instead next value as the
> +   exit value of LOOP1 if USE_PREV is true.  Insert new phi nodes in
> +   LOOP2 pre-header reflecting this.  The loops need to fulfill
> +   easy_exit_values().  */
> 
>  static void
> -connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e)
> +connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e,
> +		   bool use_prev = false)
>  {
>    basic_block rest = loop_preheader_edge (loop2)->src;
>    gcc_assert (new_e->dest == rest);
> @@ -279,7 +283,8 @@ connect_loop_phis (class loop *loop1, class loop *loop2,
> edge new_e)
> 
>        gphi * newphi = create_phi_node (new_init, rest);
>        add_phi_arg (newphi, init, skip_first, UNKNOWN_LOCATION);
> -      add_phi_arg (newphi, next, new_e, UNKNOWN_LOCATION);
> +      add_phi_arg (newphi, use_prev ? PHI_RESULT (phi_first) : next, new_e,
> +		   UNKNOWN_LOCATION);
>        SET_USE (op, new_init);
>      }
>  }
> @@ -1593,6 +1598,252 @@ split_loop_on_cond (struct loop *loop)
>    return do_split;
>  }
> 
> +/* Check if the LOOP exit branch is like "if (idx != bound)",
> +   Return the branch edge which exit loop, if wrap may happen on "idx".  */
> +
> +static edge
> +get_ne_cond_branch (struct loop *loop, tree *step)
> +{
> +  int i;
> +  edge e;
> +
> +  auto_vec<edge> edges = get_loop_exit_edges (loop);
> +  FOR_EACH_VEC_ELT (edges, i, e)
> +    {
> +      basic_block bb = e->src;
> +
> +      /* Check if exit at gcond.  */
> +      gimple *last = last_stmt (bb);
> +      if (!last || gimple_code (last) != GIMPLE_COND)
> +	continue;
> +      gcond *cond = as_a<gcond *> (last);

   gcond *cont = safe_dyn_cast <gcond *> (last_stmt (bb));
   if (!last)
     continue;

is shorter.

> +      enum tree_code code = gimple_cond_code (cond);
> +      if (!((code == NE_EXPR && (e->flags & EDGE_FALSE_VALUE))
> +	    || (code == EQ_EXPR && (e->flags & EDGE_TRUE_VALUE))))
> +	continue;
> +
> +      /* Check if bound is invarant.  */
> +      tree idx = gimple_cond_lhs (cond);
> +      tree bnd = gimple_cond_rhs (cond);
> +      if (expr_invariant_in_loop_p (loop, idx))
> +	std::swap (idx, bnd);
> +      else if (!expr_invariant_in_loop_p (loop, bnd))
> +	continue;
> +
> +      /* Only unsigned type conversion could cause wrap.  */
> +      tree type = TREE_TYPE (idx);
> +      if (!INTEGRAL_TYPE_P (type) || TREE_CODE (idx) != SSA_NAME
> +	  || !TYPE_UNSIGNED (type))
> +	continue;
> +
> +      /* Avoid to split if bound is MAX/MIN val.  */
> +      tree bound_type = TREE_TYPE (bnd);
> +      if (tree_int_cst_equal (bnd, TYPE_MAX_VALUE (bound_type))
> +	  || tree_int_cst_equal (bnd, TYPE_MIN_VALUE (bound_type)))
> +	continue;
> +
> +      /* Check if there is possible wrap.  */
> +      class tree_niter_desc niter;
> +      if (!number_of_iterations_exit (loop, e, &niter, false, false))
> +	continue;
> +      if (niter.control.no_overflow)
> +	return NULL;
> +      if (niter.cmp != NE_EXPR)
> +	continue;
> +      if (!integer_onep (niter.control.step)
> +	  && !integer_minus_onep (niter.control.step))
> +	continue;
> +      *step = niter.control.step;
> +
> +      /* If exit edge is just before the empty latch, it is easy to link
> +	 the split loops: just jump from the exit edge of one loop to the
> +	 header of new loop.  */
> +      if (single_pred_p (loop->latch)
> +	  && single_pred_edge (loop->latch)->src == bb
> +	  && empty_block_p (loop->latch))
> +	return e;
> +
> +      /* If exit edge is at end of header, and header contains i++ or ++i,
> +	 only, it is simple to link the split loops: jump from the end of
> +	 one loop header to the new loop header, and use unchanged PHI result
> +	 of the first loop as the entry PHI value of the second loop.  */
> +      if (bb == loop->header)
> +	{
> +	  /* Only one phi.  */
> +	  gphi_iterator psi = gsi_start_phis (bb);
> +	  if (gsi_end_p (psi))
> +	    continue;
> +	  gphi *phi = psi.phi ();
> +	  gsi_next (&psi);
> +	  if (!gsi_end_p (psi))
> +	    continue;
> +
> +	  /* Check it is ++i or ++i */
> +	  tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
> +	  tree prev = PHI_RESULT (phi);
> +	  if (idx != prev && idx != next)
> +	    continue;
> +
> +	  gimple_stmt_iterator gsi = gsi_start_nondebug_after_labels_bb (bb);
> +	  if (gsi_end_p (gsi))
> +	    continue;
> +	  gimple *s1 = gsi_stmt (gsi);
> +	  if (!is_gimple_assign (s1) || gimple_assign_lhs (s1) != next
> +	      || gimple_assign_rhs1 (s1) != prev)
> +	    continue;
> +
> +	  gsi_next_nondebug (&gsi);
> +	  if (!gsi_end_p (gsi) && gsi_stmt (gsi) == cond)
> +	    return e;
> +	}
> +    }
> +
> +  return NULL;
> +}
> +
> +/* Split the LOOP with NE_EXPR into two loops with GT_EXPR and LT_EXPR.  */
> +
> +static bool
> +split_ne_loop (struct loop *loop, edge cond_e, tree step)
> +{
> +  initialize_original_copy_tables ();
> +
> +  struct loop *loop2 = loop_version (loop, boolean_true_node, NULL,
> +				     profile_probability::always (),
> +				     profile_probability::never (),
> +				     profile_probability::always (),
> +				     profile_probability::always (), true);
> +
> +  gcc_assert (loop2);
> +  update_ssa (TODO_update_ssa);
> +
> +  basic_block loop2_cond_exit_bb = get_bb_copy (cond_e->src);
> +  free_original_copy_tables ();
> +
> +  gcond *gc = as_a<gcond *> (last_stmt (cond_e->src));
> +  gcond *dup_gc = as_a<gcond *> (last_stmt (loop2_cond_exit_bb));
> +
> +  /* Invert edges for gcond.  */
> +  if (gimple_cond_code (gc) == EQ_EXPR)
> +    {
> +      auto invert_edge = [](basic_block bb) {
> +	edge out = EDGE_SUCC (bb, 0);
> +	edge in = EDGE_SUCC (bb, 1);
> +	if (in->flags & EDGE_TRUE_VALUE)
> +	  std::swap (in, out);
> +	in->flags |= EDGE_TRUE_VALUE;
> +	in->flags &= ~EDGE_FALSE_VALUE;
> +	out->flags |= EDGE_FALSE_VALUE;
> +	out->flags &= ~EDGE_TRUE_VALUE;
> +      };
> +
> +      invert_edge (gimple_bb (gc));
> +      invert_edge (gimple_bb (dup_gc));
> +    }
> +
> +  /* Change if (i != n) to LOOP1:if (i > n) and LOOP2:if (i < n) */
> +  bool inv = expr_invariant_in_loop_p (loop, gimple_cond_lhs (gc));
> +  if (tree_int_cst_sign_bit (step))
> +    inv = !inv;
> +  enum tree_code first_loop_code = inv ? LT_EXPR : GT_EXPR;
> +  enum tree_code second_loop_code = inv ? GT_EXPR : LT_EXPR;

You could handle gimple_cond_code (gc) == EQ_EXPR via

   if (gimple_cond_code (gc) == EQ_EXPR)
    {
     first_loop_code = invert_tree_comparison (first_loop_code, false);
     second_loop_code = invert_tree_comparison (second_loop_code, false);
   }

that looks simpler than the lambda dance with inverting the edge flags.

> +  gimple_cond_set_code (gc, first_loop_code);
> +  gimple_cond_set_code (dup_gc, second_loop_code);
> +
> +  /* Link the exit cond edge to new loop.  */
> +  gcond *break_cond = as_a<gcond *> (gimple_copy (gc));
> +  edge pred_e = single_pred_edge (loop->latch);
> +  bool simple_loop
> +    = pred_e && pred_e->src == cond_e->src && empty_block_p (loop->latch);
> +  if (simple_loop)
> +    gimple_cond_set_code (break_cond, second_loop_code);
> +  else
> +    gimple_cond_make_true (break_cond);
> +
> +  basic_block break_bb = split_edge (cond_e);
> +  gimple_stmt_iterator gsi = gsi_last_bb (break_bb);
> +  gsi_insert_after (&gsi, break_cond, GSI_NEW_STMT);
> +
> +  edge to_exit = single_succ_edge (break_bb);
> +  edge to_new_loop = make_edge (break_bb, loop_preheader_edge (loop2)->src,
> 0);
> +  to_new_loop->flags |= EDGE_TRUE_VALUE;
> +  to_exit->flags |= EDGE_FALSE_VALUE;
> +  to_exit->flags &= ~EDGE_FALLTHRU;
> +  to_exit->probability = cond_e->probability;
> +  to_new_loop->probability = to_exit->probability.invert ();
> +
> +  update_ssa (TODO_update_ssa);

I've re-organized the pass to perform a single TODO_update_ssa at the
very end, please do not update SSA form here, nor

> +  connect_loop_phis (loop, loop2, to_new_loop, !simple_loop);
> +
> +  rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);

re-write into loop-closed SSA.

> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +    fprintf (dump_file, ";; Loop split on wrap index.\n");
> +
> +  return true;
> +}
> +
> +/* Checks if LOOP contains a suitable NE_EXPR conditional block to split.
> +L_H:
> + if (i!=N)
> +   S;
> + else
> +   goto EXIT;
> + i++;
> + goto L_H;
> +
> +The "i!=N" is like "i>N || i<N", then it could be transformed to:
> +
> +L_H:
> + if (i>N)
> +   S;
> + else
> +   goto EXIT;
> + i++;
> + goto L_H;
> +L1_H:
> + if (i<N)
> +   S;
> + else
> +   goto EXIT;
> + i++;
> + goto L1_H;
> +
> +The loop with "i<N" is in favor of both GIMPLE and RTL passes.  */
> +
> +static bool
> +split_loop_on_ne_cond (class loop *loop)
> +{
> +  tree step;
> +  edge branch_edge = get_ne_cond_branch (loop, &step);
> +  if (!branch_edge)
> +    return false;
> +
> +  int num = 0;
> +  basic_block *bbs = get_loop_body (loop);
> +  for (unsigned i = 0; i < loop->num_nodes; i++)
> +    num += estimate_num_insns_seq (bb_seq (bbs[i]), &eni_size_weights);

Since the motivation is to make data-refs analyzable after the transform
if there are no datarefs the transform only increases code-size.  In
particular I would look for calls which will be not analyzable.  Since
we're looking at each stmt above that could be embedded here.  As said
earlier once num exceeds param_max_peeled_insns you can stop the above
loop.  So heuristically I'd do sth like

    for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p 
(gsi); gsi_next (&gsi))
      {
        gimple  *stmt = gsi_stmt (gsi);
        if (is_gimple_debug (stmt))
          continue;
        if (gimple_has_side_effects (stmt))
          {
            free (bbs);
            return false;
          }
        num += estimate_num_insns (stmt, &eni_size_weights);
        if (num > param_max_peeled_insns)
          {
            free (bbs);
            return false;
          }
        if (gimple_vuse (stmt))
          any_dr = true;
      }
    if (!any_dr)
      {
        free (bbs);
        return false;
      }

There's also still the issue that the transformed loop will fail
number of iteration analysis for the loop that iterates until
the IV wraps.  That's a blocker for the acceptance of this transform.

Richard.
Jiufu Guo June 22, 2021, 3:55 a.m. UTC | #6
On 2021-06-21 16:51, Richard Biener wrote:
> On Wed, 9 Jun 2021, guojiufu wrote:
> 
>> On 2021-06-09 17:42, guojiufu via Gcc-patches wrote:
>> > On 2021-06-08 18:13, Richard Biener wrote:
>> >> On Fri, 4 Jun 2021, Jiufu Guo wrote:
>> >>
>> > cut...
>> >>> +      gcond *cond = as_a<gcond *> (last);
>> >>> +      enum tree_code code = gimple_cond_code (cond);
>> >>> +      if (!(code == NE_EXPR
>> >>> +	    || (code == EQ_EXPR && (e->flags & EDGE_TRUE_VALUE))))
>> >>
>> >> The NE_EXPR check misses a corresponding && (e->flags & EDGE_FALSE_VALUE)
>> >> check.
>> >>
>> > Thanks, check (e->flags & EDGE_FALSE_VALUE) would be safer.
>> >
>> >>> +	continue;
>> >>> +
>> >>> +      /* Check if bound is invarant.  */
>> >>> +      tree idx = gimple_cond_lhs (cond);
>> >>> +      tree bnd = gimple_cond_rhs (cond);
>> >>> +      if (expr_invariant_in_loop_p (loop, idx))
>> >>> +	std::swap (idx, bnd);
>> >>> +      else if (!expr_invariant_in_loop_p (loop, bnd))
>> >>> +	continue;
>> >>> +
>> >>> +      /* Only unsigned type conversion could cause wrap.  */
>> >>> +      tree type = TREE_TYPE (idx);
>> >>> +      if (!INTEGRAL_TYPE_P (type) || TREE_CODE (idx) != SSA_NAME
>> >>> +	  || !TYPE_UNSIGNED (type))
>> >>> +	continue;
>> >>> +
>> >>> +      /* Avoid to split if bound is MAX/MIN val.  */
>> >>> +      tree bound_type = TREE_TYPE (bnd);
>> >>> +      if (TREE_CODE (bnd) == INTEGER_CST && INTEGRAL_TYPE_P (bound_type)
>> >>> +	  && (tree_int_cst_equal (bnd, TYPE_MAX_VALUE (bound_type))
>> >>> +	      || tree_int_cst_equal (bnd, TYPE_MIN_VALUE (bound_type))))
>> >>> +	continue;
>> >>
>> >> Note you do not require 'bnd' to be constant and thus at runtime those
>> >> cases still need to be handled correctly.
>> > Yes, bnd is not required to be constant.  The above code is filtering the
>> > case
>> > where bnd is const max/min value of the type.  So, the code could be updated
>> > as:
>> >       if (tree_int_cst_equal (bnd, TYPE_MAX_VALUE (bound_type))
>> >           || tree_int_cst_equal (bnd, TYPE_MIN_VALUE (bound_type)))
> 
> Yes, and the comment adjusted to "if bound is known to be MAX/MIN val."
> 
>> >>
>> >>> +      /* Check if there is possible wrap.  */
>> >>> +      class tree_niter_desc niter;
>> >>> +      if (!number_of_iterations_exit (loop, e, &niter, false, false))
>> > cut...
>> >>> +
>> >>> +  /* Change if (i != n) to LOOP1:if (i > n) and LOOP2:if (i < n) */
>> >>
>> >> It now occurs to me that we nowhere check the evolution of IDX
>> >> (split_at_bb_p uses simple_iv for this for example).  The transform
>> >> assumes that we will actually hit i == n and that i increments, but
>> >> while you check the control IV from number_of_iterations_exit
>> >> for NE_EXPR that does not guarantee a positive evolution.
>> >>
>> > If I do not correctly reply your question, please point out:
>> > number_of_iterations_exit is similar with simple_iv to invoke
>> > simple_iv_with_niters
>> > which check the evolution, and number_of_iterations_exit check
>> > number_of_iterations_cond
>> > which check no_overflow more accurate, this is one reason I use this
>> > function.
>> >
>> > This transform assumes that the last run hits i==n.
>> > Otherwise, the loop may run infinitely wrap after wrap.
>> > For safe, if the step is 1 or -1,  this assumption would be true.  I
>> > would add this check.
> 
> OK.
> 
>> > Thanks so much for pointing out I missed the negative step!
>> >
>> >> Your testcases do not include any negative step examples, but I guess
>> >> the conditions need to be swapped in this case?
>> >
>> > I would add cases and code to support step 1/-1.
>> >
>> >>
>> >> I think you also have to consider the order we split, say with
>> >>
>> >>   for (i = start; i != end; ++i)
>> >>     {
>> >>       push (i);
>> >>       if (a[i] != b[i])
>> >>         break;
>> >>     }
>> >>
>> >> push (i) calls need to be in the same order for all cases of
>> >> start < end, start == end and start > end (and also cover
>> >> runtime testcases with end == 0 or end == UINT_MAX, likewise
>> >> for start).
>> > I add tests for the above cases. If missing sth, please point out, thanks!
>> >
>> >>
>> >>> +  bool inv = expr_invariant_in_loop_p (loop, gimple_cond_lhs (gc));
>> >>> +  enum tree_code up_code = inv ? LT_EXPR : GT_EXPR;
>> >>> +  enum tree_code down_code = inv ? GT_EXPR : LT_EXPR;
>> > cut....
>> >
>> > Thanks again for the very helpful review!
>> >
>> > BR,
>> > Jiufu Guo.
>> 
>> Here is the updated patch, thanks for your time!
>> 
>> diff --git a/gcc/testsuite/gcc.dg/loop-split1.c
>> b/gcc/testsuite/gcc.dg/loop-split1.c
>> new file mode 100644
>> index 00000000000..dd2d03a7b96
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/loop-split1.c
>> @@ -0,0 +1,101 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2 -fsplit-loops -fdump-tree-lsplit-details" } */
>> +
>> +void
>> +foo (int *a, int *b, unsigned l, unsigned n)
>> +{
>> +  while (++l != n)
>> +    a[l] = b[l] + 1;
>> +}
>> +void
>> +foo_1 (int *a, int *b, unsigned n)
>> +{
>> +  unsigned l = 0;
>> +  while (++l != n)
>> +    a[l] = b[l] + 1;
>> +}
>> +
>> +void
>> +foo1 (int *a, int *b, unsigned l, unsigned n)
>> +{
>> +  while (l++ != n)
>> +    a[l] = b[l] + 1;
>> +}
>> +
>> +/* No wrap.  */
>> +void
>> +foo1_1 (int *a, int *b, unsigned n)
>> +{
>> +  unsigned l = 0;
>> +  while (l++ != n)
>> +    a[l] = b[l] + 1;
>> +}
>> +
>> +unsigned
>> +foo2 (char *a, char *b, unsigned l, unsigned n)
>> +{
>> +  while (++l != n)
>> +    if (a[l] != b[l])
>> +      break;
>> +
>> +  return l;
>> +}
>> +
>> +unsigned
>> +foo2_1 (char *a, char *b, unsigned l, unsigned n)
>> +{
>> +  l = 0;
>> +  while (++l != n)
>> +    if (a[l] != b[l])
>> +      break;
>> +
>> +  return l;
>> +}
>> +
>> +unsigned
>> +foo3 (char *a, char *b, unsigned l, unsigned n)
>> +{
>> +  while (l++ != n)
>> +    if (a[l] != b[l])
>> +      break;
>> +
>> +  return l;
>> +}
>> +
>> +/* No wrap.  */
>> +unsigned
>> +foo3_1 (char *a, char *b, unsigned l, unsigned n)
>> +{
>> +  l = 0;
>> +  while (l++ != n)
>> +    if (a[l] != b[l])
>> +      break;
>> +
>> +  return l;
>> +}
>> +
>> +void
>> +bar ();
>> +void
>> +foo4 (unsigned n, unsigned i)
>> +{
>> +  do
>> +    {
>> +      if (i == n)
>> +	return;
>> +      bar ();
>> +      ++i;
>> +    }
>> +  while (1);
>> +}
>> +
>> +unsigned
>> +find_skip_diff (char *p, char *q, unsigned n, unsigned i)
>> +{
>> +  while (p[i] == q[i] && ++i != n)
>> +    p++, q++;
>> +
>> +  return i;
>> +}
>> +
>> +/* { dg-final { scan-tree-dump-times "Loop split" 8 "lsplit" } } */
>> diff --git a/gcc/testsuite/gcc.dg/loop-split2.c
>> b/gcc/testsuite/gcc.dg/loop-split2.c
>> new file mode 100644
>> index 00000000000..56377e2f2f5
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/loop-split2.c
>> @@ -0,0 +1,155 @@
>> +/* { dg-do run } */
>> +/* { dg-options "-O3" } */
>> +
>> +extern void
>> +abort (void);
>> +extern void
>> +exit (int);
>> +void
>> +push (int);
>> +
>> +#define NI __attribute__ ((noinline))
>> +
>> +void NI
>> +foo (int *a, int *b, unsigned char l, unsigned char n)
>> +{
>> +  while (++l != n)
>> +    a[l] = b[l] + 1;
>> +}
>> +
>> +unsigned NI
>> +bar (int *a, int *b, unsigned char l, unsigned char n)
>> +{
>> +  while (l++ != n)
>> +    {
>> +      push (l);
>> +      if (a[l] != b[l])
>> +	break;
>> +      push (l + 1);
>> +    }
>> +  return l;
>> +}
>> +
>> +void NI
>> +foo_1 (int *a, int *b, unsigned char l, unsigned char n)
>> +{
>> +  while (--l != n)
>> +    a[l] = b[l] + 1;
>> +}
>> +
>> +unsigned NI
>> +bar_1 (int *a, int *b, unsigned char l, unsigned char n)
>> +{
>> +  while (l-- != n)
>> +    {
>> +      push (l);
>> +      if (a[l] != b[l])
>> +	break;
>> +      push (l + 1);
>> +    }
>> +
>> +  return l;
>> +}
>> +
>> +int a[258];
>> +int b[258];
>> +int c[1024];
>> +static int top = 0;
>> +void
>> +push (int e)
>> +{
>> +  c[top++] = e;
>> +}
>> +
>> +void
>> +reset ()
>> +{
>> +  top = 0;
>> +  __builtin_memset (c, 0, sizeof (c));
>> +}
>> +
>> +#define check(a, b) (a == b)
>> +
>> +int
>> +check_c (int *c, int a0, int a1, int a2, int a3, int a4, int a5)
>> +{
>> +  return check (c[0], a0) && check (c[1], a1) && check (c[2], a2)
>> +	 && check (c[3], a3) && check (c[4], a4) && check (c[5], a5);
>> +}
>> +
>> +int
>> +main ()
>> +{
>> +  __builtin_memcpy (b, a, sizeof (a));
>> +  reset ();
>> +  if (bar (a, b, 6, 8) != 9 || !check_c (c, 7, 8, 8, 9, 0, 0))
>> +    abort ();
>> +
>> +  reset ();
>> +  if (bar (a, b, 5, 3) != 4 || !check_c (c, 6, 7, 7, 8, 8, 9)
>> +      || !check_c (c + 496, 254, 255, 255, 256, 0, 1))
>> +    abort ();
>> +
>> +  reset ();
>> +  if (bar (a, b, 6, 6) != 7 || !check_c (c, 0, 0, 0, 0, 0, 0))
>> +    abort ();
>> +
>> +  reset ();
>> +  if (bar (a, b, 253, 255) != 0 || !check_c (c, 254, 255, 255, 256, 
>> 0, 0))
>> +    abort ();
>> +
>> +  reset ();
>> +  if (bar (a, b, 253, 0) != 1 || !check_c (c, 254, 255, 255, 256, 0, 
>> 1))
>> +    abort ();
>> +
>> +  reset ();
>> +  if (bar_1 (a, b, 6, 8) != 7 || !check_c (c, 5, 6, 4, 5, 3, 4))
>> +    abort ();
>> +
>> +  reset ();
>> +  if (bar_1 (a, b, 5, 3) != 2 || !check_c (c, 4, 5, 3, 4, 0, 0))
>> +    abort ();
>> +
>> +  reset ();
>> +  if (bar_1 (a, b, 6, 6) != 5)
>> +    abort ();
>> +
>> +  reset ();
>> +  if (bar_1 (a, b, 2, 255) != 254 || !check_c (c, 1, 2, 0, 1, 255, 
>> 256))
>> +    abort ();
>> +
>> +  reset ();
>> +  if (bar_1 (a, b, 2, 0) != 255 || !check_c (c, 1, 2, 0, 1, 0, 0))
>> +    abort ();
>> +
>> +  b[100] += 1;
>> +  reset ();
>> +  if (bar (a, b, 90, 110) != 100)
>> +    abort ();
>> +
>> +  reset ();
>> +  if (bar (a, b, 110, 105) != 100)
>> +    abort ();
>> +
>> +  reset ();
>> +  if (bar_1 (a, b, 90, 110) != 109)
>> +    abort ();
>> +
>> +  reset ();
>> +  if (bar_1 (a, b, 2, 90) != 100)
>> +    abort ();
>> +
>> +  foo (a, b, 99, 99);
>> +  a[99] = b[99] + 1;
>> +  for (int i = 0; i < 256; i++)
>> +    if (a[i] != b[i] + 1)
>> +      abort ();
>> +
>> +  foo_1 (a, b, 99, 99);
>> +  a[99] = b[99] + 1;
>> +  for (int i = 0; i < 256; i++)
>> +    if (a[i] != b[i] + 1)
>> +      abort ();
>> +
>> +  exit (0);
>> +}
>> diff --git a/gcc/testsuite/gcc.dg/loop-split3.c
>> b/gcc/testsuite/gcc.dg/loop-split3.c
>> new file mode 100644
>> index 00000000000..ec93ee8bd12
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/loop-split3.c
>> @@ -0,0 +1,62 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2 -fsplit-loops -fdump-tree-lsplit-details" } */
>> +
>> +void
>> +foo (int *a, int *b, unsigned l, unsigned n)
>> +{
>> +  while (--l != n)
>> +    a[l] = b[l] + 1;
>> +}
>> +
>> +void
>> +foo1 (int *a, int *b, unsigned l, unsigned n)
>> +{
>> +  while (l-- != n)
>> +    a[l] = b[l] + 1;
>> +}
>> +
>> +unsigned
>> +foo2 (char *a, char *b, unsigned l, unsigned n)
>> +{
>> +  while (--l != n)
>> +    if (a[l] != b[l])
>> +      break;
>> +
>> +  return l;
>> +}
>> +
>> +unsigned
>> +foo3 (char *a, char *b, unsigned l, unsigned n)
>> +{
>> +  while (l-- != n)
>> +    if (a[l] != b[l])
>> +      break;
>> +
>> +  return l;
>> +}
>> +
>> +void
>> +bar ();
>> +void
>> +foo4 (unsigned n, unsigned i)
>> +{
>> +  do
>> +    {
>> +      if (i == n)
>> +	return;
>> +      bar ();
>> +      --i;
>> +    }
>> +  while (1);
>> +}
>> +
>> +unsigned
>> +find_skip_diff (char *p, char *q, unsigned n, unsigned i)
>> +{
>> +  while (p[i] == q[i] && --i != n)
>> +    p--, q--;
>> +
>> +  return i;
>> +}
>> +
>> +/* { dg-final { scan-tree-dump-times "Loop split" 6 "lsplit" } } */
>> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
>> index 3a09bbc39e5..e9f23b32186 100644
>> --- a/gcc/tree-ssa-loop-split.c
>> +++ b/gcc/tree-ssa-loop-split.c
>> @@ -41,6 +41,7 @@ along with GCC; see the file COPYING3.  If not see
>>  #include "cfghooks.h"
>>  #include "gimple-fold.h"
>>  #include "gimplify-me.h"
>> +#include "tree-ssa-loop-ivopts.h"
>> 
>>  /* This file implements two kinds of loop splitting.
>> 
>> @@ -229,11 +230,14 @@ easy_exit_values (class loop *loop)
>>     conditional).  I.e. the second loop can now be entered either
>>     via the original entry or via NEW_E, so the entry values of LOOP2
>>     phi nodes are either the original ones or those at the exit
>> -   of LOOP1.  Insert new phi nodes in LOOP2 pre-header reflecting
>> -   this.  The loops need to fulfill easy_exit_values().  */
>> +   of LOOP1.  Selecting the previous value instead next value as the
>> +   exit value of LOOP1 if USE_PREV is true.  Insert new phi nodes in
>> +   LOOP2 pre-header reflecting this.  The loops need to fulfill
>> +   easy_exit_values().  */
>> 
>>  static void
>> -connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e)
>> +connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e,
>> +		   bool use_prev = false)
>>  {
>>    basic_block rest = loop_preheader_edge (loop2)->src;
>>    gcc_assert (new_e->dest == rest);
>> @@ -279,7 +283,8 @@ connect_loop_phis (class loop *loop1, class loop 
>> *loop2,
>> edge new_e)
>> 
>>        gphi * newphi = create_phi_node (new_init, rest);
>>        add_phi_arg (newphi, init, skip_first, UNKNOWN_LOCATION);
>> -      add_phi_arg (newphi, next, new_e, UNKNOWN_LOCATION);
>> +      add_phi_arg (newphi, use_prev ? PHI_RESULT (phi_first) : next, 
>> new_e,
>> +		   UNKNOWN_LOCATION);
>>        SET_USE (op, new_init);
>>      }
>>  }
>> @@ -1593,6 +1598,252 @@ split_loop_on_cond (struct loop *loop)
>>    return do_split;
>>  }
>> 
>> +/* Check if the LOOP exit branch is like "if (idx != bound)",
>> +   Return the branch edge which exit loop, if wrap may happen on 
>> "idx".  */
>> +
>> +static edge
>> +get_ne_cond_branch (struct loop *loop, tree *step)
>> +{
>> +  int i;
>> +  edge e;
>> +
>> +  auto_vec<edge> edges = get_loop_exit_edges (loop);
>> +  FOR_EACH_VEC_ELT (edges, i, e)
>> +    {
>> +      basic_block bb = e->src;
>> +
>> +      /* Check if exit at gcond.  */
>> +      gimple *last = last_stmt (bb);
>> +      if (!last || gimple_code (last) != GIMPLE_COND)
>> +	continue;
>> +      gcond *cond = as_a<gcond *> (last);
> 
>    gcond *cont = safe_dyn_cast <gcond *> (last_stmt (bb));
>    if (!last)
>      continue;
> 
> is shorter.
Thanks.

> 
>> +      enum tree_code code = gimple_cond_code (cond);
>> +      if (!((code == NE_EXPR && (e->flags & EDGE_FALSE_VALUE))
>> +	    || (code == EQ_EXPR && (e->flags & EDGE_TRUE_VALUE))))
>> +	continue;
>> +
>> +      /* Check if bound is invarant.  */
>> +      tree idx = gimple_cond_lhs (cond);
>> +      tree bnd = gimple_cond_rhs (cond);
>> +      if (expr_invariant_in_loop_p (loop, idx))
>> +	std::swap (idx, bnd);
>> +      else if (!expr_invariant_in_loop_p (loop, bnd))
>> +	continue;
>> +
>> +      /* Only unsigned type conversion could cause wrap.  */
>> +      tree type = TREE_TYPE (idx);
>> +      if (!INTEGRAL_TYPE_P (type) || TREE_CODE (idx) != SSA_NAME
>> +	  || !TYPE_UNSIGNED (type))
>> +	continue;
>> +
>> +      /* Avoid to split if bound is MAX/MIN val.  */
>> +      tree bound_type = TREE_TYPE (bnd);
>> +      if (tree_int_cst_equal (bnd, TYPE_MAX_VALUE (bound_type))
>> +	  || tree_int_cst_equal (bnd, TYPE_MIN_VALUE (bound_type)))
>> +	continue;
>> +
>> +      /* Check if there is possible wrap.  */
>> +      class tree_niter_desc niter;
>> +      if (!number_of_iterations_exit (loop, e, &niter, false, false))
>> +	continue;
>> +      if (niter.control.no_overflow)
>> +	return NULL;
>> +      if (niter.cmp != NE_EXPR)
>> +	continue;
>> +      if (!integer_onep (niter.control.step)
>> +	  && !integer_minus_onep (niter.control.step))
>> +	continue;
>> +      *step = niter.control.step;
>> +
>> +      /* If exit edge is just before the empty latch, it is easy to 
>> link
>> +	 the split loops: just jump from the exit edge of one loop to the
>> +	 header of new loop.  */
>> +      if (single_pred_p (loop->latch)
>> +	  && single_pred_edge (loop->latch)->src == bb
>> +	  && empty_block_p (loop->latch))
>> +	return e;
>> +
>> +      /* If exit edge is at end of header, and header contains i++ or 
>> ++i,
>> +	 only, it is simple to link the split loops: jump from the end of
>> +	 one loop header to the new loop header, and use unchanged PHI 
>> result
>> +	 of the first loop as the entry PHI value of the second loop.  */
>> +      if (bb == loop->header)
>> +	{
>> +	  /* Only one phi.  */
>> +	  gphi_iterator psi = gsi_start_phis (bb);
>> +	  if (gsi_end_p (psi))
>> +	    continue;
>> +	  gphi *phi = psi.phi ();
>> +	  gsi_next (&psi);
>> +	  if (!gsi_end_p (psi))
>> +	    continue;
>> +
>> +	  /* Check it is ++i or ++i */
>> +	  tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
>> +	  tree prev = PHI_RESULT (phi);
>> +	  if (idx != prev && idx != next)
>> +	    continue;
>> +
>> +	  gimple_stmt_iterator gsi = gsi_start_nondebug_after_labels_bb 
>> (bb);
>> +	  if (gsi_end_p (gsi))
>> +	    continue;
>> +	  gimple *s1 = gsi_stmt (gsi);
>> +	  if (!is_gimple_assign (s1) || gimple_assign_lhs (s1) != next
>> +	      || gimple_assign_rhs1 (s1) != prev)
>> +	    continue;
>> +
>> +	  gsi_next_nondebug (&gsi);
>> +	  if (!gsi_end_p (gsi) && gsi_stmt (gsi) == cond)
>> +	    return e;
>> +	}
>> +    }
>> +
>> +  return NULL;
>> +}
>> +
>> +/* Split the LOOP with NE_EXPR into two loops with GT_EXPR and 
>> LT_EXPR.  */
>> +
>> +static bool
>> +split_ne_loop (struct loop *loop, edge cond_e, tree step)
>> +{
>> +  initialize_original_copy_tables ();
>> +
>> +  struct loop *loop2 = loop_version (loop, boolean_true_node, NULL,
>> +				     profile_probability::always (),
>> +				     profile_probability::never (),
>> +				     profile_probability::always (),
>> +				     profile_probability::always (), true);
>> +
>> +  gcc_assert (loop2);
>> +  update_ssa (TODO_update_ssa);
>> +
>> +  basic_block loop2_cond_exit_bb = get_bb_copy (cond_e->src);
>> +  free_original_copy_tables ();
>> +
>> +  gcond *gc = as_a<gcond *> (last_stmt (cond_e->src));
>> +  gcond *dup_gc = as_a<gcond *> (last_stmt (loop2_cond_exit_bb));
>> +
>> +  /* Invert edges for gcond.  */
>> +  if (gimple_cond_code (gc) == EQ_EXPR)
>> +    {
>> +      auto invert_edge = [](basic_block bb) {
>> +	edge out = EDGE_SUCC (bb, 0);
>> +	edge in = EDGE_SUCC (bb, 1);
>> +	if (in->flags & EDGE_TRUE_VALUE)
>> +	  std::swap (in, out);
>> +	in->flags |= EDGE_TRUE_VALUE;
>> +	in->flags &= ~EDGE_FALSE_VALUE;
>> +	out->flags |= EDGE_FALSE_VALUE;
>> +	out->flags &= ~EDGE_TRUE_VALUE;
>> +      };
>> +
>> +      invert_edge (gimple_bb (gc));
>> +      invert_edge (gimple_bb (dup_gc));
>> +    }
>> +
>> +  /* Change if (i != n) to LOOP1:if (i > n) and LOOP2:if (i < n) */
>> +  bool inv = expr_invariant_in_loop_p (loop, gimple_cond_lhs (gc));
>> +  if (tree_int_cst_sign_bit (step))
>> +    inv = !inv;
>> +  enum tree_code first_loop_code = inv ? LT_EXPR : GT_EXPR;
>> +  enum tree_code second_loop_code = inv ? GT_EXPR : LT_EXPR;
> 
> You could handle gimple_cond_code (gc) == EQ_EXPR via
> 
>    if (gimple_cond_code (gc) == EQ_EXPR)
>     {
>      first_loop_code = invert_tree_comparison (first_loop_code, false);
>      second_loop_code = invert_tree_comparison (second_loop_code, 
> false);
>    }
> 
> that looks simpler than the lambda dance with inverting the edge flags.
I did not use invert_tree_comparison in the code, because it would 
generate
GE_EXPR/LE_EXPR which look like containing EQ_EXPR :)
I would update patch to use invert_tree_comparison which is simpler and 
shorter.

> 
>> +  gimple_cond_set_code (gc, first_loop_code);
>> +  gimple_cond_set_code (dup_gc, second_loop_code);
>> +
>> +  /* Link the exit cond edge to new loop.  */
>> +  gcond *break_cond = as_a<gcond *> (gimple_copy (gc));
>> +  edge pred_e = single_pred_edge (loop->latch);
>> +  bool simple_loop
>> +    = pred_e && pred_e->src == cond_e->src && empty_block_p 
>> (loop->latch);
>> +  if (simple_loop)
>> +    gimple_cond_set_code (break_cond, second_loop_code);
>> +  else
>> +    gimple_cond_make_true (break_cond);
>> +
>> +  basic_block break_bb = split_edge (cond_e);
>> +  gimple_stmt_iterator gsi = gsi_last_bb (break_bb);
>> +  gsi_insert_after (&gsi, break_cond, GSI_NEW_STMT);
>> +
>> +  edge to_exit = single_succ_edge (break_bb);
>> +  edge to_new_loop = make_edge (break_bb, loop_preheader_edge 
>> (loop2)->src,
>> 0);
>> +  to_new_loop->flags |= EDGE_TRUE_VALUE;
>> +  to_exit->flags |= EDGE_FALSE_VALUE;
>> +  to_exit->flags &= ~EDGE_FALLTHRU;
>> +  to_exit->probability = cond_e->probability;
>> +  to_new_loop->probability = to_exit->probability.invert ();
>> +
>> +  update_ssa (TODO_update_ssa);
> 
> I've re-organized the pass to perform a single TODO_update_ssa at the
> very end, please do not update SSA form here, nor
> 
>> +  connect_loop_phis (loop, loop2, to_new_loop, !simple_loop);
>> +
>> +  rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
> 
> re-write into loop-closed SSA.
Thanks!
> 
>> +  if (dump_file && (dump_flags & TDF_DETAILS))
>> +    fprintf (dump_file, ";; Loop split on wrap index.\n");
>> +
>> +  return true;
>> +}
>> +
>> +/* Checks if LOOP contains a suitable NE_EXPR conditional block to 
>> split.
>> +L_H:
>> + if (i!=N)
>> +   S;
>> + else
>> +   goto EXIT;
>> + i++;
>> + goto L_H;
>> +
>> +The "i!=N" is like "i>N || i<N", then it could be transformed to:
>> +
>> +L_H:
>> + if (i>N)
>> +   S;
>> + else
>> +   goto EXIT;
>> + i++;
>> + goto L_H;
>> +L1_H:
>> + if (i<N)
>> +   S;
>> + else
>> +   goto EXIT;
>> + i++;
>> + goto L1_H;
>> +
>> +The loop with "i<N" is in favor of both GIMPLE and RTL passes.  */
>> +
>> +static bool
>> +split_loop_on_ne_cond (class loop *loop)
>> +{
>> +  tree step;
>> +  edge branch_edge = get_ne_cond_branch (loop, &step);
>> +  if (!branch_edge)
>> +    return false;
>> +
>> +  int num = 0;
>> +  basic_block *bbs = get_loop_body (loop);
>> +  for (unsigned i = 0; i < loop->num_nodes; i++)
>> +    num += estimate_num_insns_seq (bb_seq (bbs[i]), 
>> &eni_size_weights);
> 
> Since the motivation is to make data-refs analyzable after the 
> transform
> if there are no datarefs the transform only increases code-size.  In
> particular I would look for calls which will be not analyzable.  Since
> we're looking at each stmt above that could be embedded here.  As said
> earlier once num exceeds param_max_peeled_insns you can stop the above
> loop.  So heuristically I'd do sth like
Right, if no following optimization enabled, it only increases code-size
and increase branches overhead, and may not gain performance.

> 
>     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p
> (gsi); gsi_next (&gsi))
>       {
>         gimple  *stmt = gsi_stmt (gsi);
>         if (is_gimple_debug (stmt))
>           continue;
>         if (gimple_has_side_effects (stmt))
>           {
>             free (bbs);
>             return false;
>           }
>         num += estimate_num_insns (stmt, &eni_size_weights);
>         if (num > param_max_peeled_insns)
>           {
>             free (bbs);
>             return false;
>           }
>         if (gimple_vuse (stmt))
>           any_dr = true;
>       }
>     if (!any_dr)
>       {
>         free (bbs);
>         return false;
>       }
> 
> There's also still the issue that the transformed loop will fail
> number of iteration analysis for the loop that iterates until
> the IV wraps.  That's a blocker for the acceptance of this transform.
Understand, I saw you opened PR101145.

Thanks for your comments!

BR,
Jiufu Guo.

> 
> Richard.
diff mbox series

Patch

diff --git a/gcc/testsuite/g++.dg/vect/pr98064.cc b/gcc/testsuite/g++.dg/vect/pr98064.cc
index 74043ce7725..dcb2985d05a 100644
--- a/gcc/testsuite/g++.dg/vect/pr98064.cc
+++ b/gcc/testsuite/g++.dg/vect/pr98064.cc
@@ -1,5 +1,7 @@ 
 // { dg-do compile }
-// { dg-additional-options "-O3" }
+// { dg-additional-options "-O3 -Wno-stringop-overflow" }
+/* There is warning message when "short g = var_8; g; g++"
+   is optimized/analyzed as string operation,e.g. memset.  */
 
 const long long &min(const long long &__a, long long &__b) {
   if (__b < __a)
diff --git a/gcc/testsuite/gcc.dg/loop-split1.c b/gcc/testsuite/gcc.dg/loop-split1.c
new file mode 100644
index 00000000000..dd2d03a7b96
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-split1.c
@@ -0,0 +1,101 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fsplit-loops -fdump-tree-lsplit-details" } */
+
+void
+foo (int *a, int *b, unsigned l, unsigned n)
+{
+  while (++l != n)
+    a[l] = b[l] + 1;
+}
+void
+foo_1 (int *a, int *b, unsigned n)
+{
+  unsigned l = 0;
+  while (++l != n)
+    a[l] = b[l] + 1;
+}
+
+void
+foo1 (int *a, int *b, unsigned l, unsigned n)
+{
+  while (l++ != n)
+    a[l] = b[l] + 1;
+}
+
+/* No wrap.  */
+void
+foo1_1 (int *a, int *b, unsigned n)
+{
+  unsigned l = 0;
+  while (l++ != n)
+    a[l] = b[l] + 1;
+}
+
+unsigned
+foo2 (char *a, char *b, unsigned l, unsigned n)
+{
+  while (++l != n)
+    if (a[l] != b[l])
+      break;
+
+  return l;
+}
+
+unsigned
+foo2_1 (char *a, char *b, unsigned l, unsigned n)
+{
+  l = 0;
+  while (++l != n)
+    if (a[l] != b[l])
+      break;
+
+  return l;
+}
+
+unsigned
+foo3 (char *a, char *b, unsigned l, unsigned n)
+{
+  while (l++ != n)
+    if (a[l] != b[l])
+      break;
+
+  return l;
+}
+
+/* No wrap.  */
+unsigned
+foo3_1 (char *a, char *b, unsigned l, unsigned n)
+{
+  l = 0;
+  while (l++ != n)
+    if (a[l] != b[l])
+      break;
+
+  return l;
+}
+
+void
+bar ();
+void
+foo4 (unsigned n, unsigned i)
+{
+  do
+    {
+      if (i == n)
+	return;
+      bar ();
+      ++i;
+    }
+  while (1);
+}
+
+unsigned
+find_skip_diff (char *p, char *q, unsigned n, unsigned i)
+{
+  while (p[i] == q[i] && ++i != n)
+    p++, q++;
+
+  return i;
+}
+
+/* { dg-final { scan-tree-dump-times "Loop split" 8 "lsplit" } } */
diff --git a/gcc/testsuite/gcc.dg/loop-split2.c b/gcc/testsuite/gcc.dg/loop-split2.c
new file mode 100644
index 00000000000..0d3fded3f61
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-split2.c
@@ -0,0 +1,54 @@ 
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+extern void abort (void);
+extern void exit (int);
+
+#define NI __attribute__ ((noinline))
+
+void NI
+foo (int *a, int *b, unsigned char l, unsigned char n)
+{
+  while (++l != n)
+    a[l] = b[l] + 1;
+}
+
+unsigned NI
+bar (int *a, int *b, unsigned char l, unsigned char n)
+{
+  while (l++ != n)
+    if (a[l] != b[l])
+      break;
+
+  return l;
+}
+
+int a[258];
+int b[258];
+
+int main()
+{
+  __builtin_memcpy (b, a, sizeof (a));
+
+  if (bar (a, b, 3, 8) != 9)
+    abort ();
+
+  if (bar (a, b, 8, 3) != 4)
+    abort ();
+
+  b[100] += 1;
+  if (bar (a, b, 90, 110) != 100)
+    abort ();
+
+  if (bar (a, b, 110, 105) != 100)
+    abort ();
+
+  foo (a, b, 99, 99);
+  a[99] = b[99] + 1;
+  for (int i = 0; i < 256; i++)
+    if (a[i] != b[i] + 1)
+      abort();
+
+  exit (0);
+}
+
diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
index 3a09bbc39e5..9c0de66e288 100644
--- a/gcc/tree-ssa-loop-split.c
+++ b/gcc/tree-ssa-loop-split.c
@@ -41,6 +41,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "cfghooks.h"
 #include "gimple-fold.h"
 #include "gimplify-me.h"
+#include "tree-ssa-loop-ivopts.h"
 
 /* This file implements two kinds of loop splitting.
 
@@ -229,11 +230,14 @@  easy_exit_values (class loop *loop)
    conditional).  I.e. the second loop can now be entered either
    via the original entry or via NEW_E, so the entry values of LOOP2
    phi nodes are either the original ones or those at the exit
-   of LOOP1.  Insert new phi nodes in LOOP2 pre-header reflecting
-   this.  The loops need to fulfill easy_exit_values().  */
+   of LOOP1.  Selecting the previous value instead next value as the
+   exit value of LOOP1 if USE_PREV is true.  Insert new phi nodes in
+   LOOP2 pre-header reflecting this.  The loops need to fulfill
+   easy_exit_values().  */
 
 static void
-connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e)
+connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e,
+		   bool use_prev = false)
 {
   basic_block rest = loop_preheader_edge (loop2)->src;
   gcc_assert (new_e->dest == rest);
@@ -279,7 +283,8 @@  connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e)
 
       gphi * newphi = create_phi_node (new_init, rest);
       add_phi_arg (newphi, init, skip_first, UNKNOWN_LOCATION);
-      add_phi_arg (newphi, next, new_e, UNKNOWN_LOCATION);
+      add_phi_arg (newphi, use_prev ? PHI_RESULT (phi_first) : next, new_e,
+		   UNKNOWN_LOCATION);
       SET_USE (op, new_init);
     }
 }
@@ -1593,6 +1598,241 @@  split_loop_on_cond (struct loop *loop)
   return do_split;
 }
 
+/* Check if the LOOP exit branch is like "if (idx != bound)",
+   Return the branch edge which exit loop, if wrap may happen on "idx".  */
+
+static edge
+get_ne_cond_branch (struct loop *loop)
+{
+  int i;
+  edge e;
+
+  auto_vec<edge> edges = get_loop_exit_edges (loop);
+  FOR_EACH_VEC_ELT (edges, i, e)
+    {
+      basic_block bb = e->src;
+
+      /* Check if exit at gcond.  */
+      gimple *last = last_stmt (bb);
+      if (!last || gimple_code (last) != GIMPLE_COND)
+	continue;
+      gcond *cond = as_a<gcond *> (last);
+      enum tree_code code = gimple_cond_code (cond);
+      if (!(code == NE_EXPR
+	    || (code == EQ_EXPR && (e->flags & EDGE_TRUE_VALUE))))
+	continue;
+
+      /* Check if bound is invarant.  */
+      tree idx = gimple_cond_lhs (cond);
+      tree bnd = gimple_cond_rhs (cond);
+      if (expr_invariant_in_loop_p (loop, idx))
+	std::swap (idx, bnd);
+      else if (!expr_invariant_in_loop_p (loop, bnd))
+	continue;
+
+      /* Only unsigned type conversion could cause wrap.  */
+      tree type = TREE_TYPE (idx);
+      if (!INTEGRAL_TYPE_P (type) || TREE_CODE (idx) != SSA_NAME
+	  || !TYPE_UNSIGNED (type))
+	continue;
+
+      /* Avoid to split if bound is MAX/MIN val.  */
+      tree bound_type = TREE_TYPE (bnd);
+      if (TREE_CODE (bnd) == INTEGER_CST && INTEGRAL_TYPE_P (bound_type)
+	  && (tree_int_cst_equal (bnd, TYPE_MAX_VALUE (bound_type))
+	      || tree_int_cst_equal (bnd, TYPE_MIN_VALUE (bound_type))))
+	continue;
+
+      /* Check if there is possible wrap.  */
+      class tree_niter_desc niter;
+      if (!number_of_iterations_exit (loop, e, &niter, false, false))
+	continue;
+      if (niter.control.no_overflow)
+	return NULL;
+      if (niter.cmp != NE_EXPR)
+	continue;
+
+      /* If exit edge is just before the empty latch, it is easy to link
+	 the split loops: just jump from the exit edge of one loop to the
+	 header of new loop.  */
+      if (single_pred_p (loop->latch)
+	  && single_pred_edge (loop->latch)->src == bb
+	  && empty_block_p (loop->latch))
+	return e;
+
+      /* If exit edge is at end of header, and header contains i++ or ++i,
+	 only, it is simple to link the split loops:  jump from the end of
+	 one loop header to the new loop header, and use unchanged PHI
+	 result of first loop as the entry PHI value of the second loop.  */
+      if (bb == loop->header)
+	{
+	  /* Only one phi.  */
+	  gphi_iterator psi = gsi_start_phis (bb);
+	  if (gsi_end_p (psi))
+	    continue;
+	  gphi *phi = psi.phi ();
+	  gsi_next (&psi);
+	  if (!gsi_end_p (psi))
+	    continue;
+
+	  /* Check it is ++i or ++i */
+	  tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
+	  tree prev = PHI_RESULT (phi);
+	  if (idx != prev && idx != next)
+	    continue;
+
+	  gimple_stmt_iterator gsi = gsi_start_nondebug_after_labels_bb (bb);
+	  if (gsi_end_p (gsi))
+	    continue;
+	  gimple *s1 = gsi_stmt (gsi);
+	  if (!is_gimple_assign (s1) || gimple_assign_lhs (s1) != next
+	      || gimple_assign_rhs1 (s1) != prev)
+	    continue;
+
+	  gsi_next_nondebug (&gsi);
+	  if (!gsi_end_p (gsi) && gsi_stmt (gsi) == cond)
+	    return e;
+	}
+    }
+
+  return NULL;
+}
+
+/* Split the LOOP with NE_EXPR into two loops with GT_EXPR and LT_EXPR.  */
+
+static bool
+split_ne_loop (struct loop *loop, edge cond_e)
+{
+  initialize_original_copy_tables ();
+
+  struct loop *loop2 = loop_version (loop, boolean_true_node, NULL,
+				     profile_probability::always (),
+				     profile_probability::never (),
+				     profile_probability::always (),
+				     profile_probability::always (), true);
+
+  gcc_assert (loop2);
+  update_ssa (TODO_update_ssa);
+
+  basic_block loop2_cond_exit_bb = get_bb_copy (cond_e->src);
+  free_original_copy_tables ();
+
+  gcond *gc = as_a<gcond *> (last_stmt (cond_e->src));
+  gcond *dup_gc = as_a<gcond *> (last_stmt (loop2_cond_exit_bb));
+
+  /* Invert edges for gcond.  */
+  if (gimple_cond_code (gc) == EQ_EXPR)
+    {
+      auto invert_edge = [](basic_block bb) {
+	edge out = EDGE_SUCC (bb, 0);
+	edge in = EDGE_SUCC (bb, 1);
+	if (in->flags & EDGE_TRUE_VALUE)
+	  std::swap (in, out);
+	in->flags |= EDGE_TRUE_VALUE;
+	in->flags &= ~EDGE_FALSE_VALUE;
+	out->flags |= EDGE_FALSE_VALUE;
+	out->flags &= ~EDGE_TRUE_VALUE;
+      };
+
+      invert_edge (gimple_bb (gc));
+      invert_edge (gimple_bb (dup_gc));
+    }
+
+  /* Change if (i != n) to LOOP1:if (i > n) and LOOP2:if (i < n) */
+  bool inv = expr_invariant_in_loop_p (loop, gimple_cond_lhs (gc));
+  enum tree_code up_code = inv ? LT_EXPR : GT_EXPR;
+  enum tree_code down_code = inv ? GT_EXPR : LT_EXPR;
+
+  gimple_cond_set_code (gc, up_code);
+  gimple_cond_set_code (dup_gc, down_code);
+
+  /* Link the exit cond edge to new loop.  */
+  gcond *break_cond = as_a<gcond *> (gimple_copy (gc));
+  edge pred_e = single_pred_edge (loop->latch);
+  bool simple_loop
+    = pred_e && pred_e->src == cond_e->src && empty_block_p (loop->latch);
+  if (simple_loop)
+    gimple_cond_set_code (break_cond, down_code);
+  else
+    gimple_cond_make_true (break_cond);
+
+  basic_block break_bb = split_edge (cond_e);
+  gimple_stmt_iterator gsi = gsi_last_bb (break_bb);
+  gsi_insert_after (&gsi, break_cond, GSI_NEW_STMT);
+
+  edge to_exit = single_succ_edge (break_bb);
+  edge to_new_loop = make_edge (break_bb, loop_preheader_edge (loop2)->src, 0);
+  to_new_loop->flags |= EDGE_TRUE_VALUE;
+  to_exit->flags |= EDGE_FALSE_VALUE;
+  to_exit->flags &= ~EDGE_FALLTHRU;
+  to_exit->probability = cond_e->probability;
+  to_new_loop->probability = to_exit->probability.invert ();
+
+  update_ssa (TODO_update_ssa);
+
+  connect_loop_phis (loop, loop2, to_new_loop, !simple_loop);
+
+  rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, ";; Loop split on wrap index.\n");
+
+  return true;
+}
+
+/* Checks if LOOP contains a suitable NE_EXPR conditional block to split.
+L_H:
+ if (i!=N)
+   S;
+ else
+   goto EXIT;
+ i++;
+ goto L_H;
+
+The "i!=N" is like "i>N || i<N", then it could be transform to:
+
+L_H:
+ if (i>N)
+   S;
+ else
+   goto EXIT;
+ i++;
+ goto L_H;
+L1_H:
+ if (i<N)
+   S;
+ else
+   goto EXIT;
+ i++;
+ goto L1_H;
+
+The loop with "i<N" is in favor both GIMPLE and RTL passes.  */
+
+static bool
+split_loop_on_ne_cond (class loop *loop)
+{
+  int num = 0;
+  basic_block *bbs = get_loop_body (loop);
+
+  if (!can_copy_bbs_p (bbs, loop->num_nodes))
+    {
+      free (bbs);
+      return false;
+    }
+
+  for (unsigned i = 0; i < loop->num_nodes; i++)
+    num += estimate_num_insns_seq (bb_seq (bbs[i]), &eni_size_weights);
+  free (bbs);
+
+  if (num > param_max_peeled_insns)
+    return false;
+
+  edge branch_edge = get_ne_cond_branch (loop);
+  if (branch_edge && split_ne_loop (loop, branch_edge))
+    return true;
+
+  return false;
+}
+
 /* Main entry point.  Perform loop splitting on all suitable loops.  */
 
 static unsigned int
@@ -1622,7 +1862,8 @@  tree_ssa_split_loops (void)
       if (optimize_loop_for_size_p (loop))
 	continue;
 
-      if (split_loop (loop) || split_loop_on_cond (loop))
+      if (split_loop (loop) || split_loop_on_cond (loop)
+	  || split_loop_on_ne_cond (loop))
 	{
 	  /* Mark our containing loop as having had some split inner loops.  */
 	  loop_outer (loop)->aux = loop;