diff mbox series

[V2] Split loop for NE condition.

Message ID 20210517020143.60075-1-guojiufu@linux.ibm.com
State New
Headers show
Series [V2] Split loop for NE condition. | expand

Commit Message

Jiufu Guo May 17, 2021, 2:01 a.m. UTC
When there is the possibility that overflow/wrap may happen on the loop index,
a few optimizations would not happen. For example code:

foo (int *a, int *b, unsigned k, unsigned n)
{
  while (++k != n)
    a[k] = b[k]  + 1;
}

For this code, if "k > n", k would wrap.  if "k < n" at begining,
it could be optimized (e.g. vectorization).

We can split the loop into two loops:

  while (++k > n)
    a[k] = b[k]  + 1;
  while (l++ < n)
    a[k] = b[k]  + 1;

then for the second loop, it could be optimized.

This patch is spltting this kind of small loop to achieve better performance.

Bootstrap and regtest pass on ppc64le.  Is this ok for trunk?

Thanks!

Jiufu Guo.

gcc/ChangeLog:

2021-05-15  Jiufu Guo  <guojiufu@linux.ibm.com>

	* tree-ssa-loop-split.c (connect_loop_phis): Add new param.
	(get_ne_cond_branch): New function.
	(split_ne_loop): New function.
	(split_loop_on_ne_cond): New function.
	(tree_ssa_split_loops): Use split_loop_on_ne_cond.

gcc/testsuite/ChangeLog:

2021-05-15  Jiufu Guo  <guojiufu@linux.ibm.com>

	* gcc.dg/loop-split1.c: New test.
	* g++.dg/vect/pr98064.cc: Suppress warning.
---
 gcc/testsuite/g++.dg/vect/pr98064.cc |   4 +-
 gcc/testsuite/gcc.dg/loop-split1.c   | 108 +++++++++++++++
 gcc/tree-ssa-loop-split.c            | 188 ++++++++++++++++++++++++++-
 3 files changed, 296 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/loop-split1.c

Comments

Bernd Edlinger May 18, 2021, 6:36 a.m. UTC | #1
On 5/17/21 4:01 AM, Jiufu Guo via Gcc-patches wrote:
> When there is the possibility that overflow/wrap may happen on the loop index,
> a few optimizations would not happen. For example code:
> 
> foo (int *a, int *b, unsigned k, unsigned n)
> {
>   while (++k != n)
>     a[k] = b[k]  + 1;
> }
> 
> For this code, if "k > n", k would wrap.  if "k < n" at begining,
> it could be optimized (e.g. vectorization).
> 
> We can split the loop into two loops:
> 
>   while (++k > n)
>     a[k] = b[k]  + 1;
>   while (l++ < n)
>     a[k] = b[k]  + 1;
> 
> then for the second loop, it could be optimized.
> 
> This patch is spltting this kind of small loop to achieve better performance.
> 
> Bootstrap and regtest pass on ppc64le.  Is this ok for trunk?
> 
> Thanks!
> 
> Jiufu Guo.
> 
> gcc/ChangeLog:
> 
> 2021-05-15  Jiufu Guo  <guojiufu@linux.ibm.com>
> 
> 	* tree-ssa-loop-split.c (connect_loop_phis): Add new param.
> 	(get_ne_cond_branch): New function.
> 	(split_ne_loop): New function.
> 	(split_loop_on_ne_cond): New function.
> 	(tree_ssa_split_loops): Use split_loop_on_ne_cond.
> 
> gcc/testsuite/ChangeLog:
> 
> 2021-05-15  Jiufu Guo  <guojiufu@linux.ibm.com>
> 
> 	* gcc.dg/loop-split1.c: New test.
> 	* g++.dg/vect/pr98064.cc: Suppress warning.
> ---
>  gcc/testsuite/g++.dg/vect/pr98064.cc |   4 +-
>  gcc/testsuite/gcc.dg/loop-split1.c   | 108 +++++++++++++++
>  gcc/tree-ssa-loop-split.c            | 188 ++++++++++++++++++++++++++-
>  3 files changed, 296 insertions(+), 4 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/loop-split1.c
> 
> diff --git a/gcc/testsuite/g++.dg/vect/pr98064.cc b/gcc/testsuite/g++.dg/vect/pr98064.cc
> index 74043ce7725..dcb2985d05a 100644
> --- a/gcc/testsuite/g++.dg/vect/pr98064.cc
> +++ b/gcc/testsuite/g++.dg/vect/pr98064.cc
> @@ -1,5 +1,7 @@
>  // { dg-do compile }
> -// { dg-additional-options "-O3" }
> +// { dg-additional-options "-O3 -Wno-stringop-overflow" }
> +/* There is warning message when "short g = var_8; g; g++"
> +   is optimized/analyzed as string operation,e.g. memset.  */
>  
>  const long long &min(const long long &__a, long long &__b) {
>    if (__b < __a)
> diff --git a/gcc/testsuite/gcc.dg/loop-split1.c b/gcc/testsuite/gcc.dg/loop-split1.c
> new file mode 100644
> index 00000000000..30b006b1b14
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/loop-split1.c
> @@ -0,0 +1,108 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fsplit-loops -fdump-tree-lsplit-details" } */
> +
> +void
> +foo (int *a, int *b, unsigned l, unsigned n)
> +{
> +  while (++l != n)
> +    a[l] = b[l]  + 1;
> +}
> +void
> +foo_1 (int *a, int *b, unsigned n)
> +{
> +  unsigned l = 0;
> +  while (++l != n)
> +    a[l] = b[l]  + 1;
> +}
> +
> +void
> +foo1 (int *a, int *b, unsigned l, unsigned n)
> +{
> +  while (l++ != n)
> +    a[l] = b[l]  + 1;
> +}
> +
> +/* No wrap.  */
> +void
> +foo1_1 (int *a, int *b, unsigned n)
> +{
> +  unsigned l = 0;
> +  while (l++ != n)
> +    a[l] = b[l]  + 1;
> +}
> +
> +unsigned
> +foo2 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  while (++l != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +unsigned
> +foo2_1 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  l = 0;
> +  while (++l != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +unsigned
> +foo3 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  while (l++ != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +/* No wrap.  */
> +unsigned
> +foo3_1 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  l = 0;
> +  while (l++ != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +void bar();
> +void foo4(unsigned n,  unsigned i)
> +{
> +  do
> +    {
> +      if (i == n)
> +        return;
> +      bar();
> +      ++i;
> +    }
> +  while (1);
> +}
> +
> +unsigned
> +foo5 (double *a, unsigned n, unsigned i)
> +{
> +  while (a[i] > 0 && i != n)
> +    i++;
> +
> +  return i;
> +}
> +
> +unsigned
> +find_skip_diff (char *p, char *q, unsigned n, unsigned i)
> +{
> +  while (p[i] == q[i] && ++i != n)
> +    p++,q++;
> +
> +  return i;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "Loop split" 9 "lsplit" } } */
> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
> index 3a09bbc39e5..5c1742b5ff4 100644
> --- a/gcc/tree-ssa-loop-split.c
> +++ b/gcc/tree-ssa-loop-split.c
> @@ -41,6 +41,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "cfghooks.h"
>  #include "gimple-fold.h"
>  #include "gimplify-me.h"
> +#include "tree-ssa-loop-ivopts.h"
>  
>  /* This file implements two kinds of loop splitting.
>  
> @@ -233,7 +234,8 @@ easy_exit_values (class loop *loop)
>     this.  The loops need to fulfill easy_exit_values().  */
>  
>  static void
> -connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e)
> +connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e,
> +		   bool use_prev = false)
>  {
>    basic_block rest = loop_preheader_edge (loop2)->src;
>    gcc_assert (new_e->dest == rest);
> @@ -279,7 +281,8 @@ connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e)
>  
>        gphi * newphi = create_phi_node (new_init, rest);
>        add_phi_arg (newphi, init, skip_first, UNKNOWN_LOCATION);
> -      add_phi_arg (newphi, next, new_e, UNKNOWN_LOCATION);
> +      add_phi_arg (newphi, use_prev ? PHI_RESULT (phi_first) : next, new_e,
> +		   UNKNOWN_LOCATION);
>        SET_USE (op, new_init);
>      }
>  }
> @@ -1593,6 +1596,184 @@ split_loop_on_cond (struct loop *loop)
>    return do_split;
>  }
>  
> +/* Check if the LOOP exit branch likes "if (idx != bound)",
> +   Return the branch edge which exit loop, if overflow/wrap
> +   may happen on "idx".  */
> +
> +static edge
> +get_ne_cond_branch (struct loop *loop)
> +{
> +  int i;
> +  edge e;
> +
> +  auto_vec<edge> edges = get_loop_exit_edges (loop);
> +  FOR_EACH_VEC_ELT (edges, i, e)
> +    {
> +      /* Check if there is possible wrap/overflow.  */
> +      class tree_niter_desc niter;
> +      if (!number_of_iterations_exit (loop, e, &niter, false, false))
> +	continue;
> +      if (niter.control.no_overflow)
> +	return NULL;
> +      if (niter.cmp != NE_EXPR)
> +	continue;
> +
> +      /* Check loop is simple to split.  */
> +      if (single_pred_p (loop->latch)
> +	  && single_pred_edge (loop->latch)->src == e->src
> +	  && (gsi_end_p (gsi_start_nondebug_bb (loop->latch))))
> +	return e;
> +
> +      /* Simple header.  */
> +      if (e->src == loop->header)
> +	{
> +	  if (get_virtual_phi (e->src))
> +	    continue;
> +
> +	  /* Only one phi.  */
> +	  gphi_iterator psi = gsi_start_phis (e->src);
> +	  if (gsi_end_p (psi))
> +	    continue;
> +	  gsi_next (&psi);
> +	  if (!gsi_end_p (psi))
> +	    continue;
> +
> +	  /* ++i or ++i */
> +	  gimple_stmt_iterator gsi = gsi_start_bb (e->src);
> +	  if (gsi_end_p (gsi))
> +	    continue;
> +
> +	  gimple *gc = last_stmt (e->src);
> +	  tree idx = gimple_cond_lhs (gc);
> +	  if (expr_invariant_in_loop_p (loop, idx))
> +	    idx = gimple_cond_rhs (gc);
> +
> +	  gimple *s1 = gsi_stmt (gsi);
> +	  if (!(is_gimple_assign (s1) && idx
> +		&& (idx == gimple_assign_lhs (s1)
> +		    || idx == gimple_assign_rhs1 (s1))))
> +	    continue;
> +
> +	  gsi_next (&gsi);
> +	  if (!gsi_end_p (gsi) && gsi_stmt (gsi) == gc)
> +	    return e;
> +	}
> +    }
> +
> +  return NULL;
> +}
> +
> +/* Split the LOOP with NE_EXPR into two loops with GT_EXPR and LT_EXPR.  */
> +
> +static bool
> +split_ne_loop (struct loop *loop, edge cond_e)
> +{
> +  initialize_original_copy_tables ();
> +
> +  struct loop *loop2 = loop_version (loop, boolean_true_node, NULL,
> +				     profile_probability::always (),
> +				     profile_probability::never (),
> +				     profile_probability::always (),
> +				     profile_probability::always (), true);
> +
> +  gcc_assert (loop2);
> +  update_ssa (TODO_update_ssa);
> +
> +  basic_block loop2_cond_exit_bb = get_bb_copy (cond_e->src);
> +  free_original_copy_tables ();
> +
> +  gcond *gc = as_a<gcond *> (last_stmt (cond_e->src));
> +  gcond *dup_gc = as_a<gcond *> (last_stmt (loop2_cond_exit_bb));
> +
> +  /* Change if (i != n) to LOOP1:if (i > n) and LOOP2:if (i < n) */
> +  bool inv = expr_invariant_in_loop_p (loop, gimple_cond_lhs (gc));
> +  enum tree_code up_code = inv ? LT_EXPR : GT_EXPR;
> +  enum tree_code down_code = inv ? GT_EXPR : LT_EXPR;
> +
> +  gimple_cond_set_code (gc, up_code);
> +  gimple_cond_set_code (dup_gc, down_code);
> +
> +  /* Link the exit cond edge to new loop.  */
> +  gcond *break_cond = as_a<gcond *> (gimple_copy (gc));
> +  edge pred_e = single_pred_edge (loop->latch);
> +  bool simple_loop = pred_e && pred_e->src == cond_e->src
> +		     && (gsi_end_p (gsi_start_nondebug_bb (loop->latch)));
> +  if (simple_loop)
> +    gimple_cond_set_code (break_cond, down_code);
> +  else
> +    gimple_cond_make_true (break_cond);
> +
> +  basic_block break_bb = split_edge (cond_e);
> +  gimple_stmt_iterator gsi = gsi_last_bb (break_bb);
> +  gsi_insert_after (&gsi, break_cond, GSI_NEW_STMT);
> +
> +  edge to_exit = single_succ_edge (break_bb);
> +  edge to_new_loop = make_edge (break_bb, loop_preheader_edge (loop2)->src, 0);
> +  to_new_loop->flags |= EDGE_TRUE_VALUE;
> +  to_exit->flags |= EDGE_FALSE_VALUE;
> +  to_exit->flags &= ~EDGE_FALLTHRU;
> +  to_exit->probability = cond_e->probability;
> +  to_new_loop->probability = to_exit->probability.invert ();
> +
> +  update_ssa (TODO_update_ssa);
> +
> +  connect_loop_phis (loop, loop2, to_new_loop, !simple_loop);
> +
> +  rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +    fprintf (dump_file, ";; Loop split on wrap index.\n");
> +
> +  return true;
> +}
> +
> +/* Checks if LOOP contains a suitable NE_EXPR conditional block to split.
> +L_H:
> + if (i!=N)
> +   S;
> + i++;
> + goto L_H;
> +
> +The "i!=N" is like "i>N || i<N", then it could be transform to:
> +
> +L_H:
> + if (i>N)
> +   S;
> + i++;
> + goto L_H;
> +L1_H:
> + if (i<N)
> +   S;
> + i++;
> + goto L1_H;
> +
> +The loop with "i<N" is in favor both GIMPLE and RTL passes.  */
> +
> +static bool
> +split_loop_on_ne_cond (class loop *loop)
> +{
> +  int num = 0;
> +  basic_block *bbs = get_loop_body (loop);
> +
> +  if (!can_copy_bbs_p (bbs, loop->num_nodes))
> +    {
> +      free (bbs);
> +      return false;
> +    }
> +
> +  for (unsigned i = 0; i < loop->num_nodes; i++)
> +    num += estimate_num_insns_seq (bb_seq (bbs[i]), &eni_size_weights);
> +  free (bbs);
> +
> +  if (num > param_max_peeled_insns)
> +    return false;
> +
> +  edge branch_edge = get_ne_cond_branch (loop);
> +  if (branch_edge && split_ne_loop (loop, branch_edge))
> +    return true;
> +
> +  return false;
> +}
> +
>  /* Main entry point.  Perform loop splitting on all suitable loops.  */
>  
>  static unsigned int
> @@ -1622,7 +1803,8 @@ tree_ssa_split_loops (void)
>        if (optimize_loop_for_size_p (loop))
>  	continue;
>  
> -      if (split_loop (loop) || split_loop_on_cond (loop))
> +      if (split_loop (loop) || split_loop_on_cond (loop)
> +	  || split_loop_on_ne_cond (loop))
>  	{
>  	  /* Mark our containing loop as having had some split inner loops.  */
>  	  loop_outer (loop)->aux = loop;
> 

Hi,

I've tried this patch and it does not seem to pass its own test:

FAIL: gcc.dg/loop-split1.c scan-tree-dump-times lsplit "Loop split" 9

Note you should always do a bootstrap and regression test as described
here: https://gcc.gnu.org/contribute.html#testing

Also please state in your future patch submissions how you tested the patch,
like "bootstrap and regression test on x86_64-pc-linux-gnu" for instance.


Bernd.
Jiufu Guo May 18, 2021, 9:28 a.m. UTC | #2
On 2021-05-18 14:36, Bernd Edlinger wrote:
> On 5/17/21 4:01 AM, Jiufu Guo via Gcc-patches wrote:
>> When there is the possibility that overflow/wrap may happen on the 
>> loop index,
>> a few optimizations would not happen. For example code:
>> 
>> foo (int *a, int *b, unsigned k, unsigned n)
>> {
>>   while (++k != n)
>>     a[k] = b[k]  + 1;
>> }
>> 
>> For this code, if "k > n", k would wrap.  if "k < n" at begining,
>> it could be optimized (e.g. vectorization).
>> 
>> We can split the loop into two loops:
>> 
>>   while (++k > n)
>>     a[k] = b[k]  + 1;
>>   while (l++ < n)
>>     a[k] = b[k]  + 1;
>> 
>> then for the second loop, it could be optimized.
>> 
>> This patch is spltting this kind of small loop to achieve better 
>> performance.
>> 
>> Bootstrap and regtest pass on ppc64le.  Is this ok for trunk?
>> 
>> Thanks!
>> 
>> Jiufu Guo.
>> 
>> gcc/ChangeLog:
>> 
>> 2021-05-15  Jiufu Guo  <guojiufu@linux.ibm.com>
>> 
>> 	* tree-ssa-loop-split.c (connect_loop_phis): Add new param.
>> 	(get_ne_cond_branch): New function.
>> 	(split_ne_loop): New function.
>> 	(split_loop_on_ne_cond): New function.
>> 	(tree_ssa_split_loops): Use split_loop_on_ne_cond.
>> 
>> gcc/testsuite/ChangeLog:
>> 
>> 2021-05-15  Jiufu Guo  <guojiufu@linux.ibm.com>
>> 
>> 	* gcc.dg/loop-split1.c: New test.
>> 	* g++.dg/vect/pr98064.cc: Suppress warning.
>> ---
>>  gcc/testsuite/g++.dg/vect/pr98064.cc |   4 +-
>>  gcc/testsuite/gcc.dg/loop-split1.c   | 108 +++++++++++++++
>>  gcc/tree-ssa-loop-split.c            | 188 
>> ++++++++++++++++++++++++++-
>>  3 files changed, 296 insertions(+), 4 deletions(-)
>>  create mode 100644 gcc/testsuite/gcc.dg/loop-split1.c
>> 
>> diff --git a/gcc/testsuite/g++.dg/vect/pr98064.cc 
>> b/gcc/testsuite/g++.dg/vect/pr98064.cc
>> index 74043ce7725..dcb2985d05a 100644
>> --- a/gcc/testsuite/g++.dg/vect/pr98064.cc
>> +++ b/gcc/testsuite/g++.dg/vect/pr98064.cc
>> @@ -1,5 +1,7 @@
>>  // { dg-do compile }
>> -// { dg-additional-options "-O3" }
>> +// { dg-additional-options "-O3 -Wno-stringop-overflow" }
>> +/* There is warning message when "short g = var_8; g; g++"
>> +   is optimized/analyzed as string operation,e.g. memset.  */
>> 
>>  const long long &min(const long long &__a, long long &__b) {
>>    if (__b < __a)
>> diff --git a/gcc/testsuite/gcc.dg/loop-split1.c 
>> b/gcc/testsuite/gcc.dg/loop-split1.c
>> new file mode 100644
>> index 00000000000..30b006b1b14
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/loop-split1.c
>> @@ -0,0 +1,108 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2 -fsplit-loops -fdump-tree-lsplit-details" } */
>> +
>> +void
>> +foo (int *a, int *b, unsigned l, unsigned n)
>> +{
>> +  while (++l != n)
>> +    a[l] = b[l]  + 1;
>> +}
>> +void
>> +foo_1 (int *a, int *b, unsigned n)
>> +{
>> +  unsigned l = 0;
>> +  while (++l != n)
>> +    a[l] = b[l]  + 1;
>> +}
>> +
>> +void
>> +foo1 (int *a, int *b, unsigned l, unsigned n)
>> +{
>> +  while (l++ != n)
>> +    a[l] = b[l]  + 1;
>> +}
>> +
>> +/* No wrap.  */
>> +void
>> +foo1_1 (int *a, int *b, unsigned n)
>> +{
>> +  unsigned l = 0;
>> +  while (l++ != n)
>> +    a[l] = b[l]  + 1;
>> +}
>> +
>> +unsigned
>> +foo2 (char *a, char *b, unsigned l, unsigned n)
>> +{
>> +  while (++l != n)
>> +    if (a[l] != b[l])
>> +      break;
>> +
>> +  return l;
>> +}
>> +
>> +unsigned
>> +foo2_1 (char *a, char *b, unsigned l, unsigned n)
>> +{
>> +  l = 0;
>> +  while (++l != n)
>> +    if (a[l] != b[l])
>> +      break;
>> +
>> +  return l;
>> +}
>> +
>> +unsigned
>> +foo3 (char *a, char *b, unsigned l, unsigned n)
>> +{
>> +  while (l++ != n)
>> +    if (a[l] != b[l])
>> +      break;
>> +
>> +  return l;
>> +}
>> +
>> +/* No wrap.  */
>> +unsigned
>> +foo3_1 (char *a, char *b, unsigned l, unsigned n)
>> +{
>> +  l = 0;
>> +  while (l++ != n)
>> +    if (a[l] != b[l])
>> +      break;
>> +
>> +  return l;
>> +}
>> +
>> +void bar();
>> +void foo4(unsigned n,  unsigned i)
>> +{
>> +  do
>> +    {
>> +      if (i == n)
>> +        return;
>> +      bar();
>> +      ++i;
>> +    }
>> +  while (1);
>> +}
>> +
>> +unsigned
>> +foo5 (double *a, unsigned n, unsigned i)
>> +{
>> +  while (a[i] > 0 && i != n)
>> +    i++;
>> +
>> +  return i;
>> +}
>> +
>> +unsigned
>> +find_skip_diff (char *p, char *q, unsigned n, unsigned i)
>> +{
>> +  while (p[i] == q[i] && ++i != n)
>> +    p++,q++;
>> +
>> +  return i;
>> +}
>> +
>> +/* { dg-final { scan-tree-dump-times "Loop split" 9 "lsplit" } } */
>> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
>> index 3a09bbc39e5..5c1742b5ff4 100644
>> --- a/gcc/tree-ssa-loop-split.c
>> +++ b/gcc/tree-ssa-loop-split.c
>> @@ -41,6 +41,7 @@ along with GCC; see the file COPYING3.  If not see
>>  #include "cfghooks.h"
>>  #include "gimple-fold.h"
>>  #include "gimplify-me.h"
>> +#include "tree-ssa-loop-ivopts.h"
>> 
>>  /* This file implements two kinds of loop splitting.
>> 
>> @@ -233,7 +234,8 @@ easy_exit_values (class loop *loop)
>>     this.  The loops need to fulfill easy_exit_values().  */
>> 
>>  static void
>> -connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e)
>> +connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e,
>> +		   bool use_prev = false)
>>  {
>>    basic_block rest = loop_preheader_edge (loop2)->src;
>>    gcc_assert (new_e->dest == rest);
>> @@ -279,7 +281,8 @@ connect_loop_phis (class loop *loop1, class loop 
>> *loop2, edge new_e)
>> 
>>        gphi * newphi = create_phi_node (new_init, rest);
>>        add_phi_arg (newphi, init, skip_first, UNKNOWN_LOCATION);
>> -      add_phi_arg (newphi, next, new_e, UNKNOWN_LOCATION);
>> +      add_phi_arg (newphi, use_prev ? PHI_RESULT (phi_first) : next, 
>> new_e,
>> +		   UNKNOWN_LOCATION);
>>        SET_USE (op, new_init);
>>      }
>>  }
>> @@ -1593,6 +1596,184 @@ split_loop_on_cond (struct loop *loop)
>>    return do_split;
>>  }
>> 
>> +/* Check if the LOOP exit branch likes "if (idx != bound)",
>> +   Return the branch edge which exit loop, if overflow/wrap
>> +   may happen on "idx".  */
>> +
>> +static edge
>> +get_ne_cond_branch (struct loop *loop)
>> +{
>> +  int i;
>> +  edge e;
>> +
>> +  auto_vec<edge> edges = get_loop_exit_edges (loop);
>> +  FOR_EACH_VEC_ELT (edges, i, e)
>> +    {
>> +      /* Check if there is possible wrap/overflow.  */
>> +      class tree_niter_desc niter;
>> +      if (!number_of_iterations_exit (loop, e, &niter, false, false))
>> +	continue;
>> +      if (niter.control.no_overflow)
>> +	return NULL;
>> +      if (niter.cmp != NE_EXPR)
>> +	continue;
>> +
>> +      /* Check loop is simple to split.  */
>> +      if (single_pred_p (loop->latch)
>> +	  && single_pred_edge (loop->latch)->src == e->src
>> +	  && (gsi_end_p (gsi_start_nondebug_bb (loop->latch))))
>> +	return e;
>> +
>> +      /* Simple header.  */
>> +      if (e->src == loop->header)
>> +	{
>> +	  if (get_virtual_phi (e->src))
>> +	    continue;
>> +
>> +	  /* Only one phi.  */
>> +	  gphi_iterator psi = gsi_start_phis (e->src);
>> +	  if (gsi_end_p (psi))
>> +	    continue;
>> +	  gsi_next (&psi);
>> +	  if (!gsi_end_p (psi))
>> +	    continue;
>> +
>> +	  /* ++i or ++i */
>> +	  gimple_stmt_iterator gsi = gsi_start_bb (e->src);
>> +	  if (gsi_end_p (gsi))
>> +	    continue;
>> +
>> +	  gimple *gc = last_stmt (e->src);
>> +	  tree idx = gimple_cond_lhs (gc);
>> +	  if (expr_invariant_in_loop_p (loop, idx))
>> +	    idx = gimple_cond_rhs (gc);
>> +
>> +	  gimple *s1 = gsi_stmt (gsi);
>> +	  if (!(is_gimple_assign (s1) && idx
>> +		&& (idx == gimple_assign_lhs (s1)
>> +		    || idx == gimple_assign_rhs1 (s1))))
>> +	    continue;
>> +
>> +	  gsi_next (&gsi);
>> +	  if (!gsi_end_p (gsi) && gsi_stmt (gsi) == gc)
>> +	    return e;
>> +	}
>> +    }
>> +
>> +  return NULL;
>> +}
>> +
>> +/* Split the LOOP with NE_EXPR into two loops with GT_EXPR and 
>> LT_EXPR.  */
>> +
>> +static bool
>> +split_ne_loop (struct loop *loop, edge cond_e)
>> +{
>> +  initialize_original_copy_tables ();
>> +
>> +  struct loop *loop2 = loop_version (loop, boolean_true_node, NULL,
>> +				     profile_probability::always (),
>> +				     profile_probability::never (),
>> +				     profile_probability::always (),
>> +				     profile_probability::always (), true);
>> +
>> +  gcc_assert (loop2);
>> +  update_ssa (TODO_update_ssa);
>> +
>> +  basic_block loop2_cond_exit_bb = get_bb_copy (cond_e->src);
>> +  free_original_copy_tables ();
>> +
>> +  gcond *gc = as_a<gcond *> (last_stmt (cond_e->src));
>> +  gcond *dup_gc = as_a<gcond *> (last_stmt (loop2_cond_exit_bb));
>> +
>> +  /* Change if (i != n) to LOOP1:if (i > n) and LOOP2:if (i < n) */
>> +  bool inv = expr_invariant_in_loop_p (loop, gimple_cond_lhs (gc));
>> +  enum tree_code up_code = inv ? LT_EXPR : GT_EXPR;
>> +  enum tree_code down_code = inv ? GT_EXPR : LT_EXPR;
>> +
>> +  gimple_cond_set_code (gc, up_code);
>> +  gimple_cond_set_code (dup_gc, down_code);
>> +
>> +  /* Link the exit cond edge to new loop.  */
>> +  gcond *break_cond = as_a<gcond *> (gimple_copy (gc));
>> +  edge pred_e = single_pred_edge (loop->latch);
>> +  bool simple_loop = pred_e && pred_e->src == cond_e->src
>> +		     && (gsi_end_p (gsi_start_nondebug_bb (loop->latch)));
>> +  if (simple_loop)
>> +    gimple_cond_set_code (break_cond, down_code);
>> +  else
>> +    gimple_cond_make_true (break_cond);
>> +
>> +  basic_block break_bb = split_edge (cond_e);
>> +  gimple_stmt_iterator gsi = gsi_last_bb (break_bb);
>> +  gsi_insert_after (&gsi, break_cond, GSI_NEW_STMT);
>> +
>> +  edge to_exit = single_succ_edge (break_bb);
>> +  edge to_new_loop = make_edge (break_bb, loop_preheader_edge 
>> (loop2)->src, 0);
>> +  to_new_loop->flags |= EDGE_TRUE_VALUE;
>> +  to_exit->flags |= EDGE_FALSE_VALUE;
>> +  to_exit->flags &= ~EDGE_FALLTHRU;
>> +  to_exit->probability = cond_e->probability;
>> +  to_new_loop->probability = to_exit->probability.invert ();
>> +
>> +  update_ssa (TODO_update_ssa);
>> +
>> +  connect_loop_phis (loop, loop2, to_new_loop, !simple_loop);
>> +
>> +  rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
>> +  if (dump_file && (dump_flags & TDF_DETAILS))
>> +    fprintf (dump_file, ";; Loop split on wrap index.\n");
>> +
>> +  return true;
>> +}
>> +
>> +/* Checks if LOOP contains a suitable NE_EXPR conditional block to 
>> split.
>> +L_H:
>> + if (i!=N)
>> +   S;
>> + i++;
>> + goto L_H;
>> +
>> +The "i!=N" is like "i>N || i<N", then it could be transform to:
>> +
>> +L_H:
>> + if (i>N)
>> +   S;
>> + i++;
>> + goto L_H;
>> +L1_H:
>> + if (i<N)
>> +   S;
>> + i++;
>> + goto L1_H;
>> +
>> +The loop with "i<N" is in favor both GIMPLE and RTL passes.  */
>> +
>> +static bool
>> +split_loop_on_ne_cond (class loop *loop)
>> +{
>> +  int num = 0;
>> +  basic_block *bbs = get_loop_body (loop);
>> +
>> +  if (!can_copy_bbs_p (bbs, loop->num_nodes))
>> +    {
>> +      free (bbs);
>> +      return false;
>> +    }
>> +
>> +  for (unsigned i = 0; i < loop->num_nodes; i++)
>> +    num += estimate_num_insns_seq (bb_seq (bbs[i]), 
>> &eni_size_weights);
>> +  free (bbs);
>> +
>> +  if (num > param_max_peeled_insns)
>> +    return false;
>> +
>> +  edge branch_edge = get_ne_cond_branch (loop);
>> +  if (branch_edge && split_ne_loop (loop, branch_edge))
>> +    return true;
>> +
>> +  return false;
>> +}
>> +
>>  /* Main entry point.  Perform loop splitting on all suitable loops.  
>> */
>> 
>>  static unsigned int
>> @@ -1622,7 +1803,8 @@ tree_ssa_split_loops (void)
>>        if (optimize_loop_for_size_p (loop))
>>  	continue;
>> 
>> -      if (split_loop (loop) || split_loop_on_cond (loop))
>> +      if (split_loop (loop) || split_loop_on_cond (loop)
>> +	  || split_loop_on_ne_cond (loop))
>>  	{
>>  	  /* Mark our containing loop as having had some split inner loops.  
>> */
>>  	  loop_outer (loop)->aux = loop;
>> 
> 
> Hi,
> 
> I've tried this patch and it does not seem to pass its own test:
> 
> FAIL: gcc.dg/loop-split1.c scan-tree-dump-times lsplit "Loop split" 9
> 
> Note you should always do a bootstrap and regression test as described
> here: https://gcc.gnu.org/contribute.html#testing
> 
> Also please state in your future patch submissions how you tested the 
> patch,
> like "bootstrap and regression test on x86_64-pc-linux-gnu" for 
> instance.

Hi Bernd,

Thanks for your tests and comments!

It is interesting, this patch would be target independent.
I had a double test, it passes at my side.

# of expected passes            2

And if change the test case slightly, the case fails.
e.g. change to: scan-tree-dump-times "Loop split" 10 "lsplit"

My test machine is powerpc64le-linux-gnu.

BR,
Jiufu Guo.

> 
> 
> Bernd.
Jiufu Guo May 18, 2021, 10:32 a.m. UTC | #3
On 2021-05-18 17:28, guojiufu via Gcc-patches wrote:
> On 2021-05-18 14:36, Bernd Edlinger wrote:
>> On 5/17/21 4:01 AM, Jiufu Guo via Gcc-patches wrote:
>>> When there is the possibility that overflow/wrap may happen on the 
>>> loop index,
>>> a few optimizations would not happen. For example code:
>>> 
>>> foo (int *a, int *b, unsigned k, unsigned n)
>>> {
>>>   while (++k != n)
>>>     a[k] = b[k]  + 1;
>>> }
>>> 
>>> For this code, if "k > n", k would wrap.  if "k < n" at begining,
>>> it could be optimized (e.g. vectorization).
>>> 
>>> We can split the loop into two loops:
>>> 
>>>   while (++k > n)
>>>     a[k] = b[k]  + 1;
>>>   while (l++ < n)
>>>     a[k] = b[k]  + 1;
>>> 
>>> then for the second loop, it could be optimized.
>>> 
>>> This patch is spltting this kind of small loop to achieve better 
>>> performance.
>>> 
>>> Bootstrap and regtest pass on ppc64le.  Is this ok for trunk?
>>> 
>>> Thanks!
>>> 
>>> Jiufu Guo.
>>> 
>>> gcc/ChangeLog:
>>> 
>>> 2021-05-15  Jiufu Guo  <guojiufu@linux.ibm.com>
>>> 
>>> 	* tree-ssa-loop-split.c (connect_loop_phis): Add new param.
>>> 	(get_ne_cond_branch): New function.
>>> 	(split_ne_loop): New function.
>>> 	(split_loop_on_ne_cond): New function.
>>> 	(tree_ssa_split_loops): Use split_loop_on_ne_cond.
>>> 
>>> gcc/testsuite/ChangeLog:
>>> 
>>> 2021-05-15  Jiufu Guo  <guojiufu@linux.ibm.com>
>>> 
>>> 	* gcc.dg/loop-split1.c: New test.
>>> 	* g++.dg/vect/pr98064.cc: Suppress warning.
>>> ---
>>>  gcc/testsuite/g++.dg/vect/pr98064.cc |   4 +-
>>>  gcc/testsuite/gcc.dg/loop-split1.c   | 108 +++++++++++++++
>>>  gcc/tree-ssa-loop-split.c            | 188 
>>> ++++++++++++++++++++++++++-
>>>  3 files changed, 296 insertions(+), 4 deletions(-)
>>>  create mode 100644 gcc/testsuite/gcc.dg/loop-split1.c
>>> 
>>> diff --git a/gcc/testsuite/g++.dg/vect/pr98064.cc 
>>> b/gcc/testsuite/g++.dg/vect/pr98064.cc
>>> index 74043ce7725..dcb2985d05a 100644
>>> --- a/gcc/testsuite/g++.dg/vect/pr98064.cc
>>> +++ b/gcc/testsuite/g++.dg/vect/pr98064.cc
>>> @@ -1,5 +1,7 @@
>>>  // { dg-do compile }
>>> -// { dg-additional-options "-O3" }
>>> +// { dg-additional-options "-O3 -Wno-stringop-overflow" }
>>> +/* There is warning message when "short g = var_8; g; g++"
>>> +   is optimized/analyzed as string operation,e.g. memset.  */
>>> 
>>>  const long long &min(const long long &__a, long long &__b) {
>>>    if (__b < __a)
>>> diff --git a/gcc/testsuite/gcc.dg/loop-split1.c 
>>> b/gcc/testsuite/gcc.dg/loop-split1.c
>>> new file mode 100644
>>> index 00000000000..30b006b1b14
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.dg/loop-split1.c
>>> @@ -0,0 +1,108 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O2 -fsplit-loops -fdump-tree-lsplit-details" } */
>>> +
>>> +void
>>> +foo (int *a, int *b, unsigned l, unsigned n)
>>> +{
>>> +  while (++l != n)
>>> +    a[l] = b[l]  + 1;
>>> +}
>>> +void
>>> +foo_1 (int *a, int *b, unsigned n)
>>> +{
>>> +  unsigned l = 0;
>>> +  while (++l != n)
>>> +    a[l] = b[l]  + 1;
>>> +}
>>> +
>>> +void
>>> +foo1 (int *a, int *b, unsigned l, unsigned n)
>>> +{
>>> +  while (l++ != n)
>>> +    a[l] = b[l]  + 1;
>>> +}
>>> +
>>> +/* No wrap.  */
>>> +void
>>> +foo1_1 (int *a, int *b, unsigned n)
>>> +{
>>> +  unsigned l = 0;
>>> +  while (l++ != n)
>>> +    a[l] = b[l]  + 1;
>>> +}
>>> +
>>> +unsigned
>>> +foo2 (char *a, char *b, unsigned l, unsigned n)
>>> +{
>>> +  while (++l != n)
>>> +    if (a[l] != b[l])
>>> +      break;
>>> +
>>> +  return l;
>>> +}
>>> +
>>> +unsigned
>>> +foo2_1 (char *a, char *b, unsigned l, unsigned n)
>>> +{
>>> +  l = 0;
>>> +  while (++l != n)
>>> +    if (a[l] != b[l])
>>> +      break;
>>> +
>>> +  return l;
>>> +}
>>> +
>>> +unsigned
>>> +foo3 (char *a, char *b, unsigned l, unsigned n)
>>> +{
>>> +  while (l++ != n)
>>> +    if (a[l] != b[l])
>>> +      break;
>>> +
>>> +  return l;
>>> +}
>>> +
>>> +/* No wrap.  */
>>> +unsigned
>>> +foo3_1 (char *a, char *b, unsigned l, unsigned n)
>>> +{
>>> +  l = 0;
>>> +  while (l++ != n)
>>> +    if (a[l] != b[l])
>>> +      break;
>>> +
>>> +  return l;
>>> +}
>>> +
>>> +void bar();
>>> +void foo4(unsigned n,  unsigned i)
>>> +{
>>> +  do
>>> +    {
>>> +      if (i == n)
>>> +        return;
>>> +      bar();
>>> +      ++i;
>>> +    }
>>> +  while (1);
>>> +}
>>> +
>>> +unsigned
>>> +foo5 (double *a, unsigned n, unsigned i)
>>> +{
>>> +  while (a[i] > 0 && i != n)
>>> +    i++;
>>> +
>>> +  return i;
>>> +}
>>> +
>>> +unsigned
>>> +find_skip_diff (char *p, char *q, unsigned n, unsigned i)
>>> +{
>>> +  while (p[i] == q[i] && ++i != n)
>>> +    p++,q++;
>>> +
>>> +  return i;
>>> +}
>>> +
>>> +/* { dg-final { scan-tree-dump-times "Loop split" 9 "lsplit" } } */
>>> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
>>> index 3a09bbc39e5..5c1742b5ff4 100644
>>> --- a/gcc/tree-ssa-loop-split.c
>>> +++ b/gcc/tree-ssa-loop-split.c
>>> @@ -41,6 +41,7 @@ along with GCC; see the file COPYING3.  If not see
>>>  #include "cfghooks.h"
>>>  #include "gimple-fold.h"
>>>  #include "gimplify-me.h"
>>> +#include "tree-ssa-loop-ivopts.h"
>>> 
>>>  /* This file implements two kinds of loop splitting.
>>> 
>>> @@ -233,7 +234,8 @@ easy_exit_values (class loop *loop)
>>>     this.  The loops need to fulfill easy_exit_values().  */
>>> 
>>>  static void
>>> -connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e)
>>> +connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e,
>>> +		   bool use_prev = false)
>>>  {
>>>    basic_block rest = loop_preheader_edge (loop2)->src;
>>>    gcc_assert (new_e->dest == rest);
>>> @@ -279,7 +281,8 @@ connect_loop_phis (class loop *loop1, class loop 
>>> *loop2, edge new_e)
>>> 
>>>        gphi * newphi = create_phi_node (new_init, rest);
>>>        add_phi_arg (newphi, init, skip_first, UNKNOWN_LOCATION);
>>> -      add_phi_arg (newphi, next, new_e, UNKNOWN_LOCATION);
>>> +      add_phi_arg (newphi, use_prev ? PHI_RESULT (phi_first) : next, 
>>> new_e,
>>> +		   UNKNOWN_LOCATION);
>>>        SET_USE (op, new_init);
>>>      }
>>>  }
>>> @@ -1593,6 +1596,184 @@ split_loop_on_cond (struct loop *loop)
>>>    return do_split;
>>>  }
>>> 
>>> +/* Check if the LOOP exit branch likes "if (idx != bound)",
>>> +   Return the branch edge which exit loop, if overflow/wrap
>>> +   may happen on "idx".  */
>>> +
>>> +static edge
>>> +get_ne_cond_branch (struct loop *loop)
>>> +{
>>> +  int i;
>>> +  edge e;
>>> +
>>> +  auto_vec<edge> edges = get_loop_exit_edges (loop);
>>> +  FOR_EACH_VEC_ELT (edges, i, e)
>>> +    {
>>> +      /* Check if there is possible wrap/overflow.  */
>>> +      class tree_niter_desc niter;
>>> +      if (!number_of_iterations_exit (loop, e, &niter, false, 
>>> false))
>>> +	continue;
>>> +      if (niter.control.no_overflow)
>>> +	return NULL;
>>> +      if (niter.cmp != NE_EXPR)
>>> +	continue;
>>> +
>>> +      /* Check loop is simple to split.  */
>>> +      if (single_pred_p (loop->latch)
>>> +	  && single_pred_edge (loop->latch)->src == e->src
>>> +	  && (gsi_end_p (gsi_start_nondebug_bb (loop->latch))))
>>> +	return e;
>>> +
>>> +      /* Simple header.  */
>>> +      if (e->src == loop->header)
>>> +	{
>>> +	  if (get_virtual_phi (e->src))
>>> +	    continue;
>>> +
>>> +	  /* Only one phi.  */
>>> +	  gphi_iterator psi = gsi_start_phis (e->src);
>>> +	  if (gsi_end_p (psi))
>>> +	    continue;
>>> +	  gsi_next (&psi);
>>> +	  if (!gsi_end_p (psi))
>>> +	    continue;
>>> +
>>> +	  /* ++i or ++i */
>>> +	  gimple_stmt_iterator gsi = gsi_start_bb (e->src);
>>> +	  if (gsi_end_p (gsi))
>>> +	    continue;
>>> +
>>> +	  gimple *gc = last_stmt (e->src);
>>> +	  tree idx = gimple_cond_lhs (gc);
>>> +	  if (expr_invariant_in_loop_p (loop, idx))
>>> +	    idx = gimple_cond_rhs (gc);
>>> +
>>> +	  gimple *s1 = gsi_stmt (gsi);
>>> +	  if (!(is_gimple_assign (s1) && idx
>>> +		&& (idx == gimple_assign_lhs (s1)
>>> +		    || idx == gimple_assign_rhs1 (s1))))
>>> +	    continue;
>>> +
>>> +	  gsi_next (&gsi);
>>> +	  if (!gsi_end_p (gsi) && gsi_stmt (gsi) == gc)
>>> +	    return e;
>>> +	}
>>> +    }
>>> +
>>> +  return NULL;
>>> +}
>>> +
>>> +/* Split the LOOP with NE_EXPR into two loops with GT_EXPR and 
>>> LT_EXPR.  */
>>> +
>>> +static bool
>>> +split_ne_loop (struct loop *loop, edge cond_e)
>>> +{
>>> +  initialize_original_copy_tables ();
>>> +
>>> +  struct loop *loop2 = loop_version (loop, boolean_true_node, NULL,
>>> +				     profile_probability::always (),
>>> +				     profile_probability::never (),
>>> +				     profile_probability::always (),
>>> +				     profile_probability::always (), true);
>>> +
>>> +  gcc_assert (loop2);
>>> +  update_ssa (TODO_update_ssa);
>>> +
>>> +  basic_block loop2_cond_exit_bb = get_bb_copy (cond_e->src);
>>> +  free_original_copy_tables ();
>>> +
>>> +  gcond *gc = as_a<gcond *> (last_stmt (cond_e->src));
>>> +  gcond *dup_gc = as_a<gcond *> (last_stmt (loop2_cond_exit_bb));
>>> +
>>> +  /* Change if (i != n) to LOOP1:if (i > n) and LOOP2:if (i < n) */
>>> +  bool inv = expr_invariant_in_loop_p (loop, gimple_cond_lhs (gc));
>>> +  enum tree_code up_code = inv ? LT_EXPR : GT_EXPR;
>>> +  enum tree_code down_code = inv ? GT_EXPR : LT_EXPR;
>>> +
>>> +  gimple_cond_set_code (gc, up_code);
>>> +  gimple_cond_set_code (dup_gc, down_code);
>>> +
>>> +  /* Link the exit cond edge to new loop.  */
>>> +  gcond *break_cond = as_a<gcond *> (gimple_copy (gc));
>>> +  edge pred_e = single_pred_edge (loop->latch);
>>> +  bool simple_loop = pred_e && pred_e->src == cond_e->src
>>> +		     && (gsi_end_p (gsi_start_nondebug_bb (loop->latch)));
>>> +  if (simple_loop)
>>> +    gimple_cond_set_code (break_cond, down_code);
>>> +  else
>>> +    gimple_cond_make_true (break_cond);
>>> +
>>> +  basic_block break_bb = split_edge (cond_e);
>>> +  gimple_stmt_iterator gsi = gsi_last_bb (break_bb);
>>> +  gsi_insert_after (&gsi, break_cond, GSI_NEW_STMT);
>>> +
>>> +  edge to_exit = single_succ_edge (break_bb);
>>> +  edge to_new_loop = make_edge (break_bb, loop_preheader_edge 
>>> (loop2)->src, 0);
>>> +  to_new_loop->flags |= EDGE_TRUE_VALUE;
>>> +  to_exit->flags |= EDGE_FALSE_VALUE;
>>> +  to_exit->flags &= ~EDGE_FALLTHRU;
>>> +  to_exit->probability = cond_e->probability;
>>> +  to_new_loop->probability = to_exit->probability.invert ();
>>> +
>>> +  update_ssa (TODO_update_ssa);
>>> +
>>> +  connect_loop_phis (loop, loop2, to_new_loop, !simple_loop);
>>> +
>>> +  rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
>>> +  if (dump_file && (dump_flags & TDF_DETAILS))
>>> +    fprintf (dump_file, ";; Loop split on wrap index.\n");
>>> +
>>> +  return true;
>>> +}
>>> +
>>> +/* Checks if LOOP contains a suitable NE_EXPR conditional block to 
>>> split.
>>> +L_H:
>>> + if (i!=N)
>>> +   S;
>>> + i++;
>>> + goto L_H;
>>> +
>>> +The "i!=N" is like "i>N || i<N", then it could be transform to:
>>> +
>>> +L_H:
>>> + if (i>N)
>>> +   S;
>>> + i++;
>>> + goto L_H;
>>> +L1_H:
>>> + if (i<N)
>>> +   S;
>>> + i++;
>>> + goto L1_H;
>>> +
>>> +The loop with "i<N" is in favor both GIMPLE and RTL passes.  */
>>> +
>>> +static bool
>>> +split_loop_on_ne_cond (class loop *loop)
>>> +{
>>> +  int num = 0;
>>> +  basic_block *bbs = get_loop_body (loop);
>>> +
>>> +  if (!can_copy_bbs_p (bbs, loop->num_nodes))
>>> +    {
>>> +      free (bbs);
>>> +      return false;
>>> +    }
>>> +
>>> +  for (unsigned i = 0; i < loop->num_nodes; i++)
>>> +    num += estimate_num_insns_seq (bb_seq (bbs[i]), 
>>> &eni_size_weights);
>>> +  free (bbs);
>>> +
>>> +  if (num > param_max_peeled_insns)
>>> +    return false;
>>> +
>>> +  edge branch_edge = get_ne_cond_branch (loop);
>>> +  if (branch_edge && split_ne_loop (loop, branch_edge))
>>> +    return true;
>>> +
>>> +  return false;
>>> +}
>>> +
>>>  /* Main entry point.  Perform loop splitting on all suitable loops.  
>>> */
>>> 
>>>  static unsigned int
>>> @@ -1622,7 +1803,8 @@ tree_ssa_split_loops (void)
>>>        if (optimize_loop_for_size_p (loop))
>>>  	continue;
>>> 
>>> -      if (split_loop (loop) || split_loop_on_cond (loop))
>>> +      if (split_loop (loop) || split_loop_on_cond (loop)
>>> +	  || split_loop_on_ne_cond (loop))
>>>  	{
>>>  	  /* Mark our containing loop as having had some split inner loops. 
>>>  */
>>>  	  loop_outer (loop)->aux = loop;
>>> 
>> 
>> Hi,
>> 
>> I've tried this patch and it does not seem to pass its own test:
>> 
>> FAIL: gcc.dg/loop-split1.c scan-tree-dump-times lsplit "Loop split" 9
>> 
>> Note you should always do a bootstrap and regression test as described
>> here: https://gcc.gnu.org/contribute.html#testing
>> 
>> Also please state in your future patch submissions how you tested the 
>> patch,
>> like "bootstrap and regression test on x86_64-pc-linux-gnu" for 
>> instance.
> 
> Hi Bernd,
> 
> Thanks for your tests and comments!
> 
> It is interesting, this patch would be target independent.
> I had a double test, it passes at my side.
> 
> # of expected passes            2
> 
> And if change the test case slightly, the case fails.
> e.g. change to: scan-tree-dump-times "Loop split" 10 "lsplit"
> 
> My test machine is powerpc64le-linux-gnu.

On x86_64-pc-linux-gnu, I can reproduce the failure too.

It is 8 times of "Loop split".

I will have a look to see why.

BR.

Jiufu Guo.
> 
> BR,
> Jiufu Guo.
> 
>> 
>> 
>> Bernd.
Segher Boessenkool May 18, 2021, 11 a.m. UTC | #4
On Tue, May 18, 2021 at 08:36:34AM +0200, Bernd Edlinger wrote:
> On 5/17/21 4:01 AM, Jiufu Guo via Gcc-patches wrote:
> > Bootstrap and regtest pass on ppc64le.  Is this ok for trunk?

> I've tried this patch and it does not seem to pass its own test:
> 
> FAIL: gcc.dg/loop-split1.c scan-tree-dump-times lsplit "Loop split" 9

On what target?  :-P

> Note you should always do a bootstrap and regression test as described
> here: https://gcc.gnu.org/contribute.html#testing
> 
> Also please state in your future patch submissions how you tested the patch,
> like "bootstrap and regression test on x86_64-pc-linux-gnu" for instance.

Jiu Fu did all that.  If you understand that ppc64le means
powerpc64-linux of course.

It is interesting one loop fails on x86.  But that is why we have
stage1 :-)


Segher
Jiufu Guo May 18, 2021, 11:04 a.m. UTC | #5
On 2021-05-18 18:32, guojiufu wrote:
> On 2021-05-18 17:28, guojiufu via Gcc-patches wrote:
>> On 2021-05-18 14:36, Bernd Edlinger wrote:
>>> On 5/17/21 4:01 AM, Jiufu Guo via Gcc-patches wrote:
>>>> When there is the possibility that overflow/wrap may happen on the 
>>>> loop index,
>>>> a few optimizations would not happen. For example code:
>>>> 
>>>> foo (int *a, int *b, unsigned k, unsigned n)
>>>> {
>>>>   while (++k != n)
>>>>     a[k] = b[k]  + 1;
>>>> }
>>>> 
>>>> For this code, if "k > n", k would wrap.  if "k < n" at begining,
>>>> it could be optimized (e.g. vectorization).
>>>> 
>>>> We can split the loop into two loops:
>>>> 
>>>>   while (++k > n)
>>>>     a[k] = b[k]  + 1;
>>>>   while (l++ < n)
>>>>     a[k] = b[k]  + 1;
>>>> 
>>>> then for the second loop, it could be optimized.
>>>> 
>>>> This patch is spltting this kind of small loop to achieve better 
>>>> performance.
>>>> 
>>>> Bootstrap and regtest pass on ppc64le.  Is this ok for trunk?
>>>> 
>>>> Thanks!
>>>> 
>>>> Jiufu Guo.
>>>> 
>>>> gcc/ChangeLog:
>>>> 
>>>> 2021-05-15  Jiufu Guo  <guojiufu@linux.ibm.com>
>>>> 
>>>> 	* tree-ssa-loop-split.c (connect_loop_phis): Add new param.
>>>> 	(get_ne_cond_branch): New function.
>>>> 	(split_ne_loop): New function.
>>>> 	(split_loop_on_ne_cond): New function.
>>>> 	(tree_ssa_split_loops): Use split_loop_on_ne_cond.
>>>> 
>>>> gcc/testsuite/ChangeLog:
>>>> 
>>>> 2021-05-15  Jiufu Guo  <guojiufu@linux.ibm.com>
>>>> 
>>>> 	* gcc.dg/loop-split1.c: New test.
>>>> 	* g++.dg/vect/pr98064.cc: Suppress warning.
>>>> ---
>>>>  gcc/testsuite/g++.dg/vect/pr98064.cc |   4 +-
>>>>  gcc/testsuite/gcc.dg/loop-split1.c   | 108 +++++++++++++++
>>>>  gcc/tree-ssa-loop-split.c            | 188 
>>>> ++++++++++++++++++++++++++-
>>>>  3 files changed, 296 insertions(+), 4 deletions(-)
>>>>  create mode 100644 gcc/testsuite/gcc.dg/loop-split1.c
>>>> 
>>>> diff --git a/gcc/testsuite/g++.dg/vect/pr98064.cc 
>>>> b/gcc/testsuite/g++.dg/vect/pr98064.cc
>>>> index 74043ce7725..dcb2985d05a 100644
>>>> --- a/gcc/testsuite/g++.dg/vect/pr98064.cc
>>>> +++ b/gcc/testsuite/g++.dg/vect/pr98064.cc
>>>> @@ -1,5 +1,7 @@
>>>>  // { dg-do compile }
>>>> -// { dg-additional-options "-O3" }
>>>> +// { dg-additional-options "-O3 -Wno-stringop-overflow" }
>>>> +/* There is warning message when "short g = var_8; g; g++"
>>>> +   is optimized/analyzed as string operation,e.g. memset.  */
>>>> 
>>>>  const long long &min(const long long &__a, long long &__b) {
>>>>    if (__b < __a)
>>>> diff --git a/gcc/testsuite/gcc.dg/loop-split1.c 
>>>> b/gcc/testsuite/gcc.dg/loop-split1.c
>>>> new file mode 100644
>>>> index 00000000000..30b006b1b14
>>>> --- /dev/null
>>>> +++ b/gcc/testsuite/gcc.dg/loop-split1.c
>>>> @@ -0,0 +1,108 @@
>>>> +/* { dg-do compile } */
>>>> +/* { dg-options "-O2 -fsplit-loops -fdump-tree-lsplit-details" } */
>>>> +
>>>> +void
>>>> +foo (int *a, int *b, unsigned l, unsigned n)
>>>> +{
>>>> +  while (++l != n)
>>>> +    a[l] = b[l]  + 1;
>>>> +}
>>>> +void
>>>> +foo_1 (int *a, int *b, unsigned n)
>>>> +{
>>>> +  unsigned l = 0;
>>>> +  while (++l != n)
>>>> +    a[l] = b[l]  + 1;
>>>> +}
>>>> +
>>>> +void
>>>> +foo1 (int *a, int *b, unsigned l, unsigned n)
>>>> +{
>>>> +  while (l++ != n)
>>>> +    a[l] = b[l]  + 1;
>>>> +}
>>>> +
>>>> +/* No wrap.  */
>>>> +void
>>>> +foo1_1 (int *a, int *b, unsigned n)
>>>> +{
>>>> +  unsigned l = 0;
>>>> +  while (l++ != n)
>>>> +    a[l] = b[l]  + 1;
>>>> +}
>>>> +
>>>> +unsigned
>>>> +foo2 (char *a, char *b, unsigned l, unsigned n)
>>>> +{
>>>> +  while (++l != n)
>>>> +    if (a[l] != b[l])
>>>> +      break;
>>>> +
>>>> +  return l;
>>>> +}
>>>> +
>>>> +unsigned
>>>> +foo2_1 (char *a, char *b, unsigned l, unsigned n)
>>>> +{
>>>> +  l = 0;
>>>> +  while (++l != n)
>>>> +    if (a[l] != b[l])
>>>> +      break;
>>>> +
>>>> +  return l;
>>>> +}
>>>> +
>>>> +unsigned
>>>> +foo3 (char *a, char *b, unsigned l, unsigned n)
>>>> +{
>>>> +  while (l++ != n)
>>>> +    if (a[l] != b[l])
>>>> +      break;
>>>> +
>>>> +  return l;
>>>> +}
>>>> +
>>>> +/* No wrap.  */
>>>> +unsigned
>>>> +foo3_1 (char *a, char *b, unsigned l, unsigned n)
>>>> +{
>>>> +  l = 0;
>>>> +  while (l++ != n)
>>>> +    if (a[l] != b[l])
>>>> +      break;
>>>> +
>>>> +  return l;
>>>> +}
>>>> +
>>>> +void bar();
>>>> +void foo4(unsigned n,  unsigned i)
>>>> +{
>>>> +  do
>>>> +    {
>>>> +      if (i == n)
>>>> +        return;
>>>> +      bar();
>>>> +      ++i;
>>>> +    }
>>>> +  while (1);
>>>> +}
>>>> +
>>>> +unsigned
>>>> +foo5 (double *a, unsigned n, unsigned i)
>>>> +{
>>>> +  while (a[i] > 0 && i != n)
>>>> +    i++;
>>>> +
>>>> +  return i;
>>>> +}
>>>> +
>>>> +unsigned
>>>> +find_skip_diff (char *p, char *q, unsigned n, unsigned i)
>>>> +{
>>>> +  while (p[i] == q[i] && ++i != n)
>>>> +    p++,q++;
>>>> +
>>>> +  return i;
>>>> +}
>>>> +
>>>> +/* { dg-final { scan-tree-dump-times "Loop split" 9 "lsplit" } } */
>>>> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
>>>> index 3a09bbc39e5..5c1742b5ff4 100644
>>>> --- a/gcc/tree-ssa-loop-split.c
>>>> +++ b/gcc/tree-ssa-loop-split.c
>>>> @@ -41,6 +41,7 @@ along with GCC; see the file COPYING3.  If not see
>>>>  #include "cfghooks.h"
>>>>  #include "gimple-fold.h"
>>>>  #include "gimplify-me.h"
>>>> +#include "tree-ssa-loop-ivopts.h"
>>>> 
>>>>  /* This file implements two kinds of loop splitting.
>>>> 
>>>> @@ -233,7 +234,8 @@ easy_exit_values (class loop *loop)
>>>>     this.  The loops need to fulfill easy_exit_values().  */
>>>> 
>>>>  static void
>>>> -connect_loop_phis (class loop *loop1, class loop *loop2, edge 
>>>> new_e)
>>>> +connect_loop_phis (class loop *loop1, class loop *loop2, edge 
>>>> new_e,
>>>> +		   bool use_prev = false)
>>>>  {
>>>>    basic_block rest = loop_preheader_edge (loop2)->src;
>>>>    gcc_assert (new_e->dest == rest);
>>>> @@ -279,7 +281,8 @@ connect_loop_phis (class loop *loop1, class loop 
>>>> *loop2, edge new_e)
>>>> 
>>>>        gphi * newphi = create_phi_node (new_init, rest);
>>>>        add_phi_arg (newphi, init, skip_first, UNKNOWN_LOCATION);
>>>> -      add_phi_arg (newphi, next, new_e, UNKNOWN_LOCATION);
>>>> +      add_phi_arg (newphi, use_prev ? PHI_RESULT (phi_first) : 
>>>> next, new_e,
>>>> +		   UNKNOWN_LOCATION);
>>>>        SET_USE (op, new_init);
>>>>      }
>>>>  }
>>>> @@ -1593,6 +1596,184 @@ split_loop_on_cond (struct loop *loop)
>>>>    return do_split;
>>>>  }
>>>> 
>>>> +/* Check if the LOOP exit branch likes "if (idx != bound)",
>>>> +   Return the branch edge which exit loop, if overflow/wrap
>>>> +   may happen on "idx".  */
>>>> +
>>>> +static edge
>>>> +get_ne_cond_branch (struct loop *loop)
>>>> +{
>>>> +  int i;
>>>> +  edge e;
>>>> +
>>>> +  auto_vec<edge> edges = get_loop_exit_edges (loop);
>>>> +  FOR_EACH_VEC_ELT (edges, i, e)
>>>> +    {
>>>> +      /* Check if there is possible wrap/overflow.  */
>>>> +      class tree_niter_desc niter;
>>>> +      if (!number_of_iterations_exit (loop, e, &niter, false, 
>>>> false))
>>>> +	continue;
>>>> +      if (niter.control.no_overflow)
>>>> +	return NULL;
>>>> +      if (niter.cmp != NE_EXPR)
>>>> +	continue;
>>>> +
>>>> +      /* Check loop is simple to split.  */
>>>> +      if (single_pred_p (loop->latch)
>>>> +	  && single_pred_edge (loop->latch)->src == e->src
>>>> +	  && (gsi_end_p (gsi_start_nondebug_bb (loop->latch))))
>>>> +	return e;
>>>> +
>>>> +      /* Simple header.  */
>>>> +      if (e->src == loop->header)
>>>> +	{
>>>> +	  if (get_virtual_phi (e->src))
>>>> +	    continue;
>>>> +
>>>> +	  /* Only one phi.  */
>>>> +	  gphi_iterator psi = gsi_start_phis (e->src);
>>>> +	  if (gsi_end_p (psi))
>>>> +	    continue;
>>>> +	  gsi_next (&psi);
>>>> +	  if (!gsi_end_p (psi))
>>>> +	    continue;
>>>> +
>>>> +	  /* ++i or ++i */
>>>> +	  gimple_stmt_iterator gsi = gsi_start_bb (e->src);
>>>> +	  if (gsi_end_p (gsi))
>>>> +	    continue;
>>>> +
>>>> +	  gimple *gc = last_stmt (e->src);
>>>> +	  tree idx = gimple_cond_lhs (gc);
>>>> +	  if (expr_invariant_in_loop_p (loop, idx))
>>>> +	    idx = gimple_cond_rhs (gc);
>>>> +
>>>> +	  gimple *s1 = gsi_stmt (gsi);
>>>> +	  if (!(is_gimple_assign (s1) && idx
>>>> +		&& (idx == gimple_assign_lhs (s1)
>>>> +		    || idx == gimple_assign_rhs1 (s1))))
>>>> +	    continue;
>>>> +
>>>> +	  gsi_next (&gsi);
>>>> +	  if (!gsi_end_p (gsi) && gsi_stmt (gsi) == gc)
>>>> +	    return e;
>>>> +	}
>>>> +    }
>>>> +
>>>> +  return NULL;
>>>> +}
>>>> +
>>>> +/* Split the LOOP with NE_EXPR into two loops with GT_EXPR and 
>>>> LT_EXPR.  */
>>>> +
>>>> +static bool
>>>> +split_ne_loop (struct loop *loop, edge cond_e)
>>>> +{
>>>> +  initialize_original_copy_tables ();
>>>> +
>>>> +  struct loop *loop2 = loop_version (loop, boolean_true_node, NULL,
>>>> +				     profile_probability::always (),
>>>> +				     profile_probability::never (),
>>>> +				     profile_probability::always (),
>>>> +				     profile_probability::always (), true);
>>>> +
>>>> +  gcc_assert (loop2);
>>>> +  update_ssa (TODO_update_ssa);
>>>> +
>>>> +  basic_block loop2_cond_exit_bb = get_bb_copy (cond_e->src);
>>>> +  free_original_copy_tables ();
>>>> +
>>>> +  gcond *gc = as_a<gcond *> (last_stmt (cond_e->src));
>>>> +  gcond *dup_gc = as_a<gcond *> (last_stmt (loop2_cond_exit_bb));
>>>> +
>>>> +  /* Change if (i != n) to LOOP1:if (i > n) and LOOP2:if (i < n) */
>>>> +  bool inv = expr_invariant_in_loop_p (loop, gimple_cond_lhs (gc));
>>>> +  enum tree_code up_code = inv ? LT_EXPR : GT_EXPR;
>>>> +  enum tree_code down_code = inv ? GT_EXPR : LT_EXPR;
>>>> +
>>>> +  gimple_cond_set_code (gc, up_code);
>>>> +  gimple_cond_set_code (dup_gc, down_code);
>>>> +
>>>> +  /* Link the exit cond edge to new loop.  */
>>>> +  gcond *break_cond = as_a<gcond *> (gimple_copy (gc));
>>>> +  edge pred_e = single_pred_edge (loop->latch);
>>>> +  bool simple_loop = pred_e && pred_e->src == cond_e->src
>>>> +		     && (gsi_end_p (gsi_start_nondebug_bb (loop->latch)));
>>>> +  if (simple_loop)
>>>> +    gimple_cond_set_code (break_cond, down_code);
>>>> +  else
>>>> +    gimple_cond_make_true (break_cond);
>>>> +
>>>> +  basic_block break_bb = split_edge (cond_e);
>>>> +  gimple_stmt_iterator gsi = gsi_last_bb (break_bb);
>>>> +  gsi_insert_after (&gsi, break_cond, GSI_NEW_STMT);
>>>> +
>>>> +  edge to_exit = single_succ_edge (break_bb);
>>>> +  edge to_new_loop = make_edge (break_bb, loop_preheader_edge 
>>>> (loop2)->src, 0);
>>>> +  to_new_loop->flags |= EDGE_TRUE_VALUE;
>>>> +  to_exit->flags |= EDGE_FALSE_VALUE;
>>>> +  to_exit->flags &= ~EDGE_FALLTHRU;
>>>> +  to_exit->probability = cond_e->probability;
>>>> +  to_new_loop->probability = to_exit->probability.invert ();
>>>> +
>>>> +  update_ssa (TODO_update_ssa);
>>>> +
>>>> +  connect_loop_phis (loop, loop2, to_new_loop, !simple_loop);
>>>> +
>>>> +  rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
>>>> +  if (dump_file && (dump_flags & TDF_DETAILS))
>>>> +    fprintf (dump_file, ";; Loop split on wrap index.\n");
>>>> +
>>>> +  return true;
>>>> +}
>>>> +
>>>> +/* Checks if LOOP contains a suitable NE_EXPR conditional block to 
>>>> split.
>>>> +L_H:
>>>> + if (i!=N)
>>>> +   S;
>>>> + i++;
>>>> + goto L_H;
>>>> +
>>>> +The "i!=N" is like "i>N || i<N", then it could be transform to:
>>>> +
>>>> +L_H:
>>>> + if (i>N)
>>>> +   S;
>>>> + i++;
>>>> + goto L_H;
>>>> +L1_H:
>>>> + if (i<N)
>>>> +   S;
>>>> + i++;
>>>> + goto L1_H;
>>>> +
>>>> +The loop with "i<N" is in favor both GIMPLE and RTL passes.  */
>>>> +
>>>> +static bool
>>>> +split_loop_on_ne_cond (class loop *loop)
>>>> +{
>>>> +  int num = 0;
>>>> +  basic_block *bbs = get_loop_body (loop);
>>>> +
>>>> +  if (!can_copy_bbs_p (bbs, loop->num_nodes))
>>>> +    {
>>>> +      free (bbs);
>>>> +      return false;
>>>> +    }
>>>> +
>>>> +  for (unsigned i = 0; i < loop->num_nodes; i++)
>>>> +    num += estimate_num_insns_seq (bb_seq (bbs[i]), 
>>>> &eni_size_weights);
>>>> +  free (bbs);
>>>> +
>>>> +  if (num > param_max_peeled_insns)
>>>> +    return false;
>>>> +
>>>> +  edge branch_edge = get_ne_cond_branch (loop);
>>>> +  if (branch_edge && split_ne_loop (loop, branch_edge))
>>>> +    return true;
>>>> +
>>>> +  return false;
>>>> +}
>>>> +
>>>>  /* Main entry point.  Perform loop splitting on all suitable loops. 
>>>>  */
>>>> 
>>>>  static unsigned int
>>>> @@ -1622,7 +1803,8 @@ tree_ssa_split_loops (void)
>>>>        if (optimize_loop_for_size_p (loop))
>>>>  	continue;
>>>> 
>>>> -      if (split_loop (loop) || split_loop_on_cond (loop))
>>>> +      if (split_loop (loop) || split_loop_on_cond (loop)
>>>> +	  || split_loop_on_ne_cond (loop))
>>>>  	{
>>>>  	  /* Mark our containing loop as having had some split inner 
>>>> loops.  */
>>>>  	  loop_outer (loop)->aux = loop;
>>>> 
>>> 
>>> Hi,
>>> 
>>> I've tried this patch and it does not seem to pass its own test:
>>> 
>>> FAIL: gcc.dg/loop-split1.c scan-tree-dump-times lsplit "Loop split" 9
>>> 
>>> Note you should always do a bootstrap and regression test as 
>>> described
>>> here: https://gcc.gnu.org/contribute.html#testing
>>> 
>>> Also please state in your future patch submissions how you tested the 
>>> patch,
>>> like "bootstrap and regression test on x86_64-pc-linux-gnu" for 
>>> instance.
>> 
>> Hi Bernd,
>> 
>> Thanks for your tests and comments!
>> 
>> It is interesting, this patch would be target independent.
>> I had a double test, it passes at my side.
>> 
>> # of expected passes            2
>> 
>> And if change the test case slightly, the case fails.
>> e.g. change to: scan-tree-dump-times "Loop split" 10 "lsplit"
>> 
>> My test machine is powerpc64le-linux-gnu.
> 
> On x86_64-pc-linux-gnu, I can reproduce the failure too.
> 
> It is 8 times of "Loop split".
> 
> I will have a look to see why.

It is:
unsigned
foo5 (double *a, unsigned n, unsigned i)
{
   while (a[i] > 0 && i != n)
     i++;

   return i;
}

on x86, the "&&" is transformed as "&"
   _11 = n_9(D) != i_10;
   _13 = _4 > 0.0;
   _12 = _11 & _13;

on powerpc64le, two gcond(s) are kept.

I would remove "foo5", and check "8" times of "Loop split".

Thanks again.

BR.
Jiufu Guo.

> 
> BR.
> 
> Jiufu Guo.
>> 
>> BR,
>> Jiufu Guo.
>> 
>>> 
>>> 
>>> Bernd.
Bernd Edlinger May 18, 2021, 11:14 a.m. UTC | #6
On 5/18/21 1:00 PM, Segher Boessenkool wrote:> On Tue, May 18, 2021 at 08:36:34AM +0200, Bernd Edlinger wrote:
>> On 5/17/21 4:01 AM, Jiufu Guo via Gcc-patches wrote:
>>> Bootstrap and regtest pass on ppc64le.  Is this ok for trunk?
> 
>> I've tried this patch and it does not seem to pass its own test:
>>
>> FAIL: gcc.dg/loop-split1.c scan-tree-dump-times lsplit "Loop split" 9
> 
> On what target?  :-P
> 
>> Note you should always do a bootstrap and regression test as described
>> here: https://gcc.gnu.org/contribute.html#testing
>>
>> Also please state in your future patch submissions how you tested the patch,
>> like "bootstrap and regression test on x86_64-pc-linux-gnu" for instance.
> 
> Jiu Fu did all that.  If you understand that ppc64le means
> powerpc64-linux of course.
> 

Yeah indeed, I overlooked that, sorry for the noise.

> It is interesting one loop fails on x86.  But that is why we have
> stage1 :-)
> 

Sure, after looking more closely it seems like 90% of the test works
already.


Bernd.


> 
> Segher
>
Richard Biener May 26, 2021, 9:50 a.m. UTC | #7
On Mon, 17 May 2021, Jiufu Guo wrote:

> When there is the possibility that overflow/wrap may happen on the loop index,
> a few optimizations would not happen. For example code:
> 
> foo (int *a, int *b, unsigned k, unsigned n)
> {
>   while (++k != n)
>     a[k] = b[k]  + 1;
> }
> 
> For this code, if "k > n", k would wrap.  if "k < n" at begining,
> it could be optimized (e.g. vectorization).
> 
> We can split the loop into two loops:
> 
>   while (++k > n)
>     a[k] = b[k]  + 1;
>   while (l++ < n)
>     a[k] = b[k]  + 1;
> 
> then for the second loop, it could be optimized.

Btw, I think even the first loop should be vectorized.  I see we do
not handle it in niter analysis:

Analyzing loop at t.c:3
t.c:3:14: note:  === analyze_loop_nest ===
t.c:3:14: note:   === vect_analyze_loop_form ===
t.c:3:14: note:    === get_loop_niters ===
t.c:3:14: missed:   not vectorized: number of iterations cannot be 
computed.

but the number of iterations should be UINT_MAX - k (unless I'm
missing sth), may_be_zero would be sth like k < n.  It would be
nice to not split this into loops that niter analysis cannot handle ...

> This patch is spltting this kind of small loop to achieve better performance.
> 
> Bootstrap and regtest pass on ppc64le.  Is this ok for trunk?
> 
> Thanks!
> 
> Jiufu Guo.
> 
> gcc/ChangeLog:
> 
> 2021-05-15  Jiufu Guo  <guojiufu@linux.ibm.com>
> 
> 	* tree-ssa-loop-split.c (connect_loop_phis): Add new param.
> 	(get_ne_cond_branch): New function.
> 	(split_ne_loop): New function.
> 	(split_loop_on_ne_cond): New function.
> 	(tree_ssa_split_loops): Use split_loop_on_ne_cond.
> 
> gcc/testsuite/ChangeLog:
> 
> 2021-05-15  Jiufu Guo  <guojiufu@linux.ibm.com>
> 
> 	* gcc.dg/loop-split1.c: New test.
> 	* g++.dg/vect/pr98064.cc: Suppress warning.
> ---
>  gcc/testsuite/g++.dg/vect/pr98064.cc |   4 +-
>  gcc/testsuite/gcc.dg/loop-split1.c   | 108 +++++++++++++++
>  gcc/tree-ssa-loop-split.c            | 188 ++++++++++++++++++++++++++-
>  3 files changed, 296 insertions(+), 4 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/loop-split1.c
> 
> diff --git a/gcc/testsuite/g++.dg/vect/pr98064.cc b/gcc/testsuite/g++.dg/vect/pr98064.cc
> index 74043ce7725..dcb2985d05a 100644
> --- a/gcc/testsuite/g++.dg/vect/pr98064.cc
> +++ b/gcc/testsuite/g++.dg/vect/pr98064.cc
> @@ -1,5 +1,7 @@
>  // { dg-do compile }
> -// { dg-additional-options "-O3" }
> +// { dg-additional-options "-O3 -Wno-stringop-overflow" }
> +/* There is warning message when "short g = var_8; g; g++"
> +   is optimized/analyzed as string operation,e.g. memset.  */
>  
>  const long long &min(const long long &__a, long long &__b) {
>    if (__b < __a)
> diff --git a/gcc/testsuite/gcc.dg/loop-split1.c b/gcc/testsuite/gcc.dg/loop-split1.c
> new file mode 100644
> index 00000000000..30b006b1b14
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/loop-split1.c
> @@ -0,0 +1,108 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fsplit-loops -fdump-tree-lsplit-details" } */
> +
> +void
> +foo (int *a, int *b, unsigned l, unsigned n)
> +{
> +  while (++l != n)
> +    a[l] = b[l]  + 1;
> +}
> +void
> +foo_1 (int *a, int *b, unsigned n)
> +{
> +  unsigned l = 0;
> +  while (++l != n)
> +    a[l] = b[l]  + 1;
> +}
> +
> +void
> +foo1 (int *a, int *b, unsigned l, unsigned n)
> +{
> +  while (l++ != n)
> +    a[l] = b[l]  + 1;
> +}
> +
> +/* No wrap.  */
> +void
> +foo1_1 (int *a, int *b, unsigned n)
> +{
> +  unsigned l = 0;
> +  while (l++ != n)
> +    a[l] = b[l]  + 1;
> +}
> +
> +unsigned
> +foo2 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  while (++l != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +unsigned
> +foo2_1 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  l = 0;
> +  while (++l != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +unsigned
> +foo3 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  while (l++ != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +/* No wrap.  */
> +unsigned
> +foo3_1 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  l = 0;
> +  while (l++ != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +void bar();
> +void foo4(unsigned n,  unsigned i)
> +{
> +  do
> +    {
> +      if (i == n)
> +        return;
> +      bar();
> +      ++i;
> +    }
> +  while (1);
> +}
> +
> +unsigned
> +foo5 (double *a, unsigned n, unsigned i)
> +{
> +  while (a[i] > 0 && i != n)
> +    i++;
> +
> +  return i;
> +}
> +
> +unsigned
> +find_skip_diff (char *p, char *q, unsigned n, unsigned i)
> +{
> +  while (p[i] == q[i] && ++i != n)
> +    p++,q++;
> +
> +  return i;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "Loop split" 9 "lsplit" } } */

Please consider making the testcase execute ones, verifying computation
results.

> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
> index 3a09bbc39e5..5c1742b5ff4 100644
> --- a/gcc/tree-ssa-loop-split.c
> +++ b/gcc/tree-ssa-loop-split.c
> @@ -41,6 +41,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "cfghooks.h"
>  #include "gimple-fold.h"
>  #include "gimplify-me.h"
> +#include "tree-ssa-loop-ivopts.h"
>  
>  /* This file implements two kinds of loop splitting.
>  
> @@ -233,7 +234,8 @@ easy_exit_values (class loop *loop)
>     this.  The loops need to fulfill easy_exit_values().  */
>  

please document use_prev

>  static void
> -connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e)
> +connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e,
> +		   bool use_prev = false)
>  {
>    basic_block rest = loop_preheader_edge (loop2)->src;
>    gcc_assert (new_e->dest == rest);
> @@ -279,7 +281,8 @@ connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e)
>  
>        gphi * newphi = create_phi_node (new_init, rest);
>        add_phi_arg (newphi, init, skip_first, UNKNOWN_LOCATION);
> -      add_phi_arg (newphi, next, new_e, UNKNOWN_LOCATION);
> +      add_phi_arg (newphi, use_prev ? PHI_RESULT (phi_first) : next, new_e,
> +		   UNKNOWN_LOCATION);
>        SET_USE (op, new_init);
>      }
>  }
> @@ -1593,6 +1596,184 @@ split_loop_on_cond (struct loop *loop)
>    return do_split;
>  }
>  
> +/* Check if the LOOP exit branch likes "if (idx != bound)",

is like

> +   Return the branch edge which exit loop, if overflow/wrap
> +   may happen on "idx".  */

I think we only want to handle wrapping (thus not undefined overflow).

> +
> +static edge
> +get_ne_cond_branch (struct loop *loop)
> +{
> +  int i;
> +  edge e;
> +
> +  auto_vec<edge> edges = get_loop_exit_edges (loop);
> +  FOR_EACH_VEC_ELT (edges, i, e)
> +    {
> +      /* Check if there is possible wrap/overflow.  */
> +      class tree_niter_desc niter;
> +      if (!number_of_iterations_exit (loop, e, &niter, false, false))
> +	continue;
> +      if (niter.control.no_overflow)
> +	return NULL;
> +      if (niter.cmp != NE_EXPR)
> +	continue;
> +
> +      /* Check loop is simple to split.  */

it seems like the following and below condition mean "simple"
is either all code is before or after the exit in question,
please improve the comment to explain the two cases.

> +      if (single_pred_p (loop->latch)
> +	  && single_pred_edge (loop->latch)->src == e->src
> +	  && (gsi_end_p (gsi_start_nondebug_bb (loop->latch))))

split_loop uses empty_block_p (loop->latch).

> +	return e;
> +
> +      /* Simple header.  */
> +      if (e->src == loop->header)
> +	{
> +	  if (get_virtual_phi (e->src))
> +	    continue;

So this disqualifies all loops which store to memory?

> +
> +	  /* Only one phi.  */
> +	  gphi_iterator psi = gsi_start_phis (e->src);
> +	  if (gsi_end_p (psi))
> +	    continue;
> +	  gsi_next (&psi);
> +	  if (!gsi_end_p (psi))
> +	    continue;
> +
> +	  /* ++i or ++i */
> +	  gimple_stmt_iterator gsi = gsi_start_bb (e->src);

I think you want gsi_start_nondebug_after_labels_bb (e->src)

> +
> +	  gimple *gc = last_stmt (e->src);
> +	  tree idx = gimple_cond_lhs (gc);

you have to check the last stmt is a GIMPLE_COND, we have
recorded exits that exit on EH for example.

> +	  if (expr_invariant_in_loop_p (loop, idx))
> +	    idx = gimple_cond_rhs (gc);
> +
> +	  gimple *s1 = gsi_stmt (gsi);
> +	  if (!(is_gimple_assign (s1) && idx
> +		&& (idx == gimple_assign_lhs (s1)
> +		    || idx == gimple_assign_rhs1 (s1))))
> +	    continue;

I think you want to allow a IV PHI plus an IV increment
in the block and nothing else.  The pattern matching isn't
exact though and could be fooled by, say

 header:
   i_2 = PHI <i_5(D), i_3(latch)>
   _3 = i_2 + 5;
   if (i_2 != n_4(D))
     goto exit;

 latch:
   i_3 = i_2 + 1;
   foo (_3);
   goto header;

that is, the increment you matched is not the new iterator value
but instead a derived value used in a stmt in the latch.  So
I think if you really cannot allow any code besides the IV
increment then you need to actually get the PHI node and
match the only real stmt with the definition of the PHIs
latch edge value.

Or relax all this, of course.

> +	  gsi_next (&gsi);
> +	  if (!gsi_end_p (gsi) && gsi_stmt (gsi) == gc)
> +	    return e;
> +	}
> +    }
> +
> +  return NULL;
> +}
> +
> +/* Split the LOOP with NE_EXPR into two loops with GT_EXPR and LT_EXPR.  */
> +
> +static bool
> +split_ne_loop (struct loop *loop, edge cond_e)
> +{
> +  initialize_original_copy_tables ();
> +
> +  struct loop *loop2 = loop_version (loop, boolean_true_node, NULL,
> +				     profile_probability::always (),
> +				     profile_probability::never (),
> +				     profile_probability::always (),
> +				     profile_probability::always (), true);
> +
> +  gcc_assert (loop2);
> +  update_ssa (TODO_update_ssa);
> +
> +  basic_block loop2_cond_exit_bb = get_bb_copy (cond_e->src);
> +  free_original_copy_tables ();
> +
> +  gcond *gc = as_a<gcond *> (last_stmt (cond_e->src));
> +  gcond *dup_gc = as_a<gcond *> (last_stmt (loop2_cond_exit_bb));
> +
> +  /* Change if (i != n) to LOOP1:if (i > n) and LOOP2:if (i < n) */
> +  bool inv = expr_invariant_in_loop_p (loop, gimple_cond_lhs (gc));
> +  enum tree_code up_code = inv ? LT_EXPR : GT_EXPR;
> +  enum tree_code down_code = inv ? GT_EXPR : LT_EXPR;
> +
> +  gimple_cond_set_code (gc, up_code);
> +  gimple_cond_set_code (dup_gc, down_code);
> +
> +  /* Link the exit cond edge to new loop.  */
> +  gcond *break_cond = as_a<gcond *> (gimple_copy (gc));
> +  edge pred_e = single_pred_edge (loop->latch);
> +  bool simple_loop = pred_e && pred_e->src == cond_e->src
> +		     && (gsi_end_p (gsi_start_nondebug_bb (loop->latch)));
> +  if (simple_loop)
> +    gimple_cond_set_code (break_cond, down_code);
> +  else
> +    gimple_cond_make_true (break_cond);
> +
> +  basic_block break_bb = split_edge (cond_e);
> +  gimple_stmt_iterator gsi = gsi_last_bb (break_bb);
> +  gsi_insert_after (&gsi, break_cond, GSI_NEW_STMT);
> +
> +  edge to_exit = single_succ_edge (break_bb);
> +  edge to_new_loop = make_edge (break_bb, loop_preheader_edge (loop2)->src, 0);
> +  to_new_loop->flags |= EDGE_TRUE_VALUE;
> +  to_exit->flags |= EDGE_FALSE_VALUE;
> +  to_exit->flags &= ~EDGE_FALLTHRU;
> +  to_exit->probability = cond_e->probability;
> +  to_new_loop->probability = to_exit->probability.invert ();
> +
> +  update_ssa (TODO_update_ssa);
> +
> +  connect_loop_phis (loop, loop2, to_new_loop, !simple_loop);
> +
> +  rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +    fprintf (dump_file, ";; Loop split on wrap index.\n");
> +
> +  return true;
> +}
> +
> +/* Checks if LOOP contains a suitable NE_EXPR conditional block to split.
> +L_H:
> + if (i!=N)
> +   S;
> + i++;
> + goto L_H;
> +
> +The "i!=N" is like "i>N || i<N", then it could be transform to:
> +
> +L_H:
> + if (i>N)
> +   S;
> + i++;
> + goto L_H;
> +L1_H:
> + if (i<N)
> +   S;
> + i++;
> + goto L1_H;
> +
> +The loop with "i<N" is in favor both GIMPLE and RTL passes.  */
> +
> +static bool
> +split_loop_on_ne_cond (class loop *loop)
> +{
> +  int num = 0;
> +  basic_block *bbs = get_loop_body (loop);
> +
> +  if (!can_copy_bbs_p (bbs, loop->num_nodes))
> +    {
> +      free (bbs);
> +      return false;
> +    }
> +
> +  for (unsigned i = 0; i < loop->num_nodes; i++)
> +    num += estimate_num_insns_seq (bb_seq (bbs[i]), &eni_size_weights);
> +  free (bbs);
> +
> +  if (num > param_max_peeled_insns)
> +    return false;
> +
> +  edge branch_edge = get_ne_cond_branch (loop);
> +  if (branch_edge && split_ne_loop (loop, branch_edge))
> +    return true;
> +
> +  return false;
> +}
> +
>  /* Main entry point.  Perform loop splitting on all suitable loops.  */
>  
>  static unsigned int
> @@ -1622,7 +1803,8 @@ tree_ssa_split_loops (void)
>        if (optimize_loop_for_size_p (loop))
>  	continue;
>  
> -      if (split_loop (loop) || split_loop_on_cond (loop))
> +      if (split_loop (loop) || split_loop_on_cond (loop)
> +	  || split_loop_on_ne_cond (loop))
>  	{
>  	  /* Mark our containing loop as having had some split inner loops.  */
>  	  loop_outer (loop)->aux = loop;
>
Segher Boessenkool May 26, 2021, 1:22 p.m. UTC | #8
On Wed, May 26, 2021 at 11:50:07AM +0200, Richard Biener wrote:
> > We can split the loop into two loops:
> > 
> >   while (++k > n)
> >     a[k] = b[k]  + 1;
> >   while (l++ < n)
> >     a[k] = b[k]  + 1;
> > 
> > then for the second loop, it could be optimized.
> 
> Btw, I think even the first loop should be vectorized.  I see we do
> not handle it in niter analysis:
> 
> Analyzing loop at t.c:3
> t.c:3:14: note:  === analyze_loop_nest ===
> t.c:3:14: note:   === vect_analyze_loop_form ===
> t.c:3:14: note:    === get_loop_niters ===
> t.c:3:14: missed:   not vectorized: number of iterations cannot be 
> computed.
> 
> but the number of iterations should be UINT_MAX - k (unless I'm
> missing sth), may_be_zero would be sth like k < n.  It would be
> nice to not split this into loops that niter analysis cannot handle ...

As long as it doesn't do that for signed loop counters, because that
would be a waste -- ever executing such code is UB, so vectorising it
will only cost extra insns (usually).


Segher
Jiufu Guo June 1, 2021, 3:28 a.m. UTC | #9
On 2021-05-26 17:50, Richard Biener wrote:
> On Mon, 17 May 2021, Jiufu Guo wrote:
> 
...
>> 
>>   while (++k > n)
>>     a[k] = b[k]  + 1;
>> 
>> then for the second loop, it could be optimized.
> 
> Btw, I think even the first loop should be vectorized.  I see we do
> not handle it in niter analysis:
> 
> Analyzing loop at t.c:3
> t.c:3:14: note:  === analyze_loop_nest ===
> t.c:3:14: note:   === vect_analyze_loop_form ===
> t.c:3:14: note:    === get_loop_niters ===
> t.c:3:14: missed:   not vectorized: number of iterations cannot be
> computed.
> 
> but the number of iterations should be UINT_MAX - k (unless I'm
> missing sth), may_be_zero would be sth like k < n.  It would be
> nice to not split this into loops that niter analysis cannot handle ...
> 
For this case on the first loop, it is not vectorized by trunk gcc.
While since we know the type of 'k' and 'n' is unsigned, the umber of 
iterations
would be computed.  I'm wondering about enhancing 
'number_of_iterations_cond' may
able to handle this.

...
>> +
>> +/* { dg-final { scan-tree-dump-times "Loop split" 9 "lsplit" } } */
> 
> Please consider making the testcase execute ones, verifying computation
> results.
Thanks!
> 
>> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
>> index 3a09bbc39e5..5c1742b5ff4 100644
>> --- a/gcc/tree-ssa-loop-split.c
>> +++ b/gcc/tree-ssa-loop-split.c
>> @@ -41,6 +41,7 @@ along with GCC; see the file COPYING3.  If not see
>>  #include "cfghooks.h"
>>  #include "gimple-fold.h"
>>  #include "gimplify-me.h"
>> +#include "tree-ssa-loop-ivopts.h"
>> 
>>  /* This file implements two kinds of loop splitting.
>> 
>> @@ -233,7 +234,8 @@ easy_exit_values (class loop *loop)
>>     this.  The loops need to fulfill easy_exit_values().  */
>> 
> 
> please document use_prev
Thanks.
> 
>>  static void
>> -connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e)
>> +connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e,
>> +		   bool use_prev = false)
>>  {
>>    basic_block rest = loop_preheader_edge (loop2)->src;
>>    gcc_assert (new_e->dest == rest);
>> @@ -279,7 +281,8 @@ connect_loop_phis (class loop *loop1, class loop 
>> *loop2, edge new_e)
>> 
>>        gphi * newphi = create_phi_node (new_init, rest);
>>        add_phi_arg (newphi, init, skip_first, UNKNOWN_LOCATION);
>> -      add_phi_arg (newphi, next, new_e, UNKNOWN_LOCATION);
>> +      add_phi_arg (newphi, use_prev ? PHI_RESULT (phi_first) : next, 
>> new_e,
>> +		   UNKNOWN_LOCATION);
>>        SET_USE (op, new_init);
>>      }
>>  }
>> @@ -1593,6 +1596,184 @@ split_loop_on_cond (struct loop *loop)
>>    return do_split;
>>  }
>> 
>> +/* Check if the LOOP exit branch likes "if (idx != bound)",
> 
> is like
Thanks.
> 
>> +   Return the branch edge which exit loop, if overflow/wrap
>> +   may happen on "idx".  */
> 
> I think we only want to handle wrapping (thus not undefined overflow).
Yes, you are right.
> 
>> +
>> +static edge
>> +get_ne_cond_branch (struct loop *loop)
>> +{
>> +  int i;
>> +  edge e;
>> +
>> +  auto_vec<edge> edges = get_loop_exit_edges (loop);
>> +  FOR_EACH_VEC_ELT (edges, i, e)
>> +    {
>> +      /* Check if there is possible wrap/overflow.  */
>> +      class tree_niter_desc niter;
>> +      if (!number_of_iterations_exit (loop, e, &niter, false, false))
>> +	continue;
>> +      if (niter.control.no_overflow)
>> +	return NULL;
>> +      if (niter.cmp != NE_EXPR)
>> +	continue;
>> +
>> +      /* Check loop is simple to split.  */
> 
> it seems like the following and below condition mean "simple"
> is either all code is before or after the exit in question,
> please improve the comment to explain the two cases.
> 
>> +      if (single_pred_p (loop->latch)
>> +	  && single_pred_edge (loop->latch)->src == e->src
>> +	  && (gsi_end_p (gsi_start_nondebug_bb (loop->latch))))
> 
> split_loop uses empty_block_p (loop->latch).
yes, empty_block_p also filter label/debug insts.
> 
>> +	return e;
>> +
>> +      /* Simple header.  */
>> +      if (e->src == loop->header)
>> +	{
>> +	  if (get_virtual_phi (e->src))
>> +	    continue;
> 
> So this disqualifies all loops which store to memory?
We do not need to check this condition since here allows only the 
i++/++i header.
> 
>> +
>> +	  /* Only one phi.  */
>> +	  gphi_iterator psi = gsi_start_phis (e->src);
>> +	  if (gsi_end_p (psi))
>> +	    continue;
>> +	  gsi_next (&psi);
>> +	  if (!gsi_end_p (psi))
>> +	    continue;
>> +
>> +	  /* ++i or ++i */
>> +	  gimple_stmt_iterator gsi = gsi_start_bb (e->src);
> 
> I think you want gsi_start_nondebug_after_labels_bb (e->src)
Thanks.
> 
>> +
>> +	  gimple *gc = last_stmt (e->src);
>> +	  tree idx = gimple_cond_lhs (gc);
> 
> you have to check the last stmt is a GIMPLE_COND, we have
> recorded exits that exit on EH for example.

Above checking "number_of_iterations_exit" could make sure
it is a GIMPLE_COND.  I would add gcc_assert to check/hit this.
> 
>> +	  if (expr_invariant_in_loop_p (loop, idx))
>> +	    idx = gimple_cond_rhs (gc);
>> +
>> +	  gimple *s1 = gsi_stmt (gsi);
>> +	  if (!(is_gimple_assign (s1) && idx
>> +		&& (idx == gimple_assign_lhs (s1)
>> +		    || idx == gimple_assign_rhs1 (s1))))
>> +	    continue;
> 
> I think you want to allow a IV PHI plus an IV increment
> in the block and nothing else.  The pattern matching isn't
> exact though and could be fooled by, say
> 
>  header:
>    i_2 = PHI <i_5(D), i_3(latch)>
>    _3 = i_2 + 5;
>    if (i_2 != n_4(D))
>      goto exit;
> 
>  latch:
>    i_3 = i_2 + 1;
>    foo (_3);
>    goto header;
> 
> that is, the increment you matched is not the new iterator value
> but instead a derived value used in a stmt in the latch.  So
> I think if you really cannot allow any code besides the IV
> increment then you need to actually get the PHI node and
> match the only real stmt with the definition of the PHIs
> latch edge value.
You are right!  Checking PHI's result/latch-edge value would be better.
Thanks.

> 
> Or relax all this, of course.
It would easy to handle the above cases: e->src before latch, or simple 
header.
To relax this, we may need to peel (partial peel) one loop between the 
first loop
and the second loop, or jump into the middle of the second loop.  I had 
a quick
try to implement this, but not find a good way.
Thanks for any suggestions!

> 
>> +	  gsi_next (&gsi);
>> +	  if (!gsi_end_p (gsi) && gsi_stmt (gsi) == gc)
>> +	    return e;
>> +	}
>> +    }
>> +
>> +  return NULL;
>> +}
>> +

Below is an updated patch.  Thanks again for your comments!


diff --git a/gcc/testsuite/g++.dg/vect/pr98064.cc 
b/gcc/testsuite/g++.dg/vect/pr98064.cc
index 74043ce7725..dcb2985d05a 100644
--- a/gcc/testsuite/g++.dg/vect/pr98064.cc
+++ b/gcc/testsuite/g++.dg/vect/pr98064.cc
@@ -1,5 +1,7 @@
  // { dg-do compile }
-// { dg-additional-options "-O3" }
+// { dg-additional-options "-O3 -Wno-stringop-overflow" }
+/* There is warning message when "short g = var_8; g; g++"
+   is optimized/analyzed as string operation,e.g. memset.  */

  const long long &min(const long long &__a, long long &__b) {
    if (__b < __a)
diff --git a/gcc/testsuite/gcc.dg/loop-split1.c 
b/gcc/testsuite/gcc.dg/loop-split1.c
new file mode 100644
index 00000000000..dd2d03a7b96
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-split1.c
@@ -0,0 +1,101 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fsplit-loops -fdump-tree-lsplit-details" } */
+
+void
+foo (int *a, int *b, unsigned l, unsigned n)
+{
+  while (++l != n)
+    a[l] = b[l] + 1;
+}
+void
+foo_1 (int *a, int *b, unsigned n)
+{
+  unsigned l = 0;
+  while (++l != n)
+    a[l] = b[l] + 1;
+}
+
+void
+foo1 (int *a, int *b, unsigned l, unsigned n)
+{
+  while (l++ != n)
+    a[l] = b[l] + 1;
+}
+
+/* No wrap.  */
+void
+foo1_1 (int *a, int *b, unsigned n)
+{
+  unsigned l = 0;
+  while (l++ != n)
+    a[l] = b[l] + 1;
+}
+
+unsigned
+foo2 (char *a, char *b, unsigned l, unsigned n)
+{
+  while (++l != n)
+    if (a[l] != b[l])
+      break;
+
+  return l;
+}
+
+unsigned
+foo2_1 (char *a, char *b, unsigned l, unsigned n)
+{
+  l = 0;
+  while (++l != n)
+    if (a[l] != b[l])
+      break;
+
+  return l;
+}
+
+unsigned
+foo3 (char *a, char *b, unsigned l, unsigned n)
+{
+  while (l++ != n)
+    if (a[l] != b[l])
+      break;
+
+  return l;
+}
+
+/* No wrap.  */
+unsigned
+foo3_1 (char *a, char *b, unsigned l, unsigned n)
+{
+  l = 0;
+  while (l++ != n)
+    if (a[l] != b[l])
+      break;
+
+  return l;
+}
+
+void
+bar ();
+void
+foo4 (unsigned n, unsigned i)
+{
+  do
+    {
+      if (i == n)
+	return;
+      bar ();
+      ++i;
+    }
+  while (1);
+}
+
+unsigned
+find_skip_diff (char *p, char *q, unsigned n, unsigned i)
+{
+  while (p[i] == q[i] && ++i != n)
+    p++, q++;
+
+  return i;
+}
+
+/* { dg-final { scan-tree-dump-times "Loop split" 8 "lsplit" } } */
diff --git a/gcc/testsuite/gcc.dg/loop-split2.c 
b/gcc/testsuite/gcc.dg/loop-split2.c
new file mode 100644
index 00000000000..0d3fded3f61
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-split2.c
@@ -0,0 +1,54 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+extern void abort (void);
+extern void exit (int);
+
+#define NI __attribute__ ((noinline))
+
+void NI
+foo (int *a, int *b, unsigned char l, unsigned char n)
+{
+  while (++l != n)
+    a[l] = b[l] + 1;
+}
+
+unsigned NI
+bar (int *a, int *b, unsigned char l, unsigned char n)
+{
+  while (l++ != n)
+    if (a[l] != b[l])
+      break;
+
+  return l;
+}
+
+int a[258];
+int b[258];
+
+int main()
+{
+  __builtin_memcpy (b, a, sizeof (a));
+
+  if (bar (a, b, 3, 8) != 9)
+    abort ();
+
+  if (bar (a, b, 8, 3) != 4)
+    abort ();
+
+  b[100] += 1;
+  if (bar (a, b, 90, 110) != 100)
+    abort ();
+
+  if (bar (a, b, 110, 105) != 100)
+    abort ();
+
+  foo (a, b, 99, 99);
+  a[99] = b[99] + 1;
+  for (int i = 0; i < 256; i++)
+    if (a[i] != b[i] + 1)
+      abort();
+
+  exit (0);
+}
+
diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
index 3a09bbc39e5..0428b0abea6 100644
--- a/gcc/tree-ssa-loop-split.c
+++ b/gcc/tree-ssa-loop-split.c
@@ -41,6 +41,7 @@ along with GCC; see the file COPYING3.  If not see
  #include "cfghooks.h"
  #include "gimple-fold.h"
  #include "gimplify-me.h"
+#include "tree-ssa-loop-ivopts.h"

  /* This file implements two kinds of loop splitting.

@@ -229,11 +230,14 @@ easy_exit_values (class loop *loop)
     conditional).  I.e. the second loop can now be entered either
     via the original entry or via NEW_E, so the entry values of LOOP2
     phi nodes are either the original ones or those at the exit
-   of LOOP1.  Insert new phi nodes in LOOP2 pre-header reflecting
-   this.  The loops need to fulfill easy_exit_values().  */
+   of LOOP1.  Selecting the previous value instead next value as the
+   exit value of LOOP1 if USE_PREV is true.  Insert new phi nodes in
+   LOOP2 pre-header reflecting this.  The loops need to fulfill
+   easy_exit_values().  */

  static void
-connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e)
+connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e,
+		   bool use_prev = false)
  {
    basic_block rest = loop_preheader_edge (loop2)->src;
    gcc_assert (new_e->dest == rest);
@@ -279,7 +283,8 @@ connect_loop_phis (class loop *loop1, class loop 
*loop2, edge new_e)

        gphi * newphi = create_phi_node (new_init, rest);
        add_phi_arg (newphi, init, skip_first, UNKNOWN_LOCATION);
-      add_phi_arg (newphi, next, new_e, UNKNOWN_LOCATION);
+      add_phi_arg (newphi, use_prev ? PHI_RESULT (phi_first) : next, 
new_e,
+		   UNKNOWN_LOCATION);
        SET_USE (op, new_init);
      }
  }
@@ -1593,6 +1598,199 @@ split_loop_on_cond (struct loop *loop)
    return do_split;
  }

+/* Check if the LOOP exit branch is like "if (idx != bound)",
+   Return the branch edge which exit loop, if wrap may happen on "idx". 
  */
+
+static edge
+get_ne_cond_branch (struct loop *loop)
+{
+  int i;
+  edge e;
+
+  auto_vec<edge> edges = get_loop_exit_edges (loop);
+  FOR_EACH_VEC_ELT (edges, i, e)
+    {
+      /* Check if there is possible wrap.  */
+      class tree_niter_desc niter;
+      if (!number_of_iterations_exit (loop, e, &niter, false, false))
+	continue;
+      if (niter.control.no_overflow)
+	return NULL;
+      if (niter.cmp != NE_EXPR)
+	continue;
+
+      /* If exit edge is just before the empty latch, it is easy to 
link
+	 the split loops: just jump from the exit edge of one loop to the
+	 header of new loop.  */
+      if (single_pred_p (loop->latch)
+	  && single_pred_edge (loop->latch)->src == e->src
+	  && empty_block_p (loop->latch))
+	return e;
+
+      /* If exit edge is at end of header, and header contains i++ or 
++i,
+	 only, it is simple to link the split loops:  jump from the end of
+	 one loop header to the new loop header, and use unchanged PHI
+	 result of first loop as the entry PHI value of the second loop.  */
+      if (e->src == loop->header)
+	{
+	  /* Only one phi.  */
+	  gphi_iterator psi = gsi_start_phis (e->src);
+	  if (gsi_end_p (psi))
+	    continue;
+	  gphi *phi = psi.phi ();
+	  gsi_next (&psi);
+	  if (!gsi_end_p (psi))
+	    continue;
+
+	  /* Get the idx from last stmt (the gcond) of e->src.  */
+	  gimple *gc = last_stmt (e->src);
+	  gcc_assert (gimple_code (gc) == GIMPLE_COND);
+	  tree idx = gimple_cond_lhs (gc);
+	  if (expr_invariant_in_loop_p (loop, idx))
+	    idx = gimple_cond_rhs (gc);
+
+	  tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
+	  tree prev = PHI_RESULT (phi);
+	  if (idx != prev && idx != next)
+	    continue;
+
+	  /* ++i or ++i */
+	  gimple_stmt_iterator gsi
+	    = gsi_start_nondebug_after_labels_bb (e->src);
+	  if (gsi_end_p (gsi))
+	    continue;
+
+	  gimple *s1 = gsi_stmt (gsi);
+	  if (!is_gimple_assign (s1) || gimple_assign_lhs (s1) != next
+	      || gimple_assign_rhs1 (s1) != prev)
+	    continue;
+
+	  gsi_next (&gsi);
+	  if (!gsi_end_p (gsi) && gsi_stmt (gsi) == gc)
+	    return e;
+	}
+    }
+
+  return NULL;
+}
+
+/* Split the LOOP with NE_EXPR into two loops with GT_EXPR and LT_EXPR. 
  */
+
+static bool
+split_ne_loop (struct loop *loop, edge cond_e)
+{
+  initialize_original_copy_tables ();
+
+  struct loop *loop2 = loop_version (loop, boolean_true_node, NULL,
+				     profile_probability::always (),
+				     profile_probability::never (),
+				     profile_probability::always (),
+				     profile_probability::always (), true);
+
+  gcc_assert (loop2);
+  update_ssa (TODO_update_ssa);
+
+  basic_block loop2_cond_exit_bb = get_bb_copy (cond_e->src);
+  free_original_copy_tables ();
+
+  gcond *gc = as_a<gcond *> (last_stmt (cond_e->src));
+  gcond *dup_gc = as_a<gcond *> (last_stmt (loop2_cond_exit_bb));
+
+  /* Change if (i != n) to LOOP1:if (i > n) and LOOP2:if (i < n) */
+  bool inv = expr_invariant_in_loop_p (loop, gimple_cond_lhs (gc));
+  enum tree_code up_code = inv ? LT_EXPR : GT_EXPR;
+  enum tree_code down_code = inv ? GT_EXPR : LT_EXPR;
+
+  gimple_cond_set_code (gc, up_code);
+  gimple_cond_set_code (dup_gc, down_code);
+
+  /* Link the exit cond edge to new loop.  */
+  gcond *break_cond = as_a<gcond *> (gimple_copy (gc));
+  edge pred_e = single_pred_edge (loop->latch);
+  bool simple_loop = pred_e && pred_e->src == cond_e->src
+		     && (gsi_end_p (gsi_start_nondebug_bb (loop->latch)));
+  if (simple_loop)
+    gimple_cond_set_code (break_cond, down_code);
+  else
+    gimple_cond_make_true (break_cond);
+
+  basic_block break_bb = split_edge (cond_e);
+  gimple_stmt_iterator gsi = gsi_last_bb (break_bb);
+  gsi_insert_after (&gsi, break_cond, GSI_NEW_STMT);
+
+  edge to_exit = single_succ_edge (break_bb);
+  edge to_new_loop = make_edge (break_bb, loop_preheader_edge 
(loop2)->src, 0);
+  to_new_loop->flags |= EDGE_TRUE_VALUE;
+  to_exit->flags |= EDGE_FALSE_VALUE;
+  to_exit->flags &= ~EDGE_FALLTHRU;
+  to_exit->probability = cond_e->probability;
+  to_new_loop->probability = to_exit->probability.invert ();
+
+  update_ssa (TODO_update_ssa);
+
+  connect_loop_phis (loop, loop2, to_new_loop, !simple_loop);
+
+  rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, ";; Loop split on wrap index.\n");
+
+  return true;
+}
+
+/* Checks if LOOP contains a suitable NE_EXPR conditional block to 
split.
+L_H:
+ if (i!=N)
+   S;
+ else
+   goto EXIT;
+ i++;
+ goto L_H;
+
+The "i!=N" is like "i>N || i<N", then it could be transform to:
+
+L_H:
+ if (i>N)
+   S;
+ else
+   goto EXIT;
+ i++;
+ goto L_H;
+L1_H:
+ if (i<N)
+   S;
+ else
+   goto EXIT;
+ i++;
+ goto L1_H;
+
+The loop with "i<N" is in favor both GIMPLE and RTL passes.  */
+
+static bool
+split_loop_on_ne_cond (class loop *loop)
+{
+  int num = 0;
+  basic_block *bbs = get_loop_body (loop);
+
+  if (!can_copy_bbs_p (bbs, loop->num_nodes))
+    {
+      free (bbs);
+      return false;
+    }
+
+  for (unsigned i = 0; i < loop->num_nodes; i++)
+    num += estimate_num_insns_seq (bb_seq (bbs[i]), &eni_size_weights);
+  free (bbs);
+
+  if (num > param_max_peeled_insns)
+    return false;
+
+  edge branch_edge = get_ne_cond_branch (loop);
+  if (branch_edge && split_ne_loop (loop, branch_edge))
+    return true;
+
+  return false;
+}
+
  /* Main entry point.  Perform loop splitting on all suitable loops.  */

  static unsigned int
@@ -1622,7 +1820,8 @@ tree_ssa_split_loops (void)
        if (optimize_loop_for_size_p (loop))
  	continue;

-      if (split_loop (loop) || split_loop_on_cond (loop))
+      if (split_loop (loop) || split_loop_on_cond (loop)
+	  || split_loop_on_ne_cond (loop))
  	{
  	  /* Mark our containing loop as having had some split inner loops.  
*/
  	  loop_outer (loop)->aux = loop;
Jiufu Guo June 1, 2021, 8:23 a.m. UTC | #10
On 2021-06-01 11:28, guojiufu via Gcc-patches wrote:
> On 2021-05-26 17:50, Richard Biener wrote:
>> On Mon, 17 May 2021, Jiufu Guo wrote:
>> 
....
>> 
>> Or relax all this, of course.
> It would easy to handle the above cases: e->src before latch, or simple 
> header.
> To relax this, we may need to peel (partial peel) one loop between the
> first loop
> and the second loop, or jump into the middle of the second loop.  I had 
> a quick
> try to implement this, but not find a good way.
> Thanks for any suggestions!

Previously, I tested the GCC bootstrap to statistic this kind of loop.
'ch/tree-ssa-loop-ch.c" pass already peeled partial loop and transform 
loops into
do-while form.  This would be one reason that this kind of loop is rare.

BR.
Jiufu Guo.

> 
>> 
>>> +	  gsi_next (&gsi);
>>> +	  if (!gsi_end_p (gsi) && gsi_stmt (gsi) == gc)
>>> +	    return e;
>>> +	}
>>> +    }
>>> +
>>> +  return NULL;
>>> +}
>>> +
> 
> Below is an updated patch.  Thanks again for your comments!
> 
> 
> diff --git a/gcc/testsuite/g++.dg/vect/pr98064.cc
> b/gcc/testsuite/g++.dg/vect/pr98064.cc
> index 74043ce7725..dcb2985d05a 100644
> --- a/gcc/testsuite/g++.dg/vect/pr98064.cc
> +++ b/gcc/testsuite/g++.dg/vect/pr98064.cc
> @@ -1,5 +1,7 @@
>  // { dg-do compile }
> -// { dg-additional-options "-O3" }
> +// { dg-additional-options "-O3 -Wno-stringop-overflow" }
> +/* There is warning message when "short g = var_8; g; g++"
> +   is optimized/analyzed as string operation,e.g. memset.  */
> 
>  const long long &min(const long long &__a, long long &__b) {
>    if (__b < __a)
> diff --git a/gcc/testsuite/gcc.dg/loop-split1.c
> b/gcc/testsuite/gcc.dg/loop-split1.c
> new file mode 100644
> index 00000000000..dd2d03a7b96
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/loop-split1.c
> @@ -0,0 +1,101 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fsplit-loops -fdump-tree-lsplit-details" } */
> +
> +void
> +foo (int *a, int *b, unsigned l, unsigned n)
> +{
> +  while (++l != n)
> +    a[l] = b[l] + 1;
> +}
> +void
> +foo_1 (int *a, int *b, unsigned n)
> +{
> +  unsigned l = 0;
> +  while (++l != n)
> +    a[l] = b[l] + 1;
> +}
> +
> +void
> +foo1 (int *a, int *b, unsigned l, unsigned n)
> +{
> +  while (l++ != n)
> +    a[l] = b[l] + 1;
> +}
> +
> +/* No wrap.  */
> +void
> +foo1_1 (int *a, int *b, unsigned n)
> +{
> +  unsigned l = 0;
> +  while (l++ != n)
> +    a[l] = b[l] + 1;
> +}
> +
> +unsigned
> +foo2 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  while (++l != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +unsigned
> +foo2_1 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  l = 0;
> +  while (++l != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +unsigned
> +foo3 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  while (l++ != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +/* No wrap.  */
> +unsigned
> +foo3_1 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  l = 0;
> +  while (l++ != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +void
> +bar ();
> +void
> +foo4 (unsigned n, unsigned i)
> +{
> +  do
> +    {
> +      if (i == n)
> +	return;
> +      bar ();
> +      ++i;
> +    }
> +  while (1);
> +}
> +
> +unsigned
> +find_skip_diff (char *p, char *q, unsigned n, unsigned i)
> +{
> +  while (p[i] == q[i] && ++i != n)
> +    p++, q++;
> +
> +  return i;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "Loop split" 8 "lsplit" } } */
> diff --git a/gcc/testsuite/gcc.dg/loop-split2.c
> b/gcc/testsuite/gcc.dg/loop-split2.c
> new file mode 100644
> index 00000000000..0d3fded3f61
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/loop-split2.c
> @@ -0,0 +1,54 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3" } */
> +
> +extern void abort (void);
> +extern void exit (int);
> +
> +#define NI __attribute__ ((noinline))
> +
> +void NI
> +foo (int *a, int *b, unsigned char l, unsigned char n)
> +{
> +  while (++l != n)
> +    a[l] = b[l] + 1;
> +}
> +
> +unsigned NI
> +bar (int *a, int *b, unsigned char l, unsigned char n)
> +{
> +  while (l++ != n)
> +    if (a[l] != b[l])
> +      break;
> +
> +  return l;
> +}
> +
> +int a[258];
> +int b[258];
> +
> +int main()
> +{
> +  __builtin_memcpy (b, a, sizeof (a));
> +
> +  if (bar (a, b, 3, 8) != 9)
> +    abort ();
> +
> +  if (bar (a, b, 8, 3) != 4)
> +    abort ();
> +
> +  b[100] += 1;
> +  if (bar (a, b, 90, 110) != 100)
> +    abort ();
> +
> +  if (bar (a, b, 110, 105) != 100)
> +    abort ();
> +
> +  foo (a, b, 99, 99);
> +  a[99] = b[99] + 1;
> +  for (int i = 0; i < 256; i++)
> +    if (a[i] != b[i] + 1)
> +      abort();
> +
> +  exit (0);
> +}
> +
> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
> index 3a09bbc39e5..0428b0abea6 100644
> --- a/gcc/tree-ssa-loop-split.c
> +++ b/gcc/tree-ssa-loop-split.c
> @@ -41,6 +41,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "cfghooks.h"
>  #include "gimple-fold.h"
>  #include "gimplify-me.h"
> +#include "tree-ssa-loop-ivopts.h"
> 
>  /* This file implements two kinds of loop splitting.
> 
> @@ -229,11 +230,14 @@ easy_exit_values (class loop *loop)
>     conditional).  I.e. the second loop can now be entered either
>     via the original entry or via NEW_E, so the entry values of LOOP2
>     phi nodes are either the original ones or those at the exit
> -   of LOOP1.  Insert new phi nodes in LOOP2 pre-header reflecting
> -   this.  The loops need to fulfill easy_exit_values().  */
> +   of LOOP1.  Selecting the previous value instead next value as the
> +   exit value of LOOP1 if USE_PREV is true.  Insert new phi nodes in
> +   LOOP2 pre-header reflecting this.  The loops need to fulfill
> +   easy_exit_values().  */
> 
>  static void
> -connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e)
> +connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e,
> +		   bool use_prev = false)
>  {
>    basic_block rest = loop_preheader_edge (loop2)->src;
>    gcc_assert (new_e->dest == rest);
> @@ -279,7 +283,8 @@ connect_loop_phis (class loop *loop1, class loop
> *loop2, edge new_e)
> 
>        gphi * newphi = create_phi_node (new_init, rest);
>        add_phi_arg (newphi, init, skip_first, UNKNOWN_LOCATION);
> -      add_phi_arg (newphi, next, new_e, UNKNOWN_LOCATION);
> +      add_phi_arg (newphi, use_prev ? PHI_RESULT (phi_first) : next, 
> new_e,
> +		   UNKNOWN_LOCATION);
>        SET_USE (op, new_init);
>      }
>  }
> @@ -1593,6 +1598,199 @@ split_loop_on_cond (struct loop *loop)
>    return do_split;
>  }
> 
> +/* Check if the LOOP exit branch is like "if (idx != bound)",
> +   Return the branch edge which exit loop, if wrap may happen on 
> "idx".  */
> +
> +static edge
> +get_ne_cond_branch (struct loop *loop)
> +{
> +  int i;
> +  edge e;
> +
> +  auto_vec<edge> edges = get_loop_exit_edges (loop);
> +  FOR_EACH_VEC_ELT (edges, i, e)
> +    {
> +      /* Check if there is possible wrap.  */
> +      class tree_niter_desc niter;
> +      if (!number_of_iterations_exit (loop, e, &niter, false, false))
> +	continue;
> +      if (niter.control.no_overflow)
> +	return NULL;
> +      if (niter.cmp != NE_EXPR)
> +	continue;
> +
> +      /* If exit edge is just before the empty latch, it is easy to 
> link
> +	 the split loops: just jump from the exit edge of one loop to the
> +	 header of new loop.  */
> +      if (single_pred_p (loop->latch)
> +	  && single_pred_edge (loop->latch)->src == e->src
> +	  && empty_block_p (loop->latch))
> +	return e;
> +
> +      /* If exit edge is at end of header, and header contains i++ or 
> ++i,
> +	 only, it is simple to link the split loops:  jump from the end of
> +	 one loop header to the new loop header, and use unchanged PHI
> +	 result of first loop as the entry PHI value of the second loop.  */
> +      if (e->src == loop->header)
> +	{
> +	  /* Only one phi.  */
> +	  gphi_iterator psi = gsi_start_phis (e->src);
> +	  if (gsi_end_p (psi))
> +	    continue;
> +	  gphi *phi = psi.phi ();
> +	  gsi_next (&psi);
> +	  if (!gsi_end_p (psi))
> +	    continue;
> +
> +	  /* Get the idx from last stmt (the gcond) of e->src.  */
> +	  gimple *gc = last_stmt (e->src);
> +	  gcc_assert (gimple_code (gc) == GIMPLE_COND);
> +	  tree idx = gimple_cond_lhs (gc);
> +	  if (expr_invariant_in_loop_p (loop, idx))
> +	    idx = gimple_cond_rhs (gc);
> +
> +	  tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
> +	  tree prev = PHI_RESULT (phi);
> +	  if (idx != prev && idx != next)
> +	    continue;
> +
> +	  /* ++i or ++i */
> +	  gimple_stmt_iterator gsi
> +	    = gsi_start_nondebug_after_labels_bb (e->src);
> +	  if (gsi_end_p (gsi))
> +	    continue;
> +
> +	  gimple *s1 = gsi_stmt (gsi);
> +	  if (!is_gimple_assign (s1) || gimple_assign_lhs (s1) != next
> +	      || gimple_assign_rhs1 (s1) != prev)
> +	    continue;
> +
> +	  gsi_next (&gsi);
> +	  if (!gsi_end_p (gsi) && gsi_stmt (gsi) == gc)
> +	    return e;
> +	}
> +    }
> +
> +  return NULL;
> +}
> +
> +/* Split the LOOP with NE_EXPR into two loops with GT_EXPR and 
> LT_EXPR.  */
> +
> +static bool
> +split_ne_loop (struct loop *loop, edge cond_e)
> +{
> +  initialize_original_copy_tables ();
> +
> +  struct loop *loop2 = loop_version (loop, boolean_true_node, NULL,
> +				     profile_probability::always (),
> +				     profile_probability::never (),
> +				     profile_probability::always (),
> +				     profile_probability::always (), true);
> +
> +  gcc_assert (loop2);
> +  update_ssa (TODO_update_ssa);
> +
> +  basic_block loop2_cond_exit_bb = get_bb_copy (cond_e->src);
> +  free_original_copy_tables ();
> +
> +  gcond *gc = as_a<gcond *> (last_stmt (cond_e->src));
> +  gcond *dup_gc = as_a<gcond *> (last_stmt (loop2_cond_exit_bb));
> +
> +  /* Change if (i != n) to LOOP1:if (i > n) and LOOP2:if (i < n) */
> +  bool inv = expr_invariant_in_loop_p (loop, gimple_cond_lhs (gc));
> +  enum tree_code up_code = inv ? LT_EXPR : GT_EXPR;
> +  enum tree_code down_code = inv ? GT_EXPR : LT_EXPR;
> +
> +  gimple_cond_set_code (gc, up_code);
> +  gimple_cond_set_code (dup_gc, down_code);
> +
> +  /* Link the exit cond edge to new loop.  */
> +  gcond *break_cond = as_a<gcond *> (gimple_copy (gc));
> +  edge pred_e = single_pred_edge (loop->latch);
> +  bool simple_loop = pred_e && pred_e->src == cond_e->src
> +		     && (gsi_end_p (gsi_start_nondebug_bb (loop->latch)));
> +  if (simple_loop)
> +    gimple_cond_set_code (break_cond, down_code);
> +  else
> +    gimple_cond_make_true (break_cond);
> +
> +  basic_block break_bb = split_edge (cond_e);
> +  gimple_stmt_iterator gsi = gsi_last_bb (break_bb);
> +  gsi_insert_after (&gsi, break_cond, GSI_NEW_STMT);
> +
> +  edge to_exit = single_succ_edge (break_bb);
> +  edge to_new_loop = make_edge (break_bb, loop_preheader_edge 
> (loop2)->src, 0);
> +  to_new_loop->flags |= EDGE_TRUE_VALUE;
> +  to_exit->flags |= EDGE_FALSE_VALUE;
> +  to_exit->flags &= ~EDGE_FALLTHRU;
> +  to_exit->probability = cond_e->probability;
> +  to_new_loop->probability = to_exit->probability.invert ();
> +
> +  update_ssa (TODO_update_ssa);
> +
> +  connect_loop_phis (loop, loop2, to_new_loop, !simple_loop);
> +
> +  rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +    fprintf (dump_file, ";; Loop split on wrap index.\n");
> +
> +  return true;
> +}
> +
> +/* Checks if LOOP contains a suitable NE_EXPR conditional block to 
> split.
> +L_H:
> + if (i!=N)
> +   S;
> + else
> +   goto EXIT;
> + i++;
> + goto L_H;
> +
> +The "i!=N" is like "i>N || i<N", then it could be transform to:
> +
> +L_H:
> + if (i>N)
> +   S;
> + else
> +   goto EXIT;
> + i++;
> + goto L_H;
> +L1_H:
> + if (i<N)
> +   S;
> + else
> +   goto EXIT;
> + i++;
> + goto L1_H;
> +
> +The loop with "i<N" is in favor both GIMPLE and RTL passes.  */
> +
> +static bool
> +split_loop_on_ne_cond (class loop *loop)
> +{
> +  int num = 0;
> +  basic_block *bbs = get_loop_body (loop);
> +
> +  if (!can_copy_bbs_p (bbs, loop->num_nodes))
> +    {
> +      free (bbs);
> +      return false;
> +    }
> +
> +  for (unsigned i = 0; i < loop->num_nodes; i++)
> +    num += estimate_num_insns_seq (bb_seq (bbs[i]), 
> &eni_size_weights);
> +  free (bbs);
> +
> +  if (num > param_max_peeled_insns)
> +    return false;
> +
> +  edge branch_edge = get_ne_cond_branch (loop);
> +  if (branch_edge && split_ne_loop (loop, branch_edge))
> +    return true;
> +
> +  return false;
> +}
> +
>  /* Main entry point.  Perform loop splitting on all suitable loops.  
> */
> 
>  static unsigned int
> @@ -1622,7 +1820,8 @@ tree_ssa_split_loops (void)
>        if (optimize_loop_for_size_p (loop))
>  	continue;
> 
> -      if (split_loop (loop) || split_loop_on_cond (loop))
> +      if (split_loop (loop) || split_loop_on_cond (loop)
> +	  || split_loop_on_ne_cond (loop))
>  	{
>  	  /* Mark our containing loop as having had some split inner loops.  
> */
>  	  loop_outer (loop)->aux = loop;
diff mbox series

Patch

diff --git a/gcc/testsuite/g++.dg/vect/pr98064.cc b/gcc/testsuite/g++.dg/vect/pr98064.cc
index 74043ce7725..dcb2985d05a 100644
--- a/gcc/testsuite/g++.dg/vect/pr98064.cc
+++ b/gcc/testsuite/g++.dg/vect/pr98064.cc
@@ -1,5 +1,7 @@ 
 // { dg-do compile }
-// { dg-additional-options "-O3" }
+// { dg-additional-options "-O3 -Wno-stringop-overflow" }
+/* There is warning message when "short g = var_8; g; g++"
+   is optimized/analyzed as string operation,e.g. memset.  */
 
 const long long &min(const long long &__a, long long &__b) {
   if (__b < __a)
diff --git a/gcc/testsuite/gcc.dg/loop-split1.c b/gcc/testsuite/gcc.dg/loop-split1.c
new file mode 100644
index 00000000000..30b006b1b14
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-split1.c
@@ -0,0 +1,108 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fsplit-loops -fdump-tree-lsplit-details" } */
+
+void
+foo (int *a, int *b, unsigned l, unsigned n)
+{
+  while (++l != n)
+    a[l] = b[l]  + 1;
+}
+void
+foo_1 (int *a, int *b, unsigned n)
+{
+  unsigned l = 0;
+  while (++l != n)
+    a[l] = b[l]  + 1;
+}
+
+void
+foo1 (int *a, int *b, unsigned l, unsigned n)
+{
+  while (l++ != n)
+    a[l] = b[l]  + 1;
+}
+
+/* No wrap.  */
+void
+foo1_1 (int *a, int *b, unsigned n)
+{
+  unsigned l = 0;
+  while (l++ != n)
+    a[l] = b[l]  + 1;
+}
+
+unsigned
+foo2 (char *a, char *b, unsigned l, unsigned n)
+{
+  while (++l != n)
+    if (a[l] != b[l])
+      break;
+
+  return l;
+}
+
+unsigned
+foo2_1 (char *a, char *b, unsigned l, unsigned n)
+{
+  l = 0;
+  while (++l != n)
+    if (a[l] != b[l])
+      break;
+
+  return l;
+}
+
+unsigned
+foo3 (char *a, char *b, unsigned l, unsigned n)
+{
+  while (l++ != n)
+    if (a[l] != b[l])
+      break;
+
+  return l;
+}
+
+/* No wrap.  */
+unsigned
+foo3_1 (char *a, char *b, unsigned l, unsigned n)
+{
+  l = 0;
+  while (l++ != n)
+    if (a[l] != b[l])
+      break;
+
+  return l;
+}
+
+void bar();
+void foo4(unsigned n,  unsigned i)
+{
+  do
+    {
+      if (i == n)
+        return;
+      bar();
+      ++i;
+    }
+  while (1);
+}
+
+unsigned
+foo5 (double *a, unsigned n, unsigned i)
+{
+  while (a[i] > 0 && i != n)
+    i++;
+
+  return i;
+}
+
+unsigned
+find_skip_diff (char *p, char *q, unsigned n, unsigned i)
+{
+  while (p[i] == q[i] && ++i != n)
+    p++,q++;
+
+  return i;
+}
+
+/* { dg-final { scan-tree-dump-times "Loop split" 9 "lsplit" } } */
diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
index 3a09bbc39e5..5c1742b5ff4 100644
--- a/gcc/tree-ssa-loop-split.c
+++ b/gcc/tree-ssa-loop-split.c
@@ -41,6 +41,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "cfghooks.h"
 #include "gimple-fold.h"
 #include "gimplify-me.h"
+#include "tree-ssa-loop-ivopts.h"
 
 /* This file implements two kinds of loop splitting.
 
@@ -233,7 +234,8 @@  easy_exit_values (class loop *loop)
    this.  The loops need to fulfill easy_exit_values().  */
 
 static void
-connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e)
+connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e,
+		   bool use_prev = false)
 {
   basic_block rest = loop_preheader_edge (loop2)->src;
   gcc_assert (new_e->dest == rest);
@@ -279,7 +281,8 @@  connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e)
 
       gphi * newphi = create_phi_node (new_init, rest);
       add_phi_arg (newphi, init, skip_first, UNKNOWN_LOCATION);
-      add_phi_arg (newphi, next, new_e, UNKNOWN_LOCATION);
+      add_phi_arg (newphi, use_prev ? PHI_RESULT (phi_first) : next, new_e,
+		   UNKNOWN_LOCATION);
       SET_USE (op, new_init);
     }
 }
@@ -1593,6 +1596,184 @@  split_loop_on_cond (struct loop *loop)
   return do_split;
 }
 
+/* Check if the LOOP exit branch likes "if (idx != bound)",
+   Return the branch edge which exit loop, if overflow/wrap
+   may happen on "idx".  */
+
+static edge
+get_ne_cond_branch (struct loop *loop)
+{
+  int i;
+  edge e;
+
+  auto_vec<edge> edges = get_loop_exit_edges (loop);
+  FOR_EACH_VEC_ELT (edges, i, e)
+    {
+      /* Check if there is possible wrap/overflow.  */
+      class tree_niter_desc niter;
+      if (!number_of_iterations_exit (loop, e, &niter, false, false))
+	continue;
+      if (niter.control.no_overflow)
+	return NULL;
+      if (niter.cmp != NE_EXPR)
+	continue;
+
+      /* Check loop is simple to split.  */
+      if (single_pred_p (loop->latch)
+	  && single_pred_edge (loop->latch)->src == e->src
+	  && (gsi_end_p (gsi_start_nondebug_bb (loop->latch))))
+	return e;
+
+      /* Simple header.  */
+      if (e->src == loop->header)
+	{
+	  if (get_virtual_phi (e->src))
+	    continue;
+
+	  /* Only one phi.  */
+	  gphi_iterator psi = gsi_start_phis (e->src);
+	  if (gsi_end_p (psi))
+	    continue;
+	  gsi_next (&psi);
+	  if (!gsi_end_p (psi))
+	    continue;
+
+	  /* ++i or ++i */
+	  gimple_stmt_iterator gsi = gsi_start_bb (e->src);
+	  if (gsi_end_p (gsi))
+	    continue;
+
+	  gimple *gc = last_stmt (e->src);
+	  tree idx = gimple_cond_lhs (gc);
+	  if (expr_invariant_in_loop_p (loop, idx))
+	    idx = gimple_cond_rhs (gc);
+
+	  gimple *s1 = gsi_stmt (gsi);
+	  if (!(is_gimple_assign (s1) && idx
+		&& (idx == gimple_assign_lhs (s1)
+		    || idx == gimple_assign_rhs1 (s1))))
+	    continue;
+
+	  gsi_next (&gsi);
+	  if (!gsi_end_p (gsi) && gsi_stmt (gsi) == gc)
+	    return e;
+	}
+    }
+
+  return NULL;
+}
+
+/* Split the LOOP with NE_EXPR into two loops with GT_EXPR and LT_EXPR.  */
+
+static bool
+split_ne_loop (struct loop *loop, edge cond_e)
+{
+  initialize_original_copy_tables ();
+
+  struct loop *loop2 = loop_version (loop, boolean_true_node, NULL,
+				     profile_probability::always (),
+				     profile_probability::never (),
+				     profile_probability::always (),
+				     profile_probability::always (), true);
+
+  gcc_assert (loop2);
+  update_ssa (TODO_update_ssa);
+
+  basic_block loop2_cond_exit_bb = get_bb_copy (cond_e->src);
+  free_original_copy_tables ();
+
+  gcond *gc = as_a<gcond *> (last_stmt (cond_e->src));
+  gcond *dup_gc = as_a<gcond *> (last_stmt (loop2_cond_exit_bb));
+
+  /* Change if (i != n) to LOOP1:if (i > n) and LOOP2:if (i < n) */
+  bool inv = expr_invariant_in_loop_p (loop, gimple_cond_lhs (gc));
+  enum tree_code up_code = inv ? LT_EXPR : GT_EXPR;
+  enum tree_code down_code = inv ? GT_EXPR : LT_EXPR;
+
+  gimple_cond_set_code (gc, up_code);
+  gimple_cond_set_code (dup_gc, down_code);
+
+  /* Link the exit cond edge to new loop.  */
+  gcond *break_cond = as_a<gcond *> (gimple_copy (gc));
+  edge pred_e = single_pred_edge (loop->latch);
+  bool simple_loop = pred_e && pred_e->src == cond_e->src
+		     && (gsi_end_p (gsi_start_nondebug_bb (loop->latch)));
+  if (simple_loop)
+    gimple_cond_set_code (break_cond, down_code);
+  else
+    gimple_cond_make_true (break_cond);
+
+  basic_block break_bb = split_edge (cond_e);
+  gimple_stmt_iterator gsi = gsi_last_bb (break_bb);
+  gsi_insert_after (&gsi, break_cond, GSI_NEW_STMT);
+
+  edge to_exit = single_succ_edge (break_bb);
+  edge to_new_loop = make_edge (break_bb, loop_preheader_edge (loop2)->src, 0);
+  to_new_loop->flags |= EDGE_TRUE_VALUE;
+  to_exit->flags |= EDGE_FALSE_VALUE;
+  to_exit->flags &= ~EDGE_FALLTHRU;
+  to_exit->probability = cond_e->probability;
+  to_new_loop->probability = to_exit->probability.invert ();
+
+  update_ssa (TODO_update_ssa);
+
+  connect_loop_phis (loop, loop2, to_new_loop, !simple_loop);
+
+  rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, ";; Loop split on wrap index.\n");
+
+  return true;
+}
+
+/* Checks if LOOP contains a suitable NE_EXPR conditional block to split.
+L_H:
+ if (i!=N)
+   S;
+ i++;
+ goto L_H;
+
+The "i!=N" is like "i>N || i<N", then it could be transform to:
+
+L_H:
+ if (i>N)
+   S;
+ i++;
+ goto L_H;
+L1_H:
+ if (i<N)
+   S;
+ i++;
+ goto L1_H;
+
+The loop with "i<N" is in favor both GIMPLE and RTL passes.  */
+
+static bool
+split_loop_on_ne_cond (class loop *loop)
+{
+  int num = 0;
+  basic_block *bbs = get_loop_body (loop);
+
+  if (!can_copy_bbs_p (bbs, loop->num_nodes))
+    {
+      free (bbs);
+      return false;
+    }
+
+  for (unsigned i = 0; i < loop->num_nodes; i++)
+    num += estimate_num_insns_seq (bb_seq (bbs[i]), &eni_size_weights);
+  free (bbs);
+
+  if (num > param_max_peeled_insns)
+    return false;
+
+  edge branch_edge = get_ne_cond_branch (loop);
+  if (branch_edge && split_ne_loop (loop, branch_edge))
+    return true;
+
+  return false;
+}
+
 /* Main entry point.  Perform loop splitting on all suitable loops.  */
 
 static unsigned int
@@ -1622,7 +1803,8 @@  tree_ssa_split_loops (void)
       if (optimize_loop_for_size_p (loop))
 	continue;
 
-      if (split_loop (loop) || split_loop_on_cond (loop))
+      if (split_loop (loop) || split_loop_on_cond (loop)
+	  || split_loop_on_ne_cond (loop))
 	{
 	  /* Mark our containing loop as having had some split inner loops.  */
 	  loop_outer (loop)->aux = loop;