diff mbox series

[4/21] middle-end: update loop peeling code to maintain LCSSA form for early breaks

Message ID ZUiX5Fenyx5FRkrJ@arm.com
State New
Headers show
Series None | expand

Commit Message

Tamar Christina Nov. 6, 2023, 7:38 a.m. UTC
Hi All,

This splits the part of the function that does peeling for loops at exits to
a different function.  In this new function we also peel for early breaks.

Peeling for early breaks works by redirecting all early break exits to a
single "early break" block and combine them and the normal exit edge together
later in a different block which then goes into the epilog preheader.

This allows us to re-use all the existing code for IV updates.  Additionally,
this also enables correct linking for multiple vector epilogues.

flush_pending_stmts cannot be used in this scenario since it updates the PHI
nodes in the order that they are in the exit destination blocks.  This means
they are in CFG visit order.  With a single exit this doesn't matter but with
multiple exits with different live values through the different exits the order
usually does not line up.

Additionally the vectorizer helper functions expect to be able to iterate over
the nodes in the order that they occur in the loop header blocks.  This is an
invariant we must maintain.  To do this we just inline the work of
flush_pending_stmts but maintain the order by using the header blocks to guide
the work.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-loop-manip.cc (vect_is_loop_exit_latch_pred): New.
	(slpeel_tree_duplicate_loop_for_vectorization): New.
	(slpeel_tree_duplicate_loop_to_edge_cfg): Use it.
	* tree-vectorizer.h (is_loop_header_bb_p): Drop assert.
	(slpeel_tree_duplicate_loop_to_edge_cfg): Update signature.
	(vect_is_loop_exit_latch_pred): New.

--- inline copy of patch -- 
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 43ca985c53ce58aa83fb9689a9ea9b20b207e0a8..6fbb5b80986fd657814b48eb009b52b094f331e6 100644




--
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 43ca985c53ce58aa83fb9689a9ea9b20b207e0a8..6fbb5b80986fd657814b48eb009b52b094f331e6 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -1444,6 +1444,151 @@ slpeel_duplicate_current_defs_from_edges (edge from, edge to)
 		     get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
 }
 
+/* Determine if the exit chosen by the loop vectorizer differs from the
+   natural loop exit, i.e. if the exit leads to the loop latch or not.
+   When this happens we need to flip the understanding of main and other
+   exits by peeling and IV updates.  */
+
+bool
+vect_is_loop_exit_latch_pred (edge loop_exit, class loop *loop)
+{
+  return single_pred (loop->latch) == loop_exit->src;
+}
+
+/* Perform peeling for when the peeled loop is placed after the original loop.
+   This maintains LCSSA and creates the appropriate blocks for multiple exit
+   vectorization.   */
+
+void static
+slpeel_tree_duplicate_loop_for_vectorization (class loop *loop, edge loop_exit,
+					      vec<edge> &loop_exits, edge e,
+					      class loop *new_loop,
+					      bool flow_loops,
+					      basic_block new_preheader)
+{
+  bool multiple_exits_p = loop_exits.length () > 1;
+  basic_block main_loop_exit_block = new_preheader;
+  if (multiple_exits_p)
+    {
+      edge loop_entry = single_succ_edge (new_preheader);
+      new_preheader = split_edge (loop_entry);
+    }
+
+  /* First create the empty phi nodes so that when we flush the
+     statements they can be filled in.   However because there is no order
+     between the PHI nodes in the exits and the loop headers we need to
+     order them based on the order of the two headers.  First record the new
+     phi nodes.  Then redirect the edges and flush the changes.  This writes
+     out the new SSA names.  */
+  for (auto exit : loop_exits)
+    {
+      basic_block dest
+	= exit == loop_exit ? main_loop_exit_block : new_preheader;
+      redirect_edge_and_branch (exit, dest);
+    }
+
+  /* Copy the current loop LC PHI nodes between the original loop exit
+     block and the new loop header.  This allows us to later split the
+     preheader block and still find the right LC nodes.  */
+  edge loop_entry = single_succ_edge (new_preheader);
+  hash_set<tree> lcssa_vars;
+  if (flow_loops)
+    for (auto gsi_from = gsi_start_phis (loop->header),
+	 gsi_to = gsi_start_phis (new_loop->header);
+	 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
+	 gsi_next (&gsi_from), gsi_next (&gsi_to))
+      {
+	gimple *from_phi = gsi_stmt (gsi_from);
+	gimple *to_phi = gsi_stmt (gsi_to);
+	tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, loop_latch_edge (loop));
+
+	/* In all cases, even in early break situations we're only
+	   interested in the number of fully executed loop iters.  As such
+	   we discard any partially done iteration.  So we simply propagate
+	   the phi nodes from the latch to the merge block.  */
+	tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
+	gphi *lcssa_phi = create_phi_node (new_res, main_loop_exit_block);
+
+	/* Check if we haven't picked a different loop exit.  If we have we
+	   need to flip the LCSSA vars to prevent incorrect linking.  */
+	tree alt_arg = gimple_phi_result (from_phi);
+	if (!vect_is_loop_exit_latch_pred (loop_exit, loop))
+	  std::swap (new_arg, alt_arg);
+
+	lcssa_vars.add (new_arg);
+
+	/* Main loop exit should use the final iter value.  */
+	add_phi_arg (lcssa_phi, new_arg, loop_exit, UNKNOWN_LOCATION);
+
+	/* All other exits use the previous iters.  */
+	if (multiple_exits_p)
+	  {
+	    tree alt_res = copy_ssa_name (alt_arg);
+	    gphi *alt_lcssa_phi = create_phi_node (alt_res, new_preheader);
+	    edge main_e = single_succ_edge (main_loop_exit_block);
+	    for (edge e : loop_exits)
+	      if (e != loop_exit)
+		{
+		  add_phi_arg (alt_lcssa_phi, alt_arg, e, UNKNOWN_LOCATION);
+		  SET_PHI_ARG_DEF (alt_lcssa_phi, main_e->dest_idx, new_res);
+		}
+	    new_res = alt_res; /* Push it down to the new_loop header.  */
+	  }
+
+	adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
+    }
+
+  /* Copy over any live SSA vars that may not have been materialized in the
+     loops themselves but would be in the exit block.  However when the live
+     value is not used inside the loop then we don't need to do this,  if we do
+     then when we split the guard block the branch edge can end up containing the
+     wrong reference,  particularly if it shares an edge with something that has
+     bypassed the loop.  This is not something peeling can check so we need to
+     anticipate the usage of the live variable here.  */
+  auto exit_map = redirect_edge_var_map_vector (loop_exit);
+  if (exit_map)
+    for (auto vm : exit_map)
+      {
+	if (lcssa_vars.contains (vm.def)
+	    || TREE_CODE (vm.def) != SSA_NAME)
+	  continue;
+
+	imm_use_iterator imm_iter;
+	use_operand_p use_p;
+	bool use_in_loop = false;
+
+	FOR_EACH_IMM_USE_FAST (use_p, imm_iter, vm.def)
+	  {
+	    basic_block bb = gimple_bb (USE_STMT (use_p));
+	    if (flow_bb_inside_loop_p (loop, bb)
+		&& !gimple_vuse (USE_STMT (use_p)))
+	      {
+		use_in_loop = true;
+		break;
+	      }
+	  }
+
+	if (!use_in_loop && SSA_VAR_P (vm.def))
+	  {
+	    /* Do a final check to see if it's perhaps defined in the loop.
+	       This mirrors the relevancy analysis's used_outside_scope.  */
+	    if (virtual_operand_p (vm.def)
+		&& (SSA_NAME_IS_DEFAULT_DEF (vm.def)
+		    || !flow_bb_inside_loop_p (loop,
+				gimple_bb (SSA_NAME_DEF_STMT (vm.def)))))
+	      continue;
+	  }
+
+	tree new_res = copy_ssa_name (vm.result);
+	gphi *lcssa_phi = create_phi_node (new_res, e->dest);
+	add_phi_arg (lcssa_phi, vm.def, loop_exit, vm.locus);
+    }
+
+  /* Now clear all the redirect maps.  */
+  for (auto exit : loop_exits)
+    redirect_edge_var_map_clear (exit);
+}
+
 /* Given LOOP this function generates a new copy of it and puts it
    on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
    non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
@@ -1455,13 +1600,16 @@ slpeel_duplicate_current_defs_from_edges (edge from, edge to)
    copies remains the same.
 
    If UPDATED_DOMS is not NULL it is update with the list of basic blocks whoms
-   dominators were updated during the peeling.  */
+   dominators were updated during the peeling.  When doing early break vectorization
+   then LOOP_VINFO needs to be provided and is used to keep track of any newly created
+   memory references that need to be updated should we decide to vectorize.  */
 
 class loop *
 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 					class loop *scalar_loop,
 					edge scalar_exit, edge e, edge *new_e,
-					bool flow_loops)
+					bool flow_loops,
+					vec<basic_block> *updated_doms)
 {
   class loop *new_loop;
   basic_block *new_bbs, *bbs, *pbbs;
@@ -1593,7 +1741,9 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
     }
 
   auto loop_exits = get_loop_exit_edges (loop);
+  bool multiple_exits_p = loop_exits.length () > 1;
   auto_vec<basic_block> doms;
+  class loop *update_loop = NULL;
 
   if (at_exit) /* Add the loop copy at exit.  */
     {
@@ -1603,103 +1753,9 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 	  flush_pending_stmts (new_exit);
 	}
 
-      auto_vec <gimple *> new_phis;
-      hash_map <tree, tree> new_phi_args;
-      /* First create the empty phi nodes so that when we flush the
-	 statements they can be filled in.   However because there is no order
-	 between the PHI nodes in the exits and the loop headers we need to
-	 order them base on the order of the two headers.  First record the new
-	 phi nodes.  */
-      for (auto gsi_from = gsi_start_phis (scalar_exit->dest);
-	   !gsi_end_p (gsi_from); gsi_next (&gsi_from))
-	{
-	  gimple *from_phi = gsi_stmt (gsi_from);
-	  tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
-	  gphi *res = create_phi_node (new_res, new_preheader);
-	  new_phis.safe_push (res);
-	}
-
-      /* Then redirect the edges and flush the changes.  This writes out the new
-	 SSA names.  */
-      for (edge exit : loop_exits)
-	{
-	  edge temp_e = redirect_edge_and_branch (exit, new_preheader);
-	  flush_pending_stmts (temp_e);
-	}
-      /* Record the new SSA names in the cache so that we can skip materializing
-	 them again when we fill in the rest of the LCSSA variables.  */
-      for (auto phi : new_phis)
-	{
-	  tree new_arg = gimple_phi_arg (phi, 0)->def;
-
-	  if (!SSA_VAR_P (new_arg))
-	    continue;
-	  /* If the PHI MEM node dominates the loop then we shouldn't create
-	      a new LC-SSSA PHI for it in the intermediate block.   */
-	  /* A MEM phi that consitutes a new DEF for the vUSE chain can either
-	     be a .VDEF or a PHI that operates on MEM. And said definition
-	     must not be inside the main loop.  Or we must be a parameter.
-	     In the last two cases we may remove a non-MEM PHI node, but since
-	     they dominate both loops the removal is unlikely to cause trouble
-	     as the exits must already be using them.  */
-	  if (virtual_operand_p (new_arg)
-	      && (SSA_NAME_IS_DEFAULT_DEF (new_arg)
-		  || !flow_bb_inside_loop_p (loop,
-				gimple_bb (SSA_NAME_DEF_STMT (new_arg)))))
-	    {
-	      auto gsi = gsi_for_stmt (phi);
-	      remove_phi_node (&gsi, true);
-	      continue;
-	    }
-	  new_phi_args.put (new_arg, gimple_phi_result (phi));
-
-	  if (TREE_CODE (new_arg) != SSA_NAME)
-	    continue;
-	  /* If the PHI node dominates the loop then we shouldn't create
-	      a new LC-SSSA PHI for it in the intermediate block.  Unless the
-	      the loop has been versioned.  If it has then we need the PHI
-	      node such that later when the loop guard is added the original
-	      dominating PHI can be found.  */
-	  basic_block def_bb = gimple_bb (SSA_NAME_DEF_STMT (new_arg));
-	  if (loop == scalar_loop
-	      && (!def_bb || !flow_bb_inside_loop_p (loop, def_bb)))
-	    {
-	      auto gsi = gsi_for_stmt (phi);
-	      remove_phi_node (&gsi, true);
-	    }
-	}
-
-      /* Copy the current loop LC PHI nodes between the original loop exit
-	 block and the new loop header.  This allows us to later split the
-	 preheader block and still find the right LC nodes.  */
-      edge loop_entry = single_succ_edge (new_preheader);
-      if (flow_loops)
-	for (auto gsi_from = gsi_start_phis (loop->header),
-	     gsi_to = gsi_start_phis (new_loop->header);
-	     !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
-	     gsi_next (&gsi_from), gsi_next (&gsi_to))
-	  {
-	    gimple *from_phi = gsi_stmt (gsi_from);
-	    gimple *to_phi = gsi_stmt (gsi_to);
-	    tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
-						  loop_latch_edge (loop));
-
-	    /* Check if we've already created a new phi node during edge
-	       redirection.  If we have, only propagate the value downwards.  */
-	    if (tree *res = new_phi_args.get (new_arg))
-	      {
-		adjust_phi_and_debug_stmts (to_phi, loop_entry, *res);
-		continue;
-	      }
-
-	    tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
-	    gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
-
-	    /* Main loop exit should use the final iter value.  */
-	    add_phi_arg (lcssa_phi, new_arg, loop_exit, UNKNOWN_LOCATION);
-
-	    adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
-	  }
+      slpeel_tree_duplicate_loop_for_vectorization (loop, loop_exit, loop_exits,
+						    e, new_loop, flow_loops,
+						    new_preheader);
 
       set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
 
@@ -1713,6 +1769,21 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
       delete_basic_block (preheader);
       set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
 			       loop_preheader_edge (scalar_loop)->src);
+
+      /* Finally after wiring the new epilogue we need to update its main exit
+	 to the original function exit we recorded.  Other exits are already
+	 correct.  */
+      if (multiple_exits_p)
+	{
+	  update_loop = new_loop;
+	  for (edge e : get_loop_exit_edges (loop))
+	    doms.safe_push (e->dest);
+	  doms.safe_push (exit_dest);
+
+	  /* Likely a fall-through edge, so update if needed.  */
+	  if (single_succ_p (exit_dest))
+	    doms.safe_push (single_succ (exit_dest));
+	}
     }
   else /* Add the copy at entry.  */
     {
@@ -1760,6 +1831,34 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
       delete_basic_block (new_preheader);
       set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
 			       loop_preheader_edge (new_loop)->src);
+
+      if (multiple_exits_p)
+	update_loop = loop;
+    }
+
+  if (multiple_exits_p)
+    {
+      for (edge e : get_loop_exit_edges (update_loop))
+	{
+	  edge ex;
+	  edge_iterator ei;
+	  FOR_EACH_EDGE (ex, ei, e->dest->succs)
+	    {
+	      /* Find the first non-fallthrough block as fall-throughs can't
+		 dominate other blocks.  */
+	      if (single_succ_p (ex->dest))
+		{
+		  doms.safe_push (ex->dest);
+		  ex = single_succ_edge (ex->dest);
+		}
+	      doms.safe_push (ex->dest);
+	    }
+	  doms.safe_push (e->dest);
+	}
+
+      iterate_fix_dominators (CDI_DOMINATORS, doms, false);
+      if (updated_doms)
+	updated_doms->safe_splice (doms);
     }
 
   free (new_bbs);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 1418913d2c308b0cf78352e29dc9958746fb9c94..d8b532c4b8ca92a856368a686598859fab9d40e9 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1821,7 +1821,7 @@ is_loop_header_bb_p (basic_block bb)
 {
   if (bb == (bb->loop_father)->header)
     return true;
-  gcc_checking_assert (EDGE_COUNT (bb->preds) == 1);
+
   return false;
 }
 
@@ -2212,7 +2212,8 @@ extern bool slpeel_can_duplicate_loop_p (const class loop *, const_edge,
 					 const_edge);
 class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *, edge,
 						    class loop *, edge,
-						    edge, edge *, bool = true);
+						    edge, edge *, bool = true,
+						    vec<basic_block> * = NULL);
 class loop *vect_loop_versioning (loop_vec_info, gimple *);
 extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
 				    tree *, tree *, tree *, int, bool, bool,
@@ -2223,6 +2224,7 @@ extern dump_user_location_t find_loop_location (class loop *);
 extern bool vect_can_advance_ivs_p (loop_vec_info);
 extern void vect_update_inits_of_drs (loop_vec_info, tree, tree_code);
 extern edge vec_init_loop_exit_info (class loop *);
+extern bool vect_is_loop_exit_latch_pred (edge, class loop *);
 
 /* In tree-vect-stmts.cc.  */
 extern tree get_related_vectype_for_scalar_type (machine_mode, tree,

Comments

Tamar Christina Nov. 15, 2023, midnight UTC | #1
Patch updated to latest trunk,

This splits the part of the function that does peeling for loops at exits to
a different function.  In this new function we also peel for early breaks.

Peeling for early breaks works by redirecting all early break exits to a
single "early break" block and combine them and the normal exit edge together
later in a different block which then goes into the epilog preheader.

This allows us to re-use all the existing code for IV updates.  Additionally,
this also enables correct linking for multiple vector epilogues.

flush_pending_stmts cannot be used in this scenario since it updates the PHI
nodes in the order that they are in the exit destination blocks.  This means
they are in CFG visit order.  With a single exit this doesn't matter but with
multiple exits with different live values through the different exits the order
usually does not line up.

Additionally the vectorizer helper functions expect to be able to iterate over
the nodes in the order that they occur in the loop header blocks.  This is an
invariant we must maintain.  To do this we just inline the work of
flush_pending_stmts but maintain the order by using the header blocks to guide
the work.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-loop-manip.cc (vect_is_loop_exit_latch_pred): New.
	(slpeel_tree_duplicate_loop_for_vectorization): New.
	(slpeel_tree_duplicate_loop_to_edge_cfg): Use it.
	* tree-vectorizer.h (is_loop_header_bb_p): Drop assert.
	(slpeel_tree_duplicate_loop_to_edge_cfg): Update signature.
	(vect_is_loop_exit_latch_pred): New.

--- inline copy of patch ---

diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index b9161274ce401a7307f3e61ad23aa036701190d7..fafbf924e8db18eb4eec7a4a1906d10f6ce9812f 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -1392,6 +1392,153 @@ vect_set_loop_condition (class loop *loop, edge loop_e, loop_vec_info loop_vinfo
 		     (gimple *) cond_stmt);
 }
 
+/* Determine if the exit chosen by the loop vectorizer differs from the
+   natural loop exit, i.e. if the exit leads to the loop latch or not.
+   When this happens we need to flip the understanding of main and other
+   exits by peeling and IV updates.  */
+
+bool inline
+vect_is_loop_exit_latch_pred (edge loop_exit, class loop *loop)
+{
+  return single_pred (loop->latch) == loop_exit->src;
+}
+
+/* Perform peeling for when the peeled loop is placed after the original loop.
+   This maintains LCSSA and creates the appropriate blocks for multiple exit
+   vectorization.   */
+
+void static
+slpeel_tree_duplicate_loop_for_vectorization (class loop *loop, edge loop_exit,
+					      vec<edge> &loop_exits,
+					      class loop *new_loop,
+					      bool flow_loops,
+					      basic_block new_preheader)
+{
+  bool multiple_exits_p = loop_exits.length () > 1;
+  basic_block main_loop_exit_block = new_preheader;
+  if (multiple_exits_p)
+    {
+      edge loop_entry = single_succ_edge (new_preheader);
+      new_preheader = split_edge (loop_entry);
+    }
+
+  auto_vec <gimple *> new_phis;
+  hash_map <tree, tree> new_phi_args;
+  /* First create the empty phi nodes so that when we flush the
+     statements they can be filled in.   However because there is no order
+     between the PHI nodes in the exits and the loop headers we need to
+     order them based on the order of the two headers.  First record the new
+     phi nodes. Then redirect the edges and flush the changes.  This writes out
+     the new SSA names.  */
+  for (auto gsi_from = gsi_start_phis (loop_exit->dest);
+       !gsi_end_p (gsi_from); gsi_next (&gsi_from))
+    {
+      gimple *from_phi = gsi_stmt (gsi_from);
+      tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
+      gphi *res = create_phi_node (new_res, main_loop_exit_block);
+      new_phis.safe_push (res);
+    }
+
+  for (auto exit : loop_exits)
+    {
+      basic_block dest
+	= exit == loop_exit ? main_loop_exit_block : new_preheader;
+      redirect_edge_and_branch (exit, dest);
+    }
+
+  /* Only flush the main exit; the remaining exits need to match the order
+     in the loop->header, which with multiple exits may not be the same.  */
+  flush_pending_stmts (loop_exit);
+
+  /* Record the new SSA names in the cache so that we can skip materializing
+     them again when we fill in the rest of the LCSSA variables.  */
+  for (auto phi : new_phis)
+    {
+      tree new_arg = gimple_phi_arg (phi, 0)->def;
+
+      if (!SSA_VAR_P (new_arg))
+	continue;
+
+      /* If the PHI MEM node dominates the loop then we shouldn't create
+	 a new LC-SSA PHI for it in the intermediate block.   */
+      /* A MEM phi that constitutes a new DEF for the vUSE chain can either
+	 be a .VDEF or a PHI that operates on MEM. And said definition
+	 must not be inside the main loop.  Or we must be a parameter.
+	 In the last two cases we may remove a non-MEM PHI node, but since
+	 they dominate both loops the removal is unlikely to cause trouble
+	 as the exits must already be using them.  */
+      if (virtual_operand_p (new_arg)
+	  && (SSA_NAME_IS_DEFAULT_DEF (new_arg)
+	      || !flow_bb_inside_loop_p (loop,
+				gimple_bb (SSA_NAME_DEF_STMT (new_arg)))))
+	{
+	  auto gsi = gsi_for_stmt (phi);
+	  remove_phi_node (&gsi, true);
+	  continue;
+	}
+
+      /* If we decide to remove the PHI node we should also not
+	 rematerialize it later on.  */
+      new_phi_args.put (new_arg, gimple_phi_result (phi));
+
+      if (TREE_CODE (new_arg) != SSA_NAME)
+	continue;
+    }
+
+  /* Copy the current loop LC PHI nodes between the original loop exit
+     block and the new loop header.  This allows us to later split the
+     preheader block and still find the right LC nodes.  */
+  edge loop_entry = single_succ_edge (new_preheader);
+  if (flow_loops)
+    for (auto gsi_from = gsi_start_phis (loop->header),
+	 gsi_to = gsi_start_phis (new_loop->header);
+	 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
+	 gsi_next (&gsi_from), gsi_next (&gsi_to))
+      {
+	gimple *from_phi = gsi_stmt (gsi_from);
+	gimple *to_phi = gsi_stmt (gsi_to);
+	tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, loop_latch_edge (loop));
+	tree *res = NULL;
+
+	/* Check if we've already created a new phi node during edge
+	   redirection.  If we have, only propagate the value downwards.  */
+	if ((res = new_phi_args.get (new_arg)))
+	  new_arg = *res;
+
+	/* All other exits use the previous iters.  */
+	if (multiple_exits_p)
+	  {
+	    tree alt_arg = gimple_phi_result (from_phi);
+	    tree alt_res = copy_ssa_name (alt_arg);
+	    gphi *alt_lcssa_phi = create_phi_node (alt_res, new_preheader);
+	    edge main_e = single_succ_edge (main_loop_exit_block);
+	    for (edge e : loop_exits)
+	      if (e != loop_exit)
+		{
+		  add_phi_arg (alt_lcssa_phi, alt_arg, e, UNKNOWN_LOCATION);
+		  SET_PHI_ARG_DEF (alt_lcssa_phi, main_e->dest_idx, new_arg);
+		}
+	    new_arg = alt_res; /* Push it down to the new_loop header.  */
+	  } else if (!res) {
+	    /* For non-early break we need to keep the possibly live values in
+	       the exit block.  For early break these are kept in the merge
+	       block in the code above.  */
+	    tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
+	    gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
+
+	    /* Main loop exit should use the final iter value.  */
+	    add_phi_arg (lcssa_phi, new_arg, loop_exit, UNKNOWN_LOCATION);
+	    new_arg = new_res;
+	  }
+
+	adjust_phi_and_debug_stmts (to_phi, loop_entry, new_arg);
+    }
+
+  /* Now clear all the redirect maps.  */
+  for (auto exit : loop_exits)
+    redirect_edge_var_map_clear (exit);
+}
+
 /* Given LOOP this function generates a new copy of it and puts it
    on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
    non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
@@ -1403,13 +1550,16 @@ vect_set_loop_condition (class loop *loop, edge loop_e, loop_vec_info loop_vinfo
    copies remains the same.
 
    If UPDATED_DOMS is not NULL it is update with the list of basic blocks whoms
-   dominators were updated during the peeling.  */
+   dominators were updated during the peeling.  When doing early break vectorization
+   then LOOP_VINFO needs to be provided and is used to keep track of any newly created
+   memory references that need to be updated should we decide to vectorize.  */
 
 class loop *
 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 					class loop *scalar_loop,
 					edge scalar_exit, edge e, edge *new_e,
-					bool flow_loops)
+					bool flow_loops,
+					vec<basic_block> *updated_doms)
 {
   class loop *new_loop;
   basic_block *new_bbs, *bbs, *pbbs;
@@ -1526,7 +1676,9 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
       }
 
   auto loop_exits = get_loop_exit_edges (loop);
+  bool multiple_exits_p = loop_exits.length () > 1;
   auto_vec<basic_block> doms;
+  class loop *update_loop = NULL;
 
   if (at_exit) /* Add the loop copy at exit.  */
     {
@@ -1536,91 +1688,9 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 	  flush_pending_stmts (new_exit);
 	}
 
-      auto_vec <gimple *> new_phis;
-      hash_map <tree, tree> new_phi_args;
-      /* First create the empty phi nodes so that when we flush the
-	 statements they can be filled in.   However because there is no order
-	 between the PHI nodes in the exits and the loop headers we need to
-	 order them base on the order of the two headers.  First record the new
-	 phi nodes.  */
-      for (auto gsi_from = gsi_start_phis (scalar_exit->dest);
-	   !gsi_end_p (gsi_from); gsi_next (&gsi_from))
-	{
-	  gimple *from_phi = gsi_stmt (gsi_from);
-	  tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
-	  gphi *res = create_phi_node (new_res, new_preheader);
-	  new_phis.safe_push (res);
-	}
-
-      /* Then redirect the edges and flush the changes.  This writes out the new
-	 SSA names.  */
-      for (edge exit : loop_exits)
-	{
-	  edge temp_e = redirect_edge_and_branch (exit, new_preheader);
-	  flush_pending_stmts (temp_e);
-	}
-      /* Record the new SSA names in the cache so that we can skip materializing
-	 them again when we fill in the rest of the LCSSA variables.  */
-      for (auto phi : new_phis)
-	{
-	  tree new_arg = gimple_phi_arg (phi, 0)->def;
-
-	  if (!SSA_VAR_P (new_arg))
-	    continue;
-	  /* If the PHI MEM node dominates the loop then we shouldn't create
-	      a new LC-SSSA PHI for it in the intermediate block.   */
-	  /* A MEM phi that consitutes a new DEF for the vUSE chain can either
-	     be a .VDEF or a PHI that operates on MEM. And said definition
-	     must not be inside the main loop.  Or we must be a parameter.
-	     In the last two cases we may remove a non-MEM PHI node, but since
-	     they dominate both loops the removal is unlikely to cause trouble
-	     as the exits must already be using them.  */
-	  if (virtual_operand_p (new_arg)
-	      && (SSA_NAME_IS_DEFAULT_DEF (new_arg)
-		  || !flow_bb_inside_loop_p (loop,
-				gimple_bb (SSA_NAME_DEF_STMT (new_arg)))))
-	    {
-	      auto gsi = gsi_for_stmt (phi);
-	      remove_phi_node (&gsi, true);
-	      continue;
-	    }
-	  new_phi_args.put (new_arg, gimple_phi_result (phi));
-
-	  if (TREE_CODE (new_arg) != SSA_NAME)
-	    continue;
-	}
-
-      /* Copy the current loop LC PHI nodes between the original loop exit
-	 block and the new loop header.  This allows us to later split the
-	 preheader block and still find the right LC nodes.  */
-      edge loop_entry = single_succ_edge (new_preheader);
-      if (flow_loops)
-	for (auto gsi_from = gsi_start_phis (loop->header),
-	     gsi_to = gsi_start_phis (new_loop->header);
-	     !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
-	     gsi_next (&gsi_from), gsi_next (&gsi_to))
-	  {
-	    gimple *from_phi = gsi_stmt (gsi_from);
-	    gimple *to_phi = gsi_stmt (gsi_to);
-	    tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
-						  loop_latch_edge (loop));
-
-	    /* Check if we've already created a new phi node during edge
-	       redirection.  If we have, only propagate the value downwards.  */
-	    if (tree *res = new_phi_args.get (new_arg))
-	      {
-		adjust_phi_and_debug_stmts (to_phi, loop_entry, *res);
-		continue;
-	      }
-
-	    tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
-	    gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
-
-	    /* Main loop exit should use the final iter value.  */
-	    add_phi_arg (lcssa_phi, new_arg, loop_exit, UNKNOWN_LOCATION);
-
-	    adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
-	  }
+      slpeel_tree_duplicate_loop_for_vectorization (loop, loop_exit, loop_exits,
+						    new_loop, flow_loops,
+						    new_preheader);
 
       set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
 
@@ -1634,6 +1704,21 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
       delete_basic_block (preheader);
       set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
 			       loop_preheader_edge (scalar_loop)->src);
+
+      /* Finally after wiring the new epilogue we need to update its main exit
+	 to the original function exit we recorded.  Other exits are already
+	 correct.  */
+      if (multiple_exits_p)
+	{
+	  update_loop = new_loop;
+	  for (edge e : get_loop_exit_edges (loop))
+	    doms.safe_push (e->dest);
+	  doms.safe_push (exit_dest);
+
+	  /* Likely a fall-through edge, so update if needed.  */
+	  if (single_succ_p (exit_dest))
+	    doms.safe_push (single_succ (exit_dest));
+	}
     }
   else /* Add the copy at entry.  */
     {
@@ -1681,6 +1766,34 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
       delete_basic_block (new_preheader);
       set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
 			       loop_preheader_edge (new_loop)->src);
+
+      if (multiple_exits_p)
+	update_loop = loop;
+    }
+
+  if (multiple_exits_p)
+    {
+      for (edge e : get_loop_exit_edges (update_loop))
+	{
+	  edge ex;
+	  edge_iterator ei;
+	  FOR_EACH_EDGE (ex, ei, e->dest->succs)
+	    {
+	      /* Find the first non-fallthrough block as fall-throughs can't
+		 dominate other blocks.  */
+	      if (single_succ_p (ex->dest))
+		{
+		  doms.safe_push (ex->dest);
+		  ex = single_succ_edge (ex->dest);
+		}
+	      doms.safe_push (ex->dest);
+	    }
+	  doms.safe_push (e->dest);
+	}
+
+      iterate_fix_dominators (CDI_DOMINATORS, doms, false);
+      if (updated_doms)
+	updated_doms->safe_splice (doms);
     }
 
   free (new_bbs);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 76451a7fefe6ff966cecfa2cbc7b11336b038565..b9a71a0b5f5407417e8366b0df132df20c7f60aa 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1821,7 +1821,7 @@ is_loop_header_bb_p (basic_block bb)
 {
   if (bb == (bb->loop_father)->header)
     return true;
-  gcc_checking_assert (EDGE_COUNT (bb->preds) == 1);
+
   return false;
 }
 
@@ -2212,7 +2212,8 @@ extern bool slpeel_can_duplicate_loop_p (const class loop *, const_edge,
 					 const_edge);
 class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *, edge,
 						    class loop *, edge,
-						    edge, edge *, bool = true);
+						    edge, edge *, bool = true,
+						    vec<basic_block> * = NULL);
 class loop *vect_loop_versioning (loop_vec_info, gimple *);
 extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
 				    tree *, tree *, tree *, int, bool, bool,
@@ -2223,6 +2224,7 @@ extern dump_user_location_t find_loop_location (class loop *);
 extern bool vect_can_advance_ivs_p (loop_vec_info);
 extern void vect_update_inits_of_drs (loop_vec_info, tree, tree_code);
 extern edge vec_init_loop_exit_info (class loop *);
+extern bool vect_is_loop_exit_latch_pred (edge, class loop *);
 
 /* In tree-vect-stmts.cc.  */
 extern tree get_related_vectype_for_scalar_type (machine_mode, tree,
Richard Biener Nov. 15, 2023, 12:40 p.m. UTC | #2
On Wed, 15 Nov 2023, Tamar Christina wrote:

> Patch updated to latest trunk,
> 
> This splits the part of the function that does peeling for loops at exits to
> a different function.  In this new function we also peel for early breaks.
> 
> Peeling for early breaks works by redirecting all early break exits to a
> single "early break" block and combine them and the normal exit edge together
> later in a different block which then goes into the epilog preheader.
> 
> This allows us to re-use all the existing code for IV updates, Additionally this
> also enables correct linking for multiple vector epilogues.
> 
> flush_pending_stmts cannot be used in this scenario since it updates the PHI
> nodes in the order that they are in the exit destination blocks.  This means
> they are in CFG visit order.  With a single exit this doesn't matter but with
> multiple exits with different live values through the different exits the order
> usually does not line up.
> 
> Additionally the vectorizer helper functions expect to be able to iterate over
> the nodes in the order that they occur in the loop header blocks.  This is an
> invariant we must maintain.  To do this we just inline the work
> flush_pending_stmts but maintain the order by using the header blocks to guide
> the work.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* tree-vect-loop-manip.cc (vect_is_loop_exit_latch_pred): New.
> 	(slpeel_tree_duplicate_loop_for_vectorization): New.
> 	(slpeel_tree_duplicate_loop_to_edge_cfg): use it.
> 	* tree-vectorizer.h (is_loop_header_bb_p): Drop assert.
> 	(slpeel_tree_duplicate_loop_to_edge_cfg): Update signature.
> 	(vect_is_loop_exit_latch_pred): New.
> 
> --- inline copy of patch ---
> 
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index b9161274ce401a7307f3e61ad23aa036701190d7..fafbf924e8db18eb4eec7a4a1906d10f6ce9812f 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -1392,6 +1392,153 @@ vect_set_loop_condition (class loop *loop, edge loop_e, loop_vec_info loop_vinfo
>  		     (gimple *) cond_stmt);
>  }
>  
> +/* Determine if the exit choosen by the loop vectorizer differs from the
> +   natural loop exit.  i.e. if the exit leads to the loop patch or not.
> +   When this happens we need to flip the understanding of main and other
> +   exits by peeling and IV updates.  */
> +
> +bool inline
> +vect_is_loop_exit_latch_pred (edge loop_exit, class loop *loop)

Ick, bad name - didn't see its use(s) in this patch?


> +{
> +  return single_pred (loop->latch) == loop_exit->src;
> +}
> +
> +/* Perform peeling for when the peeled loop is placed after the original loop.
> +   This maintains LCSSA and creates the appropriate blocks for multiple exit
> +   vectorization.   */
> +
> +void static
> +slpeel_tree_duplicate_loop_for_vectorization (class loop *loop, edge loop_exit,
> +					      vec<edge> &loop_exits,
> +					      class loop *new_loop,
> +					      bool flow_loops,
> +					      basic_block new_preheader)

also bad name ;)  I don't see a strong reason to factor this out.

> +{
> +  bool multiple_exits_p = loop_exits.length () > 1;
> +  basic_block main_loop_exit_block = new_preheader;
> +  if (multiple_exits_p)
> +    {
> +      edge loop_entry = single_succ_edge (new_preheader);
> +      new_preheader = split_edge (loop_entry);
> +    }
> +
> +  auto_vec <gimple *> new_phis;
> +  hash_map <tree, tree> new_phi_args;
> +  /* First create the empty phi nodes so that when we flush the
> +     statements they can be filled in.   However because there is no order
> +     between the PHI nodes in the exits and the loop headers we need to
> +     order them base on the order of the two headers.  First record the new
> +     phi nodes. Then redirect the edges and flush the changes.  This writes out
> +     the new SSA names.  */
> +  for (auto gsi_from = gsi_start_phis (loop_exit->dest);
> +       !gsi_end_p (gsi_from); gsi_next (&gsi_from))
> +    {
> +      gimple *from_phi = gsi_stmt (gsi_from);
> +      tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
> +      gphi *res = create_phi_node (new_res, main_loop_exit_block);
> +      new_phis.safe_push (res);
> +    }
> +
> +  for (auto exit : loop_exits)
> +    {
> +      basic_block dest
> +	= exit == loop_exit ? main_loop_exit_block : new_preheader;
> +      redirect_edge_and_branch (exit, dest);
> +    }
> +
> +  /* Only fush the main exit, the remaining exits we need to match the order
> +     in the loop->header which with multiple exits may not be the same.  */
> +  flush_pending_stmts (loop_exit);
> +
> +  /* Record the new SSA names in the cache so that we can skip materializing
> +     them again when we fill in the rest of the LCSSA variables.  */
> +  for (auto phi : new_phis)
> +    {
> +      tree new_arg = gimple_phi_arg (phi, 0)->def;
> +
> +      if (!SSA_VAR_P (new_arg))
> +	continue;
> +
> +      /* If the PHI MEM node dominates the loop then we shouldn't create
> +	 a new LC-SSSA PHI for it in the intermediate block.   */
> +      /* A MEM phi that consitutes a new DEF for the vUSE chain can either
> +	 be a .VDEF or a PHI that operates on MEM. And said definition
> +	 must not be inside the main loop.  Or we must be a parameter.
> +	 In the last two cases we may remove a non-MEM PHI node, but since
> +	 they dominate both loops the removal is unlikely to cause trouble
> +	 as the exits must already be using them.  */
> +      if (virtual_operand_p (new_arg)
> +	  && (SSA_NAME_IS_DEFAULT_DEF (new_arg)
> +	      || !flow_bb_inside_loop_p (loop,
> +				gimple_bb (SSA_NAME_DEF_STMT (new_arg)))))
> +	{
> +	  auto gsi = gsi_for_stmt (phi);
> +	  remove_phi_node (&gsi, true);
> +	  continue;
> +	}
> +
> +      /* If we decide to remove the PHI node we should also not
> +	 rematerialize it later on.  */
> +      new_phi_args.put (new_arg, gimple_phi_result (phi));
> +
> +      if (TREE_CODE (new_arg) != SSA_NAME)
> +	continue;
> +    }
> +
> +  /* Copy the current loop LC PHI nodes between the original loop exit
> +     block and the new loop header.  This allows us to later split the
> +     preheader block and still find the right LC nodes.  */
> +  edge loop_entry = single_succ_edge (new_preheader);
> +  if (flow_loops)
> +    for (auto gsi_from = gsi_start_phis (loop->header),
> +	 gsi_to = gsi_start_phis (new_loop->header);
> +	 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
> +	 gsi_next (&gsi_from), gsi_next (&gsi_to))
> +      {
> +	gimple *from_phi = gsi_stmt (gsi_from);
> +	gimple *to_phi = gsi_stmt (gsi_to);
> +	tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, loop_latch_edge (loop));
> +	tree *res = NULL;
> +
> +	/* Check if we've already created a new phi node during edge
> +	   redirection.  If we have, only propagate the value downwards.  */
> +	if ((res = new_phi_args.get (new_arg)))
> +	  new_arg = *res;
> +
> +	/* All other exits use the previous iters.  */
> +	if (multiple_exits_p)
> +	  {
> +	    tree alt_arg = gimple_phi_result (from_phi);
> +	    tree alt_res = copy_ssa_name (alt_arg);
> +	    gphi *alt_lcssa_phi = create_phi_node (alt_res, new_preheader);
> +	    edge main_e = single_succ_edge (main_loop_exit_block);
> +	    for (edge e : loop_exits)
> +	      if (e != loop_exit)
> +		{
> +		  add_phi_arg (alt_lcssa_phi, alt_arg, e, UNKNOWN_LOCATION);
> +		  SET_PHI_ARG_DEF (alt_lcssa_phi, main_e->dest_idx, new_arg);
> +		}
> +	    new_arg = alt_res; /* Push it down to the new_loop header.  */

I think it would be clearer to separate alternate exit from main exit
handling more completely - we don't have the new_phi_args map for
the alternate exits.

Thus first only redirect and fixup the main exit and then redirect
the alternate exits, immediately wiping the edge_var_map, and
manually create the only relevant PHIs.

In principle this early-break handling could be fully within the
if (flow_loops) condition (including populating the new_phi_args
map for the main exit).

The code itself looks fine to me.

Richard.

> +	  } else if (!res) {
> +	    /* For non-early break we need to keep the possibly live values in
> +	       the exit block.  For early break these are kept in the merge
> +	       block in the code above.  */
> +	    tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
> +	    gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
> +
> +	    /* Main loop exit should use the final iter value.  */
> +	    add_phi_arg (lcssa_phi, new_arg, loop_exit, UNKNOWN_LOCATION);
> +	    new_arg = new_res;
> +	  }
> +
> +	adjust_phi_and_debug_stmts (to_phi, loop_entry, new_arg);
> +    }
> +
> +  /* Now clear all the redirect maps.  */
> +  for (auto exit : loop_exits)
> +    redirect_edge_var_map_clear (exit);
> +}
> +
>  /* Given LOOP this function generates a new copy of it and puts it
>     on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
>     non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
> @@ -1403,13 +1550,16 @@ vect_set_loop_condition (class loop *loop, edge loop_e, loop_vec_info loop_vinfo
>     copies remains the same.
>  
>     If UPDATED_DOMS is not NULL it is update with the list of basic blocks whoms
> -   dominators were updated during the peeling.  */
> +   dominators were updated during the peeling.  When doing early break vectorization
> +   then LOOP_VINFO needs to be provided and is used to keep track of any newly created
> +   memory references that need to be updated should we decide to vectorize.  */
>  
>  class loop *
>  slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
>  					class loop *scalar_loop,
>  					edge scalar_exit, edge e, edge *new_e,
> -					bool flow_loops)
> +					bool flow_loops,
> +					vec<basic_block> *updated_doms)
>  {
>    class loop *new_loop;
>    basic_block *new_bbs, *bbs, *pbbs;
> @@ -1526,7 +1676,9 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
>        }
>  
>    auto loop_exits = get_loop_exit_edges (loop);
> +  bool multiple_exits_p = loop_exits.length () > 1;
>    auto_vec<basic_block> doms;
> +  class loop *update_loop = NULL;
>  
>    if (at_exit) /* Add the loop copy at exit.  */
>      {
> @@ -1536,91 +1688,9 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
>  	  flush_pending_stmts (new_exit);
>  	}
>  
> -      auto_vec <gimple *> new_phis;
> -      hash_map <tree, tree> new_phi_args;
> -      /* First create the empty phi nodes so that when we flush the
> -	 statements they can be filled in.   However because there is no order
> -	 between the PHI nodes in the exits and the loop headers we need to
> -	 order them base on the order of the two headers.  First record the new
> -	 phi nodes.  */
> -      for (auto gsi_from = gsi_start_phis (scalar_exit->dest);
> -	   !gsi_end_p (gsi_from); gsi_next (&gsi_from))
> -	{
> -	  gimple *from_phi = gsi_stmt (gsi_from);
> -	  tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
> -	  gphi *res = create_phi_node (new_res, new_preheader);
> -	  new_phis.safe_push (res);
> -	}
> -
> -      /* Then redirect the edges and flush the changes.  This writes out the new
> -	 SSA names.  */
> -      for (edge exit : loop_exits)
> -	{
> -	  edge temp_e = redirect_edge_and_branch (exit, new_preheader);
> -	  flush_pending_stmts (temp_e);
> -	}
> -      /* Record the new SSA names in the cache so that we can skip materializing
> -	 them again when we fill in the rest of the LCSSA variables.  */
> -      for (auto phi : new_phis)
> -	{
> -	  tree new_arg = gimple_phi_arg (phi, 0)->def;
> -
> -	  if (!SSA_VAR_P (new_arg))
> -	    continue;
> -	  /* If the PHI MEM node dominates the loop then we shouldn't create
> -	      a new LC-SSSA PHI for it in the intermediate block.   */
> -	  /* A MEM phi that consitutes a new DEF for the vUSE chain can either
> -	     be a .VDEF or a PHI that operates on MEM. And said definition
> -	     must not be inside the main loop.  Or we must be a parameter.
> -	     In the last two cases we may remove a non-MEM PHI node, but since
> -	     they dominate both loops the removal is unlikely to cause trouble
> -	     as the exits must already be using them.  */
> -	  if (virtual_operand_p (new_arg)
> -	      && (SSA_NAME_IS_DEFAULT_DEF (new_arg)
> -		  || !flow_bb_inside_loop_p (loop,
> -				gimple_bb (SSA_NAME_DEF_STMT (new_arg)))))
> -	    {
> -	      auto gsi = gsi_for_stmt (phi);
> -	      remove_phi_node (&gsi, true);
> -	      continue;
> -	    }
> -	  new_phi_args.put (new_arg, gimple_phi_result (phi));
> -
> -	  if (TREE_CODE (new_arg) != SSA_NAME)
> -	    continue;
> -	}
> -
> -      /* Copy the current loop LC PHI nodes between the original loop exit
> -	 block and the new loop header.  This allows us to later split the
> -	 preheader block and still find the right LC nodes.  */
> -      edge loop_entry = single_succ_edge (new_preheader);
> -      if (flow_loops)
> -	for (auto gsi_from = gsi_start_phis (loop->header),
> -	     gsi_to = gsi_start_phis (new_loop->header);
> -	     !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
> -	     gsi_next (&gsi_from), gsi_next (&gsi_to))
> -	  {
> -	    gimple *from_phi = gsi_stmt (gsi_from);
> -	    gimple *to_phi = gsi_stmt (gsi_to);
> -	    tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
> -						  loop_latch_edge (loop));
> -
> -	    /* Check if we've already created a new phi node during edge
> -	       redirection.  If we have, only propagate the value downwards.  */
> -	    if (tree *res = new_phi_args.get (new_arg))
> -	      {
> -		adjust_phi_and_debug_stmts (to_phi, loop_entry, *res);
> -		continue;
> -	      }
> -
> -	    tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
> -	    gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
> -
> -	    /* Main loop exit should use the final iter value.  */
> -	    add_phi_arg (lcssa_phi, new_arg, loop_exit, UNKNOWN_LOCATION);
> -
> -	    adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
> -	  }
> +      slpeel_tree_duplicate_loop_for_vectorization (loop, loop_exit, loop_exits,
> +						    new_loop, flow_loops,
> +						    new_preheader);
>  
>        set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
>  
> @@ -1634,6 +1704,21 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
>        delete_basic_block (preheader);
>        set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
>  			       loop_preheader_edge (scalar_loop)->src);
> +
> +      /* Finally after wiring the new epilogue we need to update its main exit
> +	 to the original function exit we recorded.  Other exits are already
> +	 correct.  */
> +      if (multiple_exits_p)
> +	{
> +	  update_loop = new_loop;
> +	  for (edge e : get_loop_exit_edges (loop))
> +	    doms.safe_push (e->dest);
> +	  doms.safe_push (exit_dest);
> +
> +	  /* Likely a fall-through edge, so update if needed.  */
> +	  if (single_succ_p (exit_dest))
> +	    doms.safe_push (single_succ (exit_dest));
> +	}
>      }
>    else /* Add the copy at entry.  */
>      {
> @@ -1681,6 +1766,34 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
>        delete_basic_block (new_preheader);
>        set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
>  			       loop_preheader_edge (new_loop)->src);
> +
> +      if (multiple_exits_p)
> +	update_loop = loop;
> +    }
> +
> +  if (multiple_exits_p)
> +    {
> +      for (edge e : get_loop_exit_edges (update_loop))
> +	{
> +	  edge ex;
> +	  edge_iterator ei;
> +	  FOR_EACH_EDGE (ex, ei, e->dest->succs)
> +	    {
> +	      /* Find the first non-fallthrough block as fall-throughs can't
> +		 dominate other blocks.  */
> +	      if (single_succ_p (ex->dest))
> +		{
> +		  doms.safe_push (ex->dest);
> +		  ex = single_succ_edge (ex->dest);
> +		}
> +	      doms.safe_push (ex->dest);
> +	    }
> +	  doms.safe_push (e->dest);
> +	}
> +
> +      iterate_fix_dominators (CDI_DOMINATORS, doms, false);
> +      if (updated_doms)
> +	updated_doms->safe_splice (doms);
>      }
>  
>    free (new_bbs);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 76451a7fefe6ff966cecfa2cbc7b11336b038565..b9a71a0b5f5407417e8366b0df132df20c7f60aa 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -1821,7 +1821,7 @@ is_loop_header_bb_p (basic_block bb)
>  {
>    if (bb == (bb->loop_father)->header)
>      return true;
> -  gcc_checking_assert (EDGE_COUNT (bb->preds) == 1);
> +
>    return false;
>  }
>  
> @@ -2212,7 +2212,8 @@ extern bool slpeel_can_duplicate_loop_p (const class loop *, const_edge,
>  					 const_edge);
>  class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *, edge,
>  						    class loop *, edge,
> -						    edge, edge *, bool = true);
> +						    edge, edge *, bool = true,
> +						    vec<basic_block> * = NULL);
>  class loop *vect_loop_versioning (loop_vec_info, gimple *);
>  extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
>  				    tree *, tree *, tree *, int, bool, bool,
> @@ -2223,6 +2224,7 @@ extern dump_user_location_t find_loop_location (class loop *);
>  extern bool vect_can_advance_ivs_p (loop_vec_info);
>  extern void vect_update_inits_of_drs (loop_vec_info, tree, tree_code);
>  extern edge vec_init_loop_exit_info (class loop *);
> +extern bool vect_is_loop_exit_latch_pred (edge, class loop *);
>  
>  /* In tree-vect-stmts.cc.  */
>  extern tree get_related_vectype_for_scalar_type (machine_mode, tree,
>
Tamar Christina Nov. 20, 2023, 9:51 p.m. UTC | #3
Hi All,

Here's the respun patch:

This splits the part of the function that does peeling for loops at exits to
a different function.  In this new function we also peel for early breaks.

Peeling for early breaks works by redirecting all early break exits to a
single "early break" block and combine them and the normal exit edge together
later in a different block which then goes into the epilog preheader.

This allows us to re-use all the existing code for IV updates.  Additionally this
also enables correct linking for multiple vector epilogues.

flush_pending_stmts cannot be used in this scenario since it updates the PHI
nodes in the order that they are in the exit destination blocks.  This means
they are in CFG visit order.  With a single exit this doesn't matter but with
multiple exits with different live values through the different exits the order
usually does not line up.

Additionally the vectorizer helper functions expect to be able to iterate over
the nodes in the order that they occur in the loop header blocks.  This is an
invariant we must maintain.  To do this we just inline the work of
flush_pending_stmts but maintain the order by using the header blocks to guide
the work.

The way peeling is done results in LIM noticing that in some cases the condition
and the results are loop invariant and tries to move them out of the loop.

While the resulting code is operationally sound, moving the compare out of the
gcond results in generating code that no longer branches, so cbranch is no
longer applicable.  As such I now add code to check during this motion to see
if the target supports flag setting vector comparison as general operation.

One outstanding issue is probability scaling, testcases like
vect-epilogues-2.c.176t.vect fails because during peeling I make an intermediate
edge that is used to keep IV updates simple.  This edge seems to have the wrong
count:

  if (ivtmp_71 < bnd.8_54)
    goto <bb 8>; [89.00%]
  else
    goto <bb 24>; [11.00%]
;;    succ:       8 [89.0% (guessed)]  count:765459809 (estimated locally, freq 6.4808) (TRUE_VALUE,EXECUTABLE)
;;                24 [11.0% (guessed)]  count:94607391 (estimated locally, freq 0.8010) (FALSE_VALUE,EXECUTABLE)

;;   basic block 24, loop depth 0, count 105119324 (estimated locally, freq 0.8900), maybe hot
;;   Invalid sum of incoming counts 94607391 (estimated locally, freq 0.8010), should be 105119324 (estimated locally, freq 0.8900)
;;    prev block 8, next block 30, flags: (NEW, VISITED)
;;    pred:       3 [11.0% (guessed)]  count:94607391 (estimated locally, freq 0.8010) (FALSE_VALUE,EXECUTABLE)
  # res_46 = PHI <res_13(3)>

If I'm reading this error correctly, the edge count should be 94607391 but it
got 105119324.  My guess is that something scalled up the BB count, i.e.
94607391 * 1.11, but... I don't know why or what.  Any thoughts?

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues aside from
profile counts above.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-ssa-loop-im.cc (compute_invariantness): Import insn-codes.h
	and optabs-tree.h and check for vector compare motion out of gcond.
	* tree-vect-loop-manip.cc
	(slpeel_tree_duplicate_loop_for_vectorization): New.
	(slpeel_tree_duplicate_loop_to_edge_cfg): use it.
	* tree-vectorizer.h (is_loop_header_bb_p): Drop assert.
	(slpeel_tree_duplicate_loop_to_edge_cfg): Update signature.

--- inline copy of patch ---

diff --git a/gcc/tree-ssa-loop-im.cc b/gcc/tree-ssa-loop-im.cc
index 396963b6754c7671e2e5404302a69129918555e2..92a9318a1ca0a2da50ff2f29cf271d2e78fddd77 100644
--- a/gcc/tree-ssa-loop-im.cc
+++ b/gcc/tree-ssa-loop-im.cc
@@ -48,6 +48,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-dfa.h"
 #include "tree-ssa.h"
 #include "dbgcnt.h"
+#include "insn-codes.h"
+#include "optabs-tree.h"
 
 /* TODO:  Support for predicated code motion.  I.e.
 
@@ -1138,6 +1140,24 @@ compute_invariantness (basic_block bb)
 	    continue;
 	  }
 
+	/* Check if one of the dependent statements is a vector compare that
+	   the target does not support; if so it's invalid to hoist it out of
+	   the gcond it belonged to.  */
+	for (auto dep_stmt : lim_data->depends)
+	  {
+	     if (is_gimple_assign (dep_stmt)
+		 && VECTOR_TYPE_P (TREE_TYPE (gimple_assign_lhs (dep_stmt))))
+		{
+		  tree type = TREE_TYPE (gimple_assign_lhs (dep_stmt));
+		  auto code = gimple_assign_rhs_code (dep_stmt);
+		  if (!target_supports_op_p (type, code, optab_vector))
+		    pos = MOVE_IMPOSSIBLE;
+		}
+	  }
+
+	if (pos == MOVE_IMPOSSIBLE)
+	  continue;
+
 	if (dump_file && (dump_flags & TDF_DETAILS))
 	  {
 	    print_gimple_stmt (dump_file, stmt, 2);
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index b9161274ce401a7307f3e61ad23aa036701190d7..9d17e6877e1e16b359fdc86a92bf33a9760f8f86 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -1403,13 +1403,16 @@ vect_set_loop_condition (class loop *loop, edge loop_e, loop_vec_info loop_vinfo
    copies remains the same.
 
    If UPDATED_DOMS is not NULL it is update with the list of basic blocks whoms
-   dominators were updated during the peeling.  */
+   dominators were updated during the peeling.  When doing early break vectorization
+   then LOOP_VINFO needs to be provided and is used to keep track of any newly created
+   memory references that need to be updated should we decide to vectorize.  */
 
 class loop *
 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 					class loop *scalar_loop,
 					edge scalar_exit, edge e, edge *new_e,
-					bool flow_loops)
+					bool flow_loops,
+					vec<basic_block> *updated_doms)
 {
   class loop *new_loop;
   basic_block *new_bbs, *bbs, *pbbs;
@@ -1526,7 +1529,9 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
       }
 
   auto loop_exits = get_loop_exit_edges (loop);
+  bool multiple_exits_p = loop_exits.length () > 1;
   auto_vec<basic_block> doms;
+  class loop *update_loop = NULL;
 
   if (at_exit) /* Add the loop copy at exit.  */
     {
@@ -1536,39 +1541,61 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 	  flush_pending_stmts (new_exit);
 	}
 
+      bool multiple_exits_p = loop_exits.length () > 1;
+      basic_block main_loop_exit_block = new_preheader;
+      basic_block alt_loop_exit_block = NULL;
+      /* Create intermediate edge for main exit.  */
+      edge loop_e = single_succ_edge (new_preheader);
+      new_preheader = split_edge (loop_e);
+
       auto_vec <gimple *> new_phis;
       hash_map <tree, tree> new_phi_args;
       /* First create the empty phi nodes so that when we flush the
 	 statements they can be filled in.   However because there is no order
 	 between the PHI nodes in the exits and the loop headers we need to
 	 order them base on the order of the two headers.  First record the new
-	 phi nodes.  */
-      for (auto gsi_from = gsi_start_phis (scalar_exit->dest);
+	 phi nodes. Then redirect the edges and flush the changes.  This writes
+	 out the new SSA names.  */
+      for (auto gsi_from = gsi_start_phis (loop_exit->dest);
 	   !gsi_end_p (gsi_from); gsi_next (&gsi_from))
 	{
 	  gimple *from_phi = gsi_stmt (gsi_from);
 	  tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
-	  gphi *res = create_phi_node (new_res, new_preheader);
+	  gphi *res = create_phi_node (new_res, main_loop_exit_block);
 	  new_phis.safe_push (res);
 	}
 
-      /* Then redirect the edges and flush the changes.  This writes out the new
-	 SSA names.  */
-      for (edge exit : loop_exits)
+      for (auto exit : loop_exits)
 	{
-	  edge temp_e = redirect_edge_and_branch (exit, new_preheader);
-	  flush_pending_stmts (temp_e);
+	  basic_block dest = main_loop_exit_block;
+	  if (exit != loop_exit)
+	    {
+	      if (!alt_loop_exit_block)
+		{
+		  alt_loop_exit_block = split_edge (exit);
+		  edge res = redirect_edge_and_branch (
+				single_succ_edge (alt_loop_exit_block),
+				new_preheader);
+		  flush_pending_stmts (res);
+		  continue;
+		}
+	      dest = alt_loop_exit_block;
+	    }
+	  edge e = redirect_edge_and_branch (exit, dest);
+	  flush_pending_stmts (e);
 	}
+
       /* Record the new SSA names in the cache so that we can skip materializing
 	 them again when we fill in the rest of the LCSSA variables.  */
       for (auto phi : new_phis)
 	{
-	  tree new_arg = gimple_phi_arg (phi, 0)->def;
+	  tree new_arg = gimple_phi_arg (phi, loop_exit->dest_idx)->def;
 
 	  if (!SSA_VAR_P (new_arg))
 	    continue;
+
 	  /* If the PHI MEM node dominates the loop then we shouldn't create
-	      a new LC-SSSA PHI for it in the intermediate block.   */
+	     a new LC-SSA PHI for it in the intermediate block.   */
 	  /* A MEM phi that consitutes a new DEF for the vUSE chain can either
 	     be a .VDEF or a PHI that operates on MEM. And said definition
 	     must not be inside the main loop.  Or we must be a parameter.
@@ -1584,6 +1611,9 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 	      remove_phi_node (&gsi, true);
 	      continue;
 	    }
+
+	  /* If we decide to remove the PHI node we should also not
+	     rematerialize it later on.  */
 	  new_phi_args.put (new_arg, gimple_phi_result (phi));
 
 	  if (TREE_CODE (new_arg) != SSA_NAME)
@@ -1595,34 +1625,58 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 	 preheader block and still find the right LC nodes.  */
       edge loop_entry = single_succ_edge (new_preheader);
       if (flow_loops)
-	for (auto gsi_from = gsi_start_phis (loop->header),
-	     gsi_to = gsi_start_phis (new_loop->header);
-	     !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
-	     gsi_next (&gsi_from), gsi_next (&gsi_to))
-	  {
-	    gimple *from_phi = gsi_stmt (gsi_from);
-	    gimple *to_phi = gsi_stmt (gsi_to);
-	    tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
-						  loop_latch_edge (loop));
+	{
+	  /* Link through the main exit first.  */
+	  for (auto gsi_from = gsi_start_phis (loop->header),
+	       gsi_to = gsi_start_phis (new_loop->header);
+	       !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
+	       gsi_next (&gsi_from), gsi_next (&gsi_to))
+	    {
+	      gimple *from_phi = gsi_stmt (gsi_from);
+	      gimple *to_phi = gsi_stmt (gsi_to);
+	      tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
+						    loop_latch_edge (loop));
 
-	    /* Check if we've already created a new phi node during edge
-	       redirection.  If we have, only propagate the value downwards.  */
-	    if (tree *res = new_phi_args.get (new_arg))
-	      {
-		adjust_phi_and_debug_stmts (to_phi, loop_entry, *res);
-		continue;
-	      }
+	      /* Check if we've already created a new phi node during edge
+		 redirection.  If we have, only propagate the value
+		 downwards.  */
+	      if (tree *res = new_phi_args.get (new_arg))
+		new_arg = *res;
 
-	    tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
-	    gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
+	      tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
+	      gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
 
-	    /* Main loop exit should use the final iter value.  */
-	    add_phi_arg (lcssa_phi, new_arg, loop_exit, UNKNOWN_LOCATION);
+	      /* Main loop exit should use the final iter value.  */
+	      SET_PHI_ARG_DEF (lcssa_phi, loop_exit->dest_idx, new_arg);
 
-	    adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
-	  }
+	      adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
+	    }
 
-      set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
+	  set_immediate_dominator (CDI_DOMINATORS, main_loop_exit_block, loop_exit->src);
+	  set_immediate_dominator (CDI_DOMINATORS, new_preheader, main_loop_exit_block);
+
+	  /* Now link the alternative exits.  */
+	  if (multiple_exits_p)
+	    {
+	      for (auto gsi_from = gsi_start_phis (loop->header),
+		   gsi_to = gsi_start_phis (new_preheader);
+		   !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
+		   gsi_next (&gsi_from), gsi_next (&gsi_to))
+		{
+		  gimple *from_phi = gsi_stmt (gsi_from);
+		  gimple *to_phi = gsi_stmt (gsi_to);
+
+		  tree alt_arg = gimple_phi_result (from_phi);
+		  edge main_e = single_succ_edge (alt_loop_exit_block);
+		  for (edge e : loop_exits)
+		    if (e != loop_exit)
+		      SET_PHI_ARG_DEF (to_phi, main_e->dest_idx, alt_arg);
+		}
+
+	      set_immediate_dominator (CDI_DOMINATORS, new_preheader,
+				       loop->header);
+	    }
+	}
 
       if (was_imm_dom || duplicate_outer_loop)
 	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
@@ -1634,6 +1688,21 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
       delete_basic_block (preheader);
       set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
 			       loop_preheader_edge (scalar_loop)->src);
+
+      /* Finally after wiring the new epilogue we need to update its main exit
+	 to the original function exit we recorded.  Other exits are already
+	 correct.  */
+      if (multiple_exits_p)
+	{
+	  update_loop = new_loop;
+	  for (edge e : get_loop_exit_edges (loop))
+	    doms.safe_push (e->dest);
+	  doms.safe_push (exit_dest);
+
+	  /* Likely a fall-through edge, so update if needed.  */
+	  if (single_succ_p (exit_dest))
+	    doms.safe_push (single_succ (exit_dest));
+	}
     }
   else /* Add the copy at entry.  */
     {
@@ -1681,6 +1750,34 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
       delete_basic_block (new_preheader);
       set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
 			       loop_preheader_edge (new_loop)->src);
+
+      if (multiple_exits_p)
+	update_loop = loop;
+    }
+
+  if (multiple_exits_p)
+    {
+      for (edge e : get_loop_exit_edges (update_loop))
+	{
+	  edge ex;
+	  edge_iterator ei;
+	  FOR_EACH_EDGE (ex, ei, e->dest->succs)
+	    {
+	      /* Find the first non-fallthrough block as fall-throughs can't
+		 dominate other blocks.  */
+	      if (single_succ_p (ex->dest))
+		{
+		  doms.safe_push (ex->dest);
+		  ex = single_succ_edge (ex->dest);
+		}
+	      doms.safe_push (ex->dest);
+	    }
+	  doms.safe_push (e->dest);
+	}
+
+      iterate_fix_dominators (CDI_DOMINATORS, doms, false);
+      if (updated_doms)
+	updated_doms->safe_splice (doms);
     }
 
   free (new_bbs);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index b5e27d1c46d9cb3dfe5b44f1b49c9e4204572ff1..39aa4d1250efe308acccf484d370f8adfd1ba843 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1821,7 +1821,7 @@ is_loop_header_bb_p (basic_block bb)
 {
   if (bb == (bb->loop_father)->header)
     return true;
-  gcc_checking_assert (EDGE_COUNT (bb->preds) == 1);
+
   return false;
 }
 
@@ -2212,7 +2212,8 @@ extern bool slpeel_can_duplicate_loop_p (const class loop *, const_edge,
 					 const_edge);
 class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *, edge,
 						    class loop *, edge,
-						    edge, edge *, bool = true);
+						    edge, edge *, bool = true,
+						    vec<basic_block> * = NULL);
 class loop *vect_loop_versioning (loop_vec_info, gimple *);
 extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
 				    tree *, tree *, tree *, int, bool, bool,
Tamar Christina Nov. 24, 2023, 10:16 a.m. UTC | #4
Hi All,

Here's an updated patch, which takes a slightly different approach but makes things much easier later on.

Peeling for early breaks works by redirecting all early break exits to a
single "early break" block and combine them and the normal exit edge together
later in a different block which then goes into the epilog preheader.

This allows us to re-use all the existing code for IV updates.  Additionally, this
also enables correct linking for multiple vector epilogues.

flush_pending_stmts cannot be used in this scenario since it updates the PHI
nodes in the order that they are in the exit destination blocks.  This means
they are in CFG visit order.  With a single exit this doesn't matter but with
multiple exits with different live values through the different exits the order
usually does not line up.

Additionally the vectorizer helper functions expect to be able to iterate over
the nodes in the order that they occur in the loop header blocks.  This is an
invariant we must maintain.  To do this we just inline the work
flush_pending_stmts but maintain the order by using the header blocks to guide
the work.

The way peeling is done result in LIM noticing that in some cases the condition
and the results are loop invariant and tries to move them out of the loop.

While the resulting code is operationally sound, moving the compare out of the
gcond results in generating code that no longer branches, so cbranch is no
longer applicable.  As such I now add code to check during this motion to see
if the target supports flag setting vector comparison as general operation.

Because of the change in peeling I now also have to update the BB counts for
the loop exit intermediate block.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-ssa-loop-im.cc (compute_invariantness): Import insn-codes.h
	and optabs-tree.h and check for vector compare motion out of gcond.
	* tree-vect-loop-manip.cc
	(slpeel_tree_duplicate_loop_to_edge_cfg): Peel using intermediate blocks.
	(vect_update_ivs_after_vectorizer): Drop assert.
	(vect_do_peeling): Correct BB count for new intermediate block.
	* tree-vectorizer.h (is_loop_header_bb_p): Drop assert.
	(slpeel_tree_duplicate_loop_to_edge_cfg): Update signature.

--- inline copy of patch ---

diff --git a/gcc/tree-ssa-loop-im.cc b/gcc/tree-ssa-loop-im.cc
index 396963b6754c7671e2e5404302a69129918555e2..92a9318a1ca0a2da50ff2f29cf271d2e78fddd77 100644
--- a/gcc/tree-ssa-loop-im.cc
+++ b/gcc/tree-ssa-loop-im.cc
@@ -48,6 +48,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-dfa.h"
 #include "tree-ssa.h"
 #include "dbgcnt.h"
+#include "insn-codes.h"
+#include "optabs-tree.h"
 
 /* TODO:  Support for predicated code motion.  I.e.
 
@@ -1138,6 +1140,24 @@ compute_invariantness (basic_block bb)
 	    continue;
 	  }
 
+	/* Check if one of the dependent statements is a vector compare and
+	   whether the target supports it; otherwise it's invalid to hoist it
+	   out of the gcond it belonged to.  */
+	for (auto dep_stmt : lim_data->depends)
+	  {
+	     if (is_gimple_assign (dep_stmt)
+		 && VECTOR_TYPE_P (TREE_TYPE (gimple_assign_lhs (dep_stmt))))
+		{
+		  tree type = TREE_TYPE (gimple_assign_lhs (dep_stmt));
+		  auto code = gimple_assign_rhs_code (dep_stmt);
+		  if (!target_supports_op_p (type, code, optab_vector))
+		    pos = MOVE_IMPOSSIBLE;
+		}
+	  }
+
+	if (pos == MOVE_IMPOSSIBLE)
+	  continue;
+
 	if (dump_file && (dump_flags & TDF_DETAILS))
 	  {
 	    print_gimple_stmt (dump_file, stmt, 2);
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index b9161274ce401a7307f3e61ad23aa036701190d7..0b042b2baf976572af962dd40d5dc311a419ee60 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -1403,13 +1403,16 @@ vect_set_loop_condition (class loop *loop, edge loop_e, loop_vec_info loop_vinfo
    copies remains the same.
 
    If UPDATED_DOMS is not NULL it is update with the list of basic blocks whoms
-   dominators were updated during the peeling.  */
+   dominators were updated during the peeling.  When doing early break vectorization
+   then LOOP_VINFO needs to be provided and is used to keep track of any newly created
+   memory references that need to be updated should we decide to vectorize.  */
 
 class loop *
 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 					class loop *scalar_loop,
 					edge scalar_exit, edge e, edge *new_e,
-					bool flow_loops)
+					bool flow_loops,
+					vec<basic_block> *updated_doms)
 {
   class loop *new_loop;
   basic_block *new_bbs, *bbs, *pbbs;
@@ -1526,7 +1529,9 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
       }
 
   auto loop_exits = get_loop_exit_edges (loop);
+  bool multiple_exits_p = loop_exits.length () > 1;
   auto_vec<basic_block> doms;
+  class loop *update_loop = NULL;
 
   if (at_exit) /* Add the loop copy at exit.  */
     {
@@ -1536,39 +1541,65 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 	  flush_pending_stmts (new_exit);
 	}
 
+      bool multiple_exits_p = loop_exits.length () > 1;
+      basic_block main_loop_exit_block = new_preheader;
+      basic_block alt_loop_exit_block = NULL;
+      /* Create intermediate edge for main exit.  But only useful for early
+	 exits.  */
+      if (multiple_exits_p)
+	{
+	  edge loop_e = single_succ_edge (new_preheader);
+	  new_preheader = split_edge (loop_e);
+	}
+
       auto_vec <gimple *> new_phis;
       hash_map <tree, tree> new_phi_args;
       /* First create the empty phi nodes so that when we flush the
 	 statements they can be filled in.   However because there is no order
 	 between the PHI nodes in the exits and the loop headers we need to
 	 order them base on the order of the two headers.  First record the new
-	 phi nodes.  */
-      for (auto gsi_from = gsi_start_phis (scalar_exit->dest);
+	 phi nodes. Then redirect the edges and flush the changes.  This writes
+	 out the new SSA names.  */
+      for (auto gsi_from = gsi_start_phis (loop_exit->dest);
 	   !gsi_end_p (gsi_from); gsi_next (&gsi_from))
 	{
 	  gimple *from_phi = gsi_stmt (gsi_from);
 	  tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
-	  gphi *res = create_phi_node (new_res, new_preheader);
+	  gphi *res = create_phi_node (new_res, main_loop_exit_block);
 	  new_phis.safe_push (res);
 	}
 
-      /* Then redirect the edges and flush the changes.  This writes out the new
-	 SSA names.  */
-      for (edge exit : loop_exits)
+      for (auto exit : loop_exits)
 	{
-	  edge temp_e = redirect_edge_and_branch (exit, new_preheader);
-	  flush_pending_stmts (temp_e);
+	  basic_block dest = main_loop_exit_block;
+	  if (exit != loop_exit)
+	    {
+	      if (!alt_loop_exit_block)
+		{
+		  alt_loop_exit_block = split_edge (exit);
+		  edge res = redirect_edge_and_branch (
+				single_succ_edge (alt_loop_exit_block),
+				new_preheader);
+		  flush_pending_stmts (res);
+		  continue;
+		}
+	      dest = alt_loop_exit_block;
+	    }
+	  edge e = redirect_edge_and_branch (exit, dest);
+	  flush_pending_stmts (e);
 	}
+
       /* Record the new SSA names in the cache so that we can skip materializing
 	 them again when we fill in the rest of the LCSSA variables.  */
       for (auto phi : new_phis)
 	{
-	  tree new_arg = gimple_phi_arg (phi, 0)->def;
+	  tree new_arg = gimple_phi_arg (phi, loop_exit->dest_idx)->def;
 
 	  if (!SSA_VAR_P (new_arg))
 	    continue;
+
 	  /* If the PHI MEM node dominates the loop then we shouldn't create
-	      a new LC-SSSA PHI for it in the intermediate block.   */
+	     a new LC-SSA PHI for it in the intermediate block.   */
 	  /* A MEM phi that consitutes a new DEF for the vUSE chain can either
 	     be a .VDEF or a PHI that operates on MEM. And said definition
 	     must not be inside the main loop.  Or we must be a parameter.
@@ -1584,6 +1615,9 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 	      remove_phi_node (&gsi, true);
 	      continue;
 	    }
+
+	  /* If we decide to remove the PHI node we should also not
+	     rematerialize it later on.  */
 	  new_phi_args.put (new_arg, gimple_phi_result (phi));
 
 	  if (TREE_CODE (new_arg) != SSA_NAME)
@@ -1595,34 +1629,68 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 	 preheader block and still find the right LC nodes.  */
       edge loop_entry = single_succ_edge (new_preheader);
       if (flow_loops)
-	for (auto gsi_from = gsi_start_phis (loop->header),
-	     gsi_to = gsi_start_phis (new_loop->header);
-	     !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
-	     gsi_next (&gsi_from), gsi_next (&gsi_to))
-	  {
-	    gimple *from_phi = gsi_stmt (gsi_from);
-	    gimple *to_phi = gsi_stmt (gsi_to);
-	    tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
-						  loop_latch_edge (loop));
+	{
+	  /* Link through the main exit first.  */
+	  for (auto gsi_from = gsi_start_phis (loop->header),
+	       gsi_to = gsi_start_phis (new_loop->header);
+	       !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
+	       gsi_next (&gsi_from), gsi_next (&gsi_to))
+	    {
+	      gimple *from_phi = gsi_stmt (gsi_from);
+	      gimple *to_phi = gsi_stmt (gsi_to);
+	      tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
+						    loop_latch_edge (loop));
+
+	      /* Check if we've already created a new phi node during edge
+		 redirection.  If we have, only propagate the value
+		 downwards.  */
+	      if (tree *res = new_phi_args.get (new_arg))
+		{
+		  if (multiple_exits_p)
+		    new_arg = *res;
+		  else
+		    {
+		      adjust_phi_and_debug_stmts (to_phi, loop_entry, *res);
+		      continue;
+		    }
+		}
 
-	    /* Check if we've already created a new phi node during edge
-	       redirection.  If we have, only propagate the value downwards.  */
-	    if (tree *res = new_phi_args.get (new_arg))
-	      {
-		adjust_phi_and_debug_stmts (to_phi, loop_entry, *res);
-		continue;
-	      }
+	      tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
+	      gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
 
-	    tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
-	    gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
+	      /* Main loop exit should use the final iter value.  */
+	      SET_PHI_ARG_DEF (lcssa_phi, loop_exit->dest_idx, new_arg);
 
-	    /* Main loop exit should use the final iter value.  */
-	    add_phi_arg (lcssa_phi, new_arg, loop_exit, UNKNOWN_LOCATION);
+	      adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
+	    }
 
-	    adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
-	  }
+	  set_immediate_dominator (CDI_DOMINATORS, main_loop_exit_block,
+				   loop_exit->src);
+
+	  /* Now link the alternative exits.  */
+	  if (multiple_exits_p)
+	    {
+	      set_immediate_dominator (CDI_DOMINATORS, new_preheader,
+				       main_loop_exit_block);
+	      for (auto gsi_from = gsi_start_phis (loop->header),
+		   gsi_to = gsi_start_phis (new_preheader);
+		   !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
+		   gsi_next (&gsi_from), gsi_next (&gsi_to))
+		{
+		  gimple *from_phi = gsi_stmt (gsi_from);
+		  gimple *to_phi = gsi_stmt (gsi_to);
+
+		  tree alt_arg = gimple_phi_result (from_phi);
+		  edge main_e = single_succ_edge (alt_loop_exit_block);
+		  for (edge e : loop_exits)
+		    if (e != loop_exit)
+		      SET_PHI_ARG_DEF (to_phi, main_e->dest_idx, alt_arg);
+		}
 
-      set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
+	      set_immediate_dominator (CDI_DOMINATORS, new_preheader,
+				       loop->header);
+	    }
+	}
 
       if (was_imm_dom || duplicate_outer_loop)
 	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
@@ -1634,6 +1702,21 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
       delete_basic_block (preheader);
       set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
 			       loop_preheader_edge (scalar_loop)->src);
+
+      /* Finally after wiring the new epilogue we need to update its main exit
+	 to the original function exit we recorded.  Other exits are already
+	 correct.  */
+      if (multiple_exits_p)
+	{
+	  update_loop = new_loop;
+	  for (edge e : get_loop_exit_edges (loop))
+	    doms.safe_push (e->dest);
+	  doms.safe_push (exit_dest);
+
+	  /* Likely a fall-through edge, so update if needed.  */
+	  if (single_succ_p (exit_dest))
+	    doms.safe_push (single_succ (exit_dest));
+	}
     }
   else /* Add the copy at entry.  */
     {
@@ -1681,6 +1764,34 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
       delete_basic_block (new_preheader);
       set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
 			       loop_preheader_edge (new_loop)->src);
+
+      if (multiple_exits_p)
+	update_loop = loop;
+    }
+
+  if (multiple_exits_p)
+    {
+      for (edge e : get_loop_exit_edges (update_loop))
+	{
+	  edge ex;
+	  edge_iterator ei;
+	  FOR_EACH_EDGE (ex, ei, e->dest->succs)
+	    {
+	      /* Find the first non-fallthrough block as fall-throughs can't
+		 dominate other blocks.  */
+	      if (single_succ_p (ex->dest))
+		{
+		  doms.safe_push (ex->dest);
+		  ex = single_succ_edge (ex->dest);
+		}
+	      doms.safe_push (ex->dest);
+	    }
+	  doms.safe_push (e->dest);
+	}
+
+      iterate_fix_dominators (CDI_DOMINATORS, doms, false);
+      if (updated_doms)
+	updated_doms->safe_splice (doms);
     }
 
   free (new_bbs);
@@ -2050,7 +2161,6 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
 
   /* Make sure there exists a single-predecessor exit bb:  */
   gcc_assert (single_pred_p (exit_bb));
-  gcc_assert (single_succ_edge (exit_bb) == update_e);
 
   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
@@ -3138,6 +3248,11 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
       epilog->force_vectorize = false;
       bb_before_epilog = loop_preheader_edge (epilog)->src;
 
+      /* Fix up the probabilities of the new intermediate blocks that we use to connect
+	 to the merge block.  The rest are dealt with via bb_before_epilog
+	 adjustments. */
+	e->dest->count = e->count ();
+
       /* Scalar version loop may be preferred.  In this case, add guard
 	 and skip to epilog.  Note this only happens when the number of
 	 iterations of loop is unknown at compile time, otherwise this
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index b5e27d1c46d9cb3dfe5b44f1b49c9e4204572ff1..39aa4d1250efe308acccf484d370f8adfd1ba843 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1821,7 +1821,7 @@ is_loop_header_bb_p (basic_block bb)
 {
   if (bb == (bb->loop_father)->header)
     return true;
-  gcc_checking_assert (EDGE_COUNT (bb->preds) == 1);
+
   return false;
 }
 
@@ -2212,7 +2212,8 @@ extern bool slpeel_can_duplicate_loop_p (const class loop *, const_edge,
 					 const_edge);
 class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *, edge,
 						    class loop *, edge,
-						    edge, edge *, bool = true);
+						    edge, edge *, bool = true,
+						    vec<basic_block> * = NULL);
 class loop *vect_loop_versioning (loop_vec_info, gimple *);
 extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
 				    tree *, tree *, tree *, int, bool, bool,
Richard Biener Nov. 24, 2023, 12:38 p.m. UTC | #5
On Fri, 24 Nov 2023, Tamar Christina wrote:

> Hi All,
> 
> Here's an updated patch, which takes a slightly different approach but makes things much easier later on.
> 
> Peeling for early breaks works by redirecting all early break exits to a
> single "early break" block and combine them and the normal exit edge together
> later in a different block which then goes into the epilog preheader.
> 
> This allows us to re-use all the existing code for IV updates, Additionally this
> also enables correct linking for multiple vector epilogues.
> 
> flush_pending_stmts cannot be used in this scenario since it updates the PHI
> nodes in the order that they are in the exit destination blocks.  This means
> they are in CFG visit order.  With a single exit this doesn't matter but with
> multiple exits with different live values through the different exits the order
> usually does not line up.
> 
> Additionally the vectorizer helper functions expect to be able to iterate over
> the nodes in the order that they occur in the loop header blocks.  This is an
> invariant we must maintain.  To do this we just inline the work
> flush_pending_stmts but maintain the order by using the header blocks to guide
> the work.
> 
> The way peeling is done result in LIM noticing that in some cases the condition
> and the results are loop invariant and tries to move them out of the loop.
> 
> While the resulting code is operationally sound, moving the compare out of the
> gcond results in generating code that no longer branches, so cbranch is no
> longer applicable.  As such I now add code to check during this motion to see
> if the target supports flag setting vector comparison as general operation.
> 
> Because of the change in peeling I now also have to update the BB counts for
> the loop exit intermediate block.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* gcc/tree-ssa-loop-im.cc (compute_invariantness): Import insn-codes.h
> 	and optabs-tree.h and check for vector compare motion out of gcond.
> 	* tree-vect-loop-manip.cc
> 	(slpeel_tree_duplicate_loop_to_edge_cfg): Peel using intermediate blocks.
> 	(vect_update_ivs_after_vectorizer): Drop assert.
> 	(vect_do_peeling): Correct BB count for new intermediate block.
> 	* tree-vectorizer.h (is_loop_header_bb_p): Drop assert.
> 	(slpeel_tree_duplicate_loop_to_edge_cfg): Update signature.
> 
> --- inline copy of patch ---
> 
> diff --git a/gcc/tree-ssa-loop-im.cc b/gcc/tree-ssa-loop-im.cc
> index 396963b6754c7671e2e5404302a69129918555e2..92a9318a1ca0a2da50ff2f29cf271d2e78fddd77 100644
> --- a/gcc/tree-ssa-loop-im.cc
> +++ b/gcc/tree-ssa-loop-im.cc
> @@ -48,6 +48,8 @@ along with GCC; see the file COPYING3.  If not see
>  #include "tree-dfa.h"
>  #include "tree-ssa.h"
>  #include "dbgcnt.h"
> +#include "insn-codes.h"
> +#include "optabs-tree.h"
>  
>  /* TODO:  Support for predicated code motion.  I.e.
>  
> @@ -1138,6 +1140,24 @@ compute_invariantness (basic_block bb)
>  	    continue;
>  	  }
>  
> +	/* Check if one of the depedent statement is a vector compare whether
> +	   the target supports it,  otherwise it's invalid to hoist it out of
> +	   the gcond it belonged to.  */
> +	for (auto dep_stmt : lim_data->depends)
> +	  {
> +	     if (is_gimple_assign (dep_stmt)
> +		 && VECTOR_TYPE_P (TREE_TYPE (gimple_assign_lhs (dep_stmt))))
> +		{
> +		  tree type = TREE_TYPE (gimple_assign_lhs (dep_stmt));
> +		  auto code = gimple_assign_rhs_code (dep_stmt);
> +		  if (!target_supports_op_p (type, code, optab_vector))
> +		    pos = MOVE_IMPOSSIBLE;
> +		}
> +	  }

I think it's more natural to handle this in determine_max_movement
where we specifically look at the condition we are going to turn
into a COND_EXPR condition.

I think it's also independent of this series - the issue should
be latent, but possibly only triggerable with a GIMPLE testcase.

Can you split it out?

The rest of the patch is OK.

Thanks,
Richard.

> +
> +	if (pos == MOVE_IMPOSSIBLE)
> +	  continue;
> +
>  	if (dump_file && (dump_flags & TDF_DETAILS))
>  	  {
>  	    print_gimple_stmt (dump_file, stmt, 2);
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index b9161274ce401a7307f3e61ad23aa036701190d7..0b042b2baf976572af962dd40d5dc311a419ee60 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -1403,13 +1403,16 @@ vect_set_loop_condition (class loop *loop, edge loop_e, loop_vec_info loop_vinfo
>     copies remains the same.
>  
>     If UPDATED_DOMS is not NULL it is update with the list of basic blocks whoms
> -   dominators were updated during the peeling.  */
> +   dominators were updated during the peeling.  When doing early break vectorization
> +   then LOOP_VINFO needs to be provided and is used to keep track of any newly created
> +   memory references that need to be updated should we decide to vectorize.  */
>  
>  class loop *
>  slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
>  					class loop *scalar_loop,
>  					edge scalar_exit, edge e, edge *new_e,
> -					bool flow_loops)
> +					bool flow_loops,
> +					vec<basic_block> *updated_doms)
>  {
>    class loop *new_loop;
>    basic_block *new_bbs, *bbs, *pbbs;
> @@ -1526,7 +1529,9 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
>        }
>  
>    auto loop_exits = get_loop_exit_edges (loop);
> +  bool multiple_exits_p = loop_exits.length () > 1;
>    auto_vec<basic_block> doms;
> +  class loop *update_loop = NULL;
>  
>    if (at_exit) /* Add the loop copy at exit.  */
>      {
> @@ -1536,39 +1541,65 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
>  	  flush_pending_stmts (new_exit);
>  	}
>  
> +      bool multiple_exits_p = loop_exits.length () > 1;
> +      basic_block main_loop_exit_block = new_preheader;
> +      basic_block alt_loop_exit_block = NULL;
> +      /* Create intermediate edge for main exit.  But only useful for early
> +	 exits.  */
> +      if (multiple_exits_p)
> +	{
> +	  edge loop_e = single_succ_edge (new_preheader);
> +	  new_preheader = split_edge (loop_e);
> +	}
> +
>        auto_vec <gimple *> new_phis;
>        hash_map <tree, tree> new_phi_args;
>        /* First create the empty phi nodes so that when we flush the
>  	 statements they can be filled in.   However because there is no order
>  	 between the PHI nodes in the exits and the loop headers we need to
>  	 order them base on the order of the two headers.  First record the new
> -	 phi nodes.  */
> -      for (auto gsi_from = gsi_start_phis (scalar_exit->dest);
> +	 phi nodes. Then redirect the edges and flush the changes.  This writes
> +	 out the new SSA names.  */
> +      for (auto gsi_from = gsi_start_phis (loop_exit->dest);
>  	   !gsi_end_p (gsi_from); gsi_next (&gsi_from))
>  	{
>  	  gimple *from_phi = gsi_stmt (gsi_from);
>  	  tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
> -	  gphi *res = create_phi_node (new_res, new_preheader);
> +	  gphi *res = create_phi_node (new_res, main_loop_exit_block);
>  	  new_phis.safe_push (res);
>  	}
>  
> -      /* Then redirect the edges and flush the changes.  This writes out the new
> -	 SSA names.  */
> -      for (edge exit : loop_exits)
> +      for (auto exit : loop_exits)
>  	{
> -	  edge temp_e = redirect_edge_and_branch (exit, new_preheader);
> -	  flush_pending_stmts (temp_e);
> +	  basic_block dest = main_loop_exit_block;
> +	  if (exit != loop_exit)
> +	    {
> +	      if (!alt_loop_exit_block)
> +		{
> +		  alt_loop_exit_block = split_edge (exit);
> +		  edge res = redirect_edge_and_branch (
> +				single_succ_edge (alt_loop_exit_block),
> +				new_preheader);
> +		  flush_pending_stmts (res);
> +		  continue;
> +		}
> +	      dest = alt_loop_exit_block;
> +	    }
> +	  edge e = redirect_edge_and_branch (exit, dest);
> +	  flush_pending_stmts (e);
>  	}
> +
>        /* Record the new SSA names in the cache so that we can skip materializing
>  	 them again when we fill in the rest of the LCSSA variables.  */
>        for (auto phi : new_phis)
>  	{
> -	  tree new_arg = gimple_phi_arg (phi, 0)->def;
> +	  tree new_arg = gimple_phi_arg (phi, loop_exit->dest_idx)->def;
>  
>  	  if (!SSA_VAR_P (new_arg))
>  	    continue;
> +
>  	  /* If the PHI MEM node dominates the loop then we shouldn't create
> -	      a new LC-SSSA PHI for it in the intermediate block.   */
> +	     a new LC-SSSA PHI for it in the intermediate block.   */
>  	  /* A MEM phi that consitutes a new DEF for the vUSE chain can either
>  	     be a .VDEF or a PHI that operates on MEM. And said definition
>  	     must not be inside the main loop.  Or we must be a parameter.
> @@ -1584,6 +1615,9 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
>  	      remove_phi_node (&gsi, true);
>  	      continue;
>  	    }
> +
> +	  /* If we decide to remove the PHI node we should also not
> +	     rematerialize it later on.  */
>  	  new_phi_args.put (new_arg, gimple_phi_result (phi));
>  
>  	  if (TREE_CODE (new_arg) != SSA_NAME)
> @@ -1595,34 +1629,68 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
>  	 preheader block and still find the right LC nodes.  */
>        edge loop_entry = single_succ_edge (new_preheader);
>        if (flow_loops)
> -	for (auto gsi_from = gsi_start_phis (loop->header),
> -	     gsi_to = gsi_start_phis (new_loop->header);
> -	     !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
> -	     gsi_next (&gsi_from), gsi_next (&gsi_to))
> -	  {
> -	    gimple *from_phi = gsi_stmt (gsi_from);
> -	    gimple *to_phi = gsi_stmt (gsi_to);
> -	    tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
> -						  loop_latch_edge (loop));
> +	{
> +	  /* Link through the main exit first.  */
> +	  for (auto gsi_from = gsi_start_phis (loop->header),
> +	       gsi_to = gsi_start_phis (new_loop->header);
> +	       !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
> +	       gsi_next (&gsi_from), gsi_next (&gsi_to))
> +	    {
> +	      gimple *from_phi = gsi_stmt (gsi_from);
> +	      gimple *to_phi = gsi_stmt (gsi_to);
> +	      tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
> +						    loop_latch_edge (loop));
> +
> +	      /* Check if we've already created a new phi node during edge
> +		 redirection.  If we have, only propagate the value
> +		 downwards.  */
> +	      if (tree *res = new_phi_args.get (new_arg))
> +		{
> +		  if (multiple_exits_p)
> +		    new_arg = *res;
> +		  else
> +		    {
> +		      adjust_phi_and_debug_stmts (to_phi, loop_entry, *res);
> +		      continue;
> +		    }
> +		}
>  
> -	    /* Check if we've already created a new phi node during edge
> -	       redirection.  If we have, only propagate the value downwards.  */
> -	    if (tree *res = new_phi_args.get (new_arg))
> -	      {
> -		adjust_phi_and_debug_stmts (to_phi, loop_entry, *res);
> -		continue;
> -	      }
> +	      tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
> +	      gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
>  
> -	    tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
> -	    gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
> +	      /* Main loop exit should use the final iter value.  */
> +	      SET_PHI_ARG_DEF (lcssa_phi, loop_exit->dest_idx, new_arg);
>  
> -	    /* Main loop exit should use the final iter value.  */
> -	    add_phi_arg (lcssa_phi, new_arg, loop_exit, UNKNOWN_LOCATION);
> +	      adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
> +	    }
>  
> -	    adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
> -	  }
> +	  set_immediate_dominator (CDI_DOMINATORS, main_loop_exit_block,
> +				   loop_exit->src);
> +
> +	  /* Now link the alternative exits.  */
> +	  if (multiple_exits_p)
> +	    {
> +	      set_immediate_dominator (CDI_DOMINATORS, new_preheader,
> +				       main_loop_exit_block);
> +	      for (auto gsi_from = gsi_start_phis (loop->header),
> +		   gsi_to = gsi_start_phis (new_preheader);
> +		   !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
> +		   gsi_next (&gsi_from), gsi_next (&gsi_to))
> +		{
> +		  gimple *from_phi = gsi_stmt (gsi_from);
> +		  gimple *to_phi = gsi_stmt (gsi_to);
> +
> +		  tree alt_arg = gimple_phi_result (from_phi);
> +		  edge main_e = single_succ_edge (alt_loop_exit_block);
> +		  for (edge e : loop_exits)
> +		    if (e != loop_exit)
> +		      SET_PHI_ARG_DEF (to_phi, main_e->dest_idx, alt_arg);
> +		}
>  
> -      set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
> +	      set_immediate_dominator (CDI_DOMINATORS, new_preheader,
> +				       loop->header);
> +	    }
> +	}
>  
>        if (was_imm_dom || duplicate_outer_loop)
>  	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
> @@ -1634,6 +1702,21 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
>        delete_basic_block (preheader);
>        set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
>  			       loop_preheader_edge (scalar_loop)->src);
> +
> +      /* Finally after wiring the new epilogue we need to update its main exit
> +	 to the original function exit we recorded.  Other exits are already
> +	 correct.  */
> +      if (multiple_exits_p)
> +	{
> +	  update_loop = new_loop;
> +	  for (edge e : get_loop_exit_edges (loop))
> +	    doms.safe_push (e->dest);
> +	  doms.safe_push (exit_dest);
> +
> +	  /* Likely a fall-through edge, so update if needed.  */
> +	  if (single_succ_p (exit_dest))
> +	    doms.safe_push (single_succ (exit_dest));
> +	}
>      }
>    else /* Add the copy at entry.  */
>      {
> @@ -1681,6 +1764,34 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
>        delete_basic_block (new_preheader);
>        set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
>  			       loop_preheader_edge (new_loop)->src);
> +
> +      if (multiple_exits_p)
> +	update_loop = loop;
> +    }
> +
> +  if (multiple_exits_p)
> +    {
> +      for (edge e : get_loop_exit_edges (update_loop))
> +	{
> +	  edge ex;
> +	  edge_iterator ei;
> +	  FOR_EACH_EDGE (ex, ei, e->dest->succs)
> +	    {
> +	      /* Find the first non-fallthrough block as fall-throughs can't
> +		 dominate other blocks.  */
> +	      if (single_succ_p (ex->dest))
> +		{
> +		  doms.safe_push (ex->dest);
> +		  ex = single_succ_edge (ex->dest);
> +		}
> +	      doms.safe_push (ex->dest);
> +	    }
> +	  doms.safe_push (e->dest);
> +	}
> +
> +      iterate_fix_dominators (CDI_DOMINATORS, doms, false);
> +      if (updated_doms)
> +	updated_doms->safe_splice (doms);
>      }
>  
>    free (new_bbs);
> @@ -2050,7 +2161,6 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
>  
>    /* Make sure there exists a single-predecessor exit bb:  */
>    gcc_assert (single_pred_p (exit_bb));
> -  gcc_assert (single_succ_edge (exit_bb) == update_e);
>  
>    for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
>         !gsi_end_p (gsi) && !gsi_end_p (gsi1);
> @@ -3138,6 +3248,11 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
>        epilog->force_vectorize = false;
>        bb_before_epilog = loop_preheader_edge (epilog)->src;
>  
> +      /* Fixup the probabities of the new intermediate blocks that we use to connect
> +	 to the merge block.  The rest are dealt with via bb_before_epilog
> +	 adjustments. */
> +	e->dest->count = e->count ();
> +
>        /* Scalar version loop may be preferred.  In this case, add guard
>  	 and skip to epilog.  Note this only happens when the number of
>  	 iterations of loop is unknown at compile time, otherwise this
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index b5e27d1c46d9cb3dfe5b44f1b49c9e4204572ff1..39aa4d1250efe308acccf484d370f8adfd1ba843 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -1821,7 +1821,7 @@ is_loop_header_bb_p (basic_block bb)
>  {
>    if (bb == (bb->loop_father)->header)
>      return true;
> -  gcc_checking_assert (EDGE_COUNT (bb->preds) == 1);
> +
>    return false;
>  }
>  
> @@ -2212,7 +2212,8 @@ extern bool slpeel_can_duplicate_loop_p (const class loop *, const_edge,
>  					 const_edge);
>  class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *, edge,
>  						    class loop *, edge,
> -						    edge, edge *, bool = true);
> +						    edge, edge *, bool = true,
> +						    vec<basic_block> * = NULL);
>  class loop *vect_loop_versioning (loop_vec_info, gimple *);
>  extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
>  				    tree *, tree *, tree *, int, bool, bool,
>
diff mbox series

Patch

--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -1444,6 +1444,151 @@  slpeel_duplicate_current_defs_from_edges (edge from, edge to)
 		     get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
 }
 
+/* Determine if the exit chosen by the loop vectorizer differs from the
+   natural loop exit.  i.e. if the exit leads to the loop latch or not.
+   When this happens we need to flip the understanding of main and other
+   exits by peeling and IV updates.  */
+
+bool
+vect_is_loop_exit_latch_pred (edge loop_exit, class loop *loop)
+{
+  return single_pred (loop->latch) == loop_exit->src;
+}
+
+/* Perform peeling for when the peeled loop is placed after the original loop.
+   This maintains LCSSA and creates the appropriate blocks for multiple exit
+   vectorization.   */
+
+void static
+slpeel_tree_duplicate_loop_for_vectorization (class loop *loop, edge loop_exit,
+					      vec<edge> &loop_exits, edge e,
+					      class loop *new_loop,
+					      bool flow_loops,
+					      basic_block new_preheader)
+{
+  bool multiple_exits_p = loop_exits.length () > 1;
+  basic_block main_loop_exit_block = new_preheader;
+  if (multiple_exits_p)
+    {
+      edge loop_entry = single_succ_edge (new_preheader);
+      new_preheader = split_edge (loop_entry);
+    }
+
+  /* First create the empty phi nodes so that when we flush the
+     statements they can be filled in.  However, because there is no order
+     between the PHI nodes in the exits and the loop headers we need to
+     order them based on the order of the two headers.  First record the
+     new phi nodes.  Then redirect the edges and flush the changes.  This
+     writes out the new SSA names.  */
+  for (auto exit : loop_exits)
+    {
+      basic_block dest
+	= exit == loop_exit ? main_loop_exit_block : new_preheader;
+      redirect_edge_and_branch (exit, dest);
+    }
+
+  /* Copy the current loop LC PHI nodes between the original loop exit
+     block and the new loop header.  This allows us to later split the
+     preheader block and still find the right LC nodes.  */
+  edge loop_entry = single_succ_edge (new_preheader);
+  hash_set<tree> lcssa_vars;
+  if (flow_loops)
+    for (auto gsi_from = gsi_start_phis (loop->header),
+	 gsi_to = gsi_start_phis (new_loop->header);
+	 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
+	 gsi_next (&gsi_from), gsi_next (&gsi_to))
+      {
+	gimple *from_phi = gsi_stmt (gsi_from);
+	gimple *to_phi = gsi_stmt (gsi_to);
+	tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, loop_latch_edge (loop));
+
+	/* In all cases, even in early break situations we're only
+	   interested in the number of fully executed loop iters.  As such
+	   we discard any partially done iteration.  So we simply propagate
+	   the phi nodes from the latch to the merge block.  */
+	tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
+	gphi *lcssa_phi = create_phi_node (new_res, main_loop_exit_block);
+
+	/* Check if we haven't picked a different loop exit.  If we have we
+	   need to flip the LCSSA vars to prevent incorrect linking.  */
+	tree alt_arg = gimple_phi_result (from_phi);
+	if (!vect_is_loop_exit_latch_pred (loop_exit, loop))
+	  std::swap (new_arg, alt_arg);
+
+	lcssa_vars.add (new_arg);
+
+	/* Main loop exit should use the final iter value.  */
+	add_phi_arg (lcssa_phi, new_arg, loop_exit, UNKNOWN_LOCATION);
+
+	/* All other exits use the previous iters.  */
+	if (multiple_exits_p)
+	  {
+	    tree alt_res = copy_ssa_name (alt_arg);
+	    gphi *alt_lcssa_phi = create_phi_node (alt_res, new_preheader);
+	    edge main_e = single_succ_edge (main_loop_exit_block);
+	    for (edge e : loop_exits)
+	      if (e != loop_exit)
+		{
+		  add_phi_arg (alt_lcssa_phi, alt_arg, e, UNKNOWN_LOCATION);
+		  SET_PHI_ARG_DEF (alt_lcssa_phi, main_e->dest_idx, new_res);
+		}
+	    new_res = alt_res; /* Push it down to the new_loop header.  */
+	  }
+
+	adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
+    }
+
+  /* Copy over any live SSA vars that may not have been materialized in the
+     loops themselves but would be in the exit block.  However when the live
+     value is not used inside the loop then we don't need to do this; if we
+     do then when we split the guard block the branch edge can end up
+     containing the wrong reference, particularly if it shares an edge with
+     something that has bypassed the loop.  This is not something peeling can
+     check so we need to anticipate the usage of the live variable here.  */
+  auto exit_map = redirect_edge_var_map_vector (loop_exit);
+  if (exit_map)
+    for (auto vm : exit_map)
+      {
+	if (lcssa_vars.contains (vm.def)
+	    || TREE_CODE (vm.def) != SSA_NAME)
+	  continue;
+
+	imm_use_iterator imm_iter;
+	use_operand_p use_p;
+	bool use_in_loop = false;
+
+	FOR_EACH_IMM_USE_FAST (use_p, imm_iter, vm.def)
+	  {
+	    basic_block bb = gimple_bb (USE_STMT (use_p));
+	    if (flow_bb_inside_loop_p (loop, bb)
+		&& !gimple_vuse (USE_STMT (use_p)))
+	      {
+		use_in_loop = true;
+		break;
+	      }
+	  }
+
+	if (!use_in_loop && SSA_VAR_P (vm.def))
+	  {
+	    /* Do a final check to see if it's perhaps defined in the loop.
+	       This mirrors the relevancy analysis's used_outside_scope.  */
+	    if (virtual_operand_p (vm.def)
+		&& (SSA_NAME_IS_DEFAULT_DEF (vm.def)
+		    || !flow_bb_inside_loop_p (loop,
+				gimple_bb (SSA_NAME_DEF_STMT (vm.def)))))
+	      continue;
+	  }
+
+	tree new_res = copy_ssa_name (vm.result);
+	gphi *lcssa_phi = create_phi_node (new_res, e->dest);
+	add_phi_arg (lcssa_phi, vm.def, loop_exit, vm.locus);
+    }
+
+  /* Now clear all the redirect maps.  */
+  for (auto exit : loop_exits)
+    redirect_edge_var_map_clear (exit);
+}
+
 /* Given LOOP this function generates a new copy of it and puts it
    on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
    non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
@@ -1455,13 +1600,16 @@  slpeel_duplicate_current_defs_from_edges (edge from, edge to)
    copies remains the same.
 
    If UPDATED_DOMS is not NULL it is update with the list of basic blocks whoms
-   dominators were updated during the peeling.  */
+   dominators were updated during the peeling.  When doing early break
+   vectorization UPDATED_DOMS needs to be provided and is used to keep track
+   of the blocks whose dominators were updated should we decide to vectorize.  */
 
 class loop *
 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 					class loop *scalar_loop,
 					edge scalar_exit, edge e, edge *new_e,
-					bool flow_loops)
+					bool flow_loops,
+					vec<basic_block> *updated_doms)
 {
   class loop *new_loop;
   basic_block *new_bbs, *bbs, *pbbs;
@@ -1593,7 +1741,9 @@  slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
     }
 
   auto loop_exits = get_loop_exit_edges (loop);
+  bool multiple_exits_p = loop_exits.length () > 1;
   auto_vec<basic_block> doms;
+  class loop *update_loop = NULL;
 
   if (at_exit) /* Add the loop copy at exit.  */
     {
@@ -1603,103 +1753,9 @@  slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 	  flush_pending_stmts (new_exit);
 	}
 
-      auto_vec <gimple *> new_phis;
-      hash_map <tree, tree> new_phi_args;
-      /* First create the empty phi nodes so that when we flush the
-	 statements they can be filled in.   However because there is no order
-	 between the PHI nodes in the exits and the loop headers we need to
-	 order them base on the order of the two headers.  First record the new
-	 phi nodes.  */
-      for (auto gsi_from = gsi_start_phis (scalar_exit->dest);
-	   !gsi_end_p (gsi_from); gsi_next (&gsi_from))
-	{
-	  gimple *from_phi = gsi_stmt (gsi_from);
-	  tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
-	  gphi *res = create_phi_node (new_res, new_preheader);
-	  new_phis.safe_push (res);
-	}
-
-      /* Then redirect the edges and flush the changes.  This writes out the new
-	 SSA names.  */
-      for (edge exit : loop_exits)
-	{
-	  edge temp_e = redirect_edge_and_branch (exit, new_preheader);
-	  flush_pending_stmts (temp_e);
-	}
-      /* Record the new SSA names in the cache so that we can skip materializing
-	 them again when we fill in the rest of the LCSSA variables.  */
-      for (auto phi : new_phis)
-	{
-	  tree new_arg = gimple_phi_arg (phi, 0)->def;
-
-	  if (!SSA_VAR_P (new_arg))
-	    continue;
-	  /* If the PHI MEM node dominates the loop then we shouldn't create
-	      a new LC-SSSA PHI for it in the intermediate block.   */
-	  /* A MEM phi that consitutes a new DEF for the vUSE chain can either
-	     be a .VDEF or a PHI that operates on MEM. And said definition
-	     must not be inside the main loop.  Or we must be a parameter.
-	     In the last two cases we may remove a non-MEM PHI node, but since
-	     they dominate both loops the removal is unlikely to cause trouble
-	     as the exits must already be using them.  */
-	  if (virtual_operand_p (new_arg)
-	      && (SSA_NAME_IS_DEFAULT_DEF (new_arg)
-		  || !flow_bb_inside_loop_p (loop,
-				gimple_bb (SSA_NAME_DEF_STMT (new_arg)))))
-	    {
-	      auto gsi = gsi_for_stmt (phi);
-	      remove_phi_node (&gsi, true);
-	      continue;
-	    }
-	  new_phi_args.put (new_arg, gimple_phi_result (phi));
-
-	  if (TREE_CODE (new_arg) != SSA_NAME)
-	    continue;
-	  /* If the PHI node dominates the loop then we shouldn't create
-	      a new LC-SSSA PHI for it in the intermediate block.  Unless the
-	      the loop has been versioned.  If it has then we need the PHI
-	      node such that later when the loop guard is added the original
-	      dominating PHI can be found.  */
-	  basic_block def_bb = gimple_bb (SSA_NAME_DEF_STMT (new_arg));
-	  if (loop == scalar_loop
-	      && (!def_bb || !flow_bb_inside_loop_p (loop, def_bb)))
-	    {
-	      auto gsi = gsi_for_stmt (phi);
-	      remove_phi_node (&gsi, true);
-	    }
-	}
-
-      /* Copy the current loop LC PHI nodes between the original loop exit
-	 block and the new loop header.  This allows us to later split the
-	 preheader block and still find the right LC nodes.  */
-      edge loop_entry = single_succ_edge (new_preheader);
-      if (flow_loops)
-	for (auto gsi_from = gsi_start_phis (loop->header),
-	     gsi_to = gsi_start_phis (new_loop->header);
-	     !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
-	     gsi_next (&gsi_from), gsi_next (&gsi_to))
-	  {
-	    gimple *from_phi = gsi_stmt (gsi_from);
-	    gimple *to_phi = gsi_stmt (gsi_to);
-	    tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
-						  loop_latch_edge (loop));
-
-	    /* Check if we've already created a new phi node during edge
-	       redirection.  If we have, only propagate the value downwards.  */
-	    if (tree *res = new_phi_args.get (new_arg))
-	      {
-		adjust_phi_and_debug_stmts (to_phi, loop_entry, *res);
-		continue;
-	      }
-
-	    tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
-	    gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
-
-	    /* Main loop exit should use the final iter value.  */
-	    add_phi_arg (lcssa_phi, new_arg, loop_exit, UNKNOWN_LOCATION);
-
-	    adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
-	  }
+      slpeel_tree_duplicate_loop_for_vectorization (loop, loop_exit, loop_exits,
+						    e, new_loop, flow_loops,
+						    new_preheader);
 
       set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
 
@@ -1713,6 +1769,21 @@  slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
       delete_basic_block (preheader);
       set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
 			       loop_preheader_edge (scalar_loop)->src);
+
+      /* Finally after wiring the new epilogue we need to update its main exit
+	 to the original function exit we recorded.  Other exits are already
+	 correct.  */
+      if (multiple_exits_p)
+	{
+	  update_loop = new_loop;
+	  for (edge e : get_loop_exit_edges (loop))
+	    doms.safe_push (e->dest);
+	  doms.safe_push (exit_dest);
+
+	  /* Likely a fall-through edge, so update if needed.  */
+	  if (single_succ_p (exit_dest))
+	    doms.safe_push (single_succ (exit_dest));
+	}
     }
   else /* Add the copy at entry.  */
     {
@@ -1760,6 +1831,34 @@  slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
       delete_basic_block (new_preheader);
       set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
 			       loop_preheader_edge (new_loop)->src);
+
+      if (multiple_exits_p)
+	update_loop = loop;
+    }
+
+  if (multiple_exits_p)
+    {
+      for (edge e : get_loop_exit_edges (update_loop))
+	{
+	  edge ex;
+	  edge_iterator ei;
+	  FOR_EACH_EDGE (ex, ei, e->dest->succs)
+	    {
+	      /* Find the first non-fallthrough block as fall-throughs can't
+		 dominate other blocks.  */
+	      if (single_succ_p (ex->dest))
+		{
+		  doms.safe_push (ex->dest);
+		  ex = single_succ_edge (ex->dest);
+		}
+	      doms.safe_push (ex->dest);
+	    }
+	  doms.safe_push (e->dest);
+	}
+
+      iterate_fix_dominators (CDI_DOMINATORS, doms, false);
+      if (updated_doms)
+	updated_doms->safe_splice (doms);
     }
 
   free (new_bbs);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 1418913d2c308b0cf78352e29dc9958746fb9c94..d8b532c4b8ca92a856368a686598859fab9d40e9 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1821,7 +1821,7 @@  is_loop_header_bb_p (basic_block bb)
 {
   if (bb == (bb->loop_father)->header)
     return true;
-  gcc_checking_assert (EDGE_COUNT (bb->preds) == 1);
+
   return false;
 }
 
@@ -2212,7 +2212,8 @@  extern bool slpeel_can_duplicate_loop_p (const class loop *, const_edge,
 					 const_edge);
 class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *, edge,
 						    class loop *, edge,
-						    edge, edge *, bool = true);
+						    edge, edge *, bool = true,
+						    vec<basic_block> * = NULL);
 class loop *vect_loop_versioning (loop_vec_info, gimple *);
 extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
 				    tree *, tree *, tree *, int, bool, bool,
@@ -2223,6 +2224,7 @@  extern dump_user_location_t find_loop_location (class loop *);
 extern bool vect_can_advance_ivs_p (loop_vec_info);
 extern void vect_update_inits_of_drs (loop_vec_info, tree, tree_code);
 extern edge vec_init_loop_exit_info (class loop *);
+extern bool vect_is_loop_exit_latch_pred (edge, class loop *);
 
 /* In tree-vect-stmts.cc.  */
 extern tree get_related_vectype_for_scalar_type (machine_mode, tree,