Fix PR87621, failed outer loop vectorization

Message ID alpine.LSU.2.20.1811081406230.1827@zhemvz.fhfr.qr
State New
Headers show
Series
  • Fix PR87621, failed outer loop vectorization
Related show

Commit Message

Richard Biener Nov. 8, 2018, 1:16 p.m.
The following fixes another instance of PR87914; this time it is
DOM wrecking loop-header copying, presenting vectorization with
the wrong loop form.  The loop-header copying pass right before
vectorization isn't of great help since nothing cleans up after
it, so the loop form is still bogus.

I've tried to move loop-header copying after jump threading but
the fallout is too large for the moment.  So this instead resorts
to running CSE on the copied blocks.  I've taken the liberty
of cleaning up the vec-ch predicate a bit as well.

This then reveals another ICE in reduction handling which this
patch fixes.

Bootstrap & regtest running on x86_64-unknown-linux-gnu.

Richard.

From ea6b41e717b32c93ab3a27df55632f06fa1d71fc Mon Sep 17 00:00:00 2001
From: Richard Guenther <rguenther@suse.de>
Date: Thu, 8 Nov 2018 11:23:12 +0100
Subject: [PATCH] fix-pr87621

	PR tree-optimization/87621
	* tree-vect-loop.c (vectorizable_reduction): Handle reduction
	op with only phi inputs.
	* tree-ssa-loop-ch.c: Include tree-ssa-sccvn.h.
	(ch_base::copy_headers): Run CSE on copied loop headers.
	(pass_ch_vect::process_loop_p): Simplify.

	* g++.dg/vect/pr87621.cc: New testcase.

Patch

diff --git a/gcc/testsuite/g++.dg/vect/pr87621.cc b/gcc/testsuite/g++.dg/vect/pr87621.cc
new file mode 100644
index 00000000000..cfc53be4ee1
--- /dev/null
+++ b/gcc/testsuite/g++.dg/vect/pr87621.cc
@@ -0,0 +1,27 @@ 
+/* { dg-do compile } */
+
+extern "C" double pow(double, double);
+template <typename T>
+T pow(T x, unsigned int n)
+{
+  if (!n)
+    return 1;
+
+  T y = 1;
+  while (n > 1)
+    {
+      if (n%2)
+	y *= x;
+      x = x*x;
+      n /= 2;
+    }
+  return x*y;
+}
+
+void testVec(int* x)
+{
+  for (int i = 0; i < 8; ++i)
+    x[i] = pow(x[i], 10);
+}
+
+/* { dg-final { scan-tree-dump "OUTER LOOP VECTORIZED" "vect" { target { vect_double && vect_hw_misalign } } } } */
diff --git a/gcc/tree-ssa-loop-ch.c b/gcc/tree-ssa-loop-ch.c
index c876d62405f..4d4813df3c8 100644
--- a/gcc/tree-ssa-loop-ch.c
+++ b/gcc/tree-ssa-loop-ch.c
@@ -33,6 +33,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "tree-inline.h"
 #include "tree-ssa-scopedtables.h"
 #include "tree-ssa-threadedge.h"
+#include "tree-ssa-sccvn.h"
 #include "params.h"
 
 /* Duplicates headers of loops if they are small enough, so that the statements
@@ -297,12 +298,14 @@  ch_base::copy_headers (function *fun)
   bool changed = false;
 
   if (number_of_loops (fun) <= 1)
-      return 0;
+    return 0;
 
   bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (fun));
   copied_bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (fun));
   bbs_size = n_basic_blocks_for_fn (fun);
 
+  auto_vec<std::pair<edge, loop_p> > copied;
+
   FOR_EACH_LOOP (loop, 0)
     {
       int initial_limit = PARAM_VALUE (PARAM_MAX_LOOP_HEADER_INSNS);
@@ -371,6 +374,7 @@  ch_base::copy_headers (function *fun)
 	  fprintf (dump_file, "Duplication failed.\n");
 	  continue;
 	}
+      copied.safe_push (std::make_pair (entry, loop));
 
       /* If the loop has the form "for (i = j; i < j + 10; i++)" then
 	 this copying can introduce a case where we rely on undefined
@@ -422,7 +426,28 @@  ch_base::copy_headers (function *fun)
     }
 
   if (changed)
-    update_ssa (TODO_update_ssa);
+    {
+      update_ssa (TODO_update_ssa);
+      /* After updating SSA form perform CSE on the loop header
+	 copies.  This is esp. required for the pass before
+	 vectorization since nothing cleans up copied exit tests
+	 that can now be simplified.  CSE from the entry of the
+	 region we copied till all loop exit blocks but not
+	 entering the loop itself.  */
+      for (unsigned i = 0; i < copied.length (); ++i)
+	{
+	  edge entry = copied[i].first;
+	  loop_p loop = copied[i].second;
+	  vec<edge> exit_edges = get_loop_exit_edges (loop);
+	  bitmap exit_bbs = BITMAP_ALLOC (NULL);
+	  for (unsigned j = 0; j < exit_edges.length (); ++j)
+	    bitmap_set_bit (exit_bbs, exit_edges[j]->dest->index);
+	  bitmap_set_bit (exit_bbs, loop->header->index);
+	  do_rpo_vn (cfun, entry, exit_bbs);
+	  BITMAP_FREE (exit_bbs);
+	  exit_edges.release ();
+	}
+    }
   free (bbs);
   free (copied_bbs);
 
@@ -473,24 +498,13 @@  pass_ch_vect::process_loop_p (struct loop *loop)
   if (loop->dont_vectorize)
     return false;
 
-  if (!do_while_loop_p (loop))
-    return true;
-
- /* The vectorizer won't handle anything with multiple exits, so skip.  */
+  /* The vectorizer won't handle anything with multiple exits, so skip.  */
   edge exit = single_exit (loop);
   if (!exit)
     return false;
 
-  /* Copy headers iff there looks to be code in the loop after the exit block,
-     i.e. the exit block has an edge to another block (besides the latch,
-     which should be empty).  */
-  edge_iterator ei;
-  edge e;
-  FOR_EACH_EDGE (e, ei, exit->src->succs)
-    if (!loop_exit_edge_p (loop, e)
-	&& e->dest != loop->header
-	&& e->dest != loop->latch)
-      return true;
+  if (!do_while_loop_p (loop))
+    return true;
 
   return false;
 }
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index e392aab1d52..22bed26609b 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -6115,6 +6115,10 @@  vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	    vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
 	  break;
 	}
+      /* For a nested cycle we might end up with an operation like
+         phi_result * phi_result.  */
+      if (!vectype_in)
+	vectype_in = STMT_VINFO_VECTYPE (stmt_info);
       gcc_assert (vectype_in);
 
       if (slp_node)