diff mbox

Fix PR78205 -- fix BB SLP "gap" handling

Message ID alpine.LSU.2.11.1611071337420.5294@t29.fhfr.qr
State New
Headers show

Commit Message

Richard Biener Nov. 7, 2016, 12:40 p.m. UTC
The following moves an overly conservative check that we do not access
excess elements when vectorizing a BB to a place where we can do
a better job with respect to the elements we actually use.

This means that for the included testcase we are not confused
by the read from c[4] but just do not vectorize the stores to x[0]
and x[1].

Bootstrap and regtest running on x86_64-unknown-linux-gnu.

Richard.

2016-11-07  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/78205
	* tree-vect-stmts.c (vectorizable_load): Move check whether
	we may run into gaps when BB vectorizing SLP permutations ...
	* tree-vect-slp.c (vect_supported_load_permutation_p): ...
	here where we can do a more precise check.

	* gcc.dg/vect/bb-slp-pr78205.c: New testcase.
diff mbox

Patch

Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c	(revision 241893)
+++ gcc/tree-vect-stmts.c	(working copy)
@@ -6548,18 +6611,6 @@  vectorizable_load (gimple *stmt, gimple_
       if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
 	slp_perm = true;
 
-      /* ???  The following is overly pessimistic (as well as the loop
-         case above) in the case we can statically determine the excess
-	 elements loaded are within the bounds of a decl that is accessed.
-	 Likewise for BB vectorizations using masked loads is a possibility.  */
-      if (bb_vinfo && slp_perm && group_size % nunits != 0)
-	{
-	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			   "BB vectorization with gaps at the end of a load "
-			   "is not supported\n");
-	  return false;
-	}
-
       /* Invalidate assumptions made by dependence analysis when vectorization
 	 on the unrolled body effectively re-orders stmts.  */
       if (!PURE_SLP_STMT (stmt_info)
Index: gcc/tree-vect-slp.c
===================================================================
--- gcc/tree-vect-slp.c	(revision 241893)
+++ gcc/tree-vect-slp.c	(working copy)
@@ -1459,6 +1459,25 @@  vect_supported_load_permutation_p (slp_i
 	    SLP_TREE_LOAD_PERMUTATION (node).release ();
 	  else
 	    {
+	      stmt_vec_info group_info
+		= vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
+	      group_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (group_info));
+	      unsigned nunits
+		= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (group_info));
+	      unsigned k, maxk = 0;
+	      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
+		if (k > maxk)
+		  maxk = k;
+	      /* In BB vectorization we may not actually use a loaded vector
+		 accessing elements in excess of GROUP_SIZE.  */
+	      if (maxk >= (GROUP_SIZE (group_info) & ~(nunits - 1)))
+		{
+		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				   "BB vectorization with gaps at the end of "
+				   "a load is not supported\n");
+		  return false;
+		}
+
 	      /* Verify the permutation can be generated.  */
 	      vec<tree> tem;
 	      unsigned n_perms;
Index: gcc/testsuite/gcc.dg/vect/bb-slp-pr78205.c
===================================================================
--- gcc/testsuite/gcc.dg/vect/bb-slp-pr78205.c	(revision 0)
+++ gcc/testsuite/gcc.dg/vect/bb-slp-pr78205.c	(working copy)
@@ -0,0 +1,25 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_double } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+double x[2], a[4], b[4], c[5];
+
+void foo ()
+{
+  a[0] = c[0];
+  a[1] = c[1];
+  a[2] = c[0];
+  a[3] = c[1];
+  b[0] = c[2];
+  b[1] = c[3];
+  b[2] = c[2];
+  b[3] = c[3];
+  x[0] = c[4];
+  x[1] = c[4];
+}
+
+/* We may not vectorize the store to x[] as the vector load would access c
+   out of bounds, but we do want to vectorize the other two store groups.  */
+
+/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times "x\\\[\[0-1\]\\\] = " 2 "optimized" } } */