Fix PR87914

Message ID alpine.LSU.2.20.1811071557070.1827@zhemvz.fhfr.qr
State New
Headers show
Series
  • Fix PR87914
Related show

Commit Message

Richard Biener Nov. 7, 2018, 2:59 p.m.
This PR shows one example (IIRC I've seen others recently) where we
fail to handle outer loop vectorization because we do a poor job
identifying "safe" nested cycles.  This improves the situation.

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.

I've also built SPEC 2006 CPU with and without LTO on a Haswell machine.

I do expect fallout since the reduction code is still incredibly 
fragile...

Richard.

From 854d80f1822ae6b37afa865ae49d64ceaee68b26 Mon Sep 17 00:00:00 2001
From: Richard Guenther <rguenther@suse.de>
Date: Wed, 7 Nov 2018 12:19:45 +0100
Subject: [PATCH] fix-pr87914

2018-11-07  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/87914
	* tree-vect-loop.c (vect_is_simple_reduction): Improve detection
	of nested cycles.
	(vectorizable_reduction): Handle shifts and rotates by dispatching
	to vectorizable_shift.
	* tree-vect-stmts.c (vect_get_vec_def_for_operand_1): Handle
	in-loop uses of vect_nested_cycle defs.  Merge cycle and internal
	def cases.
	(vectorizable_shift): Export and handle being called as
	vect_nested_cycle.
	(vect_analyze_stmt): Call vectorizable_shift after
	vectorizable_reduction.
	* tree-vectorizer.h (vectorizable_shift): Declare.

	* lib/target-supports.exp (check_effective_target_vect_var_shift): New.
	(check_avx2_available): Likewise.
	* g++.dg/vect/pr87914.cc: New testcase.

Patch

diff --git a/gcc/testsuite/g++.dg/vect/pr87914.cc b/gcc/testsuite/g++.dg/vect/pr87914.cc
new file mode 100644
index 00000000000..12fbba3af2f
--- /dev/null
+++ b/gcc/testsuite/g++.dg/vect/pr87914.cc
@@ -0,0 +1,49 @@ 
+// { dg-do run }
+// { dg-additional-options "-fopenmp-simd" }
+// { dg-additional-options "-mavx2" { target { avx2_runtime } } }
+
+extern "C" int memcmp(const void *s1, const void *s2, __SIZE_TYPE__ n);
+extern "C" void abort(void);
+
+template <typename T>
+T reverseBits(T x)
+{
+  unsigned int s = sizeof(x) * 8;
+  T mask = ~T(0);
+  while ((s >>= 1) > 0)
+    {
+      mask ^= (mask << s);
+      x = ((x >> s) & mask) | ((x << s) & ~mask); // unsupported use in stmt
+    }
+  return x;
+}
+
+void __attribute__((noinline,noipa))
+test_reverseBits(unsigned* x)
+{
+#pragma omp simd aligned(x:32)
+  for (int i = 0; i < 16; ++i)
+    x[i] = reverseBits(x[i]); // couldn't vectorize loop
+}
+
+int main()
+{
+  unsigned arr[16] __attribute__((aligned(32)))
+    = { 0x01020304, 0x05060708, 0x0a0b0c0d, 0x0e0f1011,
+        0x11121314, 0x45065708, 0xfa0b3c0du, 0x0e0f1211,
+        0x21222324, 0x55066708, 0xfa0b2c0du, 0x1e0f1011,
+        0x31323334, 0x65067708, 0xfa0b5c0du, 0x0e3f1011 };
+  unsigned arr2[16]
+    = { 0x20c04080, 0x10e060a0, 0xb030d050, 0x8808f070u,
+        0x28c84888, 0x10ea60a2, 0xb03cd05f, 0x8848f070u,
+        0x24c44484, 0x10e660aa, 0xb034d05f, 0x8808f078u, 
+        0x2ccc4c8c, 0x10ee60a6, 0xb03ad05f, 0x8808fc70u };
+
+  test_reverseBits (arr);
+
+  if (memcmp (arr, arr2, sizeof (arr)) != 0)
+    abort ();
+  return 0;
+}
+
+// { dg-final { scan-tree-dump "OUTER LOOP VECTORIZED" "vect" { target { vect_var_shift && vect_int } } } }
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 9780e53dfc0..1d5ad9abdca 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -5316,6 +5316,15 @@  proc check_effective_target_vect_shift { } {
 		 && [check_effective_target_s390_vx]) }}]
 }
 
+# Return 1 if the target supports hardware vector shift by register operation.
+
+proc check_effective_target_vect_var_shift { } {
+    return [check_cached_effective_target_indexed vect_var_shift {
+      expr {(([istarget i?86-*-*] || [istarget x86_64-*-*])
+	     && [check_avx2_available])
+      }}]
+}
+
 proc check_effective_target_whole_vector_shift { } {
     if { [istarget i?86-*-*] || [istarget x86_64-*-*]
 	 || [istarget ia64-*-*]
@@ -7150,6 +7159,19 @@  proc check_avx_available { } {
   return 0;
 }
 
+# Return true if we are compiling for AVX2 target.
+
+proc check_avx2_available { } {
+  if { [check_no_compiler_messages avx2_available assembly {
+    #ifndef __AVX2__
+    #error unsupported
+    #endif
+  } ""] } {
+    return 1;
+  }
+  return 0;
+}
+
 # Return true if we are compiling for SSSE3 target.
 
 proc check_ssse3_available { } {
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 51be405b5a0..e392aab1d52 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -2880,6 +2880,11 @@  vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
           return NULL;
         }
 
+      /* For inner loop reductions in nested vectorization there are no
+         constraints on the number of uses in the inner loop.  */
+      if (loop == vect_loop->inner)
+	continue;
+
       nloop_uses++;
       if (nloop_uses > 1)
         {
@@ -2938,13 +2943,19 @@  vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
       else
 	/* We can have more than one loop-closed PHI.  */
 	lcphis.safe_push (as_a <gphi *> (use_stmt));
-      if (nloop_uses > 1)
-	{
-	  if (dump_enabled_p ())
-	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			     "reduction used in loop.\n");
-	  return NULL;
-	}
+    }
+
+  /* If this isn't a nested cycle or if the nested cycle reduction value
+     is used outside of the inner loop we cannot handle uses of the reduction
+     value.  */
+  bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
+  if ((!nested_in_vect_loop || !lcphis.is_empty ())
+      && nloop_uses > 1)
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "reduction used in loop.\n");
+      return NULL;
     }
 
   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
@@ -3005,9 +3016,15 @@  vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
     }
 
   gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
-  bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
   code = orig_code = gimple_assign_rhs_code (def_stmt);
 
+  if (nested_in_vect_loop && !check_reduction)
+    {
+      if (dump_enabled_p ())
+	report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
+      return def_stmt_info;
+    }
+
   /* We can handle "res -= x[i]", which is non-associative by
      simply rewriting this into "res += -x[i]".  Avoid changing
      gimple instruction for the first simple tests and only do this
@@ -6488,6 +6505,19 @@  vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
   vec_mode = TYPE_MODE (vectype_in);
   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
 
+  if (nested_cycle)
+    {
+      def_bb = gimple_bb (reduc_def_phi);
+      def_stmt_loop = def_bb->loop_father;
+      def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
+                                       loop_preheader_edge (def_stmt_loop));
+      stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
+      if (def_arg_stmt_info
+	  && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
+	      == vect_double_reduction_def))
+        double_reduc = true;
+    }
+
   if (code == COND_EXPR)
     {
       /* Only call during the analysis stage, otherwise we'll lose
@@ -6502,20 +6532,26 @@  vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	  return false;
         }
     }
-  else
+  else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
+	   || code == LROTATE_EXPR || code == RROTATE_EXPR)
     {
-      /* 4. Supportable by target?  */
-
-      if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
-	  || code == LROTATE_EXPR || code == RROTATE_EXPR)
+      /* Only call during the analysis stage, otherwise we'll lose
+	 STMT_VINFO_TYPE.  We only support this for nested cycles
+	 without double reductions at the moment.  */
+      if (!nested_cycle
+	  || double_reduc
+	  || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
+						NULL, cost_vec)))
 	{
-	  /* Shifts and rotates are only supported by vectorizable_shifts,
-	     not vectorizable_reduction.  */
           if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			     "unsupported shift or rotation.\n");
+			     "unsupported shift or rotation in reduction\n");
 	  return false;
 	}
+    }
+  else
+    {
+      /* 4. Supportable by target?  */
 
       /* 4.1. check support for the operation in the loop  */
       optab = optab_for_tree_code (code, vectype_in, optab_default);
@@ -6620,19 +6656,6 @@  vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	orig_code = cond_reduc_op_code;
     }
 
-  if (nested_cycle)
-    {
-      def_bb = gimple_bb (reduc_def_phi);
-      def_stmt_loop = def_bb->loop_father;
-      def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
-                                       loop_preheader_edge (def_stmt_loop));
-      stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
-      if (def_arg_stmt_info
-	  && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
-	      == vect_double_reduction_def))
-        double_reduc = true;
-    }
-
   reduc_fn = IFN_LAST;
 
   if (reduction_type == TREE_CODE_REDUCTION
@@ -7003,6 +7026,12 @@  vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
           /* Multiple types are not supported for condition.  */
           break;
         }
+      if (code == LSHIFT_EXPR
+	  || code == RSHIFT_EXPR)
+	{
+	  vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
+	  break;
+	}
 
       /* Handle uses.  */
       if (j == 0)
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 7127c17c788..8133149b2dc 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1461,6 +1461,16 @@  vect_get_vec_def_for_operand_1 (stmt_vec_info def_stmt_info,
       /* Code should use vect_get_vec_def_for_operand.  */
       gcc_unreachable ();
 
+    /* Operand is defined by a loop header phi.  In case of nested
+       cycles we also may have uses of the backedge def.  */
+    case vect_reduction_def:
+    case vect_double_reduction_def:
+    case vect_nested_cycle:
+    case vect_induction_def:
+      gcc_assert (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
+		  || dt == vect_nested_cycle);
+      /* Fallthru.  */
+
     /* operand is defined inside the loop.  */
     case vect_internal_def:
       {
@@ -1480,23 +1490,6 @@  vect_get_vec_def_for_operand_1 (stmt_vec_info def_stmt_info,
 	return vec_oprnd;
       }
 
-    /* operand is defined by a loop header phi.  */
-    case vect_reduction_def:
-    case vect_double_reduction_def:
-    case vect_nested_cycle:
-    case vect_induction_def:
-      {
-	gcc_assert (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI);
-
-	/* Get the def from the vectorized stmt.  */
-	vec_stmt_info = STMT_VINFO_VEC_STMT (def_stmt_info);
-	if (gphi *phi = dyn_cast <gphi *> (vec_stmt_info->stmt))
-	  vec_oprnd = PHI_RESULT (phi);
-	else
-	  vec_oprnd = gimple_get_lhs (vec_stmt_info->stmt);
-	return vec_oprnd;
-      }
-
     default:
       gcc_unreachable ();
     }
@@ -5363,7 +5356,7 @@  vect_supportable_shift (enum tree_code code, tree scalar_type)
    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
    Return true if STMT_INFO is vectorizable in this way.  */
 
-static bool
+bool
 vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 		    stmt_vec_info *vec_stmt, slp_tree slp_node,
 		    stmt_vector_for_cost *cost_vec)
@@ -5401,6 +5394,7 @@  vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
     return false;
 
   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
+      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle
       && ! vec_stmt)
     return false;
 
@@ -5480,7 +5474,8 @@  vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
      shift/rotate amount is a vector, use the vector/vector shift optabs.  */
 
   if ((dt[1] == vect_internal_def
-       || dt[1] == vect_induction_def)
+       || dt[1] == vect_induction_def
+       || dt[1] == vect_nested_cycle)
       && !slp_node)
     scalar_shift_arg = false;
   else if (dt[1] == vect_constant_def
@@ -9540,7 +9535,6 @@  vect_analyze_stmt (stmt_vec_info stmt_info, bool *need_to_vectorize,
 	  || vectorizable_simd_clone_call (stmt_info, NULL, NULL, node,
 					   cost_vec)
 	  || vectorizable_conversion (stmt_info, NULL, NULL, node, cost_vec)
-	  || vectorizable_shift (stmt_info, NULL, NULL, node, cost_vec)
 	  || vectorizable_operation (stmt_info, NULL, NULL, node, cost_vec)
 	  || vectorizable_assignment (stmt_info, NULL, NULL, node, cost_vec)
 	  || vectorizable_load (stmt_info, NULL, NULL, node, node_instance,
@@ -9549,6 +9543,7 @@  vect_analyze_stmt (stmt_vec_info stmt_info, bool *need_to_vectorize,
 	  || vectorizable_reduction (stmt_info, NULL, NULL, node,
 				     node_instance, cost_vec)
 	  || vectorizable_induction (stmt_info, NULL, NULL, node, cost_vec)
+	  || vectorizable_shift (stmt_info, NULL, NULL, node, cost_vec)
 	  || vectorizable_condition (stmt_info, NULL, NULL, NULL, 0, node,
 				     cost_vec)
 	  || vectorizable_comparison (stmt_info, NULL, NULL, NULL, node,
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 1434eeaf270..72a12aea8f3 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1487,6 +1487,9 @@  extern opt_result vect_analyze_stmt (stmt_vec_info, bool *, slp_tree,
 extern bool vectorizable_condition (stmt_vec_info, gimple_stmt_iterator *,
 				    stmt_vec_info *, tree, int, slp_tree,
 				    stmt_vector_for_cost *);
+extern bool vectorizable_shift (stmt_vec_info, gimple_stmt_iterator *,
+				stmt_vec_info *, slp_tree,
+				stmt_vector_for_cost *);
 extern void vect_get_load_cost (stmt_vec_info, int, bool,
 				unsigned int *, unsigned int *,
 				stmt_vector_for_cost *,