diff mbox series

[committed] Small inclusive scan SSE2 vectorization improvement

Message ID 20190620070923.GR815@tucnak
State New
Headers show
Series [committed] Small inclusive scan SSE2 vectorization improvement | expand

Commit Message

Jakub Jelinek June 20, 2019, 7:09 a.m. UTC
Hi!

This is a small improvement over the previous patch.  The decision to use
a whole vector left shift + optional VEC_COND_EXPR doesn't have to be binary
for the whole scan, which contains several permutations.  E.g. SSE2 can't do
the { 0, 4, 5, 6 } permutation without a whole vector left shift, but it can
do { 0, 1, 4, 5 }, and, especially if the initializer is not 0, that saves
some instructions.

The following patch changes the code so that it remembers what to do for
each of the permutations.

Bootstrapped/regtested on x86_64-linux and i686-linux, committed to trunk.

2019-06-20  Jakub Jelinek  <jakub@redhat.com>

	* tree-vect-stmts.c (enum scan_store_kind): New type.
	(scan_store_can_perm_p): Change last argument from int * to
	vec<enum scan_store_kind> *, record precisely which permutations
	need whole vector left shift or that plus VEC_COND_EXPR.
	(vectorizable_scan_store): Adjust caller, use whole vector left shift
	and additional VEC_COND_EXPR only for those iterations that need it.


	Jakub
diff mbox series

Patch

--- gcc/tree-vect-stmts.c.jj	2019-06-19 11:58:53.161238429 +0200
+++ gcc/tree-vect-stmts.c	2019-06-19 12:40:50.675838267 +0200
@@ -6354,13 +6354,27 @@  scan_operand_equal_p (tree ref1, tree re
 }
 
 
+enum scan_store_kind {
+  /* Normal permutation.  */
+  scan_store_kind_perm,
+
+  /* Whole vector left shift permutation with zero init.  */
+  scan_store_kind_lshift_zero,
+
+  /* Whole vector left shift permutation and VEC_COND_EXPR.  */
+  scan_store_kind_lshift_cond
+};
+
 /* Function check_scan_store.
 
    Verify if we can perform the needed permutations or whole vector shifts.
-   Return -1 on failure, otherwise exact log2 of vectype's nunits.  */
+   Return -1 on failure, otherwise exact log2 of vectype's nunits.
+   USE_WHOLE_VECTOR is a vector of enum scan_store_kind recording which
+   operation to do at each step.  */
 
 static int
-scan_store_can_perm_p (tree vectype, tree init, int *use_whole_vector_p = NULL)
+scan_store_can_perm_p (tree vectype, tree init,
+		       vec<enum scan_store_kind> *use_whole_vector = NULL)
 {
   enum machine_mode vec_mode = TYPE_MODE (vectype);
   unsigned HOST_WIDE_INT nunits;
@@ -6371,50 +6385,59 @@  scan_store_can_perm_p (tree vectype, tre
     return -1;
 
   int i;
+  enum scan_store_kind whole_vector_shift_kind = scan_store_kind_perm;
   for (i = 0; i <= units_log2; ++i)
     {
       unsigned HOST_WIDE_INT j, k;
+      enum scan_store_kind kind = scan_store_kind_perm;
       vec_perm_builder sel (nunits, nunits, 1);
       sel.quick_grow (nunits);
-      if (i == 0)
+      if (i == units_log2)
 	{
 	  for (j = 0; j < nunits; ++j)
 	    sel[j] = nunits - 1;
 	}
       else
 	{
-	  for (j = 0; j < (HOST_WIDE_INT_1U << (i - 1)); ++j)
+	  for (j = 0; j < (HOST_WIDE_INT_1U << i); ++j)
 	    sel[j] = j;
 	  for (k = 0; j < nunits; ++j, ++k)
 	    sel[j] = nunits + k;
 	}
-      vec_perm_indices indices (sel, i == 0 ? 1 : 2, nunits);
+      vec_perm_indices indices (sel, i == units_log2 ? 1 : 2, nunits);
       if (!can_vec_perm_const_p (vec_mode, indices))
-	break;
-    }
-
-  if (i == 0)
-    return -1;
-
-  if (i <= units_log2)
-    {
-      if (optab_handler (vec_shl_optab, vec_mode) == CODE_FOR_nothing)
-	return -1;
-      int kind = 1;
-      /* Whole vector shifts shift in zeros, so if init is all zero constant,
-	 there is no need to do anything further.  */
-      if ((TREE_CODE (init) != INTEGER_CST
-	   && TREE_CODE (init) != REAL_CST)
-	  || !initializer_zerop (init))
 	{
-	  tree masktype = build_same_sized_truth_vector_type (vectype);
-	  if (!expand_vec_cond_expr_p (vectype, masktype, VECTOR_CST))
+	  if (i == units_log2)
 	    return -1;
-	  kind = 2;
+
+	  if (whole_vector_shift_kind == scan_store_kind_perm)
+	    {
+	      if (optab_handler (vec_shl_optab, vec_mode) == CODE_FOR_nothing)
+		return -1;
+	      whole_vector_shift_kind = scan_store_kind_lshift_zero;
+	      /* Whole vector shifts shift in zeros, so if init is all zero
+		 constant, there is no need to do anything further.  */
+	      if ((TREE_CODE (init) != INTEGER_CST
+		   && TREE_CODE (init) != REAL_CST)
+		  || !initializer_zerop (init))
+		{
+		  tree masktype = build_same_sized_truth_vector_type (vectype);
+		  if (!expand_vec_cond_expr_p (vectype, masktype, VECTOR_CST))
+		    return -1;
+		  whole_vector_shift_kind = scan_store_kind_lshift_cond;
+		}
+	    }
+	  kind = whole_vector_shift_kind;
+	}
+      if (use_whole_vector)
+	{
+	  if (kind != scan_store_kind_perm && use_whole_vector->is_empty ())
+	    use_whole_vector->safe_grow_cleared (i);
+	  if (kind != scan_store_kind_perm || !use_whole_vector->is_empty ())
+	    use_whole_vector->safe_push (kind);
 	}
-      if (use_whole_vector_p)
-	*use_whole_vector_p = kind;
     }
+
   return units_log2;
 }
 
@@ -6726,11 +6749,12 @@  vectorizable_scan_store (stmt_vec_info s
   unsigned HOST_WIDE_INT nunits;
   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits))
     gcc_unreachable ();
-  int use_whole_vector_p = 0;
-  int units_log2 = scan_store_can_perm_p (vectype, *init, &use_whole_vector_p);
+  auto_vec<enum scan_store_kind, 16> use_whole_vector;
+  int units_log2 = scan_store_can_perm_p (vectype, *init, &use_whole_vector);
   gcc_assert (units_log2 > 0);
   auto_vec<tree, 16> perms;
   perms.quick_grow (units_log2 + 1);
+  tree zero_vec = NULL_TREE, masktype = NULL_TREE;
   for (int i = 0; i <= units_log2; ++i)
     {
       unsigned HOST_WIDE_INT j, k;
@@ -6739,23 +6763,28 @@  vectorizable_scan_store (stmt_vec_info s
       if (i == units_log2)
 	for (j = 0; j < nunits; ++j)
 	  sel[j] = nunits - 1;
-	else
-	  {
-	    for (j = 0; j < (HOST_WIDE_INT_1U << i); ++j)
-	      sel[j] = j;
-	    for (k = 0; j < nunits; ++j, ++k)
-	      sel[j] = nunits + k;
-	  }
+      else
+	{
+	  for (j = 0; j < (HOST_WIDE_INT_1U << i); ++j)
+	    sel[j] = j;
+	  for (k = 0; j < nunits; ++j, ++k)
+	    sel[j] = nunits + k;
+	}
       vec_perm_indices indices (sel, i == units_log2 ? 1 : 2, nunits);
-      if (use_whole_vector_p && i < units_log2)
-	perms[i] = vect_gen_perm_mask_any (vectype, indices);
+      if (!use_whole_vector.is_empty ()
+	  && use_whole_vector[i] != scan_store_kind_perm)
+	{
+	  if (zero_vec == NULL_TREE)
+	    zero_vec = build_zero_cst (vectype);
+	  if (masktype == NULL_TREE
+	      && use_whole_vector[i] == scan_store_kind_lshift_cond)
+	    masktype = build_same_sized_truth_vector_type (vectype);
+	  perms[i] = vect_gen_perm_mask_any (vectype, indices);
+	}
       else
 	perms[i] = vect_gen_perm_mask_checked (vectype, indices);
     }
 
-  tree zero_vec = use_whole_vector_p ? build_zero_cst (vectype) : NULL_TREE;
-  tree masktype = (use_whole_vector_p == 2
-		   ? build_same_sized_truth_vector_type (vectype) : NULL_TREE);
   stmt_vec_info prev_stmt_info = NULL;
   tree vec_oprnd1 = NULL_TREE;
   tree vec_oprnd2 = NULL_TREE;
@@ -6788,7 +6817,10 @@  vectorizable_scan_store (stmt_vec_info s
 	{
 	  tree new_temp = make_ssa_name (vectype);
 	  gimple *g = gimple_build_assign (new_temp, VEC_PERM_EXPR,
-					   zero_vec ? zero_vec : vec_oprnd1, v,
+					   (zero_vec
+					    && (use_whole_vector[i]
+						!= scan_store_kind_perm))
+					   ? zero_vec : vec_oprnd1, v,
 					   perms[i]);
 	  new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi);
 	  if (prev_stmt_info == NULL)
@@ -6797,7 +6829,7 @@  vectorizable_scan_store (stmt_vec_info s
 	    STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
 	  prev_stmt_info = new_stmt_info;
 
-	  if (use_whole_vector_p == 2)
+	  if (zero_vec && use_whole_vector[i] == scan_store_kind_lshift_cond)
 	    {
 	      /* Whole vector shift shifted in zero bits, but if *init
 		 is not initializer_zerop, we need to replace those elements