diff mbox

[RFC] Gather vectorization (PR tree-optimization/50789)

Message ID 20111031142332.GD1052@tyan-ft48-01.lab.bos.redhat.com
State New
Headers show

Commit Message

Jakub Jelinek Oct. 31, 2011, 2:23 p.m. UTC
On Sat, Oct 29, 2011 at 03:53:37PM +0200, Toon Moene wrote:
> I wonder whether it will work with the attached Fortran routine - it
> sure would mean a boost to the 18%+ heaviest CPU user in our code.

It didn't do anything, but only because I used a bad approach in
vect_check_gather.  I have been using DR_BASE_ADDRESS/DR_OFFSET/DR_INIT
into which dr_analyze_innermost splits the reference, but that split is
into something hopefully usable for alias analysis, variable and constant
offset using split_constant_offset that would create largish expressions in
your testcase, which add many values together.  But for gather vectorization
we'd either have to gimplify such expressions back before the load and
allow them to be vectorized, or, as done in this incremental patch,
I instead do something similar to what dr_analyze_innermost does,
but with different goal - to split stuff off into a loop invariant that
can be computed before the loop (and will be put into the scalar part of
gather), and a SSA_NAME defined in the loop which contains the rest (plus
optionally sign/zero extending that into a wider type and/or scaling by
2/4/8.  With this incremental patch I get 4 loops in this testcase
with -O3 -mavx2 vectorized (compared to 0 before), with 262 vgather* insns.

On x86_64 unfortunately this doesn't figure out that it could do all the
additions for the variable index in 32-bit type and then sign extend:
idx_202 = *kp_201(D)[D.1941_200];
idy_209 = *kq_208(D)[D.1941_200];
ilev_216 = *kr_215(D)[D.1941_200];
D.1955_229 = *pgama_228(D)[D.1941_200];
D.1960_237 = *pbeta_236(D)[D.1941_200];
D.1965_245 = *palfa_244(D)[D.1941_200];
D.1966_246 = ilev_216 + -1;
D.1967_247 = (integer(kind=8)) D.1966_246;
D.1968_248 = D.1967_247 * stride.32_141;
D.1969_249 = D.1968_248 + offset.33_155;
D.1970_250 = idy_209 + -1;
D.1971_251 = (integer(kind=8)) D.1970_250;
D.1972_252 = D.1971_251 * stride.30_129;
D.1973_253 = D.1969_249 + D.1972_252;
D.1974_254 = idx_202 + -1;
D.1975_255 = (integer(kind=8)) D.1974_254;
D.1976_256 = D.1973_253 + D.1975_255;
D.1977_258 = *parg_257(D)[D.1976_256];
so for -m64 it emits vgatherqps instructions (V4DImode indexes, loads
V4SFmode values) and then merges those, while for -m32 it emits
just 131 vgatherdps instructions (V8SImode indexes, V8SFmode values).

Would be nice to cut down slightly this testcase into just one or two loops
that are vectorized and turn it into a runtime testcase which verifies
the vectorization was correct.

2011-10-31  Jakub Jelinek  <jakub@redhat.com>

	* tree-vect-stmts.c (vectorizable_load): Don't add
	DR_INIT (dr) to ptr.
	* tree-vect-data-refs.c (vect_check_gather): Rewritten not to use
	DR_BASE_ADDRESS or DR_OFFSET, instead call get_inner_reference
	and try to separate in between base and off.



	Jakub

Comments

Toon Moene Nov. 1, 2011, 5:06 a.m. UTC | #1
On 10/31/2011 03:23 PM, Jakub Jelinek wrote:

> On Sat, Oct 29, 2011 at 03:53:37PM +0200, Toon Moene wrote:

>> I wonder whether it will work with the attached Fortran routine - it
>> sure would mean a boost to the 18%+ heaviest CPU user in our code.

> Would be nice to cut down slightly this testcase into just one or two loops
> that are vectorized and turn it into a runtime testcase which verifies
> the vectorization was correct.

This is not a verifiable routine yet, but as the linear interpolation 
part already has all the juicy indirection necessary to test this 
vectorization, most of the routine can be thrown away, to leave the 
attached as essential.
diff mbox

Patch

--- gcc/tree-vect-stmts.c.jj	2011-10-31 12:13:45.000000000 +0100
+++ gcc/tree-vect-stmts.c	2011-10-31 13:21:13.000000000 +0100
@@ -4452,8 +4452,6 @@  vectorizable_load (gimple stmt, gimple_s
       vec_dest = vect_create_destination_var (scalar_dest, vectype);
 
       ptr = fold_convert (ptrtype, gather_base);
-      ptr = fold_build2 (POINTER_PLUS_EXPR, ptrtype, ptr,
-			 fold_convert (sizetype, DR_INIT (dr)));
       if (!is_gimple_min_invariant (ptr))
 	{
 	  ptr = force_gimple_operand (ptr, &seq, true, NULL_TREE);
--- gcc/tree-vect-data-refs.c.jj	2011-10-31 12:13:45.000000000 +0100
+++ gcc/tree-vect-data-refs.c	2011-10-31 14:53:18.000000000 +0100
@@ -2504,109 +2504,156 @@  tree
 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
 		   tree *offp, int *scalep)
 {
-  HOST_WIDE_INT scale = 1;
+  HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
   tree offtype = NULL_TREE;
-  tree base = DR_BASE_ADDRESS (dr);
-  tree off = DR_OFFSET (dr);
-  tree decl;
-
-  if (TREE_CODE (base) == POINTER_PLUS_EXPR
-      && integer_zerop (off)
-      && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
-      && !chrec_contains_symbols_defined_in_loop (TREE_OPERAND (base, 0),
-						  loop->num))
+  tree decl, base, off;
+  enum machine_mode pmode;
+  int punsignedp, pvolatilep;
+
+  base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
+			      &pmode, &punsignedp, &pvolatilep, false);
+  gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
+
+  if (TREE_CODE (base) == MEM_REF)
     {
-      off = TREE_OPERAND (base, 1);
+      if (!integer_zerop (TREE_OPERAND (base, 1)))
+	{
+	  if (off == NULL_TREE)
+	    {
+	      double_int moff = mem_ref_offset (base);
+	      off = double_int_to_tree (sizetype, moff);
+	    }
+	  else
+	    off = size_binop (PLUS_EXPR, off, TREE_OPERAND (base, 1));
+	}
       base = TREE_OPERAND (base, 0);
     }
-  if (is_gimple_min_invariant (base)
-      || (TREE_CODE (base) == SSA_NAME
-	  && !chrec_contains_symbols_defined_in_loop (base, loop->num)))
-    {
-      /* DR_BASE_ADDRESS + DR_INIT would go into the constant
-	 pointer, DR_OFFSET into vector.  */
-      STRIP_NOPS (off);
-      if (TREE_CODE (off) == MULT_EXPR
-	  && host_integerp (TREE_OPERAND (off, 1), 0))
+  else
+    base = build_fold_addr_expr (base);
+
+  if (off == NULL_TREE)
+    off = size_zero_node;
+
+  if (!is_gimple_min_invariant (base)
+      && (TREE_CODE (base) != SSA_NAME
+	  || chrec_contains_symbols_defined_in_loop (base, loop->num)))
+    {
+      if (!integer_zerop (off))
+	return NULL_TREE;
+      off = base;
+      base = size_int (pbitpos / BITS_PER_UNIT);
+    }
+  else
+    {
+      base = fold_convert (sizetype, base);
+      base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
+    }
+
+  STRIP_NOPS (off);
+  while (offtype == NULL_TREE)
+    {
+      enum tree_code code;
+      tree op0, op1, add = NULL_TREE;
+
+      if (TREE_CODE (off) == SSA_NAME)
 	{
-	  scale = tree_low_cst (TREE_OPERAND (off, 1), 0);
-	  if (scale != (int) scale)
+	  gimple def_stmt = SSA_NAME_DEF_STMT (off);
+
+	  if (!chrec_contains_symbols_defined_in_loop (off, loop->num))
 	    return NULL_TREE;
-	  off = TREE_OPERAND (off, 0);
+
+	  if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
+	    break;
+
+	  op0 = gimple_assign_rhs1 (def_stmt);
+	  code = gimple_assign_rhs_code (def_stmt);
+	  op1 = gimple_assign_rhs2 (def_stmt);
 	}
-      if (CONVERT_EXPR_P (off))
+      else
 	{
-	  tree op = TREE_OPERAND (off, 0);
-	  if (TREE_CODE (TREE_TYPE (op)) == INTEGER_TYPE
-	      && TYPE_PRECISION (TREE_TYPE (op))
-		 == GET_MODE_PRECISION (TYPE_MODE (TREE_TYPE (op)))
-	      && TYPE_PRECISION (TREE_TYPE (op))
-		 < TYPE_PRECISION (TREE_TYPE (off)))
-	    {
-	      off = op;
-	      offtype = TREE_TYPE (off);
-	    }
+	  if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
+	    return NULL_TREE;
+	  code = TREE_CODE (off);
+	  extract_ops_from_tree (off, &code, &op0, &op1);
 	}
-      STRIP_NOPS (off);
-      switch (TREE_CODE (off))
+      switch (code)
 	{
-	case SSA_NAME:
-	  break;
+	case POINTER_PLUS_EXPR:
 	case PLUS_EXPR:
+	  if (is_gimple_min_invariant (op0)
+	      || (TREE_CODE (op0) == SSA_NAME
+		  && !chrec_contains_symbols_defined_in_loop (op0, loop->num)))
+	    {
+	      add = op0;
+	      off = op1;
+	    do_add:
+	      add = fold_convert (sizetype, add);
+	      if (scale != 1)
+		add = size_binop (MULT_EXPR, add, size_int (scale));
+	      base = size_binop (PLUS_EXPR, base, add);
+	      continue;
+	    }
+	  if (is_gimple_min_invariant (op1)
+	      || (TREE_CODE (op1) == SSA_NAME
+		  && !chrec_contains_symbols_defined_in_loop (op1, loop->num)))
+	    {
+	      add = op1;
+	      off = op0;
+	      goto do_add;
+	    }
+	  break;
 	case MINUS_EXPR:
-	  if (TREE_CODE (TREE_OPERAND (off, 0)) == SSA_NAME
-	      && TREE_CODE (TREE_OPERAND (off, 1)) == SSA_NAME)
+	  if (is_gimple_min_invariant (op1)
+	      || (TREE_CODE (op1) == SSA_NAME
+		  && !chrec_contains_symbols_defined_in_loop (op1, loop->num)))
 	    {
-	      bool c0
-		= chrec_contains_symbols_defined_in_loop (TREE_OPERAND (off,
-									0),
-							  loop->num);
-	      bool c1
-		= chrec_contains_symbols_defined_in_loop (TREE_OPERAND (off,
-									1),
-							  loop->num);
-	      if (c0 && !c1)
-		{
-		  tree bo = fold_convert (sizetype, TREE_OPERAND (off, 1));
-		  if (scale != 1)
-		    bo = size_binop (MULT_EXPR, bo, size_int (scale));
-		  if (TREE_CODE (off) == MINUS_EXPR)
-		    bo = fold_build1 (NEGATE_EXPR, sizetype, bo);
-		  base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (base),
-				      base, bo);
-		  off = TREE_OPERAND (off, 0);
-		  break;
-		}
-	      if (!c0 && c1 && TREE_CODE (off) == PLUS_EXPR)
-		{
-		  tree bo = fold_convert (sizetype, TREE_OPERAND (off, 0));
-		  if (scale != 1)
-		    bo = size_binop (MULT_EXPR, bo, size_int (scale));
-		  base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (base),
-				      base, bo);
-		  off = TREE_OPERAND (off, 1);
-		  break;
-		}
+	      add = fold_convert (sizetype, op1);
+	      add = size_binop (MINUS_EXPR, size_zero_node, add);
+	      off = op0;
+	      goto do_add;
+	    }
+	  break;
+	case MULT_EXPR:
+	  if (scale == 1 && host_integerp (op1, 0))
+	    {
+	      scale = tree_low_cst (op1, 0);
+	      off = op0;
+	      continue;
+	    }
+	  break;
+	case SSA_NAME:
+	  off = op0;
+	  continue;
+	CASE_CONVERT:
+	  if (!POINTER_TYPE_P (TREE_TYPE (op0))
+	      && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
+	    break;
+	  if (TYPE_PRECISION (TREE_TYPE (op0))
+	      == TYPE_PRECISION (TREE_TYPE (off)))
+	    {
+	      off = op0;
+	      continue;
+	    }
+	  if (TYPE_PRECISION (TREE_TYPE (op0))
+	      < TYPE_PRECISION (TREE_TYPE (off)))
+	    {
+	      off = op0;
+	      offtype = TREE_TYPE (off);
+	      STRIP_NOPS (off);
+	      continue;
 	    }
-	  return NULL_TREE;
+	  break;
 	default:
-	  return NULL_TREE;
+	  break;
 	}
-    }
-  else
-    {
-      if (!integer_zerop (off))
-	return NULL_TREE;
-      else if (TREE_CODE (base) != SSA_NAME)
-	return NULL_TREE;
-      off = base;
-      base = build_int_cst (TREE_TYPE (DR_BASE_ADDRESS (dr)), 0);
+      break;
     }
 
-  if (!chrec_contains_symbols_defined_in_loop (off, loop->num))
+  if (TREE_CODE (off) != SSA_NAME
+      || !chrec_contains_symbols_defined_in_loop (off, loop->num))
     return NULL_TREE;
 
   if (offtype == NULL_TREE)