diff mbox

[6/6] Expand vector permutation with vec_perm and vec_perm_const.

Message ID 1318563805-7481-7-git-send-email-rth@redhat.com
State New
Headers show

Commit Message

Richard Henderson Oct. 14, 2011, 3:43 a.m. UTC
From: Richard Henderson <rth@twiddle.net>

---
 gcc/doc/md.texi         |    6 ++
 gcc/genopinit.c         |    1 +
 gcc/optabs.c            |  216 ++++++++++++++++++++++++++++++++++++-----------
 gcc/optabs.h            |   12 ++-
 gcc/tree-vect-generic.c |    2 +-
 5 files changed, 181 insertions(+), 56 deletions(-)
diff mbox

Patch

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index fe27210..68a5548 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -4041,6 +4041,12 @@  be computed modulo @math{2*@var{N}}.  Note that if
 @code{rtx_equal_p(operand1, operand2)}, this can be implemented
 with just operand 1 and selector elements modulo @var{N}.
 
+In order to make things easy for a number of targets, if there is no
+@samp{vec_perm} pattern for mode @var{m}, but there is for mode @var{q}
+where @var{q} is a vector of @code{QImode} of the same width as @var{m},
+the middle-end will lower the mode @var{m} @code{VEC_PERM_EXPR} to
+mode @var{q}.
+
 @cindex @code{vec_perm_const@var{m}} instruction pattern
 @item @samp{vec_perm_const@var{m}}
 Like @samp{vec_perm} except that the permutation is a compile-time
diff --git a/gcc/genopinit.c b/gcc/genopinit.c
index 4eefa03..d40e4c4 100644
--- a/gcc/genopinit.c
+++ b/gcc/genopinit.c
@@ -254,6 +254,7 @@  static const char * const optabs[] =
   "set_optab_handler (vec_shr_optab, $A, CODE_FOR_$(vec_shr_$a$))",
   "set_optab_handler (vec_realign_load_optab, $A, CODE_FOR_$(vec_realign_load_$a$))",
   "set_direct_optab_handler (vec_perm_optab, $A, CODE_FOR_$(vec_perm$a$))",
+  "set_direct_optab_handler (vec_perm_const_optab, $A, CODE_FOR_$(vec_perm_const$a$))",
   "set_convert_optab_handler (vcond_optab, $A, $B, CODE_FOR_$(vcond$a$b$))",
   "set_convert_optab_handler (vcondu_optab, $A, $B, CODE_FOR_$(vcondu$a$b$))",
   "set_optab_handler (ssum_widen_optab, $A, CODE_FOR_$(widen_ssum$I$a3$))",
diff --git a/gcc/optabs.c b/gcc/optabs.c
index e112467..e9a23f4 100644
--- a/gcc/optabs.c
+++ b/gcc/optabs.c
@@ -6687,87 +6687,203 @@  vector_compare_rtx (tree cond, bool unsignedp, enum insn_code icode)
 
 /* Return true if VEC_PERM_EXPR can be expanded using SIMD extensions
    of the CPU.  */
+
 bool
-expand_vec_perm_expr_p (enum machine_mode mode, tree v0, tree v1, tree mask)
+can_vec_perm_expr_p (tree type, tree sel)
 {
-  int v0_mode_s = GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (TREE_TYPE (v0))));
-  int mask_mode_s = GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (TREE_TYPE (mask))));
+  enum machine_mode mode, qimode;
+  mode = TYPE_MODE (type);
+
+  /* If the target doesn't implement a vector mode for the vector type,
+     then no operations are supported.  */
+  if (!VECTOR_MODE_P (mode))
+    return false;
+
+  if (TREE_CODE (sel) == VECTOR_CST)
+    {
+      if (direct_optab_handler (vec_perm_const_optab, mode) != CODE_FOR_nothing
+	  && targetm.vectorize.builtin_vec_perm_ok (type, sel))
+	return true;
+    }
 
-  if (TREE_CODE (mask) == VECTOR_CST
-      && targetm.vectorize.builtin_vec_perm_ok (TREE_TYPE (v0), mask))
+  if (direct_optab_handler (vec_perm_optab, mode) != CODE_FOR_nothing)
     return true;
 
-  if (v0_mode_s != mask_mode_s
-      || TYPE_VECTOR_SUBPARTS (TREE_TYPE (v0))
-	 != TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask))
-      || TYPE_VECTOR_SUBPARTS (TREE_TYPE (v1))
-	 != TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask)))
+  /* We allow fallback to a QI vector mode, and adjust the mask.  */
+  qimode = mode_for_vector (QImode, GET_MODE_SIZE (mode));
+  if (!VECTOR_MODE_P (qimode))
     return false;
 
-  return direct_optab_handler (vec_perm_optab, mode) != CODE_FOR_nothing;
+  /* ??? For completeness, we ought to check the QImode version of
+     vec_perm_const_optab.  But all users of this implicit lowering
+     feature implement the variable vec_perm_optab.  */
+  if (direct_optab_handler (vec_perm_optab, qimode) == CODE_FOR_nothing)
+    return false;
+
+  /* Lowering a non-constant permutation rescales the selector at run time,
+     so a MODE shift (for elements wider than 2) and a QIMODE add must exist.  */
+  if (TREE_CODE (sel) != VECTOR_CST)
+    {
+      if (GET_MODE_UNIT_SIZE (mode) > 2
+	  && optab_handler (ashl_optab, mode) == CODE_FOR_nothing
+	  && optab_handler (vashl_optab, mode) == CODE_FOR_nothing)
+	return false;
+      if (optab_handler (add_optab, qimode) == CODE_FOR_nothing)
+	return false;
+    }
+
+  return true;
 }
 
-/* Generate instructions for VEC_COND_EXPR given its type and three
-   operands.  */
-rtx
-expand_vec_perm_expr (tree type, tree v0, tree v1, tree mask, rtx target)
+/* Subroutine of expand_vec_perm_expr: emit one ICODE insn or return NULL.  */
+
+static rtx
+expand_vec_perm_expr_1 (enum insn_code icode, rtx target,
+			rtx v0, rtx v1, rtx sel)
 {
+  enum machine_mode tmode = GET_MODE (target);
+  enum machine_mode smode = GET_MODE (sel);
   struct expand_operand ops[4];
-  enum insn_code icode;
-  enum machine_mode mode = TYPE_MODE (type);
 
-  gcc_checking_assert (expand_vec_perm_expr_p (mode, v0, v1, mask));
+  create_output_operand (&ops[0], target, tmode);
+  create_input_operand (&ops[3], sel, smode);
 
-  if (TREE_CODE (mask) == VECTOR_CST)
+  /* Make an effort to preserve v0 == v1.  The target expander is able to
+     rely on this to determine if we're permuting a single input operand.  */
+  if (rtx_equal_p (v0, v1))
     {
-      tree m_type, call;
-      tree fn = targetm.vectorize.builtin_vec_perm (TREE_TYPE (v0), &m_type);
+      if (!insn_operand_matches (icode, 1, v0))
+        v0 = force_reg (tmode, v0);
+      gcc_checking_assert (insn_operand_matches (icode, 1, v0));
+      gcc_checking_assert (insn_operand_matches (icode, 2, v0));
 
-      if (!fn)
-	goto vec_perm;
+      create_fixed_operand (&ops[1], v0);
+      create_fixed_operand (&ops[2], v0);
+    }
+  else
+    {
+      create_input_operand (&ops[1], v0, tmode);
+      create_input_operand (&ops[2], v1, tmode);
+    }
 
-      if (m_type != TREE_TYPE (TREE_TYPE (mask)))
-	{
-	  int units = TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask));
-	  tree cvt = build_vector_type (m_type, units);
-	  mask = fold_convert (cvt, mask);
-	}
+  if (maybe_expand_insn (icode, 4, ops))
+    return ops[0].value;
+  return NULL_RTX;
+}
 
-      call = fold_build1 (ADDR_EXPR, build_pointer_type (TREE_TYPE (fn)), fn);
-      call = build_call_nary (type, call, 3, v0, v1, mask);
+/* Generate instructions for VEC_PERM_EXPR given its type and three
+   operands.  Return NULL_RTX if no usable insn pattern is available.  */
+rtx
+expand_vec_perm_expr (tree type, tree v0, tree v1, tree sel, rtx target)
+{
+  enum insn_code icode;
+  enum machine_mode mode = TYPE_MODE (type);
+  enum machine_mode qimode;
+  rtx v0_rtx, v1_rtx, sel_rtx, *vec, vt, tmp;
+  unsigned int i, w, e, u;
 
-      return expand_expr_real_1 (call, target, VOIDmode, EXPAND_NORMAL, NULL);
+  if (!target)
+    target = gen_reg_rtx (mode);
+  v0_rtx = expand_normal (v0);
+  if (operand_equal_p (v0, v1, 0))
+    v1_rtx = v0_rtx;
+  else
+    v1_rtx = expand_normal (v1);
+  sel_rtx = expand_normal (sel);
+
+  /* If the input is a constant, expand it specially.  */
+  if (CONSTANT_P (sel_rtx))
+    {
+      icode = direct_optab_handler (vec_perm_const_optab, mode);
+      if (icode != CODE_FOR_nothing
+	  && targetm.vectorize.builtin_vec_perm_ok (TREE_TYPE (v0), sel)
+	  && (tmp = expand_vec_perm_expr_1 (icode, target, v0_rtx,
+					    v1_rtx, sel_rtx)) != NULL)
+	return tmp;
     }
 
- vec_perm:
+  /* Otherwise fall back to a fully variable permutation.  */
   icode = direct_optab_handler (vec_perm_optab, mode);
+  if (icode != CODE_FOR_nothing
+      && (tmp = expand_vec_perm_expr_1 (icode, target, v0_rtx,
+					v1_rtx, sel_rtx)) != NULL)
+    return tmp;
+
+  /* As a special case to aid several targets, lower the element-based
+     permutation to a byte-based permutation and try again.  */
+  qimode = mode_for_vector (QImode, GET_MODE_SIZE (mode));
+  if (!VECTOR_MODE_P (qimode))
+    return NULL_RTX;
 
+  /* ??? For completeness, we ought to check the QImode version of
+     vec_perm_const_optab.  But all users of this implicit lowering
+     feature implement the variable vec_perm_optab.  */
+  icode = direct_optab_handler (vec_perm_optab, qimode);
   if (icode == CODE_FOR_nothing)
-    return 0;
+    return NULL_RTX;
 
-  create_output_operand (&ops[0], target, mode);
-  create_input_operand (&ops[3], expand_normal (mask),
-			TYPE_MODE (TREE_TYPE (mask)));
+  w = GET_MODE_SIZE (mode);
+  e = GET_MODE_NUNITS (mode);
+  u = GET_MODE_UNIT_SIZE (mode);
+  vec = XALLOCAVEC (rtx, w);
 
-  if (operand_equal_p (v0, v1, 0))
+  if (CONSTANT_P (sel_rtx))
     {
-      rtx rtx_v0 = expand_normal (v0);
-      if (!insn_operand_matches (icode, 1, rtx_v0))
-        rtx_v0 = force_reg (mode, rtx_v0);
-
-      gcc_checking_assert (insn_operand_matches (icode, 2, rtx_v0));
+      unsigned int j;
+      for (i = 0; i < e; ++i)
+	{
+	  unsigned int this_e = INTVAL (XVECEXP (sel_rtx, 0, i));
+	  this_e &= 2 * e - 1;
+	  this_e *= u;
 
-      create_fixed_operand (&ops[1], rtx_v0);
-      create_fixed_operand (&ops[2], rtx_v0);
+	  for (j = 0; j < u; ++j)
+	    vec[i * u + j] = GEN_INT (this_e + j);
+	}
+      sel_rtx = gen_rtx_CONST_VECTOR (qimode, gen_rtvec_v (w, vec));
     }
   else
     {
-      create_input_operand (&ops[1], expand_normal (v0), mode);
-      create_input_operand (&ops[2], expand_normal (v1), mode);
-    }
+      /* Multiply each element by its byte size; don't clobber SEL's rtx.  */
+      if (u == 2)
+	sel_rtx = expand_simple_binop (mode, PLUS, sel_rtx, sel_rtx,
+				       NULL_RTX, 0, OPTAB_DIRECT);
+      else
+	sel_rtx = expand_simple_binop (mode, ASHIFT, sel_rtx,
+				       GEN_INT (exact_log2 (u)),
+				       NULL_RTX, 0, OPTAB_DIRECT);
+      gcc_assert (sel_rtx);
 
-  expand_insn (icode, 4, ops);
-  return ops[0].value;
+      /* Broadcast the low byte of each element into each of its bytes.  */
+      for (i = 0; i < w; ++i)
+	{
+	  int this_e = i / u * u;
+	  if (BYTES_BIG_ENDIAN)
+	    this_e += u - 1;
+	  vec[i] = GEN_INT (this_e);
+	}
+      vt = gen_rtx_CONST_VECTOR (qimode, gen_rtvec_v (w, vec));
+      sel_rtx = gen_lowpart (qimode, sel_rtx);
+      sel_rtx = expand_vec_perm_expr_1 (icode, gen_reg_rtx (qimode),
+					sel_rtx, sel_rtx, vt);
+      gcc_assert (sel_rtx != NULL);
+
+      /* Add the byte offset to each byte element.  */
+      /* Note that the definition of the indices here is memory ordering,
+	 so there should be no difference between big and little endian.  */
+      for (i = 0; i < w; ++i)
+	vec[i] = GEN_INT (i % u);
+      vt = gen_rtx_CONST_VECTOR (qimode, gen_rtvec_v (w, vec));
+      sel_rtx = expand_simple_binop (qimode, PLUS, sel_rtx, vt,
+				     NULL_RTX, 0, OPTAB_DIRECT);
+      gcc_assert (sel_rtx);
+    }
+
+  tmp = expand_vec_perm_expr_1 (icode, gen_lowpart (qimode, target),
+			        gen_lowpart (qimode, v0_rtx),
+			        gen_lowpart (qimode, v1_rtx), sel_rtx);
+  gcc_assert (tmp != NULL);
+
+  return gen_lowpart (mode, tmp);
 }
 
 
diff --git a/gcc/optabs.h b/gcc/optabs.h
index 41ae7eb..926d21f 100644
--- a/gcc/optabs.h
+++ b/gcc/optabs.h
@@ -638,9 +638,6 @@  enum direct_optab_index
   DOI_reload_in,
   DOI_reload_out,
 
-  /* Vector shuffling.  */
-  DOI_vec_perm,
-
   /* Block move operation.  */
   DOI_movmem,
 
@@ -688,6 +685,10 @@  enum direct_optab_index
   /* Atomic clear with release semantics.  */
   DOI_sync_lock_release,
 
+  /* Vector permutation.  */
+  DOI_vec_perm,
+  DOI_vec_perm_const,
+
   DOI_MAX
 };
 
@@ -704,7 +705,6 @@  typedef struct direct_optab_d *direct_optab;
 #endif
 #define reload_in_optab (&direct_optab_table[(int) DOI_reload_in])
 #define reload_out_optab (&direct_optab_table[(int) DOI_reload_out])
-#define vec_perm_optab (&direct_optab_table[(int) DOI_vec_perm])
 #define movmem_optab (&direct_optab_table[(int) DOI_movmem])
 #define setmem_optab (&direct_optab_table[(int) DOI_setmem])
 #define cmpstr_optab (&direct_optab_table[(int) DOI_cmpstr])
@@ -734,6 +734,8 @@  typedef struct direct_optab_d *direct_optab;
   (&direct_optab_table[(int) DOI_sync_lock_test_and_set])
 #define sync_lock_release_optab \
   (&direct_optab_table[(int) DOI_sync_lock_release])
+#define vec_perm_optab (&direct_optab_table[(int) DOI_vec_perm])
+#define vec_perm_const_optab (&direct_optab_table[(int) DOI_vec_perm_const])
 
 /* Target-dependent globals.  */
 struct target_optabs {
@@ -889,7 +891,7 @@  extern rtx expand_vec_cond_expr (tree, tree, tree, tree, rtx);
 extern rtx expand_vec_shift_expr (sepops, rtx);
 
 /* Return tree if target supports vector operations for VEC_PERM_EXPR.  */
-bool expand_vec_perm_expr_p (enum machine_mode, tree, tree, tree);
+extern bool can_vec_perm_expr_p (tree, tree);
 
 /* Generate code for VEC_PERM_EXPR.  */
 extern rtx expand_vec_perm_expr (tree, tree, tree, tree, rtx);
diff --git a/gcc/tree-vect-generic.c b/gcc/tree-vect-generic.c
index 7fba9bb..663ea00 100644
--- a/gcc/tree-vect-generic.c
+++ b/gcc/tree-vect-generic.c
@@ -641,7 +641,7 @@  lower_vec_perm (gimple_stmt_iterator *gsi)
   location_t loc = gimple_location (gsi_stmt (*gsi));
   unsigned i;
 
-  if (expand_vec_perm_expr_p (TYPE_MODE (vect_type), vec0, vec1, mask))
+  if (can_vec_perm_expr_p (vect_type, mask))
     return;
   
   warning_at (loc, OPT_Wvector_operation_performance,