Patchwork i386: Add AVX2 support to ix86_expand_vshuffle.

login
register
mail settings
Submitter Richard Henderson
Date Oct. 6, 2011, 4:47 p.m.
Message ID <1317919633-17991-3-git-send-email-rth@redhat.com>
Download mbox | patch
Permalink /patch/118132/
State New
Headers show

Comments

Richard Henderson - Oct. 6, 2011, 4:47 p.m.
This was tested only via compile, and inspecting the output.

I'm attempting to set up the Intel SDE as a sim target for
the testsuite, but apparently it only supports 32-bit binaries.


r~


---
 gcc/ChangeLog          |    9 ++++
 gcc/config/i386/i386.c |  112 ++++++++++++++++++++++++++++++++++++++++++++++--
 gcc/config/i386/sse.md |   31 ++++++++------
 3 files changed, 135 insertions(+), 17 deletions(-)

+	* config/i386/i386.c (ix86_expand_vshuffle): Add AVX2 support.
+	* config/i386/sse.md (sseshuffint): Remove.
+	(sseintvecmode): Support V16HI, V8HI, V32QI, V16QI.
+	(VSHUFFLE_AVX2): New mode iterator.
+	(vshuffle<mode>): Use it.
+	(avx_vec_concat<V_256>): Rename from *vec_concat<V_256>_avx.

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 688fba1..9960fd2 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19312,17 +19312,120 @@  ix86_expand_vshuffle (rtx operands[])
   rtx op0 = operands[1];
   rtx op1 = operands[2];
   rtx mask = operands[3];
-  rtx vt, vec[16];
+  rtx t1, t2, vt, vec[16];
   enum machine_mode mode = GET_MODE (op0);
   enum machine_mode maskmode = GET_MODE (mask);
   int w, e, i;
   bool one_operand_shuffle = rtx_equal_p (op0, op1);
 
-  gcc_checking_assert (GET_MODE_BITSIZE (mode) == 128);
-
   /* Number of elements in the vector.  */
   w = GET_MODE_NUNITS (mode);
   e = GET_MODE_UNIT_SIZE (mode);
+  gcc_assert (w <= 16);
+
+  if (TARGET_AVX2)
+    {
+      if (mode == V4DImode || mode == V4DFmode)
+	{
+	  /* Unfortunately, the VPERMQ and VPERMPD instructions only support
+	     a constant shuffle operand.  With a tiny bit of effort we can
+	     use VPERMD instead.  A re-interpretation stall for V4DFmode is
+	     unfortunate but there's no avoiding it.  */
+	  t1 = gen_reg_rtx (V8SImode);
+
+	  /* Replicate the low bits of the V4DImode mask into V8SImode:
+	       mask = { A B C D }
+	       t1 = { A A B B C C D D }.  */
+	  for (i = 0; i < 4; ++i)
+	    vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
+	  vt = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, vec));
+	  vt = force_reg (V8SImode, vt);
+	  mask = gen_lowpart (V8SImode, mask);
+	  emit_insn (gen_avx2_permvarv8si (t1, vt, mask));
+
+	  /* Multiply the shuffle indices by two.  */
+	  emit_insn (gen_avx2_lshlv8si3 (t1, t1, const1_rtx));
+
+	  /* Add one to the odd shuffle indices:
+		t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
+	  for (i = 0; i < 4; ++i)
+	    {
+	      vec[i * 2] = const0_rtx;
+	      vec[i * 2 + 1] = const1_rtx;
+	    }
+	  vt = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, vec));
+	  vt = force_const_mem (V8SImode, vt);
+	  emit_insn (gen_addv8si3 (t1, t1, vt));
+
+	  /* Continue as if V8SImode was used initially.  */
+	  operands[3] = mask = t1;
+	  target = gen_lowpart (V8SImode, target);
+	  op0 = gen_lowpart (V8SImode, op0);
+	  op1 = gen_lowpart (V8SImode, op1);
+	  maskmode = mode = V8SImode;
+	  w = 8;
+	  e = 4;
+	}
+
+      switch (mode)
+	{
+	case V8SImode:
+	  /* The VPERMD and VPERMPS instructions already properly ignore
+	     the high bits of the shuffle elements.  No need for us to
+	     perform an AND ourselves.  */
+	  if (one_operand_shuffle)
+	    emit_insn (gen_avx2_permvarv8si (target, mask, op0));
+	  else
+	    {
+	      t1 = gen_reg_rtx (V8SImode);
+	      t2 = gen_reg_rtx (V8SImode);
+	      emit_insn (gen_avx2_permvarv8si (t1, mask, op0));
+	      emit_insn (gen_avx2_permvarv8si (t2, mask, op1));
+	      goto merge_two;
+	    }
+	  return;
+
+	case V8SFmode:
+	  mask = gen_lowpart (V8SFmode, mask);
+	  if (one_operand_shuffle)
+	    emit_insn (gen_avx2_permvarv8sf (target, mask, op0));
+	  else
+	    {
+	      t1 = gen_reg_rtx (V8SFmode);
+	      t2 = gen_reg_rtx (V8SFmode);
+	      emit_insn (gen_avx2_permvarv8sf (t1, mask, op0));
+	      emit_insn (gen_avx2_permvarv8sf (t2, mask, op1));
+	      goto merge_two;
+	    }
+	  return;
+
+        case V4SImode:
+	  /* By combining the two 128-bit input vectors into one 256-bit
+	     input vector, we can use VPERMD and VPERMPS for the full
+	     two-operand shuffle.  */
+	  t1 = gen_reg_rtx (V8SImode);
+	  t2 = gen_reg_rtx (V8SImode);
+	  emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
+	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
+	  emit_insn (gen_avx2_permvarv8si (t1, t2, t1));
+	  emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
+	  return;
+
+        case V4SFmode:
+	  t1 = gen_reg_rtx (V8SFmode);
+	  t2 = gen_reg_rtx (V8SFmode);
+	  mask = gen_lowpart (V4SFmode, mask);
+	  emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
+	  emit_insn (gen_avx_vec_concatv8sf (t2, mask, mask));
+	  emit_insn (gen_avx2_permvarv8sf (t1, t2, t1));
+	  emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
+	  return;
+
+	default:
+	  gcc_assert (GET_MODE_SIZE (mode) <= 16);
+	  break;
+	}
+    }
 
   if (TARGET_XOP)
     {
@@ -19394,7 +19497,7 @@  ix86_expand_vshuffle (rtx operands[])
     }
   else
     {
-      rtx xops[6], t1, t2;
+      rtx xops[6];
       bool ok;
 
       /* Shuffle the two input vectors independently.  */
@@ -19403,6 +19506,7 @@  ix86_expand_vshuffle (rtx operands[])
       emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
       emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
 
+ merge_two:
       /* Then merge them together.  The key is whether any given control
          element contained a bit set that indicates the second word.  */
       mask = operands[3];
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 88f4d6c..bf1d448 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -230,19 +230,16 @@ 
    (V4SF "V4SF") (V2DF "V2DF")
    (TI "TI")])
 
-;; All 128bit vector modes
-(define_mode_attr sseshuffint
-  [(V16QI "V16QI") (V8HI "V8HI")
-   (V4SI "V4SI")  (V2DI "V2DI")
-   (V4SF "V4SI") (V2DF "V2DI")])
-
 ;; Mapping of vector float modes to an integer mode of the same size
 (define_mode_attr sseintvecmode
   [(V8SF "V8SI") (V4DF "V4DI")
    (V4SF "V4SI") (V2DF "V2DI")
    (V4DF "V4DI") (V8SF "V8SI")
    (V8SI "V8SI") (V4DI "V4DI")
-   (V4SI "V4SI") (V2DI "V2DI")])
+   (V4SI "V4SI") (V2DI "V2DI")
+   (V16HI "V16HI") (V8HI "V8HI")
+   (V32QI "V32QI") (V16QI "V16QI")
+  ])
 
 ;; Mapping of vector modes to a vector mode of double size
 (define_mode_attr ssedoublevecmode
@@ -6226,12 +6223,20 @@ 
   DONE;
 })
 
+;; ??? Irritatingly, the 256-bit VPSHUFB only shuffles within the 128-bit
+;; lanes.  For now, we don't try to support V32QI or V16HImode.  So we
+;; don't want to use VI_AVX2.
+(define_mode_iterator VSHUFFLE_AVX2
+  [V16QI V8HI V4SI V2DI V4SF V2DF
+   (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")
+   (V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2")])
+
 (define_expand "vshuffle<mode>"
-  [(match_operand:V_128 0 "register_operand" "")
-   (match_operand:V_128 1 "register_operand" "")
-   (match_operand:V_128 2 "register_operand" "")
-   (match_operand:<sseshuffint> 3 "register_operand" "")]
-  "TARGET_SSSE3 || TARGET_AVX"
+  [(match_operand:VSHUFFLE_AVX2 0 "register_operand" "")
+   (match_operand:VSHUFFLE_AVX2 1 "register_operand" "")
+   (match_operand:VSHUFFLE_AVX2 2 "register_operand" "")
+   (match_operand:<sseintvecmode> 3 "register_operand" "")]
+  "TARGET_SSSE3 || TARGET_AVX || TARGET_XOP"
 {
   ix86_expand_vshuffle (operands);
   DONE;
@@ -12397,7 +12402,7 @@ 
    (set_attr "prefix" "vex")
    (set_attr "mode" "TI")])
 
-(define_insn "*vec_concat<mode>_avx"
+(define_insn "avx_vec_concat<mode>"
   [(set (match_operand:V_256 0 "register_operand" "=x,x")
 	(vec_concat:V_256
 	  (match_operand:<ssehalfvecmode> 1 "register_operand" "x,x")