Patchwork 32-byte integer vec_interleave_{high,low}<mode>

login
register
mail settings
Submitter Jakub Jelinek
Date Oct. 14, 2011, 6:16 a.m.
Message ID <20111014061613.GU2210@tyan-ft48-01.lab.bos.redhat.com>
Download mbox | patch
Permalink /patch/119704/
State New
Headers show

Comments

Jakub Jelinek - Oct. 14, 2011, 6:16 a.m.
Hi!

This patch adds VI_256 vec_interleave_{high,low}<mode> as well as
using it in the vector expander.
While it needs 3 insns for each, the first two will be actually CSEd
if both patterns are expanded (the usual case from the vectorizer, e.g.
for vect-strided-store-u32-i2.c), so we end up with 2 vunpck* insns
followed by 2 vperm2i128 insns.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2011-10-14  Jakub Jelinek  <jakub@redhat.com>

	* config/i386/sse.md (vec_interleave_high<mode>,
	vec_interleave_low<mode>): Add AVX2 expanders for VI_256
	modes.
	* config/i386/i386.c (expand_vec_perm_interleave3): New function.
	(ix86_expand_vec_perm_builtin_1): Call it.


	Jakub
Richard Henderson - Oct. 14, 2011, 4:45 p.m.
On 10/13/2011 11:16 PM, Jakub Jelinek wrote:
> 2011-10-14  Jakub Jelinek  <jakub@redhat.com>
> 
> 	* config/i386/sse.md (vec_interleave_high<mode>,
> 	vec_interleave_low<mode>): Add AVX2 expanders for VI_256
> 	modes.
> 	* config/i386/i386.c (expand_vec_perm_interleave3): New function.
> 	(ix86_expand_vec_perm_builtin_1): Call it.

Ok.


r~

Patch

--- gcc/config/i386/sse.md.jj	2011-10-13 14:50:15.000000000 +0200
+++ gcc/config/i386/sse.md	2011-10-13 17:34:26.000000000 +0200
@@ -6765,6 +6765,38 @@  (define_insn "vec_interleave_lowv4si"
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "TI")])
 
+(define_expand "vec_interleave_high<mode>"
+  [(match_operand:VI_256 0 "register_operand" "=x")
+   (match_operand:VI_256 1 "register_operand" "x")
+   (match_operand:VI_256 2 "nonimmediate_operand" "xm")]
+ "TARGET_AVX2"
+{
+  rtx t1 = gen_reg_rtx (<MODE>mode);
+  rtx t2 = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_avx2_interleave_low<mode> (t1, operands[1], operands[2]));
+  emit_insn (gen_avx2_interleave_high<mode> (t2,  operands[1], operands[2]));
+  emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, operands[0]),
+				gen_lowpart (V4DImode, t1),
+				gen_lowpart (V4DImode, t2), GEN_INT (1 + (3 << 4))));
+  DONE;
+})
+
+(define_expand "vec_interleave_low<mode>"
+  [(match_operand:VI_256 0 "register_operand" "=x")
+   (match_operand:VI_256 1 "register_operand" "x")
+   (match_operand:VI_256 2 "nonimmediate_operand" "xm")]
+ "TARGET_AVX2"
+{
+  rtx t1 = gen_reg_rtx (<MODE>mode);
+  rtx t2 = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_avx2_interleave_low<mode> (t1, operands[1], operands[2]));
+  emit_insn (gen_avx2_interleave_high<mode> (t2, operands[1], operands[2]));
+  emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, operands[0]),
+				gen_lowpart (V4DImode, t1),
+				gen_lowpart (V4DImode, t2), GEN_INT (0 + (2 << 4))));
+  DONE;
+})
+
 ;; Modes handled by pinsr patterns.
 (define_mode_iterator PINSR_MODE
   [(V16QI "TARGET_SSE4_1") V8HI
--- gcc/config/i386/i386.c.jj	2011-10-13 11:56:19.000000000 +0200
+++ gcc/config/i386/i386.c	2011-10-13 18:36:58.000000000 +0200
@@ -35474,6 +35474,82 @@  expand_vec_perm_interleave2 (struct expa
   return true;
 }
 
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
+   a two vector permutation using 2 intra-lane interleave insns
+   and cross-lane shuffle for 32-byte vectors.  */
+
+static bool
+expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
+{
+  unsigned i, nelt;
+  rtx (*gen) (rtx, rtx, rtx);
+
+  if (d->op0 == d->op1)
+    return false;
+  if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
+    ;
+  else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
+    ;
+  else
+    return false;
+
+  nelt = d->nelt;
+  if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
+    return false;
+  for (i = 0; i < nelt; i += 2)
+    if (d->perm[i] != d->perm[0] + i / 2
+	|| d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  switch (d->vmode)
+    {
+    case V32QImode:
+      if (d->perm[0])
+	gen = gen_vec_interleave_highv32qi;
+      else
+	gen = gen_vec_interleave_lowv32qi;
+      break;
+    case V16HImode:
+      if (d->perm[0])
+	gen = gen_vec_interleave_highv16hi;
+      else
+	gen = gen_vec_interleave_lowv16hi;
+      break;
+    case V8SImode:
+      if (d->perm[0])
+	gen = gen_vec_interleave_highv8si;
+      else
+	gen = gen_vec_interleave_lowv8si;
+      break;
+    case V4DImode:
+      if (d->perm[0])
+	gen = gen_vec_interleave_highv4di;
+      else
+	gen = gen_vec_interleave_lowv4di;
+      break;
+    case V8SFmode:
+      if (d->perm[0])
+	gen = gen_vec_interleave_highv8sf;
+      else
+	gen = gen_vec_interleave_lowv8sf;
+      break;
+    case V4DFmode:
+      if (d->perm[0])
+	gen = gen_vec_interleave_highv4df;
+      else
+	gen = gen_vec_interleave_lowv4df;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  emit_insn (gen (d->target, d->op0, d->op1));
+  return true;
+}
+
 /* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
    permutation with two pshufb insns and an ior.  We should have already
    failed all two instruction sequences.  */
@@ -35972,6 +36048,9 @@  ix86_expand_vec_perm_builtin_1 (struct e
   if (expand_vec_perm_pshufb2 (d))
     return true;
 
+  if (expand_vec_perm_interleave3 (d))
+    return true;
+
   /* Try sequences of four instructions.  */
 
   if (expand_vec_perm_vpshufb2_vpermq (d))