AVX2 vector permutation fixes plus vec_pack_trunc_{v16hi,v8si,v4di} support (take 2)

Message ID 20111012212304.GH2210@tyan-ft48-01.lab.bos.redhat.com
State New

Commit Message

Jakub Jelinek Oct. 12, 2011, 9:23 p.m. UTC
On Wed, Oct 12, 2011 at 10:49:33AM -0700, Richard Henderson wrote:
> I believe I've commented on everything else in the previous messages.

Here is an updated patch which should incorporate your comments from both
mails (thanks for them).  Bootstrapped/regtested on x86_64-linux and
i686-linux, ok for trunk?

2011-10-12  Jakub Jelinek  <jakub@redhat.com>

	* config/i386/i386.md (UNSPEC_VPERMDI): Remove.
	* config/i386/i386.c (ix86_expand_vec_perm): Handle
	V16HImode and V32QImode for TARGET_AVX2.
	(MAX_VECT_LEN): Increase to 32.
	(expand_vec_perm_blend): Add support for 32-byte integer
	vectors with TARGET_AVX2.
	(valid_perm_using_mode_p): New function.
	(expand_vec_perm_pshufb): Add support for 32-byte integer
	vectors with TARGET_AVX2.
	(expand_vec_perm_vpshufb2_vpermq): New function.
	(expand_vec_perm_vpshufb2_vpermq_even_odd): New function.
	(expand_vec_perm_even_odd_1): Handle 32-byte integer vectors
	with TARGET_AVX2.
	(ix86_expand_vec_perm_builtin_1): Try expand_vec_perm_vpshufb2_vpermq
	and expand_vec_perm_vpshufb2_vpermq_even_odd.
	* config/i386/sse.md (VEC_EXTRACT_EVENODD_MODE): Add 32-byte integer
	vector modes for TARGET_AVX2.
	(vec_pack_trunc_<mode>): Use VI248_AVX2 instead of VI248_128.
	(avx2_interleave_highv32qi, avx2_interleave_lowv32qi): Remove pasto.
	(avx2_pshufdv3, avx2_pshuflwv3, avx2_pshufhwv3): Generate
	4 new operands.
	(avx2_pshufd_1, avx2_pshuflw_1, avx2_pshufhw_1): Don't use
	match_dup; instead add 4 new operands and require that they
	have the right cross-lane values.
	(avx2_permv4di): Change into define_expand.
	(avx2_permv4di_1): New instruction.
	(avx2_permv2ti): Use nonimmediate_operand instead of register_operand
	for "xm" constrained operand.
	(VEC_PERM_AVX2): Add V32QI and V16HI for TARGET_AVX2.



	Jakub
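
[Editorial note, not part of the original mail.]  To see what the new
expansion paths mean at the source level, here is a minimal sketch using
GCC's generic __builtin_shuffle (added during the same 4.7 development
cycle); the function names are illustrative only and not part of the patch.
Compiled with -O2 -mavx2, variable V32QImode shuffles like these can now
expand to the vpshufb/vpermq/vpor sequences implemented below instead of
falling back to piecewise scalar code:

typedef unsigned char v32qi __attribute__ ((vector_size (32)));

/* One-operand variable shuffle; expandable as
   vpshufb; vpshufb; vpermq; vpor.  */
v32qi
shuf1 (v32qi a, v32qi idx)
{
  return __builtin_shuffle (a, idx);
}

/* Two-operand variant; the same trick applied to each input,
   with the two partial results merged at the end.  */
v32qi
shuf2 (v32qi a, v32qi b, v32qi idx)
{
  return __builtin_shuffle (a, b, idx);
}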

Comments

Richard Henderson Oct. 12, 2011, 9:39 p.m. UTC | #1
On 10/12/2011 02:23 PM, Jakub Jelinek wrote:
> 2011-10-12  Jakub Jelinek  <jakub@redhat.com>
> 
> 	* config/i386/i386.md (UNSPEC_VPERMDI): Remove.
> 	* config/i386/i386.c (ix86_expand_vec_perm): Handle
> 	V16HImode and V32QImode for TARGET_AVX2.
> 	(MAX_VECT_LEN): Increase to 32.
> 	(expand_vec_perm_blend): Add support for 32-byte integer
> 	vectors with TARGET_AVX2.
> 	(valid_perm_using_mode_p): New function.
> 	(expand_vec_perm_pshufb): Add support for 32-byte integer
> 	vectors with TARGET_AVX2.
> 	(expand_vec_perm_vpshufb2_vpermq): New function.
> 	(expand_vec_perm_vpshufb2_vpermq_even_odd): New function.
> 	(expand_vec_perm_even_odd_1): Handle 32-byte integer vectors
> 	with TARGET_AVX2.
> 	(ix86_expand_vec_perm_builtin_1): Try expand_vec_perm_vpshufb2_vpermq
> 	and expand_vec_perm_vpshufb2_vpermq_even_odd.
> 	* config/i386/sse.md (VEC_EXTRACT_EVENODD_MODE): Add 32-byte integer
> 	vector modes for TARGET_AVX2.
> 	(vec_pack_trunc_<mode>): Use VI248_AVX2 instead of VI248_128.
> 	(avx2_interleave_highv32qi, avx2_interleave_lowv32qi): Remove pasto.
> 	(avx2_pshufdv3, avx2_pshuflwv3, avx2_pshufhwv3): Generate
> 	4 new operands.
> 	(avx2_pshufd_1, avx2_pshuflw_1, avx2_pshufhw_1): Don't use
> 	match_dup; instead add 4 new operands and require that they
> 	have the right cross-lane values.
> 	(avx2_permv4di): Change into define_expand.
> 	(avx2_permv4di_1): New instruction.
> 	(avx2_permv2ti): Use nonimmediate_operand instead of register_operand
> 	for "xm" constrained operand.
> 	(VEC_PERM_AVX2): Add V32QI and V16HI for TARGET_AVX2.

Ok.


r~
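
[Editorial note, not part of the original thread.]  Before diving into the
patch, it may help to see the lane-crossing trick in isolation.  vpshufb on
256-bit operands only shuffles within each 128-bit lane, so the patch
synthesizes a general V32QImode shuffle from two masked vpshufb's, a vpermq
lane swap and a vpor (see ix86_expand_vec_perm and
expand_vec_perm_vpshufb2_vpermq below).  Here is a minimal standalone sketch
of the same idea using AVX2 intrinsics; this is an illustration only, not
code from the patch, and it builds the masks at run time where the patch
builds them at expand time:

#include <immintrin.h>
#include <stdint.h>

/* One-operand variable byte shuffle of a 32-byte vector: one vpshufb
   mask handles same-lane picks, a second mask (stored with its halves
   swapped) handles cross-lane picks; shuffle twice, swap the 128-bit
   lanes of the second result with vpermq, then vpor them together.  */
static __m256i
shuffle32 (__m256i op, const uint8_t perm[32])
{
  uint8_t same[32], cross[32];
  int i;

  for (i = 0; i < 32; i++)
    same[i] = cross[i] = 0x80;		/* bit 7 set: vpshufb writes 0 */
  for (i = 0; i < 32; i++)
    {
      uint8_t e = perm[i] & 15;
      if (((perm[i] ^ i) & 16) == 0)
	same[i] = e;			/* source is in the same lane */
      else
	cross[i ^ 16] = e;		/* fixed up by the lane swap */
    }
  __m256i l = _mm256_shuffle_epi8 (op,
				   _mm256_loadu_si256 ((const __m256i *) same));
  __m256i h = _mm256_shuffle_epi8 (op,
				   _mm256_loadu_si256 ((const __m256i *) cross));
  h = _mm256_permute4x64_epi64 (h, 0x4e);  /* swap the two 128-bit lanes */
  return _mm256_or_si256 (l, h);
}

The two-operand case in the patch runs this twice, once per input, and
merges the two partial results (the merge_two: path).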

Patch

--- gcc/config/i386/i386.md.jj	2011-10-12 20:28:19.000000000 +0200
+++ gcc/config/i386/i386.md	2011-10-12 20:30:00.000000000 +0200
@@ -235,7 +235,6 @@  (define_c_enum "unspec" [
   UNSPEC_VPERMSI
   UNSPEC_VPERMDF
   UNSPEC_VPERMSF
-  UNSPEC_VPERMDI
   UNSPEC_VPERMTI
   UNSPEC_GATHER
 
--- gcc/config/i386/i386.c.jj	2011-10-12 20:28:19.000000000 +0200
+++ gcc/config/i386/i386.c	2011-10-12 21:01:39.000000000 +0200
@@ -19334,7 +19334,7 @@  ix86_expand_vec_perm (rtx operands[])
   rtx op0 = operands[1];
   rtx op1 = operands[2];
   rtx mask = operands[3];
-  rtx t1, t2, vt, vec[16];
+  rtx t1, t2, t3, t4, vt, vt2, vec[32];
   enum machine_mode mode = GET_MODE (op0);
   enum machine_mode maskmode = GET_MODE (mask);
   int w, e, i;
@@ -19343,50 +19343,68 @@  ix86_expand_vec_perm (rtx operands[])
   /* Number of elements in the vector.  */
   w = GET_MODE_NUNITS (mode);
   e = GET_MODE_UNIT_SIZE (mode);
-  gcc_assert (w <= 16);
+  gcc_assert (w <= 32);
 
   if (TARGET_AVX2)
     {
-      if (mode == V4DImode || mode == V4DFmode)
+      if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
 	{
 	  /* Unfortunately, the VPERMQ and VPERMPD instructions only support
 	     an constant shuffle operand.  With a tiny bit of effort we can
 	     use VPERMD instead.  A re-interpretation stall for V4DFmode is
-	     unfortunate but there's no avoiding it.  */
-	  t1 = gen_reg_rtx (V8SImode);
+	     unfortunate but there's no avoiding it.
+	     Similarly for V16HImode we don't have instructions for variable
+	     shuffling, while for V32QImode we can, after preparing suitable
+	     masks, use vpshufb; vpshufb; vpermq; vpor.  */
+
+	  if (mode == V16HImode)
+	    {
+	      maskmode = mode = V32QImode;
+	      w = 32;
+	      e = 1;
+	    }
+	  else
+	    {
+	      maskmode = mode = V8SImode;
+	      w = 8;
+	      e = 4;
+	    }
+	  t1 = gen_reg_rtx (maskmode);
 
 	  /* Replicate the low bits of the V4DImode mask into V8SImode:
 	       mask = { A B C D }
 	       t1 = { A A B B C C D D }.  */
-	  for (i = 0; i < 4; ++i)
+	  for (i = 0; i < w / 2; ++i)
 	    vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
-	  vt = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, vec));
-	  vt = force_reg (V8SImode, vt);
-	  mask = gen_lowpart (V8SImode, mask);
-	  emit_insn (gen_avx2_permvarv8si (t1, vt, mask));
+	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
+	  vt = force_reg (maskmode, vt);
+	  mask = gen_lowpart (maskmode, mask);
+	  if (maskmode == V8SImode)
+	    emit_insn (gen_avx2_permvarv8si (t1, vt, mask));
+	  else
+	    emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
 
 	  /* Multiply the shuffle indicies by two.  */
-	  emit_insn (gen_avx2_lshlv8si3 (t1, t1, const1_rtx));
+	  t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
+				    OPTAB_DIRECT);
 
 	  /* Add one to the odd shuffle indicies:
 		t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
-	  for (i = 0; i < 4; ++i)
+	  for (i = 0; i < w / 2; ++i)
 	    {
 	      vec[i * 2] = const0_rtx;
 	      vec[i * 2 + 1] = const1_rtx;
 	    }
-	  vt = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, vec));
-	  vt = force_const_mem (V8SImode, vt);
-	  emit_insn (gen_addv8si3 (t1, t1, vt));
+	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
+	  vt = force_const_mem (maskmode, vt);
+	  t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
+				    OPTAB_DIRECT);
 
-	  /* Continue as if V8SImode was used initially.  */
+	  /* Continue as if V8SImode (resp. V32QImode) was used initially.  */
 	  operands[3] = mask = t1;
-	  target = gen_lowpart (V8SImode, target);
-	  op0 = gen_lowpart (V8SImode, op0);
-	  op1 = gen_lowpart (V8SImode, op1);
-	  maskmode = mode = V8SImode;
-	  w = 8;
-	  e = 4;
+	  target = gen_lowpart (mode, target);
+	  op0 = gen_lowpart (mode, op0);
+	  op1 = gen_lowpart (mode, op1);
 	}
 
       switch (mode)
@@ -19443,6 +19461,92 @@  ix86_expand_vec_perm (rtx operands[])
 	  emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
 	  return;
 
+	case V32QImode:
+	  t1 = gen_reg_rtx (V32QImode);
+	  t2 = gen_reg_rtx (V32QImode);
+	  t3 = gen_reg_rtx (V32QImode);
+	  vt2 = GEN_INT (128);
+	  for (i = 0; i < 32; i++)
+	    vec[i] = vt2;
+	  vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
+	  vt = force_reg (V32QImode, vt);
+	  for (i = 0; i < 32; i++)
+	    vec[i] = i < 16 ? vt2 : const0_rtx;
+	  vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
+	  vt2 = force_reg (V32QImode, vt2);
+	  /* From mask create two adjusted masks, which contain the same
+	     bits as mask in the low 7 bits of each vector element.
+	     The first mask will have the most significant bit clear
+	     if it requests element from the same 128-bit lane
+	     and MSB set if it requests element from the other 128-bit lane.
+	     The second mask will have the opposite values of the MSB,
+	     and additionally will have its 128-bit lanes swapped.
+	     E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
+	     t1   { 07 92 9e 09 ... | 17 19 85 1f ... } and
+	     t3   { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
+	     stands for the other 12 bytes.  */
+	  /* Bit 4 says whether an element comes from the same lane or
+	     the other lane, so shift it up by 3 to the MSB position.  */
+	  emit_insn (gen_avx2_lshlv4di3 (gen_lowpart (V4DImode, t1),
+					 gen_lowpart (V4DImode, mask),
+					 GEN_INT (3)));
+	  /* Clear MSB bits from the mask just in case it had them set.  */
+	  emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
+	  /* After this t1 will have MSB set for elements from the other lane.  */
+	  emit_insn (gen_xorv32qi3 (t1, t1, vt2));
+	  /* Clear bits other than MSB.  */
+	  emit_insn (gen_andv32qi3 (t1, t1, vt));
+	  /* Or in the lower bits from mask into t3.  */
+	  emit_insn (gen_iorv32qi3 (t3, t1, t2));
+	  /* And invert MSB bits in t1, so MSB is set for elements from the same
+	     lane.  */
+	  emit_insn (gen_xorv32qi3 (t1, t1, vt));
+	  /* Swap 128-bit lanes in t3.  */
+	  emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
+					  gen_lowpart (V4DImode, t3),
+					  const2_rtx, GEN_INT (3),
+					  const0_rtx, const1_rtx));
+	  /* And or in the lower bits from mask into t1.  */
+	  emit_insn (gen_iorv32qi3 (t1, t1, t2));
+	  if (one_operand_shuffle)
+	    {
+	      /* Each of these shuffles will put 0s in places where an
+		 element from the other 128-bit lane is needed; otherwise
+		 it will shuffle in the requested value.  */
+	      emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
+	      emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
+	      /* For t3 the 128-bit lanes are swapped again.  */
+	      emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
+					      gen_lowpart (V4DImode, t3),
+					      const2_rtx, GEN_INT (3),
+					      const0_rtx, const1_rtx));
+	      /* ORing both together yields the result.  */
+	      emit_insn (gen_iorv32qi3 (target, t1, t3));
+	      return;
+	    }
+
+	  t4 = gen_reg_rtx (V32QImode);
+	  /* Similar to the above one_operand_shuffle code,
+	     just repeated twice, once for each operand.  The merge_two:
+	     code will merge the two results together.  */
+	  emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
+	  emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
+	  emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
+	  emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
+	  emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
+					  gen_lowpart (V4DImode, t4),
+					  const2_rtx, GEN_INT (3),
+					  const0_rtx, const1_rtx));
+	  emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
+					  gen_lowpart (V4DImode, t3),
+					  const2_rtx, GEN_INT (3),
+					  const0_rtx, const1_rtx));
+	  emit_insn (gen_iorv32qi3 (t4, t2, t4));
+	  emit_insn (gen_iorv32qi3 (t3, t1, t3));
+	  t1 = t4;
+	  t2 = t3;
+	  goto merge_two;
+
 	default:
 	  gcc_assert (GET_MODE_SIZE (mode) <= 16);
 	  break;
@@ -31775,9 +31879,9 @@  x86_emit_floatuns (rtx operands[2])
   emit_label (donelab);
 }
 
-/* AVX does not support 32-byte integer vector operations,
-   thus the longest vector we are faced with is V16QImode.  */
-#define MAX_VECT_LEN	16
+/* AVX2 does support 32-byte integer vector operations,
+   thus the longest vector we are faced with is V32QImode.  */
+#define MAX_VECT_LEN	32
 
 struct expand_vec_perm_d
 {
@@ -34584,7 +34688,7 @@  expand_vselect_vconcat (rtx target, rtx 
 }
 
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
-   in terms of blendp[sd] / pblendw / pblendvb.  */
+   in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
 
 static bool
 expand_vec_perm_blend (struct expand_vec_perm_d *d)
@@ -34592,10 +34696,17 @@  expand_vec_perm_blend (struct expand_vec
   enum machine_mode vmode = d->vmode;
   unsigned i, mask, nelt = d->nelt;
   rtx target, op0, op1, x;
+  rtx rperm[32], vperm;
 
-  if (!TARGET_SSE4_1 || d->op0 == d->op1)
+  if (d->op0 == d->op1)
     return false;
-  if (!(GET_MODE_SIZE (vmode) == 16 || vmode == V4DFmode || vmode == V8SFmode))
+  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
+    ;
+  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
+    ;
+  else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
+    ;
+  else
     return false;
 
   /* This is a blend, not a permute.  Elements must stay in their
@@ -34613,30 +34724,6 @@  expand_vec_perm_blend (struct expand_vec
   /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
      decision should be extracted elsewhere, so that we only try that
      sequence once all budget==3 options have been tried.  */
-
-  /* For bytes, see if bytes move in pairs so we can use pblendw with
-     an immediate argument, rather than pblendvb with a vector argument.  */
-  if (vmode == V16QImode)
-    {
-      bool pblendw_ok = true;
-      for (i = 0; i < 16 && pblendw_ok; i += 2)
-	pblendw_ok = (d->perm[i] + 1 == d->perm[i + 1]);
-
-      if (!pblendw_ok)
-	{
-	  rtx rperm[16], vperm;
-
-	  for (i = 0; i < nelt; ++i)
-	    rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
-
-	  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm));
-	  vperm = force_reg (V16QImode, vperm);
-
-	  emit_insn (gen_sse4_1_pblendvb (d->target, d->op0, d->op1, vperm));
-	  return true;
-	}
-    }
-
   target = d->target;
   op0 = d->op0;
   op1 = d->op1;
@@ -34649,6 +34736,7 @@  expand_vec_perm_blend (struct expand_vec
     case V2DFmode:
     case V4SFmode:
     case V8HImode:
+    case V8SImode:
       for (i = 0; i < nelt; ++i)
 	mask |= (d->perm[i] >= nelt) << i;
       break;
@@ -34656,24 +34744,122 @@  expand_vec_perm_blend (struct expand_vec
     case V2DImode:
       for (i = 0; i < 2; ++i)
 	mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
+      vmode = V8HImode;
       goto do_subreg;
 
     case V4SImode:
       for (i = 0; i < 4; ++i)
 	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
+      vmode = V8HImode;
       goto do_subreg;
 
     case V16QImode:
+      /* See if bytes move in pairs so we can use pblendw with
+	 an immediate argument, rather than pblendvb with a vector
+	 argument.  */
+      for (i = 0; i < 16; i += 2)
+	if (d->perm[i] + 1 != d->perm[i + 1])
+	  {
+	  use_pblendvb:
+	    for (i = 0; i < nelt; ++i)
+	      rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
+
+	  finish_pblendvb:
+	    vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
+	    vperm = force_reg (vmode, vperm);
+
+	    if (GET_MODE_SIZE (vmode) == 16)
+	      emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
+	    else
+	      emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
+	    return true;
+	  }
+
       for (i = 0; i < 8; ++i)
 	mask |= (d->perm[i * 2] >= 16) << i;
+      vmode = V8HImode;
+      /* FALLTHRU */
 
     do_subreg:
-      vmode = V8HImode;
       target = gen_lowpart (vmode, target);
       op0 = gen_lowpart (vmode, op0);
       op1 = gen_lowpart (vmode, op1);
       break;
 
+    case V32QImode:
+      /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
+      for (i = 0; i < 32; i += 2)
+	if (d->perm[i] + 1 != d->perm[i + 1])
+	  goto use_pblendvb;
+      /* See if bytes move in quadruplets.  If yes, vpblendd
+	 with immediate can be used.  */
+      for (i = 0; i < 32; i += 4)
+	if (d->perm[i] + 2 != d->perm[i + 2])
+	  break;
+      if (i < 32)
+	{
+	  /* See if bytes move the same in both lanes.  If yes,
+	     vpblendw with immediate can be used.  */
+	  for (i = 0; i < 16; i += 2)
+	    if (d->perm[i] + 16 != d->perm[i + 16])
+	      goto use_pblendvb;
+
+	  /* Use vpblendw.  */
+	  for (i = 0; i < 16; ++i)
+	    mask |= (d->perm[i * 2] >= 32) << i;
+	  vmode = V16HImode;
+	  goto do_subreg;
+	}
+
+      /* Use vpblendd.  */
+      for (i = 0; i < 8; ++i)
+	mask |= (d->perm[i * 4] >= 32) << i;
+      vmode = V8SImode;
+      goto do_subreg;
+
+    case V16HImode:
+      /* See if words move in pairs.  If yes, vpblendd can be used.  */
+      for (i = 0; i < 16; i += 2)
+	if (d->perm[i] + 1 != d->perm[i + 1])
+	  break;
+      if (i < 16)
+	{
+	  /* See if words move the same in both lanes.  If not,
+	     vpblendvb must be used.  */
+	  for (i = 0; i < 8; i++)
+	    if (d->perm[i] + 8 != d->perm[i + 8])
+	      {
+		/* Use vpblendvb.  */
+		for (i = 0; i < 32; ++i)
+		  rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
+
+		vmode = V32QImode;
+		nelt = 32;
+		target = gen_lowpart (vmode, target);
+		op0 = gen_lowpart (vmode, op0);
+		op1 = gen_lowpart (vmode, op1);
+		goto finish_pblendvb;
+	      }
+
+	  /* Use vpblendw.  */
+	  for (i = 0; i < 16; ++i)
+	    mask |= (d->perm[i] >= 16) << i;
+	  break;
+	}
+
+      /* Use vpblendd.  */
+      for (i = 0; i < 8; ++i)
+	mask |= (d->perm[i * 2] >= 16) << i;
+      vmode = V8SImode;
+      goto do_subreg;
+
+    case V4DImode:
+      /* Use vpblendd.  */
+      for (i = 0; i < 4; ++i)
+	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
+      vmode = V8SImode;
+      goto do_subreg;
+
     default:
       gcc_unreachable ();
     }
@@ -34734,43 +34920,164 @@  expand_vec_perm_vpermil (struct expand_v
   return true;
 }
 
-/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
-   in terms of pshufb or vpperm.  */
+/* Return true if permutation D can be performed as a VMODE permutation
+   instead.  */
 
 static bool
-expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
+valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
 {
-  unsigned i, nelt, eltsz;
-  rtx rperm[16], vperm, target, op0, op1;
+  unsigned int i, j, chunk;
 
-  if (!(d->op0 == d->op1 ? TARGET_SSSE3 : TARGET_XOP))
-    return false;
-  if (GET_MODE_SIZE (d->vmode) != 16)
+  if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
+      || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
+      || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
     return false;
 
-  if (d->testing_p)
+  if (GET_MODE_NUNITS (vmode) >= d->nelt)
     return true;
 
+  chunk = d->nelt / GET_MODE_NUNITS (vmode);
+  for (i = 0; i < d->nelt; i += chunk)
+    if (d->perm[i] & (chunk - 1))
+      return false;
+    else
+      for (j = 1; j < chunk; ++j)
+	if ((d->perm[i] & (d->nelt - 1)) + j
+	    != (d->perm[i + j] & (d->nelt - 1)))
+	  return false;
+
+  return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
+   in terms of pshufb, vpperm, vpermq, vpermd or vperm2i128.  */
+
+static bool
+expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
+{
+  unsigned i, nelt, eltsz, mask;
+  unsigned char perm[32];
+  enum machine_mode vmode = V16QImode;
+  rtx rperm[32], vperm, target, op0, op1;
+
   nelt = d->nelt;
-  eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
 
-  for (i = 0; i < nelt; ++i)
+  if (d->op0 != d->op1)
     {
-      unsigned j, e = d->perm[i];
-      for (j = 0; j < eltsz; ++j)
-	rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
+      if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
+	{
+	  if (TARGET_AVX2
+	      && valid_perm_using_mode_p (V2TImode, d))
+	    {
+	      if (d->testing_p)
+		return true;
+
+	      /* Use vperm2i128 insn.  The pattern uses
+		 V4DImode instead of V2TImode.  */
+	      target = gen_lowpart (V4DImode, d->target);
+	      op0 = gen_lowpart (V4DImode, d->op0);
+	      op1 = gen_lowpart (V4DImode, d->op1);
+	      rperm[0]
+		= GEN_INT ((d->perm[0] / (nelt / 2))
+			   | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
+	      emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
+	      return true;
+	    }
+	  return false;
+	}
     }
+  else
+    {
+      if (GET_MODE_SIZE (d->vmode) == 16)
+	{
+	  if (!TARGET_SSSE3)
+	    return false;
+	}
+      else if (GET_MODE_SIZE (d->vmode) == 32)
+	{
+	  if (!TARGET_AVX2)
+	    return false;
 
-  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm));
-  vperm = force_reg (V16QImode, vperm);
+	  /* V4DImode should already be handled through
+	     expand_vselect by the vpermq instruction.  */
+	  gcc_assert (d->vmode != V4DImode);
+
+	  vmode = V32QImode;
+	  if (d->vmode == V8SImode
+	      || d->vmode == V16HImode
+	      || d->vmode == V32QImode)
+	    {
+	      /* First see if vpermq can be used for
+		 V8SImode/V16HImode/V32QImode.  */
+	      if (valid_perm_using_mode_p (V4DImode, d))
+		{
+		  for (i = 0; i < 4; i++)
+		    perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
+		  if (d->testing_p)
+		    return true;
+		  return expand_vselect (gen_lowpart (V4DImode, d->target),
+					 gen_lowpart (V4DImode, d->op0),
+					 perm, 4);
+		}
+
+	      /* Next see if vpermd can be used.  */
+	      if (valid_perm_using_mode_p (V8SImode, d))
+		vmode = V8SImode;
+	    }
+
+	  if (vmode == V32QImode)
+	    {
+	      /* vpshufb only works intra-lane; it is not
+		 possible to shuffle bytes between the lanes.  */
+	      for (i = 0; i < nelt; ++i)
+		if ((d->perm[i] ^ i) & (nelt / 2))
+		  return false;
+	    }
+	}
+      else
+	return false;
+    }
+
+  if (d->testing_p)
+    return true;
 
-  target = gen_lowpart (V16QImode, d->target);
-  op0 = gen_lowpart (V16QImode, d->op0);
+  if (vmode == V8SImode)
+    for (i = 0; i < 8; ++i)
+      rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
+  else
+    {
+      eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
+      if (d->op0 != d->op1)
+	mask = 2 * nelt - 1;
+      else if (vmode == V16QImode)
+	mask = nelt - 1;
+      else
+	mask = nelt / 2 - 1;
+
+      for (i = 0; i < nelt; ++i)
+	{
+	  unsigned j, e = d->perm[i] & mask;
+	  for (j = 0; j < eltsz; ++j)
+	    rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
+	}
+    }
+
+  vperm = gen_rtx_CONST_VECTOR (vmode,
+				gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
+  vperm = force_reg (vmode, vperm);
+
+  target = gen_lowpart (vmode, d->target);
+  op0 = gen_lowpart (vmode, d->op0);
   if (d->op0 == d->op1)
-    emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
+    {
+      if (vmode == V16QImode)
+	emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
+      else if (vmode == V32QImode)
+	emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
+    }
   else
     {
-      op1 = gen_lowpart (V16QImode, d->op1);
+      op1 = gen_lowpart (vmode, d->op1);
       emit_insn (gen_xop_pperm (target, op0, op1, vperm));
     }
 
@@ -34858,7 +35165,8 @@  expand_vec_perm_1 (struct expand_vec_per
   if (expand_vec_perm_vpermil (d))
     return true;
 
-  /* Try the SSSE3 pshufb or XOP vpperm variable permutation.  */
+  /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
+     vpshufb, vpermd or vpermq variable permutation.  */
   if (expand_vec_perm_pshufb (d))
     return true;
 
@@ -35158,6 +35466,150 @@  expand_vec_perm_pshufb2 (struct expand_v
   return true;
 }
 
+/* Implement an arbitrary permutation of a single V32QImode or V16HImode
+   operand with two vpshufb insns, vpermq and vpor.  We should have
+   already failed all two or three instruction sequences.  */
+
+static bool
+expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
+{
+  rtx rperm[2][32], vperm, l, h, hp, op, m128;
+  unsigned int i, nelt, eltsz;
+
+  if (!TARGET_AVX2
+      || d->op0 != d->op1
+      || (d->vmode != V32QImode && d->vmode != V16HImode))
+    return false;
+
+  nelt = d->nelt;
+  eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
+
+  /* Generate two permutation masks.  If the required element is within
+     the same lane, it is shuffled in.  If the required element is from
+     the other lane, force a zero by setting bit 7 in the permutation mask.
+     The other mask has non-negative elements where an element is
+     requested from the other lane, but those entries are also moved to
+     the other lane, so that after vpshufb the two V2TImode halves of
+     the result can be swapped.  */
+  m128 = GEN_INT (-128);
+  for (i = 0; i < nelt; ++i)
+    {
+      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
+      unsigned which = ((d->perm[i] ^ i) & (nelt / 2));
+
+      for (j = 0; j < eltsz; ++j)
+	{
+	  rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
+	  rperm[!which][(i * eltsz + j) ^ (which ^ (nelt / 2))] = m128;
+	}
+    }
+
+  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
+  vperm = force_reg (V32QImode, vperm);
+
+  h = gen_reg_rtx (V32QImode);
+  op = gen_lowpart (V32QImode, d->op0);
+  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
+
+  /* Swap the 128-bit lanes of h into hp.  */
+  hp = gen_reg_rtx (V32QImode);
+  op = gen_lowpart (V4DImode, h);
+  emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, hp), op,
+				  const2_rtx, GEN_INT (3), const0_rtx,
+				  const1_rtx));
+
+  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
+  vperm = force_reg (V32QImode, vperm);
+
+  l = gen_reg_rtx (V32QImode);
+  op = gen_lowpart (V32QImode, d->op0);
+  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
+
+  op = gen_lowpart (V32QImode, d->target);
+  emit_insn (gen_iorv32qi3 (op, l, hp));
+
+  return true;
+}
+
+/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
+   and extract-odd permutations of two V32QImode or V16HImode operands
+   with two vpshufb insns, vpor and vpermq.  We should have already
+   failed all two or three instruction sequences.  */
+
+static bool
+expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
+{
+  rtx rperm[2][32], vperm, l, h, ior, op, m128;
+  unsigned int i, nelt, eltsz;
+
+  if (!TARGET_AVX2
+      || d->op0 == d->op1
+      || (d->vmode != V32QImode && d->vmode != V16HImode))
+    return false;
+
+  for (i = 0; i < d->nelt; ++i)
+    if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  nelt = d->nelt;
+  eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
+
+  /* Generate two permutation masks.  In the first permutation mask
+     the first quarter will contain indexes for the first half
+     of op0, the second quarter will contain bit 7 set, the third
+     quarter will contain indexes for the second half of op0 and the
+     last quarter bit 7 set.  In the second permutation mask
+     the first quarter will contain bit 7 set, the second quarter
+     indexes for the first half of op1, the third quarter bit 7 set
+     and the last quarter indexes for the second half of op1.
+     I.e. the first mask e.g. for V32QImode extract even will be:
+     0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
+     (all values masked with 0xf except for -128) and second mask
+     for extract even will be
+     -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
+  m128 = GEN_INT (-128);
+  for (i = 0; i < nelt; ++i)
+    {
+      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
+      unsigned which = d->perm[i] >= nelt;
+      unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
+
+      for (j = 0; j < eltsz; ++j)
+	{
+	  rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
+	  rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
+	}
+    }
+
+  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
+  vperm = force_reg (V32QImode, vperm);
+
+  l = gen_reg_rtx (V32QImode);
+  op = gen_lowpart (V32QImode, d->op0);
+  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
+
+  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
+  vperm = force_reg (V32QImode, vperm);
+
+  h = gen_reg_rtx (V32QImode);
+  op = gen_lowpart (V32QImode, d->op1);
+  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
+
+  ior = gen_reg_rtx (V32QImode);
+  emit_insn (gen_iorv32qi3 (ior, l, h));
+
+  /* Permute the V4DImode quarters using the { 0, 2, 1, 3 } permutation.  */
+  op = gen_lowpart (V4DImode, d->target);
+  ior = gen_lowpart (V4DImode, ior);
+  emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
+				  const1_rtx, GEN_INT (3)));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement extract-even
    and extract-odd permutations.  */
 
@@ -35267,6 +35719,61 @@  expand_vec_perm_even_odd_1 (struct expan
 	}
       break;
 
+    case V16HImode:
+    case V32QImode:
+      return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
+
+    case V4DImode:
+      t1 = gen_reg_rtx (V4DImode);
+      t2 = gen_reg_rtx (V4DImode);
+
+      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
+      emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
+      emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
+
+      /* Now a vpunpck[lh]qdq will produce the required result.  */
+      if (odd)
+	t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
+      else
+	t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
+      emit_insn (t3);
+      break;
+
+    case V8SImode:
+      t1 = gen_reg_rtx (V8SImode);
+      t2 = gen_reg_rtx (V8SImode);
+
+      /* Shuffle the lanes around into
+	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
+      emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
+				    gen_lowpart (V4DImode, d->op0),
+				    gen_lowpart (V4DImode, d->op1),
+				    GEN_INT (0x20)));
+      emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
+				    gen_lowpart (V4DImode, d->op0),
+				    gen_lowpart (V4DImode, d->op1),
+				    GEN_INT (0x31)));
+
+      /* Swap the 2nd and 3rd position in each lane into
+	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
+      emit_insn (gen_avx2_pshufdv3 (t1, t1,
+				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
+      emit_insn (gen_avx2_pshufdv3 (t2, t2,
+				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
+
+      /* Now a vpunpck[lh]qdq will produce
+	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
+      if (odd)
+	t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
+					   gen_lowpart (V4DImode, t1),
+					   gen_lowpart (V4DImode, t2));
+      else
+	t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
+					  gen_lowpart (V4DImode, t1),
+					  gen_lowpart (V4DImode, t2));
+      emit_insn (t3);
+      break;
+
     default:
       gcc_unreachable ();
     }
@@ -35401,6 +35908,14 @@  ix86_expand_vec_perm_builtin_1 (struct e
   if (expand_vec_perm_pshufb2 (d))
     return true;
 
+  /* Try sequences of four instructions.  */
+
+  if (expand_vec_perm_vpshufb2_vpermq (d))
+    return true;
+
+  if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
+    return true;
+
   /* ??? Look for narrow permutations whose element orderings would
      allow the promotion to a wider mode.  */
 
--- gcc/config/i386/sse.md.jj	2011-10-12 20:28:19.000000000 +0200
+++ gcc/config/i386/sse.md	2011-10-12 20:30:00.000000000 +0200
@@ -4330,10 +4330,10 @@  (define_expand "sse2_shufpd"
 
 ;; Modes handled by vec_extract_even/odd pattern.
 (define_mode_iterator VEC_EXTRACT_EVENODD_MODE
-  [(V16QI "TARGET_SSE2")
-   (V8HI "TARGET_SSE2")
-   (V4SI "TARGET_SSE2")
-   (V2DI "TARGET_SSE2")
+  [(V32QI "TARGET_AVX2") (V16QI "TARGET_SSE2")
+   (V16HI "TARGET_AVX2") (V8HI "TARGET_SSE2")
+   (V8SI "TARGET_AVX2") (V4SI "TARGET_SSE2")
+   (V4DI "TARGET_AVX2") (V2DI "TARGET_SSE2")
    (V8SF "TARGET_AVX") V4SF
    (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
 
@@ -6196,11 +6196,9 @@  (define_expand "vcondu<VI8F_128:mode>v2d
   DONE;
 })
 
-;; ??? Irritatingly, the 256-bit VPSHUFB only shuffles within the 128-bit
-;; lanes.  For now, we don't try to support V32QI or V16HImode.  So we
-;; don't want to use VI_AVX2.
 (define_mode_iterator VEC_PERM_AVX2
   [V16QI V8HI V4SI V2DI V4SF V2DF
+   (V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")
    (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")
    (V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2")])
 
@@ -6431,8 +6429,8 @@  (define_insn "*<code>tf3"
 
 (define_expand "vec_pack_trunc_<mode>"
   [(match_operand:<ssepackmode> 0 "register_operand" "")
-   (match_operand:VI248_128 1 "register_operand" "")
-   (match_operand:VI248_128 2 "register_operand" "")]
+   (match_operand:VI248_AVX2 1 "register_operand" "")
+   (match_operand:VI248_AVX2 2 "register_operand" "")]
   "TARGET_SSE2"
 {
   rtx op1 = gen_lowpart (<ssepackmode>mode, operands[1]);
@@ -6513,8 +6511,7 @@  (define_insn "avx2_interleave_highv32qi"
 		     (const_int 28) (const_int 60)
 		     (const_int 29) (const_int 61)
 		     (const_int 30) (const_int 62)
-		     (const_int 31) (const_int 63)
-		     (const_int 32) (const_int 64)])))]
+		     (const_int 31) (const_int 63)])))]
   "TARGET_AVX2"
   "vpunpckhbw\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "type" "sselog")
@@ -6559,7 +6556,6 @@  (define_insn "avx2_interleave_lowv32qi"
 		     (const_int 5) (const_int 37)
 		     (const_int 6) (const_int 38)
 		     (const_int 7) (const_int 39)
-		     (const_int 15) (const_int 47)
 		     (const_int 16) (const_int 48)
 		     (const_int 17) (const_int 49)
 		     (const_int 18) (const_int 50)
@@ -6919,7 +6915,11 @@  (define_expand "avx2_pshufdv3"
 				GEN_INT ((mask >> 0) & 3),
 				GEN_INT ((mask >> 2) & 3),
 				GEN_INT ((mask >> 4) & 3),
-				GEN_INT ((mask >> 6) & 3)));
+				GEN_INT ((mask >> 6) & 3),
+				GEN_INT (((mask >> 0) & 3) + 4),
+				GEN_INT (((mask >> 2) & 3) + 4),
+				GEN_INT (((mask >> 4) & 3) + 4),
+				GEN_INT (((mask >> 6) & 3) + 4)));
   DONE;
 })
 
@@ -6931,11 +6931,15 @@  (define_insn "avx2_pshufd_1"
 		     (match_operand 3 "const_0_to_3_operand" "")
 		     (match_operand 4 "const_0_to_3_operand" "")
 		     (match_operand 5 "const_0_to_3_operand" "")
-		     (match_dup 2)
-		     (match_dup 3)
-		     (match_dup 4)
-		     (match_dup 5)])))]
-  "TARGET_AVX2"
+		     (match_operand 6 "const_4_to_7_operand" "")
+		     (match_operand 7 "const_4_to_7_operand" "")
+		     (match_operand 8 "const_4_to_7_operand" "")
+		     (match_operand 9 "const_4_to_7_operand" "")])))]
+  "TARGET_AVX2
+   && INTVAL (operands[2]) + 4 == INTVAL (operands[6])
+   && INTVAL (operands[3]) + 4 == INTVAL (operands[7])
+   && INTVAL (operands[4]) + 4 == INTVAL (operands[8])
+   && INTVAL (operands[5]) + 4 == INTVAL (operands[9])"
 {
   int mask = 0;
   mask |= INTVAL (operands[2]) << 0;
@@ -7002,7 +7006,11 @@  (define_expand "avx2_pshuflwv3"
 				 GEN_INT ((mask >> 0) & 3),
 				 GEN_INT ((mask >> 2) & 3),
 				 GEN_INT ((mask >> 4) & 3),
-				 GEN_INT ((mask >> 6) & 3)));
+				 GEN_INT ((mask >> 6) & 3),
+				 GEN_INT (((mask >> 0) & 3) + 8),
+				 GEN_INT (((mask >> 2) & 3) + 8),
+				 GEN_INT (((mask >> 4) & 3) + 8),
+				 GEN_INT (((mask >> 6) & 3) + 8)));
   DONE;
 })
 
@@ -7018,15 +7026,19 @@  (define_insn "avx2_pshuflw_1"
 		     (const_int 5)
 		     (const_int 6)
 		     (const_int 7)
-		     (match_dup 2)
-		     (match_dup 3)
-		     (match_dup 4)
-		     (match_dup 5)
+		     (match_operand 6 "const_8_to_11_operand" "")
+		     (match_operand 7 "const_8_to_11_operand" "")
+		     (match_operand 8 "const_8_to_11_operand" "")
+		     (match_operand 9 "const_8_to_11_operand" "")
 		     (const_int 12)
 		     (const_int 13)
 		     (const_int 14)
 		     (const_int 15)])))]
-  "TARGET_AVX2"
+  "TARGET_AVX2
+   && INTVAL (operands[2]) + 8 == INTVAL (operands[6])
+   && INTVAL (operands[3]) + 8 == INTVAL (operands[7])
+   && INTVAL (operands[4]) + 8 == INTVAL (operands[8])
+   && INTVAL (operands[5]) + 8 == INTVAL (operands[9])"
 {
   int mask = 0;
   mask |= INTVAL (operands[2]) << 0;
@@ -7098,7 +7110,11 @@  (define_expand "avx2_pshufhwv3"
 				 GEN_INT (((mask >> 0) & 3) + 4),
 				 GEN_INT (((mask >> 2) & 3) + 4),
 				 GEN_INT (((mask >> 4) & 3) + 4),
-				 GEN_INT (((mask >> 6) & 3) + 4)));
+				 GEN_INT (((mask >> 6) & 3) + 4),
+				 GEN_INT (((mask >> 0) & 3) + 12),
+				 GEN_INT (((mask >> 2) & 3) + 12),
+				 GEN_INT (((mask >> 4) & 3) + 12),
+				 GEN_INT (((mask >> 6) & 3) + 12)));
   DONE;
 })
 
@@ -7118,11 +7134,15 @@  (define_insn "avx2_pshufhw_1"
 		     (const_int 9)
 		     (const_int 10)
 		     (const_int 11)
-		     (match_dup 2)
-		     (match_dup 3)
-		     (match_dup 4)
-		     (match_dup 5)])))]
-  "TARGET_AVX2"
+		     (match_operand 6 "const_12_to_15_operand" "")
+		     (match_operand 7 "const_12_to_15_operand" "")
+		     (match_operand 8 "const_12_to_15_operand" "")
+		     (match_operand 9 "const_12_to_15_operand" "")])))]
+  "TARGET_AVX2
+   && INTVAL (operands[2]) + 8 == INTVAL (operands[6])
+   && INTVAL (operands[3]) + 8 == INTVAL (operands[7])
+   && INTVAL (operands[4]) + 8 == INTVAL (operands[8])
+   && INTVAL (operands[5]) + 8 == INTVAL (operands[9])"
 {
   int mask = 0;
   mask |= (INTVAL (operands[2]) - 4) << 0;
@@ -11526,14 +11546,39 @@  (define_insn "avx2_permvarv8sf"
    (set_attr "prefix" "vex")
    (set_attr "mode" "OI")])
 
-(define_insn "avx2_permv4di"
+(define_expand "avx2_permv4di"
+  [(match_operand:V4DI 0 "register_operand" "")
+   (match_operand:V4DI 1 "nonimmediate_operand" "")
+   (match_operand:SI 2 "const_0_to_255_operand" "")]
+  "TARGET_AVX2"
+{
+  int mask = INTVAL (operands[2]);
+  emit_insn (gen_avx2_permv4di_1 (operands[0], operands[1],
+				  GEN_INT ((mask >> 0) & 3),
+				  GEN_INT ((mask >> 2) & 3),
+				  GEN_INT ((mask >> 4) & 3),
+				  GEN_INT ((mask >> 6) & 3)));
+  DONE;
+})
+
+(define_insn "avx2_permv4di_1"
   [(set (match_operand:V4DI 0 "register_operand" "=x")
-	(unspec:V4DI
-	  [(match_operand:V4DI 1 "register_operand" "xm")
-	   (match_operand:SI 2 "const_0_to_255_operand" "n")]
-	  UNSPEC_VPERMDI))]
+	(vec_select:V4DI
+	  (match_operand:V4DI 1 "nonimmediate_operand" "xm")
+	  (parallel [(match_operand 2 "const_0_to_3_operand" "")
+		     (match_operand 3 "const_0_to_3_operand" "")
+		     (match_operand 4 "const_0_to_3_operand" "")
+		     (match_operand 5 "const_0_to_3_operand" "")])))]
   "TARGET_AVX2"
-  "vpermq\t{%2, %1, %0|%0, %1, %2}"
+{
+  int mask = 0;
+  mask |= INTVAL (operands[2]) << 0;
+  mask |= INTVAL (operands[3]) << 2;
+  mask |= INTVAL (operands[4]) << 4;
+  mask |= INTVAL (operands[5]) << 6;
+  operands[2] = GEN_INT (mask);
+  return "vpermq\t{%2, %1, %0|%0, %1, %2}";
+}
   [(set_attr "type" "sselog")
    (set_attr "prefix" "vex")
    (set_attr "mode" "OI")])
@@ -11542,7 +11587,7 @@  (define_insn "avx2_permv2ti"
   [(set (match_operand:V4DI 0 "register_operand" "=x")
 	(unspec:V4DI
 	  [(match_operand:V4DI 1 "register_operand" "x")
-	   (match_operand:V4DI 2 "register_operand" "xm")
+	   (match_operand:V4DI 2 "nonimmediate_operand" "xm")
 	   (match_operand:SI 3 "const_0_to_255_operand" "n")]
 	  UNSPEC_VPERMTI))]
   "TARGET_AVX2"