i386: Implement 4-byte vector (V4QI/V2HI) constant permutations [PR100637]

Message ID CAFULd4YWR5JrU3h2QAKY1uJCth8S5gF3W3CRX-nKTDtbG8_J7A@mail.gmail.com
State New
Series i386: Implement 4-byte vector (V4QI/V2HI) constant permutations [PR100637]

Commit Message

Uros Bizjak July 5, 2021, 7:10 p.m. UTC
2021-07-05  Uroš Bizjak  <ubizjak@gmail.com>

gcc/
    PR target/100637
    * config/i386/i386-expand.c (ix86_split_mmx_punpck):
    Handle V4QI and V2HI modes.
    (expand_vec_perm_blend): Allow 4-byte vector modes with TARGET_SSE4_1.
    Handle V4QI mode.  Emit mmx_pblendvb32 for 4-byte modes.
    (expand_vec_perm_pshufb): Rewrite to use switch statements.
    Handle 4-byte dual operands with TARGET_XOP and single operands
    with TARGET_SSSE3.  Emit mmx_ppermv32 for TARGET_XOP and
    mmx_pshufbv4qi3 for TARGET_SSSE3.
    (expand_vec_perm_pblendv): Allow 4-byte vector modes with TARGET_SSE4_1.
    (expand_vec_perm_interleave2): Allow 4-byte vector modes.
    (expand_vec_perm_pshufb2): Allow 4-byte vector modes with TARGET_SSSE3.
    (expand_vec_perm_even_odd_1): Handle V4QI mode.
    (expand_vec_perm_broadcast_1): Handle V4QI mode.
    (ix86_vectorize_vec_perm_const): Handle V4QI mode.
    * config/i386/mmx.md (mmx_ppermv32): New insn pattern.
    (mmx_pshufbv4qi3): Ditto.
    (*mmx_pblendw32): Ditto.
    (*mmx_pblendw64): Rename from *mmx_pblendw.
    (mmx_punpckhbw_low): New insn_and_split pattern.
    (mmx_punpcklbw_low): Ditto.

All permutations are already checked in gcc.target/i386/vperm-v4qi.c.
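
For illustration, a permutation of the kind this patch handles, as a
minimal hand-written sketch (not the contents of the test file):

typedef unsigned char v4qi __attribute__ ((vector_size (4)));

v4qi
reverse (v4qi x)
{
  /* A V4QI constant permutation; with the new patterns it can be
     lowered to a single pshufb/pperm instead of element-wise
     scalar code.  */
  return __builtin_shuffle (x, (v4qi) { 3, 2, 1, 0 });
}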

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.
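
As background, the V4QI pshufb path emulates the 4-byte shuffle in a
full XMM register: control bytes with the top bit set zero the
inactive upper lanes, mirroring the rperm[] fill with m128 in
expand_vec_perm_pshufb.  A minimal sketch with SSSE3 intrinsics
(illustration only, not the GCC internals themselves):

#include <tmmintrin.h>

/* Reverse the 4 data bytes held in the low lanes of an XMM reg.  */
static __m128i
shuffle_v4qi_rev (__m128i x)
{
  /* Selectors 3..0 reverse bytes 0..3; -128 (0x80) makes pshufb
     write zero to the twelve inactive upper lanes.  */
  const __m128i ctl = _mm_setr_epi8 (3, 2, 1, 0,
                                     -128, -128, -128, -128,
                                     -128, -128, -128, -128,
                                     -128, -128, -128, -128);
  return _mm_shuffle_epi8 (x, ctl);
}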

Pushed to master.

Uros.

Patch

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index b37642e35ee..7f74653722c 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -933,6 +933,7 @@  ix86_split_mmx_punpck (rtx operands[], bool high_p)
 
   switch (mode)
     {
+    case E_V4QImode:
     case E_V8QImode:
       sse_mode = V16QImode;
       double_sse_mode = V32QImode;
@@ -949,6 +950,7 @@  ix86_split_mmx_punpck (rtx operands[], bool high_p)
       break;
 
     case E_V4HImode:
+    case E_V2HImode:
       sse_mode = V8HImode;
       double_sse_mode = V16HImode;
       mask = gen_rtx_PARALLEL (VOIDmode,
@@ -991,7 +993,7 @@  ix86_split_mmx_punpck (rtx operands[], bool high_p)
   rtx insn = gen_rtx_SET (dest, op2);
   emit_insn (insn);
 
-  /* Move bits 64:127 to bits 0:63.  */
+  /* Move high bits to low bits.  */
   if (high_p)
     {
       if (sse_mode == V4SFmode)
@@ -1004,9 +1006,19 @@  ix86_split_mmx_punpck (rtx operands[], bool high_p)
 	}
       else
 	{
-	  mask = gen_rtx_PARALLEL (VOIDmode,
-				   gen_rtvec (4, GEN_INT (2), GEN_INT (3),
-					      GEN_INT (0), GEN_INT (1)));
+	  int sz = GET_MODE_SIZE (mode);
+
+	  if (sz == 4)
+	    mask = gen_rtx_PARALLEL (VOIDmode,
+				     gen_rtvec (4, GEN_INT (1), GEN_INT (0),
+						GEN_INT (0), GEN_INT (1)));
+	  else if (sz == 8)
+	    mask = gen_rtx_PARALLEL (VOIDmode,
+				     gen_rtvec (4, GEN_INT (2), GEN_INT (3),
+						GEN_INT (0), GEN_INT (1)));
+	  else
+	    gcc_unreachable ();
+
 	  dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
 	  op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
 	}
@@ -17331,7 +17343,8 @@  expand_vec_perm_blend (struct expand_vec_perm_d *d)
   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
     ;
   else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
-			     || GET_MODE_SIZE (vmode) == 8))
+			     || GET_MODE_SIZE (vmode) == 8
+			     || GET_MODE_SIZE (vmode) == 4))
     ;
   else
     return false;
@@ -17408,7 +17421,9 @@  expand_vec_perm_blend (struct expand_vec_perm_d *d)
 	    vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
 	    vperm = force_reg (vmode, vperm);
 
-	    if (GET_MODE_SIZE (vmode) == 8)
+	    if (GET_MODE_SIZE (vmode) == 4)
+	      emit_insn (gen_mmx_pblendvb32 (target, op0, op1, vperm));
+	    else if (GET_MODE_SIZE (vmode) == 8)
 	      emit_insn (gen_mmx_pblendvb64 (target, op0, op1, vperm));
 	    else if (GET_MODE_SIZE (vmode) == 16)
 	      emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
@@ -17440,6 +17455,16 @@  expand_vec_perm_blend (struct expand_vec_perm_d *d)
       vmode = V4HImode;
       goto do_subreg;
 
+    case E_V4QImode:
+      for (i = 0; i < 4; i += 2)
+	if (d->perm[i] + 1 != d->perm[i + 1])
+	  goto use_pblendvb;
+
+      for (i = 0; i < 2; ++i)
+	mask |= (d->perm[i * 2] >= 4) << i;
+      vmode = V2HImode;
+      goto do_subreg;
+
     case E_V32QImode:
       /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
       for (i = 0; i < 32; i += 2)
@@ -17697,163 +17722,176 @@  expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
   nelt = d->nelt;
 
   if (!d->one_operand_p)
-    {
-      if (GET_MODE_SIZE (d->vmode) == 8)
-	{
-	  if (!TARGET_XOP)
-	    return false;
-	  vmode = V8QImode;
-	}
-      else if (GET_MODE_SIZE (d->vmode) == 16)
-	{
-	  if (!TARGET_XOP)
-	    return false;
-	}
-      else if (GET_MODE_SIZE (d->vmode) == 32)
-	{
-	  if (!TARGET_AVX2)
-	    return false;
+    switch (GET_MODE_SIZE (d->vmode))
+      {
+      case 4:
+	if (!TARGET_XOP)
+	  return false;
+	vmode = V4QImode;
+	break;
 
-	  if (valid_perm_using_mode_p (V2TImode, d))
-	    {
-	      if (d->testing_p)
-		return true;
+      case 8:
+	if (!TARGET_XOP)
+	  return false;
+	vmode = V8QImode;
+	break;
 
-	      /* Use vperm2i128 insn.  The pattern uses
-		 V4DImode instead of V2TImode.  */
-	      target = d->target;
-	      if (d->vmode != V4DImode)
-		target = gen_reg_rtx (V4DImode);
-	      op0 = gen_lowpart (V4DImode, d->op0);
-	      op1 = gen_lowpart (V4DImode, d->op1);
-	      rperm[0]
-		= GEN_INT ((d->perm[0] / (nelt / 2))
-			   | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
-	      emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
-	      if (target != d->target)
-		emit_move_insn (d->target, gen_lowpart (d->vmode, target));
-	      return true;
-	    }
+      case 16:
+	if (!TARGET_XOP)
 	  return false;
-	}
-      else
+	break;
+
+      case 32:
+	if (!TARGET_AVX2)
+	  return false;
+
+	if (valid_perm_using_mode_p (V2TImode, d))
+	  {
+	    if (d->testing_p)
+	      return true;
+
+	    /* Use vperm2i128 insn.  The pattern uses
+	       V4DImode instead of V2TImode.  */
+	    target = d->target;
+	    if (d->vmode != V4DImode)
+	      target = gen_reg_rtx (V4DImode);
+	    op0 = gen_lowpart (V4DImode, d->op0);
+	    op1 = gen_lowpart (V4DImode, d->op1);
+	    rperm[0]
+	      = GEN_INT ((d->perm[0] / (nelt / 2))
+			 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
+	    emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
+	    if (target != d->target)
+	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
+	    return true;
+	  }
+	/* FALLTHRU */
+
+      default:
 	return false;
-    }
+      }
   else
-    {
-      if (GET_MODE_SIZE (d->vmode) == 8)
-	{
-	  if (!TARGET_SSSE3)
-	    return false;
-	  vmode = V8QImode;
-	}
-      else if (GET_MODE_SIZE (d->vmode) == 16)
-	{
-	  if (!TARGET_SSSE3)
-	    return false;
-	}
-      else if (GET_MODE_SIZE (d->vmode) == 32)
-	{
-	  if (!TARGET_AVX2)
-	    return false;
+    switch (GET_MODE_SIZE (d->vmode))
+      {
+      case 4:
+	if (!TARGET_SSSE3)
+	  return false;
+	vmode = V4QImode;
+	break;
 
-	  /* V4DImode should be already handled through
-	     expand_vselect by vpermq instruction.  */
-	  gcc_assert (d->vmode != V4DImode);
+      case 8:
+	if (!TARGET_SSSE3)
+	  return false;
+	vmode = V8QImode;
+	break;
 
-	  vmode = V32QImode;
-	  if (d->vmode == V8SImode
-	      || d->vmode == V16HImode
-	      || d->vmode == V32QImode)
-	    {
-	      /* First see if vpermq can be used for
-		 V8SImode/V16HImode/V32QImode.  */
-	      if (valid_perm_using_mode_p (V4DImode, d))
-		{
-		  for (i = 0; i < 4; i++)
-		    perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
-		  if (d->testing_p)
+      case 16:
+	if (!TARGET_SSSE3)
+	  return false;
+	break;
+
+      case 32:
+	if (!TARGET_AVX2)
+	  return false;
+
+	/* V4DImode should be already handled through
+	   expand_vselect by vpermq instruction.  */
+	gcc_assert (d->vmode != V4DImode);
+
+	vmode = V32QImode;
+	if (d->vmode == V8SImode
+	    || d->vmode == V16HImode
+	    || d->vmode == V32QImode)
+	  {
+	    /* First see if vpermq can be used for
+	       V8SImode/V16HImode/V32QImode.  */
+	    if (valid_perm_using_mode_p (V4DImode, d))
+	      {
+		for (i = 0; i < 4; i++)
+		  perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
+		if (d->testing_p)
+		  return true;
+		target = gen_reg_rtx (V4DImode);
+		if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
+				    perm, 4, false))
+		  {
+		    emit_move_insn (d->target,
+				    gen_lowpart (d->vmode, target));
 		    return true;
-		  target = gen_reg_rtx (V4DImode);
-		  if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
-				      perm, 4, false))
-		    {
-		      emit_move_insn (d->target,
-				      gen_lowpart (d->vmode, target));
-		      return true;
-		    }
-		  return false;
-		}
+		  }
+		return false;
+	      }
 
-	      /* Next see if vpermd can be used.  */
-	      if (valid_perm_using_mode_p (V8SImode, d))
-		vmode = V8SImode;
-	    }
-	  /* Or if vpermps can be used.  */
-	  else if (d->vmode == V8SFmode)
-	    vmode = V8SImode;
+	    /* Next see if vpermd can be used.  */
+	    if (valid_perm_using_mode_p (V8SImode, d))
+	      vmode = V8SImode;
+	  }
+	/* Or if vpermps can be used.  */
+	else if (d->vmode == V8SFmode)
+	  vmode = V8SImode;
 
-	  if (vmode == V32QImode)
-	    {
-	      /* vpshufb only works intra lanes, it is not
-		 possible to shuffle bytes in between the lanes.  */
-	      for (i = 0; i < nelt; ++i)
-		if ((d->perm[i] ^ i) & (nelt / 2))
-		  return false;
-	    }
-	}
-      else if (GET_MODE_SIZE (d->vmode) == 64)
-	{
-	  if (!TARGET_AVX512BW)
-	    return false;
+	if (vmode == V32QImode)
+	  {
+	    /* vpshufb only works intra lanes, it is not
+	       possible to shuffle bytes in between the lanes.  */
+	    for (i = 0; i < nelt; ++i)
+	      if ((d->perm[i] ^ i) & (nelt / 2))
+		return false;
+	  }
+	break;
 
-	  /* If vpermq didn't work, vpshufb won't work either.  */
-	  if (d->vmode == V8DFmode || d->vmode == V8DImode)
-	    return false;
+      case 64:
+	if (!TARGET_AVX512BW)
+	  return false;
 
-	  vmode = V64QImode;
-	  if (d->vmode == V16SImode
-	      || d->vmode == V32HImode
-	      || d->vmode == V64QImode)
-	    {
-	      /* First see if vpermq can be used for
-		 V16SImode/V32HImode/V64QImode.  */
-	      if (valid_perm_using_mode_p (V8DImode, d))
-		{
-		  for (i = 0; i < 8; i++)
-		    perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
-		  if (d->testing_p)
+	/* If vpermq didn't work, vpshufb won't work either.  */
+	if (d->vmode == V8DFmode || d->vmode == V8DImode)
+	  return false;
+
+	vmode = V64QImode;
+	if (d->vmode == V16SImode
+	    || d->vmode == V32HImode
+	    || d->vmode == V64QImode)
+	  {
+	    /* First see if vpermq can be used for
+	       V16SImode/V32HImode/V64QImode.  */
+	    if (valid_perm_using_mode_p (V8DImode, d))
+	      {
+		for (i = 0; i < 8; i++)
+		  perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
+		if (d->testing_p)
+		  return true;
+		target = gen_reg_rtx (V8DImode);
+		if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
+				    perm, 8, false))
+		  {
+		    emit_move_insn (d->target,
+				    gen_lowpart (d->vmode, target));
 		    return true;
-		  target = gen_reg_rtx (V8DImode);
-		  if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
-				      perm, 8, false))
-		    {
-		      emit_move_insn (d->target,
-				      gen_lowpart (d->vmode, target));
-		      return true;
-		    }
-		  return false;
-		}
+		  }
+		return false;
+	      }
 
-	      /* Next see if vpermd can be used.  */
-	      if (valid_perm_using_mode_p (V16SImode, d))
-		vmode = V16SImode;
-	    }
-	  /* Or if vpermps can be used.  */
-	  else if (d->vmode == V16SFmode)
-	    vmode = V16SImode;
-	  if (vmode == V64QImode)
-	    {
-	      /* vpshufb only works intra lanes, it is not
-		 possible to shuffle bytes in between the lanes.  */
-	      for (i = 0; i < nelt; ++i)
-		if ((d->perm[i] ^ i) & (3 * nelt / 4))
-		  return false;
-	    }
-	}
-      else
+	    /* Next see if vpermd can be used.  */
+	    if (valid_perm_using_mode_p (V16SImode, d))
+	      vmode = V16SImode;
+	  }
+	/* Or if vpermps can be used.  */
+	else if (d->vmode == V16SFmode)
+	  vmode = V16SImode;
+	if (vmode == V64QImode)
+	  {
+	    /* vpshufb only works intra lanes, it is not
+	       possible to shuffle bytes in between the lanes.  */
+	    for (i = 0; i < nelt; ++i)
+	      if ((d->perm[i] ^ i) & (3 * nelt / 4))
+		return false;
+	  }
+	break;
+
+      default:
 	return false;
-    }
+      }
 
   if (d->testing_p)
     return true;
@@ -17893,22 +17931,27 @@  expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
 
   machine_mode vpmode = vmode;
 
-  if (vmode == V8QImode)
+  if (vmode == V4QImode
+      || vmode == V8QImode)
     {
       rtx m128 = GEN_INT (-128);
 
       /* Remap elements from the second operand, as we have to
-	 account for inactive top 8 elements from the first operand.  */
+	 account for inactive top elements from the first operand.  */
       if (!d->one_operand_p)
-	for (i = 0; i < nelt; ++i)
-	  {
-	    int ival = INTVAL (rperm[i]);
-	    if (ival >= 8)
-	      ival += 8;
-	    rperm[i] = GEN_INT (ival);
-	  }
+	{
+	  int sz = GET_MODE_SIZE (vmode);
 
-      /* V8QI is emulated with V16QI instruction, fill inactive
+	  for (i = 0; i < nelt; ++i)
+	    {
+	      int ival = INTVAL (rperm[i]);
+	      if (ival >= sz)
+		ival += 16 - sz;
+	      rperm[i] = GEN_INT (ival);
+	    }
+	}
+
+      /* V4QI/V8QI is emulated with V16QI instruction, fill inactive
-	 elements in the top 8 positions with zeros.  */
+	 elements in the top positions with zeros.  */
       for (i = nelt; i < 16; ++i)
 	rperm[i] = m128;
@@ -17931,7 +17974,9 @@  expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
     {
       rtx (*gen) (rtx, rtx, rtx);
 
-      if (vmode == V8QImode)
+      if (vmode == V4QImode)
+	gen = gen_mmx_pshufbv4qi3;
+      else if (vmode == V8QImode)
 	gen = gen_mmx_pshufbv8qi3;
       else if (vmode == V16QImode)
 	gen = gen_ssse3_pshufbv16qi3;
@@ -17958,7 +18003,9 @@  expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
 
       op1 = gen_lowpart (vmode, d->op1);
 
-      if (vmode == V8QImode)
+      if (vmode == V4QImode)
+	gen = gen_mmx_ppermv32;
+      else if (vmode == V8QImode)
 	gen = gen_mmx_ppermv64;
       else if (vmode == V16QImode)
 	gen = gen_xop_pperm;
@@ -18405,7 +18452,8 @@  expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
     ;
   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
     ;
-  else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 8
+  else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 4
+			     || GET_MODE_SIZE (vmode) == 8
 			     || GET_MODE_SIZE (vmode) == 16))
     ;
   else
@@ -18485,7 +18533,8 @@  expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
   rtx_insn *seq;
   bool ok, same_halves = false;
 
-  if (GET_MODE_SIZE (d->vmode) == 8
+  if (GET_MODE_SIZE (d->vmode) == 4
+      || GET_MODE_SIZE (d->vmode) == 8
       || GET_MODE_SIZE (d->vmode) == 16)
     {
       if (d->one_operand_p)
@@ -18521,7 +18570,8 @@  expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
   memset (remap, 0xff, sizeof (remap));
   dremap = *d;
 
-  if (GET_MODE_SIZE (d->vmode) == 8)
+  if (GET_MODE_SIZE (d->vmode) == 4
+      || GET_MODE_SIZE (d->vmode) == 8)
     {
       unsigned HOST_WIDE_INT h1, h2, h3, h4;
 
@@ -19269,7 +19319,8 @@  expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
     ;
   else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
-			     || GET_MODE_SIZE (vmode) == 8))
+			     || GET_MODE_SIZE (vmode) == 8
+			     || GET_MODE_SIZE (vmode) == 4))
     ;
   else
     return false;
@@ -19530,7 +19581,8 @@  expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
   rtx (*gen) (rtx, rtx, rtx);
 
   if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
-			&& GET_MODE_SIZE (d->vmode) != 8))
+			&& GET_MODE_SIZE (d->vmode) != 8
+			&& GET_MODE_SIZE (d->vmode) != 4))
     return false;
   gcc_assert (!d->one_operand_p);
 
@@ -19539,6 +19591,10 @@  expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
 
   switch (GET_MODE_SIZE (d->vmode))
     {
+    case 4:
+      mode = V4QImode;
+      gen = gen_mmx_pshufbv4qi3;
+      break;
     case 8:
       mode = V8QImode;
       gen = gen_mmx_pshufbv8qi3;
@@ -20025,6 +20081,26 @@  expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
 	return false;
       break;
 
+    case E_V4QImode:
+      if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
+	return expand_vec_perm_pshufb2 (d);
+      else
+	{
+	  if (d->testing_p)
+	    break;
+	  /* We need 2*log2(N)-1 operations to achieve odd/even
+	     with interleave.  */
+	  t1 = gen_reg_rtx (V4QImode);
+	  emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
+	  emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
+	  if (odd)
+	    t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
+	  else
+	    t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
+	  emit_insn (t2);
+	}
+      break;
+
     case E_V4HImode:
       if (TARGET_SSE4_1)
 	return expand_vec_perm_even_odd_pack (d);
@@ -20214,6 +20290,7 @@  expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
 {
   unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
   machine_mode vmode = d->vmode;
+  rtx (*gen) (rtx, rtx, rtx);
   unsigned char perm2[4];
   rtx op0 = d->op0, dest;
   bool ok;
@@ -20238,24 +20315,48 @@  expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
       /* These are always implementable using standard shuffle patterns.  */
       gcc_unreachable ();
 
+    case E_V4QImode:
+      /* This can be implemented via interleave and pshuflw.  */
+      if (d->testing_p)
+	return true;
+
+      if (elt >= nelt2)
+	{
+	  gen = gen_mmx_punpckhbw_low;
+	  elt -= nelt2;
+	}
+      else
+	gen = gen_mmx_punpcklbw_low;
+
+      dest = gen_reg_rtx (vmode);
+      emit_insn (gen (dest, op0, op0));
+      vmode = get_mode_wider_vector (vmode);
+      op0 = gen_lowpart (vmode, dest);
+
+      memset (perm2, elt, 2);
+      dest = gen_reg_rtx (vmode);
+      ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
+      gcc_assert (ok);
+
+      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
+      return true;
+
     case E_V8QImode:
-      /* These can be implemented via interleave.  We save one insn by
+      /* This can be implemented via interleave.  We save one insn by
 	 stopping once we have promoted to V2SImode and then use pshufd.  */
       if (d->testing_p)
 	return true;
       do
 	{
-	  rtx dest;
-	  rtx (*gen) (rtx, rtx, rtx)
-	    = vmode == V8QImode ? gen_mmx_punpcklbw
-				: gen_mmx_punpcklwd;
-
 	  if (elt >= nelt2)
 	    {
 	      gen = vmode == V8QImode ? gen_mmx_punpckhbw
 				      : gen_mmx_punpckhwd;
 	      elt -= nelt2;
 	    }
+	  else
+	    gen = vmode == V8QImode ? gen_mmx_punpcklbw
+				    : gen_mmx_punpcklwd;
 	  nelt2 /= 2;
 
 	  dest = gen_reg_rtx (vmode);
@@ -20266,11 +20367,11 @@  expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
       while (vmode != V2SImode);
 
       memset (perm2, elt, 2);
-      dest = gen_reg_rtx (V2SImode);
+      dest = gen_reg_rtx (vmode);
       ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
       gcc_assert (ok);
-      if (!d->testing_p)
-	emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
+
+      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
       return true;
 
     case E_V8HImode:
@@ -20281,17 +20382,15 @@  expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
 	return true;
       do
 	{
-	  rtx dest;
-	  rtx (*gen) (rtx, rtx, rtx)
-	    = vmode == V16QImode ? gen_vec_interleave_lowv16qi
-				 : gen_vec_interleave_lowv8hi;
-
 	  if (elt >= nelt2)
 	    {
 	      gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
 				       : gen_vec_interleave_highv8hi;
 	      elt -= nelt2;
 	    }
+	  else
+	    gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
+				     : gen_vec_interleave_lowv8hi;
 	  nelt2 /= 2;
 
 	  dest = gen_reg_rtx (vmode);
@@ -20302,11 +20401,11 @@  expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
       while (vmode != V4SImode);
 
       memset (perm2, elt, 4);
-      dest = gen_reg_rtx (V4SImode);
+      dest = gen_reg_rtx (vmode);
       ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
       gcc_assert (ok);
-      if (!d->testing_p)
-	emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
+
+      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
       return true;
 
     case E_V64QImode:
@@ -20787,6 +20886,10 @@  ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
       if (d.testing_p)
 	return true;
       break;
+    case E_V4QImode:
+      if (!TARGET_SSE2)
+	return false;
+      break;
     case E_V2DImode:
     case E_V2DFmode:
       if (!TARGET_SSE)
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 5f10572718d..4ead8beff50 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -2362,6 +2362,18 @@ 
   [(set_attr "type" "sse4arg")
    (set_attr "mode" "TI")])
 
+(define_insn "mmx_ppermv32"
+  [(set (match_operand:V4QI 0 "register_operand" "=x")
+	(unspec:V4QI
+	  [(match_operand:V4QI 1 "register_operand" "x")
+	   (match_operand:V4QI 2 "register_operand" "x")
+	   (match_operand:V16QI 3 "nonimmediate_operand" "xm")]
+	  UNSPEC_XOP_PERMUTE))]
+  "TARGET_XOP"
+  "vpperm\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "sse4arg")
+   (set_attr "mode" "TI")])
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Parallel integral logical operations
@@ -2550,6 +2562,23 @@ 
    (set_attr "type" "mmxcvt,sselog,sselog")
    (set_attr "mode" "DI,TI,TI")])
 
+(define_insn_and_split "mmx_punpckhbw_low"
+  [(set (match_operand:V4QI 0 "register_operand" "=x,Yw")
+	(vec_select:V4QI
+	  (vec_concat:V8QI
+	    (match_operand:V4QI 1 "register_operand" "0,Yw")
+	    (match_operand:V4QI 2 "register_operand" "x,Yw"))
+	  (parallel [(const_int 2) (const_int 6)
+		     (const_int 3) (const_int 7)])))]
+  "TARGET_SSE2"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+  "ix86_split_mmx_punpck (operands, true); DONE;"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "mode" "TI")])
+
 (define_insn_and_split "mmx_punpcklbw"
   [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yw")
 	(vec_select:V8QI
@@ -2573,6 +2602,23 @@ 
    (set_attr "type" "mmxcvt,sselog,sselog")
    (set_attr "mode" "DI,TI,TI")])
 
+(define_insn_and_split "mmx_punpcklbw_low"
+  [(set (match_operand:V4QI 0 "register_operand" "=x,Yw")
+	(vec_select:V4QI
+	  (vec_concat:V8QI
+	    (match_operand:V4QI 1 "register_operand" "0,Yw")
+	    (match_operand:V4QI 2 "register_operand" "x,Yw"))
+	  (parallel [(const_int 0) (const_int 4)
+		     (const_int 1) (const_int 5)])))]
+  "TARGET_SSE2"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+  "ix86_split_mmx_punpck (operands, false); DONE;"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "mode" "TI")])
+
 (define_insn_and_split "mmx_punpckhwd"
   [(set (match_operand:V4HI 0 "register_operand" "=y,x,Yw")
 	(vec_select:V4HI
@@ -2930,6 +2976,24 @@ 
    (set_attr "btver2_decode" "vector")
    (set_attr "mode" "TI")])
 
+(define_insn "mmx_pshufbv4qi3"
+  [(set (match_operand:V4QI 0 "register_operand" "=x,Yw")
+	(unspec:V4QI
+	  [(match_operand:V4QI 1 "register_operand" "0,Yw")
+	   (match_operand:V16QI 2 "vector_operand" "xBm,Ywm")]
+	  UNSPEC_PSHUFB))]
+  "TARGET_SSSE3"
+  "@
+   pshufb\t{%2, %0|%0, %2}
+   vpshufb\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog1")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,maybe_evex")
+   (set_attr "btver2_decode" "vector")
+   (set_attr "mode" "TI")])
+
 (define_expand "mmx_pshufw"
   [(match_operand:V4HI 0 "register_operand")
    (match_operand:V4HI 1 "register_mmxmem_operand")
@@ -3002,12 +3066,12 @@ 
    (set_attr "length_immediate" "1")
    (set_attr "mode" "TI")])
 
-(define_insn "*mmx_pblendw"
+(define_insn "*mmx_pblendw64"
   [(set (match_operand:V4HI 0 "register_operand" "=Yr,*x,x")
 	(vec_merge:V4HI
 	  (match_operand:V4HI 2 "register_operand" "Yr,*x,x")
 	  (match_operand:V4HI 1 "register_operand" "0,0,x")
-	  (match_operand:SI 3 "const_0_to_63_operand" "n,n,n")))]
+	  (match_operand:SI 3 "const_0_to_15_operand" "n,n,n")))]
   "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
   "@
    pblendw\t{%3, %2, %0|%0, %2, %3}
@@ -3020,6 +3084,24 @@ 
    (set_attr "prefix" "orig,orig,vex")
    (set_attr "mode" "TI")])
 
+(define_insn "*mmx_pblendw32"
+  [(set (match_operand:V2HI 0 "register_operand" "=Yr,*x,x")
+	(vec_merge:V2HI
+	  (match_operand:V2HI 2 "register_operand" "Yr,*x,x")
+	  (match_operand:V2HI 1 "register_operand" "0,0,x")
+	  (match_operand:SI 3 "const_0_to_7_operand" "n,n,n")))]
+  "TARGET_SSE4_1"
+  "@
+   pblendw\t{%3, %2, %0|%0, %2, %3}
+   pblendw\t{%3, %2, %0|%0, %2, %3}
+   vpblendw\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "orig,orig,vex")
+   (set_attr "mode" "TI")])
+
 ;; Optimize V2SImode load from memory, swapping the elements and
 ;; storing back into the memory into DImode rotate of the memory by 32.
 (define_split
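
For reference, the V4QI even/odd fallback above (the mmx_punpck*_low
sequence in expand_vec_perm_even_odd_1) uses the classic
2*log2(N)-1 interleave trick, i.e. three steps for N = 4.  A minimal
scalar model of those steps (the helpers model the new patterns; the
names and struct are illustrative, not GCC API):

#include <stdio.h>

typedef struct { unsigned char e[4]; } v4qi;

/* Model of mmx_punpcklbw_low: interleave the low halves.  */
static v4qi
punpcklbw_low (v4qi a, v4qi b)
{ return (v4qi) { { a.e[0], b.e[0], a.e[1], b.e[1] } }; }

/* Model of mmx_punpckhbw_low: interleave the high halves.  */
static v4qi
punpckhbw_low (v4qi a, v4qi b)
{ return (v4qi) { { a.e[2], b.e[2], a.e[3], b.e[3] } }; }

int
main (void)
{
  v4qi a = { { 0, 1, 2, 3 } }, b = { { 4, 5, 6, 7 } };
  v4qi t1 = punpckhbw_low (a, b);       /* {2, 6, 3, 7} */
  v4qi lo = punpcklbw_low (a, b);       /* {0, 4, 1, 5} */
  v4qi even = punpcklbw_low (lo, t1);   /* {0, 2, 4, 6} */
  v4qi odd  = punpckhbw_low (lo, t1);   /* {1, 3, 5, 7} */
  printf ("even: %d %d %d %d\nodd:  %d %d %d %d\n",
	  even.e[0], even.e[1], even.e[2], even.e[3],
	  odd.e[0], odd.e[1], odd.e[2], odd.e[3]);
  return 0;
}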