@@ -942,7 +942,7 @@ BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blen
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_nothing, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF)
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT)
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT)
-BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT)
+BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_insertps_v4sf, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT)
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT)
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_nothing, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI)
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT)
@@ -18985,6 +18985,78 @@ expand_vec_perm_movs (struct expand_vec_perm_d *d)
return true;
}
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
+ using insertps, i.e. one input with exactly one element replaced. */
+static bool
+expand_vec_perm_insertps (struct expand_vec_perm_d *d)
+{
+ machine_mode vmode = d->vmode;
+ unsigned i, cnt_s, nelt = d->nelt;
+ int cnt_d = -1; /* Position of the single replaced element; -1 = not found. */
+ rtx src, dst;
+
+ if (d->one_operand_p)
+ return false;
+
+ if (!(TARGET_SSE4_1
+ && (vmode == V4SFmode || vmode == V4SImode
+ || (TARGET_MMX_WITH_SSE
+ && (vmode == V2SFmode || vmode == V2SImode)))))
+ return false; /* Only V4SF/V4SI with SSE4.1, plus V2SF/V2SI under TARGET_MMX_WITH_SSE. */
+
+ for (i = 0; i < nelt; ++i) /* Pass 1: does D select op0 in place except at one index? */
+ {
+ if (d->perm[i] == i)
+ continue;
+ if (cnt_d != -1)
+ {
+ cnt_d = -1; /* Second mismatch: give up on op0 as the base. */
+ break;
+ }
+ cnt_d = i;
+ }
+
+ if (cnt_d == -1) /* Pass 2: op0 did not work as the base; try op1 (selectors i + nelt). */
+ {
+ for (i = 0; i < nelt; ++i)
+ {
+ if (d->perm[i] == i + nelt)
+ continue;
+ if (cnt_d != -1)
+ return false; /* More than one mismatch against either operand. */
+ cnt_d = i;
+ }
+
+ if (cnt_d == -1)
+ return false; /* No mismatch at all, so there is nothing to insert. */
+ }
+
+ if (d->testing_p)
+ return true; /* testing_p: report success without emitting anything. */
+
+ gcc_assert (cnt_d != -1);
+
+ cnt_s = d->perm[cnt_d]; /* Selector of the replacement element. NOTE(review): presumes it names the operand that is not the base - confirm. */
+ if (cnt_s < nelt)
+ {
+ src = d->op0; /* Selectors below nelt index into op0 ... */
+ dst = d->op1;
+ }
+ else
+ {
+ cnt_s -= nelt; /* ... and the rest index into op1. */
+ src = d->op1;
+ dst = d->op0;
+ }
+ gcc_assert (cnt_s < nelt);
+
+ rtx x = gen_sse4_1_insertps (vmode, d->target, dst, src,
+ GEN_INT (cnt_s << 6 | cnt_d << 4)); /* Source index at bit 6, destination index at bit 4. */
+ emit_insn (x);
+
+ return true;
+}
+
/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
@@ -19918,6 +19990,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_movs (d))
return true;
+ /* Try the SSE4.1 insertps instruction. */
+ if (expand_vec_perm_insertps (d))
+ return true;
+
/* Try the fully general two operand permute. */
if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
d->testing_p))
@@ -120,6 +120,7 @@ (define_c_enum "unspec" [
UNSPEC_MASKMOV
UNSPEC_MOVCC_MASK
UNSPEC_MOVMSK
+ UNSPEC_INSERTPS
UNSPEC_BLENDV
UNSPEC_PSHUFB
UNSPEC_XOP_PERMUTE
@@ -106,6 +106,10 @@ (define_mode_attr mmxintvecmode
(define_mode_attr mmxintvecmodelower
[(V2SF "v2si") (V2SI "v2si") (V4HI "v4hi") (V8QI "v8qi")])
+;; Mapping of the V2SI/V2SF vector modes to their scalar element modes
+(define_mode_attr mmxscalarmode
+ [(V2SI "SI") (V2SF "SF")])
+
(define_mode_attr Yv_Yw
[(V8QI "Yw") (V4HI "Yw") (V2SI "Yv") (V1DI "Yv") (V2SF "Yv")])
@@ -1154,6 +1158,42 @@ (define_expand "vcond<mode>v2sf"
DONE;
})
+(define_insn "@sse4_1_insertps_<mode>" ;; insertps/vinsertps for the V2FI modes
+ [(set (match_operand:V2FI 0 "register_operand" "=Yr,*x,v")
+ (unspec:V2FI
+ [(match_operand:V2FI 2 "nonimmediate_operand" "Yrm,*xm,vm") ;; vector supplying the inserted element; may be memory
+ (match_operand:V2FI 1 "register_operand" "0,0,v") ;; base vector; tied to operand 0 in the two noavx alternatives
+ (match_operand:SI 3 "const_0_to_255_operand")] ;; control immediate, 0..255
+ UNSPEC_INSERTPS))]
+ "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+{
+ if (MEM_P (operands[2]))
+ {
+ unsigned count_s = INTVAL (operands[3]) >> 6; /* Source element index, taken from the bits above bit 5. */
+ if (count_s)
+ operands[3] = GEN_INT (INTVAL (operands[3]) & 0x3f); /* Keep only the low six bits of the immediate. */
+ operands[2] = adjust_address_nv (operands[2],
+ <mmxscalarmode>mode, count_s * 4); /* Re-address the memory operand as a scalar at offset count_s * 4. */
+ }
+ switch (which_alternative)
+ {
+ case 0:
+ case 1:
+ return "insertps\t{%3, %2, %0|%0, %2, %3}";
+ case 2:
+ return "vinsertps\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+ default:
+ gcc_unreachable ();
+ }
+}
+ [(set_attr "isa" "noavx,noavx,avx")
+ (set_attr "type" "sselog")
+ (set_attr "prefix_data16" "1,1,*")
+ (set_attr "prefix_extra" "1")
+ (set_attr "length_immediate" "1")
+ (set_attr "prefix" "orig,orig,maybe_evex")
+ (set_attr "mode" "V4SF")])
+
(define_insn "*mmx_blendps"
[(set (match_operand:V2SF 0 "register_operand" "=Yr,*x,x")
(vec_merge:V2SF
@@ -38,7 +38,6 @@ (define_c_enum "unspec" [
UNSPEC_INSERTQ
;; For SSE4.1 support
- UNSPEC_INSERTPS
UNSPEC_DP
UNSPEC_MOVNTDQA
UNSPEC_MPSADBW
@@ -10959,12 +10958,13 @@ (define_insn_and_split "*vec_setv2di_0_zero_extendsi_1"
DONE;
})
-(define_insn "sse4_1_insertps"
- [(set (match_operand:V4SF 0 "register_operand" "=Yr,*x,v")
- (unspec:V4SF [(match_operand:V4SF 2 "nonimmediate_operand" "Yrm,*xm,vm")
- (match_operand:V4SF 1 "register_operand" "0,0,v")
- (match_operand:SI 3 "const_0_to_255_operand")]
- UNSPEC_INSERTPS))]
+(define_insn "@sse4_1_insertps_<mode>"
+ [(set (match_operand:VI4F_128 0 "register_operand" "=Yr,*x,v")
+ (unspec:VI4F_128
+ [(match_operand:VI4F_128 2 "nonimmediate_operand" "Yrm,*xm,vm")
+ (match_operand:VI4F_128 1 "register_operand" "0,0,v")
+ (match_operand:SI 3 "const_0_to_255_operand")]
+ UNSPEC_INSERTPS))]
"TARGET_SSE4_1"
{
if (MEM_P (operands[2]))
@@ -10972,7 +10972,8 @@ (define_insn "sse4_1_insertps"
unsigned count_s = INTVAL (operands[3]) >> 6;
if (count_s)
operands[3] = GEN_INT (INTVAL (operands[3]) & 0x3f);
- operands[2] = adjust_address_nv (operands[2], SFmode, count_s * 4);
+ operands[2] = adjust_address_nv (operands[2],
+ <ssescalarmode>mode, count_s * 4);
}
switch (which_alternative)
{
new file mode 100644
@@ -0,0 +1,14 @@
+/* PR target/94908 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.1" } */
+
+typedef float v4sf __attribute__((vector_size(16)));
+
+v4sf g();
+
+v4sf f(v4sf a, v4sf b) /* b is not used in the body. */
+{
+ return (v4sf){g()[1], a[1], a[2], a[3]}; /* a with element 0 replaced by element 1 of g(). */
+}
+
+/* { dg-final { scan-assembler "\[ \t\]v?insertps" } } */
new file mode 100644
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.1" } */
+
+typedef unsigned int v4si __attribute__((vector_size(16)));
+typedef float v4sf __attribute__((vector_size(16)));
+
+v4si foo_1(v4si x, v4si y) { return (v4si){x[0],y[3],x[2],x[3]}; } /* x, element 1 <- y[3] */
+v4si foo_2(v4si x, v4si y) { return (v4si){y[0],x[2],y[2],y[3]}; } /* y, element 1 <- x[2] */
+v4si foo_3(v4si x, v4si y) { return (v4si){x[3],y[1],y[2],y[3]}; } /* y, element 0 <- x[3] */
+
+v4sf bar_1(v4sf x, v4sf y) { return (v4sf){y[0],x[3],y[2],y[3]}; } /* y, element 1 <- x[3] */
+v4sf bar_2(v4sf x, v4sf y) { return (v4sf){x[0],y[2],x[2],x[3]}; } /* x, element 1 <- y[2] */
+v4sf bar_3(v4sf x, v4sf y) { return (v4sf){y[3],x[1],x[2],x[3]}; } /* x, element 0 <- y[3] */
+
+/* { dg-final { scan-assembler-times "\tv?insertps\t" 6 } } */
+/* { dg-final { scan-assembler-not "pshufd" } } */
+/* { dg-final { scan-assembler-not "pblendw" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+/* { dg-final { scan-assembler-not "blendps" } } */
new file mode 100644
@@ -0,0 +1,4 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O -msse4" } */
+#include "vperm-v4sf-2.c"