diff mbox series

i386: Improve permutations with INSERTPS instruction [PR94908]

Message ID CAFULd4ahvuDwL58Vqj2zVwzyio_yZMA8hKqMZoomfTDNuT0q8Q@mail.gmail.com
State New
Headers show
Series i386: Improve permutations with INSERTPS instruction [PR94908] | expand

Commit Message

Uros Bizjak April 18, 2023, 5:01 p.m. UTC
INSERTPS can select any element from src and insert it into any position
of the dest.  For SSE4.1 targets, the compiler can generate e.g.

    insertps $64, %xmm0, %xmm1

to insert element 1 from %xmm0 to element 0 of %xmm1.

gcc/ChangeLog:

    PR target/94908
    * config/i386/i386-builtin.def (__builtin_ia32_insertps128):
    Use CODE_FOR_sse4_1_insertps_v4sf.
    * config/i386/i386-expand.cc (expand_vec_perm_insertps): New.
    (expand_vec_perm_1): Call expand_vec_perm_insertps.
    * config/i386/i386.md ("unspec"): Declare UNSPEC_INSERTPS here.
    * config/i386/mmx.md (mmxscalarmode): New mode attribute.
    (@sse4_1_insertps_<mode>): New insn pattern.
    * config/i386/sse.md (@sse4_1_insertps_<mode>): Macroize insn
    pattern from sse4_1_insertps using VI4F_128 mode iterator.

gcc/testsuite/ChangeLog:

    PR target/94908
    * gcc.target/i386/pr94908.c: New test.
    * gcc.target/i386/sse4_1-insertps-5.c: New test.
    * gcc.target/i386/vperm-v4sf-2-sse4.c: New test.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Pushed to master.

Uros.
diff mbox series

Patch

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 6dae6972d81..f7cf105ae69 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -942,7 +942,7 @@  BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blen
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_nothing, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF)
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT)
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT)
-BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT)
+BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_insertps_v4sf, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT)
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT)
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_nothing, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI)
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT)
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 0d817fc3f3b..9fa549c4c3b 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -18985,6 +18985,78 @@  expand_vec_perm_movs (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
+   using insertps.  This handles permutations whose result is one of
+   the two operands with exactly one element replaced.  Bits 7:6 of the
+   immediate select the source element, bits 5:4 the destination slot.  */
+static bool
+expand_vec_perm_insertps (struct expand_vec_perm_d *d)
+{
+  machine_mode vmode = d->vmode;
+  unsigned i, cnt_s, nelt = d->nelt;
+  int cnt_d = -1;
+  rtx src, dst;
+
+  if (d->one_operand_p)
+    return false;
+
+  if (!(TARGET_SSE4_1
+	&& (vmode == V4SFmode || vmode == V4SImode
+	    || (TARGET_MMX_WITH_SSE
+		&& (vmode == V2SFmode || vmode == V2SImode)))))
+    return false;
+
+  /* Is the result op0 with a single element replaced?  */
+  dst = d->op0;
+  for (i = 0; i < nelt; ++i)
+    {
+      if (d->perm[i] == i)
+	continue;
+      if (cnt_d != -1)
+	{
+	  cnt_d = -1;
+	  break;
+	}
+      cnt_d = i;
+    }
+
+  if (cnt_d == -1)
+    {
+      /* If not, is it op1 with a single element replaced?  */
+      dst = d->op1;
+      for (i = 0; i < nelt; ++i)
+	{
+	  if (d->perm[i] == i + nelt)
+	    continue;
+	  if (cnt_d != -1)
+	    return false;
+	  cnt_d = i;
+	}
+
+      if (cnt_d == -1)
+	return false;
+    }
+
+  if (d->testing_p)
+    return true;
+
+  gcc_assert (cnt_d != -1);
+
+  /* DST is fixed by whichever scan matched; SRC follows the inserted
+     element's index alone, since it may live in either operand.  */
+  cnt_s = d->perm[cnt_d];
+  src = cnt_s < nelt ? d->op0 : d->op1;
+  if (cnt_s >= nelt)
+    cnt_s -= nelt;
+  gcc_assert (cnt_s < nelt);
+
+  rtx x = gen_sse4_1_insertps (vmode, d->target, dst, src,
+			       GEN_INT (cnt_s << 6 | cnt_d << 4));
+  emit_insn (x);
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
    in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
 
@@ -19918,6 +19990,10 @@  expand_vec_perm_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_movs (d))
     return true;
 
+  /* Try the SSE4.1 insertps instruction.  */
+  if (expand_vec_perm_insertps (d))
+    return true;
+
   /* Try the fully general two operand permute.  */
   if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
 			      d->testing_p))
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index ed689b044c3..1419ea4cff3 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -120,6 +120,7 @@  (define_c_enum "unspec" [
   UNSPEC_MASKMOV
   UNSPEC_MOVCC_MASK
   UNSPEC_MOVMSK
+  UNSPEC_INSERTPS
   UNSPEC_BLENDV
   UNSPEC_PSHUFB
   UNSPEC_XOP_PERMUTE
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 18dae03ad0a..872ddbc55f2 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -106,6 +106,10 @@  (define_mode_attr mmxintvecmode
 (define_mode_attr mmxintvecmodelower
   [(V2SF "v2si") (V2SI "v2si") (V4HI "v4hi") (V8QI "v8qi")])
 
+;; Mapping of two-element vector modes to the scalar mode of their elements
+(define_mode_attr mmxscalarmode
+  [(V2SI "SI") (V2SF "SF")])
+
 (define_mode_attr Yv_Yw
   [(V8QI "Yw") (V4HI "Yw") (V2SI "Yv") (V1DI "Yv") (V2SF "Yv")])
 
@@ -1154,6 +1158,42 @@  (define_expand "vcond<mode>v2sf"
   DONE;
 })
 
+(define_insn "@sse4_1_insertps_<mode>"  ;; insertps for the V2FI modes
+  [(set (match_operand:V2FI 0 "register_operand" "=Yr,*x,v")
+	(unspec:V2FI
+	  [(match_operand:V2FI 2 "nonimmediate_operand" "Yrm,*xm,vm")
+	   (match_operand:V2FI 1 "register_operand" "0,0,v")
+	   (match_operand:SI 3 "const_0_to_255_operand")]
+	  UNSPEC_INSERTPS))]
+  "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+{
+  if (MEM_P (operands[2]))  /* Fold the source select into the address.  */
+    {
+      unsigned count_s = INTVAL (operands[3]) >> 6;  /* Source element.  */
+      if (count_s)  /* Clear bits 7:6; the adjusted address selects it.  */
+	operands[3] = GEN_INT (INTVAL (operands[3]) & 0x3f);
+      operands[2] = adjust_address_nv (operands[2],
+				       <mmxscalarmode>mode, count_s * 4);
+    }
+  switch (which_alternative)
+    {
+    case 0:
+    case 1:  /* Non-AVX form; operand 1 is tied to operand 0.  */
+      return "insertps\t{%3, %2, %0|%0, %2, %3}";
+    case 2:  /* AVX form with a separate operand 1.  */
+      return "vinsertps\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1,1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "orig,orig,maybe_evex")
+   (set_attr "mode" "V4SF")])
+
 (define_insn "*mmx_blendps"
   [(set (match_operand:V2SF 0 "register_operand" "=Yr,*x,x")
 	(vec_merge:V2SF
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 513960e8f33..5dca8dd1e27 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -38,7 +38,6 @@  (define_c_enum "unspec" [
   UNSPEC_INSERTQ
 
   ;; For SSE4.1 support
-  UNSPEC_INSERTPS
   UNSPEC_DP
   UNSPEC_MOVNTDQA
   UNSPEC_MPSADBW
@@ -10959,12 +10958,13 @@  (define_insn_and_split "*vec_setv2di_0_zero_extendsi_1"
   DONE;
 })
 
-(define_insn "sse4_1_insertps"
-  [(set (match_operand:V4SF 0 "register_operand" "=Yr,*x,v")
-	(unspec:V4SF [(match_operand:V4SF 2 "nonimmediate_operand" "Yrm,*xm,vm")
-		      (match_operand:V4SF 1 "register_operand" "0,0,v")
-		      (match_operand:SI 3 "const_0_to_255_operand")]
-		     UNSPEC_INSERTPS))]
+(define_insn "@sse4_1_insertps_<mode>"
+  [(set (match_operand:VI4F_128 0 "register_operand" "=Yr,*x,v")
+	(unspec:VI4F_128
+	  [(match_operand:VI4F_128 2 "nonimmediate_operand" "Yrm,*xm,vm")
+	   (match_operand:VI4F_128 1 "register_operand" "0,0,v")
+	   (match_operand:SI 3 "const_0_to_255_operand")]
+	  UNSPEC_INSERTPS))]
   "TARGET_SSE4_1"
 {
   if (MEM_P (operands[2]))
@@ -10972,7 +10972,8 @@  (define_insn "sse4_1_insertps"
       unsigned count_s = INTVAL (operands[3]) >> 6;
       if (count_s)
 	operands[3] = GEN_INT (INTVAL (operands[3]) & 0x3f);
-      operands[2] = adjust_address_nv (operands[2], SFmode, count_s * 4);
+      operands[2] = adjust_address_nv (operands[2],
+				       <ssescalarmode>mode, count_s * 4);
     }
   switch (which_alternative)
     {
diff --git a/gcc/testsuite/gcc.target/i386/pr94908.c b/gcc/testsuite/gcc.target/i386/pr94908.c
new file mode 100644
index 00000000000..11a5f90e5dd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr94908.c
@@ -0,0 +1,14 @@ 
+/* PR target/94908 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.1" } */
+
+typedef float v4sf __attribute__((vector_size(16)));
+
+v4sf g();
+
+v4sf f(v4sf a, v4sf b)
+{
+    return (v4sf){g()[1], a[1], a[2], a[3]};  /* a with elt 0 <- g()[1] */
+}
+
+/* { dg-final { scan-assembler "\[ \t\]v?insertps" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-insertps-5.c b/gcc/testsuite/gcc.target/i386/sse4_1-insertps-5.c
new file mode 100644
index 00000000000..d9c4cfc81d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-insertps-5.c
@@ -0,0 +1,19 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.1" } */
+
+typedef unsigned int v4si __attribute__((vector_size(16)));
+typedef float v4sf __attribute__((vector_size(16)));
+
+v4si foo_1(v4si x, v4si y) { return (v4si){x[0],y[3],x[2],x[3]}; } /* x, elt 1 <- y[3] */
+v4si foo_2(v4si x, v4si y) { return (v4si){y[0],x[2],y[2],y[3]}; } /* y, elt 1 <- x[2] */
+v4si foo_3(v4si x, v4si y) { return (v4si){x[3],y[1],y[2],y[3]}; } /* y, elt 0 <- x[3] */
+
+v4sf bar_1(v4sf x, v4sf y) { return (v4sf){y[0],x[3],y[2],y[3]}; } /* y, elt 1 <- x[3] */
+v4sf bar_2(v4sf x, v4sf y) { return (v4sf){x[0],y[2],x[2],x[3]}; } /* x, elt 1 <- y[2] */
+v4sf bar_3(v4sf x, v4sf y) { return (v4sf){y[3],x[1],x[2],x[3]}; } /* x, elt 0 <- y[3] */
+
+/* { dg-final { scan-assembler-times "\tv?insertps\t" 6 } } */
+/* { dg-final { scan-assembler-not "pshufd" } } */
+/* { dg-final { scan-assembler-not "pblendw" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+/* { dg-final { scan-assembler-not "blendps" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vperm-v4sf-2-sse4.c b/gcc/testsuite/gcc.target/i386/vperm-v4sf-2-sse4.c
new file mode 100644
index 00000000000..ed5963efc5a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vperm-v4sf-2-sse4.c
@@ -0,0 +1,4 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O -msse4" } */
+#include "vperm-v4sf-2.c"