diff mbox series

[COMMITTED] i386: Add V8QI and V4QImode partial vector shift operations

Message ID CAFULd4YGYGTgt7WqfNLK4dOX16=c+axq2n5_EyGxB9D8+AyWbA@mail.gmail.com
State New
Headers show
Series [COMMITTED] i386: Add V8QI and V4QImode partial vector shift operations | expand

Commit Message

Uros Bizjak May 23, 2023, 4:02 p.m. UTC
Add V8QImode and V4QImode vector shift patterns that call into
ix86_expand_vecop_qihi_partial.  Generate special sequences
for constant count operands.

The patch regresses g++.dg/pr91838.C - as explained in PR91838, the
test returns different results depending on whether a V8QImode shift
pattern is present in the target's *.md files. The tree optimizers produce:

V f (V x)
{
  V _2;

  <bb 2> [local count: 1073741824]:
  _2 = x_1(D) >> 8;
  return _2;

}

and without the named expander:

V f (V x)
{
  <bb 2> [local count: 1073741824]:
  return { 0, 0, 0, 0, 0, 0, 0, 0 };

}

RTL part just expands from there.

gcc/ChangeLog:

    * config/i386/i386-expand.cc (ix86_expand_vecop_qihi_partial):
    Call ix86_expand_vec_shift_qihi_constant for shifts
    with constant count operand.
    * config/i386/i386.cc (ix86_shift_rotate_cost):
    Handle V4QImode and V8QImode.
    * config/i386/mmx.md (<insn>v8qi3): New insn pattern.
    (<insn>v4qi3): Ditto.

gcc/testsuite/ChangeLog:

    * gcc.target/i386/vect-shiftv4qi.c: New test.
    * gcc.target/i386/vect-shiftv8qi.c: New test.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Uros.
diff mbox series

Patch

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 50d9d34ebcb..ff3d382f1b4 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -23294,6 +23294,16 @@  ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
   else
     qop2 = op2;
 
+  qdest = gen_reg_rtx (V16QImode);
+
+  if (CONST_INT_P (op2)
+      && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
+      && ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2))
+    {
+      emit_move_insn (dest, gen_lowpart (qimode, qdest));
+      return;
+    }
+
   switch (code)
     {
     case MULT:
@@ -23358,8 +23368,6 @@  ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
       bool ok;
       int i;
 
-      qdest = gen_reg_rtx (V16QImode);
-
       /* Merge the data back into the right place.  */
       d.target = qdest;
       d.op0 = qres;
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 38125ce284a..2710c6dfc56 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -20580,6 +20580,37 @@  ix86_shift_rotate_cost (const struct processor_costs *cost,
 
       switch (mode)
 	{
+	case V4QImode:
+	case V8QImode:
+	  if (TARGET_AVX2)
+	    /* Use vpbroadcast.  */
+	    extra = cost->sse_op;
+	  else
+	    extra = cost->sse_load[2];
+
+	  if (constant_op1)
+	    {
+	      if (code == ASHIFTRT)
+		{
+		  count = 4;
+		  extra *= 2;
+		}
+	      else
+		count = 2;
+	    }
+	  else if (TARGET_AVX512BW && TARGET_AVX512VL)
+	    {
+	      count = 3;
+	      return ix86_vec_cost (mode, cost->sse_op * count);
+	    }
+	  else if (TARGET_SSE4_1)
+	    count = 4;
+	  else if (code == ASHIFTRT)
+	    count = 5;
+	  else
+	    count = 4;
+	  return ix86_vec_cost (mode, cost->sse_op * count) + extra;
+
 	case V16QImode:
 	  if (TARGET_XOP)
 	    {
@@ -20600,7 +20631,12 @@  ix86_shift_rotate_cost (const struct processor_costs *cost,
 	    }
 	  /* FALLTHRU */
 	case V32QImode:
-	  extra = (mode == V16QImode) ? cost->sse_load[2] : cost->sse_load[3];
+	  if (TARGET_AVX2)
+	    /* Use vpbroadcast.  */
+	    extra = cost->sse_op;
+	  else
+	    extra = (mode == V16QImode) ? cost->sse_load[2] : cost->sse_load[3];
+
 	  if (constant_op1)
 	    {
 	      if (code == ASHIFTRT)
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 45773673049..a37bbbb811f 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -2680,6 +2680,28 @@ 
        (const_string "0")))
    (set_attr "mode" "TI")])
 
+(define_expand "<insn>v8qi3"
+  [(set (match_operand:V8QI 0 "register_operand")
+	(any_shift:V8QI (match_operand:V8QI 1 "register_operand")
+			(match_operand:DI 2 "nonmemory_operand")))]
+  "TARGET_MMX_WITH_SSE"
+{
+  ix86_expand_vecop_qihi_partial (<CODE>, operands[0],
+				  operands[1], operands[2]);
+  DONE;
+})
+
+(define_expand "<insn>v4qi3"
+  [(set (match_operand:V4QI 0 "register_operand")
+	(any_shift:V4QI (match_operand:V4QI 1 "register_operand")
+			(match_operand:DI 2 "nonmemory_operand")))]
+  "TARGET_SSE2"
+{
+  ix86_expand_vecop_qihi_partial (<CODE>, operands[0],
+				  operands[1], operands[2]);
+  DONE;
+})
+
 (define_insn_and_split "<insn>v2qi3"
   [(set (match_operand:V2QI 0 "register_operand" "=Q")
         (any_shift:V2QI
diff --git a/gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c b/gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c
new file mode 100644
index 00000000000..c06dfb87bd1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c
@@ -0,0 +1,43 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msse2" } */
+
+#define N 4
+
+typedef unsigned char __vu __attribute__ ((__vector_size__ (N)));
+typedef signed char __vi __attribute__ ((__vector_size__ (N)));
+
+__vu sll (__vu a, int n)
+{
+  return a << n;
+}
+
+__vu sll_c (__vu a)
+{
+  return a << 5;
+}
+
+/* { dg-final { scan-assembler-times "psllw" 2 } } */
+
+__vu srl (__vu a, int n)
+{
+  return a >> n;
+}
+
+__vu srl_c (__vu a)
+{
+  return a >> 5;
+}
+
+/* { dg-final { scan-assembler-times "psrlw" 2 } } */
+
+__vi sra (__vi a, int n)
+{
+  return a >> n;
+}
+
+__vi sra_c (__vi a)
+{
+  return a >> 5;
+}
+
+/* { dg-final { scan-assembler-times "psraw" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-shiftv8qi.c b/gcc/testsuite/gcc.target/i386/vect-shiftv8qi.c
new file mode 100644
index 00000000000..f5e8925aa25
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-shiftv8qi.c
@@ -0,0 +1,43 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -ftree-vectorize -msse2" } */
+
+#define N 8
+
+typedef unsigned char __vu __attribute__ ((__vector_size__ (N)));
+typedef signed char __vi __attribute__ ((__vector_size__ (N)));
+
+__vu sll (__vu a, int n)
+{
+  return a << n;
+}
+
+__vu sll_c (__vu a)
+{
+  return a << 5;
+}
+
+/* { dg-final { scan-assembler-times "psllw" 2 } } */
+
+__vu srl (__vu a, int n)
+{
+  return a >> n;
+}
+
+__vu srl_c (__vu a)
+{
+  return a >> 5;
+}
+
+/* { dg-final { scan-assembler-times "psrlw" 2 } } */
+
+__vi sra (__vi a, int n)
+{
+  return a >> n;
+}
+
+__vi sra_c (__vi a)
+{
+  return a >> 5;
+}
+
+/* { dg-final { scan-assembler-times "psraw" 2 } } */