diff mbox series

[RFC] i386: Add pack/unpack patterns for 64bit vectors [PR89021]

Message ID CAFULd4ZBQAHCLJG8bvk8+OjhGW5=wJYUTSUOfk4mLYHzMrYBAw@mail.gmail.com
State New
Headers show
Series [RFC] i386: Add pack/unpack patterns for 64bit vectors [PR89021] | expand

Commit Message

Uros Bizjak June 24, 2021, 10:43 a.m. UTC
2021-06-24  Uroš Bizjak  <ubizjak@gmail.com>

gcc/
    PR target/89021
    * config/i386/i386-expand.c (ix86_expand_sse_unpack):
    Handle V8QI and V4HI modes.
    * config/i386/mmx.md (sse4_1_<any_extend:code>v4qiv4hi2):
    New insn pattern.
    (sse4_1_<any_extend:code>v2hiv2si2): Ditto.
    (mmxpackmode): New mode attribute.
    (vec_pack_trunc_<mmxpackmode:mode>): New expander.
    (mmxunpackmode): New mode attribute.
    (vec_unpacks_lo_<mmxunpackmode:mode>): New expander.
    (vec_unpacks_hi_<mmxunpackmode:mode>): Ditto.
    (vec_unpacku_lo_<mmxunpackmode:mode>): Ditto.
    (vec_unpacku_hi_<mmxunpackmode:mode>): Ditto.
    * config/i386/i386.md (extsuffix): Move from ...
    * config/i386/sse.md: ... here.

gcc/testsuite/

    PR target/89021
    * gcc.target/i386/pr97249-1.c (foo): Add #pragma
    to avoid loop vectorization.
    (foo1): Ditto.
    (foo2): Ditto.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

There is still one scan-tree-dump-not failure in the generic vectorization testsuite:

FAIL: gcc.dg/vect/vect-nb-iter-ub-3.c scan-tree-dump-not cunroll "loop
turned into non-loop; it never loops"

This probably happens due to the additional epilogue vectorization,
but I don't know how to "fix" this failure. Richi, can you perhaps
help me here?

Uros.

Comments

Richard Biener June 24, 2021, 10:48 a.m. UTC | #1
On Thu, 24 Jun 2021, Uros Bizjak wrote:

> 2021-06-24  Uroš Bizjak  <ubizjak@gmail.com>
> 
> gcc/
>     PR target/89021
>     * config/i386/i386-expand.c (ix86_expand_sse_unpack):
>     Handle V8QI and V4HI modes.
>     * config/i386/mmx.md (sse4_1_<any_extend:code>v4qiv4hi2):
>     New insn pattern.
>     (sse4_1_<any_extend:code>v4qiv4hi2): Ditto.
>     (mmxpackmode): New mode attribute.
>     (vec_pack_trunc_<mmxpackmode:mode>): New expander.
>     (mmxunpackmode): New mode attribute.
>     (vec_unpacks_lo_<mmxunpackmode:mode>): New expander.
>     (vec_unpacks_hi_<mmxunpackmode:mode>): Ditto.
>     (vec_unpacku_lo_<mmxunpackmode:mode>): Ditto.
>     (vec_unpacku_hi_<mmxunpackmode:mode>): Ditto.
>     * config/i386/i386.md (extsuffix): Move from ...
>     * config/i386/sse.md: ... here.
> 
> gcc/testsuite/
> 
>     PR target/89021
>     * gcc.target/i386/pr97249-1.c (foo): Add #pragma
>     to avoid loop vectorization.
>     (foo1): Ditto.
>     (foo2): Ditto.
> 
> Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.
> 
> There is still one scan-tree-not failure in generic vectorization testsuite:
> 
> FAIL: gcc.dg/vect/vect-nb-iter-ub-3.c scan-tree-dump-not cunroll "loop
> turned into non-loop; it never loops"
> 
> This probably happens due to the additional epilogue vectorization,
> but I don't know how to "fix" this failure. Richi, can you perhaps
> help me here?

I would suggest adding --param vect-epilogues-nomask=0 to
dg-additional-options to preserve what the testcase tested.

Richard.
diff mbox series

Patch

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 2cb939e51c3..e9763eb5b3e 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -5161,6 +5161,18 @@  ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
 	  else
 	    unpack = gen_sse4_1_sign_extendv2siv2di2;
 	  break;
+	case E_V8QImode:
+	  if (unsigned_p)
+	    unpack = gen_sse4_1_zero_extendv4qiv4hi2;
+	  else
+	    unpack = gen_sse4_1_sign_extendv4qiv4hi2;
+	  break;
+	case E_V4HImode:
+	  if (unsigned_p)
+	    unpack = gen_sse4_1_zero_extendv2hiv2si2;
+	  else
+	    unpack = gen_sse4_1_sign_extendv2hiv2si2;
+	  break;
 	default:
 	  gcc_unreachable ();
 	}
@@ -5172,10 +5184,24 @@  ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
 	}
       else if (high_p)
 	{
-	  /* Shift higher 8 bytes to lower 8 bytes.  */
-	  tmp = gen_reg_rtx (V1TImode);
-	  emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
-					 GEN_INT (64)));
+	  switch (GET_MODE_SIZE (imode))
+	    {
+	    case 16:
+	      /* Shift higher 8 bytes to lower 8 bytes.  */
+	      tmp = gen_reg_rtx (V1TImode);
+	      emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
+					     GEN_INT (64)));
+	      break;
+	    case 8:
+	      /* Shift higher 4 bytes to lower 4 bytes.  */
+	      tmp = gen_reg_rtx (V1DImode);
+	      emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
+					    GEN_INT (32)));
+	      break;
+	    default:
+	      gcc_unreachable ();
+	    }
+
 	  tmp = gen_lowpart (imode, tmp);
 	}
       else
@@ -5207,6 +5233,18 @@  ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
 	  else
 	    unpack = gen_vec_interleave_lowv4si;
 	  break;
+	case E_V8QImode:
+	  if (high_p)
+	    unpack = gen_mmx_punpckhbw;
+	  else
+	    unpack = gen_mmx_punpcklbw;
+	  break;
+	case E_V4HImode:
+	  if (high_p)
+	    unpack = gen_mmx_punpckhwd;
+	  else
+	    unpack = gen_mmx_punpcklwd;
+	  break;
 	default:
 	  gcc_unreachable ();
 	}
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 9043be3105d..9b619e2f78f 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -1000,6 +1000,9 @@  (define_code_iterator any_truncate [ss_truncate truncate us_truncate])
 (define_code_attr trunsuffix
   [(ss_truncate "s") (truncate "") (us_truncate "us")])
 
+;; Instruction suffix for SSE sign and zero extensions.
+(define_code_attr extsuffix [(sign_extend "sx") (zero_extend "zx")])
+
 ;; Used in signed and unsigned fix.
 (define_code_iterator any_fix [fix unsigned_fix])
 (define_code_attr fixsuffix [(fix "") (unsigned_fix "u")])
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 7a827dceb01..e887f03474d 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -2639,6 +2639,78 @@  (define_insn_and_split "mmx_punpckldq"
    (set_attr "type" "mmxcvt,sselog,sselog")
    (set_attr "mode" "DI,TI,TI")])
 
+(define_insn "sse4_1_<code>v4qiv4hi2"
+  [(set (match_operand:V4HI 0 "register_operand" "=Yr,*x,Yw")
+	(any_extend:V4HI
+	  (vec_select:V4QI
+	    (match_operand:V8QI 1 "register_operand" "Yr,*x,Yw")
+	    (parallel [(const_int 0) (const_int 1)
+		       (const_int 2) (const_int 3)]))))]
+  "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+  "%vpmov<extsuffix>bw\t{%1, %0|%0, %1}"
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,orig,maybe_evex")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_<code>v2hiv2si2"
+  [(set (match_operand:V2SI 0 "register_operand" "=Yr,*x,v")
+	(any_extend:V2SI
+	  (vec_select:V2HI
+	    (match_operand:V4HI 1 "register_operand" "Yr,*x,v")
+	    (parallel [(const_int 0) (const_int 1)]))))]
+  "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+  "%vpmov<extsuffix>wd\t{%1, %0|%0, %1}"
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,orig,maybe_evex")
+   (set_attr "mode" "TI")])
+
+;; Pack/unpack vector modes
+(define_mode_attr mmxpackmode
+  [(V4HI "V8QI") (V2SI "V4HI")])
+
+(define_expand "vec_pack_trunc_<mode>"
+  [(match_operand:<mmxpackmode> 0 "register_operand")
+   (match_operand:MMXMODE24 1 "register_operand")
+   (match_operand:MMXMODE24 2 "register_operand")]
+  "TARGET_MMX_WITH_SSE"
+{
+  rtx op1 = gen_lowpart (<mmxpackmode>mode, operands[1]);
+  rtx op2 = gen_lowpart (<mmxpackmode>mode, operands[2]);
+  ix86_expand_vec_extract_even_odd (operands[0], op1, op2, 0);
+  DONE;
+})
+
+(define_mode_attr mmxunpackmode
+  [(V8QI "V4HI") (V4HI "V2SI")])
+
+(define_expand "vec_unpacks_lo_<mode>"
+  [(match_operand:<mmxunpackmode> 0 "register_operand")
+   (match_operand:MMXMODE12 1 "register_operand")]
+  "TARGET_MMX_WITH_SSE"
+  "ix86_expand_sse_unpack (operands[0], operands[1], false, false); DONE;")
+
+(define_expand "vec_unpacks_hi_<mode>"
+  [(match_operand:<mmxunpackmode> 0 "register_operand")
+   (match_operand:MMXMODE12 1 "register_operand")]
+  "TARGET_MMX_WITH_SSE"
+  "ix86_expand_sse_unpack (operands[0], operands[1], false, true); DONE;")
+
+(define_expand "vec_unpacku_lo_<mode>"
+  [(match_operand:<mmxunpackmode> 0 "register_operand")
+   (match_operand:MMXMODE12 1 "register_operand")]
+  "TARGET_MMX_WITH_SSE"
+  "ix86_expand_sse_unpack (operands[0], operands[1], true, false); DONE;")
+
+(define_expand "vec_unpacku_hi_<mode>"
+  [(match_operand:<mmxunpackmode> 0 "register_operand")
+   (match_operand:MMXMODE12 1 "register_operand")]
+  "TARGET_MMX_WITH_SSE"
+  "ix86_expand_sse_unpack (operands[0], operands[1], true, true); DONE;")
+
 (define_insn "*mmx_pinsrd"
   [(set (match_operand:V2SI 0 "register_operand" "=x,Yv")
         (vec_merge:V2SI
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 5bd65dd9312..d718a82cb58 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -976,9 +976,6 @@  (define_mode_attr castmode
  [(V8SI "si") (V8SF "ps") (V4DF "pd")
   (V16SI "si") (V16SF "ps") (V8DF "pd")])
 
-;; Instruction suffix for sign and zero extensions.
-(define_code_attr extsuffix [(sign_extend "sx") (zero_extend "zx")])
-
 ;; i128 for integer vectors and TARGET_AVX2, f128 otherwise.
 ;; i64x4 or f64x4 for 512bit modes.
 (define_mode_attr i128
diff --git a/gcc/testsuite/gcc.target/i386/pr97249-1.c b/gcc/testsuite/gcc.target/i386/pr97249-1.c
index 4478a34a9f8..e7d1d74a208 100644
--- a/gcc/testsuite/gcc.target/i386/pr97249-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr97249-1.c
@@ -8,23 +8,26 @@ 
 void
 foo (unsigned char* p1, unsigned char* p2, short* __restrict p3)
 {
-    for (int i = 0 ; i != 8; i++)
-     p3[i] = p1[i] + p2[i];
-     return;
+  /* Avoid loop vectorization.  */
+#pragma GCC unroll 8
+  for (int i = 0 ; i != 8; i++)
+    p3[i] = p1[i] + p2[i];
 }
 
 void
 foo1 (unsigned short* p1, unsigned short* p2, int* __restrict p3)
 {
-    for (int i = 0 ; i != 4; i++)
-     p3[i] = p1[i] + p2[i];
-     return;
+  /* Avoid loop vectorization.  */
+#pragma GCC unroll 4
+  for (int i = 0 ; i != 4; i++)
+    p3[i] = p1[i] + p2[i];
 }
 
 void
 foo2 (unsigned int* p1, unsigned int* p2, long long* __restrict p3)
 {
-    for (int i = 0 ; i != 2; i++)
-      p3[i] = (long long)p1[i] + (long long)p2[i];
-     return;
+  /* Avoid loop vectorization.  */
+#pragma GCC unroll 2
+  for (int i = 0 ; i != 2; i++)
+    p3[i] = (long long)p1[i] + (long long)p2[i];
 }