
i386: Improve avx* vector concatenation [PR93594]

Message ID 20200206083427.GH17695@tucnak
State New
Series i386: Improve avx* vector concatenation [PR93594]

Commit Message

Jakub Jelinek Feb. 6, 2020, 8:34 a.m. UTC
Hi!

The following testcase shows that for _mm256_set*_m128i and similar
intrinsics we sometimes generate bad code.  All four routines express
the same thing, a 128-bit vector zero-padded to a 256-bit vector, but
only the third one actually emits the desired
	vmovdqa	%xmm0, %xmm0
insn; the others emit
	vpxor	%xmm1, %xmm1, %xmm1
	vinserti128	$0x1, %xmm1, %ymm0, %ymm0
instead.
The problem is that the cast builtins use UNSPEC_CAST, which is only
simplified away by a splitter after reload; during combine the UNSPEC
gets in the way and prevents the optimization.  We do have
avx_vec_concat* patterns that generate efficient code, both for this
low part + zero concatenation special case and for other cases too, so
the following define_insn_and_split just recognizes an avx_vec_concat
made of the low half of a cast and some other register.
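
For illustration, a minimal reproducer (essentially the foo routine
from the new testcase below); before this patch, the zero-padding
variants like this one compile to the vpxor + vinserti128 pair rather
than a single vmovdqa:

#include <x86intrin.h>

/* A 128-bit vector zero-padded to 256 bits; with -O2 -mavx2 this
   should compile to just vmovdqa %xmm0, %xmm0.  */
__m256i
foo (__m128i x)
{
  return _mm256_setr_m128i (x, _mm_setzero_si128 ());
}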

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2020-02-06  Jakub Jelinek  <jakub@redhat.com>

	PR target/93594
	* config/i386/predicates.md (avx_identity_operand): New predicate.
	* config/i386/sse.md (*avx_vec_concat<mode>_1): New
	define_insn_and_split.

	* gcc.target/i386/avx2-pr93594.c: New test.


	Jakub

Comments

Uros Bizjak Feb. 6, 2020, 8:44 a.m. UTC | #1
On Thu, Feb 6, 2020 at 9:34 AM Jakub Jelinek <jakub@redhat.com> wrote:
> [snip]

LGTM.

Thanks,
Uros.

Patch

--- gcc/config/i386/predicates.md.jj	2020-01-12 11:54:36.331414646 +0100
+++ gcc/config/i386/predicates.md	2020-02-05 17:44:44.663517106 +0100
@@ -1584,6 +1584,19 @@  (define_predicate "palignr_operand"
   return true;
 })
 
+;; Return true if OP is a parallel for identity permute.
+(define_predicate "avx_identity_operand"
+  (and (match_code "parallel")
+       (match_code "const_int" "a"))
+{
+  int i, nelt = XVECLEN (op, 0);
+
+  for (i = 0; i < nelt; ++i)
+    if (INTVAL (XVECEXP (op, 0, i)) != i)
+      return false;
+  return true;
+})
+
 ;; Return true if OP is a proper third operand to vpblendw256.
 (define_predicate "avx2_pblendw_operand"
   (match_code "const_int")
--- gcc/config/i386/sse.md.jj	2020-02-05 15:38:06.636292475 +0100
+++ gcc/config/i386/sse.md	2020-02-05 17:55:06.696352286 +0100
@@ -21358,6 +21358,24 @@  (define_insn "avx_vec_concat<mode>"
    (set_attr "prefix" "maybe_evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn_and_split "*avx_vec_concat<mode>_1"
+  [(set (match_operand:V_256_512 0 "register_operand")
+	(vec_concat:V_256_512
+	  (vec_select:<ssehalfvecmode>
+	    (unspec:V_256_512
+	      [(match_operand:<ssehalfvecmode> 1 "nonimmediate_operand")]
+	      UNSPEC_CAST)
+	    (match_parallel 3 "avx_identity_operand"
+	      [(match_operand 4 "const_int_operand")]))
+	  (match_operand:<ssehalfvecmode> 2 "nonimm_or_0_operand")))]
+  "TARGET_AVX
+   && (operands[2] == CONST0_RTX (<ssehalfvecmode>mode)
+       || !MEM_P (operands[1]))
+   && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0) (vec_concat:V_256_512 (match_dup 1) (match_dup 2)))])
+
 (define_insn "vcvtph2ps<mask_name>"
   [(set (match_operand:V4SF 0 "register_operand" "=v")
 	(vec_select:V4SF
--- gcc/testsuite/gcc.target/i386/avx2-pr93594.c.jj	2020-02-05 17:59:33.470416968 +0100
+++ gcc/testsuite/gcc.target/i386/avx2-pr93594.c	2020-02-05 18:06:20.703403613 +0100
@@ -0,0 +1,32 @@ 
+/* PR target/93594 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -masm=att" } */
+/* { dg-final { scan-assembler-times "vmovdqa\t%xmm0, %xmm0" 4 } } */
+/* { dg-final { scan-assembler-not "vpxor\t%" } } */
+/* { dg-final { scan-assembler-not "vinserti128\t\\\$" } } */
+
+#include <x86intrin.h>
+
+__m256i
+foo (__m128i x)
+{
+  return _mm256_setr_m128i (x, _mm_setzero_si128 ());
+}
+
+__m256i
+bar (__m128i x)
+{
+  return _mm256_set_m128i (_mm_setzero_si128 (), x);
+}
+
+__m256i
+baz (__m128i x)
+{
+  return _mm256_insertf128_si256 (_mm256_setzero_si256 (), x, 0);
+}
+
+__m256i
+qux (__m128i x)
+{
+  return _mm256_insertf128_si256 (_mm256_castsi128_si256 (x), _mm_setzero_si128 (), 1);
+}