
[4/5] x86: further PR target/100711-like splitting

Message ID b2607ae7-045a-d1bc-2cc8-d2f114677cb6@suse.com
State New
Series x86: make better use of VPTERNLOG{D,Q}

Commit Message

Jan Beulich June 21, 2023, 6:27 a.m. UTC
With respective two-operand bitwise operations now expressible by a
single VPTERNLOG, add splitters to also deal with ior and xor
counterparts of the original and-only case. Note that the splitters need
to be separate, as the placement of "not" differs in the final insns
(*iornot<mode>3, *xnor<mode>3) which are intended to pick up one half of
the result.

gcc/

	* config/i386/sse.md: New splitters to simplify
	not;vec_duplicate;{ior,xor} as vec_duplicate;{iornot,xnor}.

gcc/testsuite/

	* gcc.target/i386/pr100711-4.c: New test.
	* gcc.target/i386/pr100711-5.c: New test.
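
For reference (illustration only, not part of the patch), here is a condensed
form of what pr100711-4.c and pr100711-5.c exercise: a broadcast of ~a combined
with a vector by | or ^, where the bitwise operation is now expected to end up
as a single VPTERNLOG (the tests scan for immediates 0xbb/0xdd in the ior case
and 0x99 in the xor case).

/* Condensed illustration only; function names are made up, and the committed
   tests cover all element widths with
   -mavx512bw -mno-avx512vl -mprefer-vector-width=512 -O2.  */
typedef int v16si __attribute__ ((vector_size (64)));

v16si bcast_not_ior (int a, v16si b)
{
  /* (~a broadcast) | b: the bitwise part becomes one vpternlogd.  */
  return (__extension__ (v16si) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
                                 ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b;
}

v16si bcast_not_xor (int a, v16si b)
{
  /* (~a broadcast) ^ b: the bitwise part becomes one vpternlogd.  */
  return (__extension__ (v16si) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
                                 ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) ^ b;
}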

Comments

Hongtao Liu June 25, 2023, 5:06 a.m. UTC | #1
On Wed, Jun 21, 2023 at 2:28 PM Jan Beulich via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> With respective two-operand bitwise operations now expressible by a
> single VPTERNLOG, add splitters to also deal with ior and xor
> counterparts of the original and-only case. Note that the splitters need
> to be separate, as the placement of "not" differs in the final insns
> (*iornot<mode>3, *xnor<mode>3) which are intended to pick up one half of
> the result.
>
> gcc/
>
>         * config/i386/sse.md: New splitters to simplify
>         not;vec_duplicate;{ior,xor} as vec_duplicate;{iornot,xnor}.
>
> gcc/testsuite/
>
>         * gcc.target/i386/pr100711-4.c: New test.
>         * gcc.target/i386/pr100711-5.c: New test.
>
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -17366,6 +17366,36 @@
>                         (match_dup 2)))]
>    "operands[3] = gen_reg_rtx (<MODE>mode);")
>
> +(define_split
> +  [(set (match_operand:VI 0 "register_operand")
> +       (ior:VI
> +         (vec_duplicate:VI
> +           (not:<ssescalarmode>
> +             (match_operand:<ssescalarmode> 1 "nonimmediate_operand")))
> +         (match_operand:VI 2 "vector_operand")))]
> +  "<MODE_SIZE> == 64 || TARGET_AVX512VL
> +   || (TARGET_AVX512F && !TARGET_PREFER_AVX256)"
> +  [(set (match_dup 3)
> +       (vec_duplicate:VI (match_dup 1)))
> +   (set (match_dup 0)
> +       (ior:VI (not:VI (match_dup 3)) (match_dup 2)))]
> +  "operands[3] = gen_reg_rtx (<MODE>mode);")
> +
> +(define_split
> +  [(set (match_operand:VI 0 "register_operand")
> +       (xor:VI
> +         (vec_duplicate:VI
> +           (not:<ssescalarmode>
> +             (match_operand:<ssescalarmode> 1 "nonimmediate_operand")))
> +         (match_operand:VI 2 "vector_operand")))]
> +  "<MODE_SIZE> == 64 || TARGET_AVX512VL
> +   || (TARGET_AVX512F && !TARGET_PREFER_AVX256)"
> +  [(set (match_dup 3)
> +       (vec_duplicate:VI (match_dup 1)))
> +   (set (match_dup 0)
> +       (not:VI (xor:VI (match_dup 3) (match_dup 2))))]
> +  "operands[3] = gen_reg_rtx (<MODE>mode);")
> +
Can we merge this splitter (xor:not) into the ior:not one with a code
iterator for xor/ior? They look the same except for the xor/ior.
No need to merge it into the and:not case, which has different guard
conditions.
Others LGTM.
>  (define_insn "*andnot<mode>3_mask"
>    [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
>         (vec_merge:VI48_AVX512VL
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr100711-4.c
> @@ -0,0 +1,42 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512bw -mno-avx512vl -mprefer-vector-width=512 -O2" } */
> +
> +typedef char v64qi __attribute__ ((vector_size (64)));
> +typedef short v32hi __attribute__ ((vector_size (64)));
> +typedef int v16si __attribute__ ((vector_size (64)));
> +typedef long long v8di __attribute__((vector_size (64)));
> +
> +v64qi foo_v64qi (char a, v64qi b)
> +{
> +    return (__extension__ (v64qi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
> +                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
> +                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
> +                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
> +                                  ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
> +                                  ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
> +                                  ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
> +                                  ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b;
> +}
> +
> +v32hi foo_v32hi (short a, v32hi b)
> +{
> +    return (__extension__ (v32hi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
> +                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
> +                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
> +                                  ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b;
> +}
> +
> +v16si foo_v16si (int a, v16si b)
> +{
> +    return (__extension__ (v16si) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
> +                                  ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b;
> +}
> +
> +v8di foo_v8di (long long a, v8di b)
> +{
> +    return (__extension__ (v8di) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b;
> +}
> +
> +/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0xbb" 4 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0xbb" 2 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0xdd" 2 { target { ia32 } } } } */
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr100711-5.c
> @@ -0,0 +1,40 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512bw -mno-avx512vl -mprefer-vector-width=512 -O2" } */
> +
> +typedef char v64qi __attribute__ ((vector_size (64)));
> +typedef short v32hi __attribute__ ((vector_size (64)));
> +typedef int v16si __attribute__ ((vector_size (64)));
> +typedef long long v8di __attribute__((vector_size (64)));
> +
> +v64qi foo_v64qi (char a, v64qi b)
> +{
> +    return (__extension__ (v64qi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
> +                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
> +                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
> +                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
> +                                  ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
> +                                  ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
> +                                  ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
> +                                  ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) ^ b;
> +}
> +
> +v32hi foo_v32hi (short a, v32hi b)
> +{
> +    return (__extension__ (v32hi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
> +                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
> +                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
> +                                  ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) ^ b;
> +}
> +
> +v16si foo_v16si (int a, v16si b)
> +{
> +    return (__extension__ (v16si) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
> +                                  ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) ^ b;
> +}
> +
> +v8di foo_v8di (long long a, v8di b)
> +{
> +    return (__extension__ (v8di) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) ^ b;
> +}
> +
> +/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0x99" 4 } } */
>
Jan Beulich June 25, 2023, 6:16 a.m. UTC | #2
On 25.06.2023 07:06, Hongtao Liu wrote:
> On Wed, Jun 21, 2023 at 2:28 PM Jan Beulich via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>> With respective two-operand bitwise operations now expressible by a
>> single VPTERNLOG, add splitters to also deal with ior and xor
>> counterparts of the original and-only case. Note that the splitters need
>> to be separate, as the placement of "not" differs in the final insns
>> (*iornot<mode>3, *xnor<mode>3) which are intended to pick up one half of
>> the result.
>>
>> gcc/
>>
>>         * config/i386/sse.md: New splitters to simplify
>>         not;vec_duplicate;{ior,xor} as vec_duplicate;{iornot,xnor}.
>>
>> gcc/testsuite/
>>
>>         * gcc.target/i386/pr100711-4.c: New test.
>>         * gcc.target/i386/pr100711-5.c: New test.
>>
>> --- a/gcc/config/i386/sse.md
>> +++ b/gcc/config/i386/sse.md
>> @@ -17366,6 +17366,36 @@
>>                         (match_dup 2)))]
>>    "operands[3] = gen_reg_rtx (<MODE>mode);")
>>
>> +(define_split
>> +  [(set (match_operand:VI 0 "register_operand")
>> +       (ior:VI
>> +         (vec_duplicate:VI
>> +           (not:<ssescalarmode>
>> +             (match_operand:<ssescalarmode> 1 "nonimmediate_operand")))
>> +         (match_operand:VI 2 "vector_operand")))]
>> +  "<MODE_SIZE> == 64 || TARGET_AVX512VL
>> +   || (TARGET_AVX512F && !TARGET_PREFER_AVX256)"
>> +  [(set (match_dup 3)
>> +       (vec_duplicate:VI (match_dup 1)))
>> +   (set (match_dup 0)
>> +       (ior:VI (not:VI (match_dup 3)) (match_dup 2)))]
>> +  "operands[3] = gen_reg_rtx (<MODE>mode);")
>> +
>> +(define_split
>> +  [(set (match_operand:VI 0 "register_operand")
>> +       (xor:VI
>> +         (vec_duplicate:VI
>> +           (not:<ssescalarmode>
>> +             (match_operand:<ssescalarmode> 1 "nonimmediate_operand")))
>> +         (match_operand:VI 2 "vector_operand")))]
>> +  "<MODE_SIZE> == 64 || TARGET_AVX512VL
>> +   || (TARGET_AVX512F && !TARGET_PREFER_AVX256)"
>> +  [(set (match_dup 3)
>> +       (vec_duplicate:VI (match_dup 1)))
>> +   (set (match_dup 0)
>> +       (not:VI (xor:VI (match_dup 3) (match_dup 2))))]
>> +  "operands[3] = gen_reg_rtx (<MODE>mode);")
>> +
> Can we merge this splitter (xor:not) into the ior:not one with a code
> iterator for xor/ior? They look the same except for the xor/ior.

They're only almost the same: note (ior (not ...)) vs (not (xor ...)) as
the result of the splitting. The difference is necessary to fit
with what patch 1 introduces (which in turn is written that way to
match what generic code transforms things into up front). (I had
it the way you suggest initially, until I figured out why one of the
two would end up never being used.)

Jan
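
Aside (illustration, not part of the patch): the xor splitter's result carries
the not on the outside because generic code canonicalizes (xor (not x) y) into
(not (xor x y)); the identity behind that, ~x ^ y == ~(x ^ y), is easy to
spot-check, and ior has no analogous rewrite. A minimal check, assuming
nothing beyond standard C:

#include <assert.h>

/* Spot-check of the identity behind the xor canonicalization; no analogous
   form exists for (ior (not x) y), hence the two separate splitters.  */
int main (void)
{
  for (unsigned x = 0; x < 16; x++)
    for (unsigned y = 0; y < 16; y++)
      assert ((~x ^ y) == ~(x ^ y));
  return 0;
}
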
Hongtao Liu June 25, 2023, 6:27 a.m. UTC | #3
On Sun, Jun 25, 2023 at 2:16 PM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 25.06.2023 07:06, Hongtao Liu wrote:
> > On Wed, Jun 21, 2023 at 2:28 PM Jan Beulich via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> >>
> >> With respective two-operand bitwise operations now expressible by a
> >> single VPTERNLOG, add splitters to also deal with ior and xor
> >> counterparts of the original and-only case. Note that the splitters need
> >> to be separate, as the placement of "not" differs in the final insns
> >> (*iornot<mode>3, *xnor<mode>3) which are intended to pick up one half of
> >> the result.
> >>
> >> gcc/
> >>
> >>         * config/i386/sse.md: New splitters to simplify
> >>         not;vec_duplicate;{ior,xor} as vec_duplicate;{iornot,xnor}.
> >>
> >> gcc/testsuite/
> >>
> >>         * gcc.target/i386/pr100711-4.c: New test.
> >>         * gcc.target/i386/pr100711-5.c: New test.
> >>
> >> --- a/gcc/config/i386/sse.md
> >> +++ b/gcc/config/i386/sse.md
> >> @@ -17366,6 +17366,36 @@
> >>                         (match_dup 2)))]
> >>    "operands[3] = gen_reg_rtx (<MODE>mode);")
> >>
> >> +(define_split
> >> +  [(set (match_operand:VI 0 "register_operand")
> >> +       (ior:VI
> >> +         (vec_duplicate:VI
> >> +           (not:<ssescalarmode>
> >> +             (match_operand:<ssescalarmode> 1 "nonimmediate_operand")))
> >> +         (match_operand:VI 2 "vector_operand")))]
> >> +  "<MODE_SIZE> == 64 || TARGET_AVX512VL
> >> +   || (TARGET_AVX512F && !TARGET_PREFER_AVX256)"
> >> +  [(set (match_dup 3)
> >> +       (vec_duplicate:VI (match_dup 1)))
> >> +   (set (match_dup 0)
> >> +       (ior:VI (not:VI (match_dup 3)) (match_dup 2)))]
> >> +  "operands[3] = gen_reg_rtx (<MODE>mode);")
> >> +
> >> +(define_split
> >> +  [(set (match_operand:VI 0 "register_operand")
> >> +       (xor:VI
> >> +         (vec_duplicate:VI
> >> +           (not:<ssescalarmode>
> >> +             (match_operand:<ssescalarmode> 1 "nonimmediate_operand")))
> >> +         (match_operand:VI 2 "vector_operand")))]
> >> +  "<MODE_SIZE> == 64 || TARGET_AVX512VL
> >> +   || (TARGET_AVX512F && !TARGET_PREFER_AVX256)"
> >> +  [(set (match_dup 3)
> >> +       (vec_duplicate:VI (match_dup 1)))
> >> +   (set (match_dup 0)
> >> +       (not:VI (xor:VI (match_dup 3) (match_dup 2))))]
> >> +  "operands[3] = gen_reg_rtx (<MODE>mode);")
> >> +
> > Can we merge this splitter (xor:not) into the ior:not one with a code
> > iterator for xor/ior? They look the same except for the xor/ior.
>
> They're only almost the same: note (ior (not ...)) vs (not (xor ...)) as
> the result of the splitting. The difference is necessary to fit
> with what patch 1 introduces (which in turn is written that way to
> match what generic code transforms things into up front). (I had
> it the way you suggest initially, until I figured out why one of the
> two would end up never being used.)
>
3597      /* Convert (XOR (NOT x) (NOT y)) to (XOR x y).
3598         Also convert (XOR (NOT x) y) to (NOT (XOR x y)), similarly for
3599         (NOT y).  */
3600      {
3601        int num_negated = 0;
3602
3603        if (GET_CODE (op0) == NOT)
3604          num_negated++, op0 = XEXP (op0, 0);
3605        if (GET_CODE (op1) == NOT)
3606          num_negated++, op1 = XEXP (op1, 0);

It looks like simplify_rtx plays the trick.

And it's documented.
8602  @cindex @code{xor}, canonicalization of
8603  @item
8604  The only possible RTL expressions involving both bitwise exclusive-or
8605  and bitwise negation are @code{(xor:@var{m} @var{x} @var{y})}
8606  and @code{(not:@var{m} (xor:@var{m} @var{x} @var{y}))}.

Then the original patch LGTM.

> Jan
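
For reference (illustration only, not part of the thread), the immediates the
new tests scan for follow from the VPTERNLOG encoding, assuming the usual
truth-table form where each result bit is imm8[(a << 2) | (b << 1) | c], with
a, b and c taken from the three source operands: 0xbb encodes ~b | c, 0xdd
encodes b | ~c, and 0x99 encodes ~(b ^ c). A small sketch recomputing them
(helper and function names are made up):

#include <stdio.h>

/* Build a VPTERNLOG immediate from a 3-input boolean function, assuming the
   usual encoding: bit (a << 2) | (b << 1) | c of imm8 holds f (a, b, c).  */
static unsigned char ternlog_imm (int (*f) (int, int, int))
{
  unsigned char imm = 0;
  for (int i = 0; i < 8; i++)
    if (f ((i >> 2) & 1, (i >> 1) & 1, i & 1))
      imm |= 1u << i;
  return imm;
}

/* The first operand does not participate in any of the three functions.  */
static int iornot (int a, int b, int c) { (void) a; return !b | c; }
static int iornot_swapped (int a, int b, int c) { (void) a; return b | !c; }
static int xnor (int a, int b, int c) { (void) a; return !(b ^ c); }

int main (void)
{
  /* Prints 0xbb 0xdd 0x99, matching the scan-assembler patterns in
     pr100711-4.c and pr100711-5.c.  */
  printf ("0x%02x 0x%02x 0x%02x\n", ternlog_imm (iornot),
          ternlog_imm (iornot_swapped), ternlog_imm (xnor));
  return 0;
}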

Patch

--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -17366,6 +17366,36 @@ 
 			(match_dup 2)))]
   "operands[3] = gen_reg_rtx (<MODE>mode);")
 
+(define_split
+  [(set (match_operand:VI 0 "register_operand")
+	(ior:VI
+	  (vec_duplicate:VI
+	    (not:<ssescalarmode>
+	      (match_operand:<ssescalarmode> 1 "nonimmediate_operand")))
+	  (match_operand:VI 2 "vector_operand")))]
+  "<MODE_SIZE> == 64 || TARGET_AVX512VL
+   || (TARGET_AVX512F && !TARGET_PREFER_AVX256)"
+  [(set (match_dup 3)
+	(vec_duplicate:VI (match_dup 1)))
+   (set (match_dup 0)
+	(ior:VI (not:VI (match_dup 3)) (match_dup 2)))]
+  "operands[3] = gen_reg_rtx (<MODE>mode);")
+
+(define_split
+  [(set (match_operand:VI 0 "register_operand")
+	(xor:VI
+	  (vec_duplicate:VI
+	    (not:<ssescalarmode>
+	      (match_operand:<ssescalarmode> 1 "nonimmediate_operand")))
+	  (match_operand:VI 2 "vector_operand")))]
+  "<MODE_SIZE> == 64 || TARGET_AVX512VL
+   || (TARGET_AVX512F && !TARGET_PREFER_AVX256)"
+  [(set (match_dup 3)
+	(vec_duplicate:VI (match_dup 1)))
+   (set (match_dup 0)
+	(not:VI (xor:VI (match_dup 3) (match_dup 2))))]
+  "operands[3] = gen_reg_rtx (<MODE>mode);")
+
 (define_insn "*andnot<mode>3_mask"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
 	(vec_merge:VI48_AVX512VL
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100711-4.c
@@ -0,0 +1,42 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mno-avx512vl -mprefer-vector-width=512 -O2" } */
+
+typedef char v64qi __attribute__ ((vector_size (64)));
+typedef short v32hi __attribute__ ((vector_size (64)));
+typedef int v16si __attribute__ ((vector_size (64)));
+typedef long long v8di __attribute__((vector_size (64)));
+
+v64qi foo_v64qi (char a, v64qi b)
+{
+    return (__extension__ (v64qi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+				   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+				   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+				   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+				   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b;
+}
+
+v32hi foo_v32hi (short a, v32hi b)
+{
+    return (__extension__ (v32hi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+				   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b;
+}
+
+v16si foo_v16si (int a, v16si b)
+{
+    return (__extension__ (v16si) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+				   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b;
+}
+
+v8di foo_v8di (long long a, v8di b)
+{
+    return (__extension__ (v8di) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b;
+}
+
+/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0xbb" 4 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0xbb" 2 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0xdd" 2 { target { ia32 } } } } */
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100711-5.c
@@ -0,0 +1,40 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mno-avx512vl -mprefer-vector-width=512 -O2" } */
+
+typedef char v64qi __attribute__ ((vector_size (64)));
+typedef short v32hi __attribute__ ((vector_size (64)));
+typedef int v16si __attribute__ ((vector_size (64)));
+typedef long long v8di __attribute__((vector_size (64)));
+
+v64qi foo_v64qi (char a, v64qi b)
+{
+    return (__extension__ (v64qi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+				   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+				   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+				   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+				   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) ^ b;
+}
+
+v32hi foo_v32hi (short a, v32hi b)
+{
+    return (__extension__ (v32hi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+				   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) ^ b;
+}
+
+v16si foo_v16si (int a, v16si b)
+{
+    return (__extension__ (v16si) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+				   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) ^ b;
+}
+
+v8di foo_v8di (long long a, v8di b)
+{
+    return (__extension__ (v8di) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) ^ b;
+}
+
+/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0x99" 4 } } */