diff mbox series

x86_64: PR target/100711: Splitters for pandn

Message ID 018501d7e45b$65690fe0$303b2fa0$@nextmovesoftware.com
State New
Headers show
Series x86_64: PR target/100711: Splitters for pandn | expand

Commit Message

Roger Sayle Nov. 28, 2021, 1:25 p.m. UTC
This patch addresses PR target/100711 by introducing define_split
patterns so that not/broadcast/pand may be simplified (by combine)
to broadcast/pandn.  This introduces two splitters one for optimizing
pandn on TARGET_SSE for V4SI and V2DI, and another for vpandn on
TARGET_AVX2 for V16QI, V8HI, V32QI, V16HI and V8SI.  Each splitter
has its own new testcase.

I've also confirmed that not/broadcast/pandn is already getting
simplified to broadcast/pand by the middle-end optimizers.

This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check with no new failures.  Ok for mainline?


2021-11-28  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
	PR target/100711
	* config/i386/sse.md (define_split): New splitters to simplify
	not;vec_duplicate;and as vec_duplicate;andn.

gcc/testsuite/ChangeLog
	PR target/100711
	* gcc.target/i386/pr100711-1.c: New test case.
	* gcc.target/i386/pr100711-2.c: New test case.


Thanks in advance,
Roger
--

Comments

Uros Bizjak Nov. 28, 2021, 7:44 p.m. UTC | #1
On Sun, Nov 28, 2021 at 2:25 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> This patch addresses PR target/100711 by introducing define_split
> patterns so that not/broadcast/pand may be simplified (by combine)
> to broadcast/pandn.  This introduces two splitters one for optimizing
> pandn on TARGET_SSE for V4SI and V2DI, and another for vpandn on
> TARGET_AVX2 for V16QI, V8HI, V32QI, V16HI and V8SI.  Each splitter
> has its own new testcase.
>
> I've also confirmed that not/broadcast/pandn is already getting
> simplified to broadcast/pand by the middle-end optimizers.
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check with no new failures.  Ok for mainline?
>
>
> 2021-11-28  Roger Sayle  <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
>         PR target/100711
>         * config/i386/sse.md (define_split): New splitters to simplify
>         not;vec_duplicate;and as vec_duplicate;andn.
>
> gcc/testsuite/ChangeLog
>         PR target/100711
>         * gcc.target/i386/pr100711-1.c: New test case.
>         * gcc.target/i386/pr100711-2.c: New test case.


+;; PR target/100711: Split notl; vpbroadcastd; vpand as vpbroadcastd; vpandn
+(define_split
+  [(set (match_operand:VI48_128 0 "register_operand")
+ (and:VI48_128
+  (vec_duplicate:VI48_128
+    (not:<ssescalarmode>
+      (match_operand:<ssescalarmode> 1 "register_operand")))
+  (match_operand:VI48_128 2 "register_operand")))]

You can use "vector_operand" here, the resulting PANDN can handle these.

+  "TARGET_SSE && can_create_pseudo_p ()"

This is a combine splitter, so can_create_pseudo_p () is not needed,
because it runs only during the combine phase.

FYI, the combine splitter is somehow different than normal splitter,
the important part from the documentation is, that the insn is *not*
matched by some define_insn pattern, and the split results in exactly
two patterns:

The insn combiner phase also splits putative insns.  If three insns are
merged into one insn with a complex expression that cannot be matched by
some 'define_insn' pattern, the combiner phase attempts to split the
complex pattern into two insns that are recognized.  Usually it can
break the complex pattern into two patterns by splitting out some
subexpression.  However, in some other cases, such as performing an
addition of a large constant in two insns on a RISC machine, the way to
split the addition into two insns is machine-dependent.

+  [(set (match_dup 3)
+ (vec_duplicate:VI48_128 (match_dup 1)))
+   (set (match_dup 0)
+ (and:VI48_128 (not:VI48_128 (match_dup 3))
+      (match_dup 2)))]
+  "operands[3] = gen_reg_rtx (<MODE>mode);")
+
+;; PR target/100711: Split notl; vpbroadcastd; vpand as vpbroadcastd; vpandn
+(define_split
+  [(set (match_operand:VI124_AVX2 0 "register_operand")
+ (and:VI124_AVX2
+  (vec_duplicate:VI124_AVX2
+    (not:<ssescalarmode>
+      (match_operand:<ssescalarmode> 1 "register_operand")))
+  (match_operand:VI124_AVX2 2 "register_operand")))]
+  "TARGET_AVX2 && can_create_pseudo_p ()"
+  [(set (match_dup 3)
+ (vec_duplicate:VI124_AVX2 (match_dup 1)))
+   (set (match_dup 0)
+ (and:VI124_AVX2 (not:VI124_AVX2 (match_dup 3))
+ (match_dup 2)))]
+  "operands[3] = gen_reg_rtx (<MODE>mode);")

Same here as above.

+/* { dg-do compile } */
+/* { dg-options "-O2" } */

Please add -msse2 here, 32bit targets do not enable SSE by default,
and please check if they handle DImode long long at all.

Also, please run tests for x86_64 and i386 targets. The testsuite
should be ran with:

make -k check RUNTESTFLAGS="--target_board=unix\{,-m32\}"

(Eventually, you can use check-gcc instead of check and/or add
i386.exp after --target-board.)

Uros.

+typedef int v4si __attribute__((vector_size (16)));
+typedef long long v2di __attribute__((vector_size (16)));
+
+v4si foo (int a, v4si b)
+{
+    return (__extension__ (v4si) {~a, ~a, ~a, ~a}) & b;
+}
+
+v2di bar (long long a, v2di b)
+{
+    return (__extension__ (v2di) {~a, ~a}) & b;
+}

>
> Thanks in advance,
> Roger
> --
>
diff mbox series

Patch

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index b109c2a..7147bc1 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -16323,6 +16323,38 @@ 
 	      ]
 	      (const_string "<sseinsnmode>")))])
 
+;; PR target/100711: Split notl; vpbroadcastd; vpand as vpbroadcastd; vpandn
+(define_split
+  [(set (match_operand:VI48_128 0 "register_operand")
+	(and:VI48_128
+	  (vec_duplicate:VI48_128
+	    (not:<ssescalarmode>
+	      (match_operand:<ssescalarmode> 1 "register_operand")))
+	  (match_operand:VI48_128 2 "register_operand")))]
+  "TARGET_SSE && can_create_pseudo_p ()"
+  [(set (match_dup 3)
+	(vec_duplicate:VI48_128 (match_dup 1)))
+   (set (match_dup 0)
+	(and:VI48_128 (not:VI48_128 (match_dup 3))
+		      (match_dup 2)))]
+  "operands[3] = gen_reg_rtx (<MODE>mode);")
+
+;; PR target/100711: Split notl; vpbroadcastd; vpand as vpbroadcastd; vpandn
+(define_split
+  [(set (match_operand:VI124_AVX2 0 "register_operand")
+	(and:VI124_AVX2
+	  (vec_duplicate:VI124_AVX2
+	    (not:<ssescalarmode>
+	      (match_operand:<ssescalarmode> 1 "register_operand")))
+	  (match_operand:VI124_AVX2 2 "register_operand")))]
+  "TARGET_AVX2 && can_create_pseudo_p ()"
+  [(set (match_dup 3)
+	(vec_duplicate:VI124_AVX2 (match_dup 1)))
+   (set (match_dup 0)
+	(and:VI124_AVX2 (not:VI124_AVX2 (match_dup 3))
+			(match_dup 2)))]
+  "operands[3] = gen_reg_rtx (<MODE>mode);")
+
 (define_insn "*andnot<mode>3_mask"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
 	(vec_merge:VI48_AVX512VL
diff --git a/gcc/testsuite/gcc.target/i386/pr100711-1.c b/gcc/testsuite/gcc.target/i386/pr100711-1.c
new file mode 100644
index 0000000..81112f9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100711-1.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+typedef int v4si __attribute__((vector_size (16)));
+typedef long long v2di __attribute__((vector_size (16)));
+
+v4si foo (int a, v4si b)
+{
+    return (__extension__ (v4si) {~a, ~a, ~a, ~a}) & b;
+}
+
+v2di bar (long long a, v2di b)
+{
+    return (__extension__ (v2di) {~a, ~a}) & b;
+}
+
+/* { dg-final { scan-assembler-times "pandn" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100711-2.c b/gcc/testsuite/gcc.target/i386/pr100711-2.c
new file mode 100644
index 0000000..ccaf168
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100711-2.c
@@ -0,0 +1,47 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2" } */
+
+typedef char v16qi __attribute__ ((vector_size (16)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+typedef int v4si __attribute__ ((vector_size (16)));
+
+typedef char v32qi __attribute__ ((vector_size (32)));
+typedef short v16hi __attribute__ ((vector_size (32)));
+typedef int v8si __attribute__ ((vector_size (32)));
+
+v16qi foo_v16qi (char a, v16qi b)
+{
+    return (__extension__ (v16qi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) & b;
+}
+
+v8hi foo_v8hi (short a, v8hi b)
+{
+    return (__extension__ (v8hi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,}) & b;
+}
+
+v4si foo_v4si (int a, v4si b)
+{
+    return (__extension__ (v4si) {~a, ~a, ~a, ~a}) & b;
+}
+
+v32qi foo_v32qi (char a, v32qi b)
+{
+    return (__extension__ (v32qi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) & b;
+}
+
+v16hi foo_v16hi (short a, v16hi b)
+{
+    return (__extension__ (v16hi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+                                   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) & b;
+}
+
+v8si foo_v8si (int a, v8si b)
+{
+    return (__extension__ (v8si) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,}) & b;
+}
+
+/* { dg-final { scan-assembler-times "vpandn" 6 } } */