[AArch64,SVE2] Fix for r277110 (BSL variants)
diff mbox series

Message ID AM0PR08MB37169CF2EB3860CD59CA564A9B6D0@AM0PR08MB3716.eurprd08.prod.outlook.com
State New
Headers show
Series
  • [AArch64,SVE2] Fix for r277110 (BSL variants)
Related show

Commit Message

Yuliang Wang Oct. 17, 2019, 4:17 p.m. UTC
Hi,

SVE2 vectorization for BSL and NBSL fails when the element type is an unsigned 8-bit or 16-bit integer.

In that case the operands are implicitly converted to the corresponding signed types, which the mid-end fold pattern does not take into account. This patch augments the pattern with type conversion checks to fix the problem.

#define TYPE uint{8,16}_t

void
foo (TYPE *a, TYPE *b, TYPE *c, TYPE *d, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = OP (b[i], c[i], d[i]);
}

BSL:

 // #define OP(x,y,z) (((x) & (z)) | ((y) & ~(z)))

  before	and	z1.d, z2.d, z1.d
  		bic	z0.d, z0.d, z2.d
  		orr	z0.d, z0.d, z1.d
  ...
  after		bsl	z0.d, z0.d, z1.d, z2.d

NBSL:

  // #define OP(x,y,z) ~(((x) & (z)) | ((y) & ~(z)))

  before	and	z1.d, z2.d, z1.d
  		bic	z0.d, z0.d, z2.d
  		orr	z0.d, z0.d, z1.d
  		not	z0.{b,h}, p1/m, z0.{b,h}
  ...
  after		nbsl	z0.d, z0.d, z1.d, z2.d

The GIMPLE output for BSL with unsigned 16-bit elements shows where the implicit conversions are inserted:

_1 = b[i];
_2 = d[i];
_3 = _1 & _2;
_4 = (signed short) _3;
_5 = c[i];
_6 = (signed short) _5;
_7 = d[i];
_8 = (signed short) _7;
_9 = ~_8;
_10 = _6 & _9;
_11 = _4 | _10;
_12 = (short unsigned int) _11;
a[i] = _12;

In contrast, for 32/64-bit types (regardless of signedness) no conversions are inserted (NBSL output shown):

_1 = b[i];
_2 = d[i];
_3 = _1 & _2;
_4 = c[i];
_5 = d[i];
_6 = ~_5;
_7 = _4 & _6;
_8 = _3 | _7;
_9 = ~_8;
a[i] = _9;

Built and tested on aarch64-none-elf.

Regards,
Yuliang Wang


gcc/ChangeLog:

2019-10-17  Yuliang Wang  <yuliang.wang@arm.com>

	* match.pd (/* (x & ~m) | (y & m) -> ... */): Modified fold pattern.
	* genmatch.c (convert3): New convert operation to support the above.

gcc/testsuite/ChangeLog:

2019-10-17  Yuliang Wang  <yuliang.wang@arm.com>

	* gcc.target/aarch64/sve2/bitsel_1.c: Add testing for unsigned types.
	* gcc.target/aarch64/sve2/bitsel_2.c: As above.
	* gcc.target/aarch64/sve2/bitsel_3.c: As above.
	* gcc.target/aarch64/sve2/bitsel_4.c: As above.
	* gcc.target/aarch64/sve2/eor3_1.c: As above.

Patch
diff mbox series

diff --git a/gcc/genmatch.c b/gcc/genmatch.c
index 7db1f135840e09e794e2921859fa8e9b76666fa8..ce87ae33e0b3c06f4d1fde8d8e74bf2210ee7a5a 100644
--- a/gcc/genmatch.c
+++ b/gcc/genmatch.c
@@ -227,6 +227,7 @@  enum tree_code {
 CONVERT0,
 CONVERT1,
 CONVERT2,
+CONVERT3,
 VIEW_CONVERT0,
 VIEW_CONVERT1,
 VIEW_CONVERT2,
@@ -1176,6 +1177,7 @@  lower_opt_convert (operand *o)
     = { CONVERT0, CONVERT_EXPR,
 	CONVERT1, CONVERT_EXPR,
 	CONVERT2, CONVERT_EXPR,
+	CONVERT3, CONVERT_EXPR,
 	VIEW_CONVERT0, VIEW_CONVERT_EXPR,
 	VIEW_CONVERT1, VIEW_CONVERT_EXPR,
 	VIEW_CONVERT2, VIEW_CONVERT_EXPR };
@@ -4145,8 +4147,8 @@  parser::record_operlist (location_t loc, user_id *p)
     }
 }
 
-/* Parse the operator ID, special-casing convert?, convert1? and
-   convert2?  */
+/* Parse the operator ID, special-casing convert?, convert1?, convert2? and
+   convert3?  */
 
 id_base *
 parser::parse_operation ()
@@ -4167,6 +4169,8 @@  parser::parse_operation ()
 	;
       else if (strcmp (id, "convert2") == 0)
 	;
+      else if (strcmp (id, "convert3") == 0)
+	;
       else if (strcmp (id, "view_convert") == 0)
 	id = "view_convert0";
       else if (strcmp (id, "view_convert1") == 0)
@@ -4183,6 +4187,7 @@  parser::parse_operation ()
     }
   else if (strcmp (id, "convert1") == 0
 	   || strcmp (id, "convert2") == 0
+	   || strcmp (id, "convert3") == 0
 	   || strcmp (id, "view_convert1") == 0
 	   || strcmp (id, "view_convert2") == 0)
     fatal_at (id_tok, "expected '?' after conditional operator");
@@ -4723,9 +4728,9 @@  parser::parse_for (location_t)
 	  id_base *idb = get_operator (oper, true);
 	  if (idb == NULL)
 	    fatal_at (token, "no such operator '%s'", oper);
-	  if (*idb == CONVERT0 || *idb == CONVERT1 || *idb == CONVERT2
-	      || *idb == VIEW_CONVERT0 || *idb == VIEW_CONVERT1
-	      || *idb == VIEW_CONVERT2)
+	  if (*idb == CONVERT0 || *idb == VIEW_CONVERT0
+	      || *idb == CONVERT1 || *idb == CONVERT2|| *idb == CONVERT3
+	      || *idb == VIEW_CONVERT1 || *idb == VIEW_CONVERT2)
 	    fatal_at (token, "conditional operators cannot be used inside for");
 
 	  if (arity == -1)
@@ -5136,6 +5141,7 @@  main (int argc, char **argv)
 add_operator (CONVERT0, "convert0", "tcc_unary", 1);
 add_operator (CONVERT1, "convert1", "tcc_unary", 1);
 add_operator (CONVERT2, "convert2", "tcc_unary", 1);
+add_operator (CONVERT3, "convert3", "tcc_unary", 1);
 add_operator (VIEW_CONVERT0, "view_convert0", "tcc_unary", 1);
 add_operator (VIEW_CONVERT1, "view_convert1", "tcc_unary", 1);
 add_operator (VIEW_CONVERT2, "view_convert2", "tcc_unary", 1);
diff --git a/gcc/match.pd b/gcc/match.pd
index e3ac06c8ef5b893bd344734095b11047a43f98b8..0aa065c2941dd79477434fd3b6691c9a9b68d20c 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -1461,8 +1461,13 @@  DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 
 /* (x & ~m) | (y & m) -> ((x ^ y) & m) ^ x */
 (simplify
- (bit_ior:c (bit_and:cs @0 (bit_not @2)) (bit_and:cs @1 @2))
- (bit_xor (bit_and (bit_xor @0 @1) @2) @0))
+ (bit_ior:c
+  (convert? (bit_and:cs @0 (bit_not (convert2? @2))))
+  (convert1? (bit_and:cs @1 (convert3? @2))))
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0))
+      && tree_nop_conversion_p (type, TREE_TYPE (@1)))
+  (bit_xor (bit_and
+   (bit_xor (convert @0) (convert @1)) (convert @2)) (convert @0))))
 
 /* Fold A - (A & B) into ~B & A.  */
 (simplify
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_1.c
index 5c58ff54231d88a4ebf0a91fe4fac97079c8d992..05431e591887c589a1bc1516f99db39c66c353c4 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_1.c
@@ -7,27 +7,31 @@ 
 #define OP(x,y,z) (((x) & (z)) | ((y) & ~(z)))
 #endif
 
-#define TYPE(N) int##N##_t
-
-#define TEMPLATE(SIZE)						\
-void __attribute__ ((noinline, noclone))			\
-f_##SIZE##_##OP							\
-  (TYPE(SIZE) *restrict a, TYPE(SIZE) *restrict b,		\
-   TYPE(SIZE) *restrict c, TYPE(SIZE) *restrict d, int n)	\
-{								\
-  for (int i = 0; i < n; i++)					\
-    a[i] = OP (b[i], c[i], d[i]);				\
+#define TYPE(S,N) S##int##N##_t
+
+#define TEMPLATE(SIGN,SIZE)						\
+void __attribute__ ((noinline, noclone))				\
+f_##SIGN##_##SIZE##_##OP						\
+  (TYPE(SIGN,SIZE) *restrict a, TYPE(SIGN,SIZE) *restrict b,		\
+   TYPE(SIGN,SIZE) *restrict c, TYPE(SIGN,SIZE) *restrict d, int n)	\
+{									\
+  for (int i = 0; i < n; i++)						\
+    a[i] = OP (b[i], c[i], d[i]);					\
 }
 
-TEMPLATE (8);
-TEMPLATE (16);
-TEMPLATE (32);
-TEMPLATE (64);
+TEMPLATE (,8);
+TEMPLATE (,16);
+TEMPLATE (,32);
+TEMPLATE (,64);
+TEMPLATE (u,8);
+TEMPLATE (u,16);
+TEMPLATE (u,32);
+TEMPLATE (u,64);
 
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 8 "vect" } } */
 
 /* { dg-final { scan-assembler-not {\teor\tz[0-9]+\.[bhsd]} } } */
 /* { dg-final { scan-assembler-not {\tand\tz[0-9]+\.[bhsd]} } } */
 
-/* { dg-final { scan-assembler-times {\tbsl\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tbsl\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 8 } } */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_2.c b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_2.c
index ac0d27213e84bb5c7f3d236f3cac59c71ac674ed..da6ac527e8c93e25e69a8db368fba79190b65202 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_2.c
@@ -5,11 +5,11 @@ 
 
 #include "bitsel_1.c"
 
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 8 "vect" } } */
 
 /* { dg-final { scan-assembler-not {\teor\tz[0-9]+\.[bhsd]} } } */
 /* { dg-final { scan-assembler-not {\tand\tz[0-9]+\.[bhsd]} } } */
 /* { dg-final { scan-assembler-not {\tnot\tz[0-9]+\.[bhsd]} } } */
 
-/* { dg-final { scan-assembler-times {\tnbsl\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tnbsl\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 8 } } */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_3.c b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_3.c
index 93995bb8bade89cd821ed85153d13e96bd4422a5..1036046a8119ef6aa19f7e975c90b2401cc43c0b 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_3.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_3.c
@@ -5,10 +5,10 @@ 
 
 #include "bitsel_1.c"
 
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 8 "vect" } } */
 
 /* { dg-final { scan-assembler-not {\teor\tz[0-9]+\.[bhsd]} } } */
 /* { dg-final { scan-assembler-not {\tbic\tz[0-9]+\.[bhsd]} } } */
 
-/* { dg-final { scan-assembler-times {\tbsl1n\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tbsl1n\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 8 } } */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_4.c b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_4.c
index 7ccec619b4d1e8de366c0b0c53879a89a00c2c49..527dcf1a42009f484b2cf3d01e7aeb7448a4d1cc 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_4.c
@@ -5,11 +5,11 @@ 
 
 #include "bitsel_1.c"
 
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 8 "vect" } } */
 
 /* { dg-final { scan-assembler-not {\torr\tz[0-9]+\.[bhsd]} } } */
 /* { dg-final { scan-assembler-not {\tand\tz[0-9]+\.[bhsd]} } } */
 /* { dg-final { scan-assembler-not {\tnot\tz[0-9]+\.[bhsd]} } } */
 
-/* { dg-final { scan-assembler-times {\tbsl2n\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tbsl2n\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 8 } } */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/eor3_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/eor3_1.c
index 551802a0c9f007273ddc68cc4ce77defe700d76e..29a023f9be705dcc67f96e0d2b97f8aef3e3ab4d 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/eor3_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/eor3_1.c
@@ -5,9 +5,9 @@ 
 
 #include "bitsel_1.c"
 
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 8 "vect" } } */
 
 /* { dg-final { scan-assembler-not {\teor\tz[0-9]+\.[bhsd]} } } */
 
-/* { dg-final { scan-assembler-times {\teor3\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
+/* { dg-final { scan-assembler-times {\teor3\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 8 } } */