diff mbox series

middle-end: Simplify popcount/parity of bswap/rotate.

Message ID 000801d677db$76682c10$63388430$@nextmovesoftware.com
State New
Headers show
Series middle-end: Simplify popcount/parity of bswap/rotate. | expand

Commit Message

Roger Sayle Aug. 21, 2020, 4:52 p.m. UTC
This simple patch to match.pd optimizes away bit permutation
operations, specifically bswap and rotate, in calls to popcount and
parity.  Although this patch has been developed and tested on LP64,
it relies on the fact that any truncation or extension between types of
different sizes appears as an explicit conversion in the IL, which is
what "marries up" the appropriate PARITY, PARITYL and PARITYLL forms
with either BSWAP32 or BSWAP64: if the integral types have different
sizes, the intervening conversion prevents the transformation from
firing.

The following patch has been tested on x86_64-pc-linux-gnu with
"make bootstrap" and "make -k check" with no new failures.
Ok for mainline?

2020-08-21  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
	* match.pd (popcount(bswapN(x)) -> popcount(x),
	popcount(rotate(x)) -> popcount(x), parity(bswapN(x)) -> parity(x),
	parity(rotate(x)) -> parity(x)): New simplifications.

gcc/testsuite/ChangeLog
	* gcc.dg/fold-popcount-6.c: New test.
	* gcc.dg/fold-popcount-7.c: New test.
	* gcc.dg/fold-parity-6.c: New test.
	* gcc.dg/fold-parity-7.c: New test.

Thanks in advance,
Roger
--
Roger Sayle
NextMove Software
Cambridge, UK

diff --git a/gcc/testsuite/gcc.dg/fold-popcount-6.c b/gcc/testsuite/gcc.dg/fold-popcount-6.c
new file mode 100644
index 0000000..37b55a1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-popcount-6.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+unsigned int foo(unsigned int x)
+{
+  if (sizeof(unsigned int) == 4)
+    return __builtin_popcount (__builtin_bswap32(x));
+  return x;
+}
+
+unsigned int bar(unsigned long x)
+{
+  if (sizeof(unsigned long) == 8)
+    return __builtin_popcountl (__builtin_bswap64(x));
+  if (sizeof(unsigned long) == 4)
+    return __builtin_popcountl (__builtin_bswap32(x));
+  return x;
+}
+
+/* { dg-final { scan-tree-dump-times "bswap" 0 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/fold-popcount-7.c b/gcc/testsuite/gcc.dg/fold-popcount-7.c
new file mode 100644
index 0000000..fdcefe1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-popcount-7.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+unsigned int foo(unsigned int x)
+{
+  if (sizeof(unsigned int) == 4) {
+    unsigned int y = (x>>4) | (x<<28);
+    return __builtin_popcount(y);
+  } else return x;
+}
+
+unsigned int bar(unsigned long x)
+{
+  if (sizeof(unsigned long) == 8) {
+    unsigned long y = (x>>4) | (x<<60);
+    return __builtin_popcountl (y);
+  } else if(sizeof(unsigned long) == 4) {
+    unsigned long y = (x>>4) | (x<<28);
+    return __builtin_popcountl (y);
+  } else return (unsigned int)x;
+}
+
+/* { dg-final { scan-tree-dump-times " r>> " 0 "optimized" } } */
+
diff --git a/gcc/testsuite/gcc.dg/fold-parity-6.c b/gcc/testsuite/gcc.dg/fold-parity-6.c
new file mode 100644
index 0000000..ece0048
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-parity-6.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+unsigned int foo(unsigned int x)
+{
+  if (sizeof(unsigned int) == 4)
+    return __builtin_parity (__builtin_bswap32(x));
+  return x;
+}
+
+unsigned int bar(unsigned long x)
+{
+  if (sizeof(unsigned long) == 8)
+    return __builtin_parityl (__builtin_bswap64(x));
+  if (sizeof(unsigned long) == 4)
+    return __builtin_parityl (__builtin_bswap32(x));
+  return (unsigned int)x;
+}
+
+/* { dg-final { scan-tree-dump-times "bswap" 0 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/fold-parity-7.c b/gcc/testsuite/gcc.dg/fold-parity-7.c
new file mode 100644
index 0000000..9b5085b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-parity-7.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+unsigned int foo(unsigned int x)
+{
+  if (sizeof(unsigned int) == 4) {
+    unsigned int y = (x>>4) | (x<<28);
+    return __builtin_parity (y);
+  } else return x;
+}
+
+unsigned int bar(unsigned long x)
+{
+  if (sizeof(unsigned long) == 8) {
+    unsigned long y = (x>>4) | (x<<60);
+    return __builtin_parityl (y);
+  } else if (sizeof(unsigned long) == 4) {
+    unsigned long y = (x>>4) | (x<<28);
+    return __builtin_parityl (y);
+  } else return (unsigned int)x;
+}
+
+/* { dg-final { scan-tree-dump-times " r>> " 0 "optimized" } } */
+

Comments

Marc Glisse Aug. 22, 2020, 1:03 p.m. UTC | #1
On Fri, 21 Aug 2020, Roger Sayle wrote:

> This simple patch to match.pd optimizes away bit permutation
> operations, specifically bswap and rotate, in calls to popcount and
> parity.

Good idea.

> Although this patch has been developed and tested on LP64,
> it relies on there being no truncations or extensions to "marry up"
> the appropriate PARITY, PARITYL and PARITYLL forms with either BSWAP32
> or BSWAP64, assuming this transformation won't fire if the integral
> types have different sizes.

There would be a convert_expr or similar if the sizes were different, so 
you are safe.

I wish we had only generic builtins/ifn instead of
__builtin_parity{,l,ll,imax}, and inconsistently __builtin_bswap{16,32,64,128},
that's very inconvenient to deal with... And genmatch generates a lot of
useless code because of that.

I didn't try but couldn't the transformations be merged to reduce repetition?

(for bitcnt (POPCOUNT PARITY)
  (for swap (BUILT_IN_BSWAP32 BUILT_IN_BSWAP64)
   (simplify
    (bitcnt (swap @0))
    (bitcnt @0)))
  (for rot (lrotate rrotate)
   (simplify
    (bitcnt (rot @0 @1))
    (bitcnt @0))))

I assume you skipped BUILT_IN_BSWAP16 because on 32+ bit targets, there
is an intermediate extension, and 16 bit targets are "rare"? And
BUILT_IN_BSWAP128 because on most platforms intmax_t is only 64 bits and
we don't have a 128-bit version of parity/popcount? (we have an IFN, but
it seldom appears by magic)
Bill Schmidt via Gcc-patches Aug. 24, 2020, 11:16 p.m. UTC | #2
On Fri, 2020-08-21 at 17:52 +0100, Roger Sayle wrote:
> This simple patch to match.pd optimizes away bit permutation
> operations, specifically bswap and rotate, in calls to popcount and
> parity.  Although this patch has been developed and tested on LP64,
> it relies on there being no truncations or extensions to "marry up"
> the appropriate PARITY, PARITYL and PARITYLL forms with either BSWAP32
> or BSWAP64, assuming this transformation won't fire if the integral
> types have different sizes.
> 
> The following patch has been tested on x86_64-pc-linux-gnu with
> "make bootstrap" and "make -k check" with no new failures.
> Ok for mainline?
> 
> 2020-08-21  Roger Sayle  <roger@nextmovesoftware.com>
> 
> gcc/ChangeLog
> 	* gcc/match.pd (popcount(bswapN(x)) -> popcount(x),
> 	popcount(rotate(x)) -> popcount(x), parity(bswapN(x)) -> parity(x),
> 	parity(rotate(x)) -> parity(x)): New simplifications.
> 
> gcc/testsuite/ChangeLog
> 	* gcc.dg/fold-popcount-6.c: New test.
> 	* gcc.dg/fold-popcount-7.c: New test.
> 	* gcc.dg/fold-parity-6.c: New test.
> 	* gcc.dg/fold-parity-7.c: New test.
My worry here would be GCC's habit of not type checking arguments to builtins all
that well.  It's come up several times recently and while I think some of those
holes have been closed, I don't have much confidence we're doing a good job
there.

So with that in mind, this is fine if you verify that the type of the argument is
the same as the resultant type.  I think you've got access to both within the
match.pd framework.

jeff
>
diff mbox series

Patch

diff --git a/gcc/match.pd b/gcc/match.pd
index c3b8816..7e8a893 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -6060,12 +6060,40 @@  DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
   (bit_and (POPCOUNT @0) integer_onep)
   (PARITY @0))
 
+/* Rely on no conversion to marry POPCOUNT, POPCOUNTL and POPCOUNTLL.  */
+(simplify
+  (POPCOUNT (BUILT_IN_BSWAP32 @0))
+  (POPCOUNT @0))
+(simplify
+  (POPCOUNT (BUILT_IN_BSWAP64 @0))
+  (POPCOUNT @0))
+
+(for popcount (POPCOUNT)
+  (for rot (lrotate rrotate)
+    (simplify
+      (popcount (rot @0 @1))
+      (popcount @0))))
+
 /* PARITY simplifications.  */
 /* parity(~X) is parity(X).  */
 (simplify
   (PARITY (bit_not @0))
   (PARITY @0))
 
+/* Rely on no conversion to marry PARITY, PARITYL and PARITYLL.  */
+(simplify
+  (PARITY (BUILT_IN_BSWAP32 @0))
+  (PARITY @0))
+(simplify
+  (PARITY (BUILT_IN_BSWAP64 @0))
+  (PARITY @0))
+
+(for parity (PARITY)
+  (for rot (lrotate rrotate)
+    (simplify
+      (parity (rot @0 @1))
+      (parity @0))))
+
 /* parity(X)^parity(Y) is parity(X^Y).  */
 (simplify
   (bit_xor (PARITY:s @0) (PARITY:s @1))