Add missing popcount simplifications (PR90693)
diff mbox series

Message ID VI1PR0801MB21278052E2DDC128FD0D01EF83D20@VI1PR0801MB2127.eurprd08.prod.outlook.com
State New
Headers show
Series
  • Add missing popcount simplifications (PR90693)
Related show

Commit Message

Wilco Dijkstra Aug. 13, 2019, 3:49 p.m. UTC
Add simplifications for popcount (x) > 1 to (x & (x-1)) != 0 and
popcount (x) == 1 into (x-1) <u (x & -x).  These trigger only for
single-use cases and support an optional convert.  A microbenchmark
shows a speedup of 2-2.5x on both x64 and AArch64.

Bootstrap OK, OK for commit?

ChangeLog:
2019-08-13  Wilco Dijkstra  <wdijkstr@arm.com>

gcc/
	PR middle-end/90693
	* match.pd: Add popcount simplifications.

testsuite/
	PR middle-end/90693
	* gcc.dg/fold-popcount-5.c: Add new test.

---

Comments

Marc Glisse Aug. 13, 2019, 4:28 p.m. UTC | #1
On Tue, 13 Aug 2019, Wilco Dijkstra wrote:

> Add simplifications for popcount (x) > 1 to (x & (x-1)) != 0 and
> popcount (x) == 1 into (x-1) <u (x & -x).  These trigger only for
> single-use cases and support an optional convert.  A microbenchmark
> shows a speedup of 2-2.5x on both x64 and AArch64.

Is that true even on targets that have a popcount instruction? (-mpopcnt 
for x64)

> diff --git a/gcc/match.pd b/gcc/match.pd
> index 0317bc704f771f626ab72189b3a54de00087ad5a..bf4351a330f45f3a1424d9792cefc3da6267597d 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -5356,7 +5356,24 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>        rep (eq eq ne ne)
>     (simplify
>       (cmp (popcount @0) integer_zerop)
> -      (rep @0 { build_zero_cst (TREE_TYPE (@0)); }))))
> +      (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))
> +  /* popcount(X) == 1 -> (X-1) <u (X & -X).  */
> +  (for cmp (eq ne)
> +       rep (lt ge)
> +    (simplify
> +      (cmp (convert? (popcount:s @0)) integer_onep)
> +      (with {
> +	      tree utype = unsigned_type_for (TREE_TYPE (@0));
> +	      tree a0 = fold_convert (utype, @0); }

That doesn't seem right for a gimple transformation. I assume you didn't 
write (convert:utype @0) in the output because you want to avoid doing it 
3 times? IIRC you are allowed to write (convert:utype@1 @0) in the output 
and reuse @1 several times.

> +	(rep (plus { a0; } { build_minus_one_cst (utype); })
> +	     (bit_and (negate { a0; }) { a0; })))))
> +  /* popcount(X) > 1 -> (X & (X-1)) != 0.  */
> +  (for cmp (gt le)
> +       rep (ne eq)
> +    (simplify
> +      (cmp (convert? (popcount:s @0)) integer_onep)
> +      (rep (bit_and (plus @0 { build_minus_one_cst (TREE_TYPE (@0)); }) @0)
> +	   { build_zero_cst (TREE_TYPE (@0)); }))))

Are there any types where this could be a problem? Say if you cast to a 
1-bit type. Actually, even converting popcnt(__uint128_t(-1)) to signed 
char may be problematic.
Andrew Pinski Aug. 13, 2019, 4:46 p.m. UTC | #2
On Tue, Aug 13, 2019 at 8:50 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> Add simplifications for popcount (x) > 1 to (x & (x-1)) != 0 and
> popcount (x) == 1 into (x-1) <u (x & -x).  These trigger only for
> single-use cases and support an optional convert.  A microbenchmark
> shows a speedup of 2-2.5x on both x64 and AArch64.
>
> Bootstrap OK, OK for commit?

I think this should be in expand stage where there could be comparison
of the cost of the RTLs.
The only reason why it is faster for AARCH64 is the requirement of
moving between the GPRs and the SIMD registers.

Thanks,
Andrew Pinski

>
> ChangeLog:
> 2019-08-13  Wilco Dijkstra  <wdijkstr@arm.com>
>
> gcc/
>         PR middle-end/90693
>         * match.pd: Add popcount simplifications.
>
> testsuite/
>         PR middle-end/90693
>         * gcc.dg/fold-popcount-5.c: Add new test.
>
> ---
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 0317bc704f771f626ab72189b3a54de00087ad5a..bf4351a330f45f3a1424d9792cefc3da6267597d 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -5356,7 +5356,24 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>         rep (eq eq ne ne)
>      (simplify
>        (cmp (popcount @0) integer_zerop)
> -      (rep @0 { build_zero_cst (TREE_TYPE (@0)); }))))
> +      (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))
> +  /* popcount(X) == 1 -> (X-1) <u (X & -X).  */
> +  (for cmp (eq ne)
> +       rep (lt ge)
> +    (simplify
> +      (cmp (convert? (popcount:s @0)) integer_onep)
> +      (with {
> +             tree utype = unsigned_type_for (TREE_TYPE (@0));
> +             tree a0 = fold_convert (utype, @0); }
> +       (rep (plus { a0; } { build_minus_one_cst (utype); })
> +            (bit_and (negate { a0; }) { a0; })))))
> +  /* popcount(X) > 1 -> (X & (X-1)) != 0.  */
> +  (for cmp (gt le)
> +       rep (ne eq)
> +    (simplify
> +      (cmp (convert? (popcount:s @0)) integer_onep)
> +      (rep (bit_and (plus @0 { build_minus_one_cst (TREE_TYPE (@0)); }) @0)
> +          { build_zero_cst (TREE_TYPE (@0)); }))))
>
>  /* Simplify:
>
> diff --git a/gcc/testsuite/gcc.dg/fold-popcount-5.c b/gcc/testsuite/gcc.dg/fold-popcount-5.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..fcf3910587caacb8e39cf437dc3971df892f405a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/fold-popcount-5.c
> @@ -0,0 +1,69 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +/* Test popcount (x) > 1 -> (x & (x-1)) != 0.  */
> +
> +int test_1 (long x)
> +{
> +  return __builtin_popcountl (x) >= 2;
> +}
> +
> +int test_2 (int x)
> +{
> +  return (unsigned) __builtin_popcount (x) <= 1u;
> +}
> +
> +int test_3 (unsigned x)
> +{
> +  return __builtin_popcount (x) > 1u;
> +}
> +
> +int test_4 (unsigned long x)
> +{
> +  return (unsigned char) __builtin_popcountl (x) > 1;
> +}
> +
> +int test_5 (unsigned long x)
> +{
> +  return (signed char) __builtin_popcountl (x) <= (signed char)1;
> +}
> +
> +int test_6 (unsigned long long x)
> +{
> +  return 2u <= __builtin_popcountll (x);
> +}
> +
> +/* Test popcount (x) == 1 -> (x-1) <u (x & -x).  */
> +
> +int test_7 (unsigned long x)
> +{
> +  return __builtin_popcountl (x) != 1;
> +}
> +
> +int test_8 (long long x)
> +{
> +  return (unsigned) __builtin_popcountll (x) == 1u;
> +}
> +
> +int test_9 (int x)
> +{
> +  return (unsigned char) __builtin_popcount (x) != 1u;
> +}
> +
> +int test_10 (unsigned x)
> +{
> +  return (unsigned char) __builtin_popcount (x) == 1;
> +}
> +
> +int test_11 (long x)
> +{
> +  return (signed char) __builtin_popcountl (x) == 1;
> +}
> +
> +int test_12 (long x)
> +{
> +  return 1u == __builtin_popcountl (x);
> +}
> +
> +/* { dg-final { scan-tree-dump-times "popcount" 0 "optimized" } } */
> +
Richard Biener Aug. 14, 2019, 8:27 a.m. UTC | #3
On Tue, Aug 13, 2019 at 6:47 PM Andrew Pinski <pinskia@gmail.com> wrote:
>
> On Tue, Aug 13, 2019 at 8:50 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> >
> > Add simplifications for popcount (x) > 1 to (x & (x-1)) != 0 and
> > popcount (x) == 1 into (x-1) <u (x & -x).  These trigger only for
> > single-use cases and support an optional convert.  A microbenchmark
> > shows a speedup of 2-2.5x on both x64 and AArch64.
> >
> > Bootstrap OK, OK for commit?
>
> I think this should be in expand stage where there could be comparison
> of the cost of the RTLs.

I tend to agree here, if not then for the reason the "simplified" variants
have more GIMPLE stmts which means they are not "simpler".  In
fact I'd argue for canonicalization we'd want to have the reverse
"simplifications" on GIMPLE and expansion based on target cost.

Richard.

> The only reason why it is faster for AARCH64 is the requirement of
> moving between the GPRs and the SIMD registers.
>
> Thanks,
> Andrew Pinski
>
> >
> > ChangeLog:
> > 2019-08-13  Wilco Dijkstra  <wdijkstr@arm.com>
> >
> > gcc/
> >         PR middle-end/90693
> >         * match.pd: Add popcount simplifications.
> >
> > testsuite/
> >         PR middle-end/90693
> >         * gcc.dg/fold-popcount-5.c: Add new test.
> >
> > ---
> >
> > diff --git a/gcc/match.pd b/gcc/match.pd
> > index 0317bc704f771f626ab72189b3a54de00087ad5a..bf4351a330f45f3a1424d9792cefc3da6267597d 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -5356,7 +5356,24 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >         rep (eq eq ne ne)
> >      (simplify
> >        (cmp (popcount @0) integer_zerop)
> > -      (rep @0 { build_zero_cst (TREE_TYPE (@0)); }))))
> > +      (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))
> > +  /* popcount(X) == 1 -> (X-1) <u (X & -X).  */
> > +  (for cmp (eq ne)
> > +       rep (lt ge)
> > +    (simplify
> > +      (cmp (convert? (popcount:s @0)) integer_onep)
> > +      (with {
> > +             tree utype = unsigned_type_for (TREE_TYPE (@0));
> > +             tree a0 = fold_convert (utype, @0); }
> > +       (rep (plus { a0; } { build_minus_one_cst (utype); })
> > +            (bit_and (negate { a0; }) { a0; })))))
> > +  /* popcount(X) > 1 -> (X & (X-1)) != 0.  */
> > +  (for cmp (gt le)
> > +       rep (ne eq)
> > +    (simplify
> > +      (cmp (convert? (popcount:s @0)) integer_onep)
> > +      (rep (bit_and (plus @0 { build_minus_one_cst (TREE_TYPE (@0)); }) @0)
> > +          { build_zero_cst (TREE_TYPE (@0)); }))))
> >
> >  /* Simplify:
> >
> > diff --git a/gcc/testsuite/gcc.dg/fold-popcount-5.c b/gcc/testsuite/gcc.dg/fold-popcount-5.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..fcf3910587caacb8e39cf437dc3971df892f405a
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/fold-popcount-5.c
> > @@ -0,0 +1,69 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -fdump-tree-optimized" } */
> > +
> > +/* Test popcount (x) > 1 -> (x & (x-1)) != 0.  */
> > +
> > +int test_1 (long x)
> > +{
> > +  return __builtin_popcountl (x) >= 2;
> > +}
> > +
> > +int test_2 (int x)
> > +{
> > +  return (unsigned) __builtin_popcount (x) <= 1u;
> > +}
> > +
> > +int test_3 (unsigned x)
> > +{
> > +  return __builtin_popcount (x) > 1u;
> > +}
> > +
> > +int test_4 (unsigned long x)
> > +{
> > +  return (unsigned char) __builtin_popcountl (x) > 1;
> > +}
> > +
> > +int test_5 (unsigned long x)
> > +{
> > +  return (signed char) __builtin_popcountl (x) <= (signed char)1;
> > +}
> > +
> > +int test_6 (unsigned long long x)
> > +{
> > +  return 2u <= __builtin_popcountll (x);
> > +}
> > +
> > +/* Test popcount (x) == 1 -> (x-1) <u (x & -x).  */
> > +
> > +int test_7 (unsigned long x)
> > +{
> > +  return __builtin_popcountl (x) != 1;
> > +}
> > +
> > +int test_8 (long long x)
> > +{
> > +  return (unsigned) __builtin_popcountll (x) == 1u;
> > +}
> > +
> > +int test_9 (int x)
> > +{
> > +  return (unsigned char) __builtin_popcount (x) != 1u;
> > +}
> > +
> > +int test_10 (unsigned x)
> > +{
> > +  return (unsigned char) __builtin_popcount (x) == 1;
> > +}
> > +
> > +int test_11 (long x)
> > +{
> > +  return (signed char) __builtin_popcountl (x) == 1;
> > +}
> > +
> > +int test_12 (long x)
> > +{
> > +  return 1u == __builtin_popcountl (x);
> > +}
> > +
> > +/* { dg-final { scan-tree-dump-times "popcount" 0 "optimized" } } */
> > +
Wilco Dijkstra Aug. 21, 2019, 1:57 p.m. UTC | #4
Hi Richard,

> >
> > I think this should be in expand stage where there could be comparison
> > of the cost of the RTLs.
> 
> I tend to agree here, if not then for the reason the "simplified" variants
> have more GIMPLE stmts which means they are not "simpler".  In
> fact I'd argue for canonicalization we'd want to have the reverse
> "simplifications" on GIMPLE and expansion based on target cost.
 
So how would this work? Expand works on one statement at a time, but
we are dealing with more complex expressions here. When we get a
popcount (x) > 1 in expand_gimple_cond, the popcount has already been
expanded. And the code in builtins.c that emits popcount doesn't see or
consider the comparison, so it would be difficult to change it at that point.
None of the infrastructure in expand seems to be set up to do complex
pattern matches and replacements at expand time...

Costing would be difficult too since rtx_cost doesn't support builtins or
calls, so each backend would need to be modified to add costs for these.

So what is the best place to do pattern matches? I thought it was all
moving to match.pd.

Wilco
Jeff Law Aug. 21, 2019, 3:53 p.m. UTC | #5
On 8/21/19 7:57 AM, Wilco Dijkstra wrote:
> Hi Richard,
> 
>>>
>>> I think this should be in expand stage where there could be comparison
>>> of the cost of the RTLs.
>>
>> I tend to agree here, if not then for the reason the "simplified" variants
>> have more GIMPLE stmts which means they are not "simpler".  In
>> fact I'd argue for canonicalization we'd want to have the reverse
>> "simplifications" on GIMPLE and expansion based on target cost.
>  
> So how would this work? Expand works on one statement at a time, but
> we are dealing with more complex expressions here. When we get a
> popcount (x) > 1 in expand_gimple_cond, the popcount has already been
> expanded. And the code in builtins.c that emits popcount doesn't see or
> consider the comparison, so it would be difficult to change it at that point.
> None of the infrastructure in expand seems to be set up to do complex
> pattern matches and replacements at expand time...
> 
> Costing would be difficult too since rtx_cost doesn't support builtins or
> calls, so each backend would need to be modified to add costs for these.
> 
> So what is the best place to do pattern matches? I thought it was all
> moving to match.pd.
I believe the expanders have access to more than one statement via the
use-def chains and TER's transformations.

jeff

Patch
diff mbox series

diff --git a/gcc/match.pd b/gcc/match.pd
index 0317bc704f771f626ab72189b3a54de00087ad5a..bf4351a330f45f3a1424d9792cefc3da6267597d 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -5356,7 +5356,24 @@  DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
        rep (eq eq ne ne)
     (simplify
       (cmp (popcount @0) integer_zerop)
-      (rep @0 { build_zero_cst (TREE_TYPE (@0)); }))))
+      (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))
+  /* popcount(X) == 1 -> (X-1) <u (X & -X).  */
+  (for cmp (eq ne)
+       rep (lt ge)
+    (simplify
+      (cmp (convert? (popcount:s @0)) integer_onep)
+      (with {
+	      tree utype = unsigned_type_for (TREE_TYPE (@0));
+	      tree a0 = fold_convert (utype, @0); }
+	(rep (plus { a0; } { build_minus_one_cst (utype); })
+	     (bit_and (negate { a0; }) { a0; })))))
+  /* popcount(X) > 1 -> (X & (X-1)) != 0.  */
+  (for cmp (gt le)
+       rep (ne eq)
+    (simplify
+      (cmp (convert? (popcount:s @0)) integer_onep)
+      (rep (bit_and (plus @0 { build_minus_one_cst (TREE_TYPE (@0)); }) @0)
+	   { build_zero_cst (TREE_TYPE (@0)); }))))
 
 /* Simplify:
 
diff --git a/gcc/testsuite/gcc.dg/fold-popcount-5.c b/gcc/testsuite/gcc.dg/fold-popcount-5.c
new file mode 100644
index 0000000000000000000000000000000000000000..fcf3910587caacb8e39cf437dc3971df892f405a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-popcount-5.c
@@ -0,0 +1,69 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+/* Test popcount (x) > 1 -> (x & (x-1)) != 0.  */
+
+int test_1 (long x)
+{
+  return __builtin_popcountl (x) >= 2;
+}
+
+int test_2 (int x)
+{
+  return (unsigned) __builtin_popcount (x) <= 1u;
+}
+
+int test_3 (unsigned x)
+{
+  return __builtin_popcount (x) > 1u;
+}
+
+int test_4 (unsigned long x)
+{
+  return (unsigned char) __builtin_popcountl (x) > 1;
+}
+
+int test_5 (unsigned long x)
+{
+  return (signed char) __builtin_popcountl (x) <= (signed char)1;
+}
+
+int test_6 (unsigned long long x)
+{
+  return 2u <= __builtin_popcountll (x);
+}
+
+/* Test popcount (x) == 1 -> (x-1) <u (x & -x).  */
+
+int test_7 (unsigned long x)
+{
+  return __builtin_popcountl (x) != 1;
+}
+
+int test_8 (long long x)
+{
+  return (unsigned) __builtin_popcountll (x) == 1u;
+}
+
+int test_9 (int x)
+{
+  return (unsigned char) __builtin_popcount (x) != 1u;
+}
+
+int test_10 (unsigned x)
+{
+  return (unsigned char) __builtin_popcount (x) == 1;
+}
+
+int test_11 (long x)
+{
+  return (signed char) __builtin_popcountl (x) == 1;
+}
+
+int test_12 (long x)
+{
+  return 1u == __builtin_popcountl (x);
+}
+
+/* { dg-final { scan-tree-dump-times "popcount" 0 "optimized" } } */
+