diff mbox series

Add missing popcount simplifications (PR90693)

Message ID VI1PR0801MB21278052E2DDC128FD0D01EF83D20@VI1PR0801MB2127.eurprd08.prod.outlook.com
State New
Headers show
Series Add missing popcount simplifications (PR90693) | expand

Commit Message

Wilco Dijkstra Aug. 13, 2019, 3:49 p.m. UTC
Add simplifications for popcount (x) > 1 to (x & (x-1)) != 0 and
popcount (x) == 1 into (x-1) <u (x & -x).  These trigger only for
single-use cases and support an optional convert.  A microbenchmark
shows a speedup of 2-2.5x on both x64 and AArch64.

Bootstrap OK, OK for commit?

ChangeLog:
2019-08-13  Wilco Dijkstra  <wdijkstr@arm.com>

gcc/
	PR middle-end/90693
	* match.pd: Add popcount simplifications.

testsuite/
	PR middle-end/90693
	* gcc.dg/fold-popcount-5.c: Add new test.

---

Comments

Marc Glisse Aug. 13, 2019, 4:28 p.m. UTC | #1
On Tue, 13 Aug 2019, Wilco Dijkstra wrote:

> Add simplifications for popcount (x) > 1 to (x & (x-1)) != 0 and
> popcount (x) == 1 into (x-1) <u (x & -x).  These trigger only for
> single-use cases and support an optional convert.  A microbenchmark
> shows a speedup of 2-2.5x on both x64 and AArch64.

Is that true even on targets that have a popcount instruction? (-mpopcnt 
for x64)

> diff --git a/gcc/match.pd b/gcc/match.pd
> index 0317bc704f771f626ab72189b3a54de00087ad5a..bf4351a330f45f3a1424d9792cefc3da6267597d 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -5356,7 +5356,24 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>        rep (eq eq ne ne)
>     (simplify
>       (cmp (popcount @0) integer_zerop)
> -      (rep @0 { build_zero_cst (TREE_TYPE (@0)); }))))
> +      (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))
> +  /* popcount(X) == 1 -> (X-1) <u (X & -X).  */
> +  (for cmp (eq ne)
> +       rep (lt ge)
> +    (simplify
> +      (cmp (convert? (popcount:s @0)) integer_onep)
> +      (with {
> +	      tree utype = unsigned_type_for (TREE_TYPE (@0));
> +	      tree a0 = fold_convert (utype, @0); }

That doesn't seem right for a gimple transformation. I assume you didn't 
write (convert:utype @0) in the output because you want to avoid doing it 
3 times? IIRC you are allowed to write (convert:utype@1 @0) in the output 
and reuse @1 several times.

> +	(rep (plus { a0; } { build_minus_one_cst (utype); })
> +	     (bit_and (negate { a0; }) { a0; })))))
> +  /* popcount(X) > 1 -> (X & (X-1)) != 0.  */
> +  (for cmp (gt le)
> +       rep (ne eq)
> +    (simplify
> +      (cmp (convert? (popcount:s @0)) integer_onep)
> +      (rep (bit_and (plus @0 { build_minus_one_cst (TREE_TYPE (@0)); }) @0)
> +	   { build_zero_cst (TREE_TYPE (@0)); }))))

Are there any types where this could be a problem? Say if you cast to a 
1-bit type. Actually, even converting popcnt(__uint128_t(-1)) to signed 
char may be problematic.
Andrew Pinski Aug. 13, 2019, 4:46 p.m. UTC | #2
On Tue, Aug 13, 2019 at 8:50 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> Add simplifications for popcount (x) > 1 to (x & (x-1)) != 0 and
> popcount (x) == 1 into (x-1) <u (x & -x).  These trigger only for
> single-use cases and support an optional convert.  A microbenchmark
> shows a speedup of 2-2.5x on both x64 and AArch64.
>
> Bootstrap OK, OK for commit?

I think this should be in expand stage where there could be comparison
of the cost of the RTLs.
The only reason why it is faster for AARCH64 is the requirement of
moving between the GPRs and the SIMD registers.

Thanks,
Andrew Pinski

>
> ChangeLog:
> 2019-08-13  Wilco Dijkstra  <wdijkstr@arm.com>
>
> gcc/
>         PR middle-end/90693
>         * match.pd: Add popcount simplifications.
>
> testsuite/
>         PR middle-end/90693
>         * gcc.dg/fold-popcount-5.c: Add new test.
>
> ---
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 0317bc704f771f626ab72189b3a54de00087ad5a..bf4351a330f45f3a1424d9792cefc3da6267597d 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -5356,7 +5356,24 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>         rep (eq eq ne ne)
>      (simplify
>        (cmp (popcount @0) integer_zerop)
> -      (rep @0 { build_zero_cst (TREE_TYPE (@0)); }))))
> +      (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))
> +  /* popcount(X) == 1 -> (X-1) <u (X & -X).  */
> +  (for cmp (eq ne)
> +       rep (lt ge)
> +    (simplify
> +      (cmp (convert? (popcount:s @0)) integer_onep)
> +      (with {
> +             tree utype = unsigned_type_for (TREE_TYPE (@0));
> +             tree a0 = fold_convert (utype, @0); }
> +       (rep (plus { a0; } { build_minus_one_cst (utype); })
> +            (bit_and (negate { a0; }) { a0; })))))
> +  /* popcount(X) > 1 -> (X & (X-1)) != 0.  */
> +  (for cmp (gt le)
> +       rep (ne eq)
> +    (simplify
> +      (cmp (convert? (popcount:s @0)) integer_onep)
> +      (rep (bit_and (plus @0 { build_minus_one_cst (TREE_TYPE (@0)); }) @0)
> +          { build_zero_cst (TREE_TYPE (@0)); }))))
>
>  /* Simplify:
>
> diff --git a/gcc/testsuite/gcc.dg/fold-popcount-5.c b/gcc/testsuite/gcc.dg/fold-popcount-5.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..fcf3910587caacb8e39cf437dc3971df892f405a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/fold-popcount-5.c
> @@ -0,0 +1,69 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +/* Test popcount (x) > 1 -> (x & (x-1)) != 0.  */
> +
> +int test_1 (long x)
> +{
> +  return __builtin_popcountl (x) >= 2;
> +}
> +
> +int test_2 (int x)
> +{
> +  return (unsigned) __builtin_popcount (x) <= 1u;
> +}
> +
> +int test_3 (unsigned x)
> +{
> +  return __builtin_popcount (x) > 1u;
> +}
> +
> +int test_4 (unsigned long x)
> +{
> +  return (unsigned char) __builtin_popcountl (x) > 1;
> +}
> +
> +int test_5 (unsigned long x)
> +{
> +  return (signed char) __builtin_popcountl (x) <= (signed char)1;
> +}
> +
> +int test_6 (unsigned long long x)
> +{
> +  return 2u <= __builtin_popcountll (x);
> +}
> +
> +/* Test popcount (x) == 1 -> (x-1) <u (x & -x).  */
> +
> +int test_7 (unsigned long x)
> +{
> +  return __builtin_popcountl (x) != 1;
> +}
> +
> +int test_8 (long long x)
> +{
> +  return (unsigned) __builtin_popcountll (x) == 1u;
> +}
> +
> +int test_9 (int x)
> +{
> +  return (unsigned char) __builtin_popcount (x) != 1u;
> +}
> +
> +int test_10 (unsigned x)
> +{
> +  return (unsigned char) __builtin_popcount (x) == 1;
> +}
> +
> +int test_11 (long x)
> +{
> +  return (signed char) __builtin_popcountl (x) == 1;
> +}
> +
> +int test_12 (long x)
> +{
> +  return 1u == __builtin_popcountl (x);
> +}
> +
> +/* { dg-final { scan-tree-dump-times "popcount" 0 "optimized" } } */
> +
Richard Biener Aug. 14, 2019, 8:27 a.m. UTC | #3
On Tue, Aug 13, 2019 at 6:47 PM Andrew Pinski <pinskia@gmail.com> wrote:
>
> On Tue, Aug 13, 2019 at 8:50 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> >
> > Add simplifications for popcount (x) > 1 to (x & (x-1)) != 0 and
> > popcount (x) == 1 into (x-1) <u (x & -x).  These trigger only for
> > single-use cases and support an optional convert.  A microbenchmark
> > shows a speedup of 2-2.5x on both x64 and AArch64.
> >
> > Bootstrap OK, OK for commit?
>
> I think this should be in expand stage where there could be comparison
> of the cost of the RTLs.

I tend to agree here, if not then for the reason the "simplified" variants
have more GIMPLE stmts which means they are not "simpler".  In
fact I'd argue for canonicalization we'd want to have the reverse
"simplifications" on GIMPLE and expansion based on target cost.

Richard.

> The only reason why it is faster for AARCH64 is the requirement of
> moving between the GPRs and the SIMD registers.
>
> Thanks,
> Andrew Pinski
>
> >
> > ChangeLog:
> > 2019-08-13  Wilco Dijkstra  <wdijkstr@arm.com>
> >
> > gcc/
> >         PR middle-end/90693
> >         * match.pd: Add popcount simplifications.
> >
> > testsuite/
> >         PR middle-end/90693
> >         * gcc.dg/fold-popcount-5.c: Add new test.
> >
> > ---
> >
> > diff --git a/gcc/match.pd b/gcc/match.pd
> > index 0317bc704f771f626ab72189b3a54de00087ad5a..bf4351a330f45f3a1424d9792cefc3da6267597d 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -5356,7 +5356,24 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >         rep (eq eq ne ne)
> >      (simplify
> >        (cmp (popcount @0) integer_zerop)
> > -      (rep @0 { build_zero_cst (TREE_TYPE (@0)); }))))
> > +      (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))
> > +  /* popcount(X) == 1 -> (X-1) <u (X & -X).  */
> > +  (for cmp (eq ne)
> > +       rep (lt ge)
> > +    (simplify
> > +      (cmp (convert? (popcount:s @0)) integer_onep)
> > +      (with {
> > +             tree utype = unsigned_type_for (TREE_TYPE (@0));
> > +             tree a0 = fold_convert (utype, @0); }
> > +       (rep (plus { a0; } { build_minus_one_cst (utype); })
> > +            (bit_and (negate { a0; }) { a0; })))))
> > +  /* popcount(X) > 1 -> (X & (X-1)) != 0.  */
> > +  (for cmp (gt le)
> > +       rep (ne eq)
> > +    (simplify
> > +      (cmp (convert? (popcount:s @0)) integer_onep)
> > +      (rep (bit_and (plus @0 { build_minus_one_cst (TREE_TYPE (@0)); }) @0)
> > +          { build_zero_cst (TREE_TYPE (@0)); }))))
> >
> >  /* Simplify:
> >
> > diff --git a/gcc/testsuite/gcc.dg/fold-popcount-5.c b/gcc/testsuite/gcc.dg/fold-popcount-5.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..fcf3910587caacb8e39cf437dc3971df892f405a
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/fold-popcount-5.c
> > @@ -0,0 +1,69 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -fdump-tree-optimized" } */
> > +
> > +/* Test popcount (x) > 1 -> (x & (x-1)) != 0.  */
> > +
> > +int test_1 (long x)
> > +{
> > +  return __builtin_popcountl (x) >= 2;
> > +}
> > +
> > +int test_2 (int x)
> > +{
> > +  return (unsigned) __builtin_popcount (x) <= 1u;
> > +}
> > +
> > +int test_3 (unsigned x)
> > +{
> > +  return __builtin_popcount (x) > 1u;
> > +}
> > +
> > +int test_4 (unsigned long x)
> > +{
> > +  return (unsigned char) __builtin_popcountl (x) > 1;
> > +}
> > +
> > +int test_5 (unsigned long x)
> > +{
> > +  return (signed char) __builtin_popcountl (x) <= (signed char)1;
> > +}
> > +
> > +int test_6 (unsigned long long x)
> > +{
> > +  return 2u <= __builtin_popcountll (x);
> > +}
> > +
> > +/* Test popcount (x) == 1 -> (x-1) <u (x & -x).  */
> > +
> > +int test_7 (unsigned long x)
> > +{
> > +  return __builtin_popcountl (x) != 1;
> > +}
> > +
> > +int test_8 (long long x)
> > +{
> > +  return (unsigned) __builtin_popcountll (x) == 1u;
> > +}
> > +
> > +int test_9 (int x)
> > +{
> > +  return (unsigned char) __builtin_popcount (x) != 1u;
> > +}
> > +
> > +int test_10 (unsigned x)
> > +{
> > +  return (unsigned char) __builtin_popcount (x) == 1;
> > +}
> > +
> > +int test_11 (long x)
> > +{
> > +  return (signed char) __builtin_popcountl (x) == 1;
> > +}
> > +
> > +int test_12 (long x)
> > +{
> > +  return 1u == __builtin_popcountl (x);
> > +}
> > +
> > +/* { dg-final { scan-tree-dump-times "popcount" 0 "optimized" } } */
> > +
Wilco Dijkstra Aug. 21, 2019, 1:57 p.m. UTC | #4
Hi Richard,

> >
> > I think this should be in expand stage where there could be comparison
> > of the cost of the RTLs.
> 
> I tend to agree here, if not then for the reason the "simplified" variants
> have more GIMPLE stmts which means they are not "simpler".  In
> fact I'd argue for canonicalization we'd want to have the reverse
> "simplifications" on GIMPLE and expansion based on target cost.
 
So how would this work? Expand works on one statement at a time, but
we are dealing with more complex expressions here. When we get a
popcount (x) > 1 in expand_gimple_cond, the popcount has already been
expanded. And the code in builtins.c that emits popcount doesn't see or
consider the comparison, so it would be difficult to change it at that point.
None of the infrastructure in expand seems to be set up to do complex
pattern matches and replacements at expand time...

Costing would be difficult too since rtx_cost doesn't support builtins or
calls, so each backend would need to be modified to add costs for these.

So what is the best place to do pattern matches? I thought it was all
moving to match.pd.

Wilco
Jeff Law Aug. 21, 2019, 3:53 p.m. UTC | #5
On 8/21/19 7:57 AM, Wilco Dijkstra wrote:
> Hi Richard,
> 
>>>
>>> I think this should be in expand stage where there could be comparison
>>> of the cost of the RTLs.
>>
>> I tend to agree here, if not then for the reason the "simplified" variants
>> have more GIMPLE stmts which means they are not "simpler".  In
>> fact I'd argue for canonicalization we'd want to have the reverse
>> "simplifications" on GIMPLE and expansion based on target cost.
>  
> So how would this work? Expand works on one statement at a time, but
> we are dealing with more complex expressions here. When we get a
> popcount (x) > 1 in expand_gimple_cond, the popcount has already been
> expanded. And the code in builtins.c that emits popcount doesn't see or
> consider the comparison, so it would be difficult to change it at that point.
> None of the infrastructure in expand seems to be set up to do complex
> pattern matches and replacements at expand time...
> 
> Costing would be difficult too since rtx_cost doesn't support builtins or
> calls, so each backend would need to be modified to add costs for these.
> 
> So what is the best place to do pattern matches? I thought it was all
> moving to match.pd.
I believe the expanders have access to more than one statement via the
use-def chains and TER's transformations.

jeff
Richard Biener Aug. 22, 2019, 10:59 a.m. UTC | #6
On Wed, Aug 21, 2019 at 5:53 PM Jeff Law <law@redhat.com> wrote:
>
> On 8/21/19 7:57 AM, Wilco Dijkstra wrote:
> > Hi Richard,
> >
> >>>
> >>> I think this should be in expand stage where there could be comparison
> >>> of the cost of the RTLs.
> >>
> >> I tend to agree here, if not then for the reason the "simplified" variants
> >> have more GIMPLE stmts which means they are not "simpler".  In
> >> fact I'd argue for canonicalization we'd want to have the reverse
> >> "simplifications" on GIMPLE and expansion based on target cost.
> >
> > So how would this work? Expand works on one statement at a time, but
> > we are dealing with more complex expressions here. When we get a
> > popcount (x) > 1 in expand_gimple_cond, the popcount has already been
> > expanded. And the code in builtins.c that emits popcount doesn't see or
> > consider the comparison, so it would be difficult to change it at that point.
> > None of the infrastructure in expand seems to be set up to do complex
> > pattern matches and replacements at expand time...
> >
> > Costing would be difficult too since rtx_cost doesn't support builtins or
> > calls, so each backend would need to be modified to add costs for these.
> >
> > So what is the best place to do pattern matches? I thought it was all
> > moving to match.pd.
> I believe the expanders have access to more than one statement via the
> use-def chains and TER's transformations.

Either that or as repeatedly suggested elsewhere more complex
"expand time instruction selection" can happen on GIMPLE right
before RTL expansion (pass_optimize_widening_mul is a pass
doing something like that).  We probably want to have a more
formalized thing at some point as part of RTL expansion itself
to also get rid of TER.

The issue with using TER for this is that TER doesn't "handle"
internal FN calls I think (which is simply an oversight):

      /* Increment counter if this is a non BUILT_IN call. We allow
         replacement over BUILT_IN calls since many will expand to inline
         insns instead of a true call.  */
      if (is_gimple_call (stmt)
          && !((fndecl = gimple_call_fndecl (stmt))
               && fndecl_built_in_p (fndecl)))
        cur_call_cnt++;

Richard.

> jeff
diff mbox series

Patch

diff --git a/gcc/match.pd b/gcc/match.pd
index 0317bc704f771f626ab72189b3a54de00087ad5a..bf4351a330f45f3a1424d9792cefc3da6267597d 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -5356,7 +5356,24 @@  DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
        rep (eq eq ne ne)
     (simplify
       (cmp (popcount @0) integer_zerop)
-      (rep @0 { build_zero_cst (TREE_TYPE (@0)); }))))
+      (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))
+  /* popcount(X) == 1 -> (X-1) <u (X & -X).  */
+  (for cmp (eq ne)
+       rep (lt ge)
+    (simplify
+      (cmp (convert? (popcount:s @0)) integer_onep)
+      (with {
+	      tree utype = unsigned_type_for (TREE_TYPE (@0));
+	      tree a0 = fold_convert (utype, @0); }
+	(rep (plus { a0; } { build_minus_one_cst (utype); })
+	     (bit_and (negate { a0; }) { a0; })))))
+  /* popcount(X) > 1 -> (X & (X-1)) != 0.  */
+  (for cmp (gt le)
+       rep (ne eq)
+    (simplify
+      (cmp (convert? (popcount:s @0)) integer_onep)
+      (rep (bit_and (plus @0 { build_minus_one_cst (TREE_TYPE (@0)); }) @0)
+	   { build_zero_cst (TREE_TYPE (@0)); }))))
 
 /* Simplify:
 
diff --git a/gcc/testsuite/gcc.dg/fold-popcount-5.c b/gcc/testsuite/gcc.dg/fold-popcount-5.c
new file mode 100644
index 0000000000000000000000000000000000000000..fcf3910587caacb8e39cf437dc3971df892f405a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-popcount-5.c
@@ -0,0 +1,69 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+/* Test popcount (x) > 1 -> (x & (x-1)) != 0.  */
+
+int test_1 (long x)
+{
+  return __builtin_popcountl (x) >= 2;
+}
+
+int test_2 (int x)
+{
+  return (unsigned) __builtin_popcount (x) <= 1u;
+}
+
+int test_3 (unsigned x)
+{
+  return __builtin_popcount (x) > 1u;
+}
+
+int test_4 (unsigned long x)
+{
+  return (unsigned char) __builtin_popcountl (x) > 1;
+}
+
+int test_5 (unsigned long x)
+{
+  return (signed char) __builtin_popcountl (x) <= (signed char)1;
+}
+
+int test_6 (unsigned long long x)
+{
+  return 2u <= __builtin_popcountll (x);
+}
+
+/* Test popcount (x) == 1 -> (x-1) <u (x & -x).  */
+
+int test_7 (unsigned long x)
+{
+  return __builtin_popcountl (x) != 1;
+}
+
+int test_8 (long long x)
+{
+  return (unsigned) __builtin_popcountll (x) == 1u;
+}
+
+int test_9 (int x)
+{
+  return (unsigned char) __builtin_popcount (x) != 1u;
+}
+
+int test_10 (unsigned x)
+{
+  return (unsigned char) __builtin_popcount (x) == 1;
+}
+
+int test_11 (long x)
+{
+  return (signed char) __builtin_popcountl (x) == 1;
+}
+
+int test_12 (long x)
+{
+  return 1u == __builtin_popcountl (x);
+}
+
+/* { dg-final { scan-tree-dump-times "popcount" 0 "optimized" } } */
+