
POPCOUNT folding optimizations

Message ID 003f01d3a1a3$66b893b0$3429bb10$@nextmovesoftware.com
State New
Series POPCOUNT folding optimizations

Commit Message

Roger Sayle Feb. 9, 2018, 12:42 p.m. UTC
The following patch implements a number of __builtin_popcount related
optimizations:
(i) popcount(x) == 0 can be simplified to x == 0, and popcount(x) != 0
to x != 0.
(ii) popcount(x&1) can be simplified to x&1, and for unsigned x,
popcount(x>>31) to x>>31.
(iii) popcount(x&6) + popcount(y&16) can be simplified to
popcount((x&6)|(y&16)).

These may seem like obscure transformations, but these kinds of POPCOUNT
operations are often the performance-critical steps in some
cheminformatics applications.
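
As a hypothetical illustration (not part of the patch), fold (i)
typically fires in fingerprint-screening code of this shape:

/* Illustrative only: does the fingerprint share any bits with the mask?  */
int has_features (unsigned int fp, unsigned int mask)
{
  /* Folds to (fp & mask) != 0, removing the popcount entirely.  */
  return __builtin_popcount (fp & mask) != 0;
}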

To implement the above transformations I've introduced the
tree_nonzero_bits function, which is a tree-level version of rtlanal's
nonzero_bits used by the RTL optimizers.
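
For intuition, here is a rough userland sketch of the idea (mine, not
the GCC implementation, which recurses over trees using wide_int): each
rule returns a mask with a 1 for every bit that MAY be nonzero.

/* Hypothetical sketch: may-be-set masks for a few operations.  */
unsigned int nz_and (unsigned int a, unsigned int b) { return a & b; }
unsigned int nz_ior (unsigned int a, unsigned int b) { return a | b; }
/* XOR uses the IOR rule: its result bits are only possibly set.  */
unsigned int nz_xor (unsigned int a, unsigned int b) { return a | b; }
unsigned int nz_shl (unsigned int a, unsigned int n) { return a << n; }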

The patch has been tested on x86_64-pc-linux-gnu with a "make bootstrap"
and "make check" with no regressions, and passes the four new gcc.dg
test cases.

Many thanks in advance.  Best regards,

Roger
--
Roger Sayle, PhD.
NextMove Software Limited
Innovation Centre (Unit 23), Cambridge Science Park, Cambridge, CB4 0EY

2018-02-09  Roger Sayle  <roger@nextmovesoftware.com>

        * fold-const.c (tree_nonzero_bits): New function.
        * fold-const.h (tree_nonzero_bits): Likewise.
        * match.pd (POPCOUNT): New patterns to fold BUILT_IN_POPCOUNT and
        friends.  POPCOUNT(x&1) => x&1, POPCOUNT(x)==0 => x==0, etc.

2018-02-09  Roger Sayle  <roger@nextmovesoftware.com>

        * gcc.dg/fold-popcount-1.c: New testcase.
        * gcc.dg/fold-popcount-2.c: New testcase.
        * gcc.dg/fold-popcount-3.c: New testcase.
        * gcc.dg/fold-popcount-4.c: New testcase.
/* gcc.dg/fold-popcount-1.c */
/* { dg-do compile } */
/* { dg-options "-O2 -fdump-tree-original" } */

int test_eqzero(unsigned int a)
{
  return __builtin_popcount(a) == 0;
}

int test_eqzerol(unsigned long b)
{
  return __builtin_popcountl(b) == 0;
}

int test_eqzeroll(unsigned long long c)
{
  return __builtin_popcountll(c) == 0;
}

int test_nezero(unsigned int d)
{
  return __builtin_popcount(d) != 0;
}

int test_nezerol(unsigned long e)
{
  return __builtin_popcountl(e) != 0;
}

int test_nezeroll(unsigned long long f)
{
  return __builtin_popcountll(f) != 0;
}

/* { dg-final { scan-tree-dump-times "popcount" 0 "original" } } */

/* gcc.dg/fold-popcount-2.c */
/* { dg-do compile } */
/* { dg-options "-O2 -fdump-tree-cddce1" } */

int test_andone(unsigned int a)
{
  return __builtin_popcount(a&1);
}

int test_andonel(unsigned long b)
{
  return __builtin_popcountl(b&1);
}

int test_andonell(unsigned long long c)
{
  return __builtin_popcountll(c&1);
}

int test_oneand(unsigned int d)
{
  return __builtin_popcount(1&d);
}

int test_oneandl(unsigned long e)
{
  return __builtin_popcountl(1&e);
}

int test_oneandll(unsigned long long f)
{
  return __builtin_popcountll(1&f);
}

/* { dg-final { scan-tree-dump-times "popcount" 0 "cddce1" } } */

/* gcc.dg/fold-popcount-3.c */
/* { dg-do compile } */
/* { dg-options "-O2 -fdump-tree-cddce1" } */

int test_combine(unsigned int a, unsigned int b)
{
  return __builtin_popcount(a&8) + __builtin_popcount(b&2);
}

/* { dg-final { scan-tree-dump-times "popcount" 1 "cddce1" } } */

/* gcc.dg/fold-popcount-4.c */
/* { dg-do compile } */
/* { dg-options "-O2 -fdump-tree-cddce1" } */

int test_shiftmax(unsigned int a)
{
  return __builtin_popcount(a>>(8*sizeof(a)-1));
}

int test_shiftmaxl(unsigned long b)
{
  return __builtin_popcountl(b>>(8*sizeof(b)-1));
}

int test_shiftmaxll(unsigned long long c)
{
  return __builtin_popcountll(c>>(8*sizeof(c)-1));
}

int test_shift7(unsigned char d)
{
  return __builtin_popcount(d>>7);
}

int test_shift7l(unsigned char e)
{
  return __builtin_popcountl(e>>7);
}

int test_shift7ll(unsigned char f)
{
  return __builtin_popcountll(f>>7);
}

int test_shift15(unsigned short g)
{
  return __builtin_popcount(g>>15);
}

int test_shift15l(unsigned short h)
{
  return __builtin_popcountl(h>>15);
}

int test_shift15ll(unsigned short i)
{
  return __builtin_popcountll(i>>15);
}

/* { dg-final { scan-tree-dump-times "popcount" 0 "cddce1" } } */

Comments

Jeff Law Feb. 10, 2018, 6:56 p.m. UTC | #1
On 02/09/2018 05:42 AM, Roger Sayle wrote:
> The following patch implements a number of __builtin_popcount related
> optimizations.
> [...]
Queued for stage1.  Thanks, Roger.  I hope things are going well.

jeff
Jeff Law April 30, 2018, 10:57 p.m. UTC | #2
On 02/09/2018 05:42 AM, Roger Sayle wrote:
> The following patch implements a number of __builtin_popcount related
> optimizations.
> [...]
>
> Index: gcc/fold-const.c
> ===================================================================
> --- gcc/fold-const.c	(revision 257227)
> +++ gcc/fold-const.c	(working copy)
> @@ -14580,6 +14580,75 @@
>    return string + offset;
>  }
>  
> +/* Given a tree T, compute which bits in T may be nonzero.  */
> +
> +wide_int
> +tree_nonzero_bits (const_tree t)
> +{
> +  switch (TREE_CODE (t))
> +    {
> +    case BIT_IOR_EXPR:
> +    case BIT_XOR_EXPR:
> +      return wi::bit_or (tree_nonzero_bits (TREE_OPERAND (t, 0)),
> +			 tree_nonzero_bits (TREE_OPERAND (t, 1)));
Hmm.   I think this will potentially have too many bits set in the
BIT_XOR case.  Is there some reason you didn't use wi::bit_xor for that
case?


We can probably go ahead and ACK this once that question is resolved.

Thanks,
jeff
Marc Glisse May 1, 2018, 8:42 a.m. UTC | #3
(I am not a reviewer, just commenting)

On Fri, 9 Feb 2018, Roger Sayle wrote:

> The following patch implements a number of __builtin_popcount related
> optimizations.
> (i) popcount(x) == 0 can be simplified to x==0, and popcount(x) != 0 to
> x!=0.
> (ii) popcount(x&1) can be simplified to x&1, and for unsigned x,
> popcount(x>>31) to x>>31.
> (iii) popcount (x&6) + popcount(y&16) can be simplified to
> popcount((x&6)|(y&16))
>
> These may seem like obscure transformations, but these kinds of POPCOUNT
> operations are often the performance-critical steps in some
> cheminformatics applications.
>
> To implement the above transformations I've introduced the 
> tree_nonzero_bits function, which is a tree-level version of rtlanal's 
> nonzero_bits used by the RTL optimizers.

I am wondering if this brings much, compared to just using 
get_nonzero_bits (works on INTEGER_CST / SSA_NAME, matched by 
with_possible_nonzero_bits). If we do decide to introduce this function, 
we probably want to use it in other places that currently use 
get_nonzero_bits as well...

> +    case NOP_EXPR:
> +    case CONVERT_EXPR:

We have CASE_CONVERT: for this pair.

> +    case LSHIFT_EXPR:
> +      if (TREE_CODE (TREE_OPERAND (t, 1)) == INTEGER_CST)

Maybe check INTEGRAL_TYPE_P as well, like you did for PLUS_EXPR? Or was 
that also unnecessary for PLUS_EXPR?

> +	  return wi::neg_p (arg1)
> +		 ? wi::rshift (nzbits, -arg1, TYPE_SIGN (type))
> +		 : wi::lshift (nzbits, arg1);

I can see that fold-const.c already does something like that. I am 
surprised the sanitizer guys haven't asked that we just punt on 
negative values instead.

> --- gcc/match.pd	(revision 257227)
> +++ gcc/match.pd	(working copy)
> @@ -4648,3 +4648,24 @@
>  	|| wi::geu_p (wi::to_wide (@rpos),
>  		      wi::to_wide (@ipos) + isize))
>      (BIT_FIELD_REF @0 @rsize @rpos)))))
> +
> +/* POPCOUNT simplifications.  */
> +(for popcount (BUILT_IN_POPCOUNT BUILT_IN_POPCOUNTL BUILT_IN_POPCOUNTLL
> +               BUILT_IN_POPCOUNTIMAX)
> +  /* popcount(X&1) is nop_expr(X&1).  */
> +  (simplify
> +    (popcount @0)
> +    (if (tree_nonzero_bits (@0) == 1)
> +      (convert @0)))

Good thing we can't call popcount on signed 1-bit types ;-)

> +  /* popcount(X) + popcount(Y) is popcount(X|Y) when X&Y must be zero.  */
> +  (simplify
> +    (plus (popcount @0) (popcount @1))

We probably want :s on both popcount: if they are used in other places 
than just this addition, it is likely cheaper not to introduce a new call 
to popcount.
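
An illustrative (hypothetical) case of that concern:

/* Here p and q each have a second use, so rewriting p + q as
   popcount (x | y) would introduce a third popcount call instead of
   replacing the two existing ones.  */
int f (unsigned int x, unsigned int y, int *out)
{
  int p = __builtin_popcount (x & 6);
  int q = __builtin_popcount (y & 16);
  out[0] = p;
  out[1] = q;
  return p + q;
}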

> +    (if (wi::bit_and (tree_nonzero_bits (@0), tree_nonzero_bits (@1)) == 0)
> +      (popcount (bit_ior @0 @1))))

We'll have to be careful if we ever turn popcount into something generic, 
but the current situation indeed should safely guarantee that @0 and @1 
have the same type.

> +  /* pocount(X) == 0 is X == 0, and related (in)equalities.  */

po+p+count

> +  (for cmp (le eq ne gt)
> +       rep (eq eq ne ne)
> +    (simplify
> +      (cmp (popcount @0) zerop)

Might as well use integer_zerop when we know we are dealing with integers.

> +      (rep @0 { build_zero_cst (TREE_TYPE (@0)); }))))
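
A hypothetical aside on why le and gt map to eq and ne here: popcount
never returns a negative value, so popcount(x) <= 0 holds exactly when
it is zero, and popcount(x) > 0 exactly when it is not:

int g (unsigned int x)
{
  return __builtin_popcount (x) > 0;   /* folds to x != 0 */
}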


Nice patch :-)
Marc Glisse May 1, 2018, 8:48 a.m. UTC | #4
On Mon, 30 Apr 2018, Jeff Law wrote:

> On 02/09/2018 05:42 AM, Roger Sayle wrote:
>> The following patch implements a number of __builtin_popcount related
>> optimizations.
>> [...]
>>
>> Index: gcc/fold-const.c
>> ===================================================================
>> --- gcc/fold-const.c	(revision 257227)
>> +++ gcc/fold-const.c	(working copy)
>> @@ -14580,6 +14580,75 @@
>>    return string + offset;
>>  }
>>
>> +/* Given a tree T, compute which bits in T may be nonzero.  */
>> +
>> +wide_int
>> +tree_nonzero_bits (const_tree t)
>> +{
>> +  switch (TREE_CODE (t))
>> +    {
>> +    case BIT_IOR_EXPR:
>> +    case BIT_XOR_EXPR:
>> +      return wi::bit_or (tree_nonzero_bits (TREE_OPERAND (t, 0)),
>> +			 tree_nonzero_bits (TREE_OPERAND (t, 1)));
> Hmm.   I think this will potentially have too many bits set in the
> BIT_XOR case.  Is there some reason you didn't use wi::bit_xor for that
> case?

You cannot use bit_xor because the bits are only *possibly* set (same as 
you can't say anything about BIT_NOT_EXPR). We would need to also track 
the bits *certainly* set to start doing clever stuff, and for BIT_XOR_EXPR 
that would still be inconvenient.
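
A concrete (hypothetical) counterexample: suppose the may-be-set mask of
both x and y is 1.

/* x = 1, y = 0 are both consistent with a may-mask of 1, yet
   x ^ y == 1, so bit 0 must stay possibly-set in the result.
   bit_xor of the masks (1 ^ 1 == 0) would wrongly claim bit 0 can
   never be set; bit_or (1 | 1 == 1) is the sound over-approximation.  */
int main (void)
{
  unsigned int x = 1, y = 0;
  return (x ^ y) == 1 ? 0 : 1;   /* bit 0 is set in the result */
}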
Jeff Law May 1, 2018, 3:01 p.m. UTC | #5
On 05/01/2018 02:48 AM, Marc Glisse wrote:
> On Mon, 30 Apr 2018, Jeff Law wrote:
> 
>> On 02/09/2018 05:42 AM, Roger Sayle wrote:
>>> The following patch implements a number of __builtin_popcount related
>>> optimizations.
>>> [...]
>>>
>>> Index: gcc/fold-const.c
>>> ===================================================================
>>> --- gcc/fold-const.c    (revision 257227)
>>> +++ gcc/fold-const.c    (working copy)
>>> @@ -14580,6 +14580,75 @@
>>>    return string + offset;
>>>  }
>>>
>>> +/* Given a tree T, compute which bits in T may be nonzero.  */
>>> +
>>> +wide_int
>>> +tree_nonzero_bits (const_tree t)
>>> +{
>>> +  switch (TREE_CODE (t))
>>> +    {
>>> +    case BIT_IOR_EXPR:
>>> +    case BIT_XOR_EXPR:
>>> +      return wi::bit_or (tree_nonzero_bits (TREE_OPERAND (t, 0)),
>>> +             tree_nonzero_bits (TREE_OPERAND (t, 1)));
>> Hmm.   I think this will potentially have too many bits set in the
>> BIT_XOR case.  Is there some reason you didn't use wi::bit_xor for that
>> case?
> 
> You cannot use bit_xor because the bits are only *possibly* set (same as
> you can't say anything about BIT_NOT_EXPR). We would need to also track
> the bits *certainly* set to start doing clever stuff, and for
> BIT_XOR_EXPR that would still be inconvenient.
Yea, I realized it was a may vs must issue after I signed off for the night.

jeff
Jeff Law May 24, 2018, 8:41 p.m. UTC | #6
On 05/01/2018 02:42 AM, Marc Glisse wrote:
> (I am not a reviewer, just commenting)
But your comments are definitely appreciated!

> 
> On Fri, 9 Feb 2018, Roger Sayle wrote:
> 
>> The following patch implements a number of __builtin_popcount related
>> optimizations.
>> [...]
>>
>> To implement the above transformations I've introduced the
>> tree_nonzero_bits function, which is a tree-level version of rtlanal's
>> nonzero_bits used by the RTL optimizers.
> 
> I am wondering if this brings much, compared to just using
> get_nonzero_bits (works on INTEGER_CST / SSA_NAME, matched by
> with_possible_nonzero_bits). If we do decide to introduce this function,
> we probably want to use it in other places that currently use
> get_nonzero_bits as well...
> 
>> +    case NOP_EXPR:
>> +    case CONVERT_EXPR:
> 
> We have CASE_CONVERT: for this pair.
I've fixed this in Roger's patch.

> 
>> +    case LSHIFT_EXPR:
>> +      if (TREE_CODE (TREE_OPERAND (t, 1)) == INTEGER_CST)
> 
> Maybe check INTEGRAL_TYPE_P as well, like you did for PLUS_EXPR? Or was
> that also unnecessary for PLUS_EXPR?
While there may be cases where allowing an INTEGRAL_TYPE_P SSA_NAME for
the shift count would be helpful, I think it'd be exceedingly rare.

For operands of a PLUS_EXPR I think we're a lot more likely to be able
to do something useful with an SSA_NAME argument.


> 
>> +      return wi::neg_p (arg1)
>> +         ? wi::rshift (nzbits, -arg1, TYPE_SIGN (type))
>> +         : wi::lshift (nzbits, arg1);
> 
> I can see that fold-const.c already does something like that. I am
> surprised the sanitizer guys haven't asked that we just punt on negative
> values instead.
I'm leaving this as-is -- while I think negative shift counts are a bad
idea, handling them in any other way is likely going to result in a real
surprise in the result of the computation.
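
For illustration (hypothetical, mirroring the patch's convention), the
mask computation simply shifts the other way for a negative count, even
though the corresponding source-level shift would be undefined:

/* May-be-set mask of (x << n) when n may be negative.  */
unsigned int nz_shl_signed (unsigned int mask, int n)
{
  return n < 0 ? mask >> -n : mask << n;
}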


> 
>> +  /* popcount(X) + popcount(Y) is popcount(X|Y) when X&Y must be
>> zero.  */
>> +  (simplify
>> +    (plus (popcount @0) (popcount @1))
> 
> We probably want :s on both popcount: if they are used in other places
> than just this addition, it is likely cheaper not to introduce a new
> call to popcount.
Yea.  I also verified that Roger's tests still work with the :s added.

> 
>> +  /* pocount(X) == 0 is X == 0, and related (in)equalities.  */
> 
> po+p+count
Fixed.

> 
>> +  (for cmp (le eq ne gt)
>> +       rep (eq eq ne ne)
>> +    (simplify
>> +      (cmp (popcount @0) zerop)
> 
> Might as well use integer_zerop when we know we are dealing with integers.
And fixed.


I've also re-bootstrapped Roger's patch and will be committing it shortly.

jeff
Marc Glisse May 24, 2018, 9:10 p.m. UTC | #7
On Thu, 24 May 2018, Jeff Law wrote:

>>> +    case LSHIFT_EXPR:
>>> +      if (TREE_CODE (TREE_OPERAND (t, 1)) == INTEGER_CST)
>>
>> Maybe check INTEGRAL_TYPE_P as well, like you did for PLUS_EXPR? Or was
>> that also unnecessary for PLUS_EXPR?
> While there may be cases where allowing an INTEGRAL_TYPE_P SSA_NAME for
> the shift count would be helpful, I think it'd be exceedingly rare.
>
> For operands of a PLUS_EXPR I think we're a lot more likely to be able
> to do something useful with an SSA_NAME argument.

This was a while ago, but it seems likely that I meant: check that the
result (or lhs) has a scalar integer type (in particular not a vector
type). Or, more precisely, I was asking why such a check is done for
PLUS_EXPR and not for LSHIFT_EXPR.
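
An illustrative (hypothetical) example of the case Marc has in mind,
where an LSHIFT_EXPR produces a vector result and a scalar nonzero-bits
mask would be meaningless:

typedef unsigned int v4si __attribute__ ((vector_size (16)));
v4si shift_vec (v4si v) { return v << 3; }   /* lhs has vector type */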

Patch

Index: gcc/fold-const.c
===================================================================
--- gcc/fold-const.c	(revision 257227)
+++ gcc/fold-const.c	(working copy)
@@ -14580,6 +14580,75 @@ 
   return string + offset;
 }
 
+/* Given a tree T, compute which bits in T may be nonzero.  */
+
+wide_int
+tree_nonzero_bits (const_tree t)
+{
+  switch (TREE_CODE (t))
+    {
+    case INTEGER_CST:
+      return wi::to_wide (t);
+    case SSA_NAME:
+      return get_nonzero_bits (t);
+    case NON_LVALUE_EXPR:
+    case SAVE_EXPR:
+      return tree_nonzero_bits (TREE_OPERAND (t, 0));
+    case BIT_AND_EXPR:
+      return wi::bit_and (tree_nonzero_bits (TREE_OPERAND (t, 0)),
+			  tree_nonzero_bits (TREE_OPERAND (t, 1)));
+    case BIT_IOR_EXPR:
+    case BIT_XOR_EXPR:
+      return wi::bit_or (tree_nonzero_bits (TREE_OPERAND (t, 0)),
+			 tree_nonzero_bits (TREE_OPERAND (t, 1)));
+    case COND_EXPR:
+      return wi::bit_or (tree_nonzero_bits (TREE_OPERAND (t, 1)),
+			 tree_nonzero_bits (TREE_OPERAND (t, 2)));
+    case NOP_EXPR:
+    case CONVERT_EXPR:
+      return wide_int::from (tree_nonzero_bits (TREE_OPERAND (t, 0)),
+			     TYPE_PRECISION (TREE_TYPE (t)),
+			     TYPE_SIGN (TREE_TYPE (TREE_OPERAND (t, 0))));
+    case PLUS_EXPR:
+      if (INTEGRAL_TYPE_P (TREE_TYPE (t)))
+	{
+	  wide_int nzbits1 = tree_nonzero_bits (TREE_OPERAND (t, 0));
+	  wide_int nzbits2 = tree_nonzero_bits (TREE_OPERAND (t, 1));
+	  if (wi::bit_and (nzbits1, nzbits2) == 0)
+	    return wi::bit_or (nzbits1, nzbits2);
+	}
+      break;
+    case LSHIFT_EXPR:
+      if (TREE_CODE (TREE_OPERAND (t, 1)) == INTEGER_CST)
+	{
+	  tree type = TREE_TYPE (t);
+	  wide_int nzbits = tree_nonzero_bits (TREE_OPERAND (t, 0));
+	  wide_int arg1 = wi::to_wide (TREE_OPERAND (t, 1),
+				       TYPE_PRECISION (type));
+	  return wi::neg_p (arg1)
+		 ? wi::rshift (nzbits, -arg1, TYPE_SIGN (type))
+		 : wi::lshift (nzbits, arg1);
+	}
+      break;
+    case RSHIFT_EXPR:
+      if (TREE_CODE (TREE_OPERAND (t, 1)) == INTEGER_CST)
+        {
+	  tree type = TREE_TYPE (t);
+	  wide_int nzbits = tree_nonzero_bits (TREE_OPERAND (t, 0));
+	  wide_int arg1 = wi::to_wide (TREE_OPERAND (t, 1),
+				       TYPE_PRECISION (type));
+	  return wi::neg_p (arg1)
+		 ? wi::lshift (nzbits, -arg1)
+		 : wi::rshift (nzbits, arg1, TYPE_SIGN (type));
+        }
+      break;
+    default:
+      break;
+    }
+
+  return wi::shwi (-1, TYPE_PRECISION (TREE_TYPE (t)));
+}
+
 #if CHECKING_P
 
 namespace selftest {
Index: gcc/fold-const.h
===================================================================
--- gcc/fold-const.h	(revision 257227)
+++ gcc/fold-const.h	(working copy)
@@ -181,6 +181,7 @@ 
 extern tree const_binop (enum tree_code, tree, tree, tree);
 extern bool negate_mathfn_p (combined_fn);
 extern const char *c_getstr (tree, unsigned HOST_WIDE_INT *strlen = NULL);
+extern wide_int tree_nonzero_bits (const_tree);
 
 /* Return OFF converted to a pointer offset type suitable as offset for
    POINTER_PLUS_EXPR.  Use location LOC for this conversion.  */
Index: gcc/match.pd
===================================================================
--- gcc/match.pd	(revision 257227)
+++ gcc/match.pd	(working copy)
@@ -4648,3 +4648,24 @@ 
 	|| wi::geu_p (wi::to_wide (@rpos),
 		      wi::to_wide (@ipos) + isize))
     (BIT_FIELD_REF @0 @rsize @rpos)))))
+
+/* POPCOUNT simplifications.  */
+(for popcount (BUILT_IN_POPCOUNT BUILT_IN_POPCOUNTL BUILT_IN_POPCOUNTLL
+               BUILT_IN_POPCOUNTIMAX)
+  /* popcount(X&1) is nop_expr(X&1).  */
+  (simplify
+    (popcount @0)
+    (if (tree_nonzero_bits (@0) == 1)
+      (convert @0)))
+  /* popcount(X) + popcount(Y) is popcount(X|Y) when X&Y must be zero.  */
+  (simplify
+    (plus (popcount @0) (popcount @1))
+    (if (wi::bit_and (tree_nonzero_bits (@0), tree_nonzero_bits (@1)) == 0)
+      (popcount (bit_ior @0 @1))))
+  /* pocount(X) == 0 is X == 0, and related (in)equalities.  */
+  (for cmp (le eq ne gt)
+       rep (eq eq ne ne)
+    (simplify
+      (cmp (popcount @0) zerop)
+      (rep @0 { build_zero_cst (TREE_TYPE (@0)); }))))
+