
Improve folding of bitwise ops on booleans

Message ID 51A90588.5080807@redhat.com
State New

Commit Message

Jeff Law May 31, 2013, 8:18 p.m. UTC
This is an implementation to fix a missed optimization pointed out to me 
by Kai.

In all these examples, assume a and b are single-bit types.

~a && b --> a < b
a && ~b --> b < a
~a || b --> a <= b
a || ~b --> b <= a
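
For the unsigned single-bit case these identities are easy to verify
exhaustively; here is a quick standalone check (illustration only, not
part of the patch; the low bit of an unsigned int stands in for the
1-bit value):

/* Illustration only: exhaustive check of the four identities above
   for unsigned single-bit values.  */
#include <assert.h>

int
main (void)
{
  unsigned a, b;
  for (a = 0; a <= 1; a++)
    for (b = 0; b <= 1; b++)
      {
        assert (((~a & b) & 1) == (a < b));
        assert (((a & ~b) & 1) == (b < a));
        assert (((~a | b) & 1) == (a <= b));
        assert (((a | ~b) & 1) == (b <= a));
      }
  return 0;
}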

This happens with some regularity in GCC itself, though it's not as 
pervasive as some of the other missed optimizations I've run into.

This could have gone into fold-const.c or tree-ssa-forwprop.c.
fold-const.c isn't as useful since it would need to see the entire
expression as a single tree node.  tree-ssa-forwprop.c can follow the
use-def links and discover more opportunities even when the expressions
span two source statements or are exposed by other optimizations.
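
Concretely, the gimple this patch looks for has the negation and the
bitwise operation as two separate statements, roughly (hand-written,
SSA names invented):

  _1 = ~a_2(D);
  _3 = _1 & b_4(D);

and the patch rewrites the second statement into the single compare

  _3 = a_2(D) < b_4(D);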

Bootstrapped and regression tested on x86_64-unknown-linux-gnu.

OK for the trunk?
commit 2b61de6f70576105fe6ada31618db23857f9c902
Author: Jeff Law <law@redhat.com>
Date:   Fri May 31 14:16:27 2013 -0600

            * tree-ssa-forwprop.c (simplify_bitwise_binary_boolean): New
            function.
            (simplify_bitwise_binary): Use it to simplify certain binary ops on
            booleans.
    
            * gcc.dg/tree-ssa/forwprop-27.c: New test.

Comments

Richard Biener June 3, 2013, 8:24 a.m. UTC | #1
On Fri, May 31, 2013 at 10:18 PM, Jeff Law <law@redhat.com> wrote:
>
> This is an implementation to fix a missed optimization pointed out to me by
> Kai.
>
> In all these examples, assume a & b are single bit types.
>
> ~a && b --> a < b

For a signed 1-bit type you'll have values -1, 0 and clearly

  0 < -1

is false while ~0 & -1 is non-zero.

So I believe you have to restrict these transforms to unsigned 1-bit values
or adjust the folding appropriately.  Besides that, ...

> a && ~b --> b < a
> ~a || b --> a <= b
> a || ~b --> b <= a

I wonder if these are really a simplification if the result is not used
exclusively in a conditional jump.  Because for setting a register
to a < b you'll likely get worse code than using ~a & b (given that
many ISAs have a and-not instruction).  Of course you may argue
that's a bug in the RTL / target piece (getting different code for
a < b vs. ~a & b) and a < b is shorter on the tree level.

More comments in-line.

> This happens with some regularity in GCC itself, though it's not as
> pervasive as some of the other missed optimizations I've run into.
>
> This could have gone into fold-const.c or tree-forwprop.   fold-const.c
> isn't as useful as would need to see the entire expression as a single tree
> node.  tree-forwprop.c can follow the use-def links and discover more
> opportunities even when the expressions span two source statements or are
> exposed by other optimizations.
>
> Bootstrapped and regression tested on x86_64-unknown-linux-gnu.
>
> OK for the trunk?
>
>
> commit 2b61de6f70576105fe6ada31618db23857f9c902
> Author: Jeff Law <law@redhat.com>
> Date:   Fri May 31 14:16:27 2013 -0600
>
>             * tree-ssa-forwprop.c (simplify_bitwise_binary_boolean): New
>             function.
>             (simplify_bitwise_binary): Use it to simplify certain binary ops on
>             booleans.
>
>             * gcc.dg/tree-ssa/forwprop-27.c: New test.
>
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index 396111e..7f027b0 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,9 @@
> +2013-05-31  Jeff Law  <law@redhat.com>
> +
> +       * tree-ssa-forwprop.c (simplify_bitwise_binary_boolean): New
> function.
> +       (simplify_bitwise_binary): Use it to simplify certain binary ops on
> +       booleans.
> +
>  2013-05-28  Steve Ellcey  <sellcey@mips.com>
>
>         * config/mips/mips-cpus.def (mips32r2): Change processor type.
> diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
> index 869371a..6f80afb 100644
> --- a/gcc/testsuite/ChangeLog
> +++ b/gcc/testsuite/ChangeLog
> @@ -1,3 +1,7 @@
> +2013-05-31  Jeff Law  <law@redhat.com>
> +
> +       * gcc.dg/tree-ssa/forwprop-27.c: New test.
> +
>  2013-05-28  Balaji V. Iyer  <balaji.v.iyer@intel.com>
>
>         * c-c++-common/cilk-plus/AN/array_test1.c: New test.
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-27.c
> b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-27.c
> new file mode 100644
> index 0000000..75e935d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-27.c
> @@ -0,0 +1,78 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-forwprop1" } */
> +
> +extern char * frob (void);
> +extern _Bool testit(void);
> +
> +test (int code)
> +{
> +  char * temp = frob();;
> +  int rotate = (code == 22);
> +  if (temp == 0 && !rotate)
> +      oof();
> +}
> +
> +test_2 (int code)
> +{
> +  char * temp = frob();
> +  int rotate = (code == 22);
> +  if (!rotate && temp == 0)
> +      oof();
> +}
> +
> +
> +test_3 (int code)
> +{
> +  char * temp = frob();
> +  int rotate = (code == 22);
> +  if (!rotate || temp == 0)
> +      oof();
> +}
> +
> +
> +test_4 (int code)
> +{
> +  char * temp = frob();
> +  int rotate = (code == 22);
> +  if (temp == 0 || !rotate)
> +      oof();
> +}
> +
> +
> +test_5 (int code)
> +{
> +  _Bool temp = testit();;
> +  _Bool rotate = (code == 22);
> +  if (temp == 0 && !rotate)
> +      oof();
> +}
> +
> +test_6 (int code)
> +{
> +  _Bool temp = testit();
> +  _Bool rotate = (code == 22);
> +  if (!rotate && temp == 0)
> +      oof();
> +}
> +
> +
> +test_7 (int code)
> +{
> +  _Bool temp = testit();
> +  _Bool rotate = (code == 22);
> +  if (!rotate || temp == 0)
> +      oof();
> +}
> +
> +
> +test_8 (int code)
> +{
> +  _Bool temp = testit();
> +  _Bool rotate = (code == 22);
> +  if (temp == 0 || !rotate)
> +      oof();
> +}
> +
> +/* { dg-final { scan-tree-dump-times "Replaced" 8 "forwprop1"} } */
> +
> +
> diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c
> index 6043d31..8c3f08b 100644
> --- a/gcc/tree-ssa-forwprop.c
> +++ b/gcc/tree-ssa-forwprop.c
> @@ -1870,6 +1870,45 @@ hoist_conversion_for_bitop_p (tree to, tree from)
>    return false;
>  }
>
> +/* GSI points to a statement of the form
> +
> +   result = OP0 CODE OP1
> +
> +   Where OP0 and OP1 are single bit SSA_NAMEs and CODE is either
> +   BIT_AND_EXPR or BIT_IOR_EXPR.
> +
> +   If OP0 is fed by a bitwise negation of another single bit SSA_NAME,
> +   then we can simplify the two statements into a single LT_EXPR or LE_EXPR
> +   when code is BIT_AND_EXPR and BIT_IOR_EXPR respectively.
> +
> +   If a simplification is made, return TRUE, else return FALSE.  */
> +static bool
> +simplify_bitwise_binary_boolean (gimple_stmt_iterator *gsi,
> +                                enum tree_code code,
> +                                tree op0, tree op1)
> +{
> +  gimple op0_def_stmt = SSA_NAME_DEF_STMT (op0);
> +
> +  if (!is_gimple_assign (op0_def_stmt)
> +      || (gimple_assign_rhs_code (op0_def_stmt) != BIT_NOT_EXPR))
> +    return false;
> +
> +  tree x = gimple_assign_rhs1 (op0_def_stmt);
> +  if (TREE_CODE (x) == SSA_NAME
> +      && INTEGRAL_TYPE_P (TREE_TYPE (x))
> +      && TYPE_PRECISION (TREE_TYPE (x)) == 1)

Do these transforms not work for BOOLEAN_TYPE as well?

> +    {
> +      gimple stmt = gsi_stmt (*gsi);
> +      gimple_assign_set_rhs1 (stmt, x);
> +      gimple_assign_set_rhs2 (stmt, op1);
> +      gimple_assign_set_rhs_code (stmt, code == BIT_AND_EXPR ? LT_EXPR :
> LE_EXPR);
> +      update_stmt (gsi_stmt (*gsi));

No need to query gsi_stmt again, it cannot change.

> +      return true;
> +    }
> +  return false;
> +
> +}
> +
>  /* Simplify bitwise binary operations.
>     Return true if a transformation applied, otherwise return false.  */
>
> @@ -2118,8 +2157,26 @@ simplify_bitwise_binary (gimple_stmt_iterator *gsi)
>               return true;
>             }
>         }
> -    }
>
> +      /* If arg1 and arg2 are booleans (or any single bit type)
> +         then try to simplify:
> +
> +        (~X & Y) -> X < Y
> +        (X & ~Y) -> Y < X
> +         (~X | Y) -> X <= Y
> +         (X | ~Y) -> Y <= X  */
> +      if (TREE_CODE (arg1) == SSA_NAME
> +         && TREE_CODE (arg2) == SSA_NAME
> +         && INTEGRAL_TYPE_P (TREE_TYPE (arg1))
> +         && TYPE_PRECISION (TREE_TYPE (arg1)) == 1
> +         && TYPE_PRECISION (TREE_TYPE (arg2)) == 1)
> +       {
> +         if (simplify_bitwise_binary_boolean (gsi, code, arg1, arg2))
> +           return true;
> +         if (simplify_bitwise_binary_boolean (gsi, code, arg2, arg1))
> +           return true;
> +       }
> +    }
>    return false;
>  }
>
>
Kai Tietz June 3, 2013, 2:23 p.m. UTC | #2
Hmm, it isn't necessary to restrict this optimization AFAICS.  We have
just two cases.

(~X & Y) -> X < Y
(X & ~Y) -> Y < X
(~X | Y) -> X <= Y
(X | ~Y) -> Y <= X

These hold for unsigned 1-bit types.

For signed 1-bit types we need to invert the logic, as follows:

(~X & Y) -> X > Y
(X & ~Y) -> Y > X
(~X | Y) -> X >= Y
(X | ~Y) -> Y >= X
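
A quick way to convince oneself, using a signed 1-bit bit-field whose
only values are 0 and -1 (illustration only, comparing truth values):

/* Illustration only: check the inverted comparisons for signed
   single-bit values.  */
#include <assert.h>

struct s1 { signed int v : 1; };

int
main (void)
{
  int vals[2] = { 0, -1 };
  int i, j;
  for (i = 0; i < 2; i++)
    for (j = 0; j < 2; j++)
      {
        struct s1 x, y;
        x.v = vals[i];
        y.v = vals[j];
        assert (!!(~x.v & y.v) == (x.v > y.v));
        assert (!!(x.v & ~y.v) == (y.v > x.v));
        assert (!!(~x.v | y.v) == (x.v >= y.v));
        assert (!!(x.v | ~y.v) == (y.v >= x.v));
      }
  return 0;
}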

Btw, there is one optimization in this context which might be worth
doing here too:  -X -> X for 1-bit typed X (signedness doesn't matter
here).

Kai
Jeff Law June 3, 2013, 4:05 p.m. UTC | #3
On 06/03/2013 02:24 AM, Richard Biener wrote:
> On Fri, May 31, 2013 at 10:18 PM, Jeff Law <law@redhat.com> wrote:
>>
>> This is an implementation to fix a missed optimization pointed out to me by
>> Kai.
>>
>> In all these examples, assume a & b are single bit types.
>>
>> ~a && b --> a < b
>
> For a signed 1-bit type you'll have values -1, 0 and clearly
>
>    0 < -1
>
> is false while ~0 & -1 is non-zero.
Sigh.  Yes.

>
> So I believe you have to restrict these transforms to unsigned 1-bit values
> or adjust the folding appropriately.  Besides that, ...
>
>> a && ~b --> b < a
>> ~a || b --> a <= b
>> a || ~b --> b <= a
>
> I wonder if these are really a simplification if the result is not used
> exclusively in a conditional jump.  Because for setting a register
> to a < b you'll likely get worse code than using ~a & b (given that
> many ISAs have a and-not instruction).  Of course you may argue
> that's a bug in the RTL / target piece (getting different code for
> a < b vs. ~a & b) and a < b is shorter on the tree level.
The gimple optimizers should be looking to simplify things to the 
fullest extent possible at the gimple level and let the backend make the 
determination to use and-not if such an insn is available.

The counterargument to leaving it to the backend to recover the and-not
is that the backend doesn't typically see these as single-bit
operations, which makes turning the relational back into an and-not
sequence considerably more difficult.

Do we consider this enough of an issue to want to avoid going down this 
path? (in which case we'll file the example code as a missed-opt PR and 
attach the code and pointer to this thread for future reference)

>> +static bool
>> +simplify_bitwise_binary_boolean (gimple_stmt_iterator *gsi,
>> +                                enum tree_code code,
>> +                                tree op0, tree op1)
>> +{
>> +  gimple op0_def_stmt = SSA_NAME_DEF_STMT (op0);
>> +
>> +  if (!is_gimple_assign (op0_def_stmt)
>> +      || (gimple_assign_rhs_code (op0_def_stmt) != BIT_NOT_EXPR))
>> +    return false;
>> +
>> +  tree x = gimple_assign_rhs1 (op0_def_stmt);
>> +  if (TREE_CODE (x) == SSA_NAME
>> +      && INTEGRAL_TYPE_P (TREE_TYPE (x))
>> +      && TYPE_PRECISION (TREE_TYPE (x)) == 1)
>
> Do these transforms not work for BOOLEAN_TYPE as well?
Yes.  Booleans are integral types with a single bit of precision, right? 
  So this check should allow boolean types.  What am I missing?

>
>> +    {
>> +      gimple stmt = gsi_stmt (*gsi);
>> +      gimple_assign_set_rhs1 (stmt, x);
>> +      gimple_assign_set_rhs2 (stmt, op1);
>> +      gimple_assign_set_rhs_code (stmt, code == BIT_AND_EXPR ? LT_EXPR :
>> LE_EXPR);
>> +      update_stmt (gsi_stmt (*gsi));
>
> No need to query gsi_stmt again, it cannot change.
I'd have to check on that; I think this was cribbed from another 
transformation in tree-ssa-reassoc.
Kai Tietz June 3, 2013, 4:37 p.m. UTC | #4
2013/6/3 Jeff Law <law@redhat.com>:
> The counter to that argument of leaving it to the backend to recover the
> and-not for is that the backend doesn't typically see these as single bit
> operations which makes turning the relational back into an and-not sequence
> considerably more difficult.
>
> Do we consider this enough of an issue to want to avoid going down this
> path? (in which case we'll file the example code as a missed-opt PR and
> attach the code and pointer to this thread for future reference)
>

Hmm, my tests are showing (for targets with conditional set
instructions) an improvement even for non-conditional-branch code.

Sample code:

_Bool foo (_Bool a, _Bool b)
{
  _Bool r = a < b;
  return r;
}

_Bool boo (_Bool a, _Bool b)
{
  _Bool r = ~a & b;
  return r;
}

produces:

        .text
        .p2align 4,,15
        .globl  foo
        .def    foo;    .scl    2;      .type   32;     .endef
        .seh_proc       foo
foo:
        .seh_endprologue
        cmpb    %cl, %dl
        seta    %al
        ret
        .seh_endproc
        .p2align 4,,15
        .globl  boo
        .def    boo;    .scl    2;      .type   32;     .endef
        .seh_proc       boo
boo:
        .seh_endprologue
        movl    %ecx, %eax
        notl    %eax
        andl    %edx, %eax
        andl    $1, %eax
        ret

So it seems obvious that the code gets much less costly with this
optimization than without it.

Kai
Richard Henderson June 3, 2013, 5 p.m. UTC | #5
On 06/03/2013 09:37 AM, Kai Tietz wrote:
> foo:
>         .seh_endprologue
>         cmpb    %cl, %dl
>         seta    %al
>         ret
>         .seh_endproc
>         .p2align 4,,15
>         .globl  boo
>         .def    boo;    .scl    2;      .type   32;     .endef
>         .seh_proc       boo
> boo:
>         .seh_endprologue
>         movl    %ecx, %eax
>         notl    %eax
>         andl    %edx, %eax
>         andl    $1, %eax
>         ret

Try arm or s390 or ppc for significantly different results.


r~
Jeff Law June 3, 2013, 5:09 p.m. UTC | #6
On 06/03/2013 11:00 AM, Richard Henderson wrote:
> On 06/03/2013 09:37 AM, Kai Tietz wrote:
>> foo:
>>          .seh_endprologue
>>          cmpb    %cl, %dl
>>          seta    %al
>>          ret
>>          .seh_endproc
>>          .p2align 4,,15
>>          .globl  boo
>>          .def    boo;    .scl    2;      .type   32;     .endef
>>          .seh_proc       boo
>> boo:
>>          .seh_endprologue
>>          movl    %ecx, %eax
>>          notl    %eax
>>          andl    %edx, %eax
>>          andl    $1, %eax
>>          ret
>
> Try arm or s390 or ppc for significantly different results.
I'm starting to wonder if we could delay making a choice about using
relational or bit ops until gimple->rtl expansion.  I haven't seen any
secondary benefits, so deferring to a later point where we might be able
to query the backend's capabilities might make sense.
jeff
Kai Tietz June 3, 2013, 6:32 p.m. UTC | #7
2013/6/3 Jeff Law <law@redhat.com>:
> On 06/03/2013 11:00 AM, Richard Henderson wrote:
>>
>> On 06/03/2013 09:37 AM, Kai Tietz wrote:
>>>
>>> foo:
>>>          .seh_endprologue
>>>          cmpb    %cl, %dl
>>>          seta    %al
>>>          ret
>>>          .seh_endproc
>>>          .p2align 4,,15
>>>          .globl  boo
>>>          .def    boo;    .scl    2;      .type   32;     .endef
>>>          .seh_proc       boo
>>> boo:
>>>          .seh_endprologue
>>>          movl    %ecx, %eax
>>>          notl    %eax
>>>          andl    %edx, %eax
>>>          andl    $1, %eax
>>>          ret
>>
>>
>> Try arm or s390 or ppc for significantly different results.
>
> I'm starting to wonder if we could delay make a choice about using a
> relational or  bit ops until gimple->rtl expansion.  I haven't seen any
> secondary benefits, so deferring to a later point where we might be able to
> query the backend's capabilities might make sense.
> jeff

Well, a secondary benefit I see is in the area of branch-cost (BC)
optimization.  But I agree that this operation should go along with the
gimple->rtl transformation, and the BC optimization should happen at the
same place.

Kai
Jeff Law June 3, 2013, 6:34 p.m. UTC | #8
On 06/03/2013 12:32 PM, Kai Tietz wrote:
> 2013/6/3 Jeff Law <law@redhat.com>:
>> On 06/03/2013 11:00 AM, Richard Henderson wrote:
>>>
>>> On 06/03/2013 09:37 AM, Kai Tietz wrote:
>>>>
>>>> foo:
>>>>           .seh_endprologue
>>>>           cmpb    %cl, %dl
>>>>           seta    %al
>>>>           ret
>>>>           .seh_endproc
>>>>           .p2align 4,,15
>>>>           .globl  boo
>>>>           .def    boo;    .scl    2;      .type   32;     .endef
>>>>           .seh_proc       boo
>>>> boo:
>>>>           .seh_endprologue
>>>>           movl    %ecx, %eax
>>>>           notl    %eax
>>>>           andl    %edx, %eax
>>>>           andl    $1, %eax
>>>>           ret
>>>
>>>
>>> Try arm or s390 or ppc for significantly different results.
>>
>> I'm starting to wonder if we could delay make a choice about using a
>> relational or  bit ops until gimple->rtl expansion.  I haven't seen any
>> secondary benefits, so deferring to a later point where we might be able to
>> query the backend's capabilities might make sense.
>> jeff
>
> Well, a secondary benefit I see in area of BC-optimization.  But I
> agree that this operation should go along gimple->rtl transformation.
> And at same place BC-optimization should happen.
Let's table it for now then.  Ironically I was just looking at moving 
one of the branch-cost transformations out of fold-const.c.  We're going 
to have to build out some infrastructure to make that happen.

I'll withdraw the patch and just submit a missed-optimization PR and an
xfailed testcase for this issue.

jeff
Michael Meissner June 3, 2013, 7:22 p.m. UTC | #9
On Mon, Jun 03, 2013 at 10:00:19AM -0700, Richard Henderson wrote:
> On 06/03/2013 09:37 AM, Kai Tietz wrote:
> > foo:
> >         .seh_endprologue
> >         cmpb    %cl, %dl
> >         seta    %al
> >         ret
> >         .seh_endproc
> >         .p2align 4,,15
> >         .globl  boo
> >         .def    boo;    .scl    2;      .type   32;     .endef
> >         .seh_proc       boo
> > boo:
> >         .seh_endprologue
> >         movl    %ecx, %eax
> >         notl    %eax
> >         andl    %edx, %eax
> >         andl    $1, %eax
> >         ret
> 
> Try arm or s390 or ppc for significantly different results.

Let's see for powerpc:

.L.foo:
	cmpw 7,3,4
	mfcr 3,1
	rlwinm 3,3,29,1
	blr

.L.boo:
	andc 3,4,3
	rldicl 3,3,0,63
	blr
 
So for powerpc, the second is the preferred method.

However, in looking at it, if we rewrite the code to have come from
comparisons:

_Bool foo_cmp (long w, long x, long y, long z)
{
  _Bool a = (w < x);
  _Bool b = (y < z);
  _Bool r = a < b;
  return r;
}

_Bool bfoo_cmp (long w, long x, long y, long z)
{
  _Bool a = (w < x);
  _Bool b = (y < z);
  _Bool r = ~a & b;
  return r;
}

We get:

.L.foo_cmp:
	cmpd 7,5,6
	cmpd 6,3,4
	mfcr 6
	rlwinm 3,6,25,1
	rlwinm 6,6,29,1
	cmpw 7,3,6
	mfcr 3,1
	rlwinm 3,3,29,1
	blr

.L.bfoo_cmp:
	cmpd 6,3,4
	cmpd 7,5,6
	mfcr 3,2
	rlwinm 3,3,25,1
	mfcr 6,1
	rlwinm 6,6,29,1
	andc 3,6,3
	blr

And it would have been nice to use the logical comparison operations in the CR
register unit (i.e. doing a crandc in the CR unit rather than moving the value
from a CR to a GPR -- cmpd/mfcr/rlwinm instructions).
Richard Biener June 4, 2013, 10:45 a.m. UTC | #10
On Mon, Jun 3, 2013 at 6:05 PM, Jeff Law <law@redhat.com> wrote:
> On 06/03/2013 02:24 AM, Richard Biener wrote:
>>
>> On Fri, May 31, 2013 at 10:18 PM, Jeff Law <law@redhat.com> wrote:
>>>
>>>
>>> This is an implementation to fix a missed optimization pointed out to me
>>> by
>>> Kai.
>>>
>>> In all these examples, assume a & b are single bit types.
>>>
>>> ~a && b --> a < b
>>
>>
>> For a signed 1-bit type you'll have values -1, 0 and clearly
>>
>>    0 < -1
>>
>> is false while ~0 & -1 is non-zero.
>
> Sigh.  Yes.
>
>
>>
>> So I believe you have to restrict these transforms to unsigned 1-bit values
>> or adjust the folding appropriately.  Besides that, ...
>>
>>> a && ~b --> b < a
>>> ~a || b --> a <= b
>>> a || ~b --> b <= a
>>
>>
>> I wonder if these are really a simplification if the result is not used
>> exclusively in a conditional jump.  Because for setting a register
>> to a < b you'll likely get worse code than using ~a & b (given that
>> many ISAs have a and-not instruction).  Of course you may argue
>> that's a bug in the RTL / target piece (getting different code for
>> a < b vs. ~a & b) and a < b is shorter on the tree level.
>
> The gimple optimizers should be looking to simplify things to the fullest
> extent possible at the gimple level and let the backend make the
> determination to use and-not if such an insn is available.
>
> The counter to that argument of leaving it to the backend to recover the
> and-not for is that the backend doesn't typically see these as single bit
> operations which makes turning the relational back into an and-not sequence
> considerably more difficult.
>
> Do we consider this enough of an issue to want to avoid going down this
> path? (in which case we'll file the example code as a missed-opt PR and
> attach the code and pointer to this thread for future reference)

I agree that gimple optimizers should simplify things as much as possible.
Still we try to not generate regressions on the way.  So as an intermediate
guard I'd make sure the result of the bitwise op is only used in a GIMPLE_COND
(which means we can forward the resulting compare directly into the
GIMPLE_COND).
A comment should explain why we restrict the transform.
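
For concreteness, such a guard at the top of
simplify_bitwise_binary_boolean could look something like this
(untested sketch, just illustrating the shape):

  use_operand_p use_p;
  gimple use_stmt;
  tree lhs = gimple_assign_lhs (gsi_stmt (*gsi));

  /* Untested sketch: punt unless the single use of the result is the
     condition of a GIMPLE_COND, so the compare can be forwarded into it.  */
  if (!single_imm_use (lhs, &use_p, &use_stmt)
      || gimple_code (use_stmt) != GIMPLE_COND)
    return false;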

>
>>> +static bool
>>> +simplify_bitwise_binary_boolean (gimple_stmt_iterator *gsi,
>>> +                                enum tree_code code,
>>> +                                tree op0, tree op1)
>>> +{
>>> +  gimple op0_def_stmt = SSA_NAME_DEF_STMT (op0);
>>> +
>>> +  if (!is_gimple_assign (op0_def_stmt)
>>> +      || (gimple_assign_rhs_code (op0_def_stmt) != BIT_NOT_EXPR))
>>> +    return false;
>>> +
>>> +  tree x = gimple_assign_rhs1 (op0_def_stmt);
>>> +  if (TREE_CODE (x) == SSA_NAME
>>> +      && INTEGRAL_TYPE_P (TREE_TYPE (x))
>>> +      && TYPE_PRECISION (TREE_TYPE (x)) == 1)
>>
>>
>> Do these transforms not work for BOOLEAN_TYPE as well?
>
> Yes.  Booleans are integral types with a single bit of precision, right?  So
> this check should allow boolean types.  What am I missing?

We have BOOLEAN_TYPEs that do not have a TYPE_PRECISION of one
(but still are two-valued, and we assume those values are 0 and != 0 (eh)).
So there is code that treats BOOLEAN_TYPEs the same as TYPE_PRECISION
one types and there is code that does not (for example bitwise not is not
equal to truth not on such types).

Richard.

>>
>>> +    {
>>> +      gimple stmt = gsi_stmt (*gsi);
>>> +      gimple_assign_set_rhs1 (stmt, x);
>>> +      gimple_assign_set_rhs2 (stmt, op1);
>>> +      gimple_assign_set_rhs_code (stmt, code == BIT_AND_EXPR ? LT_EXPR :
>>> LE_EXPR);
>>> +      update_stmt (gsi_stmt (*gsi));
>>
>>
>> No need to query gsi_stmt again, it cannot change.
>
> I'd have to check on that; I think this was cribbed from another
> transformation in tree-ssa-reassoc.
>
>
Richard Biener June 4, 2013, 10:47 a.m. UTC | #11
On Mon, Jun 3, 2013 at 8:34 PM, Jeff Law <law@redhat.com> wrote:
> On 06/03/2013 12:32 PM, Kai Tietz wrote:
>>
>> 2013/6/3 Jeff Law <law@redhat.com>:
>>>
>>> On 06/03/2013 11:00 AM, Richard Henderson wrote:
>>>>
>>>>
>>>> On 06/03/2013 09:37 AM, Kai Tietz wrote:
>>>>>
>>>>>
>>>>> foo:
>>>>>           .seh_endprologue
>>>>>           cmpb    %cl, %dl
>>>>>           seta    %al
>>>>>           ret
>>>>>           .seh_endproc
>>>>>           .p2align 4,,15
>>>>>           .globl  boo
>>>>>           .def    boo;    .scl    2;      .type   32;     .endef
>>>>>           .seh_proc       boo
>>>>> boo:
>>>>>           .seh_endprologue
>>>>>           movl    %ecx, %eax
>>>>>           notl    %eax
>>>>>           andl    %edx, %eax
>>>>>           andl    $1, %eax
>>>>>           ret
>>>>
>>>>
>>>>
>>>> Try arm or s390 or ppc for significantly different results.
>>>
>>>
>>> I'm starting to wonder if we could delay make a choice about using a
>>> relational or  bit ops until gimple->rtl expansion.  I haven't seen any
>>> secondary benefits, so deferring to a later point where we might be able
>>> to
>>> query the backend's capabilities might make sense.
>>> jeff
>>
>>
>> Well, a secondary benefit I see in area of BC-optimization.  But I
>> agree that this operation should go along gimple->rtl transformation.
>> And at same place BC-optimization should happen.
>
> Let's table it for now then.  Ironically I was just looking at moving one of
> the branch-cost transformations out of fold-const.c.  We're going to have to
> build out some infrastructure to make that happen.
>
> I'll withdraw the patch and just submit a missed optimization PR and xfailed
> testsuite for this issue.

The transform is worthwhile if there is secondary benefit of being able to
forward the result into a test instruction (GIMPLE_COND or condition
in a COND_EXPR).

Richard.

> jeff
Jeff Law June 4, 2013, 6:41 p.m. UTC | #12
On 06/04/2013 04:45 AM, Richard Biener wrote:
>>
>> Yes.  Booleans are integral types with a single bit of precision, right?  So
>> this check should allow boolean types.  What am I missing?
>
> We have BOOLEAN_TYPEs that do not have a TYPE_PRECISION of one
> (but still are two-valued, and we assume those values are 0 and != 0 (eh)).
> So there is code that treats BOOLEAN_TYPEs the same as TYPE_PRECISION
> one types and there is code that does not (for example bitwise not is not
> equal to truth not on such types).
Good grief.  For a boolean with a TYPE_PRECISION != 1, I think we can 
apply the transformations if the type is unsigned.  Once the type is 
signed I think we'd lose.

Do you have any sample code which would create a boolean type with a 
precision other than 1?

jeff
Kai Tietz June 4, 2013, 6:51 p.m. UTC | #13
2013/6/4 Jeff Law <law@redhat.com>:
> On 06/04/2013 04:45 AM, Richard Biener wrote:
>>>
>>>
>>> Yes.  Booleans are integral types with a single bit of precision, right?
>>> So
>>> this check should allow boolean types.  What am I missing?
>>
>>
>> We have BOOLEAN_TYPEs that do not have a TYPE_PRECISION of one
>> (but still are two-valued, and we assume those values are 0 and != 0
>> (eh)).
>> So there is code that treats BOOLEAN_TYPEs the same as TYPE_PRECISION
>> one types and there is code that does not (for example bitwise not is not
>> equal to truth not on such types).
>
> Good grief.  For a boolean with a TYPE_PRECISION != 1, I think we can apply
> the transformations if the type is unsigned.  Once the type is signed I
> think we'd lose.
>
> Do you have any sample code which would create a boolean type with a
> precision other than 1?
>
> jeff
>

As far as I recall, the boolean type in Ada has 8-bit precision.  I
think we have to omit these transformations for any boolean type whose
type precision is not equal to 1.  Ada uses the other values for
sanity checking, AFAIR.
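
Putting that together with Richard's signed 1-bit observation, the check
would presumably end up along these lines (untested sketch, name
invented):

/* Untested sketch, name invented: require an unsigned single-bit
   SSA_NAME, which sidesteps both the signed 1-bit case and the wide
   BOOLEAN_TYPEs mentioned above.  */
static bool
single_bit_truth_value_p (tree t)
{
  if (TREE_CODE (t) != SSA_NAME)
    return false;
  tree type = TREE_TYPE (t);
  return (INTEGRAL_TYPE_P (type)
          && TYPE_UNSIGNED (type)
          && TYPE_PRECISION (type) == 1);
}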

Kai
Jeff Law June 4, 2013, 7:20 p.m. UTC | #14
On 06/03/2013 08:23 AM, Kai Tietz wrote:
>
> Btw there is one optimization in this context which might be something
> worth here too.
> -X -> X for 1-bit typed X (signed doesn't matter here).
I've had a hell of a time trying to trigger a case where this isn't 
already handled.   Samples welcome.
Segher Boessenkool June 5, 2013, 1:42 a.m. UTC | #15
> However, in looking at it, if we rewrite the code to have come from
> comparisons:

<snip>

> _Bool bfoo_cmp (long w, long x, long y, long z)
> {
>   _Bool a = (w < x);
>   _Bool b = (y < z);
>   _Bool r = ~a & b;
>   return r;
> }
>
> We get:

<snip>

> .L.bfoo_cmp:
> 	cmpd 6,3,4
> 	cmpd 7,5,6
> 	mfcr 3,2
> 	rlwinm 3,3,25,1
> 	mfcr 6,1
> 	rlwinm 6,6,29,1
> 	andc 3,6,3
> 	blr

It is a bit better on 32-bit:

bfoo_cmp:
         cmpw 7,3,4
         cmpw 6,5,6
         mfcr 3
         rlwinm 6,3,25,1
         rlwinm 3,3,29,1
         andc 3,6,3
         blr

> And it would have been nice to use the logical comparison operations
> in the CR
> register unit (i.e. doing a crandc in the CR unit rather than  
> moving the value
> from a CR to a GPR -- cmpd/mfcr/rlwinm instructions).

We cannot avoid an mfcr then, either.  It would be one machine
instruction shorter though (but can be more expensive to execute,
on some CPUs).

That you get two MFCRs on 64-bit is a target bug.


Segher
Andrew Pinski June 5, 2013, 1:46 a.m. UTC | #16
On Tue, Jun 4, 2013 at 6:42 PM, Segher Boessenkool
<segher@kernel.crashing.org> wrote:
>> However, in looking at it, if we rewrite the code to have come from
>> comparisons:
>
>
> <snip>
>
>
>> _Bool bfoo_cmp (long w, long x, long y, long z)
>> {
>>   _Bool a = (w < x);
>>   _Bool b = (y < z);
>>   _Bool r = ~a & b;
>>   return r;
>> }
>>
>> We get:
>
>
> <snip>
>
>
>> .L.bfoo_cmp:
>>         cmpd 6,3,4
>>         cmpd 7,5,6
>>         mfcr 3,2
>>         rlwinm 3,3,25,1
>>         mfcr 6,1
>>         rlwinm 6,6,29,1
>>         andc 3,6,3
>>         blr
>
>
> It is a bit better on 32-bit:
>
> bfoo_cmp:
>         cmpw 7,3,4
>         cmpw 6,5,6
>         mfcr 3
>         rlwinm 6,3,25,1
>         rlwinm 3,3,29,1
>
>         andc 3,6,3
>         blr
>
>> And it would have been nice to use the logical comparison operations in the
>> CR
>> register unit (i.e. doing a crandc in the CR unit rather than moving the
>> value
>> from a CR to a GPR -- cmpd/mfcr/rlwinm instructions).
>
>
> We cannot avoid an mfcr then, either.  It would be one machine
> instruction shorter though (but can be more expensive to execute,
> on some CPUs).
>
> That you get two MFCRs on 64-bit is a target bug.

Not true there.  If you look at the 64-bit output you will see it uses
the one-CR mfcr version in 64-bit, while it grabs the full CR in 32-bit.
It depends on the processor, but the one-CR form might be better.

Thanks,
Andrew

>
>
> Segher
>
Segher Boessenkool June 5, 2013, 3:34 a.m. UTC | #17
>> We cannot avoid an mfcr then, either.  It would be one machine
>> instruction shorter though (but can be more expensive to execute,
>> on some CPUs).
>>
>> That you get two MFCRs on 64-bit is a target bug.
>
> Not true there.  If you look at the 64bit output you will see you are
> using the one cr mfcr version in 64bit while you are grabbing the full
> cr in the 32bit.  It depends on the processor but the one cr might be
> better.

Yes, and it's still a bug.  Or three:

1) GCC uses the all-fields instruction instead of the one-field
form unless you use -mmfcrf (or -mcpu=power4, etc.), although
the one-field mfcr works fine on all CPUs and is never slower
(I'm not talking about mfocrf; just the plain mfcr instruction);

2) There is a define_peephole to combine two mfcr's into one
(search rs6000.md for "3 cycle delay"); for the example code,
one of the results ends up as SI and the other as DI, and
the peephole will not trigger on that;

3) That peephole should not be enabled with -mmfcrf, and of
course it should not be a peephole at all!

Without bug 2), you'd see the same behaviour on 64-bit as on
32-bit.


Segher
Segher Boessenkool June 5, 2013, 3:37 a.m. UTC | #18
> 1) GCC uses the all-fields instruction instead of the one-field
> form unless you use -mmfcrf (or -mcpu=power4, etc.), although
> the one-field mfcr works fine on all CPUs and is never slower
> (I'm not talking about mfocrf; just the plain mfcr instruction);

Ugh, need more coffee, forget about this one.  Sorry.


Segher
Richard Biener June 5, 2013, 8:36 a.m. UTC | #19
On Tue, Jun 4, 2013 at 9:20 PM, Jeff Law <law@redhat.com> wrote:
> On 06/03/2013 08:23 AM, Kai Tietz wrote:
>>
>>
>> Btw there is one optimization in this context which might be something
>> worth here too.
>> -X -> X for 1-bit typed X (signed doesn't matter here).
>
> I've had a hell of a time trying to trigger a case where this isn't already
> handled.   Samples welcome.

Samples should pop up when including BOOLEAN_TYPEs in the handling
of tree-ssa-forwprop.c:truth_valued_ssa_name ().  It's probably Ada indeed
(gfortran also has various BOOLEAN_TYPEs, but they all have precision 1,
just with different sizes, IIRC).

Eventually we should force all BOOLEAN_TYPEs to have precision 1 but
as Kai remembers Ada uses alternate values for verification (true, false, NaT
or so).  Eric may be able to explain.

Thanks,
Richard.
Eric Botcazou June 5, 2013, 11:52 a.m. UTC | #20
> Eventually we should force all BOOLEAN_TYPEs to have precision 1 but
> as Kai remembers Ada uses alternate values for verification (true, false,
> NaT or so).  Eric may be able to explain.

Correct, see: http://gcc.gnu.org/ml/gcc-patches/2011-05/msg01214.html
A boolean type isn't a modular type of precision 1 in Ada, it's an enumeration 
type with two values so a standard scalar type, i.e. precision == mode size.

IIRC I already mentioned that if boolean_type_node was to be unified across 
languages, we'll very likely need to reimplement the boolean types in Ada and 
use boolean_type_node only for the result of comparisons.
Kai Tietz June 5, 2013, 4:57 p.m. UTC | #21
2013/6/4 Jeff Law <law@redhat.com>:
> On 06/03/2013 08:23 AM, Kai Tietz wrote:
>>
>>
>> Btw there is one optimization in this context which might be something
>> worth here too.
>> -X -> X for 1-bit typed X (signed doesn't matter here).
>
> I've had a hell of a time trying to trigger a case where this isn't already
> handled.   Samples welcome.

A sample for this is

typedef struct onebit {
  unsigned int i : 1;
} onebit;

_Bool boo (onebit a, onebit b, onebit c)
{
 onebit i1, i2;
 i1.i = a.i + b.i;
 i2.i = a.i + c.i;
 i1.i -= i2.i;
 i1.i -= b.i;
 return i1.i;
}

Kai

Patch

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 396111e..7f027b0 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@ 
+2013-05-31  Jeff Law  <law@redhat.com>
+
+	* tree-ssa-forwprop.c (simplify_bitwise_binary_boolean): New function.
+	(simplify_bitwise_binary): Use it to simplify certain binary ops on
+	booleans.
+
 2013-05-28  Steve Ellcey  <sellcey@mips.com>
 
 	* config/mips/mips-cpus.def (mips32r2): Change processor type.
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 869371a..6f80afb 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@ 
+2013-05-31  Jeff Law  <law@redhat.com>
+
+	* gcc.dg/tree-ssa/forwprop-27.c: New test.
+
 2013-05-28  Balaji V. Iyer  <balaji.v.iyer@intel.com>
 
 	* c-c++-common/cilk-plus/AN/array_test1.c: New test.
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-27.c b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-27.c
new file mode 100644
index 0000000..75e935d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-27.c
@@ -0,0 +1,78 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-forwprop1" } */
+
+extern char * frob (void);
+extern _Bool testit(void);
+
+test (int code)
+{
+  char * temp = frob();;
+  int rotate = (code == 22);
+  if (temp == 0 && !rotate)
+      oof();
+}
+
+test_2 (int code)
+{
+  char * temp = frob();
+  int rotate = (code == 22);
+  if (!rotate && temp == 0)
+      oof();
+}
+
+
+test_3 (int code)
+{
+  char * temp = frob();
+  int rotate = (code == 22);
+  if (!rotate || temp == 0)
+      oof();
+}
+
+
+test_4 (int code)
+{
+  char * temp = frob();
+  int rotate = (code == 22);
+  if (temp == 0 || !rotate)
+      oof();
+}
+
+
+test_5 (int code)
+{
+  _Bool temp = testit();;
+  _Bool rotate = (code == 22);
+  if (temp == 0 && !rotate)
+      oof();
+}
+
+test_6 (int code)
+{
+  _Bool temp = testit();
+  _Bool rotate = (code == 22);
+  if (!rotate && temp == 0)
+      oof();
+}
+
+
+test_7 (int code)
+{
+  _Bool temp = testit();
+  _Bool rotate = (code == 22);
+  if (!rotate || temp == 0)
+      oof();
+}
+
+
+test_8 (int code)
+{
+  _Bool temp = testit();
+  _Bool rotate = (code == 22);
+  if (temp == 0 || !rotate)
+      oof();
+}
+
+/* { dg-final { scan-tree-dump-times "Replaced" 8 "forwprop1"} } */
+
+
diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c
index 6043d31..8c3f08b 100644
--- a/gcc/tree-ssa-forwprop.c
+++ b/gcc/tree-ssa-forwprop.c
@@ -1870,6 +1870,45 @@  hoist_conversion_for_bitop_p (tree to, tree from)
   return false;
 }
 
+/* GSI points to a statement of the form
+
+   result = OP0 CODE OP1
+
+   Where OP0 and OP1 are single bit SSA_NAMEs and CODE is either
+   BIT_AND_EXPR or BIT_IOR_EXPR.
+
+   If OP0 is fed by a bitwise negation of another single bit SSA_NAME,
+   then we can simplify the two statements into a single LT_EXPR or LE_EXPR
+   when code is BIT_AND_EXPR and BIT_IOR_EXPR respectively.
+
+   If a simplification is made, return TRUE, else return FALSE.  */
+static bool
+simplify_bitwise_binary_boolean (gimple_stmt_iterator *gsi,
+				 enum tree_code code,
+				 tree op0, tree op1)
+{
+  gimple op0_def_stmt = SSA_NAME_DEF_STMT (op0);
+
+  if (!is_gimple_assign (op0_def_stmt)
+      || (gimple_assign_rhs_code (op0_def_stmt) != BIT_NOT_EXPR))
+    return false;
+
+  tree x = gimple_assign_rhs1 (op0_def_stmt);
+  if (TREE_CODE (x) == SSA_NAME
+      && INTEGRAL_TYPE_P (TREE_TYPE (x))
+      && TYPE_PRECISION (TREE_TYPE (x)) == 1)
+    {
+      gimple stmt = gsi_stmt (*gsi);
+      gimple_assign_set_rhs1 (stmt, x);
+      gimple_assign_set_rhs2 (stmt, op1);
+      gimple_assign_set_rhs_code (stmt, code == BIT_AND_EXPR ? LT_EXPR : LE_EXPR);
+      update_stmt (gsi_stmt (*gsi));
+      return true;
+    }
+  return false;
+
+}
+
 /* Simplify bitwise binary operations.
    Return true if a transformation applied, otherwise return false.  */
 
@@ -2118,8 +2157,26 @@  simplify_bitwise_binary (gimple_stmt_iterator *gsi)
 	      return true;
 	    }
 	}
-    }
 
+      /* If arg1 and arg2 are booleans (or any single bit type)
+         then try to simplify:
+
+	 (~X & Y) -> X < Y
+	 (X & ~Y) -> Y < X
+         (~X | Y) -> X <= Y
+         (X | ~Y) -> Y <= X  */
+      if (TREE_CODE (arg1) == SSA_NAME
+	  && TREE_CODE (arg2) == SSA_NAME
+	  && INTEGRAL_TYPE_P (TREE_TYPE (arg1))
+	  && TYPE_PRECISION (TREE_TYPE (arg1)) == 1
+	  && TYPE_PRECISION (TREE_TYPE (arg2)) == 1)
+	{
+	  if (simplify_bitwise_binary_boolean (gsi, code, arg1, arg2))
+	    return true;
+	  if (simplify_bitwise_binary_boolean (gsi, code, arg2, arg1))
+	    return true;
+	}
+    }
   return false;
 }