Simple bitop reassoc in match.pd (was: Canonicalize X u< X to UNORDERED_EXPR)

Message ID alpine.DEB.2.02.1605100810480.7805@laptop-mg.saclay.inria.fr
State New

Commit Message

Marc Glisse May 10, 2016, 6:11 a.m. UTC
On Fri, 6 May 2016, Marc Glisse wrote:

> Here they are. I did (X&Y)&X and (X&Y)&(X&Z). The next one would be 
> ((X&Y)&Z)&X, but at some point we have to defer to reassoc.
>
> I didn't add the convert?+tree_nop_conversion_p to the existing transform I 
> modified. I guess at some point we should make a pass and add them to all the 
> transformations on bit operations...
>
> For (X & Y) & Y, I believe that any conversion is fine. For the others, 
> tree_nop_conversion_p is probably too strict (narrowing should be fine for 
> all), but I was too lazy to look for tighter conditions.
>
> (X ^ Y) ^ Y -> X should probably have (non_lvalue ...) on its output, but in 
> a simple test it didn't seem to matter. Is non_lvalue still needed?
>
>
> Bootstrap+regtest on powerpc64le-unknown-linux-gnu.
>
> 2016-05-06  Marc Glisse  <marc.glisse@inria.fr>
>
> gcc/
> 	* fold-const.c (fold_binary_loc) [(X ^ Y) & Y]: Remove and merge 
> with...
> 	* match.pd ((X & Y) ^ Y): ... this.
> 	((X & Y) & Y, (X | Y) | Y, (X ^ Y) ^ Y, (X & Y) & (X & Z), (X | Y)
> 	| (X | Z), (X ^ Y) ^ (X ^ Z)): New transformations.
>
> gcc/testsuite/
> 	* gcc.dg/tree-ssa/bit-assoc.c: New testcase.
> 	* gcc.dg/tree-ssa/pr69270.c: Adjust.
> 	* gcc.dg/tree-ssa/vrp59.c: Disable forwprop.

Here it is again, I just replaced convert with convert[12] in the last 2 
transforms. This should matter for (unsigned)(si & 42) & (ui & 42u). I 
didn't change it in the other transform, because it would only matter in 
the case (T)(X & CST) & CST, which I think would be better served by 
extending the transform that currently handles (X & CST1) & CST2 (not done 
in this patch).
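
For concreteness, the case mentioned above looks like this at the source
level (made-up function, shown only to illustrate that a single operand of
the outer AND carries a conversion, which is why the two optional converts
in the pattern have to be independent):

/* Only the left inner AND is wrapped in a conversion (int -> unsigned);
   the right one is already unsigned, so an all-or-nothing convert? would
   not let the pattern match.  Hypothetical example.  */
unsigned
example (int si, unsigned ui)
{
  return (unsigned) (si & 42) & (ui & 42u);
}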

Comments

Richard Biener May 10, 2016, 9:27 a.m. UTC | #1
On Tue, May 10, 2016 at 8:11 AM, Marc Glisse <marc.glisse@inria.fr> wrote:
> On Fri, 6 May 2016, Marc Glisse wrote:
>
>> Here they are. I did (X&Y)&X and (X&Y)&(X&Z). The next one would be
>> ((X&Y)&Z)&X, but at some point we have to defer to reassoc.
>>
>> I didn't add the convert?+tree_nop_conversion_p to the existing transform
>> I modified. I guess at some point we should make a pass and add them to all
>> the transformations on bit operations...
>>
>> For (X & Y) & Y, I believe that any conversion is fine. For the others,
>> tree_nop_conversion_p is probably too strict (narrowing should be fine for
>> all), but I was too lazy to look for tighter conditions.
>>
>> (X ^ Y) ^ Y -> X should probably have (non_lvalue ...) on its output, but
>> in a simple test it didn't seem to matter. Is non_lvalue still needed?
>>
>>
>> Bootstrap+regtest on powerpc64le-unknown-linux-gnu.
>>
>> 2016-05-06  Marc Glisse  <marc.glisse@inria.fr>
>>
>> gcc/
>>         * fold-const.c (fold_binary_loc) [(X ^ Y) & Y]: Remove and merge
>> with...
>>         * match.pd ((X & Y) ^ Y): ... this.
>>         ((X & Y) & Y, (X | Y) | Y, (X ^ Y) ^ Y, (X & Y) & (X & Z), (X | Y)
>>         | (X | Z), (X ^ Y) ^ (X ^ Z)): New transformations.
>>
>> gcc/testsuite/
>>         * gcc.dg/tree-ssa/bit-assoc.c: New testcase.
>>         * gcc.dg/tree-ssa/pr69270.c: Adjust.
>>         * gcc.dg/tree-ssa/vrp59.c: Disable forwprop.
>
>
> Here it is again, I just replaced convert with convert[12] in the last 2
> transforms. This should matter for (unsigned)(si & 42) & (ui & 42u). I
> didn't change it in the other transform, because it would only matter in the
> case (T)(X & CST) & CST, which I think would be better served by extending
> the transform that currently handles (X & CST1) & CST2 (not done in this
> patch).

Ok.

Thanks!
Richard.

H.J. Lu May 11, 2016, 1:52 p.m. UTC | #2
On Mon, May 9, 2016 at 11:11 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
> On Fri, 6 May 2016, Marc Glisse wrote:
>
>> Here they are. I did (X&Y)&X and (X&Y)&(X&Z). The next one would be
>> ((X&Y)&Z)&X, but at some point we have to defer to reassoc.
>>
>> I didn't add the convert?+tree_nop_conversion_p to the existing transform
>> I modified. I guess at some point we should make a pass and add them to all
>> the transformations on bit operations...
>>
>> For (X & Y) & Y, I believe that any conversion is fine. For the others,
>> tree_nop_conversion_p is probably too strict (narrowing should be fine for
>> all), but I was too lazy to look for tighter conditions.
>>
>> (X ^ Y) ^ Y -> X should probably have (non_lvalue ...) on its output, but
>> in a simple test it didn't seem to matter. Is non_lvalue still needed?
>>
>>
>> Bootstrap+regtest on powerpc64le-unknown-linux-gnu.
>>
>> 2016-05-06  Marc Glisse  <marc.glisse@inria.fr>
>>
>> gcc/
>>         * fold-const.c (fold_binary_loc) [(X ^ Y) & Y]: Remove and merge
>> with...
>>         * match.pd ((X & Y) ^ Y): ... this.
>>         ((X & Y) & Y, (X | Y) | Y, (X ^ Y) ^ Y, (X & Y) & (X & Z), (X | Y)
>>         | (X | Z), (X ^ Y) ^ (X ^ Z)): New transformations.
>>
>> gcc/testsuite/
>>         * gcc.dg/tree-ssa/bit-assoc.c: New testcase.
>>         * gcc.dg/tree-ssa/pr69270.c: Adjust.
>>         * gcc.dg/tree-ssa/vrp59.c: Disable forwprop.
>
>
> Here it is again, I just replaced convert with convert[12] in the last 2
> transforms. This should matter for (unsigned)(si & 42) & (ui & 42u). I
> didn't change it in the other transform, because it would only matter in the
> case (T)(X & CST) & CST, which I think would be better served by extending
> the transform that currently handles (X & CST1) & CST2 (not done in this
> patch).

It caused:

FAIL: gcc.dg/tree-ssa/vrp47.c scan-tree-dump-times vrp2 " & 1;" 0

on x86.

H.J.
Marc Glisse May 11, 2016, 4:17 p.m. UTC | #3
On Wed, 11 May 2016, H.J. Lu wrote:

>>>         * fold-const.c (fold_binary_loc) [(X ^ Y) & Y]: Remove and merge with...
>>>         * match.pd ((X & Y) ^ Y): ... this.

> It caused:
>
> FAIL: gcc.dg/tree-ssa/vrp47.c scan-tree-dump-times vrp2 " & 1;" 0
>
> on x86.

Ah, yes, logical_op_short_circuit, so I didn't test it. Hmm, this doesn't seem
so easy. We want to compute (int)(x==0) in a branch where x is known to be 
in [0, 1]. VRP1 gives (int)(_Bool)(x^1). forwprop3 handles the double 
conversion, which gives (x^1)&1. Here the new transform applies and gives 
(~x)&1. VRP2 comes, and with (x^1)&1 it used to notice that this is just 
x^1 (remember that x is in [0, 1] in this branch), while it doesn't know 
what to do with (~x)&1. In the end, we get
 	notl	%eax
 	andl	$1, %eax
instead of
 	xorl	$1, %eax

(andn doesn't seem to have a version with an immediate)
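
A reduced sketch of the kind of code involved (not the actual
gcc.dg/tree-ssa/vrp47.c testcase, and the dumps for this toy function may
differ in detail):

/* In the branch, x is known to be in [0, 1].  Per the description above,
   x == 0 becomes (int)(_Bool)(x ^ 1) in VRP1, forwprop3 turns that into
   (x ^ 1) & 1, and the new transform then produces (~x) & 1, which VRP2
   no longer reduces to x ^ 1.  */
int
f (int x)
{
  if (x >= 0 && x <= 1)
    return x == 0;
  return 0;
}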

The transformation seems right to me, but we are then missing another 
transformation like ~X & Y -> X ^ Y when we know that X & ~Y is 0 (the 1 
bits of X are included in those of Y). That may not be easy with the 
limited bit tracking we have. A version limited to constant Y of the form 
2^n-1 (i.e. 0...01...1) where X has a range included in [0, Y] may be 
simpler.
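
A quick brute-force check of that identity on 8-bit values, just to
illustrate the condition (standalone, not part of the patch):

/* Whenever the set bits of x are a subset of those of y (x & ~y == 0),
   ~x & y equals x ^ y.  */
#include <assert.h>

int
main (void)
{
  for (unsigned x = 0; x < 256; x++)
    for (unsigned y = 0; y < 256; y++)
      if ((x & ~y) == 0)
        assert ((~x & y) == (x ^ y));
  return 0;
}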

We could also simplify (int)(_Bool)x to x using VRP information that x is 
in [0, 1], but apparently when VRP replaces x==0 with y=x^1,(_Bool)y, it 
does not compute a range for the new variable y, and by the time the next 
VRP pass comes, it is too late.

In the meantime, I propose xfailing this test...
Jeff Law May 11, 2016, 4:26 p.m. UTC | #4
On 05/11/2016 10:17 AM, Marc Glisse wrote:
> The transformation seems right to me, but we are then missing another
> transformation like ~X & Y -> X ^ Y when we know that X & ~Y is 0 (the 1
> bits of X are included in those of Y). That may not be easy with the
> limited bit tracking we have. A version limited to constant Y of the
> form 2^n-1 (i.e. 0...01...1) where X has a range included in [0, Y] may
> be simpler.
While we don't have strong bit tracking, we do track [0..1] ranges 
reasonably well, so it may be worth doing.

> We could also simplify (int)(_Bool)x to x using VRP information that x
> is in [0, 1], but apparently when VRP replaces x==0 with y=x^1,(_Bool)y,
> it does not compute a range for the new variable y, and by the time the
> next VRP pass comes, it is too late.
Seems like a clear oversight.

jeff
Marc Glisse May 11, 2016, 5:56 p.m. UTC | #5
On Wed, 11 May 2016, Jeff Law wrote:

>> We could also simplify (int)(_Bool)x to x using VRP information that x
>> is in [0, 1], but apparently when VRP replaces x==0 with y=x^1,(_Bool)y,
>> it does not compute a range for the new variable y, and by the time the
>> next VRP pass comes, it is too late.
> Seems like a clear oversight.

In get_value_range, there is:
   /* If we query the range for a new SSA name return an unmodifiable VARYING.
      We should get here at most from the substitute-and-fold stage which
      will never try to change values.  */
so this is a known limitation.

We could try to change that (XRESIZEVEC, memset(0) on the new elements, 
update num_vr_values to the new num_ssa_names, at this point vr_value 
should be replaced with a vector).

We could also use set_range_info and make simplify_conversion_using_ranges 
use get_range_info instead of get_value_range. Might even move the whole 
function to match.pd then ;-)
Marc Glisse May 12, 2016, 5:21 a.m. UTC | #6
On Wed, 11 May 2016, Jeff Law wrote:

> On 05/11/2016 10:17 AM, Marc Glisse wrote:
>> The transformation seems right to me, but we are then missing another
>> transformation like ~X & Y -> X ^ Y when we know that X & ~Y is 0 (the 1
>> bits of X are included in those of Y). That may not be easy with the
>> limited bit tracking we have. A version limited to constant Y of the
>> form 2^n-1 (i.e. 0...01...1) where X has a range included in [0, Y] may
>> be simpler.
> While we don't have strong bit tracking, we do track [0..1] ranges reasonably 
> well, so it may be worth doing.

I had started writing

+/* Simplify (~X & Y) to X ^ Y if we know that (X & ~Y) is 0.  */
+#if GIMPLE
+(simplify
+ (bit_and (bit_not SSA_NAME@0) INTEGER_CST@1)
+ (if ((get_nonzero_bits (@0) & wi::bit_not (@1)) == 0)
+  (bit_xor @0 @1)))
+#endif

but then I realized that VRP does not call the simplify machinery, so this 
would have to be rewritten in simplify_bit_ops_using_ranges. The upside is
that we could then compute:
may_be_nonzero_X & ~must_be_nonzero_Y
instead of assuming that Y is a constant (not that it would change 
anything in practice).
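
For illustration, the kind of source such a pattern is aimed at
(hypothetical example; whether the nonzero-bits information is actually
recorded for x at that point depends on what the earlier passes did):

/* After the mask, the nonzero bits of x are within 3, so ~x & 3 is
   equivalent to x ^ 3.  */
unsigned
g (unsigned x)
{
  x &= 3;
  return ~x & 3;
}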
Richard Biener May 12, 2016, 8:41 a.m. UTC | #7
On Wed, May 11, 2016 at 7:56 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
> On Wed, 11 May 2016, Jeff Law wrote:
>
>>> We could also simplify (int)(_Bool)x to x using VRP information that x
>>> is in [0, 1], but apparently when VRP replaces x==0 with y=x^1,(_Bool)y,
>>> it does not compute a range for the new variable y, and by the time the
>>> next VRP pass comes, it is too late.
>>
>> Seems like a clear oversight.
>
>
> In get_value_range, there is:
>   /* If we query the range for a new SSA name return an unmodifiable
> VARYING.
>      We should get here at most from the substitute-and-fold stage which
>      will never try to change values.  */
> so this is a known limitation.
>
> We could try to change that (XRESIZEVEC, memset(0) on the new elements,
> update num_vr_values to the new num_ssa_names, at this point vr_value should
> be replaced with a vector).
>
> We could also use set_range_info and make simplify_conversion_using_ranges
> use get_range_info instead of get_value_range. Might even move the whole
> function to match.pd then ;-)

Yeah - note that VRP already calls set_range_info before simplifying
stmts.  It's
just that substitute_and_fold doesn't apply fold_stmt (and thus match.pd) to
all stmts but it only applies the pass specific "fold" (vrp_fold_stmt)
to all stmts.

Richard.

> --
> Marc Glisse
Marc Glisse May 12, 2016, 4:02 p.m. UTC | #8
On Thu, 12 May 2016, Richard Biener wrote:

> Yeah - note that VRP already calls set_range_info before simplifying 
> stmts.  It's just that substitute_and_fold doesn't apply fold_stmt (and 
> thus match.pd) to all stmts but it only applies the pass specific "fold" 
> (vrp_fold_stmt) to all stmts.

Just to be sure: is the fact that VRP doesn't apply fold_stmt on purpose? 
The restriction makes sense, it is just that it may yield a bit of 
duplication. We already indirectly use get_range_info in match.pd and may 
miss out on opportunities that only occur in branches during the VRP pass.
Richard Biener May 12, 2016, 4:51 p.m. UTC | #9
On May 12, 2016 6:02:47 PM GMT+02:00, Marc Glisse <marc.glisse@inria.fr> wrote:
> On Thu, 12 May 2016, Richard Biener wrote:
>
>> Yeah - note that VRP already calls set_range_info before simplifying
>> stmts.  It's just that substitute_and_fold doesn't apply fold_stmt (and
>> thus match.pd) to all stmts but it only applies the pass specific "fold"
>> (vrp_fold_stmt) to all stmts.
>
> Just to be sure: is the fact that VRP doesn't apply fold_stmt on purpose?

The propagator only folds stmts that had operands replaced (that doesn't enable all simplifications, as match.pd patterns cover more than one statement).

> The restriction makes sense, it is just that it may yield a bit of
> duplication. We already indirectly use get_range_info in match.pd and may
> miss out on opportunities that only occur in branches during the VRP pass.

Yes.  The pass specific fold is called on each stmt.  Maybe we can omit the propagator's folding if the pass specific folding applied.

Richard.

Patch

Index: gcc/fold-const.c
===================================================================
--- gcc/fold-const.c	(revision 236047)
+++ gcc/fold-const.c	(working copy)
@@ -10064,59 +10064,20 @@  fold_binary_loc (location_t loc,
 	}
       /* Fold !X & 1 as X == 0.  */
       if (TREE_CODE (arg0) == TRUTH_NOT_EXPR
 	  && integer_onep (arg1))
 	{
 	  tem = TREE_OPERAND (arg0, 0);
 	  return fold_build2_loc (loc, EQ_EXPR, type, tem,
 				  build_zero_cst (TREE_TYPE (tem)));
 	}
 
-      /* Fold (X ^ Y) & Y as ~X & Y.  */
-      if (TREE_CODE (arg0) == BIT_XOR_EXPR
-	  && operand_equal_p (TREE_OPERAND (arg0, 1), arg1, 0))
-	{
-	  tem = fold_convert_loc (loc, type, TREE_OPERAND (arg0, 0));
-	  return fold_build2_loc (loc, BIT_AND_EXPR, type,
-			      fold_build1_loc (loc, BIT_NOT_EXPR, type, tem),
-			      fold_convert_loc (loc, type, arg1));
-	}
-      /* Fold (X ^ Y) & X as ~Y & X.  */
-      if (TREE_CODE (arg0) == BIT_XOR_EXPR
-	  && operand_equal_p (TREE_OPERAND (arg0, 0), arg1, 0)
-	  && reorder_operands_p (TREE_OPERAND (arg0, 1), arg1))
-	{
-	  tem = fold_convert_loc (loc, type, TREE_OPERAND (arg0, 1));
-	  return fold_build2_loc (loc, BIT_AND_EXPR, type,
-			      fold_build1_loc (loc, BIT_NOT_EXPR, type, tem),
-			      fold_convert_loc (loc, type, arg1));
-	}
-      /* Fold X & (X ^ Y) as X & ~Y.  */
-      if (TREE_CODE (arg1) == BIT_XOR_EXPR
-	  && operand_equal_p (arg0, TREE_OPERAND (arg1, 0), 0))
-	{
-	  tem = fold_convert_loc (loc, type, TREE_OPERAND (arg1, 1));
-	  return fold_build2_loc (loc, BIT_AND_EXPR, type,
-			      fold_convert_loc (loc, type, arg0),
-			      fold_build1_loc (loc, BIT_NOT_EXPR, type, tem));
-	}
-      /* Fold X & (Y ^ X) as ~Y & X.  */
-      if (TREE_CODE (arg1) == BIT_XOR_EXPR
-	  && operand_equal_p (arg0, TREE_OPERAND (arg1, 1), 0)
-	  && reorder_operands_p (arg0, TREE_OPERAND (arg1, 0)))
-	{
-	  tem = fold_convert_loc (loc, type, TREE_OPERAND (arg1, 0));
-	  return fold_build2_loc (loc, BIT_AND_EXPR, type,
-			      fold_build1_loc (loc, BIT_NOT_EXPR, type, tem),
-			      fold_convert_loc (loc, type, arg0));
-	}
-
       /* Fold (X * Y) & -(1 << CST) to X * Y if Y is a constant
          multiple of 1 << CST.  */
       if (TREE_CODE (arg1) == INTEGER_CST)
 	{
 	  wide_int cst1 = arg1;
 	  wide_int ncst1 = -cst1;
 	  if ((cst1 & ncst1) == ncst1
 	      && multiple_of_p (type, arg0,
 				wide_int_to_tree (TREE_TYPE (arg1), ncst1)))
 	    return fold_convert_loc (loc, type, arg0);
Index: gcc/match.pd
===================================================================
--- gcc/match.pd	(revision 236047)
+++ gcc/match.pd	(working copy)
@@ -667,39 +667,70 @@  DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (if (tree_nop_conversion_p (type, TREE_TYPE (@0))
       && tree_nop_conversion_p (type, TREE_TYPE (@1)))
   (bit_xor (convert @0) (convert @1))))
 
 /* Convert ~X ^ C to X ^ ~C.  */
 (simplify
  (bit_xor (convert? (bit_not @0)) INTEGER_CST@1)
  (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
   (bit_xor (convert @0) (bit_not @1))))
 
-/* Fold (X & Y) ^ Y as ~X & Y.  */
-(simplify
- (bit_xor:c (bit_and:c @0 @1) @1)
- (bit_and (bit_not @0) @1))
+/* Fold (X & Y) ^ Y and (X ^ Y) & Y as ~X & Y.  */
+(for opo (bit_and bit_xor)
+     opi (bit_xor bit_and)
+ (simplify
+  (opo:c (opi:c @0 @1) @1) 
+  (bit_and (bit_not @0) @1)))
 
 /* Given a bit-wise operation CODE applied to ARG0 and ARG1, see if both
    operands are another bit-wise operation with a common input.  If so,
    distribute the bit operations to save an operation and possibly two if
    constants are involved.  For example, convert
      (A | B) & (A | C) into A | (B & C)
    Further simplification will occur if B and C are constants.  */
 (for op (bit_and bit_ior bit_xor)
      rop (bit_ior bit_and bit_and)
  (simplify
   (op (convert? (rop:c @0 @1)) (convert? (rop:c @0 @2)))
   (if (tree_nop_conversion_p (type, TREE_TYPE (@1))
        && tree_nop_conversion_p (type, TREE_TYPE (@2)))
    (rop (convert @0) (op (convert @1) (convert @2))))))
 
+/* Some simple reassociation for bit operations, also handled in reassoc.  */
+/* (X & Y) & Y -> X & Y
+   (X | Y) | Y -> X | Y  */
+(for op (bit_and bit_ior)
+ (simplify
+  (op:c (convert?@2 (op:c @0 @1)) (convert? @1))
+  @2))
+/* (X ^ Y) ^ Y -> X  */
+(simplify
+ (bit_xor:c (convert? (bit_xor:c @0 @1)) (convert? @1))
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+  (convert @0)))
+/* (X & Y) & (X & Z) -> (X & Y) & Z
+   (X | Y) | (X | Z) -> (X | Y) | Z  */
+(for op (bit_and bit_ior)
+ (simplify
+  (op:c (convert1?@3 (op:c@4 @0 @1)) (convert2?@5 (op:c@6 @0 @2)))
+  (if (tree_nop_conversion_p (type, TREE_TYPE (@1))
+       && tree_nop_conversion_p (type, TREE_TYPE (@2)))
+   (if (single_use (@5) && single_use (@6))
+    (op @3 (convert @2))
+    (if (single_use (@3) && single_use (@4))
+     (op (convert @1) @5))))))
+/* (X ^ Y) ^ (X ^ Z) -> Y ^ Z  */
+(simplify
+ (bit_xor (convert1? (bit_xor:c @0 @1)) (convert2? (bit_xor:c @0 @2)))
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@1))
+      && tree_nop_conversion_p (type, TREE_TYPE (@2)))
+  (convert (bit_xor @1 @2))))
 
 (simplify
  (abs (abs@1 @0))
  @1)
 (simplify
  (abs (negate @0))
  (abs @0))
 (simplify
  (abs tree_expr_nonnegative_p@0)
  @0)
Index: gcc/testsuite/gcc.dg/tree-ssa/bit-assoc.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/bit-assoc.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/bit-assoc.c	(working copy)
@@ -0,0 +1,29 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O -fdump-tree-forwprop1-details -fdump-tree-ccp1-details" } */
+
+int f1(int a, int b){
+  int c = a & b;
+  return c & b;
+}
+int f2(int a, int b){
+  int c = a | b;
+  return b | c;
+}
+int g1(int a, int b, int c){
+  int d = a & b;
+  int e = b & c;
+  return d & e;
+}
+int g2(int a, int b, int c){
+  int d = a | b;
+  int e = c | b;
+  return d | e;
+}
+int g3(int a, int b, int c){
+  int d = a ^ b;
+  int e = b ^ c;
+  return e ^ d;
+}
+
+/* { dg-final { scan-tree-dump-times "Match-and-simplified" 2 "ccp1" } } */
+/* { dg-final { scan-tree-dump-times "gimple_simplified" 3 "forwprop1" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr69270.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr69270.c	(revision 236047)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr69270.c	(working copy)
@@ -1,21 +1,23 @@ 
 /* { dg-do compile } */
 /* { dg-options "-O2 -fsplit-paths -fdump-tree-dom3-details" } */
 
 /* There should be two references to bufferstep that turn into
    constants.  */
 /* { dg-final { scan-tree-dump-times "Replaced .bufferstep_\[0-9\]+. with constant .0." 1 "dom3"} } */
 /* { dg-final { scan-tree-dump-times "Replaced .bufferstep_\[0-9\]+. with constant .1." 1 "dom3"} } */
 
 /* And some assignments ought to fold down to constants.  */
-/* { dg-final { scan-tree-dump-times "Folded to: _\[0-9\]+ = 1;" 2 "dom3"} } */
-/* { dg-final { scan-tree-dump-times "Folded to: _\[0-9\]+ = 0;" 2 "dom3"} } */
+/* { dg-final { scan-tree-dump-times "Folded to: _\[0-9\]+ = -1;" 1 "dom3"} } */
+/* { dg-final { scan-tree-dump-times "Folded to: _\[0-9\]+ = -2;" 1 "dom3"} } */
+/* { dg-final { scan-tree-dump-times "Folded to: _\[0-9\]+ = 1;" 1 "dom3"} } */
+/* { dg-final { scan-tree-dump-times "Folded to: _\[0-9\]+ = 0;" 1 "dom3"} } */
 
 /* The XOR operations should have been optimized to constants.  */
 /* { dg-final { scan-tree-dump-not "bit_xor" "dom3"} } */
 
 
 extern int *stepsizeTable;
 
 void
 adpcm_coder (signed char *outdata, int len)
 {
Index: gcc/testsuite/gcc.dg/tree-ssa/vrp59.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/vrp59.c	(revision 236047)
+++ gcc/testsuite/gcc.dg/tree-ssa/vrp59.c	(working copy)
@@ -1,12 +1,12 @@ 
 /* { dg-do compile } */
-/* { dg-options "-O2 -fno-tree-ccp -fdump-tree-vrp1" } */
+/* { dg-options "-O2 -fno-tree-ccp -fno-tree-forwprop -fdump-tree-vrp1" } */
 
 int f(int x)
 {
   if (x >= 0 && x <= 3)
     {
       x = x ^ 3;
       x = x & 3;
     }
   return x;
 }