Message ID | 20191206164148.GF10088@tucnak |
---|---|
State | New |
Headers | show |
Series | Canonicalize fancy ways of expressing blend operation into COND_EXPR (PR tree-optimization/92834) | expand |
On Fri, 6 Dec 2019, Jakub Jelinek wrote: > Hi! > > The following patch canonicalizes fancy ways of writing cond ? A : B > into COND_EXPR, which is what we expect people writing and thus are able to > optimize it better. If in some case we wouldn't optimize it better, > the right way would be improve the COND_EXPR handling, as that is what > people are using in the wild most of the time. > E.g. on the testcase in the patch on x86_64-linux with -O2, the difference > is that test used to be 519 bytes long and now is 223, with -O2 > -march=skylake used to be the same 519 bytes long and now is 275 bytes (in > that case it uses the SSE4.1+ min/max). > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? OK. Thanks, Richard. > 2019-12-06 Jakub Jelinek <jakub@redhat.com> > > PR tree-optimization/92834 > * match.pd (A - ((A - B) & -(C cmp D)) -> (C cmp D) ? B : A, > A + ((B - A) & -(C cmp D)) -> (C cmp D) ? B : A): New simplifications. > > * gcc.dg/tree-ssa/pr92834.c: New test. > > --- gcc/match.pd.jj 2019-12-06 14:07:26.877749065 +0100 > +++ gcc/match.pd 2019-12-06 15:06:08.042953309 +0100 > @@ -2697,6 +2697,31 @@ (define_operator_list COND_TERNARY > (cmp (minmax @0 INTEGER_CST@1) INTEGER_CST@2) > (comb (cmp @0 @2) (cmp @1 @2)))) > > +/* Undo fancy way of writing max/min or other ?: expressions, > + like a - ((a - b) & -(a < b)), in this case into (a < b) ? b : a. > + People normally use ?: and that is what we actually try to optimize. */ > +(for cmp (simple_comparison) > + (simplify > + (minus @0 (bit_and:c (minus @0 @1) > + (convert? (negate@4 (convert? (cmp@5 @2 @3)))))) > + (if (INTEGRAL_TYPE_P (type) > + && INTEGRAL_TYPE_P (TREE_TYPE (@4)) > + && TREE_CODE (TREE_TYPE (@4)) != BOOLEAN_TYPE > + && INTEGRAL_TYPE_P (TREE_TYPE (@5)) > + && (TYPE_PRECISION (TREE_TYPE (@4)) >= TYPE_PRECISION (type) > + || !TYPE_UNSIGNED (TREE_TYPE (@4)))) > + (cond (cmp @2 @3) @1 @0))) > + (simplify > + (plus:c @0 (bit_and:c (minus @1 @0) > + (convert? (negate@4 (convert? 
(cmp@5 @2 @3)))))) > + (if (INTEGRAL_TYPE_P (type) > + && INTEGRAL_TYPE_P (TREE_TYPE (@4)) > + && TREE_CODE (TREE_TYPE (@4)) != BOOLEAN_TYPE > + && INTEGRAL_TYPE_P (TREE_TYPE (@5)) > + && (TYPE_PRECISION (TREE_TYPE (@4)) >= TYPE_PRECISION (type) > + || !TYPE_UNSIGNED (TREE_TYPE (@4)))) > + (cond (cmp @2 @3) @1 @0)))) > + > /* Simplifications of shift and rotates. */ > > (for rotate (lrotate rrotate) > --- gcc/testsuite/gcc.dg/tree-ssa/pr92834.c.jj 2019-12-06 15:24:56.817353747 +0100 > +++ gcc/testsuite/gcc.dg/tree-ssa/pr92834.c 2019-12-06 15:24:08.921100518 +0100 > @@ -0,0 +1,122 @@ > +/* PR tree-optimization/92834 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > +/* { dg-final { scan-tree-dump-times "MIN_EXPR <" 8 "optimized" } } */ > +/* { dg-final { scan-tree-dump-times "MAX_EXPR <" 8 "optimized" } } */ > + > +static inline unsigned > +umax1 (unsigned a, unsigned b) > +{ > + return a - ((a - b) & -(a < b)); > +} > + > +static inline unsigned > +umin1 (unsigned a, unsigned b) > +{ > + return a - ((a - b) & -(a > b)); > +} > + > +static inline int > +smax1 (int a, int b) > +{ > + return a - ((a - b) & -(a < b)); > +} > + > +static inline int > +smin1 (int a, int b) > +{ > + return a - ((a - b) & -(a > b)); > +} > + > +static inline unsigned long long > +umax2 (unsigned long long a, unsigned long long b) > +{ > + return a - ((a - b) & -(a <= b)); > +} > + > +static inline unsigned long long > +umin2 (unsigned long long a, unsigned long long b) > +{ > + return a - ((a - b) & -(a >= b)); > +} > + > +static inline long long > +smax2 (long long a, long long b) > +{ > + return a - ((a - b) & -(a <= b)); > +} > + > +static inline long long > +smin2 (long long a, long long b) > +{ > + return a - ((a - b) & -(a >= b)); > +} > + > +static inline unsigned > +umax3 (unsigned a, unsigned b) > +{ > + return a + ((b - a) & -(a < b)); > +} > + > +static inline unsigned > +umin3 (unsigned a, unsigned b) > +{ > + return a + ((b - a) & -(a > b)); > 
+} > + > +static inline int > +smax3 (int a, int b) > +{ > + return a + ((b - a) & -(a < b)); > +} > + > +static inline int > +smin3 (int a, int b) > +{ > + return a + ((b - a) & -(a > b)); > +} > + > +static inline unsigned long long > +umax4 (unsigned long long a, unsigned long long b) > +{ > + return a + ((b - a) & -(a <= b)); > +} > + > +static inline unsigned long long > +umin4 (unsigned long long a, unsigned long long b) > +{ > + return a + ((b - a) & -(a >= b)); > +} > + > +static inline long long > +smax4 (long long a, long long b) > +{ > + return a + ((b - a) & -(a <= b)); > +} > + > +static inline long long > +smin4 (long long a, long long b) > +{ > + return a + ((b - a) & -(a >= b)); > +} > + > +void > +test (unsigned *x, int *y, unsigned long long *z, long long *w) > +{ > + x[2] = umax1 (x[0], x[1]); > + x[5] = umin1 (x[2], x[3]); > + y[2] = smax1 (y[0], y[1]); > + y[5] = smin1 (y[2], y[3]); > + z[2] = umax2 (z[0], z[1]); > + z[5] = umin2 (z[2], z[3]); > + w[2] = smax2 (w[0], w[1]); > + w[5] = smin2 (w[2], w[3]); > + x[8] = umax3 (x[6], x[7]); > + x[11] = umin3 (x[9], x[10]); > + y[8] = smax3 (y[6], y[7]); > + y[11] = smin3 (y[9], y[10]); > + z[8] = umax4 (z[6], z[7]); > + z[11] = umin4 (z[9], z[10]); > + w[8] = smax4 (w[6], w[7]); > + w[11] = smin4 (w[9], w[10]); > +} > > Jakub > >
On Fri, 6 Dec 2019, Jakub Jelinek wrote: > --- gcc/match.pd.jj 2019-12-06 14:07:26.877749065 +0100 > +++ gcc/match.pd 2019-12-06 15:06:08.042953309 +0100 > @@ -2697,6 +2697,31 @@ (define_operator_list COND_TERNARY > (cmp (minmax @0 INTEGER_CST@1) INTEGER_CST@2) > (comb (cmp @0 @2) (cmp @1 @2)))) > > +/* Undo fancy way of writing max/min or other ?: expressions, > + like a - ((a - b) & -(a < b)), in this case into (a < b) ? b : a. > + People normally use ?: and that is what we actually try to optimize. */ > +(for cmp (simple_comparison) > + (simplify > + (minus @0 (bit_and:c (minus @0 @1) > + (convert? (negate@4 (convert? (cmp@5 @2 @3)))))) > + (if (INTEGRAL_TYPE_P (type) > + && INTEGRAL_TYPE_P (TREE_TYPE (@4)) > + && TREE_CODE (TREE_TYPE (@4)) != BOOLEAN_TYPE > + && INTEGRAL_TYPE_P (TREE_TYPE (@5)) > + && (TYPE_PRECISION (TREE_TYPE (@4)) >= TYPE_PRECISION (type) > + || !TYPE_UNSIGNED (TREE_TYPE (@4)))) > + (cond (cmp @2 @3) @1 @0))) I was going to suggest (cond @5 @1 @0) and possibly replacing (cmp@5 @2 @3) with truth_valued_p@5, before remembering that COND_EXPR embeds the comparison, and that not transforming when we don't see the comparison is likely on purpose. Plus, if @5 was in a signed 1-bit type, it may look more like -1 than 1 and break the transformation (is that forbidden as return type of a comparison?).
On Mon, Dec 09, 2019 at 10:54:34AM +0100, Marc Glisse wrote: > On Fri, 6 Dec 2019, Jakub Jelinek wrote: > > > --- gcc/match.pd.jj 2019-12-06 14:07:26.877749065 +0100 > > +++ gcc/match.pd 2019-12-06 15:06:08.042953309 +0100 > > @@ -2697,6 +2697,31 @@ (define_operator_list COND_TERNARY > > (cmp (minmax @0 INTEGER_CST@1) INTEGER_CST@2) > > (comb (cmp @0 @2) (cmp @1 @2)))) > > > > +/* Undo fancy way of writing max/min or other ?: expressions, > > + like a - ((a - b) & -(a < b)), in this case into (a < b) ? b : a. > > + People normally use ?: and that is what we actually try to optimize. */ > > +(for cmp (simple_comparison) > > + (simplify > > + (minus @0 (bit_and:c (minus @0 @1) > > + (convert? (negate@4 (convert? (cmp@5 @2 @3)))))) > > + (if (INTEGRAL_TYPE_P (type) > > + && INTEGRAL_TYPE_P (TREE_TYPE (@4)) > > + && TREE_CODE (TREE_TYPE (@4)) != BOOLEAN_TYPE > > + && INTEGRAL_TYPE_P (TREE_TYPE (@5)) > > + && (TYPE_PRECISION (TREE_TYPE (@4)) >= TYPE_PRECISION (type) > > + || !TYPE_UNSIGNED (TREE_TYPE (@4)))) > > + (cond (cmp @2 @3) @1 @0))) > > I was going to suggest > (cond @5 @1 @0) > > and possibly replacing (cmp@5 @2 @3) with truth_valued_p@5, before > remembering that COND_EXPR embeds the comparison, and that not transforming > when we don't see the comparison is likely on purpose. Plus, if @5 was in a > signed 1-bit type, it may look more like -1 than 1 and break the > transformation (is that forbidden as return type of a comparison?). FYI, I've already committed the patch, so any improvement or bugfix needs to be done incrementally. The comparison in there was mainly an attempt to have a truth value in there, so maybe truth_valued_p would work too, maybe even a get_range_info checked value of [0, 1] would, though perhaps just truth_valued_p is better because it involves some kind of setcc-like instruction in the end. All I'd like to see for comparisons is that they are in the COND_EXPR's first operand if they can't throw. 
I'm afraid I have no idea whether we can have signed 1-bit truth_valued_p operations and what will happen with them, if it is possible, then I guess an additional condition will be needed, check that it has prec > 1 or TYPE_UNSIGNED. Jakub
--- gcc/match.pd.jj 2019-12-06 14:07:26.877749065 +0100 +++ gcc/match.pd 2019-12-06 15:06:08.042953309 +0100 @@ -2697,6 +2697,31 @@ (define_operator_list COND_TERNARY (cmp (minmax @0 INTEGER_CST@1) INTEGER_CST@2) (comb (cmp @0 @2) (cmp @1 @2)))) +/* Undo fancy way of writing max/min or other ?: expressions, + like a - ((a - b) & -(a < b)), in this case into (a < b) ? b : a. + People normally use ?: and that is what we actually try to optimize. */ +(for cmp (simple_comparison) + (simplify + (minus @0 (bit_and:c (minus @0 @1) + (convert? (negate@4 (convert? (cmp@5 @2 @3)))))) + (if (INTEGRAL_TYPE_P (type) + && INTEGRAL_TYPE_P (TREE_TYPE (@4)) + && TREE_CODE (TREE_TYPE (@4)) != BOOLEAN_TYPE + && INTEGRAL_TYPE_P (TREE_TYPE (@5)) + && (TYPE_PRECISION (TREE_TYPE (@4)) >= TYPE_PRECISION (type) + || !TYPE_UNSIGNED (TREE_TYPE (@4)))) + (cond (cmp @2 @3) @1 @0))) + (simplify + (plus:c @0 (bit_and:c (minus @1 @0) + (convert? (negate@4 (convert? (cmp@5 @2 @3)))))) + (if (INTEGRAL_TYPE_P (type) + && INTEGRAL_TYPE_P (TREE_TYPE (@4)) + && TREE_CODE (TREE_TYPE (@4)) != BOOLEAN_TYPE + && INTEGRAL_TYPE_P (TREE_TYPE (@5)) + && (TYPE_PRECISION (TREE_TYPE (@4)) >= TYPE_PRECISION (type) + || !TYPE_UNSIGNED (TREE_TYPE (@4)))) + (cond (cmp @2 @3) @1 @0)))) + /* Simplifications of shift and rotates. 
*/ (for rotate (lrotate rrotate) --- gcc/testsuite/gcc.dg/tree-ssa/pr92834.c.jj 2019-12-06 15:24:56.817353747 +0100 +++ gcc/testsuite/gcc.dg/tree-ssa/pr92834.c 2019-12-06 15:24:08.921100518 +0100 @@ -0,0 +1,122 @@ +/* PR tree-optimization/92834 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump-times "MIN_EXPR <" 8 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "MAX_EXPR <" 8 "optimized" } } */ + +static inline unsigned +umax1 (unsigned a, unsigned b) +{ + return a - ((a - b) & -(a < b)); +} + +static inline unsigned +umin1 (unsigned a, unsigned b) +{ + return a - ((a - b) & -(a > b)); +} + +static inline int +smax1 (int a, int b) +{ + return a - ((a - b) & -(a < b)); +} + +static inline int +smin1 (int a, int b) +{ + return a - ((a - b) & -(a > b)); +} + +static inline unsigned long long +umax2 (unsigned long long a, unsigned long long b) +{ + return a - ((a - b) & -(a <= b)); +} + +static inline unsigned long long +umin2 (unsigned long long a, unsigned long long b) +{ + return a - ((a - b) & -(a >= b)); +} + +static inline long long +smax2 (long long a, long long b) +{ + return a - ((a - b) & -(a <= b)); +} + +static inline long long +smin2 (long long a, long long b) +{ + return a - ((a - b) & -(a >= b)); +} + +static inline unsigned +umax3 (unsigned a, unsigned b) +{ + return a + ((b - a) & -(a < b)); +} + +static inline unsigned +umin3 (unsigned a, unsigned b) +{ + return a + ((b - a) & -(a > b)); +} + +static inline int +smax3 (int a, int b) +{ + return a + ((b - a) & -(a < b)); +} + +static inline int +smin3 (int a, int b) +{ + return a + ((b - a) & -(a > b)); +} + +static inline unsigned long long +umax4 (unsigned long long a, unsigned long long b) +{ + return a + ((b - a) & -(a <= b)); +} + +static inline unsigned long long +umin4 (unsigned long long a, unsigned long long b) +{ + return a + ((b - a) & -(a >= b)); +} + +static inline long long +smax4 (long long a, long long b) +{ + return a 
+ ((b - a) & -(a <= b)); +} + +static inline long long +smin4 (long long a, long long b) +{ + return a + ((b - a) & -(a >= b)); +} + +void +test (unsigned *x, int *y, unsigned long long *z, long long *w) +{ + x[2] = umax1 (x[0], x[1]); + x[5] = umin1 (x[2], x[3]); + y[2] = smax1 (y[0], y[1]); + y[5] = smin1 (y[2], y[3]); + z[2] = umax2 (z[0], z[1]); + z[5] = umin2 (z[2], z[3]); + w[2] = smax2 (w[0], w[1]); + w[5] = smin2 (w[2], w[3]); + x[8] = umax3 (x[6], x[7]); + x[11] = umin3 (x[9], x[10]); + y[8] = smax3 (y[6], y[7]); + y[11] = smin3 (y[9], y[10]); + z[8] = umax4 (z[6], z[7]); + z[11] = umin4 (z[9], z[10]); + w[8] = smax4 (w[6], w[7]); + w[11] = smin4 (w[9], w[10]); +}