Canonicalize fancy ways of expressing blend operation into COND_EXPR (PR tree-optimization/92834)
diff mbox series

Message ID 20191206164148.GF10088@tucnak
State New
Headers show
Series
  • Canonicalize fancy ways of expressing blend operation into COND_EXPR (PR tree-optimization/92834)
Related show

Commit Message

Jakub Jelinek Dec. 6, 2019, 4:41 p.m. UTC
Hi!

The following patch canonicalizes fancy ways of writing cond ? A : B
into COND_EXPR, which is what we expect people to write and thus are able to
optimize it better.  If in some case we wouldn't optimize it better,
the right way would be to improve the COND_EXPR handling, as that is what
people are using in the wild most of the time.
E.g. on the testcase in the patch on x86_64-linux with -O2, the difference
is that test used to be 519 bytes long and now is 223, with -O2
-march=skylake used to be the same 519 bytes long and now is 275 bytes (in
that case it uses the SSE4.1+ min/max).

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2019-12-06  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/92834
	* match.pd (A - ((A - B) & -(C cmp D)) -> (C cmp D) ? B : A,
	A + ((B - A) & -(C cmp D)) -> (C cmp D) ? B : A): New simplifications.

	* gcc.dg/tree-ssa/pr92834.c: New test.


	Jakub

Comments

Richard Biener Dec. 9, 2019, 9:08 a.m. UTC | #1
On Fri, 6 Dec 2019, Jakub Jelinek wrote:

> Hi!
> 
> The following patch canonicalizes fancy ways of writing cond ? A : B
> into COND_EXPR, which is what we expect people to write and thus are able to
> optimize it better.  If in some case we wouldn't optimize it better,
> the right way would be to improve the COND_EXPR handling, as that is what
> people are using in the wild most of the time.
> E.g. on the testcase in the patch on x86_64-linux with -O2, the difference
> is that test used to be 519 bytes long and now is 223, with -O2
> -march=skylake used to be the same 519 bytes long and now is 275 bytes (in
> that case it uses the SSE4.1+ min/max).
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

OK.

Thanks,
Richard.

> 2019-12-06  Jakub Jelinek  <jakub@redhat.com>
> 
> 	PR tree-optimization/92834
> 	* match.pd (A - ((A - B) & -(C cmp D)) -> (C cmp D) ? B : A,
> 	A + ((B - A) & -(C cmp D)) -> (C cmp D) ? B : A): New simplifications.
> 
> 	* gcc.dg/tree-ssa/pr92834.c: New test.
> 
> --- gcc/match.pd.jj	2019-12-06 14:07:26.877749065 +0100
> +++ gcc/match.pd	2019-12-06 15:06:08.042953309 +0100
> @@ -2697,6 +2697,31 @@ (define_operator_list COND_TERNARY
>    (cmp (minmax @0 INTEGER_CST@1) INTEGER_CST@2)
>    (comb (cmp @0 @2) (cmp @1 @2))))
>  
> +/* Undo fancy way of writing max/min or other ?: expressions,
> +   like a - ((a - b) & -(a < b)), in this case into (a < b) ? b : a.
> +   People normally use ?: and that is what we actually try to optimize.  */
> +(for cmp (simple_comparison)
> + (simplify
> +  (minus @0 (bit_and:c (minus @0 @1)
> +		       (convert? (negate@4 (convert? (cmp@5 @2 @3))))))
> +  (if (INTEGRAL_TYPE_P (type)
> +       && INTEGRAL_TYPE_P (TREE_TYPE (@4))
> +       && TREE_CODE (TREE_TYPE (@4)) != BOOLEAN_TYPE
> +       && INTEGRAL_TYPE_P (TREE_TYPE (@5))
> +       && (TYPE_PRECISION (TREE_TYPE (@4)) >= TYPE_PRECISION (type)
> +	   || !TYPE_UNSIGNED (TREE_TYPE (@4))))
> +   (cond (cmp @2 @3) @1 @0)))
> + (simplify
> +  (plus:c @0 (bit_and:c (minus @1 @0)
> +			(convert? (negate@4 (convert? (cmp@5 @2 @3))))))
> +  (if (INTEGRAL_TYPE_P (type)
> +       && INTEGRAL_TYPE_P (TREE_TYPE (@4))
> +       && TREE_CODE (TREE_TYPE (@4)) != BOOLEAN_TYPE
> +       && INTEGRAL_TYPE_P (TREE_TYPE (@5))
> +       && (TYPE_PRECISION (TREE_TYPE (@4)) >= TYPE_PRECISION (type)
> +	   || !TYPE_UNSIGNED (TREE_TYPE (@4))))
> +   (cond (cmp @2 @3) @1 @0))))
> +
>  /* Simplifications of shift and rotates.  */
>  
>  (for rotate (lrotate rrotate)
> --- gcc/testsuite/gcc.dg/tree-ssa/pr92834.c.jj	2019-12-06 15:24:56.817353747 +0100
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr92834.c	2019-12-06 15:24:08.921100518 +0100
> @@ -0,0 +1,122 @@
> +/* PR tree-optimization/92834 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump-times "MIN_EXPR <" 8 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "MAX_EXPR <" 8 "optimized" } } */
> +
> +static inline unsigned
> +umax1 (unsigned a, unsigned b)
> +{
> +  return a - ((a - b) & -(a < b));
> +}
> +
> +static inline unsigned
> +umin1 (unsigned a, unsigned b)
> +{
> +  return a - ((a - b) & -(a > b));
> +}
> +
> +static inline int
> +smax1 (int a, int b)
> +{
> +  return a - ((a - b) & -(a < b));
> +}
> +
> +static inline int
> +smin1 (int a, int b)
> +{
> +  return a - ((a - b) & -(a > b));
> +}
> +
> +static inline unsigned long long
> +umax2 (unsigned long long a, unsigned long long b)
> +{
> +  return a - ((a - b) & -(a <= b));
> +}
> +
> +static inline unsigned long long
> +umin2 (unsigned long long a, unsigned long long b)
> +{
> +  return a - ((a - b) & -(a >= b));
> +}
> +
> +static inline long long
> +smax2 (long long a, long long b)
> +{
> +  return a - ((a - b) & -(a <= b));
> +}
> +
> +static inline long long
> +smin2 (long long a, long long b)
> +{
> +  return a - ((a - b) & -(a >= b));
> +}
> +
> +static inline unsigned
> +umax3 (unsigned a, unsigned b)
> +{
> +  return a + ((b - a) & -(a < b));
> +}
> +
> +static inline unsigned
> +umin3 (unsigned a, unsigned b)
> +{
> +  return a + ((b - a) & -(a > b));
> +}
> +
> +static inline int
> +smax3 (int a, int b)
> +{
> +  return a + ((b - a) & -(a < b));
> +}
> +
> +static inline int
> +smin3 (int a, int b)
> +{
> +  return a + ((b - a) & -(a > b));
> +}
> +
> +static inline unsigned long long
> +umax4 (unsigned long long a, unsigned long long b)
> +{
> +  return a + ((b - a) & -(a <= b));
> +}
> +
> +static inline unsigned long long
> +umin4 (unsigned long long a, unsigned long long b)
> +{
> +  return a + ((b - a) & -(a >= b));
> +}
> +
> +static inline long long
> +smax4 (long long a, long long b)
> +{
> +  return a + ((b - a) & -(a <= b));
> +}
> +
> +static inline long long
> +smin4 (long long a, long long b)
> +{
> +  return a + ((b - a) & -(a >= b));
> +}
> +
> +void
> +test (unsigned *x, int *y, unsigned long long *z, long long *w)
> +{
> +  x[2] = umax1 (x[0], x[1]);
> +  x[5] = umin1 (x[2], x[3]);
> +  y[2] = smax1 (y[0], y[1]);
> +  y[5] = smin1 (y[2], y[3]);
> +  z[2] = umax2 (z[0], z[1]);
> +  z[5] = umin2 (z[2], z[3]);
> +  w[2] = smax2 (w[0], w[1]);
> +  w[5] = smin2 (w[2], w[3]);
> +  x[8] = umax3 (x[6], x[7]);
> +  x[11] = umin3 (x[9], x[10]);
> +  y[8] = smax3 (y[6], y[7]);
> +  y[11] = smin3 (y[9], y[10]);
> +  z[8] = umax4 (z[6], z[7]);
> +  z[11] = umin4 (z[9], z[10]);
> +  w[8] = smax4 (w[6], w[7]);
> +  w[11] = smin4 (w[9], w[10]);
> +}
> 
> 	Jakub
> 
>
Marc Glisse Dec. 9, 2019, 9:54 a.m. UTC | #2
On Fri, 6 Dec 2019, Jakub Jelinek wrote:

> --- gcc/match.pd.jj	2019-12-06 14:07:26.877749065 +0100
> +++ gcc/match.pd	2019-12-06 15:06:08.042953309 +0100
> @@ -2697,6 +2697,31 @@ (define_operator_list COND_TERNARY
>   (cmp (minmax @0 INTEGER_CST@1) INTEGER_CST@2)
>   (comb (cmp @0 @2) (cmp @1 @2))))
>
> +/* Undo fancy way of writing max/min or other ?: expressions,
> +   like a - ((a - b) & -(a < b)), in this case into (a < b) ? b : a.
> +   People normally use ?: and that is what we actually try to optimize.  */
> +(for cmp (simple_comparison)
> + (simplify
> +  (minus @0 (bit_and:c (minus @0 @1)
> +		       (convert? (negate@4 (convert? (cmp@5 @2 @3))))))
> +  (if (INTEGRAL_TYPE_P (type)
> +       && INTEGRAL_TYPE_P (TREE_TYPE (@4))
> +       && TREE_CODE (TREE_TYPE (@4)) != BOOLEAN_TYPE
> +       && INTEGRAL_TYPE_P (TREE_TYPE (@5))
> +       && (TYPE_PRECISION (TREE_TYPE (@4)) >= TYPE_PRECISION (type)
> +	   || !TYPE_UNSIGNED (TREE_TYPE (@4))))
> +   (cond (cmp @2 @3) @1 @0)))

I was going to suggest
  (cond @5 @1 @0)

and possibly replacing (cmp@5 @2 @3) with truth_valued_p@5, before 
remembering that COND_EXPR embeds the comparison, and that not 
transforming when we don't see the comparison is likely on purpose. Plus, 
if @5 was in a signed 1-bit type, it may look more like -1 than 1 and 
break the transformation (is that forbidden as return type of a 
comparison?).
Jakub Jelinek Dec. 9, 2019, 10:30 a.m. UTC | #3
On Mon, Dec 09, 2019 at 10:54:34AM +0100, Marc Glisse wrote:
> On Fri, 6 Dec 2019, Jakub Jelinek wrote:
> 
> > --- gcc/match.pd.jj	2019-12-06 14:07:26.877749065 +0100
> > +++ gcc/match.pd	2019-12-06 15:06:08.042953309 +0100
> > @@ -2697,6 +2697,31 @@ (define_operator_list COND_TERNARY
> >   (cmp (minmax @0 INTEGER_CST@1) INTEGER_CST@2)
> >   (comb (cmp @0 @2) (cmp @1 @2))))
> > 
> > +/* Undo fancy way of writing max/min or other ?: expressions,
> > +   like a - ((a - b) & -(a < b)), in this case into (a < b) ? b : a.
> > +   People normally use ?: and that is what we actually try to optimize.  */
> > +(for cmp (simple_comparison)
> > + (simplify
> > +  (minus @0 (bit_and:c (minus @0 @1)
> > +		       (convert? (negate@4 (convert? (cmp@5 @2 @3))))))
> > +  (if (INTEGRAL_TYPE_P (type)
> > +       && INTEGRAL_TYPE_P (TREE_TYPE (@4))
> > +       && TREE_CODE (TREE_TYPE (@4)) != BOOLEAN_TYPE
> > +       && INTEGRAL_TYPE_P (TREE_TYPE (@5))
> > +       && (TYPE_PRECISION (TREE_TYPE (@4)) >= TYPE_PRECISION (type)
> > +	   || !TYPE_UNSIGNED (TREE_TYPE (@4))))
> > +   (cond (cmp @2 @3) @1 @0)))
> 
> I was going to suggest
>  (cond @5 @1 @0)
> 
> and possibly replacing (cmp@5 @2 @3) with truth_valued_p@5, before
> remembering that COND_EXPR embeds the comparison, and that not transforming
> when we don't see the comparison is likely on purpose. Plus, if @5 was in a
> signed 1-bit type, it may look more like -1 than 1 and break the
> transformation (is that forbidden as return type of a comparison?).

FYI, I've already committed the patch, so any improvement or bugfix needs to
be done incrementally.
The comparison in there was mainly an attempt to have a truth value
in there, so maybe truth_valued_p would work too, maybe even a
get_range_info checked value of [0, 1] would, though perhaps just
truth_valued_p is better because it involves some kind of setcc-like
instruction in the end.  All I'd like to see for comparisons is that they
are in the COND_EXPR's first operand if they can't throw.
I'm afraid I have no idea whether we can have signed 1-bit truth_valued_p
operations and what will happen with them, if it is possible, then I guess
an additional condition will be needed, check that it has prec > 1 or
TYPE_UNSIGNED.

	Jakub

Patch
diff mbox series

--- gcc/match.pd.jj	2019-12-06 14:07:26.877749065 +0100
+++ gcc/match.pd	2019-12-06 15:06:08.042953309 +0100
@@ -2697,6 +2697,31 @@  (define_operator_list COND_TERNARY
   (cmp (minmax @0 INTEGER_CST@1) INTEGER_CST@2)
   (comb (cmp @0 @2) (cmp @1 @2))))
 
+/* Undo fancy way of writing max/min or other ?: expressions,
+   like a - ((a - b) & -(a < b)), in this case into (a < b) ? b : a.
+   People normally use ?: and that is what we actually try to optimize.  */
+(for cmp (simple_comparison)
+ (simplify
+  (minus @0 (bit_and:c (minus @0 @1)
+		       (convert? (negate@4 (convert? (cmp@5 @2 @3))))))
+  (if (INTEGRAL_TYPE_P (type)
+       && INTEGRAL_TYPE_P (TREE_TYPE (@4))
+       && TREE_CODE (TREE_TYPE (@4)) != BOOLEAN_TYPE
+       && INTEGRAL_TYPE_P (TREE_TYPE (@5))
+       && (TYPE_PRECISION (TREE_TYPE (@4)) >= TYPE_PRECISION (type)
+	   || !TYPE_UNSIGNED (TREE_TYPE (@4))))
+   (cond (cmp @2 @3) @1 @0)))
+ (simplify
+  (plus:c @0 (bit_and:c (minus @1 @0)
+			(convert? (negate@4 (convert? (cmp@5 @2 @3))))))
+  (if (INTEGRAL_TYPE_P (type)
+       && INTEGRAL_TYPE_P (TREE_TYPE (@4))
+       && TREE_CODE (TREE_TYPE (@4)) != BOOLEAN_TYPE
+       && INTEGRAL_TYPE_P (TREE_TYPE (@5))
+       && (TYPE_PRECISION (TREE_TYPE (@4)) >= TYPE_PRECISION (type)
+	   || !TYPE_UNSIGNED (TREE_TYPE (@4))))
+   (cond (cmp @2 @3) @1 @0))))
+
 /* Simplifications of shift and rotates.  */
 
 (for rotate (lrotate rrotate)
--- gcc/testsuite/gcc.dg/tree-ssa/pr92834.c.jj	2019-12-06 15:24:56.817353747 +0100
+++ gcc/testsuite/gcc.dg/tree-ssa/pr92834.c	2019-12-06 15:24:08.921100518 +0100
@@ -0,0 +1,122 @@ 
+/* PR tree-optimization/92834 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-times "MIN_EXPR <" 8 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "MAX_EXPR <" 8 "optimized" } } */
+
+static inline unsigned
+umax1 (unsigned a, unsigned b)
+{
+  return a - ((a - b) & -(a < b));
+}
+
+static inline unsigned
+umin1 (unsigned a, unsigned b)
+{
+  return a - ((a - b) & -(a > b));
+}
+
+static inline int
+smax1 (int a, int b)
+{
+  return a - ((a - b) & -(a < b));
+}
+
+static inline int
+smin1 (int a, int b)
+{
+  return a - ((a - b) & -(a > b));
+}
+
+static inline unsigned long long
+umax2 (unsigned long long a, unsigned long long b)
+{
+  return a - ((a - b) & -(a <= b));
+}
+
+static inline unsigned long long
+umin2 (unsigned long long a, unsigned long long b)
+{
+  return a - ((a - b) & -(a >= b));
+}
+
+static inline long long
+smax2 (long long a, long long b)
+{
+  return a - ((a - b) & -(a <= b));
+}
+
+static inline long long
+smin2 (long long a, long long b)
+{
+  return a - ((a - b) & -(a >= b));
+}
+
+static inline unsigned
+umax3 (unsigned a, unsigned b)
+{
+  return a + ((b - a) & -(a < b));
+}
+
+static inline unsigned
+umin3 (unsigned a, unsigned b)
+{
+  return a + ((b - a) & -(a > b));
+}
+
+static inline int
+smax3 (int a, int b)
+{
+  return a + ((b - a) & -(a < b));
+}
+
+static inline int
+smin3 (int a, int b)
+{
+  return a + ((b - a) & -(a > b));
+}
+
+static inline unsigned long long
+umax4 (unsigned long long a, unsigned long long b)
+{
+  return a + ((b - a) & -(a <= b));
+}
+
+static inline unsigned long long
+umin4 (unsigned long long a, unsigned long long b)
+{
+  return a + ((b - a) & -(a >= b));
+}
+
+static inline long long
+smax4 (long long a, long long b)
+{
+  return a + ((b - a) & -(a <= b));
+}
+
+static inline long long
+smin4 (long long a, long long b)
+{
+  return a + ((b - a) & -(a >= b));
+}
+
+void
+test (unsigned *x, int *y, unsigned long long *z, long long *w)
+{
+  x[2] = umax1 (x[0], x[1]);
+  x[5] = umin1 (x[2], x[3]);
+  y[2] = smax1 (y[0], y[1]);
+  y[5] = smin1 (y[2], y[3]);
+  z[2] = umax2 (z[0], z[1]);
+  z[5] = umin2 (z[2], z[3]);
+  w[2] = smax2 (w[0], w[1]);
+  w[5] = smin2 (w[2], w[3]);
+  x[8] = umax3 (x[6], x[7]);
+  x[11] = umin3 (x[9], x[10]);
+  y[8] = smax3 (y[6], y[7]);
+  y[11] = smin3 (y[9], y[10]);
+  z[8] = umax4 (z[6], z[7]);
+  z[11] = umin4 (z[9], z[10]);
+  w[8] = smax4 (w[6], w[7]);
+  w[11] = smin4 (w[9], w[10]);
+}