Improve rotate fold-const pattern matching (PR target/82498)

Message ID 20171012194250.GC14653@tucnak
State New
Headers show
Series
  • Improve rotate fold-const pattern matching (PR target/82498)
Related show

Commit Message

Jakub Jelinek Oct. 12, 2017, 7:42 p.m.
Hi!

Marc in the PR mentioned that it is not really good that the recommended
rotate pattern is recognized only during forwprop1 and later, which is after
einline and that inlining or early opts could have changed stuff too much so
that we wouldn't recognize it anymore.

The following patch handles that pattern in fold_binary_loc too, and while
I've touched it, it cleans a lot of ugliness/duplication in that code.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2017-10-12  Jakub Jelinek  <jakub@redhat.com>

	PR target/82498
	* fold-const.c (fold_binary_loc) <bit_rotate>: Code cleanups,
	instead of handling MINUS_EXPR twice (once for each argument),
	canonicalize operand order and handle just once, use rtype where
	possible.  Handle (A << B) | (A >> (-B & (Z - 1))).

	* gcc.dg/tree-ssa/pr82498.c: New test.


	Jakub

Comments

Richard Biener Oct. 13, 2017, 7:10 a.m. | #1
On Thu, 12 Oct 2017, Jakub Jelinek wrote:

> Hi!
> 
> Marc in the PR mentioned that it is not really good that the recommended
> rotate pattern is recognized only during forwprop1 and later, which is after
> einline and that inlining or early opts could have changed stuff too much so
> that we wouldn't recogize it anymore.

Hmm, but the only thing functions see is inlining early optimized bodies
into them and then constant propagation performed, so I don't see how
we could confuse the pattern in a way to be undetectable.  Also
early inlining is performed on early optimized bodies so cost metrics
see rotates, not the unrecognized form.

> The following patch handles that pattern in fold_binary_loc too, and while
> I've touched it, it cleans a lot of ugliness/duplication in that code.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

Still looks like an improvement, thus ok.

Thanks,
Richard.

> 2017-10-12  Jakub Jelinek  <jakub@redhat.com>
> 
> 	PR target/82498
> 	* fold-const.c (fold_binary_loc) <bit_rotate>: Code cleanups,
> 	instead of handling MINUS_EXPR twice (once for each argument),
> 	canonicalize operand order and handle just once, use rtype where
> 	possible.  Handle (A << B) | (A >> (-B & (Z - 1))).
> 
> 	* gcc.dg/tree-ssa/pr82498.c: New test.
> 
> --- gcc/fold-const.c.jj	2017-10-11 22:37:51.000000000 +0200
> +++ gcc/fold-const.c	2017-10-12 13:17:45.360589554 +0200
> @@ -9429,7 +9429,10 @@ fold_binary_loc (location_t loc,
>        /* (A << C1) + (A >> C2) if A is unsigned and C1+C2 is the size of A
>  	 is a rotate of A by C1 bits.  */
>        /* (A << B) + (A >> (Z - B)) if A is unsigned and Z is the size of A
> -	 is a rotate of A by B bits.  */
> +	 is a rotate of A by B bits.
> +	 Similarly for (A << B) | (A >> (-B & C3)) where C3 is Z-1,
> +	 though in this case CODE must be | and not + or ^, otherwise
> +	 it doesn't return A when B is 0.  */
>        {
>  	enum tree_code code0, code1;
>  	tree rtype;
> @@ -9447,25 +9450,32 @@ fold_binary_loc (location_t loc,
>  		== GET_MODE_UNIT_PRECISION (TYPE_MODE (rtype))))
>  	  {
>  	    tree tree01, tree11;
> +	    tree orig_tree01, orig_tree11;
>  	    enum tree_code code01, code11;
>  
> -	    tree01 = TREE_OPERAND (arg0, 1);
> -	    tree11 = TREE_OPERAND (arg1, 1);
> +	    tree01 = orig_tree01 = TREE_OPERAND (arg0, 1);
> +	    tree11 = orig_tree11 = TREE_OPERAND (arg1, 1);
>  	    STRIP_NOPS (tree01);
>  	    STRIP_NOPS (tree11);
>  	    code01 = TREE_CODE (tree01);
>  	    code11 = TREE_CODE (tree11);
> +	    if (code11 != MINUS_EXPR
> +		&& (code01 == MINUS_EXPR || code01 == BIT_AND_EXPR))
> +	      {
> +		std::swap (code0, code1);
> +		std::swap (code01, code11);
> +		std::swap (tree01, tree11);
> +		std::swap (orig_tree01, orig_tree11);
> +	      }
>  	    if (code01 == INTEGER_CST
>  		&& code11 == INTEGER_CST
>  		&& (wi::to_widest (tree01) + wi::to_widest (tree11)
> -		    == element_precision (TREE_TYPE (TREE_OPERAND (arg0, 0)))))
> +		    == element_precision (rtype)))
>  	      {
>  		tem = build2_loc (loc, LROTATE_EXPR,
> -				  TREE_TYPE (TREE_OPERAND (arg0, 0)),
> -				  TREE_OPERAND (arg0, 0),
> +				  rtype, TREE_OPERAND (arg0, 0),
>  				  code0 == LSHIFT_EXPR
> -				  ? TREE_OPERAND (arg0, 1)
> -				  : TREE_OPERAND (arg1, 1));
> +				  ? orig_tree01 : orig_tree11);
>  		return fold_convert_loc (loc, type, tem);
>  	      }
>  	    else if (code11 == MINUS_EXPR)
> @@ -9477,39 +9487,37 @@ fold_binary_loc (location_t loc,
>  		STRIP_NOPS (tree111);
>  		if (TREE_CODE (tree110) == INTEGER_CST
>  		    && 0 == compare_tree_int (tree110,
> -					      element_precision
> -					      (TREE_TYPE (TREE_OPERAND
> -							  (arg0, 0))))
> +					      element_precision (rtype))
>  		    && operand_equal_p (tree01, tree111, 0))
> -		  return
> -		    fold_convert_loc (loc, type,
> -				      build2 ((code0 == LSHIFT_EXPR
> -					       ? LROTATE_EXPR
> -					       : RROTATE_EXPR),
> -					      TREE_TYPE (TREE_OPERAND (arg0, 0)),
> -					      TREE_OPERAND (arg0, 0),
> -					      TREE_OPERAND (arg0, 1)));
> +		  {
> +		    tem = build2_loc (loc, (code0 == LSHIFT_EXPR
> +					    ? LROTATE_EXPR : RROTATE_EXPR),
> +				      rtype, TREE_OPERAND (arg0, 0),
> +				      orig_tree01);
> +		    return fold_convert_loc (loc, type, tem);
> +		  }
>  	      }
> -	    else if (code01 == MINUS_EXPR)
> +	    else if (code == BIT_IOR_EXPR
> +		     && code11 == BIT_AND_EXPR
> +		     && pow2p_hwi (element_precision (rtype)))
>  	      {
> -		tree tree010, tree011;
> -		tree010 = TREE_OPERAND (tree01, 0);
> -		tree011 = TREE_OPERAND (tree01, 1);
> -		STRIP_NOPS (tree010);
> -		STRIP_NOPS (tree011);
> -		if (TREE_CODE (tree010) == INTEGER_CST
> -		    && 0 == compare_tree_int (tree010,
> -					      element_precision
> -					      (TREE_TYPE (TREE_OPERAND
> -							  (arg0, 0))))
> -		    && operand_equal_p (tree11, tree011, 0))
> -		    return fold_convert_loc
> -		      (loc, type,
> -		       build2 ((code0 != LSHIFT_EXPR
> -				? LROTATE_EXPR
> -				: RROTATE_EXPR),
> -			       TREE_TYPE (TREE_OPERAND (arg0, 0)),
> -			       TREE_OPERAND (arg0, 0), TREE_OPERAND (arg1, 1)));
> +		tree tree110, tree111;
> +		tree110 = TREE_OPERAND (tree11, 0);
> +		tree111 = TREE_OPERAND (tree11, 1);
> +		STRIP_NOPS (tree110);
> +		STRIP_NOPS (tree111);
> +		if (TREE_CODE (tree110) == NEGATE_EXPR
> +		    && TREE_CODE (tree111) == INTEGER_CST
> +		    && 0 == compare_tree_int (tree111,
> +					      element_precision (rtype) - 1)
> +		    && operand_equal_p (tree01, TREE_OPERAND (tree110, 0), 0))
> +		  {
> +		    tem = build2_loc (loc, (code0 == LSHIFT_EXPR
> +					    ? LROTATE_EXPR : RROTATE_EXPR),
> +				      rtype, TREE_OPERAND (arg0, 0),
> +				      orig_tree01);
> +		    return fold_convert_loc (loc, type, tem);
> +		  }
>  	      }
>  	  }
>        }
> --- gcc/testsuite/gcc.dg/tree-ssa/pr82498.c.jj	2017-10-12 13:14:27.234991970 +0200
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr82498.c	2017-10-12 13:13:45.000000000 +0200
> @@ -0,0 +1,53 @@
> +/* PR target/82498 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-original" } */
> +/* { dg-final { scan-tree-dump-times "x r<< y" 4 "original" { target int32 } } } */
> +/* { dg-final { scan-tree-dump-times "x r>> y" 4 "original" { target int32 } } } */
> +
> +unsigned
> +f1 (unsigned x, int y)
> +{
> +  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
> +}
> +
> +unsigned
> +f2 (unsigned x, int y)
> +{
> +  return (x << y) | (x >> (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1)));
> +}
> +
> +unsigned
> +f3 (unsigned x, int y)
> +{
> +  return (x >> y) | (x << (__CHAR_BIT__ * __SIZEOF_INT__ - y));
> +}
> +
> +unsigned
> +f4 (unsigned x, int y)
> +{
> +  return (x >> y) | (x << (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1)));
> +}
> +
> +unsigned
> +f5 (unsigned x, int y)
> +{
> +  return (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y)) | (x << y);
> +}
> +
> +unsigned
> +f6 (unsigned x, int y)
> +{
> +  return (x >> (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1))) | (x << y);
> +}
> +
> +unsigned
> +f7 (unsigned x, int y)
> +{
> +  return (x << (__CHAR_BIT__ * __SIZEOF_INT__ - y)) | (x >> y);
> +}
> +
> +unsigned
> +f8 (unsigned x, int y)
> +{
> +  return (x << (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1))) | (x >> y);
> +}
> 
> 	Jakub
> 
>
Marc Glisse Oct. 13, 2017, 9:10 a.m. | #2
On Fri, 13 Oct 2017, Richard Biener wrote:

> On Thu, 12 Oct 2017, Jakub Jelinek wrote:
>
>> Hi!
>>
>> Marc in the PR mentioned that it is not really good that the recommended
>> rotate pattern is recognized only during forwprop1 and later, which is after
>> einline and that inlining or early opts could have changed stuff too much so
>> that we wouldn't recogize it anymore.
>
> Hmm, but the only thing functions see is inlining early optimized bodies
> into them and then constant propagation performed, so I don't see how
> we could confuse the pattern in a way to be indetectable.  Also

For instance, in

unsigned f(unsigned x, int y) { return (x << y) | (x >> (-y & 31)); }
unsigned g(unsigned x) { return f(x << 2, 3); }

if we inline f into g before optimizing f, we might combine (x << 2) << 3 
into x << 5, which would make it harder to recognize the rotation. For 
some reason that's not happening, but similar scenarios seemed possible to 
me. But if the inliner calls early optimizations (including forwprop) on 
the callee before inlining it, then my comment is indeed nonsense.

> early inlining is performed on early optimized bodies so cost metrics
> see rotates, not the unrecognized form.

Ah, so there's a hidden forwprop pass in einline?

unsigned f(unsigned x, int y) {
   unsigned z = x << y;
   return z | (x >> (-y & 31));
}
unsigned g(unsigned x, int y) { return f(x, y); }

After einline, g has rotate but not f, that's rather confusing.
Richard Biener Oct. 13, 2017, 10:16 a.m. | #3
On Fri, 13 Oct 2017, Marc Glisse wrote:

> On Fri, 13 Oct 2017, Richard Biener wrote:
> 
> > On Thu, 12 Oct 2017, Jakub Jelinek wrote:
> > 
> > > Hi!
> > > 
> > > Marc in the PR mentioned that it is not really good that the recommended
> > > rotate pattern is recognized only during forwprop1 and later, which is
> > > after
> > > einline and that inlining or early opts could have changed stuff too much
> > > so
> > > that we wouldn't recogize it anymore.
> > 
> > Hmm, but the only thing functions see is inlining early optimized bodies
> > into them and then constant propagation performed, so I don't see how
> > we could confuse the pattern in a way to be indetectable.  Also
> 
> For instance, in
> 
> unsigned f(unsigned x, int y) { return (x << y) | (x >> (-y & 31)); }
> unsigned g(unsigned x) { return f(x << 2, 3); }
> 
> if we inline f into g before optimizing f, we might combine (x << 2) << 3 into
> x << 5, which would make it harder to recognize the rotation. For some reason
> that's not happening, but similar scenarios seemed possible to me. But if the
> inliner calls early optimizations (including forwprop) on the callee before
> inlining it, then my comment is indeed nonsense.

We are always inlining early optimized bodies (unless it is the recursion
edge in a cyclic callgraph).  So we're effectively re-optimizing any
code we inline as well.

> > early inlining is performed on early optimized bodies so cost metrics
> > see rotates, not the unrecognized form.
> 
> Ah, so there's a hidden forwprop pass in einline?

No.  The early optimization pipeline runs on each function separately
and the processing is done in an order that optimizes callees before
callers.  The first step is to perform inlining into the current
function, then we optimize the result.

> unsigned f(unsigned x, int y) {
>   unsigned z = x << y;
>   return z | (x >> (-y & 31));
> }
> unsigned g(unsigned x, int y) { return f(x, y); }
> 
> After einline, g has rotate but not f, that's rather confusing.

It indeed is ;)  Non-IPA pass dumps do not show function bodies
at a "single point in time".

Richard.

Patch

--- gcc/fold-const.c.jj	2017-10-11 22:37:51.000000000 +0200
+++ gcc/fold-const.c	2017-10-12 13:17:45.360589554 +0200
@@ -9429,7 +9429,10 @@  fold_binary_loc (location_t loc,
       /* (A << C1) + (A >> C2) if A is unsigned and C1+C2 is the size of A
 	 is a rotate of A by C1 bits.  */
       /* (A << B) + (A >> (Z - B)) if A is unsigned and Z is the size of A
-	 is a rotate of A by B bits.  */
+	 is a rotate of A by B bits.
+	 Similarly for (A << B) | (A >> (-B & C3)) where C3 is Z-1,
+	 though in this case CODE must be | and not + or ^, otherwise
+	 it doesn't return A when B is 0.  */
       {
 	enum tree_code code0, code1;
 	tree rtype;
@@ -9447,25 +9450,32 @@  fold_binary_loc (location_t loc,
 		== GET_MODE_UNIT_PRECISION (TYPE_MODE (rtype))))
 	  {
 	    tree tree01, tree11;
+	    tree orig_tree01, orig_tree11;
 	    enum tree_code code01, code11;
 
-	    tree01 = TREE_OPERAND (arg0, 1);
-	    tree11 = TREE_OPERAND (arg1, 1);
+	    tree01 = orig_tree01 = TREE_OPERAND (arg0, 1);
+	    tree11 = orig_tree11 = TREE_OPERAND (arg1, 1);
 	    STRIP_NOPS (tree01);
 	    STRIP_NOPS (tree11);
 	    code01 = TREE_CODE (tree01);
 	    code11 = TREE_CODE (tree11);
+	    if (code11 != MINUS_EXPR
+		&& (code01 == MINUS_EXPR || code01 == BIT_AND_EXPR))
+	      {
+		std::swap (code0, code1);
+		std::swap (code01, code11);
+		std::swap (tree01, tree11);
+		std::swap (orig_tree01, orig_tree11);
+	      }
 	    if (code01 == INTEGER_CST
 		&& code11 == INTEGER_CST
 		&& (wi::to_widest (tree01) + wi::to_widest (tree11)
-		    == element_precision (TREE_TYPE (TREE_OPERAND (arg0, 0)))))
+		    == element_precision (rtype)))
 	      {
 		tem = build2_loc (loc, LROTATE_EXPR,
-				  TREE_TYPE (TREE_OPERAND (arg0, 0)),
-				  TREE_OPERAND (arg0, 0),
+				  rtype, TREE_OPERAND (arg0, 0),
 				  code0 == LSHIFT_EXPR
-				  ? TREE_OPERAND (arg0, 1)
-				  : TREE_OPERAND (arg1, 1));
+				  ? orig_tree01 : orig_tree11);
 		return fold_convert_loc (loc, type, tem);
 	      }
 	    else if (code11 == MINUS_EXPR)
@@ -9477,39 +9487,37 @@  fold_binary_loc (location_t loc,
 		STRIP_NOPS (tree111);
 		if (TREE_CODE (tree110) == INTEGER_CST
 		    && 0 == compare_tree_int (tree110,
-					      element_precision
-					      (TREE_TYPE (TREE_OPERAND
-							  (arg0, 0))))
+					      element_precision (rtype))
 		    && operand_equal_p (tree01, tree111, 0))
-		  return
-		    fold_convert_loc (loc, type,
-				      build2 ((code0 == LSHIFT_EXPR
-					       ? LROTATE_EXPR
-					       : RROTATE_EXPR),
-					      TREE_TYPE (TREE_OPERAND (arg0, 0)),
-					      TREE_OPERAND (arg0, 0),
-					      TREE_OPERAND (arg0, 1)));
+		  {
+		    tem = build2_loc (loc, (code0 == LSHIFT_EXPR
+					    ? LROTATE_EXPR : RROTATE_EXPR),
+				      rtype, TREE_OPERAND (arg0, 0),
+				      orig_tree01);
+		    return fold_convert_loc (loc, type, tem);
+		  }
 	      }
-	    else if (code01 == MINUS_EXPR)
+	    else if (code == BIT_IOR_EXPR
+		     && code11 == BIT_AND_EXPR
+		     && pow2p_hwi (element_precision (rtype)))
 	      {
-		tree tree010, tree011;
-		tree010 = TREE_OPERAND (tree01, 0);
-		tree011 = TREE_OPERAND (tree01, 1);
-		STRIP_NOPS (tree010);
-		STRIP_NOPS (tree011);
-		if (TREE_CODE (tree010) == INTEGER_CST
-		    && 0 == compare_tree_int (tree010,
-					      element_precision
-					      (TREE_TYPE (TREE_OPERAND
-							  (arg0, 0))))
-		    && operand_equal_p (tree11, tree011, 0))
-		    return fold_convert_loc
-		      (loc, type,
-		       build2 ((code0 != LSHIFT_EXPR
-				? LROTATE_EXPR
-				: RROTATE_EXPR),
-			       TREE_TYPE (TREE_OPERAND (arg0, 0)),
-			       TREE_OPERAND (arg0, 0), TREE_OPERAND (arg1, 1)));
+		tree tree110, tree111;
+		tree110 = TREE_OPERAND (tree11, 0);
+		tree111 = TREE_OPERAND (tree11, 1);
+		STRIP_NOPS (tree110);
+		STRIP_NOPS (tree111);
+		if (TREE_CODE (tree110) == NEGATE_EXPR
+		    && TREE_CODE (tree111) == INTEGER_CST
+		    && 0 == compare_tree_int (tree111,
+					      element_precision (rtype) - 1)
+		    && operand_equal_p (tree01, TREE_OPERAND (tree110, 0), 0))
+		  {
+		    tem = build2_loc (loc, (code0 == LSHIFT_EXPR
+					    ? LROTATE_EXPR : RROTATE_EXPR),
+				      rtype, TREE_OPERAND (arg0, 0),
+				      orig_tree01);
+		    return fold_convert_loc (loc, type, tem);
+		  }
 	      }
 	  }
       }
--- gcc/testsuite/gcc.dg/tree-ssa/pr82498.c.jj	2017-10-12 13:14:27.234991970 +0200
+++ gcc/testsuite/gcc.dg/tree-ssa/pr82498.c	2017-10-12 13:13:45.000000000 +0200
@@ -0,0 +1,53 @@ 
+/* PR target/82498 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-original" } */
+/* { dg-final { scan-tree-dump-times "x r<< y" 4 "original" { target int32 } } } */
+/* { dg-final { scan-tree-dump-times "x r>> y" 4 "original" { target int32 } } } */
+
+unsigned
+f1 (unsigned x, int y)
+{
+  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
+}
+
+unsigned
+f2 (unsigned x, int y)
+{
+  return (x << y) | (x >> (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1)));
+}
+
+unsigned
+f3 (unsigned x, int y)
+{
+  return (x >> y) | (x << (__CHAR_BIT__ * __SIZEOF_INT__ - y));
+}
+
+unsigned
+f4 (unsigned x, int y)
+{
+  return (x >> y) | (x << (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1)));
+}
+
+unsigned
+f5 (unsigned x, int y)
+{
+  return (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y)) | (x << y);
+}
+
+unsigned
+f6 (unsigned x, int y)
+{
+  return (x >> (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1))) | (x << y);
+}
+
+unsigned
+f7 (unsigned x, int y)
+{
+  return (x << (__CHAR_BIT__ * __SIZEOF_INT__ - y)) | (x >> y);
+}
+
+unsigned
+f8 (unsigned x, int y)
+{
+  return (x << (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1))) | (x >> y);
+}