
match.pd: Use ranges to optimize some x * y / y to x [PR97997]

Message ID 20201126085205.GR3788@tucnak
State New
Series match.pd: Use ranges to optimize some x * y / y to x [PR97997]

Commit Message

Jakub Jelinek Nov. 26, 2020, 8:52 a.m. UTC
Hi!

For signed integers with undefined overflow we already optimize x * y / y
into x, but for signed integers with -fwrapv or unsigned integers we don't.
The following patch allows optimizing that into just x if value ranges
prove that x * y will never overflow.
It uses the global SSA_NAME_RANGE_INFO only, because, as mentioned
in another PR, we don't currently have a way to tell the ranger from match.pd
which use stmt we are at (and in that case we'd need to tell the ranger to only
follow SSA_NAME_DEF_STMTs + SSA_NAME_RANGE_INFO and never go in the other
direction, as following immediate uses seems forbidden in match.pd).
Another possibility would be to optimize this during vrp, but on the
other hand the optimization itself is match.pd-ish.
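
For example (a variant of f6 from the new pr97997-1.c test below; the
function name and the range comments are added here purely for illustration,
they are not part of the patch):

unsigned int
f (unsigned int x, unsigned int y)
{
  if (x >= 30000)
    __builtin_unreachable ();	/* global range of x: [0, 29999] */
  if (y >= ~0U / 30000)
    __builtin_unreachable ();	/* global range of y: [0, 143164] */
  /* 29999 * 143164 < 2^32, so x * y cannot wrap and the division folds to x.  */
  return x * y / y;
}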

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2020-11-26  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/97997
	* match.pd ((t * 2) / 2) -> t): Optimize even for defined
	overflow if ranges prove there is no overflow.

	* gcc.dg/tree-ssa/pr97997-1.c: New test.
	* gcc.dg/tree-ssa/pr97997-2.c: New test.


	Jakub

Comments

Richard Biener Nov. 26, 2020, 9:24 a.m. UTC | #1
On Thu, 26 Nov 2020, Jakub Jelinek wrote:

> Hi!
> 
> For signed integers with undefined overflow we already optimize x * y / y
> into x, but for signed integers with -fwrapv or unsigned integers we don't.
> The following patch allows optimizing that into just x if value ranges
> prove that x * y will never overflow.
> It uses the global SSA_NAME_RANGE_INFO only, because like mentioned
> in another PR we don't currently have a way to tell the ranger from match.pd
> the use stmt (and we'd need in that case to tell ranger to only follow
> SSA_NAME_DEF_STMTs + SSA_NAME_RANGE_INFO and never go in the other
> direction, as following immediate uses seems forbidden in match.pd).
> Another possibility would be to optimize this during vrp, but on the
> other side the optimization itself is match.pd-ish.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

Hmm, can't you match

>    (div (mult@3:c @0 @1) @1)

and then look at the range of @3 directly?


> 2020-11-26  Jakub Jelinek  <jakub@redhat.com>
> 
> 	PR tree-optimization/97997
> 	* match.pd ((t * 2) / 2) -> t): Optimize even for defined
> 	overflow if ranges prove there is no overflow.
> 
> 	* gcc.dg/tree-ssa/pr97997-1.c: New test.
> 	* gcc.dg/tree-ssa/pr97997-2.c: New test.
> 
> --- gcc/match.pd.jj	2020-11-25 16:58:33.840370571 +0100
> +++ gcc/match.pd	2020-11-25 21:45:07.487050919 +0100
> @@ -650,9 +650,48 @@ (define_operator_list COND_TERNARY
>  (for div (trunc_div ceil_div floor_div round_div exact_div)
>   (simplify
>    (div (mult:c @0 @1) @1)
> -  (if (ANY_INTEGRAL_TYPE_P (type)
> -       && TYPE_OVERFLOW_UNDEFINED (type))
> -   @0)))
> +  (if (ANY_INTEGRAL_TYPE_P (type))
> +   (if (TYPE_OVERFLOW_UNDEFINED (type))
> +    @0
> +#if GIMPLE
> +    (if (TREE_CODE (@0) == SSA_NAME
> +	 && (TREE_CODE (@1) == SSA_NAME || TREE_CODE (@1) == INTEGER_CST))
> +     (with
> +      {
> +	bool overflowed = true;
> +	wide_int wmin0, wmax0;
> +	if (get_range_info (@0, &wmin0, &wmax0) == VR_RANGE)
> +	  {
> +	    /* If the multiplication can't overflow/wrap around, then
> +	       it can be optimized too.  */
> +	    wide_int wmin1, wmax1;
> +	    wi::overflow_type min_ovf, max_ovf;
> +	    if (TREE_CODE (@1) == INTEGER_CST)
> +	      {
> +		wmin1 = wi::to_wide (@1);
> +		wi::mul (wmin0, wmin1, TYPE_SIGN (type), &min_ovf);
> +		wi::mul (wmax0, wmin1, TYPE_SIGN (type), &max_ovf);
> +		if (min_ovf == wi::OVF_NONE && max_ovf == wi::OVF_NONE)
> +		  overflowed = false;
> +	      }
> +	    else if (get_range_info (@1, &wmin1, &wmax1) == VR_RANGE)
> +	      {
> +		wi::mul (wmin0, wmin1, TYPE_SIGN (type), &min_ovf);
> +		wi::mul (wmax0, wmax1, TYPE_SIGN (type), &max_ovf);
> +		if (min_ovf == wi::OVF_NONE && max_ovf == wi::OVF_NONE)
> +		  {
> +		    wi::mul (wmin0, wmax1, TYPE_SIGN (type), &min_ovf);
> +		    wi::mul (wmax0, wmin1, TYPE_SIGN (type), &max_ovf);
> +		    if (min_ovf == wi::OVF_NONE && max_ovf == wi::OVF_NONE)
> +		      overflowed = false;
> +		  }
> +	      }
> +	  }
> +      }
> +     (if (!overflowed)
> +      @0)))
> +#endif
> +   ))))
>  
>  (for op (negate abs)
>   /* Simplify cos(-x) and cos(|x|) -> cos(x).  Similarly for cosh.  */
> --- gcc/testsuite/gcc.dg/tree-ssa/pr97997-1.c.jj	2020-11-25 21:54:29.745748747 +0100
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr97997-1.c	2020-11-25 21:59:01.428703547 +0100
> @@ -0,0 +1,52 @@
> +/* PR tree-optimization/97997 */
> +/* { dg-do compile { target { ilp32 || lp64 } } } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump-times "return x_\[0-9]*\\\(D\\\);" 6 "optimized" } } */
> +/* { dg-final { scan-tree-dump-not " / " "optimized" } } */
> +/* { dg-final { scan-tree-dump-not " \\* " "optimized" } } */
> +
> +unsigned short
> +f1 (unsigned short x)
> +{
> +  return x * 10 / 10;
> +}
> +
> +unsigned short
> +f2 (unsigned short x)
> +{
> +  int a = x;
> +  int b = 10;
> +  int c = 10;
> +  return a * b / c;
> +}
> +
> +unsigned short
> +f3 (unsigned short x)
> +{
> +  return x * 10U / 10;
> +}
> +
> +unsigned short
> +f4 (unsigned short x)
> +{
> +  unsigned a = x;
> +  unsigned b = 10;
> +  unsigned c = 10;
> +  return a * b / c;
> +}
> +
> +unsigned short
> +f5 (unsigned short x, unsigned short y)
> +{
> +  return (unsigned) x * y / y;
> +}
> +
> +unsigned int
> +f6 (unsigned int x, unsigned int y)
> +{
> +  if (x >= 30000)
> +    __builtin_unreachable ();
> +  if (y >= ~0U / 30000)
> +    __builtin_unreachable ();
> +  return x * y / y;
> +}
> --- gcc/testsuite/gcc.dg/tree-ssa/pr97997-2.c.jj	2020-11-25 21:55:34.322024930 +0100
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr97997-2.c	2020-11-25 21:59:08.570623495 +0100
> @@ -0,0 +1,41 @@
> +/* PR tree-optimization/97997 */
> +/* { dg-do compile { target { ilp32 || lp64 } } } */
> +/* { dg-options "-O2 -fdump-tree-optimized -fwrapv" } */
> +/* { dg-final { scan-tree-dump-times "return x_\[0-9]*\\\(D\\\);" 4 "optimized" } } */
> +/* { dg-final { scan-tree-dump-not " / " "optimized" } } */
> +/* { dg-final { scan-tree-dump-not " \\* " "optimized" } } */
> +
> +unsigned short
> +f1 (unsigned short x)
> +{
> +  return x * 10 / 10;
> +}
> +
> +unsigned short
> +f2 (unsigned short x)
> +{
> +  int a = x;
> +  int b = 10;
> +  int c = 10;
> +  return a * b / c;
> +}
> +
> +short
> +f3 (short x, short y)
> +{
> +  return x * y / y;
> +}
> +
> +int
> +f4 (int x, int y)
> +{
> +  if (x >= 30000)
> +    __builtin_unreachable ();
> +  if (x <= -30000)
> +    __builtin_unreachable ();
> +  if (y >= __INT_MAX__ / 30000)
> +    __builtin_unreachable ();
> +  if (y <= -__INT_MAX__ / 30000)
> +    __builtin_unreachable ();
> +  return x * y / y;
> +}
> 
> 	Jakub
> 
>
Jakub Jelinek Nov. 26, 2020, 9:36 a.m. UTC | #2
On Thu, Nov 26, 2020 at 09:24:30AM +0000, Richard Biener wrote:
> > For signed integers with undefined overflow we already optimize x * y / y
> > into x, but for signed integers with -fwrapv or unsigned integers we don't.
> > The following patch allows optimizing that into just x if value ranges
> > prove that x * y will never overflow.
> > It uses the global SSA_NAME_RANGE_INFO only, because like mentioned
> > in another PR we don't currently have a way to tell the ranger from match.pd
> > the use stmt (and we'd need in that case to tell ranger to only follow
> > SSA_NAME_DEF_STMTs + SSA_NAME_RANGE_INFO and never go in the other
> > direction, as following immediate uses seems forbidden in match.pd).
> > Another possibility would be to optimize this during vrp, but on the
> > other side the optimization itself is match.pd-ish.
> > 
> > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> 
> Hmm, can't you match
> 
> >    (div (mult@3:c @0 @1) @1)
> 
> and then look at the range of @3 directly?

No.  I need to check whether the multiplication can never overflow,
i.e. I need the range of @3 in infinite precision (for a multiplication
done in twice the precision of the operands, given that the arguments and
the result have the same precision, which is the case for the IL MULT_EXPR),
while the range of @3 is the one obtained after wrapping into @3's precision.
So e.g. the multiplication could always overflow, yet the range
wouldn't be VARYING: consider
@0 in [0x4000000, 0x40000ff] and @1 == 64; the infinite precision
range would be [0x100000000, 0x100003fc0], but the @3 range is
[0, 0x3fc0], etc.  What I need to check is essentially that
__builtin_mul_overflow_p (@0, @1, (typeof (@0)) 0) folds to constant 0.
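
For illustration (not from the thread), a stand-alone program that checks
these numbers with GCC's __builtin_mul_overflow, assuming a 32-bit unsigned
type: every multiplication in the example range overflows, yet the wrapped
results all land in [0, 0x3fc0], so the range of @3 alone cannot tell
whether overflow happened.

#include <cstdio>
#include <cstdint>

int
main ()
{
  uint32_t wmin = UINT32_MAX, wmax = 0;
  unsigned overflows = 0;
  for (uint32_t x = 0x4000000; x <= 0x40000ff; x++)
    {
      uint32_t r;
      /* True iff x * 64 does not fit in uint32_t; r gets the wrapped value.  */
      if (__builtin_mul_overflow (x, 64u, &r))
	overflows++;
      if (r < wmin) wmin = r;
      if (r > wmax) wmax = r;
    }
  /* Prints: 256 of 256 overflow; wrapped range [0x0, 0x3fc0]  */
  printf ("%u of 256 overflow; wrapped range [0x%x, 0x%x]\n",
	  overflows, (unsigned) wmin, (unsigned) wmax);
  return 0;
}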

	Jakub
Richard Biener Nov. 26, 2020, 1:36 p.m. UTC | #3
On Thu, 26 Nov 2020, Jakub Jelinek wrote:

> On Thu, Nov 26, 2020 at 09:24:30AM +0000, Richard Biener wrote:
> > > For signed integers with undefined overflow we already optimize x * y / y
> > > into x, but for signed integers with -fwrapv or unsigned integers we don't.
> > > The following patch allows optimizing that into just x if value ranges
> > > prove that x * y will never overflow.
> > > It uses the global SSA_NAME_RANGE_INFO only, because like mentioned
> > > in another PR we don't currently have a way to tell the ranger from match.pd
> > > the use stmt (and we'd need in that case to tell ranger to only follow
> > > SSA_NAME_DEF_STMTs + SSA_NAME_RANGE_INFO and never go in the other
> > > direction, as following immediate uses seems forbidden in match.pd).
> > > Another possibility would be to optimize this during vrp, but on the
> > > other side the optimization itself is match.pd-ish.
> > > 
> > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> > 
> > Hmm, can't you match
> > 
> > >    (div (mult@3:c @0 @1) @1)
> > 
> > and then look at the range of @3 directly?
> 
> No.  I need to check whether the multiplication will never overflow,
> i.e. I need @3 range in the infinite precision (for multiplication
> twice as large precision as the multiplication (when the arguments and
> result have the same precision, which is the case for IL MULT_EXPR)),
> while the @3 precision is that after wrapping it into the @3's precision.
> So, e.g. the multiplication could always overflow, yet the range
> wouldn't be VARYING, consider
> @0 in [0x4000000, 0x40000ff] and @1 64, then the infinite precision
> range would be [0x100000000, 0x100003fc0], but @3 range is
> [0, 0x3fc0], etc.  What I need to check is essentially that
> __builtin_mul_overflow_p (@0, @1, (typeof (@0)) 0) folds to constant 0.

Oops, yes.  The patch is OK.

Thanks,
Richard.
Andrew MacLeod Nov. 30, 2020, 3:28 p.m. UTC | #4
On 11/26/20 3:52 AM, Jakub Jelinek wrote:
> Hi!
>
> For signed integers with undefined overflow we already optimize x * y / y
> into x, but for signed integers with -fwrapv or unsigned integers we don't.
> The following patch allows optimizing that into just x if value ranges
> prove that x * y will never overflow.
> It uses the global SSA_NAME_RANGE_INFO only, because like mentioned
> in another PR we don't currently have a way to tell the ranger from match.pd
> the use stmt (and we'd need in that case to tell ranger to only follow
> SSA_NAME_DEF_STMTs + SSA_NAME_RANGE_INFO and never go in the other
> direction, as following immediate uses seems forbidden in match.pd).

As an FYI, ranger only uses immediate-uses to try to track non-null
pointer references, so
    a) if it's not a pointer, it'll never follow immediate uses, and
    b) we can look at disabling that functionality as needed, or better
yet, replace the non-null pointer processing facility so that it eventually
does not need immediate_uses.  I will add that to the worklist.

Andrew
Marc Glisse Dec. 6, 2020, 9:44 a.m. UTC | #5
On Thu, 26 Nov 2020, Jakub Jelinek via Gcc-patches wrote:

> For signed integers with undefined overflow we already optimize x * y / y
> into x, but for signed integers with -fwrapv or unsigned integers we don't.
> The following patch allows optimizing that into just x if value ranges
> prove that x * y will never overflow.

I've long wanted a helper that checks whether VRP thinks an operation could
overflow; I think at some point it would make sense to move this code to
some function so that it can be easily reused.  Maybe also define a matcher
so we can write (mult_noovf @0 @1), which would succeed if either overflow
is undefined or VRP can prove that no overflow happens.

Of course that's all ideas for later; refactoring belongs in the second or
third patch using a feature, not the first one :-)
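
A rough sketch (not part of the patch or the thread) of the kind of reusable
helper Marc describes, extracted from the wide_int logic in the patch below;
the function name and placement are made up.  Since the product over a
rectangular range attains its extremes at the corners, it is enough that none
of the four corner products overflows:

/* Hypothetical helper: return true iff the global ranges of OP0 and OP1
   prove that OP0 * OP1 cannot overflow/wrap in TYPE, mirroring the
   wide_int checks in the patch.  */

static bool
range_mult_never_overflows_p (tree type, tree op0, tree op1)
{
  if (TYPE_OVERFLOW_UNDEFINED (type))
    return true;
  if (TREE_CODE (op0) != SSA_NAME
      || (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op1) != INTEGER_CST))
    return false;

  wide_int wmin0, wmax0, wmin1, wmax1;
  if (get_range_info (op0, &wmin0, &wmax0) != VR_RANGE)
    return false;
  if (TREE_CODE (op1) == INTEGER_CST)
    wmin1 = wmax1 = wi::to_wide (op1);
  else if (get_range_info (op1, &wmin1, &wmax1) != VR_RANGE)
    return false;

  /* Check the four corner products, as the patch does.  */
  const wide_int bounds0[2] = { wmin0, wmax0 };
  const wide_int bounds1[2] = { wmin1, wmax1 };
  for (int i = 0; i < 2; i++)
    for (int j = 0; j < 2; j++)
      {
	wi::overflow_type ovf;
	wi::mul (bounds0[i], bounds1[j], TYPE_SIGN (type), &ovf);
	if (ovf != wi::OVF_NONE)
	  return false;
      }
  return true;
}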

Patch

--- gcc/match.pd.jj	2020-11-25 16:58:33.840370571 +0100
+++ gcc/match.pd	2020-11-25 21:45:07.487050919 +0100
@@ -650,9 +650,48 @@  (define_operator_list COND_TERNARY
 (for div (trunc_div ceil_div floor_div round_div exact_div)
  (simplify
   (div (mult:c @0 @1) @1)
-  (if (ANY_INTEGRAL_TYPE_P (type)
-       && TYPE_OVERFLOW_UNDEFINED (type))
-   @0)))
+  (if (ANY_INTEGRAL_TYPE_P (type))
+   (if (TYPE_OVERFLOW_UNDEFINED (type))
+    @0
+#if GIMPLE
+    (if (TREE_CODE (@0) == SSA_NAME
+	 && (TREE_CODE (@1) == SSA_NAME || TREE_CODE (@1) == INTEGER_CST))
+     (with
+      {
+	bool overflowed = true;
+	wide_int wmin0, wmax0;
+	if (get_range_info (@0, &wmin0, &wmax0) == VR_RANGE)
+	  {
+	    /* If the multiplication can't overflow/wrap around, then
+	       it can be optimized too.  */
+	    wide_int wmin1, wmax1;
+	    wi::overflow_type min_ovf, max_ovf;
+	    if (TREE_CODE (@1) == INTEGER_CST)
+	      {
+		wmin1 = wi::to_wide (@1);
+		wi::mul (wmin0, wmin1, TYPE_SIGN (type), &min_ovf);
+		wi::mul (wmax0, wmin1, TYPE_SIGN (type), &max_ovf);
+		if (min_ovf == wi::OVF_NONE && max_ovf == wi::OVF_NONE)
+		  overflowed = false;
+	      }
+	    else if (get_range_info (@1, &wmin1, &wmax1) == VR_RANGE)
+	      {
+		wi::mul (wmin0, wmin1, TYPE_SIGN (type), &min_ovf);
+		wi::mul (wmax0, wmax1, TYPE_SIGN (type), &max_ovf);
+		if (min_ovf == wi::OVF_NONE && max_ovf == wi::OVF_NONE)
+		  {
+		    wi::mul (wmin0, wmax1, TYPE_SIGN (type), &min_ovf);
+		    wi::mul (wmax0, wmin1, TYPE_SIGN (type), &max_ovf);
+		    if (min_ovf == wi::OVF_NONE && max_ovf == wi::OVF_NONE)
+		      overflowed = false;
+		  }
+	      }
+	  }
+      }
+     (if (!overflowed)
+      @0)))
+#endif
+   ))))
 
 (for op (negate abs)
  /* Simplify cos(-x) and cos(|x|) -> cos(x).  Similarly for cosh.  */
--- gcc/testsuite/gcc.dg/tree-ssa/pr97997-1.c.jj	2020-11-25 21:54:29.745748747 +0100
+++ gcc/testsuite/gcc.dg/tree-ssa/pr97997-1.c	2020-11-25 21:59:01.428703547 +0100
@@ -0,0 +1,52 @@ 
+/* PR tree-optimization/97997 */
+/* { dg-do compile { target { ilp32 || lp64 } } } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-times "return x_\[0-9]*\\\(D\\\);" 6 "optimized" } } */
+/* { dg-final { scan-tree-dump-not " / " "optimized" } } */
+/* { dg-final { scan-tree-dump-not " \\* " "optimized" } } */
+
+unsigned short
+f1 (unsigned short x)
+{
+  return x * 10 / 10;
+}
+
+unsigned short
+f2 (unsigned short x)
+{
+  int a = x;
+  int b = 10;
+  int c = 10;
+  return a * b / c;
+}
+
+unsigned short
+f3 (unsigned short x)
+{
+  return x * 10U / 10;
+}
+
+unsigned short
+f4 (unsigned short x)
+{
+  unsigned a = x;
+  unsigned b = 10;
+  unsigned c = 10;
+  return a * b / c;
+}
+
+unsigned short
+f5 (unsigned short x, unsigned short y)
+{
+  return (unsigned) x * y / y;
+}
+
+unsigned int
+f6 (unsigned int x, unsigned int y)
+{
+  if (x >= 30000)
+    __builtin_unreachable ();
+  if (y >= ~0U / 30000)
+    __builtin_unreachable ();
+  return x * y / y;
+}
--- gcc/testsuite/gcc.dg/tree-ssa/pr97997-2.c.jj	2020-11-25 21:55:34.322024930 +0100
+++ gcc/testsuite/gcc.dg/tree-ssa/pr97997-2.c	2020-11-25 21:59:08.570623495 +0100
@@ -0,0 +1,41 @@ 
+/* PR tree-optimization/97997 */
+/* { dg-do compile { target { ilp32 || lp64 } } } */
+/* { dg-options "-O2 -fdump-tree-optimized -fwrapv" } */
+/* { dg-final { scan-tree-dump-times "return x_\[0-9]*\\\(D\\\);" 4 "optimized" } } */
+/* { dg-final { scan-tree-dump-not " / " "optimized" } } */
+/* { dg-final { scan-tree-dump-not " \\* " "optimized" } } */
+
+unsigned short
+f1 (unsigned short x)
+{
+  return x * 10 / 10;
+}
+
+unsigned short
+f2 (unsigned short x)
+{
+  int a = x;
+  int b = 10;
+  int c = 10;
+  return a * b / c;
+}
+
+short
+f3 (short x, short y)
+{
+  return x * y / y;
+}
+
+int
+f4 (int x, int y)
+{
+  if (x >= 30000)
+    __builtin_unreachable ();
+  if (x <= -30000)
+    __builtin_unreachable ();
+  if (y >= __INT_MAX__ / 30000)
+    __builtin_unreachable ();
+  if (y <= -__INT_MAX__ / 30000)
+    __builtin_unreachable ();
+  return x * y / y;
+}