Optimize (A / (cast) (1 << B)) -> (A >> B) (PR middle-end/91680)
diff mbox series

Message ID 20190910074528.GZ2120@tucnak
State New
Headers show
Series
  • Optimize (A / (cast) (1 << B)) -> (A >> B) (PR middle-end/91680)
Related show

Commit Message

Jakub Jelinek Sept. 10, 2019, 7:45 a.m. UTC
Hi!

The following patch optimizes (A / (cast) (1 << B)) -> (A >> B)
in addition to already supported (A / (1 << B)) -> (A >> B).

The patch only supports same precision cast (i.e. a sign change),
or widening cast, in that case either zero extension (then the extended
value is known to be a power of two and all is fine), or sign extension
if the first operand has all upper bits starting with the sign bit of the
narrower type clear.  That is because (unsigned long long) (1 << 31)
is 0xffffffff80000000ULL and we need to make sure that dividing by that
huge value is equal to >> 31.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2019-09-10  Jakub Jelinek  <jakub@redhat.com>

	PR middle-end/91680
	* match.pd ((A / (1 << B)) -> (A >> B)): Allow widening cast from
	the shift type to type.

	* gcc.dg/tree-ssa/pr91680.c: New test.
	* g++.dg/torture/pr91680.C: New test.


	Jakub

Comments

Richard Biener Sept. 10, 2019, 8:06 a.m. UTC | #1
On September 10, 2019 9:45:28 AM GMT+02:00, Jakub Jelinek <jakub@redhat.com> wrote:
>Hi!
>
>The following patch optimizes (A / (cast) (1 << B)) -> (A >> B)
>in addition to already supported (A / (1 << B)) -> (A >> B).
>
>The patch only supports same precision cast (i.e. a sign change),
>or widening cast, in that case either zero extension (then the extended
>value is known to be a power of two and all is fine), or sign extension
>if the first operand has all upper bits starting with the sign bit of
>the
>narrower type clear.  That is because (unsigned long long) (1 << 31)
>is 0xffffffff80000000ULL and we need to make sure that dividing by that
>huge value is equal to >> 31.
>
>Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

Ok. 

Richard.

>2019-09-10  Jakub Jelinek  <jakub@redhat.com>
>
>	PR middle-end/91680
>	* match.pd ((A / (1 << B)) -> (A >> B)): Allow widening cast from
>	the shift type to type.
>
>	* gcc.dg/tree-ssa/pr91680.c: New test.
>	* g++.dg/torture/pr91680.C: New test.
>
>--- gcc/match.pd.jj	2019-09-09 17:33:08.551582878 +0200
>+++ gcc/match.pd	2019-09-09 20:05:25.966107441 +0200
>@@ -305,13 +305,29 @@ (define_operator_list COND_TERNARY
> /* (A / (1 << B)) -> (A >> B).
>   Only for unsigned A.  For signed A, this would not preserve rounding
>    toward zero.
>-   For example: (-1 / ( 1 << B)) !=  -1 >> B.  */
>+   For example: (-1 / ( 1 << B)) !=  -1 >> B.
>+   Also allow widening conversions, like:
>+   (A / (unsigned long long) (1U << B)) -> (A >> B)
>+   or
>+   (A / (unsigned long long) (1 << B)) -> (A >> B).
>+   If the left shift is signed, it can be done only if the upper bits
>+   of A starting from shift's type sign bit are zero, as
>+   (unsigned long long) (1 << 31) is -2147483648ULL, not
>2147483648ULL,
>+   so it is valid only if A >> 31 is zero.  */
> (simplify
>- (trunc_div @0 (lshift integer_onep@1 @2))
>+ (trunc_div @0 (convert? (lshift integer_onep@1 @2)))
>  (if ((TYPE_UNSIGNED (type) || tree_expr_nonnegative_p (@0))
>       && (!VECTOR_TYPE_P (type)
> 	  || target_supports_op_p (type, RSHIFT_EXPR, optab_vector)
>-	  || target_supports_op_p (type, RSHIFT_EXPR, optab_scalar)))
>+	  || target_supports_op_p (type, RSHIFT_EXPR, optab_scalar))
>+      && (useless_type_conversion_p (type, TREE_TYPE (@1))
>+	  || (element_precision (type) >= element_precision (TREE_TYPE (@1))
>+	      && (TYPE_UNSIGNED (TREE_TYPE (@1))
>+		  || (element_precision (type)
>+		      == element_precision (TREE_TYPE (@1)))
>+		  || (get_nonzero_bits (@0)
>+		      & wi::mask (element_precision (TREE_TYPE (@1)) - 1, true,
>+				  element_precision (type))) == 0))))
>   (rshift @0 @2)))
> 
> /* Preserve explicit divisions by 0: the C++ front-end wants to detect
>--- gcc/testsuite/gcc.dg/tree-ssa/pr91680.c.jj	2019-09-09
>20:18:04.349619827 +0200
>+++ gcc/testsuite/gcc.dg/tree-ssa/pr91680.c	2019-09-09
>20:18:33.922171856 +0200
>@@ -0,0 +1,37 @@
>+/* PR middle-end/91680 */
>+/* { dg-do compile { target { ilp32 || lp64 } } } */
>+/* { dg-options "-O2 -fdump-tree-forwprop1" } */
>+/* { dg-final { scan-tree-dump-times " / " 1 "forwprop1" } } */
>+/* { dg-final { scan-tree-dump-times " >> " 3 "forwprop1" } } */
>+
>+__attribute__((noipa)) unsigned long long
>+foo (unsigned char x)
>+{
>+  unsigned long long q = 1 << x;
>+  return 256 / q;
>+}
>+
>+__attribute__((noipa)) unsigned long long
>+bar (unsigned char x)
>+{
>+  unsigned long long q = 1U << x;
>+  return 256 / q;
>+}
>+
>+__attribute__((noipa)) unsigned long long
>+baz (unsigned char x, unsigned long long y)
>+{
>+  /* This can't be optimized, at least not in C++ and maybe not
>+     in C89, because for x == 31 q is -2147483648ULL, not
>+     2147483648ULL, and e.g. 2147483648ULL >> 31 is 1, while
>+     2147483648ULL / -2147483648ULL is 0.  */
>+  unsigned long long q = 1 << x;
>+  return y / q;
>+}
>+
>+__attribute__((noipa)) unsigned long long
>+qux (unsigned char x, unsigned long long y)
>+{
>+  unsigned long long q = 1U << x;
>+  return y / q;
>+}
>--- gcc/testsuite/g++.dg/torture/pr91680.C.jj	2019-09-09
>20:25:51.775539166 +0200
>+++ gcc/testsuite/g++.dg/torture/pr91680.C	2019-09-09
>20:26:18.610132667 +0200
>@@ -0,0 +1,35 @@
>+/* PR middle-end/91680 */
>+/* { dg-do run { target { ilp32 || lp64 } } } */
>+
>+extern "C" void abort ();
>+
>+#include "../../gcc.dg/tree-ssa/pr91680.c"
>+
>+int
>+main ()
>+{
>+  unsigned char i;
>+  for (i = 0; i < __SIZEOF_INT__ * __CHAR_BIT__; i++)
>+    {
>+      volatile unsigned long long q = 1 << i;
>+      if (foo (i) != 256 / q)
>+	abort ();
>+      q = 1U << i;
>+      if (bar (i) != 256 / q)
>+	abort ();
>+      q = 1 << i;
>+      if (baz (i, (1U << i) - 1) != ((1U << i) - 1) / q)
>+	abort ();
>+      if (baz (i, 1U << i) != (1U << i) / q)
>+	abort ();
>+      if (baz (i, -1) != -1 / q)
>+	abort ();
>+      q = 1U << i;
>+      if (qux (i, (1U << i) - 1) != ((1U << i) - 1) / q)
>+	abort ();
>+      if (qux (i, 1U << i) != (1U << i) / q)
>+	abort ();
>+      if (qux (i, -1) != -1 / q)
>+	abort ();
>+    }
>+}
>
>	Jakub

Patch
diff mbox series

--- gcc/match.pd.jj	2019-09-09 17:33:08.551582878 +0200
+++ gcc/match.pd	2019-09-09 20:05:25.966107441 +0200
@@ -305,13 +305,29 @@  (define_operator_list COND_TERNARY
 /* (A / (1 << B)) -> (A >> B).
    Only for unsigned A.  For signed A, this would not preserve rounding
    toward zero.
-   For example: (-1 / ( 1 << B)) !=  -1 >> B.  */
+   For example: (-1 / ( 1 << B)) !=  -1 >> B.
+   Also allow widening conversions, like:
+   (A / (unsigned long long) (1U << B)) -> (A >> B)
+   or
+   (A / (unsigned long long) (1 << B)) -> (A >> B).
+   If the left shift is signed, it can be done only if the upper bits
+   of A starting from shift's type sign bit are zero, as
+   (unsigned long long) (1 << 31) is -2147483648ULL, not 2147483648ULL,
+   so it is valid only if A >> 31 is zero.  */
 (simplify
- (trunc_div @0 (lshift integer_onep@1 @2))
+ (trunc_div @0 (convert? (lshift integer_onep@1 @2)))
  (if ((TYPE_UNSIGNED (type) || tree_expr_nonnegative_p (@0))
       && (!VECTOR_TYPE_P (type)
 	  || target_supports_op_p (type, RSHIFT_EXPR, optab_vector)
-	  || target_supports_op_p (type, RSHIFT_EXPR, optab_scalar)))
+	  || target_supports_op_p (type, RSHIFT_EXPR, optab_scalar))
+      && (useless_type_conversion_p (type, TREE_TYPE (@1))
+	  || (element_precision (type) >= element_precision (TREE_TYPE (@1))
+	      && (TYPE_UNSIGNED (TREE_TYPE (@1))
+		  || (element_precision (type)
+		      == element_precision (TREE_TYPE (@1)))
+		  || (get_nonzero_bits (@0)
+		      & wi::mask (element_precision (TREE_TYPE (@1)) - 1, true,
+				  element_precision (type))) == 0))))
   (rshift @0 @2)))
 
 /* Preserve explicit divisions by 0: the C++ front-end wants to detect
--- gcc/testsuite/gcc.dg/tree-ssa/pr91680.c.jj	2019-09-09 20:18:04.349619827 +0200
+++ gcc/testsuite/gcc.dg/tree-ssa/pr91680.c	2019-09-09 20:18:33.922171856 +0200
@@ -0,0 +1,37 @@ 
+/* PR middle-end/91680 */
+/* { dg-do compile { target { ilp32 || lp64 } } } */
+/* { dg-options "-O2 -fdump-tree-forwprop1" } */
+/* { dg-final { scan-tree-dump-times " / " 1 "forwprop1" } } */
+/* { dg-final { scan-tree-dump-times " >> " 3 "forwprop1" } } */
+
+__attribute__((noipa)) unsigned long long
+foo (unsigned char x)
+{
+  unsigned long long q = 1 << x;
+  return 256 / q;
+}
+
+__attribute__((noipa)) unsigned long long
+bar (unsigned char x)
+{
+  unsigned long long q = 1U << x;
+  return 256 / q;
+}
+
+__attribute__((noipa)) unsigned long long
+baz (unsigned char x, unsigned long long y)
+{
+  /* This can't be optimized, at least not in C++ and maybe not
+     in C89, because for x == 31 q is -2147483648ULL, not
+     2147483648ULL, and e.g. 2147483648ULL >> 31 is 1, while
+     2147483648ULL / -2147483648ULL is 0.  */
+  unsigned long long q = 1 << x;
+  return y / q;
+}
+
+__attribute__((noipa)) unsigned long long
+qux (unsigned char x, unsigned long long y)
+{
+  unsigned long long q = 1U << x;
+  return y / q;
+}
--- gcc/testsuite/g++.dg/torture/pr91680.C.jj	2019-09-09 20:25:51.775539166 +0200
+++ gcc/testsuite/g++.dg/torture/pr91680.C	2019-09-09 20:26:18.610132667 +0200
@@ -0,0 +1,35 @@ 
+/* PR middle-end/91680 */
+/* { dg-do run { target { ilp32 || lp64 } } } */
+
+extern "C" void abort ();
+
+#include "../../gcc.dg/tree-ssa/pr91680.c"
+
+int
+main ()
+{
+  unsigned char i;
+  for (i = 0; i < __SIZEOF_INT__ * __CHAR_BIT__; i++)
+    {
+      volatile unsigned long long q = 1 << i;
+      if (foo (i) != 256 / q)
+	abort ();
+      q = 1U << i;
+      if (bar (i) != 256 / q)
+	abort ();
+      q = 1 << i;
+      if (baz (i, (1U << i) - 1) != ((1U << i) - 1) / q)
+	abort ();
+      if (baz (i, 1U << i) != (1U << i) / q)
+	abort ();
+      if (baz (i, -1) != -1 / q)
+	abort ();
+      q = 1U << i;
+      if (qux (i, (1U << i) - 1) != ((1U << i) - 1) / q)
+	abort ();
+      if (qux (i, 1U << i) != (1U << i) / q)
+	abort ();
+      if (qux (i, -1) != -1 / q)
+	abort ();
+    }
+}