diff mbox

[Combine] missed one ASHIFTRT opt opportunity

Message ID 53D7D6E6.1080203@arm.com
State New
Headers show

Commit Message

Jiong Wang July 29, 2014, 5:16 p.m. UTC
Suppose the following shift:

T A = (T) B  << C

which met the following conditions:
* T is a signed type with 2 x word_size.
* B is with signed type with the size <= word_size.
* C + sizeof (B) < word_size.

|<           T          >|
|   high     |   low     |
      | B shifted to |
      |   E   |   D  |

gcc is generating sub-optimal instruction pattern for most targets, like arm/mips/sparc, during rtl expand.

gcc's expand logic is:

1. assign low with B.
2. assign high with sign bit of B spreaded.
3. logic left shift low by C bit to get the final low.
4. logic left shift high by E bit.
5. logic right shift B by D bit.
6. or the result of 4 and 5 to get final high.

step 2, 4, 5, 6 could actually be combined into one step: arithmetic right shift B by D bit.

aarch64 example
================
what gcc generate for

__int128_t
load3 (int data)
{
   return (__int128_t) data << 50;
}

==>
         sxtw    x2, w0
         asr     x1, x2, 63
         lsl     x0, x2, 50
         lsl     x1, x1, 50
         orr     x1, x1, x2, lsr 14

actually could be simplified to
         sxtw    x1, w0
         lsl     x0, x1, 50
         asr     x1, x1, 14

the following three instructions are actually doing "asr     x1, x1, 14"
         asr     x1, x2, 63
         lsl     x1, x1, 50
         orr     x1, x1, x2, lsr 14

arm example (.fpu softvfp)
===========

what gcc generated for

long long
shift (int data)
{
   return (long long) data << 20;
}

==>
shift:
         stmfd   sp!, {r4, r5}
         mov     r5, r0, asr #31
         mov     r3, r0
         mov     r0, r0, asl #20
         mov     r1, r5, asl #20
         orr     r1, r1, r3, lsr #12
         ldmfd   sp!, {r4, r5}
         bx      lr

actually could be simplified to
shift:
         mov     r1, r0
         mov     r0, r0, asl #20
         mov     r1, r1, asr #12
         bx      lr

this patch tring to combine step 2, 4, 5, 6 into one single arithmetic
right shift in gcc's combine pass.


test done
===
bootstrap OK on x86-64/aarch64
no regresson aarch64-none-elf/arm-none-eabi
  
please review, thanks.

-- Jiong

gcc/
   * simplify-rtx.c (simplify_binary_operation_1): New combine check for ASHIFTRT.

gcc/testsuite
   * gcc.dg/wide_shift_64_1.c: New testcase.
   * gcc.dg/wide_shift_128_1.c: Ditto.
   * gcc.target/aarch64/ashlti3_1.c: Ditto.
   * gcc.target/arm/ashldisi_1.c: Ditto.

Comments

Richard Sandiford July 30, 2014, 9:44 p.m. UTC | #1
Jiong Wang <jiong.wang@arm.com> writes:
> +      /* If we have
> +
> +	   (ior (lshiftrt A imm1) (and (ashiftrt (A imm2)) mask))
> +
> +	 where imm1 = imm2 + 1
> +	       mask = ((1 << imm1) - 1) << (mode_size (A) - imm1)
> +
> +	 then we may simplify this into
> +
> +	   (ashiftrt A imm1).
> +
> +	 For example:
> +
> +	 (ior:DI (lshiftrt:DI (reg:DI 79 [ data ])
> +				(const_int 14 [0xe]))
> +		(and:DI (ashiftrt:DI (reg:DI 79 [ data ])
> +					(const_int 13 [0xd]))
> +			(const_int -1125899906842624 [0xfffc000000000000]))).
> +
> +	 is actually:
> +
> +	   (ashiftrt:DI (reg:DI 79 [ data ]) (const_int 14 [0xe])).
> +
> +	 There is another form:
> +
> +	   (ior (lshiftrt A imm1)
> +		(ashift (subreg (sign_extend A) high_offset) imm2)
> +
> +	 where imm1 + imm2 == mode_size (A) could be simplified into
> +
> +	   (ashiftrt A imm1).  */
> +
> +      if (GET_CODE (op1) == LSHIFTRT)
> +       {
> +         opleft =  op1;
> +         opright = op0;
> +       }
> +      else
> +       {
> +         opright = op1;
> +         opleft = op0;
> +       }
> +
> +      if (GET_CODE (opleft) == LSHIFTRT
> +	  && GET_CODE (opright) == ASHIFT
> +	  && CONST_INT_P (XEXP (opleft, 1))
> +	  && CONST_INT_P (XEXP (opright, 1))
> +	  && (INTVAL (XEXP (opleft, 1)) + INTVAL (XEXP (opright, 1)))
> +	     == GET_MODE_SIZE (GET_MODE (XEXP (opleft, 0)))
> +	  && GET_CODE (XEXP (opright, 0)) == SUBREG
> +	  && GET_CODE (XEXP (XEXP (opright, 0), 0)) == SIGN_EXTEND
> +	  && (SUBREG_BYTE (XEXP (opright, 0))
> +	      == subreg_highpart_offset (GET_MODE (XEXP (opright, 0)),
> +					 GET_MODE (XEXP (XEXP (opright, 0),
> +							 0))))
> +	  && rtx_equal_p (XEXP (opleft, 0), XEXP (XEXP (XEXP (opright, 0), 0),
> +						  0))
> +	  && !side_effects_p (XEXP (opleft, 0))
> +	  && have_insn_for (ASHIFTRT, mode))
> +	return simplify_gen_binary (ASHIFTRT, mode, XEXP (opleft, 0),
> +				    XEXP (opleft, 1));
> +
> +      if (GET_CODE (opleft) == LSHIFTRT
> +	  && GET_CODE (opright) == AND
> +	  && CONST_INT_P (XEXP (opleft, 1))
> +	  && CONST_INT_P (XEXP (opright, 1))
> +	  && GET_CODE (XEXP (opright, 0)) == ASHIFTRT
> +	  && CONST_INT_P (XEXP (XEXP (opright, 0), 1))
> +	  && rtx_equal_p (XEXP (opleft, 0), XEXP (XEXP (opright, 0), 0))
> +	  && !side_effects_p (XEXP (opleft, 0))
> +	  && have_insn_for (ASHIFTRT, mode))
> +       {
> +         int lshiftrt_imm = INTVAL (XEXP (opleft, 1));
> +         HOST_WIDE_INT mask_nonzero_part, and_mask = INTVAL (XEXP (opright, 1));
> +         int ashiftrt_imm, t_zero, l_one;
> +
> +	 if (width < HOST_BITS_PER_WIDE_INT)
> +	   and_mask &= ((unsigned HOST_WIDE_INT) 1 << width) - 1;
> +
> +	 if (!and_mask)
> +	   return opleft;
> +
> +         ashiftrt_imm = INTVAL (XEXP (XEXP (opright, 0), 1));
> +#if GCC_VERSION >= 3004
> +         t_zero = (width <= HOST_BITS_PER_INT
> +		   ? __builtin_ctz (and_mask)
> +		   : (width <= HOST_BITS_PER_LONG
> +		      ? __builtin_ctzl (and_mask)
> +		      : __builtin_ctzll (and_mask)));
> +#else
> +         t_zero = 0;
> +	 while (((~and_mask) >> t_zero) & 1)
> +	   ++t_zero;
> +#endif
> +         mask_nonzero_part = (((unsigned HOST_WIDE_INT) and_mask) >> t_zero);
> +#if GCC_VERSION >= 3400
> +         l_one = (width <= HOST_BITS_PER_INT
> +		  ? __builtin_popcount (mask_nonzero_part)
> +		  : (width <= HOST_BITS_PER_LONG
> +		     ? __builtin_popcountl (mask_nonzero_part)
> +		     : __builtin_popcountll (mask_nonzero_part)));
> +#else
> +	 /* calculate the number of leading 1-bits.
> +	    we use _popcount above, because the second condition check
> +	    of "if" below will make sure there is no gap between
> +	    those 1-bits.  */
> +	 l_one = sizeof (and_mask) * CHAR_BIT;
> +	 while ((and_mask >> (l_one - 1)) & 1)
> +	   --l_one;
> +	 l_one = sizeof (and_mask) * CHAR_BIT - l_one;
> +#endif
> +
> +         if ((ashiftrt_imm + 1) == lshiftrt_imm
> +             && (mask_nonzero_part
> +                 == ((HOST_WIDE_INT_C (1) << (ashiftrt_imm + 1)) - 1))
> +             && lshiftrt_imm == l_one)
> +           return simplify_gen_binary (ASHIFTRT, mode, XEXP (opleft, 0),
> +				       XEXP (opleft, 1));
> +       }
> +
>        tem = simplify_byte_swapping_operation (code, mode, op0, op1);
>        if (tem)
>  	return tem;

Please do this using wi:: instead, which in particular should make
the popcount stuff easier.  E.g. the first large "if" would probably be:

      if (GET_CODE (opleft) == LSHIFTRT
	  && GET_CODE (opright) == ASHIFT
-->	  && CONST_SCALAR_INT_P (XEXP (opleft, 1))
-->	  && CONST_SCALAR_INT_P (XEXP (opright, 1))
-->	  && (wi::add (XEXP (opleft, 1), XEXP (opright, 1))
-->	      == GET_MODE_SIZE (GET_MODE (XEXP (opleft, 0)))
	  && GET_CODE (XEXP (opright, 0)) == SUBREG
	  && GET_CODE (XEXP (XEXP (opright, 0), 0)) == SIGN_EXTEND
	  && (SUBREG_BYTE (XEXP (opright, 0))
	      == subreg_highpart_offset (GET_MODE (XEXP (opright, 0)),
					 GET_MODE (XEXP (XEXP (opright, 0),
							 0))))
	  && rtx_equal_p (XEXP (opleft, 0), XEXP (XEXP (XEXP (opright, 0), 0),
						  0))
	  && !side_effects_p (XEXP (opleft, 0))
	  && have_insn_for (ASHIFTRT, mode))
	return simplify_gen_binary (ASHIFTRT, mode, XEXP (opleft, 0),
				    XEXP (opleft, 1));

The HOST_WIDE_INTs in the second big "if" would then be wide_ints.

Thanks,
Richard
diff mbox

Patch

commit 2cdd04f4de73db87b1f27d801922ff409ca7b3a8
Author: Jiong Wang <jiong.wang@arm.com>
Date:   Tue Jul 29 14:57:01 2014 +0100

    commit

diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
index 9f6dbe1..5fb663e 100644
--- a/gcc/simplify-rtx.c
+++ b/gcc/simplify-rtx.c
@@ -2599,6 +2599,126 @@  simplify_binary_operation_1 (enum rtx_code code, enum machine_mode mode,
 					XEXP (op0, 1));
         }
 
+      /* If we have
+
+	   (ior (lshiftrt A imm1) (and (ashiftrt (A imm2)) mask))
+
+	 where imm1 = imm2 + 1
+	       mask = ((1 << imm1) - 1) << (mode_size (A) - imm1)
+
+	 then we may simplify this into
+
+	   (ashiftrt A imm1).
+
+	 For example:
+
+	 (ior:DI (lshiftrt:DI (reg:DI 79 [ data ])
+				(const_int 14 [0xe]))
+		(and:DI (ashiftrt:DI (reg:DI 79 [ data ])
+					(const_int 13 [0xd]))
+			(const_int -1125899906842624 [0xfffc000000000000]))).
+
+	 is actually:
+
+	   (ashiftrt:DI (reg:DI 79 [ data ]) (const_int 14 [0xe])).
+
+	 There is another form:
+
+	   (ior (lshiftrt A imm1)
+		(ashift (subreg (sign_extend A) high_offset) imm2)
+
+	 where imm1 + imm2 == mode_size (A) could be simplified into
+
+	   (ashiftrt A imm1).  */
+
+      if (GET_CODE (op1) == LSHIFTRT)
+       {
+         opleft =  op1;
+         opright = op0;
+       }
+      else
+       {
+         opright = op1;
+         opleft = op0;
+       }
+
+      if (GET_CODE (opleft) == LSHIFTRT
+	  && GET_CODE (opright) == ASHIFT
+	  && CONST_INT_P (XEXP (opleft, 1))
+	  && CONST_INT_P (XEXP (opright, 1))
+	  && (INTVAL (XEXP (opleft, 1)) + INTVAL (XEXP (opright, 1)))
+	     == GET_MODE_SIZE (GET_MODE (XEXP (opleft, 0)))
+	  && GET_CODE (XEXP (opright, 0)) == SUBREG
+	  && GET_CODE (XEXP (XEXP (opright, 0), 0)) == SIGN_EXTEND
+	  && (SUBREG_BYTE (XEXP (opright, 0))
+	      == subreg_highpart_offset (GET_MODE (XEXP (opright, 0)),
+					 GET_MODE (XEXP (XEXP (opright, 0),
+							 0))))
+	  && rtx_equal_p (XEXP (opleft, 0), XEXP (XEXP (XEXP (opright, 0), 0),
+						  0))
+	  && !side_effects_p (XEXP (opleft, 0))
+	  && have_insn_for (ASHIFTRT, mode))
+	return simplify_gen_binary (ASHIFTRT, mode, XEXP (opleft, 0),
+				    XEXP (opleft, 1));
+
+      if (GET_CODE (opleft) == LSHIFTRT
+	  && GET_CODE (opright) == AND
+	  && CONST_INT_P (XEXP (opleft, 1))
+	  && CONST_INT_P (XEXP (opright, 1))
+	  && GET_CODE (XEXP (opright, 0)) == ASHIFTRT
+	  && CONST_INT_P (XEXP (XEXP (opright, 0), 1))
+	  && rtx_equal_p (XEXP (opleft, 0), XEXP (XEXP (opright, 0), 0))
+	  && !side_effects_p (XEXP (opleft, 0))
+	  && have_insn_for (ASHIFTRT, mode))
+       {
+         int lshiftrt_imm = INTVAL (XEXP (opleft, 1));
+         HOST_WIDE_INT mask_nonzero_part, and_mask = INTVAL (XEXP (opright, 1));
+         int ashiftrt_imm, t_zero, l_one;
+
+	 if (width < HOST_BITS_PER_WIDE_INT)
+	   and_mask &= ((unsigned HOST_WIDE_INT) 1 << width) - 1;
+
+	 if (!and_mask)
+	   return opleft;
+
+         ashiftrt_imm = INTVAL (XEXP (XEXP (opright, 0), 1));
+#if GCC_VERSION >= 3004
+         t_zero = (width <= HOST_BITS_PER_INT
+		   ? __builtin_ctz (and_mask)
+		   : (width <= HOST_BITS_PER_LONG
+		      ? __builtin_ctzl (and_mask)
+		      : __builtin_ctzll (and_mask)));
+#else
+         t_zero = 0;
+	 while (((~and_mask) >> t_zero) & 1)
+	   ++t_zero;
+#endif
+         mask_nonzero_part = (((unsigned HOST_WIDE_INT) and_mask) >> t_zero);
+#if GCC_VERSION >= 3400
+         l_one = (width <= HOST_BITS_PER_INT
+		  ? __builtin_popcount (mask_nonzero_part)
+		  : (width <= HOST_BITS_PER_LONG
+		     ? __builtin_popcountl (mask_nonzero_part)
+		     : __builtin_popcountll (mask_nonzero_part)));
+#else
+	 /* calculate the number of leading 1-bits.
+	    we use _popcount above, because the second condition check
+	    of "if" below will make sure there is no gap between
+	    those 1-bits.  */
+	 l_one = sizeof (and_mask) * CHAR_BIT;
+	 while ((and_mask >> (l_one - 1)) & 1)
+	   --l_one;
+	 l_one = sizeof (and_mask) * CHAR_BIT - l_one;
+#endif
+
+         if ((ashiftrt_imm + 1) == lshiftrt_imm
+             && (mask_nonzero_part
+                 == ((HOST_WIDE_INT_C (1) << (ashiftrt_imm + 1)) - 1))
+             && lshiftrt_imm == l_one)
+           return simplify_gen_binary (ASHIFTRT, mode, XEXP (opleft, 0),
+				       XEXP (opleft, 1));
+       }
+
       tem = simplify_byte_swapping_operation (code, mode, op0, op1);
       if (tem)
 	return tem;
diff --git a/gcc/testsuite/gcc.dg/wide-shift-128.c b/gcc/testsuite/gcc.dg/wide-shift-128.c
new file mode 100644
index 0000000..9b62715
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/wide-shift-128.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile { target aarch64*-*-* mips64*-*-* sparc64*-*-* } } */
+/* { dg-require-effective-target int128 } */
+/* { dg-options "-O2 -fdump-rtl-combine" } */
+
+__int128_t
+load2 (int data)
+{
+    return (__int128_t) data << 50;
+}
+
+/* { dg-final { scan-rtl-dump-not "ior" "combine" } } */
+/* { dg-final { cleanup-rtl-dump "combine" } } */
diff --git a/gcc/testsuite/gcc.dg/wide-shift-64.c b/gcc/testsuite/gcc.dg/wide-shift-64.c
new file mode 100644
index 0000000..5bc278f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/wide-shift-64.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile { target arm*-*-* mips*-*-* sparc*-*-* } } */
+/* { dg-options "-O2 -fdump-rtl-combine" } */
+
+long long
+load1 (int data)
+{
+    return (long long) data << 12;
+}
+
+/* { dg-final { scan-rtl-dump-not "ior" "combine" } } */
+/* { dg-final { cleanup-rtl-dump "combine" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/ashltidisi.c b/gcc/testsuite/gcc.target/aarch64/ashltidisi.c
new file mode 100644
index 0000000..aeb2a24
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ashltidisi.c
@@ -0,0 +1,49 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -save-temps" } */
+
+extern void abort (void);
+
+#define GEN_TEST_CASE(x, y, z)\
+__uint128_t __attribute__ ((noinline))\
+ushift_##x##_##z (unsigned y data)\
+{\
+  return (__uint128_t) data << x;\
+}\
+__int128_t __attribute__ ((noinline)) \
+shift_##x##_##z (y data) \
+{\
+  return (__int128_t) data << x;\
+}
+
+GEN_TEST_CASE (53, int, i)
+GEN_TEST_CASE (3, long long, ll)
+GEN_TEST_CASE (13, long long, ll)
+GEN_TEST_CASE (53, long long, ll)
+
+int
+main (int argc, char **argv)
+{
+
+#define SHIFT_CHECK(x, y, z, p) \
+	if (ushift_##y##_##p (x)\
+	    != ((__uint128_t) (unsigned z) x << y)) \
+	  abort ();\
+	if (shift_##y##_##p (x)\
+	    != ((__uint128_t) (signed z) x << y)) \
+	  abort ();
+
+  SHIFT_CHECK (0x12345678, 53, int, i)
+  SHIFT_CHECK (0xcafecafe, 53, int, i)
+
+  SHIFT_CHECK (0x1234567890abcdefLL, 3, long long, ll)
+  SHIFT_CHECK (0x1234567890abcdefLL, 13, long long, ll)
+  SHIFT_CHECK (0x1234567890abcdefLL, 53, long long, ll)
+  SHIFT_CHECK (0xcafecafedeaddeadLL, 3, long long, ll)
+  SHIFT_CHECK (0xcafecafedeaddeadLL, 13, long long, ll)
+  SHIFT_CHECK (0xcafecafedeaddeadLL, 53, long long, ll)
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "asr" 4 } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/ashldisi.c b/gcc/testsuite/gcc.target/arm/ashldisi.c
new file mode 100644
index 0000000..bb5297a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/ashldisi.c
@@ -0,0 +1,44 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -save-temps" } */
+
+extern void abort (void);
+
+#define GEN_TEST_CASE(x)\
+unsigned long long __attribute__ ((noinline))\
+ushift_ ## x (unsigned int data)\
+{\
+  return (unsigned long long) data << x;\
+}\
+long long __attribute__ ((noinline)) \
+shift_ ## x (int data) \
+{\
+  return (long long) data << x;\
+}
+
+GEN_TEST_CASE (3)
+GEN_TEST_CASE (23)
+GEN_TEST_CASE (30)
+int
+main (int argc, char **argv)
+{
+
+#define SHIFT_CHECK(x, y) \
+	if (ushift_ ## y (x)\
+	    != ((unsigned long long) (unsigned) x << y)) \
+	  abort (); \
+	if (shift_ ## y (x)\
+	    != ((long long) (signed) x << y)) \
+	  abort ();
+
+  SHIFT_CHECK (0x12345678, 3)
+  SHIFT_CHECK (0xcafecafe, 3)
+  SHIFT_CHECK (0x12345678, 23)
+  SHIFT_CHECK (0xcafecafe, 23)
+  SHIFT_CHECK (0x12345678, 30)
+  SHIFT_CHECK (0xcafecafe, 30)
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "asr" 3 } } */
+/* { dg-final { cleanup-saved-temps } } */