
[Patch/rtl-expand] Take tree range info into account to improve LSHIFT_EXPR expansion

Message ID n99sic0uyf2.fsf@arm.com
State New

Commit Message

Jiong Wang April 16, 2015, 11:04 a.m. UTC
This is a rework of

  https://gcc.gnu.org/ml/gcc-patches/2014-07/msg01998.html

On second thought, I feel it's better to fix this at an earlier stage,
during RTL expansion, which is more generic and also avoids making the
already complex combine pass even more complex.

Currently GCC expands a wide-mode left shift into a generic, fairly
complex instruction sequence.  If we know that the high part of the
wide-mode value comes entirely from sign extension, the expand logic
can be simplified.

Given the following example,

T A = (T) B  << const_imm_shift

We know that the high part of A comes entirely from sign extension if

* T is the next wider type after word_mode.

For example, on AArch64, if type T is __int128 (TImode) and B has type
SImode or DImode, the tree analyzer knows that the high part of the
TImode result comes entirely from sign extension and records that in the
range info.

 |<           T          >|
 |   high     |   low     |
              |<- sizel ->|

For the above example, we can simplify the expand logic to:
 1. low = low << const_imm_shift;
 2. high = low >> (sizel - const_imm_shift)

We can use the arithmetic right shift to do the sign extension.  The
redundant instructions will be optimized away later.
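
In plain C terms, the simplified expansion is roughly the sketch below.  It
assumes a 64-bit word and a constant shift amount smaller than the word
size; the two-word struct and the function name are hypothetical and only
for illustration, and the high-part computation relies on GCC's arithmetic
right shift of signed values.

  typedef struct
  {
    unsigned long long low;   /* low word of the double-word result  */
    long long high;           /* high word of the double-word result */
  } dw_t;

  /* LOW already holds the sign-extended word_mode value of B.  */
  dw_t
  lshift_sketch (long long low, int c)        /* 0 < c < 64 */
  {
    dw_t r;
    r.low  = (unsigned long long) low << c;   /* 1. low  = low << C */
    r.high = low >> (64 - c);                 /* 2. high = low >> (sizel - C);
                                                 the arithmetic right shift
                                                 supplies the sign bits.  */
    return r;
  }

For example, with low = 1 and c = 50 this gives a low word of 1 << 50 and a
high word of 0, exactly what the generic double-word sequence computes.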

The actual .s improvement:

AArch64
=======

  __int128_t
  foo (int data)
  {
    return (__int128_t) data << 50;
  }

  old:
    sxtw    x2, w0
    asr     x1, x2, 63
    lsl     x0, x2, 50
    lsl     x1, x1, 50
    orr     x1, x1, x2, lsr 14
 
  new:
    sxtw    x1, w0
    lsl     x0, x1, 50
    asr     x1, x1, 14


ARM (.fpu softvfp)
==================

  long long
  shift (int data)
  {
    return (long long) data << 20;
  }
 
  old:
    stmfd   sp!, {r4, r5}
    mov     r5, r0, asr #31
    mov     r3, r0
    mov     r0, r0, asl #20
    mov     r1, r5, asl #20
    orr     r1, r1, r3, lsr #12
    ldmfd   sp!, {r4, r5}
    bx      lr

  new:
    mov     r1, r0
    mov     r0, r0, asl #20
    mov     r1, r1, asr #12
    bx      lr

Test
====

  x86 bootstrap OK, regression test OK.
  AArch64 bootstrap OK, regression test on board OK.

Regards,
Jiong

2015-04-16  Jiong Wang  <jiong.wang@arm.com>

gcc/
  * expr.c (expand_expr_real_2): Take tree range info into account when
  expanding LSHIFT_EXPR.

gcc/testsuite/
  * gcc.dg/wide-shift-64.c: New testcase.
  * gcc.dg/wide-shift-128.c: Ditto.
  * gcc.target/aarch64/ashltidisi.c: Ditto.
  * gcc.target/arm/ashldisi.c: Ditto.

Comments

Jeff Law April 24, 2015, 2:23 a.m. UTC | #1
On 04/16/2015 05:04 AM, Jiong Wang wrote:
>
> This is a rework of
>
>    https://gcc.gnu.org/ml/gcc-patches/2014-07/msg01998.html
>
> On second thought, I feel it's better to fix this at an earlier stage,
> during RTL expansion, which is more generic and also avoids making the
> already complex combine pass even more complex.
>
> Currently GCC expands a wide-mode left shift into a generic, fairly
> complex instruction sequence.  If we know that the high part of the
> wide-mode value comes entirely from sign extension, the expand logic
> can be simplified.
>
> Given the following example,
>
> T A = (T) B  << const_imm_shift
>
> We know that the high part of A comes entirely from sign extension if
>
> * T is the next wider type after word_mode.
>
> For example, on AArch64, if type T is __int128 (TImode) and B has type
> SImode or DImode, the tree analyzer knows that the high part of the
> TImode result comes entirely from sign extension and records that in the
> range info.
>
>   |<           T          >|
>   |   high     |   low     |
>                |<- sizel ->|
>
> For the above example, we can simplify the expand logic to:
>   1. low = low << const_imm_shift;
>   2. high = low >> (sizel - const_imm_shift)
>
> We can use the arithmetic right shift to do the sign extension.  The
> redundant instructions will be optimized away later.
>
> The actual .s improvement:
>
> AArch64
> =======
>
>    __int128_t
>    foo (int data)
>    {
>      return (__int128_t) data << 50;
>    }
>
>    old:
>      sxtw    x2, w0
>      asr     x1, x2, 63
>      lsl     x0, x2, 50
>      lsl     x1, x1, 50
>      orr     x1, x1, x2, lsr 14
>
>    new:
>      sxtw    x1, w0
>      lsl     x0, x1, 50
>      asr     x1, x1, 14
>
>
> ARM (.fpu softvfp)
> ==================
>
>    long long
>    shift (int data)
>    {
>      return (long long) data << 20;
>    }
>
>    old:
>      stmfd   sp!, {r4, r5}
>      mov     r5, r0, asr #31
>      mov     r3, r0
>      mov     r0, r0, asl #20
>      mov     r1, r5, asl #20
>      orr     r1, r1, r3, lsr #12
>      ldmfd   sp!, {r4, r5}
>      bx      lr
>
>    new:
>      mov     r1, r0
>      mov     r0, r0, asl #20
>      mov     r1, r1, asr #12
>      bx      lr
>
> Test
> ====
>
>    x86 bootstrap OK, regression test OK.
>    AArch64 bootstrap OK, regression test on board OK.
>
> Regards,
> Jiong
>
> 2015-04-16  Jiong Wang  <jiong.wang@arm.com>
>
> gcc/
>    * expr.c (expand_expr_real_2): Take tree range info into account when
>    expanding LSHIFT_EXPR.
>
> gcc/testsuite/
>    * gcc.dg/wide-shift-64.c: New testcase.
>    * gcc.dg/wide-shift-128.c: Ditto.
>    * gcc.target/aarch64/ashltidisi.c: Ditto.
>    * gcc.target/arm/ashldisi.c: Ditto.
Funny, I find myself wanting this transformation in both places :-) 
Expansion time so that we generate efficient code from the start and 
combine to catch those cases which are too complex to see at expansion, 
but due to other optimizations become visible in the combiner.

Sadly, it's been fairly common practice for targets to define 
double-word shift patterns which catch a variety of special cases. 
Ports will have to choose between using those patterns and exploiting 
your work.   I'd be tempted to generate a double-word shift by the given 
constant and check its cost vs the single word shifts.

What happens if there's an overlap between t_low and low?  Won't the 
lshift clobber low and thus we get the right value for the rshift in 
that case?

Note that expand_variable_shift may not honor your request for putting 
the result in the TARGET target parameter you pass in.

Thus:

   temp = expand_variable_shift (...)
   temp = expand_variable_shift (...)

Can't be right.    You probably need something like

   temp = expand_variable_shift (...)
   if (temp != t_low)
     emit_move_insn (t_low, temp);
   temp = expand_variable_shift (...)
   if (temp != t_high)
     emit_move_insn (t_high, temp);
   return target;


So I generally like where you're going with this, but I have concerns 
about its correctness, particularly in cases where there's an overlap or 
when expand_variable_shift returns its value in something other than the 
passed in target.

Jeff
Jiong Wang April 27, 2015, 8:21 p.m. UTC | #2
Jeff Law writes:

> On 04/16/2015 05:04 AM, Jiong Wang wrote:
>>
>> This is a rework of
>>
>>    https://gcc.gnu.org/ml/gcc-patches/2014-07/msg01998.html
>>
>> On second thought, I feel it's better to fix this at an earlier stage,
>> during RTL expansion, which is more generic and also avoids making the
>> already complex combine pass even more complex.
>>
>> Currently GCC expands a wide-mode left shift into a generic, fairly
>> complex instruction sequence.  If we know that the high part of the
>> wide-mode value comes entirely from sign extension, the expand logic
>> can be simplified.
>>
>> Given the following example,
>>
>> T A = (T) B  << const_imm_shift
>>
>> We know that the high part of A comes entirely from sign extension if
>>
>> * T is the next wider type after word_mode.
>>
>> For example, on AArch64, if type T is __int128 (TImode) and B has type
>> SImode or DImode, the tree analyzer knows that the high part of the
>> TImode result comes entirely from sign extension and records that in the
>> range info.
>>
>>   |<           T          >|
>>   |   high     |   low     |
>>                |<- sizel ->|
>>
>> For the above example, we can simplify the expand logic to:
>>   1. low = low << const_imm_shift;
>>   2. high = low >> (sizel - const_imm_shift)
>>
>> We can use the arithmetic right shift to do the sign extension.  The
>> redundant instructions will be optimized away later.
>>
>> The actual .s improvement:
>>
>> AArch64
>> =======
>>
>>    __int128_t
>>    foo (int data)
>>    {
>>      return (__int128_t) data << 50;
>>    }
>>
>>    old:
>>      sxtw    x2, w0
>>      asr     x1, x2, 63
>>      lsl     x0, x2, 50
>>      lsl     x1, x1, 50
>>      orr     x1, x1, x2, lsr 14
>>
>>    new:
>>      sxtw    x1, w0
>>      lsl     x0, x1, 50
>>      asr     x1, x1, 14
>>
>>
>> ARM (.fpu softvfp)
>> ==================
>>
>>    long long
>>    shift (int data)
>>    {
>>      return (long long) data << 20;
>>    }
>>
>>    old:
>>      stmfd   sp!, {r4, r5}
>>      mov     r5, r0, asr #31
>>      mov     r3, r0
>>      mov     r0, r0, asl #20
>>      mov     r1, r5, asl #20
>>      orr     r1, r1, r3, lsr #12
>>      ldmfd   sp!, {r4, r5}
>>      bx      lr
>>
>>    new:
>>      mov     r1, r0
>>      mov     r0, r0, asl #20
>>      mov     r1, r1, asr #12
>>      bx      lr
>>
>> Test
>> ====
>>
>>    x86 bootstrap OK, regression test OK.
>>    AArch64 bootstrap OK, regression test on board OK.
>>
>> Regards,
>> Jiong
>>
>> 2015-04-16  Jiong Wang  <jiong.wang@arm.com>
>>
>> gcc/
>>    * expr.c (expand_expr_real_2): Take tree range info into account when
>>    expanding LSHIFT_EXPR.
>>
>> gcc/testsuite/
>>    * gcc.dg/wide-shift-64.c: New testcase.
>>    * gcc.dg/wide-shift-128.c: Ditto.
>>    * gcc.target/aarch64/ashltidisi.c: Ditto.
>>    * gcc.target/arm/ashldisi.c: Ditto.
> Funny, I find myself wanting this transformation in both places :-) 
> Expansion time so that we generate efficient code from the start and 
> combine to catch those cases which are too complex to see at expansion, 
> but due to other optimizations become visible in the combiner.
>
> Sadly, it's been fairly common practice for targets to define 
> double-word shift patterns which catch a variety of special cases. 
> Ports will have to choose between using those patterns and exploiting 
> your work.   I'd be tempted to generate a double-word shift by the given 
> constant and check its cost vs the single word shifts.
>
> What happens if there's an overlap between t_low and low?  Won't the 
> lshift clobber low and thus we get the right value for the rshift in 
> that case?

Jeff,

  Sorry, I can't understand the meaning of "overlap between t_low and low";
  I assume "right" in "right value" means the opposite of "left", not
  "correct".

  So do you mean that t_low and low share the same pseudo register?

  Or do you mean shifting a value across the word boundary, like the following?
 
   |<      double word      >|
   |   t_high  |   t_low     |
          |<- low ->|
         
  For the above situation, the simplified two-instruction sequence does work:
  "t_low = low << const_imm_shift ; t_high = low >> (sizel - const_imm_shift)"

  I attached the expand result for a simple testcase below.  I would
  appreciate it if you could comment on the RTL.

  Thanks.
  
  __int128_t
  foo (int data)
  {
    return (__int128_t) data << 50;
  }

  foo.c.188t.optimized
  ===
  foo (int data)
  {
    __int128 _2;
    __int128 _3;

  <bb 2>:
    _2 = (__int128) data_1(D);
    _3 = _2 << 50;
    return _3;
  }

  foo.c.189r.expand
  ===
  
  (insn 2 4 3 2 (set (reg/v:SI 76 [ data ])
    (reg:SI 0 x0 [ data ])) foo.c:3 -1
      (nil))
  (insn 6 3 7 2 (set (reg:DI 79)
    (sign_extend:DI (reg/v:SI 76 [ data ]))) foo.c:4 -1
      (nil))
  (insn 7 6 8 2 (set (subreg:DI (reg:TI 78 [ D.2677 ]) 0)
    (reg:DI 79)) foo.c:4 -1
      (nil))
  (insn 8 7 9 2 (set (reg:DI 80)
    (ashiftrt:DI (reg:DI 79) (const_int 63 [0x3f]))) foo.c:4 -1
      (nil))
  (insn 9 8 10 2 (set (subreg:DI (reg:TI 78 [ D.2677 ]) 8)
    (reg:DI 80)) foo.c:4 -1
      (nil))
      
  ^
   ~~~~~~~~~ sign extend SImode "data" into TImode "_2" (r78)
                    
  (insn 10 9 11 2 (set (subreg:DI (reg:TI 77 [D.2677 ]) 0)
    (ashift:DI (subreg:DI (reg:TI 78 [ D.2677 ]) 0)
               (const_int 50 [0x32]))) foo.c:4 -1
      (nil))

  ^
   ~~~~~~~~~~ t_low = low << const_imm_shift, target be r77
  
  (insn 11 10 12 2 (set (subreg:DI (reg:TI 77 [ D.2677 ]) 8)
    (ashiftrt:DI (subreg:DI (reg:TI 78 [ D.2677 ]) 0)
                 (const_int 14 [0xe]))) foo.c:4 -1
      (nil))

  ^
   ~~~~~~~~~~ t_high = low >> (sizel - const_imm_shift)
  
  (insn 12 11 16 2 (set (reg:TI 75 [ <retval> ])
    (reg:TI 77 [ D.2677 ])) foo.c:4 -1
      (nil))
  (insn 16 12 17 2 (set (reg/i:TI 0 x0)
    (reg:TI 75 [ <retval> ])) foo.c:5 -1
      (nil))

> Note that expand_variable_shift may not honor your request for putting 
> the result in the TARGET target parameter you pass in.

Thanks, agreed, it's better to add those extra moves.

I noticed the comment at the start of the function:

  "Store the result in the rtx TARGET, if that is convenient."

Although I still don't understand in which case it would be inconvenient.
Jeff Law April 29, 2015, 3:50 a.m. UTC | #3
On 04/27/2015 02:21 PM, Jiong Wang wrote:

>> Funny, I find myself wanting this transformation in both places :-)
>> Expansion time so that we generate efficient code from the start and
>> combine to catch those cases which are too complex to see at expansion,
>> but due to other optimizations become visible in the combiner.
>>
>> Sadly, it's been fairly common practice for targets to define
>> double-word shift patterns which catch a variety of special cases.
>> Ports will have to choose between using those patterns and exploiting
>> your work.   I'd be tempted to generate a double-word shift by the given
>> constant and check its cost vs the single word shifts.
>>
>> What happens if there's an overlap between t_low and low?  Won't the
>> lshift clobber low and thus we get the right value for the rshift in
>> that case?
>
> Jeff,
>
>    Sorry, I can't understand the meaning of "overlap between t_low and low";
>    I assume "right" in "right value" means the opposite of "left", not
>    "correct".
>
>    So do you mean that t_low and low share the same pseudo register?
My concern is sharing the same pseudo or memory location.  But thinking 
more about it, the shifted value has to have range information, so it 
must have been an SSA_NAME, right?  If so, then it can't overlap with 
the destination, so this is a non-issue.  Sorry for the confusion.

>
>> Note that expand_variable_shift may not honor your request for putting
>> the result in the TARGET target parameter you pass in.
>
> Thanks, agreed, it's better to add those extra moves.
>
> I noticed the comment at the start of the function:
>
>    "Store the result in the rtx TARGET, if that is convenient."
>
> Although I still don't understand in which case it would be inconvenient.
I've never liked the model of storing into TARGET when it's convenient.
Because storing into TARGET is totally optional, it means the callers
have to check if the value was stored into TARGET or not.

Sadly that model has been in the expanders as long as I can remember.

So I think this can go forward once we resolve the case where 
expand_variable_shift returns its value in something other than the 
passed in target.

Jeff
Jiong Wang April 29, 2015, 9:36 p.m. UTC | #4
Jeff Law writes:

> On 04/27/2015 02:21 PM, Jiong Wang wrote:
>
>> Jeff,
>>
>>    Sorry, I can't understand the meaning of "overlap between t_low and low";
>>    I assume "right" in "right value" means the opposite of "left", not
>>    "correct".
>>
>>    So do you mean that t_low and low share the same pseudo register?
> My concern is sharing the same pseudo or memory location.  But thinking 
> more about it, the shifted value has to have range information, so it 
> must have been an SSA_NAME, right?  If so, then it can't overlap with 
> the destination, so this is a non-issue.  Sorry for the confusion.

Thanks for shedding light on this.  Looking at the related code, it seems
that even if it's an SSA_NAME, it's still possible to share the same pseudo
if the destination is in the same SSA map partition after SSA name
coalescing?
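
For example, here is a hypothetical source form where the operand and the
destination might end up in the same partition; whether they actually get
coalesced would depend on the conflict analysis:

  __int128_t
  baz (int data)
  {
    __int128_t x = data;   /* x_2 = (__int128) data_1, carries the range info.  */
    x <<= 50;              /* x_3 = x_2 << 50; x_2 dies here, so x_2 and x_3
                              could in principle share one TImode pseudo.  */
    return x;
  }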

> I've never liked the model of storing into TARGET when it's convenient. 
>   Because storing into TARGET is totally optional, it means the callers 
> have to check if the value was stored into TARGET or not.
>
> Sadly that model has been in the expanders as long as I can remember.
>
> So I think this can go forward once we resolve the case where 
> expand_variable_shift returns its value in something other than the 
> passed in target.

OK, I will rework the patch.  I also found a function named
"expand_doubleword_shift", which looks like a more natural place to do
this optimization, although it's hard to get range info there.  I will
explore this further.
Jeff Law April 29, 2015, 10:33 p.m. UTC | #5
On 04/29/2015 03:36 PM, Jiong Wang wrote:
>
> Jeff Law writes:
>
>> On 04/27/2015 02:21 PM, Jiong Wang wrote:
>>
>>> Jeff,
>>>
>>>     Sorry, I can't understand the meaning of "overlap between t_low and low";
>>>     I assume "right" in "right value" means the opposite of "left", not
>>>     "correct".
>>>
>>>     So do you mean that t_low and low share the same pseudo register?
>> My concern is sharing the same pseudo or memory location.  But thinking
>> more about it, the shifted value has to have range information, so it
>> must have been an SSA_NAME, right?  If so, then it can't overlap with
>> the destination, so this is a non-issue.  Sorry for the confusion.
>
> Thanks for shedding light on this.  Looking at the related code, it seems
> that even if it's an SSA_NAME, it's still possible to share the same pseudo
> if the destination is in the same SSA map partition after SSA name
> coalescing?
If they're the same size and have non-overlapping lifetimes, then yes, 
they could be the same pseudo.  That ought to be easy to check. 
Thankfully we don't have to worry about MEMs, which is a harder check.
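
A minimal, untested sketch of that check, assuming the word_mode pieces are
already available as low, t_low and t_high, using the existing
reg_overlap_mentioned_p helper:

  /* Sketch only: apply the simplified expansion just when the input low
     word does not overlap either destination word, so that the left shift
     cannot clobber the value the right shift still needs to read.  */
  if (!reg_overlap_mentioned_p (low, t_low)
      && !reg_overlap_mentioned_p (low, t_high))
    {
      /* ... emit the two single-word shifts as in the patch ... */
    }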

> OK, I will rework the patch.  I also found a function named
> "expand_doubleword_shift", which looks like a more natural place to do
> this optimization, although it's hard to get range info there.  I will
> explore this further.
Sounds good.

jeff

Patch

diff --git a/gcc/expr.c b/gcc/expr.c
index 89ca129..96d64cc 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -8984,23 +8984,85 @@  expand_expr_real_2 (sepops ops, rtx target, machine_mode tmode,
 
     case LSHIFT_EXPR:
     case RSHIFT_EXPR:
-      /* If this is a fixed-point operation, then we cannot use the code
-	 below because "expand_shift" doesn't support sat/no-sat fixed-point
-         shifts.   */
-      if (ALL_FIXED_POINT_MODE_P (mode))
-	goto binop;
-
-      if (! safe_from_p (subtarget, treeop1, 1))
-	subtarget = 0;
-      if (modifier == EXPAND_STACK_PARM)
-	target = 0;
-      op0 = expand_expr (treeop0, subtarget,
-			 VOIDmode, EXPAND_NORMAL);
-      temp = expand_variable_shift (code, mode, op0, treeop1, target,
-				    unsignedp);
-      if (code == LSHIFT_EXPR)
-	temp = REDUCE_BIT_FIELD (temp);
-      return temp;
+      {
+	/* If this is a fixed-point operation, then we cannot use the code
+	   below because "expand_shift" doesn't support sat/no-sat fixed-point
+	   shifts.  */
+	if (ALL_FIXED_POINT_MODE_P (mode))
+	  goto binop;
+
+	if (! safe_from_p (subtarget, treeop1, 1))
+	  subtarget = 0;
+	if (modifier == EXPAND_STACK_PARM)
+	  target = 0;
+
+	op0 = expand_expr (treeop0, subtarget,
+			   VOIDmode, EXPAND_NORMAL);
+
+	/* If mode == GET_MODE_WIDER_MODE (word_mode),
+	   then normally, there will no native instructions to support
+	   this wide mode left shift.
+
+	   given below example,
+
+	   T A = (T) B  << C
+
+	   |<		T	   >|
+	   |   high     |   low     |
+
+			|<- sizel ->|
+
+	   if from range info, we could deduce that the high part are all sign
+	   bit extension, then this left shift operation could be largely
+	   simplified into.
+
+	     1. low = low << C;
+	     2. high = low >> (sizel - C)  */
+
+	int o_bits = GET_MODE_SIZE (mode) * BITS_PER_UNIT;
+	wide_int min, max;
+
+	if (code == LSHIFT_EXPR
+	    && !unsignedp
+	    && mode == GET_MODE_WIDER_MODE (word_mode)
+	    && !have_insn_for (LSHIFT_EXPR, mode)
+	    && TREE_CONSTANT (treeop1)
+	    && get_range_info (treeop0, &min, &max) == VR_RANGE
+	    && (wi::cmp (min,
+			 wide_int::from (wi::min_value
+					 ((unsigned) (BITS_PER_WORD),
+					  SIGNED), o_bits, SIGNED),
+			 SIGNED) != -1)
+	    && (wi::cmp (max,
+			 wide_int::from (wi::max_value
+					 ((unsigned)(BITS_PER_WORD),
+					  SIGNED), o_bits, SIGNED),
+			 SIGNED) != 1))
+	  {
+	    rtx low = simplify_gen_subreg (word_mode, op0, mode, 0);
+	    rtx t_low = simplify_gen_subreg (word_mode, target, mode, 0);
+	    rtx t_high = simplify_gen_subreg (word_mode, target, mode,
+					      UNITS_PER_WORD);
+	    tree high_shift =
+	      build_int_cst (TREE_TYPE (treeop1),
+			     BITS_PER_WORD -TREE_INT_CST_LOW (treeop1));
+
+	    temp = expand_variable_shift (code, word_mode, low, treeop1,
+					  t_low, unsignedp);
+
+	    temp = expand_variable_shift (RSHIFT_EXPR, word_mode, low,
+					  high_shift, t_high, unsignedp);
+
+	    gcc_assert (GET_CODE (temp) == SUBREG);
+	    temp = XEXP (temp, 0);
+	  }
+	else
+	  temp = expand_variable_shift (code, mode, op0, treeop1, target,
+					unsignedp);
+	if (code == LSHIFT_EXPR)
+	  temp = REDUCE_BIT_FIELD (temp);
+	return temp;
+      }
 
       /* Could determine the answer when only additive constants differ.  Also,
 	 the addition of one can be handled by changing the condition.  */
diff --git a/gcc/testsuite/gcc.dg/wide-shift-128.c b/gcc/testsuite/gcc.dg/wide-shift-128.c
new file mode 100644
index 0000000..9b62715
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/wide-shift-128.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile { target aarch64*-*-* mips64*-*-* sparc64*-*-* } } */
+/* { dg-require-effective-target int128 } */
+/* { dg-options "-O2 -fdump-rtl-combine" } */
+
+__int128_t
+load2 (int data)
+{
+    return (__int128_t) data << 50;
+}
+
+/* { dg-final { scan-rtl-dump-not "ior" "combine" } } */
+/* { dg-final { cleanup-rtl-dump "combine" } } */
diff --git a/gcc/testsuite/gcc.dg/wide-shift-64.c b/gcc/testsuite/gcc.dg/wide-shift-64.c
new file mode 100644
index 0000000..5bc278f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/wide-shift-64.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile { target arm*-*-* mips*-*-* sparc*-*-* } } */
+/* { dg-options "-O2 -fdump-rtl-combine" } */
+
+long long
+load1 (int data)
+{
+    return (long long) data << 12;
+}
+
+/* { dg-final { scan-rtl-dump-not "ior" "combine" } } */
+/* { dg-final { cleanup-rtl-dump "combine" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/ashltidisi.c b/gcc/testsuite/gcc.target/aarch64/ashltidisi.c
new file mode 100644
index 0000000..aeb2a24
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ashltidisi.c
@@ -0,0 +1,49 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -save-temps" } */
+
+extern void abort (void);
+
+#define GEN_TEST_CASE(x, y, z)\
+__uint128_t __attribute__ ((noinline))\
+ushift_##x##_##z (unsigned y data)\
+{\
+  return (__uint128_t) data << x;\
+}\
+__int128_t __attribute__ ((noinline)) \
+shift_##x##_##z (y data) \
+{\
+  return (__int128_t) data << x;\
+}
+
+GEN_TEST_CASE (53, int, i)
+GEN_TEST_CASE (3, long long, ll)
+GEN_TEST_CASE (13, long long, ll)
+GEN_TEST_CASE (53, long long, ll)
+
+int
+main (int argc, char **argv)
+{
+
+#define SHIFT_CHECK(x, y, z, p) \
+	if (ushift_##y##_##p (x)\
+	    != ((__uint128_t) (unsigned z) x << y)) \
+	  abort ();\
+	if (shift_##y##_##p (x)\
+	    != ((__uint128_t) (signed z) x << y)) \
+	  abort ();
+
+  SHIFT_CHECK (0x12345678, 53, int, i)
+  SHIFT_CHECK (0xcafecafe, 53, int, i)
+
+  SHIFT_CHECK (0x1234567890abcdefLL, 3, long long, ll)
+  SHIFT_CHECK (0x1234567890abcdefLL, 13, long long, ll)
+  SHIFT_CHECK (0x1234567890abcdefLL, 53, long long, ll)
+  SHIFT_CHECK (0xcafecafedeaddeadLL, 3, long long, ll)
+  SHIFT_CHECK (0xcafecafedeaddeadLL, 13, long long, ll)
+  SHIFT_CHECK (0xcafecafedeaddeadLL, 53, long long, ll)
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "asr" 4 } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/ashldisi.c b/gcc/testsuite/gcc.target/arm/ashldisi.c
new file mode 100644
index 0000000..00dc06e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/ashldisi.c
@@ -0,0 +1,44 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -save-temps" } */
+
+extern void abort (void);
+
+#define GEN_TEST_CASE(x)\
+unsigned long long __attribute__ ((noinline))\
+ushift_ ## x (unsigned int data)\
+{\
+  return (unsigned long long) data << x;\
+}\
+long long __attribute__ ((noinline)) \
+shift_ ## x (int data) \
+{\
+  return (long long) data << x;\
+}
+
+GEN_TEST_CASE (3)
+GEN_TEST_CASE (23)
+GEN_TEST_CASE (30)
+int
+main (int argc, char **argv)
+{
+
+#define SHIFT_CHECK(x, y) \
+	if (ushift_ ## y (x)\
+	    != ((unsigned long long) (unsigned) x << y)) \
+	  abort (); \
+	if (shift_ ## y (x)\
+	    != ((long long) (signed) x << y)) \
+	  abort ();
+
+  SHIFT_CHECK (0x12345678, 3)
+  SHIFT_CHECK (0xcafecafe, 3)
+  SHIFT_CHECK (0x12345678, 23)
+  SHIFT_CHECK (0xcafecafe, 23)
+  SHIFT_CHECK (0x12345678, 30)
+  SHIFT_CHECK (0xcafecafe, 30)
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "asr" 3 } } */
+/* { dg-final { cleanup-saved-temps } } */