Patchwork [ARM] Improve 64-bit shifts (non-NEON)

login
register
mail settings
Submitter Andrew Stubbs
Date Jan. 27, 2012, 4:07 p.m.
Message ID <4F22CBB2.7010101@codesourcery.com>
Download mbox | patch
Permalink /patch/138247/
State New
Headers show

Comments

Andrew Stubbs - Jan. 27, 2012, 4:07 p.m.
Hi all,

This patch introduces a new, more efficient set of DImode shift 
sequences for values stored in core-registers (as opposed to VFP/NEON 
registers).

The new sequences take advantage of knowledge of what the ARM 
instructions do with out-of-range shift amounts.

The following are examples of a simple test case, like this one:

long long
f (long long *a, int b)
{
   return *a << b;
}


In ARM mode, old left-shift vs. the new one:

     stmfd   sp!, {r4, r5}        | ldrd    r2, [r0]
     rsb     r4, r1, #32          | mov     ip, r1
     ldr     r5, [r0, #4]         | stmfd   sp!, {r4, r5}
     subs    ip, r1, #32          | sub     r5, ip, #32
     ldr     r0, [r0, #0]         | rsb     r4, ip, #32
     mov     r3, r5, asl r1       | mov     r1, r3, asl ip
     orr     r3, r3, r0, lsr r4   | mov     r0, r2, asl ip
     mov     r2, r0, asl r1       | orr     r1, r1, r2, asl r5
     movpl   r3, r0, asl ip       | orr     r1, r1, r2, lsr r4
     mov     r0, r2               | ldmfd   sp!, {r4, r5}
     mov     r1, r3               | bx      lr
     ldmfd   sp!, {r4, r5}        |
     bx      lr                   |

In Thumb mode, old left-shift vs. new:

     ldr     r2, [r0, #0]         | ldrd    r2, [r0]
     ldr     r3, [r0, #4]         | push    {r4, r5, r6}
     push    {r4, r5, r6}         | sub     r6, r1, #32
     rsb     r6, r1, #32          | mov     r4, r1
     sub     r4, r1, #32          | rsb     r5, r1, #32
     lsls    r3, r3, r1           | lsls    r6, r2, r6
     lsrs    r6, r2, r6           | lsls    r1, r3, r1
     lsls    r5, r2, r4           | lsrs    r5, r2, r5
     orrs    r3, r3, r6           | lsls    r0, r2, r4
     lsls    r0, r2, r1           | orrs    r1, r1, r6
     bics    r1, r5, r4, asr #32  | orrs    r1, r1, r5
     it      cs                   | pop     {r4, r5, r6}
     movcs   r1, r3               | bx      lr
     pop     {r4, r5, r6}         |
     bx      lr                   |

Logical right shift is essentially the same sequence as the left shift 
above. However, arithmetic right shift requires something slightly 
different. Here it is in ARM mode, old vs. new:

     stmfd   sp!, {r4, r5}        | ldrd    r2, [r0]
     rsb     r4, r1, #32          | mov     ip, r1
     ldr     r5, [r0, #0]         | stmfd   sp!, {r4, r5}
     subs    ip, r1, #32          | rsb     r5, ip, #32
     ldr     r0, [r0, #4]         | subs    r4, ip, #32
     mov     r2, r5, lsr r1       | mov     r0, r2, lsr ip
     orr     r2, r2, r0, asl r4   | mov     r1, r3, asr ip
     mov     r3, r0, asr r1       | orr     r0, r0, r3, asl r5
     movpl   r2, r0, asr ip       | orrge   r0, r0, r3, asr r4
     mov     r1, r3               | ldmfd   sp!, {r4, r5}
     mov     r0, r2               | bx      lr
     ldmfd   sp!, {r4, r5}        |
     bx      lr                   |

I won't bore you with the Thumb mode comparison.

The shift-by-constant cases have also been reimplemented, although the 
resultant sequences are much the same as before. (Doing this isn't 
strictly necessary just yet, but when I post my next patch to do 64-bit 
shifts in NEON, this feature will be required by the fall-back 
alternatives.)

I've run a regression test on a cross-compiler, and I should have native 
test results next week sometime. Also some benchmark results.

Is this OK for stage 1?

Andrew
Richard Earnshaw - Jan. 30, 2012, 3:25 p.m.
On 27/01/12 16:07, Andrew Stubbs wrote:
> Hi all,
> 
> This patch introduces a new, more efficient set of DImode shift 
> sequences for values stored in core-registers (as opposed to VFP/NEON 
> registers).
> 
> The new sequences take advantage of knowledge of what the ARM 
> instructions do with out-of-range shift amounts.
> 
> The following are examples of a simple test case, like this one:
> 
> long long
> f (long long *a, int b)
> {
>    return *a << b;
> }
> 
> 
> In ARM mode, old left-shift vs. the new one:
> 
>      stmfd   sp!, {r4, r5}        | ldrd    r2, [r0]
>      rsb     r4, r1, #32          | mov     ip, r1
>      ldr     r5, [r0, #4]         | stmfd   sp!, {r4, r5}
>      subs    ip, r1, #32          | sub     r5, ip, #32
>      ldr     r0, [r0, #0]         | rsb     r4, ip, #32
>      mov     r3, r5, asl r1       | mov     r1, r3, asl ip
>      orr     r3, r3, r0, lsr r4   | mov     r0, r2, asl ip
>      mov     r2, r0, asl r1       | orr     r1, r1, r2, asl r5
>      movpl   r3, r0, asl ip       | orr     r1, r1, r2, lsr r4
>      mov     r0, r2               | ldmfd   sp!, {r4, r5}
>      mov     r1, r3               | bx      lr
>      ldmfd   sp!, {r4, r5}        |
>      bx      lr                   |
> 
> In Thumb mode, old left-shift vs. new:
> 
>      ldr     r2, [r0, #0]         | ldrd    r2, [r0]
>      ldr     r3, [r0, #4]         | push    {r4, r5, r6}
>      push    {r4, r5, r6}         | sub     r6, r1, #32
>      rsb     r6, r1, #32          | mov     r4, r1
>      sub     r4, r1, #32          | rsb     r5, r1, #32
>      lsls    r3, r3, r1           | lsls    r6, r2, r6
>      lsrs    r6, r2, r6           | lsls    r1, r3, r1
>      lsls    r5, r2, r4           | lsrs    r5, r2, r5
>      orrs    r3, r3, r6           | lsls    r0, r2, r4
>      lsls    r0, r2, r1           | orrs    r1, r1, r6
>      bics    r1, r5, r4, asr #32  | orrs    r1, r1, r5
>      it      cs                   | pop     {r4, r5, r6}
>      movcs   r1, r3               | bx      lr
>      pop     {r4, r5, r6}         |
>      bx      lr                   |
> 
> Logical right shift is essentially the same sequence as the left shift 
> above. However, arithmetic right shift requires something slightly 
> different. Here it is in ARM mode, old vs. new:
> 
>      stmfd   sp!, {r4, r5}        | ldrd    r2, [r0]
>      rsb     r4, r1, #32          | mov     ip, r1
>      ldr     r5, [r0, #0]         | stmfd   sp!, {r4, r5}
>      subs    ip, r1, #32          | rsb     r5, ip, #32
>      ldr     r0, [r0, #4]         | subs    r4, ip, #32
>      mov     r2, r5, lsr r1       | mov     r0, r2, lsr ip
>      orr     r2, r2, r0, asl r4   | mov     r1, r3, asr ip
>      mov     r3, r0, asr r1       | orr     r0, r0, r3, asl r5
>      movpl   r2, r0, asr ip       | orrge   r0, r0, r3, asr r4
>      mov     r1, r3               | ldmfd   sp!, {r4, r5}
>      mov     r0, r2               | bx      lr
>      ldmfd   sp!, {r4, r5}        |
>      bx      lr                   |
> 
> I won't bore you with the Thumb mode comparison.
> 
> The shift-by-constant cases have also been reimplemented, although the 
> resultant sequences are much the same as before. (Doing this isn't 
> strictly necessary just yet, but when I post my next patch to do 64-bit 
> shifts in NEON, this feature will be required by the fall-back 
> alternatives.)
> 
> I've run a regression test on a cross-compiler, and I should have native 
> test results next week sometime. Also some benchmark results.
> 
> Is this OK for stage 1?
> 

What's the impact of this on -Os?  At present we fall back to the
libcalls, but I can't immediately see how the new code would do that.

Gut feeling is that shift by a constant is always worth inlining at -Os,
but shift by a register isn't.

R.


>

Patch

2012-01-27  Andrew Stubbs  <ams@codesourcery.com>

	* config/arm/arm-protos.h (arm_emit_coreregs_64bit_shift): New
	prototype.
	* config/arm/arm.c (arm_emit_coreregs_64bit_shift): New function.
	* config/arm/arm.md (ashldi3): Use arm_emit_coreregs_64bit_shift.
	(ashrdi3,lshrdi3): Likewise.

---
 gcc/config/arm/arm-protos.h |    3 +
 gcc/config/arm/arm.c        |  198 +++++++++++++++++++++++++++++++++++++++++++
 gcc/config/arm/arm.md       |   90 ++++++++++++++------
 3 files changed, 264 insertions(+), 27 deletions(-)

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 296550a..df8d7a9 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -242,6 +242,9 @@  struct tune_params
 
 extern const struct tune_params *current_tune;
 extern int vfp3_const_double_for_fract_bits (rtx);
+
+extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
+					   rtx);
 #endif /* RTX_CODE */
 
 #endif /* ! GCC_ARM_PROTOS_H */
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 0bded8d..eefc45c 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -25139,5 +25139,203 @@  vfp3_const_double_for_fract_bits (rtx operand)
   return 0;
 }
 
+/* The default expansion of general 64-bit shifts in core-regs is suboptimal
+   on ARM, since we know that shifts by negative amounts are no-ops.
+
+   It's safe for the input and output to be the same register, but
+   early-clobber rules apply for the shift amount and scratch registers.
+
+   Shift by register requires both scratch registers.  Shift by a constant
+   less than 32 in Thumb2 mode requires SCRATCH1 only.  In all other cases
+   the scratch registers may be NULL.
+   
+   Additionally, ashiftrt by a register also clobbers the CC register.  */
+void
+arm_emit_coreregs_64bit_shift (enum rtx_code code, rtx out, rtx in,
+			       rtx amount, rtx scratch1, rtx scratch2)
+{
+  rtx out_high = gen_highpart (SImode, out);
+  rtx out_low = gen_lowpart (SImode, out);
+  rtx in_high = gen_highpart (SImode, in);
+  rtx in_low = gen_lowpart (SImode, in);
+
+  /* Bits flow from up-stream to down-stream.  */
+  rtx out_up   = code == ASHIFT ? out_low : out_high;
+  rtx out_down = code == ASHIFT ? out_high : out_low;
+  rtx in_up   = code == ASHIFT ? in_low : in_high;
+  rtx in_down = code == ASHIFT ? in_high : in_low;
+
+  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
+  gcc_assert (out
+	      && (REG_P (out) || GET_CODE (out) == SUBREG)
+	      && GET_MODE (out) == DImode);
+  gcc_assert (in
+	      && (REG_P (in) || GET_CODE (in) == SUBREG)
+	      && GET_MODE (in) == DImode);
+  gcc_assert (amount
+	      && (((REG_P (amount) || GET_CODE (amount) == SUBREG)
+		   && GET_MODE (amount) == SImode)
+		  || CONST_INT_P (amount)));
+  gcc_assert (scratch1 == NULL
+	      || (GET_CODE (scratch1) == SCRATCH)
+	      || (GET_MODE (scratch1) == SImode
+		  && REG_P (scratch1)));
+  gcc_assert (scratch2 == NULL
+	      || (GET_CODE (scratch2) == SCRATCH)
+	      || (GET_MODE (scratch2) == SImode
+		  && REG_P (scratch2)));
+  gcc_assert (!REG_P (out) || !REG_P (amount)
+	      || !HARD_REGISTER_P (out)
+	      || (REGNO (out) != REGNO (amount)
+		  && REGNO (out) + 1 != REGNO (amount)));
+
+  /* Macros to make following code more readable.  */
+  #define SUB_32(DEST,SRC) \
+	    gen_addsi3 ((DEST), (SRC), gen_rtx_CONST_INT (VOIDmode, -32))
+  #define RSB_32(DEST,SRC) \
+	    gen_subsi3 ((DEST), gen_rtx_CONST_INT (VOIDmode, 32), (SRC))
+  #define SUB_S_32(DEST,SRC) \
+	    gen_addsi3_compare0 ((DEST), (SRC), \
+				 gen_rtx_CONST_INT (VOIDmode, -32))
+  #define SET(DEST,SRC) \
+	    gen_rtx_SET (SImode, (DEST), (SRC))
+  #define SHIFT(CODE,SRC,AMOUNT) \
+	    gen_rtx_fmt_ee ((CODE), SImode, (SRC), (AMOUNT))
+  #define LSHIFT(CODE,SRC,AMOUNT) \
+	    gen_rtx_fmt_ee ((CODE) == ASHIFT ? ASHIFT : LSHIFTRT, \
+			    SImode, (SRC), (AMOUNT))
+  #define REV_LSHIFT(CODE,SRC,AMOUNT) \
+	    gen_rtx_fmt_ee ((CODE) == ASHIFT ? LSHIFTRT : ASHIFT, \
+			    SImode, (SRC), (AMOUNT))
+  #define ORR(A,B) \
+	    gen_rtx_IOR (SImode, (A), (B))
+  #define IF(COND,RTX) \
+	    gen_rtx_COND_EXEC (VOIDmode, \
+			       gen_rtx_ ## COND (CCmode, cc_reg, \
+						 const0_rtx), \
+			       (RTX))
+
+  if (CONST_INT_P (amount))
+    {
+      /* Shifts by a constant amount.  */
+      if (INTVAL (amount) <= 0)
+	/* Match what shift-by-register would do.  */
+	emit_insn (gen_movdi (out, in));
+      else if (INTVAL (amount) >= 64)
+	{
+	  /* Match what shift-by-register would do.  */
+	  if (code == ASHIFTRT)
+	    {
+	      rtx const31_rtx = gen_rtx_CONST_INT (VOIDmode, 31);
+	      emit_insn (SET (out_down, SHIFT (code, in_up, const31_rtx)));
+	      emit_insn (SET (out_up, SHIFT (code, in_up, const31_rtx)));
+	    }
+	  else
+	    emit_insn (gen_movdi (out, const0_rtx));
+	}
+      else if (INTVAL (amount) < 32)
+	{
+	  /* Shifts by a constant less than 32.  */
+	  rtx reverse_amount = gen_rtx_CONST_INT (VOIDmode,
+						  32 - INTVAL (amount));
+
+	  emit_insn (SET (out_down, LSHIFT (code, in_down, amount)));
+	  emit_insn (SET (out_down,
+			  ORR (REV_LSHIFT (code, in_up, reverse_amount),
+			       out_down)));
+	  emit_insn (SET (out_up, SHIFT (code, in_up, amount)));
+	}
+      else
+	{
+	  /* Shifts by a constant greater than 31.  */
+	  rtx adj_amount = gen_rtx_CONST_INT (VOIDmode, INTVAL (amount) - 32);
+
+	  emit_insn (SET (out_down, SHIFT (code, in_up, adj_amount)));
+	  if (code == ASHIFTRT)
+	    emit_insn (gen_ashrsi3 (out_up, in_up,
+				    gen_rtx_CONST_INT (VOIDmode, 31)));
+	  else
+	    emit_insn (SET (out_up, const0_rtx));
+	}
+    }
+  else
+    {
+      /* Shifts by a variable amount.  */
+      rtx cc_reg = gen_rtx_REG (CC_NCVmode, CC_REGNUM);
+
+      gcc_assert (scratch1 && REG_P (scratch1));
+      gcc_assert (scratch2 && REG_P (scratch2));
+
+      switch (code)
+	{
+	case ASHIFT:
+	  emit_insn (SUB_32 (scratch1, amount));
+	  emit_insn (RSB_32 (scratch2, amount));
+	  break;
+	case ASHIFTRT:
+	  emit_insn (RSB_32 (scratch1, amount));
+	  emit_insn (SUB_S_32 (scratch2, amount));
+	  break;
+	case LSHIFTRT:
+	  emit_insn (RSB_32 (scratch1, amount));
+	  emit_insn (SUB_32 (scratch2, amount));
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+
+      emit_insn (SET (out_down, LSHIFT (code, in_down, amount)));
+
+      if (!TARGET_THUMB2)
+	{
+	  /* If this were only called during expand we could just use the else
+	     case and let combine deal with it, but this can also be called
+	     from post-reload splitters.  */
+	  emit_insn (SET (out_down,
+			  ORR (SHIFT (ASHIFT, in_up, scratch1), out_down)));
+	  if (code == ASHIFTRT)
+	    {
+	      emit_insn (IF (GE,
+			     SET (out_down,
+				  ORR (SHIFT (ASHIFTRT, in_up, scratch2),
+				       out_down))));
+	    }
+	  else
+	    emit_insn (SET (out_down, ORR (SHIFT (LSHIFTRT, in_up, scratch2),
+					   out_down)));
+	}
+      else
+	{
+	  /* Thumb2 can't do shift and or in one insn.  */
+	  emit_insn (SET (scratch1, SHIFT (ASHIFT, in_up, scratch1)));
+	  emit_insn (gen_iorsi3 (out_down, out_down, scratch1));
+
+	  if (code == ASHIFTRT)
+	    {
+	      emit_insn (IF (GE, SET (scratch2,
+				      SHIFT (ASHIFTRT, in_up, scratch2))));
+	      emit_insn (IF (GE, SET (out_down, ORR (out_down, scratch2))));
+	    }
+	  else
+	    {
+	      emit_insn (SET (scratch2, SHIFT (LSHIFTRT, in_up, scratch2)));
+	      emit_insn (gen_iorsi3 (out_down, out_down, scratch2));
+	    }
+	}
+
+      emit_insn (SET (out_up, SHIFT (code, in_up, amount)));
+    }
+
+  #undef SUB_32
+  #undef RSB_32
+  #undef SUB_S_32
+  #undef SET
+  #undef SHIFT
+  #undef LSHIFT
+  #undef REV_LSHIFT
+  #undef ORR
+  #undef IF
+}
+
 #include "gt-arm.h"
 
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 751997f..7cae822 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -3466,21 +3466,33 @@ 
                    (match_operand:SI 2 "reg_or_int_operand" "")))]
   "TARGET_32BIT"
   "
-  if (GET_CODE (operands[2]) == CONST_INT)
+  if (!CONST_INT_P (operands[2])
+      && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
+    ; /* No special preparation statements; expand pattern as above.  */
+  else
     {
-      if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
+      rtx scratch1, scratch2;
+
+      if (GET_CODE (operands[2]) == CONST_INT
+	  && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
         {
           emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1]));
           DONE;
         }
-        /* Ideally we shouldn't fail here if we could know that operands[1] 
-           ends up already living in an iwmmxt register. Otherwise it's
-           cheaper to have the alternate code being generated than moving
-           values to iwmmxt regs and back.  */
-        FAIL;
+
+      /* Ideally we should use iwmmxt here if we could know that operands[1] 
+         ends up already living in an iwmmxt register. Otherwise it's
+         cheaper to have the alternate code being generated than moving
+         values to iwmmxt regs and back.  */
+
+      /* Expand operation using core-registers.
+	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
+      scratch1 = gen_reg_rtx (SImode);
+      scratch2 = gen_reg_rtx (SImode);
+      arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
+				     operands[2], scratch1, scratch2);
+      DONE;
     }
-  else if (!TARGET_REALLY_IWMMXT && !(TARGET_HARD_FLOAT && TARGET_MAVERICK))
-    FAIL;
   "
 )
 
@@ -3525,21 +3537,33 @@ 
                      (match_operand:SI 2 "reg_or_int_operand" "")))]
   "TARGET_32BIT"
   "
-  if (GET_CODE (operands[2]) == CONST_INT)
+  if (!CONST_INT_P (operands[2])
+      && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
+    ; /* No special preparation statements; expand pattern as above.  */
+  else
     {
-      if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
+      rtx scratch1, scratch2;
+
+      if (GET_CODE (operands[2]) == CONST_INT
+	  && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
         {
           emit_insn (gen_arm_ashrdi3_1bit (operands[0], operands[1]));
           DONE;
         }
-        /* Ideally we shouldn't fail here if we could know that operands[1] 
-           ends up already living in an iwmmxt register. Otherwise it's
-           cheaper to have the alternate code being generated than moving
-           values to iwmmxt regs and back.  */
-        FAIL;
+
+      /* Ideally we should use iwmmxt here if we could know that operands[1] 
+         ends up already living in an iwmmxt register. Otherwise it's
+         cheaper to have the alternate code being generated than moving
+         values to iwmmxt regs and back.  */
+
+      /* Expand operation using core-registers.
+	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
+      scratch1 = gen_reg_rtx (SImode);
+      scratch2 = gen_reg_rtx (SImode);
+      arm_emit_coreregs_64bit_shift (ASHIFTRT, operands[0], operands[1],
+				     operands[2], scratch1, scratch2);
+      DONE;
     }
-  else if (!TARGET_REALLY_IWMMXT)
-    FAIL;
   "
 )
 
@@ -3582,21 +3606,33 @@ 
                      (match_operand:SI 2 "reg_or_int_operand" "")))]
   "TARGET_32BIT"
   "
-  if (GET_CODE (operands[2]) == CONST_INT)
+  if (!CONST_INT_P (operands[2])
+      && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
+    ; /* No special preparation statements; expand pattern as above.  */
+  else
     {
-      if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
+      rtx scratch1, scratch2;
+
+      if (GET_CODE (operands[2]) == CONST_INT
+	  && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
         {
           emit_insn (gen_arm_lshrdi3_1bit (operands[0], operands[1]));
           DONE;
         }
-        /* Ideally we shouldn't fail here if we could know that operands[1] 
-           ends up already living in an iwmmxt register. Otherwise it's
-           cheaper to have the alternate code being generated than moving
-           values to iwmmxt regs and back.  */
-        FAIL;
+
+      /* Ideally we should use iwmmxt here if we could know that operands[1] 
+         ends up already living in an iwmmxt register. Otherwise it's
+         cheaper to have the alternate code being generated than moving
+         values to iwmmxt regs and back.  */
+
+      /* Expand operation using core-registers.
+	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
+      scratch1 = gen_reg_rtx (SImode);
+      scratch2 = gen_reg_rtx (SImode);
+      arm_emit_coreregs_64bit_shift (LSHIFTRT, operands[0], operands[1],
+				     operands[2], scratch1, scratch2);
+      DONE;
     }
-  else if (!TARGET_REALLY_IWMMXT)
-    FAIL;
   "
 )