Patchwork [SH] PR 54089 - Improve software dynamic shifts

login
register
mail settings
Submitter Oleg Endo
Date Sept. 9, 2012, 1:30 p.m.
Message ID <1347197428.2306.31.camel@yam-132-YW-E178-FTW>
Download mbox | patch
Permalink /patch/182623/
State New
Headers show

Comments

Oleg Endo - Sept. 9, 2012, 1:30 p.m.
Hello,

This patch does two things...

1) The dynamic shift cost is set to be the same if HW dynamic shifts are
available.  This improves code size for SH2A a little (-2 KByte on CSiBE
for -m2a-single -O2).

2) Improve code around library function calls for software dynamic
shifts (logical right + left shifts only for now).
For this I had to change the implementations of ashlsi3 and lshrsi3 in
lib1funcs.S, but the changes are backwards compatible with older
binaries.  Due to the additional branch insn in the dyn shift functions
they might be one or two cycles slower than the original, but this
reduces the amount of clobbered regs and cuts 9.5 KByte in the CSiBE set
(-m2 -ml -O2), which seems more beneficial to do on average.

Tested on rev. 190990 with
make -k check RUNTESTFLAGS="--target_board=sh-sim
\{-m2/-ml,-m2/-mb,-m2a/-mb,-m4/-ml,-m4/-mb,-m4a/-ml,-m4a/-mb}"

and no new failures except for this one on SH2:

FAIL: gcc.dg/pr28402.c scan-assembler-not __[a-z]*si3

The reason for this is that now the middle-end will expand DImode shifts
as SImode shifts instead of a DImode shift library call, because it sees
the new SImode dynamic library call shift patterns for SH2.  I will have
a look at this issue later to see if it is beneficial to do special
handling of DImode shifts on SH2.

OK to install?

Cheers,
Oleg

gcc/ChangeLog:

	PR target/54089
	* config/sh/sh.h (SH_DYNAMIC_SHIFT_COST): Set always to 1 if 
	dynamic shifts are available.
	(SHIFT_COUNT_TRUNCATED): Always define to 0.  Correct comment.
	* config/sh/sh.c (ashl_lshr_seq, ext_ashl_lshr_seq): Add 
	comments.
	* config/sh/predicates.md (shift_count_operand): Allow 
	arith_reg_operand even if TARGET_DYNSHIFT is false.
	* config/sh/sh.md (ashlsi3, lshrsi3): Expand library call 
	patterns if needed.
	(ashlsi3_d_call, lshrsi3_d_call): New insns.

libgcc/ChangeLog:

	PR target/54089
	* config/sh/lib1funcs.S (ashlsi3): Reimplement as ashlsi3_r0.
	(lshrsi3): Reimplement as lshrsi3_r0.

testsuite/ChangeLog:

	PR target/54089
	* gcc.target/sh/pr54089-3.c: New.
Kaz Kojima - Sept. 10, 2012, 12:37 p.m.
Oleg Endo <oleg.endo@t-online.de> wrote:
> This patch does two things...
> 
> 1) The dynamic shift cost is set to be the same if HW dynamic shifts are
> available.  This improves code size for SH2A a little (-2 KByte on CSiBE
> for -m2a-single -O2).
> 
> 2) Improve code around library function calls for software dynamic
> shifts (logical right + left shifts only for now).
> For this I had to change the implementations of ashlsi3 and lshrsi3 in
> lib1funcs.S, but  the changes are backwards compatible with older
> binaries.  Due to the additional branch insn in the dyn shift functions
> they might be one or two cycles slower than the original, but this
> reduces the amount of clobbered regs and cuts 9.5 KByte in the CSiBE set
> (-m2 -ml -O2), which seems more beneficial to do on average.
> 
> Tested on rev. 190990 with
> make -k check RUNTESTFLAGS="--target_board=sh-sim
> \{-m2/-ml,-m2/-mb,-m2a/-mb,-m4/-ml,-m4/-mb,-m4a/-ml,-m4a/-mb}"
> 
> and no new failures except for this one on SH2:
> 
> FAIL: gcc.dg/pr28402.c scan-assembler-not __[a-z]*si3
> 
> The reason for this is that now the middle-end will expand DImode shifts
> as SImode shifts instead of a DImode shift library call, because it sees
> the new SImode dynamic library call shift patterns for SH2.  I will have
> a look at this issue later to see if it is beneficial to do special
> handling of DImode shifts on SH2.
> 
> OK to install?

OK.

Regards,
	kaz

Patch

Index: libgcc/config/sh/lib1funcs.S
===================================================================
--- libgcc/config/sh/lib1funcs.S	(revision 190990)
+++ libgcc/config/sh/lib1funcs.S	(working copy)
@@ -1,5 +1,5 @@ 
 /* Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
-   2004, 2005, 2006, 2009
+   2004, 2005, 2006, 2009, 2012
    Free Software Foundation, Inc.
 
 This file is free software; you can redistribute it and/or modify it
@@ -241,7 +241,7 @@ 
 ! Entry:
 !
 ! r4: Value to shift
-! r5: Shifts
+! r5: Shift count
 !
 ! Exit:
 !
@@ -249,7 +249,7 @@ 
 !
 ! Destroys:
 !
-! (none)
+! T bit, r5
 !
 
 	.global	GLOBAL(ashrsi3)
@@ -388,318 +388,353 @@ 
 
 !
 ! GLOBAL(ashlsi3)
+! (For compatibility with older binaries, not used by compiler)
 !
 ! Entry:
+!	r4: Value to shift
+!	r5: Shift count
 !
-! r4: Value to shift
-! r5: Shifts
-!
 ! Exit:
+!	r0: Result
 !
-! r0: Result
-!
 ! Destroys:
+!	T bit
 !
-! (none)
 !
+! GLOBAL(ashlsi3_r0)
+!
+! Entry:
+!	r4: Value to shift
+!	r0: Shift count
+!
+! Exit:
+!	r0: Result
+!
+! Destroys:
+!	T bit
+
 	.global	GLOBAL(ashlsi3)
+	.global GLOBAL(ashlsi3_r0)
 	HIDDEN_FUNC(GLOBAL(ashlsi3))
+	HIDDEN_FUNC(GLOBAL(ashlsi3_r0))
+GLOBAL(ashlsi3):
+	mov	r5,r0
 	.align	2
-GLOBAL(ashlsi3):
-	mov	#31,r0
-	and	r0,r5
+GLOBAL(ashlsi3_r0):
+
+#ifdef __sh1__
+	and	#31,r0
+	shll2	r0
+	mov.l	r4,@-r15
+	mov	r0,r4
 	mova	LOCAL(ashlsi3_table),r0
-	mov.b	@(r0,r5),r5
-#ifdef __sh1__
-	add	r5,r0
+	add	r4,r0
+	mov.l	@r15+,r4
 	jmp	@r0
+	mov	r4,r0
+	.align 2
 #else
-	braf	r5
+	and	#31,r0
+	shll2	r0
+	braf	r0
+	mov	r4,r0
 #endif
-	mov	r4,r0
 
-	.align	2
 LOCAL(ashlsi3_table):
-	.byte		LOCAL(ashlsi3_0)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_1)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_2)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_3)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_4)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_5)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_6)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_7)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_8)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_9)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_10)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_11)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_12)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_13)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_14)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_15)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_16)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_17)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_18)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_19)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_20)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_21)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_22)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_23)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_24)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_25)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_26)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_27)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_28)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_29)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_30)-LOCAL(ashlsi3_table)
-	.byte		LOCAL(ashlsi3_31)-LOCAL(ashlsi3_table)
-
-LOCAL(ashlsi3_6):
-	shll2	r0
-LOCAL(ashlsi3_4):
-	shll2	r0
-LOCAL(ashlsi3_2):
+	rts				// << 0
+	nop
+LOCAL(ashlsi_1):
+	rts				// << 1
+	shll	r0
+LOCAL(ashlsi_2):			// << 2
 	rts
 	shll2	r0
-
-LOCAL(ashlsi3_7):
+	bra	LOCAL(ashlsi_1)		// << 3
 	shll2	r0
-LOCAL(ashlsi3_5):
+	bra	LOCAL(ashlsi_2)		// << 4
 	shll2	r0
-LOCAL(ashlsi3_3):
+	bra	LOCAL(ashlsi_5)		// << 5
+	shll	r0
+	bra	LOCAL(ashlsi_6)		// << 6
 	shll2	r0
-LOCAL(ashlsi3_1):
-	rts
+	bra	LOCAL(ashlsi_7)		// << 7
 	shll	r0
-
-LOCAL(ashlsi3_14):
-	shll2	r0
-LOCAL(ashlsi3_12):
-	shll2	r0
-LOCAL(ashlsi3_10):
-	shll2	r0
-LOCAL(ashlsi3_8):
+LOCAL(ashlsi_8):			// << 8
 	rts
 	shll8	r0
-
-LOCAL(ashlsi3_15):
+	bra	LOCAL(ashlsi_8)		// << 9
+	shll	r0
+	bra	LOCAL(ashlsi_8)		// << 10
 	shll2	r0
-LOCAL(ashlsi3_13):
+	bra	LOCAL(ashlsi_11)	// << 11
+	shll	r0
+	bra	LOCAL(ashlsi_12)	// << 12
 	shll2	r0
-LOCAL(ashlsi3_11):
-	shll2	r0
-LOCAL(ashlsi3_9):
+	bra	LOCAL(ashlsi_13)	// << 13
+	shll	r0
+	bra	LOCAL(ashlsi_14)	// << 14
 	shll8	r0
+	bra	LOCAL(ashlsi_15)	// << 15
+	shll8	r0
+LOCAL(ashlsi_16):			// << 16
 	rts
+	shll16	r0
+	bra	LOCAL(ashlsi_16)	// << 17
 	shll	r0
-
-LOCAL(ashlsi3_22):
+	bra	LOCAL(ashlsi_16)	// << 18
 	shll2	r0
-LOCAL(ashlsi3_20):
+	bra	LOCAL(ashlsi_19)	// << 19
+	shll	r0
+	bra	LOCAL(ashlsi_20)	// << 20
 	shll2	r0
-LOCAL(ashlsi3_18):
-	shll2	r0
-LOCAL(ashlsi3_16):
-	rts
+	bra	LOCAL(ashlsi_21)	// << 21
+	shll	r0
+	bra	LOCAL(ashlsi_22)	// << 22
 	shll16	r0
-
-LOCAL(ashlsi3_23):
+	bra	LOCAL(ashlsi_23)	// << 23
+	shll16	r0
+	bra	LOCAL(ashlsi_16)	// << 24
+	shll8	r0
+	bra	LOCAL(ashlsi_25)	// << 25
+	shll	r0
+	bra	LOCAL(ashlsi_26)	// << 26
 	shll2	r0
-LOCAL(ashlsi3_21):
+	bra	LOCAL(ashlsi_27)	// << 27
+	shll	r0
+	bra	LOCAL(ashlsi_28)	// << 28
 	shll2	r0
-LOCAL(ashlsi3_19):
-	shll2	r0
-LOCAL(ashlsi3_17):
+	bra	LOCAL(ashlsi_29)	// << 29
 	shll16	r0
+	bra	LOCAL(ashlsi_30)	// << 30
+	shll16	r0
+	and	#1,r0			// << 31
 	rts
-	shll	r0
+	rotr	r0
 
-LOCAL(ashlsi3_30):
+LOCAL(ashlsi_7):
 	shll2	r0
-LOCAL(ashlsi3_28):
+LOCAL(ashlsi_5):
+LOCAL(ashlsi_6):
 	shll2	r0
-LOCAL(ashlsi3_26):
+	rts
+LOCAL(ashlsi_13):
 	shll2	r0
-LOCAL(ashlsi3_24):
+LOCAL(ashlsi_12):
+LOCAL(ashlsi_11):
+	shll8	r0
+	rts
+LOCAL(ashlsi_21):
+	shll2	r0
+LOCAL(ashlsi_20):
+LOCAL(ashlsi_19):
 	shll16	r0
 	rts
-	shll8	r0
-
-LOCAL(ashlsi3_31):
+LOCAL(ashlsi_28):
+LOCAL(ashlsi_27):
 	shll2	r0
-LOCAL(ashlsi3_29):
-	shll2	r0
-LOCAL(ashlsi3_27):
-	shll2	r0
-LOCAL(ashlsi3_25):
+LOCAL(ashlsi_26):
+LOCAL(ashlsi_25):
 	shll16	r0
+	rts
 	shll8	r0
+
+LOCAL(ashlsi_22):
+LOCAL(ashlsi_14):
+	shlr2	r0
 	rts
-	shll	r0
+	shll8	r0
 
-LOCAL(ashlsi3_0):
+LOCAL(ashlsi_23):
+LOCAL(ashlsi_15):
+	shlr	r0
 	rts
-	nop
+	shll8	r0
 
+LOCAL(ashlsi_29):
+	shlr	r0
+LOCAL(ashlsi_30):
+	shlr2	r0
+	rts
+	shll16	r0	
+
 	ENDFUNC(GLOBAL(ashlsi3))
+	ENDFUNC(GLOBAL(ashlsi3_r0))
 #endif
 
 #ifdef L_lshiftrt
 
 !
 ! GLOBAL(lshrsi3)
+! (For compatibility with older binaries, not used by compiler)
 !
 ! Entry:
+!	r4: Value to shift
+!	r5: Shift count
 !
-! r4: Value to shift
-! r5: Shifts
-!
 ! Exit:
+!	r0: Result
 !
-! r0: Result
-!
 ! Destroys:
+!	T bit
 !
-! (none)
 !
+! GLOBAL(lshrsi3_r0)
+!
+! Entry:
+!	r4: Value to shift
+!	r0: Shift count
+!
+! Exit:
+!	r0: Result
+!
+! Destroys:
+!	T bit
+
 	.global	GLOBAL(lshrsi3)
+	.global	GLOBAL(lshrsi3_r0)
 	HIDDEN_FUNC(GLOBAL(lshrsi3))
+	HIDDEN_FUNC(GLOBAL(lshrsi3_r0))
+GLOBAL(lshrsi3):
+	mov	r5,r0
 	.align	2
-GLOBAL(lshrsi3):
-	mov	#31,r0
-	and	r0,r5
+GLOBAL(lshrsi3_r0):
+
+#ifdef __sh1__
+	and	#31,r0
+	shll2	r0
+	mov.l	r4,@-r15
+	mov	r0,r4
 	mova	LOCAL(lshrsi3_table),r0
-	mov.b	@(r0,r5),r5
-#ifdef __sh1__
-	add	r5,r0
+	add	r4,r0
+	mov.l	@r15+,r4
 	jmp	@r0
+	mov	r4,r0
+	.align 2
 #else
-	braf	r5
+	and	#31,r0
+	shll2	r0
+	braf	r0
+	mov	r4,r0
 #endif
-	mov	r4,r0
-
-	.align	2
 LOCAL(lshrsi3_table):
-	.byte		LOCAL(lshrsi3_0)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_1)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_2)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_3)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_4)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_5)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_6)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_7)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_8)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_9)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_10)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_11)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_12)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_13)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_14)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_15)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_16)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_17)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_18)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_19)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_20)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_21)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_22)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_23)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_24)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_25)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_26)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_27)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_28)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_29)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_30)-LOCAL(lshrsi3_table)
-	.byte		LOCAL(lshrsi3_31)-LOCAL(lshrsi3_table)
-
-LOCAL(lshrsi3_6):
-	shlr2	r0
-LOCAL(lshrsi3_4):
-	shlr2	r0
-LOCAL(lshrsi3_2):
+	rts				// >> 0
+	nop
+LOCAL(lshrsi_1):			// >> 1
 	rts
+	shlr	r0
+LOCAL(lshrsi_2):			// >> 2
+	rts
 	shlr2	r0
-
-LOCAL(lshrsi3_7):
+	bra	LOCAL(lshrsi_1)		// >> 3
 	shlr2	r0
-LOCAL(lshrsi3_5):
+	bra	LOCAL(lshrsi_2)		// >> 4
 	shlr2	r0
-LOCAL(lshrsi3_3):
+	bra	LOCAL(lshrsi_5)		// >> 5
+	shlr	r0
+	bra	LOCAL(lshrsi_6)		// >> 6
 	shlr2	r0
-LOCAL(lshrsi3_1):
-	rts
+	bra	LOCAL(lshrsi_7)		// >> 7
 	shlr	r0
-
-LOCAL(lshrsi3_14):
-	shlr2	r0
-LOCAL(lshrsi3_12):
-	shlr2	r0
-LOCAL(lshrsi3_10):
-	shlr2	r0
-LOCAL(lshrsi3_8):
+LOCAL(lshrsi_8):			// >> 8
 	rts
 	shlr8	r0
-
-LOCAL(lshrsi3_15):
+	bra	LOCAL(lshrsi_8)		// >> 9
+	shlr	r0
+	bra	LOCAL(lshrsi_8)		// >> 10
 	shlr2	r0
-LOCAL(lshrsi3_13):
+	bra	LOCAL(lshrsi_11)	// >> 11
+	shlr	r0
+	bra	LOCAL(lshrsi_12)	// >> 12
 	shlr2	r0
-LOCAL(lshrsi3_11):
-	shlr2	r0
-LOCAL(lshrsi3_9):
+	bra	LOCAL(lshrsi_13)	// >> 13
+	shlr	r0
+	bra	LOCAL(lshrsi_14)	// >> 14
 	shlr8	r0
+	bra	LOCAL(lshrsi_15)	// >> 15
+	shlr8	r0
+LOCAL(lshrsi_16):			// >> 16
 	rts
+	shlr16	r0
+	bra	LOCAL(lshrsi_16)	// >> 17
 	shlr	r0
-
-LOCAL(lshrsi3_22):
+	bra	LOCAL(lshrsi_16)	// >> 18
 	shlr2	r0
-LOCAL(lshrsi3_20):
+	bra	LOCAL(lshrsi_19)	// >> 19
+	shlr	r0
+	bra	LOCAL(lshrsi_20)	// >> 20
 	shlr2	r0
-LOCAL(lshrsi3_18):
-	shlr2	r0
-LOCAL(lshrsi3_16):
-	rts
+	bra	LOCAL(lshrsi_21)	// >> 21
+	shlr	r0
+	bra	LOCAL(lshrsi_22)	// >> 22
 	shlr16	r0
-
-LOCAL(lshrsi3_23):
+	bra	LOCAL(lshrsi_23)	// >> 23
+	shlr16	r0
+	bra	LOCAL(lshrsi_16)	// >> 24
+	shlr8	r0
+	bra	LOCAL(lshrsi_25)	// >> 25
+	shlr	r0
+	bra	LOCAL(lshrsi_26)	// >> 26
 	shlr2	r0
-LOCAL(lshrsi3_21):
+	bra	LOCAL(lshrsi_27)	// >> 27
+	shlr	r0
+	bra	LOCAL(lshrsi_28)	// >> 28
 	shlr2	r0
-LOCAL(lshrsi3_19):
-	shlr2	r0
-LOCAL(lshrsi3_17):
+	bra	LOCAL(lshrsi_29)	// >> 29
 	shlr16	r0
+	bra	LOCAL(lshrsi_30)	// >> 30
+	shlr16	r0
+	shll	r0			// >> 31
 	rts
-	shlr	r0
+	movt	r0
 
-LOCAL(lshrsi3_30):
+LOCAL(lshrsi_7):
 	shlr2	r0
-LOCAL(lshrsi3_28):
+LOCAL(lshrsi_5):
+LOCAL(lshrsi_6):
 	shlr2	r0
-LOCAL(lshrsi3_26):
+	rts
+LOCAL(lshrsi_13):
 	shlr2	r0
-LOCAL(lshrsi3_24):
+LOCAL(lshrsi_12):
+LOCAL(lshrsi_11):
+	shlr8	r0
+	rts
+LOCAL(lshrsi_21):
+	shlr2	r0
+LOCAL(lshrsi_20):
+LOCAL(lshrsi_19):
 	shlr16	r0
 	rts
-	shlr8	r0
-
-LOCAL(lshrsi3_31):
+LOCAL(lshrsi_28):
+LOCAL(lshrsi_27):
 	shlr2	r0
-LOCAL(lshrsi3_29):
-	shlr2	r0
-LOCAL(lshrsi3_27):
-	shlr2	r0
-LOCAL(lshrsi3_25):
+LOCAL(lshrsi_26):
+LOCAL(lshrsi_25):
 	shlr16	r0
+	rts
 	shlr8	r0
+
+LOCAL(lshrsi_22):
+LOCAL(lshrsi_14):
+	shll2	r0
 	rts
-	shlr	r0
+	shlr8	r0
 
-LOCAL(lshrsi3_0):
+LOCAL(lshrsi_23):
+LOCAL(lshrsi_15):
+	shll	r0
 	rts
-	nop
+	shlr8	r0
 
+LOCAL(lshrsi_29):
+	shll	r0
+LOCAL(lshrsi_30):
+	shll2	r0
+	rts
+	shlr16	r0	
+
 	ENDFUNC(GLOBAL(lshrsi3))
+	ENDFUNC(GLOBAL(lshrsi3_r0))
 #endif
 
 #ifdef L_movmem
Index: gcc/testsuite/gcc.target/sh/pr54089-3.c
===================================================================
--- gcc/testsuite/gcc.target/sh/pr54089-3.c	(revision 0)
+++ gcc/testsuite/gcc.target/sh/pr54089-3.c	(revision 0)
@@ -0,0 +1,40 @@ 
+/* The dynamic shift library functions truncate the shift count to 5 bits.
+   Verify that this is taken into account and no extra shift count
+   truncations are generated before the library call.  */
+/* { dg-do compile { target "sh*-*-*" } } */
+/* { dg-options "-O1" } */
+/* { dg-skip-if "" { "sh*-*-*" } { "*" } { "-m1*" "-m2" "-m2e*" } } */
+/* { dg-final { scan-assembler-not "and" } } */
+/* { dg-final { scan-assembler-not "31" } } */
+
+int
+test00 (unsigned int a, int* b, int c, int* d, unsigned int e)
+{
+  int s = 0;
+  int i;
+  for (i = 0; i < c; ++i)
+    s += d[i] + b[i] + (e << (i & 31));
+  return s;  
+}
+
+int
+test01 (unsigned int a, int* b, int c, int* d, unsigned int e)
+{
+  int s = 0;
+  int i;
+  for (i = 0; i < c; ++i)
+    s += d[i] + b[i] + (e >> (i & 31));
+  return s;  
+}
+
+int
+test03 (unsigned int a, unsigned int b)
+{
+  return b << (a & 31);
+}
+
+unsigned int
+test04 (unsigned int a, int b)
+{
+  return a >> (b & 31);
+}
Index: gcc/config/sh/sh.h
===================================================================
--- gcc/config/sh/sh.h	(revision 190990)
+++ gcc/config/sh/sh.h	(working copy)
@@ -1932,19 +1932,27 @@ 
    like shad and shld.  */
 #define TARGET_DYNSHIFT (TARGET_SH3 || TARGET_SH2A)
 
-#define SH_DYNAMIC_SHIFT_COST \
-  (TARGET_HARD_SH4 ? 1 : TARGET_DYNSHIFT ? (optimize_size ? 1 : 2) : 20)
+/* The cost of using the dynamic shift insns (shad, shld) are the same
+   if they are available.  If they are not available a library function will
+   be emitted instead, which is more expensive.  */
+#define SH_DYNAMIC_SHIFT_COST (TARGET_DYNSHIFT ? 1 : 20)
 
-/* Immediate shift counts are truncated by the output routines (or was it
-   the assembler?).  Shift counts in a register are truncated by SH.  Note
-   that the native compiler puts too large (> 32) immediate shift counts
-   into a register and shifts by the register, letting the SH decide what
-   to do instead of doing that itself.  */
-/* ??? The library routines in lib1funcs.S truncate the shift count.
-   However, the SH3 has hardware shifts that do not truncate exactly as gcc
-   expects - the sign bit is significant - so it appears that we need to
-   leave this zero for correct SH3 code.  */
-#define SHIFT_COUNT_TRUNCATED (! TARGET_SH3 && ! TARGET_SH2A)
+/* Defining SHIFT_COUNT_TRUNCATED tells the combine pass that code like
+   (X << (Y % 32)) for register X, Y is equivalent to (X << Y).
+   This is not generally true when hardware dynamic shifts (shad, shld) are
+   used, because they check the sign bit _before_ the modulo op.  The sign
+   bit determines whether it is a left shift or a right shift:
+     if (Y < 0)
+       return X << (Y & 31);
+     else
+       return X >> ((-Y) & 31);
+ 
+   The dynamic shift library routines in lib1funcs.S do not use the sign bit
+   like the hardware dynamic shifts and truncate the shift count to 31.
+   We define SHIFT_COUNT_TRUNCATED to 0 and express the implied shift count
+   truncation in the library function call patterns, as this gives slightly
+   more compact code.  */
+#define SHIFT_COUNT_TRUNCATED (0)
 
 /* CANONICALIZE_COMPARISON macro for the combine pass.  */
 #define CANONICALIZE_COMPARISON(CODE, OP0, OP1) \
Index: gcc/config/sh/sh.md
===================================================================
--- gcc/config/sh/sh.md	(revision 190990)
+++ gcc/config/sh/sh.md	(working copy)
@@ -4023,6 +4023,17 @@ 
 					   operands[2]));
       DONE;
     }
+
+  /* Expand a library call for the dynamic shift.  */
+  if (!CONST_INT_P (operands[2]) && !TARGET_DYNSHIFT)
+    {
+      emit_move_insn (gen_rtx_REG (SImode, R4_REG), operands[1]);
+      rtx funcaddr = gen_reg_rtx (Pmode);
+      function_symbol (funcaddr, "__ashlsi3_r0", SFUNC_STATIC);
+      emit_insn (gen_ashlsi3_d_call (operands[0], operands[2], funcaddr));
+
+      DONE;
+    }
 })
 
 (define_insn "ashlsi3_k"
@@ -4067,6 +4078,23 @@ 
 }
   [(set_attr "type" "dyn_shift")])
 
+;; If dynamic shifts are not available use a library function.
+;; By specifying the pattern we reduce the number of call clobbered regs.
+;; In order to make combine understand the truncation of the shift amount
+;; operand we have to allow it to use pseudo regs for the shift operands.
+(define_insn "ashlsi3_d_call"
+  [(set (match_operand:SI 0 "arith_reg_dest" "=z")
+	(ashift:SI (reg:SI R4_REG)
+		   (and:SI (match_operand:SI 1 "arith_reg_operand" "z")
+			   (const_int 31))))
+   (use (match_operand:SI 2 "arith_reg_operand" "r"))
+   (clobber (reg:SI T_REG))
+   (clobber (reg:SI PR_REG))]
+  "TARGET_SH1 && !TARGET_DYNSHIFT"
+  "jsr	@%2%#"
+  [(set_attr "type" "sfunc")
+   (set_attr "needs_delay_slot" "yes")])
+
 (define_insn_and_split "ashlsi3_n"
   [(set (match_operand:SI 0 "arith_reg_dest" "=r")
 	(ashift:SI (match_operand:SI 1 "arith_reg_operand" "0")
@@ -4512,6 +4540,16 @@ 
 		 operands[2]));
       DONE;
     }
+
+  /* Expand a library call for the dynamic shift.  */
+  if (!CONST_INT_P (operands[2]) && !TARGET_DYNSHIFT)
+    {
+      emit_move_insn (gen_rtx_REG (SImode, R4_REG), operands[1]);
+      rtx funcaddr = gen_reg_rtx (Pmode);
+      function_symbol (funcaddr, "__lshrsi3_r0", SFUNC_STATIC);
+      emit_insn (gen_lshrsi3_d_call (operands[0], operands[2], funcaddr));
+      DONE;
+    }
 })
 
 (define_insn "lshrsi3_k"
@@ -4556,6 +4594,23 @@ 
 }
   [(set_attr "type" "dyn_shift")])
 
+;; If dynamic shifts are not available use a library function.
+;; By specifying the pattern we reduce the number of call clobbered regs.
+;; In order to make combine understand the truncation of the shift amount
+;; operand we have to allow it to use pseudo regs for the shift operands.
+(define_insn "lshrsi3_d_call"
+  [(set (match_operand:SI 0 "arith_reg_dest" "=z")
+	(lshiftrt:SI (reg:SI R4_REG)
+		     (and:SI (match_operand:SI 1 "arith_reg_operand" "z")
+			     (const_int 31))))
+   (use (match_operand:SI 2 "arith_reg_operand" "r"))
+   (clobber (reg:SI T_REG))
+   (clobber (reg:SI PR_REG))]
+  "TARGET_SH1 && !TARGET_DYNSHIFT"
+  "jsr	@%2%#"
+  [(set_attr "type" "sfunc")
+   (set_attr "needs_delay_slot" "yes")])
+
 (define_insn_and_split "lshrsi3_n"
   [(set (match_operand:SI 0 "arith_reg_dest" "=r")
 	(lshiftrt:SI (match_operand:SI 1 "arith_reg_operand" "0")
Index: gcc/config/sh/predicates.md
===================================================================
--- gcc/config/sh/predicates.md	(revision 190990)
+++ gcc/config/sh/predicates.md	(working copy)
@@ -791,9 +791,8 @@ 
   /* Allow T_REG as shift count for dynamic shifts, although it is not
      really possible.  It will then be copied to a general purpose reg.  */
   if (! TARGET_SHMEDIA)
-    return const_int_operand (op, mode)
-	   || (TARGET_DYNSHIFT && (arith_reg_operand (op, mode)
-				   || t_reg_operand (op, mode)));
+    return const_int_operand (op, mode) || arith_reg_operand (op, mode)
+	   || (TARGET_DYNSHIFT && t_reg_operand (op, mode));
 
   return (CONSTANT_P (op)
 	  ? (CONST_INT_P (op)
Index: gcc/config/sh/sh.c
===================================================================
--- gcc/config/sh/sh.c	(revision 190990)
+++ gcc/config/sh/sh.c	(working copy)
@@ -2871,35 +2871,35 @@ 
 
 static const struct ashl_lshr_sequence ashl_lshr_seq[32] =
 {
-  { 0, { 0 },		    0 },
+  { 0, { 0 },		    0 },		// 0
   { 1, { 1 },		    LSHR_CLOBBERS_T },
   { 1, { 2 },		    0 },
   { 2, { 2, 1 },	    LSHR_CLOBBERS_T },
-  { 2, { 2, 2 },	    0 },
+  { 2, { 2, 2 },	    0 },		// 4
   { 3, { 2, 1, 2 },	    LSHR_CLOBBERS_T },
   { 3, { 2, 2, 2 },	    0 },
   { 4, { 2, 2, 1, 2 },	    LSHR_CLOBBERS_T },
-  { 1, { 8 },		    0 },
+  { 1, { 8 },		    0 },		// 8
   { 2, { 8, 1 },	    LSHR_CLOBBERS_T },
   { 2, { 8, 2 },	    0 },
   { 3, { 8, 1, 2 },	    LSHR_CLOBBERS_T },
-  { 3, { 8, 2, 2 },	    0 },
+  { 3, { 8, 2, 2 },	    0 },		// 12
   { 4, { 8, 2, 1, 2 },	    LSHR_CLOBBERS_T },
   { 3, { 8, -2, 8 },	    0 },
   { 3, { 8, -1, 8 },	    ASHL_CLOBBERS_T },
-  { 1, { 16 },		    0 },
+  { 1, { 16 },		    0 },		// 16
   { 2, { 16, 1 },	    LSHR_CLOBBERS_T },
   { 2, { 16, 2 },	    0 },
   { 3, { 16, 1, 2 },	    LSHR_CLOBBERS_T },
-  { 3, { 16, 2, 2 },	    0 },
+  { 3, { 16, 2, 2 },	    0 },		// 20
   { 4, { 16, 2, 1, 2 },	    LSHR_CLOBBERS_T },
   { 3, { 16, -2, 8 },	    0 },
   { 3, { 16, -1, 8 },	    ASHL_CLOBBERS_T },
-  { 2, { 16, 8 },	    0 },
+  { 2, { 16, 8 },	    0 },		// 24
   { 3, { 16, 1, 8 },	    LSHR_CLOBBERS_T },
   { 3, { 16, 8, 2 },	    0 },
   { 4, { 16, 8, 1, 2 },     LSHR_CLOBBERS_T },
-  { 4, { 16, 8, 2, 2 },	    0 },
+  { 4, { 16, 8, 2, 2 },	    0 },		// 28
   { 4, { 16, -1, -2, 16 },  ASHL_CLOBBERS_T },
   { 3, { 16, -2, 16 },	    0 },
 
@@ -2915,35 +2915,35 @@ 
    kind of sign or zero extension.  */
 static const struct ashl_lshr_sequence ext_ashl_lshr_seq[32] =
 {
-  { 0, { 0 },		    0 },
+  { 0, { 0 },		    0 },		// 0
   { 1, { 1 },		    LSHR_CLOBBERS_T },
   { 1, { 2 },		    0 },
   { 2, { 2, 1 },	    LSHR_CLOBBERS_T },
-  { 2, { 2, 2 },	    0 },
+  { 2, { 2, 2 },	    0 },		// 4
   { 3, { 2, 1, 2 },	    LSHR_CLOBBERS_T },
   { 2, { 8, -2 },	    0 },
   { 2, { 8, -1 },	    ASHL_CLOBBERS_T },
-  { 1, { 8 },		    0 },
+  { 1, { 8 },		    0 },		// 8
   { 2, { 8, 1 },	    LSHR_CLOBBERS_T },
   { 2, { 8, 2 },	    0 },
   { 3, { 8, 1, 2 },	    LSHR_CLOBBERS_T },
-  { 3, { 8, 2, 2 },	    0 },
+  { 3, { 8, 2, 2 },	    0 },		// 12
   { 3, { 16, -2, -1 },	    ASHL_CLOBBERS_T },
   { 2, { 16, -2 },	    0 },
   { 2, { 16, -1 },	    ASHL_CLOBBERS_T },
-  { 1, { 16 },		    0 },
+  { 1, { 16 },		    0 },		// 16
   { 2, { 16, 1 },	    LSHR_CLOBBERS_T },
   { 2, { 16, 2 },	    0 },
   { 3, { 16, 1, 2 },	    LSHR_CLOBBERS_T },
-  { 3, { 16, 2, 2 },	    0 },
+  { 3, { 16, 2, 2 },	    0 },		// 20
   { 4, { 16, 2, 1, 2 },	    LSHR_CLOBBERS_T },
   { 3, { 16, -2, 8 },	    0 },
   { 3, { 16, -1, 8 },	    ASHL_CLOBBERS_T },
-  { 2, { 16, 8 },	    0 },
+  { 2, { 16, 8 },	    0 },		// 24
   { 3, { 16, 1, 8 },	    LSHR_CLOBBERS_T },
   { 3, { 16, 8, 2 },	    0 },
   { 4, { 16, 8, 1, 2 },	    LSHR_CLOBBERS_T },
-  { 4, { 16, 8, 2, 2 },	    0 },
+  { 4, { 16, 8, 2, 2 },	    0 },		// 28
   { 4, { 16, -1, -2, 16 },  ASHL_CLOBBERS_T },
   { 3, { 16, -2, 16 },	    0 },
   { 3, { 16, -1, 16 },	    ASHL_CLOBBERS_T }