
[avr]: Speed up 64-bit shifts in libgcc

Message ID 513488C3.1000608@gjlay.de
State New

Commit Message

Georg-Johann Lay March 4, 2013, 11:42 a.m. UTC
This patch improves the speed of the 64-bit shift and rotate operations.

These operations were implemented as pure bit-wise loops, so their speed was
unreasonably low for such basic arithmetic.

The new implementation first shifts byte-wise; only the remaining offset
(count mod 8) is shifted bit-wise.
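
In C terms, the new strategy looks roughly like this sketch (for
illustration only, not the libgcc source):

    #include <stdint.h>

    /* Shift right by n in 0..63: move whole bytes first, then shift
       the remaining n % 8 bits one at a time.  */
    uint64_t lshr64_model (uint64_t x, unsigned n)
    {
        while (n >= 8)          /* one mov chain per whole byte */
        {
            x >>= 8;
            n -= 8;
        }
        while (n--)             /* one lsr/ror chain per leftover bit */
            x >>= 1;
        return x;
    }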

The new code needs a few more instructions, but 64-bit arithmetic takes a lot
of code anyway.  Basic arithmetic should still run reasonably fast and not
take 600 or more ticks for a simple shift.
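
(Rough arithmetic behind that number: the old loop body is 8 single-cycle
shift/rotate instructions plus DEC and a taken BRNE, i.e. about 11 cycles
per bit, so a shift by 56 already costs around 56 * 11 = 616 cycles.  The
byte-wise version needs at most 7 byte-move rounds plus at most 7 bit
rounds instead.)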

Ok for trunk?

Johann


	* config/avr/lib1funcs.S (__ashrdi3, __lshrdi3, __ashldi3)
	(__rotldi3): Shift bytewise if applicable.
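
For context: these routines implement plain 64-bit shifts from C, for which
AVR has no native support.  A function like

    #include <stdint.h>

    uint64_t shift_down (uint64_t x, unsigned n)
    {
        return x >> n;
    }

typically compiles to a call to __lshrdi3 on this target.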

Comments

Denis Chertykov March 4, 2013, 11:45 a.m. UTC | #1
2013/3/4 Georg-Johann Lay <avr@gjlay.de>:
> This patch improves the speed of the 64-bit shift and rotate operations.
>
> These operations were implemented as pure bit-wise loops, so their speed was
> unreasonably low for such basic arithmetic.
>
> The new implementation first shifts byte-wise; only the remaining offset
> (count mod 8) is shifted bit-wise.
>
> The new code needs a few more instructions, but 64-bit arithmetic takes a lot
> of code anyway.  Basic arithmetic should still run reasonably fast and not
> take 600 or more ticks for a simple shift.
>
> Ok for trunk?
>
> Johann
>
>
>         * config/avr/lib1funcs.S (__ashrdi3, __lshrdi3, __ashldi3)
>         (__rotldi3): Shift bytewise if applicable.
>

Approved.

Denis.

Patch

Index: config/avr/lib1funcs.S
===================================================================
--- config/avr/lib1funcs.S	(revision 196329)
+++ config/avr/lib1funcs.S	(working copy)
@@ -3030,64 +3030,73 @@  ENDF __bswapdi2
 ;; Arithmetic shift right
 ;; r25:r18 = ashr64 (r25:r18, r17:r16)
 DEFUN __ashrdi3
-    push r16
-    andi r16, 63
-    breq 2f
-1:  asr  r25
-    ror  r24
-    ror  r23
-    ror  r22
-    ror  r21
-    ror  r20
-    ror  r19
-    ror  r18
-    dec  r16
-    brne 1b
-2:  pop  r16
-    ret
-ENDF __ashrdi3
-#endif /* defined (L_ashrdi3) */
+    bst     r25, 7
+    bld     __zero_reg__, 0
+    ;; FALLTHRU
+ENDF  __ashrdi3
 
-#if defined (L_lshrdi3)
 ;; Logic shift right
 ;; r25:r18 = lshr64 (r25:r18, r17:r16)
 DEFUN __lshrdi3
-    push r16
-    andi r16, 63
-    breq 2f
-1:  lsr  r25
-    ror  r24
-    ror  r23
-    ror  r22
-    ror  r21
-    ror  r20
-    ror  r19
-    ror  r18
-    dec  r16
-    brne 1b
-2:  pop  r16
+    lsr     __zero_reg__
+    sbc     __tmp_reg__, __tmp_reg__
+    push    r16
+0:  cpi     r16, 8
+    brlo 2f
+    subi    r16, 8
+    mov     r18, r19
+    mov     r19, r20
+    mov     r20, r21
+    mov     r21, r22
+    mov     r22, r23
+    mov     r23, r24
+    mov     r24, r25
+    mov     r25, __tmp_reg__
+    rjmp 0b
+1:  asr     __tmp_reg__
+    ror     r25
+    ror     r24
+    ror     r23
+    ror     r22
+    ror     r21
+    ror     r20
+    ror     r19
+    ror     r18
+2:  dec     r16
+    brpl 1b
+    pop     r16
     ret
 ENDF __lshrdi3
-#endif /* defined (L_lshrdi3) */
+#endif /* defined (L_ashrdi3) */
 
 #if defined (L_ashldi3)
 ;; Shift left
 ;; r25:r18 = ashl64 (r25:r18, r17:r16)
 DEFUN __ashldi3
-    push r16
-    andi r16, 63
-    breq 2f
-1:  lsl  r18
-    rol  r19
-    rol  r20
-    rol  r21
-    rol  r22
-    rol  r23
-    rol  r24
-    rol  r25
-    dec  r16
-    brne 1b
-2:  pop  r16
+    push    r16
+0:  cpi     r16, 8
+    brlo 2f
+    mov     r25, r24
+    mov     r24, r23
+    mov     r23, r22
+    mov     r22, r21
+    mov     r21, r20
+    mov     r20, r19
+    mov     r19, r18
+    clr     r18
+    subi    r16, 8
+    rjmp 0b
+1:  lsl     r18
+    rol     r19
+    rol     r20
+    rol     r21
+    rol     r22
+    rol     r23
+    rol     r24
+    rol     r25
+2:  dec     r16
+    brpl 1b
+    pop     r16
     ret
 ENDF __ashldi3
 #endif /* defined (L_ashldi3) */
@@ -3096,21 +3105,32 @@  ENDF __ashldi3
 ;; Shift left
 ;; r25:r18 = rotl64 (r25:r18, r17:r16)
 DEFUN __rotldi3
-    push r16
-    andi r16, 63
-    breq 2f
-1:  lsl  r18
-    rol  r19
-    rol  r20
-    rol  r21
-    rol  r22
-    rol  r23
-    rol  r24
-    rol  r25
-    adc  r18, __zero_reg__
-    dec  r16
-    brne 1b
-2:  pop  r16
+    push    r16
+0:  cpi     r16, 8
+    brlo 2f
+    subi    r16, 8
+    mov     __tmp_reg__, r25
+    mov     r25, r24
+    mov     r24, r23
+    mov     r23, r22
+    mov     r22, r21
+    mov     r21, r20
+    mov     r20, r19
+    mov     r19, r18
+    mov     r18, __tmp_reg__
+    rjmp 0b
+1:  lsl     r18
+    rol     r19
+    rol     r20
+    rol     r21
+    rol     r22
+    rol     r23
+    rol     r24
+    rol     r25
+    adc     r18, __zero_reg__
+2:  dec     r16
+    brpl 1b
+    pop     r16
     ret
 ENDF __rotldi3
 #endif /* defined (L_rotldi3) */
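
A note on the structure above: __ashrdi3 just records the sign bit in
__zero_reg__ and falls through to __lshrdi3, where "lsr" plus "sbc" turn
it into a 0x00 or 0xFF fill byte in __tmp_reg__, so one loop serves both
the arithmetic and the logical shift.  The rotate uses the same
byte-then-bit pattern; in C terms it behaves roughly like this sketch
(illustration only, not the libgcc source):

    #include <stdint.h>

    /* Rotate left by n in 0..63: whole bytes first, then single bits.
       The "adc r18, __zero_reg__" in the assembly feeds the bit shifted
       out at the top back into bit 0.  */
    uint64_t rotl64_model (uint64_t x, unsigned n)
    {
        while (n >= 8)
        {
            x = (x << 8) | (x >> 56);
            n -= 8;
        }
        while (n--)
            x = (x << 1) | (x >> 63);
        return x;
    }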