diff mbox series

[avr,committed] Tweak IEEE double multiplication

Message ID 3a6f29b8-73ae-4f8b-babf-c772c02fa709@gjlay.de
State New
Headers show
Series [avr,committed] Tweak IEEE double multiplication | expand

Commit Message

Georg-Johann Lay Nov. 8, 2023, 11:53 a.m. UTC
Applied this patch, which improves IEEE double multiplication.
The old code spent time calling local helpers and preparing
their arguments.

Functions that use multiplication, such as expl or sinl, are around
5% to 9% faster now.  The code size did not increase.

Johann

--

LibF7: Tweak IEEE double multiplication.

libgcc/config/avr/libf7/
	* libf7-asm.sx (mul_mant) [AVR_HAVE_MUL]: Tweak code.


      ;; Finally...

@@ -1032,38 +1070,6 @@ DEFUN mul_mant

      do_epilogue_restores 10

-;; TT0 * Tmp  -> 3:2
-;; TT0 * Atmp -> 1:0
-;; BB  * Atmp -> a:-
-;;
-;; Clobbers : TMP, TT0...TT3.
-;; Sets     : ZERO = 0.
-.Lmul.help.3:
-    mul     TT0, TMP    $   movw    TT2, r0
-    mul     TT0, Atmp   $   movw    TT0, r0
-    mul     BB,  Atmp
-
-    ADD     CA, r1
-    adc     C0, TT0     $   adc     C1, TT1
-    adc     C2, TT2
-.Lmul.help.3.C3:        $   adc     C3, TT3     $ clr ZERO
-    adc     C4, ZERO    $   adc     C5, ZERO
-    adc     C6, ZERO
-    ret
-
-;; BB * TMP  -> 2:1
-;; BB * Atmp -> 0:a
-;;
-;; Asserts  : TT3 = 0
-;; Clobbers : TMP, TT0, TT1.
-;; Sets     : ZERO = 0.
-.Lmul.help.2:
-    mul     BB, TMP     $   movw    TT0, r0
-    mul     BB, Atmp
-    ADD     CA, r0      $   adc     C0, r1
-    adc     C1, TT0     $   adc     C2, TT1
-    rjmp .Lmul.help.3.C3
-
  ENDF mul_mant
  #endif /* F7MOD_mul_mant_ && MUL */
diff mbox series

Patch

diff --git a/libgcc/config/avr/libf7/libf7-asm.sx b/libgcc/config/avr/libf7/libf7-asm.sx
index 4505764c126..01d1fa3e876 100644
--- a/libgcc/config/avr/libf7/libf7-asm.sx
+++ b/libgcc/config/avr/libf7/libf7-asm.sx
@@ -877,10 +877,14 @@  DEFUN ashldi3
  ;; R18.0 = 1: No rounding.

  DEFUN mul_mant
+    ;; 10 = Y, R17...R10
      do_prologue_saves 10
+    ;; T = R18.0: Skip rounding?
      bst     r18,    0
+    ;; Save result address for later.
      push    r25
      push    r24
+    ;; Load A's mantissa.
      movw    ZL,     r22
      LDD     A0,     Z+0+Off
      LDD     A1,     Z+1+Off
@@ -913,26 +917,15 @@  DEFUN mul_mant
      adc     C6, ZERO
      ;; Done B6

-    ;; 3 * 3 -> 0:a
-    ;; 4 * 4 -> 2:1
-    ;; 5 * 5 -> 4:3
-    ldd     BB, Z+3+Off $   mul     A3, BB      $   movw    TT0, r0
-    ldd     BB, Z+4+Off $   mul     A4, BB      $   movw    TT2, r0
-    ldd     BB, Z+5+Off $   mul     A5, BB
-
-    ADD     CA, TT0     $   adc     C0, TT1
-    adc     C1, TT2     $   adc     C2, TT3
-    adc     C3, r0      $   adc     C4, r1
-    brcc .+2
-    adiw    C5, 1
-
      ;; 6 * 5 -> 5:4
      ;; 4 * 5 -> 3:2
      ;; 2 * 5 -> 1:0
      ;; 0 * 5 -> a:-
+    ldd     BB, Z+5+Off
      mul     A0, BB
-    ;; A0 done
+    ;; Done A0
  #define Atmp A0
+#define Null A0

      mov     Atmp, r1
      mul     A6, BB      $   movw    TT2, r0
@@ -942,82 +935,127 @@  DEFUN mul_mant
      ADD     CA, Atmp
      adc     C0, r0      $   adc     C1, r1
      adc     C2, TT0     $   adc     C3, TT1
-    adc     C4, TT2     $   adc     C5, TT3     $   clr ZERO
-    adc     C6, ZERO
+    adc     C4, TT2     $   adc     C5, TT3     $   clr Null
+    adc     C6, Null

      ;; 1 * 5 -> 0:a
      ;; 3 * 5 -> 2:1
-    ;; 6 * 4 -> 4:3
+    ;; 5 * 5 -> 4:3
      mul     A1, BB      $   movw    TT0, r0
      mul     A3, BB      $   movw    TT2, r0
+    mul     A5, BB
+
+    ADD     CA, TT0     $   adc     C0, TT1
+    adc     C1, TT2     $   adc     C2, TT3
+    adc     C3, r0      $   adc     C4, r1
+    adc     C5, Null    $   adc     C6, Null
+    ;; Done B5
+
+    ;; 2 * 4 -> 0:a
+    ;; 4 * 4 -> 2:1
+    ;; 6 * 4 -> 4:3
      ldd     BB, Z+4+Off
+    mul     A2, BB      $   movw    TT0, r0
+    mul     A4, BB      $   movw    TT2, r0
      mul     A6, BB

      ADD     CA, TT0     $   adc     C0, TT1
      adc     C1, TT2     $   adc     C2, TT3
-    adc     C3, r0      $   adc     C4, r1      $   clr ZERO
-    adc     C5, ZERO    $   adc     C6, ZERO
-    ;; B5 done
+    adc     C3, r0      $   adc     C4, r1
+    adc     C5, Null    $   adc     C6, Null

+    ;; 1 * 4 -> a:-
+    ;; 3 * 4 -> 1:0
+    ;; 5 * 4 -> 3:2
+    mul     A1, BB      $   mov     TT1, r1
+    mul     A3, BB      $   movw    TT2, r0
+    mul     A5, BB
+    ;; Done A1
+    ;; Done B4
+    ADD     CA, TT1
+    adc     C0, TT2     $   adc     C1, TT3
+    adc     C2, r0      $   adc     C3, r1
+    ;; Accumulate carry for C3 in TT1.
+    ;; Accumulate carry for C4 in A1.
+#define Cry3 TT1
+#define Cry4 A1
+    clr     Cry3
+    clr     Cry4
+    rol     Cry4
+
+    ;; 6 * 2 -> 2:1
      ;; 6 * 3 -> 3:2
-    ;; 6 * 1 -> 1:0
-    ;; 4 * 1 -> a:-
-    mov     TT0, A6     $   ldd TMP,  Z+3+Off
-    mov     BB,  A4     $   ldd Atmp, Z+1+Off
-    rcall   .Lmul.help.3
+    ;; 5 * 3 -> 2:1
+    ldd     BB, Z+2+Off
+    mul     A6, BB
+    add     C1, r0
+    adc     C2, r1
+    adc     Cry3, Null

-    ;; 5 * 4 -> 3:2
-    ;; 5 * 2 -> 1:0
-    ;; 3 * 2 -> a:-
-    mov     TT0, A5     $   ldd TMP,  Z+4+Off
-    mov     BB,  A3     $   ldd Atmp, Z+2+Off
-    rcall   .Lmul.help.3
+    ldd     BB, Z+3+Off
+    mul     A6, BB
+    add     C2, r0
+    adc     C3, r1
+    adc     Cry4, Null
+
+    mul     A5, BB
+    add     C1, r0
+    adc     C2, r1
+    adc     Cry3, Null

-    ;; 4 *   -> 3:2 (=0)
+    ;; Perform the remaining 11 multiplications in 4 loopings:
      ;; 4 * 3 -> 1:0
+    ;; 3 * 3 -> 0:a
      ;; 2 * 3 -> a:-
-    mov     TT0, A4     $   clr TMP
-    mov     BB,  A2     $   ldd Atmp, Z+3+Off
-    rcall   .Lmul.help.3
-
-    ;; 3 * . -> 3:2 (=0)
-    ;; 3 * 4 -> 1:0
-    ;; 1 * 4 -> a:-
-    mov     TT0, A3     $   clr TMP
-    mov     BB,  A1     $   ldd Atmp, Z+4+Off
-    rcall   .Lmul.help.3
-
-    ;; . * ? -> 3:2 (=0)
-    ;; . * 0 -> 1:0 (=0)
+    ;;
+    ;; 5 * 2 -> 1:0
+    ;; 4 * 2 -> 0:a
+    ;; 3 * 2 -> a:-
+    ;;
+    ;; 6 * 1 -> 1:0
+    ;; 5 * 1 -> 0:a
+    ;; 4 * 1 -> a:-
+    ;;
+    ;; . * 0 -> 1:0  (=0)
+    ;; 6 * 0 -> 0:a
      ;; 5 * 0 -> a:-
-    clr     TT0
-    mov     BB,  A5     $   ldd Atmp, Z+0+Off
-    rcall   .Lmul.help.3

-    clr TT3  ;; Asserted by .Lmul.help.2
-    ;; 6 * 2 -> 2:1
-    ;; 6 * 0 -> 0:a
-                        $   ldd TMP,  Z+2+Off
-    mov     BB, A6     ;$   ldd Atmp, Z+0+Off
-    rcall   .Lmul.help.2
+    ;; BB already contains B3, hence let Z point one past B2 so that
+    ;; the  LD *, -Z  below will pick up B2, B1, B0.
+    adiw    r30,    1 + Off+2

-    ;; 5 * 3 -> 2:1
-    ;; 5 * 1 -> 0:a
-                        $   ldd TMP,  Z+3+Off
-    mov     BB, A5      $   ldd Atmp, Z+1+Off
-    rcall   .Lmul.help.2
+    ;; Accumulate carry for C2 in TT2.
+#define Cry2 TT2
+    clr     Cry2

-    ;; 4 * . -> 2:1 (=0)
-    ;; 4 * 2 -> 0:a
-                        $   clr TMP
-    mov     BB, A4      $   ldd Atmp, Z+2+Off
-    rcall   .Lmul.help.2
+    ;; TT3 is the loop counter, iterate over B3...B0.
+    ldi     TT3,    4
+    rjmp .Loop_start

-    ;; 2 * . -> 2:1 (=0)
-    ;; 2 * 4 -> 0:a
-                        $   clr TMP
-    mov     BB, A2      $   ldd Atmp, Z+4+Off
-    rcall   .Lmul.help.2
+.Loop:
+    ;; We use A2...A4 below; so shift bytes of A into place.
+    mov     A2, A3
+    mov     A3, A4
+    mov     A4, A5
+    mov     A5, A6
+    clr     A6
+    ld      BB, -Z
+.Loop_start:
+    mul     A3, BB
+    ADD     CA, r0      $  adc  C0, r1  $  adc  C1, Null    $  adc  Cry2, Null
+    MUL     A2, BB
+    mov     TT0, r1
+    MUL     A4, BB
+    ADD     CA, TT0     $  adc  C0, r0  $  adc  C1, r1      $  adc  Cry2, Null
+    dec     TT3
+    brne .Loop
+
+    clr     ZERO
+    ADD     C2, Cry2
+    adc     C3, Cry3
+    adc     C4, Cry4
+    adc     C5, ZERO
+    adc     C6, ZERO