diff mbox

[AVR] : Tweak division

Message ID 4EB28F42.50301@gjlay.de
State New
Headers show

Commit Message

Georg-Johann Lay Nov. 3, 2011, 12:55 p.m. UTC
This is a tweak for the signed 16- and 32-bit division routines.
The old code always called the subroutine __divmod{si|hi}4_neg1, which returned
immediately if the T-flag was not set.  This is costly.  By shuffling the
instructions, the test can be moved up into the caller without increasing the
code size, which saves calls here and there.

The speed gain is 1..17 ticks for ATmega88, which is a speed-up of up to 7% for
16-bit division (formerly about 230-240 ticks).  For 32-bit division the
absolute speed gain is the same.

Moreover, addqi3 can now handle +/-2, which saves a reload if the operand is in
a non-d register.  The new *negqihi2 insn is for code like

int minus (char a)
{
    return -a;
}

that compiled to

minus:
	clr r25	 ;  6	extendqihi2/1	[length = 3]
	sbrc r24,7
	com r25
	com r25	 ;  7	neghi2/1	[length = 3]
	neg r24
	sbci r25,lo8(-1)
	ret	 ;  25	return	[length = 1]

and is now compiled to a shorter, faster sequence that does not need a d-register:

minus:
	clr r25	 ;  7	*negqihi2	[length = 4]
	neg r24
	brge .+2
	com r25
	ret	 ;  25	return	[length = 1]

Tested without regressions. Moreover, the new sequences are tested individually
against the old code.

The patch is against the old infrastructure but the changelog is already for
the new libgcc layout.

Ok for trunk?

Johann

gcc/
	* config/avr/constraints.md (Cm2): New constraint for int -2.
	* config/avr/avr.md (addqi3): Use it. New alternatives for +/-2.
	(*negqihi2): New insn.
libgcc/
	* config/avr/lib1funcs.S (__divmodhi4, __divmodsi4): Tweak speed.

Comments

Denis Chertykov Nov. 4, 2011, 11:15 a.m. UTC | #1
2011/11/3 Georg-Johann Lay <avr@gjlay.de>:
> This is a tweak for signed 16- and 32-bit division routines.
> The old code called subroutine __divmod{si|hi}4_neg1 and returned if T-flag is
> not set.  This is costly.  By shuffling the instructions the test can be moved
> up without increasing the code size but saving calls here and there.
>
> The speed gain is 1..17 ticks for ATmega88 which is a speed-up of up to 7% for
> 16-bit division (formerly about 230-240 ticks).  For 16-bit division the
> absolute speed gain is the same.
>
> Moreover, addqi3 can handle +/-2 now which saves reload if the constant in
> non-d register.  The new *negqihi2 insn is for code like
>
> int minus (char a)
> {
>    return -a;
> }
>
> that compiled to
>
> minus:
>        clr r25  ;  6   extendqihi2/1   [length = 3]
>        sbrc r24,7
>        com r25
>        com r25  ;  7   neghi2/1        [length = 3]
>        neg r24
>        sbci r25,lo8(-1)
>        ret      ;  25  return  [length = 1]
>
> and now is compiled to a shorter, faster sequence without need of d-register:
>
> minus:
>        clr r25  ;  7   *negqihi2       [length = 4]
>        neg r24
>        brge .+2
>        com r25
>        ret      ;  25  return  [length = 1]
>
> Tested without regressions. Moreover, the new sequences are tested individually
> against the old code.
>
> The patch is against the old infrastructure but the changelog is already for
> the new libgcc layout.
>
> Ok for trunk?
>
> Johann
>
> gcc/
>        * config/avr/constraints.md (Cm2): New constraint for int -2.
>        * config/avr/avr.md (addqi3): Use it. New alternatives for +/-2.
>        (*negqihi2): New insn.
> libgcc/
>        * config/avr/lib1funcs.S (__divmodhi4, __divmodsi4): Tweak speed.
>

Approved.

Denis.
diff mbox

Patch

Index: config/avr/libgcc.S
===================================================================
--- config/avr/libgcc.S	(revision 180738)
+++ config/avr/libgcc.S	(working copy)
@@ -565,27 +565,28 @@  DEFUN __divmodhi4
 	.global	_div
 _div:
         bst     r_arg1H,7	; store sign of dividend
-        mov     __tmp_reg__,r_arg1H
-        eor     __tmp_reg__,r_arg2H   ; r0.7 is sign of result
+        mov     __tmp_reg__,r_arg2H
+        brtc	0f
+        com     __tmp_reg__   ; r0.7 is sign of result
 	rcall	__divmodhi4_neg1 ; dividend negative : negate
+0:
 	sbrc	r_arg2H,7
 	rcall	__divmodhi4_neg2 ; divisor negative : negate
 	XCALL	__udivmodhi4	; do the unsigned div/mod
-	rcall	__divmodhi4_neg1 ; correct remainder sign
-	tst	__tmp_reg__
-	brpl	__divmodhi4_exit
+	sbrc	__tmp_reg__,7
+	rcall	__divmodhi4_neg2 ; correct remainder sign
+	brtc	__divmodhi4_exit
+__divmodhi4_neg1:
+	com	r_arg1H
+	neg	r_arg1L		; correct dividend/remainder sign
+	sbci	r_arg1H,0xff
+	ret
 __divmodhi4_neg2:
 	com	r_arg2H
 	neg	r_arg2L		; correct divisor/result sign
 	sbci	r_arg2H,0xff
 __divmodhi4_exit:
 	ret
-__divmodhi4_neg1:
-	brtc	__divmodhi4_exit
-	com	r_arg1H
-	neg	r_arg1L		; correct dividend/remainder sign
-	sbci	r_arg1H,0xff
-	ret
 ENDF __divmodhi4
 #endif /* defined (L_divmodhi4) */
 
@@ -672,16 +673,27 @@  ENDF __udivmodsi4
 
 #if defined (L_divmodsi4)
 DEFUN __divmodsi4
-        bst     r_arg1HH,7	; store sign of dividend
-        mov     __tmp_reg__,r_arg1HH
-        eor     __tmp_reg__,r_arg2HH   ; r0.7 is sign of result
+	mov	__tmp_reg__,r_arg2HH
+	bst	r_arg1HH,7	; store sign of dividend
+	brtc	0f
+	com	__tmp_reg__     ; r0.7 is sign of result
 	rcall	__divmodsi4_neg1 ; dividend negative : negate
+0:
 	sbrc	r_arg2HH,7
 	rcall	__divmodsi4_neg2 ; divisor negative : negate
 	XCALL	__udivmodsi4	; do the unsigned div/mod
-	rcall	__divmodsi4_neg1 ; correct remainder sign
-	rol	__tmp_reg__
-	brcc	__divmodsi4_exit
+	sbrc	__tmp_reg__, 7  ; correct quotient sign
+	rcall	__divmodsi4_neg2
+	brtc	__divmodsi4_exit ; correct remainder sign
+__divmodsi4_neg1:
+	com	r_arg1HH
+	com	r_arg1HL
+	com	r_arg1H
+	neg	r_arg1L		; correct dividend/remainder sign
+	sbci	r_arg1H, 0xff
+	sbci	r_arg1HL,0xff
+	sbci	r_arg1HH,0xff
+	ret
 __divmodsi4_neg2:
 	com	r_arg2HH
 	com	r_arg2HL
@@ -692,16 +704,6 @@  __divmodsi4_neg2:
 	sbci	r_arg2HH,0xff
 __divmodsi4_exit:
 	ret
-__divmodsi4_neg1:
-	brtc	__divmodsi4_exit
-	com	r_arg1HH
-	com	r_arg1HL
-	com	r_arg1H
-	neg	r_arg1L		; correct dividend/remainder sign
-	sbci	r_arg1H, 0xff
-	sbci	r_arg1HL,0xff
-	sbci	r_arg1HH,0xff
-	ret
 ENDF __divmodsi4
 #endif /* defined (L_divmodsi4) */
 
Index: config/avr/avr.md
===================================================================
--- config/avr/avr.md	(revision 180739)
+++ config/avr/avr.md	(working copy)
@@ -739,17 +739,19 @@  (define_insn "*strlenhi"
 ; add bytes
 
 (define_insn "addqi3"
-  [(set (match_operand:QI 0 "register_operand" "=r,d,r,r")
-        (plus:QI (match_operand:QI 1 "register_operand" "%0,0,0,0")
-                 (match_operand:QI 2 "nonmemory_operand" "r,i,P,N")))]
+  [(set (match_operand:QI 0 "register_operand"          "=r,d,r,r,r,r")
+        (plus:QI (match_operand:QI 1 "register_operand" "%0,0,0,0,0,0")
+                 (match_operand:QI 2 "nonmemory_operand" "r,i,P,N,K,Cm2")))]
   ""
   "@
 	add %0,%2
 	subi %0,lo8(-(%2))
 	inc %0
-	dec %0"
-  [(set_attr "length" "1,1,1,1")
-   (set_attr "cc" "set_czn,set_czn,set_zn,set_zn")])
+	dec %0
+	inc %0\;inc %0
+	dec %0\;dec %0"
+  [(set_attr "length" "1,1,1,1,2,2")
+   (set_attr "cc" "set_czn,set_czn,set_zn,set_zn,set_zn,set_zn")])
 
 
 (define_expand "addhi3"
@@ -3089,6 +3091,14 @@  (define_insn "negqi2"
   [(set_attr "length" "1")
    (set_attr "cc" "set_zn")])
 
+(define_insn "*negqihi2"
+  [(set (match_operand:HI 0 "register_operand"                        "=r")
+        (neg:HI (sign_extend:HI (match_operand:QI 1 "register_operand" "0"))))]
+  ""
+  "clr %B0\;neg %A0\;brge .+2\;com %B0"
+  [(set_attr "length" "4")
+   (set_attr "cc" "set_n")])
+
 (define_insn "neghi2"
   [(set (match_operand:HI 0 "register_operand"       "=!d,r,&r")
 	(neg:HI (match_operand:HI 1 "register_operand" "0,0,r")))]
Index: config/avr/constraints.md
===================================================================
--- config/avr/constraints.md	(revision 180738)
+++ config/avr/constraints.md	(working copy)
@@ -103,6 +103,11 @@  (define_memory_constraint "Q"
   (and (match_code "mem")
        (match_test "extra_constraint_Q (op)")))
 
+(define_constraint "Cm2"
+  "Constant integer @minus{}2."
+  (and (match_code "const_int")
+       (match_test "ival == -2")))
+
 (define_constraint "C03"
   "Constant integer 3."
   (and (match_code "const_int")