diff mbox

[ARM] Optimized 64-bit multiplication for THUMB-1

Message ID AANLkTi=BQk4TdhxW6vUcGzk_YkjMZE8iEZM02cZ7GJSt@mail.gmail.com
State New
Headers show

Commit Message

Doug Kwan (關振德) Oct. 25, 2010, 10:15 a.m. UTC
Hi Paul,

   Thank you very much for your review and comments.  I have fixed the
push/pop and use of 2-argument code in 32-bit code.  I am not quite
sure what the problem in the __thumb2__ test is.  I built arm-eabi-gcc
with arches armv4, armv5te, armv7-a and no-arch and all build was
successful.  I did change the test so that forcing ARM mode is only
done if:

-ARM mode has UMULL instruction
-we are compiling for THUMB-1
-interworking is enabled.

Attached is the updated patch.

-Doug


在 2010年10月23日上午2:20,Paul Brook <paul@codesourcery.com> 寫道:
>> +/* Force using ARM code if it is possible except for THUMB2 target. */
>> +#if defined(USE_FAST_MULDI3) && !defined(__thumb2__)
>> +     ARM_FUNC_START muldi3
>
> The !__thumb2__ test is wrong. I'm surprised this even compiles.
>
>>+      mul     xxh, yyl
>>...
>>+      add     xxh, yyh
>
> Please use the proper 3-argument form in 32-bit code.
>
>>+      push    {r4, r5, r6, r7}
>
> Older assemblers do not support push/pop in ARM mode.
> Use do_push/do_pop.
>
> Paul
>

Comments

Paul Brook Oct. 25, 2010, 11:36 p.m. UTC | #1
> Hi Paul,
> 
>    Thank you very much for your review and comments.  I have fixed the
> push/pop and use of 2-argument code in 32-bit code.  I am not quite
> sure what the problem in the __thumb2__ test is.  I built arm-eabi-gcc
> with arches armv4, armv5te, armv7-a and no-arch and all build was
> successful.  I did change the test so that forcing ARM mode is only
> done if:

No. You're missing the point. ARM_FUNC_START does not force the use of ARM 
mode.  See comments near the definition of that macro.

Paul
diff mbox

Patch

Index: gcc/config/arm/lib1funcs.asm
===================================================================
--- gcc/config/arm/lib1funcs.asm	(revision 165462)
+++ gcc/config/arm/lib1funcs.asm	(working copy)
@@ -1274,6 +1274,94 @@  LSYM(Lover12):
 #endif
 	
 #endif /* L_dvmd_lnx */
+
+#ifdef L_muldi3
+
+/* ------------------------------------------------------------------------ */
+/* Dword multiplication operation.
+
+   The THUMB ISA lacks an instruction to compute the higher half of the
+   64-bit result from a 32-bit by 32-bit multiplication.  This makes 64-bit
+   multiplication difficult to implement efficiently.  The ARM ISAs from V3M
+   on have UMULL and MLA which can be used to implement 64-bit multiplication
+   efficiently.  On a target that supports both ARM V3M+ and THUMB ISAs (but
+   not THUMB2), we want to use the ARM version of _muldi3 in the THUMB libgcc.
+
+   We do not need to use the ARM version for THUMB2 targets as the THUMB2
+   targets also support MLA and UMULL. */
+
+/* We cannot use the faster version in the following situations:
+
+   -ARM architectures older than V3M lack the UMULL instruction.
+   -Target is ARMV6M, which does not run ARM code.
+   -THUMB-1 code without INTERWORKING, for which we do not switch to ARM.  */
+
+#undef USE_FAST_MULDI3
+#if (__ARM_ARCH__ > 3 || defined(__ARM_ARCH_3M__)) && !defined(__ARM_ARCH_6M__)
+#if !defined(__thumb__) || defined(__thumb2__) || defined(__THUMB_INTERWORK__)
+#define USE_FAST_MULDI3
+#endif
+#endif
+
+/* Force using ARM code if:
+   1. ARM mode has UMULL (i.e. USE_FAST_MULDI3 is defined) and
+   2. This is THUMB-1 mode and
+   3. INTERWORKING is enabled.  */
+
+#if defined(USE_FAST_MULDI3) \
+    && (defined(__thumb__) && !defined(__thumb2__)) \
+    && defined(__THUMB_INTERWORK__)
+	ARM_FUNC_START muldi3
+	ARM_FUNC_ALIAS aeabi_lmul muldi3
+#else
+	FUNC_START muldi3
+	FUNC_ALIAS aeabi_lmul muldi3
+#endif
+
+#if defined(USE_FAST_MULDI3)
+	/* Fast version for ARM with umull and THUMB2.  Before ARMv6 the Rd
+	   and Rm operands of MUL must differ, hence the operand order.  */
+	mul	xxh, yyl, xxh		/* xxh := BL*AH */
+	mla	yyh, xxl, yyh, xxh	/* yyh := AL*BH + AH*BL */
+	umull	xxl, xxh, yyl, xxl	/* xxh:xxl := BL*AL (full 64 bits) */
+	add	xxh, xxh, yyh		/* xxh += AL*BH + AH*BL */
+	RET
+#else
+	/* Slow version for both THUMB and older ARMs lacking umull. */
+	mul	xxh, yyl		/* xxh := AH*BL */
+	do_push	{r4, r5, r6, r7}
+	mul	yyh, xxl		/* yyh := AL*BH */
+	ldr	r4, .L_mask		/* r4 := 0xffff */
+	lsr	r5, xxl, #16		/* r5 := (AL>>16) */
+	lsr	r6, yyl, #16		/* r6 := (BL>>16) */
+	lsr	r7, xxl, #16		/* r7 := (AL>>16) */
+	mul	r5, r6			/* r5 = (AL>>16) * (BL>>16) */
+	and	xxl, r4			/* xxl = AL & 0xffff */
+	and	yyl, r4			/* yyl = BL & 0xffff */
+	add	xxh, yyh		/* xxh = AH*BL+AL*BH */
+	mul	r6, xxl			/* r6 = (AL&0xffff) * (BL>>16) */
+	mul	r7, yyl			/* r7 = (AL>>16) * (BL&0xffff) */
+	add	xxh, r5			/* xxh += (AL>>16) * (BL>>16) */
+	mul	xxl, yyl		/* xxl = (AL&0xffff) * (BL&0xffff) */
+	mov	r4, #0
+	adds	r6, r7			/* partial sum to result[47:16]. */
+	adc	r4, r4			/* carry to result[48]. */
+	lsr	yyh, r6, #16		/* yyh = partial sum >> 16 */
+	lsl	r4, r4, #16		/* move carry to bit 16 of high word */
+	lsl	yyl, r6, #16		/* yyl = partial sum << 16 */
+	add	xxh, r4			/* xxh += carry << 16 */
+	adds	xxl, yyl		/* low word; carry out to high word */
+	adc	xxh, yyh		/* high word plus carry from low word */
+	do_pop	{r4, r5, r6, r7}
+	RET
+	.align	2
+.L_mask:
+	.word	65535
+#endif
+
+	FUNC_END muldi3
+#endif
+
 #ifdef L_clear_cache
 #if defined __ARM_EABI__ && defined __linux__
 @ EABI GNU/Linux call to cacheflush syscall.
Index: gcc/config/arm/t-strongarm-elf
===================================================================
--- gcc/config/arm/t-strongarm-elf	(revision 165462)
+++ gcc/config/arm/t-strongarm-elf	(working copy)
@@ -16,7 +16,8 @@ 
 # along with GCC; see the file COPYING3.  If not see
 # <http://www.gnu.org/licenses/>.
 
-LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _bb_init_func _clzsi2 _clzdi2
+LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _bb_init_func \
+	_clzsi2 _clzdi2 _muldi3
 
 # We want fine grained libraries, so use the new code to build the
 # floating point emulation libraries.
Index: gcc/config/arm/t-vxworks
===================================================================
--- gcc/config/arm/t-vxworks	(revision 165462)
+++ gcc/config/arm/t-vxworks	(working copy)
@@ -16,7 +16,8 @@ 
 # along with GCC; see the file COPYING3.  If not see
 # <http://www.gnu.org/licenses/>.
 
-LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _bb_init_func _call_via_rX _interwork_call_via_rX _clzsi2 _clzdi2
+LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _bb_init_func \
+	_call_via_rX _interwork_call_via_rX _clzsi2 _clzdi2 _muldi3
 
 # We want fine grained libraries, so use the new code to build the
 # floating point emulation libraries.
Index: gcc/config/arm/t-pe
===================================================================
--- gcc/config/arm/t-pe	(revision 165462)
+++ gcc/config/arm/t-pe	(working copy)
@@ -17,7 +17,7 @@ 
 # along with GCC; see the file COPYING3.  If not see
 # <http://www.gnu.org/licenses/>.
 
-LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _call_via_rX _interwork_call_via_rX _clzsi2 _clzdi2
+LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _call_via_rX _interwork_call_via_rX _clzsi2 _clzdi2 _muldi3
 
 # We want fine grained libraries, so use the new code to build the
 # floating point emulation libraries.
Index: gcc/config/arm/t-arm-elf
===================================================================
--- gcc/config/arm/t-arm-elf	(revision 165462)
+++ gcc/config/arm/t-arm-elf	(working copy)
@@ -29,7 +29,7 @@  LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3
 	_arm_truncdfsf2 _arm_negsf2 _arm_addsubsf3 _arm_muldivsf3 \
 	_arm_cmpsf2 _arm_unordsf2 _arm_fixsfsi _arm_fixunssfsi \
 	_arm_floatdidf _arm_floatdisf _arm_floatundidf _arm_floatundisf \
-	_clzsi2 _clzdi2 
+	_clzsi2 _clzdi2 _muldi3
 
 MULTILIB_OPTIONS     = marm/mthumb
 MULTILIB_DIRNAMES    = arm thumb
Index: gcc/config/arm/t-linux
===================================================================
--- gcc/config/arm/t-linux	(revision 165462)
+++ gcc/config/arm/t-linux	(working copy)
@@ -23,7 +23,7 @@  TARGET_LIBGCC2_CFLAGS = -fomit-frame-pointer -fPIC
 
 LIB1ASMSRC = arm/lib1funcs.asm
 LIB1ASMFUNCS = _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_lnx _clzsi2 _clzdi2 \
-	_arm_addsubdf3 _arm_addsubsf3
+	_arm_addsubdf3 _arm_addsubsf3 _muldi3
 
 # MULTILIB_OPTIONS = mhard-float/msoft-float
 # MULTILIB_DIRNAMES = hard-float soft-float
Index: gcc/config/arm/t-symbian
===================================================================
--- gcc/config/arm/t-symbian	(revision 165462)
+++ gcc/config/arm/t-symbian	(working copy)
@@ -16,7 +16,8 @@ 
 # along with GCC; see the file COPYING3.  If not see
 # <http://www.gnu.org/licenses/>.
 
-LIB1ASMFUNCS += _bb_init_func _call_via_rX _interwork_call_via_rX _clzsi2 _clzdi2
+LIB1ASMFUNCS += _bb_init_func _call_via_rX _interwork_call_via_rX _clzsi2 \
+	_clzdi2 _muldi3
 
 # These functions have __aeabi equivalents and will never be called by GCC.  
 # By putting them in LIB1ASMFUNCS, we avoid the standard libgcc2.c code being
Index: gcc/config/arm/t-wince-pe
===================================================================
--- gcc/config/arm/t-wince-pe	(revision 165462)
+++ gcc/config/arm/t-wince-pe	(working copy)
@@ -16,7 +16,8 @@ 
 # along with GCC; see the file COPYING3.  If not see
 # <http://www.gnu.org/licenses/>.
 
-LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _call_via_rX _interwork_call_via_rX _clzsi2 _clzdi2
+LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _call_via_rX \
+	_interwork_call_via_rX _clzsi2 _clzdi2 _muldi3
 
 # We want fine grained libraries, so use the new code to build the
 # floating point emulation libraries.