diff mbox

[ARM] Optimized 64-bit multiplication for THUMB-1

Message ID AANLkTi=A-ZuBARrxXJaJC4hz=J-oL4_5zYjbJ1w2d_nR@mail.gmail.com
State New
Headers show

Commit Message

Doug Kwan (關振德) Oct. 22, 2010, 5:19 p.m. UTC
Hi,

     This patch was submitted for a previous gcc release quite
some time ago, but I missed the deadline last time.  The patch
implements a hand-tuned 64-bit multiplication run-time routine,
__aeabi_lmul.  On ARM and THUMB-2, this is done using a very short,
5-instruction routine.  On THUMB-1, this is done using either the ARM
version via interworking or a slower, hand-optimized version, depending
on whether interworking is supported.  This was tested on
arm-unknown-linux-gnueabi in qemu with no new regressions.

-Doug

2010-10-22  Doug Kwan  <dougkwan@google.com>

        * config/arm/lib1funcs.asm (muldi3): Implement muldi3 in hand-written
        assembly.
        * config/arm/t-strongarm-elf (LIB1ASMFUNCS): Add _muldi3.
        * config/arm/t-vxworks: Ditto.
        * config/arm/t-pe: Ditto.
        * config/arm/t-arm-elf: Ditto.
        * config/arm/t-linux: Ditto.
        * config/arm/t-symbian: Ditto.
        * config/arm/t-wince-pe: Ditto.

Comments

Paul Brook Oct. 22, 2010, 6:20 p.m. UTC | #1
> +/* Force using ARM code if it is possible except for THUMB2 target. */
> +#if defined(USE_FAST_MULDI3) && !defined(__thumb2__)
> +	ARM_FUNC_START muldi3

The !__thumb2__ test is wrong. I'm surprised this even compiles.

>+	mul	xxh, yyl
>...
>+	add     xxh, yyh

Please use the proper 3-argument form in 32-bit code.

>+	push	{r4, r5, r6, r7}

Older assemblers do not support push/pop in ARM mode.
Use do_push/do_pop.

Paul
diff mbox

Patch

Index: gcc/gcc/config/arm/lib1funcs.asm
===================================================================
--- gcc/gcc/config/arm/lib1funcs.asm	(revision 165462)
+++ gcc/gcc/config/arm/lib1funcs.asm	(working copy)
@@ -1274,6 +1274,102 @@  LSYM(Lover12):
 #endif
 	
 #endif /* L_dvmd_lnx */
+
+#ifdef L_muldi3
+
+/* ------------------------------------------------------------------------ */
+/* Dword multiplication operation.
+
+   The THUMB ISA lacks an instruction to compute the higher half of the
+   64-bit result from a 32-bit by 32-bit multiplication.  This makes 64-bit
+   multiplication difficult to implement efficiently.  The ARM ISAs from V3M
+   onwards have UMULL and MLA, which can be used to implement 64-bit
+   multiplication efficiently.  On a target that supports both the ARM V3M+
+   and THUMB ISAs (but not THUMB2), we want to use the ARM version of _muldi3
+   in the THUMB libgcc.
+
+   We do not need to use the ARM version for THUMB2 targets as the THUMB2
+   targets also support MLA and UMULL.
+
+   In the comments below, AH:AL and BH:BL are the two 64-bit operands, held
+   on entry in xxh:xxl and yyh:yyl respectively.  The low 64 bits of the
+   product are left in xxh:xxl.  */
+
+/* We cannot use the faster version in the following situations:
+
+   -ARM architectures older than V3M lack the UMULL instruction.
+   -Target is ARMV6M, which does not run ARM code.  */
+
+#undef USE_FAST_MULDI3
+#if (__ARM_ARCH__ > 3 || defined(__ARM_ARCH_3M__)) && !defined(__ARM_ARCH_6M__)
+#define USE_FAST_MULDI3
+#endif
+
+#if defined(USE_FAST_MULDI3)
+	/* Fast version for ARM with umull and THUMB2.  ARM_FUNC_START is used
+	   without a separate THUMB2 test.  NOTE(review): this assumes the
+	   ARM_FUNC_START macro itself emits a THUMB2 entry point on THUMB2
+	   targets -- confirm against its definition elsewhere in this file.  */
+	ARM_FUNC_START muldi3
+	ARM_FUNC_ALIAS aeabi_lmul muldi3
+
+	/* Explicit 3-operand forms, as this is 32-bit code.  The first multiply
+	   is written with Rd != Rm, which ARM cores before V6 require.  */
+	mul	xxh, yyl, xxh		/* xxh := AH*BL */
+	mla	yyh, xxl, yyh, xxh	/* yyh := AL*BH + AH*BL */
+	umull	xxl, xxh, yyl, xxl	/* xxh:xxl := AL*BL */
+	add	xxh, xxh, yyh		/* xxh += AL*BH + AH*BL */
+	RET
+#else
+	/* Slow version for both THUMB and older ARMs lacking umull.  AL and BL
+	   are split into 16-bit halves and the four partial products are
+	   summed by hand.  */
+	FUNC_START muldi3
+	FUNC_ALIAS aeabi_lmul muldi3
+
+	mul	xxh, yyl		/* xxh := AH*BL */
+#if defined(__thumb__)
+	push	{r4, r5, r6, r7}
+#else
+	/* Older assemblers do not support push/pop in ARM mode.  */
+	do_push	{r4, r5, r6, r7}
+#endif
+	mul	yyh, xxl		/* yyh := AL*BH */
+	ldr	r4, .L_mask		/* r4 := 0xffff */
+	lsr	r5, xxl, #16		/* r5 := (AL>>16) */
+	lsr	r6, yyl, #16		/* r6 := (BL>>16) */
+	lsr	r7, xxl, #16		/* r7 := (AL>>16) */
+	mul	r5, r6			/* r5 = (AL>>16) * (BL>>16) */
+	and	xxl, r4			/* xxl = AL & 0xffff */
+	and	yyl, r4			/* yyl = BL & 0xffff */
+	add	xxh, yyh		/* xxh = AH*BL+AL*BH */
+	mul	r6, xxl			/* r6 = (AL&0xffff) * (BL>>16) */
+	mul	r7, yyl			/* r7 = (AL>>16) * (BL&0xffff) */
+	add	xxh, r5			/* xxh += (AL>>16) * (BL>>16) */
+	mul	xxl, yyl		/* xxl = (AL&0xffff) * (BL&0xffff) */
+	mov	r4, #0
+	adds	r6, r7			/* partial sum to result[47:16]. */
+	adc	r4, r4			/* carry to result[48]. */
+	lsr	yyh, r6, #16		/* yyh = partial sum bits for result[47:32] */
+	lsl	r4, r4, #16		/* move the carry up to result[48] */
+	lsl	yyl, r6, #16		/* yyl = partial sum bits for result[31:16] */
+	add	xxh, r4
+	adds	xxl, yyl		/* low word; carry goes into the high word */
+	adc	xxh, yyh
+#if defined(__thumb__)
+	pop	{r4, r5, r6, r7}
+#else
+	do_pop	{r4, r5, r6, r7}
+#endif
+	RET
+	.align	2
+.L_mask:
+	.word	65535
+#endif
+
+	FUNC_END muldi3
+#endif /* L_muldi3 */
+
 #ifdef L_clear_cache
 #if defined __ARM_EABI__ && defined __linux__
 @ EABI GNU/Linux call to cacheflush syscall.
Index: gcc/gcc/config/arm/t-strongarm-elf
===================================================================
--- gcc/gcc/config/arm/t-strongarm-elf	(revision 165462)
+++ gcc/gcc/config/arm/t-strongarm-elf	(working copy)
@@ -16,7 +16,8 @@ 
 # along with GCC; see the file COPYING3.  If not see
 # <http://www.gnu.org/licenses/>.
 
-LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _bb_init_func _clzsi2 _clzdi2
+LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _bb_init_func \
+	_clzsi2 _clzdi2 _muldi3
 
 # We want fine grained libraries, so use the new code to build the
 # floating point emulation libraries.
Index: gcc/gcc/config/arm/t-vxworks
===================================================================
--- gcc/gcc/config/arm/t-vxworks	(revision 165462)
+++ gcc/gcc/config/arm/t-vxworks	(working copy)
@@ -16,7 +16,8 @@ 
 # along with GCC; see the file COPYING3.  If not see
 # <http://www.gnu.org/licenses/>.
 
-LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _bb_init_func _call_via_rX _interwork_call_via_rX _clzsi2 _clzdi2
+LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _bb_init_func \
+	_call_via_rX _interwork_call_via_rX _clzsi2 _clzdi2 _muldi3
 
 # We want fine grained libraries, so use the new code to build the
 # floating point emulation libraries.
Index: gcc/gcc/config/arm/t-pe
===================================================================
--- gcc/gcc/config/arm/t-pe	(revision 165462)
+++ gcc/gcc/config/arm/t-pe	(working copy)
@@ -17,7 +17,7 @@ 
 # along with GCC; see the file COPYING3.  If not see
 # <http://www.gnu.org/licenses/>.
 
-LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _call_via_rX _interwork_call_via_rX _clzsi2 _clzdi2
+LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _call_via_rX _interwork_call_via_rX _clzsi2 _clzdi2 _muldi3
 
 # We want fine grained libraries, so use the new code to build the
 # floating point emulation libraries.
Index: gcc/gcc/config/arm/t-arm-elf
===================================================================
--- gcc/gcc/config/arm/t-arm-elf	(revision 165462)
+++ gcc/gcc/config/arm/t-arm-elf	(working copy)
@@ -29,7 +29,7 @@  LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3
 	_arm_truncdfsf2 _arm_negsf2 _arm_addsubsf3 _arm_muldivsf3 \
 	_arm_cmpsf2 _arm_unordsf2 _arm_fixsfsi _arm_fixunssfsi \
 	_arm_floatdidf _arm_floatdisf _arm_floatundidf _arm_floatundisf \
-	_clzsi2 _clzdi2 
+	_clzsi2 _clzdi2 _muldi3
 
 MULTILIB_OPTIONS     = marm/mthumb
 MULTILIB_DIRNAMES    = arm thumb
Index: gcc/gcc/config/arm/t-linux
===================================================================
--- gcc/gcc/config/arm/t-linux	(revision 165462)
+++ gcc/gcc/config/arm/t-linux	(working copy)
@@ -23,7 +23,7 @@  TARGET_LIBGCC2_CFLAGS = -fomit-frame-pointer -fPIC
 
 LIB1ASMSRC = arm/lib1funcs.asm
 LIB1ASMFUNCS = _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_lnx _clzsi2 _clzdi2 \
-	_arm_addsubdf3 _arm_addsubsf3
+	_arm_addsubdf3 _arm_addsubsf3 _muldi3
 
 # MULTILIB_OPTIONS = mhard-float/msoft-float
 # MULTILIB_DIRNAMES = hard-float soft-float
Index: gcc/gcc/config/arm/t-symbian
===================================================================
--- gcc/gcc/config/arm/t-symbian	(revision 165462)
+++ gcc/gcc/config/arm/t-symbian	(working copy)
@@ -16,7 +16,8 @@ 
 # along with GCC; see the file COPYING3.  If not see
 # <http://www.gnu.org/licenses/>.
 
-LIB1ASMFUNCS += _bb_init_func _call_via_rX _interwork_call_via_rX _clzsi2 _clzdi2
+LIB1ASMFUNCS += _bb_init_func _call_via_rX _interwork_call_via_rX _clzsi2 \
+	_clzdi2 _muldi3
 
 # These functions have __aeabi equivalents and will never be called by GCC.  
 # By putting them in LIB1ASMFUNCS, we avoid the standard libgcc2.c code being
Index: gcc/gcc/config/arm/t-wince-pe
===================================================================
--- gcc/gcc/config/arm/t-wince-pe	(revision 165462)
+++ gcc/gcc/config/arm/t-wince-pe	(working copy)
@@ -16,7 +16,8 @@ 
 # along with GCC; see the file COPYING3.  If not see
 # <http://www.gnu.org/licenses/>.
 
-LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _call_via_rX _interwork_call_via_rX _clzsi2 _clzdi2
+LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _call_via_rX \
+	_interwork_call_via_rX _clzsi2 _clzdi2 _muldi3
 
 # We want fine grained libraries, so use the new code to build the
 # floating point emulation libraries.