diff mbox

[arm] Improve longlong.h umul_ppmm, count_trailing_zeros

Message ID 4F2778EA.7060009@redhat.com
State New
Headers show

Commit Message

Richard Henderson Jan. 31, 2012, 5:15 a.m. UTC
I noticed this accidentally, while looking for something else.
There are significant improvements in the DImode multiplication
and division routines for armv4+.

Despite how trivial this is, I assume this must wait for stage1.
Ok?


r~


	* longlong.h [arm] (umul_ppmm): Use umull.
	[arm] (count_trailing_zeros): Use __builtin_ctz.

Comments

Richard Earnshaw Feb. 1, 2012, 1:23 p.m. UTC | #1
On 31/01/12 05:15, Richard Henderson wrote:
> I noticed this accidentally, while looking for something else.
> There are significant improvements in the DImode multiplication
> and division routines for armv4+.
> 
> Despite how trivial this is, I assume this must wait for stage1.
> Ok?
> 
> 
> r~
> 
> 
> 	* longlong.h [arm] (umul_ppmm): Use umull.
> 	[arm] (count_trailing_zeros): Use __builtin_ctz.

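armv3m also has the widening multiply operation (it's what the M stands
for), so __ARM_ARCH_3M__ should not be in the list of architectures that
fall back to the hand-written multiply sequence.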

Otherwise ok for stage1

R.

> 
> diff --git a/libgcc/longlong.h b/libgcc/longlong.h
> index 30cc2e3..7204679 100644
> --- a/libgcc/longlong.h
> +++ b/libgcc/longlong.h
> @@ -220,9 +220,12 @@ UDItype __umulsidi3 (USItype, USItype);
>  	     "rI" ((USItype) (bh)),					\
>  	     "r" ((USItype) (al)),					\
>  	     "rI" ((USItype) (bl)) __CLOBBER_CC)
> -#define umul_ppmm(xh, xl, a, b) \
> -{register USItype __t0, __t1, __t2;					\
> -  __asm__ ("%@ Inlined umul_ppmm\n"					\
> +# if defined(__ARM_ARCH_2__) || defined(__ARM_ARCH_2A__) \
> +     || defined(__ARM_ARCH_3__) || defined(__ARM_ARCH_3M__)
> +#  define umul_ppmm(xh, xl, a, b)					\
> +  do {									\
> +    register USItype __t0, __t1, __t2;					\
> +    __asm__ ("%@ Inlined umul_ppmm\n"					\
>  	   "	mov	%2, %5, lsr #16\n"				\
>  	   "	mov	%0, %6, lsr #16\n"				\
>  	   "	bic	%3, %5, %2, lsl #16\n"				\
> @@ -239,14 +242,26 @@ UDItype __umulsidi3 (USItype, USItype);
>  	     "=r" ((USItype) (xl)),					\
>  	     "=&r" (__t0), "=&r" (__t1), "=r" (__t2)			\
>  	   : "r" ((USItype) (a)),					\
> -	     "r" ((USItype) (b)) __CLOBBER_CC );}
> -#define UMUL_TIME 20
> -#define UDIV_TIME 100
> +	     "r" ((USItype) (b)) __CLOBBER_CC );			\
> +  } while (0)
> +#  define UMUL_TIME 20
> +# else
> +#  define umul_ppmm(xh, xl, a, b)					\
> +  do {									\
> +    /* Generate umull, under compiler control.  */			\
> +    register UDItype __t0 = (UDItype)(USItype)(a) * (USItype)(b);	\
> +    (xl) = (USItype)__t0;						\
> +    (xh) = (USItype)(__t0 >> 32);					\
> +  } while (0)
> +#  define UMUL_TIME 3
> +# endif
> +# define UDIV_TIME 100
>  #endif /* __arm__ */
>  
>  #if defined(__arm__)
>  /* Let gcc decide how best to implement count_leading_zeros.  */
>  #define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clz (X))
> +#define count_trailing_zeros(COUNT,X)   ((COUNT) = __builtin_ctz (X))
>  #define COUNT_LEADING_ZEROS_0 32
>  #endif
>  
>
Richard Earnshaw March 20, 2012, 2:55 p.m. UTC | #2
On 01/02/12 13:23, Richard Earnshaw wrote:
> On 31/01/12 05:15, Richard Henderson wrote:
>> Despite how trivial this is, I assume this must wait for stage1.
>> Ok?
>>
>>
>> r~
>>
>>
>> 	* longlong.h [arm] (umul_ppmm): Use umull.
>> 	[arm] (count_trailing_zeros): Use __builtin_ctz.
> 
> armv3m also has the widening multiply operation (it's what the M stands
> for).
> 
> Otherwise ok for stage1
> 

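And it's a good job we waited for stage1.  I've just noticed that this
patch has broken thumb1 builds of libgcc: as the disassembly below shows,
__ctzsi2 is now compiled into a call to itself.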

00000000 <__ctzsi2>:
   0:   b508            push    {r3, lr}
   2:   f7ff fffe       bl      0 <__ctzsi2>
                        2: R_ARM_THM_CALL       __ctzsi2
   6:   bc08            pop     {r3}
   8:   bc02            pop     {r1}
   a:   4708            bx      r1

R.
diff mbox

Patch

diff --git a/libgcc/longlong.h b/libgcc/longlong.h
index 30cc2e3..7204679 100644
--- a/libgcc/longlong.h
+++ b/libgcc/longlong.h
@@ -220,9 +220,12 @@  UDItype __umulsidi3 (USItype, USItype);
 	     "rI" ((USItype) (bh)),					\
 	     "r" ((USItype) (al)),					\
 	     "rI" ((USItype) (bl)) __CLOBBER_CC)
-#define umul_ppmm(xh, xl, a, b) \
-{register USItype __t0, __t1, __t2;					\
-  __asm__ ("%@ Inlined umul_ppmm\n"					\
+# if defined(__ARM_ARCH_2__) || defined(__ARM_ARCH_2A__) \
+     || defined(__ARM_ARCH_3__) || defined(__ARM_ARCH_3M__)
+#  define umul_ppmm(xh, xl, a, b)					\
+  do {									\
+    register USItype __t0, __t1, __t2;					\
+    __asm__ ("%@ Inlined umul_ppmm\n"					\
 	   "	mov	%2, %5, lsr #16\n"				\
 	   "	mov	%0, %6, lsr #16\n"				\
 	   "	bic	%3, %5, %2, lsl #16\n"				\
@@ -239,14 +242,26 @@  UDItype __umulsidi3 (USItype, USItype);
 	     "=r" ((USItype) (xl)),					\
 	     "=&r" (__t0), "=&r" (__t1), "=r" (__t2)			\
 	   : "r" ((USItype) (a)),					\
-	     "r" ((USItype) (b)) __CLOBBER_CC );}
-#define UMUL_TIME 20
-#define UDIV_TIME 100
+	     "r" ((USItype) (b)) __CLOBBER_CC );			\
+  } while (0)
+#  define UMUL_TIME 20
+# else
+#  define umul_ppmm(xh, xl, a, b)					\
+  do {									\
+    /* Generate umull, under compiler control.  */			\
+    register UDItype __t0 = (UDItype)(USItype)(a) * (USItype)(b);	\
+    (xl) = (USItype)__t0;						\
+    (xh) = (USItype)(__t0 >> 32);					\
+  } while (0)
+#  define UMUL_TIME 3
+# endif
+# define UDIV_TIME 100
 #endif /* __arm__ */
 
 #if defined(__arm__)
 /* Let gcc decide how best to implement count_leading_zeros.  */
 #define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clz (X))
+#define count_trailing_zeros(COUNT,X)   ((COUNT) = __builtin_ctz (X))
 #define COUNT_LEADING_ZEROS_0 32
 #endif