Message ID | 4F2778EA.7060009@redhat.com |
---|---|
State | New |
Headers | show |
On 31/01/12 05:15, Richard Henderson wrote: > I noticed this accidentally, while looking for something else. > There are significant improvements in the DImode multiplication > and division routines for armv4+. > > Despite how trivial this is, I assume this must wait for stage1. > Ok? > > > r~ > > > * longlong.h [arm] (umul_ppmm): Use umull. > [arm] (count_trailing_zeros): Use __builtin_ctz. armv3m also has the widening multiply operation (it's what the M stands for). Otherwise ok for stage1 R. > > diff --git a/libgcc/longlong.h b/libgcc/longlong.h > index 30cc2e3..7204679 100644 > --- a/libgcc/longlong.h > +++ b/libgcc/longlong.h > @@ -220,9 +220,12 @@ UDItype __umulsidi3 (USItype, USItype); > "rI" ((USItype) (bh)), \ > "r" ((USItype) (al)), \ > "rI" ((USItype) (bl)) __CLOBBER_CC) > -#define umul_ppmm(xh, xl, a, b) \ > -{register USItype __t0, __t1, __t2; \ > - __asm__ ("%@ Inlined umul_ppmm\n" \ > +# if defined(__ARM_ARCH_2__) || defined(__ARM_ARCH_2A__) \ > + || defined(__ARM_ARCH_3__) || defined(__ARM_ARCH_3M__) > +# define umul_ppmm(xh, xl, a, b) \ > + do { \ > + register USItype __t0, __t1, __t2; \ > + __asm__ ("%@ Inlined umul_ppmm\n" \ > " mov %2, %5, lsr #16\n" \ > " mov %0, %6, lsr #16\n" \ > " bic %3, %5, %2, lsl #16\n" \ > @@ -239,14 +242,26 @@ UDItype __umulsidi3 (USItype, USItype); > "=r" ((USItype) (xl)), \ > "=&r" (__t0), "=&r" (__t1), "=r" (__t2) \ > : "r" ((USItype) (a)), \ > - "r" ((USItype) (b)) __CLOBBER_CC );} > -#define UMUL_TIME 20 > -#define UDIV_TIME 100 > + "r" ((USItype) (b)) __CLOBBER_CC ); \ > + } while (0) > +# define UMUL_TIME 20 > +# else > +# define umul_ppmm(xh, xl, a, b) \ > + do { \ > + /* Generate umull, under compiler control. 
*/ \ > + register UDItype __t0 = (UDItype)(USItype)(a) * (USItype)(b); \ > + (xl) = (USItype)__t0; \ > + (xh) = (USItype)(__t0 >> 32); \ > + } while (0) > +# define UMUL_TIME 3 > +# endif > +# define UDIV_TIME 100 > #endif /* __arm__ */ > > #if defined(__arm__) > /* Let gcc decide how best to implement count_leading_zeros. */ > #define count_leading_zeros(COUNT,X) ((COUNT) = __builtin_clz (X)) > +#define count_trailing_zeros(COUNT,X) ((COUNT) = __builtin_ctz (X)) > #define COUNT_LEADING_ZEROS_0 32 > #endif > >
On 01/02/12 13:23, Richard Earnshaw wrote:
> On 31/01/12 05:15, Richard Henderson wrote:
>> Despite how trivial this is, I assume this must wait for stage1.
>> Ok?
>>
>>
>> r~
>>
>>
>> * longlong.h [arm] (umul_ppmm): Use umull.
>> [arm] (count_trailing_zeros): Use __builtin_ctz.
>
> armv3m also has the widening multiply operation (it's what the M stands
> for).
>
> Otherwise ok for stage1
>

And it's a good job we did. I've just noticed that it's broken thumb1 builds of libgcc.

00000000 <__ctzsi2>:
   0: b508       push {r3, lr}
   2: f7ff fffe  bl 0 <__ctzsi2>
      2: R_ARM_THM_CALL __ctzsi2
   6: bc08       pop {r3}
   8: bc02       pop {r1}
   a: 4708       bx r1

R.
diff --git a/libgcc/longlong.h b/libgcc/longlong.h index 30cc2e3..7204679 100644 --- a/libgcc/longlong.h +++ b/libgcc/longlong.h @@ -220,9 +220,12 @@ UDItype __umulsidi3 (USItype, USItype); "rI" ((USItype) (bh)), \ "r" ((USItype) (al)), \ "rI" ((USItype) (bl)) __CLOBBER_CC) -#define umul_ppmm(xh, xl, a, b) \ -{register USItype __t0, __t1, __t2; \ - __asm__ ("%@ Inlined umul_ppmm\n" \ +# if defined(__ARM_ARCH_2__) || defined(__ARM_ARCH_2A__) \ + || defined(__ARM_ARCH_3__) || defined(__ARM_ARCH_3M__) +# define umul_ppmm(xh, xl, a, b) \ + do { \ + register USItype __t0, __t1, __t2; \ + __asm__ ("%@ Inlined umul_ppmm\n" \ " mov %2, %5, lsr #16\n" \ " mov %0, %6, lsr #16\n" \ " bic %3, %5, %2, lsl #16\n" \ @@ -239,14 +242,26 @@ UDItype __umulsidi3 (USItype, USItype); "=r" ((USItype) (xl)), \ "=&r" (__t0), "=&r" (__t1), "=r" (__t2) \ : "r" ((USItype) (a)), \ - "r" ((USItype) (b)) __CLOBBER_CC );} -#define UMUL_TIME 20 -#define UDIV_TIME 100 + "r" ((USItype) (b)) __CLOBBER_CC ); \ + } while (0) +# define UMUL_TIME 20 +# else +# define umul_ppmm(xh, xl, a, b) \ + do { \ + /* Generate umull, under compiler control. */ \ + register UDItype __t0 = (UDItype)(USItype)(a) * (USItype)(b); \ + (xl) = (USItype)__t0; \ + (xh) = (USItype)(__t0 >> 32); \ + } while (0) +# define UMUL_TIME 3 +# endif +# define UDIV_TIME 100 #endif /* __arm__ */ #if defined(__arm__) /* Let gcc decide how best to implement count_leading_zeros. */ #define count_leading_zeros(COUNT,X) ((COUNT) = __builtin_clz (X)) +#define count_trailing_zeros(COUNT,X) ((COUNT) = __builtin_ctz (X)) #define COUNT_LEADING_ZEROS_0 32 #endif