diff mbox

Improve __popcount?i2 (PR middle-end/36041)

Message ID 20130627144825.GD2336@tucnak.redhat.com
State New
Headers show

Commit Message

Jakub Jelinek June 27, 2013, 2:48 p.m. UTC
Hi!

This patch speeds up the libgcc __popcount{si,di,ti}2 implementations
more than 2 times by avoiding table lookup.
I've kept using the table lookup for small word size targets (unfortunately
avr/rl78 cheap and thus e.g. MIN_UNITS_PER_WORD isn't reliable for those
purposes).  Maybe it would be nice to detect if UWtype multiplication isn't
too expensive (as in, implemented using libcall) and use shits + additions
instead for that case.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2013-06-27  Jakub Jelinek  <jakub@redhat.com>

	PR middle-end/36041
	* libgcc2.c (POPCOUNTCST2, POPCOUNTCST4, POPCOUNTCST8, POPCOUNTCST):
	Define.
	(__popcountSI2): For __SIZEOF_INT__ > 2 targets use arithmetics
	instead of table lookups.
	(__popcountDI2): Likewise.


	Jakub

Comments

Richard Henderson June 27, 2013, 7:32 p.m. UTC | #1
On 06/27/2013 07:48 AM, Jakub Jelinek wrote:
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> 
> 2013-06-27  Jakub Jelinek  <jakub@redhat.com>
> 
> 	PR middle-end/36041
> 	* libgcc2.c (POPCOUNTCST2, POPCOUNTCST4, POPCOUNTCST8, POPCOUNTCST):
> 	Define.
> 	(__popcountSI2): For __SIZEOF_INT__ > 2 targets use arithmetics
> 	instead of table lookups.
> 	(__popcountDI2): Likewise.

Ok.


r~
diff mbox

Patch

--- libgcc/libgcc2.c.jj	2013-02-05 09:06:55.000000000 +0100
+++ libgcc/libgcc2.c	2013-06-27 08:59:12.319971540 +0200
@@ -819,17 +819,42 @@  const UQItype __popcount_tab[256] =
 };
 #endif
 
+#if defined(L_popcountsi2) || defined(L_popcountdi2)
+#define POPCOUNTCST2(x) (((UWtype) x << BITS_PER_UNIT) | x)
+#define POPCOUNTCST4(x) (((UWtype) x << (2 * BITS_PER_UNIT)) | x)
+#define POPCOUNTCST8(x) (((UWtype) x << (4 * BITS_PER_UNIT)) | x)
+#if W_TYPE_SIZE == BITS_PER_UNIT
+#define POPCOUNTCST(x) x
+#elif W_TYPE_SIZE == 2 * BITS_PER_UNIT
+#define POPCOUNTCST(x) POPCOUNTCST2 (x)
+#elif W_TYPE_SIZE == 4 * BITS_PER_UNIT
+#define POPCOUNTCST(x) POPCOUNTCST4 (POPCOUNTCST2 (x))
+#elif W_TYPE_SIZE == 8 * BITS_PER_UNIT
+#define POPCOUNTCST(x) POPCOUNTCST8 (POPCOUNTCST4 (POPCOUNTCST2 (x)))
+#endif
+#endif
+
 #ifdef L_popcountsi2
 #undef int
 int
 __popcountSI2 (UWtype x)
 {
+  /* Force table lookup on targets like AVR and RL78 which only
+     pretend they have LIBGCC2_UNITS_PER_WORD 4, but actually
+     have 1, and other small word targets.  */
+#if __SIZEOF_INT__ > 2 && defined (POPCOUNTCST) && BITS_PER_UNIT == 8
+  x = x - ((x >> 1) & POPCOUNTCST (0x55));
+  x = (x & POPCOUNTCST (0x33)) + ((x >> 2) & POPCOUNTCST (0x33));
+  x = (x + (x >> 4)) & POPCOUNTCST (0x0F);
+  return (x * POPCOUNTCST (0x01)) >> (W_TYPE_SIZE - BITS_PER_UNIT);
+#else
   int i, ret = 0;
 
   for (i = 0; i < W_TYPE_SIZE; i += 8)
     ret += __popcount_tab[(x >> i) & 0xff];
 
   return ret;
+#endif
 }
 #endif
 
@@ -838,12 +863,28 @@  __popcountSI2 (UWtype x)
 int
 __popcountDI2 (UDWtype x)
 {
+  /* Force table lookup on targets like AVR and RL78 which only
+     pretend they have LIBGCC2_UNITS_PER_WORD 4, but actually
+     have 1, and other small word targets.  */
+#if __SIZEOF_INT__ > 2 && defined (POPCOUNTCST) && BITS_PER_UNIT == 8
+  const DWunion uu = {.ll = x};
+  UWtype x1 = uu.s.low, x2 = uu.s.high;
+  x1 = x1 - ((x1 >> 1) & POPCOUNTCST (0x55));
+  x2 = x2 - ((x2 >> 1) & POPCOUNTCST (0x55));
+  x1 = (x1 & POPCOUNTCST (0x33)) + ((x1 >> 2) & POPCOUNTCST (0x33));
+  x2 = (x2 & POPCOUNTCST (0x33)) + ((x2 >> 2) & POPCOUNTCST (0x33));
+  x1 = (x1 + (x1 >> 4)) & POPCOUNTCST (0x0F);
+  x2 = (x2 + (x2 >> 4)) & POPCOUNTCST (0x0F);
+  x1 += x2;
+  return (x1 * POPCOUNTCST (0x01)) >> (W_TYPE_SIZE - BITS_PER_UNIT);
+#else
   int i, ret = 0;
 
   for (i = 0; i < 2*W_TYPE_SIZE; i += 8)
     ret += __popcount_tab[(x >> i) & 0xff];
 
   return ret;
+#endif
 }
 #endif