Patchwork Speedup bitmap iterators

login
register
mail settings
Submitter Jan Hubicka
Date June 9, 2010, 9:51 a.m.
Message ID <20100609095137.GA18910@kam.mff.cuni.cz>
Download mbox | patch
Permalink /patch/55065/
State New
Headers show

Comments

Jan Hubicka - June 9, 2010, 9:51 a.m.
Hi,
this patch avoid dififcult to predict branch in bmp walking. Improving:
               :static inline bool
               :bmp_iter_set (bitmap_iterator *bi, unsigned *bit_no)
               :{
               :  /* If our current word is nonzero, it contains the bit we want.  */
  1551  0.1115 :  if (bi->bits)
               :    {
               :    next_bit:
  7282  0.5237 :      while (!(bi->bits & 1))
               :        {
  5312  0.3820 :          bi->bits >>= 1;
  7904  0.5685 :          *bit_no += 1;
               :        }
               :      return true;
               :    }

to:

               : bool
               :bmp_iter_set (bitmap_iterator *bi, unsigned *bit_no)
               :{
               :  /* If our current word is nonzero, it contains the bit we want.  */
  1274  0.0973 :  if (bi->bits)
               :    {
               :    next_bit:
               :#if (GCC_VERSION >= 3004)
               :      {
  1106  0.0845 :        unsigned int n = __builtin_ctzl (bi->bits);
               :        gcc_assert (sizeof (unsigned long) == sizeof (BITMAP_WORD));
  1811  0.1384 :        bi->bits >>= n;
  1892  0.1446 :        *bit_no += n;
               :      }
               :#else

The code (including checking for 3.4) is the same as one present in
bitmap_first_bit already.

Bootstrapped/regtested x86_64-linux, OK?

Honza

	* bitmap.h (bmp_iter_set, bmp_iter_and, bmp_iter_and_compl):
	Use __builtin_ctzl where possible.
Manuel López-Ibáñez - June 9, 2010, 9:57 a.m.
On 9 June 2010 11:51, Jan Hubicka <hubicka@ucw.cz> wrote:
>
>        * bitmap.h (bmp_iter_set, bmp_iter_and, bmp_iter_and_compl):
>        Use __builtin_ctzl where possible.

Am I missing something or is the code exactly the same in the three
cases? Why so much duplication?

Manuel.
Richard Guenther - June 9, 2010, 10:32 a.m.
On Wed, Jun 9, 2010 at 11:51 AM, Jan Hubicka <hubicka@ucw.cz> wrote:
> Hi,
> this patch avoid dififcult to predict branch in bmp walking. Improving:
>               :static inline bool
>               :bmp_iter_set (bitmap_iterator *bi, unsigned *bit_no)
>               :{
>               :  /* If our current word is nonzero, it contains the bit we want.  */
>  1551  0.1115 :  if (bi->bits)
>               :    {
>               :    next_bit:
>  7282  0.5237 :      while (!(bi->bits & 1))
>               :        {
>  5312  0.3820 :          bi->bits >>= 1;
>  7904  0.5685 :          *bit_no += 1;
>               :        }
>               :      return true;
>               :    }
>
> to:
>
>               : bool
>               :bmp_iter_set (bitmap_iterator *bi, unsigned *bit_no)
>               :{
>               :  /* If our current word is nonzero, it contains the bit we want.  */
>  1274  0.0973 :  if (bi->bits)
>               :    {
>               :    next_bit:
>               :#if (GCC_VERSION >= 3004)
>               :      {
>  1106  0.0845 :        unsigned int n = __builtin_ctzl (bi->bits);
>               :        gcc_assert (sizeof (unsigned long) == sizeof (BITMAP_WORD));
>  1811  0.1384 :        bi->bits >>= n;
>  1892  0.1446 :        *bit_no += n;
>               :      }
>               :#else
>
> The code (including checking for 3.4) is the same as one present in
> bitmap_first_bit already.
>
> Bootstrapped/regtested x86_64-linux, OK?

Hm.  I am concerned about using __builtin_ctzl unconditionally
as it might expand to a call to libgcc which for sure wouldn't be
faster(?).  Maybe that's also a question as instead of a loop it
would use a few table lookups.

Note that the common case might even be that bi->bits & 1 already
which the dispatching to ctzl would be even worse.

So I guess you should probably use the count_leading/trailing_zeros
facility from longlong.h instead and just for selected host archs
use __builtin_ctzl (it wouldn't be available for generic x86_64 even).

Also duplicating all this code again and again is bad - can you
factor it out into a single macro first?

Thanks,
Richard.

> Honza
>
>        * bitmap.h (bmp_iter_set, bmp_iter_and, bmp_iter_and_compl):
>        Use __builtin_ctzl where possible.
> Index: bitmap.h
> ===================================================================
> --- bitmap.h    (revision 160447)
> +++ bitmap.h    (working copy)
> @@ -396,11 +396,20 @@ bmp_iter_set (bitmap_iterator *bi, unsig
>   if (bi->bits)
>     {
>     next_bit:
> +#if (GCC_VERSION >= 3004)
> +      {
> +       unsigned int n = __builtin_ctzl (bi->bits);
> +       gcc_assert (sizeof (unsigned long) == sizeof (BITMAP_WORD));
> +       bi->bits >>= n;
> +       *bit_no += n;
> +      }
> +#else
>       while (!(bi->bits & 1))
>        {
>          bi->bits >>= 1;
>          *bit_no += 1;
>        }
> +#endif
>       return true;
>     }
>
> @@ -443,11 +452,20 @@ bmp_iter_and (bitmap_iterator *bi, unsig
>   if (bi->bits)
>     {
>     next_bit:
> +#if (GCC_VERSION >= 3004)
> +      {
> +       unsigned int n = __builtin_ctzl (bi->bits);
> +       gcc_assert (sizeof (unsigned long) == sizeof (BITMAP_WORD));
> +       bi->bits >>= n;
> +       *bit_no += n;
> +      }
> +#else
>       while (!(bi->bits & 1))
>        {
>          bi->bits >>= 1;
>          *bit_no += 1;
>        }
> +#endif
>       return true;
>     }
>
> @@ -510,11 +528,20 @@ bmp_iter_and_compl (bitmap_iterator *bi,
>   if (bi->bits)
>     {
>     next_bit:
> +#if (GCC_VERSION >= 3004)
> +      {
> +       unsigned int n = __builtin_ctzl (bi->bits);
> +       gcc_assert (sizeof (unsigned long) == sizeof (BITMAP_WORD));
> +       bi->bits >>= n;
> +       *bit_no += n;
> +      }
> +#else
>       while (!(bi->bits & 1))
>        {
>          bi->bits >>= 1;
>          *bit_no += 1;
>        }
> +#endif
>       return true;
>     }
>
>
Paolo Carlini - June 9, 2010, 11:15 a.m.
On 06/09/2010 12:32 PM, Richard Guenther wrote:
> Hm. I am concerned about using __builtin_ctzl unconditionally
> as it might expand to a call to libgcc which for sure wouldn't be
> faster(?).
Somewhen I'd like to understand this issue better, because quite a few
years ago somebody decided that using unconditionally builtins like
__builtin_ctzl in <bitset> would be a win...

Paolo.
Jan Hubicka - June 9, 2010, 11:25 a.m.
> On 06/09/2010 12:32 PM, Richard Guenther wrote:
> > Hm. I am concerned about using __builtin_ctzl unconditionally
> > as it might expand to a call to libgcc which for sure wouldn't be
> > faster(?).
> Somewhen I'd like to understand this issue better, because quite a few
> years ago somebody decided that using unconditionally builtins like
> __builtin_ctzl in <bitset> would be a win...

I would personally sayt hat if we don't generate faster or equal code with
__builtin_ctzl than we do with hand written loop, it is codegen bug on that
particular target.  My preferrence would be to switch to using __builtin_ctzl
and then fix the targets where this results in lousy code.

Note that ctzl is in use in GCC:
bitmap.c:  bit_no += __builtin_ctzl (word);
bitmap.c:  bit_no += sizeof (long) * 8 - __builtin_ctzl (word);
ggc-page.c:       bit = __builtin_ctzl (~entry->in_use_p[word]);
toplev.h:#  define CTZ_HWI __builtin_ctzl
toplev.h:#  define CTZ_HWI __builtin_ctzll
toplev.h:  return x == (x & -x) && x ? (int) CTZ_HWI (x) : -1;

There are I guess 4 ways to handle this.  Either hardware instruction like i386
have, or the loop, jump sequence halving the interval or the libcall.
I guess that depending on target each of those do make sense.

I also don't link longlong.h facility applies here very easilly since it is for
long longs, not for longs that makes difference.

Honza
> 
> Paolo.
Paolo Bonzini - June 9, 2010, 4:21 p.m.
On 06/09/2010 11:51 AM, Jan Hubicka wrote:
> Hi,
> this patch avoid dififcult to predict branch in bmp walking. Improving:
>                 :static inline bool
>                 :bmp_iter_set (bitmap_iterator *bi, unsigned *bit_no)
>                 :{
>                 :  /* If our current word is nonzero, it contains the bit we want.  */
>    1551  0.1115 :  if (bi->bits)
>                 :    {
>                 :    next_bit:
>    7282  0.5237 :      while (!(bi->bits&  1))
>                 :        {
>    5312  0.3820 :          bi->bits>>= 1;
>    7904  0.5685 :          *bit_no += 1;
>                 :        }
>                 :      return true;
>                 :    }
>
> to:
>
>                 : bool
>                 :bmp_iter_set (bitmap_iterator *bi, unsigned *bit_no)
>                 :{
>                 :  /* If our current word is nonzero, it contains the bit we want.  */
>    1274  0.0973 :  if (bi->bits)
>                 :    {
>                 :    next_bit:
>                 :#if (GCC_VERSION>= 3004)
>                 :      {
>    1106  0.0845 :        unsigned int n = __builtin_ctzl (bi->bits);
>                 :        gcc_assert (sizeof (unsigned long) == sizeof (BITMAP_WORD));
>    1811  0.1384 :        bi->bits>>= n;
>    1892  0.1446 :        *bit_no += n;
>                 :      }
>                 :#else
>
> The code (including checking for 3.4) is the same as one present in
> bitmap_first_bit already.

Interesting, I thought it was done for a reason this way, when I was 
looking at bitmaps to speedup the df branch... should have profiled it. :-)

Paolo

Patch

Index: bitmap.h
===================================================================
--- bitmap.h	(revision 160447)
+++ bitmap.h	(working copy)
@@ -396,11 +396,20 @@  bmp_iter_set (bitmap_iterator *bi, unsig
   if (bi->bits)
     {
     next_bit:
+#if (GCC_VERSION >= 3004)
+      {
+	unsigned int n = __builtin_ctzl (bi->bits);
+	gcc_assert (sizeof (unsigned long) == sizeof (BITMAP_WORD));
+	bi->bits >>= n;
+	*bit_no += n;
+      }
+#else
       while (!(bi->bits & 1))
 	{
 	  bi->bits >>= 1;
 	  *bit_no += 1;
 	}
+#endif
       return true;
     }
 
@@ -443,11 +452,20 @@  bmp_iter_and (bitmap_iterator *bi, unsig
   if (bi->bits)
     {
     next_bit:
+#if (GCC_VERSION >= 3004)
+      {
+	unsigned int n = __builtin_ctzl (bi->bits);
+	gcc_assert (sizeof (unsigned long) == sizeof (BITMAP_WORD));
+	bi->bits >>= n;
+	*bit_no += n;
+      }
+#else
       while (!(bi->bits & 1))
 	{
 	  bi->bits >>= 1;
 	  *bit_no += 1;
 	}
+#endif
       return true;
     }
 
@@ -510,11 +528,20 @@  bmp_iter_and_compl (bitmap_iterator *bi,
   if (bi->bits)
     {
     next_bit:
+#if (GCC_VERSION >= 3004)
+      {
+	unsigned int n = __builtin_ctzl (bi->bits);
+	gcc_assert (sizeof (unsigned long) == sizeof (BITMAP_WORD));
+	bi->bits >>= n;
+	*bit_no += n;
+      }
+#else
       while (!(bi->bits & 1))
 	{
 	  bi->bits >>= 1;
 	  *bit_no += 1;
 	}
+#endif
       return true;
     }