Patchwork The speed of the compiler, was: Re: Combine four insns

login
register
mail settings
Submitter Andi Kleen
Date Aug. 10, 2010, 2:48 p.m.
Message ID <877hjy8qwk.fsf@basil.nowhere.org>
Download mbox | patch
Permalink /patch/61394/
State New
Headers show

Comments

Andi Kleen - Aug. 10, 2010, 2:48 p.m.
Chris Lattner <clattner@apple.com> writes:
>
>   e. General speedups: Clang's preprocessor is roughly 2x faster than GCC's and the frontend is generally much faster.  For example, it uses hash tables instead of lists where appropriate, so it doesn't get N^2 cases in silly situations as often.  I don't what what else GCC is doing wrong, I haven't looked at its frontends much.

I looked at this a weekend or two ago. The two hot functions in the
preprocessor are cpp_clean_line and the lexer.

At least cpp_clean_line was pretty easy to speed up using SSE 4.2
string instructions and vectorizing it. 

That change made it drop down from top 10 in a unoptimized build to
lower top 40 or so. I suspect with that change the clang advantage
is much less than 2x.

Drawback: the patch broke some of the PCH test cases in the test
suite and I never quite figured out why (that's why I didn't post
the patch)

Other drawback: the optimization only helps on x86 systems
that support SSE 4.2 (but presumably that's a popular build system)

Here's the patch if anyone is interested.

Vectorizing the lexer might be possible too, but it's somewhat
harder.

The other problem I found is that cpplib is not using profile
feedback, that is likely giving some performance away too.

-Andi
Richard Guenther - Aug. 10, 2010, 2:58 p.m.
On Tue, Aug 10, 2010 at 4:48 PM, Andi Kleen <andi@firstfloor.org> wrote:
> Chris Lattner <clattner@apple.com> writes:
>>
>>   e. General speedups: Clang's preprocessor is roughly 2x faster than GCC's and the frontend is generally much faster.  For example, it uses hash tables instead of lists where appropriate, so it doesn't get N^2 cases in silly situations as often.  I don't what what else GCC is doing wrong, I haven't looked at its frontends much.
>
> I looked at this a weekend or two ago. The two hot functions in the
> preprocessor are cpp_clean_line and the lexer.
>
> At least cpp_clean_line was pretty easy to speed up using SSE 4.2
> string instructions and vectorizing it.
>
> That change made it drop down from top 10 in a unoptimized build to
> lower top 40 or so. I suspect with that change the clang advantage
> is much less than 2x.
>
> Drawback: the patch broke some of the PCH test cases in the test
> suite and I never quite figured out why (that's why I didn't post
> the patch)
>
> Other drawback: the optimization only helps on x86 systems
> that support SSE 4.2 (but presumably that's a popular build system)
>
> Here's the patch if anyone is interested.
>
> Vectorizing the lexer might be possible too, but it's somewhat
> harder.
>
> The other problem I found is that cpplib is not using profile
> feedback, that is likely giving some performance away too.

I'm sure there is a way to open-code this using integer math.
Likely the performance issue is both that we use byte loads
and 4 comparisons per char.  Maybe 4 parallel strchr optimized
searches are comparable fast?

Richard.

> -Andi
>
>
> diff --git a/libcpp/init.c b/libcpp/init.c
> index c5b8c28..769aa50 100644
> --- a/libcpp/init.c
> +++ b/libcpp/init.c
> @@ -137,6 +137,8 @@ init_library (void)
>  #ifdef ENABLE_NLS
>        (void) bindtextdomain (PACKAGE, LOCALEDIR);
>  #endif
> +
> +       init_vectorized_lexer ();
>     }
>  }
>
> diff --git a/libcpp/internal.h b/libcpp/internal.h
> index 9209b55..10ed033 100644
> --- a/libcpp/internal.h
> +++ b/libcpp/internal.h
> @@ -725,6 +725,8 @@ ufputs (const unsigned char *s, FILE *f)
>   return fputs ((const char *)s, f);
>  }
>
> +extern void init_vectorized_lexer (void);
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/libcpp/lex.c b/libcpp/lex.c
> index f628272..589fa64 100644
> --- a/libcpp/lex.c
> +++ b/libcpp/lex.c
> @@ -96,6 +96,82 @@ add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
>   buffer->notes_used++;
>  }
>
> +#if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__))
> +
> +#define HAVE_SSE42 1
> +
> +#include <stdint.h>
> +#include "../gcc/config/i386/cpuid.h"
> +
> +bool cpu_has_sse42;
> +
> +/* Check if CPU supports vectorized string instructions. */
> +
> +void
> +init_vectorized_lexer (void)
> +{
> +  unsigned dummy, ecx;
> +
> +  if (__get_cpuid (1, &dummy, &dummy, &ecx, &dummy))
> +      cpu_has_sse42 = !!(ecx & (1 << 20));
> +}
> +
> +/* Fast path to find line special characters using SSE 4.2 vectorized string
> +   instructions. Anything complicated falls back to the slow path below.
> +   Since this loop is very hot it's worth doing these kinds of
> +   optimizations. Returns true if stopper character found.
> +
> +   We should be using the _mm intrinsics, but the xxxintr headers do things
> +   not allowed in gcc. So instead use direct builtins. */
> +
> +static bool __attribute__((__target__("sse4.2")))
> +search_line_sse42 (const uchar *s, const uchar *end, const uchar **out)
> +{
> +  typedef char m128i __attribute__ ((__vector_size__ (16)));
> +  int left;
> +  int index;
> +  static char searchstr[16] __attribute__ ((aligned(16))) = "\n\r?\\";
> +  m128i search = *(m128i *)searchstr;
> +  m128i data;
> +
> +  for (left = end - (uchar *)s; left > 0; left -= 16)
> +    {
> +      if (((uintptr_t)s & 0xfff) > 0xff0)
> +       {
> +         /* Too near page boundary. Use slow path. This could be
> +            avoided if we ensure suitable padding or alignment in
> +            the input buffer. */
> +         *out = s;
> +         return false;
> +       }
> +
> +      /* Use vectorized string comparison, looking for the 4 stoppers. */
> +      data = (m128i) __builtin_ia32_loaddqu((const char *)s);
> +      index = __builtin_ia32_pcmpestri128 (search, 4, data, left, 0);
> +      if (index < 16)
> +       {
> +         *out = s + index;
> +         return true;
> +       }
> +      s += 16;
> +    }
> +
> +  /* Ran out of buffer. Should not happen? */
> +  *out = end;
> +  return false;
> +}
> +
> +#else
> +
> +/* Dummy */
> +
> +void
> +init_vectorized_lexer (void)
> +{
> +}
> +
> +#endif
> +
>  /* Returns with a logical line that contains no escaped newlines or
>    trigraphs.  This is a time-critical inner loop.  */
>  void
> @@ -109,12 +185,41 @@ _cpp_clean_line (cpp_reader *pfile)
>   buffer->cur_note = buffer->notes_used = 0;
>   buffer->cur = buffer->line_base = buffer->next_line;
>   buffer->need_line = false;
> -  s = buffer->next_line - 1;
> +  s = buffer->next_line;
>
>   if (!buffer->from_stage3)
>     {
>       const uchar *pbackslash = NULL;
>
> +#ifdef HAVE_SSE42
> +      if (cpu_has_sse42)
> +       {
> +         for (;;)
> +           {
> +             /* Drop into slow path if ? or nothing is found. */
> +             if (search_line_sse42 (s, buffer->rlimit, &s) == false
> +                 || *s == '?')
> +               break;
> +
> +             c = *s;
> +
> +             /* Special case for backslash which is reasonably common.
> +                Continue searching using the fast path */
> +             if (c == '\\')
> +               {
> +                 pbackslash = s;
> +                 s++;
> +                 continue;
> +               }
> +
> +             /* \n or \r here. Process it below. */
> +             goto found;
> +           }
> +       }
> +#endif
> +
> +      s--;
> +
>       /* Short circuit for the common case of an un-escaped line with
>         no trigraphs.  The primary win here is by not writing any
>         data back to memory until we have to.  */
> @@ -124,6 +229,9 @@ _cpp_clean_line (cpp_reader *pfile)
>          if (__builtin_expect (c == '\n', false)
>              || __builtin_expect (c == '\r', false))
>            {
> +#ifdef HAVE_SSE42
> +           found:
> +#endif
>              d = (uchar *) s;
>
>              if (__builtin_expect (s == buffer->rlimit, false))
>
>
>
> --
> ak@linux.intel.com -- Speaking for myself only.
>
Andi Kleen - Aug. 10, 2010, 3:11 p.m.
> I'm sure there is a way to open-code this using integer math.

I don't think so. Take a look at what PCMPESTRI does.  There's no easy 
replacement, even if you use all the Hacker's Delight tricks 
(it's really a cool instruction, but also very complicated :-)

> Likely the performance issue is both that we use byte loads
> and 4 comparisons per char.  Maybe 4 parallel strchr optimized
> searches are comparable fast?

and various other overhead.

-Andi
Tom Tromey - Aug. 10, 2010, 7:51 p.m.
>>>>> "Andi" == Andi Kleen <andi@firstfloor.org> writes:

Richard> I'm sure there is a way to open-code this using integer math.

Andi> I don't think so. Take a look at what PCMPESTRI does.  There's no easy 
Andi> replacement, even if you use all the Hacker's Delight tricks 
Andi> (it's really a cool instruction, but also very complicated :-)

This is still pending:

http://gcc.gnu.org/ml/gcc-patches/2010-03/msg00526.html

I think any sort of hackery in this area is fine, if it speeds up the
compiler.  All that is needed is someone with the time and motivation to
do the testing.

Tom
H.J. Lu - Aug. 10, 2010, 8:09 p.m.
On Tue, Aug 10, 2010 at 7:48 AM, Andi Kleen <andi@firstfloor.org> wrote:
> Chris Lattner <clattner@apple.com> writes:
>>
>>   e. General speedups: Clang's preprocessor is roughly 2x faster than GCC's and the frontend is generally much faster.  For example, it uses hash tables instead of lists where appropriate, so it doesn't get N^2 cases in silly situations as often.  I don't what what else GCC is doing wrong, I haven't looked at its frontends much.
>
> I looked at this a weekend or two ago. The two hot functions in the
> preprocessor are cpp_clean_line and the lexer.
>
> At least cpp_clean_line was pretty easy to speed up using SSE 4.2
> string instructions and vectorizing it.
>
> That change made it drop down from top 10 in a unoptimized build to
> lower top 40 or so. I suspect with that change the clang advantage
> is much less than 2x.
>
> Drawback: the patch broke some of the PCH test cases in the test
> suite and I never quite figured out why (that's why I didn't post
> the patch)
>
> Other drawback: the optimization only helps on x86 systems
> that support SSE 4.2 (but presumably that's a popular build system)
>
> Here's the patch if anyone is interested.
>
> Vectorizing the lexer might be possible too, but it's somewhat
> harder.
>
> The other problem I found is that cpplib is not using profile
> feedback, that is likely giving some performance away too.
>
> -Andi
>
>

You can use IFUNC to automatically enable it on Linux.
Andi Kleen - Aug. 10, 2010, 8:15 p.m.
On Tue, Aug 10, 2010 at 01:51:46PM -0600, Tom Tromey wrote:
> >>>>> "Andi" == Andi Kleen <andi@firstfloor.org> writes:
> 
> Richard> I'm sure there is a way to open-code this using integer math.
> 
> Andi> I don't think so. Take a look at what PCMPESTRI does.  There's no easy 
> Andi> replacement, even if you use all the Hacker's Delight tricks 
> Andi> (it's really a cool instruction, but also very complicated :-)
> 
> This is still pending:
> 
> http://gcc.gnu.org/ml/gcc-patches/2010-03/msg00526.html

I see. I suspect my version is faster though :-) (if you have a suitable
CPU)

Also I think David got the "buffer near end of page before hole" case wrong
(although I'm not fully sure that really happens in gcc) 

> 
> I think any sort of hackery in this area is fine, if it speeds up the
> compiler.  All that is needed is someone with the time and motivation to
> do the testing.

My version passes testing, except for a few PCH test cases. I actually
ran it in parallel with the original cleanline and it gave the same
results. Any ideas why it could break PCH?

-Andi
Mike Stump - Aug. 10, 2010, 10:37 p.m.
On Aug 10, 2010, at 1:15 PM, Andi Kleen wrote:
> My version passes testing, except for a few PCH test cases. I actually
> ran it in parallel with the original cleanline and it gave the same
> results. Any ideas why it could break PCH?

:-)  Yes.  I'd first run it again and see if it always fails.  Sometimes for me the pch testcases shimmer.  After that, you have to just look at the output.  The output can't depend upon things like rtx addresses.  If you have multiple hosts, try the testcase (no pch) on different hosts and ensure you get the same output (via cross compilation).  Any differences tend to be the sorts of things that pch isn't going to like.  sorts on hashes that involve the addresses are typical.  Instead, one should use UID or other data to sort on.
Tom Tromey - Aug. 10, 2010, 10:40 p.m.
>>>>> "Andi" == Andi Kleen <andi@firstfloor.org> writes:

Andi> My version passes testing, except for a few PCH test cases. I actually
Andi> ran it in parallel with the original cleanline and it gave the same
Andi> results. Any ideas why it could break PCH?

I don't have any theory.  It doesn't seem like it could affect PCH.

Tom
Nathan Froyd - Aug. 12, 2010, 8:25 p.m.
On Tue, Aug 10, 2010 at 01:51:46PM -0600, Tom Tromey wrote:
> >>>>> "Andi" == Andi Kleen <andi@firstfloor.org> writes:
> 
> Richard> I'm sure there is a way to open-code this using integer math.
> 
> Andi> I don't think so. Take a look at what PCMPESTRI does.  There's no easy 
> Andi> replacement, even if you use all the Hacker's Delight tricks 
> Andi> (it's really a cool instruction, but also very complicated :-)
> 
> This is still pending:
> 
> http://gcc.gnu.org/ml/gcc-patches/2010-03/msg00526.html
> 
> I think any sort of hackery in this area is fine, if it speeds up the
> compiler.  All that is needed is someone with the time and motivation to
> do the testing.

FWIW, testing it on x86-64 on two source files lying about (admittedly a
rather small sample size) reduced the # of instructions executed cc1plus
-E by ~6% in one case and 0% (yes, exactly 0%) in the other.  Perhaps
there's a bug lying about (the patch wouldn't compile on a little-endian
machine, for one...).

I like Andi's PCMPESTRI idea, but that does nothing for the many people
without Intel's latest generation of chips (of which there are many),
let alone people on other platforms.

-Nathan
Mark Mitchell - Aug. 17, 2010, 3:13 p.m.
Tom Tromey wrote:

> http://gcc.gnu.org/ml/gcc-patches/2010-03/msg00526.html
> 
> I think any sort of hackery in this area is fine, if it speeds up the
> compiler.

I completely agree.  It's entirely acceptable to use machine-specific
code in a very hot loop if that's what it takes.

Patch

diff --git a/libcpp/init.c b/libcpp/init.c
index c5b8c28..769aa50 100644
--- a/libcpp/init.c
+++ b/libcpp/init.c
@@ -137,6 +137,8 @@  init_library (void)
 #ifdef ENABLE_NLS
        (void) bindtextdomain (PACKAGE, LOCALEDIR);
 #endif
+
+       init_vectorized_lexer ();
     }
 }
 
diff --git a/libcpp/internal.h b/libcpp/internal.h
index 9209b55..10ed033 100644
--- a/libcpp/internal.h
+++ b/libcpp/internal.h
@@ -725,6 +725,8 @@  ufputs (const unsigned char *s, FILE *f)
   return fputs ((const char *)s, f);
 }
 
+extern void init_vectorized_lexer (void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/libcpp/lex.c b/libcpp/lex.c
index f628272..589fa64 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -96,6 +96,82 @@  add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
   buffer->notes_used++;
 }
 
+#if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__))
+
+#define HAVE_SSE42 1
+
+#include <stdint.h>
+#include "../gcc/config/i386/cpuid.h"
+
+bool cpu_has_sse42;
+
+/* Check if CPU supports vectorized string instructions. */
+
+void 
+init_vectorized_lexer (void)
+{
+  unsigned dummy, ecx;
+
+  if (__get_cpuid (1, &dummy, &dummy, &ecx, &dummy))
+      cpu_has_sse42 = !!(ecx & (1 << 20));
+}
+
+/* Fast path to find line special characters using SSE 4.2 vectorized string 
+   instructions. Anything complicated falls back to the slow path below. 
+   Since this loop is very hot it's worth doing these kinds of
+   optimizations. Returns true if stopper character found. 
+
+   We should be using the _mm intrinsics, but the xxxintr headers do things
+   not allowed in gcc. So instead use direct builtins. */
+
+static bool __attribute__((__target__("sse4.2")))
+search_line_sse42 (const uchar *s, const uchar *end, const uchar **out)
+{
+  typedef char m128i __attribute__ ((__vector_size__ (16)));
+  int left;
+  int index;
+  static char searchstr[16] __attribute__ ((aligned(16))) = "\n\r?\\";
+  m128i search = *(m128i *)searchstr;
+  m128i data;
+		
+  for (left = end - (uchar *)s; left > 0; left -= 16) 
+    { 
+      if (((uintptr_t)s & 0xfff) > 0xff0) 
+	{
+	  /* Too near page boundary. Use slow path. This could be
+	     avoided if we ensure suitable padding or alignment in
+	     the input buffer. */
+	  *out = s;
+	  return false;
+	}
+
+      /* Use vectorized string comparison, looking for the 4 stoppers. */
+      data = (m128i) __builtin_ia32_loaddqu((const char *)s);
+      index = __builtin_ia32_pcmpestri128 (search, 4, data, left, 0);
+      if (index < 16) 
+	{
+	  *out = s + index;
+	  return true;
+	}
+      s += 16;
+    }
+
+  /* Ran out of buffer. Should not happen? */
+  *out = end;
+  return false;
+}
+
+#else
+
+/* Dummy */
+
+void 
+init_vectorized_lexer (void)
+{
+}
+
+#endif
+
 /* Returns with a logical line that contains no escaped newlines or
    trigraphs.  This is a time-critical inner loop.  */
 void
@@ -109,12 +185,41 @@  _cpp_clean_line (cpp_reader *pfile)
   buffer->cur_note = buffer->notes_used = 0;
   buffer->cur = buffer->line_base = buffer->next_line;
   buffer->need_line = false;
-  s = buffer->next_line - 1;
+  s = buffer->next_line;
 
   if (!buffer->from_stage3)
     {
       const uchar *pbackslash = NULL;
 
+#ifdef HAVE_SSE42
+      if (cpu_has_sse42)
+	{
+	  for (;;) 
+	    {
+	      /* Drop into slow path if ? or nothing is found. */
+	      if (search_line_sse42 (s, buffer->rlimit, &s) == false
+		  || *s == '?')
+		break;
+
+	      c = *s;
+	      
+	      /* Special case for backslash which is reasonably common.
+		 Continue searching using the fast path */
+	      if (c == '\\') 
+		{
+		  pbackslash = s;
+		  s++;
+		  continue;
+		}
+
+	      /* \n or \r here. Process it below. */
+	      goto found;
+	    }
+	}
+#endif
+
+      s--;
+
       /* Short circuit for the common case of an un-escaped line with
 	 no trigraphs.  The primary win here is by not writing any
 	 data back to memory until we have to.  */
@@ -124,6 +229,9 @@  _cpp_clean_line (cpp_reader *pfile)
 	  if (__builtin_expect (c == '\n', false)
 	      || __builtin_expect (c == '\r', false))
 	    {
+#ifdef HAVE_SSE42
+	    found:
+#endif
 	      d = (uchar *) s;
 
 	      if (__builtin_expect (s == buffer->rlimit, false))