Patchwork Vectorized _cpp_clean_line

login
register
mail settings
Submitter Richard Henderson
Date Aug. 12, 2010, 11:07 p.m.
Message ID <4C647EC7.4010601@redhat.com>
Download mbox | patch
Permalink /patch/61654/
State New
Headers show

Comments

Richard Henderson - Aug. 12, 2010, 11:07 p.m.
On 08/12/2010 03:23 PM, Richard Henderson wrote:
> On 08/12/2010 03:07 PM, Andi Kleen wrote:
>> At least for sse 4.2 I'm not sure the table lookup
>> for alignment is worth it. The unaligned loads are quite
>> cheap on current micro architectures with sse 4.2 
>> and the page end test is also not that expensive.
> 
> Perhaps.  That's something else that will want testing, as
> it's all of a dozen instructions.

Alternately, we can check for page end only at the end
of the buffer.  Because obviously all the middle bits
of the buffer are present.

I've done nothing with the indirect branch yet, but here's
a version that bootstraps and checks ok for x86_64 (w/sse4.2).
It also bootstraps on i686 (w/sse2 but not sse4.2); I've just
started the test run for that though.


r~
Richard Henderson - Aug. 12, 2010, 11:10 p.m.
On 08/12/2010 04:07 PM, Richard Henderson wrote:
> I've done nothing with the indirect branch yet, but here's
> a version that bootstraps and checks ok for x86_64 (w/sse4.2).
> It also bootstraps on i686 (w/sse2 but not sse4.2); I've just
> started the test run for that though.

Gah.  Hit send too soon.  Oprofile samples go from 

Before:

 16.31    518.00   518.00        7    74.00    74.00  _cpp_lex_direct
 12.25    907.00   389.00                             ht_lookup_with_hash
  9.83   1219.00   312.00        6    52.00    52.00  _cpp_clean_line

After:

 17.27    402.00   402.00        2   201.00   201.00  _cpp_lex_direct
 12.67    697.00   295.00                             ht_lookup_with_hash
 12.16    980.00   283.00        1   283.00   775.00  cpp_get_token
  9.41   1199.00   219.00                             cpp_output_token
  6.14   1342.00   143.00                             lex_identifier
  5.84   1478.00   136.00                             preprocess_file
  5.54   1607.00   129.00                             enter_macro_context
  3.39   1686.00    79.00        2    39.50    39.50  _cpp_lex_token
  2.88   1753.00    67.00        2    33.50    33.50  _cpp_clean_line
  2.28   1806.00    53.00                             search_line_sse42



r~
Andi Kleen - Aug. 13, 2010, 7:30 a.m.
On Thu, Aug 12, 2010 at 04:07:51PM -0700, Richard Henderson wrote:
> On 08/12/2010 03:23 PM, Richard Henderson wrote:
> > On 08/12/2010 03:07 PM, Andi Kleen wrote:
> >> At least for sse 4.2 I'm not sure the table lookup
> >> for alignment is worth it. The unaligned loads are quite
> >> cheap on current micro architectures with sse 4.2 
> >> and the page end test is also not that expensive.
> > 
> > Perhaps.  That's something else that will want testing, as
> > it's all of a dozen instructions.
> 
> Alternately, we can check for page end only at the end
> of the buffer.  Because obviously all the middle bits
> of the buffer are present.

Looks good.

-Andi

Patch

diff --git a/libcpp/init.c b/libcpp/init.c
index c5b8c28..769aa50 100644
--- a/libcpp/init.c
+++ b/libcpp/init.c
@@ -137,6 +137,8 @@  init_library (void)
 #ifdef ENABLE_NLS
        (void) bindtextdomain (PACKAGE, LOCALEDIR);
 #endif
+
+       init_vectorized_lexer ();
     }
 }
 
diff --git a/libcpp/internal.h b/libcpp/internal.h
index 9209b55..10ed033 100644
--- a/libcpp/internal.h
+++ b/libcpp/internal.h
@@ -725,6 +725,8 @@  ufputs (const unsigned char *s, FILE *f)
   return fputs ((const char *)s, f);
 }
 
+extern void init_vectorized_lexer (void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/libcpp/lex.c b/libcpp/lex.c
index f628272..96fb2dd 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -96,6 +96,399 @@  add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
   buffer->notes_used++;
 }
 
+
+/* Fast path to find line special characters using optimized character
+   scanning algorithms.  Anything complicated falls back to the slow
+   path below.  Since this loop is very hot it's worth doing these kinds
+   of optimizations.
+
+   One of the paths through the ifdefs should provide 
+
+     bool search_line_fast (const uchar *s, const uchar *end,
+                            const uchar **out)
+
+   Between S and END, search for \n, \r, \\, ?.  Return true if found.
+   Always update *OUT to the last character scanned, even if not found.  */
+
+/* ??? Should be in configure.ac.  */
+#define WORDS_BIGENDIAN 0
+
+/* We'd like the largest integer that fits into a register.  There's nothing
+   in <stdint.h> that gives us that.  For most hosts this is unsigned long,
+   but MS decided on an LLP64 model.  Thankfully when building with GCC we
+   can get the "real" word size.  */
+#ifdef __GNUC__
+typedef unsigned int word_type __attribute__((__mode__(__word__)));
+#else
+typedef unsigned long word_type;
+#endif
+
+/* Return VAL with the first N bytes forced to values that won't match one
+   of the interesting characters.  Note that NUL is not interesting.  */
+
+static inline word_type
+acc_char_mask_misalign (word_type val, unsigned int n)
+{
+  word_type mask = -1;
+  if (WORDS_BIGENDIAN)
+    mask >>= n * 8;
+  else
+    mask <<= n * 8;
+  return val & mask;
+}
+
+/* Return X replicated to all byte positions within WORD_TYPE.  */
+
+static inline word_type
+acc_char_replicate (uchar x)
+{
+  word_type ret;
+
+  ret = (x << 24) | (x << 16) | (x << 8) | x;
+  switch (sizeof (ret))
+    {
+    case 8:
+      ret = (ret << 16 << 16) | ret;
+      break;
+    case 4:
+      break;
+    default:
+      abort ();
+    }
+  return ret;
+}
+
+/* Return non-zero if some byte of VAL is (probably) C.  */
+
+static inline word_type
+acc_char_cmp (word_type val, word_type c)
+{
+#if defined(__GNUC__) && defined(__alpha__)
+  /* We can get exact results using a compare-bytes instruction.  Since
+     comparison is always >=, we could either do ((v >= c) & (c >= v))
+     for 3 operations or force matching bytes to zero and compare vs
+     zero with (0 >= (val ^ c)) for 2 operations.  */
+  return __builtin_alpha_cmpbge (0, val ^ c);
+#elif defined(__GNUC__) && defined(__ia64__)
+  /* ??? Ideally we'd have some sort of builtin for this, so that we
+     can pack the 4 instances into fewer bundles.  */
+  word_type ret;
+  __asm__("pcmp1.eq %0 = %1, %2" : "=r"(ret) : "r"(val), "r"(c));
+  return ret;
+#else
+  word_type magic = 0x7efefefeU;
+  switch (sizeof(word_type))
+    {
+    case 8:
+      magic = (magic << 16 << 16) | 0xfefefefeU;
+    case 4:
+      break;
+    default:
+      abort ();
+    }
+  magic |= 1;
+
+  val ^= c;
+  return ((val + magic) ^ ~val) & ~magic;
+#endif
+}
+
+/* Given the result of acc_char_cmp is non-zero, return the index of
+   the found character.  If this was a false positive, return -1.  */
+
+static inline int
+acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
+		word_type val ATTRIBUTE_UNUSED)
+{
+#if defined(__GNUC__) && defined(__alpha__)
+  /* The cmpbge instruction sets *bits* of the result corresponding to
+     matches in the bytes with no false positives.  This means that clz/ctz
+     produces a correct unscaled result.  */
+  return (WORDS_BIGENDIAN ? __builtin_clzl (cmp) : __builtin_ctzl (cmp));
+#elif defined(__GNUC__) && defined(__ia64__)
+  /* The pcmp1 instruction sets matching bytes to 0xff with no false
+     positives.  This means that clz/ctz produces a correct scaled result.  */
+  return (WORDS_BIGENDIAN ? __builtin_clzl (cmp) : __builtin_ctzl (cmp)) >> 3;
+#else
+  unsigned int i;
+
+  /* ??? It would be nice to force unrolling here,
+     and have all of these constants folded.  */
+  for (i = 0; i < sizeof(word_type); ++i)
+    {
+      uchar c;
+      if (WORDS_BIGENDIAN)
+	c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
+      else
+	c = (val >> i * 8) & 0xff;
+
+      if (c == '\n' || c == '\r' || c == '\\' || c == '?')
+	return i;
+    }
+
+  return -1;
+#endif
+}
+
+/* A version of the fast scanner using bit fiddling techniques.
+ 
+   For 32-bit words, one would normally perform 16 comparisons and
+   16 branches.  With this algorithm one performs 24 arithmetic
+   operations and one branch.  Whether this is faster with a 32-bit
+   word size is going to be somewhat system dependent.
+
+   For 64-bit words, we eliminate twice the number of comparisons
+   and branches without increasing the number of arithmetic operations.
+   It's almost certainly going to be a win with 64-bit word size.  */
+
+static bool
+search_line_acc_char (const uchar *s, const uchar *end, const uchar **out)
+{
+  const word_type repl_nl = acc_char_replicate ('\n');
+  const word_type repl_cr = acc_char_replicate ('\r');
+  const word_type repl_bs = acc_char_replicate ('\\');
+  const word_type repl_qm = acc_char_replicate ('?');
+
+  unsigned int misalign;
+  ptrdiff_t left;
+  const word_type *p;
+  word_type val;
+  
+  /* Don't bother optimizing very short lines; too much masking to do.  */
+  left = end - s;
+  if (left < (ptrdiff_t) sizeof(word_type))
+    {
+      *out = s;
+      return false;
+    }
+
+  /* Align the buffer.  Mask out any bytes from before the beginning.  */
+  p = (word_type *)((uintptr_t)s & -sizeof(word_type));
+  val = *p;
+  misalign = (uintptr_t)s & (sizeof(word_type) - 1);
+  if (misalign)
+    {
+      val = acc_char_mask_misalign (val, misalign);
+      left += misalign;
+    }
+
+  /* Main loop.  */
+  while (1)
+    {
+      word_type m_nl, m_cr, m_bs, m_qm, t;
+
+      m_nl = acc_char_cmp (val, repl_nl);
+      m_cr = acc_char_cmp (val, repl_cr);
+      m_bs = acc_char_cmp (val, repl_bs);
+      m_qm = acc_char_cmp (val, repl_qm);
+      t = (m_nl | m_cr) | (m_bs | m_qm);
+
+      if (__builtin_expect (t != 0, 0))
+	{
+	  int i = acc_char_index (t, val);
+	  if (i >= 0)
+	    {
+	      *out = (const uchar *)p + i;
+	      return true;
+	    }
+	}
+
+      left -= sizeof(word_type);
+      ++p;
+      if (left < (ptrdiff_t) sizeof(word_type))
+	{
+	  /* Ran out of complete words.  */
+	  *out = (const uchar *)p;
+	  return false;
+	}
+      val = *p;
+    }
+}
+
+#if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__))
+/* Fast path to find line special characters using SSE 4.2 vectorized string 
+   instructions. Anything complicated falls back to the slow path below. 
+   Since this loop is very hot it's worth doing these kinds of
+   optimizations. Returns true if stopper character found. 
+
+   We should be using the _mm intrinsics, but the xxxintr headers do things
+   not allowed in gcc.  So instead use direct builtins.  */
+
+static bool __attribute__((__target__("sse4.2")))
+search_line_sse42 (const uchar *s, const uchar *end, const uchar **out)
+{
+  typedef char m128i __attribute__ ((__vector_size__ (16)));
+  static const m128i search
+    = { '\n', '\r', '?', '\\', 0,0,0,0,0,0,0,0,0,0,0,0 };
+
+  ptrdiff_t left;
+  m128i data;
+  int index;
+
+  /* Main loop, processing 16 bytes at a time.  */
+  for (left = end - s; left >= 16; left -= 16, s += 16)
+    {
+      data = __builtin_ia32_loaddqu((const char *)s);
+      index = __builtin_ia32_pcmpestri128 (search, 4, data, 16, 0);
+      if (__builtin_expect (index < 16, 0)) 
+	{
+	  *out = (const uchar *)s + index;
+	  return true;
+	}
+    }
+
+  /* There are less than 16 bytes remaining.  If we can read those bytes
+     without reading from a possibly unmapped next page, then go ahead and
+     do so.  If we are near the end of the page then don't bother; returning
+     will scan the balance of the buffer byte-by-byte.  */
+  if (left > 0 && ((uintptr_t)s & 0xfff) <= 0xff0)
+    {
+      data = __builtin_ia32_loaddqu((const char *)s);
+      index = __builtin_ia32_pcmpestri128 (search, 4, data, left, 0);
+      if (__builtin_expect (index < 16, 0))
+	{
+	  *out = (const uchar *)s + index;
+	  return true;
+	}
+      s = end;
+    }
+
+  /* No match within buffer.  */
+  *out = s;
+  return false;
+}
+
+static bool __attribute__((__target__("sse2")))
+search_line_sse2 (const uchar *s, const uchar *end, const uchar **out)
+{
+  typedef char m128i __attribute__ ((__vector_size__ (16)));
+
+  static const m128i mask_align[16] = {
+    {  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 },
+    { -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 },
+    { -1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 },
+    { -1, -1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 },
+    { -1, -1, -1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 },
+    { -1, -1, -1, -1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 },
+    { -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 },
+    { -1, -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0 },
+    { -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  0,  0,  0,  0 },
+    { -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  0,  0,  0 },
+    { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  0,  0 },
+    { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  0 },
+    { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  0,  0 },
+    { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  0 },
+    { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0 },
+    { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0 }
+  };
+
+  static const m128i repl_nl = {
+    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', 
+    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
+  };
+  static const m128i repl_cr = {
+    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r', 
+    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
+  };
+  static const m128i repl_bs = {
+    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', 
+    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
+  };
+  static const m128i repl_qm = {
+    '?', '?', '?', '?', '?', '?', '?', '?', 
+    '?', '?', '?', '?', '?', '?', '?', '?', 
+  };
+
+  ptrdiff_t left;
+  unsigned int misalign, mask;
+  const m128i *p;
+  m128i data, t;
+
+  left = end - s;
+  if (left <= 0)
+    goto no_match;
+
+  /* Align the pointer and mask out the bytes before the start.  */
+  misalign = (uintptr_t)s & 15;
+  p = (const m128i *)((uintptr_t)s & -16);
+  data = *p;
+  if (misalign)
+    {
+      data |= mask_align[misalign];
+      left += misalign;
+    }
+
+  while (1)
+    {
+      t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
+      t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
+      t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
+      t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
+      mask = __builtin_ia32_pmovmskb128 (t);
+
+      if (__builtin_expect (mask, 0))
+	{
+	  /* If there were less than 16 bytes left in the buffer, then there
+	     will be comparisons beyond the end of buffer.  Mask them.  */
+	  if (left < 16)
+	    {
+	      mask &= (2u << left) - 1;
+	      if (mask == 0)
+		goto no_match;
+	    }
+
+	  *out = (const uchar *)p + __builtin_ctz (mask);
+	  return true;
+	}
+
+      left -= 16;
+      if (left <= 0)
+	goto no_match;
+      data = *++p;
+    }
+
+ no_match:
+  /* No match within buffer.  */
+  *out = end;
+  return false;
+}
+
+/* Check if CPU supports vectorized string instructions. */
+
+#include "../gcc/config/i386/cpuid.h"
+
+static bool (*search_line_fast)(const uchar *, const uchar *, const uchar **)
+  = search_line_acc_char;
+
+void 
+init_vectorized_lexer (void)
+{
+  unsigned dummy, ecx, edx;
+
+  if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx))
+    {
+      if (ecx & bit_SSE4_2)
+	search_line_fast = search_line_sse42;
+      else if (edx & bit_SSE2)
+	search_line_fast = search_line_sse2;
+    }
+}
+
+#else
+
+/* We only have one accelerated alternative.  Use a direct call so that
+   we encourage inlining.  We must still provide the init_vectorized_lexer
+   entry point, even though it does nothing.  */
+
+#define search_line_fast  search_line_acc_char
+
+void 
+init_vectorized_lexer (void)
+{
+}
+
+#endif
+
 /* Returns with a logical line that contains no escaped newlines or
    trigraphs.  This is a time-critical inner loop.  */
 void
@@ -109,12 +502,34 @@  _cpp_clean_line (cpp_reader *pfile)
   buffer->cur_note = buffer->notes_used = 0;
   buffer->cur = buffer->line_base = buffer->next_line;
   buffer->need_line = false;
-  s = buffer->next_line - 1;
+  s = buffer->next_line;
 
   if (!buffer->from_stage3)
     {
       const uchar *pbackslash = NULL;
 
+    found_bs:
+      /* Perform an optimized search for \n, \r, \\, ?.  */
+      if (search_line_fast (s, buffer->rlimit, &s))
+	{
+	  c = *s;
+	      
+	  /* Special case for backslash which is reasonably common.
+	     Continue searching using the fast path.  */
+	  if (c == '\\') 
+	    {
+	      pbackslash = s;
+	      s++;
+	      goto found_bs;
+	    }
+	  if (__builtin_expect (c == '?', false))
+	    goto found_qm;
+	  else
+	    goto found_nl_cr;
+	}
+
+      s--;
+
       /* Short circuit for the common case of an un-escaped line with
 	 no trigraphs.  The primary win here is by not writing any
 	 data back to memory until we have to.  */
@@ -124,6 +539,7 @@  _cpp_clean_line (cpp_reader *pfile)
 	  if (__builtin_expect (c == '\n', false)
 	      || __builtin_expect (c == '\r', false))
 	    {
+	    found_nl_cr:
 	      d = (uchar *) s;
 
 	      if (__builtin_expect (s == buffer->rlimit, false))
@@ -157,26 +573,28 @@  _cpp_clean_line (cpp_reader *pfile)
 	    }
 	  if (__builtin_expect (c == '\\', false))
 	    pbackslash = s;
-	  else if (__builtin_expect (c == '?', false)
-		   && __builtin_expect (s[1] == '?', false)
-		   && _cpp_trigraph_map[s[2]])
+	  else if (__builtin_expect (c == '?', false))
 	    {
-	      /* Have a trigraph.  We may or may not have to convert
-		 it.  Add a line note regardless, for -Wtrigraphs.  */
-	      add_line_note (buffer, s, s[2]);
-	      if (CPP_OPTION (pfile, trigraphs))
+	    found_qm:
+	      if (__builtin_expect (s[1] == '?', false)
+		   && _cpp_trigraph_map[s[2]])
 		{
-		  /* We do, and that means we have to switch to the
-		     slow path.  */
-		  d = (uchar *) s;
-		  *d = _cpp_trigraph_map[s[2]];
-		  s += 2;
-		  break;
+		  /* Have a trigraph.  We may or may not have to convert
+		     it.  Add a line note regardless, for -Wtrigraphs.  */
+		  add_line_note (buffer, s, s[2]);
+		  if (CPP_OPTION (pfile, trigraphs))
+		    {
+		      /* We do, and that means we have to switch to the
+		         slow path.  */
+		      d = (uchar *) s;
+		      *d = _cpp_trigraph_map[s[2]];
+		      s += 2;
+		      break;
+		    }
 		}
 	    }
 	}
 
-
       for (;;)
 	{
 	  c = *++s;
@@ -184,7 +602,7 @@  _cpp_clean_line (cpp_reader *pfile)
 
 	  if (c == '\n' || c == '\r')
 	    {
-		  /* Handle DOS line endings.  */
+	      /* Handle DOS line endings.  */
 	      if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 		s++;
 	      if (s == buffer->rlimit)
@@ -215,9 +633,8 @@  _cpp_clean_line (cpp_reader *pfile)
     }
   else
     {
-      do
+      while (*s != '\n' && *s != '\r')
 	s++;
-      while (*s != '\n' && *s != '\r');
       d = (uchar *) s;
 
       /* Handle DOS line endings.  */
diff --git a/libcpp/system.h b/libcpp/system.h
index 2472799..1a74734 100644
--- a/libcpp/system.h
+++ b/libcpp/system.h
@@ -29,6 +29,9 @@  along with GCC; see the file COPYING3.  If not see
 #ifdef HAVE_STDDEF_H
 # include <stddef.h>
 #endif
+#ifdef HAVE_STDINT_H
+# include <stdint.h>
+#endif
 
 #include <stdio.h>