diff --git a/libcpp/init.c b/libcpp/init.c
index c5b8c28..769aa50 100644
--- a/libcpp/init.c
+++ b/libcpp/init.c
@@ -137,6 +137,8 @@ init_library (void)
 #ifdef ENABLE_NLS
        (void) bindtextdomain (PACKAGE, LOCALEDIR);
 #endif
+
+       init_vectorized_lexer ();
     }
 }
 
diff --git a/libcpp/internal.h b/libcpp/internal.h
index 9209b55..10ed033 100644
--- a/libcpp/internal.h
+++ b/libcpp/internal.h
@@ -725,6 +725,8 @@ ufputs (const unsigned char *s, FILE *f)
   return fputs ((const char *)s, f);
 }
 
+extern void init_vectorized_lexer (void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/libcpp/lex.c b/libcpp/lex.c
index f628272..6ea4667 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -96,6 +96,408 @@ add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
   buffer->notes_used++;
 }
 
+
+/* Fast path to find line special characters using optimized character
+   scanning algorithms.  Anything complicated falls back to the slow
+   path below.  Since this loop is very hot it's worth doing these kinds
+   of optimizations.
+
+   One of the paths through the ifdefs should provide 
+
+     bool search_line_fast (const uchar *s, const uchar *end,
+                            const uchar **out)
+
+   Between S and END, search for \n, \r, \\, ?.  Return true if found.
+   Always update *OUT to the last character scanned, even if not found.  */
+
+/* ??? Should be in configure.ac.  */
+#define WORDS_BIGENDIAN 0
+
+/* We'd like the largest integer that fits into a register.  There's nothing
+   in <stdint.h> that gives us that.  For most hosts this is unsigned long,
+   but MS decided on an LLP64 model.  Thankfully when building with GCC we
+   can get the "real" word size.  */
+#ifdef __GNUC__
+typedef unsigned int word_type __attribute__((__mode__(__word__)));
+#else
+typedef unsigned long word_type;
+#endif
+
+/* Return X with the first N bytes forced to values that won't match one
+   of the interesting characters.  Note that NUL is not interesting.  */
+
+static inline word_type
+acc_char_mask_misalign (word_type val, unsigned int n)
+{
+  word_type mask = -1;
+  if (WORDS_BIGENDIAN)
+    mask >>= n * 8;
+  else
+    mask <<= n * 8;
+  return val & mask;
+}
+
+/* Return X replicated to all byte positions within WORD_TYPE.  */
+
+static inline word_type
+acc_char_replicate (uchar x)
+{
+  word_type ret;
+
+  ret = (x << 24) | (x << 16) | (x << 8) | x;
+  switch (sizeof (ret))
+    {
+    case 8:
+      ret = (ret << 16 << 16) | ret;
+      break;
+    case 4:
+      break;
+    default:
+      abort ();
+    }
+  return ret;
+}
+
+/* Return non-zero if some byte of VAL is (probably) C.  */
+
+static inline word_type
+acc_char_cmp (word_type val, word_type c)
+{
+#if defined(__GNUC__) && defined(__alpha__)
+  /* We can get exact results using a compare-bytes instruction.  Since
+     comparison is always >=, we could either do ((v >= c) & (c >= v))
+     for 3 operations or force matching bytes to zero and compare vs
+     zero with (0 >= (val ^ c)) for 2 operations.  */
+  return __builtin_alpha_cmpbge (0, val ^ c);
+#elif defined(__GNUC__) && defined(__ia64__)
+  /* ??? Ideally we'd have some sort of builtin for this, so that we
+     can pack the 4 instances into fewer bundles.  */
+  word_type ret;
+  __asm__("pcmp1.eq %0 = %1, %2" : "=r"(ret) : "r"(val), "r"(c));
+  return ret;
+#else
+  word_type magic = 0x7efefefeU;
+  switch (sizeof(word_type))
+    {
+    case 8:
+      magic = (magic << 16 << 16) | 0xfefefefeU;
+    case 4:
+      break;
+    default:
+      abort ();
+    }
+  magic |= 1;
+
+  val ^= c;
+  return ((val + magic) ^ ~val) & ~magic;
+#endif
+}
+
+/* Given the result of acc_char_cmp is non-zero, return the index of
+   the found character.  If this was a false positive, return -1.  */
+
+static inline int
+acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
+		word_type val ATTRIBUTE_UNUSED)
+{
+#if defined(__GNUC__) && defined(__alpha__)
+  /* The cmpbge instruction sets *bits* of the result corresponding to
+     matches in the bytes with no false positives.  This means that clz/ctx
+     produces a correct unscaled result.  */
+  return (WORDS_BIGENDIAN ? __builtin_clzl (cmp) : __builtin_ctzl (cmp));
+#elif defined(__GNUC__) && defined(__ia64__)
+  /* The pcmp1 instruction sets matching bytes to 0xff with no false
+     positives.  This means that clz/ctz produces a correct scaled result.  */
+  return (WORDS_BIGENDIAN ? __builtin_clzl (cmp) : __builtin_ctzl (cmp)) >> 3;
+#else
+  unsigned int i;
+
+  /* ??? It would be nice to force unrolling here,
+     and have all of these constants folded.  */
+  for (i = 0; i < sizeof(word_type); ++i)
+    {
+      uchar c;
+      if (WORDS_BIGENDIAN)
+	c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
+      else
+	c = (val >> i * 8) & 0xff;
+
+      if (c == '\n' || c == '\r' || c == '\\' || c == '?')
+	return i;
+    }
+
+  return -1;
+#endif
+}
+
+/* A version of the fast scanner using bit fiddling techniques.
+ 
+   For 32-bit words, one would normally perform 16 comparisons and
+   16 branches.  With this algorithm one performs 24 arithmetic
+   operations and one branch.  Whether this is faster with a 32-bit
+   word side is going to be somewhat system dependent.
+
+   For 64-bit words, we eliminate twice the number of comparisons
+   and branches without increasing the number of arithmetic operations.
+   It's almost certainly going to be a win with 64-bit word size.  */
+
+static bool
+search_line_acc_char (const uchar *s, const uchar *end, const uchar **out)
+{
+  const word_type repl_nl = acc_char_replicate ('\n');
+  const word_type repl_cr = acc_char_replicate ('\r');
+  const word_type repl_bs = acc_char_replicate ('\\');
+  const word_type repl_qm = acc_char_replicate ('?');
+
+  unsigned int misalign;
+  ptrdiff_t left;
+  const word_type *p;
+  word_type val;
+  
+  /* Don't bother optimizing very short lines; too much masking to do.  */
+  left = end - s;
+  if (left < (ptrdiff_t) sizeof(word_type))
+    {
+      *out = s;
+      return false;
+    }
+
+  /* Align the buffer.  Mask out any bytes from before the beginning.  */
+  p = (word_type *)((uintptr_t)s & -sizeof(word_type));
+  val = *p;
+  misalign = (uintptr_t)s & (sizeof(word_type) - 1);
+  if (misalign)
+    {
+      val = acc_char_mask_misalign (val, misalign);
+      left += misalign;
+    }
+
+  /* Main loop.  */
+  while (1)
+    {
+      word_type m_nl, m_cr, m_bs, m_qm, t;
+
+      m_nl = acc_char_cmp (val, repl_nl);
+      m_cr = acc_char_cmp (val, repl_cr);
+      m_bs = acc_char_cmp (val, repl_bs);
+      m_qm = acc_char_cmp (val, repl_qm);
+      t = (m_nl | m_cr) | (m_bs | m_qm);
+
+      if (__builtin_expect (t != 0, 0))
+	{
+	  int i = acc_char_index (t, val);
+	  if (i >= 0)
+	    {
+	      *out = (const uchar *)p + i;
+	      return true;
+	    }
+	}
+
+      left -= sizeof(word_type);
+      ++p;
+      if (left < (ptrdiff_t) sizeof(word_type))
+	{
+	  /* Ran out of complete words.  */
+	  *out = (const uchar *)p;
+	  return false;
+	}
+      val = *p;
+    }
+}
+
+#if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__))
+/* We'd like to share this table between the sse2 and sse4.2 functions
+   below, but we can't define an SSE vector type outside of a context
+   of a function that forces SSE enabled.  So we must define this table
+   in terms of characters.  */
+static const uchar sse_mask_align[16][16] __attribute__((aligned(16))) =
+{
+  {  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 },
+  { -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 },
+  { -1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 },
+  { -1, -1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 },
+  { -1, -1, -1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 },
+  { -1, -1, -1, -1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 },
+  { -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 },
+  { -1, -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0 },
+  { -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  0,  0,  0,  0 },
+  { -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  0,  0,  0 },
+  { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  0,  0 },
+  { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  0 },
+  { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  0,  0 },
+  { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  0 },
+  { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0 },
+  { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0 }
+};
+
+/* Fast path to find line special characters using SSE 4.2 vectorized string 
+   instructions. Anything complicated falls back to the slow path below. 
+   Since this loop is very hot it's worth doing these kinds of
+   optimizations. Returns true if stopper character found. 
+
+   We should be using the _mm intrinsics, but the xxxintr headers do things
+   not allowed in gcc.  So instead use direct builtins.  */
+
+static bool __attribute__((__target__("sse4.2")))
+search_line_sse42 (const uchar *s, const uchar *end, const uchar **out)
+{
+  typedef char m128i __attribute__ ((__vector_size__ (16)));
+  static const m128i search
+    = { '\n', '\r', '?', '\\', 0,0,0,0,0,0,0,0,0,0,0,0 };
+
+  ptrdiff_t left;
+  unsigned int misalign;
+  const m128i *p;
+  m128i data;
+
+  left = end - s;
+  if (left <= 0)
+    goto no_match;
+
+  /* Align the pointer and mask out the bytes before the start.  */
+  misalign = (uintptr_t)s & 15;
+  p = (m128i *)((uintptr_t)s & -16);
+  data = *p;
+  if (misalign)
+    {
+      data |= ((m128i *)sse_mask_align)[misalign];
+      left += misalign;
+    }
+
+  while (1)
+    {
+      int index = __builtin_ia32_pcmpestri128 (search, 4, data, left, 0);
+      if (__builtin_expect (index < 16, 0)) 
+	{
+	  *out = (const uchar *)p + index;
+	  return true;
+	}
+
+      left -= 16;
+      if (left <= 0)
+	goto no_match;
+
+      data = *++p;
+    }
+
+ no_match:
+  /* No match within buffer.  */
+  *out = end;
+  return false;
+}
+
+static bool __attribute__((__target__("sse2")))
+search_line_sse2 (const uchar *s, const uchar *end, const uchar **out)
+{
+  typedef char m128i __attribute__ ((__vector_size__ (16)));
+
+  static const m128i repl_nl = {
+    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', 
+    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
+  };
+  static const m128i repl_cr = {
+    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r', 
+    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
+  };
+  static const m128i repl_bs = {
+    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', 
+    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
+  };
+  static const m128i repl_qm = {
+    '?', '?', '?', '?', '?', '?', '?', '?', 
+    '?', '?', '?', '?', '?', '?', '?', '?', 
+  };
+
+  ptrdiff_t left;
+  unsigned int misalign, mask;
+  const m128i *p;
+  m128i data, t;
+
+  left = end - s;
+  if (left <= 0)
+    goto no_match;
+
+  /* Align the pointer and mask out the bytes before the start.  */
+  misalign = (uintptr_t)s & 15;
+  p = (m128i *)((uintptr_t)s & -16);
+  data = *p;
+  if (misalign)
+    {
+      data |= ((const m128i *)sse_mask_align)[misalign];
+      left += misalign;
+    }
+
+  while (1)
+    {
+      t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
+      t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
+      t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
+      t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
+      mask = __builtin_ia32_pmovmskb128 (t);
+
+      if (__builtin_expect (mask, 0))
+	break;
+
+      left -= 16;
+      if (left <= 0)
+	goto no_match;
+      data = *++p;
+    }
+
+  /* If there were less than 16 bytes left in the buffer, then there will
+     be comparisons beyond the end of buffer.  Mask them out.  */
+  if (left < 16)
+    {
+      mask &= (2u << left) - 1;
+      if (mask == 0)
+	goto no_match;
+    }
+
+  s = (const uchar *)p + __builtin_ctz (mask);
+  return true;
+
+ no_match:
+  /* No match within buffer.  */
+  *out = end;
+  return false;
+}
+
+/* Check if CPU supports vectorized string instructions. */
+
+#include "../gcc/config/i386/cpuid.h"
+
+static bool (*search_line_fast)(const uchar *, const uchar *, const uchar **)
+  = search_line_acc_char;
+
+void 
+init_vectorized_lexer (void)
+{
+  unsigned dummy, ecx, edx;
+
+  if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx))
+    {
+      if (ecx & bit_SSE4_2)
+	search_line_fast = search_line_sse42;
+      else if (edx & bit_SSE2)
+	search_line_fast = search_line_sse2;
+    }
+}
+
+#else
+
+/* We only have one accellerated alternative.  Use a direct call so that
+   we encourage inlining.  We must still provide the init_vectorized_lexer
+   entry point, even though it does nothing.  */
+
+#define search_line_fast  search_line_acc_char
+
+void 
+init_vectorized_lexer (void)
+{
+}
+
+#endif
+
 /* Returns with a logical line that contains no escaped newlines or
    trigraphs.  This is a time-critical inner loop.  */
 void
@@ -109,12 +511,34 @@ _cpp_clean_line (cpp_reader *pfile)
   buffer->cur_note = buffer->notes_used = 0;
   buffer->cur = buffer->line_base = buffer->next_line;
   buffer->need_line = false;
-  s = buffer->next_line - 1;
+  s = buffer->next_line;
 
   if (!buffer->from_stage3)
     {
       const uchar *pbackslash = NULL;
 
+    found_bs:
+      /* Perform an optimized search for \n, \r, \\, ?.  */
+      if (search_line_fast (s, buffer->rlimit, &s))
+	{
+	  c = *s;
+	      
+	  /* Special case for backslash which is reasonably common.
+	     Continue searching using the fast path.  */
+	  if (c == '\\') 
+	    {
+	      pbackslash = s;
+	      s++;
+	      goto found_bs;
+	    }
+	  if (__builtin_expect (c == '?', false))
+	    goto found_qm;
+	  else
+	    goto found_nl_cr;
+	}
+
+      s--;
+
       /* Short circuit for the common case of an un-escaped line with
 	 no trigraphs.  The primary win here is by not writing any
 	 data back to memory until we have to.  */
@@ -124,6 +548,7 @@ _cpp_clean_line (cpp_reader *pfile)
 	  if (__builtin_expect (c == '\n', false)
 	      || __builtin_expect (c == '\r', false))
 	    {
+	    found_nl_cr:
 	      d = (uchar *) s;
 
 	      if (__builtin_expect (s == buffer->rlimit, false))
@@ -157,26 +582,28 @@ _cpp_clean_line (cpp_reader *pfile)
 	    }
 	  if (__builtin_expect (c == '\\', false))
 	    pbackslash = s;
-	  else if (__builtin_expect (c == '?', false)
-		   && __builtin_expect (s[1] == '?', false)
-		   && _cpp_trigraph_map[s[2]])
+	  else if (__builtin_expect (c == '?', false))
 	    {
-	      /* Have a trigraph.  We may or may not have to convert
-		 it.  Add a line note regardless, for -Wtrigraphs.  */
-	      add_line_note (buffer, s, s[2]);
-	      if (CPP_OPTION (pfile, trigraphs))
+	    found_qm:
+	      if (__builtin_expect (s[1] == '?', false)
+		   && _cpp_trigraph_map[s[2]])
 		{
-		  /* We do, and that means we have to switch to the
-		     slow path.  */
-		  d = (uchar *) s;
-		  *d = _cpp_trigraph_map[s[2]];
-		  s += 2;
-		  break;
+		  /* Have a trigraph.  We may or may not have to convert
+		     it.  Add a line note regardless, for -Wtrigraphs.  */
+		  add_line_note (buffer, s, s[2]);
+		  if (CPP_OPTION (pfile, trigraphs))
+		    {
+		      /* We do, and that means we have to switch to the
+		         slow path.  */
+		      d = (uchar *) s;
+		      *d = _cpp_trigraph_map[s[2]];
+		      s += 2;
+		      break;
+		    }
 		}
 	    }
 	}
 
-
       for (;;)
 	{
 	  c = *++s;
@@ -184,7 +611,7 @@ _cpp_clean_line (cpp_reader *pfile)
 
 	  if (c == '\n' || c == '\r')
 	    {
-		  /* Handle DOS line endings.  */
+	      /* Handle DOS line endings.  */
 	      if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 		s++;
 	      if (s == buffer->rlimit)
@@ -215,9 +642,8 @@ _cpp_clean_line (cpp_reader *pfile)
     }
   else
     {
-      do
+      while (*s != '\n' && *s != '\r')
 	s++;
-      while (*s != '\n' && *s != '\r');
       d = (uchar *) s;
 
       /* Handle DOS line endings.  */
diff --git a/libcpp/system.h b/libcpp/system.h
index 2472799..1a74734 100644
--- a/libcpp/system.h
+++ b/libcpp/system.h
@@ -29,6 +29,9 @@ along with GCC; see the file COPYING3.  If not see
 #ifdef HAVE_STDDEF_H
 # include <stddef.h>
 #endif
+#ifdef HAVE_STDINT_H
+# include <stdint.h>
+#endif
 
 #include <stdio.h>
 
