diff mbox

[CFT,v4] Vectorized _cpp_clean_line

Message ID 4C6C557F.7050806@redhat.com
State New
Headers show

Commit Message

Richard Henderson Aug. 18, 2010, 9:49 p.m. UTC
On 08/18/2010 12:51 PM, Richard Henderson wrote:
> Before:
> 
>   %   cumulative   self              self     total           
>  time   seconds   seconds    calls  ms/call  ms/call  name    
>  29.41      0.05     0.05   168709     0.00     0.00  ._cpp_clean_line
>  17.65      0.08     0.03   606267     0.00     0.00  ._cpp_lex_direct
>  11.76      0.10     0.02   168081     0.00     0.00  .linemap_line_start
>  11.76      0.12     0.02                             .variably_modified_type_p
>   5.88      0.13     0.01   503900     0.00     0.00  ._cpp_lex_token
> 
> After:
> 
>   %   cumulative   self              self     total           
>  time   seconds   seconds    calls  ms/call  ms/call  name    
>  20.83      0.05     0.05   606267     0.00     0.00  ._cpp_lex_direct
>  20.83      0.10     0.05   228345     0.00     0.00  .ht_lookup_with_hash
>  16.67      0.14     0.04                             .variably_modified_type_p
>   4.17      0.15     0.01   503900     0.00     0.00  ._cpp_lex_token
>   4.17      0.16     0.01   304199     0.00     0.00  ._cpp_lex_identifier
>   4.17      0.17     0.01   304199     0.00     0.00  .cpp_token_as_text
>   4.17      0.18     0.01   168709     0.00     0.00  ._cpp_clean_line
> 
> Note that ._cpp_clean_line is about 5 times faster.
> 
> Is the cpu you're testing on (power7 or what?) just that much
> better with the original integer code?

Not to overload you too much, but here's an incremental patch
to test as well.

You do have to add "-maltivec" to {BOOT,STAGE}_CFLAGS at the moment,
since the ppc backend does not yet support the kind of target mixing
and matching that i386 port does.  On that same G5 I get:

  %   cumulative   self              self     total           
 time   seconds   seconds    calls  ms/call  ms/call  name    
 37.50      0.06     0.06   606267     0.00     0.00  ._cpp_lex_direct
 12.50      0.08     0.02   504424     0.00     0.00  .cpp_get_token_with_location
 12.50      0.10     0.02   503900     0.00     0.00  ._cpp_lex_token
 12.50      0.12     0.02                             .variably_modified_type_p
  6.25      0.13     0.01   457580     0.00     0.00  .cpp_output_token
  6.25      0.14     0.01   217473     0.00     0.00  .init_pp_output
  6.25      0.15     0.01    40842     0.00     0.00  ._cpp_test_assertion
  6.25      0.16     0.01        1    10.00   140.00  .preprocess_file
  0.00      0.16     0.00   633800     0.00     0.00  .cpp_get_token
  0.00      0.16     0.00   505969     0.00     0.00  .linemap_lookup
  0.00      0.16     0.00   304199     0.00     0.00  ._cpp_lex_identifier
  0.00      0.16     0.00   304199     0.00     0.00  .cpp_token_as_text
  0.00      0.16     0.00   228345     0.00     0.00  .ht_lookup_with_hash
  0.00      0.16     0.00   180752     0.00     0.00  ._cpp_get_fresh_line
  0.00      0.16     0.00   173870     0.00     0.00  ._cpp_init_tokenrun
  0.00      0.16     0.00   168709     0.00     0.00  ._cpp_clean_line

I.e. _cpp_clean_line has essentially vanished off the radar.  It
seems likely that oprofile across an entire bootstrap stage would
be more likely to be able to pick out how much time it really takes.


r~

Comments

Luis Machado Aug. 19, 2010, 1:29 p.m. UTC | #1
On Wed, 2010-08-18 at 14:49 -0700, Richard Henderson wrote:
> On 08/18/2010 12:51 PM, Richard Henderson wrote:
> > Before:
> > 
> >   %   cumulative   self              self     total           
> >  time   seconds   seconds    calls  ms/call  ms/call  name    
> >  29.41      0.05     0.05   168709     0.00     0.00  ._cpp_clean_line
> >  17.65      0.08     0.03   606267     0.00     0.00  ._cpp_lex_direct
> >  11.76      0.10     0.02   168081     0.00     0.00  .linemap_line_start
> >  11.76      0.12     0.02                             .variably_modified_type_p
> >   5.88      0.13     0.01   503900     0.00     0.00  ._cpp_lex_token
> > 
> > After:
> > 
> >   %   cumulative   self              self     total           
> >  time   seconds   seconds    calls  ms/call  ms/call  name    
> >  20.83      0.05     0.05   606267     0.00     0.00  ._cpp_lex_direct
> >  20.83      0.10     0.05   228345     0.00     0.00  .ht_lookup_with_hash
> >  16.67      0.14     0.04                             .variably_modified_type_p
> >   4.17      0.15     0.01   503900     0.00     0.00  ._cpp_lex_token
> >   4.17      0.16     0.01   304199     0.00     0.00  ._cpp_lex_identifier
> >   4.17      0.17     0.01   304199     0.00     0.00  .cpp_token_as_text
> >   4.17      0.18     0.01   168709     0.00     0.00  ._cpp_clean_line
> > 
> > Note that ._cpp_clean_line is about 5 times faster.
> > 
> > Is the cpu you're testing on (power7 or what?) just that much
> > better with the original integer code?
> 
> Not to overload you too much, but here's an incremental patch
> to test as well.
> 
> You do have to add "-maltivec" to {BOOT,STAGE}_CFLAGS at the moment,
> since the ppc backend does not yet support the kind of target mixing
> and matching that i386 port does.  On that same G5 I get:
> 
>   %   cumulative   self              self     total           
>  time   seconds   seconds    calls  ms/call  ms/call  name    
>  37.50      0.06     0.06   606267     0.00     0.00  ._cpp_lex_direct
>  12.50      0.08     0.02   504424     0.00     0.00  .cpp_get_token_with_location
>  12.50      0.10     0.02   503900     0.00     0.00  ._cpp_lex_token
>  12.50      0.12     0.02                             .variably_modified_type_p
>   6.25      0.13     0.01   457580     0.00     0.00  .cpp_output_token
>   6.25      0.14     0.01   217473     0.00     0.00  .init_pp_output
>   6.25      0.15     0.01    40842     0.00     0.00  ._cpp_test_assertion
>   6.25      0.16     0.01        1    10.00   140.00  .preprocess_file
>   0.00      0.16     0.00   633800     0.00     0.00  .cpp_get_token
>   0.00      0.16     0.00   505969     0.00     0.00  .linemap_lookup
>   0.00      0.16     0.00   304199     0.00     0.00  ._cpp_lex_identifier
>   0.00      0.16     0.00   304199     0.00     0.00  .cpp_token_as_text
>   0.00      0.16     0.00   228345     0.00     0.00  .ht_lookup_with_hash
>   0.00      0.16     0.00   180752     0.00     0.00  ._cpp_get_fresh_line
>   0.00      0.16     0.00   173870     0.00     0.00  ._cpp_init_tokenrun
>   0.00      0.16     0.00   168709     0.00     0.00  ._cpp_clean_line
> 
> I.e. _cpp_clean_line has essentially vanished off the radar.  It
> seems likely that oprofile across an entire bootstrap stage would
> be more likely to be able to pick out how much time it really takes.
> 
> 
> r~

I'll get it tested.

Luis
diff mbox

Patch

diff --git a/libcpp/lex.c b/libcpp/lex.c
index 8e56784..1e8e847 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -220,7 +220,7 @@  acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
    and branches without increasing the number of arithmetic operations.
    It's almost certainly going to be a win with 64-bit word size.  */
 
-static bool
+static bool ATTRIBUTE_UNUSED
 search_line_acc_char (const uchar *s, const uchar *end, const uchar **out)
 {
   const word_type repl_nl = acc_char_replicate ('\n');
@@ -497,6 +497,116 @@  search_line_fast (const uchar *s, const uchar *end, const uchar **out)
     return search_line_acc_char (s, end, out);
 }
 
+#elif defined(__ALTIVEC__)
+
+static bool
+search_line_fast (const uchar *s, const uchar *end, const uchar **out)
+{
+  typedef __vector unsigned char vc;
+
+  const vc repl_nl = {
+    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', 
+    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
+  };
+  const vc repl_cr = {
+    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r', 
+    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
+  };
+  const vc repl_bs = {
+    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', 
+    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
+  };
+  const vc repl_qm = {
+    '?', '?', '?', '?', '?', '?', '?', '?', 
+    '?', '?', '?', '?', '?', '?', '?', '?', 
+  };
+  const vc ones = {
+    -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1,
+  };
+  const vc zero = { 0 };
+
+  ptrdiff_t left;
+  vc data, vsl, t;
+
+  left = end - s;
+  data = __builtin_vec_ld(0, (const vc *)s);
+  vsl = __builtin_vec_lvsr(0, s);
+  t = __builtin_vec_perm(zero, ones, vsl);
+  data &= t;
+
+  left += (uintptr_t)s & 15;
+  s = (const uchar *)((uintptr_t)s & -16);
+  goto start;
+
+  do
+    {
+      vc m_nl, m_cr, m_bs, m_qm;
+
+      left -= 16;
+      s += 16;
+      if (__builtin_expect (left <= 0, 0))
+	{
+	  *out = s;
+	  return false;
+	}
+      data = __builtin_vec_ld(0, (const vc *)s);
+
+    start:
+      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
+      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
+      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
+      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
+      t = (m_nl | m_cr) | (m_bs | m_qm);
+    }
+  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
+
+  /* A match somewhere.  Scan T for the match.  */
+  {
+#define N  (sizeof(vc) / sizeof(long))
+
+    union {
+      vc v;
+      unsigned long l[N];
+    } u;
+    typedef char check_count[(N == 2 || N == 4) * 2 - 1];
+
+    unsigned long l, i = 0;
+
+    u.v = t;
+    switch (N)
+      {
+      case 4:
+	l = u.l[i++];
+	if (l != 0)
+	  break;
+	s += sizeof(unsigned long);
+	l = u.l[i++];
+	if (l != 0)
+	  break;
+	s += sizeof(unsigned long);
+      case 2:
+	l = u.l[i++];
+	if (l != 0)
+	  break;
+	s += sizeof(unsigned long);
+	l = u.l[i];
+      }
+
+    l = __builtin_clzl(l);
+    l /= 8;
+    *out = s + l;
+    return true;
+
+#undef N
+  }
+}
+
+void 
+init_vectorized_lexer (void)
+{
+}
+
 #else
 
 /* We only have one accellerated alternative.  Use a direct call so that