From patchwork Thu Aug 12 21:37:31 2010 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: Vectorized _cpp_clean_line Date: Thu, 12 Aug 2010 11:37:31 -0000 From: Richard Henderson X-Patchwork-Id: 61647 Message-Id: <4C64699B.20804@redhat.com> To: Andi Kleen Cc: gcc-patches@gcc.gnu.org On 08/10/2010 07:48 AM, Andi Kleen wrote: > @@ -109,12 +185,41 @@ _cpp_clean_line (cpp_reader *pfile) > buffer->cur_note = buffer->notes_used = 0; > buffer->cur = buffer->line_base = buffer->next_line; > buffer->need_line = false; > - s = buffer->next_line - 1; > + s = buffer->next_line; I believe I've found the extra failures. You adjusted this path: > > if (!buffer->from_stage3) > { ... > + > + s--; but not the from_stage3 path, like so: else { - do + while (*s != '\n' && *s != '\r') s++; - while (*s != '\n' && *s != '\r'); d = (uchar *) s; I'm currently testing this patch that merges your patch with David Miller's bit-twiddling version, plus extra stuff for Alpha, IA-64, and SSE2 (without SSE4.2). r~ diff --git a/libcpp/init.c b/libcpp/init.c index c5b8c28..769aa50 100644 --- a/libcpp/init.c +++ b/libcpp/init.c @@ -137,6 +137,8 @@ init_library (void) #ifdef ENABLE_NLS (void) bindtextdomain (PACKAGE, LOCALEDIR); #endif + + init_vectorized_lexer (); } } diff --git a/libcpp/internal.h b/libcpp/internal.h index 9209b55..10ed033 100644 --- a/libcpp/internal.h +++ b/libcpp/internal.h @@ -725,6 +725,8 @@ ufputs (const unsigned char *s, FILE *f) return fputs ((const char *)s, f); } +extern void init_vectorized_lexer (void); + #ifdef __cplusplus } #endif diff --git a/libcpp/lex.c b/libcpp/lex.c index f628272..6ea4667 100644 --- a/libcpp/lex.c +++ b/libcpp/lex.c @@ -96,6 +96,408 @@ add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type) buffer->notes_used++; } + +/* Fast path to find line special characters using optimized character + scanning algorithms. Anything complicated falls back to the slow + path below. Since this loop is very hot it's worth doing these kinds + of optimizations. + + One of the paths through the ifdefs should provide + + bool search_line_fast (const uchar *s, const uchar *end, + const uchar **out) + + Between S and END, search for \n, \r, \\, ?. Return true if found. + Always update *OUT to the last character scanned, even if not found. */ + +/* ??? Should be in configure.ac. */ +#define WORDS_BIGENDIAN 0 + +/* We'd like the largest integer that fits into a register. There's nothing + in that gives us that. For most hosts this is unsigned long, + but MS decided on an LLP64 model. Thankfully when building with GCC we + can get the "real" word size. */ +#ifdef __GNUC__ +typedef unsigned int word_type __attribute__((__mode__(__word__))); +#else +typedef unsigned long word_type; +#endif + +/* Return X with the first N bytes forced to values that won't match one + of the interesting characters. Note that NUL is not interesting. */ + +static inline word_type +acc_char_mask_misalign (word_type val, unsigned int n) +{ + word_type mask = -1; + if (WORDS_BIGENDIAN) + mask >>= n * 8; + else + mask <<= n * 8; + return val & mask; +} + +/* Return X replicated to all byte positions within WORD_TYPE. */ + +static inline word_type +acc_char_replicate (uchar x) +{ + word_type ret; + + ret = (x << 24) | (x << 16) | (x << 8) | x; + switch (sizeof (ret)) + { + case 8: + ret = (ret << 16 << 16) | ret; + break; + case 4: + break; + default: + abort (); + } + return ret; +} + +/* Return non-zero if some byte of VAL is (probably) C. */ + +static inline word_type +acc_char_cmp (word_type val, word_type c) +{ +#if defined(__GNUC__) && defined(__alpha__) + /* We can get exact results using a compare-bytes instruction. Since + comparison is always >=, we could either do ((v >= c) & (c >= v)) + for 3 operations or force matching bytes to zero and compare vs + zero with (0 >= (val ^ c)) for 2 operations. */ + return __builtin_alpha_cmpbge (0, val ^ c); +#elif defined(__GNUC__) && defined(__ia64__) + /* ??? Ideally we'd have some sort of builtin for this, so that we + can pack the 4 instances into fewer bundles. */ + word_type ret; + __asm__("pcmp1.eq %0 = %1, %2" : "=r"(ret) : "r"(val), "r"(c)); + return ret; +#else + word_type magic = 0x7efefefeU; + switch (sizeof(word_type)) + { + case 8: + magic = (magic << 16 << 16) | 0xfefefefeU; + case 4: + break; + default: + abort (); + } + magic |= 1; + + val ^= c; + return ((val + magic) ^ ~val) & ~magic; +#endif +} + +/* Given the result of acc_char_cmp is non-zero, return the index of + the found character. If this was a false positive, return -1. */ + +static inline int +acc_char_index (word_type cmp ATTRIBUTE_UNUSED, + word_type val ATTRIBUTE_UNUSED) +{ +#if defined(__GNUC__) && defined(__alpha__) + /* The cmpbge instruction sets *bits* of the result corresponding to + matches in the bytes with no false positives. This means that clz/ctx + produces a correct unscaled result. */ + return (WORDS_BIGENDIAN ? __builtin_clzl (cmp) : __builtin_ctzl (cmp)); +#elif defined(__GNUC__) && defined(__ia64__) + /* The pcmp1 instruction sets matching bytes to 0xff with no false + positives. This means that clz/ctz produces a correct scaled result. */ + return (WORDS_BIGENDIAN ? __builtin_clzl (cmp) : __builtin_ctzl (cmp)) >> 3; +#else + unsigned int i; + + /* ??? It would be nice to force unrolling here, + and have all of these constants folded. */ + for (i = 0; i < sizeof(word_type); ++i) + { + uchar c; + if (WORDS_BIGENDIAN) + c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff; + else + c = (val >> i * 8) & 0xff; + + if (c == '\n' || c == '\r' || c == '\\' || c == '?') + return i; + } + + return -1; +#endif +} + +/* A version of the fast scanner using bit fiddling techniques. + + For 32-bit words, one would normally perform 16 comparisons and + 16 branches. With this algorithm one performs 24 arithmetic + operations and one branch. Whether this is faster with a 32-bit + word side is going to be somewhat system dependent. + + For 64-bit words, we eliminate twice the number of comparisons + and branches without increasing the number of arithmetic operations. + It's almost certainly going to be a win with 64-bit word size. */ + +static bool +search_line_acc_char (const uchar *s, const uchar *end, const uchar **out) +{ + const word_type repl_nl = acc_char_replicate ('\n'); + const word_type repl_cr = acc_char_replicate ('\r'); + const word_type repl_bs = acc_char_replicate ('\\'); + const word_type repl_qm = acc_char_replicate ('?'); + + unsigned int misalign; + ptrdiff_t left; + const word_type *p; + word_type val; + + /* Don't bother optimizing very short lines; too much masking to do. */ + left = end - s; + if (left < (ptrdiff_t) sizeof(word_type)) + { + *out = s; + return false; + } + + /* Align the buffer. Mask out any bytes from before the beginning. */ + p = (word_type *)((uintptr_t)s & -sizeof(word_type)); + val = *p; + misalign = (uintptr_t)s & (sizeof(word_type) - 1); + if (misalign) + { + val = acc_char_mask_misalign (val, misalign); + left += misalign; + } + + /* Main loop. */ + while (1) + { + word_type m_nl, m_cr, m_bs, m_qm, t; + + m_nl = acc_char_cmp (val, repl_nl); + m_cr = acc_char_cmp (val, repl_cr); + m_bs = acc_char_cmp (val, repl_bs); + m_qm = acc_char_cmp (val, repl_qm); + t = (m_nl | m_cr) | (m_bs | m_qm); + + if (__builtin_expect (t != 0, 0)) + { + int i = acc_char_index (t, val); + if (i >= 0) + { + *out = (const uchar *)p + i; + return true; + } + } + + left -= sizeof(word_type); + ++p; + if (left < (ptrdiff_t) sizeof(word_type)) + { + /* Ran out of complete words. */ + *out = (const uchar *)p; + return false; + } + val = *p; + } +} + +#if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) +/* We'd like to share this table between the sse2 and sse4.2 functions + below, but we can't define an SSE vector type outside of a context + of a function that forces SSE enabled. So we must define this table + in terms of characters. */ +static const uchar sse_mask_align[16][16] __attribute__((aligned(16))) = +{ + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 }, + { -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0 }, + { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0 }, + { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0 }, + { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0 }, + { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0 }, + { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0 }, + { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0 } +}; + +/* Fast path to find line special characters using SSE 4.2 vectorized string + instructions. Anything complicated falls back to the slow path below. + Since this loop is very hot it's worth doing these kinds of + optimizations. Returns true if stopper character found. + + We should be using the _mm intrinsics, but the xxxintr headers do things + not allowed in gcc. So instead use direct builtins. */ + +static bool __attribute__((__target__("sse4.2"))) +search_line_sse42 (const uchar *s, const uchar *end, const uchar **out) +{ + typedef char m128i __attribute__ ((__vector_size__ (16))); + static const m128i search + = { '\n', '\r', '?', '\\', 0,0,0,0,0,0,0,0,0,0,0,0 }; + + ptrdiff_t left; + unsigned int misalign; + const m128i *p; + m128i data; + + left = end - s; + if (left <= 0) + goto no_match; + + /* Align the pointer and mask out the bytes before the start. */ + misalign = (uintptr_t)s & 15; + p = (m128i *)((uintptr_t)s & -16); + data = *p; + if (misalign) + { + data |= ((m128i *)sse_mask_align)[misalign]; + left += misalign; + } + + while (1) + { + int index = __builtin_ia32_pcmpestri128 (search, 4, data, left, 0); + if (__builtin_expect (index < 16, 0)) + { + *out = (const uchar *)p + index; + return true; + } + + left -= 16; + if (left <= 0) + goto no_match; + + data = *++p; + } + + no_match: + /* No match within buffer. */ + *out = end; + return false; +} + +static bool __attribute__((__target__("sse2"))) +search_line_sse2 (const uchar *s, const uchar *end, const uchar **out) +{ + typedef char m128i __attribute__ ((__vector_size__ (16))); + + static const m128i repl_nl = { + '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', + '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' + }; + static const m128i repl_cr = { + '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r', + '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' + }; + static const m128i repl_bs = { + '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', + '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' + }; + static const m128i repl_qm = { + '?', '?', '?', '?', '?', '?', '?', '?', + '?', '?', '?', '?', '?', '?', '?', '?', + }; + + ptrdiff_t left; + unsigned int misalign, mask; + const m128i *p; + m128i data, t; + + left = end - s; + if (left <= 0) + goto no_match; + + /* Align the pointer and mask out the bytes before the start. */ + misalign = (uintptr_t)s & 15; + p = (m128i *)((uintptr_t)s & -16); + data = *p; + if (misalign) + { + data |= ((const m128i *)sse_mask_align)[misalign]; + left += misalign; + } + + while (1) + { + t = __builtin_ia32_pcmpeqb128(data, repl_nl); + t |= __builtin_ia32_pcmpeqb128(data, repl_cr); + t |= __builtin_ia32_pcmpeqb128(data, repl_bs); + t |= __builtin_ia32_pcmpeqb128(data, repl_qm); + mask = __builtin_ia32_pmovmskb128 (t); + + if (__builtin_expect (mask, 0)) + break; + + left -= 16; + if (left <= 0) + goto no_match; + data = *++p; + } + + /* If there were less than 16 bytes left in the buffer, then there will + be comparisons beyond the end of buffer. Mask them out. */ + if (left < 16) + { + mask &= (2u << left) - 1; + if (mask == 0) + goto no_match; + } + + s = (const uchar *)p + __builtin_ctz (mask); + return true; + + no_match: + /* No match within buffer. */ + *out = end; + return false; +} + +/* Check if CPU supports vectorized string instructions. */ + +#include "../gcc/config/i386/cpuid.h" + +static bool (*search_line_fast)(const uchar *, const uchar *, const uchar **) + = search_line_acc_char; + +void +init_vectorized_lexer (void) +{ + unsigned dummy, ecx, edx; + + if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx)) + { + if (ecx & bit_SSE4_2) + search_line_fast = search_line_sse42; + else if (edx & bit_SSE2) + search_line_fast = search_line_sse2; + } +} + +#else + +/* We only have one accellerated alternative. Use a direct call so that + we encourage inlining. We must still provide the init_vectorized_lexer + entry point, even though it does nothing. */ + +#define search_line_fast search_line_acc_char + +void +init_vectorized_lexer (void) +{ +} + +#endif + /* Returns with a logical line that contains no escaped newlines or trigraphs. This is a time-critical inner loop. */ void @@ -109,12 +511,34 @@ _cpp_clean_line (cpp_reader *pfile) buffer->cur_note = buffer->notes_used = 0; buffer->cur = buffer->line_base = buffer->next_line; buffer->need_line = false; - s = buffer->next_line - 1; + s = buffer->next_line; if (!buffer->from_stage3) { const uchar *pbackslash = NULL; + found_bs: + /* Perform an optimized search for \n, \r, \\, ?. */ + if (search_line_fast (s, buffer->rlimit, &s)) + { + c = *s; + + /* Special case for backslash which is reasonably common. + Continue searching using the fast path. */ + if (c == '\\') + { + pbackslash = s; + s++; + goto found_bs; + } + if (__builtin_expect (c == '?', false)) + goto found_qm; + else + goto found_nl_cr; + } + + s--; + /* Short circuit for the common case of an un-escaped line with no trigraphs. The primary win here is by not writing any data back to memory until we have to. */ @@ -124,6 +548,7 @@ _cpp_clean_line (cpp_reader *pfile) if (__builtin_expect (c == '\n', false) || __builtin_expect (c == '\r', false)) { + found_nl_cr: d = (uchar *) s; if (__builtin_expect (s == buffer->rlimit, false)) @@ -157,26 +582,28 @@ _cpp_clean_line (cpp_reader *pfile) } if (__builtin_expect (c == '\\', false)) pbackslash = s; - else if (__builtin_expect (c == '?', false) - && __builtin_expect (s[1] == '?', false) - && _cpp_trigraph_map[s[2]]) + else if (__builtin_expect (c == '?', false)) { - /* Have a trigraph. We may or may not have to convert - it. Add a line note regardless, for -Wtrigraphs. */ - add_line_note (buffer, s, s[2]); - if (CPP_OPTION (pfile, trigraphs)) + found_qm: + if (__builtin_expect (s[1] == '?', false) + && _cpp_trigraph_map[s[2]]) { - /* We do, and that means we have to switch to the - slow path. */ - d = (uchar *) s; - *d = _cpp_trigraph_map[s[2]]; - s += 2; - break; + /* Have a trigraph. We may or may not have to convert + it. Add a line note regardless, for -Wtrigraphs. */ + add_line_note (buffer, s, s[2]); + if (CPP_OPTION (pfile, trigraphs)) + { + /* We do, and that means we have to switch to the + slow path. */ + d = (uchar *) s; + *d = _cpp_trigraph_map[s[2]]; + s += 2; + break; + } } } } - for (;;) { c = *++s; @@ -184,7 +611,7 @@ _cpp_clean_line (cpp_reader *pfile) if (c == '\n' || c == '\r') { - /* Handle DOS line endings. */ + /* Handle DOS line endings. */ if (c == '\r' && s != buffer->rlimit && s[1] == '\n') s++; if (s == buffer->rlimit) @@ -215,9 +642,8 @@ _cpp_clean_line (cpp_reader *pfile) } else { - do + while (*s != '\n' && *s != '\r') s++; - while (*s != '\n' && *s != '\r'); d = (uchar *) s; /* Handle DOS line endings. */ diff --git a/libcpp/system.h b/libcpp/system.h index 2472799..1a74734 100644 --- a/libcpp/system.h +++ b/libcpp/system.h @@ -29,6 +29,9 @@ along with GCC; see the file COPYING3. If not see #ifdef HAVE_STDDEF_H # include #endif +#ifdef HAVE_STDINT_H +# include +#endif #include