Message ID | DB5PR08MB1030CDDFA10DCA6E414A844683B20@DB5PR08MB1030.eurprd08.prod.outlook.com |
---|---|
State | New |
Headers | show |
Series | [v2] Improve performance of strstr | expand |
On Mon, Dec 31, 2018 at 09:01:56PM +0000, Wilco Dijkstra wrote: > This patch significantly improves performance of strstr compared to the > previous version [1] using a novel modified Horspool algorithm. Needles up > to size 256 use a bad-character table indexed by hashed pairs of characters > to quickly skip past mismatches. Long needles use a self-adapting filtering > step to avoid comparing the whole needle repeatedly. > > By limiting the needle length to 256, the shift table only requires 8 bits > per entry, lowering preprocessing overhead and minimizing cache effects. > This limit also implies worst-case performance is linear. > > Small needles up to size 3 use a dedicated linear search. Very long needles > use the Two-Way algorithm. > > The performance gain using the improved bench-strstr [2] on Cortex-A72 is 5.8 > times basic_strstr and 3.7 times twoway_strstr. > > Tested against GLIBC testsuite, randomized tests and the GNULIB strstr test > (https://git.savannah.gnu.org/cgit/gnulib.git/tree/tests/test-strstr.c). > > OK for commit? This is a major performance regression and should not be committed. The test case does not cover anything near the worst cases. With a limit more like 16 bytes it might be reasonable but 256 is plenty large to see the quadratic effects (64k is a big number). Rich
Hi Rich, > This is a major performance regression and should not be committed. Exactly which case regresses? I'd love to hear about it since I can't find any. > The test case does not cover anything near the worst cases. With a > limit more like 16 bytes it might be reasonable but 256 is plenty > large to see the quadratic effects (64k is a big number). We currently don't have a good strstr benchmark in GLIBC, which is why I posted a patch to improve this. If you have further suggestions how to improve it just let me know. Cheers, Wilco
ping
From: Wilco Dijkstra
Sent: 31 December 2018 21:01
To: 'GNU C Library'
Cc: nd; Rich Felker
Subject: [PATCH v2] Improve performance of strstr
This patch significantly improves performance of strstr compared to the
previous version [1] using a novel modified Horspool algorithm. Needles up
to size 256 use a bad-character table indexed by hashed pairs of characters
to quickly skip past mismatches. Long needles use a self-adapting filtering
step to avoid comparing the whole needle repeatedly.
By limiting the needle length to 256, the shift table only requires 8 bits
per entry, lowering preprocessing overhead and minimizing cache effects.
This limit also implies worst-case performance is linear.
Small needles up to size 3 use a dedicated linear search. Very long needles
use the Two-Way algorithm.
The performance gain using the improved bench-strstr [2] on Cortex-A72 is 5.8
times basic_strstr and 3.7 times twoway_strstr.
Tested against GLIBC testsuite, randomized tests and the GNULIB strstr test
(https://git.savannah.gnu.org/cgit/gnulib.git/tree/tests/test-strstr.c).
OK for commit?
[1] https://www.sourceware.org/ml/libc-alpha/2018-10/msg00634.html
[2] https://www.sourceware.org/ml/libc-alpha/2018-12/msg01057.html
ChangeLog:
2018-12-31 Wilco Dijkstra <wdijkstr@arm.com>
* string/str-two-way.h (two_way_short_needle): Add inline to avoid
warning.
(two_way_long_needle): Block inlining.
* string/strstr.c (strstr2): Add new function.
(strstr3): Likewise.
(STRSTR): Completely rewrite strstr to improve performance.
--
diff --git a/string/str-two-way.h b/string/str-two-way.h
index 523d946c59412e1f1f65b8ec3778553b83191952..5a800e0eaf1c7505a9340a7aabd149326958df4a 100644
--- a/string/str-two-way.h
+++ b/string/str-two-way.h
@@ -221,7 +221,7 @@ critical_factorization (const unsigned char *needle, size_t needle_len,
most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */
-static RETURN_TYPE
+static inline RETURN_TYPE
two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
const unsigned char *needle, size_t needle_len)
{
@@ -383,7 +383,7 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
sublinear performance is not possible. */
-static RETURN_TYPE
+__attribute__((noinline)) static RETURN_TYPE
two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
const unsigned char *needle, size_t needle_len)
{
diff --git a/string/strstr.c b/string/strstr.c
index f74d7189ed1319f6225525cc2d32380745de1523..aca83626772f5a23da78643edfeec3bf2feafc2d 100644
--- a/string/strstr.c
+++ b/string/strstr.c
@@ -16,21 +16,12 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-/* This particular implementation was written by Eric Blake, 2008. */
-
#ifndef _LIBC
# include <config.h>
#endif
-/* Specification of strstr. */
#include <string.h>
-#include <stdbool.h>
-
-#ifndef _LIBC
-# define __builtin_expect(expr, val) (expr)
-#endif
-
#define RETURN_TYPE char *
#define AVAILABLE(h, h_l, j, n_l) \
(((j) + (n_l) <= (h_l)) \
@@ -47,47 +38,122 @@
#define STRSTR strstr
#endif
-/* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK
- if NEEDLE is empty, otherwise NULL if NEEDLE is not found in
- HAYSTACK. */
-char *
-STRSTR (const char *haystack, const char *needle)
+static inline char *
+strstr2 (const unsigned char *hs, const unsigned char *ne)
{
- size_t needle_len; /* Length of NEEDLE. */
- size_t haystack_len; /* Known minimum length of HAYSTACK. */
-
- /* Handle empty NEEDLE special case. */
- if (needle[0] == '\0')
- return (char *) haystack;
+ uint32_t h1 = (ne[0] << 16) | ne[1];
+ uint32_t h2 = 0;
+ for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs)
+ h2 = (h2 << 16) | c;
+ return h1 == h2 ? (char *)hs - 2 : NULL;
+}
- /* Skip until we find the first matching char from NEEDLE. */
- haystack = strchr (haystack, needle[0]);
- if (haystack == NULL || needle[1] == '\0')
- return (char *) haystack;
+static inline char *
+strstr3 (const unsigned char *hs, const unsigned char *ne)
+{
+ uint32_t h1 = (ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8);
+ uint32_t h2 = 0;
+ for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs)
+ h2 = (h2 | c) << 8;
+ return h1 == h2 ? (char *)hs - 3 : NULL;
+}
- /* Ensure HAYSTACK length is at least as long as NEEDLE length.
- Since a match may occur early on in a huge HAYSTACK, use strnlen
+#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift))
+
+/* Fast strstr algorithm with guaranteed linear-time performance.
+ Small needles up to size 2 use a dedicated linear search. Longer needles
+ up to size 256 use a novel modified Horspool algorithm. It hashes pairs
+ of characters to quickly skip past mismatches. The main search loop only
+ exits if the last 2 characters match, avoiding unnecessary calls to memcmp
+ and allowing for a larger skip if there is no match. A self-adapting
+ filtering check is used to quickly detect mismatches in long needles.
+ By limiting the needle length to 256, the shift table can be reduced to 8
+ bits per entry, lowering preprocessing overhead and minimizing cache effects.
+ The limit also implies worst-case performance is linear.
+ Needles larger than 256 characters use the linear-time Two-Way algorithm. */
+char *
+STRSTR (const char *haystack, const char *needle)
+{
+ const unsigned char *hs = (const unsigned char *) haystack;
+ const unsigned char *ne = (const unsigned char *) needle;
+
+ /* Handle short needle special cases first. */
+ if (ne[0] == '\0')
+ return (char *)hs;
+ hs = (const unsigned char *)strchr ((const char*)hs, ne[0]);
+ if (hs == NULL || ne[1] == '\0')
+ return (char*)hs;
+ if (ne[2] == '\0')
+ return strstr2 (hs, ne);
+ if (ne[3] == '\0')
+ return strstr3 (hs, ne);
+
+ /* Ensure haystack length is at least as long as needle length.
+ Since a match may occur early on in a huge haystack, use strnlen
and read ahead a few cachelines for improved performance. */
- needle_len = strlen (needle);
- haystack_len = __strnlen (haystack, needle_len + 256);
- if (haystack_len < needle_len)
+ size_t ne_len = strlen ((const char*)ne);
+ size_t hs_len = __strnlen ((const char*)hs, ne_len | 512);
+ if (hs_len < ne_len)
return NULL;
- /* Check whether we have a match. This improves performance since we avoid
- the initialization overhead of the two-way algorithm. */
- if (memcmp (haystack, needle, needle_len) == 0)
- return (char *) haystack;
-
- /* Perform the search. Abstract memory is considered to be an array
- of 'unsigned char' values, not an array of 'char' values. See
- ISO C 99 section 6.2.6.1. */
- if (needle_len < LONG_NEEDLE_THRESHOLD)
- return two_way_short_needle ((const unsigned char *) haystack,
- haystack_len,
- (const unsigned char *) needle, needle_len);
- return two_way_long_needle ((const unsigned char *) haystack, haystack_len,
- (const unsigned char *) needle, needle_len);
+ /* Check whether we have a match. This improves performance since we
+ avoid initialization overheads. */
+ if (memcmp (hs, ne, ne_len) == 0)
+ return (char *) hs;
+
+ /* Use Two-Way algorithm for very long needles. */
+ if (__glibc_unlikely (ne_len > 256))
+ return two_way_long_needle (hs, hs_len, ne, ne_len);
+
+ const unsigned char *end = hs + hs_len - ne_len;
+ uint8_t shift[256];
+ size_t tmp, shift1;
+ size_t m1 = ne_len - 1;
+ size_t offset = 0;
+
+ /* Initialize bad character shift hash table. */
+ memset (shift, 0, sizeof (shift));
+ for (int i = 1; i < m1; i++)
+ shift[hash2 (ne + i)] = i;
+ shift1 = m1 - shift[hash2 (ne + m1)];
+ shift[hash2 (ne + m1)] = m1;
+
+ while (1)
+ {
+ if (__glibc_unlikely (hs > end))
+ {
+ end += __strnlen ((const char*)end + m1 + 1, 2048);
+ if (hs > end)
+ return NULL;
+ }
+
+ /* Skip past character pairs not in the needle. */
+ do
+ {
+ hs += m1;
+ tmp = shift[hash2 (hs)];
+ }
+ while (hs <= end && tmp == 0);
+
+ /* If the match is not at the end of the needle, shift to the end
+ and continue until we match the last 2 characters. */
+ hs -= tmp;
+ if (tmp < m1)
+ continue;
+
+ /* The last 2 characters match. If the needle is long, check a
+ fixed number of characters first to quickly filter out mismatches. */
+ if (memcmp (hs + offset, ne + offset, sizeof (int)) == 0)
+ {
+ if (memcmp (hs, ne, m1) == 0)
+ return (void *) hs;
+
+ /* Adjust filter offset when it doesn't find the mismatch. */
+ offset = (offset >= sizeof (int) ? offset : m1) - sizeof (int);
+ }
+
+ /* Skip based on matching the last 2 characters. */
+ hs += shift1;
+ }
}
libc_hidden_builtin_def (strstr)
-
-#undef LONG_NEEDLE_THRESHOLD
ping This patch significantly improves performance of strstr compared to the previous version [1] using a novel modified Horspool algorithm. Needles up to size 256 use a bad-character table indexed by hashed pairs of characters to quickly skip past mismatches. Long needles use a self-adapting filtering step to avoid comparing the whole needle repeatedly. By limiting the needle length to 256, the shift table only requires 8 bits per entry, lowering preprocessing overhead and minimizing cache effects. This limit also implies worst-case performance is linear. Small needles up to size 3 use a dedicated linear search. Very long needles use the Two-Way algorithm. The performance gain using the improved bench-strstr [2] on Cortex-A72 is 5.8 times basic_strstr and 3.7 times twoway_strstr. Tested against GLIBC testsuite, randomized tests and the GNULIB strstr test (https://git.savannah.gnu.org/cgit/gnulib.git/tree/tests/test-strstr.c). OK for commit? [1] https://www.sourceware.org/ml/libc-alpha/2018-10/msg00634.html [2] https://www.sourceware.org/ml/libc-alpha/2018-12/msg01057.html ChangeLog: 2018-12-31 Wilco Dijkstra <wdijkstr@arm.com> * string/str-two-way.h (two_way_short_needle): Add inline to avoid warning. (two_way_long_needle): Block inlining. * string/strstr.c (strstr2): Add new function. (strstr3): Likewise. (STRSTR): Completely rewrite strstr to improve performance. -- diff --git a/string/str-two-way.h b/string/str-two-way.h index 523d946c59412e1f1f65b8ec3778553b83191952..5a800e0eaf1c7505a9340a7aabd149326958df4a 100644 --- a/string/str-two-way.h +++ b/string/str-two-way.h @@ -221,7 +221,7 @@ critical_factorization (const unsigned char *needle, size_t needle_len, most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. 
*/ -static RETURN_TYPE +static inline RETURN_TYPE two_way_short_needle (const unsigned char *haystack, size_t haystack_len, const unsigned char *needle, size_t needle_len) { @@ -383,7 +383,7 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and sublinear performance is not possible. */ -static RETURN_TYPE +__attribute__((noinline)) static RETURN_TYPE two_way_long_needle (const unsigned char *haystack, size_t haystack_len, const unsigned char *needle, size_t needle_len) { diff --git a/string/strstr.c b/string/strstr.c index f74d7189ed1319f6225525cc2d32380745de1523..aca83626772f5a23da78643edfeec3bf2feafc2d 100644 --- a/string/strstr.c +++ b/string/strstr.c @@ -16,21 +16,12 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -/* This particular implementation was written by Eric Blake, 2008. */ - #ifndef _LIBC # include <config.h> #endif -/* Specification of strstr. */ #include <string.h> -#include <stdbool.h> - -#ifndef _LIBC -# define __builtin_expect(expr, val) (expr) -#endif - #define RETURN_TYPE char * #define AVAILABLE(h, h_l, j, n_l) \ (((j) + (n_l) <= (h_l)) \ @@ -47,47 +38,122 @@ #define STRSTR strstr #endif -/* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK - if NEEDLE is empty, otherwise NULL if NEEDLE is not found in - HAYSTACK. */ -char * -STRSTR (const char *haystack, const char *needle) +static inline char * +strstr2 (const unsigned char *hs, const unsigned char *ne) { - size_t needle_len; /* Length of NEEDLE. */ - size_t haystack_len; /* Known minimum length of HAYSTACK. */ - - /* Handle empty NEEDLE special case. */ - if (needle[0] == '\0') - return (char *) haystack; + uint32_t h1 = (ne[0] << 16) | ne[1]; + uint32_t h2 = 0; + for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs) + h2 = (h2 << 16) | c; + return h1 == h2 ? 
(char *)hs - 2 : NULL; +} - /* Skip until we find the first matching char from NEEDLE. */ - haystack = strchr (haystack, needle[0]); - if (haystack == NULL || needle[1] == '\0') - return (char *) haystack; +static inline char * +strstr3 (const unsigned char *hs, const unsigned char *ne) +{ + uint32_t h1 = (ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8); + uint32_t h2 = 0; + for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs) + h2 = (h2 | c) << 8; + return h1 == h2 ? (char *)hs - 3 : NULL; +} - /* Ensure HAYSTACK length is at least as long as NEEDLE length. - Since a match may occur early on in a huge HAYSTACK, use strnlen +#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift)) + +/* Fast strstr algorithm with guaranteed linear-time performance. + Small needles up to size 2 use a dedicated linear search. Longer needles + up to size 256 use a novel modified Horspool algorithm. It hashes pairs + of characters to quickly skip past mismatches. The main search loop only + exits if the last 2 characters match, avoiding unnecessary calls to memcmp + and allowing for a larger skip if there is no match. A self-adapting + filtering check is used to quickly detect mismatches in long needles. + By limiting the needle length to 256, the shift table can be reduced to 8 + bits per entry, lowering preprocessing overhead and minimizing cache effects. + The limit also implies worst-case performance is linear. + Needles larger than 256 characters use the linear-time Two-Way algorithm. */ +char * +STRSTR (const char *haystack, const char *needle) +{ + const unsigned char *hs = (const unsigned char *) haystack; + const unsigned char *ne = (const unsigned char *) needle; + + /* Handle short needle special cases first. 
*/ + if (ne[0] == '\0') + return (char *)hs; + hs = (const unsigned char *)strchr ((const char*)hs, ne[0]); + if (hs == NULL || ne[1] == '\0') + return (char*)hs; + if (ne[2] == '\0') + return strstr2 (hs, ne); + if (ne[3] == '\0') + return strstr3 (hs, ne); + + /* Ensure haystack length is at least as long as needle length. + Since a match may occur early on in a huge haystack, use strnlen and read ahead a few cachelines for improved performance. */ - needle_len = strlen (needle); - haystack_len = __strnlen (haystack, needle_len + 256); - if (haystack_len < needle_len) + size_t ne_len = strlen ((const char*)ne); + size_t hs_len = __strnlen ((const char*)hs, ne_len | 512); + if (hs_len < ne_len) return NULL; - /* Check whether we have a match. This improves performance since we avoid - the initialization overhead of the two-way algorithm. */ - if (memcmp (haystack, needle, needle_len) == 0) - return (char *) haystack; - - /* Perform the search. Abstract memory is considered to be an array - of 'unsigned char' values, not an array of 'char' values. See - ISO C 99 section 6.2.6.1. */ - if (needle_len < LONG_NEEDLE_THRESHOLD) - return two_way_short_needle ((const unsigned char *) haystack, - haystack_len, - (const unsigned char *) needle, needle_len); - return two_way_long_needle ((const unsigned char *) haystack, haystack_len, - (const unsigned char *) needle, needle_len); + /* Check whether we have a match. This improves performance since we + avoid initialization overheads. */ + if (memcmp (hs, ne, ne_len) == 0) + return (char *) hs; + + /* Use Two-Way algorithm for very long needles. */ + if (__glibc_unlikely (ne_len > 256)) + return two_way_long_needle (hs, hs_len, ne, ne_len); + + const unsigned char *end = hs + hs_len - ne_len; + uint8_t shift[256]; + size_t tmp, shift1; + size_t m1 = ne_len - 1; + size_t offset = 0; + + /* Initialize bad character shift hash table. 
*/ + memset (shift, 0, sizeof (shift)); + for (int i = 1; i < m1; i++) + shift[hash2 (ne + i)] = i; + shift1 = m1 - shift[hash2 (ne + m1)]; + shift[hash2 (ne + m1)] = m1; + + while (1) + { + if (__glibc_unlikely (hs > end)) + { + end += __strnlen ((const char*)end + m1 + 1, 2048); + if (hs > end) + return NULL; + } + + /* Skip past character pairs not in the needle. */ + do + { + hs += m1; + tmp = shift[hash2 (hs)]; + } + while (hs <= end && tmp == 0); + + /* If the match is not at the end of the needle, shift to the end + and continue until we match the last 2 characters. */ + hs -= tmp; + if (tmp < m1) + continue; + + /* The last 2 characters match. If the needle is long, check a + fixed number of characters first to quickly filter out mismatches. */ + if (memcmp (hs + offset, ne + offset, sizeof (int)) == 0) + { + if (memcmp (hs, ne, m1) == 0) + return (void *) hs; + + /* Adjust filter offset when it doesn't find the mismatch. */ + offset = (offset >= sizeof (int) ? offset : m1) - sizeof (int); + } + + /* Skip based on matching the last 2 characters. */ + hs += shift1; + } } libc_hidden_builtin_def (strstr) - -#undef LONG_NEEDLE_THRESHOLD
ping This patch significantly improves performance of strstr compared to the previous version [1] using a novel modified Horspool algorithm. Needles up to size 256 use a bad-character table indexed by hashed pairs of characters to quickly skip past mismatches. Long needles use a self-adapting filtering step to avoid comparing the whole needle repeatedly. By limiting the needle length to 256, the shift table only requires 8 bits per entry, lowering preprocessing overhead and minimizing cache effects. This limit also implies worst-case performance is linear. Small needles up to size 3 use a dedicated linear search. Very long needles use the Two-Way algorithm. The performance gain using the improved bench-strstr [2] on Cortex-A72 is 5.8 times basic_strstr and 3.7 times twoway_strstr. Tested against GLIBC testsuite, randomized tests and the GNULIB strstr test (https://git.savannah.gnu.org/cgit/gnulib.git/tree/tests/test-strstr.c). OK for commit? [1] https://www.sourceware.org/ml/libc-alpha/2018-10/msg00634.html [2] https://www.sourceware.org/ml/libc-alpha/2018-12/msg01057.html ChangeLog: 2018-12-31 Wilco Dijkstra <wdijkstr@arm.com> * string/str-two-way.h (two_way_short_needle): Add inline to avoid warning. (two_way_long_needle): Block inlining. * string/strstr.c (strstr2): Add new function. (strstr3): Likewise. (STRSTR): Completely rewrite strstr to improve performance. -- diff --git a/string/str-two-way.h b/string/str-two-way.h index 523d946c59412e1f1f65b8ec3778553b83191952..5a800e0eaf1c7505a9340a7aabd149326958df4a 100644 --- a/string/str-two-way.h +++ b/string/str-two-way.h @@ -221,7 +221,7 @@ critical_factorization (const unsigned char *needle, size_t needle_len, most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. 
*/ -static RETURN_TYPE +static inline RETURN_TYPE two_way_short_needle (const unsigned char *haystack, size_t haystack_len, const unsigned char *needle, size_t needle_len) { @@ -383,7 +383,7 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and sublinear performance is not possible. */ -static RETURN_TYPE +__attribute__((noinline)) static RETURN_TYPE two_way_long_needle (const unsigned char *haystack, size_t haystack_len, const unsigned char *needle, size_t needle_len) { diff --git a/string/strstr.c b/string/strstr.c index f74d7189ed1319f6225525cc2d32380745de1523..aca83626772f5a23da78643edfeec3bf2feafc2d 100644 --- a/string/strstr.c +++ b/string/strstr.c @@ -16,21 +16,12 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -/* This particular implementation was written by Eric Blake, 2008. */ - #ifndef _LIBC # include <config.h> #endif -/* Specification of strstr. */ #include <string.h> -#include <stdbool.h> - -#ifndef _LIBC -# define __builtin_expect(expr, val) (expr) -#endif - #define RETURN_TYPE char * #define AVAILABLE(h, h_l, j, n_l) \ (((j) + (n_l) <= (h_l)) \ @@ -47,47 +38,122 @@ #define STRSTR strstr #endif -/* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK - if NEEDLE is empty, otherwise NULL if NEEDLE is not found in - HAYSTACK. */ -char * -STRSTR (const char *haystack, const char *needle) +static inline char * +strstr2 (const unsigned char *hs, const unsigned char *ne) { - size_t needle_len; /* Length of NEEDLE. */ - size_t haystack_len; /* Known minimum length of HAYSTACK. */ - - /* Handle empty NEEDLE special case. */ - if (needle[0] == '\0') - return (char *) haystack; + uint32_t h1 = (ne[0] << 16) | ne[1]; + uint32_t h2 = 0; + for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs) + h2 = (h2 << 16) | c; + return h1 == h2 ? 
(char *)hs - 2 : NULL; +} - /* Skip until we find the first matching char from NEEDLE. */ - haystack = strchr (haystack, needle[0]); - if (haystack == NULL || needle[1] == '\0') - return (char *) haystack; +static inline char * +strstr3 (const unsigned char *hs, const unsigned char *ne) +{ + uint32_t h1 = (ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8); + uint32_t h2 = 0; + for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs) + h2 = (h2 | c) << 8; + return h1 == h2 ? (char *)hs - 3 : NULL; +} - /* Ensure HAYSTACK length is at least as long as NEEDLE length. - Since a match may occur early on in a huge HAYSTACK, use strnlen +#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift)) + +/* Fast strstr algorithm with guaranteed linear-time performance. + Small needles up to size 2 use a dedicated linear search. Longer needles + up to size 256 use a novel modified Horspool algorithm. It hashes pairs + of characters to quickly skip past mismatches. The main search loop only + exits if the last 2 characters match, avoiding unnecessary calls to memcmp + and allowing for a larger skip if there is no match. A self-adapting + filtering check is used to quickly detect mismatches in long needles. + By limiting the needle length to 256, the shift table can be reduced to 8 + bits per entry, lowering preprocessing overhead and minimizing cache effects. + The limit also implies worst-case performance is linear. + Needles larger than 256 characters use the linear-time Two-Way algorithm. */ +char * +STRSTR (const char *haystack, const char *needle) +{ + const unsigned char *hs = (const unsigned char *) haystack; + const unsigned char *ne = (const unsigned char *) needle; + + /* Handle short needle special cases first. 
*/ + if (ne[0] == '\0') + return (char *)hs; + hs = (const unsigned char *)strchr ((const char*)hs, ne[0]); + if (hs == NULL || ne[1] == '\0') + return (char*)hs; + if (ne[2] == '\0') + return strstr2 (hs, ne); + if (ne[3] == '\0') + return strstr3 (hs, ne); + + /* Ensure haystack length is at least as long as needle length. + Since a match may occur early on in a huge haystack, use strnlen and read ahead a few cachelines for improved performance. */ - needle_len = strlen (needle); - haystack_len = __strnlen (haystack, needle_len + 256); - if (haystack_len < needle_len) + size_t ne_len = strlen ((const char*)ne); + size_t hs_len = __strnlen ((const char*)hs, ne_len | 512); + if (hs_len < ne_len) return NULL; - /* Check whether we have a match. This improves performance since we avoid - the initialization overhead of the two-way algorithm. */ - if (memcmp (haystack, needle, needle_len) == 0) - return (char *) haystack; - - /* Perform the search. Abstract memory is considered to be an array - of 'unsigned char' values, not an array of 'char' values. See - ISO C 99 section 6.2.6.1. */ - if (needle_len < LONG_NEEDLE_THRESHOLD) - return two_way_short_needle ((const unsigned char *) haystack, - haystack_len, - (const unsigned char *) needle, needle_len); - return two_way_long_needle ((const unsigned char *) haystack, haystack_len, - (const unsigned char *) needle, needle_len); + /* Check whether we have a match. This improves performance since we + avoid initialization overheads. */ + if (memcmp (hs, ne, ne_len) == 0) + return (char *) hs; + + /* Use Two-Way algorithm for very long needles. */ + if (__glibc_unlikely (ne_len > 256)) + return two_way_long_needle (hs, hs_len, ne, ne_len); + + const unsigned char *end = hs + hs_len - ne_len; + uint8_t shift[256]; + size_t tmp, shift1; + size_t m1 = ne_len - 1; + size_t offset = 0; + + /* Initialize bad character shift hash table. 
*/ + memset (shift, 0, sizeof (shift)); + for (int i = 1; i < m1; i++) + shift[hash2 (ne + i)] = i; + shift1 = m1 - shift[hash2 (ne + m1)]; + shift[hash2 (ne + m1)] = m1; + + while (1) + { + if (__glibc_unlikely (hs > end)) + { + end += __strnlen ((const char*)end + m1 + 1, 2048); + if (hs > end) + return NULL; + } + + /* Skip past character pairs not in the needle. */ + do + { + hs += m1; + tmp = shift[hash2 (hs)]; + } + while (hs <= end && tmp == 0); + + /* If the match is not at the end of the needle, shift to the end + and continue until we match the last 2 characters. */ + hs -= tmp; + if (tmp < m1) + continue; + + /* The last 2 characters match. If the needle is long, check a + fixed number of characters first to quickly filter out mismatches. */ + if (memcmp (hs + offset, ne + offset, sizeof (int)) == 0) + { + if (memcmp (hs, ne, m1) == 0) + return (void *) hs; + + /* Adjust filter offset when it doesn't find the mismatch. */ + offset = (offset >= sizeof (int) ? offset : m1) - sizeof (int); + } + + /* Skip based on matching the last 2 characters. */ + hs += shift1; + } } libc_hidden_builtin_def (strstr) - -#undef LONG_NEEDLE_THRESHOLD
ping This patch significantly improves performance of strstr compared to the previous version [1] using a novel modified Horspool algorithm. Needles up to size 256 use a bad-character table indexed by hashed pairs of characters to quickly skip past mismatches. Long needles use a self-adapting filtering step to avoid comparing the whole needle repeatedly. By limiting the needle length to 256, the shift table only requires 8 bits per entry, lowering preprocessing overhead and minimizing cache effects. This limit also implies worst-case performance is linear. Small needles up to size 3 use a dedicated linear search. Very long needles use the Two-Way algorithm. The performance gain using the improved bench-strstr [2] on Cortex-A72 is 5.8 times basic_strstr and 3.7 times twoway_strstr. Tested against GLIBC testsuite, randomized tests and the GNULIB strstr test (https://git.savannah.gnu.org/cgit/gnulib.git/tree/tests/test-strstr.c). OK for commit? [1] https://www.sourceware.org/ml/libc-alpha/2018-10/msg00634.html [2] https://www.sourceware.org/ml/libc-alpha/2018-12/msg01057.html ChangeLog: 2018-12-31 Wilco Dijkstra <wdijkstr@arm.com> * string/str-two-way.h (two_way_short_needle): Add inline to avoid warning. (two_way_long_needle): Block inlining. * string/strstr.c (strstr2): Add new function. (strstr3): Likewise. (STRSTR): Completely rewrite strstr to improve performance. -- diff --git a/string/str-two-way.h b/string/str-two-way.h index 523d946c59412e1f1f65b8ec3778553b83191952..5a800e0eaf1c7505a9340a7aabd149326958df4a 100644 --- a/string/str-two-way.h +++ b/string/str-two-way.h @@ -221,7 +221,7 @@ critical_factorization (const unsigned char *needle, size_t needle_len, most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. 
*/ -static RETURN_TYPE +static inline RETURN_TYPE two_way_short_needle (const unsigned char *haystack, size_t haystack_len, const unsigned char *needle, size_t needle_len) { @@ -383,7 +383,7 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and sublinear performance is not possible. */ -static RETURN_TYPE +__attribute__((noinline)) static RETURN_TYPE two_way_long_needle (const unsigned char *haystack, size_t haystack_len, const unsigned char *needle, size_t needle_len) { diff --git a/string/strstr.c b/string/strstr.c index f74d7189ed1319f6225525cc2d32380745de1523..aca83626772f5a23da78643edfeec3bf2feafc2d 100644 --- a/string/strstr.c +++ b/string/strstr.c @@ -16,21 +16,12 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -/* This particular implementation was written by Eric Blake, 2008. */ - #ifndef _LIBC # include <config.h> #endif -/* Specification of strstr. */ #include <string.h> -#include <stdbool.h> - -#ifndef _LIBC -# define __builtin_expect(expr, val) (expr) -#endif - #define RETURN_TYPE char * #define AVAILABLE(h, h_l, j, n_l) \ (((j) + (n_l) <= (h_l)) \ @@ -47,47 +38,122 @@ #define STRSTR strstr #endif -/* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK - if NEEDLE is empty, otherwise NULL if NEEDLE is not found in - HAYSTACK. */ -char * -STRSTR (const char *haystack, const char *needle) +static inline char * +strstr2 (const unsigned char *hs, const unsigned char *ne) { - size_t needle_len; /* Length of NEEDLE. */ - size_t haystack_len; /* Known minimum length of HAYSTACK. */ - - /* Handle empty NEEDLE special case. */ - if (needle[0] == '\0') - return (char *) haystack; + uint32_t h1 = (ne[0] << 16) | ne[1]; + uint32_t h2 = 0; + for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs) + h2 = (h2 << 16) | c; + return h1 == h2 ? 
(char *)hs - 2 : NULL; +} - /* Skip until we find the first matching char from NEEDLE. */ - haystack = strchr (haystack, needle[0]); - if (haystack == NULL || needle[1] == '\0') - return (char *) haystack; +static inline char * +strstr3 (const unsigned char *hs, const unsigned char *ne) +{ + uint32_t h1 = (ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8); + uint32_t h2 = 0; + for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs) + h2 = (h2 | c) << 8; + return h1 == h2 ? (char *)hs - 3 : NULL; +} - /* Ensure HAYSTACK length is at least as long as NEEDLE length. - Since a match may occur early on in a huge HAYSTACK, use strnlen +#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift)) + +/* Fast strstr algorithm with guaranteed linear-time performance. + Small needles up to size 2 use a dedicated linear search. Longer needles + up to size 256 use a novel modified Horspool algorithm. It hashes pairs + of characters to quickly skip past mismatches. The main search loop only + exits if the last 2 characters match, avoiding unnecessary calls to memcmp + and allowing for a larger skip if there is no match. A self-adapting + filtering check is used to quickly detect mismatches in long needles. + By limiting the needle length to 256, the shift table can be reduced to 8 + bits per entry, lowering preprocessing overhead and minimizing cache effects. + The limit also implies worst-case performance is linear. + Needles larger than 256 characters use the linear-time Two-Way algorithm. */ +char * +STRSTR (const char *haystack, const char *needle) +{ + const unsigned char *hs = (const unsigned char *) haystack; + const unsigned char *ne = (const unsigned char *) needle; + + /* Handle short needle special cases first. 
*/ + if (ne[0] == '\0') + return (char *)hs; + hs = (const unsigned char *)strchr ((const char*)hs, ne[0]); + if (hs == NULL || ne[1] == '\0') + return (char*)hs; + if (ne[2] == '\0') + return strstr2 (hs, ne); + if (ne[3] == '\0') + return strstr3 (hs, ne); + + /* Ensure haystack length is at least as long as needle length. + Since a match may occur early on in a huge haystack, use strnlen and read ahead a few cachelines for improved performance. */ - needle_len = strlen (needle); - haystack_len = __strnlen (haystack, needle_len + 256); - if (haystack_len < needle_len) + size_t ne_len = strlen ((const char*)ne); + size_t hs_len = __strnlen ((const char*)hs, ne_len | 512); + if (hs_len < ne_len) return NULL; - /* Check whether we have a match. This improves performance since we avoid - the initialization overhead of the two-way algorithm. */ - if (memcmp (haystack, needle, needle_len) == 0) - return (char *) haystack; - - /* Perform the search. Abstract memory is considered to be an array - of 'unsigned char' values, not an array of 'char' values. See - ISO C 99 section 6.2.6.1. */ - if (needle_len < LONG_NEEDLE_THRESHOLD) - return two_way_short_needle ((const unsigned char *) haystack, - haystack_len, - (const unsigned char *) needle, needle_len); - return two_way_long_needle ((const unsigned char *) haystack, haystack_len, - (const unsigned char *) needle, needle_len); + /* Check whether we have a match. This improves performance since we + avoid initialization overheads. */ + if (memcmp (hs, ne, ne_len) == 0) + return (char *) hs; + + /* Use Two-Way algorithm for very long needles. */ + if (__glibc_unlikely (ne_len > 256)) + return two_way_long_needle (hs, hs_len, ne, ne_len); + + const unsigned char *end = hs + hs_len - ne_len; + uint8_t shift[256]; + size_t tmp, shift1; + size_t m1 = ne_len - 1; + size_t offset = 0; + + /* Initialize bad character shift hash table. 
*/ + memset (shift, 0, sizeof (shift)); + for (int i = 1; i < m1; i++) + shift[hash2 (ne + i)] = i; + shift1 = m1 - shift[hash2 (ne + m1)]; + shift[hash2 (ne + m1)] = m1; + + while (1) + { + if (__glibc_unlikely (hs > end)) + { + end += __strnlen ((const char*)end + m1 + 1, 2048); + if (hs > end) + return NULL; + } + + /* Skip past character pairs not in the needle. */ + do + { + hs += m1; + tmp = shift[hash2 (hs)]; + } + while (hs <= end && tmp == 0); + + /* If the match is not at the end of the needle, shift to the end + and continue until we match the last 2 characters. */ + hs -= tmp; + if (tmp < m1) + continue; + + /* The last 2 characters match. If the needle is long, check a + fixed number of characters first to quickly filter out mismatches. */ + if (memcmp (hs + offset, ne + offset, sizeof (int)) == 0) + { + if (memcmp (hs, ne, m1) == 0) + return (void *) hs; + + /* Adjust filter offset when it doesn't find the mismatch. */ + offset = (offset >= sizeof (int) ? offset : m1) - sizeof (int); + } + + /* Skip based on matching the last 2 characters. */ + hs += shift1; + } } libc_hidden_builtin_def (strstr) - -#undef LONG_NEEDLE_THRESHOLD
On 12/04/2019 16:50, Wilco Dijkstra wrote: > > ping > please cc those who previously had objections against the patch and ask if they sustain their objections and if so what changes or demonstrations are needed to move this forward. cc += Rich Felker. > > This patch significantly improves performance of strstr compared to the > previous version [1] using a novel modified Horspool algorithm. Needles up > to size 256 use a bad-character table indexed by hashed pairs of characters > to quickly skip past mismatches. Long needles use a self-adapting filtering > step to avoid comparing the whole needle repeatedly. > > By limiting the needle length to 256, the shift table only requires 8 bits > per entry, lowering preprocessing overhead and minimizing cache effects. > This limit also implies worst-case performance is linear. > > Small needles up to size 3 use a dedicated linear search. Very long needles > use the Two-Way algorithm. > > The performance gain using the improved bench-strstr [2] on Cortex-A72 is 5.8 > times basic_strstr and 3.7 times twoway_strstr. > > Tested against GLIBC testsuite, randomized tests and the GNULIB strstr test > (https://git.savannah.gnu.org/cgit/gnulib.git/tree/tests/test-strstr.c). > > OK for commit? > > [1] https://www.sourceware.org/ml/libc-alpha/2018-10/msg00634.html > [2] https://www.sourceware.org/ml/libc-alpha/2018-12/msg01057.html > > ChangeLog: > 2018-12-31 Wilco Dijkstra <wdijkstr@arm.com> > > * string/str-two-way.h (two_way_short_needle): Add inline to avoid > warning. > (two_way_long_needle): Block inlining. > * string/strstr.c (strstr2): Add new function. > (strstr3): Likewise. > (STRSTR): Completely rewrite strstr to improve performance. 
> > -- > diff --git a/string/str-two-way.h b/string/str-two-way.h > index 523d946c59412e1f1f65b8ec3778553b83191952..5a800e0eaf1c7505a9340a7aabd149326958df4a 100644 > --- a/string/str-two-way.h > +++ b/string/str-two-way.h > @@ -221,7 +221,7 @@ critical_factorization (const unsigned char *needle, size_t needle_len, > most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. > If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * > HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */ > -static RETURN_TYPE > +static inline RETURN_TYPE > two_way_short_needle (const unsigned char *haystack, size_t haystack_len, > const unsigned char *needle, size_t needle_len) > { > @@ -383,7 +383,7 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, > If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * > HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and > sublinear performance is not possible. */ > -static RETURN_TYPE > +__attribute__((noinline)) static RETURN_TYPE > two_way_long_needle (const unsigned char *haystack, size_t haystack_len, > const unsigned char *needle, size_t needle_len) > { > diff --git a/string/strstr.c b/string/strstr.c > index f74d7189ed1319f6225525cc2d32380745de1523..aca83626772f5a23da78643edfeec3bf2feafc2d 100644 > --- a/string/strstr.c > +++ b/string/strstr.c > @@ -16,21 +16,12 @@ > License along with the GNU C Library; if not, see > <http://www.gnu.org/licenses/>. */ > > -/* This particular implementation was written by Eric Blake, 2008. */ > - > #ifndef _LIBC > # include <config.h> > #endif > > -/* Specification of strstr. */ > #include <string.h> > > -#include <stdbool.h> > - > -#ifndef _LIBC > -# define __builtin_expect(expr, val) (expr) > -#endif > - > #define RETURN_TYPE char * > #define AVAILABLE(h, h_l, j, n_l) \ > (((j) + (n_l) <= (h_l)) \ > @@ -47,47 +38,122 @@ > #define STRSTR strstr > #endif > > -/* Return the first occurrence of NEEDLE in HAYSTACK. 
Return HAYSTACK > - if NEEDLE is empty, otherwise NULL if NEEDLE is not found in > - HAYSTACK. */ > -char * > -STRSTR (const char *haystack, const char *needle) > +static inline char * > +strstr2 (const unsigned char *hs, const unsigned char *ne) > { > - size_t needle_len; /* Length of NEEDLE. */ > - size_t haystack_len; /* Known minimum length of HAYSTACK. */ > - > - /* Handle empty NEEDLE special case. */ > - if (needle[0] == '\0') > - return (char *) haystack; > + uint32_t h1 = (ne[0] << 16) | ne[1]; > + uint32_t h2 = 0; > + for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs) > + h2 = (h2 << 16) | c; > + return h1 == h2 ? (char *)hs - 2 : NULL; > +} > > - /* Skip until we find the first matching char from NEEDLE. */ > - haystack = strchr (haystack, needle[0]); > - if (haystack == NULL || needle[1] == '\0') > - return (char *) haystack; > +static inline char * > +strstr3 (const unsigned char *hs, const unsigned char *ne) > +{ > + uint32_t h1 = (ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8); > + uint32_t h2 = 0; > + for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs) > + h2 = (h2 | c) << 8; > + return h1 == h2 ? (char *)hs - 3 : NULL; > +} > > - /* Ensure HAYSTACK length is at least as long as NEEDLE length. > - Since a match may occur early on in a huge HAYSTACK, use strnlen > +#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift)) > + > +/* Fast strstr algorithm with guaranteed linear-time performance. > + Small needles up to size 2 use a dedicated linear search. Longer needles > + up to size 256 use a novel modified Horspool algorithm. It hashes pairs > + of characters to quickly skip past mismatches. The main search loop only > + exits if the last 2 characters match, avoiding unnecessary calls to memcmp > + and allowing for a larger skip if there is no match. A self-adapting > + filtering check is used to quickly detect mismatches in long needles. 
> + By limiting the needle length to 256, the shift table can be reduced to 8 > + bits per entry, lowering preprocessing overhead and minimizing cache effects. > + The limit also implies worst-case performance is linear. > + Needles larger than 256 characters use the linear-time Two-Way algorithm. */ > +char * > +STRSTR (const char *haystack, const char *needle) > +{ > + const unsigned char *hs = (const unsigned char *) haystack; > + const unsigned char *ne = (const unsigned char *) needle; > + > + /* Handle short needle special cases first. */ > + if (ne[0] == '\0') > + return (char *)hs; > + hs = (const unsigned char *)strchr ((const char*)hs, ne[0]); > + if (hs == NULL || ne[1] == '\0') > + return (char*)hs; > + if (ne[2] == '\0') > + return strstr2 (hs, ne); > + if (ne[3] == '\0') > + return strstr3 (hs, ne); > + > + /* Ensure haystack length is at least as long as needle length. > + Since a match may occur early on in a huge haystack, use strnlen > and read ahead a few cachelines for improved performance. */ > - needle_len = strlen (needle); > - haystack_len = __strnlen (haystack, needle_len + 256); > - if (haystack_len < needle_len) > + size_t ne_len = strlen ((const char*)ne); > + size_t hs_len = __strnlen ((const char*)hs, ne_len | 512); > + if (hs_len < ne_len) > return NULL; > > - /* Check whether we have a match. This improves performance since we avoid > - the initialization overhead of the two-way algorithm. */ > - if (memcmp (haystack, needle, needle_len) == 0) > - return (char *) haystack; > - > - /* Perform the search. Abstract memory is considered to be an array > - of 'unsigned char' values, not an array of 'char' values. See > - ISO C 99 section 6.2.6.1. 
*/ > - if (needle_len < LONG_NEEDLE_THRESHOLD) > - return two_way_short_needle ((const unsigned char *) haystack, > - haystack_len, > - (const unsigned char *) needle, needle_len); > - return two_way_long_needle ((const unsigned char *) haystack, haystack_len, > - (const unsigned char *) needle, needle_len); > + /* Check whether we have a match. This improves performance since we > + avoid initialization overheads. */ > + if (memcmp (hs, ne, ne_len) == 0) > + return (char *) hs; > + > + /* Use Two-Way algorithm for very long needles. */ > + if (__glibc_unlikely (ne_len > 256)) > + return two_way_long_needle (hs, hs_len, ne, ne_len); > + > + const unsigned char *end = hs + hs_len - ne_len; > + uint8_t shift[256]; > + size_t tmp, shift1; > + size_t m1 = ne_len - 1; > + size_t offset = 0; > + > + /* Initialize bad character shift hash table. */ > + memset (shift, 0, sizeof (shift)); > + for (int i = 1; i < m1; i++) > + shift[hash2 (ne + i)] = i; > + shift1 = m1 - shift[hash2 (ne + m1)]; > + shift[hash2 (ne + m1)] = m1; > + > + while (1) > + { > + if (__glibc_unlikely (hs > end)) > + { > + end += __strnlen ((const char*)end + m1 + 1, 2048); > + if (hs > end) > + return NULL; > + } > + > + /* Skip past character pairs not in the needle. */ > + do > + { > + hs += m1; > + tmp = shift[hash2 (hs)]; > + } > + while (hs <= end && tmp == 0); > + > + /* If the match is not at the end of the needle, shift to the end > + and continue until we match the last 2 characters. */ > + hs -= tmp; > + if (tmp < m1) > + continue; > + > + /* The last 2 characters match. If the needle is long, check a > + fixed number of characters first to quickly filter out mismatches. */ > + if (memcmp (hs + offset, ne + offset, sizeof (int)) == 0) > + { > + if (memcmp (hs, ne, m1) == 0) > + return (void *) hs; > + > + /* Adjust filter offset when it doesn't find the mismatch. */ > + offset = (offset >= sizeof (int) ? 
offset : m1) - sizeof (int); > + } > + > + /* Skip based on matching the last 2 characters. */ > + hs += shift1; > + } > } > libc_hidden_builtin_def (strstr) > - > -#undef LONG_NEEDLE_THRESHOLD > > >
Hi Szabolcs, > please cc those who previously had objections against > the patch and ask if they sustain their objections > and if so what changes or demonstrations are needed > to move this forward. I haven't seen any constructive objections nor received any reply when I asked for actual evidence of a claimed regression [1]. Wilco [1] https://sourceware.org/ml/libc-alpha/2019-01/msg00024.html
On Fri, Apr 12, 2019 at 04:49:11PM +0000, Wilco Dijkstra wrote: > Hi Szabolcs, > > > please cc those who previously had objections against > > the patch and ask if they sustain their objections > > and if so what changes or demonstrations are needed > > to move this forward. > > I haven't seen any constructive objections nor received any reply > when I asked for actual evidence of a claimed regression [1]. I have already explained to you the cases which are going to perform pathologically bad with your vaguely optimized version of the naive O(nm) strstr. I don't have your specific hardware to run benchmarks, but the order of magnitude here is so large that a big-O argument suffices. If the proposed size limit were something like 24 or 32 rather than 256 that might not necessarily be the case. I vaguely recall doing some testing on a glibc x86_64 box that showed the magnitude of the problem; I can try to dig that up or do it again if you really want to see it. Rich
Hi Rich, > On Fri, Apr 12, 2019 at 04:49:11PM +0000, Wilco Dijkstra wrote: >> Hi Szabolcs, >> >> > please cc those who previously had objections against >> > the patch and ask if they sustain their objections >> > and if so what changes or demonstrations are needed >> > to move this forward. >> >> I haven't seen any constructive objections nor received any reply >> when I asked for actual evidence of a claimed regression [1]. > > I have already explained to you the cases which are going to perform > pathologically bad with your vaguely optimized version of the naive > O(nm) strstr. I don't have your specific hardware to run benchmarks, > but the order of magnitude here is so large that a big-O argument > suffices. If the proposed size limit were something like 24 or 32 > rather than 256 that might not necessarily be the case. The O notation means O (256 N) is equivalent to O (N), ie. linear performance in the worst case. And that is what matters for large inputs. The average runtime of my algorithm is actually sublinear. So in that sense Two-Way performs pathologically bad. > I vaguely recall doing some testing on a glibc x86_64 box that showed > the magnitude of the problem; I can try to dig that up or do it again > if you really want to see it. Yes, without a reproducible example I can't see what your issue is. You can't make it go quadratic because it simply isn't. Wilco
On Mon, Apr 15, 2019 at 10:24:15AM +0000, Wilco Dijkstra wrote: > Hi Rich, > > > On Fri, Apr 12, 2019 at 04:49:11PM +0000, Wilco Dijkstra wrote: > >> Hi Szabolcs, > >> > >> > please cc those who previously had objections against > >> > the patch and ask if they sustain their objections > >> > and if so what changes or demonstrations are needed > >> > to move this forward. > >> > >> I haven't seen any constructive objections nor received any reply > >> when I asked for actual evidence of a claimed regression [1]. > > > > I have already explained to you the cases which are going to perform > > pathologically bad with your vaguely optimized version of the naive > > O(nm) strstr. I don't have your specific hardware to run benchmarks, > > but the order of magnitude here is so large that a big-O argument > > suffices. If the proposed size limit were something like 24 or 32 > > rather than 256 that might not necessarily be the case. > > The O notation means O (256 N) is equivalent to O (N), ie. linear > performance in the worst case. And that is what matters for large inputs. I know what big O notation means. My point is that 256 is a sufficiently large number that, for practical purposes, the presence or absence of the M factor (whose value can be up to 256) in O(N) vs O(MN) is more significant than any small constant factors. > The average runtime of my algorithm is actually sublinear. So in that > sense Two-Way performs pathologically bad. For strstr, it's never sublinear because it operates on null-terminated strings (but the linear term can have a very small constant because of fast incremental strnlen-like operation, if you want). For memmem, of course it can be sublinear. The widely used variant of twoway (used in both glibc and musl) does the jump table that makes it sublinear (or for strstr, smaller linear-term coefficient) already. 
If you're this unaware of what the current code is doing, you shouldn't be trying to replace it with something naive but rather trying to understand it and if/how it can be improved. > > I vaguely recall doing some testing on a glibc x86_64 box that showed > > the magnitude of the problem; I can try to dig that up or do it again > > if you really want to see it. > > Yes, without a reproducible example I can't see what your issue is. You > can't make it go quadratic because it simply isn't. Obviously it's not unbounded because you have a (very large) bound on the size, 256. I can make it do a 256-byte strcmp for nearly every byte of the input haystack. Maybe because of vectorization on some targets that's only 16x slower than the current code rather than 256x slower, but it's still a lot slower. Rich
Hi Rich, >> The O notation means O (256 N) is equivalent to O (N), ie. linear >> performance in the worst case. And that is what matters for large inputs. > > I know what big O notation means. My point is that 256 is a > sufficiently large number that, for practical purposes, the presence > or absence of the M factor (whose value can be up to 256) in O(N) vs > O(MN) is more significant than any small constant factors. No, you can't gloss over one constant factor but emphasise another. The constant factor is much smaller in my algorithm and actually gets better (relatively) when you encounter bad cases. As a result worst cases perform about the same even with needles of 256 characters. >> The average runtime of my algorithm is actually sublinear. So in that >> sense Two-Way performs pathologically bad. > > For strstr, it's never sublinear because it operates on > null-terminated strings (but the linear term can have a very small > constant because of fast incremental strnlen-like operation, if you > want). strstr requires strnlen of course but the main algorithm is still sublinear. As a result performance improves with larger needles and the relative amount of time spent in strnlen increases. Asymptotically it ends up as fast as strnlen. > For memmem, of course it can be sublinear. The widely used > variant of twoway (used in both glibc and musl) does the jump table > that makes it sublinear (or for strstr, smaller linear-term > coefficient) already. If you're this unaware of what the current code > is doing, you shouldn't be trying to replace it with something naive > but rather trying to understand it and if/how it can be improved. Well if you've actually seen the existing code, you'd know that it does not use a skip table for all common cases. And when it uses one, it actually slows down the worst case by an order of magnitude. As a result Two-Way performs absolutely abysmally both in common cases as well as worst cases. 
It's almost as if people designed it to be as bad as possible! Eg. normalised performance from GLIBC bench-strstr test: basic_strstr twoway_strstr strstr(new) Length 65536/ 16, alignment 1/11, found: 3.07 1.0 11.0 Yes, twoway_strstr is 3 times SLOWER than a trivial quadratic loop... >> Yes, without a reproducible example I can't see what your issue is. You >> can't make it go quadratic because it simply isn't. > > Obviously it's not unbounded because you have a (very large) bound on > the size, 256. I can make it do a 256-byte strcmp for nearly every > byte of the input haystack. Maybe because of vectorization on some > targets that's only 16x slower than the current code rather than 256x > slower, but it's still a lot slower. No you can't. It's impossible to make it do a full 256 byte memcmp every character. And bad cases are not bad because of the time spent comparing strings - they are bad because of mispredicted branches. So it's not possible to compare bad cases without benchmarking actual examples on modern CPUs. Wilco
On Mon, Apr 15, 2019 at 2:02 PM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote: > Hi Rich, ... > >> Yes, without a reproducible example I can't see what your issue is. You > >> can't make it go quadratic because it simply isn't. > > > > Obviously it's not unbounded because you have a (very large) bound on > > the size, 256. I can make it do a 256-byte strcmp for nearly every > > byte of the input haystack. Maybe because of vectorization on some > > targets that's only 16x slower than the current code rather than 256x > > slower, but it's still a lot slower. > > No you can't. It's impossible to make it do a full 256 byte memcmp every > character. And bad cases are not bad because of the time spent comparing > strings - they are bad because of mispredicted branches. So it's not possible > to compare bad cases without benchmarking actual examples on modern > CPUs. This discussion has been going in circles for quite some time now. Wilco, Rich, I think it would help a lot if you could BOTH write down some example needle and haystack pairs that you believe will demonstrate significantly improved performance with your preferred algorithm, and/or pathologically slow performance with your dispreferred algorithm. Even without specific numbers, that will give everyone something concrete to argue over, at least. zw
On Mon, Apr 15, 2019 at 04:15:02PM -0400, Zack Weinberg wrote: > On Mon, Apr 15, 2019 at 2:02 PM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote: > > Hi Rich, > .... > > >> Yes, without a reproducible example I can't see what your issue is. You > > >> can't make it go quadratic because it simply isn't. > > > > > > Obviously it's not unbounded because you have a (very large) bound on > > > the size, 256. I can make it do a 256-byte strcmp for nearly every > > > byte of the input haystack. Maybe because of vectorization on some > > > targets that's only 16x slower than the current code rather than 256x > > > slower, but it's still a lot slower. > > > > No you can't. It's impossible to make it do a full 256 byte memcmp every > > character. And bad cases are not bad because of the time spent comparing > > strings - they are bad because of mispredicted branches. So it's not possible > > to compare bad cases without benchmarking actual examples on modern > > CPUs. > > This discussion has been going in circles for quite some time now. > > Wilco, Rich, I think it would help a lot if you could BOTH write down > some example needle and haystack pairs that you believe will > demonstrate significantly improved performance with your preferred > algorithm, and/or pathologically slow performance with your > dispreferred algorithm. Even without specific numbers, that will give > everyone something concrete to argue over, at least. That would be easier if he didn't keep adding random heuristic mitigations to hide the worst cases. Writing up the specific worst case now requires reverse engineering the hash function for collisions and figuring out the sliding offset that gets checked first. The current code does not even look *correct*, much less fast. The comment: + /* The last 2 characters match. If the needle is long, check a occurs at a point where no comparison on the last 2 characters has even been made; only the hash has been examined. 
I'm not sure if there are collisions possible which would make it return false positives (it's constrained somewhat by the second-to-last byte being compared in the final memcmp), but it's very concerning. I'll try to reverse-engineer the offset stuff later to produce a real worst-case, but this whole exercise is really frustrating. I've already been nerd-sniped into spending an inordinate amount of time researching possible improvements to twoway, which should be the job of someone proposing the changes, not someone objecting to bogus changes. Rich
On 15/04/2019 21:15, Zack Weinberg wrote: > On Mon, Apr 15, 2019 at 2:02 PM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote: >> Hi Rich, > ... >>>> Yes, without a reproducible example I can't see what your issue is. You >>>> can't make it go quadratic because it simply isn't. >>> >>> Obviously it's not unbounded because you have a (very large) bound on >>> the size, 256. I can make it do a 256-byte strcmp for nearly every >>> byte of the input haystack. Maybe because of vectorization on some >>> targets that's only 16x slower than the current code rather than 256x >>> slower, but it's still a lot slower. >> >> No you can't. It's impossible to make it do a full 256 byte memcmp every >> character. And bad cases are not bad because of the time spent comparing >> strings - they are bad because of mispredicted branches. So it's not possible >> to compare bad cases without benchmarking actual examples on modern >> CPUs. > > This discussion has been going in circles for quite some time now. > > Wilco, Rich, I think it would help a lot if you could BOTH write down > some example needle and haystack pairs that you believe will > demonstrate significantly improved performance with your preferred > algorithm, and/or pathologically slow performance with your > dispreferred algorithm. Even without specific numbers, that will give > everyone something concrete to argue over, at least. 
since the new algorithm tries to look at the last two chars first i'd say a possible bad case for it is needle = 250*"a" + "abbbaa" haystack = (250*"a" + "bbbbaa") * 1000 (256 is the longest needle for the new algo, checking in a 256k haystack should be large enough to see matching performance instead of setup time) i think this should be close to worst case, but i haven't done a detailed analysis, the regression with the new algorithm is 16.0x slower on Cortex-A72 17.8x slower on Cortex-A57 (for a fair comparison, the worst-case of the new code should be compared against the worst-case of the old code as well as comparing over common inputs for the average case) (the claimed average case improvement is 3.7x on Cortex-A72) if we are afraid the worst-case will be used for denial of service attack, then i think such slowdown is not enough to cause problems (and requires the control of both haystack and needle). if we are afraid of hitting bad cases in practice, then the heuristic tweaks in the new matching logic should reduce the probability of bad cases with real needle/haystack. (but it does not prevent them, so on interactive systems one might occasionally experience larger latency than before)
On 16/04/2019 17:40, Szabolcs Nagy wrote: > On 15/04/2019 21:15, Zack Weinberg wrote: >> On Mon, Apr 15, 2019 at 2:02 PM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote: >>> Hi Rich, >> ... >>>>> Yes, without a reproducible example I can't see what your issue is. You >>>>> can't make it go quadratic because it simply isn't. >>>> >>>> Obviously it's not unbounded because you have a (very large) bound on >>>> the size, 256. I can make it do a 256-byte strcmp for nearly every >>>> byte of the input haystack. Maybe because of vectorization on some >>>> targets that's only 16x slower than the current code rather than 256x >>>> slower, but it's still a lot slower. >>> >>> No you can't. It's impossible to make it do a full 256 byte memcmp every >>> character. And bad cases are not bad because of the time spent comparing >>> strings - they are bad because of mispredicted branches. So it's not possible >>> to compare bad cases without benchmarking actual examples on modern >>> CPUs. >> >> This discussion has been going in circles for quite some time now. >> >> Wilco, Rich, I think it would help a lot if you could BOTH write down >> some example needle and haystack pairs that you believe will >> demonstrate significantly improved performance with your preferred >> algorithm, and/or pathologically slow performance with your >> dispreferred algorithm. Even without specific numbers, that will give >> everyone something concrete to argue over, at least. 
> > since the new algorithm tries to look at the last two chars first > i'd say a possible bad case for it is > > needle = 250*"a" + "abbbaa" > haystack = (250*"a" + "bbbbaa") * 1000 > > (256 is the longest needle for the new algo, checking in a 256k > haystack should be large enough to see matching performance > instead of setup time) > > i think this should be close to worst case, but i haven't done > a detailed analysis, the regression with the new algorithm is > > 16.0x slower on Cortex-A72 > 17.8x slower on Cortex-A57 scratch that, with needle = 248*"a" + "abbbbaaa" haystack = (248*"a" + "bbbbbaaa") * 1000 i get 37.8x slower on Cortex-A72 40.0x slower on Cortex-A57 i didn't expect such difference compared to the previous case, i guess i would have to do a more detailed analysis. > > (for a fair comparison, the worst-case of the new code should > be compared against the worst-case of the old code as well as > comparing over common inputs for the average case) > > (the claimed average case improvement is 3.7x on Cortex-A72) > > if we are afraid the worst-case will be used for denial of > service attack, then i think such slowdown is not enough to > cause problems (and requires the control of both haystack and > needle). > > if we are afraid of hitting bad cases in practice, then the > heuristic tweaks in the new matching logic should reduce the > probability of bad cases with real needle/haystack. (but it > does not prevent them, so on interactive systems one might > occasionally experience larger latency than before) >
On 16/04/2019 18:01, Szabolcs Nagy wrote: > On 16/04/2019 17:40, Szabolcs Nagy wrote: >> On 15/04/2019 21:15, Zack Weinberg wrote: >>> On Mon, Apr 15, 2019 at 2:02 PM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote: >>>> Hi Rich, >>> ... >>>>>> Yes, without a reproducible example I can't see what your issue is. You >>>>>> can't make it go quadratic because it simply isn't. >>>>> >>>>> Obviously it's not unbounded because you have a (very large) bound on >>>>> the size, 256. I can make it do a 256-byte strcmp for nearly every >>>>> byte of the input haystack. Maybe because of vectorization on some >>>>> targets that's only 16x slower than the current code rather than 256x >>>>> slower, but it's still a lot slower. >>>> >>>> No you can't. It's impossible to make it do a full 256 byte memcmp every >>>> character. And bad cases are not bad because of the time spent comparing >>>> strings - they are bad because of mispredicted branches. So it's not possible >>>> to compare bad cases without benchmarking actual examples on modern >>>> CPUs. >>> >>> This discussion has been going in circles for quite some time now. >>> >>> Wilco, Rich, I think it would help a lot if you could BOTH write down >>> some example needle and haystack pairs that you believe will >>> demonstrate significantly improved performance with your preferred >>> algorithm, and/or pathologically slow performance with your >>> dispreferred algorithm. Even without specific numbers, that will give >>> everyone something concrete to argue over, at least. 
>> >> since the new algorithm tries to look at the last two chars first >> i'd say a possible bad case for it is >> >> needle = 250*"a" + "abbbaa" >> haystack = (250*"a" + "bbbbaa") * 1000 >> >> (256 is the longest needle for the new algo, checking in a 256k >> haystack should be large enough to see matching performance >> instead of setup time) >> >> i think this should be close to worst case, but i haven't done >> a detailed analysis, the regression with the new algorithm is >> >> 16.0x slower on Cortex-A72 >> 17.8x slower on Cortex-A57 > > scratch that, with > > needle = 248*"a" + "abbbbaaa" > haystack = (248*"a" + "bbbbbaaa") * 1000 > > i get > > 37.8x slower on Cortex-A72 > 40.0x slower on Cortex-A57 this is a good case for twoway, so we need a twoway worst case too for a real comparison: comparing the worst for new vs worst for twoway i've seen so far, new is 4.5x slower on Cortex-A72 2.7x slower on Cortex-A57 but there is no guarantee that my inputs are near the real worst cases, it's hard to tell (and clearly uarch matters too). (i wanted to avoid diving into the algorithm details to find worst cases) > > i didn't expect such difference compared to the previous case, > i guess i would have to do a more detailed analysis. > >> >> (for a fair comparison, the worst-case of the new code should >> be compared against the worst-case of the old code as well as >> comparing over common inputs for the average case) >> >> (the claimed average case improvement is 3.7x on Cortex-A72) >> >> if we are afraid the worst-case will be used for denial of >> service attack, then i think such slowdown is not enough to >> cause problems (and requires the control of both haystack and >> needle). >> >> if we are afraid of hitting bad cases in practice, then the >> heuristic tweaks in the new matching logic should reduce the >> probability of bad cases with real needle/haystack. 
(but it >> does not prevent them, so on interactive systems one might >> occasionally experience larger latency than before) >> >
On Tue, Apr 16, 2019 at 06:07:37PM +0000, Szabolcs Nagy wrote: > On 16/04/2019 18:01, Szabolcs Nagy wrote: > > On 16/04/2019 17:40, Szabolcs Nagy wrote: > >> On 15/04/2019 21:15, Zack Weinberg wrote: > >>> On Mon, Apr 15, 2019 at 2:02 PM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote: > >>>> Hi Rich, > >>> ... > >>>>>> Yes, without a reproducible example I can't see what your issue is. You > >>>>>> can't make it go quadratic because it simply isn't. > >>>>> > >>>>> Obviously it's not unbounded because you have a (very large) bound on > >>>>> the size, 256. I can make it do a 256-byte strcmp for nearly every > >>>>> byte of the input haystack. Maybe because of vectorization on some > >>>>> targets that's only 16x slower than the current code rather than 256x > >>>>> slower, but it's still a lot slower. > >>>> > >>>> No you can't. It's impossible to make it do a full 256 byte memcmp every > >>>> character. And bad cases are not bad because of the time spent comparing > >>>> strings - they are bad because of mispredicted branches. So it's not possible > >>>> to compare bad cases without benchmarking actual examples on modern > >>>> CPUs. > >>> > >>> This discussion has been going in circles for quite some time now. > >>> > >>> Wilco, Rich, I think it would help a lot if you could BOTH write down > >>> some example needle and haystack pairs that you believe will > >>> demonstrate significantly improved performance with your preferred > >>> algorithm, and/or pathologically slow performance with your > >>> dispreferred algorithm. Even without specific numbers, that will give > >>> everyone something concrete to argue over, at least. 
> >> > >> since the new algorithm tries to look at the last two chars first > >> i'd say a possible bad case for it is > >> > >> needle = 250*"a" + "abbbaa" > >> haystack = (250*"a" + "bbbbaa") * 1000 > >> > >> (256 is the longest needle for the new algo, checking in a 256k > >> haystack should be large enough to see matching performance > >> instead of setup time) > >> > >> i think this should be close to worst case, but i haven't done > >> a detailed analysis, the regression with the new algorithm is > >> > >> 16.0x slower on Cortex-A72 > >> 17.8x slower on Cortex-A57 > > > > scratch that, with > > > > needle = 248*"a" + "abbbbaaa" > > haystack = (248*"a" + "bbbbbaaa") * 1000 > > > > i get > > > > 37.8x slower on Cortex-A72 > > 40.0x slower on Cortex-A57 > > this is a good case for twoway, so we need a twoway worst case too > for a real comparision: comparing the worst for new vs worst for > twoway i've seen so far, new is > > 4.5x slower on Cortex-A72 > 2.7x slower on Cortex-A57 > > but there is no guarantee that my inputs are near the real worst > cases, it's hard to tell (and clearly uarch matters too). > > (i wanted to avoid diving into the algorithm details to find > worst cases) There've been a series of mitigations analogous to someone 'fixing' an SQLi vuln by filtering for SQL keywords or single-quote characters in the input rather than actually fixing the bug, burying the issue and making it harder to find the worst-cases that still exist. That's why I'm frustrated with this whole topic/proposal. Given a bound on the needle size it's used for, this is fundamentally a finite problem (albeit a pretty damn large one), so there likely are "heuristic" solutions sufficiently complete to cover all cases. However, even if one can be found, who wants to look at or validate the argument/proof that it covers them all? Or run the fuzzers looking for mistakes? Etc. 
The right way forward here, if there's actually any performance issue to care about improving on, is to study why the current two-way implementation is slower than expected and make changes that improve performance in all cases. The kind of primitives (e.g. comparison of particular subneedle first) Wilco Dijkstra is using are very similar to what two-way (with bad-char shift table) does, but invoked heuristically rather than rigorously. Optimized versions can be done rigorously instead. For example, when the right factor is bounded by word/vector size (which is very common and can be special-cased), two-way reduces to something close to the proposed loop, but with *much better* advance on mismatches. And of course, if the 2-char-hash shift table is better than a single-byte one, that can be used with two-way too. I suggested before that, for the general case in two-way, it would help to have a fast vectorized memcmp that returns the length of the matching prefix, rather than the difference, but this case (long right half) may actually be sufficiently rare that we shouldn't care about making it faster than it already is. Rich
Hi Szabolcs, > this is a good case for twoway, so we need a twoway worst case too > for a real comparision: comparing the worst for new vs worst for > twoway i've seen so far, new is Yes absolutely, it's easy to get 1000 times difference between fast and slow cases, but that's not interesting at all. The goal is to make the average case fast - even if it makes uncommon cases slower. > 4.5x slower on Cortex-A72 > 2.7x slower on Cortex-A57 That shows how much it varies even on 2 similar micro architectures. Note these results are using the optimized Two-Way implementation - the original implementation was more than twice as slow before I optimized it. So it's possible the worst cases are faster now than in GLIBC 2.26. > but there is no guarantee that my inputs are near the real worst > cases, it's hard to tell (and clearly uarch matters too). We could generate lots of cases to try finding a minimum, but the search space is impossibly large. And the worst cases depend a lot on microarchitecture so they will be different on different CPUs. Ultimately the 3 key questions are: 1. Is it linear time even on huge inputs? 2. Is it faster than Two-way in the average case? 3. Is it within a small factor even in the worst case? The first 2 were never in doubt, and your result proves 3 as well. Wilco
On Tue, Apr 16, 2019 at 10:52:28PM +0000, Wilco Dijkstra wrote: > Hi Szabolcs, > > > this is a good case for twoway, so we need a twoway worst case too > > for a real comparision: comparing the worst for new vs worst for > > twoway i've seen so far, new is > > Yes absolutely, it's easy to get 1000 times difference between fast and > slow cases, but that's not interesting at all. The goal is to make the average > case fast - even if it makes uncommon cases slower. > > > 4.5x slower on Cortex-A72 > > 2.7x slower on Cortex-A57 > > That shows how much it varies even on 2 similar micro architectures. > Note these results are using the optimized Two-Way implementation - the > original implementation was more than twice as slow before I optimized it. > So it's possible the worst cases are faster now than in GLIBC 2.26. > > > but there is no guarantee that my inputs are near the real worst > > cases, it's hard to tell (and clearly uarch matters too). > > We could generate lots of cases to try finding a minimum, but the search space is > impossibly large. And the worst cases depend a lot on microarchitecture so they > will be different on different CPUs. > > Ultimately the 3 key questions are: > > 1. Is it linear time even on huge inputs? > 2. Is it faster than Two-way in the average case? > 3. Is it within a small factor even in the worst case? > > The first 2 were never in doubt, and your result proves 3 as well. His result was up to 40x slower, which refutes #3. Rich
Hi Rich, > Ultimately the 3 key questions are: > > 1. Is it linear time even on huge inputs? > 2. Is it faster than Two-way in the average case? > 3. Is it within a small factor even in the worst case? > > The first 2 were never in doubt, and your result proves 3 as well. > His result was up to 40x slower, which refutes #3. No that was incorrectly comparing a fast case with a slow case. You can easily show 1000x difference between fast and slow cases, but what we're interested in is how the slow cases compare. Wilco
On Tue, Apr 16, 2019 at 11:49:43PM +0000, Wilco Dijkstra wrote: > Hi Rich, > > > Ultimately the 3 key questions are: > > > > 1. Is it linear time even on huge inputs? > > 2. Is it faster than Two-way in the average case? > > 3. Is it within a small factor even in the worst case? > > > > The first 2 were never in doubt, and your result proves 3 as well. > > > His result was up to 40x slower, which refutes #3. > > No that was incorrectly comparing a fast case with a slow case. You can > easily show 1000x difference between fast and slow cases, but what we're > interested in is how the slow cases compare. No, the 40x was comparing the same input -- that is, there's an input for which your patch makes it 40x slower. You can find some inputs which are mildly slow for the current two-way implementation (likely due to a mix of fundamental reasons and implementation flaws), and normalized for size (a poor normalization but whatever), searching for them takes significantly more than 1/40 the time that your code takes on its worst-case. This is really not a meaningful comparison. What's meaningful is that some operations become 40x slower for no good reason except your irrational refusal to improve the implementation of the good algorithm rather than reverting to something backwards. Rich
Hi Rich, On Tue, Apr 16, 2019 at 11:49:43PM +0000, Wilco Dijkstra wrote: > Hi Rich, > > > Ultimately the 3 key questions are: > > > > 1. Is it linear time even on huge inputs? > > 2. Is it faster than Two-way in the average case? > > 3. Is it within a small factor even in the worst case? > > > > The first 2 were never in doubt, and your result proves 3 as well. > > > His result was up to 40x slower, which refutes #3. > > No that was incorrectly comparing a fast case with a slow case. You can > easily show 1000x difference between fast and slow cases, but what we're > interested in is how the slow cases compare. > No, the 40x was comparing the same input -- that is, there's an input > for which your patch makes it 40x slower. The same input can be fast for one algorithm and slow on another. Is that hard to understand? Like I said it can be 1000 times. And it goes both ways so it doesn't matter at all. > You can find some inputs which are mildly slow for the current two-way > implementation (likely due to a mix of fundamental reasons and > implementation flaws), and normalized for size (a poor normalization > but whatever), searching for them takes significantly more than 1/40 > the time that your code takes on its worst-case. This is really not a Yes it's very common to get bad factorizations, but every now and again it gets one that starts with an uncommon character and it ends up really fast. That huge disparity means it's a bad algorithm. Yes the implementation is insanely stupid with the large and small needle code reversed. For small needles you simply want to use a skip table since factorization can't do anything useful anyway. For large needles you can get reasonable factorizations and so a skip table actually slows things down. > meaningful comparison. 
What's meaningful is that some operations > become 40x slower for no good reason except your irrational refusal to > improve the implementation of the good algorithm rather than reverting > to something backwards. A 40x slowdown vs a rare lucky fast case for Two-way is not in any way meaningful. Being 11 times faster than the optimized Two-way on the average case certainly is. What is irrational is insisting that Two-way is in any way better when it obviously isn't. Wilco
On Wed, Apr 17, 2019 at 12:54:05AM +0000, Wilco Dijkstra wrote: > Hi Rich, > > On Tue, Apr 16, 2019 at 11:49:43PM +0000, Wilco Dijkstra wrote: > > Hi Rich, > > > > > Ultimately the 3 key questions are: > > > > > > 1. Is it linear time even on huge inputs? > > > 2. Is it faster than Two-way in the average case? > > > 3. Is it within a small factor even in the worst case? > > > > > > The first 2 were never in doubt, and your result proves 3 as well. > > > > > His result was up to 40x slower, which refutes #3. > > > > No that was incorrectly comparing a fast case with a slow case. You can > > easily show 1000x difference between fast and slow cases, but what we're > > interested in is how the slow cases compare. > > > No, the 40x was comparing the same input -- that is, there's an input > > for which your patch makes it 40x slower. > > The same input can be fast for one algorithm and slow on another. Is that > hard to understand? Like I said it can be 1000 times. And it goes both ways > so it doesn't matter at all. > > > You can find some inputs which are mildly slow for the current two-way > > implementation (likely due to a mix of fundamental reasons and > > implementation flaws), and normalized for size (a poor normalization > > but whatever), searching for them takes significantly more than 1/40 > > the time that your code takes on its worst-case. This is really not a > > Yes it's very common to get bad factorizations, but every now and again it > gets one that starts with an uncommon character and it ends up really fast. > That huge disparity means it's a bad algorithm. No it doesn't. It's unfortunate that the factorization depends on the choice of order on the alphabet. This can be mitigated by ordering the alphabet according to first or last appearance in the needle, and indeed doing so tends to improve things. But even without that, the difference between a "good" and "bad" choice is small. 
When it's not small, the cause is inefficient code paths in the implementation, not anything fundamental to the algorithm. > Yes the implementation is insanely stupid with the large and small needle > code reversed. For small needles you simply want to use a skip table since > factorization can't do anything useful anyway. Arguably, on an arch with vector registers, this is roughly correct, for glibc's large/small cutoff of 32 bytes. However keep in mind that in a worst case the skip table is always useless. The way you make an efficient strstr for needles that fit in registers is just rotating in a byte at a time and comparing the whole register. I'm not sure if you can tack the skip table onto this in a way that works well. > For large needles you can get > reasonable factorizations and so a skip table actually slows things down. Unfortunately the memset to make the skip table is somewhat costly for small needles, in terms of blowing cache lines even if not raw cycles under an artificial benchmark. musl's version mitigates this with a bit array marking the valid entries rather than initializing them all, but it might be better (if you don't mind code size, which glibc doesn't) to have a different version for needles <256 bytes in which each skip entry is a single byte. > > meaningful comparison. What's meaningful is that some operations > > become 40x slower for no good reason except your irrational refusal to > > improve the implementation of the good algorithm rather than reverting > > to something backwards. > > A 40x slowdown vs a rare lucky fast case for Two-way is not in any way It's not a "lucky" fast case. It's the classic pattern of cases that are awful for O(nm) algorithms. That a non-idiotic algorithm isn't slow on this is not luck. It's the whole point. > meaningful. Being 11 times faster than the optimized Two-way on the > average case certainly is. 
It's not 11x faster, and the two-way in glibc is not at all optimized, but does lots of things in glaringly, gratuitously slow ways. Rich
On 16/04/2019 21:56, Rich Felker wrote: > On Tue, Apr 16, 2019 at 06:07:37PM +0000, Szabolcs Nagy wrote: >> On 16/04/2019 18:01, Szabolcs Nagy wrote: >>> On 16/04/2019 17:40, Szabolcs Nagy wrote: >>>> On 15/04/2019 21:15, Zack Weinberg wrote: >>>>> On Mon, Apr 15, 2019 at 2:02 PM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote: >>>>>> Hi Rich, >>>>> ... >>>>>>>> Yes, without a reproducible example I can't see what your issue is. You >>>>>>>> can't make it go quadratic because it simply isn't. >>>>>>> >>>>>>> Obviously it's not unbounded because you have a (very large) bound on >>>>>>> the size, 256. I can make it do a 256-byte strcmp for nearly every >>>>>>> byte of the input haystack. Maybe because of vectorization on some >>>>>>> targets that's only 16x slower than the current code rather than 256x >>>>>>> slower, but it's still a lot slower. >>>>>> >>>>>> No you can't. It's impossible to make it do a full 256 byte memcmp every >>>>>> character. And bad cases are not bad because of the time spent comparing >>>>>> strings - they are bad because of mispredicted branches. So it's not possible >>>>>> to compare bad cases without benchmarking actual examples on modern >>>>>> CPUs. >>>>> >>>>> This discussion has been going in circles for quite some time now. >>>>> >>>>> Wilco, Rich, I think it would help a lot if you could BOTH write down >>>>> some example needle and haystack pairs that you believe will >>>>> demonstrate significantly improved performance with your preferred >>>>> algorithm, and/or pathologically slow performance with your >>>>> dispreferred algorithm. Even without specific numbers, that will give >>>>> everyone something concrete to argue over, at least. 
>>>> >>>> since the new algorithm tries to look at the last two chars first >>>> i'd say a possible bad case for it is >>>> >>>> needle = 250*"a" + "abbbaa" >>>> haystack = (250*"a" + "bbbbaa") * 1000 >>>> >>>> (256 is the longest needle for the new algo, checking in a 256k >>>> haystack should be large enough to see matching performance >>>> instead of setup time) >>>> >>>> i think this should be close to worst case, but i haven't done >>>> a detailed analysis, the regression with the new algorithm is >>>> >>>> 16.0x slower on Cortex-A72 >>>> 17.8x slower on Cortex-A57 >>> >>> scratch that, with >>> >>> needle = 248*"a" + "abbbbaaa" >>> haystack = (248*"a" + "bbbbbaaa") * 1000 >>> >>> i get >>> >>> 37.8x slower on Cortex-A72 >>> 40.0x slower on Cortex-A57 >> >> this is a good case for twoway, so we need a twoway worst case too >> for a real comparision: comparing the worst for new vs worst for >> twoway i've seen so far, new is >> >> 4.5x slower on Cortex-A72 >> 2.7x slower on Cortex-A57 >> >> but there is no guarantee that my inputs are near the real worst >> cases, it's hard to tell (and clearly uarch matters too). >> >> (i wanted to avoid diving into the algorithm details to find >> worst cases) > > There've been a series of mitigations analogous to someone 'fixing' an > SQLi vuln by filtering for SQL keywords or single-quote characters in > the input rather than actually fixing the bug, burying the issue and > making it harder to find the worst-cases that still exist. That's why > I'm frustrated with this whole topic/proposal. in this case a large slowdown is not a real vulnerability. i think even doing memcmp at every byte position is not a denial of service concern if the needle is <= 256 bytes. (maybe we should use a lower limit? <= 64?) so only 'fixing' bad cases statistically is a valid approach. 
> Given a bound on the needle size it's used for, this is fundamentally > a finite problem (albeit a pretty damn large one), so there likely are > "heuristic" solutions sufficiently complete to cover all cases. > However, even if one can be found, who wants to look at or validate > the argument/proof that it covers them all? Or run the fuzzers looking > for mistakes? Etc. a mistake is not a disaster in this case, we just want to avoid hitting 'memcmp per byte' behaviour on practical workloads. of course 'practical workload' is hard to reason about, which is why an algorithm that's provably better on all inputs would be preferred. > The right way forward here, if there's actually any performance issue > to care about improving on, is to study why the current two-way > implementation is slower than expected and make changes that improve > performance in all cases. i agree that would be nicer, but that's harder to do.
Hi Szabolcs, I wrote a simple testcase (below) that shows the worst cases for both algorithms. On Cortex-A72 the results are (relative runtime): glibc 2.27 2.28 musl new bad needle 1 (hs 1048575, ne 255): 77.5 46.9 50.5 1.0 bad needle 2 (hs 1048575, ne 255): 1.0 17.9 19.7 59.0 So it shows the same input has huge performance variations between different implementations, even of the same algorithm. When comparing the performance of slow cases between the algorithms, the worst case of the new algorithm is 28% faster than GLIBC 2.27, 29% slower than GLIBC 2.28 or 20% slower than the Musl version. Wilco #include <stdio.h> #include <string.h> #include <time.h> #include <stdlib.h> char hs[MAX_HS_LEN+1]; char ne[MAX_NE_LEN+1]; char *volatile res; static void run (size_t hs_len, size_t ne_len, char *name) { clock_t t = clock (); for (int i = 0; i < N; i++) res = strstr (hs, ne); t = clock () - t; printf ("%s (hs %d, ne %d): %ld\n", name, hs_len-1, ne_len-1, t); } static void bad_needle_1 (size_t hs_len, size_t ne_len) { for (int i = 0; i < hs_len; i++) hs[i] = (rand () & 255) > 155 ? 'a' : 'b'; hs[hs_len] = 0; memset (ne, 'a', ne_len); ne[ne_len-2] = 'b'; ne[0] = 'b'; ne[ne_len] = 0; run (hs_len, ne_len, "bad needle 1"); } static void bad_needle_2 (size_t hs_len, size_t ne_len) { memset (ne, 'a', ne_len); ne[ne_len] = '\0'; ne[ne_len - 6] = 'b'; memset (hs, 'a', hs_len); for (size_t i = ne_len; i < hs_len; i += ne_len) { hs[i-5] = 'b'; hs[i-6] = 'b'; } hs[hs_len] = 0; run (hs_len, ne_len, "bad needle 2"); } int main (void) { bad_needle_1 (MAX_HS_LEN, 256); bad_needle_2 (MAX_HS_LEN, 256); return 0; }
On Wed, Apr 17, 2019 at 02:24:19PM +0000, Szabolcs Nagy wrote: > On 16/04/2019 21:56, Rich Felker wrote: > > On Tue, Apr 16, 2019 at 06:07:37PM +0000, Szabolcs Nagy wrote: > >> On 16/04/2019 18:01, Szabolcs Nagy wrote: > >>> On 16/04/2019 17:40, Szabolcs Nagy wrote: > >>>> On 15/04/2019 21:15, Zack Weinberg wrote: > >>>>> On Mon, Apr 15, 2019 at 2:02 PM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote: > >>>>>> Hi Rich, > >>>>> ... > >>>>>>>> Yes, without a reproducible example I can't see what your issue is. You > >>>>>>>> can't make it go quadratic because it simply isn't. > >>>>>>> > >>>>>>> Obviously it's not unbounded because you have a (very large) bound on > >>>>>>> the size, 256. I can make it do a 256-byte strcmp for nearly every > >>>>>>> byte of the input haystack. Maybe because of vectorization on some > >>>>>>> targets that's only 16x slower than the current code rather than 256x > >>>>>>> slower, but it's still a lot slower. > >>>>>> > >>>>>> No you can't. It's impossible to make it do a full 256 byte memcmp every > >>>>>> character. And bad cases are not bad because of the time spent comparing > >>>>>> strings - they are bad because of mispredicted branches. So it's not possible > >>>>>> to compare bad cases without benchmarking actual examples on modern > >>>>>> CPUs. > >>>>> > >>>>> This discussion has been going in circles for quite some time now. > >>>>> > >>>>> Wilco, Rich, I think it would help a lot if you could BOTH write down > >>>>> some example needle and haystack pairs that you believe will > >>>>> demonstrate significantly improved performance with your preferred > >>>>> algorithm, and/or pathologically slow performance with your > >>>>> dispreferred algorithm. Even without specific numbers, that will give > >>>>> everyone something concrete to argue over, at least. 
> >>>> > >>>> since the new algorithm tries to look at the last two chars first > >>>> i'd say a possible bad case for it is > >>>> > >>>> needle = 250*"a" + "abbbaa" > >>>> haystack = (250*"a" + "bbbbaa") * 1000 > >>>> > >>>> (256 is the longest needle for the new algo, checking in a 256k > >>>> haystack should be large enough to see matching performance > >>>> instead of setup time) > >>>> > >>>> i think this should be close to worst case, but i haven't done > >>>> a detailed analysis, the regression with the new algorithm is > >>>> > >>>> 16.0x slower on Cortex-A72 > >>>> 17.8x slower on Cortex-A57 > >>> > >>> scratch that, with > >>> > >>> needle = 248*"a" + "abbbbaaa" > >>> haystack = (248*"a" + "bbbbbaaa") * 1000 > >>> > >>> i get > >>> > >>> 37.8x slower on Cortex-A72 > >>> 40.0x slower on Cortex-A57 > >> > >> this is a good case for twoway, so we need a twoway worst case too > >> for a real comparision: comparing the worst for new vs worst for > >> twoway i've seen so far, new is > >> > >> 4.5x slower on Cortex-A72 > >> 2.7x slower on Cortex-A57 > >> > >> but there is no guarantee that my inputs are near the real worst > >> cases, it's hard to tell (and clearly uarch matters too). > >> > >> (i wanted to avoid diving into the algorithm details to find > >> worst cases) > > > > There've been a series of mitigations analogous to someone 'fixing' an > > SQLi vuln by filtering for SQL keywords or single-quote characters in > > the input rather than actually fixing the bug, burying the issue and > > making it harder to find the worst-cases that still exist. That's why > > I'm frustrated with this whole topic/proposal. > > in this case a large slowdown is not a real vulnerability. I agree the bounded magnitude does not make it a vulnerability. Rather it's just a QoI regression. 
I made the analogy with SQLi not to imply that it's a vuln, but to critique the practice of burying a problem by making it hard to work out which specific input case hits the problem, rather than actually fixing the problem. > i think even doing memcmp at every byte position is not a > denial of service concern if the needle is <= 256 bytes. > (maybe we should use a lower limit? <= 64?) > > so only 'fixing' bad cases statistically is a valid approach. If you recall, users were upset about glibc's math functions being tens or hundreds of times slower on particular inputs because of correct rounding. AIUI these were not inputs that were found by attempts to perform DoS, just randomly hit in the real world. Adding such serious input-dependent variance in performance is not a good thing. > > Given a bound on the needle size it's used for, this is fundamentally > > a finite problem (albeit a pretty damn large one), so there likely are > > "heuristic" solutions sufficiently complete to cover all cases. > > However, even if one can be found, who wants to look at or validate > > the argument/proof that it covers them all? Or run the fuzzers looking > > for mistakes? Etc. > > a mistake is not a disaster in this case, we just want to Mistakes can easily turn into correctness bugs too. I'm not clear on whether the current patch version has a bug or not, but it looks plausible that it has false positives on hash collisions. When you pile on heuristic mitigations like this, they become bug surface. > avoid hitting 'memcmp per byte' behaviour on practical > workloads. Especially for memmem, all workloads are "practical". > of course 'practical workload' is hard to reason about, > which is why an algorithm that's provably better on all > inputs would be preferred. 
> > > The right way forward here, if there's actually any performance issue > > to care about improving on, is to study why the current two-way > > implementation is slower than expected and make changes that improve > > performance in all cases. > > i agree that would be nicer, but that's harder to do. Wilco has already highlighted a few bad things it's doing that could be improved without much work. After making those improvements, we could then evaluate whether there are any cases where there's a compelling advantage to the proposed quadratic version. I think what we'd find is that it would be better on average, and probably better for all inputs, for sizes where quadratic might win, to do a rolling compare (like the current 2- to 4-byte versions in musl) using a vector register (up to whatever size that is). On machines without wide vector registers, redundant strcmp is going to be really slow, and two-way is going to win even at small needle sizes. Rich
Hi Szabolcs, > in this case a large slowdown is not a real vulnerability. Absolutely, performance varies quite dramatically with different inputs. That is true for all algorithms even if they are "real-time". Caches and branch predictors are not going away. > i think even doing memcmp at every byte position is not a > denial of service concern if the needle is <= 256 bytes. > (maybe we should use a lower limit? <= 64?) It couldn't be since its worst case isn't actually much worse (~2.2 times for needle size of 256). A simple memcmp at every character is significantly faster than Two-way on the first bad needle, and about the same performance on the second. Also when reducing the size of the needle to 31 from 255 on my algorithm, there is only a 73% speedup for its worst case (bad needle 2). So reducing the cutoff doesn't really help the worst case since we're clearly not quadratic. Wilco
On Thu, Apr 18, 2019 at 01:51:14PM +0000, Wilco Dijkstra wrote: > Hi Szabolcs, > > > in this case a large slowdown is not a real vulnerability. > > Absolutely, performance varies quite dramatically with different inputs. That > is true for all algorithms even if they are "real-time". Caches and branch > predictors are not going away. > > > i think even doing memcmp at every byte position is not a > > denial of service concern if the needle is <= 256 bytes. > > (maybe we should use a lower limit? <= 64?) > > It couldn't be since its worst case isn't actually much worse (~2.2 times for The worst is 40x, and it's going to be much much worse on archs without big vectors. > needle size of 256). A simple memcmp at every character is significantly > faster than Two-way on the first bad needle, and about the same performance > on the second. Then fix whatever bug made two-way slow on that case. It should not be slow there. > Also when reducing the size of the needle to 31 from 255 on my algorithm, > there is only a 73% speedup for its worst case (bad needle 2). So reducing the > cutoff doesn't really help the worst case since we're clearly not quadratic. I don't follow. You mean two-way is only 73% faster on that needle? That does not match previous findings, unless you've changed the needle vs the 40x one nsz found (I didn't read the test case source in depth but saw that it does not comment what the test cases it's generating are). Rich
On 18/04/2019 16:20, Rich Felker wrote: > On Thu, Apr 18, 2019 at 01:51:14PM +0000, Wilco Dijkstra wrote: >>> i think even doing memcmp at every byte position is not a >>> denial of service concern if the needle is <= 256 bytes. >>> (maybe we should use a lower limit? <= 64?) >> >> It couldn't be since its worst case isn't actually much worse (~2.2 times for > > The worst is 40x, and it's going to be much much worse on archs > without big vectors. 40x happens on an input where twoway is fast and new algo is slow, there are examples the other way around. if we compare the worst known case of twoway to worst known case of new algo then the slow down is < 2x according to https://sourceware.org/ml/libc-alpha/2019-04/msg00404.html (bad needle 1 is bad for twoway, bad needle 2 is bad for new algo)
Hi, > 40x happens on an input where twoway is fast and new algo is slow, > there are examples the other way around. On the worst cases I tested Two-way is 78 times slower than my new algorithm. It's trivial to find even worse factors, but it's not a useful comparison given each algorithm will have different worst cases, and they differ with each microarchitecture. Wilco
On Thu, Apr 18, 2019 at 04:37:41PM +0000, Wilco Dijkstra wrote: > Hi, > > > 40x happens on an input where twoway is fast and new algo is slow, > > there are examples the other way around. > > On the worst cases I tested Two-way is 78 times slower than my new > algorithm. It's trivial to find even worse factors, but it's not a useful > comparison given each algorithm will have different worst cases, and > they differ with each microarchitecture. Unless you have concrete suggestions to improve the two-way implementation you want to discuss, I'm done with this thread for now. It's an endless time-sink/bikeshed to "fix" a non-problem with a fundamentally bad algorithm with bad performance properties. If glibc folks want to adopt this bad code, that's on you guys. I've made it clear what directions I'd like to see pursued instead, if there's an actual problem that needs to be fixed. But if there's not, this is all a colossal waste of time. Rich
diff --git a/string/str-two-way.h b/string/str-two-way.h index 523d946c59412e1f1f65b8ec3778553b83191952..5a800e0eaf1c7505a9340a7aabd149326958df4a 100644 --- a/string/str-two-way.h +++ b/string/str-two-way.h @@ -221,7 +221,7 @@ critical_factorization (const unsigned char *needle, size_t needle_len, most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */ -static RETURN_TYPE +static inline RETURN_TYPE two_way_short_needle (const unsigned char *haystack, size_t haystack_len, const unsigned char *needle, size_t needle_len) { @@ -383,7 +383,7 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and sublinear performance is not possible. */ -static RETURN_TYPE +__attribute__((noinline)) static RETURN_TYPE two_way_long_needle (const unsigned char *haystack, size_t haystack_len, const unsigned char *needle, size_t needle_len) { diff --git a/string/strstr.c b/string/strstr.c index f74d7189ed1319f6225525cc2d32380745de1523..aca83626772f5a23da78643edfeec3bf2feafc2d 100644 --- a/string/strstr.c +++ b/string/strstr.c @@ -16,21 +16,12 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -/* This particular implementation was written by Eric Blake, 2008. */ - #ifndef _LIBC # include <config.h> #endif -/* Specification of strstr. */ #include <string.h> -#include <stdbool.h> - -#ifndef _LIBC -# define __builtin_expect(expr, val) (expr) -#endif - #define RETURN_TYPE char * #define AVAILABLE(h, h_l, j, n_l) \ (((j) + (n_l) <= (h_l)) \ @@ -47,47 +38,122 @@ #define STRSTR strstr #endif -/* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK - if NEEDLE is empty, otherwise NULL if NEEDLE is not found in - HAYSTACK. 
*/ -char * -STRSTR (const char *haystack, const char *needle) +static inline char * +strstr2 (const unsigned char *hs, const unsigned char *ne) { - size_t needle_len; /* Length of NEEDLE. */ - size_t haystack_len; /* Known minimum length of HAYSTACK. */ - - /* Handle empty NEEDLE special case. */ - if (needle[0] == '\0') - return (char *) haystack; + uint32_t h1 = (ne[0] << 16) | ne[1]; + uint32_t h2 = 0; + for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs) + h2 = (h2 << 16) | c; + return h1 == h2 ? (char *)hs - 2 : NULL; +} - /* Skip until we find the first matching char from NEEDLE. */ - haystack = strchr (haystack, needle[0]); - if (haystack == NULL || needle[1] == '\0') - return (char *) haystack; +static inline char * +strstr3 (const unsigned char *hs, const unsigned char *ne) +{ + uint32_t h1 = (ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8); + uint32_t h2 = 0; + for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs) + h2 = (h2 | c) << 8; + return h1 == h2 ? (char *)hs - 3 : NULL; +} - /* Ensure HAYSTACK length is at least as long as NEEDLE length. - Since a match may occur early on in a huge HAYSTACK, use strnlen +#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift)) + +/* Fast strstr algorithm with guaranteed linear-time performance. + Small needles up to size 2 use a dedicated linear search. Longer needles + up to size 256 use a novel modified Horspool algorithm. It hashes pairs + of characters to quickly skip past mismatches. The main search loop only + exits if the last 2 characters match, avoiding unnecessary calls to memcmp + and allowing for a larger skip if there is no match. A self-adapting + filtering check is used to quickly detect mismatches in long needles. + By limiting the needle length to 256, the shift table can be reduced to 8 + bits per entry, lowering preprocessing overhead and minimizing cache effects. + The limit also implies worst-case performance is linear. 
+ Needles larger than 256 characters use the linear-time Two-Way algorithm. */ +char * +STRSTR (const char *haystack, const char *needle) +{ + const unsigned char *hs = (const unsigned char *) haystack; + const unsigned char *ne = (const unsigned char *) needle; + + /* Handle short needle special cases first. */ + if (ne[0] == '\0') + return (char *)hs; + hs = (const unsigned char *)strchr ((const char*)hs, ne[0]); + if (hs == NULL || ne[1] == '\0') + return (char*)hs; + if (ne[2] == '\0') + return strstr2 (hs, ne); + if (ne[3] == '\0') + return strstr3 (hs, ne); + + /* Ensure haystack length is at least as long as needle length. + Since a match may occur early on in a huge haystack, use strnlen and read ahead a few cachelines for improved performance. */ - needle_len = strlen (needle); - haystack_len = __strnlen (haystack, needle_len + 256); - if (haystack_len < needle_len) + size_t ne_len = strlen ((const char*)ne); + size_t hs_len = __strnlen ((const char*)hs, ne_len | 512); + if (hs_len < ne_len) return NULL; - /* Check whether we have a match. This improves performance since we avoid - the initialization overhead of the two-way algorithm. */ - if (memcmp (haystack, needle, needle_len) == 0) - return (char *) haystack; - - /* Perform the search. Abstract memory is considered to be an array - of 'unsigned char' values, not an array of 'char' values. See - ISO C 99 section 6.2.6.1. */ - if (needle_len < LONG_NEEDLE_THRESHOLD) - return two_way_short_needle ((const unsigned char *) haystack, - haystack_len, - (const unsigned char *) needle, needle_len); - return two_way_long_needle ((const unsigned char *) haystack, haystack_len, - (const unsigned char *) needle, needle_len); + /* Check whether we have a match. This improves performance since we + avoid initialization overheads. */ + if (memcmp (hs, ne, ne_len) == 0) + return (char *) hs; + + /* Use Two-Way algorithm for very long needles. 
*/ + if (__glibc_unlikely (ne_len > 256)) + return two_way_long_needle (hs, hs_len, ne, ne_len); + + const unsigned char *end = hs + hs_len - ne_len; + uint8_t shift[256]; + size_t tmp, shift1; + size_t m1 = ne_len - 1; + size_t offset = 0; + + /* Initialize bad character shift hash table. */ + memset (shift, 0, sizeof (shift)); + for (int i = 1; i < m1; i++) + shift[hash2 (ne + i)] = i; + shift1 = m1 - shift[hash2 (ne + m1)]; + shift[hash2 (ne + m1)] = m1; + + while (1) + { + if (__glibc_unlikely (hs > end)) + { + end += __strnlen ((const char*)end + m1 + 1, 2048); + if (hs > end) + return NULL; + } + + /* Skip past character pairs not in the needle. */ + do + { + hs += m1; + tmp = shift[hash2 (hs)]; + } + while (hs <= end && tmp == 0); + + /* If the match is not at the end of the needle, shift to the end + and continue until we match the last 2 characters. */ + hs -= tmp; + if (tmp < m1) + continue; + + /* The last 2 characters match. If the needle is long, check a + fixed number of characters first to quickly filter out mismatches. */ + if (memcmp (hs + offset, ne + offset, sizeof (int)) == 0) + { + if (memcmp (hs, ne, m1) == 0) + return (void *) hs; + + /* Adjust filter offset when it doesn't find the mismatch. */ + offset = (offset >= sizeof (int) ? offset : m1) - sizeof (int); + } + + /* Skip based on matching the last 2 characters. */ + hs += shift1; + } } libc_hidden_builtin_def (strstr) - -#undef LONG_NEEDLE_THRESHOLD