Improve performance of memmem
diff mbox series

Message ID DB5PR08MB1030105EE3706615A33BB02283B80@DB5PR08MB1030.eurprd08.prod.outlook.com
State New
Headers show
Series
  • Improve performance of memmem
Related show

Commit Message

Wilco Dijkstra Dec. 21, 2018, 4:04 p.m. UTC
This patch significantly improves performance of memmem using a novel
modified Horspool algorithm.  Needles up to size 256 use a bad-character
table indexed by hashed pairs of characters to quickly skip past mismatches.
Long needles use a self-adapting filtering step to avoid comparing the whole
needle repeatedly.

By limiting the needle length to 256, the shift table only requires 8 bits
per entry, lowering preprocessing overhead and minimizing cache effects.
This limit also implies worst-case performance is linear.

Small needles up to size 2 use a dedicated linear search.  Very long needles
use the Two-Way algorithm (to avoid increasing stack size or slowing down
the common case, inlining is disabled).

The performance gain is 4.5 times on English text on AArch64 using random
needles with average size 8 (this is even faster than the recently improved strstr
algorithm, so I'll update that in the near future).

Tested against GLIBC testsuite and randomized tests. OK for commit?

ChangeLog:
2018-12-21  Wilco Dijkstra  <wdijkstr@arm.com>

	* string/str-two-way.h (two_way_long_needle): Block inlining.
	* string/memmem.c (__memmem): Rewrite to improve performance.

--

Comments

Wilco Dijkstra Feb. 1, 2019, 3:39 p.m. UTC | #1
ping



From: Wilco Dijkstra
Sent: 21 December 2018 16:04
To: 'GNU C Library'
Cc: nd
Subject: [PATCH] Improve performance of memmem
  

This patch significantly improves performance of memmem using a novel
modified Horspool algorithm.  Needles up to size 256 use a bad-character
table indexed by hashed pairs of characters to quickly skip past mismatches.
Long needles use a self-adapting filtering step to avoid comparing the whole
needle repeatedly.

By limiting the needle length to 256, the shift table only requires 8 bits
per entry, lowering preprocessing overhead and minimizing cache effects.
This limit also implies worst-case performance is linear.

Small needles up to size 2 use a dedicated linear search.  Very long needles
use the Two-Way algorithm (to avoid increasing stack size or slowing down
the common case, inlining is disabled).

The performance gain is 4.5 times on English text on AArch64 using random
needles with average size 8 (this is even faster than the recently improved strstr
algorithm, so I'll update that in the near future).

Tested against GLIBC testsuite and randomized tests. OK for commit?

ChangeLog:
2018-12-21  Wilco Dijkstra  <wdijkstr@arm.com>

        * string/str-two-way.h (two_way_long_needle): Block inlining.
        * string/memmem.c (__memmem): Rewrite to improve performance.

--

diff --git a/string/memmem.c b/string/memmem.c
index d72b8249e62a744c2d031c9ccb0157f141df641f..150777800456075d981bd589e5f00388e9aa54f0 100644
--- a/string/memmem.c
+++ b/string/memmem.c
@@ -15,17 +15,13 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-/* This particular implementation was written by Eric Blake, 2008.  */
-
 #ifndef _LIBC
 # include <config.h>
 #endif
 
-/* Specification of memmem.  */
 #include <string.h>
 
 #ifndef _LIBC
-# define __builtin_expect(expr, val)   (expr)
 # define __memmem       memmem
 #endif
 
@@ -36,51 +32,90 @@
 
 #undef memmem
 
-/* Return the first occurrence of NEEDLE in HAYSTACK.  Return HAYSTACK
-   if NEEDLE_LEN is 0, otherwise NULL if NEEDLE is not found in
-   HAYSTACK.  */
+#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift))
+
+/* Fast memmem algorithm with guaranteed linear-time performance.
+   Small needles up to size 2 use a dedicated linear search.  Longer needles
+   up to size 256 use a novel modified Horspool algorithm.  It hashes pairs
+   of characters to quickly skip past mismatches.  The main search loop only
+   exits if the last 2 characters match, avoiding unnecessary calls to memcmp
+   and allowing for a larger skip if there is no match.  A self-adapting
+   filtering check is used to quickly detect mismatches in long needles.
+   By limiting the needle length to 256, the shift table can be reduced to 8
+   bits per entry, lowering preprocessing overhead and minimizing cache effects.
+   The limit also implies worst-case performance is linear.
+   Needles larger than 256 characters use the linear-time Two-Way algorithm.  */
 void *
-__memmem (const void *haystack_start, size_t haystack_len,
-         const void *needle_start, size_t needle_len)
+__memmem (const void *haystack, size_t hs_len,
+         const void *needle, size_t ne_len)
 {
-  /* Abstract memory is considered to be an array of 'unsigned char' values,
-     not an array of 'char' values.  See ISO C 99 section 6.2.6.1.  */
-  const unsigned char *haystack = (const unsigned char *) haystack_start;
-  const unsigned char *needle = (const unsigned char *) needle_start;
-
-  if (needle_len == 0)
-    /* The first occurrence of the empty string is deemed to occur at
-       the beginning of the string.  */
-    return (void *) haystack;
-
-  /* Sanity check, otherwise the loop might search through the whole
-     memory.  */
-  if (__glibc_unlikely (haystack_len < needle_len))
+  const unsigned char *hs = (const unsigned char *) haystack;
+  const unsigned char *ne = (const unsigned char *) needle;
+
+  if (ne_len == 0)
+    return (void *) hs;
+  if (ne_len == 1)
+    return (void *) memchr (hs, ne[0], hs_len);
+
+  /* Ensure haystack length is >= needle length.  */
+  if (hs_len < ne_len)
     return NULL;
 
-  /* Use optimizations in memchr when possible, to reduce the search
-     size of haystack using a linear algorithm with a smaller
-     coefficient.  However, avoid memchr for long needles, since we
-     can often achieve sublinear performance.  */
-  if (needle_len < LONG_NEEDLE_THRESHOLD)
+  const unsigned char *end = hs + hs_len - ne_len;
+
+  if (ne_len == 2)
+    {
+      uint32_t nw = ne[0] << 16 | ne[1], hw = hs[0] << 16 | hs[1];
+      for (hs++; hs <= end && hw != nw; )
+       hw = hw << 16 | *++hs;
+      return hw == nw ? (void *)hs - 1 : NULL;
+    }
+
+  /* Use Two-Way algorithm for very long needles.  */
+  if (__builtin_expect (ne_len > 256, 0))
+    return two_way_long_needle (hs, hs_len, ne, ne_len);
+
+  uint8_t shift[256];
+  size_t tmp, shift1;
+  size_t m1 = ne_len - 1;
+  size_t offset = 0;
+
+  memset (shift, 0, sizeof (shift));
+  for (int i = 1; i < m1; i++)
+    shift[hash2 (ne + i)] = i;
+  shift1 = m1 - shift[hash2 (ne + m1)];
+  shift[hash2 (ne + m1)] = m1;
+
+  for ( ; hs <= end; )
     {
-      haystack = memchr (haystack, *needle, haystack_len);
-      if (!haystack || __builtin_expect (needle_len == 1, 0))
-       return (void *) haystack;
-      haystack_len -= haystack - (const unsigned char *) haystack_start;
-      if (haystack_len < needle_len)
-       return NULL;
-      /* Check whether we have a match.  This improves performance since we
-        avoid the initialization overhead of the two-way algorithm.  */
-      if (memcmp (haystack, needle, needle_len) == 0)
-       return (void *) haystack;
-      return two_way_short_needle (haystack, haystack_len, needle, needle_len);
+      /* Skip past character pairs not in the needle.  */
+      do
+       {
+         hs += m1;
+         tmp = shift[hash2 (hs)];
+       }
+      while (tmp == 0 && hs <= end);
+
+      /* If the match is not at the end of the needle, shift to the end
+        and continue until we match the last 2 characters.  */
+      hs -= tmp;
+      if (tmp < m1)
+       continue;
+
+      if (m1 <= 15 || memcmp (hs + offset, ne + offset, sizeof (long)) == 0)
+       {
+         if (memcmp (hs, ne, m1) == 0)
+           return (void *) hs;
+
+         /* Adjust filter offset when it doesn't find the mismatch.  */
+         offset = (offset >= sizeof (long) ? offset : m1) - sizeof (long);
+       }
+
+      /* Skip based on matching the last 2 characters.  */
+      hs += shift1;
     }
-  else
-    return two_way_long_needle (haystack, haystack_len, needle, needle_len);
+  return NULL;
 }
 libc_hidden_def (__memmem)
 weak_alias (__memmem, memmem)
 libc_hidden_weak (memmem)
-
-#undef LONG_NEEDLE_THRESHOLD
diff --git a/string/str-two-way.h b/string/str-two-way.h
index 31c3f18fb057cdd999c3ac9e9d894a8b62a98a70..5a800e0eaf1c7505a9340a7aabd149326958df4a 100644
--- a/string/str-two-way.h
+++ b/string/str-two-way.h
@@ -383,7 +383,7 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
    sublinear performance is not possible.  */
-static RETURN_TYPE
+__attribute__((noinline)) static RETURN_TYPE
 two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
                      const unsigned char *needle, size_t needle_len)
 {
Wilco Dijkstra March 6, 2019, 4:55 p.m. UTC | #2
ping

  

This patch significantly improves performance of memmem using a novel
modified Horspool algorithm.  Needles up to size 256 use a bad-character
table indexed by hashed pairs of characters to quickly skip past mismatches.
Long needles use a self-adapting filtering step to avoid comparing the whole
needle repeatedly.

By limiting the needle length to 256, the shift table only requires 8 bits
per entry, lowering preprocessing overhead and minimizing cache effects.
This limit also implies worst-case performance is linear.

Small needles up to size 2 use a dedicated linear search.  Very long needles
use the Two-Way algorithm (to avoid increasing stack size or slowing down
the common case, inlining is disabled).

The performance gain is 4.5 times on English text on AArch64 using random
needles with average size 8 (this is even faster than the recently improved strstr
algorithm, so I'll update that in the near future).

Tested against GLIBC testsuite and randomized tests. OK for commit?

ChangeLog:
2018-12-21  Wilco Dijkstra  <wdijkstr@arm.com>

        * string/str-two-way.h (two_way_long_needle): Block inlining.
        * string/memmem.c (__memmem): Rewrite to improve performance.

--

diff --git a/string/memmem.c b/string/memmem.c
index d72b8249e62a744c2d031c9ccb0157f141df641f..150777800456075d981bd589e5f00388e9aa54f0 100644
--- a/string/memmem.c
+++ b/string/memmem.c
@@ -15,17 +15,13 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-/* This particular implementation was written by Eric Blake, 2008.  */
-
 #ifndef _LIBC
 # include <config.h>
 #endif
 
-/* Specification of memmem.  */
 #include <string.h>
 
 #ifndef _LIBC
-# define __builtin_expect(expr, val)   (expr)
 # define __memmem       memmem
 #endif
 
@@ -36,51 +32,90 @@
 
 #undef memmem
 
-/* Return the first occurrence of NEEDLE in HAYSTACK.  Return HAYSTACK
-   if NEEDLE_LEN is 0, otherwise NULL if NEEDLE is not found in
-   HAYSTACK.  */
+#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift))
+
+/* Fast memmem algorithm with guaranteed linear-time performance.
+   Small needles up to size 2 use a dedicated linear search.  Longer needles
+   up to size 256 use a novel modified Horspool algorithm.  It hashes pairs
+   of characters to quickly skip past mismatches.  The main search loop only
+   exits if the last 2 characters match, avoiding unnecessary calls to memcmp
+   and allowing for a larger skip if there is no match.  A self-adapting
+   filtering check is used to quickly detect mismatches in long needles.
+   By limiting the needle length to 256, the shift table can be reduced to 8
+   bits per entry, lowering preprocessing overhead and minimizing cache effects.
+   The limit also implies worst-case performance is linear.
+   Needles larger than 256 characters use the linear-time Two-Way algorithm.  */
 void *
-__memmem (const void *haystack_start, size_t haystack_len,
-         const void *needle_start, size_t needle_len)
+__memmem (const void *haystack, size_t hs_len,
+         const void *needle, size_t ne_len)
 {
-  /* Abstract memory is considered to be an array of 'unsigned char' values,
-     not an array of 'char' values.  See ISO C 99 section 6.2.6.1.  */
-  const unsigned char *haystack = (const unsigned char *) haystack_start;
-  const unsigned char *needle = (const unsigned char *) needle_start;
-
-  if (needle_len == 0)
-    /* The first occurrence of the empty string is deemed to occur at
-       the beginning of the string.  */
-    return (void *) haystack;
-
-  /* Sanity check, otherwise the loop might search through the whole
-     memory.  */
-  if (__glibc_unlikely (haystack_len < needle_len))
+  const unsigned char *hs = (const unsigned char *) haystack;
+  const unsigned char *ne = (const unsigned char *) needle;
+
+  if (ne_len == 0)
+    return (void *) hs;
+  if (ne_len == 1)
+    return (void *) memchr (hs, ne[0], hs_len);
+
+  /* Ensure haystack length is >= needle length.  */
+  if (hs_len < ne_len)
     return NULL;
 
-  /* Use optimizations in memchr when possible, to reduce the search
-     size of haystack using a linear algorithm with a smaller
-     coefficient.  However, avoid memchr for long needles, since we
-     can often achieve sublinear performance.  */
-  if (needle_len < LONG_NEEDLE_THRESHOLD)
+  const unsigned char *end = hs + hs_len - ne_len;
+
+  if (ne_len == 2)
+    {
+      uint32_t nw = ne[0] << 16 | ne[1], hw = hs[0] << 16 | hs[1];
+      for (hs++; hs <= end && hw != nw; )
+       hw = hw << 16 | *++hs;
+      return hw == nw ? (void *)hs - 1 : NULL;
+    }
+
+  /* Use Two-Way algorithm for very long needles.  */
+  if (__builtin_expect (ne_len > 256, 0))
+    return two_way_long_needle (hs, hs_len, ne, ne_len);
+
+  uint8_t shift[256];
+  size_t tmp, shift1;
+  size_t m1 = ne_len - 1;
+  size_t offset = 0;
+
+  memset (shift, 0, sizeof (shift));
+  for (int i = 1; i < m1; i++)
+    shift[hash2 (ne + i)] = i;
+  shift1 = m1 - shift[hash2 (ne + m1)];
+  shift[hash2 (ne + m1)] = m1;
+
+  for ( ; hs <= end; )
     {
-      haystack = memchr (haystack, *needle, haystack_len);
-      if (!haystack || __builtin_expect (needle_len == 1, 0))
-       return (void *) haystack;
-      haystack_len -= haystack - (const unsigned char *) haystack_start;
-      if (haystack_len < needle_len)
-       return NULL;
-      /* Check whether we have a match.  This improves performance since we
-        avoid the initialization overhead of the two-way algorithm.  */
-      if (memcmp (haystack, needle, needle_len) == 0)
-       return (void *) haystack;
-      return two_way_short_needle (haystack, haystack_len, needle, needle_len);
+      /* Skip past character pairs not in the needle.  */
+      do
+       {
+         hs += m1;
+         tmp = shift[hash2 (hs)];
+       }
+      while (tmp == 0 && hs <= end);
+
+      /* If the match is not at the end of the needle, shift to the end
+        and continue until we match the last 2 characters.  */
+      hs -= tmp;
+      if (tmp < m1)
+       continue;
+
+      if (m1 <= 15 || memcmp (hs + offset, ne + offset, sizeof (long)) == 0)
+       {
+         if (memcmp (hs, ne, m1) == 0)
+           return (void *) hs;
+
+         /* Adjust filter offset when it doesn't find the mismatch.  */
+         offset = (offset >= sizeof (long) ? offset : m1) - sizeof (long);
+       }
+
+      /* Skip based on matching the last 2 characters.  */
+      hs += shift1;
     }
-  else
-    return two_way_long_needle (haystack, haystack_len, needle, needle_len);
+  return NULL;
 }
 libc_hidden_def (__memmem)
 weak_alias (__memmem, memmem)
 libc_hidden_weak (memmem)
-
-#undef LONG_NEEDLE_THRESHOLD
diff --git a/string/str-two-way.h b/string/str-two-way.h
index 31c3f18fb057cdd999c3ac9e9d894a8b62a98a70..5a800e0eaf1c7505a9340a7aabd149326958df4a 100644
--- a/string/str-two-way.h
+++ b/string/str-two-way.h
@@ -383,7 +383,7 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
    sublinear performance is not possible.  */
-static RETURN_TYPE
+__attribute__((noinline)) static RETURN_TYPE
 two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
                      const unsigned char *needle, size_t needle_len)
 {
Wilco Dijkstra March 18, 2019, 5:17 p.m. UTC | #3
ping

  

This patch significantly improves performance of memmem using a novel
modified Horspool algorithm.  Needles up to size 256 use a bad-character
table indexed by hashed pairs of characters to quickly skip past mismatches.
Long needles use a self-adapting filtering step to avoid comparing the whole
needle repeatedly.

By limiting the needle length to 256, the shift table only requires 8 bits
per entry, lowering preprocessing overhead and minimizing cache effects.
This limit also implies worst-case performance is linear.

Small needles up to size 2 use a dedicated linear search.  Very long needles
use the Two-Way algorithm (to avoid increasing stack size or slowing down
the common case, inlining is disabled).

The performance gain is 4.5 times on English text on AArch64 using random
needles with average size 8 (this is even faster than the recently improved strstr
algorithm, so I'll update that in the near future).

Tested against GLIBC testsuite and randomized tests. OK for commit?

ChangeLog:
2018-12-21  Wilco Dijkstra  <wdijkstr@arm.com>

        * string/str-two-way.h (two_way_long_needle): Block inlining.
        * string/memmem.c (__memmem): Rewrite to improve performance.

--

diff --git a/string/memmem.c b/string/memmem.c
index d72b8249e62a744c2d031c9ccb0157f141df641f..150777800456075d981bd589e5f00388e9aa54f0 100644
--- a/string/memmem.c
+++ b/string/memmem.c
@@ -15,17 +15,13 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-/* This particular implementation was written by Eric Blake, 2008.  */
-
 #ifndef _LIBC
 # include <config.h>
 #endif
 
-/* Specification of memmem.  */
 #include <string.h>
 
 #ifndef _LIBC
-# define __builtin_expect(expr, val)   (expr)
 # define __memmem       memmem
 #endif
 
@@ -36,51 +32,90 @@
 
 #undef memmem
 
-/* Return the first occurrence of NEEDLE in HAYSTACK.  Return HAYSTACK
-   if NEEDLE_LEN is 0, otherwise NULL if NEEDLE is not found in
-   HAYSTACK.  */
+#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift))
+
+/* Fast memmem algorithm with guaranteed linear-time performance.
+   Small needles up to size 2 use a dedicated linear search.  Longer needles
+   up to size 256 use a novel modified Horspool algorithm.  It hashes pairs
+   of characters to quickly skip past mismatches.  The main search loop only
+   exits if the last 2 characters match, avoiding unnecessary calls to memcmp
+   and allowing for a larger skip if there is no match.  A self-adapting
+   filtering check is used to quickly detect mismatches in long needles.
+   By limiting the needle length to 256, the shift table can be reduced to 8
+   bits per entry, lowering preprocessing overhead and minimizing cache effects.
+   The limit also implies worst-case performance is linear.
+   Needles larger than 256 characters use the linear-time Two-Way algorithm.  */
 void *
-__memmem (const void *haystack_start, size_t haystack_len,
-         const void *needle_start, size_t needle_len)
+__memmem (const void *haystack, size_t hs_len,
+         const void *needle, size_t ne_len)
 {
-  /* Abstract memory is considered to be an array of 'unsigned char' values,
-     not an array of 'char' values.  See ISO C 99 section 6.2.6.1.  */
-  const unsigned char *haystack = (const unsigned char *) haystack_start;
-  const unsigned char *needle = (const unsigned char *) needle_start;
-
-  if (needle_len == 0)
-    /* The first occurrence of the empty string is deemed to occur at
-       the beginning of the string.  */
-    return (void *) haystack;
-
-  /* Sanity check, otherwise the loop might search through the whole
-     memory.  */
-  if (__glibc_unlikely (haystack_len < needle_len))
+  const unsigned char *hs = (const unsigned char *) haystack;
+  const unsigned char *ne = (const unsigned char *) needle;
+
+  if (ne_len == 0)
+    return (void *) hs;
+  if (ne_len == 1)
+    return (void *) memchr (hs, ne[0], hs_len);
+
+  /* Ensure haystack length is >= needle length.  */
+  if (hs_len < ne_len)
     return NULL;
 
-  /* Use optimizations in memchr when possible, to reduce the search
-     size of haystack using a linear algorithm with a smaller
-     coefficient.  However, avoid memchr for long needles, since we
-     can often achieve sublinear performance.  */
-  if (needle_len < LONG_NEEDLE_THRESHOLD)
+  const unsigned char *end = hs + hs_len - ne_len;
+
+  if (ne_len == 2)
+    {
+      uint32_t nw = ne[0] << 16 | ne[1], hw = hs[0] << 16 | hs[1];
+      for (hs++; hs <= end && hw != nw; )
+       hw = hw << 16 | *++hs;
+      return hw == nw ? (void *)hs - 1 : NULL;
+    }
+
+  /* Use Two-Way algorithm for very long needles.  */
+  if (__builtin_expect (ne_len > 256, 0))
+    return two_way_long_needle (hs, hs_len, ne, ne_len);
+
+  uint8_t shift[256];
+  size_t tmp, shift1;
+  size_t m1 = ne_len - 1;
+  size_t offset = 0;
+
+  memset (shift, 0, sizeof (shift));
+  for (int i = 1; i < m1; i++)
+    shift[hash2 (ne + i)] = i;
+  shift1 = m1 - shift[hash2 (ne + m1)];
+  shift[hash2 (ne + m1)] = m1;
+
+  for ( ; hs <= end; )
     {
-      haystack = memchr (haystack, *needle, haystack_len);
-      if (!haystack || __builtin_expect (needle_len == 1, 0))
-       return (void *) haystack;
-      haystack_len -= haystack - (const unsigned char *) haystack_start;
-      if (haystack_len < needle_len)
-       return NULL;
-      /* Check whether we have a match.  This improves performance since we
-        avoid the initialization overhead of the two-way algorithm.  */
-      if (memcmp (haystack, needle, needle_len) == 0)
-       return (void *) haystack;
-      return two_way_short_needle (haystack, haystack_len, needle, needle_len);
+      /* Skip past character pairs not in the needle.  */
+      do
+       {
+         hs += m1;
+         tmp = shift[hash2 (hs)];
+       }
+      while (tmp == 0 && hs <= end);
+
+      /* If the match is not at the end of the needle, shift to the end
+        and continue until we match the last 2 characters.  */
+      hs -= tmp;
+      if (tmp < m1)
+       continue;
+
+      if (m1 <= 15 || memcmp (hs + offset, ne + offset, sizeof (long)) == 0)
+       {
+         if (memcmp (hs, ne, m1) == 0)
+           return (void *) hs;
+
+         /* Adjust filter offset when it doesn't find the mismatch.  */
+         offset = (offset >= sizeof (long) ? offset : m1) - sizeof (long);
+       }
+
+      /* Skip based on matching the last 2 characters.  */
+      hs += shift1;
     }
-  else
-    return two_way_long_needle (haystack, haystack_len, needle, needle_len);
+  return NULL;
 }
 libc_hidden_def (__memmem)
 weak_alias (__memmem, memmem)
 libc_hidden_weak (memmem)
-
-#undef LONG_NEEDLE_THRESHOLD
diff --git a/string/str-two-way.h b/string/str-two-way.h
index 31c3f18fb057cdd999c3ac9e9d894a8b62a98a70..5a800e0eaf1c7505a9340a7aabd149326958df4a 100644
--- a/string/str-two-way.h
+++ b/string/str-two-way.h
@@ -383,7 +383,7 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
    sublinear performance is not possible.  */
-static RETURN_TYPE
+__attribute__((noinline)) static RETURN_TYPE
 two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
                      const unsigned char *needle, size_t needle_len)
 {
Wilco Dijkstra April 12, 2019, 3:49 p.m. UTC | #4
ping

  

This patch significantly improves performance of memmem using a novel
modified Horspool algorithm.  Needles up to size 256 use a bad-character
table indexed by hashed pairs of characters to quickly skip past mismatches.
Long needles use a self-adapting filtering step to avoid comparing the whole
needle repeatedly.

By limiting the needle length to 256, the shift table only requires 8 bits
per entry, lowering preprocessing overhead and minimizing cache effects.
This limit also implies worst-case performance is linear.

Small needles up to size 2 use a dedicated linear search.  Very long needles
use the Two-Way algorithm (to avoid increasing stack size or slowing down
the common case, inlining is disabled).

The performance gain is 4.5 times on English text on AArch64 using random
needles with average size 8 (this is even faster than the recently improved strstr
algorithm, so I'll update that in the near future).

Tested against GLIBC testsuite and randomized tests. OK for commit?

ChangeLog:
2018-12-21  Wilco Dijkstra  <wdijkstr@arm.com>

        * string/str-two-way.h (two_way_long_needle): Block inlining.
        * string/memmem.c (__memmem): Rewrite to improve performance.

--

diff --git a/string/memmem.c b/string/memmem.c
index d72b8249e62a744c2d031c9ccb0157f141df641f..150777800456075d981bd589e5f00388e9aa54f0 100644
--- a/string/memmem.c
+++ b/string/memmem.c
@@ -15,17 +15,13 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-/* This particular implementation was written by Eric Blake, 2008.  */
-
 #ifndef _LIBC
 # include <config.h>
 #endif
 
-/* Specification of memmem.  */
 #include <string.h>
 
 #ifndef _LIBC
-# define __builtin_expect(expr, val)   (expr)
 # define __memmem       memmem
 #endif
 
@@ -36,51 +32,90 @@
 
 #undef memmem
 
-/* Return the first occurrence of NEEDLE in HAYSTACK.  Return HAYSTACK
-   if NEEDLE_LEN is 0, otherwise NULL if NEEDLE is not found in
-   HAYSTACK.  */
+#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift))
+
+/* Fast memmem algorithm with guaranteed linear-time performance.
+   Small needles up to size 2 use a dedicated linear search.  Longer needles
+   up to size 256 use a novel modified Horspool algorithm.  It hashes pairs
+   of characters to quickly skip past mismatches.  The main search loop only
+   exits if the last 2 characters match, avoiding unnecessary calls to memcmp
+   and allowing for a larger skip if there is no match.  A self-adapting
+   filtering check is used to quickly detect mismatches in long needles.
+   By limiting the needle length to 256, the shift table can be reduced to 8
+   bits per entry, lowering preprocessing overhead and minimizing cache effects.
+   The limit also implies worst-case performance is linear.
+   Needles larger than 256 characters use the linear-time Two-Way algorithm.  */
 void *
-__memmem (const void *haystack_start, size_t haystack_len,
-         const void *needle_start, size_t needle_len)
+__memmem (const void *haystack, size_t hs_len,
+         const void *needle, size_t ne_len)
 {
-  /* Abstract memory is considered to be an array of 'unsigned char' values,
-     not an array of 'char' values.  See ISO C 99 section 6.2.6.1.  */
-  const unsigned char *haystack = (const unsigned char *) haystack_start;
-  const unsigned char *needle = (const unsigned char *) needle_start;
-
-  if (needle_len == 0)
-    /* The first occurrence of the empty string is deemed to occur at
-       the beginning of the string.  */
-    return (void *) haystack;
-
-  /* Sanity check, otherwise the loop might search through the whole
-     memory.  */
-  if (__glibc_unlikely (haystack_len < needle_len))
+  const unsigned char *hs = (const unsigned char *) haystack;
+  const unsigned char *ne = (const unsigned char *) needle;
+
+  if (ne_len == 0)
+    return (void *) hs;
+  if (ne_len == 1)
+    return (void *) memchr (hs, ne[0], hs_len);
+
+  /* Ensure haystack length is >= needle length.  */
+  if (hs_len < ne_len)
     return NULL;
 
-  /* Use optimizations in memchr when possible, to reduce the search
-     size of haystack using a linear algorithm with a smaller
-     coefficient.  However, avoid memchr for long needles, since we
-     can often achieve sublinear performance.  */
-  if (needle_len < LONG_NEEDLE_THRESHOLD)
+  const unsigned char *end = hs + hs_len - ne_len;
+
+  if (ne_len == 2)
+    {
+      uint32_t nw = ne[0] << 16 | ne[1], hw = hs[0] << 16 | hs[1];
+      for (hs++; hs <= end && hw != nw; )
+       hw = hw << 16 | *++hs;
+      return hw == nw ? (void *)hs - 1 : NULL;
+    }
+
+  /* Use Two-Way algorithm for very long needles.  */
+  if (__builtin_expect (ne_len > 256, 0))
+    return two_way_long_needle (hs, hs_len, ne, ne_len);
+
+  uint8_t shift[256];
+  size_t tmp, shift1;
+  size_t m1 = ne_len - 1;
+  size_t offset = 0;
+
+  memset (shift, 0, sizeof (shift));
+  for (int i = 1; i < m1; i++)
+    shift[hash2 (ne + i)] = i;
+  shift1 = m1 - shift[hash2 (ne + m1)];
+  shift[hash2 (ne + m1)] = m1;
+
+  for ( ; hs <= end; )
     {
-      haystack = memchr (haystack, *needle, haystack_len);
-      if (!haystack || __builtin_expect (needle_len == 1, 0))
-       return (void *) haystack;
-      haystack_len -= haystack - (const unsigned char *) haystack_start;
-      if (haystack_len < needle_len)
-       return NULL;
-      /* Check whether we have a match.  This improves performance since we
-        avoid the initialization overhead of the two-way algorithm.  */
-      if (memcmp (haystack, needle, needle_len) == 0)
-       return (void *) haystack;
-      return two_way_short_needle (haystack, haystack_len, needle, needle_len);
+      /* Skip past character pairs not in the needle.  */
+      do
+       {
+         hs += m1;
+         tmp = shift[hash2 (hs)];
+       }
+      while (tmp == 0 && hs <= end);
+
+      /* If the match is not at the end of the needle, shift to the end
+        and continue until we match the last 2 characters.  */
+      hs -= tmp;
+      if (tmp < m1)
+       continue;
+
+      if (m1 <= 15 || memcmp (hs + offset, ne + offset, sizeof (long)) == 0)
+       {
+         if (memcmp (hs, ne, m1) == 0)
+           return (void *) hs;
+
+         /* Adjust filter offset when it doesn't find the mismatch.  */
+         offset = (offset >= sizeof (long) ? offset : m1) - sizeof (long);
+       }
+
+      /* Skip based on matching the last 2 characters.  */
+      hs += shift1;
     }
-  else
-    return two_way_long_needle (haystack, haystack_len, needle, needle_len);
+  return NULL;
 }
 libc_hidden_def (__memmem)
 weak_alias (__memmem, memmem)
 libc_hidden_weak (memmem)
-
-#undef LONG_NEEDLE_THRESHOLD
diff --git a/string/str-two-way.h b/string/str-two-way.h
index 31c3f18fb057cdd999c3ac9e9d894a8b62a98a70..5a800e0eaf1c7505a9340a7aabd149326958df4a 100644
--- a/string/str-two-way.h
+++ b/string/str-two-way.h
@@ -383,7 +383,7 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
    sublinear performance is not possible.  */
-static RETURN_TYPE
+__attribute__((noinline)) static RETURN_TYPE
 two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
                      const unsigned char *needle, size_t needle_len)
 {
Wilco Dijkstra May 24, 2019, 1:59 p.m. UTC | #5
ping

  

This patch significantly improves performance of memmem using a novel
modified Horspool algorithm.  Needles up to size 256 use a bad-character
table indexed by hashed pairs of characters to quickly skip past mismatches.
Long needles use a self-adapting filtering step to avoid comparing the whole
needle repeatedly.

By limiting the needle length to 256, the shift table only requires 8 bits
per entry, lowering preprocessing overhead and minimizing cache effects.
This limit also implies worst-case performance is linear.

Small needles up to size 2 use a dedicated linear search.  Very long needles
use the Two-Way algorithm (to avoid increasing stack size or slowing down
the common case, inlining is disabled).

The performance gain is 4.5 times on English text on AArch64 using random
needles with average size 8 (this is even faster than the recently improved strstr
algorithm, so I'll update that in the near future).

Tested against GLIBC testsuite and randomized tests. OK for commit?

ChangeLog:
2018-12-21  Wilco Dijkstra  <wdijkstr@arm.com>

        * string/str-two-way.h (two_way_long_needle): Block inlining.
        * string/memmem.c (__memmem): Rewrite to improve performance.

--

diff --git a/string/memmem.c b/string/memmem.c
index d72b8249e62a744c2d031c9ccb0157f141df641f..150777800456075d981bd589e5f00388e9aa54f0 100644
--- a/string/memmem.c
+++ b/string/memmem.c
@@ -15,17 +15,13 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-/* This particular implementation was written by Eric Blake, 2008.  */
-
 #ifndef _LIBC
 # include <config.h>
 #endif
 
-/* Specification of memmem.  */
 #include <string.h>
 
 #ifndef _LIBC
-# define __builtin_expect(expr, val)   (expr)
 # define __memmem       memmem
 #endif
 
@@ -36,51 +32,90 @@
 
 #undef memmem
 
-/* Return the first occurrence of NEEDLE in HAYSTACK.  Return HAYSTACK
-   if NEEDLE_LEN is 0, otherwise NULL if NEEDLE is not found in
-   HAYSTACK.  */
+#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift))
+
+/* Fast memmem algorithm with guaranteed linear-time performance.
+   Small needles up to size 2 use a dedicated linear search.  Longer needles
+   up to size 256 use a novel modified Horspool algorithm.  It hashes pairs
+   of characters to quickly skip past mismatches.  The main search loop only
+   exits if the last 2 characters match, avoiding unnecessary calls to memcmp
+   and allowing for a larger skip if there is no match.  A self-adapting
+   filtering check is used to quickly detect mismatches in long needles.
+   By limiting the needle length to 256, the shift table can be reduced to 8
+   bits per entry, lowering preprocessing overhead and minimizing cache effects.
+   The limit also implies worst-case performance is linear.
+   Needles larger than 256 characters use the linear-time Two-Way algorithm.  */
 void *
-__memmem (const void *haystack_start, size_t haystack_len,
-         const void *needle_start, size_t needle_len)
+__memmem (const void *haystack, size_t hs_len,
+         const void *needle, size_t ne_len)
 {
-  /* Abstract memory is considered to be an array of 'unsigned char' values,
-     not an array of 'char' values.  See ISO C 99 section 6.2.6.1.  */
-  const unsigned char *haystack = (const unsigned char *) haystack_start;
-  const unsigned char *needle = (const unsigned char *) needle_start;
-
-  if (needle_len == 0)
-    /* The first occurrence of the empty string is deemed to occur at
-       the beginning of the string.  */
-    return (void *) haystack;
-
-  /* Sanity check, otherwise the loop might search through the whole
-     memory.  */
-  if (__glibc_unlikely (haystack_len < needle_len))
+  const unsigned char *hs = (const unsigned char *) haystack;
+  const unsigned char *ne = (const unsigned char *) needle;
+
+  if (ne_len == 0)
+    return (void *) hs;
+  if (ne_len == 1)
+    return (void *) memchr (hs, ne[0], hs_len);
+
+  /* Ensure haystack length is >= needle length.  */
+  if (hs_len < ne_len)
     return NULL;
 
-  /* Use optimizations in memchr when possible, to reduce the search
-     size of haystack using a linear algorithm with a smaller
-     coefficient.  However, avoid memchr for long needles, since we
-     can often achieve sublinear performance.  */
-  if (needle_len < LONG_NEEDLE_THRESHOLD)
+  const unsigned char *end = hs + hs_len - ne_len;
+
+  if (ne_len == 2)
+    {
+      uint32_t nw = ne[0] << 16 | ne[1], hw = hs[0] << 16 | hs[1];
+      for (hs++; hs <= end && hw != nw; )
+       hw = hw << 16 | *++hs;
+      return hw == nw ? (void *)hs - 1 : NULL;
+    }
+
+  /* Use Two-Way algorithm for very long needles.  */
+  if (__builtin_expect (ne_len > 256, 0))
+    return two_way_long_needle (hs, hs_len, ne, ne_len);
+
+  uint8_t shift[256];
+  size_t tmp, shift1;
+  size_t m1 = ne_len - 1;
+  size_t offset = 0;
+
+  memset (shift, 0, sizeof (shift));
+  for (int i = 1; i < m1; i++)
+    shift[hash2 (ne + i)] = i;
+  shift1 = m1 - shift[hash2 (ne + m1)];
+  shift[hash2 (ne + m1)] = m1;
+
+  for ( ; hs <= end; )
     {
-      haystack = memchr (haystack, *needle, haystack_len);
-      if (!haystack || __builtin_expect (needle_len == 1, 0))
-       return (void *) haystack;
-      haystack_len -= haystack - (const unsigned char *) haystack_start;
-      if (haystack_len < needle_len)
-       return NULL;
-      /* Check whether we have a match.  This improves performance since we
-        avoid the initialization overhead of the two-way algorithm.  */
-      if (memcmp (haystack, needle, needle_len) == 0)
-       return (void *) haystack;
-      return two_way_short_needle (haystack, haystack_len, needle, needle_len);
+      /* Skip past character pairs not in the needle.  */
+      do
+       {
+         hs += m1;
+         tmp = shift[hash2 (hs)];
+       }
+      while (tmp == 0 && hs <= end);
+
+      /* If the match is not at the end of the needle, shift to the end
+        and continue until we match the last 2 characters.  */
+      hs -= tmp;
+      if (tmp < m1)
+       continue;
+
+      if (m1 <= 15 || memcmp (hs + offset, ne + offset, sizeof (long)) == 0)
+       {
+         if (memcmp (hs, ne, m1) == 0)
+           return (void *) hs;
+
+         /* Adjust filter offset when it doesn't find the mismatch.  */
+         offset = (offset >= sizeof (long) ? offset : m1) - sizeof (long);
+       }
+
+      /* Skip based on matching the last 2 characters.  */
+      hs += shift1;
     }
-  else
-    return two_way_long_needle (haystack, haystack_len, needle, needle_len);
+  return NULL;
 }
 libc_hidden_def (__memmem)
 weak_alias (__memmem, memmem)
 libc_hidden_weak (memmem)
-
-#undef LONG_NEEDLE_THRESHOLD
diff --git a/string/str-two-way.h b/string/str-two-way.h
index 31c3f18fb057cdd999c3ac9e9d894a8b62a98a70..5a800e0eaf1c7505a9340a7aabd149326958df4a 100644
--- a/string/str-two-way.h
+++ b/string/str-two-way.h
@@ -383,7 +383,7 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
    sublinear performance is not possible.  */
-static RETURN_TYPE
+__attribute__((noinline)) static RETURN_TYPE
 two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
                      const unsigned char *needle, size_t needle_len)
 {
Wilco Dijkstra June 10, 2019, 11:48 a.m. UTC | #6
ping

  

This patch significantly improves performance of memmem using a novel
modified Horspool algorithm.  Needles up to size 256 use a bad-character
table indexed by hashed pairs of characters to quickly skip past mismatches.
Long needles use a self-adapting filtering step to avoid comparing the whole
needle repeatedly.

By limiting the needle length to 256, the shift table only requires 8 bits
per entry, lowering preprocessing overhead and minimizing cache effects.
This limit also implies worst-case performance is linear.

Small needles up to size 2 use a dedicated linear search.  Very long needles
use the Two-Way algorithm (to avoid increasing stack size or slowing down
the common case, inlining is disabled).

The performance gain is 4.5 times on English text on AArch64 using random
needles with average size 8 (this is even faster than the recently improved strstr
algorithm, so I'll update that in the near future).

Tested against GLIBC testsuite and randomized tests. OK for commit?

ChangeLog:
2018-12-21  Wilco Dijkstra  <wdijkstr@arm.com>

        * string/str-two-way.h (two_way_long_needle): Block inlining.
        * string/memmem.c (__memmem): Rewrite to improve performance.

--

diff --git a/string/memmem.c b/string/memmem.c
index d72b8249e62a744c2d031c9ccb0157f141df641f..150777800456075d981bd589e5f00388e9aa54f0 100644
--- a/string/memmem.c
+++ b/string/memmem.c
@@ -15,17 +15,13 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-/* This particular implementation was written by Eric Blake, 2008.  */
-
 #ifndef _LIBC
 # include <config.h>
 #endif
 
-/* Specification of memmem.  */
 #include <string.h>
 
 #ifndef _LIBC
-# define __builtin_expect(expr, val)   (expr)
 # define __memmem       memmem
 #endif
 
@@ -36,51 +32,90 @@
 
 #undef memmem
 
-/* Return the first occurrence of NEEDLE in HAYSTACK.  Return HAYSTACK
-   if NEEDLE_LEN is 0, otherwise NULL if NEEDLE is not found in
-   HAYSTACK.  */
+#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift))
+
+/* Fast memmem algorithm with guaranteed linear-time performance.
+   Small needles up to size 2 use a dedicated linear search.  Longer needles
+   up to size 256 use a novel modified Horspool algorithm.  It hashes pairs
+   of characters to quickly skip past mismatches.  The main search loop only
+   exits if the last 2 characters match, avoiding unnecessary calls to memcmp
+   and allowing for a larger skip if there is no match.  A self-adapting
+   filtering check is used to quickly detect mismatches in long needles.
+   By limiting the needle length to 256, the shift table can be reduced to 8
+   bits per entry, lowering preprocessing overhead and minimizing cache effects.
+   The limit also implies worst-case performance is linear.
+   Needles larger than 256 characters use the linear-time Two-Way algorithm.  */
 void *
-__memmem (const void *haystack_start, size_t haystack_len,
-         const void *needle_start, size_t needle_len)
+__memmem (const void *haystack, size_t hs_len,
+         const void *needle, size_t ne_len)
 {
-  /* Abstract memory is considered to be an array of 'unsigned char' values,
-     not an array of 'char' values.  See ISO C 99 section 6.2.6.1.  */
-  const unsigned char *haystack = (const unsigned char *) haystack_start;
-  const unsigned char *needle = (const unsigned char *) needle_start;
-
-  if (needle_len == 0)
-    /* The first occurrence of the empty string is deemed to occur at
-       the beginning of the string.  */
-    return (void *) haystack;
-
-  /* Sanity check, otherwise the loop might search through the whole
-     memory.  */
-  if (__glibc_unlikely (haystack_len < needle_len))
+  const unsigned char *hs = (const unsigned char *) haystack;
+  const unsigned char *ne = (const unsigned char *) needle;
+
+  if (ne_len == 0)
+    return (void *) hs;
+  if (ne_len == 1)
+    return (void *) memchr (hs, ne[0], hs_len);
+
+  /* Ensure haystack length is >= needle length.  */
+  if (hs_len < ne_len)
     return NULL;
 
-  /* Use optimizations in memchr when possible, to reduce the search
-     size of haystack using a linear algorithm with a smaller
-     coefficient.  However, avoid memchr for long needles, since we
-     can often achieve sublinear performance.  */
-  if (needle_len < LONG_NEEDLE_THRESHOLD)
+  const unsigned char *end = hs + hs_len - ne_len;
+
+  if (ne_len == 2)
+    {
+      uint32_t nw = ne[0] << 16 | ne[1], hw = hs[0] << 16 | hs[1];
+      for (hs++; hs <= end && hw != nw; )
+       hw = hw << 16 | *++hs;
+      return hw == nw ? (void *)hs - 1 : NULL;
+    }
+
+  /* Use Two-Way algorithm for very long needles.  */
+  if (__builtin_expect (ne_len > 256, 0))
+    return two_way_long_needle (hs, hs_len, ne, ne_len);
+
+  uint8_t shift[256];
+  size_t tmp, shift1;
+  size_t m1 = ne_len - 1;
+  size_t offset = 0;
+
+  memset (shift, 0, sizeof (shift));
+  for (int i = 1; i < m1; i++)
+    shift[hash2 (ne + i)] = i;
+  shift1 = m1 - shift[hash2 (ne + m1)];
+  shift[hash2 (ne + m1)] = m1;
+
+  for ( ; hs <= end; )
     {
-      haystack = memchr (haystack, *needle, haystack_len);
-      if (!haystack || __builtin_expect (needle_len == 1, 0))
-       return (void *) haystack;
-      haystack_len -= haystack - (const unsigned char *) haystack_start;
-      if (haystack_len < needle_len)
-       return NULL;
-      /* Check whether we have a match.  This improves performance since we
-        avoid the initialization overhead of the two-way algorithm.  */
-      if (memcmp (haystack, needle, needle_len) == 0)
-       return (void *) haystack;
-      return two_way_short_needle (haystack, haystack_len, needle, needle_len);
+      /* Skip past character pairs not in the needle.  */
+      do
+       {
+         hs += m1;
+         tmp = shift[hash2 (hs)];
+       }
+      while (tmp == 0 && hs <= end);
+
+      /* If the match is not at the end of the needle, shift to the end
+        and continue until we match the last 2 characters.  */
+      hs -= tmp;
+      if (tmp < m1)
+       continue;
+
+      if (m1 <= 15 || memcmp (hs + offset, ne + offset, sizeof (long)) == 0)
+       {
+         if (memcmp (hs, ne, m1) == 0)
+           return (void *) hs;
+
+         /* Adjust filter offset when it doesn't find the mismatch.  */
+         offset = (offset >= sizeof (long) ? offset : m1) - sizeof (long);
+       }
+
+      /* Skip based on matching the last 2 characters.  */
+      hs += shift1;
     }
-  else
-    return two_way_long_needle (haystack, haystack_len, needle, needle_len);
+  return NULL;
 }
 libc_hidden_def (__memmem)
 weak_alias (__memmem, memmem)
 libc_hidden_weak (memmem)
-
-#undef LONG_NEEDLE_THRESHOLD
diff --git a/string/str-two-way.h b/string/str-two-way.h
index 31c3f18fb057cdd999c3ac9e9d894a8b62a98a70..5a800e0eaf1c7505a9340a7aabd149326958df4a 100644
--- a/string/str-two-way.h
+++ b/string/str-two-way.h
@@ -383,7 +383,7 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
    sublinear performance is not possible.  */
-static RETURN_TYPE
+__attribute__((noinline)) static RETURN_TYPE
 two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
                      const unsigned char *needle, size_t needle_len)
 {
Wilco Dijkstra June 10, 2019, 6:31 p.m. UTC | #7
v2: Update comments after review.

This patch significantly improves performance of memmem using a novel
modified Horspool algorithm.  Needles up to size 256 use a bad-character
table indexed by hashed pairs of characters to quickly skip past mismatches.
Long needles use a self-adapting filtering step to avoid comparing the whole
needle repeatedly.

By limiting the needle length to 256, the shift table only requires 8 bits
per entry, lowering preprocessing overhead and minimizing cache effects.
This limit also implies worst-case performance is linear.

Small needles up to size 2 use a dedicated linear search.  Very long needles
use the Two-Way algorithm (to avoid increasing stack size or slowing down
the common case, inlining is disabled).

The performance gain is 6.6 times on English text on AArch64 using random
needles with average size 8 (this is even faster than the recently improved strstr
algorithm, so I'll update that in the near future).

Tested against GLIBC testsuite and randomized tests. OK for commit?

ChangeLog:
2019-06-10  Wilco Dijkstra  <wdijkstr@arm.com>

	* string/memmem.c (__memmem): Rewrite to improve performance.

---
diff --git a/string/memmem.c b/string/memmem.c
index 4bf733f1f03cb27c289bd6dc61590909bb0eefdf..83ee75e8c74f7fd56e8e86d1941db56ef296e135 100644
--- a/string/memmem.c
+++ b/string/memmem.c
@@ -15,17 +15,13 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-/* This particular implementation was written by Eric Blake, 2008.  */
-
 #ifndef _LIBC
 # include <config.h>
 #endif
 
-/* Specification of memmem.  */
 #include <string.h>
 
 #ifndef _LIBC
-# define __builtin_expect(expr, val)   (expr)
 # define __memmem	memmem
 #endif
 
@@ -36,51 +32,98 @@
 
 #undef memmem
 
-/* Return the first occurrence of NEEDLE in HAYSTACK.  Return HAYSTACK
-   if NEEDLE_LEN is 0, otherwise NULL if NEEDLE is not found in
-   HAYSTACK.  */
+/* Hash character pairs so a small shift table can be used.  All bits of
+   p[0] are included, but not all bits from p[-1].  So if two equal hashes
+   match on p[-1], p[0] matches too.  Hash collisions are harmless and result
+   in smaller shifts.  */
+#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift))
+
+/* Fast memmem algorithm with guaranteed linear-time performance.
+   Small needles up to size 2 use a dedicated linear search.  Longer needles
+   up to size 256 use a novel modified Horspool algorithm.  It hashes pairs
+   of characters to quickly skip past mismatches.  The main search loop only
+   exits if the last 2 characters match, avoiding unnecessary calls to memcmp
+   and allowing for a larger skip if there is no match.  A self-adapting
+   filtering check is used to quickly detect mismatches in long needles.
+   By limiting the needle length to 256, the shift table can be reduced to 8
+   bits per entry, lowering preprocessing overhead and minimizing cache effects.
+   The limit also implies worst-case performance is linear.
+   Needles larger than 256 characters use the linear-time Two-Way algorithm.  */
 void *
-__memmem (const void *haystack_start, size_t haystack_len,
-	  const void *needle_start, size_t needle_len)
+__memmem (const void *haystack, size_t hs_len,
+	  const void *needle, size_t ne_len)
 {
-  /* Abstract memory is considered to be an array of 'unsigned char' values,
-     not an array of 'char' values.  See ISO C 99 section 6.2.6.1.  */
-  const unsigned char *haystack = (const unsigned char *) haystack_start;
-  const unsigned char *needle = (const unsigned char *) needle_start;
-
-  if (needle_len == 0)
-    /* The first occurrence of the empty string is deemed to occur at
-       the beginning of the string.  */
-    return (void *) haystack;
-
-  /* Sanity check, otherwise the loop might search through the whole
-     memory.  */
-  if (__glibc_unlikely (haystack_len < needle_len))
+  const unsigned char *hs = (const unsigned char *) haystack;
+  const unsigned char *ne = (const unsigned char *) needle;
+
+  if (ne_len == 0)
+    return (void *) hs;
+  if (ne_len == 1)
+    return (void *) memchr (hs, ne[0], hs_len);
+
+  /* Ensure haystack length is >= needle length.  */
+  if (hs_len < ne_len)
     return NULL;
 
-  /* Use optimizations in memchr when possible, to reduce the search
-     size of haystack using a linear algorithm with a smaller
-     coefficient.  However, avoid memchr for long needles, since we
-     can often achieve sublinear performance.  */
-  if (needle_len < LONG_NEEDLE_THRESHOLD)
+  const unsigned char *end = hs + hs_len - ne_len;
+
+  if (ne_len == 2)
+    {
+      uint32_t nw = ne[0] << 16 | ne[1], hw = hs[0] << 16 | hs[1];
+      for (hs++; hs <= end && hw != nw; )
+	hw = hw << 16 | *++hs;
+      return hw == nw ? (void *)hs - 1 : NULL;
+    }
+
+  /* Use Two-Way algorithm for very long needles.  */
+  if (__builtin_expect (ne_len > 256, 0))
+    return two_way_long_needle (hs, hs_len, ne, ne_len);
+
+  uint8_t shift[256];
+  size_t tmp, shift1;
+  size_t m1 = ne_len - 1;
+  size_t offset = 0;
+
+  memset (shift, 0, sizeof (shift));
+  for (int i = 1; i < m1; i++)
+    shift[hash2 (ne + i)] = i;
+  /* Shift1 is the amount we can skip after matching the hash of the
+     needle end but not the full needle.  */
+  shift1 = m1 - shift[hash2 (ne + m1)];
+  shift[hash2 (ne + m1)] = m1;
+
+  for ( ; hs <= end; )
     {
-      haystack = memchr (haystack, *needle, haystack_len);
-      if (!haystack || __builtin_expect (needle_len == 1, 0))
-	return (void *) haystack;
-      haystack_len -= haystack - (const unsigned char *) haystack_start;
-      if (haystack_len < needle_len)
-	return NULL;
-      /* Check whether we have a match.  This improves performance since we
-	 avoid the initialization overhead of the two-way algorithm.  */
-      if (memcmp (haystack, needle, needle_len) == 0)
-	return (void *) haystack;
-      return two_way_short_needle (haystack, haystack_len, needle, needle_len);
+      /* Skip past character pairs not in the needle.  */
+      do
+	{
+	  hs += m1;
+	  tmp = shift[hash2 (hs)];
+	}
+      while (tmp == 0 && hs <= end);
+
+      /* If the match is not at the end of the needle, shift to the end
+	 and continue until we match the hash of the needle end.  */
+      hs -= tmp;
+      if (tmp < m1)
+	continue;
+
+      /* Hash of the last 2 characters matches.  If the needle is long,
+	 try to quickly filter out mismatches.  */
+      if (m1 < 15 || memcmp (hs + offset, ne + offset, 8) == 0)
+	{
+	  if (memcmp (hs, ne, m1) == 0)
+	    return (void *) hs;
+
+	  /* Adjust filter offset when it doesn't find the mismatch.  */
+	  offset = (offset >= 8 ? offset : m1) - 8;
+	}
+
+      /* Skip based on matching the hash of the needle end.  */
+      hs += shift1;
     }
-  else
-    return two_way_long_needle (haystack, haystack_len, needle, needle_len);
+  return NULL;
 }
 libc_hidden_def (__memmem)
 weak_alias (__memmem, memmem)
 libc_hidden_weak (memmem)
-
-#undef LONG_NEEDLE_THRESHOLD
Szabolcs Nagy June 11, 2019, 9:30 a.m. UTC | #8
On 10/06/2019 19:31, Wilco Dijkstra wrote:
> v2: Update comments after review.
> 
> This patch significantly improves performance of memmem using a novel
> modified Horspool algorithm.  Needles up to size 256 use a bad-character
> table indexed by hashed pairs of characters to quickly skip past mismatches.
> Long needles use a self-adapting filtering step to avoid comparing the whole
> needle repeatedly.
> 
> By limiting the needle length to 256, the shift table only requires 8 bits
> per entry, lowering preprocessing overhead and minimizing cache effects.
> This limit also implies worst-case performance is linear.
> 
> Small needles up to size 2 use a dedicated linear search.  Very long needles
> use the Two-Way algorithm (to avoid increasing stack size or slowing down
> the common case, inlining is disabled).
> 
> The performance gain is 6.6 times on English text on AArch64 using random
> needles with average size 8 (this is even faster than the recently improved strstr
> algorithm, so I'll update that in the near future).

the comment about strstr is no longer relevant.

> 
> Tested against GLIBC testsuite and randomized tests. OK for commit?
> 
> ChangeLog:
> 2019-06-10  Wilco Dijkstra  <wdijkstr@arm.com>
> 
> 	* string/memmem.c (__memmem): Rewrite to improve performance.
> 

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>

i only had one comment below if that's addressed
then i think it's ready to commit.

(but i think you should wait a day in case there
are further comments on this latest version.)

> +  for ( ; hs <= end; )
>      {
> -      haystack = memchr (haystack, *needle, haystack_len);
> -      if (!haystack || __builtin_expect (needle_len == 1, 0))
> -	return (void *) haystack;
> -      haystack_len -= haystack - (const unsigned char *) haystack_start;
> -      if (haystack_len < needle_len)
> -	return NULL;
> -      /* Check whether we have a match.  This improves performance since we
> -	 avoid the initialization overhead of the two-way algorithm.  */
> -      if (memcmp (haystack, needle, needle_len) == 0)
> -	return (void *) haystack;
> -      return two_way_short_needle (haystack, haystack_len, needle, needle_len);
> +      /* Skip past character pairs not in the needle.  */
> +      do
> +	{
> +	  hs += m1;
> +	  tmp = shift[hash2 (hs)];
> +	}
> +      while (tmp == 0 && hs <= end);

i noticed that the check here is in different
order than in strstr, i wonder if that's deliberate.

if either way is fine i'd prefer to have the same
logic in strstr and memmem.

> +
> +      /* If the match is not at the end of the needle, shift to the end
> +	 and continue until we match the hash of the needle end.  */
> +      hs -= tmp;
> +      if (tmp < m1)
> +	continue;
> +
> +      /* Hash of the last 2 characters matches.  If the needle is long,
> +	 try to quickly filter out mismatches.  */
> +      if (m1 < 15 || memcmp (hs + offset, ne + offset, 8) == 0)
> +	{
> +	  if (memcmp (hs, ne, m1) == 0)
> +	    return (void *) hs;
> +
> +	  /* Adjust filter offset when it doesn't find the mismatch.  */
> +	  offset = (offset >= 8 ? offset : m1) - 8;
> +	}
> +
> +      /* Skip based on matching the hash of the needle end.  */
> +      hs += shift1;
>      }

Patch
diff mbox series

diff --git a/string/memmem.c b/string/memmem.c
index d72b8249e62a744c2d031c9ccb0157f141df641f..150777800456075d981bd589e5f00388e9aa54f0 100644
--- a/string/memmem.c
+++ b/string/memmem.c
@@ -15,17 +15,13 @@ 
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-/* This particular implementation was written by Eric Blake, 2008.  */
-
 #ifndef _LIBC
 # include <config.h>
 #endif
 
-/* Specification of memmem.  */
 #include <string.h>
 
 #ifndef _LIBC
-# define __builtin_expect(expr, val)   (expr)
 # define __memmem	memmem
 #endif
 
@@ -36,51 +32,90 @@ 
 
 #undef memmem
 
-/* Return the first occurrence of NEEDLE in HAYSTACK.  Return HAYSTACK
-   if NEEDLE_LEN is 0, otherwise NULL if NEEDLE is not found in
-   HAYSTACK.  */
+#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift))
+
+/* Fast memmem algorithm with guaranteed linear-time performance.
+   Small needles up to size 2 use a dedicated linear search.  Longer needles
+   up to size 256 use a novel modified Horspool algorithm.  It hashes pairs
+   of characters to quickly skip past mismatches.  The main search loop only
+   exits if the last 2 characters match, avoiding unnecessary calls to memcmp
+   and allowing for a larger skip if there is no match.  A self-adapting
+   filtering check is used to quickly detect mismatches in long needles.
+   By limiting the needle length to 256, the shift table can be reduced to 8
+   bits per entry, lowering preprocessing overhead and minimizing cache effects.
+   The limit also implies worst-case performance is linear.
+   Needles larger than 256 characters use the linear-time Two-Way algorithm.  */
 void *
-__memmem (const void *haystack_start, size_t haystack_len,
-	  const void *needle_start, size_t needle_len)
+__memmem (const void *haystack, size_t hs_len,
+	  const void *needle, size_t ne_len)
 {
-  /* Abstract memory is considered to be an array of 'unsigned char' values,
-     not an array of 'char' values.  See ISO C 99 section 6.2.6.1.  */
-  const unsigned char *haystack = (const unsigned char *) haystack_start;
-  const unsigned char *needle = (const unsigned char *) needle_start;
-
-  if (needle_len == 0)
-    /* The first occurrence of the empty string is deemed to occur at
-       the beginning of the string.  */
-    return (void *) haystack;
-
-  /* Sanity check, otherwise the loop might search through the whole
-     memory.  */
-  if (__glibc_unlikely (haystack_len < needle_len))
+  const unsigned char *hs = (const unsigned char *) haystack;
+  const unsigned char *ne = (const unsigned char *) needle;
+
+  if (ne_len == 0)
+    return (void *) hs;
+  if (ne_len == 1)
+    return (void *) memchr (hs, ne[0], hs_len);
+
+  /* Ensure haystack length is >= needle length.  */
+  if (hs_len < ne_len)
     return NULL;
 
-  /* Use optimizations in memchr when possible, to reduce the search
-     size of haystack using a linear algorithm with a smaller
-     coefficient.  However, avoid memchr for long needles, since we
-     can often achieve sublinear performance.  */
-  if (needle_len < LONG_NEEDLE_THRESHOLD)
+  const unsigned char *end = hs + hs_len - ne_len;
+
+  if (ne_len == 2)
+    {
+      uint32_t nw = ne[0] << 16 | ne[1], hw = hs[0] << 16 | hs[1];
+      for (hs++; hs <= end && hw != nw; )
+	hw = hw << 16 | *++hs;
+      return hw == nw ? (void *)hs - 1 : NULL;
+    }
+
+  /* Use Two-Way algorithm for very long needles.  */
+  if (__builtin_expect (ne_len > 256, 0))
+    return two_way_long_needle (hs, hs_len, ne, ne_len);
+
+  uint8_t shift[256];
+  size_t tmp, shift1;
+  size_t m1 = ne_len - 1;
+  size_t offset = 0;
+
+  memset (shift, 0, sizeof (shift));
+  for (int i = 1; i < m1; i++)
+    shift[hash2 (ne + i)] = i;
+  shift1 = m1 - shift[hash2 (ne + m1)];
+  shift[hash2 (ne + m1)] = m1;
+
+  for ( ; hs <= end; )
     {
-      haystack = memchr (haystack, *needle, haystack_len);
-      if (!haystack || __builtin_expect (needle_len == 1, 0))
-	return (void *) haystack;
-      haystack_len -= haystack - (const unsigned char *) haystack_start;
-      if (haystack_len < needle_len)
-	return NULL;
-      /* Check whether we have a match.  This improves performance since we
-	 avoid the initialization overhead of the two-way algorithm.  */
-      if (memcmp (haystack, needle, needle_len) == 0)
-	return (void *) haystack;
-      return two_way_short_needle (haystack, haystack_len, needle, needle_len);
+      /* Skip past character pairs not in the needle.  */
+      do
+	{
+	  hs += m1;
+	  tmp = shift[hash2 (hs)];
+	}
+      while (tmp == 0 && hs <= end);
+
+      /* If the match is not at the end of the needle, shift to the end
+	 and continue until we match the last 2 characters.  */
+      hs -= tmp;
+      if (tmp < m1)
+	continue;
+
+      if (m1 <= 15 || memcmp (hs + offset, ne + offset, sizeof (long)) == 0)
+	{
+	  if (memcmp (hs, ne, m1) == 0)
+	    return (void *) hs;
+
+	  /* Adjust filter offset when it doesn't find the mismatch.  */
+	  offset = (offset >= sizeof (long) ? offset : m1) - sizeof (long);
+	}
+
+      /* Skip based on matching the last 2 characters.  */
+      hs += shift1;
     }
-  else
-    return two_way_long_needle (haystack, haystack_len, needle, needle_len);
+  return NULL;
 }
 libc_hidden_def (__memmem)
 weak_alias (__memmem, memmem)
 libc_hidden_weak (memmem)
-
-#undef LONG_NEEDLE_THRESHOLD
diff --git a/string/str-two-way.h b/string/str-two-way.h
index 31c3f18fb057cdd999c3ac9e9d894a8b62a98a70..5a800e0eaf1c7505a9340a7aabd149326958df4a 100644
--- a/string/str-two-way.h
+++ b/string/str-two-way.h
@@ -383,7 +383,7 @@  two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
    sublinear performance is not possible.  */
-static RETURN_TYPE
+__attribute__((noinline)) static RETURN_TYPE
 two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
 		     const unsigned char *needle, size_t needle_len)
 {