Improve string benchtests

Message ID DB5PR08MB10302F67370DDC84C904006383B60@DB5PR08MB1030.eurprd08.prod.outlook.com
State New
Headers show
Series
  • Improve string benchtests
Related show

Commit Message

Wilco Dijkstra Dec. 27, 2018, 6:26 p.m.
Replace slow byte-oriented tests in several string benchmarks with the
generic implementations from the string/ directory so the comparisons
are more realistic and useful.

OK to commit?

ChangeLog:
2018-12-27  Wilco Dijkstra  <wdijkstr@arm.com>

        * benchtests/bench-stpcpy.c (SIMPLE_STPCPY): Remove function.
        (generic_stpcpy): New function.
        * benchtests/bench-stpncpy.c (SIMPLE_STPNCPY): Remove function.
        (generic_stpncpy): New function.
        * benchtests/bench-strcat.c (SIMPLE_STRCAT): Remove function.
        (generic_strcat): New function.
        * benchtests/bench-strcpy.c (SIMPLE_STRCPY): Remove function.
        (generic_strcpy): New function.
        * benchtests/bench-strncat.c (SIMPLE_STRNCAT): Remove function.
        (STUPID_STRNCAT): Remove function.
        (generic_strncat): New function.
        * benchtests/bench-strncpy.c (SIMPLE_STRNCPY): Remove function.
        (STUPID_STRNCPY): Remove function.
        (generic_strncpy): New function.
        * benchtests/bench-strnlen.c (SIMPLE_STRNLEN): Remove function.
        (generic_strnlen): New function.
        (memchr_strnlen): New function.

--

Comments

Wilco Dijkstra Feb. 1, 2019, 3:36 p.m. | #1
ping




From: Wilco Dijkstra
Sent: 27 December 2018 18:26
To: 'GNU C Library'
Cc: nd
Subject: [PATCH] Improve string benchtests
  

Replace slow byte-oriented tests in several string benchmarks with the
generic implementations from the string/ directory so the comparisons
are more realistic and useful.

OK to commit?

ChangeLog:
2018-12-27  Wilco Dijkstra  <wdijkstr@arm.com>

        * benchtests/bench-stpcpy.c (SIMPLE_STPCPY): Remove function.
        (generic_stpcpy): New function.
        * benchtests/bench-stpncpy.c (SIMPLE_STPNCPY): Remove function.
        (generic_stpncpy): New function.
        * benchtests/bench-strcat.c (SIMPLE_STRCAT): Remove function.
        (generic_strcat): New function.
        * benchtests/bench-strcpy.c (SIMPLE_STRCPY): Remove function.
        (generic_strcpy): New function.
        * benchtests/bench-strncat.c (SIMPLE_STRNCAT): Remove function.
        (STUPID_STRNCAT): Remove function.
        (generic_strncat): New function.
        * benchtests/bench-strncpy.c (SIMPLE_STRNCPY): Remove function.
        (STUPID_STRNCPY): Remove function.
        (generic_strncpy): New function.
        * benchtests/bench-strnlen.c (SIMPLE_STRNLEN): Remove function.
        (generic_strnlen): New function.
        (memchr_strnlen): New function.

--
diff --git a/benchtests/bench-stpcpy.c b/benchtests/bench-stpcpy.c
index 77ffb0f5c22b8d8ba83d46e903a23c90d8c983c1..7982c1886bb0af680b00757d8383b2ef84503533 100644
--- a/benchtests/bench-stpcpy.c
+++ b/benchtests/bench-stpcpy.c
@@ -24,22 +24,15 @@
 # define TEST_NAME "wcpcpy"
 #endif /* WIDE */
 #include "bench-string.h"
-#ifndef WIDE
-# define SIMPLE_STPCPY simple_stpcpy
-#else
-# define SIMPLE_STPCPY simple_wcpcpy
-#endif /* WIDE */
-
-CHAR *SIMPLE_STPCPY (CHAR *, const CHAR *);
-
-IMPL (SIMPLE_STPCPY, 0)
-IMPL (STPCPY, 1)
 
 CHAR *
-SIMPLE_STPCPY (CHAR *dst, const CHAR *src)
+generic_stpcpy (CHAR *dst, const CHAR *src)
 {
-  while ((*dst++ = *src++) != '\0');
-  return dst - 1;
+  size_t len = STRLEN (src);
+  return (CHAR*)MEMCPY (dst, src, len + 1) + len;
 }
 
+IMPL (STPCPY, 1)
+IMPL (generic_stpcpy, 0)
+
 #include "bench-strcpy.c"
diff --git a/benchtests/bench-stpncpy.c b/benchtests/bench-stpncpy.c
index 40c82cf716e6d7bc8fcb8d5a390da1ee8fee3245..331f5e67b7fc054bd678dbd264a24e43f940c0ed 100644
--- a/benchtests/bench-stpncpy.c
+++ b/benchtests/bench-stpncpy.c
@@ -24,47 +24,19 @@
 # define TEST_NAME "wcpncpy"
 #endif /* WIDE */
 #include "bench-string.h"
-#ifndef WIDE
-# define SIMPLE_STPNCPY simple_stpncpy
-# define STUPID_STPNCPY stupid_stpncpy
-#else
-# define SIMPLE_STPNCPY simple_wcpncpy
-# define STUPID_STPNCPY stupid_wcpncpy
-#endif /* WIDE */
-
-CHAR *SIMPLE_STPNCPY (CHAR *, const CHAR *, size_t);
-CHAR *STUPID_STPNCPY (CHAR *, const CHAR *, size_t);
-
-IMPL (STUPID_STPNCPY, 0)
-IMPL (SIMPLE_STPNCPY, 0)
-IMPL (STPNCPY, 1)
-
-CHAR *
-SIMPLE_STPNCPY (CHAR *dst, const CHAR *src, size_t n)
-{
-  while (n--)
-    if ((*dst++ = *src++) == '\0')
-      {
-       size_t i;
-
-       for (i = 0; i < n; ++i)
-         dst[i] = '\0';
-       return dst - 1;
-      }
-  return dst;
-}
 
 CHAR *
-STUPID_STPNCPY (CHAR *dst, const CHAR *src, size_t n)
+generic_stpncpy (CHAR *dst, const CHAR *src, size_t n)
 {
   size_t nc = STRNLEN (src, n);
-  size_t i;
-
-  for (i = 0; i < nc; ++i)
-    dst[i] = src[i];
-  for (; i < n; ++i)
-    dst[i] = '\0';
-  return dst + nc;
+  MEMCPY (dst, src, nc);
+  dst += nc;
+  if (nc == n)
+    return dst;
+  return MEMSET (dst, 0, n - nc);
 }
 
+IMPL (STPNCPY, 1)
+IMPL (generic_stpncpy, 0)
+
 #include "bench-strncpy.c"
diff --git a/benchtests/bench-strcat.c b/benchtests/bench-strcat.c
index 6b3b084ae19a931cb2bda07794380b5745ccfc86..d3e96f4ec6331bea0027016eb6af4a276d0e6714 100644
--- a/benchtests/bench-strcat.c
+++ b/benchtests/bench-strcat.c
@@ -28,31 +28,25 @@
 
 #ifndef WIDE
 # define sfmt "s"
-# define SIMPLE_STRCAT simple_strcat
 # define SMALL_CHAR 127
 #else
 # define sfmt "ls"
-# define SIMPLE_STRCAT simple_wcscat
 # define SMALL_CHAR 1273
 #endif /* WIDE */
 
 
 typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
-CHAR *SIMPLE_STRCAT (CHAR *, const CHAR *);
-
-IMPL (SIMPLE_STRCAT, 0)
-IMPL (STRCAT, 1)
 
 CHAR *
-SIMPLE_STRCAT (CHAR *dst, const CHAR *src)
+generic_strcat (CHAR *dst, const CHAR *src)
 {
-  CHAR *ret = dst;
-  while (*dst++ != '\0');
-  --dst;
-  while ((*dst++ = *src++) != '\0');
-  return ret;
+  STRCPY (dst + STRLEN (dst), src);
+  return dst;
 }
 
+IMPL (STRCAT, 1)
+IMPL (generic_strcat, 0)
+
 static void
 do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
 {
diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c
index e5fd27ffba9ca6dd871e658d1cffc22f28047e49..426ee8f79ad8400b5ceddad5d6cdfc3523f49f16 100644
--- a/benchtests/bench-strcpy.c
+++ b/benchtests/bench-strcpy.c
@@ -34,25 +34,17 @@
 # else
 #  define TEST_NAME "wcscpy"
 # endif
-# include "bench-string.h"
-# ifndef WIDE
-#  define SIMPLE_STRCPY simple_strcpy
-# else
-#  define SIMPLE_STRCPY simple_wcscpy
-# endif
-
-CHAR *SIMPLE_STRCPY (CHAR *, const CHAR *);
-
-IMPL (SIMPLE_STRCPY, 0)
-IMPL (STRCPY, 1)
+#include "bench-string.h"
 
 CHAR *
-SIMPLE_STRCPY (CHAR *dst, const CHAR *src)
+generic_strcpy (CHAR *dst, const CHAR *src)
 {
-  CHAR *ret = dst;
-  while ((*dst++ = *src++) != '\0');
-  return ret;
+  return MEMCPY (dst, src, STRLEN (src) + 1);
 }
+
+IMPL (STRCPY, 1)
+IMPL (generic_strcpy, 0)
+
 #endif
 
 typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
diff --git a/benchtests/bench-strncat.c b/benchtests/bench-strncat.c
index 7e0112273ba0727ae29407e38724c39159ce1c93..da8318e9a24e50ca491b65e092cc5152b3c47715 100644
--- a/benchtests/bench-strncat.c
+++ b/benchtests/bench-strncat.c
@@ -27,35 +27,26 @@
 #define BIG_CHAR MAX_CHAR
 
 #ifndef WIDE
-# define SIMPLE_STRNCAT simple_strncat
-# define STUPID_STRNCAT stupid_strncat
 # define SMALL_CHAR 127
 #else
-# define SIMPLE_STRNCAT simple_wcsncat
-# define STUPID_STRNCAT stupid_wcsncat
 # define SMALL_CHAR 1273
 #endif /* WIDE */
 
 typedef CHAR *(*proto_t) (CHAR *, const CHAR *, size_t);
-CHAR *STUPID_STRNCAT (CHAR *, const CHAR *, size_t);
-CHAR *SIMPLE_STRNCAT (CHAR *, const CHAR *, size_t);
-
-IMPL (STUPID_STRNCAT, 0)
-IMPL (STRNCAT, 2)
 
 CHAR *
-STUPID_STRNCAT (CHAR *dst, const CHAR *src, size_t n)
+generic_strncat (CHAR *dst, const CHAR *src, size_t n)
 {
-  CHAR *ret = dst;
-  while (*dst++ != '\0');
-  --dst;
-  while (n--)
-    if ((*dst++ = *src++) == '\0')
-      return ret;
-  *dst = '\0';
-  return ret;
+  CHAR *end = dst + STRLEN (dst);
+  n = STRNLEN (src, n);
+  end[n] = 0;
+  MEMCPY (end, src, n);
+  return dst;
 }
 
+IMPL (STRNCAT, 2)
+IMPL (generic_strncat, 0)
+
 static void
 do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
 {
diff --git a/benchtests/bench-strncpy.c b/benchtests/bench-strncpy.c
index 79953759bfbd7201b9e3a846b59666375cb76348..e32a1519cd95af5456277e765813eb78211a3a2a 100644
--- a/benchtests/bench-strncpy.c
+++ b/benchtests/bench-strncpy.c
@@ -33,47 +33,19 @@
 #  define TEST_NAME "wcsncpy"
 # endif /* WIDE */
 # include "bench-string.h"
-# ifndef WIDE
-#  define SIMPLE_STRNCPY simple_strncpy
-#  define STUPID_STRNCPY stupid_strncpy
-# else
-#  define SIMPLE_STRNCPY simple_wcsncpy
-#  define STUPID_STRNCPY stupid_wcsncpy
-# endif /* WIDE */
-
-CHAR *SIMPLE_STRNCPY (CHAR *, const CHAR *, size_t);
-CHAR *STUPID_STRNCPY (CHAR *, const CHAR *, size_t);
-
-IMPL (STUPID_STRNCPY, 0)
-IMPL (SIMPLE_STRNCPY, 0)
-IMPL (STRNCPY, 1)
 
 CHAR *
-SIMPLE_STRNCPY (CHAR *dst, const CHAR *src, size_t n)
+generic_strncpy (CHAR *dst, const CHAR *src, size_t n)
 {
-  CHAR *ret = dst;
-  while (n--)
-    if ((*dst++ = *src++) == '\0')
-      {
-       while (n--)
-         *dst++ = '\0';
-       return ret;
-      }
-  return ret;
+  size_t nc = STRNLEN (src, n);
+  if (nc != n)
+    MEMSET (dst + nc, 0, n - nc);
+  return MEMCPY (dst, src, nc);
 }
 
-CHAR *
-STUPID_STRNCPY (CHAR *dst, const CHAR *src, size_t n)
-{
-  size_t nc = STRNLEN (src, n);
-  size_t i;
+IMPL (STRNCPY, 1)
+IMPL (generic_strncpy, 0)
 
-  for (i = 0; i < nc; ++i)
-    dst[i] = src[i];
-  for (; i < n; ++i)
-    dst[i] = '\0';
-  return dst;
-}
 #endif /* !STRNCPY_RESULT */
 
 typedef CHAR *(*proto_t) (CHAR *, const CHAR *, size_t);
diff --git a/benchtests/bench-strnlen.c b/benchtests/bench-strnlen.c
index 8a7641669a36603e3943b5ca3b9cb0f648dd18f0..8a0fabfcf6c4223dceab3a8eb581851c6ab6be5d 100644
--- a/benchtests/bench-strnlen.c
+++ b/benchtests/bench-strnlen.c
@@ -28,27 +28,24 @@
 
 #ifndef WIDE
 # define MIDDLE_CHAR 127
-# define SIMPLE_STRNLEN simple_strnlen
 #else
 # define MIDDLE_CHAR 1121
-# define SIMPLE_STRNLEN simple_wcsnlen
 #endif /* WIDE */
 
 typedef size_t (*proto_t) (const CHAR *, size_t);
-size_t SIMPLE_STRNLEN (const CHAR *, size_t);
-
-IMPL (SIMPLE_STRNLEN, 0)
-IMPL (STRNLEN, 1)
+size_t generic_strnlen (const CHAR *, size_t);
 
 size_t
-SIMPLE_STRNLEN (const CHAR *s, size_t maxlen)
+memchr_strnlen (const CHAR *s, size_t maxlen)
 {
-  size_t i;
-
-  for (i = 0; i < maxlen && s[i]; ++i);
-  return i;
+  const CHAR *s1 = MEMCHR (s, 0, maxlen);
+  return (s1 == NULL) ? maxlen : s1 - s;
 }
 
+IMPL (STRNLEN, 1)
+IMPL (memchr_strnlen, 0)
+IMPL (generic_strnlen, 0)
+
 static void
 do_one_test (impl_t *impl, const CHAR *s, size_t maxlen, size_t exp_len)
 {
@@ -146,3 +143,13 @@ test_main (void)
 }
 
 #include <support/test-driver.c>
+
+#define libc_hidden_def(X)
+#ifndef WIDE
+# undef STRNLEN
+# define STRNLEN generic_strnlen
+# include <string/strnlen.c>
+#else
+# define WCSNLEN generic_strnlen
+# include <wcsmbs/wcsnlen.c>
+#endif
Adhemerval Zanella Feb. 4, 2019, 7:35 p.m. | #2
On 27/12/2018 16:26, Wilco Dijkstra wrote:
> Replace slow byte-oriented tests in several string benchmarks with the
> generic implementations from the string/ directory so the comparisons
> are more realistic and useful.
> 
> OK to commit?
> 
> ChangeLog:
> 2018-12-27  Wilco Dijkstra  <wdijkstr@arm.com>
> 
>         * benchtests/bench-stpcpy.c (SIMPLE_STPCPY): Remove function.
>         (generic_stpcpy): New function.
>         * benchtests/bench-stpncpy.c (SIMPLE_STPNCPY): Remove function.
>         (generic_stpncpy): New function.
>         * benchtests/bench-strcat.c (SIMPLE_STRCAT): Remove function.
>         (generic_strcat): New function.
>         * benchtests/bench-strcpy.c (SIMPLE_STRCPY): Remove function.
>         (generic_strcpy): New function.
>         * benchtests/bench-strncat.c (SIMPLE_STRNCAT): Remove function.
>         (STUPID_STRNCAT): Remove function.
>         (generic_strncat): New function.
>         * benchtests/bench-strncpy.c (SIMPLE_STRNCPY): Remove function.
>         (STUPID_STRNCPY): Remove function.
>         (generic_strncpy): New function.
>         * benchtests/bench-strnlen.c (SIMPLE_STRNLEN): Remove function.
>         (generic_strnlen): New function.
>         (memchr_strnlen): New function.
> 
> --
> diff --git a/benchtests/bench-stpcpy.c b/benchtests/bench-stpcpy.c
> index 77ffb0f5c22b8d8ba83d46e903a23c90d8c983c1..7982c1886bb0af680b00757d8383b2ef84503533 100644
> --- a/benchtests/bench-stpcpy.c
> +++ b/benchtests/bench-stpcpy.c
> @@ -24,22 +24,15 @@
>  # define TEST_NAME "wcpcpy"
>  #endif /* WIDE */
>  #include "bench-string.h"
> -#ifndef WIDE
> -# define SIMPLE_STPCPY simple_stpcpy
> -#else
> -# define SIMPLE_STPCPY simple_wcpcpy
> -#endif /* WIDE */
> -
> -CHAR *SIMPLE_STPCPY (CHAR *, const CHAR *);
> -
> -IMPL (SIMPLE_STPCPY, 0)
> -IMPL (STPCPY, 1)
>  
>  CHAR *
> -SIMPLE_STPCPY (CHAR *dst, const CHAR *src)
> +generic_stpcpy (CHAR *dst, const CHAR *src)
>  {
> -  while ((*dst++ = *src++) != '\0');
> -  return dst - 1;
> +  size_t len = STRLEN (src);
> +  return (CHAR*)MEMCPY (dst, src, len + 1) + len;
>  }

Space after cast.  As a side note, this won't evaluate wstpcpy as-is, since it
will use an optimized version and it will result in a wrong generic name for
wcscpy.

What about:

---
diff --git a/benchtests/bench-stpcpy.c b/benchtests/bench-stpcpy.c
index 74ea54f..b73eba6 100644
--- a/benchtests/bench-stpcpy.c
+++ b/benchtests/bench-stpcpy.c
@@ -24,22 +24,19 @@
 # define TEST_NAME "wcpcpy"
 #endif /* WIDE */
 #include "bench-string.h"
-#ifndef WIDE
-# define SIMPLE_STPCPY simple_stpcpy
-#else
-# define SIMPLE_STPCPY simple_wcpcpy
-#endif /* WIDE */
 
-CHAR *SIMPLE_STPCPY (CHAR *, const CHAR *);
-
-IMPL (SIMPLE_STPCPY, 0)
 IMPL (STPCPY, 1)
 
-CHAR *
-SIMPLE_STPCPY (CHAR *dst, const CHAR *src)
-{
-  while ((*dst++ = *src++) != '\0');
-  return dst - 1;
-}
+#undef STPCPY
+#ifndef WIDE
+# define GENERIC_STPCPY generic_stpcpy
+# define STPCPY         GENERIC_STPCPY
+# include <string/stpcpy.c>
+#else
+# define GENERIC_STPCPY generic_wcpcpy
+# define WCPCPY         GENERIC_STPCPY
+# include <wcsmbs/wcpcpy.c>
+#endif
+IMPL (GENERIC_STPCPY, 0)
 
 #include "bench-strcpy.c"
diff --git a/string/stpcpy.c b/string/stpcpy.c
index 2e65988..728e64f 100644
--- a/string/stpcpy.c
+++ b/string/stpcpy.c
@@ -25,17 +25,20 @@
 #undef __stpcpy
 #undef stpcpy
 
-#ifndef STPCPY
-# define STPCPY __stpcpy
+#ifdef STPCPY
+# define __stpcpy STPCPY
 #endif
 
 /* Copy SRC to DEST, returning the address of the terminating '\0' in DEST.  */
 char *
-STPCPY (char *dest, const char *src)
+__stpcpy (char *dest, const char *src)
 {
   size_t len = strlen (src);
   return memcpy (dest, src, len + 1) + len;
 }
+
+#ifndef STPCPY
 weak_alias (__stpcpy, stpcpy)
 libc_hidden_def (__stpcpy)
 libc_hidden_builtin_def (stpcpy)
+#endif
---

And as a following cleanup for wcpcpy we can use the similar code for strcpy
adjusting for wide-chars:

wchar_t *
__wcpcpy (wchar_t *dest, const wchar_t *src)
{
  size_t len = wcslen (src) * sizeof (wchar_t);
  return memcpy (dest, src, len + sizeof (wchar_t)) + len;
}

And this allows remove the arch-specific m68k implementation as well.

>  
> +IMPL (STPCPY, 1)
> +IMPL (generic_stpcpy, 0)
> +
>  #include "bench-strcpy.c"
> diff --git a/benchtests/bench-stpncpy.c b/benchtests/bench-stpncpy.c
> index 40c82cf716e6d7bc8fcb8d5a390da1ee8fee3245..331f5e67b7fc054bd678dbd264a24e43f940c0ed 100644
> --- a/benchtests/bench-stpncpy.c
> +++ b/benchtests/bench-stpncpy.c
> @@ -24,47 +24,19 @@
>  # define TEST_NAME "wcpncpy"
>  #endif /* WIDE */
>  #include "bench-string.h"
> -#ifndef WIDE
> -# define SIMPLE_STPNCPY simple_stpncpy
> -# define STUPID_STPNCPY stupid_stpncpy
> -#else
> -# define SIMPLE_STPNCPY simple_wcpncpy
> -# define STUPID_STPNCPY stupid_wcpncpy
> -#endif /* WIDE */
> -
> -CHAR *SIMPLE_STPNCPY (CHAR *, const CHAR *, size_t);
> -CHAR *STUPID_STPNCPY (CHAR *, const CHAR *, size_t);
> -
> -IMPL (STUPID_STPNCPY, 0)
> -IMPL (SIMPLE_STPNCPY, 0)
> -IMPL (STPNCPY, 1)
> -
> -CHAR *
> -SIMPLE_STPNCPY (CHAR *dst, const CHAR *src, size_t n)
> -{
> -  while (n--)
> -    if ((*dst++ = *src++) == '\0')
> -      {
> -	size_t i;
> -
> -	for (i = 0; i < n; ++i)
> -	  dst[i] = '\0';
> -	return dst - 1;
> -      }
> -  return dst;
> -}
>  
>  CHAR *
> -STUPID_STPNCPY (CHAR *dst, const CHAR *src, size_t n)
> +generic_stpncpy (CHAR *dst, const CHAR *src, size_t n)
>  {
>    size_t nc = STRNLEN (src, n);
> -  size_t i;
> -
> -  for (i = 0; i < nc; ++i)
> -    dst[i] = src[i];
> -  for (; i < n; ++i)
> -    dst[i] = '\0';
> -  return dst + nc;
> +  MEMCPY (dst, src, nc);
> +  dst += nc;
> +  if (nc == n)
> +    return dst;
> +  return MEMSET (dst, 0, n - nc);
>  }
>  
> +IMPL (STPNCPY, 1)
> +IMPL (generic_stpncpy, 0)
> +
>  #include "bench-strncpy.c"

Same as before for wcpncpy: instead of reimplement the generic implementation
on benchtests we can just include them. And it also leads to an possible
optimization on generic implementation for wcpncpy.

> diff --git a/benchtests/bench-strcat.c b/benchtests/bench-strcat.c
> index 6b3b084ae19a931cb2bda07794380b5745ccfc86..d3e96f4ec6331bea0027016eb6af4a276d0e6714 100644
> --- a/benchtests/bench-strcat.c
> +++ b/benchtests/bench-strcat.c
> @@ -28,31 +28,25 @@
>  
>  #ifndef WIDE
>  # define sfmt "s"
> -# define SIMPLE_STRCAT simple_strcat
>  # define SMALL_CHAR 127
>  #else
>  # define sfmt "ls"
> -# define SIMPLE_STRCAT simple_wcscat
>  # define SMALL_CHAR 1273
>  #endif /* WIDE */
>  
>  
>  typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
> -CHAR *SIMPLE_STRCAT (CHAR *, const CHAR *);
> -
> -IMPL (SIMPLE_STRCAT, 0)
> -IMPL (STRCAT, 1)
>  
>  CHAR *
> -SIMPLE_STRCAT (CHAR *dst, const CHAR *src)
> +generic_strcat (CHAR *dst, const CHAR *src)
>  {
> -  CHAR *ret = dst;
> -  while (*dst++ != '\0');
> -  --dst;
> -  while ((*dst++ = *src++) != '\0');
> -  return ret;
> +  STRCPY (dst + STRLEN (dst), src);
> +  return dst;
>  }
>  
> +IMPL (STRCAT, 1)
> +IMPL (generic_strcat, 0)
> +
>  static void
>  do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
>  {

Same as before for wcscat.

> diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c
> index e5fd27ffba9ca6dd871e658d1cffc22f28047e49..426ee8f79ad8400b5ceddad5d6cdfc3523f49f16 100644
> --- a/benchtests/bench-strcpy.c
> +++ b/benchtests/bench-strcpy.c
> @@ -34,25 +34,17 @@
>  # else
>  #  define TEST_NAME "wcscpy"
>  # endif
> -# include "bench-string.h"
> -# ifndef WIDE
> -#  define SIMPLE_STRCPY simple_strcpy
> -# else
> -#  define SIMPLE_STRCPY simple_wcscpy
> -# endif
> -
> -CHAR *SIMPLE_STRCPY (CHAR *, const CHAR *);
> -
> -IMPL (SIMPLE_STRCPY, 0)
> -IMPL (STRCPY, 1)
> +#include "bench-string.h"
>  
>  CHAR *
> -SIMPLE_STRCPY (CHAR *dst, const CHAR *src)
> +generic_strcpy (CHAR *dst, const CHAR *src)
>  {
> -  CHAR *ret = dst;
> -  while ((*dst++ = *src++) != '\0');
> -  return ret;
> +  return MEMCPY (dst, src, STRLEN (src) + 1);
>  }
> +
> +IMPL (STRCPY, 1)
> +IMPL (generic_strcpy, 0)
> +
>  #endif
>  
>  typedef CHAR *(*proto_t) (CHAR *, const CHAR *);

Same as before for wcscpy.

> diff --git a/benchtests/bench-strncat.c b/benchtests/bench-strncat.c
> index 7e0112273ba0727ae29407e38724c39159ce1c93..da8318e9a24e50ca491b65e092cc5152b3c47715 100644
> --- a/benchtests/bench-strncat.c
> +++ b/benchtests/bench-strncat.c
> @@ -27,35 +27,26 @@
>  #define BIG_CHAR MAX_CHAR
>  
>  #ifndef WIDE
> -# define SIMPLE_STRNCAT simple_strncat
> -# define STUPID_STRNCAT stupid_strncat
>  # define SMALL_CHAR 127
>  #else
> -# define SIMPLE_STRNCAT simple_wcsncat
> -# define STUPID_STRNCAT stupid_wcsncat
>  # define SMALL_CHAR 1273
>  #endif /* WIDE */
>  
>  typedef CHAR *(*proto_t) (CHAR *, const CHAR *, size_t);
> -CHAR *STUPID_STRNCAT (CHAR *, const CHAR *, size_t);
> -CHAR *SIMPLE_STRNCAT (CHAR *, const CHAR *, size_t);
> -
> -IMPL (STUPID_STRNCAT, 0)
> -IMPL (STRNCAT, 2)
>  
>  CHAR *
> -STUPID_STRNCAT (CHAR *dst, const CHAR *src, size_t n)
> +generic_strncat (CHAR *dst, const CHAR *src, size_t n)
>  {
> -  CHAR *ret = dst;
> -  while (*dst++ != '\0');
> -  --dst;
> -  while (n--)
> -    if ((*dst++ = *src++) == '\0')
> -      return ret;
> -  *dst = '\0';
> -  return ret;
> +  CHAR *end = dst + STRLEN (dst);
> +  n = STRNLEN (src, n);
> +  end[n] = 0;
> +  MEMCPY (end, src, n);
> +  return dst;
>  }
>  
> +IMPL (STRNCAT, 2)
> +IMPL (generic_strncat, 0)
> +
>  static void
>  do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
>  {

Same as before for wcsncat.

> diff --git a/benchtests/bench-strncpy.c b/benchtests/bench-strncpy.c
> index 79953759bfbd7201b9e3a846b59666375cb76348..e32a1519cd95af5456277e765813eb78211a3a2a 100644
> --- a/benchtests/bench-strncpy.c
> +++ b/benchtests/bench-strncpy.c
> @@ -33,47 +33,19 @@
>  #  define TEST_NAME "wcsncpy"
>  # endif /* WIDE */
>  # include "bench-string.h"
> -# ifndef WIDE
> -#  define SIMPLE_STRNCPY simple_strncpy
> -#  define STUPID_STRNCPY stupid_strncpy
> -# else
> -#  define SIMPLE_STRNCPY simple_wcsncpy
> -#  define STUPID_STRNCPY stupid_wcsncpy
> -# endif /* WIDE */
> -
> -CHAR *SIMPLE_STRNCPY (CHAR *, const CHAR *, size_t);
> -CHAR *STUPID_STRNCPY (CHAR *, const CHAR *, size_t);
> -
> -IMPL (STUPID_STRNCPY, 0)
> -IMPL (SIMPLE_STRNCPY, 0)
> -IMPL (STRNCPY, 1)
>  
>  CHAR *
> -SIMPLE_STRNCPY (CHAR *dst, const CHAR *src, size_t n)
> +generic_strncpy (CHAR *dst, const CHAR *src, size_t n)
>  {
> -  CHAR *ret = dst;
> -  while (n--)
> -    if ((*dst++ = *src++) == '\0')
> -      {
> -	while (n--)
> -	  *dst++ = '\0';
> -	return ret;
> -      }
> -  return ret;
> +  size_t nc = STRNLEN (src, n);
> +  if (nc != n)
> +    MEMSET (dst + nc, 0, n - nc);
> +  return MEMCPY (dst, src, nc);
>  }
>  
> -CHAR *
> -STUPID_STRNCPY (CHAR *dst, const CHAR *src, size_t n)
> -{
> -  size_t nc = STRNLEN (src, n);
> -  size_t i;
> +IMPL (STRNCPY, 1)
> +IMPL (generic_strncpy, 0)
>  
> -  for (i = 0; i < nc; ++i)
> -    dst[i] = src[i];
> -  for (; i < n; ++i)
> -    dst[i] = '\0';
> -  return dst;
> -}
>  #endif /* !STRNCPY_RESULT */
>  
>  typedef CHAR *(*proto_t) (CHAR *, const CHAR *, size_t);

Same as before for wcsncpy.

> diff --git a/benchtests/bench-strnlen.c b/benchtests/bench-strnlen.c
> index 8a7641669a36603e3943b5ca3b9cb0f648dd18f0..8a0fabfcf6c4223dceab3a8eb581851c6ab6be5d 100644
> --- a/benchtests/bench-strnlen.c
> +++ b/benchtests/bench-strnlen.c
> @@ -28,27 +28,24 @@
>  
>  #ifndef WIDE
>  # define MIDDLE_CHAR 127
> -# define SIMPLE_STRNLEN simple_strnlen
>  #else
>  # define MIDDLE_CHAR 1121
> -# define SIMPLE_STRNLEN simple_wcsnlen
>  #endif /* WIDE */
>  
>  typedef size_t (*proto_t) (const CHAR *, size_t);
> -size_t SIMPLE_STRNLEN (const CHAR *, size_t);
> -
> -IMPL (SIMPLE_STRNLEN, 0)
> -IMPL (STRNLEN, 1)
> +size_t generic_strnlen (const CHAR *, size_t);
>  
>  size_t
> -SIMPLE_STRNLEN (const CHAR *s, size_t maxlen)
> +memchr_strnlen (const CHAR *s, size_t maxlen)
>  {
> -  size_t i;
> -
> -  for (i = 0; i < maxlen && s[i]; ++i);
> -  return i;
> +  const CHAR *s1 = MEMCHR (s, 0, maxlen);
> +  return (s1 == NULL) ? maxlen : s1 - s;
>  }

You will need to adjust the name for WIDE, maybe memchr_wcsnlen.

>  
> +IMPL (STRNLEN, 1)
> +IMPL (memchr_strnlen, 0)
> +IMPL (generic_strnlen, 0)
> +
>  static void
>  do_one_test (impl_t *impl, const CHAR *s, size_t maxlen, size_t exp_len)
>  {
> @@ -146,3 +143,13 @@ test_main (void)
>  }
>  
>  #include <support/test-driver.c>
> +
> +#define libc_hidden_def(X)
> +#ifndef WIDE
> +# undef STRNLEN
> +# define STRNLEN generic_strnlen
> +# include <string/strnlen.c>
> +#else
> +# define WCSNLEN generic_strnlen
> +# include <wcsmbs/wcsnlen.c>
> +#endif
>
Wilco Dijkstra Feb. 5, 2019, 3:17 p.m. | #3
Hi Adhemerval,

> Space after cast.  As a side note, this won't evaluate wstpcpy as-is, since it
> will use an optimized version and it will result in a wrong generic name for
> wcscpy.

I'll have a look at that. Maybe we could create GENERIC_STPCPY from
concatenation of the STPCPY define. I want to avoid huge amounts of defines
leading to completely incomprehensible magic.

> And as a following cleanup for wcpcpy we can use the similar code for strcpy
> adjusting for wide-chars:

Sure, there are lots of generic string functions which aren't optimized yet. However
that's a different patch... This patch simply makes it easy to find those cases.

> Same as before for wcpncpy: instead of reimplement the generic implementation
> on benchtests we can just include them. And it also leads to an possible
> optimization on generic implementation for wcpncpy.

The point is to enable useful comparisons of string implementations. If we include
the generic implementation then we just compare the generic implementation with
itself in many cases. And that isn't useful. If I change a generic implementation I
want to see the difference that makes in the benchmark comparison rather than
showing no difference.

Maybe the name generic_xxx is confusing? It's meant to be the baseline,
something which you should beat in all cases with the actual implementation.

Cheers,
Wilco
Adhemerval Zanella Feb. 5, 2019, 7:33 p.m. | #4
On 05/02/2019 13:17, Wilco Dijkstra wrote:
> Hi Adhemerval,
> 
>> Space after cast.  As a side note, this won't evaluate wstpcpy as-is, since it
>> will use an optimized version and it will result in a wrong generic name for
>> wcscpy.
> 
> I'll have a look at that. Maybe we could create GENERIC_STPCPY from
> concatenation of the STPCPY define. I want to avoid huge amounts of defines
> leading to completely incomprehensible magic.
> 
>> And as a following cleanup for wcpcpy we can use the similar code for strcpy
>> adjusting for wide-chars:
> 
> Sure, there are lots of generic string functions which aren't optimized yet. However
> that's a different patch... This patch simply makes it easy to find those cases.

The idea was not make is a pre-requisite, but rather an idea for a different
patch indeed.

> 
>> Same as before for wcpncpy: instead of reimplement the generic implementation
>> on benchtests we can just include them. And it also leads to an possible
>> optimization on generic implementation for wcpncpy.
> 
> The point is to enable useful comparisons of string implementations. If we include
> the generic implementation then we just compare the generic implementation with
> itself in many cases. And that isn't useful. If I change a generic implementation I
> want to see the difference that makes in the benchmark comparison rather than
> showing no difference.

My understanding is we have the generic implementation as the baseline
where arch-specific optimization might be applied and the idea of the 
comparison is to check against it.  I see no point in using a different 
implementation on benchtests, it should compare against exactly what 
glibc is currently providing.

If you want to check if the your changes improves the generic, you can
compare against multiples glibc builds.

> 
> Maybe the name generic_xxx is confusing? It's meant to be the baseline,
> something which you should beat in all cases with the actual implementation.

My understanding is the baseline should be the generic implementation which
is selected if the architecture does not provide an optimized one.

> 
> Cheers,
> Wilco
>   
>
Wilco Dijkstra Feb. 6, 2019, 2:01 p.m. | #5
Hi Adhemerval,

>>> Same as before for wcpncpy: instead of reimplement the generic implementation
>>> on benchtests we can just include them. And it also leads to an possible
>>> optimization on generic implementation for wcpncpy.
>> 
>> The point is to enable useful comparisons of string implementations. If we include
>> the generic implementation then we just compare the generic implementation with
>> itself in many cases. And that isn't useful. If I change a generic implementation I
>> want to see the difference that makes in the benchmark comparison rather than
>> showing no difference.
>
> My understanding is we have the generic implementation as the baseline
> where arch-specific optimization might be applied and the idea of the 
> comparison is to check against it.  I see no point in using a different 
> implementation on benchtests, it should compare against exactly what 
> glibc is currently providing.

I have to disagree, we cannot do an exact comparison unless build the generic
string functions as part of GLIBC and call them via the PLT. Including source
files with lots of #define magic is never going to be equivalent.

The goal here is not an accurate comparison with generic string functions but
to enable a realistic comparison with an efficient baseline - the existing byte
oriented implementations provide a baseline but are too slow to be useful.

> If you want to check if the your changes improves the generic, you can
> compare against multiples glibc builds.

That doesn't work so well given it takes a long time to rebuild GLIBC and 
benchmarks. For all benchmarking I do, I always create a direct comparison of
old vs new in a single run so it shows the differences and can be run repeatedly
to confirm. The string bench is setup to do this already, so why remove this
useful feature?
 
>> Maybe the name generic_xxx is confusing? It's meant to be the baseline,
>> something which you should beat in all cases with the actual implementation.
>
> My understanding is the baseline should be the generic implementation which
> is selected if the architecture does not provide an optimized one.

That means you never compare the generic implementation against a baseline.
Given that is what we do today, I don't see why we should stop doing that.

Cheers,
Wilco
Adhemerval Zanella Feb. 6, 2019, 2:53 p.m. | #6
On 06/02/2019 12:01, Wilco Dijkstra wrote:
> Hi Adhemerval,
> 
>>>> Same as before for wcpncpy: instead of reimplement the generic implementation
>>>> on benchtests we can just include them. And it also leads to an possible
>>>> optimization on generic implementation for wcpncpy.
>>>
>>> The point is to enable useful comparisons of string implementations. If we include
>>> the generic implementation then we just compare the generic implementation with
>>> itself in many cases. And that isn't useful. If I change a generic implementation I
>>> want to see the difference that makes in the benchmark comparison rather than
>>> showing no difference.
>>
>> My understanding is we have the generic implementation as the baseline
>> where arch-specific optimization might be applied and the idea of the 
>> comparison is to check against it.  I see no point in using a different 
>> implementation on benchtests, it should compare against exactly what 
>> glibc is currently providing.
> 
> I have to disagree, we cannot do an exact comparison unless build the generic
> string functions as part of GLIBC and call them via the PLT. Including source
> files with lots of #define magic is never going to be equivalent.
> 
> The goal here is not an accurate comparison with generic string functions but
> to enable a realistic comparison with an efficient baseline - the existing byte
> oriented implementations provide a baseline but are too slow to be useful.

The idea is not to be equivalent, since benchtests already adds the exported
libc symbol which will be called through PLT.  I do agree with you that the
byte-oriented baseline is somewhat useless now that most architecture implements
efficient word or vectorized common symbol, and the idea is also to provide
some more efficient generic string implementation (I have a long-standing patchset
'Improve generic string routines' to address this).

So my point is to which exactly should we compare on benchtests? Current we have:

  1. Byte-oriented 'simple' implementation which, as we agree, should not be
     used as a baseline.

  2. Some named 'stupid' which are usually composed implementation that might
     in fact be a faster implementation than some 'clever' ones.

  3. Compiler builtins, which also does not represent meaningful data for
     libc optimization (it will either be inline, call libc implementation, or
     mix both strategies).

  4. The libc implementations themselves, possible including all ifunc variations.

So which really give us meaningful data for future optimization? Should we keep
add multiple implementation as baselines to compare with?

What about an architecture that uses as baseline an arch-specific implementation,
which might use non optimal strategy that a future generic implementation might
use? We have examples for both string and math code on different architectures
where the generic implementation ended up performing better than the arch-specific
implementation.

Another example is your recent bench-strlen improvement (5289f1f56b7) which
added the memchr_strlen.  The generic implementation of strlen uses a similar
strategy of memchr, with the difference it does not need to materialize the
magic constant and add some loop unrolling for tail comparison. At first
it should be faster than memchr_strlen, however if the architecture has 
an optimized memchr implementation (which is a hotspot and it is usually a
target for arch-specific optimization), the memchr_strlen should be indeed
faster (and have a lower i-cache footprint). 

My point is using memchr_strlen as the *generic* implementation and also use
it as the *baseline* for performance comparison shows to the developer that
optimizing memchr would be a net gain in general than providing multiple
different optimization for multiple symbols that can be built by memchr
calls.

So I still think we should define better which exactly we need to compare
in benchtests and use the generic implementation, which will be used as
default for new ports, as the default basline. The file inclusion is just to 
avoid code duplication, I don't have a strong opinion whether to include or 
just copy-paste the code on benchtests. 

> 
>> If you want to check if the your changes improves the generic, you can
>> compare against multiples glibc builds.
> 
> That doesn't work so well given it takes a long time to rebuild GLIBC and 
> benchmarks. For all benchmarking I do, I always create a direct comparison of
> old vs new in a single run so it shows the differences and can be run repeatedly
> to confirm. The string bench is setup to do this already, so why remove this
> useful feature?
>  
>>> Maybe the name generic_xxx is confusing? It's meant to be the baseline,
>>> something which you should beat in all cases with the actual implementation.
>>
>> My understanding is the baseline should be the generic implementation which
>> is selected if the architecture does not provide an optimized one.
> 
> That means you never compare the generic implementation against a baseline.
> Given that is what we do today, I don't see why we should stop doing that.
> 
> Cheers,
> Wilco
>     
>
Wilco Dijkstra March 6, 2019, 6:14 p.m. | #7
Hi Adhemerval,

> So my point is to which exactly should we compare on benchtests? Current we have:
>
>  1. Byte-oriented 'simple' implementation which, as we agree, should not be
>     used as a baseline.

That is for functions like memcpy where you'd expect any implementation -
including generic - to act on words rather than bytes. So for those comparing
with a byte-oriented version doesn't add anything useful.

A byte-oriented version does make sense when it is competitive with the
generic implementation. This is true for various wcs functions and strstr for
example.

>  2. Some named 'stupid' which are usually composed implementation that might
>     in fact be a faster implementation than some 'clever' ones.

In all cases I found the stupid ones were slower, mostly because they first
called strlen and then still processed the string one byte at a time instead of
calling memcpy with the known size. So they weren't adding useful data.

>  3. Compiler builtins, which also does not represent meaningful data for
>      libc optimization (it will either be inline, call libc implementation, or
>     mix both strategies).

Agreed - there are few cases where string functions with non-constant inputs are
inlined, so in most cases you're just benchmarking the libc version.

>   4. The libc implementations themselves, possible including all ifunc variations.
>
> So which really give us meaningful data for future optimization? Should we keep
> add multiple implementation as baselines to compare with?

Well it is useful to check different strategies as well as comparing similar string
functions. For example I noticed that on some microarchitectures memchr was
faster, but on others strlen is faster. Given the individual benchmarks use very
different inputs, you only notice this in a direct comparison. 

> What about an architecture that uses as baseline an arch-specific implementation,
> which might use non optimal strategy that a future generic implementation might
> use? We have examples for both string and math code on different architectures
> where the generic implementation ended up performing better than the arch-specific
> implementation.

So having more than 1 baseline is often useful since you can compare different
implementations and strategies.

> My point is using memchr_strlen as the *generic* implementation and also use
> it as the *baseline* for performance comparison shows to the developer that
> optimizing memchr would be a net gain in general than providing multiple
> different optimization for multiple symbols that can be built by memchr
> calls.

Well if you're only adding a target specific implementation for either strlen or
memchr then it makes sense to do memchr first. And yes we could make the 
generic strlen defer to memchr as that makes it easier to get good performance
on a new target using just a fast memchr.

> So I still think we should define better which exactly we need to compare
> in benchtests and use the generic implementation, which will be used as
> default for new ports, as the default basline. The file inclusion is just to 
> avoid code duplication, I don't have a strong opinion whether to include or 
> just copy-paste the code on benchtests. 

It's hard to come up with simple rules - what is best depends on the specific
function. Memcpy/memchr/strlen have no obvious "baseline", and given
most targets implement these in assembler (which should beat the generic
version), comparing against the generic implementations isn't really that useful.

For more complex string functions we've seen cases where an assembler
strcpy/strcat was slower than strlen/memcpy, so it makes sense to
benchmark against that as the baseline. The same is true for strlen, memchr
and strnlen as well as strchr and strlen/memchr.

My goal is to add the most obvious and useful comparisons to the benchtests
to make it much easier to find these unexpected performance flaws across
targets and microarchitectures.

Cheers,
Wilco

Patch

diff --git a/benchtests/bench-stpcpy.c b/benchtests/bench-stpcpy.c
index 77ffb0f5c22b8d8ba83d46e903a23c90d8c983c1..7982c1886bb0af680b00757d8383b2ef84503533 100644
--- a/benchtests/bench-stpcpy.c
+++ b/benchtests/bench-stpcpy.c
@@ -24,22 +24,15 @@ 
 # define TEST_NAME "wcpcpy"
 #endif /* WIDE */
 #include "bench-string.h"
-#ifndef WIDE
-# define SIMPLE_STPCPY simple_stpcpy
-#else
-# define SIMPLE_STPCPY simple_wcpcpy
-#endif /* WIDE */
-
-CHAR *SIMPLE_STPCPY (CHAR *, const CHAR *);
-
-IMPL (SIMPLE_STPCPY, 0)
-IMPL (STPCPY, 1)
 
 CHAR *
-SIMPLE_STPCPY (CHAR *dst, const CHAR *src)
+generic_stpcpy (CHAR *dst, const CHAR *src)
 {
-  while ((*dst++ = *src++) != '\0');
-  return dst - 1;
+  size_t len = STRLEN (src);
+  return (CHAR*)MEMCPY (dst, src, len + 1) + len;
 }
 
+IMPL (STPCPY, 1)
+IMPL (generic_stpcpy, 0)
+
 #include "bench-strcpy.c"
diff --git a/benchtests/bench-stpncpy.c b/benchtests/bench-stpncpy.c
index 40c82cf716e6d7bc8fcb8d5a390da1ee8fee3245..331f5e67b7fc054bd678dbd264a24e43f940c0ed 100644
--- a/benchtests/bench-stpncpy.c
+++ b/benchtests/bench-stpncpy.c
@@ -24,47 +24,19 @@ 
 # define TEST_NAME "wcpncpy"
 #endif /* WIDE */
 #include "bench-string.h"
-#ifndef WIDE
-# define SIMPLE_STPNCPY simple_stpncpy
-# define STUPID_STPNCPY stupid_stpncpy
-#else
-# define SIMPLE_STPNCPY simple_wcpncpy
-# define STUPID_STPNCPY stupid_wcpncpy
-#endif /* WIDE */
-
-CHAR *SIMPLE_STPNCPY (CHAR *, const CHAR *, size_t);
-CHAR *STUPID_STPNCPY (CHAR *, const CHAR *, size_t);
-
-IMPL (STUPID_STPNCPY, 0)
-IMPL (SIMPLE_STPNCPY, 0)
-IMPL (STPNCPY, 1)
-
-CHAR *
-SIMPLE_STPNCPY (CHAR *dst, const CHAR *src, size_t n)
-{
-  while (n--)
-    if ((*dst++ = *src++) == '\0')
-      {
-	size_t i;
-
-	for (i = 0; i < n; ++i)
-	  dst[i] = '\0';
-	return dst - 1;
-      }
-  return dst;
-}
 
 CHAR *
-STUPID_STPNCPY (CHAR *dst, const CHAR *src, size_t n)
+generic_stpncpy (CHAR *dst, const CHAR *src, size_t n)
 {
   size_t nc = STRNLEN (src, n);
-  size_t i;
-
-  for (i = 0; i < nc; ++i)
-    dst[i] = src[i];
-  for (; i < n; ++i)
-    dst[i] = '\0';
-  return dst + nc;
+  MEMCPY (dst, src, nc);
+  dst += nc;
+  if (nc == n)
+    return dst;
+  return MEMSET (dst, 0, n - nc);
 }
 
+IMPL (STPNCPY, 1)
+IMPL (generic_stpncpy, 0)
+
 #include "bench-strncpy.c"
diff --git a/benchtests/bench-strcat.c b/benchtests/bench-strcat.c
index 6b3b084ae19a931cb2bda07794380b5745ccfc86..d3e96f4ec6331bea0027016eb6af4a276d0e6714 100644
--- a/benchtests/bench-strcat.c
+++ b/benchtests/bench-strcat.c
@@ -28,31 +28,25 @@ 
 
 #ifndef WIDE
 # define sfmt "s"
-# define SIMPLE_STRCAT simple_strcat
 # define SMALL_CHAR 127
 #else
 # define sfmt "ls"
-# define SIMPLE_STRCAT simple_wcscat
 # define SMALL_CHAR 1273
 #endif /* WIDE */
 
 
 typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
-CHAR *SIMPLE_STRCAT (CHAR *, const CHAR *);
-
-IMPL (SIMPLE_STRCAT, 0)
-IMPL (STRCAT, 1)
 
 CHAR *
-SIMPLE_STRCAT (CHAR *dst, const CHAR *src)
+generic_strcat (CHAR *dst, const CHAR *src)
 {
-  CHAR *ret = dst;
-  while (*dst++ != '\0');
-  --dst;
-  while ((*dst++ = *src++) != '\0');
-  return ret;
+  STRCPY (dst + STRLEN (dst), src);
+  return dst;
 }
 
+IMPL (STRCAT, 1)
+IMPL (generic_strcat, 0)
+
 static void
 do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
 {
diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c
index e5fd27ffba9ca6dd871e658d1cffc22f28047e49..426ee8f79ad8400b5ceddad5d6cdfc3523f49f16 100644
--- a/benchtests/bench-strcpy.c
+++ b/benchtests/bench-strcpy.c
@@ -34,25 +34,17 @@ 
 # else
 #  define TEST_NAME "wcscpy"
 # endif
-# include "bench-string.h"
-# ifndef WIDE
-#  define SIMPLE_STRCPY simple_strcpy
-# else
-#  define SIMPLE_STRCPY simple_wcscpy
-# endif
-
-CHAR *SIMPLE_STRCPY (CHAR *, const CHAR *);
-
-IMPL (SIMPLE_STRCPY, 0)
-IMPL (STRCPY, 1)
+#include "bench-string.h"
 
 CHAR *
-SIMPLE_STRCPY (CHAR *dst, const CHAR *src)
+generic_strcpy (CHAR *dst, const CHAR *src)
 {
-  CHAR *ret = dst;
-  while ((*dst++ = *src++) != '\0');
-  return ret;
+  return MEMCPY (dst, src, STRLEN (src) + 1);
 }
+
+IMPL (STRCPY, 1)
+IMPL (generic_strcpy, 0)
+
 #endif
 
 typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
diff --git a/benchtests/bench-strncat.c b/benchtests/bench-strncat.c
index 7e0112273ba0727ae29407e38724c39159ce1c93..da8318e9a24e50ca491b65e092cc5152b3c47715 100644
--- a/benchtests/bench-strncat.c
+++ b/benchtests/bench-strncat.c
@@ -27,35 +27,26 @@ 
 #define BIG_CHAR MAX_CHAR
 
 #ifndef WIDE
-# define SIMPLE_STRNCAT simple_strncat
-# define STUPID_STRNCAT stupid_strncat
 # define SMALL_CHAR 127
 #else
-# define SIMPLE_STRNCAT simple_wcsncat
-# define STUPID_STRNCAT stupid_wcsncat
 # define SMALL_CHAR 1273
 #endif /* WIDE */
 
 typedef CHAR *(*proto_t) (CHAR *, const CHAR *, size_t);
-CHAR *STUPID_STRNCAT (CHAR *, const CHAR *, size_t);
-CHAR *SIMPLE_STRNCAT (CHAR *, const CHAR *, size_t);
-
-IMPL (STUPID_STRNCAT, 0)
-IMPL (STRNCAT, 2)
 
 CHAR *
-STUPID_STRNCAT (CHAR *dst, const CHAR *src, size_t n)
+generic_strncat (CHAR *dst, const CHAR *src, size_t n)
 {
-  CHAR *ret = dst;
-  while (*dst++ != '\0');
-  --dst;
-  while (n--)
-    if ((*dst++ = *src++) == '\0')
-      return ret;
-  *dst = '\0';
-  return ret;
+  CHAR *end = dst + STRLEN (dst);
+  n = STRNLEN (src, n);
+  end[n] = 0;
+  MEMCPY (end, src, n);
+  return dst;
 }
 
+IMPL (STRNCAT, 2)
+IMPL (generic_strncat, 0)
+
 static void
 do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
 {
diff --git a/benchtests/bench-strncpy.c b/benchtests/bench-strncpy.c
index 79953759bfbd7201b9e3a846b59666375cb76348..e32a1519cd95af5456277e765813eb78211a3a2a 100644
--- a/benchtests/bench-strncpy.c
+++ b/benchtests/bench-strncpy.c
@@ -33,47 +33,19 @@ 
 #  define TEST_NAME "wcsncpy"
 # endif /* WIDE */
 # include "bench-string.h"
-# ifndef WIDE
-#  define SIMPLE_STRNCPY simple_strncpy
-#  define STUPID_STRNCPY stupid_strncpy
-# else
-#  define SIMPLE_STRNCPY simple_wcsncpy
-#  define STUPID_STRNCPY stupid_wcsncpy
-# endif /* WIDE */
-
-CHAR *SIMPLE_STRNCPY (CHAR *, const CHAR *, size_t);
-CHAR *STUPID_STRNCPY (CHAR *, const CHAR *, size_t);
-
-IMPL (STUPID_STRNCPY, 0)
-IMPL (SIMPLE_STRNCPY, 0)
-IMPL (STRNCPY, 1)
 
 CHAR *
-SIMPLE_STRNCPY (CHAR *dst, const CHAR *src, size_t n)
+generic_strncpy (CHAR *dst, const CHAR *src, size_t n)
 {
-  CHAR *ret = dst;
-  while (n--)
-    if ((*dst++ = *src++) == '\0')
-      {
-	while (n--)
-	  *dst++ = '\0';
-	return ret;
-      }
-  return ret;
+  size_t nc = STRNLEN (src, n);
+  if (nc != n)
+    MEMSET (dst + nc, 0, n - nc);
+  return MEMCPY (dst, src, nc);
 }
 
-CHAR *
-STUPID_STRNCPY (CHAR *dst, const CHAR *src, size_t n)
-{
-  size_t nc = STRNLEN (src, n);
-  size_t i;
+IMPL (STRNCPY, 1)
+IMPL (generic_strncpy, 0)
 
-  for (i = 0; i < nc; ++i)
-    dst[i] = src[i];
-  for (; i < n; ++i)
-    dst[i] = '\0';
-  return dst;
-}
 #endif /* !STRNCPY_RESULT */
 
 typedef CHAR *(*proto_t) (CHAR *, const CHAR *, size_t);
diff --git a/benchtests/bench-strnlen.c b/benchtests/bench-strnlen.c
index 8a7641669a36603e3943b5ca3b9cb0f648dd18f0..8a0fabfcf6c4223dceab3a8eb581851c6ab6be5d 100644
--- a/benchtests/bench-strnlen.c
+++ b/benchtests/bench-strnlen.c
@@ -28,27 +28,24 @@ 
 
 #ifndef WIDE
 # define MIDDLE_CHAR 127
-# define SIMPLE_STRNLEN simple_strnlen
 #else
 # define MIDDLE_CHAR 1121
-# define SIMPLE_STRNLEN simple_wcsnlen
 #endif /* WIDE */
 
 typedef size_t (*proto_t) (const CHAR *, size_t);
-size_t SIMPLE_STRNLEN (const CHAR *, size_t);
-
-IMPL (SIMPLE_STRNLEN, 0)
-IMPL (STRNLEN, 1)
+size_t generic_strnlen (const CHAR *, size_t);
 
 size_t
-SIMPLE_STRNLEN (const CHAR *s, size_t maxlen)
+memchr_strnlen (const CHAR *s, size_t maxlen)
 {
-  size_t i;
-
-  for (i = 0; i < maxlen && s[i]; ++i);
-  return i;
+  const CHAR *s1 = MEMCHR (s, 0, maxlen);
+  return (s1 == NULL) ? maxlen : s1 - s;
 }
 
+IMPL (STRNLEN, 1)
+IMPL (memchr_strnlen, 0)
+IMPL (generic_strnlen, 0)
+
 static void
 do_one_test (impl_t *impl, const CHAR *s, size_t maxlen, size_t exp_len)
 {
@@ -146,3 +143,13 @@  test_main (void)
 }
 
 #include <support/test-driver.c>
+
+#define libc_hidden_def(X)
+#ifndef WIDE
+# undef STRNLEN
+# define STRNLEN generic_strnlen
+# include <string/strnlen.c>
+#else
+# define WCSNLEN generic_strnlen
+# include <wcsmbs/wcsnlen.c>
+#endif