diff mbox series

[v2] x86: Rename generic functions with unique postfix for clarity

Message ID 20220610005840.557184-1-goldstein.w.n@gmail.com
State New
Headers show
Series [v2] x86: Rename generic functions with unique postfix for clarity | expand

Commit Message

Noah Goldstein June 10, 2022, 12:58 a.m. UTC
No functional changes. This just renames the generic implementations
from '{func}_sse2' to '{func}_generic'. The postfix "_sse2" was
overloaded: it was used both for files that had hand-optimized sse2
assembly implementations and for files that just redirected back to
the generic implementation.

Full xcheck passed on x86_64.
---
 sysdeps/x86_64/multiarch/Makefile             |  15 +-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  16 +-
 sysdeps/x86_64/multiarch/ifunc-sse4_2.h       |   4 +-
 sysdeps/x86_64/multiarch/ifunc-strcpy.h       |   8 +-
 sysdeps/x86_64/multiarch/ifunc-wcslen.h       |   8 +-
 sysdeps/x86_64/multiarch/stpncpy-c.c          |   2 +-
 sysdeps/x86_64/multiarch/stpncpy.c            |   1 +
 sysdeps/x86_64/multiarch/strcspn-c-sse4.c     | 163 ++++++++++++++++++
 sysdeps/x86_64/multiarch/strcspn-c.c          | 151 +---------------
 sysdeps/x86_64/multiarch/strcspn-sse2.c       |  28 ---
 sysdeps/x86_64/multiarch/strncat-c.c          |   2 +-
 sysdeps/x86_64/multiarch/strncat.c            |   1 +
 sysdeps/x86_64/multiarch/strncpy-c.c          |   2 +-
 sysdeps/x86_64/multiarch/strncpy.c            |   1 +
 .../{strspn-sse2.c => strpbrk-c-sse4.c}       |  18 +-
 sysdeps/x86_64/multiarch/strpbrk-c.c          |  18 +-
 sysdeps/x86_64/multiarch/strpbrk-sse2.c       |  28 ---
 sysdeps/x86_64/multiarch/strspn-c-sse4.c      | 136 +++++++++++++++
 sysdeps/x86_64/multiarch/strspn-c.c           | 126 +-------------
 sysdeps/x86_64/multiarch/wcscpy-c.c           |   2 +-
 sysdeps/x86_64/multiarch/wcscpy.c             |   4 +-
 sysdeps/x86_64/multiarch/wcsnlen-c.c          |   4 +-
 sysdeps/x86_64/multiarch/wcsnlen.c            |   1 +
 23 files changed, 376 insertions(+), 363 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strcspn-c-sse4.c
 delete mode 100644 sysdeps/x86_64/multiarch/strcspn-sse2.c
 rename sysdeps/x86_64/multiarch/{strspn-sse2.c => strpbrk-c-sse4.c} (74%)
 delete mode 100644 sysdeps/x86_64/multiarch/strpbrk-sse2.c
 create mode 100644 sysdeps/x86_64/multiarch/strspn-c-sse4.c

Comments

H.J. Lu June 10, 2022, 1:19 a.m. UTC | #1
On Thu, Jun 9, 2022 at 5:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No functional changes. This just renames the generic implementations
> from '{func}_sse2' to '{func}_generic'. The postfix "_sse2" was
> overloaded: it was used both for files that had hand-optimized sse2
> assembly implementations and for files that just redirected back to
> the generic implementation.

This change isn't small and its benefit is very small.  Can it be part of
a larger change to support building glibc with

-march=x86-64-vN?

> Full xcheck passed on x86_64.
> ---
>  sysdeps/x86_64/multiarch/Makefile             |  15 +-
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  16 +-
>  sysdeps/x86_64/multiarch/ifunc-sse4_2.h       |   4 +-
>  sysdeps/x86_64/multiarch/ifunc-strcpy.h       |   8 +-
>  sysdeps/x86_64/multiarch/ifunc-wcslen.h       |   8 +-
>  sysdeps/x86_64/multiarch/stpncpy-c.c          |   2 +-
>  sysdeps/x86_64/multiarch/stpncpy.c            |   1 +
>  sysdeps/x86_64/multiarch/strcspn-c-sse4.c     | 163 ++++++++++++++++++
>  sysdeps/x86_64/multiarch/strcspn-c.c          | 151 +---------------
>  sysdeps/x86_64/multiarch/strcspn-sse2.c       |  28 ---
>  sysdeps/x86_64/multiarch/strncat-c.c          |   2 +-
>  sysdeps/x86_64/multiarch/strncat.c            |   1 +
>  sysdeps/x86_64/multiarch/strncpy-c.c          |   2 +-
>  sysdeps/x86_64/multiarch/strncpy.c            |   1 +
>  .../{strspn-sse2.c => strpbrk-c-sse4.c}       |  18 +-
>  sysdeps/x86_64/multiarch/strpbrk-c.c          |  18 +-
>  sysdeps/x86_64/multiarch/strpbrk-sse2.c       |  28 ---
>  sysdeps/x86_64/multiarch/strspn-c-sse4.c      | 136 +++++++++++++++
>  sysdeps/x86_64/multiarch/strspn-c.c           | 126 +-------------
>  sysdeps/x86_64/multiarch/wcscpy-c.c           |   2 +-
>  sysdeps/x86_64/multiarch/wcscpy.c             |   4 +-
>  sysdeps/x86_64/multiarch/wcsnlen-c.c          |   4 +-
>  sysdeps/x86_64/multiarch/wcsnlen.c            |   1 +
>  23 files changed, 376 insertions(+), 363 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/strcspn-c-sse4.c
>  delete mode 100644 sysdeps/x86_64/multiarch/strcspn-sse2.c
>  rename sysdeps/x86_64/multiarch/{strspn-sse2.c => strpbrk-c-sse4.c} (74%)
>  delete mode 100644 sysdeps/x86_64/multiarch/strpbrk-sse2.c
>  create mode 100644 sysdeps/x86_64/multiarch/strspn-c-sse4.c
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 3d153cac35..86c6ecdfc1 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -77,7 +77,7 @@ sysdep_routines += \
>    strcpy-sse2 \
>    strcpy-sse2-unaligned \
>    strcspn-c \
> -  strcspn-sse2 \
> +  strcspn-c-sse4 \
>    strlen-avx2 \
>    strlen-avx2-rtm \
>    strlen-evex \
> @@ -109,21 +109,22 @@ sysdep_routines += \
>    strnlen-evex512 \
>    strnlen-sse2 \
>    strpbrk-c \
> -  strpbrk-sse2 \
> +  strpbrk-c-sse4 \
>    strrchr-avx2 \
>    strrchr-avx2-rtm \
>    strrchr-evex \
>    strrchr-sse2 \
>    strspn-c \
> -  strspn-sse2 \
> +  strspn-c-sse4 \
>    strstr-avx512 \
>    strstr-sse2-unaligned \
>    varshift \
>  # sysdep_routines
> -CFLAGS-varshift.c += -msse4
> -CFLAGS-strcspn-c.c += -msse4
> -CFLAGS-strpbrk-c.c += -msse4
> -CFLAGS-strspn-c.c += -msse4
> +
> +CFLAGS-strcspn-c-sse4.c += -msse4
> +CFLAGS-strpbrk-c-sse4.c += -msse4
> +CFLAGS-strspn-c-sse4.c += -msse4
> +
>  CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
>  endif
>
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 58f3ec8306..4cbd200d39 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -372,7 +372,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                               __stpncpy_evex)
>               IFUNC_IMPL_ADD (array, i, stpncpy, 1,
>                               __stpncpy_sse2_unaligned)
> -             IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2))
> +             IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_generic))
>
>    /* Support sysdeps/x86_64/multiarch/stpcpy.c.  */
>    IFUNC_IMPL (i, name, stpcpy,
> @@ -531,7 +531,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>    IFUNC_IMPL (i, name, strcspn,
>               IFUNC_IMPL_ADD (array, i, strcspn, CPU_FEATURE_USABLE (SSE4_2),
>                               __strcspn_sse42)
> -             IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_sse2))
> +             IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_generic))
>
>    /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
>    IFUNC_IMPL (i, name, strncasecmp,
> @@ -585,7 +585,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                               __strncat_evex)
>               IFUNC_IMPL_ADD (array, i, strncat, 1,
>                               __strncat_sse2_unaligned)
> -             IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2))
> +             IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_generic))
>
>    /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
>    IFUNC_IMPL (i, name, strncpy,
> @@ -601,20 +601,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                               __strncpy_evex)
>               IFUNC_IMPL_ADD (array, i, strncpy, 1,
>                               __strncpy_sse2_unaligned)
> -             IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
> +             IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_generic))
>
>    /* Support sysdeps/x86_64/multiarch/strpbrk.c.  */
>    IFUNC_IMPL (i, name, strpbrk,
>               IFUNC_IMPL_ADD (array, i, strpbrk, CPU_FEATURE_USABLE (SSE4_2),
>                               __strpbrk_sse42)
> -             IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_sse2))
> +             IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_generic))
>
>
>    /* Support sysdeps/x86_64/multiarch/strspn.c.  */
>    IFUNC_IMPL (i, name, strspn,
>               IFUNC_IMPL_ADD (array, i, strspn, CPU_FEATURE_USABLE (SSE4_2),
>                               __strspn_sse42)
> -             IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_sse2))
> +             IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_generic))
>
>    /* Support sysdeps/x86_64/multiarch/strstr.c.  */
>    IFUNC_IMPL (i, name, strstr,
> @@ -697,7 +697,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>    IFUNC_IMPL (i, name, wcscpy,
>               IFUNC_IMPL_ADD (array, i, wcscpy, CPU_FEATURE_USABLE (SSSE3),
>                               __wcscpy_ssse3)
> -             IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2))
> +             IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_generic))
>
>    /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
>    IFUNC_IMPL (i, name, wcslen,
> @@ -749,7 +749,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>               IFUNC_IMPL_ADD (array, i, wcsnlen,
>                               CPU_FEATURE_USABLE (SSE4_1),
>                               __wcsnlen_sse4_1)
> -             IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_sse2))
> +             IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_generic))
>
>    /* Support sysdeps/x86_64/multiarch/wmemchr.c.  */
>    IFUNC_IMPL (i, name, wmemchr,
> diff --git a/sysdeps/x86_64/multiarch/ifunc-sse4_2.h b/sysdeps/x86_64/multiarch/ifunc-sse4_2.h
> index b555ff2fac..ee36525bcf 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-sse4_2.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-sse4_2.h
> @@ -19,7 +19,7 @@
>
>  #include <init-arch.h>
>
> -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
>
>  static inline void *
> @@ -30,5 +30,5 @@ IFUNC_SELECTOR (void)
>    if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2))
>      return OPTIMIZE (sse42);
>
> -  return OPTIMIZE (sse2);
> +  return OPTIMIZE (generic);
>  }
> diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> index a15afa44e9..80529458d1 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> @@ -20,7 +20,11 @@
>
>  #include <init-arch.h>
>
> -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> +#ifndef GENERIC
> +# define GENERIC sse2
> +#endif
> +
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
>    attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> @@ -49,5 +53,5 @@ IFUNC_SELECTOR (void)
>    if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
>      return OPTIMIZE (sse2_unaligned);
>
> -  return OPTIMIZE (sse2);
> +  return OPTIMIZE (GENERIC);
>  }
> diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
> index 2b29e7608a..88c1c502af 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-wcslen.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
> @@ -19,7 +19,11 @@
>
>  #include <init-arch.h>
>
> -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> +#ifndef GENERIC
> +# define GENERIC sse2
> +#endif
> +
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> @@ -48,5 +52,5 @@ IFUNC_SELECTOR (void)
>    if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
>      return OPTIMIZE (sse4_1);
>
> -  return OPTIMIZE (sse2);
> +  return OPTIMIZE (GENERIC);
>  }
> diff --git a/sysdeps/x86_64/multiarch/stpncpy-c.c b/sysdeps/x86_64/multiarch/stpncpy-c.c
> index b016e487e1..eb62fcf388 100644
> --- a/sysdeps/x86_64/multiarch/stpncpy-c.c
> +++ b/sysdeps/x86_64/multiarch/stpncpy-c.c
> @@ -1,4 +1,4 @@
> -#define STPNCPY __stpncpy_sse2
> +#define STPNCPY __stpncpy_generic
>  #undef weak_alias
>  #define weak_alias(ignored1, ignored2)
>  #undef libc_hidden_def
> diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c
> index 82fa53957d..879bc83f0b 100644
> --- a/sysdeps/x86_64/multiarch/stpncpy.c
> +++ b/sysdeps/x86_64/multiarch/stpncpy.c
> @@ -25,6 +25,7 @@
>  # undef stpncpy
>  # undef __stpncpy
>
> +# define GENERIC generic
>  # define SYMBOL_NAME stpncpy
>  # include "ifunc-strcpy.h"
>
> diff --git a/sysdeps/x86_64/multiarch/strcspn-c-sse4.c b/sysdeps/x86_64/multiarch/strcspn-c-sse4.c
> new file mode 100644
> index 0000000000..59f64f9fe8
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcspn-c-sse4.c
> @@ -0,0 +1,163 @@
> +/* strcspn with SSE4.2 intrinsics
> +   Copyright (C) 2009-2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <nmmintrin.h>
> +#include <string.h>
> +#include "varshift.h"
> +
> +/* We use 0x2:
> +       _SIDD_SBYTE_OPS
> +       | _SIDD_CMP_EQUAL_ANY
> +       | _SIDD_POSITIVE_POLARITY
> +       | _SIDD_LEAST_SIGNIFICANT
> +   on pcmpistri to compare xmm/mem128
> +
> +   0 1 2 3 4 5 6 7 8 9 A B C D E F
> +   X X X X X X X X X X X X X X X X
> +
> +   against xmm
> +
> +   0 1 2 3 4 5 6 7 8 9 A B C D E F
> +   A A A A A A A A A A A A A A A A
> +
> +   to find out if the first 16byte data element has any byte A and
> +   the offset of the first byte.  There are 3 cases:
> +
> +   1. The first 16byte data element has the byte A at the offset X.
> +   2. The first 16byte data element has EOS and doesn't have the byte A.
> +   3. The first 16byte data element is valid and doesn't have the byte A.
> +
> +   Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
> +
> +    1           X        1      0/1      0
> +    2          16        0       1       0
> +    3          16        0       0       0
> +
> +   We exit from the loop for cases 1 and 2 with jbe which branches
> +   when either CFlag or ZFlag is 1.  If CFlag == 1, ECX has the offset
> +   X for case 1.  */
> +
> +#ifndef STRCSPN_GENERIC
> +# define STRCSPN_GENERIC __strcspn_generic
> +# define STRCSPN_SSE42 __strcspn_sse42
> +#endif
> +
> +#ifdef USE_AS_STRPBRK
> +# define RETURN(val1, val2) return val1
> +#else
> +# define RETURN(val1, val2) return val2
> +#endif
> +
> +extern
> +#ifdef USE_AS_STRPBRK
> +char *
> +#else
> +size_t
> +#endif
> +STRCSPN_GENERIC (const char *, const char *) attribute_hidden;
> +
> +
> +#ifdef USE_AS_STRPBRK
> +char *
> +#else
> +size_t
> +#endif
> +__attribute__ ((section (".text.sse4.2")))
> +STRCSPN_SSE42 (const char *s, const char *a)
> +{
> +  if (*a == 0)
> +    RETURN (NULL, strlen (s));
> +
> +  const char *aligned;
> +  __m128i mask, maskz, zero;
> +  unsigned int maskz_bits;
> +  unsigned int offset = (unsigned int) ((size_t) a & 15);
> +  zero = _mm_set1_epi8 (0);
> +  if (offset != 0)
> +    {
> +      /* Load masks.  */
> +      aligned = (const char *) ((size_t) a & -16L);
> +      __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> +      maskz = _mm_cmpeq_epi8 (mask0, zero);
> +
> +      /* Find where the NULL terminator is.  */
> +      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> +      if (maskz_bits != 0)
> +        {
> +          mask = __m128i_shift_right (mask0, offset);
> +          offset = (unsigned int) ((size_t) s & 15);
> +          if (offset)
> +            goto start_unaligned;
> +
> +          aligned = s;
> +          goto start_loop;
> +        }
> +    }
> +
> +  /* A is aligned.  */
> +  mask = _mm_loadu_si128 ((__m128i *) a);
> +  /* Find where the NULL terminator is.  */
> +  maskz = _mm_cmpeq_epi8 (mask, zero);
> +  maskz_bits = _mm_movemask_epi8 (maskz);
> +  if (maskz_bits == 0)
> +    {
> +      /* There is no NULL terminator.  Don't use SSE4.2 if the length
> +         of A > 16.  */
> +      if (a[16] != 0)
> +        return STRCSPN_GENERIC (s, a);
> +    }
> +
> +  aligned = s;
> +  offset = (unsigned int) ((size_t) s & 15);
> +  if (offset != 0)
> +    {
> +    start_unaligned:
> +      /* Check partial string.  */
> +      aligned = (const char *) ((size_t) s & -16L);
> +      __m128i value = _mm_load_si128 ((__m128i *) aligned);
> +
> +      value = __m128i_shift_right (value, offset);
> +
> +      unsigned int length = _mm_cmpistri (mask, value, 0x2);
> +      /* No need to check ZFlag since ZFlag is always 1.  */
> +      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> +      if (cflag)
> +       RETURN ((char *) (s + length), length);
> +      /* Find where the NULL terminator is.  */
> +      unsigned int index = _mm_cmpistri (value, value, 0x3a);
> +      if (index < 16 - offset)
> +       RETURN (NULL, index);
> +      aligned += 16;
> +    }
> +
> +start_loop:
> +  while (1)
> +    {
> +      __m128i value = _mm_load_si128 ((__m128i *) aligned);
> +      unsigned int index = _mm_cmpistri (mask, value, 0x2);
> +      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> +      unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
> +      if (cflag)
> +       RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
> +      if (zflag)
> +       RETURN (NULL,
> +               /* Find where the NULL terminator is.  */
> +               (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s));
> +      aligned += 16;
> +    }
> +}
> diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
> index c312fab8b1..423de2e2b2 100644
> --- a/sysdeps/x86_64/multiarch/strcspn-c.c
> +++ b/sysdeps/x86_64/multiarch/strcspn-c.c
> @@ -1,5 +1,5 @@
> -/* strcspn with SSE4.2 intrinsics
> -   Copyright (C) 2009-2022 Free Software Foundation, Inc.
> +/* strcspn.
> +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>
>     The GNU C Library is free software; you can redistribute it and/or
> @@ -16,148 +16,13 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#include <nmmintrin.h>
> -#include <string.h>
> -#include "varshift.h"
> +#if IS_IN (libc)
>
> -/* We use 0x2:
> -       _SIDD_SBYTE_OPS
> -       | _SIDD_CMP_EQUAL_ANY
> -       | _SIDD_POSITIVE_POLARITY
> -       | _SIDD_LEAST_SIGNIFICANT
> -   on pcmpistri to compare xmm/mem128
> +# include <sysdep.h>
> +# define STRCSPN __strcspn_generic
>
> -   0 1 2 3 4 5 6 7 8 9 A B C D E F
> -   X X X X X X X X X X X X X X X X
> -
> -   against xmm
> -
> -   0 1 2 3 4 5 6 7 8 9 A B C D E F
> -   A A A A A A A A A A A A A A A A
> -
> -   to find out if the first 16byte data element has any byte A and
> -   the offset of the first byte.  There are 3 cases:
> -
> -   1. The first 16byte data element has the byte A at the offset X.
> -   2. The first 16byte data element has EOS and doesn't have the byte A.
> -   3. The first 16byte data element is valid and doesn't have the byte A.
> -
> -   Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
> -
> -    1           X        1      0/1      0
> -    2          16        0       1       0
> -    3          16        0       0       0
> -
> -   We exit from the loop for cases 1 and 2 with jbe which branches
> -   when either CFlag or ZFlag is 1.  If CFlag == 1, ECX has the offset
> -   X for case 1.  */
> -
> -#ifndef STRCSPN_SSE2
> -# define STRCSPN_SSE2 __strcspn_sse2
> -# define STRCSPN_SSE42 __strcspn_sse42
> -#endif
> -
> -#ifdef USE_AS_STRPBRK
> -# define RETURN(val1, val2) return val1
> -#else
> -# define RETURN(val1, val2) return val2
> -#endif
> -
> -extern
> -#ifdef USE_AS_STRPBRK
> -char *
> -#else
> -size_t
> -#endif
> -STRCSPN_SSE2 (const char *, const char *) attribute_hidden;
> -
> -
> -#ifdef USE_AS_STRPBRK
> -char *
> -#else
> -size_t
> +# undef libc_hidden_builtin_def
> +# define libc_hidden_builtin_def(STRCSPN)
>  #endif
> -__attribute__ ((section (".text.sse4.2")))
> -STRCSPN_SSE42 (const char *s, const char *a)
> -{
> -  if (*a == 0)
> -    RETURN (NULL, strlen (s));
> -
> -  const char *aligned;
> -  __m128i mask, maskz, zero;
> -  unsigned int maskz_bits;
> -  unsigned int offset = (unsigned int) ((size_t) a & 15);
> -  zero = _mm_set1_epi8 (0);
> -  if (offset != 0)
> -    {
> -      /* Load masks.  */
> -      aligned = (const char *) ((size_t) a & -16L);
> -      __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> -      maskz = _mm_cmpeq_epi8 (mask0, zero);
> -
> -      /* Find where the NULL terminator is.  */
> -      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> -      if (maskz_bits != 0)
> -        {
> -          mask = __m128i_shift_right (mask0, offset);
> -          offset = (unsigned int) ((size_t) s & 15);
> -          if (offset)
> -            goto start_unaligned;
> -
> -          aligned = s;
> -          goto start_loop;
> -        }
> -    }
> -
> -  /* A is aligned.  */
> -  mask = _mm_loadu_si128 ((__m128i *) a);
> -  /* Find where the NULL terminator is.  */
> -  maskz = _mm_cmpeq_epi8 (mask, zero);
> -  maskz_bits = _mm_movemask_epi8 (maskz);
> -  if (maskz_bits == 0)
> -    {
> -      /* There is no NULL terminator.  Don't use SSE4.2 if the length
> -         of A > 16.  */
> -      if (a[16] != 0)
> -        return STRCSPN_SSE2 (s, a);
> -    }
> -
> -  aligned = s;
> -  offset = (unsigned int) ((size_t) s & 15);
> -  if (offset != 0)
> -    {
> -    start_unaligned:
> -      /* Check partial string.  */
> -      aligned = (const char *) ((size_t) s & -16L);
> -      __m128i value = _mm_load_si128 ((__m128i *) aligned);
> -
> -      value = __m128i_shift_right (value, offset);
> -
> -      unsigned int length = _mm_cmpistri (mask, value, 0x2);
> -      /* No need to check ZFlag since ZFlag is always 1.  */
> -      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> -      if (cflag)
> -       RETURN ((char *) (s + length), length);
> -      /* Find where the NULL terminator is.  */
> -      unsigned int index = _mm_cmpistri (value, value, 0x3a);
> -      if (index < 16 - offset)
> -       RETURN (NULL, index);
> -      aligned += 16;
> -    }
>
> -start_loop:
> -  while (1)
> -    {
> -      __m128i value = _mm_load_si128 ((__m128i *) aligned);
> -      unsigned int index = _mm_cmpistri (mask, value, 0x2);
> -      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> -      unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
> -      if (cflag)
> -       RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
> -      if (zflag)
> -       RETURN (NULL,
> -               /* Find where the NULL terminator is.  */
> -               (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s));
> -      aligned += 16;
> -    }
> -}
> +#include <string/strcspn.c>
> diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.c b/sysdeps/x86_64/multiarch/strcspn-sse2.c
> deleted file mode 100644
> index 3a04bb39fc..0000000000
> --- a/sysdeps/x86_64/multiarch/strcspn-sse2.c
> +++ /dev/null
> @@ -1,28 +0,0 @@
> -/* strcspn.
> -   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#if IS_IN (libc)
> -
> -# include <sysdep.h>
> -# define STRCSPN __strcspn_sse2
> -
> -# undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(STRCSPN)
> -#endif
> -
> -#include <string/strcspn.c>
> diff --git a/sysdeps/x86_64/multiarch/strncat-c.c b/sysdeps/x86_64/multiarch/strncat-c.c
> index 93a7fab7ea..b729c033d9 100644
> --- a/sysdeps/x86_64/multiarch/strncat-c.c
> +++ b/sysdeps/x86_64/multiarch/strncat-c.c
> @@ -1,2 +1,2 @@
> -#define STRNCAT __strncat_sse2
> +#define STRNCAT __strncat_generic
>  #include <string/strncat.c>
> diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c
> index b649343a97..50fba8a41f 100644
> --- a/sysdeps/x86_64/multiarch/strncat.c
> +++ b/sysdeps/x86_64/multiarch/strncat.c
> @@ -24,6 +24,7 @@
>  # undef strncat
>
>  # define SYMBOL_NAME strncat
> +# define GENERIC generic
>  # include "ifunc-strcpy.h"
>
>  libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ());
> diff --git a/sysdeps/x86_64/multiarch/strncpy-c.c b/sysdeps/x86_64/multiarch/strncpy-c.c
> index 57c45ac7ab..183b0b8e0f 100644
> --- a/sysdeps/x86_64/multiarch/strncpy-c.c
> +++ b/sysdeps/x86_64/multiarch/strncpy-c.c
> @@ -1,4 +1,4 @@
> -#define STRNCPY __strncpy_sse2
> +#define STRNCPY __strncpy_generic
>  #undef libc_hidden_builtin_def
>  #define libc_hidden_builtin_def(strncpy)
>
> diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c
> index 2a780a7e16..7fc7d72ec5 100644
> --- a/sysdeps/x86_64/multiarch/strncpy.c
> +++ b/sysdeps/x86_64/multiarch/strncpy.c
> @@ -24,6 +24,7 @@
>  # undef strncpy
>
>  # define SYMBOL_NAME strncpy
> +# define GENERIC generic
>  # include "ifunc-strcpy.h"
>
>  libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ());
> diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.c b/sysdeps/x86_64/multiarch/strpbrk-c-sse4.c
> similarity index 74%
> rename from sysdeps/x86_64/multiarch/strspn-sse2.c
> rename to sysdeps/x86_64/multiarch/strpbrk-c-sse4.c
> index 61cc6cb0a5..8700276773 100644
> --- a/sysdeps/x86_64/multiarch/strspn-sse2.c
> +++ b/sysdeps/x86_64/multiarch/strpbrk-c-sse4.c
> @@ -1,5 +1,5 @@
> -/* strspn.
> -   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> +/* strpbrk with SSE4.2 intrinsics
> +   Copyright (C) 2022 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>
>     The GNU C Library is free software; you can redistribute it and/or
> @@ -16,13 +16,7 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#if IS_IN (libc)
> -
> -# include <sysdep.h>
> -# define STRSPN __strspn_sse2
> -
> -# undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(STRSPN)
> -#endif
> -
> -#include <string/strspn.c>
> +#define USE_AS_STRPBRK
> +#define STRCSPN_GENERIC __strpbrk_generic
> +#define STRCSPN_SSE42 __strpbrk_sse42
> +#include "strcspn-c-sse4.c"
> diff --git a/sysdeps/x86_64/multiarch/strpbrk-c.c b/sysdeps/x86_64/multiarch/strpbrk-c.c
> index abf4ff7f1a..d31acfe495 100644
> --- a/sysdeps/x86_64/multiarch/strpbrk-c.c
> +++ b/sysdeps/x86_64/multiarch/strpbrk-c.c
> @@ -1,5 +1,5 @@
> -/* strpbrk with SSE4.2 intrinsics
> -   Copyright (C) 2022 Free Software Foundation, Inc.
> +/* strpbrk.
> +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>
>     The GNU C Library is free software; you can redistribute it and/or
> @@ -16,7 +16,13 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#define USE_AS_STRPBRK
> -#define STRCSPN_SSE2 __strpbrk_sse2
> -#define STRCSPN_SSE42 __strpbrk_sse42
> -#include "strcspn-c.c"
> +#if IS_IN (libc)
> +
> +# include <sysdep.h>
> +# define STRPBRK __strpbrk_generic
> +
> +# undef libc_hidden_builtin_def
> +# define libc_hidden_builtin_def(STRPBRK)
> +#endif
> +
> +#include <string/strpbrk.c>
> diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.c b/sysdeps/x86_64/multiarch/strpbrk-sse2.c
> deleted file mode 100644
> index d03214c4fb..0000000000
> --- a/sysdeps/x86_64/multiarch/strpbrk-sse2.c
> +++ /dev/null
> @@ -1,28 +0,0 @@
> -/* strpbrk.
> -   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#if IS_IN (libc)
> -
> -# include <sysdep.h>
> -# define STRPBRK __strpbrk_sse2
> -
> -# undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(STRPBRK)
> -#endif
> -
> -#include <string/strpbrk.c>
> diff --git a/sysdeps/x86_64/multiarch/strspn-c-sse4.c b/sysdeps/x86_64/multiarch/strspn-c-sse4.c
> new file mode 100644
> index 0000000000..d044916688
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strspn-c-sse4.c
> @@ -0,0 +1,136 @@
> +/* strspn with SSE4.2 intrinsics
> +   Copyright (C) 2009-2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <nmmintrin.h>
> +#include <string.h>
> +#include "varshift.h"
> +
> +/* We use 0x12:
> +       _SIDD_SBYTE_OPS
> +       | _SIDD_CMP_EQUAL_ANY
> +       | _SIDD_NEGATIVE_POLARITY
> +       | _SIDD_LEAST_SIGNIFICANT
> +   on pcmpistri to compare xmm/mem128
> +
> +   0 1 2 3 4 5 6 7 8 9 A B C D E F
> +   X X X X X X X X X X X X X X X X
> +
> +   against xmm
> +
> +   0 1 2 3 4 5 6 7 8 9 A B C D E F
> +   A A A A A A A A A A A A A A A A
> +
> +   to find out if the first 16byte data element has any non-A byte and
> +   the offset of the first byte.  There are 2 cases:
> +
> +   1. The first 16byte data element has the non-A byte, including
> +      EOS, at the offset X.
> +   2. The first 16byte data element is valid and doesn't have the non-A
> +      byte.
> +
> +   Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
> +
> +   case                ECX     CFlag   ZFlag   SFlag
> +    1           X        1      0/1      0
> +    2          16        0       0       0
> +
> +   We exit from the loop for case 1.  */
> +
> +extern size_t __strspn_generic (const char *, const char *) attribute_hidden;
> +
> +
> +size_t
> +__attribute__ ((section (".text.sse4.2")))
> +__strspn_sse42 (const char *s, const char *a)
> +{
> +  if (*a == 0)
> +    return 0;
> +
> +  const char *aligned;
> +  __m128i mask, maskz, zero;
> +  unsigned int maskz_bits;
> +  unsigned int offset = (int) ((size_t) a & 15);
> +  zero = _mm_set1_epi8 (0);
> +  if (offset != 0)
> +    {
> +      /* Load masks.  */
> +      aligned = (const char *) ((size_t) a & -16L);
> +      __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> +      maskz = _mm_cmpeq_epi8 (mask0, zero);
> +
> +      /* Find where the NULL terminator is.  */
> +      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> +      if (maskz_bits != 0)
> +        {
> +          mask = __m128i_shift_right (mask0, offset);
> +          offset = (unsigned int) ((size_t) s & 15);
> +          if (offset)
> +            goto start_unaligned;
> +
> +          aligned = s;
> +          goto start_loop;
> +        }
> +    }
> +
> +  /* A is aligned.  */
> +  mask = _mm_loadu_si128 ((__m128i *) a);
> +
> +  /* Find where the NULL terminator is.  */
> +  maskz = _mm_cmpeq_epi8 (mask, zero);
> +  maskz_bits = _mm_movemask_epi8 (maskz);
> +  if (maskz_bits == 0)
> +    {
> +      /* There is no NULL terminator.  Don't use SSE4.2 if the length
> +         of A > 16.  */
> +      if (a[16] != 0)
> +        return __strspn_generic (s, a);
> +    }
> +  aligned = s;
> +  offset = (unsigned int) ((size_t) s & 15);
> +
> +  if (offset != 0)
> +    {
> +    start_unaligned:
> +      /* Check partial string.  */
> +      aligned = (const char *) ((size_t) s & -16L);
> +      __m128i value = _mm_load_si128 ((__m128i *) aligned);
> +      __m128i adj_value = __m128i_shift_right (value, offset);
> +
> +      unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
> +      /* No need to check CFlag since it is always 1.  */
> +      if (length < 16 - offset)
> +       return length;
> +      /* Find where the NULL terminator is.  */
> +      maskz = _mm_cmpeq_epi8 (value, zero);
> +      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> +      if (maskz_bits != 0)
> +       return length;
> +      aligned += 16;
> +    }
> +
> +start_loop:
> +  while (1)
> +    {
> +      __m128i value = _mm_load_si128 ((__m128i *) aligned);
> +      unsigned int index = _mm_cmpistri (mask, value, 0x12);
> +      unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
> +      if (cflag)
> +       return (size_t) (aligned + index - s);
> +      aligned += 16;
> +    }
> +}
> diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
> index 6124033ceb..6b50c36432 100644
> --- a/sysdeps/x86_64/multiarch/strspn-c.c
> +++ b/sysdeps/x86_64/multiarch/strspn-c.c
> @@ -1,5 +1,5 @@
> -/* strspn with SSE4.2 intrinsics
> -   Copyright (C) 2009-2022 Free Software Foundation, Inc.
> +/* strspn.
> +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>
>     The GNU C Library is free software; you can redistribute it and/or
> @@ -16,121 +16,13 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#include <nmmintrin.h>
> -#include <string.h>
> -#include "varshift.h"
> +#if IS_IN (libc)
>
> -/* We use 0x12:
> -       _SIDD_SBYTE_OPS
> -       | _SIDD_CMP_EQUAL_ANY
> -       | _SIDD_NEGATIVE_POLARITY
> -       | _SIDD_LEAST_SIGNIFICANT
> -   on pcmpistri to compare xmm/mem128
> +# include <sysdep.h>
> +# define STRSPN __strspn_generic
>
> -   0 1 2 3 4 5 6 7 8 9 A B C D E F
> -   X X X X X X X X X X X X X X X X
> +# undef libc_hidden_builtin_def
> +# define libc_hidden_builtin_def(STRSPN)
> +#endif
>
> -   against xmm
> -
> -   0 1 2 3 4 5 6 7 8 9 A B C D E F
> -   A A A A A A A A A A A A A A A A
> -
> -   to find out if the first 16byte data element has any non-A byte and
> -   the offset of the first byte.  There are 2 cases:
> -
> -   1. The first 16byte data element has the non-A byte, including
> -      EOS, at the offset X.
> -   2. The first 16byte data element is valid and doesn't have the non-A
> -      byte.
> -
> -   Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
> -
> -   case                ECX     CFlag   ZFlag   SFlag
> -    1           X        1      0/1      0
> -    2          16        0       0       0
> -
> -   We exit from the loop for case 1.  */
> -
> -extern size_t __strspn_sse2 (const char *, const char *) attribute_hidden;
> -
> -
> -size_t
> -__attribute__ ((section (".text.sse4.2")))
> -__strspn_sse42 (const char *s, const char *a)
> -{
> -  if (*a == 0)
> -    return 0;
> -
> -  const char *aligned;
> -  __m128i mask, maskz, zero;
> -  unsigned int maskz_bits;
> -  unsigned int offset = (int) ((size_t) a & 15);
> -  zero = _mm_set1_epi8 (0);
> -  if (offset != 0)
> -    {
> -      /* Load masks.  */
> -      aligned = (const char *) ((size_t) a & -16L);
> -      __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> -      maskz = _mm_cmpeq_epi8 (mask0, zero);
> -
> -      /* Find where the NULL terminator is.  */
> -      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> -      if (maskz_bits != 0)
> -        {
> -          mask = __m128i_shift_right (mask0, offset);
> -          offset = (unsigned int) ((size_t) s & 15);
> -          if (offset)
> -            goto start_unaligned;
> -
> -          aligned = s;
> -          goto start_loop;
> -        }
> -    }
> -
> -  /* A is aligned.  */
> -  mask = _mm_loadu_si128 ((__m128i *) a);
> -
> -  /* Find where the NULL terminator is.  */
> -  maskz = _mm_cmpeq_epi8 (mask, zero);
> -  maskz_bits = _mm_movemask_epi8 (maskz);
> -  if (maskz_bits == 0)
> -    {
> -      /* There is no NULL terminator.  Don't use SSE4.2 if the length
> -         of A > 16.  */
> -      if (a[16] != 0)
> -        return __strspn_sse2 (s, a);
> -    }
> -  aligned = s;
> -  offset = (unsigned int) ((size_t) s & 15);
> -
> -  if (offset != 0)
> -    {
> -    start_unaligned:
> -      /* Check partial string.  */
> -      aligned = (const char *) ((size_t) s & -16L);
> -      __m128i value = _mm_load_si128 ((__m128i *) aligned);
> -      __m128i adj_value = __m128i_shift_right (value, offset);
> -
> -      unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
> -      /* No need to check CFlag since it is always 1.  */
> -      if (length < 16 - offset)
> -       return length;
> -      /* Find where the NULL terminator is.  */
> -      maskz = _mm_cmpeq_epi8 (value, zero);
> -      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> -      if (maskz_bits != 0)
> -       return length;
> -      aligned += 16;
> -    }
> -
> -start_loop:
> -  while (1)
> -    {
> -      __m128i value = _mm_load_si128 ((__m128i *) aligned);
> -      unsigned int index = _mm_cmpistri (mask, value, 0x12);
> -      unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
> -      if (cflag)
> -       return (size_t) (aligned + index - s);
> -      aligned += 16;
> -    }
> -}
> +#include <string/strspn.c>
> diff --git a/sysdeps/x86_64/multiarch/wcscpy-c.c b/sysdeps/x86_64/multiarch/wcscpy-c.c
> index 26d6984e9b..fa38dd898d 100644
> --- a/sysdeps/x86_64/multiarch/wcscpy-c.c
> +++ b/sysdeps/x86_64/multiarch/wcscpy-c.c
> @@ -1,5 +1,5 @@
>  #if IS_IN (libc)
> -# define WCSCPY  __wcscpy_sse2
> +# define WCSCPY  __wcscpy_generic
>  #endif
>
>  #include <wcsmbs/wcscpy.c>
> diff --git a/sysdeps/x86_64/multiarch/wcscpy.c b/sysdeps/x86_64/multiarch/wcscpy.c
> index 6a2d1421d9..53c3228dc2 100644
> --- a/sysdeps/x86_64/multiarch/wcscpy.c
> +++ b/sysdeps/x86_64/multiarch/wcscpy.c
> @@ -26,7 +26,7 @@
>  # define SYMBOL_NAME wcscpy
>  # include <init-arch.h>
>
> -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
>
>  static inline void *
> @@ -37,7 +37,7 @@ IFUNC_SELECTOR (void)
>    if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
>      return OPTIMIZE (ssse3);
>
> -  return OPTIMIZE (sse2);
> +  return OPTIMIZE (generic);
>  }
>
>  libc_ifunc_redirected (__redirect_wcscpy, __wcscpy, IFUNC_SELECTOR ());
> diff --git a/sysdeps/x86_64/multiarch/wcsnlen-c.c b/sysdeps/x86_64/multiarch/wcsnlen-c.c
> index e1ec7cfbb5..1c9c04241a 100644
> --- a/sysdeps/x86_64/multiarch/wcsnlen-c.c
> +++ b/sysdeps/x86_64/multiarch/wcsnlen-c.c
> @@ -1,9 +1,9 @@
>  #if IS_IN (libc)
>  # include <wchar.h>
>
> -# define WCSNLEN __wcsnlen_sse2
> +# define WCSNLEN __wcsnlen_generic
>
> -extern __typeof (wcsnlen) __wcsnlen_sse2;
> +extern __typeof (wcsnlen) __wcsnlen_generic;
>  #endif
>
>  #include "wcsmbs/wcsnlen.c"
> diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
> index baa26666a8..05b7a211de 100644
> --- a/sysdeps/x86_64/multiarch/wcsnlen.c
> +++ b/sysdeps/x86_64/multiarch/wcsnlen.c
> @@ -24,6 +24,7 @@
>  # undef __wcsnlen
>
>  # define SYMBOL_NAME wcsnlen
> +# define GENERIC generic
>  # include "ifunc-wcslen.h"
>
>  libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ());
> --
> 2.34.1
>
Noah Goldstein June 10, 2022, 1:26 a.m. UTC | #2
On Thu, Jun 9, 2022 at 6:20 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Jun 9, 2022 at 5:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > No functions are changed. It just renames generic implementations from
> > '{func}_sse2' to '{func}_generic'. This is just because the postfix
> > "_sse2" was overloaded and was used for files that had hand-optimized
> > sse2 assembly implementations and files that just redirected back
> > to the generic implementation.
>
> This change isn't small, and its benefit is very small.  Can it be part of
> a larger change to support building glibc with the following option?
>
> -march=x86-64-vN

kk
>
> > Full xcheck passed on x86_64.
> > ---
> >  sysdeps/x86_64/multiarch/Makefile             |  15 +-
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  16 +-
> >  sysdeps/x86_64/multiarch/ifunc-sse4_2.h       |   4 +-
> >  sysdeps/x86_64/multiarch/ifunc-strcpy.h       |   8 +-
> >  sysdeps/x86_64/multiarch/ifunc-wcslen.h       |   8 +-
> >  sysdeps/x86_64/multiarch/stpncpy-c.c          |   2 +-
> >  sysdeps/x86_64/multiarch/stpncpy.c            |   1 +
> >  sysdeps/x86_64/multiarch/strcspn-c-sse4.c     | 163 ++++++++++++++++++
> >  sysdeps/x86_64/multiarch/strcspn-c.c          | 151 +---------------
> >  sysdeps/x86_64/multiarch/strcspn-sse2.c       |  28 ---
> >  sysdeps/x86_64/multiarch/strncat-c.c          |   2 +-
> >  sysdeps/x86_64/multiarch/strncat.c            |   1 +
> >  sysdeps/x86_64/multiarch/strncpy-c.c          |   2 +-
> >  sysdeps/x86_64/multiarch/strncpy.c            |   1 +
> >  .../{strspn-sse2.c => strpbrk-c-sse4.c}       |  18 +-
> >  sysdeps/x86_64/multiarch/strpbrk-c.c          |  18 +-
> >  sysdeps/x86_64/multiarch/strpbrk-sse2.c       |  28 ---
> >  sysdeps/x86_64/multiarch/strspn-c-sse4.c      | 136 +++++++++++++++
> >  sysdeps/x86_64/multiarch/strspn-c.c           | 126 +-------------
> >  sysdeps/x86_64/multiarch/wcscpy-c.c           |   2 +-
> >  sysdeps/x86_64/multiarch/wcscpy.c             |   4 +-
> >  sysdeps/x86_64/multiarch/wcsnlen-c.c          |   4 +-
> >  sysdeps/x86_64/multiarch/wcsnlen.c            |   1 +
> >  23 files changed, 376 insertions(+), 363 deletions(-)
> >  create mode 100644 sysdeps/x86_64/multiarch/strcspn-c-sse4.c
> >  delete mode 100644 sysdeps/x86_64/multiarch/strcspn-sse2.c
> >  rename sysdeps/x86_64/multiarch/{strspn-sse2.c => strpbrk-c-sse4.c} (74%)
> >  delete mode 100644 sysdeps/x86_64/multiarch/strpbrk-sse2.c
> >  create mode 100644 sysdeps/x86_64/multiarch/strspn-c-sse4.c
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index 3d153cac35..86c6ecdfc1 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -77,7 +77,7 @@ sysdep_routines += \
> >    strcpy-sse2 \
> >    strcpy-sse2-unaligned \
> >    strcspn-c \
> > -  strcspn-sse2 \
> > +  strcspn-c-sse4 \
> >    strlen-avx2 \
> >    strlen-avx2-rtm \
> >    strlen-evex \
> > @@ -109,21 +109,22 @@ sysdep_routines += \
> >    strnlen-evex512 \
> >    strnlen-sse2 \
> >    strpbrk-c \
> > -  strpbrk-sse2 \
> > +  strpbrk-c-sse4 \
> >    strrchr-avx2 \
> >    strrchr-avx2-rtm \
> >    strrchr-evex \
> >    strrchr-sse2 \
> >    strspn-c \
> > -  strspn-sse2 \
> > +  strspn-c-sse4 \
> >    strstr-avx512 \
> >    strstr-sse2-unaligned \
> >    varshift \
> >  # sysdep_routines
> > -CFLAGS-varshift.c += -msse4
> > -CFLAGS-strcspn-c.c += -msse4
> > -CFLAGS-strpbrk-c.c += -msse4
> > -CFLAGS-strspn-c.c += -msse4
> > +
> > +CFLAGS-strcspn-c-sse4.c += -msse4
> > +CFLAGS-strpbrk-c-sse4.c += -msse4
> > +CFLAGS-strspn-c-sse4.c += -msse4
> > +
> >  CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
> >  endif
> >
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index 58f3ec8306..4cbd200d39 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -372,7 +372,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                               __stpncpy_evex)
> >               IFUNC_IMPL_ADD (array, i, stpncpy, 1,
> >                               __stpncpy_sse2_unaligned)
> > -             IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2))
> > +             IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_generic))
> >
> >    /* Support sysdeps/x86_64/multiarch/stpcpy.c.  */
> >    IFUNC_IMPL (i, name, stpcpy,
> > @@ -531,7 +531,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >    IFUNC_IMPL (i, name, strcspn,
> >               IFUNC_IMPL_ADD (array, i, strcspn, CPU_FEATURE_USABLE (SSE4_2),
> >                               __strcspn_sse42)
> > -             IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_sse2))
> > +             IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_generic))
> >
> >    /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
> >    IFUNC_IMPL (i, name, strncasecmp,
> > @@ -585,7 +585,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                               __strncat_evex)
> >               IFUNC_IMPL_ADD (array, i, strncat, 1,
> >                               __strncat_sse2_unaligned)
> > -             IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2))
> > +             IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_generic))
> >
> >    /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
> >    IFUNC_IMPL (i, name, strncpy,
> > @@ -601,20 +601,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                               __strncpy_evex)
> >               IFUNC_IMPL_ADD (array, i, strncpy, 1,
> >                               __strncpy_sse2_unaligned)
> > -             IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
> > +             IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_generic))
> >
> >    /* Support sysdeps/x86_64/multiarch/strpbrk.c.  */
> >    IFUNC_IMPL (i, name, strpbrk,
> >               IFUNC_IMPL_ADD (array, i, strpbrk, CPU_FEATURE_USABLE (SSE4_2),
> >                               __strpbrk_sse42)
> > -             IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_sse2))
> > +             IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_generic))
> >
> >
> >    /* Support sysdeps/x86_64/multiarch/strspn.c.  */
> >    IFUNC_IMPL (i, name, strspn,
> >               IFUNC_IMPL_ADD (array, i, strspn, CPU_FEATURE_USABLE (SSE4_2),
> >                               __strspn_sse42)
> > -             IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_sse2))
> > +             IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_generic))
> >
> >    /* Support sysdeps/x86_64/multiarch/strstr.c.  */
> >    IFUNC_IMPL (i, name, strstr,
> > @@ -697,7 +697,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >    IFUNC_IMPL (i, name, wcscpy,
> >               IFUNC_IMPL_ADD (array, i, wcscpy, CPU_FEATURE_USABLE (SSSE3),
> >                               __wcscpy_ssse3)
> > -             IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2))
> > +             IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_generic))
> >
> >    /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
> >    IFUNC_IMPL (i, name, wcslen,
> > @@ -749,7 +749,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >               IFUNC_IMPL_ADD (array, i, wcsnlen,
> >                               CPU_FEATURE_USABLE (SSE4_1),
> >                               __wcsnlen_sse4_1)
> > -             IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_sse2))
> > +             IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_generic))
> >
> >    /* Support sysdeps/x86_64/multiarch/wmemchr.c.  */
> >    IFUNC_IMPL (i, name, wmemchr,
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-sse4_2.h b/sysdeps/x86_64/multiarch/ifunc-sse4_2.h
> > index b555ff2fac..ee36525bcf 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-sse4_2.h
> > +++ b/sysdeps/x86_64/multiarch/ifunc-sse4_2.h
> > @@ -19,7 +19,7 @@
> >
> >  #include <init-arch.h>
> >
> > -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
> >
> >  static inline void *
> > @@ -30,5 +30,5 @@ IFUNC_SELECTOR (void)
> >    if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2))
> >      return OPTIMIZE (sse42);
> >
> > -  return OPTIMIZE (sse2);
> > +  return OPTIMIZE (generic);
> >  }
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > index a15afa44e9..80529458d1 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > @@ -20,7 +20,11 @@
> >
> >  #include <init-arch.h>
> >
> > -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> > +#ifndef GENERIC
> > +# define GENERIC sse2
> > +#endif
> > +
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> >    attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > @@ -49,5 +53,5 @@ IFUNC_SELECTOR (void)
> >    if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> >      return OPTIMIZE (sse2_unaligned);
> >
> > -  return OPTIMIZE (sse2);
> > +  return OPTIMIZE (GENERIC);
> >  }
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
> > index 2b29e7608a..88c1c502af 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-wcslen.h
> > +++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
> > @@ -19,7 +19,11 @@
> >
> >  #include <init-arch.h>
> >
> > -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> > +#ifndef GENERIC
> > +# define GENERIC sse2
> > +#endif
> > +
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> > @@ -48,5 +52,5 @@ IFUNC_SELECTOR (void)
> >    if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
> >      return OPTIMIZE (sse4_1);
> >
> > -  return OPTIMIZE (sse2);
> > +  return OPTIMIZE (GENERIC);
> >  }
> > diff --git a/sysdeps/x86_64/multiarch/stpncpy-c.c b/sysdeps/x86_64/multiarch/stpncpy-c.c
> > index b016e487e1..eb62fcf388 100644
> > --- a/sysdeps/x86_64/multiarch/stpncpy-c.c
> > +++ b/sysdeps/x86_64/multiarch/stpncpy-c.c
> > @@ -1,4 +1,4 @@
> > -#define STPNCPY __stpncpy_sse2
> > +#define STPNCPY __stpncpy_generic
> >  #undef weak_alias
> >  #define weak_alias(ignored1, ignored2)
> >  #undef libc_hidden_def
> > diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c
> > index 82fa53957d..879bc83f0b 100644
> > --- a/sysdeps/x86_64/multiarch/stpncpy.c
> > +++ b/sysdeps/x86_64/multiarch/stpncpy.c
> > @@ -25,6 +25,7 @@
> >  # undef stpncpy
> >  # undef __stpncpy
> >
> > +# define GENERIC generic
> >  # define SYMBOL_NAME stpncpy
> >  # include "ifunc-strcpy.h"
> >
> > diff --git a/sysdeps/x86_64/multiarch/strcspn-c-sse4.c b/sysdeps/x86_64/multiarch/strcspn-c-sse4.c
> > new file mode 100644
> > index 0000000000..59f64f9fe8
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strcspn-c-sse4.c
> > @@ -0,0 +1,163 @@
> > +/* strcspn with SSE4.2 intrinsics
> > +   Copyright (C) 2009-2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <nmmintrin.h>
> > +#include <string.h>
> > +#include "varshift.h"
> > +
> > +/* We use 0x2:
> > +       _SIDD_SBYTE_OPS
> > +       | _SIDD_CMP_EQUAL_ANY
> > +       | _SIDD_POSITIVE_POLARITY
> > +       | _SIDD_LEAST_SIGNIFICANT
> > +   on pcmpistri to compare xmm/mem128
> > +
> > +   0 1 2 3 4 5 6 7 8 9 A B C D E F
> > +   X X X X X X X X X X X X X X X X
> > +
> > +   against xmm
> > +
> > +   0 1 2 3 4 5 6 7 8 9 A B C D E F
> > +   A A A A A A A A A A A A A A A A
> > +
> > +   to find out if the first 16byte data element has any byte A and
> > +   the offset of the first byte.  There are 3 cases:
> > +
> > +   1. The first 16byte data element has the byte A at the offset X.
> > +   2. The first 16byte data element has EOS and doesn't have the byte A.
> > +   3. The first 16byte data element is valid and doesn't have the byte A.
> > +
> > +   Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
> > +
> > +    1           X        1      0/1      0
> > +    2          16        0       1       0
> > +    3          16        0       0       0
> > +
> > +   We exit from the loop for cases 1 and 2 with jbe which branches
> > +   when either CFlag or ZFlag is 1.  If CFlag == 1, ECX has the offset
> > +   X for case 1.  */
> > +
> > +#ifndef STRCSPN_GENERIC
> > +# define STRCSPN_GENERIC __strcspn_generic
> > +# define STRCSPN_SSE42 __strcspn_sse42
> > +#endif
> > +
> > +#ifdef USE_AS_STRPBRK
> > +# define RETURN(val1, val2) return val1
> > +#else
> > +# define RETURN(val1, val2) return val2
> > +#endif
> > +
> > +extern
> > +#ifdef USE_AS_STRPBRK
> > +char *
> > +#else
> > +size_t
> > +#endif
> > +STRCSPN_GENERIC (const char *, const char *) attribute_hidden;
> > +
> > +
> > +#ifdef USE_AS_STRPBRK
> > +char *
> > +#else
> > +size_t
> > +#endif
> > +__attribute__ ((section (".text.sse4.2")))
> > +STRCSPN_SSE42 (const char *s, const char *a)
> > +{
> > +  if (*a == 0)
> > +    RETURN (NULL, strlen (s));
> > +
> > +  const char *aligned;
> > +  __m128i mask, maskz, zero;
> > +  unsigned int maskz_bits;
> > +  unsigned int offset = (unsigned int) ((size_t) a & 15);
> > +  zero = _mm_set1_epi8 (0);
> > +  if (offset != 0)
> > +    {
> > +      /* Load masks.  */
> > +      aligned = (const char *) ((size_t) a & -16L);
> > +      __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> > +      maskz = _mm_cmpeq_epi8 (mask0, zero);
> > +
> > +      /* Find where the NULL terminator is.  */
> > +      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> > +      if (maskz_bits != 0)
> > +        {
> > +          mask = __m128i_shift_right (mask0, offset);
> > +          offset = (unsigned int) ((size_t) s & 15);
> > +          if (offset)
> > +            goto start_unaligned;
> > +
> > +          aligned = s;
> > +          goto start_loop;
> > +        }
> > +    }
> > +
> > +  /* A is aligned.  */
> > +  mask = _mm_loadu_si128 ((__m128i *) a);
> > +  /* Find where the NULL terminator is.  */
> > +  maskz = _mm_cmpeq_epi8 (mask, zero);
> > +  maskz_bits = _mm_movemask_epi8 (maskz);
> > +  if (maskz_bits == 0)
> > +    {
> > +      /* There is no NULL terminator.  Don't use SSE4.2 if the length
> > +         of A > 16.  */
> > +      if (a[16] != 0)
> > +        return STRCSPN_GENERIC (s, a);
> > +    }
> > +
> > +  aligned = s;
> > +  offset = (unsigned int) ((size_t) s & 15);
> > +  if (offset != 0)
> > +    {
> > +    start_unaligned:
> > +      /* Check partial string.  */
> > +      aligned = (const char *) ((size_t) s & -16L);
> > +      __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > +
> > +      value = __m128i_shift_right (value, offset);
> > +
> > +      unsigned int length = _mm_cmpistri (mask, value, 0x2);
> > +      /* No need to check ZFlag since ZFlag is always 1.  */
> > +      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> > +      if (cflag)
> > +       RETURN ((char *) (s + length), length);
> > +      /* Find where the NULL terminator is.  */
> > +      unsigned int index = _mm_cmpistri (value, value, 0x3a);
> > +      if (index < 16 - offset)
> > +       RETURN (NULL, index);
> > +      aligned += 16;
> > +    }
> > +
> > +start_loop:
> > +  while (1)
> > +    {
> > +      __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > +      unsigned int index = _mm_cmpistri (mask, value, 0x2);
> > +      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> > +      unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
> > +      if (cflag)
> > +       RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
> > +      if (zflag)
> > +       RETURN (NULL,
> > +               /* Find where the NULL terminator is.  */
> > +               (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s));
> > +      aligned += 16;
> > +    }
> > +}
> > diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
> > index c312fab8b1..423de2e2b2 100644
> > --- a/sysdeps/x86_64/multiarch/strcspn-c.c
> > +++ b/sysdeps/x86_64/multiarch/strcspn-c.c
> > @@ -1,5 +1,5 @@
> > -/* strcspn with SSE4.2 intrinsics
> > -   Copyright (C) 2009-2022 Free Software Foundation, Inc.
> > +/* strcspn.
> > +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> >     This file is part of the GNU C Library.
> >
> >     The GNU C Library is free software; you can redistribute it and/or
> > @@ -16,148 +16,13 @@
> >     License along with the GNU C Library; if not, see
> >     <https://www.gnu.org/licenses/>.  */
> >
> > -#include <nmmintrin.h>
> > -#include <string.h>
> > -#include "varshift.h"
> > +#if IS_IN (libc)
> >
> > -/* We use 0x2:
> > -       _SIDD_SBYTE_OPS
> > -       | _SIDD_CMP_EQUAL_ANY
> > -       | _SIDD_POSITIVE_POLARITY
> > -       | _SIDD_LEAST_SIGNIFICANT
> > -   on pcmpistri to compare xmm/mem128
> > +# include <sysdep.h>
> > +# define STRCSPN __strcspn_generic
> >
> > -   0 1 2 3 4 5 6 7 8 9 A B C D E F
> > -   X X X X X X X X X X X X X X X X
> > -
> > -   against xmm
> > -
> > -   0 1 2 3 4 5 6 7 8 9 A B C D E F
> > -   A A A A A A A A A A A A A A A A
> > -
> > -   to find out if the first 16byte data element has any byte A and
> > -   the offset of the first byte.  There are 3 cases:
> > -
> > -   1. The first 16byte data element has the byte A at the offset X.
> > -   2. The first 16byte data element has EOS and doesn't have the byte A.
> > -   3. The first 16byte data element is valid and doesn't have the byte A.
> > -
> > -   Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
> > -
> > -    1           X        1      0/1      0
> > -    2          16        0       1       0
> > -    3          16        0       0       0
> > -
> > -   We exit from the loop for cases 1 and 2 with jbe which branches
> > -   when either CFlag or ZFlag is 1.  If CFlag == 1, ECX has the offset
> > -   X for case 1.  */
> > -
> > -#ifndef STRCSPN_SSE2
> > -# define STRCSPN_SSE2 __strcspn_sse2
> > -# define STRCSPN_SSE42 __strcspn_sse42
> > -#endif
> > -
> > -#ifdef USE_AS_STRPBRK
> > -# define RETURN(val1, val2) return val1
> > -#else
> > -# define RETURN(val1, val2) return val2
> > -#endif
> > -
> > -extern
> > -#ifdef USE_AS_STRPBRK
> > -char *
> > -#else
> > -size_t
> > -#endif
> > -STRCSPN_SSE2 (const char *, const char *) attribute_hidden;
> > -
> > -
> > -#ifdef USE_AS_STRPBRK
> > -char *
> > -#else
> > -size_t
> > +# undef libc_hidden_builtin_def
> > +# define libc_hidden_builtin_def(STRCSPN)
> >  #endif
> > -__attribute__ ((section (".text.sse4.2")))
> > -STRCSPN_SSE42 (const char *s, const char *a)
> > -{
> > -  if (*a == 0)
> > -    RETURN (NULL, strlen (s));
> > -
> > -  const char *aligned;
> > -  __m128i mask, maskz, zero;
> > -  unsigned int maskz_bits;
> > -  unsigned int offset = (unsigned int) ((size_t) a & 15);
> > -  zero = _mm_set1_epi8 (0);
> > -  if (offset != 0)
> > -    {
> > -      /* Load masks.  */
> > -      aligned = (const char *) ((size_t) a & -16L);
> > -      __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> > -      maskz = _mm_cmpeq_epi8 (mask0, zero);
> > -
> > -      /* Find where the NULL terminator is.  */
> > -      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> > -      if (maskz_bits != 0)
> > -        {
> > -          mask = __m128i_shift_right (mask0, offset);
> > -          offset = (unsigned int) ((size_t) s & 15);
> > -          if (offset)
> > -            goto start_unaligned;
> > -
> > -          aligned = s;
> > -          goto start_loop;
> > -        }
> > -    }
> > -
> > -  /* A is aligned.  */
> > -  mask = _mm_loadu_si128 ((__m128i *) a);
> > -  /* Find where the NULL terminator is.  */
> > -  maskz = _mm_cmpeq_epi8 (mask, zero);
> > -  maskz_bits = _mm_movemask_epi8 (maskz);
> > -  if (maskz_bits == 0)
> > -    {
> > -      /* There is no NULL terminator.  Don't use SSE4.2 if the length
> > -         of A > 16.  */
> > -      if (a[16] != 0)
> > -        return STRCSPN_SSE2 (s, a);
> > -    }
> > -
> > -  aligned = s;
> > -  offset = (unsigned int) ((size_t) s & 15);
> > -  if (offset != 0)
> > -    {
> > -    start_unaligned:
> > -      /* Check partial string.  */
> > -      aligned = (const char *) ((size_t) s & -16L);
> > -      __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > -
> > -      value = __m128i_shift_right (value, offset);
> > -
> > -      unsigned int length = _mm_cmpistri (mask, value, 0x2);
> > -      /* No need to check ZFlag since ZFlag is always 1.  */
> > -      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> > -      if (cflag)
> > -       RETURN ((char *) (s + length), length);
> > -      /* Find where the NULL terminator is.  */
> > -      unsigned int index = _mm_cmpistri (value, value, 0x3a);
> > -      if (index < 16 - offset)
> > -       RETURN (NULL, index);
> > -      aligned += 16;
> > -    }
> >
> > -start_loop:
> > -  while (1)
> > -    {
> > -      __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > -      unsigned int index = _mm_cmpistri (mask, value, 0x2);
> > -      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> > -      unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
> > -      if (cflag)
> > -       RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
> > -      if (zflag)
> > -       RETURN (NULL,
> > -               /* Find where the NULL terminator is.  */
> > -               (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s));
> > -      aligned += 16;
> > -    }
> > -}
> > +#include <string/strcspn.c>
> > diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.c b/sysdeps/x86_64/multiarch/strcspn-sse2.c
> > deleted file mode 100644
> > index 3a04bb39fc..0000000000
> > --- a/sysdeps/x86_64/multiarch/strcspn-sse2.c
> > +++ /dev/null
> > @@ -1,28 +0,0 @@
> > -/* strcspn.
> > -   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <https://www.gnu.org/licenses/>.  */
> > -
> > -#if IS_IN (libc)
> > -
> > -# include <sysdep.h>
> > -# define STRCSPN __strcspn_sse2
> > -
> > -# undef libc_hidden_builtin_def
> > -# define libc_hidden_builtin_def(STRCSPN)
> > -#endif
> > -
> > -#include <string/strcspn.c>
> > diff --git a/sysdeps/x86_64/multiarch/strncat-c.c b/sysdeps/x86_64/multiarch/strncat-c.c
> > index 93a7fab7ea..b729c033d9 100644
> > --- a/sysdeps/x86_64/multiarch/strncat-c.c
> > +++ b/sysdeps/x86_64/multiarch/strncat-c.c
> > @@ -1,2 +1,2 @@
> > -#define STRNCAT __strncat_sse2
> > +#define STRNCAT __strncat_generic
> >  #include <string/strncat.c>
> > diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c
> > index b649343a97..50fba8a41f 100644
> > --- a/sysdeps/x86_64/multiarch/strncat.c
> > +++ b/sysdeps/x86_64/multiarch/strncat.c
> > @@ -24,6 +24,7 @@
> >  # undef strncat
> >
> >  # define SYMBOL_NAME strncat
> > +# define GENERIC generic
> >  # include "ifunc-strcpy.h"
> >
> >  libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ());
> > diff --git a/sysdeps/x86_64/multiarch/strncpy-c.c b/sysdeps/x86_64/multiarch/strncpy-c.c
> > index 57c45ac7ab..183b0b8e0f 100644
> > --- a/sysdeps/x86_64/multiarch/strncpy-c.c
> > +++ b/sysdeps/x86_64/multiarch/strncpy-c.c
> > @@ -1,4 +1,4 @@
> > -#define STRNCPY __strncpy_sse2
> > +#define STRNCPY __strncpy_generic
> >  #undef libc_hidden_builtin_def
> >  #define libc_hidden_builtin_def(strncpy)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c
> > index 2a780a7e16..7fc7d72ec5 100644
> > --- a/sysdeps/x86_64/multiarch/strncpy.c
> > +++ b/sysdeps/x86_64/multiarch/strncpy.c
> > @@ -24,6 +24,7 @@
> >  # undef strncpy
> >
> >  # define SYMBOL_NAME strncpy
> > +# define GENERIC generic
> >  # include "ifunc-strcpy.h"
> >
> >  libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ());
> > diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.c b/sysdeps/x86_64/multiarch/strpbrk-c-sse4.c
> > similarity index 74%
> > rename from sysdeps/x86_64/multiarch/strspn-sse2.c
> > rename to sysdeps/x86_64/multiarch/strpbrk-c-sse4.c
> > index 61cc6cb0a5..8700276773 100644
> > --- a/sysdeps/x86_64/multiarch/strspn-sse2.c
> > +++ b/sysdeps/x86_64/multiarch/strpbrk-c-sse4.c
> > @@ -1,5 +1,5 @@
> > -/* strspn.
> > -   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > +/* strpbrk with SSE4.2 intrinsics
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> >     This file is part of the GNU C Library.
> >
> >     The GNU C Library is free software; you can redistribute it and/or
> > @@ -16,13 +16,7 @@
> >     License along with the GNU C Library; if not, see
> >     <https://www.gnu.org/licenses/>.  */
> >
> > -#if IS_IN (libc)
> > -
> > -# include <sysdep.h>
> > -# define STRSPN __strspn_sse2
> > -
> > -# undef libc_hidden_builtin_def
> > -# define libc_hidden_builtin_def(STRSPN)
> > -#endif
> > -
> > -#include <string/strspn.c>
> > +#define USE_AS_STRPBRK
> > +#define STRCSPN_GENERIC __strpbrk_generic
> > +#define STRCSPN_SSE42 __strpbrk_sse42
> > +#include "strcspn-c-sse4.c"
> > diff --git a/sysdeps/x86_64/multiarch/strpbrk-c.c b/sysdeps/x86_64/multiarch/strpbrk-c.c
> > index abf4ff7f1a..d31acfe495 100644
> > --- a/sysdeps/x86_64/multiarch/strpbrk-c.c
> > +++ b/sysdeps/x86_64/multiarch/strpbrk-c.c
> > @@ -1,5 +1,5 @@
> > -/* strpbrk with SSE4.2 intrinsics
> > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > +/* strpbrk.
> > +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> >     This file is part of the GNU C Library.
> >
> >     The GNU C Library is free software; you can redistribute it and/or
> > @@ -16,7 +16,13 @@
> >     License along with the GNU C Library; if not, see
> >     <https://www.gnu.org/licenses/>.  */
> >
> > -#define USE_AS_STRPBRK
> > -#define STRCSPN_SSE2 __strpbrk_sse2
> > -#define STRCSPN_SSE42 __strpbrk_sse42
> > -#include "strcspn-c.c"
> > +#if IS_IN (libc)
> > +
> > +# include <sysdep.h>
> > +# define STRPBRK __strpbrk_generic
> > +
> > +# undef libc_hidden_builtin_def
> > +# define libc_hidden_builtin_def(STRPBRK)
> > +#endif
> > +
> > +#include <string/strpbrk.c>
> > diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.c b/sysdeps/x86_64/multiarch/strpbrk-sse2.c
> > deleted file mode 100644
> > index d03214c4fb..0000000000
> > --- a/sysdeps/x86_64/multiarch/strpbrk-sse2.c
> > +++ /dev/null
> > @@ -1,28 +0,0 @@
> > -/* strpbrk.
> > -   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <https://www.gnu.org/licenses/>.  */
> > -
> > -#if IS_IN (libc)
> > -
> > -# include <sysdep.h>
> > -# define STRPBRK __strpbrk_sse2
> > -
> > -# undef libc_hidden_builtin_def
> > -# define libc_hidden_builtin_def(STRPBRK)
> > -#endif
> > -
> > -#include <string/strpbrk.c>
> > diff --git a/sysdeps/x86_64/multiarch/strspn-c-sse4.c b/sysdeps/x86_64/multiarch/strspn-c-sse4.c
> > new file mode 100644
> > index 0000000000..d044916688
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strspn-c-sse4.c
> > @@ -0,0 +1,136 @@
> > +/* strspn with SSE4.2 intrinsics
> > +   Copyright (C) 2009-2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <nmmintrin.h>
> > +#include <string.h>
> > +#include "varshift.h"
> > +
> > +/* We use 0x12:
> > +       _SIDD_SBYTE_OPS
> > +       | _SIDD_CMP_EQUAL_ANY
> > +       | _SIDD_NEGATIVE_POLARITY
> > +       | _SIDD_LEAST_SIGNIFICANT
> > +   on pcmpistri to compare xmm/mem128
> > +
> > +   0 1 2 3 4 5 6 7 8 9 A B C D E F
> > +   X X X X X X X X X X X X X X X X
> > +
> > +   against xmm
> > +
> > +   0 1 2 3 4 5 6 7 8 9 A B C D E F
> > +   A A A A A A A A A A A A A A A A
> > +
> > +   to find out if the first 16byte data element has any non-A byte and
> > +   the offset of the first byte.  There are 2 cases:
> > +
> > +   1. The first 16byte data element has the non-A byte, including
> > +      EOS, at the offset X.
> > +   2. The first 16byte data element is valid and doesn't have the non-A
> > +      byte.
> > +
> > +   Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
> > +
> > +   case                ECX     CFlag   ZFlag   SFlag
> > +    1           X        1      0/1      0
> > +    2          16        0       0       0
> > +
> > +   We exit from the loop for case 1.  */
> > +
> > +extern size_t __strspn_generic (const char *, const char *) attribute_hidden;
> > +
> > +
> > +size_t
> > +__attribute__ ((section (".text.sse4.2")))
> > +__strspn_sse42 (const char *s, const char *a)
> > +{
> > +  if (*a == 0)
> > +    return 0;
> > +
> > +  const char *aligned;
> > +  __m128i mask, maskz, zero;
> > +  unsigned int maskz_bits;
> > +  unsigned int offset = (int) ((size_t) a & 15);
> > +  zero = _mm_set1_epi8 (0);
> > +  if (offset != 0)
> > +    {
> > +      /* Load masks.  */
> > +      aligned = (const char *) ((size_t) a & -16L);
> > +      __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> > +      maskz = _mm_cmpeq_epi8 (mask0, zero);
> > +
> > +      /* Find where the NULL terminator is.  */
> > +      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> > +      if (maskz_bits != 0)
> > +        {
> > +          mask = __m128i_shift_right (mask0, offset);
> > +          offset = (unsigned int) ((size_t) s & 15);
> > +          if (offset)
> > +            goto start_unaligned;
> > +
> > +          aligned = s;
> > +          goto start_loop;
> > +        }
> > +    }
> > +
> > +  /* A is aligned.  */
> > +  mask = _mm_loadu_si128 ((__m128i *) a);
> > +
> > +  /* Find where the NULL terminator is.  */
> > +  maskz = _mm_cmpeq_epi8 (mask, zero);
> > +  maskz_bits = _mm_movemask_epi8 (maskz);
> > +  if (maskz_bits == 0)
> > +    {
> > +      /* There is no NULL terminator.  Don't use SSE4.2 if the length
> > +         of A > 16.  */
> > +      if (a[16] != 0)
> > +        return __strspn_generic (s, a);
> > +    }
> > +  aligned = s;
> > +  offset = (unsigned int) ((size_t) s & 15);
> > +
> > +  if (offset != 0)
> > +    {
> > +    start_unaligned:
> > +      /* Check partial string.  */
> > +      aligned = (const char *) ((size_t) s & -16L);
> > +      __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > +      __m128i adj_value = __m128i_shift_right (value, offset);
> > +
> > +      unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
> > +      /* No need to check CFlag since it is always 1.  */
> > +      if (length < 16 - offset)
> > +       return length;
> > +      /* Find where the NULL terminator is.  */
> > +      maskz = _mm_cmpeq_epi8 (value, zero);
> > +      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> > +      if (maskz_bits != 0)
> > +       return length;
> > +      aligned += 16;
> > +    }
> > +
> > +start_loop:
> > +  while (1)
> > +    {
> > +      __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > +      unsigned int index = _mm_cmpistri (mask, value, 0x12);
> > +      unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
> > +      if (cflag)
> > +       return (size_t) (aligned + index - s);
> > +      aligned += 16;
> > +    }
> > +}
> > diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
> > index 6124033ceb..6b50c36432 100644
> > --- a/sysdeps/x86_64/multiarch/strspn-c.c
> > +++ b/sysdeps/x86_64/multiarch/strspn-c.c
> > @@ -1,5 +1,5 @@
> > -/* strspn with SSE4.2 intrinsics
> > -   Copyright (C) 2009-2022 Free Software Foundation, Inc.
> > +/* strspn.
> > +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> >     This file is part of the GNU C Library.
> >
> >     The GNU C Library is free software; you can redistribute it and/or
> > @@ -16,121 +16,13 @@
> >     License along with the GNU C Library; if not, see
> >     <https://www.gnu.org/licenses/>.  */
> >
> > -#include <nmmintrin.h>
> > -#include <string.h>
> > -#include "varshift.h"
> > +#if IS_IN (libc)
> >
> > -/* We use 0x12:
> > -       _SIDD_SBYTE_OPS
> > -       | _SIDD_CMP_EQUAL_ANY
> > -       | _SIDD_NEGATIVE_POLARITY
> > -       | _SIDD_LEAST_SIGNIFICANT
> > -   on pcmpistri to compare xmm/mem128
> > +# include <sysdep.h>
> > +# define STRSPN __strspn_generic
> >
> > -   0 1 2 3 4 5 6 7 8 9 A B C D E F
> > -   X X X X X X X X X X X X X X X X
> > +# undef libc_hidden_builtin_def
> > +# define libc_hidden_builtin_def(STRSPN)
> > +#endif
> >
> > -   against xmm
> > -
> > -   0 1 2 3 4 5 6 7 8 9 A B C D E F
> > -   A A A A A A A A A A A A A A A A
> > -
> > -   to find out if the first 16byte data element has any non-A byte and
> > -   the offset of the first byte.  There are 2 cases:
> > -
> > -   1. The first 16byte data element has the non-A byte, including
> > -      EOS, at the offset X.
> > -   2. The first 16byte data element is valid and doesn't have the non-A
> > -      byte.
> > -
> > -   Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
> > -
> > -   case                ECX     CFlag   ZFlag   SFlag
> > -    1           X        1      0/1      0
> > -    2          16        0       0       0
> > -
> > -   We exit from the loop for case 1.  */
> > -
> > -extern size_t __strspn_sse2 (const char *, const char *) attribute_hidden;
> > -
> > -
> > -size_t
> > -__attribute__ ((section (".text.sse4.2")))
> > -__strspn_sse42 (const char *s, const char *a)
> > -{
> > -  if (*a == 0)
> > -    return 0;
> > -
> > -  const char *aligned;
> > -  __m128i mask, maskz, zero;
> > -  unsigned int maskz_bits;
> > -  unsigned int offset = (int) ((size_t) a & 15);
> > -  zero = _mm_set1_epi8 (0);
> > -  if (offset != 0)
> > -    {
> > -      /* Load masks.  */
> > -      aligned = (const char *) ((size_t) a & -16L);
> > -      __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> > -      maskz = _mm_cmpeq_epi8 (mask0, zero);
> > -
> > -      /* Find where the NULL terminator is.  */
> > -      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> > -      if (maskz_bits != 0)
> > -        {
> > -          mask = __m128i_shift_right (mask0, offset);
> > -          offset = (unsigned int) ((size_t) s & 15);
> > -          if (offset)
> > -            goto start_unaligned;
> > -
> > -          aligned = s;
> > -          goto start_loop;
> > -        }
> > -    }
> > -
> > -  /* A is aligned.  */
> > -  mask = _mm_loadu_si128 ((__m128i *) a);
> > -
> > -  /* Find where the NULL terminator is.  */
> > -  maskz = _mm_cmpeq_epi8 (mask, zero);
> > -  maskz_bits = _mm_movemask_epi8 (maskz);
> > -  if (maskz_bits == 0)
> > -    {
> > -      /* There is no NULL terminator.  Don't use SSE4.2 if the length
> > -         of A > 16.  */
> > -      if (a[16] != 0)
> > -        return __strspn_sse2 (s, a);
> > -    }
> > -  aligned = s;
> > -  offset = (unsigned int) ((size_t) s & 15);
> > -
> > -  if (offset != 0)
> > -    {
> > -    start_unaligned:
> > -      /* Check partial string.  */
> > -      aligned = (const char *) ((size_t) s & -16L);
> > -      __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > -      __m128i adj_value = __m128i_shift_right (value, offset);
> > -
> > -      unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
> > -      /* No need to check CFlag since it is always 1.  */
> > -      if (length < 16 - offset)
> > -       return length;
> > -      /* Find where the NULL terminator is.  */
> > -      maskz = _mm_cmpeq_epi8 (value, zero);
> > -      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> > -      if (maskz_bits != 0)
> > -       return length;
> > -      aligned += 16;
> > -    }
> > -
> > -start_loop:
> > -  while (1)
> > -    {
> > -      __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > -      unsigned int index = _mm_cmpistri (mask, value, 0x12);
> > -      unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
> > -      if (cflag)
> > -       return (size_t) (aligned + index - s);
> > -      aligned += 16;
> > -    }
> > -}
> > +#include <string/strspn.c>
> > diff --git a/sysdeps/x86_64/multiarch/wcscpy-c.c b/sysdeps/x86_64/multiarch/wcscpy-c.c
> > index 26d6984e9b..fa38dd898d 100644
> > --- a/sysdeps/x86_64/multiarch/wcscpy-c.c
> > +++ b/sysdeps/x86_64/multiarch/wcscpy-c.c
> > @@ -1,5 +1,5 @@
> >  #if IS_IN (libc)
> > -# define WCSCPY  __wcscpy_sse2
> > +# define WCSCPY  __wcscpy_generic
> >  #endif
> >
> >  #include <wcsmbs/wcscpy.c>
> > diff --git a/sysdeps/x86_64/multiarch/wcscpy.c b/sysdeps/x86_64/multiarch/wcscpy.c
> > index 6a2d1421d9..53c3228dc2 100644
> > --- a/sysdeps/x86_64/multiarch/wcscpy.c
> > +++ b/sysdeps/x86_64/multiarch/wcscpy.c
> > @@ -26,7 +26,7 @@
> >  # define SYMBOL_NAME wcscpy
> >  # include <init-arch.h>
> >
> > -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> >
> >  static inline void *
> > @@ -37,7 +37,7 @@ IFUNC_SELECTOR (void)
> >    if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
> >      return OPTIMIZE (ssse3);
> >
> > -  return OPTIMIZE (sse2);
> > +  return OPTIMIZE (generic);
> >  }
> >
> >  libc_ifunc_redirected (__redirect_wcscpy, __wcscpy, IFUNC_SELECTOR ());
> > diff --git a/sysdeps/x86_64/multiarch/wcsnlen-c.c b/sysdeps/x86_64/multiarch/wcsnlen-c.c
> > index e1ec7cfbb5..1c9c04241a 100644
> > --- a/sysdeps/x86_64/multiarch/wcsnlen-c.c
> > +++ b/sysdeps/x86_64/multiarch/wcsnlen-c.c
> > @@ -1,9 +1,9 @@
> >  #if IS_IN (libc)
> >  # include <wchar.h>
> >
> > -# define WCSNLEN __wcsnlen_sse2
> > +# define WCSNLEN __wcsnlen_generic
> >
> > -extern __typeof (wcsnlen) __wcsnlen_sse2;
> > +extern __typeof (wcsnlen) __wcsnlen_generic;
> >  #endif
> >
> >  #include "wcsmbs/wcsnlen.c"
> > diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
> > index baa26666a8..05b7a211de 100644
> > --- a/sysdeps/x86_64/multiarch/wcsnlen.c
> > +++ b/sysdeps/x86_64/multiarch/wcsnlen.c
> > @@ -24,6 +24,7 @@
> >  # undef __wcsnlen
> >
> >  # define SYMBOL_NAME wcsnlen
> > +# define GENERIC generic
> >  # include "ifunc-wcslen.h"
> >
> >  libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ());
> > --
> > 2.34.1
> >
>
>
> --
> H.J.
diff mbox series

Patch

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 3d153cac35..86c6ecdfc1 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -77,7 +77,7 @@  sysdep_routines += \
   strcpy-sse2 \
   strcpy-sse2-unaligned \
   strcspn-c \
-  strcspn-sse2 \
+  strcspn-c-sse4 \
   strlen-avx2 \
   strlen-avx2-rtm \
   strlen-evex \
@@ -109,21 +109,22 @@  sysdep_routines += \
   strnlen-evex512 \
   strnlen-sse2 \
   strpbrk-c \
-  strpbrk-sse2 \
+  strpbrk-c-sse4 \
   strrchr-avx2 \
   strrchr-avx2-rtm \
   strrchr-evex \
   strrchr-sse2 \
   strspn-c \
-  strspn-sse2 \
+  strspn-c-sse4 \
   strstr-avx512 \
   strstr-sse2-unaligned \
   varshift \
 # sysdep_routines
-CFLAGS-varshift.c += -msse4
-CFLAGS-strcspn-c.c += -msse4
-CFLAGS-strpbrk-c.c += -msse4
-CFLAGS-strspn-c.c += -msse4
+
+CFLAGS-strcspn-c-sse4.c += -msse4
+CFLAGS-strpbrk-c-sse4.c += -msse4
+CFLAGS-strspn-c-sse4.c += -msse4
+
 CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
 endif
 
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 58f3ec8306..4cbd200d39 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -372,7 +372,7 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __stpncpy_evex)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1,
 			      __stpncpy_sse2_unaligned)
-	      IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2))
+	      IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_generic))
 
   /* Support sysdeps/x86_64/multiarch/stpcpy.c.  */
   IFUNC_IMPL (i, name, stpcpy,
@@ -531,7 +531,7 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, strcspn,
 	      IFUNC_IMPL_ADD (array, i, strcspn, CPU_FEATURE_USABLE (SSE4_2),
 			      __strcspn_sse42)
-	      IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_sse2))
+	      IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_generic))
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp,
@@ -585,7 +585,7 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __strncat_evex)
 	      IFUNC_IMPL_ADD (array, i, strncat, 1,
 			      __strncat_sse2_unaligned)
-	      IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2))
+	      IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_generic))
 
   /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
   IFUNC_IMPL (i, name, strncpy,
@@ -601,20 +601,20 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __strncpy_evex)
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
 			      __strncpy_sse2_unaligned)
-	      IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
+	      IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_generic))
 
   /* Support sysdeps/x86_64/multiarch/strpbrk.c.  */
   IFUNC_IMPL (i, name, strpbrk,
 	      IFUNC_IMPL_ADD (array, i, strpbrk, CPU_FEATURE_USABLE (SSE4_2),
 			      __strpbrk_sse42)
-	      IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_sse2))
+	      IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_generic))
 
 
   /* Support sysdeps/x86_64/multiarch/strspn.c.  */
   IFUNC_IMPL (i, name, strspn,
 	      IFUNC_IMPL_ADD (array, i, strspn, CPU_FEATURE_USABLE (SSE4_2),
 			      __strspn_sse42)
-	      IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_sse2))
+	      IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_generic))
 
   /* Support sysdeps/x86_64/multiarch/strstr.c.  */
   IFUNC_IMPL (i, name, strstr,
@@ -697,7 +697,7 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, wcscpy,
 	      IFUNC_IMPL_ADD (array, i, wcscpy, CPU_FEATURE_USABLE (SSSE3),
 			      __wcscpy_ssse3)
-	      IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2))
+	      IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_generic))
 
   /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
   IFUNC_IMPL (i, name, wcslen,
@@ -749,7 +749,7 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
 			      CPU_FEATURE_USABLE (SSE4_1),
 			      __wcsnlen_sse4_1)
-	      IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_sse2))
+	      IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_generic))
 
   /* Support sysdeps/x86_64/multiarch/wmemchr.c.  */
   IFUNC_IMPL (i, name, wmemchr,
diff --git a/sysdeps/x86_64/multiarch/ifunc-sse4_2.h b/sysdeps/x86_64/multiarch/ifunc-sse4_2.h
index b555ff2fac..ee36525bcf 100644
--- a/sysdeps/x86_64/multiarch/ifunc-sse4_2.h
+++ b/sysdeps/x86_64/multiarch/ifunc-sse4_2.h
@@ -19,7 +19,7 @@ 
 
 #include <init-arch.h>
 
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
 
 static inline void *
@@ -30,5 +30,5 @@  IFUNC_SELECTOR (void)
   if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2))
     return OPTIMIZE (sse42);
 
-  return OPTIMIZE (sse2);
+  return OPTIMIZE (generic);
 }
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
index a15afa44e9..80529458d1 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
@@ -20,7 +20,11 @@ 
 
 #include <init-arch.h>
 
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+#ifndef GENERIC
+# define GENERIC sse2
+#endif
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
@@ -49,5 +53,5 @@  IFUNC_SELECTOR (void)
   if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
     return OPTIMIZE (sse2_unaligned);
 
-  return OPTIMIZE (sse2);
+  return OPTIMIZE (GENERIC);
 }
diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
index 2b29e7608a..88c1c502af 100644
--- a/sysdeps/x86_64/multiarch/ifunc-wcslen.h
+++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
@@ -19,7 +19,11 @@ 
 
 #include <init-arch.h>
 
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+#ifndef GENERIC
+# define GENERIC sse2
+#endif
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
@@ -48,5 +52,5 @@  IFUNC_SELECTOR (void)
   if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
     return OPTIMIZE (sse4_1);
 
-  return OPTIMIZE (sse2);
+  return OPTIMIZE (GENERIC);
 }
diff --git a/sysdeps/x86_64/multiarch/stpncpy-c.c b/sysdeps/x86_64/multiarch/stpncpy-c.c
index b016e487e1..eb62fcf388 100644
--- a/sysdeps/x86_64/multiarch/stpncpy-c.c
+++ b/sysdeps/x86_64/multiarch/stpncpy-c.c
@@ -1,4 +1,4 @@ 
-#define STPNCPY __stpncpy_sse2
+#define STPNCPY __stpncpy_generic
 #undef weak_alias
 #define weak_alias(ignored1, ignored2)
 #undef libc_hidden_def
diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c
index 82fa53957d..879bc83f0b 100644
--- a/sysdeps/x86_64/multiarch/stpncpy.c
+++ b/sysdeps/x86_64/multiarch/stpncpy.c
@@ -25,6 +25,7 @@ 
 # undef stpncpy
 # undef __stpncpy
 
+# define GENERIC generic
 # define SYMBOL_NAME stpncpy
 # include "ifunc-strcpy.h"
 
diff --git a/sysdeps/x86_64/multiarch/strcspn-c-sse4.c b/sysdeps/x86_64/multiarch/strcspn-c-sse4.c
new file mode 100644
index 0000000000..59f64f9fe8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcspn-c-sse4.c
@@ -0,0 +1,163 @@ 
+/* strcspn with SSE4.2 intrinsics
+   Copyright (C) 2009-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <nmmintrin.h>
+#include <string.h>
+#include "varshift.h"
+
+/* We use 0x2:
+	_SIDD_SBYTE_OPS
+	| _SIDD_CMP_EQUAL_ANY
+	| _SIDD_POSITIVE_POLARITY
+	| _SIDD_LEAST_SIGNIFICANT
+   on pcmpistri to compare xmm/mem128
+
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   X X X X X X X X X X X X X X X X
+
+   against xmm
+
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   A A A A A A A A A A A A A A A A
+
+   to find out if the first 16byte data element has any byte A and
+   the offset of the first byte.  There are 3 cases:
+
+   1. The first 16byte data element has the byte A at the offset X.
+   2. The first 16byte data element has EOS and doesn't have the byte A.
+   3. The first 16byte data element is valid and doesn't have the byte A.
+
+   Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
+
+    1		 X	  1	 0/1	  0
+    2		16	  0	  1	  0
+    3		16	  0	  0	  0
+
+   We exit from the loop for cases 1 and 2 with jbe which branches
+   when either CFlag or ZFlag is 1.  If CFlag == 1, ECX has the offset
+   X for case 1.  */
+
+#ifndef STRCSPN_GENERIC
+# define STRCSPN_GENERIC __strcspn_generic
+# define STRCSPN_SSE42 __strcspn_sse42
+#endif
+
+#ifdef USE_AS_STRPBRK
+# define RETURN(val1, val2) return val1
+#else
+# define RETURN(val1, val2) return val2
+#endif
+
+extern
+#ifdef USE_AS_STRPBRK
+char *
+#else
+size_t
+#endif
+STRCSPN_GENERIC (const char *, const char *) attribute_hidden;
+
+
+#ifdef USE_AS_STRPBRK
+char *
+#else
+size_t
+#endif
+__attribute__ ((section (".text.sse4.2")))
+STRCSPN_SSE42 (const char *s, const char *a)
+{
+  if (*a == 0)
+    RETURN (NULL, strlen (s));
+
+  const char *aligned;
+  __m128i mask, maskz, zero;
+  unsigned int maskz_bits;
+  unsigned int offset = (unsigned int) ((size_t) a & 15);
+  zero = _mm_set1_epi8 (0);
+  if (offset != 0)
+    {
+      /* Load masks.  */
+      aligned = (const char *) ((size_t) a & -16L);
+      __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
+      maskz = _mm_cmpeq_epi8 (mask0, zero);
+
+      /* Find where the NULL terminator is.  */
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+      if (maskz_bits != 0)
+        {
+          mask = __m128i_shift_right (mask0, offset);
+          offset = (unsigned int) ((size_t) s & 15);
+          if (offset)
+            goto start_unaligned;
+
+          aligned = s;
+          goto start_loop;
+        }
+    }
+
+  /* A is aligned, or has no NUL before its next 16-byte boundary.  */
+  mask = _mm_loadu_si128 ((__m128i *) a);
+  /* Find where the NULL terminator is.  */
+  maskz = _mm_cmpeq_epi8 (mask, zero);
+  maskz_bits = _mm_movemask_epi8 (maskz);
+  if (maskz_bits == 0)
+    {
+      /* There is no NULL terminator.  Don't use SSE4.2 if the length
+         of A > 16.  */
+      if (a[16] != 0)
+        return STRCSPN_GENERIC (s, a);
+    }
+
+  aligned = s;
+  offset = (unsigned int) ((size_t) s & 15);
+  if (offset != 0)
+    {
+    start_unaligned:
+      /* Check partial string.  */
+      aligned = (const char *) ((size_t) s & -16L);
+      __m128i value = _mm_load_si128 ((__m128i *) aligned);
+
+      value = __m128i_shift_right (value, offset);
+
+      unsigned int length = _mm_cmpistri (mask, value, 0x2);
+      /* No need to check ZFlag since ZFlag is always 1.  */
+      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
+      if (cflag)
+	RETURN ((char *) (s + length), length);
+      /* Find where the NULL terminator is.  */
+      unsigned int index = _mm_cmpistri (value, value, 0x3a);
+      if (index < 16 - offset)
+	RETURN (NULL, index);
+      aligned += 16;
+    }
+
+start_loop:
+  while (1)
+    {
+      __m128i value = _mm_load_si128 ((__m128i *) aligned);
+      unsigned int index = _mm_cmpistri (mask, value, 0x2);
+      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
+      unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
+      if (cflag)
+	RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
+      if (zflag)
+	RETURN (NULL,
+		/* Find where the NULL terminator is.  */
+		(size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s));
+      aligned += 16;
+    }
+}
diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
index c312fab8b1..423de2e2b2 100644
--- a/sysdeps/x86_64/multiarch/strcspn-c.c
+++ b/sysdeps/x86_64/multiarch/strcspn-c.c
@@ -1,5 +1,5 @@ 
-/* strcspn with SSE4.2 intrinsics
-   Copyright (C) 2009-2022 Free Software Foundation, Inc.
+/* strcspn.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,148 +16,13 @@ 
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <nmmintrin.h>
-#include <string.h>
-#include "varshift.h"
+#if IS_IN (libc)
 
-/* We use 0x2:
-	_SIDD_SBYTE_OPS
-	| _SIDD_CMP_EQUAL_ANY
-	| _SIDD_POSITIVE_POLARITY
-	| _SIDD_LEAST_SIGNIFICANT
-   on pcmpistri to compare xmm/mem128
+# include <sysdep.h>
+# define STRCSPN __strcspn_generic
 
-   0 1 2 3 4 5 6 7 8 9 A B C D E F
-   X X X X X X X X X X X X X X X X
-
-   against xmm
-
-   0 1 2 3 4 5 6 7 8 9 A B C D E F
-   A A A A A A A A A A A A A A A A
-
-   to find out if the first 16byte data element has any byte A and
-   the offset of the first byte.  There are 3 cases:
-
-   1. The first 16byte data element has the byte A at the offset X.
-   2. The first 16byte data element has EOS and doesn't have the byte A.
-   3. The first 16byte data element is valid and doesn't have the byte A.
-
-   Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
-
-    1		 X	  1	 0/1	  0
-    2		16	  0	  1	  0
-    3		16	  0	  0	  0
-
-   We exit from the loop for cases 1 and 2 with jbe which branches
-   when either CFlag or ZFlag is 1.  If CFlag == 1, ECX has the offset
-   X for case 1.  */
-
-#ifndef STRCSPN_SSE2
-# define STRCSPN_SSE2 __strcspn_sse2
-# define STRCSPN_SSE42 __strcspn_sse42
-#endif
-
-#ifdef USE_AS_STRPBRK
-# define RETURN(val1, val2) return val1
-#else
-# define RETURN(val1, val2) return val2
-#endif
-
-extern
-#ifdef USE_AS_STRPBRK
-char *
-#else
-size_t
-#endif
-STRCSPN_SSE2 (const char *, const char *) attribute_hidden;
-
-
-#ifdef USE_AS_STRPBRK
-char *
-#else
-size_t
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(STRCSPN)
 #endif
-__attribute__ ((section (".text.sse4.2")))
-STRCSPN_SSE42 (const char *s, const char *a)
-{
-  if (*a == 0)
-    RETURN (NULL, strlen (s));
-
-  const char *aligned;
-  __m128i mask, maskz, zero;
-  unsigned int maskz_bits;
-  unsigned int offset = (unsigned int) ((size_t) a & 15);
-  zero = _mm_set1_epi8 (0);
-  if (offset != 0)
-    {
-      /* Load masks.  */
-      aligned = (const char *) ((size_t) a & -16L);
-      __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
-      maskz = _mm_cmpeq_epi8 (mask0, zero);
-
-      /* Find where the NULL terminator is.  */
-      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
-      if (maskz_bits != 0)
-        {
-          mask = __m128i_shift_right (mask0, offset);
-          offset = (unsigned int) ((size_t) s & 15);
-          if (offset)
-            goto start_unaligned;
-
-          aligned = s;
-          goto start_loop;
-        }
-    }
-
-  /* A is aligned.  */
-  mask = _mm_loadu_si128 ((__m128i *) a);
-  /* Find where the NULL terminator is.  */
-  maskz = _mm_cmpeq_epi8 (mask, zero);
-  maskz_bits = _mm_movemask_epi8 (maskz);
-  if (maskz_bits == 0)
-    {
-      /* There is no NULL terminator.  Don't use SSE4.2 if the length
-         of A > 16.  */
-      if (a[16] != 0)
-        return STRCSPN_SSE2 (s, a);
-    }
-
-  aligned = s;
-  offset = (unsigned int) ((size_t) s & 15);
-  if (offset != 0)
-    {
-    start_unaligned:
-      /* Check partial string.  */
-      aligned = (const char *) ((size_t) s & -16L);
-      __m128i value = _mm_load_si128 ((__m128i *) aligned);
-
-      value = __m128i_shift_right (value, offset);
-
-      unsigned int length = _mm_cmpistri (mask, value, 0x2);
-      /* No need to check ZFlag since ZFlag is always 1.  */
-      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
-      if (cflag)
-	RETURN ((char *) (s + length), length);
-      /* Find where the NULL terminator is.  */
-      unsigned int index = _mm_cmpistri (value, value, 0x3a);
-      if (index < 16 - offset)
-	RETURN (NULL, index);
-      aligned += 16;
-    }
 
-start_loop:
-  while (1)
-    {
-      __m128i value = _mm_load_si128 ((__m128i *) aligned);
-      unsigned int index = _mm_cmpistri (mask, value, 0x2);
-      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
-      unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
-      if (cflag)
-	RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
-      if (zflag)
-	RETURN (NULL,
-		/* Find where the NULL terminator is.  */
-		(size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s));
-      aligned += 16;
-    }
-}
+#include <string/strcspn.c>
diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.c b/sysdeps/x86_64/multiarch/strcspn-sse2.c
deleted file mode 100644
index 3a04bb39fc..0000000000
--- a/sysdeps/x86_64/multiarch/strcspn-sse2.c
+++ /dev/null
@@ -1,28 +0,0 @@ 
-/* strcspn.
-   Copyright (C) 2017-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-# define STRCSPN __strcspn_sse2
-
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(STRCSPN)
-#endif
-
-#include <string/strcspn.c>
diff --git a/sysdeps/x86_64/multiarch/strncat-c.c b/sysdeps/x86_64/multiarch/strncat-c.c
index 93a7fab7ea..b729c033d9 100644
--- a/sysdeps/x86_64/multiarch/strncat-c.c
+++ b/sysdeps/x86_64/multiarch/strncat-c.c
@@ -1,2 +1,2 @@ 
-#define STRNCAT __strncat_sse2
+#define STRNCAT __strncat_generic
 #include <string/strncat.c>
diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c
index b649343a97..50fba8a41f 100644
--- a/sysdeps/x86_64/multiarch/strncat.c
+++ b/sysdeps/x86_64/multiarch/strncat.c
@@ -24,6 +24,7 @@ 
 # undef strncat
 
 # define SYMBOL_NAME strncat
+# define GENERIC generic
 # include "ifunc-strcpy.h"
 
 libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ());
diff --git a/sysdeps/x86_64/multiarch/strncpy-c.c b/sysdeps/x86_64/multiarch/strncpy-c.c
index 57c45ac7ab..183b0b8e0f 100644
--- a/sysdeps/x86_64/multiarch/strncpy-c.c
+++ b/sysdeps/x86_64/multiarch/strncpy-c.c
@@ -1,4 +1,4 @@ 
-#define STRNCPY __strncpy_sse2
+#define STRNCPY __strncpy_generic
 #undef libc_hidden_builtin_def
 #define libc_hidden_builtin_def(strncpy)
 
diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c
index 2a780a7e16..7fc7d72ec5 100644
--- a/sysdeps/x86_64/multiarch/strncpy.c
+++ b/sysdeps/x86_64/multiarch/strncpy.c
@@ -24,6 +24,7 @@ 
 # undef strncpy
 
 # define SYMBOL_NAME strncpy
+# define GENERIC generic
 # include "ifunc-strcpy.h"
 
 libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ());
diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.c b/sysdeps/x86_64/multiarch/strpbrk-c-sse4.c
similarity index 74%
rename from sysdeps/x86_64/multiarch/strspn-sse2.c
rename to sysdeps/x86_64/multiarch/strpbrk-c-sse4.c
index 61cc6cb0a5..8700276773 100644
--- a/sysdeps/x86_64/multiarch/strspn-sse2.c
+++ b/sysdeps/x86_64/multiarch/strpbrk-c-sse4.c
@@ -1,5 +1,5 @@ 
-/* strspn.
-   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+/* strpbrk with SSE4.2 intrinsics
+   Copyright (C) 2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,13 +16,7 @@ 
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#if IS_IN (libc)
-
-# include <sysdep.h>
-# define STRSPN __strspn_sse2
-
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(STRSPN)
-#endif
-
-#include <string/strspn.c>
+#define USE_AS_STRPBRK
+#define STRCSPN_GENERIC __strpbrk_generic
+#define STRCSPN_SSE42 __strpbrk_sse42
+#include "strcspn-c-sse4.c"
diff --git a/sysdeps/x86_64/multiarch/strpbrk-c.c b/sysdeps/x86_64/multiarch/strpbrk-c.c
index abf4ff7f1a..d31acfe495 100644
--- a/sysdeps/x86_64/multiarch/strpbrk-c.c
+++ b/sysdeps/x86_64/multiarch/strpbrk-c.c
@@ -1,5 +1,5 @@ 
-/* strpbrk with SSE4.2 intrinsics
-   Copyright (C) 2022 Free Software Foundation, Inc.
+/* strpbrk.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,7 +16,13 @@ 
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#define USE_AS_STRPBRK
-#define STRCSPN_SSE2 __strpbrk_sse2
-#define STRCSPN_SSE42 __strpbrk_sse42
-#include "strcspn-c.c"
+#if IS_IN (libc)
+
+# include <sysdep.h>
+# define STRPBRK __strpbrk_generic
+
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(STRPBRK)
+#endif
+
+#include <string/strpbrk.c>
diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.c b/sysdeps/x86_64/multiarch/strpbrk-sse2.c
deleted file mode 100644
index d03214c4fb..0000000000
--- a/sysdeps/x86_64/multiarch/strpbrk-sse2.c
+++ /dev/null
@@ -1,28 +0,0 @@ 
-/* strpbrk.
-   Copyright (C) 2017-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-# define STRPBRK __strpbrk_sse2
-
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(STRPBRK)
-#endif
-
-#include <string/strpbrk.c>
diff --git a/sysdeps/x86_64/multiarch/strspn-c-sse4.c b/sysdeps/x86_64/multiarch/strspn-c-sse4.c
new file mode 100644
index 0000000000..d044916688
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strspn-c-sse4.c
@@ -0,0 +1,136 @@ 
+/* strspn with SSE4.2 intrinsics
+   Copyright (C) 2009-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <nmmintrin.h>
+#include <string.h>
+#include "varshift.h"
+
+/* We use 0x12:
+	_SIDD_SBYTE_OPS
+	| _SIDD_CMP_EQUAL_ANY
+	| _SIDD_NEGATIVE_POLARITY
+	| _SIDD_LEAST_SIGNIFICANT
+   on pcmpistri to compare xmm/mem128
+
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   X X X X X X X X X X X X X X X X
+
+   against xmm
+
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   A A A A A A A A A A A A A A A A
+
+   to find out if the first 16-byte data element has any non-A byte and
+   the offset of the first such byte.  There are 2 cases:
+
+   1. The first 16-byte data element has a non-A byte, including
+      EOS, at the offset X.
+   2. The first 16-byte data element is valid and doesn't have a non-A
+      byte.
+
+   Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
+
+   case		ECX	CFlag	ZFlag	SFlag
+    1		 X	  1	 0/1	  0
+    2		16	  0	  0	  0
+
+   We exit from the loop for case 1.  */
+
+extern size_t __strspn_generic (const char *, const char *) attribute_hidden;
+
+
+size_t
+__attribute__ ((section (".text.sse4.2")))
+__strspn_sse42 (const char *s, const char *a)
+{
+  if (*a == 0)
+    return 0;
+
+  const char *aligned;
+  __m128i mask, maskz, zero;
+  unsigned int maskz_bits;
+  unsigned int offset = (int) ((size_t) a & 15);
+  zero = _mm_set1_epi8 (0);
+  if (offset != 0)
+    {
+      /* Load the aligned 16-byte block that contains the start of A.  */
+      aligned = (const char *) ((size_t) a & -16L);
+      __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
+      maskz = _mm_cmpeq_epi8 (mask0, zero);
+
+      /* Look for a NUL terminator at or after A within that block.  */
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+      if (maskz_bits != 0)
+        {
+          mask = __m128i_shift_right (mask0, offset);
+          offset = (unsigned int) ((size_t) s & 15);
+          if (offset)
+            goto start_unaligned;
+
+          aligned = s;
+          goto start_loop;
+        }
+    }
+
+  /* A is aligned, or has no NUL in the rest of its first aligned block.  */
+  mask = _mm_loadu_si128 ((__m128i *) a);
+
+  /* Find where the NUL terminator is.  */
+  maskz = _mm_cmpeq_epi8 (mask, zero);
+  maskz_bits = _mm_movemask_epi8 (maskz);
+  if (maskz_bits == 0)
+    {
+      /* There is no NUL terminator in the first 16 bytes of A.  Don't
+         use SSE4.2 if the length of A > 16.  */
+      if (a[16] != 0)
+        return __strspn_generic (s, a);
+    }
+  aligned = s;
+  offset = (unsigned int) ((size_t) s & 15);
+
+  if (offset != 0)
+    {
+    start_unaligned:
+      /* Check partial string.  */
+      aligned = (const char *) ((size_t) s & -16L);
+      __m128i value = _mm_load_si128 ((__m128i *) aligned);
+      __m128i adj_value = __m128i_shift_right (value, offset);
+
+      unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
+      /* No need to check CFlag since it is always 1.  */
+      if (length < 16 - offset)
+	return length;
+      /* Find where the NUL terminator is.  */
+      maskz = _mm_cmpeq_epi8 (value, zero);
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+      if (maskz_bits != 0)
+	return length;
+      aligned += 16;
+    }
+
+start_loop:
+  while (1)
+    {
+      __m128i value = _mm_load_si128 ((__m128i *) aligned);
+      unsigned int index = _mm_cmpistri (mask, value, 0x12);
+      unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
+      if (cflag)
+	return (size_t) (aligned + index - s);
+      aligned += 16;
+    }
+}
diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
index 6124033ceb..6b50c36432 100644
--- a/sysdeps/x86_64/multiarch/strspn-c.c
+++ b/sysdeps/x86_64/multiarch/strspn-c.c
@@ -1,5 +1,5 @@ 
-/* strspn with SSE4.2 intrinsics
-   Copyright (C) 2009-2022 Free Software Foundation, Inc.
+/* strspn.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,121 +16,13 @@ 
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <nmmintrin.h>
-#include <string.h>
-#include "varshift.h"
+#if IS_IN (libc)
 
-/* We use 0x12:
-	_SIDD_SBYTE_OPS
-	| _SIDD_CMP_EQUAL_ANY
-	| _SIDD_NEGATIVE_POLARITY
-	| _SIDD_LEAST_SIGNIFICANT
-   on pcmpistri to compare xmm/mem128
+# include <sysdep.h>
+# define STRSPN __strspn_generic
 
-   0 1 2 3 4 5 6 7 8 9 A B C D E F
-   X X X X X X X X X X X X X X X X
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(STRSPN)
+#endif
 
-   against xmm
-
-   0 1 2 3 4 5 6 7 8 9 A B C D E F
-   A A A A A A A A A A A A A A A A
-
-   to find out if the first 16byte data element has any non-A byte and
-   the offset of the first byte.  There are 2 cases:
-
-   1. The first 16byte data element has the non-A byte, including
-      EOS, at the offset X.
-   2. The first 16byte data element is valid and doesn't have the non-A
-      byte.
-
-   Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
-
-   case		ECX	CFlag	ZFlag	SFlag
-    1		 X	  1	 0/1	  0
-    2		16	  0	  0	  0
-
-   We exit from the loop for case 1.  */
-
-extern size_t __strspn_sse2 (const char *, const char *) attribute_hidden;
-
-
-size_t
-__attribute__ ((section (".text.sse4.2")))
-__strspn_sse42 (const char *s, const char *a)
-{
-  if (*a == 0)
-    return 0;
-
-  const char *aligned;
-  __m128i mask, maskz, zero;
-  unsigned int maskz_bits;
-  unsigned int offset = (int) ((size_t) a & 15);
-  zero = _mm_set1_epi8 (0);
-  if (offset != 0)
-    {
-      /* Load masks.  */
-      aligned = (const char *) ((size_t) a & -16L);
-      __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
-      maskz = _mm_cmpeq_epi8 (mask0, zero);
-
-      /* Find where the NULL terminator is.  */
-      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
-      if (maskz_bits != 0)
-        {
-          mask = __m128i_shift_right (mask0, offset);
-          offset = (unsigned int) ((size_t) s & 15);
-          if (offset)
-            goto start_unaligned;
-
-          aligned = s;
-          goto start_loop;
-        }
-    }
-
-  /* A is aligned.  */
-  mask = _mm_loadu_si128 ((__m128i *) a);
-
-  /* Find where the NULL terminator is.  */
-  maskz = _mm_cmpeq_epi8 (mask, zero);
-  maskz_bits = _mm_movemask_epi8 (maskz);
-  if (maskz_bits == 0)
-    {
-      /* There is no NULL terminator.  Don't use SSE4.2 if the length
-         of A > 16.  */
-      if (a[16] != 0)
-        return __strspn_sse2 (s, a);
-    }
-  aligned = s;
-  offset = (unsigned int) ((size_t) s & 15);
-
-  if (offset != 0)
-    {
-    start_unaligned:
-      /* Check partial string.  */
-      aligned = (const char *) ((size_t) s & -16L);
-      __m128i value = _mm_load_si128 ((__m128i *) aligned);
-      __m128i adj_value = __m128i_shift_right (value, offset);
-
-      unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
-      /* No need to check CFlag since it is always 1.  */
-      if (length < 16 - offset)
-	return length;
-      /* Find where the NULL terminator is.  */
-      maskz = _mm_cmpeq_epi8 (value, zero);
-      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
-      if (maskz_bits != 0)
-	return length;
-      aligned += 16;
-    }
-
-start_loop:
-  while (1)
-    {
-      __m128i value = _mm_load_si128 ((__m128i *) aligned);
-      unsigned int index = _mm_cmpistri (mask, value, 0x12);
-      unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
-      if (cflag)
-	return (size_t) (aligned + index - s);
-      aligned += 16;
-    }
-}
+#include <string/strspn.c>
diff --git a/sysdeps/x86_64/multiarch/wcscpy-c.c b/sysdeps/x86_64/multiarch/wcscpy-c.c
index 26d6984e9b..fa38dd898d 100644
--- a/sysdeps/x86_64/multiarch/wcscpy-c.c
+++ b/sysdeps/x86_64/multiarch/wcscpy-c.c
@@ -1,5 +1,5 @@ 
 #if IS_IN (libc)
-# define WCSCPY  __wcscpy_sse2
+# define WCSCPY  __wcscpy_generic
 #endif
 
 #include <wcsmbs/wcscpy.c>
diff --git a/sysdeps/x86_64/multiarch/wcscpy.c b/sysdeps/x86_64/multiarch/wcscpy.c
index 6a2d1421d9..53c3228dc2 100644
--- a/sysdeps/x86_64/multiarch/wcscpy.c
+++ b/sysdeps/x86_64/multiarch/wcscpy.c
@@ -26,7 +26,7 @@ 
 # define SYMBOL_NAME wcscpy
 # include <init-arch.h>
 
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 
 static inline void *
@@ -37,7 +37,7 @@  IFUNC_SELECTOR (void)
   if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
     return OPTIMIZE (ssse3);
 
-  return OPTIMIZE (sse2);
+  return OPTIMIZE (generic);
 }
 
 libc_ifunc_redirected (__redirect_wcscpy, __wcscpy, IFUNC_SELECTOR ());
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-c.c b/sysdeps/x86_64/multiarch/wcsnlen-c.c
index e1ec7cfbb5..1c9c04241a 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen-c.c
+++ b/sysdeps/x86_64/multiarch/wcsnlen-c.c
@@ -1,9 +1,9 @@ 
 #if IS_IN (libc)
 # include <wchar.h>
 
-# define WCSNLEN __wcsnlen_sse2
+# define WCSNLEN __wcsnlen_generic
 
-extern __typeof (wcsnlen) __wcsnlen_sse2;
+extern __typeof (wcsnlen) __wcsnlen_generic;
 #endif
 
 #include "wcsmbs/wcsnlen.c"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
index baa26666a8..05b7a211de 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen.c
+++ b/sysdeps/x86_64/multiarch/wcsnlen.c
@@ -24,6 +24,7 @@ 
 # undef __wcsnlen
 
 # define SYMBOL_NAME wcsnlen
+# define GENERIC generic
 # include "ifunc-wcslen.h"
 
 libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ());