diff mbox series

[v2] x86_64: Add strstr function with 512-bit EVEX

Message ID 20220606191743.3344179-1-raghuveer.devulapalli@intel.com
State New
Headers show
Series [v2] x86_64: Add strstr function with 512-bit EVEX | expand

Commit Message

Raghuveer Devulapalli June 6, 2022, 7:17 p.m. UTC
Adding a 512-bit EVEX version of strstr. The algorithm works as follows:

(1) We spend a few cycles at the begining to peek into the needle. We
locate an edge in the needle (first occurance of 2 consequent distinct
characters) and also store the first 64-bytes into a zmm register.

(2) We search for the edge in the haystack by looking into one cache
line of the haystack at a time. This avoids having to read past a page
boundary which can cause a seg fault.

(3) If an edge is found in the haystack we first compare the first
64-bytes of the needle (already stored in a zmm register) before we
proceed with a full string compare performed byte by byte.

Benchmarking results: (old = strstr_sse2_unaligned, new = strstr_avx512)

Geometric mean of all benchmarks: new / old =  0.66

Difficult skiptable(0) : new / old =  0.02
Difficult skiptable(1) : new / old =  0.01
Difficult 2-way : new / old =  0.25
Difficult testing first 2 : new / old =  1.26
Difficult skiptable(0) : new / old =  0.05
Difficult skiptable(1) : new / old =  0.06
Difficult 2-way : new / old =  0.26
Difficult testing first 2 : new / old =  1.05
Difficult skiptable(0) : new / old =  0.42
Difficult skiptable(1) : new / old =  0.24
Difficult 2-way : new / old =  0.21
Difficult testing first 2 : new / old =  1.04
---
 sysdeps/x86_64/multiarch/Makefile          |   2 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |   6 +
 sysdeps/x86_64/multiarch/strstr-avx512.c   | 214 +++++++++++++++++++++
 sysdeps/x86_64/multiarch/strstr.c          |  24 ++-
 4 files changed, 242 insertions(+), 4 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strstr-avx512.c

Comments

Noah Goldstein June 6, 2022, 8:25 p.m. UTC | #1
On Mon, Jun 6, 2022 at 12:09 PM Raghuveer Devulapalli via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Adding a 512-bit EVEX version of strstr. The algorithm works as follows:
>
> (1) We spend a few cycles at the begining to peek into the needle. We
> locate an edge in the needle (first occurance of 2 consequent distinct
> characters) and also store the first 64-bytes into a zmm register.
>
> (2) We search for the edge in the haystack by looking into one cache
> line of the haystack at a time. This avoids having to read past a page
> boundary which can cause a seg fault.
>
> (3) If an edge is found in the haystack we first compare the first
> 64-bytes of the needle (already stored in a zmm register) before we
> proceed with a full string compare performed byte by byte.
>
> Benchmarking results: (old = strstr_sse2_unaligned, new = strstr_avx512)
>
> Geometric mean of all benchmarks: new / old =  0.66
>
> Difficult skiptable(0) : new / old =  0.02
> Difficult skiptable(1) : new / old =  0.01
> Difficult 2-way : new / old =  0.25
> Difficult testing first 2 : new / old =  1.26
> Difficult skiptable(0) : new / old =  0.05
> Difficult skiptable(1) : new / old =  0.06
> Difficult 2-way : new / old =  0.26
> Difficult testing first 2 : new / old =  1.05
> Difficult skiptable(0) : new / old =  0.42
> Difficult skiptable(1) : new / old =  0.24
> Difficult 2-way : new / old =  0.21
> Difficult testing first 2 : new / old =  1.04
> ---
>  sysdeps/x86_64/multiarch/Makefile          |   2 +
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c |   6 +
>  sysdeps/x86_64/multiarch/strstr-avx512.c   | 214 +++++++++++++++++++++
>  sysdeps/x86_64/multiarch/strstr.c          |  24 ++-
>  4 files changed, 242 insertions(+), 4 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index d0869c3ac3..3d153cac35 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -116,6 +116,7 @@ sysdep_routines += \
>    strrchr-sse2 \
>    strspn-c \
>    strspn-sse2 \
> +  strstr-avx512 \
>    strstr-sse2-unaligned \
>    varshift \
>  # sysdep_routines
> @@ -123,6 +124,7 @@ CFLAGS-varshift.c += -msse4
>  CFLAGS-strcspn-c.c += -msse4
>  CFLAGS-strpbrk-c.c += -msse4
>  CFLAGS-strspn-c.c += -msse4
> +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3

Do we need -O3?

HJ, are there any issues with having this as -O3?
>  endif
>
>  ifeq ($(subdir),wcsmbs)
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index c5cd9466fe..58f3ec8306 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -618,6 +618,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/strstr.c.  */
>    IFUNC_IMPL (i, name, strstr,
> +              IFUNC_IMPL_ADD (array, i, strstr,
> +                              (CPU_FEATURE_USABLE (AVX512VL)
> +                               && CPU_FEATURE_USABLE (AVX512BW)
> +                               && CPU_FEATURE_USABLE (AVX512DQ)
> +                               && CPU_FEATURE_USABLE (BMI2)),
> +                              __strstr_avx512)
>               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
>               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
>
> diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
> new file mode 100644
> index 0000000000..2ab9e96db8
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> @@ -0,0 +1,214 @@
> +/* strstr optimized with 512-bit AVX-512 instructions
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <immintrin.h>
> +#include <inttypes.h>
> +#include <stdbool.h>
> +#include <string.h>
> +
> +#define FULL_MMASK64 0xffffffffffffffff
> +#define ONE_64BIT 0x1ull
> +#define ZMM_SIZE_IN_BYTES 64
> +#define PAGESIZE 4096
> +
> +/*
> + Returns the index of the first edge within the needle, returns 0 if no edge
> + is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> + */
> +static inline size_t
> +find_edge_in_needle (const char *ned)
> +{
> +  size_t ind = 0;
> +  while (ned[ind + 1] != '\0')
> +    {
> +      if (ned[ind] != ned[ind + 1])
> +        return ind;
> +      else
> +        ind = ind + 1;
> +    }
> +  return 0;
> +}
> +
> +/*
> + Compare needle with haystack byte by byte at specified location
> + */
> +static inline bool
> +verify_string_match (const char *hay, const size_t hay_index, const char *ned,
> +                     size_t ind)
> +{
> +  while (ned[ind] != '\0')
> +    {
> +      if (ned[ind] != hay[hay_index + ind])
> +        return false;
> +      ind = ind + 1;
> +    }
> +  return true;
> +}
> +
> +/*
> + Compare needle with haystack at specified location. The first 64 bytes are
> + compared using a ZMM register.
> + */
> +static inline bool
> +verify_string_match_avx512 (const char *hay, const size_t hay_index,
> +                            const char *ned, const __mmask64 ned_mask,
> +                            const __m512i ned_zmm)
> +{
> +  /* check first 64 bytes using zmm and then scalar */
> +  __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so
> +  __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm);
> +  if (match != 0x0) // failed the first few chars
> +    return false;
> +  else if (ned_mask == FULL_MMASK64)
> +    return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES);
> +  return true;
> +}
> +
> +char *
> +__strstr_avx512 (const char *haystack, const char *ned)
> +{
> +  char first = ned[0];
> +  if (first == '\0')
> +    return (char *)haystack;
> +  if (ned[1] == '\0')
> +    return (char *)strchr (haystack, ned[0]);
> +
> +  size_t edge = find_edge_in_needle (ned);
> +
> +  /* ensure haystack is as long as the pos of edge in needle */
> +  for (int ii = 0; ii < edge; ++ii)
> +    {
> +      if (haystack[ii] == '\0')
> +        return NULL;
> +    }
> +
> +  /*
> +   Load 64 bytes of the needle and save it to a zmm register
> +   Read one cache line at a time to avoid loading across a page boundary
> +   */
> +  __mmask64 ned_load_mask = _bzhi_u64 (
> +      FULL_MMASK64, 64 - ((uintptr_t) (ned) & 63));
> +  __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
> +  __mmask64 ned_nullmask
> +      = _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm, ned_zmm);
> +
> +  if (__glibc_unlikely (ned_nullmask == 0x0))
> +    {
> +      ned_zmm = _mm512_loadu_si512 (ned);
> +      ned_nullmask = _mm512_testn_epi8_mask (ned_zmm, ned_zmm);
> +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> +      if (ned_nullmask != 0x0)
> +        ned_load_mask = ned_load_mask >> 1;
> +    }
> +  else
> +    {
> +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> +      ned_load_mask = ned_load_mask >> 1;
> +    }
> +  const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);
> +  const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]);
> +
> +  /*
> +   Read the bytes of haystack in the current cache line
> +   */
> +  size_t hay_index = edge;
> +  __mmask64 loadmask = _bzhi_u64 (
> +      FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
> +  /* First load is a partial cache line */
> +  __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> +  /* Search for NULL and compare only till null char */
> +  uint64_t nullmask
> +      = _cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
> +  uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> +  cmpmask = cmpmask & _cvtmask64_u64 (loadmask);
> +  /* Search for the 2 charaters of needle */
> +  __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> +  __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> +  k1 = _kshiftri_mask64 (k1, 1);
> +  /* k2 masks tell us if both chars from needle match */
> +  uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> +  /* For every match, search for the entire needle for a full match */
> +  while (k2)
> +    {
> +      uint64_t bitcount = _tzcnt_u64 (k2);
> +      k2 = _blsr_u64 (k2);
> +      size_t match_pos = hay_index + bitcount - edge;
> +      if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
> +          < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
> +        {
> +          /*
> +           * Use vector compare as long as you are not crossing a page
> +           */
> +          if (verify_string_match_avx512 (haystack, match_pos, ned,
> +                                          ned_load_mask, ned_zmm))
> +            return (char *)haystack + match_pos;
> +        }
> +      else
> +        {
> +          if (verify_string_match (haystack, match_pos, ned, 0))
> +            return (char *)haystack + match_pos;
> +        }
> +    }
> +  /* We haven't checked for potential match at the last char yet */
> +  haystack = (const char *)(((uintptr_t) (haystack + hay_index) | 63));
> +  hay_index = 0;
> +
> +  /*
> +   Loop over one cache line at a time to prevent reading over page
> +   boundary
> +   */
> +  __m512i hay1;
> +  while (nullmask == 0)
> +    {
> +      hay0 = _mm512_loadu_si512 (haystack + hay_index);
> +      hay1 = _mm512_load_si512 (haystack + hay_index
> +                                + 1); // Always 64 byte aligned
> +      nullmask = _cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
> +      /* Compare only till null char */
> +      cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> +      k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> +      k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> +      /* k2 masks tell us if both chars from needle match */
> +      k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> +      /* For every match, compare full strings for potential match */
> +      while (k2)
> +        {
> +          uint64_t bitcount = _tzcnt_u64 (k2);
> +          k2 = _blsr_u64 (k2);
> +          size_t match_pos = hay_index + bitcount - edge;
> +          if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
> +              < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
> +            {
> +              /*
> +               * Use vector compare as long as you are not crossing a page
> +               */
> +              if (verify_string_match_avx512 (haystack, match_pos, ned,
> +                                              ned_load_mask, ned_zmm))
> +                return (char *)haystack + match_pos;
> +            }
> +          else
> +            {
> +              /* Compare byte by byte */
> +              if (verify_string_match (haystack, match_pos, ned, 0))
> +                return (char *)haystack + match_pos;
> +            }
> +        }
> +      hay_index += ZMM_SIZE_IN_BYTES;
> +    }
> +  return NULL;
> +}
> diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
> index 95600a9de5..2fb8b169b6 100644
> --- a/sysdeps/x86_64/multiarch/strstr.c
> +++ b/sysdeps/x86_64/multiarch/strstr.c
> @@ -35,16 +35,32 @@
>
>  extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
>  extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
> +extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
>
>  #include "init-arch.h"
>
>  /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
>     ifunc symbol properly.  */
>  extern __typeof (__redirect_strstr) __libc_strstr;
> -libc_ifunc (__libc_strstr,
> -           HAS_ARCH_FEATURE (Fast_Unaligned_Load)
> -           ? __strstr_sse2_unaligned
> -           : __strstr_sse2)
>
> +static inline void *
> +IFUNC_SELECTOR (void)
> +{
> +  const struct cpu_features *cpu_features = __get_cpu_features ();
> +
> +  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
> +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> +    return __strstr_avx512;
> +
> +  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> +    return __strstr_sse2_unaligned;
> +
> +  return __strstr_sse2;
> +}
> +
> +libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ());
>  #undef strstr
>  strong_alias (__libc_strstr, strstr)
> --
> 2.36.1
>
Noah Goldstein June 6, 2022, 8:35 p.m. UTC | #2
On Mon, Jun 6, 2022 at 1:25 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Mon, Jun 6, 2022 at 12:09 PM Raghuveer Devulapalli via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > Adding a 512-bit EVEX version of strstr. The algorithm works as follows:
> >
> > (1) We spend a few cycles at the begining to peek into the needle. We
> > locate an edge in the needle (first occurance of 2 consequent distinct
> > characters) and also store the first 64-bytes into a zmm register.
> >
> > (2) We search for the edge in the haystack by looking into one cache
> > line of the haystack at a time. This avoids having to read past a page
> > boundary which can cause a seg fault.
> >
> > (3) If an edge is found in the haystack we first compare the first
> > 64-bytes of the needle (already stored in a zmm register) before we
> > proceed with a full string compare performed byte by byte.
> >
> > Benchmarking results: (old = strstr_sse2_unaligned, new = strstr_avx512)
> >
> > Geometric mean of all benchmarks: new / old =  0.66
> >
> > Difficult skiptable(0) : new / old =  0.02
> > Difficult skiptable(1) : new / old =  0.01
> > Difficult 2-way : new / old =  0.25
> > Difficult testing first 2 : new / old =  1.26
> > Difficult skiptable(0) : new / old =  0.05
> > Difficult skiptable(1) : new / old =  0.06
> > Difficult 2-way : new / old =  0.26
> > Difficult testing first 2 : new / old =  1.05
> > Difficult skiptable(0) : new / old =  0.42
> > Difficult skiptable(1) : new / old =  0.24
> > Difficult 2-way : new / old =  0.21
> > Difficult testing first 2 : new / old =  1.04
> > ---
> >  sysdeps/x86_64/multiarch/Makefile          |   2 +
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |   6 +
> >  sysdeps/x86_64/multiarch/strstr-avx512.c   | 214 +++++++++++++++++++++
> >  sysdeps/x86_64/multiarch/strstr.c          |  24 ++-
> >  4 files changed, 242 insertions(+), 4 deletions(-)
> >  create mode 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index d0869c3ac3..3d153cac35 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -116,6 +116,7 @@ sysdep_routines += \
> >    strrchr-sse2 \
> >    strspn-c \
> >    strspn-sse2 \
> > +  strstr-avx512 \
> >    strstr-sse2-unaligned \
> >    varshift \
> >  # sysdep_routines
> > @@ -123,6 +124,7 @@ CFLAGS-varshift.c += -msse4
> >  CFLAGS-strcspn-c.c += -msse4
> >  CFLAGS-strpbrk-c.c += -msse4
> >  CFLAGS-strspn-c.c += -msse4
> > +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
>
> Do we need -O3?
>
> HJ, are there any issues with having this as -O3?

No issue I've heard.
> >  endif
> >
> >  ifeq ($(subdir),wcsmbs)
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index c5cd9466fe..58f3ec8306 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -618,6 +618,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >
> >    /* Support sysdeps/x86_64/multiarch/strstr.c.  */
> >    IFUNC_IMPL (i, name, strstr,
> > +              IFUNC_IMPL_ADD (array, i, strstr,
> > +                              (CPU_FEATURE_USABLE (AVX512VL)
> > +                               && CPU_FEATURE_USABLE (AVX512BW)
> > +                               && CPU_FEATURE_USABLE (AVX512DQ)
> > +                               && CPU_FEATURE_USABLE (BMI2)),
> > +                              __strstr_avx512)
> >               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
> >               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
> >
> > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > new file mode 100644
> > index 0000000000..2ab9e96db8
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > @@ -0,0 +1,214 @@
> > +/* strstr optimized with 512-bit AVX-512 instructions
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <immintrin.h>
> > +#include <inttypes.h>
> > +#include <stdbool.h>
> > +#include <string.h>
> > +
> > +#define FULL_MMASK64 0xffffffffffffffff
> > +#define ONE_64BIT 0x1ull
> > +#define ZMM_SIZE_IN_BYTES 64
> > +#define PAGESIZE 4096
> > +
> > +/*
> > + Returns the index of the first edge within the needle, returns 0 if no edge
> > + is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> > + */
> > +static inline size_t
> > +find_edge_in_needle (const char *ned)
> > +{
> > +  size_t ind = 0;
> > +  while (ned[ind + 1] != '\0')
> > +    {
> > +      if (ned[ind] != ned[ind + 1])
> > +        return ind;
> > +      else
> > +        ind = ind + 1;
> > +    }
> > +  return 0;
> > +}
> > +
> > +/*
> > + Compare needle with haystack byte by byte at specified location
> > + */
> > +static inline bool
> > +verify_string_match (const char *hay, const size_t hay_index, const char *ned,
> > +                     size_t ind)
> > +{
> > +  while (ned[ind] != '\0')
> > +    {
> > +      if (ned[ind] != hay[hay_index + ind])
> > +        return false;
> > +      ind = ind + 1;
> > +    }
> > +  return true;
> > +}
> > +
> > +/*
> > + Compare needle with haystack at specified location. The first 64 bytes are
> > + compared using a ZMM register.
> > + */
> > +static inline bool
> > +verify_string_match_avx512 (const char *hay, const size_t hay_index,
> > +                            const char *ned, const __mmask64 ned_mask,
> > +                            const __m512i ned_zmm)
> > +{
> > +  /* check first 64 bytes using zmm and then scalar */
> > +  __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so
> > +  __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm);
> > +  if (match != 0x0) // failed the first few chars
> > +    return false;
> > +  else if (ned_mask == FULL_MMASK64)
> > +    return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES);
> > +  return true;
> > +}
> > +
> > +char *
> > +__strstr_avx512 (const char *haystack, const char *ned)
> > +{
> > +  char first = ned[0];
> > +  if (first == '\0')
> > +    return (char *)haystack;
> > +  if (ned[1] == '\0')
> > +    return (char *)strchr (haystack, ned[0]);
> > +
> > +  size_t edge = find_edge_in_needle (ned);
> > +
> > +  /* ensure haystack is as long as the pos of edge in needle */
> > +  for (int ii = 0; ii < edge; ++ii)
> > +    {
> > +      if (haystack[ii] == '\0')
> > +        return NULL;
> > +    }
> > +
> > +  /*
> > +   Load 64 bytes of the needle and save it to a zmm register
> > +   Read one cache line at a time to avoid loading across a page boundary
> > +   */
> > +  __mmask64 ned_load_mask = _bzhi_u64 (
> > +      FULL_MMASK64, 64 - ((uintptr_t) (ned) & 63));
> > +  __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
> > +  __mmask64 ned_nullmask
> > +      = _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm, ned_zmm);
> > +
> > +  if (__glibc_unlikely (ned_nullmask == 0x0))
> > +    {
> > +      ned_zmm = _mm512_loadu_si512 (ned);
> > +      ned_nullmask = _mm512_testn_epi8_mask (ned_zmm, ned_zmm);
> > +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > +      if (ned_nullmask != 0x0)
> > +        ned_load_mask = ned_load_mask >> 1;
> > +    }
> > +  else
> > +    {
> > +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > +      ned_load_mask = ned_load_mask >> 1;
> > +    }
> > +  const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);
> > +  const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]);
> > +
> > +  /*
> > +   Read the bytes of haystack in the current cache line
> > +   */
> > +  size_t hay_index = edge;
> > +  __mmask64 loadmask = _bzhi_u64 (
> > +      FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
> > +  /* First load is a partial cache line */
> > +  __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> > +  /* Search for NULL and compare only till null char */
> > +  uint64_t nullmask
> > +      = _cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
> > +  uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > +  cmpmask = cmpmask & _cvtmask64_u64 (loadmask);
> > +  /* Search for the 2 charaters of needle */
> > +  __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > +  __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> > +  k1 = _kshiftri_mask64 (k1, 1);
> > +  /* k2 masks tell us if both chars from needle match */
> > +  uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> > +  /* For every match, search for the entire needle for a full match */
> > +  while (k2)
> > +    {
> > +      uint64_t bitcount = _tzcnt_u64 (k2);
> > +      k2 = _blsr_u64 (k2);
> > +      size_t match_pos = hay_index + bitcount - edge;
> > +      if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
> > +          < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
> > +        {
> > +          /*
> > +           * Use vector compare as long as you are not crossing a page
> > +           */
> > +          if (verify_string_match_avx512 (haystack, match_pos, ned,
> > +                                          ned_load_mask, ned_zmm))
> > +            return (char *)haystack + match_pos;
> > +        }
> > +      else
> > +        {
> > +          if (verify_string_match (haystack, match_pos, ned, 0))
> > +            return (char *)haystack + match_pos;
> > +        }
> > +    }
> > +  /* We haven't checked for potential match at the last char yet */
> > +  haystack = (const char *)(((uintptr_t) (haystack + hay_index) | 63));
> > +  hay_index = 0;
> > +
> > +  /*
> > +   Loop over one cache line at a time to prevent reading over page
> > +   boundary
> > +   */
> > +  __m512i hay1;
> > +  while (nullmask == 0)
> > +    {
> > +      hay0 = _mm512_loadu_si512 (haystack + hay_index);
> > +      hay1 = _mm512_load_si512 (haystack + hay_index
> > +                                + 1); // Always 64 byte aligned
> > +      nullmask = _cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
> > +      /* Compare only till null char */
> > +      cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > +      k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > +      k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> > +      /* k2 masks tell us if both chars from needle match */
> > +      k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> > +      /* For every match, compare full strings for potential match */
> > +      while (k2)
> > +        {
> > +          uint64_t bitcount = _tzcnt_u64 (k2);
> > +          k2 = _blsr_u64 (k2);
> > +          size_t match_pos = hay_index + bitcount - edge;
> > +          if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
> > +              < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
> > +            {
> > +              /*
> > +               * Use vector compare as long as you are not crossing a page
> > +               */
> > +              if (verify_string_match_avx512 (haystack, match_pos, ned,
> > +                                              ned_load_mask, ned_zmm))
> > +                return (char *)haystack + match_pos;
> > +            }
> > +          else
> > +            {
> > +              /* Compare byte by byte */
> > +              if (verify_string_match (haystack, match_pos, ned, 0))
> > +                return (char *)haystack + match_pos;
> > +            }
> > +        }
> > +      hay_index += ZMM_SIZE_IN_BYTES;
> > +    }
> > +  return NULL;
> > +}
> > diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
> > index 95600a9de5..2fb8b169b6 100644
> > --- a/sysdeps/x86_64/multiarch/strstr.c
> > +++ b/sysdeps/x86_64/multiarch/strstr.c
> > @@ -35,16 +35,32 @@
> >
> >  extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
> >  extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
> > +extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
> >
> >  #include "init-arch.h"
> >
> >  /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> >     ifunc symbol properly.  */
> >  extern __typeof (__redirect_strstr) __libc_strstr;
> > -libc_ifunc (__libc_strstr,
> > -           HAS_ARCH_FEATURE (Fast_Unaligned_Load)
> > -           ? __strstr_sse2_unaligned
> > -           : __strstr_sse2)
> >
> > +static inline void *
> > +IFUNC_SELECTOR (void)
> > +{
> > +  const struct cpu_features *cpu_features = __get_cpu_features ();
> > +
> > +  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > +    return __strstr_avx512;
> > +
> > +  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> > +    return __strstr_sse2_unaligned;
> > +
> > +  return __strstr_sse2;
> > +}
> > +
> > +libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ());
> >  #undef strstr
> >  strong_alias (__libc_strstr, strstr)
> > --
> > 2.36.1
> >

LGTM.
H.J. Lu June 6, 2022, 9:32 p.m. UTC | #3
On Mon, Jun 6, 2022 at 12:09 PM Raghuveer Devulapalli via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Adding a 512-bit EVEX version of strstr. The algorithm works as follows:
>
> (1) We spend a few cycles at the begining to peek into the needle. We
> locate an edge in the needle (first occurance of 2 consequent distinct
> characters) and also store the first 64-bytes into a zmm register.
>
> (2) We search for the edge in the haystack by looking into one cache
> line of the haystack at a time. This avoids having to read past a page
> boundary which can cause a seg fault.
>
> (3) If an edge is found in the haystack we first compare the first
> 64-bytes of the needle (already stored in a zmm register) before we
> proceed with a full string compare performed byte by byte.
>
> Benchmarking results: (old = strstr_sse2_unaligned, new = strstr_avx512)
>
> Geometric mean of all benchmarks: new / old =  0.66
>
> Difficult skiptable(0) : new / old =  0.02
> Difficult skiptable(1) : new / old =  0.01
> Difficult 2-way : new / old =  0.25
> Difficult testing first 2 : new / old =  1.26
> Difficult skiptable(0) : new / old =  0.05
> Difficult skiptable(1) : new / old =  0.06
> Difficult 2-way : new / old =  0.26
> Difficult testing first 2 : new / old =  1.05
> Difficult skiptable(0) : new / old =  0.42
> Difficult skiptable(1) : new / old =  0.24
> Difficult 2-way : new / old =  0.21
> Difficult testing first 2 : new / old =  1.04
> ---
>  sysdeps/x86_64/multiarch/Makefile          |   2 +
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c |   6 +
>  sysdeps/x86_64/multiarch/strstr-avx512.c   | 214 +++++++++++++++++++++
>  sysdeps/x86_64/multiarch/strstr.c          |  24 ++-
>  4 files changed, 242 insertions(+), 4 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index d0869c3ac3..3d153cac35 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -116,6 +116,7 @@ sysdep_routines += \
>    strrchr-sse2 \
>    strspn-c \
>    strspn-sse2 \
> +  strstr-avx512 \
>    strstr-sse2-unaligned \
>    varshift \
>  # sysdep_routines
> @@ -123,6 +124,7 @@ CFLAGS-varshift.c += -msse4
>  CFLAGS-strcspn-c.c += -msse4
>  CFLAGS-strpbrk-c.c += -msse4
>  CFLAGS-strspn-c.c += -msse4
> +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
>  endif
>
>  ifeq ($(subdir),wcsmbs)
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index c5cd9466fe..58f3ec8306 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -618,6 +618,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/strstr.c.  */
>    IFUNC_IMPL (i, name, strstr,
> +              IFUNC_IMPL_ADD (array, i, strstr,
> +                              (CPU_FEATURE_USABLE (AVX512VL)
> +                               && CPU_FEATURE_USABLE (AVX512BW)
> +                               && CPU_FEATURE_USABLE (AVX512DQ)
> +                               && CPU_FEATURE_USABLE (BMI2)),
> +                              __strstr_avx512)
>               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
>               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
>
> diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
> new file mode 100644
> index 0000000000..2ab9e96db8
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> @@ -0,0 +1,214 @@
> +/* strstr optimized with 512-bit AVX-512 instructions
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <immintrin.h>
> +#include <inttypes.h>
> +#include <stdbool.h>
> +#include <string.h>
> +
> +#define FULL_MMASK64 0xffffffffffffffff
> +#define ONE_64BIT 0x1ull
> +#define ZMM_SIZE_IN_BYTES 64
> +#define PAGESIZE 4096
> +
> +/*
> + Returns the index of the first edge within the needle, returns 0 if no edge
> + is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> + */
> +static inline size_t
> +find_edge_in_needle (const char *ned)
> +{
> +  size_t ind = 0;
> +  while (ned[ind + 1] != '\0')
> +    {
> +      if (ned[ind] != ned[ind + 1])
> +        return ind;
> +      else
> +        ind = ind + 1;
> +    }
> +  return 0;
> +}
> +
> +/*
> + Compare needle with haystack byte by byte at specified location
> + */
> +static inline bool
> +verify_string_match (const char *hay, const size_t hay_index, const char *ned,
> +                     size_t ind)
> +{
> +  while (ned[ind] != '\0')
> +    {
> +      if (ned[ind] != hay[hay_index + ind])
> +        return false;
> +      ind = ind + 1;
> +    }
> +  return true;
> +}
> +
> +/*
> + Compare needle with haystack at specified location. The first 64 bytes are
> + compared using a ZMM register.
> + */
> +static inline bool
> +verify_string_match_avx512 (const char *hay, const size_t hay_index,
> +                            const char *ned, const __mmask64 ned_mask,
> +                            const __m512i ned_zmm)
> +{
> +  /* check first 64 bytes using zmm and then scalar */
> +  __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so
> +  __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm);
> +  if (match != 0x0) // failed the first few chars
> +    return false;
> +  else if (ned_mask == FULL_MMASK64)
> +    return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES);
> +  return true;
> +}
> +
> +char *
> +__strstr_avx512 (const char *haystack, const char *ned)
> +{
> +  char first = ned[0];
> +  if (first == '\0')
> +    return (char *)haystack;
> +  if (ned[1] == '\0')
> +    return (char *)strchr (haystack, ned[0]);
> +
> +  size_t edge = find_edge_in_needle (ned);
> +
> +  /* ensure haystack is as long as the pos of edge in needle */
> +  for (int ii = 0; ii < edge; ++ii)
> +    {
> +      if (haystack[ii] == '\0')
> +        return NULL;
> +    }
> +
> +  /*
> +   Load 64 bytes of the needle and save it to a zmm register
> +   Read one cache line at a time to avoid loading across a page boundary
> +   */
> +  __mmask64 ned_load_mask = _bzhi_u64 (
> +      FULL_MMASK64, 64 - ((uintptr_t) (ned) & 63));
> +  __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
> +  __mmask64 ned_nullmask
> +      = _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm, ned_zmm);
> +
> +  if (__glibc_unlikely (ned_nullmask == 0x0))
> +    {
> +      ned_zmm = _mm512_loadu_si512 (ned);
> +      ned_nullmask = _mm512_testn_epi8_mask (ned_zmm, ned_zmm);
> +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> +      if (ned_nullmask != 0x0)
> +        ned_load_mask = ned_load_mask >> 1;
> +    }
> +  else
> +    {
> +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> +      ned_load_mask = ned_load_mask >> 1;
> +    }
> +  const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);
> +  const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]);
> +
> +  /*
> +   Read the bytes of haystack in the current cache line
> +   */
> +  size_t hay_index = edge;
> +  __mmask64 loadmask = _bzhi_u64 (
> +      FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
> +  /* First load is a partial cache line */
> +  __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> +  /* Search for NULL and compare only till null char */
> +  uint64_t nullmask
> +      = _cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
> +  uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> +  cmpmask = cmpmask & _cvtmask64_u64 (loadmask);
> +  /* Search for the 2 charaters of needle */
> +  __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> +  __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> +  k1 = _kshiftri_mask64 (k1, 1);
> +  /* k2 masks tell us if both chars from needle match */
> +  uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> +  /* For every match, search for the entire needle for a full match */
> +  while (k2)
> +    {
> +      uint64_t bitcount = _tzcnt_u64 (k2);
> +      k2 = _blsr_u64 (k2);
> +      size_t match_pos = hay_index + bitcount - edge;
> +      if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
> +          < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
> +        {
> +          /*
> +           * Use vector compare as long as you are not crossing a page
> +           */
> +          if (verify_string_match_avx512 (haystack, match_pos, ned,
> +                                          ned_load_mask, ned_zmm))
> +            return (char *)haystack + match_pos;
> +        }
> +      else
> +        {
> +          if (verify_string_match (haystack, match_pos, ned, 0))
> +            return (char *)haystack + match_pos;
> +        }
> +    }
> +  /* We haven't checked for potential match at the last char yet */
> +  haystack = (const char *)(((uintptr_t) (haystack + hay_index) | 63));
> +  hay_index = 0;
> +
> +  /*
> +   Loop over one cache line at a time to prevent reading over page
> +   boundary
> +   */
> +  __m512i hay1;
> +  while (nullmask == 0)
> +    {
> +      hay0 = _mm512_loadu_si512 (haystack + hay_index);
> +      hay1 = _mm512_load_si512 (haystack + hay_index
> +                                + 1); // Always 64 byte aligned
> +      nullmask = _cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
> +      /* Compare only till null char */
> +      cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> +      k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> +      k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> +      /* k2 masks tell us if both chars from needle match */
> +      k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> +      /* For every match, compare full strings for potential match */
> +      while (k2)
> +        {
> +          uint64_t bitcount = _tzcnt_u64 (k2);
> +          k2 = _blsr_u64 (k2);
> +          size_t match_pos = hay_index + bitcount - edge;
> +          if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
> +              < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
> +            {
> +              /*
> +               * Use vector compare as long as you are not crossing a page
> +               */
> +              if (verify_string_match_avx512 (haystack, match_pos, ned,
> +                                              ned_load_mask, ned_zmm))
> +                return (char *)haystack + match_pos;
> +            }
> +          else
> +            {
> +              /* Compare byte by byte */
> +              if (verify_string_match (haystack, match_pos, ned, 0))
> +                return (char *)haystack + match_pos;
> +            }
> +        }
> +      hay_index += ZMM_SIZE_IN_BYTES;
> +    }
> +  return NULL;
> +}
> diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
> index 95600a9de5..2fb8b169b6 100644
> --- a/sysdeps/x86_64/multiarch/strstr.c
> +++ b/sysdeps/x86_64/multiarch/strstr.c
> @@ -35,16 +35,32 @@
>
>  extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
>  extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
> +extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
>
>  #include "init-arch.h"
>
>  /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
>     ifunc symbol properly.  */
>  extern __typeof (__redirect_strstr) __libc_strstr;
> -libc_ifunc (__libc_strstr,
> -           HAS_ARCH_FEATURE (Fast_Unaligned_Load)
> -           ? __strstr_sse2_unaligned
> -           : __strstr_sse2)
>
> +static inline void *
> +IFUNC_SELECTOR (void)
> +{
> +  const struct cpu_features *cpu_features = __get_cpu_features ();
> +
> +  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
> +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> +    return __strstr_avx512;
> +
> +  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> +    return __strstr_sse2_unaligned;
> +
> +  return __strstr_sse2;
> +}
> +
> +libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ());
>  #undef strstr
>  strong_alias (__libc_strstr, strstr)
> --
> 2.36.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Do you need me to commit it for you?

Thanks.
develop--- via Libc-alpha June 6, 2022, 9:39 p.m. UTC | #4
> -----Original Message-----
> From: H.J. Lu <hjl.tools@gmail.com>
> Sent: Monday, June 6, 2022 2:32 PM
> To: Devulapalli, Raghuveer <raghuveer.devulapalli@intel.com>
> Cc: GNU C Library <libc-alpha@sourceware.org>
> Subject: Re: [PATCH v2] x86_64: Add strstr function with 512-bit EVEX
> 
> On Mon, Jun 6, 2022 at 12:09 PM Raghuveer Devulapalli via Libc-alpha <libc-
> alpha@sourceware.org> wrote:
> >
> > Adding a 512-bit EVEX version of strstr. The algorithm works as follows:
> >
> > (1) We spend a few cycles at the begining to peek into the needle. We
> > locate an edge in the needle (first occurance of 2 consequent distinct
> > characters) and also store the first 64-bytes into a zmm register.
> >
> > (2) We search for the edge in the haystack by looking into one cache
> > line of the haystack at a time. This avoids having to read past a page
> > boundary which can cause a seg fault.
> >
> > (3) If an edge is found in the haystack we first compare the first
> > 64-bytes of the needle (already stored in a zmm register) before we
> > proceed with a full string compare performed byte by byte.
> >
> > Benchmarking results: (old = strstr_sse2_unaligned, new =
> > strstr_avx512)
> >
> > Geometric mean of all benchmarks: new / old =  0.66
> >
> > Difficult skiptable(0) : new / old =  0.02 Difficult skiptable(1) :
> > new / old =  0.01 Difficult 2-way : new / old =  0.25 Difficult
> > testing first 2 : new / old =  1.26 Difficult skiptable(0) : new / old
> > =  0.05 Difficult skiptable(1) : new / old =  0.06 Difficult 2-way :
> > new / old =  0.26 Difficult testing first 2 : new / old =  1.05
> > Difficult skiptable(0) : new / old =  0.42 Difficult skiptable(1) :
> > new / old =  0.24 Difficult 2-way : new / old =  0.21 Difficult
> > testing first 2 : new / old =  1.04
> > ---
> >  sysdeps/x86_64/multiarch/Makefile          |   2 +
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |   6 +
> >  sysdeps/x86_64/multiarch/strstr-avx512.c   | 214
> +++++++++++++++++++++
> >  sysdeps/x86_64/multiarch/strstr.c          |  24 ++-
> >  4 files changed, 242 insertions(+), 4 deletions(-)  create mode
> > 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile
> > b/sysdeps/x86_64/multiarch/Makefile
> > index d0869c3ac3..3d153cac35 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -116,6 +116,7 @@ sysdep_routines += \
> >    strrchr-sse2 \
> >    strspn-c \
> >    strspn-sse2 \
> > +  strstr-avx512 \
> >    strstr-sse2-unaligned \
> >    varshift \
> >  # sysdep_routines
> > @@ -123,6 +124,7 @@ CFLAGS-varshift.c += -msse4  CFLAGS-strcspn-c.c
> +=
> > -msse4  CFLAGS-strpbrk-c.c += -msse4  CFLAGS-strspn-c.c += -msse4
> > +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -
> mavx512bw
> > +-mbmi -mbmi2 -O3
> >  endif
> >
> >  ifeq ($(subdir),wcsmbs)
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index c5cd9466fe..58f3ec8306 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -618,6 +618,12 @@ __libc_ifunc_impl_list (const char *name, struct
> > libc_ifunc_impl *array,
> >
> >    /* Support sysdeps/x86_64/multiarch/strstr.c.  */
> >    IFUNC_IMPL (i, name, strstr,
> > +              IFUNC_IMPL_ADD (array, i, strstr,
> > +                              (CPU_FEATURE_USABLE (AVX512VL)
> > +                               && CPU_FEATURE_USABLE (AVX512BW)
> > +                               && CPU_FEATURE_USABLE (AVX512DQ)
> > +                               && CPU_FEATURE_USABLE (BMI2)),
> > +                              __strstr_avx512)
> >               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
> >               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
> >
> > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c
> > b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > new file mode 100644
> > index 0000000000..2ab9e96db8
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > @@ -0,0 +1,214 @@
> > +/* strstr optimized with 512-bit AVX-512 instructions
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <immintrin.h>
> > +#include <inttypes.h>
> > +#include <stdbool.h>
> > +#include <string.h>
> > +
> > +#define FULL_MMASK64 0xffffffffffffffff #define ONE_64BIT 0x1ull
> > +#define ZMM_SIZE_IN_BYTES 64 #define PAGESIZE 4096
> > +
> > +/*
> > + Returns the index of the first edge within the needle, returns 0 if
> > +no edge  is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> > + */
> > +static inline size_t
> > +find_edge_in_needle (const char *ned) {
> > +  size_t ind = 0;
> > +  while (ned[ind + 1] != '\0')
> > +    {
> > +      if (ned[ind] != ned[ind + 1])
> > +        return ind;
> > +      else
> > +        ind = ind + 1;
> > +    }
> > +  return 0;
> > +}
> > +
> > +/*
> > + Compare needle with haystack byte by byte at specified location  */
> > +static inline bool verify_string_match (const char *hay, const size_t
> > +hay_index, const char *ned,
> > +                     size_t ind)
> > +{
> > +  while (ned[ind] != '\0')
> > +    {
> > +      if (ned[ind] != hay[hay_index + ind])
> > +        return false;
> > +      ind = ind + 1;
> > +    }
> > +  return true;
> > +}
> > +
> > +/*
> > + Compare needle with haystack at specified location. The first 64
> > +bytes are  compared using a ZMM register.
> > + */
> > +static inline bool
> > +verify_string_match_avx512 (const char *hay, const size_t hay_index,
> > +                            const char *ned, const __mmask64 ned_mask,
> > +                            const __m512i ned_zmm) {
> > +  /* check first 64 bytes using zmm and then scalar */
> > +  __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe
> to
> > +do so
> > +  __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask,
> hay_zmm,
> > +ned_zmm);
> > +  if (match != 0x0) // failed the first few chars
> > +    return false;
> > +  else if (ned_mask == FULL_MMASK64)
> > +    return verify_string_match (hay, hay_index, ned,
> > +ZMM_SIZE_IN_BYTES);
> > +  return true;
> > +}
> > +
> > +char *
> > +__strstr_avx512 (const char *haystack, const char *ned) {
> > +  char first = ned[0];
> > +  if (first == '\0')
> > +    return (char *)haystack;
> > +  if (ned[1] == '\0')
> > +    return (char *)strchr (haystack, ned[0]);
> > +
> > +  size_t edge = find_edge_in_needle (ned);
> > +
> > +  /* ensure haystack is as long as the pos of edge in needle */  for
> > + (int ii = 0; ii < edge; ++ii)
> > +    {
> > +      if (haystack[ii] == '\0')
> > +        return NULL;
> > +    }
> > +
> > +  /*
> > +   Load 64 bytes of the needle and save it to a zmm register
> > +   Read one cache line at a time to avoid loading across a page boundary
> > +   */
> > +  __mmask64 ned_load_mask = _bzhi_u64 (
> > +      FULL_MMASK64, 64 - ((uintptr_t) (ned) & 63));  __m512i ned_zmm
> > + = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
> > +  __mmask64 ned_nullmask
> > +      = _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm,
> > + ned_zmm);
> > +
> > +  if (__glibc_unlikely (ned_nullmask == 0x0))
> > +    {
> > +      ned_zmm = _mm512_loadu_si512 (ned);
> > +      ned_nullmask = _mm512_testn_epi8_mask (ned_zmm, ned_zmm);
> > +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > +      if (ned_nullmask != 0x0)
> > +        ned_load_mask = ned_load_mask >> 1;
> > +    }
> > +  else
> > +    {
> > +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > +      ned_load_mask = ned_load_mask >> 1;
> > +    }
> > +  const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);  const __m512i
> > + ned1 = _mm512_set1_epi8 (ned[edge + 1]);
> > +
> > +  /*
> > +   Read the bytes of haystack in the current cache line
> > +   */
> > +  size_t hay_index = edge;
> > +  __mmask64 loadmask = _bzhi_u64 (
> > +      FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
> > +  /* First load is a partial cache line */  __m512i hay0 =
> > + _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> > +  /* Search for NULL and compare only till null char */  uint64_t
> > + nullmask
> > +      = _cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask,
> hay0,
> > + hay0));  uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > + cmpmask = cmpmask & _cvtmask64_u64 (loadmask);
> > +  /* Search for the 2 charaters of needle */
> > +  __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > +  __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> > +  k1 = _kshiftri_mask64 (k1, 1);
> > +  /* k2 masks tell us if both chars from needle match */  uint64_t k2
> > + = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> > +  /* For every match, search for the entire needle for a full match
> > + */  while (k2)
> > +    {
> > +      uint64_t bitcount = _tzcnt_u64 (k2);
> > +      k2 = _blsr_u64 (k2);
> > +      size_t match_pos = hay_index + bitcount - edge;
> > +      if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
> > +          < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
> > +        {
> > +          /*
> > +           * Use vector compare as long as you are not crossing a page
> > +           */
> > +          if (verify_string_match_avx512 (haystack, match_pos, ned,
> > +                                          ned_load_mask, ned_zmm))
> > +            return (char *)haystack + match_pos;
> > +        }
> > +      else
> > +        {
> > +          if (verify_string_match (haystack, match_pos, ned, 0))
> > +            return (char *)haystack + match_pos;
> > +        }
> > +    }
> > +  /* We haven't checked for potential match at the last char yet */
> > + haystack = (const char *)(((uintptr_t) (haystack + hay_index) |
> > + 63));  hay_index = 0;
> > +
> > +  /*
> > +   Loop over one cache line at a time to prevent reading over page
> > +   boundary
> > +   */
> > +  __m512i hay1;
> > +  while (nullmask == 0)
> > +    {
> > +      hay0 = _mm512_loadu_si512 (haystack + hay_index);
> > +      hay1 = _mm512_load_si512 (haystack + hay_index
> > +                                + 1); // Always 64 byte aligned
> > +      nullmask = _cvtmask64_u64 (_mm512_testn_epi8_mask (hay1,
> hay1));
> > +      /* Compare only till null char */
> > +      cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > +      k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > +      k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> > +      /* k2 masks tell us if both chars from needle match */
> > +      k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> > +      /* For every match, compare full strings for potential match */
> > +      while (k2)
> > +        {
> > +          uint64_t bitcount = _tzcnt_u64 (k2);
> > +          k2 = _blsr_u64 (k2);
> > +          size_t match_pos = hay_index + bitcount - edge;
> > +          if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
> > +              < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
> > +            {
> > +              /*
> > +               * Use vector compare as long as you are not crossing a page
> > +               */
> > +              if (verify_string_match_avx512 (haystack, match_pos, ned,
> > +                                              ned_load_mask, ned_zmm))
> > +                return (char *)haystack + match_pos;
> > +            }
> > +          else
> > +            {
> > +              /* Compare byte by byte */
> > +              if (verify_string_match (haystack, match_pos, ned, 0))
> > +                return (char *)haystack + match_pos;
> > +            }
> > +        }
> > +      hay_index += ZMM_SIZE_IN_BYTES;
> > +    }
> > +  return NULL;
> > +}
> > diff --git a/sysdeps/x86_64/multiarch/strstr.c
> > b/sysdeps/x86_64/multiarch/strstr.c
> > index 95600a9de5..2fb8b169b6 100644
> > --- a/sysdeps/x86_64/multiarch/strstr.c
> > +++ b/sysdeps/x86_64/multiarch/strstr.c
> > @@ -35,16 +35,32 @@
> >
> >  extern __typeof (__redirect_strstr) __strstr_sse2_unaligned
> > attribute_hidden;  extern __typeof (__redirect_strstr) __strstr_sse2
> > attribute_hidden;
> > +extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
> >
> >  #include "init-arch.h"
> >
> >  /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> >     ifunc symbol properly.  */
> >  extern __typeof (__redirect_strstr) __libc_strstr; -libc_ifunc
> > (__libc_strstr,
> > -           HAS_ARCH_FEATURE (Fast_Unaligned_Load)
> > -           ? __strstr_sse2_unaligned
> > -           : __strstr_sse2)
> >
> > +static inline void *
> > +IFUNC_SELECTOR (void)
> > +{
> > +  const struct cpu_features *cpu_features = __get_cpu_features ();
> > +
> > +  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > +    return __strstr_avx512;
> > +
> > +  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> > +    return __strstr_sse2_unaligned;
> > +
> > +  return __strstr_sse2;
> > +}
> > +
> > +libc_ifunc_redirected (__redirect_strstr, __libc_strstr,
> > +IFUNC_SELECTOR ());
> >  #undef strstr
> >  strong_alias (__libc_strstr, strstr)
> > --
> > 2.36.1
> >
> 
> LGTM.
> 
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> 
> Do you need me to commit it for you?

Yes, Please. Thanks! 

> 
> Thanks.
> 
> --
> H.J.
Sunil Pandey July 14, 2022, 2:04 a.m. UTC | #5
On Mon, Jun 6, 2022 at 2:41 PM Devulapalli, Raghuveer via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
>
>
> > -----Original Message-----
> > From: H.J. Lu <hjl.tools@gmail.com>
> > Sent: Monday, June 6, 2022 2:32 PM
> > To: Devulapalli, Raghuveer <raghuveer.devulapalli@intel.com>
> > Cc: GNU C Library <libc-alpha@sourceware.org>
> > Subject: Re: [PATCH v2] x86_64: Add strstr function with 512-bit EVEX
> >
> > On Mon, Jun 6, 2022 at 12:09 PM Raghuveer Devulapalli via Libc-alpha <libc-
> > alpha@sourceware.org> wrote:
> > >
> > > Adding a 512-bit EVEX version of strstr. The algorithm works as follows:
> > >
> > > (1) We spend a few cycles at the begining to peek into the needle. We
> > > locate an edge in the needle (first occurance of 2 consequent distinct
> > > characters) and also store the first 64-bytes into a zmm register.
> > >
> > > (2) We search for the edge in the haystack by looking into one cache
> > > line of the haystack at a time. This avoids having to read past a page
> > > boundary which can cause a seg fault.
> > >
> > > (3) If an edge is found in the haystack we first compare the first
> > > 64-bytes of the needle (already stored in a zmm register) before we
> > > proceed with a full string compare performed byte by byte.
> > >
> > > Benchmarking results: (old = strstr_sse2_unaligned, new =
> > > strstr_avx512)
> > >
> > > Geometric mean of all benchmarks: new / old =  0.66
> > >
> > > Difficult skiptable(0) : new / old =  0.02 Difficult skiptable(1) :
> > > new / old =  0.01 Difficult 2-way : new / old =  0.25 Difficult
> > > testing first 2 : new / old =  1.26 Difficult skiptable(0) : new / old
> > > =  0.05 Difficult skiptable(1) : new / old =  0.06 Difficult 2-way :
> > > new / old =  0.26 Difficult testing first 2 : new / old =  1.05
> > > Difficult skiptable(0) : new / old =  0.42 Difficult skiptable(1) :
> > > new / old =  0.24 Difficult 2-way : new / old =  0.21 Difficult
> > > testing first 2 : new / old =  1.04
> > > ---
> > >  sysdeps/x86_64/multiarch/Makefile          |   2 +
> > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |   6 +
> > >  sysdeps/x86_64/multiarch/strstr-avx512.c   | 214
> > +++++++++++++++++++++
> > >  sysdeps/x86_64/multiarch/strstr.c          |  24 ++-
> > >  4 files changed, 242 insertions(+), 4 deletions(-)  create mode
> > > 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/Makefile
> > > b/sysdeps/x86_64/multiarch/Makefile
> > > index d0869c3ac3..3d153cac35 100644
> > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > @@ -116,6 +116,7 @@ sysdep_routines += \
> > >    strrchr-sse2 \
> > >    strspn-c \
> > >    strspn-sse2 \
> > > +  strstr-avx512 \
> > >    strstr-sse2-unaligned \
> > >    varshift \
> > >  # sysdep_routines
> > > @@ -123,6 +124,7 @@ CFLAGS-varshift.c += -msse4  CFLAGS-strcspn-c.c
> > +=
> > > -msse4  CFLAGS-strpbrk-c.c += -msse4  CFLAGS-strspn-c.c += -msse4
> > > +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -
> > mavx512bw
> > > +-mbmi -mbmi2 -O3
> > >  endif
> > >
> > >  ifeq ($(subdir),wcsmbs)
> > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > index c5cd9466fe..58f3ec8306 100644
> > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > @@ -618,6 +618,12 @@ __libc_ifunc_impl_list (const char *name, struct
> > > libc_ifunc_impl *array,
> > >
> > >    /* Support sysdeps/x86_64/multiarch/strstr.c.  */
> > >    IFUNC_IMPL (i, name, strstr,
> > > +              IFUNC_IMPL_ADD (array, i, strstr,
> > > +                              (CPU_FEATURE_USABLE (AVX512VL)
> > > +                               && CPU_FEATURE_USABLE (AVX512BW)
> > > +                               && CPU_FEATURE_USABLE (AVX512DQ)
> > > +                               && CPU_FEATURE_USABLE (BMI2)),
> > > +                              __strstr_avx512)
> > >               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
> > >               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > new file mode 100644
> > > index 0000000000..2ab9e96db8
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > @@ -0,0 +1,214 @@
> > > +/* strstr optimized with 512-bit AVX-512 instructions
> > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +#include <immintrin.h>
> > > +#include <inttypes.h>
> > > +#include <stdbool.h>
> > > +#include <string.h>
> > > +
> > > +#define FULL_MMASK64 0xffffffffffffffff #define ONE_64BIT 0x1ull
> > > +#define ZMM_SIZE_IN_BYTES 64 #define PAGESIZE 4096
> > > +
> > > +/*
> > > + Returns the index of the first edge within the needle, returns 0 if
> > > +no edge  is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> > > + */
> > > +static inline size_t
> > > +find_edge_in_needle (const char *ned) {
> > > +  size_t ind = 0;
> > > +  while (ned[ind + 1] != '\0')
> > > +    {
> > > +      if (ned[ind] != ned[ind + 1])
> > > +        return ind;
> > > +      else
> > > +        ind = ind + 1;
> > > +    }
> > > +  return 0;
> > > +}
> > > +
> > > +/*
> > > + Compare needle with haystack byte by byte at specified location  */
> > > +static inline bool verify_string_match (const char *hay, const size_t
> > > +hay_index, const char *ned,
> > > +                     size_t ind)
> > > +{
> > > +  while (ned[ind] != '\0')
> > > +    {
> > > +      if (ned[ind] != hay[hay_index + ind])
> > > +        return false;
> > > +      ind = ind + 1;
> > > +    }
> > > +  return true;
> > > +}
> > > +
> > > +/*
> > > + Compare needle with haystack at specified location. The first 64
> > > +bytes are  compared using a ZMM register.
> > > + */
> > > +static inline bool
> > > +verify_string_match_avx512 (const char *hay, const size_t hay_index,
> > > +                            const char *ned, const __mmask64 ned_mask,
> > > +                            const __m512i ned_zmm) {
> > > +  /* check first 64 bytes using zmm and then scalar */
> > > +  __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe
> > to
> > > +do so
> > > +  __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask,
> > hay_zmm,
> > > +ned_zmm);
> > > +  if (match != 0x0) // failed the first few chars
> > > +    return false;
> > > +  else if (ned_mask == FULL_MMASK64)
> > > +    return verify_string_match (hay, hay_index, ned,
> > > +ZMM_SIZE_IN_BYTES);
> > > +  return true;
> > > +}
> > > +
> > > +char *
> > > +__strstr_avx512 (const char *haystack, const char *ned) {
> > > +  char first = ned[0];
> > > +  if (first == '\0')
> > > +    return (char *)haystack;
> > > +  if (ned[1] == '\0')
> > > +    return (char *)strchr (haystack, ned[0]);
> > > +
> > > +  size_t edge = find_edge_in_needle (ned);
> > > +
> > > +  /* ensure haystack is as long as the pos of edge in needle */  for
> > > + (int ii = 0; ii < edge; ++ii)
> > > +    {
> > > +      if (haystack[ii] == '\0')
> > > +        return NULL;
> > > +    }
> > > +
> > > +  /*
> > > +   Load 64 bytes of the needle and save it to a zmm register
> > > +   Read one cache line at a time to avoid loading across a page boundary
> > > +   */
> > > +  __mmask64 ned_load_mask = _bzhi_u64 (
> > > +      FULL_MMASK64, 64 - ((uintptr_t) (ned) & 63));  __m512i ned_zmm
> > > + = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
> > > +  __mmask64 ned_nullmask
> > > +      = _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm,
> > > + ned_zmm);
> > > +
> > > +  if (__glibc_unlikely (ned_nullmask == 0x0))
> > > +    {
> > > +      ned_zmm = _mm512_loadu_si512 (ned);
> > > +      ned_nullmask = _mm512_testn_epi8_mask (ned_zmm, ned_zmm);
> > > +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > > +      if (ned_nullmask != 0x0)
> > > +        ned_load_mask = ned_load_mask >> 1;
> > > +    }
> > > +  else
> > > +    {
> > > +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > > +      ned_load_mask = ned_load_mask >> 1;
> > > +    }
> > > +  const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);  const __m512i
> > > + ned1 = _mm512_set1_epi8 (ned[edge + 1]);
> > > +
> > > +  /*
> > > +   Read the bytes of haystack in the current cache line
> > > +   */
> > > +  size_t hay_index = edge;
> > > +  __mmask64 loadmask = _bzhi_u64 (
> > > +      FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
> > > +  /* First load is a partial cache line */  __m512i hay0 =
> > > + _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> > > +  /* Search for NULL and compare only till null char */  uint64_t
> > > + nullmask
> > > +      = _cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask,
> > hay0,
> > > + hay0));  uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > > + cmpmask = cmpmask & _cvtmask64_u64 (loadmask);
> > > +  /* Search for the 2 charaters of needle */
> > > +  __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > +  __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> > > +  k1 = _kshiftri_mask64 (k1, 1);
> > > +  /* k2 masks tell us if both chars from needle match */  uint64_t k2
> > > + = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> > > +  /* For every match, search for the entire needle for a full match
> > > + */  while (k2)
> > > +    {
> > > +      uint64_t bitcount = _tzcnt_u64 (k2);
> > > +      k2 = _blsr_u64 (k2);
> > > +      size_t match_pos = hay_index + bitcount - edge;
> > > +      if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
> > > +          < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
> > > +        {
> > > +          /*
> > > +           * Use vector compare as long as you are not crossing a page
> > > +           */
> > > +          if (verify_string_match_avx512 (haystack, match_pos, ned,
> > > +                                          ned_load_mask, ned_zmm))
> > > +            return (char *)haystack + match_pos;
> > > +        }
> > > +      else
> > > +        {
> > > +          if (verify_string_match (haystack, match_pos, ned, 0))
> > > +            return (char *)haystack + match_pos;
> > > +        }
> > > +    }
> > > +  /* We haven't checked for potential match at the last char yet */
> > > + haystack = (const char *)(((uintptr_t) (haystack + hay_index) |
> > > + 63));  hay_index = 0;
> > > +
> > > +  /*
> > > +   Loop over one cache line at a time to prevent reading over page
> > > +   boundary
> > > +   */
> > > +  __m512i hay1;
> > > +  while (nullmask == 0)
> > > +    {
> > > +      hay0 = _mm512_loadu_si512 (haystack + hay_index);
> > > +      hay1 = _mm512_load_si512 (haystack + hay_index
> > > +                                + 1); // Always 64 byte aligned
> > > +      nullmask = _cvtmask64_u64 (_mm512_testn_epi8_mask (hay1,
> > hay1));
> > > +      /* Compare only till null char */
> > > +      cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > > +      k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > +      k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> > > +      /* k2 masks tell us if both chars from needle match */
> > > +      k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> > > +      /* For every match, compare full strings for potential match */
> > > +      while (k2)
> > > +        {
> > > +          uint64_t bitcount = _tzcnt_u64 (k2);
> > > +          k2 = _blsr_u64 (k2);
> > > +          size_t match_pos = hay_index + bitcount - edge;
> > > +          if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
> > > +              < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
> > > +            {
> > > +              /*
> > > +               * Use vector compare as long as you are not crossing a page
> > > +               */
> > > +              if (verify_string_match_avx512 (haystack, match_pos, ned,
> > > +                                              ned_load_mask, ned_zmm))
> > > +                return (char *)haystack + match_pos;
> > > +            }
> > > +          else
> > > +            {
> > > +              /* Compare byte by byte */
> > > +              if (verify_string_match (haystack, match_pos, ned, 0))
> > > +                return (char *)haystack + match_pos;
> > > +            }
> > > +        }
> > > +      hay_index += ZMM_SIZE_IN_BYTES;
> > > +    }
> > > +  return NULL;
> > > +}
> > > diff --git a/sysdeps/x86_64/multiarch/strstr.c
> > > b/sysdeps/x86_64/multiarch/strstr.c
> > > index 95600a9de5..2fb8b169b6 100644
> > > --- a/sysdeps/x86_64/multiarch/strstr.c
> > > +++ b/sysdeps/x86_64/multiarch/strstr.c
> > > @@ -35,16 +35,32 @@
> > >
> > >  extern __typeof (__redirect_strstr) __strstr_sse2_unaligned
> > > attribute_hidden;  extern __typeof (__redirect_strstr) __strstr_sse2
> > > attribute_hidden;
> > > +extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
> > >
> > >  #include "init-arch.h"
> > >
> > >  /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> > >     ifunc symbol properly.  */
> > >  extern __typeof (__redirect_strstr) __libc_strstr; -libc_ifunc
> > > (__libc_strstr,
> > > -           HAS_ARCH_FEATURE (Fast_Unaligned_Load)
> > > -           ? __strstr_sse2_unaligned
> > > -           : __strstr_sse2)
> > >
> > > +static inline void *
> > > +IFUNC_SELECTOR (void)
> > > +{
> > > +  const struct cpu_features *cpu_features = __get_cpu_features ();
> > > +
> > > +  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> > > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
> > > +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > > +    return __strstr_avx512;
> > > +
> > > +  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> > > +    return __strstr_sse2_unaligned;
> > > +
> > > +  return __strstr_sse2;
> > > +}
> > > +
> > > +libc_ifunc_redirected (__redirect_strstr, __libc_strstr,
> > > +IFUNC_SELECTOR ());
> > >  #undef strstr
> > >  strong_alias (__libc_strstr, strstr)
> > > --
> > > 2.36.1
> > >
> >
> > LGTM.
> >
> > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> >
> > Do you need me to commit it for you?
>
> Yes, Please. Thanks!
>
> >
> > Thanks.
> >
> > --
> > H.J.

I would like to backport this patch to release branches.
Any comments or objections?

This patch will have 2 squashed commit to fix glibc build failure with gcc 6.4.1

commit f2698954ff9c2f9626d4bcb5a30eb5729714e0b0
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Tue Jul 12 11:48:04 2022 -0700

    x86: Remove __mmask intrinsics in strstr-avx512.c

commit 5082a287d5e9a1f9cb98b7c982a708a3684f1d5c
Author: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date:   Mon Jun 6 12:17:43 2022 -0700

    x86_64: Add strstr function with 512-bit EVEX

--Sunil
diff mbox series

Patch

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index d0869c3ac3..3d153cac35 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -116,6 +116,7 @@  sysdep_routines += \
   strrchr-sse2 \
   strspn-c \
   strspn-sse2 \
+  strstr-avx512 \
   strstr-sse2-unaligned \
   varshift \
 # sysdep_routines
@@ -123,6 +124,7 @@  CFLAGS-varshift.c += -msse4
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
 CFLAGS-strspn-c.c += -msse4
+CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
 endif
 
 ifeq ($(subdir),wcsmbs)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index c5cd9466fe..58f3ec8306 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -618,6 +618,12 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strstr.c.  */
   IFUNC_IMPL (i, name, strstr,
+              IFUNC_IMPL_ADD (array, i, strstr,
+                              (CPU_FEATURE_USABLE (AVX512VL)
+                               && CPU_FEATURE_USABLE (AVX512BW)
+                               && CPU_FEATURE_USABLE (AVX512DQ)
+                               && CPU_FEATURE_USABLE (BMI2)),
+                              __strstr_avx512)
 	      IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
 
diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
new file mode 100644
index 0000000000..2ab9e96db8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
@@ -0,0 +1,214 @@ 
+/* strstr optimized with 512-bit AVX-512 instructions
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <immintrin.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <string.h>
+
+#define FULL_MMASK64 0xffffffffffffffff
+#define ONE_64BIT 0x1ull
+#define ZMM_SIZE_IN_BYTES 64
+#define PAGESIZE 4096
+
+/*
+ Returns the index of the first edge within the needle, returns 0 if no edge
+ is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
+ */
+static inline size_t
+find_edge_in_needle (const char *ned)
+{
+  size_t ind = 0;
+  while (ned[ind + 1] != '\0')
+    {
+      if (ned[ind] != ned[ind + 1])
+        return ind;
+      else
+        ind = ind + 1;
+    }
+  return 0;
+}
+
+/*
+ Compare needle with haystack byte by byte at specified location
+ */
+static inline bool
+verify_string_match (const char *hay, const size_t hay_index, const char *ned,
+                     size_t ind)
+{
+  while (ned[ind] != '\0')
+    {
+      if (ned[ind] != hay[hay_index + ind])
+        return false;
+      ind = ind + 1;
+    }
+  return true;
+}
+
+/*
+ Compare needle with haystack at specified location. The first 64 bytes are
+ compared using a ZMM register.
+ */
+static inline bool
+verify_string_match_avx512 (const char *hay, const size_t hay_index,
+                            const char *ned, const __mmask64 ned_mask,
+                            const __m512i ned_zmm)
+{
+  /* check first 64 bytes using zmm and then scalar */
+  __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so
+  __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm);
+  if (match != 0x0) // failed the first few chars
+    return false;
+  else if (ned_mask == FULL_MMASK64)
+    return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES);
+  return true;
+}
+
+char *
+__strstr_avx512 (const char *haystack, const char *ned)
+{
+  char first = ned[0];
+  if (first == '\0')
+    return (char *)haystack;
+  if (ned[1] == '\0')
+    return (char *)strchr (haystack, ned[0]);
+
+  size_t edge = find_edge_in_needle (ned);
+
+  /* ensure haystack is as long as the pos of edge in needle */
+  for (int ii = 0; ii < edge; ++ii)
+    {
+      if (haystack[ii] == '\0')
+        return NULL;
+    }
+
+  /*
+   Load 64 bytes of the needle and save it to a zmm register
+   Read one cache line at a time to avoid loading across a page boundary
+   */
+  __mmask64 ned_load_mask = _bzhi_u64 (
+      FULL_MMASK64, 64 - ((uintptr_t) (ned) & 63));
+  __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
+  __mmask64 ned_nullmask
+      = _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm, ned_zmm);
+
+  if (__glibc_unlikely (ned_nullmask == 0x0))
+    {
+      ned_zmm = _mm512_loadu_si512 (ned);
+      ned_nullmask = _mm512_testn_epi8_mask (ned_zmm, ned_zmm);
+      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
+      if (ned_nullmask != 0x0)
+        ned_load_mask = ned_load_mask >> 1;
+    }
+  else
+    {
+      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
+      ned_load_mask = ned_load_mask >> 1;
+    }
+  const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);
+  const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]);
+
+  /*
+   Read the bytes of haystack in the current cache line
+   */
+  size_t hay_index = edge;
+  __mmask64 loadmask = _bzhi_u64 (
+      FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
+  /* First load is a partial cache line */
+  __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
+  /* Search for NULL and compare only till null char */
+  uint64_t nullmask
+      = _cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
+  uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
+  cmpmask = cmpmask & _cvtmask64_u64 (loadmask);
+  /* Search for the 2 charaters of needle */
+  __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
+  __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
+  k1 = _kshiftri_mask64 (k1, 1);
+  /* k2 masks tell us if both chars from needle match */
+  uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
+  /* For every match, search for the entire needle for a full match */
+  while (k2)
+    {
+      uint64_t bitcount = _tzcnt_u64 (k2);
+      k2 = _blsr_u64 (k2);
+      size_t match_pos = hay_index + bitcount - edge;
+      if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
+          < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
+        {
+          /*
+           * Use vector compare as long as you are not crossing a page
+           */
+          if (verify_string_match_avx512 (haystack, match_pos, ned,
+                                          ned_load_mask, ned_zmm))
+            return (char *)haystack + match_pos;
+        }
+      else
+        {
+          if (verify_string_match (haystack, match_pos, ned, 0))
+            return (char *)haystack + match_pos;
+        }
+    }
+  /* We haven't checked for potential match at the last char yet */
+  haystack = (const char *)(((uintptr_t) (haystack + hay_index) | 63));
+  hay_index = 0;
+
+  /*
+   Loop over one cache line at a time to prevent reading over page
+   boundary
+   */
+  __m512i hay1;
+  while (nullmask == 0)
+    {
+      hay0 = _mm512_loadu_si512 (haystack + hay_index);
+      hay1 = _mm512_load_si512 (haystack + hay_index
+                                + 1); // Always 64 byte aligned
+      nullmask = _cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
+      /* Compare only till null char */
+      cmpmask = nullmask ^ (nullmask - ONE_64BIT);
+      k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
+      k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
+      /* k2 masks tell us if both chars from needle match */
+      k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
+      /* For every match, compare full strings for potential match */
+      while (k2)
+        {
+          uint64_t bitcount = _tzcnt_u64 (k2);
+          k2 = _blsr_u64 (k2);
+          size_t match_pos = hay_index + bitcount - edge;
+          if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
+              < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
+            {
+              /*
+               * Use vector compare as long as you are not crossing a page
+               */
+              if (verify_string_match_avx512 (haystack, match_pos, ned,
+                                              ned_load_mask, ned_zmm))
+                return (char *)haystack + match_pos;
+            }
+          else
+            {
+              /* Compare byte by byte */
+              if (verify_string_match (haystack, match_pos, ned, 0))
+                return (char *)haystack + match_pos;
+            }
+        }
+      hay_index += ZMM_SIZE_IN_BYTES;
+    }
+  return NULL;
+}
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
index 95600a9de5..2fb8b169b6 100644
--- a/sysdeps/x86_64/multiarch/strstr.c
+++ b/sysdeps/x86_64/multiarch/strstr.c
@@ -35,16 +35,32 @@ 
 
 extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
 extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
+extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
 
 #include "init-arch.h"
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 extern __typeof (__redirect_strstr) __libc_strstr;
-libc_ifunc (__libc_strstr,
-	    HAS_ARCH_FEATURE (Fast_Unaligned_Load)
-	    ? __strstr_sse2_unaligned
-	    : __strstr_sse2)
 
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features *cpu_features = __get_cpu_features ();
+
+  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
+      && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+    return __strstr_avx512;
+
+  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
+    return __strstr_sse2_unaligned;
+
+  return __strstr_sse2;
+}
+
+libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ());
 #undef strstr
 strong_alias (__libc_strstr, strstr)