Message ID | 20220606191743.3344179-1-raghuveer.devulapalli@intel.com |
---|---|
State | New |
Headers | show |
Series | [v2] x86_64: Add strstr function with 512-bit EVEX | expand |
On Mon, Jun 6, 2022 at 12:09 PM Raghuveer Devulapalli via Libc-alpha <libc-alpha@sourceware.org> wrote: > > Adding a 512-bit EVEX version of strstr. The algorithm works as follows: > > (1) We spend a few cycles at the begining to peek into the needle. We > locate an edge in the needle (first occurance of 2 consequent distinct > characters) and also store the first 64-bytes into a zmm register. > > (2) We search for the edge in the haystack by looking into one cache > line of the haystack at a time. This avoids having to read past a page > boundary which can cause a seg fault. > > (3) If an edge is found in the haystack we first compare the first > 64-bytes of the needle (already stored in a zmm register) before we > proceed with a full string compare performed byte by byte. > > Benchmarking results: (old = strstr_sse2_unaligned, new = strstr_avx512) > > Geometric mean of all benchmarks: new / old = 0.66 > > Difficult skiptable(0) : new / old = 0.02 > Difficult skiptable(1) : new / old = 0.01 > Difficult 2-way : new / old = 0.25 > Difficult testing first 2 : new / old = 1.26 > Difficult skiptable(0) : new / old = 0.05 > Difficult skiptable(1) : new / old = 0.06 > Difficult 2-way : new / old = 0.26 > Difficult testing first 2 : new / old = 1.05 > Difficult skiptable(0) : new / old = 0.42 > Difficult skiptable(1) : new / old = 0.24 > Difficult 2-way : new / old = 0.21 > Difficult testing first 2 : new / old = 1.04 > --- > sysdeps/x86_64/multiarch/Makefile | 2 + > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 + > sysdeps/x86_64/multiarch/strstr-avx512.c | 214 +++++++++++++++++++++ > sysdeps/x86_64/multiarch/strstr.c | 24 ++- > 4 files changed, 242 insertions(+), 4 deletions(-) > create mode 100644 sysdeps/x86_64/multiarch/strstr-avx512.c > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index d0869c3ac3..3d153cac35 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -116,6 +116,7 @@ 
sysdep_routines += \ > strrchr-sse2 \ > strspn-c \ > strspn-sse2 \ > + strstr-avx512 \ > strstr-sse2-unaligned \ > varshift \ > # sysdep_routines > @@ -123,6 +124,7 @@ CFLAGS-varshift.c += -msse4 > CFLAGS-strcspn-c.c += -msse4 > CFLAGS-strpbrk-c.c += -msse4 > CFLAGS-strspn-c.c += -msse4 > +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3 Do we need -O3? HJ, are there any issues with having this as -O3? > endif > > ifeq ($(subdir),wcsmbs) > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index c5cd9466fe..58f3ec8306 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -618,6 +618,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/strstr.c. */ > IFUNC_IMPL (i, name, strstr, > + IFUNC_IMPL_ADD (array, i, strstr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (AVX512DQ) > + && CPU_FEATURE_USABLE (BMI2)), > + __strstr_avx512) > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2)) > > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c > new file mode 100644 > index 0000000000..2ab9e96db8 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c > @@ -0,0 +1,214 @@ > +/* strstr optimized with 512-bit AVX-512 instructions > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. 
> + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <immintrin.h> > +#include <inttypes.h> > +#include <stdbool.h> > +#include <string.h> > + > +#define FULL_MMASK64 0xffffffffffffffff > +#define ONE_64BIT 0x1ull > +#define ZMM_SIZE_IN_BYTES 64 > +#define PAGESIZE 4096 > + > +/* > + Returns the index of the first edge within the needle, returns 0 if no edge > + is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg' > + */ > +static inline size_t > +find_edge_in_needle (const char *ned) > +{ > + size_t ind = 0; > + while (ned[ind + 1] != '\0') > + { > + if (ned[ind] != ned[ind + 1]) > + return ind; > + else > + ind = ind + 1; > + } > + return 0; > +} > + > +/* > + Compare needle with haystack byte by byte at specified location > + */ > +static inline bool > +verify_string_match (const char *hay, const size_t hay_index, const char *ned, > + size_t ind) > +{ > + while (ned[ind] != '\0') > + { > + if (ned[ind] != hay[hay_index + ind]) > + return false; > + ind = ind + 1; > + } > + return true; > +} > + > +/* > + Compare needle with haystack at specified location. The first 64 bytes are > + compared using a ZMM register. 
> + */ > +static inline bool > +verify_string_match_avx512 (const char *hay, const size_t hay_index, > + const char *ned, const __mmask64 ned_mask, > + const __m512i ned_zmm) > +{ > + /* check first 64 bytes using zmm and then scalar */ > + __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so > + __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm); > + if (match != 0x0) // failed the first few chars > + return false; > + else if (ned_mask == FULL_MMASK64) > + return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES); > + return true; > +} > + > +char * > +__strstr_avx512 (const char *haystack, const char *ned) > +{ > + char first = ned[0]; > + if (first == '\0') > + return (char *)haystack; > + if (ned[1] == '\0') > + return (char *)strchr (haystack, ned[0]); > + > + size_t edge = find_edge_in_needle (ned); > + > + /* ensure haystack is as long as the pos of edge in needle */ > + for (int ii = 0; ii < edge; ++ii) > + { > + if (haystack[ii] == '\0') > + return NULL; > + } > + > + /* > + Load 64 bytes of the needle and save it to a zmm register > + Read one cache line at a time to avoid loading across a page boundary > + */ > + __mmask64 ned_load_mask = _bzhi_u64 ( > + FULL_MMASK64, 64 - ((uintptr_t) (ned) & 63)); > + __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned); > + __mmask64 ned_nullmask > + = _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm, ned_zmm); > + > + if (__glibc_unlikely (ned_nullmask == 0x0)) > + { > + ned_zmm = _mm512_loadu_si512 (ned); > + ned_nullmask = _mm512_testn_epi8_mask (ned_zmm, ned_zmm); > + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT); > + if (ned_nullmask != 0x0) > + ned_load_mask = ned_load_mask >> 1; > + } > + else > + { > + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT); > + ned_load_mask = ned_load_mask >> 1; > + } > + const __m512i ned0 = _mm512_set1_epi8 (ned[edge]); > + const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]); > + > + /* > 
+ Read the bytes of haystack in the current cache line > + */ > + size_t hay_index = edge; > + __mmask64 loadmask = _bzhi_u64 ( > + FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63)); > + /* First load is a partial cache line */ > + __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index); > + /* Search for NULL and compare only till null char */ > + uint64_t nullmask > + = _cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0)); > + uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT); > + cmpmask = cmpmask & _cvtmask64_u64 (loadmask); > + /* Search for the 2 charaters of needle */ > + __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0); > + __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1); > + k1 = _kshiftri_mask64 (k1, 1); > + /* k2 masks tell us if both chars from needle match */ > + uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask; > + /* For every match, search for the entire needle for a full match */ > + while (k2) > + { > + uint64_t bitcount = _tzcnt_u64 (k2); > + k2 = _blsr_u64 (k2); > + size_t match_pos = hay_index + bitcount - edge; > + if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1)) > + < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES) > + { > + /* > + * Use vector compare as long as you are not crossing a page > + */ > + if (verify_string_match_avx512 (haystack, match_pos, ned, > + ned_load_mask, ned_zmm)) > + return (char *)haystack + match_pos; > + } > + else > + { > + if (verify_string_match (haystack, match_pos, ned, 0)) > + return (char *)haystack + match_pos; > + } > + } > + /* We haven't checked for potential match at the last char yet */ > + haystack = (const char *)(((uintptr_t) (haystack + hay_index) | 63)); > + hay_index = 0; > + > + /* > + Loop over one cache line at a time to prevent reading over page > + boundary > + */ > + __m512i hay1; > + while (nullmask == 0) > + { > + hay0 = _mm512_loadu_si512 (haystack + hay_index); > + hay1 = _mm512_load_si512 (haystack + hay_index > + + 1); // 
Always 64 byte aligned > + nullmask = _cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1)); > + /* Compare only till null char */ > + cmpmask = nullmask ^ (nullmask - ONE_64BIT); > + k0 = _mm512_cmpeq_epi8_mask (hay0, ned0); > + k1 = _mm512_cmpeq_epi8_mask (hay1, ned1); > + /* k2 masks tell us if both chars from needle match */ > + k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask; > + /* For every match, compare full strings for potential match */ > + while (k2) > + { > + uint64_t bitcount = _tzcnt_u64 (k2); > + k2 = _blsr_u64 (k2); > + size_t match_pos = hay_index + bitcount - edge; > + if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1)) > + < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES) > + { > + /* > + * Use vector compare as long as you are not crossing a page > + */ > + if (verify_string_match_avx512 (haystack, match_pos, ned, > + ned_load_mask, ned_zmm)) > + return (char *)haystack + match_pos; > + } > + else > + { > + /* Compare byte by byte */ > + if (verify_string_match (haystack, match_pos, ned, 0)) > + return (char *)haystack + match_pos; > + } > + } > + hay_index += ZMM_SIZE_IN_BYTES; > + } > + return NULL; > +} > diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c > index 95600a9de5..2fb8b169b6 100644 > --- a/sysdeps/x86_64/multiarch/strstr.c > +++ b/sysdeps/x86_64/multiarch/strstr.c > @@ -35,16 +35,32 @@ > > extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden; > extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden; > +extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden; > > #include "init-arch.h" > > /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle > ifunc symbol properly. */ > extern __typeof (__redirect_strstr) __libc_strstr; > -libc_ifunc (__libc_strstr, > - HAS_ARCH_FEATURE (Fast_Unaligned_Load) > - ? 
__strstr_sse2_unaligned > - : __strstr_sse2) > > +static inline void * > +IFUNC_SELECTOR (void) > +{ > + const struct cpu_features *cpu_features = __get_cpu_features (); > + > + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512) > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ) > + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) > + return __strstr_avx512; > + > + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) > + return __strstr_sse2_unaligned; > + > + return __strstr_sse2; > +} > + > +libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ()); > #undef strstr > strong_alias (__libc_strstr, strstr) > -- > 2.36.1 >
On Mon, Jun 6, 2022 at 1:25 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Mon, Jun 6, 2022 at 12:09 PM Raghuveer Devulapalli via Libc-alpha > <libc-alpha@sourceware.org> wrote: > > > > Adding a 512-bit EVEX version of strstr. The algorithm works as follows: > > > > (1) We spend a few cycles at the begining to peek into the needle. We > > locate an edge in the needle (first occurance of 2 consequent distinct > > characters) and also store the first 64-bytes into a zmm register. > > > > (2) We search for the edge in the haystack by looking into one cache > > line of the haystack at a time. This avoids having to read past a page > > boundary which can cause a seg fault. > > > > (3) If an edge is found in the haystack we first compare the first > > 64-bytes of the needle (already stored in a zmm register) before we > > proceed with a full string compare performed byte by byte. > > > > Benchmarking results: (old = strstr_sse2_unaligned, new = strstr_avx512) > > > > Geometric mean of all benchmarks: new / old = 0.66 > > > > Difficult skiptable(0) : new / old = 0.02 > > Difficult skiptable(1) : new / old = 0.01 > > Difficult 2-way : new / old = 0.25 > > Difficult testing first 2 : new / old = 1.26 > > Difficult skiptable(0) : new / old = 0.05 > > Difficult skiptable(1) : new / old = 0.06 > > Difficult 2-way : new / old = 0.26 > > Difficult testing first 2 : new / old = 1.05 > > Difficult skiptable(0) : new / old = 0.42 > > Difficult skiptable(1) : new / old = 0.24 > > Difficult 2-way : new / old = 0.21 > > Difficult testing first 2 : new / old = 1.04 > > --- > > sysdeps/x86_64/multiarch/Makefile | 2 + > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 + > > sysdeps/x86_64/multiarch/strstr-avx512.c | 214 +++++++++++++++++++++ > > sysdeps/x86_64/multiarch/strstr.c | 24 ++- > > 4 files changed, 242 insertions(+), 4 deletions(-) > > create mode 100644 sysdeps/x86_64/multiarch/strstr-avx512.c > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile 
b/sysdeps/x86_64/multiarch/Makefile > > index d0869c3ac3..3d153cac35 100644 > > --- a/sysdeps/x86_64/multiarch/Makefile > > +++ b/sysdeps/x86_64/multiarch/Makefile > > @@ -116,6 +116,7 @@ sysdep_routines += \ > > strrchr-sse2 \ > > strspn-c \ > > strspn-sse2 \ > > + strstr-avx512 \ > > strstr-sse2-unaligned \ > > varshift \ > > # sysdep_routines > > @@ -123,6 +124,7 @@ CFLAGS-varshift.c += -msse4 > > CFLAGS-strcspn-c.c += -msse4 > > CFLAGS-strpbrk-c.c += -msse4 > > CFLAGS-strspn-c.c += -msse4 > > +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3 > > Do we need -O3? > > HJ, are there any issues with having this as -O3? No issue I've heard. > > endif > > > > ifeq ($(subdir),wcsmbs) > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > index c5cd9466fe..58f3ec8306 100644 > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > @@ -618,6 +618,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > /* Support sysdeps/x86_64/multiarch/strstr.c. */ > > IFUNC_IMPL (i, name, strstr, > > + IFUNC_IMPL_ADD (array, i, strstr, > > + (CPU_FEATURE_USABLE (AVX512VL) > > + && CPU_FEATURE_USABLE (AVX512BW) > > + && CPU_FEATURE_USABLE (AVX512DQ) > > + && CPU_FEATURE_USABLE (BMI2)), > > + __strstr_avx512) > > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned) > > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2)) > > > > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c > > new file mode 100644 > > index 0000000000..2ab9e96db8 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c > > @@ -0,0 +1,214 @@ > > +/* strstr optimized with 512-bit AVX-512 instructions > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. 
> > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#include <immintrin.h> > > +#include <inttypes.h> > > +#include <stdbool.h> > > +#include <string.h> > > + > > +#define FULL_MMASK64 0xffffffffffffffff > > +#define ONE_64BIT 0x1ull > > +#define ZMM_SIZE_IN_BYTES 64 > > +#define PAGESIZE 4096 > > + > > +/* > > + Returns the index of the first edge within the needle, returns 0 if no edge > > + is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg' > > + */ > > +static inline size_t > > +find_edge_in_needle (const char *ned) > > +{ > > + size_t ind = 0; > > + while (ned[ind + 1] != '\0') > > + { > > + if (ned[ind] != ned[ind + 1]) > > + return ind; > > + else > > + ind = ind + 1; > > + } > > + return 0; > > +} > > + > > +/* > > + Compare needle with haystack byte by byte at specified location > > + */ > > +static inline bool > > +verify_string_match (const char *hay, const size_t hay_index, const char *ned, > > + size_t ind) > > +{ > > + while (ned[ind] != '\0') > > + { > > + if (ned[ind] != hay[hay_index + ind]) > > + return false; > > + ind = ind + 1; > > + } > > + return true; > > +} > > + > > +/* > > + Compare needle with haystack at specified location. The first 64 bytes are > > + compared using a ZMM register. 
> > + */ > > +static inline bool > > +verify_string_match_avx512 (const char *hay, const size_t hay_index, > > + const char *ned, const __mmask64 ned_mask, > > + const __m512i ned_zmm) > > +{ > > + /* check first 64 bytes using zmm and then scalar */ > > + __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so > > + __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm); > > + if (match != 0x0) // failed the first few chars > > + return false; > > + else if (ned_mask == FULL_MMASK64) > > + return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES); > > + return true; > > +} > > + > > +char * > > +__strstr_avx512 (const char *haystack, const char *ned) > > +{ > > + char first = ned[0]; > > + if (first == '\0') > > + return (char *)haystack; > > + if (ned[1] == '\0') > > + return (char *)strchr (haystack, ned[0]); > > + > > + size_t edge = find_edge_in_needle (ned); > > + > > + /* ensure haystack is as long as the pos of edge in needle */ > > + for (int ii = 0; ii < edge; ++ii) > > + { > > + if (haystack[ii] == '\0') > > + return NULL; > > + } > > + > > + /* > > + Load 64 bytes of the needle and save it to a zmm register > > + Read one cache line at a time to avoid loading across a page boundary > > + */ > > + __mmask64 ned_load_mask = _bzhi_u64 ( > > + FULL_MMASK64, 64 - ((uintptr_t) (ned) & 63)); > > + __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned); > > + __mmask64 ned_nullmask > > + = _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm, ned_zmm); > > + > > + if (__glibc_unlikely (ned_nullmask == 0x0)) > > + { > > + ned_zmm = _mm512_loadu_si512 (ned); > > + ned_nullmask = _mm512_testn_epi8_mask (ned_zmm, ned_zmm); > > + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT); > > + if (ned_nullmask != 0x0) > > + ned_load_mask = ned_load_mask >> 1; > > + } > > + else > > + { > > + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT); > > + ned_load_mask = ned_load_mask >> 1; > > + } > > + const 
__m512i ned0 = _mm512_set1_epi8 (ned[edge]); > > + const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]); > > + > > + /* > > + Read the bytes of haystack in the current cache line > > + */ > > + size_t hay_index = edge; > > + __mmask64 loadmask = _bzhi_u64 ( > > + FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63)); > > + /* First load is a partial cache line */ > > + __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index); > > + /* Search for NULL and compare only till null char */ > > + uint64_t nullmask > > + = _cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0)); > > + uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT); > > + cmpmask = cmpmask & _cvtmask64_u64 (loadmask); > > + /* Search for the 2 charaters of needle */ > > + __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0); > > + __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1); > > + k1 = _kshiftri_mask64 (k1, 1); > > + /* k2 masks tell us if both chars from needle match */ > > + uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask; > > + /* For every match, search for the entire needle for a full match */ > > + while (k2) > > + { > > + uint64_t bitcount = _tzcnt_u64 (k2); > > + k2 = _blsr_u64 (k2); > > + size_t match_pos = hay_index + bitcount - edge; > > + if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1)) > > + < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES) > > + { > > + /* > > + * Use vector compare as long as you are not crossing a page > > + */ > > + if (verify_string_match_avx512 (haystack, match_pos, ned, > > + ned_load_mask, ned_zmm)) > > + return (char *)haystack + match_pos; > > + } > > + else > > + { > > + if (verify_string_match (haystack, match_pos, ned, 0)) > > + return (char *)haystack + match_pos; > > + } > > + } > > + /* We haven't checked for potential match at the last char yet */ > > + haystack = (const char *)(((uintptr_t) (haystack + hay_index) | 63)); > > + hay_index = 0; > > + > > + /* > > + Loop over one cache line at a time to 
prevent reading over page > > + boundary > > + */ > > + __m512i hay1; > > + while (nullmask == 0) > > + { > > + hay0 = _mm512_loadu_si512 (haystack + hay_index); > > + hay1 = _mm512_load_si512 (haystack + hay_index > > + + 1); // Always 64 byte aligned > > + nullmask = _cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1)); > > + /* Compare only till null char */ > > + cmpmask = nullmask ^ (nullmask - ONE_64BIT); > > + k0 = _mm512_cmpeq_epi8_mask (hay0, ned0); > > + k1 = _mm512_cmpeq_epi8_mask (hay1, ned1); > > + /* k2 masks tell us if both chars from needle match */ > > + k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask; > > + /* For every match, compare full strings for potential match */ > > + while (k2) > > + { > > + uint64_t bitcount = _tzcnt_u64 (k2); > > + k2 = _blsr_u64 (k2); > > + size_t match_pos = hay_index + bitcount - edge; > > + if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1)) > > + < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES) > > + { > > + /* > > + * Use vector compare as long as you are not crossing a page > > + */ > > + if (verify_string_match_avx512 (haystack, match_pos, ned, > > + ned_load_mask, ned_zmm)) > > + return (char *)haystack + match_pos; > > + } > > + else > > + { > > + /* Compare byte by byte */ > > + if (verify_string_match (haystack, match_pos, ned, 0)) > > + return (char *)haystack + match_pos; > > + } > > + } > > + hay_index += ZMM_SIZE_IN_BYTES; > > + } > > + return NULL; > > +} > > diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c > > index 95600a9de5..2fb8b169b6 100644 > > --- a/sysdeps/x86_64/multiarch/strstr.c > > +++ b/sysdeps/x86_64/multiarch/strstr.c > > @@ -35,16 +35,32 @@ > > > > extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden; > > extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden; > > +extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden; > > > > #include "init-arch.h" > > > > /* Avoid DWARF definition DIE on ifunc symbol so 
that GDB can handle > > ifunc symbol properly. */ > > extern __typeof (__redirect_strstr) __libc_strstr; > > -libc_ifunc (__libc_strstr, > > - HAS_ARCH_FEATURE (Fast_Unaligned_Load) > > - ? __strstr_sse2_unaligned > > - : __strstr_sse2) > > > > +static inline void * > > +IFUNC_SELECTOR (void) > > +{ > > + const struct cpu_features *cpu_features = __get_cpu_features (); > > + > > + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512) > > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) > > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ) > > + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) > > + return __strstr_avx512; > > + > > + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) > > + return __strstr_sse2_unaligned; > > + > > + return __strstr_sse2; > > +} > > + > > +libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ()); > > #undef strstr > > strong_alias (__libc_strstr, strstr) > > -- > > 2.36.1 > > LGTM.
On Mon, Jun 6, 2022 at 12:09 PM Raghuveer Devulapalli via Libc-alpha <libc-alpha@sourceware.org> wrote: > > Adding a 512-bit EVEX version of strstr. The algorithm works as follows: > > (1) We spend a few cycles at the begining to peek into the needle. We > locate an edge in the needle (first occurance of 2 consequent distinct > characters) and also store the first 64-bytes into a zmm register. > > (2) We search for the edge in the haystack by looking into one cache > line of the haystack at a time. This avoids having to read past a page > boundary which can cause a seg fault. > > (3) If an edge is found in the haystack we first compare the first > 64-bytes of the needle (already stored in a zmm register) before we > proceed with a full string compare performed byte by byte. > > Benchmarking results: (old = strstr_sse2_unaligned, new = strstr_avx512) > > Geometric mean of all benchmarks: new / old = 0.66 > > Difficult skiptable(0) : new / old = 0.02 > Difficult skiptable(1) : new / old = 0.01 > Difficult 2-way : new / old = 0.25 > Difficult testing first 2 : new / old = 1.26 > Difficult skiptable(0) : new / old = 0.05 > Difficult skiptable(1) : new / old = 0.06 > Difficult 2-way : new / old = 0.26 > Difficult testing first 2 : new / old = 1.05 > Difficult skiptable(0) : new / old = 0.42 > Difficult skiptable(1) : new / old = 0.24 > Difficult 2-way : new / old = 0.21 > Difficult testing first 2 : new / old = 1.04 > --- > sysdeps/x86_64/multiarch/Makefile | 2 + > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 + > sysdeps/x86_64/multiarch/strstr-avx512.c | 214 +++++++++++++++++++++ > sysdeps/x86_64/multiarch/strstr.c | 24 ++- > 4 files changed, 242 insertions(+), 4 deletions(-) > create mode 100644 sysdeps/x86_64/multiarch/strstr-avx512.c > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index d0869c3ac3..3d153cac35 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -116,6 +116,7 @@ 
sysdep_routines += \ > strrchr-sse2 \ > strspn-c \ > strspn-sse2 \ > + strstr-avx512 \ > strstr-sse2-unaligned \ > varshift \ > # sysdep_routines > @@ -123,6 +124,7 @@ CFLAGS-varshift.c += -msse4 > CFLAGS-strcspn-c.c += -msse4 > CFLAGS-strpbrk-c.c += -msse4 > CFLAGS-strspn-c.c += -msse4 > +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3 > endif > > ifeq ($(subdir),wcsmbs) > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index c5cd9466fe..58f3ec8306 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -618,6 +618,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/strstr.c. */ > IFUNC_IMPL (i, name, strstr, > + IFUNC_IMPL_ADD (array, i, strstr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (AVX512DQ) > + && CPU_FEATURE_USABLE (BMI2)), > + __strstr_avx512) > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2)) > > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c > new file mode 100644 > index 0000000000..2ab9e96db8 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c > @@ -0,0 +1,214 @@ > +/* strstr optimized with 512-bit AVX-512 instructions > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. 
> + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <immintrin.h> > +#include <inttypes.h> > +#include <stdbool.h> > +#include <string.h> > + > +#define FULL_MMASK64 0xffffffffffffffff > +#define ONE_64BIT 0x1ull > +#define ZMM_SIZE_IN_BYTES 64 > +#define PAGESIZE 4096 > + > +/* > + Returns the index of the first edge within the needle, returns 0 if no edge > + is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg' > + */ > +static inline size_t > +find_edge_in_needle (const char *ned) > +{ > + size_t ind = 0; > + while (ned[ind + 1] != '\0') > + { > + if (ned[ind] != ned[ind + 1]) > + return ind; > + else > + ind = ind + 1; > + } > + return 0; > +} > + > +/* > + Compare needle with haystack byte by byte at specified location > + */ > +static inline bool > +verify_string_match (const char *hay, const size_t hay_index, const char *ned, > + size_t ind) > +{ > + while (ned[ind] != '\0') > + { > + if (ned[ind] != hay[hay_index + ind]) > + return false; > + ind = ind + 1; > + } > + return true; > +} > + > +/* > + Compare needle with haystack at specified location. The first 64 bytes are > + compared using a ZMM register. 
> + */ > +static inline bool > +verify_string_match_avx512 (const char *hay, const size_t hay_index, > + const char *ned, const __mmask64 ned_mask, > + const __m512i ned_zmm) > +{ > + /* check first 64 bytes using zmm and then scalar */ > + __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so > + __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm); > + if (match != 0x0) // failed the first few chars > + return false; > + else if (ned_mask == FULL_MMASK64) > + return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES); > + return true; > +} > + > +char * > +__strstr_avx512 (const char *haystack, const char *ned) > +{ > + char first = ned[0]; > + if (first == '\0') > + return (char *)haystack; > + if (ned[1] == '\0') > + return (char *)strchr (haystack, ned[0]); > + > + size_t edge = find_edge_in_needle (ned); > + > + /* ensure haystack is as long as the pos of edge in needle */ > + for (int ii = 0; ii < edge; ++ii) > + { > + if (haystack[ii] == '\0') > + return NULL; > + } > + > + /* > + Load 64 bytes of the needle and save it to a zmm register > + Read one cache line at a time to avoid loading across a page boundary > + */ > + __mmask64 ned_load_mask = _bzhi_u64 ( > + FULL_MMASK64, 64 - ((uintptr_t) (ned) & 63)); > + __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned); > + __mmask64 ned_nullmask > + = _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm, ned_zmm); > + > + if (__glibc_unlikely (ned_nullmask == 0x0)) > + { > + ned_zmm = _mm512_loadu_si512 (ned); > + ned_nullmask = _mm512_testn_epi8_mask (ned_zmm, ned_zmm); > + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT); > + if (ned_nullmask != 0x0) > + ned_load_mask = ned_load_mask >> 1; > + } > + else > + { > + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT); > + ned_load_mask = ned_load_mask >> 1; > + } > + const __m512i ned0 = _mm512_set1_epi8 (ned[edge]); > + const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]); > + > + /* > 
+ Read the bytes of haystack in the current cache line > + */ > + size_t hay_index = edge; > + __mmask64 loadmask = _bzhi_u64 ( > + FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63)); > + /* First load is a partial cache line */ > + __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index); > + /* Search for NULL and compare only till null char */ > + uint64_t nullmask > + = _cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0)); > + uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT); > + cmpmask = cmpmask & _cvtmask64_u64 (loadmask); > + /* Search for the 2 charaters of needle */ > + __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0); > + __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1); > + k1 = _kshiftri_mask64 (k1, 1); > + /* k2 masks tell us if both chars from needle match */ > + uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask; > + /* For every match, search for the entire needle for a full match */ > + while (k2) > + { > + uint64_t bitcount = _tzcnt_u64 (k2); > + k2 = _blsr_u64 (k2); > + size_t match_pos = hay_index + bitcount - edge; > + if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1)) > + < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES) > + { > + /* > + * Use vector compare as long as you are not crossing a page > + */ > + if (verify_string_match_avx512 (haystack, match_pos, ned, > + ned_load_mask, ned_zmm)) > + return (char *)haystack + match_pos; > + } > + else > + { > + if (verify_string_match (haystack, match_pos, ned, 0)) > + return (char *)haystack + match_pos; > + } > + } > + /* We haven't checked for potential match at the last char yet */ > + haystack = (const char *)(((uintptr_t) (haystack + hay_index) | 63)); > + hay_index = 0; > + > + /* > + Loop over one cache line at a time to prevent reading over page > + boundary > + */ > + __m512i hay1; > + while (nullmask == 0) > + { > + hay0 = _mm512_loadu_si512 (haystack + hay_index); > + hay1 = _mm512_load_si512 (haystack + hay_index > + + 1); // 
Always 64 byte aligned > + nullmask = _cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1)); > + /* Compare only till null char */ > + cmpmask = nullmask ^ (nullmask - ONE_64BIT); > + k0 = _mm512_cmpeq_epi8_mask (hay0, ned0); > + k1 = _mm512_cmpeq_epi8_mask (hay1, ned1); > + /* k2 masks tell us if both chars from needle match */ > + k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask; > + /* For every match, compare full strings for potential match */ > + while (k2) > + { > + uint64_t bitcount = _tzcnt_u64 (k2); > + k2 = _blsr_u64 (k2); > + size_t match_pos = hay_index + bitcount - edge; > + if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1)) > + < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES) > + { > + /* > + * Use vector compare as long as you are not crossing a page > + */ > + if (verify_string_match_avx512 (haystack, match_pos, ned, > + ned_load_mask, ned_zmm)) > + return (char *)haystack + match_pos; > + } > + else > + { > + /* Compare byte by byte */ > + if (verify_string_match (haystack, match_pos, ned, 0)) > + return (char *)haystack + match_pos; > + } > + } > + hay_index += ZMM_SIZE_IN_BYTES; > + } > + return NULL; > +} > diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c > index 95600a9de5..2fb8b169b6 100644 > --- a/sysdeps/x86_64/multiarch/strstr.c > +++ b/sysdeps/x86_64/multiarch/strstr.c > @@ -35,16 +35,32 @@ > > extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden; > extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden; > +extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden; > > #include "init-arch.h" > > /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle > ifunc symbol properly. */ > extern __typeof (__redirect_strstr) __libc_strstr; > -libc_ifunc (__libc_strstr, > - HAS_ARCH_FEATURE (Fast_Unaligned_Load) > - ? 
__strstr_sse2_unaligned > - : __strstr_sse2) > > +static inline void * > +IFUNC_SELECTOR (void) > +{ > + const struct cpu_features *cpu_features = __get_cpu_features (); > + > + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512) > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ) > + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) > + return __strstr_avx512; > + > + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) > + return __strstr_sse2_unaligned; > + > + return __strstr_sse2; > +} > + > +libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ()); > #undef strstr > strong_alias (__libc_strstr, strstr) > -- > 2.36.1 > LGTM. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Do you need me to commit it for you? Thanks.
> -----Original Message----- > From: H.J. Lu <hjl.tools@gmail.com> > Sent: Monday, June 6, 2022 2:32 PM > To: Devulapalli, Raghuveer <raghuveer.devulapalli@intel.com> > Cc: GNU C Library <libc-alpha@sourceware.org> > Subject: Re: [PATCH v2] x86_64: Add strstr function with 512-bit EVEX > > On Mon, Jun 6, 2022 at 12:09 PM Raghuveer Devulapalli via Libc-alpha <libc- > alpha@sourceware.org> wrote: > > > > Adding a 512-bit EVEX version of strstr. The algorithm works as follows: > > > > (1) We spend a few cycles at the begining to peek into the needle. We > > locate an edge in the needle (first occurance of 2 consequent distinct > > characters) and also store the first 64-bytes into a zmm register. > > > > (2) We search for the edge in the haystack by looking into one cache > > line of the haystack at a time. This avoids having to read past a page > > boundary which can cause a seg fault. > > > > (3) If an edge is found in the haystack we first compare the first > > 64-bytes of the needle (already stored in a zmm register) before we > > proceed with a full string compare performed byte by byte. 
> > > > Benchmarking results: (old = strstr_sse2_unaligned, new = > > strstr_avx512) > > > > Geometric mean of all benchmarks: new / old = 0.66 > > > > Difficult skiptable(0) : new / old = 0.02 Difficult skiptable(1) : > > new / old = 0.01 Difficult 2-way : new / old = 0.25 Difficult > > testing first 2 : new / old = 1.26 Difficult skiptable(0) : new / old > > = 0.05 Difficult skiptable(1) : new / old = 0.06 Difficult 2-way : > > new / old = 0.26 Difficult testing first 2 : new / old = 1.05 > > Difficult skiptable(0) : new / old = 0.42 Difficult skiptable(1) : > > new / old = 0.24 Difficult 2-way : new / old = 0.21 Difficult > > testing first 2 : new / old = 1.04 > > --- > > sysdeps/x86_64/multiarch/Makefile | 2 + > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 + > > sysdeps/x86_64/multiarch/strstr-avx512.c | 214 > +++++++++++++++++++++ > > sysdeps/x86_64/multiarch/strstr.c | 24 ++- > > 4 files changed, 242 insertions(+), 4 deletions(-) create mode > > 100644 sysdeps/x86_64/multiarch/strstr-avx512.c > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile > > b/sysdeps/x86_64/multiarch/Makefile > > index d0869c3ac3..3d153cac35 100644 > > --- a/sysdeps/x86_64/multiarch/Makefile > > +++ b/sysdeps/x86_64/multiarch/Makefile > > @@ -116,6 +116,7 @@ sysdep_routines += \ > > strrchr-sse2 \ > > strspn-c \ > > strspn-sse2 \ > > + strstr-avx512 \ > > strstr-sse2-unaligned \ > > varshift \ > > # sysdep_routines > > @@ -123,6 +124,7 @@ CFLAGS-varshift.c += -msse4 CFLAGS-strcspn-c.c > += > > -msse4 CFLAGS-strpbrk-c.c += -msse4 CFLAGS-strspn-c.c += -msse4 > > +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq - > mavx512bw > > +-mbmi -mbmi2 -O3 > > endif > > > > ifeq ($(subdir),wcsmbs) > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > index c5cd9466fe..58f3ec8306 100644 > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > @@ -618,6 +618,12 @@ 
__libc_ifunc_impl_list (const char *name, struct > > libc_ifunc_impl *array, > > > > /* Support sysdeps/x86_64/multiarch/strstr.c. */ > > IFUNC_IMPL (i, name, strstr, > > + IFUNC_IMPL_ADD (array, i, strstr, > > + (CPU_FEATURE_USABLE (AVX512VL) > > + && CPU_FEATURE_USABLE (AVX512BW) > > + && CPU_FEATURE_USABLE (AVX512DQ) > > + && CPU_FEATURE_USABLE (BMI2)), > > + __strstr_avx512) > > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned) > > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2)) > > > > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c > > b/sysdeps/x86_64/multiarch/strstr-avx512.c > > new file mode 100644 > > index 0000000000..2ab9e96db8 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c > > @@ -0,0 +1,214 @@ > > +/* strstr optimized with 512-bit AVX-512 instructions > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. 
*/ > > + > > +#include <immintrin.h> > > +#include <inttypes.h> > > +#include <stdbool.h> > > +#include <string.h> > > + > > +#define FULL_MMASK64 0xffffffffffffffff #define ONE_64BIT 0x1ull > > +#define ZMM_SIZE_IN_BYTES 64 #define PAGESIZE 4096 > > + > > +/* > > + Returns the index of the first edge within the needle, returns 0 if > > +no edge is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg' > > + */ > > +static inline size_t > > +find_edge_in_needle (const char *ned) { > > + size_t ind = 0; > > + while (ned[ind + 1] != '\0') > > + { > > + if (ned[ind] != ned[ind + 1]) > > + return ind; > > + else > > + ind = ind + 1; > > + } > > + return 0; > > +} > > + > > +/* > > + Compare needle with haystack byte by byte at specified location */ > > +static inline bool verify_string_match (const char *hay, const size_t > > +hay_index, const char *ned, > > + size_t ind) > > +{ > > + while (ned[ind] != '\0') > > + { > > + if (ned[ind] != hay[hay_index + ind]) > > + return false; > > + ind = ind + 1; > > + } > > + return true; > > +} > > + > > +/* > > + Compare needle with haystack at specified location. The first 64 > > +bytes are compared using a ZMM register. 
> > + */ > > +static inline bool > > +verify_string_match_avx512 (const char *hay, const size_t hay_index, > > + const char *ned, const __mmask64 ned_mask, > > + const __m512i ned_zmm) { > > + /* check first 64 bytes using zmm and then scalar */ > > + __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe > to > > +do so > > + __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, > hay_zmm, > > +ned_zmm); > > + if (match != 0x0) // failed the first few chars > > + return false; > > + else if (ned_mask == FULL_MMASK64) > > + return verify_string_match (hay, hay_index, ned, > > +ZMM_SIZE_IN_BYTES); > > + return true; > > +} > > + > > +char * > > +__strstr_avx512 (const char *haystack, const char *ned) { > > + char first = ned[0]; > > + if (first == '\0') > > + return (char *)haystack; > > + if (ned[1] == '\0') > > + return (char *)strchr (haystack, ned[0]); > > + > > + size_t edge = find_edge_in_needle (ned); > > + > > + /* ensure haystack is as long as the pos of edge in needle */ for > > + (int ii = 0; ii < edge; ++ii) > > + { > > + if (haystack[ii] == '\0') > > + return NULL; > > + } > > + > > + /* > > + Load 64 bytes of the needle and save it to a zmm register > > + Read one cache line at a time to avoid loading across a page boundary > > + */ > > + __mmask64 ned_load_mask = _bzhi_u64 ( > > + FULL_MMASK64, 64 - ((uintptr_t) (ned) & 63)); __m512i ned_zmm > > + = _mm512_maskz_loadu_epi8 (ned_load_mask, ned); > > + __mmask64 ned_nullmask > > + = _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm, > > + ned_zmm); > > + > > + if (__glibc_unlikely (ned_nullmask == 0x0)) > > + { > > + ned_zmm = _mm512_loadu_si512 (ned); > > + ned_nullmask = _mm512_testn_epi8_mask (ned_zmm, ned_zmm); > > + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT); > > + if (ned_nullmask != 0x0) > > + ned_load_mask = ned_load_mask >> 1; > > + } > > + else > > + { > > + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT); > > + ned_load_mask = ned_load_mask >> 1; > > + 
} > > + const __m512i ned0 = _mm512_set1_epi8 (ned[edge]); const __m512i > > + ned1 = _mm512_set1_epi8 (ned[edge + 1]); > > + > > + /* > > + Read the bytes of haystack in the current cache line > > + */ > > + size_t hay_index = edge; > > + __mmask64 loadmask = _bzhi_u64 ( > > + FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63)); > > + /* First load is a partial cache line */ __m512i hay0 = > > + _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index); > > + /* Search for NULL and compare only till null char */ uint64_t > > + nullmask > > + = _cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, > hay0, > > + hay0)); uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT); > > + cmpmask = cmpmask & _cvtmask64_u64 (loadmask); > > + /* Search for the 2 charaters of needle */ > > + __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0); > > + __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1); > > + k1 = _kshiftri_mask64 (k1, 1); > > + /* k2 masks tell us if both chars from needle match */ uint64_t k2 > > + = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask; > > + /* For every match, search for the entire needle for a full match > > + */ while (k2) > > + { > > + uint64_t bitcount = _tzcnt_u64 (k2); > > + k2 = _blsr_u64 (k2); > > + size_t match_pos = hay_index + bitcount - edge; > > + if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1)) > > + < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES) > > + { > > + /* > > + * Use vector compare as long as you are not crossing a page > > + */ > > + if (verify_string_match_avx512 (haystack, match_pos, ned, > > + ned_load_mask, ned_zmm)) > > + return (char *)haystack + match_pos; > > + } > > + else > > + { > > + if (verify_string_match (haystack, match_pos, ned, 0)) > > + return (char *)haystack + match_pos; > > + } > > + } > > + /* We haven't checked for potential match at the last char yet */ > > + haystack = (const char *)(((uintptr_t) (haystack + hay_index) | > > + 63)); hay_index = 0; > > + > > + /* > > + Loop over one cache line 
at a time to prevent reading over page > > + boundary > > + */ > > + __m512i hay1; > > + while (nullmask == 0) > > + { > > + hay0 = _mm512_loadu_si512 (haystack + hay_index); > > + hay1 = _mm512_load_si512 (haystack + hay_index > > + + 1); // Always 64 byte aligned > > + nullmask = _cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, > hay1)); > > + /* Compare only till null char */ > > + cmpmask = nullmask ^ (nullmask - ONE_64BIT); > > + k0 = _mm512_cmpeq_epi8_mask (hay0, ned0); > > + k1 = _mm512_cmpeq_epi8_mask (hay1, ned1); > > + /* k2 masks tell us if both chars from needle match */ > > + k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask; > > + /* For every match, compare full strings for potential match */ > > + while (k2) > > + { > > + uint64_t bitcount = _tzcnt_u64 (k2); > > + k2 = _blsr_u64 (k2); > > + size_t match_pos = hay_index + bitcount - edge; > > + if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1)) > > + < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES) > > + { > > + /* > > + * Use vector compare as long as you are not crossing a page > > + */ > > + if (verify_string_match_avx512 (haystack, match_pos, ned, > > + ned_load_mask, ned_zmm)) > > + return (char *)haystack + match_pos; > > + } > > + else > > + { > > + /* Compare byte by byte */ > > + if (verify_string_match (haystack, match_pos, ned, 0)) > > + return (char *)haystack + match_pos; > > + } > > + } > > + hay_index += ZMM_SIZE_IN_BYTES; > > + } > > + return NULL; > > +} > > diff --git a/sysdeps/x86_64/multiarch/strstr.c > > b/sysdeps/x86_64/multiarch/strstr.c > > index 95600a9de5..2fb8b169b6 100644 > > --- a/sysdeps/x86_64/multiarch/strstr.c > > +++ b/sysdeps/x86_64/multiarch/strstr.c > > @@ -35,16 +35,32 @@ > > > > extern __typeof (__redirect_strstr) __strstr_sse2_unaligned > > attribute_hidden; extern __typeof (__redirect_strstr) __strstr_sse2 > > attribute_hidden; > > +extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden; > > > > #include "init-arch.h" > > > > /* Avoid DWARF definition 
DIE on ifunc symbol so that GDB can handle > > ifunc symbol properly. */ > > extern __typeof (__redirect_strstr) __libc_strstr; -libc_ifunc > > (__libc_strstr, > > - HAS_ARCH_FEATURE (Fast_Unaligned_Load) > > - ? __strstr_sse2_unaligned > > - : __strstr_sse2) > > > > +static inline void * > > +IFUNC_SELECTOR (void) > > +{ > > + const struct cpu_features *cpu_features = __get_cpu_features (); > > + > > + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512) > > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) > > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ) > > + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) > > + return __strstr_avx512; > > + > > + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) > > + return __strstr_sse2_unaligned; > > + > > + return __strstr_sse2; > > +} > > + > > +libc_ifunc_redirected (__redirect_strstr, __libc_strstr, > > +IFUNC_SELECTOR ()); > > #undef strstr > > strong_alias (__libc_strstr, strstr) > > -- > > 2.36.1 > > > > LGTM. > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com> > > Do you need me to commit it for you? Yes, Please. Thanks! > > Thanks. > > -- > H.J.
On Mon, Jun 6, 2022 at 2:41 PM Devulapalli, Raghuveer via Libc-alpha <libc-alpha@sourceware.org> wrote: > > > > > -----Original Message----- > > From: H.J. Lu <hjl.tools@gmail.com> > > Sent: Monday, June 6, 2022 2:32 PM > > To: Devulapalli, Raghuveer <raghuveer.devulapalli@intel.com> > > Cc: GNU C Library <libc-alpha@sourceware.org> > > Subject: Re: [PATCH v2] x86_64: Add strstr function with 512-bit EVEX > > > > On Mon, Jun 6, 2022 at 12:09 PM Raghuveer Devulapalli via Libc-alpha <libc- > > alpha@sourceware.org> wrote: > > > > > > Adding a 512-bit EVEX version of strstr. The algorithm works as follows: > > > > > > (1) We spend a few cycles at the begining to peek into the needle. We > > > locate an edge in the needle (first occurance of 2 consequent distinct > > > characters) and also store the first 64-bytes into a zmm register. > > > > > > (2) We search for the edge in the haystack by looking into one cache > > > line of the haystack at a time. This avoids having to read past a page > > > boundary which can cause a seg fault. > > > > > > (3) If an edge is found in the haystack we first compare the first > > > 64-bytes of the needle (already stored in a zmm register) before we > > > proceed with a full string compare performed byte by byte. 
> > > > > > Benchmarking results: (old = strstr_sse2_unaligned, new = > > > strstr_avx512) > > > > > > Geometric mean of all benchmarks: new / old = 0.66 > > > > > > Difficult skiptable(0) : new / old = 0.02 Difficult skiptable(1) : > > > new / old = 0.01 Difficult 2-way : new / old = 0.25 Difficult > > > testing first 2 : new / old = 1.26 Difficult skiptable(0) : new / old > > > = 0.05 Difficult skiptable(1) : new / old = 0.06 Difficult 2-way : > > > new / old = 0.26 Difficult testing first 2 : new / old = 1.05 > > > Difficult skiptable(0) : new / old = 0.42 Difficult skiptable(1) : > > > new / old = 0.24 Difficult 2-way : new / old = 0.21 Difficult > > > testing first 2 : new / old = 1.04 > > > --- > > > sysdeps/x86_64/multiarch/Makefile | 2 + > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 + > > > sysdeps/x86_64/multiarch/strstr-avx512.c | 214 > > +++++++++++++++++++++ > > > sysdeps/x86_64/multiarch/strstr.c | 24 ++- > > > 4 files changed, 242 insertions(+), 4 deletions(-) create mode > > > 100644 sysdeps/x86_64/multiarch/strstr-avx512.c > > > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile > > > b/sysdeps/x86_64/multiarch/Makefile > > > index d0869c3ac3..3d153cac35 100644 > > > --- a/sysdeps/x86_64/multiarch/Makefile > > > +++ b/sysdeps/x86_64/multiarch/Makefile > > > @@ -116,6 +116,7 @@ sysdep_routines += \ > > > strrchr-sse2 \ > > > strspn-c \ > > > strspn-sse2 \ > > > + strstr-avx512 \ > > > strstr-sse2-unaligned \ > > > varshift \ > > > # sysdep_routines > > > @@ -123,6 +124,7 @@ CFLAGS-varshift.c += -msse4 CFLAGS-strcspn-c.c > > += > > > -msse4 CFLAGS-strpbrk-c.c += -msse4 CFLAGS-strspn-c.c += -msse4 > > > +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq - > > mavx512bw > > > +-mbmi -mbmi2 -O3 > > > endif > > > > > > ifeq ($(subdir),wcsmbs) > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > index c5cd9466fe..58f3ec8306 100644 > > > --- 
a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > @@ -618,6 +618,12 @@ __libc_ifunc_impl_list (const char *name, struct > > > libc_ifunc_impl *array, > > > > > > /* Support sysdeps/x86_64/multiarch/strstr.c. */ > > > IFUNC_IMPL (i, name, strstr, > > > + IFUNC_IMPL_ADD (array, i, strstr, > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > + && CPU_FEATURE_USABLE (AVX512DQ) > > > + && CPU_FEATURE_USABLE (BMI2)), > > > + __strstr_avx512) > > > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned) > > > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2)) > > > > > > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c > > > b/sysdeps/x86_64/multiarch/strstr-avx512.c > > > new file mode 100644 > > > index 0000000000..2ab9e96db8 > > > --- /dev/null > > > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c > > > @@ -0,0 +1,214 @@ > > > +/* strstr optimized with 512-bit AVX-512 instructions > > > + Copyright (C) 2022 Free Software Foundation, Inc. > > > + This file is part of the GNU C Library. > > > + > > > + The GNU C Library is free software; you can redistribute it and/or > > > + modify it under the terms of the GNU Lesser General Public > > > + License as published by the Free Software Foundation; either > > > + version 2.1 of the License, or (at your option) any later version. > > > + > > > + The GNU C Library is distributed in the hope that it will be useful, > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > > GNU > > > + Lesser General Public License for more details. > > > + > > > + You should have received a copy of the GNU Lesser General Public > > > + License along with the GNU C Library; if not, see > > > + <https://www.gnu.org/licenses/>. 
*/ > > > + > > > +#include <immintrin.h> > > > +#include <inttypes.h> > > > +#include <stdbool.h> > > > +#include <string.h> > > > + > > > +#define FULL_MMASK64 0xffffffffffffffff #define ONE_64BIT 0x1ull > > > +#define ZMM_SIZE_IN_BYTES 64 #define PAGESIZE 4096 > > > + > > > +/* > > > + Returns the index of the first edge within the needle, returns 0 if > > > +no edge is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg' > > > + */ > > > +static inline size_t > > > +find_edge_in_needle (const char *ned) { > > > + size_t ind = 0; > > > + while (ned[ind + 1] != '\0') > > > + { > > > + if (ned[ind] != ned[ind + 1]) > > > + return ind; > > > + else > > > + ind = ind + 1; > > > + } > > > + return 0; > > > +} > > > + > > > +/* > > > + Compare needle with haystack byte by byte at specified location */ > > > +static inline bool verify_string_match (const char *hay, const size_t > > > +hay_index, const char *ned, > > > + size_t ind) > > > +{ > > > + while (ned[ind] != '\0') > > > + { > > > + if (ned[ind] != hay[hay_index + ind]) > > > + return false; > > > + ind = ind + 1; > > > + } > > > + return true; > > > +} > > > + > > > +/* > > > + Compare needle with haystack at specified location. The first 64 > > > +bytes are compared using a ZMM register. 
> > > + */ > > > +static inline bool > > > +verify_string_match_avx512 (const char *hay, const size_t hay_index, > > > + const char *ned, const __mmask64 ned_mask, > > > + const __m512i ned_zmm) { > > > + /* check first 64 bytes using zmm and then scalar */ > > > + __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe > > to > > > +do so > > > + __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, > > hay_zmm, > > > +ned_zmm); > > > + if (match != 0x0) // failed the first few chars > > > + return false; > > > + else if (ned_mask == FULL_MMASK64) > > > + return verify_string_match (hay, hay_index, ned, > > > +ZMM_SIZE_IN_BYTES); > > > + return true; > > > +} > > > + > > > +char * > > > +__strstr_avx512 (const char *haystack, const char *ned) { > > > + char first = ned[0]; > > > + if (first == '\0') > > > + return (char *)haystack; > > > + if (ned[1] == '\0') > > > + return (char *)strchr (haystack, ned[0]); > > > + > > > + size_t edge = find_edge_in_needle (ned); > > > + > > > + /* ensure haystack is as long as the pos of edge in needle */ for > > > + (int ii = 0; ii < edge; ++ii) > > > + { > > > + if (haystack[ii] == '\0') > > > + return NULL; > > > + } > > > + > > > + /* > > > + Load 64 bytes of the needle and save it to a zmm register > > > + Read one cache line at a time to avoid loading across a page boundary > > > + */ > > > + __mmask64 ned_load_mask = _bzhi_u64 ( > > > + FULL_MMASK64, 64 - ((uintptr_t) (ned) & 63)); __m512i ned_zmm > > > + = _mm512_maskz_loadu_epi8 (ned_load_mask, ned); > > > + __mmask64 ned_nullmask > > > + = _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm, > > > + ned_zmm); > > > + > > > + if (__glibc_unlikely (ned_nullmask == 0x0)) > > > + { > > > + ned_zmm = _mm512_loadu_si512 (ned); > > > + ned_nullmask = _mm512_testn_epi8_mask (ned_zmm, ned_zmm); > > > + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT); > > > + if (ned_nullmask != 0x0) > > > + ned_load_mask = ned_load_mask >> 1; > > > + } > > > + else > > > + 
{ > > > + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT); > > > + ned_load_mask = ned_load_mask >> 1; > > > + } > > > + const __m512i ned0 = _mm512_set1_epi8 (ned[edge]); const __m512i > > > + ned1 = _mm512_set1_epi8 (ned[edge + 1]); > > > + > > > + /* > > > + Read the bytes of haystack in the current cache line > > > + */ > > > + size_t hay_index = edge; > > > + __mmask64 loadmask = _bzhi_u64 ( > > > + FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63)); > > > + /* First load is a partial cache line */ __m512i hay0 = > > > + _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index); > > > + /* Search for NULL and compare only till null char */ uint64_t > > > + nullmask > > > + = _cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, > > hay0, > > > + hay0)); uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT); > > > + cmpmask = cmpmask & _cvtmask64_u64 (loadmask); > > > + /* Search for the 2 charaters of needle */ > > > + __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0); > > > + __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1); > > > + k1 = _kshiftri_mask64 (k1, 1); > > > + /* k2 masks tell us if both chars from needle match */ uint64_t k2 > > > + = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask; > > > + /* For every match, search for the entire needle for a full match > > > + */ while (k2) > > > + { > > > + uint64_t bitcount = _tzcnt_u64 (k2); > > > + k2 = _blsr_u64 (k2); > > > + size_t match_pos = hay_index + bitcount - edge; > > > + if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1)) > > > + < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES) > > > + { > > > + /* > > > + * Use vector compare as long as you are not crossing a page > > > + */ > > > + if (verify_string_match_avx512 (haystack, match_pos, ned, > > > + ned_load_mask, ned_zmm)) > > > + return (char *)haystack + match_pos; > > > + } > > > + else > > > + { > > > + if (verify_string_match (haystack, match_pos, ned, 0)) > > > + return (char *)haystack + match_pos; > > > + } > > > + } > > 
> + /* We haven't checked for potential match at the last char yet */ > > > + haystack = (const char *)(((uintptr_t) (haystack + hay_index) | > > > + 63)); hay_index = 0; > > > + > > > + /* > > > + Loop over one cache line at a time to prevent reading over page > > > + boundary > > > + */ > > > + __m512i hay1; > > > + while (nullmask == 0) > > > + { > > > + hay0 = _mm512_loadu_si512 (haystack + hay_index); > > > + hay1 = _mm512_load_si512 (haystack + hay_index > > > + + 1); // Always 64 byte aligned > > > + nullmask = _cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, > > hay1)); > > > + /* Compare only till null char */ > > > + cmpmask = nullmask ^ (nullmask - ONE_64BIT); > > > + k0 = _mm512_cmpeq_epi8_mask (hay0, ned0); > > > + k1 = _mm512_cmpeq_epi8_mask (hay1, ned1); > > > + /* k2 masks tell us if both chars from needle match */ > > > + k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask; > > > + /* For every match, compare full strings for potential match */ > > > + while (k2) > > > + { > > > + uint64_t bitcount = _tzcnt_u64 (k2); > > > + k2 = _blsr_u64 (k2); > > > + size_t match_pos = hay_index + bitcount - edge; > > > + if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1)) > > > + < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES) > > > + { > > > + /* > > > + * Use vector compare as long as you are not crossing a page > > > + */ > > > + if (verify_string_match_avx512 (haystack, match_pos, ned, > > > + ned_load_mask, ned_zmm)) > > > + return (char *)haystack + match_pos; > > > + } > > > + else > > > + { > > > + /* Compare byte by byte */ > > > + if (verify_string_match (haystack, match_pos, ned, 0)) > > > + return (char *)haystack + match_pos; > > > + } > > > + } > > > + hay_index += ZMM_SIZE_IN_BYTES; > > > + } > > > + return NULL; > > > +} > > > diff --git a/sysdeps/x86_64/multiarch/strstr.c > > > b/sysdeps/x86_64/multiarch/strstr.c > > > index 95600a9de5..2fb8b169b6 100644 > > > --- a/sysdeps/x86_64/multiarch/strstr.c > > > +++ b/sysdeps/x86_64/multiarch/strstr.c > > > 
@@ -35,16 +35,32 @@ > > > > > > extern __typeof (__redirect_strstr) __strstr_sse2_unaligned > > > attribute_hidden; extern __typeof (__redirect_strstr) __strstr_sse2 > > > attribute_hidden; > > > +extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden; > > > > > > #include "init-arch.h" > > > > > > /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle > > > ifunc symbol properly. */ > > > extern __typeof (__redirect_strstr) __libc_strstr; -libc_ifunc > > > (__libc_strstr, > > > - HAS_ARCH_FEATURE (Fast_Unaligned_Load) > > > - ? __strstr_sse2_unaligned > > > - : __strstr_sse2) > > > > > > +static inline void * > > > +IFUNC_SELECTOR (void) > > > +{ > > > + const struct cpu_features *cpu_features = __get_cpu_features (); > > > + > > > + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512) > > > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) > > > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > > > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ) > > > + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) > > > + return __strstr_avx512; > > > + > > > + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) > > > + return __strstr_sse2_unaligned; > > > + > > > + return __strstr_sse2; > > > +} > > > + > > > +libc_ifunc_redirected (__redirect_strstr, __libc_strstr, > > > +IFUNC_SELECTOR ()); > > > #undef strstr > > > strong_alias (__libc_strstr, strstr) > > > -- > > > 2.36.1 > > > > > > > LGTM. > > > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com> > > > > Do you need me to commit it for you? > > Yes, Please. Thanks! > > > > > Thanks. > > > > -- > > H.J. I would like to backport this patch to release branches. Any comments or objections? 
This patch will have 2 squashed commits to fix glibc build failure with gcc 6.4.1 commit f2698954ff9c2f9626d4bcb5a30eb5729714e0b0 Author: Noah Goldstein <goldstein.w.n@gmail.com> Date: Tue Jul 12 11:48:04 2022 -0700 x86: Remove __mmask intrinsics in strstr-avx512.c commit 5082a287d5e9a1f9cb98b7c982a708a3684f1d5c Author: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com> Date: Mon Jun 6 12:17:43 2022 -0700 x86_64: Add strstr function with 512-bit EVEX --Sunil
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index d0869c3ac3..3d153cac35 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -116,6 +116,7 @@ sysdep_routines += \ strrchr-sse2 \ strspn-c \ strspn-sse2 \ + strstr-avx512 \ strstr-sse2-unaligned \ varshift \ # sysdep_routines @@ -123,6 +124,7 @@ CFLAGS-varshift.c += -msse4 CFLAGS-strcspn-c.c += -msse4 CFLAGS-strpbrk-c.c += -msse4 CFLAGS-strspn-c.c += -msse4 +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3 endif ifeq ($(subdir),wcsmbs) diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index c5cd9466fe..58f3ec8306 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -618,6 +618,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strstr.c. */ IFUNC_IMPL (i, name, strstr, + IFUNC_IMPL_ADD (array, i, strstr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (AVX512DQ) + && CPU_FEATURE_USABLE (BMI2)), + __strstr_avx512) IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2)) diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c new file mode 100644 index 0000000000..2ab9e96db8 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c @@ -0,0 +1,214 @@ +/* strstr optimized with 512-bit AVX-512 instructions + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <immintrin.h> +#include <inttypes.h> +#include <stdbool.h> +#include <string.h> + +#define FULL_MMASK64 0xffffffffffffffff +#define ONE_64BIT 0x1ull +#define ZMM_SIZE_IN_BYTES 64 +#define PAGESIZE 4096 + +/* + Returns the index of the first edge within the needle, returns 0 if no edge + is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg' + */ +static inline size_t +find_edge_in_needle (const char *ned) +{ + size_t ind = 0; + while (ned[ind + 1] != '\0') + { + if (ned[ind] != ned[ind + 1]) + return ind; + else + ind = ind + 1; + } + return 0; +} + +/* + Compare needle with haystack byte by byte at specified location + */ +static inline bool +verify_string_match (const char *hay, const size_t hay_index, const char *ned, + size_t ind) +{ + while (ned[ind] != '\0') + { + if (ned[ind] != hay[hay_index + ind]) + return false; + ind = ind + 1; + } + return true; +} + +/* + Compare needle with haystack at specified location. The first 64 bytes are + compared using a ZMM register. 
+ */ +static inline bool +verify_string_match_avx512 (const char *hay, const size_t hay_index, + const char *ned, const __mmask64 ned_mask, + const __m512i ned_zmm) +{ + /* check first 64 bytes using zmm and then scalar */ + __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so + __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm); + if (match != 0x0) // failed the first few chars + return false; + else if (ned_mask == FULL_MMASK64) + return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES); + return true; +} + +char * +__strstr_avx512 (const char *haystack, const char *ned) +{ + char first = ned[0]; + if (first == '\0') + return (char *)haystack; + if (ned[1] == '\0') + return (char *)strchr (haystack, ned[0]); + + size_t edge = find_edge_in_needle (ned); + + /* ensure haystack is as long as the pos of edge in needle */ + for (int ii = 0; ii < edge; ++ii) + { + if (haystack[ii] == '\0') + return NULL; + } + + /* + Load 64 bytes of the needle and save it to a zmm register + Read one cache line at a time to avoid loading across a page boundary + */ + __mmask64 ned_load_mask = _bzhi_u64 ( + FULL_MMASK64, 64 - ((uintptr_t) (ned) & 63)); + __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned); + __mmask64 ned_nullmask + = _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm, ned_zmm); + + if (__glibc_unlikely (ned_nullmask == 0x0)) + { + ned_zmm = _mm512_loadu_si512 (ned); + ned_nullmask = _mm512_testn_epi8_mask (ned_zmm, ned_zmm); + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT); + if (ned_nullmask != 0x0) + ned_load_mask = ned_load_mask >> 1; + } + else + { + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT); + ned_load_mask = ned_load_mask >> 1; + } + const __m512i ned0 = _mm512_set1_epi8 (ned[edge]); + const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]); + + /* + Read the bytes of haystack in the current cache line + */ + size_t hay_index = edge; + __mmask64 loadmask = _bzhi_u64 ( + 
FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63)); + /* First load is a partial cache line */ + __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index); + /* Search for NULL and compare only till null char */ + uint64_t nullmask + = _cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0)); + uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT); + cmpmask = cmpmask & _cvtmask64_u64 (loadmask); + /* Search for the 2 characters of needle */ + __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0); + __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1); + k1 = _kshiftri_mask64 (k1, 1); + /* k2 masks tell us if both chars from needle match */ + uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask; + /* For every match, search for the entire needle for a full match */ + while (k2) + { + uint64_t bitcount = _tzcnt_u64 (k2); + k2 = _blsr_u64 (k2); + size_t match_pos = hay_index + bitcount - edge; + if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1)) + < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES) + { + /* + * Use vector compare as long as you are not crossing a page + */ + if (verify_string_match_avx512 (haystack, match_pos, ned, + ned_load_mask, ned_zmm)) + return (char *)haystack + match_pos; + } + else + { + if (verify_string_match (haystack, match_pos, ned, 0)) + return (char *)haystack + match_pos; + } + } + /* We haven't checked for potential match at the last char yet */ + haystack = (const char *)(((uintptr_t) (haystack + hay_index) | 63)); + hay_index = 0; + + /* + Loop over one cache line at a time to prevent reading over page + boundary + */ + __m512i hay1; + while (nullmask == 0) + { + hay0 = _mm512_loadu_si512 (haystack + hay_index); + hay1 = _mm512_load_si512 (haystack + hay_index + + 1); // Always 64 byte aligned + nullmask = _cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1)); + /* Compare only till null char */ + cmpmask = nullmask ^ (nullmask - ONE_64BIT); + k0 = _mm512_cmpeq_epi8_mask (hay0, ned0); + k1 =
_mm512_cmpeq_epi8_mask (hay1, ned1); + /* k2 masks tell us if both chars from needle match */ + k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask; + /* For every match, compare full strings for potential match */ + while (k2) + { + uint64_t bitcount = _tzcnt_u64 (k2); + k2 = _blsr_u64 (k2); + size_t match_pos = hay_index + bitcount - edge; + if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1)) + < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES) + { + /* + * Use vector compare as long as you are not crossing a page + */ + if (verify_string_match_avx512 (haystack, match_pos, ned, + ned_load_mask, ned_zmm)) + return (char *)haystack + match_pos; + } + else + { + /* Compare byte by byte */ + if (verify_string_match (haystack, match_pos, ned, 0)) + return (char *)haystack + match_pos; + } + } + hay_index += ZMM_SIZE_IN_BYTES; + } + return NULL; +} diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c index 95600a9de5..2fb8b169b6 100644 --- a/sysdeps/x86_64/multiarch/strstr.c +++ b/sysdeps/x86_64/multiarch/strstr.c @@ -35,16 +35,32 @@ extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden; extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden; +extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden; #include "init-arch.h" /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle ifunc symbol properly. */ extern __typeof (__redirect_strstr) __libc_strstr; -libc_ifunc (__libc_strstr, - HAS_ARCH_FEATURE (Fast_Unaligned_Load) - ? 
__strstr_sse2_unaligned - : __strstr_sse2) +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features *cpu_features = __get_cpu_features (); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ) + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + return __strstr_avx512; + + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) + return __strstr_sse2_unaligned; + + return __strstr_sse2; +} + +libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ()); #undef strstr strong_alias (__libc_strstr, strstr)