Message ID | 20240218082621.131128-1-tirtajames45@gmail.com |
---|---|
State | New |
Headers | show |
Series | [v6] sysdeps/x86_64/multiarch/memmem-avx2.c: add memmem-avx2.c | expand |
On Sun, Feb 18, 2024 at 8:26 AM James Tirta Halim <tirtajames45@gmail.com> wrote: > > Find the rarest byte in NE. Find the parts of HS that matches the rare byte > and the byte after it. If found, shift back to the start of NE in HS and > vector compare the first VEC_SIZE with NE. If matches, compare the rest > with MEMCMPEQ. > > Timings (Core i3-1115G4): > basic_memmem twoway_memmem __memmem_avx512 __memmem_avx2 > __memmem_generic > Total: > 6.80124e+06 1.06087e+06 219483 345385 768041 > Average: > 25958.9 4049.11 837.721 1318.26 2931.45 > > Passes make check. > > Changes in v1: > 1. Add memmem-avx2.c > > Changes in v2: > 1. Add avx512 support with a generic header file > 2. Use __memcmpeq instead of memcmp > 3. Remove scalar loop > 4. Fix unsafe unaligned load > > Changes in v3: > 1. Avoid checking for alignment to the start of the page since that will be rare > 2. Use __memcmpeq instead of __memcmpeq_avx2 (it generates undefined > reference errors) > 3. Add memmem.c (needs review) > 4. Add __memcmpeq_avx2 and __memcmpeq_avx512 to ifunc-impl-list.c (needs > review) > 5. Add libc_hidden_builtin_def and MEMMEM to memmem.c (needs review) > > Changes in v4: > 1. Correct the cpu feature checks in ifunc-impl-list.c and memmem.c to > use AVX512BW and BMI1 for AVX512 and AVX2 and BMI1 for AVX2 > 2. Correct the Makefile to use the appropriate flags > 3. Rename memmem-vectorized-avx.h to memmem-avx-base.h > 4. Remove unused vector macros (POPCNT and LZCNT) > > Changes in v5: > 1. Rename SHIFT to RARE, OFF to OFF_S, OFF2 to OFF_E > 2. Remove conditional for VEC_SIZE and ONES, and remove unused MASK_SIZE > 3. Add comments > 4. Limit needle length to VEC_SIZE when finding the rare byte > > Changes in v6: > 1. Fix patch apply error in memmem.c > 2. Correctly use MIN(ne_len, VEC_SIZE) when checking if RARE is found at the end > of needle > 3. Always do unaligned load at the tail code > 4. Rename rarebyte_table to ___rarebyte_table > 5. Add memmem-avx-base.c in which ___rarebyte_table is defined > 6. Add memmem-avx-base to the Makefile > 7. Add always_inline to find_rarest_byte > 8. Change ((m << off) >> off) to (m & (ONES >> off)) > 9. Change void * to unsigned char * in find_rarest_byte > > --- > string/memmem.c | 7 +- > sysdeps/x86_64/multiarch/Makefile | 6 + > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 ++ > sysdeps/x86_64/multiarch/memmem-avx-base.c | 20 +++ > sysdeps/x86_64/multiarch/memmem-avx-base.h | 183 +++++++++++++++++++++ > sysdeps/x86_64/multiarch/memmem-avx2.c | 3 + > sysdeps/x86_64/multiarch/memmem-avx512.c | 12 ++ > sysdeps/x86_64/multiarch/memmem.c | 67 ++++++++ > 8 files changed, 309 insertions(+), 1 deletion(-) > create mode 100644 sysdeps/x86_64/multiarch/memmem-avx-base.c > create mode 100644 sysdeps/x86_64/multiarch/memmem-avx-base.h > create mode 100644 sysdeps/x86_64/multiarch/memmem-avx2.c > create mode 100644 sysdeps/x86_64/multiarch/memmem-avx512.c > create mode 100644 sysdeps/x86_64/multiarch/memmem.c > > diff --git a/string/memmem.c b/string/memmem.c > index a4117f8e1e..a315c7d0b5 100644 > --- a/string/memmem.c > +++ b/string/memmem.c > @@ -25,6 +25,10 @@ > # define __memmem memmem > #endif > > +#ifndef MEMMEM > +# define MEMMEM __memmem > +#endif > + > #define RETURN_TYPE void * > #define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l)) > #define FASTSEARCH(S,C,N) (void*) memchr ((void *)(S), (C), (N)) > @@ -50,7 +54,7 @@ > The limit also implies worst-case performance is linear. > Needles larger than 256 characters use the linear-time Two-Way algorithm. */ > void * > -__memmem (const void *haystack, size_t hs_len, > +MEMMEM (const void *haystack, size_t hs_len, > const void *needle, size_t ne_len) > { > const unsigned char *hs = (const unsigned char *) haystack; > @@ -127,3 +131,4 @@ __memmem (const void *haystack, size_t hs_len, > libc_hidden_def (__memmem) > weak_alias (__memmem, memmem) > libc_hidden_weak (memmem) > +libc_hidden_builtin_def (MEMMEM) > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index d3d2270394..0b46d5f341 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -15,6 +15,9 @@ sysdep_routines += \ > memcmpeq-avx2-rtm \ > memcmpeq-evex \ > memcmpeq-sse2 \ > + memmem-avx-base \ > + memmem-avx2 \ > + memmem-avx512 \ > memmove-avx-unaligned-erms \ > memmove-avx-unaligned-erms-rtm \ > memmove-avx512-no-vzeroupper \ > @@ -122,6 +125,9 @@ sysdep_routines += \ > varshift \ > # sysdep_routines > > +CFLAGS-memmem-avx2.c += -mavx2 -mbmi -O3 > +CFLAGS-memmem-avx512.c += -mavx512f -mavx512bw -mbmi -O3 > + > CFLAGS-strcspn-sse4.c += -msse4 > CFLAGS-strpbrk-sse4.c += -msse4 > CFLAGS-strspn-sse4.c += -msse4 > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index c4a21d4b7c..5fe1440235 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -798,6 +798,18 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > __strstr_avx512) > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_generic)) > + > + /* Support sysdeps/x86_64/multiarch/memmem.c. */ > + IFUNC_IMPL (i, name, memmem, > + IFUNC_IMPL_ADD (array, i, memmem, > + (CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI1)), > + __memmem_avx512) > + IFUNC_IMPL_ADD (array, i, memmem, > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI1)), > + __memmem_avx2) > + IFUNC_IMPL_ADD (array, i, memmem, 1, __memmem_generic)) > > /* Support sysdeps/x86_64/multiarch/wcschr.c. */ > IFUNC_IMPL (i, name, wcschr, > diff --git a/sysdeps/x86_64/multiarch/memmem-avx-base.c b/sysdeps/x86_64/multiarch/memmem-avx-base.c > new file mode 100644 > index 0000000000..212d75c96f > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memmem-avx-base.c > @@ -0,0 +1,20 @@ > +const unsigned char ___rarebyte_table[256] attribute_hidden > + = { 0, 1, 13, 56, 59, 60, 61, 62, 63, 232, 248, 2, 158, 4, > + 5, 6, 7, 8, 9, 10, 14, 20, 26, 29, 37, 46, 52, 53, > + 54, 55, 57, 58, 255, 172, 242, 193, 162, 174, 178, 182, 218, 219, > + 212, 180, 249, 197, 221, 210, 253, 231, 230, 224, 225, 226, 227, 223, > + 222, 220, 176, 213, 184, 229, 188, 164, 159, 209, 181, 203, 189, 216, > + 196, 192, 185, 205, 161, 168, 215, 187, 211, 194, 195, 165, 206, 204, > + 214, 198, 173, 179, 175, 183, 167, 202, 239, 201, 160, 241, 163, 246, > + 233, 238, 240, 254, 237, 208, 234, 250, 169, 186, 236, 217, 245, 243, > + 228, 170, 247, 244, 251, 235, 199, 200, 252, 207, 177, 191, 171, 190, > + 166, 3, 140, 134, 124, 126, 86, 128, 95, 117, 114, 93, 81, 87, > + 132, 96, 112, 97, 103, 82, 139, 89, 98, 88, 119, 74, 156, 115, > + 104, 75, 120, 106, 76, 155, 90, 122, 107, 125, 152, 145, 136, 137, > + 101, 116, 102, 108, 99, 141, 77, 78, 118, 79, 109, 100, 150, 73, > + 94, 72, 121, 151, 113, 135, 110, 105, 83, 91, 11, 12, 64, 149, > + 146, 111, 65, 69, 66, 15, 16, 17, 18, 19, 130, 92, 144, 123, > + 21, 22, 23, 24, 131, 133, 127, 142, 25, 70, 129, 27, 28, 67, > + 153, 84, 143, 138, 147, 157, 148, 68, 71, 30, 31, 32, 33, 34, > + 35, 36, 154, 38, 39, 40, 41, 42, 80, 43, 44, 45, 47, 48, > + 85, 49, 50, 51 }; > diff --git a/sysdeps/x86_64/multiarch/memmem-avx-base.h b/sysdeps/x86_64/multiarch/memmem-avx-base.h > new file mode 100644 > index 0000000000..1333eac5b5 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memmem-avx-base.h > @@ -0,0 +1,183 @@ > +#include <immintrin.h> > +#include <inttypes.h> > +#include <string.h> > +#include <libc-pointer-arith.h> > + > +#ifndef FUNC_NAME > +# define __memmem_avx2 > +#endif > +#ifndef VEC > +# define VEC __m256i > +#endif > +#ifndef MASK > +# define MASK uint32_t > +#endif > +#ifndef LOAD > +# define LOAD(x) _mm256_load_si256 (x) > +#endif > +#ifndef LOADU > +# define LOADU(x) _mm256_loadu_si256 (x) > +#endif > +#ifndef CMPEQ8_MASK > +# define CMPEQ8_MASK(x, y) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (x, y)) > +#endif > +#ifndef SETONE8 > +# define SETONE8(x) _mm256_set1_epi8 (x) > +#endif > +#ifndef TZCNT > +# define TZCNT(x) _tzcnt_u32 (x) > +#endif > +#ifndef BLSR > +# define BLSR(x) _blsr_u32 (x) > +#endif > +#define VEC_SIZE sizeof (VEC) > +#define ONES ((MASK) -1) > + > +#ifndef MEMCMPEQ > +# define MEMCMPEQ __memcmpeq > +#endif > +#ifndef MEMCPY > +# define MEMCPY memcpy > +#endif > +#ifndef MEMCHR > +# define MEMCHR memchr > +#endif > +#ifndef PAGE_SIZE > +# define PAGE_SIZE 4096 > +#endif > +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) > + > +/* Lower is rarer. The table is based on the > + *.c and *.h files in glibc. */ > +extern const unsigned char ___rarebyte_table[256] attribute_hidden; > + > +static inline void *__attribute__ ((always_inline)) > +find_rarest_byte (const unsigned char *rare, size_t n) > +{ > + const unsigned char *p = (const unsigned char *) rare; > + int c_rare = ___rarebyte_table[*rare]; > + int c; > + for (; n--; ++p) > + { > + c = ___rarebyte_table[*p]; > + if (c < c_rare) > + { > + rare = p; > + c_rare = c; > + } > + } > + return (void *) rare; > +} > + > +void * > +FUNC_NAME (const void *hs, size_t hs_len, const void *ne, size_t ne_len) > +{ > + if (ne_len == 1) > + return (void *) MEMCHR (hs, *(unsigned char *) ne, hs_len); > + if (__glibc_unlikely (ne_len == 0)) > + return (void *) hs; > + if (__glibc_unlikely (hs_len < ne_len)) > + return NULL; > + VEC hv0, hv1, hv, nv; > + MASK i, hm0, hm1, m, cmpm; > + const unsigned int matchsh = ne_len < VEC_SIZE ? VEC_SIZE - ne_len : 0; > + const MASK matchm = ONES << matchsh; > + const unsigned char *h = (const unsigned char *) hs; > + const unsigned char *const end = h + hs_len - ne_len; > + const unsigned char *hp; > + size_t rare = PTR_DIFF (find_rarest_byte ((const unsigned char *)ne, MIN (ne_len, VEC_SIZE)), ne); > + /* RARE will always be the first byte to find. > + If RARE is at the end of the needle, use the byte before it. */ > + if (rare == MIN (ne_len, VEC_SIZE) - 1) > + --rare; > + const VEC nv0 = SETONE8 (*((char *) ne + rare)); > + const VEC nv1 = SETONE8 (*((char *) ne + rare + 1)); > + unsigned int off_e = (PTR_DIFF (end, h) < VEC_SIZE) > + ? VEC_SIZE - (unsigned int) (end - h) - 1 > + : 0; > + /* Start from the position of RARE. */ > + h += rare; > + /* Load the needle vector. */ > + if (((uintptr_t) ne & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE) > + || ne_len >= VEC_SIZE) > + nv = LOADU ((VEC *) ne); > + else > + MEMCPY (&nv, ne, MIN (VEC_SIZE, ne_len)); > + const unsigned int off_s = PTR_DIFF (h, PTR_ALIGN_DOWN (h, VEC_SIZE)); > + /* Align down to VEC_SIZE. */ > + h -= off_s; > + hv0 = LOAD ((const VEC *) h); > + hm0 = (MASK) CMPEQ8_MASK (hv0, nv0); > + hm1 = (MASK) CMPEQ8_MASK (hv0, nv1) >> 1; > + /* Clear the irrelevant bits from aligning down (OFF_S) and ones that are out > + * of bounds (OFF_E). */ > + m = ((hm0 & hm1) >> off_s) & (ONES >> off_e); > + while (m) > + { > + i = TZCNT (m); > + m = BLSR (m); > + hp = h + off_s + i - rare; > + if (PTR_DIFF (PTR_ALIGN_UP (hp, PAGE_SIZE), hp) >= VEC_SIZE) > + { > + /* Do a vector compare if we are not crossing a page. */ > + hv = LOADU ((VEC *) hp); > + cmpm = (MASK) CMPEQ8_MASK (hv, nv) << matchsh; > + /* Compare only the relevant bits of the needle vector. */ > + if (cmpm == matchm) > + /* Compare the rest of the needle. */ > + if (ne_len <= VEC_SIZE > + || !MEMCMPEQ (hp + VEC_SIZE, (const char *) ne + VEC_SIZE, > + ne_len - VEC_SIZE)) > + return (void *) hp; > + } > + else > + { > + if (!MEMCMPEQ (hp, ne, ne_len)) > + return (void *) hp; > + } > + } > + h += VEC_SIZE - 1; > + for (; h - rare + VEC_SIZE <= end; h += VEC_SIZE) > + { > + hv0 = LOADU ((const VEC *) h); > + hv1 = LOAD ((const VEC *) (h + 1)); > + hm1 = (MASK) CMPEQ8_MASK (hv1, nv1); > + hm0 = (MASK) CMPEQ8_MASK (hv0, nv0); > + m = hm0 & hm1; > + while (m) > + { > + match: > + i = TZCNT (m); > + m = BLSR (m); > + hp = h + i - rare; > + if (PTR_DIFF (PTR_ALIGN_UP (hp, PAGE_SIZE), hp) >= VEC_SIZE) > + { > + hv = LOADU ((VEC *) hp); > + cmpm = (MASK) CMPEQ8_MASK (hv, nv) << matchsh; > + if (cmpm == matchm) > + if (ne_len <= VEC_SIZE > + || !MEMCMPEQ (hp + VEC_SIZE, (const char *) ne + VEC_SIZE, > + ne_len - VEC_SIZE)) > + return (void *) hp; > + } > + else > + { > + if (!MEMCMPEQ (hp, ne, ne_len)) > + return (void *) hp; > + } > + } > + } > + if (h - rare <= end) > + { > + off_e = VEC_SIZE - (unsigned int) (end - (h - rare)) - 1; > + hv0 = LOADU ((const VEC *) h); > + hv1 = LOAD ((const VEC *) (h + 1)); > + hm1 = (MASK) CMPEQ8_MASK (hv1, nv1); > + hm0 = (MASK) CMPEQ8_MASK (hv0, nv0); > + /* Clear the irrelevant bits that are out of bounds. */ > + m = hm0 & hm1 & (ONES >> off_e); > + if (m) > + goto match; > + } > + return NULL; > +} > diff --git a/sysdeps/x86_64/multiarch/memmem-avx2.c b/sysdeps/x86_64/multiarch/memmem-avx2.c > new file mode 100644 > index 0000000000..91f5d5d331 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memmem-avx2.c > @@ -0,0 +1,3 @@ > +#define FUNC_NAME __memmem_avx2 > + > +#include "memmem-avx-base.h" > diff --git a/sysdeps/x86_64/multiarch/memmem-avx512.c b/sysdeps/x86_64/multiarch/memmem-avx512.c > new file mode 100644 > index 0000000000..76016c1cfe > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memmem-avx512.c > @@ -0,0 +1,12 @@ > +#define VEC __m512i > +#define MASK uint64_t > +#define LOAD(x) _mm512_load_si512 (x) > +#define LOADU(x) _mm512_loadu_si512 (x) > +#define CMPEQ8_MASK(x, y) _mm512_cmpeq_epi8_mask (x, y) > +#define SETONE8(x) _mm512_set1_epi8 (x) > +#define TZCNT(x) _tzcnt_u64 (x) > +#define BLSR(x) _blsr_u64 (x) > + > +#define FUNC_NAME __memmem_avx512 > + > +#include "memmem-avx-base.h" > diff --git a/sysdeps/x86_64/multiarch/memmem.c b/sysdeps/x86_64/multiarch/memmem.c > new file mode 100644 > index 0000000000..8fe7b77d33 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memmem.c > @@ -0,0 +1,67 @@ > +/* Multiple versions of memmem. > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2012-2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +/* Redefine memmem so that the compiler won't complain about the type > + mismatch with the IFUNC selector in strong_alias, below. */ > +#undef memmem > +#define memmem __redirect_memmem > +#include <string.h> > +#undef memmem > + > +#define MEMMEM __memmem_generic > +#ifdef SHARED > +# undef libc_hidden_builtin_def > +# define libc_hidden_builtin_def(name) \ > + __hidden_ver1 (__memmem_generic, __GI_memmem, __memmem_generic); > +#endif > + > +#include "string/memmem.c" > + > +extern __typeof (__redirect_memmem) __memmem_avx2 attribute_hidden; > +extern __typeof (__redirect_memmem) __memmem_generic attribute_hidden; > +extern __typeof (__redirect_memmem) __memmem_avx512 attribute_hidden; > + > +#define SYMBOL_NAME memmem > + > +#include "init-arch.h" > + > +/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle > + ifunc symbol properly. */ > +extern __typeof (__redirect_memmem) __libc_memmem; > + > +static inline void * > +IFUNC_SELECTOR (void) > +{ > + const struct cpu_features *cpu_features = __get_cpu_features (); > + > + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512) > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > + && CPU_FEATURE_USABLE_P (cpu_features, BMI1)) > + return __memmem_avx512; > + > + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) > + && CPU_FEATURE_USABLE_P (cpu_features, BMI1)) > + return __memmem_avx2; > + > + return __memmem_generic; > +} > + > +libc_ifunc_redirected (__redirect_memmem, __libc_memmem, IFUNC_SELECTOR ()); > +#undef memmem > +strong_alias (__libc_memmem, __memmem) > -- > 2.43.2 > It doesn't seem you have addressed many of the comments from your v5 patch. Can it helps if you 1: Reply to the comments indicating they are handled / why are choosing not to handle them. 2: Send further versions to the same email chain. (`--in-reply-to` with `git send-email`).
On Mon, 19 Feb 2024, Noah Goldstein wrote: > It doesn't seem you have addressed many of the comments from your v5 patch. > Can it helps if you > 1: Reply to the comments indicating they are handled / why are choosing not > to handle them. > 2: Send further versions to the same email chain. (`--in-reply-to` > with `git send-email`). Are you ok with the change in worst-case time complexity? The existing generic implementation is O(n+m), the proposed variants are O(n*m). Alexander
On 19/02/24 05:13, Alexander Monakov wrote: > > On Mon, 19 Feb 2024, Noah Goldstein wrote: > >> It doesn't seem you have addressed many of the comments from your v5 patch. >> Can it helps if you >> 1: Reply to the comments indicating they are handled / why are choosing not >> to handle them. >> 2: Send further versions to the same email chain. (`--in-reply-to` >> with `git send-email`). > > Are you ok with the change in worst-case time complexity? The existing generic > implementation is O(n+m), the proposed variants are O(n*m). I think we should consider this a regression, we already have a bug opened for wcsstr [1] for a similar issue. We already had another similar issue for PowerPC [2], and we did not have consensus back then because the generic implementation was also O(m*n) (it was before Wilco new implementation). [1] https://sourceware.org/bugzilla/show_bug.cgi?id=23865 [2] https://sourceware.org/pipermail/libc-alpha/2015-July/062808.html
On Mon, Feb 19, 2024 at 2:25 PM Adhemerval Zanella Netto <adhemerval.zanella@linaro.org> wrote: > > > > On 19/02/24 05:13, Alexander Monakov wrote: > > > > On Mon, 19 Feb 2024, Noah Goldstein wrote: > > > >> It doesn't seem you have addressed many of the comments from your v5 patch. > >> Can it helps if you > >> 1: Reply to the comments indicating they are handled / why are choosing not > >> to handle them. > >> 2: Send further versions to the same email chain. (`--in-reply-to` > >> with `git send-email`). > > > > Are you ok with the change in worst-case time complexity? The existing generic > > implementation is O(n+m), the proposed variants are O(n*m). > > I think we should consider this a regression, we already have a bug opened for > wcsstr [1] for a similar issue. We already had another similar issue for > PowerPC [2], and we did not have consensus back then because the generic > implementation was also O(m*n) (it was before Wilco new implementation). Think practically this impl would be faster for short needles. Maybe limit to `m < ~16`, otherwise fallback to generic? > > [1] https://sourceware.org/bugzilla/show_bug.cgi?id=23865 > [2] https://sourceware.org/pipermail/libc-alpha/2015-July/062808.html
On Tue, Feb 20, 2024 at 12:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > On Mon, Feb 19, 2024 at 2:25 PM Adhemerval Zanella Netto > <adhemerval.zanella@linaro.org> wrote: > > > > > > > > On 19/02/24 05:13, Alexander Monakov wrote: > > > > > > On Mon, 19 Feb 2024, Noah Goldstein wrote: > > > > > >> It doesn't seem you have addressed many of the comments from your v5 > patch. > > >> Can it helps if you > > >> 1: Reply to the comments indicating they are handled / why are > choosing not > > >> to handle them. > > >> 2: Send further versions to the same email chain. (`--in-reply-to` > > >> with `git send-email`). > > > > > > Are you ok with the change in worst-case time complexity? The existing > generic > > > implementation is O(n+m), the proposed variants are O(n*m). > > > > I think we should consider this a regression, we already have a bug > opened for > > wcsstr [1] for a similar issue. We already had another similar issue for > > PowerPC [2], and we did not have consensus back then because the generic > > implementation was also O(m*n) (it was before Wilco new implementation). > > Think practically this impl would be faster for short needles. Maybe > limit to `m < ~16`, otherwise fallback to generic? > This implementation is virtually O(n) for m <= VEC_SIZE, so I think it should be at least m <= VEC_SIZE, and since generic implementation uses O(n+m) for m > 256, it should be m <= 256, unless we want to directly use str-two-way.h, which I think would be a waste of code size. > > > > [1] https://sourceware.org/bugzilla/show_bug.cgi?id=23865 > > [2] https://sourceware.org/pipermail/libc-alpha/2015-July/062808.html >
On 20/02/24 00:00, James wrote: > On Tue, Feb 20, 2024 at 12:20 AM Noah Goldstein <goldstein.w.n@gmail.com <mailto:goldstein.w.n@gmail.com>> wrote: > > On Mon, Feb 19, 2024 at 2:25 PM Adhemerval Zanella Netto > <adhemerval.zanella@linaro.org <mailto:adhemerval.zanella@linaro.org>> wrote: > > > > > > > > On 19/02/24 05:13, Alexander Monakov wrote: > > > > > > On Mon, 19 Feb 2024, Noah Goldstein wrote: > > > > > >> It doesn't seem you have addressed many of the comments from your v5 patch. > > >> Can it helps if you > > >> 1: Reply to the comments indicating they are handled / why are choosing not > > >> to handle them. > > >> 2: Send further versions to the same email chain. (`--in-reply-to` > > >> with `git send-email`). > > > > > > Are you ok with the change in worst-case time complexity? The existing generic > > > implementation is O(n+m), the proposed variants are O(n*m). > > > > I think we should consider this a regression, we already have a bug opened for > > wcsstr [1] for a similar issue. We already had another similar issue for > > PowerPC [2], and we did not have consensus back then because the generic > > implementation was also O(m*n) (it was before Wilco new implementation). > > Think practically this impl would be faster for short needles. Maybe > limit to `m < ~16`, otherwise fallback to generic? > > This implementation is virtually O(n) for m <= VEC_SIZE, so I think it should be at least m <= VEC_SIZE, and since generic implementation uses O(n+m) for m > 256, it should be m <= 256, unless we want to directly use str-two-way.h, which I think would be a waste of code size. Afaik s390x do use a similar strategy, so it should be ok to optimize for m <= VEC_SIZE. Also, please check why your patch is making aarch64/arm buildbot fails to build [1]. You can bootstrap a toolchain using the build-many-glibcs.py script it required. [1] https://patchwork.sourceware.org/project/glibc/patch/20240218082621.131128-1-tirtajames45@gmail.com/
(Resend because I didn't reply all) On Tue, Feb 20, 2024 at 9:30 PM Adhemerval Zanella Netto < adhemerval.zanella@linaro.org> wrote: > > > On 20/02/24 00:00, James wrote: > > On Tue, Feb 20, 2024 at 12:20 AM Noah Goldstein <goldstein.w.n@gmail.com > <mailto:goldstein.w.n@gmail.com>> wrote: > > > > On Mon, Feb 19, 2024 at 2:25 PM Adhemerval Zanella Netto > > <adhemerval.zanella@linaro.org <mailto:adhemerval.zanella@linaro.org>> > wrote: > > > > > > > > > > > > On 19/02/24 05:13, Alexander Monakov wrote: > > > > > > > > On Mon, 19 Feb 2024, Noah Goldstein wrote: > > > > > > > >> It doesn't seem you have addressed many of the comments from > your v5 patch. > > > >> Can it helps if you > > > >> 1: Reply to the comments indicating they are handled / why are > choosing not > > > >> to handle them. > > > >> 2: Send further versions to the same email chain. > (`--in-reply-to` > > > >> with `git send-email`). > > > > > > > > Are you ok with the change in worst-case time complexity? The > existing generic > > > > implementation is O(n+m), the proposed variants are O(n*m). > > > > > > I think we should consider this a regression, we already have a > bug opened for > > > wcsstr [1] for a similar issue. We already had another similar > issue for > > > PowerPC [2], and we did not have consensus back then because the > generic > > > implementation was also O(m*n) (it was before Wilco new > implementation). > > > > Think practically this impl would be faster for short needles. Maybe > > limit to `m < ~16`, otherwise fallback to generic? > > > > This implementation is virtually O(n) for m <= VEC_SIZE, so I think it > should be at least m <= VEC_SIZE, and since generic implementation uses > O(n+m) for m > 256, it should be m <= 256, unless we want to directly use > str-two-way.h, which I think would be a waste of code size. > > Afaik s390x do use a similar strategy, so it should be ok to optimize for > m <= VEC_SIZE. > > Also, please check why your patch is making aarch64/arm buildbot fails to > build [1]. You can bootstrap a toolchain using the build-many-glibcs.py > script it required. > It seems that it has to do with the libc_hidden_builtin_def in string/memmem.c which I don't really understand. > > [1] > https://patchwork.sourceware.org/project/glibc/patch/20240218082621.131128-1-tirtajames45@gmail.com/ > >
On Tue, Feb 20, 2024 at 3:16 PM James <tirtajames45@gmail.com> wrote: > > (Resend because I didn't reply all) > > On Tue, Feb 20, 2024 at 9:30 PM Adhemerval Zanella Netto <adhemerval.zanella@linaro.org> wrote: >> >> >> >> On 20/02/24 00:00, James wrote: >> > On Tue, Feb 20, 2024 at 12:20 AM Noah Goldstein <goldstein.w.n@gmail.com <mailto:goldstein.w.n@gmail.com>> wrote: >> > >> > On Mon, Feb 19, 2024 at 2:25 PM Adhemerval Zanella Netto >> > <adhemerval.zanella@linaro.org <mailto:adhemerval.zanella@linaro.org>> wrote: >> > > >> > > >> > > >> > > On 19/02/24 05:13, Alexander Monakov wrote: >> > > > >> > > > On Mon, 19 Feb 2024, Noah Goldstein wrote: >> > > > >> > > >> It doesn't seem you have addressed many of the comments from your v5 patch. >> > > >> Can it helps if you >> > > >> 1: Reply to the comments indicating they are handled / why are choosing not >> > > >> to handle them. >> > > >> 2: Send further versions to the same email chain. (`--in-reply-to` >> > > >> with `git send-email`). >> > > > >> > > > Are you ok with the change in worst-case time complexity? The existing generic >> > > > implementation is O(n+m), the proposed variants are O(n*m). >> > > >> > > I think we should consider this a regression, we already have a bug opened for >> > > wcsstr [1] for a similar issue. We already had another similar issue for >> > > PowerPC [2], and we did not have consensus back then because the generic >> > > implementation was also O(m*n) (it was before Wilco new implementation). >> > >> > Think practically this impl would be faster for short needles. Maybe >> > limit to `m < ~16`, otherwise fallback to generic? >> > >> > This implementation is virtually O(n) for m <= VEC_SIZE, so I think it should be at least m <= VEC_SIZE, and since generic implementation uses O(n+m) for m > 256, it should be m <= 256, unless we want to directly use str-two-way.h, which I think would be a waste of code size. >> >> Afaik s390x do use a similar strategy, so it should be ok to optimize for >> m <= VEC_SIZE. >> >> Also, please check why your patch is making aarch64/arm buildbot fails to >> build [1]. You can bootstrap a toolchain using the build-many-glibcs.py >> script it required. > > It seems that it has to do with the libc_hidden_builtin_def in string/memmem.c which I don't really understand. Instead of adding a new hidden def at the end of `string/memmem.c`, just replace the existing using of `__memmem` with `MEMMEM` >> >> >> [1] https://patchwork.sourceware.org/project/glibc/patch/20240218082621.131128-1-tirtajames45@gmail.com/ >>
On Tue, Feb 20, 2024 at 11:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > On Tue, Feb 20, 2024 at 3:16 PM James <tirtajames45@gmail.com> wrote: > > > > (Resend because I didn't reply all) > > > > On Tue, Feb 20, 2024 at 9:30 PM Adhemerval Zanella Netto < > adhemerval.zanella@linaro.org> wrote: > >> > >> > >> > >> On 20/02/24 00:00, James wrote: > >> > On Tue, Feb 20, 2024 at 12:20 AM Noah Goldstein < > goldstein.w.n@gmail.com <mailto:goldstein.w.n@gmail.com>> wrote: > >> > > >> > On Mon, Feb 19, 2024 at 2:25 PM Adhemerval Zanella Netto > >> > <adhemerval.zanella@linaro.org <mailto: > adhemerval.zanella@linaro.org>> wrote: > >> > > > >> > > > >> > > > >> > > On 19/02/24 05:13, Alexander Monakov wrote: > >> > > > > >> > > > On Mon, 19 Feb 2024, Noah Goldstein wrote: > >> > > > > >> > > >> It doesn't seem you have addressed many of the comments from > your v5 patch. > >> > > >> Can it helps if you > >> > > >> 1: Reply to the comments indicating they are handled / why > are choosing not > >> > > >> to handle them. > >> > > >> 2: Send further versions to the same email chain. > (`--in-reply-to` > >> > > >> with `git send-email`). > >> > > > > >> > > > Are you ok with the change in worst-case time complexity? The > existing generic > >> > > > implementation is O(n+m), the proposed variants are O(n*m). > >> > > > >> > > I think we should consider this a regression, we already have a > bug opened for > >> > > wcsstr [1] for a similar issue. We already had another similar > issue for > >> > > PowerPC [2], and we did not have consensus back then because > the generic > >> > > implementation was also O(m*n) (it was before Wilco new > implementation). > >> > > >> > Think practically this impl would be faster for short needles. > Maybe > >> > limit to `m < ~16`, otherwise fallback to generic? > >> > > >> > This implementation is virtually O(n) for m <= VEC_SIZE, so I think > it should be at least m <= VEC_SIZE, and since generic implementation uses > O(n+m) for m > 256, it should be m <= 256, unless we want to directly use > str-two-way.h, which I think would be a waste of code size. > >> > >> Afaik s390x do use a similar strategy, so it should be ok to optimize > for > >> m <= VEC_SIZE. > >> > >> Also, please check why your patch is making aarch64/arm buildbot fails > to > >> build [1]. You can bootstrap a toolchain using the build-many-glibcs.py > >> script it required. > > > > It seems that it has to do with the libc_hidden_builtin_def in > string/memmem.c which I don't really understand. > > Instead of adding a new hidden def at the end of `string/memmem.c`, > just replace the existing > using of `__memmem` with `MEMMEM` > With #ifndef _LIBC # define __memmem memmem #endif #ifndef MEMMEM # define MEMMEM __memmem #endif void * MEMMEM (const void *haystack, size_t hs_len, const void *needle, size_t ne_len) libc_hidden_def (MEMMEM) weak_alias (MEMMEM, memmem) libc_hidden_weak (memmem) make test t=string/test-memmem on x86-64 shows ./../include/libc-symbols.h:472:33: error: ‘__EI___memmem_generic’ aliased to undefined symbol ‘__GI___memmem_generic’ 472 | extern thread __typeof (name) __EI_##name \ | ^~~~~ ./../include/libc-symbols.h:468:3: note: in expansion of macro ‘__hidden_ver2’ 468 | __hidden_ver2 (, local, internal, name) | ^~~~~~~~~~~~~ ./../include/libc-symbols.h:476:41: note: in expansion of macro ‘__hidden_ver1’ 476 | # define hidden_def(name) __hidden_ver1(__GI_##name, name, name); | ^~~~~~~~~~~~~ ./../include/libc-symbols.h:557:32: note: in expansion of macro ‘hidden_def’ 557 | # define libc_hidden_def(name) hidden_def (name) | ^~~~~~~~~~ ../string/memmem.c:131:1: note: in expansion of macro ‘libc_hidden_def’ 131 | libc_hidden_def (MEMMEM) | ^~~~~~~~~~~~~~~ ./../include/libc-symbols.h:472:33: error: ‘__EI_memmem’ aliased to undefined symbol ‘__GI_memmem’ 472 | extern thread __typeof (name) __EI_##name \ | ^~~~~ ./../include/libc-symbols.h:468:3: note: in expansion of macro ‘__hidden_ver2’ 468 | __hidden_ver2 (, local, internal, name) | ^~~~~~~~~~~~~ ./../include/libc-symbols.h:484:9: note: in expansion of macro ‘__hidden_ver1’ 484 | __hidden_ver1(__GI_##name, name, name) __attribute__((weak)); | ^~~~~~~~~~~~~ ./../include/libc-symbols.h:558:33: note: in expansion of macro ‘hidden_weak’ 558 | # define libc_hidden_weak(name) hidden_weak (name) | ^~~~~~~~~~~ ../string/memmem.c:133:1: note: in expansion of macro ‘libc_hidden_weak’ 133 | libc_hidden_weak (memmem) | ^~~~~~~~~~~~~~~~ make[2]: *** [/home/james/.local/src/glibc/build/sysd-rules:671: /home/james/.local/src/glibc/build/string/memmem.os] Error 1 make[2]: Leaving directory '/home/james/.local/src/glibc/string' make[1]: *** [Makefile:759: test] Error 2 make[1]: Leaving directory '/home/james/.local/src/glibc' make: *** [Makefile:9: test] Error 2 > > >> > >> > >> [1] > https://patchwork.sourceware.org/project/glibc/patch/20240218082621.131128-1-tirtajames45@gmail.com/ > >> >
On Tue, Feb 20, 2024 at 4:26 PM James <tirtajames45@gmail.com> wrote: > > On Tue, Feb 20, 2024 at 11:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: >> >> On Tue, Feb 20, 2024 at 3:16 PM James <tirtajames45@gmail.com> wrote: >> > >> > (Resend because I didn't reply all) >> > >> > On Tue, Feb 20, 2024 at 9:30 PM Adhemerval Zanella Netto <adhemerval.zanella@linaro.org> wrote: >> >> >> >> >> >> >> >> On 20/02/24 00:00, James wrote: >> >> > On Tue, Feb 20, 2024 at 12:20 AM Noah Goldstein <goldstein.w.n@gmail.com <mailto:goldstein.w.n@gmail.com>> wrote: >> >> > >> >> > On Mon, Feb 19, 2024 at 2:25 PM Adhemerval Zanella Netto >> >> > <adhemerval.zanella@linaro.org <mailto:adhemerval.zanella@linaro.org>> wrote: >> >> > > >> >> > > >> >> > > >> >> > > On 19/02/24 05:13, Alexander Monakov wrote: >> >> > > > >> >> > > > On Mon, 19 Feb 2024, Noah Goldstein wrote: >> >> > > > >> >> > > >> It doesn't seem you have addressed many of the comments from your v5 patch. >> >> > > >> Can it helps if you >> >> > > >> 1: Reply to the comments indicating they are handled / why are choosing not >> >> > > >> to handle them. >> >> > > >> 2: Send further versions to the same email chain. (`--in-reply-to` >> >> > > >> with `git send-email`). >> >> > > > >> >> > > > Are you ok with the change in worst-case time complexity? The existing generic >> >> > > > implementation is O(n+m), the proposed variants are O(n*m). >> >> > > >> >> > > I think we should consider this a regression, we already have a bug opened for >> >> > > wcsstr [1] for a similar issue. We already had another similar issue for >> >> > > PowerPC [2], and we did not have consensus back then because the generic >> >> > > implementation was also O(m*n) (it was before Wilco new implementation). >> >> > >> >> > Think practically this impl would be faster for short needles. Maybe >> >> > limit to `m < ~16`, otherwise fallback to generic? >> >> > >> >> > This implementation is virtually O(n) for m <= VEC_SIZE, so I think it should be at least m <= VEC_SIZE, and since generic implementation uses O(n+m) for m > 256, it should be m <= 256, unless we want to directly use str-two-way.h, which I think would be a waste of code size. >> >> >> >> Afaik s390x do use a similar strategy, so it should be ok to optimize for >> >> m <= VEC_SIZE. >> >> >> >> Also, please check why your patch is making aarch64/arm buildbot fails to >> >> build [1]. You can bootstrap a toolchain using the build-many-glibcs.py >> >> script it required. >> > >> > It seems that it has to do with the libc_hidden_builtin_def in string/memmem.c which I don't really understand. >> >> Instead of adding a new hidden def at the end of `string/memmem.c`, >> just replace the existing >> using of `__memmem` with `MEMMEM` So if the target is just using this as the generic impl (and defines the defs in sysdeps/* See how we do `wcscpy` in x86_64, you should be able to follow the same pattern. > > With > > #ifndef _LIBC > # define __memmem memmem > #endif > > #ifndef MEMMEM > # define MEMMEM __memmem > #endif > > void * > MEMMEM (const void *haystack, size_t hs_len, > const void *needle, size_t ne_len) > > libc_hidden_def (MEMMEM) > weak_alias (MEMMEM, memmem) > libc_hidden_weak (memmem) > > make test t=string/test-memmem on x86-64 shows > > ./../include/libc-symbols.h:472:33: error: ‘__EI___memmem_generic’ aliased to undefined symbol ‘__GI___memmem_generic’ > 472 | extern thread __typeof (name) __EI_##name \ > | ^~~~~ > ./../include/libc-symbols.h:468:3: note: in expansion of macro ‘__hidden_ver2’ > 468 | __hidden_ver2 (, local, internal, name) > | ^~~~~~~~~~~~~ > ./../include/libc-symbols.h:476:41: note: in expansion of macro ‘__hidden_ver1’ > 476 | # define hidden_def(name) __hidden_ver1(__GI_##name, name, name); > | ^~~~~~~~~~~~~ > ./../include/libc-symbols.h:557:32: note: in expansion of macro ‘hidden_def’ > 557 | # define libc_hidden_def(name) hidden_def (name) > | ^~~~~~~~~~ > ../string/memmem.c:131:1: note: in expansion of macro ‘libc_hidden_def’ > 131 | libc_hidden_def (MEMMEM) > | ^~~~~~~~~~~~~~~ > ./../include/libc-symbols.h:472:33: error: ‘__EI_memmem’ aliased to undefined symbol ‘__GI_memmem’ > 472 | extern thread __typeof (name) __EI_##name \ > | ^~~~~ > ./../include/libc-symbols.h:468:3: note: in expansion of macro ‘__hidden_ver2’ > 468 | __hidden_ver2 (, local, internal, name) > | ^~~~~~~~~~~~~ > ./../include/libc-symbols.h:484:9: note: in expansion of macro ‘__hidden_ver1’ > 484 | __hidden_ver1(__GI_##name, name, name) __attribute__((weak)); > | ^~~~~~~~~~~~~ > ./../include/libc-symbols.h:558:33: note: in expansion of macro ‘hidden_weak’ > 558 | # define libc_hidden_weak(name) hidden_weak (name) > | ^~~~~~~~~~~ > ../string/memmem.c:133:1: note: in expansion of macro ‘libc_hidden_weak’ > 133 | libc_hidden_weak (memmem) > | ^~~~~~~~~~~~~~~~ > make[2]: *** [/home/james/.local/src/glibc/build/sysd-rules:671: /home/james/.local/src/glibc/build/string/memmem.os] Error 1 > make[2]: Leaving directory '/home/james/.local/src/glibc/string' > make[1]: *** [Makefile:759: test] Error 2 > make[1]: Leaving directory '/home/james/.local/src/glibc' > make: *** [Makefile:9: test] Error 2 >> >> >> >> >> >> >> [1] https://patchwork.sourceware.org/project/glibc/patch/20240218082621.131128-1-tirtajames45@gmail.com/ >> >>
diff --git a/string/memmem.c b/string/memmem.c index a4117f8e1e..a315c7d0b5 100644 --- a/string/memmem.c +++ b/string/memmem.c @@ -25,6 +25,10 @@ # define __memmem memmem #endif +#ifndef MEMMEM +# define MEMMEM __memmem +#endif + #define RETURN_TYPE void * #define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l)) #define FASTSEARCH(S,C,N) (void*) memchr ((void *)(S), (C), (N)) @@ -50,7 +54,7 @@ The limit also implies worst-case performance is linear. Needles larger than 256 characters use the linear-time Two-Way algorithm. */ void * -__memmem (const void *haystack, size_t hs_len, +MEMMEM (const void *haystack, size_t hs_len, const void *needle, size_t ne_len) { const unsigned char *hs = (const unsigned char *) haystack; @@ -127,3 +131,4 @@ __memmem (const void *haystack, size_t hs_len, libc_hidden_def (__memmem) weak_alias (__memmem, memmem) libc_hidden_weak (memmem) +libc_hidden_builtin_def (MEMMEM) diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index d3d2270394..0b46d5f341 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -15,6 +15,9 @@ sysdep_routines += \ memcmpeq-avx2-rtm \ memcmpeq-evex \ memcmpeq-sse2 \ + memmem-avx-base \ + memmem-avx2 \ + memmem-avx512 \ memmove-avx-unaligned-erms \ memmove-avx-unaligned-erms-rtm \ memmove-avx512-no-vzeroupper \ @@ -122,6 +125,9 @@ sysdep_routines += \ varshift \ # sysdep_routines +CFLAGS-memmem-avx2.c += -mavx2 -mbmi -O3 +CFLAGS-memmem-avx512.c += -mavx512f -mavx512bw -mbmi -O3 + CFLAGS-strcspn-sse4.c += -msse4 CFLAGS-strpbrk-sse4.c += -msse4 CFLAGS-strspn-sse4.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index c4a21d4b7c..5fe1440235 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -798,6 +798,18 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __strstr_avx512) IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_generic)) + + /* Support sysdeps/x86_64/multiarch/memmem.c. */ + IFUNC_IMPL (i, name, memmem, + IFUNC_IMPL_ADD (array, i, memmem, + (CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI1)), + __memmem_avx512) + IFUNC_IMPL_ADD (array, i, memmem, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI1)), + __memmem_avx2) + IFUNC_IMPL_ADD (array, i, memmem, 1, __memmem_generic)) /* Support sysdeps/x86_64/multiarch/wcschr.c. */ IFUNC_IMPL (i, name, wcschr, diff --git a/sysdeps/x86_64/multiarch/memmem-avx-base.c b/sysdeps/x86_64/multiarch/memmem-avx-base.c new file mode 100644 index 0000000000..212d75c96f --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmem-avx-base.c @@ -0,0 +1,20 @@ +const unsigned char ___rarebyte_table[256] attribute_hidden + = { 0, 1, 13, 56, 59, 60, 61, 62, 63, 232, 248, 2, 158, 4, + 5, 6, 7, 8, 9, 10, 14, 20, 26, 29, 37, 46, 52, 53, + 54, 55, 57, 58, 255, 172, 242, 193, 162, 174, 178, 182, 218, 219, + 212, 180, 249, 197, 221, 210, 253, 231, 230, 224, 225, 226, 227, 223, + 222, 220, 176, 213, 184, 229, 188, 164, 159, 209, 181, 203, 189, 216, + 196, 192, 185, 205, 161, 168, 215, 187, 211, 194, 195, 165, 206, 204, + 214, 198, 173, 179, 175, 183, 167, 202, 239, 201, 160, 241, 163, 246, + 233, 238, 240, 254, 237, 208, 234, 250, 169, 186, 236, 217, 245, 243, + 228, 170, 247, 244, 251, 235, 199, 200, 252, 207, 177, 191, 171, 190, + 166, 3, 140, 134, 124, 126, 86, 128, 95, 117, 114, 93, 81, 87, + 132, 96, 112, 97, 103, 82, 139, 89, 98, 88, 119, 74, 156, 115, + 104, 75, 120, 106, 76, 155, 90, 122, 107, 125, 152, 145, 136, 137, + 101, 116, 102, 108, 99, 141, 77, 78, 118, 79, 109, 100, 150, 73, + 94, 72, 121, 151, 113, 135, 110, 105, 83, 91, 11, 12, 64, 149, + 146, 111, 65, 69, 66, 15, 16, 17, 18, 19, 130, 92, 144, 123, + 21, 22, 23, 24, 131, 133, 127, 142, 25, 70, 129, 27, 28, 67, + 153, 84, 143, 138, 147, 157, 148, 68, 71, 30, 31, 32, 33, 34, + 35, 36, 154, 38, 39, 40, 41, 42, 80, 43, 44, 45, 47, 48, + 85, 49, 50, 51 }; diff --git a/sysdeps/x86_64/multiarch/memmem-avx-base.h b/sysdeps/x86_64/multiarch/memmem-avx-base.h new file mode 100644 index 0000000000..1333eac5b5 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmem-avx-base.h @@ -0,0 +1,183 @@ +#include <immintrin.h> +#include <inttypes.h> +#include <string.h> +#include <libc-pointer-arith.h> + +#ifndef FUNC_NAME +# define __memmem_avx2 +#endif +#ifndef VEC +# define VEC __m256i +#endif +#ifndef MASK +# define MASK uint32_t +#endif +#ifndef LOAD +# define LOAD(x) _mm256_load_si256 (x) +#endif +#ifndef LOADU +# define LOADU(x) _mm256_loadu_si256 (x) +#endif +#ifndef CMPEQ8_MASK +# define CMPEQ8_MASK(x, y) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (x, y)) +#endif +#ifndef SETONE8 +# define SETONE8(x) _mm256_set1_epi8 (x) +#endif +#ifndef TZCNT +# define TZCNT(x) _tzcnt_u32 (x) +#endif +#ifndef BLSR +# define BLSR(x) _blsr_u32 (x) +#endif +#define VEC_SIZE sizeof (VEC) +#define ONES ((MASK) -1) + +#ifndef MEMCMPEQ +# define MEMCMPEQ __memcmpeq +#endif +#ifndef MEMCPY +# define MEMCPY memcpy +#endif +#ifndef MEMCHR +# define MEMCHR memchr +#endif +#ifndef PAGE_SIZE +# define PAGE_SIZE 4096 +#endif +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) + +/* Lower is rarer. The table is based on the + *.c and *.h files in glibc. */ +extern const unsigned char ___rarebyte_table[256] attribute_hidden; + +static inline void *__attribute__ ((always_inline)) +find_rarest_byte (const unsigned char *rare, size_t n) +{ + const unsigned char *p = (const unsigned char *) rare; + int c_rare = ___rarebyte_table[*rare]; + int c; + for (; n--; ++p) + { + c = ___rarebyte_table[*p]; + if (c < c_rare) + { + rare = p; + c_rare = c; + } + } + return (void *) rare; +} + +void * +FUNC_NAME (const void *hs, size_t hs_len, const void *ne, size_t ne_len) +{ + if (ne_len == 1) + return (void *) MEMCHR (hs, *(unsigned char *) ne, hs_len); + if (__glibc_unlikely (ne_len == 0)) + return (void *) hs; + if (__glibc_unlikely (hs_len < ne_len)) + return NULL; + VEC hv0, hv1, hv, nv; + MASK i, hm0, hm1, m, cmpm; + const unsigned int matchsh = ne_len < VEC_SIZE ? VEC_SIZE - ne_len : 0; + const MASK matchm = ONES << matchsh; + const unsigned char *h = (const unsigned char *) hs; + const unsigned char *const end = h + hs_len - ne_len; + const unsigned char *hp; + size_t rare = PTR_DIFF (find_rarest_byte ((const unsigned char *)ne, MIN (ne_len, VEC_SIZE)), ne); + /* RARE will always be the first byte to find. + If RARE is at the end of the needle, use the byte before it. */ + if (rare == MIN (ne_len, VEC_SIZE) - 1) + --rare; + const VEC nv0 = SETONE8 (*((char *) ne + rare)); + const VEC nv1 = SETONE8 (*((char *) ne + rare + 1)); + unsigned int off_e = (PTR_DIFF (end, h) < VEC_SIZE) + ? VEC_SIZE - (unsigned int) (end - h) - 1 + : 0; + /* Start from the position of RARE. */ + h += rare; + /* Load the needle vector. */ + if (((uintptr_t) ne & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE) + || ne_len >= VEC_SIZE) + nv = LOADU ((VEC *) ne); + else + MEMCPY (&nv, ne, MIN (VEC_SIZE, ne_len)); + const unsigned int off_s = PTR_DIFF (h, PTR_ALIGN_DOWN (h, VEC_SIZE)); + /* Align down to VEC_SIZE. */ + h -= off_s; + hv0 = LOAD ((const VEC *) h); + hm0 = (MASK) CMPEQ8_MASK (hv0, nv0); + hm1 = (MASK) CMPEQ8_MASK (hv0, nv1) >> 1; + /* Clear the irrelevant bits from aligning down (OFF_S) and ones that are out + * of bounds (OFF_E). */ + m = ((hm0 & hm1) >> off_s) & (ONES >> off_e); + while (m) + { + i = TZCNT (m); + m = BLSR (m); + hp = h + off_s + i - rare; + if (PTR_DIFF (PTR_ALIGN_UP (hp, PAGE_SIZE), hp) >= VEC_SIZE) + { + /* Do a vector compare if we are not crossing a page. */ + hv = LOADU ((VEC *) hp); + cmpm = (MASK) CMPEQ8_MASK (hv, nv) << matchsh; + /* Compare only the relevant bits of the needle vector. */ + if (cmpm == matchm) + /* Compare the rest of the needle. */ + if (ne_len <= VEC_SIZE + || !MEMCMPEQ (hp + VEC_SIZE, (const char *) ne + VEC_SIZE, + ne_len - VEC_SIZE)) + return (void *) hp; + } + else + { + if (!MEMCMPEQ (hp, ne, ne_len)) + return (void *) hp; + } + } + h += VEC_SIZE - 1; + for (; h - rare + VEC_SIZE <= end; h += VEC_SIZE) + { + hv0 = LOADU ((const VEC *) h); + hv1 = LOAD ((const VEC *) (h + 1)); + hm1 = (MASK) CMPEQ8_MASK (hv1, nv1); + hm0 = (MASK) CMPEQ8_MASK (hv0, nv0); + m = hm0 & hm1; + while (m) + { + match: + i = TZCNT (m); + m = BLSR (m); + hp = h + i - rare; + if (PTR_DIFF (PTR_ALIGN_UP (hp, PAGE_SIZE), hp) >= VEC_SIZE) + { + hv = LOADU ((VEC *) hp); + cmpm = (MASK) CMPEQ8_MASK (hv, nv) << matchsh; + if (cmpm == matchm) + if (ne_len <= VEC_SIZE + || !MEMCMPEQ (hp + VEC_SIZE, (const char *) ne + VEC_SIZE, + ne_len - VEC_SIZE)) + return (void *) hp; + } + else + { + if (!MEMCMPEQ (hp, ne, ne_len)) + return (void *) hp; + } + } + } + if (h - rare <= end) + { + off_e = VEC_SIZE - (unsigned int) (end - (h - rare)) - 1; + hv0 = LOADU ((const VEC *) h); + hv1 = LOAD ((const VEC *) (h + 1)); + hm1 = (MASK) CMPEQ8_MASK (hv1, nv1); + hm0 = (MASK) CMPEQ8_MASK (hv0, nv0); + /* Clear the irrelevant bits that are out of bounds. */ + m = hm0 & hm1 & (ONES >> off_e); + if (m) + goto match; + } + return NULL; +} diff --git a/sysdeps/x86_64/multiarch/memmem-avx2.c b/sysdeps/x86_64/multiarch/memmem-avx2.c new file mode 100644 index 0000000000..91f5d5d331 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmem-avx2.c @@ -0,0 +1,3 @@ +#define FUNC_NAME __memmem_avx2 + +#include "memmem-avx-base.h" diff --git a/sysdeps/x86_64/multiarch/memmem-avx512.c b/sysdeps/x86_64/multiarch/memmem-avx512.c new file mode 100644 index 0000000000..76016c1cfe --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmem-avx512.c @@ -0,0 +1,12 @@ +#define VEC __m512i +#define MASK uint64_t +#define LOAD(x) _mm512_load_si512 (x) +#define LOADU(x) _mm512_loadu_si512 (x) +#define CMPEQ8_MASK(x, y) _mm512_cmpeq_epi8_mask (x, y) +#define SETONE8(x) _mm512_set1_epi8 (x) +#define TZCNT(x) _tzcnt_u64 (x) +#define BLSR(x) _blsr_u64 (x) + +#define FUNC_NAME __memmem_avx512 + +#include "memmem-avx-base.h" diff --git a/sysdeps/x86_64/multiarch/memmem.c b/sysdeps/x86_64/multiarch/memmem.c new file mode 100644 index 0000000000..8fe7b77d33 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmem.c @@ -0,0 +1,67 @@ +/* Multiple versions of memmem. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2012-2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* Redefine memmem so that the compiler won't complain about the type + mismatch with the IFUNC selector in strong_alias, below. */ +#undef memmem +#define memmem __redirect_memmem +#include <string.h> +#undef memmem + +#define MEMMEM __memmem_generic +#ifdef SHARED +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__memmem_generic, __GI_memmem, __memmem_generic); +#endif + +#include "string/memmem.c" + +extern __typeof (__redirect_memmem) __memmem_avx2 attribute_hidden; +extern __typeof (__redirect_memmem) __memmem_generic attribute_hidden; +extern __typeof (__redirect_memmem) __memmem_avx512 attribute_hidden; + +#define SYMBOL_NAME memmem + +#include "init-arch.h" + +/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle + ifunc symbol properly. */ +extern __typeof (__redirect_memmem) __libc_memmem; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features *cpu_features = __get_cpu_features (); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) + && CPU_FEATURE_USABLE_P (cpu_features, BMI1)) + return __memmem_avx512; + + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURE_USABLE_P (cpu_features, BMI1)) + return __memmem_avx2; + + return __memmem_generic; +} + +libc_ifunc_redirected (__redirect_memmem, __libc_memmem, IFUNC_SELECTOR ()); +#undef memmem +strong_alias (__libc_memmem, __memmem)