
[v5] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr

Message ID 20221018000034.892021-1-skpgkp2@gmail.com
State New
Series [v5] x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr

Commit Message

Sunil Pandey Oct. 18, 2022, midnight UTC
Changes from v4:
- Replace jmp max in first vector with cmov (see the C sketch below).
- Replace jmp max in page cross with cmov.

Changes from v3:
- Replace VPCMPEQ in loop with VPTESTNM for 4th vector.
- Change first vector max check logic for terminating condition.
- Change page cross logic for terminating condition.
- Remove unnecessary check in align_more block.
- Remove unnecessary VEC(0) initialization.
- Define USE_WIDE_CHAR in wmemchr.

Changes from v2:
- Use VEC API
- Replaced long jump L(zero) in L(endloop) with short jump L(zero_2)

Changes from v1:
- Change vcmp to vcmpeq and vcmpneq.
- Restructure unconditional loop jump logic.
- Improve 4 vector loop logic.
- Fix bug near page boundary.
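
The cmov changes listed under "Changes from v4" above replace a branch
on the match position with a branchless select: the index produced by
bsf is compared against the length, and the result pointer is committed
only when the index is in bounds.  A minimal C sketch of the pattern
(the names here are illustrative, not from the patch):

#include <stddef.h>

/* Branchless bound check, done with cmova in the patch.  idx is the
   bsf result; when the match mask is zero, idx is pre-seeded with the
   length so the select below yields NULL.  */
char *
select_result (char *ptr, unsigned long idx, unsigned long len)
{
  /* cmova: the result register stays 0 (NULL) unless len > idx.  */
  return len > idx ? ptr + idx : NULL;
}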

This patch implements the following evex512 versions of string functions.
The evex512 version takes up to 30% fewer cycles than the evex version,
depending on length and alignment.

- memchr function using 512 bit vectors.
- rawmemchr function using 512 bit vectors.
- wmemchr function using 512 bit vectors.

Code size data:

memchr-evex.o		762 bytes
memchr-evex512.o	576 bytes (-24%)

rawmemchr-evex.o	461 bytes
rawmemchr-evex512.o	432 bytes (-6%)

wmemchr-evex.o		794 bytes
wmemchr-evex512.o	576 bytes (-27%)
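
A note on the VPTESTNM change in the v3 items above: the main loop
tests four vectors per iteration with a single branch.  Matches in the
middle two vectors are found by XORing with the broadcast seek char (a
matching lane becomes zero) and taking an unsigned min that is
zero-masked by the first vector's not-equal mask.  A per-lane C model
of that reduction (illustrative only, not part of the patch):

#include <stdbool.h>

/* Per-lane model of the loop body (VPCMPNE / vpxorq / VPCMPEQ /
   VPMINU{%k1}{z} / VPTESTNM / KORTEST).  v4..v7 are one lane of four
   consecutive vectors; c is the broadcast seek char.  */
static bool
lane_has_match (unsigned char v4, unsigned char v5, unsigned char v6,
                unsigned char v7, unsigned char c)
{
  bool k1 = (v4 != c);          /* VPCMPNE: 0 exactly on a v4 match.  */
  unsigned char t2 = v5 ^ c;    /* vpxorq: 0 exactly on a v5 match.  */
  unsigned char t3 = v6 ^ c;    /* vpxorq: 0 exactly on a v6 match.  */
  bool k3 = (v7 == c);          /* VPCMPEQ: v7 tracked separately in k3.  */
  /* VPMINU with {%k1}{z}: a v4 match zeroes the lane outright;
     otherwise the min is zero iff v5 or v6 matched.  */
  unsigned char m = k1 ? (t2 < t3 ? t2 : t3) : 0;
  bool k2 = (m == 0);           /* VPTESTNM  */
  return k2 || k3;              /* KORTEST: one branch covers all four.  */
}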

Placeholder functions, not used by any processor at the moment.

Fix page cross logic

Fix 2
---
 sysdeps/x86_64/multiarch/Makefile            |   3 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  15 +
 sysdeps/x86_64/multiarch/memchr-evex-base.S  | 301 +++++++++++++++++++
 sysdeps/x86_64/multiarch/memchr-evex512.S    |   8 +
 sysdeps/x86_64/multiarch/rawmemchr-evex512.S |   7 +
 sysdeps/x86_64/multiarch/wmemchr-evex512.S   |   9 +
 6 files changed, 343 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S
 create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S

Comments

Noah Goldstein Oct. 18, 2022, 3:01 a.m. UTC | #1
On Mon, Oct 17, 2022 at 5:00 PM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Changes from v4:
> - Replace jmp max in first vector with cmov.
> - Replace jmp max in page cross with cmov.
>
> Changes from v3:
> - Replace VPCMPEQ in loop with VPTESTNM for 4th vector.
> - Change first vector max check logic for terminating condition.
> - Change page cross logic for terminating condition.
> - Remove unnecessary check in align_more block.
> - Remove unnecessary VEC(0) initialization.
> - Define USE_WIDE_CHAR in wmemchr.
>
> Changes from v2:
> - Use VEC API
> - Replaced long jump L(zero) in L(endloop) with short jump L(zero_2)
>
> Changes from v1:
> - Change vcmp to vcmpeq and vcmpneq.
> - Restructure unconditional loop jump logic.
> - Improve 4 vector loop logic.
> - Fix bug near page boundary.
>
> This patch implements the following evex512 versions of string functions.
> The evex512 version takes up to 30% fewer cycles than the evex version,
> depending on length and alignment.
>
> - memchr function using 512 bit vectors.
> - rawmemchr function using 512 bit vectors.
> - wmemchr function using 512 bit vectors.
>
> Code size data:
>
> memchr-evex.o           762 bytes
> memchr-evex512.o        576 bytes (-24%)
>
> rawmemchr-evex.o        461 bytes
> rawmemchr-evex512.o     432 bytes (-6%)
>
> wmemchr-evex.o          794 bytes
> wmemchr-evex512.o       576 bytes (-27%)
>
> Placeholder functions, not used by any processor at the moment.
>
> Fix page cross logic
>
> Fix 2
> ---
>  sysdeps/x86_64/multiarch/Makefile            |   3 +
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  15 +
>  sysdeps/x86_64/multiarch/memchr-evex-base.S  | 301 +++++++++++++++++++
>  sysdeps/x86_64/multiarch/memchr-evex512.S    |   8 +
>  sysdeps/x86_64/multiarch/rawmemchr-evex512.S |   7 +
>  sysdeps/x86_64/multiarch/wmemchr-evex512.S   |   9 +
>  6 files changed, 343 insertions(+)
>  create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S
>  create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S
>  create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S
>  create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index df4601c294..e974b1ad97 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -4,6 +4,7 @@ sysdep_routines += \
>    memchr-avx2 \
>    memchr-avx2-rtm \
>    memchr-evex \
> +  memchr-evex512 \
>    memchr-evex-rtm \
>    memchr-sse2 \
>    memcmp-avx2-movbe \
> @@ -36,6 +37,7 @@ sysdep_routines += \
>    rawmemchr-avx2 \
>    rawmemchr-avx2-rtm \
>    rawmemchr-evex \
> +  rawmemchr-evex512 \
>    rawmemchr-evex-rtm \
>    rawmemchr-sse2 \
>    stpcpy-avx2 \
> @@ -156,6 +158,7 @@ sysdep_routines += \
>    wmemchr-avx2 \
>    wmemchr-avx2-rtm \
>    wmemchr-evex \
> +  wmemchr-evex512 \
>    wmemchr-evex-rtm \
>    wmemchr-sse2 \
>    wmemcmp-avx2-movbe \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 00a91123d3..529c0b0ef0 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                       && CPU_FEATURE_USABLE (AVX512BW)
>                                       && CPU_FEATURE_USABLE (BMI2)),
>                                      __memchr_evex)
> +             X86_IFUNC_IMPL_ADD_V4 (array, i, memchr,
> +                                    (CPU_FEATURE_USABLE (AVX512VL)
> +                                     && CPU_FEATURE_USABLE (AVX512BW)
> +                                     && CPU_FEATURE_USABLE (BMI2)),
> +                                    __memchr_evex512)
>               X86_IFUNC_IMPL_ADD_V4 (array, i, memchr,
>                                      (CPU_FEATURE_USABLE (AVX512VL)
>                                       && CPU_FEATURE_USABLE (AVX512BW)
> @@ -337,6 +342,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                       && CPU_FEATURE_USABLE (AVX512BW)
>                                       && CPU_FEATURE_USABLE (BMI2)),
>                                      __rawmemchr_evex)
> +             X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr,
> +                                    (CPU_FEATURE_USABLE (AVX512VL)
> +                                     && CPU_FEATURE_USABLE (AVX512BW)
> +                                     && CPU_FEATURE_USABLE (BMI2)),
> +                                    __rawmemchr_evex512)
>               X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr,
>                                      (CPU_FEATURE_USABLE (AVX512VL)
>                                       && CPU_FEATURE_USABLE (AVX512BW)
> @@ -942,6 +952,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                       && CPU_FEATURE_USABLE (AVX512BW)
>                                       && CPU_FEATURE_USABLE (BMI2)),
>                                      __wmemchr_evex)
> +             X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr,
> +                                    (CPU_FEATURE_USABLE (AVX512VL)
> +                                     && CPU_FEATURE_USABLE (AVX512BW)
> +                                     && CPU_FEATURE_USABLE (BMI2)),
> +                                    __wmemchr_evex512)
>               X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr,
>                                      (CPU_FEATURE_USABLE (AVX512VL)
>                                       && CPU_FEATURE_USABLE (AVX512BW)
> diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S
> new file mode 100644
> index 0000000000..ea92983db8
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S
> @@ -0,0 +1,301 @@
> +/* Placeholder function, not used by any processor at the moment.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* UNUSED. Exists purely as reference implementation.  */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +# include <sysdep.h>
> +
> +# ifdef USE_AS_WMEMCHR
> +#  define CHAR_SIZE    4
> +#  define VPBROADCAST   vpbroadcastd
> +#  define VPCMPEQ      vpcmpeqd
> +#  define VPCMPNE      vpcmpneqd
> +#  define VPMINU       vpminud
> +#  define VPTESTNM     vptestnmd
> +# else
> +#  define CHAR_SIZE    1
> +#  define VPBROADCAST   vpbroadcastb
> +#  define VPCMPEQ      vpcmpeqb
> +#  define VPCMPNE      vpcmpneqb
> +#  define VPMINU       vpminub
> +#  define VPTESTNM     vptestnmb
> +# endif
> +
> +# define PAGE_SIZE     4096
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> +
> +       .section SECTION(.text), "ax", @progbits
> +/* Aligning the entry point to 64 bytes provides better performance for
> +   strings that fit in one vector.  */
> +ENTRY_P2ALIGN (MEMCHR, 6)
> +# ifndef USE_AS_RAWMEMCHR
> +       /* Check for zero length.  */
> +       test    %RDX_LP, %RDX_LP
> +       jz      L(zero)
> +
> +#  ifdef __ILP32__
> +       /* Clear the upper 32 bits.  */
> +       movl    %edx, %edx
> +#  endif
> +# endif
> +
> +       /* Broadcast CHAR to VMM(1).  */
> +       VPBROADCAST %esi, %VMM(1)
> +       movl    %edi, %eax
> +       andl    $(PAGE_SIZE - 1), %eax
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       ja      L(page_cross)
> +
> +       /* Compare each [w]char with the seek char; mask bit is set on match.  */
> +       VPCMPEQ (%rdi), %VMM(1), %k0
> +
> +       KMOV    %k0, %VRCX
> +# ifndef USE_AS_RAWMEMCHR
> +       mov     %rdx, %rsi
> +       bsf     %VRCX, %VRSI
> +       cmp     $CHAR_PER_VEC, %rsi
> +       ja      L(align_more)
> +#  ifdef USE_AS_WMEMCHR
> +       leaq    (%rdi, %rsi, CHAR_SIZE), %rdi
> +#  else
> +       addq    %rsi, %rdi
> +#  endif
> +       xor     %eax, %eax
> +       cmp     %rsi, %rdx
> +       cmova   %rdi, %rax
> +# else
> +       bsf     %VRCX, %VRAX
> +       jz      L(align_more)
> +       add     %rdi, %rax
> +# endif
> +       ret
> +
> +       .p2align 5,,5
> +L(page_cross):
> +       movl    %eax, %ecx
> +       andl    $(VEC_SIZE - 1), %ecx
> +# ifdef USE_AS_WMEMCHR
> +       shrl    $2, %ecx
> +# endif
> +       xorq    %rdi, %rax
> +       VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1), %k0
> +       KMOV    %k0, %VRSI
> +       shr     %cl, %VRSI
> +# ifndef USE_AS_RAWMEMCHR
> +       jnz     L(page_cross_end)
> +       movl    $CHAR_PER_VEC, %eax
> +       sub     %ecx, %eax
> +       cmp     %rax, %rdx
> +       ja      L(align_more)
> +# else
> +       jz      L(align_more)
> +# endif
> +
> +L(page_cross_end):
> +# ifndef USE_AS_RAWMEMCHR
> +       bsf     %VRSI, %VRCX
> +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> +       xor     %eax, %eax
> +       cmp     %rcx, %rdx
> +       cmova   %rdi, %rax

You have a bug here. Test case:

align % 4096 = 4036
len = 8
pos = N/A (no char in bounds).

I think the `bsf %rsi, %rcx` shouldn't have rcx as a destination,
or you need to move `rdx` to `rcx` first.
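
As an illustration of the failure mode, here is a C model of the
L(page_cross_end) sequence (names are hypothetical): when the shifted
mask is zero, bsf leaves its destination unmodified in practice (the
result is architecturally undefined), so %rcx keeps the stale shift
count and the cmova bound check can pass when it should not.

#include <stddef.h>

/* Illustrative model of L(page_cross_end) for memchr (CHAR_SIZE == 1).
   mask == 0 models a kmask with no match bit set after the shift.  */
char *
page_cross_end (unsigned long mask, unsigned long len, char *base,
                unsigned long stale_rcx)
{
  unsigned long idx = stale_rcx;  /* bsf with a zero source leaves %rcx as-is.  */
  if (mask != 0)
    idx = __builtin_ctzl (mask);  /* bsf %VRSI, %VRCX  */
  char *ptr = base + idx;         /* leaq (%rdi, %rcx, CHAR_SIZE), %rdi  */
  /* cmova: with mask == 0 and len > stale_rcx this wrongly returns ptr.
     Seeding idx with len first (mov %rdx, %rcx), as the first-vector
     path does with %rsi, makes the bound check fail safely.  */
  return len > idx ? ptr : NULL;
}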

> +# else
> +       bsf     %VRSI, %VRAX
> +       add     %rdi, %rax
> +# endif
> +       ret
> +
> +# ifndef USE_AS_RAWMEMCHR
> +L(zero):
> +       xorl    %eax, %eax
> +       ret
> +# endif
> +
> +L(ret_vec_x2):
> +       subq    $-VEC_SIZE, %rdi
> +L(ret_vec_x1):
> +       bsf     %VRAX, %VRAX
> +# ifndef USE_AS_RAWMEMCHR
> +       cmp     %rax, %rdx
> +       jbe     L(zero)
> +# endif
> +# ifdef USE_AS_WMEMCHR
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +       add     %rdi, %rax
> +# endif
> +       ret
> +
> +       .p2align 5,,5
> +L(align_more):
> +# ifndef USE_AS_RAWMEMCHR
> +       mov     %rdi, %rax
> +# endif
> +       subq    $-VEC_SIZE, %rdi
> +       /* Align rdi to VEC_SIZE.  */
> +       andq    $-VEC_SIZE, %rdi
> +
> +# ifndef USE_AS_RAWMEMCHR
> +       subq    %rdi, %rax
> +#  ifdef USE_AS_WMEMCHR
> +       sar     $2, %rax
> +#  endif
> +       addq    %rax, %rdx
> +# endif
> +
> +       /* Check the next four vectors one at a time before entering the
> +          4 vector loop.  */
> +       VPCMPEQ (%rdi), %VMM(1), %k0
> +
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(ret_vec_x1)
> +
> +# ifndef USE_AS_RAWMEMCHR
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(zero)
> +# endif
> +
> +       VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k0
> +
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(ret_vec_x2)
> +
> +# ifndef USE_AS_RAWMEMCHR
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(zero)
> +# endif
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k0
> +
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(ret_vec_x3)
> +
> +# ifndef USE_AS_RAWMEMCHR
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(zero)
> +# endif
> +
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMM(1), %k0
> +
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(ret_vec_x4)
> +
> +# ifndef USE_AS_RAWMEMCHR
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(zero)
> +       /* Save pointer to find alignment adjustment.  */
> +       movq    %rdi, %rax
> +# endif
> +       /* Align address to VEC_SIZE * 4 for loop.  */
> +       andq    $-(VEC_SIZE * 4), %rdi
> +
> +       /* Add alignment difference to rdx.  */
> +# ifndef USE_AS_RAWMEMCHR
> +       subq    %rdi, %rax
> +#  ifdef USE_AS_WMEMCHR
> +       shr     $2, %VRAX
> +#  endif
> +       addq    %rax, %rdx
> +# endif
> +
> +       /* 4 vector loop.  */
> +       .p2align 5,,11
> +L(loop):
> +
> +       VPCMPNE (VEC_SIZE * 4)(%rdi), %VMM(1), %k1
> +       vpxorq  (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
> +       vpxorq  (VEC_SIZE * 6)(%rdi), %VMM(1), %VMM(3)
> +       VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMM(1), %k3
> +       VPMINU  %VMM(2), %VMM(3), %VMM(3){%k1}{z}
> +       VPTESTNM %VMM(3), %VMM(3), %k2
> +
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       KORTEST %k2, %k3
> +# ifdef USE_AS_RAWMEMCHR
> +       jz      L(loop)
> +# else
> +       jnz     L(loopend)
> +       subq    $(CHAR_PER_VEC * 4), %rdx
> +       ja      L(loop)
> +L(zero_2):
> +       xor     %eax, %eax
> +       ret
> +# endif
> +
> +L(loopend):
> +       VPCMPEQ (%rdi), %VMM(1), %k1
> +       KMOV    %k1, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(ret_vec_x1)
> +
> +# ifndef USE_AS_RAWMEMCHR
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(zero_2)
> +# endif
> +
> +       VPCMPEQ VEC_SIZE(%rdi), %VMM(1), %k1
> +       KMOV    %k1, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(ret_vec_x2)
> +
> +# ifndef USE_AS_RAWMEMCHR
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(zero_2)
> +# endif
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMM(1), %k1
> +       KMOV    %k1, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(ret_vec_x3)
> +
> +# ifndef USE_AS_RAWMEMCHR
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(zero_2)
> +# endif
> +
> +       /* At this point the match must be in the fourth vector, so there
> +          is no need to check.  */
> +       KMOV    %k3, %VRAX
> +
> +L(ret_vec_x4):
> +       bsf     %VRAX, %VRAX
> +# ifndef USE_AS_RAWMEMCHR
> +       cmp     %rax, %rdx
> +       jbe     L(zero)
> +# endif
> +       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +       ret
> +
> +       .p2align 5,,5
> +L(ret_vec_x3):
> +       bsf     %VRAX, %VRAX
> +# ifndef USE_AS_RAWMEMCHR
> +       cmp     %rax, %rdx
> +       jbe     L(zero)
> +# endif
> +       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +       ret
> +
> +END (MEMCHR)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S
> new file mode 100644
> index 0000000000..002f8c8489
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S
> @@ -0,0 +1,8 @@
> +# ifndef MEMCHR
> +#  define MEMCHR       __memchr_evex512
> +# endif
> +
> +#include "x86-evex512-vecs.h"
> +#include "reg-macros.h"
> +
> +#include "memchr-evex-base.S"
> diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S
> new file mode 100644
> index 0000000000..302d3cb055
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S
> @@ -0,0 +1,7 @@
> +#ifndef RAWMEMCHR
> +# define RAWMEMCHR     __rawmemchr_evex512
> +#endif
> +#define USE_AS_RAWMEMCHR       1
> +#define MEMCHR RAWMEMCHR
> +
> +#include "memchr-evex512.S"
> diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S
> new file mode 100644
> index 0000000000..78ec4ee5ad
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S
> @@ -0,0 +1,9 @@
> +#ifndef WMEMCHR
> +# define WMEMCHR       __wmemchr_evex512
> +#endif
> +
> +#define MEMCHR WMEMCHR
> +#define USE_AS_WMEMCHR 1
> +
> +#define USE_WIDE_CHAR  1
> +#include "memchr-evex512.S"
> --
> 2.36.1
>
Sunil Pandey Oct. 18, 2022, 4:15 a.m. UTC | #2
On Mon, Oct 17, 2022 at 8:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Mon, Oct 17, 2022 at 5:00 PM Sunil K Pandey via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > [...]
> > +L(page_cross_end):
> > +# ifndef USE_AS_RAWMEMCHR
> > +       bsf     %VRSI, %VRCX
> > +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > +       xor     %eax, %eax
> > +       cmp     %rcx, %rdx
> > +       cmova   %rdi, %rax
>
> You have a bug here. Test case:
>
> align % 4096 = 4036
> len = 8
> pos = N/A (no char in bounds).
>

Can you please help reproduce this issue?
I tried adding this test case, but it didn't fail:

do_test (4036, 20, 8, 8, 0x9B);


> I think the `bsf %rsi, %rcx` shouldn't have rcx as a destination,
> or you need to move `rdx` to `rcx` first.
>
> > [...]
Noah Goldstein Oct. 18, 2022, 4:18 a.m. UTC | #3
On Mon, Oct 17, 2022 at 11:15 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Mon, Oct 17, 2022 at 8:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Mon, Oct 17, 2022 at 5:00 PM Sunil K Pandey via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> > >
> > > [...]
> > > +L(page_cross_end):
> > > +# ifndef USE_AS_RAWMEMCHR
> > > +       bsf     %VRSI, %VRCX
> > > +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > > +       xor     %eax, %eax
> > > +       cmp     %rcx, %rdx
> > > +       cmova   %rdi, %rax
> >
> > You have a bug here. Test case:
> >
> > align % 4096 = 4036
> > len = 8
> > pos = N/A (no char in bounds).
> >
>
> Can you please help reproduce this issue?
> I tried adding this test case, but it didn't fail:
>
> do_test (4036, 20, 8, 8, 0x9B);
The position needs to be outside the first VEC (even out of bounds).
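
Following that hint, a minimal standalone sketch of the failing case
(assuming the evex512 memchr is the implementation being exercised;
in-tree this would normally go through string/test-memchr.c):

#include <assert.h>
#include <string.h>
#include <sys/mman.h>

int
main (void)
{
  /* Zero-filled pages: no 0x9B match bit anywhere in the first vector.  */
  char *page = mmap (NULL, 4096 * 2, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (page == MAP_FAILED)
    return 1;
  char *p = page + 4036;  /* align % 4096 == 4036, so the first load
                             takes the page-cross path.  */
  assert (memchr (p, 0x9B, 8) == NULL);  /* len = 8, no char in bounds.  */
  return 0;
}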
>
>
> > I think the `bsf %rsi, %rcx` shouldn't have rcx as a destination,
> > or you need to move `rdx` to `rcx` first.
> >
> > > [...]
Noah Goldstein Oct. 18, 2022, 4:19 a.m. UTC | #4
On Mon, Oct 17, 2022 at 11:18 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> [...]
> > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +   Lesser General Public License for more details.
> > > > +
> > > > +   You should have received a copy of the GNU Lesser General Public
> > > > +   License along with the GNU C Library; if not, see
> > > > +   <https://www.gnu.org/licenses/>.  */
> > > > +
> > > > +/* UNUSED. Exists purely as reference implementation.  */
> > > > +
> > > > +#include <isa-level.h>
> > > > +
> > > > +#if ISA_SHOULD_BUILD (4)
> > > > +
> > > > +# include <sysdep.h>
> > > > +
> > > > +# ifdef USE_AS_WMEMCHR
> > > > +#  define CHAR_SIZE    4
> > > > +#  define VPBROADCAST   vpbroadcastd
> > > > +#  define VPCMPEQ      vpcmpeqd
> > > > +#  define VPCMPNE      vpcmpneqd
> > > > +#  define VPMINU       vpminud
> > > > +#  define VPTESTNM     vptestnmd
> > > > +# else
> > > > +#  define CHAR_SIZE    1
> > > > +#  define VPBROADCAST   vpbroadcastb
> > > > +#  define VPCMPEQ      vpcmpeqb
> > > > +#  define VPCMPNE      vpcmpneqb
> > > > +#  define VPMINU       vpminub
> > > > +#  define VPTESTNM     vptestnmb
> > > > +# endif
> > > > +
> > > > +# define PAGE_SIZE     4096
> > > > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> > > > +
> > > > +       .section SECTION(.text), "ax", @progbits
> > > > +/* Aligning entry point to 64 byte, provides better performance for
> > > > +   one vector length string.  */
> > > > +ENTRY_P2ALIGN (MEMCHR, 6)
> > > > +# ifndef USE_AS_RAWMEMCHR
> > > > +       /* Check for zero length.  */
> > > > +       test    %RDX_LP, %RDX_LP
> > > > +       jz      L(zero)
> > > > +
> > > > +#  ifdef __ILP32__
> > > > +       /* Clear the upper 32 bits.  */
> > > > +       movl    %edx, %edx
> > > > +#  endif
> > > > +# endif
> > > > +
> > > > +       /* Broadcast CHAR to VMM(1).  */
> > > > +       VPBROADCAST %esi, %VMM(1)
> > > > +       movl    %edi, %eax
> > > > +       andl    $(PAGE_SIZE - 1), %eax
> > > > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > > > +       ja      L(page_cross)
> > > > +
> > > > +       /* Compare each [w]char with CHAR; the mask bit is set on a match.  */
> > > > +       VPCMPEQ (%rdi), %VMM(1), %k0
> > > > +
> > > > +       KMOV    %k0, %VRCX
> > > > +# ifndef USE_AS_RAWMEMCHR
> > > > +       mov     %rdx, %rsi
> > > > +       bsf     %VRCX, %VRSI

This needs to be `bsfq`, as `bsfl` (for VEC_SIZE == 32) has an
undefined result in the upper bits of dst.
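
For reference, a minimal sketch of the preload-plus-`bsfq` idiom being
asked for here (my illustration, not the patch's code; it assumes the
mask is in %rcx and the length in %rdx, and it relies on `bsf` leaving
its destination untouched on a zero source, the behavior the preload
trick depends on, although the SDM documents the result as undefined):

	mov	%rdx, %rsi	/* Preload dst with the length.  */
	bsfq	%rcx, %rsi	/* On a zero mask %rsi keeps the
				   (out-of-range) length; a 32-bit bsfl
				   write may clobber the preloaded upper
				   half of %rsi.  */
	cmp	$CHAR_PER_VEC, %rsi
	ja	L(align_more)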
> > > > +       cmp     $CHAR_PER_VEC, %rsi
> > > > +       ja      L(align_more)
> > > > +#  ifdef USE_AS_WMEMCHR
> > > > +       leaq    (%rdi, %rsi, CHAR_SIZE), %rdi
> > > > +#  else
> > > > +       addq    %rsi, %rdi
> > > > +#  endif
> > > > +       xor     %eax, %eax
> > > > +       cmp     %rsi, %rdx
> > > > +       cmova   %rdi, %rax
> > > > +# else
> > > > +       bsf     %VRCX, %VRAX
> > > > +       jz      L(align_more)
> > > > +       add     %rdi, %rax
> > > > +# endif
> > > > +       ret
> > > > +
> > > > +       .p2align 5,,5
> > > > +L(page_cross):
> > > > +       movl    %eax, %ecx
> > > > +       andl    $(VEC_SIZE - 1), %ecx
> > > > +# ifdef USE_AS_WMEMCHR
> > > > +       shrl    $2, %ecx
> > > > +# endif
> > > > +       xorq    %rdi, %rax
> > > > +       VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1), %k0
> > > > +       KMOV    %k0, %VRSI
> > > > +       shr     %cl, %VRSI
> > > > +# ifndef USE_AS_RAWMEMCHR
> > > > +       jnz     L(page_cross_end)
> > > > +       movl    $CHAR_PER_VEC, %eax
> > > > +       sub     %ecx, %eax
> > > > +       cmp     %rax, %rdx
> > > > +       ja      L(align_more)
> > > > +# else
> > > > +       jz      L(align_more)
> > > > +# endif
> > > > +
> > > > +L(page_cross_end):
> > > > +# ifndef USE_AS_RAWMEMCHR
> > > > +       bsf     %VRSI, %VRCX


This needs to be `bsfq`, as `bsfl` (for VEC_SIZE == 32) has an
undefined result in the upper bits of dst.
> > > > +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > > > +       xor     %eax, %eax
> > > > +       cmp     %rcx, %rdx
> > > > +       cmova   %rdi, %rax
> > >
> > > You have a bug here; test case:
> > >
> > > align % 4096 = 4036
> > > len = 8
> > > pos = N/A (no char in bounds).
> > >
> >
> > Can you please help reproduce this issue?
> > I tried adding this test case, but it didn't fail.
> >
> > do_test (4036, 20, 8, 8, 0x9B);
> The position must not be in the first VEC (it can even be out of bounds).
> >
> >
> > > I think the `bsf %rsi, %rcx` shouldn't have rcx as a destination
> > > or you need to move `rdx` to `rcx` first.
> > >
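
A sketch of the "move `rdx` to `rcx` first" variant, mirroring the
entry path (my illustration of the suggested fix, not the final code):

	mov	%rdx, %rcx	/* Preload with the length, so that a
				   zero mask leaves %rcx == length.  */
	bsfq	%rsi, %rcx
	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
	xor	%eax, %eax
	cmp	%rcx, %rdx	/* Match within bounds?  Then take the
				   pointer, else return NULL.  */
	cmova	%rdi, %rax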
[...]
Sunil Pandey Oct. 18, 2022, 5:36 a.m. UTC | #5
On Mon, Oct 17, 2022 at 9:18 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Mon, Oct 17, 2022 at 11:15 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> > On Mon, Oct 17, 2022 at 8:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Mon, Oct 17, 2022 at 5:00 PM Sunil K Pandey via Libc-alpha
> > > <libc-alpha@sourceware.org> wrote:
[...]
> > Can you please help reproduce this issue?
> > I tried adding this test case, but it didn't fail.
> >
> > do_test (4036, 20, 8, 8, 0x9B);
> The position must not be in the first VEC (it can even be out of bounds).
> >

How about

do_test (4036, 1200, 8, 8, 0x9B);

It still doesn't reproduce.

[...]
Noah Goldstein Oct. 18, 2022, 6:30 a.m. UTC | #6
On Mon, Oct 17, 2022 at 10:37 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Mon, Oct 17, 2022 at 9:18 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Mon, Oct 17, 2022 at 11:15 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> > >
> > > On Mon, Oct 17, 2022 at 8:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > On Mon, Oct 17, 2022 at 5:00 PM Sunil K Pandey via Libc-alpha
> > > > <libc-alpha@sourceware.org> wrote:
[...]
>
> How about
>
> do_test (4036, 1200, 8, 8, 0x9B);
>
> It still doesn't reproduce.

  do_test (4036, 1200, 1200, 8, 23);

Sorry, the params I had were confusing. For some reason the glibc
testsuite has two different variables for the char position and
decided to name one of them 'len'.

I meant the 3rd argument, so 'n' in the glibc test suite.


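For anyone following along, here is a standalone sketch of the failing
case (my reconstruction from the parameters above, not glibc's harness;
the string/test-memchr.c driver, which runs every ifunc candidate, is
what actually reaches the __memchr_evex512 placeholder, since a plain
memchr call will not dispatch to it):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int
main (void)
{
  /* Buffer starts at page offset 4036, i.e. within the last VEC_SIZE
     (64) bytes of the page, so the evex512 page-cross path is taken.
     Anonymous mappings are zero-filled, so only the seek char needs
     to be planted.  */
  unsigned char *map = mmap (NULL, 2 * 4096, PROT_READ | PROT_WRITE,
			     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (map == MAP_FAILED)
    return 2;
  unsigned char *buf = map + 4036;
  buf[1200] = 23;	/* Seek char only at position 1200, far past n.  */
  void *ret = memchr (buf, 23, 8);	/* n == 8: no match in bounds.  */
  printf ("%s\n", ret == NULL ? "PASS" : "FAIL");
  return ret != NULL;
}
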
[...]
diff mbox series

Patch

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index df4601c294..e974b1ad97 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -4,6 +4,7 @@  sysdep_routines += \
   memchr-avx2 \
   memchr-avx2-rtm \
   memchr-evex \
+  memchr-evex512 \
   memchr-evex-rtm \
   memchr-sse2 \
   memcmp-avx2-movbe \
@@ -36,6 +37,7 @@  sysdep_routines += \
   rawmemchr-avx2 \
   rawmemchr-avx2-rtm \
   rawmemchr-evex \
+  rawmemchr-evex512 \
   rawmemchr-evex-rtm \
   rawmemchr-sse2 \
   stpcpy-avx2 \
@@ -156,6 +158,7 @@  sysdep_routines += \
   wmemchr-avx2 \
   wmemchr-avx2-rtm \
   wmemchr-evex \
+  wmemchr-evex512 \
   wmemchr-evex-rtm \
   wmemchr-sse2 \
   wmemcmp-avx2-movbe \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 00a91123d3..529c0b0ef0 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -63,6 +63,11 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __memchr_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, memchr,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __memchr_evex512)
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, memchr,
 				     (CPU_FEATURE_USABLE (AVX512VL)
 				      && CPU_FEATURE_USABLE (AVX512BW)
@@ -337,6 +342,11 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __rawmemchr_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __rawmemchr_evex512)
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr,
 				     (CPU_FEATURE_USABLE (AVX512VL)
 				      && CPU_FEATURE_USABLE (AVX512BW)
@@ -942,6 +952,11 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wmemchr_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wmemchr_evex512)
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr,
 				     (CPU_FEATURE_USABLE (AVX512VL)
 				      && CPU_FEATURE_USABLE (AVX512BW)
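
For reference: each evex512 entry above is gated on exactly the same
feature predicate as the evex entry beside it.  A minimal C sketch of
that gate, assuming glibc's CPU_FEATURE_USABLE_P helper and struct
cpu_features (the patch itself only registers the implementations; no
resolver selects them yet):

    #include <stdbool.h>

    /* Hypothetical helper mirroring the X86_IFUNC_IMPL_ADD_V4
       conditions above; not part of this patch.  */
    static inline bool
    evex512_usable (const struct cpu_features *cpu_features)
    {
      return (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
              && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
              && CPU_FEATURE_USABLE_P (cpu_features, BMI2));
    }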
diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S
new file mode 100644
index 0000000000..ea92983db8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S
@@ -0,0 +1,301 @@ 
+/* Placeholder function, not used by any processor at the moment.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* UNUSED. Exists purely as reference implementation.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# include <sysdep.h>
+
+# ifdef USE_AS_WMEMCHR
+#  define CHAR_SIZE	4
+#  define VPBROADCAST   vpbroadcastd
+#  define VPCMPEQ	vpcmpeqd
+#  define VPCMPNE	vpcmpneqd
+#  define VPMINU	vpminud
+#  define VPTESTNM	vptestnmd
+# else
+#  define CHAR_SIZE	1
+#  define VPBROADCAST   vpbroadcastb
+#  define VPCMPEQ	vpcmpeqb
+#  define VPCMPNE	vpcmpneqb
+#  define VPMINU	vpminub
+#  define VPTESTNM	vptestnmb
+# endif
+
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+	.section SECTION(.text), "ax", @progbits
+/* Aligning the entry point to 64 bytes provides better performance for
+   strings of one vector length.  */
+ENTRY_P2ALIGN (MEMCHR, 6)
+# ifndef USE_AS_RAWMEMCHR
+	/* Check for zero length.  */
+	test	%RDX_LP, %RDX_LP
+	jz	L(zero)
+
+#  ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+#  endif
+# endif
+
+	/* Broadcast CHAR to VMM(1).  */
+	VPBROADCAST %esi, %VMM(1)
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)
+
+	/* Compare each [w]char against CHAR; the mask bit is set for each
+	   match.  */
+	VPCMPEQ	(%rdi), %VMM(1), %k0
+
+	KMOV	%k0, %VRCX
+# ifndef USE_AS_RAWMEMCHR
+	mov	%rdx, %rsi
+	bsf	%VRCX, %VRSI
+	cmp	$CHAR_PER_VEC, %rsi
+	ja	L(align_more)
+#  ifdef USE_AS_WMEMCHR
+	leaq	(%rdi, %rsi, CHAR_SIZE), %rdi
+#  else
+	addq	%rsi, %rdi
+#  endif
+	xor	%eax, %eax
+	cmp	%rsi, %rdx
+	cmova	%rdi, %rax
+# else
+	bsf     %VRCX, %VRAX
+	jz	L(align_more)
+	add	%rdi, %rax
+# endif
+	ret
+
+	.p2align 5,,5
+L(page_cross):
+	movl	%eax, %ecx
+	andl	$(VEC_SIZE - 1), %ecx
+# ifdef USE_AS_WMEMCHR
+	shrl	$2, %ecx
+# endif
+	xorq	%rdi, %rax
+	VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1), %k0
+	KMOV    %k0, %VRSI
+	shr	%cl, %VRSI
+# ifndef USE_AS_RAWMEMCHR
+	jnz	L(page_cross_end)
+	movl	$CHAR_PER_VEC, %eax
+	sub	%ecx, %eax
+	cmp	%rax, %rdx
+	ja	L(align_more)
+# else
+	jz	L(align_more)
+# endif
+
+L(page_cross_end):
+# ifndef USE_AS_RAWMEMCHR
+	bsf	%VRSI, %VRCX
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+	xor	%eax, %eax
+	cmp	%rcx, %rdx
+	cmova	%rdi, %rax
+# else
+	bsf	%VRSI, %VRAX
+	add	%rdi, %rax
+# endif
+	ret
+
+# ifndef USE_AS_RAWMEMCHR
+L(zero):
+	xorl	%eax, %eax
+	ret
+# endif
+
+L(ret_vec_x2):
+	subq	$-VEC_SIZE, %rdi
+L(ret_vec_x1):
+	bsf     %VRAX, %VRAX
+# ifndef USE_AS_RAWMEMCHR
+	cmp	%rax, %rdx
+	jbe	L(zero)
+# endif
+# ifdef USE_AS_WMEMCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	add	%rdi, %rax
+# endif
+	ret
+
+	.p2align 5,,5
+L(align_more):
+# ifndef USE_AS_RAWMEMCHR
+	mov	%rdi, %rax
+# endif
+	subq	$-VEC_SIZE, %rdi
+	/* Align rdi to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rdi
+
+# ifndef USE_AS_RAWMEMCHR
+	subq	%rdi, %rax
+#  ifdef USE_AS_WMEMCHR
+	sar	$2, %rax
+#  endif
+	addq	%rax, %rdx
+# endif
+
+	/* Check the next four vectors individually before entering the
+	   aligned 4-vector loop.  */
+	VPCMPEQ	(%rdi), %VMM(1), %k0
+
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(ret_vec_x1)
+
+# ifndef USE_AS_RAWMEMCHR
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(zero)
+# endif
+
+	VPCMPEQ	VEC_SIZE(%rdi), %VMM(1), %k0
+
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(ret_vec_x2)
+
+# ifndef USE_AS_RAWMEMCHR
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(zero)
+# endif
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMM(1), %k0
+
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(ret_vec_x3)
+
+# ifndef USE_AS_RAWMEMCHR
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(zero)
+# endif
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMM(1), %k0
+
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(ret_vec_x4)
+
+# ifndef USE_AS_RAWMEMCHR
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(zero)
+	/* Save pointer to find alignment adjustment.  */
+	movq	%rdi, %rax
+# endif
+	/* Align address to VEC_SIZE * 4 for loop.  */
+	andq	$-(VEC_SIZE * 4), %rdi
+
+	/* Add alignment difference to rdx.  */
+# ifndef USE_AS_RAWMEMCHR
+	subq	%rdi, %rax
+#  ifdef USE_AS_WMEMCHR
+	shr	$2, %VRAX
+#  endif
+	addq	%rax, %rdx
+# endif
+
+	/* 4 vector loop.  */
+	.p2align 5,,11
+L(loop):
+
+	VPCMPNE	(VEC_SIZE * 4)(%rdi), %VMM(1), %k1
+	vpxorq  (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+	vpxorq  (VEC_SIZE * 6)(%rdi), %VMM(1), %VMM(3)
+	VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMM(1), %k3
+	VPMINU  %VMM(2), %VMM(3), %VMM(3){%k1}{z}
+	VPTESTNM %VMM(3), %VMM(3), %k2
+
+	subq	$-(VEC_SIZE * 4), %rdi
+	KORTEST	%k2, %k3
+# ifdef USE_AS_RAWMEMCHR
+	jz	L(loop)
+# else
+	jnz	L(loopend)
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	ja	L(loop)
+L(zero_2):
+	xor	%eax, %eax
+	ret
+# endif
+
+L(loopend):
+	VPCMPEQ	(%rdi), %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(ret_vec_x1)
+
+# ifndef USE_AS_RAWMEMCHR
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(zero_2)
+# endif
+
+	VPCMPEQ	VEC_SIZE(%rdi), %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(ret_vec_x2)
+
+# ifndef USE_AS_RAWMEMCHR
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(zero_2)
+# endif
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(ret_vec_x3)
+
+# ifndef USE_AS_RAWMEMCHR
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(zero_2)
+# endif
+
+	/* At this point the matching [w]char must be in the fourth vector,
+	   so the mask already in k3 can be used directly.  */
+	KMOV	%k3, %VRAX
+
+L(ret_vec_x4):
+	bsf	%VRAX, %VRAX
+# ifndef USE_AS_RAWMEMCHR
+	cmp	%rax, %rdx
+	jbe	L(zero)
+# endif
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 5,,5
+L(ret_vec_x3):
+	bsf	%VRAX, %VRAX
+# ifndef USE_AS_RAWMEMCHR
+	cmp	%rax, %rdx
+	jbe	L(zero)
+# endif
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+END (MEMCHR)
+#endif
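
For readers not fluent in AVX-512 assembly, here is a rough C model of
the two less obvious pieces of memchr-evex-base.S, written with
AVX-512BW intrinsics for the byte variant.  This is an illustrative
sketch, not code from the patch, and it omits the length bookkeeping:

    #include <immintrin.h>
    #include <stdint.h>

    /* Model of L(page_cross): if the first 64-byte load would touch the
       next page, load the page's last aligned vector instead and shift
       away the mask bits for lanes before S.  The real code falls
       through to the aligned loop when the mask is zero.  */
    static inline const char *
    page_cross_probe (const char *s, __m512i c)
    {
      const char *last_vec = (const char *) ((uintptr_t) s & ~(uintptr_t) 63);
      __mmask64 k = _mm512_cmpeq_epi8_mask (_mm512_load_si512 (last_vec), c);
      k >>= (uintptr_t) s & 63;          /* drop matches before S */
      return k ? s + _tzcnt_u64 (k) : NULL;
    }

    /* Model of one L(loop) iteration: fold four 64-byte vectors into a
       single branch.  k1 (VPCMPNE) marks non-matches in vector 4; the
       zero-masked VPMINU zeroes lanes where vector 4 matched, and the
       min of the two xors is zero wherever vector 5 or 6 matched;
       VPTESTNM collects those zeros, and k3 (VPCMPEQ) covers vector 7.
       KORTEST then tests all four vectors at once.  */
    static inline int
    probe_4_vectors (const char *p, __m512i c)
    {
      __mmask64 k1 = _mm512_cmpneq_epi8_mask (_mm512_loadu_si512 (p), c);
      __m512i x5 = _mm512_xor_si512 (_mm512_loadu_si512 (p + 64), c);
      __m512i x6 = _mm512_xor_si512 (_mm512_loadu_si512 (p + 128), c);
      __mmask64 k3 = _mm512_cmpeq_epi8_mask (_mm512_loadu_si512 (p + 192), c);
      __m512i m = _mm512_maskz_min_epu8 (k1, x5, x6);
      __mmask64 k2 = _mm512_testn_epi8_mask (m, m);
      return (k2 | k3) != 0;             /* any match in the 256 bytes at P */
    }

The point of the min/testnm folding is that four vector compares feed a
single KORTEST, so the hot loop pays one conditional branch per
4 * VEC_SIZE bytes.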
diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S
new file mode 100644
index 0000000000..002f8c8489
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr-evex512.S
@@ -0,0 +1,8 @@ 
+# ifndef MEMCHR
+#  define MEMCHR       __memchr_evex512
+# endif
+
+#include "x86-evex512-vecs.h"
+#include "reg-macros.h"
+
+#include "memchr-evex-base.S"
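
The wrapper above is intentionally thin: the entry symbol comes from
MEMCHR, and all vector geometry comes from the two headers.
Schematically (the specifics below are assumptions based on the header
names; their contents are not shown in this patch):

    /* x86-evex512-vecs.h: selects 512-bit geometry, i.e. VEC_SIZE is 64
       and VMM(n) expands to a %zmm register.  */
    /* reg-macros.h: sizes the helpers (KMOV, VRAX, VRCX, VRSI, ...) to
       match VEC_SIZE and the character width, so a full 64-lane byte
       mask fits in a 64-bit GPR.  */
    /* memchr-evex-base.S: the implementation, parameterized entirely by
       the macros above.  */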
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S
new file mode 100644
index 0000000000..302d3cb055
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S
@@ -0,0 +1,7 @@ 
+#ifndef RAWMEMCHR
+# define RAWMEMCHR	__rawmemchr_evex512
+#endif
+#define USE_AS_RAWMEMCHR	1
+#define MEMCHR	RAWMEMCHR
+
+#include "memchr-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S
new file mode 100644
index 0000000000..78ec4ee5ad
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S
@@ -0,0 +1,9 @@ 
+#ifndef WMEMCHR
+# define WMEMCHR	__wmemchr_evex512
+#endif
+
+#define MEMCHR	WMEMCHR
+#define USE_AS_WMEMCHR	1
+
+#define USE_WIDE_CHAR	1
+#include "memchr-evex512.S"
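
Finally, the observable semantics all three variants must preserve are
the standard ones; a tiny harness (ordinary user code, nothing
evex-specific) makes them concrete:

    #define _GNU_SOURCE                /* for rawmemchr */
    #include <stdio.h>
    #include <string.h>
    #include <wchar.h>

    int
    main (void)
    {
      const char buf[] = "glibc evex512 memchr";
      /* memchr: bounded; returns NULL when the byte is absent.  */
      const char *p = memchr (buf, 'z', sizeof buf - 1);
      /* rawmemchr: unbounded; the caller guarantees the byte occurs
         (here, the terminating NUL).  */
      const char *q = rawmemchr (buf, '\0');
      const wchar_t wbuf[] = L"wide";
      /* wmemchr: bounded search over wchar_t elements.  */
      const wchar_t *w = wmemchr (wbuf, L'd', 4);
      printf ("%p %td %td\n", (void *) p, q - buf, w - wbuf);
      return 0;
    }

On an AVX-512 machine these implementations would behave identically to
the evex versions; for now they are reachable only through the ifunc
implementation list, since no processor selects them yet.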