
x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen

Message ID 20220518185943.2845106-1-skpgkp2@gmail.com
State New
Series x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen

Commit Message

Sunil Pandey May 18, 2022, 6:59 p.m. UTC
This patch implements the following evex512 versions of string functions.
Perf gain is up to 50% compared to evex, depending on length and
alignment.

- String length function using 512 bit vectors.
- String N length using 512 bit vectors.
- Wide string length using 512 bit vectors.
- Wide string N length using 512 bit vectors.
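
At its core, each variant compares one 64-byte (zmm) vector against zero
and reads the result out of a mask register.  A minimal sketch of the
case where the NUL is found in the first vector of the byte variant
(assuming the 64-byte load does not cross a page boundary):

	vpxorq	%xmm16, %xmm16, %xmm16	/* zero vector */
	vpcmpb	$0, (%rdi), %zmm16, %k0	/* bit i of k0 set if byte i is NUL */
	kmovq	%k0, %rax
	tzcntq	%rax, %rax		/* index of first NUL == length */

Longer strings fall through to an aligned four-vector loop, and the
strnlen/wcsnlen variants additionally clamp the result to the maxlen
argument.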
---
 sysdeps/x86_64/multiarch/Makefile          |   4 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |  20 ++
 sysdeps/x86_64/multiarch/strlen-evex512.S  | 291 +++++++++++++++++++++
 sysdeps/x86_64/multiarch/strnlen-evex512.S |   4 +
 sysdeps/x86_64/multiarch/wcslen-evex512.S  |   4 +
 sysdeps/x86_64/multiarch/wcsnlen-evex512.S |   5 +
 6 files changed, 328 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S

Comments

Noah Goldstein May 18, 2022, 8:29 p.m. UTC | #1
On Wed, May 18, 2022 at 1:59 PM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> This patch implements the following evex512 versions of string functions.
> Perf gain is up to 50% compared to evex, depending on length and
> alignment.

Can you include a csv (or any consistent fmt really) somewhere of all
the benchmarks and results of ~10-20 runs and the hardware you're
benchmarking on?
>
> - String length function using 512 bit vectors.
> - String N length using 512 bit vectors.
> - Wide string length using 512 bit vectors.
> - Wide string N length using 512 bit vectors.
> ---
>  sysdeps/x86_64/multiarch/Makefile          |   4 +
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  20 ++
>  sysdeps/x86_64/multiarch/strlen-evex512.S  | 291 +++++++++++++++++++++
>  sysdeps/x86_64/multiarch/strnlen-evex512.S |   4 +
>  sysdeps/x86_64/multiarch/wcslen-evex512.S  |   4 +
>  sysdeps/x86_64/multiarch/wcsnlen-evex512.S |   5 +
>  6 files changed, 328 insertions(+)
>  create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
>  create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index f3ab5e0928..d0869c3ac3 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -81,6 +81,7 @@ sysdep_routines += \
>    strlen-avx2 \
>    strlen-avx2-rtm \
>    strlen-evex \
> +  strlen-evex512 \
>    strlen-sse2 \
>    strncase_l-avx2 \
>    strncase_l-avx2-rtm \
> @@ -105,6 +106,7 @@ sysdep_routines += \
>    strnlen-avx2 \
>    strnlen-avx2-rtm \
>    strnlen-evex \
> +  strnlen-evex512 \
>    strnlen-sse2 \
>    strpbrk-c \
>    strpbrk-sse2 \
> @@ -138,6 +140,7 @@ sysdep_routines += \
>    wcslen-avx2 \
>    wcslen-avx2-rtm \
>    wcslen-evex \
> +  wcslen-evex512 \
>    wcslen-sse2 \
>    wcslen-sse4_1 \
>    wcsncmp-avx2 \
> @@ -148,6 +151,7 @@ sysdep_routines += \
>    wcsnlen-avx2-rtm \
>    wcsnlen-c \
>    wcsnlen-evex \
> +  wcsnlen-evex512 \
>    wcsnlen-sse4_1 \
>    wcsrchr-avx2 \
>    wcsrchr-avx2-rtm \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 7218095430..c5cd9466fe 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __strlen_evex)
> +             IFUNC_IMPL_ADD (array, i, strlen,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __strlen_evex512)
>               IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
>
>    /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
> @@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __strnlen_evex)
> +             IFUNC_IMPL_ADD (array, i, strnlen,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __strnlen_evex512)
>               IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
>
>    /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
> @@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __wcslen_evex)
> +             IFUNC_IMPL_ADD (array, i, wcslen,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __wcslen_evex512)
>               IFUNC_IMPL_ADD (array, i, wcslen,
>                               CPU_FEATURE_USABLE (SSE4_1),
>                               __wcslen_sse4_1)
> @@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __wcsnlen_evex)
> +             IFUNC_IMPL_ADD (array, i, wcsnlen,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __wcsnlen_evex512)
>               IFUNC_IMPL_ADD (array, i, wcsnlen,
>                               CPU_FEATURE_USABLE (SSE4_1),
>                               __wcsnlen_sse4_1)
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
> new file mode 100644
> index 0000000000..13a6b34615
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
> @@ -0,0 +1,291 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#if IS_IN (libc)
> +
> +# include <sysdep.h>
> +
> +# ifndef STRLEN
> +#  define STRLEN       __strlen_evex512
> +# endif
> +
> +# define VMOVA         vmovdqa64
> +# ifdef USE_AS_WCSLEN
> +#  define VPCMP                vpcmpd
> +#  define VPMINU       vpminud
> +#  define CHAR_SIZE    4
> +# else
> +#  define VPCMP                vpcmpb
> +#  define VPMINU       vpminub
> +#  define CHAR_SIZE    1
> +# endif
> +
> +# define XMM0          xmm16
> +# define ZMM0          zmm16
> +# define ZMM1          zmm17
> +# define ZMM2          zmm18
> +# define ZMM3          zmm19
> +# define ZMM4          zmm20
> +# define VEC_SIZE      64
> +# define PAGE_SIZE     4096
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)

Is it possible to integrate this file cleanly with the evex256 version?
Something similar to what we do for memset/memmove.
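(For reference, memset-evex-unaligned-erms.S only defines the VEC_SIZE/VEC
macros and #includes the shared memset-vec-unaligned-erms.S body.  A
hypothetical split along the same lines -- file names here are purely
illustrative -- could look like:

	/* strlen-evex512.S */
	#define STRLEN		__strlen_evex512
	#define VEC_SIZE	64
	#include "strlen-evex-base.S"

with the VEC_SIZE-parameterized body living in strlen-evex-base.S.)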
> +
> +       .section .text.evex512, "ax", @progbits
> +/* Aligning entry point to 64 byte, provides better performance for
> +   one vector length string.  */
> +ENTRY_P2ALIGN (STRLEN, 6)
> +# ifdef USE_AS_STRNLEN
> +        /* Check zero length.  */
> +       test    %RSI_LP, %RSI_LP
> +       jz      L(zero)
> +#  ifdef __ILP32__
> +       /* Clear the upper 32 bits.  */
> +       movl    %esi, %esi
> +#  endif
> +# endif
> +
> +       movl    %edi, %ecx
> +       vpxorq  %XMM0, %XMM0, %XMM0
> +       andl    $(PAGE_SIZE - 1), %ecx
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> +       ja      L(page_cross)
> +
> +       /* Compare [w]char for null, mask bit will be set for match.  */
> +       VPCMP   $0, (%rdi), %ZMM0, %k0
> +       kmovq   %k0, %rax
> +       testq   %rax, %rax
> +       jz      L(align_more)
> +
> +       tzcntq  %rax, %rax
> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
cmpl

> +       jae     L(ret_max)
> +# endif
> +       ret
> +
> +# ifdef USE_AS_STRNLEN
> +       /* eax instead of rax used to save encoding space.  */
> +L(zero):
> +       xorl    %eax, %eax
> +       ret
> +# endif
> +
> +       /* At this point vector max length reached.  */
> +# ifdef USE_AS_STRNLEN
> +L(ret_max):
> +       movq    %rsi, %rax
> +       ret
> +# endif
> +
> +L(page_cross):

Imo unless you need the 2-byte encoding on the jump this should be at
the end of the file as it's expected to not be hot.
> +       andl    $(VEC_SIZE - 1), %ecx
> +# ifdef USE_AS_WCSLEN
> +       sarl    $2, %ecx
> +# endif
> +       /* ecx contains number of w[char] to be skipped as a result
> +          of address alignment.  */
> +       movq    %rdi, %rax
> +       andq    $-VEC_SIZE, %rax
> +       VPCMP   $0, (%rax), %ZMM0, %k0
> +       kmovq   %k0, %rax
> +       /* Ignore number of character for alignment adjustment.  */
> +       shrq    %cl, %rax
> +       jz      L(align_more)
> +
> +       tzcntq  %rax, %rax
> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
> +       jae     L(ret_max)
> +# endif
> +       ret
> +
> +L(align_more):
> +       leaq    VEC_SIZE(%rdi), %rax
> +       /* Align rax to VEC_SIZE.  */
> +       andq    $-VEC_SIZE, %rax
> +# ifdef USE_AS_STRNLEN
> +       movq    %rax, %rdx
> +       subq    %rdi, %rdx
> +#  ifdef USE_AS_WCSLEN
> +       shrq    $2, %rdx
> +#  endif
> +       /* At this point rdx contains [w]chars already compared.  */
> +       cmpq    %rsi, %rdx
> +       jae     L(ret_max)
> +       subq    %rsi, %rdx
> +       negq    %rdx
> +       /* At this point rdx contains number of w[char] needs to go.
> +          Now onwards rdx will keep decrementing with each compare.  */
> +# endif
> +
> +       /* Loop unroll 4 times for 4 vector loop.  */
> +       VPCMP   $0, (%rax), %ZMM0, %k0
> +       kmovq   %k0, %rcx
> +       testq   %rcx, %rcx
> +       jnz     L(first_vector)

Just to keep consistent with the other files can you
rename first_vector/second_vector... to ret_vec_x{N}
or something like that.
> +
> +# ifdef USE_AS_STRNLEN
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(ret_max)
> +# endif
> +
> +       VPCMP   $0, VEC_SIZE(%rax), %ZMM0, %k0
> +       kmovq   %k0, %rcx
> +       testq   %rcx, %rcx
> +       jnz     L(second_vector)
> +
> +# ifdef USE_AS_STRNLEN
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(ret_max)
> +# endif

The evex256 / avx2 versions do a simple check if we will be able
to do all 4 aligning compares w.o a branch. This saves total
branches. Why not do something similar here?
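(Roughly: one up-front check such as

	cmpq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(last_4x_vec)

where L(last_4x_vec) would be a new tail handler, lets the four aligning
compares run without the per-vector rdx checks.  That is only a sketch of
the idea, not the evex256 code verbatim.)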
> +
> +       VPCMP   $0, (2 * VEC_SIZE)(%rax), %ZMM0, %k0
> +       kmovq   %k0, %rcx
> +       testq   %rcx, %rcx
> +       jnz     L(third_vector)
> +
> +# ifdef USE_AS_STRNLEN
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(ret_max)
> +# endif
> +
> +       VPCMP   $0, (3 * VEC_SIZE)(%rax), %ZMM0, %k0
> +       kmovq   %k0, %rcx
> +       testq   %rcx, %rcx
> +       jnz     L(fourth_vector)
> +
> +       addq    $(4 * VEC_SIZE), %rax
> +
> +# ifdef USE_AS_STRNLEN
> +       /* Instead of decreasing, rdx increased to prepare for loop
> +          first iteration.  Incremented 3 times because one increment
> +          cancelled by previous decrement.  */
> +       addq    $(3 * CHAR_PER_VEC), %rdx
> +# endif
> +
> +       /* Test if address is already 4 * VEC_SIZE byte aligned goto
> +          loop.  */
> +       testq   $(3 * VEC_SIZE), %rax
> +       jz      L(loop)
> +
> +       movq    %rax, %rcx
> +
> +       /* Align address to 4 * VEC_SIZE for loop.  */
> +       andq    $-(4 * VEC_SIZE), %rax
> +
> +# ifdef USE_AS_STRNLEN
> +       subq    %rax, %rcx
> +#  ifdef USE_AS_WCSLEN
> +       sarq    $2, %rcx
> +#  endif
> +       /* rcx contains number of [w]char will be recompared due to
> +          alignment fixes.  rdx must be incremented by rcx to offset
> +          alignment adjustment.  */
> +       addq    %rcx, %rdx
> +# endif
> +
> +L(loop):
> +# ifdef USE_AS_STRNLEN
> +       subq    $(CHAR_PER_VEC * 4), %rdx
> +       jbe     L(ret_max)

we have potential to overread by 255 bytes. Not a correctness issue because
we are page aligned, but seems like a possible perf issue.
> +# endif
> +       /* VPMINU and VPCMP combination provides better performance as
> +          compared to alternative combinations.  */
> +       VMOVA   (%rax), %ZMM1
> +       VPMINU  (VEC_SIZE)(%rax), %ZMM1, %ZMM2
> +       VMOVA   (2 * VEC_SIZE)(%rax), %ZMM3
> +       VPMINU  (3 * VEC_SIZE)(%rax), %ZMM3, %ZMM4

I think doing 4x in the main loop is probably overkill no?
Aligning to 256 is pretty extreme.

Also I don't think the 4x zmm loads can even keep up with
2x / cycle so seems like it may not be worth wasting up to
255 bytes to get it.
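(A 2x body, as a rough sketch of the alternative:

L(loop):
	VMOVA	(%rax), %ZMM1
	VPMINU	VEC_SIZE(%rax), %ZMM1, %ZMM2
	VPCMP	$0, %ZMM2, %ZMM0, %k0
	addq	$(2 * VEC_SIZE), %rax
	kortestq %k0, %k0
	jz	L(loop)

only needs 2 * VEC_SIZE alignment, so the worst-case overread drops to
127 bytes.)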
> +
> +       VPCMP   $0, %ZMM2, %ZMM0, %k0
> +       VPCMP   $0, %ZMM4, %ZMM0, %k1
> +
> +       addq    $(4 * VEC_SIZE), %rax
> +       kortestq %k0, %k1
> +       jz      L(loop)
> +
> +       /* Need 4 vector subtraction because address incremented in
> +          the loop before terminating condition check.  Also want to
> +          reuse code for exit condition before and after the loop.  */
> +       subq    $(4 * VEC_SIZE), %rax
> +
> +       VPCMP   $0, %ZMM1, %ZMM0, %k2
> +       kmovq   %k2, %rcx
> +       testq   %rcx, %rcx
> +       jnz     L(first_vector)
> +
> +       kmovq   %k0, %rcx
> +       /* At this point, if k0 is non zero, null char must be in the
> +          second vector.  */
> +       testq   %rcx, %rcx
> +       jnz     L(second_vector)
> +
> +       VPCMP   $0, %ZMM3, %ZMM0, %k3
> +       kmovq   %k3, %rcx
> +       testq   %rcx, %rcx
> +       jnz     L(third_vector)
> +       /* At this point null [w]char must be in the fourth vector so no
> +          need to check.  */
> +       kmovq   %k1, %rcx
> +
> +       /* Termination fourth, third, second vector are pretty much
> +          same, implemented this way to avoid branching and reuse code
> +          from pre loop exit condition.  */
> +L(fourth_vector):
> +       addq    $(3 * VEC_SIZE), %rax
> +       tzcntq  %rcx, %rcx
> +       subq    %rdi, %rax
Can this be hoisted out to the beginning of L(aligned_more)?
It seems every return path uses it.

> +# ifdef USE_AS_WCSLEN
> +       sarq    $2, %rax
> +# endif
> +       addq    %rcx, %rax

if not wcslen probably faster to use lea instead of 2x add
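(E.g., for the non-wcslen path, something along the lines of

	tzcntq	%rcx, %rcx
	leaq	(3 * VEC_SIZE)(%rax, %rcx), %rax
	subq	%rdi, %rax

folds the constant add and the tzcnt result into one lea; untested sketch.)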

> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
> +       jae     L(ret_max)
> +# endif
> +       ret
> +
> +L(third_vector):
> +       addq    $(2 * VEC_SIZE), %rax
> +       tzcntq  %rcx, %rcx
> +       subq    %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> +       sarq    $2, %rax
> +# endif
> +       addq    %rcx, %rax
> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
> +       jae     L(ret_max)
> +# endif
> +       ret
> +
> +L(second_vector):
> +       addq    $VEC_SIZE, %rax
> +L(first_vector):
> +       tzcntq  %rcx, %rcx
> +       subq    %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> +       sarq    $2, %rax
> +# endif
> +       addq    %rcx, %rax
> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
> +       jae     L(ret_max)
> +# endif
> +       ret
> +
> +END (STRLEN)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> new file mode 100644
> index 0000000000..0b7f220214
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> @@ -0,0 +1,4 @@
> +#define STRLEN __strnlen_evex512
> +#define USE_AS_STRNLEN 1
> +
> +#include "strlen-evex512.S"
> diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> new file mode 100644
> index 0000000000..f59c372b78
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> @@ -0,0 +1,4 @@
> +#define STRLEN __wcslen_evex512
> +#define USE_AS_WCSLEN 1
> +
> +#include "strlen-evex512.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> new file mode 100644
> index 0000000000..73dcf2f210
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> @@ -0,0 +1,5 @@
> +#define STRLEN __wcsnlen_evex512
> +#define USE_AS_WCSLEN 1
> +#define USE_AS_STRNLEN 1
> +
> +#include "strlen-evex512.S"
> --
> 2.35.3
>
Sunil Pandey May 19, 2022, 3:33 a.m. UTC | #2
On Wed, May 18, 2022 at 1:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, May 18, 2022 at 1:59 PM Sunil K Pandey via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > This patch implements the following evex512 versions of string functions.
> > Perf gain is up to 50% compared to evex, depending on length and
> > alignment.
>
> Can you include a csv (or any consistent fmt really) somewhere of all
> the benchmarks and results of ~10-20 runs and the hardware you're
> benchmarking on?

Machine:
Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
Fedora 35
Glibc master

20-iteration data for each function is attached; please use any text
editor (e.g. vi) to access it.

> >
> > - String length function using 512 bit vectors.
> > - String N length using 512 bit vectors.
> > - Wide string length using 512 bit vectors.
> > - Wide string N length using 512 bit vectors.
> > ---
> >  sysdeps/x86_64/multiarch/Makefile          |   4 +
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  20 ++
> >  sysdeps/x86_64/multiarch/strlen-evex512.S  | 291 +++++++++++++++++++++
> >  sysdeps/x86_64/multiarch/strnlen-evex512.S |   4 +
> >  sysdeps/x86_64/multiarch/wcslen-evex512.S  |   4 +
> >  sysdeps/x86_64/multiarch/wcsnlen-evex512.S |   5 +
> >  6 files changed, 328 insertions(+)
> >  create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
> >  create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index f3ab5e0928..d0869c3ac3 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -81,6 +81,7 @@ sysdep_routines += \
> >    strlen-avx2 \
> >    strlen-avx2-rtm \
> >    strlen-evex \
> > +  strlen-evex512 \
> >    strlen-sse2 \
> >    strncase_l-avx2 \
> >    strncase_l-avx2-rtm \
> > @@ -105,6 +106,7 @@ sysdep_routines += \
> >    strnlen-avx2 \
> >    strnlen-avx2-rtm \
> >    strnlen-evex \
> > +  strnlen-evex512 \
> >    strnlen-sse2 \
> >    strpbrk-c \
> >    strpbrk-sse2 \
> > @@ -138,6 +140,7 @@ sysdep_routines += \
> >    wcslen-avx2 \
> >    wcslen-avx2-rtm \
> >    wcslen-evex \
> > +  wcslen-evex512 \
> >    wcslen-sse2 \
> >    wcslen-sse4_1 \
> >    wcsncmp-avx2 \
> > @@ -148,6 +151,7 @@ sysdep_routines += \
> >    wcsnlen-avx2-rtm \
> >    wcsnlen-c \
> >    wcsnlen-evex \
> > +  wcsnlen-evex512 \
> >    wcsnlen-sse4_1 \
> >    wcsrchr-avx2 \
> >    wcsrchr-avx2-rtm \
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index 7218095430..c5cd9466fe 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                && CPU_FEATURE_USABLE (AVX512BW)
> >                                && CPU_FEATURE_USABLE (BMI2)),
> >                               __strlen_evex)
> > +             IFUNC_IMPL_ADD (array, i, strlen,
> > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > +                              && CPU_FEATURE_USABLE (AVX512BW)
> > +                              && CPU_FEATURE_USABLE (BMI2)),
> > +                             __strlen_evex512)
> >               IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
> >
> >    /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
> > @@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                && CPU_FEATURE_USABLE (AVX512BW)
> >                                && CPU_FEATURE_USABLE (BMI2)),
> >                               __strnlen_evex)
> > +             IFUNC_IMPL_ADD (array, i, strnlen,
> > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > +                              && CPU_FEATURE_USABLE (AVX512BW)
> > +                              && CPU_FEATURE_USABLE (BMI2)),
> > +                             __strnlen_evex512)
> >               IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
> >
> >    /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
> > @@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                && CPU_FEATURE_USABLE (AVX512BW)
> >                                && CPU_FEATURE_USABLE (BMI2)),
> >                               __wcslen_evex)
> > +             IFUNC_IMPL_ADD (array, i, wcslen,
> > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > +                              && CPU_FEATURE_USABLE (AVX512BW)
> > +                              && CPU_FEATURE_USABLE (BMI2)),
> > +                             __wcslen_evex512)
> >               IFUNC_IMPL_ADD (array, i, wcslen,
> >                               CPU_FEATURE_USABLE (SSE4_1),
> >                               __wcslen_sse4_1)
> > @@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                && CPU_FEATURE_USABLE (AVX512BW)
> >                                && CPU_FEATURE_USABLE (BMI2)),
> >                               __wcsnlen_evex)
> > +             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > +                              && CPU_FEATURE_USABLE (AVX512BW)
> > +                              && CPU_FEATURE_USABLE (BMI2)),
> > +                             __wcsnlen_evex512)
> >               IFUNC_IMPL_ADD (array, i, wcsnlen,
> >                               CPU_FEATURE_USABLE (SSE4_1),
> >                               __wcsnlen_sse4_1)
> > diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
> > new file mode 100644
> > index 0000000000..13a6b34615
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
> > @@ -0,0 +1,291 @@
> > +/* Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#if IS_IN (libc)
> > +
> > +# include <sysdep.h>
> > +
> > +# ifndef STRLEN
> > +#  define STRLEN       __strlen_evex512
> > +# endif
> > +
> > +# define VMOVA         vmovdqa64
> > +# ifdef USE_AS_WCSLEN
> > +#  define VPCMP                vpcmpd
> > +#  define VPMINU       vpminud
> > +#  define CHAR_SIZE    4
> > +# else
> > +#  define VPCMP                vpcmpb
> > +#  define VPMINU       vpminub
> > +#  define CHAR_SIZE    1
> > +# endif
> > +
> > +# define XMM0          xmm16
> > +# define ZMM0          zmm16
> > +# define ZMM1          zmm17
> > +# define ZMM2          zmm18
> > +# define ZMM3          zmm19
> > +# define ZMM4          zmm20
> > +# define VEC_SIZE      64
> > +# define PAGE_SIZE     4096
> > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
>
> Is it possible to integrate this file cleanly with the evex256 version?
> Something similar to what we do for memset/memmove.

Good suggestion, I will look into it. For the first iteration, let's
keep it standalone
for now.

> > +
> > +       .section .text.evex512, "ax", @progbits
> > +/* Aligning entry point to 64 byte, provides better performance for
> > +   one vector length string.  */
> > +ENTRY_P2ALIGN (STRLEN, 6)
> > +# ifdef USE_AS_STRNLEN
> > +        /* Check zero length.  */
> > +       test    %RSI_LP, %RSI_LP
> > +       jz      L(zero)
> > +#  ifdef __ILP32__
> > +       /* Clear the upper 32 bits.  */
> > +       movl    %esi, %esi
> > +#  endif
> > +# endif
> > +
> > +       movl    %edi, %ecx
> > +       vpxorq  %XMM0, %XMM0, %XMM0
> > +       andl    $(PAGE_SIZE - 1), %ecx
> > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> > +       ja      L(page_cross)
> > +
> > +       /* Compare [w]char for null, mask bit will be set for match.  */
> > +       VPCMP   $0, (%rdi), %ZMM0, %k0
> > +       kmovq   %k0, %rax
> > +       testq   %rax, %rax
> > +       jz      L(align_more)
> > +
> > +       tzcntq  %rax, %rax
> > +# ifdef USE_AS_STRNLEN
> > +       cmpq    %rsi, %rax
> cmpl
>
> > +       jae     L(ret_max)
> > +# endif
> > +       ret
> > +
> > +# ifdef USE_AS_STRNLEN
> > +       /* eax instead of rax used to save encoding space.  */
> > +L(zero):
> > +       xorl    %eax, %eax
> > +       ret
> > +# endif
> > +
> > +       /* At this point vector max length reached.  */
> > +# ifdef USE_AS_STRNLEN
> > +L(ret_max):
> > +       movq    %rsi, %rax
> > +       ret
> > +# endif
> > +
> > +L(page_cross):
>
> Imo unless you need the 2-byte encoding on the jump this should be at
> the end of the file as it's expected to not be hot.

One of my goals is to reduce size as much as possible, as long as it
doesn't hurt performance.  Keeping the jump target nearby reduces
size by a few bytes without hurting performance.

> > +       andl    $(VEC_SIZE - 1), %ecx
> > +# ifdef USE_AS_WCSLEN
> > +       sarl    $2, %ecx
> > +# endif
> > +       /* ecx contains number of w[char] to be skipped as a result
> > +          of address alignment.  */
> > +       movq    %rdi, %rax
> > +       andq    $-VEC_SIZE, %rax
> > +       VPCMP   $0, (%rax), %ZMM0, %k0
> > +       kmovq   %k0, %rax
> > +       /* Ignore number of character for alignment adjustment.  */
> > +       shrq    %cl, %rax
> > +       jz      L(align_more)
> > +
> > +       tzcntq  %rax, %rax
> > +# ifdef USE_AS_STRNLEN
> > +       cmpq    %rsi, %rax
> > +       jae     L(ret_max)
> > +# endif
> > +       ret
> > +
> > +L(align_more):
> > +       leaq    VEC_SIZE(%rdi), %rax
> > +       /* Align rax to VEC_SIZE.  */
> > +       andq    $-VEC_SIZE, %rax
> > +# ifdef USE_AS_STRNLEN
> > +       movq    %rax, %rdx
> > +       subq    %rdi, %rdx
> > +#  ifdef USE_AS_WCSLEN
> > +       shrq    $2, %rdx
> > +#  endif
> > +       /* At this point rdx contains [w]chars already compared.  */
> > +       cmpq    %rsi, %rdx
> > +       jae     L(ret_max)
> > +       subq    %rsi, %rdx
> > +       negq    %rdx
> > +       /* At this point rdx contains number of w[char] needs to go.
> > +          Now onwards rdx will keep decrementing with each compare.  */
> > +# endif
> > +
> > +       /* Loop unroll 4 times for 4 vector loop.  */
> > +       VPCMP   $0, (%rax), %ZMM0, %k0
> > +       kmovq   %k0, %rcx
> > +       testq   %rcx, %rcx
> > +       jnz     L(first_vector)
>
> Just to keep consistent with the other files can you
> rename first_vector/second_vector... to ret_vec_x{N}
> or something like that.

Agree, will be fixed in v1.

> > +
> > +# ifdef USE_AS_STRNLEN
> > +       subq    $CHAR_PER_VEC, %rdx
> > +       jbe     L(ret_max)
> > +# endif
> > +
> > +       VPCMP   $0, VEC_SIZE(%rax), %ZMM0, %k0
> > +       kmovq   %k0, %rcx
> > +       testq   %rcx, %rcx
> > +       jnz     L(second_vector)
> > +
> > +# ifdef USE_AS_STRNLEN
> > +       subq    $CHAR_PER_VEC, %rdx
> > +       jbe     L(ret_max)
> > +# endif
>
> The evex256 / avx2 versions do a simple check if we will be able
> to do all 4 aligning compares w.o a branch. This saves total
> branches. Why not do something similar here?

Done this way to reduce size and complexity. If the branch is taken, we
jump to the terminating condition; if not taken, there is no impact on perf.

> > +
> > +       VPCMP   $0, (2 * VEC_SIZE)(%rax), %ZMM0, %k0
> > +       kmovq   %k0, %rcx
> > +       testq   %rcx, %rcx
> > +       jnz     L(third_vector)
> > +
> > +# ifdef USE_AS_STRNLEN
> > +       subq    $CHAR_PER_VEC, %rdx
> > +       jbe     L(ret_max)
> > +# endif
> > +
> > +       VPCMP   $0, (3 * VEC_SIZE)(%rax), %ZMM0, %k0
> > +       kmovq   %k0, %rcx
> > +       testq   %rcx, %rcx
> > +       jnz     L(fourth_vector)
> > +
> > +       addq    $(4 * VEC_SIZE), %rax
> > +
> > +# ifdef USE_AS_STRNLEN
> > +       /* Instead of decreasing, rdx increased to prepare for loop
> > +          first iteration.  Incremented 3 times because one increment
> > +          cancelled by previous decrement.  */
> > +       addq    $(3 * CHAR_PER_VEC), %rdx
> > +# endif
> > +
> > +       /* Test if address is already 4 * VEC_SIZE byte aligned goto
> > +          loop.  */
> > +       testq   $(3 * VEC_SIZE), %rax
> > +       jz      L(loop)
> > +
> > +       movq    %rax, %rcx
> > +
> > +       /* Align address to 4 * VEC_SIZE for loop.  */
> > +       andq    $-(4 * VEC_SIZE), %rax
> > +
> > +# ifdef USE_AS_STRNLEN
> > +       subq    %rax, %rcx
> > +#  ifdef USE_AS_WCSLEN
> > +       sarq    $2, %rcx
> > +#  endif
> > +       /* rcx contains number of [w]char will be recompared due to
> > +          alignment fixes.  rdx must be incremented by rcx to offset
> > +          alignment adjustment.  */
> > +       addq    %rcx, %rdx
> > +# endif
> > +
> > +L(loop):
> > +# ifdef USE_AS_STRNLEN
> > +       subq    $(CHAR_PER_VEC * 4), %rdx
> > +       jbe     L(ret_max)
>
> we have potential to overread by 255 bytes. Not a correctness issue because
> we are page aligned, but seems like a possible perf issue.

Correct, but the overread data will be read from cache, not memory, so the
impact is not significant; this is the cost we have to pay for 4-vector
alignment.

> > +# endif
> > +       /* VPMINU and VPCMP combination provides better performance as
> > +          compared to alternative combinations.  */
> > +       VMOVA   (%rax), %ZMM1
> > +       VPMINU  (VEC_SIZE)(%rax), %ZMM1, %ZMM2
> > +       VMOVA   (2 * VEC_SIZE)(%rax), %ZMM3
> > +       VPMINU  (3 * VEC_SIZE)(%rax), %ZMM3, %ZMM4
>
> I think doing 4x in the main loop is probably overkill no?
> Aligning to 256 is pretty extreme.
>
> Also I don't think the 4x zmm loads can even keep up with
> 2x / cycle so seems like it may not be worth wasting up to
> 255 bytes to get it.

Perf numbers look good, so for now it should be ok.

> > +
> > +       VPCMP   $0, %ZMM2, %ZMM0, %k0
> > +       VPCMP   $0, %ZMM4, %ZMM0, %k1
> > +
> > +       addq    $(4 * VEC_SIZE), %rax
> > +       kortestq %k0, %k1
> > +       jz      L(loop)
> > +
> > +       /* Need 4 vector subtraction because address incremented in
> > +          the loop before terminating condition check.  Also want to
> > +          reuse code for exit condition before and after the loop.  */
> > +       subq    $(4 * VEC_SIZE), %rax
> > +
> > +       VPCMP   $0, %ZMM1, %ZMM0, %k2
> > +       kmovq   %k2, %rcx
> > +       testq   %rcx, %rcx
> > +       jnz     L(first_vector)
> > +
> > +       kmovq   %k0, %rcx
> > +       /* At this point, if k0 is non zero, null char must be in the
> > +          second vector.  */
> > +       testq   %rcx, %rcx
> > +       jnz     L(second_vector)
> > +
> > +       VPCMP   $0, %ZMM3, %ZMM0, %k3
> > +       kmovq   %k3, %rcx
> > +       testq   %rcx, %rcx
> > +       jnz     L(third_vector)
> > +       /* At this point null [w]char must be in the fourth vector so no
> > +          need to check.  */
> > +       kmovq   %k1, %rcx
> > +
> > +       /* Termination fourth, third, second vector are pretty much
> > +          same, implemented this way to avoid branching and reuse code
> > +          from pre loop exit condition.  */
> > +L(fourth_vector):
> > +       addq    $(3 * VEC_SIZE), %rax
> > +       tzcntq  %rcx, %rcx
> > +       subq    %rdi, %rax
> Can this be hoisted out to the beginning of L(aligned_more)?
> It seems every return path uses it.
>

It really depends on where the control is coming from. So moving before
align_more will not be correct, or I may be missing something here.

> > +# ifdef USE_AS_WCSLEN
> > +       sarq    $2, %rax
> > +# endif
> > +       addq    %rcx, %rax
>
> if not wcslen probably faster to use lea instead of 2x add

I'm not sure whether there will be any significant gain from lea vs. add.
I used add because it's readily available on all ports.


>
> > +# ifdef USE_AS_STRNLEN
> > +       cmpq    %rsi, %rax
> > +       jae     L(ret_max)
> > +# endif
> > +       ret
> > +
> > +L(third_vector):
> > +       addq    $(2 * VEC_SIZE), %rax
> > +       tzcntq  %rcx, %rcx
> > +       subq    %rdi, %rax
> > +# ifdef USE_AS_WCSLEN
> > +       sarq    $2, %rax
> > +# endif
> > +       addq    %rcx, %rax
> > +# ifdef USE_AS_STRNLEN
> > +       cmpq    %rsi, %rax
> > +       jae     L(ret_max)
> > +# endif
> > +       ret
> > +
> > +L(second_vector):
> > +       addq    $VEC_SIZE, %rax
> > +L(first_vector):
> > +       tzcntq  %rcx, %rcx
> > +       subq    %rdi, %rax
> > +# ifdef USE_AS_WCSLEN
> > +       sarq    $2, %rax
> > +# endif
> > +       addq    %rcx, %rax
> > +# ifdef USE_AS_STRNLEN
> > +       cmpq    %rsi, %rax
> > +       jae     L(ret_max)
> > +# endif
> > +       ret
> > +
> > +END (STRLEN)
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > new file mode 100644
> > index 0000000000..0b7f220214
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > @@ -0,0 +1,4 @@
> > +#define STRLEN __strnlen_evex512
> > +#define USE_AS_STRNLEN 1
> > +
> > +#include "strlen-evex512.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> > new file mode 100644
> > index 0000000000..f59c372b78
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> > @@ -0,0 +1,4 @@
> > +#define STRLEN __wcslen_evex512
> > +#define USE_AS_WCSLEN 1
> > +
> > +#include "strlen-evex512.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > new file mode 100644
> > index 0000000000..73dcf2f210
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > @@ -0,0 +1,5 @@
> > +#define STRLEN __wcsnlen_evex512
> > +#define USE_AS_WCSLEN 1
> > +#define USE_AS_STRNLEN 1
> > +
> > +#include "strlen-evex512.S"
> > --
> > 2.35.3
> >
Noah Goldstein May 19, 2022, 4:41 a.m. UTC | #3
On Wed, May 18, 2022 at 10:33 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Wed, May 18, 2022 at 1:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Wed, May 18, 2022 at 1:59 PM Sunil K Pandey via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> > >
> > > This patch implements the following evex512 versions of string functions.
> > > Perf gain is up to 50% compared to evex, depending on length and
> > > alignment.
> >
> > Can you include a csv (or any consistent fmt really) somewhere of all
> > the benchmarks and results of ~10-20 runs and the hardware you're
> > benchmarking on?
>
> Machine:
> Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
> Fedora 35
> Glibc master
>
> 20-iteration data for each function is attached; please use any text
> editor (e.g. vi) to access it.

Any chance you can aggregate it?

Also, can you add the collective geometric mean of evex vs evex512 and
the cpu info to the commit message?
>
> > >
> > > - String length function using 512 bit vectors.
> > > - String N length using 512 bit vectors.
> > > - Wide string length using 512 bit vectors.
> > > - Wide string N length using 512 bit vectors.
> > > ---
> > >  sysdeps/x86_64/multiarch/Makefile          |   4 +
> > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  20 ++
> > >  sysdeps/x86_64/multiarch/strlen-evex512.S  | 291 +++++++++++++++++++++
> > >  sysdeps/x86_64/multiarch/strnlen-evex512.S |   4 +
> > >  sysdeps/x86_64/multiarch/wcslen-evex512.S  |   4 +
> > >  sysdeps/x86_64/multiarch/wcsnlen-evex512.S |   5 +
> > >  6 files changed, 328 insertions(+)
> > >  create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
> > >  create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
> > >  create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
> > >  create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > > index f3ab5e0928..d0869c3ac3 100644
> > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > @@ -81,6 +81,7 @@ sysdep_routines += \
> > >    strlen-avx2 \
> > >    strlen-avx2-rtm \
> > >    strlen-evex \
> > > +  strlen-evex512 \
> > >    strlen-sse2 \
> > >    strncase_l-avx2 \
> > >    strncase_l-avx2-rtm \
> > > @@ -105,6 +106,7 @@ sysdep_routines += \
> > >    strnlen-avx2 \
> > >    strnlen-avx2-rtm \
> > >    strnlen-evex \
> > > +  strnlen-evex512 \
> > >    strnlen-sse2 \
> > >    strpbrk-c \
> > >    strpbrk-sse2 \
> > > @@ -138,6 +140,7 @@ sysdep_routines += \
> > >    wcslen-avx2 \
> > >    wcslen-avx2-rtm \
> > >    wcslen-evex \
> > > +  wcslen-evex512 \
> > >    wcslen-sse2 \
> > >    wcslen-sse4_1 \
> > >    wcsncmp-avx2 \
> > > @@ -148,6 +151,7 @@ sysdep_routines += \
> > >    wcsnlen-avx2-rtm \
> > >    wcsnlen-c \
> > >    wcsnlen-evex \
> > > +  wcsnlen-evex512 \
> > >    wcsnlen-sse4_1 \
> > >    wcsrchr-avx2 \
> > >    wcsrchr-avx2-rtm \
> > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > index 7218095430..c5cd9466fe 100644
> > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > @@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >                                && CPU_FEATURE_USABLE (AVX512BW)
> > >                                && CPU_FEATURE_USABLE (BMI2)),
> > >                               __strlen_evex)
> > > +             IFUNC_IMPL_ADD (array, i, strlen,
> > > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > > +                              && CPU_FEATURE_USABLE (AVX512BW)
> > > +                              && CPU_FEATURE_USABLE (BMI2)),
> > > +                             __strlen_evex512)
> > >               IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
> > >
> > >    /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
> > > @@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >                                && CPU_FEATURE_USABLE (AVX512BW)
> > >                                && CPU_FEATURE_USABLE (BMI2)),
> > >                               __strnlen_evex)
> > > +             IFUNC_IMPL_ADD (array, i, strnlen,
> > > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > > +                              && CPU_FEATURE_USABLE (AVX512BW)
> > > +                              && CPU_FEATURE_USABLE (BMI2)),
> > > +                             __strnlen_evex512)
> > >               IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
> > >
> > >    /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
> > > @@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >                                && CPU_FEATURE_USABLE (AVX512BW)
> > >                                && CPU_FEATURE_USABLE (BMI2)),
> > >                               __wcslen_evex)
> > > +             IFUNC_IMPL_ADD (array, i, wcslen,
> > > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > > +                              && CPU_FEATURE_USABLE (AVX512BW)
> > > +                              && CPU_FEATURE_USABLE (BMI2)),
> > > +                             __wcslen_evex512)
> > >               IFUNC_IMPL_ADD (array, i, wcslen,
> > >                               CPU_FEATURE_USABLE (SSE4_1),
> > >                               __wcslen_sse4_1)
> > > @@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >                                && CPU_FEATURE_USABLE (AVX512BW)
> > >                                && CPU_FEATURE_USABLE (BMI2)),
> > >                               __wcsnlen_evex)
> > > +             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > > +                              && CPU_FEATURE_USABLE (AVX512BW)
> > > +                              && CPU_FEATURE_USABLE (BMI2)),
> > > +                             __wcsnlen_evex512)
> > >               IFUNC_IMPL_ADD (array, i, wcsnlen,
> > >                               CPU_FEATURE_USABLE (SSE4_1),
> > >                               __wcsnlen_sse4_1)
> > > diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
> > > new file mode 100644
> > > index 0000000000..13a6b34615
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
> > > @@ -0,0 +1,291 @@
> > > +/* Copyright (C) 2022 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +#if IS_IN (libc)
> > > +
> > > +# include <sysdep.h>
> > > +
> > > +# ifndef STRLEN
> > > +#  define STRLEN       __strlen_evex512
> > > +# endif
> > > +
> > > +# define VMOVA         vmovdqa64
> > > +# ifdef USE_AS_WCSLEN
> > > +#  define VPCMP                vpcmpd
> > > +#  define VPMINU       vpminud
> > > +#  define CHAR_SIZE    4
> > > +# else
> > > +#  define VPCMP                vpcmpb
> > > +#  define VPMINU       vpminub
> > > +#  define CHAR_SIZE    1
> > > +# endif
> > > +
> > > +# define XMM0          xmm16
> > > +# define ZMM0          zmm16
> > > +# define ZMM1          zmm17
> > > +# define ZMM2          zmm18
> > > +# define ZMM3          zmm19
> > > +# define ZMM4          zmm20
> > > +# define VEC_SIZE      64
> > > +# define PAGE_SIZE     4096
> > > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> >
> > Is it possible to integrate this file cleanly with the evex256 version?
> > Something similar to what we do for memset/memmove.
>
> Good suggestion, I will look into it. For the first iteration, let's
> keep it standalone
> for now.

Why? There are a fair number of functions. Given that evex/evex512
are just about 1-1 except for VEC_SIZE, we should try to integrate.
>
> > > +
> > > +       .section .text.evex512, "ax", @progbits
> > > +/* Aligning entry point to 64 byte, provides better performance for
> > > +   one vector length string.  */
> > > +ENTRY_P2ALIGN (STRLEN, 6)
> > > +# ifdef USE_AS_STRNLEN
> > > +        /* Check zero length.  */
> > > +       test    %RSI_LP, %RSI_LP
> > > +       jz      L(zero)
> > > +#  ifdef __ILP32__
> > > +       /* Clear the upper 32 bits.  */
> > > +       movl    %esi, %esi
> > > +#  endif
> > > +# endif
> > > +
> > > +       movl    %edi, %ecx
> > > +       vpxorq  %XMM0, %XMM0, %XMM0
> > > +       andl    $(PAGE_SIZE - 1), %ecx
> > > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> > > +       ja      L(page_cross)
> > > +
> > > +       /* Compare [w]char for null, mask bit will be set for match.  */
> > > +       VPCMP   $0, (%rdi), %ZMM0, %k0
> > > +       kmovq   %k0, %rax
> > > +       testq   %rax, %rax
> > > +       jz      L(align_more)
> > > +
> > > +       tzcntq  %rax, %rax
> > > +# ifdef USE_AS_STRNLEN
> > > +       cmpq    %rsi, %rax
> > cmpl
> >
> > > +       jae     L(ret_max)
> > > +# endif
> > > +       ret
> > > +
> > > +# ifdef USE_AS_STRNLEN
> > > +       /* eax instead of rax used to save encoding space.  */
> > > +L(zero):
> > > +       xorl    %eax, %eax
> > > +       ret
> > > +# endif
> > > +
> > > +       /* At this point vector max length reached.  */
> > > +# ifdef USE_AS_STRNLEN
> > > +L(ret_max):
> > > +       movq    %rsi, %rax
> > > +       ret
> > > +# endif
> > > +
> > > +L(page_cross):
> >
> > Imo unless you need the 2-byte encoding on the jump this should be at
> > the end of the file as it's expected to not be hot.
>
> One of my goals is to reduce size as much as possible, as long as it
> doesn't hurt performance.  Keeping the jump target nearby reduces
> size by a few bytes without hurting performance.

Fair enough, although the page cross cases are pretty cold. Putting cold
code in a hot region is a waste in a sense too.
>
> > > +       andl    $(VEC_SIZE - 1), %ecx
> > > +# ifdef USE_AS_WCSLEN
> > > +       sarl    $2, %ecx
> > > +# endif
> > > +       /* ecx contains number of w[char] to be skipped as a result
> > > +          of address alignment.  */
> > > +       movq    %rdi, %rax
> > > +       andq    $-VEC_SIZE, %rax
> > > +       VPCMP   $0, (%rax), %ZMM0, %k0
> > > +       kmovq   %k0, %rax
> > > +       /* Ignore number of character for alignment adjustment.  */
> > > +       shrq    %cl, %rax
> > > +       jz      L(align_more)
> > > +
> > > +       tzcntq  %rax, %rax
> > > +# ifdef USE_AS_STRNLEN
> > > +       cmpq    %rsi, %rax
> > > +       jae     L(ret_max)
> > > +# endif
> > > +       ret
> > > +
> > > +L(align_more):
> > > +       leaq    VEC_SIZE(%rdi), %rax
> > > +       /* Align rax to VEC_SIZE.  */
> > > +       andq    $-VEC_SIZE, %rax
> > > +# ifdef USE_AS_STRNLEN
> > > +       movq    %rax, %rdx
> > > +       subq    %rdi, %rdx
> > > +#  ifdef USE_AS_WCSLEN
> > > +       shrq    $2, %rdx
> > > +#  endif
> > > +       /* At this point rdx contains [w]chars already compared.  */
> > > +       cmpq    %rsi, %rdx
> > > +       jae     L(ret_max)
> > > +       subq    %rsi, %rdx
> > > +       negq    %rdx
> > > +       /* At this point rdx contains number of w[char] needs to go.
> > > +          Now onwards rdx will keep decrementing with each compare.  */
> > > +# endif
> > > +
> > > +       /* Loop unroll 4 times for 4 vector loop.  */
> > > +       VPCMP   $0, (%rax), %ZMM0, %k0
> > > +       kmovq   %k0, %rcx
> > > +       testq   %rcx, %rcx
> > > +       jnz     L(first_vector)
> >
> > Just to keep consistent with the other files can you
> > rename first_vector/second_vector... to ret_vec_x{N}
> > or something like that.
>
> Agree, will be fixed in v1.
>
> > > +
> > > +# ifdef USE_AS_STRNLEN
> > > +       subq    $CHAR_PER_VEC, %rdx
> > > +       jbe     L(ret_max)
> > > +# endif
> > > +
> > > +       VPCMP   $0, VEC_SIZE(%rax), %ZMM0, %k0
> > > +       kmovq   %k0, %rcx
> > > +       testq   %rcx, %rcx
> > > +       jnz     L(second_vector)
> > > +
> > > +# ifdef USE_AS_STRNLEN
> > > +       subq    $CHAR_PER_VEC, %rdx
> > > +       jbe     L(ret_max)
> > > +# endif
> >
> > The evex256 / avx2 versions do a simple check if we will be able
> > to do all 4 aligning compares w.o a branch. This saves total
> > branches. Why not do something similar here?
>
> Done this way to reduce size and complexity. If the branch is taken, we
> jump to the terminating condition; if not taken, there is no impact on perf.

Don't think that's quite true...
>
> > > +
> > > +       VPCMP   $0, (2 * VEC_SIZE)(%rax), %ZMM0, %k0
> > > +       kmovq   %k0, %rcx
> > > +       testq   %rcx, %rcx
> > > +       jnz     L(third_vector)
> > > +
> > > +# ifdef USE_AS_STRNLEN
> > > +       subq    $CHAR_PER_VEC, %rdx
> > > +       jbe     L(ret_max)
> > > +# endif
> > > +
> > > +       VPCMP   $0, (3 * VEC_SIZE)(%rax), %ZMM0, %k0
> > > +       kmovq   %k0, %rcx
> > > +       testq   %rcx, %rcx
> > > +       jnz     L(fourth_vector)
> > > +
> > > +       addq    $(4 * VEC_SIZE), %rax
> > > +
> > > +# ifdef USE_AS_STRNLEN
> > > +       /* Instead of decreasing, rdx increased to prepare for loop
> > > +          first iteration.  Incremented 3 times because one increment
> > > +          cancelled by previous decrement.  */
> > > +       addq    $(3 * CHAR_PER_VEC), %rdx
> > > +# endif
> > > +
> > > +       /* Test if address is already 4 * VEC_SIZE byte aligned goto
> > > +          loop.  */
> > > +       testq   $(3 * VEC_SIZE), %rax
> > > +       jz      L(loop)
> > > +
> > > +       movq    %rax, %rcx
> > > +
> > > +       /* Align address to 4 * VEC_SIZE for loop.  */
> > > +       andq    $-(4 * VEC_SIZE), %rax
> > > +
> > > +# ifdef USE_AS_STRNLEN
> > > +       subq    %rax, %rcx
> > > +#  ifdef USE_AS_WCSLEN
> > > +       sarq    $2, %rcx
> > > +#  endif
> > > +       /* rcx contains number of [w]char will be recompared due to
> > > +          alignment fixes.  rdx must be incremented by rcx to offset
> > > +          alignment adjustment.  */
> > > +       addq    %rcx, %rdx
> > > +# endif
> > > +
> > > +L(loop):
> > > +# ifdef USE_AS_STRNLEN
> > > +       subq    $(CHAR_PER_VEC * 4), %rdx
> > > +       jbe     L(ret_max)
> >
> > we have potential to overread by 255 bytes. Not a correctness issue because
> > we are page aligned, but seems like a possible perf issue.
>
> Correct, but the overread data will be read from cache, not memory, so the
> impact is not significant; this is the cost we have to pay for 4-vector
> alignment.

You can implement it so it has a last 4x case instead.
>
> > > +# endif
> > > +       /* VPMINU and VPCMP combination provides better performance as
> > > +          compared to alternative combinations.  */
> > > +       VMOVA   (%rax), %ZMM1
> > > +       VPMINU  (VEC_SIZE)(%rax), %ZMM1, %ZMM2
> > > +       VMOVA   (2 * VEC_SIZE)(%rax), %ZMM3
> > > +       VPMINU  (3 * VEC_SIZE)(%rax), %ZMM3, %ZMM4
> >
> > I think doing 4x in the main loop is probably overkill no?
> > Aligning to 256 is pretty extreme.
> >
> > Also I don't think the 4x zmm loads can even keep up with
> > 2x / cycle so seems like it may not be worth wasting up to
> > 255 bytes to get it.
>
> Perf numbers look good, so for now it should be ok.

Would prefer having a good final version.
>
> > > +
> > > +       VPCMP   $0, %ZMM2, %ZMM0, %k0
> > > +       VPCMP   $0, %ZMM4, %ZMM0, %k1
> > > +
> > > +       addq    $(4 * VEC_SIZE), %rax
> > > +       kortestq %k0, %k1
> > > +       jz      L(loop)
> > > +
> > > +       /* Need 4 vector subtraction because address incremented in
> > > +          the loop before terminating condition check.  Also want to
> > > +          reuse code for exit condition before and after the loop.  */
> > > +       subq    $(4 * VEC_SIZE), %rax
> > > +
> > > +       VPCMP   $0, %ZMM1, %ZMM0, %k2
> > > +       kmovq   %k2, %rcx
> > > +       testq   %rcx, %rcx
> > > +       jnz     L(first_vector)
> > > +
> > > +       kmovq   %k0, %rcx
> > > +       /* At this point, if k0 is non zero, null char must be in the
> > > +          second vector.  */
> > > +       testq   %rcx, %rcx
> > > +       jnz     L(second_vector)
> > > +
> > > +       VPCMP   $0, %ZMM3, %ZMM0, %k3
> > > +       kmovq   %k3, %rcx
> > > +       testq   %rcx, %rcx
> > > +       jnz     L(third_vector)
> > > +       /* At this point null [w]char must be in the fourth vector so no
> > > +          need to check.  */
> > > +       kmovq   %k1, %rcx
> > > +
> > > +       /* Termination fourth, third, second vector are pretty much
> > > +          same, implemented this way to avoid branching and reuse code
> > > +          from pre loop exit condition.  */
> > > +L(fourth_vector):
> > > +       addq    $(3 * VEC_SIZE), %rax
> > > +       tzcntq  %rcx, %rcx
> > > +       subq    %rdi, %rax
> > Can this be hoisted out to the beginning of L(aligned_more)?
> > It seems every return path uses it.
> >
>
> It really depends on where the control is coming from. So moving before
> align_more will not be correct, or I may be missing something here.

Is there any path from the *beginning* of L(aligned_more) that
doesn't go to either L(ret_max) or one of the 4 return statements?
>
> > > +# ifdef USE_AS_WCSLEN
> > > +       sarq    $2, %rax
> > > +# endif
> > > +       addq    %rcx, %rax
> >
> > if not wcslen probably faster to use lea instead of 2x add
>
> I'm not sure whether there will be any significant gain from lea vs. add.
> I used add because it's readily available on all ports.

AFAIK all machines we would enable evex512 on have fast LEA.
>
>
> >
> > > +# ifdef USE_AS_STRNLEN
> > > +       cmpq    %rsi, %rax
> > > +       jae     L(ret_max)
> > > +# endif
> > > +       ret
> > > +
> > > +L(third_vector):
> > > +       addq    $(2 * VEC_SIZE), %rax
> > > +       tzcntq  %rcx, %rcx
> > > +       subq    %rdi, %rax
> > > +# ifdef USE_AS_WCSLEN
> > > +       sarq    $2, %rax
> > > +# endif
> > > +       addq    %rcx, %rax
> > > +# ifdef USE_AS_STRNLEN
> > > +       cmpq    %rsi, %rax
> > > +       jae     L(ret_max)
> > > +# endif
> > > +       ret
> > > +
> > > +L(second_vector):
> > > +       addq    $VEC_SIZE, %rax
> > > +L(first_vector):
> > > +       tzcntq  %rcx, %rcx
> > > +       subq    %rdi, %rax
> > > +# ifdef USE_AS_WCSLEN
> > > +       sarq    $2, %rax
> > > +# endif
> > > +       addq    %rcx, %rax
> > > +# ifdef USE_AS_STRNLEN
> > > +       cmpq    %rsi, %rax
> > > +       jae     L(ret_max)
> > > +# endif
> > > +       ret
> > > +
> > > +END (STRLEN)
> > > +#endif
> > > diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > > new file mode 100644
> > > index 0000000000..0b7f220214
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > > @@ -0,0 +1,4 @@
> > > +#define STRLEN __strnlen_evex512
> > > +#define USE_AS_STRNLEN 1
> > > +
> > > +#include "strlen-evex512.S"
> > > diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> > > new file mode 100644
> > > index 0000000000..f59c372b78
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> > > @@ -0,0 +1,4 @@
> > > +#define STRLEN __wcslen_evex512
> > > +#define USE_AS_WCSLEN 1
> > > +
> > > +#include "strlen-evex512.S"
> > > diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > > new file mode 100644
> > > index 0000000000..73dcf2f210
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > > @@ -0,0 +1,5 @@
> > > +#define STRLEN __wcsnlen_evex512
> > > +#define USE_AS_WCSLEN 1
> > > +#define USE_AS_STRNLEN 1
> > > +
> > > +#include "strlen-evex512.S"
> > > --
> > > 2.35.3
> > >

Patch

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index f3ab5e0928..d0869c3ac3 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -81,6 +81,7 @@  sysdep_routines += \
   strlen-avx2 \
   strlen-avx2-rtm \
   strlen-evex \
+  strlen-evex512 \
   strlen-sse2 \
   strncase_l-avx2 \
   strncase_l-avx2-rtm \
@@ -105,6 +106,7 @@  sysdep_routines += \
   strnlen-avx2 \
   strnlen-avx2-rtm \
   strnlen-evex \
+  strnlen-evex512 \
   strnlen-sse2 \
   strpbrk-c \
   strpbrk-sse2 \
@@ -138,6 +140,7 @@  sysdep_routines += \
   wcslen-avx2 \
   wcslen-avx2-rtm \
   wcslen-evex \
+  wcslen-evex512 \
   wcslen-sse2 \
   wcslen-sse4_1 \
   wcsncmp-avx2 \
@@ -148,6 +151,7 @@  sysdep_routines += \
   wcsnlen-avx2-rtm \
   wcsnlen-c \
   wcsnlen-evex \
+  wcsnlen-evex512 \
   wcsnlen-sse4_1 \
   wcsrchr-avx2 \
   wcsrchr-avx2-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7218095430..c5cd9466fe 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -328,6 +328,11 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strlen_evex)
+	      IFUNC_IMPL_ADD (array, i, strlen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __strlen_evex512)
 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
@@ -346,6 +351,11 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strnlen_evex)
+	      IFUNC_IMPL_ADD (array, i, strnlen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __strnlen_evex512)
 	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
 
   /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
@@ -699,6 +709,11 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcslen_evex)
+	      IFUNC_IMPL_ADD (array, i, wcslen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wcslen_evex512)
 	      IFUNC_IMPL_ADD (array, i, wcslen,
 			      CPU_FEATURE_USABLE (SSE4_1),
 			      __wcslen_sse4_1)
@@ -720,6 +735,11 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcsnlen_evex)
+	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wcsnlen_evex512)
 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
 			      CPU_FEATURE_USABLE (SSE4_1),
 			      __wcsnlen_sse4_1)
diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
new file mode 100644
index 0000000000..13a6b34615
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
@@ -0,0 +1,291 @@ 
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRLEN
+#  define STRLEN	__strlen_evex512
+# endif
+
+# define VMOVA		vmovdqa64
+# ifdef USE_AS_WCSLEN
+#  define VPCMP		vpcmpd
+#  define VPMINU	vpminud
+#  define CHAR_SIZE	4
+# else
+#  define VPCMP		vpcmpb
+#  define VPMINU	vpminub
+#  define CHAR_SIZE	1
+# endif
+
+# define XMM0		xmm16
+# define ZMM0		zmm16
+# define ZMM1		zmm17
+# define ZMM2		zmm18
+# define ZMM3		zmm19
+# define ZMM4		zmm20
+# define VEC_SIZE	64
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+	.section .text.evex512, "ax", @progbits
+/* Aligning the entry point to 64 bytes provides better performance for
+   strings that fit in a single vector.  */
+ENTRY_P2ALIGN (STRLEN, 6)
+# ifdef USE_AS_STRNLEN
+	/* Check zero length.  */
+	test	%RSI_LP, %RSI_LP
+	jz	L(zero)
+#  ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%esi, %esi
+#  endif
+# endif
+
+	movl	%edi, %ecx
+	vpxorq	%XMM0, %XMM0, %XMM0
+	andl	$(PAGE_SIZE - 1), %ecx
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	ja	L(page_cross)
+
+	/* Compare each [w]char with null; mask bits are set on matches.  */
+	VPCMP	$0, (%rdi), %ZMM0, %k0
+	kmovq	%k0, %rax
+	testq	%rax, %rax
+	jz	L(align_more)
+
+	tzcntq	%rax, %rax
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	jae	L(ret_max)
+# endif
+	ret
+
+# ifdef USE_AS_STRNLEN
+	/* eax is used instead of rax to save encoding space.  */
+L(zero):
+	xorl	%eax, %eax
+	ret
+# endif
+
+	/* At this point the maximum length has been reached.  */
+# ifdef USE_AS_STRNLEN
+L(ret_max):
+	movq	%rsi, %rax
+	ret
+# endif
+
+L(page_cross):
+	andl	$(VEC_SIZE - 1), %ecx
+# ifdef USE_AS_WCSLEN
+	sarl	$2, %ecx
+# endif
+	/* ecx contains the number of [w]chars to be skipped as a result
+	   of address alignment.  */
+	movq	%rdi, %rax
+	andq	$-VEC_SIZE, %rax
+	VPCMP	$0, (%rax), %ZMM0, %k0
+	kmovq	%k0, %rax
+	/* Discard the mask bits for characters before the string start.  */
+	shrq	%cl, %rax
+	jz	L(align_more)
+
+	tzcntq	%rax, %rax
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	jae	L(ret_max)
+# endif
+	ret
+
+L(align_more):
+	leaq	VEC_SIZE(%rdi), %rax
+	/* Align rax to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rax
+# ifdef USE_AS_STRNLEN
+	movq	%rax, %rdx
+	subq	%rdi, %rdx
+#  ifdef USE_AS_WCSLEN
+	shrq	$2, %rdx
+#  endif
+	/* rdx now holds the number of [w]chars already compared.  */
+	cmpq	%rsi, %rdx
+	jae	L(ret_max)
+	subq	%rsi, %rdx
+	negq	%rdx
+	/* At this point rdx contains the number of [w]chars left to check.
+	   From now on rdx is decremented with each compare.  */
+# endif
+
+	/* Check four vectors individually before the 4 * VEC_SIZE loop.  */
+	VPCMP	$0, (%rax), %ZMM0, %k0
+	kmovq	%k0, %rcx
+	testq	%rcx, %rcx
+	jnz	L(first_vector)
+
+# ifdef USE_AS_STRNLEN
+	subq    $CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+# endif
+
+	VPCMP	$0, VEC_SIZE(%rax), %ZMM0, %k0
+	kmovq	%k0, %rcx
+	testq	%rcx, %rcx
+	jnz	L(second_vector)
+
+# ifdef USE_AS_STRNLEN
+	subq    $CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+# endif
+
+	VPCMP	$0, (2 * VEC_SIZE)(%rax), %ZMM0, %k0
+	kmovq	%k0, %rcx
+	testq	%rcx, %rcx
+	jnz	L(third_vector)
+
+# ifdef USE_AS_STRNLEN
+	subq    $CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+# endif
+
+	VPCMP	$0, (3 * VEC_SIZE)(%rax), %ZMM0, %k0
+	kmovq	%k0, %rcx
+	testq	%rcx, %rcx
+	jnz	L(fourth_vector)
+
+	addq    $(4 * VEC_SIZE), %rax
+
+# ifdef USE_AS_STRNLEN
+	/* rdx is increased here rather than decreased: it cancels the three
+	   earlier decrements, so the loop's initial subtraction of
+	   4 * CHAR_PER_VEC accounts for the four vectors checked above.  */
+	addq    $(3 * CHAR_PER_VEC), %rdx
+# endif
+
+	/* If the address is already 4 * VEC_SIZE aligned, go straight to
+	   the loop.  */
+	testq   $(3 * VEC_SIZE), %rax
+	jz      L(loop)
+
+	movq	%rax, %rcx
+
+	/* Align address to 4 * VEC_SIZE for loop.  */
+	andq	$-(4 * VEC_SIZE), %rax
+
+# ifdef USE_AS_STRNLEN
+	subq	%rax, %rcx
+#  ifdef USE_AS_WCSLEN
+	sarq	$2, %rcx
+#  endif
+	/* rcx contains the number of [w]chars that will be re-compared due
+	   to the alignment fix-up.  rdx must be incremented by rcx to offset
+	   the alignment adjustment.  */
+	addq	%rcx, %rdx
+# endif
+
+L(loop):
+# ifdef USE_AS_STRNLEN
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(ret_max)
+# endif
+	/* The VPMINU and VPCMP combination provides better performance
+	   than the alternative combinations.  */
+	VMOVA	(%rax), %ZMM1
+	VPMINU	(VEC_SIZE)(%rax), %ZMM1, %ZMM2
+	VMOVA   (2 * VEC_SIZE)(%rax), %ZMM3
+	VPMINU	(3 * VEC_SIZE)(%rax), %ZMM3, %ZMM4
+
+	VPCMP	$0, %ZMM2, %ZMM0, %k0
+	VPCMP	$0, %ZMM4, %ZMM0, %k1
+
+	addq	$(4 * VEC_SIZE), %rax
+	kortestq %k0, %k1
+	jz	L(loop)
+
+	/* Subtract 4 * VEC_SIZE because the address was incremented in the
+	   loop before the termination check.  This also lets the exit code
+	   be shared by the pre-loop and post-loop paths.  */
+	subq	$(4 * VEC_SIZE), %rax
+
+	VPCMP	$0, %ZMM1, %ZMM0, %k2
+	kmovq	%k2, %rcx
+	testq	%rcx, %rcx
+	jnz	L(first_vector)
+
+	kmovq	%k0, %rcx
+	/* At this point, if k0 is non-zero, the null char must be in the
+	   second vector.  */
+	testq	%rcx, %rcx
+	jnz	L(second_vector)
+
+	VPCMP	$0, %ZMM3, %ZMM0, %k3
+	kmovq	%k3, %rcx
+	testq	%rcx, %rcx
+	jnz	L(third_vector)
+	/* At this point the null [w]char must be in the fourth vector, so
+	   there is no need to check it.  */
+	kmovq	%k1, %rcx
+
+	/* The termination code for the fourth, third and second vectors is
+	   essentially the same; it is written this way to avoid branching
+	   and to reuse the pre-loop exit code.  */
+L(fourth_vector):
+	addq	$(3 * VEC_SIZE), %rax
+	tzcntq	%rcx, %rcx
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	sarq	$2, %rax
+# endif
+	addq	%rcx, %rax
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	jae	L(ret_max)
+# endif
+	ret
+
+L(third_vector):
+	addq	$(2 * VEC_SIZE), %rax
+	tzcntq	%rcx, %rcx
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	sarq	$2, %rax
+# endif
+	addq	%rcx, %rax
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	jae	L(ret_max)
+# endif
+	ret
+
+L(second_vector):
+	addq	$VEC_SIZE, %rax
+L(first_vector):
+	tzcntq	%rcx, %rcx
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	sarq	$2, %rax
+# endif
+	addq	%rcx, %rax
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	jae	L(ret_max)
+# endif
+	ret
+
+END (STRLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
new file mode 100644
index 0000000000..0b7f220214
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
@@ -0,0 +1,4 @@ 
+#define STRLEN __strnlen_evex512
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
new file mode 100644
index 0000000000..f59c372b78
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
@@ -0,0 +1,4 @@ 
+#define STRLEN __wcslen_evex512
+#define USE_AS_WCSLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
new file mode 100644
index 0000000000..73dcf2f210
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
@@ -0,0 +1,5 @@ 
+#define STRLEN __wcsnlen_evex512
+#define USE_AS_WCSLEN 1
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
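
For reference, the null-detection idea used in the main loop above, written
out for the byte (strlen) case with explanatory comments (sketch only; the
patch itself uses the VMOVA/VPMINU/VPCMP macros and %zmm16-%zmm20):

	/* A lane of the VPMINU result is zero iff the corresponding lane of
	   either input is zero, so one compare against zero covers two
	   vectors and one kortest covers all four.  */
	vmovdqa64	(%rax), %zmm1
	vpminub	VEC_SIZE(%rax), %zmm1, %zmm2
	vmovdqa64	(2 * VEC_SIZE)(%rax), %zmm3
	vpminub	(3 * VEC_SIZE)(%rax), %zmm3, %zmm4
	vpcmpb	$0, %zmm2, %zmm0, %k0	/* %zmm0 is all-zero.  */
	vpcmpb	$0, %zmm4, %zmm0, %k1
	addq	$(4 * VEC_SIZE), %rax
	kortestq	%k0, %k1	/* ZF set iff no null byte was found.  */
	jz	L(loop)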