
[v5,2/2] x86: Optimize strlen-avx2.S

Message ID 20210419233607.916848-2-goldstein.w.n@gmail.com
State New
Series [v5,1/2] x86: Optimize strlen-evex.S

Commit Message

Noah Goldstein April 19, 2021, 11:36 p.m. UTC
No bug. This commit optimizes strlen-avx2.S. The optimizations are
mostly small things but they add up to roughly 10-30% performance
improvement for strlen. The results for strnlen are a bit more
ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
are all passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |  16 +-
 sysdeps/x86_64/multiarch/strlen-avx2.S     | 532 +++++++++++++--------
 2 files changed, 334 insertions(+), 214 deletions(-)
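
For readers who want the core idea in C terms before reading the assembly
below: a rough sketch of the kind of per-vector check strlen-avx2.S is
built around. The function name and structure here are illustrative only,
not taken from the patch; compile with -mavx2 -mbmi.

#include <immintrin.h>
#include <stddef.h>

/* Position of the first NUL byte within one 32-byte vector starting at S,
   or 32 if there is none.  Assumes the 32 bytes at S are readable; the
   real code only does an unaligned load like this after its page-cross
   check.  */
static inline size_t
check_one_vec (const char *s)
{
  __m256i zero = _mm256_setzero_si256 ();
  __m256i chunk = _mm256_loadu_si256 ((const __m256i *) s);
  /* One mask bit per byte, set where the byte equals 0.  */
  unsigned int mask
    = (unsigned int) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (chunk, zero));
  /* tzcnt gives the index of the lowest set bit, or 32 when mask == 0,
     mirroring the vpcmpeqb/vpmovmskb/tzcntl sequence in the patch.  */
  return _tzcnt_u32 (mask);
}

The patch keeps this basic check and mostly reorganizes how the checks are
sequenced: page-cross handling for the first load, the 4x-vector main loop,
and the length bookkeeping for strnlen.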

Comments

H.J. Lu April 20, 2021, 1:01 a.m. UTC | #1
On Mon, Apr 19, 2021 at 4:36 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No bug. This commit optimizes strlen-avx2.S. The optimizations are
> mostly small things but they add up to roughly 10-30% performance
> improvement for strlen. The results for strnlen are bit more
> ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
> are all passing.
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  16 +-
>  sysdeps/x86_64/multiarch/strlen-avx2.S     | 532 +++++++++++++--------
>  2 files changed, 334 insertions(+), 214 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index c377cab629..651b32908e 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>    /* Support sysdeps/x86_64/multiarch/strlen.c.  */
>    IFUNC_IMPL (i, name, strlen,
>               IFUNC_IMPL_ADD (array, i, strlen,
> -                             CPU_FEATURE_USABLE (AVX2),
> +                             (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (BMI2)),
>                               __strlen_avx2)
>               IFUNC_IMPL_ADD (array, i, strlen,
>                               (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (BMI2)
>                                && CPU_FEATURE_USABLE (RTM)),
>                               __strlen_avx2_rtm)
>               IFUNC_IMPL_ADD (array, i, strlen,
> @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>    /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
>    IFUNC_IMPL (i, name, strnlen,
>               IFUNC_IMPL_ADD (array, i, strnlen,
> -                             CPU_FEATURE_USABLE (AVX2),
> +                             (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (BMI2)),
>                               __strnlen_avx2)
>               IFUNC_IMPL_ADD (array, i, strnlen,
>                               (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (BMI2)
>                                && CPU_FEATURE_USABLE (RTM)),
>                               __strnlen_avx2_rtm)
>               IFUNC_IMPL_ADD (array, i, strnlen,
> @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>    /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
>    IFUNC_IMPL (i, name, wcslen,
>               IFUNC_IMPL_ADD (array, i, wcslen,
> -                             CPU_FEATURE_USABLE (AVX2),
> +                             (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (BMI2)),
>                               __wcslen_avx2)
>               IFUNC_IMPL_ADD (array, i, wcslen,
>                               (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (BMI2)
>                                && CPU_FEATURE_USABLE (RTM)),
>                               __wcslen_avx2_rtm)
>               IFUNC_IMPL_ADD (array, i, wcslen,
> @@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>    /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
>    IFUNC_IMPL (i, name, wcsnlen,
>               IFUNC_IMPL_ADD (array, i, wcsnlen,
> -                             CPU_FEATURE_USABLE (AVX2),
> +                             (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (BMI2)),
>                               __wcsnlen_avx2)
>               IFUNC_IMPL_ADD (array, i, wcsnlen,
>                               (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (BMI2)
>                                && CPU_FEATURE_USABLE (RTM)),
>                               __wcsnlen_avx2_rtm)
>               IFUNC_IMPL_ADD (array, i, wcsnlen,
> diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
> index 1caae9e6bc..bd2e6ee44a 100644
> --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
> @@ -27,9 +27,11 @@
>  # ifdef USE_AS_WCSLEN
>  #  define VPCMPEQ      vpcmpeqd
>  #  define VPMINU       vpminud
> +#  define CHAR_SIZE    4
>  # else
>  #  define VPCMPEQ      vpcmpeqb
>  #  define VPMINU       vpminub
> +#  define CHAR_SIZE    1
>  # endif
>
>  # ifndef VZEROUPPER
> @@ -41,349 +43,459 @@
>  # endif
>
>  # define VEC_SIZE 32
> +# define PAGE_SIZE 4096
>
>         .section SECTION(.text),"ax",@progbits
>  ENTRY (STRLEN)
>  # ifdef USE_AS_STRNLEN
> -       /* Check for zero length.  */
> +       /* Check zero length.  */
>         test    %RSI_LP, %RSI_LP
>         jz      L(zero)
> +       /* Store max len in R8_LP before adjusting if using WCSLEN.  */
> +       mov     %RSI_LP, %R8_LP
>  #  ifdef USE_AS_WCSLEN
>         shl     $2, %RSI_LP
>  #  elif defined __ILP32__
>         /* Clear the upper 32 bits.  */
>         movl    %esi, %esi
>  #  endif
> -       mov     %RSI_LP, %R8_LP
>  # endif
> -       movl    %edi, %ecx
> +       movl    %edi, %eax
>         movq    %rdi, %rdx
>         vpxor   %xmm0, %xmm0, %xmm0
> -
> +       /* Clear high bits from edi. Only keeping bits relevant to page
> +          cross check.  */
> +       andl    $(PAGE_SIZE - 1), %eax
>         /* Check if we may cross page boundary with one vector load.  */
> -       andl    $(2 * VEC_SIZE - 1), %ecx
> -       cmpl    $VEC_SIZE, %ecx
> -       ja      L(cros_page_boundary)
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       ja      L(cross_page_boundary)
>
>         /* Check the first VEC_SIZE bytes.  */
> -       VPCMPEQ (%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> -       testl   %eax, %eax
> -
> +       VPCMPEQ (%rdi), %ymm0, %ymm1
> +       vpmovmskb       %ymm1, %eax
>  # ifdef USE_AS_STRNLEN
> -       jnz     L(first_vec_x0_check)
> -       /* Adjust length and check the end of data.  */
> -       subq    $VEC_SIZE, %rsi
> -       jbe     L(max)
> -# else
> -       jnz     L(first_vec_x0)
> +       /* If length < VEC_SIZE handle special.  */
> +       cmpq    $VEC_SIZE, %rsi
> +       jbe     L(first_vec_x0)
>  # endif
> -
> -       /* Align data for aligned loads in the loop.  */
> -       addq    $VEC_SIZE, %rdi
> -       andl    $(VEC_SIZE - 1), %ecx
> -       andq    $-VEC_SIZE, %rdi
> +       /* If empty continue to aligned_more. Otherwise return bit
> +          position of first match.  */
> +       testl   %eax, %eax
> +       jz      L(aligned_more)
> +       tzcntl  %eax, %eax
> +# ifdef USE_AS_WCSLEN
> +       shrl    $2, %eax
> +# endif
> +       VZEROUPPER_RETURN
>
>  # ifdef USE_AS_STRNLEN
> -       /* Adjust length.  */
> -       addq    %rcx, %rsi
> +L(zero):
> +       xorl    %eax, %eax
> +       ret
>
> -       subq    $(VEC_SIZE * 4), %rsi
> -       jbe     L(last_4x_vec_or_less)
> +       .p2align 4
> +L(first_vec_x0):
> +       /* Set bit for max len so that tzcnt will return min of max len
> +          and position of first match.  */
> +       btsq    %rsi, %rax
> +       tzcntl  %eax, %eax
> +#  ifdef USE_AS_WCSLEN
> +       shrl    $2, %eax
> +#  endif
> +       VZEROUPPER_RETURN
>  # endif
> -       jmp     L(more_4x_vec)
>
>         .p2align 4
> -L(cros_page_boundary):
> -       andl    $(VEC_SIZE - 1), %ecx
> -       andq    $-VEC_SIZE, %rdi
> -       VPCMPEQ (%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> -       /* Remove the leading bytes.  */
> -       sarl    %cl, %eax
> -       testl   %eax, %eax
> -       jz      L(aligned_more)
> +L(first_vec_x1):
>         tzcntl  %eax, %eax
> +       /* Safe to use 32 bit instructions as these are only called for
> +          size = [1, 159].  */
>  # ifdef USE_AS_STRNLEN
> -       /* Check the end of data.  */
> -       cmpq    %rax, %rsi
> -       jbe     L(max)
> +       /* Use ecx which was computed earlier to compute correct value.
> +        */
> +       subl    $(VEC_SIZE * 4 + 1), %ecx
> +       addl    %ecx, %eax
> +# else
> +       subl    %edx, %edi
> +       incl    %edi
> +       addl    %edi, %eax
>  # endif
> -       addq    %rdi, %rax
> -       addq    %rcx, %rax
> -       subq    %rdx, %rax
>  # ifdef USE_AS_WCSLEN
> -       shrq    $2, %rax
> +       shrl    $2, %eax
>  # endif
> -L(return_vzeroupper):
> -       ZERO_UPPER_VEC_REGISTERS_RETURN
> +       VZEROUPPER_RETURN
>
>         .p2align 4
> -L(aligned_more):
> +L(first_vec_x2):
> +       tzcntl  %eax, %eax
> +       /* Safe to use 32 bit instructions as these are only called for
> +          size = [1, 159].  */
>  # ifdef USE_AS_STRNLEN
> -        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
> -           with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
> -           to void possible addition overflow.  */
> -       negq    %rcx
> -       addq    $VEC_SIZE, %rcx
> -
> -       /* Check the end of data.  */
> -       subq    %rcx, %rsi
> -       jbe     L(max)
> +       /* Use ecx which was computed earlier to compute correct value.
> +        */
> +       subl    $(VEC_SIZE * 3 + 1), %ecx
> +       addl    %ecx, %eax
> +# else
> +       subl    %edx, %edi
> +       addl    $(VEC_SIZE + 1), %edi
> +       addl    %edi, %eax
>  # endif
> +# ifdef USE_AS_WCSLEN
> +       shrl    $2, %eax
> +# endif
> +       VZEROUPPER_RETURN
>
> -       addq    $VEC_SIZE, %rdi
> +       .p2align 4
> +L(first_vec_x3):
> +       tzcntl  %eax, %eax
> +       /* Safe to use 32 bit instructions as these are only called for
> +          size = [1, 159].  */
> +# ifdef USE_AS_STRNLEN
> +       /* Use ecx which was computed earlier to compute correct value.
> +        */
> +       subl    $(VEC_SIZE * 2 + 1), %ecx
> +       addl    %ecx, %eax
> +# else
> +       subl    %edx, %edi
> +       addl    $(VEC_SIZE * 2 + 1), %edi
> +       addl    %edi, %eax
> +# endif
> +# ifdef USE_AS_WCSLEN
> +       shrl    $2, %eax
> +# endif
> +       VZEROUPPER_RETURN
>
> +       .p2align 4
> +L(first_vec_x4):
> +       tzcntl  %eax, %eax
> +       /* Safe to use 32 bit instructions as these are only called for
> +          size = [1, 159].  */
>  # ifdef USE_AS_STRNLEN
> -       subq    $(VEC_SIZE * 4), %rsi
> -       jbe     L(last_4x_vec_or_less)
> +       /* Use ecx which was computed earlier to compute correct value.
> +        */
> +       subl    $(VEC_SIZE + 1), %ecx
> +       addl    %ecx, %eax
> +# else
> +       subl    %edx, %edi
> +       addl    $(VEC_SIZE * 3 + 1), %edi
> +       addl    %edi, %eax
>  # endif
> +# ifdef USE_AS_WCSLEN
> +       shrl    $2, %eax
> +# endif
> +       VZEROUPPER_RETURN
>
> -L(more_4x_vec):
> +       .p2align 5
> +L(aligned_more):
> +       /* Align data to VEC_SIZE - 1. This is the same number of
> +          instructions as using andq with -VEC_SIZE but saves 4 bytes of
> +          code on the x4 check.  */
> +       orq     $(VEC_SIZE - 1), %rdi
> +L(cross_page_continue):
>         /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
>            since data is only aligned to VEC_SIZE.  */
> -       VPCMPEQ (%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x0)
> -
> -       VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> +# ifdef USE_AS_STRNLEN
> +       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
> +          it simplies the logic in last_4x_vec_or_less.  */
> +       leaq    (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
> +       subq    %rdx, %rcx
> +# endif
> +       /* Load first VEC regardless.  */
> +       VPCMPEQ 1(%rdi), %ymm0, %ymm1
> +# ifdef USE_AS_STRNLEN
> +       /* Adjust length. If near end handle specially.  */
> +       subq    %rcx, %rsi
> +       jb      L(last_4x_vec_or_less)
> +# endif
> +       vpmovmskb       %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x1)
>
> -       VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> +       vpmovmskb       %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x2)
>
> -       VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> +       vpmovmskb       %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x3)
>
> -       addq    $(VEC_SIZE * 4), %rdi
> -
> -# ifdef USE_AS_STRNLEN
> -       subq    $(VEC_SIZE * 4), %rsi
> -       jbe     L(last_4x_vec_or_less)
> -# endif
> -
> -       /* Align data to 4 * VEC_SIZE.  */
> -       movq    %rdi, %rcx
> -       andl    $(4 * VEC_SIZE - 1), %ecx
> -       andq    $-(4 * VEC_SIZE), %rdi
> +       VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> +       vpmovmskb       %ymm1, %eax
> +       testl   %eax, %eax
> +       jnz     L(first_vec_x4)
>
> +       /* Align data to VEC_SIZE * 4 - 1.  */
>  # ifdef USE_AS_STRNLEN
> -       /* Adjust length.  */
> +       /* Before adjusting length check if at last VEC_SIZE * 4.  */
> +       cmpq    $(VEC_SIZE * 4 - 1), %rsi
> +       jbe     L(last_4x_vec_or_less_load)
> +       incq    %rdi
> +       movl    %edi, %ecx
> +       orq     $(VEC_SIZE * 4 - 1), %rdi
> +       andl    $(VEC_SIZE * 4 - 1), %ecx
> +       /* Readjust length.  */
>         addq    %rcx, %rsi
> +# else
> +       incq    %rdi
> +       orq     $(VEC_SIZE * 4 - 1), %rdi
>  # endif
> -
> +       /* Compare 4 * VEC at a time forward.  */
>         .p2align 4
>  L(loop_4x_vec):
> -       /* Compare 4 * VEC at a time forward.  */
> -       vmovdqa (%rdi), %ymm1
> -       vmovdqa VEC_SIZE(%rdi), %ymm2
> -       vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
> -       vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
> -       VPMINU  %ymm1, %ymm2, %ymm5
> -       VPMINU  %ymm3, %ymm4, %ymm6
> -       VPMINU  %ymm5, %ymm6, %ymm5
> -
> -       VPCMPEQ %ymm5, %ymm0, %ymm5
> -       vpmovmskb %ymm5, %eax
> -       testl   %eax, %eax
> -       jnz     L(4x_vec_end)
> -
> -       addq    $(VEC_SIZE * 4), %rdi
> -
> -# ifndef USE_AS_STRNLEN
> -       jmp     L(loop_4x_vec)
> -# else
> +# ifdef USE_AS_STRNLEN
> +       /* Break if at end of length.  */
>         subq    $(VEC_SIZE * 4), %rsi
> -       ja      L(loop_4x_vec)
> -
> -L(last_4x_vec_or_less):
> -       /* Less than 4 * VEC and aligned to VEC_SIZE.  */
> -       addl    $(VEC_SIZE * 2), %esi
> -       jle     L(last_2x_vec)
> +       jb      L(last_4x_vec_or_less_cmpeq)
> +# endif
> +       /* Save some code size by microfusing VPMINU with the load. Since
> +          the matches in ymm2/ymm4 can only be returned if there where no
> +          matches in ymm1/ymm3 respectively there is no issue with overlap.
> +        */
> +       vmovdqa 1(%rdi), %ymm1
> +       VPMINU  (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
> +       vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
> +       VPMINU  (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
> +
> +       VPMINU  %ymm2, %ymm4, %ymm5
> +       VPCMPEQ %ymm5, %ymm0, %ymm5
> +       vpmovmskb       %ymm5, %ecx
>
> -       VPCMPEQ (%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x0)
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       testl   %ecx, %ecx
> +       jz      L(loop_4x_vec)
>
> -       VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x1)
>
> -       VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       VPCMPEQ %ymm1, %ymm0, %ymm1
> +       vpmovmskb       %ymm1, %eax
> +       subq    %rdx, %rdi
>         testl   %eax, %eax
> +       jnz     L(last_vec_return_x0)
>
> -       jnz     L(first_vec_x2_check)
> -       subl    $VEC_SIZE, %esi
> -       jle     L(max)
> -
> -       VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       VPCMPEQ %ymm2, %ymm0, %ymm2
> +       vpmovmskb       %ymm2, %eax
>         testl   %eax, %eax
> -
> -       jnz     L(first_vec_x3_check)
> -       movq    %r8, %rax
> -#  ifdef USE_AS_WCSLEN
> +       jnz     L(last_vec_return_x1)
> +
> +       /* Combine last 2 VEC.  */
> +       VPCMPEQ %ymm3, %ymm0, %ymm3
> +       vpmovmskb       %ymm3, %eax
> +       /* rcx has combined result from all 4 VEC. It will only be used if
> +          the first 3 other VEC all did not contain a match.  */
> +       salq    $32, %rcx
> +       orq     %rcx, %rax
> +       tzcntq  %rax, %rax
> +       subq    $(VEC_SIZE * 2 - 1), %rdi
> +       addq    %rdi, %rax
> +# ifdef USE_AS_WCSLEN
>         shrq    $2, %rax
> -#  endif
> +# endif
>         VZEROUPPER_RETURN
>
> +
> +# ifdef USE_AS_STRNLEN
>         .p2align 4
> -L(last_2x_vec):
> -       addl    $(VEC_SIZE * 2), %esi
> -       VPCMPEQ (%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> -       testl   %eax, %eax
> +L(last_4x_vec_or_less_load):
> +       /* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
> +       subq    $-(VEC_SIZE * 4), %rdi
> +L(last_4x_vec_or_less_cmpeq):
> +       VPCMPEQ 1(%rdi), %ymm0, %ymm1
> +L(last_4x_vec_or_less):
>
> -       jnz     L(first_vec_x0_check)
> -       subl    $VEC_SIZE, %esi
> -       jle     L(max)
> +       vpmovmskb       %ymm1, %eax
> +       /* If remaining length > VEC_SIZE * 2. This works if esi is off by
> +          VEC_SIZE * 4.  */
> +       testl   $(VEC_SIZE * 2), %esi
> +       jnz     L(last_4x_vec)
>
> -       VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       /* length may have been negative or positive by an offset of
> +          VEC_SIZE * 4 depending on where this was called from. This fixes
> +          that.  */
> +       andl    $(VEC_SIZE * 4 - 1), %esi
>         testl   %eax, %eax
> -       jnz     L(first_vec_x1_check)
> -       movq    %r8, %rax
> -#  ifdef USE_AS_WCSLEN
> -       shrq    $2, %rax
> -#  endif
> -       VZEROUPPER_RETURN
> +       jnz     L(last_vec_x1_check)
>
> -       .p2align 4
> -L(first_vec_x0_check):
> +       subl    $VEC_SIZE, %esi
> +       jb      L(max)
> +
> +       VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> +       vpmovmskb       %ymm1, %eax
>         tzcntl  %eax, %eax
>         /* Check the end of data.  */
> -       cmpq    %rax, %rsi
> -       jbe     L(max)
> +       cmpl    %eax, %esi
> +       jb      L(max)
> +       subq    %rdx, %rdi
> +       addl    $(VEC_SIZE + 1), %eax
>         addq    %rdi, %rax
> -       subq    %rdx, %rax
>  #  ifdef USE_AS_WCSLEN
>         shrq    $2, %rax
>  #  endif
>         VZEROUPPER_RETURN
> +# endif
>
>         .p2align 4
> -L(first_vec_x1_check):
> +L(last_vec_return_x0):
>         tzcntl  %eax, %eax
> -       /* Check the end of data.  */
> -       cmpq    %rax, %rsi
> -       jbe     L(max)
> -       addq    $VEC_SIZE, %rax
> +       subq    $(VEC_SIZE * 4 - 1), %rdi
>         addq    %rdi, %rax
> -       subq    %rdx, %rax
> -#  ifdef USE_AS_WCSLEN
> +# ifdef USE_AS_WCSLEN
>         shrq    $2, %rax
> -#  endif
> +# endif
>         VZEROUPPER_RETURN
>
>         .p2align 4
> -L(first_vec_x2_check):
> +L(last_vec_return_x1):
>         tzcntl  %eax, %eax
> -       /* Check the end of data.  */
> -       cmpq    %rax, %rsi
> -       jbe     L(max)
> -       addq    $(VEC_SIZE * 2), %rax
> +       subq    $(VEC_SIZE * 3 - 1), %rdi
>         addq    %rdi, %rax
> -       subq    %rdx, %rax
> -#  ifdef USE_AS_WCSLEN
> +# ifdef USE_AS_WCSLEN
>         shrq    $2, %rax
> -#  endif
> +# endif
>         VZEROUPPER_RETURN
>
> +# ifdef USE_AS_STRNLEN
>         .p2align 4
> -L(first_vec_x3_check):
> +L(last_vec_x1_check):
> +
>         tzcntl  %eax, %eax
>         /* Check the end of data.  */
> -       cmpq    %rax, %rsi
> -       jbe     L(max)
> -       addq    $(VEC_SIZE * 3), %rax
> +       cmpl    %eax, %esi
> +       jb      L(max)
> +       subq    %rdx, %rdi
> +       incl    %eax
>         addq    %rdi, %rax
> -       subq    %rdx, %rax
>  #  ifdef USE_AS_WCSLEN
>         shrq    $2, %rax
>  #  endif
>         VZEROUPPER_RETURN
>
> -       .p2align 4
>  L(max):
>         movq    %r8, %rax
> +       VZEROUPPER_RETURN
> +
> +       .p2align 4
> +L(last_4x_vec):
> +       /* Test first 2x VEC normally.  */
> +       testl   %eax, %eax
> +       jnz     L(last_vec_x1)
> +
> +       VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> +       vpmovmskb       %ymm1, %eax
> +       testl   %eax, %eax
> +       jnz     L(last_vec_x2)
> +
> +       /* Normalize length.  */
> +       andl    $(VEC_SIZE * 4 - 1), %esi
> +       VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> +       vpmovmskb       %ymm1, %eax
> +       testl   %eax, %eax
> +       jnz     L(last_vec_x3)
> +
> +       subl    $(VEC_SIZE * 3), %esi
> +       jb      L(max)
> +
> +       VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> +       vpmovmskb       %ymm1, %eax
> +       tzcntl  %eax, %eax
> +       /* Check the end of data.  */
> +       cmpl    %eax, %esi
> +       jb      L(max)
> +       subq    %rdx, %rdi
> +       addl    $(VEC_SIZE * 3 + 1), %eax
> +       addq    %rdi, %rax
>  #  ifdef USE_AS_WCSLEN
>         shrq    $2, %rax
>  #  endif
>         VZEROUPPER_RETURN
>
> -       .p2align 4
> -L(zero):
> -       xorl    %eax, %eax
> -       ret
> -# endif
>
>         .p2align 4
> -L(first_vec_x0):
> +L(last_vec_x1):
> +       /* essentially duplicates of first_vec_x1 but use 64 bit
> +          instructions.  */
>         tzcntl  %eax, %eax
> +       subq    %rdx, %rdi
> +       incl    %eax
>         addq    %rdi, %rax
> -       subq    %rdx, %rax
> -# ifdef USE_AS_WCSLEN
> +#  ifdef USE_AS_WCSLEN
>         shrq    $2, %rax
> -# endif
> +#  endif
>         VZEROUPPER_RETURN
>
>         .p2align 4
> -L(first_vec_x1):
> +L(last_vec_x2):
> +       /* essentially duplicates of first_vec_x1 but use 64 bit
> +          instructions.  */
>         tzcntl  %eax, %eax
> -       addq    $VEC_SIZE, %rax
> +       subq    %rdx, %rdi
> +       addl    $(VEC_SIZE + 1), %eax
>         addq    %rdi, %rax
> -       subq    %rdx, %rax
> -# ifdef USE_AS_WCSLEN
> +#  ifdef USE_AS_WCSLEN
>         shrq    $2, %rax
> -# endif
> +#  endif
>         VZEROUPPER_RETURN
>
>         .p2align 4
> -L(first_vec_x2):
> +L(last_vec_x3):
>         tzcntl  %eax, %eax
> -       addq    $(VEC_SIZE * 2), %rax
> +       subl    $(VEC_SIZE * 2), %esi
> +       /* Check the end of data.  */
> +       cmpl    %eax, %esi
> +       jb      L(max_end)
> +       subq    %rdx, %rdi
> +       addl    $(VEC_SIZE * 2 + 1), %eax
>         addq    %rdi, %rax
> -       subq    %rdx, %rax
> -# ifdef USE_AS_WCSLEN
> +#  ifdef USE_AS_WCSLEN
>         shrq    $2, %rax
> -# endif
> +#  endif
> +       VZEROUPPER_RETURN
> +L(max_end):
> +       movq    %r8, %rax
>         VZEROUPPER_RETURN
> +# endif
>
> +       /* Cold case for crossing page with first load.  */
>         .p2align 4
> -L(4x_vec_end):
> -       VPCMPEQ %ymm1, %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x0)
> -       VPCMPEQ %ymm2, %ymm0, %ymm2
> -       vpmovmskb %ymm2, %eax
> +L(cross_page_boundary):
> +       /* Align data to VEC_SIZE - 1.  */
> +       orq     $(VEC_SIZE - 1), %rdi
> +       VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
> +       vpmovmskb       %ymm1, %eax
> +       /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> +          so no need to manually mod rdx.  */
> +       sarxl   %edx, %eax, %eax
> +# ifdef USE_AS_STRNLEN
>         testl   %eax, %eax
> -       jnz     L(first_vec_x1)
> -       VPCMPEQ %ymm3, %ymm0, %ymm3
> -       vpmovmskb %ymm3, %eax
> +       jnz     L(cross_page_less_vec)
> +       leaq    1(%rdi), %rcx
> +       subq    %rdx, %rcx
> +       /* Check length.  */
> +       cmpq    %rsi, %rcx
> +       jb      L(cross_page_continue)
> +       movq    %r8, %rax
> +# else
>         testl   %eax, %eax
> -       jnz     L(first_vec_x2)
> -       VPCMPEQ %ymm4, %ymm0, %ymm4
> -       vpmovmskb %ymm4, %eax
> -L(first_vec_x3):
> +       jz      L(cross_page_continue)
>         tzcntl  %eax, %eax
> -       addq    $(VEC_SIZE * 3), %rax
> -       addq    %rdi, %rax
> -       subq    %rdx, %rax
> -# ifdef USE_AS_WCSLEN
> -       shrq    $2, %rax
> +#  ifdef USE_AS_WCSLEN
> +       shrl    $2, %eax
> +#  endif
>  # endif
> +L(return_vzeroupper):
> +       ZERO_UPPER_VEC_REGISTERS_RETURN
> +
> +# ifdef USE_AS_STRNLEN
> +       .p2align 4
> +L(cross_page_less_vec):
> +       tzcntl  %eax, %eax
> +       cmpq    %rax, %rsi
> +       cmovb   %esi, %eax
> +#  ifdef USE_AS_WCSLEN
> +       shrl    $2, %eax
> +#  endif
>         VZEROUPPER_RETURN
> +# endif
>
>  END (STRLEN)
>  #endif
> --
> 2.29.2
>

LGTM.  I am checking it in for you.

Thanks.
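
One detail in the patch worth spelling out is the btsq/tzcnt pair in
L(first_vec_x0): setting the bit at the max-length position in the match
mask makes a single tzcnt return min(position of first NUL, maxlen). A
minimal C model of that idea (hypothetical names, not glibc code):

#include <stdint.h>
#include <stdio.h>

/* MATCH_MASK has one bit set per NUL byte found in the first vector;
   MAXLEN is at most 32, as in the short-strnlen path.  */
static unsigned int
min_of_match_and_maxlen (uint64_t match_mask, unsigned int maxlen)
{
  /* btsq %rsi, %rax: force a "match" bit at the max-length position.  */
  match_mask |= (uint64_t) 1 << maxlen;
  /* tzcnt: index of the lowest set bit.  */
  return (unsigned int) __builtin_ctzll (match_mask);
}

int
main (void)
{
  /* NUL at byte 5: maxlen 3 gives 3, maxlen 20 gives 5.  */
  printf ("%u %u\n", min_of_match_and_maxlen (1u << 5, 3),
          min_of_match_and_maxlen (1u << 5, 20));
  return 0;
}
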
Aurelien Jarno Sept. 25, 2022, 8:19 a.m. UTC | #2
On 2021-04-19 19:36, Noah Goldstein via Libc-alpha wrote:
> [...]
> +L(cross_page_boundary):
> +	/* Align data to VEC_SIZE - 1.  */
> +	orq	$(VEC_SIZE - 1), %rdi
> +	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
> +	vpmovmskb	%ymm1, %eax
> +	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> +	   so no need to manually mod rdx.  */
> +	sarxl	%edx, %eax, %eax

This is a BMI2 instruction, which is not necessarily available when AVX2
is available. This causes SIGILL on some CPUs. I have reported this in
https://sourceware.org/bugzilla/show_bug.cgi?id=29611

Regards
Aurelien
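
The ifunc-impl-list.c hunks in the patch do pair AVX2 with BMI2, but as the
report above shows, AVX2 alone does not guarantee BMI2, so anything that
selects this implementation needs to check both features. A standalone way
to see the distinction on a given machine, using the GCC/clang builtins
rather than glibc's internal CPU_FEATURE_USABLE machinery:

#include <stdio.h>

int
main (void)
{
  __builtin_cpu_init ();
  int avx2 = __builtin_cpu_supports ("avx2");
  int bmi2 = __builtin_cpu_supports ("bmi2");
  /* An AVX2 strlen variant that also executes sarx (a BMI2 instruction)
     is only safe when both features are present.  */
  printf ("avx2=%d bmi2=%d -> avx2+bmi2 strlen variant usable=%d\n",
          avx2, bmi2, avx2 && bmi2);
  return 0;
}
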
Noah Goldstein Sept. 25, 2022, 2 p.m. UTC | #3
On Sun, Sep 25, 2022 at 1:19 AM Aurelien Jarno <aurelien@aurel32.net> wrote:
>
> On 2021-04-19 19:36, Noah Goldstein via Libc-alpha wrote:
> > No bug. This commit optimizes strlen-avx2.S. The optimizations are
> > mostly small things but they add up to roughly 10-30% performance
> > improvement for strlen. The results for strnlen are bit more
> > ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
> > are all passing.
> >
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  16 +-
> >  sysdeps/x86_64/multiarch/strlen-avx2.S     | 532 +++++++++++++--------
> >  2 files changed, 334 insertions(+), 214 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index c377cab629..651b32908e 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >    /* Support sysdeps/x86_64/multiarch/strlen.c.  */
> >    IFUNC_IMPL (i, name, strlen,
> >             IFUNC_IMPL_ADD (array, i, strlen,
> > -                           CPU_FEATURE_USABLE (AVX2),
> > +                           (CPU_FEATURE_USABLE (AVX2)
> > +                            && CPU_FEATURE_USABLE (BMI2)),
> >                             __strlen_avx2)
> >             IFUNC_IMPL_ADD (array, i, strlen,
> >                             (CPU_FEATURE_USABLE (AVX2)
> > +                            && CPU_FEATURE_USABLE (BMI2)
> >                              && CPU_FEATURE_USABLE (RTM)),
> >                             __strlen_avx2_rtm)
> >             IFUNC_IMPL_ADD (array, i, strlen,
> > @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >    /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
> >    IFUNC_IMPL (i, name, strnlen,
> >             IFUNC_IMPL_ADD (array, i, strnlen,
> > -                           CPU_FEATURE_USABLE (AVX2),
> > +                           (CPU_FEATURE_USABLE (AVX2)
> > +                            && CPU_FEATURE_USABLE (BMI2)),
> >                             __strnlen_avx2)
> >             IFUNC_IMPL_ADD (array, i, strnlen,
> >                             (CPU_FEATURE_USABLE (AVX2)
> > +                            && CPU_FEATURE_USABLE (BMI2)
> >                              && CPU_FEATURE_USABLE (RTM)),
> >                             __strnlen_avx2_rtm)
> >             IFUNC_IMPL_ADD (array, i, strnlen,
> > @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >    /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
> >    IFUNC_IMPL (i, name, wcslen,
> >             IFUNC_IMPL_ADD (array, i, wcslen,
> > -                           CPU_FEATURE_USABLE (AVX2),
> > +                           (CPU_FEATURE_USABLE (AVX2)
> > +                            && CPU_FEATURE_USABLE (BMI2)),
> >                             __wcslen_avx2)
> >             IFUNC_IMPL_ADD (array, i, wcslen,
> >                             (CPU_FEATURE_USABLE (AVX2)
> > +                            && CPU_FEATURE_USABLE (BMI2)
> >                              && CPU_FEATURE_USABLE (RTM)),
> >                             __wcslen_avx2_rtm)
> >             IFUNC_IMPL_ADD (array, i, wcslen,
> > @@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >    /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
> >    IFUNC_IMPL (i, name, wcsnlen,
> >             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > -                           CPU_FEATURE_USABLE (AVX2),
> > +                           (CPU_FEATURE_USABLE (AVX2)
> > +                            && CPU_FEATURE_USABLE (BMI2)),
> >                             __wcsnlen_avx2)
> >             IFUNC_IMPL_ADD (array, i, wcsnlen,
> >                             (CPU_FEATURE_USABLE (AVX2)
> > +                            && CPU_FEATURE_USABLE (BMI2)
> >                              && CPU_FEATURE_USABLE (RTM)),
> >                             __wcsnlen_avx2_rtm)
> >             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
> > index 1caae9e6bc..bd2e6ee44a 100644
> > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
> > @@ -27,9 +27,11 @@
> >  # ifdef USE_AS_WCSLEN
> >  #  define VPCMPEQ    vpcmpeqd
> >  #  define VPMINU     vpminud
> > +#  define CHAR_SIZE  4
> >  # else
> >  #  define VPCMPEQ    vpcmpeqb
> >  #  define VPMINU     vpminub
> > +#  define CHAR_SIZE  1
> >  # endif
> >
> >  # ifndef VZEROUPPER
> > @@ -41,349 +43,459 @@
> >  # endif
> >
> >  # define VEC_SIZE 32
> > +# define PAGE_SIZE 4096
> >
> >       .section SECTION(.text),"ax",@progbits
> >  ENTRY (STRLEN)
> >  # ifdef USE_AS_STRNLEN
> > -     /* Check for zero length.  */
> > +     /* Check zero length.  */
> >       test    %RSI_LP, %RSI_LP
> >       jz      L(zero)
> > +     /* Store max len in R8_LP before adjusting if using WCSLEN.  */
> > +     mov     %RSI_LP, %R8_LP
> >  #  ifdef USE_AS_WCSLEN
> >       shl     $2, %RSI_LP
> >  #  elif defined __ILP32__
> >       /* Clear the upper 32 bits.  */
> >       movl    %esi, %esi
> >  #  endif
> > -     mov     %RSI_LP, %R8_LP
> >  # endif
> > -     movl    %edi, %ecx
> > +     movl    %edi, %eax
> >       movq    %rdi, %rdx
> >       vpxor   %xmm0, %xmm0, %xmm0
> > -
> > +     /* Clear high bits from edi. Only keeping bits relevant to page
> > +        cross check.  */
> > +     andl    $(PAGE_SIZE - 1), %eax
> >       /* Check if we may cross page boundary with one vector load.  */
> > -     andl    $(2 * VEC_SIZE - 1), %ecx
> > -     cmpl    $VEC_SIZE, %ecx
> > -     ja      L(cros_page_boundary)
> > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > +     ja      L(cross_page_boundary)
> >
> >       /* Check the first VEC_SIZE bytes.  */
> > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > -
> > +     VPCMPEQ (%rdi), %ymm0, %ymm1
> > +     vpmovmskb       %ymm1, %eax
> >  # ifdef USE_AS_STRNLEN
> > -     jnz     L(first_vec_x0_check)
> > -     /* Adjust length and check the end of data.  */
> > -     subq    $VEC_SIZE, %rsi
> > -     jbe     L(max)
> > -# else
> > -     jnz     L(first_vec_x0)
> > +     /* If length < VEC_SIZE handle special.  */
> > +     cmpq    $VEC_SIZE, %rsi
> > +     jbe     L(first_vec_x0)
> >  # endif
> > -
> > -     /* Align data for aligned loads in the loop.  */
> > -     addq    $VEC_SIZE, %rdi
> > -     andl    $(VEC_SIZE - 1), %ecx
> > -     andq    $-VEC_SIZE, %rdi
> > +     /* If empty continue to aligned_more. Otherwise return bit
> > +        position of first match.  */
> > +     testl   %eax, %eax
> > +     jz      L(aligned_more)
> > +     tzcntl  %eax, %eax
> > +# ifdef USE_AS_WCSLEN
> > +     shrl    $2, %eax
> > +# endif
> > +     VZEROUPPER_RETURN
> >
> >  # ifdef USE_AS_STRNLEN
> > -     /* Adjust length.  */
> > -     addq    %rcx, %rsi
> > +L(zero):
> > +     xorl    %eax, %eax
> > +     ret
> >
> > -     subq    $(VEC_SIZE * 4), %rsi
> > -     jbe     L(last_4x_vec_or_less)
> > +     .p2align 4
> > +L(first_vec_x0):
> > +     /* Set bit for max len so that tzcnt will return min of max len
> > +        and position of first match.  */
> > +     btsq    %rsi, %rax
> > +     tzcntl  %eax, %eax
> > +#  ifdef USE_AS_WCSLEN
> > +     shrl    $2, %eax
> > +#  endif
> > +     VZEROUPPER_RETURN
> >  # endif
> > -     jmp     L(more_4x_vec)
> >
> >       .p2align 4
> > -L(cros_page_boundary):
> > -     andl    $(VEC_SIZE - 1), %ecx
> > -     andq    $-VEC_SIZE, %rdi
> > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     /* Remove the leading bytes.  */
> > -     sarl    %cl, %eax
> > -     testl   %eax, %eax
> > -     jz      L(aligned_more)
> > +L(first_vec_x1):
> >       tzcntl  %eax, %eax
> > +     /* Safe to use 32 bit instructions as these are only called for
> > +        size = [1, 159].  */
> >  # ifdef USE_AS_STRNLEN
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rsi
> > -     jbe     L(max)
> > +     /* Use ecx which was computed earlier to compute correct value.
> > +      */
> > +     subl    $(VEC_SIZE * 4 + 1), %ecx
> > +     addl    %ecx, %eax
> > +# else
> > +     subl    %edx, %edi
> > +     incl    %edi
> > +     addl    %edi, %eax
> >  # endif
> > -     addq    %rdi, %rax
> > -     addq    %rcx, %rax
> > -     subq    %rdx, %rax
> >  # ifdef USE_AS_WCSLEN
> > -     shrq    $2, %rax
> > +     shrl    $2, %eax
> >  # endif
> > -L(return_vzeroupper):
> > -     ZERO_UPPER_VEC_REGISTERS_RETURN
> > +     VZEROUPPER_RETURN
> >
> >       .p2align 4
> > -L(aligned_more):
> > +L(first_vec_x2):
> > +     tzcntl  %eax, %eax
> > +     /* Safe to use 32 bit instructions as these are only called for
> > +        size = [1, 159].  */
> >  # ifdef USE_AS_STRNLEN
> > -        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
> > -         with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
> > -         to void possible addition overflow.  */
> > -     negq    %rcx
> > -     addq    $VEC_SIZE, %rcx
> > -
> > -     /* Check the end of data.  */
> > -     subq    %rcx, %rsi
> > -     jbe     L(max)
> > +     /* Use ecx which was computed earlier to compute correct value.
> > +      */
> > +     subl    $(VEC_SIZE * 3 + 1), %ecx
> > +     addl    %ecx, %eax
> > +# else
> > +     subl    %edx, %edi
> > +     addl    $(VEC_SIZE + 1), %edi
> > +     addl    %edi, %eax
> >  # endif
> > +# ifdef USE_AS_WCSLEN
> > +     shrl    $2, %eax
> > +# endif
> > +     VZEROUPPER_RETURN
> >
> > -     addq    $VEC_SIZE, %rdi
> > +     .p2align 4
> > +L(first_vec_x3):
> > +     tzcntl  %eax, %eax
> > +     /* Safe to use 32 bit instructions as these are only called for
> > +        size = [1, 159].  */
> > +# ifdef USE_AS_STRNLEN
> > +     /* Use ecx which was computed earlier to compute correct value.
> > +      */
> > +     subl    $(VEC_SIZE * 2 + 1), %ecx
> > +     addl    %ecx, %eax
> > +# else
> > +     subl    %edx, %edi
> > +     addl    $(VEC_SIZE * 2 + 1), %edi
> > +     addl    %edi, %eax
> > +# endif
> > +# ifdef USE_AS_WCSLEN
> > +     shrl    $2, %eax
> > +# endif
> > +     VZEROUPPER_RETURN
> >
> > +     .p2align 4
> > +L(first_vec_x4):
> > +     tzcntl  %eax, %eax
> > +     /* Safe to use 32 bit instructions as these are only called for
> > +        size = [1, 159].  */
> >  # ifdef USE_AS_STRNLEN
> > -     subq    $(VEC_SIZE * 4), %rsi
> > -     jbe     L(last_4x_vec_or_less)
> > +     /* Use ecx which was computed earlier to compute correct value.
> > +      */
> > +     subl    $(VEC_SIZE + 1), %ecx
> > +     addl    %ecx, %eax
> > +# else
> > +     subl    %edx, %edi
> > +     addl    $(VEC_SIZE * 3 + 1), %edi
> > +     addl    %edi, %eax
> >  # endif
> > +# ifdef USE_AS_WCSLEN
> > +     shrl    $2, %eax
> > +# endif
> > +     VZEROUPPER_RETURN
> >
> > -L(more_4x_vec):
> > +     .p2align 5
> > +L(aligned_more):
> > +     /* Align data to VEC_SIZE - 1. This is the same number of
> > +        instructions as using andq with -VEC_SIZE but saves 4 bytes of
> > +        code on the x4 check.  */
> > +     orq     $(VEC_SIZE - 1), %rdi
> > +L(cross_page_continue):
> >       /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> >          since data is only aligned to VEC_SIZE.  */
> > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x0)
> > -
> > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > +# ifdef USE_AS_STRNLEN
> > +     /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
> > +        it simplifies the logic in last_4x_vec_or_less.  */
> > +     leaq    (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
> > +     subq    %rdx, %rcx
> > +# endif
> > +     /* Load first VEC regardless.  */
> > +     VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > +# ifdef USE_AS_STRNLEN
> > +     /* Adjust length. If near end handle specially.  */
> > +     subq    %rcx, %rsi
> > +     jb      L(last_4x_vec_or_less)
> > +# endif
> > +     vpmovmskb       %ymm1, %eax
> >       testl   %eax, %eax
> >       jnz     L(first_vec_x1)
> >
> > -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb       %ymm1, %eax
> >       testl   %eax, %eax
> >       jnz     L(first_vec_x2)
> >
> > -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb       %ymm1, %eax
> >       testl   %eax, %eax
> >       jnz     L(first_vec_x3)
> >
> > -     addq    $(VEC_SIZE * 4), %rdi
> > -
> > -# ifdef USE_AS_STRNLEN
> > -     subq    $(VEC_SIZE * 4), %rsi
> > -     jbe     L(last_4x_vec_or_less)
> > -# endif
> > -
> > -     /* Align data to 4 * VEC_SIZE.  */
> > -     movq    %rdi, %rcx
> > -     andl    $(4 * VEC_SIZE - 1), %ecx
> > -     andq    $-(4 * VEC_SIZE), %rdi
> > +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb       %ymm1, %eax
> > +     testl   %eax, %eax
> > +     jnz     L(first_vec_x4)
> >
> > +     /* Align data to VEC_SIZE * 4 - 1.  */
> >  # ifdef USE_AS_STRNLEN
> > -     /* Adjust length.  */
> > +     /* Before adjusting length check if at last VEC_SIZE * 4.  */
> > +     cmpq    $(VEC_SIZE * 4 - 1), %rsi
> > +     jbe     L(last_4x_vec_or_less_load)
> > +     incq    %rdi
> > +     movl    %edi, %ecx
> > +     orq     $(VEC_SIZE * 4 - 1), %rdi
> > +     andl    $(VEC_SIZE * 4 - 1), %ecx
> > +     /* Readjust length.  */
> >       addq    %rcx, %rsi
> > +# else
> > +     incq    %rdi
> > +     orq     $(VEC_SIZE * 4 - 1), %rdi
> >  # endif
> > -
> > +     /* Compare 4 * VEC at a time forward.  */
> >       .p2align 4
> >  L(loop_4x_vec):
> > -     /* Compare 4 * VEC at a time forward.  */
> > -     vmovdqa (%rdi), %ymm1
> > -     vmovdqa VEC_SIZE(%rdi), %ymm2
> > -     vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
> > -     vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
> > -     VPMINU  %ymm1, %ymm2, %ymm5
> > -     VPMINU  %ymm3, %ymm4, %ymm6
> > -     VPMINU  %ymm5, %ymm6, %ymm5
> > -
> > -     VPCMPEQ %ymm5, %ymm0, %ymm5
> > -     vpmovmskb %ymm5, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(4x_vec_end)
> > -
> > -     addq    $(VEC_SIZE * 4), %rdi
> > -
> > -# ifndef USE_AS_STRNLEN
> > -     jmp     L(loop_4x_vec)
> > -# else
> > +# ifdef USE_AS_STRNLEN
> > +     /* Break if at end of length.  */
> >       subq    $(VEC_SIZE * 4), %rsi
> > -     ja      L(loop_4x_vec)
> > -
> > -L(last_4x_vec_or_less):
> > -     /* Less than 4 * VEC and aligned to VEC_SIZE.  */
> > -     addl    $(VEC_SIZE * 2), %esi
> > -     jle     L(last_2x_vec)
> > +     jb      L(last_4x_vec_or_less_cmpeq)
> > +# endif
> > +     /* Save some code size by microfusing VPMINU with the load. Since
> > +        the matches in ymm2/ymm4 can only be returned if there were no
> > +        matches in ymm1/ymm3 respectively, there is no issue with overlap.
> > +      */
> > +     vmovdqa 1(%rdi), %ymm1
> > +     VPMINU  (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
> > +     vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
> > +     VPMINU  (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
> > +
> > +     VPMINU  %ymm2, %ymm4, %ymm5
> > +     VPCMPEQ %ymm5, %ymm0, %ymm5
> > +     vpmovmskb       %ymm5, %ecx
> >
> > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x0)
> > +     subq    $-(VEC_SIZE * 4), %rdi
> > +     testl   %ecx, %ecx
> > +     jz      L(loop_4x_vec)
> >
> > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x1)
> >
> > -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > +     VPCMPEQ %ymm1, %ymm0, %ymm1
> > +     vpmovmskb       %ymm1, %eax
> > +     subq    %rdx, %rdi
> >       testl   %eax, %eax
> > +     jnz     L(last_vec_return_x0)
> >
> > -     jnz     L(first_vec_x2_check)
> > -     subl    $VEC_SIZE, %esi
> > -     jle     L(max)
> > -
> > -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > +     VPCMPEQ %ymm2, %ymm0, %ymm2
> > +     vpmovmskb       %ymm2, %eax
> >       testl   %eax, %eax
> > -
> > -     jnz     L(first_vec_x3_check)
> > -     movq    %r8, %rax
> > -#  ifdef USE_AS_WCSLEN
> > +     jnz     L(last_vec_return_x1)
> > +
> > +     /* Combine last 2 VEC.  */
> > +     VPCMPEQ %ymm3, %ymm0, %ymm3
> > +     vpmovmskb       %ymm3, %eax
> > +     /* rcx has the combined result from all 4 VEC. It will only be used
> > +        if none of the first 3 VEC contained a match.  */
> > +     salq    $32, %rcx
> > +     orq     %rcx, %rax
> > +     tzcntq  %rax, %rax
> > +     subq    $(VEC_SIZE * 2 - 1), %rdi
> > +     addq    %rdi, %rax
> > +# ifdef USE_AS_WCSLEN
> >       shrq    $2, %rax
> > -#  endif
> > +# endif
> >       VZEROUPPER_RETURN
> >
> > +
> > +# ifdef USE_AS_STRNLEN
> >       .p2align 4
> > -L(last_2x_vec):
> > -     addl    $(VEC_SIZE * 2), %esi
> > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > +L(last_4x_vec_or_less_load):
> > +     /* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
> > +     subq    $-(VEC_SIZE * 4), %rdi
> > +L(last_4x_vec_or_less_cmpeq):
> > +     VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > +L(last_4x_vec_or_less):
> >
> > -     jnz     L(first_vec_x0_check)
> > -     subl    $VEC_SIZE, %esi
> > -     jle     L(max)
> > +     vpmovmskb       %ymm1, %eax
> > +     /* If remaining length > VEC_SIZE * 2. This works if esi is off by
> > +        VEC_SIZE * 4.  */
> > +     testl   $(VEC_SIZE * 2), %esi
> > +     jnz     L(last_4x_vec)
> >
> > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > +     /* The length may be off by VEC_SIZE * 4 in either direction
> > +        depending on where this was called from. This fixes that.  */
> > +     andl    $(VEC_SIZE * 4 - 1), %esi
> >       testl   %eax, %eax
> > -     jnz     L(first_vec_x1_check)
> > -     movq    %r8, %rax
> > -#  ifdef USE_AS_WCSLEN
> > -     shrq    $2, %rax
> > -#  endif
> > -     VZEROUPPER_RETURN
> > +     jnz     L(last_vec_x1_check)
> >
> > -     .p2align 4
> > -L(first_vec_x0_check):
> > +     subl    $VEC_SIZE, %esi
> > +     jb      L(max)
> > +
> > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb       %ymm1, %eax
> >       tzcntl  %eax, %eax
> >       /* Check the end of data.  */
> > -     cmpq    %rax, %rsi
> > -     jbe     L(max)
> > +     cmpl    %eax, %esi
> > +     jb      L(max)
> > +     subq    %rdx, %rdi
> > +     addl    $(VEC_SIZE + 1), %eax
> >       addq    %rdi, %rax
> > -     subq    %rdx, %rax
> >  #  ifdef USE_AS_WCSLEN
> >       shrq    $2, %rax
> >  #  endif
> >       VZEROUPPER_RETURN
> > +# endif
> >
> >       .p2align 4
> > -L(first_vec_x1_check):
> > +L(last_vec_return_x0):
> >       tzcntl  %eax, %eax
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rsi
> > -     jbe     L(max)
> > -     addq    $VEC_SIZE, %rax
> > +     subq    $(VEC_SIZE * 4 - 1), %rdi
> >       addq    %rdi, %rax
> > -     subq    %rdx, %rax
> > -#  ifdef USE_AS_WCSLEN
> > +# ifdef USE_AS_WCSLEN
> >       shrq    $2, %rax
> > -#  endif
> > +# endif
> >       VZEROUPPER_RETURN
> >
> >       .p2align 4
> > -L(first_vec_x2_check):
> > +L(last_vec_return_x1):
> >       tzcntl  %eax, %eax
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rsi
> > -     jbe     L(max)
> > -     addq    $(VEC_SIZE * 2), %rax
> > +     subq    $(VEC_SIZE * 3 - 1), %rdi
> >       addq    %rdi, %rax
> > -     subq    %rdx, %rax
> > -#  ifdef USE_AS_WCSLEN
> > +# ifdef USE_AS_WCSLEN
> >       shrq    $2, %rax
> > -#  endif
> > +# endif
> >       VZEROUPPER_RETURN
> >
> > +# ifdef USE_AS_STRNLEN
> >       .p2align 4
> > -L(first_vec_x3_check):
> > +L(last_vec_x1_check):
> > +
> >       tzcntl  %eax, %eax
> >       /* Check the end of data.  */
> > -     cmpq    %rax, %rsi
> > -     jbe     L(max)
> > -     addq    $(VEC_SIZE * 3), %rax
> > +     cmpl    %eax, %esi
> > +     jb      L(max)
> > +     subq    %rdx, %rdi
> > +     incl    %eax
> >       addq    %rdi, %rax
> > -     subq    %rdx, %rax
> >  #  ifdef USE_AS_WCSLEN
> >       shrq    $2, %rax
> >  #  endif
> >       VZEROUPPER_RETURN
> >
> > -     .p2align 4
> >  L(max):
> >       movq    %r8, %rax
> > +     VZEROUPPER_RETURN
> > +
> > +     .p2align 4
> > +L(last_4x_vec):
> > +     /* Test first 2x VEC normally.  */
> > +     testl   %eax, %eax
> > +     jnz     L(last_vec_x1)
> > +
> > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb       %ymm1, %eax
> > +     testl   %eax, %eax
> > +     jnz     L(last_vec_x2)
> > +
> > +     /* Normalize length.  */
> > +     andl    $(VEC_SIZE * 4 - 1), %esi
> > +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb       %ymm1, %eax
> > +     testl   %eax, %eax
> > +     jnz     L(last_vec_x3)
> > +
> > +     subl    $(VEC_SIZE * 3), %esi
> > +     jb      L(max)
> > +
> > +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb       %ymm1, %eax
> > +     tzcntl  %eax, %eax
> > +     /* Check the end of data.  */
> > +     cmpl    %eax, %esi
> > +     jb      L(max)
> > +     subq    %rdx, %rdi
> > +     addl    $(VEC_SIZE * 3 + 1), %eax
> > +     addq    %rdi, %rax
> >  #  ifdef USE_AS_WCSLEN
> >       shrq    $2, %rax
> >  #  endif
> >       VZEROUPPER_RETURN
> >
> > -     .p2align 4
> > -L(zero):
> > -     xorl    %eax, %eax
> > -     ret
> > -# endif
> >
> >       .p2align 4
> > -L(first_vec_x0):
> > +L(last_vec_x1):
> > +     /* Essentially a duplicate of first_vec_x1 but using 64 bit
> > +        instructions.  */
> >       tzcntl  %eax, %eax
> > +     subq    %rdx, %rdi
> > +     incl    %eax
> >       addq    %rdi, %rax
> > -     subq    %rdx, %rax
> > -# ifdef USE_AS_WCSLEN
> > +#  ifdef USE_AS_WCSLEN
> >       shrq    $2, %rax
> > -# endif
> > +#  endif
> >       VZEROUPPER_RETURN
> >
> >       .p2align 4
> > -L(first_vec_x1):
> > +L(last_vec_x2):
> > +     /* Essentially a duplicate of first_vec_x1 but using 64 bit
> > +        instructions.  */
> >       tzcntl  %eax, %eax
> > -     addq    $VEC_SIZE, %rax
> > +     subq    %rdx, %rdi
> > +     addl    $(VEC_SIZE + 1), %eax
> >       addq    %rdi, %rax
> > -     subq    %rdx, %rax
> > -# ifdef USE_AS_WCSLEN
> > +#  ifdef USE_AS_WCSLEN
> >       shrq    $2, %rax
> > -# endif
> > +#  endif
> >       VZEROUPPER_RETURN
> >
> >       .p2align 4
> > -L(first_vec_x2):
> > +L(last_vec_x3):
> >       tzcntl  %eax, %eax
> > -     addq    $(VEC_SIZE * 2), %rax
> > +     subl    $(VEC_SIZE * 2), %esi
> > +     /* Check the end of data.  */
> > +     cmpl    %eax, %esi
> > +     jb      L(max_end)
> > +     subq    %rdx, %rdi
> > +     addl    $(VEC_SIZE * 2 + 1), %eax
> >       addq    %rdi, %rax
> > -     subq    %rdx, %rax
> > -# ifdef USE_AS_WCSLEN
> > +#  ifdef USE_AS_WCSLEN
> >       shrq    $2, %rax
> > -# endif
> > +#  endif
> > +     VZEROUPPER_RETURN
> > +L(max_end):
> > +     movq    %r8, %rax
> >       VZEROUPPER_RETURN
> > +# endif
> >
> > +     /* Cold case for crossing page with first load.  */
> >       .p2align 4
> > -L(4x_vec_end):
> > -     VPCMPEQ %ymm1, %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x0)
> > -     VPCMPEQ %ymm2, %ymm0, %ymm2
> > -     vpmovmskb %ymm2, %eax
> > +L(cross_page_boundary):
> > +     /* Align data to VEC_SIZE - 1.  */
> > +     orq     $(VEC_SIZE - 1), %rdi
> > +     VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb       %ymm1, %eax
> > +     /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> > +        so no need to manually mod rdx.  */
> > +     sarxl   %edx, %eax, %eax
>
> This is a BMI2 instruction, which is not necessarily available when AVX2
> is available. This causes SIGILL on some CPUs. I have reported that in
> https://sourceware.org/bugzilla/show_bug.cgi?id=29611

This is not a bug on master as:

commit 83c5b368226c34a2f0a5287df40fc290b2b34359
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Mon Apr 19 10:45:07 2021 -0700

    x86-64: Require BMI2 for strchr-avx2.S

is already in tree. The issue is that the AVX2 changes were backported
without H.J.'s commit.
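
A quick way to tell whether a given machine is in the affected class (AVX2
usable but BMI2 not) is a small standalone check. This is only an
illustration using GCC's __builtin_cpu_supports; the actual fix is requiring
BMI2 alongside AVX2 in the ifunc tables, as this patch's ifunc-impl-list
hunk does:

#include <stdio.h>

int
main (void)
{
  /* __builtin_cpu_supports reports what the hardware advertises; glibc's
     selector additionally checks that the feature is usable.  */
  int avx2 = __builtin_cpu_supports ("avx2");
  int bmi2 = __builtin_cpu_supports ("bmi2");

  printf ("AVX2: %d  BMI2: %d\n", avx2, bmi2);
  if (avx2 && !bmi2)
    puts ("AVX2 without BMI2: an unguarded __strlen_avx2 could hit the "
          "sarx SIGILL from BZ 29611 here.");
  else
    puts ("Not in the affected class.");
  return 0;
}
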
>
> Regards
> Aurelien
>
> --
> Aurelien Jarno                          GPG: 4096R/1DDD8C9B
> aurelien@aurel32.net                 http://www.aurel32.net
Sunil Pandey Sept. 28, 2022, 1:54 p.m. UTC | #4
Attached patch fixes BZ# 29611.

I would like to backport it to 2.32, 2.31, 2.30, and 2.29. Let me know
if there is any objection.


Darren Tristano Sept. 28, 2022, 2:02 p.m. UTC | #5
Please remove me from this thread. I should not be on it.
Noah Goldstein Sept. 28, 2022, 2:42 p.m. UTC | #6
On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> Attached patch fixes BZ# 29611.
>
> I would like to backport it to 2.32, 2.31, 2.30, and 2.29. Let me know
> if there is any objection.
The ifunc-impl-list changes are missing the BMI2 requirement for memchr-avx2.S.
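
Roughly, those entries need the same shape as the strlen/strnlen hunks in
this patch. A sketch only (the exact context in
sysdeps/x86_64/multiarch/ifunc-impl-list.c differs between release
branches):

  IFUNC_IMPL_ADD (array, i, memchr,
                  (CPU_FEATURE_USABLE (AVX2)
                   && CPU_FEATURE_USABLE (BMI2)),
                  __memchr_avx2)
  IFUNC_IMPL_ADD (array, i, memchr,
                  (CPU_FEATURE_USABLE (AVX2)
                   && CPU_FEATURE_USABLE (BMI2)
                   && CPU_FEATURE_USABLE (RTM)),
                  __memchr_avx2_rtm)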

Can you post these as separate emails with the patches embedded instead of
attached?

>
>
> On Sun, Sep 25, 2022 at 7:00 AM Noah Goldstein via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > On Sun, Sep 25, 2022 at 1:19 AM Aurelien Jarno <aurelien@aurel32.net> wrote:
> > >
> > > On 2021-04-19 19:36, Noah Goldstein via Libc-alpha wrote:
> > > > No bug. This commit optimizes strlen-avx2.S. The optimizations are
> > > > mostly small things but they add up to roughly 10-30% performance
> > > > improvement for strlen. The results for strnlen are bit more
> > > > ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
> > > > are all passing.
> > > >
> > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > > > ---
> > > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  16 +-
> > > >  sysdeps/x86_64/multiarch/strlen-avx2.S     | 532 +++++++++++++--------
> > > >  2 files changed, 334 insertions(+), 214 deletions(-)
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > index c377cab629..651b32908e 100644
> > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > >    /* Support sysdeps/x86_64/multiarch/strlen.c.  */
> > > >    IFUNC_IMPL (i, name, strlen,
> > > >             IFUNC_IMPL_ADD (array, i, strlen,
> > > > -                           CPU_FEATURE_USABLE (AVX2),
> > > > +                           (CPU_FEATURE_USABLE (AVX2)
> > > > +                            && CPU_FEATURE_USABLE (BMI2)),
> > > >                             __strlen_avx2)
> > > >             IFUNC_IMPL_ADD (array, i, strlen,
> > > >                             (CPU_FEATURE_USABLE (AVX2)
> > > > +                            && CPU_FEATURE_USABLE (BMI2)
> > > >                              && CPU_FEATURE_USABLE (RTM)),
> > > >                             __strlen_avx2_rtm)
> > > >             IFUNC_IMPL_ADD (array, i, strlen,
> > > > @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > >    /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
> > > >    IFUNC_IMPL (i, name, strnlen,
> > > >             IFUNC_IMPL_ADD (array, i, strnlen,
> > > > -                           CPU_FEATURE_USABLE (AVX2),
> > > > +                           (CPU_FEATURE_USABLE (AVX2)
> > > > +                            && CPU_FEATURE_USABLE (BMI2)),
> > > >                             __strnlen_avx2)
> > > >             IFUNC_IMPL_ADD (array, i, strnlen,
> > > >                             (CPU_FEATURE_USABLE (AVX2)
> > > > +                            && CPU_FEATURE_USABLE (BMI2)
> > > >                              && CPU_FEATURE_USABLE (RTM)),
> > > >                             __strnlen_avx2_rtm)
> > > >             IFUNC_IMPL_ADD (array, i, strnlen,
> > > > @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > >    /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
> > > >    IFUNC_IMPL (i, name, wcslen,
> > > >             IFUNC_IMPL_ADD (array, i, wcslen,
> > > > -                           CPU_FEATURE_USABLE (AVX2),
> > > > +                           (CPU_FEATURE_USABLE (AVX2)
> > > > +                            && CPU_FEATURE_USABLE (BMI2)),
> > > >                             __wcslen_avx2)
> > > >             IFUNC_IMPL_ADD (array, i, wcslen,
> > > >                             (CPU_FEATURE_USABLE (AVX2)
> > > > +                            && CPU_FEATURE_USABLE (BMI2)
> > > >                              && CPU_FEATURE_USABLE (RTM)),
> > > >                             __wcslen_avx2_rtm)
> > > >             IFUNC_IMPL_ADD (array, i, wcslen,
> > > > @@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > >    /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
> > > >    IFUNC_IMPL (i, name, wcsnlen,
> > > >             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > > > -                           CPU_FEATURE_USABLE (AVX2),
> > > > +                           (CPU_FEATURE_USABLE (AVX2)
> > > > +                            && CPU_FEATURE_USABLE (BMI2)),
> > > >                             __wcsnlen_avx2)
> > > >             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > > >                             (CPU_FEATURE_USABLE (AVX2)
> > > > +                            && CPU_FEATURE_USABLE (BMI2)
> > > >                              && CPU_FEATURE_USABLE (RTM)),
> > > >                             __wcsnlen_avx2_rtm)
> > > >             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > > > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
> > > > index 1caae9e6bc..bd2e6ee44a 100644
> > > > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
> > > > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
> > > > @@ -27,9 +27,11 @@
> > > >  # ifdef USE_AS_WCSLEN
> > > >  #  define VPCMPEQ    vpcmpeqd
> > > >  #  define VPMINU     vpminud
> > > > +#  define CHAR_SIZE  4
> > > >  # else
> > > >  #  define VPCMPEQ    vpcmpeqb
> > > >  #  define VPMINU     vpminub
> > > > +#  define CHAR_SIZE  1
> > > >  # endif
> > > >
> > > >  # ifndef VZEROUPPER
> > > > @@ -41,349 +43,459 @@
> > > >  # endif
> > > >
> > > >  # define VEC_SIZE 32
> > > > +# define PAGE_SIZE 4096
> > > >
> > > >       .section SECTION(.text),"ax",@progbits
> > > >  ENTRY (STRLEN)
> > > >  # ifdef USE_AS_STRNLEN
> > > > -     /* Check for zero length.  */
> > > > +     /* Check zero length.  */
> > > >       test    %RSI_LP, %RSI_LP
> > > >       jz      L(zero)
> > > > +     /* Store max len in R8_LP before adjusting if using WCSLEN.  */
> > > > +     mov     %RSI_LP, %R8_LP
> > > >  #  ifdef USE_AS_WCSLEN
> > > >       shl     $2, %RSI_LP
> > > >  #  elif defined __ILP32__
> > > >       /* Clear the upper 32 bits.  */
> > > >       movl    %esi, %esi
> > > >  #  endif
> > > > -     mov     %RSI_LP, %R8_LP
> > > >  # endif
> > > > -     movl    %edi, %ecx
> > > > +     movl    %edi, %eax
> > > >       movq    %rdi, %rdx
> > > >       vpxor   %xmm0, %xmm0, %xmm0
> > > > -
> > > > +     /* Clear high bits from edi. Only keeping bits relevant to page
> > > > +        cross check.  */
> > > > +     andl    $(PAGE_SIZE - 1), %eax
> > > >       /* Check if we may cross page boundary with one vector load.  */
> > > > -     andl    $(2 * VEC_SIZE - 1), %ecx
> > > > -     cmpl    $VEC_SIZE, %ecx
> > > > -     ja      L(cros_page_boundary)
> > > > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > > > +     ja      L(cross_page_boundary)
> > > >
> > > >       /* Check the first VEC_SIZE bytes.  */
> > > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > > -     vpmovmskb %ymm1, %eax
> > > > -     testl   %eax, %eax
> > > > -
> > > > +     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > > +     vpmovmskb       %ymm1, %eax
> > > >  # ifdef USE_AS_STRNLEN
> > > > -     jnz     L(first_vec_x0_check)
> > > > -     /* Adjust length and check the end of data.  */
> > > > -     subq    $VEC_SIZE, %rsi
> > > > -     jbe     L(max)
> > > > -# else
> > > > -     jnz     L(first_vec_x0)
> > > > +     /* If length < VEC_SIZE handle special.  */
> > > > +     cmpq    $VEC_SIZE, %rsi
> > > > +     jbe     L(first_vec_x0)
> > > >  # endif
> > > > -
> > > > -     /* Align data for aligned loads in the loop.  */
> > > > -     addq    $VEC_SIZE, %rdi
> > > > -     andl    $(VEC_SIZE - 1), %ecx
> > > > -     andq    $-VEC_SIZE, %rdi
> > > > +     /* If empty continue to aligned_more. Otherwise return bit
> > > > +        position of first match.  */
> > > > +     testl   %eax, %eax
> > > > +     jz      L(aligned_more)
> > > > +     tzcntl  %eax, %eax
> > > > +# ifdef USE_AS_WCSLEN
> > > > +     shrl    $2, %eax
> > > > +# endif
> > > > +     VZEROUPPER_RETURN
> > > >
> > > >  # ifdef USE_AS_STRNLEN
> > > > -     /* Adjust length.  */
> > > > -     addq    %rcx, %rsi
> > > > +L(zero):
> > > > +     xorl    %eax, %eax
> > > > +     ret
> > > >
> > > > -     subq    $(VEC_SIZE * 4), %rsi
> > > > -     jbe     L(last_4x_vec_or_less)
> > > > +     .p2align 4
> > > > +L(first_vec_x0):
> > > > +     /* Set bit for max len so that tzcnt will return min of max len
> > > > +        and position of first match.  */
> > > > +     btsq    %rsi, %rax
> > > > +     tzcntl  %eax, %eax
> > > > +#  ifdef USE_AS_WCSLEN
> > > > +     shrl    $2, %eax
> > > > +#  endif
> > > > +     VZEROUPPER_RETURN
> > > >  # endif
> > > > -     jmp     L(more_4x_vec)
> > > >
> > > >       .p2align 4
> > > > -L(cros_page_boundary):
> > > > -     andl    $(VEC_SIZE - 1), %ecx
> > > > -     andq    $-VEC_SIZE, %rdi
> > > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > > -     vpmovmskb %ymm1, %eax
> > > > -     /* Remove the leading bytes.  */
> > > > -     sarl    %cl, %eax
> > > > -     testl   %eax, %eax
> > > > -     jz      L(aligned_more)
> > > > +L(first_vec_x1):
> > > >       tzcntl  %eax, %eax
> > > > +     /* Safe to use 32 bit instructions as these are only called for
> > > > +        size = [1, 159].  */
> > > >  # ifdef USE_AS_STRNLEN
> > > > -     /* Check the end of data.  */
> > > > -     cmpq    %rax, %rsi
> > > > -     jbe     L(max)
> > > > +     /* Use ecx which was computed earlier to compute correct value.
> > > > +      */
> > > > +     subl    $(VEC_SIZE * 4 + 1), %ecx
> > > > +     addl    %ecx, %eax
> > > > +# else
> > > > +     subl    %edx, %edi
> > > > +     incl    %edi
> > > > +     addl    %edi, %eax
> > > >  # endif
> > > > -     addq    %rdi, %rax
> > > > -     addq    %rcx, %rax
> > > > -     subq    %rdx, %rax
> > > >  # ifdef USE_AS_WCSLEN
> > > > -     shrq    $2, %rax
> > > > +     shrl    $2, %eax
> > > >  # endif
> > > > -L(return_vzeroupper):
> > > > -     ZERO_UPPER_VEC_REGISTERS_RETURN
> > > > +     VZEROUPPER_RETURN
> > > >
> > > >       .p2align 4
> > > > -L(aligned_more):
> > > > +L(first_vec_x2):
> > > > +     tzcntl  %eax, %eax
> > > > +     /* Safe to use 32 bit instructions as these are only called for
> > > > +        size = [1, 159].  */
> > > >  # ifdef USE_AS_STRNLEN
> > > > -        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
> > > > -         with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
> > > > -         to void possible addition overflow.  */
> > > > -     negq    %rcx
> > > > -     addq    $VEC_SIZE, %rcx
> > > > -
> > > > -     /* Check the end of data.  */
> > > > -     subq    %rcx, %rsi
> > > > -     jbe     L(max)
> > > > +     /* Use ecx which was computed earlier to compute correct value.
> > > > +      */
> > > > +     subl    $(VEC_SIZE * 3 + 1), %ecx
> > > > +     addl    %ecx, %eax
> > > > +# else
> > > > +     subl    %edx, %edi
> > > > +     addl    $(VEC_SIZE + 1), %edi
> > > > +     addl    %edi, %eax
> > > >  # endif
> > > > +# ifdef USE_AS_WCSLEN
> > > > +     shrl    $2, %eax
> > > > +# endif
> > > > +     VZEROUPPER_RETURN
> > > >
> > > > -     addq    $VEC_SIZE, %rdi
> > > > +     .p2align 4
> > > > +L(first_vec_x3):
> > > > +     tzcntl  %eax, %eax
> > > > +     /* Safe to use 32 bit instructions as these are only called for
> > > > +        size = [1, 159].  */
> > > > +# ifdef USE_AS_STRNLEN
> > > > +     /* Use ecx which was computed earlier to compute correct value.
> > > > +      */
> > > > +     subl    $(VEC_SIZE * 2 + 1), %ecx
> > > > +     addl    %ecx, %eax
> > > > +# else
> > > > +     subl    %edx, %edi
> > > > +     addl    $(VEC_SIZE * 2 + 1), %edi
> > > > +     addl    %edi, %eax
> > > > +# endif
> > > > +# ifdef USE_AS_WCSLEN
> > > > +     shrl    $2, %eax
> > > > +# endif
> > > > +     VZEROUPPER_RETURN
> > > >
> > > > +     .p2align 4
> > > > +L(first_vec_x4):
> > > > +     tzcntl  %eax, %eax
> > > > +     /* Safe to use 32 bit instructions as these are only called for
> > > > +        size = [1, 159].  */
> > > >  # ifdef USE_AS_STRNLEN
> > > > -     subq    $(VEC_SIZE * 4), %rsi
> > > > -     jbe     L(last_4x_vec_or_less)
> > > > +     /* Use ecx which was computed earlier to compute correct value.
> > > > +      */
> > > > +     subl    $(VEC_SIZE + 1), %ecx
> > > > +     addl    %ecx, %eax
> > > > +# else
> > > > +     subl    %edx, %edi
> > > > +     addl    $(VEC_SIZE * 3 + 1), %edi
> > > > +     addl    %edi, %eax
> > > >  # endif
> > > > +# ifdef USE_AS_WCSLEN
> > > > +     shrl    $2, %eax
> > > > +# endif
> > > > +     VZEROUPPER_RETURN
> > > >
> > > > -L(more_4x_vec):
> > > > +     .p2align 5
> > > > +L(aligned_more):
> > > > +     /* Align data to VEC_SIZE - 1. This is the same number of
> > > > +        instructions as using andq with -VEC_SIZE but saves 4 bytes of
> > > > +        code on the x4 check.  */
> > > > +     orq     $(VEC_SIZE - 1), %rdi
> > > > +L(cross_page_continue):
> > > >       /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > > >          since data is only aligned to VEC_SIZE.  */
> > > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > > -     vpmovmskb %ymm1, %eax
> > > > -     testl   %eax, %eax
> > > > -     jnz     L(first_vec_x0)
> > > > -
> > > > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > > > -     vpmovmskb %ymm1, %eax
> > > > +# ifdef USE_AS_STRNLEN
> > > > +     /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
> > > > +        it simplies the logic in last_4x_vec_or_less.  */
> > > > +     leaq    (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
> > > > +     subq    %rdx, %rcx
> > > > +# endif
> > > > +     /* Load first VEC regardless.  */
> > > > +     VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > > > +# ifdef USE_AS_STRNLEN
> > > > +     /* Adjust length. If near end handle specially.  */
> > > > +     subq    %rcx, %rsi
> > > > +     jb      L(last_4x_vec_or_less)
> > > > +# endif
> > > > +     vpmovmskb       %ymm1, %eax
> > > >       testl   %eax, %eax
> > > >       jnz     L(first_vec_x1)
> > > >
> > > > -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > > > -     vpmovmskb %ymm1, %eax
> > > > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > > > +     vpmovmskb       %ymm1, %eax
> > > >       testl   %eax, %eax
> > > >       jnz     L(first_vec_x2)
> > > >
> > > > -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > > > -     vpmovmskb %ymm1, %eax
> > > > +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > > > +     vpmovmskb       %ymm1, %eax
> > > >       testl   %eax, %eax
> > > >       jnz     L(first_vec_x3)
> > > >
> > > > -     addq    $(VEC_SIZE * 4), %rdi
> > > > -
> > > > -# ifdef USE_AS_STRNLEN
> > > > -     subq    $(VEC_SIZE * 4), %rsi
> > > > -     jbe     L(last_4x_vec_or_less)
> > > > -# endif
> > > > -
> > > > -     /* Align data to 4 * VEC_SIZE.  */
> > > > -     movq    %rdi, %rcx
> > > > -     andl    $(4 * VEC_SIZE - 1), %ecx
> > > > -     andq    $-(4 * VEC_SIZE), %rdi
> > > > +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > > > +     vpmovmskb       %ymm1, %eax
> > > > +     testl   %eax, %eax
> > > > +     jnz     L(first_vec_x4)
> > > >
> > > > +     /* Align data to VEC_SIZE * 4 - 1.  */
> > > >  # ifdef USE_AS_STRNLEN
> > > > -     /* Adjust length.  */
> > > > +     /* Before adjusting length check if at last VEC_SIZE * 4.  */
> > > > +     cmpq    $(VEC_SIZE * 4 - 1), %rsi
> > > > +     jbe     L(last_4x_vec_or_less_load)
> > > > +     incq    %rdi
> > > > +     movl    %edi, %ecx
> > > > +     orq     $(VEC_SIZE * 4 - 1), %rdi
> > > > +     andl    $(VEC_SIZE * 4 - 1), %ecx
> > > > +     /* Readjust length.  */
> > > >       addq    %rcx, %rsi
> > > > +# else
> > > > +     incq    %rdi
> > > > +     orq     $(VEC_SIZE * 4 - 1), %rdi
> > > >  # endif
> > > > -
> > > > +     /* Compare 4 * VEC at a time forward.  */
> > > >       .p2align 4
> > > >  L(loop_4x_vec):
> > > > -     /* Compare 4 * VEC at a time forward.  */
> > > > -     vmovdqa (%rdi), %ymm1
> > > > -     vmovdqa VEC_SIZE(%rdi), %ymm2
> > > > -     vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
> > > > -     vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
> > > > -     VPMINU  %ymm1, %ymm2, %ymm5
> > > > -     VPMINU  %ymm3, %ymm4, %ymm6
> > > > -     VPMINU  %ymm5, %ymm6, %ymm5
> > > > -
> > > > -     VPCMPEQ %ymm5, %ymm0, %ymm5
> > > > -     vpmovmskb %ymm5, %eax
> > > > -     testl   %eax, %eax
> > > > -     jnz     L(4x_vec_end)
> > > > -
> > > > -     addq    $(VEC_SIZE * 4), %rdi
> > > > -
> > > > -# ifndef USE_AS_STRNLEN
> > > > -     jmp     L(loop_4x_vec)
> > > > -# else
> > > > +# ifdef USE_AS_STRNLEN
> > > > +     /* Break if at end of length.  */
> > > >       subq    $(VEC_SIZE * 4), %rsi
> > > > -     ja      L(loop_4x_vec)
> > > > -
> > > > -L(last_4x_vec_or_less):
> > > > -     /* Less than 4 * VEC and aligned to VEC_SIZE.  */
> > > > -     addl    $(VEC_SIZE * 2), %esi
> > > > -     jle     L(last_2x_vec)
> > > > +     jb      L(last_4x_vec_or_less_cmpeq)
> > > > +# endif
> > > > +     /* Save some code size by microfusing VPMINU with the load. Since
> > > > +        the matches in ymm2/ymm4 can only be returned if there were no
> > > > +        matches in ymm1/ymm3 respectively there is no issue with overlap.
> > > > +      */
> > > > +     vmovdqa 1(%rdi), %ymm1
> > > > +     VPMINU  (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
> > > > +     vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
> > > > +     VPMINU  (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
> > > > +
> > > > +     VPMINU  %ymm2, %ymm4, %ymm5
> > > > +     VPCMPEQ %ymm5, %ymm0, %ymm5
> > > > +     vpmovmskb       %ymm5, %ecx
> > > >
> > > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > > -     vpmovmskb %ymm1, %eax
> > > > -     testl   %eax, %eax
> > > > -     jnz     L(first_vec_x0)
> > > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > > +     testl   %ecx, %ecx
> > > > +     jz      L(loop_4x_vec)
> > > >
> > > > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > > > -     vpmovmskb %ymm1, %eax
> > > > -     testl   %eax, %eax
> > > > -     jnz     L(first_vec_x1)
> > > >
> > > > -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > > > -     vpmovmskb %ymm1, %eax
> > > > +     VPCMPEQ %ymm1, %ymm0, %ymm1
> > > > +     vpmovmskb       %ymm1, %eax
> > > > +     subq    %rdx, %rdi
> > > >       testl   %eax, %eax
> > > > +     jnz     L(last_vec_return_x0)
> > > >
> > > > -     jnz     L(first_vec_x2_check)
> > > > -     subl    $VEC_SIZE, %esi
> > > > -     jle     L(max)
> > > > -
> > > > -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > > > -     vpmovmskb %ymm1, %eax
> > > > +     VPCMPEQ %ymm2, %ymm0, %ymm2
> > > > +     vpmovmskb       %ymm2, %eax
> > > >       testl   %eax, %eax
> > > > -
> > > > -     jnz     L(first_vec_x3_check)
> > > > -     movq    %r8, %rax
> > > > -#  ifdef USE_AS_WCSLEN
> > > > +     jnz     L(last_vec_return_x1)
> > > > +
> > > > +     /* Combine last 2 VEC.  */
> > > > +     VPCMPEQ %ymm3, %ymm0, %ymm3
> > > > +     vpmovmskb       %ymm3, %eax
> > > > +     /* rcx has the combined result from all 4 VEC. It will only be
> > > > +        used if none of the first 3 VEC contain a match.  */
> > > > +     salq    $32, %rcx
> > > > +     orq     %rcx, %rax
> > > > +     tzcntq  %rax, %rax
> > > > +     subq    $(VEC_SIZE * 2 - 1), %rdi
> > > > +     addq    %rdi, %rax
> > > > +# ifdef USE_AS_WCSLEN
> > > >       shrq    $2, %rax
> > > > -#  endif
> > > > +# endif
> > > >       VZEROUPPER_RETURN
> > > >
> > > > +
> > > > +# ifdef USE_AS_STRNLEN
> > > >       .p2align 4
> > > > -L(last_2x_vec):
> > > > -     addl    $(VEC_SIZE * 2), %esi
> > > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > > -     vpmovmskb %ymm1, %eax
> > > > -     testl   %eax, %eax
> > > > +L(last_4x_vec_or_less_load):
> > > > +     /* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
> > > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > > +L(last_4x_vec_or_less_cmpeq):
> > > > +     VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > > > +L(last_4x_vec_or_less):
> > > >
> > > > -     jnz     L(first_vec_x0_check)
> > > > -     subl    $VEC_SIZE, %esi
> > > > -     jle     L(max)
> > > > +     vpmovmskb       %ymm1, %eax
> > > > +     /* Check if remaining length > VEC_SIZE * 2. This works even if
> > > > +        esi is off by VEC_SIZE * 4.  */
> > > > +     testl   $(VEC_SIZE * 2), %esi
> > > > +     jnz     L(last_4x_vec)
> > > >
> > > > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > > > -     vpmovmskb %ymm1, %eax
> > > > +     /* length may have been negative or positive by an offset of
> > > > +        VEC_SIZE * 4 depending on where this was called from. This fixes
> > > > +        that.  */
> > > > +     andl    $(VEC_SIZE * 4 - 1), %esi
> > > >       testl   %eax, %eax
> > > > -     jnz     L(first_vec_x1_check)
> > > > -     movq    %r8, %rax
> > > > -#  ifdef USE_AS_WCSLEN
> > > > -     shrq    $2, %rax
> > > > -#  endif
> > > > -     VZEROUPPER_RETURN
> > > > +     jnz     L(last_vec_x1_check)
> > > >
> > > > -     .p2align 4
> > > > -L(first_vec_x0_check):
> > > > +     subl    $VEC_SIZE, %esi
> > > > +     jb      L(max)
> > > > +
> > > > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > > > +     vpmovmskb       %ymm1, %eax
> > > >       tzcntl  %eax, %eax
> > > >       /* Check the end of data.  */
> > > > -     cmpq    %rax, %rsi
> > > > -     jbe     L(max)
> > > > +     cmpl    %eax, %esi
> > > > +     jb      L(max)
> > > > +     subq    %rdx, %rdi
> > > > +     addl    $(VEC_SIZE + 1), %eax
> > > >       addq    %rdi, %rax
> > > > -     subq    %rdx, %rax
> > > >  #  ifdef USE_AS_WCSLEN
> > > >       shrq    $2, %rax
> > > >  #  endif
> > > >       VZEROUPPER_RETURN
> > > > +# endif
> > > >
> > > >       .p2align 4
> > > > -L(first_vec_x1_check):
> > > > +L(last_vec_return_x0):
> > > >       tzcntl  %eax, %eax
> > > > -     /* Check the end of data.  */
> > > > -     cmpq    %rax, %rsi
> > > > -     jbe     L(max)
> > > > -     addq    $VEC_SIZE, %rax
> > > > +     subq    $(VEC_SIZE * 4 - 1), %rdi
> > > >       addq    %rdi, %rax
> > > > -     subq    %rdx, %rax
> > > > -#  ifdef USE_AS_WCSLEN
> > > > +# ifdef USE_AS_WCSLEN
> > > >       shrq    $2, %rax
> > > > -#  endif
> > > > +# endif
> > > >       VZEROUPPER_RETURN
> > > >
> > > >       .p2align 4
> > > > -L(first_vec_x2_check):
> > > > +L(last_vec_return_x1):
> > > >       tzcntl  %eax, %eax
> > > > -     /* Check the end of data.  */
> > > > -     cmpq    %rax, %rsi
> > > > -     jbe     L(max)
> > > > -     addq    $(VEC_SIZE * 2), %rax
> > > > +     subq    $(VEC_SIZE * 3 - 1), %rdi
> > > >       addq    %rdi, %rax
> > > > -     subq    %rdx, %rax
> > > > -#  ifdef USE_AS_WCSLEN
> > > > +# ifdef USE_AS_WCSLEN
> > > >       shrq    $2, %rax
> > > > -#  endif
> > > > +# endif
> > > >       VZEROUPPER_RETURN
> > > >
> > > > +# ifdef USE_AS_STRNLEN
> > > >       .p2align 4
> > > > -L(first_vec_x3_check):
> > > > +L(last_vec_x1_check):
> > > > +
> > > >       tzcntl  %eax, %eax
> > > >       /* Check the end of data.  */
> > > > -     cmpq    %rax, %rsi
> > > > -     jbe     L(max)
> > > > -     addq    $(VEC_SIZE * 3), %rax
> > > > +     cmpl    %eax, %esi
> > > > +     jb      L(max)
> > > > +     subq    %rdx, %rdi
> > > > +     incl    %eax
> > > >       addq    %rdi, %rax
> > > > -     subq    %rdx, %rax
> > > >  #  ifdef USE_AS_WCSLEN
> > > >       shrq    $2, %rax
> > > >  #  endif
> > > >       VZEROUPPER_RETURN
> > > >
> > > > -     .p2align 4
> > > >  L(max):
> > > >       movq    %r8, %rax
> > > > +     VZEROUPPER_RETURN
> > > > +
> > > > +     .p2align 4
> > > > +L(last_4x_vec):
> > > > +     /* Test first 2x VEC normally.  */
> > > > +     testl   %eax, %eax
> > > > +     jnz     L(last_vec_x1)
> > > > +
> > > > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > > > +     vpmovmskb       %ymm1, %eax
> > > > +     testl   %eax, %eax
> > > > +     jnz     L(last_vec_x2)
> > > > +
> > > > +     /* Normalize length.  */
> > > > +     andl    $(VEC_SIZE * 4 - 1), %esi
> > > > +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > > > +     vpmovmskb       %ymm1, %eax
> > > > +     testl   %eax, %eax
> > > > +     jnz     L(last_vec_x3)
> > > > +
> > > > +     subl    $(VEC_SIZE * 3), %esi
> > > > +     jb      L(max)
> > > > +
> > > > +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > > > +     vpmovmskb       %ymm1, %eax
> > > > +     tzcntl  %eax, %eax
> > > > +     /* Check the end of data.  */
> > > > +     cmpl    %eax, %esi
> > > > +     jb      L(max)
> > > > +     subq    %rdx, %rdi
> > > > +     addl    $(VEC_SIZE * 3 + 1), %eax
> > > > +     addq    %rdi, %rax
> > > >  #  ifdef USE_AS_WCSLEN
> > > >       shrq    $2, %rax
> > > >  #  endif
> > > >       VZEROUPPER_RETURN
> > > >
> > > > -     .p2align 4
> > > > -L(zero):
> > > > -     xorl    %eax, %eax
> > > > -     ret
> > > > -# endif
> > > >
> > > >       .p2align 4
> > > > -L(first_vec_x0):
> > > > +L(last_vec_x1):
> > > > +     /* essentially duplicates of first_vec_x1 but use 64 bit
> > > > +        instructions.  */
> > > >       tzcntl  %eax, %eax
> > > > +     subq    %rdx, %rdi
> > > > +     incl    %eax
> > > >       addq    %rdi, %rax
> > > > -     subq    %rdx, %rax
> > > > -# ifdef USE_AS_WCSLEN
> > > > +#  ifdef USE_AS_WCSLEN
> > > >       shrq    $2, %rax
> > > > -# endif
> > > > +#  endif
> > > >       VZEROUPPER_RETURN
> > > >
> > > >       .p2align 4
> > > > -L(first_vec_x1):
> > > > +L(last_vec_x2):
> > > > +     /* essentially duplicates of first_vec_x1 but use 64 bit
> > > > +        instructions.  */
> > > >       tzcntl  %eax, %eax
> > > > -     addq    $VEC_SIZE, %rax
> > > > +     subq    %rdx, %rdi
> > > > +     addl    $(VEC_SIZE + 1), %eax
> > > >       addq    %rdi, %rax
> > > > -     subq    %rdx, %rax
> > > > -# ifdef USE_AS_WCSLEN
> > > > +#  ifdef USE_AS_WCSLEN
> > > >       shrq    $2, %rax
> > > > -# endif
> > > > +#  endif
> > > >       VZEROUPPER_RETURN
> > > >
> > > >       .p2align 4
> > > > -L(first_vec_x2):
> > > > +L(last_vec_x3):
> > > >       tzcntl  %eax, %eax
> > > > -     addq    $(VEC_SIZE * 2), %rax
> > > > +     subl    $(VEC_SIZE * 2), %esi
> > > > +     /* Check the end of data.  */
> > > > +     cmpl    %eax, %esi
> > > > +     jb      L(max_end)
> > > > +     subq    %rdx, %rdi
> > > > +     addl    $(VEC_SIZE * 2 + 1), %eax
> > > >       addq    %rdi, %rax
> > > > -     subq    %rdx, %rax
> > > > -# ifdef USE_AS_WCSLEN
> > > > +#  ifdef USE_AS_WCSLEN
> > > >       shrq    $2, %rax
> > > > -# endif
> > > > +#  endif
> > > > +     VZEROUPPER_RETURN
> > > > +L(max_end):
> > > > +     movq    %r8, %rax
> > > >       VZEROUPPER_RETURN
> > > > +# endif
> > > >
> > > > +     /* Cold case for crossing page with first load.  */
> > > >       .p2align 4
> > > > -L(4x_vec_end):
> > > > -     VPCMPEQ %ymm1, %ymm0, %ymm1
> > > > -     vpmovmskb %ymm1, %eax
> > > > -     testl   %eax, %eax
> > > > -     jnz     L(first_vec_x0)
> > > > -     VPCMPEQ %ymm2, %ymm0, %ymm2
> > > > -     vpmovmskb %ymm2, %eax
> > > > +L(cross_page_boundary):
> > > > +     /* Align data to VEC_SIZE - 1.  */
> > > > +     orq     $(VEC_SIZE - 1), %rdi
> > > > +     VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
> > > > +     vpmovmskb       %ymm1, %eax
> > > > +     /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> > > > +        so no need to manually mod rdx.  */
> > > > +     sarxl   %edx, %eax, %eax
> > >
> > > This is a BMI2 instruction, which is not necessarily available when AVX2
> > > is available. This causes SIGILL on some CPUs. I have reported that in
> > > https://sourceware.org/bugzilla/show_bug.cgi?id=29611
> >
> > This is not a bug on master as:
> >
> > commit 83c5b368226c34a2f0a5287df40fc290b2b34359
> > Author: H.J. Lu <hjl.tools@gmail.com>
> > Date:   Mon Apr 19 10:45:07 2021 -0700
> >
> >     x86-64: Require BMI2 for strchr-avx2.S
> >
> > is already in tree. The issue is that the avx2 changes were backported
> > without H.J.'s changes.
> > >
> > > Regards
> > > Aurelien
> > >
> > > --
> > > Aurelien Jarno                          GPG: 4096R/1DDD8C9B
> > > aurelien@aurel32.net                 http://www.aurel32.net
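
The crash behind BZ #29611 follows directly from the exchange above: the avx2 variant was dispatched on CPUs that advertise AVX2 but not BMI2, and the sarx instruction then raises SIGILL. As a minimal stand-alone sketch of the guard the selector needs, using GCC's __builtin_cpu_supports (this is an illustration with invented function names, not the actual glibc dispatch code, which uses CPU_FEATURE_USABLE):

    #include <stdio.h>

    /* Sketch only: mirrors the CPU_FEATURE_USABLE (AVX2)
       && CPU_FEATURE_USABLE (BMI2) guard from the patch rather than
       the real glibc ifunc selector.  */
    static int
    strlen_avx2_usable (void)
    {
      /* Both features are required: the AVX2 body also uses BMI2
         instructions such as sarx.  */
      return __builtin_cpu_supports ("avx2")
             && __builtin_cpu_supports ("bmi2");
    }

    int
    main (void)
    {
      puts (strlen_avx2_usable ()
            ? "avx2+bmi2 present: the avx2 variant is safe to dispatch"
            : "bmi2 missing: a non-BMI2 implementation must be used");
      return 0;
    }
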
Sunil Pandey Sept. 28, 2022, 2:54 p.m. UTC | #7
On Wed, Sep 28, 2022 at 7:42 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> > Attached patch fixes BZ# 29611.
> >
> > I would like to backport it to 2.32, 2.31, 2.30, and 2.29. Let me know
> > if there is any objection.
> The ifunc-impl-list changes are missing BMI2 for memchr-avx2.S
>
> Can you post these as separate emails with the patches embedded instead of
> attached?
>
> >

Patches are also posted on bug report 29611.

https://sourceware.org/bugzilla/show_bug.cgi?id=29611

> >
> > On Sun, Sep 25, 2022 at 7:00 AM Noah Goldstein via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> > >
> > > On Sun, Sep 25, 2022 at 1:19 AM Aurelien Jarno <aurelien@aurel32.net> wrote:
> > > >
> > > > On 2021-04-19 19:36, Noah Goldstein via Libc-alpha wrote:
> > > > > No bug. This commit optimizes strlen-avx2.S. The optimizations are
> > > > > mostly small things but they add up to roughly 10-30% performance
> > > > > improvement for strlen. The results for strnlen are bit more
> > > > > ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
> > > > > are all passing.
> > > > >
> > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > > > > ---
> > > > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  16 +-
> > > > >  sysdeps/x86_64/multiarch/strlen-avx2.S     | 532 +++++++++++++--------
> > > > >  2 files changed, 334 insertions(+), 214 deletions(-)
> > > > >
> > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > index c377cab629..651b32908e 100644
> > > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > > >    /* Support sysdeps/x86_64/multiarch/strlen.c.  */
> > > > >    IFUNC_IMPL (i, name, strlen,
> > > > >             IFUNC_IMPL_ADD (array, i, strlen,
> > > > > -                           CPU_FEATURE_USABLE (AVX2),
> > > > > +                           (CPU_FEATURE_USABLE (AVX2)
> > > > > +                            && CPU_FEATURE_USABLE (BMI2)),
> > > > >                             __strlen_avx2)
> > > > >             IFUNC_IMPL_ADD (array, i, strlen,
> > > > >                             (CPU_FEATURE_USABLE (AVX2)
> > > > > +                            && CPU_FEATURE_USABLE (BMI2)
> > > > >                              && CPU_FEATURE_USABLE (RTM)),
> > > > >                             __strlen_avx2_rtm)
> > > > >             IFUNC_IMPL_ADD (array, i, strlen,
> > > > > @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > > >    /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
> > > > >    IFUNC_IMPL (i, name, strnlen,
> > > > >             IFUNC_IMPL_ADD (array, i, strnlen,
> > > > > -                           CPU_FEATURE_USABLE (AVX2),
> > > > > +                           (CPU_FEATURE_USABLE (AVX2)
> > > > > +                            && CPU_FEATURE_USABLE (BMI2)),
> > > > >                             __strnlen_avx2)
> > > > >             IFUNC_IMPL_ADD (array, i, strnlen,
> > > > >                             (CPU_FEATURE_USABLE (AVX2)
> > > > > +                            && CPU_FEATURE_USABLE (BMI2)
> > > > >                              && CPU_FEATURE_USABLE (RTM)),
> > > > >                             __strnlen_avx2_rtm)
> > > > >             IFUNC_IMPL_ADD (array, i, strnlen,
> > > > > @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > > >    /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
> > > > >    IFUNC_IMPL (i, name, wcslen,
> > > > >             IFUNC_IMPL_ADD (array, i, wcslen,
> > > > > -                           CPU_FEATURE_USABLE (AVX2),
> > > > > +                           (CPU_FEATURE_USABLE (AVX2)
> > > > > +                            && CPU_FEATURE_USABLE (BMI2)),
> > > > >                             __wcslen_avx2)
> > > > >             IFUNC_IMPL_ADD (array, i, wcslen,
> > > > >                             (CPU_FEATURE_USABLE (AVX2)
> > > > > +                            && CPU_FEATURE_USABLE (BMI2)
> > > > >                              && CPU_FEATURE_USABLE (RTM)),
> > > > >                             __wcslen_avx2_rtm)
> > > > >             IFUNC_IMPL_ADD (array, i, wcslen,
> > > > > @@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > > >    /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
> > > > >    IFUNC_IMPL (i, name, wcsnlen,
> > > > >             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > > > > -                           CPU_FEATURE_USABLE (AVX2),
> > > > > +                           (CPU_FEATURE_USABLE (AVX2)
> > > > > +                            && CPU_FEATURE_USABLE (BMI2)),
> > > > >                             __wcsnlen_avx2)
> > > > >             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > > > >                             (CPU_FEATURE_USABLE (AVX2)
> > > > > +                            && CPU_FEATURE_USABLE (BMI2)
> > > > >                              && CPU_FEATURE_USABLE (RTM)),
> > > > >                             __wcsnlen_avx2_rtm)
> > > > >             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > > > > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
> > > > > index 1caae9e6bc..bd2e6ee44a 100644
> > > > > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
> > > > > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
> > > > > @@ -27,9 +27,11 @@
> > > > >  # ifdef USE_AS_WCSLEN
> > > > >  #  define VPCMPEQ    vpcmpeqd
> > > > >  #  define VPMINU     vpminud
> > > > > +#  define CHAR_SIZE  4
> > > > >  # else
> > > > >  #  define VPCMPEQ    vpcmpeqb
> > > > >  #  define VPMINU     vpminub
> > > > > +#  define CHAR_SIZE  1
> > > > >  # endif
> > > > >
> > > > >  # ifndef VZEROUPPER
> > > > > @@ -41,349 +43,459 @@
> > > > >  # endif
> > > > >
> > > > >  # define VEC_SIZE 32
> > > > > +# define PAGE_SIZE 4096
> > > > >
> > > > >       .section SECTION(.text),"ax",@progbits
> > > > >  ENTRY (STRLEN)
> > > > >  # ifdef USE_AS_STRNLEN
> > > > > -     /* Check for zero length.  */
> > > > > +     /* Check zero length.  */
> > > > >       test    %RSI_LP, %RSI_LP
> > > > >       jz      L(zero)
> > > > > +     /* Store max len in R8_LP before adjusting if using WCSLEN.  */
> > > > > +     mov     %RSI_LP, %R8_LP
> > > > >  #  ifdef USE_AS_WCSLEN
> > > > >       shl     $2, %RSI_LP
> > > > >  #  elif defined __ILP32__
> > > > >       /* Clear the upper 32 bits.  */
> > > > >       movl    %esi, %esi
> > > > >  #  endif
> > > > > -     mov     %RSI_LP, %R8_LP
> > > > >  # endif
> > > > > -     movl    %edi, %ecx
> > > > > +     movl    %edi, %eax
> > > > >       movq    %rdi, %rdx
> > > > >       vpxor   %xmm0, %xmm0, %xmm0
> > > > > -
> > > > > +     /* Clear high bits from edi. Only keeping bits relevant to page
> > > > > +        cross check.  */
> > > > > +     andl    $(PAGE_SIZE - 1), %eax
> > > > >       /* Check if we may cross page boundary with one vector load.  */
> > > > > -     andl    $(2 * VEC_SIZE - 1), %ecx
> > > > > -     cmpl    $VEC_SIZE, %ecx
> > > > > -     ja      L(cros_page_boundary)
> > > > > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > > > > +     ja      L(cross_page_boundary)
> > > > >
> > > > >       /* Check the first VEC_SIZE bytes.  */
> > > > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > -     testl   %eax, %eax
> > > > > -
> > > > > +     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > > > +     vpmovmskb       %ymm1, %eax
> > > > >  # ifdef USE_AS_STRNLEN
> > > > > -     jnz     L(first_vec_x0_check)
> > > > > -     /* Adjust length and check the end of data.  */
> > > > > -     subq    $VEC_SIZE, %rsi
> > > > > -     jbe     L(max)
> > > > > -# else
> > > > > -     jnz     L(first_vec_x0)
> > > > > +     /* If length < VEC_SIZE handle special.  */
> > > > > +     cmpq    $VEC_SIZE, %rsi
> > > > > +     jbe     L(first_vec_x0)
> > > > >  # endif
> > > > > -
> > > > > -     /* Align data for aligned loads in the loop.  */
> > > > > -     addq    $VEC_SIZE, %rdi
> > > > > -     andl    $(VEC_SIZE - 1), %ecx
> > > > > -     andq    $-VEC_SIZE, %rdi
> > > > > +     /* If empty continue to aligned_more. Otherwise return bit
> > > > > +        position of first match.  */
> > > > > +     testl   %eax, %eax
> > > > > +     jz      L(aligned_more)
> > > > > +     tzcntl  %eax, %eax
> > > > > +# ifdef USE_AS_WCSLEN
> > > > > +     shrl    $2, %eax
> > > > > +# endif
> > > > > +     VZEROUPPER_RETURN
> > > > >
> > > > >  # ifdef USE_AS_STRNLEN
> > > > > -     /* Adjust length.  */
> > > > > -     addq    %rcx, %rsi
> > > > > +L(zero):
> > > > > +     xorl    %eax, %eax
> > > > > +     ret
> > > > >
> > > > > -     subq    $(VEC_SIZE * 4), %rsi
> > > > > -     jbe     L(last_4x_vec_or_less)
> > > > > +     .p2align 4
> > > > > +L(first_vec_x0):
> > > > > +     /* Set bit for max len so that tzcnt will return min of max len
> > > > > +        and position of first match.  */
> > > > > +     btsq    %rsi, %rax
> > > > > +     tzcntl  %eax, %eax
> > > > > +#  ifdef USE_AS_WCSLEN
> > > > > +     shrl    $2, %eax
> > > > > +#  endif
> > > > > +     VZEROUPPER_RETURN
> > > > >  # endif
> > > > > -     jmp     L(more_4x_vec)
> > > > >
> > > > >       .p2align 4
> > > > > -L(cros_page_boundary):
> > > > > -     andl    $(VEC_SIZE - 1), %ecx
> > > > > -     andq    $-VEC_SIZE, %rdi
> > > > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > -     /* Remove the leading bytes.  */
> > > > > -     sarl    %cl, %eax
> > > > > -     testl   %eax, %eax
> > > > > -     jz      L(aligned_more)
> > > > > +L(first_vec_x1):
> > > > >       tzcntl  %eax, %eax
> > > > > +     /* Safe to use 32 bit instructions as these are only called for
> > > > > +        size = [1, 159].  */
> > > > >  # ifdef USE_AS_STRNLEN
> > > > > -     /* Check the end of data.  */
> > > > > -     cmpq    %rax, %rsi
> > > > > -     jbe     L(max)
> > > > > +     /* Use ecx which was computed earlier to compute correct value.
> > > > > +      */
> > > > > +     subl    $(VEC_SIZE * 4 + 1), %ecx
> > > > > +     addl    %ecx, %eax
> > > > > +# else
> > > > > +     subl    %edx, %edi
> > > > > +     incl    %edi
> > > > > +     addl    %edi, %eax
> > > > >  # endif
> > > > > -     addq    %rdi, %rax
> > > > > -     addq    %rcx, %rax
> > > > > -     subq    %rdx, %rax
> > > > >  # ifdef USE_AS_WCSLEN
> > > > > -     shrq    $2, %rax
> > > > > +     shrl    $2, %eax
> > > > >  # endif
> > > > > -L(return_vzeroupper):
> > > > > -     ZERO_UPPER_VEC_REGISTERS_RETURN
> > > > > +     VZEROUPPER_RETURN
> > > > >
> > > > >       .p2align 4
> > > > > -L(aligned_more):
> > > > > +L(first_vec_x2):
> > > > > +     tzcntl  %eax, %eax
> > > > > +     /* Safe to use 32 bit instructions as these are only called for
> > > > > +        size = [1, 159].  */
> > > > >  # ifdef USE_AS_STRNLEN
> > > > > -        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
> > > > > -         with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
> > > > > -         to void possible addition overflow.  */
> > > > > -     negq    %rcx
> > > > > -     addq    $VEC_SIZE, %rcx
> > > > > -
> > > > > -     /* Check the end of data.  */
> > > > > -     subq    %rcx, %rsi
> > > > > -     jbe     L(max)
> > > > > +     /* Use ecx which was computed earlier to compute correct value.
> > > > > +      */
> > > > > +     subl    $(VEC_SIZE * 3 + 1), %ecx
> > > > > +     addl    %ecx, %eax
> > > > > +# else
> > > > > +     subl    %edx, %edi
> > > > > +     addl    $(VEC_SIZE + 1), %edi
> > > > > +     addl    %edi, %eax
> > > > >  # endif
> > > > > +# ifdef USE_AS_WCSLEN
> > > > > +     shrl    $2, %eax
> > > > > +# endif
> > > > > +     VZEROUPPER_RETURN
> > > > >
> > > > > -     addq    $VEC_SIZE, %rdi
> > > > > +     .p2align 4
> > > > > +L(first_vec_x3):
> > > > > +     tzcntl  %eax, %eax
> > > > > +     /* Safe to use 32 bit instructions as these are only called for
> > > > > +        size = [1, 159].  */
> > > > > +# ifdef USE_AS_STRNLEN
> > > > > +     /* Use ecx which was computed earlier to compute correct value.
> > > > > +      */
> > > > > +     subl    $(VEC_SIZE * 2 + 1), %ecx
> > > > > +     addl    %ecx, %eax
> > > > > +# else
> > > > > +     subl    %edx, %edi
> > > > > +     addl    $(VEC_SIZE * 2 + 1), %edi
> > > > > +     addl    %edi, %eax
> > > > > +# endif
> > > > > +# ifdef USE_AS_WCSLEN
> > > > > +     shrl    $2, %eax
> > > > > +# endif
> > > > > +     VZEROUPPER_RETURN
> > > > >
> > > > > +     .p2align 4
> > > > > +L(first_vec_x4):
> > > > > +     tzcntl  %eax, %eax
> > > > > +     /* Safe to use 32 bit instructions as these are only called for
> > > > > +        size = [1, 159].  */
> > > > >  # ifdef USE_AS_STRNLEN
> > > > > -     subq    $(VEC_SIZE * 4), %rsi
> > > > > -     jbe     L(last_4x_vec_or_less)
> > > > > +     /* Use ecx which was computed earlier to compute correct value.
> > > > > +      */
> > > > > +     subl    $(VEC_SIZE + 1), %ecx
> > > > > +     addl    %ecx, %eax
> > > > > +# else
> > > > > +     subl    %edx, %edi
> > > > > +     addl    $(VEC_SIZE * 3 + 1), %edi
> > > > > +     addl    %edi, %eax
> > > > >  # endif
> > > > > +# ifdef USE_AS_WCSLEN
> > > > > +     shrl    $2, %eax
> > > > > +# endif
> > > > > +     VZEROUPPER_RETURN
> > > > >
> > > > > -L(more_4x_vec):
> > > > > +     .p2align 5
> > > > > +L(aligned_more):
> > > > > +     /* Align data to VEC_SIZE - 1. This is the same number of
> > > > > +        instructions as using andq with -VEC_SIZE but saves 4 bytes of
> > > > > +        code on the x4 check.  */
> > > > > +     orq     $(VEC_SIZE - 1), %rdi
> > > > > +L(cross_page_continue):
> > > > >       /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > > > >          since data is only aligned to VEC_SIZE.  */
> > > > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > -     testl   %eax, %eax
> > > > > -     jnz     L(first_vec_x0)
> > > > > -
> > > > > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > +# ifdef USE_AS_STRNLEN
> > > > > +     /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
> > > > > +        it simplifies the logic in last_4x_vec_or_less.  */
> > > > > +     leaq    (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
> > > > > +     subq    %rdx, %rcx
> > > > > +# endif
> > > > > +     /* Load first VEC regardless.  */
> > > > > +     VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > > > > +# ifdef USE_AS_STRNLEN
> > > > > +     /* Adjust length. If near end handle specially.  */
> > > > > +     subq    %rcx, %rsi
> > > > > +     jb      L(last_4x_vec_or_less)
> > > > > +# endif
> > > > > +     vpmovmskb       %ymm1, %eax
> > > > >       testl   %eax, %eax
> > > > >       jnz     L(first_vec_x1)
> > > > >
> > > > > -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > > > > +     vpmovmskb       %ymm1, %eax
> > > > >       testl   %eax, %eax
> > > > >       jnz     L(first_vec_x2)
> > > > >
> > > > > -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > > > > +     vpmovmskb       %ymm1, %eax
> > > > >       testl   %eax, %eax
> > > > >       jnz     L(first_vec_x3)
> > > > >
> > > > > -     addq    $(VEC_SIZE * 4), %rdi
> > > > > -
> > > > > -# ifdef USE_AS_STRNLEN
> > > > > -     subq    $(VEC_SIZE * 4), %rsi
> > > > > -     jbe     L(last_4x_vec_or_less)
> > > > > -# endif
> > > > > -
> > > > > -     /* Align data to 4 * VEC_SIZE.  */
> > > > > -     movq    %rdi, %rcx
> > > > > -     andl    $(4 * VEC_SIZE - 1), %ecx
> > > > > -     andq    $-(4 * VEC_SIZE), %rdi
> > > > > +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > > > > +     vpmovmskb       %ymm1, %eax
> > > > > +     testl   %eax, %eax
> > > > > +     jnz     L(first_vec_x4)
> > > > >
> > > > > +     /* Align data to VEC_SIZE * 4 - 1.  */
> > > > >  # ifdef USE_AS_STRNLEN
> > > > > -     /* Adjust length.  */
> > > > > +     /* Before adjusting length check if at last VEC_SIZE * 4.  */
> > > > > +     cmpq    $(VEC_SIZE * 4 - 1), %rsi
> > > > > +     jbe     L(last_4x_vec_or_less_load)
> > > > > +     incq    %rdi
> > > > > +     movl    %edi, %ecx
> > > > > +     orq     $(VEC_SIZE * 4 - 1), %rdi
> > > > > +     andl    $(VEC_SIZE * 4 - 1), %ecx
> > > > > +     /* Readjust length.  */
> > > > >       addq    %rcx, %rsi
> > > > > +# else
> > > > > +     incq    %rdi
> > > > > +     orq     $(VEC_SIZE * 4 - 1), %rdi
> > > > >  # endif
> > > > > -
> > > > > +     /* Compare 4 * VEC at a time forward.  */
> > > > >       .p2align 4
> > > > >  L(loop_4x_vec):
> > > > > -     /* Compare 4 * VEC at a time forward.  */
> > > > > -     vmovdqa (%rdi), %ymm1
> > > > > -     vmovdqa VEC_SIZE(%rdi), %ymm2
> > > > > -     vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
> > > > > -     vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
> > > > > -     VPMINU  %ymm1, %ymm2, %ymm5
> > > > > -     VPMINU  %ymm3, %ymm4, %ymm6
> > > > > -     VPMINU  %ymm5, %ymm6, %ymm5
> > > > > -
> > > > > -     VPCMPEQ %ymm5, %ymm0, %ymm5
> > > > > -     vpmovmskb %ymm5, %eax
> > > > > -     testl   %eax, %eax
> > > > > -     jnz     L(4x_vec_end)
> > > > > -
> > > > > -     addq    $(VEC_SIZE * 4), %rdi
> > > > > -
> > > > > -# ifndef USE_AS_STRNLEN
> > > > > -     jmp     L(loop_4x_vec)
> > > > > -# else
> > > > > +# ifdef USE_AS_STRNLEN
> > > > > +     /* Break if at end of length.  */
> > > > >       subq    $(VEC_SIZE * 4), %rsi
> > > > > -     ja      L(loop_4x_vec)
> > > > > -
> > > > > -L(last_4x_vec_or_less):
> > > > > -     /* Less than 4 * VEC and aligned to VEC_SIZE.  */
> > > > > -     addl    $(VEC_SIZE * 2), %esi
> > > > > -     jle     L(last_2x_vec)
> > > > > +     jb      L(last_4x_vec_or_less_cmpeq)
> > > > > +# endif
> > > > > +     /* Save some code size by microfusing VPMINU with the load. Since
> > > > > +        the matches in ymm2/ymm4 can only be returned if there were no
> > > > > +        matches in ymm1/ymm3 respectively there is no issue with overlap.
> > > > > +      */
> > > > > +     vmovdqa 1(%rdi), %ymm1
> > > > > +     VPMINU  (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
> > > > > +     vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
> > > > > +     VPMINU  (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
> > > > > +
> > > > > +     VPMINU  %ymm2, %ymm4, %ymm5
> > > > > +     VPCMPEQ %ymm5, %ymm0, %ymm5
> > > > > +     vpmovmskb       %ymm5, %ecx
> > > > >
> > > > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > -     testl   %eax, %eax
> > > > > -     jnz     L(first_vec_x0)
> > > > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > > > +     testl   %ecx, %ecx
> > > > > +     jz      L(loop_4x_vec)
> > > > >
> > > > > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > -     testl   %eax, %eax
> > > > > -     jnz     L(first_vec_x1)
> > > > >
> > > > > -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > +     VPCMPEQ %ymm1, %ymm0, %ymm1
> > > > > +     vpmovmskb       %ymm1, %eax
> > > > > +     subq    %rdx, %rdi
> > > > >       testl   %eax, %eax
> > > > > +     jnz     L(last_vec_return_x0)
> > > > >
> > > > > -     jnz     L(first_vec_x2_check)
> > > > > -     subl    $VEC_SIZE, %esi
> > > > > -     jle     L(max)
> > > > > -
> > > > > -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > +     VPCMPEQ %ymm2, %ymm0, %ymm2
> > > > > +     vpmovmskb       %ymm2, %eax
> > > > >       testl   %eax, %eax
> > > > > -
> > > > > -     jnz     L(first_vec_x3_check)
> > > > > -     movq    %r8, %rax
> > > > > -#  ifdef USE_AS_WCSLEN
> > > > > +     jnz     L(last_vec_return_x1)
> > > > > +
> > > > > +     /* Combine last 2 VEC.  */
> > > > > +     VPCMPEQ %ymm3, %ymm0, %ymm3
> > > > > +     vpmovmskb       %ymm3, %eax
> > > > > +     /* rcx has the combined result from all 4 VEC. It will only be
> > > > > +        used if none of the first 3 VEC contain a match.  */
> > > > > +     salq    $32, %rcx
> > > > > +     orq     %rcx, %rax
> > > > > +     tzcntq  %rax, %rax
> > > > > +     subq    $(VEC_SIZE * 2 - 1), %rdi
> > > > > +     addq    %rdi, %rax
> > > > > +# ifdef USE_AS_WCSLEN
> > > > >       shrq    $2, %rax
> > > > > -#  endif
> > > > > +# endif
> > > > >       VZEROUPPER_RETURN
> > > > >
> > > > > +
> > > > > +# ifdef USE_AS_STRNLEN
> > > > >       .p2align 4
> > > > > -L(last_2x_vec):
> > > > > -     addl    $(VEC_SIZE * 2), %esi
> > > > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > -     testl   %eax, %eax
> > > > > +L(last_4x_vec_or_less_load):
> > > > > +     /* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
> > > > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > > > +L(last_4x_vec_or_less_cmpeq):
> > > > > +     VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > > > > +L(last_4x_vec_or_less):
> > > > >
> > > > > -     jnz     L(first_vec_x0_check)
> > > > > -     subl    $VEC_SIZE, %esi
> > > > > -     jle     L(max)
> > > > > +     vpmovmskb       %ymm1, %eax
> > > > > +     /* Check if remaining length > VEC_SIZE * 2. This works even if
> > > > > +        esi is off by VEC_SIZE * 4.  */
> > > > > +     testl   $(VEC_SIZE * 2), %esi
> > > > > +     jnz     L(last_4x_vec)
> > > > >
> > > > > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > +     /* length may have been negative or positive by an offset of
> > > > > +        VEC_SIZE * 4 depending on where this was called from. This fixes
> > > > > +        that.  */
> > > > > +     andl    $(VEC_SIZE * 4 - 1), %esi
> > > > >       testl   %eax, %eax
> > > > > -     jnz     L(first_vec_x1_check)
> > > > > -     movq    %r8, %rax
> > > > > -#  ifdef USE_AS_WCSLEN
> > > > > -     shrq    $2, %rax
> > > > > -#  endif
> > > > > -     VZEROUPPER_RETURN
> > > > > +     jnz     L(last_vec_x1_check)
> > > > >
> > > > > -     .p2align 4
> > > > > -L(first_vec_x0_check):
> > > > > +     subl    $VEC_SIZE, %esi
> > > > > +     jb      L(max)
> > > > > +
> > > > > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > > > > +     vpmovmskb       %ymm1, %eax
> > > > >       tzcntl  %eax, %eax
> > > > >       /* Check the end of data.  */
> > > > > -     cmpq    %rax, %rsi
> > > > > -     jbe     L(max)
> > > > > +     cmpl    %eax, %esi
> > > > > +     jb      L(max)
> > > > > +     subq    %rdx, %rdi
> > > > > +     addl    $(VEC_SIZE + 1), %eax
> > > > >       addq    %rdi, %rax
> > > > > -     subq    %rdx, %rax
> > > > >  #  ifdef USE_AS_WCSLEN
> > > > >       shrq    $2, %rax
> > > > >  #  endif
> > > > >       VZEROUPPER_RETURN
> > > > > +# endif
> > > > >
> > > > >       .p2align 4
> > > > > -L(first_vec_x1_check):
> > > > > +L(last_vec_return_x0):
> > > > >       tzcntl  %eax, %eax
> > > > > -     /* Check the end of data.  */
> > > > > -     cmpq    %rax, %rsi
> > > > > -     jbe     L(max)
> > > > > -     addq    $VEC_SIZE, %rax
> > > > > +     subq    $(VEC_SIZE * 4 - 1), %rdi
> > > > >       addq    %rdi, %rax
> > > > > -     subq    %rdx, %rax
> > > > > -#  ifdef USE_AS_WCSLEN
> > > > > +# ifdef USE_AS_WCSLEN
> > > > >       shrq    $2, %rax
> > > > > -#  endif
> > > > > +# endif
> > > > >       VZEROUPPER_RETURN
> > > > >
> > > > >       .p2align 4
> > > > > -L(first_vec_x2_check):
> > > > > +L(last_vec_return_x1):
> > > > >       tzcntl  %eax, %eax
> > > > > -     /* Check the end of data.  */
> > > > > -     cmpq    %rax, %rsi
> > > > > -     jbe     L(max)
> > > > > -     addq    $(VEC_SIZE * 2), %rax
> > > > > +     subq    $(VEC_SIZE * 3 - 1), %rdi
> > > > >       addq    %rdi, %rax
> > > > > -     subq    %rdx, %rax
> > > > > -#  ifdef USE_AS_WCSLEN
> > > > > +# ifdef USE_AS_WCSLEN
> > > > >       shrq    $2, %rax
> > > > > -#  endif
> > > > > +# endif
> > > > >       VZEROUPPER_RETURN
> > > > >
> > > > > +# ifdef USE_AS_STRNLEN
> > > > >       .p2align 4
> > > > > -L(first_vec_x3_check):
> > > > > +L(last_vec_x1_check):
> > > > > +
> > > > >       tzcntl  %eax, %eax
> > > > >       /* Check the end of data.  */
> > > > > -     cmpq    %rax, %rsi
> > > > > -     jbe     L(max)
> > > > > -     addq    $(VEC_SIZE * 3), %rax
> > > > > +     cmpl    %eax, %esi
> > > > > +     jb      L(max)
> > > > > +     subq    %rdx, %rdi
> > > > > +     incl    %eax
> > > > >       addq    %rdi, %rax
> > > > > -     subq    %rdx, %rax
> > > > >  #  ifdef USE_AS_WCSLEN
> > > > >       shrq    $2, %rax
> > > > >  #  endif
> > > > >       VZEROUPPER_RETURN
> > > > >
> > > > > -     .p2align 4
> > > > >  L(max):
> > > > >       movq    %r8, %rax
> > > > > +     VZEROUPPER_RETURN
> > > > > +
> > > > > +     .p2align 4
> > > > > +L(last_4x_vec):
> > > > > +     /* Test first 2x VEC normally.  */
> > > > > +     testl   %eax, %eax
> > > > > +     jnz     L(last_vec_x1)
> > > > > +
> > > > > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > > > > +     vpmovmskb       %ymm1, %eax
> > > > > +     testl   %eax, %eax
> > > > > +     jnz     L(last_vec_x2)
> > > > > +
> > > > > +     /* Normalize length.  */
> > > > > +     andl    $(VEC_SIZE * 4 - 1), %esi
> > > > > +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > > > > +     vpmovmskb       %ymm1, %eax
> > > > > +     testl   %eax, %eax
> > > > > +     jnz     L(last_vec_x3)
> > > > > +
> > > > > +     subl    $(VEC_SIZE * 3), %esi
> > > > > +     jb      L(max)
> > > > > +
> > > > > +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > > > > +     vpmovmskb       %ymm1, %eax
> > > > > +     tzcntl  %eax, %eax
> > > > > +     /* Check the end of data.  */
> > > > > +     cmpl    %eax, %esi
> > > > > +     jb      L(max)
> > > > > +     subq    %rdx, %rdi
> > > > > +     addl    $(VEC_SIZE * 3 + 1), %eax
> > > > > +     addq    %rdi, %rax
> > > > >  #  ifdef USE_AS_WCSLEN
> > > > >       shrq    $2, %rax
> > > > >  #  endif
> > > > >       VZEROUPPER_RETURN
> > > > >
> > > > > -     .p2align 4
> > > > > -L(zero):
> > > > > -     xorl    %eax, %eax
> > > > > -     ret
> > > > > -# endif
> > > > >
> > > > >       .p2align 4
> > > > > -L(first_vec_x0):
> > > > > +L(last_vec_x1):
> > > > > +     /* essentially duplicates of first_vec_x1 but use 64 bit
> > > > > +        instructions.  */
> > > > >       tzcntl  %eax, %eax
> > > > > +     subq    %rdx, %rdi
> > > > > +     incl    %eax
> > > > >       addq    %rdi, %rax
> > > > > -     subq    %rdx, %rax
> > > > > -# ifdef USE_AS_WCSLEN
> > > > > +#  ifdef USE_AS_WCSLEN
> > > > >       shrq    $2, %rax
> > > > > -# endif
> > > > > +#  endif
> > > > >       VZEROUPPER_RETURN
> > > > >
> > > > >       .p2align 4
> > > > > -L(first_vec_x1):
> > > > > +L(last_vec_x2):
> > > > > +     /* essentially duplicates of first_vec_x1 but use 64 bit
> > > > > +        instructions.  */
> > > > >       tzcntl  %eax, %eax
> > > > > -     addq    $VEC_SIZE, %rax
> > > > > +     subq    %rdx, %rdi
> > > > > +     addl    $(VEC_SIZE + 1), %eax
> > > > >       addq    %rdi, %rax
> > > > > -     subq    %rdx, %rax
> > > > > -# ifdef USE_AS_WCSLEN
> > > > > +#  ifdef USE_AS_WCSLEN
> > > > >       shrq    $2, %rax
> > > > > -# endif
> > > > > +#  endif
> > > > >       VZEROUPPER_RETURN
> > > > >
> > > > >       .p2align 4
> > > > > -L(first_vec_x2):
> > > > > +L(last_vec_x3):
> > > > >       tzcntl  %eax, %eax
> > > > > -     addq    $(VEC_SIZE * 2), %rax
> > > > > +     subl    $(VEC_SIZE * 2), %esi
> > > > > +     /* Check the end of data.  */
> > > > > +     cmpl    %eax, %esi
> > > > > +     jb      L(max_end)
> > > > > +     subq    %rdx, %rdi
> > > > > +     addl    $(VEC_SIZE * 2 + 1), %eax
> > > > >       addq    %rdi, %rax
> > > > > -     subq    %rdx, %rax
> > > > > -# ifdef USE_AS_WCSLEN
> > > > > +#  ifdef USE_AS_WCSLEN
> > > > >       shrq    $2, %rax
> > > > > -# endif
> > > > > +#  endif
> > > > > +     VZEROUPPER_RETURN
> > > > > +L(max_end):
> > > > > +     movq    %r8, %rax
> > > > >       VZEROUPPER_RETURN
> > > > > +# endif
> > > > >
> > > > > +     /* Cold case for crossing page with first load.  */
> > > > >       .p2align 4
> > > > > -L(4x_vec_end):
> > > > > -     VPCMPEQ %ymm1, %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > -     testl   %eax, %eax
> > > > > -     jnz     L(first_vec_x0)
> > > > > -     VPCMPEQ %ymm2, %ymm0, %ymm2
> > > > > -     vpmovmskb %ymm2, %eax
> > > > > +L(cross_page_boundary):
> > > > > +     /* Align data to VEC_SIZE - 1.  */
> > > > > +     orq     $(VEC_SIZE - 1), %rdi
> > > > > +     VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
> > > > > +     vpmovmskb       %ymm1, %eax
> > > > > +     /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> > > > > +        so no need to manually mod rdx.  */
> > > > > +     sarxl   %edx, %eax, %eax
> > > >
> > > > This is a BMI2 instruction, which is not necessarily available when AVX2
> > > > is available. This causes SIGILL on some CPUs. I have reported that in
> > > > https://sourceware.org/bugzilla/show_bug.cgi?id=29611
> > >
> > > This is not a bug on master as:
> > >
> > > commit 83c5b368226c34a2f0a5287df40fc290b2b34359
> > > Author: H.J. Lu <hjl.tools@gmail.com>
> > > Date:   Mon Apr 19 10:45:07 2021 -0700
> > >
> > >     x86-64: Require BMI2 for strchr-avx2.S
> > >
> > > is already in tree. The issue is that the avx2 changes were backported
> > > without H.J.'s changes.
> > > >
> > > > Regards
> > > > Aurelien
> > > >
> > > > --
> > > > Aurelien Jarno                          GPG: 4096R/1DDD8C9B
> > > > aurelien@aurel32.net                 http://www.aurel32.net
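
One detail in the quoted patch worth calling out is the L(first_vec_x0) trick on the short strnlen path: btsq %rsi, %rax sets bit maxlen in the vpmovmskb mask before the tzcnt, so the length clamp and the NUL scan collapse into a single bit scan that returns min(index of first NUL, maxlen). A rough C model of that arithmetic, purely for illustration (the names are invented, maxlen is assumed to be in [1, 32] as on that path, and the wcslen shift that follows in the assembly is omitted):

    #include <stdint.h>

    /* Model of the btsq %rsi, %rax / tzcntl %eax, %eax pair in
       L(first_vec_x0).  'mask' stands in for the vpmovmskb result
       (bit i set when byte i of the 32-byte vector is NUL) and
       'maxlen' for the strnlen limit, assumed here to be in [1, 32].
       Setting bit maxlen makes the value non-zero, so the
       trailing-zero count yields min (first NUL index, maxlen).  */
    static inline unsigned int
    first_vec_x0_model (uint32_t mask, unsigned int maxlen)
    {
      uint64_t m = (uint64_t) mask | (1ULL << maxlen);  /* btsq %rsi, %rax  */
      return (unsigned int) __builtin_ctzll (m);        /* tzcntl %eax, %eax  */
    }

For example, mask == 0 with maxlen == 32 gives 32 (no NUL within the limit), while mask == 0x10 with maxlen == 32 gives 4, matching what the assembly returns before the optional wcslen shift.
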
Noah Goldstein Sept. 28, 2022, 3 p.m. UTC | #8
On Wed, Sep 28, 2022 at 7:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Wed, Sep 28, 2022 at 7:42 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
> > >
> > > Attached patch fixes BZ# 29611.
> > >
> > > I would like to backport it to 2.32, 2.31, 2.30, and 2.29. Let me know
> > > if there is any objection.
> > The ifunc-impl-list changes are missing BMI2 for memchr-avx2.S
> >
> > Can you post these as separate emails with the patches embedded instead of
> > attached?
> >
> > >
>
> Patches are also posted on bug report 29611.
>
> https://sourceware.org/bugzilla/show_bug.cgi?id=29611

Is there a mailing list for backport patches like this?
>
> > >
> > > On Sun, Sep 25, 2022 at 7:00 AM Noah Goldstein via Libc-alpha
> > > <libc-alpha@sourceware.org> wrote:
> > > >
> > > > On Sun, Sep 25, 2022 at 1:19 AM Aurelien Jarno <aurelien@aurel32.net> wrote:
> > > > >
> > > > > On 2021-04-19 19:36, Noah Goldstein via Libc-alpha wrote:
> > > > > > No bug. This commit optimizes strlen-avx2.S. The optimizations are
> > > > > > mostly small things but they add up to roughly 10-30% performance
> > > > > > improvement for strlen. The results for strnlen are bit more
> > > > > > ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
> > > > > > are all passing.
> > > > > >
> > > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > > > > > ---
> > > > > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  16 +-
> > > > > >  sysdeps/x86_64/multiarch/strlen-avx2.S     | 532 +++++++++++++--------
> > > > > >  2 files changed, 334 insertions(+), 214 deletions(-)
> > > > > >
> > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > > index c377cab629..651b32908e 100644
> > > > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > > @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > > > >    /* Support sysdeps/x86_64/multiarch/strlen.c.  */
> > > > > >    IFUNC_IMPL (i, name, strlen,
> > > > > >             IFUNC_IMPL_ADD (array, i, strlen,
> > > > > > -                           CPU_FEATURE_USABLE (AVX2),
> > > > > > +                           (CPU_FEATURE_USABLE (AVX2)
> > > > > > +                            && CPU_FEATURE_USABLE (BMI2)),
> > > > > >                             __strlen_avx2)
> > > > > >             IFUNC_IMPL_ADD (array, i, strlen,
> > > > > >                             (CPU_FEATURE_USABLE (AVX2)
> > > > > > +                            && CPU_FEATURE_USABLE (BMI2)
> > > > > >                              && CPU_FEATURE_USABLE (RTM)),
> > > > > >                             __strlen_avx2_rtm)
> > > > > >             IFUNC_IMPL_ADD (array, i, strlen,
> > > > > > @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > > > >    /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
> > > > > >    IFUNC_IMPL (i, name, strnlen,
> > > > > >             IFUNC_IMPL_ADD (array, i, strnlen,
> > > > > > -                           CPU_FEATURE_USABLE (AVX2),
> > > > > > +                           (CPU_FEATURE_USABLE (AVX2)
> > > > > > +                            && CPU_FEATURE_USABLE (BMI2)),
> > > > > >                             __strnlen_avx2)
> > > > > >             IFUNC_IMPL_ADD (array, i, strnlen,
> > > > > >                             (CPU_FEATURE_USABLE (AVX2)
> > > > > > +                            && CPU_FEATURE_USABLE (BMI2)
> > > > > >                              && CPU_FEATURE_USABLE (RTM)),
> > > > > >                             __strnlen_avx2_rtm)
> > > > > >             IFUNC_IMPL_ADD (array, i, strnlen,
> > > > > > @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > > > >    /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
> > > > > >    IFUNC_IMPL (i, name, wcslen,
> > > > > >             IFUNC_IMPL_ADD (array, i, wcslen,
> > > > > > -                           CPU_FEATURE_USABLE (AVX2),
> > > > > > +                           (CPU_FEATURE_USABLE (AVX2)
> > > > > > +                            && CPU_FEATURE_USABLE (BMI2)),
> > > > > >                             __wcslen_avx2)
> > > > > >             IFUNC_IMPL_ADD (array, i, wcslen,
> > > > > >                             (CPU_FEATURE_USABLE (AVX2)
> > > > > > +                            && CPU_FEATURE_USABLE (BMI2)
> > > > > >                              && CPU_FEATURE_USABLE (RTM)),
> > > > > >                             __wcslen_avx2_rtm)
> > > > > >             IFUNC_IMPL_ADD (array, i, wcslen,
> > > > > > @@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > > > >    /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
> > > > > >    IFUNC_IMPL (i, name, wcsnlen,
> > > > > >             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > > > > > -                           CPU_FEATURE_USABLE (AVX2),
> > > > > > +                           (CPU_FEATURE_USABLE (AVX2)
> > > > > > +                            && CPU_FEATURE_USABLE (BMI2)),
> > > > > >                             __wcsnlen_avx2)
> > > > > >             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > > > > >                             (CPU_FEATURE_USABLE (AVX2)
> > > > > > +                            && CPU_FEATURE_USABLE (BMI2)
> > > > > >                              && CPU_FEATURE_USABLE (RTM)),
> > > > > >                             __wcsnlen_avx2_rtm)
> > > > > >             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > > > > > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
> > > > > > index 1caae9e6bc..bd2e6ee44a 100644
> > > > > > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
> > > > > > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
> > > > > > @@ -27,9 +27,11 @@
> > > > > >  # ifdef USE_AS_WCSLEN
> > > > > >  #  define VPCMPEQ    vpcmpeqd
> > > > > >  #  define VPMINU     vpminud
> > > > > > +#  define CHAR_SIZE  4
> > > > > >  # else
> > > > > >  #  define VPCMPEQ    vpcmpeqb
> > > > > >  #  define VPMINU     vpminub
> > > > > > +#  define CHAR_SIZE  1
> > > > > >  # endif
> > > > > >
> > > > > >  # ifndef VZEROUPPER
> > > > > > @@ -41,349 +43,459 @@
> > > > > >  # endif
> > > > > >
> > > > > >  # define VEC_SIZE 32
> > > > > > +# define PAGE_SIZE 4096
> > > > > >
> > > > > >       .section SECTION(.text),"ax",@progbits
> > > > > >  ENTRY (STRLEN)
> > > > > >  # ifdef USE_AS_STRNLEN
> > > > > > -     /* Check for zero length.  */
> > > > > > +     /* Check zero length.  */
> > > > > >       test    %RSI_LP, %RSI_LP
> > > > > >       jz      L(zero)
> > > > > > +     /* Store max len in R8_LP before adjusting if using WCSLEN.  */
> > > > > > +     mov     %RSI_LP, %R8_LP
> > > > > >  #  ifdef USE_AS_WCSLEN
> > > > > >       shl     $2, %RSI_LP
> > > > > >  #  elif defined __ILP32__
> > > > > >       /* Clear the upper 32 bits.  */
> > > > > >       movl    %esi, %esi
> > > > > >  #  endif
> > > > > > -     mov     %RSI_LP, %R8_LP
> > > > > >  # endif
> > > > > > -     movl    %edi, %ecx
> > > > > > +     movl    %edi, %eax
> > > > > >       movq    %rdi, %rdx
> > > > > >       vpxor   %xmm0, %xmm0, %xmm0
> > > > > > -
> > > > > > +     /* Clear high bits from edi. Only keeping bits relevant to page
> > > > > > +        cross check.  */
> > > > > > +     andl    $(PAGE_SIZE - 1), %eax
> > > > > >       /* Check if we may cross page boundary with one vector load.  */
> > > > > > -     andl    $(2 * VEC_SIZE - 1), %ecx
> > > > > > -     cmpl    $VEC_SIZE, %ecx
> > > > > > -     ja      L(cros_page_boundary)
> > > > > > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > > > > > +     ja      L(cross_page_boundary)
> > > > > >
> > > > > >       /* Check the first VEC_SIZE bytes.  */
> > > > > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > > > > -     vpmovmskb %ymm1, %eax
> > > > > > -     testl   %eax, %eax
> > > > > > -
> > > > > > +     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > > > > +     vpmovmskb       %ymm1, %eax
> > > > > >  # ifdef USE_AS_STRNLEN
> > > > > > -     jnz     L(first_vec_x0_check)
> > > > > > -     /* Adjust length and check the end of data.  */
> > > > > > -     subq    $VEC_SIZE, %rsi
> > > > > > -     jbe     L(max)
> > > > > > -# else
> > > > > > -     jnz     L(first_vec_x0)
> > > > > > +     /* If length < VEC_SIZE handle special.  */
> > > > > > +     cmpq    $VEC_SIZE, %rsi
> > > > > > +     jbe     L(first_vec_x0)
> > > > > >  # endif
> > > > > > -
> > > > > > -     /* Align data for aligned loads in the loop.  */
> > > > > > -     addq    $VEC_SIZE, %rdi
> > > > > > -     andl    $(VEC_SIZE - 1), %ecx
> > > > > > -     andq    $-VEC_SIZE, %rdi
> > > > > > +     /* If empty continue to aligned_more. Otherwise return bit
> > > > > > +        position of first match.  */
> > > > > > +     testl   %eax, %eax
> > > > > > +     jz      L(aligned_more)
> > > > > > +     tzcntl  %eax, %eax
> > > > > > +# ifdef USE_AS_WCSLEN
> > > > > > +     shrl    $2, %eax
> > > > > > +# endif
> > > > > > +     VZEROUPPER_RETURN
> > > > > >
> > > > > >  # ifdef USE_AS_STRNLEN
> > > > > > -     /* Adjust length.  */
> > > > > > -     addq    %rcx, %rsi
> > > > > > +L(zero):
> > > > > > +     xorl    %eax, %eax
> > > > > > +     ret
> > > > > >
> > > > > > -     subq    $(VEC_SIZE * 4), %rsi
> > > > > > -     jbe     L(last_4x_vec_or_less)
> > > > > > +     .p2align 4
> > > > > > +L(first_vec_x0):
> > > > > > +     /* Set bit for max len so that tzcnt will return min of max len
> > > > > > +        and position of first match.  */
> > > > > > +     btsq    %rsi, %rax
> > > > > > +     tzcntl  %eax, %eax
> > > > > > +#  ifdef USE_AS_WCSLEN
> > > > > > +     shrl    $2, %eax
> > > > > > +#  endif
> > > > > > +     VZEROUPPER_RETURN
> > > > > >  # endif
> > > > > > -     jmp     L(more_4x_vec)
> > > > > >
> > > > > >       .p2align 4
> > > > > > -L(cros_page_boundary):
> > > > > > -     andl    $(VEC_SIZE - 1), %ecx
> > > > > > -     andq    $-VEC_SIZE, %rdi
> > > > > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > > > > -     vpmovmskb %ymm1, %eax
> > > > > > -     /* Remove the leading bytes.  */
> > > > > > -     sarl    %cl, %eax
> > > > > > -     testl   %eax, %eax
> > > > > > -     jz      L(aligned_more)
> > > > > > +L(first_vec_x1):
> > > > > >       tzcntl  %eax, %eax
> > > > > > +     /* Safe to use 32 bit instructions as these are only called for
> > > > > > +        size = [1, 159].  */
> > > > > >  # ifdef USE_AS_STRNLEN
> > > > > > -     /* Check the end of data.  */
> > > > > > -     cmpq    %rax, %rsi
> > > > > > -     jbe     L(max)
> > > > > > +     /* Use ecx which was computed earlier to compute correct value.
> > > > > > +      */
> > > > > > +     subl    $(VEC_SIZE * 4 + 1), %ecx
> > > > > > +     addl    %ecx, %eax
> > > > > > +# else
> > > > > > +     subl    %edx, %edi
> > > > > > +     incl    %edi
> > > > > > +     addl    %edi, %eax
> > > > > >  # endif
> > > > > > -     addq    %rdi, %rax
> > > > > > -     addq    %rcx, %rax
> > > > > > -     subq    %rdx, %rax
> > > > > >  # ifdef USE_AS_WCSLEN
> > > > > > -     shrq    $2, %rax
> > > > > > +     shrl    $2, %eax
> > > > > >  # endif
> > > > > > -L(return_vzeroupper):
> > > > > > -     ZERO_UPPER_VEC_REGISTERS_RETURN
> > > > > > +     VZEROUPPER_RETURN
> > > > > >
> > > > > >       .p2align 4
> > > > > > -L(aligned_more):
> > > > > > +L(first_vec_x2):
> > > > > > +     tzcntl  %eax, %eax
> > > > > > +     /* Safe to use 32 bit instructions as these are only called for
> > > > > > +        size = [1, 159].  */
> > > > > >  # ifdef USE_AS_STRNLEN
> > > > > > -        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
> > > > > > -         with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
> > > > > > -         to void possible addition overflow.  */
> > > > > > -     negq    %rcx
> > > > > > -     addq    $VEC_SIZE, %rcx
> > > > > > -
> > > > > > -     /* Check the end of data.  */
> > > > > > -     subq    %rcx, %rsi
> > > > > > -     jbe     L(max)
> > > > > > +     /* Use ecx which was computed earlier to compute correct value.
> > > > > > +      */
> > > > > > +     subl    $(VEC_SIZE * 3 + 1), %ecx
> > > > > > +     addl    %ecx, %eax
> > > > > > +# else
> > > > > > +     subl    %edx, %edi
> > > > > > +     addl    $(VEC_SIZE + 1), %edi
> > > > > > +     addl    %edi, %eax
> > > > > >  # endif
> > > > > > +# ifdef USE_AS_WCSLEN
> > > > > > +     shrl    $2, %eax
> > > > > > +# endif
> > > > > > +     VZEROUPPER_RETURN
> > > > > >
> > > > > > -     addq    $VEC_SIZE, %rdi
> > > > > > +     .p2align 4
> > > > > > +L(first_vec_x3):
> > > > > > +     tzcntl  %eax, %eax
> > > > > > +     /* Safe to use 32 bit instructions as these are only called for
> > > > > > +        size = [1, 159].  */
> > > > > > +# ifdef USE_AS_STRNLEN
> > > > > > +     /* Use ecx which was computed earlier to compute correct value.
> > > > > > +      */
> > > > > > +     subl    $(VEC_SIZE * 2 + 1), %ecx
> > > > > > +     addl    %ecx, %eax
> > > > > > +# else
> > > > > > +     subl    %edx, %edi
> > > > > > +     addl    $(VEC_SIZE * 2 + 1), %edi
> > > > > > +     addl    %edi, %eax
> > > > > > +# endif
> > > > > > +# ifdef USE_AS_WCSLEN
> > > > > > +     shrl    $2, %eax
> > > > > > +# endif
> > > > > > +     VZEROUPPER_RETURN
> > > > > >
> > > > > > +     .p2align 4
> > > > > > +L(first_vec_x4):
> > > > > > +     tzcntl  %eax, %eax
> > > > > > +     /* Safe to use 32 bit instructions as these are only called for
> > > > > > +        size = [1, 159].  */
> > > > > >  # ifdef USE_AS_STRNLEN
> > > > > > -     subq    $(VEC_SIZE * 4), %rsi
> > > > > > -     jbe     L(last_4x_vec_or_less)
> > > > > > +     /* Use ecx which was computed earlier to compute correct value.
> > > > > > +      */
> > > > > > +     subl    $(VEC_SIZE + 1), %ecx
> > > > > > +     addl    %ecx, %eax
> > > > > > +# else
> > > > > > +     subl    %edx, %edi
> > > > > > +     addl    $(VEC_SIZE * 3 + 1), %edi
> > > > > > +     addl    %edi, %eax
> > > > > >  # endif
> > > > > > +# ifdef USE_AS_WCSLEN
> > > > > > +     shrl    $2, %eax
> > > > > > +# endif
> > > > > > +     VZEROUPPER_RETURN
> > > > > >
> > > > > > -L(more_4x_vec):
> > > > > > +     .p2align 5
> > > > > > +L(aligned_more):
> > > > > > +     /* Align data to VEC_SIZE - 1. This is the same number of
> > > > > > +        instructions as using andq with -VEC_SIZE but saves 4 bytes of
> > > > > > +        code on the x4 check.  */
> > > > > > +     orq     $(VEC_SIZE - 1), %rdi
> > > > > > +L(cross_page_continue):
> > > > > >       /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > > > > >          since data is only aligned to VEC_SIZE.  */
> > > > > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > > > > -     vpmovmskb %ymm1, %eax
> > > > > > -     testl   %eax, %eax
> > > > > > -     jnz     L(first_vec_x0)
> > > > > > -
> > > > > > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > > > > > -     vpmovmskb %ymm1, %eax
> > > > > > +# ifdef USE_AS_STRNLEN
> > > > > > +     /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
> > > > > > +        it simplifies the logic in last_4x_vec_or_less.  */
> > > > > > +     leaq    (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
> > > > > > +     subq    %rdx, %rcx
> > > > > > +# endif
> > > > > > +     /* Load first VEC regardless.  */
> > > > > > +     VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > > > > > +# ifdef USE_AS_STRNLEN
> > > > > > +     /* Adjust length. If near end handle specially.  */
> > > > > > +     subq    %rcx, %rsi
> > > > > > +     jb      L(last_4x_vec_or_less)
> > > > > > +# endif
> > > > > > +     vpmovmskb       %ymm1, %eax
> > > > > >       testl   %eax, %eax
> > > > > >       jnz     L(first_vec_x1)
> > > > > >
> > > > > > -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > > > > > -     vpmovmskb %ymm1, %eax
> > > > > > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > > > > > +     vpmovmskb       %ymm1, %eax
> > > > > >       testl   %eax, %eax
> > > > > >       jnz     L(first_vec_x2)
> > > > > >
> > > > > > -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > > > > > -     vpmovmskb %ymm1, %eax
> > > > > > +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > > > > > +     vpmovmskb       %ymm1, %eax
> > > > > >       testl   %eax, %eax
> > > > > >       jnz     L(first_vec_x3)
> > > > > >
> > > > > > -     addq    $(VEC_SIZE * 4), %rdi
> > > > > > -
> > > > > > -# ifdef USE_AS_STRNLEN
> > > > > > -     subq    $(VEC_SIZE * 4), %rsi
> > > > > > -     jbe     L(last_4x_vec_or_less)
> > > > > > -# endif
> > > > > > -
> > > > > > -     /* Align data to 4 * VEC_SIZE.  */
> > > > > > -     movq    %rdi, %rcx
> > > > > > -     andl    $(4 * VEC_SIZE - 1), %ecx
> > > > > > -     andq    $-(4 * VEC_SIZE), %rdi
> > > > > > +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > > > > > +     vpmovmskb       %ymm1, %eax
> > > > > > +     testl   %eax, %eax
> > > > > > +     jnz     L(first_vec_x4)
> > > > > >
> > > > > > +     /* Align data to VEC_SIZE * 4 - 1.  */
> > > > > >  # ifdef USE_AS_STRNLEN
> > > > > > -     /* Adjust length.  */
> > > > > > +     /* Before adjusting length check if at last VEC_SIZE * 4.  */
> > > > > > +     cmpq    $(VEC_SIZE * 4 - 1), %rsi
> > > > > > +     jbe     L(last_4x_vec_or_less_load)
> > > > > > +     incq    %rdi
> > > > > > +     movl    %edi, %ecx
> > > > > > +     orq     $(VEC_SIZE * 4 - 1), %rdi
> > > > > > +     andl    $(VEC_SIZE * 4 - 1), %ecx
> > > > > > +     /* Readjust length.  */
> > > > > >       addq    %rcx, %rsi
> > > > > > +# else
> > > > > > +     incq    %rdi
> > > > > > +     orq     $(VEC_SIZE * 4 - 1), %rdi
> > > > > >  # endif
> > > > > > -
> > > > > > +     /* Compare 4 * VEC at a time forward.  */
> > > > > >       .p2align 4
> > > > > >  L(loop_4x_vec):
> > > > > > -     /* Compare 4 * VEC at a time forward.  */
> > > > > > -     vmovdqa (%rdi), %ymm1
> > > > > > -     vmovdqa VEC_SIZE(%rdi), %ymm2
> > > > > > -     vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
> > > > > > -     vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
> > > > > > -     VPMINU  %ymm1, %ymm2, %ymm5
> > > > > > -     VPMINU  %ymm3, %ymm4, %ymm6
> > > > > > -     VPMINU  %ymm5, %ymm6, %ymm5
> > > > > > -
> > > > > > -     VPCMPEQ %ymm5, %ymm0, %ymm5
> > > > > > -     vpmovmskb %ymm5, %eax
> > > > > > -     testl   %eax, %eax
> > > > > > -     jnz     L(4x_vec_end)
> > > > > > -
> > > > > > -     addq    $(VEC_SIZE * 4), %rdi
> > > > > > -
> > > > > > -# ifndef USE_AS_STRNLEN
> > > > > > -     jmp     L(loop_4x_vec)
> > > > > > -# else
> > > > > > +# ifdef USE_AS_STRNLEN
> > > > > > +     /* Break if at end of length.  */
> > > > > >       subq    $(VEC_SIZE * 4), %rsi
> > > > > > -     ja      L(loop_4x_vec)
> > > > > > -
> > > > > > -L(last_4x_vec_or_less):
> > > > > > -     /* Less than 4 * VEC and aligned to VEC_SIZE.  */
> > > > > > -     addl    $(VEC_SIZE * 2), %esi
> > > > > > -     jle     L(last_2x_vec)
> > > > > > +     jb      L(last_4x_vec_or_less_cmpeq)
> > > > > > +# endif
> > > > > > +     /* Save some code size by microfusing VPMINU with the load. Since
> > > > > > +        the matches in ymm2/ymm4 can only be returned if there were no
> > > > > > +        matches in ymm1/ymm3 respectively there is no issue with overlap.
> > > > > > +      */
> > > > > > +     vmovdqa 1(%rdi), %ymm1
> > > > > > +     VPMINU  (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
> > > > > > +     vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
> > > > > > +     VPMINU  (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
> > > > > > +
> > > > > > +     VPMINU  %ymm2, %ymm4, %ymm5
> > > > > > +     VPCMPEQ %ymm5, %ymm0, %ymm5
> > > > > > +     vpmovmskb       %ymm5, %ecx
> > > > > >
> > > > > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > > > > -     vpmovmskb %ymm1, %eax
> > > > > > -     testl   %eax, %eax
> > > > > > -     jnz     L(first_vec_x0)
> > > > > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > > > > +     testl   %ecx, %ecx
> > > > > > +     jz      L(loop_4x_vec)
> > > > > >
> > > > > > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > > > > > -     vpmovmskb %ymm1, %eax
> > > > > > -     testl   %eax, %eax
> > > > > > -     jnz     L(first_vec_x1)
> > > > > >
> > > > > > -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > > > > > -     vpmovmskb %ymm1, %eax
> > > > > > +     VPCMPEQ %ymm1, %ymm0, %ymm1
> > > > > > +     vpmovmskb       %ymm1, %eax
> > > > > > +     subq    %rdx, %rdi
> > > > > >       testl   %eax, %eax
> > > > > > +     jnz     L(last_vec_return_x0)
> > > > > >
> > > > > > -     jnz     L(first_vec_x2_check)
> > > > > > -     subl    $VEC_SIZE, %esi
> > > > > > -     jle     L(max)
> > > > > > -
> > > > > > -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > > > > > -     vpmovmskb %ymm1, %eax
> > > > > > +     VPCMPEQ %ymm2, %ymm0, %ymm2
> > > > > > +     vpmovmskb       %ymm2, %eax
> > > > > >       testl   %eax, %eax
> > > > > > -
> > > > > > -     jnz     L(first_vec_x3_check)
> > > > > > -     movq    %r8, %rax
> > > > > > -#  ifdef USE_AS_WCSLEN
> > > > > > +     jnz     L(last_vec_return_x1)
> > > > > > +
> > > > > > +     /* Combine last 2 VEC.  */
> > > > > > +     VPCMPEQ %ymm3, %ymm0, %ymm3
> > > > > > +     vpmovmskb       %ymm3, %eax
> > > > > > +     /* rcx has combined result from all 4 VEC. It will only be used if
> > > > > > +        the first 3 other VEC all did not contain a match.  */
> > > > > > +     salq    $32, %rcx
> > > > > > +     orq     %rcx, %rax
> > > > > > +     tzcntq  %rax, %rax
> > > > > > +     subq    $(VEC_SIZE * 2 - 1), %rdi
> > > > > > +     addq    %rdi, %rax
> > > > > > +# ifdef USE_AS_WCSLEN
> > > > > >       shrq    $2, %rax
> > > > > > -#  endif
> > > > > > +# endif
> > > > > >       VZEROUPPER_RETURN
> > > > > >
> > > > > > +
> > > > > > +# ifdef USE_AS_STRNLEN
> > > > > >       .p2align 4
> > > > > > -L(last_2x_vec):
> > > > > > -     addl    $(VEC_SIZE * 2), %esi
> > > > > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > > > > -     vpmovmskb %ymm1, %eax
> > > > > > -     testl   %eax, %eax
> > > > > > +L(last_4x_vec_or_less_load):
> > > > > > +     /* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
> > > > > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > > > > +L(last_4x_vec_or_less_cmpeq):
> > > > > > +     VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > > > > > +L(last_4x_vec_or_less):
> > > > > >
> > > > > > -     jnz     L(first_vec_x0_check)
> > > > > > -     subl    $VEC_SIZE, %esi
> > > > > > -     jle     L(max)
> > > > > > +     vpmovmskb       %ymm1, %eax
> > > > > > +     /* If remaining length > VEC_SIZE * 2. This works if esi is off by
> > > > > > +        VEC_SIZE * 4.  */
> > > > > > +     testl   $(VEC_SIZE * 2), %esi
> > > > > > +     jnz     L(last_4x_vec)
> > > > > >
> > > > > > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > > > > > -     vpmovmskb %ymm1, %eax
> > > > > > +     /* length may have been negative or positive by an offset of
> > > > > > +        VEC_SIZE * 4 depending on where this was called from. This fixes
> > > > > > +        that.  */
> > > > > > +     andl    $(VEC_SIZE * 4 - 1), %esi
> > > > > >       testl   %eax, %eax
> > > > > > -     jnz     L(first_vec_x1_check)
> > > > > > -     movq    %r8, %rax
> > > > > > -#  ifdef USE_AS_WCSLEN
> > > > > > -     shrq    $2, %rax
> > > > > > -#  endif
> > > > > > -     VZEROUPPER_RETURN
> > > > > > +     jnz     L(last_vec_x1_check)
> > > > > >
> > > > > > -     .p2align 4
> > > > > > -L(first_vec_x0_check):
> > > > > > +     subl    $VEC_SIZE, %esi
> > > > > > +     jb      L(max)
> > > > > > +
> > > > > > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > > > > > +     vpmovmskb       %ymm1, %eax
> > > > > >       tzcntl  %eax, %eax
> > > > > >       /* Check the end of data.  */
> > > > > > -     cmpq    %rax, %rsi
> > > > > > -     jbe     L(max)
> > > > > > +     cmpl    %eax, %esi
> > > > > > +     jb      L(max)
> > > > > > +     subq    %rdx, %rdi
> > > > > > +     addl    $(VEC_SIZE + 1), %eax
> > > > > >       addq    %rdi, %rax
> > > > > > -     subq    %rdx, %rax
> > > > > >  #  ifdef USE_AS_WCSLEN
> > > > > >       shrq    $2, %rax
> > > > > >  #  endif
> > > > > >       VZEROUPPER_RETURN
> > > > > > +# endif
> > > > > >
> > > > > >       .p2align 4
> > > > > > -L(first_vec_x1_check):
> > > > > > +L(last_vec_return_x0):
> > > > > >       tzcntl  %eax, %eax
> > > > > > -     /* Check the end of data.  */
> > > > > > -     cmpq    %rax, %rsi
> > > > > > -     jbe     L(max)
> > > > > > -     addq    $VEC_SIZE, %rax
> > > > > > +     subq    $(VEC_SIZE * 4 - 1), %rdi
> > > > > >       addq    %rdi, %rax
> > > > > > -     subq    %rdx, %rax
> > > > > > -#  ifdef USE_AS_WCSLEN
> > > > > > +# ifdef USE_AS_WCSLEN
> > > > > >       shrq    $2, %rax
> > > > > > -#  endif
> > > > > > +# endif
> > > > > >       VZEROUPPER_RETURN
> > > > > >
> > > > > >       .p2align 4
> > > > > > -L(first_vec_x2_check):
> > > > > > +L(last_vec_return_x1):
> > > > > >       tzcntl  %eax, %eax
> > > > > > -     /* Check the end of data.  */
> > > > > > -     cmpq    %rax, %rsi
> > > > > > -     jbe     L(max)
> > > > > > -     addq    $(VEC_SIZE * 2), %rax
> > > > > > +     subq    $(VEC_SIZE * 3 - 1), %rdi
> > > > > >       addq    %rdi, %rax
> > > > > > -     subq    %rdx, %rax
> > > > > > -#  ifdef USE_AS_WCSLEN
> > > > > > +# ifdef USE_AS_WCSLEN
> > > > > >       shrq    $2, %rax
> > > > > > -#  endif
> > > > > > +# endif
> > > > > >       VZEROUPPER_RETURN
> > > > > >
> > > > > > +# ifdef USE_AS_STRNLEN
> > > > > >       .p2align 4
> > > > > > -L(first_vec_x3_check):
> > > > > > +L(last_vec_x1_check):
> > > > > > +
> > > > > >       tzcntl  %eax, %eax
> > > > > >       /* Check the end of data.  */
> > > > > > -     cmpq    %rax, %rsi
> > > > > > -     jbe     L(max)
> > > > > > -     addq    $(VEC_SIZE * 3), %rax
> > > > > > +     cmpl    %eax, %esi
> > > > > > +     jb      L(max)
> > > > > > +     subq    %rdx, %rdi
> > > > > > +     incl    %eax
> > > > > >       addq    %rdi, %rax
> > > > > > -     subq    %rdx, %rax
> > > > > >  #  ifdef USE_AS_WCSLEN
> > > > > >       shrq    $2, %rax
> > > > > >  #  endif
> > > > > >       VZEROUPPER_RETURN
> > > > > >
> > > > > > -     .p2align 4
> > > > > >  L(max):
> > > > > >       movq    %r8, %rax
> > > > > > +     VZEROUPPER_RETURN
> > > > > > +
> > > > > > +     .p2align 4
> > > > > > +L(last_4x_vec):
> > > > > > +     /* Test first 2x VEC normally.  */
> > > > > > +     testl   %eax, %eax
> > > > > > +     jnz     L(last_vec_x1)
> > > > > > +
> > > > > > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > > > > > +     vpmovmskb       %ymm1, %eax
> > > > > > +     testl   %eax, %eax
> > > > > > +     jnz     L(last_vec_x2)
> > > > > > +
> > > > > > +     /* Normalize length.  */
> > > > > > +     andl    $(VEC_SIZE * 4 - 1), %esi
> > > > > > +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > > > > > +     vpmovmskb       %ymm1, %eax
> > > > > > +     testl   %eax, %eax
> > > > > > +     jnz     L(last_vec_x3)
> > > > > > +
> > > > > > +     subl    $(VEC_SIZE * 3), %esi
> > > > > > +     jb      L(max)
> > > > > > +
> > > > > > +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > > > > > +     vpmovmskb       %ymm1, %eax
> > > > > > +     tzcntl  %eax, %eax
> > > > > > +     /* Check the end of data.  */
> > > > > > +     cmpl    %eax, %esi
> > > > > > +     jb      L(max)
> > > > > > +     subq    %rdx, %rdi
> > > > > > +     addl    $(VEC_SIZE * 3 + 1), %eax
> > > > > > +     addq    %rdi, %rax
> > > > > >  #  ifdef USE_AS_WCSLEN
> > > > > >       shrq    $2, %rax
> > > > > >  #  endif
> > > > > >       VZEROUPPER_RETURN
> > > > > >
> > > > > > -     .p2align 4
> > > > > > -L(zero):
> > > > > > -     xorl    %eax, %eax
> > > > > > -     ret
> > > > > > -# endif
> > > > > >
> > > > > >       .p2align 4
> > > > > > -L(first_vec_x0):
> > > > > > +L(last_vec_x1):
> > > > > > +     /* essentially duplicates of first_vec_x1 but use 64 bit
> > > > > > +        instructions.  */
> > > > > >       tzcntl  %eax, %eax
> > > > > > +     subq    %rdx, %rdi
> > > > > > +     incl    %eax
> > > > > >       addq    %rdi, %rax
> > > > > > -     subq    %rdx, %rax
> > > > > > -# ifdef USE_AS_WCSLEN
> > > > > > +#  ifdef USE_AS_WCSLEN
> > > > > >       shrq    $2, %rax
> > > > > > -# endif
> > > > > > +#  endif
> > > > > >       VZEROUPPER_RETURN
> > > > > >
> > > > > >       .p2align 4
> > > > > > -L(first_vec_x1):
> > > > > > +L(last_vec_x2):
> > > > > > +     /* essentially duplicates of first_vec_x1 but use 64 bit
> > > > > > +        instructions.  */
> > > > > >       tzcntl  %eax, %eax
> > > > > > -     addq    $VEC_SIZE, %rax
> > > > > > +     subq    %rdx, %rdi
> > > > > > +     addl    $(VEC_SIZE + 1), %eax
> > > > > >       addq    %rdi, %rax
> > > > > > -     subq    %rdx, %rax
> > > > > > -# ifdef USE_AS_WCSLEN
> > > > > > +#  ifdef USE_AS_WCSLEN
> > > > > >       shrq    $2, %rax
> > > > > > -# endif
> > > > > > +#  endif
> > > > > >       VZEROUPPER_RETURN
> > > > > >
> > > > > >       .p2align 4
> > > > > > -L(first_vec_x2):
> > > > > > +L(last_vec_x3):
> > > > > >       tzcntl  %eax, %eax
> > > > > > -     addq    $(VEC_SIZE * 2), %rax
> > > > > > +     subl    $(VEC_SIZE * 2), %esi
> > > > > > +     /* Check the end of data.  */
> > > > > > +     cmpl    %eax, %esi
> > > > > > +     jb      L(max_end)
> > > > > > +     subq    %rdx, %rdi
> > > > > > +     addl    $(VEC_SIZE * 2 + 1), %eax
> > > > > >       addq    %rdi, %rax
> > > > > > -     subq    %rdx, %rax
> > > > > > -# ifdef USE_AS_WCSLEN
> > > > > > +#  ifdef USE_AS_WCSLEN
> > > > > >       shrq    $2, %rax
> > > > > > -# endif
> > > > > > +#  endif
> > > > > > +     VZEROUPPER_RETURN
> > > > > > +L(max_end):
> > > > > > +     movq    %r8, %rax
> > > > > >       VZEROUPPER_RETURN
> > > > > > +# endif
> > > > > >
> > > > > > +     /* Cold case for crossing page with first load.  */
> > > > > >       .p2align 4
> > > > > > -L(4x_vec_end):
> > > > > > -     VPCMPEQ %ymm1, %ymm0, %ymm1
> > > > > > -     vpmovmskb %ymm1, %eax
> > > > > > -     testl   %eax, %eax
> > > > > > -     jnz     L(first_vec_x0)
> > > > > > -     VPCMPEQ %ymm2, %ymm0, %ymm2
> > > > > > -     vpmovmskb %ymm2, %eax
> > > > > > +L(cross_page_boundary):
> > > > > > +     /* Align data to VEC_SIZE - 1.  */
> > > > > > +     orq     $(VEC_SIZE - 1), %rdi
> > > > > > +     VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
> > > > > > +     vpmovmskb       %ymm1, %eax
> > > > > > +     /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> > > > > > +        so no need to manually mod rdx.  */
> > > > > > +     sarxl   %edx, %eax, %eax
> > > > >
> > > > > This is a BMI2 instruction, which is not necessarily available when AVX2
> > > > > is available. This causes SIGILL on some CPUs. I have reported that in
> > > > > https://sourceware.org/bugzilla/show_bug.cgi?id=29611
> > > >
> > > > This is not a bug on master as:
> > > >
> > > > commit 83c5b368226c34a2f0a5287df40fc290b2b34359
> > > > Author: H.J. Lu <hjl.tools@gmail.com>
> > > > Date:   Mon Apr 19 10:45:07 2021 -0700
> > > >
> > > >     x86-64: Require BMI2 for strchr-avx2.S
> > > >
> > > > is already in tree. The issue is that the avx2 changes were backported
> > > > without H.J.'s changes.
> > > > >
> > > > > Regards
> > > > > Aurelien
> > > > >
> > > > > --
> > > > > Aurelien Jarno                          GPG: 4096R/1DDD8C9B
> > > > > aurelien@aurel32.net                 http://www.aurel32.net
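
For context, the failure mode described above is an illegal-instruction trap
rather than a wrong result. A minimal, hypothetical reproducer (not part of
any patch in this thread) that executes the same instruction class directly
looks like this; on a CPU with AVX2 but without BMI2 it dies with SIGILL:

#include <stdio.h>

/* sarx is a BMI2 instruction.  Executing it on a CPU without BMI2 raises
   SIGILL, which is the crash reported in BZ #29611.  This mirrors the
   "sarxl %edx, %eax, %eax" used by the rewritten strlen-avx2.S.  */
static unsigned int
sarx_shift (unsigned int value, unsigned int count)
{
  unsigned int result;
  __asm__ ("sarxl %2, %1, %0" : "=r" (result) : "r" (value), "r" (count));
  return result;
}

int
main (void)
{
  /* Arithmetic shift right by 4; traps before printing on non-BMI2 CPUs.  */
  printf ("%u\n", sarx_shift (0x80000000u, 4));
  return 0;
}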
H.J. Lu Sept. 28, 2022, 6:23 p.m. UTC | #9
On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> Attached patch fixes BZ# 29611.
>
> I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
> if there is any objection.

It doesn't fully fix BZ #29611.  Like Noah mentioned, we need to add
BMI2 check to ifunc-impl-list.c for all functions which use "ifunc-avx2.h".


H.J.
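
The gate being requested has the following shape. This sketch uses GCC's
__builtin_cpu_supports purely for illustration; glibc's real ifunc-avx2.h and
ifunc-impl-list.c express the same condition with CPU_FEATURE_USABLE, and the
function names below are stand-ins rather than the actual implementations:

#include <stddef.h>
#include <string.h>

typedef size_t (*strlen_fn) (const char *);

/* Stand-ins for __strlen_avx2 (the AVX2 version, which now executes sarx)
   and __strlen_sse2 (the baseline fallback).  */
static size_t strlen_avx2_stub (const char *s) { return strlen (s); }
static size_t strlen_sse2_stub (const char *s) { return strlen (s); }

/* The point of the fix: the AVX2 variant must only be selected when BMI2
   is also usable.  */
static strlen_fn
select_strlen (void)
{
  if (__builtin_cpu_supports ("avx2") && __builtin_cpu_supports ("bmi2"))
    return strlen_avx2_stub;
  return strlen_sse2_stub;
}

The same condition has to appear both in the selector header and in the
ifunc-impl-list.c entries, so that the test harness only exercises the AVX2
variants on machines that can run them.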
H.J. Lu Sept. 28, 2022, 6:24 p.m. UTC | #10
On Wed, Sep 28, 2022 at 8:00 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, Sep 28, 2022 at 7:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> > On Wed, Sep 28, 2022 at 7:42 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
> > > >
> > > > Attached patch fixes BZ# 29611.
> > > >
> > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
> > > > if there is any objection.
> > > The ifunc-impl-list changes are missing BMI2 for memchr-avx2.S
> > >
> > > Can you post these as separate emails with the patches embedded instead of
> > > attached?
> > >
> > > >
> >
> > Patches are also posted on bug report 29611.
> >
> > https://sourceware.org/bugzilla/show_bug.cgi?id=29611
>
> is there a mailing list for backport patches like this?

It is libc-stable.
Sunil Pandey Sept. 28, 2022, 7:09 p.m. UTC | #11
On Wed, Sep 28, 2022 at 11:24 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> > Attached patch fixes BZ# 29611.
> >
> > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
> > if there is any objection.
>
> It doesn't fully fix BZ #29611.  Like Noah mentioned, we need to add
> BMI2 check to ifunc-impl-list.c for all functions which use "ifunc-avx2.h".
>
>
> H.J.

Pulling up the corresponding patches is extremely difficult as they are not
modular. I can modify the existing patches (as posted on the bug report) to
incorporate the ifunc-impl-list.c changes. Is that OK?

For backporting, small incremental changes are preferred. A single monolithic
patch makes backporting extremely difficult, if not impossible.
H.J. Lu Sept. 28, 2022, 7:23 p.m. UTC | #12
On Wed, Sep 28, 2022 at 12:09 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Wed, Sep 28, 2022 at 11:24 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
> > >
> > > Attached patch fixes BZ# 29611.
> > >
> > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
> > > if there is any objection.
> >
> > It doesn't fully fix BZ #29611.  Like Noah mentioned, we need to add
> > BMI2 check to ifunc-impl-list.c for all functions which use "ifunc-avx2.h".
> >
> >
> > H.J.
>
> Pulling up the corresponding patches is extremely difficult as they are not
> modular. I can modify the existing patches (as posted on the bug report) to
> incorporate the ifunc-impl-list.c changes. Is that OK?

Please mention BZ #29611 in the commit log of the backport and submit
a separate patch to fully fix BZ #29611.  We should use a patch set for
BZ #29611.

> For backporting, small incremental changes are preferred. A single monolithic
> patch makes backporting extremely difficult, if not impossible.
Aurelien Jarno Oct. 4, 2022, 9:19 p.m. UTC | #15
On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote:
> Attached patch fixes BZ# 29611.
> 
> I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
> if there is any objection.

Sorry to be late on this. I have a few comments about that patch:

> From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001
> From: "H.J. Lu" <hjl.tools@gmail.com>
> Date: Mon, 19 Apr 2021 10:45:07 -0700
> Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S
> 
> Since strchr-avx2.S updated by
> 
> commit 1f745ecc2109890886b161d4791e1406fdfc29b8
> Author: noah <goldstein.w.n@gmail.com>
> Date:   Wed Feb 3 00:38:59 2021 -0500
> 
>     x86-64: Refactor and improve performance of strchr-avx2.S
> 
> uses sarx:
> 
> c4 e2 72 f7 c0       	sarx   %ecx,%eax,%eax
> 
> for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and
> ifunc-avx2.h.
> 
> (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359)
> ---
>  sysdeps/x86_64/multiarch/ifunc-avx2.h      |  4 ++--
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
>  2 files changed, 11 insertions(+), 5 deletions(-)

First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got
backported to 2.32 and older branches, and strchr-avx2.S in those
branches do not use BMI2 instructions. So it doesn't make sense to
backport it.

That said the change in ifunc-avx2.h fixes:

- memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86:
  Optimize memchr-avx2.S")
- strlen and strnlen, broken by the backport of aaa23c350715 ("x86:
  Optimize strlen-avx2.S")

So the issues are fixed, but mostly by chance.

NB: at this stage, I haven't verified the consistency of the ifunc
selectors with ifunc-impl-list.c.
H.J. Lu Oct. 4, 2022, 9:29 p.m. UTC | #16
On Tue, Oct 4, 2022 at 2:20 PM Aurelien Jarno <aurelien@aurel32.net> wrote:
>
> On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote:
> > Attached patch fixes BZ# 29611.
> >
> > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
> > if there is any objection.
>
> Sorry to be late on this. I have a few comments about that patch:
>
> > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001
> > From: "H.J. Lu" <hjl.tools@gmail.com>
> > Date: Mon, 19 Apr 2021 10:45:07 -0700
> > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S
> >
> > Since strchr-avx2.S updated by
> >
> > commit 1f745ecc2109890886b161d4791e1406fdfc29b8
> > Author: noah <goldstein.w.n@gmail.com>
> > Date:   Wed Feb 3 00:38:59 2021 -0500
> >
> >     x86-64: Refactor and improve performance of strchr-avx2.S
> >
> > uses sarx:
> >
> > c4 e2 72 f7 c0        sarx   %ecx,%eax,%eax
> >
> > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and
> > ifunc-avx2.h.
> >
> > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359)
> > ---
> >  sysdeps/x86_64/multiarch/ifunc-avx2.h      |  4 ++--
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
> >  2 files changed, 11 insertions(+), 5 deletions(-)
>
> First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got
> backported to 2.32 and older branches, and strchr-avx2.S in those
> branches do not use BMI2 instructions. So it doesn't make sense to
> backport it.
>
> That said the change in ifunc-avx2.h fixes:
>
> - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86:
>   Optimize memchr-avx2.S")
> - strlen and strnlen, broken by the backport of aaa23c350715 ("x86:
>   Optimize strlen-avx2.S")
>
> So the issues are fixed, but mostly by chance.
>
> NB: at this stage, I haven't verified the consistency of the ifunc
> selectors with ifunc-impl-list.c.
>

Changes to ifunc-impl-list.c aren't strictly needed since strchr functions
don't use BMI2.  AVX2 strchr functions are still tested on machines with
AVX2 and BMI2.
Sunil Pandey Oct. 5, 2022, 1:10 a.m. UTC | #17
On Tue, Oct 4, 2022 at 2:20 PM Aurelien Jarno <aurelien@aurel32.net> wrote:
>
> On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote:
> > Attached patch fixes BZ# 29611.
> >
> > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
> > if there is any objection.
>
> Sorry to be late on this. I have a few comments about that patch:
>
> > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001
> > From: "H.J. Lu" <hjl.tools@gmail.com>
> > Date: Mon, 19 Apr 2021 10:45:07 -0700
> > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S
> >
> > Since strchr-avx2.S updated by
> >
> > commit 1f745ecc2109890886b161d4791e1406fdfc29b8
> > Author: noah <goldstein.w.n@gmail.com>
> > Date:   Wed Feb 3 00:38:59 2021 -0500
> >
> >     x86-64: Refactor and improve performance of strchr-avx2.S
> >
> > uses sarx:
> >
> > c4 e2 72 f7 c0        sarx   %ecx,%eax,%eax
> >
> > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and
> > ifunc-avx2.h.
> >
> > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359)
> > ---
> >  sysdeps/x86_64/multiarch/ifunc-avx2.h      |  4 ++--
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
> >  2 files changed, 11 insertions(+), 5 deletions(-)
>
> First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got
> backported to 2.32 and older branches, and strchr-avx2.S in those
> branches do not use BMI2 instructions. So it doesn't make sense to
> backport it.
>
> That said the change in ifunc-avx2.h fixes:
>
> - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86:
>   Optimize memchr-avx2.S")
> - strlen and strnlen, broken by the backport of aaa23c350715 ("x86:
>   Optimize strlen-avx2.S")
>
> So the issues are fixed, but mostly by chance.

How do you know it is a "by chance" fix, do you have any evidence to back
your claim?

>
> NB: at this stage, I haven't verified the consistency of the ifunc
> selectors with ifunc-impl-list.c.
>
> --
> Aurelien Jarno                          GPG: 4096R/1DDD8C9B
> aurelien@aurel32.net                 http://www.aurel32.net
Noah Goldstein Oct. 5, 2022, 2:23 p.m. UTC | #18
On Tue, Oct 4, 2022 at 6:11 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Tue, Oct 4, 2022 at 2:20 PM Aurelien Jarno <aurelien@aurel32.net> wrote:
> >
> > On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote:
> > > Attached patch fixes BZ# 29611.
> > >
> > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
> > > if there is any objection.
> >
> > Sorry to be late on this. I have a few comments about that patch:
> >
> > > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001
> > > From: "H.J. Lu" <hjl.tools@gmail.com>
> > > Date: Mon, 19 Apr 2021 10:45:07 -0700
> > > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S
> > >
> > > Since strchr-avx2.S updated by
> > >
> > > commit 1f745ecc2109890886b161d4791e1406fdfc29b8
> > > Author: noah <goldstein.w.n@gmail.com>
> > > Date:   Wed Feb 3 00:38:59 2021 -0500
> > >
> > >     x86-64: Refactor and improve performance of strchr-avx2.S
> > >
> > > uses sarx:
> > >
> > > c4 e2 72 f7 c0        sarx   %ecx,%eax,%eax
> > >
> > > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and
> > > ifunc-avx2.h.
> > >
> > > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359)
> > > ---
> > >  sysdeps/x86_64/multiarch/ifunc-avx2.h      |  4 ++--
> > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
> > >  2 files changed, 11 insertions(+), 5 deletions(-)
> >
> > First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got
> > backported to 2.32 and older branches, and strchr-avx2.S in those
> > branches do not use BMI2 instructions. So it doesn't make sense to
> > backport it.
> >
> > That said the change in ifunc-avx2.h fixes:
> >
> > - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86:
> >   Optimize memchr-avx2.S")
> > - strlen and strnlen, broken by the backport of aaa23c350715 ("x86:
> >   Optimize strlen-avx2.S")
> >
> > So the issues are fixed, but mostly by chance.
>
> How do you know it is a "by chance" fix, do you have any evidence to back
> your claim?

There might not be evidence about the intention of the authors but clearly
the strchr commit message does not clarify that it also fixes memchr/strlen.
>
> >
> > NB: at this stage, I haven't verified the consistency of the ifunc
> > selectors with ifunc-impl-list.c.
> >
> > --
> > Aurelien Jarno                          GPG: 4096R/1DDD8C9B
> > aurelien@aurel32.net                 http://www.aurel32.net
Sunil Pandey Oct. 5, 2022, 4:35 p.m. UTC | #19
On Wed, Oct 5, 2022 at 7:23 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Tue, Oct 4, 2022 at 6:11 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> > On Tue, Oct 4, 2022 at 2:20 PM Aurelien Jarno <aurelien@aurel32.net> wrote:
> > >
> > > On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote:
> > > > Attached patch fixes BZ# 29611.
> > > >
> > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
> > > > if there is any objection.
> > >
> > > Sorry to be late on this. I have a few comments about that patch:
> > >
> > > > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001
> > > > From: "H.J. Lu" <hjl.tools@gmail.com>
> > > > Date: Mon, 19 Apr 2021 10:45:07 -0700
> > > > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S
> > > >
> > > > Since strchr-avx2.S updated by
> > > >
> > > > commit 1f745ecc2109890886b161d4791e1406fdfc29b8
> > > > Author: noah <goldstein.w.n@gmail.com>
> > > > Date:   Wed Feb 3 00:38:59 2021 -0500
> > > >
> > > >     x86-64: Refactor and improve performance of strchr-avx2.S
> > > >
> > > > uses sarx:
> > > >
> > > > c4 e2 72 f7 c0        sarx   %ecx,%eax,%eax
> > > >
> > > > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and
> > > > ifunc-avx2.h.
> > > >
> > > > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359)
> > > > ---
> > > >  sysdeps/x86_64/multiarch/ifunc-avx2.h      |  4 ++--
> > > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
> > > >  2 files changed, 11 insertions(+), 5 deletions(-)
> > >
> > > First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got
> > > backported to 2.32 and older branches, and strchr-avx2.S in those
> > > branches do not use BMI2 instructions. So it doesn't make sense to
> > > backport it.
> > >
> > > That said the change in ifunc-avx2.h fixes:
> > >
> > > - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86:
> > >   Optimize memchr-avx2.S")
> > > - strlen and strnlen, broken by the backport of aaa23c350715 ("x86:
> > >   Optimize strlen-avx2.S")
> > >
> > > So the issues are fixed, but mostly by chance.
> >
> > How do you know it is a "by chance" fix, do you have any evidence to back
> > your claim?
>
> There might not be evidence about the intention of the authors but clearly
> the strchr commit message does not clarify that it also fixes memchr/strlen.

The ifunc-avx2.h header file is used by many functions, so a fix in
ifunc-avx2.h fixes all those functions too. It's not "by chance"; I scanned
all the functions where ifunc-avx2.h is used before backporting it.

Since this is a backport commit and no extra changes are made, there is no
need to modify the original author's commit message.

> >
> > >
> > > NB: at this stage, I haven't verified the consistency of the ifunc
> > > selectors with ifunc-impl-list.c.
> > >
> > > --
> > > Aurelien Jarno                          GPG: 4096R/1DDD8C9B
> > > aurelien@aurel32.net                 http://www.aurel32.net
Aurelien Jarno Oct. 5, 2022, 5:11 p.m. UTC | #20
On 2022-10-04 18:10, Sunil Pandey via Libc-alpha wrote:
> On Tue, Oct 4, 2022 at 2:20 PM Aurelien Jarno <aurelien@aurel32.net> wrote:
> >
> > On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote:
> > > Attached patch fixes BZ# 29611.
> > >
> > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
> > > if there is any objection.
> >
> > Sorry to be late on this. I have a few comments about that patch:
> >
> > > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001
> > > From: "H.J. Lu" <hjl.tools@gmail.com>
> > > Date: Mon, 19 Apr 2021 10:45:07 -0700
> > > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S
> > >
> > > Since strchr-avx2.S updated by
> > >
> > > commit 1f745ecc2109890886b161d4791e1406fdfc29b8
> > > Author: noah <goldstein.w.n@gmail.com>
> > > Date:   Wed Feb 3 00:38:59 2021 -0500
> > >
> > >     x86-64: Refactor and improve performance of strchr-avx2.S
> > >
> > > uses sarx:
> > >
> > > c4 e2 72 f7 c0        sarx   %ecx,%eax,%eax
> > >
> > > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and
> > > ifunc-avx2.h.
> > >
> > > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359)
> > > ---
> > >  sysdeps/x86_64/multiarch/ifunc-avx2.h      |  4 ++--
> > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
> > >  2 files changed, 11 insertions(+), 5 deletions(-)
> >
> > First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got
> > backported to 2.32 and older branches, and strchr-avx2.S in those
> > branches do not use BMI2 instructions. So it doesn't make sense to
> > backport it.
> >
> > That said the change in ifunc-avx2.h fixes:
> >
> > - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86:
> >   Optimize memchr-avx2.S")
> > - strlen and strnlen, broken by the backport of aaa23c350715 ("x86:
> >   Optimize strlen-avx2.S")
> >
> > So the issues are fixed, but mostly by chance.
> 
> How do you know it is a "by chance" fix, do you have any evidence to back
> your claim?

My point is that the commit that has been backported is fixing a bug
that doesn't exist in the 2.32 and older branches. strchr-avx2.S does not
use the sarx instruction as the commit claims, and does not use other
BMI2 instructions either.

However, following the backport of commits acfd088a1963 and aaa23c350715
in these branches, memchr-avx2.S and strlen-avx2.S use BMI2
instructions, and as they use ifunc-avx2.h, this actually fixes the bug.
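
Whether a given machine falls in the affected class (AVX2 advertised, BMI2
not) can be read straight from CPUID leaf 7. A small hypothetical checker,
which ignores the XSAVE/OS-support half of glibc's "usable" logic:

#include <cpuid.h>
#include <stdio.h>

int
main (void)
{
  unsigned int eax, ebx, ecx, edx;

  /* CPUID.(EAX=7,ECX=0):EBX -- AVX2 is bit 5, BMI2 is bit 8.  */
  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
    return 1;

  int avx2 = (ebx >> 5) & 1;
  int bmi2 = (ebx >> 8) & 1;
  printf ("AVX2=%d BMI2=%d%s\n", avx2, bmi2,
          (avx2 && !bmi2) ? " (affected by the broken backports)" : "");
  return 0;
}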
Sunil Pandey Oct. 5, 2022, 6:34 p.m. UTC | #21
On Wed, Oct 5, 2022 at 10:11 AM Aurelien Jarno <aurelien@aurel32.net> wrote:
>
> On 2022-10-04 18:10, Sunil Pandey via Libc-alpha wrote:
> > On Tue, Oct 4, 2022 at 2:20 PM Aurelien Jarno <aurelien@aurel32.net> wrote:
> > >
> > > On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote:
> > > > Attached patch fixes BZ# 29611.
> > > >
> > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
> > > > if there is any objection.
> > >
> > > Sorry to be late on this. I have a few comments about that patch:
> > >
> > > > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001
> > > > From: "H.J. Lu" <hjl.tools@gmail.com>
> > > > Date: Mon, 19 Apr 2021 10:45:07 -0700
> > > > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S
> > > >
> > > > Since strchr-avx2.S updated by
> > > >
> > > > commit 1f745ecc2109890886b161d4791e1406fdfc29b8
> > > > Author: noah <goldstein.w.n@gmail.com>
> > > > Date:   Wed Feb 3 00:38:59 2021 -0500
> > > >
> > > >     x86-64: Refactor and improve performance of strchr-avx2.S
> > > >
> > > > uses sarx:
> > > >
> > > > c4 e2 72 f7 c0        sarx   %ecx,%eax,%eax
> > > >
> > > > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and
> > > > ifunc-avx2.h.
> > > >
> > > > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359)
> > > > ---
> > > >  sysdeps/x86_64/multiarch/ifunc-avx2.h      |  4 ++--
> > > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
> > > >  2 files changed, 11 insertions(+), 5 deletions(-)
> > >
> > > First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got
> > > backported to 2.32 and older branches, and strchr-avx2.S in those
> > > branches do not use BMI2 instructions. So it doesn't make sense to
> > > backport it.
> > >
> > > That said the change in ifunc-avx2.h fixes:
> > >
> > > - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86:
> > >   Optimize memchr-avx2.S")
> > > - strlen and strnlen, broken by the backport of aaa23c350715 ("x86:
> > >   Optimize strlen-avx2.S")
> > >
> > > So the issues are fixed, but mostly by chance.
> >
> > How do you know it is a "by chance" fix, do you have any evidence to back
> > your claim?
>
> My point is that the commit that has been backported is fixing a bug
> that doesn't exist in the 2.32 and older branches. strchr-avx2.S does not
> use the sarx instruction as the commit claims, and does not use other
> BMI2 instructions either.
>
> However, following the backport of commits acfd088a1963 and aaa23c350715
> in these branches, memchr-avx2.S and strlen-avx2.S use BMI2
> instructions, and as they use ifunc-avx2.h, this actually fixes the bug.
>

This patch got selected because it fixes the ifunc-avx2.h file. My preference
 is to take an existing patch if possible, rather than creating a new one for
 branches.

You are right, the original patch should have been composed differently to
make it crystal clear.

For backporting it's preferable to have small independent patches with
logical grouping.


> --
> Aurelien Jarno                          GPG: 4096R/1DDD8C9B
> aurelien@aurel32.net                 http://www.aurel32.net
diff mbox series

Patch

diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index c377cab629..651b32908e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -293,10 +293,12 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/strlen.c.  */
   IFUNC_IMPL (i, name, strlen,
 	      IFUNC_IMPL_ADD (array, i, strlen,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strlen_avx2)
 	      IFUNC_IMPL_ADD (array, i, strlen,
 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __strlen_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strlen,
@@ -309,10 +311,12 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
   IFUNC_IMPL (i, name, strnlen,
 	      IFUNC_IMPL_ADD (array, i, strnlen,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strnlen_avx2)
 	      IFUNC_IMPL_ADD (array, i, strnlen,
 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __strnlen_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strnlen,
@@ -654,10 +658,12 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
   IFUNC_IMPL (i, name, wcslen,
 	      IFUNC_IMPL_ADD (array, i, wcslen,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcslen_avx2)
 	      IFUNC_IMPL_ADD (array, i, wcslen,
 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __wcslen_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, wcslen,
@@ -670,10 +676,12 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
   IFUNC_IMPL (i, name, wcsnlen,
 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcsnlen_avx2)
 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __wcsnlen_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index 1caae9e6bc..bd2e6ee44a 100644
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -27,9 +27,11 @@ 
 # ifdef USE_AS_WCSLEN
 #  define VPCMPEQ	vpcmpeqd
 #  define VPMINU	vpminud
+#  define CHAR_SIZE	4
 # else
 #  define VPCMPEQ	vpcmpeqb
 #  define VPMINU	vpminub
+#  define CHAR_SIZE	1
 # endif
 
 # ifndef VZEROUPPER
@@ -41,349 +43,459 @@ 
 # endif
 
 # define VEC_SIZE 32
+# define PAGE_SIZE 4096
 
 	.section SECTION(.text),"ax",@progbits
 ENTRY (STRLEN)
 # ifdef USE_AS_STRNLEN
-	/* Check for zero length.  */
+	/* Check zero length.  */
 	test	%RSI_LP, %RSI_LP
 	jz	L(zero)
+	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
+	mov	%RSI_LP, %R8_LP
 #  ifdef USE_AS_WCSLEN
 	shl	$2, %RSI_LP
 #  elif defined __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%esi, %esi
 #  endif
-	mov	%RSI_LP, %R8_LP
 # endif
-	movl	%edi, %ecx
+	movl	%edi, %eax
 	movq	%rdi, %rdx
 	vpxor	%xmm0, %xmm0, %xmm0
-
+	/* Clear high bits from edi. Only keeping bits relevant to page
+	   cross check.  */
+	andl	$(PAGE_SIZE - 1), %eax
 	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.  */
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-
+	VPCMPEQ	(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
 # ifdef USE_AS_STRNLEN
-	jnz	L(first_vec_x0_check)
-	/* Adjust length and check the end of data.  */
-	subq	$VEC_SIZE, %rsi
-	jbe	L(max)
-# else
-	jnz	L(first_vec_x0)
+	/* If length < VEC_SIZE handle special.  */
+	cmpq	$VEC_SIZE, %rsi
+	jbe	L(first_vec_x0)
 # endif
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
+	/* If empty continue to aligned_more. Otherwise return bit
+	   position of first match.  */
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+# endif
+	VZEROUPPER_RETURN
 
 # ifdef USE_AS_STRNLEN
-	/* Adjust length.  */
-	addq	%rcx, %rsi
+L(zero):
+	xorl	%eax, %eax
+	ret
 
-	subq	$(VEC_SIZE * 4), %rsi
-	jbe	L(last_4x_vec_or_less)
+	.p2align 4
+L(first_vec_x0):
+	/* Set bit for max len so that tzcnt will return min of max len
+	   and position of first match.  */
+	btsq	%rsi, %rax
+	tzcntl	%eax, %eax
+#  ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+#  endif
+	VZEROUPPER_RETURN
 # endif
-	jmp	L(more_4x_vec)
 
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	/* Remove the leading bytes.  */
-	sarl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(aligned_more)
+L(first_vec_x1):
 	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
 # ifdef USE_AS_STRNLEN
-	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	subl	$(VEC_SIZE * 4 + 1), %ecx
+	addl	%ecx, %eax
+# else
+	subl	%edx, %edi
+	incl	%edi
+	addl	%edi, %eax
 # endif
-	addq	%rdi, %rax
-	addq	%rcx, %rax
-	subq	%rdx, %rax
 # ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
+	shrl	$2, %eax
 # endif
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
+	VZEROUPPER_RETURN
 
 	.p2align 4
-L(aligned_more):
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
 # ifdef USE_AS_STRNLEN
-        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
-	    with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
-	    to void possible addition overflow.  */
-	negq	%rcx
-	addq	$VEC_SIZE, %rcx
-
-	/* Check the end of data.  */
-	subq	%rcx, %rsi
-	jbe	L(max)
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	subl	$(VEC_SIZE * 3 + 1), %ecx
+	addl	%ecx, %eax
+# else
+	subl	%edx, %edi
+	addl	$(VEC_SIZE + 1), %edi
+	addl	%edi, %eax
 # endif
+# ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+# endif
+	VZEROUPPER_RETURN
 
-	addq	$VEC_SIZE, %rdi
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+# ifdef USE_AS_STRNLEN
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	subl	$(VEC_SIZE * 2 + 1), %ecx
+	addl	%ecx, %eax
+# else
+	subl	%edx, %edi
+	addl	$(VEC_SIZE * 2 + 1), %edi
+	addl	%edi, %eax
+# endif
+# ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+# endif
+	VZEROUPPER_RETURN
 
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
 # ifdef USE_AS_STRNLEN
-	subq	$(VEC_SIZE * 4), %rsi
-	jbe	L(last_4x_vec_or_less)
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	subl	$(VEC_SIZE + 1), %ecx
+	addl	%ecx, %eax
+# else
+	subl	%edx, %edi
+	addl	$(VEC_SIZE * 3 + 1), %edi
+	addl	%edi, %eax
 # endif
+# ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+# endif
+	VZEROUPPER_RETURN
 
-L(more_4x_vec):
+	.p2align 5
+L(aligned_more):
+	/* Align data to VEC_SIZE - 1. This is the same number of
+	   instructions as using andq with -VEC_SIZE but saves 4 bytes of
+	   code on the x4 check.  */
+	orq	$(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+# ifdef USE_AS_STRNLEN
+	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
+	   it simplifies the logic in last_4x_vec_or_less.  */
+	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
+	subq	%rdx, %rcx
+# endif
+	/* Load first VEC regardless.  */
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+# ifdef USE_AS_STRNLEN
+	/* Adjust length. If near end handle specially.  */
+	subq	%rcx, %rsi
+	jb	L(last_4x_vec_or_less)
+# endif
+	vpmovmskb	%ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
-	addq	$(VEC_SIZE * 4), %rdi
-
-# ifdef USE_AS_STRNLEN
-	subq	$(VEC_SIZE * 4), %rsi
-	jbe	L(last_4x_vec_or_less)
-# endif
-
-	/* Align data to 4 * VEC_SIZE.  */
-	movq	%rdi, %rcx
-	andl	$(4 * VEC_SIZE - 1), %ecx
-	andq	$-(4 * VEC_SIZE), %rdi
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
 
+	/* Align data to VEC_SIZE * 4 - 1.  */
 # ifdef USE_AS_STRNLEN
-	/* Adjust length.  */
+	/* Before adjusting length check if at last VEC_SIZE * 4.  */
+	cmpq	$(VEC_SIZE * 4 - 1), %rsi
+	jbe	L(last_4x_vec_or_less_load)
+	incq	%rdi
+	movl	%edi, %ecx
+	orq	$(VEC_SIZE * 4 - 1), %rdi
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+	/* Readjust length.  */
 	addq	%rcx, %rsi
+# else
+	incq	%rdi
+	orq	$(VEC_SIZE * 4 - 1), %rdi
 # endif
-
+	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	vmovdqa (%rdi), %ymm1
-	vmovdqa	VEC_SIZE(%rdi), %ymm2
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4
-	VPMINU	%ymm1, %ymm2, %ymm5
-	VPMINU	%ymm3, %ymm4, %ymm6
-	VPMINU	%ymm5, %ymm6, %ymm5
-
-	VPCMPEQ	%ymm5, %ymm0, %ymm5
-	vpmovmskb %ymm5, %eax
-	testl	%eax, %eax
-	jnz	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
-
-# ifndef USE_AS_STRNLEN
-	jmp	L(loop_4x_vec)
-# else
+# ifdef USE_AS_STRNLEN
+	/* Break if at end of length.  */
 	subq	$(VEC_SIZE * 4), %rsi
-	ja	L(loop_4x_vec)
-
-L(last_4x_vec_or_less):
-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
-	addl	$(VEC_SIZE * 2), %esi
-	jle	L(last_2x_vec)
+	jb	L(last_4x_vec_or_less_cmpeq)
+# endif
+	/* Save some code size by microfusing VPMINU with the load. Since
+	   the matches in ymm2/ymm4 can only be returned if there where no
+	   the matches in ymm2/ymm4 can only be returned if there were no
+	 */
+	vmovdqa	1(%rdi), %ymm1
+	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
+	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
+	VPMINU	(VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
+
+	VPMINU	%ymm2, %ymm4, %ymm5
+	VPCMPEQ	%ymm5, %ymm0, %ymm5
+	vpmovmskb	%ymm5, %ecx
 
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+	jz	L(loop_4x_vec)
 
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
 
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	%ymm1, %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	subq	%rdx, %rdi
 	testl	%eax, %eax
+	jnz	L(last_vec_return_x0)
 
-	jnz	L(first_vec_x2_check)
-	subl	$VEC_SIZE, %esi
-	jle	L(max)
-
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	%ymm2, %ymm0, %ymm2
+	vpmovmskb	%ymm2, %eax
 	testl	%eax, %eax
-
-	jnz	L(first_vec_x3_check)
-	movq	%r8, %rax
-#  ifdef USE_AS_WCSLEN
+	jnz	L(last_vec_return_x1)
+
+	/* Combine last 2 VEC.  */
+	VPCMPEQ	%ymm3, %ymm0, %ymm3
+	vpmovmskb	%ymm3, %eax
+	/* rcx has combined result from all 4 VEC. It will only be used if
+	   the first 3 other VEC all did not contain a match.  */
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+	subq	$(VEC_SIZE * 2 - 1), %rdi
+	addq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
-#  endif
+# endif
 	VZEROUPPER_RETURN
 
+
+# ifdef USE_AS_STRNLEN
 	.p2align 4
-L(last_2x_vec):
-	addl	$(VEC_SIZE * 2), %esi
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
+L(last_4x_vec_or_less_load):
+	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
+	subq	$-(VEC_SIZE * 4), %rdi
+L(last_4x_vec_or_less_cmpeq):
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+L(last_4x_vec_or_less):
 
-	jnz	L(first_vec_x0_check)
-	subl	$VEC_SIZE, %esi
-	jle	L(max)
+	vpmovmskb	%ymm1, %eax
+	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
+	   VEC_SIZE * 4.  */
+	testl	$(VEC_SIZE * 2), %esi
+	jnz	L(last_4x_vec)
 
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	/* length may have been negative or positive by an offset of
+	   VEC_SIZE * 4 depending on where this was called from. This fixes
+	   that.  */
+	andl	$(VEC_SIZE * 4 - 1), %esi
 	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
-	movq	%r8, %rax
-#  ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
-#  endif
-	VZEROUPPER_RETURN
+	jnz	L(last_vec_x1_check)
 
-	.p2align 4
-L(first_vec_x0_check):
+	subl	$VEC_SIZE, %esi
+	jb	L(max)
+
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
 	tzcntl	%eax, %eax
 	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
+	cmpl	%eax, %esi
+	jb	L(max)
+	subq	%rdx, %rdi
+	addl	$(VEC_SIZE + 1), %eax
 	addq	%rdi, %rax
-	subq	%rdx, %rax
 #  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
+# endif
 
 	.p2align 4
-L(first_vec_x1_check):
+L(last_vec_return_x0):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
-	addq	$VEC_SIZE, %rax
+	subq	$(VEC_SIZE * 4 - 1), %rdi
 	addq	%rdi, %rax
-	subq	%rdx, %rax
-#  ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
-#  endif
+# endif
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(first_vec_x2_check):
+L(last_vec_return_x1):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
-	addq	$(VEC_SIZE * 2), %rax
+	subq	$(VEC_SIZE * 3 - 1), %rdi
 	addq	%rdi, %rax
-	subq	%rdx, %rax
-#  ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
-#  endif
+# endif
 	VZEROUPPER_RETURN
 
+# ifdef USE_AS_STRNLEN
 	.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x1_check):
+
 	tzcntl	%eax, %eax
 	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
-	addq	$(VEC_SIZE * 3), %rax
+	cmpl	%eax, %esi
+	jb	L(max)
+	subq	%rdx, %rdi
+	incl	%eax
 	addq	%rdi, %rax
-	subq	%rdx, %rax
 #  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
 
-	.p2align 4
 L(max):
 	movq	%r8, %rax
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(last_4x_vec):
+	/* Test first 2x VEC normally.  */
+	testl	%eax, %eax
+	jnz	L(last_vec_x1)
+
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+	/* Normalize length.  */
+	andl	$(VEC_SIZE * 4 - 1), %esi
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x3)
+
+	subl	$(VEC_SIZE * 3), %esi
+	jb	L(max)
+
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpl	%eax, %esi
+	jb	L(max)
+	subq	%rdx, %rdi
+	addl	$(VEC_SIZE * 3 + 1), %eax
+	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	ret
-# endif
 
 	.p2align 4
-L(first_vec_x0):
+L(last_vec_x1):
+	/* essentially duplicates of first_vec_x1 but use 64 bit
+	   instructions.  */
 	tzcntl	%eax, %eax
+	subq	%rdx, %rdi
+	incl	%eax
 	addq	%rdi, %rax
-	subq	%rdx, %rax
-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
-# endif
+#  endif
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(first_vec_x1):
+L(last_vec_x2):
+	/* essentially duplicates of first_vec_x1 but use 64 bit
+	   instructions.  */
 	tzcntl	%eax, %eax
-	addq	$VEC_SIZE, %rax
+	subq	%rdx, %rdi
+	addl	$(VEC_SIZE + 1), %eax
 	addq	%rdi, %rax
-	subq	%rdx, %rax
-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
-# endif
+#  endif
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(first_vec_x2):
+L(last_vec_x3):
 	tzcntl	%eax, %eax
-	addq	$(VEC_SIZE * 2), %rax
+	subl	$(VEC_SIZE * 2), %esi
+	/* Check the end of data.  */
+	cmpl	%eax, %esi
+	jb	L(max_end)
+	subq	%rdx, %rdi
+	addl	$(VEC_SIZE * 2 + 1), %eax
 	addq	%rdi, %rax
-	subq	%rdx, %rax
-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
-# endif
+#  endif
+	VZEROUPPER_RETURN
+L(max_end):
+	movq	%r8, %rax
 	VZEROUPPER_RETURN
+# endif
 
+	/* Cold case for crossing page with first load.	 */
 	.p2align 4
-L(4x_vec_end):
-	VPCMPEQ	%ymm1, %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-	VPCMPEQ %ymm2, %ymm0, %ymm2
-	vpmovmskb %ymm2, %eax
+L(cross_page_boundary):
+	/* Align data to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+	   so no need to manually mod rdx.  */
+	sarxl	%edx, %eax, %eax
+# ifdef USE_AS_STRNLEN
 	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-	VPCMPEQ %ymm3, %ymm0, %ymm3
-	vpmovmskb %ymm3, %eax
+	jnz	L(cross_page_less_vec)
+	leaq	1(%rdi), %rcx
+	subq	%rdx, %rcx
+	/* Check length.  */
+	cmpq	%rsi, %rcx
+	jb	L(cross_page_continue)
+	movq	%r8, %rax
+# else
 	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-	VPCMPEQ %ymm4, %ymm0, %ymm4
-	vpmovmskb %ymm4, %eax
-L(first_vec_x3):
+	jz	L(cross_page_continue)
 	tzcntl	%eax, %eax
-	addq	$(VEC_SIZE * 3), %rax
-	addq	%rdi, %rax
-	subq	%rdx, %rax
-# ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
+#  ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+#  endif
 # endif
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+# ifdef USE_AS_STRNLEN
+	.p2align 4
+L(cross_page_less_vec):
+	tzcntl	%eax, %eax
+	cmpq	%rax, %rsi
+	cmovb	%esi, %eax
+#  ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+#  endif
 	VZEROUPPER_RETURN
+# endif
 
 END (STRLEN)
 #endif
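
For reference, the entry-path idea behind the sarx use in the rewritten
strlen-avx2.S can be restated in scalar C roughly as follows. This is only a
sketch with assumed constants, not code from the patch:

#include <stdint.h>

#define VEC_SIZE  32      /* one ymm register */
#define PAGE_SIZE 4096

/* A full VEC_SIZE load at S is issued up front unless it could spill into
   the next (possibly unmapped) page.  */
static int
first_load_may_cross_page (const char *s)
{
  return ((uintptr_t) s & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE;
}

/* In the cross-page path the load is taken from the aligned-down address
   instead, and the match bits belonging to bytes before S are shifted out.
   The assembly does this with sarx, whose shift count is masked by the
   hardware, so it can pass the raw pointer; in C the masking is explicit.  */
static uint32_t
drop_leading_bytes (uint32_t match_mask, const char *s)
{
  return match_mask >> ((uintptr_t) s & (VEC_SIZE - 1));
}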