
x86_64: Implement AVX2 version of strlcpy/wcslcpy function

Message ID 20230630204812.2059831-1-skpgkp2@gmail.com
State New
Series x86_64: Implement AVX2 version of strlcpy/wcslcpy function

Commit Message

Sunil Pandey June 30, 2023, 8:48 p.m. UTC
This patch optimizes the strlcpy/wcslcpy string functions for AVX2.
---
 sysdeps/x86_64/multiarch/Makefile          |   4 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |  18 +
 sysdeps/x86_64/multiarch/ifunc-strlcpy.h   |  34 ++
 sysdeps/x86_64/multiarch/strlcpy-avx2.S    | 446 +++++++++++++++++++++
 sysdeps/x86_64/multiarch/strlcpy-generic.c |  25 ++
 sysdeps/x86_64/multiarch/strlcpy.c         |  36 ++
 sysdeps/x86_64/multiarch/wcslcpy-avx2.S    |   4 +
 sysdeps/x86_64/multiarch/wcslcpy-generic.c |  25 ++
 sysdeps/x86_64/multiarch/wcslcpy.c         |  35 ++
 9 files changed, 627 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/ifunc-strlcpy.h
 create mode 100644 sysdeps/x86_64/multiarch/strlcpy-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/strlcpy-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/strlcpy.c
 create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcslcpy.c
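
For reference, the contract both implementations must honor is the BSD
strlcpy one: copy at most n-1 characters, always NUL-terminate the
destination when n is nonzero, and return the full length of the
source.  A minimal C sketch of those semantics (illustration only, not
the glibc generic code):

#include <stddef.h>
#include <string.h>

size_t
strlcpy_ref (char *dst, const char *src, size_t n)
{
  size_t len = strlen (src);	/* Return value: full source length.  */
  if (n != 0)
    {
      /* Copy at most n - 1 characters, then NUL-terminate.  */
      size_t copy = len < n - 1 ? len : n - 1;
      memcpy (dst, src, copy);
      dst[copy] = '\0';
    }
  return len;
}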

Comments

Noah Goldstein June 30, 2023, 9:04 p.m. UTC | #1
On Fri, Jun 30, 2023 at 3:48 PM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> This patch optimizes the strlcpy/wcslcpy string functions for AVX2.
> ---
>  sysdeps/x86_64/multiarch/Makefile          |   4 +
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  18 +
>  sysdeps/x86_64/multiarch/ifunc-strlcpy.h   |  34 ++
>  sysdeps/x86_64/multiarch/strlcpy-avx2.S    | 446 +++++++++++++++++++++
>  sysdeps/x86_64/multiarch/strlcpy-generic.c |  25 ++
>  sysdeps/x86_64/multiarch/strlcpy.c         |  36 ++
>  sysdeps/x86_64/multiarch/wcslcpy-avx2.S    |   4 +
>  sysdeps/x86_64/multiarch/wcslcpy-generic.c |  25 ++
>  sysdeps/x86_64/multiarch/wcslcpy.c         |  35 ++
>  9 files changed, 627 insertions(+)
>  create mode 100644 sysdeps/x86_64/multiarch/ifunc-strlcpy.h
>  create mode 100644 sysdeps/x86_64/multiarch/strlcpy-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/strlcpy-generic.c
>  create mode 100644 sysdeps/x86_64/multiarch/strlcpy.c
>  create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-generic.c
>  create mode 100644 sysdeps/x86_64/multiarch/wcslcpy.c
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index e1e894c963..7e3fc081df 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -82,6 +82,8 @@ sysdep_routines += \
>    strcpy-sse2 \
>    strcpy-sse2-unaligned \
>    strcspn-sse4 \
> +  strlcpy-avx2 \
> +  strlcpy-generic \
>    strlen-avx2 \
>    strlen-avx2-rtm \
>    strlen-evex \
> @@ -153,6 +155,8 @@ sysdep_routines += \
>    wcscpy-evex \
>    wcscpy-generic \
>    wcscpy-ssse3 \
> +  wcslcpy-avx2 \
> +  wcslcpy-generic \
>    wcslen-avx2 \
>    wcslen-avx2-rtm \
>    wcslen-evex \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 5427ff1907..9928dee187 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -751,6 +751,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                      1,
>                                      __strncat_sse2_unaligned))
>
> +  /* Support sysdeps/x86_64/multiarch/strlcpy.c.  */
> +  IFUNC_IMPL (i, name, strlcpy,
> +             X86_IFUNC_IMPL_ADD_V3 (array, i, strlcpy,
> +                                    CPU_FEATURE_USABLE (AVX2),
> +                                    __strlcpy_avx2)
> +             X86_IFUNC_IMPL_ADD_V1 (array, i, strlcpy,
> +                                    1,
> +                                    __strlcpy_generic))
> +
>    /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
>    IFUNC_IMPL (i, name, strncpy,
>               X86_IFUNC_IMPL_ADD_V4 (array, i, strncpy,
> @@ -917,6 +926,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                      1,
>                                      __wcscpy_generic))
>
> +  /* Support sysdeps/x86_64/multiarch/wcslcpy.c.  */
> +  IFUNC_IMPL (i, name, wcslcpy,
> +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcslcpy,
> +                                    CPU_FEATURE_USABLE (AVX2),
> +                                    __wcslcpy_avx2)
> +             X86_IFUNC_IMPL_ADD_V1 (array, i, wcslcpy,
> +                                    1,
> +                                    __wcslcpy_generic))
> +
>    /* Support sysdeps/x86_64/multiarch/wcsncpy.c.  */
>    IFUNC_IMPL (i, name, wcsncpy,
>               X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy,
> diff --git a/sysdeps/x86_64/multiarch/ifunc-strlcpy.h b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h
> new file mode 100644
> index 0000000000..982a30d15b
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h
> @@ -0,0 +1,34 @@
> +/* Common definition for ifunc selections.
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2023 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <init-arch.h>
> +
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
> +
> +static inline void *
> +IFUNC_SELECTOR (void)
> +{
> +  const struct cpu_features *cpu_features = __get_cpu_features ();
> +
> +  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2))
> +    return OPTIMIZE (avx2);
> +
> +  return OPTIMIZE (generic);
> +}
> diff --git a/sysdeps/x86_64/multiarch/strlcpy-avx2.S b/sysdeps/x86_64/multiarch/strlcpy-avx2.S
> new file mode 100644
> index 0000000000..cf54b1e990
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strlcpy-avx2.S
> @@ -0,0 +1,446 @@
> +/* Strlcpy/wcslcpy optimized with AVX2.
> +   Copyright (C) 2023 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (3)
> +
> +# include <sysdep.h>
> +
> +# ifndef VEC_SIZE
> +#  include "x86-avx-vecs.h"
> +# endif
> +
> +# ifndef STRLCPY
> +#  define STRLCPY      __strlcpy_avx2
> +# endif
> +
> +
> +# ifdef USE_AS_WCSLCPY
> +#  define CHAR_SIZE    4
> +#  define MOVU         movl
> +#  define VPCMPEQ      vpcmpeqd
> +#  define VPMINU       vpminud
> +# else
> +#  define CHAR_SIZE    1
> +#  define MOVU         movb
> +#  define VPCMPEQ      vpcmpeqb
> +#  define VPMINU       vpminub
> +# endif
> +
> +# define PMOVMSK       vpmovmskb
> +# define PAGE_SIZE     4096
> +# define VEC_SIZE      32
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> +
> +       .section SECTION(.text),"ax",@progbits
> +/* Aligning the entry point to 64 bytes provides better performance
> +   for strings of one vector length.  */
> +
> +ENTRY_P2ALIGN (STRLCPY, 6)
> +# ifdef __ILP32__
> +       /* Clear the upper 32 bits.  */
> +       movl    %edx, %edx
> +# endif
> +
> +       /* Zero out vector register for end of string comparison. */
> +       vpxor   %VMM(0), %VMM(0), %VMM(0)
> +       /* Save source pointer for return calculation.  */
> +       mov     %rsi, %r8
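> +       /* Check whether the first VEC_SIZE load would cross a page:
> +          shifting the low 12 address bits to the top of EAX lets one
> +          unsigned compare test (RSI & 4095) > PAGE_SIZE - VEC_SIZE.  */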
> +       mov     %esi, %eax
> +       sall    $20, %eax
> +       cmpl    $((PAGE_SIZE - (VEC_SIZE)) << 20), %eax
> +       ja      L(page_cross)
> +
> +L(page_cross_continue):
> +       /* Load first vector.  */
> +       VMOVU   (%rsi), %VMM(1)
> +       VPCMPEQ %VMM(0), %VMM(1), %VMM(2)
> +       PMOVMSK %VMM(2), %eax
> +       test    %eax, %eax
> +       jnz     L(ret_vec_x1)
> +
> +       test    %rdx, %rdx
> +       jz      L(continue_second_vector)
> +
> +       /* Check whether we can copy a full vector.  */
> +       cmp     $CHAR_PER_VEC, %rdx
> +       jbe     L(page_cross_small_vec_copy)
> +       /* Copy first vector.  */
> +       VMOVU   %VMM(1), (%rdi)
> +       sub     $CHAR_PER_VEC, %rdx
> +
> +L(continue_second_vector):
> +       /* Align RSI pointer and adjust RDI based on offset.  */
> +       mov     %rsi, %rax
> +       and     $-VEC_SIZE, %rsi
> +       sub     %rsi, %rax
> +       sub     %rax, %rdi
> +
> +       /* Skip the fix-up if N characters have already been copied,
> +          i.e. RDX is 0.  */
> +       test    %rdx, %rdx
> +       jz      L(skip_copy_alignment_fix)
> +
> +       /* Adjust RDX for copy alignment fix.  */
> +# ifdef USE_AS_WCSLCPY
> +       shr     $2, %rax
> +# endif
> +       add     %rax, %rdx
> +
> +L(skip_copy_alignment_fix):
> +       /* Load second vector.  */
> +       VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(1)
> +       VPCMPEQ %VMM(0), %VMM(1), %VMM(2)
> +       vptest  %VMM(2), %VMM(2)
> +       jnz     L(ret_vec_x2)
> +
> +       /* Skip copy if RDX is 0.  */
> +       test    %rdx, %rdx
> +       jz      L(continue_third_vector)
> +
> +       /* Use below-or-equal (instead of below) here, because the
> +          last character copied must be the NUL terminator.  */
> +       cmp     $CHAR_PER_VEC, %rdx
> +       jbe     L(partial_copy_second_vector)
> +
> +       sub     $CHAR_PER_VEC, %rdx
> +       /* Copy second vector.  */
> +       VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> +
> +L(continue_third_vector):
> +       /* Load third vector.  */
> +       VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(1)
> +       VPCMPEQ %VMM(0), %VMM(1), %VMM(2)
> +       vptest  %VMM(2), %VMM(2)
> +       jnz     L(ret_vec_x3)
> +
> +       /* Skip copy if RDX is 0.  */
> +       test    %rdx, %rdx
> +       jz      L(continue_fourth_vector)
> +
> +       cmp     $CHAR_PER_VEC, %rdx
> +       jbe     L(partial_copy_third_vector)
> +
> +       sub     $CHAR_PER_VEC, %rdx
> +       /* Copy third vector.  */
> +       VMOVU   %VMM(1), (VEC_SIZE * 2)(%rdi)
> +
> +L(continue_fourth_vector):
> +       /* Load fourth vector.  */
> +       VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(1)
> +       VPCMPEQ %VMM(0), %VMM(1), %VMM(2)
> +       vptest  %VMM(2), %VMM(2)
> +       jnz     L(ret_vec_x4)
> +
> +       /* Skip copy if RDX is 0.  */
> +       test    %rdx, %rdx
> +       jz      L(loop_4x_align)
> +
> +       cmp     $CHAR_PER_VEC, %rdx
> +       jbe     L(partial_copy_fourth_vector)
> +
> +       sub     $CHAR_PER_VEC, %rdx
> +       /* Copy fourth vector.  */
> +       VMOVU   %VMM(1), (VEC_SIZE * 3)(%rdi)
> +
> +
> +L(loop_4x_align):
> +       /* Jump straight to the loop if RSI is already 4-vector
> +          aligned.  */
> +       test    $(VEC_SIZE * 4 - 1), %esi
> +       jz      L(loop_4x_read)
> +
> +       mov     %rsi, %rcx
> +
> +       /* Align RSI to 4x vector.  */
> +       and     $(VEC_SIZE * -4), %rsi
> +       sub     %rsi, %rcx
> +
> +       /* Adjust RDI for RSI alignment fix.  */
> +       sub     %rcx, %rdi
> +
> +       /* Jump to loop if RDX is 0.  */
> +       test    %rdx, %rdx
> +       jz      L(loop_4x_read)
> +
> +# ifdef USE_AS_WCSLCPY
> +       shr     $2, %rcx
> +# endif
> +
> +       /* Adjust RDX for RSI alignment fix.  */
> +       add     %rcx, %rdx
> +       jmp     L(loop_4x_read)
> +
> +       .p2align 4,,6
> +L(loop_4x_vec):
> +       /* Skip copy if RDX is 0.  */
> +       test    %rdx, %rdx
> +       jz      L(loop_partial_copy_return)
> +       cmp     $(CHAR_PER_VEC * 4), %rdx
> +       jbe     L(loop_partial_copy)
> +       VMOVU   %VMM(1), (VEC_SIZE * 4)(%rdi)
> +       VMOVU   %VMM(2), (VEC_SIZE * 5)(%rdi)
> +       VMOVU   %VMM(3), (VEC_SIZE * 6)(%rdi)
> +       VMOVU   %VMM(4), (VEC_SIZE * 7)(%rdi)
> +       sub     $(CHAR_PER_VEC * 4), %rdx
> +
> +L(loop_partial_copy_return):
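> +       /* Advance both pointers by 4 vectors.  SUB with a negative
> +          immediate is used because -128 fits in a sign-extended imm8,
> +          while ADD $128 would need an imm32.  */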
> +       sub     $(VEC_SIZE * -4), %rsi
> +       sub     $(VEC_SIZE * -4), %rdi
> +
> +L(loop_4x_read):
> +       VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 5)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 6)(%rsi), %VMM(3)
> +       VMOVA   (VEC_SIZE * 7)(%rsi), %VMM(4)
> +       VPMINU  %VMM(1), %VMM(2), %VMM(5)
> +       VPMINU  %VMM(3), %VMM(4), %VMM(6)
> +       VPMINU  %VMM(5), %VMM(6), %VMM(7)
> +       VPCMPEQ %VMM(0), %VMM(7), %VMM(7)
> +       vptest  %VMM(7), %VMM(7)
> +
> +       jz      L(loop_4x_vec)
> +
> +       /* Check whether the string ends in the first or the second
> +          vector pair.  */
> +       lea     (VEC_SIZE * 4)(%rsi), %rax
> +       sub     %r8, %rax
> +# ifdef USE_AS_WCSLCPY
> +       shr     $2, %rax
> +# endif
> +       xor     %r10, %r10
> +       VPCMPEQ %VMM(0), %VMM(5), %VMM(6)
> +       vptest  %VMM(6), %VMM(6)
> +       jnz     L(endloop)
> +       sub     $(CHAR_PER_VEC * -2), %rax
> +       mov     $(CHAR_PER_VEC * 2), %r10
> +       VMOVA   %VMM(3), %VMM(1)
> +       VMOVA   %VMM(4), %VMM(2)
> +
> +L(endloop):
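> +       /* Build a 64-bit mask from the two 32-byte match masks; BSF
> +          then gives the byte offset of the first NUL in the selected
> +          vector pair.  */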
> +       VPCMPEQ %VMM(0), %VMM(1), %VMM(1)
> +       VPCMPEQ %VMM(0), %VMM(2), %VMM(2)
> +       PMOVMSK %VMM(1), %rcx
> +       PMOVMSK %VMM(2), %r9
> +       shlq    $32, %r9
> +       orq     %r9, %rcx
> +       bsf     %rcx, %rcx
> +       /* Shift RCX right by 2 to convert the byte offset to a
> +          character count; VPMOVMSK has only a byte version.  */
> +# ifdef USE_AS_WCSLCPY
> +       shr     $2, %rcx
> +# endif
> +       /* At this point RAX has length to return.  */
> +       add     %rcx, %rax
> +       test    %rdx, %rdx
> +       jz      L(ret)
> +
> +       /* Add 1 to account for the NUL terminator in the RDX
> +          comparison.  */
> +       lea     1(%r10, %rcx), %rcx
> +       cmp     %rdx, %rcx
> +       cmovb   %rcx, %rdx
> +
> +L(loop_partial_copy):
> +       cmp     $(CHAR_PER_VEC * 2), %rdx
> +       jbe     L(loop_partial_first_half)
> +       /* Reload the first 2 vectors.  */
> +       VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 5)(%rsi), %VMM(2)
> +       VMOVU   %VMM(1), (VEC_SIZE * 4)(%rdi)
> +       VMOVU   %VMM(2), (VEC_SIZE * 5)(%rdi)
> +
> +L(loop_partial_first_half):
> +       /* Go back 2 vectors from the end and use an overlapping copy:
> +          (VEC_SIZE * 4 - VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE)
> +          (VEC_SIZE * 4 - VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE)  */
> +       VMOVU   (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(3)
> +       VMOVU   (VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(4)
> +       VMOVU   %VMM(3), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
> +       VMOVU   %VMM(4), (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE)
> +       MOVU    $0, (VEC_SIZE * 4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +       xor     %rdx, %rdx
> +       vptest  %VMM(7), %VMM(7)
> +       jz      L(loop_partial_copy_return)
> +       ret
> +
> +       .p2align 4
> +L(page_cross):
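> +       /* The first load would have crossed a page.  Load the aligned
> +          vector containing RSI instead, then shift the match mask
> +          right by RSI's misalignment so matches below RSI are
> +          discarded.  */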
> +       mov     %rsi, %rcx
> +       mov     %rsi, %r11
> +       and     $-VEC_SIZE, %r11
> +       and     $(VEC_SIZE - 1), %rcx
> +       VMOVA   (%r11), %VMM(1)
> +       VPCMPEQ %VMM(0), %VMM(1), %VMM(2)
> +       PMOVMSK %VMM(2), %eax
> +       shr     %cl, %eax
> +       jz      L(page_cross_continue)
> +
> +L(ret_vec_x1):
> +       bsf     %eax, %eax
> +# ifdef USE_AS_WCSLCPY
> +       shr     $2, %eax
> +# endif
> +       /* Increment by 1 to account for the NUL terminator.  */
> +       lea     1(%eax), %ecx
> +       cmp     %rdx, %rcx
> +       cmovb   %rcx, %rdx
> +       test    %rdx, %rdx
> +       jz      L(ret)
> +
> +L(page_cross_small_vec_copy):
> +       cmp     $(16 / CHAR_SIZE), %rdx
> +       jbe     L(copy_8_byte_scalar)
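> +       /* More than 16 bytes but at most VEC_SIZE: copy as two
> +          possibly overlapping 16-byte halves, the second ending at
> +          the RDX'th character, then store the terminating NUL.  */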
> +       VMOVU   (%rsi), %VMM_128(1)
> +       VMOVU   -16(%rsi, %rdx, CHAR_SIZE), %VMM_128(3)
> +       VMOVU   %VMM_128(1), (%rdi)
> +       VMOVU   %VMM_128(3), -16(%rdi, %rdx, CHAR_SIZE)
> +       MOVU    $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> +       xor     %rdx, %rdx
> +       vptest  %VMM(2), %VMM(2)
> +       jz      L(continue_second_vector)
> +       ret
> +
> +L(copy_8_byte_scalar):
> +       cmp     $(8 / CHAR_SIZE), %rdx
> +       jbe     L(copy_4_byte_scalar)
> +       movq    (%rsi), %r10
> +       movq    -8(%rsi, %rdx, CHAR_SIZE), %r11
> +       movq    %r10, (%rdi)
> +       movq    %r11, -8(%rdi, %rdx, CHAR_SIZE)
> +       MOVU    $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> +       xor     %edx, %edx
> +       vptest  %VMM(2), %VMM(2)
> +       jz      L(continue_second_vector)
> +       ret
> +
> +L(copy_4_byte_scalar):
> +# ifndef USE_AS_WCSLCPY
> +       cmp     $4, %rdx
> +       jbe     L(copy_2_byte_scalar)
> +# endif
> +       movl    (%rsi), %r10d
> +       movl    -4(%rsi, %rdx, CHAR_SIZE), %r11d
> +       movl    %r10d, (%rdi)
> +       movl    %r11d, -4(%rdi, %rdx, CHAR_SIZE)
> +       MOVU    $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> +       xor     %edx, %edx
> +       vptest  %VMM(2), %VMM(2)
> +       jz      L(continue_second_vector)
> +       ret
> +
> +# ifndef USE_AS_WCSLCPY
> +L(copy_2_byte_scalar):
> +       cmp     $2, %rdx
> +       jbe     L(copy_1_byte_scalar)
> +       movw    (%rsi), %r10w
> +       movw    -(CHAR_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %r11w
> +       movw    %r10w, (%rdi)
> +       movw    %r11w, -(CHAR_SIZE * 3)(%rdi, %rdx, CHAR_SIZE)
> +       MOVU    $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> +       xor     %edx, %edx
> +       vptest  %VMM(2), %VMM(2)
> +       jz      L(continue_second_vector)
> +       ret
> +
> +L(copy_1_byte_scalar):
> +       MOVU    (%rsi), %r10b
> +       MOVU    %r10b, (%rdi)
> +       MOVU    $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> +       xor     %edx, %edx
> +       vptest  %VMM(2), %VMM(2)
> +       jz      L(continue_second_vector)
> +       ret
> +# endif
> +
> +L(ret_vec_x2):
> +       PMOVMSK %VMM(2), %rax
> +       bsf     %rax, %rcx
> +       /* Calculate return value.  */
> +       lea     VEC_SIZE(%rsi, %rcx), %rax
> +       sub     %r8, %rax
> +# ifdef USE_AS_WCSLCPY
> +       shr     $2, %rax
> +       shr     $2, %rcx
> +# endif
> +       inc     %rcx
> +       test    %rdx, %rdx
> +       jz      L(ret)
> +       cmp     %rdx, %rcx
> +       cmovb   %rcx, %rdx
> +
> +L(partial_copy_second_vector):
> +       VMOVU   (%rsi, %rdx, CHAR_SIZE), %VMM(1)
> +       VMOVU   %VMM(1), (%rdi, %rdx, CHAR_SIZE)
> +       MOVU    $0, (VEC_SIZE - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> +       xor     %edx, %edx
> +       vptest  %VMM(2), %VMM(2)
> +       jz      L(continue_third_vector)
> +
> +L(ret):
> +       ret
> +
> +L(ret_vec_x3):
> +       PMOVMSK %VMM(2), %rax
> +       bsf     %rax, %rcx
> +       /* Calculate return value.  */
> +       lea     (VEC_SIZE * 2)(%rsi, %rcx), %rax
> +       sub     %r8, %rax
> +# ifdef USE_AS_WCSLCPY
> +       shr     $2, %rax
> +       shr     $2, %rcx
> +# endif
> +       inc     %rcx
> +       test    %rdx, %rdx
> +       jz      L(ret)
> +       cmp     %rdx, %rcx
> +       cmovb   %rcx, %rdx
> +
> +L(partial_copy_third_vector):
> +       VMOVU   (VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
> +       VMOVU   %VMM(1), (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +       MOVU    $0, ((VEC_SIZE * 2) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> +       xor     %edx, %edx
> +       vptest  %VMM(2), %VMM(2)
> +       jz      L(continue_fourth_vector)
> +       ret
> +
> +L(ret_vec_x4):
> +       PMOVMSK %VMM(2), %rax
> +       bsf     %rax, %rcx
> +       /* Calculate return value.  */
> +       lea     (VEC_SIZE * 3)(%rsi, %rcx), %rax
> +       sub     %r8, %rax
> +# ifdef USE_AS_WCSLCPY
> +       shr     $2, %rax
> +       shr     $2, %rcx
> +# endif
> +       inc     %rcx
> +       test    %rdx, %rdx
> +       jz      L(ret)
> +       cmp     %rdx, %rcx
> +       cmovb   %rcx, %rdx
> +
> +L(partial_copy_fourth_vector):
> +       VMOVU   (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
> +       VMOVU   %VMM(1), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
> +       MOVU    $0, ((VEC_SIZE * 3) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> +       xor     %edx, %edx
> +       vptest  %VMM(2), %VMM(2)
> +       jz      L(continue_fourth_vector)
> +       ret
> +
> +END (STRLCPY)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strlcpy-generic.c b/sysdeps/x86_64/multiarch/strlcpy-generic.c
> new file mode 100644
> index 0000000000..eee3b7b086
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strlcpy-generic.c
> @@ -0,0 +1,25 @@
> +/* strlcpy generic.
> +   Copyright (C) 2023 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <isa-level.h>
> +#if ISA_SHOULD_BUILD (1)
> +# define __strlcpy  __strlcpy_generic
> +# include <string/strlcpy.c>
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strlcpy.c b/sysdeps/x86_64/multiarch/strlcpy.c
> new file mode 100644
> index 0000000000..ded41fbcfb
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strlcpy.c
> @@ -0,0 +1,36 @@
> +/* Multiple versions of strlcpy.
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2023 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +#if IS_IN (libc)
> +# define __strlcpy __redirect_strlcpy
> +# include <string.h>
> +# undef __strlcpy
> +
> +# define SYMBOL_NAME strlcpy
> +# include "ifunc-strlcpy.h"
> +
> +libc_ifunc_redirected (__redirect_strlcpy, __strlcpy, IFUNC_SELECTOR ());
> +weak_alias (__strlcpy, strlcpy)
> +
> +# ifdef SHARED
> +__hidden_ver1 (__strlcpy, __GI___strlcpy, __redirect_strlcpy)
> +  __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlcpy);
> +# endif
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcslcpy-avx2.S b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S
> new file mode 100644
> index 0000000000..dafc20ded0
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S
> @@ -0,0 +1,4 @@
> +#define STRLCPY        __wcslcpy_avx2
> +#define USE_AS_WCSLCPY 1
> +
> +#include "strlcpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcslcpy-generic.c b/sysdeps/x86_64/multiarch/wcslcpy-generic.c
> new file mode 100644
> index 0000000000..ffd3c0e846
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcslcpy-generic.c
> @@ -0,0 +1,25 @@
> +/* wcslcpy generic.
> +   Copyright (C) 2023 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <isa-level.h>
> +#if ISA_SHOULD_BUILD (1)
> +# define __wcslcpy  __wcslcpy_generic
> +# include <wcsmbs/wcslcpy.c>
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcslcpy.c b/sysdeps/x86_64/multiarch/wcslcpy.c
> new file mode 100644
> index 0000000000..371ef9626c
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcslcpy.c
> @@ -0,0 +1,35 @@
> +/* Multiple versions of wcslcpy.
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2023 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +#if IS_IN (libc)
> +# define __wcslcpy __redirect_wcslcpy
> +# include <wchar.h>
> +# undef __wcslcpy
> +
> +# define SYMBOL_NAME wcslcpy
> +# include "ifunc-strlcpy.h"
> +
> +libc_ifunc_redirected (__redirect_wcslcpy, __wcslcpy, IFUNC_SELECTOR ());
> +weak_alias (__wcslcpy, wcslcpy)
> +# ifdef SHARED
> +__hidden_ver1 (__wcslcpy, __GI___wcslcpy, __redirect_wcslcpy)
> +  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcslcpy);
> +# endif
> +#endif
> --
> 2.38.1
>

Think we should at the very least wait for the generic strlcpy code
to land first.
Paul Eggert June 30, 2023, 9:27 p.m. UTC | #2
On 2023-06-30 14:04, Noah Goldstein via Libc-alpha wrote:
> Think we should at the very least wait for the generic strlcpy code
> to land first.

Let's not optimize these functions at all, unless there's a good and
measured reason to do so. In practice I expect they're called with
small sizes, for which optimization is a net minus: it consumes
valuable maintenance time with no real benefit.
Sunil Pandey June 30, 2023, 10:21 p.m. UTC | #3
On Fri, Jun 30, 2023 at 2:27 PM Paul Eggert <eggert@cs.ucla.edu> wrote:

> On 2023-06-30 14:04, Noah Goldstein via Libc-alpha wrote:
> > Think we should at the very least wait for the generic strlcpy codes
> > to land first.
>
> Let's not optimize these functions at all, unless there's good and
> measured reason to do so. In practice I expected they're called with
> small sizes for which optimization is a net minus as it consumes
> valuable maintenance time with no real benefit.
>

Hi Paul,

Attached is strlcpy/wcslcpy microbenchmark data based on Noah's
strlcpy/wcslcpy microbenchmark patch.

https://sourceware.org/pipermail/libc-alpha/2023-April/147557.html
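
For context, the numbers below are per-call timings for fixed
len/align/n combinations.  A hypothetical sketch of the kind of loop
such a microbenchmark times (the real harness is the benchtests patch
linked above; the names here are illustrative):

#include <stdio.h>
#include <string.h>
#include <time.h>

/* Declare locally; strlcpy is not yet in glibc's <string.h>.  */
extern size_t strlcpy (char *, const char *, size_t);

int
main (void)
{
  enum { ITERS = 1 << 22, LEN = 16, N = 16 };
  static char src[LEN + 1], dst[N + 64];
  memset (src, 'x', LEN);	/* LEN-character source string.  */

  struct timespec t0, t1;
  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (int i = 0; i < ITERS; i++)
    strlcpy (dst, src, N);	/* Build with -fno-builtin.  */
  clock_gettime (CLOCK_MONOTONIC, &t1);

  double ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
  printf ("%.2f ns/call\n", ns / ITERS);
  return 0;
}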

Thanks,
Sunil
Function: wcslcpy
Variant: 
                                    __wcslcpy_avx2	__wcslcpy_generic
========================================================================================================================
    len=16, align1=1, align2=1, n=16:        14.99 ( 24.63%)	       19.89	
    len=16, align1=1, align2=1, n=16:        14.58 ( 19.61%)	       18.13	
    len=16, align1=1, align2=2, n=16:        16.99 (  4.02%)	       17.70	
    len=16, align1=2, align2=1, n=16:        15.14 ( 17.08%)	       18.25	
      len=2, align1=7, align2=2, n=4:         8.40 ( 44.41%)	       15.11	
      len=4, align1=2, align2=7, n=2:        10.91 ( 42.41%)	       18.95	
      len=2, align1=7, align2=2, n=4:         8.92 ( 34.99%)	       13.72	
      len=4, align1=2, align2=7, n=2:        10.92 ( 42.05%)	       18.84	
    len=16, align1=2, align2=2, n=16:        15.70 ( 11.97%)	       17.84	
    len=16, align1=2, align2=2, n=16:        14.83 ( 16.82%)	       17.83	
    len=16, align1=2, align2=4, n=16:        17.30 ( -0.46%)	       17.22	
    len=16, align1=4, align2=2, n=16:        15.44 ( 15.20%)	       18.21	
      len=4, align1=6, align2=4, n=8:        12.87 ( 14.74%)	       15.09	
      len=8, align1=4, align2=6, n=4:        13.72 ( 25.95%)	       18.53	
      len=4, align1=6, align2=4, n=8:        12.85 (  9.03%)	       14.13	
      len=8, align1=4, align2=6, n=4:        12.67 ( 31.60%)	       18.52	
    len=16, align1=3, align2=3, n=16:        14.57 ( 15.76%)	       17.30	
    len=16, align1=3, align2=3, n=16:        14.82 ( 14.03%)	       17.23	
    len=16, align1=3, align2=6, n=16:        17.02 (  3.24%)	       17.59	
    len=16, align1=6, align2=3, n=16:        15.04 ( 19.50%)	       18.68	
     len=8, align1=5, align2=6, n=16:        14.96 (  8.00%)	       16.26	
     len=16, align1=6, align2=5, n=8:        13.70 ( 25.56%)	       18.41	
     len=8, align1=5, align2=6, n=16:        14.54 (  7.87%)	       15.78	
     len=16, align1=6, align2=5, n=8:        12.35 ( 24.15%)	       16.28	
    len=16, align1=4, align2=4, n=16:        13.93 ( 14.41%)	       16.28	
    len=16, align1=4, align2=4, n=16:        13.63 ( 16.32%)	       16.29	
    len=16, align1=4, align2=0, n=16:        12.97 ( 21.40%)	       16.51	
    len=16, align1=0, align2=4, n=16:        14.09 ( 15.59%)	       16.70	
    len=16, align1=4, align2=0, n=32:        13.75 ( 31.95%)	       20.20	
    len=32, align1=0, align2=4, n=16:        19.20 ( -0.01%)	       19.20	
    len=16, align1=4, align2=0, n=32:        14.45 ( 31.46%)	       21.08	
    len=32, align1=0, align2=4, n=16:        19.07 ( -1.55%)	       18.78	
    len=16, align1=5, align2=5, n=16:        14.89 ( 15.97%)	       17.72	
    len=16, align1=5, align2=5, n=16:        13.78 ( 15.12%)	       16.23	
    len=16, align1=5, align2=2, n=16:        14.89 ( 13.72%)	       17.26	
    len=16, align1=2, align2=5, n=16:        17.32 ( -0.72%)	       17.20	
    len=32, align1=3, align2=2, n=64:        23.78 ( 20.84%)	       30.05	
    len=64, align1=2, align2=3, n=32:        24.54 (  0.82%)	       24.74	
    len=32, align1=3, align2=2, n=64:        22.48 ( 17.99%)	       27.41	
    len=64, align1=2, align2=3, n=32:        22.63 (  8.72%)	       24.79	
    len=16, align1=6, align2=6, n=16:        14.76 ( 14.10%)	       17.19	
    len=16, align1=6, align2=6, n=16:        14.57 ( 16.81%)	       17.52	
    len=16, align1=6, align2=4, n=16:        14.88 ( 13.70%)	       17.25	
    len=16, align1=4, align2=6, n=16:        16.29 ( -0.14%)	       16.27	
   len=64, align1=2, align2=4, n=128:        28.40 (  9.37%)	       31.34	
   len=128, align1=4, align2=2, n=64:        28.48 ( 10.08%)	       31.67	
   len=64, align1=2, align2=4, n=128:        29.65 ( 11.33%)	       33.44	
   len=128, align1=4, align2=2, n=64:        30.18 (  6.40%)	       32.25	
    len=16, align1=7, align2=7, n=16:        14.86 (  8.40%)	       16.22	
    len=16, align1=7, align2=7, n=16:        13.78 ( 16.30%)	       16.47	
    len=16, align1=7, align2=6, n=16:        14.23 ( 12.27%)	       16.22	
    len=16, align1=6, align2=7, n=16:        16.30 ( -0.53%)	       16.22	
  len=128, align1=1, align2=6, n=256:        35.07 ( 25.88%)	       47.32	
  len=256, align1=6, align2=1, n=128:        45.32 ( 11.90%)	       51.44	
  len=128, align1=1, align2=6, n=256:        35.14 ( 24.65%)	       46.64	
  len=256, align1=6, align2=1, n=128:        43.26 ( 15.54%)	       51.22	
     len=8, align1=0, align2=0, n=16:        13.17 ( 29.35%)	       18.65	
    len=32, align1=0, align2=0, n=16:        18.81 ( -3.57%)	       18.17	
     len=8, align1=7, align2=2, n=16:        13.92 ( -7.07%)	       13.00	
    len=32, align1=7, align2=2, n=16:        17.52 ( 14.77%)	       20.55	
    len=16, align1=0, align2=0, n=32:        13.85 ( 33.77%)	       20.91	
    len=64, align1=0, align2=0, n=32:        23.32 (  7.24%)	       25.14	
    len=16, align1=6, align2=4, n=32:        14.87 ( 17.40%)	       18.00	
    len=64, align1=6, align2=4, n=32:        23.32 ( 14.99%)	       27.43	
    len=32, align1=0, align2=0, n=64:        21.05 ( 16.72%)	       25.28	
   len=128, align1=0, align2=0, n=64:        28.81 ( 11.25%)	       32.46	
    len=32, align1=5, align2=6, n=64:        24.68 ( 10.16%)	       27.47	
   len=128, align1=5, align2=6, n=64:        28.66 (  7.24%)	       30.89	
   len=64, align1=0, align2=0, n=128:        24.98 ( 21.37%)	       31.77	
  len=256, align1=0, align2=0, n=128:        43.90 ( 18.92%)	       54.14	
   len=64, align1=4, align2=0, n=128:        26.13 ( 24.65%)	       34.68	
  len=256, align1=4, align2=0, n=128:        44.27 ( 15.06%)	       52.12	
  len=128, align1=0, align2=0, n=256:        34.29 ( 33.53%)	       51.58	
  len=512, align1=0, align2=0, n=256:        68.94 (  8.14%)	       75.05	
  len=128, align1=3, align2=2, n=256:        36.06 ( 15.45%)	       42.65	
  len=512, align1=3, align2=2, n=256:        65.15 ( 12.33%)	       74.32	
  len=256, align1=0, align2=0, n=512:        46.37 ( 30.42%)	       66.64	
 len=1024, align1=0, align2=0, n=512:       114.89 (  8.32%)	      125.31	
  len=256, align1=2, align2=4, n=512:        56.05 ( 16.50%)	       67.12	
 len=1024, align1=2, align2=4, n=512:       179.87 (-52.13%)	      118.24	
 len=512, align1=0, align2=0, n=1024:        68.16 ( 29.70%)	       96.96	
 len=512, align1=1, align2=6, n=1024:       119.39 (-26.04%)	       94.72	
   len=128, align1=1, align2=0, n=64:        27.46 ( 17.94%)	       33.46	
   len=128, align1=0, align2=0, n=64:        29.69 ( -2.62%)	       28.93	
   len=128, align1=0, align2=0, n=64:        27.25 (  6.15%)	       29.03	
   len=128, align1=0, align2=0, n=64:        27.24 (  6.61%)	       29.17	
   len=64, align1=1, align2=0, n=128:        25.50 ( 21.40%)	       32.44	
   len=64, align1=0, align2=0, n=128:        23.50 ( 27.08%)	       32.22	
   len=64, align1=0, align2=0, n=128:        24.88 ( 16.98%)	       29.97	
   len=64, align1=0, align2=0, n=128:        24.59 ( 22.98%)	       31.92	
   len=128, align1=1, align2=0, n=96:        27.46 ( 29.72%)	       39.07	
   len=128, align1=0, align2=0, n=96:        28.55 ( 20.33%)	       35.83	
   len=128, align1=0, align2=0, n=96:        27.25 ( 24.21%)	       35.95	
   len=128, align1=0, align2=0, n=96:        28.53 ( 19.86%)	       35.59	
   len=96, align1=1, align2=0, n=128:        30.65 ( 18.65%)	       37.68	
   len=96, align1=0, align2=0, n=128:        28.06 ( 19.41%)	       34.82	
   len=96, align1=0, align2=0, n=128:        27.92 ( 20.27%)	       35.02	
   len=96, align1=0, align2=0, n=128:        28.06 ( 19.43%)	       34.83	
  len=128, align1=1, align2=0, n=128:        31.31 ( 28.02%)	       43.51	
  len=128, align1=0, align2=0, n=128:        28.52 ( 29.34%)	       40.37	
  len=128, align1=0, align2=0, n=128:        27.25 ( 32.18%)	       40.17	
  len=128, align1=0, align2=0, n=128:        27.46 ( 31.33%)	       39.99	
  len=128, align1=1, align2=0, n=128:        31.32 ( 28.00%)	       43.50	
  len=128, align1=0, align2=0, n=128:        27.46 ( 31.03%)	       39.82	
  len=128, align1=0, align2=0, n=128:        27.25 ( 32.30%)	       40.25	
  len=128, align1=0, align2=0, n=128:        27.25 ( 31.97%)	       40.05	
  len=128, align1=1, align2=0, n=160:        34.00 ( 20.12%)	       42.56	
  len=128, align1=0, align2=0, n=160:        32.19 ( 30.63%)	       46.40	
  len=128, align1=0, align2=0, n=160:        32.17 ( 28.12%)	       44.76	
  len=128, align1=0, align2=0, n=160:        32.39 ( 27.63%)	       44.76	
  len=160, align1=1, align2=0, n=128:        29.84 ( 35.97%)	       46.61	
  len=160, align1=0, align2=0, n=128:        31.79 ( 25.56%)	       42.71	
  len=160, align1=0, align2=0, n=128:        32.00 ( 24.86%)	       42.59	
  len=160, align1=0, align2=0, n=128:        31.79 ( 25.85%)	       42.86	
  len=128, align1=1, align2=0, n=192:        33.81 ( 21.08%)	       42.84	
  len=128, align1=0, align2=0, n=192:        32.38 ( 29.98%)	       46.24	
  len=128, align1=0, align2=0, n=192:        32.38 ( 27.38%)	       44.58	
  len=128, align1=0, align2=0, n=192:        32.18 ( 28.29%)	       44.87	
  len=192, align1=1, align2=0, n=128:        34.71 ( 27.54%)	       47.90	
  len=192, align1=0, align2=0, n=128:        35.25 ( 22.44%)	       45.44	
  len=192, align1=0, align2=0, n=128:        35.30 ( 21.97%)	       45.24	
  len=192, align1=0, align2=0, n=128:        35.03 ( 22.17%)	       45.01	
  len=256, align1=1, align2=0, n=192:        39.58 ( 30.82%)	       57.21	
  len=256, align1=0, align2=0, n=192:        42.27 ( 24.21%)	       55.77	
  len=256, align1=0, align2=0, n=192:        41.10 ( 26.00%)	       55.54	
  len=256, align1=0, align2=0, n=192:        43.11 ( 21.51%)	       54.92	
  len=192, align1=1, align2=0, n=256:        38.15 ( 29.78%)	       54.33	
  len=192, align1=0, align2=0, n=256:        37.43 ( 32.27%)	       55.26	
  len=192, align1=0, align2=0, n=256:        37.43 ( 32.46%)	       55.42	
  len=192, align1=0, align2=0, n=256:        37.43 ( 32.46%)	       55.42	
  len=256, align1=1, align2=0, n=224:        40.87 ( 31.48%)	       59.65	
  len=256, align1=0, align2=0, n=224:        41.66 ( 26.95%)	       57.02	
  len=256, align1=0, align2=0, n=224:        41.08 ( 28.22%)	       57.24	
  len=256, align1=0, align2=0, n=224:        41.17 ( 27.86%)	       57.07	
  len=224, align1=1, align2=0, n=256:        38.96 ( 32.41%)	       57.65	
  len=224, align1=0, align2=0, n=256:        42.27 ( 28.61%)	       59.21	
  len=224, align1=0, align2=0, n=256:        40.15 ( 32.33%)	       59.34	
  len=224, align1=0, align2=0, n=256:        40.10 ( 32.78%)	       59.65	
  len=256, align1=1, align2=0, n=256:        41.22 ( 33.31%)	       61.80	
  len=256, align1=0, align2=0, n=256:        41.52 ( 29.99%)	       59.30	
  len=256, align1=0, align2=0, n=256:        41.17 ( 29.82%)	       58.66	
  len=256, align1=0, align2=0, n=256:        41.18 ( 30.68%)	       59.40	
  len=256, align1=1, align2=0, n=256:        47.52 ( 29.49%)	       67.39	
  len=256, align1=0, align2=0, n=256:        44.83 ( 30.61%)	       64.60	
  len=256, align1=0, align2=0, n=256:        45.50 ( 29.57%)	       64.60	
  len=256, align1=0, align2=0, n=256:        44.83 ( 29.93%)	       63.97	
  len=256, align1=1, align2=0, n=288:        44.21 ( 33.34%)	       66.32	
  len=256, align1=0, align2=0, n=288:        41.58 ( 33.60%)	       62.62	
  len=256, align1=0, align2=0, n=288:        44.57 ( 30.02%)	       63.69	
  len=256, align1=0, align2=0, n=288:        42.80 ( 35.55%)	       66.41	
  len=288, align1=1, align2=0, n=256:        46.39 ( 29.55%)	       65.85	
  len=288, align1=0, align2=0, n=256:        45.95 ( 28.95%)	       64.68	
  len=288, align1=0, align2=0, n=256:        46.26 ( 29.92%)	       66.02	
  len=288, align1=0, align2=0, n=256:        48.47 ( 20.26%)	       60.79	
  len=256, align1=1, align2=0, n=320:        41.81 ( 31.09%)	       60.67	
  len=256, align1=0, align2=0, n=320:        41.87 ( 34.40%)	       63.82	
  len=256, align1=0, align2=0, n=320:        41.52 ( 34.47%)	       63.35	
  len=256, align1=0, align2=0, n=320:        44.29 ( 33.29%)	       66.39	
  len=320, align1=1, align2=0, n=256:        48.70 ( 29.59%)	       69.16	
  len=320, align1=0, align2=0, n=256:        46.47 ( 24.55%)	       61.60	
  len=320, align1=0, align2=0, n=256:        45.68 ( 27.30%)	       62.83	
  len=320, align1=0, align2=0, n=256:        47.34 ( 23.15%)	       61.60	
  len=512, align1=1, align2=0, n=448:        72.59 ( 23.10%)	       94.39	
  len=512, align1=0, align2=0, n=448:        68.84 ( 38.34%)	      111.65	
  len=512, align1=0, align2=0, n=448:        69.80 ( 36.56%)	      110.03	
  len=512, align1=0, align2=0, n=448:        67.31 ( 40.49%)	      113.10	
  len=448, align1=1, align2=0, n=512:        65.75 ( 28.23%)	       91.61	
  len=448, align1=0, align2=0, n=512:        61.41 ( 30.51%)	       88.36	
  len=448, align1=0, align2=0, n=512:        65.19 ( 29.15%)	       92.02	
  len=448, align1=0, align2=0, n=512:        61.07 ( 31.08%)	       88.61	
  len=512, align1=1, align2=0, n=480:        75.89 ( 16.65%)	       91.05	
  len=512, align1=0, align2=0, n=480:        66.17 ( 26.56%)	       90.10	
  len=512, align1=0, align2=0, n=480:        65.74 ( 26.92%)	       89.96	
  len=512, align1=0, align2=0, n=480:        66.30 ( 26.50%)	       90.21	
  len=480, align1=1, align2=0, n=512:        65.24 ( 28.33%)	       91.03	
  len=480, align1=0, align2=0, n=512:        64.50 ( 30.43%)	       92.70	
  len=480, align1=0, align2=0, n=512:        64.49 ( 29.90%)	       91.99	
  len=480, align1=0, align2=0, n=512:        64.50 ( 30.11%)	       92.29	
  len=512, align1=1, align2=0, n=512:        68.43 ( 28.04%)	       95.09	
  len=512, align1=0, align2=0, n=512:        67.02 ( 27.18%)	       92.05	
  len=512, align1=0, align2=0, n=512:        67.02 ( 27.01%)	       91.82	
  len=512, align1=0, align2=0, n=512:        67.02 ( 27.05%)	       91.87	
  len=512, align1=1, align2=0, n=512:        67.68 ( 28.93%)	       95.23	
  len=512, align1=0, align2=0, n=512:        67.03 ( 27.48%)	       92.42	
  len=512, align1=0, align2=0, n=512:        67.02 ( 27.15%)	       92.00	
  len=512, align1=0, align2=0, n=512:        67.02 ( 27.33%)	       92.23	
  len=512, align1=1, align2=0, n=544:        70.63 ( 26.35%)	       95.89	
  len=512, align1=0, align2=0, n=544:        67.72 ( 29.97%)	       96.70	
  len=512, align1=0, align2=0, n=544:        67.71 ( 30.17%)	       96.95	
  len=512, align1=0, align2=0, n=544:        67.71 ( 29.99%)	       96.72	
  len=544, align1=1, align2=0, n=512:        83.22 ( 13.39%)	       96.08	
  len=544, align1=0, align2=0, n=512:        68.97 ( 27.78%)	       95.50	
  len=544, align1=0, align2=0, n=512:        71.83 ( 24.53%)	       95.18	
  len=544, align1=0, align2=0, n=512:        68.99 ( 27.28%)	       94.87	
  len=512, align1=1, align2=0, n=576:        72.60 ( 28.17%)	      101.08	
  len=512, align1=0, align2=0, n=576:        72.27 ( 25.52%)	       97.03	
  len=512, align1=0, align2=0, n=576:        67.75 ( 30.53%)	       97.52	
  len=512, align1=0, align2=0, n=576:        72.53 ( 29.10%)	      102.30	
  len=576, align1=1, align2=0, n=512:        82.05 ( 16.23%)	       97.94	
  len=576, align1=0, align2=0, n=512:        71.35 ( 26.64%)	       97.26	
  len=576, align1=0, align2=0, n=512:        74.36 ( 23.52%)	       97.23	
  len=576, align1=0, align2=0, n=512:        71.58 ( 26.50%)	       97.38	
 len=1024, align1=1, align2=0, n=960:       147.26 ( 11.02%)	      165.50	
 len=1024, align1=0, align2=0, n=960:       134.00 ( 13.30%)	      154.55	
 len=1024, align1=0, align2=0, n=960:       134.31 ( 13.26%)	      154.84	
 len=1024, align1=0, align2=0, n=960:       134.53 ( 12.97%)	      154.58	
 len=960, align1=1, align2=0, n=1024:       129.09 ( 20.84%)	      163.08	
 len=960, align1=0, align2=0, n=1024:       113.32 ( 26.35%)	      153.86	
 len=960, align1=0, align2=0, n=1024:       113.08 ( 26.77%)	      154.42	
 len=960, align1=0, align2=0, n=1024:       113.10 ( 26.50%)	      153.88	
 len=1024, align1=1, align2=0, n=992:       138.81 ( 18.75%)	      170.85	
 len=1024, align1=0, align2=0, n=992:       134.08 ( 14.74%)	      157.25	
 len=1024, align1=0, align2=0, n=992:       133.96 ( 14.83%)	      157.28	
 len=1024, align1=0, align2=0, n=992:       133.76 ( 15.03%)	      157.42	
 len=992, align1=1, align2=0, n=1024:       136.17 ( 18.21%)	      166.50	
 len=992, align1=0, align2=0, n=1024:       116.81 ( 29.71%)	      166.18	
 len=992, align1=0, align2=0, n=1024:       116.46 ( 26.72%)	      158.92	
 len=992, align1=0, align2=0, n=1024:       116.63 ( 26.64%)	      158.99	
len=1024, align1=1, align2=0, n=1024:       150.63 ( 14.32%)	      175.81	
len=1024, align1=0, align2=0, n=1024:       119.07 ( 26.07%)	      161.07	
len=1024, align1=0, align2=0, n=1024:       119.10 ( 26.06%)	      161.08	
len=1024, align1=0, align2=0, n=1024:       118.91 ( 26.16%)	      161.04	
len=1024, align1=1, align2=0, n=1024:       158.94 ( 13.17%)	      183.06	
len=1024, align1=0, align2=0, n=1024:       120.68 ( 27.45%)	      166.35	
len=1024, align1=0, align2=0, n=1024:       119.16 ( 26.03%)	      161.09	
len=1024, align1=0, align2=0, n=1024:       119.16 ( 26.02%)	      161.07	
len=1024, align1=1, align2=0, n=1056:       162.90 ( 15.29%)	      192.30	
len=1024, align1=0, align2=0, n=1056:       140.90 ( 26.76%)	      192.38	
len=1024, align1=0, align2=0, n=1056:       140.05 ( 30.28%)	      200.89	
len=1024, align1=0, align2=0, n=1056:       146.22 ( 25.04%)	      195.08	
len=1056, align1=1, align2=0, n=1024:       166.62 (  8.97%)	      183.03	
len=1056, align1=0, align2=0, n=1024:       121.48 ( 25.46%)	      162.98	
len=1056, align1=0, align2=0, n=1024:       123.93 ( 24.01%)	      163.09	
len=1056, align1=0, align2=0, n=1024:       127.86 ( 25.98%)	      172.73	
len=1024, align1=1, align2=0, n=1088:       167.49 ( 12.93%)	      192.36	
len=1024, align1=0, align2=0, n=1088:       147.48 ( 23.34%)	      192.38	
len=1024, align1=0, align2=0, n=1088:       140.01 ( 27.22%)	      192.39	
len=1024, align1=0, align2=0, n=1088:       140.09 ( 27.23%)	      192.51	
len=1088, align1=1, align2=0, n=1024:       159.00 ( 13.46%)	      183.73	
len=1088, align1=0, align2=0, n=1024:       143.31 ( 14.25%)	      167.13	
len=1088, align1=0, align2=0, n=1024:       140.46 ( 14.32%)	      163.93	
len=1088, align1=0, align2=0, n=1024:       139.85 ( 14.69%)	      163.92
Function: strlcpy
Variant: 
                                    __strlcpy_avx2	__strlcpy_generic
========================================================================================================================
    len=16, align1=1, align2=1, n=16:        11.11 ( 32.32%)	       16.41	
    len=16, align1=1, align2=1, n=16:        10.73 ( 32.83%)	       15.98	
    len=16, align1=1, align2=2, n=16:        10.53 ( 33.23%)	       15.77	
    len=16, align1=2, align2=1, n=16:        10.89 ( 32.50%)	       16.13	
      len=2, align1=7, align2=2, n=4:         8.06 ( 35.05%)	       12.41	
      len=4, align1=2, align2=7, n=2:         8.66 ( 37.31%)	       13.82	
      len=2, align1=7, align2=2, n=4:         7.78 ( 33.85%)	       11.77	
      len=4, align1=2, align2=7, n=2:         8.70 ( 37.88%)	       14.01	
    len=16, align1=2, align2=2, n=16:        10.43 ( 31.86%)	       15.31	
    len=16, align1=2, align2=2, n=16:        10.87 ( 30.40%)	       15.62	
    len=16, align1=2, align2=4, n=16:        10.47 ( 30.24%)	       15.01	
    len=16, align1=4, align2=2, n=16:        10.56 ( 31.99%)	       15.53	
      len=4, align1=6, align2=4, n=8:        11.33 ( 18.99%)	       13.99	
      len=8, align1=4, align2=6, n=4:        10.44 ( 27.20%)	       14.34	
      len=4, align1=6, align2=4, n=8:        11.43 ( 13.14%)	       13.15	
      len=8, align1=4, align2=6, n=4:        10.83 ( 28.59%)	       15.16	
    len=16, align1=3, align2=3, n=16:        10.39 ( 33.18%)	       15.54	
    len=16, align1=3, align2=3, n=16:        10.13 ( 38.74%)	       16.53	
    len=16, align1=3, align2=6, n=16:        10.29 ( 37.51%)	       16.46	
    len=16, align1=6, align2=3, n=16:        10.56 ( 31.97%)	       15.53	
     len=8, align1=5, align2=6, n=16:        10.48 ( 22.21%)	       13.47	
     len=16, align1=6, align2=5, n=8:        10.95 ( 27.84%)	       15.17	
     len=8, align1=5, align2=6, n=16:        10.55 ( 23.09%)	       13.71	
     len=16, align1=6, align2=5, n=8:        10.98 ( 27.79%)	       15.20	
    len=16, align1=4, align2=4, n=16:        10.39 ( 32.51%)	       15.40	
    len=16, align1=4, align2=4, n=16:        10.38 ( 33.76%)	       15.68	
    len=16, align1=4, align2=0, n=16:        10.57 ( 28.87%)	       14.86	
    len=16, align1=0, align2=4, n=16:        10.28 ( 34.27%)	       15.64	
    len=16, align1=4, align2=0, n=32:        10.59 ( 23.24%)	       13.79	
    len=32, align1=0, align2=4, n=16:        11.66 ( 30.50%)	       16.77	
    len=16, align1=4, align2=0, n=32:        10.67 ( 23.98%)	       14.04	
    len=32, align1=0, align2=4, n=16:        11.06 ( 33.61%)	       16.66	
    len=16, align1=5, align2=5, n=16:        10.43 ( 33.52%)	       15.68	
    len=16, align1=5, align2=5, n=16:        10.49 ( 33.47%)	       15.77	
    len=16, align1=5, align2=2, n=16:        10.54 ( 29.46%)	       14.94	
    len=16, align1=2, align2=5, n=16:        10.20 ( 31.63%)	       14.92	
    len=32, align1=3, align2=2, n=64:        13.88 (  0.59%)	       13.97	
    len=64, align1=2, align2=3, n=32:        11.72 ( 22.36%)	       15.09	
    len=32, align1=3, align2=2, n=64:        13.49 (  2.26%)	       13.81	
    len=64, align1=2, align2=3, n=32:        11.54 ( 26.22%)	       15.64	
    len=16, align1=6, align2=6, n=16:        10.39 ( 27.70%)	       14.37	
    len=16, align1=6, align2=6, n=16:         9.94 ( 32.04%)	       14.63	
    len=16, align1=6, align2=4, n=16:         9.91 ( 33.92%)	       14.99	
    len=16, align1=4, align2=6, n=16:        10.19 ( 32.66%)	       15.14	
   len=64, align1=2, align2=4, n=128:        14.66 (  4.10%)	       15.29	
   len=128, align1=4, align2=2, n=64:        18.22 (-17.01%)	       15.57	
   len=64, align1=2, align2=4, n=128:        14.64 (  3.89%)	       15.24	
   len=128, align1=4, align2=2, n=64:        18.22 (-14.83%)	       15.86	
    len=16, align1=7, align2=7, n=16:         9.86 ( 30.07%)	       14.11	
    len=16, align1=7, align2=7, n=16:         9.86 ( 30.09%)	       14.11	
    len=16, align1=7, align2=6, n=16:         9.93 ( 32.92%)	       14.81	
    len=16, align1=6, align2=7, n=16:         9.83 ( 30.41%)	       14.13	
  len=128, align1=1, align2=6, n=256:        22.24 (  9.63%)	       24.61	
  len=256, align1=6, align2=1, n=128:        20.91 ( 12.22%)	       23.82	
  len=128, align1=1, align2=6, n=256:        22.21 (  9.86%)	       24.64	
  len=256, align1=6, align2=1, n=128:        20.81 ( 12.85%)	       23.88	
     len=8, align1=0, align2=0, n=16:        10.33 ( 20.37%)	       12.97	
    len=32, align1=0, align2=0, n=16:        10.75 ( 32.13%)	       15.84	
     len=8, align1=7, align2=2, n=16:        10.38 ( 20.33%)	       13.02	
    len=32, align1=7, align2=2, n=16:        11.03 ( 30.36%)	       15.84	
    len=16, align1=0, align2=0, n=32:         9.98 ( 26.96%)	       13.67	
    len=64, align1=0, align2=0, n=32:        10.94 ( 26.69%)	       14.92	
    len=16, align1=6, align2=4, n=32:        10.07 ( 22.77%)	       13.04	
    len=64, align1=6, align2=4, n=32:        11.68 ( 22.22%)	       15.01	
    len=32, align1=0, align2=0, n=64:        11.15 ( 11.26%)	       12.57	
   len=128, align1=0, align2=0, n=64:        17.59 ( -6.54%)	       16.51	
    len=32, align1=5, align2=6, n=64:        12.56 ( 12.27%)	       14.32	
   len=128, align1=5, align2=6, n=64:        19.12 (-20.33%)	       15.89	
   len=64, align1=0, align2=0, n=128:        12.70 ( 17.81%)	       15.45	
  len=256, align1=0, align2=0, n=128:        22.12 (  7.72%)	       23.97	
   len=64, align1=4, align2=0, n=128:        12.84 ( 18.75%)	       15.81	
  len=256, align1=4, align2=0, n=128:        21.48 ( 12.33%)	       24.50	
  len=128, align1=0, align2=0, n=256:        19.17 (  3.24%)	       19.81	
  len=512, align1=0, align2=0, n=256:        26.55 (  3.43%)	       27.49	
  len=128, align1=3, align2=2, n=256:        20.07 ( 17.46%)	       24.32	
  len=512, align1=3, align2=2, n=256:        26.65 ( 17.61%)	       32.35	
  len=256, align1=0, align2=0, n=512:        22.48 ( 14.46%)	       26.28	
 len=1024, align1=0, align2=0, n=512:        39.85 ( 12.47%)	       45.53	
  len=256, align1=2, align2=4, n=512:        27.00 (  8.13%)	       29.39	
 len=1024, align1=2, align2=4, n=512:        43.97 ( 15.73%)	       52.18	
 len=512, align1=0, align2=0, n=1024:        32.09 ( 29.08%)	       45.25	
len=2048, align1=0, align2=0, n=1024:        65.11 (  7.02%)	       70.02	
 len=512, align1=1, align2=6, n=1024:        35.13 ( 26.54%)	       47.83	
len=2048, align1=1, align2=6, n=1024:        80.38 (-15.59%)	       69.53	
   len=128, align1=1, align2=0, n=64:        18.89 (-12.93%)	       16.72	
   len=128, align1=0, align2=0, n=64:        16.93 ( -9.06%)	       15.52	
   len=128, align1=0, align2=0, n=64:        16.92 ( -8.70%)	       15.57	
   len=128, align1=0, align2=0, n=64:        17.58 (-12.44%)	       15.63	
   len=64, align1=1, align2=0, n=128:        12.84 ( 18.40%)	       15.74	
   len=64, align1=0, align2=0, n=128:        12.64 ( 19.60%)	       15.72	
   len=64, align1=0, align2=0, n=128:        12.78 ( 17.35%)	       15.47	
   len=64, align1=0, align2=0, n=128:        12.65 ( 18.44%)	       15.51	
   len=128, align1=1, align2=0, n=96:        20.15 ( -9.88%)	       18.34	
   len=128, align1=0, align2=0, n=96:        18.21 ( -3.68%)	       17.57	
   len=128, align1=0, align2=0, n=96:        18.46 ( -5.09%)	       17.57	
   len=128, align1=0, align2=0, n=96:        18.86 (  1.57%)	       19.16	
   len=96, align1=1, align2=0, n=128:        13.99 ( 15.86%)	       16.62	
   len=96, align1=0, align2=0, n=128:        14.60 ( 11.99%)	       16.59	
   len=96, align1=0, align2=0, n=128:        14.38 ( 20.13%)	       18.00	
   len=96, align1=0, align2=0, n=128:        14.34 ( 11.75%)	       16.25	
  len=128, align1=1, align2=0, n=128:        19.53 ( -0.01%)	       19.53	
  len=128, align1=0, align2=0, n=128:        20.17 ( -3.30%)	       19.53	
  len=128, align1=0, align2=0, n=128:        20.18 (-14.72%)	       17.59	
  len=128, align1=0, align2=0, n=128:        20.82 ( -0.68%)	       20.68	
  len=128, align1=1, align2=0, n=128:        20.01 ( -5.92%)	       18.89	
  len=128, align1=0, align2=0, n=128:        21.37 ( -8.22%)	       19.74	
  len=128, align1=0, align2=0, n=128:        20.17 (-14.75%)	       17.57	
  len=128, align1=0, align2=0, n=128:        20.80 (-18.42%)	       17.57	
  len=128, align1=1, align2=0, n=160:        19.65 ( 15.99%)	       23.39	
  len=128, align1=0, align2=0, n=160:        19.14 (  3.36%)	       19.80	
  len=128, align1=0, align2=0, n=160:        19.18 (  3.40%)	       19.85	
  len=128, align1=0, align2=0, n=160:        19.15 (  3.36%)	       19.81	
  len=160, align1=1, align2=0, n=128:        18.88 ( 12.02%)	       21.46	
  len=160, align1=0, align2=0, n=128:        20.16 (  9.62%)	       22.31	
  len=160, align1=0, align2=0, n=128:        20.80 (  0.05%)	       20.81	
  len=160, align1=0, align2=0, n=128:        20.16 (  8.81%)	       22.11	
  len=128, align1=1, align2=0, n=192:        19.65 ( 16.12%)	       23.42	
  len=128, align1=0, align2=0, n=192:        19.14 (  3.37%)	       19.80	
  len=128, align1=0, align2=0, n=192:        19.18 (  3.16%)	       19.80	
  len=128, align1=0, align2=0, n=192:        19.19 (  3.06%)	       19.80	
  len=192, align1=1, align2=0, n=128:        18.86 ( 19.40%)	       23.40	
  len=192, align1=0, align2=0, n=128:        20.81 (  6.46%)	       22.24	
  len=192, align1=0, align2=0, n=128:        20.81 (  8.70%)	       22.79	
  len=192, align1=0, align2=0, n=128:        21.46 (  4.55%)	       22.48	
  len=256, align1=1, align2=0, n=192:        20.83 ( 13.49%)	       24.08	
  len=256, align1=0, align2=0, n=192:        21.35 ( 15.83%)	       25.37	
  len=256, align1=0, align2=0, n=192:        20.83 ( 15.85%)	       24.75	
  len=256, align1=0, align2=0, n=192:        21.87 ( 13.82%)	       25.37	
  len=192, align1=1, align2=0, n=256:        22.27 (  5.03%)	       23.45	
  len=192, align1=0, align2=0, n=256:        19.58 ( 14.91%)	       23.02	
  len=192, align1=0, align2=0, n=256:        19.58 ( 14.91%)	       23.01	
  len=192, align1=0, align2=0, n=256:        19.57 ( 16.70%)	       23.50	
  len=256, align1=1, align2=0, n=224:        20.84 ( 19.02%)	       25.74	
  len=256, align1=0, align2=0, n=224:        20.91 ( 15.73%)	       24.81	
  len=256, align1=0, align2=0, n=224:        21.47 ( 10.79%)	       24.07	
  len=256, align1=0, align2=0, n=224:        21.47 ( 10.79%)	       24.06	
  len=224, align1=1, align2=0, n=256:        20.43 ( 16.38%)	       24.43	
  len=224, align1=0, align2=0, n=256:        19.23 ( 16.62%)	       23.06	
  len=224, align1=0, align2=0, n=256:        19.21 ( 16.84%)	       23.10	
  len=224, align1=0, align2=0, n=256:        19.24 ( 16.77%)	       23.12	
  len=256, align1=1, align2=0, n=256:        24.05 (  5.44%)	       25.44	
  len=256, align1=0, align2=0, n=256:        21.63 ( 14.98%)	       25.45	
  len=256, align1=0, align2=0, n=256:        20.81 ( 13.64%)	       24.10	
  len=256, align1=0, align2=0, n=256:        20.81 ( 13.67%)	       24.10	
  len=256, align1=1, align2=0, n=256:        24.10 ( -0.20%)	       24.05	
  len=256, align1=0, align2=0, n=256:        21.46 ( 16.56%)	       25.71	
  len=256, align1=0, align2=0, n=256:        21.46 ( 10.79%)	       24.05	
  len=256, align1=0, align2=0, n=256:        20.81 ( 14.64%)	       24.38	
  len=256, align1=1, align2=0, n=288:        24.21 ( 15.45%)	       28.63	
  len=256, align1=0, align2=0, n=288:        23.11 ( 12.68%)	       26.46	
  len=256, align1=0, align2=0, n=288:        22.55 ( 14.25%)	       26.29	
  len=256, align1=0, align2=0, n=288:        22.49 ( 14.49%)	       26.30	
  len=288, align1=1, align2=0, n=256:        24.06 (  5.36%)	       25.42	
  len=288, align1=0, align2=0, n=256:        22.82 (  7.35%)	       24.63	
  len=288, align1=0, align2=0, n=256:        22.80 ( 10.98%)	       25.62	
  len=288, align1=0, align2=0, n=256:        21.46 ( 17.56%)	       26.03	
  len=256, align1=1, align2=0, n=320:        24.17 ( 15.82%)	       28.71	
  len=256, align1=0, align2=0, n=320:        22.44 ( 14.79%)	       26.34	
  len=256, align1=0, align2=0, n=320:        22.56 ( 14.14%)	       26.27	
  len=256, align1=0, align2=0, n=320:        22.50 ( 14.35%)	       26.27	
  len=320, align1=1, align2=0, n=256:        24.10 (  8.33%)	       26.29	
  len=320, align1=0, align2=0, n=256:        22.11 ( 16.28%)	       26.41	
  len=320, align1=0, align2=0, n=256:        21.57 ( 16.27%)	       25.76	
  len=320, align1=0, align2=0, n=256:        21.46 ( 15.42%)	       25.37	
  len=512, align1=1, align2=0, n=448:        27.62 ( 31.43%)	       40.28	
  len=512, align1=0, align2=0, n=448:        27.63 ( 32.11%)	       40.70	
  len=512, align1=0, align2=0, n=448:        26.53 ( 35.05%)	       40.85	
  len=512, align1=0, align2=0, n=448:        26.51 ( 34.99%)	       40.78	
  len=448, align1=1, align2=0, n=512:        31.01 ( 28.08%)	       43.11	
  len=448, align1=0, align2=0, n=512:        29.35 ( 36.94%)	       46.54	
  len=448, align1=0, align2=0, n=512:        29.38 ( 37.01%)	       46.63	
  len=448, align1=0, align2=0, n=512:        29.38 ( 37.01%)	       46.64	
  len=512, align1=1, align2=0, n=480:        28.24 ( 35.42%)	       43.73	
  len=512, align1=0, align2=0, n=480:        28.76 ( 28.65%)	       40.31	
  len=512, align1=0, align2=0, n=480:        28.47 ( 30.82%)	       41.16	
  len=512, align1=0, align2=0, n=480:        26.70 ( 31.68%)	       39.08	
  len=480, align1=1, align2=0, n=512:        30.73 ( 26.75%)	       41.95	
  len=480, align1=0, align2=0, n=512:        28.79 ( 34.92%)	       44.23	
  len=480, align1=0, align2=0, n=512:        28.76 ( 35.89%)	       44.87	
  len=480, align1=0, align2=0, n=512:        29.39 ( 35.67%)	       45.68	
  len=512, align1=1, align2=0, n=512:        30.58 ( 25.28%)	       40.92	
  len=512, align1=0, align2=0, n=512:        26.67 ( 31.41%)	       38.87	
  len=512, align1=0, align2=0, n=512:        26.67 ( 34.15%)	       40.50	
  len=512, align1=0, align2=0, n=512:        27.17 ( 30.43%)	       39.06	
  len=512, align1=1, align2=0, n=512:        30.63 ( 25.12%)	       40.91	
  len=512, align1=0, align2=0, n=512:        26.74 ( 31.56%)	       39.06	
  len=512, align1=0, align2=0, n=512:        26.72 ( 31.55%)	       39.04	
  len=512, align1=0, align2=0, n=512:        26.74 ( 31.11%)	       38.81	
  len=512, align1=1, align2=0, n=544:        33.43 ( 21.70%)	       42.69	
  len=512, align1=0, align2=0, n=544:        31.96 ( 27.77%)	       44.25	
  len=512, align1=0, align2=0, n=544:        31.36 ( 27.40%)	       43.20	
  len=512, align1=0, align2=0, n=544:        31.41 ( 27.14%)	       43.11	
  len=544, align1=1, align2=0, n=512:        30.55 ( 25.76%)	       41.15	
  len=544, align1=0, align2=0, n=512:        27.26 ( 31.01%)	       39.51	
  len=544, align1=0, align2=0, n=512:        27.30 ( 30.74%)	       39.41	
  len=544, align1=0, align2=0, n=512:        26.65 ( 32.38%)	       39.40	
  len=512, align1=1, align2=0, n=576:        33.39 ( 21.56%)	       42.58	
  len=512, align1=0, align2=0, n=576:        31.41 ( 28.37%)	       43.85	
  len=512, align1=0, align2=0, n=576:        31.41 ( 27.57%)	       43.37	
  len=512, align1=0, align2=0, n=576:        31.42 ( 27.41%)	       43.28	
  len=576, align1=1, align2=0, n=512:        30.61 ( 27.75%)	       42.36	
  len=576, align1=0, align2=0, n=512:        27.66 ( 31.54%)	       40.40	
  len=576, align1=0, align2=0, n=512:        28.04 ( 30.84%)	       40.55	
  len=576, align1=0, align2=0, n=512:        27.94 ( 31.15%)	       40.58	
 len=1024, align1=1, align2=0, n=960:        39.78 ( 28.72%)	       55.80	
 len=1024, align1=0, align2=0, n=960:        40.87 ( 26.15%)	       55.34	
 len=1024, align1=0, align2=0, n=960:        40.06 ( 26.81%)	       54.73	
 len=1024, align1=0, align2=0, n=960:        40.25 ( 26.40%)	       54.69	
 len=960, align1=1, align2=0, n=1024:        38.74 ( 31.46%)	       56.52	
 len=960, align1=0, align2=0, n=1024:        38.37 ( 36.30%)	       60.24	
 len=960, align1=0, align2=0, n=1024:        38.37 ( 36.36%)	       60.30	
 len=960, align1=0, align2=0, n=1024:        39.88 ( 35.25%)	       61.60	
 len=1024, align1=1, align2=0, n=992:        39.71 ( 28.13%)	       55.26	
 len=1024, align1=0, align2=0, n=992:        39.85 ( 29.39%)	       56.44	
 len=1024, align1=0, align2=0, n=992:        40.34 ( 25.81%)	       54.37	
 len=1024, align1=0, align2=0, n=992:        40.31 ( 25.91%)	       54.40	
 len=992, align1=1, align2=0, n=1024:        37.72 ( 32.49%)	       55.88	
 len=992, align1=0, align2=0, n=1024:        38.37 ( 36.02%)	       59.97	
 len=992, align1=0, align2=0, n=1024:        38.42 ( 35.53%)	       59.60	
 len=992, align1=0, align2=0, n=1024:        38.40 ( 35.67%)	       59.69	
len=1024, align1=1, align2=0, n=1024:        40.88 ( 26.02%)	       55.26	
len=1024, align1=0, align2=0, n=1024:        40.36 ( 25.56%)	       54.22	
len=1024, align1=0, align2=0, n=1024:        40.31 ( 25.60%)	       54.19	
len=1024, align1=0, align2=0, n=1024:        40.35 ( 29.70%)	       57.40	
len=1024, align1=1, align2=0, n=1024:        41.03 ( 25.71%)	       55.22	
len=1024, align1=0, align2=0, n=1024:        40.37 ( 25.42%)	       54.13	
len=1024, align1=0, align2=0, n=1024:        40.31 ( 25.64%)	       54.21	
len=1024, align1=0, align2=0, n=1024:        40.32 ( 25.60%)	       54.19	
len=1024, align1=1, align2=0, n=1056:        41.06 ( 25.94%)	       55.45	
len=1024, align1=0, align2=0, n=1056:        41.06 ( 29.54%)	       58.27	
len=1024, align1=0, align2=0, n=1056:        41.05 ( 28.94%)	       57.77	
len=1024, align1=0, align2=0, n=1056:        41.02 ( 28.82%)	       57.62	
len=1056, align1=1, align2=0, n=1024:        41.00 ( 26.23%)	       55.59	
len=1056, align1=0, align2=0, n=1024:        39.67 ( 27.07%)	       54.39	
len=1056, align1=0, align2=0, n=1024:        40.34 ( 29.19%)	       56.97	
len=1056, align1=0, align2=0, n=1024:        40.37 ( 27.52%)	       55.71	
len=1024, align1=1, align2=0, n=1088:        41.02 ( 26.33%)	       55.68	
len=1024, align1=0, align2=0, n=1088:        41.06 ( 30.82%)	       59.35	
len=1024, align1=0, align2=0, n=1088:        41.05 ( 29.58%)	       58.29	
len=1024, align1=0, align2=0, n=1088:        41.14 ( 28.69%)	       57.69	
len=1088, align1=1, align2=0, n=1024:        41.31 ( 27.50%)	       56.98	
len=1088, align1=0, align2=0, n=1024:        40.32 ( 29.25%)	       56.99	
len=1088, align1=0, align2=0, n=1024:        40.74 ( 27.82%)	       56.44	
len=1088, align1=0, align2=0, n=1024:        40.70 ( 26.62%)	       55.47
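
(Reading the rows above: the first timing column appears to be the AVX2
variant, the second the generic one, and the parenthesized figure the
relative improvement of the first over the second, i.e.
(t_generic - t_avx2) / t_generic; e.g. for len=512, n=512:
(38.87 - 26.67) / 38.87 ≈ 31.4%, matching the listed 31.41%. Negative
values mark rows where the AVX2 variant is slower.)
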
Noah Goldstein June 30, 2023, 11:22 p.m. UTC | #4
On Fri, Jun 30, 2023 at 5:21 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
>
>
> On Fri, Jun 30, 2023 at 2:27 PM Paul Eggert <eggert@cs.ucla.edu> wrote:
>>
>> On 2023-06-30 14:04, Noah Goldstein via Libc-alpha wrote:
>> > Think we should at the very least wait for the generic strlcpy codes
>> > to land first.
>>
>> Let's not optimize these functions at all, unless there's good and
>> measured reason to do so. In practice I expect they're called with
>> small sizes for which optimization is a net minus as it consumes
>> valuable maintenance time with no real benefit.
>
>
> Hi Paul,
>
> Attached is strlcpy/wcslcpy microbenchmark data based on Noah's strlcpy/wcslcpy microbenchmark patch.
>
I don't think the concern is that we can beat the generic impl (which hasn't
even landed yet AFAICT), it whether doing so makes sense given the
usage/goal of the functions.

> https://sourceware.org/pipermail/libc-alpha/2023-April/147557.html
>
> Thanks,
> Sunil
Noah Goldstein June 30, 2023, 11:27 p.m. UTC | #5
On Fri, Jun 30, 2023 at 6:22 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Fri, Jun 30, 2023 at 5:21 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> >
> >
> > On Fri, Jun 30, 2023 at 2:27 PM Paul Eggert <eggert@cs.ucla.edu> wrote:
> >>
> >> On 2023-06-30 14:04, Noah Goldstein via Libc-alpha wrote:
> >> > Think we should at the very least wait for the generic strlcpy codes
> >> > to land first.
> >>
> >> Let's not optimize these functions at all, unless there's good and
> >> measured reason to do so. In practice I expect they're called with
> >> small sizes for which optimization is a net minus as it consumes
> >> valuable maintenance time with no real benefit.
> >
> >
> > Hi Paul,
> >
> > Attached is strlcpy/wcslcpy microbenchmark data based on Noah's strlcpy/wcslcpy microbenchmark patch.
> >
> I don't think the concern is that we can beat the generic impl (which hasn't
> even landed yet AFAICT), it's whether doing so makes sense given the
> usage/goal of the functions.
>

That being said, I'm generally in favor of adding optimized versions since
we happen to be in a position where at least several developers find it worth
their time to maintain, but not before the generic versions have landed.
> > https://sourceware.org/pipermail/libc-alpha/2023-April/147557.html
> >
> > Thanks,
> > Sunil
Florian Weimer July 1, 2023, 9:41 a.m. UTC | #6
* Noah Goldstein via Libc-alpha:

> Think we should at the very least wait for the generic strlcpy codes
> to land first.

Do you mean a version of string/strlcpy.c that is based on a modified
string/stplcpy.c, rather than the one we have now that calls just strlen
and memcpy?

Thanks,
Florian
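
For context, the strlen+memcpy shape described above is roughly the
following C (a minimal sketch of that approach with an illustrative
function name, not the verbatim glibc source):

#include <stddef.h>
#include <string.h>

/* Sketch of a strlen+memcpy based strlcpy: return strlen (src), copy
   at most size - 1 chars, and always NUL-terminate when size > 0.  */
size_t
strlcpy_sketch (char *dst, const char *src, size_t size)
{
  size_t len = strlen (src);
  if (len < size)
    memcpy (dst, src, len + 1);      /* Whole string fits, incl. NUL.  */
  else if (size > 0)
    {
      memcpy (dst, src, size - 1);   /* Truncated copy...  */
      dst[size - 1] = '\0';          /* ...still NUL-terminated.  */
    }
  return len;
}
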
Noah Goldstein July 2, 2023, 1:22 a.m. UTC | #7
On Sat, Jul 1, 2023 at 4:41 AM Florian Weimer <fweimer@redhat.com> wrote:
>
> * Noah Goldstein via Libc-alpha:
>
> > Think we should at the very least wait for the generic strlcpy codes
> > to land first.
>
> Do you mean a version of string/strlcpy.c that is based on a modified
> string/stplcpy.c, rather than the one we have now that calls just strlen
> and memcpy?

Hmm? I mean your strlcpy/strlcat patch to land.
>
> Thanks,
> Florian
>
Florian Weimer July 2, 2023, 6:51 a.m. UTC | #8
* Noah Goldstein:

> On Sat, Jul 1, 2023 at 4:41 AM Florian Weimer <fweimer@redhat.com> wrote:
>>
>> * Noah Goldstein via Libc-alpha:
>>
>> > Think we should at the very least wait for the generic strlcpy codes
>> > to land first.
>>
>> Do you mean a version of string/strlcpy.c that is based on a modified
>> string/stplcpy.c, rather than the one we have now that calls just strlen
>> and memcpy?
>
> Hmm? I mean your strlcpy/strlcat patch to land.

That has already happened?

Thanks,
Florian
Noah Goldstein July 2, 2023, 4:55 p.m. UTC | #9
On Sun, Jul 2, 2023 at 1:51 AM Florian Weimer <fweimer@redhat.com> wrote:
>
> * Noah Goldstein:
>
> > On Sat, Jul 1, 2023 at 4:41 AM Florian Weimer <fweimer@redhat.com> wrote:
> >>
> >> * Noah Goldstein via Libc-alpha:
> >>
> >> > Think we should at the very least wait for the generic strlcpy codes
> >> > to land first.
> >>
> >> Do you mean a version of string/strlcpy.c that is based on a modified
> >> string/stplcpy.c, rather than the one we have now that calls just strlen
> >> and memcpy?
> >
> > Hmm? I mean your strlcpy/strlcat patch to land.
>
> That has already happened?
:/ yup had been a minute since I pulled.

Are we getting stplcpy?
>
> Thanks,
> Florian
>
Florian Weimer July 2, 2023, 5:02 p.m. UTC | #10
* Noah Goldstein:

>> >> Do you mean a version of string/strlcpy.c that is based on a modified
>> >> string/stplcpy.c, rather than the one we have now that calls just strlen
>> >> and memcpy?
>> >
>> > Hmm? I mean your strlcpy/strlcat patch to land.
>>
>> That has already happened?
> :/ yup had been a minute since I pulled.
>
> Are we getting stplcpy?

No.  I mentioned string/stplcpy.c because it's what the generic strcpy
is based upon.  Sorry for the confusion.

Thanks,
Florian
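
To make the distinction concrete: a stpcpy-style single pass would copy
while scanning and only then finish counting for the return value,
roughly as below (a hypothetical sketch of that shape, not committed
code):

#include <stddef.h>

/* Hypothetical single-pass strlcpy: copy and count in one scan, then
   keep scanning src so the return value is still strlen (src).  */
size_t
strlcpy_onepass (char *dst, const char *src, size_t size)
{
  size_t i = 0;
  if (size > 0)
    {
      for (; i < size - 1 && src[i] != '\0'; i++)
        dst[i] = src[i];
      dst[i] = '\0';                 /* Always NUL-terminate.  */
    }
  for (; src[i] != '\0'; i++)        /* Finish the length scan.  */
    ;
  return i;
}

Unlike strlen+memcpy, this touches the copied prefix once instead of
twice, at the cost of a byte-at-a-time loop unless it is vectorized.
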
Noah Goldstein July 2, 2023, 5:03 p.m. UTC | #11
On Fri, Jun 30, 2023 at 3:48 PM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> This patch optimizes strlcpy/wcslcpy string functions for AVX2.
> ---
>  sysdeps/x86_64/multiarch/Makefile          |   4 +
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  18 +
>  sysdeps/x86_64/multiarch/ifunc-strlcpy.h   |  34 ++
>  sysdeps/x86_64/multiarch/strlcpy-avx2.S    | 446 +++++++++++++++++++++
>  sysdeps/x86_64/multiarch/strlcpy-generic.c |  25 ++
>  sysdeps/x86_64/multiarch/strlcpy.c         |  36 ++
>  sysdeps/x86_64/multiarch/wcslcpy-avx2.S    |   4 +
>  sysdeps/x86_64/multiarch/wcslcpy-generic.c |  25 ++
>  sysdeps/x86_64/multiarch/wcslcpy.c         |  35 ++
>  9 files changed, 627 insertions(+)
>  create mode 100644 sysdeps/x86_64/multiarch/ifunc-strlcpy.h
>  create mode 100644 sysdeps/x86_64/multiarch/strlcpy-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/strlcpy-generic.c
>  create mode 100644 sysdeps/x86_64/multiarch/strlcpy.c
>  create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-generic.c
>  create mode 100644 sysdeps/x86_64/multiarch/wcslcpy.c
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index e1e894c963..7e3fc081df 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -82,6 +82,8 @@ sysdep_routines += \
>    strcpy-sse2 \
>    strcpy-sse2-unaligned \
>    strcspn-sse4 \
> +  strlcpy-avx2 \
> +  strlcpy-generic \
>    strlen-avx2 \
>    strlen-avx2-rtm \
>    strlen-evex \
> @@ -153,6 +155,8 @@ sysdep_routines += \
>    wcscpy-evex \
>    wcscpy-generic \
>    wcscpy-ssse3 \
> +  wcslcpy-avx2 \
> +  wcslcpy-generic \
>    wcslen-avx2 \
>    wcslen-avx2-rtm \
>    wcslen-evex \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 5427ff1907..9928dee187 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -751,6 +751,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                      1,
>                                      __strncat_sse2_unaligned))
>
> +  /* Support sysdeps/x86_64/multiarch/strlcpy.c.  */
> +  IFUNC_IMPL (i, name, strlcpy,
> +             X86_IFUNC_IMPL_ADD_V3 (array, i, strlcpy,
> +                                    CPU_FEATURE_USABLE (AVX2),
> +                                    __strlcpy_avx2)
> +             X86_IFUNC_IMPL_ADD_V1 (array, i, strlcpy,
> +                                    1,
> +                                    __strlcpy_generic))
> +
>    /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
>    IFUNC_IMPL (i, name, strncpy,
>               X86_IFUNC_IMPL_ADD_V4 (array, i, strncpy,
> @@ -917,6 +926,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                      1,
>                                      __wcscpy_generic))
>
> +  /* Support sysdeps/x86_64/multiarch/wcslcpy.c.  */
> +  IFUNC_IMPL (i, name, wcslcpy,
> +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcslcpy,
> +                                    CPU_FEATURE_USABLE (AVX2),
> +                                    __wcslcpy_avx2)
> +             X86_IFUNC_IMPL_ADD_V1 (array, i, wcslcpy,
> +                                    1,
> +                                    __wcslcpy_generic))
> +
>    /* Support sysdeps/x86_64/multiarch/wcsncpy.c.  */
>    IFUNC_IMPL (i, name, wcsncpy,
>               X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy,
> diff --git a/sysdeps/x86_64/multiarch/ifunc-strlcpy.h b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h
> new file mode 100644
> index 0000000000..982a30d15b
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h
> @@ -0,0 +1,34 @@
> +/* Common definition for ifunc selections.
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2023 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <init-arch.h>
> +
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
> +
> +static inline void *
> +IFUNC_SELECTOR (void)
> +{
> +  const struct cpu_features *cpu_features = __get_cpu_features ();
> +
> +  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2))
> +    return OPTIMIZE (avx2);
> +
> +  return OPTIMIZE (generic);
> +}
> diff --git a/sysdeps/x86_64/multiarch/strlcpy-avx2.S b/sysdeps/x86_64/multiarch/strlcpy-avx2.S
> new file mode 100644
> index 0000000000..cf54b1e990
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strlcpy-avx2.S
> @@ -0,0 +1,446 @@
> +/* Strlcpy/wcslcpy optimized with AVX2.
> +   Copyright (C) 2023 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (3)
> +
> +# include <sysdep.h>
> +
> +# ifndef VEC_SIZE
> +#  include "x86-avx-vecs.h"
> +# endif
> +
> +# ifndef STRLCPY
> +#  define STRLCPY      __strlcpy_avx2
> +# endif
> +
> +
> +# ifdef USE_AS_WCSLCPY
> +#  define CHAR_SIZE    4
> +#  define MOVU         movl
> +#  define VPCMPEQ      vpcmpeqd
> +#  define VPMINU       vpminud
> +# else
> +#  define CHAR_SIZE    1
> +#  define MOVU         movb
> +#  define VPCMPEQ      vpcmpeqb
> +#  define VPMINU       vpminub
> +# endif
> +
> +# define PMOVMSK       vpmovmskb
> +# define PAGE_SIZE     4096
> +# define VEC_SIZE      32
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> +
> +       .section SECTION(.text),"ax",@progbits
> +/* Aligning the entry point to 64 bytes provides better performance for
> +   one-vector-length strings.  */
> +
> +ENTRY_P2ALIGN (STRLCPY, 6)
> +# ifdef __ILP32__
> +       /* Clear the upper 32 bits.  */
> +       movl    %edx, %edx
> +# endif
> +
> +       /* Zero out vector register for end of string comparison. */
> +       vpxor   %VMM(0), %VMM(0), %VMM(0)
> +       /* Save source pointer for return calculation.  */
> +       mov     %rsi, %r8
> +       mov     %esi, %eax
> +       sall    $20, %eax
> +       cmpl    $((PAGE_SIZE - (VEC_SIZE)) << 20), %eax
> +       ja      L(page_cross)
> +
> +L(page_cross_continue):
> +       /* Load first vector.  */
> +       VMOVU   (%rsi), %VMM(1)
> +       VPCMPEQ %VMM(0), %VMM(1), %VMM(2)
> +       PMOVMSK %VMM(2), %eax
> +       test    %eax, %eax
> +       jnz     L(ret_vec_x1)
> +
> +       test    %rdx, %rdx
> +       jz      L(continue_second_vector)
> +
> +       /* Check whether we can copy full vector.  */
> +       cmp     $CHAR_PER_VEC, %rdx
> +       jbe     L(page_cross_small_vec_copy)
> +       /* Copy first vector.  */
> +       VMOVU   %VMM(1), (%rdi)
> +       sub     $CHAR_PER_VEC, %rdx
> +
> +L(continue_second_vector):
> +       /* Align RSI pointer and adjust RDI based on offset.  */
> +       mov     %rsi, %rax
> +       and     $-VEC_SIZE, %rsi
> +       sub     %rsi, %rax
> +       sub     %rax, %rdi
> +
> +       /* Check whether we have already copied N chars, i.e. RDX is 0.  */
> +       test    %rdx, %rdx
> +       jz      L(skip_copy_alignment_fix)
> +
> +       /* Adjust RDX for copy alignment fix.  */
> +# ifdef USE_AS_WCSLCPY
> +       shr     $2, %rax
> +# endif
> +       add     %rax, %rdx
> +
> +L(skip_copy_alignment_fix):
> +       /* Load second vector.  */
> +       VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(1)
> +       VPCMPEQ %VMM(0), %VMM(1), %VMM(2)
> +       vptest  %VMM(2), %VMM(2)
> +       jnz     L(ret_vec_x2)
> +
> +       /* Skip copy if RDX is 0.  */
> +       test    %rdx, %rdx
> +       jz      L(continue_third_vector)
> +
> +       /* Jump below/equal (instead of below) is used here, because the
> +          last copied character must be NULL.  */
> +       cmp     $CHAR_PER_VEC, %rdx
> +       jbe     L(partial_copy_second_vector)
> +
> +       sub     $CHAR_PER_VEC, %rdx
> +       /* Copy second vector.  */
> +       VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> +
> +L(continue_third_vector):
> +       /* Load third vector.  */
> +       VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(1)
> +       VPCMPEQ %VMM(0), %VMM(1), %VMM(2)
> +       vptest  %VMM(2), %VMM(2)
> +       jnz     L(ret_vec_x3)
> +
> +       /* Skip copy if RDX is 0.  */
> +       test    %rdx, %rdx
> +       jz      L(continue_fourth_vector)
> +
> +       cmp     $CHAR_PER_VEC, %rdx
> +       jbe     L(partial_copy_third_vector)
> +
> +       sub     $CHAR_PER_VEC, %rdx
> +       /* Copy third vector.  */
> +       VMOVU   %VMM(1), (VEC_SIZE * 2)(%rdi)
> +
> +L(continue_fourth_vector):
> +       /* Load fourth vector.  */
> +       VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(1)
> +       VPCMPEQ %VMM(0), %VMM(1), %VMM(2)
> +       vptest  %VMM(2), %VMM(2)
> +       jnz     L(ret_vec_x4)
> +
> +       /* Skip copy if RDX is 0.  */
> +       test    %rdx, %rdx
> +       jz      L(loop_4x_align)
> +
> +       cmp     $CHAR_PER_VEC, %rdx
> +       jbe     L(partial_copy_fourth_vector)
> +
> +       sub     $CHAR_PER_VEC, %rdx
> +       /* Copy fourth vector.  */
> +       VMOVU   %VMM(1), (VEC_SIZE * 3)(%rdi)
> +
> +
> +L(loop_4x_align):
> +       /* Jump to loop if RSI is already 4x-vector aligned.  */
> +       test    $(VEC_SIZE * 4 - 1), %esi
> +       jz      L(loop_4x_read)
> +
> +       mov     %rsi, %rcx
> +
> +       /* Align RSI to 4x vector.  */
> +       and     $(VEC_SIZE * -4), %rsi
> +       sub     %rsi, %rcx
> +
> +       /* Adjust RDI for RSI alignment fix.  */
> +       sub     %rcx, %rdi
> +
> +       /* Jump to loop if RDX is 0.  */
> +       test    %rdx, %rdx
> +       jz      L(loop_4x_read)
> +
> +# ifdef USE_AS_WCSLCPY
> +       shr     $2, %rcx
> +# endif
> +
> +       /* Adjust RDX for RSI alignment fix.  */
> +       add     %rcx, %rdx
> +       jmp     L(loop_4x_read)
> +
> +       .p2align 4,,6
> +L(loop_4x_vec):
> +       /* Skip copy if RDX is 0.  */
> +       test    %rdx, %rdx
> +       jz      L(loop_partial_copy_return)
> +       cmp     $(CHAR_PER_VEC * 4), %rdx
> +       jbe     L(loop_partial_copy)
> +       VMOVU   %VMM(1), (VEC_SIZE * 4)(%rdi)
> +       VMOVU   %VMM(2), (VEC_SIZE * 5)(%rdi)
> +       VMOVU   %VMM(3), (VEC_SIZE * 6)(%rdi)
> +       VMOVU   %VMM(4), (VEC_SIZE * 7)(%rdi)
> +       sub     $(CHAR_PER_VEC * 4), %rdx
> +
> +L(loop_partial_copy_return):
> +       sub     $(VEC_SIZE * -4), %rsi
> +       sub     $(VEC_SIZE * -4), %rdi
> +
> +L(loop_4x_read):
> +       VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 5)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 6)(%rsi), %VMM(3)
> +       VMOVA   (VEC_SIZE * 7)(%rsi), %VMM(4)
> +       VPMINU  %VMM(1), %VMM(2), %VMM(5)
> +       VPMINU  %VMM(3), %VMM(4), %VMM(6)
> +       VPMINU  %VMM(5), %VMM(6), %VMM(7)
> +       VPCMPEQ %VMM(0), %VMM(7), %VMM(7)
> +       vptest  %VMM(7), %VMM(7)
> +
> +       jz      L(loop_4x_vec)
> +
> +       /* Check if string ends in first vector or second vector.  */
> +       lea     (VEC_SIZE * 4)(%rsi), %rax
> +       sub     %r8, %rax
> +# ifdef USE_AS_WCSLCPY
> +       shr     $2, %rax
> +# endif
> +       xor     %r10, %r10
> +       VPCMPEQ %VMM(0), %VMM(5), %VMM(6)
> +       vptest  %VMM(6), %VMM(6)
> +       jnz     L(endloop)
> +       sub     $(CHAR_PER_VEC * -2), %rax
> +       mov     $(CHAR_PER_VEC * 2), %r10
> +       VMOVA   %VMM(3), %VMM(1)
> +       VMOVA   %VMM(4), %VMM(2)
> +
> +L(endloop):
> +       VPCMPEQ %VMM(0), %VMM(1), %VMM(1)
> +       VPCMPEQ %VMM(0), %VMM(2), %VMM(2)
> +       PMOVMSK %VMM(1), %rcx
> +       PMOVMSK %VMM(2), %r9
> +       shlq    $32, %r9
> +       orq     %r9, %rcx
> +       bsf     %rcx, %rcx
> +       /* Shift RCX by 2; VPMOVMSK has only a byte version.  */
> +# ifdef USE_AS_WCSLCPY
> +       shr     $2, %rcx
> +# endif
> +       /* At this point RAX has length to return.  */
> +       add     %rcx, %rax
> +       test    %rdx, %rdx
> +       jz      L(ret)
> +
> +       /* Add 1 to account for NULL character in RDX comparison.  */
> +       lea     1(%r10, %rcx), %rcx
> +       cmp     %rdx, %rcx
> +       cmovb   %rcx, %rdx
> +
> +L(loop_partial_copy):
> +       cmp     $(CHAR_PER_VEC * 2), %rdx
> +       jbe     L(loop_partial_first_half)
> +       /* Reload the first 2 vectors.  */
> +       VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 5)(%rsi), %VMM(2)
> +       VMOVU   %VMM(1), (VEC_SIZE * 4)(%rdi)
> +       VMOVU   %VMM(2), (VEC_SIZE * 5)(%rdi)
> +
> +L(loop_partial_first_half):
> +       /* Go back 2 vectors from the end and use an overlapping copy.
> +          (VEC_SIZE * 4 - VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE)
> +          (VEC_SIZE * 4 - VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE)
> +        */
> +       VMOVU   (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(3)
> +       VMOVU   (VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(4)
> +       VMOVU   %VMM(3), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
> +       VMOVU   %VMM(4), (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE)
> +       MOVU    $0, (VEC_SIZE * 4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +       xor     %rdx, %rdx
> +       vptest  %VMM(7), %VMM(7)
> +       jz      L(loop_partial_copy_return)
> +       ret
> +
> +       .p2align 4
> +L(page_cross):
> +       mov     %rsi, %rcx
> +       mov     %rsi, %r11
> +       and     $-VEC_SIZE, %r11
> +       and     $(VEC_SIZE - 1), %rcx
> +       VMOVA   (%r11), %VMM(1)
> +       VPCMPEQ %VMM(0), %VMM(1), %VMM(2)
> +       PMOVMSK %VMM(2), %eax
> +       shr     %cl, %eax
> +       jz      L(page_cross_continue)
> +
> +L(ret_vec_x1):
> +       bsf     %eax, %eax
> +# ifdef USE_AS_WCSLCPY
> +       shr     $2, %eax
> +# endif
> +       /* Increment by 1 to account for NULL char.  */
> +       lea     1(%eax), %ecx
> +       cmp     %rdx, %rcx
> +       cmovb   %rcx, %rdx
> +       test    %rdx, %rdx
> +       jz      L(ret)
> +
> +L(page_cross_small_vec_copy):
> +       cmp     $(16 / CHAR_SIZE), %rdx
> +       jbe     L(copy_8_byte_scalar)
> +       VMOVU   (%rsi), %VMM_128(1)
> +       VMOVU   -16(%rsi, %rdx, CHAR_SIZE), %VMM_128(3)
> +       VMOVU   %VMM_128(1), (%rdi)
> +       VMOVU   %VMM_128(3), -16(%rdi, %rdx, CHAR_SIZE)
> +       MOVU    $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> +       xor     %rdx, %rdx
> +       vptest  %VMM(2), %VMM(2)
> +       jz      L(continue_second_vector)
> +       ret
> +
> +L(copy_8_byte_scalar):
> +       cmp     $(8 / CHAR_SIZE), %rdx
> +       jbe     L(copy_4_byte_scalar)
> +       movq    (%rsi), %r10
> +       movq    -8(%rsi, %rdx, CHAR_SIZE), %r11
> +       movq    %r10, (%rdi)
> +       movq    %r11, -8(%rdi, %rdx, CHAR_SIZE)
> +       MOVU    $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> +       xor     %edx, %edx
> +       vptest  %VMM(2), %VMM(2)
> +       jz      L(continue_second_vector)
> +       ret
> +
> +L(copy_4_byte_scalar):
> +# ifndef USE_AS_WCSLCPY
> +       cmp     $4, %rdx
> +       jbe     L(copy_2_byte_scalar)
> +# endif
> +       movl    (%rsi), %r10d
> +       movl    -4(%rsi, %rdx, CHAR_SIZE), %r11d
> +       movl    %r10d, (%rdi)
> +       movl    %r11d, -4(%rdi, %rdx, CHAR_SIZE)
> +       MOVU    $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> +       xor     %edx, %edx
> +       vptest  %VMM(2), %VMM(2)
> +       jz      L(continue_second_vector)
> +       ret
> +
> +# ifndef USE_AS_WCSLCPY
> +L(copy_2_byte_scalar):
> +       cmp     $2, %rdx
> +       jbe     L(copy_1_byte_scalar)
> +       movw    (%rsi), %r10w
> +       movw    -(CHAR_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %r11w
> +       movw    %r10w, (%rdi)
> +       movw    %r11w, -(CHAR_SIZE * 3)(%rdi, %rdx, CHAR_SIZE)
> +       MOVU    $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> +       xor     %edx, %edx
> +       vptest  %VMM(2), %VMM(2)
> +       jz      L(continue_second_vector)
> +       ret
> +
> +L(copy_1_byte_scalar):
> +       MOVU    (%rsi), %r10b
> +       MOVU    %r10b, (%rdi)
> +       MOVU    $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> +       xor     %edx, %edx
> +       vptest  %VMM(2), %VMM(2)
> +       jz      L(continue_second_vector)
> +       ret
> +# endif
> +
> +L(ret_vec_x2):
> +       PMOVMSK %VMM(2), %rax
> +       bsf     %rax, %rcx
> +       /* Calculate return value.  */
> +       lea     VEC_SIZE(%rsi, %rcx), %rax
> +       sub     %r8, %rax
> +# ifdef USE_AS_WCSLCPY
> +       shr     $2, %rax
> +       shr     $2, %rcx
> +# endif
> +       inc     %rcx
> +       test    %rdx, %rdx
> +       jz      L(ret)
> +       cmp     %rdx, %rcx
> +       cmovb   %rcx, %rdx
> +
> +L(partial_copy_second_vector):
> +       VMOVU   (%rsi, %rdx, CHAR_SIZE), %VMM(1)
> +       VMOVU   %VMM(1), (%rdi, %rdx, CHAR_SIZE)
> +       MOVU    $0, (VEC_SIZE - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> +       xor     %edx, %edx
> +       vptest  %VMM(2), %VMM(2)
> +       jz      L(continue_third_vector)
> +
> +L(ret):
> +       ret
> +
> +L(ret_vec_x3):
> +       PMOVMSK %VMM(2), %rax
> +       bsf     %rax, %rcx
> +       /* Calculate return value.  */
> +       lea     (VEC_SIZE * 2)(%rsi, %rcx), %rax
> +       sub     %r8, %rax
> +# ifdef USE_AS_WCSLCPY
> +       shr     $2, %rax
> +       shr     $2, %rcx
> +# endif
> +       inc     %rcx
> +       test    %rdx, %rdx
> +       jz      L(ret)
> +       cmp     %rdx, %rcx
> +       cmovb   %rcx, %rdx
> +
> +L(partial_copy_third_vector):
> +       VMOVU   (VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
> +       VMOVU   %VMM(1), (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +       MOVU    $0, ((VEC_SIZE * 2) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> +       xor     %edx, %edx
> +       vptest  %VMM(2), %VMM(2)
> +       jz      L(continue_fourth_vector)
> +       ret
> +
> +L(ret_vec_x4):
> +       PMOVMSK %VMM(2), %rax
> +       bsf     %rax, %rcx
> +       /* Calculate return value.  */
> +       lea     (VEC_SIZE * 3)(%rsi, %rcx), %rax
> +       sub     %r8, %rax
> +# ifdef USE_AS_WCSLCPY
> +       shr     $2, %rax
> +       shr     $2, %rcx
> +# endif
> +       inc     %rcx
> +       test    %rdx, %rdx
> +       jz      L(ret)
> +       cmp     %rdx, %rcx
> +       cmovb   %rcx, %rdx
> +
> +L(partial_copy_fourth_vector):
> +       VMOVU   (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
> +       VMOVU   %VMM(1), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
> +       MOVU    $0, ((VEC_SIZE * 3) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> +       xor     %edx, %edx
> +       vptest  %VMM(2), %VMM(2)
> +       jz      L(continue_fourth_vector)
> +       ret
> +
> +END (STRLCPY)

Is strlcpy/strlcat integrable with the existing strncat impl? Had figured
they would fit in the same file.

> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strlcpy-generic.c b/sysdeps/x86_64/multiarch/strlcpy-generic.c
> new file mode 100644
> index 0000000000..eee3b7b086
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strlcpy-generic.c
> @@ -0,0 +1,25 @@
> +/* strlcpy generic.
> +   Copyright (C) 2023 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <isa-level.h>
> +#if ISA_SHOULD_BUILD (1)
> +# define __strlcpy  __strlcpy_generic
> +# include <string/strlcpy.c>
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strlcpy.c b/sysdeps/x86_64/multiarch/strlcpy.c
> new file mode 100644
> index 0000000000..ded41fbcfb
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strlcpy.c
> @@ -0,0 +1,36 @@
> +/* Multiple versions of strlcpy.
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2023 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +#if IS_IN (libc)
> +# define __strlcpy __redirect_strlcpy
> +# include <string.h>
> +# undef __strlcpy
> +
> +# define SYMBOL_NAME strlcpy
> +# include "ifunc-strlcpy.h"
> +
> +libc_ifunc_redirected (__redirect_strlcpy, __strlcpy, IFUNC_SELECTOR ());
> +weak_alias (__strlcpy, strlcpy)
> +
> +# ifdef SHARED
> +__hidden_ver1 (__strlcpy, __GI___strlcpy, __redirect_strlcpy)
> +  __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlcpy);
> +# endif
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcslcpy-avx2.S b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S
> new file mode 100644
> index 0000000000..dafc20ded0
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S
> @@ -0,0 +1,4 @@
> +#define STRLCPY        __wcslcpy_avx2
> +#define USE_AS_WCSLCPY 1
> +
> +#include "strlcpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcslcpy-generic.c b/sysdeps/x86_64/multiarch/wcslcpy-generic.c
> new file mode 100644
> index 0000000000..ffd3c0e846
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcslcpy-generic.c
> @@ -0,0 +1,25 @@
> +/* wcslcpy generic.
> +   Copyright (C) 2023 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <isa-level.h>
> +#if ISA_SHOULD_BUILD (1)
> +# define __wcslcpy  __wcslcpy_generic
> +# include <wcsmbs/wcslcpy.c>
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcslcpy.c b/sysdeps/x86_64/multiarch/wcslcpy.c
> new file mode 100644
> index 0000000000..371ef9626c
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcslcpy.c
> @@ -0,0 +1,35 @@
> +/* Multiple versions of wcslcpy.
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2023 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +#if IS_IN (libc)
> +# define __wcslcpy __redirect_wcslcpy
> +# include <wchar.h>
> +# undef __wcslcpy
> +
> +# define SYMBOL_NAME wcslcpy
> +# include "ifunc-strlcpy.h"
> +
> +libc_ifunc_redirected (__redirect_wcslcpy, __wcslcpy, IFUNC_SELECTOR ());
> +weak_alias (__wcslcpy, wcslcpy)
> +# ifdef SHARED
> +__hidden_ver1 (__wcslcpy, __GI___wcslcpy, __redirect_wcslcpy)
> +  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcslcpy);
> +# endif
> +#endif
> --
> 2.38.1
>
Sunil Pandey July 2, 2023, 6:37 p.m. UTC | #12
On Sun, Jul 2, 2023 at 10:03 AM Noah Goldstein <goldstein.w.n@gmail.com>
wrote:

> On Fri, Jun 30, 2023 at 3:48 PM Sunil K Pandey via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > This patch optimizes strlcpy/wcslcpy string functions for AVX2.
> > ---
> >  sysdeps/x86_64/multiarch/Makefile          |   4 +
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  18 +
> >  sysdeps/x86_64/multiarch/ifunc-strlcpy.h   |  34 ++
> >  sysdeps/x86_64/multiarch/strlcpy-avx2.S    | 446 +++++++++++++++++++++
> >  sysdeps/x86_64/multiarch/strlcpy-generic.c |  25 ++
> >  sysdeps/x86_64/multiarch/strlcpy.c         |  36 ++
> >  sysdeps/x86_64/multiarch/wcslcpy-avx2.S    |   4 +
> >  sysdeps/x86_64/multiarch/wcslcpy-generic.c |  25 ++
> >  sysdeps/x86_64/multiarch/wcslcpy.c         |  35 ++
> >  9 files changed, 627 insertions(+)
> >  create mode 100644 sysdeps/x86_64/multiarch/ifunc-strlcpy.h
> >  create mode 100644 sysdeps/x86_64/multiarch/strlcpy-avx2.S
> >  create mode 100644 sysdeps/x86_64/multiarch/strlcpy-generic.c
> >  create mode 100644 sysdeps/x86_64/multiarch/strlcpy.c
> >  create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-avx2.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-generic.c
> >  create mode 100644 sysdeps/x86_64/multiarch/wcslcpy.c
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index e1e894c963..7e3fc081df 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -82,6 +82,8 @@ sysdep_routines += \
> >    strcpy-sse2 \
> >    strcpy-sse2-unaligned \
> >    strcspn-sse4 \
> > +  strlcpy-avx2 \
> > +  strlcpy-generic \
> >    strlen-avx2 \
> >    strlen-avx2-rtm \
> >    strlen-evex \
> > @@ -153,6 +155,8 @@ sysdep_routines += \
> >    wcscpy-evex \
> >    wcscpy-generic \
> >    wcscpy-ssse3 \
> > +  wcslcpy-avx2 \
> > +  wcslcpy-generic \
> >    wcslen-avx2 \
> >    wcslen-avx2-rtm \
> >    wcslen-evex \
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index 5427ff1907..9928dee187 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -751,6 +751,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                      1,
> >                                      __strncat_sse2_unaligned))
> >
> > +  /* Support sysdeps/x86_64/multiarch/strlcpy.c.  */
> > +  IFUNC_IMPL (i, name, strlcpy,
> > +             X86_IFUNC_IMPL_ADD_V3 (array, i, strlcpy,
> > +                                    CPU_FEATURE_USABLE (AVX2),
> > +                                    __strlcpy_avx2)
> > +             X86_IFUNC_IMPL_ADD_V1 (array, i, strlcpy,
> > +                                    1,
> > +                                    __strlcpy_generic))
> > +
> >    /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
> >    IFUNC_IMPL (i, name, strncpy,
> >               X86_IFUNC_IMPL_ADD_V4 (array, i, strncpy,
> > @@ -917,6 +926,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                      1,
> >                                      __wcscpy_generic))
> >
> > +  /* Support sysdeps/x86_64/multiarch/wcslcpy.c.  */
> > +  IFUNC_IMPL (i, name, wcslcpy,
> > +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcslcpy,
> > +                                    CPU_FEATURE_USABLE (AVX2),
> > +                                    __wcslcpy_avx2)
> > +             X86_IFUNC_IMPL_ADD_V1 (array, i, wcslcpy,
> > +                                    1,
> > +                                    __wcslcpy_generic))
> > +
> >    /* Support sysdeps/x86_64/multiarch/wcsncpy.c.  */
> >    IFUNC_IMPL (i, name, wcsncpy,
> >               X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy,
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-strlcpy.h b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h
> > new file mode 100644
> > index 0000000000..982a30d15b
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h
> > @@ -0,0 +1,34 @@
> > +/* Common definition for ifunc selections.
> > +   All versions must be listed in ifunc-impl-list.c.
> > +   Copyright (C) 2023 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <init-arch.h>
> > +
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
> > +
> > +static inline void *
> > +IFUNC_SELECTOR (void)
> > +{
> > +  const struct cpu_features *cpu_features = __get_cpu_features ();
> > +
> > +  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2))
> > +    return OPTIMIZE (avx2);
> > +
> > +  return OPTIMIZE (generic);
> > +}
> > diff --git a/sysdeps/x86_64/multiarch/strlcpy-avx2.S b/sysdeps/x86_64/multiarch/strlcpy-avx2.S
> > new file mode 100644
> > index 0000000000..cf54b1e990
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strlcpy-avx2.S
> > @@ -0,0 +1,446 @@
> > +/* Strlcpy/wcslcpy optimized with AVX2.
> > +   Copyright (C) 2023 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <isa-level.h>
> > +
> > +#if ISA_SHOULD_BUILD (3)
> > +
> > +# include <sysdep.h>
> > +
> > +# ifndef VEC_SIZE
> > +#  include "x86-avx-vecs.h"
> > +# endif
> > +
> > +# ifndef STRLCPY
> > +#  define STRLCPY      __strlcpy_avx2
> > +# endif
> > +
> > +
> > +# ifdef USE_AS_WCSLCPY
> > +#  define CHAR_SIZE    4
> > +#  define MOVU         movl
> > +#  define VPCMPEQ      vpcmpeqd
> > +#  define VPMINU       vpminud
> > +# else
> > +#  define CHAR_SIZE    1
> > +#  define MOVU         movb
> > +#  define VPCMPEQ      vpcmpeqb
> > +#  define VPMINU       vpminub
> > +# endif
> > +
> > +# define PMOVMSK       vpmovmskb
> > +# define PAGE_SIZE     4096
> > +# define VEC_SIZE      32
> > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> > +
> > +       .section SECTION(.text),"ax",@progbits
> > +/* Aligning the entry point to 64 bytes provides better performance for
> > +   one-vector-length strings.  */
> > +
> > +ENTRY_P2ALIGN (STRLCPY, 6)
> > +# ifdef __ILP32__
> > +       /* Clear the upper 32 bits.  */
> > +       movl    %edx, %edx
> > +# endif
> > +
> > +       /* Zero out vector register for end of string comparison. */
> > +       vpxor   %VMM(0), %VMM(0), %VMM(0)
> > +       /* Save source pointer for return calculation.  */
> > +       mov     %rsi, %r8
> > +       mov     %esi, %eax
> > +       sall    $20, %eax
> > +       cmpl    $((PAGE_SIZE - (VEC_SIZE)) << 20), %eax
> > +       ja      L(page_cross)
> > +
> > +L(page_cross_continue):
> > +       /* Load first vector.  */
> > +       VMOVU   (%rsi), %VMM(1)
> > +       VPCMPEQ %VMM(0), %VMM(1), %VMM(2)
> > +       PMOVMSK %VMM(2), %eax
> > +       test    %eax, %eax
> > +       jnz     L(ret_vec_x1)
> > +
> > +       test    %rdx, %rdx
> > +       jz      L(continue_second_vector)
> > +
> > +       /* Check whether we can copy full vector.  */
> > +       cmp     $CHAR_PER_VEC, %rdx
> > +       jbe     L(page_cross_small_vec_copy)
> > +       /* Copy first vector.  */
> > +       VMOVU   %VMM(1), (%rdi)
> > +       sub     $CHAR_PER_VEC, %rdx
> > +
> > +L(continue_second_vector):
> > +       /* Align RSI pointer and adjust RDI based on offset.  */
> > +       mov     %rsi, %rax
> > +       and     $-VEC_SIZE, %rsi
> > +       sub     %rsi, %rax
> > +       sub     %rax, %rdi
> > +
> > +       /* Check whether we have already copied N chars, i.e. RDX is 0.  */
> > +       test    %rdx, %rdx
> > +       jz      L(skip_copy_alignment_fix)
> > +
> > +       /* Adjust RDX for copy alignment fix.  */
> > +# ifdef USE_AS_WCSLCPY
> > +       shr     $2, %rax
> > +# endif
> > +       add     %rax, %rdx
> > +
> > +L(skip_copy_alignment_fix):
> > +       /* Load second vector.  */
> > +       VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(1)
> > +       VPCMPEQ %VMM(0), %VMM(1), %VMM(2)
> > +       vptest  %VMM(2), %VMM(2)
> > +       jnz     L(ret_vec_x2)
> > +
> > +       /* Skip copy if RDX is 0.  */
> > +       test    %rdx, %rdx
> > +       jz      L(continue_third_vector)
> > +
> > +       /* Jump below/equal (instead of below) is used here, because the
> > +          last copied character must be NULL.  */
> > +       cmp     $CHAR_PER_VEC, %rdx
> > +       jbe     L(partial_copy_second_vector)
> > +
> > +       sub     $CHAR_PER_VEC, %rdx
> > +       /* Copy second vector.  */
> > +       VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> > +
> > +L(continue_third_vector):
> > +       /* Load third vector.  */
> > +       VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(1)
> > +       VPCMPEQ %VMM(0), %VMM(1), %VMM(2)
> > +       vptest  %VMM(2), %VMM(2)
> > +       jnz     L(ret_vec_x3)
> > +
> > +       /* Skip copy if RDX is 0.  */
> > +       test    %rdx, %rdx
> > +       jz      L(continue_fourth_vector)
> > +
> > +       cmp     $CHAR_PER_VEC, %rdx
> > +       jbe     L(partial_copy_third_vector)
> > +
> > +       sub     $CHAR_PER_VEC, %rdx
> > +       /* Copy third vector.  */
> > +       VMOVU   %VMM(1), (VEC_SIZE * 2)(%rdi)
> > +
> > +L(continue_fourth_vector):
> > +       /* Load fourth vector.  */
> > +       VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(1)
> > +       VPCMPEQ %VMM(0), %VMM(1), %VMM(2)
> > +       vptest  %VMM(2), %VMM(2)
> > +       jnz     L(ret_vec_x4)
> > +
> > +       /* Skip copy if RDX is 0.  */
> > +       test    %rdx, %rdx
> > +       jz      L(loop_4x_align)
> > +
> > +       cmp     $CHAR_PER_VEC, %rdx
> > +       jbe     L(partial_copy_fourth_vector)
> > +
> > +       sub     $CHAR_PER_VEC, %rdx
> > +       /* Copy fourth vector.  */
> > +       VMOVU   %VMM(1), (VEC_SIZE * 3)(%rdi)
> > +
> > +
> > +L(loop_4x_align):
> > +       /* Jump to loop if RSI is already 4x-vector aligned.  */
> > +       test    $(VEC_SIZE * 4 - 1), %esi
> > +       jz      L(loop_4x_read)
> > +
> > +       mov     %rsi, %rcx
> > +
> > +       /* Align RSI to 4x vector.  */
> > +       and     $(VEC_SIZE * -4), %rsi
> > +       sub     %rsi, %rcx
> > +
> > +       /* Adjust RDI for RSI alignment fix.  */
> > +       sub     %rcx, %rdi
> > +
> > +       /* Jump to loop if RDX is 0.  */
> > +       test    %rdx, %rdx
> > +       jz      L(loop_4x_read)
> > +
> > +# ifdef USE_AS_WCSLCPY
> > +       shr     $2, %rcx
> > +# endif
> > +
> > +       /* Adjust RDX for RSI alignment fix.  */
> > +       add     %rcx, %rdx
> > +       jmp     L(loop_4x_read)
> > +
> > +       .p2align 4,,6
> > +L(loop_4x_vec):
> > +       /* Skip copy if RDX is 0.  */
> > +       test    %rdx, %rdx
> > +       jz      L(loop_partial_copy_return)
> > +       cmp     $(CHAR_PER_VEC * 4), %rdx
> > +       jbe     L(loop_partial_copy)
> > +       VMOVU   %VMM(1), (VEC_SIZE * 4)(%rdi)
> > +       VMOVU   %VMM(2), (VEC_SIZE * 5)(%rdi)
> > +       VMOVU   %VMM(3), (VEC_SIZE * 6)(%rdi)
> > +       VMOVU   %VMM(4), (VEC_SIZE * 7)(%rdi)
> > +       sub     $(CHAR_PER_VEC * 4), %rdx
> > +
> > +L(loop_partial_copy_return):
> > +       sub     $(VEC_SIZE * -4), %rsi
> > +       sub     $(VEC_SIZE * -4), %rdi
> > +
> > +L(loop_4x_read):
> > +       VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(1)
> > +       VMOVA   (VEC_SIZE * 5)(%rsi), %VMM(2)
> > +       VMOVA   (VEC_SIZE * 6)(%rsi), %VMM(3)
> > +       VMOVA   (VEC_SIZE * 7)(%rsi), %VMM(4)
> > +       VPMINU  %VMM(1), %VMM(2), %VMM(5)
> > +       VPMINU  %VMM(3), %VMM(4), %VMM(6)
> > +       VPMINU  %VMM(5), %VMM(6), %VMM(7)
> > +       VPCMPEQ %VMM(0), %VMM(7), %VMM(7)
> > +       vptest  %VMM(7), %VMM(7)
> > +
> > +       jz      L(loop_4x_vec)
> > +
> > +       /* Check if string ends in first vector or second vector.  */
> > +       lea     (VEC_SIZE * 4)(%rsi), %rax
> > +       sub     %r8, %rax
> > +# ifdef USE_AS_WCSLCPY
> > +       shr     $2, %rax
> > +# endif
> > +       xor     %r10, %r10
> > +       VPCMPEQ %VMM(0), %VMM(5), %VMM(6)
> > +       vptest  %VMM(6), %VMM(6)
> > +       jnz     L(endloop)
> > +       sub     $(CHAR_PER_VEC * -2), %rax
> > +       mov     $(CHAR_PER_VEC * 2), %r10
> > +       VMOVA   %VMM(3), %VMM(1)
> > +       VMOVA   %VMM(4), %VMM(2)
> > +
> > +L(endloop):
> > +       VPCMPEQ %VMM(0), %VMM(1), %VMM(1)
> > +       VPCMPEQ %VMM(0), %VMM(2), %VMM(2)
> > +       PMOVMSK %VMM(1), %rcx
> > +       PMOVMSK %VMM(2), %r9
> > +       shlq    $32, %r9
> > +       orq     %r9, %rcx
> > +       bsf     %rcx, %rcx
> > +       /* Shift RCX by 2; VPMOVMSK has only a byte version.  */
> > +# ifdef USE_AS_WCSLCPY
> > +       shr     $2, %rcx
> > +# endif
> > +       /* At this point RAX has length to return.  */
> > +       add     %rcx, %rax
> > +       test    %rdx, %rdx
> > +       jz      L(ret)
> > +
> > +       /* Add 1 to account for NULL character in RDX comparison.  */
> > +       lea     1(%r10, %rcx), %rcx
> > +       cmp     %rdx, %rcx
> > +       cmovb   %rcx, %rdx
> > +
> > +L(loop_partial_copy):
> > +       cmp     $(CHAR_PER_VEC * 2), %rdx
> > +       jbe     L(loop_partial_first_half)
> > +       /* Reload the first 2 vectors.  */
> > +       VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(1)
> > +       VMOVA   (VEC_SIZE * 5)(%rsi), %VMM(2)
> > +       VMOVU   %VMM(1), (VEC_SIZE * 4)(%rdi)
> > +       VMOVU   %VMM(2), (VEC_SIZE * 5)(%rdi)
> > +
> > +L(loop_partial_first_half):
> > +       /* Go back 2 vectors from the end and use an overlapping copy.
> > +          (VEC_SIZE * 4 - VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE)
> > +          (VEC_SIZE * 4 - VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE)
> > +        */
> > +       VMOVU   (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(3)
> > +       VMOVU   (VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(4)
> > +       VMOVU   %VMM(3), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
> > +       VMOVU   %VMM(4), (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE)
> > +       MOVU    $0, (VEC_SIZE * 4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > +       xor     %rdx, %rdx
> > +       vptest  %VMM(7), %VMM(7)
> > +       jz      L(loop_partial_copy_return)
> > +       ret
> > +
> > +       .p2align 4
> > +L(page_cross):
> > +       mov     %rsi, %rcx
> > +       mov     %rsi, %r11
> > +       and     $-VEC_SIZE, %r11
> > +       and     $(VEC_SIZE - 1), %rcx
> > +       VMOVA   (%r11), %VMM(1)
> > +       VPCMPEQ %VMM(0), %VMM(1), %VMM(2)
> > +       PMOVMSK %VMM(2), %eax
> > +       shr     %cl, %eax
> > +       jz      L(page_cross_continue)
> > +
> > +L(ret_vec_x1):
> > +       bsf     %eax, %eax
> > +# ifdef USE_AS_WCSLCPY
> > +       shr     $2, %eax
> > +# endif
> > +       /* Increment by 1 to account for NULL char.  */
> > +       lea     1(%eax), %ecx
> > +       cmp     %rdx, %rcx
> > +       cmovb   %rcx, %rdx
> > +       test    %rdx, %rdx
> > +       jz      L(ret)
> > +
> > +L(page_cross_small_vec_copy):
> > +       cmp     $(16 / CHAR_SIZE), %rdx
> > +       jbe     L(copy_8_byte_scalar)
> > +       VMOVU   (%rsi), %VMM_128(1)
> > +       VMOVU   -16(%rsi, %rdx, CHAR_SIZE), %VMM_128(3)
> > +       VMOVU   %VMM_128(1), (%rdi)
> > +       VMOVU   %VMM_128(3), -16(%rdi, %rdx, CHAR_SIZE)
> > +       MOVU    $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> > +       xor     %rdx, %rdx
> > +       vptest  %VMM(2), %VMM(2)
> > +       jz      L(continue_second_vector)
> > +       ret
> > +
> > +L(copy_8_byte_scalar):
> > +       cmp     $(8 / CHAR_SIZE), %rdx
> > +       jbe     L(copy_4_byte_scalar)
> > +       movq    (%rsi), %r10
> > +       movq    -8(%rsi, %rdx, CHAR_SIZE), %r11
> > +       movq    %r10, (%rdi)
> > +       movq    %r11, -8(%rdi, %rdx, CHAR_SIZE)
> > +       MOVU    $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> > +       xor     %edx, %edx
> > +       vptest  %VMM(2), %VMM(2)
> > +       jz      L(continue_second_vector)
> > +       ret
> > +
> > +L(copy_4_byte_scalar):
> > +# ifndef USE_AS_WCSLCPY
> > +       cmp     $4, %rdx
> > +       jbe     L(copy_2_byte_scalar)
> > +# endif
> > +       movl    (%rsi), %r10d
> > +       movl    -4(%rsi, %rdx, CHAR_SIZE), %r11d
> > +       movl    %r10d, (%rdi)
> > +       movl    %r11d, -4(%rdi, %rdx, CHAR_SIZE)
> > +       MOVU    $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> > +       xor     %edx, %edx
> > +       vptest  %VMM(2), %VMM(2)
> > +       jz      L(continue_second_vector)
> > +       ret
> > +
> > +# ifndef USE_AS_WCSLCPY
> > +L(copy_2_byte_scalar):
> > +       cmp     $2, %rdx
> > +       jbe     L(copy_1_byte_scalar)
> > +       movw    (%rsi), %r10w
> > +       movw    -(CHAR_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %r11w
> > +       movw    %r10w, (%rdi)
> > +       movw    %r11w, -(CHAR_SIZE * 3)(%rdi, %rdx, CHAR_SIZE)
> > +       MOVU    $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> > +       xor     %edx, %edx
> > +       vptest  %VMM(2), %VMM(2)
> > +       jz      L(continue_second_vector)
> > +       ret
> > +
> > +L(copy_1_byte_scalar):
> > +       MOVU    (%rsi), %r10b
> > +       MOVU    %r10b, (%rdi)
> > +       MOVU    $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> > +       xor     %edx, %edx
> > +       vptest  %VMM(2), %VMM(2)
> > +       jz      L(continue_second_vector)
> > +       ret
> > +# endif
> > +
> > +L(ret_vec_x2):
> > +       PMOVMSK %VMM(2), %rax
> > +       bsf     %rax, %rcx
> > +       /* Calculate return value.  */
> > +       lea     VEC_SIZE(%rsi, %rcx), %rax
> > +       sub     %r8, %rax
> > +# ifdef USE_AS_WCSLCPY
> > +       shr     $2, %rax
> > +       shr     $2, %rcx
> > +# endif
> > +       inc     %rcx
> > +       test    %rdx, %rdx
> > +       jz      L(ret)
> > +       cmp     %rdx, %rcx
> > +       cmovb   %rcx, %rdx
> > +
> > +L(partial_copy_second_vector):
> > +       VMOVU   (%rsi, %rdx, CHAR_SIZE), %VMM(1)
> > +       VMOVU   %VMM(1), (%rdi, %rdx, CHAR_SIZE)
> > +       MOVU    $0, (VEC_SIZE - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> > +       xor     %edx, %edx
> > +       vptest  %VMM(2), %VMM(2)
> > +       jz      L(continue_third_vector)
> > +
> > +L(ret):
> > +       ret
> > +
> > +L(ret_vec_x3):
> > +       PMOVMSK %VMM(2), %rax
> > +       bsf     %rax, %rcx
> > +       /* Calculate return value.  */
> > +       lea     (VEC_SIZE * 2)(%rsi, %rcx), %rax
> > +       sub     %r8, %rax
> > +# ifdef USE_AS_WCSLCPY
> > +       shr     $2, %rax
> > +       shr     $2, %rcx
> > +# endif
> > +       inc     %rcx
> > +       test    %rdx, %rdx
> > +       jz      L(ret)
> > +       cmp     %rdx, %rcx
> > +       cmovb   %rcx, %rdx
> > +
> > +L(partial_copy_third_vector):
> > +       VMOVU   (VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
> > +       VMOVU   %VMM(1), (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > +       MOVU    $0, ((VEC_SIZE * 2) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> > +       xor     %edx, %edx
> > +       vptest  %VMM(2), %VMM(2)
> > +       jz      L(continue_fourth_vector)
> > +       ret
> > +
> > +L(ret_vec_x4):
> > +       PMOVMSK %VMM(2), %rax
> > +       bsf     %rax, %rcx
> > +       /* Calculate return value.  */
> > +       lea     (VEC_SIZE * 3)(%rsi, %rcx), %rax
> > +       sub     %r8, %rax
> > +# ifdef USE_AS_WCSLCPY
> > +       shr     $2, %rax
> > +       shr     $2, %rcx
> > +# endif
> > +       inc     %rcx
> > +       test    %rdx, %rdx
> > +       jz      L(ret)
> > +       cmp     %rdx, %rcx
> > +       cmovb   %rcx, %rdx
> > +
> > +L(partial_copy_fourth_vector):
> > +       VMOVU   (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
> > +       VMOVU   %VMM(1), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
> > +       MOVU    $0, ((VEC_SIZE * 3) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> > +       xor     %edx, %edx
> > +       vptest  %VMM(2), %VMM(2)
> > +       jz      L(continue_fourth_vector)
> > +       ret
> > +
> > +END (STRLCPY)
>
> Is strlcpy/strlcat integratable with existing strncat impl? Had
> figured they would fit in the same file.
>

Hi Noah,

It may not be a good idea to put strlcpy/strlcat in the existing
strncpy/strncat impl files, as the strlcpy/strlcat functions are
associated with the GLIBC_2.38 ABI.
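
For context, a rough sketch of what that entails (illustrative only;
see the actual Versions files for the exact entries): these symbols
are exported under the 2.38 version set, along the lines of

  libc {
    GLIBC_2.38 {
      strlcat; strlcpy; wcslcat; wcslcpy;
    }
  }

which is why I'd rather keep them in their own generic/ifunc files
instead of piggybacking on files for symbols that predate that
version node.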

--Sunil


> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/strlcpy-generic.c b/sysdeps/x86_64/multiarch/strlcpy-generic.c
> > new file mode 100644
> > index 0000000000..eee3b7b086
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strlcpy-generic.c
> > @@ -0,0 +1,25 @@
> > +/* strlcpy generic.
> > +   Copyright (C) 2023 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
> > +#include <isa-level.h>
> > +#if ISA_SHOULD_BUILD (1)
> > +# define __strlcpy  __strlcpy_generic
> > +# include <string/strlcpy.c>
> > +
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/strlcpy.c b/sysdeps/x86_64/multiarch/strlcpy.c
> > new file mode 100644
> > index 0000000000..ded41fbcfb
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strlcpy.c
> > @@ -0,0 +1,36 @@
> > +/* Multiple versions of strlcpy.
> > +   All versions must be listed in ifunc-impl-list.c.
> > +   Copyright (C) 2023 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* Define multiple versions only for the definition in libc.  */
> > +#if IS_IN (libc)
> > +# define __strlcpy __redirect_strlcpy
> > +# include <string.h>
> > +# undef __strlcpy
> > +
> > +# define SYMBOL_NAME strlcpy
> > +# include "ifunc-strlcpy.h"
> > +
> > +libc_ifunc_redirected (__redirect_strlcpy, __strlcpy, IFUNC_SELECTOR ());
> > +weak_alias (__strlcpy, strlcpy)
> > +
> > +# ifdef SHARED
> > +__hidden_ver1 (__strlcpy, __GI___strlcpy, __redirect_strlcpy)
> > +  __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlcpy);
> > +# endif
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-avx2.S b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S
> > new file mode 100644
> > index 0000000000..dafc20ded0
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S
> > @@ -0,0 +1,4 @@
> > +#define STRLCPY        __wcslcpy_avx2
> > +#define USE_AS_WCSLCPY 1
> > +
> > +#include "strlcpy-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-generic.c b/sysdeps/x86_64/multiarch/wcslcpy-generic.c
> > new file mode 100644
> > index 0000000000..ffd3c0e846
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcslcpy-generic.c
> > @@ -0,0 +1,25 @@
> > +/* wcslcpy generic.
> > +   Copyright (C) 2023 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
> > +#include <isa-level.h>
> > +#if ISA_SHOULD_BUILD (1)
> > +# define __wcslcpy  __wcslcpy_generic
> > +# include <wcsmbs/wcslcpy.c>
> > +
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy.c b/sysdeps/x86_64/multiarch/wcslcpy.c
> > new file mode 100644
> > index 0000000000..371ef9626c
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcslcpy.c
> > @@ -0,0 +1,35 @@
> > +/* Multiple versions of wcslcpy.
> > +   All versions must be listed in ifunc-impl-list.c.
> > +   Copyright (C) 2023 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* Define multiple versions only for the definition in libc.  */
> > +#if IS_IN (libc)
> > +# define __wcslcpy __redirect_wcslcpy
> > +# include <wchar.h>
> > +# undef __wcslcpy
> > +
> > +# define SYMBOL_NAME wcslcpy
> > +# include "ifunc-strlcpy.h"
> > +
> > +libc_ifunc_redirected (__redirect_wcslcpy, __wcslcpy, IFUNC_SELECTOR ());
> > +weak_alias (__wcslcpy, wcslcpy)
> > +# ifdef SHARED
> > +__hidden_ver1 (__wcslcpy, __GI___wcslcpy, __redirect_wcslcpy)
> > +  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcslcpy);
> > +# endif
> > +#endif
> > --
> > 2.38.1
> >
>
Noah Goldstein July 2, 2023, 6:54 p.m. UTC | #13
On Sun, Jul 2, 2023 at 1:38 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
>
>
> On Sun, Jul 2, 2023 at 10:03 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>
>> On Fri, Jun 30, 2023 at 3:48 PM Sunil K Pandey via Libc-alpha
>> <libc-alpha@sourceware.org> wrote:
>> >
>> > This patch optimizes strlcpy/wsclcpy string functions for AVX2.
>> > [...]
>>
>> Is strlcpy/strlcat integratable with existing strncat impl? Had
>> figured they would fit in the same file.
>
>
> Hi Noah,
>
> It may not be a good idea to put strlcpy/strlcat in the existing
> strncpy/strncat impl files, as the strlcpy/strlcat functions are
> associated with the GLIBC_2.38 ABI.
>
Well, we can put the impl there and include it from another to manage
any special link cases.
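
Something like the wcslcpy-avx2.S wrapper already in this patch, e.g.
(purely illustrative, assuming the shared body lived in the strncat
file):

  #define USE_AS_STRLCPY 1
  #define STRNCAT        __strlcpy_avx2
  #include "strncat-avx2.S"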

> --Sunil
>
Sunil Pandey July 3, 2023, 1:03 a.m. UTC | #14
On Sun, Jul 2, 2023 at 11:54 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:

> On Sun, Jul 2, 2023 at 1:38 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> > On Sun, Jul 2, 2023 at 10:03 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >>
> >> On Fri, Jun 30, 2023 at 3:48 PM Sunil K Pandey via Libc-alpha
> >> <libc-alpha@sourceware.org> wrote:
> >> >
> >> > This patch optimizes strlcpy/wsclcpy string functions for AVX2.
> >> > [...]
> >> > +       cmovb   %rcx, %rdx
> >> > +
> >> > +L(partial_copy_second_vector):
> >> > +       VMOVU   (%rsi, %rdx, CHAR_SIZE), %VMM(1)
> >> > +       VMOVU   %VMM(1), (%rdi, %rdx, CHAR_SIZE)
> >> > +       MOVU    $0, (VEC_SIZE - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> >> > +       xor     %edx, %edx
> >> > +       vptest  %VMM(2), %VMM(2)
> >> > +       jz      L(continue_third_vector)
> >> > +
> >> > +L(ret):
> >> > +       ret
> >> > +
> >> > +L(ret_vec_x3):
> >> > +       PMOVMSK %VMM(2), %rax
> >> > +       bsf     %rax, %rcx
> >> > +       /* Calculate return value.  */
> >> > +       lea     (VEC_SIZE * 2)(%rsi, %rcx), %rax
> >> > +       sub     %r8, %rax
> >> > +# ifdef USE_AS_WCSLCPY
> >> > +       shr     $2, %rax
> >> > +       shr     $2, %rcx
> >> > +# endif
> >> > +       inc     %rcx
> >> > +       test    %rdx, %rdx
> >> > +       jz      L(ret)
> >> > +       cmp     %rdx, %rcx
> >> > +       cmovb   %rcx, %rdx
> >> > +
> >> > +L(partial_copy_third_vector):
> >> > +       VMOVU   (VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
> >> > +       VMOVU   %VMM(1), (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
> >> > +       MOVU    $0, ((VEC_SIZE * 2) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> >> > +       xor     %edx, %edx
> >> > +       vptest  %VMM(2), %VMM(2)
> >> > +       jz      L(continue_fourth_vector)
> >> > +       ret
> >> > +
> >> > +L(ret_vec_x4):
> >> > +       PMOVMSK %VMM(2), %rax
> >> > +       bsf     %rax, %rcx
> >> > +       /* Calculate return value.  */
> >> > +       lea     (VEC_SIZE * 3)(%rsi, %rcx), %rax
> >> > +       sub     %r8, %rax
> >> > +# ifdef USE_AS_WCSLCPY
> >> > +       shr     $2, %rax
> >> > +       shr     $2, %rcx
> >> > +# endif
> >> > +       inc     %rcx
> >> > +       test    %rdx, %rdx
> >> > +       jz      L(ret)
> >> > +       cmp     %rdx, %rcx
> >> > +       cmovb   %rcx, %rdx
> >> > +
> >> > +L(partial_copy_fourth_vector):
> >> > +       VMOVU   (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
> >> > +       VMOVU   %VMM(1), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
> >> > +       MOVU    $0, ((VEC_SIZE * 3) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
> >> > +       xor     %edx, %edx
> >> > +       vptest  %VMM(2), %VMM(2)
> >> > +       jz      L(continue_fourth_vector)
> >> > +       ret
> >> > +
> >> > +END (STRLCPY)
> >>
> >> Is strlcpy/strlcat integrable with the existing strncat impl? Had
> >> figured they would fit in the same file.
> >
> >
> > Hi Noah,
> >
> > It may not be a good idea to put strlcpy/strlcat in the existing strncpy/strncat impl file,
> > as strlcpy/strlcat functions are associated with the GLIBC_2.38 ABI.
> >
> Well, we can put the impl there and include it from another to manage
> any special link cases.
>

Due to the ABI, none of the strlcpy/strlcat changes can go into a glibc
version earlier than 2.38. To avoid any future strncpy backporting
complications, it is better to keep them in separate files for now.


> > --Sunil
> >
> >>
> >> > +#endif
> >> > diff --git a/sysdeps/x86_64/multiarch/strlcpy-generic.c b/sysdeps/x86_64/multiarch/strlcpy-generic.c
> >> > new file mode 100644
> >> > index 0000000000..eee3b7b086
> >> > --- /dev/null
> >> > +++ b/sysdeps/x86_64/multiarch/strlcpy-generic.c
> >> > @@ -0,0 +1,25 @@
> >> > +/* strlcpy generic.
> >> > +   Copyright (C) 2023 Free Software Foundation, Inc.
> >> > +   This file is part of the GNU C Library.
> >> > +
> >> > +   The GNU C Library is free software; you can redistribute it and/or
> >> > +   modify it under the terms of the GNU Lesser General Public
> >> > +   License as published by the Free Software Foundation; either
> >> > +   version 2.1 of the License, or (at your option) any later version.
> >> > +
> >> > +   The GNU C Library is distributed in the hope that it will be useful,
> >> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> >> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> >> > +   Lesser General Public License for more details.
> >> > +
> >> > +   You should have received a copy of the GNU Lesser General Public
> >> > +   License along with the GNU C Library; if not, see
> >> > +   <https://www.gnu.org/licenses/>.  */
> >> > +
> >> > +
> >> > +#include <isa-level.h>
> >> > +#if ISA_SHOULD_BUILD (1)
> >> > +# define __strlcpy  __strlcpy_generic
> >> > +# include <string/strlcpy.c>
> >> > +
> >> > +#endif
> >> > diff --git a/sysdeps/x86_64/multiarch/strlcpy.c b/sysdeps/x86_64/multiarch/strlcpy.c
> >> > new file mode 100644
> >> > index 0000000000..ded41fbcfb
> >> > --- /dev/null
> >> > +++ b/sysdeps/x86_64/multiarch/strlcpy.c
> >> > @@ -0,0 +1,36 @@
> >> > +/* Multiple versions of strlcpy.
> >> > +   All versions must be listed in ifunc-impl-list.c.
> >> > +   Copyright (C) 2023 Free Software Foundation, Inc.
> >> > +   This file is part of the GNU C Library.
> >> > +
> >> > +   The GNU C Library is free software; you can redistribute it and/or
> >> > +   modify it under the terms of the GNU Lesser General Public
> >> > +   License as published by the Free Software Foundation; either
> >> > +   version 2.1 of the License, or (at your option) any later version.
> >> > +
> >> > +   The GNU C Library is distributed in the hope that it will be useful,
> >> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> >> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> >> > +   Lesser General Public License for more details.
> >> > +
> >> > +   You should have received a copy of the GNU Lesser General Public
> >> > +   License along with the GNU C Library; if not, see
> >> > +   <https://www.gnu.org/licenses/>.  */
> >> > +
> >> > +/* Define multiple versions only for the definition in libc.  */
> >> > +#if IS_IN (libc)
> >> > +# define __strlcpy __redirect_strlcpy
> >> > +# include <string.h>
> >> > +# undef __strlcpy
> >> > +
> >> > +# define SYMBOL_NAME strlcpy
> >> > +# include "ifunc-strlcpy.h"
> >> > +
> >> > +libc_ifunc_redirected (__redirect_strlcpy, __strlcpy, IFUNC_SELECTOR ());
> >> > +weak_alias (__strlcpy, strlcpy)
> >> > +
> >> > +# ifdef SHARED
> >> > +__hidden_ver1 (__strlcpy, __GI___strlcpy, __redirect_strlcpy)
> >> > +  __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlcpy);
> >> > +# endif
> >> > +#endif
> >> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-avx2.S b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S
> >> > new file mode 100644
> >> > index 0000000000..dafc20ded0
> >> > --- /dev/null
> >> > +++ b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S
> >> > @@ -0,0 +1,4 @@
> >> > +#define STRLCPY        __wcslcpy_avx2
> >> > +#define USE_AS_WCSLCPY 1
> >> > +
> >> > +#include "strlcpy-avx2.S"
> >> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-generic.c b/sysdeps/x86_64/multiarch/wcslcpy-generic.c
> >> > new file mode 100644
> >> > index 0000000000..ffd3c0e846
> >> > --- /dev/null
> >> > +++ b/sysdeps/x86_64/multiarch/wcslcpy-generic.c
> >> > @@ -0,0 +1,25 @@
> >> > +/* wcslcpy generic.
> >> > +   Copyright (C) 2023 Free Software Foundation, Inc.
> >> > +   This file is part of the GNU C Library.
> >> > +
> >> > +   The GNU C Library is free software; you can redistribute it and/or
> >> > +   modify it under the terms of the GNU Lesser General Public
> >> > +   License as published by the Free Software Foundation; either
> >> > +   version 2.1 of the License, or (at your option) any later version.
> >> > +
> >> > +   The GNU C Library is distributed in the hope that it will be useful,
> >> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> >> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> >> > +   Lesser General Public License for more details.
> >> > +
> >> > +   You should have received a copy of the GNU Lesser General Public
> >> > +   License along with the GNU C Library; if not, see
> >> > +   <https://www.gnu.org/licenses/>.  */
> >> > +
> >> > +
> >> > +#include <isa-level.h>
> >> > +#if ISA_SHOULD_BUILD (1)
> >> > +# define __wcslcpy  __wcslcpy_generic
> >> > +# include <wcsmbs/wcslcpy.c>
> >> > +
> >> > +#endif
> >> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy.c b/sysdeps/x86_64/multiarch/wcslcpy.c
> >> > new file mode 100644
> >> > index 0000000000..371ef9626c
> >> > --- /dev/null
> >> > +++ b/sysdeps/x86_64/multiarch/wcslcpy.c
> >> > @@ -0,0 +1,35 @@
> >> > +/* Multiple versions of wcslcpy.
> >> > +   All versions must be listed in ifunc-impl-list.c.
> >> > +   Copyright (C) 2023 Free Software Foundation, Inc.
> >> > +   This file is part of the GNU C Library.
> >> > +
> >> > +   The GNU C Library is free software; you can redistribute it and/or
> >> > +   modify it under the terms of the GNU Lesser General Public
> >> > +   License as published by the Free Software Foundation; either
> >> > +   version 2.1 of the License, or (at your option) any later version.
> >> > +
> >> > +   The GNU C Library is distributed in the hope that it will be useful,
> >> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> >> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> >> > +   Lesser General Public License for more details.
> >> > +
> >> > +   You should have received a copy of the GNU Lesser General Public
> >> > +   License along with the GNU C Library; if not, see
> >> > +   <https://www.gnu.org/licenses/>.  */
> >> > +
> >> > +/* Define multiple versions only for the definition in libc.  */
> >> > +#if IS_IN (libc)
> >> > +# define __wcslcpy __redirect_wcslcpy
> >> > +# include <wchar.h>
> >> > +# undef __wcslcpy
> >> > +
> >> > +# define SYMBOL_NAME wcslcpy
> >> > +# include "ifunc-strlcpy.h"
> >> > +
> >> > +libc_ifunc_redirected (__redirect_wcslcpy, __wcslcpy, IFUNC_SELECTOR ());
> >> > +weak_alias (__wcslcpy, wcslcpy)
> >> > +# ifdef SHARED
> >> > +__hidden_ver1 (__wcslcpy, __GI___wcslcpy, __redirect_wcslcpy)
> >> > +  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcslcpy);
> >> > +# endif
> >> > +#endif
> >> > --
> >> > 2.38.1
> >> >
>
Noah Goldstein July 3, 2023, 1:47 a.m. UTC | #15
On Sun, Jul 2, 2023 at 8:04 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
>
>
> On Sun, Jul 2, 2023 at 11:54 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>
>> On Sun, Jul 2, 2023 at 1:38 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>> >
>> >
>> >
>> > On Sun, Jul 2, 2023 at 10:03 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>> >>
>> >> On Fri, Jun 30, 2023 at 3:48 PM Sunil K Pandey via Libc-alpha
>> >> <libc-alpha@sourceware.org> wrote:
>> >> >
>> >> > This patch optimizes strlcpy/wcslcpy string functions for AVX2.
>> >> > [...]
>> >>
>> >> Is strlcpy/strlcat integrable with the existing strncat impl? Had
>> >> figured they would fit in the same file.
>> >
>> >
>> > Hi Noah,
>> >
>> > It may not be a good idea to put strlcpy/strlcat in the existing strncpy/strncat impl file,
>> > as strlcpy/strlcat functions are associated with the GLIBC_2.38 ABI.
>> >
>> Well, we can put the impl there and include it from another to manage
>> any special link cases.
>
>
> Due to the ABI, none of the strlcpy/strlcat changes can go into a glibc
> version earlier than 2.38. To avoid any future strncpy backporting
> complications, it is better to keep them in separate files for now.
>
I get that, but can't we just have an impl file that implements all
the functions' logic? It would only build strl* if it's included
(similar to how the strlen avx512 impl is currently set up).
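(For illustration, a rough C analogue of the layout Noah describes; the
file names and the WANT_STRLCPY guard are hypothetical, not anything in
the tree.  One body file carries the shared logic, and each variant
file opts in to just the symbols it wants, so a strncpy backport need
not drag the GLIBC_2.38 strl* symbols along.)

/* lcpy-impl.h (hypothetical): shared body, included by each variant.  */
#include <string.h>

static inline size_t
lcpy_body (char *dst, const char *src, size_t size)
{
  size_t len = strlen (src);	/* full source length for the return */
  if (size != 0)
    {
      size_t n = len < size ? len : size - 1;
      memcpy (dst, src, n);
      dst[n] = '\0';		/* always NUL-terminate when size > 0 */
    }
  return len;
}

/* strlcpy-avx2.c (hypothetical wrapper): only a wrapper that defines
   WANT_STRLCPY emits the strl* entry point.  */
#define WANT_STRLCPY 1
#ifdef WANT_STRLCPY
size_t
__strlcpy_avx2 (char *dst, const char *src, size_t size)
{
  return lcpy_body (dst, src, size);
}
#endif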

Adhemerval Zanella Netto July 3, 2023, 12:55 p.m. UTC | #16
On 30/06/23 18:27, Paul Eggert wrote:
> On 2023-06-30 14:04, Noah Goldstein via Libc-alpha wrote:
>> Think we should at the very least wait for the generic strlcpy codes
>> to land first.
> 
> Let's not optimize these functions at all unless there's a good and measured reason to do so. In practice I expect they're called with small sizes, for which optimization is a net minus, as it consumes valuable maintenance time with no real benefit.

I tend to agree; although these are now added in the next POSIX, my
understanding is that they are still not encouraged to be used, due to
the multiple shortcomings raised in previous discussions.
Paul Eggert July 3, 2023, 4:30 p.m. UTC | #17
On 2023-06-30 15:21, Sunil Pandey wrote:
> Attached is strcpy/wcslcpy microbenchmark data based on Noah
> strlcpy/wcslcpy microbenchmark patch.

Although it's helpful to know that the proposed patch improves 
microbenchmark scores, that's not enough to justify it. Let's see 
benchmarks of real programs. If they don't show significant wins, let's 
not bother.

Programs that use strlcpy, by and large, don't use it in 
performance-sensitive areas, and their developers and users are far more 
worried about security than about performance. Making the implementation 
harder to audit will likely be a net negative for these applications. 
This doesn't sound like a win at all.

Plus, who uses wcslcpy? Why bother to tune it if nobody uses it?
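(For what it's worth, the microbenchmark side of that argument boils
down to a loop like the sketch below; everything here is hypothetical
and cache-hot, which is exactly the case being called unrepresentative
of real strlcpy callers.  It assumes a strlcpy declaration is
available, e.g. from glibc 2.38 string.h.)

#include <stdio.h>
#include <string.h>
#include <time.h>

int
main (void)
{
  char dst[64];
  volatile char sink;
  const char *src = "short, cache-hot input";
  struct timespec t0, t1;

  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (long i = 0; i < 100000000L; i++)
    {
      strlcpy (dst, src, sizeof dst);
      sink = dst[0];	/* keep the copies from being optimized away */
    }
  clock_gettime (CLOCK_MONOTONIC, &t1);

  printf ("%.3f s\n", (double) (t1.tv_sec - t0.tv_sec)
	  + (t1.tv_nsec - t0.tv_nsec) / 1e9);
  return 0;
}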
Noah Goldstein July 3, 2023, 6:40 p.m. UTC | #18
On Mon, Jul 3, 2023 at 11:30 AM Paul Eggert <eggert@cs.ucla.edu> wrote:
>
> On 2023-06-30 15:21, Sunil Pandey wrote:
> > Attached is strcpy/wcslcpy microbenchmark data based on Noah
> > strlcpy/wcslcpy microbenchmark patch.
>
> Although it's helpful to know that the proposed patch improves
> microbenchmark scores, that's not enough to justify it. Let's see
> benchmarks of real programs. If they don't show significant wins, let's
> not bother.
>
> Programs that use strlcpy, by and large, don't use it in
> performance-sensitive areas, and their developers and users are far more
> worried about security than about performance. Making the implementation
> harder to audit will likely be a net negative for these applications.
> This doesn't sound like a win at all.
>
> Plus, who uses wcslcpy? Why bother to tune it if nobody uses it?

Think we should look into dropping the optimized strcpy/wcscpy family
in general? For the most part I don't see them in perf-sensitive areas
anyway (generally people that care about perf maintain the length
and use mem* functions).
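(A minimal sketch of the pattern Noah means, with illustrative names:
the caller tracks the length once, and later copies become a plain
memcpy of a known size instead of rescanning with str* routines.)

#include <string.h>

struct counted_str { char *data; size_t len; };

/* cap is the capacity of dst->data and is assumed to be nonzero.  */
static void
set_str (struct counted_str *dst, size_t cap, const char *src, size_t len)
{
  size_t n = len < cap ? len : cap - 1;
  memcpy (dst->data, src, n);	/* no NUL scan: the length is known */
  dst->data[n] = '\0';
  dst->len = n;
}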
Adhemerval Zanella Netto July 3, 2023, 6:54 p.m. UTC | #19
On 03/07/23 15:40, Noah Goldstein via Libc-alpha wrote:
> On Mon, Jul 3, 2023 at 11:30 AM Paul Eggert <eggert@cs.ucla.edu> wrote:
>>
>> On 2023-06-30 15:21, Sunil Pandey wrote:
>>> Attached is strcpy/wcslcpy microbenchmark data based on Noah
>>> strlcpy/wcslcpy microbenchmark patch.
>>
>> Although it's helpful to know that the proposed patch improves
>> microbenchmark scores, that's not enough to justify it. Let's see
>> benchmarks of real programs. If they don't show significant wins, let's
>> not bother.
>>
>> Programs that use strlcpy, by and large, don't use it in
>> performance-sensitive areas, and their developers and users are far more
>> worried about security than about performance. Making the implementation
>> harder to audit will likely be a net negative for these applications.
>> This doesn't sound like a win at all.
>>
>> Plus, who uses wcslcpy? Why bother to tune it if nobody uses it?
> 
> Think we should look into dropping the optimized strcpy/wcscpy family
> in general? For the most part I don't see them in perf-sensitive areas
> anyway (generally people that care about perf maintain the length
> and use mem* functions).

I will go for it; these interfaces are provided mainly to comply with
standards, and for x86 they only add more maintenance.
Paul Eggert July 3, 2023, 9:14 p.m. UTC | #20
On 2023-07-03 11:40, Noah Goldstein wrote:
> Think we should look into dropping the optimized strcpy/wcscpy family
> in general?

For wcscpy yes. Who uses wcscpy? Optimizing it is a worthless time sink.

strcpy optimization might be worth keeping, as it's used so much more. 
Measurements of real programs would help decide. In the meantime inertia 
suggests that when in doubt, leave it alone.

For strlcpy it's an easy call: don't optimize unless realistic 
benchmarks show it's a win.
Gabriel Ravier July 3, 2023, 10:04 p.m. UTC | #21
On 7/3/23 23:14, Paul Eggert wrote:
> On 2023-07-03 11:40, Noah Goldstein wrote:
>> Think we should look into dropping optimized strcpy/wcscpy family
>> in general?
>
> For wcscpy yes. Who uses wcscpy? Optimizing it is a worthless time sink.
>
> strcpy optimization might be worth keeping, as it's used so much more. 
> Measurements of real programs would help decide. In the meantime 
> inertia suggests that when in doubt, leave it alone.
>
> For strlcpy it's an easy call: don't optimize unless realistic 
> benchmarks show it's a win.

I guess it depends on just how much people use BSD software on Linux,
because if you're looking at the BSDs the amount of usage of strlcpy is
just absurdly massive - OpenBSD's tree has 4997 occurrences of it, while
memcpy is present 13470 times. That still means memcpy is used 3 times
as often, but the idea that strlcpy is so popular as to be used to a
remotely comparable degree is itself kind of astonishing.
Paul Eggert July 3, 2023, 11:12 p.m. UTC | #22
On 2023-07-03 15:04, Gabriel Ravier wrote:
> OpenBSD's tree has 4997 occurrences of it

Many years ago the OpenBSD team went through its source code and 
replaced uses of strcpy with strlcpy, without much thought involved and 
even introducing problems in the process. I expect that not much of this 
code is used elsewhere and it's not that relevant to glibc. Of the 
little OpenBSDish code that is relevant (notably OpenSSH) I expect the 
performance difference to be so small as to not be worth optimizing
glibc. Real-worldish benchmarks could help check this.
Andreas Schwab July 4, 2023, 7:45 a.m. UTC | #23
On Jul 03 2023, Paul Eggert wrote:

> On 2023-07-03 15:04, Gabriel Ravier wrote:
>> OpenBSD's tree has 4997 occurrences of it
>
> Many years ago the OpenBSD team went through its source code and replaced
> uses of strcpy with strlcpy, without much thought involved and even
> introducing problems in the process.

In the Linux kernel sources all uses of strlcpy are being erased,
because the developers have realized how crappy that interface is.
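(For context: the kernel's preferred replacement is strscpy, which
avoids strlcpy's main wart, namely that computing the return value
forces a scan of the entire source even when the copy truncates.  A
rough sketch of strscpy-like semantics, not the kernel's actual
word-at-a-time implementation:)

#include <stddef.h>
#include <sys/types.h>
#include <errno.h>

ssize_t
strscpy_sketch (char *dst, const char *src, size_t size)
{
  for (size_t i = 0; i < size; i++)
    if ((dst[i] = src[i]) == '\0')
      return i;			/* copied length, NUL not counted */
  if (size != 0)
    dst[size - 1] = '\0';	/* still NUL-terminate on truncation */
  return -E2BIG;		/* never reads past size bytes of src */
}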
diff mbox series

Patch

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index e1e894c963..7e3fc081df 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -82,6 +82,8 @@  sysdep_routines += \
   strcpy-sse2 \
   strcpy-sse2-unaligned \
   strcspn-sse4 \
+  strlcpy-avx2 \
+  strlcpy-generic \
   strlen-avx2 \
   strlen-avx2-rtm \
   strlen-evex \
@@ -153,6 +155,8 @@  sysdep_routines += \
   wcscpy-evex \
   wcscpy-generic \
   wcscpy-ssse3 \
+  wcslcpy-avx2 \
+  wcslcpy-generic \
   wcslen-avx2 \
   wcslen-avx2-rtm \
   wcslen-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 5427ff1907..9928dee187 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -751,6 +751,15 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     1,
 				     __strncat_sse2_unaligned))
 
+  /* Support sysdeps/x86_64/multiarch/strlcpy.c.  */
+  IFUNC_IMPL (i, name, strlcpy,
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, strlcpy,
+				     CPU_FEATURE_USABLE (AVX2),
+				     __strlcpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, strlcpy,
+				     1,
+				     __strlcpy_generic))
+
   /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
   IFUNC_IMPL (i, name, strncpy,
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, strncpy,
@@ -917,6 +926,15 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     1,
 				     __wcscpy_generic))
 
+  /* Support sysdeps/x86_64/multiarch/wcslcpy.c.  */
+  IFUNC_IMPL (i, name, wcslcpy,
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcslcpy,
+				     CPU_FEATURE_USABLE (AVX2),
+				     __wcslcpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcslcpy,
+				     1,
+				     __wcslcpy_generic))
+
   /* Support sysdeps/x86_64/multiarch/wcsncpy.c.  */
   IFUNC_IMPL (i, name, wcsncpy,
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy,
diff --git a/sysdeps/x86_64/multiarch/ifunc-strlcpy.h b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h
new file mode 100644
index 0000000000..982a30d15b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h
@@ -0,0 +1,34 @@ 
+/* Common definition for ifunc selections.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features *cpu_features = __get_cpu_features ();
+
+  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+    return OPTIMIZE (avx2);
+
+  return OPTIMIZE (generic);
+}
diff --git a/sysdeps/x86_64/multiarch/strlcpy-avx2.S b/sysdeps/x86_64/multiarch/strlcpy-avx2.S
new file mode 100644
index 0000000000..cf54b1e990
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlcpy-avx2.S
@@ -0,0 +1,446 @@ 
+/* strlcpy/wcslcpy optimized with AVX2.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (3)
+
+# include <sysdep.h>
+
+# ifndef VEC_SIZE
+#  include "x86-avx-vecs.h"
+# endif
+
+# ifndef STRLCPY
+#  define STRLCPY	__strlcpy_avx2
+# endif
+
+
+# ifdef USE_AS_WCSLCPY
+#  define CHAR_SIZE	4
+#  define MOVU		movl
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMINU	vpminud
+# else
+#  define CHAR_SIZE	1
+#  define MOVU		movb
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMINU	vpminub
+# endif
+
+# define PMOVMSK	vpmovmskb
+# define PAGE_SIZE	4096
+# define VEC_SIZE	32
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+	.section SECTION(.text),"ax",@progbits
+/* Aligning the entry point to 64 bytes provides better performance
+   for strings up to one vector in length.  */
+
+ENTRY_P2ALIGN (STRLCPY, 6)
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl    %edx, %edx
+# endif
+
+	/* Zero out the vector register for end-of-string comparison.  */
+	vpxor	%VMM(0), %VMM(0), %VMM(0)
+	/* Save source pointer for return calculation.  */
+	mov	%rsi, %r8
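+	/* Shift the page-offset bits of RSI into the top of EAX: the
+	   first VEC_SIZE load crosses a page iff the offset is greater
+	   than PAGE_SIZE - VEC_SIZE.  */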
+	mov	%esi, %eax
+	sall	$20, %eax
+	cmpl	$((PAGE_SIZE - (VEC_SIZE)) << 20), %eax
+	ja	L(page_cross)
+
+L(page_cross_continue):
+	/* Load first vector.  */
+	VMOVU	(%rsi), %VMM(1)
+	VPCMPEQ %VMM(0), %VMM(1), %VMM(2)
+	PMOVMSK %VMM(2), %eax
+	test	%eax, %eax
+	jnz	L(ret_vec_x1)
+
+	test	%rdx, %rdx
+	jz	L(continue_second_vector)
+
+	/* Check whether we can copy a full vector.  */
+	cmp	$CHAR_PER_VEC, %rdx
+	jbe	L(page_cross_small_vec_copy)
+	/* Copy first vector.  */
+	VMOVU	%VMM(1), (%rdi)
+	sub	$CHAR_PER_VEC, %rdx
+
+L(continue_second_vector):
+	/* Align RSI pointer and adjust RDI based on offset.  */
+	mov	%rsi, %rax
+	and	$-VEC_SIZE, %rsi
+	sub	%rsi, %rax
+	sub	%rax, %rdi
+
+	/* If RDX is 0, all N chars have already been copied, so no
+	   copy-alignment fix is needed.  */
+	test	%rdx, %rdx
+	jz	L(skip_copy_alignment_fix)
+
+	/* Adjust RDX for copy alignment fix.  */
+# ifdef USE_AS_WCSLCPY
+	shr	$2, %rax
+# endif
+	add	%rax, %rdx
+
+L(skip_copy_alignment_fix):
+	/* Load second vector.  */
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(0), %VMM(1), %VMM(2)
+	vptest	%VMM(2), %VMM(2)
+	jnz	L(ret_vec_x2)
+
+	/* Skip copy if RDX is 0.  */
+	test	%rdx, %rdx
+	jz	L(continue_third_vector)
+
+	/* Use below-or-equal (instead of below) here because the last
+	   copied character must be the null terminator.  */
+	cmp	$CHAR_PER_VEC, %rdx
+	jbe	L(partial_copy_second_vector)
+
+	sub	$CHAR_PER_VEC, %rdx
+	/* Copy second vector.  */
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+
+L(continue_third_vector):
+	/* Load third vector.  */
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(0), %VMM(1), %VMM(2)
+	vptest	%VMM(2), %VMM(2)
+	jnz	L(ret_vec_x3)
+
+	/* Skip copy if RDX is 0.  */
+	test	%rdx, %rdx
+	jz	L(continue_fourth_vector)
+
+	cmp	$CHAR_PER_VEC, %rdx
+	jbe	L(partial_copy_third_vector)
+
+	sub	$CHAR_PER_VEC, %rdx
+	/* Copy third vector.  */
+	VMOVU	%VMM(1), (VEC_SIZE * 2)(%rdi)
+
+L(continue_fourth_vector):
+	/* Load fourth vector.  */
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(0), %VMM(1), %VMM(2)
+	vptest	%VMM(2), %VMM(2)
+	jnz	L(ret_vec_x4)
+
+	/* Skip copy if RDX is 0.  */
+	test	%rdx, %rdx
+	jz	L(loop_4x_align)
+
+	cmp	$CHAR_PER_VEC, %rdx
+	jbe	L(partial_copy_fourth_vector)
+
+	sub	$CHAR_PER_VEC, %rdx
+	/* Copy fourth vector.  */
+	VMOVU	%VMM(1), (VEC_SIZE * 3)(%rdi)
+
+
+L(loop_4x_align):
+	/* Jump to the loop if RSI is already 4x-vector aligned.  */
+	test	$(VEC_SIZE * 4 - 1), %esi
+	jz	L(loop_4x_read)
+
+	mov	%rsi, %rcx
+
+	/* Align RSI to 4x vector.  */
+	and	$(VEC_SIZE * -4), %rsi
+	sub	%rsi, %rcx
+
+	/* Adjust RDI for RSI alignment fix.  */
+	sub	%rcx, %rdi
+
+	/* Jump to loop if RDX is 0.  */
+	test    %rdx, %rdx
+	jz	L(loop_4x_read)
+
+# ifdef USE_AS_WCSLCPY
+	shr	$2, %rcx
+# endif
+
+	/* Adjust RDX for RSI alignment fix.  */
+	add	%rcx, %rdx
+	jmp	L(loop_4x_read)
+
+	.p2align 4,,6
+L(loop_4x_vec):
+	/* Skip copy if RDX is 0.  */
+	test	%rdx, %rdx
+	jz	L(loop_partial_copy_return)
+	cmp	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(loop_partial_copy)
+	VMOVU	%VMM(1), (VEC_SIZE * 4)(%rdi)
+	VMOVU	%VMM(2), (VEC_SIZE * 5)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 6)(%rdi)
+	VMOVU	%VMM(4), (VEC_SIZE * 7)(%rdi)
+	sub	$(CHAR_PER_VEC * 4), %rdx
+
+L(loop_partial_copy_return):
+	sub	$(VEC_SIZE * -4), %rsi
+	sub	$(VEC_SIZE * -4), %rdi
+
+L(loop_4x_read):
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rsi), %VMM(3)
+	VMOVA	(VEC_SIZE * 7)(%rsi), %VMM(4)
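+	/* Take the unsigned minimum of all four vectors: VMM(7) has a
+	   zero char iff at least one source vector does, so a single
+	   compare detects a null terminator in any of them.  */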
+	VPMINU	%VMM(1), %VMM(2), %VMM(5)
+	VPMINU	%VMM(3), %VMM(4), %VMM(6)
+	VPMINU	%VMM(5), %VMM(6), %VMM(7)
+	VPCMPEQ	%VMM(0), %VMM(7), %VMM(7)
+	vptest	%VMM(7), %VMM(7)
+
+	jz	L(loop_4x_vec)
+
+	/* Check whether the null terminator is in the first two
+	   vectors or the last two.  */
+	lea	(VEC_SIZE * 4)(%rsi), %rax
+	sub	%r8, %rax
+# ifdef USE_AS_WCSLCPY
+	shr	$2, %rax
+# endif
+	xor	%r10, %r10
+	VPCMPEQ	%VMM(0), %VMM(5), %VMM(6)
+	vptest	%VMM(6), %VMM(6)
+	jnz	L(endloop)
+	sub	$(CHAR_PER_VEC * -2), %rax
+	mov	$(CHAR_PER_VEC * 2), %r10
+	VMOVA	%VMM(3), %VMM(1)
+	VMOVA	%VMM(4), %VMM(2)
+
+L(endloop):
+	VPCMPEQ	%VMM(0), %VMM(1), %VMM(1)
+	VPCMPEQ	%VMM(0), %VMM(2), %VMM(2)
+	PMOVMSK %VMM(1), %rcx
+	PMOVMSK %VMM(2), %r9
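+	/* Merge both 32-bit masks into one 64-bit mask so that BSF
+	   finds the index of the first null across both vectors.  */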
+	shlq	$32, %r9
+	orq	%r9, %rcx
+	bsf	%rcx, %rcx
+	/* Shift RCX right by 2 to convert the byte index into a char
+	   index; vpmovmskb has no wide-char version.  */
+# ifdef USE_AS_WCSLCPY
+	shr	$2, %rcx
+# endif
+	/* At this point RAX holds the length to return.  */
+	add	%rcx, %rax
+	test	%rdx, %rdx
+	jz	L(ret)
+
+	/* Add 1 to account for the null terminator in the RDX
+	   comparison.  */
+	lea	1(%r10, %rcx), %rcx
+	cmp	%rdx, %rcx
+	cmovb	%rcx, %rdx
+
+L(loop_partial_copy):
+	cmp	$(CHAR_PER_VEC * 2), %rdx
+	jbe	L(loop_partial_first_half)
+	/* Reload the first 2 vectors.  */
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), (VEC_SIZE * 4)(%rdi)
+	VMOVU	%VMM(2), (VEC_SIZE * 5)(%rdi)
+
+L(loop_partial_first_half):
+	/* Go back 2 vectors from the end and use overlapping copies:
+	   (VEC_SIZE * 4 - VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE)
+	   (VEC_SIZE * 4 - VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE).  */
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(3)
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
+	VMOVU	%VMM(4), (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE)
+	MOVU	$0, (VEC_SIZE * 4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	xor	%rdx, %rdx
+	vptest	%VMM(7), %VMM(7)
+	jz	L(loop_partial_copy_return)
+	ret
+
+	.p2align 4
+L(page_cross):
+	mov	%rsi, %rcx
+	mov	%rsi, %r11
+	and	$-VEC_SIZE, %r11
+	and	$(VEC_SIZE - 1), %rcx
+	VMOVA	(%r11), %VMM(1)
+	VPCMPEQ	%VMM(0), %VMM(1), %VMM(2)
+	PMOVMSK %VMM(2), %eax
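+	/* CL holds the misalignment of RSI; shift out the mask bits
+	   that correspond to bytes before the start of the string.  */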
+	shr	%cl, %eax
+	jz	L(page_cross_continue)
+
+L(ret_vec_x1):
+	bsf	%eax, %eax
+# ifdef USE_AS_WCSLCPY
+	shr	$2, %eax
+# endif
+	/* Increment by 1 to account for the null terminator.  */
+	lea	1(%eax), %ecx
+	cmp	%rdx, %rcx
+	cmovb	%rcx, %rdx
+	test	%rdx, %rdx
+	jz	L(ret)
+
+L(page_cross_small_vec_copy):
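+	/* RDX is at most CHAR_PER_VEC here.  Copy 17..32 bytes with
+	   two overlapping 16-byte moves; smaller sizes fall through to
+	   the scalar copies below.  */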
+	cmp	$(16 / CHAR_SIZE), %rdx
+	jbe	L(copy_8_byte_scalar)
+	VMOVU	(%rsi), %VMM_128(1)
+	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %VMM_128(3)
+	VMOVU	%VMM_128(1), (%rdi)
+	VMOVU	%VMM_128(3), -16(%rdi, %rdx, CHAR_SIZE)
+	MOVU	$0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
+	xor	%rdx, %rdx
+	vptest	%VMM(2), %VMM(2)
+	jz	L(continue_second_vector)
+	ret
+
+L(copy_8_byte_scalar):
+	cmp	$(8 / CHAR_SIZE), %rdx
+	jbe	L(copy_4_byte_scalar)
+	movq	(%rsi), %r10
+	movq	-8(%rsi, %rdx, CHAR_SIZE), %r11
+	movq	%r10, (%rdi)
+	movq	%r11, -8(%rdi, %rdx, CHAR_SIZE)
+	MOVU	$0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
+	xor	%edx, %edx
+	vptest	%VMM(2), %VMM(2)
+	jz	L(continue_second_vector)
+	ret
+
+L(copy_4_byte_scalar):
+# ifndef USE_AS_WCSLCPY
+	cmp	$4, %rdx
+	jbe	L(copy_2_byte_scalar)
+# endif
+	movl	(%rsi), %r10d
+	movl	-4(%rsi, %rdx, CHAR_SIZE), %r11d
+	movl	%r10d, (%rdi)
+	movl	%r11d, -4(%rdi, %rdx, CHAR_SIZE)
+	MOVU	$0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
+	xor	%edx, %edx
+	vptest	%VMM(2), %VMM(2)
+	jz	L(continue_second_vector)
+	ret
+
+# ifndef USE_AS_WCSLCPY
+L(copy_2_byte_scalar):
+	cmp	$2, %rdx
+	jbe	L(copy_1_byte_scalar)
+	movw	(%rsi), %r10w
+	movw	-(CHAR_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %r11w
+	movw	%r10w, (%rdi)
+	movw	%r11w, -(CHAR_SIZE * 3)(%rdi, %rdx, CHAR_SIZE)
+	MOVU	$0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
+	xor	%edx, %edx
+	vptest	%VMM(2), %VMM(2)
+	jz	L(continue_second_vector)
+	ret
+
+L(copy_1_byte_scalar):
+	MOVU	(%rsi), %r10b
+	MOVU	%r10b, (%rdi)
+	MOVU	$0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
+	xor	%edx, %edx
+	vptest	%VMM(2), %VMM(2)
+	jz	L(continue_second_vector)
+	ret
+# endif
+
+L(ret_vec_x2):
+	PMOVMSK %VMM(2), %rax
+	bsf	%rax, %rcx
+	/* Calculate return value.  */
+	lea	VEC_SIZE(%rsi, %rcx), %rax
+	sub	%r8, %rax
+# ifdef USE_AS_WCSLCPY
+	shr	$2, %rax
+	shr	$2, %rcx
+# endif
+	inc	%rcx
+	test	%rdx, %rdx
+	jz	L(ret)
+	cmp	%rdx, %rcx
+	cmovb	%rcx, %rdx
+
+L(partial_copy_second_vector):
+	VMOVU	(%rsi, %rdx, CHAR_SIZE), %VMM(1)
+	VMOVU	%VMM(1), (%rdi, %rdx, CHAR_SIZE)
+	MOVU	$0, (VEC_SIZE - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
+	xor	%edx, %edx
+	vptest	%VMM(2), %VMM(2)
+	jz	L(continue_third_vector)
+
+L(ret):
+	ret
+
+L(ret_vec_x3):
+	PMOVMSK %VMM(2), %rax
+	bsf	%rax, %rcx
+	/* Calculate return value.  */
+	lea	(VEC_SIZE * 2)(%rsi, %rcx), %rax
+	sub	%r8, %rax
+# ifdef USE_AS_WCSLCPY
+	shr	$2, %rax
+	shr	$2, %rcx
+# endif
+	inc	%rcx
+	test	%rdx, %rdx
+	jz	L(ret)
+	cmp	%rdx, %rcx
+	cmovb	%rcx, %rdx
+
+L(partial_copy_third_vector):
+	VMOVU	(VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
+	VMOVU	%VMM(1), (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	MOVU	$0, ((VEC_SIZE * 2) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
+	xor	%edx, %edx
+	vptest	%VMM(2), %VMM(2)
+	jz	L(continue_fourth_vector)
+	ret
+
+L(ret_vec_x4):
+	PMOVMSK %VMM(2), %rax
+	bsf	%rax, %rcx
+	/* Calculate return value.  */
+	lea	(VEC_SIZE * 3)(%rsi, %rcx), %rax
+	sub	%r8, %rax
+# ifdef USE_AS_WCSLCPY
+	shr	$2, %rax
+	shr	$2, %rcx
+# endif
+	inc	%rcx
+	test	%rdx, %rdx
+	jz	L(ret)
+	cmp	%rdx, %rcx
+	cmovb	%rcx, %rdx
+
+L(partial_copy_fourth_vector):
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
+	VMOVU	%VMM(1), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
+	MOVU	$0, ((VEC_SIZE * 3) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
+	xor	%edx, %edx
+	vptest	%VMM(2), %VMM(2)
+	jz	L(continue_fourth_vector)
+	ret
+
+END (STRLCPY)
+#endif
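
For reference, the contract the assembly above must preserve can be
written as a short C model (ref_strlcpy is an illustrative name; the
actual generic implementation lives in string/strlcpy.c): copy at most
size - 1 chars, always null-terminate when size is nonzero, and return
the full source length so callers can detect truncation.

  #include <string.h>

  size_t
  ref_strlcpy (char *dst, const char *src, size_t size)
  {
    size_t len = strlen (src);	/* Return value: full source length.  */
    if (size != 0)
      {
        /* Copy at most size - 1 chars and always null-terminate.  */
        size_t copy = len < size ? len : size - 1;
        memcpy (dst, src, copy);
        dst[copy] = '\0';
      }
    return len;
  }
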
diff --git a/sysdeps/x86_64/multiarch/strlcpy-generic.c b/sysdeps/x86_64/multiarch/strlcpy-generic.c
new file mode 100644
index 0000000000..eee3b7b086
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlcpy-generic.c
@@ -0,0 +1,25 @@ 
+/* strlcpy generic.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (1)
+# define __strlcpy  __strlcpy_generic
+# include <string/strlcpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/strlcpy.c b/sysdeps/x86_64/multiarch/strlcpy.c
new file mode 100644
index 0000000000..ded41fbcfb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlcpy.c
@@ -0,0 +1,36 @@ 
+/* Multiple versions of strlcpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __strlcpy __redirect_strlcpy
+# include <string.h>
+# undef __strlcpy
+
+# define SYMBOL_NAME strlcpy
+# include "ifunc-strlcpy.h"
+
+libc_ifunc_redirected (__redirect_strlcpy, __strlcpy, IFUNC_SELECTOR ());
+weak_alias (__strlcpy, strlcpy)
+
+# ifdef SHARED
+__hidden_ver1 (__strlcpy, __GI___strlcpy, __redirect_strlcpy)
+  __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlcpy);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcslcpy-avx2.S b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S
new file mode 100644
index 0000000000..dafc20ded0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S
@@ -0,0 +1,4 @@ 
+#define STRLCPY	__wcslcpy_avx2
+#define USE_AS_WCSLCPY 1
+
+#include "strlcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcslcpy-generic.c b/sysdeps/x86_64/multiarch/wcslcpy-generic.c
new file mode 100644
index 0000000000..ffd3c0e846
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslcpy-generic.c
@@ -0,0 +1,25 @@ 
+/* wcslcpy generic.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (1)
+# define __wcslcpy  __wcslcpy_generic
+# include <wcsmbs/wcslcpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcslcpy.c b/sysdeps/x86_64/multiarch/wcslcpy.c
new file mode 100644
index 0000000000..371ef9626c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslcpy.c
@@ -0,0 +1,35 @@ 
+/* Multiple versions of wcslcpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __wcslcpy __redirect_wcslcpy
+# include <wchar.h>
+# undef __wcslcpy
+
+# define SYMBOL_NAME wcslcpy
+# include "ifunc-strlcpy.h"
+
+libc_ifunc_redirected (__redirect_wcslcpy, __wcslcpy, IFUNC_SELECTOR ());
+weak_alias (__wcslcpy, wcslcpy)
+# ifdef SHARED
+__hidden_ver1 (__wcslcpy, __GI___wcslcpy, __redirect_wcslcpy)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcslcpy);
+# endif
+#endif
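
As a usage note, the shared return-value convention makes truncation
detection a single comparison.  A minimal caller sketch, assuming a
libc that declares strlcpy in <string.h> (as glibc 2.38 and later do):

  #include <stdio.h>
  #include <string.h>

  int
  main (void)
  {
    char buf[8];
    size_t n = strlcpy (buf, "hello, world", sizeof buf);
    if (n >= sizeof buf)
      printf ("truncated: needed %zu bytes\n", n + 1);
    return 0;
  }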