diff mbox series

x86_64: Remove bzero optimization

Message ID 20220513124034.1694981-1-adhemerval.zanella@linaro.org
State New
Headers show
Series x86_64: Remove bzero optimization | expand

Commit Message

Adhemerval Zanella Netto May 13, 2022, 12:40 p.m. UTC
Both symbols are marked as legacy in POSIX.1-2001 and removed on
POSIX.1-2008, although the prototypes are defined for _GNU_SOURCE
or _DEFAULT_SOURCE.

GCC also replaces bcopy with a memmove and bzero with a memset in its default
configuration (to actually get a bzero libc call, the code needs to omit
the string.h inclusion and be built with -fno-builtin), so it is
highly unlikely any program actually calls the libc bzero symbol.

On a recent Linux distro (Ubuntu 22.04), there are no bzero calls
in the installed binaries.

  $ cat count_bstring.sh
  #!/bin/bash

  files=`IFS=':';for i in $PATH; do test -d "$i" && find "$i" -maxdepth 1 -executable -type f; done`
  total=0
  for file in $files; do
    symbols=`objdump -R $file 2>&1`
    if [ $? -eq 0 ]; then
      ncalls=`echo $symbols | grep -w $1 | wc -l`
      ((total=total+ncalls))
      if [ $ncalls -gt 0 ]; then
        echo "$file: $ncalls"
      fi
    fi
  done
  echo "TOTAL=$total"
  $ ./count_bstring.sh bzero
  TOTAL=0

Checked on x86_64-linux-gnu.
---
 sysdeps/x86_64/bzero.S                        |   1 -
 sysdeps/x86_64/memset.S                       |  10 +-
 sysdeps/x86_64/multiarch/Makefile             |   1 -
 sysdeps/x86_64/multiarch/bzero.c              | 106 ------------------
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  42 -------
 .../memset-avx2-unaligned-erms-rtm.S          |   1 -
 .../multiarch/memset-avx2-unaligned-erms.S    |   6 -
 .../multiarch/memset-avx512-unaligned-erms.S  |   3 -
 .../multiarch/memset-evex-unaligned-erms.S    |   3 -
 .../multiarch/memset-sse2-unaligned-erms.S    |   1 -
 .../multiarch/memset-vec-unaligned-erms.S     |  56 +--------
 11 files changed, 2 insertions(+), 228 deletions(-)
 delete mode 100644 sysdeps/x86_64/bzero.S
 delete mode 100644 sysdeps/x86_64/multiarch/bzero.c

Comments

Noah Goldstein May 13, 2022, 2:54 p.m. UTC | #1
On Fri, May 13, 2022 at 7:42 AM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Both symbols are marked as legacy in POSIX.1-2001 and removed on
> POSIX.1-2008, although the prototypes are defined for _GNU_SOURCE
> or _DEFAULT_SOURCE.
>
> GCC also replaces bcopy with a memmove and bzero with memset on default
> configuration (to actually get a bzero libc call the code requires
> to omit string.h inclusion and built with --fno-builtin), so it is
> highly unlikely programs are actually calling libc bzero symbol.
>
> On a recent Linux distro (Ubuntu 22.04), there is no bzero calls
> by the installed binaries.
>
>   $ cat count_bstring.sh
>   #!/bin/bash
>
>   files=`IFS=':';for i in $PATH; do test -d "$i" && find "$i" -maxdepth 1 -executable -type f; done`
>   total=0
>   for file in $files; do
>     symbols=`objdump -R $file 2>&1`
>     if [ $? -eq 0 ]; then
>       ncalls=`echo $symbols | grep -w $1 | wc -l`
>       ((total=total+ncalls))
>       if [ $ncalls -gt 0 ]; then
>         echo "$file: $ncalls"
>       fi
>     fi
>   done
>   echo "TOTAL=$total"
>   $ ./count_bstring.sh bzero
>   TOTAL=0
>
> Checked on x86_64-linux-gnu.
> ---
>  sysdeps/x86_64/bzero.S                        |   1 -
>  sysdeps/x86_64/memset.S                       |  10 +-
>  sysdeps/x86_64/multiarch/Makefile             |   1 -
>  sysdeps/x86_64/multiarch/bzero.c              | 106 ------------------
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  42 -------
>  .../memset-avx2-unaligned-erms-rtm.S          |   1 -
>  .../multiarch/memset-avx2-unaligned-erms.S    |   6 -
>  .../multiarch/memset-avx512-unaligned-erms.S  |   3 -
>  .../multiarch/memset-evex-unaligned-erms.S    |   3 -
>  .../multiarch/memset-sse2-unaligned-erms.S    |   1 -
>  .../multiarch/memset-vec-unaligned-erms.S     |  56 +--------
>  11 files changed, 2 insertions(+), 228 deletions(-)
>  delete mode 100644 sysdeps/x86_64/bzero.S
>  delete mode 100644 sysdeps/x86_64/multiarch/bzero.c
>
> diff --git a/sysdeps/x86_64/bzero.S b/sysdeps/x86_64/bzero.S
> deleted file mode 100644
> index f96d567fd8..0000000000
> --- a/sysdeps/x86_64/bzero.S
> +++ /dev/null
> @@ -1 +0,0 @@
> -/* Implemented in memset.S.  */
> diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> index af26e9cedc..a6eea61a4d 100644
> --- a/sysdeps/x86_64/memset.S
> +++ b/sysdeps/x86_64/memset.S
> @@ -1,4 +1,4 @@
> -/* memset/bzero -- set memory area to CH/0
> +/* memset -- set memory area to CH/0
>     Optimized version for x86-64.
>     Copyright (C) 2002-2022 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
> @@ -35,9 +35,6 @@
>    punpcklwd %xmm0, %xmm0; \
>    pshufd $0, %xmm0, %xmm0
>
> -# define BZERO_ZERO_VEC0() \
> -  pxor %xmm0, %xmm0
> -
>  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>    movd d, %xmm0; \
>    pshufd $0, %xmm0, %xmm0; \
> @@ -56,10 +53,6 @@
>  # define MEMSET_SYMBOL(p,s)    memset
>  #endif
>
> -#ifndef BZERO_SYMBOL
> -# define BZERO_SYMBOL(p,s)     __bzero
> -#endif
> -
>  #ifndef WMEMSET_SYMBOL
>  # define WMEMSET_CHK_SYMBOL(p,s) p
>  # define WMEMSET_SYMBOL(p,s)   __wmemset
> @@ -70,7 +63,6 @@
>  libc_hidden_builtin_def (memset)
>
>  #if IS_IN (libc)
> -weak_alias (__bzero, bzero)
>  libc_hidden_def (__wmemset)
>  weak_alias (__wmemset, wmemset)
>  libc_hidden_weak (wmemset)
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 0400ea332b..f3ab5e0928 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -1,7 +1,6 @@
>  ifeq ($(subdir),string)
>
>  sysdep_routines += \
> -  bzero \
>    memchr-avx2 \
>    memchr-avx2-rtm \
>    memchr-evex \
> diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
> deleted file mode 100644
> index 58a14b2c33..0000000000
> --- a/sysdeps/x86_64/multiarch/bzero.c
> +++ /dev/null
> @@ -1,106 +0,0 @@
> -/* Multiple versions of bzero.
> -   All versions must be listed in ifunc-impl-list.c.
> -   Copyright (C) 2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -/* Define multiple versions only for the definition in libc.  */
> -#if IS_IN (libc)
> -# define __bzero __redirect___bzero
> -# include <string.h>
> -# undef __bzero
> -
> -# define SYMBOL_NAME __bzero
> -# include <init-arch.h>
> -
> -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned)
> -  attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms)
> -  attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms)
> -  attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm)
> -  attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm)
> -  attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned)
> -  attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms)
> -  attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned)
> -  attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms)
> -  attribute_hidden;
> -
> -static inline void *
> -IFUNC_SELECTOR (void)
> -{
> -  const struct cpu_features* cpu_features = __get_cpu_features ();
> -
> -  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
> -      && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
> -    {
> -      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> -          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> -          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> -       {
> -         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> -           return OPTIMIZE1 (avx512_unaligned_erms);
> -
> -         return OPTIMIZE1 (avx512_unaligned);
> -       }
> -    }
> -
> -  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
> -    {
> -      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> -          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> -          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> -       {
> -         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> -           return OPTIMIZE1 (evex_unaligned_erms);
> -
> -         return OPTIMIZE1 (evex_unaligned);
> -       }
> -
> -      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> -       {
> -         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> -           return OPTIMIZE1 (avx2_unaligned_erms_rtm);
> -
> -         return OPTIMIZE1 (avx2_unaligned_rtm);
> -       }
> -
> -      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> -       {
> -         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> -           return OPTIMIZE1 (avx2_unaligned_erms);
> -
> -         return OPTIMIZE1 (avx2_unaligned);
> -       }
> -    }
> -
> -  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> -    return OPTIMIZE1 (sse2_unaligned_erms);
> -
> -  return OPTIMIZE1 (sse2_unaligned);
> -}
> -
> -libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ());
> -
> -weak_alias (__bzero, bzero)
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index a8afcf81bb..7218095430 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -291,48 +291,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                               __memset_avx512_no_vzeroupper)
>              )
>
> -  /* Support sysdeps/x86_64/multiarch/bzero.c.  */
> -  IFUNC_IMPL (i, name, bzero,
> -             IFUNC_IMPL_ADD (array, i, bzero, 1,
> -                             __bzero_sse2_unaligned)
> -             IFUNC_IMPL_ADD (array, i, bzero, 1,
> -                             __bzero_sse2_unaligned_erms)
> -             IFUNC_IMPL_ADD (array, i, bzero,
> -                             CPU_FEATURE_USABLE (AVX2),
> -                             __bzero_avx2_unaligned)
> -             IFUNC_IMPL_ADD (array, i, bzero,
> -                             CPU_FEATURE_USABLE (AVX2),
> -                             __bzero_avx2_unaligned_erms)
> -             IFUNC_IMPL_ADD (array, i, bzero,
> -                             (CPU_FEATURE_USABLE (AVX2)
> -                              && CPU_FEATURE_USABLE (RTM)),
> -                             __bzero_avx2_unaligned_rtm)
> -             IFUNC_IMPL_ADD (array, i, bzero,
> -                             (CPU_FEATURE_USABLE (AVX2)
> -                              && CPU_FEATURE_USABLE (RTM)),
> -                             __bzero_avx2_unaligned_erms_rtm)
> -             IFUNC_IMPL_ADD (array, i, bzero,
> -                             (CPU_FEATURE_USABLE (AVX512VL)
> -                              && CPU_FEATURE_USABLE (AVX512BW)
> -                              && CPU_FEATURE_USABLE (BMI2)),
> -                             __bzero_evex_unaligned)
> -             IFUNC_IMPL_ADD (array, i, bzero,
> -                             (CPU_FEATURE_USABLE (AVX512VL)
> -                              && CPU_FEATURE_USABLE (AVX512BW)
> -                              && CPU_FEATURE_USABLE (BMI2)),
> -                             __bzero_evex_unaligned_erms)
> -             IFUNC_IMPL_ADD (array, i, bzero,
> -                             (CPU_FEATURE_USABLE (AVX512VL)
> -                              && CPU_FEATURE_USABLE (AVX512BW)
> -                              && CPU_FEATURE_USABLE (BMI2)),
> -                             __bzero_avx512_unaligned_erms)
> -             IFUNC_IMPL_ADD (array, i, bzero,
> -                             (CPU_FEATURE_USABLE (AVX512VL)
> -                              && CPU_FEATURE_USABLE (AVX512BW)
> -                              && CPU_FEATURE_USABLE (BMI2)),
> -                             __bzero_avx512_unaligned)
> -            )
> -
>    /* Support sysdeps/x86_64/multiarch/rawmemchr.c.  */
>    IFUNC_IMPL (i, name, rawmemchr,
>               IFUNC_IMPL_ADD (array, i, rawmemchr,
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> index 5a5ee6f672..8ac3e479bb 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> @@ -5,7 +5,6 @@
>
>  #define SECTION(p) p##.avx.rtm
>  #define MEMSET_SYMBOL(p,s)     p##_avx2_##s##_rtm
> -#define BZERO_SYMBOL(p,s)      p##_avx2_##s##_rtm
>  #define WMEMSET_SYMBOL(p,s)    p##_avx2_##s##_rtm
>
>  #include "memset-avx2-unaligned-erms.S"
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> index a093a2831f..c0bf2875d0 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> @@ -14,9 +14,6 @@
>    vmovd d, %xmm0; \
>    movq r, %rax;
>
> -# define BZERO_ZERO_VEC0() \
> -  vpxor %xmm0, %xmm0, %xmm0
> -
>  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>    MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
>
> @@ -32,9 +29,6 @@
>  # ifndef MEMSET_SYMBOL
>  #  define MEMSET_SYMBOL(p,s)   p##_avx2_##s
>  # endif
> -# ifndef BZERO_SYMBOL
> -#  define BZERO_SYMBOL(p,s)    p##_avx2_##s
> -# endif
>  # ifndef WMEMSET_SYMBOL
>  #  define WMEMSET_SYMBOL(p,s)  p##_avx2_##s
>  # endif
> diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> index 727c92133a..5241216a77 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> @@ -19,9 +19,6 @@
>    vpbroadcastb d, %VEC0; \
>    movq r, %rax
>
> -# define BZERO_ZERO_VEC0() \
> -  vpxorq %XMM0, %XMM0, %XMM0
> -
>  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>    vpbroadcastd d, %VEC0; \
>    movq r, %rax
> diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> index 5d8fa78f05..6370021506 100644
> --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> @@ -19,9 +19,6 @@
>    vpbroadcastb d, %VEC0; \
>    movq r, %rax
>
> -# define BZERO_ZERO_VEC0() \
> -  vpxorq %XMM0, %XMM0, %XMM0
> -
>  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>    vpbroadcastd d, %VEC0; \
>    movq r, %rax
> diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> index d52d170804..3d92f6993a 100644
> --- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> @@ -22,7 +22,6 @@
>
>  #if IS_IN (libc)
>  # define MEMSET_SYMBOL(p,s)    p##_sse2_##s
> -# define BZERO_SYMBOL(p,s)     MEMSET_SYMBOL (p, s)
>  # define WMEMSET_SYMBOL(p,s)   p##_sse2_##s
>
>  # ifdef SHARED
> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> index 785fee1d57..aa78fbb620 100644
> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> @@ -1,4 +1,4 @@
> -/* memset/bzero with unaligned store and rep stosb
> +/* memset with unaligned store and rep stosb
>     Copyright (C) 2016-2022 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>
> @@ -26,10 +26,6 @@
>
>  #include <sysdep.h>
>
> -#ifndef BZERO_SYMBOL
> -# define BZERO_SYMBOL(p,s)             MEMSET_SYMBOL (p, s)
> -#endif
> -
>  #ifndef MEMSET_CHK_SYMBOL
>  # define MEMSET_CHK_SYMBOL(p,s)                MEMSET_SYMBOL(p, s)
>  #endif
> @@ -134,31 +130,6 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
>  END (WMEMSET_SYMBOL (__wmemset, unaligned))
>  #endif
>
> -ENTRY (BZERO_SYMBOL(__bzero, unaligned))
> -#if VEC_SIZE > 16
> -       BZERO_ZERO_VEC0 ()
> -#endif
> -       mov     %RDI_LP, %RAX_LP
> -       mov     %RSI_LP, %RDX_LP
> -#ifndef USE_LESS_VEC_MASK_STORE
> -       xorl    %esi, %esi
> -#endif
> -       cmp     $VEC_SIZE, %RDX_LP
> -       jb      L(less_vec_no_vdup)
> -#ifdef USE_LESS_VEC_MASK_STORE
> -       xorl    %esi, %esi
> -#endif
> -#if VEC_SIZE <= 16
> -       BZERO_ZERO_VEC0 ()
> -#endif
> -       cmp     $(VEC_SIZE * 2), %RDX_LP
> -       ja      L(more_2x_vec)
> -       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> -       VMOVU   %VEC(0), (%rdi)
> -       VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> -       VZEROUPPER_RETURN
> -END (BZERO_SYMBOL(__bzero, unaligned))
> -
>  #if defined SHARED && IS_IN (libc)
>  ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
>         cmp     %RDX_LP, %RCX_LP
> @@ -216,31 +187,6 @@ END (__memset_erms)
>  END (MEMSET_SYMBOL (__memset, erms))
>  # endif
>
> -ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6)
> -# if VEC_SIZE > 16
> -       BZERO_ZERO_VEC0 ()
> -# endif
> -       mov     %RDI_LP, %RAX_LP
> -       mov     %RSI_LP, %RDX_LP
> -# ifndef USE_LESS_VEC_MASK_STORE
> -       xorl    %esi, %esi
> -# endif
> -       cmp     $VEC_SIZE, %RDX_LP
> -       jb      L(less_vec_no_vdup)

This label 'L(less_vec_no_vdup)' is no longer used. Can you
remove it from the rest of memset-vec-unaligned-erms.S?
> -# ifdef USE_LESS_VEC_MASK_STORE
> -       xorl    %esi, %esi
> -# endif
> -# if VEC_SIZE <= 16
> -       BZERO_ZERO_VEC0 ()
> -# endif
> -       cmp     $(VEC_SIZE * 2), %RDX_LP
> -       ja      L(stosb_more_2x_vec)
> -       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> -       VMOVU   %VEC(0), (%rdi)
> -       VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> -       VZEROUPPER_RETURN
> -END (BZERO_SYMBOL(__bzero, unaligned_erms))
> -
>  # if defined SHARED && IS_IN (libc)
>  ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
>         cmp     %RDX_LP, %RCX_LP
> --
> 2.34.1
>

I'm in favor of this.

Should we keep the old method of just swapping params that was removed in:

commit 3d9f171bfb5325bd5f427e9fc386453358c6e840
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Mon Feb 7 05:55:15 2022 -0800

    x86-64: Optimize bzero

```
- .section SECTION(.text),"ax",@progbits
-#if VEC_SIZE == 16 && IS_IN (libc)
-ENTRY (__bzero)
- mov %RDI_LP, %RAX_LP /* Set return value.  */
- mov %RSI_LP, %RDX_LP /* Set n.  */
- xorl %esi, %esi
- pxor %XMM0, %XMM0
- jmp L(entry_from_bzero)
-END (__bzero)
-weak_alias (__bzero, bzero)
```
?

Old software that hasn't been recompiled will have a bit of a regression
going through the generic impl and it only costs 16 bytes of code.
H.J. Lu May 13, 2022, 7:50 p.m. UTC | #2
On Fri, May 13, 2022 at 7:55 AM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Fri, May 13, 2022 at 7:42 AM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > Both symbols are marked as legacy in POSIX.1-2001 and removed on
> > POSIX.1-2008, although the prototypes are defined for _GNU_SOURCE
> > or _DEFAULT_SOURCE.
> >
> > GCC also replaces bcopy with a memmove and bzero with memset on default
> > configuration (to actually get a bzero libc call the code requires
> > to omit string.h inclusion and built with --fno-builtin), so it is
> > highly unlikely programs are actually calling libc bzero symbol.
> >
> > On a recent Linux distro (Ubuntu 22.04), there is no bzero calls
> > by the installed binaries.
> >
> >   $ cat count_bstring.sh
> >   #!/bin/bash
> >
> >   files=`IFS=':';for i in $PATH; do test -d "$i" && find "$i" -maxdepth 1 -executable -type f; done`
> >   total=0
> >   for file in $files; do
> >     symbols=`objdump -R $file 2>&1`
> >     if [ $? -eq 0 ]; then
> >       ncalls=`echo $symbols | grep -w $1 | wc -l`
> >       ((total=total+ncalls))
> >       if [ $ncalls -gt 0 ]; then
> >         echo "$file: $ncalls"
> >       fi
> >     fi
> >   done
> >   echo "TOTAL=$total"
> >   $ ./count_bstring.sh bzero
> >   TOTAL=0
> >
> > Checked on x86_64-linux-gnu.
> > ---
> >  sysdeps/x86_64/bzero.S                        |   1 -
> >  sysdeps/x86_64/memset.S                       |  10 +-
> >  sysdeps/x86_64/multiarch/Makefile             |   1 -
> >  sysdeps/x86_64/multiarch/bzero.c              | 106 ------------------
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  42 -------
> >  .../memset-avx2-unaligned-erms-rtm.S          |   1 -
> >  .../multiarch/memset-avx2-unaligned-erms.S    |   6 -
> >  .../multiarch/memset-avx512-unaligned-erms.S  |   3 -
> >  .../multiarch/memset-evex-unaligned-erms.S    |   3 -
> >  .../multiarch/memset-sse2-unaligned-erms.S    |   1 -
> >  .../multiarch/memset-vec-unaligned-erms.S     |  56 +--------
> >  11 files changed, 2 insertions(+), 228 deletions(-)
> >  delete mode 100644 sysdeps/x86_64/bzero.S
> >  delete mode 100644 sysdeps/x86_64/multiarch/bzero.c
> >
> > diff --git a/sysdeps/x86_64/bzero.S b/sysdeps/x86_64/bzero.S
> > deleted file mode 100644
> > index f96d567fd8..0000000000
> > --- a/sysdeps/x86_64/bzero.S
> > +++ /dev/null
> > @@ -1 +0,0 @@
> > -/* Implemented in memset.S.  */
> > diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> > index af26e9cedc..a6eea61a4d 100644
> > --- a/sysdeps/x86_64/memset.S
> > +++ b/sysdeps/x86_64/memset.S
> > @@ -1,4 +1,4 @@
> > -/* memset/bzero -- set memory area to CH/0
> > +/* memset -- set memory area to CH/0
> >     Optimized version for x86-64.
> >     Copyright (C) 2002-2022 Free Software Foundation, Inc.
> >     This file is part of the GNU C Library.
> > @@ -35,9 +35,6 @@
> >    punpcklwd %xmm0, %xmm0; \
> >    pshufd $0, %xmm0, %xmm0
> >
> > -# define BZERO_ZERO_VEC0() \
> > -  pxor %xmm0, %xmm0
> > -
> >  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> >    movd d, %xmm0; \
> >    pshufd $0, %xmm0, %xmm0; \
> > @@ -56,10 +53,6 @@
> >  # define MEMSET_SYMBOL(p,s)    memset
> >  #endif
> >
> > -#ifndef BZERO_SYMBOL
> > -# define BZERO_SYMBOL(p,s)     __bzero
> > -#endif
> > -
> >  #ifndef WMEMSET_SYMBOL
> >  # define WMEMSET_CHK_SYMBOL(p,s) p
> >  # define WMEMSET_SYMBOL(p,s)   __wmemset
> > @@ -70,7 +63,6 @@
> >  libc_hidden_builtin_def (memset)
> >
> >  #if IS_IN (libc)
> > -weak_alias (__bzero, bzero)
> >  libc_hidden_def (__wmemset)
> >  weak_alias (__wmemset, wmemset)
> >  libc_hidden_weak (wmemset)
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index 0400ea332b..f3ab5e0928 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -1,7 +1,6 @@
> >  ifeq ($(subdir),string)
> >
> >  sysdep_routines += \
> > -  bzero \
> >    memchr-avx2 \
> >    memchr-avx2-rtm \
> >    memchr-evex \
> > diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
> > deleted file mode 100644
> > index 58a14b2c33..0000000000
> > --- a/sysdeps/x86_64/multiarch/bzero.c
> > +++ /dev/null
> > @@ -1,106 +0,0 @@
> > -/* Multiple versions of bzero.
> > -   All versions must be listed in ifunc-impl-list.c.
> > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <https://www.gnu.org/licenses/>.  */
> > -
> > -/* Define multiple versions only for the definition in libc.  */
> > -#if IS_IN (libc)
> > -# define __bzero __redirect___bzero
> > -# include <string.h>
> > -# undef __bzero
> > -
> > -# define SYMBOL_NAME __bzero
> > -# include <init-arch.h>
> > -
> > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned)
> > -  attribute_hidden;
> > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms)
> > -  attribute_hidden;
> > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden;
> > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms)
> > -  attribute_hidden;
> > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm)
> > -  attribute_hidden;
> > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm)
> > -  attribute_hidden;
> > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned)
> > -  attribute_hidden;
> > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms)
> > -  attribute_hidden;
> > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned)
> > -  attribute_hidden;
> > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms)
> > -  attribute_hidden;
> > -
> > -static inline void *
> > -IFUNC_SELECTOR (void)
> > -{
> > -  const struct cpu_features* cpu_features = __get_cpu_features ();
> > -
> > -  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
> > -      && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
> > -    {
> > -      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > -          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > -          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > -       {
> > -         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > -           return OPTIMIZE1 (avx512_unaligned_erms);
> > -
> > -         return OPTIMIZE1 (avx512_unaligned);
> > -       }
> > -    }
> > -
> > -  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
> > -    {
> > -      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > -          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > -          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > -       {
> > -         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > -           return OPTIMIZE1 (evex_unaligned_erms);
> > -
> > -         return OPTIMIZE1 (evex_unaligned);
> > -       }
> > -
> > -      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > -       {
> > -         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > -           return OPTIMIZE1 (avx2_unaligned_erms_rtm);
> > -
> > -         return OPTIMIZE1 (avx2_unaligned_rtm);
> > -       }
> > -
> > -      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> > -       {
> > -         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > -           return OPTIMIZE1 (avx2_unaligned_erms);
> > -
> > -         return OPTIMIZE1 (avx2_unaligned);
> > -       }
> > -    }
> > -
> > -  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > -    return OPTIMIZE1 (sse2_unaligned_erms);
> > -
> > -  return OPTIMIZE1 (sse2_unaligned);
> > -}
> > -
> > -libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ());
> > -
> > -weak_alias (__bzero, bzero)
> > -#endif
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index a8afcf81bb..7218095430 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -291,48 +291,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                               __memset_avx512_no_vzeroupper)
> >              )
> >
> > -  /* Support sysdeps/x86_64/multiarch/bzero.c.  */
> > -  IFUNC_IMPL (i, name, bzero,
> > -             IFUNC_IMPL_ADD (array, i, bzero, 1,
> > -                             __bzero_sse2_unaligned)
> > -             IFUNC_IMPL_ADD (array, i, bzero, 1,
> > -                             __bzero_sse2_unaligned_erms)
> > -             IFUNC_IMPL_ADD (array, i, bzero,
> > -                             CPU_FEATURE_USABLE (AVX2),
> > -                             __bzero_avx2_unaligned)
> > -             IFUNC_IMPL_ADD (array, i, bzero,
> > -                             CPU_FEATURE_USABLE (AVX2),
> > -                             __bzero_avx2_unaligned_erms)
> > -             IFUNC_IMPL_ADD (array, i, bzero,
> > -                             (CPU_FEATURE_USABLE (AVX2)
> > -                              && CPU_FEATURE_USABLE (RTM)),
> > -                             __bzero_avx2_unaligned_rtm)
> > -             IFUNC_IMPL_ADD (array, i, bzero,
> > -                             (CPU_FEATURE_USABLE (AVX2)
> > -                              && CPU_FEATURE_USABLE (RTM)),
> > -                             __bzero_avx2_unaligned_erms_rtm)
> > -             IFUNC_IMPL_ADD (array, i, bzero,
> > -                             (CPU_FEATURE_USABLE (AVX512VL)
> > -                              && CPU_FEATURE_USABLE (AVX512BW)
> > -                              && CPU_FEATURE_USABLE (BMI2)),
> > -                             __bzero_evex_unaligned)
> > -             IFUNC_IMPL_ADD (array, i, bzero,
> > -                             (CPU_FEATURE_USABLE (AVX512VL)
> > -                              && CPU_FEATURE_USABLE (AVX512BW)
> > -                              && CPU_FEATURE_USABLE (BMI2)),
> > -                             __bzero_evex_unaligned_erms)
> > -             IFUNC_IMPL_ADD (array, i, bzero,
> > -                             (CPU_FEATURE_USABLE (AVX512VL)
> > -                              && CPU_FEATURE_USABLE (AVX512BW)
> > -                              && CPU_FEATURE_USABLE (BMI2)),
> > -                             __bzero_avx512_unaligned_erms)
> > -             IFUNC_IMPL_ADD (array, i, bzero,
> > -                             (CPU_FEATURE_USABLE (AVX512VL)
> > -                              && CPU_FEATURE_USABLE (AVX512BW)
> > -                              && CPU_FEATURE_USABLE (BMI2)),
> > -                             __bzero_avx512_unaligned)
> > -            )
> > -
> >    /* Support sysdeps/x86_64/multiarch/rawmemchr.c.  */
> >    IFUNC_IMPL (i, name, rawmemchr,
> >               IFUNC_IMPL_ADD (array, i, rawmemchr,
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> > index 5a5ee6f672..8ac3e479bb 100644
> > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> > @@ -5,7 +5,6 @@
> >
> >  #define SECTION(p) p##.avx.rtm
> >  #define MEMSET_SYMBOL(p,s)     p##_avx2_##s##_rtm
> > -#define BZERO_SYMBOL(p,s)      p##_avx2_##s##_rtm
> >  #define WMEMSET_SYMBOL(p,s)    p##_avx2_##s##_rtm
> >
> >  #include "memset-avx2-unaligned-erms.S"
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > index a093a2831f..c0bf2875d0 100644
> > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > @@ -14,9 +14,6 @@
> >    vmovd d, %xmm0; \
> >    movq r, %rax;
> >
> > -# define BZERO_ZERO_VEC0() \
> > -  vpxor %xmm0, %xmm0, %xmm0
> > -
> >  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> >    MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
> >
> > @@ -32,9 +29,6 @@
> >  # ifndef MEMSET_SYMBOL
> >  #  define MEMSET_SYMBOL(p,s)   p##_avx2_##s
> >  # endif
> > -# ifndef BZERO_SYMBOL
> > -#  define BZERO_SYMBOL(p,s)    p##_avx2_##s
> > -# endif
> >  # ifndef WMEMSET_SYMBOL
> >  #  define WMEMSET_SYMBOL(p,s)  p##_avx2_##s
> >  # endif
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > index 727c92133a..5241216a77 100644
> > --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > @@ -19,9 +19,6 @@
> >    vpbroadcastb d, %VEC0; \
> >    movq r, %rax
> >
> > -# define BZERO_ZERO_VEC0() \
> > -  vpxorq %XMM0, %XMM0, %XMM0
> > -
> >  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> >    vpbroadcastd d, %VEC0; \
> >    movq r, %rax
> > diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > index 5d8fa78f05..6370021506 100644
> > --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > @@ -19,9 +19,6 @@
> >    vpbroadcastb d, %VEC0; \
> >    movq r, %rax
> >
> > -# define BZERO_ZERO_VEC0() \
> > -  vpxorq %XMM0, %XMM0, %XMM0
> > -
> >  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> >    vpbroadcastd d, %VEC0; \
> >    movq r, %rax
> > diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> > index d52d170804..3d92f6993a 100644
> > --- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> > @@ -22,7 +22,6 @@
> >
> >  #if IS_IN (libc)
> >  # define MEMSET_SYMBOL(p,s)    p##_sse2_##s
> > -# define BZERO_SYMBOL(p,s)     MEMSET_SYMBOL (p, s)
> >  # define WMEMSET_SYMBOL(p,s)   p##_sse2_##s
> >
> >  # ifdef SHARED
> > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > index 785fee1d57..aa78fbb620 100644
> > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > @@ -1,4 +1,4 @@
> > -/* memset/bzero with unaligned store and rep stosb
> > +/* memset with unaligned store and rep stosb
> >     Copyright (C) 2016-2022 Free Software Foundation, Inc.
> >     This file is part of the GNU C Library.
> >
> > @@ -26,10 +26,6 @@
> >
> >  #include <sysdep.h>
> >
> > -#ifndef BZERO_SYMBOL
> > -# define BZERO_SYMBOL(p,s)             MEMSET_SYMBOL (p, s)
> > -#endif
> > -
> >  #ifndef MEMSET_CHK_SYMBOL
> >  # define MEMSET_CHK_SYMBOL(p,s)                MEMSET_SYMBOL(p, s)
> >  #endif
> > @@ -134,31 +130,6 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> >  END (WMEMSET_SYMBOL (__wmemset, unaligned))
> >  #endif
> >
> > -ENTRY (BZERO_SYMBOL(__bzero, unaligned))
> > -#if VEC_SIZE > 16
> > -       BZERO_ZERO_VEC0 ()
> > -#endif
> > -       mov     %RDI_LP, %RAX_LP
> > -       mov     %RSI_LP, %RDX_LP
> > -#ifndef USE_LESS_VEC_MASK_STORE
> > -       xorl    %esi, %esi
> > -#endif
> > -       cmp     $VEC_SIZE, %RDX_LP
> > -       jb      L(less_vec_no_vdup)
> > -#ifdef USE_LESS_VEC_MASK_STORE
> > -       xorl    %esi, %esi
> > -#endif
> > -#if VEC_SIZE <= 16
> > -       BZERO_ZERO_VEC0 ()
> > -#endif
> > -       cmp     $(VEC_SIZE * 2), %RDX_LP
> > -       ja      L(more_2x_vec)
> > -       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> > -       VMOVU   %VEC(0), (%rdi)
> > -       VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> > -       VZEROUPPER_RETURN
> > -END (BZERO_SYMBOL(__bzero, unaligned))
> > -
> >  #if defined SHARED && IS_IN (libc)
> >  ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> >         cmp     %RDX_LP, %RCX_LP
> > @@ -216,31 +187,6 @@ END (__memset_erms)
> >  END (MEMSET_SYMBOL (__memset, erms))
> >  # endif
> >
> > -ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6)
> > -# if VEC_SIZE > 16
> > -       BZERO_ZERO_VEC0 ()
> > -# endif
> > -       mov     %RDI_LP, %RAX_LP
> > -       mov     %RSI_LP, %RDX_LP
> > -# ifndef USE_LESS_VEC_MASK_STORE
> > -       xorl    %esi, %esi
> > -# endif
> > -       cmp     $VEC_SIZE, %RDX_LP
> > -       jb      L(less_vec_no_vdup)
>
> This label 'L(less_vec_no_vdup)' is no longer used. Can you
> remove it from the rest of memset-vec-unaligned-erms.S?
> > -# ifdef USE_LESS_VEC_MASK_STORE
> > -       xorl    %esi, %esi
> > -# endif
> > -# if VEC_SIZE <= 16
> > -       BZERO_ZERO_VEC0 ()
> > -# endif
> > -       cmp     $(VEC_SIZE * 2), %RDX_LP
> > -       ja      L(stosb_more_2x_vec)
> > -       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> > -       VMOVU   %VEC(0), (%rdi)
> > -       VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> > -       VZEROUPPER_RETURN
> > -END (BZERO_SYMBOL(__bzero, unaligned_erms))
> > -
> >  # if defined SHARED && IS_IN (libc)
> >  ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> >         cmp     %RDX_LP, %RCX_LP
> > --
> > 2.34.1
> >
>
> I'm in favor of this.
>
> Should we keep the old method of just swapping params that was removed in:
>
> commit 3d9f171bfb5325bd5f427e9fc386453358c6e840
> Author: H.J. Lu <hjl.tools@gmail.com>
> Date:   Mon Feb 7 05:55:15 2022 -0800
>
>     x86-64: Optimize bzero
>
> ```
> - .section SECTION(.text),"ax",@progbits
> -#if VEC_SIZE == 16 && IS_IN (libc)
> -ENTRY (__bzero)
> - mov %RDI_LP, %RAX_LP /* Set return value.  */
> - mov %RSI_LP, %RDX_LP /* Set n.  */
> - xorl %esi, %esi
> - pxor %XMM0, %XMM0
> - jmp L(entry_from_bzero)
> -END (__bzero)
> -weak_alias (__bzero, bzero)
> ```
> ?
>
> Old software that hasn't been recompiled will have a bit of a regression
> going through the generic impl and it only costs 16 bytes of code.

This old code hardcoded bzero to SSE2 memset.   The generic one
will use memset (0) which can use YMM or ZMM stores.  I don't think
there will be much of a regression.
Noah Goldstein May 13, 2022, 11:13 p.m. UTC | #3
On Fri, May 13, 2022 at 2:51 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, May 13, 2022 at 7:55 AM Noah Goldstein via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > On Fri, May 13, 2022 at 7:42 AM Adhemerval Zanella via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> > >
> > > Both symbols are marked as legacy in POSIX.1-2001 and removed in
> > > POSIX.1-2008, although the prototypes are defined for _GNU_SOURCE
> > > or _DEFAULT_SOURCE.
> > >
> > > GCC also replaces bcopy with memmove and bzero with memset in its default
> > > configuration (to actually get a bzero libc call, the code needs to
> > > omit the string.h inclusion and be built with -fno-builtin), so it is
> > > highly unlikely that programs are actually calling the libc bzero symbol.
> > >
> > > On a recent Linux distro (Ubuntu 22.04), there are no bzero calls
> > > in the installed binaries.
> > >
> > >   $ cat count_bstring.sh
> > >   #!/bin/bash
> > >
> > >   files=`IFS=':';for i in $PATH; do test -d "$i" && find "$i" -maxdepth 1 -executable -type f; done`
> > >   total=0
> > >   for file in $files; do
> > >     symbols=`objdump -R $file 2>&1`
> > >     if [ $? -eq 0 ]; then
> > >       ncalls=`echo $symbols | grep -w $1 | wc -l`
> > >       ((total=total+ncalls))
> > >       if [ $ncalls -gt 0 ]; then
> > >         echo "$file: $ncalls"
> > >       fi
> > >     fi
> > >   done
> > >   echo "TOTAL=$total"
> > >   $ ./count_bstring.sh bzero
> > >   TOTAL=0
> > >
> > > Checked on x86_64-linux-gnu.
> > > ---
> > >  sysdeps/x86_64/bzero.S                        |   1 -
> > >  sysdeps/x86_64/memset.S                       |  10 +-
> > >  sysdeps/x86_64/multiarch/Makefile             |   1 -
> > >  sysdeps/x86_64/multiarch/bzero.c              | 106 ------------------
> > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  42 -------
> > >  .../memset-avx2-unaligned-erms-rtm.S          |   1 -
> > >  .../multiarch/memset-avx2-unaligned-erms.S    |   6 -
> > >  .../multiarch/memset-avx512-unaligned-erms.S  |   3 -
> > >  .../multiarch/memset-evex-unaligned-erms.S    |   3 -
> > >  .../multiarch/memset-sse2-unaligned-erms.S    |   1 -
> > >  .../multiarch/memset-vec-unaligned-erms.S     |  56 +--------
> > >  11 files changed, 2 insertions(+), 228 deletions(-)
> > >  delete mode 100644 sysdeps/x86_64/bzero.S
> > >  delete mode 100644 sysdeps/x86_64/multiarch/bzero.c
> > >
> > > diff --git a/sysdeps/x86_64/bzero.S b/sysdeps/x86_64/bzero.S
> > > deleted file mode 100644
> > > index f96d567fd8..0000000000
> > > --- a/sysdeps/x86_64/bzero.S
> > > +++ /dev/null
> > > @@ -1 +0,0 @@
> > > -/* Implemented in memset.S.  */
> > > diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> > > index af26e9cedc..a6eea61a4d 100644
> > > --- a/sysdeps/x86_64/memset.S
> > > +++ b/sysdeps/x86_64/memset.S
> > > @@ -1,4 +1,4 @@
> > > -/* memset/bzero -- set memory area to CH/0
> > > +/* memset -- set memory area to CH/0
> > >     Optimized version for x86-64.
> > >     Copyright (C) 2002-2022 Free Software Foundation, Inc.
> > >     This file is part of the GNU C Library.
> > > @@ -35,9 +35,6 @@
> > >    punpcklwd %xmm0, %xmm0; \
> > >    pshufd $0, %xmm0, %xmm0
> > >
> > > -# define BZERO_ZERO_VEC0() \
> > > -  pxor %xmm0, %xmm0
> > > -
> > >  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > >    movd d, %xmm0; \
> > >    pshufd $0, %xmm0, %xmm0; \
> > > @@ -56,10 +53,6 @@
> > >  # define MEMSET_SYMBOL(p,s)    memset
> > >  #endif
> > >
> > > -#ifndef BZERO_SYMBOL
> > > -# define BZERO_SYMBOL(p,s)     __bzero
> > > -#endif
> > > -
> > >  #ifndef WMEMSET_SYMBOL
> > >  # define WMEMSET_CHK_SYMBOL(p,s) p
> > >  # define WMEMSET_SYMBOL(p,s)   __wmemset
> > > @@ -70,7 +63,6 @@
> > >  libc_hidden_builtin_def (memset)
> > >
> > >  #if IS_IN (libc)
> > > -weak_alias (__bzero, bzero)
> > >  libc_hidden_def (__wmemset)
> > >  weak_alias (__wmemset, wmemset)
> > >  libc_hidden_weak (wmemset)
> > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > > index 0400ea332b..f3ab5e0928 100644
> > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > @@ -1,7 +1,6 @@
> > >  ifeq ($(subdir),string)
> > >
> > >  sysdep_routines += \
> > > -  bzero \
> > >    memchr-avx2 \
> > >    memchr-avx2-rtm \
> > >    memchr-evex \
> > > diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
> > > deleted file mode 100644
> > > index 58a14b2c33..0000000000
> > > --- a/sysdeps/x86_64/multiarch/bzero.c
> > > +++ /dev/null
> > > @@ -1,106 +0,0 @@
> > > -/* Multiple versions of bzero.
> > > -   All versions must be listed in ifunc-impl-list.c.
> > > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > > -   This file is part of the GNU C Library.
> > > -
> > > -   The GNU C Library is free software; you can redistribute it and/or
> > > -   modify it under the terms of the GNU Lesser General Public
> > > -   License as published by the Free Software Foundation; either
> > > -   version 2.1 of the License, or (at your option) any later version.
> > > -
> > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > -   Lesser General Public License for more details.
> > > -
> > > -   You should have received a copy of the GNU Lesser General Public
> > > -   License along with the GNU C Library; if not, see
> > > -   <https://www.gnu.org/licenses/>.  */
> > > -
> > > -/* Define multiple versions only for the definition in libc.  */
> > > -#if IS_IN (libc)
> > > -# define __bzero __redirect___bzero
> > > -# include <string.h>
> > > -# undef __bzero
> > > -
> > > -# define SYMBOL_NAME __bzero
> > > -# include <init-arch.h>
> > > -
> > > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned)
> > > -  attribute_hidden;
> > > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms)
> > > -  attribute_hidden;
> > > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden;
> > > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms)
> > > -  attribute_hidden;
> > > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm)
> > > -  attribute_hidden;
> > > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm)
> > > -  attribute_hidden;
> > > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned)
> > > -  attribute_hidden;
> > > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms)
> > > -  attribute_hidden;
> > > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned)
> > > -  attribute_hidden;
> > > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms)
> > > -  attribute_hidden;
> > > -
> > > -static inline void *
> > > -IFUNC_SELECTOR (void)
> > > -{
> > > -  const struct cpu_features* cpu_features = __get_cpu_features ();
> > > -
> > > -  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
> > > -      && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
> > > -    {
> > > -      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > > -          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > > -          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > > -       {
> > > -         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > > -           return OPTIMIZE1 (avx512_unaligned_erms);
> > > -
> > > -         return OPTIMIZE1 (avx512_unaligned);
> > > -       }
> > > -    }
> > > -
> > > -  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
> > > -    {
> > > -      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > > -          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > > -          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > > -       {
> > > -         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > > -           return OPTIMIZE1 (evex_unaligned_erms);
> > > -
> > > -         return OPTIMIZE1 (evex_unaligned);
> > > -       }
> > > -
> > > -      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > > -       {
> > > -         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > > -           return OPTIMIZE1 (avx2_unaligned_erms_rtm);
> > > -
> > > -         return OPTIMIZE1 (avx2_unaligned_rtm);
> > > -       }
> > > -
> > > -      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> > > -       {
> > > -         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > > -           return OPTIMIZE1 (avx2_unaligned_erms);
> > > -
> > > -         return OPTIMIZE1 (avx2_unaligned);
> > > -       }
> > > -    }
> > > -
> > > -  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > > -    return OPTIMIZE1 (sse2_unaligned_erms);
> > > -
> > > -  return OPTIMIZE1 (sse2_unaligned);
> > > -}
> > > -
> > > -libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ());
> > > -
> > > -weak_alias (__bzero, bzero)
> > > -#endif
> > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > index a8afcf81bb..7218095430 100644
> > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > @@ -291,48 +291,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >                               __memset_avx512_no_vzeroupper)
> > >              )
> > >
> > > -  /* Support sysdeps/x86_64/multiarch/bzero.c.  */
> > > -  IFUNC_IMPL (i, name, bzero,
> > > -             IFUNC_IMPL_ADD (array, i, bzero, 1,
> > > -                             __bzero_sse2_unaligned)
> > > -             IFUNC_IMPL_ADD (array, i, bzero, 1,
> > > -                             __bzero_sse2_unaligned_erms)
> > > -             IFUNC_IMPL_ADD (array, i, bzero,
> > > -                             CPU_FEATURE_USABLE (AVX2),
> > > -                             __bzero_avx2_unaligned)
> > > -             IFUNC_IMPL_ADD (array, i, bzero,
> > > -                             CPU_FEATURE_USABLE (AVX2),
> > > -                             __bzero_avx2_unaligned_erms)
> > > -             IFUNC_IMPL_ADD (array, i, bzero,
> > > -                             (CPU_FEATURE_USABLE (AVX2)
> > > -                              && CPU_FEATURE_USABLE (RTM)),
> > > -                             __bzero_avx2_unaligned_rtm)
> > > -             IFUNC_IMPL_ADD (array, i, bzero,
> > > -                             (CPU_FEATURE_USABLE (AVX2)
> > > -                              && CPU_FEATURE_USABLE (RTM)),
> > > -                             __bzero_avx2_unaligned_erms_rtm)
> > > -             IFUNC_IMPL_ADD (array, i, bzero,
> > > -                             (CPU_FEATURE_USABLE (AVX512VL)
> > > -                              && CPU_FEATURE_USABLE (AVX512BW)
> > > -                              && CPU_FEATURE_USABLE (BMI2)),
> > > -                             __bzero_evex_unaligned)
> > > -             IFUNC_IMPL_ADD (array, i, bzero,
> > > -                             (CPU_FEATURE_USABLE (AVX512VL)
> > > -                              && CPU_FEATURE_USABLE (AVX512BW)
> > > -                              && CPU_FEATURE_USABLE (BMI2)),
> > > -                             __bzero_evex_unaligned_erms)
> > > -             IFUNC_IMPL_ADD (array, i, bzero,
> > > -                             (CPU_FEATURE_USABLE (AVX512VL)
> > > -                              && CPU_FEATURE_USABLE (AVX512BW)
> > > -                              && CPU_FEATURE_USABLE (BMI2)),
> > > -                             __bzero_avx512_unaligned_erms)
> > > -             IFUNC_IMPL_ADD (array, i, bzero,
> > > -                             (CPU_FEATURE_USABLE (AVX512VL)
> > > -                              && CPU_FEATURE_USABLE (AVX512BW)
> > > -                              && CPU_FEATURE_USABLE (BMI2)),
> > > -                             __bzero_avx512_unaligned)
> > > -            )
> > > -
> > >    /* Support sysdeps/x86_64/multiarch/rawmemchr.c.  */
> > >    IFUNC_IMPL (i, name, rawmemchr,
> > >               IFUNC_IMPL_ADD (array, i, rawmemchr,
> > > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> > > index 5a5ee6f672..8ac3e479bb 100644
> > > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> > > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> > > @@ -5,7 +5,6 @@
> > >
> > >  #define SECTION(p) p##.avx.rtm
> > >  #define MEMSET_SYMBOL(p,s)     p##_avx2_##s##_rtm
> > > -#define BZERO_SYMBOL(p,s)      p##_avx2_##s##_rtm
> > >  #define WMEMSET_SYMBOL(p,s)    p##_avx2_##s##_rtm
> > >
> > >  #include "memset-avx2-unaligned-erms.S"
> > > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > index a093a2831f..c0bf2875d0 100644
> > > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > @@ -14,9 +14,6 @@
> > >    vmovd d, %xmm0; \
> > >    movq r, %rax;
> > >
> > > -# define BZERO_ZERO_VEC0() \
> > > -  vpxor %xmm0, %xmm0, %xmm0
> > > -
> > >  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > >    MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
> > >
> > > @@ -32,9 +29,6 @@
> > >  # ifndef MEMSET_SYMBOL
> > >  #  define MEMSET_SYMBOL(p,s)   p##_avx2_##s
> > >  # endif
> > > -# ifndef BZERO_SYMBOL
> > > -#  define BZERO_SYMBOL(p,s)    p##_avx2_##s
> > > -# endif
> > >  # ifndef WMEMSET_SYMBOL
> > >  #  define WMEMSET_SYMBOL(p,s)  p##_avx2_##s
> > >  # endif
> > > diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > index 727c92133a..5241216a77 100644
> > > --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > @@ -19,9 +19,6 @@
> > >    vpbroadcastb d, %VEC0; \
> > >    movq r, %rax
> > >
> > > -# define BZERO_ZERO_VEC0() \
> > > -  vpxorq %XMM0, %XMM0, %XMM0
> > > -
> > >  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > >    vpbroadcastd d, %VEC0; \
> > >    movq r, %rax
> > > diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > index 5d8fa78f05..6370021506 100644
> > > --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > @@ -19,9 +19,6 @@
> > >    vpbroadcastb d, %VEC0; \
> > >    movq r, %rax
> > >
> > > -# define BZERO_ZERO_VEC0() \
> > > -  vpxorq %XMM0, %XMM0, %XMM0
> > > -
> > >  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > >    vpbroadcastd d, %VEC0; \
> > >    movq r, %rax
> > > diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> > > index d52d170804..3d92f6993a 100644
> > > --- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> > > @@ -22,7 +22,6 @@
> > >
> > >  #if IS_IN (libc)
> > >  # define MEMSET_SYMBOL(p,s)    p##_sse2_##s
> > > -# define BZERO_SYMBOL(p,s)     MEMSET_SYMBOL (p, s)
> > >  # define WMEMSET_SYMBOL(p,s)   p##_sse2_##s
> > >
> > >  # ifdef SHARED
> > > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > index 785fee1d57..aa78fbb620 100644
> > > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > @@ -1,4 +1,4 @@
> > > -/* memset/bzero with unaligned store and rep stosb
> > > +/* memset with unaligned store and rep stosb
> > >     Copyright (C) 2016-2022 Free Software Foundation, Inc.
> > >     This file is part of the GNU C Library.
> > >
> > > @@ -26,10 +26,6 @@
> > >
> > >  #include <sysdep.h>
> > >
> > > -#ifndef BZERO_SYMBOL
> > > -# define BZERO_SYMBOL(p,s)             MEMSET_SYMBOL (p, s)
> > > -#endif
> > > -
> > >  #ifndef MEMSET_CHK_SYMBOL
> > >  # define MEMSET_CHK_SYMBOL(p,s)                MEMSET_SYMBOL(p, s)
> > >  #endif
> > > @@ -134,31 +130,6 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> > >  END (WMEMSET_SYMBOL (__wmemset, unaligned))
> > >  #endif
> > >
> > > -ENTRY (BZERO_SYMBOL(__bzero, unaligned))
> > > -#if VEC_SIZE > 16
> > > -       BZERO_ZERO_VEC0 ()
> > > -#endif
> > > -       mov     %RDI_LP, %RAX_LP
> > > -       mov     %RSI_LP, %RDX_LP
> > > -#ifndef USE_LESS_VEC_MASK_STORE
> > > -       xorl    %esi, %esi
> > > -#endif
> > > -       cmp     $VEC_SIZE, %RDX_LP
> > > -       jb      L(less_vec_no_vdup)
> > > -#ifdef USE_LESS_VEC_MASK_STORE
> > > -       xorl    %esi, %esi
> > > -#endif
> > > -#if VEC_SIZE <= 16
> > > -       BZERO_ZERO_VEC0 ()
> > > -#endif
> > > -       cmp     $(VEC_SIZE * 2), %RDX_LP
> > > -       ja      L(more_2x_vec)
> > > -       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> > > -       VMOVU   %VEC(0), (%rdi)
> > > -       VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> > > -       VZEROUPPER_RETURN
> > > -END (BZERO_SYMBOL(__bzero, unaligned))
> > > -
> > >  #if defined SHARED && IS_IN (libc)
> > >  ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> > >         cmp     %RDX_LP, %RCX_LP
> > > @@ -216,31 +187,6 @@ END (__memset_erms)
> > >  END (MEMSET_SYMBOL (__memset, erms))
> > >  # endif
> > >
> > > -ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6)
> > > -# if VEC_SIZE > 16
> > > -       BZERO_ZERO_VEC0 ()
> > > -# endif
> > > -       mov     %RDI_LP, %RAX_LP
> > > -       mov     %RSI_LP, %RDX_LP
> > > -# ifndef USE_LESS_VEC_MASK_STORE
> > > -       xorl    %esi, %esi
> > > -# endif
> > > -       cmp     $VEC_SIZE, %RDX_LP
> > > -       jb      L(less_vec_no_vdup)
> >
> > This label 'L(less_vec_no_vdup)' is no longer used. Can you
> > remove it from the rest of memset-vec-unaligned-erms.S?
> > > -# ifdef USE_LESS_VEC_MASK_STORE
> > > -       xorl    %esi, %esi
> > > -# endif
> > > -# if VEC_SIZE <= 16
> > > -       BZERO_ZERO_VEC0 ()
> > > -# endif
> > > -       cmp     $(VEC_SIZE * 2), %RDX_LP
> > > -       ja      L(stosb_more_2x_vec)
> > > -       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> > > -       VMOVU   %VEC(0), (%rdi)
> > > -       VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> > > -       VZEROUPPER_RETURN
> > > -END (BZERO_SYMBOL(__bzero, unaligned_erms))
> > > -
> > >  # if defined SHARED && IS_IN (libc)
> > >  ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> > >         cmp     %RDX_LP, %RCX_LP
> > > --
> > > 2.34.1
> > >
> >
> > I'm in favor of this.
> >
> > Should we keep the old method of just swapping params that was removed in:
> >
> > commit 3d9f171bfb5325bd5f427e9fc386453358c6e840
> > Author: H.J. Lu <hjl.tools@gmail.com>
> > Date:   Mon Feb 7 05:55:15 2022 -0800
> >
> >     x86-64: Optimize bzero
> >
> > ```
> > - .section SECTION(.text),"ax",@progbits
> > -#if VEC_SIZE == 16 && IS_IN (libc)
> > -ENTRY (__bzero)
> > - mov %RDI_LP, %RAX_LP /* Set return value.  */
> > - mov %RSI_LP, %RDX_LP /* Set n.  */
> > - xorl %esi, %esi
> > - pxor %XMM0, %XMM0
> > - jmp L(entry_from_bzero)
> > -END (__bzero)
> > -weak_alias (__bzero, bzero)
> > ```
> > ?
> >
> > Old software that hasn't been recompiled will have a bit of a regression
> > going through the generic impl and it only costs 16 bytes of code.
>
> This old code hardcoded bzero to SSE2 memset.   The generic one
> will use memset (0) which can use YMM or ZMM stores.  I don't think
> there will be much of a regression.

That only holds if the code has been recompiled. The point is that binaries
that haven't been recompiled since gcc/clang started replacing
bzero -> memset(0) are going to get the generic implementation now.
>
> --
> H.J.
H.J. Lu May 14, 2022, 12:42 a.m. UTC | #4
On Fri, May 13, 2022 at 4:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Fri, May 13, 2022 at 2:51 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Fri, May 13, 2022 at 7:55 AM Noah Goldstein via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> > >
> > > On Fri, May 13, 2022 at 7:42 AM Adhemerval Zanella via Libc-alpha
> > > <libc-alpha@sourceware.org> wrote:
> > > >
> > > > Both symbols are marked as legacy in POSIX.1-2001 and removed in
> > > > POSIX.1-2008, although the prototypes are defined for _GNU_SOURCE
> > > > or _DEFAULT_SOURCE.
> > > >
> > > > GCC also replaces bcopy with memmove and bzero with memset in its default
> > > > configuration (to actually get a bzero libc call, the code needs to
> > > > omit the string.h inclusion and be built with -fno-builtin), so it is
> > > > highly unlikely that programs are actually calling the libc bzero symbol.
> > > >
> > > > On a recent Linux distro (Ubuntu 22.04), there are no bzero calls
> > > > in the installed binaries.
> > > >
> > > >   $ cat count_bstring.sh
> > > >   #!/bin/bash
> > > >
> > > >   files=`IFS=':';for i in $PATH; do test -d "$i" && find "$i" -maxdepth 1 -executable -type f; done`
> > > >   total=0
> > > >   for file in $files; do
> > > >     symbols=`objdump -R $file 2>&1`
> > > >     if [ $? -eq 0 ]; then
> > > >       ncalls=`echo $symbols | grep -w $1 | wc -l`
> > > >       ((total=total+ncalls))
> > > >       if [ $ncalls -gt 0 ]; then
> > > >         echo "$file: $ncalls"
> > > >       fi
> > > >     fi
> > > >   done
> > > >   echo "TOTAL=$total"
> > > >   $ ./count_bstring.sh bzero
> > > >   TOTAL=0
> > > >
> > > > Checked on x86_64-linux-gnu.
> > > > ---
> > > >  sysdeps/x86_64/bzero.S                        |   1 -
> > > >  sysdeps/x86_64/memset.S                       |  10 +-
> > > >  sysdeps/x86_64/multiarch/Makefile             |   1 -
> > > >  sysdeps/x86_64/multiarch/bzero.c              | 106 ------------------
> > > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  42 -------
> > > >  .../memset-avx2-unaligned-erms-rtm.S          |   1 -
> > > >  .../multiarch/memset-avx2-unaligned-erms.S    |   6 -
> > > >  .../multiarch/memset-avx512-unaligned-erms.S  |   3 -
> > > >  .../multiarch/memset-evex-unaligned-erms.S    |   3 -
> > > >  .../multiarch/memset-sse2-unaligned-erms.S    |   1 -
> > > >  .../multiarch/memset-vec-unaligned-erms.S     |  56 +--------
> > > >  11 files changed, 2 insertions(+), 228 deletions(-)
> > > >  delete mode 100644 sysdeps/x86_64/bzero.S
> > > >  delete mode 100644 sysdeps/x86_64/multiarch/bzero.c
> > > >
> > > > diff --git a/sysdeps/x86_64/bzero.S b/sysdeps/x86_64/bzero.S
> > > > deleted file mode 100644
> > > > index f96d567fd8..0000000000
> > > > --- a/sysdeps/x86_64/bzero.S
> > > > +++ /dev/null
> > > > @@ -1 +0,0 @@
> > > > -/* Implemented in memset.S.  */
> > > > diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> > > > index af26e9cedc..a6eea61a4d 100644
> > > > --- a/sysdeps/x86_64/memset.S
> > > > +++ b/sysdeps/x86_64/memset.S
> > > > @@ -1,4 +1,4 @@
> > > > -/* memset/bzero -- set memory area to CH/0
> > > > +/* memset -- set memory area to CH/0
> > > >     Optimized version for x86-64.
> > > >     Copyright (C) 2002-2022 Free Software Foundation, Inc.
> > > >     This file is part of the GNU C Library.
> > > > @@ -35,9 +35,6 @@
> > > >    punpcklwd %xmm0, %xmm0; \
> > > >    pshufd $0, %xmm0, %xmm0
> > > >
> > > > -# define BZERO_ZERO_VEC0() \
> > > > -  pxor %xmm0, %xmm0
> > > > -
> > > >  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > > >    movd d, %xmm0; \
> > > >    pshufd $0, %xmm0, %xmm0; \
> > > > @@ -56,10 +53,6 @@
> > > >  # define MEMSET_SYMBOL(p,s)    memset
> > > >  #endif
> > > >
> > > > -#ifndef BZERO_SYMBOL
> > > > -# define BZERO_SYMBOL(p,s)     __bzero
> > > > -#endif
> > > > -
> > > >  #ifndef WMEMSET_SYMBOL
> > > >  # define WMEMSET_CHK_SYMBOL(p,s) p
> > > >  # define WMEMSET_SYMBOL(p,s)   __wmemset
> > > > @@ -70,7 +63,6 @@
> > > >  libc_hidden_builtin_def (memset)
> > > >
> > > >  #if IS_IN (libc)
> > > > -weak_alias (__bzero, bzero)
> > > >  libc_hidden_def (__wmemset)
> > > >  weak_alias (__wmemset, wmemset)
> > > >  libc_hidden_weak (wmemset)
> > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > > > index 0400ea332b..f3ab5e0928 100644
> > > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > > @@ -1,7 +1,6 @@
> > > >  ifeq ($(subdir),string)
> > > >
> > > >  sysdep_routines += \
> > > > -  bzero \
> > > >    memchr-avx2 \
> > > >    memchr-avx2-rtm \
> > > >    memchr-evex \
> > > > diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
> > > > deleted file mode 100644
> > > > index 58a14b2c33..0000000000
> > > > --- a/sysdeps/x86_64/multiarch/bzero.c
> > > > +++ /dev/null
> > > > @@ -1,106 +0,0 @@
> > > > -/* Multiple versions of bzero.
> > > > -   All versions must be listed in ifunc-impl-list.c.
> > > > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > -   This file is part of the GNU C Library.
> > > > -
> > > > -   The GNU C Library is free software; you can redistribute it and/or
> > > > -   modify it under the terms of the GNU Lesser General Public
> > > > -   License as published by the Free Software Foundation; either
> > > > -   version 2.1 of the License, or (at your option) any later version.
> > > > -
> > > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > -   Lesser General Public License for more details.
> > > > -
> > > > -   You should have received a copy of the GNU Lesser General Public
> > > > -   License along with the GNU C Library; if not, see
> > > > -   <https://www.gnu.org/licenses/>.  */
> > > > -
> > > > -/* Define multiple versions only for the definition in libc.  */
> > > > -#if IS_IN (libc)
> > > > -# define __bzero __redirect___bzero
> > > > -# include <string.h>
> > > > -# undef __bzero
> > > > -
> > > > -# define SYMBOL_NAME __bzero
> > > > -# include <init-arch.h>
> > > > -
> > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned)
> > > > -  attribute_hidden;
> > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms)
> > > > -  attribute_hidden;
> > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden;
> > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms)
> > > > -  attribute_hidden;
> > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm)
> > > > -  attribute_hidden;
> > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm)
> > > > -  attribute_hidden;
> > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned)
> > > > -  attribute_hidden;
> > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms)
> > > > -  attribute_hidden;
> > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned)
> > > > -  attribute_hidden;
> > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms)
> > > > -  attribute_hidden;
> > > > -
> > > > -static inline void *
> > > > -IFUNC_SELECTOR (void)
> > > > -{
> > > > -  const struct cpu_features* cpu_features = __get_cpu_features ();
> > > > -
> > > > -  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
> > > > -      && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
> > > > -    {
> > > > -      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > > > -          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > > > -          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > > > -       {
> > > > -         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > > > -           return OPTIMIZE1 (avx512_unaligned_erms);
> > > > -
> > > > -         return OPTIMIZE1 (avx512_unaligned);
> > > > -       }
> > > > -    }
> > > > -
> > > > -  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
> > > > -    {
> > > > -      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > > > -          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > > > -          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > > > -       {
> > > > -         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > > > -           return OPTIMIZE1 (evex_unaligned_erms);
> > > > -
> > > > -         return OPTIMIZE1 (evex_unaligned);
> > > > -       }
> > > > -
> > > > -      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > > > -       {
> > > > -         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > > > -           return OPTIMIZE1 (avx2_unaligned_erms_rtm);
> > > > -
> > > > -         return OPTIMIZE1 (avx2_unaligned_rtm);
> > > > -       }
> > > > -
> > > > -      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> > > > -       {
> > > > -         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > > > -           return OPTIMIZE1 (avx2_unaligned_erms);
> > > > -
> > > > -         return OPTIMIZE1 (avx2_unaligned);
> > > > -       }
> > > > -    }
> > > > -
> > > > -  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > > > -    return OPTIMIZE1 (sse2_unaligned_erms);
> > > > -
> > > > -  return OPTIMIZE1 (sse2_unaligned);
> > > > -}
> > > > -
> > > > -libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ());
> > > > -
> > > > -weak_alias (__bzero, bzero)
> > > > -#endif
> > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > index a8afcf81bb..7218095430 100644
> > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > @@ -291,48 +291,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > >                               __memset_avx512_no_vzeroupper)
> > > >              )
> > > >
> > > > -  /* Support sysdeps/x86_64/multiarch/bzero.c.  */
> > > > -  IFUNC_IMPL (i, name, bzero,
> > > > -             IFUNC_IMPL_ADD (array, i, bzero, 1,
> > > > -                             __bzero_sse2_unaligned)
> > > > -             IFUNC_IMPL_ADD (array, i, bzero, 1,
> > > > -                             __bzero_sse2_unaligned_erms)
> > > > -             IFUNC_IMPL_ADD (array, i, bzero,
> > > > -                             CPU_FEATURE_USABLE (AVX2),
> > > > -                             __bzero_avx2_unaligned)
> > > > -             IFUNC_IMPL_ADD (array, i, bzero,
> > > > -                             CPU_FEATURE_USABLE (AVX2),
> > > > -                             __bzero_avx2_unaligned_erms)
> > > > -             IFUNC_IMPL_ADD (array, i, bzero,
> > > > -                             (CPU_FEATURE_USABLE (AVX2)
> > > > -                              && CPU_FEATURE_USABLE (RTM)),
> > > > -                             __bzero_avx2_unaligned_rtm)
> > > > -             IFUNC_IMPL_ADD (array, i, bzero,
> > > > -                             (CPU_FEATURE_USABLE (AVX2)
> > > > -                              && CPU_FEATURE_USABLE (RTM)),
> > > > -                             __bzero_avx2_unaligned_erms_rtm)
> > > > -             IFUNC_IMPL_ADD (array, i, bzero,
> > > > -                             (CPU_FEATURE_USABLE (AVX512VL)
> > > > -                              && CPU_FEATURE_USABLE (AVX512BW)
> > > > -                              && CPU_FEATURE_USABLE (BMI2)),
> > > > -                             __bzero_evex_unaligned)
> > > > -             IFUNC_IMPL_ADD (array, i, bzero,
> > > > -                             (CPU_FEATURE_USABLE (AVX512VL)
> > > > -                              && CPU_FEATURE_USABLE (AVX512BW)
> > > > -                              && CPU_FEATURE_USABLE (BMI2)),
> > > > -                             __bzero_evex_unaligned_erms)
> > > > -             IFUNC_IMPL_ADD (array, i, bzero,
> > > > -                             (CPU_FEATURE_USABLE (AVX512VL)
> > > > -                              && CPU_FEATURE_USABLE (AVX512BW)
> > > > -                              && CPU_FEATURE_USABLE (BMI2)),
> > > > -                             __bzero_avx512_unaligned_erms)
> > > > -             IFUNC_IMPL_ADD (array, i, bzero,
> > > > -                             (CPU_FEATURE_USABLE (AVX512VL)
> > > > -                              && CPU_FEATURE_USABLE (AVX512BW)
> > > > -                              && CPU_FEATURE_USABLE (BMI2)),
> > > > -                             __bzero_avx512_unaligned)
> > > > -            )
> > > > -
> > > >    /* Support sysdeps/x86_64/multiarch/rawmemchr.c.  */
> > > >    IFUNC_IMPL (i, name, rawmemchr,
> > > >               IFUNC_IMPL_ADD (array, i, rawmemchr,
> > > > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> > > > index 5a5ee6f672..8ac3e479bb 100644
> > > > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> > > > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> > > > @@ -5,7 +5,6 @@
> > > >
> > > >  #define SECTION(p) p##.avx.rtm
> > > >  #define MEMSET_SYMBOL(p,s)     p##_avx2_##s##_rtm
> > > > -#define BZERO_SYMBOL(p,s)      p##_avx2_##s##_rtm
> > > >  #define WMEMSET_SYMBOL(p,s)    p##_avx2_##s##_rtm
> > > >
> > > >  #include "memset-avx2-unaligned-erms.S"
> > > > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > > index a093a2831f..c0bf2875d0 100644
> > > > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > > @@ -14,9 +14,6 @@
> > > >    vmovd d, %xmm0; \
> > > >    movq r, %rax;
> > > >
> > > > -# define BZERO_ZERO_VEC0() \
> > > > -  vpxor %xmm0, %xmm0, %xmm0
> > > > -
> > > >  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > > >    MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
> > > >
> > > > @@ -32,9 +29,6 @@
> > > >  # ifndef MEMSET_SYMBOL
> > > >  #  define MEMSET_SYMBOL(p,s)   p##_avx2_##s
> > > >  # endif
> > > > -# ifndef BZERO_SYMBOL
> > > > -#  define BZERO_SYMBOL(p,s)    p##_avx2_##s
> > > > -# endif
> > > >  # ifndef WMEMSET_SYMBOL
> > > >  #  define WMEMSET_SYMBOL(p,s)  p##_avx2_##s
> > > >  # endif
> > > > diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > > index 727c92133a..5241216a77 100644
> > > > --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > > +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > > @@ -19,9 +19,6 @@
> > > >    vpbroadcastb d, %VEC0; \
> > > >    movq r, %rax
> > > >
> > > > -# define BZERO_ZERO_VEC0() \
> > > > -  vpxorq %XMM0, %XMM0, %XMM0
> > > > -
> > > >  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > > >    vpbroadcastd d, %VEC0; \
> > > >    movq r, %rax
> > > > diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > > index 5d8fa78f05..6370021506 100644
> > > > --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > > +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > > @@ -19,9 +19,6 @@
> > > >    vpbroadcastb d, %VEC0; \
> > > >    movq r, %rax
> > > >
> > > > -# define BZERO_ZERO_VEC0() \
> > > > -  vpxorq %XMM0, %XMM0, %XMM0
> > > > -
> > > >  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > > >    vpbroadcastd d, %VEC0; \
> > > >    movq r, %rax
> > > > diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> > > > index d52d170804..3d92f6993a 100644
> > > > --- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> > > > +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> > > > @@ -22,7 +22,6 @@
> > > >
> > > >  #if IS_IN (libc)
> > > >  # define MEMSET_SYMBOL(p,s)    p##_sse2_##s
> > > > -# define BZERO_SYMBOL(p,s)     MEMSET_SYMBOL (p, s)
> > > >  # define WMEMSET_SYMBOL(p,s)   p##_sse2_##s
> > > >
> > > >  # ifdef SHARED
> > > > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > > index 785fee1d57..aa78fbb620 100644
> > > > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > > @@ -1,4 +1,4 @@
> > > > -/* memset/bzero with unaligned store and rep stosb
> > > > +/* memset with unaligned store and rep stosb
> > > >     Copyright (C) 2016-2022 Free Software Foundation, Inc.
> > > >     This file is part of the GNU C Library.
> > > >
> > > > @@ -26,10 +26,6 @@
> > > >
> > > >  #include <sysdep.h>
> > > >
> > > > -#ifndef BZERO_SYMBOL
> > > > -# define BZERO_SYMBOL(p,s)             MEMSET_SYMBOL (p, s)
> > > > -#endif
> > > > -
> > > >  #ifndef MEMSET_CHK_SYMBOL
> > > >  # define MEMSET_CHK_SYMBOL(p,s)                MEMSET_SYMBOL(p, s)
> > > >  #endif
> > > > @@ -134,31 +130,6 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> > > >  END (WMEMSET_SYMBOL (__wmemset, unaligned))
> > > >  #endif
> > > >
> > > > -ENTRY (BZERO_SYMBOL(__bzero, unaligned))
> > > > -#if VEC_SIZE > 16
> > > > -       BZERO_ZERO_VEC0 ()
> > > > -#endif
> > > > -       mov     %RDI_LP, %RAX_LP
> > > > -       mov     %RSI_LP, %RDX_LP
> > > > -#ifndef USE_LESS_VEC_MASK_STORE
> > > > -       xorl    %esi, %esi
> > > > -#endif
> > > > -       cmp     $VEC_SIZE, %RDX_LP
> > > > -       jb      L(less_vec_no_vdup)
> > > > -#ifdef USE_LESS_VEC_MASK_STORE
> > > > -       xorl    %esi, %esi
> > > > -#endif
> > > > -#if VEC_SIZE <= 16
> > > > -       BZERO_ZERO_VEC0 ()
> > > > -#endif
> > > > -       cmp     $(VEC_SIZE * 2), %RDX_LP
> > > > -       ja      L(more_2x_vec)
> > > > -       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> > > > -       VMOVU   %VEC(0), (%rdi)
> > > > -       VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> > > > -       VZEROUPPER_RETURN
> > > > -END (BZERO_SYMBOL(__bzero, unaligned))
> > > > -
> > > >  #if defined SHARED && IS_IN (libc)
> > > >  ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> > > >         cmp     %RDX_LP, %RCX_LP
> > > > @@ -216,31 +187,6 @@ END (__memset_erms)
> > > >  END (MEMSET_SYMBOL (__memset, erms))
> > > >  # endif
> > > >
> > > > -ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6)
> > > > -# if VEC_SIZE > 16
> > > > -       BZERO_ZERO_VEC0 ()
> > > > -# endif
> > > > -       mov     %RDI_LP, %RAX_LP
> > > > -       mov     %RSI_LP, %RDX_LP
> > > > -# ifndef USE_LESS_VEC_MASK_STORE
> > > > -       xorl    %esi, %esi
> > > > -# endif
> > > > -       cmp     $VEC_SIZE, %RDX_LP
> > > > -       jb      L(less_vec_no_vdup)
> > >
> > > This label 'L(less_vec_no_vdup)' is no longer used. Can you
> > > remove it from the rest of memset-vec-unaligned-erms.S
> > > > -# ifdef USE_LESS_VEC_MASK_STORE
> > > > -       xorl    %esi, %esi
> > > > -# endif
> > > > -# if VEC_SIZE <= 16
> > > > -       BZERO_ZERO_VEC0 ()
> > > > -# endif
> > > > -       cmp     $(VEC_SIZE * 2), %RDX_LP
> > > > -       ja      L(stosb_more_2x_vec)
> > > > -       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> > > > -       VMOVU   %VEC(0), (%rdi)
> > > > -       VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> > > > -       VZEROUPPER_RETURN
> > > > -END (BZERO_SYMBOL(__bzero, unaligned_erms))
> > > > -
> > > >  # if defined SHARED && IS_IN (libc)
> > > >  ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> > > >         cmp     %RDX_LP, %RCX_LP
> > > > --
> > > > 2.34.1
> > > >
> > >
> > > Im in favor of this.
> > >
> > > Should we keep the old method of just swapping params that was removed in:
> > >
> > > commit 3d9f171bfb5325bd5f427e9fc386453358c6e840
> > > Author: H.J. Lu <hjl.tools@gmail.com>
> > > Date:   Mon Feb 7 05:55:15 2022 -0800
> > >
> > >     x86-64: Optimize bzero
> > >
> > > ```
> > > - .section SECTION(.text),"ax",@progbits
> > > -#if VEC_SIZE == 16 && IS_IN (libc)
> > > -ENTRY (__bzero)
> > > - mov %RDI_LP, %RAX_LP /* Set return value.  */
> > > - mov %RSI_LP, %RDX_LP /* Set n.  */
> > > - xorl %esi, %esi
> > > - pxor %XMM0, %XMM0
> > > - jmp L(entry_from_bzero)
> > > -END (__bzero)
> > > -weak_alias (__bzero, bzero)
> > > ```
> > > ?
> > >
> > > Old software that hasn't been recompiled will have a bit of a regression
> > > going through the generic impl and it only costs 16 bytes of code.
> >
> > This old code hardcoded bzero to SSE2 memset.   The generic one
> > will use memset (0) which can use YMM or ZMM stores.  I don't think
> > there will be much of a regression.
>
> If the code has been recompiled. The point is binaries that haven't been
> recompiled since gcc/clang started replacing bzero -> memset(0) are going
> to get the generic implementation now.

The generic bzero calls memset (0), which is optimized by IFUNC.
Fangrui Song May 14, 2022, 11:51 p.m. UTC | #5
On 2022-05-13, Adhemerval Zanella via Libc-alpha wrote:
>Both symbols are marked as legacy in POSIX.1-2001 and removed on
>POSIX.1-2008, although the prototypes are defined for _GNU_SOURCE
>or _DEFAULT_SOURCE.
>
>GCC also replaces bcopy with a memmove and bzero with memset on default
>configuration (to actually get a bzero libc call the code requires
>to omit string.h inclusion and built with --fno-builtin), so it is
>highly unlikely programs are actually calling libc bzero symbol.

Typo: -fno-builtin

LGTM:)



Note: on arm64, Apple decided to optimize memset zero to bzero since 2021-03: https://reviews.llvm.org/D99358

>On a recent Linux distro (Ubuntu 22.04), there is no bzero calls
>by the installed binaries.
>
>  $ cat count_bstring.sh
>  #!/bin/bash
>
>  files=`IFS=':';for i in $PATH; do test -d "$i" && find "$i" -maxdepth 1 -executable -type f; done`
>  total=0
>  for file in $files; do
>    symbols=`objdump -R $file 2>&1`
>    if [ $? -eq 0 ]; then
>      ncalls=`echo $symbols | grep -w $1 | wc -l`
>      ((total=total+ncalls))
>      if [ $ncalls -gt 0 ]; then
>        echo "$file: $ncalls"
>      fi
>    fi
>  done
>  echo "TOTAL=$total"
>  $ ./count_bstring.sh bzero
>  TOTAL=0
>
>Checked on x86_64-linux-gnu.
>---
> sysdeps/x86_64/bzero.S                        |   1 -
> sysdeps/x86_64/memset.S                       |  10 +-
> sysdeps/x86_64/multiarch/Makefile             |   1 -
> sysdeps/x86_64/multiarch/bzero.c              | 106 ------------------
> sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  42 -------
> .../memset-avx2-unaligned-erms-rtm.S          |   1 -
> .../multiarch/memset-avx2-unaligned-erms.S    |   6 -
> .../multiarch/memset-avx512-unaligned-erms.S  |   3 -
> .../multiarch/memset-evex-unaligned-erms.S    |   3 -
> .../multiarch/memset-sse2-unaligned-erms.S    |   1 -
> .../multiarch/memset-vec-unaligned-erms.S     |  56 +--------
> 11 files changed, 2 insertions(+), 228 deletions(-)
> delete mode 100644 sysdeps/x86_64/bzero.S
> delete mode 100644 sysdeps/x86_64/multiarch/bzero.c
>
>diff --git a/sysdeps/x86_64/bzero.S b/sysdeps/x86_64/bzero.S
>deleted file mode 100644
>index f96d567fd8..0000000000
>--- a/sysdeps/x86_64/bzero.S
>+++ /dev/null
>@@ -1 +0,0 @@
>-/* Implemented in memset.S.  */
>diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
>index af26e9cedc..a6eea61a4d 100644
>--- a/sysdeps/x86_64/memset.S
>+++ b/sysdeps/x86_64/memset.S
>@@ -1,4 +1,4 @@
>-/* memset/bzero -- set memory area to CH/0
>+/* memset -- set memory area to CH/0
>    Optimized version for x86-64.
>    Copyright (C) 2002-2022 Free Software Foundation, Inc.
>    This file is part of the GNU C Library.
>@@ -35,9 +35,6 @@
>   punpcklwd %xmm0, %xmm0; \
>   pshufd $0, %xmm0, %xmm0
>
>-# define BZERO_ZERO_VEC0() \
>-  pxor %xmm0, %xmm0
>-
> # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>   movd d, %xmm0; \
>   pshufd $0, %xmm0, %xmm0; \
>@@ -56,10 +53,6 @@
> # define MEMSET_SYMBOL(p,s)	memset
> #endif
>
>-#ifndef BZERO_SYMBOL
>-# define BZERO_SYMBOL(p,s)	__bzero
>-#endif
>-
> #ifndef WMEMSET_SYMBOL
> # define WMEMSET_CHK_SYMBOL(p,s) p
> # define WMEMSET_SYMBOL(p,s)	__wmemset
>@@ -70,7 +63,6 @@
> libc_hidden_builtin_def (memset)
>
> #if IS_IN (libc)
>-weak_alias (__bzero, bzero)
> libc_hidden_def (__wmemset)
> weak_alias (__wmemset, wmemset)
> libc_hidden_weak (wmemset)
>diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
>index 0400ea332b..f3ab5e0928 100644
>--- a/sysdeps/x86_64/multiarch/Makefile
>+++ b/sysdeps/x86_64/multiarch/Makefile
>@@ -1,7 +1,6 @@
> ifeq ($(subdir),string)
>
> sysdep_routines += \
>-  bzero \
>   memchr-avx2 \
>   memchr-avx2-rtm \
>   memchr-evex \
>diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
>deleted file mode 100644
>index 58a14b2c33..0000000000
>--- a/sysdeps/x86_64/multiarch/bzero.c
>+++ /dev/null
>@@ -1,106 +0,0 @@
>-/* Multiple versions of bzero.
>-   All versions must be listed in ifunc-impl-list.c.
>-   Copyright (C) 2022 Free Software Foundation, Inc.
>-   This file is part of the GNU C Library.
>-
>-   The GNU C Library is free software; you can redistribute it and/or
>-   modify it under the terms of the GNU Lesser General Public
>-   License as published by the Free Software Foundation; either
>-   version 2.1 of the License, or (at your option) any later version.
>-
>-   The GNU C Library is distributed in the hope that it will be useful,
>-   but WITHOUT ANY WARRANTY; without even the implied warranty of
>-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>-   Lesser General Public License for more details.
>-
>-   You should have received a copy of the GNU Lesser General Public
>-   License along with the GNU C Library; if not, see
>-   <https://www.gnu.org/licenses/>.  */
>-
>-/* Define multiple versions only for the definition in libc.  */
>-#if IS_IN (libc)
>-# define __bzero __redirect___bzero
>-# include <string.h>
>-# undef __bzero
>-
>-# define SYMBOL_NAME __bzero
>-# include <init-arch.h>
>-
>-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned)
>-  attribute_hidden;
>-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms)
>-  attribute_hidden;
>-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden;
>-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms)
>-  attribute_hidden;
>-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm)
>-  attribute_hidden;
>-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm)
>-  attribute_hidden;
>-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned)
>-  attribute_hidden;
>-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms)
>-  attribute_hidden;
>-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned)
>-  attribute_hidden;
>-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms)
>-  attribute_hidden;
>-
>-static inline void *
>-IFUNC_SELECTOR (void)
>-{
>-  const struct cpu_features* cpu_features = __get_cpu_features ();
>-
>-  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
>-      && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
>-    {
>-      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
>-          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
>-          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
>-	{
>-	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
>-	    return OPTIMIZE1 (avx512_unaligned_erms);
>-
>-	  return OPTIMIZE1 (avx512_unaligned);
>-	}
>-    }
>-
>-  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
>-    {
>-      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
>-          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
>-          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
>-	{
>-	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
>-	    return OPTIMIZE1 (evex_unaligned_erms);
>-
>-	  return OPTIMIZE1 (evex_unaligned);
>-	}
>-
>-      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
>-	{
>-	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
>-	    return OPTIMIZE1 (avx2_unaligned_erms_rtm);
>-
>-	  return OPTIMIZE1 (avx2_unaligned_rtm);
>-	}
>-
>-      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
>-	{
>-	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
>-	    return OPTIMIZE1 (avx2_unaligned_erms);
>-
>-	  return OPTIMIZE1 (avx2_unaligned);
>-	}
>-    }
>-
>-  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
>-    return OPTIMIZE1 (sse2_unaligned_erms);
>-
>-  return OPTIMIZE1 (sse2_unaligned);
>-}
>-
>-libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ());
>-
>-weak_alias (__bzero, bzero)
>-#endif
>diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>index a8afcf81bb..7218095430 100644
>--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>@@ -291,48 +291,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> 			      __memset_avx512_no_vzeroupper)
> 	     )
>
>-  /* Support sysdeps/x86_64/multiarch/bzero.c.  */
>-  IFUNC_IMPL (i, name, bzero,
>-	      IFUNC_IMPL_ADD (array, i, bzero, 1,
>-			      __bzero_sse2_unaligned)
>-	      IFUNC_IMPL_ADD (array, i, bzero, 1,
>-			      __bzero_sse2_unaligned_erms)
>-	      IFUNC_IMPL_ADD (array, i, bzero,
>-			      CPU_FEATURE_USABLE (AVX2),
>-			      __bzero_avx2_unaligned)
>-	      IFUNC_IMPL_ADD (array, i, bzero,
>-			      CPU_FEATURE_USABLE (AVX2),
>-			      __bzero_avx2_unaligned_erms)
>-	      IFUNC_IMPL_ADD (array, i, bzero,
>-			      (CPU_FEATURE_USABLE (AVX2)
>-			       && CPU_FEATURE_USABLE (RTM)),
>-			      __bzero_avx2_unaligned_rtm)
>-	      IFUNC_IMPL_ADD (array, i, bzero,
>-			      (CPU_FEATURE_USABLE (AVX2)
>-			       && CPU_FEATURE_USABLE (RTM)),
>-			      __bzero_avx2_unaligned_erms_rtm)
>-	      IFUNC_IMPL_ADD (array, i, bzero,
>-			      (CPU_FEATURE_USABLE (AVX512VL)
>-			       && CPU_FEATURE_USABLE (AVX512BW)
>-			       && CPU_FEATURE_USABLE (BMI2)),
>-			      __bzero_evex_unaligned)
>-	      IFUNC_IMPL_ADD (array, i, bzero,
>-			      (CPU_FEATURE_USABLE (AVX512VL)
>-			       && CPU_FEATURE_USABLE (AVX512BW)
>-			       && CPU_FEATURE_USABLE (BMI2)),
>-			      __bzero_evex_unaligned_erms)
>-	      IFUNC_IMPL_ADD (array, i, bzero,
>-			      (CPU_FEATURE_USABLE (AVX512VL)
>-			       && CPU_FEATURE_USABLE (AVX512BW)
>-			       && CPU_FEATURE_USABLE (BMI2)),
>-			      __bzero_avx512_unaligned_erms)
>-	      IFUNC_IMPL_ADD (array, i, bzero,
>-			      (CPU_FEATURE_USABLE (AVX512VL)
>-			       && CPU_FEATURE_USABLE (AVX512BW)
>-			       && CPU_FEATURE_USABLE (BMI2)),
>-			      __bzero_avx512_unaligned)
>-	     )
>-
>   /* Support sysdeps/x86_64/multiarch/rawmemchr.c.  */
>   IFUNC_IMPL (i, name, rawmemchr,
> 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
>diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
>index 5a5ee6f672..8ac3e479bb 100644
>--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
>+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
>@@ -5,7 +5,6 @@
>
> #define SECTION(p) p##.avx.rtm
> #define MEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
>-#define BZERO_SYMBOL(p,s)	p##_avx2_##s##_rtm
> #define WMEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
>
> #include "memset-avx2-unaligned-erms.S"
>diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
>index a093a2831f..c0bf2875d0 100644
>--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
>+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
>@@ -14,9 +14,6 @@
>   vmovd d, %xmm0; \
>   movq r, %rax;
>
>-# define BZERO_ZERO_VEC0() \
>-  vpxor %xmm0, %xmm0, %xmm0
>-
> # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>   MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
>
>@@ -32,9 +29,6 @@
> # ifndef MEMSET_SYMBOL
> #  define MEMSET_SYMBOL(p,s)	p##_avx2_##s
> # endif
>-# ifndef BZERO_SYMBOL
>-#  define BZERO_SYMBOL(p,s)	p##_avx2_##s
>-# endif
> # ifndef WMEMSET_SYMBOL
> #  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
> # endif
>diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
>index 727c92133a..5241216a77 100644
>--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
>+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
>@@ -19,9 +19,6 @@
>   vpbroadcastb d, %VEC0; \
>   movq r, %rax
>
>-# define BZERO_ZERO_VEC0() \
>-  vpxorq %XMM0, %XMM0, %XMM0
>-
> # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>   vpbroadcastd d, %VEC0; \
>   movq r, %rax
>diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
>index 5d8fa78f05..6370021506 100644
>--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
>+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
>@@ -19,9 +19,6 @@
>   vpbroadcastb d, %VEC0; \
>   movq r, %rax
>
>-# define BZERO_ZERO_VEC0() \
>-  vpxorq %XMM0, %XMM0, %XMM0
>-
> # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>   vpbroadcastd d, %VEC0; \
>   movq r, %rax
>diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
>index d52d170804..3d92f6993a 100644
>--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
>+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
>@@ -22,7 +22,6 @@
>
> #if IS_IN (libc)
> # define MEMSET_SYMBOL(p,s)	p##_sse2_##s
>-# define BZERO_SYMBOL(p,s)	MEMSET_SYMBOL (p, s)
> # define WMEMSET_SYMBOL(p,s)	p##_sse2_##s
>
> # ifdef SHARED
>diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
>index 785fee1d57..aa78fbb620 100644
>--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
>+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
>@@ -1,4 +1,4 @@
>-/* memset/bzero with unaligned store and rep stosb
>+/* memset with unaligned store and rep stosb
>    Copyright (C) 2016-2022 Free Software Foundation, Inc.
>    This file is part of the GNU C Library.
>
>@@ -26,10 +26,6 @@
>
> #include <sysdep.h>
>
>-#ifndef BZERO_SYMBOL
>-# define BZERO_SYMBOL(p,s)		MEMSET_SYMBOL (p, s)
>-#endif
>-
> #ifndef MEMSET_CHK_SYMBOL
> # define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
> #endif
>@@ -134,31 +130,6 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> END (WMEMSET_SYMBOL (__wmemset, unaligned))
> #endif
>
>-ENTRY (BZERO_SYMBOL(__bzero, unaligned))
>-#if VEC_SIZE > 16
>-	BZERO_ZERO_VEC0 ()
>-#endif
>-	mov	%RDI_LP, %RAX_LP
>-	mov	%RSI_LP, %RDX_LP
>-#ifndef USE_LESS_VEC_MASK_STORE
>-	xorl	%esi, %esi
>-#endif
>-	cmp	$VEC_SIZE, %RDX_LP
>-	jb	L(less_vec_no_vdup)
>-#ifdef USE_LESS_VEC_MASK_STORE
>-	xorl	%esi, %esi
>-#endif
>-#if VEC_SIZE <= 16
>-	BZERO_ZERO_VEC0 ()
>-#endif
>-	cmp	$(VEC_SIZE * 2), %RDX_LP
>-	ja	L(more_2x_vec)
>-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
>-	VMOVU	%VEC(0), (%rdi)
>-	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
>-	VZEROUPPER_RETURN
>-END (BZERO_SYMBOL(__bzero, unaligned))
>-
> #if defined SHARED && IS_IN (libc)
> ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> 	cmp	%RDX_LP, %RCX_LP
>@@ -216,31 +187,6 @@ END (__memset_erms)
> END (MEMSET_SYMBOL (__memset, erms))
> # endif
>
>-ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6)
>-# if VEC_SIZE > 16
>-	BZERO_ZERO_VEC0 ()
>-# endif
>-	mov	%RDI_LP, %RAX_LP
>-	mov	%RSI_LP, %RDX_LP
>-# ifndef USE_LESS_VEC_MASK_STORE
>-	xorl	%esi, %esi
>-# endif
>-	cmp	$VEC_SIZE, %RDX_LP
>-	jb	L(less_vec_no_vdup)
>-# ifdef USE_LESS_VEC_MASK_STORE
>-	xorl	%esi, %esi
>-# endif
>-# if VEC_SIZE <= 16
>-	BZERO_ZERO_VEC0 ()
>-# endif
>-	cmp	$(VEC_SIZE * 2), %RDX_LP
>-	ja	L(stosb_more_2x_vec)
>-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
>-	VMOVU	%VEC(0), (%rdi)
>-	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
>-	VZEROUPPER_RETURN
>-END (BZERO_SYMBOL(__bzero, unaligned_erms))
>-
> # if defined SHARED && IS_IN (libc)
> ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> 	cmp	%RDX_LP, %RCX_LP
>-- 
>2.34.1
>
Adhemerval Zanella Netto May 16, 2022, 12:35 p.m. UTC | #6
On 14/05/2022 20:51, Fangrui Song wrote:
> On 2022-05-13, Adhemerval Zanella via Libc-alpha wrote:
>> Both symbols are marked as legacy in POSIX.1-2001 and removed on
>> POSIX.1-2008, although the prototypes are defined for _GNU_SOURCE
>> or _DEFAULT_SOURCE.
>>
>> GCC also replaces bcopy with a memmove and bzero with memset on default
>> configuration (to actually get a bzero libc call the code requires
>> to omit string.h inclusion and built with --fno-builtin), so it is
>> highly unlikely programs are actually calling libc bzero symbol.
> 
> Typo: -fno-builtin
> 
> LGTM:)

Thanks.

> 
> 
> 
> Note: on arm64, Apple decided to optimize memset zero to bzero since 2021-03: https://reviews.llvm.org/D99358

We discussed it back in February [1] and Wilco has summarized why GCC has
a preference to avoid calls to bzero, bcopy, and mempcpy:

  GCC/LLVM avoid calls to bzero, bcmp, bcopy and mempcpy. The goal is to
  avoid unnecessary proliferation of almost identical string functions where
  there is absolutely no performance benefit. In fact this duplication is typically
  a net loss, partly due to reducing optimization effort and the extra runtime
  overhead due to having multiple copies of the same function in caches and
  predictors.

Wilco has also suggested that although memset with zero is the most common
pattern, it is really doubtful that the micro-optimization is worthwhile on
modern cores, and I agree with him. I am not sure how the Darwin libc is
organized and whether its bzero yields any performance gain, but from the
bug report it seems it was aimed at reducing code size.

[1] https://sourceware.org/pipermail/libc-alpha/2022-February/136359.html
[2] https://sourceware.org/pipermail/libc-alpha/2022-February/136374.html

> 
>> On a recent Linux distro (Ubuntu 22.04), there is no bzero calls
>> by the installed binaries.
>>
>>  $ cat count_bstring.sh
>>  #!/bin/bash
>>
>>  files=`IFS=':';for i in $PATH; do test -d "$i" && find "$i" -maxdepth 1 -executable -type f; done`
>>  total=0
>>  for file in $files; do
>>    symbols=`objdump -R $file 2>&1`
>>    if [ $? -eq 0 ]; then
>>      ncalls=`echo $symbols | grep -w $1 | wc -l`
>>      ((total=total+ncalls))
>>      if [ $ncalls -gt 0 ]; then
>>        echo "$file: $ncalls"
>>      fi
>>    fi
>>  done
>>  echo "TOTAL=$total"
>>  $ ./count_bstring.sh bzero
>>  TOTAL=0
>>
>> Checked on x86_64-linux-gnu.
>> ---
>> sysdeps/x86_64/bzero.S                        |   1 -
>> sysdeps/x86_64/memset.S                       |  10 +-
>> sysdeps/x86_64/multiarch/Makefile             |   1 -
>> sysdeps/x86_64/multiarch/bzero.c              | 106 ------------------
>> sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  42 -------
>> .../memset-avx2-unaligned-erms-rtm.S          |   1 -
>> .../multiarch/memset-avx2-unaligned-erms.S    |   6 -
>> .../multiarch/memset-avx512-unaligned-erms.S  |   3 -
>> .../multiarch/memset-evex-unaligned-erms.S    |   3 -
>> .../multiarch/memset-sse2-unaligned-erms.S    |   1 -
>> .../multiarch/memset-vec-unaligned-erms.S     |  56 +--------
>> 11 files changed, 2 insertions(+), 228 deletions(-)
>> delete mode 100644 sysdeps/x86_64/bzero.S
>> delete mode 100644 sysdeps/x86_64/multiarch/bzero.c
>>
>> diff --git a/sysdeps/x86_64/bzero.S b/sysdeps/x86_64/bzero.S
>> deleted file mode 100644
>> index f96d567fd8..0000000000
>> --- a/sysdeps/x86_64/bzero.S
>> +++ /dev/null
>> @@ -1 +0,0 @@
>> -/* Implemented in memset.S.  */
>> diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
>> index af26e9cedc..a6eea61a4d 100644
>> --- a/sysdeps/x86_64/memset.S
>> +++ b/sysdeps/x86_64/memset.S
>> @@ -1,4 +1,4 @@
>> -/* memset/bzero -- set memory area to CH/0
>> +/* memset -- set memory area to CH/0
>>    Optimized version for x86-64.
>>    Copyright (C) 2002-2022 Free Software Foundation, Inc.
>>    This file is part of the GNU C Library.
>> @@ -35,9 +35,6 @@
>>   punpcklwd %xmm0, %xmm0; \
>>   pshufd $0, %xmm0, %xmm0
>>
>> -# define BZERO_ZERO_VEC0() \
>> -  pxor %xmm0, %xmm0
>> -
>> # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>>   movd d, %xmm0; \
>>   pshufd $0, %xmm0, %xmm0; \
>> @@ -56,10 +53,6 @@
>> # define MEMSET_SYMBOL(p,s)    memset
>> #endif
>>
>> -#ifndef BZERO_SYMBOL
>> -# define BZERO_SYMBOL(p,s)    __bzero
>> -#endif
>> -
>> #ifndef WMEMSET_SYMBOL
>> # define WMEMSET_CHK_SYMBOL(p,s) p
>> # define WMEMSET_SYMBOL(p,s)    __wmemset
>> @@ -70,7 +63,6 @@
>> libc_hidden_builtin_def (memset)
>>
>> #if IS_IN (libc)
>> -weak_alias (__bzero, bzero)
>> libc_hidden_def (__wmemset)
>> weak_alias (__wmemset, wmemset)
>> libc_hidden_weak (wmemset)
>> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
>> index 0400ea332b..f3ab5e0928 100644
>> --- a/sysdeps/x86_64/multiarch/Makefile
>> +++ b/sysdeps/x86_64/multiarch/Makefile
>> @@ -1,7 +1,6 @@
>> ifeq ($(subdir),string)
>>
>> sysdep_routines += \
>> -  bzero \
>>   memchr-avx2 \
>>   memchr-avx2-rtm \
>>   memchr-evex \
>> diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
>> deleted file mode 100644
>> index 58a14b2c33..0000000000
>> --- a/sysdeps/x86_64/multiarch/bzero.c
>> +++ /dev/null
>> @@ -1,106 +0,0 @@
>> -/* Multiple versions of bzero.
>> -   All versions must be listed in ifunc-impl-list.c.
>> -   Copyright (C) 2022 Free Software Foundation, Inc.
>> -   This file is part of the GNU C Library.
>> -
>> -   The GNU C Library is free software; you can redistribute it and/or
>> -   modify it under the terms of the GNU Lesser General Public
>> -   License as published by the Free Software Foundation; either
>> -   version 2.1 of the License, or (at your option) any later version.
>> -
>> -   The GNU C Library is distributed in the hope that it will be useful,
>> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> -   Lesser General Public License for more details.
>> -
>> -   You should have received a copy of the GNU Lesser General Public
>> -   License along with the GNU C Library; if not, see
>> -   <https://www.gnu.org/licenses/>.  */
>> -
>> -/* Define multiple versions only for the definition in libc.  */
>> -#if IS_IN (libc)
>> -# define __bzero __redirect___bzero
>> -# include <string.h>
>> -# undef __bzero
>> -
>> -# define SYMBOL_NAME __bzero
>> -# include <init-arch.h>
>> -
>> -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned)
>> -  attribute_hidden;
>> -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms)
>> -  attribute_hidden;
>> -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden;
>> -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms)
>> -  attribute_hidden;
>> -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm)
>> -  attribute_hidden;
>> -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm)
>> -  attribute_hidden;
>> -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned)
>> -  attribute_hidden;
>> -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms)
>> -  attribute_hidden;
>> -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned)
>> -  attribute_hidden;
>> -extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms)
>> -  attribute_hidden;
>> -
>> -static inline void *
>> -IFUNC_SELECTOR (void)
>> -{
>> -  const struct cpu_features* cpu_features = __get_cpu_features ();
>> -
>> -  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
>> -      && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
>> -    {
>> -      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
>> -          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
>> -          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
>> -    {
>> -      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
>> -        return OPTIMIZE1 (avx512_unaligned_erms);
>> -
>> -      return OPTIMIZE1 (avx512_unaligned);
>> -    }
>> -    }
>> -
>> -  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
>> -    {
>> -      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
>> -          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
>> -          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
>> -    {
>> -      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
>> -        return OPTIMIZE1 (evex_unaligned_erms);
>> -
>> -      return OPTIMIZE1 (evex_unaligned);
>> -    }
>> -
>> -      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
>> -    {
>> -      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
>> -        return OPTIMIZE1 (avx2_unaligned_erms_rtm);
>> -
>> -      return OPTIMIZE1 (avx2_unaligned_rtm);
>> -    }
>> -
>> -      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
>> -    {
>> -      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
>> -        return OPTIMIZE1 (avx2_unaligned_erms);
>> -
>> -      return OPTIMIZE1 (avx2_unaligned);
>> -    }
>> -    }
>> -
>> -  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
>> -    return OPTIMIZE1 (sse2_unaligned_erms);
>> -
>> -  return OPTIMIZE1 (sse2_unaligned);
>> -}
>> -
>> -libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ());
>> -
>> -weak_alias (__bzero, bzero)
>> -#endif
>> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> index a8afcf81bb..7218095430 100644
>> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> @@ -291,48 +291,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>>                   __memset_avx512_no_vzeroupper)
>>          )
>>
>> -  /* Support sysdeps/x86_64/multiarch/bzero.c.  */
>> -  IFUNC_IMPL (i, name, bzero,
>> -          IFUNC_IMPL_ADD (array, i, bzero, 1,
>> -                  __bzero_sse2_unaligned)
>> -          IFUNC_IMPL_ADD (array, i, bzero, 1,
>> -                  __bzero_sse2_unaligned_erms)
>> -          IFUNC_IMPL_ADD (array, i, bzero,
>> -                  CPU_FEATURE_USABLE (AVX2),
>> -                  __bzero_avx2_unaligned)
>> -          IFUNC_IMPL_ADD (array, i, bzero,
>> -                  CPU_FEATURE_USABLE (AVX2),
>> -                  __bzero_avx2_unaligned_erms)
>> -          IFUNC_IMPL_ADD (array, i, bzero,
>> -                  (CPU_FEATURE_USABLE (AVX2)
>> -                   && CPU_FEATURE_USABLE (RTM)),
>> -                  __bzero_avx2_unaligned_rtm)
>> -          IFUNC_IMPL_ADD (array, i, bzero,
>> -                  (CPU_FEATURE_USABLE (AVX2)
>> -                   && CPU_FEATURE_USABLE (RTM)),
>> -                  __bzero_avx2_unaligned_erms_rtm)
>> -          IFUNC_IMPL_ADD (array, i, bzero,
>> -                  (CPU_FEATURE_USABLE (AVX512VL)
>> -                   && CPU_FEATURE_USABLE (AVX512BW)
>> -                   && CPU_FEATURE_USABLE (BMI2)),
>> -                  __bzero_evex_unaligned)
>> -          IFUNC_IMPL_ADD (array, i, bzero,
>> -                  (CPU_FEATURE_USABLE (AVX512VL)
>> -                   && CPU_FEATURE_USABLE (AVX512BW)
>> -                   && CPU_FEATURE_USABLE (BMI2)),
>> -                  __bzero_evex_unaligned_erms)
>> -          IFUNC_IMPL_ADD (array, i, bzero,
>> -                  (CPU_FEATURE_USABLE (AVX512VL)
>> -                   && CPU_FEATURE_USABLE (AVX512BW)
>> -                   && CPU_FEATURE_USABLE (BMI2)),
>> -                  __bzero_avx512_unaligned_erms)
>> -          IFUNC_IMPL_ADD (array, i, bzero,
>> -                  (CPU_FEATURE_USABLE (AVX512VL)
>> -                   && CPU_FEATURE_USABLE (AVX512BW)
>> -                   && CPU_FEATURE_USABLE (BMI2)),
>> -                  __bzero_avx512_unaligned)
>> -         )
>> -
>>   /* Support sysdeps/x86_64/multiarch/rawmemchr.c.  */
>>   IFUNC_IMPL (i, name, rawmemchr,
>>           IFUNC_IMPL_ADD (array, i, rawmemchr,
>> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
>> index 5a5ee6f672..8ac3e479bb 100644
>> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
>> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
>> @@ -5,7 +5,6 @@
>>
>> #define SECTION(p) p##.avx.rtm
>> #define MEMSET_SYMBOL(p,s)    p##_avx2_##s##_rtm
>> -#define BZERO_SYMBOL(p,s)    p##_avx2_##s##_rtm
>> #define WMEMSET_SYMBOL(p,s)    p##_avx2_##s##_rtm
>>
>> #include "memset-avx2-unaligned-erms.S"
>> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
>> index a093a2831f..c0bf2875d0 100644
>> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
>> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
>> @@ -14,9 +14,6 @@
>>   vmovd d, %xmm0; \
>>   movq r, %rax;
>>
>> -# define BZERO_ZERO_VEC0() \
>> -  vpxor %xmm0, %xmm0, %xmm0
>> -
>> # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>>   MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
>>
>> @@ -32,9 +29,6 @@
>> # ifndef MEMSET_SYMBOL
>> #  define MEMSET_SYMBOL(p,s)    p##_avx2_##s
>> # endif
>> -# ifndef BZERO_SYMBOL
>> -#  define BZERO_SYMBOL(p,s)    p##_avx2_##s
>> -# endif
>> # ifndef WMEMSET_SYMBOL
>> #  define WMEMSET_SYMBOL(p,s)    p##_avx2_##s
>> # endif
>> diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
>> index 727c92133a..5241216a77 100644
>> --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
>> +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
>> @@ -19,9 +19,6 @@
>>   vpbroadcastb d, %VEC0; \
>>   movq r, %rax
>>
>> -# define BZERO_ZERO_VEC0() \
>> -  vpxorq %XMM0, %XMM0, %XMM0
>> -
>> # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>>   vpbroadcastd d, %VEC0; \
>>   movq r, %rax
>> diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
>> index 5d8fa78f05..6370021506 100644
>> --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
>> +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
>> @@ -19,9 +19,6 @@
>>   vpbroadcastb d, %VEC0; \
>>   movq r, %rax
>>
>> -# define BZERO_ZERO_VEC0() \
>> -  vpxorq %XMM0, %XMM0, %XMM0
>> -
>> # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>>   vpbroadcastd d, %VEC0; \
>>   movq r, %rax
>> diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
>> index d52d170804..3d92f6993a 100644
>> --- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
>> +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
>> @@ -22,7 +22,6 @@
>>
>> #if IS_IN (libc)
>> # define MEMSET_SYMBOL(p,s)    p##_sse2_##s
>> -# define BZERO_SYMBOL(p,s)    MEMSET_SYMBOL (p, s)
>> # define WMEMSET_SYMBOL(p,s)    p##_sse2_##s
>>
>> # ifdef SHARED
>> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
>> index 785fee1d57..aa78fbb620 100644
>> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
>> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
>> @@ -1,4 +1,4 @@
>> -/* memset/bzero with unaligned store and rep stosb
>> +/* memset with unaligned store and rep stosb
>>    Copyright (C) 2016-2022 Free Software Foundation, Inc.
>>    This file is part of the GNU C Library.
>>
>> @@ -26,10 +26,6 @@
>>
>> #include <sysdep.h>
>>
>> -#ifndef BZERO_SYMBOL
>> -# define BZERO_SYMBOL(p,s)        MEMSET_SYMBOL (p, s)
>> -#endif
>> -
>> #ifndef MEMSET_CHK_SYMBOL
>> # define MEMSET_CHK_SYMBOL(p,s)        MEMSET_SYMBOL(p, s)
>> #endif
>> @@ -134,31 +130,6 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
>> END (WMEMSET_SYMBOL (__wmemset, unaligned))
>> #endif
>>
>> -ENTRY (BZERO_SYMBOL(__bzero, unaligned))
>> -#if VEC_SIZE > 16
>> -    BZERO_ZERO_VEC0 ()
>> -#endif
>> -    mov    %RDI_LP, %RAX_LP
>> -    mov    %RSI_LP, %RDX_LP
>> -#ifndef USE_LESS_VEC_MASK_STORE
>> -    xorl    %esi, %esi
>> -#endif
>> -    cmp    $VEC_SIZE, %RDX_LP
>> -    jb    L(less_vec_no_vdup)
>> -#ifdef USE_LESS_VEC_MASK_STORE
>> -    xorl    %esi, %esi
>> -#endif
>> -#if VEC_SIZE <= 16
>> -    BZERO_ZERO_VEC0 ()
>> -#endif
>> -    cmp    $(VEC_SIZE * 2), %RDX_LP
>> -    ja    L(more_2x_vec)
>> -    /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
>> -    VMOVU    %VEC(0), (%rdi)
>> -    VMOVU    %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
>> -    VZEROUPPER_RETURN
>> -END (BZERO_SYMBOL(__bzero, unaligned))
>> -
>> #if defined SHARED && IS_IN (libc)
>> ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
>>     cmp    %RDX_LP, %RCX_LP
>> @@ -216,31 +187,6 @@ END (__memset_erms)
>> END (MEMSET_SYMBOL (__memset, erms))
>> # endif
>>
>> -ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6)
>> -# if VEC_SIZE > 16
>> -    BZERO_ZERO_VEC0 ()
>> -# endif
>> -    mov    %RDI_LP, %RAX_LP
>> -    mov    %RSI_LP, %RDX_LP
>> -# ifndef USE_LESS_VEC_MASK_STORE
>> -    xorl    %esi, %esi
>> -# endif
>> -    cmp    $VEC_SIZE, %RDX_LP
>> -    jb    L(less_vec_no_vdup)
>> -# ifdef USE_LESS_VEC_MASK_STORE
>> -    xorl    %esi, %esi
>> -# endif
>> -# if VEC_SIZE <= 16
>> -    BZERO_ZERO_VEC0 ()
>> -# endif
>> -    cmp    $(VEC_SIZE * 2), %RDX_LP
>> -    ja    L(stosb_more_2x_vec)
>> -    /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
>> -    VMOVU    %VEC(0), (%rdi)
>> -    VMOVU    %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
>> -    VZEROUPPER_RETURN
>> -END (BZERO_SYMBOL(__bzero, unaligned_erms))
>> -
>> # if defined SHARED && IS_IN (libc)
>> ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
>>     cmp    %RDX_LP, %RCX_LP
>> -- 
>> 2.34.1
>>
Wilco Dijkstra May 16, 2022, 1:29 p.m. UTC | #7
Hi,

> And also Wilco has suggested that although memset with zero is the most common
> pattern, it is really doubtful that the micro-optimziation is worth on modern
> cores and I agree with him. Not sure how is darwin libc organized and if whether
> its bzero does yield any performance gain, but bug report it seems it was 
> aimed to reduce code size.

Yes, about 90% of memset calls are zero on average. The statistics vary a lot per
application, 50% non-zero and 99% non-zero happen too, so it is not nearly skewed
enough that you only ever do zero memsets in most applications.

And the codesize impact is marginal - in SPEC2017 the benefit would be about 0.05%
on AArch64. These kinds of gains may be worth it in tiny microcontrollers where every
byte counts, but it's a drop in the ocean in big Linux systems.

Cheers,
Wilco
diff mbox series

Patch

diff --git a/sysdeps/x86_64/bzero.S b/sysdeps/x86_64/bzero.S
deleted file mode 100644
index f96d567fd8..0000000000
--- a/sysdeps/x86_64/bzero.S
+++ /dev/null
@@ -1 +0,0 @@ 
-/* Implemented in memset.S.  */
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index af26e9cedc..a6eea61a4d 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -1,4 +1,4 @@ 
-/* memset/bzero -- set memory area to CH/0
+/* memset -- set memory area to CH/0
    Optimized version for x86-64.
    Copyright (C) 2002-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
@@ -35,9 +35,6 @@ 
   punpcklwd %xmm0, %xmm0; \
   pshufd $0, %xmm0, %xmm0
 
-# define BZERO_ZERO_VEC0() \
-  pxor %xmm0, %xmm0
-
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
   pshufd $0, %xmm0, %xmm0; \
@@ -56,10 +53,6 @@ 
 # define MEMSET_SYMBOL(p,s)	memset
 #endif
 
-#ifndef BZERO_SYMBOL
-# define BZERO_SYMBOL(p,s)	__bzero
-#endif
-
 #ifndef WMEMSET_SYMBOL
 # define WMEMSET_CHK_SYMBOL(p,s) p
 # define WMEMSET_SYMBOL(p,s)	__wmemset
@@ -70,7 +63,6 @@ 
 libc_hidden_builtin_def (memset)
 
 #if IS_IN (libc)
-weak_alias (__bzero, bzero)
 libc_hidden_def (__wmemset)
 weak_alias (__wmemset, wmemset)
 libc_hidden_weak (wmemset)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 0400ea332b..f3ab5e0928 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -1,7 +1,6 @@ 
 ifeq ($(subdir),string)
 
 sysdep_routines += \
-  bzero \
   memchr-avx2 \
   memchr-avx2-rtm \
   memchr-evex \
diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
deleted file mode 100644
index 58a14b2c33..0000000000
--- a/sysdeps/x86_64/multiarch/bzero.c
+++ /dev/null
@@ -1,106 +0,0 @@ 
-/* Multiple versions of bzero.
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-/* Define multiple versions only for the definition in libc.  */
-#if IS_IN (libc)
-# define __bzero __redirect___bzero
-# include <string.h>
-# undef __bzero
-
-# define SYMBOL_NAME __bzero
-# include <init-arch.h>
-
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned)
-  attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms)
-  attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms)
-  attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm)
-  attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm)
-  attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned)
-  attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms)
-  attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned)
-  attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms)
-  attribute_hidden;
-
-static inline void *
-IFUNC_SELECTOR (void)
-{
-  const struct cpu_features* cpu_features = __get_cpu_features ();
-
-  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
-      && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
-    {
-      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
-          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
-          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
-	{
-	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
-	    return OPTIMIZE1 (avx512_unaligned_erms);
-
-	  return OPTIMIZE1 (avx512_unaligned);
-	}
-    }
-
-  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
-    {
-      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
-          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
-          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
-	{
-	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
-	    return OPTIMIZE1 (evex_unaligned_erms);
-
-	  return OPTIMIZE1 (evex_unaligned);
-	}
-
-      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
-	{
-	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
-	    return OPTIMIZE1 (avx2_unaligned_erms_rtm);
-
-	  return OPTIMIZE1 (avx2_unaligned_rtm);
-	}
-
-      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
-	{
-	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
-	    return OPTIMIZE1 (avx2_unaligned_erms);
-
-	  return OPTIMIZE1 (avx2_unaligned);
-	}
-    }
-
-  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
-    return OPTIMIZE1 (sse2_unaligned_erms);
-
-  return OPTIMIZE1 (sse2_unaligned);
-}
-
-libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ());
-
-weak_alias (__bzero, bzero)
-#endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a8afcf81bb..7218095430 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -291,48 +291,6 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memset_avx512_no_vzeroupper)
 	     )
 
-  /* Support sysdeps/x86_64/multiarch/bzero.c.  */
-  IFUNC_IMPL (i, name, bzero,
-	      IFUNC_IMPL_ADD (array, i, bzero, 1,
-			      __bzero_sse2_unaligned)
-	      IFUNC_IMPL_ADD (array, i, bzero, 1,
-			      __bzero_sse2_unaligned_erms)
-	      IFUNC_IMPL_ADD (array, i, bzero,
-			      CPU_FEATURE_USABLE (AVX2),
-			      __bzero_avx2_unaligned)
-	      IFUNC_IMPL_ADD (array, i, bzero,
-			      CPU_FEATURE_USABLE (AVX2),
-			      __bzero_avx2_unaligned_erms)
-	      IFUNC_IMPL_ADD (array, i, bzero,
-			      (CPU_FEATURE_USABLE (AVX2)
-			       && CPU_FEATURE_USABLE (RTM)),
-			      __bzero_avx2_unaligned_rtm)
-	      IFUNC_IMPL_ADD (array, i, bzero,
-			      (CPU_FEATURE_USABLE (AVX2)
-			       && CPU_FEATURE_USABLE (RTM)),
-			      __bzero_avx2_unaligned_erms_rtm)
-	      IFUNC_IMPL_ADD (array, i, bzero,
-			      (CPU_FEATURE_USABLE (AVX512VL)
-			       && CPU_FEATURE_USABLE (AVX512BW)
-			       && CPU_FEATURE_USABLE (BMI2)),
-			      __bzero_evex_unaligned)
-	      IFUNC_IMPL_ADD (array, i, bzero,
-			      (CPU_FEATURE_USABLE (AVX512VL)
-			       && CPU_FEATURE_USABLE (AVX512BW)
-			       && CPU_FEATURE_USABLE (BMI2)),
-			      __bzero_evex_unaligned_erms)
-	      IFUNC_IMPL_ADD (array, i, bzero,
-			      (CPU_FEATURE_USABLE (AVX512VL)
-			       && CPU_FEATURE_USABLE (AVX512BW)
-			       && CPU_FEATURE_USABLE (BMI2)),
-			      __bzero_avx512_unaligned_erms)
-	      IFUNC_IMPL_ADD (array, i, bzero,
-			      (CPU_FEATURE_USABLE (AVX512VL)
-			       && CPU_FEATURE_USABLE (AVX512BW)
-			       && CPU_FEATURE_USABLE (BMI2)),
-			      __bzero_avx512_unaligned)
-	     )
-
   /* Support sysdeps/x86_64/multiarch/rawmemchr.c.  */
   IFUNC_IMPL (i, name, rawmemchr,
 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
index 5a5ee6f672..8ac3e479bb 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
@@ -5,7 +5,6 @@ 
 
 #define SECTION(p) p##.avx.rtm
 #define MEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
-#define BZERO_SYMBOL(p,s)	p##_avx2_##s##_rtm
 #define WMEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
 
 #include "memset-avx2-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index a093a2831f..c0bf2875d0 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -14,9 +14,6 @@ 
   vmovd d, %xmm0; \
   movq r, %rax;
 
-# define BZERO_ZERO_VEC0() \
-  vpxor %xmm0, %xmm0, %xmm0
-
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
 
@@ -32,9 +29,6 @@ 
 # ifndef MEMSET_SYMBOL
 #  define MEMSET_SYMBOL(p,s)	p##_avx2_##s
 # endif
-# ifndef BZERO_SYMBOL
-#  define BZERO_SYMBOL(p,s)	p##_avx2_##s
-# endif
 # ifndef WMEMSET_SYMBOL
 #  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
 # endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 727c92133a..5241216a77 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -19,9 +19,6 @@ 
   vpbroadcastb d, %VEC0; \
   movq r, %rax
 
-# define BZERO_ZERO_VEC0() \
-  vpxorq %XMM0, %XMM0, %XMM0
-
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   vpbroadcastd d, %VEC0; \
   movq r, %rax
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 5d8fa78f05..6370021506 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -19,9 +19,6 @@ 
   vpbroadcastb d, %VEC0; \
   movq r, %rax
 
-# define BZERO_ZERO_VEC0() \
-  vpxorq %XMM0, %XMM0, %XMM0
-
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   vpbroadcastd d, %VEC0; \
   movq r, %rax
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
index d52d170804..3d92f6993a 100644
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -22,7 +22,6 @@ 
 
 #if IS_IN (libc)
 # define MEMSET_SYMBOL(p,s)	p##_sse2_##s
-# define BZERO_SYMBOL(p,s)	MEMSET_SYMBOL (p, s)
 # define WMEMSET_SYMBOL(p,s)	p##_sse2_##s
 
 # ifdef SHARED
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 785fee1d57..aa78fbb620 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -1,4 +1,4 @@ 
-/* memset/bzero with unaligned store and rep stosb
+/* memset with unaligned store and rep stosb
    Copyright (C) 2016-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -26,10 +26,6 @@ 
 
 #include <sysdep.h>
 
-#ifndef BZERO_SYMBOL
-# define BZERO_SYMBOL(p,s)		MEMSET_SYMBOL (p, s)
-#endif
-
 #ifndef MEMSET_CHK_SYMBOL
 # define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
 #endif
@@ -134,31 +130,6 @@  ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
 END (WMEMSET_SYMBOL (__wmemset, unaligned))
 #endif
 
-ENTRY (BZERO_SYMBOL(__bzero, unaligned))
-#if VEC_SIZE > 16
-	BZERO_ZERO_VEC0 ()
-#endif
-	mov	%RDI_LP, %RAX_LP
-	mov	%RSI_LP, %RDX_LP
-#ifndef USE_LESS_VEC_MASK_STORE
-	xorl	%esi, %esi
-#endif
-	cmp	$VEC_SIZE, %RDX_LP
-	jb	L(less_vec_no_vdup)
-#ifdef USE_LESS_VEC_MASK_STORE
-	xorl	%esi, %esi
-#endif
-#if VEC_SIZE <= 16
-	BZERO_ZERO_VEC0 ()
-#endif
-	cmp	$(VEC_SIZE * 2), %RDX_LP
-	ja	L(more_2x_vec)
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
-	VZEROUPPER_RETURN
-END (BZERO_SYMBOL(__bzero, unaligned))
-
 #if defined SHARED && IS_IN (libc)
 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 	cmp	%RDX_LP, %RCX_LP
@@ -216,31 +187,6 @@  END (__memset_erms)
 END (MEMSET_SYMBOL (__memset, erms))
 # endif
 
-ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6)
-# if VEC_SIZE > 16
-	BZERO_ZERO_VEC0 ()
-# endif
-	mov	%RDI_LP, %RAX_LP
-	mov	%RSI_LP, %RDX_LP
-# ifndef USE_LESS_VEC_MASK_STORE
-	xorl	%esi, %esi
-# endif
-	cmp	$VEC_SIZE, %RDX_LP
-	jb	L(less_vec_no_vdup)
-# ifdef USE_LESS_VEC_MASK_STORE
-	xorl	%esi, %esi
-# endif
-# if VEC_SIZE <= 16
-	BZERO_ZERO_VEC0 ()
-# endif
-	cmp	$(VEC_SIZE * 2), %RDX_LP
-	ja	L(stosb_more_2x_vec)
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
-	VZEROUPPER_RETURN
-END (BZERO_SYMBOL(__bzero, unaligned_erms))
-
 # if defined SHARED && IS_IN (libc)
 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 	cmp	%RDX_LP, %RCX_LP