[3/3] aarch64: Hoist ZVA check out of the memset function

Message ID 1510204408-1739-4-git-send-email-siddhesh@sourceware.org
State New
Headers show
Series
  • memset zva optimization
Related show

Commit Message

Siddhesh Poyarekar Nov. 9, 2017, 5:13 a.m.
The DZP bit in the dczid_el0 register does not change dynamically, so
it is safe to read once during program startup.  Hoist the zva check
into an ifunc resolver to optimize for the most common zva size
i.e. 64 bytes.  I have retained the older memset as __memset_generic
for internal libc.so use so that the change impact is minimal.  We
should eventually have a discussion on what is more expensive, reading
dczid_el0 on every memset invocation or the indirection due to PLT.

The gains due to this are significant for falkor, with run time
reductions as high as 48% in some cases.  Likewise for mustang,
although the numbers are slightly lower.  Here's a sample from the
falkor tests:

Function: memset
Variant: walk
                                    simple_memset	__memset_zva_64	__memset_generic

Comments

Andrew Pinski Nov. 9, 2017, 5:29 a.m. | #1
On Wed, Nov 8, 2017 at 9:13 PM, Siddhesh Poyarekar
<siddhesh@sourceware.org> wrote:
> The DZP bit in the dczid_el0 register does not change dynamically, so
> it is safe to read once during program startup.  Hoist the zva check
> into an ifunc resolver to optimize for the most common zva size
> i.e. 64 bytes.  I have retained the older memset as __memset_generic
> for internal libc.so use so that the change impact is minimal.  We
> should eventually have a discussion on what is more expensive, reading
> dczid_el0 on every memset invocation or the indirection due to PLT.
>
> The gains due to this are significant for falkor, with run time
> reductions as high as 48% in some cases.  Likewise for mustang,
> although the numbers are slightly lower.  Here's a sample from the
> falkor tests:

I don't like this at all for the increase file size for the not so
significant gain on real platforms.  I think we should declare falkor
micro-arch is broken and move on.

Thanks,
Andrew

>
> Function: memset
> Variant: walk
>                                     simple_memset       __memset_zva_64 __memset_generic
> ======================================================================================================
>                   length=256, char=0:       139.96 (-698.28%)           9.07 ( 48.26%)         17.53
>                   length=257, char=0:       140.50 (-699.03%)           9.53 ( 45.80%)         17.58
>                   length=258, char=0:       140.96 (-703.95%)           9.58 ( 45.36%)         17.53
>                   length=259, char=0:       141.56 (-705.16%)           9.53 ( 45.79%)         17.58
>                   length=260, char=0:       142.15 (-710.76%)           9.57 ( 45.39%)         17.53
>                   length=261, char=0:       142.50 (-710.39%)           9.53 ( 45.78%)         17.58
>                   length=262, char=0:       142.97 (-715.09%)           9.57 ( 45.42%)         17.54
>                   length=263, char=0:       143.51 (-716.18%)           9.53 ( 45.80%)         17.58
>                   length=264, char=0:       143.93 (-720.55%)           9.58 ( 45.39%)         17.54
>                   length=265, char=0:       144.56 (-722.07%)           9.53 ( 45.80%)         17.59
>                   length=266, char=0:       144.98 (-726.42%)           9.58 ( 45.42%)         17.54
>                   length=267, char=0:       145.53 (-727.53%)           9.53 ( 45.80%)         17.59
>                   length=268, char=0:       146.25 (-731.81%)           9.53 ( 45.79%)         17.58
>                   length=269, char=0:       146.52 (-735.39%)           9.53 ( 45.66%)         17.54
>                   length=270, char=0:       146.97 (-735.81%)           9.53 ( 45.80%)         17.58
>                   length=271, char=0:       147.54 (-741.08%)           9.58 ( 45.38%)         17.54
>                   length=512, char=0:       268.26 (-1307.85%)         12.06 ( 36.71%)         19.05
>                   length=513, char=0:       268.73 (-1273.89%)         13.56 ( 30.68%)         19.56
>                   length=514, char=0:       269.31 (-1276.89%)         13.56 ( 30.68%)         19.56
>                   length=515, char=0:       269.73 (-1279.05%)         13.56 ( 30.68%)         19.56
>                   length=516, char=0:       270.34 (-1282.24%)         13.56 ( 30.67%)         19.56
>                   length=517, char=0:       270.83 (-1284.71%)         13.56 ( 30.66%)         19.56
>                   length=518, char=0:       271.20 (-1286.54%)         13.56 ( 30.67%)         19.56
>                   length=519, char=0:       271.67 (-1288.67%)         13.65 ( 30.24%)         19.56
>                   length=520, char=0:       272.14 (-1291.04%)         13.65 ( 30.22%)         19.56
>                   length=521, char=0:       272.66 (-1293.69%)         13.65 ( 30.23%)         19.56
>                   length=522, char=0:       273.14 (-1296.13%)         13.65 ( 30.20%)         19.56
>                   length=523, char=0:       273.64 (-1298.75%)         13.65 ( 30.23%)         19.56
>                   length=524, char=0:       274.34 (-1302.16%)         13.66 ( 30.20%)         19.57
>                   length=525, char=0:       274.64 (-1297.78%)         13.56 ( 30.99%)         19.65
>                   length=526, char=0:       275.20 (-1300.04%)         13.56 ( 31.01%)         19.66
>                   length=527, char=0:       275.66 (-1302.86%)         13.56 ( 30.99%)         19.65
>                  length=1024, char=0:       524.46 (-2169.75%)         20.12 ( 12.92%)         23.11
>                  length=1025, char=0:       525.14 (-2124.63%)         21.62 (  8.40%)         23.61
>                  length=1026, char=0:       525.59 (-2125.36%)         21.88 (  7.37%)         23.62
>                  length=1027, char=0:       525.98 (-2127.14%)         21.62 (  8.46%)         23.62
>                  length=1028, char=0:       526.68 (-2131.10%)         21.62 (  8.42%)         23.61
>                  length=1029, char=0:       527.10 (-2131.70%)         21.79 (  7.73%)         23.62
>                  length=1030, char=0:       527.54 (-2118.51%)         21.62 (  9.10%)         23.78
>                  length=1031, char=0:       527.98 (-2136.37%)         21.62 (  8.43%)         23.61
>                  length=1032, char=0:       528.70 (-2139.38%)         21.62 (  8.43%)         23.61
>                  length=1033, char=0:       529.25 (-2124.37%)         21.62 (  9.11%)         23.79
>                  length=1034, char=0:       529.48 (-2142.95%)         21.62 (  8.43%)         23.61
>                  length=1035, char=0:       530.11 (-2145.13%)         21.62 (  8.44%)         23.61
>                  length=1036, char=0:       530.76 (-2147.10%)         21.79 (  7.73%)         23.62
>                  length=1037, char=0:       531.03 (-2149.45%)         21.62 (  8.42%)         23.61
>                  length=1038, char=0:       531.64 (-2151.87%)         21.62 (  8.42%)         23.61
>                  length=1039, char=0:       531.99 (-2151.63%)         21.80 (  7.75%)         23.63
>
>         * sysdeps/aarch64/memset-reg.h: New file.
>         * sysdeps/aarch64/memset.S: Use it.
>         (__memset): Rename to MEMSET macro.
>         [ZVA_MACRO]: Use zva_macro.
>         * sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
>         Add memset_generic and memset_zva_64.
>         * sysdeps/aarch64/multiarch/ifunc-impl-list.c
>         (__libc_ifunc_impl_list): Add memset ifuncs.
>         * sysdeps/aarch64/multiarch/init-arch.h (INIT_ARCH): New
>         local variable zva_size.
>         * sysdeps/aarch64/multiarch/memset.c: New file.
>         * sysdeps/aarch64/multiarch/memset_generic.S: New file.
>         * sysdeps/aarch64/multiarch/memset_zva_64.S: New file.
>         * sysdeps/aarch64/multiarch/rtld-memset.S: New file.
>         * sysdeps/unix/sysv/linux/aarch64/cpu-features.c
>         (DCZID_DZP_MASK): New macro.
>         (DCZID_BS_MASK): Likewise.
>         (init_cpu_features): Read and set zva_size.
>         * sysdeps/unix/sysv/linux/aarch64/cpu-features.h
>         (struct cpu_features): New member zva_size.
> ---
>  sysdeps/aarch64/memset-reg.h                   | 30 ++++++++++++++++
>  sysdeps/aarch64/memset.S                       | 27 +++++---------
>  sysdeps/aarch64/multiarch/Makefile             |  2 +-
>  sysdeps/aarch64/multiarch/ifunc-impl-list.c    |  3 ++
>  sysdeps/aarch64/multiarch/init-arch.h          |  8 +++--
>  sysdeps/aarch64/multiarch/memset.c             | 41 +++++++++++++++++++++
>  sysdeps/aarch64/multiarch/memset_generic.S     | 27 ++++++++++++++
>  sysdeps/aarch64/multiarch/memset_zva_64.S      | 49 ++++++++++++++++++++++++++
>  sysdeps/aarch64/multiarch/rtld-memset.S        | 23 ++++++++++++
>  sysdeps/unix/sysv/linux/aarch64/cpu-features.c | 10 ++++++
>  sysdeps/unix/sysv/linux/aarch64/cpu-features.h |  1 +
>  11 files changed, 199 insertions(+), 22 deletions(-)
>  create mode 100644 sysdeps/aarch64/memset-reg.h
>  create mode 100644 sysdeps/aarch64/multiarch/memset.c
>  create mode 100644 sysdeps/aarch64/multiarch/memset_generic.S
>  create mode 100644 sysdeps/aarch64/multiarch/memset_zva_64.S
>  create mode 100644 sysdeps/aarch64/multiarch/rtld-memset.S
>
> diff --git a/sysdeps/aarch64/memset-reg.h b/sysdeps/aarch64/memset-reg.h
> new file mode 100644
> index 0000000..518e562
> --- /dev/null
> +++ b/sysdeps/aarch64/memset-reg.h
> @@ -0,0 +1,30 @@
> +/* Register aliases for memset to be used across implementations.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#define dstin  x0
> +#define val    x1
> +#define valw   w1
> +#define count  x2
> +#define dst    x3
> +#define dstend x4
> +#define tmp1   x5
> +#define tmp1w  w5
> +#define tmp2   x6
> +#define tmp2w  w6
> +#define zva_len x7
> +#define zva_lenw w7
> diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
> index 110fd22..45fb0a8 100644
> --- a/sysdeps/aarch64/memset.S
> +++ b/sysdeps/aarch64/memset.S
> @@ -17,6 +17,7 @@
>     <http://www.gnu.org/licenses/>.  */
>
>  #include <sysdep.h>
> +#include "memset-reg.h"
>
>  /* Assumptions:
>   *
> @@ -24,20 +25,7 @@
>   *
>   */
>
> -#define dstin  x0
> -#define val    x1
> -#define valw   w1
> -#define count  x2
> -#define dst    x3
> -#define dstend x4
> -#define tmp1   x5
> -#define tmp1w  w5
> -#define tmp2   x6
> -#define tmp2w  w6
> -#define zva_len x7
> -#define zva_lenw w7
> -
> -ENTRY_ALIGN (__memset, 6)
> +ENTRY_ALIGN (MEMSET, 6)
>
>         DELOUSE (0)
>         DELOUSE (2)
> @@ -108,8 +96,11 @@ L(tail64):
>         stp     q0, q0, [dstend, -32]
>         ret
>
> -       .p2align 3
>  L(try_zva):
> +#ifdef ZVA_MACRO
> +       zva_macro
> +#else
> +       .p2align 3
>         mrs     tmp1, dczid_el0
>         tbnz    tmp1w, 4, L(no_zva)
>         and     tmp1w, tmp1w, 15
> @@ -189,7 +180,7 @@ L(zva_other):
>         b.hs    3b
>  4:     add     count, count, zva_len
>         b       L(tail64)
> +#endif
>
> -END (__memset)
> -weak_alias (__memset, memset)
> -libc_hidden_builtin_def (memset)
> +END (MEMSET)
> +libc_hidden_builtin_def (MEMSET)
> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
> index 9aa1e79..57a6c8b 100644
> --- a/sysdeps/aarch64/multiarch/Makefile
> +++ b/sysdeps/aarch64/multiarch/Makefile
> @@ -1,4 +1,4 @@
>  ifeq ($(subdir),string)
>  sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \
> -                  memmove_falkor
> +                  memmove_falkor memset_generic memset_zva_64
>  endif
> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> index 2cb74d5..a8c2f6d 100644
> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> @@ -46,6 +46,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
>               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
>               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
> +  IFUNC_IMPL (i, name, memset,
> +             IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva_64)
> +             IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
>
>    return i;
>  }
> diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
> index 3af442c..a756dad 100644
> --- a/sysdeps/aarch64/multiarch/init-arch.h
> +++ b/sysdeps/aarch64/multiarch/init-arch.h
> @@ -18,6 +18,8 @@
>
>  #include <ldsodefs.h>
>
> -#define INIT_ARCH()                            \
> -  uint64_t __attribute__((unused)) midr =      \
> -    GLRO(dl_aarch64_cpu_features).midr_el1;
> +#define INIT_ARCH()                                                          \
> +  uint64_t __attribute__((unused)) midr =                                    \
> +    GLRO(dl_aarch64_cpu_features).midr_el1;                                  \
> +  unsigned __attribute__((unused)) zva_size =                                \
> +    GLRO(dl_aarch64_cpu_features).zva_size;
> diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
> new file mode 100644
> index 0000000..fcaa376
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset.c
> @@ -0,0 +1,41 @@
> +/* Multiple versions of memset. AARCH64 version.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +
> +#if IS_IN (libc)
> +/* Redefine memset so that the compiler won't complain about the type
> +   mismatch with the IFUNC selector in strong_alias, below.  */
> +# undef memset
> +# define memset __redirect_memset
> +# include <string.h>
> +# include <init-arch.h>
> +
> +extern __typeof (__redirect_memset) __libc_memset;
> +
> +extern __typeof (__redirect_memset) __memset_nozva attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_zva_64 attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_zva_128 attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
> +
> +libc_ifunc (__libc_memset, (zva_size == 64 ? __memset_zva_64
> +                           : __memset_generic));
> +
> +# undef memset
> +strong_alias (__libc_memset, memset);
> +#endif
> diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S
> new file mode 100644
> index 0000000..55134ed
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_generic.S
> @@ -0,0 +1,27 @@
> +/* Memset for aarch64, default version for internal use.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#if IS_IN (libc)
> +# define MEMSET __memset_generic
> +/* Add a hidden definition for use within libc.so.  */
> +# ifdef SHARED
> +       .globl __GI_memset; __GI_memset = __memset_generic
> +# endif
> +# include <sysdeps/aarch64/memset.S>
> +#endif
> diff --git a/sysdeps/aarch64/multiarch/memset_zva_64.S b/sysdeps/aarch64/multiarch/memset_zva_64.S
> new file mode 100644
> index 0000000..bb9f140
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_zva_64.S
> @@ -0,0 +1,49 @@
> +/* Memset for zva size of 64 bytes.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <memset-reg.h>
> +
> +#if IS_IN (libc)
> +.macro zva_macro
> +       .p2align 4
> +       /* Write the first and last 64 byte aligned block using stp rather
> +          than using DC ZVA.  This is faster on some cores.  */
> +       str     q0, [dst, 16]
> +       stp     q0, q0, [dst, 32]
> +       bic     dst, dst, 63
> +       stp     q0, q0, [dst, 64]
> +       stp     q0, q0, [dst, 96]
> +       sub     count, dstend, dst      /* Count is now 128 too large.  */
> +       sub     count, count, 128+64+64 /* Adjust count and bias for loop.  */
> +       add     dst, dst, 128
> +1:     dc      zva, dst
> +       add     dst, dst, 64
> +       subs    count, count, 64
> +       b.hi    1b
> +       stp     q0, q0, [dst, 0]
> +       stp     q0, q0, [dst, 32]
> +       stp     q0, q0, [dstend, -64]
> +       stp     q0, q0, [dstend, -32]
> +       ret
> +.endm
> +
> +# define ZVA_MACRO zva_macro
> +# define MEMSET __memset_zva_64
> +# include <sysdeps/aarch64/memset.S>
> +#endif
> diff --git a/sysdeps/aarch64/multiarch/rtld-memset.S b/sysdeps/aarch64/multiarch/rtld-memset.S
> new file mode 100644
> index 0000000..4437393
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/rtld-memset.S
> @@ -0,0 +1,23 @@
> +/* Memset for aarch64, for the dynamic linker.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#if IS_IN (rtld)
> +# define MEMSET memset
> +# include <sysdeps/aarch64/memset.S>
> +#endif
> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> index e769eeb..092ee81 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> @@ -20,6 +20,9 @@
>  #include <sys/auxv.h>
>  #include <elf/dl-hwcaps.h>
>
> +#define DCZID_DZP_MASK (1 << 4)
> +#define DCZID_BS_MASK (0xf)
> +
>  #if HAVE_TUNABLES
>  struct cpu_list
>  {
> @@ -72,4 +75,11 @@ init_cpu_features (struct cpu_features *cpu_features)
>      }
>
>    cpu_features->midr_el1 = midr;
> +
> +  /* Check if ZVA is enabled.  */
> +  unsigned dczid;
> +  asm volatile ("mrs %0, dczid_el0" : "=r"(dczid));
> +
> +  if ((dczid & DCZID_DZP_MASK) == 0)
> +    cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK);
>  }
> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> index 73cb53d..f2b6afd 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> @@ -47,6 +47,7 @@
>  struct cpu_features
>  {
>    uint64_t midr_el1;
> +  unsigned zva_size;
>  };
>
>  #endif /* _CPU_FEATURES_AARCH64_H  */
> --
> 2.7.5
>
Siddhesh Poyarekar Nov. 9, 2017, 5:44 a.m. | #2
On Thursday 09 November 2017 10:59 AM, Andrew Pinski wrote:
> I don't like this at all for the increase file size for the not so
> significant gain on real platforms.  I think we should declare falkor
> micro-arch is broken and move on.

Are you sure this doesn't give any gains for thunderx for the sizes I
mentioned (256B - ~1K)?  I see significant gains on mustang too and it
is obvious to see why; it is 3 instructions and a branch less in a hot
path and that should be significant regardless of the MRS cost.

If you don't see any gain then I don't mind posting this as a
falkor-specific change.  If you change your mind in future we can always
change the IFUNC condition.

Siddhesh
Andrew Pinski Nov. 9, 2017, 5:45 a.m. | #3
On Wed, Nov 8, 2017 at 9:44 PM, Siddhesh Poyarekar
<siddhesh@sourceware.org> wrote:
> On Thursday 09 November 2017 10:59 AM, Andrew Pinski wrote:
>> I don't like this at all for the increase file size for the not so
>> significant gain on real platforms.  I think we should declare falkor
>> micro-arch is broken and move on.
>
> Are you sure this doesn't give any gains for thunderx for the sizes I
> mentioned (256B - ~1K)?  I see significant gains on mustang too and it
> is obvious to see why; it is 3 instructions and a branch less in a hot
> path and that should be significant regardless of the MRS cost.

How can it, ThunderX has 128 byte cache lines.

Thanks,
Andrew

>
> If you don't see any gain then I don't mind posting this as a
> falkor-specific change.  If you change your mind in future we can always
> change the IFUNC condition.
>
> Siddhesh
Siddhesh Poyarekar Nov. 9, 2017, 5:59 a.m. | #4
On Thursday 09 November 2017 11:15 AM, Andrew Pinski wrote:
> How can it, ThunderX has 128 byte cache lines.

Oh yeah, I had forgotten about that.

I just tested Mustang again with the updated test the mustang gives
minimal gains, i.e. 2-5% or so.  I'll make this into a falkor-specific
change for now.  If there are other cores interested in making this more
generic then we can make that change later.

Thanks,
Siddhesh

Patch

======================================================================================================
                  length=256, char=0:       139.96 (-698.28%)	        9.07 ( 48.26%)	       17.53
                  length=257, char=0:       140.50 (-699.03%)	        9.53 ( 45.80%)	       17.58
                  length=258, char=0:       140.96 (-703.95%)	        9.58 ( 45.36%)	       17.53
                  length=259, char=0:       141.56 (-705.16%)	        9.53 ( 45.79%)	       17.58
                  length=260, char=0:       142.15 (-710.76%)	        9.57 ( 45.39%)	       17.53
                  length=261, char=0:       142.50 (-710.39%)	        9.53 ( 45.78%)	       17.58
                  length=262, char=0:       142.97 (-715.09%)	        9.57 ( 45.42%)	       17.54
                  length=263, char=0:       143.51 (-716.18%)	        9.53 ( 45.80%)	       17.58
                  length=264, char=0:       143.93 (-720.55%)	        9.58 ( 45.39%)	       17.54
                  length=265, char=0:       144.56 (-722.07%)	        9.53 ( 45.80%)	       17.59
                  length=266, char=0:       144.98 (-726.42%)	        9.58 ( 45.42%)	       17.54
                  length=267, char=0:       145.53 (-727.53%)	        9.53 ( 45.80%)	       17.59
                  length=268, char=0:       146.25 (-731.81%)	        9.53 ( 45.79%)	       17.58
                  length=269, char=0:       146.52 (-735.39%)	        9.53 ( 45.66%)	       17.54
                  length=270, char=0:       146.97 (-735.81%)	        9.53 ( 45.80%)	       17.58
                  length=271, char=0:       147.54 (-741.08%)	        9.58 ( 45.38%)	       17.54
                  length=512, char=0:       268.26 (-1307.85%)	       12.06 ( 36.71%)	       19.05
                  length=513, char=0:       268.73 (-1273.89%)	       13.56 ( 30.68%)	       19.56
                  length=514, char=0:       269.31 (-1276.89%)	       13.56 ( 30.68%)	       19.56
                  length=515, char=0:       269.73 (-1279.05%)	       13.56 ( 30.68%)	       19.56
                  length=516, char=0:       270.34 (-1282.24%)	       13.56 ( 30.67%)	       19.56
                  length=517, char=0:       270.83 (-1284.71%)	       13.56 ( 30.66%)	       19.56
                  length=518, char=0:       271.20 (-1286.54%)	       13.56 ( 30.67%)	       19.56
                  length=519, char=0:       271.67 (-1288.67%)	       13.65 ( 30.24%)	       19.56
                  length=520, char=0:       272.14 (-1291.04%)	       13.65 ( 30.22%)	       19.56
                  length=521, char=0:       272.66 (-1293.69%)	       13.65 ( 30.23%)	       19.56
                  length=522, char=0:       273.14 (-1296.13%)	       13.65 ( 30.20%)	       19.56
                  length=523, char=0:       273.64 (-1298.75%)	       13.65 ( 30.23%)	       19.56
                  length=524, char=0:       274.34 (-1302.16%)	       13.66 ( 30.20%)	       19.57
                  length=525, char=0:       274.64 (-1297.78%)	       13.56 ( 30.99%)	       19.65
                  length=526, char=0:       275.20 (-1300.04%)	       13.56 ( 31.01%)	       19.66
                  length=527, char=0:       275.66 (-1302.86%)	       13.56 ( 30.99%)	       19.65
                 length=1024, char=0:       524.46 (-2169.75%)	       20.12 ( 12.92%)	       23.11
                 length=1025, char=0:       525.14 (-2124.63%)	       21.62 (  8.40%)	       23.61
                 length=1026, char=0:       525.59 (-2125.36%)	       21.88 (  7.37%)	       23.62
                 length=1027, char=0:       525.98 (-2127.14%)	       21.62 (  8.46%)	       23.62
                 length=1028, char=0:       526.68 (-2131.10%)	       21.62 (  8.42%)	       23.61
                 length=1029, char=0:       527.10 (-2131.70%)	       21.79 (  7.73%)	       23.62
                 length=1030, char=0:       527.54 (-2118.51%)	       21.62 (  9.10%)	       23.78
                 length=1031, char=0:       527.98 (-2136.37%)	       21.62 (  8.43%)	       23.61
                 length=1032, char=0:       528.70 (-2139.38%)	       21.62 (  8.43%)	       23.61
                 length=1033, char=0:       529.25 (-2124.37%)	       21.62 (  9.11%)	       23.79
                 length=1034, char=0:       529.48 (-2142.95%)	       21.62 (  8.43%)	       23.61
                 length=1035, char=0:       530.11 (-2145.13%)	       21.62 (  8.44%)	       23.61
                 length=1036, char=0:       530.76 (-2147.10%)	       21.79 (  7.73%)	       23.62
                 length=1037, char=0:       531.03 (-2149.45%)	       21.62 (  8.42%)	       23.61
                 length=1038, char=0:       531.64 (-2151.87%)	       21.62 (  8.42%)	       23.61
                 length=1039, char=0:       531.99 (-2151.63%)	       21.80 (  7.75%)	       23.63

	* sysdeps/aarch64/memset-reg.h: New file.
	* sysdeps/aarch64/memset.S: Use it.
	(__memset): Rename to MEMSET macro.
	[ZVA_MACRO]: Use zva_macro.
	* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
	Add memset_generic and memset_zva_64.
	* sysdeps/aarch64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Add memset ifuncs.
	* sysdeps/aarch64/multiarch/init-arch.h (INIT_ARCH): New
	local variable zva_size.
	* sysdeps/aarch64/multiarch/memset.c: New file.
	* sysdeps/aarch64/multiarch/memset_generic.S: New file.
	* sysdeps/aarch64/multiarch/memset_zva_64.S: New file.
	* sysdeps/aarch64/multiarch/rtld-memset.S: New file.
	* sysdeps/unix/sysv/linux/aarch64/cpu-features.c
	(DCZID_DZP_MASK): New macro.
	(DCZID_BS_MASK): Likewise.
	(init_cpu_features): Read and set zva_size.
	* sysdeps/unix/sysv/linux/aarch64/cpu-features.h
	(struct cpu_features): New member zva_size.
---
 sysdeps/aarch64/memset-reg.h                   | 30 ++++++++++++++++
 sysdeps/aarch64/memset.S                       | 27 +++++---------
 sysdeps/aarch64/multiarch/Makefile             |  2 +-
 sysdeps/aarch64/multiarch/ifunc-impl-list.c    |  3 ++
 sysdeps/aarch64/multiarch/init-arch.h          |  8 +++--
 sysdeps/aarch64/multiarch/memset.c             | 41 +++++++++++++++++++++
 sysdeps/aarch64/multiarch/memset_generic.S     | 27 ++++++++++++++
 sysdeps/aarch64/multiarch/memset_zva_64.S      | 49 ++++++++++++++++++++++++++
 sysdeps/aarch64/multiarch/rtld-memset.S        | 23 ++++++++++++
 sysdeps/unix/sysv/linux/aarch64/cpu-features.c | 10 ++++++
 sysdeps/unix/sysv/linux/aarch64/cpu-features.h |  1 +
 11 files changed, 199 insertions(+), 22 deletions(-)
 create mode 100644 sysdeps/aarch64/memset-reg.h
 create mode 100644 sysdeps/aarch64/multiarch/memset.c
 create mode 100644 sysdeps/aarch64/multiarch/memset_generic.S
 create mode 100644 sysdeps/aarch64/multiarch/memset_zva_64.S
 create mode 100644 sysdeps/aarch64/multiarch/rtld-memset.S

diff --git a/sysdeps/aarch64/memset-reg.h b/sysdeps/aarch64/memset-reg.h
new file mode 100644
index 0000000..518e562
--- /dev/null
+++ b/sysdeps/aarch64/memset-reg.h
@@ -0,0 +1,30 @@ 
+/* Register aliases for memset to be used across implementations.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define dstin	x0
+#define val	x1
+#define valw	w1
+#define count	x2
+#define dst	x3
+#define dstend	x4
+#define tmp1	x5
+#define tmp1w	w5
+#define tmp2	x6
+#define tmp2w	w6
+#define zva_len x7
+#define zva_lenw w7
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index 110fd22..45fb0a8 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -17,6 +17,7 @@ 
    <http://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
+#include "memset-reg.h"
 
 /* Assumptions:
  *
@@ -24,20 +25,7 @@ 
  *
  */
 
-#define dstin	x0
-#define val	x1
-#define valw	w1
-#define count	x2
-#define dst	x3
-#define dstend	x4
-#define tmp1	x5
-#define tmp1w	w5
-#define tmp2	x6
-#define tmp2w	w6
-#define zva_len x7
-#define zva_lenw w7
-
-ENTRY_ALIGN (__memset, 6)
+ENTRY_ALIGN (MEMSET, 6)
 
 	DELOUSE (0)
 	DELOUSE (2)
@@ -108,8 +96,11 @@  L(tail64):
 	stp	q0, q0, [dstend, -32]
 	ret
 
-	.p2align 3
 L(try_zva):
+#ifdef ZVA_MACRO
+	zva_macro
+#else
+	.p2align 3
 	mrs	tmp1, dczid_el0
 	tbnz	tmp1w, 4, L(no_zva)
 	and	tmp1w, tmp1w, 15
@@ -189,7 +180,7 @@  L(zva_other):
 	b.hs	3b
 4:	add	count, count, zva_len
 	b	L(tail64)
+#endif
 
-END (__memset)
-weak_alias (__memset, memset)
-libc_hidden_builtin_def (memset)
+END (MEMSET)
+libc_hidden_builtin_def (MEMSET)
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 9aa1e79..57a6c8b 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,4 +1,4 @@ 
 ifeq ($(subdir),string)
 sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \
-		   memmove_falkor
+		   memmove_falkor memset_generic memset_zva_64
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index 2cb74d5..a8c2f6d 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -46,6 +46,9 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
+  IFUNC_IMPL (i, name, memset,
+	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva_64)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
 
   return i;
 }
diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
index 3af442c..a756dad 100644
--- a/sysdeps/aarch64/multiarch/init-arch.h
+++ b/sysdeps/aarch64/multiarch/init-arch.h
@@ -18,6 +18,8 @@ 
 
 #include <ldsodefs.h>
 
-#define INIT_ARCH()				\
-  uint64_t __attribute__((unused)) midr =	\
-    GLRO(dl_aarch64_cpu_features).midr_el1;
+#define INIT_ARCH()							      \
+  uint64_t __attribute__((unused)) midr =				      \
+    GLRO(dl_aarch64_cpu_features).midr_el1;				      \
+  unsigned __attribute__((unused)) zva_size =				      \
+    GLRO(dl_aarch64_cpu_features).zva_size;
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
new file mode 100644
index 0000000..fcaa376
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -0,0 +1,41 @@ 
+/* Multiple versions of memset. AARCH64 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+
+#if IS_IN (libc)
+/* Redefine memset so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+# undef memset
+# define memset __redirect_memset
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memset) __libc_memset;
+
+extern __typeof (__redirect_memset) __memset_nozva attribute_hidden;
+extern __typeof (__redirect_memset) __memset_zva_64 attribute_hidden;
+extern __typeof (__redirect_memset) __memset_zva_128 attribute_hidden;
+extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
+
+libc_ifunc (__libc_memset, (zva_size == 64 ? __memset_zva_64
+			    : __memset_generic));
+
+# undef memset
+strong_alias (__libc_memset, memset);
+#endif
diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S
new file mode 100644
index 0000000..55134ed
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_generic.S
@@ -0,0 +1,27 @@ 
+/* Memset for aarch64, default version for internal use.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# define MEMSET __memset_generic
+/* Add a hidden definition for use within libc.so.  */
+# ifdef SHARED
+	.globl __GI_memset; __GI_memset = __memset_generic
+# endif
+# include <sysdeps/aarch64/memset.S>
+#endif
diff --git a/sysdeps/aarch64/multiarch/memset_zva_64.S b/sysdeps/aarch64/multiarch/memset_zva_64.S
new file mode 100644
index 0000000..bb9f140
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_zva_64.S
@@ -0,0 +1,49 @@ 
+/* Memset for zva size of 64 bytes.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <memset-reg.h>
+
+#if IS_IN (libc)
+.macro zva_macro
+	.p2align 4
+	/* Write the first and last 64 byte aligned block using stp rather
+	   than using DC ZVA.  This is faster on some cores.  */
+	str	q0, [dst, 16]
+	stp	q0, q0, [dst, 32]
+	bic	dst, dst, 63
+	stp	q0, q0, [dst, 64]
+	stp	q0, q0, [dst, 96]
+	sub	count, dstend, dst	/* Count is now 128 too large.	*/
+	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
+	add	dst, dst, 128
+1:	dc	zva, dst
+	add	dst, dst, 64
+	subs	count, count, 64
+	b.hi	1b
+	stp	q0, q0, [dst, 0]
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+.endm
+
+# define ZVA_MACRO zva_macro
+# define MEMSET __memset_zva_64
+# include <sysdeps/aarch64/memset.S>
+#endif
diff --git a/sysdeps/aarch64/multiarch/rtld-memset.S b/sysdeps/aarch64/multiarch/rtld-memset.S
new file mode 100644
index 0000000..4437393
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/rtld-memset.S
@@ -0,0 +1,23 @@ 
+/* Memset for aarch64, for the dynamic linker.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (rtld)
+# define MEMSET memset
+# include <sysdeps/aarch64/memset.S>
+#endif
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
index e769eeb..092ee81 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
@@ -20,6 +20,9 @@ 
 #include <sys/auxv.h>
 #include <elf/dl-hwcaps.h>
 
+#define DCZID_DZP_MASK (1 << 4)
+#define DCZID_BS_MASK (0xf)
+
 #if HAVE_TUNABLES
 struct cpu_list
 {
@@ -72,4 +75,11 @@  init_cpu_features (struct cpu_features *cpu_features)
     }
 
   cpu_features->midr_el1 = midr;
+
+  /* Check if ZVA is enabled.  */
+  unsigned dczid;
+  asm volatile ("mrs %0, dczid_el0" : "=r"(dczid));
+
+  if ((dczid & DCZID_DZP_MASK) == 0)
+    cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK);
 }
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
index 73cb53d..f2b6afd 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
@@ -47,6 +47,7 @@ 
 struct cpu_features
 {
   uint64_t midr_el1;
+  unsigned zva_size;
 };
 
 #endif /* _CPU_FEATURES_AARCH64_H  */