diff mbox series

AArch64: Check kernel version for SVE ifuncs

Message ID PAWPR08MB898285FB64F7186879479C94832A2@PAWPR08MB8982.eurprd08.prod.outlook.com
State New
Headers show
Series AArch64: Check kernel version for SVE ifuncs | expand

Commit Message

Wilco Dijkstra March 13, 2024, 2:31 p.m. UTC
Older Linux kernels may disable SVE after certain syscalls.  Calling the
SVE-optimized memcpy afterwards will then cause a trap to reenable SVE.
As a result, applications with a high use of syscalls may run slower with
the SVE memcpy.  Avoid this by checking the kernel version and enable the
SVE-optimized memcpy/memmove ifuncs only on Linux kernel 6.2 or newer.

Passes regress, OK for commit?

---

Comments

Adhemerval Zanella Netto March 13, 2024, 6:12 p.m. UTC | #1
On 13/03/24 11:31, Wilco Dijkstra wrote:
> 
> Older Linux kernels may disable SVE after certain syscalls.  Calling the
> SVE-optimized memcpy afterwards will then cause a trap to reenable SVE.
> As a result, applications with a high use of syscalls may run slower with
> the SVE memcpy.  Avoid this by checking the kernel version and enable the
> SVE-optimized memcpy/memmove ifuncs only on Linux kernel 6.2 or newer.
> 
> Passes regress, OK for commit?

How does it handle backports? Parsing kernel version is really not ideal
(and that's why we removed it on previous version), can't kernel provide
this information through any means (as powerpc does for HTM without
suspend, PPC_FEATURE2_HTM_NO_SUSPEND) ?

> 
> ---
> 
> diff --git a/sysdeps/aarch64/cpu-features.h b/sysdeps/aarch64/cpu-features.h
> index 77a782422af1b6e4b2af32bfebfda37874111510..5f2da91ebbd0adafb0d84ec503b0f902f566da5a 100644
> --- a/sysdeps/aarch64/cpu-features.h
> +++ b/sysdeps/aarch64/cpu-features.h
> @@ -71,6 +71,7 @@ struct cpu_features
>    /* Currently, the GLIBC memory tagging tunable only defines 8 bits.  */
>    uint8_t mte_state;
>    bool sve;
> +  bool prefer_sve_ifuncs;
>    bool mops;
>  };
>  
> diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
> index c52860efb22d70eb4bdf356781f51c7de8ec67dc..61dc40088f4d9e5e06b57bdc7d54bde1e2a686a4 100644
> --- a/sysdeps/aarch64/multiarch/init-arch.h
> +++ b/sysdeps/aarch64/multiarch/init-arch.h
> @@ -36,5 +36,7 @@
>      MTE_ENABLED ();							      \
>    bool __attribute__((unused)) sve =					      \
>      GLRO(dl_aarch64_cpu_features).sve;					      \
> +  bool __attribute__((unused)) prefer_sve_ifuncs =			      \
> +    GLRO(dl_aarch64_cpu_features).prefer_sve_ifuncs;			      \
>    bool __attribute__((unused)) mops =					      \
>      GLRO(dl_aarch64_cpu_features).mops;
> diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
> index d12eccfca51f4bcfef6ccf5aa286edb301e361ac..ce53567dab33c2f00b89b4069235abd4651811a6 100644
> --- a/sysdeps/aarch64/multiarch/memcpy.c
> +++ b/sysdeps/aarch64/multiarch/memcpy.c
> @@ -47,7 +47,7 @@ select_memcpy_ifunc (void)
>      {
>        if (IS_A64FX (midr))
>  	return __memcpy_a64fx;
> -      return __memcpy_sve;
> +      return prefer_sve_ifuncs ? __memcpy_sve : __memcpy_generic;
>      }
>  
>    if (IS_THUNDERX (midr))
> diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
> index 2081eeb4d40e0240e67a7b26b64576f44eaf18e3..fe95037be391896c7670ef606bf4d3ba7dfb6a00 100644
> --- a/sysdeps/aarch64/multiarch/memmove.c
> +++ b/sysdeps/aarch64/multiarch/memmove.c
> @@ -47,7 +47,7 @@ select_memmove_ifunc (void)
>      {
>        if (IS_A64FX (midr))
>  	return __memmove_a64fx;
> -      return __memmove_sve;
> +      return prefer_sve_ifuncs ? __memmove_sve : __memmove_generic;
>      }
>  
>    if (IS_THUNDERX (midr))
> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> index b1a3f673f067280bdacfddd92723a81e418023e5..13b02c45df80b493516b3c9d4acbbbffaa47af92 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> @@ -21,6 +21,7 @@
>  #include <sys/auxv.h>
>  #include <elf/dl-hwcaps.h>
>  #include <sys/prctl.h>
> +#include <sys/utsname.h>
>  #include <dl-tunables-parse.h>
>  
>  #define DCZID_DZP_MASK (1 << 4)
> @@ -62,6 +63,41 @@ get_midr_from_mcpu (const struct tunable_str_t *mcpu)
>    return UINT64_MAX;
>  }
>  
> +/* Parse kernel version without calling any library functions.
> +   Allow 2 digits for kernel version and 3 digits for major version,
> +   separated by '.': "kk.mmm.".
> +   Return kernel version * 1000 + major version, or -1 on failure.  */
> +
> +static inline int
> +kernel_version (void)
> +{
> +  struct utsname buf;
> +  const char *p = &buf.release[0];
> +  int kernel = 0;
> +  int major = 0;
> +
> +  if (__uname (&buf) < 0)
> +    return -1;
> +
> +  if (*p >= '0' && *p <= '9')
> +    kernel = (kernel * 10) + *p++ - '0';
> +  if (*p >= '0' && *p <= '9')
> +    kernel = (kernel * 10) + *p++ - '0';
> +  if (*p != '.')
> +    return -1;
> +  p++;
> +  if (*p >= '0' && *p <= '9')
> +    major = (major * 10) + *p++ - '0';
> +  if (*p >= '0' && *p <= '9')
> +    major = (major * 10) + *p++ - '0';
> +  if (*p >= '0' && *p <= '9')
> +    major = (major * 10) + *p++ - '0';
> +  if (*p != '.' && *p != '\0')
> +    return -1;
> +
> +  return kernel * 1000 + major;
> +}
> +
>  static inline void
>  init_cpu_features (struct cpu_features *cpu_features)
>  {
> @@ -126,6 +162,10 @@ init_cpu_features (struct cpu_features *cpu_features)
>    /* Check if SVE is supported.  */
>    cpu_features->sve = GLRO (dl_hwcap) & HWCAP_SVE;
>  
> +  /* Prefer using SVE in string ifuncs from Linux 6.2 onwards.  */
> +  cpu_features->prefer_sve_ifuncs =
> +    cpu_features->sve && kernel_version () >= 6002;
> +
>    /* Check if MOPS is supported.  */
>    cpu_features->mops = GLRO (dl_hwcap2) & HWCAP2_MOPS;
>  }
>
Szabolcs Nagy March 13, 2024, 6:39 p.m. UTC | #2
The 03/13/2024 14:31, Wilco Dijkstra wrote:
> 
> Older Linux kernels may disable SVE after certain syscalls.  Calling the
> SVE-optimized memcpy afterwards will then cause a trap to reenable SVE.
> As a result, applications with a high use of syscalls may run slower with
> the SVE memcpy.  Avoid this by checking the kernel version and enable the
> SVE-optimized memcpy/memmove ifuncs only on Linux kernel 6.2 or newer.

a kernel version check at startup is a bit ugly.

i think we should use the aarch64 kernel-features.h to define an
__ASSUME_FAST_SVE or similar based on the __LINUX_KERNEL_VERSION
(so once we raise the min kernel version to 6.2 the startup check
is not done anymore or users can config --enable-kernel=6.2.0).

(fast_sve may not be the best name, 'no_excessive_sve_traps' or
'sve_stays_enabled' may be better, but i'm not clear on the exact
kernel behaviour we want here so let you decide)

> 
> Passes regress, OK for commit?
> 

patch looks ok otherwise.

> ---
> 
> diff --git a/sysdeps/aarch64/cpu-features.h b/sysdeps/aarch64/cpu-features.h
> index 77a782422af1b6e4b2af32bfebfda37874111510..5f2da91ebbd0adafb0d84ec503b0f902f566da5a 100644
> --- a/sysdeps/aarch64/cpu-features.h
> +++ b/sysdeps/aarch64/cpu-features.h
> @@ -71,6 +71,7 @@ struct cpu_features
>    /* Currently, the GLIBC memory tagging tunable only defines 8 bits.  */
>    uint8_t mte_state;
>    bool sve;
> +  bool prefer_sve_ifuncs;
>    bool mops;
>  };
>  
> diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
> index c52860efb22d70eb4bdf356781f51c7de8ec67dc..61dc40088f4d9e5e06b57bdc7d54bde1e2a686a4 100644
> --- a/sysdeps/aarch64/multiarch/init-arch.h
> +++ b/sysdeps/aarch64/multiarch/init-arch.h
> @@ -36,5 +36,7 @@
>      MTE_ENABLED ();							      \
>    bool __attribute__((unused)) sve =					      \
>      GLRO(dl_aarch64_cpu_features).sve;					      \
> +  bool __attribute__((unused)) prefer_sve_ifuncs =			      \
> +    GLRO(dl_aarch64_cpu_features).prefer_sve_ifuncs;			      \
>    bool __attribute__((unused)) mops =					      \
>      GLRO(dl_aarch64_cpu_features).mops;
> diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
> index d12eccfca51f4bcfef6ccf5aa286edb301e361ac..ce53567dab33c2f00b89b4069235abd4651811a6 100644
> --- a/sysdeps/aarch64/multiarch/memcpy.c
> +++ b/sysdeps/aarch64/multiarch/memcpy.c
> @@ -47,7 +47,7 @@ select_memcpy_ifunc (void)
>      {
>        if (IS_A64FX (midr))
>  	return __memcpy_a64fx;
> -      return __memcpy_sve;
> +      return prefer_sve_ifuncs ? __memcpy_sve : __memcpy_generic;
>      }
>  
>    if (IS_THUNDERX (midr))
> diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
> index 2081eeb4d40e0240e67a7b26b64576f44eaf18e3..fe95037be391896c7670ef606bf4d3ba7dfb6a00 100644
> --- a/sysdeps/aarch64/multiarch/memmove.c
> +++ b/sysdeps/aarch64/multiarch/memmove.c
> @@ -47,7 +47,7 @@ select_memmove_ifunc (void)
>      {
>        if (IS_A64FX (midr))
>  	return __memmove_a64fx;
> -      return __memmove_sve;
> +      return prefer_sve_ifuncs ? __memmove_sve : __memmove_generic;
>      }
>  
>    if (IS_THUNDERX (midr))
> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> index b1a3f673f067280bdacfddd92723a81e418023e5..13b02c45df80b493516b3c9d4acbbbffaa47af92 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> @@ -21,6 +21,7 @@
>  #include <sys/auxv.h>
>  #include <elf/dl-hwcaps.h>
>  #include <sys/prctl.h>
> +#include <sys/utsname.h>
>  #include <dl-tunables-parse.h>
>  
>  #define DCZID_DZP_MASK (1 << 4)
> @@ -62,6 +63,41 @@ get_midr_from_mcpu (const struct tunable_str_t *mcpu)
>    return UINT64_MAX;
>  }
>  
> +/* Parse kernel version without calling any library functions.
> +   Allow 2 digits for kernel version and 3 digits for major version,
> +   separated by '.': "kk.mmm.".

looks reasonable.

> +   Return kernel version * 1000 + major version, or -1 on failure.  */
> +
> +static inline int
> +kernel_version (void)
> +{
> +  struct utsname buf;
> +  const char *p = &buf.release[0];
> +  int kernel = 0;
> +  int major = 0;
> +
> +  if (__uname (&buf) < 0)
> +    return -1;
> +
> +  if (*p >= '0' && *p <= '9')
> +    kernel = (kernel * 10) + *p++ - '0';
> +  if (*p >= '0' && *p <= '9')
> +    kernel = (kernel * 10) + *p++ - '0';
> +  if (*p != '.')
> +    return -1;
> +  p++;
> +  if (*p >= '0' && *p <= '9')
> +    major = (major * 10) + *p++ - '0';
> +  if (*p >= '0' && *p <= '9')
> +    major = (major * 10) + *p++ - '0';
> +  if (*p >= '0' && *p <= '9')
> +    major = (major * 10) + *p++ - '0';
> +  if (*p != '.' && *p != '\0')
> +    return -1;
> +
> +  return kernel * 1000 + major;
> +}
> +
>  static inline void
>  init_cpu_features (struct cpu_features *cpu_features)
>  {
> @@ -126,6 +162,10 @@ init_cpu_features (struct cpu_features *cpu_features)
>    /* Check if SVE is supported.  */
>    cpu_features->sve = GLRO (dl_hwcap) & HWCAP_SVE;
>  
> +  /* Prefer using SVE in string ifuncs from Linux 6.2 onwards.  */
> +  cpu_features->prefer_sve_ifuncs =
> +    cpu_features->sve && kernel_version () >= 6002;

e.g. this can be '&& fast_sve ()' and define fast_sve based
on __ASSUME_FAST_SVE.

> +
>    /* Check if MOPS is supported.  */
>    cpu_features->mops = GLRO (dl_hwcap2) & HWCAP2_MOPS;
>  }
>
Szabolcs Nagy March 13, 2024, 7:25 p.m. UTC | #3
The 03/13/2024 15:12, Adhemerval Zanella Netto wrote:
> On 13/03/24 11:31, Wilco Dijkstra wrote:
> > 
> > Older Linux kernels may disable SVE after certain syscalls.  Calling the
> > SVE-optimized memcpy afterwards will then cause a trap to reenable SVE.
> > As a result, applications with a high use of syscalls may run slower with
> > the SVE memcpy.  Avoid this by checking the kernel version and enable the
> > SVE-optimized memcpy/memmove ifuncs only on Linux kernel 6.2 or newer.
> > 
> > Passes regress, OK for commit?
> 
> How does it handle backports? Parsing kernel version is really not ideal
> (and that's why we removed it on previous version), can't kernel provide
> this information through any means (as powerpc does for HTM without
> suspend, PPC_FEATURE2_HTM_NO_SUSPEND) ?

the check is for performance only, not correctness.

if we detect a bad kernel as good, that can cause significant
slowdown due to the traps depending on the workload.
(at least this is the claim, i haven't verified, but plausible)

if we detect a good kernel as bad (e.g. because of kernel
backports), that is acceptable amount of slowdown (the fallback
memcpy is reasonably fast).

i think the kernel version check is ugly, but in this case it
makes sense: direct detection of the behaviour is hard, the
version check should reliably avoid detecting a bad kernel as
good, and it is better than always assuming a bad kernel.

> 
> > 
> > ---
> > 
> > diff --git a/sysdeps/aarch64/cpu-features.h b/sysdeps/aarch64/cpu-features.h
> > index 77a782422af1b6e4b2af32bfebfda37874111510..5f2da91ebbd0adafb0d84ec503b0f902f566da5a 100644
> > --- a/sysdeps/aarch64/cpu-features.h
> > +++ b/sysdeps/aarch64/cpu-features.h
> > @@ -71,6 +71,7 @@ struct cpu_features
> >    /* Currently, the GLIBC memory tagging tunable only defines 8 bits.  */
> >    uint8_t mte_state;
> >    bool sve;
> > +  bool prefer_sve_ifuncs;
> >    bool mops;
> >  };
> >  
> > diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
> > index c52860efb22d70eb4bdf356781f51c7de8ec67dc..61dc40088f4d9e5e06b57bdc7d54bde1e2a686a4 100644
> > --- a/sysdeps/aarch64/multiarch/init-arch.h
> > +++ b/sysdeps/aarch64/multiarch/init-arch.h
> > @@ -36,5 +36,7 @@
> >      MTE_ENABLED ();							      \
> >    bool __attribute__((unused)) sve =					      \
> >      GLRO(dl_aarch64_cpu_features).sve;					      \
> > +  bool __attribute__((unused)) prefer_sve_ifuncs =			      \
> > +    GLRO(dl_aarch64_cpu_features).prefer_sve_ifuncs;			      \
> >    bool __attribute__((unused)) mops =					      \
> >      GLRO(dl_aarch64_cpu_features).mops;
> > diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
> > index d12eccfca51f4bcfef6ccf5aa286edb301e361ac..ce53567dab33c2f00b89b4069235abd4651811a6 100644
> > --- a/sysdeps/aarch64/multiarch/memcpy.c
> > +++ b/sysdeps/aarch64/multiarch/memcpy.c
> > @@ -47,7 +47,7 @@ select_memcpy_ifunc (void)
> >      {
> >        if (IS_A64FX (midr))
> >  	return __memcpy_a64fx;
> > -      return __memcpy_sve;
> > +      return prefer_sve_ifuncs ? __memcpy_sve : __memcpy_generic;
> >      }
> >  
> >    if (IS_THUNDERX (midr))
> > diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
> > index 2081eeb4d40e0240e67a7b26b64576f44eaf18e3..fe95037be391896c7670ef606bf4d3ba7dfb6a00 100644
> > --- a/sysdeps/aarch64/multiarch/memmove.c
> > +++ b/sysdeps/aarch64/multiarch/memmove.c
> > @@ -47,7 +47,7 @@ select_memmove_ifunc (void)
> >      {
> >        if (IS_A64FX (midr))
> >  	return __memmove_a64fx;
> > -      return __memmove_sve;
> > +      return prefer_sve_ifuncs ? __memmove_sve : __memmove_generic;
> >      }
> >  
> >    if (IS_THUNDERX (midr))
> > diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> > index b1a3f673f067280bdacfddd92723a81e418023e5..13b02c45df80b493516b3c9d4acbbbffaa47af92 100644
> > --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> > +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> > @@ -21,6 +21,7 @@
> >  #include <sys/auxv.h>
> >  #include <elf/dl-hwcaps.h>
> >  #include <sys/prctl.h>
> > +#include <sys/utsname.h>
> >  #include <dl-tunables-parse.h>
> >  
> >  #define DCZID_DZP_MASK (1 << 4)
> > @@ -62,6 +63,41 @@ get_midr_from_mcpu (const struct tunable_str_t *mcpu)
> >    return UINT64_MAX;
> >  }
> >  
> > +/* Parse kernel version without calling any library functions.
> > +   Allow 2 digits for kernel version and 3 digits for major version,
> > +   separated by '.': "kk.mmm.".
> > +   Return kernel version * 1000 + major version, or -1 on failure.  */
> > +
> > +static inline int
> > +kernel_version (void)
> > +{
> > +  struct utsname buf;
> > +  const char *p = &buf.release[0];
> > +  int kernel = 0;
> > +  int major = 0;
> > +
> > +  if (__uname (&buf) < 0)
> > +    return -1;
> > +
> > +  if (*p >= '0' && *p <= '9')
> > +    kernel = (kernel * 10) + *p++ - '0';
> > +  if (*p >= '0' && *p <= '9')
> > +    kernel = (kernel * 10) + *p++ - '0';
> > +  if (*p != '.')
> > +    return -1;
> > +  p++;
> > +  if (*p >= '0' && *p <= '9')
> > +    major = (major * 10) + *p++ - '0';
> > +  if (*p >= '0' && *p <= '9')
> > +    major = (major * 10) + *p++ - '0';
> > +  if (*p >= '0' && *p <= '9')
> > +    major = (major * 10) + *p++ - '0';
> > +  if (*p != '.' && *p != '\0')
> > +    return -1;
> > +
> > +  return kernel * 1000 + major;
> > +}
> > +
> >  static inline void
> >  init_cpu_features (struct cpu_features *cpu_features)
> >  {
> > @@ -126,6 +162,10 @@ init_cpu_features (struct cpu_features *cpu_features)
> >    /* Check if SVE is supported.  */
> >    cpu_features->sve = GLRO (dl_hwcap) & HWCAP_SVE;
> >  
> > +  /* Prefer using SVE in string ifuncs from Linux 6.2 onwards.  */
> > +  cpu_features->prefer_sve_ifuncs =
> > +    cpu_features->sve && kernel_version () >= 6002;
> > +
> >    /* Check if MOPS is supported.  */
> >    cpu_features->mops = GLRO (dl_hwcap2) & HWCAP2_MOPS;
> >  }
> >
Andrew Pinski March 13, 2024, 7:31 p.m. UTC | #4
On Wed, Mar 13, 2024 at 7:32 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
>
> Older Linux kernels may disable SVE after certain syscalls.  Calling the
> SVE-optimized memcpy afterwards will then cause a trap to reenable SVE.
> As a result, applications with a high use of syscalls may run slower with
> the SVE memcpy.  Avoid this by checking the kernel version and enable the
> SVE-optimized memcpy/memmove ifuncs only on Linux kernel 6.2 or newer.
>
> Passes regress, OK for commit?
>
> ---
>
> diff --git a/sysdeps/aarch64/cpu-features.h b/sysdeps/aarch64/cpu-features.h
> index 77a782422af1b6e4b2af32bfebfda37874111510..5f2da91ebbd0adafb0d84ec503b0f902f566da5a 100644
> --- a/sysdeps/aarch64/cpu-features.h
> +++ b/sysdeps/aarch64/cpu-features.h
> @@ -71,6 +71,7 @@ struct cpu_features
>    /* Currently, the GLIBC memory tagging tunable only defines 8 bits.  */
>    uint8_t mte_state;
>    bool sve;
> +  bool prefer_sve_ifuncs;
>    bool mops;
>  };
>
> diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
> index c52860efb22d70eb4bdf356781f51c7de8ec67dc..61dc40088f4d9e5e06b57bdc7d54bde1e2a686a4 100644
> --- a/sysdeps/aarch64/multiarch/init-arch.h
> +++ b/sysdeps/aarch64/multiarch/init-arch.h
> @@ -36,5 +36,7 @@
>      MTE_ENABLED ();                                                          \
>    bool __attribute__((unused)) sve =                                         \
>      GLRO(dl_aarch64_cpu_features).sve;                                       \
> +  bool __attribute__((unused)) prefer_sve_ifuncs =                           \
> +    GLRO(dl_aarch64_cpu_features).prefer_sve_ifuncs;                         \
>    bool __attribute__((unused)) mops =                                        \
>      GLRO(dl_aarch64_cpu_features).mops;
> diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
> index d12eccfca51f4bcfef6ccf5aa286edb301e361ac..ce53567dab33c2f00b89b4069235abd4651811a6 100644
> --- a/sysdeps/aarch64/multiarch/memcpy.c
> +++ b/sysdeps/aarch64/multiarch/memcpy.c
> @@ -47,7 +47,7 @@ select_memcpy_ifunc (void)
>      {
>        if (IS_A64FX (midr))
>         return __memcpy_a64fx;
> -      return __memcpy_sve;
> +      return prefer_sve_ifuncs ? __memcpy_sve : __memcpy_generic;

Does it make sense to check prefer_sve_ifuncs before checking IS_A64FX
because this is about the use of SVE registers and the kernel version
and the a64fx versions use SVE too?

That is doing:
if (!prefer_sve_ifuncs)
  return __memcpy_generic;
if (IS_A64FX (midr))
  return __memcpy_a64fx;
return __memcpy_sve;

Thanks,
Andrew Pinski


>      }
>
>    if (IS_THUNDERX (midr))
> diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
> index 2081eeb4d40e0240e67a7b26b64576f44eaf18e3..fe95037be391896c7670ef606bf4d3ba7dfb6a00 100644
> --- a/sysdeps/aarch64/multiarch/memmove.c
> +++ b/sysdeps/aarch64/multiarch/memmove.c
> @@ -47,7 +47,7 @@ select_memmove_ifunc (void)
>      {
>        if (IS_A64FX (midr))
>         return __memmove_a64fx;
> -      return __memmove_sve;
> +      return prefer_sve_ifuncs ? __memmove_sve : __memmove_generic;
>      }
>
>    if (IS_THUNDERX (midr))
> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> index b1a3f673f067280bdacfddd92723a81e418023e5..13b02c45df80b493516b3c9d4acbbbffaa47af92 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> @@ -21,6 +21,7 @@
>  #include <sys/auxv.h>
>  #include <elf/dl-hwcaps.h>
>  #include <sys/prctl.h>
> +#include <sys/utsname.h>
>  #include <dl-tunables-parse.h>
>
>  #define DCZID_DZP_MASK (1 << 4)
> @@ -62,6 +63,41 @@ get_midr_from_mcpu (const struct tunable_str_t *mcpu)
>    return UINT64_MAX;
>  }
>
> +/* Parse kernel version without calling any library functions.
> +   Allow 2 digits for kernel version and 3 digits for major version,
> +   separated by '.': "kk.mmm.".
> +   Return kernel version * 1000 + major version, or -1 on failure.  */
> +
> +static inline int
> +kernel_version (void)
> +{
> +  struct utsname buf;
> +  const char *p = &buf.release[0];
> +  int kernel = 0;
> +  int major = 0;
> +
> +  if (__uname (&buf) < 0)
> +    return -1;
> +
> +  if (*p >= '0' && *p <= '9')
> +    kernel = (kernel * 10) + *p++ - '0';
> +  if (*p >= '0' && *p <= '9')
> +    kernel = (kernel * 10) + *p++ - '0';
> +  if (*p != '.')
> +    return -1;
> +  p++;
> +  if (*p >= '0' && *p <= '9')
> +    major = (major * 10) + *p++ - '0';
> +  if (*p >= '0' && *p <= '9')
> +    major = (major * 10) + *p++ - '0';
> +  if (*p >= '0' && *p <= '9')
> +    major = (major * 10) + *p++ - '0';
> +  if (*p != '.' && *p != '\0')
> +    return -1;
> +
> +  return kernel * 1000 + major;
> +}
> +
>  static inline void
>  init_cpu_features (struct cpu_features *cpu_features)
>  {
> @@ -126,6 +162,10 @@ init_cpu_features (struct cpu_features *cpu_features)
>    /* Check if SVE is supported.  */
>    cpu_features->sve = GLRO (dl_hwcap) & HWCAP_SVE;
>
> +  /* Prefer using SVE in string ifuncs from Linux 6.2 onwards.  */
> +  cpu_features->prefer_sve_ifuncs =
> +    cpu_features->sve && kernel_version () >= 6002;
> +
>    /* Check if MOPS is supported.  */
>    cpu_features->mops = GLRO (dl_hwcap2) & HWCAP2_MOPS;
>  }
>
Adhemerval Zanella Netto March 13, 2024, 7:55 p.m. UTC | #5
On 13/03/24 16:25, Szabolcs Nagy wrote:
> The 03/13/2024 15:12, Adhemerval Zanella Netto wrote:
>> On 13/03/24 11:31, Wilco Dijkstra wrote:
>>>
>>> Older Linux kernels may disable SVE after certain syscalls.  Calling the
>>> SVE-optimized memcpy afterwards will then cause a trap to reenable SVE.
>>> As a result, applications with a high use of syscalls may run slower with
>>> the SVE memcpy.  Avoid this by checking the kernel version and enable the
>>> SVE-optimized memcpy/memmove ifuncs only on Linux kernel 6.2 or newer.
>>>
>>> Passes regress, OK for commit?
>>
>> How does it handle backports? Parsing kernel version is really not ideal
>> (and that's why we removed it on previous version), can't kernel provide
>> this information through any means (as powerpc does for HTM without
>> suspend, PPC_FEATURE2_HTM_NO_SUSPEND) ?
> 
> the check is for performance only, not correctness.
> 
> if we detect a bad kernel as good, that can cause significant
> slowdown due to the traps depending on the workload.
> (at least this is the claim, i haven't verified, but plausible)
> 
> if we detect a good kernel as bad (e.g. because of kernel
> backports), that is acceptable amount of slowdown (the fallback
> memcpy is reasonably fast).
> 
> i think the kernel version check is ugly, but in this case it
> makes sense: direct detection of the behaviour is hard, the
> version check should reliably avoid detecting a bad kernel as
> good, and it is better than always assuming a bad kernel.

Yes, I understand this. My point is that it won't be possible to backport this 
kernel fix to get the performance improvement of SVE routines without 
hacking the glibc as well.  It is not really a blocker, but I would
expect the kernel to properly advertise such a performance change that
might interfere with ifunc selection. Maybe we can add a tunable to
force SVE selection, but I don't have a strong opinion.

And I don't think __ASSUME_FAST_SVE would work well here, it means it
would always detect a good kernel even when running on a older one
(I am not sure how usual this is).  The minimum supported kernel 
version can work to ensure that this check won't be necessary, but in
this case we won't really need this test anyway.

The _dl_discover_osversion (removed by b46d250656794e63a2946c481fda)
used to check the PT_NOTE, and fallback to uname or /proc/sys/kernel/osrelease.
I think we will need at least the osrelease fallback in the case of
uname syscall filtering. 

> 
>>
>>>
>>> ---
>>>
>>> diff --git a/sysdeps/aarch64/cpu-features.h b/sysdeps/aarch64/cpu-features.h
>>> index 77a782422af1b6e4b2af32bfebfda37874111510..5f2da91ebbd0adafb0d84ec503b0f902f566da5a 100644
>>> --- a/sysdeps/aarch64/cpu-features.h
>>> +++ b/sysdeps/aarch64/cpu-features.h
>>> @@ -71,6 +71,7 @@ struct cpu_features
>>>    /* Currently, the GLIBC memory tagging tunable only defines 8 bits.  */
>>>    uint8_t mte_state;
>>>    bool sve;
>>> +  bool prefer_sve_ifuncs;
>>>    bool mops;
>>>  };
>>>  
>>> diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
>>> index c52860efb22d70eb4bdf356781f51c7de8ec67dc..61dc40088f4d9e5e06b57bdc7d54bde1e2a686a4 100644
>>> --- a/sysdeps/aarch64/multiarch/init-arch.h
>>> +++ b/sysdeps/aarch64/multiarch/init-arch.h
>>> @@ -36,5 +36,7 @@
>>>      MTE_ENABLED ();							      \
>>>    bool __attribute__((unused)) sve =					      \
>>>      GLRO(dl_aarch64_cpu_features).sve;					      \
>>> +  bool __attribute__((unused)) prefer_sve_ifuncs =			      \
>>> +    GLRO(dl_aarch64_cpu_features).prefer_sve_ifuncs;			      \
>>>    bool __attribute__((unused)) mops =					      \
>>>      GLRO(dl_aarch64_cpu_features).mops;
>>> diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
>>> index d12eccfca51f4bcfef6ccf5aa286edb301e361ac..ce53567dab33c2f00b89b4069235abd4651811a6 100644
>>> --- a/sysdeps/aarch64/multiarch/memcpy.c
>>> +++ b/sysdeps/aarch64/multiarch/memcpy.c
>>> @@ -47,7 +47,7 @@ select_memcpy_ifunc (void)
>>>      {
>>>        if (IS_A64FX (midr))
>>>  	return __memcpy_a64fx;
>>> -      return __memcpy_sve;
>>> +      return prefer_sve_ifuncs ? __memcpy_sve : __memcpy_generic;
>>>      }
>>>  
>>>    if (IS_THUNDERX (midr))
>>> diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
>>> index 2081eeb4d40e0240e67a7b26b64576f44eaf18e3..fe95037be391896c7670ef606bf4d3ba7dfb6a00 100644
>>> --- a/sysdeps/aarch64/multiarch/memmove.c
>>> +++ b/sysdeps/aarch64/multiarch/memmove.c
>>> @@ -47,7 +47,7 @@ select_memmove_ifunc (void)
>>>      {
>>>        if (IS_A64FX (midr))
>>>  	return __memmove_a64fx;
>>> -      return __memmove_sve;
>>> +      return prefer_sve_ifuncs ? __memmove_sve : __memmove_generic;
>>>      }
>>>  
>>>    if (IS_THUNDERX (midr))
>>> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
>>> index b1a3f673f067280bdacfddd92723a81e418023e5..13b02c45df80b493516b3c9d4acbbbffaa47af92 100644
>>> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
>>> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
>>> @@ -21,6 +21,7 @@
>>>  #include <sys/auxv.h>
>>>  #include <elf/dl-hwcaps.h>
>>>  #include <sys/prctl.h>
>>> +#include <sys/utsname.h>
>>>  #include <dl-tunables-parse.h>
>>>  
>>>  #define DCZID_DZP_MASK (1 << 4)
>>> @@ -62,6 +63,41 @@ get_midr_from_mcpu (const struct tunable_str_t *mcpu)
>>>    return UINT64_MAX;
>>>  }
>>>  
>>> +/* Parse kernel version without calling any library functions.
>>> +   Allow 2 digits for kernel version and 3 digits for major version,
>>> +   separated by '.': "kk.mmm.".
>>> +   Return kernel version * 1000 + major version, or -1 on failure.  */
>>> +
>>> +static inline int
>>> +kernel_version (void)
>>> +{
>>> +  struct utsname buf;
>>> +  const char *p = &buf.release[0];
>>> +  int kernel = 0;
>>> +  int major = 0;
>>> +
>>> +  if (__uname (&buf) < 0)
>>> +    return -1;
>>> +
>>> +  if (*p >= '0' && *p <= '9')
>>> +    kernel = (kernel * 10) + *p++ - '0';
>>> +  if (*p >= '0' && *p <= '9')
>>> +    kernel = (kernel * 10) + *p++ - '0';
>>> +  if (*p != '.')
>>> +    return -1;
>>> +  p++;
>>> +  if (*p >= '0' && *p <= '9')
>>> +    major = (major * 10) + *p++ - '0';
>>> +  if (*p >= '0' && *p <= '9')
>>> +    major = (major * 10) + *p++ - '0';
>>> +  if (*p >= '0' && *p <= '9')
>>> +    major = (major * 10) + *p++ - '0';
>>> +  if (*p != '.' && *p != '\0')
>>> +    return -1;
>>> +
>>> +  return kernel * 1000 + major;
>>> +}
>>> +
>>>  static inline void
>>>  init_cpu_features (struct cpu_features *cpu_features)
>>>  {
>>> @@ -126,6 +162,10 @@ init_cpu_features (struct cpu_features *cpu_features)
>>>    /* Check if SVE is supported.  */
>>>    cpu_features->sve = GLRO (dl_hwcap) & HWCAP_SVE;
>>>  
>>> +  /* Prefer using SVE in string ifuncs from Linux 6.2 onwards.  */
>>> +  cpu_features->prefer_sve_ifuncs =
>>> +    cpu_features->sve && kernel_version () >= 6002;
>>> +
>>>    /* Check if MOPS is supported.  */
>>>    cpu_features->mops = GLRO (dl_hwcap2) & HWCAP2_MOPS;
>>>  }
>>>
Wilco Dijkstra March 13, 2024, 8:44 p.m. UTC | #6
Hi Andrew,

> Does it make sense to check prefer_sve_ifuncs before checking IS_A64FX
> because this is about the use of SVE registers and the kernel version
> and the a64fx versions use SVE too?
>
> That is doing:
> if (!prefer_sve_ifuncs)
>   return __memcpy_generic;
> if (IS_A64FX (midr))
>   return __memcpy_a64fx;
> return __memcpy_sve;

The A64FX memcpy shows a major speedup due to the very wide SVE registers. It runs
floating point code, not general purpose server code. Since the benefit of SVE is larger
and the likelihood of system calls far lower, there is no reason to change the default
for A64FX.

Cheers,
Wilco
Szabolcs Nagy March 14, 2024, 8:35 a.m. UTC | #7
The 03/13/2024 16:55, Adhemerval Zanella Netto wrote:
> On 13/03/24 16:25, Szabolcs Nagy wrote:
> > The 03/13/2024 15:12, Adhemerval Zanella Netto wrote:
> >> On 13/03/24 11:31, Wilco Dijkstra wrote:
> >>>
> >>> Older Linux kernels may disable SVE after certain syscalls.  Calling the
> >>> SVE-optimized memcpy afterwards will then cause a trap to reenable SVE.
> >>> As a result, applications with a high use of syscalls may run slower with
> >>> the SVE memcpy.  Avoid this by checking the kernel version and enable the
> >>> SVE-optimized memcpy/memmove ifuncs only on Linux kernel 6.2 or newer.
> >>>
> >>> Passes regress, OK for commit?
> >>
> >> How does it handle backports? Parsing kernel version is really not ideal
> >> (and that's why we removed it on previous version), can't kernel provide
> >> this information through any means (as powerpc does for HTM without
> >> suspend, PPC_FEATURE2_HTM_NO_SUSPEND) ?
> > 
> > the check is for performance only, not correctness.
> > 
> > if we detect a bad kernel as good, that can cause significant
> > slowdown due to the traps depending on the workload.
> > (at least this is the claim, i haven't verified, but plausible)
> > 
> > if we detect a good kernel as bad (e.g. because of kernel
> > backports), that is acceptable amount of slowdown (the fallback
> > memcpy is reasonably fast).
> > 
> > i think the kernel version check is ugly, but in this case it
> > makes sense: direct detection of the behaviour is hard, the
> > version check should reliably avoid detecting a bad kernel as
> > good, and it is better than always assuming a bad kernel.
> 
> Yes, I understand this. My point it won't be possible to backport this 
> kernel fix to get the performance improvement of SVE routines without 
> hacking the glibc as well.  It is not really a blocker, but I would
> expect kernel to do proper advertise for such performance change that
> might interfere with ifunc selection. Maybe we can add a tunable to
> force SVE selection, but I don't have a strong opinion.

right.

i can ask linux devs, if they are happy introducing an easy
check in future kernels or if the change is easy to backport
at all or we don't want to encourage that.

> 
> And I don't think __ASSUME_FAST_SVE would work well here, it means it
> would always detect a good kernel even when running on a older one

why? if it is set based on min supported kernel version then
running on older kernel is a bug. if it can be overridden by
a tunable then it is a user error if the tunable is wrong.

> (I am not sure how usual this is).  The minimum supported kernel 
> version can work to ensure that this check won't be necessary, but in
> this case we won't really need this test anyway.

you mean after min version is increased the check can be removed?
i expect all __ASSUME* based on min linux version works like that
and it's useful to have the __ASSUME exactly to be able to find
which code can be removed after a min version increase.

i don't know if distros actually adjust the min version in their
glibc, i guess that would be risky if it should work in containers,
so 6.2 min version is probably far in the future.

> 
> The _dl_discover_osversion (removed by b46d250656794e63a2946c481fda)
> used to check the PT_NOTE, and fallback to uname or /proc/sys/kernel/osrelease.
> I think we will need at least the osrelease fallback in the case of
> uname syscall filtering. 

i didn't know about the vdso PT_NOTE.

we could check osrelease there i guess. fallback is not critical
as slightly slower memcpy is not the end of the world, and there
is a balance how complicated code we want to maintain here.



> 
> > 
> >>
> >>>
> >>> ---
> >>>
> >>> diff --git a/sysdeps/aarch64/cpu-features.h b/sysdeps/aarch64/cpu-features.h
> >>> index 77a782422af1b6e4b2af32bfebfda37874111510..5f2da91ebbd0adafb0d84ec503b0f902f566da5a 100644
> >>> --- a/sysdeps/aarch64/cpu-features.h
> >>> +++ b/sysdeps/aarch64/cpu-features.h
> >>> @@ -71,6 +71,7 @@ struct cpu_features
> >>>    /* Currently, the GLIBC memory tagging tunable only defines 8 bits.  */
> >>>    uint8_t mte_state;
> >>>    bool sve;
> >>> +  bool prefer_sve_ifuncs;
> >>>    bool mops;
> >>>  };
> >>>  
> >>> diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
> >>> index c52860efb22d70eb4bdf356781f51c7de8ec67dc..61dc40088f4d9e5e06b57bdc7d54bde1e2a686a4 100644
> >>> --- a/sysdeps/aarch64/multiarch/init-arch.h
> >>> +++ b/sysdeps/aarch64/multiarch/init-arch.h
> >>> @@ -36,5 +36,7 @@
> >>>      MTE_ENABLED ();							      \
> >>>    bool __attribute__((unused)) sve =					      \
> >>>      GLRO(dl_aarch64_cpu_features).sve;					      \
> >>> +  bool __attribute__((unused)) prefer_sve_ifuncs =			      \
> >>> +    GLRO(dl_aarch64_cpu_features).prefer_sve_ifuncs;			      \
> >>>    bool __attribute__((unused)) mops =					      \
> >>>      GLRO(dl_aarch64_cpu_features).mops;
> >>> diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
> >>> index d12eccfca51f4bcfef6ccf5aa286edb301e361ac..ce53567dab33c2f00b89b4069235abd4651811a6 100644
> >>> --- a/sysdeps/aarch64/multiarch/memcpy.c
> >>> +++ b/sysdeps/aarch64/multiarch/memcpy.c
> >>> @@ -47,7 +47,7 @@ select_memcpy_ifunc (void)
> >>>      {
> >>>        if (IS_A64FX (midr))
> >>>  	return __memcpy_a64fx;
> >>> -      return __memcpy_sve;
> >>> +      return prefer_sve_ifuncs ? __memcpy_sve : __memcpy_generic;
> >>>      }
> >>>  
> >>>    if (IS_THUNDERX (midr))
> >>> diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
> >>> index 2081eeb4d40e0240e67a7b26b64576f44eaf18e3..fe95037be391896c7670ef606bf4d3ba7dfb6a00 100644
> >>> --- a/sysdeps/aarch64/multiarch/memmove.c
> >>> +++ b/sysdeps/aarch64/multiarch/memmove.c
> >>> @@ -47,7 +47,7 @@ select_memmove_ifunc (void)
> >>>      {
> >>>        if (IS_A64FX (midr))
> >>>  	return __memmove_a64fx;
> >>> -      return __memmove_sve;
> >>> +      return prefer_sve_ifuncs ? __memmove_sve : __memmove_generic;
> >>>      }
> >>>  
> >>>    if (IS_THUNDERX (midr))
> >>> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> >>> index b1a3f673f067280bdacfddd92723a81e418023e5..13b02c45df80b493516b3c9d4acbbbffaa47af92 100644
> >>> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> >>> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> >>> @@ -21,6 +21,7 @@
> >>>  #include <sys/auxv.h>
> >>>  #include <elf/dl-hwcaps.h>
> >>>  #include <sys/prctl.h>
> >>> +#include <sys/utsname.h>
> >>>  #include <dl-tunables-parse.h>
> >>>  
> >>>  #define DCZID_DZP_MASK (1 << 4)
> >>> @@ -62,6 +63,41 @@ get_midr_from_mcpu (const struct tunable_str_t *mcpu)
> >>>    return UINT64_MAX;
> >>>  }
> >>>  
> >>> +/* Parse kernel version without calling any library functions.
> >>> +   Allow 2 digits for kernel version and 3 digits for major version,
> >>> +   separated by '.': "kk.mmm.".
> >>> +   Return kernel version * 1000 + major version, or -1 on failure.  */
> >>> +
> >>> +static inline int
> >>> +kernel_version (void)
> >>> +{
> >>> +  struct utsname buf;
> >>> +  const char *p = &buf.release[0];
> >>> +  int kernel = 0;
> >>> +  int major = 0;
> >>> +
> >>> +  if (__uname (&buf) < 0)
> >>> +    return -1;
> >>> +
> >>> +  if (*p >= '0' && *p <= '9')
> >>> +    kernel = (kernel * 10) + *p++ - '0';
> >>> +  if (*p >= '0' && *p <= '9')
> >>> +    kernel = (kernel * 10) + *p++ - '0';
> >>> +  if (*p != '.')
> >>> +    return -1;
> >>> +  p++;
> >>> +  if (*p >= '0' && *p <= '9')
> >>> +    major = (major * 10) + *p++ - '0';
> >>> +  if (*p >= '0' && *p <= '9')
> >>> +    major = (major * 10) + *p++ - '0';
> >>> +  if (*p >= '0' && *p <= '9')
> >>> +    major = (major * 10) + *p++ - '0';
> >>> +  if (*p != '.' && *p != '\0')
> >>> +    return -1;
> >>> +
> >>> +  return kernel * 1000 + major;
> >>> +}
> >>> +
> >>>  static inline void
> >>>  init_cpu_features (struct cpu_features *cpu_features)
> >>>  {
> >>> @@ -126,6 +162,10 @@ init_cpu_features (struct cpu_features *cpu_features)
> >>>    /* Check if SVE is supported.  */
> >>>    cpu_features->sve = GLRO (dl_hwcap) & HWCAP_SVE;
> >>>  
> >>> +  /* Prefer using SVE in string ifuncs from Linux 6.2 onwards.  */
> >>> +  cpu_features->prefer_sve_ifuncs =
> >>> +    cpu_features->sve && kernel_version () >= 6002;
> >>> +
> >>>    /* Check if MOPS is supported.  */
> >>>    cpu_features->mops = GLRO (dl_hwcap2) & HWCAP2_MOPS;
> >>>  }
> >>>
Florian Weimer March 14, 2024, 9:02 a.m. UTC | #8
* Adhemerval Zanella Netto:

> Yes, I understand this. My point it won't be possible to backport this 
> kernel fix to get the performance improvement of SVE routines without 
> hacking the glibc as well.  It is not really a blocker, but I would
> expect kernel to do proper advertise for such performance change that
> might interfere with ifunc selection. Maybe we can add a tunable to
> force SVE selection, but I don't have a strong opinion.

I think the kernel change went into the other direction: it eliminated
the SVE trap after system calls.

Thanks,
Florian
Florian Weimer March 14, 2024, 9:06 a.m. UTC | #9
* Wilco Dijkstra:

> Older Linux kernels may disable SVE after certain syscalls.  Calling the
> SVE-optimized memcpy afterwards will then cause a trap to reenable SVE.
> As a result, applications with a high use of syscalls may run slower with
> the SVE memcpy.  Avoid this by checking the kernel version and enable the
> SVE-optimized memcpy/memmove ifuncs only on Linux kernel 6.2 or newer.

Given that the existing SVE users have not complained about this, is it
really a good idea to disable SVE string functions for them?  Not
everyone interleaves system calls and string functions in inner loops.

> +/* Parse kernel version without calling any library functions.
> +   Allow 2 digits for kernel version and 3 digits for major version,
> +   separated by '.': "kk.mmm.".
> +   Return kernel version * 1000 + major version, or -1 on failure.  */

I think you should parse the versions you exclude because you know their
syntax.  We tried to predict the layout of future kernel version strings
before and failed.

Thanks,
Florian
Adhemerval Zanella Netto March 14, 2024, 1:47 p.m. UTC | #10
On 14/03/24 05:35, Szabolcs Nagy wrote:
> The 03/13/2024 16:55, Adhemerval Zanella Netto wrote:
>> On 13/03/24 16:25, Szabolcs Nagy wrote:
>>> The 03/13/2024 15:12, Adhemerval Zanella Netto wrote:
>>>> On 13/03/24 11:31, Wilco Dijkstra wrote:
>>>>>
>>>>> Older Linux kernels may disable SVE after certain syscalls.  Calling the
>>>>> SVE-optimized memcpy afterwards will then cause a trap to reenable SVE.
>>>>> As a result, applications with a high use of syscalls may run slower with
>>>>> the SVE memcpy.  Avoid this by checking the kernel version and enable the
>>>>> SVE-optimized memcpy/memmove ifuncs only on Linux kernel 6.2 or newer.
>>>>>
>>>>> Passes regress, OK for commit?
>>>>
>>>> How does it handle backports? Parsing kernel version is really not ideal
>>>> (and that's why we removed it on previous version), can't kernel provide
>>>> this information through any means (as powerpc does for HTM without
>>>> suspend, PPC_FEATURE2_HTM_NO_SUSPEND) ?
>>>
>>> the check is for performance only, not correctness.
>>>
>>> if we detect a bad kernel as good, that can cause significant
>>> slowdown due to the traps depending on the workload.
>>> (at least this is the claim, i haven't verified, but plausible)
>>>
>>> if we detect a good kernel as bad (e.g. because of kernel
>>> backports), that is acceptable amount of slowdown (the fallback
>>> memcpy is reasonably fast).
>>>
>>> i think the kernel version check is ugly, but in this case it
>>> makes sense: direct detection of the behaviour is hard, the
>>> version check should reliably avoid detecting a bad kernel as
>>> good, and it is better than always assuming a bad kernel.
>>
>> Yes, I understand this. My point it won't be possible to backport this 
>> kernel fix to get the performance improvement of SVE routines without 
>> hacking the glibc as well.  It is not really a blocker, but I would
>> expect kernel to do proper advertise for such performance change that
>> might interfere with ifunc selection. Maybe we can add a tunable to
>> force SVE selection, but I don't have a strong opinion.
> 
> right.
> 
> i can ask linux devs, if they are happy introducing an easy
> check in future kernels or if the change is easy to backport
> at all or we don't want to encourage that.
> 
>>
>> And I don't think __ASSUME_FAST_SVE would work well here, it means it
>> would always detect a good kernel even when running on a older one
> 
> why? if it is set based on min supported kernel version then
> running on older kernel is a bug. if it can be overriden by
> a tunable then it is a user error if the tunable is wrong.
> 
>> (I am not sure how usual this is).  The minimum supported kernel 
>> version can work to ensure that this check won't be necessary, but in
>> this case we won't really need this test anyway.
> 
> you mean after min version is increased the check can be removed?
> i expect all __ASSUME* based on min linux version works like that
> and it's useful to have the __ASSUME exactly to be able to find
> which code can be removed after a min version increase.
> 
> i don't know if distros actually adjust the min version in their
> glibc, i guess that would be risky if it should work in containers,
> so 6.2 min version is probably far in the future.

All major distros I am aware of do not set --enable-kernel, so SVE
will only be selected if someone builds a glibc with
--enable-kernel=6.2 or when we raise the minimum version to 6.2.  Not
a deal breaker, but the SVE routines will end up not being actively
used for a long time.

> 
>>
>> The _dl_discover_osversion (removed by b46d250656794e63a2946c481fda)
>> used to check the PT_NOTE, and fallback to uname or /proc/sys/kernel/osrelease.
>> I think we will need at least the osrelease fallback in the case of
>> uname syscall filtering. 
> 
> i didn't know about the vdso PT_NOTE.
> 
> we could check osrelease there i guess. fallback is not critical
> as slightly slower memcpy is not the end of the world, and there
> is a balance how complicated code we want to maintain here.

I agree, maybe uname alone should suffice.
Szabolcs Nagy March 14, 2024, 2:26 p.m. UTC | #11
The 03/14/2024 10:47, Adhemerval Zanella Netto wrote:
> On 14/03/24 05:35, Szabolcs Nagy wrote:
> > The 03/13/2024 16:55, Adhemerval Zanella Netto wrote:
> >> On 13/03/24 16:25, Szabolcs Nagy wrote:
> >>> The 03/13/2024 15:12, Adhemerval Zanella Netto wrote:
> >> And I don't think __ASSUME_FAST_SVE would work well here, it means it
> >> would always detect a good kernel even when running on a older one
> > 
> > why? if it is set based on min supported kernel version then
> > running on older kernel is a bug. if it can be overriden by
> > a tunable then it is a user error if the tunable is wrong.
> > 
> >> (I am not sure how usual this is).  The minimum supported kernel 
> >> version can work to ensure that this check won't be necessary, but in
> >> this case we won't really need this test anyway.
> > 
> > you mean after min version is increased the check can be removed?
> > i expect all __ASSUME* based on min linux version works like that
> > and it's useful to have the __ASSUME exactly to be able to find
> > which code can be removed after a min version increase.
> > 
> > i don't know if distros actually adjust the min version in their
> > glibc, i guess that would be risky if it should work in containers,
> > so 6.2 min version is probably far in the future.
> 
> All major distros I am aware of does not set --enable-kernel, so SVE
> only will be selected either someone builds a glibc with
> --enable-kernel=6.2 or when we raise the minimum version to 6.2.  Not
> a deal breaker, but the SVE routines will ended up not being actively
> used for a long time.

the __ASSUME would just gate the runtime version check,
we would still do a kernel version check so sve will be
used on new kernels, the __ASSUME is there so we dont
forget to remove the check when the min version is high
enough.
Adhemerval Zanella Netto March 14, 2024, 2:28 p.m. UTC | #12
On 14/03/24 11:26, Szabolcs Nagy wrote:
> The 03/14/2024 10:47, Adhemerval Zanella Netto wrote:
>> On 14/03/24 05:35, Szabolcs Nagy wrote:
>>> The 03/13/2024 16:55, Adhemerval Zanella Netto wrote:
>>>> On 13/03/24 16:25, Szabolcs Nagy wrote:
>>>>> The 03/13/2024 15:12, Adhemerval Zanella Netto wrote:
>>>> And I don't think __ASSUME_FAST_SVE would work well here, it means it
>>>> would always detect a good kernel even when running on a older one
>>>
>>> why? if it is set based on min supported kernel version then
>>> running on older kernel is a bug. if it can be overriden by
>>> a tunable then it is a user error if the tunable is wrong.
>>>
>>>> (I am not sure how usual this is).  The minimum supported kernel 
>>>> version can work to ensure that this check won't be necessary, but in
>>>> this case we won't really need this test anyway.
>>>
>>> you mean after min version is increased the check can be removed?
>>> i expect all __ASSUME* based on min linux version works like that
>>> and it's useful to have the __ASSUME exactly to be able to find
>>> which code can be removed after a min version increase.
>>>
>>> i don't know if distros actually adjust the min version in their
>>> glibc, i guess that would be risky if it should work in containers,
>>> so 6.2 min version is probably far in the future.
>>
>> All major distros I am aware of does not set --enable-kernel, so SVE
>> only will be selected either someone builds a glibc with
>> --enable-kernel=6.2 or when we raise the minimum version to 6.2.  Not
>> a deal breaker, but the SVE routines will ended up not being actively
>> used for a long time.
> 
> the __ASSUME would just gate the runtime version check,
> we would still do a kernel version check so sve will be
> used on new kernels, the __ASSUME is there so we dont
> forget to remove the check when the min version is high
> enough.

Ah right, I assumed you suggested to use only the _ASSUME check
instead of kernel version. Sounds reasonable to me.
Wilco Dijkstra March 14, 2024, 2:42 p.m. UTC | #13
Hi Florian,

> Given that the existing SVE users have not complained about this, is it
> really a good idea to disable SVE string functions for them?  Not
> everyone interleaves system calls and string functions in inner loops.

People have found significant regressions when trying the SVE memcpy [1],
the kernel commit mentions 70% overhead in microbenchmarks [2].

Also distros appear to use a wide range of kernel versions. I've seen the
same GLIBC version used with 5.11 kernel all the way up to 6.5 in the
same version of a distro - and these are all standard installations...

Hence we need some kind of workaround to deal with old kernels before
adding more uses of SVE (new SVE ifuncs or backports of existing ones).

> +/* Parse kernel version without calling any library functions.
> +   Allow 2 digits for kernel version and 3 digits for major version,
> +   separated by '.': "kk.mmm.".
> +   Return kernel version * 1000 + major version, or -1 on failure.  */

> I think you should parse the versions you exclude because you know their
> syntax.  We tried to predict the layout of future kernel version strings
> before and failed.

Yes that is a good idea. So we could just scan for "k.m." format and if it doesn't
match or if uname fails, treat it like a new kernel. It's not a correctness issue,
so if someone uses a different format or blocks uname() in a container then it's
fine to assume it is a new kernel and enable the SVE ifuncs.

Cheers,
Wilco


[1] https://bugs.launchpad.net/ubuntu/+source/glibc/+bug/1999551/comments/45
[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/arch/arm64/kernel/fpsimd.c?id=8c845e2731041f0fdf9287dea80b039b26332c9f
Florian Weimer March 14, 2024, 2:55 p.m. UTC | #14
* Wilco Dijkstra:

> Hi Florian,
>
>> Given that the existing SVE users have not complained about this, is it
>> really a good idea to disable SVE string functions for them?  Not
>> everyone interleaves system calls and string functions in inner loops.
>
> People have found significant regressions when trying the SVE memcpy [1],
> the kernel commit mentions 70% overhead in microbenchmarks [2].
>
> Also distros appear to use a wide range of kernel versions. I've seen the
> same GLIBC version used with 5.11 kernel all the way up to 6.5 in the
> same version of a distro - and these are all standard installations...

I generally prefer we fix the component that has the bug.  With that
approach, you'd have to use your distribution contacts to request a
backport.

The workaround introduces a performance hit for those users who today
benefit from the SVE string operations, but do not suffer from the
syscall interaction issue.

In the end, it is your port, but version-based workarounds are really
unusual for glibc.

Thanks,
Florian
Wilco Dijkstra March 14, 2024, 3:38 p.m. UTC | #15
Hi Florian,

> I generally prefer we fix the component that has the bug.  With that
> approach, you'd have to to use your distribution contacts to request a
> backport.

The issue is present since SVE was added in 4.18 so it affects many kernels.
It is unlikely to be easy to backport since it relies on various other changes
to how syscalls and register state is dealt with.

Originally it was thought to be OK - and it would be if you only ever use
SVE in vectorized loops. However using a few SVE instructions everywhere
in applications breaks that model. Plus all the security features have increased
the overhead of kernel traps in recent years...

> The workaround introduces a performance hit for those users who today
> benefit from the SVE string operations, but do not suffer from the
> syscall interaction issue.

Today everybody has the performance hit of not getting the SVE memcpy
at all as it has blocked backports. For example I don't see the SVE memcpy
being used on Neoverse V1.

> In the end, it is your port, but version-based workarounds are really
> unusual for glibc.

Agreed, I'm hoping we can remove the check in a few years time when all
distros have moved to minimum kernel >= 6.2 in their supported versions.

Cheers,
Wilco
Florian Weimer March 14, 2024, 4:52 p.m. UTC | #16
* Wilco Dijkstra:

> Hi Florian,
>
>> I generally prefer we fix the component that has the bug.  With that
>> approach, you'd have to to use your distribution contacts to request a
>> backport.
>
> The issue is present since SVE was added in 4.18 so it affects many
> kernels.  It is unlikely to be easy to backport since it relies on
> various other changes to how syscalls and register state is dealt
> with.
>
> Originally it was thought to be OK - and it would be if you only ever
> use SVE in vectorized loops. However using a few SVE instructions
> everywhere in applications breaks that model. Plus all the security
> features have increased the overhead of kernel traps in recent
> years...

Yes, that might be the case.

Can we hold off merging this for a bit?  I want to cross-check a few
things internally before we go with the version check as proposed, if
that's possible.

Thanks,
Florian
Florian Weimer March 18, 2024, 11:46 a.m. UTC | #17
* Florian Weimer:

> * Wilco Dijkstra:
>
>> Hi Florian,
>>
>>> I generally prefer we fix the component that has the bug.  With that
>>> approach, you'd have to to use your distribution contacts to request a
>>> backport.
>>
>> The issue is present since SVE was added in 4.18 so it affects many
>> kernels.  It is unlikely to be easy to backport since it relies on
>> various other changes to how syscalls and register state is dealt
>> with.
>>
>> Originally it was thought to be OK - and it would be if you only ever
>> use SVE in vectorized loops. However using a few SVE instructions
>> everywhere in applications breaks that model. Plus all the security
>> features have increased the overhead of kernel traps in recent
>> years...
>
> Yes, that might be the case.
>
> Can we hold off merging is for a bit?  I want to cross-checks a few
> things internally before we go with the version check as proposed if
> that's possible.

I would suggest to check for version >= 6.2 || version == 5.14.0.  At
this point, people running 5.14 are very likely on the el9 kernel or a
derivative, and we have backported the upstream fix into it:

  arm64/sve: Leave SVE enabled on syscall if we don't context switch 
  <https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/commit/564deeeb8038b29b91aafe2871c06aaa299145b1>

There are no plans to backport this into the el8 kernel that I know of,
and the window for such changes is more or less closed at this point, so
a similar check for 4.18.0 is not needed.

I understand that this makes the check even uglier, but that's the
nature of version checks. 8-(

Thanks,
Florian
Wilco Dijkstra March 18, 2024, 2:22 p.m. UTC | #18
Hi Florian,

> I would suggest to check for version >= 6.2 || version == 5.14.0.  At
> this point, people running 5.14 are very likely on the el9 kernel or a
> derivative, and we have backported the upstream fix into it:

Thanks for checking that, I've updated the patch to exclude 5.14.0.

> I understand that this makes the check even uglier, but that's the
> nature of version checks. 8-(

Yes, but it's unavoidable if the kernel doesn't add performance related
HWCAPs for us to check. I cleaned up the parser a little by using the same
8:8:8 format as used by __LINUX_KERNEL_VERSION.

Cheers,
Wilco
diff mbox series

Patch

diff --git a/sysdeps/aarch64/cpu-features.h b/sysdeps/aarch64/cpu-features.h
index 77a782422af1b6e4b2af32bfebfda37874111510..5f2da91ebbd0adafb0d84ec503b0f902f566da5a 100644
--- a/sysdeps/aarch64/cpu-features.h
+++ b/sysdeps/aarch64/cpu-features.h
@@ -71,6 +71,7 @@  struct cpu_features
   /* Currently, the GLIBC memory tagging tunable only defines 8 bits.  */
   uint8_t mte_state;
   bool sve;
+  bool prefer_sve_ifuncs;
   bool mops;
 };
 
diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
index c52860efb22d70eb4bdf356781f51c7de8ec67dc..61dc40088f4d9e5e06b57bdc7d54bde1e2a686a4 100644
--- a/sysdeps/aarch64/multiarch/init-arch.h
+++ b/sysdeps/aarch64/multiarch/init-arch.h
@@ -36,5 +36,7 @@ 
     MTE_ENABLED ();							      \
   bool __attribute__((unused)) sve =					      \
     GLRO(dl_aarch64_cpu_features).sve;					      \
+  bool __attribute__((unused)) prefer_sve_ifuncs =			      \
+    GLRO(dl_aarch64_cpu_features).prefer_sve_ifuncs;			      \
   bool __attribute__((unused)) mops =					      \
     GLRO(dl_aarch64_cpu_features).mops;
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index d12eccfca51f4bcfef6ccf5aa286edb301e361ac..ce53567dab33c2f00b89b4069235abd4651811a6 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -47,7 +47,7 @@  select_memcpy_ifunc (void)
     {
       if (IS_A64FX (midr))
 	return __memcpy_a64fx;
-      return __memcpy_sve;
+      return prefer_sve_ifuncs ? __memcpy_sve : __memcpy_generic;
     }
 
   if (IS_THUNDERX (midr))
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index 2081eeb4d40e0240e67a7b26b64576f44eaf18e3..fe95037be391896c7670ef606bf4d3ba7dfb6a00 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -47,7 +47,7 @@  select_memmove_ifunc (void)
     {
       if (IS_A64FX (midr))
 	return __memmove_a64fx;
-      return __memmove_sve;
+      return prefer_sve_ifuncs ? __memmove_sve : __memmove_generic;
     }
 
   if (IS_THUNDERX (midr))
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
index b1a3f673f067280bdacfddd92723a81e418023e5..13b02c45df80b493516b3c9d4acbbbffaa47af92 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
@@ -21,6 +21,7 @@ 
 #include <sys/auxv.h>
 #include <elf/dl-hwcaps.h>
 #include <sys/prctl.h>
+#include <sys/utsname.h>
 #include <dl-tunables-parse.h>
 
 #define DCZID_DZP_MASK (1 << 4)
@@ -62,6 +63,41 @@  get_midr_from_mcpu (const struct tunable_str_t *mcpu)
   return UINT64_MAX;
 }
 
+/* Parse kernel version without calling any library functions.
+   Allow 2 digits for kernel version and 3 digits for major version,
+   separated by '.': "kk.mmm.".
+   Return kernel version * 1000 + major version, or -1 on failure.  */
+
+static inline int
+kernel_version (void)
+{
+  struct utsname buf;
+  const char *p = &buf.release[0];
+  int kernel = 0;
+  int major = 0;
+
+  if (__uname (&buf) < 0)
+    return -1;
+
+  if (*p >= '0' && *p <= '9')
+    kernel = (kernel * 10) + *p++ - '0';
+  if (*p >= '0' && *p <= '9')
+    kernel = (kernel * 10) + *p++ - '0';
+  if (*p != '.')
+    return -1;
+  p++;
+  if (*p >= '0' && *p <= '9')
+    major = (major * 10) + *p++ - '0';
+  if (*p >= '0' && *p <= '9')
+    major = (major * 10) + *p++ - '0';
+  if (*p >= '0' && *p <= '9')
+    major = (major * 10) + *p++ - '0';
+  if (*p != '.' && *p != '\0')
+    return -1;
+
+  return kernel * 1000 + major;
+}
+
 static inline void
 init_cpu_features (struct cpu_features *cpu_features)
 {
@@ -126,6 +162,10 @@  init_cpu_features (struct cpu_features *cpu_features)
   /* Check if SVE is supported.  */
   cpu_features->sve = GLRO (dl_hwcap) & HWCAP_SVE;
 
+  /* Prefer using SVE in string ifuncs from Linux 6.2 onwards.  */
+  cpu_features->prefer_sve_ifuncs =
+    cpu_features->sve && kernel_version () >= 6002;
+
   /* Check if MOPS is supported.  */
   cpu_features->mops = GLRO (dl_hwcap2) & HWCAP2_MOPS;
 }