diff mbox series

[v3] x86-64: Update _dl_tlsdesc_dynamic to preserve vector registers

Message ID 20240212132657.3478716-1-hjl.tools@gmail.com
State New
Headers show
Series [v3] x86-64: Update _dl_tlsdesc_dynamic to preserve vector registers | expand

Commit Message

H.J. Lu Feb. 12, 2024, 1:26 p.m. UTC
Changes in v3:

1. Don't add GLRO(dl_x86_64_tlsdesc_dynamic) to libc.a.

Changes in v2:

1.  Add GLRO(dl_x86_64_runtime_resolve) to optimize
elf_machine_runtime_setup.

---
The compiler generates the following instruction sequence for GNU2 dynamic
TLS access:

	leaq	tls_var@TLSDESC(%rip), %rax
	call	*tls_var@TLSCALL(%rax)

The CALL instruction is transparent to the compiler, which assumes that
all registers, except for RAX, are unchanged after the CALL.  At run-time,
_dl_tlsdesc_dynamic is called, which calls __tls_get_addr on the
slow path.  __tls_get_addr is a normal function which doesn't
preserve any caller-saved registers.  _dl_tlsdesc_dynamic saves and
restores the integer caller-saved registers, but doesn't preserve any
vector registers, which are also caller-saved.  Add _dl_tlsdesc_dynamic
IFUNC functions for FXSAVE, XSAVE and XSAVEC to save and restore
all vector registers.  This fixes BZ #31372.

Add GLRO(dl_x86_64_runtime_resolve) with GLRO(dl_x86_64_tlsdesc_dynamic)
to optimize elf_machine_runtime_setup.
---
 elf/Makefile                         |  13 ++
 elf/tst-gnu2-tls2.c                  |  89 ++++++++++++++
 elf/tst-gnu2-tls2.h                  |  26 ++++
 elf/tst-gnu2-tls2mod0.c              |  28 +++++
 elf/tst-gnu2-tls2mod1.c              |  28 +++++
 elf/tst-gnu2-tls2mod2.c              |  28 +++++
 sysdeps/x86/cpu-features.c           |  37 ++++++
 sysdeps/x86_64/dl-machine.h          |  19 +--
 sysdeps/x86_64/dl-procinfo.c         |  32 +++++
 sysdeps/x86_64/dl-tlsdesc-dynamic.h  | 170 +++++++++++++++++++++++++++
 sysdeps/x86_64/dl-tlsdesc.S          | 108 +++++------------
 sysdeps/x86_64/dl-trampoline-save.h  |  36 ++++++
 sysdeps/x86_64/dl-trampoline-state.h |  51 ++++++++
 sysdeps/x86_64/dl-trampoline.S       |  20 +---
 sysdeps/x86_64/dl-trampoline.h       |  34 +-----
 15 files changed, 576 insertions(+), 143 deletions(-)
 create mode 100644 elf/tst-gnu2-tls2.c
 create mode 100644 elf/tst-gnu2-tls2.h
 create mode 100644 elf/tst-gnu2-tls2mod0.c
 create mode 100644 elf/tst-gnu2-tls2mod1.c
 create mode 100644 elf/tst-gnu2-tls2mod2.c
 create mode 100644 sysdeps/x86_64/dl-tlsdesc-dynamic.h
 create mode 100644 sysdeps/x86_64/dl-trampoline-save.h
 create mode 100644 sysdeps/x86_64/dl-trampoline-state.h

Comments

Noah Goldstein Feb. 12, 2024, 6:05 p.m. UTC | #1
On Mon, Feb 12, 2024 at 1:27 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> Changes in v3:
>
> 1. Don't add GLRO(dl_x86_64_tlsdesc_dynamic) to libc.a.
>
> Changes in v2:
>
> 1.  Add GLRO(dl_x86_64_runtime_resolve) to optimize
> elf_machine_runtime_setup.
>
> ---
> Compiler generates the following instruction sequence for GNU2 dynamic
> TLS access:
>
>         leaq    tls_var@TLSDESC(%rip), %rax
>         call    *tls_var@TLSCALL(%rax)
>
> CALL instruction may be transparent to compiler which assumes all
> registers, except for RAX, are unchanged after CALL.  At run-time,
> _dl_tlsdesc_dynamic is called, which calls __tls_get_addr on the
> slow path.  __tls_get_addr is a normal function which doesn't
> preserve any caller-saved registers.  _dl_tlsdesc_dynamic saves and
> restores integer caller-saved registers, but doesn't preserve any
> vector registers which are caller-saved.  Add _dl_tlsdesc_dynamic
> IFUNC functions for FXSAVE, XSAVE and XSAVEC to save and restore
> all vector registers.  This fixes BZ #31372.
>
> Add GLRO(dl_x86_64_runtime_resolve) with GLRO(dl_x86_64_tlsdesc_dynamic)
> to optimize elf_machine_runtime_setup.
> ---
>  elf/Makefile                         |  13 ++
>  elf/tst-gnu2-tls2.c                  |  89 ++++++++++++++
>  elf/tst-gnu2-tls2.h                  |  26 ++++
>  elf/tst-gnu2-tls2mod0.c              |  28 +++++
>  elf/tst-gnu2-tls2mod1.c              |  28 +++++
>  elf/tst-gnu2-tls2mod2.c              |  28 +++++
>  sysdeps/x86/cpu-features.c           |  37 ++++++
>  sysdeps/x86_64/dl-machine.h          |  19 +--
>  sysdeps/x86_64/dl-procinfo.c         |  32 +++++
>  sysdeps/x86_64/dl-tlsdesc-dynamic.h  | 170 +++++++++++++++++++++++++++
>  sysdeps/x86_64/dl-tlsdesc.S          | 108 +++++------------
>  sysdeps/x86_64/dl-trampoline-save.h  |  36 ++++++
>  sysdeps/x86_64/dl-trampoline-state.h |  51 ++++++++
>  sysdeps/x86_64/dl-trampoline.S       |  20 +---
>  sysdeps/x86_64/dl-trampoline.h       |  34 +-----
>  15 files changed, 576 insertions(+), 143 deletions(-)
>  create mode 100644 elf/tst-gnu2-tls2.c
>  create mode 100644 elf/tst-gnu2-tls2.h
>  create mode 100644 elf/tst-gnu2-tls2mod0.c
>  create mode 100644 elf/tst-gnu2-tls2mod1.c
>  create mode 100644 elf/tst-gnu2-tls2mod2.c
>  create mode 100644 sysdeps/x86_64/dl-tlsdesc-dynamic.h
>  create mode 100644 sysdeps/x86_64/dl-trampoline-save.h
>  create mode 100644 sysdeps/x86_64/dl-trampoline-state.h
>
> diff --git a/elf/Makefile b/elf/Makefile
> index 5d78b659ce..ff15ec16dd 100644
> --- a/elf/Makefile
> +++ b/elf/Makefile
> @@ -424,6 +424,7 @@ tests += \
>    tst-glibc-hwcaps-prepend \
>    tst-global1 \
>    tst-global2 \
> +  tst-gnu2-tls2 \
>    tst-initfinilazyfail \
>    tst-initorder \
>    tst-initorder2 \
> @@ -846,6 +847,9 @@ modules-names += \
>    tst-filterobj-flt \
>    tst-finilazyfailmod \
>    tst-globalmod2 \
> +  tst-gnu2-tls2mod0 \
> +  tst-gnu2-tls2mod1 \
> +  tst-gnu2-tls2mod2 \
>    tst-initlazyfailmod \
>    tst-initorder2a \
>    tst-initorder2b \
> @@ -3044,8 +3048,17 @@ $(objpfx)tst-tlsgap.out: \
>    $(objpfx)tst-tlsgap-mod0.so \
>    $(objpfx)tst-tlsgap-mod1.so \
>    $(objpfx)tst-tlsgap-mod2.so
> +
> +$(objpfx)tst-gnu2-tls2: $(shared-thread-library)
> +$(objpfx)tst-gnu2-tls2.out: \
> +  $(objpfx)tst-gnu2-tls2mod0.so \
> +  $(objpfx)tst-gnu2-tls2mod1.so \
> +  $(objpfx)tst-gnu2-tls2mod2.so
>  ifeq (yes,$(have-mtls-dialect-gnu2))
>  CFLAGS-tst-tlsgap-mod0.c += -mtls-dialect=gnu2
>  CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2
>  CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2
> +CFLAGS-tst-gnu2-tls2mod0.c += -mtls-dialect=gnu2
> +CFLAGS-tst-gnu2-tls2mod1.c += -mtls-dialect=gnu2
> +CFLAGS-tst-gnu2-tls2mod2.c += -mtls-dialect=gnu2
>  endif
> diff --git a/elf/tst-gnu2-tls2.c b/elf/tst-gnu2-tls2.c
> new file mode 100644
> index 0000000000..36547efb6d
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2.c
> @@ -0,0 +1,89 @@
> +/* Test TLSDESC relocation.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <stdio.h>
> +#include <string.h>
> +#include <dlfcn.h>
> +#include <pthread.h>
> +#include <support/xdlfcn.h>
> +#include <support/xthread.h>
> +#include <support/check.h>
> +#include "tst-gnu2-tls2.h"
> +
> +static void *mod[3];
> +#define MOD(i) "tst-gnu2-tls2mod" #i ".so"
> +static const char *modname[3] = { MOD(0), MOD(1), MOD(2) };
> +#undef MOD
> +
> +static void
> +open_mod (int i)
> +{
> +  mod[i] = xdlopen (modname[i], RTLD_LAZY);
> +  printf ("open %s\n", modname[i]);
> +}
> +
> +static void
> +close_mod (int i)
> +{
> +  xdlclose (mod[i]);
> +  mod[i] = NULL;
> +  printf ("close %s\n", modname[i]);
> +}
> +
> +static void
> +access_mod (int i, const char *sym)
> +{
> +  struct tls var = { -1, -1, -1, -1 };
> +  struct tls *(*f) (struct tls *) = xdlsym (mod[i], sym);
> +  struct tls *p = f (&var);
> +  printf ("access %s: %s() = %p\n", modname[i], sym, p);
> +  TEST_VERIFY_EXIT (memcmp (p, &var, sizeof (var)) == 0);
> +  ++(p->a);
> +}
> +
> +static void *
> +start (void *arg)
> +{
> +  /* The DTV generation is at the last dlopen of mod0 and the
> +     entry for mod1 is NULL.  */
> +
> +  open_mod (1); /* Reuse modid of mod1. Uses dynamic TLS.  */
> +
> +  /* Force the slow path in GNU2 TLS descriptor call.  */
> +  access_mod (1, "apply_tls");
> +
> +  return arg;
> +}
> +
> +static int
> +do_test (void)
> +{
> +  open_mod (0);
> +  open_mod (1);
> +  open_mod (2);
> +  close_mod (0);
> +  close_mod (1); /* Create modid gap at mod1.  */
> +  open_mod (0); /* Reuse modid of mod0, bump generation count.  */
> +
> +  /* Create a thread where DTV of mod1 is NULL.  */
> +  pthread_t t = xpthread_create (NULL, start, NULL);
> +  xpthread_join (t);
> +  return 0;
> +}
> +
> +#include <support/test-driver.c>
> diff --git a/elf/tst-gnu2-tls2.h b/elf/tst-gnu2-tls2.h
> new file mode 100644
> index 0000000000..e33f4dbe27
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2.h
> @@ -0,0 +1,26 @@
> +/* Test TLSDESC relocation.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <stdint.h>
> +
> +struct tls
> +{
> +  int64_t a, b, c, d;
> +};
> +
> +extern struct tls *apply_tls (struct tls *);
> diff --git a/elf/tst-gnu2-tls2mod0.c b/elf/tst-gnu2-tls2mod0.c
> new file mode 100644
> index 0000000000..67dc0d464d
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2mod0.c
> @@ -0,0 +1,28 @@
> +/* DSO used by tst-gnu2-tls2.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "tst-gnu2-tls2.h"
> +
> +__thread struct tls tls_var0 __attribute__ ((visibility ("hidden")));
> +
> +struct tls *
> +apply_tls (struct tls *p)
> +{
> +  tls_var0 = *p;
> +  return &tls_var0;
> +}
> diff --git a/elf/tst-gnu2-tls2mod1.c b/elf/tst-gnu2-tls2mod1.c
> new file mode 100644
> index 0000000000..a4ae6db24f
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2mod1.c
> @@ -0,0 +1,28 @@
> +/* DSO used by tst-gnu2-tls2.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "tst-gnu2-tls2.h"
> +
> +__thread struct tls tls_var1[100] __attribute__ ((visibility ("hidden")));
> +
> +struct tls *
> +apply_tls (struct tls *p)
> +{
> +  tls_var1[1] = *p;
> +  return &tls_var1[1];
> +}
> diff --git a/elf/tst-gnu2-tls2mod2.c b/elf/tst-gnu2-tls2mod2.c
> new file mode 100644
> index 0000000000..2d13921717
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2mod2.c
> @@ -0,0 +1,28 @@
> +/* DSO used by tst-gnu2-tls2.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "tst-gnu2-tls2.h"
> +
> +__thread struct tls tls_var2 __attribute__ ((visibility ("hidden")));
> +
> +struct tls *
> +apply_tls (struct tls *p)
> +{
> +  tls_var2 = *p;
> +  return &tls_var2;
> +}
> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> index 25e6622a79..aafdaee3e3 100644
> --- a/sysdeps/x86/cpu-features.c
> +++ b/sysdeps/x86/cpu-features.c
> @@ -47,6 +47,16 @@ TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp)
>                  : plt_rewrite_jmp);
>      }
>  }
> +
> +extern void _dl_tlsdesc_dynamic_fxsave (void) attribute_hidden;
> +extern void _dl_tlsdesc_dynamic_xsave (void) attribute_hidden;
> +extern void _dl_tlsdesc_dynamic_xsavec (void) attribute_hidden;
> +#endif
> +
> +#ifdef __x86_64__
> +extern void _dl_runtime_resolve_fxsave (void) attribute_hidden;
> +extern void _dl_runtime_resolve_xsave (void) attribute_hidden;
> +extern void _dl_runtime_resolve_xsavec (void) attribute_hidden;
>  #endif
>
>  #ifdef __LP64__
> @@ -1130,6 +1140,33 @@ no_cpuid:
>                TUNABLE_CALLBACK (set_x86_shstk));
>  #endif
>
> +# ifdef __x86_64__
> +  if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> +    {
> +      if (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC))
> +       {
> +         GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsavec;
> +# ifdef SHARED
> +         GLRO(dl_x86_64_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsavec;
> +# endif
> +       }
> +      else
> +       {
> +         GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsave;
> +# ifdef SHARED
> +         GLRO(dl_x86_64_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsave;
> +# endif
> +       }
> +    }
> +  else
> +    {
> +      GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_fxsave;
> +# ifdef SHARED
> +      GLRO(dl_x86_64_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave;
> +# endif
> +    }
> +#endif
> +
>  #ifdef SHARED
>  # ifdef __x86_64__
>    TUNABLE_GET (plt_rewrite, tunable_val_t *,
> diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
> index 6d605d0d32..74b977fd3c 100644
> --- a/sysdeps/x86_64/dl-machine.h
> +++ b/sysdeps/x86_64/dl-machine.h
> @@ -71,9 +71,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>                            int lazy, int profile)
>  {
>    Elf64_Addr *got;
> -  extern void _dl_runtime_resolve_fxsave (ElfW(Word)) attribute_hidden;
> -  extern void _dl_runtime_resolve_xsave (ElfW(Word)) attribute_hidden;
> -  extern void _dl_runtime_resolve_xsavec (ElfW(Word)) attribute_hidden;
>    extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
>    extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
>    extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
> @@ -96,8 +93,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>        /* Identify this shared object.  */
>        *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
>
> -      const struct cpu_features* cpu_features = __get_cpu_features ();
> -
>  #ifdef SHARED
>        /* The got[2] entry contains the address of a function which gets
>          called to get the address of a so far unresolved function and
> @@ -107,6 +102,7 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>          end in this function.  */
>        if (__glibc_unlikely (profile))
>         {
> +         const struct cpu_features* cpu_features = __get_cpu_features ();
>           if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
>             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
>           else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
> @@ -126,15 +122,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>           /* This function will get called to fix up the GOT entry
>              indicated by the offset on the stack, and then jump to
>              the resolved address.  */
> -         if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
> -             || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> -           *(ElfW(Addr) *) (got + 2)
> -             = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
> -                ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
> -                : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
> -         else
> -           *(ElfW(Addr) *) (got + 2)
> -             = (ElfW(Addr)) &_dl_runtime_resolve_fxsave;
> +         *(ElfW(Addr) *) (got + 2)
> +           = (ElfW(Addr)) GLRO(dl_x86_64_runtime_resolve);
>         }
>      }
>
> @@ -383,7 +372,7 @@ and creates an unsatisfiable circular dependency.\n",
>                   {
>                     td->arg = _dl_make_tlsdesc_dynamic
>                       (sym_map, sym->st_value + reloc->r_addend);
> -                   td->entry = _dl_tlsdesc_dynamic;
> +                   td->entry = GLRO(dl_x86_64_tlsdesc_dynamic);
>                   }
>                 else
>  #  endif
> diff --git a/sysdeps/x86_64/dl-procinfo.c b/sysdeps/x86_64/dl-procinfo.c
> index 4d1d790fbb..8f317a502c 100644
> --- a/sysdeps/x86_64/dl-procinfo.c
> +++ b/sysdeps/x86_64/dl-procinfo.c
> @@ -41,5 +41,37 @@
>
>  #include <sysdeps/x86/dl-procinfo.c>
>
> +#if defined SHARED && !IS_IN (ldconfig)
> +# if !defined PROCINFO_DECL
> +  ._dl_x86_64_tlsdesc_dynamic
> +# else
> +PROCINFO_CLASS void * _dl_x86_64_tlsdesc_dynamic
> +# endif
> +# ifndef PROCINFO_DECL
> += NULL
> +# endif
> +# ifdef PROCINFO_DECL
> +;
> +# else
> +,
> +# endif
> +#endif
> +
> +#if !IS_IN (ldconfig)
> +# if !defined PROCINFO_DECL && defined SHARED
> +  ._dl_x86_64_runtime_resolve
> +# else
> +PROCINFO_CLASS void * _dl_x86_64_runtime_resolve
> +# endif
> +# ifndef PROCINFO_DECL
> += NULL
> +# endif
> +# if !defined SHARED || defined PROCINFO_DECL
> +;
> +# else
> +,
> +# endif
> +#endif
> +
>  #undef PROCINFO_DECL
>  #undef PROCINFO_CLASS
> diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
> new file mode 100644
> index 0000000000..b708d0d9e4
> --- /dev/null
> +++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
> @@ -0,0 +1,170 @@
> +/* Thread-local storage handling in the ELF dynamic linker.  x86_64 version.
> +   Copyright (C) 2004-2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef SECTION
> +# define SECTION(p)    p
> +#endif
> +
> +#undef REGISTER_SAVE_AREA
> +#undef LOCAL_STORAGE_AREA
> +#undef BASE
> +
> +#include "dl-trampoline-state.h"
> +
> +       .section SECTION(.text),"ax",@progbits
> +
> +       .hidden _dl_tlsdesc_dynamic
> +       .global _dl_tlsdesc_dynamic
> +       .type   _dl_tlsdesc_dynamic,@function
> +
> +     /* %rax points to the TLS descriptor, such that 0(%rax) points to
> +       _dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
> +       tlsdesc_dynamic_arg object.  It must return in %rax the offset
> +       between the thread pointer and the object denoted by the
> +       argument, without clobbering any registers.
> +
> +       The assembly code that follows is a rendition of the following
> +       C code, hand-optimized a little bit.
> +
> +ptrdiff_t
> +_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
> +{
> +  struct tlsdesc_dynamic_arg *td = tdp->arg;
> +  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> +  if (__builtin_expect (td->gen_count <= dtv[0].counter
> +                       && (dtv[td->tlsinfo.ti_module].pointer.val
> +                           != TLS_DTV_UNALLOCATED),
> +                       1))
> +    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> +      - __thread_pointer;
> +
> +  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
> +}
> +*/
> +       cfi_startproc
> +       .align 16
> +_dl_tlsdesc_dynamic:
> +       _CET_ENDBR
> +       /* Preserve call-clobbered registers that we modify.
> +          We need two scratch regs anyway.  */
> +       movq    %rsi, -16(%rsp)
> +       mov     %fs:DTV_OFFSET, %RSI_LP
> +       movq    %rdi, -8(%rsp)
> +       movq    TLSDESC_ARG(%rax), %rdi
> +       movq    (%rsi), %rax
> +       cmpq    %rax, TLSDESC_GEN_COUNT(%rdi)
> +       ja      2f
> +       movq    TLSDESC_MODID(%rdi), %rax
> +       salq    $4, %rax
> +       movq    (%rax,%rsi), %rax
> +       cmpq    $-1, %rax
> +       je      2f
> +       addq    TLSDESC_MODOFF(%rdi), %rax
> +1:
> +       movq    -16(%rsp), %rsi
> +       sub     %fs:0, %RAX_LP
> +       movq    -8(%rsp), %rdi
> +       ret
> +2:
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +       movq    %rbx, -24(%rsp)
> +       mov     %RSP_LP, %RBX_LP
> +       cfi_def_cfa_register(%rbx)
> +       and     $-STATE_SAVE_ALIGNMENT, %RSP_LP
> +#endif
> +#ifdef REGISTER_SAVE_AREA
> +# if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +       # STATE_SAVE_OFFSET has space for 8 integer registers.  But we
> +       # need space for RCX, RDX, RSI, RDI, R8, R9, R10 and R11, plus
> +       # RBX above.
> +       sub     $(REGISTER_SAVE_AREA + STATE_SAVE_ALIGNMENT), %RSP_LP
> +# else
> +       sub     $REGISTER_SAVE_AREA, %RSP_LP
> +       cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
> +# endif
> +#else
> +       # Allocate stack space of the required size to save the state.
> +# if IS_IN (rtld)
> +       sub     _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
> +# else
> +       sub      _dl_x86_cpu_features+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
> +# endif
> +#endif
> +       /* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9,
> +          r10 and r11.  */
> +       movq    %rcx, REGISTER_SAVE_RCX(%rsp)
> +       movq    %rdx, REGISTER_SAVE_RDX(%rsp)
> +       movq    %r8, REGISTER_SAVE_R8(%rsp)
> +       movq    %r9, REGISTER_SAVE_R9(%rsp)
> +       movq    %r10, REGISTER_SAVE_R10(%rsp)
> +       movq    %r11, REGISTER_SAVE_R11(%rsp)
> +#ifdef USE_FXSAVE
> +       fxsave  STATE_SAVE_OFFSET(%rsp)
> +#else
> +       movl    $STATE_SAVE_MASK, %eax
> +       xorl    %edx, %edx
With EDX zeroed, the upper half of the EDX:EAX mask is clear, so state components [32:62] are excluded.  Do we not need to save/restore them?
> +       # Clear the XSAVE Header.
> +# ifdef USE_XSAVE
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512)(%rsp)
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8)(%rsp)
> +# endif
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 2)(%rsp)
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 3)(%rsp)
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 4)(%rsp)
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 5)(%rsp)
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 6)(%rsp)
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 7)(%rsp)
What is the purpose of this memset? Isn't this memory going to
be immediately overwritten by the xsave?
> +# ifdef USE_XSAVE
> +       xsave   STATE_SAVE_OFFSET(%rsp)
> +# else
> +       xsavec  STATE_SAVE_OFFSET(%rsp)
> +# endif
> +#endif
> +       /* %rdi already points to the tlsinfo data structure.  */
> +       call    HIDDEN_JUMPTARGET (__tls_get_addr)
> +       # Get register content back.
> +#ifdef USE_FXSAVE
> +       fxrstor STATE_SAVE_OFFSET(%rsp)
> +#else
> +       /* Save and restore __tls_get_addr return value stored in RAX.  */
> +       mov     %RAX_LP, %RCX_LP
> +       movl    $STATE_SAVE_MASK, %eax
> +       xorl    %edx, %edx
> +       xrstor  STATE_SAVE_OFFSET(%rsp)
> +       mov     %RCX_LP, %RAX_LP
> +#endif
> +       movq    REGISTER_SAVE_R11(%rsp), %r11
> +       movq    REGISTER_SAVE_R10(%rsp), %r10
> +       movq    REGISTER_SAVE_R9(%rsp), %r9
> +       movq    REGISTER_SAVE_R8(%rsp), %r8
> +       movq    REGISTER_SAVE_RDX(%rsp), %rdx
> +       movq    REGISTER_SAVE_RCX(%rsp), %rcx
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +       mov     %RBX_LP, %RSP_LP
> +       cfi_def_cfa_register(%rsp)
> +       movq    -24(%rsp), %rbx
> +       cfi_restore(%rbx)
> +#else
> +       add     $REGISTER_SAVE_AREA, %RSP_LP
> +       cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA)
> +#endif
> +       jmp     1b
> +       cfi_endproc
> +       .size   _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> +
> +#undef STATE_SAVE_ALIGNMENT
> diff --git a/sysdeps/x86_64/dl-tlsdesc.S b/sysdeps/x86_64/dl-tlsdesc.S
> index f748af2ece..ea69f5223a 100644
> --- a/sysdeps/x86_64/dl-tlsdesc.S
> +++ b/sysdeps/x86_64/dl-tlsdesc.S
> @@ -18,7 +18,19 @@
>
>  #include <sysdep.h>
>  #include <tls.h>
> +#include <cpu-features-offsets.h>
> +#include <features-offsets.h>
>  #include "tlsdesc.h"
> +#include "dl-trampoline-save.h"
> +
> +/* Area on stack to save and restore registers used for parameter
> +   passing when calling _dl_tlsdesc_dynamic.  */
> +#define REGISTER_SAVE_RCX      0
> +#define REGISTER_SAVE_RDX      (REGISTER_SAVE_RCX + 8)
> +#define REGISTER_SAVE_R8       (REGISTER_SAVE_RDX + 8)
> +#define REGISTER_SAVE_R9       (REGISTER_SAVE_R8 + 8)
> +#define REGISTER_SAVE_R10      (REGISTER_SAVE_R9 + 8)
> +#define REGISTER_SAVE_R11      (REGISTER_SAVE_R10 + 8)
>
>         .text
>
> @@ -67,80 +79,24 @@ _dl_tlsdesc_undefweak:
>         .size   _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
>
>  #ifdef SHARED
> -       .hidden _dl_tlsdesc_dynamic
> -       .global _dl_tlsdesc_dynamic
> -       .type   _dl_tlsdesc_dynamic,@function
> -
> -     /* %rax points to the TLS descriptor, such that 0(%rax) points to
> -       _dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
> -       tlsdesc_dynamic_arg object.  It must return in %rax the offset
> -       between the thread pointer and the object denoted by the
> -       argument, without clobbering any registers.
> -
> -       The assembly code that follows is a rendition of the following
> -       C code, hand-optimized a little bit.
> -
> -ptrdiff_t
> -_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
> -{
> -  struct tlsdesc_dynamic_arg *td = tdp->arg;
> -  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> -  if (__builtin_expect (td->gen_count <= dtv[0].counter
> -                       && (dtv[td->tlsinfo.ti_module].pointer.val
> -                           != TLS_DTV_UNALLOCATED),
> -                       1))
> -    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> -      - __thread_pointer;
> -
> -  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
> -}
> -*/
> -       cfi_startproc
> -       .align 16
> -_dl_tlsdesc_dynamic:
> -       _CET_ENDBR
> -       /* Preserve call-clobbered registers that we modify.
> -          We need two scratch regs anyway.  */
> -       movq    %rsi, -16(%rsp)
> -       mov     %fs:DTV_OFFSET, %RSI_LP
> -       movq    %rdi, -8(%rsp)
> -       movq    TLSDESC_ARG(%rax), %rdi
> -       movq    (%rsi), %rax
> -       cmpq    %rax, TLSDESC_GEN_COUNT(%rdi)
> -       ja      .Lslow
> -       movq    TLSDESC_MODID(%rdi), %rax
> -       salq    $4, %rax
> -       movq    (%rax,%rsi), %rax
> -       cmpq    $-1, %rax
> -       je      .Lslow
> -       addq    TLSDESC_MODOFF(%rdi), %rax
> -.Lret:
> -       movq    -16(%rsp), %rsi
> -       sub     %fs:0, %RAX_LP
> -       movq    -8(%rsp), %rdi
> -       ret
> -.Lslow:
> -       /* Besides rdi and rsi, saved above, save rdx, rcx, r8, r9,
> -          r10 and r11.  Also, align the stack, that's off by 8 bytes.  */
> -       subq    $72, %rsp
> -       cfi_adjust_cfa_offset (72)
> -       movq    %rdx, 8(%rsp)
> -       movq    %rcx, 16(%rsp)
> -       movq    %r8, 24(%rsp)
> -       movq    %r9, 32(%rsp)
> -       movq    %r10, 40(%rsp)
> -       movq    %r11, 48(%rsp)
> -       /* %rdi already points to the tlsinfo data structure.  */
> -       call    HIDDEN_JUMPTARGET (__tls_get_addr)
> -       movq    8(%rsp), %rdx
> -       movq    16(%rsp), %rcx
> -       movq    24(%rsp), %r8
> -       movq    32(%rsp), %r9
> -       movq    40(%rsp), %r10
> -       movq    48(%rsp), %r11
> -       addq    $72, %rsp
> -       cfi_adjust_cfa_offset (-72)
> -       jmp     .Lret
> -       cfi_endproc
> -       .size   _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> +# define USE_FXSAVE
> +# define STATE_SAVE_ALIGNMENT  16
> +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_fxsave
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_FXSAVE
> +
> +# define USE_XSAVE
> +# define STATE_SAVE_ALIGNMENT  64
> +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_xsave
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_XSAVE
> +
> +# define USE_XSAVEC
> +# define STATE_SAVE_ALIGNMENT  64
> +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_xsavec
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_XSAVEC
>  #endif /* SHARED */
> diff --git a/sysdeps/x86_64/dl-trampoline-save.h b/sysdeps/x86_64/dl-trampoline-save.h
> new file mode 100644
> index 0000000000..ebfbfe5c27
> --- /dev/null
> +++ b/sysdeps/x86_64/dl-trampoline-save.h
> @@ -0,0 +1,36 @@
> +/* x86-64 PLT trampoline register save macros.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef DL_STACK_ALIGNMENT
> +/* Due to GCC bug:
> +
> +   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> +
> +   __tls_get_addr may be called with 8-byte stack alignment.  Although
> +   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
> +   that stack will be always aligned at 16 bytes.  We use unaligned
> +   16-byte move to load and store SSE registers, which has no penalty
> +   on modern processors if stack is 16-byte aligned.  */
> +# define DL_STACK_ALIGNMENT 8
> +#endif
> +
> +/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
> +   stack to 16 bytes before calling _dl_fixup.  */
> +#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> +  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> +   || 16 > DL_STACK_ALIGNMENT)
> diff --git a/sysdeps/x86_64/dl-trampoline-state.h b/sysdeps/x86_64/dl-trampoline-state.h
> new file mode 100644
> index 0000000000..575f120797
> --- /dev/null
> +++ b/sysdeps/x86_64/dl-trampoline-state.h
> @@ -0,0 +1,51 @@
> +/* x86-64 PLT dl-trampoline state macros.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#if (STATE_SAVE_ALIGNMENT % 16) != 0
> +# error STATE_SAVE_ALIGNMENT must be multiple of 16
> +#endif
> +
> +#if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
> +# error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
> +#endif
> +
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +/* Local stack area before jumping to function address: RBX.  */
> +# define LOCAL_STORAGE_AREA    8
> +# define BASE                  rbx
> +# ifdef USE_FXSAVE
> +/* Use fxsave to save XMM registers.  */
> +#  define REGISTER_SAVE_AREA   (512 + STATE_SAVE_OFFSET)
> +#  if (REGISTER_SAVE_AREA % 16) != 0
> +#   error REGISTER_SAVE_AREA must be multiple of 16
> +#  endif
> +# endif
> +#else
> +# ifndef USE_FXSAVE
> +#  error USE_FXSAVE must be defined
> +# endif
> +/* Use fxsave to save XMM registers.  */
> +# define REGISTER_SAVE_AREA    (512 + STATE_SAVE_OFFSET + 8)
> +/* Local stack area before jumping to function address:  All saved
> +   registers.  */
> +# define LOCAL_STORAGE_AREA    REGISTER_SAVE_AREA
> +# define BASE                  rsp
> +# if (REGISTER_SAVE_AREA % 16) != 8
> +#  error REGISTER_SAVE_AREA must be odd multiple of 8
> +# endif
> +#endif
> diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> index b2e7e0f69b..87c5137837 100644
> --- a/sysdeps/x86_64/dl-trampoline.S
> +++ b/sysdeps/x86_64/dl-trampoline.S
> @@ -22,25 +22,7 @@
>  #include <features-offsets.h>
>  #include <link-defines.h>
>  #include <isa-level.h>
> -
> -#ifndef DL_STACK_ALIGNMENT
> -/* Due to GCC bug:
> -
> -   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> -
> -   __tls_get_addr may be called with 8-byte stack alignment.  Although
> -   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
> -   that stack will be always aligned at 16 bytes.  We use unaligned
> -   16-byte move to load and store SSE registers, which has no penalty
> -   on modern processors if stack is 16-byte aligned.  */
> -# define DL_STACK_ALIGNMENT 8
> -#endif
> -
> -/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
> -   stack to 16 bytes before calling _dl_fixup.  */
> -#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> -  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> -   || 16 > DL_STACK_ALIGNMENT)
> +#include "dl-trampoline-save.h"
>
>  /* Area on stack to save and restore registers used for parameter
>     passing when calling _dl_fixup.  */
> diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
> index f55c6ea040..d9ccfb40d4 100644
> --- a/sysdeps/x86_64/dl-trampoline.h
> +++ b/sysdeps/x86_64/dl-trampoline.h
> @@ -27,39 +27,7 @@
>  # undef LOCAL_STORAGE_AREA
>  # undef BASE
>
> -# if (STATE_SAVE_ALIGNMENT % 16) != 0
> -#  error STATE_SAVE_ALIGNMENT must be multiple of 16
> -# endif
> -
> -# if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
> -#  error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
> -# endif
> -
> -# if DL_RUNTIME_RESOLVE_REALIGN_STACK
> -/* Local stack area before jumping to function address: RBX.  */
> -#  define LOCAL_STORAGE_AREA   8
> -#  define BASE                 rbx
> -#  ifdef USE_FXSAVE
> -/* Use fxsave to save XMM registers.  */
> -#   define REGISTER_SAVE_AREA  (512 + STATE_SAVE_OFFSET)
> -#   if (REGISTER_SAVE_AREA % 16) != 0
> -#    error REGISTER_SAVE_AREA must be multiple of 16
> -#   endif
> -#  endif
> -# else
> -#  ifndef USE_FXSAVE
> -#   error USE_FXSAVE must be defined
> -#  endif
> -/* Use fxsave to save XMM registers.  */
> -#  define REGISTER_SAVE_AREA   (512 + STATE_SAVE_OFFSET + 8)
> -/* Local stack area before jumping to function address:  All saved
> -   registers.  */
> -#  define LOCAL_STORAGE_AREA   REGISTER_SAVE_AREA
> -#  define BASE                 rsp
> -#  if (REGISTER_SAVE_AREA % 16) != 8
> -#   error REGISTER_SAVE_AREA must be odd multiple of 8
> -#  endif
> -# endif
> +# include "dl-trampoline-state.h"
>
>         .globl _dl_runtime_resolve
>         .hidden _dl_runtime_resolve
> --
> 2.43.0
>
H.J. Lu Feb. 12, 2024, 6:55 p.m. UTC | #2
On Mon, Feb 12, 2024 at 10:05 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Mon, Feb 12, 2024 at 1:27 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > Changes in v3:
> >
> > 1. Don't add GLRO(dl_x86_64_tlsdesc_dynamic) to libc.a.
> >
> > Changes in v2:
> >
> > 1.  Add GLRO(dl_x86_64_runtime_resolve) to optimize
> > elf_machine_runtime_setup.
> >
> > ---
> > Compiler generates the following instruction sequence for GNU2 dynamic
> > TLS access:
> >
> >         leaq    tls_var@TLSDESC(%rip), %rax
> >         call    *tls_var@TLSCALL(%rax)
> >
> > CALL instruction may be transparent to compiler which assumes all
> > registers, except for RAX, are unchanged after CALL.  At run-time,
> > _dl_tlsdesc_dynamic is called, which calls __tls_get_addr on the
> > slow path.  __tls_get_addr is a normal function which doesn't
> > preserve any caller-saved registers.  _dl_tlsdesc_dynamic saves and
> > restores integer caller-saved registers, but doesn't preserve any
> > vector registers which are caller-saved.  Add _dl_tlsdesc_dynamic
> > IFUNC functions for FXSAVE, XSAVE and XSAVEC to save and restore
> > all vector registers.  This fixes BZ #31372.
> >
> > Add GLRO(dl_x86_64_runtime_resolve) with GLRO(dl_x86_64_tlsdesc_dynamic)
> > to optimize elf_machine_runtime_setup.
> > ---
> >  elf/Makefile                         |  13 ++
> >  elf/tst-gnu2-tls2.c                  |  89 ++++++++++++++
> >  elf/tst-gnu2-tls2.h                  |  26 ++++
> >  elf/tst-gnu2-tls2mod0.c              |  28 +++++
> >  elf/tst-gnu2-tls2mod1.c              |  28 +++++
> >  elf/tst-gnu2-tls2mod2.c              |  28 +++++
> >  sysdeps/x86/cpu-features.c           |  37 ++++++
> >  sysdeps/x86_64/dl-machine.h          |  19 +--
> >  sysdeps/x86_64/dl-procinfo.c         |  32 +++++
> >  sysdeps/x86_64/dl-tlsdesc-dynamic.h  | 170 +++++++++++++++++++++++++++
> >  sysdeps/x86_64/dl-tlsdesc.S          | 108 +++++------------
> >  sysdeps/x86_64/dl-trampoline-save.h  |  36 ++++++
> >  sysdeps/x86_64/dl-trampoline-state.h |  51 ++++++++
> >  sysdeps/x86_64/dl-trampoline.S       |  20 +---
> >  sysdeps/x86_64/dl-trampoline.h       |  34 +-----
> >  15 files changed, 576 insertions(+), 143 deletions(-)
> >  create mode 100644 elf/tst-gnu2-tls2.c
> >  create mode 100644 elf/tst-gnu2-tls2.h
> >  create mode 100644 elf/tst-gnu2-tls2mod0.c
> >  create mode 100644 elf/tst-gnu2-tls2mod1.c
> >  create mode 100644 elf/tst-gnu2-tls2mod2.c
> >  create mode 100644 sysdeps/x86_64/dl-tlsdesc-dynamic.h
> >  create mode 100644 sysdeps/x86_64/dl-trampoline-save.h
> >  create mode 100644 sysdeps/x86_64/dl-trampoline-state.h
> >
> > diff --git a/elf/Makefile b/elf/Makefile
> > index 5d78b659ce..ff15ec16dd 100644
> > --- a/elf/Makefile
> > +++ b/elf/Makefile
> > @@ -424,6 +424,7 @@ tests += \
> >    tst-glibc-hwcaps-prepend \
> >    tst-global1 \
> >    tst-global2 \
> > +  tst-gnu2-tls2 \
> >    tst-initfinilazyfail \
> >    tst-initorder \
> >    tst-initorder2 \
> > @@ -846,6 +847,9 @@ modules-names += \
> >    tst-filterobj-flt \
> >    tst-finilazyfailmod \
> >    tst-globalmod2 \
> > +  tst-gnu2-tls2mod0 \
> > +  tst-gnu2-tls2mod1 \
> > +  tst-gnu2-tls2mod2 \
> >    tst-initlazyfailmod \
> >    tst-initorder2a \
> >    tst-initorder2b \
> > @@ -3044,8 +3048,17 @@ $(objpfx)tst-tlsgap.out: \
> >    $(objpfx)tst-tlsgap-mod0.so \
> >    $(objpfx)tst-tlsgap-mod1.so \
> >    $(objpfx)tst-tlsgap-mod2.so
> > +
> > +$(objpfx)tst-gnu2-tls2: $(shared-thread-library)
> > +$(objpfx)tst-gnu2-tls2.out: \
> > +  $(objpfx)tst-gnu2-tls2mod0.so \
> > +  $(objpfx)tst-gnu2-tls2mod1.so \
> > +  $(objpfx)tst-gnu2-tls2mod2.so
> >  ifeq (yes,$(have-mtls-dialect-gnu2))
> >  CFLAGS-tst-tlsgap-mod0.c += -mtls-dialect=gnu2
> >  CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2
> >  CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2
> > +CFLAGS-tst-gnu2-tls2mod0.c += -mtls-dialect=gnu2
> > +CFLAGS-tst-gnu2-tls2mod1.c += -mtls-dialect=gnu2
> > +CFLAGS-tst-gnu2-tls2mod2.c += -mtls-dialect=gnu2
> >  endif
> > diff --git a/elf/tst-gnu2-tls2.c b/elf/tst-gnu2-tls2.c
> > new file mode 100644
> > index 0000000000..36547efb6d
> > --- /dev/null
> > +++ b/elf/tst-gnu2-tls2.c
> > @@ -0,0 +1,89 @@
> > +/* Test TLSDESC relocation.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <http://www.gnu.org/licenses/>.  */
> > +
> > +#include <stdio.h>
> > +#include <string.h>
> > +#include <dlfcn.h>
> > +#include <pthread.h>
> > +#include <support/xdlfcn.h>
> > +#include <support/xthread.h>
> > +#include <support/check.h>
> > +#include "tst-gnu2-tls2.h"
> > +
> > +static void *mod[3];
> > +#define MOD(i) "tst-gnu2-tls2mod" #i ".so"
> > +static const char *modname[3] = { MOD(0), MOD(1), MOD(2) };
> > +#undef MOD
> > +
> > +static void
> > +open_mod (int i)
> > +{
> > +  mod[i] = xdlopen (modname[i], RTLD_LAZY);
> > +  printf ("open %s\n", modname[i]);
> > +}
> > +
> > +static void
> > +close_mod (int i)
> > +{
> > +  xdlclose (mod[i]);
> > +  mod[i] = NULL;
> > +  printf ("close %s\n", modname[i]);
> > +}
> > +
> > +static void
> > +access_mod (int i, const char *sym)
> > +{
> > +  struct tls var = { -1, -1, -1, -1 };
> > +  struct tls *(*f) (struct tls *) = xdlsym (mod[i], sym);
> > +  struct tls *p = f (&var);
> > +  printf ("access %s: %s() = %p\n", modname[i], sym, p);
> > +  TEST_VERIFY_EXIT (memcmp (p, &var, sizeof (var)) == 0);
> > +  ++(p->a);
> > +}
> > +
> > +static void *
> > +start (void *arg)
> > +{
> > +  /* The DTV generation is at the last dlopen of mod0 and the
> > +     entry for mod1 is NULL.  */
> > +
> > +  open_mod (1); /* Reuse modid of mod1. Uses dynamic TLS.  */
> > +
> > +  /* Force the slow path in GNU2 TLS descriptor call.  */
> > +  access_mod (1, "apply_tls");
> > +
> > +  return arg;
> > +}
> > +
> > +static int
> > +do_test (void)
> > +{
> > +  open_mod (0);
> > +  open_mod (1);
> > +  open_mod (2);
> > +  close_mod (0);
> > +  close_mod (1); /* Create modid gap at mod1.  */
> > +  open_mod (0); /* Reuse modid of mod0, bump generation count.  */
> > +
> > +  /* Create a thread where DTV of mod1 is NULL.  */
> > +  pthread_t t = xpthread_create (NULL, start, NULL);
> > +  xpthread_join (t);
> > +  return 0;
> > +}
> > +
> > +#include <support/test-driver.c>
> > diff --git a/elf/tst-gnu2-tls2.h b/elf/tst-gnu2-tls2.h
> > new file mode 100644
> > index 0000000000..e33f4dbe27
> > --- /dev/null
> > +++ b/elf/tst-gnu2-tls2.h
> > @@ -0,0 +1,26 @@
> > +/* Test TLSDESC relocation.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <stdint.h>
> > +
> > +struct tls
> > +{
> > +  int64_t a, b, c, d;
> > +};
> > +
> > +extern struct tls *apply_tls (struct tls *);
> > diff --git a/elf/tst-gnu2-tls2mod0.c b/elf/tst-gnu2-tls2mod0.c
> > new file mode 100644
> > index 0000000000..67dc0d464d
> > --- /dev/null
> > +++ b/elf/tst-gnu2-tls2mod0.c
> > @@ -0,0 +1,28 @@
> > +/* DSO used by tst-gnu2-tls2.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include "tst-gnu2-tls2.h"
> > +
> > +__thread struct tls tls_var0 __attribute__ ((visibility ("hidden")));
> > +
> > +struct tls *
> > +apply_tls (struct tls *p)
> > +{
> > +  tls_var0 = *p;
> > +  return &tls_var0;
> > +}
> > diff --git a/elf/tst-gnu2-tls2mod1.c b/elf/tst-gnu2-tls2mod1.c
> > new file mode 100644
> > index 0000000000..a4ae6db24f
> > --- /dev/null
> > +++ b/elf/tst-gnu2-tls2mod1.c
> > @@ -0,0 +1,28 @@
> > +/* DSO used by tst-gnu2-tls2.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include "tst-gnu2-tls2.h"
> > +
> > +__thread struct tls tls_var1[100] __attribute__ ((visibility ("hidden")));
> > +
> > +struct tls *
> > +apply_tls (struct tls *p)
> > +{
> > +  tls_var1[1] = *p;
> > +  return &tls_var1[1];
> > +}
> > diff --git a/elf/tst-gnu2-tls2mod2.c b/elf/tst-gnu2-tls2mod2.c
> > new file mode 100644
> > index 0000000000..2d13921717
> > --- /dev/null
> > +++ b/elf/tst-gnu2-tls2mod2.c
> > @@ -0,0 +1,28 @@
> > +/* DSO used by tst-gnu2-tls2.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include "tst-gnu2-tls2.h"
> > +
> > +__thread struct tls tls_var2 __attribute__ ((visibility ("hidden")));
> > +
> > +struct tls *
> > +apply_tls (struct tls *p)
> > +{
> > +  tls_var2 = *p;
> > +  return &tls_var2;
> > +}
> > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> > index 25e6622a79..aafdaee3e3 100644
> > --- a/sysdeps/x86/cpu-features.c
> > +++ b/sysdeps/x86/cpu-features.c
> > @@ -47,6 +47,16 @@ TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp)
> >                  : plt_rewrite_jmp);
> >      }
> >  }
> > +
> > +extern void _dl_tlsdesc_dynamic_fxsave (void) attribute_hidden;
> > +extern void _dl_tlsdesc_dynamic_xsave (void) attribute_hidden;
> > +extern void _dl_tlsdesc_dynamic_xsavec (void) attribute_hidden;
> > +#endif
> > +
> > +#ifdef __x86_64__
> > +extern void _dl_runtime_resolve_fxsave (void) attribute_hidden;
> > +extern void _dl_runtime_resolve_xsave (void) attribute_hidden;
> > +extern void _dl_runtime_resolve_xsavec (void) attribute_hidden;
> >  #endif
> >
> >  #ifdef __LP64__
> > @@ -1130,6 +1140,33 @@ no_cpuid:
> >                TUNABLE_CALLBACK (set_x86_shstk));
> >  #endif
> >
> > +# ifdef __x86_64__
> > +  if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > +    {
> > +      if (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC))
> > +       {
> > +         GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsavec;
> > +# ifdef SHARED
> > +         GLRO(dl_x86_64_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsavec;
> > +# endif
> > +       }
> > +      else
> > +       {
> > +         GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsave;
> > +# ifdef SHARED
> > +         GLRO(dl_x86_64_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsave;
> > +# endif
> > +       }
> > +    }
> > +  else
> > +    {
> > +      GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_fxsave;
> > +# ifdef SHARED
> > +      GLRO(dl_x86_64_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave;
> > +# endif
> > +    }
> > +#endif
> > +
> >  #ifdef SHARED
> >  # ifdef __x86_64__
> >    TUNABLE_GET (plt_rewrite, tunable_val_t *,
> > diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
> > index 6d605d0d32..74b977fd3c 100644
> > --- a/sysdeps/x86_64/dl-machine.h
> > +++ b/sysdeps/x86_64/dl-machine.h
> > @@ -71,9 +71,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> >                            int lazy, int profile)
> >  {
> >    Elf64_Addr *got;
> > -  extern void _dl_runtime_resolve_fxsave (ElfW(Word)) attribute_hidden;
> > -  extern void _dl_runtime_resolve_xsave (ElfW(Word)) attribute_hidden;
> > -  extern void _dl_runtime_resolve_xsavec (ElfW(Word)) attribute_hidden;
> >    extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
> >    extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
> >    extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
> > @@ -96,8 +93,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> >        /* Identify this shared object.  */
> >        *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
> >
> > -      const struct cpu_features* cpu_features = __get_cpu_features ();
> > -
> >  #ifdef SHARED
> >        /* The got[2] entry contains the address of a function which gets
> >          called to get the address of a so far unresolved function and
> > @@ -107,6 +102,7 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> >          end in this function.  */
> >        if (__glibc_unlikely (profile))
> >         {
> > +         const struct cpu_features* cpu_features = __get_cpu_features ();
> >           if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
> >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
> >           else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
> > @@ -126,15 +122,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> >           /* This function will get called to fix up the GOT entry
> >              indicated by the offset on the stack, and then jump to
> >              the resolved address.  */
> > -         if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
> > -             || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > -           *(ElfW(Addr) *) (got + 2)
> > -             = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
> > -                ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
> > -                : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
> > -         else
> > -           *(ElfW(Addr) *) (got + 2)
> > -             = (ElfW(Addr)) &_dl_runtime_resolve_fxsave;
> > +         *(ElfW(Addr) *) (got + 2)
> > +           = (ElfW(Addr)) GLRO(dl_x86_64_runtime_resolve);
> >         }
> >      }
> >
> > @@ -383,7 +372,7 @@ and creates an unsatisfiable circular dependency.\n",
> >                   {
> >                     td->arg = _dl_make_tlsdesc_dynamic
> >                       (sym_map, sym->st_value + reloc->r_addend);
> > -                   td->entry = _dl_tlsdesc_dynamic;
> > +                   td->entry = GLRO(dl_x86_64_tlsdesc_dynamic);
> >                   }
> >                 else
> >  #  endif
> > diff --git a/sysdeps/x86_64/dl-procinfo.c b/sysdeps/x86_64/dl-procinfo.c
> > index 4d1d790fbb..8f317a502c 100644
> > --- a/sysdeps/x86_64/dl-procinfo.c
> > +++ b/sysdeps/x86_64/dl-procinfo.c
> > @@ -41,5 +41,37 @@
> >
> >  #include <sysdeps/x86/dl-procinfo.c>
> >
> > +#if defined SHARED && !IS_IN (ldconfig)
> > +# if !defined PROCINFO_DECL
> > +  ._dl_x86_64_tlsdesc_dynamic
> > +# else
> > +PROCINFO_CLASS void * _dl_x86_64_tlsdesc_dynamic
> > +# endif
> > +# ifndef PROCINFO_DECL
> > += NULL
> > +# endif
> > +# ifdef PROCINFO_DECL
> > +;
> > +# else
> > +,
> > +# endif
> > +#endif
> > +
> > +#if !IS_IN (ldconfig)
> > +# if !defined PROCINFO_DECL && defined SHARED
> > +  ._dl_x86_64_runtime_resolve
> > +# else
> > +PROCINFO_CLASS void * _dl_x86_64_runtime_resolve
> > +# endif
> > +# ifndef PROCINFO_DECL
> > += NULL
> > +# endif
> > +# if !defined SHARED || defined PROCINFO_DECL
> > +;
> > +# else
> > +,
> > +# endif
> > +#endif
> > +
> >  #undef PROCINFO_DECL
> >  #undef PROCINFO_CLASS
> > diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
> > new file mode 100644
> > index 0000000000..b708d0d9e4
> > --- /dev/null
> > +++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
> > @@ -0,0 +1,170 @@
> > +/* Thread-local storage handling in the ELF dynamic linker.  x86_64 version.
> > +   Copyright (C) 2004-2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef SECTION
> > +# define SECTION(p)    p
> > +#endif
> > +
> > +#undef REGISTER_SAVE_AREA
> > +#undef LOCAL_STORAGE_AREA
> > +#undef BASE
> > +
> > +#include "dl-trampoline-state.h"
> > +
> > +       .section SECTION(.text),"ax",@progbits
> > +
> > +       .hidden _dl_tlsdesc_dynamic
> > +       .global _dl_tlsdesc_dynamic
> > +       .type   _dl_tlsdesc_dynamic,@function
> > +
> > +     /* %rax points to the TLS descriptor, such that 0(%rax) points to
> > +       _dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
> > +       tlsdesc_dynamic_arg object.  It must return in %rax the offset
> > +       between the thread pointer and the object denoted by the
> > +       argument, without clobbering any registers.
> > +
> > +       The assembly code that follows is a rendition of the following
> > +       C code, hand-optimized a little bit.
> > +
> > +ptrdiff_t
> > +_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
> > +{
> > +  struct tlsdesc_dynamic_arg *td = tdp->arg;
> > +  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> > +  if (__builtin_expect (td->gen_count <= dtv[0].counter
> > +                       && (dtv[td->tlsinfo.ti_module].pointer.val
> > +                           != TLS_DTV_UNALLOCATED),
> > +                       1))
> > +    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> > +      - __thread_pointer;
> > +
> > +  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
> > +}
> > +*/
> > +       cfi_startproc
> > +       .align 16
> > +_dl_tlsdesc_dynamic:
> > +       _CET_ENDBR
> > +       /* Preserve call-clobbered registers that we modify.
> > +          We need two scratch regs anyway.  */
> > +       movq    %rsi, -16(%rsp)
> > +       mov     %fs:DTV_OFFSET, %RSI_LP
> > +       movq    %rdi, -8(%rsp)
> > +       movq    TLSDESC_ARG(%rax), %rdi
> > +       movq    (%rsi), %rax
> > +       cmpq    %rax, TLSDESC_GEN_COUNT(%rdi)
> > +       ja      2f
> > +       movq    TLSDESC_MODID(%rdi), %rax
> > +       salq    $4, %rax
> > +       movq    (%rax,%rsi), %rax
> > +       cmpq    $-1, %rax
> > +       je      2f
> > +       addq    TLSDESC_MODOFF(%rdi), %rax
> > +1:
> > +       movq    -16(%rsp), %rsi
> > +       sub     %fs:0, %RAX_LP
> > +       movq    -8(%rsp), %rdi
> > +       ret
> > +2:
> > +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > +       movq    %rbx, -24(%rsp)
> > +       mov     %RSP_LP, %RBX_LP
> > +       cfi_def_cfa_register(%rbx)
> > +       and     $-STATE_SAVE_ALIGNMENT, %RSP_LP
> > +#endif
> > +#ifdef REGISTER_SAVE_AREA
> > +# if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > +       # STATE_SAVE_OFFSET has space for 8 integer registers.  But we
> > +       # need space for RCX, RDX, RSI, RDI, R8, R9, R10 and R11, plus
> > +       # RBX above.
> > +       sub     $(REGISTER_SAVE_AREA + STATE_SAVE_ALIGNMENT), %RSP_LP
> > +# else
> > +       sub     $REGISTER_SAVE_AREA, %RSP_LP
> > +       cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
> > +# endif
> > +#else
> > +       # Allocate stack space of the required size to save the state.
> > +# if IS_IN (rtld)
> > +       sub     _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
> > +# else
> > +       sub      _dl_x86_cpu_features+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
> > +# endif
> > +#endif
> > +       /* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9,
> > +          r10 and r11.  */
> > +       movq    %rcx, REGISTER_SAVE_RCX(%rsp)
> > +       movq    %rdx, REGISTER_SAVE_RDX(%rsp)
> > +       movq    %r8, REGISTER_SAVE_R8(%rsp)
> > +       movq    %r9, REGISTER_SAVE_R9(%rsp)
> > +       movq    %r10, REGISTER_SAVE_R10(%rsp)
> > +       movq    %r11, REGISTER_SAVE_R11(%rsp)
> > +#ifdef USE_FXSAVE
> > +       fxsave  STATE_SAVE_OFFSET(%rsp)
> > +#else
> > +       movl    $STATE_SAVE_MASK, %eax
> > +       xorl    %edx, %edx
> Do we not need to save/restore components [32:62]?
> > +       # Clear the XSAVE Header.
> > +# ifdef USE_XSAVE
> > +       movq    %rdx, (STATE_SAVE_OFFSET + 512)(%rsp)
> > +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8)(%rsp)
> > +# endif
> > +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 2)(%rsp)
> > +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 3)(%rsp)
> > +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 4)(%rsp)
> > +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 5)(%rsp)
> > +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 6)(%rsp)
> > +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 7)(%rsp)
> What is the purpose of this memset? Isn't this memory going to
> be immediately overwritten by the xsave?

From Intel SDM vol 1,

13.4.2 XSAVE Header
The XSAVE header of an XSAVE area comprises the 64 bytes starting at
offset 512 from the area’s base address:
• Bytes 7:0 of the XSAVE header is a state-component bitmap (see
  Section 13.1) called XSTATE_BV. It identifies the state components
  in the XSAVE area.
• Bytes 15:8 of the XSAVE header is a state-component bitmap called
  XCOMP_BV. It is used as follows:
  — XCOMP_BV[63] indicates the format of the extended region of the
    XSAVE area (see Section 13.4.3). If it is clear, the standard
    format is used. If it is set, the compacted format is used;
    XCOMP_BV[62:0] provide format specifics as specified in
    Section 13.4.3.
  — XCOMP_BV[63] determines which form of the XRSTOR instruction is
    used. If the bit is set, the compacted form is used; otherwise,
    the standard form is used. See Section 13.8.
  — All bits in XCOMP_BV should be 0 if the processor does not support
    the compaction extensions to the XSAVE feature set.
• Bytes 63:16 of the XSAVE header are reserved.
Section 13.7 through Section 13.9 provide details of how instructions
in the XSAVE feature set use the XSAVE header of an XSAVE area.

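XSAVE and XSAVEC do not write the reserved bytes of the XSAVE header,
so that memory is not overwritten by the save itself.  We need to clear
the reserved field in the XSAVE header before issuing XSAVE.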

> > +# ifdef USE_XSAVE
> > +       xsave   STATE_SAVE_OFFSET(%rsp)
> > +# else
> > +       xsavec  STATE_SAVE_OFFSET(%rsp)
> > +# endif
> > +#endif
> > +       /* %rdi already points to the tlsinfo data structure.  */
> > +       call    HIDDEN_JUMPTARGET (__tls_get_addr)
> > +       # Get register content back.
> > +#ifdef USE_FXSAVE
> > +       fxrstor STATE_SAVE_OFFSET(%rsp)
> > +#else
> > +       /* Save and restore __tls_get_addr return value stored in RAX.  */
> > +       mov     %RAX_LP, %RCX_LP
> > +       movl    $STATE_SAVE_MASK, %eax
> > +       xorl    %edx, %edx
> > +       xrstor  STATE_SAVE_OFFSET(%rsp)
> > +       mov     %RCX_LP, %RAX_LP
> > +#endif
> > +       movq    REGISTER_SAVE_R11(%rsp), %r11
> > +       movq    REGISTER_SAVE_R10(%rsp), %r10
> > +       movq    REGISTER_SAVE_R9(%rsp), %r9
> > +       movq    REGISTER_SAVE_R8(%rsp), %r8
> > +       movq    REGISTER_SAVE_RDX(%rsp), %rdx
> > +       movq    REGISTER_SAVE_RCX(%rsp), %rcx
> > +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > +       mov     %RBX_LP, %RSP_LP
> > +       cfi_def_cfa_register(%rsp)
> > +       movq    -24(%rsp), %rbx
> > +       cfi_restore(%rbx)
> > +#else
> > +       add     $REGISTER_SAVE_AREA, %RSP_LP
> > +       cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA)
> > +#endif
> > +       jmp     1b
> > +       cfi_endproc
> > +       .size   _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> > +
> > +#undef STATE_SAVE_ALIGNMENT
> > diff --git a/sysdeps/x86_64/dl-tlsdesc.S b/sysdeps/x86_64/dl-tlsdesc.S
> > index f748af2ece..ea69f5223a 100644
> > --- a/sysdeps/x86_64/dl-tlsdesc.S
> > +++ b/sysdeps/x86_64/dl-tlsdesc.S
> > @@ -18,7 +18,19 @@
> >
> >  #include <sysdep.h>
> >  #include <tls.h>
> > +#include <cpu-features-offsets.h>
> > +#include <features-offsets.h>
> >  #include "tlsdesc.h"
> > +#include "dl-trampoline-save.h"
> > +
> > +/* Area on stack to save and restore registers used for parameter
> > +   passing when calling _dl_tlsdesc_dynamic.  */
> > +#define REGISTER_SAVE_RCX      0
> > +#define REGISTER_SAVE_RDX      (REGISTER_SAVE_RCX + 8)
> > +#define REGISTER_SAVE_R8       (REGISTER_SAVE_RDX + 8)
> > +#define REGISTER_SAVE_R9       (REGISTER_SAVE_R8 + 8)
> > +#define REGISTER_SAVE_R10      (REGISTER_SAVE_R9 + 8)
> > +#define REGISTER_SAVE_R11      (REGISTER_SAVE_R10 + 8)
> >
> >         .text
> >
> > @@ -67,80 +79,24 @@ _dl_tlsdesc_undefweak:
> >         .size   _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
> >
> >  #ifdef SHARED
> > -       .hidden _dl_tlsdesc_dynamic
> > -       .global _dl_tlsdesc_dynamic
> > -       .type   _dl_tlsdesc_dynamic,@function
> > -
> > -     /* %rax points to the TLS descriptor, such that 0(%rax) points to
> > -       _dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
> > -       tlsdesc_dynamic_arg object.  It must return in %rax the offset
> > -       between the thread pointer and the object denoted by the
> > -       argument, without clobbering any registers.
> > -
> > -       The assembly code that follows is a rendition of the following
> > -       C code, hand-optimized a little bit.
> > -
> > -ptrdiff_t
> > -_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
> > -{
> > -  struct tlsdesc_dynamic_arg *td = tdp->arg;
> > -  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> > -  if (__builtin_expect (td->gen_count <= dtv[0].counter
> > -                       && (dtv[td->tlsinfo.ti_module].pointer.val
> > -                           != TLS_DTV_UNALLOCATED),
> > -                       1))
> > -    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> > -      - __thread_pointer;
> > -
> > -  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
> > -}
> > -*/
> > -       cfi_startproc
> > -       .align 16
> > -_dl_tlsdesc_dynamic:
> > -       _CET_ENDBR
> > -       /* Preserve call-clobbered registers that we modify.
> > -          We need two scratch regs anyway.  */
> > -       movq    %rsi, -16(%rsp)
> > -       mov     %fs:DTV_OFFSET, %RSI_LP
> > -       movq    %rdi, -8(%rsp)
> > -       movq    TLSDESC_ARG(%rax), %rdi
> > -       movq    (%rsi), %rax
> > -       cmpq    %rax, TLSDESC_GEN_COUNT(%rdi)
> > -       ja      .Lslow
> > -       movq    TLSDESC_MODID(%rdi), %rax
> > -       salq    $4, %rax
> > -       movq    (%rax,%rsi), %rax
> > -       cmpq    $-1, %rax
> > -       je      .Lslow
> > -       addq    TLSDESC_MODOFF(%rdi), %rax
> > -.Lret:
> > -       movq    -16(%rsp), %rsi
> > -       sub     %fs:0, %RAX_LP
> > -       movq    -8(%rsp), %rdi
> > -       ret
> > -.Lslow:
> > -       /* Besides rdi and rsi, saved above, save rdx, rcx, r8, r9,
> > -          r10 and r11.  Also, align the stack, that's off by 8 bytes.  */
> > -       subq    $72, %rsp
> > -       cfi_adjust_cfa_offset (72)
> > -       movq    %rdx, 8(%rsp)
> > -       movq    %rcx, 16(%rsp)
> > -       movq    %r8, 24(%rsp)
> > -       movq    %r9, 32(%rsp)
> > -       movq    %r10, 40(%rsp)
> > -       movq    %r11, 48(%rsp)
> > -       /* %rdi already points to the tlsinfo data structure.  */
> > -       call    HIDDEN_JUMPTARGET (__tls_get_addr)
> > -       movq    8(%rsp), %rdx
> > -       movq    16(%rsp), %rcx
> > -       movq    24(%rsp), %r8
> > -       movq    32(%rsp), %r9
> > -       movq    40(%rsp), %r10
> > -       movq    48(%rsp), %r11
> > -       addq    $72, %rsp
> > -       cfi_adjust_cfa_offset (-72)
> > -       jmp     .Lret
> > -       cfi_endproc
> > -       .size   _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> > +# define USE_FXSAVE
> > +# define STATE_SAVE_ALIGNMENT  16
> > +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_fxsave
> > +# include "dl-tlsdesc-dynamic.h"
> > +# undef _dl_tlsdesc_dynamic
> > +# undef USE_FXSAVE
> > +
> > +# define USE_XSAVE
> > +# define STATE_SAVE_ALIGNMENT  64
> > +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_xsave
> > +# include "dl-tlsdesc-dynamic.h"
> > +# undef _dl_tlsdesc_dynamic
> > +# undef USE_XSAVE
> > +
> > +# define USE_XSAVEC
> > +# define STATE_SAVE_ALIGNMENT  64
> > +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_xsavec
> > +# include "dl-tlsdesc-dynamic.h"
> > +# undef _dl_tlsdesc_dynamic
> > +# undef USE_XSAVEC
> >  #endif /* SHARED */
> > diff --git a/sysdeps/x86_64/dl-trampoline-save.h b/sysdeps/x86_64/dl-trampoline-save.h
> > new file mode 100644
> > index 0000000000..ebfbfe5c27
> > --- /dev/null
> > +++ b/sysdeps/x86_64/dl-trampoline-save.h
> > @@ -0,0 +1,36 @@
> > +/* x86-64 PLT trampoline register save macros.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef DL_STACK_ALIGNMENT
> > +/* Due to GCC bug:
> > +
> > +   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> > +
> > +   __tls_get_addr may be called with 8-byte stack alignment.  Although
> > +   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
> > +   that stack will be always aligned at 16 bytes.  We use unaligned
> > +   16-byte move to load and store SSE registers, which has no penalty
> > +   on modern processors if stack is 16-byte aligned.  */
> > +# define DL_STACK_ALIGNMENT 8
> > +#endif
> > +
> > +/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
> > +   stack to 16 bytes before calling _dl_fixup.  */
> > +#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> > +  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> > +   || 16 > DL_STACK_ALIGNMENT)
> > diff --git a/sysdeps/x86_64/dl-trampoline-state.h b/sysdeps/x86_64/dl-trampoline-state.h
> > new file mode 100644
> > index 0000000000..575f120797
> > --- /dev/null
> > +++ b/sysdeps/x86_64/dl-trampoline-state.h
> > @@ -0,0 +1,51 @@
> > +/* x86-64 PLT dl-trampoline state macros.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#if (STATE_SAVE_ALIGNMENT % 16) != 0
> > +# error STATE_SAVE_ALIGNMENT must be multiple of 16
> > +#endif
> > +
> > +#if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
> > +# error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
> > +#endif
> > +
> > +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > +/* Local stack area before jumping to function address: RBX.  */
> > +# define LOCAL_STORAGE_AREA    8
> > +# define BASE                  rbx
> > +# ifdef USE_FXSAVE
> > +/* Use fxsave to save XMM registers.  */
> > +#  define REGISTER_SAVE_AREA   (512 + STATE_SAVE_OFFSET)
> > +#  if (REGISTER_SAVE_AREA % 16) != 0
> > +#   error REGISTER_SAVE_AREA must be multiple of 16
> > +#  endif
> > +# endif
> > +#else
> > +# ifndef USE_FXSAVE
> > +#  error USE_FXSAVE must be defined
> > +# endif
> > +/* Use fxsave to save XMM registers.  */
> > +# define REGISTER_SAVE_AREA    (512 + STATE_SAVE_OFFSET + 8)
> > +/* Local stack area before jumping to function address:  All saved
> > +   registers.  */
> > +# define LOCAL_STORAGE_AREA    REGISTER_SAVE_AREA
> > +# define BASE                  rsp
> > +# if (REGISTER_SAVE_AREA % 16) != 8
> > +#  error REGISTER_SAVE_AREA must be odd multiple of 8
> > +# endif
> > +#endif
> > diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> > index b2e7e0f69b..87c5137837 100644
> > --- a/sysdeps/x86_64/dl-trampoline.S
> > +++ b/sysdeps/x86_64/dl-trampoline.S
> > @@ -22,25 +22,7 @@
> >  #include <features-offsets.h>
> >  #include <link-defines.h>
> >  #include <isa-level.h>
> > -
> > -#ifndef DL_STACK_ALIGNMENT
> > -/* Due to GCC bug:
> > -
> > -   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> > -
> > -   __tls_get_addr may be called with 8-byte stack alignment.  Although
> > -   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
> > -   that stack will be always aligned at 16 bytes.  We use unaligned
> > -   16-byte move to load and store SSE registers, which has no penalty
> > -   on modern processors if stack is 16-byte aligned.  */
> > -# define DL_STACK_ALIGNMENT 8
> > -#endif
> > -
> > -/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
> > -   stack to 16 bytes before calling _dl_fixup.  */
> > -#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> > -  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> > -   || 16 > DL_STACK_ALIGNMENT)
> > +#include "dl-trampoline-save.h"
> >
> >  /* Area on stack to save and restore registers used for parameter
> >     passing when calling _dl_fixup.  */
> > diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
> > index f55c6ea040..d9ccfb40d4 100644
> > --- a/sysdeps/x86_64/dl-trampoline.h
> > +++ b/sysdeps/x86_64/dl-trampoline.h
> > @@ -27,39 +27,7 @@
> >  # undef LOCAL_STORAGE_AREA
> >  # undef BASE
> >
> > -# if (STATE_SAVE_ALIGNMENT % 16) != 0
> > -#  error STATE_SAVE_ALIGNMENT must be multiple of 16
> > -# endif
> > -
> > -# if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
> > -#  error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
> > -# endif
> > -
> > -# if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > -/* Local stack area before jumping to function address: RBX.  */
> > -#  define LOCAL_STORAGE_AREA   8
> > -#  define BASE                 rbx
> > -#  ifdef USE_FXSAVE
> > -/* Use fxsave to save XMM registers.  */
> > -#   define REGISTER_SAVE_AREA  (512 + STATE_SAVE_OFFSET)
> > -#   if (REGISTER_SAVE_AREA % 16) != 0
> > -#    error REGISTER_SAVE_AREA must be multiple of 16
> > -#   endif
> > -#  endif
> > -# else
> > -#  ifndef USE_FXSAVE
> > -#   error USE_FXSAVE must be defined
> > -#  endif
> > -/* Use fxsave to save XMM registers.  */
> > -#  define REGISTER_SAVE_AREA   (512 + STATE_SAVE_OFFSET + 8)
> > -/* Local stack area before jumping to function address:  All saved
> > -   registers.  */
> > -#  define LOCAL_STORAGE_AREA   REGISTER_SAVE_AREA
> > -#  define BASE                 rsp
> > -#  if (REGISTER_SAVE_AREA % 16) != 8
> > -#   error REGISTER_SAVE_AREA must be odd multiple of 8
> > -#  endif
> > -# endif
> > +# include "dl-trampoline-state.h"
> >
> >         .globl _dl_runtime_resolve
> >         .hidden _dl_runtime_resolve
> > --
> > 2.43.0
> >
diff mbox series

Patch

diff --git a/elf/Makefile b/elf/Makefile
index 5d78b659ce..ff15ec16dd 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -424,6 +424,7 @@  tests += \
   tst-glibc-hwcaps-prepend \
   tst-global1 \
   tst-global2 \
+  tst-gnu2-tls2 \
   tst-initfinilazyfail \
   tst-initorder \
   tst-initorder2 \
@@ -846,6 +847,9 @@  modules-names += \
   tst-filterobj-flt \
   tst-finilazyfailmod \
   tst-globalmod2 \
+  tst-gnu2-tls2mod0 \
+  tst-gnu2-tls2mod1 \
+  tst-gnu2-tls2mod2 \
   tst-initlazyfailmod \
   tst-initorder2a \
   tst-initorder2b \
@@ -3044,8 +3048,17 @@  $(objpfx)tst-tlsgap.out: \
   $(objpfx)tst-tlsgap-mod0.so \
   $(objpfx)tst-tlsgap-mod1.so \
   $(objpfx)tst-tlsgap-mod2.so
+
+$(objpfx)tst-gnu2-tls2: $(shared-thread-library)
+$(objpfx)tst-gnu2-tls2.out: \
+  $(objpfx)tst-gnu2-tls2mod0.so \
+  $(objpfx)tst-gnu2-tls2mod1.so \
+  $(objpfx)tst-gnu2-tls2mod2.so
 ifeq (yes,$(have-mtls-dialect-gnu2))
 CFLAGS-tst-tlsgap-mod0.c += -mtls-dialect=gnu2
 CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2
 CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2
+CFLAGS-tst-gnu2-tls2mod0.c += -mtls-dialect=gnu2
+CFLAGS-tst-gnu2-tls2mod1.c += -mtls-dialect=gnu2
+CFLAGS-tst-gnu2-tls2mod2.c += -mtls-dialect=gnu2
 endif
diff --git a/elf/tst-gnu2-tls2.c b/elf/tst-gnu2-tls2.c
new file mode 100644
index 0000000000..36547efb6d
--- /dev/null
+++ b/elf/tst-gnu2-tls2.c
@@ -0,0 +1,89 @@ 
+/* Test TLSDESC relocation.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <stdio.h>
+#include <string.h>
+#include <dlfcn.h>
+#include <pthread.h>
+#include <support/xdlfcn.h>
+#include <support/xthread.h>
+#include <support/check.h>
+#include "tst-gnu2-tls2.h"
+
+static void *mod[3];
+#define MOD(i) "tst-gnu2-tls2mod" #i ".so"
+static const char *modname[3] = { MOD(0), MOD(1), MOD(2) };
+#undef MOD
+
+static void
+open_mod (int i)
+{
+  mod[i] = xdlopen (modname[i], RTLD_LAZY);
+  printf ("open %s\n", modname[i]);
+}
+
+static void
+close_mod (int i)
+{
+  xdlclose (mod[i]);
+  mod[i] = NULL;
+  printf ("close %s\n", modname[i]);
+}
+
+static void
+access_mod (int i, const char *sym)
+{
+  struct tls var = { -1, -1, -1, -1 };
+  struct tls *(*f) (struct tls *) = xdlsym (mod[i], sym);
+  struct tls *p = f (&var);
+  printf ("access %s: %s() = %p\n", modname[i], sym, p);
+  TEST_VERIFY_EXIT (memcmp (p, &var, sizeof (var)) == 0);
+  ++(p->a);
+}
+
+static void *
+start (void *arg)
+{
+  /* The DTV generation is at the last dlopen of mod0 and the
+     entry for mod1 is NULL.  */
+
+  open_mod (1); /* Reuse modid of mod1. Uses dynamic TLS.  */
+
+  /* Force the slow path in GNU2 TLS descriptor call.  */
+  access_mod (1, "apply_tls");
+
+  return arg;
+}
+
+static int
+do_test (void)
+{
+  open_mod (0);
+  open_mod (1);
+  open_mod (2);
+  close_mod (0);
+  close_mod (1); /* Create modid gap at mod1.  */
+  open_mod (0); /* Reuse modid of mod0, bump generation count.  */
+
+  /* Create a thread where DTV of mod1 is NULL.  */
+  pthread_t t = xpthread_create (NULL, start, NULL);
+  xpthread_join (t);
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/elf/tst-gnu2-tls2.h b/elf/tst-gnu2-tls2.h
new file mode 100644
index 0000000000..e33f4dbe27
--- /dev/null
+++ b/elf/tst-gnu2-tls2.h
@@ -0,0 +1,26 @@ 
+/* Test TLSDESC relocation.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <stdint.h>
+
+struct tls
+{
+  int64_t a, b, c, d;
+};
+
+extern struct tls *apply_tls (struct tls *);
diff --git a/elf/tst-gnu2-tls2mod0.c b/elf/tst-gnu2-tls2mod0.c
new file mode 100644
index 0000000000..67dc0d464d
--- /dev/null
+++ b/elf/tst-gnu2-tls2mod0.c
@@ -0,0 +1,28 @@ 
+/* DSO used by tst-gnu2-tls2.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "tst-gnu2-tls2.h"
+
+__thread struct tls tls_var0 __attribute__ ((visibility ("hidden")));
+
+struct tls *
+apply_tls (struct tls *p)
+{
+  tls_var0 = *p;
+  return &tls_var0;
+}
diff --git a/elf/tst-gnu2-tls2mod1.c b/elf/tst-gnu2-tls2mod1.c
new file mode 100644
index 0000000000..a4ae6db24f
--- /dev/null
+++ b/elf/tst-gnu2-tls2mod1.c
@@ -0,0 +1,28 @@ 
+/* DSO used by tst-gnu2-tls2.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "tst-gnu2-tls2.h"
+
+__thread struct tls tls_var1[100] __attribute__ ((visibility ("hidden")));
+
+struct tls *
+apply_tls (struct tls *p)
+{
+  tls_var1[1] = *p;
+  return &tls_var1[1];
+}
diff --git a/elf/tst-gnu2-tls2mod2.c b/elf/tst-gnu2-tls2mod2.c
new file mode 100644
index 0000000000..2d13921717
--- /dev/null
+++ b/elf/tst-gnu2-tls2mod2.c
@@ -0,0 +1,28 @@ 
+/* DSO used by tst-gnu2-tls2.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "tst-gnu2-tls2.h"
+
+__thread struct tls tls_var2 __attribute__ ((visibility ("hidden")));
+
+struct tls *
+apply_tls (struct tls *p)
+{
+  tls_var2 = *p;
+  return &tls_var2;
+}
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 25e6622a79..aafdaee3e3 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -47,6 +47,16 @@  TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp)
 		 : plt_rewrite_jmp);
     }
 }
+
+extern void _dl_tlsdesc_dynamic_fxsave (void) attribute_hidden;
+extern void _dl_tlsdesc_dynamic_xsave (void) attribute_hidden;
+extern void _dl_tlsdesc_dynamic_xsavec (void) attribute_hidden;
+#endif
+
+#ifdef __x86_64__
+extern void _dl_runtime_resolve_fxsave (void) attribute_hidden;
+extern void _dl_runtime_resolve_xsave (void) attribute_hidden;
+extern void _dl_runtime_resolve_xsavec (void) attribute_hidden;
 #endif
 
 #ifdef __LP64__
@@ -1130,6 +1140,41 @@  no_cpuid:
 	       TUNABLE_CALLBACK (set_x86_shstk));
 #endif
 
+#ifdef __x86_64__
+  /* Choose the _dl_runtime_resolve and _dl_tlsdesc_dynamic variants
+     once, so that relocation processing only loads a pointer.  Keep the
+     compile-time ISA level test which elf_machine_runtime_setup used
+     to make: when the baseline ISA level is AVX or higher the FXSAVE
+     branch is dead code and the compiler can drop its references.
+     NOTE(review): MINIMUM_X86_ISA_LEVEL comes from <isa-level.h>;
+     confirm that it is in scope in this file.  */
+  if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
+      || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC))
+	{
+	  GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsavec;
+# ifdef SHARED
+	  GLRO(dl_x86_64_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsavec;
+# endif
+	}
+      else
+	{
+	  GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsave;
+# ifdef SHARED
+	  GLRO(dl_x86_64_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsave;
+# endif
+	}
+    }
+  else
+    {
+      GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_fxsave;
+# ifdef SHARED
+      GLRO(dl_x86_64_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave;
+# endif
+    }
+#endif
+
 #ifdef SHARED
 # ifdef __x86_64__
   TUNABLE_GET (plt_rewrite, tunable_val_t *,
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index 6d605d0d32..74b977fd3c 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -71,9 +71,6 @@  elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
 			   int lazy, int profile)
 {
   Elf64_Addr *got;
-  extern void _dl_runtime_resolve_fxsave (ElfW(Word)) attribute_hidden;
-  extern void _dl_runtime_resolve_xsave (ElfW(Word)) attribute_hidden;
-  extern void _dl_runtime_resolve_xsavec (ElfW(Word)) attribute_hidden;
   extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
   extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
   extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
@@ -96,8 +93,6 @@  elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
       /* Identify this shared object.  */
       *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
 
-      const struct cpu_features* cpu_features = __get_cpu_features ();
-
 #ifdef SHARED
       /* The got[2] entry contains the address of a function which gets
 	 called to get the address of a so far unresolved function and
@@ -107,6 +102,7 @@  elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
 	 end in this function.  */
       if (__glibc_unlikely (profile))
 	{
+	  const struct cpu_features* cpu_features = __get_cpu_features ();
 	  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
 	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
 	  else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
@@ -126,15 +122,8 @@  elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
 	  /* This function will get called to fix up the GOT entry
 	     indicated by the offset on the stack, and then jump to
 	     the resolved address.  */
-	  if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
-	      || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
-	    *(ElfW(Addr) *) (got + 2)
-	      = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
-		 ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
-		 : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
-	  else
-	    *(ElfW(Addr) *) (got + 2)
-	      = (ElfW(Addr)) &_dl_runtime_resolve_fxsave;
+	  *(ElfW(Addr) *) (got + 2)
+	    = (ElfW(Addr)) GLRO(dl_x86_64_runtime_resolve);
 	}
     }
 
@@ -383,7 +372,7 @@  and creates an unsatisfiable circular dependency.\n",
 		  {
 		    td->arg = _dl_make_tlsdesc_dynamic
 		      (sym_map, sym->st_value + reloc->r_addend);
-		    td->entry = _dl_tlsdesc_dynamic;
+		    td->entry = GLRO(dl_x86_64_tlsdesc_dynamic);
 		  }
 		else
 #  endif
diff --git a/sysdeps/x86_64/dl-procinfo.c b/sysdeps/x86_64/dl-procinfo.c
index 4d1d790fbb..8f317a502c 100644
--- a/sysdeps/x86_64/dl-procinfo.c
+++ b/sysdeps/x86_64/dl-procinfo.c
@@ -41,5 +41,37 @@ 
 
 #include <sysdeps/x86/dl-procinfo.c>
 
+#if defined SHARED && !IS_IN (ldconfig)
+# if !defined PROCINFO_DECL
+  ._dl_x86_64_tlsdesc_dynamic
+# else
+PROCINFO_CLASS void * _dl_x86_64_tlsdesc_dynamic
+# endif
+# ifndef PROCINFO_DECL
+= NULL
+# endif
+# ifdef PROCINFO_DECL
+;
+# else
+,
+# endif
+#endif
+
+#if !IS_IN (ldconfig)
+# if !defined PROCINFO_DECL && defined SHARED
+  ._dl_x86_64_runtime_resolve
+# else
+PROCINFO_CLASS void * _dl_x86_64_runtime_resolve
+# endif
+# ifndef PROCINFO_DECL
+= NULL
+# endif
+# if !defined SHARED || defined PROCINFO_DECL
+;
+# else
+,
+# endif
+#endif
+
 #undef PROCINFO_DECL
 #undef PROCINFO_CLASS
diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
new file mode 100644
index 0000000000..b708d0d9e4
--- /dev/null
+++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
@@ -0,0 +1,170 @@ 
+/* Thread-local storage handling in the ELF dynamic linker.  x86_64 version.
+   Copyright (C) 2004-2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef SECTION
+# define SECTION(p)	p
+#endif
+
+#undef REGISTER_SAVE_AREA
+#undef LOCAL_STORAGE_AREA
+#undef BASE
+
+#include "dl-trampoline-state.h"
+
+	.section SECTION(.text),"ax",@progbits
+
+	.hidden _dl_tlsdesc_dynamic
+	.global	_dl_tlsdesc_dynamic
+	.type	_dl_tlsdesc_dynamic,@function
+
+     /* %rax points to the TLS descriptor, such that 0(%rax) points to
+	_dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
+	tlsdesc_dynamic_arg object.  It must return in %rax the offset
+	between the thread pointer and the object denoted by the
+	argument, without clobbering any registers.
+
+	The assembly code that follows is a rendition of the following
+	C code, hand-optimized a little bit.
+
+ptrdiff_t
+_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
+{
+  struct tlsdesc_dynamic_arg *td = tdp->arg;
+  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
+  if (__builtin_expect (td->gen_count <= dtv[0].counter
+			&& (dtv[td->tlsinfo.ti_module].pointer.val
+			    != TLS_DTV_UNALLOCATED),
+			1))
+    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
+      - __thread_pointer;
+
+  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
+}
+*/
+	cfi_startproc
+	.align 16
+_dl_tlsdesc_dynamic:
+	_CET_ENDBR
+	/* Preserve call-clobbered registers that we modify.
+	   We need two scratch regs anyway.  */
+	movq	%rsi, -16(%rsp)
+	mov	%fs:DTV_OFFSET, %RSI_LP
+	movq	%rdi, -8(%rsp)
+	movq	TLSDESC_ARG(%rax), %rdi
+	movq	(%rsi), %rax
+	cmpq	%rax, TLSDESC_GEN_COUNT(%rdi)
+	ja	2f
+	movq	TLSDESC_MODID(%rdi), %rax
+	salq	$4, %rax
+	movq	(%rax,%rsi), %rax
+	cmpq	$-1, %rax
+	je	2f
+	addq	TLSDESC_MODOFF(%rdi), %rax
+1:
+	movq	-16(%rsp), %rsi
+	sub	%fs:0, %RAX_LP
+	movq	-8(%rsp), %rdi
+	ret
+2:
+#if DL_RUNTIME_RESOLVE_REALIGN_STACK
+	movq	%rbx, -24(%rsp)
+	mov	%RSP_LP, %RBX_LP
+	cfi_def_cfa_register(%rbx)
+	and	$-STATE_SAVE_ALIGNMENT, %RSP_LP
+#endif
+#ifdef REGISTER_SAVE_AREA
+# if DL_RUNTIME_RESOLVE_REALIGN_STACK
+	# STATE_SAVE_OFFSET has space for 8 integer registers.  But we
+	# need space for RCX, RDX, RSI, RDI, R8, R9, R10 and R11, plus
+	# RBX above.
+	sub	$(REGISTER_SAVE_AREA + STATE_SAVE_ALIGNMENT), %RSP_LP
+# else
+	sub	$REGISTER_SAVE_AREA, %RSP_LP
+	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
+# endif
+#else
+	# Allocate stack space of the required size to save the state.
+# if IS_IN (rtld)
+	sub	_rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
+# else
+	sub	 _dl_x86_cpu_features+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
+# endif
+#endif
+	/* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9,
+	   r10 and r11.  */
+	movq	%rcx, REGISTER_SAVE_RCX(%rsp)
+	movq	%rdx, REGISTER_SAVE_RDX(%rsp)
+	movq	%r8, REGISTER_SAVE_R8(%rsp)
+	movq	%r9, REGISTER_SAVE_R9(%rsp)
+	movq	%r10, REGISTER_SAVE_R10(%rsp)
+	movq	%r11, REGISTER_SAVE_R11(%rsp)
+#ifdef USE_FXSAVE
+	fxsave	STATE_SAVE_OFFSET(%rsp)
+#else
+	movl	$STATE_SAVE_MASK, %eax
+	xorl	%edx, %edx
+	# Clear the XSAVE Header.
+# ifdef USE_XSAVE
+	movq	%rdx, (STATE_SAVE_OFFSET + 512)(%rsp)
+	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8)(%rsp)
+# endif
+	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 2)(%rsp)
+	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 3)(%rsp)
+	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 4)(%rsp)
+	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 5)(%rsp)
+	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 6)(%rsp)
+	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 7)(%rsp)
+# ifdef USE_XSAVE
+	xsave	STATE_SAVE_OFFSET(%rsp)
+# else
+	xsavec	STATE_SAVE_OFFSET(%rsp)
+# endif
+#endif
+	/* %rdi already points to the tlsinfo data structure.  */
+	call	HIDDEN_JUMPTARGET (__tls_get_addr)
+	# Get register content back.
+#ifdef USE_FXSAVE
+	fxrstor	STATE_SAVE_OFFSET(%rsp)
+#else
+	/* Save and restore __tls_get_addr return value stored in RAX.  */
+	mov	%RAX_LP, %RCX_LP
+	movl	$STATE_SAVE_MASK, %eax
+	xorl	%edx, %edx
+	xrstor	STATE_SAVE_OFFSET(%rsp)
+	mov	%RCX_LP, %RAX_LP
+#endif
+	movq	REGISTER_SAVE_R11(%rsp), %r11
+	movq	REGISTER_SAVE_R10(%rsp), %r10
+	movq	REGISTER_SAVE_R9(%rsp), %r9
+	movq	REGISTER_SAVE_R8(%rsp), %r8
+	movq	REGISTER_SAVE_RDX(%rsp), %rdx
+	movq	REGISTER_SAVE_RCX(%rsp), %rcx
+#if DL_RUNTIME_RESOLVE_REALIGN_STACK
+	mov	%RBX_LP, %RSP_LP
+	cfi_def_cfa_register(%rsp)
+	movq	-24(%rsp), %rbx
+	cfi_restore(%rbx)
+#else
+	add	$REGISTER_SAVE_AREA, %RSP_LP
+	cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA)
+#endif
+	jmp	1b
+	cfi_endproc
+	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
+
+#undef STATE_SAVE_ALIGNMENT
diff --git a/sysdeps/x86_64/dl-tlsdesc.S b/sysdeps/x86_64/dl-tlsdesc.S
index f748af2ece..ea69f5223a 100644
--- a/sysdeps/x86_64/dl-tlsdesc.S
+++ b/sysdeps/x86_64/dl-tlsdesc.S
@@ -18,7 +18,19 @@ 
 
 #include <sysdep.h>
 #include <tls.h>
+#include <cpu-features-offsets.h>
+#include <features-offsets.h>
 #include "tlsdesc.h"
+#include "dl-trampoline-save.h"
+
+/* Offsets of the stack slots where _dl_tlsdesc_dynamic saves RCX, RDX,
+   R8-R11 around its call to __tls_get_addr (each slot is 8 bytes).  */
+#define REGISTER_SAVE_RCX	0
+#define REGISTER_SAVE_RDX	(REGISTER_SAVE_RCX + 8)
+#define REGISTER_SAVE_R8	(REGISTER_SAVE_RDX + 8)
+#define REGISTER_SAVE_R9	(REGISTER_SAVE_R8 + 8)
+#define REGISTER_SAVE_R10	(REGISTER_SAVE_R9 + 8)
+#define REGISTER_SAVE_R11	(REGISTER_SAVE_R10 + 8)
 
 	.text
 
@@ -67,80 +79,24 @@  _dl_tlsdesc_undefweak:
 	.size	_dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
 
 #ifdef SHARED
-	.hidden _dl_tlsdesc_dynamic
-	.global	_dl_tlsdesc_dynamic
-	.type	_dl_tlsdesc_dynamic,@function
-
-     /* %rax points to the TLS descriptor, such that 0(%rax) points to
-	_dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
-	tlsdesc_dynamic_arg object.  It must return in %rax the offset
-	between the thread pointer and the object denoted by the
-	argument, without clobbering any registers.
-
-	The assembly code that follows is a rendition of the following
-	C code, hand-optimized a little bit.
-
-ptrdiff_t
-_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
-{
-  struct tlsdesc_dynamic_arg *td = tdp->arg;
-  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
-  if (__builtin_expect (td->gen_count <= dtv[0].counter
-			&& (dtv[td->tlsinfo.ti_module].pointer.val
-			    != TLS_DTV_UNALLOCATED),
-			1))
-    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
-      - __thread_pointer;
-
-  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
-}
-*/
-	cfi_startproc
-	.align 16
-_dl_tlsdesc_dynamic:
-	_CET_ENDBR
-	/* Preserve call-clobbered registers that we modify.
-	   We need two scratch regs anyway.  */
-	movq	%rsi, -16(%rsp)
-	mov	%fs:DTV_OFFSET, %RSI_LP
-	movq	%rdi, -8(%rsp)
-	movq	TLSDESC_ARG(%rax), %rdi
-	movq	(%rsi), %rax
-	cmpq	%rax, TLSDESC_GEN_COUNT(%rdi)
-	ja	.Lslow
-	movq	TLSDESC_MODID(%rdi), %rax
-	salq	$4, %rax
-	movq	(%rax,%rsi), %rax
-	cmpq	$-1, %rax
-	je	.Lslow
-	addq	TLSDESC_MODOFF(%rdi), %rax
-.Lret:
-	movq	-16(%rsp), %rsi
-	sub	%fs:0, %RAX_LP
-	movq	-8(%rsp), %rdi
-	ret
-.Lslow:
-	/* Besides rdi and rsi, saved above, save rdx, rcx, r8, r9,
-	   r10 and r11.  Also, align the stack, that's off by 8 bytes.	*/
-	subq	$72, %rsp
-	cfi_adjust_cfa_offset (72)
-	movq	%rdx, 8(%rsp)
-	movq	%rcx, 16(%rsp)
-	movq	%r8, 24(%rsp)
-	movq	%r9, 32(%rsp)
-	movq	%r10, 40(%rsp)
-	movq	%r11, 48(%rsp)
-	/* %rdi already points to the tlsinfo data structure.  */
-	call	HIDDEN_JUMPTARGET (__tls_get_addr)
-	movq	8(%rsp), %rdx
-	movq	16(%rsp), %rcx
-	movq	24(%rsp), %r8
-	movq	32(%rsp), %r9
-	movq	40(%rsp), %r10
-	movq	48(%rsp), %r11
-	addq	$72, %rsp
-	cfi_adjust_cfa_offset (-72)
-	jmp	.Lret
-	cfi_endproc
-	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
+# define USE_FXSAVE
+# define STATE_SAVE_ALIGNMENT	16	/* Undefined by the header.  */
+# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_fxsave
+# include "dl-tlsdesc-dynamic.h"
+# undef _dl_tlsdesc_dynamic
+# undef USE_FXSAVE
+
+# define USE_XSAVE
+# define STATE_SAVE_ALIGNMENT	64	/* Undefined by the header.  */
+# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_xsave
+# include "dl-tlsdesc-dynamic.h"
+# undef _dl_tlsdesc_dynamic
+# undef USE_XSAVE
+
+# define USE_XSAVEC
+# define STATE_SAVE_ALIGNMENT	64	/* Undefined by the header.  */
+# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_xsavec
+# include "dl-tlsdesc-dynamic.h"
+# undef _dl_tlsdesc_dynamic
+# undef USE_XSAVEC
 #endif /* SHARED */
diff --git a/sysdeps/x86_64/dl-trampoline-save.h b/sysdeps/x86_64/dl-trampoline-save.h
new file mode 100644
index 0000000000..ebfbfe5c27
--- /dev/null
+++ b/sysdeps/x86_64/dl-trampoline-save.h
@@ -0,0 +1,36 @@ 
+/* x86-64 PLT trampoline register save macros.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef DL_STACK_ALIGNMENT
+/* Due to GCC bug:
+
+   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
+
+   __tls_get_addr may be called with 8-byte stack alignment.  Although
+   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
+   that the stack will always be aligned to 16 bytes.  We use unaligned
+   16-byte moves to load and store SSE registers, which have no penalty
+   on modern processors if the stack is 16-byte aligned.  */
+# define DL_STACK_ALIGNMENT 8
+#endif
+
+/* True if _dl_runtime_resolve or _dl_tlsdesc_dynamic should realign the
+   stack for STATE_SAVE, or to 16 bytes, before _dl_fixup/__tls_get_addr.  */
+#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
+  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
+   || 16 > DL_STACK_ALIGNMENT)
diff --git a/sysdeps/x86_64/dl-trampoline-state.h b/sysdeps/x86_64/dl-trampoline-state.h
new file mode 100644
index 0000000000..575f120797
--- /dev/null
+++ b/sysdeps/x86_64/dl-trampoline-state.h
@@ -0,0 +1,51 @@ 
+/* x86-64 PLT dl-trampoline state macros.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if (STATE_SAVE_ALIGNMENT % 16) != 0
+# error STATE_SAVE_ALIGNMENT must be multiple of 16
+#endif
+
+#if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
+# error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
+#endif
+
+#if DL_RUNTIME_RESOLVE_REALIGN_STACK
+/* Local stack area before jumping to function address: RBX.  */
+# define LOCAL_STORAGE_AREA	8
+# define BASE			rbx
+# ifdef USE_FXSAVE
+/* Use fxsave to save XMM registers.  */
+#  define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET)
+#  if (REGISTER_SAVE_AREA % 16) != 0
+#   error REGISTER_SAVE_AREA must be multiple of 16
+#  endif
+# endif /* USE_FXSAVE */
+#else /* !DL_RUNTIME_RESOLVE_REALIGN_STACK */
+# ifndef USE_FXSAVE
+#  error USE_FXSAVE must be defined
+# endif
+/* Use fxsave to save XMM registers.  */
+# define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET + 8)
+/* Local stack area before jumping to function address:  All saved
+   registers.  */
+# define LOCAL_STORAGE_AREA	REGISTER_SAVE_AREA
+# define BASE			rsp
+# if (REGISTER_SAVE_AREA % 16) != 8
+#  error REGISTER_SAVE_AREA must be odd multiple of 8
+# endif
+#endif /* DL_RUNTIME_RESOLVE_REALIGN_STACK */
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index b2e7e0f69b..87c5137837 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -22,25 +22,7 @@ 
 #include <features-offsets.h>
 #include <link-defines.h>
 #include <isa-level.h>
-
-#ifndef DL_STACK_ALIGNMENT
-/* Due to GCC bug:
-
-   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
-
-   __tls_get_addr may be called with 8-byte stack alignment.  Although
-   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
-   that stack will be always aligned at 16 bytes.  We use unaligned
-   16-byte move to load and store SSE registers, which has no penalty
-   on modern processors if stack is 16-byte aligned.  */
-# define DL_STACK_ALIGNMENT 8
-#endif
-
-/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
-   stack to 16 bytes before calling _dl_fixup.  */
-#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
-  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
-   || 16 > DL_STACK_ALIGNMENT)
+#include "dl-trampoline-save.h"
 
 /* Area on stack to save and restore registers used for parameter
    passing when calling _dl_fixup.  */
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
index f55c6ea040..d9ccfb40d4 100644
--- a/sysdeps/x86_64/dl-trampoline.h
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -27,39 +27,7 @@ 
 # undef LOCAL_STORAGE_AREA
 # undef BASE
 
-# if (STATE_SAVE_ALIGNMENT % 16) != 0
-#  error STATE_SAVE_ALIGNMENT must be multiple of 16
-# endif
-
-# if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
-#  error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
-# endif
-
-# if DL_RUNTIME_RESOLVE_REALIGN_STACK
-/* Local stack area before jumping to function address: RBX.  */
-#  define LOCAL_STORAGE_AREA	8
-#  define BASE			rbx
-#  ifdef USE_FXSAVE
-/* Use fxsave to save XMM registers.  */
-#   define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET)
-#   if (REGISTER_SAVE_AREA % 16) != 0
-#    error REGISTER_SAVE_AREA must be multiple of 16
-#   endif
-#  endif
-# else
-#  ifndef USE_FXSAVE
-#   error USE_FXSAVE must be defined
-#  endif
-/* Use fxsave to save XMM registers.  */
-#  define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET + 8)
-/* Local stack area before jumping to function address:  All saved
-   registers.  */
-#  define LOCAL_STORAGE_AREA	REGISTER_SAVE_AREA
-#  define BASE			rsp
-#  if (REGISTER_SAVE_AREA % 16) != 8
-#   error REGISTER_SAVE_AREA must be odd multiple of 8
-#  endif
-# endif
+# include "dl-trampoline-state.h"
 
 	.globl _dl_runtime_resolve
 	.hidden _dl_runtime_resolve