diff mbox series

x86-64: Update _dl_tlsdesc_dynamic to preserve AMX registers

Message ID 20240228175010.240286-1-hjl.tools@gmail.com
State New
Headers show
Series x86-64: Update _dl_tlsdesc_dynamic to preserve AMX registers | expand

Commit Message

H.J. Lu Feb. 28, 2024, 5:50 p.m. UTC
_dl_tlsdesc_dynamic should also preserve AMX registers which are
caller-saved.  Add X86_XSTATE_TILECFG_ID and X86_XSTATE_TILEDATA_ID
to x86-64 TLSDESC_CALL_STATE_SAVE_MASK.  Compute the AMX state size
and save it in xsave_state_full_size which is only used by
_dl_tlsdesc_dynamic_xsave and _dl_tlsdesc_dynamic_xsavec.  This fixes
the AMX part of BZ #31372.  Tested on AMX processor.

AMX test is enabled only for compilers with the fix for

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114098

GCC 14 and GCC 11/12/13 branches have the bug fix.
---
 sysdeps/unix/sysv/linux/x86_64/Makefile       | 27 ++++++
 .../sysv/linux/x86_64/include/asm/prctl.h     |  5 ++
 .../linux/x86_64/tst-gnu2-tls2-amx-mod0.c     |  2 +
 .../linux/x86_64/tst-gnu2-tls2-amx-mod1.c     |  2 +
 .../linux/x86_64/tst-gnu2-tls2-amx-mod2.c     |  2 +
 .../sysv/linux/x86_64/tst-gnu2-tls2-amx.c     | 86 +++++++++++++++++++
 .../sysv/linux/x86_64/tst-gnu2-tls2-amx.h     | 63 ++++++++++++++
 sysdeps/x86/cpu-features-offsets.sym          |  1 +
 sysdeps/x86/cpu-features.c                    | 55 +++++++++++-
 sysdeps/x86/include/cpu-features.h            |  2 +
 sysdeps/x86/sysdep.h                          | 18 +++-
 sysdeps/x86_64/configure                      | 28 ++++++
 sysdeps/x86_64/configure.ac                   | 15 ++++
 sysdeps/x86_64/dl-tlsdesc-dynamic.h           |  2 +-
 14 files changed, 302 insertions(+), 6 deletions(-)
 create mode 100644 sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod0.c
 create mode 100644 sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod1.c
 create mode 100644 sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod2.c
 create mode 100644 sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.c
 create mode 100644 sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.h

Comments

Sunil Pandey Feb. 28, 2024, 7:26 p.m. UTC | #1
On Wed, Feb 28, 2024 at 9:50 AM H.J. Lu <hjl.tools@gmail.com> wrote:

> _dl_tlsdesc_dynamic should also preserve AMX registers which are
> caller-saved.  Add X86_XSTATE_TILECFG_ID and X86_XSTATE_TILEDATA_ID
> to x86-64 TLSDESC_CALL_STATE_SAVE_MASK.  Compute the AMX state size
> and save it in xsave_state_full_size which is only used by
> _dl_tlsdesc_dynamic_xsave and _dl_tlsdesc_dynamic_xsavec.  This fixes
> the AMX part of BZ #31372.  Tested on AMX processor.
>
> AMX test is enabled only for compilers with the fix for
>
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114098
>
> GCC 14 and GCC 11/12/13 branches have the bug fix.
> ---
>  sysdeps/unix/sysv/linux/x86_64/Makefile       | 27 ++++++
>  .../sysv/linux/x86_64/include/asm/prctl.h     |  5 ++
>  .../linux/x86_64/tst-gnu2-tls2-amx-mod0.c     |  2 +
>  .../linux/x86_64/tst-gnu2-tls2-amx-mod1.c     |  2 +
>  .../linux/x86_64/tst-gnu2-tls2-amx-mod2.c     |  2 +
>  .../sysv/linux/x86_64/tst-gnu2-tls2-amx.c     | 86 +++++++++++++++++++
>  .../sysv/linux/x86_64/tst-gnu2-tls2-amx.h     | 63 ++++++++++++++
>  sysdeps/x86/cpu-features-offsets.sym          |  1 +
>  sysdeps/x86/cpu-features.c                    | 55 +++++++++++-
>  sysdeps/x86/include/cpu-features.h            |  2 +
>  sysdeps/x86/sysdep.h                          | 18 +++-
>  sysdeps/x86_64/configure                      | 28 ++++++
>  sysdeps/x86_64/configure.ac                   | 15 ++++
>  sysdeps/x86_64/dl-tlsdesc-dynamic.h           |  2 +-
>  14 files changed, 302 insertions(+), 6 deletions(-)
>  create mode 100644 sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod0.c
>  create mode 100644 sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod1.c
>  create mode 100644 sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod2.c
>  create mode 100644 sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.c
>  create mode 100644 sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.h
>
> diff --git a/sysdeps/unix/sysv/linux/x86_64/Makefile
> b/sysdeps/unix/sysv/linux/x86_64/Makefile
> index 7d1d205fa0..fcbffd81cb 100644
> --- a/sysdeps/unix/sysv/linux/x86_64/Makefile
> +++ b/sysdeps/unix/sysv/linux/x86_64/Makefile
> @@ -66,6 +66,33 @@ $(objpfx)libx86-64-isa-level%.os:
> $(..)/sysdeps/unix/sysv/linux/x86_64/x86-64-is
>  $(objpfx)libx86-64-isa-level.so: $(objpfx)libx86-64-isa-level-1.so
>         cp $< $@
>  endif
> +
> +ifeq (yes,$(have-mamx-tile))
> +tests += \
> +  tst-gnu2-tls2-amx \
> +# tests
> +
> +modules-names += \
> +  tst-gnu2-tls2-amx-mod0 \
> +  tst-gnu2-tls2-amx-mod1 \
> +  tst-gnu2-tls2-amx-mod2 \
> +# modules-names
> +
> +$(objpfx)tst-gnu2-tls2-amx: $(shared-thread-library)
> +$(objpfx)tst-gnu2-tls2-amx.out: \
> +  $(objpfx)tst-gnu2-tls2-amx-mod0.so \
> +  $(objpfx)tst-gnu2-tls2-amx-mod1.so \
> +  $(objpfx)tst-gnu2-tls2-amx-mod2.so
> +$(objpfx)tst-gnu2-tls2-amx-mod0.so: $(libsupport)
> +$(objpfx)tst-gnu2-tls2-amx-mod1.so: $(libsupport)
> +$(objpfx)tst-gnu2-tls2-amx-mod2.so: $(libsupport)
> +
> +CFLAGS-tst-gnu2-tls2-amx.c += -mamx-tile
> +CFLAGS-tst-gnu2-tls2-amx-mod0.c += -mamx-tile -mtls-dialect=gnu2
> +CFLAGS-tst-gnu2-tls2-amx-mod1.c += -mamx-tile -mtls-dialect=gnu2
> +CFLAGS-tst-gnu2-tls2-amx-mod2.c += -mamx-tile -mtls-dialect=gnu2
> +endif
> +
>  endif # $(subdir) == elf
>
>  ifneq ($(enable-cet),no)
> diff --git a/sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h
> b/sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h
> index 2f511321ad..ef4631bf4b 100644
> --- a/sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h
> +++ b/sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h
> @@ -20,3 +20,8 @@
>  # define ARCH_SHSTK_SHSTK              0x1
>  # define ARCH_SHSTK_WRSS               0x2
>  #endif
> +
> +#ifndef ARCH_GET_XCOMP_PERM
> +# define ARCH_GET_XCOMP_PERM           0x1022
> +# define ARCH_REQ_XCOMP_PERM           0x1023
> +#endif
> diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod0.c
> b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod0.c
> new file mode 100644
> index 0000000000..2e0c7b91b7
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod0.c
> @@ -0,0 +1,2 @@
> +#include "tst-gnu2-tls2-amx.h"
> +#include <tst-gnu2-tls2mod0.c>
> diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod1.c
> b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod1.c
> new file mode 100644
> index 0000000000..b8a8ccf1c1
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod1.c
> @@ -0,0 +1,2 @@
> +#include "tst-gnu2-tls2-amx.h"
> +#include <tst-gnu2-tls2mod1.c>
> diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod2.c
> b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod2.c
> new file mode 100644
> index 0000000000..cdf4a8f363
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod2.c
> @@ -0,0 +1,2 @@
> +#include "tst-gnu2-tls2-amx.h"
> +#include <tst-gnu2-tls2mod2.c>
> diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.c
> b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.c
> new file mode 100644
> index 0000000000..9efa2f3270
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.c
> @@ -0,0 +1,86 @@
> +/* Test TLSDESC relocation with AMX.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <stdbool.h>
> +#include <asm/prctl.h>
> +#include <support/check.h>
> +#include "tst-gnu2-tls2-amx.h"
> +
> +extern int arch_prctl (int, ...);
> +
> +#define X86_XSTATE_TILECFG_ID  17
> +#define X86_XSTATE_TILEDATA_ID 18
> +
> +/* Initialize tile config.  */
> +__attribute__ ((noinline, noclone))
> +static void
> +init_tile_config (__tilecfg *tileinfo)
> +{
> +  int i;
> +  tileinfo->palette_id = 1;
> +  tileinfo->start_row = 0;
> +
> +  for (i = 0; i < 1; ++i)
> +  {
> +    tileinfo->colsb[i] = MAX_ROWS;
> +    tileinfo->rows[i] = MAX_ROWS;
> +  }
> +
> +  for (i = 1; i < 4; ++i)
> +  {
> +    tileinfo->colsb[i] = MAX_COLS;
> +    tileinfo->rows[i] = MAX_ROWS;
> +  }
>

Can it be written as
 for (i = 0; i < 4; ++i)
{
...
}

If not, please add some comments on the above 2 loops.


> +
> +  _tile_loadconfig (tileinfo);
> +}
> +
> +static bool
> +enable_amx (void)
> +{
> +  uint64_t bitmask;
> +  if (arch_prctl (ARCH_GET_XCOMP_PERM, &bitmask) != 0)
> +    return false;
> +
> +  if ((bitmask & (1 << X86_XSTATE_TILECFG_ID)) == 0)
> +    return false;
> +
> +  if (arch_prctl (ARCH_REQ_XCOMP_PERM, X86_XSTATE_TILEDATA_ID) != 0)
> +    return false;
> +
> +  /* Load tile configuration.  */
> +  __tilecfg tile_data = { 0 };
> +  init_tile_config (&tile_data);
> +
> +  return true;
> +}
> +
> +/* An architecture can define it to clobber caller-saved registers in
> +   malloc below to verify that the implicit TLSDESC call won't change
> +   caller-saved registers.  */
> +static void
> +clear_tile_register (void)
> +{
> +  _tile_zero (2);
> +}
> +
> +#define MOD(i) "tst-gnu2-tls2-amx-mod" #i ".so"
> +#define IS_SUPPORTED() enable_amx ()
> +#define PREPARE_MALLOC() clear_tile_register ()
> +
> +#include <elf/tst-gnu2-tls2.c>
> diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.h
> b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.h
> new file mode 100644
> index 0000000000..1845a3caba
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.h
> @@ -0,0 +1,63 @@
> +/* Test TLSDESC relocation with AMX.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <stdint.h>
> +#include <string.h>
> +#include <x86intrin.h>
> +#include <support/check.h>
> +
> +#define MAX_ROWS 16
> +#define MAX_COLS 64
> +#define MAX 1024
> +#define STRIDE 64
> +
> +typedef struct __tile_config
> +{
> +  uint8_t palette_id;
> +  uint8_t start_row;
> +  uint8_t reserved_0[14];
> +  uint16_t colsb[16];
> +  uint8_t rows[16];
> +} __tilecfg __attribute__ ((aligned (64)));
> +
> +/* Initialize int8_t buffer */
> +static inline void
> +init_buffer (int8_t *buf, int8_t value)
> +{
> +  int rows, colsb, i, j;
> +  rows  = MAX_ROWS;
> +  colsb = MAX_COLS;
> +
> +  for (i = 0; i < rows; i++)
> +    for (j = 0; j < colsb; j++)
> +      buf[i * colsb + j] = value;
> +}
> +
> +#define BEFORE_TLSDESC_CALL()                                  \
> +  int8_t src[MAX];                                             \
> +  int8_t res[MAX];                                             \
> +  /* Initialize src with data  */                              \
> +  init_buffer (src, 2);                                                \
> +  /* Load tile rows from memory.  */                           \
> +  _tile_loadd (2, src, STRIDE);
> +
> +#define AFTER_TLSDESC_CALL()                                   \
> +  /* Store the tile data to memory.  */                                \
> +  _tile_stored (2, res, STRIDE);                               \
> +  _tile_release ();                                            \
> +  TEST_VERIFY_EXIT (memcmp (src, res, sizeof (res)) == 0);
> diff --git a/sysdeps/x86/cpu-features-offsets.sym
> b/sysdeps/x86/cpu-features-offsets.sym
> index 6a8fd29813..21fc88d651 100644
> --- a/sysdeps/x86/cpu-features-offsets.sym
> +++ b/sysdeps/x86/cpu-features-offsets.sym
> @@ -3,3 +3,4 @@
>  #include <ldsodefs.h>
>
>  XSAVE_STATE_SIZE_OFFSET        offsetof (struct cpu_features,
> xsave_state_size)
> +XSAVE_STATE_FULL_SIZE_OFFSET offsetof (struct cpu_features,
> xsave_state_full_size)
> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> index 835113b42f..d71e8d3d2e 100644
> --- a/sysdeps/x86/cpu-features.c
> +++ b/sysdeps/x86/cpu-features.c
> @@ -307,6 +307,8 @@ update_active (struct cpu_features *cpu_features)
>           __cpuid_count (0xd, 0, eax, ebx, ecx, edx);
>           if (ebx != 0)
>             {
> +             /* NB: On AMX capable processors, ebx always includes AMX
> +                states.  */
>               unsigned int xsave_state_full_size
>                 = ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64);
>
> @@ -320,6 +322,11 @@ update_active (struct cpu_features *cpu_features)
>                 {
>                   unsigned int xstate_comp_offsets[32];
>                   unsigned int xstate_comp_sizes[32];
> +#ifdef __x86_64__
> +                 unsigned int xstate_amx_comp_offsets[32];
> +                 unsigned int xstate_amx_comp_sizes[32];
> +                 unsigned int amx_ecx;
> +#endif
>                   unsigned int i;
>
>                   xstate_comp_offsets[0] = 0;
> @@ -327,16 +334,39 @@ update_active (struct cpu_features *cpu_features)
>                   xstate_comp_offsets[2] = 576;
>                   xstate_comp_sizes[0] = 160;
>                   xstate_comp_sizes[1] = 256;
> +#ifdef __x86_64__
> +                 xstate_amx_comp_offsets[0] = 0;
> +                 xstate_amx_comp_offsets[1] = 160;
> +                 xstate_amx_comp_offsets[2] = 576;
> +                 xstate_amx_comp_sizes[0] = 160;
> +                 xstate_amx_comp_sizes[1] = 256;
> +#endif
>
>                   for (i = 2; i < 32; i++)
>                     {
> -                     if ((STATE_SAVE_MASK & (1 << i)) != 0)
> +                     if ((FULL_STATE_SAVE_MASK & (1 << i)) != 0)
>                         {
>                           __cpuid_count (0xd, i, eax, ebx, ecx, edx);
> -                         xstate_comp_sizes[i] = eax;
> +#ifdef __x86_64__
> +                         /* Include this in xsave_state_full_size.  */
> +                         amx_ecx = ecx;
> +                         xstate_amx_comp_sizes[i] = eax;
> +                         if ((AMX_STATE_SAVE_MASK & (1 << i)) != 0)
> +                           {
> +                             /* Exclude this from xsave_state_size.  */
> +                             ecx = 0;
> +                             xstate_comp_sizes[i] = 0;
> +                           }
> +                         else
> +#endif
> +                           xstate_comp_sizes[i] = eax;
>                         }
>                       else
>                         {
> +#ifdef __x86_64__
> +                         amx_ecx = 0;
> +                         xstate_amx_comp_sizes[i] = 0;
> +#endif
>                           ecx = 0;
>                           xstate_comp_sizes[i] = 0;
>                         }
> @@ -349,6 +379,15 @@ update_active (struct cpu_features *cpu_features)
>                           if ((ecx & (1 << 1)) != 0)
>                             xstate_comp_offsets[i]
>                               = ALIGN_UP (xstate_comp_offsets[i], 64);
> +#ifdef __x86_64__
> +                         xstate_amx_comp_offsets[i]
> +                           = (xstate_amx_comp_offsets[i - 1]
> +                              + xstate_amx_comp_sizes[i - 1]);
> +                         if ((amx_ecx & (1 << 1)) != 0)
> +                           xstate_amx_comp_offsets[i]
> +                             = ALIGN_UP (xstate_amx_comp_offsets[i],
> +                                         64);
> +#endif
>                         }
>                     }
>
> @@ -357,6 +396,18 @@ update_active (struct cpu_features *cpu_features)
>                     = xstate_comp_offsets[31] + xstate_comp_sizes[31];
>                   if (size)
>                     {
> +#ifdef __x86_64__
> +                     unsigned int amx_size
> +                       = (xstate_amx_comp_offsets[31]
> +                          + xstate_amx_comp_sizes[31]);
> +                     amx_size = ALIGN_UP (amx_size + STATE_SAVE_OFFSET,
> +                                          64);
> +                     /* Set xsave_state_full_size to the compact AMX
> +                        state size for XSAVEC.  NB: xsave_state_full_size
> +                        is only used in _dl_tlsdesc_dynamic_xsave and
> +                        _dl_tlsdesc_dynamic_xsavec.  */
> +                     cpu_features->xsave_state_full_size = amx_size;
> +#endif
>                       cpu_features->xsave_state_size
>                         = ALIGN_UP (size + STATE_SAVE_OFFSET, 64);
>                       CPU_FEATURE_SET (cpu_features, XSAVEC);
> diff --git a/sysdeps/x86/include/cpu-features.h
> b/sysdeps/x86/include/cpu-features.h
> index b9bf3115b6..cd7bd27cf3 100644
> --- a/sysdeps/x86/include/cpu-features.h
> +++ b/sysdeps/x86/include/cpu-features.h
> @@ -934,6 +934,8 @@ struct cpu_features
>    /* The full state size for XSAVE when XSAVEC is disabled by
>
>       GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC
> +
> +     and the AMX state size when XSAVEC is available.
>     */
>    unsigned int xsave_state_full_size;
>    /* Data cache size for use in memory and string routines, typically
> diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
> index 485cad9c02..db8e576e91 100644
> --- a/sysdeps/x86/sysdep.h
> +++ b/sysdeps/x86/sysdep.h
> @@ -56,6 +56,14 @@
>     | (1 << X86_XSTATE_ZMM_H_ID)        \
>     | (1 << X86_XSTATE_ZMM_ID)          \
>     | (1 << X86_XSTATE_APX_F_ID))
> +
> +/* AMX state mask.  */
> +# define AMX_STATE_SAVE_MASK           \
> +  ((1 << X86_XSTATE_TILECFG_ID) | (1 << X86_XSTATE_TILEDATA_ID))
> +
> +/* States to be included in xsave_state_full_size.  */
> +# define FULL_STATE_SAVE_MASK          \
> +  (STATE_SAVE_MASK | AMX_STATE_SAVE_MASK)
>  #else
>  /* Offset for fxsave/xsave area used by _dl_tlsdesc_dynamic.  Since i386
>     doesn't have red-zone, use 0 here.  */
> @@ -68,13 +76,17 @@
>     | (1 << X86_XSTATE_BNDREGS_ID)      \
>     | (1 << X86_XSTATE_K_ID)            \
>     | (1 << X86_XSTATE_ZMM_H_ID))
> +
> +/* States to be included in xsave_state_size.  */
> +# define FULL_STATE_SAVE_MASK          STATE_SAVE_MASK
>  #endif
>
>  /* States which should be saved for TLSDESC_CALL and TLS_DESC_CALL.
> -   Compiler assumes that all registers, including x87 FPU stack registers,
> -   are unchanged after CALL, except for EFLAGS and RAX/EAX.  */
> +   Compiler assumes that all registers, including AMX and x87 FPU
> +   stack registers, are unchanged after CALL, except for EFLAGS and
> +   RAX/EAX.  */
>  #define TLSDESC_CALL_STATE_SAVE_MASK   \
> -  (STATE_SAVE_MASK | (1 << X86_XSTATE_X87_ID))
> +  (FULL_STATE_SAVE_MASK | (1 << X86_XSTATE_X87_ID))
>
>  /* Constants for bits in __x86_string_control:  */
>
> diff --git a/sysdeps/x86_64/configure b/sysdeps/x86_64/configure
> index 418cc4a9b8..04a534fa12 100755
> --- a/sysdeps/x86_64/configure
> +++ b/sysdeps/x86_64/configure
> @@ -134,6 +134,34 @@ fi
>  config_vars="$config_vars
>  enable-cet = $enable_cet"
>
> +# Check if -mamx-tile works properly.
> +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether -mamx-tile
> works properly" >&5
> +printf %s "checking whether -mamx-tile works properly... " >&6; }
> +if test ${libc_cv_x86_have_amx_tile+y}
> +then :
> +  printf %s "(cached) " >&6
> +else $as_nop
> +  cat > conftest.c <<EOF
> +#include <x86intrin.h>
> +EOF
> +              libc_cv_x86_have_amx_tile=no
> +              if { ac_try='${CC-cc} -E $CFLAGS -mamx-tile conftest.c >
> conftest.i'
> +  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
> +  (eval $ac_try) 2>&5
> +  ac_status=$?
> +  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
> +  test $ac_status = 0; }; }; then
> +                if grep -q __builtin_ia32_ldtilecfg conftest.i; then
> +                  libc_cv_x86_have_amx_tile=yes
> +                fi
> +              fi
> +              rm -rf conftest*
> +fi
> +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result:
> $libc_cv_x86_have_amx_tile" >&5
> +printf "%s\n" "$libc_cv_x86_have_amx_tile" >&6; }
> +config_vars="$config_vars
> +have-mamx-tile = $libc_cv_x86_have_amx_tile"
> +
>  test -n "$critic_missing" && as_fn_error $? "
>  *** $critic_missing" "$LINENO" 5
>
> diff --git a/sysdeps/x86_64/configure.ac b/sysdeps/x86_64/configure.ac
> index d1f803c02e..c714c47351 100644
> --- a/sysdeps/x86_64/configure.ac
> +++ b/sysdeps/x86_64/configure.ac
> @@ -61,5 +61,20 @@ elif test $enable_cet = permissive; then
>  fi
>  LIBC_CONFIG_VAR([enable-cet], [$enable_cet])
>
> +# Check if -mamx-tile works properly.
> +AC_CACHE_CHECK(whether -mamx-tile works properly,
> +              libc_cv_x86_have_amx_tile, [dnl
> +cat > conftest.c <<EOF
> +#include <x86intrin.h>
> +EOF
> +              libc_cv_x86_have_amx_tile=no
> +              if AC_TRY_COMMAND(${CC-cc} -E $CFLAGS -mamx-tile conftest.c
> > conftest.i); then
> +                if grep -q __builtin_ia32_ldtilecfg conftest.i; then
> +                  libc_cv_x86_have_amx_tile=yes
> +                fi
> +              fi
> +              rm -rf conftest*])
> +LIBC_CONFIG_VAR([have-mamx-tile], [$libc_cv_x86_have_amx_tile])
> +
>  test -n "$critic_missing" && AC_MSG_ERROR([
>  *** $critic_missing])
> diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h
> b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
> index 0c2e8d5320..9f02cfc3eb 100644
> --- a/sysdeps/x86_64/dl-tlsdesc-dynamic.h
> +++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
> @@ -99,7 +99,7 @@ _dl_tlsdesc_dynamic:
>  # endif
>  #else
>         /* Allocate stack space of the required size to save the state.  */
> -       sub
>  _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip),
> %RSP_LP
> +       sub
>  _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_FULL_SIZE_OFFSET(%rip),
> %RSP_LP
>  #endif
>         /* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9,
>            r10 and r11.  */
> --
> 2.43.2
>
>
H.J. Lu Feb. 28, 2024, 7:58 p.m. UTC | #2
On Wed, Feb 28, 2024 at 11:27 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
>
>
> On Wed, Feb 28, 2024 at 9:50 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>>
>> _dl_tlsdesc_dynamic should also preserve AMX registers which are
>> caller-saved.  Add X86_XSTATE_TILECFG_ID and X86_XSTATE_TILEDATA_ID
>> to x86-64 TLSDESC_CALL_STATE_SAVE_MASK.  Compute the AMX state size
>> and save it in xsave_state_full_size which is only used by
>> _dl_tlsdesc_dynamic_xsave and _dl_tlsdesc_dynamic_xsavec.  This fixes
>> the AMX part of BZ #31372.  Tested on AMX processor.
>>
>> AMX test is enabled only for compilers with the fix for
>>
>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114098
>>
>> GCC 14 and GCC 11/12/13 branches have the bug fix.
>> ---
>>  sysdeps/unix/sysv/linux/x86_64/Makefile       | 27 ++++++
>>  .../sysv/linux/x86_64/include/asm/prctl.h     |  5 ++
>>  .../linux/x86_64/tst-gnu2-tls2-amx-mod0.c     |  2 +
>>  .../linux/x86_64/tst-gnu2-tls2-amx-mod1.c     |  2 +
>>  .../linux/x86_64/tst-gnu2-tls2-amx-mod2.c     |  2 +
>>  .../sysv/linux/x86_64/tst-gnu2-tls2-amx.c     | 86 +++++++++++++++++++
>>  .../sysv/linux/x86_64/tst-gnu2-tls2-amx.h     | 63 ++++++++++++++
>>  sysdeps/x86/cpu-features-offsets.sym          |  1 +
>>  sysdeps/x86/cpu-features.c                    | 55 +++++++++++-
>>  sysdeps/x86/include/cpu-features.h            |  2 +
>>  sysdeps/x86/sysdep.h                          | 18 +++-
>>  sysdeps/x86_64/configure                      | 28 ++++++
>>  sysdeps/x86_64/configure.ac                   | 15 ++++
>>  sysdeps/x86_64/dl-tlsdesc-dynamic.h           |  2 +-
>>  14 files changed, 302 insertions(+), 6 deletions(-)
>>  create mode 100644 sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod0.c
>>  create mode 100644 sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod1.c
>>  create mode 100644 sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod2.c
>>  create mode 100644 sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.c
>>  create mode 100644 sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.h
>>
>> diff --git a/sysdeps/unix/sysv/linux/x86_64/Makefile b/sysdeps/unix/sysv/linux/x86_64/Makefile
>> index 7d1d205fa0..fcbffd81cb 100644
>> --- a/sysdeps/unix/sysv/linux/x86_64/Makefile
>> +++ b/sysdeps/unix/sysv/linux/x86_64/Makefile
>> @@ -66,6 +66,33 @@ $(objpfx)libx86-64-isa-level%.os: $(..)/sysdeps/unix/sysv/linux/x86_64/x86-64-is
>>  $(objpfx)libx86-64-isa-level.so: $(objpfx)libx86-64-isa-level-1.so
>>         cp $< $@
>>  endif
>> +
>> +ifeq (yes,$(have-mamx-tile))
>> +tests += \
>> +  tst-gnu2-tls2-amx \
>> +# tests
>> +
>> +modules-names += \
>> +  tst-gnu2-tls2-amx-mod0 \
>> +  tst-gnu2-tls2-amx-mod1 \
>> +  tst-gnu2-tls2-amx-mod2 \
>> +# modules-names
>> +
>> +$(objpfx)tst-gnu2-tls2-amx: $(shared-thread-library)
>> +$(objpfx)tst-gnu2-tls2-amx.out: \
>> +  $(objpfx)tst-gnu2-tls2-amx-mod0.so \
>> +  $(objpfx)tst-gnu2-tls2-amx-mod1.so \
>> +  $(objpfx)tst-gnu2-tls2-amx-mod2.so
>> +$(objpfx)tst-gnu2-tls2-amx-mod0.so: $(libsupport)
>> +$(objpfx)tst-gnu2-tls2-amx-mod1.so: $(libsupport)
>> +$(objpfx)tst-gnu2-tls2-amx-mod2.so: $(libsupport)
>> +
>> +CFLAGS-tst-gnu2-tls2-amx.c += -mamx-tile
>> +CFLAGS-tst-gnu2-tls2-amx-mod0.c += -mamx-tile -mtls-dialect=gnu2
>> +CFLAGS-tst-gnu2-tls2-amx-mod1.c += -mamx-tile -mtls-dialect=gnu2
>> +CFLAGS-tst-gnu2-tls2-amx-mod2.c += -mamx-tile -mtls-dialect=gnu2
>> +endif
>> +
>>  endif # $(subdir) == elf
>>
>>  ifneq ($(enable-cet),no)
>> diff --git a/sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h b/sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h
>> index 2f511321ad..ef4631bf4b 100644
>> --- a/sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h
>> +++ b/sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h
>> @@ -20,3 +20,8 @@
>>  # define ARCH_SHSTK_SHSTK              0x1
>>  # define ARCH_SHSTK_WRSS               0x2
>>  #endif
>> +
>> +#ifndef ARCH_GET_XCOMP_PERM
>> +# define ARCH_GET_XCOMP_PERM           0x1022
>> +# define ARCH_REQ_XCOMP_PERM           0x1023
>> +#endif
>> diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod0.c b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod0.c
>> new file mode 100644
>> index 0000000000..2e0c7b91b7
>> --- /dev/null
>> +++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod0.c
>> @@ -0,0 +1,2 @@
>> +#include "tst-gnu2-tls2-amx.h"
>> +#include <tst-gnu2-tls2mod0.c>
>> diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod1.c b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod1.c
>> new file mode 100644
>> index 0000000000..b8a8ccf1c1
>> --- /dev/null
>> +++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod1.c
>> @@ -0,0 +1,2 @@
>> +#include "tst-gnu2-tls2-amx.h"
>> +#include <tst-gnu2-tls2mod1.c>
>> diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod2.c b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod2.c
>> new file mode 100644
>> index 0000000000..cdf4a8f363
>> --- /dev/null
>> +++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod2.c
>> @@ -0,0 +1,2 @@
>> +#include "tst-gnu2-tls2-amx.h"
>> +#include <tst-gnu2-tls2mod2.c>
>> diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.c b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.c
>> new file mode 100644
>> index 0000000000..9efa2f3270
>> --- /dev/null
>> +++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.c
>> @@ -0,0 +1,86 @@
>> +/* Test TLSDESC relocation with AMX.
>> +   Copyright (C) 2024 Free Software Foundation, Inc.
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <http://www.gnu.org/licenses/>.  */
>> +
>> +#include <stdbool.h>
>> +#include <asm/prctl.h>
>> +#include <support/check.h>
>> +#include "tst-gnu2-tls2-amx.h"
>> +
>> +extern int arch_prctl (int, ...);
>> +
>> +#define X86_XSTATE_TILECFG_ID  17
>> +#define X86_XSTATE_TILEDATA_ID 18
>> +
>> +/* Initialize tile config.  */
>> +__attribute__ ((noinline, noclone))
>> +static void
>> +init_tile_config (__tilecfg *tileinfo)
>> +{
>> +  int i;
>> +  tileinfo->palette_id = 1;
>> +  tileinfo->start_row = 0;
>> +
>> +  for (i = 0; i < 1; ++i)
>> +  {
>> +    tileinfo->colsb[i] = MAX_ROWS;
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>> +    tileinfo->rows[i] = MAX_ROWS;
>> +  }
>> +
>> +  for (i = 1; i < 4; ++i)
>> +  {
>> +    tileinfo->colsb[i] = MAX_COLS;
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>> +    tileinfo->rows[i] = MAX_ROWS;
>> +  }
>
>
> Can it be written as
>  for (i = 0; i < 4; ++i)
> {
> ...
> }
>
> If not, please add some comments on the above 2 loops.

It is MAX_ROWS vs MAX_COLS.  But the first loop can
be unrolled.  Will send v2.

>>
>> +
>> +  _tile_loadconfig (tileinfo);
>> +}
>> +
>> +static bool
>> +enable_amx (void)
>> +{
>> +  uint64_t bitmask;
>> +  if (arch_prctl (ARCH_GET_XCOMP_PERM, &bitmask) != 0)
>> +    return false;
>> +
>> +  if ((bitmask & (1 << X86_XSTATE_TILECFG_ID)) == 0)
>> +    return false;
>> +
>> +  if (arch_prctl (ARCH_REQ_XCOMP_PERM, X86_XSTATE_TILEDATA_ID) != 0)
>> +    return false;
>> +
>> +  /* Load tile configuration.  */
>> +  __tilecfg tile_data = { 0 };
>> +  init_tile_config (&tile_data);
>> +
>> +  return true;
>> +}
>> +
>> +/* An architecture can define it to clobber caller-saved registers in
>> +   malloc below to verify that the implicit TLSDESC call won't change
>> +   caller-saved registers.  */
>> +static void
>> +clear_tile_register (void)
>> +{
>> +  _tile_zero (2);
>> +}
>> +
>> +#define MOD(i) "tst-gnu2-tls2-amx-mod" #i ".so"
>> +#define IS_SUPPORTED() enable_amx ()
>> +#define PREPARE_MALLOC() clear_tile_register ()
>> +
>> +#include <elf/tst-gnu2-tls2.c>
>> diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.h b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.h
>> new file mode 100644
>> index 0000000000..1845a3caba
>> --- /dev/null
>> +++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.h
>> @@ -0,0 +1,63 @@
>> +/* Test TLSDESC relocation with AMX.
>> +   Copyright (C) 2024 Free Software Foundation, Inc.
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <http://www.gnu.org/licenses/>.  */
>> +
>> +#include <stdint.h>
>> +#include <string.h>
>> +#include <x86intrin.h>
>> +#include <support/check.h>
>> +
>> +#define MAX_ROWS 16
>> +#define MAX_COLS 64
>> +#define MAX 1024
>> +#define STRIDE 64
>> +
>> +typedef struct __tile_config
>> +{
>> +  uint8_t palette_id;
>> +  uint8_t start_row;
>> +  uint8_t reserved_0[14];
>> +  uint16_t colsb[16];
>> +  uint8_t rows[16];
>> +} __tilecfg __attribute__ ((aligned (64)));
>> +
>> +/* Initialize int8_t buffer */
>> +static inline void
>> +init_buffer (int8_t *buf, int8_t value)
>> +{
>> +  int rows, colsb, i, j;
>> +  rows  = MAX_ROWS;
>> +  colsb = MAX_COLS;
>> +
>> +  for (i = 0; i < rows; i++)
>> +    for (j = 0; j < colsb; j++)
>> +      buf[i * colsb + j] = value;
>> +}
>> +
>> +#define BEFORE_TLSDESC_CALL()                                  \
>> +  int8_t src[MAX];                                             \
>> +  int8_t res[MAX];                                             \
>> +  /* Initialize src with data  */                              \
>> +  init_buffer (src, 2);                                                \
>> +  /* Load tile rows from memory.  */                           \
>> +  _tile_loadd (2, src, STRIDE);
>> +
>> +#define AFTER_TLSDESC_CALL()                                   \
>> +  /* Store the tile data to memory.  */                                \
>> +  _tile_stored (2, res, STRIDE);                               \
>> +  _tile_release ();                                            \
>> +  TEST_VERIFY_EXIT (memcmp (src, res, sizeof (res)) == 0);
>> diff --git a/sysdeps/x86/cpu-features-offsets.sym b/sysdeps/x86/cpu-features-offsets.sym
>> index 6a8fd29813..21fc88d651 100644
>> --- a/sysdeps/x86/cpu-features-offsets.sym
>> +++ b/sysdeps/x86/cpu-features-offsets.sym
>> @@ -3,3 +3,4 @@
>>  #include <ldsodefs.h>
>>
>>  XSAVE_STATE_SIZE_OFFSET        offsetof (struct cpu_features, xsave_state_size)
>> +XSAVE_STATE_FULL_SIZE_OFFSET offsetof (struct cpu_features, xsave_state_full_size)
>> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
>> index 835113b42f..d71e8d3d2e 100644
>> --- a/sysdeps/x86/cpu-features.c
>> +++ b/sysdeps/x86/cpu-features.c
>> @@ -307,6 +307,8 @@ update_active (struct cpu_features *cpu_features)
>>           __cpuid_count (0xd, 0, eax, ebx, ecx, edx);
>>           if (ebx != 0)
>>             {
>> +             /* NB: On AMX capable processors, ebx always includes AMX
>> +                states.  */
>>               unsigned int xsave_state_full_size
>>                 = ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64);
>>
>> @@ -320,6 +322,11 @@ update_active (struct cpu_features *cpu_features)
>>                 {
>>                   unsigned int xstate_comp_offsets[32];
>>                   unsigned int xstate_comp_sizes[32];
>> +#ifdef __x86_64__
>> +                 unsigned int xstate_amx_comp_offsets[32];
>> +                 unsigned int xstate_amx_comp_sizes[32];
>> +                 unsigned int amx_ecx;
>> +#endif
>>                   unsigned int i;
>>
>>                   xstate_comp_offsets[0] = 0;
>> @@ -327,16 +334,39 @@ update_active (struct cpu_features *cpu_features)
>>                   xstate_comp_offsets[2] = 576;
>>                   xstate_comp_sizes[0] = 160;
>>                   xstate_comp_sizes[1] = 256;
>> +#ifdef __x86_64__
>> +                 xstate_amx_comp_offsets[0] = 0;
>> +                 xstate_amx_comp_offsets[1] = 160;
>> +                 xstate_amx_comp_offsets[2] = 576;
>> +                 xstate_amx_comp_sizes[0] = 160;
>> +                 xstate_amx_comp_sizes[1] = 256;
>> +#endif
>>
>>                   for (i = 2; i < 32; i++)
>>                     {
>> -                     if ((STATE_SAVE_MASK & (1 << i)) != 0)
>> +                     if ((FULL_STATE_SAVE_MASK & (1 << i)) != 0)
>>                         {
>>                           __cpuid_count (0xd, i, eax, ebx, ecx, edx);
>> -                         xstate_comp_sizes[i] = eax;
>> +#ifdef __x86_64__
>> +                         /* Include this in xsave_state_full_size.  */
>> +                         amx_ecx = ecx;
>> +                         xstate_amx_comp_sizes[i] = eax;
>> +                         if ((AMX_STATE_SAVE_MASK & (1 << i)) != 0)
>> +                           {
>> +                             /* Exclude this from xsave_state_size.  */
>> +                             ecx = 0;
>> +                             xstate_comp_sizes[i] = 0;
>> +                           }
>> +                         else
>> +#endif
>> +                           xstate_comp_sizes[i] = eax;
>>                         }
>>                       else
>>                         {
>> +#ifdef __x86_64__
>> +                         amx_ecx = 0;
>> +                         xstate_amx_comp_sizes[i] = 0;
>> +#endif
>>                           ecx = 0;
>>                           xstate_comp_sizes[i] = 0;
>>                         }
>> @@ -349,6 +379,15 @@ update_active (struct cpu_features *cpu_features)
>>                           if ((ecx & (1 << 1)) != 0)
>>                             xstate_comp_offsets[i]
>>                               = ALIGN_UP (xstate_comp_offsets[i], 64);
>> +#ifdef __x86_64__
>> +                         xstate_amx_comp_offsets[i]
>> +                           = (xstate_amx_comp_offsets[i - 1]
>> +                              + xstate_amx_comp_sizes[i - 1]);
>> +                         if ((amx_ecx & (1 << 1)) != 0)
>> +                           xstate_amx_comp_offsets[i]
>> +                             = ALIGN_UP (xstate_amx_comp_offsets[i],
>> +                                         64);
>> +#endif
>>                         }
>>                     }
>>
>> @@ -357,6 +396,18 @@ update_active (struct cpu_features *cpu_features)
>>                     = xstate_comp_offsets[31] + xstate_comp_sizes[31];
>>                   if (size)
>>                     {
>> +#ifdef __x86_64__
>> +                     unsigned int amx_size
>> +                       = (xstate_amx_comp_offsets[31]
>> +                          + xstate_amx_comp_sizes[31]);
>> +                     amx_size = ALIGN_UP (amx_size + STATE_SAVE_OFFSET,
>> +                                          64);
>> +                     /* Set xsave_state_full_size to the compact AMX
>> +                        state size for XSAVEC.  NB: xsave_state_full_size
>> +                        is only used in _dl_tlsdesc_dynamic_xsave and
>> +                        _dl_tlsdesc_dynamic_xsavec.  */
>> +                     cpu_features->xsave_state_full_size = amx_size;
>> +#endif
>>                       cpu_features->xsave_state_size
>>                         = ALIGN_UP (size + STATE_SAVE_OFFSET, 64);
>>                       CPU_FEATURE_SET (cpu_features, XSAVEC);
>> diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
>> index b9bf3115b6..cd7bd27cf3 100644
>> --- a/sysdeps/x86/include/cpu-features.h
>> +++ b/sysdeps/x86/include/cpu-features.h
>> @@ -934,6 +934,8 @@ struct cpu_features
>>    /* The full state size for XSAVE when XSAVEC is disabled by
>>
>>       GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC
>> +
>> +     and the AMX state size when XSAVEC is available.
>>     */
>>    unsigned int xsave_state_full_size;
>>    /* Data cache size for use in memory and string routines, typically
>> diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
>> index 485cad9c02..db8e576e91 100644
>> --- a/sysdeps/x86/sysdep.h
>> +++ b/sysdeps/x86/sysdep.h
>> @@ -56,6 +56,14 @@
>>     | (1 << X86_XSTATE_ZMM_H_ID)        \
>>     | (1 << X86_XSTATE_ZMM_ID)          \
>>     | (1 << X86_XSTATE_APX_F_ID))
>> +
>> +/* AMX state mask.  */
>> +# define AMX_STATE_SAVE_MASK           \
>> +  ((1 << X86_XSTATE_TILECFG_ID) | (1 << X86_XSTATE_TILEDATA_ID))
>> +
>> +/* States to be included in xsave_state_full_size.  */
>> +# define FULL_STATE_SAVE_MASK          \
>> +  (STATE_SAVE_MASK | AMX_STATE_SAVE_MASK)
>>  #else
>>  /* Offset for fxsave/xsave area used by _dl_tlsdesc_dynamic.  Since i386
>>     doesn't have red-zone, use 0 here.  */
>> @@ -68,13 +76,17 @@
>>     | (1 << X86_XSTATE_BNDREGS_ID)      \
>>     | (1 << X86_XSTATE_K_ID)            \
>>     | (1 << X86_XSTATE_ZMM_H_ID))
>> +
>> +/* States to be included in xsave_state_size.  */
>> +# define FULL_STATE_SAVE_MASK          STATE_SAVE_MASK
>>  #endif
>>
>>  /* States which should be saved for TLSDESC_CALL and TLS_DESC_CALL.
>> -   Compiler assumes that all registers, including x87 FPU stack registers,
>> -   are unchanged after CALL, except for EFLAGS and RAX/EAX.  */
>> +   Compiler assumes that all registers, including AMX and x87 FPU
>> +   stack registers, are unchanged after CALL, except for EFLAGS and
>> +   RAX/EAX.  */
>>  #define TLSDESC_CALL_STATE_SAVE_MASK   \
>> -  (STATE_SAVE_MASK | (1 << X86_XSTATE_X87_ID))
>> +  (FULL_STATE_SAVE_MASK | (1 << X86_XSTATE_X87_ID))
>>
>>  /* Constants for bits in __x86_string_control:  */
>>
>> diff --git a/sysdeps/x86_64/configure b/sysdeps/x86_64/configure
>> index 418cc4a9b8..04a534fa12 100755
>> --- a/sysdeps/x86_64/configure
>> +++ b/sysdeps/x86_64/configure
>> @@ -134,6 +134,34 @@ fi
>>  config_vars="$config_vars
>>  enable-cet = $enable_cet"
>>
>> +# Check if -mamx-tile works properly.
>> +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether -mamx-tile works properly" >&5
>> +printf %s "checking whether -mamx-tile works properly... " >&6; }
>> +if test ${libc_cv_x86_have_amx_tile+y}
>> +then :
>> +  printf %s "(cached) " >&6
>> +else $as_nop
>> +  cat > conftest.c <<EOF
>> +#include <x86intrin.h>
>> +EOF
>> +              libc_cv_x86_have_amx_tile=no
>> +              if { ac_try='${CC-cc} -E $CFLAGS -mamx-tile conftest.c > conftest.i'
>> +  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
>> +  (eval $ac_try) 2>&5
>> +  ac_status=$?
>> +  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
>> +  test $ac_status = 0; }; }; then
>> +                if grep -q __builtin_ia32_ldtilecfg conftest.i; then
>> +                  libc_cv_x86_have_amx_tile=yes
>> +                fi
>> +              fi
>> +              rm -rf conftest*
>> +fi
>> +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_x86_have_amx_tile" >&5
>> +printf "%s\n" "$libc_cv_x86_have_amx_tile" >&6; }
>> +config_vars="$config_vars
>> +have-mamx-tile = $libc_cv_x86_have_amx_tile"
>> +
>>  test -n "$critic_missing" && as_fn_error $? "
>>  *** $critic_missing" "$LINENO" 5
>>
>> diff --git a/sysdeps/x86_64/configure.ac b/sysdeps/x86_64/configure.ac
>> index d1f803c02e..c714c47351 100644
>> --- a/sysdeps/x86_64/configure.ac
>> +++ b/sysdeps/x86_64/configure.ac
>> @@ -61,5 +61,20 @@ elif test $enable_cet = permissive; then
>>  fi
>>  LIBC_CONFIG_VAR([enable-cet], [$enable_cet])
>>
>> +# Check if -mamx-tile works properly.
>> +AC_CACHE_CHECK(whether -mamx-tile works properly,
>> +              libc_cv_x86_have_amx_tile, [dnl
>> +cat > conftest.c <<EOF
>> +#include <x86intrin.h>
>> +EOF
>> +              libc_cv_x86_have_amx_tile=no
>> +              if AC_TRY_COMMAND(${CC-cc} -E $CFLAGS -mamx-tile conftest.c > conftest.i); then
>> +                if grep -q __builtin_ia32_ldtilecfg conftest.i; then
>> +                  libc_cv_x86_have_amx_tile=yes
>> +                fi
>> +              fi
>> +              rm -rf conftest*])
>> +LIBC_CONFIG_VAR([have-mamx-tile], [$libc_cv_x86_have_amx_tile])
>> +
>>  test -n "$critic_missing" && AC_MSG_ERROR([
>>  *** $critic_missing])
>> diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
>> index 0c2e8d5320..9f02cfc3eb 100644
>> --- a/sysdeps/x86_64/dl-tlsdesc-dynamic.h
>> +++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
>> @@ -99,7 +99,7 @@ _dl_tlsdesc_dynamic:
>>  # endif
>>  #else
>>         /* Allocate stack space of the required size to save the state.  */
>> -       sub     _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
>> +       sub     _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_FULL_SIZE_OFFSET(%rip), %RSP_LP
>>  #endif
>>         /* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9,
>>            r10 and r11.  */
>> --
>> 2.43.2
>>
diff mbox series

Patch

diff --git a/sysdeps/unix/sysv/linux/x86_64/Makefile b/sysdeps/unix/sysv/linux/x86_64/Makefile
index 7d1d205fa0..fcbffd81cb 100644
--- a/sysdeps/unix/sysv/linux/x86_64/Makefile
+++ b/sysdeps/unix/sysv/linux/x86_64/Makefile
@@ -66,6 +66,33 @@  $(objpfx)libx86-64-isa-level%.os: $(..)/sysdeps/unix/sysv/linux/x86_64/x86-64-is
 $(objpfx)libx86-64-isa-level.so: $(objpfx)libx86-64-isa-level-1.so
 	cp $< $@
 endif
+
+ifeq (yes,$(have-mamx-tile))
+tests += \
+  tst-gnu2-tls2-amx \
+# tests
+
+modules-names += \
+  tst-gnu2-tls2-amx-mod0 \
+  tst-gnu2-tls2-amx-mod1 \
+  tst-gnu2-tls2-amx-mod2 \
+# modules-names
+
+$(objpfx)tst-gnu2-tls2-amx: $(shared-thread-library)
+$(objpfx)tst-gnu2-tls2-amx.out: \
+  $(objpfx)tst-gnu2-tls2-amx-mod0.so \
+  $(objpfx)tst-gnu2-tls2-amx-mod1.so \
+  $(objpfx)tst-gnu2-tls2-amx-mod2.so
+$(objpfx)tst-gnu2-tls2-amx-mod0.so: $(libsupport)
+$(objpfx)tst-gnu2-tls2-amx-mod1.so: $(libsupport)
+$(objpfx)tst-gnu2-tls2-amx-mod2.so: $(libsupport)
+
+CFLAGS-tst-gnu2-tls2-amx.c += -mamx-tile
+CFLAGS-tst-gnu2-tls2-amx-mod0.c += -mamx-tile -mtls-dialect=gnu2
+CFLAGS-tst-gnu2-tls2-amx-mod1.c += -mamx-tile -mtls-dialect=gnu2
+CFLAGS-tst-gnu2-tls2-amx-mod2.c += -mamx-tile -mtls-dialect=gnu2
+endif
+
 endif # $(subdir) == elf
 
 ifneq ($(enable-cet),no)
diff --git a/sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h b/sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h
index 2f511321ad..ef4631bf4b 100644
--- a/sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h
+++ b/sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h
@@ -20,3 +20,8 @@ 
 # define ARCH_SHSTK_SHSTK		0x1
 # define ARCH_SHSTK_WRSS		0x2
 #endif
+
+#ifndef ARCH_GET_XCOMP_PERM
+# define ARCH_GET_XCOMP_PERM		0x1022
+# define ARCH_REQ_XCOMP_PERM		0x1023
+#endif
diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod0.c b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod0.c
new file mode 100644
index 0000000000..2e0c7b91b7
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod0.c
@@ -0,0 +1,2 @@ 
+#include "tst-gnu2-tls2-amx.h"
+#include <tst-gnu2-tls2mod0.c>
diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod1.c b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod1.c
new file mode 100644
index 0000000000..b8a8ccf1c1
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod1.c
@@ -0,0 +1,2 @@ 
+#include "tst-gnu2-tls2-amx.h"
+#include <tst-gnu2-tls2mod1.c>
diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod2.c b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod2.c
new file mode 100644
index 0000000000..cdf4a8f363
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod2.c
@@ -0,0 +1,2 @@ 
+#include "tst-gnu2-tls2-amx.h"
+#include <tst-gnu2-tls2mod2.c>
diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.c b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.c
new file mode 100644
index 0000000000..9efa2f3270
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.c
@@ -0,0 +1,86 @@ 
+/* Test TLSDESC relocation with AMX.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdbool.h>
+#include <asm/prctl.h>
+#include <support/check.h>
+#include "tst-gnu2-tls2-amx.h"
+
+extern int arch_prctl (int, ...);
+
+#define X86_XSTATE_TILECFG_ID	17
+#define X86_XSTATE_TILEDATA_ID	18
+
+/* Initialize tile config.  */
+__attribute__ ((noinline, noclone))
+static void
+init_tile_config (__tilecfg *tileinfo)
+{
+  int i;
+  tileinfo->palette_id = 1;
+  tileinfo->start_row = 0;
+
+  for (i = 0; i < 1; ++i)
+  {
+    tileinfo->colsb[i] = MAX_ROWS;
+    tileinfo->rows[i] = MAX_ROWS;
+  }
+
+  for (i = 1; i < 4; ++i)
+  {
+    tileinfo->colsb[i] = MAX_COLS;
+    tileinfo->rows[i] = MAX_ROWS;
+  }
+
+  _tile_loadconfig (tileinfo);
+}
+
+static bool
+enable_amx (void)
+{
+  uint64_t bitmask;
+  if (arch_prctl (ARCH_GET_XCOMP_PERM, &bitmask) != 0)
+    return false;
+
+  if ((bitmask & (1 << X86_XSTATE_TILECFG_ID)) == 0)
+    return false;
+
+  if (arch_prctl (ARCH_REQ_XCOMP_PERM, X86_XSTATE_TILEDATA_ID) != 0)
+    return false;
+
+  /* Load tile configuration.  */
+  __tilecfg tile_data = { 0 };
+  init_tile_config (&tile_data);
+
+  return true;
+}
+
+/* An architecture can define it to clobber caller-saved registers in
+   malloc below to verify that the implicit TLSDESC call won't change
+   caller-saved registers.  */
+static void
+clear_tile_register (void)
+{
+  _tile_zero (2);
+}
+
+#define MOD(i) "tst-gnu2-tls2-amx-mod" #i ".so"
+#define IS_SUPPORTED()	enable_amx ()
+#define PREPARE_MALLOC() clear_tile_register ()
+
+#include <elf/tst-gnu2-tls2.c>
diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.h b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.h
new file mode 100644
index 0000000000..1845a3caba
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.h
@@ -0,0 +1,63 @@ 
+/* Test TLSDESC relocation with AMX.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdint.h>
+#include <string.h>
+#include <x86intrin.h>
+#include <support/check.h>
+
+#define MAX_ROWS 16
+#define MAX_COLS 64
+#define MAX 1024
+#define STRIDE 64
+
+typedef struct __tile_config
+{
+  uint8_t palette_id;
+  uint8_t start_row;
+  uint8_t reserved_0[14];
+  uint16_t colsb[16];
+  uint8_t rows[16];
+} __tilecfg __attribute__ ((aligned (64)));
+
+/* Initialize int8_t buffer */
+static inline void
+init_buffer (int8_t *buf, int8_t value)
+{
+  int rows, colsb, i, j;
+  rows  = MAX_ROWS;
+  colsb = MAX_COLS;
+
+  for (i = 0; i < rows; i++)
+    for (j = 0; j < colsb; j++)
+      buf[i * colsb + j] = value;
+}
+
+#define BEFORE_TLSDESC_CALL()					\
+  int8_t src[MAX];						\
+  int8_t res[MAX];						\
+  /* Initialize src with data  */				\
+  init_buffer (src, 2);						\
+  /* Load tile rows from memory.  */				\
+  _tile_loadd (2, src, STRIDE);
+
+#define AFTER_TLSDESC_CALL()					\
+  /* Store the tile data to memory.  */				\
+  _tile_stored (2, res, STRIDE);				\
+  _tile_release ();						\
+  TEST_VERIFY_EXIT (memcmp (src, res, sizeof (res)) == 0);
diff --git a/sysdeps/x86/cpu-features-offsets.sym b/sysdeps/x86/cpu-features-offsets.sym
index 6a8fd29813..21fc88d651 100644
--- a/sysdeps/x86/cpu-features-offsets.sym
+++ b/sysdeps/x86/cpu-features-offsets.sym
@@ -3,3 +3,4 @@ 
 #include <ldsodefs.h>
 
 XSAVE_STATE_SIZE_OFFSET	offsetof (struct cpu_features, xsave_state_size)
+XSAVE_STATE_FULL_SIZE_OFFSET offsetof (struct cpu_features, xsave_state_full_size)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 835113b42f..d71e8d3d2e 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -307,6 +307,8 @@  update_active (struct cpu_features *cpu_features)
 	  __cpuid_count (0xd, 0, eax, ebx, ecx, edx);
 	  if (ebx != 0)
 	    {
+	      /* NB: On AMX capable processors, ebx always includes AMX
+		 states.  */
 	      unsigned int xsave_state_full_size
 		= ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64);
 
@@ -320,6 +322,11 @@  update_active (struct cpu_features *cpu_features)
 		{
 		  unsigned int xstate_comp_offsets[32];
 		  unsigned int xstate_comp_sizes[32];
+#ifdef __x86_64__
+		  unsigned int xstate_amx_comp_offsets[32];
+		  unsigned int xstate_amx_comp_sizes[32];
+		  unsigned int amx_ecx;
+#endif
 		  unsigned int i;
 
 		  xstate_comp_offsets[0] = 0;
@@ -327,16 +334,39 @@  update_active (struct cpu_features *cpu_features)
 		  xstate_comp_offsets[2] = 576;
 		  xstate_comp_sizes[0] = 160;
 		  xstate_comp_sizes[1] = 256;
+#ifdef __x86_64__
+		  xstate_amx_comp_offsets[0] = 0;
+		  xstate_amx_comp_offsets[1] = 160;
+		  xstate_amx_comp_offsets[2] = 576;
+		  xstate_amx_comp_sizes[0] = 160;
+		  xstate_amx_comp_sizes[1] = 256;
+#endif
 
 		  for (i = 2; i < 32; i++)
 		    {
-		      if ((STATE_SAVE_MASK & (1 << i)) != 0)
+		      if ((FULL_STATE_SAVE_MASK & (1 << i)) != 0)
 			{
 			  __cpuid_count (0xd, i, eax, ebx, ecx, edx);
-			  xstate_comp_sizes[i] = eax;
+#ifdef __x86_64__
+			  /* Include this in xsave_state_full_size.  */
+			  amx_ecx = ecx;
+			  xstate_amx_comp_sizes[i] = eax;
+			  if ((AMX_STATE_SAVE_MASK & (1 << i)) != 0)
+			    {
+			      /* Exclude this from xsave_state_size.  */
+			      ecx = 0;
+			      xstate_comp_sizes[i] = 0;
+			    }
+			  else
+#endif
+			    xstate_comp_sizes[i] = eax;
 			}
 		      else
 			{
+#ifdef __x86_64__
+			  amx_ecx = 0;
+			  xstate_amx_comp_sizes[i] = 0;
+#endif
 			  ecx = 0;
 			  xstate_comp_sizes[i] = 0;
 			}
@@ -349,6 +379,15 @@  update_active (struct cpu_features *cpu_features)
 			  if ((ecx & (1 << 1)) != 0)
 			    xstate_comp_offsets[i]
 			      = ALIGN_UP (xstate_comp_offsets[i], 64);
+#ifdef __x86_64__
+			  xstate_amx_comp_offsets[i]
+			    = (xstate_amx_comp_offsets[i - 1]
+			       + xstate_amx_comp_sizes[i - 1]);
+			  if ((amx_ecx & (1 << 1)) != 0)
+			    xstate_amx_comp_offsets[i]
+			      = ALIGN_UP (xstate_amx_comp_offsets[i],
+					  64);
+#endif
 			}
 		    }
 
@@ -357,6 +396,18 @@  update_active (struct cpu_features *cpu_features)
 		    = xstate_comp_offsets[31] + xstate_comp_sizes[31];
 		  if (size)
 		    {
+#ifdef __x86_64__
+		      unsigned int amx_size
+			= (xstate_amx_comp_offsets[31]
+			   + xstate_amx_comp_sizes[31]);
+		      amx_size = ALIGN_UP (amx_size + STATE_SAVE_OFFSET,
+					   64);
+		      /* Set xsave_state_full_size to the compact AMX
+			 state size for XSAVEC.  NB: xsave_state_full_size
+			 is only used in _dl_tlsdesc_dynamic_xsave and
+			 _dl_tlsdesc_dynamic_xsavec.  */
+		      cpu_features->xsave_state_full_size = amx_size;
+#endif
 		      cpu_features->xsave_state_size
 			= ALIGN_UP (size + STATE_SAVE_OFFSET, 64);
 		      CPU_FEATURE_SET (cpu_features, XSAVEC);
diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
index b9bf3115b6..cd7bd27cf3 100644
--- a/sysdeps/x86/include/cpu-features.h
+++ b/sysdeps/x86/include/cpu-features.h
@@ -934,6 +934,8 @@  struct cpu_features
   /* The full state size for XSAVE when XSAVEC is disabled by
 
      GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC
+
+     and the AMX state size when XSAVEC is available.
    */
   unsigned int xsave_state_full_size;
   /* Data cache size for use in memory and string routines, typically
diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
index 485cad9c02..db8e576e91 100644
--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
@@ -56,6 +56,14 @@ 
    | (1 << X86_XSTATE_ZMM_H_ID) 	\
    | (1 << X86_XSTATE_ZMM_ID)		\
    | (1 << X86_XSTATE_APX_F_ID))
+
+/* AMX state mask.  */
+# define AMX_STATE_SAVE_MASK		\
+  ((1 << X86_XSTATE_TILECFG_ID) | (1 << X86_XSTATE_TILEDATA_ID))
+
+/* States to be included in xsave_state_full_size.  */
+# define FULL_STATE_SAVE_MASK		\
+  (STATE_SAVE_MASK | AMX_STATE_SAVE_MASK)
 #else
 /* Offset for fxsave/xsave area used by _dl_tlsdesc_dynamic.  Since i386
    doesn't have red-zone, use 0 here.  */
@@ -68,13 +76,17 @@ 
    | (1 << X86_XSTATE_BNDREGS_ID)	\
    | (1 << X86_XSTATE_K_ID)		\
    | (1 << X86_XSTATE_ZMM_H_ID))
+
+/* States to be included in xsave_state_size.  */
+# define FULL_STATE_SAVE_MASK		STATE_SAVE_MASK
 #endif
 
 /* States which should be saved for TLSDESC_CALL and TLS_DESC_CALL.
-   Compiler assumes that all registers, including x87 FPU stack registers,
-   are unchanged after CALL, except for EFLAGS and RAX/EAX.  */
+   Compiler assumes that all registers, including AMX and x87 FPU
+   stack registers, are unchanged after CALL, except for EFLAGS and
+   RAX/EAX.  */
 #define TLSDESC_CALL_STATE_SAVE_MASK	\
-  (STATE_SAVE_MASK | (1 << X86_XSTATE_X87_ID))
+  (FULL_STATE_SAVE_MASK | (1 << X86_XSTATE_X87_ID))
 
 /* Constants for bits in __x86_string_control:  */
 
diff --git a/sysdeps/x86_64/configure b/sysdeps/x86_64/configure
index 418cc4a9b8..04a534fa12 100755
--- a/sysdeps/x86_64/configure
+++ b/sysdeps/x86_64/configure
@@ -134,6 +134,34 @@  fi
 config_vars="$config_vars
 enable-cet = $enable_cet"
 
+# Check if -mamx-tile works properly.
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether -mamx-tile works properly" >&5
+printf %s "checking whether -mamx-tile works properly... " >&6; }
+if test ${libc_cv_x86_have_amx_tile+y}
+then :
+  printf %s "(cached) " >&6
+else $as_nop
+  cat > conftest.c <<EOF
+#include <x86intrin.h>
+EOF
+	       libc_cv_x86_have_amx_tile=no
+	       if { ac_try='${CC-cc} -E $CFLAGS -mamx-tile conftest.c > conftest.i'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then
+		 if grep -q __builtin_ia32_ldtilecfg conftest.i; then
+		   libc_cv_x86_have_amx_tile=yes
+	         fi
+	       fi
+	       rm -rf conftest*
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_x86_have_amx_tile" >&5
+printf "%s\n" "$libc_cv_x86_have_amx_tile" >&6; }
+config_vars="$config_vars
+have-mamx-tile = $libc_cv_x86_have_amx_tile"
+
 test -n "$critic_missing" && as_fn_error $? "
 *** $critic_missing" "$LINENO" 5
 
diff --git a/sysdeps/x86_64/configure.ac b/sysdeps/x86_64/configure.ac
index d1f803c02e..c714c47351 100644
--- a/sysdeps/x86_64/configure.ac
+++ b/sysdeps/x86_64/configure.ac
@@ -61,5 +61,20 @@  elif test $enable_cet = permissive; then
 fi
 LIBC_CONFIG_VAR([enable-cet], [$enable_cet])
 
+# Check if -mamx-tile works properly.
+AC_CACHE_CHECK(whether -mamx-tile works properly,
+	       libc_cv_x86_have_amx_tile, [dnl
+cat > conftest.c <<EOF
+#include <x86intrin.h>
+EOF
+	       libc_cv_x86_have_amx_tile=no
+	       if AC_TRY_COMMAND(${CC-cc} -E $CFLAGS -mamx-tile conftest.c > conftest.i); then
+		 if grep -q __builtin_ia32_ldtilecfg conftest.i; then
+		   libc_cv_x86_have_amx_tile=yes
+	         fi
+	       fi
+	       rm -rf conftest*])
+LIBC_CONFIG_VAR([have-mamx-tile], [$libc_cv_x86_have_amx_tile])
+
 test -n "$critic_missing" && AC_MSG_ERROR([
 *** $critic_missing])
diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
index 0c2e8d5320..9f02cfc3eb 100644
--- a/sysdeps/x86_64/dl-tlsdesc-dynamic.h
+++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
@@ -99,7 +99,7 @@  _dl_tlsdesc_dynamic:
 # endif
 #else
 	/* Allocate stack space of the required size to save the state.  */
-	sub	_rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
+	sub	_rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_FULL_SIZE_OFFSET(%rip), %RSP_LP
 #endif
 	/* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9,
 	   r10 and r11.  */