@@ -27,26 +27,36 @@
/* Area on stack to save and restore registers used for parameter
passing when calling _dl_fixup. */
#ifdef __ILP32__
-/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX. */
-# define REGISTER_SAVE_AREA (8 * 7)
-# define REGISTER_SAVE_RAX 0
+/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as XMM0 to
+ XMM7. */
+# define REGISTER_SAVE_AREA (8 * 7 + 16 * 8)
+/* Align XMM register save area to 16 bytes. */
+# define REGISTER_SAVE_XMM0 0
# define PRESERVE_BND_REGS_PREFIX
#else
-/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as BND0,
- BND1, BND2, BND3. */
-# define REGISTER_SAVE_AREA (8 * 7 + 16 * 4)
+/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as
+ BND0, BND1, BND2, BND3 and XMM0 to XMM7. */
+# define REGISTER_SAVE_AREA (8 * 7 + 16 * 4 + 16 * 8)
/* Align bound register save area to 16 bytes. */
# define REGISTER_SAVE_BND0 0
# define REGISTER_SAVE_BND1 (REGISTER_SAVE_BND0 + 16)
# define REGISTER_SAVE_BND2 (REGISTER_SAVE_BND1 + 16)
# define REGISTER_SAVE_BND3 (REGISTER_SAVE_BND2 + 16)
-# define REGISTER_SAVE_RAX (REGISTER_SAVE_BND3 + 16)
+# define REGISTER_SAVE_XMM0 (REGISTER_SAVE_BND3 + 16)
# ifdef HAVE_MPX_SUPPORT
# define PRESERVE_BND_REGS_PREFIX bnd
# else
# define PRESERVE_BND_REGS_PREFIX .byte 0xf2
# endif
#endif
+#define REGISTER_SAVE_XMM1 (REGISTER_SAVE_XMM0 + 16)
+#define REGISTER_SAVE_XMM2 (REGISTER_SAVE_XMM1 + 16)
+#define REGISTER_SAVE_XMM3 (REGISTER_SAVE_XMM2 + 16)
+#define REGISTER_SAVE_XMM4 (REGISTER_SAVE_XMM3 + 16)
+#define REGISTER_SAVE_XMM5 (REGISTER_SAVE_XMM4 + 16)
+#define REGISTER_SAVE_XMM6 (REGISTER_SAVE_XMM5 + 16)
+#define REGISTER_SAVE_XMM7 (REGISTER_SAVE_XMM6 + 16)
+#define REGISTER_SAVE_RAX (REGISTER_SAVE_XMM7 + 16)
#define REGISTER_SAVE_RCX (REGISTER_SAVE_RAX + 8)
#define REGISTER_SAVE_RDX (REGISTER_SAVE_RCX + 8)
#define REGISTER_SAVE_RSI (REGISTER_SAVE_RDX + 8)
@@ -54,6 +64,10 @@
#define REGISTER_SAVE_R8 (REGISTER_SAVE_RDI + 8)
#define REGISTER_SAVE_R9 (REGISTER_SAVE_R8 + 8)
+#if (REGISTER_SAVE_AREA % 16) != 8
+# error REGISTER_SAVE_AREA must be an odd multiple of 8
+#endif
+
.text
.globl _dl_runtime_resolve
.type _dl_runtime_resolve, @function
@@ -71,6 +85,20 @@ _dl_runtime_resolve:
movq %rdi, REGISTER_SAVE_RDI(%rsp)
movq %r8, REGISTER_SAVE_R8(%rsp)
movq %r9, REGISTER_SAVE_R9(%rsp)
+ # movaps is 1-byte shorter.
+	# We use movups instead of movaps since __tls_get_addr may be
+ # called with misaligned stack:
+ # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
+ # On Intel processors since 2011, movups is as fast as movaps
+	# on aligned addresses.
+ movups %xmm0, REGISTER_SAVE_XMM0(%rsp)
+ movups %xmm1, REGISTER_SAVE_XMM1(%rsp)
+ movups %xmm2, REGISTER_SAVE_XMM2(%rsp)
+ movups %xmm3, REGISTER_SAVE_XMM3(%rsp)
+ movups %xmm4, REGISTER_SAVE_XMM4(%rsp)
+ movups %xmm5, REGISTER_SAVE_XMM5(%rsp)
+ movups %xmm6, REGISTER_SAVE_XMM6(%rsp)
+ movups %xmm7, REGISTER_SAVE_XMM7(%rsp)
#ifndef __ILP32__
# We also have to preserve bound registers. These are nops if
# Intel MPX isn't available or disabled.
@@ -123,6 +151,15 @@ _dl_runtime_resolve:
movq REGISTER_SAVE_RDX(%rsp), %rdx
movq REGISTER_SAVE_RCX(%rsp), %rcx
movq REGISTER_SAVE_RAX(%rsp), %rax
+ # movaps is 1-byte shorter.
+ movups REGISTER_SAVE_XMM0(%rsp), %xmm0
+ movups REGISTER_SAVE_XMM1(%rsp), %xmm1
+ movups REGISTER_SAVE_XMM2(%rsp), %xmm2
+ movups REGISTER_SAVE_XMM3(%rsp), %xmm3
+ movups REGISTER_SAVE_XMM4(%rsp), %xmm4
+ movups REGISTER_SAVE_XMM5(%rsp), %xmm5
+ movups REGISTER_SAVE_XMM6(%rsp), %xmm6
+ movups REGISTER_SAVE_XMM7(%rsp), %xmm7
# Adjust stack(PLT did 2 pushes)
addq $(REGISTER_SAVE_AREA + 16), %rsp
cfi_adjust_cfa_offset(-(REGISTER_SAVE_AREA + 16))