[v1,09/27] x86/fpu: Update rodata usage in svml_s_atanhf4_core_sse4.S

Message ID 20221207085236.1424424-9-goldstein.w.n@gmail.com
State New
Series [v1,01/27] x86/fpu: Create helper file for common data macros

Commit Message

Noah Goldstein Dec. 7, 2022, 8:52 a.m. UTC
No changes to the logic, just changes to how the rodata is handled.

1. Define the rodata entries using the new macros so that they check
   that the offsets are correct.

2. Use common data where applicable.
---
 .../fpu/multiarch/svml_s_atanhf4_core_sse4.S  | 240 ++++++++----------
 1 file changed, 102 insertions(+), 138 deletions(-)
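
The new macros come from svml_s_common_sse4_rodata_offsets.h, which is
the authoritative source; the sketch below only illustrates the
offset-checking idea. It assumes LOCAL_DATA()/COMMON_DATA() expand to
"offset + table symbol" addressing, much like the removed ATANHF_DATA()
macro did, and that DATA_VEC() emits one 16-byte vector while verifying
that the #define'd offset matches the entry's actual position
(SKETCH_DATA_VEC and sketch_table are made-up names):

	/* Hypothetical, simplified stand-in for the real DATA_VEC.  */
	.macro SKETCH_DATA_VEC table, offset, value
	/* Fail the build if the table layout drifts away from the
	   #define'd offset.  */
	.if (. - \table) != \offset
	.error "rodata offset mismatch"
	.endif
	/* Broadcast the 32-bit constant across a 16-byte vector.  */
	.long	\value, \value, \value, \value
	.endm

	.section .rodata, "a"
	.align	16
sketch_table:
	/* Offset 0 matches the first entry.  */
	SKETCH_DATA_VEC sketch_table, 0, 0x3f800000
	/* Offset 16 is checked as well; a stale #define becomes a
	   build error instead of a silent wrong load.  */
	SKETCH_DATA_VEC sketch_table, 16, 0x7fffffff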

Patch

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
index 37200b3601..da5744506f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
@@ -30,46 +30,45 @@ 
  *
  */
 
-/* Offsets for data table __svml_satanh_data_internal_avx512. Ordered
-   by use in the function. On cold-starts this might help the
-   prefetcher. Possibly a better idea is to interleave start/end so
-   that the prefetcher is less likely to detect a stream and pull
-   irrelivant lines into cache.  */
-#define sOne				0
-#define SgnMask				16
-#define sTopMask12			32
-#define iBrkValue			48
-#define iOffExpoMask			64
-#define sPoly				80
-#define sLn2				208
-#define TinyRange			224
+#define LOCAL_DATA_NAME	__svml_satanh_data_internal
+#include "svml_s_common_sse4_rodata_offsets.h"
+
+/* Offsets for data table __svml_satanh_data_internal.  */
+#define _Poly_1	0
+#define _Poly_2	16
+#define _Poly_3	32
+#define _Poly_4	48
+#define _Poly_5	64
+#define _Poly_6	80
+#define _Poly_7	96
+#define _TinyRange	112
 
 #include <sysdep.h>
-#define ATANHF_DATA(x)			((x)+__svml_satanh_data_internal)
 
 	.section .text.sse4, "ax", @progbits
 ENTRY(_ZGVbN4v_atanhf_sse4)
 	movaps	%xmm0, %xmm5
 
-	/* Load constants including One = 1 */
-	movups	ATANHF_DATA(sOne)(%rip), %xmm4
+	/* Load constants including One = 1.  */
+	movups	COMMON_DATA(_OneF)(%rip), %xmm4
 	movaps	%xmm5, %xmm3
 
-	/* Strip off the sign, so treat X as positive until right at the end */
-	movups	ATANHF_DATA(SgnMask)(%rip), %xmm1
+	/* Strip off the sign, so treat X as positive until right at the
+	   end.  */
+	movups	COMMON_DATA(_AbsMask)(%rip), %xmm1
 	movaps	%xmm4, %xmm2
 	andps	%xmm1, %xmm0
 	movaps	%xmm4, %xmm10
-	movups	ATANHF_DATA(sTopMask12)(%rip), %xmm11
+	movups	COMMON_DATA(_Neg4096)(%rip), %xmm11
 	movaps	%xmm4, %xmm14
 	movaps	%xmm11, %xmm9
 
 
-	/*
-	 * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
-	 * the upper part UHi being <= 12 bits long. Then we have
-	 * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
-	 */
+	/* Compute V = 2 * X trivially, and UHi + ULo = 1 - X in two
+	   pieces, the upper part UHi being <= 12 bits long. Then we
+	   have:
+	    atanh(X) = 1/2 * log((1 + X) / (1 - X))
+	             = 1/2 * log1p(V / (UHi + ULo)).  */
 	movaps	%xmm0, %xmm6
 	mulps	%xmm5, %xmm3
 	subps	%xmm0, %xmm2
@@ -80,65 +79,61 @@  ENTRY(_ZGVbN4v_atanhf_sse4)
 	andps	%xmm2, %xmm9
 
 
-	/*
-	 * Check whether |X| < 1, in which case we use the main function.
-	 * Otherwise set the rangemask so that the callout will get used.
-	 * Note that this will also use the callout for NaNs since not(NaN < 1).
-	 */
+	/* Check whether |X| < 1, in which case we use the main
+	   function. Otherwise set the rangemask so that the callout
+	   will get used. Note that this will also use the callout for
+	   NaNs since not(NaN < 1).  */
 	rcpps	%xmm9, %xmm7
 	subps	%xmm9, %xmm2
 	andps	%xmm11, %xmm7
 
 
-	/*
-	 * Split V as well into upper 12 bits and lower part, so that we can get
-	 * a preliminary quotient estimate without rounding error.
-	 */
+	/* Split V as well into upper 12 bits and lower part, so that we
+	   can get a preliminary quotient estimate without rounding
+	   error.  */
 	andps	%xmm6, %xmm11
 	mulps	%xmm7, %xmm9
 	addps	%xmm2, %xmm10
 	subps	%xmm11, %xmm6
 
-	/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
+	/* Hence get initial quotient estimate:
+	   QHi + QLo = R * VHi + R * VLo.  */
 	mulps	%xmm7, %xmm11
 	mulps	%xmm7, %xmm10
 	subps	%xmm9, %xmm14
 	mulps	%xmm6, %xmm7
 	subps	%xmm10, %xmm14
 
-	/* Compute D = E + E^2 */
+	/* Compute D = E + E^2.  */
 	movaps	%xmm14, %xmm13
 	movaps	%xmm4, %xmm8
 	mulps	%xmm14, %xmm13
 
-	/* reduction: compute r,n */
-	movdqu	ATANHF_DATA(iBrkValue)(%rip), %xmm9
+	/* reduction: compute r,n.  */
+	movdqu	COMMON_DATA(_IBrkValue)(%rip), %xmm9
 	addps	%xmm13, %xmm14
 
-	/*
-	 * Compute R * (VHi + VLo) * (1 + E + E^2)
-	 * = R *  (VHi + VLo) * (1 + D)
-	 * = QHi + (QHi * D + QLo + QLo * D)
-	 */
+	/* Compute R * (VHi + VLo) * (1 + E + E^2)
+	   = R *  (VHi + VLo) * (1 + D)
+	   = QHi + (QHi * D + QLo + QLo * D).  */
 	movaps	%xmm14, %xmm2
 	mulps	%xmm7, %xmm14
 	mulps	%xmm11, %xmm2
 	addps	%xmm14, %xmm7
-	movdqu	ATANHF_DATA(iOffExpoMask)(%rip), %xmm12
+	movdqu	COMMON_DATA(_NotiOffExpoMask)(%rip), %xmm12
 	movaps	%xmm4, %xmm14
 
-	/* Record the sign for eventual reincorporation. */
+	/* Record the sign for eventual reincorporation.  */
 	addps	%xmm7, %xmm2
 
 
-	/*
-	 * Now finally accumulate the high and low parts of the
-	 * argument to log1p, H + L, with a final compensated summation.
-	 */
+	/* Now finally accumulate the high and low parts of the
+	   argument to log1p, H + L, with a final compensated summation.  */
 	movaps	%xmm2, %xmm6
 	andnps	%xmm5, %xmm1
 	movaps	%xmm4, %xmm7
-	/* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
+	/* Or the sign bit in with the tiny result to handle atanh(-0)
+	   correctly.  */
 	addps	%xmm11, %xmm6
 	maxps	%xmm6, %xmm7
 	minps	%xmm6, %xmm8
@@ -149,43 +144,43 @@  ENTRY(_ZGVbN4v_atanhf_sse4)
 	subps	%xmm10, %xmm7
 	psubd	%xmm9, %xmm10
 	addps	%xmm8, %xmm7
-	pand	%xmm10, %xmm12
+	pandn	%xmm10, %xmm12
 	psrad	$23, %xmm10
 	cvtdq2ps %xmm10, %xmm13
 	addps	%xmm7, %xmm2
 
-	/* final reconstruction */
+	/* final reconstruction.  */
 	pslld	$23, %xmm10
 	paddd	%xmm9, %xmm12
 	psubd	%xmm10, %xmm14
 
-	/* polynomial evaluation */
+	/* polynomial evaluation.  */
 	subps	%xmm4, %xmm12
 	mulps	%xmm14, %xmm2
-	movups	ATANHF_DATA(sPoly+0)(%rip), %xmm7
+	movups	LOCAL_DATA(_Poly_1)(%rip), %xmm7
 	addps	%xmm12, %xmm2
 	mulps	%xmm2, %xmm7
 
 
-	/* Finally, halve the result and reincorporate the sign */
-	addps	ATANHF_DATA(sPoly+16)(%rip), %xmm7
+	/* Finally, halve the result and reincorporate the sign.  */
+	addps	LOCAL_DATA(_Poly_2)(%rip), %xmm7
 	mulps	%xmm2, %xmm7
-	addps	ATANHF_DATA(sPoly+32)(%rip), %xmm7
+	addps	LOCAL_DATA(_Poly_3)(%rip), %xmm7
 	mulps	%xmm2, %xmm7
-	addps	ATANHF_DATA(sPoly+48)(%rip), %xmm7
+	addps	LOCAL_DATA(_Poly_4)(%rip), %xmm7
 	mulps	%xmm2, %xmm7
-	addps	ATANHF_DATA(sPoly+64)(%rip), %xmm7
+	addps	LOCAL_DATA(_Poly_5)(%rip), %xmm7
 	mulps	%xmm2, %xmm7
-	addps	ATANHF_DATA(sPoly+80)(%rip), %xmm7
+	addps	LOCAL_DATA(_Poly_6)(%rip), %xmm7
 	mulps	%xmm2, %xmm7
-	addps	ATANHF_DATA(sPoly+96)(%rip), %xmm7
+	addps	LOCAL_DATA(_Poly_7)(%rip), %xmm7
 	mulps	%xmm2, %xmm7
-	movaps	ATANHF_DATA(sPoly+112)(%rip), %xmm6
+	movaps	COMMON_DATA(_Neg5F)(%rip), %xmm6
 	addps	%xmm6, %xmm7
 	mulps	%xmm2, %xmm7
 	mulps	%xmm2, %xmm7
-	mulps	ATANHF_DATA(sLn2)(%rip), %xmm13
-	/* We can build `sHalf` with `sPoly & sOne`.  */
+	mulps	COMMON_DATA(_Ln2)(%rip), %xmm13
+	/* We can build `sHalf` with `_Neg5F & _OneF`.  */
 	andps	%xmm4, %xmm6
 	orps	%xmm1, %xmm3
 	xorps	%xmm6, %xmm1
@@ -197,7 +192,7 @@  ENTRY(_ZGVbN4v_atanhf_sse4)
 	/* Finish check of NaNs.  */
 	cmpleps	%xmm0, %xmm4
 	movmskps %xmm4, %edx
-	cmpltps	ATANHF_DATA(TinyRange)(%rip), %xmm0
+	cmpltps	LOCAL_DATA(_TinyRange)(%rip), %xmm0
 
 	andps	%xmm0, %xmm3
 	andnps	%xmm1, %xmm0
@@ -206,115 +201,84 @@  ENTRY(_ZGVbN4v_atanhf_sse4)
 	testl	%edx, %edx
 	/* Go to special inputs processing branch.  */
 	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx rbp r12 r13 r14 r15 xmm0
+
 	/* No registers to restore on fast path.  */
 	ret
 
 
 	/* Cold case. edx has 1s where there was a special value that
 	   needs to be handled by a atanhf call. Optimize for code size
-	   more so than speed here. */
+	   more so than speed here.  */
 L(SPECIAL_VALUES_BRANCH):
-	# LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm5
+
 	/* Stack coming in 16-byte aligned. Set 8-byte misaligned so on
-       call entry will be 16-byte aligned. */
+	   call entry will be 16-byte aligned.  */
 	subq	$56, %rsp
-	cfi_def_cfa_offset(64)
+	cfi_def_cfa_offset (64)
 	movups	%xmm0, 24(%rsp)
 	movups	%xmm5, 40(%rsp)
 
 	/* Use rbx/rbp for callee save registers as they get short
-       encoding for many instructions (as compared with r12/r13). */
+	   encoding for many instructions (as compared with r12/r13).  */
 	movq	%rbx, (%rsp)
-	cfi_offset(rbx, -64)
+	cfi_offset (rbx, -64)
 	movq	%rbp, 8(%rsp)
-	cfi_offset(rbp, -56)
-	/* edx has 1s where there was a special value that needs to be handled
-	   by a tanhf call.  */
+	cfi_offset (rbp, -56)
+	/* edx has 1s where there was a special value that needs to be
+	   handled by an atanhf call.  */
 	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	# LOE rbx rbp r12 r13 r14 r15
-	/* use rbp as index for special value that is saved across calls to
-	   tanhf. We technically don't need a callee save register here as offset
-	   to rsp is always [0, 12] so we can restore rsp by realigning to 64.
-	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
-	   in the loop.  */
+
+	/* Use rbp as index for the special value that is saved across
+	   calls to atanhf. We technically don't need a callee save
+	   register here as the offset to rsp is always [0, 12], so we
+	   can restore rsp by realigning to 64. Essentially the tradeoff
+	   is 1 extra save/restore vs 2 extra instructions in the loop.  */
 	xorl	%ebp, %ebp
 	bsfl	%ebx, %ebp
 
 	/* Scalar math fucntion call to process special input.  */
 	movss	40(%rsp, %rbp, 4), %xmm0
 	call	atanhf@PLT
-	/* No good way to avoid the store-forwarding fault this will cause on
-	   return. `lfence` avoids the SF fault but at greater cost as it
-	   serialized stack/callee save restoration.  */
+	/* No good way to avoid the store-forwarding fault this will
+	   cause on return. `lfence` avoids the SF fault but at greater
+	   cost as it serializes stack/callee save restoration.  */
 	movss	%xmm0, 24(%rsp, %rbp, 4)
 
 	leal	-1(%rbx), %eax
 	andl	%eax, %ebx
 	jnz	L(SPECIAL_VALUES_LOOP)
-	# LOE r12 r13 r14 r15
+
 	/* All results have been written to 24(%rsp).  */
 	movups	24(%rsp), %xmm0
 	movq	(%rsp), %rbx
-	cfi_restore(rbx)
+	cfi_restore (rbx)
 	movq	8(%rsp), %rbp
-	cfi_restore(rbp)
+	cfi_restore (rbp)
 	addq	$56, %rsp
-	cfi_def_cfa_offset(8)
+	cfi_def_cfa_offset (8)
 	ret
 END(_ZGVbN4v_atanhf_sse4)
 
-	.section .rodata, "a"
+	.section .rodata.sse4, "a"
 	.align	16
 
-#ifdef __svml_satanh_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct{
-	__declspec(align(16)) VUINT32 sOne[4][1];
-	__declspec(align(16)) VUINT32 SgnMask[4][1];
-	__declspec(align(16)) VUINT32 sTopMask12[4][1];
-	__declspec(align(16)) VUINT32 iBrkValue[4][1];
-	__declspec(align(16)) VUINT32 iOffExpoMask[4][1];
-	__declspec(align(16)) VUINT32 sPoly[8][4][1];
-	__declspec(align(16)) VUINT32 sLn2[4][1];
-	__declspec(align(16)) VUINT32 TinyRange[4][1];
-} __svml_satanh_data_internal;
-#endif
-
-__svml_satanh_data_internal:
-	/* sOne = SP 1.0 */
-	.align	16
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* SgnMask */
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
-	/* sTopMask12 */
-	.align	16
-	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
-	/* iBrkValue = SP 2/3 */
-	.align	16
-	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
-	/* iOffExpoMask = SP significand mask ==*/
-	.align	16
-	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
-
-	/* sPoly[] = SP polynomial */
-	.align	16
-	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
-	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
-	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
-	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
-	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
-	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
-	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
-	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
-
-	/* sLn2 = SP ln(2) */
-	.align	16
-	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
-	/* TinyRange */
-	.align	16
-	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
-	.align	16
-	.type	__svml_satanh_data_internal, @object
-	.size	__svml_satanh_data_internal, .-__svml_satanh_data_internal
+LOCAL_DATA_NAME:
+	/* _Poly[] = SP polynomial.  */
+	/* 1.3820238411426544189453125e-01 P7.  */
+	DATA_VEC (LOCAL_DATA_NAME, _Poly_1, 0x3e0d84ed)
+	/* -1.5122179687023162841796875e-01 P6.  */
+	DATA_VEC (LOCAL_DATA_NAME, _Poly_2, 0xbe1ad9e3)
+	/* 1.4042308926582336425781250e-01 P5.  */
+	DATA_VEC (LOCAL_DATA_NAME, _Poly_3, 0x3e0fcb12)
+	/* -1.6472326219081878662109375e-01 P4.  */
+	DATA_VEC (LOCAL_DATA_NAME, _Poly_4, 0xbe28ad37)
+	/* 2.0007920265197753906250000e-01 P3.  */
+	DATA_VEC (LOCAL_DATA_NAME, _Poly_5, 0x3e4ce190)
+	/* -2.5004237890243530273437500e-01 P2.  */
+	DATA_VEC (LOCAL_DATA_NAME, _Poly_6, 0xbe80058e)
+	/* 3.3333265781402587890625000e-01 P1.  */
+	DATA_VEC (LOCAL_DATA_NAME, _Poly_7, 0x3eaaaa94)
+	DATA_VEC (LOCAL_DATA_NAME, _TinyRange, 0x0C000000)
+	.type	LOCAL_DATA_NAME, @object
+	.size	LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
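
The shared rodata entries this file now uses map onto the local table
removed above as follows (values inferred from the deleted data; the
common SSE4 rodata file is authoritative):

	_OneF            replaces sOne           (0x3f800000, 1.0f)
	_AbsMask         replaces SgnMask        (0x7fffffff)
	_Neg4096         replaces sTopMask12     (0xfffff000, -4096 as an integer)
	_IBrkValue       replaces iBrkValue      (0x3f2aaaab, single-precision 2/3)
	_NotiOffExpoMask replaces ~iOffExpoMask  (0xff800000, hence pand becomes pandn)
	_Ln2             replaces sLn2           (0x3f317218)
	_Neg5F           replaces sPoly[7]       (0xbf000000, -0.5f)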