| Message ID | 20210821163631.138482-1-hjl.tools@gmail.com |
|---|---|
| State | New |
| Headers | show |
| Series | x86-64: Optimize load of all bits set into ZMM register [BZ #28252] (expand) |
On Sat, Aug 21, 2021 at 12:36 PM H.J. Lu via Libc-alpha < libc-alpha@sourceware.org> wrote: > Optimize loads of all bits set into ZMM register in AVX512 SVML codes > by replacing > > vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX > > and > > vmovups .L_2il0floatpacket.13(%rip), %zmmX > > with > vpternlogd $0xff, %zmmX, %zmmX, %zmmX > > This fixes BZ #28252. > --- > .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S | 7 +------ > .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S | 7 +------ > .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S | 7 +------ > .../fpu/multiarch/svml_d_sincos8_core_avx512.S | 7 +------ > .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S | 7 +------ > .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S | 7 +------ > .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S | 7 +------ > .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++---------- > .../fpu/multiarch/svml_s_sincosf16_core_avx512.S | 7 +------ > .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S | 7 +------ > 10 files changed, 11 insertions(+), 64 deletions(-) > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S > index c2cf007904..0fcb912557 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S > @@ -258,7 +258,7 @@ ENTRY (_ZGVeN8v_cos_skx) > vmovaps %zmm0, %zmm8 > > /* Check for large arguments path */ > - vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2 > + vpternlogd $0xff, %zmm2, %zmm2, %zmm2 > Looking at the code it seems like this is used later by vpandnq %zmm1, %zmm1, %zmm2{%k1} AFAICT you can make the vpternlogd down there and just use vpternlogq $0xff, %zmm1, %zmm1, %zmm2{%k1}{z} > > /* > ARGUMENT RANGE REDUCTION: > @@ -448,8 +448,3 @@ ENTRY (_ZGVeN8v_cos_skx) > vmovsd %xmm0, 1216(%rsp,%r15) > jmp .LBL_2_7 > END (_ZGVeN8v_cos_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.16: > - .long 
0xffffffff,0xffffffff > - .type .L_2il0floatpacket.16,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S > index e9a5d00992..5596c950ce 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S > @@ -267,7 +267,7 @@ ENTRY (_ZGVeN8v_log_skx) > > /* preserve mantissa, set input exponent to 2^(-10) */ > vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2 > - vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1 > + vpternlogd $0xff, %zmm1, %zmm1, %zmm1 > Earlier in the function there is a dependency breaking kxnorw %k3, %k3, %k3 so I think you can accomplish the same thing but breaking some unlucky dep chain with: vpmovm2d %k3, %zmm2 vpsrlq $32, %zmm4, %zmm6 > > /* reciprocal approximation good to at least 11 bits */ > @@ -453,8 +453,3 @@ ENTRY (_ZGVeN8v_log_skx) > vmovsd %xmm0, 1216(%rsp,%r15) > jmp .LBL_2_7 > END (_ZGVeN8v_log_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.12: > - .long 0xffffffff,0xffffffff > - .type .L_2il0floatpacket.12,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S > index 508da563fe..2981f1582e 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S > @@ -254,7 +254,7 @@ ENTRY (_ZGVeN8v_sin_skx) > andq $-64, %rsp > subq $1280, %rsp > movq __svml_d_trig_data@GOTPCREL(%rip), %rax > - vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14 > + vpternlogd $0xff, %zmm1, %zmm1, %zmm14 > This one also seems to just be used by an vpandn later on: vpandnq %zmm13, %zmm13, %zmm14{%k1} so maybe: vpternlogq $0xff, %zmm13, %zmm13, %zmm14{%k1}{z} instead of the vpandn. 
vmovups __dAbsMask(%rax), %zmm7 > vmovups __dInvPI(%rax), %zmm2 > vmovups __dRShifter(%rax), %zmm1 > @@ -450,8 +450,3 @@ ENTRY (_ZGVeN8v_sin_skx) > vmovsd %xmm0, 1216(%rsp,%r15) > jmp .LBL_2_7 > END (_ZGVeN8v_sin_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.14: > - .long 0xffffffff,0xffffffff > - .type .L_2il0floatpacket.14,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S > index 965415f2bd..4ad366373b 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S > @@ -423,7 +423,7 @@ ENTRY (_ZGVeN8vl8l8_sincos_skx) > > /* SinPoly = SinR*SinPoly */ > vfmadd213pd %zmm5, %zmm5, %zmm4 > - vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3 > + vpternlogd $0xff, %zmm3, %zmm3, %zmm3 > Also vpandn below: vpandnq %zmm7, %zmm7, %zmm3{%k1} > /* Update Cos result's sign */ > vxorpd %zmm2, %zmm1, %zmm1 > @@ -733,8 +733,3 @@ END (_ZGVeN8vvv_sincos_knl) > ENTRY (_ZGVeN8vvv_sincos_skx) > WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx > END (_ZGVeN8vvv_sincos_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.15: > - .long 0xffffffff,0xffffffff > - .type .L_2il0floatpacket.15,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S > index cdcb16087d..b7d79efb54 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S > @@ -271,7 +271,7 @@ ENTRY (_ZGVeN16v_cosf_skx) > X = X - Y*PI1 - Y*PI2 - Y*PI3 > */ > vmovaps %zmm0, %zmm6 > - vmovups .L_2il0floatpacket.13(%rip), %zmm12 > + vpternlogd $0xff, %zmm12, %zmm12, %zmm12 > Also vpandn below: vpandnd %zmm1, %zmm1, %zmm12{%k1} > vmovups __sRShifter(%rax), %zmm3 > vmovups __sPI1_FMA(%rax), %zmm5 > vmovups __sA9_FMA(%rax), %zmm9 > @@ -445,8 +445,3 @@ ENTRY (_ZGVeN16v_cosf_skx) > vmovss %xmm0, 
1216(%rsp,%r15,8) > jmp .LBL_2_7 > END (_ZGVeN16v_cosf_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.13: > - .long > 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > - .type .L_2il0floatpacket.13,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S > index 1b09909344..9f03b9b780 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S > @@ -257,7 +257,7 @@ ENTRY (_ZGVeN16v_expf_skx) > vmovaps %zmm0, %zmm7 > > /* compare against threshold */ > - vmovups .L_2il0floatpacket.13(%rip), %zmm3 > + vpternlogd $0xff, %zmm3, %zmm3, %zmm3 > Also below: vpandnd %zmm2, %zmm2, %zmm3{%k1} > vmovups __sInvLn2(%rax), %zmm4 > vmovups __sShifter(%rax), %zmm1 > vmovups __sLn2hi(%rax), %zmm6 > @@ -432,8 +432,3 @@ ENTRY (_ZGVeN16v_expf_skx) > jmp .LBL_2_7 > > END (_ZGVeN16v_expf_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.13: > - .long > 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > - .type .L_2il0floatpacket.13,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S > index 4a7b2adbbf..2ba38b0f33 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S > @@ -228,7 +228,7 @@ ENTRY (_ZGVeN16v_logf_skx) > andq $-64, %rsp > subq $1280, %rsp > movq __svml_slog_data@GOTPCREL(%rip), %rax > - vmovups .L_2il0floatpacket.7(%rip), %zmm6 > + vpternlogd $0xff, %zmm6, %zmm6, %zmm6 > Also below: vpandnd %zmm1, %zmm1, %zmm6{%k1} > vmovups _iBrkValue(%rax), %zmm4 > vmovups _sPoly_7(%rax), %zmm8 > 
> @@ -401,8 +401,3 @@ ENTRY (_ZGVeN16v_logf_skx) > jmp .LBL_2_7 > > END (_ZGVeN16v_logf_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.7: > - .long > 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > - .type .L_2il0floatpacket.7,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S > index 7f906622a5..7f0272c809 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S > @@ -378,7 +378,7 @@ ENTRY (_ZGVeN16vv_powf_skx) > vpsrlq $32, %zmm3, %zmm2 > vpmovqd %zmm2, %ymm11 > vcvtps2pd %ymm14, %zmm13 > - vmovups .L_2il0floatpacket.23(%rip), %zmm14 > + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 > earlier kxnorw %k3, %k3, %k3 can be used to get a dependency break with this: vpmovm2d %k3, %zmm14 > vmovaps %zmm14, %zmm26 > vpandd _ABSMASK(%rax), %zmm1, %zmm8 > vpcmpd $1, _INF(%rax), %zmm8, %k2 > @@ -420,7 +420,7 @@ ENTRY (_ZGVeN16vv_powf_skx) > vpmovqd %zmm11, %ymm5 > vpxord %zmm10, %zmm10, %zmm10 > vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3} > - vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4 > + vpternlogd $0xff, %zmm4, %zmm4, %zmm4 > vpxord %zmm11, %zmm11, %zmm11 > vcvtdq2pd %ymm7, %zmm7 > vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1} > @@ -635,11 +635,3 @@ ENTRY (_ZGVeN16vv_powf_skx) > vmovss %xmm0, 1216(%rsp,%r15,8) > jmp .LBL_2_7 > END (_ZGVeN16vv_powf_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.23: > - .long > 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > - .type .L_2il0floatpacket.23,@object > -.L_2il0floatpacket.24: > - .long 0xffffffff,0xffffffff > - .type .L_2il0floatpacket.24,@object > diff --git 
a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S > index 54cee3a537..e1d0154441 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S > @@ -310,7 +310,7 @@ ENTRY (_ZGVeN16vl4l4_sincosf_skx) > > /* Result sign calculations */ > vpternlogd $150, %zmm0, %zmm14, %zmm1 > - vmovups .L_2il0floatpacket.13(%rip), %zmm14 > + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 > > /* Add correction term 0.5 for cos() part */ > vaddps %zmm8, %zmm5, %zmm15 > @@ -740,8 +740,3 @@ END (_ZGVeN16vvv_sincosf_knl) > ENTRY (_ZGVeN16vvv_sincosf_skx) > WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx > END (_ZGVeN16vvv_sincosf_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.13: > - .long > 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > - .type .L_2il0floatpacket.13,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S > index ec65ffdce5..bcb76ff756 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S > @@ -273,7 +273,7 @@ ENTRY (_ZGVeN16v_sinf_skx) > movq __svml_s_trig_data@GOTPCREL(%rip), %rax > > /* Check for large and special values */ > - vmovups .L_2il0floatpacket.11(%rip), %zmm14 > + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 > Also below: vpandnd %zmm2, %zmm2, %zmm14{%k1} > vmovups __sAbsMask(%rax), %zmm5 > vmovups __sInvPI(%rax), %zmm1 > vmovups __sRShifter(%rax), %zmm2 > @@ -464,8 +464,3 @@ ENTRY (_ZGVeN16v_sinf_skx) > vmovss %xmm0, 1216(%rsp,%r15,8) > jmp .LBL_2_7 > END (_ZGVeN16v_sinf_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.11: > - .long > 
0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > - .type .L_2il0floatpacket.11,@object > -- > 2.31.1 > >
On Sat, Aug 21, 2021 at 10:49 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > On Sat, Aug 21, 2021 at 12:36 PM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote: >> >> Optimize loads of all bits set into ZMM register in AVX512 SVML codes >> by replacing >> >> vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX >> >> and >> >> vmovups .L_2il0floatpacket.13(%rip), %zmmX >> >> with >> vpternlogd $0xff, %zmmX, %zmmX, %zmmX >> >> This fixes BZ #28252. >> --- >> .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S | 7 +------ >> .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S | 7 +------ >> .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S | 7 +------ >> .../fpu/multiarch/svml_d_sincos8_core_avx512.S | 7 +------ >> .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S | 7 +------ >> .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S | 7 +------ >> .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S | 7 +------ >> .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++---------- >> .../fpu/multiarch/svml_s_sincosf16_core_avx512.S | 7 +------ >> .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S | 7 +------ >> 10 files changed, 11 insertions(+), 64 deletions(-) >> >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S >> index c2cf007904..0fcb912557 100644 >> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S >> @@ -258,7 +258,7 @@ ENTRY (_ZGVeN8v_cos_skx) >> vmovaps %zmm0, %zmm8 >> >> /* Check for large arguments path */ >> - vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2 >> + vpternlogd $0xff, %zmm2, %zmm2, %zmm2 > > Looking at the code it seems like this is used later by > > vpandnq %zmm1, %zmm1, %zmm2{%k1} > > AFAICT you can make the vpternlogd down there and just use > > vpternlogq $0xff, %zmm1, %zmm1, %zmm2{%k1}{z} >> >> >> /* >> ARGUMENT RANGE REDUCTION: >> @@ -448,8 +448,3 @@ ENTRY 
(_ZGVeN8v_cos_skx) >> vmovsd %xmm0, 1216(%rsp,%r15) >> jmp .LBL_2_7 >> END (_ZGVeN8v_cos_skx) >> - >> - .section .rodata, "a" >> -.L_2il0floatpacket.16: >> - .long 0xffffffff,0xffffffff >> - .type .L_2il0floatpacket.16,@object >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S >> index e9a5d00992..5596c950ce 100644 >> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S >> @@ -267,7 +267,7 @@ ENTRY (_ZGVeN8v_log_skx) >> >> /* preserve mantissa, set input exponent to 2^(-10) */ >> vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2 >> - vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1 >> + vpternlogd $0xff, %zmm1, %zmm1, %zmm1 > > > Earlier in the function there is a dependency breaking > > kxnorw %k3, %k3, %k3 > > so I think you can accomplish the same thing but breaking > some unlucky dep chain with: > > vpmovm2d %k3, %zmm2 > >> vpsrlq $32, %zmm4, %zmm6 >> >> /* reciprocal approximation good to at least 11 bits */ >> @@ -453,8 +453,3 @@ ENTRY (_ZGVeN8v_log_skx) >> vmovsd %xmm0, 1216(%rsp,%r15) >> jmp .LBL_2_7 >> END (_ZGVeN8v_log_skx) >> - >> - .section .rodata, "a" >> -.L_2il0floatpacket.12: >> - .long 0xffffffff,0xffffffff >> - .type .L_2il0floatpacket.12,@object >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S >> index 508da563fe..2981f1582e 100644 >> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S >> @@ -254,7 +254,7 @@ ENTRY (_ZGVeN8v_sin_skx) >> andq $-64, %rsp >> subq $1280, %rsp >> movq __svml_d_trig_data@GOTPCREL(%rip), %rax >> - vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14 >> + vpternlogd $0xff, %zmm1, %zmm1, %zmm14 > > This one also seems to just be used by an vpandn later on: > > vpandnq %zmm13, %zmm13, %zmm14{%k1} > > so maybe: > vpternlogq $0xff, %zmm13, 
%zmm13, %zmm14{%k1}{z} > instead of the vpandn. > >> vmovups __dAbsMask(%rax), %zmm7 >> vmovups __dInvPI(%rax), %zmm2 >> vmovups __dRShifter(%rax), %zmm1 >> @@ -450,8 +450,3 @@ ENTRY (_ZGVeN8v_sin_skx) >> vmovsd %xmm0, 1216(%rsp,%r15) >> jmp .LBL_2_7 >> END (_ZGVeN8v_sin_skx) >> - >> - .section .rodata, "a" >> -.L_2il0floatpacket.14: >> - .long 0xffffffff,0xffffffff >> - .type .L_2il0floatpacket.14,@object >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S >> index 965415f2bd..4ad366373b 100644 >> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S >> @@ -423,7 +423,7 @@ ENTRY (_ZGVeN8vl8l8_sincos_skx) >> >> /* SinPoly = SinR*SinPoly */ >> vfmadd213pd %zmm5, %zmm5, %zmm4 >> - vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3 >> + vpternlogd $0xff, %zmm3, %zmm3, %zmm3 > > > Also vpandn below: > vpandnq %zmm7, %zmm7, %zmm3{%k1} > >> >> /* Update Cos result's sign */ >> vxorpd %zmm2, %zmm1, %zmm1 >> @@ -733,8 +733,3 @@ END (_ZGVeN8vvv_sincos_knl) >> ENTRY (_ZGVeN8vvv_sincos_skx) >> WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx >> END (_ZGVeN8vvv_sincos_skx) >> - >> - .section .rodata, "a" >> -.L_2il0floatpacket.15: >> - .long 0xffffffff,0xffffffff >> - .type .L_2il0floatpacket.15,@object >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S >> index cdcb16087d..b7d79efb54 100644 >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S >> @@ -271,7 +271,7 @@ ENTRY (_ZGVeN16v_cosf_skx) >> X = X - Y*PI1 - Y*PI2 - Y*PI3 >> */ >> vmovaps %zmm0, %zmm6 >> - vmovups .L_2il0floatpacket.13(%rip), %zmm12 >> + vpternlogd $0xff, %zmm12, %zmm12, %zmm12 > > > Also vpandn below: > vpandnd %zmm1, %zmm1, %zmm12{%k1} > >> >> vmovups __sRShifter(%rax), %zmm3 >> vmovups 
__sPI1_FMA(%rax), %zmm5 >> vmovups __sA9_FMA(%rax), %zmm9 >> @@ -445,8 +445,3 @@ ENTRY (_ZGVeN16v_cosf_skx) >> vmovss %xmm0, 1216(%rsp,%r15,8) >> jmp .LBL_2_7 >> END (_ZGVeN16v_cosf_skx) >> - >> - .section .rodata, "a" >> -.L_2il0floatpacket.13: >> - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff >> - .type .L_2il0floatpacket.13,@object >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S >> index 1b09909344..9f03b9b780 100644 >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S >> @@ -257,7 +257,7 @@ ENTRY (_ZGVeN16v_expf_skx) >> vmovaps %zmm0, %zmm7 >> >> /* compare against threshold */ >> - vmovups .L_2il0floatpacket.13(%rip), %zmm3 >> + vpternlogd $0xff, %zmm3, %zmm3, %zmm3 > > > Also below: > vpandnd %zmm2, %zmm2, %zmm3{%k1} >> >> vmovups __sInvLn2(%rax), %zmm4 >> vmovups __sShifter(%rax), %zmm1 >> vmovups __sLn2hi(%rax), %zmm6 >> @@ -432,8 +432,3 @@ ENTRY (_ZGVeN16v_expf_skx) >> jmp .LBL_2_7 >> >> END (_ZGVeN16v_expf_skx) >> - >> - .section .rodata, "a" >> -.L_2il0floatpacket.13: >> - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff >> - .type .L_2il0floatpacket.13,@object >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S >> index 4a7b2adbbf..2ba38b0f33 100644 >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S >> @@ -228,7 +228,7 @@ ENTRY (_ZGVeN16v_logf_skx) >> andq $-64, %rsp >> subq $1280, %rsp >> movq __svml_slog_data@GOTPCREL(%rip), %rax >> - vmovups 
.L_2il0floatpacket.7(%rip), %zmm6 >> + vpternlogd $0xff, %zmm6, %zmm6, %zmm6 > > Also below: > vpandnd %zmm1, %zmm1, %zmm6{%k1} > >> >> vmovups _iBrkValue(%rax), %zmm4 >> vmovups _sPoly_7(%rax), %zmm8 >> >> @@ -401,8 +401,3 @@ ENTRY (_ZGVeN16v_logf_skx) >> jmp .LBL_2_7 >> >> END (_ZGVeN16v_logf_skx) >> - >> - .section .rodata, "a" >> -.L_2il0floatpacket.7: >> - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff >> - .type .L_2il0floatpacket.7,@object >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S >> index 7f906622a5..7f0272c809 100644 >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S >> @@ -378,7 +378,7 @@ ENTRY (_ZGVeN16vv_powf_skx) >> vpsrlq $32, %zmm3, %zmm2 >> vpmovqd %zmm2, %ymm11 >> vcvtps2pd %ymm14, %zmm13 >> - vmovups .L_2il0floatpacket.23(%rip), %zmm14 >> + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 > > earlier > kxnorw %k3, %k3, %k3 > can be used to get a dependency break with this: > vpmovm2d %k3, %zmm14 The SVM codes can use some improvements. Can you open a separate glibc bug? I'd like to address only all 1s load here to avoid more complexity. Thanks. 
>> vmovaps %zmm14, %zmm26 >> vpandd _ABSMASK(%rax), %zmm1, %zmm8 >> vpcmpd $1, _INF(%rax), %zmm8, %k2 >> @@ -420,7 +420,7 @@ ENTRY (_ZGVeN16vv_powf_skx) >> vpmovqd %zmm11, %ymm5 >> vpxord %zmm10, %zmm10, %zmm10 >> vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3} >> - vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4 >> + vpternlogd $0xff, %zmm4, %zmm4, %zmm4 >> vpxord %zmm11, %zmm11, %zmm11 >> vcvtdq2pd %ymm7, %zmm7 >> vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1} >> @@ -635,11 +635,3 @@ ENTRY (_ZGVeN16vv_powf_skx) >> vmovss %xmm0, 1216(%rsp,%r15,8) >> jmp .LBL_2_7 >> END (_ZGVeN16vv_powf_skx) >> - >> - .section .rodata, "a" >> -.L_2il0floatpacket.23: >> - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff >> - .type .L_2il0floatpacket.23,@object >> -.L_2il0floatpacket.24: >> - .long 0xffffffff,0xffffffff >> - .type .L_2il0floatpacket.24,@object >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S >> index 54cee3a537..e1d0154441 100644 >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S >> @@ -310,7 +310,7 @@ ENTRY (_ZGVeN16vl4l4_sincosf_skx) >> >> /* Result sign calculations */ >> vpternlogd $150, %zmm0, %zmm14, %zmm1 >> - vmovups .L_2il0floatpacket.13(%rip), %zmm14 >> + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 >> >> /* Add correction term 0.5 for cos() part */ >> vaddps %zmm8, %zmm5, %zmm15 >> @@ -740,8 +740,3 @@ END (_ZGVeN16vvv_sincosf_knl) >> ENTRY (_ZGVeN16vvv_sincosf_skx) >> WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx >> END (_ZGVeN16vvv_sincosf_skx) >> - >> - .section .rodata, "a" >> -.L_2il0floatpacket.13: >> - .long 
0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff >> - .type .L_2il0floatpacket.13,@object >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S >> index ec65ffdce5..bcb76ff756 100644 >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S >> @@ -273,7 +273,7 @@ ENTRY (_ZGVeN16v_sinf_skx) >> movq __svml_s_trig_data@GOTPCREL(%rip), %rax >> >> /* Check for large and special values */ >> - vmovups .L_2il0floatpacket.11(%rip), %zmm14 >> + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 > > Also below: > vpandnd %zmm2, %zmm2, %zmm14{%k1} >> >> vmovups __sAbsMask(%rax), %zmm5 >> vmovups __sInvPI(%rax), %zmm1 >> vmovups __sRShifter(%rax), %zmm2 >> @@ -464,8 +464,3 @@ ENTRY (_ZGVeN16v_sinf_skx) >> vmovss %xmm0, 1216(%rsp,%r15,8) >> jmp .LBL_2_7 >> END (_ZGVeN16v_sinf_skx) >> - >> - .section .rodata, "a" >> -.L_2il0floatpacket.11: >> - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff >> - .type .L_2il0floatpacket.11,@object >> -- >> 2.31.1 >>
On Sat, Aug 21, 2021 at 2:09 PM H.J. Lu <hjl.tools@gmail.com> wrote: > On Sat, Aug 21, 2021 at 10:49 AM Noah Goldstein <goldstein.w.n@gmail.com> > wrote: > > > > > > > > On Sat, Aug 21, 2021 at 12:36 PM H.J. Lu via Libc-alpha < > libc-alpha@sourceware.org> wrote: > >> > >> Optimize loads of all bits set into ZMM register in AVX512 SVML codes > >> by replacing > >> > >> vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX > >> > >> and > >> > >> vmovups .L_2il0floatpacket.13(%rip), %zmmX > >> > >> with > >> vpternlogd $0xff, %zmmX, %zmmX, %zmmX > >> > >> This fixes BZ #28252. > >> --- > >> .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S | 7 +------ > >> .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S | 7 +------ > >> .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S | 7 +------ > >> .../fpu/multiarch/svml_d_sincos8_core_avx512.S | 7 +------ > >> .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S | 7 +------ > >> .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S | 7 +------ > >> .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S | 7 +------ > >> .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++---------- > >> .../fpu/multiarch/svml_s_sincosf16_core_avx512.S | 7 +------ > >> .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S | 7 +------ > >> 10 files changed, 11 insertions(+), 64 deletions(-) > >> > >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S > >> index c2cf007904..0fcb912557 100644 > >> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S > >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S > >> @@ -258,7 +258,7 @@ ENTRY (_ZGVeN8v_cos_skx) > >> vmovaps %zmm0, %zmm8 > >> > >> /* Check for large arguments path */ > >> - vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2 > >> + vpternlogd $0xff, %zmm2, %zmm2, %zmm2 > > > > Looking at the code it seems like this is used later by > > > > vpandnq %zmm1, %zmm1, %zmm2{%k1} > > > > AFAICT you can make 
the vpternlogd down there and just use > > > > vpternlogq $0xff, %zmm1, %zmm1, %zmm2{%k1}{z} > >> > >> > >> /* > >> ARGUMENT RANGE REDUCTION: > >> @@ -448,8 +448,3 @@ ENTRY (_ZGVeN8v_cos_skx) > >> vmovsd %xmm0, 1216(%rsp,%r15) > >> jmp .LBL_2_7 > >> END (_ZGVeN8v_cos_skx) > >> - > >> - .section .rodata, "a" > >> -.L_2il0floatpacket.16: > >> - .long 0xffffffff,0xffffffff > >> - .type .L_2il0floatpacket.16,@object > >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S > >> index e9a5d00992..5596c950ce 100644 > >> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S > >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S > >> @@ -267,7 +267,7 @@ ENTRY (_ZGVeN8v_log_skx) > >> > >> /* preserve mantissa, set input exponent to 2^(-10) */ > >> vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2 > >> - vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1 > >> + vpternlogd $0xff, %zmm1, %zmm1, %zmm1 > > > > > > Earlier in the function there is a dependency breaking > > > > kxnorw %k3, %k3, %k3 > > > > so I think you can accomplish the same thing but breaking > > some unlucky dep chain with: > > > > vpmovm2d %k3, %zmm2 > > > >> vpsrlq $32, %zmm4, %zmm6 > >> > >> /* reciprocal approximation good to at least 11 bits */ > >> @@ -453,8 +453,3 @@ ENTRY (_ZGVeN8v_log_skx) > >> vmovsd %xmm0, 1216(%rsp,%r15) > >> jmp .LBL_2_7 > >> END (_ZGVeN8v_log_skx) > >> - > >> - .section .rodata, "a" > >> -.L_2il0floatpacket.12: > >> - .long 0xffffffff,0xffffffff > >> - .type .L_2il0floatpacket.12,@object > >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S > >> index 508da563fe..2981f1582e 100644 > >> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S > >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S > >> @@ -254,7 +254,7 @@ ENTRY (_ZGVeN8v_sin_skx) > >> andq $-64, %rsp > >> subq $1280, %rsp > >> movq 
__svml_d_trig_data@GOTPCREL(%rip), %rax > >> - vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14 > >> + vpternlogd $0xff, %zmm1, %zmm1, %zmm14 > > > > This one also seems to just be used by an vpandn later on: > > > > vpandnq %zmm13, %zmm13, %zmm14{%k1} > > > > so maybe: > > vpternlogq $0xff, %zmm13, %zmm13, %zmm14{%k1}{z} > > instead of the vpandn. > > > >> vmovups __dAbsMask(%rax), %zmm7 > >> vmovups __dInvPI(%rax), %zmm2 > >> vmovups __dRShifter(%rax), %zmm1 > >> @@ -450,8 +450,3 @@ ENTRY (_ZGVeN8v_sin_skx) > >> vmovsd %xmm0, 1216(%rsp,%r15) > >> jmp .LBL_2_7 > >> END (_ZGVeN8v_sin_skx) > >> - > >> - .section .rodata, "a" > >> -.L_2il0floatpacket.14: > >> - .long 0xffffffff,0xffffffff > >> - .type .L_2il0floatpacket.14,@object > >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S > >> index 965415f2bd..4ad366373b 100644 > >> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S > >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S > >> @@ -423,7 +423,7 @@ ENTRY (_ZGVeN8vl8l8_sincos_skx) > >> > >> /* SinPoly = SinR*SinPoly */ > >> vfmadd213pd %zmm5, %zmm5, %zmm4 > >> - vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3 > >> + vpternlogd $0xff, %zmm3, %zmm3, %zmm3 > > > > > > Also vpandn below: > > vpandnq %zmm7, %zmm7, %zmm3{%k1} > > > >> > >> /* Update Cos result's sign */ > >> vxorpd %zmm2, %zmm1, %zmm1 > >> @@ -733,8 +733,3 @@ END (_ZGVeN8vvv_sincos_knl) > >> ENTRY (_ZGVeN8vvv_sincos_skx) > >> WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx > >> END (_ZGVeN8vvv_sincos_skx) > >> - > >> - .section .rodata, "a" > >> -.L_2il0floatpacket.15: > >> - .long 0xffffffff,0xffffffff > >> - .type .L_2il0floatpacket.15,@object > >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S > >> index cdcb16087d..b7d79efb54 100644 > >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S > 
>> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S > >> @@ -271,7 +271,7 @@ ENTRY (_ZGVeN16v_cosf_skx) > >> X = X - Y*PI1 - Y*PI2 - Y*PI3 > >> */ > >> vmovaps %zmm0, %zmm6 > >> - vmovups .L_2il0floatpacket.13(%rip), %zmm12 > >> + vpternlogd $0xff, %zmm12, %zmm12, %zmm12 > > > > > > Also vpandn below: > > vpandnd %zmm1, %zmm1, %zmm12{%k1} > > > >> > >> vmovups __sRShifter(%rax), %zmm3 > >> vmovups __sPI1_FMA(%rax), %zmm5 > >> vmovups __sA9_FMA(%rax), %zmm9 > >> @@ -445,8 +445,3 @@ ENTRY (_ZGVeN16v_cosf_skx) > >> vmovss %xmm0, 1216(%rsp,%r15,8) > >> jmp .LBL_2_7 > >> END (_ZGVeN16v_cosf_skx) > >> - > >> - .section .rodata, "a" > >> -.L_2il0floatpacket.13: > >> - .long > 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > >> - .type .L_2il0floatpacket.13,@object > >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S > >> index 1b09909344..9f03b9b780 100644 > >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S > >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S > >> @@ -257,7 +257,7 @@ ENTRY (_ZGVeN16v_expf_skx) > >> vmovaps %zmm0, %zmm7 > >> > >> /* compare against threshold */ > >> - vmovups .L_2il0floatpacket.13(%rip), %zmm3 > >> + vpternlogd $0xff, %zmm3, %zmm3, %zmm3 > > > > > > Also below: > > vpandnd %zmm2, %zmm2, %zmm3{%k1} > >> > >> vmovups __sInvLn2(%rax), %zmm4 > >> vmovups __sShifter(%rax), %zmm1 > >> vmovups __sLn2hi(%rax), %zmm6 > >> @@ -432,8 +432,3 @@ ENTRY (_ZGVeN16v_expf_skx) > >> jmp .LBL_2_7 > >> > >> END (_ZGVeN16v_expf_skx) > >> - > >> - .section .rodata, "a" > >> -.L_2il0floatpacket.13: > >> - .long > 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > >> - .type 
.L_2il0floatpacket.13,@object > >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S > >> index 4a7b2adbbf..2ba38b0f33 100644 > >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S > >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S > >> @@ -228,7 +228,7 @@ ENTRY (_ZGVeN16v_logf_skx) > >> andq $-64, %rsp > >> subq $1280, %rsp > >> movq __svml_slog_data@GOTPCREL(%rip), %rax > >> - vmovups .L_2il0floatpacket.7(%rip), %zmm6 > >> + vpternlogd $0xff, %zmm6, %zmm6, %zmm6 > > > > Also below: > > vpandnd %zmm1, %zmm1, %zmm6{%k1} > > > >> > >> vmovups _iBrkValue(%rax), %zmm4 > >> vmovups _sPoly_7(%rax), %zmm8 > >> > >> @@ -401,8 +401,3 @@ ENTRY (_ZGVeN16v_logf_skx) > >> jmp .LBL_2_7 > >> > >> END (_ZGVeN16v_logf_skx) > >> - > >> - .section .rodata, "a" > >> -.L_2il0floatpacket.7: > >> - .long > 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > >> - .type .L_2il0floatpacket.7,@object > >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S > >> index 7f906622a5..7f0272c809 100644 > >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S > >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S > >> @@ -378,7 +378,7 @@ ENTRY (_ZGVeN16vv_powf_skx) > >> vpsrlq $32, %zmm3, %zmm2 > >> vpmovqd %zmm2, %ymm11 > >> vcvtps2pd %ymm14, %zmm13 > >> - vmovups .L_2il0floatpacket.23(%rip), %zmm14 > >> + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 > > > > earlier > > kxnorw %k3, %k3, %k3 > > can be used to get a dependency break with this: > > vpmovm2d %k3, %zmm14 > > The SVM codes can use some improvements. Can you > open a separate glibc bug? I'd like to address only all 1s > load here to avoid more complexity. > > Thanks. > Alright. Okay with this patch. 
> > >> vmovaps %zmm14, %zmm26 > >> vpandd _ABSMASK(%rax), %zmm1, %zmm8 > >> vpcmpd $1, _INF(%rax), %zmm8, %k2 > >> @@ -420,7 +420,7 @@ ENTRY (_ZGVeN16vv_powf_skx) > >> vpmovqd %zmm11, %ymm5 > >> vpxord %zmm10, %zmm10, %zmm10 > >> vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3} > >> - vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4 > >> + vpternlogd $0xff, %zmm4, %zmm4, %zmm4 > >> vpxord %zmm11, %zmm11, %zmm11 > >> vcvtdq2pd %ymm7, %zmm7 > >> vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1} > >> @@ -635,11 +635,3 @@ ENTRY (_ZGVeN16vv_powf_skx) > >> vmovss %xmm0, 1216(%rsp,%r15,8) > >> jmp .LBL_2_7 > >> END (_ZGVeN16vv_powf_skx) > >> - > >> - .section .rodata, "a" > >> -.L_2il0floatpacket.23: > >> - .long > 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > >> - .type .L_2il0floatpacket.23,@object > >> -.L_2il0floatpacket.24: > >> - .long 0xffffffff,0xffffffff > >> - .type .L_2il0floatpacket.24,@object > >> diff --git > a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S > >> index 54cee3a537..e1d0154441 100644 > >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S > >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S > >> @@ -310,7 +310,7 @@ ENTRY (_ZGVeN16vl4l4_sincosf_skx) > >> > >> /* Result sign calculations */ > >> vpternlogd $150, %zmm0, %zmm14, %zmm1 > >> - vmovups .L_2il0floatpacket.13(%rip), %zmm14 > >> + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 > >> > >> /* Add correction term 0.5 for cos() part */ > >> vaddps %zmm8, %zmm5, %zmm15 > >> @@ -740,8 +740,3 @@ END (_ZGVeN16vvv_sincosf_knl) > >> ENTRY (_ZGVeN16vvv_sincosf_skx) > >> WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx > >> END (_ZGVeN16vvv_sincosf_skx) > >> - > >> - .section .rodata, "a" > >> -.L_2il0floatpacket.13: > >> - .long > 
0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > >> - .type .L_2il0floatpacket.13,@object > >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S > b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S > >> index ec65ffdce5..bcb76ff756 100644 > >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S > >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S > >> @@ -273,7 +273,7 @@ ENTRY (_ZGVeN16v_sinf_skx) > >> movq __svml_s_trig_data@GOTPCREL(%rip), %rax > >> > >> /* Check for large and special values */ > >> - vmovups .L_2il0floatpacket.11(%rip), %zmm14 > >> + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 > > > > Also below: > > vpandnd %zmm2, %zmm2, %zmm14{%k1} > >> > >> vmovups __sAbsMask(%rax), %zmm5 > >> vmovups __sInvPI(%rax), %zmm1 > >> vmovups __sRShifter(%rax), %zmm2 > >> @@ -464,8 +464,3 @@ ENTRY (_ZGVeN16v_sinf_skx) > >> vmovss %xmm0, 1216(%rsp,%r15,8) > >> jmp .LBL_2_7 > >> END (_ZGVeN16v_sinf_skx) > >> - > >> - .section .rodata, "a" > >> -.L_2il0floatpacket.11: > >> - .long > 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > >> - .type .L_2il0floatpacket.11,@object > >> -- > >> 2.31.1 > >> > > > -- > H.J. >
On Sat, Aug 21, 2021 at 9:37 AM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote: > > Optimize loads of all bits set into ZMM register in AVX512 SVML codes > by replacing > > vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX > > and > > vmovups .L_2il0floatpacket.13(%rip), %zmmX > > with > vpternlogd $0xff, %zmmX, %zmmX, %zmmX > > This fixes BZ #28252. > --- > .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S | 7 +------ > .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S | 7 +------ > .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S | 7 +------ > .../fpu/multiarch/svml_d_sincos8_core_avx512.S | 7 +------ > .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S | 7 +------ > .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S | 7 +------ > .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S | 7 +------ > .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++---------- > .../fpu/multiarch/svml_s_sincosf16_core_avx512.S | 7 +------ > .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S | 7 +------ > 10 files changed, 11 insertions(+), 64 deletions(-) > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S > index c2cf007904..0fcb912557 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S > @@ -258,7 +258,7 @@ ENTRY (_ZGVeN8v_cos_skx) > vmovaps %zmm0, %zmm8 > > /* Check for large arguments path */ > - vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2 > + vpternlogd $0xff, %zmm2, %zmm2, %zmm2 > > /* > ARGUMENT RANGE REDUCTION: > @@ -448,8 +448,3 @@ ENTRY (_ZGVeN8v_cos_skx) > vmovsd %xmm0, 1216(%rsp,%r15) > jmp .LBL_2_7 > END (_ZGVeN8v_cos_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.16: > - .long 0xffffffff,0xffffffff > - .type .L_2il0floatpacket.16,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S > index 
e9a5d00992..5596c950ce 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S > @@ -267,7 +267,7 @@ ENTRY (_ZGVeN8v_log_skx) > > /* preserve mantissa, set input exponent to 2^(-10) */ > vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2 > - vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1 > + vpternlogd $0xff, %zmm1, %zmm1, %zmm1 > vpsrlq $32, %zmm4, %zmm6 > > /* reciprocal approximation good to at least 11 bits */ > @@ -453,8 +453,3 @@ ENTRY (_ZGVeN8v_log_skx) > vmovsd %xmm0, 1216(%rsp,%r15) > jmp .LBL_2_7 > END (_ZGVeN8v_log_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.12: > - .long 0xffffffff,0xffffffff > - .type .L_2il0floatpacket.12,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S > index 508da563fe..2981f1582e 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S > @@ -254,7 +254,7 @@ ENTRY (_ZGVeN8v_sin_skx) > andq $-64, %rsp > subq $1280, %rsp > movq __svml_d_trig_data@GOTPCREL(%rip), %rax > - vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14 > + vpternlogd $0xff, %zmm1, %zmm1, %zmm14 > vmovups __dAbsMask(%rax), %zmm7 > vmovups __dInvPI(%rax), %zmm2 > vmovups __dRShifter(%rax), %zmm1 > @@ -450,8 +450,3 @@ ENTRY (_ZGVeN8v_sin_skx) > vmovsd %xmm0, 1216(%rsp,%r15) > jmp .LBL_2_7 > END (_ZGVeN8v_sin_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.14: > - .long 0xffffffff,0xffffffff > - .type .L_2il0floatpacket.14,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S > index 965415f2bd..4ad366373b 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S > @@ -423,7 +423,7 @@ ENTRY (_ZGVeN8vl8l8_sincos_skx) > > /* SinPoly = SinR*SinPoly */ > 
vfmadd213pd %zmm5, %zmm5, %zmm4 > - vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3 > + vpternlogd $0xff, %zmm3, %zmm3, %zmm3 > > /* Update Cos result's sign */ > vxorpd %zmm2, %zmm1, %zmm1 > @@ -733,8 +733,3 @@ END (_ZGVeN8vvv_sincos_knl) > ENTRY (_ZGVeN8vvv_sincos_skx) > WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx > END (_ZGVeN8vvv_sincos_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.15: > - .long 0xffffffff,0xffffffff > - .type .L_2il0floatpacket.15,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S > index cdcb16087d..b7d79efb54 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S > @@ -271,7 +271,7 @@ ENTRY (_ZGVeN16v_cosf_skx) > X = X - Y*PI1 - Y*PI2 - Y*PI3 > */ > vmovaps %zmm0, %zmm6 > - vmovups .L_2il0floatpacket.13(%rip), %zmm12 > + vpternlogd $0xff, %zmm12, %zmm12, %zmm12 > vmovups __sRShifter(%rax), %zmm3 > vmovups __sPI1_FMA(%rax), %zmm5 > vmovups __sA9_FMA(%rax), %zmm9 > @@ -445,8 +445,3 @@ ENTRY (_ZGVeN16v_cosf_skx) > vmovss %xmm0, 1216(%rsp,%r15,8) > jmp .LBL_2_7 > END (_ZGVeN16v_cosf_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.13: > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > - .type .L_2il0floatpacket.13,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S > index 1b09909344..9f03b9b780 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S > @@ -257,7 +257,7 @@ ENTRY (_ZGVeN16v_expf_skx) > vmovaps %zmm0, %zmm7 > > /* compare against threshold */ > - vmovups .L_2il0floatpacket.13(%rip), %zmm3 > + vpternlogd $0xff, %zmm3, %zmm3, %zmm3 > vmovups 
__sInvLn2(%rax), %zmm4 > vmovups __sShifter(%rax), %zmm1 > vmovups __sLn2hi(%rax), %zmm6 > @@ -432,8 +432,3 @@ ENTRY (_ZGVeN16v_expf_skx) > jmp .LBL_2_7 > > END (_ZGVeN16v_expf_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.13: > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > - .type .L_2il0floatpacket.13,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S > index 4a7b2adbbf..2ba38b0f33 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S > @@ -228,7 +228,7 @@ ENTRY (_ZGVeN16v_logf_skx) > andq $-64, %rsp > subq $1280, %rsp > movq __svml_slog_data@GOTPCREL(%rip), %rax > - vmovups .L_2il0floatpacket.7(%rip), %zmm6 > + vpternlogd $0xff, %zmm6, %zmm6, %zmm6 > vmovups _iBrkValue(%rax), %zmm4 > vmovups _sPoly_7(%rax), %zmm8 > > @@ -401,8 +401,3 @@ ENTRY (_ZGVeN16v_logf_skx) > jmp .LBL_2_7 > > END (_ZGVeN16v_logf_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.7: > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > - .type .L_2il0floatpacket.7,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S > index 7f906622a5..7f0272c809 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S > @@ -378,7 +378,7 @@ ENTRY (_ZGVeN16vv_powf_skx) > vpsrlq $32, %zmm3, %zmm2 > vpmovqd %zmm2, %ymm11 > vcvtps2pd %ymm14, %zmm13 > - vmovups .L_2il0floatpacket.23(%rip), %zmm14 > + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 > vmovaps %zmm14, %zmm26 > vpandd _ABSMASK(%rax), %zmm1, 
%zmm8 > vpcmpd $1, _INF(%rax), %zmm8, %k2 > @@ -420,7 +420,7 @@ ENTRY (_ZGVeN16vv_powf_skx) > vpmovqd %zmm11, %ymm5 > vpxord %zmm10, %zmm10, %zmm10 > vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3} > - vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4 > + vpternlogd $0xff, %zmm4, %zmm4, %zmm4 > vpxord %zmm11, %zmm11, %zmm11 > vcvtdq2pd %ymm7, %zmm7 > vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1} > @@ -635,11 +635,3 @@ ENTRY (_ZGVeN16vv_powf_skx) > vmovss %xmm0, 1216(%rsp,%r15,8) > jmp .LBL_2_7 > END (_ZGVeN16vv_powf_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.23: > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > - .type .L_2il0floatpacket.23,@object > -.L_2il0floatpacket.24: > - .long 0xffffffff,0xffffffff > - .type .L_2il0floatpacket.24,@object > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S > index 54cee3a537..e1d0154441 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S > @@ -310,7 +310,7 @@ ENTRY (_ZGVeN16vl4l4_sincosf_skx) > > /* Result sign calculations */ > vpternlogd $150, %zmm0, %zmm14, %zmm1 > - vmovups .L_2il0floatpacket.13(%rip), %zmm14 > + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 > > /* Add correction term 0.5 for cos() part */ > vaddps %zmm8, %zmm5, %zmm15 > @@ -740,8 +740,3 @@ END (_ZGVeN16vvv_sincosf_knl) > ENTRY (_ZGVeN16vvv_sincosf_skx) > WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx > END (_ZGVeN16vvv_sincosf_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.13: > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > - .type .L_2il0floatpacket.13,@object > diff --git 
a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S > index ec65ffdce5..bcb76ff756 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S > @@ -273,7 +273,7 @@ ENTRY (_ZGVeN16v_sinf_skx) > movq __svml_s_trig_data@GOTPCREL(%rip), %rax > > /* Check for large and special values */ > - vmovups .L_2il0floatpacket.11(%rip), %zmm14 > + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 > vmovups __sAbsMask(%rax), %zmm5 > vmovups __sInvPI(%rax), %zmm1 > vmovups __sRShifter(%rax), %zmm2 > @@ -464,8 +464,3 @@ ENTRY (_ZGVeN16v_sinf_skx) > vmovss %xmm0, 1216(%rsp,%r15,8) > jmp .LBL_2_7 > END (_ZGVeN16v_sinf_skx) > - > - .section .rodata, "a" > -.L_2il0floatpacket.11: > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > - .type .L_2il0floatpacket.11,@object > -- > 2.31.1 > I would like to backport this patch to release branches. Any comments or objections? --Sunil
On Fri, Apr 22, 2022 at 4:40 PM Sunil Pandey via Libc-alpha <libc-alpha@sourceware.org> wrote: > > On Sat, Aug 21, 2021 at 9:37 AM H.J. Lu via Libc-alpha > <libc-alpha@sourceware.org> wrote: > > > > Optimize loads of all bits set into ZMM register in AVX512 SVML codes > > by replacing > > > > vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX > > > > and > > > > vmovups .L_2il0floatpacket.13(%rip), %zmmX > > > > with > > vpternlogd $0xff, %zmmX, %zmmX, %zmmX > > > > This fixes BZ #28252. > > --- > > .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S | 7 +------ > > .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S | 7 +------ > > .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S | 7 +------ > > .../fpu/multiarch/svml_d_sincos8_core_avx512.S | 7 +------ > > .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S | 7 +------ > > .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S | 7 +------ > > .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S | 7 +------ > > .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++---------- > > .../fpu/multiarch/svml_s_sincosf16_core_avx512.S | 7 +------ > > .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S | 7 +------ > > 10 files changed, 11 insertions(+), 64 deletions(-) > > > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S > > index c2cf007904..0fcb912557 100644 > > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S > > @@ -258,7 +258,7 @@ ENTRY (_ZGVeN8v_cos_skx) > > vmovaps %zmm0, %zmm8 > > > > /* Check for large arguments path */ > > - vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2 > > + vpternlogd $0xff, %zmm2, %zmm2, %zmm2 > > > > /* > > ARGUMENT RANGE REDUCTION: > > @@ -448,8 +448,3 @@ ENTRY (_ZGVeN8v_cos_skx) > > vmovsd %xmm0, 1216(%rsp,%r15) > > jmp .LBL_2_7 > > END (_ZGVeN8v_cos_skx) > > - > > - .section .rodata, "a" > > -.L_2il0floatpacket.16: > > - .long 
0xffffffff,0xffffffff > > - .type .L_2il0floatpacket.16,@object > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S > > index e9a5d00992..5596c950ce 100644 > > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S > > @@ -267,7 +267,7 @@ ENTRY (_ZGVeN8v_log_skx) > > > > /* preserve mantissa, set input exponent to 2^(-10) */ > > vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2 > > - vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1 > > + vpternlogd $0xff, %zmm1, %zmm1, %zmm1 > > vpsrlq $32, %zmm4, %zmm6 > > > > /* reciprocal approximation good to at least 11 bits */ > > @@ -453,8 +453,3 @@ ENTRY (_ZGVeN8v_log_skx) > > vmovsd %xmm0, 1216(%rsp,%r15) > > jmp .LBL_2_7 > > END (_ZGVeN8v_log_skx) > > - > > - .section .rodata, "a" > > -.L_2il0floatpacket.12: > > - .long 0xffffffff,0xffffffff > > - .type .L_2il0floatpacket.12,@object > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S > > index 508da563fe..2981f1582e 100644 > > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S > > @@ -254,7 +254,7 @@ ENTRY (_ZGVeN8v_sin_skx) > > andq $-64, %rsp > > subq $1280, %rsp > > movq __svml_d_trig_data@GOTPCREL(%rip), %rax > > - vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14 > > + vpternlogd $0xff, %zmm1, %zmm1, %zmm14 > > vmovups __dAbsMask(%rax), %zmm7 > > vmovups __dInvPI(%rax), %zmm2 > > vmovups __dRShifter(%rax), %zmm1 > > @@ -450,8 +450,3 @@ ENTRY (_ZGVeN8v_sin_skx) > > vmovsd %xmm0, 1216(%rsp,%r15) > > jmp .LBL_2_7 > > END (_ZGVeN8v_sin_skx) > > - > > - .section .rodata, "a" > > -.L_2il0floatpacket.14: > > - .long 0xffffffff,0xffffffff > > - .type .L_2il0floatpacket.14,@object > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S 
b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S > > index 965415f2bd..4ad366373b 100644 > > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S > > @@ -423,7 +423,7 @@ ENTRY (_ZGVeN8vl8l8_sincos_skx) > > > > /* SinPoly = SinR*SinPoly */ > > vfmadd213pd %zmm5, %zmm5, %zmm4 > > - vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3 > > + vpternlogd $0xff, %zmm3, %zmm3, %zmm3 > > > > /* Update Cos result's sign */ > > vxorpd %zmm2, %zmm1, %zmm1 > > @@ -733,8 +733,3 @@ END (_ZGVeN8vvv_sincos_knl) > > ENTRY (_ZGVeN8vvv_sincos_skx) > > WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx > > END (_ZGVeN8vvv_sincos_skx) > > - > > - .section .rodata, "a" > > -.L_2il0floatpacket.15: > > - .long 0xffffffff,0xffffffff > > - .type .L_2il0floatpacket.15,@object > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S > > index cdcb16087d..b7d79efb54 100644 > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S > > @@ -271,7 +271,7 @@ ENTRY (_ZGVeN16v_cosf_skx) > > X = X - Y*PI1 - Y*PI2 - Y*PI3 > > */ > > vmovaps %zmm0, %zmm6 > > - vmovups .L_2il0floatpacket.13(%rip), %zmm12 > > + vpternlogd $0xff, %zmm12, %zmm12, %zmm12 > > vmovups __sRShifter(%rax), %zmm3 > > vmovups __sPI1_FMA(%rax), %zmm5 > > vmovups __sA9_FMA(%rax), %zmm9 > > @@ -445,8 +445,3 @@ ENTRY (_ZGVeN16v_cosf_skx) > > vmovss %xmm0, 1216(%rsp,%r15,8) > > jmp .LBL_2_7 > > END (_ZGVeN16v_cosf_skx) > > - > > - .section .rodata, "a" > > -.L_2il0floatpacket.13: > > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > > - .type .L_2il0floatpacket.13,@object > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S 
b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S > > index 1b09909344..9f03b9b780 100644 > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S > > @@ -257,7 +257,7 @@ ENTRY (_ZGVeN16v_expf_skx) > > vmovaps %zmm0, %zmm7 > > > > /* compare against threshold */ > > - vmovups .L_2il0floatpacket.13(%rip), %zmm3 > > + vpternlogd $0xff, %zmm3, %zmm3, %zmm3 > > vmovups __sInvLn2(%rax), %zmm4 > > vmovups __sShifter(%rax), %zmm1 > > vmovups __sLn2hi(%rax), %zmm6 > > @@ -432,8 +432,3 @@ ENTRY (_ZGVeN16v_expf_skx) > > jmp .LBL_2_7 > > > > END (_ZGVeN16v_expf_skx) > > - > > - .section .rodata, "a" > > -.L_2il0floatpacket.13: > > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > > - .type .L_2il0floatpacket.13,@object > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S > > index 4a7b2adbbf..2ba38b0f33 100644 > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S > > @@ -228,7 +228,7 @@ ENTRY (_ZGVeN16v_logf_skx) > > andq $-64, %rsp > > subq $1280, %rsp > > movq __svml_slog_data@GOTPCREL(%rip), %rax > > - vmovups .L_2il0floatpacket.7(%rip), %zmm6 > > + vpternlogd $0xff, %zmm6, %zmm6, %zmm6 > > vmovups _iBrkValue(%rax), %zmm4 > > vmovups _sPoly_7(%rax), %zmm8 > > > > @@ -401,8 +401,3 @@ ENTRY (_ZGVeN16v_logf_skx) > > jmp .LBL_2_7 > > > > END (_ZGVeN16v_logf_skx) > > - > > - .section .rodata, "a" > > -.L_2il0floatpacket.7: > > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > > - .type .L_2il0floatpacket.7,@object > > diff --git 
a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S > > index 7f906622a5..7f0272c809 100644 > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S > > @@ -378,7 +378,7 @@ ENTRY (_ZGVeN16vv_powf_skx) > > vpsrlq $32, %zmm3, %zmm2 > > vpmovqd %zmm2, %ymm11 > > vcvtps2pd %ymm14, %zmm13 > > - vmovups .L_2il0floatpacket.23(%rip), %zmm14 > > + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 > > vmovaps %zmm14, %zmm26 > > vpandd _ABSMASK(%rax), %zmm1, %zmm8 > > vpcmpd $1, _INF(%rax), %zmm8, %k2 > > @@ -420,7 +420,7 @@ ENTRY (_ZGVeN16vv_powf_skx) > > vpmovqd %zmm11, %ymm5 > > vpxord %zmm10, %zmm10, %zmm10 > > vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3} > > - vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4 > > + vpternlogd $0xff, %zmm4, %zmm4, %zmm4 > > vpxord %zmm11, %zmm11, %zmm11 > > vcvtdq2pd %ymm7, %zmm7 > > vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1} > > @@ -635,11 +635,3 @@ ENTRY (_ZGVeN16vv_powf_skx) > > vmovss %xmm0, 1216(%rsp,%r15,8) > > jmp .LBL_2_7 > > END (_ZGVeN16vv_powf_skx) > > - > > - .section .rodata, "a" > > -.L_2il0floatpacket.23: > > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > > - .type .L_2il0floatpacket.23,@object > > -.L_2il0floatpacket.24: > > - .long 0xffffffff,0xffffffff > > - .type .L_2il0floatpacket.24,@object > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S > > index 54cee3a537..e1d0154441 100644 > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S > > @@ -310,7 +310,7 @@ ENTRY (_ZGVeN16vl4l4_sincosf_skx) > > > > /* Result sign calculations */ > > vpternlogd $150, %zmm0, %zmm14, 
%zmm1 > > - vmovups .L_2il0floatpacket.13(%rip), %zmm14 > > + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 > > > > /* Add correction term 0.5 for cos() part */ > > vaddps %zmm8, %zmm5, %zmm15 > > @@ -740,8 +740,3 @@ END (_ZGVeN16vvv_sincosf_knl) > > ENTRY (_ZGVeN16vvv_sincosf_skx) > > WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx > > END (_ZGVeN16vvv_sincosf_skx) > > - > > - .section .rodata, "a" > > -.L_2il0floatpacket.13: > > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > > - .type .L_2il0floatpacket.13,@object > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S > > index ec65ffdce5..bcb76ff756 100644 > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S > > @@ -273,7 +273,7 @@ ENTRY (_ZGVeN16v_sinf_skx) > > movq __svml_s_trig_data@GOTPCREL(%rip), %rax > > > > /* Check for large and special values */ > > - vmovups .L_2il0floatpacket.11(%rip), %zmm14 > > + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 > > vmovups __sAbsMask(%rax), %zmm5 > > vmovups __sInvPI(%rax), %zmm1 > > vmovups __sRShifter(%rax), %zmm2 > > @@ -464,8 +464,3 @@ ENTRY (_ZGVeN16v_sinf_skx) > > vmovss %xmm0, 1216(%rsp,%r15,8) > > jmp .LBL_2_7 > > END (_ZGVeN16v_sinf_skx) > > - > > - .section .rodata, "a" > > -.L_2il0floatpacket.11: > > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff > > - .type .L_2il0floatpacket.11,@object > > -- > > 2.31.1 > > > > I would like to backport this patch to release branches. > Any comments or objections? None by me > > --Sunil
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S index c2cf007904..0fcb912557 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S @@ -258,7 +258,7 @@ ENTRY (_ZGVeN8v_cos_skx) vmovaps %zmm0, %zmm8 /* Check for large arguments path */ - vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2 + vpternlogd $0xff, %zmm2, %zmm2, %zmm2 /* ARGUMENT RANGE REDUCTION: @@ -448,8 +448,3 @@ ENTRY (_ZGVeN8v_cos_skx) vmovsd %xmm0, 1216(%rsp,%r15) jmp .LBL_2_7 END (_ZGVeN8v_cos_skx) - - .section .rodata, "a" -.L_2il0floatpacket.16: - .long 0xffffffff,0xffffffff - .type .L_2il0floatpacket.16,@object diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S index e9a5d00992..5596c950ce 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S @@ -267,7 +267,7 @@ ENTRY (_ZGVeN8v_log_skx) /* preserve mantissa, set input exponent to 2^(-10) */ vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2 - vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1 + vpternlogd $0xff, %zmm1, %zmm1, %zmm1 vpsrlq $32, %zmm4, %zmm6 /* reciprocal approximation good to at least 11 bits */ @@ -453,8 +453,3 @@ ENTRY (_ZGVeN8v_log_skx) vmovsd %xmm0, 1216(%rsp,%r15) jmp .LBL_2_7 END (_ZGVeN8v_log_skx) - - .section .rodata, "a" -.L_2il0floatpacket.12: - .long 0xffffffff,0xffffffff - .type .L_2il0floatpacket.12,@object diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S index 508da563fe..2981f1582e 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S @@ -254,7 +254,7 @@ ENTRY (_ZGVeN8v_sin_skx) andq $-64, %rsp subq $1280, %rsp movq __svml_d_trig_data@GOTPCREL(%rip), %rax - vpbroadcastq 
.L_2il0floatpacket.14(%rip), %zmm14 + vpternlogd $0xff, %zmm1, %zmm1, %zmm14 vmovups __dAbsMask(%rax), %zmm7 vmovups __dInvPI(%rax), %zmm2 vmovups __dRShifter(%rax), %zmm1 @@ -450,8 +450,3 @@ ENTRY (_ZGVeN8v_sin_skx) vmovsd %xmm0, 1216(%rsp,%r15) jmp .LBL_2_7 END (_ZGVeN8v_sin_skx) - - .section .rodata, "a" -.L_2il0floatpacket.14: - .long 0xffffffff,0xffffffff - .type .L_2il0floatpacket.14,@object diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S index 965415f2bd..4ad366373b 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S @@ -423,7 +423,7 @@ ENTRY (_ZGVeN8vl8l8_sincos_skx) /* SinPoly = SinR*SinPoly */ vfmadd213pd %zmm5, %zmm5, %zmm4 - vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3 + vpternlogd $0xff, %zmm3, %zmm3, %zmm3 /* Update Cos result's sign */ vxorpd %zmm2, %zmm1, %zmm1 @@ -733,8 +733,3 @@ END (_ZGVeN8vvv_sincos_knl) ENTRY (_ZGVeN8vvv_sincos_skx) WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx END (_ZGVeN8vvv_sincos_skx) - - .section .rodata, "a" -.L_2il0floatpacket.15: - .long 0xffffffff,0xffffffff - .type .L_2il0floatpacket.15,@object diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S index cdcb16087d..b7d79efb54 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S @@ -271,7 +271,7 @@ ENTRY (_ZGVeN16v_cosf_skx) X = X - Y*PI1 - Y*PI2 - Y*PI3 */ vmovaps %zmm0, %zmm6 - vmovups .L_2il0floatpacket.13(%rip), %zmm12 + vpternlogd $0xff, %zmm12, %zmm12, %zmm12 vmovups __sRShifter(%rax), %zmm3 vmovups __sPI1_FMA(%rax), %zmm5 vmovups __sA9_FMA(%rax), %zmm9 @@ -445,8 +445,3 @@ ENTRY (_ZGVeN16v_cosf_skx) vmovss %xmm0, 1216(%rsp,%r15,8) jmp .LBL_2_7 END (_ZGVeN16v_cosf_skx) - - .section .rodata, "a" -.L_2il0floatpacket.13: - .long 
0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff - .type .L_2il0floatpacket.13,@object diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S index 1b09909344..9f03b9b780 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S @@ -257,7 +257,7 @@ ENTRY (_ZGVeN16v_expf_skx) vmovaps %zmm0, %zmm7 /* compare against threshold */ - vmovups .L_2il0floatpacket.13(%rip), %zmm3 + vpternlogd $0xff, %zmm3, %zmm3, %zmm3 vmovups __sInvLn2(%rax), %zmm4 vmovups __sShifter(%rax), %zmm1 vmovups __sLn2hi(%rax), %zmm6 @@ -432,8 +432,3 @@ ENTRY (_ZGVeN16v_expf_skx) jmp .LBL_2_7 END (_ZGVeN16v_expf_skx) - - .section .rodata, "a" -.L_2il0floatpacket.13: - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff - .type .L_2il0floatpacket.13,@object diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S index 4a7b2adbbf..2ba38b0f33 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S @@ -228,7 +228,7 @@ ENTRY (_ZGVeN16v_logf_skx) andq $-64, %rsp subq $1280, %rsp movq __svml_slog_data@GOTPCREL(%rip), %rax - vmovups .L_2il0floatpacket.7(%rip), %zmm6 + vpternlogd $0xff, %zmm6, %zmm6, %zmm6 vmovups _iBrkValue(%rax), %zmm4 vmovups _sPoly_7(%rax), %zmm8 @@ -401,8 +401,3 @@ ENTRY (_ZGVeN16v_logf_skx) jmp .LBL_2_7 END (_ZGVeN16v_logf_skx) - - .section .rodata, "a" -.L_2il0floatpacket.7: - .long 
0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff - .type .L_2il0floatpacket.7,@object diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S index 7f906622a5..7f0272c809 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S @@ -378,7 +378,7 @@ ENTRY (_ZGVeN16vv_powf_skx) vpsrlq $32, %zmm3, %zmm2 vpmovqd %zmm2, %ymm11 vcvtps2pd %ymm14, %zmm13 - vmovups .L_2il0floatpacket.23(%rip), %zmm14 + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 vmovaps %zmm14, %zmm26 vpandd _ABSMASK(%rax), %zmm1, %zmm8 vpcmpd $1, _INF(%rax), %zmm8, %k2 @@ -420,7 +420,7 @@ ENTRY (_ZGVeN16vv_powf_skx) vpmovqd %zmm11, %ymm5 vpxord %zmm10, %zmm10, %zmm10 vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3} - vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4 + vpternlogd $0xff, %zmm4, %zmm4, %zmm4 vpxord %zmm11, %zmm11, %zmm11 vcvtdq2pd %ymm7, %zmm7 vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1} @@ -635,11 +635,3 @@ ENTRY (_ZGVeN16vv_powf_skx) vmovss %xmm0, 1216(%rsp,%r15,8) jmp .LBL_2_7 END (_ZGVeN16vv_powf_skx) - - .section .rodata, "a" -.L_2il0floatpacket.23: - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff - .type .L_2il0floatpacket.23,@object -.L_2il0floatpacket.24: - .long 0xffffffff,0xffffffff - .type .L_2il0floatpacket.24,@object diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S index 54cee3a537..e1d0154441 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S @@ -310,7 +310,7 @@ ENTRY (_ZGVeN16vl4l4_sincosf_skx) /* 
Result sign calculations */ vpternlogd $150, %zmm0, %zmm14, %zmm1 - vmovups .L_2il0floatpacket.13(%rip), %zmm14 + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 /* Add correction term 0.5 for cos() part */ vaddps %zmm8, %zmm5, %zmm15 @@ -740,8 +740,3 @@ END (_ZGVeN16vvv_sincosf_knl) ENTRY (_ZGVeN16vvv_sincosf_skx) WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx END (_ZGVeN16vvv_sincosf_skx) - - .section .rodata, "a" -.L_2il0floatpacket.13: - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff - .type .L_2il0floatpacket.13,@object diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S index ec65ffdce5..bcb76ff756 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S @@ -273,7 +273,7 @@ ENTRY (_ZGVeN16v_sinf_skx) movq __svml_s_trig_data@GOTPCREL(%rip), %rax /* Check for large and special values */ - vmovups .L_2il0floatpacket.11(%rip), %zmm14 + vpternlogd $0xff, %zmm14, %zmm14, %zmm14 vmovups __sAbsMask(%rax), %zmm5 vmovups __sInvPI(%rax), %zmm1 vmovups __sRShifter(%rax), %zmm2 @@ -464,8 +464,3 @@ ENTRY (_ZGVeN16v_sinf_skx) vmovss %xmm0, 1216(%rsp,%r15,8) jmp .LBL_2_7 END (_ZGVeN16v_sinf_skx) - - .section .rodata, "a" -.L_2il0floatpacket.11: - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff - .type .L_2il0floatpacket.11,@object