Message ID | 20170316203911.GA26261@intel.com |
---|---|
State | New |
Headers | show |
On Thu, Mar 16, 2017 at 1:39 PM, H.J. Lu <hongjiu.lu@intel.com> wrote: > On Skylake server, _dl_runtime_resolve_avx512_opt is used to preserve > the first 8 vector registers. The code layout is > > if only %xmm0 - %xmm7 registers are used > preserve %xmm0 - %xmm7 registers > if only %ymm0 - %ymm7 registers are used > preserve %ymm0 - %ymm7 registers > preserve %zmm0 - %zmm7 registers > > Branch prediction always executes the fallthrough code path to preserve > %zmm0 - %zmm7 registers speculatively, even though only %xmm0 - %xmm7 > registers are used. This leads to lower CPU frequency on Skylake > server. This patch changes the fallthrough code path to preserve > %xmm0 - %xmm7 registers instead: > > if whole %zmm0 - %zmm7 registers are used > preserve %zmm0 - %zmm7 registers > if only %ymm0 - %ymm7 registers are used > preserve %ymm0 - %ymm7 registers > preserve %xmm0 - %xmm7 registers > > Tested on Skylake server. > > Any comments? I checked it in. > H.J. > --- > [BZ #21258] > * sysdeps/x86_64/dl-trampoline.S (_dl_runtime_resolve_opt): > Define only if _dl_runtime_resolve is defined to > _dl_runtime_resolve_sse_vex. > * sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_opt): > Fallthrough to _dl_runtime_resolve_sse_vex. 
> --- > sysdeps/x86_64/dl-trampoline.S | 3 +-- > sysdeps/x86_64/dl-trampoline.h | 9 +++++---- > 2 files changed, 6 insertions(+), 6 deletions(-) > > diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S > index 33d7fcf..c14c61a 100644 > --- a/sysdeps/x86_64/dl-trampoline.S > +++ b/sysdeps/x86_64/dl-trampoline.S > @@ -87,11 +87,9 @@ > #endif > #define VEC(i) zmm##i > #define _dl_runtime_resolve _dl_runtime_resolve_avx512 > -#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx512_opt > #define _dl_runtime_profile _dl_runtime_profile_avx512 > #include "dl-trampoline.h" > #undef _dl_runtime_resolve > -#undef _dl_runtime_resolve_opt > #undef _dl_runtime_profile > #undef VEC > #undef VMOV > @@ -145,4 +143,5 @@ > # define VMOV vmovdqu > #endif > #define _dl_runtime_resolve _dl_runtime_resolve_sse_vex > +#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx512_opt > #include "dl-trampoline.h" > diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h > index b27fa06..8db24c1 100644 > --- a/sysdeps/x86_64/dl-trampoline.h > +++ b/sysdeps/x86_64/dl-trampoline.h > @@ -129,19 +129,20 @@ _dl_runtime_resolve_opt: > # YMM state isn't in use. > PRESERVE_BND_REGS_PREFIX > jz _dl_runtime_resolve_sse_vex > -# elif VEC_SIZE == 64 > +# elif VEC_SIZE == 16 > # For ZMM registers, check if YMM state and ZMM state are in > # use. > andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d > cmpl $bit_YMM_state, %r11d > - # Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if > - # neither YMM state nor ZMM state are in use. > + # Preserve %zmm0 - %zmm7 registers if ZMM state is in use. > PRESERVE_BND_REGS_PREFIX > - jl _dl_runtime_resolve_sse_vex > + jg _dl_runtime_resolve_avx512 > # Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if > # ZMM state isn't in use. 
> PRESERVE_BND_REGS_PREFIX > je _dl_runtime_resolve_avx > + # Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if > + # neither YMM state nor ZMM state are in use. > # else > # error Unsupported VEC_SIZE! > # endif > -- > 2.9.3 >
On Tue, Mar 21, 2017 at 11:01 AM, H.J. Lu <hjl.tools@gmail.com> wrote: > On Thu, Mar 16, 2017 at 1:39 PM, H.J. Lu <hongjiu.lu@intel.com> wrote: >> On Skylake server, _dl_runtime_resolve_avx512_opt is used to preserve >> the first 8 vector registers. The code layout is >> >> if only %xmm0 - %xmm7 registers are used >> preserve %xmm0 - %xmm7 registers >> if only %ymm0 - %ymm7 registers are used >> preserve %ymm0 - %ymm7 registers >> preserve %zmm0 - %zmm7 registers >> >> Branch prediction always executes the fallthrough code path to preserve >> %zmm0 - %zmm7 registers speculatively, even though only %xmm0 - %xmm7 >> registers are used. This leads to lower CPU frequency on Skylake >> server. This patch changes the fallthrough code path to preserve >> %xmm0 - %xmm7 registers instead: >> >> if whole %zmm0 - %zmm7 registers are used >> preserve %zmm0 - %zmm7 registers >> if only %ymm0 - %ymm7 registers are used >> preserve %ymm0 - %ymm7 registers >> preserve %xmm0 - %xmm7 registers >> >> Tested on Skylake server. >> >> Any comments? > > I checked it in. > I am backporting it to the 2.25 and 2.24 branches.
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S index 33d7fcf..c14c61a 100644 --- a/sysdeps/x86_64/dl-trampoline.S +++ b/sysdeps/x86_64/dl-trampoline.S @@ -87,11 +87,9 @@ #endif #define VEC(i) zmm##i #define _dl_runtime_resolve _dl_runtime_resolve_avx512 -#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx512_opt #define _dl_runtime_profile _dl_runtime_profile_avx512 #include "dl-trampoline.h" #undef _dl_runtime_resolve -#undef _dl_runtime_resolve_opt #undef _dl_runtime_profile #undef VEC #undef VMOV @@ -145,4 +143,5 @@ # define VMOV vmovdqu #endif #define _dl_runtime_resolve _dl_runtime_resolve_sse_vex +#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx512_opt #include "dl-trampoline.h" diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h index b27fa06..8db24c1 100644 --- a/sysdeps/x86_64/dl-trampoline.h +++ b/sysdeps/x86_64/dl-trampoline.h @@ -129,19 +129,20 @@ _dl_runtime_resolve_opt: # YMM state isn't in use. PRESERVE_BND_REGS_PREFIX jz _dl_runtime_resolve_sse_vex -# elif VEC_SIZE == 64 +# elif VEC_SIZE == 16 # For ZMM registers, check if YMM state and ZMM state are in # use. andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d cmpl $bit_YMM_state, %r11d - # Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if - # neither YMM state nor ZMM state are in use. + # Preserve %zmm0 - %zmm7 registers if ZMM state is in use. PRESERVE_BND_REGS_PREFIX - jl _dl_runtime_resolve_sse_vex + jg _dl_runtime_resolve_avx512 # Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if # ZMM state isn't in use. PRESERVE_BND_REGS_PREFIX je _dl_runtime_resolve_avx + # Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if + # neither YMM state nor ZMM state are in use. # else # error Unsupported VEC_SIZE! # endif