diff mbox

x86-64: Improve branch prediction in _dl_runtime_resolve_avx512_opt [BZ #21258]

Message ID 20170316203911.GA26261@intel.com
State New
Headers show

Commit Message

H.J. Lu March 16, 2017, 8:39 p.m. UTC
On Skylake server, _dl_runtime_resolve_avx512_opt is used to preserve
the first 8 vector registers.  The code layout is

  if only %xmm0 - %xmm7 registers are used
     preserve %xmm0 - %xmm7 registers
  if only %ymm0 - %ymm7 registers are used
     preserve %ymm0 - %ymm7 registers
  preserve %zmm0 - %zmm7 registers

Branch prediction always executes the fallthrough code path to preserve
%zmm0 - %zmm7 registers speculatively, even though only %xmm0 - %xmm7
registers are used.  This leads to lower CPU frequency on Skylake
server.  This patch changes the fallthrough code path to preserve
%xmm0 - %xmm7 registers instead:

  if whole %zmm0 - %zmm7 registers are used
    preserve %zmm0 - %zmm7 registers
  if only %ymm0 - %ymm7 registers are used
     preserve %ymm0 - %ymm7 registers
  preserve %xmm0 - %xmm7 registers

Tested on Skylake server.

Any comments?

H.J.
---
	[BZ #21258]
	* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_resolve_opt):
	Define only if _dl_runtime_resolve is defined to
	_dl_runtime_resolve_sse_vex.
	* sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_opt):
	Fallthrough to _dl_runtime_resolve_sse_vex.
---
 sysdeps/x86_64/dl-trampoline.S | 3 +--
 sysdeps/x86_64/dl-trampoline.h | 9 +++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

Comments

H.J. Lu March 21, 2017, 6:01 p.m. UTC | #1
On Thu, Mar 16, 2017 at 1:39 PM, H.J. Lu <hongjiu.lu@intel.com> wrote:
> On Skylake server, _dl_runtime_resolve_avx512_opt is used to preserve
> the first 8 vector registers.  The code layout is
>
>   if only %xmm0 - %xmm7 registers are used
>      preserve %xmm0 - %xmm7 registers
>   if only %ymm0 - %ymm7 registers are used
>      preserve %ymm0 - %ymm7 registers
>   preserve %zmm0 - %zmm7 registers
>
> Branch predication always executes the fallthrough code path to preserve
> %zmm0 - %zmm7 registers speculatively, even though only %xmm0 - %xmm7
> registers are used.  This leads to lower CPU frequency on Skylake
> server.  This patch changes the fallthrough code path to preserve
> %xmm0 - %xmm7 registers instead:
>
>   if whole %zmm0 - %zmm7 registers are used
>     preserve %zmm0 - %zmm7 registers
>   if only %ymm0 - %ymm7 registers are used
>      preserve %ymm0 - %ymm7 registers
>   preserve %xmm0 - %xmm7 registers
>
> Tested on Skylake server.
>
> Any comments?

I checked it in.

> H.J.
> ---
>         [BZ #21258]
>         * sysdeps/x86_64/dl-trampoline.S (_dl_runtime_resolve_opt):
>         Define only if _dl_runtime_resolve is defined to
>         _dl_runtime_resolve_sse_vex.
>         * sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_opt):
>         Fallthrough to _dl_runtime_resolve_sse_vex.
> ---
>  sysdeps/x86_64/dl-trampoline.S | 3 +--
>  sysdeps/x86_64/dl-trampoline.h | 9 +++++----
>  2 files changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> index 33d7fcf..c14c61a 100644
> --- a/sysdeps/x86_64/dl-trampoline.S
> +++ b/sysdeps/x86_64/dl-trampoline.S
> @@ -87,11 +87,9 @@
>  #endif
>  #define VEC(i)                 zmm##i
>  #define _dl_runtime_resolve    _dl_runtime_resolve_avx512
> -#define _dl_runtime_resolve_opt        _dl_runtime_resolve_avx512_opt
>  #define _dl_runtime_profile    _dl_runtime_profile_avx512
>  #include "dl-trampoline.h"
>  #undef _dl_runtime_resolve
> -#undef _dl_runtime_resolve_opt
>  #undef _dl_runtime_profile
>  #undef VEC
>  #undef VMOV
> @@ -145,4 +143,5 @@
>  # define VMOV                  vmovdqu
>  #endif
>  #define _dl_runtime_resolve    _dl_runtime_resolve_sse_vex
> +#define _dl_runtime_resolve_opt        _dl_runtime_resolve_avx512_opt
>  #include "dl-trampoline.h"
> diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
> index b27fa06..8db24c1 100644
> --- a/sysdeps/x86_64/dl-trampoline.h
> +++ b/sysdeps/x86_64/dl-trampoline.h
> @@ -129,19 +129,20 @@ _dl_runtime_resolve_opt:
>         # YMM state isn't in use.
>         PRESERVE_BND_REGS_PREFIX
>         jz _dl_runtime_resolve_sse_vex
> -# elif VEC_SIZE == 64
> +# elif VEC_SIZE == 16
>         # For ZMM registers, check if YMM state and ZMM state are in
>         # use.
>         andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d
>         cmpl $bit_YMM_state, %r11d
> -       # Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if
> -       # neither YMM state nor ZMM state are in use.
> +       # Preserve %zmm0 - %zmm7 registers if ZMM state is in use.
>         PRESERVE_BND_REGS_PREFIX
> -       jl _dl_runtime_resolve_sse_vex
> +       jg _dl_runtime_resolve_avx512
>         # Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if
>         # ZMM state isn't in use.
>         PRESERVE_BND_REGS_PREFIX
>         je _dl_runtime_resolve_avx
> +       # Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if
> +       # neither YMM state nor ZMM state are in use.
>  # else
>  #  error Unsupported VEC_SIZE!
>  # endif
> --
> 2.9.3
>
H.J. Lu April 7, 2017, 5:06 p.m. UTC | #2
On Tue, Mar 21, 2017 at 11:01 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Thu, Mar 16, 2017 at 1:39 PM, H.J. Lu <hongjiu.lu@intel.com> wrote:
>> On Skylake server, _dl_runtime_resolve_avx512_opt is used to preserve
>> the first 8 vector registers.  The code layout is
>>
>>   if only %xmm0 - %xmm7 registers are used
>>      preserve %xmm0 - %xmm7 registers
>>   if only %ymm0 - %ymm7 registers are used
>>      preserve %ymm0 - %ymm7 registers
>>   preserve %zmm0 - %zmm7 registers
>>
>> Branch predication always executes the fallthrough code path to preserve
>> %zmm0 - %zmm7 registers speculatively, even though only %xmm0 - %xmm7
>> registers are used.  This leads to lower CPU frequency on Skylake
>> server.  This patch changes the fallthrough code path to preserve
>> %xmm0 - %xmm7 registers instead:
>>
>>   if whole %zmm0 - %zmm7 registers are used
>>     preserve %zmm0 - %zmm7 registers
>>   if only %ymm0 - %ymm7 registers are used
>>      preserve %ymm0 - %ymm7 registers
>>   preserve %xmm0 - %xmm7 registers
>>
>> Tested on Skylake server.
>>
>> Any comments?
>
> I checked it in.
>

I am backporting it to the 2.25 and 2.24 branches.
diff mbox

Patch

diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 33d7fcf..c14c61a 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -87,11 +87,9 @@ 
 #endif
 #define VEC(i)			zmm##i
 #define _dl_runtime_resolve	_dl_runtime_resolve_avx512
-#define _dl_runtime_resolve_opt	_dl_runtime_resolve_avx512_opt
 #define _dl_runtime_profile	_dl_runtime_profile_avx512
 #include "dl-trampoline.h"
 #undef _dl_runtime_resolve
-#undef _dl_runtime_resolve_opt
 #undef _dl_runtime_profile
 #undef VEC
 #undef VMOV
@@ -145,4 +143,5 @@ 
 # define VMOV			vmovdqu
 #endif
 #define _dl_runtime_resolve	_dl_runtime_resolve_sse_vex
+#define _dl_runtime_resolve_opt	_dl_runtime_resolve_avx512_opt
 #include "dl-trampoline.h"
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
index b27fa06..8db24c1 100644
--- a/sysdeps/x86_64/dl-trampoline.h
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -129,19 +129,20 @@  _dl_runtime_resolve_opt:
 	# YMM state isn't in use.
 	PRESERVE_BND_REGS_PREFIX
 	jz _dl_runtime_resolve_sse_vex
-# elif VEC_SIZE == 64
+# elif VEC_SIZE == 16
 	# For ZMM registers, check if YMM state and ZMM state are in
 	# use.
 	andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d
 	cmpl $bit_YMM_state, %r11d
-	# Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if
-	# neither YMM state nor ZMM state are in use.
+	# Preserve %zmm0 - %zmm7 registers if ZMM state is in use.
 	PRESERVE_BND_REGS_PREFIX
-	jl _dl_runtime_resolve_sse_vex
+	jg _dl_runtime_resolve_avx512
 	# Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if
 	# ZMM state isn't in use.
 	PRESERVE_BND_REGS_PREFIX
 	je _dl_runtime_resolve_avx
+	# Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if
+	# neither YMM state nor ZMM state are in use.
 # else
 #  error Unsupported VEC_SIZE!
 # endif