
[i386] Avoid LCP stalls (issue5975045)

Message ID 20120330141858.420D5615CC@tjsboxrox.mtv.corp.google.com
State New

Commit Message

Teresa Johnson March 30, 2012, 2:18 p.m. UTC
This patch addresses instructions that incur expensive length-changing prefix (LCP) stalls
on some x86-64 implementations, notably Core2 and Corei7. Specifically, a move of
a 16-bit constant into memory requires a length-changing prefix and can incur significant
penalties. The attached patch avoids this by forcing such instructions to be split into
two: a move of the corresponding 32-bit constant into a register, and a move of the
register's lower 16 bits into memory.
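
For illustration (this example is not part of the patch; the chosen register
and constant are arbitrary), the transformation looks roughly like this in
AT&T syntax:

    # Before: 16-bit immediate store.  The 0x66 operand-size prefix combined
    # with a 16-bit immediate changes the instruction length the predecoder
    # assumes, triggering an LCP stall on Core2/Corei7.
    movw    $0x1234, (%rdi)

    # After the split: a 32-bit immediate load into a register (no LCP),
    # followed by a plain 16-bit register store.
    movl    $0x1234, %eax
    movw    %ax, (%rdi)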

Bootstrapped and tested on x86_64-unknown-linux-gnu. Is this ok for trunk?

Thanks,
Teresa

2012-03-29   Teresa Johnson  <tejohnson@google.com>

	* config/i386/i386.h (ix86_tune_indices): Add
	X86_TUNE_LCP_STALL.
	* config/i386/i386.md (movhi_internal): Split to
	movhi_internal and movhi_imm_internal.
	* config/i386/i386.c (initial_ix86_tune_features): Initialize
	X86_TUNE_LCP_STALL entry.


--
This patch is available for review at http://codereview.appspot.com/5975045

Comments

Teresa Johnson March 30, 2012, 2:26 p.m. UTC | #1
I should add that I have tested performance of this on Core2, Corei7
(Nehalem) and AMD Opteron-based systems. It appears to be
performance-neutral on AMD (only minor perturbations, overall a wash).
For the test case that provoked the optimization, there were nice
improvements on Core2 and Corei7.

Thanks,
Teresa

On Fri, Mar 30, 2012 at 7:18 AM, Teresa Johnson <tejohnson@google.com> wrote:
> This patch addresses instructions that incur expensive length-changing prefix (LCP) stalls
> on some x86-64 implementations, notably Core2 and Corei7. Specifically, a move of
> a 16-bit constant into memory requires a length-changing prefix and can incur significant
> penalties. The attached patch avoids this by forcing such instructions to be split into
> two: a move of the corresponding 32-bit constant into a register, and a move of the
> register's lower 16 bits into memory.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu. Is this ok for trunk?
>
> Thanks,
> Teresa
>
> 2012-03-29   Teresa Johnson  <tejohnson@google.com>
>
>        * config/i386/i386.h (ix86_tune_indices): Add
>        X86_TUNE_LCP_STALL.
>        * config/i386/i386.md (movhi_internal): Split to
>        movhi_internal and movhi_imm_internal.
>        * config/i386/i386.c (initial_ix86_tune_features): Initialize
>        X86_TUNE_LCP_STALL entry.
>
> Index: config/i386/i386.h
> ===================================================================
> --- config/i386/i386.h  (revision 185920)
> +++ config/i386/i386.h  (working copy)
> @@ -262,6 +262,7 @@ enum ix86_tune_indices {
>   X86_TUNE_MOVX,
>   X86_TUNE_PARTIAL_REG_STALL,
>   X86_TUNE_PARTIAL_FLAG_REG_STALL,
> +  X86_TUNE_LCP_STALL,
>   X86_TUNE_USE_HIMODE_FIOP,
>   X86_TUNE_USE_SIMODE_FIOP,
>   X86_TUNE_USE_MOV0,
> @@ -340,6 +341,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_L
>  #define TARGET_PARTIAL_REG_STALL ix86_tune_features[X86_TUNE_PARTIAL_REG_STALL]
>  #define TARGET_PARTIAL_FLAG_REG_STALL \
>        ix86_tune_features[X86_TUNE_PARTIAL_FLAG_REG_STALL]
> +#define TARGET_LCP_STALL \
> +       ix86_tune_features[X86_TUNE_LCP_STALL]
>  #define TARGET_USE_HIMODE_FIOP ix86_tune_features[X86_TUNE_USE_HIMODE_FIOP]
>  #define TARGET_USE_SIMODE_FIOP ix86_tune_features[X86_TUNE_USE_SIMODE_FIOP]
>  #define TARGET_USE_MOV0                ix86_tune_features[X86_TUNE_USE_MOV0]
> Index: config/i386/i386.md
> ===================================================================
> --- config/i386/i386.md (revision 185920)
> +++ config/i386/i386.md (working copy)
> @@ -2262,9 +2262,19 @@
>           ]
>           (const_string "SI")))])
>
> +(define_insn "*movhi_imm_internal"
> +  [(set (match_operand:HI 0 "memory_operand" "=m")
> +        (match_operand:HI 1 "immediate_operand" "n"))]
> +  "!TARGET_LCP_STALL && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
> +{
> +  return "mov{w}\t{%1, %0|%0, %1}";
> +}
> +  [(set (attr "type") (const_string "imov"))
> +   (set (attr "mode") (const_string "HI"))])
> +
>  (define_insn "*movhi_internal"
>   [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,m")
> -       (match_operand:HI 1 "general_operand" "r,rn,rm,rn"))]
> +       (match_operand:HI 1 "general_operand" "r,rn,rm,r"))]
>   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
>  {
>   switch (get_attr_type (insn))
> Index: config/i386/i386.c
> ===================================================================
> --- config/i386/i386.c  (revision 185920)
> +++ config/i386/i386.c  (working copy)
> @@ -1964,6 +1964,11 @@ static unsigned int initial_ix86_tune_features[X86
>   /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
>   m_CORE2I7 | m_GENERIC,
>
> +  /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
> +   * on 16-bit immediate moves into memory on Core2 and Corei7,
> +   * which may also affect AMD implementations.  */
> +  m_CORE2I7 | m_GENERIC | m_AMD_MULTIPLE,
> +
>   /* X86_TUNE_USE_HIMODE_FIOP */
>   m_386 | m_486 | m_K6_GEODE,
>
>
> --
> This patch is available for review at http://codereview.appspot.com/5975045
Jan Hubicka March 30, 2012, 3:18 p.m. UTC | #2
> Index: config/i386/i386.md
> ===================================================================
> --- config/i386/i386.md	(revision 185920)
> +++ config/i386/i386.md	(working copy)
> @@ -2262,9 +2262,19 @@
>  	   ]
>  	   (const_string "SI")))])
>  
> +(define_insn "*movhi_imm_internal"
> +  [(set (match_operand:HI 0 "memory_operand" "=m")
> +        (match_operand:HI 1 "immediate_operand" "n"))]
> +  "!TARGET_LCP_STALL && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
> +{
> +  return "mov{w}\t{%1, %0|%0, %1}";
> +}
> +  [(set (attr "type") (const_string "imov"))
> +   (set (attr "mode") (const_string "HI"))])
> +
>  (define_insn "*movhi_internal"
>    [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,m")
> -	(match_operand:HI 1 "general_operand" "r,rn,rm,rn"))]
> +	(match_operand:HI 1 "general_operand" "r,rn,rm,r"))]

If you do this, you will prevent reload from considering the immediate for
rematerialization when the register holding the constant has been spilled to
the stack on !TARGET_LCP_STALL machines. The matching pattern for moves should
really handle all available alternatives, so that reload is happy.

You could duplicate the pattern, but I think this is much better done as a
post-reload peephole2, i.e. ask for a scratch register and, if one is
available, do the splitting.  This way the optimization won't happen when no
register is available, and we also rely on the post-reload cleanups to unify
moves of the constant, but I think this should work well.

You will also want to conditionalize the split on optimize_insn_for_speed.
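
A minimal sketch of the suggested scratch-register peephole2, for illustration
only (this is not the pattern as committed; the exact enabling condition and
operand handling are assumptions):

    ;; Illustrative only: after reload, if an SImode scratch register is
    ;; available and this insn is optimized for speed, rewrite a 16-bit
    ;; immediate store as a 32-bit immediate load into the scratch followed
    ;; by a 16-bit store of the scratch's low half.
    (define_peephole2
      [(match_scratch:SI 2 "r")
       (set (match_operand:HI 0 "memory_operand")
            (match_operand:HI 1 "immediate_operand"))]
      "TARGET_LCP_STALL && optimize_insn_for_speed_p ()"
      [(set (match_dup 2) (match_dup 1))
       (set (match_dup 0) (match_dup 3))]
    {
      /* The CONST_INT is reused directly for the SImode load; the value
         actually stored is the low 16 bits of the scratch register.  */
      operands[3] = gen_lowpart (HImode, operands[2]);
    })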

>    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
>  {
>    switch (get_attr_type (insn))
> Index: config/i386/i386.c
> ===================================================================
> --- config/i386/i386.c	(revision 185920)
> +++ config/i386/i386.c	(working copy)
> @@ -1964,6 +1964,11 @@ static unsigned int initial_ix86_tune_features[X86
>    /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
>    m_CORE2I7 | m_GENERIC,
>  
> +  /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
> +   * on 16-bit immediate moves into memory on Core2 and Corei7,
> +   * which may also affect AMD implementations.  */
> +  m_CORE2I7 | m_GENERIC | m_AMD_MULTIPLE,

Is this supposed to help AMD?  (At least the pre-Bulldozer designs should not
care about length-changing prefixes that much, because they tag instruction
sizes in the cache.)  If not, I would suggest enabling it only for the Cores
and generic.

Honza

Patch

Index: config/i386/i386.h
===================================================================
--- config/i386/i386.h	(revision 185920)
+++ config/i386/i386.h	(working copy)
@@ -262,6 +262,7 @@  enum ix86_tune_indices {
   X86_TUNE_MOVX,
   X86_TUNE_PARTIAL_REG_STALL,
   X86_TUNE_PARTIAL_FLAG_REG_STALL,
+  X86_TUNE_LCP_STALL,
   X86_TUNE_USE_HIMODE_FIOP,
   X86_TUNE_USE_SIMODE_FIOP,
   X86_TUNE_USE_MOV0,
@@ -340,6 +341,8 @@  extern unsigned char ix86_tune_features[X86_TUNE_L
 #define TARGET_PARTIAL_REG_STALL ix86_tune_features[X86_TUNE_PARTIAL_REG_STALL]
 #define TARGET_PARTIAL_FLAG_REG_STALL \
 	ix86_tune_features[X86_TUNE_PARTIAL_FLAG_REG_STALL]
+#define TARGET_LCP_STALL \
+	ix86_tune_features[X86_TUNE_LCP_STALL]
 #define TARGET_USE_HIMODE_FIOP	ix86_tune_features[X86_TUNE_USE_HIMODE_FIOP]
 #define TARGET_USE_SIMODE_FIOP	ix86_tune_features[X86_TUNE_USE_SIMODE_FIOP]
 #define TARGET_USE_MOV0		ix86_tune_features[X86_TUNE_USE_MOV0]
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md	(revision 185920)
+++ config/i386/i386.md	(working copy)
@@ -2262,9 +2262,19 @@ 
 	   ]
 	   (const_string "SI")))])
 
+(define_insn "*movhi_imm_internal"
+  [(set (match_operand:HI 0 "memory_operand" "=m")
+        (match_operand:HI 1 "immediate_operand" "n"))]
+  "!TARGET_LCP_STALL && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+{
+  return "mov{w}\t{%1, %0|%0, %1}";
+}
+  [(set (attr "type") (const_string "imov"))
+   (set (attr "mode") (const_string "HI"))])
+
 (define_insn "*movhi_internal"
   [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,m")
-	(match_operand:HI 1 "general_operand" "r,rn,rm,rn"))]
+	(match_operand:HI 1 "general_operand" "r,rn,rm,r"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   switch (get_attr_type (insn))
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 185920)
+++ config/i386/i386.c	(working copy)
@@ -1964,6 +1964,11 @@  static unsigned int initial_ix86_tune_features[X86
   /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
   m_CORE2I7 | m_GENERIC,
 
+  /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
+   * on 16-bit immediate moves into memory on Core2 and Corei7,
+   * which may also affect AMD implementations.  */
+  m_CORE2I7 | m_GENERIC | m_AMD_MULTIPLE,
+
   /* X86_TUNE_USE_HIMODE_FIOP */
   m_386 | m_486 | m_K6_GEODE,