diff mbox

[i386] : Fix PR 70873 - 20% performance regression at 482.sphinx3 after r235442 with -O2 -m32 on Haswell.

Message ID CAFULd4ZSStd5y1vzAXVqgoYEkFeQt+eaqDYM2P==wo-dBBv=Uw@mail.gmail.com
State New
Headers show

Commit Message

Uros Bizjak May 4, 2016, 8:56 p.m. UTC
Hello!

This patch moves all TARGET_SSE_PARTIAL_REG_DEPENDENCY FP conversion
splitters to a later split pass. In addition, the patch substantially
cleans up these and related patterns.

The functionality of post-reload conversion splitters goes this way:

- process FP conversions for TARGET_USE_VECTOR_FP_CONVERTS in an early
post-reload splitter. This pass will rewrite FP conversions to vector
insns and is thus incompatible with the next two passes. AMDFAM10
processors depend on this transformation.

- process FP conversions for TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS in
a peephole2 pass. This will transform mem->reg insns to reg->reg
insns, and these insns can then be processed by the next pass. Some
Intel processors depend on this transformation.

- process FP conversions for TARGET_SSE_PARTIAL_REG_DEPENDENCY in a
late post-reload splitter, when allocated registers are stable. AMD
and Intel processors depend on this pass, so it is part of generic
tuning.

As mentioned by HJ in the PR, there looks to be a problem with the
generic splitting infrastructure. When a splitter is matched, but
FAILs in the preparatory statements, no other splitters with the same
pattern are executed. IMO, this is an implementation bug: after a
splitter FAILs, the others should still be tried.

2016-05-04  Uros Bizjak  <ubizjak@gmail.com>

    PR target/70873
    * config/i386/i386.md (extendsfdf2): Use nonimm_ssenomem_operand
    as operand 0 predicate.
    (TARGET_SSE_PARTIAL_REG_DEPENDENCY float_extend sf->df peephole2):
    Change to post-epilogue_completed late splitter.  Use sse_reg_operand
    as operand 0 predicate.
    (TARGET_SSE_PARTIAL_REG_DEPENDENCY float_truncate df->sf peephole2):
    Ditto.
    (TARGET_SSE_PARTIAL_REG_DEPENDENCY float {si,di}->{sf,df} splitter):
    Ditto.  Emit the pattern using RTX.

    (TARGET_USE_VECTOR_FP_CONVERTS float_extend sf->df splitter):
    Use sse_reg_operand as operand 0 predicate.  Do not use true_regnum in
    the post-reload splitter.  Use lowpart_subreg instead of gen_rtx_REG.
    (TARGET_USE_VECTOR_FP_CONVERTS float_truncate df->sf splitter):
    Ditto.
    (TARGET_USE_VECTOR_CONVERTS float si->{sf,df} splitter): Use
    sse_reg_operand as operand 0 predicate.

    (TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS float_extend sf->df peephole2):
    Use sse_reg_operand as operand 0 predicate.  Use lowpart_subreg
    instead of gen_rtx_REG.
    (TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS float_truncate df->sf peephole2):
    Ditto.

Patch was bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Committed to mainline SVN.

Uros.
diff mbox

Patch

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index ba1ff8b..dd56b05 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -4231,12 +4231,12 @@ 
    that might lead to ICE on 32bit target.  The sequence unlikely combine
    anyway.  */
 (define_split
-  [(set (match_operand:DF 0 "register_operand")
+  [(set (match_operand:DF 0 "sse_reg_operand")
         (float_extend:DF
 	  (match_operand:SF 1 "nonimmediate_operand")))]
   "TARGET_USE_VECTOR_FP_CONVERTS
    && optimize_insn_for_speed_p ()
-   && reload_completed && SSE_REG_P (operands[0])
+   && reload_completed
    && (!EXT_REX_SSE_REG_P (operands[0])
        || TARGET_AVX512VL)"
    [(set (match_dup 2)
@@ -4253,13 +4253,11 @@ 
     {
       /* If it is unsafe to overwrite upper half of source, we need
 	 to move to destination and unpack there.  */
-      if (((ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
-	    || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 4)
-	   && true_regnum (operands[0]) != true_regnum (operands[1]))
+      if (REGNO (operands[0]) != REGNO (operands[1])
 	  || (EXT_REX_SSE_REG_P (operands[1])
 	      && !TARGET_AVX512VL))
 	{
-	  rtx tmp = gen_rtx_REG (SFmode, true_regnum (operands[0]));
+	  rtx tmp = lowpart_subreg (SFmode, operands[0], DFmode);
 	  emit_move_insn (tmp, operands[1]);
 	}
       else
@@ -4267,7 +4265,7 @@ 
       /* FIXME: vec_interleave_lowv4sf for AVX512VL should allow
 	 =v, v, then vbroadcastss will be only needed for AVX512F without
 	 AVX512VL.  */
-      if (!EXT_REX_SSE_REGNO_P (true_regnum (operands[3])))
+      if (!EXT_REX_SSE_REGNO_P (REGNO (operands[3])))
 	emit_insn (gen_vec_interleave_lowv4sf (operands[3], operands[3],
 					       operands[3]));
       else
@@ -4283,15 +4281,14 @@ 
 
 ;; It's more profitable to split and then extend in the same register.
 (define_peephole2
-  [(set (match_operand:DF 0 "register_operand")
+  [(set (match_operand:DF 0 "sse_reg_operand")
 	(float_extend:DF
 	  (match_operand:SF 1 "memory_operand")))]
   "TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS
-   && optimize_insn_for_speed_p ()
-   && SSE_REG_P (operands[0])"
+   && optimize_insn_for_speed_p ()"
   [(set (match_dup 2) (match_dup 1))
    (set (match_dup 0) (float_extend:DF (match_dup 2)))]
-  "operands[2] = gen_rtx_REG (SFmode, REGNO (operands[0]));")
+  "operands[2] = lowpart_subreg (SFmode, operands[0], DFmode);")
 
 (define_insn "*extendsfdf2"
   [(set (match_operand:DF 0 "nonimm_ssenomem_operand" "=f,m,v")
@@ -4390,12 +4387,12 @@ 
    that might lead to ICE on 32bit target.  The sequence unlikely combine
    anyway.  */
 (define_split
-  [(set (match_operand:SF 0 "register_operand")
+  [(set (match_operand:SF 0 "sse_reg_operand")
         (float_truncate:SF
 	  (match_operand:DF 1 "nonimmediate_operand")))]
   "TARGET_USE_VECTOR_FP_CONVERTS
    && optimize_insn_for_speed_p ()
-   && reload_completed && SSE_REG_P (operands[0])
+   && reload_completed
    && (!EXT_REX_SSE_REG_P (operands[0])
        || TARGET_AVX512VL)"
    [(set (match_dup 2)
@@ -4413,9 +4410,7 @@ 
   if (REG_P (operands[1]))
     {
       if (!TARGET_SSE3
-	  && true_regnum (operands[0]) != true_regnum (operands[1])
-	  && (ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
-	      || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 8))
+	  && REGNO (operands[0]) != REGNO (operands[1]))
 	{
 	  rtx tmp = lowpart_subreg (DFmode, operands[0], SFmode);
 	  emit_move_insn (tmp, operands[1]);
@@ -4432,15 +4427,14 @@ 
 
 ;; It's more profitable to split and then extend in the same register.
 (define_peephole2
-  [(set (match_operand:SF 0 "register_operand")
+  [(set (match_operand:SF 0 "sse_reg_operand")
 	(float_truncate:SF
 	  (match_operand:DF 1 "memory_operand")))]
   "TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS
-   && optimize_insn_for_speed_p ()
-   && SSE_REG_P (operands[0])"
+   && optimize_insn_for_speed_p ()"
   [(set (match_dup 2) (match_dup 1))
    (set (match_dup 0) (float_truncate:SF (match_dup 2)))]
-  "operands[2] = gen_rtx_REG (DFmode, REGNO (operands[0]));")
+  "operands[2] = lowpart_subreg (DFmode, operands[0], SFmode);")
 
 (define_expand "truncdfsf2_with_temp"
   [(parallel [(set (match_operand:SF 0)
@@ -4547,7 +4541,7 @@ 
   "reload_completed"
   [(set (match_dup 2) (match_dup 1))
    (set (match_dup 0) (match_dup 2))]
-  "operands[1] = gen_rtx_REG (SFmode, true_regnum (operands[1]));")
+  "operands[1] = gen_rtx_REG (SFmode, REGNO (operands[1]));")
 
 ;; Conversion from XFmode to {SF,DF}mode
 
@@ -5153,11 +5147,11 @@ 
 ;; slots when !TARGET_INTER_UNIT_MOVES_TO_VEC disables the general_regs
 ;; alternative in sse2_loadld.
 (define_split
-  [(set (match_operand:MODEF 0 "register_operand")
+  [(set (match_operand:MODEF 0 "sse_reg_operand")
 	(float:MODEF (match_operand:SI 1 "nonimmediate_operand")))]
-  "TARGET_SSE2 && TARGET_SSE_MATH
-   && TARGET_USE_VECTOR_CONVERTS && optimize_function_for_speed_p (cfun)
-   && reload_completed && SSE_REG_P (operands[0])
+  "TARGET_USE_VECTOR_CONVERTS
+   && optimize_function_for_speed_p (cfun)
+   && reload_completed
    && (MEM_P (operands[1]) || TARGET_INTER_UNIT_MOVES_TO_VEC)
    && (!EXT_REX_SSE_REG_P (operands[0])
        || TARGET_AVX512VL)"
@@ -5176,41 +5170,43 @@ 
   DONE;
 })
 
-;; Avoid partial SSE register dependency stalls
+;; Avoid partial SSE register dependency stalls.  This splitter should split
+;; late in the pass sequence (after register rename pass), so allocated
+;; registers won't change anymore
+
 (define_split
-  [(set (match_operand:MODEF 0 "register_operand")
+  [(set (match_operand:MODEF 0 "sse_reg_operand")
 	(float:MODEF (match_operand:SWI48 1 "nonimmediate_operand")))]
-  "TARGET_SSE2 && TARGET_SSE_MATH
-   && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+  "TARGET_SSE_PARTIAL_REG_DEPENDENCY
    && optimize_function_for_speed_p (cfun)
-   && reload_completed && SSE_REG_P (operands[0])
+   && epilogue_completed
    && (!EXT_REX_SSE_REG_P (operands[0])
        || TARGET_AVX512VL)"
-  [(const_int 0)]
+  [(set (match_dup 0)
+	(vec_merge:<MODEF:ssevecmode>
+	  (vec_duplicate:<MODEF:ssevecmode>
+	    (float:MODEF
+	      (match_dup 1)))
+	  (match_dup 0)
+	  (const_int 1)))]
 {
   const machine_mode vmode = <MODEF:ssevecmode>mode;
-  const machine_mode mode = <MODEF:MODE>mode;
-  rtx t, op0 = lowpart_subreg (vmode, operands[0], mode);
-
-  emit_move_insn (op0, CONST0_RTX (vmode));
 
-  t = gen_rtx_FLOAT (mode, operands[1]);
-  t = gen_rtx_VEC_DUPLICATE (vmode, t);
-  t = gen_rtx_VEC_MERGE (vmode, t, op0, const1_rtx);
-  emit_insn (gen_rtx_SET (op0, t));
-  DONE;
+  operands[0] = lowpart_subreg (vmode, operands[0], <MODEF:MODE>mode);
+  emit_move_insn (operands[0], CONST0_RTX (vmode));
 })
 
-;; Break partial reg stall for cvtsd2ss.
+;; Break partial reg stall for cvtsd2ss.  This splitter should split
+;; late in the pass sequence (after register rename pass),
+;; so allocated registers won't change anymore.
 
-(define_peephole2
-  [(set (match_operand:SF 0 "register_operand")
+(define_split
+  [(set (match_operand:SF 0 "sse_reg_operand")
         (float_truncate:SF
 	  (match_operand:DF 1 "nonimmediate_operand")))]
-  "TARGET_SSE2 && TARGET_SSE_MATH
-   && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+  "TARGET_SSE_PARTIAL_REG_DEPENDENCY
    && optimize_function_for_speed_p (cfun)
-   && SSE_REG_P (operands[0])
+   && epilogue_completed
    && (!SSE_REG_P (operands[1])
        || REGNO (operands[0]) != REGNO (operands[1]))
    && (!EXT_REX_SSE_REG_P (operands[0])
@@ -5228,16 +5224,17 @@ 
   emit_move_insn (operands[0], CONST0_RTX (V4SFmode));
 })
 
-;; Break partial reg stall for cvtss2sd.
+;; Break partial reg stall for cvtss2sd.  This splitter should split
+;; late in the pass sequence (after register rename pass),
+;; so allocated registers won't change anymore.
 
-(define_peephole2
-  [(set (match_operand:DF 0 "register_operand")
+(define_split
+  [(set (match_operand:DF 0 "sse_reg_operand")
         (float_extend:DF
           (match_operand:SF 1 "nonimmediate_operand")))]
-  "TARGET_SSE2 && TARGET_SSE_MATH
-   && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+  "TARGET_SSE_PARTIAL_REG_DEPENDENCY
    && optimize_function_for_speed_p (cfun)
-   && SSE_REG_P (operands[0])
+   && epilogue_completed
    && (!SSE_REG_P (operands[1])
        || REGNO (operands[0]) != REGNO (operands[1]))
    && (!EXT_REX_SSE_REG_P (operands[0])