[i386]: Fix PR57954 - AVX missing vxorps (zeroing) before vcvtsi2s %edx slows down AVX code

Submitter Uros Bizjak
Date July 29, 2013, 11:22 a.m.
Message ID <CAFULd4Y8EmiGQ=iKTtsihMsVstKpBtK7Mt4p1SpwhBu7jBradA@mail.gmail.com>
Permalink /patch/262725/
State New

Hello!

The attached patch (in fact a variant of HJ's patch) implements
clearing of the SSE target register before cvt* instructions, the same
approach ICC takes.
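
For illustration, here is a minimal example (my own, not the testcase
from the PR) together with the code generation this change targets:

/* With -O2 -mavx on x86_64, this used to compile to a lone

       vcvtsi2ss %edi, %xmm0, %xmm0

   which merges its result into the stale upper bits of %xmm0 and so
   carries a false dependency on the last instruction that wrote
   %xmm0.  With the new splitter (guarded by
   TARGET_SSE_PARTIAL_REG_DEPENDENCY and
   optimize_function_for_speed_p), the destination is zeroed first,
   as ICC does:

       vxorps    %xmm0, %xmm0, %xmm0
       vcvtsi2ss %edi, %xmm0, %xmm0  */

float
int_to_float (int x)
{
  return (float) x;
}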

While there, the patch also drops the SUBREG checks from the
post-reload splitters: after reload, subregs of hard registers have
already been simplified, so SSE_REG_P on the operand suffices.
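
Concretely, every affected splitter condition collapses from

   && reload_completed
   && (SSE_REG_P (operands[0])
       || (GET_CODE (operands[0]) == SUBREG
	   && SSE_REG_P (SUBREG_REG (operands[0]))))

to

   && reload_completed && SSE_REG_P (operands[0])

as the hunks below show.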

2013-07-29  Uros Bizjak  <ubizjak@gmail.com>

    * config/i386/i386.md (float post-reload splitters): Do not check
    for subregs of SSE registers.

2013-07-29  Uros Bizjak  <ubizjak@gmail.com>
        H.J. Lu  <hongjiu.lu@intel.com>

    PR target/57954
    PR target/57988
    * config/i386/i386.md (post-reload splitter
    to avoid partial SSE reg dependency stalls): New.

The patch was bootstrapped and regression tested on x86_64-pc-linux-gnu
{,-m32} and committed to mainline.

Uros.

Patch

Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md	(revision 201298)
+++ config/i386/i386.md	(working copy)
@@ -4596,10 +4596,7 @@ 
    (clobber (match_operand:SWI48 2 "memory_operand"))]
   "SSE_FLOAT_MODE_P (<MODEF:MODE>mode) && TARGET_MIX_SSE_I387
    && TARGET_INTER_UNIT_CONVERSIONS
-   && reload_completed
-   && (SSE_REG_P (operands[0])
-       || (GET_CODE (operands[0]) == SUBREG
-	   && SSE_REG_P (SUBREG_REG (operands[0]))))"
+   && reload_completed && SSE_REG_P (operands[0])"
   [(set (match_dup 0) (float:MODEF (match_dup 1)))])
 
 (define_split
@@ -4608,10 +4605,7 @@ 
    (clobber (match_operand:SWI48 2 "memory_operand"))]
   "SSE_FLOAT_MODE_P (<MODEF:MODE>mode) && TARGET_MIX_SSE_I387
    && !(TARGET_INTER_UNIT_CONVERSIONS || optimize_function_for_size_p (cfun))
-   && reload_completed
-   && (SSE_REG_P (operands[0])
-       || (GET_CODE (operands[0]) == SUBREG
-	   && SSE_REG_P (SUBREG_REG (operands[0]))))"
+   && reload_completed && SSE_REG_P (operands[0])"
   [(set (match_dup 2) (match_dup 1))
    (set (match_dup 0) (float:MODEF (match_dup 2)))])
 
@@ -4697,10 +4691,7 @@ 
    (clobber (match_operand:SI 2 "memory_operand"))]
   "TARGET_SSE2 && TARGET_SSE_MATH
    && TARGET_USE_VECTOR_CONVERTS && optimize_function_for_speed_p (cfun)
-   && reload_completed
-   && (SSE_REG_P (operands[0])
-       || (GET_CODE (operands[0]) == SUBREG
-	   && SSE_REG_P (SUBREG_REG (operands[0]))))"
+   && reload_completed && SSE_REG_P (operands[0])"
   [(const_int 0)]
 {
   rtx op1 = operands[1];
@@ -4740,10 +4731,7 @@ 
    (clobber (match_operand:SI 2 "memory_operand"))]
   "TARGET_SSE2 && TARGET_SSE_MATH
    && TARGET_USE_VECTOR_CONVERTS && optimize_function_for_speed_p (cfun)
-   && reload_completed
-   && (SSE_REG_P (operands[0])
-       || (GET_CODE (operands[0]) == SUBREG
-	   && SSE_REG_P (SUBREG_REG (operands[0]))))"
+   && reload_completed && SSE_REG_P (operands[0])"
   [(const_int 0)]
 {
   operands[3] = simplify_gen_subreg (<ssevecmode>mode, operands[0],
@@ -4764,10 +4752,7 @@ 
 	(float:MODEF (match_operand:SI 1 "register_operand")))]
   "TARGET_SSE2 && TARGET_SSE_MATH
    && TARGET_USE_VECTOR_CONVERTS && optimize_function_for_speed_p (cfun)
-   && reload_completed
-   && (SSE_REG_P (operands[0])
-       || (GET_CODE (operands[0]) == SUBREG
-	   && SSE_REG_P (SUBREG_REG (operands[0]))))"
+   && reload_completed && SSE_REG_P (operands[0])"
   [(const_int 0)]
 {
   rtx op1 = operands[1];
@@ -4810,10 +4795,7 @@ 
 	(float:MODEF (match_operand:SI 1 "memory_operand")))]
   "TARGET_SSE2 && TARGET_SSE_MATH
    && TARGET_USE_VECTOR_CONVERTS && optimize_function_for_speed_p (cfun)
-   && reload_completed
-   && (SSE_REG_P (operands[0])
-       || (GET_CODE (operands[0]) == SUBREG
-	   && SSE_REG_P (SUBREG_REG (operands[0]))))"
+   && reload_completed && SSE_REG_P (operands[0])"
   [(const_int 0)]
 {
   operands[3] = simplify_gen_subreg (<ssevecmode>mode, operands[0],
@@ -4872,10 +4854,7 @@ 
    (clobber (match_operand:SWI48 2 "memory_operand"))]
   "SSE_FLOAT_MODE_P (<MODEF:MODE>mode) && TARGET_SSE_MATH
    && (TARGET_INTER_UNIT_CONVERSIONS || optimize_function_for_size_p (cfun))
-   && reload_completed
-   && (SSE_REG_P (operands[0])
-       || (GET_CODE (operands[0]) == SUBREG
-	   && SSE_REG_P (SUBREG_REG (operands[0]))))"
+   && reload_completed && SSE_REG_P (operands[0])"
   [(set (match_dup 0) (float:MODEF (match_dup 1)))])
 
 (define_insn "*float<SWI48:mode><MODEF:mode>2_sse_nointerunit"
@@ -4905,10 +4884,7 @@ 
    (clobber (match_operand:SWI48 2 "memory_operand"))]
   "SSE_FLOAT_MODE_P (<MODEF:MODE>mode) && TARGET_SSE_MATH
    && !(TARGET_INTER_UNIT_CONVERSIONS || optimize_function_for_size_p (cfun))
-   && reload_completed
-   && (SSE_REG_P (operands[0])
-       || (GET_CODE (operands[0]) == SUBREG
-	   && SSE_REG_P (SUBREG_REG (operands[0]))))"
+   && reload_completed && SSE_REG_P (operands[0])"
   [(set (match_dup 2) (match_dup 1))
    (set (match_dup 0) (float:MODEF (match_dup 2)))])
 
@@ -4917,10 +4893,7 @@ 
 	(float:MODEF (match_operand:SWI48 1 "memory_operand")))
    (clobber (match_operand:SWI48 2 "memory_operand"))]
   "SSE_FLOAT_MODE_P (<MODEF:MODE>mode) && TARGET_SSE_MATH
-   && reload_completed
-   && (SSE_REG_P (operands[0])
-       || (GET_CODE (operands[0]) == SUBREG
-	   && SSE_REG_P (SUBREG_REG (operands[0]))))"
+   && reload_completed && SSE_REG_P (operands[0])"
   [(set (match_dup 0) (float:MODEF (match_dup 1)))])
 
 (define_insn "*float<SWI48x:mode><X87MODEF:mode>2_i387_with_temp"
@@ -4968,6 +4941,46 @@ 
    && reload_completed"
   [(set (match_dup 0) (float:X87MODEF (match_dup 1)))])
 
+;; Avoid partial SSE register dependency stalls
+
+(define_split
+  [(set (match_operand:MODEF 0 "register_operand")
+	(float:MODEF (match_operand:SI 1 "nonimmediate_operand")))]
+  "TARGET_SSE2 && TARGET_SSE_MATH
+   && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+   && optimize_function_for_speed_p (cfun)
+   && reload_completed && SSE_REG_P (operands[0])"
+  [(set (match_dup 0)
+	(vec_merge:<ssevecmode>
+	  (vec_duplicate:<ssevecmode>
+	    (float:MODEF (match_dup 1)))
+	  (match_dup 0)
+	  (const_int 1)))]
+{
+  operands[0] = simplify_gen_subreg (<ssevecmode>mode, operands[0],
+				     <MODE>mode, 0);
+  emit_move_insn (operands[0], CONST0_RTX (<ssevecmode>mode));
+})
+
+(define_split
+  [(set (match_operand:MODEF 0 "register_operand")
+	(float:MODEF (match_operand:DI 1 "nonimmediate_operand")))]
+  "TARGET_64BIT && TARGET_SSE2 && TARGET_SSE_MATH
+   && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+   && optimize_function_for_speed_p (cfun)
+   && reload_completed && SSE_REG_P (operands[0])"
+  [(set (match_dup 0)
+	(vec_merge:<ssevecmode>
+	  (vec_duplicate:<ssevecmode>
+	    (float:MODEF (match_dup 1)))
+	  (match_dup 0)
+	  (const_int 1)))]
+{
+  operands[0] = simplify_gen_subreg (<ssevecmode>mode, operands[0],
+				     <MODE>mode, 0);
+  emit_move_insn (operands[0], CONST0_RTX (<ssevecmode>mode));
+})
+
 ;; Avoid store forwarding (partial memory) stall penalty
 ;; by passing DImode value through XMM registers.  */