From patchwork Sun Sep 26 10:33:01 2010 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [i386] : Optimize DFmode signbit() for SSE math From: Uros Bizjak X-Patchwork-Id: 65780 Message-Id: To: gcc-patches@gcc.gnu.org Date: Sun, 26 Sep 2010 12:33:01 +0200 Hello! The attached patch optimizes the DFmode signbit function for SSE math. On 32-bit targets, the generic code compiles int test (double a) { return signbit (a + 1.0); } to (-O2 -mfpmath=sse -msse2): movlpd .LC0, %xmm0 addsd 8(%ebp), %xmm0 movsd %xmm0, -8(%ebp) movl -4(%ebp), %eax This creates a store forwarding (partial memory) stall. On 64-bit targets, the generic code creates: addsd .LC0(%rip), %xmm0 movq %xmm0, -8(%rsp) movq -8(%rsp), %rax shrq $63, %rax shrq has high latency on AMD processors, and movq+shrq is always slower on Intel (movq also introduces a bypass delay on i7, since it operates in the "int" domain, whereas movmskpd operates in the "float" domain). The attached patch generates: movlpd .LC0, %xmm0 addsd 4(%esp), %xmm0 movmskpd %xmm0, %eax andl $1, %eax and addsd .LC0(%rip), %xmm0 movmskpd %xmm0, %eax andl $1, %eax 2010-09-24 Uros Bizjak * config/i386/i386.md (movmsk_df): New insn. (signbitdf): Split out of signbit2. Generate movmsk_df sequence for TARGET_SSE_MATH. The patch was bootstrapped and regression tested on x86_64-pc-linux-gnu, and committed to mainline SVN. Uros. 
Index: config/i386/i386.md =================================================================== --- config/i386/i386.md (revision 164628) +++ config/i386/i386.md (working copy) @@ -14980,18 +14980,65 @@ DONE; }) -(define_expand "signbit2" +(define_expand "signbitxf2" [(use (match_operand:SI 0 "register_operand" "")) - (use (match_operand:X87MODEF 1 "register_operand" ""))] + (use (match_operand:XF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387" +{ + rtx scratch = gen_reg_rtx (HImode); + + emit_insn (gen_fxamxf2_i387 (scratch, operands[1])); + emit_insn (gen_andsi3 (operands[0], + gen_lowpart (SImode, scratch), GEN_INT (0x200))); + DONE; +}) + +(define_insn "movmsk_df" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI + [(match_operand:DF 1 "register_operand" "x")] + UNSPEC_MOVMSK))] + "SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH" + "%vmovmskpd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "DF")]) + +;; Use movmskpd in SSE mode to avoid store forwarding stall +;; for 32bit targets and movq+shrq sequence for 64bit targets. 
+(define_expand "signbitdf2" + [(use (match_operand:SI 0 "register_operand" "")) + (use (match_operand:DF 1 "register_operand" ""))] "TARGET_USE_FANCY_MATH_387 - && !(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)" + || (SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH)" { - rtx mask = GEN_INT (0x0200); + if (SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH) + { + emit_insn (gen_movmsk_df (operands[0], operands[1])); + emit_insn (gen_andsi3 (operands[0], operands[0], const1_rtx)); + } + else + { + rtx scratch = gen_reg_rtx (HImode); + emit_insn (gen_fxamdf2_i387 (scratch, operands[1])); + emit_insn (gen_andsi3 (operands[0], + gen_lowpart (SImode, scratch), GEN_INT (0x200))); + } + DONE; +}) + +(define_expand "signbitsf2" + [(use (match_operand:SI 0 "register_operand" "")) + (use (match_operand:SF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && !(SSE_FLOAT_MODE_P (SFmode) && TARGET_SSE_MATH)" +{ rtx scratch = gen_reg_rtx (HImode); - emit_insn (gen_fxam2_i387 (scratch, operands[1])); - emit_insn (gen_andsi3 (operands[0], gen_lowpart (SImode, scratch), mask)); + emit_insn (gen_fxamsf2_i387 (scratch, operands[1])); + emit_insn (gen_andsi3 (operands[0], + gen_lowpart (SImode, scratch), GEN_INT (0x200))); DONE; })