Comments
Patch
===================================================================
@@ -14980,18 +14980,65 @@
DONE;
})
-(define_expand "signbit<mode>2"
+(define_expand "signbitxf2"
[(use (match_operand:SI 0 "register_operand" ""))
- (use (match_operand:X87MODEF 1 "register_operand" ""))]
+ (use (match_operand:XF 1 "register_operand" ""))]
+ "TARGET_USE_FANCY_MATH_387"
+{
+ rtx scratch = gen_reg_rtx (HImode);
+
+ emit_insn (gen_fxamxf2_i387 (scratch, operands[1]));
+ emit_insn (gen_andsi3 (operands[0],
+ gen_lowpart (SImode, scratch), GEN_INT (0x200)));
+ DONE;
+})
+
+(define_insn "movmsk_df"
+ [(set (match_operand:SI 0 "register_operand" "=r")
+ (unspec:SI
+ [(match_operand:DF 1 "register_operand" "x")]
+ UNSPEC_MOVMSK))]
+ "SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH"
+ "%vmovmskpd\t{%1, %0|%0, %1}"
+ [(set_attr "type" "ssemov")
+ (set_attr "prefix" "maybe_vex")
+ (set_attr "mode" "DF")])
+
+;; Use movmskpd in SSE mode to avoid store forwarding stall
+;; for 32bit targets and movq+shrq sequence for 64bit targets.
+(define_expand "signbitdf2"
+ [(use (match_operand:SI 0 "register_operand" ""))
+ (use (match_operand:DF 1 "register_operand" ""))]
"TARGET_USE_FANCY_MATH_387
- && !(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
+ || (SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH)"
{
- rtx mask = GEN_INT (0x0200);
+ if (SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH)
+ {
+ emit_insn (gen_movmsk_df (operands[0], operands[1]));
+ emit_insn (gen_andsi3 (operands[0], operands[0], const1_rtx));
+ }
+ else
+ {
+ rtx scratch = gen_reg_rtx (HImode);
+ emit_insn (gen_fxamdf2_i387 (scratch, operands[1]));
+ emit_insn (gen_andsi3 (operands[0],
+ gen_lowpart (SImode, scratch), GEN_INT (0x200)));
+ }
+ DONE;
+})
+
+(define_expand "signbitsf2"
+ [(use (match_operand:SI 0 "register_operand" ""))
+ (use (match_operand:SF 1 "register_operand" ""))]
+ "TARGET_USE_FANCY_MATH_387
+ && !(SSE_FLOAT_MODE_P (SFmode) && TARGET_SSE_MATH)"
+{
rtx scratch = gen_reg_rtx (HImode);
- emit_insn (gen_fxam<mode>2_i387 (scratch, operands[1]));
- emit_insn (gen_andsi3 (operands[0], gen_lowpart (SImode, scratch), mask));
+ emit_insn (gen_fxamsf2_i387 (scratch, operands[1]));
+ emit_insn (gen_andsi3 (operands[0],
+ gen_lowpart (SImode, scratch), GEN_INT (0x200)));
DONE;
})
Hello!

The attached patch optimizes the DFmode signbit function for SSE math. On 32-bit targets, generic code compiles

int test (double a) { return signbit (a + 1.0); }

to (-O2 -mfpmath=sse -msse2):

movlpd .LC0, %xmm0
addsd 8(%ebp), %xmm0
movsd %xmm0, -8(%ebp)
movl -4(%ebp), %eax

This creates a store forwarding (partial memory) stall. On 64-bit targets, generic code creates:

addsd .LC0(%rip), %xmm0
movq %xmm0, -8(%rsp)
movq -8(%rsp), %rax
shrq $63, %rax

shrq has high latency on AMD processors, and movq+shrq is always slower on Intel (movq also introduces a bypass delay on i7, since it operates in the "int" domain, whereas movmskpd operates in the "float" domain).

The attached patch generates:

movlpd .LC0, %xmm0
addsd 4(%esp), %xmm0
movmskpd %xmm0, %eax
andl $1, %eax

and

addsd .LC0(%rip), %xmm0
movmskpd %xmm0, %eax
andl $1, %eax

2010-09-24 Uros Bizjak <ubizjak@gmail.com>

* config/i386/i386.md (movmsk_df): New insn.
(signbitdf2): Split out of signbit<mode>2. Generate movmsk_df sequence for TARGET_SSE_MATH.

The patch was bootstrapped and regression tested on x86_64-pc-linux-gnu, and committed to mainline SVN.

Uros.