Patchwork [i386] : Optimize DFmode signbit() for SSE math

login
register
mail settings
Submitter Uros Bizjak
Date Sept. 26, 2010, 10:33 a.m.
Message ID <AANLkTinni4Byp+OQvTEk-N6Pww-AR0WobCt=1jBuAg-J@mail.gmail.com>
Download mbox | patch
Permalink /patch/65780/
State New
Headers show

Comments

Uros Bizjak - Sept. 26, 2010, 10:33 a.m.
Hello!

The attached patch optimizes the DFmode signbit() function for SSE math.

On 32 bit targets, generic code compiles

int test (double a)
{
  return signbit (a + 1.0);
}

to (-O2 -mfpmath=sse -msse2):

	movlpd	.LC0, %xmm0
	addsd	8(%ebp), %xmm0
	movsd	%xmm0, -8(%ebp)
	movl	-4(%ebp), %eax

This creates a store-forwarding (partial memory) stall. On 64 bit
targets, generic code creates:

	addsd	.LC0(%rip), %xmm0
	movq	%xmm0, -8(%rsp)
	movq	-8(%rsp), %rax
	shrq	$63, %rax

shrq has high latency on AMD processors, and movq+shrq is always
slower on Intel (movq also introduces bypass delay on i7, since it
operates in "int" domain, where movmskpd operates in "float" domain).

The attached patch generates:

	movlpd	.LC0, %xmm0
	addsd	4(%esp), %xmm0
	movmskpd	%xmm0, %eax
	andl	$1, %eax

and

	addsd	.LC0(%rip), %xmm0
	movmskpd	%xmm0, %eax
	andl	$1, %eax

2010-09-24  Uros Bizjak  <ubizjak@gmail.com>

	* config/i386/i386.md (movmsk_df): New insn.
	(signbitdf): Split out of signbit<mode>2.  Generate movmsk_df
	sequence for TARGET_SSE_MATH.

Patch was bootstrapped and regression tested on x86_64-pc-linux-gnu,
committed to mainline SVN.

Uros.

Patch

Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md	(revision 164628)
+++ config/i386/i386.md	(working copy)
@@ -14980,18 +14980,65 @@ 
   DONE;
 })
 
-(define_expand "signbit<mode>2"
+(define_expand "signbitxf2"
   [(use (match_operand:SI 0 "register_operand" ""))
-   (use (match_operand:X87MODEF 1 "register_operand" ""))]
+   (use (match_operand:XF 1 "register_operand" ""))]
+  "TARGET_USE_FANCY_MATH_387"
+{
+  rtx scratch = gen_reg_rtx (HImode);
+
+  emit_insn (gen_fxamxf2_i387 (scratch, operands[1]));
+  emit_insn (gen_andsi3 (operands[0],
+	     gen_lowpart (SImode, scratch), GEN_INT (0x200)));
+  DONE;
+})
+
+(define_insn "movmsk_df"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI
+	  [(match_operand:DF 1 "register_operand" "x")]
+	  UNSPEC_MOVMSK))]
+  "SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH"
+  "%vmovmskpd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "DF")])
+
+;; Use movmskpd in SSE mode to avoid store forwarding stall
+;; for 32bit targets and movq+shrq sequence for 64bit targets.
+(define_expand "signbitdf2"
+  [(use (match_operand:SI 0 "register_operand" ""))
+   (use (match_operand:DF 1 "register_operand" ""))]
   "TARGET_USE_FANCY_MATH_387
-   && !(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
+   || (SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH)"
 {
-  rtx mask = GEN_INT (0x0200);
+  if (SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH)
+    {
+      emit_insn (gen_movmsk_df (operands[0], operands[1]));
+      emit_insn (gen_andsi3 (operands[0], operands[0], const1_rtx));
+    }
+  else
+    {
+      rtx scratch = gen_reg_rtx (HImode);
 
+      emit_insn (gen_fxamdf2_i387 (scratch, operands[1]));
+      emit_insn (gen_andsi3 (operands[0],
+		 gen_lowpart (SImode, scratch), GEN_INT (0x200)));
+    }
+  DONE;
+})
+
+(define_expand "signbitsf2"
+  [(use (match_operand:SI 0 "register_operand" ""))
+   (use (match_operand:SF 1 "register_operand" ""))]
+  "TARGET_USE_FANCY_MATH_387
+   && !(SSE_FLOAT_MODE_P (SFmode) && TARGET_SSE_MATH)"
+{
   rtx scratch = gen_reg_rtx (HImode);
 
-  emit_insn (gen_fxam<mode>2_i387 (scratch, operands[1]));
-  emit_insn (gen_andsi3 (operands[0], gen_lowpart (SImode, scratch), mask));
+  emit_insn (gen_fxamsf2_i387 (scratch, operands[1]));
+  emit_insn (gen_andsi3 (operands[0],
+	     gen_lowpart (SImode, scratch), GEN_INT (0x200)));
   DONE;
 })