Patchwork [i386] : Enable TFmode moves via XMM registers for 32bit SSE targets

login
register
mail settings
Submitter Uros Bizjak
Date May 13, 2012, 5:59 p.m.
Message ID <CAFULd4YQ-Q881OtUc4i5JiZaQ3N6iXAiARjLM2+8XPOSHM9Wkg@mail.gmail.com>
Download mbox | patch
Permalink /patch/158849/
State New
Headers show

Comments

Uros Bizjak - May 13, 2012, 5:59 p.m.
Hello!

With all the recent mode handling cleanups to move patterns and SSE
bitops, it is now possible to enable TFmode moves via XMM registers
for 32-bit SSE targets. The compiler emits packed single-precision
operations in this case, so the following testcase:

--cut here--
__float128 test_abs (__float128 a)
{
  return (__builtin_fabsq (a));
}

__float128 test_copysign (__float128 a, __float128 b)
{
  return (__builtin_copysignq (a, b));
}
--cut here--

compiles with "-O2 -msse" to:

test_abs:
        movl    4(%esp), %eax
        movaps  20(%esp), %xmm0
        andps   .LC0, %xmm0
        movaps  %xmm0, (%eax)
        ret     $4

test_copysign:
        movaps  20(%esp), %xmm0
        movaps  36(%esp), %xmm1
        movl    4(%esp), %eax
        andps   .LC1, %xmm1
        andps   .LC0, %xmm0
        orps    %xmm1, %xmm0
        movaps  %xmm0, (%eax)
        ret     $4

For comparison, with -msse2 the compiler generates:

test_abs:
        movl    4(%esp), %eax
        movdqa  20(%esp), %xmm0
        pand    .LC0, %xmm0
        movdqa  %xmm0, (%eax)
        ret     $4

test_copysign:
        movl    4(%esp), %eax
        movdqa  20(%esp), %xmm0
        movdqa  36(%esp), %xmm1
        pand    .LC0, %xmm0
        pand    .LC1, %xmm1
        por     %xmm1, %xmm0
        movdqa  %xmm0, (%eax)
        ret     $4

With an unpatched 4.7 compiler, the same code compiles (-O2 -msse) to
some 40 SImode moves, with calls to __fabstf2 and __copysigntf2.

2012-05-13  Uros Bizjak  <ubizjak@gmail.com>

	* config/i386/i386.md (*pushtf): Enable for TARGET_SSE.
	(pushtf splitter): Ditto.
	(movtf): Ditto.
	(*movtf_internal): Ditto. Use V4SFmode for !TARGET_SSE2.
	(<code>tf2): Enable for TARGET_SSE.
	(*absnegtf2_sse): Ditto.
	(copysign<mode>3): Enable TFmode for TARGET_SSE.
	(copysign<mode>3_const): Ditto.
	(copysign<mode>3_var): Ditto.
	* config/i386/sse.md (<code>tf3): Enable for TARGET_SSE.
	(*andnottf3): Ditto.  Use V4SFmode for !TARGET_SSE2.
	(*<code>tf3): Ditto.
	* config/i386/i386.c (struct builtin_description bdesc_args)
	<IX86_BUILTIN_FABSQ>: Enable for TARGET_SSE.
	<IX86_BUILTIN_COPYSIGNQ>: Ditto.
	(ix86_expand_builtin) <IX86_BUILTIN_FABSQ, IX86_BUILTIN_COPYSIGNQ>:
	Emit a normal call if SSE isn't available.

Patch was bootstrapped and regression tested on x86_64-pc-linux-gnu
{,-m32} and committed to mainline SVN.

Uros.

Patch

Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 187435)
+++ config/i386/i386.c	(working copy)
@@ -26327,6 +26327,9 @@  static const struct builtin_description bdesc_args
   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
 
+  { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
+
   /* SSE MMX or 3Dnow!A */
   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
@@ -26510,9 +26513,6 @@  static const struct builtin_description bdesc_args
 
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
 
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
-
   { OPTION_MASK_ISA_SSE, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
 
   /* SSE2 MMX */
@@ -28081,7 +28081,7 @@  ix86_init_builtins (void)
   def_builtin_const (0, "__builtin_huge_valq",
 		     FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
 
-  /* We will expand them to normal call if SSE2 isn't available since
+  /* We will expand them to normal call if SSE isn't available since
      they are used by libgcc. */
   t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
   t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
@@ -30215,8 +30215,8 @@  rdrand_step:
 	{
 	case IX86_BUILTIN_FABSQ:
 	case IX86_BUILTIN_COPYSIGNQ:
-	  if (!TARGET_SSE2)
-	    /* Emit a normal call if SSE2 isn't available.  */
+	  if (!TARGET_SSE)
+	    /* Emit a normal call if SSE isn't available.  */
 	    return expand_call (exp, target, ignore);
 	default:
 	  return ix86_expand_args_builtin (d, exp, target);
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md	(revision 187435)
+++ config/i386/i386.md	(working copy)
@@ -2708,7 +2708,7 @@ 
 (define_insn "*pushtf"
   [(set (match_operand:TF 0 "push_operand" "=<,<,<")
 	(match_operand:TF 1 "general_no_elim_operand" "x,Fo,*r"))]
-  "TARGET_SSE2"
+  "TARGET_SSE"
 {
   /* This insn should be already split before reg-stack.  */
   gcc_unreachable ();
@@ -2721,7 +2721,7 @@ 
 (define_split
   [(set (match_operand:TF 0 "push_operand")
 	(match_operand:TF 1 "sse_reg_operand"))]
-  "TARGET_SSE2 && reload_completed"
+  "TARGET_SSE && reload_completed"
   [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (const_int -16)))
    (set (mem:TF (reg:P SP_REG)) (match_dup 1))])
 
@@ -2859,7 +2859,7 @@ 
 (define_expand "movtf"
   [(set (match_operand:TF 0 "nonimmediate_operand")
 	(match_operand:TF 1 "nonimmediate_operand"))]
-  "TARGET_SSE2"
+  "TARGET_SSE"
 {
   ix86_expand_move (TFmode, operands);
   DONE;
@@ -2874,7 +2874,7 @@ 
 (define_insn "*movtf_internal"
   [(set (match_operand:TF 0 "nonimmediate_operand" "=x,x ,m,?*r ,!o")
 	(match_operand:TF 1 "general_operand"	   "C ,xm,x,*roF,F*r"))]
-  "TARGET_SSE2
+  "TARGET_SSE
    && !(MEM_P (operands[0]) && MEM_P (operands[1]))
    && (!can_create_pseudo_p ()
        || (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE)
@@ -2929,7 +2929,8 @@ 
 		 (const_string "V4SF")
 	       (match_test "TARGET_AVX")
 		 (const_string "TI")
-	       (match_test "optimize_function_for_size_p (cfun)")
+	       (ior (not (match_test "TARGET_SSE2"))
+		    (match_test "optimize_function_for_size_p (cfun)"))
 		 (const_string "V4SF")
 	       ]
 	       (const_string "TI")))])
@@ -8710,7 +8711,7 @@ 
 (define_expand "<code>tf2"
   [(set (match_operand:TF 0 "register_operand")
 	(absneg:TF (match_operand:TF 1 "register_operand")))]
-  "TARGET_SSE2"
+  "TARGET_SSE"
   "ix86_expand_fp_absneg_operator (<CODE>, TFmode, operands); DONE;")
 
 (define_insn "*absnegtf2_sse"
@@ -8719,7 +8720,7 @@ 
 	  [(match_operand:TF 1 "register_operand" "0,x")]))
    (use (match_operand:TF 2 "nonimmediate_operand" "xm,0"))
    (clobber (reg:CC FLAGS_REG))]
-  "TARGET_SSE2"
+  "TARGET_SSE"
   "#")
 
 ;; Splitters for fp abs and neg.
@@ -8898,7 +8899,7 @@ 
    (match_operand:CSGNMODE 1 "nonmemory_operand")
    (match_operand:CSGNMODE 2 "register_operand")]
   "(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
-   || (TARGET_SSE2 && (<MODE>mode == TFmode))"
+   || (TARGET_SSE && (<MODE>mode == TFmode))"
   "ix86_expand_copysign (operands); DONE;")
 
 (define_insn_and_split "copysign<mode>3_const"
@@ -8909,7 +8910,7 @@ 
 	   (match_operand:<CSGNVMODE> 3 "nonimmediate_operand" "xm")]
 	  UNSPEC_COPYSIGN))]
   "(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
-   || (TARGET_SSE2 && (<MODE>mode == TFmode))"
+   || (TARGET_SSE && (<MODE>mode == TFmode))"
   "#"
   "&& reload_completed"
   [(const_int 0)]
@@ -8925,7 +8926,7 @@ 
 	  UNSPEC_COPYSIGN))
    (clobber (match_scratch:<CSGNVMODE> 1 "=x,x,x,x,x"))]
   "(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
-   || (TARGET_SSE2 && (<MODE>mode == TFmode))"
+   || (TARGET_SSE && (<MODE>mode == TFmode))"
   "#")
 
 (define_split
@@ -8938,7 +8939,7 @@ 
 	  UNSPEC_COPYSIGN))
    (clobber (match_scratch:<CSGNVMODE> 1))]
   "((SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
-    || (TARGET_SSE2 && (<MODE>mode == TFmode)))
+    || (TARGET_SSE && (<MODE>mode == TFmode)))
    && reload_completed"
   [(const_int 0)]
   "ix86_split_copysign_var (operands); DONE;")