Patchwork [v2,i386] : Expand round(a) = sgn(a) * trunc(fabs(a) + 0.5++) using SSE4 ROUND insn

login
register
mail settings
Submitter Uros Bizjak
Date Aug. 14, 2011, 7:11 p.m.
Message ID <CAFULd4aYPA5KdRi1PXVk4MaeE_y=_RKiQp618Ac9kgT65qv3AA@mail.gmail.com>
Download mbox | patch
Permalink /patch/109966/
State New
Headers show

Comments

Uros Bizjak - Aug. 14, 2011, 7:11 p.m.
On Sun, Aug 14, 2011 at 8:00 PM, Uros Bizjak <ubizjak@gmail.com> wrote:

>> We can use ROUNDSS/ROUNDSD in round(a) expansion. Currently, we expand
>> round(a) as (-O2 -ffast-math):
>
> I forgot to add that this expansion is performed only under
> flag_unsafe_math_optimizations due to the addition of 0.5. For the input
> of 0x1.fffffffffffffp-2, the new insn sequence returns 1.0.

Actually, using the algorithm proposed by Richi - sgn(a)*trunc(fabs(a)
+ 0.5++) - solves this failure.

New version of patch attached.

2011-08-14  Uros Bizjak  <ubizjak@gmail.com>

	* config/i386/i386.c (ix86_expand_round_sse4): New function.
	* config/i386/i386-protos.h (ix86_expand_round_sse4): New prototype.
	* config/i386/i386.md (round<mode>2): Use ix86_expand_round_sse4
	for TARGET_ROUND.

	(rint<mode>2): Simplify TARGET_ROUND check.
	(floor<mode>2): Ditto.
	(ceil<mode>2): Ditto.
	(btrunc<mode>2): Ditto.

Bootstrapped on x86_64-pc-linux-gnu {,-m32}, regression test still in progress.

Uros.

Patch

Index: i386.md
===================================================================
--- i386.md	(revision 177746)
+++ i386.md	(working copy)
@@ -14394,11 +14394,11 @@ 
   if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
       && !flag_trapping_math)
     {
-      if (!TARGET_ROUND && optimize_insn_for_size_p ())
-	FAIL;
       if (TARGET_ROUND)
 	emit_insn (gen_sse4_1_round<mode>2
 		   (operands[0], operands[1], GEN_INT (ROUND_MXCSR)));
+      else if (optimize_insn_for_size_p ())
+        FAIL;
       else
 	ix86_expand_rint (operand0, operand1);
     }
@@ -14431,7 +14431,12 @@ 
   if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
       && !flag_trapping_math && !flag_rounding_math)
     {
-      if (TARGET_64BIT || (<MODE>mode != DFmode))
+      if (TARGET_ROUND)
+        {
+	  operands[1] = force_reg (<MODE>mode, operands[1]);
+	  ix86_expand_round_sse4 (operands[0], operands[1]);
+	}
+      else if (TARGET_64BIT || (<MODE>mode != DFmode))
 	ix86_expand_round (operands[0], operands[1]);
       else
 	ix86_expand_rounddf_32 (operands[0], operands[1]);
@@ -14663,14 +14668,13 @@ 
        && !flag_trapping_math)"
 {
   if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
-      && !flag_trapping_math
-      && (TARGET_ROUND || optimize_insn_for_speed_p ()))
+      && !flag_trapping_math)
     {
-      if (!TARGET_ROUND && optimize_insn_for_size_p ())
-	FAIL;
       if (TARGET_ROUND)
 	emit_insn (gen_sse4_1_round<mode>2
 		   (operands[0], operands[1], GEN_INT (ROUND_FLOOR)));
+      else if (optimize_insn_for_size_p ())
+        FAIL;
       else if (TARGET_64BIT || (<MODE>mode != DFmode))
 	ix86_expand_floorceil (operand0, operand1, true);
       else
@@ -14922,8 +14926,7 @@ 
        && !flag_trapping_math)"
 {
   if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
-      && !flag_trapping_math
-      && (TARGET_ROUND || optimize_insn_for_speed_p ()))
+      && !flag_trapping_math)
     {
       if (TARGET_ROUND)
 	emit_insn (gen_sse4_1_round<mode>2
@@ -15179,8 +15182,7 @@ 
        && !flag_trapping_math)"
 {
   if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
-      && !flag_trapping_math
-      && (TARGET_ROUND || optimize_insn_for_speed_p ()))
+      && !flag_trapping_math)
     {
       if (TARGET_ROUND)
 	emit_insn (gen_sse4_1_round<mode>2
Index: i386-protos.h
===================================================================
--- i386-protos.h	(revision 177746)
+++ i386-protos.h	(working copy)
@@ -174,6 +174,7 @@  extern void ix86_expand_lfloorceil (rtx, rtx, bool
 extern void ix86_expand_rint (rtx, rtx);
 extern void ix86_expand_floorceil (rtx, rtx, bool);
 extern void ix86_expand_floorceildf_32 (rtx, rtx, bool);
+extern void ix86_expand_round_sse4 (rtx, rtx);
 extern void ix86_expand_round (rtx, rtx);
 extern void ix86_expand_rounddf_32 (rtx, rtx);
 extern void ix86_expand_trunc (rtx, rtx);
Index: i386.c
===================================================================
--- i386.c	(revision 177746)
+++ i386.c	(working copy)
@@ -32676,6 +32676,52 @@  ix86_expand_round (rtx operand0, rtx operand1)
 
   emit_move_insn (operand0, res);
 }
+
+/* Expand round (OP1) into OP0 using the SSE4 round insn, computed as
+   copysign (trunc (fabs (OP1) + nextafter (0.5, 0.0)), OP1).  */
+void
+ix86_expand_round_sse4 (rtx op0, rtx op1)
+{
+  enum machine_mode mode = GET_MODE (op0);
+  rtx e1, e2, e3, res, half, mask;
+  const struct real_format *fmt;
+  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
+  rtx (*gen_round) (rtx, rtx, rtx);
+
+  switch (mode)
+    {
+    case SFmode:
+      gen_round = gen_sse4_1_roundsf2;
+      break;
+    case DFmode:
+      gen_round = gen_sse4_1_rounddf2;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* e1 = fabs(op1); MASK is passed on to the copysign step below */
+  e1 = ix86_expand_sse_fabs (op1, &mask);
+
+  /* load nextafter (0.5, 0.0), built as 0.5 - 2**(-p - 1) */
+  fmt = REAL_MODE_FORMAT (mode);
+  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
+  REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
+
+  /* e2 = e1 + nextafter (0.5, 0.0), i.e. pred_half rather than exactly 0.5 */
+  half = force_reg (mode, const_double_from_real_value (pred_half, mode));
+  e2 = expand_simple_binop (mode, PLUS, e1, half, NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* e3 = trunc(e2), via the round insn with the ROUND_TRUNC immediate */
+  e3 = gen_reg_rtx (mode);
+  emit_insn (gen_round (e3, e2, GEN_INT (ROUND_TRUNC)));
+
+  /* res = copysign (e3, op1) */
+  res = gen_reg_rtx (mode);
+  ix86_sse_copysign_to_positive (res, e3, op1, mask);
+
+  emit_move_insn (op0, res);
+}
 
 
 /* Table of valid machine attributes.  */