Patchwork [i386] : Expand round(a) = sgn(a) * floor(fabs(a) + 0.5) using SSE4 ROUND insn

login
register
mail settings
Submitter Uros Bizjak
Date Aug. 14, 2011, 5:24 p.m.
Message ID <CAFULd4atDNoY-3gC9TkbTQ7P=r6fxtQ5F-Xf+Jk18fWP2Ga55g@mail.gmail.com>
Download mbox | patch
Permalink /patch/109962/
State New
Headers show

Comments

Uros Bizjak - Aug. 14, 2011, 5:24 p.m.
Hello!

We can use ROUNDSP/ROUNDSD in round(a) expansion. Currently, we expand
round(a) as (-O2 -ffast-math):

.LFB0:
	.cfi_startproc
	movsd	.LC1(%rip), %xmm1
	movapd	%xmm0, %xmm2
	movsd	.LC0(%rip), %xmm3
	andpd	%xmm1, %xmm2
	ucomisd	%xmm2, %xmm3
	jbe	.L2
	addsd	.LC2(%rip), %xmm2
	andnpd	%xmm0, %xmm1
	movapd	%xmm1, %xmm0
	cvttsd2siq	%xmm2, %rax
	cvtsi2sdq	%rax, %xmm2
	orpd	%xmm2, %xmm0
.L2:
	rep
	ret

Adding -msse4, we now generate branchless code using roundsd:

.LFB0:
	.cfi_startproc
	movsd	.LC0(%rip), %xmm2
	movapd	%xmm0, %xmm1
	andpd	%xmm2, %xmm1
	andnpd	%xmm0, %xmm2
	addsd	.LC1(%rip), %xmm1
	roundsd	$1, %xmm1, %xmm1
	orpd	%xmm2, %xmm1
	movapd	%xmm1, %xmm0
	ret

The patch also simplifies a couple of checks in related patterns.

2011-08-14  Uros Bizjak  <ubizjak@gmail.com>

	* config/i386/i386.c (ix86_expand_round_sse4): New function.
	* config/i386/i386-protos.h (ix86_expand_round_sse4): New prototype.
	* config/i386/i386.md (round<mode>2): Use ix86_expand_round_sse4
	for TARGET_ROUND.

	(rint<mode>2): Simplify TARGET_ROUND check.
	(floor<mode>2): Ditto.
	(ceil<mode>2): Ditto.
	(btrunc<mode>2): Ditto.

Bootstrapped and regression tested on x86_64-pc-linux-gnu {,-m32},
will be committed to mainline soon.

Uros.
Uros Bizjak - Aug. 14, 2011, 6 p.m.
On Sun, Aug 14, 2011 at 7:24 PM, Uros Bizjak <ubizjak@gmail.com> wrote:

> We can use ROUNDSP/ROUNDSD in round(a) expansion. Currently, we expand
> round(a) as (-O2 -ffast-math):

I forgot to add that this expansion is expanded only under
flag_unsafe_math_optimizations due to addition of 0.5. For the input
of 0x1.fffffffffffffp-2, new insn sequence returns 1.0.

Uros.
Richard Guenther - Aug. 15, 2011, 10:15 a.m.
On Sun, Aug 14, 2011 at 7:24 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
> Hello!
>
> We can use ROUNDSP/ROUNDSD in round(a) expansion. Currently, we expand
> round(a) as (-O2 -ffast-math):
>
> .LFB0:
>        .cfi_startproc
>        movsd   .LC1(%rip), %xmm1
>        movapd  %xmm0, %xmm2
>        movsd   .LC0(%rip), %xmm3
>        andpd   %xmm1, %xmm2
>        ucomisd %xmm2, %xmm3
>        jbe     .L2
>        addsd   .LC2(%rip), %xmm2
>        andnpd  %xmm0, %xmm1
>        movapd  %xmm1, %xmm0
>        cvttsd2siq      %xmm2, %rax
>        cvtsi2sdq       %rax, %xmm2
>        orpd    %xmm2, %xmm0
> .L2:
>        rep
>        ret
>
> Adding -msse4, we now generate branchless code using roundsd:
>
> .LFB0:
>        .cfi_startproc
>        movsd   .LC0(%rip), %xmm2
>        movapd  %xmm0, %xmm1
>        andpd   %xmm2, %xmm1
>        andnpd  %xmm0, %xmm2
>        addsd   .LC1(%rip), %xmm1
>        roundsd $1, %xmm1, %xmm1
>        orpd    %xmm2, %xmm1
>        movapd  %xmm1, %xmm0
>        ret

Hm, why do we need the sign-copy?  If I read the docs correctly
we can simply use roundsd directly, no?

> The patch also simplifies a couple of checks in related patterns.
>
> 2011-08-14  Uros Bizjak  <ubizjak@gmail.com>
>
>        * config/i386/i386.c (ix86_expand_round_sse4): New function.
>        * config/i386/i386-protos.h (ix86_expand_round_sse4): New prototype.
>        * config/i386/i386.md (round<mode>2): Use ix86_expand_round_sse4
>        for TARGET_ROUND.
>
>        (rint<mode>2): Simplify TARGET_ROUND check.
>        (floor<mode>2): Ditto.
>        (ceil<mode>2): Ditto.
>        (btrunc<mode>2): Ditto.
>
> Bootstrapped and regression tested on x86_64-pc-linux-gnu {,-m32},
> will be committed to mainline soon.
>
> Uros.
>
Michael Matz - Aug. 15, 2011, 2:47 p.m.
Hi,

On Mon, 15 Aug 2011, Richard Guenther wrote:

> > Adding -msse4, we now generate branchless code using roundsd:
> >
> > .LFB0:
> >        .cfi_startproc
> >        movsd   .LC0(%rip), %xmm2
> >        movapd  %xmm0, %xmm1
> >        andpd   %xmm2, %xmm1
> >        andnpd  %xmm0, %xmm2
> >        addsd   .LC1(%rip), %xmm1
> >        roundsd $1, %xmm1, %xmm1
> >        orpd    %xmm2, %xmm1
> >        movapd  %xmm1, %xmm0
> >        ret
> 
> Hm, why do we need the sign-copy?  If I read the docs correctly
> we can simply use roundsd directly, no?

round-half-away-from-zero breaks your neck.  round[ps][sd] only supports 
the usual four IEEE rounding modes.


Ciao,
Michael.
Michael Matz - Aug. 15, 2011, 3:25 p.m.
Hi,

On Mon, 15 Aug 2011, Michael Matz wrote:

> > > .LFB0:
> > >        .cfi_startproc
> > >        movsd   .LC0(%rip), %xmm2
> > >        movapd  %xmm0, %xmm1
> > >        andpd   %xmm2, %xmm1
> > >        andnpd  %xmm0, %xmm2
> > >        addsd   .LC1(%rip), %xmm1
> > >        roundsd $1, %xmm1, %xmm1
> > >        orpd    %xmm2, %xmm1
> > >        movapd  %xmm1, %xmm0
> > >        ret
> > 
> > Hm, why do we need the sign-copy?  If I read the docs correctly
> > we can simply use roundsd directly, no?
> 
> round-half-away-from-zero breaks your neck.  round[ps][sd] only supports 
> the usual four IEEE rounding modes.

But, you should be able to apply the sign to the 0.5, which wouldn't 
require building the absolute value of input:

round(x) = trunc(x + (copysign (0.5, x)))

which should roughly be expanded to:

       movsd   signbits(%rip), %xmm1
       andpd   %xmm0, %xmm1
       movsd   nextof0.5(%rip), %xmm2
       orpd    %xmm1, %xmm2
       addpd   %xmm2, %xmm0
       roundsd $1, %xmm0, %xmm0
       ret

Which has one logical operation less (and one move because I chose a more 
optimal register assignment).


Ciao,
Michael.
Uros Bizjak - Aug. 19, 2011, 7:51 p.m.
On Mon, Aug 15, 2011 at 5:25 PM, Michael Matz <matz@suse.de> wrote:

> On Mon, 15 Aug 2011, Michael Matz wrote:
>
>> > > .LFB0:
>> > >        .cfi_startproc
>> > >        movsd   .LC0(%rip), %xmm2
>> > >        movapd  %xmm0, %xmm1
>> > >        andpd   %xmm2, %xmm1
>> > >        andnpd  %xmm0, %xmm2
>> > >        addsd   .LC1(%rip), %xmm1
>> > >        roundsd $1, %xmm1, %xmm1
>> > >        orpd    %xmm2, %xmm1
>> > >        movapd  %xmm1, %xmm0
>> > >        ret
>> >
>> > Hm, why do we need the sign-copy?  If I read the docs correctly
>> > we can simply use roundsd directly, no?
>>
>> round-half-away-from-zero breaks your neck.  round[ps][sd] only supports
>> the usual four IEEE rounding modes.
>
> But, you should be able to apply the sign to the 0.5, which wouldn't
> require building the absolute value of input:
>
> round(x) = trunc(x + (copysign (0.5, x)))
>
> which should roughly be expanded to:
>
>        movsd   signbits(%rip), %xmm1
>       andpd   %xmm0, %xmm1
>       movsd   nextof0.5(%rip), %xmm2
>       orpd    %xmm1, %xmm2
>       addpd   %xmm2, %xmm0
>       roundsd $1, %xmm0, %xmm0
>        ret
>
> Which has one logical operation less (and one move because I chose a more
> optimal register assignment).

Thanks for the suggestion, I will implement and test it ASAP.

Uros.

Patch

Index: i386.md
===================================================================
--- i386.md	(revision 177746)
+++ i386.md	(working copy)
@@ -14394,11 +14394,11 @@ 
   if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
       && !flag_trapping_math)
     {
-      if (!TARGET_ROUND && optimize_insn_for_size_p ())
-	FAIL;
       if (TARGET_ROUND)
 	emit_insn (gen_sse4_1_round<mode>2
 		   (operands[0], operands[1], GEN_INT (ROUND_MXCSR)));
+      else if (optimize_insn_for_size_p ())
+        FAIL;
       else
 	ix86_expand_rint (operand0, operand1);
     }
@@ -14431,7 +14431,12 @@ 
   if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
       && !flag_trapping_math && !flag_rounding_math)
     {
-      if (TARGET_64BIT || (<MODE>mode != DFmode))
+      if (TARGET_ROUND)
+        {
+	  operands[1] = force_reg (<MODE>mode, operands[1]);
+	  ix86_expand_round_sse4 (operands[0], operands[1]);
+	}
+      else if (TARGET_64BIT || (<MODE>mode != DFmode))
 	ix86_expand_round (operands[0], operands[1]);
       else
 	ix86_expand_rounddf_32 (operands[0], operands[1]);
@@ -14663,14 +14668,13 @@ 
        && !flag_trapping_math)"
 {
   if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
-      && !flag_trapping_math
-      && (TARGET_ROUND || optimize_insn_for_speed_p ()))
+      && !flag_trapping_math)
     {
-      if (!TARGET_ROUND && optimize_insn_for_size_p ())
-	FAIL;
       if (TARGET_ROUND)
 	emit_insn (gen_sse4_1_round<mode>2
 		   (operands[0], operands[1], GEN_INT (ROUND_FLOOR)));
+      else if (optimize_insn_for_size_p ())
+        FAIL;
       else if (TARGET_64BIT || (<MODE>mode != DFmode))
 	ix86_expand_floorceil (operand0, operand1, true);
       else
@@ -14922,8 +14926,7 @@ 
        && !flag_trapping_math)"
 {
   if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
-      && !flag_trapping_math
-      && (TARGET_ROUND || optimize_insn_for_speed_p ()))
+      && !flag_trapping_math)
     {
       if (TARGET_ROUND)
 	emit_insn (gen_sse4_1_round<mode>2
@@ -15179,8 +15182,7 @@ 
        && !flag_trapping_math)"
 {
   if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
-      && !flag_trapping_math
-      && (TARGET_ROUND || optimize_insn_for_speed_p ()))
+      && !flag_trapping_math)
     {
       if (TARGET_ROUND)
 	emit_insn (gen_sse4_1_round<mode>2
Index: i386-protos.h
===================================================================
--- i386-protos.h	(revision 177746)
+++ i386-protos.h	(working copy)
@@ -174,6 +174,7 @@ 
 extern void ix86_expand_rint (rtx, rtx);
 extern void ix86_expand_floorceil (rtx, rtx, bool);
 extern void ix86_expand_floorceildf_32 (rtx, rtx, bool);
+extern void ix86_expand_round_sse4 (rtx, rtx);
 extern void ix86_expand_round (rtx, rtx);
 extern void ix86_expand_rounddf_32 (rtx, rtx);
 extern void ix86_expand_trunc (rtx, rtx);
Index: i386.c
===================================================================
--- i386.c	(revision 177746)
+++ i386.c	(working copy)
@@ -32676,6 +32676,40 @@ 
 
   emit_move_insn (operand0, res);
 }
+
+/* Expand SSE sequence for computing round
+   from OP1 storing into OP0 using sse4 round insn.  */
+void
+ix86_expand_round_sse4 (rtx op0, rtx op1)
+{
+  enum machine_mode mode = GET_MODE (op0);
+  rtx e1, e2, e3, res, half, mask;
+
+  half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, mode);
+
+  /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
+
+  /* e1 = fabs(op1) */
+  e1 = ix86_expand_sse_fabs (op1, &mask);
+
+  /* e2 = e1 + 0.5 */
+  half = force_reg (mode, half);
+  e2 = expand_simple_binop (mode, PLUS, e1, half, NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* e3 = floor(e2) */
+  e3 = gen_reg_rtx (mode);
+  emit_insn
+    (gen_rtx_SET (VOIDmode, e3,
+		  gen_rtx_UNSPEC (mode,
+				  gen_rtvec (2, e2, GEN_INT (ROUND_FLOOR)),
+				  UNSPEC_ROUND)));
+
+  /* res = copysign (e3, op1) */
+  res = gen_reg_rtx (mode);
+  ix86_sse_copysign_to_positive (res, e3, op1, mask);
+
+  emit_move_insn (op0, res);
+}
 
 
 /* Table of valid machine attributes.  */