Patchwork [i386] : Expand round(a) = sgn(a) * floor(fabs(a) + 0.5) using SSE4 ROUND insn

login
register
mail settings
Submitter Uros Bizjak
Date Aug. 20, 2011, 8:26 a.m.
Message ID <CAFULd4ZcMRDV_MGTzVmfmNTzWxEFsxA51AEF-9-b04v5u72a+w@mail.gmail.com>
Download mbox | patch
Permalink /patch/110761/
State New
Headers show

Comments

Uros Bizjak - Aug. 20, 2011, 8:26 a.m.
On Mon, Aug 15, 2011 at 5:25 PM, Michael Matz <matz@suse.de> wrote:

>> > > .LFB0:
>> > >        .cfi_startproc
>> > >        movsd   .LC0(%rip), %xmm2
>> > >        movapd  %xmm0, %xmm1
>> > >        andpd   %xmm2, %xmm1
>> > >        andnpd  %xmm0, %xmm2
>> > >        addsd   .LC1(%rip), %xmm1
>> > >        roundsd $1, %xmm1, %xmm1
>> > >        orpd    %xmm2, %xmm1
>> > >        movapd  %xmm1, %xmm0
>> > >        ret
>> >
>> > Hm, why do we need the sign-copy?  If I read the docs correctly
>> > we can simply use roundsd directly, no?
>>
>> round-half-away-from-zero breaks your neck.  round[ps][sd] only supports
>> the usual four IEEE rounding modes.
>
> But, you should be able to apply the sign to the 0.5, which wouldn't
> require building the absolute value of input:
>
> round(x) = trunc(x + (copysign (0.5, x)))
>
> which should roughly be expanded to:
>
>        movsd   signbits(%rip), %xmm1
>       andpd   %xmm0, %xmm1
>       movsd   nextof0.5(%rip), %xmm2
>       orpd    %xmm1, %xmm2
>       addpd   %xmm2, %xmm0
>       roundsd $1, %xmm0, %xmm0
>        ret
>
> Which has one logical operation less (and one move because I chose a more
> optimal register assignment).

x86 logical insns can load constants directly from the memory, so your
proposal creates even better code:

myround:
.LFB0:
	.cfi_startproc
	movapd	%xmm0, %xmm1
	andpd	.LC1(%rip), %xmm1
	orpd	.LC0(%rip), %xmm1
	addsd	%xmm0, %xmm1
	roundsd	$3, %xmm1, %xmm0
	ret

Please also note that copysign expander has special code path for op0
being constant.

I am testing following patch:

2011-08-20  Uros Bizjak  <ubizjak@gmail.com>
	    Michael Matz  <matz@suse.de>

	* config/i386/i386.c (ix86_expand_round_sse4): Expand as
	trunc (a + copysign (nextafter (0.5, 0.0), a)).

Uros.

Patch

Index: i386/i386.c
===================================================================
--- i386/i386.c	(revision 177925)
+++ i386/i386.c	(working copy)
@@ -32700,42 +32700,44 @@  void
 ix86_expand_round_sse4 (rtx op0, rtx op1)
 {
   enum machine_mode mode = GET_MODE (op0);
-  rtx e1, e2, e3, res, half, mask;
+  rtx e1, e2, res, half, mask;
   const struct real_format *fmt;
   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
+  rtx (*gen_copysign) (rtx, rtx, rtx);
   rtx (*gen_round) (rtx, rtx, rtx);
 
   switch (mode)
     {
     case SFmode:
+      gen_copysign = gen_copysignsf3;
       gen_round = gen_sse4_1_roundsf2;
       break;
     case DFmode:
+      gen_copysign = gen_copysigndf3;
       gen_round = gen_sse4_1_rounddf2;
       break;
     default:
       gcc_unreachable ();
     }
 
-  /* e1 = fabs(op1) */
-  e1 = ix86_expand_sse_fabs (op1, &mask);
+  /* round (a) = trunc (a + copysign (0.5, a)) */
 
   /* load nextafter (0.5, 0.0) */
   fmt = REAL_MODE_FORMAT (mode);
   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
   REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
+  half = const_double_from_real_value (pred_half, mode);
 
-  /* e2 = e1 + 0.5 */
-  half = force_reg (mode, const_double_from_real_value (pred_half, mode));
-  e2 = expand_simple_binop (mode, PLUS, e1, half, NULL_RTX, 0, OPTAB_DIRECT);
+  /* e1 = copysign (0.5, op1) */
+  e1 = gen_reg_rtx (mode);
+  emit_insn (gen_copysign (e1, half, op1));
 
-  /* e3 = trunc(e2) */
-  e3 = gen_reg_rtx (mode);
-  emit_insn (gen_round (e3, e2, GEN_INT (ROUND_TRUNC)));
+  /* e2 = op1 + e1 */
+  e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
 
-  /* res = copysign (e3, op1) */
+  /* res = trunc (e2) */
   res = gen_reg_rtx (mode);
-  ix86_sse_copysign_to_positive (res, e3, op1, mask);
+  emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
 
   emit_move_insn (op0, res);
 }