Patchwork [RFC,i386] : Vectorize calls to floor, ceil, trunc and rint functions.

login
register
mail settings
Submitter Uros Bizjak
Date Feb. 27, 2011, 1:45 p.m.
Message ID <AANLkTimV3Y56CDMdYwwGKM3-4_uwhiyAcuo3j3e8FUis@mail.gmail.com>
Download mbox | patch
Permalink /patch/84690/
State New
Headers show

Comments

Uros Bizjak - Feb. 27, 2011, 1:45 p.m.
Hello!

The attached [RFC] patch vectorizes calls to the floor, ceil, trunc and
rint functions (and their float variants) using the roundps/roundpd
SSE4.1 instructions.

2011-02-27  Uros Bizjak  <ubizjak@gmail.com>

	* config/i386/i386.md (ROUND_FLOOR): New constant.
	(ROUND_CEIL): Ditto.
	(ROUND_TRUNC): Ditto.
	(ROUND_MXCSR): Ditto.
	(ROUND_NO_EXC): Ditto.
	(rint<mode>2): Use new defines instead of numerical constant.
	(floor<mode>2): Ditto.
	(ceil<mode>2): Ditto.
	(btrunc<mode>2): Ditto.
	* config/i386/i386-builtin-types.def: Define ROUND function type
	aliases.
	* config/i386/i386.c (enum ix86_builtins): Add
	IX86_BUILTIN_{FLOOR,CEIL,TRUNC,RINT}{PS,PD}{,256} defines.
	(struct builtin_description): Add
	__builtin_ia32_{floor,ceil,trunc,rint}{pd,ps}{,256} descriptions.
	(ix86_expand_sse_round): New static function.
	(ix86_expand_args_builtin): Call ix86_expand_sse_round for ROUND
	function types.
	(ix86_builtin_vectorized_function): Handle
	BUILT_IN_{FLOOR,CEIL,TRUNC,RINT}{,F} builtins.

The patch was bootstrapped and regression tested on x86_64-pc-linux-gnu
(--with-fpmath=avx). It does not include test cases yet, but it is only
an RFC at this stage of GCC development anyway.

Uros.
H.J. Lu - March 16, 2011, 6:38 p.m.
On Sun, Feb 27, 2011 at 5:45 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> Hello!
>
> Attached [RFC] patch vectorizes calls to floor, ceil, trunc and rint
> (and their float variants) functions using roundps/roundpd SSE4.1
> instruction.
>
> 2011-02-27  Uros Bizjak  <ubizjak@gmail.com>
>
>        * config/i386/i386.md (ROUND_FLOOR): New constant.
>        (ROUND_CEIL): Ditto.
>        (ROUND_TRUNC): Ditto.
>        (ROUND_MXCSR): Ditto.
>        (ROUND_NO_EXC): Ditto.
>        (rint<mode>2): Use new defines instead of numerical constant.
>        (floor<mode>2): Ditto.
>        (ceil<mode>2): Ditto.
>        (btrunc<mode>2): Ditto.
>        * config/i386/i386-builtin-types.def: Define ROUND function type
>        aliases.
>        * config/i386/i386.c (enum ix86_builtins): Add
>        IX86_BUILTIN_{FLOOR,CEIL,TRUNC,RINT}{PS,PD}{,256} defines.
>        (struct builtin_description): Add
>        __builtin_ia32_{floor,ceil,trunc,rint}{pd,ps}{,256} descriptions.
>        (ix86_expand_sse_round): New static function.
>        (ix86_expand_args_builtin): Call ix86_expand_sse_round for ROUND
>        function types.
>        (ix86_builtin_vectorized_function): Handle
>        BUILT_IN_{FLOOR,CEIL,TRUNC,RINT}{,F} builtins.
>
> Patch was bootstrapped and regression tested on x86_64-pc-linux-gnu
> (--with-fpmath=avx). Currently, it does not include test cases, but it
> is RFC at this gcc development stage anyway.

This caused:

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=48154

We need to check TARGET_ROUND before using __builtin_ia32_truncps.

Patch

Index: i386.md
===================================================================
--- i386.md	(revision 170534)
+++ i386.md	(working copy)
@@ -272,6 +272,15 @@ 
   UNSPECV_SPLIT_STACK_RETURN
 ])
 
+;; Constants to represent rounding modes in the ROUND instruction
+(define_constants
+  [(ROUND_FLOOR			0x1)	; used by floor<mode>2
+   (ROUND_CEIL			0x2)	; used by ceil<mode>2
+   (ROUND_TRUNC			0x3)	; used by btrunc<mode>2
+   (ROUND_MXCSR			0x4)	; used by rint<mode>2
+   (ROUND_NO_EXC		0x8)	; not referenced by the visible hunks
+  ])
+
 ;; Constants to represent pcomtrue/pcomfalse variants
 (define_constants
   [(PCOM_FALSE			0)
@@ -14573,7 +14582,7 @@ 
 	FAIL;
       if (TARGET_ROUND)
 	emit_insn (gen_sse4_1_round<mode>2
-		   (operands[0], operands[1], GEN_INT (0x04)));
+		   (operands[0], operands[1], GEN_INT (ROUND_MXCSR)));
       else
 	ix86_expand_rint (operand0, operand1);
     }
@@ -14819,7 +14828,7 @@ 
 	FAIL;
       if (TARGET_ROUND)
 	emit_insn (gen_sse4_1_round<mode>2
-		   (operands[0], operands[1], GEN_INT (0x01)));
+		   (operands[0], operands[1], GEN_INT (ROUND_FLOOR)));
       else if (TARGET_64BIT || (<MODE>mode != DFmode))
 	ix86_expand_floorceil (operand0, operand1, true);
       else
@@ -15074,7 +15083,7 @@ 
     {
       if (TARGET_ROUND)
 	emit_insn (gen_sse4_1_round<mode>2
-		   (operands[0], operands[1], GEN_INT (0x02)));
+		   (operands[0], operands[1], GEN_INT (ROUND_CEIL)));
       else if (optimize_insn_for_size_p ())
 	FAIL;
       else if (TARGET_64BIT || (<MODE>mode != DFmode))
@@ -15329,7 +15338,7 @@ 
     {
       if (TARGET_ROUND)
 	emit_insn (gen_sse4_1_round<mode>2
-		   (operands[0], operands[1], GEN_INT (0x03)));
+		   (operands[0], operands[1], GEN_INT (ROUND_TRUNC)));
       else if (optimize_insn_for_size_p ())
 	FAIL;
       else if (TARGET_64BIT || (<MODE>mode != DFmode))
Index: i386-builtin-types.def
===================================================================
--- i386-builtin-types.def	(revision 170534)
+++ i386-builtin-types.def	(working copy)
@@ -377,6 +377,11 @@  DEF_FUNCTION_TYPE (V16QI, V16QI, INT, V1
 
 DEF_FUNCTION_TYPE (V8QI, QI, QI, QI, QI, QI, QI, QI, QI)
 
+DEF_FUNCTION_TYPE_ALIAS (V2DF_FTYPE_V2DF, ROUND)
+DEF_FUNCTION_TYPE_ALIAS (V4DF_FTYPE_V4DF, ROUND)
+DEF_FUNCTION_TYPE_ALIAS (V4SF_FTYPE_V4SF, ROUND)
+DEF_FUNCTION_TYPE_ALIAS (V8SF_FTYPE_V8SF, ROUND)
+
 DEF_FUNCTION_TYPE_ALIAS (INT_FTYPE_V2DF_V2DF, PTEST)
 DEF_FUNCTION_TYPE_ALIAS (INT_FTYPE_V2DI_V2DI, PTEST)
 DEF_FUNCTION_TYPE_ALIAS (INT_FTYPE_V4DF_V4DF, PTEST)
Index: i386.c
===================================================================
--- i386.c	(revision 170534)
+++ i386.c	(working copy)
@@ -23916,6 +23916,15 @@  enum ix86_builtins
   IX86_BUILTIN_ROUNDSD,
   IX86_BUILTIN_ROUNDSS,
 
+  IX86_BUILTIN_FLOORPD,
+  IX86_BUILTIN_CEILPD,
+  IX86_BUILTIN_TRUNCPD,
+  IX86_BUILTIN_RINTPD,
+  IX86_BUILTIN_FLOORPS,
+  IX86_BUILTIN_CEILPS,
+  IX86_BUILTIN_TRUNCPS,
+  IX86_BUILTIN_RINTPS,
+
   IX86_BUILTIN_PTESTZ,
   IX86_BUILTIN_PTESTC,
   IX86_BUILTIN_PTESTNZC,
@@ -24083,6 +24092,15 @@  enum ix86_builtins
   IX86_BUILTIN_ROUNDPD256,
   IX86_BUILTIN_ROUNDPS256,
 
+  IX86_BUILTIN_FLOORPD256,
+  IX86_BUILTIN_CEILPD256,
+  IX86_BUILTIN_TRUNCPD256,
+  IX86_BUILTIN_RINTPD256,
+  IX86_BUILTIN_FLOORPS256,
+  IX86_BUILTIN_CEILPS256,
+  IX86_BUILTIN_TRUNCPS256,
+  IX86_BUILTIN_RINTPS256,
+
   IX86_BUILTIN_UNPCKHPD256,
   IX86_BUILTIN_UNPCKLPD256,
   IX86_BUILTIN_UNPCKHPS256,
@@ -25105,6 +25123,16 @@  static const struct builtin_description 
   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
 
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
+
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
+
   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
@@ -25217,6 +25245,16 @@  static const struct builtin_description 
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
 
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
+
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256,  "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256,  "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256,  "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
@@ -26216,6 +26254,39 @@  ix86_expand_sse_comi (const struct built
   return SUBREG_REG (target);
 }
 
+/* Subroutine of ix86_expand_args_builtin to take care of round insns.  */
+
+static rtx
+ix86_expand_sse_round (const struct builtin_description *d, tree exp,
+		       rtx target)
+{
+  rtx pat;
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  rtx op1, op0 = expand_normal (arg0);
+  enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
+  enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
+  /* Use a fresh pseudo when optimizing or if TARGET does not fit operand 0.  */
+  if (optimize || target == 0
+      || GET_MODE (target) != tmode
+      || !insn_data[d->icode].operand[0].predicate (target, tmode))
+    target = gen_reg_rtx (tmode);
+
+  if (VECTOR_MODE_P (mode0))
+    op0 = safe_vector_operand (op0, mode0);
+  /* OP0 is the insn's input, so test it against operand 1's predicate.  */
+  if ((optimize && !register_operand (op0, mode0))
+      || !insn_data[d->icode].operand[1].predicate (op0, mode0))
+    op0 = copy_to_mode_reg (mode0, op0);
+  /* The table's comparison field holds the ROUND_* mode immediate.  */
+  op1 = GEN_INT (d->comparison);
+
+  pat = GEN_FCN (d->icode) (target, op0, op1);
+  if (! pat)
+    return 0;
+  emit_insn (pat);
+  return target;
+}
+
 /* Subroutine of ix86_expand_builtin to take care of ptest insns.  */
 
 static rtx
@@ -26485,6 +26556,11 @@  ix86_expand_args_builtin (const struct b
 
   switch ((enum ix86_builtin_func_type) d->flag)
     {
+    case V2DF_FTYPE_V2DF_ROUND:
+    case V4DF_FTYPE_V4DF_ROUND:
+    case V4SF_FTYPE_V4SF_ROUND:
+    case V8SF_FTYPE_V8SF_ROUND:
+      return ix86_expand_sse_round (d, exp, target);
     case INT_FTYPE_V8SF_V8SF_PTEST:
     case INT_FTYPE_V4DI_V4DI_PTEST:
     case INT_FTYPE_V4DF_V4DF_PTEST:
@@ -27581,6 +27657,110 @@  ix86_builtin_vectorized_function (tree f
 	}
       break;
 
+    case BUILT_IN_FLOOR:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math)
+	return NULL_TREE;
+      if (out_mode == DFmode && in_mode == DFmode)
+	{
+	  if (out_n == 2 && in_n == 2)
+	    return ix86_builtins[IX86_BUILTIN_FLOORPD];
+	  else if (out_n == 4 && in_n == 4)
+	    return ix86_builtins[IX86_BUILTIN_FLOORPD256];
+	}
+      break;
+
+    case BUILT_IN_FLOORF:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math)
+	return NULL_TREE;
+      if (out_mode == SFmode && in_mode == SFmode)
+	{
+	  if (out_n == 4 && in_n == 4)
+	    return ix86_builtins[IX86_BUILTIN_FLOORPS];
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_builtins[IX86_BUILTIN_FLOORPS256];
+	}
+      break;
+
+    case BUILT_IN_CEIL:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math)
+	return NULL_TREE;
+      if (out_mode == DFmode && in_mode == DFmode)
+	{
+	  if (out_n == 2 && in_n == 2)
+	    return ix86_builtins[IX86_BUILTIN_CEILPD];
+	  else if (out_n == 4 && in_n == 4)
+	    return ix86_builtins[IX86_BUILTIN_CEILPD256];
+	}
+      break;
+
+    case BUILT_IN_CEILF:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math)
+	return NULL_TREE;
+      if (out_mode == SFmode && in_mode == SFmode)
+	{
+	  if (out_n == 4 && in_n == 4)
+	    return ix86_builtins[IX86_BUILTIN_CEILPS];
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_builtins[IX86_BUILTIN_CEILPS256];
+	}
+      break;
+
+    case BUILT_IN_TRUNC:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math)
+	return NULL_TREE;
+      if (out_mode == DFmode && in_mode == DFmode)
+	{
+	  if (out_n == 2 && in_n == 2)
+	    return ix86_builtins[IX86_BUILTIN_TRUNCPD];
+	  else if (out_n == 4 && in_n == 4)
+	    return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
+	}
+      break;
+
+    case BUILT_IN_TRUNCF:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math)
+	return NULL_TREE;
+      if (out_mode == SFmode && in_mode == SFmode)
+	{
+	  if (out_n == 4 && in_n == 4)
+	    return ix86_builtins[IX86_BUILTIN_TRUNCPS];
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
+	}
+      break;
+
+    case BUILT_IN_RINT:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math)
+	return NULL_TREE;
+      if (out_mode == DFmode && in_mode == DFmode)
+	{
+	  if (out_n == 2 && in_n == 2)
+	    return ix86_builtins[IX86_BUILTIN_RINTPD];
+	  else if (out_n == 4 && in_n == 4)
+	    return ix86_builtins[IX86_BUILTIN_RINTPD256];
+	}
+      break;
+
+    case BUILT_IN_RINTF:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math)
+	return NULL_TREE;
+      if (out_mode == SFmode && in_mode == SFmode)
+	{
+	  if (out_n == 4 && in_n == 4)
+	    return ix86_builtins[IX86_BUILTIN_RINTPS];
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_builtins[IX86_BUILTIN_RINTPS256];
+	}
+      break;
+
     case BUILT_IN_FMA:
       if (out_mode == DFmode && in_mode == DFmode)
 	{