Patchwork [i386] : Vectorize (int) {floor,ceil,round}{,f} with SSE4.1+

login
register
mail settings
Submitter Uros Bizjak
Date Nov. 14, 2011, 6:21 p.m.
Message ID <CAFULd4bM8Zk8d4GcV5kCAwmmVGCkdKp5RXJcLXcem74v3Rkgbw@mail.gmail.com>
Download mbox | patch
Permalink /patch/125586/
State New
Headers show

Comments

Uros Bizjak - Nov. 14, 2011, 6:21 p.m.
Hello!

Currently, gcc vectorizes {floor,ceil,round}{,f}, but not i.e. (int)
floor [and other rounding functions] due to conversion of (int) floor
-> ifloor that inhibits vectorization.

Attached patch introduces new expanders that handle
rounding+conversions with expansion to round{pd,ps}+cvttpd2{ps,pd}
sequence on SSE41+ targets.

2011-11-14  Uros Bizjak  <ubizjak@gmail.com>

	* config/i386/sse.md (round<mode>2_sfix): New expander.
	(round<mode>2_vec_pack_sfix): Ditto.
	(<sse4_1>_round<ssemodesuffix>_sfix<avxsizesuffix>): Ditto.
	(<sse4_1>_round<ssemodesuffix>_vec_pack_sfix<avxsizesuffix>): Ditto.
	* config/i386/builtin-types.def (V4SI_FTYPE_V4SF_ROUND,
	V8SI_FTYPE_V8SF_ROUND, V4SI_FTYPE_V2DF_V2DF_ROUND,
	V8SI_FTYPE_V4DF_V4DF_ROUND): New builtin types.
	* config/i386/i386.c (ix86_builtins): Add
	IX86_BUILTIN_{FLOORPD,CEILPD,ROUNDPD_AZ}_VEC_PACK_SFIX{,256} and
	IX86_BUILTIN_{FLOORPS,CEILPS,ROUNDPS_AZ}_SFIX{,256} defines.
	(bdesc_args): Add __builtin_ia32_{floorpd,ceilpd}_vec_pack_sfix{,256},
	__builtin_ia32_roundpd_az_vec_pack_sfix{,256},
	__builtin_ia32_{floorps,ceilps}_sfix{,256}and
	__builtin_ia32_roundps_az_sfix{,256} descriptions.
	(ix86_expand_sse_round_vec_pack_sfix): New.
	(ix86_expand_args_builtin): Handle V4SI_FTYPE_V4SF_ROUND,
	V8SI_FTYPE_V8SF_ROUND, V4SI_FTYPE_V2DF_V2DF_ROUND and
	V8SI_FTYPE_V4DF_V4DF_ROUND types.  Check last argument of
	CODE_FOR_sse4_1_roundpd_vec_pack_sfix, CODE_FOR_sse4_1_roundps_sfix,
	CODE_FOR_avx_roundpd_vec_pack_sfix256 and CODE_FOR_avx_roundps_sfix256.
	(ix86_builtin_vectorized_function): Handle
	BUILT_IN_{I,L,LL}FLOOR{,F}, BUILT_IN_{I,L,LL}CEIL{,F} and
	BUILT_IN_{I,L,LL}ROUND{,F}

testsuite/ChangeLog:

2011-11-14  Uros Bizjak  <ubizjak@gmail.com>

	* gcc.target/i386/sse4_1-floor-sfix-vec.c: New test.
	* gcc.target/i386/sse4_1-floorf-sfix-vec.c: Ditto.
	* gcc.target/i386/avx-floor-sfix-vec.c: Ditto.
	* gcc.target/i386/avx-floorf-sfix-vec.c: Ditto.
	* gcc.target/i386/sse4_1-ceil-sfix-vec.c: Ditto.
	* gcc.target/i386/sse4_1-ceilf-sfix-vec.c: Ditto.
	* gcc.target/i386/avx-ceil-sfix-vec.c: Ditto.
	* gcc.target/i386/avx-ceilf-sfix-vec.c: Ditto.
	* gcc.target/i386/sse4_1-round-sfix-vec.c: Ditto.
	* gcc.target/i386/sse4_1-roundf-sfix-vec.c: Ditto.
	* gcc.target/i386/avx-round-sfix-vec.c: Ditto.
	* gcc.target/i386/avx-roundf-sfix-vec.c: Ditto.

Patch was bootstrapped and regression tested on x86_64-pc-linux-gnu
{,-m32}. Since we don't vectorize functions that were vectorized
previously, this can be considered a regression, fixed with the above
patch.

Patch will be committed to mainline as soon as additional regression
tests with --mfpmath=avx configured gcc finish.

Uros.

Index: avx-floor-sfix-vec.c
===================================================================
--- avx-floor-sfix-vec.c	(revision 0)
+++ avx-floor-sfix-vec.c	(revision 0)
@@ -0,0 +1,9 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx" } */
+/* { dg-require-effective-target avx } */
+/* { dg-skip-if "no M_PI" { vxworks_kernel } } */
+
+#define CHECK_H "avx-check.h"
+#define TEST avx_test
+
+#include "sse4_1-floor-sfix-vec.c"
Index: avx-floorf-sfix-vec.c
===================================================================
--- avx-floorf-sfix-vec.c	(revision 0)
+++ avx-floorf-sfix-vec.c	(revision 0)
@@ -0,0 +1,9 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx" } */
+/* { dg-require-effective-target avx } */
+/* { dg-skip-if "no M_PI" { vxworks_kernel } } */
+
+#define CHECK_H "avx-check.h"
+#define TEST avx_test
+
+#include "sse4_1-floorf-sfix-vec.c"
Index: sse4_1-round-sfix-vec.c
===================================================================
--- sse4_1-round-sfix-vec.c	(revision 0)
+++ sse4_1-round-sfix-vec.c	(revision 0)
@@ -0,0 +1,62 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse4.1" } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-skip-if "no M_PI" { vxworks_kernel } } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include CHECK_H
+
+#include <math.h>
+
+extern double round (double);
+
+#define NUM 64
+
+static void
+__attribute__((__target__("fpmath=sse")))
+init_src (double *src)
+{
+  int i, sign = 1;
+  double f = rand ();
+
+  for (i = 0; i < NUM; i++)
+    {
+      src[i] = (i + 1) * f * M_PI * sign;
+      if (i < (NUM / 2))
+	{
+          if ((i % 6) == 0)
+	    f = f * src[i];
+        }
+      else if (i == (NUM / 2))
+	f = rand ();
+      else if ((i % 6) == 0)
+	f = 1 / (f * (i + 1) * src[i] * M_PI * sign);
+      sign = -sign;
+    }
+}
+
+static void
+__attribute__((__target__("fpmath=387")))
+sse4_1_test (void)
+{
+  double a[NUM];
+  int r[NUM];
+  int i;
+
+  init_src (a);
+
+  for (i = 0; i < NUM; i++)
+    r[i] = (int) round (a[i]);
+
+  /* check results:  */
+  for (i = 0; i < NUM; i++)
+    if (r[i] != (int) round (a[i]))
+      abort();
+}
Index: avx-round-sfix-vec.c
===================================================================
--- avx-round-sfix-vec.c	(revision 0)
+++ avx-round-sfix-vec.c	(revision 0)
@@ -0,0 +1,9 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx" } */
+/* { dg-require-effective-target avx } */
+/* { dg-skip-if "no M_PI" { vxworks_kernel } } */
+
+#define CHECK_H "avx-check.h"
+#define TEST avx_test
+
+#include "sse4_1-round-sfix-vec.c"
Index: sse4_1-ceilf-sfix-vec.c
===================================================================
--- sse4_1-ceilf-sfix-vec.c	(revision 0)
+++ sse4_1-ceilf-sfix-vec.c	(revision 0)
@@ -0,0 +1,54 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse4.1" } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-skip-if "no M_PI" { vxworks_kernel } } */
+
+#include "sse4_1-check.h"
+
+#include <math.h>
+
+extern float ceilf (float);
+
+#define NUM 64
+
+static void
+__attribute__((__target__("fpmath=sse")))
+init_src (float *src)
+{
+  int i, sign = 1;
+  float f = rand ();
+
+  for (i = 0; i < NUM; i++)
+    {
+      src[i] = (i + 1) * f * M_PI * sign;
+      if (i < (NUM / 2))
+	{
+          if ((i % 6) == 0)
+	    f = f * src[i];
+        }
+      else if (i == (NUM / 2))
+	f = rand ();
+      else if ((i % 6) == 0)
+	f = 1 / (f * (i + 1) * src[i] * M_PI * sign);
+      sign = -sign;
+    }
+}
+
+static void
+__attribute__((__target__("fpmath=387")))
+sse4_1_test (void)
+{
+  float a[NUM];
+  int r[NUM];
+  int i;
+
+  init_src (a);
+
+  for (i = 0; i < NUM; i++)
+    r[i] = (int) ceilf (a[i]);
+
+  /* check results:  */
+  for (i = 0; i < NUM; i++)
+    if (r[i] != (int) ceilf (a[i]))
+      abort();
+}
Index: avx-ceilf-sfix-vec.c
===================================================================
--- avx-ceilf-sfix-vec.c	(revision 0)
+++ avx-ceilf-sfix-vec.c	(revision 0)
@@ -0,0 +1,9 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx" } */
+/* { dg-require-effective-target avx } */
+/* { dg-skip-if "no M_PI" { vxworks_kernel } } */
+
+#define CHECK_H "avx-check.h"
+#define TEST avx_test
+
+#include "sse4_1-ceilf-sfix-vec.c"
Index: avx-ceil-sfix-vec.c
===================================================================
--- avx-ceil-sfix-vec.c	(revision 0)
+++ avx-ceil-sfix-vec.c	(revision 0)
@@ -0,0 +1,9 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx" } */
+/* { dg-require-effective-target avx } */
+/* { dg-skip-if "no M_PI" { vxworks_kernel } } */
+
+#define CHECK_H "avx-check.h"
+#define TEST avx_test
+
+#include "sse4_1-ceil-sfix-vec.c"
Index: sse4_1-roundf-sfix-vec.c
===================================================================
--- sse4_1-roundf-sfix-vec.c	(revision 0)
+++ sse4_1-roundf-sfix-vec.c	(revision 0)
@@ -0,0 +1,54 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse4.1" } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-skip-if "no M_PI" { vxworks_kernel } } */
+
+#include "sse4_1-check.h"
+
+#include <math.h>
+
+extern float roundf (float);
+
+#define NUM 64
+
+static void
+__attribute__((__target__("fpmath=sse")))
+init_src (float *src)
+{
+  int i, sign = 1;
+  float f = rand ();
+
+  for (i = 0; i < NUM; i++)
+    {
+      src[i] = (i + 1) * f * M_PI * sign;
+      if (i < (NUM / 2))
+	{
+          if ((i % 6) == 0)
+	    f = f * src[i];
+        }
+      else if (i == (NUM / 2))
+	f = rand ();
+      else if ((i % 6) == 0)
+	f = 1 / (f * (i + 1) * src[i] * M_PI * sign);
+      sign = -sign;
+    }
+}
+
+static void
+__attribute__((__target__("fpmath=387")))
+sse4_1_test (void)
+{
+  float a[NUM];
+  int r[NUM];
+  int i;
+
+  init_src (a);
+
+  for (i = 0; i < NUM; i++)
+    r[i] = (int) roundf (a[i]);
+
+  /* check results:  */
+  for (i = 0; i < NUM; i++)
+    if (r[i] != (int) roundf (a[i]))
+      abort();
+}
Index: sse4_1-ceil-sfix-vec.c
===================================================================
--- sse4_1-ceil-sfix-vec.c	(revision 0)
+++ sse4_1-ceil-sfix-vec.c	(revision 0)
@@ -0,0 +1,62 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse4.1" } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-skip-if "no M_PI" { vxworks_kernel } } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include CHECK_H
+
+#include <math.h>
+
+extern double ceil (double);
+
+#define NUM 64
+
+static void
+__attribute__((__target__("fpmath=sse")))
+init_src (double *src)
+{
+  int i, sign = 1;
+  double f = rand ();
+
+  for (i = 0; i < NUM; i++)
+    {
+      src[i] = (i + 1) * f * M_PI * sign;
+      if (i < (NUM / 2))
+	{
+          if ((i % 6) == 0)
+	    f = f * src[i];
+        }
+      else if (i == (NUM / 2))
+	f = rand ();
+      else if ((i % 6) == 0)
+	f = 1 / (f * (i + 1) * src[i] * M_PI * sign);
+      sign = -sign;
+    }
+}
+
+static void
+__attribute__((__target__("fpmath=387")))
+sse4_1_test (void)
+{
+  double a[NUM];
+  int r[NUM];
+  int i;
+
+  init_src (a);
+
+  for (i = 0; i < NUM; i++)
+    r[i] = (int) ceil (a[i]);
+
+  /* check results:  */
+  for (i = 0; i < NUM; i++)
+    if (r[i] != (int) ceil (a[i]))
+      abort();
+}
Index: sse4_1-floorf-sfix-vec.c
===================================================================
--- sse4_1-floorf-sfix-vec.c	(revision 0)
+++ sse4_1-floorf-sfix-vec.c	(revision 0)
@@ -0,0 +1,54 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse4.1" } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-skip-if "no M_PI" { vxworks_kernel } } */
+
+#include "sse4_1-check.h"
+
+#include <math.h>
+
+extern float floorf (float);
+
+#define NUM 64
+
+static void
+__attribute__((__target__("fpmath=sse")))
+init_src (float *src)
+{
+  int i, sign = 1;
+  float f = rand ();
+
+  for (i = 0; i < NUM; i++)
+    {
+      src[i] = (i + 1) * f * M_PI * sign;
+      if (i < (NUM / 2))
+	{
+          if ((i % 6) == 0)
+	    f = f * src[i];
+        }
+      else if (i == (NUM / 2))
+	f = rand ();
+      else if ((i % 6) == 0)
+	f = 1 / (f * (i + 1) * src[i] * M_PI * sign);
+      sign = -sign;
+    }
+}
+
+static void
+__attribute__((__target__("fpmath=387")))
+sse4_1_test (void)
+{
+  float a[NUM];
+  int r[NUM];
+  int i;
+
+  init_src (a);
+
+  for (i = 0; i < NUM; i++)
+    r[i] = (int) floorf (a[i]);
+
+  /* check results:  */
+  for (i = 0; i < NUM; i++)
+    if (r[i] != (int) floorf (a[i]))
+      abort();
+}
Index: sse4_1-floor-sfix-vec.c
===================================================================
--- sse4_1-floor-sfix-vec.c	(revision 0)
+++ sse4_1-floor-sfix-vec.c	(revision 0)
@@ -0,0 +1,62 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse4.1" } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-skip-if "no M_PI" { vxworks_kernel } } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include CHECK_H
+
+#include <math.h>
+
+extern double floor (double);
+
+#define NUM 64
+
+static void
+__attribute__((__target__("fpmath=sse")))
+init_src (double *src)
+{
+  int i, sign = 1;
+  double f = rand ();
+
+  for (i = 0; i < NUM; i++)
+    {
+      src[i] = (i + 1) * f * M_PI * sign;
+      if (i < (NUM / 2))
+	{
+          if ((i % 6) == 0)
+	    f = f * src[i];
+        }
+      else if (i == (NUM / 2))
+	f = rand ();
+      else if ((i % 6) == 0)
+	f = 1 / (f * (i + 1) * src[i] * M_PI * sign);
+      sign = -sign;
+    }
+}
+
+static void
+__attribute__((__target__("fpmath=387")))
+sse4_1_test (void)
+{
+  double a[NUM];
+  int r[NUM];
+  int i;
+
+  init_src (a);
+
+  for (i = 0; i < NUM; i++)
+    r[i] = (int) floor (a[i]);
+
+  /* check results:  */
+  for (i = 0; i < NUM; i++)
+    if (r[i] != (int) floor (a[i]))
+      abort();
+}
Index: avx-roundf-sfix-vec.c
===================================================================
--- avx-roundf-sfix-vec.c	(revision 0)
+++ avx-roundf-sfix-vec.c	(revision 0)
@@ -0,0 +1,9 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx" } */
+/* { dg-require-effective-target avx } */
+/* { dg-skip-if "no M_PI" { vxworks_kernel } } */
+
+#define CHECK_H "avx-check.h"
+#define TEST avx_test
+
+#include "sse4_1-roundf-sfix-vec.c"

Patch

Index: sse.md
===================================================================
--- sse.md	(revision 181339)
+++ sse.md	(working copy)
@@ -9882,6 +9882,45 @@ 
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "<MODE>")])
 
+(define_expand "<sse4_1>_round<ssemodesuffix>_sfix<avxsizesuffix>"
+  [(match_operand:<sseintvecmode> 0 "register_operand" "")
+   (match_operand:VF1 1 "nonimmediate_operand" "")
+   (match_operand:SI 2 "const_0_to_15_operand" "")]
+  "TARGET_ROUND"
+{
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+
+  emit_insn
+    (gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp, operands[1],
+						       operands[2]));
+  emit_insn
+    (gen_fix_trunc<mode><sseintvecmodelower>2 (operands[0], tmp));
+  DONE;
+})
+
+(define_expand "<sse4_1>_round<ssemodesuffix>_vec_pack_sfix<avxsizesuffix>"
+  [(match_operand:<ssepackfltmode> 0 "register_operand" "")
+   (match_operand:VF2 1 "nonimmediate_operand" "")
+   (match_operand:VF2 2 "nonimmediate_operand" "")
+   (match_operand:SI 3 "const_0_to_15_operand" "")]
+  "TARGET_ROUND"
+{
+  rtx tmp0, tmp1;
+
+  tmp0 = gen_reg_rtx (<MODE>mode);
+  tmp1 = gen_reg_rtx (<MODE>mode);
+
+  emit_insn
+    (gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp0, operands[1],
+						       operands[3]));
+  emit_insn
+    (gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp1, operands[2],
+						       operands[3]));
+  emit_insn
+    (gen_vec_pack_sfix_trunc_<mode> (operands[0], tmp0, tmp1));
+  DONE;
+})
+
 (define_insn "sse4_1_round<ssescalarmodesuffix>"
   [(set (match_operand:VF_128 0 "register_operand" "=x,x")
 	(vec_merge:VF_128
@@ -9937,6 +9976,39 @@ 
   operands[5] = GEN_INT (ROUND_TRUNC);
 })
 
+(define_expand "round<mode>2_sfix"
+  [(match_operand:<sseintvecmode> 0 "register_operand" "")
+   (match_operand:VF1 1 "nonimmediate_operand" "")]
+  "TARGET_ROUND && !flag_trapping_math"
+{
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+
+  emit_insn (gen_round<mode>2 (tmp, operands[1]));
+
+  emit_insn
+    (gen_fix_trunc<mode><sseintvecmodelower>2 (operands[0], tmp));
+  DONE;
+})
+
+(define_expand "round<mode>2_vec_pack_sfix"
+  [(match_operand:<ssepackfltmode> 0 "register_operand" "")
+   (match_operand:VF2 1 "nonimmediate_operand" "")
+   (match_operand:VF2 2 "nonimmediate_operand" "")]
+  "TARGET_ROUND && !flag_trapping_math"
+{
+  rtx tmp0, tmp1;
+
+  tmp0 = gen_reg_rtx (<MODE>mode);
+  tmp1 = gen_reg_rtx (<MODE>mode);
+
+  emit_insn (gen_round<mode>2 (tmp0, operands[1]));
+  emit_insn (gen_round<mode>2 (tmp1, operands[2]));
+
+  emit_insn
+    (gen_vec_pack_sfix_trunc_<mode> (operands[0], tmp0, tmp1));
+  DONE;
+})
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Intel SSE4.2 string/text processing instructions
Index: i386-builtin-types.def
===================================================================
--- i386-builtin-types.def	(revision 181339)
+++ i386-builtin-types.def	(working copy)
@@ -465,6 +465,11 @@  DEF_FUNCTION_TYPE_ALIAS (V4DF_FTYPE_V4DF, ROUND)
 DEF_FUNCTION_TYPE_ALIAS (V4SF_FTYPE_V4SF, ROUND)
 DEF_FUNCTION_TYPE_ALIAS (V8SF_FTYPE_V8SF, ROUND)
 
+DEF_FUNCTION_TYPE_ALIAS (V4SI_FTYPE_V2DF_V2DF, ROUND)
+DEF_FUNCTION_TYPE_ALIAS (V8SI_FTYPE_V4DF_V4DF, ROUND)
+DEF_FUNCTION_TYPE_ALIAS (V4SI_FTYPE_V4SF, ROUND)
+DEF_FUNCTION_TYPE_ALIAS (V8SI_FTYPE_V8SF, ROUND)
+
 DEF_FUNCTION_TYPE_ALIAS (INT_FTYPE_V2DF_V2DF, PTEST)
 DEF_FUNCTION_TYPE_ALIAS (INT_FTYPE_V2DI_V2DI, PTEST)
 DEF_FUNCTION_TYPE_ALIAS (INT_FTYPE_V4DF_V4DF, PTEST)
Index: i386.c
===================================================================
--- i386.c	(revision 181339)
+++ i386.c	(working copy)
@@ -24891,22 +24891,32 @@  enum ix86_builtins
   IX86_BUILTIN_PMULDQ128,
   IX86_BUILTIN_PMULLD128,
 
-  IX86_BUILTIN_ROUNDPD,
-  IX86_BUILTIN_ROUNDPS,
   IX86_BUILTIN_ROUNDSD,
   IX86_BUILTIN_ROUNDSS,
 
+  IX86_BUILTIN_ROUNDPD,
+  IX86_BUILTIN_ROUNDPS,
+
   IX86_BUILTIN_FLOORPD,
   IX86_BUILTIN_CEILPD,
   IX86_BUILTIN_TRUNCPD,
   IX86_BUILTIN_RINTPD,
   IX86_BUILTIN_ROUNDPD_AZ,
+
+  IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
+  IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
+  IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
+
   IX86_BUILTIN_FLOORPS,
   IX86_BUILTIN_CEILPS,
   IX86_BUILTIN_TRUNCPS,
   IX86_BUILTIN_RINTPS,
   IX86_BUILTIN_ROUNDPS_AZ,
 
+  IX86_BUILTIN_FLOORPS_SFIX,
+  IX86_BUILTIN_CEILPS_SFIX,
+  IX86_BUILTIN_ROUNDPS_AZ_SFIX,
+
   IX86_BUILTIN_PTESTZ,
   IX86_BUILTIN_PTESTC,
   IX86_BUILTIN_PTESTNZC,
@@ -25080,12 +25090,21 @@  enum ix86_builtins
   IX86_BUILTIN_TRUNCPD256,
   IX86_BUILTIN_RINTPD256,
   IX86_BUILTIN_ROUNDPD_AZ256,
+
+  IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
+  IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
+  IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
+
   IX86_BUILTIN_FLOORPS256,
   IX86_BUILTIN_CEILPS256,
   IX86_BUILTIN_TRUNCPS256,
   IX86_BUILTIN_RINTPS256,
   IX86_BUILTIN_ROUNDPS_AZ256,
 
+  IX86_BUILTIN_FLOORPS_SFIX256,
+  IX86_BUILTIN_CEILPS_SFIX256,
+  IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
+
   IX86_BUILTIN_UNPCKHPD256,
   IX86_BUILTIN_UNPCKLPD256,
   IX86_BUILTIN_UNPCKHPS256,
@@ -26290,14 +26309,22 @@  static const struct builtin_description bdesc_args
   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
 
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
+
   { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
 
   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
 
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
+
   { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
 
   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
@@ -26417,13 +26444,21 @@  static const struct builtin_description bdesc_args
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
 
   { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
 
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
+
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
 
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
+
   { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
 
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256,  "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256,  "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
@@ -27864,7 +27899,7 @@  ix86_expand_sse_comi (const struct builtin_descrip
   return SUBREG_REG (target);
 }
 
-/* Subroutine of ix86_expand_args_builtin to take care of round insns.  */
+/* Subroutines of ix86_expand_args_builtin to take care of round insns.  */
 
 static rtx
 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
@@ -27897,6 +27932,44 @@  ix86_expand_sse_round (const struct builtin_descri
   return target;
 }
 
+static rtx
+ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
+				     tree exp, rtx target)
+{
+  rtx pat;
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  tree arg1 = CALL_EXPR_ARG (exp, 1);
+  rtx op0 = expand_normal (arg0);
+  rtx op1 = expand_normal (arg1);
+  rtx op2;
+  enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
+  enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
+  enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
+
+  if (optimize || target == 0
+      || GET_MODE (target) != tmode
+      || !insn_data[d->icode].operand[0].predicate (target, tmode))
+    target = gen_reg_rtx (tmode);
+
+  op0 = safe_vector_operand (op0, mode0);
+  op1 = safe_vector_operand (op1, mode1);
+
+  if ((optimize && !register_operand (op0, mode0))
+      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
+    op0 = copy_to_mode_reg (mode0, op0);
+  if ((optimize && !register_operand (op1, mode1))
+      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
+    op1 = copy_to_mode_reg (mode1, op1);
+
+  op2 = GEN_INT (d->comparison);
+
+  pat = GEN_FCN (d->icode) (target, op0, op1, op2);
+  if (! pat)
+    return 0;
+  emit_insn (pat);
+  return target;
+}
+
 /* Subroutine of ix86_expand_builtin to take care of ptest insns.  */
 
 static rtx
@@ -28170,7 +28243,12 @@  ix86_expand_args_builtin (const struct builtin_des
     case V4DF_FTYPE_V4DF_ROUND:
     case V4SF_FTYPE_V4SF_ROUND:
     case V8SF_FTYPE_V8SF_ROUND:
+    case V4SI_FTYPE_V4SF_ROUND:
+    case V8SI_FTYPE_V8SF_ROUND:
       return ix86_expand_sse_round (d, exp, target);
+    case V4SI_FTYPE_V2DF_V2DF_ROUND:
+    case V8SI_FTYPE_V4DF_V4DF_ROUND:
+      return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
     case INT_FTYPE_V8SF_V8SF_PTEST:
     case INT_FTYPE_V4DI_V4DI_PTEST:
     case INT_FTYPE_V4DF_V4DF_PTEST:
@@ -28487,15 +28565,22 @@  ix86_expand_args_builtin (const struct builtin_des
 		error ("the last argument must be an 1-bit immediate");
 		return const0_rtx;
 
+	      case CODE_FOR_sse4_1_roundsd:
+	      case CODE_FOR_sse4_1_roundss:
+
 	      case CODE_FOR_sse4_1_roundpd:
 	      case CODE_FOR_sse4_1_roundps:
-	      case CODE_FOR_sse4_1_roundsd:
-	      case CODE_FOR_sse4_1_roundss:
+	      case CODE_FOR_avx_roundpd256:
+	      case CODE_FOR_avx_roundps256:
+
+	      case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
+	      case CODE_FOR_sse4_1_roundps_sfix:
+	      case CODE_FOR_avx_roundpd_vec_pack_sfix256:
+	      case CODE_FOR_avx_roundps_sfix256:
+
 	      case CODE_FOR_sse4_1_blendps:
 	      case CODE_FOR_avx_blendpd256:
 	      case CODE_FOR_avx_vpermilv4df:
-	      case CODE_FOR_avx_roundpd256:
-	      case CODE_FOR_avx_roundps256:
 		error ("the last argument must be a 4-bit immediate");
 		return const0_rtx;
 
@@ -29521,6 +29606,70 @@  ix86_builtin_vectorized_function (tree fndecl, tre
 	}
       break;
 
+    case BUILT_IN_IFLOOR:
+    case BUILT_IN_LFLOOR:
+    case BUILT_IN_LLFLOOR:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+	break;
+
+      if (out_mode == SImode && in_mode == DFmode)
+	{
+	  if (out_n == 4 && in_n == 2)
+	    return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
+	  else if (out_n == 8 && in_n == 4)
+	    return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
+	}
+      break;
+
+    case BUILT_IN_IFLOORF:
+    case BUILT_IN_LFLOORF:
+    case BUILT_IN_LLFLOORF:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+	break;
+
+      if (out_mode == SImode && in_mode == SFmode)
+	{
+	  if (out_n == 4 && in_n == 4)
+	    return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
+	}
+      break;
+
+    case BUILT_IN_ICEIL:
+    case BUILT_IN_LCEIL:
+    case BUILT_IN_LLCEIL:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+	break;
+
+      if (out_mode == SImode && in_mode == DFmode)
+	{
+	  if (out_n == 4 && in_n == 2)
+	    return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
+	  else if (out_n == 8 && in_n == 4)
+	    return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
+	}
+      break;
+
+    case BUILT_IN_ICEILF:
+    case BUILT_IN_LCEILF:
+    case BUILT_IN_LLCEILF:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+	break;
+
+      if (out_mode == SImode && in_mode == SFmode)
+	{
+	  if (out_n == 4 && in_n == 4)
+	    return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
+	}
+      break;
+
     case BUILT_IN_IRINT:
     case BUILT_IN_LRINT:
     case BUILT_IN_LLRINT:
@@ -29545,6 +29694,38 @@  ix86_builtin_vectorized_function (tree fndecl, tre
 	}
       break;
 
+    case BUILT_IN_IROUND:
+    case BUILT_IN_LROUND:
+    case BUILT_IN_LLROUND:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+	break;
+
+      if (out_mode == SImode && in_mode == DFmode)
+	{
+	  if (out_n == 4 && in_n == 2)
+	    return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
+	  else if (out_n == 8 && in_n == 4)
+	    return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
+	}
+      break;
+
+    case BUILT_IN_IROUNDF:
+    case BUILT_IN_LROUNDF:
+    case BUILT_IN_LLROUNDF:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+	break;
+
+      if (out_mode == SImode && in_mode == SFmode)
+	{
+	  if (out_n == 4 && in_n == 4)
+	    return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
+	}
+      break;
+
     case BUILT_IN_COPYSIGN:
       if (out_mode == DFmode && in_mode == DFmode)
 	{