diff mbox

[i386,10/8,AVX512] Add missing AVX-512ER patterns, intrinsics, tests.

Message ID 20140110162038.GC63041@msticlxl57.ims.intel.com
State New
Headers show

Commit Message

Kirill Yukhin Jan. 10, 2014, 4:20 p.m. UTC
Hello,

It seems that we miss few more intrinsics.
I've also added some missing substed predicates
Also I've fixed bogus rcp14 pattern and removeed
some redundant subst attributes.

Bootstrapped. New & existing tests pass (expcept 
for those mentioned in PR about REE).

Is it ok for trunk?

gcc/
	* config/i386/avx512erintrin.h (_mm_rcp28_round_sd): New.
	(_mm_rcp28_round_ss): Ditto.
	(_mm_rsqrt28_round_sd): Ditto.
	(_mm_rsqrt28_round_ss): Ditto.
	(_mm_rcp28_sd): Ditto.
	(_mm_rcp28_ss): Ditto.
	(_mm_rsqrt28_sd): Ditto.
	(_mm_rsqrt28_ss): Ditto.
	* config/i386/avx512fintrin.h (_mm512_stream_load_si512): Ditto.
	* config/i386/i386-builtin-types.def (V8DI_FTYPE_PV8DI): Ditto.
	* config/i386/i386.c (IX86_BUILTIN_MOVNTDQA512): Ditto.
	(IX86_BUILTIN_RCP28SD): Ditto.
	(IX86_BUILTIN_RCP28SS): Ditto.
	(IX86_BUILTIN_RSQRT28SD): Ditto.
	(IX86_BUILTIN_RSQRT28SS): Ditto.
	(bdesc_special_args): Define __builtin_ia32_movntdqa512,
	__builtin_ia32_rcp28sd_round, __builtin_ia32_rcp28ss_round,
	__builtin_ia32_rsqrt28sd_round, __builtin_ia32_rsqrt28ss_round.
	(ix86_expand_special_args_builtin): Expand new FTYPE.
	* config/i386/sse.md (define_mode_attr "sse4_1_avx2"): Expand to V8DI.
	(srcp14<mode>): Make insn unary.
	(avx512f_vmscalef<mode><round_name>): Use substed predicate.
	(avx512f_sgetexp<mode><round_saeonly_name>): Ditto.
	(avx512f_rndscale<mode><round_saeonly_name>): Ditto.
	(<sse4_1_avx2>_movntdqa): Extend to 512 bits.
	(avx512er_exp2<mode><mask_name><round_saeonly_name>):
	Fix rounding: make it SAE only.
	(<mask_codefor>avx512er_rcp28<mode><mask_name><round_saeonly_name>): Ditto.
	(<mask_codefor>avx512er_rsqrt28<mode><mask_name><round_saeonly_name>): Ditto.
	(avx512er_vmrcp28<mode><round_saeonly_name>): Ditto.
	(avx512er_vmrsqrt28<mode><round_saeonly_name>): Ditto.
	(avx512f_getmant<mode><mask_name><round_saeonly_name>): Ditto.
	* config/i386/subst.md (round_saeonly_mask_scalar_operand3): Remove.
	(round_saeonly_mask_scalar_operand4): Ditto.
	(round_saeonly_mask_scalar_op3): Ditto.
	(round_saeonly_mask_scalar_op4): Ditto.

gcc/testsuite/

	* gcc.target/i386/avx-1.c: Fix __builtin_ia32_exp2ps_mask,
	__builtin_ia32_exp2pd_mask, __builtin_ia32_rcp28ps_mask,
	__builtin_ia32_rcp28pd_mask, __builtin_ia32_rsqrt28ps_mask,
	__builtin_ia32_rsqrt28pd_mask. Add __builtin_ia32_rcp28ss_round,
	__builtin_ia32_rcp28sd_round, __builtin_ia32_rsqrt28ss_round,
	__builtin_ia32_rsqrt28sd_round.
	* gcc.target/i386/avx512er-vexp2pd-1.c: Fix rounding mode.
	* gcc.target/i386/avx512er-vexp2ps-1.c: Ditto.
	* gcc.target/i386/avx512er-vrcp28pd-1.c: Ditto.
	* gcc.target/i386/avx512er-vrcp28ps-1.c: Ditto.
	* gcc.target/i386/avx512er-vrsqrt28pd-1.c: Ditto.
	* gcc.target/i386/avx512er-vrsqrt28ps-1.c: Ditto.
	* gcc.target/i386/avx512er-vrcp28sd-1.c: New.
	* gcc.target/i386/avx512er-vrcp28sd-2.c: Ditto.
	* gcc.target/i386/avx512er-vrcp28ss-1.c: Ditto.
	* gcc.target/i386/avx512er-vrcp28ss-2.c: Ditto.
	* gcc.target/i386/avx512er-vrsqrt28sd-1.c: Ditto.
	* gcc.target/i386/avx512er-vrsqrt28sd-2.c: Ditto.
	* gcc.target/i386/avx512er-vrsqrt28ss-1.c: Ditto.
	* gcc.target/i386/avx512er-vrsqrt28ss-2.c: Ditto.
	* gcc.target/i386/avx512f-vmovntdqa-1.c: Ditto.
	* gcc.target/i386/avx512f-vmovntdqa-2.c: Ditto.
	* gcc.target/i386/avx512f-vrcp14sd-2.c: Fix.
	* gcc.target/i386/avx512f-vrcp14ss-2.c: Ditto.
	* gcc.target/i386/sse-22.c: Extend with new built-ins,
	fix wrong rounding mode (see above).
	* gcc.target/i386/sse-23.c: Ditto.

--
Thanks, K

PS: There're few more missing intrinsics to go.

---
 gcc/config/i386/avx512erintrin.h                   | 62 +++++++++++++++++++
 gcc/config/i386/avx512fintrin.h                    |  7 +++
 gcc/config/i386/i386-builtin-types.def             |  1 +
 gcc/config/i386/i386.c                             | 13 ++++
 gcc/config/i386/sse.md                             | 71 +++++++++++++++-------
 gcc/config/i386/subst.md                           |  4 --
 gcc/testsuite/gcc.target/i386/avx-1.c              | 20 ++++--
 gcc/testsuite/gcc.target/i386/avx512er-vexp2pd-1.c | 12 ++--
 gcc/testsuite/gcc.target/i386/avx512er-vexp2ps-1.c | 12 ++--
 .../gcc.target/i386/avx512er-vrcp28pd-1.c          | 12 ++--
 .../gcc.target/i386/avx512er-vrcp28ps-1.c          | 12 ++--
 .../gcc.target/i386/avx512er-vrcp28sd-1.c          | 15 +++++
 .../gcc.target/i386/avx512er-vrcp28sd-2.c          | 29 +++++++++
 .../gcc.target/i386/avx512er-vrcp28ss-1.c          | 15 +++++
 .../gcc.target/i386/avx512er-vrcp28ss-2.c          | 29 +++++++++
 .../gcc.target/i386/avx512er-vrsqrt28pd-1.c        | 12 ++--
 .../gcc.target/i386/avx512er-vrsqrt28ps-1.c        | 12 ++--
 .../gcc.target/i386/avx512er-vrsqrt28sd-1.c        | 15 +++++
 .../gcc.target/i386/avx512er-vrsqrt28sd-2.c        | 29 +++++++++
 .../gcc.target/i386/avx512er-vrsqrt28ss-1.c        | 15 +++++
 .../gcc.target/i386/avx512er-vrsqrt28ss-2.c        | 29 +++++++++
 .../gcc.target/i386/avx512f-vmovntdqa-1.c          | 14 +++++
 .../gcc.target/i386/avx512f-vmovntdqa-2.c          | 17 ++++++
 gcc/testsuite/gcc.target/i386/avx512f-vrcp14sd-2.c |  6 +-
 gcc/testsuite/gcc.target/i386/avx512f-vrcp14ss-2.c | 10 +--
 gcc/testsuite/gcc.target/i386/sse-22.c             | 40 ++++++------
 gcc/testsuite/gcc.target/i386/sse-23.c             | 16 +++--
 27 files changed, 429 insertions(+), 100 deletions(-)

Comments

Jakub Jelinek Jan. 10, 2014, 4:24 p.m. UTC | #1
On Fri, Jan 10, 2014 at 07:20:38PM +0300, Kirill Yukhin wrote:
> @@ -28920,6 +28927,7 @@ static const struct builtin_description bdesc_special_args[] =
>    { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movntv16sf, "__builtin_ia32_movntps512", IX86_BUILTIN_MOVNTPS512, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V16SF },
>    { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movntv8df, "__builtin_ia32_movntpd512", IX86_BUILTIN_MOVNTPD512, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V8DF },
>    { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movntv8di, "__builtin_ia32_movntdq512", IX86_BUILTIN_MOVNTDQ512, UNKNOWN, (int) VOID_FTYPE_PV8DI_V8DI },
> +  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movntdqa, "__builtin_ia32_movntdqa512", IX86_BUILTIN_MOVNTDQA512, UNKNOWN, (int) V8DI_FTYPE_PV8DI },
>    { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storedquv16si_mask, "__builtin_ia32_storedqusi512_mask", IX86_BUILTIN_STOREDQUSI512, UNKNOWN, (int) VOID_FTYPE_PV16SI_V16SI_HI },
>    { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storedquv8di_mask, "__builtin_ia32_storedqudi512_mask", IX86_BUILTIN_STOREDQUDI512, UNKNOWN, (int) VOID_FTYPE_PV8DI_V8DI_QI },
>    { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storeupd512_mask, "__builtin_ia32_storeupd512_mask", IX86_BUILTIN_STOREUPD512, UNKNOWN, (int) VOID_FTYPE_PV8DF_V8DF_QI },

This means you should ensure aligned_mem will be set for
CODE_FOR_avx512f_movntdqa in ix86_expand_special_args_builtin.

Leaving the rest of review to Uros/Richard.

	Jakub
Uros Bizjak Jan. 11, 2014, 11:42 a.m. UTC | #2
On Fri, Jan 10, 2014 at 5:24 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Fri, Jan 10, 2014 at 07:20:38PM +0300, Kirill Yukhin wrote:
>> @@ -28920,6 +28927,7 @@ static const struct builtin_description bdesc_special_args[] =
>>    { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movntv16sf, "__builtin_ia32_movntps512", IX86_BUILTIN_MOVNTPS512, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V16SF },
>>    { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movntv8df, "__builtin_ia32_movntpd512", IX86_BUILTIN_MOVNTPD512, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V8DF },
>>    { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movntv8di, "__builtin_ia32_movntdq512", IX86_BUILTIN_MOVNTDQ512, UNKNOWN, (int) VOID_FTYPE_PV8DI_V8DI },
>> +  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movntdqa, "__builtin_ia32_movntdqa512", IX86_BUILTIN_MOVNTDQA512, UNKNOWN, (int) V8DI_FTYPE_PV8DI },
>>    { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storedquv16si_mask, "__builtin_ia32_storedqusi512_mask", IX86_BUILTIN_STOREDQUSI512, UNKNOWN, (int) VOID_FTYPE_PV16SI_V16SI_HI },
>>    { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storedquv8di_mask, "__builtin_ia32_storedqudi512_mask", IX86_BUILTIN_STOREDQUDI512, UNKNOWN, (int) VOID_FTYPE_PV8DI_V8DI_QI },
>>    { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storeupd512_mask, "__builtin_ia32_storeupd512_mask", IX86_BUILTIN_STOREUPD512, UNKNOWN, (int) VOID_FTYPE_PV8DF_V8DF_QI },
>
> This means you should ensure aligned_mem will be set for
> CODE_FOR_avx512f_movntdqa in ix86_expand_special_args_builtin.
>
> Leaving the rest of review to Uros/Richard.

The rest is OK.

Thanks,
Uros.
diff mbox

Patch

diff --git a/gcc/config/i386/avx512erintrin.h b/gcc/config/i386/avx512erintrin.h
index f442f2b..6fe05bc 100644
--- a/gcc/config/i386/avx512erintrin.h
+++ b/gcc/config/i386/avx512erintrin.h
@@ -159,6 +159,24 @@  _mm512_maskz_rcp28_round_ps (__mmask16 __U, __m512 __A, int __R)
 					       (__mmask16) __U, __R);
 }
 
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp28_round_sd (__m128d __A, __m128d __B, int __R)
+{
+  return (__m128d) __builtin_ia32_rcp28sd_round ((__v2df) __A,
+						 (__v2df) __B,
+						 __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp28_round_ss (__m128 __A, __m128 __B, int __R)
+{
+  return (__m128) __builtin_ia32_rcp28ss_round ((__v4sf) __A,
+						(__v4sf) __B,
+						__R);
+}
+
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_rsqrt28_round_pd (__m512d __A, int __R)
@@ -214,6 +232,25 @@  _mm512_maskz_rsqrt28_round_ps (__mmask16 __U, __m512 __A, int __R)
 						 (__v16sf) _mm512_setzero_ps (),
 						 (__mmask16) __U, __R);
 }
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt28_round_sd (__m128d __A, __m128d __B, int __R)
+{
+  return (__m128d) __builtin_ia32_rsqrt28sd_round ((__v2df) __A,
+						   (__v2df) __B,
+						   __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt28_round_ss (__m128 __A, __m128 __B, int __R)
+{
+  return (__m128) __builtin_ia32_rsqrt28ss_round ((__v4sf) __A,
+						  (__v4sf) __B,
+						  __R);
+}
+
 #else
 #define _mm512_exp2a23_round_pd(A, C)            \
     __builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C)
@@ -268,6 +305,19 @@  _mm512_maskz_rsqrt28_round_ps (__mmask16 __U, __m512 __A, int __R)
 
 #define _mm512_maskz_rsqrt28_round_ps(U, A, C)   \
     __builtin_ia32_rsqrt28ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C)
+
+#define _mm_rcp28_round_sd(A, B, R)	\
+    __builtin_ia32_rcp28sd_round(A, B, R)
+
+#define _mm_rcp28_round_ss(A, B, R)	\
+    __builtin_ia32_rcp28ss_round(A, B, R)
+
+#define _mm_rsqrt28_round_sd(A, B, R)	\
+    __builtin_ia32_rsqrt28sd_round(A, B, R)
+
+#define _mm_rsqrt28_round_ss(A, B, R)	\
+    __builtin_ia32_rsqrt28ss_round(A, B, R)
+
 #endif
 
 #define _mm512_exp2a23_pd(A)                    \
@@ -324,6 +374,18 @@  _mm512_maskz_rsqrt28_round_ps (__mmask16 __U, __m512 __A, int __R)
 #define _mm512_maskz_rsqrt28_ps(U, A)     \
     _mm512_maskz_rsqrt28_round_ps(U, A, _MM_FROUND_CUR_DIRECTION)
 
+#define _mm_rcp28_sd(A, B)	\
+    __builtin_ia32_rcp28sd_round(A, B, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_ss(A, B)	\
+    __builtin_ia32_rcp28ss_round(A, B, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_sd(A, B)	\
+    __builtin_ia32_rsqrt28sd_round(A, B, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_ss(A, B)	\
+    __builtin_ia32_rsqrt28ss_round(A, B, _MM_FROUND_CUR_DIRECTION)
+
 #ifdef __DISABLE_AVX512ER__
 #undef __DISABLE_AVX512ER__
 #pragma GCC pop_options
diff --git a/gcc/config/i386/avx512fintrin.h b/gcc/config/i386/avx512fintrin.h
index a2ee88e..26f8cb6 100644
--- a/gcc/config/i386/avx512fintrin.h
+++ b/gcc/config/i386/avx512fintrin.h
@@ -7809,6 +7809,13 @@  _mm512_stream_pd (double *__P, __m512d __A)
   __builtin_ia32_movntpd512 (__P, (__v8df) __A);
 }
 
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_stream_load_si512 (void *__P)
+{
+  return __builtin_ia32_movntdqa512 ((__v8di *)__P);
+}
+
 #ifdef __OPTIMIZE__
 extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
index d19ca84..acf2f32 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -287,6 +287,7 @@  DEF_FUNCTION_TYPE (V8DI, PV4DI)
 DEF_FUNCTION_TYPE (V8DF, PV4DF)
 DEF_FUNCTION_TYPE (V8UHI, V8UHI)
 DEF_FUNCTION_TYPE (V8USI, V8USI)
+DEF_FUNCTION_TYPE (V8DI, PV8DI)
 
 DEF_FUNCTION_TYPE (DI, V2DI, INT)
 DEF_FUNCTION_TYPE (DOUBLE, V2DF, INT)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 52ad5c1..a9a4b68 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -28050,6 +28050,7 @@  enum ix86_builtins
   IX86_BUILTIN_MOVDQA64STORE512,
   IX86_BUILTIN_MOVDQA64_512,
   IX86_BUILTIN_MOVNTDQ512,
+  IX86_BUILTIN_MOVNTDQA512,
   IX86_BUILTIN_MOVNTPD512,
   IX86_BUILTIN_MOVNTPS512,
   IX86_BUILTIN_MOVSHDUP512,
@@ -28326,13 +28327,19 @@  enum ix86_builtins
   IX86_BUILTIN_GATHERPFQPS,
   IX86_BUILTIN_SCATTERPFDPS,
   IX86_BUILTIN_SCATTERPFQPS,
+
+  /* AVX-512ER */
   IX86_BUILTIN_EXP2PD_MASK,
   IX86_BUILTIN_EXP2PS_MASK,
   IX86_BUILTIN_EXP2PS,
   IX86_BUILTIN_RCP28PD,
   IX86_BUILTIN_RCP28PS,
+  IX86_BUILTIN_RCP28SD,
+  IX86_BUILTIN_RCP28SS,
   IX86_BUILTIN_RSQRT28PD,
   IX86_BUILTIN_RSQRT28PS,
+  IX86_BUILTIN_RSQRT28SD,
+  IX86_BUILTIN_RSQRT28SS,
 
   /* SHA builtins.  */
   IX86_BUILTIN_SHA1MSG1,
@@ -28920,6 +28927,7 @@  static const struct builtin_description bdesc_special_args[] =
   { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movntv16sf, "__builtin_ia32_movntps512", IX86_BUILTIN_MOVNTPS512, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V16SF },
   { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movntv8df, "__builtin_ia32_movntpd512", IX86_BUILTIN_MOVNTPD512, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V8DF },
   { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movntv8di, "__builtin_ia32_movntdq512", IX86_BUILTIN_MOVNTDQ512, UNKNOWN, (int) VOID_FTYPE_PV8DI_V8DI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movntdqa, "__builtin_ia32_movntdqa512", IX86_BUILTIN_MOVNTDQA512, UNKNOWN, (int) V8DI_FTYPE_PV8DI },
   { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storedquv16si_mask, "__builtin_ia32_storedqusi512_mask", IX86_BUILTIN_STOREDQUSI512, UNKNOWN, (int) VOID_FTYPE_PV16SI_V16SI_HI },
   { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storedquv8di_mask, "__builtin_ia32_storedqudi512_mask", IX86_BUILTIN_STOREDQUDI512, UNKNOWN, (int) VOID_FTYPE_PV8DI_V8DI_QI },
   { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storeupd512_mask, "__builtin_ia32_storeupd512_mask", IX86_BUILTIN_STOREUPD512, UNKNOWN, (int) VOID_FTYPE_PV8DF_V8DF_QI },
@@ -30133,8 +30141,12 @@  static const struct builtin_description bdesc_round_args[] =
   { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_exp2v16sf_mask_round, "__builtin_ia32_exp2ps_mask", IX86_BUILTIN_EXP2PS_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI_INT },
   { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_rcp28v8df_mask_round, "__builtin_ia32_rcp28pd_mask", IX86_BUILTIN_RCP28PD, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI_INT },
   { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_rcp28v16sf_mask_round, "__builtin_ia32_rcp28ps_mask", IX86_BUILTIN_RCP28PS, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_vmrcp28v2df_round, "__builtin_ia32_rcp28sd_round", IX86_BUILTIN_RCP28SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
+  { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_vmrcp28v4sf_round, "__builtin_ia32_rcp28ss_round", IX86_BUILTIN_RCP28SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
   { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_rsqrt28v8df_mask_round, "__builtin_ia32_rsqrt28pd_mask", IX86_BUILTIN_RSQRT28PD, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI_INT },
   { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_rsqrt28v16sf_mask_round, "__builtin_ia32_rsqrt28ps_mask", IX86_BUILTIN_RSQRT28PS, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_vmrsqrt28v2df_round, "__builtin_ia32_rsqrt28sd_round", IX86_BUILTIN_RSQRT28SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
+  { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_vmrsqrt28v4sf_round, "__builtin_ia32_rsqrt28ss_round", IX86_BUILTIN_RSQRT28SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
 };
 
 /* FMA4 and XOP.  */
@@ -34367,6 +34379,7 @@  ix86_expand_special_args_builtin (const struct builtin_description *d,
     case V16SI_FTYPE_PV4SI:
     case V16SF_FTYPE_PV4SF:
     case V8DI_FTYPE_PV4DI:
+    case V8DI_FTYPE_PV8DI:
     case V8DF_FTYPE_PV4DF:
       nargs = 1;
       klass = load;
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index dfc98ba..31e94fe 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -356,7 +356,7 @@ 
    [(V16QI "sse4_1") (V32QI "avx2")
     (V8HI "sse4_1") (V16HI "avx2")
     (V4SI "sse4_1") (V8SI "avx2") (V16SI "avx512f")
-    (V2DI "sse4_1") (V4DI "avx2")])
+    (V2DI "sse4_1") (V4DI "avx2") (V8DI "avx512f")])
 
 (define_mode_attr avx_avx2
   [(V4SF "avx") (V2DF "avx")
@@ -1463,13 +1463,12 @@ 
   [(set (match_operand:VF_128 0 "register_operand" "=v")
 	(vec_merge:VF_128
 	  (unspec:VF_128
-	    [(match_operand:VF_128 1 "register_operand" "v")
-	     (match_operand:VF_128 2 "nonimmediate_operand" "vm")]
+	    [(match_operand:VF_128 1 "nonimmediate_operand" "vm")]
 	    UNSPEC_RCP14)
-	  (match_dup 1)
+	  (match_operand:VF_128 2 "register_operand" "v")
 	  (const_int 1)))]
   "TARGET_AVX512F"
-  "vrcp14<ssescalarmodesuffix>\t{%2, %1, %0|, %1, %2}"
+  "vrcp14<ssescalarmodesuffix>\t{%1, %2, %0|%0, %2, %1}"
   [(set_attr "type" "sse")
    (set_attr "prefix" "evex")
    (set_attr "mode" "<MODE>")])
@@ -6570,7 +6569,7 @@ 
 	(vec_merge:VF_128
 	  (unspec:VF_128
 	    [(match_operand:VF_128 1 "register_operand" "v")
-	     (match_operand:VF_128 2 "nonimmediate_operand" "<round_constraint>")]
+	     (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>")]
 	    UNSPEC_SCALEF)
 	  (match_dup 1)
 	  (const_int 1)))]
@@ -6650,7 +6649,7 @@ 
 	(vec_merge:VF_128
 	  (unspec:VF_128
 	    [(match_operand:VF_128 1 "register_operand" "v")
-	     (match_operand:VF_128 2 "nonimmediate_operand" "<round_saeonly_constraint>")]
+	     (match_operand:VF_128 2 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")]
 	    UNSPEC_GETEXP)
 	  (match_dup 1)
 	  (const_int 1)))]
@@ -6815,7 +6814,7 @@ 
 	(vec_merge:VF_128
 	  (unspec:VF_128
 	    [(match_operand:VF_128 1 "register_operand" "v")
-	     (match_operand:VF_128 2 "nonimmediate_operand" "<round_saeonly_constraint>")
+	     (match_operand:VF_128 2 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
 	     (match_operand:SI 3 "const_0_to_255_operand")]
 	    UNSPEC_ROUND)
 	  (match_dup 1)
@@ -11499,14 +11498,14 @@ 
    (set_attr "mode" "<MODE>")])
 
 (define_insn "<sse4_1_avx2>_movntdqa"
-  [(set (match_operand:VI8_AVX2 0 "register_operand" "=x")
-	(unspec:VI8_AVX2 [(match_operand:VI8_AVX2 1 "memory_operand" "m")]
+  [(set (match_operand:VI8_AVX2_AVX512F 0 "register_operand" "=x, v")
+	(unspec:VI8_AVX2_AVX512F [(match_operand:VI8_AVX2_AVX512F 1 "memory_operand" "m, m")]
 		     UNSPEC_MOVNTDQA))]
   "TARGET_SSE4_1"
   "%vmovntdqa\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssemov")
-   (set_attr "prefix_extra" "1")
-   (set_attr "prefix" "maybe_vex")
+   (set_attr "prefix_extra" "1, *")
+   (set_attr "prefix" "maybe_vex, evex")
    (set_attr "mode" "<sseinsnmode>")])
 
 (define_insn "<sse4_1_avx2>_mpsadbw"
@@ -12635,36 +12634,64 @@ 
    (set_attr "prefix" "evex")
    (set_attr "mode" "XI")])
 
-(define_insn "avx512er_exp2<mode><mask_name><round_name>"
+(define_insn "avx512er_exp2<mode><mask_name><round_saeonly_name>"
   [(set (match_operand:VF_512 0 "register_operand" "=v")
 	(unspec:VF_512
-	  [(match_operand:VF_512 1 "<round_nimm_predicate>" "<round_constraint>")]
+	  [(match_operand:VF_512 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")]
 	  UNSPEC_EXP2))]
   "TARGET_AVX512ER"
-  "vexp2<ssemodesuffix>\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}"
+  "vexp2<ssemodesuffix>\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}"
   [(set_attr "prefix" "evex")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "<mask_codefor>avx512er_rcp28<mode><mask_name><round_name>"
+(define_insn "<mask_codefor>avx512er_rcp28<mode><mask_name><round_saeonly_name>"
   [(set (match_operand:VF_512 0 "register_operand" "=v")
 	(unspec:VF_512
-	  [(match_operand:VF_512 1 "<round_nimm_predicate>" "<round_constraint>")]
+	  [(match_operand:VF_512 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")]
 	  UNSPEC_RCP28))]
   "TARGET_AVX512ER"
-  "vrcp28<ssemodesuffix>\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}"
+  "vrcp28<ssemodesuffix>\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}"
   [(set_attr "prefix" "evex")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "<mask_codefor>avx512er_rsqrt28<mode><mask_name><round_name>"
+(define_insn "avx512er_vmrcp28<mode><round_saeonly_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+	(vec_merge:VF_128
+	  (unspec:VF_128
+	    [(match_operand:VF_128 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")]
+	    UNSPEC_RCP28)
+	  (match_operand:VF_128 2 "register_operand" "v")
+	  (const_int 1)))]
+  "TARGET_AVX512ER"
+  "vrcp28<ssescalarmodesuffix>\t{<round_saeonly_op3>%2, %1, %0|%0, %1, %2<round_saeonly_op3>}"
+  [(set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "<mask_codefor>avx512er_rsqrt28<mode><mask_name><round_saeonly_name>"
   [(set (match_operand:VF_512 0 "register_operand" "=v")
 	(unspec:VF_512
-	  [(match_operand:VF_512 1 "<round_nimm_predicate>" "<round_constraint>")]
+	  [(match_operand:VF_512 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")]
 	  UNSPEC_RSQRT28))]
   "TARGET_AVX512ER"
-  "vrsqrt28<ssemodesuffix>\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}"
+  "vrsqrt28<ssemodesuffix>\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}"
   [(set_attr "prefix" "evex")
    (set_attr "mode" "<MODE>")])
 
+(define_insn "avx512er_vmrsqrt28<mode><round_saeonly_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+	(vec_merge:VF_128
+	  (unspec:VF_128
+	    [(match_operand:VF_128 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")]
+	    UNSPEC_RSQRT28)
+	  (match_operand:VF_128 2 "register_operand" "v")
+	  (const_int 1)))]
+  "TARGET_AVX512ER"
+  "vrsqrt28<ssescalarmodesuffix>\t{<round_saeonly_op3>%2, %1, %0|%0, %1, %2<round_saeonly_op3>}"
+  [(set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<MODE>")])
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; XOP instructions
@@ -15201,7 +15228,7 @@ 
 	(vec_merge:VF_128
 	  (unspec:VF_128
 	    [(match_operand:VF_128 1 "register_operand" "v")
-	     (match_operand:VF_128 2 "nonimmediate_operand" "<round_saeonly_constraint>")
+	     (match_operand:VF_128 2 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
 	     (match_operand:SI 3 "const_0_to_15_operand")]
 	    UNSPEC_GETMANT)
 	  (match_dup 1)
diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md
index 7fd3948..7948e78 100644
--- a/gcc/config/i386/subst.md
+++ b/gcc/config/i386/subst.md
@@ -133,8 +133,6 @@ 
 (define_subst_attr "round_saeonly_name" "round_saeonly" "" "_round")
 (define_subst_attr "round_saeonly_mask_operand2" "mask" "%R2" "%R4")
 (define_subst_attr "round_saeonly_mask_operand3" "mask" "%R3" "%R5")
-(define_subst_attr "round_saeonly_mask_scalar_operand3" "mask_scalar" "%R3" "%R5")
-(define_subst_attr "round_saeonly_mask_scalar_operand4" "mask_scalar" "%R4" "%R6")
 (define_subst_attr "round_saeonly_mask_scalar_merge_operand4" "mask_scalar_merge" "%R4" "%R5")
 (define_subst_attr "round_saeonly_sd_mask_operand5" "sd" "%R5" "%R7")
 (define_subst_attr "round_saeonly_op2" "round_saeonly" "" "%R2")
@@ -145,8 +143,6 @@ 
 (define_subst_attr "round_saeonly_prefix" "round_saeonly" "vex" "evex")
 (define_subst_attr "round_saeonly_mask_op2" "round_saeonly" "" "<round_saeonly_mask_operand2>")
 (define_subst_attr "round_saeonly_mask_op3" "round_saeonly" "" "<round_saeonly_mask_operand3>")
-(define_subst_attr "round_saeonly_mask_scalar_op3" "round_saeonly" "" "<round_saeonly_mask_scalar_operand3>")
-(define_subst_attr "round_saeonly_mask_scalar_op4" "round_saeonly" "" "<round_saeonly_mask_scalar_operand4>")
 (define_subst_attr "round_saeonly_mask_scalar_merge_op4" "round_saeonly" "" "<round_saeonly_mask_scalar_merge_operand4>")
 (define_subst_attr "round_saeonly_sd_mask_op5" "round_saeonly" "" "<round_saeonly_sd_mask_operand5>")
 (define_subst_attr "round_saeonly_constraint" "round_saeonly" "vm" "v")
diff --git a/gcc/testsuite/gcc.target/i386/avx-1.c b/gcc/testsuite/gcc.target/i386/avx-1.c
index 7201592..12674ad 100644
--- a/gcc/testsuite/gcc.target/i386/avx-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx-1.c
@@ -344,12 +344,20 @@ 
 #define __builtin_ia32_vfnmsubps512_mask3(A, B, C, D, E) __builtin_ia32_vfnmsubps512_mask3(A, B, C, D, 1)
 #define __builtin_ia32_vpermilpd512_mask(A, E, C, D) __builtin_ia32_vpermilpd512_mask(A, 1, C, D)
 #define __builtin_ia32_vpermilps512_mask(A, E, C, D) __builtin_ia32_vpermilps512_mask(A, 1, C, D)
-#define __builtin_ia32_exp2ps_mask(A, B, C, D) __builtin_ia32_exp2ps_mask(A, B, C, 1)
-#define __builtin_ia32_exp2pd_mask(A, B, C, D) __builtin_ia32_exp2pd_mask(A, B, C, 1)
-#define __builtin_ia32_rcp28ps_mask(A, B, C, D) __builtin_ia32_exp2ps_mask(A, B, C, 1)
-#define __builtin_ia32_rcp28pd_mask(A, B, C, D) __builtin_ia32_exp2pd_mask(A, B, C, 1)
-#define __builtin_ia32_rsqrt28ps_mask(A, B, C, D) __builtin_ia32_rsqrt28ps_mask(A, B, C, 1)
-#define __builtin_ia32_rsqrt28pd_mask(A, B, C, D) __builtin_ia32_rsqrt28pd_mask(A, B, C, 1)
+
+/* avx512erintrin.h */
+#define __builtin_ia32_exp2ps_mask(A, B, C, D) __builtin_ia32_exp2ps_mask(A, B, C, 5)
+#define __builtin_ia32_exp2pd_mask(A, B, C, D) __builtin_ia32_exp2pd_mask(A, B, C, 5)
+#define __builtin_ia32_rcp28ps_mask(A, B, C, D) __builtin_ia32_rcp28ps_mask(A, B, C, 5)
+#define __builtin_ia32_rcp28pd_mask(A, B, C, D) __builtin_ia32_rcp28pd_mask(A, B, C, 5)
+#define __builtin_ia32_rsqrt28ps_mask(A, B, C, D) __builtin_ia32_rsqrt28ps_mask(A, B, C, 5)
+#define __builtin_ia32_rsqrt28pd_mask(A, B, C, D) __builtin_ia32_rsqrt28pd_mask(A, B, C, 5)
+#define __builtin_ia32_rcp28ss_round(A, B, C) __builtin_ia32_rcp28ss_round(A, B, 5)
+#define __builtin_ia32_rcp28sd_round(A, B, C) __builtin_ia32_rcp28sd_round(A, B, 5)
+#define __builtin_ia32_rsqrt28ss_round(A, B, C) __builtin_ia32_rsqrt28ss_round(A, B, 5)
+#define __builtin_ia32_rsqrt28sd_round(A, B, C) __builtin_ia32_rsqrt28sd_round(A, B, 5)
+
+/* avx512pfintrin.h */
 #define __builtin_ia32_gatherpfdps(A, B, C, D, E) __builtin_ia32_gatherpfdps(A, B, C, 1, 1)
 #define __builtin_ia32_gatherpfqps(A, B, C, D, E) __builtin_ia32_gatherpfqps(A, B, C, 1, 1)
 #define __builtin_ia32_scatterpfdps(A, B, C, D, E) __builtin_ia32_scatterpfdps(A, B, C, 1, 1)
diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vexp2pd-1.c b/gcc/testsuite/gcc.target/i386/avx512er-vexp2pd-1.c
index 9fb87cf..22c086d 100644
--- a/gcc/testsuite/gcc.target/i386/avx512er-vexp2pd-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512er-vexp2pd-1.c
@@ -3,9 +3,9 @@ 
 /* { dg-final { scan-assembler-times "vexp2pd\[ \\t\]+\[^\n\]*%zmm\[0-9\]\[\\n\]" 2 } } */
 /* { dg-final { scan-assembler-times "vexp2pd\[ \\t\]+\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\[^\{\]" 2 } } */
 /* { dg-final { scan-assembler-times "vexp2pd\[ \\t\]+\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\{z\}" 2 } } */
-/* { dg-final { scan-assembler-times "vexp2pd\[ \\t\]+\[^\n\]*\{rn-sae\}\[^\n\]*%zmm\[0-9\]" 1 } } */
-/* { dg-final { scan-assembler-times "vexp2pd\[ \\t\]+\[^\n\]*\{rd-sae\}\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\[^\{\]" 1 } } */
-/* { dg-final { scan-assembler-times "vexp2pd\[ \\t\]+\[^\n\]*\{rz-sae\}\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\{z\}" 1 } } */
+/* { dg-final { scan-assembler-times "vexp2pd\[ \\t\]+\[^\n\]*\{sae\}\[^\n\]*%zmm\[0-9\]\[^\{\]*\n" 1 } } */
+/* { dg-final { scan-assembler-times "vexp2pd\[ \\t\]+\[^\n\]*\{sae\}\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\[^\{\]" 1 } } */
+/* { dg-final { scan-assembler-times "vexp2pd\[ \\t\]+\[^\n\]*\{sae\}\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\{z\}" 1 } } */
 
 #include <immintrin.h>
 
@@ -18,7 +18,7 @@  avx512er_test (void)
   x = _mm512_exp2a23_pd (x);
   x = _mm512_mask_exp2a23_pd (x, m, x);
   x = _mm512_maskz_exp2a23_pd (m, x);
-  x = _mm512_exp2a23_round_pd (x, _MM_FROUND_TO_NEAREST_INT);
-  x = _mm512_mask_exp2a23_round_pd (x, m, x, _MM_FROUND_TO_NEG_INF);
-  x = _mm512_maskz_exp2a23_round_pd (m, x, _MM_FROUND_TO_ZERO);
+  x = _mm512_exp2a23_round_pd (x, _MM_FROUND_NO_EXC);
+  x = _mm512_mask_exp2a23_round_pd (x, m, x, _MM_FROUND_NO_EXC);
+  x = _mm512_maskz_exp2a23_round_pd (m, x, _MM_FROUND_NO_EXC);
 }
diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vexp2ps-1.c b/gcc/testsuite/gcc.target/i386/avx512er-vexp2ps-1.c
index a7e7009e..9d1178e 100644
--- a/gcc/testsuite/gcc.target/i386/avx512er-vexp2ps-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512er-vexp2ps-1.c
@@ -3,9 +3,9 @@ 
 /* { dg-final { scan-assembler-times "vexp2ps\[ \\t\]+\[^\n\]*%zmm\[0-9\]\[\\n\]" 2 } } */
 /* { dg-final { scan-assembler-times "vexp2ps\[ \\t\]+\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\[^\{\]" 2 } } */
 /* { dg-final { scan-assembler-times "vexp2ps\[ \\t\]+\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\{z\}" 2 } } */
-/* { dg-final { scan-assembler-times "vexp2ps\[ \\t\]+\[^\n\]*\{rn-sae\}\[^\n\]*%zmm\[0-9\]" 1 } } */
-/* { dg-final { scan-assembler-times "vexp2ps\[ \\t\]+\[^\n\]*\{ru-sae\}\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\[^\{\]" 1 } } */
-/* { dg-final { scan-assembler-times "vexp2ps\[ \\t\]+\[^\n\]*\{rz-sae\}\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\{z\}" 1 } } */
+/* { dg-final { scan-assembler-times "vexp2ps\[ \\t\]+\[^\n\]*\{sae\}\[^\n\]*%zmm\[0-9\]\[^\{\]*\n" 1 } } */
+/* { dg-final { scan-assembler-times "vexp2ps\[ \\t\]+\[^\n\]*\{sae\}\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\[^\{\]" 1 } } */
+/* { dg-final { scan-assembler-times "vexp2ps\[ \\t\]+\[^\n\]*\{sae\}\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\{z\}" 1 } } */
 
 #include <immintrin.h>
 
@@ -18,7 +18,7 @@  avx512er_test (void)
   x = _mm512_exp2a23_ps (x);
   x = _mm512_mask_exp2a23_ps (x, m, x);
   x = _mm512_maskz_exp2a23_ps (m, x);
-  x = _mm512_exp2a23_round_ps (x, _MM_FROUND_TO_NEAREST_INT);
-  x = _mm512_mask_exp2a23_round_ps (x, m, x, _MM_FROUND_TO_POS_INF);
-  x = _mm512_maskz_exp2a23_round_ps (m, x, _MM_FROUND_TO_ZERO);
+  x = _mm512_exp2a23_round_ps (x, _MM_FROUND_NO_EXC);
+  x = _mm512_mask_exp2a23_round_ps (x, m, x, _MM_FROUND_NO_EXC);
+  x = _mm512_maskz_exp2a23_round_ps (m, x, _MM_FROUND_NO_EXC);
 }
diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28pd-1.c b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28pd-1.c
index 06b6160..505c0eb 100644
--- a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28pd-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28pd-1.c
@@ -3,9 +3,9 @@ 
 /* { dg-final { scan-assembler-times "vrcp28pd\[ \\t\]+\[^\n\]*%zmm\[0-9\]\[\\n\]" 2 } } */
 /* { dg-final { scan-assembler-times "vrcp28pd\[ \\t\]+\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\[^\{\]" 2 } } */
 /* { dg-final { scan-assembler-times "vrcp28pd\[ \\t\]+\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\{z\}" 2 } } */
-/* { dg-final { scan-assembler-times "vrcp28pd\[ \\t\]+\[^\n\]*\{rn-sae\}\[^\n\]*%zmm\[0-9\]" 1 } } */
-/* { dg-final { scan-assembler-times "vrcp28pd\[ \\t\]+\[^\n\]*\{rd-sae\}\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\[^\{\]" 1 } } */
-/* { dg-final { scan-assembler-times "vrcp28pd\[ \\t\]+\[^\n\]*\{rz-sae\}\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\{z\}" 1 } } */
+/* { dg-final { scan-assembler-times "vrcp28pd\[ \\t\]+\[^\n\]*\{sae\}\[^\n\]*%zmm\[0-9\]\[^\{\]*\n" 1 } } */
+/* { dg-final { scan-assembler-times "vrcp28pd\[ \\t\]+\[^\n\]*\{sae\}\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\[^\{\]" 1 } } */
+/* { dg-final { scan-assembler-times "vrcp28pd\[ \\t\]+\[^\n\]*\{sae\}\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\{z\}" 1 } } */
 
 #include <immintrin.h>
 
@@ -18,7 +18,7 @@  avx512er_test (void)
   x = _mm512_rcp28_pd (x);
   x = _mm512_mask_rcp28_pd (x, m, x);
   x = _mm512_maskz_rcp28_pd (m, x);
-  x = _mm512_rcp28_round_pd (x, _MM_FROUND_TO_NEAREST_INT);
-  x = _mm512_mask_rcp28_round_pd (x, m, x, _MM_FROUND_TO_NEG_INF);
-  x = _mm512_maskz_rcp28_round_pd (m, x, _MM_FROUND_TO_ZERO);
+  x = _mm512_rcp28_round_pd (x, _MM_FROUND_NO_EXC);
+  x = _mm512_mask_rcp28_round_pd (x, m, x, _MM_FROUND_NO_EXC);
+  x = _mm512_maskz_rcp28_round_pd (m, x, _MM_FROUND_NO_EXC);
 }
diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-1.c b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-1.c
index 023d6b2..e9245ba 100644
--- a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-1.c
@@ -3,9 +3,9 @@ 
 /* { dg-final { scan-assembler-times "vrcp28ps\[ \\t\]+\[^\n\]*%zmm\[0-9\]\[\\n\]" 2 } } */
 /* { dg-final { scan-assembler-times "vrcp28ps\[ \\t\]+\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\[^\{\]" 2 } } */
 /* { dg-final { scan-assembler-times "vrcp28ps\[ \\t\]+\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\{z\}" 2 } } */
-/* { dg-final { scan-assembler-times "vrcp28ps\[ \\t\]+\[^\n\]*\{rn-sae\}\[^\n\]*%zmm\[0-9\]" 1 } } */
-/* { dg-final { scan-assembler-times "vrcp28ps\[ \\t\]+\[^\n\]*\{ru-sae\}\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\[^\{\]" 1 } } */
-/* { dg-final { scan-assembler-times "vrcp28ps\[ \\t\]+\[^\n\]*\{rz-sae\}\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\{z\}" 1 } } */
+/* { dg-final { scan-assembler-times "vrcp28ps\[ \\t\]+\[^\n\]*\{sae\}\[^\n\]*%zmm\[0-9\]\[^\{\]*\n" 1 } } */
+/* { dg-final { scan-assembler-times "vrcp28ps\[ \\t\]+\[^\n\]*\{sae\}\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\[^\{\]" 1 } } */
+/* { dg-final { scan-assembler-times "vrcp28ps\[ \\t\]+\[^\n\]*\{sae\}\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\{z\}" 1 } } */
 
 #include <immintrin.h>
 
@@ -18,7 +18,7 @@  avx512er_test (void)
   x = _mm512_rcp28_ps (x);
   x = _mm512_mask_rcp28_ps (x, m, x);
   x = _mm512_maskz_rcp28_ps (m, x);
-  x = _mm512_rcp28_round_ps (x, _MM_FROUND_TO_NEAREST_INT);
-  x = _mm512_mask_rcp28_round_ps (x, m, x, _MM_FROUND_TO_POS_INF);
-  x = _mm512_maskz_rcp28_round_ps (m, x, _MM_FROUND_TO_ZERO);
+  x = _mm512_rcp28_round_ps (x, _MM_FROUND_NO_EXC);
+  x = _mm512_mask_rcp28_round_ps (x, m, x, _MM_FROUND_NO_EXC);
+  x = _mm512_maskz_rcp28_round_ps (m, x, _MM_FROUND_NO_EXC);
 }
diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28sd-1.c b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28sd-1.c
new file mode 100644
index 0000000..d09ba57
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28sd-1.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512er -O2" } */
+/* { dg-final { scan-assembler-times "vrcp28sd\[ \\t\]+\[^\n\]*%xmm\[0-9\]\[\\n\]" 2 } } */
+/* { dg-final { scan-assembler-times "vrcp28sd\[ \\t\]+\[^\n\]*\{sae\}\[^\n\]*%xmm\[0-9\]\[^\{\]*\n" 1 } } */
+
+#include <immintrin.h>
+
+volatile __m128d x, y;
+
+void extern
+avx512er_test (void)
+{
+  x = _mm_rcp28_sd (x, y);
+  x = _mm_rcp28_round_sd (x, y, _MM_FROUND_NO_EXC);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28sd-2.c b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28sd-2.c
new file mode 100644
index 0000000..d30f088
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28sd-2.c
@@ -0,0 +1,29 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target avx512er } */
+/* { dg-options "-O2 -mavx512er" } */
+
+#include "avx512er-check.h"
+#include "avx512f-mask-type.h"
+#include "avx512f-helper.h"
+#include <math.h>
+
+void static
+avx512er_test (void)
+{
+  union128d src, res;
+  double res_ref[2];
+  int i;
+  
+  for (i = 0; i < 2; i++)
+    {
+      src.a[i] = 179.345 - 6.5645 * i;
+      res_ref[i] = src.a[i];
+    }
+
+  res_ref[0] = 1.0 / src.a[0];
+
+  res.x = _mm_rcp28_round_sd (src.x, src.x, _MM_FROUND_NO_EXC);
+
+  if (checkVd (res.a, res_ref, 2))
+    abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ss-1.c b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ss-1.c
new file mode 100644
index 0000000..3f5ccea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ss-1.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512er -O2" } */
+/* { dg-final { scan-assembler-times "vrcp28ss\[ \\t\]+\[^\n\]*%xmm\[0-9\]\[\\n\]" 2 } } */
+/* { dg-final { scan-assembler-times "vrcp28ss\[ \\t\]+\[^\n\]*\{sae\}\[^\n\]*%xmm\[0-9\]\[^\{\]*\n" 1 } } */
+
+#include <immintrin.h>
+
+volatile __m128 x, y;
+
+void extern
+avx512er_test (void)
+{
+  x = _mm_rcp28_ss (x, y);
+  x = _mm_rcp28_round_ss (x, y, _MM_FROUND_NO_EXC);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ss-2.c b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ss-2.c
new file mode 100644
index 0000000..499a977
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ss-2.c
@@ -0,0 +1,29 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target avx512er } */
+/* { dg-options "-O2 -mavx512er" } */
+
+#include "avx512er-check.h"
+#include "avx512f-mask-type.h"
+#include "avx512f-helper.h"
+#include <math.h>
+
+void static
+avx512er_test (void)
+{
+  union128 src, res;
+  float res_ref[4];
+  int i;
+  
+  for (i = 0; i < 4; i++)
+    {
+      src.a[i] = 179.345 - 6.5645 * i;
+      res_ref[i] = src.a[i];
+    }
+
+  res_ref[0] = 1.0 / src.a[0];
+
+  res.x = _mm_rsqrt28_round_ss (src.x, src.x, _MM_FROUND_NO_EXC);
+
+  if (checkVf (res.a, res_ref, 4))
+    abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28pd-1.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28pd-1.c
index dfb95b2..5d264ac 100644
--- a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28pd-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28pd-1.c
@@ -3,9 +3,9 @@ 
 /* { dg-final { scan-assembler-times "vrsqrt28pd\[ \\t\]+\[^\n\]*%zmm\[0-9\]\[\\n\]" 2 } } */
 /* { dg-final { scan-assembler-times "vrsqrt28pd\[ \\t\]+\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\[^\{\]" 2 } } */
 /* { dg-final { scan-assembler-times "vrsqrt28pd\[ \\t\]+\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\{z\}" 2 } } */
-/* { dg-final { scan-assembler-times "vrsqrt28pd\[ \\t\]+\[^\n\]*\{rn-sae\}\[^\n\]*%zmm\[0-9\]" 1 } } */
-/* { dg-final { scan-assembler-times "vrsqrt28pd\[ \\t\]+\[^\n\]*\{rd-sae\}\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\[^\{\]" 1 } } */
-/* { dg-final { scan-assembler-times "vrsqrt28pd\[ \\t\]+\[^\n\]*\{rz-sae\}\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\{z\}" 1 } } */
+/* { dg-final { scan-assembler-times "vrsqrt28pd\[ \\t\]+\[^\n\]*\{sae\}\[^\n\]*%zmm\[0-9\]\[^\{\]*\n" 1 } } */
+/* { dg-final { scan-assembler-times "vrsqrt28pd\[ \\t\]+\[^\n\]*\{sae\}\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\[^\{\]" 1 } } */
+/* { dg-final { scan-assembler-times "vrsqrt28pd\[ \\t\]+\[^\n\]*\{sae\}\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\{z\}" 1 } } */
 
 #include <immintrin.h>
 
@@ -18,7 +18,7 @@  avx512er_test (void)
   x = _mm512_rsqrt28_pd (x);
   x = _mm512_mask_rsqrt28_pd (x, m, x);
   x = _mm512_maskz_rsqrt28_pd (m, x);
-  x = _mm512_rsqrt28_round_pd (x, _MM_FROUND_TO_NEAREST_INT);
-  x = _mm512_mask_rsqrt28_round_pd (x, m, x, _MM_FROUND_TO_NEG_INF);
-  x = _mm512_maskz_rsqrt28_round_pd (m, x, _MM_FROUND_TO_ZERO);
+  x = _mm512_rsqrt28_round_pd (x, _MM_FROUND_NO_EXC);
+  x = _mm512_mask_rsqrt28_round_pd (x, m, x, _MM_FROUND_NO_EXC);
+  x = _mm512_maskz_rsqrt28_round_pd (m, x, _MM_FROUND_NO_EXC);
 }
diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-1.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-1.c
index ecd3a6f..bfdb9ac 100644
--- a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-1.c
@@ -3,9 +3,9 @@ 
 /* { dg-final { scan-assembler-times "vrsqrt28ps\[ \\t\]+\[^\n\]*%zmm\[0-9\]\[\\n\]" 2 } } */
 /* { dg-final { scan-assembler-times "vrsqrt28ps\[ \\t\]+\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\[^\{\]" 2 } } */
 /* { dg-final { scan-assembler-times "vrsqrt28ps\[ \\t\]+\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\{z\}" 2 } } */
-/* { dg-final { scan-assembler-times "vrsqrt28ps\[ \\t\]+\[^\n\]*\{rn-sae\}\[^\n\]*%zmm\[0-9\]" 1 } } */
-/* { dg-final { scan-assembler-times "vrsqrt28ps\[ \\t\]+\[^\n\]*\{ru-sae\}\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\[^\{\]" 1 } } */
-/* { dg-final { scan-assembler-times "vrsqrt28ps\[ \\t\]+\[^\n\]*\{rz-sae\}\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\{z\}" 1 } } */
+/* { dg-final { scan-assembler-times "vrsqrt28ps\[ \\t\]+\[^\n\]*\{sae\}\[^\n\]*%zmm\[0-9\]\[^\{\]*\n" 1 } } */
+/* { dg-final { scan-assembler-times "vrsqrt28ps\[ \\t\]+\[^\n\]*\{sae\}\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\[^\{\]" 1 } } */
+/* { dg-final { scan-assembler-times "vrsqrt28ps\[ \\t\]+\[^\n\]*\{sae\}\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\{z\}" 1 } } */
 
 #include <immintrin.h>
 
@@ -18,7 +18,7 @@  avx512er_test (void)
   x = _mm512_rsqrt28_ps (x);
   x = _mm512_mask_rsqrt28_ps (x, m, x);
   x = _mm512_maskz_rsqrt28_ps (m, x);
-  x = _mm512_rsqrt28_round_ps (x, _MM_FROUND_TO_NEAREST_INT);
-  x = _mm512_mask_rsqrt28_round_ps (x, m, x, _MM_FROUND_TO_POS_INF);
-  x = _mm512_maskz_rsqrt28_round_ps (m, x, _MM_FROUND_TO_ZERO);
+  x = _mm512_rsqrt28_round_ps (x, _MM_FROUND_NO_EXC);
+  x = _mm512_mask_rsqrt28_round_ps (x, m, x, _MM_FROUND_NO_EXC);
+  x = _mm512_maskz_rsqrt28_round_ps (m, x, _MM_FROUND_NO_EXC);
 }
diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28sd-1.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28sd-1.c
new file mode 100644
index 0000000..59dff78
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28sd-1.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512er -O2" } */
+/* { dg-final { scan-assembler-times "vrsqrt28sd\[ \\t\]+\[^\{^\n\]*%xmm\[0-9\]" 1 } } */
+/* { dg-final { scan-assembler-times "vrsqrt28sd\[ \\t\]+\[^\n\]*\{sae\}\[^\n\]*%xmm\[0-9\]" 1 } } */
+
+#include <immintrin.h>
+
+volatile __m128d x, y;
+
+void extern
+avx512er_test (void)
+{
+  x = _mm_rsqrt28_sd (x, y);
+  x = _mm_rsqrt28_round_sd (x, y, _MM_FROUND_NO_EXC);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28sd-2.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28sd-2.c
new file mode 100644
index 0000000..1537a59
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28sd-2.c
@@ -0,0 +1,29 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target avx512er } */
+/* { dg-options "-O2 -mavx512er" } */
+
+#include "avx512er-check.h"
+#include "avx512f-mask-type.h"
+#include "avx512f-helper.h"
+#include <math.h>
+
+void static
+avx512er_test (void)
+{
+  union128d src, res;
+  double res_ref[2];
+  int i;
+  
+  for (i = 0; i < 2; i++)
+    {
+      src.a[i] = 179.345 - 6.5645 * i;
+      res_ref[i] = src.a[i];
+    }
+
+  res_ref[0] = 1.0 / sqrt (src.a[0]);
+
+  res.x = _mm_rsqrt28_round_sd (src.x, src.x, _MM_FROUND_NO_EXC);
+
+  if (checkVd (res.a, res_ref, 2))
+    abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ss-1.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ss-1.c
new file mode 100644
index 0000000..a334375
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ss-1.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512er -O2" } */
+/* { dg-final { scan-assembler-times "vrsqrt28ss\[ \\t\]+\[^\{^\n\]*%xmm\[0-9\]" 1 } } */
+/* { dg-final { scan-assembler-times "vrsqrt28ss\[ \\t\]+\[^\n\]*\{sae\}\[^\n\]*%xmm\[0-9\]" 1 } } */
+
+#include <immintrin.h>
+
+volatile __m128 x, y;
+
+void extern
+avx512er_test (void)
+{
+  x = _mm_rsqrt28_ss (x, y);
+  x = _mm_rsqrt28_round_ss (x, y, _MM_FROUND_NO_EXC);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ss-2.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ss-2.c
new file mode 100644
index 0000000..f88422e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ss-2.c
@@ -0,0 +1,29 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target avx512er } */
+/* { dg-options "-O2 -mavx512er" } */
+
+#include "avx512er-check.h"
+#include "avx512f-mask-type.h"
+#include "avx512f-helper.h"
+#include <math.h>
+
+void static
+avx512er_test (void)
+{
+  union128 src, res;
+  float res_ref[4];
+  int i;
+  
+  for (i = 0; i < 4; i++)
+    {
+      src.a[i] = 179.345 - 6.5645 * i;
+      res_ref[i] = src.a[i];
+    }
+
+  res_ref[0] = 1.0 / sqrt (src.a[0]);
+
+  res.x = _mm_rsqrt28_round_ss (src.x, src.x, _MM_FROUND_NO_EXC);
+
+  if (checkVf (res.a, res_ref, 4))
+    abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vmovntdqa-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vmovntdqa-1.c
new file mode 100644
index 0000000..d5be976
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vmovntdqa-1.c
@@ -0,0 +1,14 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2" } */
+/* { dg-final { scan-assembler "vmovntdqa\[ \\t\]+\[^\n\]*%zmm\[0-9\]" } } */
+
+#include <immintrin.h>
+
+__m512i *x;
+volatile __m512i y;
+
+void extern
+avx512f_test (void)
+{
+  y = _mm512_stream_load_si512 (x);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vmovntdqa-2.c b/gcc/testsuite/gcc.target/i386/avx512f-vmovntdqa-2.c
new file mode 100644
index 0000000..0825781
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vmovntdqa-2.c
@@ -0,0 +1,17 @@ 
+/* { dg-do run } */
+/* { dg-options "-mavx512f -O2" } */
+/* { dg-require-effective-target avx512f } */
+
+#include "avx512f-check.h"
+
+void static
+avx512f_test (void)
+{
+  union512i_q s, res;
+
+  s.x = _mm512_set_epi64 (39578, -429496, 7856, 0, 85632, -1234, 47563, -1);
+  res.x = _mm512_stream_load_si512 (&s.x);
+
+  if (check_union512i_q (s, res.a))
+    abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vrcp14sd-2.c b/gcc/testsuite/gcc.target/i386/avx512f-vrcp14sd-2.c
index 9ff3541..0c9211a 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-vrcp14sd-2.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vrcp14sd-2.c
@@ -8,8 +8,8 @@ 
 static void
 compute_vrcp14sd (double *s1, double *s2, double *r)
 {
-  r[0] = 1.0 / s2[0];
-  r[1] = s1[1];
+  r[0] = 1.0 / s1[0];
+  r[1] = s2[1];
 }
 
 static void
@@ -26,6 +26,6 @@  avx512f_test (void)
 
   compute_vrcp14sd (s1.a, s2.a, res_ref);
 
-  if (check_union128d (res1, res_ref))
+  if (checkVd (res1.a, res_ref, 2))
     abort ();
 }
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vrcp14ss-2.c b/gcc/testsuite/gcc.target/i386/avx512f-vrcp14ss-2.c
index fe8989a..3344dad 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-vrcp14ss-2.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vrcp14ss-2.c
@@ -8,10 +8,10 @@ 
 static void
 compute_vrcp14ss (float *s1, float *s2, float *r)
 {
-  r[0] = 1.0 / s2[0];
-  r[1] = s1[1];
-  r[2] = s1[2];
-  r[3] = s1[3];
+  r[0] = 1.0 / s1[0];
+  r[1] = s2[1];
+  r[2] = s2[2];
+  r[3] = s2[3];
 }
 
 static void
@@ -28,6 +28,6 @@  avx512f_test (void)
 
   compute_vrcp14ss (s1.a, s2.a, res_ref);
 
-  if (check_union128 (res1, res_ref))
+  if (checkVf (res1.a, res_ref, 4))
     abort ();
 }
diff --git a/gcc/testsuite/gcc.target/i386/sse-22.c b/gcc/testsuite/gcc.target/i386/sse-22.c
index 05b4af0..630c952 100644
--- a/gcc/testsuite/gcc.target/i386/sse-22.c
+++ b/gcc/testsuite/gcc.target/i386/sse-22.c
@@ -647,24 +647,28 @@  test_3vx (_mm512_mask_prefetch_i64gather_ps, __m512i, __mmask8, void const *, 1,
 test_3vx (_mm512_mask_prefetch_i64scatter_ps, void const *, __mmask8, __m512i, 1, 1)
 
 /* avx512erintrin.h */
-test_1 (_mm512_exp2a23_round_pd, __m512d, __m512d, 1)
-test_1 (_mm512_exp2a23_round_ps, __m512, __m512, 1)
-test_1 (_mm512_rcp28_round_pd, __m512d, __m512d, 1)
-test_1 (_mm512_rcp28_round_ps, __m512, __m512, 1)
-test_1 (_mm512_rsqrt28_round_pd, __m512d, __m512d, 1)
-test_1 (_mm512_rsqrt28_round_ps, __m512, __m512, 1)
-test_2 (_mm512_maskz_exp2a23_round_pd, __m512d, __mmask8, __m512d, 1)
-test_2 (_mm512_maskz_exp2a23_round_ps, __m512, __mmask16, __m512, 1)
-test_2 (_mm512_maskz_rcp28_round_pd, __m512d, __mmask8, __m512d, 1)
-test_2 (_mm512_maskz_rcp28_round_ps, __m512, __mmask16, __m512, 1)
-test_2 (_mm512_maskz_rsqrt28_round_pd, __m512d, __mmask8, __m512d, 1)
-test_2 (_mm512_maskz_rsqrt28_round_ps, __m512, __mmask16, __m512, 1)
-test_3 (_mm512_mask_exp2a23_round_pd, __m512d, __m512d, __mmask8, __m512d, 1)
-test_3 (_mm512_mask_exp2a23_round_ps, __m512, __m512, __mmask16, __m512, 1)
-test_3 (_mm512_mask_rcp28_round_pd, __m512d, __m512d, __mmask8, __m512d, 1)
-test_3 (_mm512_mask_rcp28_round_ps, __m512, __m512, __mmask16, __m512, 1)
-test_3 (_mm512_mask_rsqrt28_round_pd, __m512d, __m512d, __mmask8, __m512d, 1)
-test_3 (_mm512_mask_rsqrt28_round_ps, __m512, __m512, __mmask16, __m512, 1)
+test_1 (_mm512_exp2a23_round_pd, __m512d, __m512d, 5)
+test_1 (_mm512_exp2a23_round_ps, __m512, __m512, 5)
+test_1 (_mm512_rcp28_round_pd, __m512d, __m512d, 5)
+test_1 (_mm512_rcp28_round_ps, __m512, __m512, 5)
+test_1 (_mm512_rsqrt28_round_pd, __m512d, __m512d, 5)
+test_1 (_mm512_rsqrt28_round_ps, __m512, __m512, 5)
+test_2 (_mm512_maskz_exp2a23_round_pd, __m512d, __mmask8, __m512d, 5)
+test_2 (_mm512_maskz_exp2a23_round_ps, __m512, __mmask16, __m512, 5)
+test_2 (_mm512_maskz_rcp28_round_pd, __m512d, __mmask8, __m512d, 5)
+test_2 (_mm512_maskz_rcp28_round_ps, __m512, __mmask16, __m512, 5)
+test_2 (_mm512_maskz_rsqrt28_round_pd, __m512d, __mmask8, __m512d, 5)
+test_2 (_mm512_maskz_rsqrt28_round_ps, __m512, __mmask16, __m512, 5)
+test_3 (_mm512_mask_exp2a23_round_pd, __m512d, __m512d, __mmask8, __m512d, 5)
+test_3 (_mm512_mask_exp2a23_round_ps, __m512, __m512, __mmask16, __m512, 5)
+test_3 (_mm512_mask_rcp28_round_pd, __m512d, __m512d, __mmask8, __m512d, 5)
+test_3 (_mm512_mask_rcp28_round_ps, __m512, __m512, __mmask16, __m512, 5)
+test_3 (_mm512_mask_rsqrt28_round_pd, __m512d, __m512d, __mmask8, __m512d, 5)
+test_3 (_mm512_mask_rsqrt28_round_ps, __m512, __m512, __mmask16, __m512, 5)
+test_2 (_mm_rcp28_round_sd, __m128d, __m128d, __m128d, 5)
+test_2 (_mm_rcp28_round_ss, __m128, __m128, __m128, 5)
+test_2 (_mm_rsqrt28_round_sd, __m128d, __m128d, __m128d, 5)
+test_2 (_mm_rsqrt28_round_ss, __m128, __m128, __m128, 5)
 
 /* shaintrin.h */
 test_2 (_mm_sha1rnds4_epu32, __m128i, __m128i, __m128i, 1)
diff --git a/gcc/testsuite/gcc.target/i386/sse-23.c b/gcc/testsuite/gcc.target/i386/sse-23.c
index a6a7b39..309cd73 100644
--- a/gcc/testsuite/gcc.target/i386/sse-23.c
+++ b/gcc/testsuite/gcc.target/i386/sse-23.c
@@ -367,12 +367,16 @@ 
 #define __builtin_ia32_scatterpfqps(A, B, C, D, E) __builtin_ia32_scatterpfqps(A, B, C, 1, 1)
 
 /* avx512erintrin.h */
-#define __builtin_ia32_exp2pd_mask(A, B, C, D) __builtin_ia32_exp2pd_mask (A, B, C, 1)
-#define __builtin_ia32_exp2ps_mask(A, B, C, D) __builtin_ia32_exp2ps_mask (A, B, C, 1)
-#define __builtin_ia32_rcp28pd_mask(A, B, C, D) __builtin_ia32_rcp28pd_mask (A, B, C, 1)
-#define __builtin_ia32_rcp28ps_mask(A, B, C, D) __builtin_ia32_rcp28ps_mask (A, B, C, 1)
-#define __builtin_ia32_rsqrt28pd_mask(A, B, C, D) __builtin_ia32_rsqrt28pd_mask (A, B, C, 1)
-#define __builtin_ia32_rsqrt28ps_mask(A, B, C, D) __builtin_ia32_rsqrt28ps_mask (A, B, C, 1)
+#define __builtin_ia32_exp2pd_mask(A, B, C, D) __builtin_ia32_exp2pd_mask (A, B, C, 5)
+#define __builtin_ia32_exp2ps_mask(A, B, C, D) __builtin_ia32_exp2ps_mask (A, B, C, 5)
+#define __builtin_ia32_rcp28pd_mask(A, B, C, D) __builtin_ia32_rcp28pd_mask (A, B, C, 5)
+#define __builtin_ia32_rcp28ps_mask(A, B, C, D) __builtin_ia32_rcp28ps_mask (A, B, C, 5)
+#define __builtin_ia32_rsqrt28pd_mask(A, B, C, D) __builtin_ia32_rsqrt28pd_mask (A, B, C, 5)
+#define __builtin_ia32_rsqrt28ps_mask(A, B, C, D) __builtin_ia32_rsqrt28ps_mask (A, B, C, 5)
+#define __builtin_ia32_rcp28sd_round(A, B, C) __builtin_ia32_rcp28sd_round(A, B, 5)
+#define __builtin_ia32_rcp28ss_round(A, B, C) __builtin_ia32_rcp28ss_round(A, B, 5)
+#define __builtin_ia32_rsqrt28sd_round(A, B, C) __builtin_ia32_rsqrt28sd_round(A, B, 5)
+#define __builtin_ia32_rsqrt28ss_round(A, B, C) __builtin_ia32_rsqrt28ss_round(A, B, 5)
 
 /* shaintrin.h */
 #define __builtin_ia32_sha1rnds4(A, B, C) __builtin_ia32_sha1rnds4(A, B, 1)