
Add missing avx512fintrin.h intrinsics (PR target/89602)

Message-ID: 20190306234925.GQ7611@tucnak
State: New
Series: Add missing avx512fintrin.h intrinsics (PR target/89602)

Commit Message

Jakub Jelinek March 6, 2019, 11:49 p.m. UTC
Hi!

The following patch adds vmovss/vmovsd masked intrinsics.
On
#include <x86intrin.h>
__m128 f1 (__m128 w, __mmask8 u, const float *p) { return _mm_mask_load_ss (w, u, p); }
__m128 f2 (__mmask8 u, const float *p) { return _mm_maskz_load_ss (u, p); }
__m128d f3 (__m128d w, __mmask8 u, const double *p) { return _mm_mask_load_sd (w, u, p); }
__m128d f4 (__mmask8 u, const double *p) { return _mm_maskz_load_sd (u, p); }
__m128 f5 (__m128 w, __mmask8 u, __m128 a, __m128 b) { return _mm_mask_move_ss (w, u, a, b); }
__m128 f6 (__mmask8 u, __m128 a, __m128 b) { return _mm_maskz_move_ss (u, a, b); }
__m128d f7 (__m128d w, __mmask8 u, __m128d a, __m128d b) { return _mm_mask_move_sd (w, u, a, b); }
__m128d f8 (__mmask8 u, __m128d a, __m128d b) { return _mm_maskz_move_sd (u, a, b); }
void f9 (float *p, __mmask8 u, __m128 a) { _mm_mask_store_ss (p, u, a); }
void f10 (double *p, __mmask8 u, __m128d a) { _mm_mask_store_sd (p, u, a); }
it generates the same assembly with -O2 -mavx512f as icc 19 or clang trunk.
It mostly does a good job also when the mask is constant; on the above
testcase with the u arguments replaced with 1 I get:
f1: vmovss (%rsi), %xmm0
f2: vmovss (%rsi), %xmm0
f3: vmovq (%rsi), %xmm0
f4: movzbl .LC0(%rip), %eax; kmovw %eax, %k1; vmovsd (%rsi), %xmm0{%k1}{z}
f5: vmovss %xmm2, %xmm1, %xmm0
f6: vmovss %xmm1, %xmm0, %xmm0
f7: vmovsd %xmm2, %xmm1, %xmm0
f8: vmovsd %xmm1, %xmm0, %xmm0
f9: vmovss %xmm0, (%rdi)
f10: vmovlpd %xmm0, (%rdi)
Except for f4, that looks reasonable to me (and, as tested in the testsuite,
works too); for f4 I guess we either need to improve simplify-rtx.c or add
some pattern for the combiner.  That can be handled as a follow-up.
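As an illustration of why no mask register should be needed there (this
is just a sketch of the intended semantics, not code from the patch):
with a constant 1 mask, the zero-masked scalar load is equivalent to a
plain scalar load, so f4 could in principle compile to the same single
vmovsd as e.g.
__m128d f4_ideal (const double *p)
{
  /* element 0 = *p, element 1 = 0.0; a single vmovsd load at -O2 */
  return _mm_set_sd (*p);
}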
When instead using a 0 mask, I get:
f1: kxorw %k1, %k1, %k1; vmovss (%rsi), %xmm0{%k1}
f2: vxorps %xmm0, %xmm0, %xmm0
f3: kxorw %k1, %k1, %k1; vmovsd (%rsi), %xmm0{%k1}
f4: vxorpd %xmm0, %xmm0, %xmm0
f5: vmovss %xmm0, %xmm1, %xmm0
f6: kxorw %k1, %k1, %k1; vmovss %xmm1, %xmm0, %xmm0{%k1}{z}
f7: vmovsd %xmm0, %xmm1, %xmm0
f8: kxorw %k1, %k1, %k1; vmovsd %xmm1, %xmm0, %xmm0{%k1}{z}
f9: nothing
f10: nothing
which looks good to me.  For f1/f3/f6/f8, I really have no idea if there is
some single insn that could do that kind of operation.  This is also tested
at runtime in the testsuite.
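
For reference, here is a rough scalar model of what the new ss intrinsics
compute (just a sketch using GNU vector extensions, not code from the
patch; the sd variants are analogous, and the maskz forms are the mask
forms with a zero vector as the first argument):

#include <immintrin.h>

static __m128
model_mask_load_ss (__m128 __W, __mmask8 __U, const float *__P)
{
  /* Elements 1..3 of the result are always zeroed.  */
  __v4sf __r = { 0.0f, 0.0f, 0.0f, 0.0f };
  __r[0] = (__U & 1) ? *__P : ((__v4sf) __W)[0];
  return (__m128) __r;
}

static __m128
model_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  /* Elements 1..3 of the result come from __A.  */
  __v4sf __r = (__v4sf) __A;
  __r[0] = (__U & 1) ? ((__v4sf) __B)[0] : ((__v4sf) __W)[0];
  return (__m128) __r;
}

static void
model_mask_store_ss (float *__P, __mmask8 __U, __m128 __A)
{
  if (__U & 1)
    *__P = ((__v4sf) __A)[0];   /* otherwise memory is left untouched */
}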

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2019-03-07  Jakub Jelinek  <jakub@redhat.com>

	PR target/89602
	* config/i386/sse.md (avx512f_mov<ssescalarmodelower>_mask,
	*avx512f_load<mode>_mask, avx512f_store<mode>_mask): New define_insns.
	(avx512f_load<mode>_mask): New define_expand.
	* config/i386/i386-builtin.def (__builtin_ia32_loadsd_mask,
	__builtin_ia32_loadss_mask, __builtin_ia32_storesd_mask,
	__builtin_ia32_storess_mask, __builtin_ia32_movesd_mask,
	__builtin_ia32_movess_mask): New builtins.
	* config/i386/avx512fintrin.h (_mm_mask_load_ss, _mm_maskz_load_ss,
	_mm_mask_load_sd, _mm_maskz_load_sd, _mm_mask_move_ss,
	_mm_maskz_move_ss, _mm_mask_move_sd, _mm_maskz_move_sd,
	_mm_mask_store_ss, _mm_mask_store_sd): New intrinsics.

	* gcc.target/i386/avx512f-vmovss-1.c: New test.
	* gcc.target/i386/avx512f-vmovss-2.c: New test.
	* gcc.target/i386/avx512f-vmovss-3.c: New test.
	* gcc.target/i386/avx512f-vmovsd-1.c: New test.
	* gcc.target/i386/avx512f-vmovsd-2.c: New test.
	* gcc.target/i386/avx512f-vmovsd-3.c: New test.


	Jakub

Comments

Uros Bizjak March 7, 2019, 7:11 a.m. UTC | #1
On Thu, Mar 7, 2019 at 12:49 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> The following patch adds vmovss/vmovsd masked intrinsics.
> On
> #include <x86intrin.h>
> __m128 f1 (__m128 w, __mmask8 u, const float *p) { return _mm_mask_load_ss (w, u, p); }
> __m128 f2 (__mmask8 u, const float *p) { return _mm_maskz_load_ss (u, p); }
> __m128d f3 (__m128d w, __mmask8 u, const double *p) { return _mm_mask_load_sd (w, u, p); }
> __m128d f4 (__mmask8 u, const double *p) { return _mm_maskz_load_sd (u, p); }
> __m128 f5 (__m128 w, __mmask8 u, __m128 a, __m128 b) { return _mm_mask_move_ss (w, u, a, b); }
> __m128 f6 (__mmask8 u, __m128 a, __m128 b) { return _mm_maskz_move_ss (u, a, b); }
> __m128d f7 (__m128d w, __mmask8 u, __m128d a, __m128d b) { return _mm_mask_move_sd (w, u, a, b); }
> __m128d f8 (__mmask8 u, __m128d a, __m128d b) { return _mm_maskz_move_sd (u, a, b); }
> void f9 (float *p, __mmask8 u, __m128 a) { _mm_mask_store_ss (p, u, a); }
> void f10 (double *p, __mmask8 u, __m128d a) { _mm_mask_store_sd (p, u, a); }
> it generates the same assembly with -O2 -mavx512f as icc 19 or clang trunk.
> It mostly does a good job also when the mask is constant; on the above
> testcase with the u arguments replaced with 1 I get:
> f1: vmovss (%rsi), %xmm0
> f2: vmovss (%rsi), %xmm0
> f3: vmovq (%rsi), %xmm0
> f4: movzbl .LC0(%rip), %eax; kmovw %eax, %k1; vmovsd (%rsi), %xmm0{%k1}{z}
> f5: vmovss %xmm2, %xmm1, %xmm0
> f6: vmovss %xmm1, %xmm0, %xmm0
> f7: vmovsd %xmm2, %xmm1, %xmm0
> f8: vmovsd %xmm1, %xmm0, %xmm0
> f9: vmovss %xmm0, (%rdi)
> f10: vmovlpd %xmm0, (%rdi)
> Except for f4, that looks reasonable to me (and, as tested in the testsuite,
> works too); for f4 I guess we either need to improve simplify-rtx.c or add
> some pattern for the combiner.  That can be handled as a follow-up.
> When instead using a 0 mask, I get:
> f1: kxorw %k1, %k1, %k1; vmovss (%rsi), %xmm0{%k1}
> f2: vxorps %xmm0, %xmm0, %xmm0
> f3: kxorw %k1, %k1, %k1; vmovsd (%rsi), %xmm0{%k1}
> f4: vxorpd %xmm0, %xmm0, %xmm0
> f5: vmovss %xmm0, %xmm1, %xmm0
> f6: kxorw %k1, %k1, %k1; vmovss %xmm1, %xmm0, %xmm0{%k1}{z}
> f7: vmovsd %xmm0, %xmm1, %xmm0
> f8: kxorw %k1, %k1, %k1; vmovsd %xmm1, %xmm0, %xmm0{%k1}{z}
> f9: nothing
> f10: nothing
> which looks good to me.  For f1/f3/f6/f8, I really have no idea if there is
> some single insn that could do that kind of operation.  This is also tested
> at runtime in the testsuite.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2019-03-07  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/89602
>         * config/i386/sse.md (avx512f_mov<ssescalarmodelower>_mask,
>         *avx512f_load<mode>_mask, avx512f_store<mode>_mask): New define_insns.
>         (avx512f_load<mode>_mask): New define_expand.
>         * config/i386/i386-builtin.def (__builtin_ia32_loadsd_mask,
>         __builtin_ia32_loadss_mask, __builtin_ia32_storesd_mask,
>         __builtin_ia32_storess_mask, __builtin_ia32_movesd_mask,
>         __builtin_ia32_movess_mask): New builtins.
>         * config/i386/avx512fintrin.h (_mm_mask_load_ss, _mm_maskz_load_ss,
>         _mm_mask_load_sd, _mm_maskz_load_sd, _mm_mask_move_ss,
>         _mm_maskz_move_ss, _mm_mask_move_sd, _mm_maskz_move_sd,
>         _mm_mask_store_ss, _mm_mask_store_sd): New intrinsics.
>
>         * gcc.target/i386/avx512f-vmovss-1.c: New test.
>         * gcc.target/i386/avx512f-vmovss-2.c: New test.
>         * gcc.target/i386/avx512f-vmovss-3.c: New test.
>         * gcc.target/i386/avx512f-vmovsd-1.c: New test.
>         * gcc.target/i386/avx512f-vmovsd-2.c: New test.
>         * gcc.target/i386/avx512f-vmovsd-3.c: New test.
>
> --- gcc/config/i386/sse.md.jj   2019-02-20 23:40:17.119140235 +0100
> +++ gcc/config/i386/sse.md      2019-03-06 19:15:12.379749161 +0100
> @@ -1151,6 +1151,67 @@ (define_insn "<avx512>_load<mode>_mask"
>     (set_attr "memory" "none,load")
>     (set_attr "mode" "<sseinsnmode>")])
>
> +(define_insn "avx512f_mov<ssescalarmodelower>_mask"
> +  [(set (match_operand:VF_128 0 "register_operand" "=v")
> +       (vec_merge:VF_128
> +         (vec_merge:VF_128
> +           (match_operand:VF_128 2 "register_operand" "v")
> +           (match_operand:VF_128 3 "nonimm_or_0_operand" "0C")
> +           (match_operand:QI 4 "register_operand" "Yk"))
> +         (match_operand:VF_128 1 "register_operand" "v")
> +         (const_int 1)))]
> +  "TARGET_AVX512F"
> +  "vmov<ssescalarmodesuffix>\t{%2, %1, %0%{%4%}%N3|%0%{%4%}%N3, %1, %2}"
> +  [(set_attr "type" "ssemov")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<ssescalarmode>")])
> +
> +(define_expand "avx512f_load<mode>_mask"
> +  [(set (match_operand:<ssevecmode> 0 "register_operand")
> +       (vec_merge:<ssevecmode>
> +         (vec_merge:<ssevecmode>
> +           (vec_duplicate:<ssevecmode>
> +             (match_operand:MODEF 1 "memory_operand"))
> +           (match_operand:<ssevecmode> 2 "nonimm_or_0_operand")
> +           (match_operand:QI 3 "nonmemory_operand"))
> +         (match_dup 4)
> +         (const_int 1)))]
> +  "TARGET_AVX512F"
> +  "operands[4] = CONST0_RTX (<ssevecmode>mode);")
> +
> +(define_insn "*avx512f_load<mode>_mask"
> +  [(set (match_operand:<ssevecmode> 0 "register_operand" "=v")
> +       (vec_merge:<ssevecmode>
> +         (vec_merge:<ssevecmode>
> +           (vec_duplicate:<ssevecmode>
> +             (match_operand:MODEF 1 "memory_operand" "m"))
> +           (match_operand:<ssevecmode> 2 "nonimm_or_0_operand" "0C")
> +           (match_operand:QI 3 "nonmemory_operand" "Yk"))

Is there a reason to have nonmemory_operand predicate here instead of
register_operand?

Uros.

> +         (match_operand:<ssevecmode> 4 "const0_operand" "C")
> +         (const_int 1)))]
> +  "TARGET_AVX512F"
> +  "vmov<ssescalarmodesuffix>\t{%1, %0%{%3%}%N2|%0%{3%}%N2, %1}"
> +  [(set_attr "type" "ssemov")
> +   (set_attr "prefix" "evex")
> +   (set_attr "memory" "load")
> +   (set_attr "mode" "<MODE>")])
> +
> +(define_insn "avx512f_store<mode>_mask"
> +  [(set (match_operand:MODEF 0 "memory_operand" "=m")
> +       (if_then_else:MODEF
> +         (and:QI (match_operand:QI 2 "nonmemory_operand" "Yk")
> +                (const_int 1))
> +         (vec_select:MODEF
> +           (match_operand:<ssevecmode> 1 "register_operand" "v")
> +           (parallel [(const_int 0)]))
> +         (match_dup 0)))]
> +  "TARGET_AVX512F"
> +  "vmov<ssescalarmodesuffix>\t{%1, %0%{%2%}|%0%{%2%}, %1}"
> +  [(set_attr "type" "ssemov")
> +   (set_attr "prefix" "evex")
> +   (set_attr "memory" "store")
> +   (set_attr "mode" "<MODE>")])
> +
>  (define_insn "<avx512>_blendm<mode>"
>    [(set (match_operand:V48_AVX512VL 0 "register_operand" "=v")
>         (vec_merge:V48_AVX512VL
> --- gcc/config/i386/i386-builtin.def.jj 2019-01-22 23:26:46.622213698 +0100
> +++ gcc/config/i386/i386-builtin.def    2019-03-06 15:20:59.096670143 +0100
> @@ -255,6 +255,10 @@ BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_
>  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storev16si_mask, "__builtin_ia32_movdqa32store512_mask", IX86_BUILTIN_MOVDQA32STORE512, UNKNOWN, (int) VOID_FTYPE_PV16SI_V16SI_UHI)
>  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storev8df_mask, "__builtin_ia32_storeapd512_mask", IX86_BUILTIN_STOREAPD512, UNKNOWN, (int) VOID_FTYPE_PV8DF_V8DF_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storev8di_mask, "__builtin_ia32_movdqa64store512_mask", IX86_BUILTIN_MOVDQA64STORE512, UNKNOWN, (int) VOID_FTYPE_PV8DI_V8DI_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loaddf_mask, "__builtin_ia32_loadsd_mask", IX86_BUILTIN_LOADSD_MASK, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE_V2DF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadsf_mask, "__builtin_ia32_loadss_mask", IX86_BUILTIN_LOADSS_MASK, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT_V4SF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storedf_mask, "__builtin_ia32_storesd_mask", IX86_BUILTIN_STORESD_MASK, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storesf_mask, "__builtin_ia32_storess_mask", IX86_BUILTIN_STORESS_MASK, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF_UQI)
>
>  BDESC (OPTION_MASK_ISA_LWP, 0, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID)
>  BDESC (OPTION_MASK_ISA_LWP, 0, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID)
> @@ -1470,6 +1474,8 @@ BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_
>  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_vternlogv16si_maskz, "__builtin_ia32_pternlogd512_maskz", IX86_BUILTIN_VTERNLOGD512_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI)
>  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_vternlogv8di_mask, "__builtin_ia32_pternlogq512_mask", IX86_BUILTIN_VTERNLOGQ512_MASK, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_vternlogv8di_maskz, "__builtin_ia32_pternlogq512_maskz", IX86_BUILTIN_VTERNLOGQ512_MASKZ, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_movdf_mask, "__builtin_ia32_movesd_mask", IX86_BUILTIN_MOVSD_MASK, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_movsf_mask, "__builtin_ia32_movess_mask", IX86_BUILTIN_MOVSS_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF_UQI)
>
>  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_copysignv16sf3,  "__builtin_ia32_copysignps512", IX86_BUILTIN_CPYSGNPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF)
>  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_copysignv8df3,  "__builtin_ia32_copysignpd512", IX86_BUILTIN_CPYSGNPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF)
> --- gcc/config/i386/avx512fintrin.h.jj  2019-01-17 13:20:00.812472551 +0100
> +++ gcc/config/i386/avx512fintrin.h     2019-03-06 15:22:53.662791558 +0100
> @@ -6273,6 +6273,83 @@ _mm512_mask_storeu_ps (void *__P, __mmas
>                                    (__mmask16) __U);
>  }
>
> +extern __inline __m128
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float *__P)
> +{
> +  return (__m128) __builtin_ia32_loadss_mask (__P, (__v4sf) __W, __U);
> +}
> +
> +extern __inline __m128
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_maskz_load_ss (__mmask8 __U, const float *__P)
> +{
> +  return (__m128) __builtin_ia32_loadss_mask (__P, (__v4sf) _mm_setzero_ps (),
> +                                             __U);
> +}
> +
> +extern __inline __m128d
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double *__P)
> +{
> +  return (__m128d) __builtin_ia32_loadsd_mask (__P, (__v2df) __W, __U);
> +}
> +
> +extern __inline __m128d
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_maskz_load_sd (__mmask8 __U, const double *__P)
> +{
> +  return (__m128d) __builtin_ia32_loadsd_mask (__P, (__v2df) _mm_setzero_pd (),
> +                                              __U);
> +}
> +
> +extern __inline __m128
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
> +{
> +  return (__m128) __builtin_ia32_movess_mask ((__v4sf) __A, (__v4sf) __B,
> +                                             (__v4sf) __W, __U);
> +}
> +
> +extern __inline __m128
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
> +{
> +  return (__m128) __builtin_ia32_movess_mask ((__v4sf) __A, (__v4sf) __B,
> +                                             (__v4sf) _mm_setzero_ps (), __U);
> +}
> +
> +extern __inline __m128d
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
> +{
> +  return (__m128d) __builtin_ia32_movesd_mask ((__v2df) __A, (__v2df) __B,
> +                                              (__v2df) __W, __U);
> +}
> +
> +extern __inline __m128d
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
> +{
> +  return (__m128d) __builtin_ia32_movesd_mask ((__v2df) __A, (__v2df) __B,
> +                                              (__v2df) _mm_setzero_pd (),
> +                                              __U);
> +}
> +
> +extern __inline void
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_store_ss (float *__P, __mmask8 __U, __m128 __A)
> +{
> +  __builtin_ia32_storess_mask (__P, (__v4sf) __A, (__mmask8) __U);
> +}
> +
> +extern __inline void
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_store_sd (double *__P, __mmask8 __U, __m128d __A)
> +{
> +  __builtin_ia32_storesd_mask (__P, (__v2df) __A, (__mmask8) __U);
> +}
> +
>  extern __inline __m512i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
> --- gcc/testsuite/gcc.target/i386/avx512f-vmovss-1.c.jj 2019-03-06 15:34:07.972734673 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512f-vmovss-1.c    2019-03-06 15:44:46.891258107 +0100
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2 -masm=att" } */
> +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\\(%\[a-z0-9,]*\\), %xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\\(%\[a-z0-9,]*\\), %xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+%xmm\[0-9\]+, %xmm\[0-9\]+, %xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+%xmm\[0-9\]+, %xmm\[0-9\]+, %xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+%xmm\[0-9\]+, \\(%\[a-z0-9,]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> +
> +#include <immintrin.h>
> +
> +volatile __m128 x1, x2, x3;
> +volatile __mmask8 m;
> +float *volatile p;
> +
> +void extern
> +avx512f_test (void)
> +{
> +  x1 = _mm_mask_load_ss (x1, m, p);
> +  x1 = _mm_maskz_load_ss (m, p);
> +  x1 = _mm_mask_move_ss (x1, m, x2, x3);
> +  x1 = _mm_maskz_move_ss (m, x2, x3);
> +  _mm_mask_store_ss (p, m, x1);
> +}
> --- gcc/testsuite/gcc.target/i386/avx512f-vmovss-2.c.jj 2019-03-06 15:50:52.072264356 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512f-vmovss-2.c    2019-03-06 19:08:14.933598873 +0100
> @@ -0,0 +1,87 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512f" } */
> +/* { dg-require-effective-target avx512f } */
> +
> +#include "avx512f-check.h"
> +
> +#include "avx512f-helper.h"
> +
> +#define SIZE (128 / 32)
> +#include "avx512f-mask-type.h"
> +
> +void
> +avx512f_test (void)
> +{
> +  int i, sign;
> +  union128 res1, res2, res3, res4, src1, src2, src3;
> +  volatile __mmask8 mask = 5;
> +  float val[2] = { 35.5f, 0.0f };
> +  float *volatile p = &val[0];
> +  float res_ref[SIZE];
> +  float zero[SIZE];
> +
> +  for (i = 0; i < SIZE; i++)
> +    {
> +      src1.a[i] = 1.5f + i;
> +      src2.a[i] = 7.5f + i;
> +      src3.a[i] = 4.5f + i;
> +      zero[i] = 0.0f;
> +    }
> +
> +  res1.x = _mm_mask_load_ss (src1.x, mask, p);
> +  res2.x = _mm_maskz_load_ss (mask, p);
> +
> +  __builtin_memcpy (res_ref, zero, sizeof (zero));
> +  res_ref[0] = val[0];
> +  if (check_union128 (res1, res_ref))
> +    abort ();
> +
> +  if (check_union128 (res2, res_ref))
> +    abort ();
> +
> +  res3.x = _mm_mask_move_ss (src1.x, mask, src2.x, src3.x);
> +  res4.x = _mm_maskz_move_ss (mask, src2.x, src3.x);
> +
> +  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
> +  res_ref[0] = src3.a[0];
> +  if (check_union128 (res3, res_ref))
> +    abort ();
> +
> +  if (check_union128 (res4, res_ref))
> +    abort ();
> +
> +  _mm_mask_store_ss (p + 1, mask, src1.x);
> +  if (val[1] != src1.a[0])
> +    abort ();
> +
> +  mask ^= 1;
> +
> +  res1.x = _mm_mask_load_ss (src1.x, mask, p);
> +  res2.x = _mm_maskz_load_ss (mask, p);
> +
> +  __builtin_memcpy (res_ref, zero, sizeof (zero));
> +  res_ref[0] = src1.a[0];
> +  if (check_union128 (res1, res_ref))
> +    abort ();
> +
> +  res_ref[0] = zero[0];
> +  if (check_union128 (res2, res_ref))
> +    abort ();
> +
> +  res3.x = _mm_mask_move_ss (src1.x, mask, src2.x, src3.x);
> +  res4.x = _mm_maskz_move_ss (mask, src2.x, src3.x);
> +
> +  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
> +  res_ref[0] = src1.a[0];
> +  if (check_union128 (res3, res_ref))
> +    abort ();
> +
> +  res_ref[0] = zero[0];
> +  if (check_union128 (res4, res_ref))
> +    abort ();
> +
> +  val[1] = 42.0f;
> +  _mm_mask_store_ss (p + 1, mask, src1.x);
> +  if (val[1] != 42.0f)
> +    abort ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx512f-vmovss-3.c.jj 2019-03-06 19:11:19.058577646 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512f-vmovss-3.c    2019-03-06 19:11:46.815122188 +0100
> @@ -0,0 +1,84 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512f" } */
> +/* { dg-require-effective-target avx512f } */
> +
> +#include "avx512f-check.h"
> +
> +#include "avx512f-helper.h"
> +
> +#define SIZE (128 / 32)
> +#include "avx512f-mask-type.h"
> +
> +void
> +avx512f_test (void)
> +{
> +  int i, sign;
> +  union128 res1, res2, res3, res4, src1, src2, src3;
> +  float val[2] = { 35.5f, 0.0f };
> +  float *volatile p = &val[0];
> +  float res_ref[SIZE];
> +  float zero[SIZE];
> +
> +  for (i = 0; i < SIZE; i++)
> +    {
> +      src1.a[i] = 1.5f + i;
> +      src2.a[i] = 7.5f + i;
> +      src3.a[i] = 4.5f + i;
> +      zero[i] = 0.0f;
> +    }
> +
> +  res1.x = _mm_mask_load_ss (src1.x, 1, p);
> +  res2.x = _mm_maskz_load_ss (1, p);
> +
> +  __builtin_memcpy (res_ref, zero, sizeof (zero));
> +  res_ref[0] = val[0];
> +  if (check_union128 (res1, res_ref))
> +    abort ();
> +
> +  if (check_union128 (res2, res_ref))
> +    abort ();
> +
> +  res3.x = _mm_mask_move_ss (src1.x, 1, src2.x, src3.x);
> +  res4.x = _mm_maskz_move_ss (1, src2.x, src3.x);
> +
> +  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
> +  res_ref[0] = src3.a[0];
> +  if (check_union128 (res3, res_ref))
> +    abort ();
> +
> +  if (check_union128 (res4, res_ref))
> +    abort ();
> +
> +  _mm_mask_store_ss (p + 1, 1, src1.x);
> +  if (val[1] != src1.a[0])
> +    abort ();
> +
> +  res1.x = _mm_mask_load_ss (src1.x, 0, p);
> +  res2.x = _mm_maskz_load_ss (0, p);
> +
> +  __builtin_memcpy (res_ref, zero, sizeof (zero));
> +  res_ref[0] = src1.a[0];
> +  if (check_union128 (res1, res_ref))
> +    abort ();
> +
> +  res_ref[0] = zero[0];
> +  if (check_union128 (res2, res_ref))
> +    abort ();
> +
> +  res3.x = _mm_mask_move_ss (src1.x, 0, src2.x, src3.x);
> +  res4.x = _mm_maskz_move_ss (0, src2.x, src3.x);
> +
> +  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
> +  res_ref[0] = src1.a[0];
> +  if (check_union128 (res3, res_ref))
> +    abort ();
> +
> +  res_ref[0] = zero[0];
> +  if (check_union128 (res4, res_ref))
> +    abort ();
> +
> +  val[1] = 42.0f;
> +  _mm_mask_store_ss (p + 1, 0, src1.x);
> +  if (val[1] != 42.0f)
> +    abort ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx512f-vmovsd-1.c.jj 2019-03-06 15:45:04.922962437 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512f-vmovsd-1.c    2019-03-06 15:45:30.032550703 +0100
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2 -masm=att" } */
> +/* { dg-final { scan-assembler-times "vmovsd\[ \\t\]+\\(%\[a-z0-9,]*\\), %xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovsd\[ \\t\]+\\(%\[a-z0-9,]*\\), %xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovsd\[ \\t\]+%xmm\[0-9\]+, %xmm\[0-9\]+, %xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovsd\[ \\t\]+%xmm\[0-9\]+, %xmm\[0-9\]+, %xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovsd\[ \\t\]+%xmm\[0-9\]+, \\(%\[a-z0-9,]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> +
> +#include <immintrin.h>
> +
> +volatile __m128d x1, x2, x3;
> +volatile __mmask8 m;
> +double *volatile p;
> +
> +void extern
> +avx512f_test (void)
> +{
> +  x1 = _mm_mask_load_sd (x1, m, p);
> +  x1 = _mm_maskz_load_sd (m, p);
> +  x1 = _mm_mask_move_sd (x1, m, x2, x3);
> +  x1 = _mm_maskz_move_sd (m, x2, x3);
> +  _mm_mask_store_sd (p, m, x1);
> +}
> --- gcc/testsuite/gcc.target/i386/avx512f-vmovsd-2.c.jj 2019-03-06 19:05:18.862487956 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512f-vmovsd-2.c    2019-03-06 19:07:58.954861065 +0100
> @@ -0,0 +1,87 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512f" } */
> +/* { dg-require-effective-target avx512f } */
> +
> +#include "avx512f-check.h"
> +
> +#include "avx512f-helper.h"
> +
> +#define SIZE (128 / 64)
> +#include "avx512f-mask-type.h"
> +
> +void
> +avx512f_test (void)
> +{
> +  int i, sign;
> +  union128d res1, res2, res3, res4, src1, src2, src3;
> +  volatile __mmask8 mask = 5;
> +  double val[2] = { 35.5, 0.0 };
> +  double *volatile p = &val[0];
> +  double res_ref[SIZE];
> +  double zero[SIZE];
> +
> +  for (i = 0; i < SIZE; i++)
> +    {
> +      src1.a[i] = 1.5 + i;
> +      src2.a[i] = 7.5 + i;
> +      src3.a[i] = 4.5 + i;
> +      zero[i] = 0.0;
> +    }
> +
> +  res1.x = _mm_mask_load_sd (src1.x, mask, p);
> +  res2.x = _mm_maskz_load_sd (mask, p);
> +
> +  __builtin_memcpy (res_ref, zero, sizeof (zero));
> +  res_ref[0] = val[0];
> +  if (check_union128d (res1, res_ref))
> +    abort ();
> +
> +  if (check_union128d (res2, res_ref))
> +    abort ();
> +
> +  res3.x = _mm_mask_move_sd (src1.x, mask, src2.x, src3.x);
> +  res4.x = _mm_maskz_move_sd (mask, src2.x, src3.x);
> +
> +  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
> +  res_ref[0] = src3.a[0];
> +  if (check_union128d (res3, res_ref))
> +    abort ();
> +
> +  if (check_union128d (res4, res_ref))
> +    abort ();
> +
> +  _mm_mask_store_sd (p + 1, mask, src1.x);
> +  if (val[1] != src1.a[0])
> +    abort ();
> +
> +  mask ^= 1;
> +
> +  res1.x = _mm_mask_load_sd (src1.x, mask, p);
> +  res2.x = _mm_maskz_load_sd (mask, p);
> +
> +  __builtin_memcpy (res_ref, zero, sizeof (zero));
> +  res_ref[0] = src1.a[0];
> +  if (check_union128d (res1, res_ref))
> +    abort ();
> +
> +  res_ref[0] = zero[0];
> +  if (check_union128d (res2, res_ref))
> +    abort ();
> +
> +  res3.x = _mm_mask_move_sd (src1.x, mask, src2.x, src3.x);
> +  res4.x = _mm_maskz_move_sd (mask, src2.x, src3.x);
> +
> +  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
> +  res_ref[0] = src1.a[0];
> +  if (check_union128d (res3, res_ref))
> +    abort ();
> +
> +  res_ref[0] = zero[0];
> +  if (check_union128d (res4, res_ref))
> +    abort ();
> +
> +  val[1] = 42.0;
> +  _mm_mask_store_sd (p + 1, mask, src1.x);
> +  if (val[1] != 42.0)
> +    abort ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx512f-vmovsd-3.c.jj 2019-03-06 19:11:57.977939021 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512f-vmovsd-3.c    2019-03-06 19:12:47.090133163 +0100
> @@ -0,0 +1,84 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512f" } */
> +/* { dg-require-effective-target avx512f } */
> +
> +#include "avx512f-check.h"
> +
> +#include "avx512f-helper.h"
> +
> +#define SIZE (128 / 64)
> +#include "avx512f-mask-type.h"
> +
> +void
> +avx512f_test (void)
> +{
> +  int i, sign;
> +  union128d res1, res2, res3, res4, src1, src2, src3;
> +  double val[2] = { 35.5, 0.0 };
> +  double *volatile p = &val[0];
> +  double res_ref[SIZE];
> +  double zero[SIZE];
> +
> +  for (i = 0; i < SIZE; i++)
> +    {
> +      src1.a[i] = 1.5 + i;
> +      src2.a[i] = 7.5 + i;
> +      src3.a[i] = 4.5 + i;
> +      zero[i] = 0.0;
> +    }
> +
> +  res1.x = _mm_mask_load_sd (src1.x, 1, p);
> +  res2.x = _mm_maskz_load_sd (1, p);
> +
> +  __builtin_memcpy (res_ref, zero, sizeof (zero));
> +  res_ref[0] = val[0];
> +  if (check_union128d (res1, res_ref))
> +    abort ();
> +
> +  if (check_union128d (res2, res_ref))
> +    abort ();
> +
> +  res3.x = _mm_mask_move_sd (src1.x, 1, src2.x, src3.x);
> +  res4.x = _mm_maskz_move_sd (1, src2.x, src3.x);
> +
> +  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
> +  res_ref[0] = src3.a[0];
> +  if (check_union128d (res3, res_ref))
> +    abort ();
> +
> +  if (check_union128d (res4, res_ref))
> +    abort ();
> +
> +  _mm_mask_store_sd (p + 1, 1, src1.x);
> +  if (val[1] != src1.a[0])
> +    abort ();
> +
> +  res1.x = _mm_mask_load_sd (src1.x, 0, p);
> +  res2.x = _mm_maskz_load_sd (0, p);
> +
> +  __builtin_memcpy (res_ref, zero, sizeof (zero));
> +  res_ref[0] = src1.a[0];
> +  if (check_union128d (res1, res_ref))
> +    abort ();
> +
> +  res_ref[0] = zero[0];
> +  if (check_union128d (res2, res_ref))
> +    abort ();
> +
> +  res3.x = _mm_mask_move_sd (src1.x, 0, src2.x, src3.x);
> +  res4.x = _mm_maskz_move_sd (0, src2.x, src3.x);
> +
> +  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
> +  res_ref[0] = src1.a[0];
> +  if (check_union128d (res3, res_ref))
> +    abort ();
> +
> +  res_ref[0] = zero[0];
> +  if (check_union128d (res4, res_ref))
> +    abort ();
> +
> +  val[1] = 42.0;
> +  _mm_mask_store_sd (p + 1, 0, src1.x);
> +  if (val[1] != 42.0)
> +    abort ();
> +}
>
>         Jakub
Jakub Jelinek March 7, 2019, 8:09 a.m. UTC | #2
On Thu, Mar 07, 2019 at 08:11:53AM +0100, Uros Bizjak wrote:
> > +(define_insn "*avx512f_load<mode>_mask"
> > +  [(set (match_operand:<ssevecmode> 0 "register_operand" "=v")
> > +       (vec_merge:<ssevecmode>
> > +         (vec_merge:<ssevecmode>
> > +           (vec_duplicate:<ssevecmode>
> > +             (match_operand:MODEF 1 "memory_operand" "m"))
> > +           (match_operand:<ssevecmode> 2 "nonimm_or_0_operand" "0C")
> > +           (match_operand:QI 3 "nonmemory_operand" "Yk"))
> 
> Is there a reason to have nonmemory_operand predicate here instead of
> register_operand?

Thanks for catching that; it was left over from my earlier attempt to have
Yk,n constraints and deal with the constant case during output.  For the
store it was possible, but for the others there were cases it couldn't
handle, and further testing revealed that the combiner already handles most
of the constant mask cases correctly.

Here is the updated patch; I've changed this in two spots.  It even improves the
constant 1 case (the only one that is still not optimized as much as it
should):
 f4:
-	movzbl	.LC0(%rip), %eax
+	movl	$1, %eax
 	kmovw	%eax, %k1
 	vmovsd	(%rsi), %xmm0{%k1}{z}
 	ret
Tested so far with make check-gcc RUNTESTFLAGS=i386.exp=avx512f-vmovs*.c
and by compiling/eyeballing differences on the short testcase I've posted
in the description, also with the u, -> 1, and u, -> 0, changes; apart
from the above f4 there were no differences.

Ok for trunk if it passes another full bootstrap/regtest?

2019-03-07  Jakub Jelinek  <jakub@redhat.com>

	PR target/89602
	* config/i386/sse.md (avx512f_mov<ssescalarmodelower>_mask,
	*avx512f_load<mode>_mask, avx512f_store<mode>_mask): New define_insns.
	(avx512f_load<mode>_mask): New define_expand.
	* config/i386/i386-builtin.def (__builtin_ia32_loadsd_mask,
	__builtin_ia32_loadss_mask, __builtin_ia32_storesd_mask,
	__builtin_ia32_storess_mask, __builtin_ia32_movesd_mask,
	__builtin_ia32_movess_mask): New builtins.
	* config/i386/avx512fintrin.h (_mm_mask_load_ss, _mm_maskz_load_ss,
	_mm_mask_load_sd, _mm_maskz_load_sd, _mm_mask_move_ss,
	_mm_maskz_move_ss, _mm_mask_move_sd, _mm_maskz_move_sd,
	_mm_mask_store_ss, _mm_mask_store_sd): New intrinsics.

	* gcc.target/i386/avx512f-vmovss-1.c: New test.
	* gcc.target/i386/avx512f-vmovss-2.c: New test.
	* gcc.target/i386/avx512f-vmovss-3.c: New test.
	* gcc.target/i386/avx512f-vmovsd-1.c: New test.
	* gcc.target/i386/avx512f-vmovsd-2.c: New test.
	* gcc.target/i386/avx512f-vmovsd-3.c: New test.

--- gcc/config/i386/sse.md.jj	2019-02-20 23:40:17.119140235 +0100
+++ gcc/config/i386/sse.md	2019-03-06 19:15:12.379749161 +0100
@@ -1151,6 +1151,67 @@ (define_insn "<avx512>_load<mode>_mask"
    (set_attr "memory" "none,load")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn "avx512f_mov<ssescalarmodelower>_mask"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+	(vec_merge:VF_128
+	  (vec_merge:VF_128
+	    (match_operand:VF_128 2 "register_operand" "v")
+	    (match_operand:VF_128 3 "nonimm_or_0_operand" "0C")
+	    (match_operand:QI 4 "register_operand" "Yk"))
+	  (match_operand:VF_128 1 "register_operand" "v")
+	  (const_int 1)))]
+  "TARGET_AVX512F"
+  "vmov<ssescalarmodesuffix>\t{%2, %1, %0%{%4%}%N3|%0%{%4%}%N3, %1, %2}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<ssescalarmode>")])
+
+(define_expand "avx512f_load<mode>_mask"
+  [(set (match_operand:<ssevecmode> 0 "register_operand")
+	(vec_merge:<ssevecmode>
+	  (vec_merge:<ssevecmode>
+	    (vec_duplicate:<ssevecmode>
+	      (match_operand:MODEF 1 "memory_operand"))
+	    (match_operand:<ssevecmode> 2 "nonimm_or_0_operand")
+	    (match_operand:QI 3 "nonmemory_operand"))
+	  (match_dup 4)
+	  (const_int 1)))]
+  "TARGET_AVX512F"
+  "operands[4] = CONST0_RTX (<ssevecmode>mode);")
+
+(define_insn "*avx512f_load<mode>_mask"
+  [(set (match_operand:<ssevecmode> 0 "register_operand" "=v")
+	(vec_merge:<ssevecmode>
+	  (vec_merge:<ssevecmode>
+	    (vec_duplicate:<ssevecmode>
+	      (match_operand:MODEF 1 "memory_operand" "m"))
+	    (match_operand:<ssevecmode> 2 "nonimm_or_0_operand" "0C")
+	    (match_operand:QI 3 "register_operand" "Yk"))
+	  (match_operand:<ssevecmode> 4 "const0_operand" "C")
+	  (const_int 1)))]
+  "TARGET_AVX512F"
+  "vmov<ssescalarmodesuffix>\t{%1, %0%{%3%}%N2|%0%{3%}%N2, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "memory" "load")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512f_store<mode>_mask"
+  [(set (match_operand:MODEF 0 "memory_operand" "=m")
+	(if_then_else:MODEF
+	  (and:QI (match_operand:QI 2 "register_operand" "Yk")
+		 (const_int 1))
+	  (vec_select:MODEF
+	    (match_operand:<ssevecmode> 1 "register_operand" "v")
+	    (parallel [(const_int 0)]))
+	  (match_dup 0)))]
+  "TARGET_AVX512F"
+  "vmov<ssescalarmodesuffix>\t{%1, %0%{%2%}|%0%{%2%}, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "memory" "store")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "<avx512>_blendm<mode>"
   [(set (match_operand:V48_AVX512VL 0 "register_operand" "=v")
 	(vec_merge:V48_AVX512VL
--- gcc/config/i386/i386-builtin.def.jj	2019-01-22 23:26:46.622213698 +0100
+++ gcc/config/i386/i386-builtin.def	2019-03-06 15:20:59.096670143 +0100
@@ -255,6 +255,10 @@ BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storev16si_mask, "__builtin_ia32_movdqa32store512_mask", IX86_BUILTIN_MOVDQA32STORE512, UNKNOWN, (int) VOID_FTYPE_PV16SI_V16SI_UHI)
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storev8df_mask, "__builtin_ia32_storeapd512_mask", IX86_BUILTIN_STOREAPD512, UNKNOWN, (int) VOID_FTYPE_PV8DF_V8DF_UQI)
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storev8di_mask, "__builtin_ia32_movdqa64store512_mask", IX86_BUILTIN_MOVDQA64STORE512, UNKNOWN, (int) VOID_FTYPE_PV8DI_V8DI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loaddf_mask, "__builtin_ia32_loadsd_mask", IX86_BUILTIN_LOADSD_MASK, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE_V2DF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadsf_mask, "__builtin_ia32_loadss_mask", IX86_BUILTIN_LOADSS_MASK, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT_V4SF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storedf_mask, "__builtin_ia32_storesd_mask", IX86_BUILTIN_STORESD_MASK, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storesf_mask, "__builtin_ia32_storess_mask", IX86_BUILTIN_STORESS_MASK, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF_UQI)
 
 BDESC (OPTION_MASK_ISA_LWP, 0, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID)
 BDESC (OPTION_MASK_ISA_LWP, 0, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID)
@@ -1470,6 +1474,8 @@ BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_vternlogv16si_maskz, "__builtin_ia32_pternlogd512_maskz", IX86_BUILTIN_VTERNLOGD512_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI)
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_vternlogv8di_mask, "__builtin_ia32_pternlogq512_mask", IX86_BUILTIN_VTERNLOGQ512_MASK, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI)
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_vternlogv8di_maskz, "__builtin_ia32_pternlogq512_maskz", IX86_BUILTIN_VTERNLOGQ512_MASKZ, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI)
+BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_movdf_mask, "__builtin_ia32_movesd_mask", IX86_BUILTIN_MOVSD_MASK, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_movsf_mask, "__builtin_ia32_movess_mask", IX86_BUILTIN_MOVSS_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF_UQI)
 
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_copysignv16sf3,  "__builtin_ia32_copysignps512", IX86_BUILTIN_CPYSGNPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF)
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_copysignv8df3,  "__builtin_ia32_copysignpd512", IX86_BUILTIN_CPYSGNPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF)
--- gcc/config/i386/avx512fintrin.h.jj	2019-01-17 13:20:00.812472551 +0100
+++ gcc/config/i386/avx512fintrin.h	2019-03-06 15:22:53.662791558 +0100
@@ -6273,6 +6273,83 @@ _mm512_mask_storeu_ps (void *__P, __mmas
 				   (__mmask16) __U);
 }
 
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float *__P)
+{
+  return (__m128) __builtin_ia32_loadss_mask (__P, (__v4sf) __W, __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_load_ss (__mmask8 __U, const float *__P)
+{
+  return (__m128) __builtin_ia32_loadss_mask (__P, (__v4sf) _mm_setzero_ps (),
+					      __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double *__P)
+{
+  return (__m128d) __builtin_ia32_loadsd_mask (__P, (__v2df) __W, __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_load_sd (__mmask8 __U, const double *__P)
+{
+  return (__m128d) __builtin_ia32_loadsd_mask (__P, (__v2df) _mm_setzero_pd (),
+					       __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_movess_mask ((__v4sf) __A, (__v4sf) __B,
+					      (__v4sf) __W, __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_movess_mask ((__v4sf) __A, (__v4sf) __B,
+					      (__v4sf) _mm_setzero_ps (), __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_movesd_mask ((__v2df) __A, (__v2df) __B,
+					       (__v2df) __W, __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_movesd_mask ((__v2df) __A, (__v2df) __B,
+					       (__v2df) _mm_setzero_pd (),
+					       __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_store_ss (float *__P, __mmask8 __U, __m128 __A)
+{
+  __builtin_ia32_storess_mask (__P, (__v4sf) __A, (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_store_sd (double *__P, __mmask8 __U, __m128d __A)
+{
+  __builtin_ia32_storesd_mask (__P, (__v2df) __A, (__mmask8) __U);
+}
+
 extern __inline __m512i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
--- gcc/testsuite/gcc.target/i386/avx512f-vmovss-1.c.jj	2019-03-06 15:34:07.972734673 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-vmovss-1.c	2019-03-06 15:44:46.891258107 +0100
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2 -masm=att" } */
+/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\\(%\[a-z0-9,]*\\), %xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\\(%\[a-z0-9,]*\\), %xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+%xmm\[0-9\]+, %xmm\[0-9\]+, %xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+%xmm\[0-9\]+, %xmm\[0-9\]+, %xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+%xmm\[0-9\]+, \\(%\[a-z0-9,]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include <immintrin.h>
+
+volatile __m128 x1, x2, x3;
+volatile __mmask8 m;
+float *volatile p;
+
+void extern
+avx512f_test (void)
+{
+  x1 = _mm_mask_load_ss (x1, m, p);
+  x1 = _mm_maskz_load_ss (m, p);
+  x1 = _mm_mask_move_ss (x1, m, x2, x3);
+  x1 = _mm_maskz_move_ss (m, x2, x3);
+  _mm_mask_store_ss (p, m, x1);
+}
--- gcc/testsuite/gcc.target/i386/avx512f-vmovss-2.c.jj	2019-03-06 15:50:52.072264356 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-vmovss-2.c	2019-03-06 19:08:14.933598873 +0100
@@ -0,0 +1,87 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-require-effective-target avx512f } */
+
+#include "avx512f-check.h"
+
+#include "avx512f-helper.h"
+
+#define SIZE (128 / 32)
+#include "avx512f-mask-type.h"
+
+void
+avx512f_test (void)
+{
+  int i, sign;
+  union128 res1, res2, res3, res4, src1, src2, src3;
+  volatile __mmask8 mask = 5;
+  float val[2] = { 35.5f, 0.0f };
+  float *volatile p = &val[0];
+  float res_ref[SIZE];
+  float zero[SIZE];
+
+  for (i = 0; i < SIZE; i++)
+    {
+      src1.a[i] = 1.5f + i;
+      src2.a[i] = 7.5f + i;
+      src3.a[i] = 4.5f + i;
+      zero[i] = 0.0f;
+    }
+
+  res1.x = _mm_mask_load_ss (src1.x, mask, p);
+  res2.x = _mm_maskz_load_ss (mask, p);
+
+  __builtin_memcpy (res_ref, zero, sizeof (zero));
+  res_ref[0] = val[0];
+  if (check_union128 (res1, res_ref))
+    abort ();
+
+  if (check_union128 (res2, res_ref))
+    abort ();
+
+  res3.x = _mm_mask_move_ss (src1.x, mask, src2.x, src3.x);
+  res4.x = _mm_maskz_move_ss (mask, src2.x, src3.x);
+
+  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
+  res_ref[0] = src3.a[0];
+  if (check_union128 (res3, res_ref))
+    abort ();
+
+  if (check_union128 (res4, res_ref))
+    abort ();
+
+  _mm_mask_store_ss (p + 1, mask, src1.x);
+  if (val[1] != src1.a[0])
+    abort ();
+
+  mask ^= 1;
+
+  res1.x = _mm_mask_load_ss (src1.x, mask, p);
+  res2.x = _mm_maskz_load_ss (mask, p);
+
+  __builtin_memcpy (res_ref, zero, sizeof (zero));
+  res_ref[0] = src1.a[0];
+  if (check_union128 (res1, res_ref))
+    abort ();
+
+  res_ref[0] = zero[0];
+  if (check_union128 (res2, res_ref))
+    abort ();
+
+  res3.x = _mm_mask_move_ss (src1.x, mask, src2.x, src3.x);
+  res4.x = _mm_maskz_move_ss (mask, src2.x, src3.x);
+
+  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
+  res_ref[0] = src1.a[0];
+  if (check_union128 (res3, res_ref))
+    abort ();
+
+  res_ref[0] = zero[0];
+  if (check_union128 (res4, res_ref))
+    abort ();
+
+  val[1] = 42.0f;
+  _mm_mask_store_ss (p + 1, mask, src1.x);
+  if (val[1] != 42.0f)
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/avx512f-vmovss-3.c.jj	2019-03-06 19:11:19.058577646 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-vmovss-3.c	2019-03-06 19:11:46.815122188 +0100
@@ -0,0 +1,84 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-require-effective-target avx512f } */
+
+#include "avx512f-check.h"
+
+#include "avx512f-helper.h"
+
+#define SIZE (128 / 32)
+#include "avx512f-mask-type.h"
+
+void
+avx512f_test (void)
+{
+  int i, sign;
+  union128 res1, res2, res3, res4, src1, src2, src3;
+  float val[2] = { 35.5f, 0.0f };
+  float *volatile p = &val[0];
+  float res_ref[SIZE];
+  float zero[SIZE];
+
+  for (i = 0; i < SIZE; i++)
+    {
+      src1.a[i] = 1.5f + i;
+      src2.a[i] = 7.5f + i;
+      src3.a[i] = 4.5f + i;
+      zero[i] = 0.0f;
+    }
+
+  res1.x = _mm_mask_load_ss (src1.x, 1, p);
+  res2.x = _mm_maskz_load_ss (1, p);
+
+  __builtin_memcpy (res_ref, zero, sizeof (zero));
+  res_ref[0] = val[0];
+  if (check_union128 (res1, res_ref))
+    abort ();
+
+  if (check_union128 (res2, res_ref))
+    abort ();
+
+  res3.x = _mm_mask_move_ss (src1.x, 1, src2.x, src3.x);
+  res4.x = _mm_maskz_move_ss (1, src2.x, src3.x);
+
+  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
+  res_ref[0] = src3.a[0];
+  if (check_union128 (res3, res_ref))
+    abort ();
+
+  if (check_union128 (res4, res_ref))
+    abort ();
+
+  _mm_mask_store_ss (p + 1, 1, src1.x);
+  if (val[1] != src1.a[0])
+    abort ();
+
+  res1.x = _mm_mask_load_ss (src1.x, 0, p);
+  res2.x = _mm_maskz_load_ss (0, p);
+
+  __builtin_memcpy (res_ref, zero, sizeof (zero));
+  res_ref[0] = src1.a[0];
+  if (check_union128 (res1, res_ref))
+    abort ();
+
+  res_ref[0] = zero[0];
+  if (check_union128 (res2, res_ref))
+    abort ();
+
+  res3.x = _mm_mask_move_ss (src1.x, 0, src2.x, src3.x);
+  res4.x = _mm_maskz_move_ss (0, src2.x, src3.x);
+
+  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
+  res_ref[0] = src1.a[0];
+  if (check_union128 (res3, res_ref))
+    abort ();
+
+  res_ref[0] = zero[0];
+  if (check_union128 (res4, res_ref))
+    abort ();
+
+  val[1] = 42.0f;
+  _mm_mask_store_ss (p + 1, 0, src1.x);
+  if (val[1] != 42.0f)
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/avx512f-vmovsd-1.c.jj	2019-03-06 15:45:04.922962437 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-vmovsd-1.c	2019-03-06 15:45:30.032550703 +0100
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2 -masm=att" } */
+/* { dg-final { scan-assembler-times "vmovsd\[ \\t\]+\\(%\[a-z0-9,]*\\), %xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovsd\[ \\t\]+\\(%\[a-z0-9,]*\\), %xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovsd\[ \\t\]+%xmm\[0-9\]+, %xmm\[0-9\]+, %xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovsd\[ \\t\]+%xmm\[0-9\]+, %xmm\[0-9\]+, %xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovsd\[ \\t\]+%xmm\[0-9\]+, \\(%\[a-z0-9,]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include <immintrin.h>
+
+volatile __m128d x1, x2, x3;
+volatile __mmask8 m;
+double *volatile p;
+
+void extern
+avx512f_test (void)
+{
+  x1 = _mm_mask_load_sd (x1, m, p);
+  x1 = _mm_maskz_load_sd (m, p);
+  x1 = _mm_mask_move_sd (x1, m, x2, x3);
+  x1 = _mm_maskz_move_sd (m, x2, x3);
+  _mm_mask_store_sd (p, m, x1);
+}
--- gcc/testsuite/gcc.target/i386/avx512f-vmovsd-2.c.jj	2019-03-06 19:05:18.862487956 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-vmovsd-2.c	2019-03-06 19:07:58.954861065 +0100
@@ -0,0 +1,87 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-require-effective-target avx512f } */
+
+#include "avx512f-check.h"
+
+#include "avx512f-helper.h"
+
+#define SIZE (128 / 64)
+#include "avx512f-mask-type.h"
+
+void
+avx512f_test (void)
+{
+  int i, sign;
+  union128d res1, res2, res3, res4, src1, src2, src3;
+  volatile __mmask8 mask = 5;
+  double val[2] = { 35.5, 0.0 };
+  double *volatile p = &val[0];
+  double res_ref[SIZE];
+  double zero[SIZE];
+
+  for (i = 0; i < SIZE; i++)
+    {
+      src1.a[i] = 1.5 + i;
+      src2.a[i] = 7.5 + i;
+      src3.a[i] = 4.5 + i;
+      zero[i] = 0.0;
+    }
+
+  res1.x = _mm_mask_load_sd (src1.x, mask, p);
+  res2.x = _mm_maskz_load_sd (mask, p);
+
+  __builtin_memcpy (res_ref, zero, sizeof (zero));
+  res_ref[0] = val[0];
+  if (check_union128d (res1, res_ref))
+    abort ();
+
+  if (check_union128d (res2, res_ref))
+    abort ();
+
+  res3.x = _mm_mask_move_sd (src1.x, mask, src2.x, src3.x);
+  res4.x = _mm_maskz_move_sd (mask, src2.x, src3.x);
+
+  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
+  res_ref[0] = src3.a[0];
+  if (check_union128d (res3, res_ref))
+    abort ();
+
+  if (check_union128d (res4, res_ref))
+    abort ();
+
+  _mm_mask_store_sd (p + 1, mask, src1.x);
+  if (val[1] != src1.a[0])
+    abort ();
+
+  mask ^= 1;
+
+  res1.x = _mm_mask_load_sd (src1.x, mask, p);
+  res2.x = _mm_maskz_load_sd (mask, p);
+
+  __builtin_memcpy (res_ref, zero, sizeof (zero));
+  res_ref[0] = src1.a[0];
+  if (check_union128d (res1, res_ref))
+    abort ();
+
+  res_ref[0] = zero[0];
+  if (check_union128d (res2, res_ref))
+    abort ();
+
+  res3.x = _mm_mask_move_sd (src1.x, mask, src2.x, src3.x);
+  res4.x = _mm_maskz_move_sd (mask, src2.x, src3.x);
+
+  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
+  res_ref[0] = src1.a[0];
+  if (check_union128d (res3, res_ref))
+    abort ();
+
+  res_ref[0] = zero[0];
+  if (check_union128d (res4, res_ref))
+    abort ();
+
+  val[1] = 42.0;
+  _mm_mask_store_sd (p + 1, mask, src1.x);
+  if (val[1] != 42.0)
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/avx512f-vmovsd-3.c.jj	2019-03-06 19:11:57.977939021 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-vmovsd-3.c	2019-03-06 19:12:47.090133163 +0100
@@ -0,0 +1,84 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-require-effective-target avx512f } */
+
+#include "avx512f-check.h"
+
+#include "avx512f-helper.h"
+
+#define SIZE (128 / 64)
+#include "avx512f-mask-type.h"
+
+void
+avx512f_test (void)
+{
+  int i, sign;
+  union128d res1, res2, res3, res4, src1, src2, src3;
+  double val[2] = { 35.5, 0.0 };
+  double *volatile p = &val[0];
+  double res_ref[SIZE];
+  double zero[SIZE];
+
+  for (i = 0; i < SIZE; i++)
+    {
+      src1.a[i] = 1.5 + i;
+      src2.a[i] = 7.5 + i;
+      src3.a[i] = 4.5 + i;
+      zero[i] = 0.0;
+    }
+
+  res1.x = _mm_mask_load_sd (src1.x, 1, p);
+  res2.x = _mm_maskz_load_sd (1, p);
+
+  __builtin_memcpy (res_ref, zero, sizeof (zero));
+  res_ref[0] = val[0];
+  if (check_union128d (res1, res_ref))
+    abort ();
+
+  if (check_union128d (res2, res_ref))
+    abort ();
+
+  res3.x = _mm_mask_move_sd (src1.x, 1, src2.x, src3.x);
+  res4.x = _mm_maskz_move_sd (1, src2.x, src3.x);
+
+  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
+  res_ref[0] = src3.a[0];
+  if (check_union128d (res3, res_ref))
+    abort ();
+
+  if (check_union128d (res4, res_ref))
+    abort ();
+
+  _mm_mask_store_sd (p + 1, 1, src1.x);
+  if (val[1] != src1.a[0])
+    abort ();
+
+  res1.x = _mm_mask_load_sd (src1.x, 0, p);
+  res2.x = _mm_maskz_load_sd (0, p);
+
+  __builtin_memcpy (res_ref, zero, sizeof (zero));
+  res_ref[0] = src1.a[0];
+  if (check_union128d (res1, res_ref))
+    abort ();
+
+  res_ref[0] = zero[0];
+  if (check_union128d (res2, res_ref))
+    abort ();
+
+  res3.x = _mm_mask_move_sd (src1.x, 0, src2.x, src3.x);
+  res4.x = _mm_maskz_move_sd (0, src2.x, src3.x);
+
+  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
+  res_ref[0] = src1.a[0];
+  if (check_union128d (res3, res_ref))
+    abort ();
+
+  res_ref[0] = zero[0];
+  if (check_union128d (res4, res_ref))
+    abort ();
+
+  val[1] = 42.0;
+  _mm_mask_store_sd (p + 1, 0, src1.x);
+  if (val[1] != 42.0)
+    abort ();
+}


	Jakub
Uros Bizjak March 7, 2019, 8:15 a.m. UTC | #3
On Thu, Mar 7, 2019 at 9:09 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> On Thu, Mar 07, 2019 at 08:11:53AM +0100, Uros Bizjak wrote:
> > > +(define_insn "*avx512f_load<mode>_mask"
> > > +  [(set (match_operand:<ssevecmode> 0 "register_operand" "=v")
> > > +       (vec_merge:<ssevecmode>
> > > +         (vec_merge:<ssevecmode>
> > > +           (vec_duplicate:<ssevecmode>
> > > +             (match_operand:MODEF 1 "memory_operand" "m"))
> > > +           (match_operand:<ssevecmode> 2 "nonimm_or_0_operand" "0C")
> > > +           (match_operand:QI 3 "nonmemory_operand" "Yk"))
> >
> > Is there a reason to have nonmemory_operand predicate here instead of
> > register_operand?
>
> Thanks for catching that; it was left over from my earlier attempt to have
> Yk,n constraints and deal with the constant case during output.  For the
> store it was possible, but for the others there were cases it couldn't
> handle, and further testing revealed that the combiner already handles most
> of the constant mask cases correctly.
>
> Here is the updated patch; I've changed this in two spots.  It even improves the
> constant 1 case (the only one that is still not optimized as much as it
> should):
>  f4:
> -       movzbl  .LC0(%rip), %eax
> +       movl    $1, %eax
>         kmovw   %eax, %k1
>         vmovsd  (%rsi), %xmm0{%k1}{z}
>         ret
> Tested so far with make check-gcc RUNTESTFLAGS=i386.exp=avx512f-vmovs*.c
> and by compiling/eyeballing differences on the short testcase I've posted
> in the description, also with the u, -> 1, and u, -> 0, changes; apart
> from the above f4 there were no differences.
>
> Ok for trunk if it passes another full bootstrap/regtest?

LGTM with another fixup below.

HJ should approve addition of intrinsic in header files.

Thanks,
Uros.

>
> 2019-03-07  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/89602
>         * config/i386/sse.md (avx512f_mov<ssescalarmodelower>_mask,
>         *avx512f_load<mode>_mask, avx512f_store<mode>_mask): New define_insns.
>         (avx512f_load<mode>_mask): New define_expand.
>         * config/i386/i386-builtin.def (__builtin_ia32_loadsd_mask,
>         __builtin_ia32_loadss_mask, __builtin_ia32_storesd_mask,
>         __builtin_ia32_storess_mask, __builtin_ia32_movesd_mask,
>         __builtin_ia32_movess_mask): New builtins.
>         * config/i386/avx512fintrin.h (_mm_mask_load_ss, _mm_maskz_load_ss,
>         _mm_mask_load_sd, _mm_maskz_load_sd, _mm_mask_move_ss,
>         _mm_maskz_move_ss, _mm_mask_move_sd, _mm_maskz_move_sd,
>         _mm_mask_store_ss, _mm_mask_store_sd): New intrinsics.
>
>         * gcc.target/i386/avx512f-vmovss-1.c: New test.
>         * gcc.target/i386/avx512f-vmovss-2.c: New test.
>         * gcc.target/i386/avx512f-vmovss-3.c: New test.
>         * gcc.target/i386/avx512f-vmovsd-1.c: New test.
>         * gcc.target/i386/avx512f-vmovsd-2.c: New test.
>         * gcc.target/i386/avx512f-vmovsd-3.c: New test.
>
> --- gcc/config/i386/sse.md.jj   2019-02-20 23:40:17.119140235 +0100
> +++ gcc/config/i386/sse.md      2019-03-06 19:15:12.379749161 +0100
> @@ -1151,6 +1151,67 @@ (define_insn "<avx512>_load<mode>_mask"
>     (set_attr "memory" "none,load")
>     (set_attr "mode" "<sseinsnmode>")])
>
> +(define_insn "avx512f_mov<ssescalarmodelower>_mask"
> +  [(set (match_operand:VF_128 0 "register_operand" "=v")
> +       (vec_merge:VF_128
> +         (vec_merge:VF_128
> +           (match_operand:VF_128 2 "register_operand" "v")
> +           (match_operand:VF_128 3 "nonimm_or_0_operand" "0C")
> +           (match_operand:QI 4 "register_operand" "Yk"))
> +         (match_operand:VF_128 1 "register_operand" "v")
> +         (const_int 1)))]
> +  "TARGET_AVX512F"
> +  "vmov<ssescalarmodesuffix>\t{%2, %1, %0%{%4%}%N3|%0%{%4%}%N3, %1, %2}"
> +  [(set_attr "type" "ssemov")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<ssescalarmode>")])
> +
> +(define_expand "avx512f_load<mode>_mask"
> +  [(set (match_operand:<ssevecmode> 0 "register_operand")
> +       (vec_merge:<ssevecmode>
> +         (vec_merge:<ssevecmode>
> +           (vec_duplicate:<ssevecmode>
> +             (match_operand:MODEF 1 "memory_operand"))
> +           (match_operand:<ssevecmode> 2 "nonimm_or_0_operand")
> +           (match_operand:QI 3 "nonmemory_operand"))

register_operand here; the expander should match the corresponding insn pattern.

> +         (match_dup 4)
> +         (const_int 1)))]
> +  "TARGET_AVX512F"
> +  "operands[4] = CONST0_RTX (<ssevecmode>mode);")
> +
> +(define_insn "*avx512f_load<mode>_mask"
> +  [(set (match_operand:<ssevecmode> 0 "register_operand" "=v")
> +       (vec_merge:<ssevecmode>
> +         (vec_merge:<ssevecmode>
> +           (vec_duplicate:<ssevecmode>
> +             (match_operand:MODEF 1 "memory_operand" "m"))
> +           (match_operand:<ssevecmode> 2 "nonimm_or_0_operand" "0C")
> +           (match_operand:QI 3 "register_operand" "Yk"))
> +         (match_operand:<ssevecmode> 4 "const0_operand" "C")
> +         (const_int 1)))]
> +  "TARGET_AVX512F"
> +  "vmov<ssescalarmodesuffix>\t{%1, %0%{%3%}%N2|%0%{3%}%N2, %1}"
> +  [(set_attr "type" "ssemov")
> +   (set_attr "prefix" "evex")
> +   (set_attr "memory" "load")
> +   (set_attr "mode" "<MODE>")])
> +
> +(define_insn "avx512f_store<mode>_mask"
> +  [(set (match_operand:MODEF 0 "memory_operand" "=m")
> +       (if_then_else:MODEF
> +         (and:QI (match_operand:QI 2 "register_operand" "Yk")
> +                (const_int 1))
> +         (vec_select:MODEF
> +           (match_operand:<ssevecmode> 1 "register_operand" "v")
> +           (parallel [(const_int 0)]))
> +         (match_dup 0)))]
> +  "TARGET_AVX512F"
> +  "vmov<ssescalarmodesuffix>\t{%1, %0%{%2%}|%0%{%2%}, %1}"
> +  [(set_attr "type" "ssemov")
> +   (set_attr "prefix" "evex")
> +   (set_attr "memory" "store")
> +   (set_attr "mode" "<MODE>")])
> +
>  (define_insn "<avx512>_blendm<mode>"
>    [(set (match_operand:V48_AVX512VL 0 "register_operand" "=v")
>         (vec_merge:V48_AVX512VL
> --- gcc/config/i386/i386-builtin.def.jj 2019-01-22 23:26:46.622213698 +0100
> +++ gcc/config/i386/i386-builtin.def    2019-03-06 15:20:59.096670143 +0100
> @@ -255,6 +255,10 @@ BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_
>  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storev16si_mask, "__builtin_ia32_movdqa32store512_mask", IX86_BUILTIN_MOVDQA32STORE512, UNKNOWN, (int) VOID_FTYPE_PV16SI_V16SI_UHI)
>  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storev8df_mask, "__builtin_ia32_storeapd512_mask", IX86_BUILTIN_STOREAPD512, UNKNOWN, (int) VOID_FTYPE_PV8DF_V8DF_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storev8di_mask, "__builtin_ia32_movdqa64store512_mask", IX86_BUILTIN_MOVDQA64STORE512, UNKNOWN, (int) VOID_FTYPE_PV8DI_V8DI_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loaddf_mask, "__builtin_ia32_loadsd_mask", IX86_BUILTIN_LOADSD_MASK, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE_V2DF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadsf_mask, "__builtin_ia32_loadss_mask", IX86_BUILTIN_LOADSS_MASK, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT_V4SF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storedf_mask, "__builtin_ia32_storesd_mask", IX86_BUILTIN_STORESD_MASK, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storesf_mask, "__builtin_ia32_storess_mask", IX86_BUILTIN_STORESS_MASK, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF_UQI)
>
>  BDESC (OPTION_MASK_ISA_LWP, 0, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID)
>  BDESC (OPTION_MASK_ISA_LWP, 0, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID)
> @@ -1470,6 +1474,8 @@ BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_
>  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_vternlogv16si_maskz, "__builtin_ia32_pternlogd512_maskz", IX86_BUILTIN_VTERNLOGD512_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI)
>  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_vternlogv8di_mask, "__builtin_ia32_pternlogq512_mask", IX86_BUILTIN_VTERNLOGQ512_MASK, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_vternlogv8di_maskz, "__builtin_ia32_pternlogq512_maskz", IX86_BUILTIN_VTERNLOGQ512_MASKZ, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_movdf_mask, "__builtin_ia32_movesd_mask", IX86_BUILTIN_MOVSD_MASK, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_movsf_mask, "__builtin_ia32_movess_mask", IX86_BUILTIN_MOVSS_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF_UQI)
>
>  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_copysignv16sf3,  "__builtin_ia32_copysignps512", IX86_BUILTIN_CPYSGNPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF)
>  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_copysignv8df3,  "__builtin_ia32_copysignpd512", IX86_BUILTIN_CPYSGNPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF)
> --- gcc/config/i386/avx512fintrin.h.jj  2019-01-17 13:20:00.812472551 +0100
> +++ gcc/config/i386/avx512fintrin.h     2019-03-06 15:22:53.662791558 +0100
> @@ -6273,6 +6273,83 @@ _mm512_mask_storeu_ps (void *__P, __mmas
>                                    (__mmask16) __U);
>  }
>
> +extern __inline __m128
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float *__P)
> +{
> +  return (__m128) __builtin_ia32_loadss_mask (__P, (__v4sf) __W, __U);
> +}
> +
> +extern __inline __m128
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_maskz_load_ss (__mmask8 __U, const float *__P)
> +{
> +  return (__m128) __builtin_ia32_loadss_mask (__P, (__v4sf) _mm_setzero_ps (),
> +                                             __U);
> +}
> +
> +extern __inline __m128d
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double *__P)
> +{
> +  return (__m128d) __builtin_ia32_loadsd_mask (__P, (__v2df) __W, __U);
> +}
> +
> +extern __inline __m128d
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_maskz_load_sd (__mmask8 __U, const double *__P)
> +{
> +  return (__m128d) __builtin_ia32_loadsd_mask (__P, (__v2df) _mm_setzero_pd (),
> +                                              __U);
> +}
> +
> +extern __inline __m128
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
> +{
> +  return (__m128) __builtin_ia32_movess_mask ((__v4sf) __A, (__v4sf) __B,
> +                                             (__v4sf) __W, __U);
> +}
> +
> +extern __inline __m128
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
> +{
> +  return (__m128) __builtin_ia32_movess_mask ((__v4sf) __A, (__v4sf) __B,
> +                                             (__v4sf) _mm_setzero_ps (), __U);
> +}
> +
> +extern __inline __m128d
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
> +{
> +  return (__m128d) __builtin_ia32_movesd_mask ((__v2df) __A, (__v2df) __B,
> +                                              (__v2df) __W, __U);
> +}
> +
> +extern __inline __m128d
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
> +{
> +  return (__m128d) __builtin_ia32_movesd_mask ((__v2df) __A, (__v2df) __B,
> +                                              (__v2df) _mm_setzero_pd (),
> +                                              __U);
> +}
> +
> +extern __inline void
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_store_ss (float *__P, __mmask8 __U, __m128 __A)
> +{
> +  __builtin_ia32_storess_mask (__P, (__v4sf) __A, (__mmask8) __U);
> +}
> +
> +extern __inline void
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_store_sd (double *__P, __mmask8 __U, __m128d __A)
> +{
> +  __builtin_ia32_storesd_mask (__P, (__v2df) __A, (__mmask8) __U);
> +}
> +
>  extern __inline __m512i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
> --- gcc/testsuite/gcc.target/i386/avx512f-vmovss-1.c.jj 2019-03-06 15:34:07.972734673 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512f-vmovss-1.c    2019-03-06 15:44:46.891258107 +0100
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2 -masm=att" } */
> +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\\(%\[a-z0-9,]*\\), %xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\\(%\[a-z0-9,]*\\), %xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+%xmm\[0-9\]+, %xmm\[0-9\]+, %xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+%xmm\[0-9\]+, %xmm\[0-9\]+, %xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+%xmm\[0-9\]+, \\(%\[a-z0-9,]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> +
> +#include <immintrin.h>
> +
> +volatile __m128 x1, x2, x3;
> +volatile __mmask8 m;
> +float *volatile p;
> +
> +void extern
> +avx512f_test (void)
> +{
> +  x1 = _mm_mask_load_ss (x1, m, p);
> +  x1 = _mm_maskz_load_ss (m, p);
> +  x1 = _mm_mask_move_ss (x1, m, x2, x3);
> +  x1 = _mm_maskz_move_ss (m, x2, x3);
> +  _mm_mask_store_ss (p, m, x1);
> +}
> --- gcc/testsuite/gcc.target/i386/avx512f-vmovss-2.c.jj 2019-03-06 15:50:52.072264356 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512f-vmovss-2.c    2019-03-06 19:08:14.933598873 +0100
> @@ -0,0 +1,87 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512f" } */
> +/* { dg-require-effective-target avx512f } */
> +
> +#include "avx512f-check.h"
> +
> +#include "avx512f-helper.h"
> +
> +#define SIZE (128 / 32)
> +#include "avx512f-mask-type.h"
> +
> +void
> +avx512f_test (void)
> +{
> +  int i, sign;
> +  union128 res1, res2, res3, res4, src1, src2, src3;
> +  volatile __mmask8 mask = 5;
> +  float val[2] = { 35.5f, 0.0f };
> +  float *volatile p = &val[0];
> +  float res_ref[SIZE];
> +  float zero[SIZE];
> +
> +  for (i = 0; i < SIZE; i++)
> +    {
> +      src1.a[i] = 1.5f + i;
> +      src2.a[i] = 7.5f + i;
> +      src3.a[i] = 4.5f + i;
> +      zero[i] = 0.0f;
> +    }
> +
> +  res1.x = _mm_mask_load_ss (src1.x, mask, p);
> +  res2.x = _mm_maskz_load_ss (mask, p);
> +
> +  __builtin_memcpy (res_ref, zero, sizeof (zero));
> +  res_ref[0] = val[0];
> +  if (check_union128 (res1, res_ref))
> +    abort ();
> +
> +  if (check_union128 (res2, res_ref))
> +    abort ();
> +
> +  res3.x = _mm_mask_move_ss (src1.x, mask, src2.x, src3.x);
> +  res4.x = _mm_maskz_move_ss (mask, src2.x, src3.x);
> +
> +  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
> +  res_ref[0] = src3.a[0];
> +  if (check_union128 (res3, res_ref))
> +    abort ();
> +
> +  if (check_union128 (res4, res_ref))
> +    abort ();
> +
> +  _mm_mask_store_ss (p + 1, mask, src1.x);
> +  if (val[1] != src1.a[0])
> +    abort ();
> +
> +  mask ^= 1;
> +
> +  res1.x = _mm_mask_load_ss (src1.x, mask, p);
> +  res2.x = _mm_maskz_load_ss (mask, p);
> +
> +  __builtin_memcpy (res_ref, zero, sizeof (zero));
> +  res_ref[0] = src1.a[0];
> +  if (check_union128 (res1, res_ref))
> +    abort ();
> +
> +  res_ref[0] = zero[0];
> +  if (check_union128 (res2, res_ref))
> +    abort ();
> +
> +  res3.x = _mm_mask_move_ss (src1.x, mask, src2.x, src3.x);
> +  res4.x = _mm_maskz_move_ss (mask, src2.x, src3.x);
> +
> +  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
> +  res_ref[0] = src1.a[0];
> +  if (check_union128 (res3, res_ref))
> +    abort ();
> +
> +  res_ref[0] = zero[0];
> +  if (check_union128 (res4, res_ref))
> +    abort ();
> +
> +  val[1] = 42.0f;
> +  _mm_mask_store_ss (p + 1, mask, src1.x);
> +  if (val[1] != 42.0f)
> +    abort ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx512f-vmovss-3.c.jj 2019-03-06 19:11:19.058577646 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512f-vmovss-3.c    2019-03-06 19:11:46.815122188 +0100
> @@ -0,0 +1,84 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512f" } */
> +/* { dg-require-effective-target avx512f } */
> +
> +#include "avx512f-check.h"
> +
> +#include "avx512f-helper.h"
> +
> +#define SIZE (128 / 32)
> +#include "avx512f-mask-type.h"
> +
> +void
> +avx512f_test (void)
> +{
> +  int i, sign;
> +  union128 res1, res2, res3, res4, src1, src2, src3;
> +  float val[2] = { 35.5f, 0.0f };
> +  float *volatile p = &val[0];
> +  float res_ref[SIZE];
> +  float zero[SIZE];
> +
> +  for (i = 0; i < SIZE; i++)
> +    {
> +      src1.a[i] = 1.5f + i;
> +      src2.a[i] = 7.5f + i;
> +      src3.a[i] = 4.5f + i;
> +      zero[i] = 0.0f;
> +    }
> +
> +  res1.x = _mm_mask_load_ss (src1.x, 1, p);
> +  res2.x = _mm_maskz_load_ss (1, p);
> +
> +  __builtin_memcpy (res_ref, zero, sizeof (zero));
> +  res_ref[0] = val[0];
> +  if (check_union128 (res1, res_ref))
> +    abort ();
> +
> +  if (check_union128 (res2, res_ref))
> +    abort ();
> +
> +  res3.x = _mm_mask_move_ss (src1.x, 1, src2.x, src3.x);
> +  res4.x = _mm_maskz_move_ss (1, src2.x, src3.x);
> +
> +  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
> +  res_ref[0] = src3.a[0];
> +  if (check_union128 (res3, res_ref))
> +    abort ();
> +
> +  if (check_union128 (res4, res_ref))
> +    abort ();
> +
> +  _mm_mask_store_ss (p + 1, 1, src1.x);
> +  if (val[1] != src1.a[0])
> +    abort ();
> +
> +  res1.x = _mm_mask_load_ss (src1.x, 0, p);
> +  res2.x = _mm_maskz_load_ss (0, p);
> +
> +  __builtin_memcpy (res_ref, zero, sizeof (zero));
> +  res_ref[0] = src1.a[0];
> +  if (check_union128 (res1, res_ref))
> +    abort ();
> +
> +  res_ref[0] = zero[0];
> +  if (check_union128 (res2, res_ref))
> +    abort ();
> +
> +  res3.x = _mm_mask_move_ss (src1.x, 0, src2.x, src3.x);
> +  res4.x = _mm_maskz_move_ss (0, src2.x, src3.x);
> +
> +  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
> +  res_ref[0] = src1.a[0];
> +  if (check_union128 (res3, res_ref))
> +    abort ();
> +
> +  res_ref[0] = zero[0];
> +  if (check_union128 (res4, res_ref))
> +    abort ();
> +
> +  val[1] = 42.0f;
> +  _mm_mask_store_ss (p + 1, 0, src1.x);
> +  if (val[1] != 42.0f)
> +    abort ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx512f-vmovsd-1.c.jj 2019-03-06 15:45:04.922962437 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512f-vmovsd-1.c    2019-03-06 15:45:30.032550703 +0100
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2 -masm=att" } */
> +/* { dg-final { scan-assembler-times "vmovsd\[ \\t\]+\\(%\[a-z0-9,]*\\), %xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovsd\[ \\t\]+\\(%\[a-z0-9,]*\\), %xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovsd\[ \\t\]+%xmm\[0-9\]+, %xmm\[0-9\]+, %xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovsd\[ \\t\]+%xmm\[0-9\]+, %xmm\[0-9\]+, %xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovsd\[ \\t\]+%xmm\[0-9\]+, \\(%\[a-z0-9,]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> +
> +#include <immintrin.h>
> +
> +volatile __m128d x1, x2, x3;
> +volatile __mmask8 m;
> +double *volatile p;
> +
> +void extern
> +avx512f_test (void)
> +{
> +  x1 = _mm_mask_load_sd (x1, m, p);
> +  x1 = _mm_maskz_load_sd (m, p);
> +  x1 = _mm_mask_move_sd (x1, m, x2, x3);
> +  x1 = _mm_maskz_move_sd (m, x2, x3);
> +  _mm_mask_store_sd (p, m, x1);
> +}
> --- gcc/testsuite/gcc.target/i386/avx512f-vmovsd-2.c.jj 2019-03-06 19:05:18.862487956 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512f-vmovsd-2.c    2019-03-06 19:07:58.954861065 +0100
> @@ -0,0 +1,87 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512f" } */
> +/* { dg-require-effective-target avx512f } */
> +
> +#include "avx512f-check.h"
> +
> +#include "avx512f-helper.h"
> +
> +#define SIZE (128 / 64)
> +#include "avx512f-mask-type.h"
> +
> +void
> +avx512f_test (void)
> +{
> +  int i, sign;
> +  union128d res1, res2, res3, res4, src1, src2, src3;
> +  volatile __mmask8 mask = 5;
> +  double val[2] = { 35.5, 0.0 };
> +  double *volatile p = &val[0];
> +  double res_ref[SIZE];
> +  double zero[SIZE];
> +
> +  for (i = 0; i < SIZE; i++)
> +    {
> +      src1.a[i] = 1.5 + i;
> +      src2.a[i] = 7.5 + i;
> +      src3.a[i] = 4.5 + i;
> +      zero[i] = 0.0;
> +    }
> +
> +  res1.x = _mm_mask_load_sd (src1.x, mask, p);
> +  res2.x = _mm_maskz_load_sd (mask, p);
> +
> +  __builtin_memcpy (res_ref, zero, sizeof (zero));
> +  res_ref[0] = val[0];
> +  if (check_union128d (res1, res_ref))
> +    abort ();
> +
> +  if (check_union128d (res2, res_ref))
> +    abort ();
> +
> +  res3.x = _mm_mask_move_sd (src1.x, mask, src2.x, src3.x);
> +  res4.x = _mm_maskz_move_sd (mask, src2.x, src3.x);
> +
> +  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
> +  res_ref[0] = src3.a[0];
> +  if (check_union128d (res3, res_ref))
> +    abort ();
> +
> +  if (check_union128d (res4, res_ref))
> +    abort ();
> +
> +  _mm_mask_store_sd (p + 1, mask, src1.x);
> +  if (val[1] != src1.a[0])
> +    abort ();
> +
> +  mask ^= 1;
> +
> +  res1.x = _mm_mask_load_sd (src1.x, mask, p);
> +  res2.x = _mm_maskz_load_sd (mask, p);
> +
> +  __builtin_memcpy (res_ref, zero, sizeof (zero));
> +  res_ref[0] = src1.a[0];
> +  if (check_union128d (res1, res_ref))
> +    abort ();
> +
> +  res_ref[0] = zero[0];
> +  if (check_union128d (res2, res_ref))
> +    abort ();
> +
> +  res3.x = _mm_mask_move_sd (src1.x, mask, src2.x, src3.x);
> +  res4.x = _mm_maskz_move_sd (mask, src2.x, src3.x);
> +
> +  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
> +  res_ref[0] = src1.a[0];
> +  if (check_union128d (res3, res_ref))
> +    abort ();
> +
> +  res_ref[0] = zero[0];
> +  if (check_union128d (res4, res_ref))
> +    abort ();
> +
> +  val[1] = 42.0;
> +  _mm_mask_store_sd (p + 1, mask, src1.x);
> +  if (val[1] != 42.0)
> +    abort ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx512f-vmovsd-3.c.jj 2019-03-06 19:11:57.977939021 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512f-vmovsd-3.c    2019-03-06 19:12:47.090133163 +0100
> @@ -0,0 +1,84 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512f" } */
> +/* { dg-require-effective-target avx512f } */
> +
> +#include "avx512f-check.h"
> +
> +#include "avx512f-helper.h"
> +
> +#define SIZE (128 / 64)
> +#include "avx512f-mask-type.h"
> +
> +void
> +avx512f_test (void)
> +{
> +  int i, sign;
> +  union128d res1, res2, res3, res4, src1, src2, src3;
> +  double val[2] = { 35.5, 0.0 };
> +  double *volatile p = &val[0];
> +  double res_ref[SIZE];
> +  double zero[SIZE];
> +
> +  for (i = 0; i < SIZE; i++)
> +    {
> +      src1.a[i] = 1.5 + i;
> +      src2.a[i] = 7.5 + i;
> +      src3.a[i] = 4.5 + i;
> +      zero[i] = 0.0;
> +    }
> +
> +  res1.x = _mm_mask_load_sd (src1.x, 1, p);
> +  res2.x = _mm_maskz_load_sd (1, p);
> +
> +  __builtin_memcpy (res_ref, zero, sizeof (zero));
> +  res_ref[0] = val[0];
> +  if (check_union128d (res1, res_ref))
> +    abort ();
> +
> +  if (check_union128d (res2, res_ref))
> +    abort ();
> +
> +  res3.x = _mm_mask_move_sd (src1.x, 1, src2.x, src3.x);
> +  res4.x = _mm_maskz_move_sd (1, src2.x, src3.x);
> +
> +  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
> +  res_ref[0] = src3.a[0];
> +  if (check_union128d (res3, res_ref))
> +    abort ();
> +
> +  if (check_union128d (res4, res_ref))
> +    abort ();
> +
> +  _mm_mask_store_sd (p + 1, 1, src1.x);
> +  if (val[1] != src1.a[0])
> +    abort ();
> +
> +  res1.x = _mm_mask_load_sd (src1.x, 0, p);
> +  res2.x = _mm_maskz_load_sd (0, p);
> +
> +  __builtin_memcpy (res_ref, zero, sizeof (zero));
> +  res_ref[0] = src1.a[0];
> +  if (check_union128d (res1, res_ref))
> +    abort ();
> +
> +  res_ref[0] = zero[0];
> +  if (check_union128d (res2, res_ref))
> +    abort ();
> +
> +  res3.x = _mm_mask_move_sd (src1.x, 0, src2.x, src3.x);
> +  res4.x = _mm_maskz_move_sd (0, src2.x, src3.x);
> +
> +  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
> +  res_ref[0] = src1.a[0];
> +  if (check_union128d (res3, res_ref))
> +    abort ();
> +
> +  res_ref[0] = zero[0];
> +  if (check_union128d (res4, res_ref))
> +    abort ();
> +
> +  val[1] = 42.0;
> +  _mm_mask_store_sd (p + 1, 0, src1.x);
> +  if (val[1] != 42.0)
> +    abort ();
> +}
>
>
>         Jakub
H.J. Lu March 7, 2019, 10:07 a.m. UTC | #4
Looks good to me.

Thanks.

On Thu, Mar 7, 2019, 4:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:

> On Thu, Mar 7, 2019 at 9:09 AM Jakub Jelinek <jakub@redhat.com> wrote:
> >
> > On Thu, Mar 07, 2019 at 08:11:53AM +0100, Uros Bizjak wrote:
> > > > +(define_insn "*avx512f_load<mode>_mask"
> > > > +  [(set (match_operand:<ssevecmode> 0 "register_operand" "=v")
> > > > +       (vec_merge:<ssevecmode>
> > > > +         (vec_merge:<ssevecmode>
> > > > +           (vec_duplicate:<ssevecmode>
> > > > +             (match_operand:MODEF 1 "memory_operand" "m"))
> > > > +           (match_operand:<ssevecmode> 2 "nonimm_or_0_operand" "0C")
> > > > +           (match_operand:QI 3 "nonmemory_operand" "Yk"))
> > >
> > > Is there a reason to have a nonmemory_operand predicate here instead of
> > > register_operand?
> >
> > Thanks for catching that; it was left over from my earlier attempt to have
> > Yk,n constraints and deal with that during output.  For the store it was
> > possible; for the others there were some cases it couldn't handle, but
> > further testing revealed that the combiner already handles most of the
> > constant mask cases right.
> >
> > Here is the updated patch; I've changed this in two spots.  It even
> > improves the constant 1 case (the only one that is still not optimized
> > as much as it should):
> >  f4:
> > -       movzbl  .LC0(%rip), %eax
> > +       movl    $1, %eax
> >         kmovw   %eax, %k1
> >         vmovsd  (%rsi), %xmm0{%k1}{z}
> >         ret
> > Tested so far with make check-gcc RUNTESTFLAGS=i386.exp=avx512f-vmovs*.c
> > and by compiling/eyeballing differences on the short testcase I've posted
> > in the description, also with the u, -> 1, and u, -> 0, changes; apart
> > from the above f4, no differences.
> >
> > Ok for trunk if it passes another full bootstrap/regtest?
>
> LGTM with another fixup below.
>
> HJ should approve the addition of the intrinsics in the header files.
>
> Thanks,
> Uros.
>
> >
> > 2019-03-07  Jakub Jelinek  <jakub@redhat.com>
> >
> >         PR target/89602
> >         * config/i386/sse.md (avx512f_mov<ssescalarmodelower>_mask,
> >         *avx512f_load<mode>_mask, avx512f_store<mode>_mask): New
> define_insns.
> >         (avx512f_load<mode>_mask): New define_expand.
> >         * config/i386/i386-builtin.def (__builtin_ia32_loadsd_mask,
> >         __builtin_ia32_loadss_mask, __builtin_ia32_storesd_mask,
> >         __builtin_ia32_storess_mask, __builtin_ia32_movesd_mask,
> >         __builtin_ia32_movess_mask): New builtins.
> >         * config/i386/avx512fintrin.h (_mm_mask_load_ss,
> _mm_maskz_load_ss,
> >         _mm_mask_load_sd, _mm_maskz_load_sd, _mm_mask_move_ss,
> >         _mm_maskz_move_ss, _mm_mask_move_sd, _mm_maskz_move_sd,
> >         _mm_mask_store_ss, _mm_mask_store_sd): New intrinsics.
> >
> >         * gcc.target/i386/avx512f-vmovss-1.c: New test.
> >         * gcc.target/i386/avx512f-vmovss-2.c: New test.
> >         * gcc.target/i386/avx512f-vmovss-3.c: New test.
> >         * gcc.target/i386/avx512f-vmovsd-1.c: New test.
> >         * gcc.target/i386/avx512f-vmovsd-2.c: New test.
> >         * gcc.target/i386/avx512f-vmovsd-3.c: New test.
> >
> > --- gcc/config/i386/sse.md.jj   2019-02-20 23:40:17.119140235 +0100
> > +++ gcc/config/i386/sse.md      2019-03-06 19:15:12.379749161 +0100
> > @@ -1151,6 +1151,67 @@ (define_insn "<avx512>_load<mode>_mask"
> >     (set_attr "memory" "none,load")
> >     (set_attr "mode" "<sseinsnmode>")])
> >
> > +(define_insn "avx512f_mov<ssescalarmodelower>_mask"
> > +  [(set (match_operand:VF_128 0 "register_operand" "=v")
> > +       (vec_merge:VF_128
> > +         (vec_merge:VF_128
> > +           (match_operand:VF_128 2 "register_operand" "v")
> > +           (match_operand:VF_128 3 "nonimm_or_0_operand" "0C")
> > +           (match_operand:QI 4 "register_operand" "Yk"))
> > +         (match_operand:VF_128 1 "register_operand" "v")
> > +         (const_int 1)))]
> > +  "TARGET_AVX512F"
> > +  "vmov<ssescalarmodesuffix>\t{%2, %1, %0%{%4%}%N3|%0%{%4%}%N3, %1, %2}"
> > +  [(set_attr "type" "ssemov")
> > +   (set_attr "prefix" "evex")
> > +   (set_attr "mode" "<ssescalarmode>")])
> > +
> > +(define_expand "avx512f_load<mode>_mask"
> > +  [(set (match_operand:<ssevecmode> 0 "register_operand")
> > +       (vec_merge:<ssevecmode>
> > +         (vec_merge:<ssevecmode>
> > +           (vec_duplicate:<ssevecmode>
> > +             (match_operand:MODEF 1 "memory_operand"))
> > +           (match_operand:<ssevecmode> 2 "nonimm_or_0_operand")
> > +           (match_operand:QI 3 "nonmemory_operand"))
>
> Register operand here; the expander should match the corresponding insn
> pattern.
>
> > +         (match_dup 4)
> > +         (const_int 1)))]
> > +  "TARGET_AVX512F"
> > +  "operands[4] = CONST0_RTX (<ssevecmode>mode);")
> > +
> > +(define_insn "*avx512f_load<mode>_mask"
> > +  [(set (match_operand:<ssevecmode> 0 "register_operand" "=v")
> > +       (vec_merge:<ssevecmode>
> > +         (vec_merge:<ssevecmode>
> > +           (vec_duplicate:<ssevecmode>
> > +             (match_operand:MODEF 1 "memory_operand" "m"))
> > +           (match_operand:<ssevecmode> 2 "nonimm_or_0_operand" "0C")
> > +           (match_operand:QI 3 "register_operand" "Yk"))
> > +         (match_operand:<ssevecmode> 4 "const0_operand" "C")
> > +         (const_int 1)))]
> > +  "TARGET_AVX512F"
> > +  "vmov<ssescalarmodesuffix>\t{%1, %0%{%3%}%N2|%0%{3%}%N2, %1}"
> > +  [(set_attr "type" "ssemov")
> > +   (set_attr "prefix" "evex")
> > +   (set_attr "memory" "load")
> > +   (set_attr "mode" "<MODE>")])
> > +
> > +(define_insn "avx512f_store<mode>_mask"
> > +  [(set (match_operand:MODEF 0 "memory_operand" "=m")
> > +       (if_then_else:MODEF
> > +         (and:QI (match_operand:QI 2 "register_operand" "Yk")
> > +                (const_int 1))
> > +         (vec_select:MODEF
> > +           (match_operand:<ssevecmode> 1 "register_operand" "v")
> > +           (parallel [(const_int 0)]))
> > +         (match_dup 0)))]
> > +  "TARGET_AVX512F"
> > +  "vmov<ssescalarmodesuffix>\t{%1, %0%{%2%}|%0%{%2%}, %1}"
> > +  [(set_attr "type" "ssemov")
> > +   (set_attr "prefix" "evex")
> > +   (set_attr "memory" "store")
> > +   (set_attr "mode" "<MODE>")])
> > +
> >  (define_insn "<avx512>_blendm<mode>"
> >    [(set (match_operand:V48_AVX512VL 0 "register_operand" "=v")
> >         (vec_merge:V48_AVX512VL
> > --- gcc/config/i386/i386-builtin.def.jj 2019-01-22 23:26:46.622213698
> +0100
> > +++ gcc/config/i386/i386-builtin.def    2019-03-06 15:20:59.096670143
> +0100
> > @@ -255,6 +255,10 @@ BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_
> >  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storev16si_mask,
> "__builtin_ia32_movdqa32store512_mask", IX86_BUILTIN_MOVDQA32STORE512,
> UNKNOWN, (int) VOID_FTYPE_PV16SI_V16SI_UHI)
> >  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storev8df_mask,
> "__builtin_ia32_storeapd512_mask", IX86_BUILTIN_STOREAPD512, UNKNOWN, (int)
> VOID_FTYPE_PV8DF_V8DF_UQI)
> >  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storev8di_mask,
> "__builtin_ia32_movdqa64store512_mask", IX86_BUILTIN_MOVDQA64STORE512,
> UNKNOWN, (int) VOID_FTYPE_PV8DI_V8DI_UQI)
> > +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loaddf_mask,
> "__builtin_ia32_loadsd_mask", IX86_BUILTIN_LOADSD_MASK, UNKNOWN, (int)
> V2DF_FTYPE_PCDOUBLE_V2DF_UQI)
> > +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadsf_mask,
> "__builtin_ia32_loadss_mask", IX86_BUILTIN_LOADSS_MASK, UNKNOWN, (int)
> V4SF_FTYPE_PCFLOAT_V4SF_UQI)
> > +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storedf_mask,
> "__builtin_ia32_storesd_mask", IX86_BUILTIN_STORESD_MASK, UNKNOWN, (int)
> VOID_FTYPE_PDOUBLE_V2DF_UQI)
> > +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storesf_mask,
> "__builtin_ia32_storess_mask", IX86_BUILTIN_STORESS_MASK, UNKNOWN, (int)
> VOID_FTYPE_PFLOAT_V4SF_UQI)
> >
> >  BDESC (OPTION_MASK_ISA_LWP, 0, CODE_FOR_lwp_llwpcb,
> "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int)
> VOID_FTYPE_PVOID)
> >  BDESC (OPTION_MASK_ISA_LWP, 0, CODE_FOR_lwp_slwpcb,
> "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int)
> PVOID_FTYPE_VOID)
> > @@ -1470,6 +1474,8 @@ BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_
> >  BDESC (OPTION_MASK_ISA_AVX512F, 0,
> CODE_FOR_avx512f_vternlogv16si_maskz, "__builtin_ia32_pternlogd512_maskz",
> IX86_BUILTIN_VTERNLOGD512_MASKZ, UNKNOWN, (int)
> V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI)
> >  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_vternlogv8di_mask,
> "__builtin_ia32_pternlogq512_mask", IX86_BUILTIN_VTERNLOGQ512_MASK,
> UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI)
> >  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_vternlogv8di_maskz,
> "__builtin_ia32_pternlogq512_maskz", IX86_BUILTIN_VTERNLOGQ512_MASKZ,
> UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI)
> > +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_movdf_mask,
> "__builtin_ia32_movesd_mask", IX86_BUILTIN_MOVSD_MASK, UNKNOWN, (int)
> V2DF_FTYPE_V2DF_V2DF_V2DF_UQI)
> > +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_movsf_mask,
> "__builtin_ia32_movess_mask", IX86_BUILTIN_MOVSS_MASK, UNKNOWN, (int)
> V4SF_FTYPE_V4SF_V4SF_V4SF_UQI)
> >
> >  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_copysignv16sf3,
> "__builtin_ia32_copysignps512", IX86_BUILTIN_CPYSGNPS512, UNKNOWN, (int)
> V16SF_FTYPE_V16SF_V16SF)
> >  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_copysignv8df3,
> "__builtin_ia32_copysignpd512", IX86_BUILTIN_CPYSGNPD512, UNKNOWN, (int)
> V8DF_FTYPE_V8DF_V8DF)
> > --- gcc/config/i386/avx512fintrin.h.jj  2019-01-17 13:20:00.812472551
> +0100
> > +++ gcc/config/i386/avx512fintrin.h     2019-03-06 15:22:53.662791558
> +0100
> > @@ -6273,6 +6273,83 @@ _mm512_mask_storeu_ps (void *__P, __mmas
> >                                    (__mmask16) __U);
> >  }
> >
> > +extern __inline __m128
> > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> > +_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float *__P)
> > +{
> > +  return (__m128) __builtin_ia32_loadss_mask (__P, (__v4sf) __W, __U);
> > +}
> > +
> > +extern __inline __m128
> > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> > +_mm_maskz_load_ss (__mmask8 __U, const float *__P)
> > +{
> > +  return (__m128) __builtin_ia32_loadss_mask (__P, (__v4sf)
> _mm_setzero_ps (),
> > +                                             __U);
> > +}
> > +
> > +extern __inline __m128d
> > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> > +_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double *__P)
> > +{
> > +  return (__m128d) __builtin_ia32_loadsd_mask (__P, (__v2df) __W, __U);
> > +}
> > +
> > +extern __inline __m128d
> > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> > +_mm_maskz_load_sd (__mmask8 __U, const double *__P)
> > +{
> > +  return (__m128d) __builtin_ia32_loadsd_mask (__P, (__v2df)
> _mm_setzero_pd (),
> > +                                              __U);
> > +}
> > +
> > +extern __inline __m128
> > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> > +_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
> > +{
> > +  return (__m128) __builtin_ia32_movess_mask ((__v4sf) __A, (__v4sf)
> __B,
> > +                                             (__v4sf) __W, __U);
> > +}
> > +
> > +extern __inline __m128
> > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> > +_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
> > +{
> > +  return (__m128) __builtin_ia32_movess_mask ((__v4sf) __A, (__v4sf)
> __B,
> > +                                             (__v4sf) _mm_setzero_ps
> (), __U);
> > +}
> > +
> > +extern __inline __m128d
> > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> > +_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
> > +{
> > +  return (__m128d) __builtin_ia32_movesd_mask ((__v2df) __A, (__v2df)
> __B,
> > +                                              (__v2df) __W, __U);
> > +}
> > +
> > +extern __inline __m128d
> > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> > +_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
> > +{
> > +  return (__m128d) __builtin_ia32_movesd_mask ((__v2df) __A, (__v2df)
> __B,
> > +                                              (__v2df) _mm_setzero_pd
> (),
> > +                                              __U);
> > +}
> > +
> > +extern __inline void
> > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> > +_mm_mask_store_ss (float *__P, __mmask8 __U, __m128 __A)
> > +{
> > +  __builtin_ia32_storess_mask (__P, (__v4sf) __A, (__mmask8) __U);
> > +}
> > +
> > +extern __inline void
> > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> > +_mm_mask_store_sd (double *__P, __mmask8 __U, __m128d __A)
> > +{
> > +  __builtin_ia32_storesd_mask (__P, (__v2df) __A, (__mmask8) __U);
> > +}
> > +
> >  extern __inline __m512i
> >  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> >  _mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
> > --- gcc/testsuite/gcc.target/i386/avx512f-vmovss-1.c.jj 2019-03-06
> 15:34:07.972734673 +0100
> > +++ gcc/testsuite/gcc.target/i386/avx512f-vmovss-1.c    2019-03-06
> 15:44:46.891258107 +0100
> > @@ -0,0 +1,23 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512f -O2 -masm=att" } */
> > +/* { dg-final { scan-assembler-times "vmovss\[
> \\t\]+\\(%\[a-z0-9,]*\\), %xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } }
> */
> > +/* { dg-final { scan-assembler-times "vmovss\[
> \\t\]+\\(%\[a-z0-9,]*\\), %xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)"
> 1 } } */
> > +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+%xmm\[0-9\]+,
> %xmm\[0-9\]+, %xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> > +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+%xmm\[0-9\]+,
> %xmm\[0-9\]+, %xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
> > +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+%xmm\[0-9\]+,
> \\(%\[a-z0-9,]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> > +
> > +#include <immintrin.h>
> > +
> > +volatile __m128 x1, x2, x3;
> > +volatile __mmask8 m;
> > +float *volatile p;
> > +
> > +void extern
> > +avx512f_test (void)
> > +{
> > +  x1 = _mm_mask_load_ss (x1, m, p);
> > +  x1 = _mm_maskz_load_ss (m, p);
> > +  x1 = _mm_mask_move_ss (x1, m, x2, x3);
> > +  x1 = _mm_maskz_move_ss (m, x2, x3);
> > +  _mm_mask_store_ss (p, m, x1);
> > +}
> > --- gcc/testsuite/gcc.target/i386/avx512f-vmovss-2.c.jj 2019-03-06
> 15:50:52.072264356 +0100
> > +++ gcc/testsuite/gcc.target/i386/avx512f-vmovss-2.c    2019-03-06
> 19:08:14.933598873 +0100
> > @@ -0,0 +1,87 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512f" } */
> > +/* { dg-require-effective-target avx512f } */
> > +
> > +#include "avx512f-check.h"
> > +
> > +#include "avx512f-helper.h"
> > +
> > +#define SIZE (128 / 32)
> > +#include "avx512f-mask-type.h"
> > +
> > +void
> > +avx512f_test (void)
> > +{
> > +  int i, sign;
> > +  union128 res1, res2, res3, res4, src1, src2, src3;
> > +  volatile __mmask8 mask = 5;
> > +  float val[2] = { 35.5f, 0.0f };
> > +  float *volatile p = &val[0];
> > +  float res_ref[SIZE];
> > +  float zero[SIZE];
> > +
> > +  for (i = 0; i < SIZE; i++)
> > +    {
> > +      src1.a[i] = 1.5f + i;
> > +      src2.a[i] = 7.5f + i;
> > +      src3.a[i] = 4.5f + i;
> > +      zero[i] = 0.0f;
> > +    }
> > +
> > +  res1.x = _mm_mask_load_ss (src1.x, mask, p);
> > +  res2.x = _mm_maskz_load_ss (mask, p);
> > +
> > +  __builtin_memcpy (res_ref, zero, sizeof (zero));
> > +  res_ref[0] = val[0];
> > +  if (check_union128 (res1, res_ref))
> > +    abort ();
> > +
> > +  if (check_union128 (res2, res_ref))
> > +    abort ();
> > +
> > +  res3.x = _mm_mask_move_ss (src1.x, mask, src2.x, src3.x);
> > +  res4.x = _mm_maskz_move_ss (mask, src2.x, src3.x);
> > +
> > +  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
> > +  res_ref[0] = src3.a[0];
> > +  if (check_union128 (res3, res_ref))
> > +    abort ();
> > +
> > +  if (check_union128 (res4, res_ref))
> > +    abort ();
> > +
> > +  _mm_mask_store_ss (p + 1, mask, src1.x);
> > +  if (val[1] != src1.a[0])
> > +    abort ();
> > +
> > +  mask ^= 1;
> > +
> > +  res1.x = _mm_mask_load_ss (src1.x, mask, p);
> > +  res2.x = _mm_maskz_load_ss (mask, p);
> > +
> > +  __builtin_memcpy (res_ref, zero, sizeof (zero));
> > +  res_ref[0] = src1.a[0];
> > +  if (check_union128 (res1, res_ref))
> > +    abort ();
> > +
> > +  res_ref[0] = zero[0];
> > +  if (check_union128 (res2, res_ref))
> > +    abort ();
> > +
> > +  res3.x = _mm_mask_move_ss (src1.x, mask, src2.x, src3.x);
> > +  res4.x = _mm_maskz_move_ss (mask, src2.x, src3.x);
> > +
> > +  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
> > +  res_ref[0] = src1.a[0];
> > +  if (check_union128 (res3, res_ref))
> > +    abort ();
> > +
> > +  res_ref[0] = zero[0];
> > +  if (check_union128 (res4, res_ref))
> > +    abort ();
> > +
> > +  val[1] = 42.0f;
> > +  _mm_mask_store_ss (p + 1, mask, src1.x);
> > +  if (val[1] != 42.0f)
> > +    abort ();
> > +}
> > --- gcc/testsuite/gcc.target/i386/avx512f-vmovss-3.c.jj 2019-03-06
> 19:11:19.058577646 +0100
> > +++ gcc/testsuite/gcc.target/i386/avx512f-vmovss-3.c    2019-03-06
> 19:11:46.815122188 +0100
> > @@ -0,0 +1,84 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512f" } */
> > +/* { dg-require-effective-target avx512f } */
> > +
> > +#include "avx512f-check.h"
> > +
> > +#include "avx512f-helper.h"
> > +
> > +#define SIZE (128 / 32)
> > +#include "avx512f-mask-type.h"
> > +
> > +void
> > +avx512f_test (void)
> > +{
> > +  int i, sign;
> > +  union128 res1, res2, res3, res4, src1, src2, src3;
> > +  float val[2] = { 35.5f, 0.0f };
> > +  float *volatile p = &val[0];
> > +  float res_ref[SIZE];
> > +  float zero[SIZE];
> > +
> > +  for (i = 0; i < SIZE; i++)
> > +    {
> > +      src1.a[i] = 1.5f + i;
> > +      src2.a[i] = 7.5f + i;
> > +      src3.a[i] = 4.5f + i;
> > +      zero[i] = 0.0f;
> > +    }
> > +
> > +  res1.x = _mm_mask_load_ss (src1.x, 1, p);
> > +  res2.x = _mm_maskz_load_ss (1, p);
> > +
> > +  __builtin_memcpy (res_ref, zero, sizeof (zero));
> > +  res_ref[0] = val[0];
> > +  if (check_union128 (res1, res_ref))
> > +    abort ();
> > +
> > +  if (check_union128 (res2, res_ref))
> > +    abort ();
> > +
> > +  res3.x = _mm_mask_move_ss (src1.x, 1, src2.x, src3.x);
> > +  res4.x = _mm_maskz_move_ss (1, src2.x, src3.x);
> > +
> > +  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
> > +  res_ref[0] = src3.a[0];
> > +  if (check_union128 (res3, res_ref))
> > +    abort ();
> > +
> > +  if (check_union128 (res4, res_ref))
> > +    abort ();
> > +
> > +  _mm_mask_store_ss (p + 1, 1, src1.x);
> > +  if (val[1] != src1.a[0])
> > +    abort ();
> > +
> > +  res1.x = _mm_mask_load_ss (src1.x, 0, p);
> > +  res2.x = _mm_maskz_load_ss (0, p);
> > +
> > +  __builtin_memcpy (res_ref, zero, sizeof (zero));
> > +  res_ref[0] = src1.a[0];
> > +  if (check_union128 (res1, res_ref))
> > +    abort ();
> > +
> > +  res_ref[0] = zero[0];
> > +  if (check_union128 (res2, res_ref))
> > +    abort ();
> > +
> > +  res3.x = _mm_mask_move_ss (src1.x, 0, src2.x, src3.x);
> > +  res4.x = _mm_maskz_move_ss (0, src2.x, src3.x);
> > +
> > +  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
> > +  res_ref[0] = src1.a[0];
> > +  if (check_union128 (res3, res_ref))
> > +    abort ();
> > +
> > +  res_ref[0] = zero[0];
> > +  if (check_union128 (res4, res_ref))
> > +    abort ();
> > +
> > +  val[1] = 42.0f;
> > +  _mm_mask_store_ss (p + 1, 0, src1.x);
> > +  if (val[1] != 42.0f)
> > +    abort ();
> > +}
> > --- gcc/testsuite/gcc.target/i386/avx512f-vmovsd-1.c.jj 2019-03-06
> 15:45:04.922962437 +0100
> > +++ gcc/testsuite/gcc.target/i386/avx512f-vmovsd-1.c    2019-03-06
> 15:45:30.032550703 +0100
> > @@ -0,0 +1,23 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512f -O2 -masm=att" } */
> > +/* { dg-final { scan-assembler-times "vmovsd\[
> \\t\]+\\(%\[a-z0-9,]*\\), %xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } }
> */
> > +/* { dg-final { scan-assembler-times "vmovsd\[
> \\t\]+\\(%\[a-z0-9,]*\\), %xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)"
> 1 } } */
> > +/* { dg-final { scan-assembler-times "vmovsd\[ \\t\]+%xmm\[0-9\]+,
> %xmm\[0-9\]+, %xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> > +/* { dg-final { scan-assembler-times "vmovsd\[ \\t\]+%xmm\[0-9\]+,
> %xmm\[0-9\]+, %xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
> > +/* { dg-final { scan-assembler-times "vmovsd\[ \\t\]+%xmm\[0-9\]+,
> \\(%\[a-z0-9,]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> > +
> > +#include <immintrin.h>
> > +
> > +volatile __m128d x1, x2, x3;
> > +volatile __mmask8 m;
> > +double *volatile p;
> > +
> > +void extern
> > +avx512f_test (void)
> > +{
> > +  x1 = _mm_mask_load_sd (x1, m, p);
> > +  x1 = _mm_maskz_load_sd (m, p);
> > +  x1 = _mm_mask_move_sd (x1, m, x2, x3);
> > +  x1 = _mm_maskz_move_sd (m, x2, x3);
> > +  _mm_mask_store_sd (p, m, x1);
> > +}
> > --- gcc/testsuite/gcc.target/i386/avx512f-vmovsd-2.c.jj 2019-03-06
> 19:05:18.862487956 +0100
> > +++ gcc/testsuite/gcc.target/i386/avx512f-vmovsd-2.c    2019-03-06
> 19:07:58.954861065 +0100
> > @@ -0,0 +1,87 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512f" } */
> > +/* { dg-require-effective-target avx512f } */
> > +
> > +#include "avx512f-check.h"
> > +
> > +#include "avx512f-helper.h"
> > +
> > +#define SIZE (128 / 64)
> > +#include "avx512f-mask-type.h"
> > +
> > +void
> > +avx512f_test (void)
> > +{
> > +  int i, sign;
> > +  union128d res1, res2, res3, res4, src1, src2, src3;
> > +  volatile __mmask8 mask = 5;
> > +  double val[2] = { 35.5, 0.0 };
> > +  double *volatile p = &val[0];
> > +  double res_ref[SIZE];
> > +  double zero[SIZE];
> > +
> > +  for (i = 0; i < SIZE; i++)
> > +    {
> > +      src1.a[i] = 1.5 + i;
> > +      src2.a[i] = 7.5 + i;
> > +      src3.a[i] = 4.5 + i;
> > +      zero[i] = 0.0;
> > +    }
> > +
> > +  res1.x = _mm_mask_load_sd (src1.x, mask, p);
> > +  res2.x = _mm_maskz_load_sd (mask, p);
> > +
> > +  __builtin_memcpy (res_ref, zero, sizeof (zero));
> > +  res_ref[0] = val[0];
> > +  if (check_union128d (res1, res_ref))
> > +    abort ();
> > +
> > +  if (check_union128d (res2, res_ref))
> > +    abort ();
> > +
> > +  res3.x = _mm_mask_move_sd (src1.x, mask, src2.x, src3.x);
> > +  res4.x = _mm_maskz_move_sd (mask, src2.x, src3.x);
> > +
> > +  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
> > +  res_ref[0] = src3.a[0];
> > +  if (check_union128d (res3, res_ref))
> > +    abort ();
> > +
> > +  if (check_union128d (res4, res_ref))
> > +    abort ();
> > +
> > +  _mm_mask_store_sd (p + 1, mask, src1.x);
> > +  if (val[1] != src1.a[0])
> > +    abort ();
> > +
> > +  mask ^= 1;
> > +
> > +  res1.x = _mm_mask_load_sd (src1.x, mask, p);
> > +  res2.x = _mm_maskz_load_sd (mask, p);
> > +
> > +  __builtin_memcpy (res_ref, zero, sizeof (zero));
> > +  res_ref[0] = src1.a[0];
> > +  if (check_union128d (res1, res_ref))
> > +    abort ();
> > +
> > +  res_ref[0] = zero[0];
> > +  if (check_union128d (res2, res_ref))
> > +    abort ();
> > +
> > +  res3.x = _mm_mask_move_sd (src1.x, mask, src2.x, src3.x);
> > +  res4.x = _mm_maskz_move_sd (mask, src2.x, src3.x);
> > +
> > +  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
> > +  res_ref[0] = src1.a[0];
> > +  if (check_union128d (res3, res_ref))
> > +    abort ();
> > +
> > +  res_ref[0] = zero[0];
> > +  if (check_union128d (res4, res_ref))
> > +    abort ();
> > +
> > +  val[1] = 42.0;
> > +  _mm_mask_store_sd (p + 1, mask, src1.x);
> > +  if (val[1] != 42.0)
> > +    abort ();
> > +}
> > --- gcc/testsuite/gcc.target/i386/avx512f-vmovsd-3.c.jj 2019-03-06
> 19:11:57.977939021 +0100
> > +++ gcc/testsuite/gcc.target/i386/avx512f-vmovsd-3.c    2019-03-06
> 19:12:47.090133163 +0100
> > @@ -0,0 +1,84 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512f" } */
> > +/* { dg-require-effective-target avx512f } */
> > +
> > +#include "avx512f-check.h"
> > +
> > +#include "avx512f-helper.h"
> > +
> > +#define SIZE (128 / 64)
> > +#include "avx512f-mask-type.h"
> > +
> > +void
> > +avx512f_test (void)
> > +{
> > +  int i, sign;
> > +  union128d res1, res2, res3, res4, src1, src2, src3;
> > +  double val[2] = { 35.5, 0.0 };
> > +  double *volatile p = &val[0];
> > +  double res_ref[SIZE];
> > +  double zero[SIZE];
> > +
> > +  for (i = 0; i < SIZE; i++)
> > +    {
> > +      src1.a[i] = 1.5 + i;
> > +      src2.a[i] = 7.5 + i;
> > +      src3.a[i] = 4.5 + i;
> > +      zero[i] = 0.0;
> > +    }
> > +
> > +  res1.x = _mm_mask_load_sd (src1.x, 1, p);
> > +  res2.x = _mm_maskz_load_sd (1, p);
> > +
> > +  __builtin_memcpy (res_ref, zero, sizeof (zero));
> > +  res_ref[0] = val[0];
> > +  if (check_union128d (res1, res_ref))
> > +    abort ();
> > +
> > +  if (check_union128d (res2, res_ref))
> > +    abort ();
> > +
> > +  res3.x = _mm_mask_move_sd (src1.x, 1, src2.x, src3.x);
> > +  res4.x = _mm_maskz_move_sd (1, src2.x, src3.x);
> > +
> > +  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
> > +  res_ref[0] = src3.a[0];
> > +  if (check_union128d (res3, res_ref))
> > +    abort ();
> > +
> > +  if (check_union128d (res4, res_ref))
> > +    abort ();
> > +
> > +  _mm_mask_store_sd (p + 1, 1, src1.x);
> > +  if (val[1] != src1.a[0])
> > +    abort ();
> > +
> > +  res1.x = _mm_mask_load_sd (src1.x, 0, p);
> > +  res2.x = _mm_maskz_load_sd (0, p);
> > +
> > +  __builtin_memcpy (res_ref, zero, sizeof (zero));
> > +  res_ref[0] = src1.a[0];
> > +  if (check_union128d (res1, res_ref))
> > +    abort ();
> > +
> > +  res_ref[0] = zero[0];
> > +  if (check_union128d (res2, res_ref))
> > +    abort ();
> > +
> > +  res3.x = _mm_mask_move_sd (src1.x, 0, src2.x, src3.x);
> > +  res4.x = _mm_maskz_move_sd (0, src2.x, src3.x);
> > +
> > +  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
> > +  res_ref[0] = src1.a[0];
> > +  if (check_union128d (res3, res_ref))
> > +    abort ();
> > +
> > +  res_ref[0] = zero[0];
> > +  if (check_union128d (res4, res_ref))
> > +    abort ();
> > +
> > +  val[1] = 42.0;
> > +  _mm_mask_store_sd (p + 1, 0, src1.x);
> > +  if (val[1] != 42.0)
> > +    abort ();
> > +}
> >
> >
> >         Jakub
>
H.J. Lu March 8, 2019, 8:45 a.m. UTC | #5
On Thu, Mar 7, 2019 at 4:09 PM Jakub Jelinek <jakub@redhat.com> wrote:
>
> On Thu, Mar 07, 2019 at 08:11:53AM +0100, Uros Bizjak wrote:
> > > +(define_insn "*avx512f_load<mode>_mask"
> > > +  [(set (match_operand:<ssevecmode> 0 "register_operand" "=v")
> > > +       (vec_merge:<ssevecmode>
> > > +         (vec_merge:<ssevecmode>
> > > +           (vec_duplicate:<ssevecmode>
> > > +             (match_operand:MODEF 1 "memory_operand" "m"))
> > > +           (match_operand:<ssevecmode> 2 "nonimm_or_0_operand" "0C")
> > > +           (match_operand:QI 3 "nonmemory_operand" "Yk"))
> >
> > Is there a reason to have a nonmemory_operand predicate here instead of
> > register_operand?
>
> Thanks for catching that; it was left over from my earlier attempt to have
> Yk,n constraints and deal with that during output.  For the store it was
> possible; for the others there were some cases it couldn't handle, but
> further testing revealed that the combiner already handles most of the
> constant mask cases right.
>
> Here is the updated patch; I've changed this in two spots.  It even improves the
> constant 1 case (the only one that is still not optimized as much as it
> should):
>  f4:
> -       movzbl  .LC0(%rip), %eax
> +       movl    $1, %eax
>         kmovw   %eax, %k1
>         vmovsd  (%rsi), %xmm0{%k1}{z}
>         ret
> Tested so far with make check-gcc RUNTESTFLAGS=i386.exp=avx512f-vmovs*.c
> and by compiling/eyeballing differences on the short testcase I've posted
> in the description, also with the u, -> 1, and u, -> 0, changes; apart
> from the above f4, no differences.
>
> Ok for trunk if it passes another full bootstrap/regtest?
>
> 2019-03-07  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/89602
>         * config/i386/sse.md (avx512f_mov<ssescalarmodelower>_mask,
>         *avx512f_load<mode>_mask, avx512f_store<mode>_mask): New define_insns.
>         (avx512f_load<mode>_mask): New define_expand.
>         * config/i386/i386-builtin.def (__builtin_ia32_loadsd_mask,
>         __builtin_ia32_loadss_mask, __builtin_ia32_storesd_mask,
>         __builtin_ia32_storess_mask, __builtin_ia32_movesd_mask,
>         __builtin_ia32_movess_mask): New builtins.
>         * config/i386/avx512fintrin.h (_mm_mask_load_ss, _mm_maskz_load_ss,
>         _mm_mask_load_sd, _mm_maskz_load_sd, _mm_mask_move_ss,
>         _mm_maskz_move_ss, _mm_mask_move_sd, _mm_maskz_move_sd,
>         _mm_mask_store_ss, _mm_mask_store_sd): New intrinsics.
>

This caused:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89630

This looks very strange, since this patch only touched the backend.
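
As an aside for readers, the merge/zero semantics the new intrinsics
implement (and which the testcases in the patch below verify) can be seen
in a small standalone example -- my sketch, not part of the patch; build
with gcc -O2 -mavx512f and run on an AVX512F machine:

#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  __m128 w = _mm_set_ps (40.0f, 30.0f, 20.0f, 10.0f);	/* merge source */
  __m128 a = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);
  __m128 b = _mm_set_ps (8.0f, 7.0f, 6.0f, 5.0f);
  float out[4];

  /* Element 0 is taken from b when the low mask bit is set, otherwise
     from w; elements 1..3 always come from a.  */
  _mm_storeu_ps (out, _mm_mask_move_ss (w, 1, a, b));
  printf ("%g %g %g %g\n", out[0], out[1], out[2], out[3]);	/* 5 2 3 4 */
  _mm_storeu_ps (out, _mm_mask_move_ss (w, 0, a, b));
  printf ("%g %g %g %g\n", out[0], out[1], out[2], out[3]);	/* 10 2 3 4 */
  return 0;
}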

Patch

--- gcc/config/i386/sse.md.jj	2019-02-20 23:40:17.119140235 +0100
+++ gcc/config/i386/sse.md	2019-03-06 19:15:12.379749161 +0100
@@ -1151,6 +1151,67 @@  (define_insn "<avx512>_load<mode>_mask"
    (set_attr "memory" "none,load")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn "avx512f_mov<ssescalarmodelower>_mask"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+	(vec_merge:VF_128
+	  (vec_merge:VF_128
+	    (match_operand:VF_128 2 "register_operand" "v")
+	    (match_operand:VF_128 3 "nonimm_or_0_operand" "0C")
+	    (match_operand:QI 4 "register_operand" "Yk"))
+	  (match_operand:VF_128 1 "register_operand" "v")
+	  (const_int 1)))]
+  "TARGET_AVX512F"
+  "vmov<ssescalarmodesuffix>\t{%2, %1, %0%{%4%}%N3|%0%{%4%}%N3, %1, %2}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<ssescalarmode>")])
+
+(define_expand "avx512f_load<mode>_mask"
+  [(set (match_operand:<ssevecmode> 0 "register_operand")
+	(vec_merge:<ssevecmode>
+	  (vec_merge:<ssevecmode>
+	    (vec_duplicate:<ssevecmode>
+	      (match_operand:MODEF 1 "memory_operand"))
+	    (match_operand:<ssevecmode> 2 "nonimm_or_0_operand")
+	    (match_operand:QI 3 "nonmemory_operand"))
+	  (match_dup 4)
+	  (const_int 1)))]
+  "TARGET_AVX512F"
+  "operands[4] = CONST0_RTX (<ssevecmode>mode);")
+
+(define_insn "*avx512f_load<mode>_mask"
+  [(set (match_operand:<ssevecmode> 0 "register_operand" "=v")
+	(vec_merge:<ssevecmode>
+	  (vec_merge:<ssevecmode>
+	    (vec_duplicate:<ssevecmode>
+	      (match_operand:MODEF 1 "memory_operand" "m"))
+	    (match_operand:<ssevecmode> 2 "nonimm_or_0_operand" "0C")
+	    (match_operand:QI 3 "nonmemory_operand" "Yk"))
+	  (match_operand:<ssevecmode> 4 "const0_operand" "C")
+	  (const_int 1)))]
+  "TARGET_AVX512F"
+  "vmov<ssescalarmodesuffix>\t{%1, %0%{%3%}%N2|%0%{3%}%N2, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "memory" "load")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512f_store<mode>_mask"
+  [(set (match_operand:MODEF 0 "memory_operand" "=m")
+	(if_then_else:MODEF
+	  (and:QI (match_operand:QI 2 "nonmemory_operand" "Yk")
+		 (const_int 1))
+	  (vec_select:MODEF
+	    (match_operand:<ssevecmode> 1 "register_operand" "v")
+	    (parallel [(const_int 0)]))
+	  (match_dup 0)))]
+  "TARGET_AVX512F"
+  "vmov<ssescalarmodesuffix>\t{%1, %0%{%2%}|%0%{%2%}, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "memory" "store")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "<avx512>_blendm<mode>"
   [(set (match_operand:V48_AVX512VL 0 "register_operand" "=v")
 	(vec_merge:V48_AVX512VL
--- gcc/config/i386/i386-builtin.def.jj	2019-01-22 23:26:46.622213698 +0100
+++ gcc/config/i386/i386-builtin.def	2019-03-06 15:20:59.096670143 +0100
@@ -255,6 +255,10 @@  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storev16si_mask, "__builtin_ia32_movdqa32store512_mask", IX86_BUILTIN_MOVDQA32STORE512, UNKNOWN, (int) VOID_FTYPE_PV16SI_V16SI_UHI)
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storev8df_mask, "__builtin_ia32_storeapd512_mask", IX86_BUILTIN_STOREAPD512, UNKNOWN, (int) VOID_FTYPE_PV8DF_V8DF_UQI)
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storev8di_mask, "__builtin_ia32_movdqa64store512_mask", IX86_BUILTIN_MOVDQA64STORE512, UNKNOWN, (int) VOID_FTYPE_PV8DI_V8DI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loaddf_mask, "__builtin_ia32_loadsd_mask", IX86_BUILTIN_LOADSD_MASK, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE_V2DF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadsf_mask, "__builtin_ia32_loadss_mask", IX86_BUILTIN_LOADSS_MASK, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT_V4SF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storedf_mask, "__builtin_ia32_storesd_mask", IX86_BUILTIN_STORESD_MASK, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storesf_mask, "__builtin_ia32_storess_mask", IX86_BUILTIN_STORESS_MASK, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF_UQI)
 
 BDESC (OPTION_MASK_ISA_LWP, 0, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID)
 BDESC (OPTION_MASK_ISA_LWP, 0, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID)
@@ -1470,6 +1474,8 @@  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_vternlogv16si_maskz, "__builtin_ia32_pternlogd512_maskz", IX86_BUILTIN_VTERNLOGD512_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI)
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_vternlogv8di_mask, "__builtin_ia32_pternlogq512_mask", IX86_BUILTIN_VTERNLOGQ512_MASK, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI)
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_vternlogv8di_maskz, "__builtin_ia32_pternlogq512_maskz", IX86_BUILTIN_VTERNLOGQ512_MASKZ, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI)
+BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_movdf_mask, "__builtin_ia32_movesd_mask", IX86_BUILTIN_MOVSD_MASK, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_movsf_mask, "__builtin_ia32_movess_mask", IX86_BUILTIN_MOVSS_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF_UQI)
 
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_copysignv16sf3,  "__builtin_ia32_copysignps512", IX86_BUILTIN_CPYSGNPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF)
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_copysignv8df3,  "__builtin_ia32_copysignpd512", IX86_BUILTIN_CPYSGNPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF)
--- gcc/config/i386/avx512fintrin.h.jj	2019-01-17 13:20:00.812472551 +0100
+++ gcc/config/i386/avx512fintrin.h	2019-03-06 15:22:53.662791558 +0100
@@ -6273,6 +6273,83 @@  _mm512_mask_storeu_ps (void *__P, __mmas
 				   (__mmask16) __U);
 }
 
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float *__P)
+{
+  return (__m128) __builtin_ia32_loadss_mask (__P, (__v4sf) __W, __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_load_ss (__mmask8 __U, const float *__P)
+{
+  return (__m128) __builtin_ia32_loadss_mask (__P, (__v4sf) _mm_setzero_ps (),
+					      __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double *__P)
+{
+  return (__m128d) __builtin_ia32_loadsd_mask (__P, (__v2df) __W, __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_load_sd (__mmask8 __U, const double *__P)
+{
+  return (__m128d) __builtin_ia32_loadsd_mask (__P, (__v2df) _mm_setzero_pd (),
+					       __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_movess_mask ((__v4sf) __A, (__v4sf) __B,
+					      (__v4sf) __W, __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_movess_mask ((__v4sf) __A, (__v4sf) __B,
+					      (__v4sf) _mm_setzero_ps (), __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_movesd_mask ((__v2df) __A, (__v2df) __B,
+					       (__v2df) __W, __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_movesd_mask ((__v2df) __A, (__v2df) __B,
+					       (__v2df) _mm_setzero_pd (),
+					       __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_store_ss (float *__P, __mmask8 __U, __m128 __A)
+{
+  __builtin_ia32_storess_mask (__P, (__v4sf) __A, (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_store_sd (double *__P, __mmask8 __U, __m128d __A)
+{
+  __builtin_ia32_storesd_mask (__P, (__v2df) __A, (__mmask8) __U);
+}
+
 extern __inline __m512i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
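
For reference, a plain C sketch of the element-wise semantics the new
vmovss intrinsics are expected to have, following the SDM description
of masked VMOVSS and the runtime tests below (the vmovsd variants are
analogous with two double elements; the ref_* helper names here are
made up for illustration only):

/* _mm_mask_load_ss (W, U, P): element 0 is *P when the low mask bit
   is set, otherwise W[0] (or 0.0f for the maskz form); elements 1-3
   are always zeroed.  */
static void
ref_mask_load_ss (float dst[4], const float w[4], __mmask8 u,
		  const float *p)
{
  dst[0] = (u & 1) ? *p : w[0];
  dst[1] = dst[2] = dst[3] = 0.0f;
}

/* _mm_mask_move_ss (W, U, A, B): element 0 is B[0] when the low mask
   bit is set, otherwise W[0] (or 0.0f for the maskz form); elements
   1-3 are copied from A.  */
static void
ref_mask_move_ss (float dst[4], const float w[4], __mmask8 u,
		  const float a[4], const float b[4])
{
  dst[0] = (u & 1) ? b[0] : w[0];
  dst[1] = a[1];
  dst[2] = a[2];
  dst[3] = a[3];
}

/* _mm_mask_store_ss (P, U, A): *P is written only when the low mask
   bit is set, otherwise the memory is left untouched.  */
static void
ref_mask_store_ss (float *p, __mmask8 u, const float a[4])
{
  if (u & 1)
    *p = a[0];
}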
--- gcc/testsuite/gcc.target/i386/avx512f-vmovss-1.c.jj	2019-03-06 15:34:07.972734673 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-vmovss-1.c	2019-03-06 15:44:46.891258107 +0100
@@ -0,0 +1,23 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2 -masm=att" } */
+/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\\(%\[a-z0-9,]*\\), %xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\\(%\[a-z0-9,]*\\), %xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+%xmm\[0-9\]+, %xmm\[0-9\]+, %xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+%xmm\[0-9\]+, %xmm\[0-9\]+, %xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+%xmm\[0-9\]+, \\(%\[a-z0-9,]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include <immintrin.h>
+
+volatile __m128 x1, x2, x3;
+volatile __mmask8 m;
+float *volatile p;
+
+void extern
+avx512f_test (void)
+{
+  x1 = _mm_mask_load_ss (x1, m, p);
+  x1 = _mm_maskz_load_ss (m, p);
+  x1 = _mm_mask_move_ss (x1, m, x2, x3);
+  x1 = _mm_maskz_move_ss (m, x2, x3);
+  _mm_mask_store_ss (p, m, x1);
+}
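
The scan patterns in this test (and in the vmovsd counterpart below)
are meant to match AT&T syntax lines of the following shape; the
particular register and mask register allocation shown here is of
course just an example:

vmovss (%rsi), %xmm0{%k1}
vmovss (%rsi), %xmm0{%k1}{z}
vmovss %xmm2, %xmm1, %xmm0{%k1}
vmovss %xmm2, %xmm1, %xmm0{%k1}{z}
vmovss %xmm0, (%rdi){%k1}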
--- gcc/testsuite/gcc.target/i386/avx512f-vmovss-2.c.jj	2019-03-06 15:50:52.072264356 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-vmovss-2.c	2019-03-06 19:08:14.933598873 +0100
@@ -0,0 +1,87 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-require-effective-target avx512f } */
+
+#include "avx512f-check.h"
+
+#include "avx512f-helper.h"
+
+#define SIZE (128 / 32)
+#include "avx512f-mask-type.h"
+
+void
+avx512f_test (void)
+{
+  int i;
+  union128 res1, res2, res3, res4, src1, src2, src3;
+  volatile __mmask8 mask = 5;
+  float val[2] = { 35.5f, 0.0f };
+  float *volatile p = &val[0];
+  float res_ref[SIZE];
+  float zero[SIZE];
+
+  for (i = 0; i < SIZE; i++)
+    {
+      src1.a[i] = 1.5f + i;
+      src2.a[i] = 7.5f + i;
+      src3.a[i] = 4.5f + i;
+      zero[i] = 0.0f;
+    }
+
+  res1.x = _mm_mask_load_ss (src1.x, mask, p);
+  res2.x = _mm_maskz_load_ss (mask, p);
+
+  __builtin_memcpy (res_ref, zero, sizeof (zero));
+  res_ref[0] = val[0];
+  if (check_union128 (res1, res_ref))
+    abort ();
+
+  if (check_union128 (res2, res_ref))
+    abort ();
+
+  res3.x = _mm_mask_move_ss (src1.x, mask, src2.x, src3.x);
+  res4.x = _mm_maskz_move_ss (mask, src2.x, src3.x);
+
+  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
+  res_ref[0] = src3.a[0];
+  if (check_union128 (res3, res_ref))
+    abort ();
+
+  if (check_union128 (res4, res_ref))
+    abort ();
+
+  _mm_mask_store_ss (p + 1, mask, src1.x);
+  if (val[1] != src1.a[0])
+    abort ();
+
+  mask ^= 1;
+
+  res1.x = _mm_mask_load_ss (src1.x, mask, p);
+  res2.x = _mm_maskz_load_ss (mask, p);
+
+  __builtin_memcpy (res_ref, zero, sizeof (zero));
+  res_ref[0] = src1.a[0];
+  if (check_union128 (res1, res_ref))
+    abort ();
+
+  res_ref[0] = zero[0];
+  if (check_union128 (res2, res_ref))
+    abort ();
+
+  res3.x = _mm_mask_move_ss (src1.x, mask, src2.x, src3.x);
+  res4.x = _mm_maskz_move_ss (mask, src2.x, src3.x);
+
+  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
+  res_ref[0] = src1.a[0];
+  if (check_union128 (res3, res_ref))
+    abort ();
+
+  res_ref[0] = zero[0];
+  if (check_union128 (res4, res_ref))
+    abort ();
+
+  val[1] = 42.0f;
+  _mm_mask_store_ss (p + 1, mask, src1.x);
+  if (val[1] != 42.0f)
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/avx512f-vmovss-3.c.jj	2019-03-06 19:11:19.058577646 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-vmovss-3.c	2019-03-06 19:11:46.815122188 +0100
@@ -0,0 +1,84 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-require-effective-target avx512f } */
+
+#include "avx512f-check.h"
+
+#include "avx512f-helper.h"
+
+#define SIZE (128 / 32)
+#include "avx512f-mask-type.h"
+
+void
+avx512f_test (void)
+{
+  int i;
+  union128 res1, res2, res3, res4, src1, src2, src3;
+  float val[2] = { 35.5f, 0.0f };
+  float *volatile p = &val[0];
+  float res_ref[SIZE];
+  float zero[SIZE];
+
+  for (i = 0; i < SIZE; i++)
+    {
+      src1.a[i] = 1.5f + i;
+      src2.a[i] = 7.5f + i;
+      src3.a[i] = 4.5f + i;
+      zero[i] = 0.0f;
+    }
+
+  res1.x = _mm_mask_load_ss (src1.x, 1, p);
+  res2.x = _mm_maskz_load_ss (1, p);
+
+  __builtin_memcpy (res_ref, zero, sizeof (zero));
+  res_ref[0] = val[0];
+  if (check_union128 (res1, res_ref))
+    abort ();
+
+  if (check_union128 (res2, res_ref))
+    abort ();
+
+  res3.x = _mm_mask_move_ss (src1.x, 1, src2.x, src3.x);
+  res4.x = _mm_maskz_move_ss (1, src2.x, src3.x);
+
+  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
+  res_ref[0] = src3.a[0];
+  if (check_union128 (res3, res_ref))
+    abort ();
+
+  if (check_union128 (res4, res_ref))
+    abort ();
+
+  _mm_mask_store_ss (p + 1, 1, src1.x);
+  if (val[1] != src1.a[0])
+    abort ();
+
+  res1.x = _mm_mask_load_ss (src1.x, 0, p);
+  res2.x = _mm_maskz_load_ss (0, p);
+
+  __builtin_memcpy (res_ref, zero, sizeof (zero));
+  res_ref[0] = src1.a[0];
+  if (check_union128 (res1, res_ref))
+    abort ();
+
+  res_ref[0] = zero[0];
+  if (check_union128 (res2, res_ref))
+    abort ();
+
+  res3.x = _mm_mask_move_ss (src1.x, 0, src2.x, src3.x);
+  res4.x = _mm_maskz_move_ss (0, src2.x, src3.x);
+
+  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
+  res_ref[0] = src1.a[0];
+  if (check_union128 (res3, res_ref))
+    abort ();
+
+  res_ref[0] = zero[0];
+  if (check_union128 (res4, res_ref))
+    abort ();
+
+  val[1] = 42.0f;
+  _mm_mask_store_ss (p + 1, 0, src1.x);
+  if (val[1] != 42.0f)
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/avx512f-vmovsd-1.c.jj	2019-03-06 15:45:04.922962437 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-vmovsd-1.c	2019-03-06 15:45:30.032550703 +0100
@@ -0,0 +1,23 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2 -masm=att" } */
+/* { dg-final { scan-assembler-times "vmovsd\[ \\t\]+\\(%\[a-z0-9,]*\\), %xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovsd\[ \\t\]+\\(%\[a-z0-9,]*\\), %xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovsd\[ \\t\]+%xmm\[0-9\]+, %xmm\[0-9\]+, %xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovsd\[ \\t\]+%xmm\[0-9\]+, %xmm\[0-9\]+, %xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovsd\[ \\t\]+%xmm\[0-9\]+, \\(%\[a-z0-9,]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include <immintrin.h>
+
+volatile __m128d x1, x2, x3;
+volatile __mmask8 m;
+double *volatile p;
+
+void extern
+avx512f_test (void)
+{
+  x1 = _mm_mask_load_sd (x1, m, p);
+  x1 = _mm_maskz_load_sd (m, p);
+  x1 = _mm_mask_move_sd (x1, m, x2, x3);
+  x1 = _mm_maskz_move_sd (m, x2, x3);
+  _mm_mask_store_sd (p, m, x1);
+}
--- gcc/testsuite/gcc.target/i386/avx512f-vmovsd-2.c.jj	2019-03-06 19:05:18.862487956 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-vmovsd-2.c	2019-03-06 19:07:58.954861065 +0100
@@ -0,0 +1,87 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-require-effective-target avx512f } */
+
+#include "avx512f-check.h"
+
+#include "avx512f-helper.h"
+
+#define SIZE (128 / 64)
+#include "avx512f-mask-type.h"
+
+void
+avx512f_test (void)
+{
+  int i;
+  union128d res1, res2, res3, res4, src1, src2, src3;
+  volatile __mmask8 mask = 5;
+  double val[2] = { 35.5, 0.0 };
+  double *volatile p = &val[0];
+  double res_ref[SIZE];
+  double zero[SIZE];
+
+  for (i = 0; i < SIZE; i++)
+    {
+      src1.a[i] = 1.5 + i;
+      src2.a[i] = 7.5 + i;
+      src3.a[i] = 4.5 + i;
+      zero[i] = 0.0;
+    }
+
+  res1.x = _mm_mask_load_sd (src1.x, mask, p);
+  res2.x = _mm_maskz_load_sd (mask, p);
+
+  __builtin_memcpy (res_ref, zero, sizeof (zero));
+  res_ref[0] = val[0];
+  if (check_union128d (res1, res_ref))
+    abort ();
+
+  if (check_union128d (res2, res_ref))
+    abort ();
+
+  res3.x = _mm_mask_move_sd (src1.x, mask, src2.x, src3.x);
+  res4.x = _mm_maskz_move_sd (mask, src2.x, src3.x);
+
+  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
+  res_ref[0] = src3.a[0];
+  if (check_union128d (res3, res_ref))
+    abort ();
+
+  if (check_union128d (res4, res_ref))
+    abort ();
+
+  _mm_mask_store_sd (p + 1, mask, src1.x);
+  if (val[1] != src1.a[0])
+    abort ();
+
+  mask ^= 1;
+
+  res1.x = _mm_mask_load_sd (src1.x, mask, p);
+  res2.x = _mm_maskz_load_sd (mask, p);
+
+  __builtin_memcpy (res_ref, zero, sizeof (zero));
+  res_ref[0] = src1.a[0];
+  if (check_union128d (res1, res_ref))
+    abort ();
+
+  res_ref[0] = zero[0];
+  if (check_union128d (res2, res_ref))
+    abort ();
+
+  res3.x = _mm_mask_move_sd (src1.x, mask, src2.x, src3.x);
+  res4.x = _mm_maskz_move_sd (mask, src2.x, src3.x);
+
+  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
+  res_ref[0] = src1.a[0];
+  if (check_union128d (res3, res_ref))
+    abort ();
+
+  res_ref[0] = zero[0];
+  if (check_union128d (res4, res_ref))
+    abort ();
+
+  val[1] = 42.0;
+  _mm_mask_store_sd (p + 1, mask, src1.x);
+  if (val[1] != 42.0)
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/avx512f-vmovsd-3.c.jj	2019-03-06 19:11:57.977939021 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-vmovsd-3.c	2019-03-06 19:12:47.090133163 +0100
@@ -0,0 +1,84 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-require-effective-target avx512f } */
+
+#include "avx512f-check.h"
+
+#include "avx512f-helper.h"
+
+#define SIZE (128 / 64)
+#include "avx512f-mask-type.h"
+
+void
+avx512f_test (void)
+{
+  int i;
+  union128d res1, res2, res3, res4, src1, src2, src3;
+  double val[2] = { 35.5, 0.0 };
+  double *volatile p = &val[0];
+  double res_ref[SIZE];
+  double zero[SIZE];
+
+  for (i = 0; i < SIZE; i++)
+    {
+      src1.a[i] = 1.5 + i;
+      src2.a[i] = 7.5 + i;
+      src3.a[i] = 4.5 + i;
+      zero[i] = 0.0;
+    }
+
+  res1.x = _mm_mask_load_sd (src1.x, 1, p);
+  res2.x = _mm_maskz_load_sd (1, p);
+
+  __builtin_memcpy (res_ref, zero, sizeof (zero));
+  res_ref[0] = val[0];
+  if (check_union128d (res1, res_ref))
+    abort ();
+
+  if (check_union128d (res2, res_ref))
+    abort ();
+
+  res3.x = _mm_mask_move_sd (src1.x, 1, src2.x, src3.x);
+  res4.x = _mm_maskz_move_sd (1, src2.x, src3.x);
+
+  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
+  res_ref[0] = src3.a[0];
+  if (check_union128d (res3, res_ref))
+    abort ();
+
+  if (check_union128d (res4, res_ref))
+    abort ();
+
+  _mm_mask_store_sd (p + 1, 1, src1.x);
+  if (val[1] != src1.a[0])
+    abort ();
+
+  res1.x = _mm_mask_load_sd (src1.x, 0, p);
+  res2.x = _mm_maskz_load_sd (0, p);
+
+  __builtin_memcpy (res_ref, zero, sizeof (zero));
+  res_ref[0] = src1.a[0];
+  if (check_union128d (res1, res_ref))
+    abort ();
+
+  res_ref[0] = zero[0];
+  if (check_union128d (res2, res_ref))
+    abort ();
+
+  res3.x = _mm_mask_move_sd (src1.x, 0, src2.x, src3.x);
+  res4.x = _mm_maskz_move_sd (0, src2.x, src3.x);
+
+  __builtin_memcpy (res_ref, src2.a, sizeof (src2.a));
+  res_ref[0] = src1.a[0];
+  if (check_union128d (res3, res_ref))
+    abort ();
+
+  res_ref[0] = zero[0];
+  if (check_union128d (res4, res_ref))
+    abort ();
+
+  val[1] = 42.0;
+  _mm_mask_store_sd (p + 1, 0, src1.x);
+  if (val[1] != 42.0)
+    abort ();
+}