Improve AVX/AVX512F ix86_expand_vector_init_one_nonzero (PR target/83203)

Message ID 20180111185933.GV1833@tucnak
State New
Headers show
Series
  • Improve AVX/AVX512F ix86_expand_vector_init_one_nonzero (PR target/83203)
Related show

Commit Message

Jakub Jelinek Jan. 11, 2018, 6:59 p.m.
Hi!

This patch improves insertion of a single scalar into the first element
of otherwise empty vector for 256-bit and 512-bit vectors.
As 128-bit vmovd/vmovq/vmovss/vinsertps all clear all the upper bits
of the target, there is no need to do anything but these instructions
(or, when tuning for AMD or generic, these from memory).
E.g. given:
typedef long long v4di __attribute__((vector_size (32)));
typedef int v8si __attribute__((vector_size (32)));
typedef double v4df __attribute__((vector_size (32)));
typedef float v8sf __attribute__((vector_size (32)));

v4di
f1 (long long x)
{
  return (v4di) { x };
}

v8si
f2 (int x)
{
  return (v8si) { x };
}

v4df
f3 (double x)
{
  return (v4df) { x };
}

v8sf
f4 (float x)
{
  return (v8sf) { x };
}

#ifdef __AVX512F__
typedef long long v8di __attribute__((vector_size (64)));
typedef int v16si __attribute__((vector_size (64)));
typedef double v8df __attribute__((vector_size (64)));
typedef float v16sf __attribute__((vector_size (64)));

v8di
f5 (long long x)
{
  return (v8di) { x };
}

v16si
f6 (int x)
{
  return (v16si) { x };
}

v8df
f7 (double x)
{
  return (v8df) { x };
}

v16sf
f8 (float x)
{
  return (v16sf) { x };
}
#endif

with -O2 -m64 -mavx512{bw,dq,vl} -mtune=intel, the difference with the
patch is:
 f1:
        vmovq   %rdi, %xmm0
-       vmovdqa %xmm0, %xmm0
        ret
 f2:
        vmovd   %edi, %xmm0
-       vmovdqa %xmm0, %xmm0
        ret
 f3:
        vmovq   %xmm0, %xmm0
-       vmovapd %xmm0, %xmm0
        ret
 f4:
        vinsertps       $0xe, %xmm0, %xmm0, %xmm0
-       vmovaps %xmm0, %xmm0
        ret
 f5:
        vmovq   %rdi, %xmm0
-       vmovdqa %xmm0, %xmm0
-       vmovdqa %ymm0, %ymm0
        ret
 f6:
-       vpxor   %xmm1, %xmm1, %xmm1
        vmovd   %edi, %xmm0
-       vpunpcklqdq     %xmm1, %xmm0, %xmm0
-       vmovdqa %xmm0, %xmm0
-       vmovdqa %ymm0, %ymm0
        ret
 f7:
        vmovq   %xmm0, %xmm0
-       vmovapd %xmm0, %xmm0
-       vmovapd %ymm0, %ymm0
        ret
 f8:
-       pushq   %rbp
-       vxorps  %xmm1, %xmm1, %xmm1
-       movq    %rsp, %rbp
-       andq    $-64, %rsp
-       vmovss  %xmm0, -4(%rsp)
-       vmovss  -4(%rsp), %xmm0
-       vmovlhps        %xmm1, %xmm0, %xmm0
-       vmovaps %xmm0, %xmm0
-       vmovaps %ymm0, %ymm0
-       leave
+       vinsertps       $0xe, %xmm0, %xmm0, %xmm0
        ret

Bootstrapped/regtested on x86_64-linux and i686-linux,
ok for trunk?

2018-01-11  Jakub Jelinek  <jakub@redhat.com>

	PR target/83203
	* config/i386/i386.c (ix86_expand_vector_init_one_nonzero): If one_var
	is 0, for V{8,16}S[IF] and V[48]D[IF]mode use gen_vec_set<mode>_0.
	* config/i386/sse.md (VI8_AVX_AVX512F, VI4F_256_512): New mode
	iterators.
	(ssescalarmodesuffix): Add 512-bit vectors.  Use "d" or "q" for
	integral modes instead of "ss" and "sd".
	(vec_set<mode>_0): New define_insns for 256-bit and 512-bit
	vectors with 32-bit and 64-bit elements.
	(vecdupssescalarmodesuffix): New mode attribute.
	(vec_dup<mode>): Use it.


	Jakub

Comments

Uros Bizjak Jan. 11, 2018, 8:32 p.m. | #1
On Thu, Jan 11, 2018 at 7:59 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> Hi!
>
> This patch improves insertion of a single scalar into the first element
> of otherwise empty vector for 256-bit and 512-bit vectors.
> As 128-bit vmovd/vmovq/vmovss/vinsertps all clear all the upper bits
> of the target, there is no need to do anything but these instructions
> (or, when tuning for amd or generic, these from memory).
> E.g. given:
> typedef long long v4di __attribute__((vector_size (32)));
> typedef int v8si __attribute__((vector_size (32)));
> typedef double v4df __attribute__((vector_size (32)));
> typedef float v8sf __attribute__((vector_size (32)));
>
> v4di
> f1 (long long x)
> {
>   return (v4di) { x };
> }
>
> v8si
> f2 (int x)
> {
>   return (v8si) { x };
> }
>
> v4df
> f3 (double x)
> {
>   return (v4df) { x };
> }
>
> v8sf
> f4 (float x)
> {
>   return (v8sf) { x };
> }
>
> #ifdef __AVX512F__
> typedef long long v8di __attribute__((vector_size (64)));
> typedef int v16si __attribute__((vector_size (64)));
> typedef double v8df __attribute__((vector_size (64)));
> typedef float v16sf __attribute__((vector_size (64)));
>
> v8di
> f5 (long long x)
> {
>   return (v8di) { x };
> }
>
> v16si
> f6 (int x)
> {
>   return (v16si) { x };
> }
>
> v8df
> f7 (double x)
> {
>   return (v8df) { x };
> }
>
> v16sf
> f8 (float x)
> {
>   return (v16sf) { x };
> }
> #endif
>
> with -O2 -m64 -mavx512{bw,dq,vl} -mtune=intel, the difference with the
> patch is:
>  f1:
>         vmovq   %rdi, %xmm0
> -       vmovdqa %xmm0, %xmm0
>         ret
>  f2:
>         vmovd   %edi, %xmm0
> -       vmovdqa %xmm0, %xmm0
>         ret
>  f3:
>         vmovq   %xmm0, %xmm0
> -       vmovapd %xmm0, %xmm0
>         ret
>  f4:
>         vinsertps       $0xe, %xmm0, %xmm0, %xmm0
> -       vmovaps %xmm0, %xmm0
>         ret
>  f5:
>         vmovq   %rdi, %xmm0
> -       vmovdqa %xmm0, %xmm0
> -       vmovdqa %ymm0, %ymm0
>         ret
>  f6:
> -       vpxor   %xmm1, %xmm1, %xmm1
>         vmovd   %edi, %xmm0
> -       vpunpcklqdq     %xmm1, %xmm0, %xmm0
> -       vmovdqa %xmm0, %xmm0
> -       vmovdqa %ymm0, %ymm0
>         ret
>  f7:
>         vmovq   %xmm0, %xmm0
> -       vmovapd %xmm0, %xmm0
> -       vmovapd %ymm0, %ymm0
>         ret
>  f8:
> -       pushq   %rbp
> -       vxorps  %xmm1, %xmm1, %xmm1
> -       movq    %rsp, %rbp
> -       andq    $-64, %rsp
> -       vmovss  %xmm0, -4(%rsp)
> -       vmovss  -4(%rsp), %xmm0
> -       vmovlhps        %xmm1, %xmm0, %xmm0
> -       vmovaps %xmm0, %xmm0
> -       vmovaps %ymm0, %ymm0
> -       leave
> +       vinsertps       $0xe, %xmm0, %xmm0, %xmm0
>         ret
>
> Bootstrapped/regtested on x86_64-linux and i686-linux,
> ok for trunk?
>
> 2018-01-11  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/83203
>         * config/i386/i386.c (ix86_expand_vector_init_one_nonzero): If one_var
>         is 0, for V{8,16}S[IF] and V[48]D[IF]mode use gen_vec_set<mode>_0.
>         * config/i386/sse.md (VI8_AVX_AVX512F, VI4F_256_512): New mode
>         iterators.
>         (ssescalarmodesuffix): Add 512-bit vectors.  Use "d" or "q" for
>         integral modes instead of "ss" and "sd".
>         (vec_set<mode>_0): New define_insns for 256-bit and 512-bit
>         vectors with 32-bit and 64-bit elements.
>         (vecdupssescalarmodesuffix): New mode attribute.
>         (vec_dup<mode>): Use it.

OK.

Thanks,
Uros.

> --- gcc/config/i386/i386.c.jj   2018-01-10 17:08:56.076912734 +0100
> +++ gcc/config/i386/i386.c      2018-01-11 15:32:12.686848932 +0100
> @@ -41762,6 +41762,7 @@ ix86_expand_vector_init_one_nonzero (boo
>    rtx new_target;
>    rtx x, tmp;
>    bool use_vector_set = false;
> +  rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
>
>    switch (mode)
>      {
> @@ -41786,14 +41787,41 @@ ix86_expand_vector_init_one_nonzero (boo
>        break;
>      case E_V32QImode:
>      case E_V16HImode:
> +      use_vector_set = TARGET_AVX;
> +      break;
>      case E_V8SImode:
> +      use_vector_set = TARGET_AVX;
> +      gen_vec_set_0 = gen_vec_setv8si_0;
> +      break;
>      case E_V8SFmode:
> +      use_vector_set = TARGET_AVX;
> +      gen_vec_set_0 = gen_vec_setv8sf_0;
> +      break;
>      case E_V4DFmode:
>        use_vector_set = TARGET_AVX;
> +      gen_vec_set_0 = gen_vec_setv4df_0;
>        break;
>      case E_V4DImode:
>        /* Use ix86_expand_vector_set in 64bit mode only.  */
>        use_vector_set = TARGET_AVX && TARGET_64BIT;
> +      gen_vec_set_0 = gen_vec_setv4di_0;
> +      break;
> +    case E_V16SImode:
> +      use_vector_set = TARGET_AVX512F && one_var == 0;
> +      gen_vec_set_0 = gen_vec_setv16si_0;
> +      break;
> +    case E_V16SFmode:
> +      use_vector_set = TARGET_AVX512F && one_var == 0;
> +      gen_vec_set_0 = gen_vec_setv16sf_0;
> +      break;
> +    case E_V8DFmode:
> +      use_vector_set = TARGET_AVX512F && one_var == 0;
> +      gen_vec_set_0 = gen_vec_setv8df_0;
> +      break;
> +    case E_V8DImode:
> +      /* Use ix86_expand_vector_set in 64bit mode only.  */
> +      use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
> +      gen_vec_set_0 = gen_vec_setv8di_0;
>        break;
>      default:
>        break;
> @@ -41801,6 +41829,12 @@ ix86_expand_vector_init_one_nonzero (boo
>
>    if (use_vector_set)
>      {
> +      if (gen_vec_set_0 && one_var == 0)
> +       {
> +         var = force_reg (GET_MODE_INNER (mode), var);
> +         emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
> +         return true;
> +       }
>        emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
>        var = force_reg (GET_MODE_INNER (mode), var);
>        ix86_expand_vector_set (mmx_ok, target, var, one_var);
> --- gcc/config/i386/sse.md.jj   2018-01-05 17:39:34.591260408 +0100
> +++ gcc/config/i386/sse.md      2018-01-11 15:52:22.762139416 +0100
> @@ -401,6 +401,9 @@ (define_mode_iterator VI8_AVX2
>  (define_mode_iterator VI8_AVX2_AVX512F
>    [(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX2") V2DI])
>
> +(define_mode_iterator VI8_AVX_AVX512F
> +  [(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX")])
> +
>  (define_mode_iterator VI4_128_8_256
>    [V4SI V4DI])
>
> @@ -622,6 +625,9 @@ (define_mode_iterator VI4F_128 [V4SI V4S
>  (define_mode_iterator VI8F_128 [V2DI V2DF])
>  (define_mode_iterator VI4F_256 [V8SI V8SF])
>  (define_mode_iterator VI8F_256 [V4DI V4DF])
> +(define_mode_iterator VI4F_256_512
> +  [V8SI V8SF
> +   (V16SI "TARGET_AVX512F") (V16SF "TARGET_AVX512F")])
>  (define_mode_iterator VI48F_256_512
>    [V8SI V8SF
>    (V16SI "TARGET_AVX512F") (V16SF "TARGET_AVX512F")
> @@ -838,10 +844,12 @@ (define_mode_attr sseintprefix
>  ;; SSE scalar suffix for vector modes
>  (define_mode_attr ssescalarmodesuffix
>    [(SF "ss") (DF "sd")
> +   (V16SF "ss") (V8DF "sd")
>     (V8SF "ss") (V4DF "sd")
>     (V4SF "ss") (V2DF "sd")
> -   (V8SI "ss") (V4DI "sd")
> -   (V4SI "d")])
> +   (V16SI "d") (V8DI "q")
> +   (V8SI "d") (V4DI "q")
> +   (V4SI "d") (V2DI "q")])
>
>  ;; Pack/unpack vector modes
>  (define_mode_attr sseunpackmode
> @@ -7092,6 +7100,26 @@ (define_insn "*vec_setv4sf_sse4_1"
>     (set_attr "prefix" "orig,orig,maybe_evex")
>     (set_attr "mode" "V4SF")])
>
> +;; All of vinsertps, vmovss, vmovd clear also the higher bits.
> +(define_insn "vec_set<mode>_0"
> +  [(set (match_operand:VI4F_256_512 0 "register_operand" "=v,v,Yi")
> +       (vec_merge:VI4F_256_512
> +         (vec_duplicate:VI4F_256_512
> +           (match_operand:<ssescalarmode> 2 "general_operand" "v,m,r"))
> +         (match_operand:VI4F_256_512 1 "const0_operand" "C,C,C")
> +         (const_int 1)))]
> +  "TARGET_AVX"
> +  "@
> +   vinsertps\t{$0xe, %2, %2, %x0|%x0, %2, %2, 0xe}
> +   vmov<ssescalarmodesuffix>\t{%x2, %x0|%x0, %2}
> +   vmovd\t{%2, %x0|%x0, %2}"
> +  [(set (attr "type")
> +     (if_then_else (eq_attr "alternative" "0")
> +                  (const_string "sselog")
> +                  (const_string "ssemov")))
> +   (set_attr "prefix" "maybe_evex")
> +   (set_attr "mode" "SF,<ssescalarmode>,SI")])
> +
>  (define_insn "sse4_1_insertps"
>    [(set (match_operand:V4SF 0 "register_operand" "=Yr,*x,v")
>         (unspec:V4SF [(match_operand:V4SF 2 "nonimmediate_operand" "Yrm,*xm,vm")
> @@ -9220,6 +9248,20 @@ (define_insn "vec_concatv2df"
>            (const_string "orig")))
>     (set_attr "mode" "V2DF,V2DF,V2DF, DF, DF, V1DF,V1DF,DF,V4SF,V2SF")])
>
> +;; vmovq clears also the higher bits.
> +(define_insn "vec_set<mode>_0"
> +  [(set (match_operand:VF2_512_256 0 "register_operand" "=v")
> +       (vec_merge:VF2_512_256
> +         (vec_duplicate:VF2_512_256
> +           (match_operand:<ssescalarmode> 2 "general_operand" "xm"))
> +         (match_operand:VF2_512_256 1 "const0_operand" "C")
> +         (const_int 1)))]
> +  "TARGET_AVX"
> +  "vmovq\t{%2, %x0|%x0, %2}"
> +  [(set_attr "type" "ssemov")
> +   (set_attr "prefix" "maybe_evex")
> +   (set_attr "mode" "DF")])
> +
>  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>  ;;
>  ;; Parallel integer down-conversion operations
> @@ -13993,6 +14035,22 @@ (define_insn "vec_concatv2di"
>            (const_string "orig")))
>     (set_attr "mode" "TI,TI,TI,TI,TI,TI,TI,TI,TI,V4SF,V2SF,V2SF")])
>
> +;; vmovq clears also the higher bits.
> +(define_insn "vec_set<mode>_0"
> +  [(set (match_operand:VI8_AVX_AVX512F 0 "register_operand" "=Yi,v")
> +       (vec_merge:VI8_AVX_AVX512F
> +         (vec_duplicate:VI8_AVX_AVX512F
> +           (match_operand:<ssescalarmode> 2 "general_operand" "r,vm"))
> +         (match_operand:VI8_AVX_AVX512F 1 "const0_operand" "C,C")
> +         (const_int 1)))]
> +  "TARGET_AVX"
> +  "vmovq\t{%2, %x0|%x0, %2}"
> +  [(set_attr "isa" "x64,*")
> +   (set_attr "type" "ssemov")
> +   (set_attr "prefix_rex" "1,*")
> +   (set_attr "prefix" "maybe_evex")
> +   (set_attr "mode" "TI")])
> +
>  (define_expand "vec_unpacks_lo_<mode>"
>    [(match_operand:<sseunpackmode> 0 "register_operand")
>     (match_operand:VI124_AVX2_24_AVX512F_1_AVX512BW 1 "register_operand")]
> @@ -17743,6 +17801,8 @@ (define_insn "avx2_vbroadcasti128_<mode>
>  ;; Modes handled by AVX vec_dup patterns.
>  (define_mode_iterator AVX_VEC_DUP_MODE
>    [V8SI V8SF V4DI V4DF])
> +(define_mode_attr vecdupssescalarmodesuffix
> +  [(V8SF "ss") (V4DF "sd") (V8SI "ss") (V4DI "sd")])
>  ;; Modes handled by AVX2 vec_dup patterns.
>  (define_mode_iterator AVX2_VEC_DUP_MODE
>    [V32QI V16QI V16HI V8HI V8SI V4SI])
> @@ -17769,7 +17829,7 @@ (define_insn "vec_dup<mode>"
>    "TARGET_AVX"
>    "@
>     v<sseintprefix>broadcast<bcstscalarsuff>\t{%1, %0|%0, %1}
> -   vbroadcast<ssescalarmodesuffix>\t{%1, %0|%0, %1}
> +   vbroadcast<vecdupssescalarmodesuffix>\t{%1, %0|%0, %1}
>     v<sseintprefix>broadcast<bcstscalarsuff>\t{%x1, %0|%0, %x1}
>     v<sseintprefix>broadcast<bcstscalarsuff>\t{%x1, %g0|%g0, %x1}
>     #"
>
>         Jakub

Patch

--- gcc/config/i386/i386.c.jj	2018-01-10 17:08:56.076912734 +0100
+++ gcc/config/i386/i386.c	2018-01-11 15:32:12.686848932 +0100
@@ -41762,6 +41762,7 @@  ix86_expand_vector_init_one_nonzero (boo
   rtx new_target;
   rtx x, tmp;
   bool use_vector_set = false;
+  rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
 
   switch (mode)
     {
@@ -41786,14 +41787,41 @@  ix86_expand_vector_init_one_nonzero (boo
       break;
     case E_V32QImode:
     case E_V16HImode:
+      use_vector_set = TARGET_AVX;
+      break;
     case E_V8SImode:
+      use_vector_set = TARGET_AVX;
+      gen_vec_set_0 = gen_vec_setv8si_0;
+      break;
     case E_V8SFmode:
+      use_vector_set = TARGET_AVX;
+      gen_vec_set_0 = gen_vec_setv8sf_0;
+      break;
     case E_V4DFmode:
       use_vector_set = TARGET_AVX;
+      gen_vec_set_0 = gen_vec_setv4df_0;
       break;
     case E_V4DImode:
       /* Use ix86_expand_vector_set in 64bit mode only.  */
       use_vector_set = TARGET_AVX && TARGET_64BIT;
+      gen_vec_set_0 = gen_vec_setv4di_0;
+      break;
+    case E_V16SImode:
+      use_vector_set = TARGET_AVX512F && one_var == 0;
+      gen_vec_set_0 = gen_vec_setv16si_0;
+      break;
+    case E_V16SFmode:
+      use_vector_set = TARGET_AVX512F && one_var == 0;
+      gen_vec_set_0 = gen_vec_setv16sf_0;
+      break;
+    case E_V8DFmode:
+      use_vector_set = TARGET_AVX512F && one_var == 0;
+      gen_vec_set_0 = gen_vec_setv8df_0;
+      break;
+    case E_V8DImode:
+      /* Use ix86_expand_vector_set in 64bit mode only.  */
+      use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
+      gen_vec_set_0 = gen_vec_setv8di_0;
       break;
     default:
       break;
@@ -41801,6 +41829,12 @@  ix86_expand_vector_init_one_nonzero (boo
 
   if (use_vector_set)
     {
+      if (gen_vec_set_0 && one_var == 0)
+	{
+	  var = force_reg (GET_MODE_INNER (mode), var);
+	  emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
+	  return true;
+	}
       emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
       var = force_reg (GET_MODE_INNER (mode), var);
       ix86_expand_vector_set (mmx_ok, target, var, one_var);
--- gcc/config/i386/sse.md.jj	2018-01-05 17:39:34.591260408 +0100
+++ gcc/config/i386/sse.md	2018-01-11 15:52:22.762139416 +0100
@@ -401,6 +401,9 @@  (define_mode_iterator VI8_AVX2
 (define_mode_iterator VI8_AVX2_AVX512F
   [(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX2") V2DI])
 
+(define_mode_iterator VI8_AVX_AVX512F
+  [(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX")])
+
 (define_mode_iterator VI4_128_8_256
   [V4SI V4DI])
 
@@ -622,6 +625,9 @@  (define_mode_iterator VI4F_128 [V4SI V4S
 (define_mode_iterator VI8F_128 [V2DI V2DF])
 (define_mode_iterator VI4F_256 [V8SI V8SF])
 (define_mode_iterator VI8F_256 [V4DI V4DF])
+(define_mode_iterator VI4F_256_512
+  [V8SI V8SF
+   (V16SI "TARGET_AVX512F") (V16SF "TARGET_AVX512F")])
 (define_mode_iterator VI48F_256_512
   [V8SI V8SF
   (V16SI "TARGET_AVX512F") (V16SF "TARGET_AVX512F")
@@ -838,10 +844,12 @@  (define_mode_attr sseintprefix
 ;; SSE scalar suffix for vector modes
 (define_mode_attr ssescalarmodesuffix
   [(SF "ss") (DF "sd")
+   (V16SF "ss") (V8DF "sd")
    (V8SF "ss") (V4DF "sd")
    (V4SF "ss") (V2DF "sd")
-   (V8SI "ss") (V4DI "sd")
-   (V4SI "d")])
+   (V16SI "d") (V8DI "q")
+   (V8SI "d") (V4DI "q")
+   (V4SI "d") (V2DI "q")])
 
 ;; Pack/unpack vector modes
 (define_mode_attr sseunpackmode
@@ -7092,6 +7100,26 @@  (define_insn "*vec_setv4sf_sse4_1"
    (set_attr "prefix" "orig,orig,maybe_evex")
    (set_attr "mode" "V4SF")])
 
+;; All of vinsertps, vmovss, vmovd clear also the higher bits.
+(define_insn "vec_set<mode>_0"
+  [(set (match_operand:VI4F_256_512 0 "register_operand" "=v,v,Yi")
+	(vec_merge:VI4F_256_512
+	  (vec_duplicate:VI4F_256_512
+	    (match_operand:<ssescalarmode> 2 "general_operand" "v,m,r"))
+	  (match_operand:VI4F_256_512 1 "const0_operand" "C,C,C")
+	  (const_int 1)))]
+  "TARGET_AVX"
+  "@
+   vinsertps\t{$0xe, %2, %2, %x0|%x0, %2, %2, 0xe}
+   vmov<ssescalarmodesuffix>\t{%x2, %x0|%x0, %2}
+   vmovd\t{%2, %x0|%x0, %2}"
+  [(set (attr "type")
+     (if_then_else (eq_attr "alternative" "0")
+		   (const_string "sselog")
+		   (const_string "ssemov")))
+   (set_attr "prefix" "maybe_evex")
+   (set_attr "mode" "SF,<ssescalarmode>,SI")])
+
 (define_insn "sse4_1_insertps"
   [(set (match_operand:V4SF 0 "register_operand" "=Yr,*x,v")
 	(unspec:V4SF [(match_operand:V4SF 2 "nonimmediate_operand" "Yrm,*xm,vm")
@@ -9220,6 +9248,20 @@  (define_insn "vec_concatv2df"
 	   (const_string "orig")))
    (set_attr "mode" "V2DF,V2DF,V2DF, DF, DF, V1DF,V1DF,DF,V4SF,V2SF")])
 
+;; vmovq clears also the higher bits.
+(define_insn "vec_set<mode>_0"
+  [(set (match_operand:VF2_512_256 0 "register_operand" "=v")
+	(vec_merge:VF2_512_256
+	  (vec_duplicate:VF2_512_256
+	    (match_operand:<ssescalarmode> 2 "general_operand" "xm"))
+	  (match_operand:VF2_512_256 1 "const0_operand" "C")
+	  (const_int 1)))]
+  "TARGET_AVX"
+  "vmovq\t{%2, %x0|%x0, %2}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_evex")
+   (set_attr "mode" "DF")])
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Parallel integer down-conversion operations
@@ -13993,6 +14035,22 @@  (define_insn "vec_concatv2di"
 	   (const_string "orig")))
    (set_attr "mode" "TI,TI,TI,TI,TI,TI,TI,TI,TI,V4SF,V2SF,V2SF")])
 
+;; vmovq clears also the higher bits.
+(define_insn "vec_set<mode>_0"
+  [(set (match_operand:VI8_AVX_AVX512F 0 "register_operand" "=Yi,v")
+	(vec_merge:VI8_AVX_AVX512F
+	  (vec_duplicate:VI8_AVX_AVX512F
+	    (match_operand:<ssescalarmode> 2 "general_operand" "r,vm"))
+	  (match_operand:VI8_AVX_AVX512F 1 "const0_operand" "C,C")
+	  (const_int 1)))]
+  "TARGET_AVX"
+  "vmovq\t{%2, %x0|%x0, %2}"
+  [(set_attr "isa" "x64,*")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix_rex" "1,*")
+   (set_attr "prefix" "maybe_evex")
+   (set_attr "mode" "TI")])
+
 (define_expand "vec_unpacks_lo_<mode>"
   [(match_operand:<sseunpackmode> 0 "register_operand")
    (match_operand:VI124_AVX2_24_AVX512F_1_AVX512BW 1 "register_operand")]
@@ -17743,6 +17801,8 @@  (define_insn "avx2_vbroadcasti128_<mode>
 ;; Modes handled by AVX vec_dup patterns.
 (define_mode_iterator AVX_VEC_DUP_MODE
   [V8SI V8SF V4DI V4DF])
+(define_mode_attr vecdupssescalarmodesuffix
+  [(V8SF "ss") (V4DF "sd") (V8SI "ss") (V4DI "sd")])
 ;; Modes handled by AVX2 vec_dup patterns.
 (define_mode_iterator AVX2_VEC_DUP_MODE
   [V32QI V16QI V16HI V8HI V8SI V4SI])
@@ -17769,7 +17829,7 @@  (define_insn "vec_dup<mode>"
   "TARGET_AVX"
   "@
    v<sseintprefix>broadcast<bcstscalarsuff>\t{%1, %0|%0, %1}
-   vbroadcast<ssescalarmodesuffix>\t{%1, %0|%0, %1}
+   vbroadcast<vecdupssescalarmodesuffix>\t{%1, %0|%0, %1}
    v<sseintprefix>broadcast<bcstscalarsuff>\t{%x1, %0|%0, %x1}
    v<sseintprefix>broadcast<bcstscalarsuff>\t{%x1, %g0|%g0, %x1}
    #"