Patchwork [i386,3/8,AVX512,1/n] Add AVX-512 patterns: VF iterator extended.

login
register
mail settings
Submitter Kirill Yukhin
Date Aug. 27, 2013, 6:37 p.m.
Message ID <20130827183717.GB42618@msticlxl57.ims.intel.com>
Download mbox | patch
Permalink /patch/270193/
State New
Headers show

Comments

Kirill Yukhin - Aug. 27, 2013, 6:37 p.m.
Hello,

> This patch is still far too large.
> 
> I think you should split it up based on every single mode iterator that
> you need to add or change.

The problem is that some iterators depend on each other, so the patches are
not going to be tiny.

Here is the first one. It extends the VF iterator, which I believe has the biggest impact.

Is it Ok?

Testing:
  1. Bootstrap passes.
  2. make check shows no regressions.
  3. Spec 2000 & 2006 builds show no regressions both with and without the -mavx512f option.
  4. Spec 2000 & 2006 runs show no stability regressions without the -mavx512f option.


--
Thanks, K

PS. If it is, I am going to strip out the ChangeLog lines from the big patch.

---
 gcc/config/i386/i386.c  |  62 +++++++++--
 gcc/config/i386/i386.md |   1 +
 gcc/config/i386/sse.md  | 283 +++++++++++++++++++++++++++++++-----------------
 3 files changed, 241 insertions(+), 105 deletions(-)
Kirill Yukhin - Sept. 6, 2013, 1:41 p.m.
Hello,

PING.

--
Thanks, K
Kirill Yukhin - Sept. 9, 2013, 11:11 a.m.
Hello,
On 06 Sep 17:41, Kirill Yukhin wrote:
> Hello,
> 
> PING.
PING.

--
Thanks, K
Kirill Yukhin - Sept. 13, 2013, 10:28 a.m.
Hello,
On 09 Sep 15:11, Kirill Yukhin wrote:
> Hello,
> On 06 Sep 17:41, Kirill Yukhin wrote:
> > Hello,
> > 
> > PING.
> PING.
PING.

--
Thanks, K
Kirill Yukhin - Sept. 18, 2013, 7:17 a.m.
Hello,
On 13 Sep 14:28, Kirill Yukhin wrote:
> Hello,
> On 09 Sep 15:11, Kirill Yukhin wrote:
> > Hello,
> > On 06 Sep 17:41, Kirill Yukhin wrote:
> > > Hello,
> > > 
> > > PING.
> > PING.
> PING.
PING

--
Thanks, K
Kirill Yukhin - Sept. 24, 2013, 9:02 a.m.
Hello,
On 18 Sep 11:17, Kirill Yukhin wrote:
> Hello,
> On 13 Sep 14:28, Kirill Yukhin wrote:
> > Hello,
> > On 09 Sep 15:11, Kirill Yukhin wrote:
> > > Hello,
> > > On 06 Sep 17:41, Kirill Yukhin wrote:
> > > > Hello,
> > > > 
> > > > PING.
> > > PING.
> > PING.
> PING
PING.

--
Thanks, K
Richard Henderson - Sept. 24, 2013, 5:04 p.m.
On 08/27/2013 11:37 AM, Kirill Yukhin wrote:
> Hello,
> 
>> This patch is still far too large.
>>
>> I think you should split it up based on every single mode iterator that
>> you need to add or change.
> 
> Problem is that some iterators are depend on each other, so patches are
> not going to be tiny.
> 
> Here is 1st one. It extends VF iterator - biggest impact I believe
> 
> Is it Ok?
> 
> Testing:
>   1. Bootstrap pass.
>   2. make check shows no regressions.
>   3. Spec 2000 & 2006 build show no regressions both with and without -mavx512f option.
>   4. Spec 2000 & 2006 run shows no stability regressions without -mavx512f option.


Ok.


r~
Ilya Verbin - Sept. 25, 2013, 6:04 p.m.
On 24 Sep 10:04, Richard Henderson wrote:
> On 08/27/2013 11:37 AM, Kirill Yukhin wrote:
> > Hello,
> > 
> >> This patch is still far too large.
> >>
> >> I think you should split it up based on every single mode iterator that
> >> you need to add or change.
> > 
> > Problem is that some iterators are depend on each other, so patches are
> > not going to be tiny.
> > 
> > Here is 1st one. It extends VF iterator - biggest impact I believe
> > 
> > Is it Ok?
> > 
> > Testing:
> >   1. Bootstrap pass.
> >   2. make check shows no regressions.
> >   3. Spec 2000 & 2006 build show no regressions both with and without -mavx512f option.
> >   4. Spec 2000 & 2006 run shows no stability regressions without -mavx512f option.
> 
> 
> Ok.
> 
> 
> r~

Checked into main trunk by Kirill Yukhin:
http://gcc.gnu.org/ml/gcc-cvs/2013-09/msg00779.html

  -- Ilya

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 8325919..5f50533 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -16538,8 +16538,8 @@  ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
       gcc_unreachable ();
     case V32QImode:
       extract = gen_avx_vextractf128v32qi;
-      load_unaligned = gen_avx_loaddqu256;
-      store_unaligned = gen_avx_storedqu256;
+      load_unaligned = gen_avx_loaddquv32qi;
+      store_unaligned = gen_avx_storedquv32qi;
       mode = V16QImode;
       break;
     case V8SFmode:
@@ -16642,10 +16642,56 @@  void
 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
 {
   rtx op0, op1, m;
+  rtx (*load_unaligned) (rtx, rtx);
+  rtx (*store_unaligned) (rtx, rtx);
 
   op0 = operands[0];
   op1 = operands[1];
 
+  if (GET_MODE_SIZE (mode) == 64)
+    {
+      switch (GET_MODE_CLASS (mode))
+	{
+	case MODE_VECTOR_INT:
+	case MODE_INT:
+	  op0 = gen_lowpart (V16SImode, op0);
+	  op1 = gen_lowpart (V16SImode, op1);
+	  /* FALLTHRU */
+
+	case MODE_VECTOR_FLOAT:
+	  switch (GET_MODE (op0))
+	    {
+	    default:
+	      gcc_unreachable ();
+	    case V16SImode:
+	      load_unaligned = gen_avx512f_loaddquv16si;
+	      store_unaligned = gen_avx512f_storedquv16si;
+	      break;
+	    case V16SFmode:
+	      load_unaligned = gen_avx512f_loadups512;
+	      store_unaligned = gen_avx512f_storeups512;
+	      break;
+	    case V8DFmode:
+	      load_unaligned = gen_avx512f_loadupd512;
+	      store_unaligned = gen_avx512f_storeupd512;
+	      break;
+	    }
+
+	  if (MEM_P (op1))
+	    emit_insn (load_unaligned (op0, op1));
+	  else if (MEM_P (op0))
+	    emit_insn (store_unaligned (op0, op1));
+	  else
+	    gcc_unreachable ();
+	  break;
+
+	default:
+	  gcc_unreachable ();
+	}
+
+      return;
+    }
+
   if (TARGET_AVX
       && GET_MODE_SIZE (mode) == 32)
     {
@@ -16678,7 +16724,7 @@  ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
 	  op0 = gen_lowpart (V16QImode, op0);
 	  op1 = gen_lowpart (V16QImode, op1);
 	  /* We will eventually emit movups based on insn attributes.  */
-	  emit_insn (gen_sse2_loaddqu (op0, op1));
+	  emit_insn (gen_sse2_loaddquv16qi (op0, op1));
 	}
       else if (TARGET_SSE2 && mode == V2DFmode)
         {
@@ -16753,7 +16799,7 @@  ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
 	  op0 = gen_lowpart (V16QImode, op0);
 	  op1 = gen_lowpart (V16QImode, op1);
 	  /* We will eventually emit movups based on insn attributes.  */
-	  emit_insn (gen_sse2_storedqu (op0, op1));
+	  emit_insn (gen_sse2_storedquv16qi (op0, op1));
 	}
       else if (TARGET_SSE2 && mode == V2DFmode)
 	{
@@ -27473,13 +27519,13 @@  static const struct builtin_description bdesc_special_args[] =
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedquv16qi, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
   { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddquv16qi, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
 
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
@@ -27508,8 +27554,8 @@  static const struct builtin_description bdesc_special_args[] =
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddquv32qi, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedquv32qi, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
 
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 01c85d8..e458932 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -778,6 +778,7 @@ 
 ;; Mapping of logic operators
 (define_code_iterator any_logic [and ior xor])
 (define_code_iterator any_or [ior xor])
+(define_code_iterator fpint_logic [and xor])
 
 ;; Base name for insn mnemonic.
 (define_code_attr logic [(and "and") (ior "or") (xor "xor")])
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 9d9469e..10637cc 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -97,13 +97,13 @@ 
 
 ;; All vector modes including V?TImode, used in move patterns.
 (define_mode_iterator VMOVE
-  [(V32QI "TARGET_AVX") V16QI
-   (V16HI "TARGET_AVX") V8HI
-   (V8SI "TARGET_AVX") V4SI
-   (V4DI "TARGET_AVX") V2DI
+  [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI
+   (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI
+   (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI
+   (V8DI "TARGET_AVX512F")  (V4DI "TARGET_AVX") V2DI
    (V2TI "TARGET_AVX") V1TI
-   (V8SF "TARGET_AVX") V4SF
-   (V4DF "TARGET_AVX") V2DF])
+   (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
+   (V8DF "TARGET_AVX512F")  (V4DF "TARGET_AVX") V2DF])
 
 ;; All vector modes
 (define_mode_iterator V
@@ -124,6 +124,11 @@ 
 
 ;; All vector float modes
 (define_mode_iterator VF
+  [(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
+   (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
+
+;; 128- and 256-bit float vector modes
+(define_mode_iterator VF_128_256
   [(V8SF "TARGET_AVX") V4SF
    (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
 
@@ -143,6 +148,10 @@ 
 (define_mode_iterator VF_256
   [V8SF V4DF])
 
+;; All 512bit vector float modes
+(define_mode_iterator VF_512
+  [V16SF V8DF])
+
 ;; All vector integer modes
 (define_mode_iterator VI
   [(V32QI "TARGET_AVX") V16QI
@@ -160,6 +169,10 @@ 
 (define_mode_iterator VI1
   [(V32QI "TARGET_AVX") V16QI])
 
+(define_mode_iterator VI_UNALIGNED_LOADSTORE
+  [(V32QI "TARGET_AVX") V16QI
+   (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")])
+
 ;; All DImode vector integer modes
 (define_mode_iterator VI8
   [(V4DI "TARGET_AVX") V2DI])
@@ -212,11 +225,18 @@ 
    (V4SI "TARGET_AVX2") (V2DI "TARGET_AVX2")
    (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")])
 
+(define_mode_attr sse2_avx_avx512f
+  [(V16QI "sse2") (V32QI "avx") (V64QI "avx512f")
+   (V4SI  "sse2") (V8SI  "avx") (V16SI "avx512f")
+   (V8DI "avx512f")
+   (V16SF "avx512f") (V8SF "avx") (V4SF "avx")
+   (V8DF "avx512f") (V4DF "avx") (V2DF "avx")])
+
 (define_mode_attr sse2_avx2
   [(V16QI "sse2") (V32QI "avx2")
    (V8HI "sse2") (V16HI "avx2")
-   (V4SI "sse2") (V8SI "avx2")
-   (V2DI "sse2") (V4DI "avx2")
+   (V4SI "sse2") (V8SI "avx2") (V16SI "avx512f")
+   (V2DI "sse2") (V4DI "avx2") (V8DI "avx512f")
    (V1TI "sse2") (V2TI "avx2")])
 
 (define_mode_attr ssse3_avx2
@@ -229,7 +249,7 @@ 
 (define_mode_attr sse4_1_avx2
    [(V16QI "sse4_1") (V32QI "avx2")
     (V8HI "sse4_1") (V16HI "avx2")
-    (V4SI "sse4_1") (V8SI "avx2")
+    (V4SI "sse4_1") (V8SI "avx2") (V16SI "avx512f")
     (V2DI "sse4_1") (V4DI "avx2")])
 
 (define_mode_attr avx_avx2
@@ -244,6 +264,12 @@ 
    (V4SI "vec") (V8SI "avx2")
    (V2DI "vec") (V4DI "avx2")])
 
+(define_mode_attr avx2_avx512f
+  [(V4SI "avx2") (V8SI "avx2") (V16SI "avx512f")
+   (V2DI "avx2") (V4DI "avx2") (V8DI "avx512f")
+   (V8SF "avx2") (V16SF "avx512f")
+   (V4DF "avx2") (V8DF "avx512f")])
+
 (define_mode_attr shuffletype
   [(V16SF "f") (V16SI "i") (V8DF "f") (V8DI "i")
   (V8SF "f") (V8SI "i") (V4DF "f") (V4DI "i")
@@ -287,22 +313,26 @@ 
 (define_mode_attr sse
   [(SF "sse") (DF "sse2")
    (V4SF "sse") (V2DF "sse2")
-   (V8SF "avx") (V4DF "avx")])
+   (V16SF "avx512f") (V8SF "avx")
+   (V8DF "avx512f") (V4DF "avx")])
 
 (define_mode_attr sse2
-  [(V16QI "sse2") (V32QI "avx")
-   (V2DI "sse2") (V4DI "avx")])
+  [(V16QI "sse2") (V32QI "avx") (V64QI "avx512f")
+   (V2DI "sse2") (V4DI "avx") (V8DI "avx512f")])
 
 (define_mode_attr sse3
   [(V16QI "sse3") (V32QI "avx")])
 
 (define_mode_attr sse4_1
   [(V4SF "sse4_1") (V2DF "sse4_1")
-   (V8SF "avx") (V4DF "avx")])
+   (V8SF "avx") (V4DF "avx")
+   (V8DF "avx512f")])
 
 (define_mode_attr avxsizesuffix
-  [(V32QI "256") (V16HI "256") (V8SI "256") (V4DI "256")
+  [(V64QI "512") (V32HI "512") (V16SI "512") (V8DI "512")
+   (V32QI "256") (V16HI "256") (V8SI "256") (V4DI "256")
    (V16QI "") (V8HI "") (V4SI "") (V2DI "")
+   (V16SF "512") (V8DF "512")
    (V8SF "256") (V4DF "256")
    (V4SF "") (V2DF "")])
 
@@ -318,11 +348,13 @@ 
 
 ;; Mapping of vector float modes to an integer mode of the same size
 (define_mode_attr sseintvecmode
-  [(V8SF "V8SI") (V4DF "V4DI")
-   (V4SF "V4SI") (V2DF "V2DI")
-   (V8SI "V8SI") (V4DI "V4DI")
-   (V4SI "V4SI") (V2DI "V2DI")
-   (V16HI "V16HI") (V8HI "V8HI")
+  [(V16SF "V16SI") (V8DF  "V8DI")
+   (V8SF  "V8SI")  (V4DF  "V4DI")
+   (V4SF  "V4SI")  (V2DF  "V2DI")
+   (V16SI "V16SI") (V8DI  "V8DI")
+   (V8SI  "V8SI")  (V4DI  "V4DI")
+   (V4SI  "V4SI")  (V2DI  "V2DI")
+   (V16HI "V16HI") (V8HI  "V8HI")
    (V32QI "V32QI") (V16QI "V16QI")])
 
 (define_mode_attr sseintvecmodelower
@@ -349,8 +381,10 @@ 
 
 ;; Mapping of vector modes ti packed single mode of the same size
 (define_mode_attr ssePSmode
-  [(V32QI "V8SF") (V16QI "V4SF")
-   (V16HI "V8SF") (V8HI "V4SF")
+  [(V16SI "V16SF") (V8DF "V16SF")
+   (V16SF "V16SF") (V8DI "V16SF")
+   (V64QI "V16SF") (V32QI "V8SF") (V16QI "V4SF")
+   (V32HI "V16SF") (V16HI "V8SF") (V8HI "V4SF")
    (V8SI "V8SF") (V4SI "V4SF")
    (V4DI "V8SF") (V2DI "V4SF")
    (V2TI "V8SF") (V1TI "V4SF")
@@ -665,12 +699,13 @@ 
 (define_insn "<sse>_loadu<ssemodesuffix><avxsizesuffix>"
   [(set (match_operand:VF 0 "register_operand" "=v")
 	(unspec:VF
-	  [(match_operand:VF 1 "memory_operand" "m")]
+	  [(match_operand:VF 1 "nonimmediate_operand" "vm")]
 	  UNSPEC_LOADU))]
   "TARGET_SSE"
 {
   switch (get_attr_mode (insn))
     {
+    case MODE_V16SF:
     case MODE_V8SF:
     case MODE_V4SF:
       return "%vmovups\t{%1, %0|%0, %1}";
@@ -694,12 +729,13 @@ 
 (define_insn "<sse>_storeu<ssemodesuffix><avxsizesuffix>"
   [(set (match_operand:VF 0 "memory_operand" "=m")
 	(unspec:VF
-	  [(match_operand:VF 1 "register_operand" "x")]
+	  [(match_operand:VF 1 "register_operand" "v")]
 	  UNSPEC_STOREU))]
   "TARGET_SSE"
 {
   switch (get_attr_mode (insn))
     {
+    case MODE_V16SF:
     case MODE_V8SF:
     case MODE_V4SF:
       return "%vmovups\t{%1, %0|%0, %1}";
@@ -721,10 +757,11 @@ 
 	      ]
 	      (const_string "<MODE>")))])
 
-(define_insn "<sse2>_loaddqu<avxsizesuffix>"
-  [(set (match_operand:VI1 0 "register_operand" "=v")
-	(unspec:VI1 [(match_operand:VI1 1 "memory_operand" "m")]
-		    UNSPEC_LOADU))]
+(define_insn "<sse2_avx_avx512f>_loaddqu<mode>"
+  [(set (match_operand:VI_UNALIGNED_LOADSTORE 0 "register_operand" "=v")
+	(unspec:VI_UNALIGNED_LOADSTORE
+	  [(match_operand:VI_UNALIGNED_LOADSTORE 1 "nonimmediate_operand" "vm")]
+	  UNSPEC_LOADU))]
   "TARGET_SSE2"
 {
   switch (get_attr_mode (insn))
@@ -732,6 +769,11 @@ 
     case MODE_V8SF:
     case MODE_V4SF:
       return "%vmovups\t{%1, %0|%0, %1}";
+    case MODE_XI:
+      if (<MODE>mode == V8DImode)
+	return "vmovdqu64\t{%1, %0|%0, %1}";
+      else
+	return "vmovdqu32\t{%1, %0|%0, %1}";
     default:
       return "%vmovdqu\t{%1, %0|%0, %1}";
     }
@@ -754,10 +796,11 @@ 
 	      ]
 	      (const_string "<sseinsnmode>")))])
 
-(define_insn "<sse2>_storedqu<avxsizesuffix>"
-  [(set (match_operand:VI1 0 "memory_operand" "=m")
-	(unspec:VI1 [(match_operand:VI1 1 "register_operand" "v")]
-		    UNSPEC_STOREU))]
+(define_insn "<sse2_avx_avx512f>_storedqu<mode>"
+  [(set (match_operand:VI_UNALIGNED_LOADSTORE 0 "memory_operand" "=m")
+	(unspec:VI_UNALIGNED_LOADSTORE
+	  [(match_operand:VI_UNALIGNED_LOADSTORE 1 "register_operand" "v")]
+	  UNSPEC_STOREU))]
   "TARGET_SSE2"
 {
   switch (get_attr_mode (insn))
@@ -765,6 +808,11 @@ 
     case MODE_V8SF:
     case MODE_V4SF:
       return "%vmovups\t{%1, %0|%0, %1}";
+    case MODE_XI:
+      if (<MODE>mode == V8DImode)
+	return "vmovdqu64\t{%1, %0|%0, %1}";
+      else
+	return "vmovdqu32\t{%1, %0|%0, %1}";
     default:
       return "%vmovdqu\t{%1, %0|%0, %1}";
     }
@@ -821,8 +869,9 @@ 
 
 (define_insn "<sse>_movnt<mode>"
   [(set (match_operand:VF 0 "memory_operand" "=m")
-	(unspec:VF [(match_operand:VF 1 "register_operand" "x")]
-		   UNSPEC_MOVNT))]
+	(unspec:VF
+	  [(match_operand:VF 1 "register_operand" "v")]
+	  UNSPEC_MOVNT))]
   "TARGET_SSE"
   "%vmovnt<ssemodesuffix>\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssemov")
@@ -852,9 +901,9 @@ 
 (define_mode_iterator STORENT_MODE
   [(DI "TARGET_SSE2 && TARGET_64BIT") (SI "TARGET_SSE2")
    (SF "TARGET_SSE4A") (DF "TARGET_SSE4A")
-   (V4DI "TARGET_AVX") (V2DI "TARGET_SSE2")
-   (V8SF "TARGET_AVX") V4SF
-   (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
+   (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") (V2DI "TARGET_SSE2")
+   (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
+   (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
 
 (define_expand "storent<mode>"
   [(set (match_operand:STORENT_MODE 0 "memory_operand")
@@ -877,10 +926,10 @@ 
   "ix86_expand_fp_absneg_operator (<CODE>, <MODE>mode, operands); DONE;")
 
 (define_insn_and_split "*absneg<mode>2"
-  [(set (match_operand:VF 0 "register_operand" "=x,x,x,x")
+  [(set (match_operand:VF 0 "register_operand" "=x,x,v,v")
 	(match_operator:VF 3 "absneg_operator"
-	  [(match_operand:VF 1 "nonimmediate_operand" "0, xm,x, m")]))
-   (use (match_operand:VF 2 "nonimmediate_operand"    "xm,0, xm,x"))]
+	  [(match_operand:VF 1 "nonimmediate_operand" "0, xm, v, m")]))
+   (use (match_operand:VF 2 "nonimmediate_operand"    "xm, 0, vm,v"))]
   "TARGET_SSE"
   "#"
   "&& reload_completed"
@@ -962,10 +1011,10 @@ 
   "ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);")
 
 (define_insn "*mul<mode>3"
-  [(set (match_operand:VF 0 "register_operand" "=x,x")
+  [(set (match_operand:VF 0 "register_operand" "=x,v")
 	(mult:VF
-	  (match_operand:VF 1 "nonimmediate_operand" "%0,x")
-	  (match_operand:VF 2 "nonimmediate_operand" "xm,xm")))]
+	  (match_operand:VF 1 "nonimmediate_operand" "%0,v")
+	  (match_operand:VF 2 "nonimmediate_operand" "xm,vm")))]
   "TARGET_SSE && ix86_binary_operator_ok (MULT, <MODE>mode, operands)"
   "@
    mul<ssemodesuffix>\t{%2, %0|%0, %2}
@@ -1239,10 +1288,10 @@ 
 ;; presence of -0.0 and NaN.
 
 (define_insn "*ieee_smin<mode>3"
-  [(set (match_operand:VF 0 "register_operand" "=x,x")
+  [(set (match_operand:VF 0 "register_operand" "=v,v")
 	(unspec:VF
-	  [(match_operand:VF 1 "register_operand" "0,x")
-	   (match_operand:VF 2 "nonimmediate_operand" "xm,xm")]
+	  [(match_operand:VF 1 "register_operand" "0,v")
+	   (match_operand:VF 2 "nonimmediate_operand" "vm,vm")]
 	 UNSPEC_IEEE_MIN))]
   "TARGET_SSE"
   "@
@@ -1254,10 +1303,10 @@ 
    (set_attr "mode" "<MODE>")])
 
 (define_insn "*ieee_smax<mode>3"
-  [(set (match_operand:VF 0 "register_operand" "=x,x")
+  [(set (match_operand:VF 0 "register_operand" "=v,v")
 	(unspec:VF
-	  [(match_operand:VF 1 "register_operand" "0,x")
-	   (match_operand:VF 2 "nonimmediate_operand" "xm,xm")]
+	  [(match_operand:VF 1 "register_operand" "0,v")
+	   (match_operand:VF 2 "nonimmediate_operand" "vm,vm")]
 	 UNSPEC_IEEE_MAX))]
   "TARGET_SSE"
   "@
@@ -1632,10 +1681,10 @@ 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (define_insn "avx_cmp<mode>3"
-  [(set (match_operand:VF 0 "register_operand" "=x")
-	(unspec:VF
-	  [(match_operand:VF 1 "register_operand" "x")
-	   (match_operand:VF 2 "nonimmediate_operand" "xm")
+  [(set (match_operand:VF_128_256 0 "register_operand" "=x")
+	(unspec:VF_128_256
+	  [(match_operand:VF_128_256 1 "register_operand" "x")
+	   (match_operand:VF_128_256 2 "nonimmediate_operand" "xm")
 	   (match_operand:SI 3 "const_0_to_31_operand" "n")]
 	  UNSPEC_PCMP))]
   "TARGET_AVX"
@@ -1663,10 +1712,10 @@ 
    (set_attr "mode" "<ssescalarmode>")])
 
 (define_insn "*<sse>_maskcmp<mode>3_comm"
-  [(set (match_operand:VF 0 "register_operand" "=x,x")
-	(match_operator:VF 3 "sse_comparison_operator"
-	  [(match_operand:VF 1 "register_operand" "%0,x")
-	   (match_operand:VF 2 "nonimmediate_operand" "xm,xm")]))]
+  [(set (match_operand:VF_128_256 0 "register_operand" "=x,x")
+	(match_operator:VF_128_256 3 "sse_comparison_operator"
+	  [(match_operand:VF_128_256 1 "register_operand" "%0,x")
+	   (match_operand:VF_128_256 2 "nonimmediate_operand" "xm,xm")]))]
   "TARGET_SSE
    && GET_RTX_CLASS (GET_CODE (operands[3])) == RTX_COMM_COMPARE"
   "@
@@ -1679,10 +1728,10 @@ 
    (set_attr "mode" "<MODE>")])
 
 (define_insn "<sse>_maskcmp<mode>3"
-  [(set (match_operand:VF 0 "register_operand" "=x,x")
-	(match_operator:VF 3 "sse_comparison_operator"
-	  [(match_operand:VF 1 "register_operand" "0,x")
-	   (match_operand:VF 2 "nonimmediate_operand" "xm,xm")]))]
+  [(set (match_operand:VF_128_256 0 "register_operand" "=x,x")
+	(match_operator:VF_128_256 3 "sse_comparison_operator"
+	  [(match_operand:VF_128_256 1 "register_operand" "0,x")
+	   (match_operand:VF_128_256 2 "nonimmediate_operand" "xm,xm")]))]
   "TARGET_SSE"
   "@
    cmp%D3<ssemodesuffix>\t{%2, %0|%0, %2}
@@ -1792,11 +1841,11 @@ 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (define_insn "<sse>_andnot<mode>3"
-  [(set (match_operand:VF 0 "register_operand" "=x,x")
+  [(set (match_operand:VF 0 "register_operand" "=x,v")
 	(and:VF
 	  (not:VF
-	    (match_operand:VF 1 "register_operand" "0,x"))
-	  (match_operand:VF 2 "nonimmediate_operand" "xm,xm")))]
+	    (match_operand:VF 1 "register_operand" "0,v"))
+	  (match_operand:VF 2 "nonimmediate_operand" "xm,vm")))]
   "TARGET_SSE"
 {
   static char buf[32];
@@ -1825,12 +1874,19 @@ 
       gcc_unreachable ();
     }
 
+  /* There is no vandnp[sd].  Use vpandnq.  */
+  if (GET_MODE_SIZE (<MODE>mode) == 64)
+    {
+      suffix = "q";
+      ops = "vpandn%s\t{%%2, %%1, %%0|%%0, %%1, %%2}";
+    }
+
   snprintf (buf, sizeof (buf), ops, suffix);
   return buf;
 }
   [(set_attr "isa" "noavx,avx")
    (set_attr "type" "sselog")
-   (set_attr "prefix" "orig,vex")
+   (set_attr "prefix" "orig,maybe_evex")
    (set (attr "mode")
 	(cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
 		 (const_string "<ssePSmode>")
@@ -1842,13 +1898,21 @@ 
 	       (const_string "<MODE>")))])
 
 (define_expand "<code><mode>3"
-  [(set (match_operand:VF 0 "register_operand")
-	(any_logic:VF
-	  (match_operand:VF 1 "nonimmediate_operand")
-	  (match_operand:VF 2 "nonimmediate_operand")))]
+  [(set (match_operand:VF_128_256 0 "register_operand")
+	(any_logic:VF_128_256
+	  (match_operand:VF_128_256 1 "nonimmediate_operand")
+	  (match_operand:VF_128_256 2 "nonimmediate_operand")))]
   "TARGET_SSE"
   "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
 
+(define_expand "<code><mode>3"
+  [(set (match_operand:VF_512 0 "register_operand")
+       (fpint_logic:VF_512
+         (match_operand:VF_512 1 "nonimmediate_operand")
+         (match_operand:VF_512 2 "nonimmediate_operand")))]
+  "TARGET_AVX512F"
+  "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
+
 (define_insn "*<code><mode>3"
   [(set (match_operand:VF 0 "register_operand" "=x,v")
 	(any_logic:VF
@@ -1882,12 +1946,19 @@ 
       gcc_unreachable ();
     }
 
+  /* There is no v<logic>p[sd].  Use vp<logic>q.  */
+  if (GET_MODE_SIZE (<MODE>mode) == 64)
+    {
+      suffix = "q";
+      ops = "vp<logic>%s\t{%%2, %%1, %%0|%%0, %%1, %%2}";
+    }
+
   snprintf (buf, sizeof (buf), ops, suffix);
   return buf;
 }
   [(set_attr "isa" "noavx,avx")
    (set_attr "type" "sselog")
-   (set_attr "prefix" "orig,vex")
+   (set_attr "prefix" "orig,maybe_evex")
    (set (attr "mode")
 	(cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
 		 (const_string "<ssePSmode>")
@@ -2105,6 +2176,23 @@ 
 	       ]
 	       (const_string "TI")))])
 
+;; There is no floating-point xor for V16SF and V8DF in AVX512F,
+;; but we need one for negation.  Instead we use the integer versions
+;; of xor.  Maybe there could be a better way to do that.
+
+(define_mode_attr avx512flogicsuff
+  [(V16SF "d") (V8DF "q")])
+
+(define_insn "avx512f_<logic><mode>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v")
+	(fpint_logic:VF_512
+	  (match_operand:VF_512 1 "register_operand" "v")
+	  (match_operand:VF_512 2 "nonimmediate_operand" "vm")))]
+  "TARGET_AVX512F"
+  "vp<logic><avx512flogicsuff>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "evex")])
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; FMA floating point multiply/accumulate instructions.  These include
@@ -7747,7 +7835,7 @@ 
 (define_insn "<sse>_movmsk<ssemodesuffix><avxsizesuffix>"
   [(set (match_operand:SI 0 "register_operand" "=r")
 	(unspec:SI
-	  [(match_operand:VF 1 "register_operand" "x")]
+	  [(match_operand:VF_128_256 1 "register_operand" "x")]
 	  UNSPEC_MOVMSK))]
   "TARGET_SSE"
   "%vmovmsk<ssemodesuffix>\t{%1, %0|%0, %1}"
@@ -8537,10 +8625,10 @@ 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (define_insn "<sse4_1>_blend<ssemodesuffix><avxsizesuffix>"
-  [(set (match_operand:VF 0 "register_operand" "=x,x")
-	(vec_merge:VF
-	  (match_operand:VF 2 "nonimmediate_operand" "xm,xm")
-	  (match_operand:VF 1 "register_operand" "0,x")
+  [(set (match_operand:VF_128_256 0 "register_operand" "=x,x")
+	(vec_merge:VF_128_256
+	  (match_operand:VF_128_256 2 "nonimmediate_operand" "xm,xm")
+	  (match_operand:VF_128_256 1 "register_operand" "0,x")
 	  (match_operand:SI 3 "const_0_to_<blendbits>_operand")))]
   "TARGET_SSE4_1"
   "@
@@ -8555,11 +8643,11 @@ 
    (set_attr "mode" "<MODE>")])
 
 (define_insn "<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>"
-  [(set (match_operand:VF 0 "register_operand" "=x,x")
-	(unspec:VF
-	  [(match_operand:VF 1 "register_operand" "0,x")
-	   (match_operand:VF 2 "nonimmediate_operand" "xm,xm")
-	   (match_operand:VF 3 "register_operand" "Yz,x")]
+  [(set (match_operand:VF_128_256 0 "register_operand" "=x,x")
+	(unspec:VF_128_256
+	  [(match_operand:VF_128_256 1 "register_operand" "0,x")
+	   (match_operand:VF_128_256 2 "nonimmediate_operand" "xm,xm")
+	   (match_operand:VF_128_256 3 "register_operand" "Yz,x")]
 	  UNSPEC_BLENDV))]
   "TARGET_SSE4_1"
   "@
@@ -8575,10 +8663,10 @@ 
    (set_attr "mode" "<MODE>")])
 
 (define_insn "<sse4_1>_dp<ssemodesuffix><avxsizesuffix>"
-  [(set (match_operand:VF 0 "register_operand" "=x,x")
-	(unspec:VF
-	  [(match_operand:VF 1 "nonimmediate_operand" "%0,x")
-	   (match_operand:VF 2 "nonimmediate_operand" "xm,xm")
+  [(set (match_operand:VF_128_256 0 "register_operand" "=x,x")
+	(unspec:VF_128_256
+	  [(match_operand:VF_128_256 1 "nonimmediate_operand" "%0,x")
+	   (match_operand:VF_128_256 2 "nonimmediate_operand" "xm,xm")
 	   (match_operand:SI 3 "const_0_to_255_operand" "n,n")]
 	  UNSPEC_DP))]
   "TARGET_SSE4_1"
@@ -8909,8 +8997,8 @@ 
 ;; setting FLAGS_REG. But it is not a really compare instruction.
 (define_insn "avx_vtest<ssemodesuffix><avxsizesuffix>"
   [(set (reg:CC FLAGS_REG)
-	(unspec:CC [(match_operand:VF 0 "register_operand" "x")
-		    (match_operand:VF 1 "nonimmediate_operand" "xm")]
+	(unspec:CC [(match_operand:VF_128_256 0 "register_operand" "x")
+		    (match_operand:VF_128_256 1 "nonimmediate_operand" "xm")]
 		   UNSPEC_VTESTP))]
   "TARGET_AVX"
   "vtest<ssemodesuffix>\t{%1, %0|%0, %1}"
@@ -8947,9 +9035,9 @@ 
    (set_attr "mode" "TI")])
 
 (define_insn "<sse4_1>_round<ssemodesuffix><avxsizesuffix>"
-  [(set (match_operand:VF 0 "register_operand" "=x")
-	(unspec:VF
-	  [(match_operand:VF 1 "nonimmediate_operand" "xm")
+  [(set (match_operand:VF_128_256 0 "register_operand" "=x")
+	(unspec:VF_128_256
+	  [(match_operand:VF_128_256 1 "nonimmediate_operand" "xm")
 	   (match_operand:SI 2 "const_0_to_15_operand" "n")]
 	  UNSPEC_ROUND))]
   "TARGET_ROUND"
@@ -10341,10 +10429,10 @@ 
    (set_attr "mode" "TI")])
 
 (define_insn "xop_vpermil2<mode>3"
-  [(set (match_operand:VF 0 "register_operand" "=x")
-	(unspec:VF
-	  [(match_operand:VF 1 "register_operand" "x")
-	   (match_operand:VF 2 "nonimmediate_operand" "%x")
+  [(set (match_operand:VF_128_256 0 "register_operand" "=x")
+	(unspec:VF_128_256
+	  [(match_operand:VF_128_256 1 "register_operand" "x")
+	   (match_operand:VF_128_256 2 "nonimmediate_operand" "%x")
 	   (match_operand:<sseintvecmode> 3 "nonimmediate_operand" "xm")
 	   (match_operand:SI 4 "const_0_to_3_operand" "n")]
 	  UNSPEC_VPERMIL2))]
@@ -10794,7 +10882,7 @@ 
     = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (<ssescalarnum>, perm));
 })
 
-(define_insn "*avx_vpermilp<mode>"
+(define_insn "*<sse2_avx_avx512f>_vpermilp<mode>"
   [(set (match_operand:VF 0 "register_operand" "=v")
 	(vec_select:VF
 	  (match_operand:VF 1 "nonimmediate_operand" "vm")
@@ -10811,9 +10899,9 @@ 
    (set_attr "prefix_extra" "1")
    (set_attr "length_immediate" "1")
    (set_attr "prefix" "vex")
-   (set_attr "mode" "<MODE>")])
+   (set_attr "mode" "<sseinsnmode>")])
 
-(define_insn "avx_vpermilvar<mode>3"
+(define_insn "<sse2_avx_avx512f>_vpermilvar<mode>3"
   [(set (match_operand:VF 0 "register_operand" "=v")
 	(unspec:VF
 	  [(match_operand:VF 1 "register_operand" "v")
@@ -10823,9 +10911,10 @@ 
   "vpermil<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "type" "sselog")
    (set_attr "prefix_extra" "1")
-   (set_attr "prefix" "vex")
    (set_attr "btver2_decode" "vector")
-   (set_attr "mode" "<MODE>")])
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
 
 (define_expand "avx_vperm2f128<mode>3"
   [(set (match_operand:AVX256MODE2P 0 "register_operand")