[i386,2/8,AVX512] Add mask registers.

Message ID: 20130814072358.GC52726@msticlxl57.ims.intel.com
State: New

Commit Message

Kirill Yukhin Aug. 14, 2013, 7:23 a.m. UTC
Hello,
The patch was rebased on top of trunk.

It applies on top of [1/8] (which was rebased onto the new trunk today).

Testing:
  1. Bootstrap passes.
  2. make check shows no regressions.
  3. Spec 2000 & 2006 builds show no regressions, both with and without the -mavx512f option.
  4. Spec 2000 & 2006 runs show no regressions without the -mavx512f option.

Thanks, K

---
 gcc/config/i386/constraints.md |   8 +-
 gcc/config/i386/i386.c         |  32 +++++--
 gcc/config/i386/i386.h         |  40 ++++++--
 gcc/config/i386/i386.md        | 204 +++++++++++++++++++++++++++++++++++++----
 4 files changed, 247 insertions(+), 37 deletions(-)

Comments

Richard Henderson Aug. 19, 2013, 9:17 p.m. UTC | #1
On 08/14/2013 12:23 AM, Kirill Yukhin wrote:
> +  ;; For AVX512F mask support
> +  UNSPEC_KIOR
> +  UNSPEC_KXOR
> +  UNSPEC_KAND
> +  UNSPEC_KANDN

I thought we determined that you didn't need these,
that "*Yk" as a constraint was sufficient.

> +(define_insn "kandn<mode>"
> +(define_insn "kand<mode>"
> +(define_insn "kior<mode>"
> +(define_insn "kxor<mode>"

Because otherwise there's nothing different between these...

> +(define_insn "kxnor<mode>"
> +(define_insn "kortestzhi"
> +(define_insn "kortestchi"
> +(define_insn "kunpckhi"
>  (define_insn "*one_cmpl<mode>2_1"
> +(define_insn "*one_cmplhi2_1"
>  (define_insn "*one_cmplqi2_1"

... and these.
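
For illustration, a hedged sketch of the unspec-free form being suggested
(an editor's sketch, not the committed pattern): kandn, for instance, has
direct RTL semantics, so no UNSPEC_KANDN would be needed:

  ;; kandn computes ~op1 & op2, which is expressible as plain RTL,
  ;; letting combine and the generic logic patterns see through it.
  (define_insn "kandn<mode>"
    [(set (match_operand:SWI12 0 "register_operand" "=Yk")
          (and:SWI12
            (not:SWI12 (match_operand:SWI12 1 "register_operand" "Yk"))
            (match_operand:SWI12 2 "register_operand" "Yk")))]
    "TARGET_AVX512F"
    "kandnw\t{%2, %1, %0|%0, %1, %2}")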



r~
Kirill Yukhin Aug. 22, 2013, 9:35 a.m. UTC | #2
Hello Richard,

On 19 Aug 14:17, Richard Henderson wrote:
> On 08/14/2013 12:23 AM, Kirill Yukhin wrote:
> > +  ;; For AVX512F mask support
> > +  UNSPEC_KIOR
> > +  UNSPEC_KXOR
> > +  UNSPEC_KAND
> > +  UNSPEC_KANDN
> 
> I thought we determined that you didn't need these,
> that "*Yk" as a constraint was sufficient.

As far as I understood, we're talking about incorporating the mask
logic instructions into the existing patterns and marking the mask
constraints as disparaged.

E.g. for OR we have:
  (define_insn "*<code><mode>_1"
    [(set (match_operand:SWI248 0 "nonimmediate_operand" "=r,rm")
          (any_or:SWI248
           (match_operand:SWI248 1 "nonimmediate_operand" "%0,0")
           (match_operand:SWI248 2 "<general_operand>" "<g>,r<i>")))
     (clobber (reg:CC FLAGS_REG))]
    "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
    "<logic>{<imodesuffix>}\t{%2, %0|%0, %2}"

Unlike the generic OR, the mask version of OR does not clobber FLAGS_REG.
Of course, we could conservatively assume that it does, but I believe
that is not a good idea.
Making the single constraint in a new pattern disparaged makes no sense,
as far as I understand, since disparagement is a relative notion.
So, what should I do?

--
Thanks, K
Richard Henderson Aug. 22, 2013, 3:49 p.m. UTC | #3
On 08/22/2013 02:35 AM, Kirill Yukhin wrote:
> Unlike the generic OR, the mask version of OR does not clobber FLAGS_REG.
> Of course, we could conservatively assume that it does, but I believe
> that is not a good idea.

I believe that having two different patterns is a worse idea.

You can always split away the clobber after reload, as we do
when add gets implemented with lea.
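
A minimal sketch of that approach (an illustration, not the committed
split; the mask_reg_operand predicate is assumed here, and any_logic is
the existing and/ior/xor code iterator):

  ;; After reload, once all operands are known to be mask registers,
  ;; drop the FLAGS_REG clobber so the insn matches the k-logic form,
  ;; which does not set flags.
  (define_split
    [(set (match_operand:SWI12 0 "mask_reg_operand")
          (any_logic:SWI12 (match_operand:SWI12 1 "mask_reg_operand")
                           (match_operand:SWI12 2 "mask_reg_operand")))
     (clobber (reg:CC FLAGS_REG))]
    "TARGET_AVX512F && reload_completed"
    [(set (match_dup 0)
          (any_logic:SWI12 (match_dup 1) (match_dup 2)))])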


r~

Patch

diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
index 28e626f..92e0c05 100644
--- a/gcc/config/i386/constraints.md
+++ b/gcc/config/i386/constraints.md
@@ -19,7 +19,7 @@ 
 
 ;;; Unused letters:
 ;;;     B     H           T
-;;;           h jk
+;;;           h j
 
 ;; Integer register constraints.
 ;; It is not necessary to define 'r' here.
@@ -78,6 +78,12 @@ 
  "TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387 ? FP_SECOND_REG : NO_REGS"
  "Second from top of 80387 floating-point stack (@code{%st(1)}).")
 
+(define_register_constraint "k" "TARGET_AVX512F ? MASK_EVEX_REGS : NO_REGS"
+"@internal Any mask register that can be used as predicate, i.e. k1-k7.")
+
+(define_register_constraint "Yk" "TARGET_AVX512F ? MASK_REGS : NO_REGS"
+"@internal Any mask register.")
+
 ;; Vector registers (also used for plain floating point nowadays).
 (define_register_constraint "y" "TARGET_MMX ? MMX_REGS : NO_REGS"
  "Any MMX register.")
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index dd97e6b..7412745 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2303,6 +2303,9 @@  enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
   EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
   EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
   EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
+  /* Mask registers.  */
+  MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
+  MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
 };
 
 /* The "default" register map used in 32bit mode.  */
@@ -2318,6 +2321,7 @@  int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
   -1, -1, -1, -1, -1, -1, -1, -1,	/* extended SSE registers */
   -1, -1, -1, -1, -1, -1, -1, -1,       /* new SSE registers 16-23*/
   -1, -1, -1, -1, -1, -1, -1, -1,       /* new SSE registers 24-31*/
+  93, 94, 95, 96, 97, 98, 99, 100,      /* Mask registers */
 };
 
 /* The "default" register map used in 64bit mode.  */
@@ -2333,6 +2337,7 @@  int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
   25, 26, 27, 28, 29, 30, 31, 32,	/* extended SSE registers */
   67, 68, 69, 70, 71, 72, 73, 74,       /* new SSE registers 16-23 */
   75, 76, 77, 78, 79, 80, 81, 82,       /* new SSE registers 24-31 */
+  118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
 };
 
 /* Define the register numbers to be used in Dwarf debugging information.
@@ -2400,6 +2405,7 @@  int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
   -1, -1, -1, -1, -1, -1, -1, -1,	/* extended SSE registers */
   -1, -1, -1, -1, -1, -1, -1, -1,       /* new SSE registers 16-23*/
   -1, -1, -1, -1, -1, -1, -1, -1,       /* new SSE registers 24-31*/
+  93, 94, 95, 96, 97, 98, 99, 100,      /* Mask registers */
 };
 
 /* Define parameter passing and return registers.  */
@@ -4457,7 +4463,8 @@  ix86_conditional_register_usage (void)
   /* If AVX512F is disabled, squash the registers.  */
   if (! TARGET_AVX512F)
     for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
-      if (TEST_HARD_REG_BIT (reg_class_contents[(int)EVEX_SSE_REGS], i))
+      if (TEST_HARD_REG_BIT (reg_class_contents[(int)MASK_REGS], i)
+	  || TEST_HARD_REG_BIT (reg_class_contents[(int)EVEX_SSE_REGS], i))
 	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
 }
 
@@ -34114,10 +34121,12 @@  ix86_preferred_reload_class (rtx x, reg_class_t regclass)
     return regclass;
 
   /* Force constants into memory if we are loading a (nonzero) constant into
-     an MMX or SSE register.  This is because there are no MMX/SSE instructions
-     to load from a constant.  */
+     an MMX, SSE or MASK register.  This is because there are no MMX/SSE/MASK
+     instructions to load from a constant.  */
   if (CONSTANT_P (x)
-      && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
+      && (MAYBE_MMX_CLASS_P (regclass)
+	  || MAYBE_SSE_CLASS_P (regclass)
+	  || MAYBE_MASK_CLASS_P (regclass)))
     return NO_REGS;
 
   /* Prefer SSE regs only, if we can use them for math.  */
@@ -34221,10 +34230,11 @@  ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
 
   /* QImode spills from non-QI registers require
      intermediate register on 32bit targets.  */
-  if (!TARGET_64BIT
-      && !in_p && mode == QImode
-      && INTEGER_CLASS_P (rclass)
-      && MAYBE_NON_Q_CLASS_P (rclass))
+  if (mode == QImode
+      && (MAYBE_MASK_CLASS_P (rclass)
+	  || (!TARGET_64BIT && !in_p
+	      && INTEGER_CLASS_P (rclass)
+	      && MAYBE_NON_Q_CLASS_P (rclass))))
     {
       int regno;
 
@@ -34646,6 +34656,8 @@  ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
     return false;
   if (STACK_REGNO_P (regno))
     return VALID_FP_MODE_P (mode);
+  if (MASK_REGNO_P (regno))
+    return VALID_MASK_REG_MODE (mode);
   if (SSE_REGNO_P (regno))
     {
       /* We implement the move patterns for all vector modes into and
@@ -35446,6 +35458,10 @@  x86_order_regs_for_local_alloc (void)
    for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
      reg_alloc_order [pos++] = i;
 
+   /* Mask register.  */
+   for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
+     reg_alloc_order [pos++] = i;
+
    /* x87 registers.  */
    if (TARGET_SSE_MATH)
      for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 7700d0a..7dc6856 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -893,7 +893,7 @@  enum target_cpu_default
    eliminated during reloading in favor of either the stack or frame
    pointer.  */
 
-#define FIRST_PSEUDO_REGISTER 69
+#define FIRST_PSEUDO_REGISTER 77
 
 /* Number of hardware registers that go into the DWARF-2 unwind info.
    If not defined, equals FIRST_PSEUDO_REGISTER.  */
@@ -923,7 +923,9 @@  enum target_cpu_default
 /*xmm16,xmm17,xmm18,xmm19,xmm20,xmm21,xmm22,xmm23*/		\
      0,   0,    0,    0,    0,    0,    0,    0,		\
 /*xmm24,xmm25,xmm26,xmm27,xmm28,xmm29,xmm30,xmm31*/		\
-     0,   0,    0,    0,    0,    0,    0,    0 }
+     0,   0,    0,    0,    0,    0,    0,    0,		\
+/*  k0,  k1, k2, k3, k4, k5, k6, k7*/				\
+     0,  0,   0,  0,  0,  0,  0,  0 }
 
 /* 1 for registers not available across function calls.
    These must include the FIXED_REGISTERS and also any
@@ -955,7 +957,9 @@  enum target_cpu_default
 /*xmm16,xmm17,xmm18,xmm19,xmm20,xmm21,xmm22,xmm23*/		\
      6,    6,     6,    6,    6,    6,    6,    6,		\
 /*xmm24,xmm25,xmm26,xmm27,xmm28,xmm29,xmm30,xmm31*/		\
-     6,    6,     6,    6,    6,    6,    6,    6 }
+     6,    6,     6,    6,    6,    6,    6,    6,		\
+ /* k0,  k1,  k2,  k3,  k4,  k5,  k6,  k7*/			\
+     1,   1,   1,   1,   1,   1,   1,   1 }
 
 /* Order in which to allocate registers.  Each register must be
    listed once, even those in FIXED_REGISTERS.  List frame pointer
@@ -971,7 +975,7 @@  enum target_cpu_default
    18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,	\
    33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,  \
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,	\
-   63, 64, 65, 66, 67, 68 }
+   63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76 }
 
 /* ADJUST_REG_ALLOC_ORDER is a macro which permits reg_alloc_order
    to be rearranged based on a particular function.  When using sse math,
@@ -1068,6 +1072,8 @@  enum target_cpu_default
    || (MODE) == V16SImode || (MODE) == V32HImode || (MODE) == V8DFmode	\
    || (MODE) == V16SFmode)
 
+#define VALID_MASK_REG_MODE(MODE) ((MODE) == HImode || (MODE) == QImode)
+
 /* Value is 1 if hard register REGNO can hold a value of machine-mode MODE.  */
 
 #define HARD_REGNO_MODE_OK(REGNO, MODE)	\
@@ -1093,8 +1099,10 @@  enum target_cpu_default
   (CC_REGNO_P (REGNO) ? VOIDmode					\
    : (MODE) == VOIDmode && (NREGS) != 1 ? VOIDmode			\
    : (MODE) == VOIDmode ? choose_hard_reg_mode ((REGNO), (NREGS), false) \
-   : (MODE) == HImode && !TARGET_PARTIAL_REG_STALL ? SImode		\
-   : (MODE) == QImode && !(TARGET_64BIT || QI_REGNO_P (REGNO)) ? SImode	\
+   : (MODE) == HImode && !(TARGET_PARTIAL_REG_STALL			\
+			   || MASK_REGNO_P (REGNO)) ? SImode		\
+   : (MODE) == QImode && !(TARGET_64BIT || QI_REGNO_P (REGNO)		\
+			   || MASK_REGNO_P (REGNO)) ? SImode		\
    : (MODE))
 
 /* The only ABI that saves SSE registers across calls is Win64 (thus no
@@ -1141,6 +1149,9 @@  enum target_cpu_default
 #define FIRST_EXT_REX_SSE_REG  (LAST_REX_SSE_REG + 1) /*53*/
 #define LAST_EXT_REX_SSE_REG   (FIRST_EXT_REX_SSE_REG + 15) /*68*/
 
+#define FIRST_MASK_REG  (LAST_EXT_REX_SSE_REG + 1) /*69*/
+#define LAST_MASK_REG   (FIRST_MASK_REG + 7) /*76*/
+
 /* Override this in other tm.h files to cope with various OS lossage
    requiring a frame pointer.  */
 #ifndef SUBTARGET_FRAME_POINTER_REQUIRED
@@ -1229,6 +1240,8 @@  enum reg_class
   FLOAT_INT_REGS,
   INT_SSE_REGS,
   FLOAT_INT_SSE_REGS,
+  MASK_EVEX_REGS,
+  MASK_REGS,
   ALL_REGS, LIM_REG_CLASSES
 };
 
@@ -1250,6 +1263,8 @@  enum reg_class
   reg_classes_intersect_p ((CLASS), ALL_SSE_REGS)
 #define MAYBE_MMX_CLASS_P(CLASS) \
   reg_classes_intersect_p ((CLASS), MMX_REGS)
+#define MAYBE_MASK_CLASS_P(CLASS) \
+  reg_classes_intersect_p ((CLASS), MASK_REGS)
 
 #define Q_CLASS_P(CLASS) \
   reg_class_subset_p ((CLASS), Q_REGS)
@@ -1282,6 +1297,8 @@  enum reg_class
    "FLOAT_INT_REGS",			\
    "INT_SSE_REGS",			\
    "FLOAT_INT_SSE_REGS",		\
+   "MASK_EVEX_REGS",			\
+   "MASK_REGS",				\
    "ALL_REGS" }
 
 /* Define which registers fit in which classes.  This is an initializer
@@ -1319,7 +1336,9 @@  enum reg_class
 {   0x11ffff,    0x1fe0,   0x0 },       /* FLOAT_INT_REGS */            \
 { 0x1ff100ff,0xffffffe0,  0x1f },       /* INT_SSE_REGS */              \
 { 0x1ff1ffff,0xffffffe0,  0x1f },       /* FLOAT_INT_SSE_REGS */        \
-{ 0xffffffff,0xffffffff,  0x1f }                                        \
+       { 0x0,       0x0,0x1fc0 },       /* MASK_EVEX_REGS */           \
+       { 0x0,       0x0,0x1fe0 },       /* MASK_REGS */                 \
+{ 0xffffffff,0xffffffff,0x1fff }                                        \
 }
 
 /* The same information, inverted:
@@ -1377,6 +1396,8 @@  enum reg_class
          : (N) <= LAST_REX_SSE_REG ? (FIRST_REX_SSE_REG + (N) - 8) \
                                    : (FIRST_EXT_REX_SSE_REG + (N) - 16))
 
+#define MASK_REGNO_P(N) IN_RANGE ((N), FIRST_MASK_REG, LAST_MASK_REG)
+#define ANY_MASK_REG_P(X) (REG_P (X) && MASK_REGNO_P (REGNO (X)))
 
 #define SSE_FLOAT_MODE_P(MODE) \
   ((TARGET_SSE && (MODE) == SFmode) || (TARGET_SSE2 && (MODE) == DFmode))
@@ -1429,7 +1450,7 @@  enum reg_class
 
 /* Get_secondary_mem widens integral modes to BITS_PER_WORD.
    There is no need to emit full 64 bit move on 64 bit targets
-   for integral modes that can be moved using 32 bit move.  */
+   for integral modes that can be moved using 8 bit move.  */
 #define SECONDARY_MEMORY_NEEDED_MODE(MODE)			\
   (GET_MODE_BITSIZE (MODE) < 32 && INTEGRAL_MODE_P (MODE)	\
    ? mode_for_size (32, GET_MODE_CLASS (MODE), 0)		\
@@ -1933,7 +1954,8 @@  do {							\
  "xmm16", "xmm17", "xmm18", "xmm19",					\
  "xmm20", "xmm21", "xmm22", "xmm23",					\
  "xmm24", "xmm25", "xmm26", "xmm27",					\
- "xmm28", "xmm29", "xmm30", "xmm31" }
+ "xmm28", "xmm29", "xmm30", "xmm31",					\
+ "k0", "k1", "k2", "k3", "k4", "k5", "k6", "k7" }
 
 #define REGISTER_NAMES HI_REGISTER_NAMES
 
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index a666794..630b87e 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -178,6 +178,12 @@ 
   ;; For BMI2 support
   UNSPEC_PDEP
   UNSPEC_PEXT
+
+  ;; For AVX512F mask support
+  UNSPEC_KIOR
+  UNSPEC_KXOR
+  UNSPEC_KAND
+  UNSPEC_KANDN
 ])
 
 (define_c_enum "unspecv" [
@@ -328,6 +334,14 @@ 
    (XMM29_REG			66)
    (XMM30_REG			67)
    (XMM31_REG			68)
+   (MASK0_REG			69)
+   (MASK1_REG			70)
+   (MASK2_REG			71)
+   (MASK3_REG			72)
+   (MASK4_REG			73)
+   (MASK5_REG			74)
+   (MASK6_REG			75)
+   (MASK7_REG			76)
   ])
 
 ;; Insns whose names begin with "x86_" are emitted by gen_FOO calls
@@ -360,7 +374,7 @@ 
    sseishft,sseishft1,ssecmp,ssecomi,
    ssecvt,ssecvt1,sseicvt,sseins,
    sseshuf,sseshuf1,ssemuladd,sse4arg,
-   lwp,
+   lwp,mskmov,msklog,
    mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft"
   (const_string "other"))
 
@@ -379,7 +393,7 @@ 
 			  ssemul,sseimul,ssediv,sselog,sselog1,
 			  sseishft,sseishft1,ssecmp,ssecomi,
 			  ssecvt,ssecvt1,sseicvt,sseins,
-			  sseshuf,sseshuf1,ssemuladd,sse4arg")
+			  sseshuf,sseshuf1,ssemuladd,sse4arg,mskmov")
 	   (const_string "sse")
 	 (eq_attr "type" "mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft")
 	   (const_string "mmx")
@@ -390,7 +404,7 @@ 
 ;; The (bounding maximum) length of an instruction immediate.
 (define_attr "length_immediate" ""
   (cond [(eq_attr "type" "incdec,setcc,icmov,str,lea,other,multi,idiv,leave,
-			  bitmanip,imulx")
+			  bitmanip,imulx,msklog,mskmov")
 	   (const_int 0)
 	 (eq_attr "unit" "i387,sse,mmx")
 	   (const_int 0)
@@ -451,7 +465,7 @@ 
 ;; Set when 0f opcode prefix is used.
 (define_attr "prefix_0f" ""
   (if_then_else
-    (ior (eq_attr "type" "imovx,setcc,icmov,bitmanip")
+    (ior (eq_attr "type" "imovx,setcc,icmov,bitmanip,msklog,mskmov")
 	 (eq_attr "unit" "sse,mmx"))
     (const_int 1)
     (const_int 0)))
@@ -651,7 +665,7 @@ 
 		   fmov,fcmp,fsgn,
 		   sse,ssemov,ssecmp,ssecomi,ssecvt,ssecvt1,sseicvt,
 		   sselog1,sseshuf1,sseadd1,sseiadd1,sseishft1,
-		   mmx,mmxmov,mmxcmp,mmxcvt")
+		   mmx,mmxmov,mmxcmp,mmxcvt,mskmov,msklog")
 	      (match_operand 2 "memory_operand"))
 	   (const_string "load")
 	 (and (eq_attr "type" "icmov,ssemuladd,sse4arg")
@@ -2211,8 +2225,8 @@ 
 	   (const_string "SI")))])
 
 (define_insn "*movhi_internal"
-  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m")
-	(match_operand:HI 1 "general_operand"	   "r ,rn,rm,rn"))]
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,Yk,Yk,rm")
+	(match_operand:HI 1 "general_operand"      "r ,rn,rm,rn,rm,Yk,Yk"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   switch (get_attr_type (insn))
@@ -2221,6 +2235,16 @@ 
       /* movzwl is faster than movw on p2 due to partial word stalls,
 	 though not as fast as an aligned movl.  */
       return "movz{wl|x}\t{%1, %k0|%k0, %1}";
+
+    case TYPE_MSKMOV:
+      switch (which_alternative)
+        {
+	case 4: return "kmovw\t{%k1, %0|%0, %k1}";
+	case 5: return "kmovw\t{%1, %0|%0, %1}";
+	case 6: return "kmovw\t{%1, %k0|%k0, %1}";
+	default: gcc_unreachable ();
+	}
+
     default:
       if (get_attr_mode (insn) == MODE_SI)
         return "mov{l}\t{%k1, %k0|%k0, %k1}";
@@ -2238,11 +2262,17 @@ 
 	    (and (eq_attr "alternative" "1,2")
 		 (match_operand:HI 1 "aligned_operand"))
 	      (const_string "imov")
+	    (eq_attr "alternative" "4,5,6")
+	      (const_string "mskmov")
 	    (and (match_test "TARGET_MOVX")
 		 (eq_attr "alternative" "0,2"))
 	      (const_string "imovx")
 	   ]
 	   (const_string "imov")))
+    (set (attr "prefix")
+      (if_then_else (eq_attr "alternative" "4,5,6")
+	(const_string "vex")
+	(const_string "orig")))
     (set (attr "mode")
       (cond [(eq_attr "type" "imovx")
 	       (const_string "SI")
@@ -2267,8 +2297,8 @@ 
 ;; register stall machines with, where we use QImode instructions, since
 ;; partial register stall can be caused there.  Then we use movzx.
 (define_insn "*movqi_internal"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=q,q ,q ,r,r ,?r,m")
-	(match_operand:QI 1 "general_operand"      "q ,qn,qm,q,rn,qm,qn"))]
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=q,q ,q ,r,r ,?r,m ,Yk,Yk,r")
+	(match_operand:QI 1 "general_operand"      "q ,qn,qm,q,rn,qm,qn,r ,Yk,Yk"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   switch (get_attr_type (insn))
@@ -2276,6 +2306,16 @@ 
     case TYPE_IMOVX:
       gcc_assert (ANY_QI_REG_P (operands[1]) || MEM_P (operands[1]));
       return "movz{bl|x}\t{%1, %k0|%k0, %1}";
+
+    case TYPE_MSKMOV:
+      switch (which_alternative)
+        {
+	case 7: return "kmovw\t{%k1, %0|%0, %k1}";
+	case 8: return "kmovw\t{%1, %0|%0, %1}";
+	case 9: return "kmovw\t{%1, %k0|%k0, %1}";
+	default: gcc_unreachable ();
+	}
+
     default:
       if (get_attr_mode (insn) == MODE_SI)
         return "mov{l}\t{%k1, %k0|%k0, %k1}";
@@ -2295,11 +2335,17 @@ 
 	      (const_string "imov")
 	    (eq_attr "alternative" "3,5")
 	      (const_string "imovx")
+	    (eq_attr "alternative" "7,8,9")
+	      (const_string "mskmov")
 	    (and (match_test "TARGET_MOVX")
 		 (eq_attr "alternative" "2"))
 	      (const_string "imovx")
 	   ]
 	   (const_string "imov")))
+   (set (attr "prefix")
+     (if_then_else (eq_attr "alternative" "7,8,9")
+       (const_string "vex")
+       (const_string "orig")))
    (set (attr "mode")
       (cond [(eq_attr "alternative" "3,4,5")
 	       (const_string "SI")
@@ -7639,6 +7685,18 @@ 
        (const_string "*")))
    (set_attr "mode" "HI,HI,SI")])
 
+(define_insn "kandn<mode>"
+  [(set (match_operand:SWI12 0 "register_operand" "=Yk")
+	(unspec:SWI12
+	  [(match_operand:SWI12 1 "register_operand" "Yk")
+	   (match_operand:SWI12 2 "register_operand" "Yk")]
+	 UNSPEC_KANDN))]
+  "TARGET_AVX512F"
+  "kandnw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "mode" "<MODE>")
+   (set_attr "type" "msklog")
+   (set_attr "prefix" "vex")])
+
 ;; %%% Potential partial reg stall on alternative 2.  What to do?
 (define_insn "*andqi_1"
   [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r")
@@ -7664,6 +7722,18 @@ 
   [(set_attr "type" "alu1")
    (set_attr "mode" "QI")])
 
+(define_insn "kand<mode>"
+  [(set (match_operand:SWI12 0 "register_operand" "=Yk")
+	(unspec:SWI12
+	  [(match_operand:SWI12 1 "register_operand" "Yk")
+	   (match_operand:SWI12 2 "register_operand" "Yk")]
+	  UNSPEC_KAND))]
+  "TARGET_AVX512F"
+  "kandw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "msklog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
 ;; Turn *anddi_1 into *andsi_1_zext if possible.
 (define_split
   [(set (match_operand:DI 0 "register_operand")
@@ -8067,6 +8137,81 @@ 
   [(set_attr "type" "alu")
    (set_attr "mode" "<MODE>")])
 
+(define_insn "kior<mode>"
+  [(set (match_operand:SWI12 0 "register_operand" "=Yk")
+	(unspec:SWI12
+	  [(match_operand:SWI12 1 "register_operand" "Yk")
+	   (match_operand:SWI12 2 "register_operand" "Yk")]
+	  UNSPEC_KIOR))]
+  "TARGET_AVX512F"
+  "korw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "msklog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "kxor<mode>"
+  [(set (match_operand:SWI12 0 "register_operand" "=Yk")
+	(unspec:SWI12
+	  [(match_operand:SWI12 1 "register_operand" "Yk")
+	   (match_operand:SWI12 2 "register_operand" "Yk")]
+	  UNSPEC_KXOR))]
+  "TARGET_AVX512F"
+  "kxorw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "msklog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "kxnor<mode>"
+  [(set (match_operand:SWI12 0 "register_operand" "=Yk")
+	(not:SWI12
+	  (xor:SWI12
+	    (match_operand:SWI12 1 "register_operand" "Yk")
+	    (match_operand:SWI12 2 "register_operand" "Yk"))))]
+  "TARGET_AVX512F"
+  "kxnorw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "mode" "<MODE>")
+   (set_attr "type" "msklog")
+   (set_attr "prefix" "vex")])
+
+(define_insn "kortestzhi"
+  [(set (reg:CCZ FLAGS_REG)
+	(compare:CCZ
+	  (ior:HI
+	    (match_operand:HI 0 "register_operand" "%Yk")
+	    (match_operand:HI 1 "register_operand" "Yk"))
+	  (const_int 0)))]
+  "TARGET_AVX512F && ix86_match_ccmode (insn, CCZmode)"
+  "kortestw\t{%1, %0|%0, %1}"
+  [(set_attr "mode" "HI")
+   (set_attr "type" "msklog")
+   (set_attr "prefix" "vex")])
+
+(define_insn "kortestchi"
+  [(set (reg:CCC FLAGS_REG)
+	(compare:CCC
+	  (ior:HI
+	    (match_operand:HI 0 "register_operand" "%Yk")
+	    (match_operand:HI 1 "register_operand" "Yk"))
+	  (const_int -1)))]
+  "TARGET_AVX512F && ix86_match_ccmode (insn, CCCmode)"
+  "kortestw\t{%1, %0|%0, %1}"
+  [(set_attr "mode" "HI")
+   (set_attr "type" "msklog")
+   (set_attr "prefix" "vex")])
+
+(define_insn "kunpckhi"
+  [(set (match_operand:HI 0 "register_operand" "=Yk")
+	(ior:HI
+	  (ashift:HI
+	    (match_operand:HI 1 "register_operand" "Yk")
+	    (const_int 8))
+	  (zero_extend:HI (subreg:QI (match_operand:HI 2 "register_operand" "Yk") 0))))]
+  "TARGET_AVX512F"
+  "kunpckbw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "mode" "HI")
+   (set_attr "type" "msklog")
+   (set_attr "prefix" "vex")])
+
 ;; See comment for addsi_1_zext why we do use nonimmediate_operand
 ;; ??? Special case for immediate operand is missing - it is tricky.
 (define_insn "*<code>si_2_zext"
@@ -8636,23 +8781,38 @@ 
   "ix86_expand_unary_operator (NOT, <MODE>mode, operands); DONE;")
 
 (define_insn "*one_cmpl<mode>2_1"
-  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm")
-	(not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0")))]
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm")
+	(not:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "0")))]
   "ix86_unary_operator_ok (NOT, <MODE>mode, operands)"
   "not{<imodesuffix>}\t%0"
   [(set_attr "type" "negnot")
    (set_attr "mode" "<MODE>")])
 
+(define_insn "*one_cmplhi2_1"
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,Yk")
+	(not:HI (match_operand:HI 1 "nonimmediate_operand" "0,Yk")))]
+  "ix86_unary_operator_ok (NOT, HImode, operands)"
+  "@
+   not{w}\t%0
+   knotw\t{%1, %0|%0, %1}"
+  [(set_attr "isa" "*,avx512f")
+   (set_attr "type" "negnot,msklog")
+   (set_attr "prefix" "*,vex")
+   (set_attr "mode" "HI")])
+
 ;; %%% Potential partial reg stall on alternative 1.  What to do?
 (define_insn "*one_cmplqi2_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r")
-	(not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0")))]
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,*Yk")
+	(not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,*Yk")))]
   "ix86_unary_operator_ok (NOT, QImode, operands)"
   "@
    not{b}\t%0
-   not{l}\t%k0"
-  [(set_attr "type" "negnot")
-   (set_attr "mode" "QI,SI")])
+   not{l}\t%k0
+   knotw\t{%1, %0|%0, %1}"
+  [(set_attr "isa" "*,*,avx512f")
+   (set_attr "type" "negnot,negnot,msklog")
+   (set_attr "prefix" "*,*,vex")
+   (set_attr "mode" "QI,SI,QI")])
 
 ;; ??? Currently never generated - xor is used instead.
 (define_insn "*one_cmplsi2_1_zext"
@@ -16376,7 +16536,7 @@ 
 })
 
 ;; Avoid redundant prefixes by splitting HImode arithmetic to SImode.
-
+;; Do not split instructions with mask registers.
 (define_split
   [(set (match_operand 0 "register_operand")
 	(match_operator 3 "promotable_binary_operator"
@@ -16390,7 +16550,10 @@ 
 	    || !CONST_INT_P (operands[2])
 	    || satisfies_constraint_K (operands[2])))
        || (GET_MODE (operands[0]) == QImode
-	   && (TARGET_PROMOTE_QImode || optimize_function_for_size_p (cfun))))"
+	   && (TARGET_PROMOTE_QImode || optimize_function_for_size_p (cfun))))
+   && (! ANY_MASK_REG_P (operands[0])
+	|| ! ANY_MASK_REG_P (operands[1])
+	|| ! ANY_MASK_REG_P (operands[2]))"
   [(parallel [(set (match_dup 0)
 		   (match_op_dup 3 [(match_dup 1) (match_dup 2)]))
 	      (clobber (reg:CC FLAGS_REG))])]
@@ -16475,6 +16638,7 @@ 
   operands[1] = gen_lowpart (SImode, operands[1]);
 })
 
+;; Do not split instructions with mask regs.
 (define_split
   [(set (match_operand 0 "register_operand")
 	(not (match_operand 1 "register_operand")))]
@@ -16482,7 +16646,9 @@ 
    && (GET_MODE (operands[0]) == HImode
        || (GET_MODE (operands[0]) == QImode
 	   && (TARGET_PROMOTE_QImode
-	       || optimize_insn_for_size_p ())))"
+	       || optimize_insn_for_size_p ())))
+   && (! ANY_MASK_REG_P (operands[0])
+	 || ! ANY_MASK_REG_P (operands[1]))"
   [(set (match_dup 0)
 	(not:SI (match_dup 1)))]
 {