Patchwork [i386,2/8,AVX512] Add mask registers.

login
register
mail settings
Submitter Kirill Yukhin
Date Aug. 28, 2013, 5:45 p.m.
Message ID <20130828174536.GA47152@msticlxl57.ims.intel.com>
Download mbox | patch
Permalink /patch/270613/
State New
Headers show

Comments

Kirill Yukhin - Aug. 28, 2013, 5:45 p.m.
Hello Richard,

On 27 Aug 13:07, Richard Henderson wrote:
> On 08/27/2013 11:11 AM, Kirill Yukhin wrote:
> >> > What happened to the bmi andn alternative we discussed?
> > BMI only supported for 4- and 8- byte integers, while
> > kandw - for HI/QI
> >
>
> We're talking about values in registers.  Ignoring the high bits of the andn
> result still produces the correct results.

I've updated patch, adding BMI alternative and clobber of flags:
+(define_insn "kandn<mode>"
+  [(set (match_operand:SWI12 0 "register_operand" "=r,&r,!k")
+       (and:SWI12
+         (not:SWI12
+           (match_operand:SWI12 1 "register_operand" "r,0,k"))
+         (match_operand:SWI12 2 "register_operand" "r,r,k")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512F"
+  "@
+   andn\t{%k2, %k1, %k0|%k0, %k1, %k2}
+   #
+   kandnw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "bitmanip,*,msklog")
+   (set_attr "prefix" "*,*,vex")
+   (set_attr "btver2_decode" "direct,*,*")
+   (set_attr "mode" "<MODE>")])

However, I do not fully understand why we need this.
`kandn' differs from BMI `andn' in the clobbering of the flags register.
So, having such a pattern will make the compiler think that `kandn'
clobbers the flags, which seems to me like an opportunity for
misoptimization, since `kandn' does not actually clobber them.

Anyway, it seems to work.

Testing:
  1. Bootstrap pass
  2. make check shows no regressions
  3. Spec 2000 & 2006 build show no regressions both with and without -mavx512f option
  4. Spec 2000 & 2006 run shows no stability regressions without -mavx512f option

Is it ok?

---
 gcc/config/i386/constraints.md |   8 +-
 gcc/config/i386/i386.c         |  34 ++++-
 gcc/config/i386/i386.h         |  40 ++++--
 gcc/config/i386/i386.md        | 283 ++++++++++++++++++++++++++++++++++-------
 gcc/config/i386/predicates.md  |   9 ++
 5 files changed, 314 insertions(+), 60 deletions(-)
Richard Henderson - Aug. 28, 2013, 5:55 p.m.
On 08/28/2013 10:45 AM, Kirill Yukhin wrote:
> Hello Richard,
> 
> On 27 Aug 13:07, Richard Henderson wrote:
>> On 08/27/2013 11:11 AM, Kirill Yukhin wrote:
>>>>> What happened to the bmi andn alternative we discussed?
>>> BMI only supported for 4- and 8- byte integers, while
>>> kandw - for HI/QI
>>>
>>
>> We're talking about values in registers.  Ignoring the high bits of the andn
>> result still produces the correct results.
> 
> I've updated patch, adding BMI alternative and clobber of flags:
> +(define_insn "kandn<mode>"
> +  [(set (match_operand:SWI12 0 "register_operand" "=r,&r,!k")
> +       (and:SWI12
> +         (not:SWI12
> +           (match_operand:SWI12 1 "register_operand" "r,0,k"))
> +         (match_operand:SWI12 2 "register_operand" "r,r,k")))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "TARGET_AVX512F"
> +  "@
> +   andn\t{%k2, %k1, %k0|%k0, %k1, %k2}
> +   #
> +   kandnw\t{%2, %1, %0|%0, %1, %2}"
> +  [(set_attr "type" "bitmanip,*,msklog")
> +   (set_attr "prefix" "*,*,vex")
> +   (set_attr "btver2_decode" "direct,*,*")
> +   (set_attr "mode" "<MODE>")])
> 
> However I am not fully understand why do we need this.
> `kandn' is different from BMI `andn' in clobbering of flags reg.
> So, having such a pattern we'll make compiler think that `kandn'
> clobber, which seems to me like opportunity to misoptimization as
> far as `kandn' doesn't clobber.

This is no different than ANY OTHER use of the mask logical ops.

When combine puts the AND and the NOT together, we don't know what registers we
want the data in.  If we do not supply the general register alternative, with
the clobber, then we will be FORCED to implement the operation in the mask
registers, even if this operation had nothing to do with actual vector masks.
And it ought to come as no surprise that X & ~Y is a fairly common operation.

I suppose a real question here is how this is written: Does TARGET_AVX512F
imply TARGET_BMI?  If so, then we can eliminate the second alternative.  If
not, then you are missing a set_attr isa to restrict the first alternative.



r~

Patch

diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
index 28e626f..92e0c05 100644
--- a/gcc/config/i386/constraints.md
+++ b/gcc/config/i386/constraints.md
@@ -19,7 +19,7 @@ 
 
 ;;; Unused letters:
 ;;;     B     H           T
-;;;           h jk
+;;;           h j
 
 ;; Integer register constraints.
 ;; It is not necessary to define 'r' here.
@@ -78,6 +78,12 @@ 
  "TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387 ? FP_SECOND_REG : NO_REGS"
  "Second from top of 80387 floating-point stack (@code{%st(1)}).")
 
+(define_register_constraint "k" "TARGET_AVX512F ? MASK_EVEX_REGS : NO_REGS"
+"@internal Any mask register that can be used as predicate, i.e. k1-k7.")
+
+(define_register_constraint "Yk" "TARGET_AVX512F ? MASK_REGS : NO_REGS"
+"@internal Any mask register.")
+
 ;; Vector registers (also used for plain floating point nowadays).
 (define_register_constraint "y" "TARGET_MMX ? MMX_REGS : NO_REGS"
  "Any MMX register.")
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index d05dbf0..8325919 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2032,6 +2032,9 @@  enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
   EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
   EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
   EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
+  /* Mask registers.  */
+  MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
+  MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
 };
 
 /* The "default" register map used in 32bit mode.  */
@@ -2047,6 +2050,7 @@  int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
   -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
   -1, -1, -1, -1, -1, -1, -1, -1,       /* AVX-512 registers 16-23*/
   -1, -1, -1, -1, -1, -1, -1, -1,       /* AVX-512 registers 24-31*/
+  93, 94, 95, 96, 97, 98, 99, 100,      /* Mask registers */
 };
 
 /* The "default" register map used in 64bit mode.  */
@@ -2062,6 +2066,7 @@  int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
   25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
   67, 68, 69, 70, 71, 72, 73, 74,       /* AVX-512 registers 16-23 */
   75, 76, 77, 78, 79, 80, 81, 82,       /* AVX-512 registers 24-31 */
+  118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
 };
 
 /* Define the register numbers to be used in Dwarf debugging information.
@@ -2129,6 +2134,7 @@  int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
   -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
   -1, -1, -1, -1, -1, -1, -1, -1,       /* AVX-512 registers 16-23*/
   -1, -1, -1, -1, -1, -1, -1, -1,       /* AVX-512 registers 24-31*/
+  93, 94, 95, 96, 97, 98, 99, 100,      /* Mask registers */
 };
 
 /* Define parameter passing and return registers.  */
@@ -4219,8 +4225,13 @@  ix86_conditional_register_usage (void)
 
   /* If AVX512F is disabled, squash the registers.  */
   if (! TARGET_AVX512F)
+  {
     for (i = FIRST_EXT_REX_SSE_REG; i < LAST_EXT_REX_SSE_REG; i++)
       fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
+
+    for (i = FIRST_MASK_REG; i < LAST_MASK_REG; i++)
+      fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
+  }
 }
 
 
@@ -33889,10 +33900,12 @@  ix86_preferred_reload_class (rtx x, reg_class_t regclass)
     return regclass;
 
   /* Force constants into memory if we are loading a (nonzero) constant into
-     an MMX or SSE register.  This is because there are no MMX/SSE instructions
-     to load from a constant.  */
+     an MMX, SSE or MASK register.  This is because there are no MMX/SSE/MASK
+     instructions to load from a constant.  */
   if (CONSTANT_P (x)
-      && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
+      && (MAYBE_MMX_CLASS_P (regclass)
+  || MAYBE_SSE_CLASS_P (regclass)
+  || MAYBE_MASK_CLASS_P (regclass)))
     return NO_REGS;
 
   /* Prefer SSE regs only, if we can use them for math.  */
@@ -33996,10 +34009,11 @@  ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
 
   /* QImode spills from non-QI registers require
      intermediate register on 32bit targets.  */
-  if (!TARGET_64BIT
-      && !in_p && mode == QImode
-      && INTEGER_CLASS_P (rclass)
-      && MAYBE_NON_Q_CLASS_P (rclass))
+  if (mode == QImode
+      && (MAYBE_MASK_CLASS_P (rclass)
+  || (!TARGET_64BIT && !in_p
+      && INTEGER_CLASS_P (rclass)
+      && MAYBE_NON_Q_CLASS_P (rclass))))
     {
       int regno;
 
@@ -34421,6 +34435,8 @@  ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
     return false;
   if (STACK_REGNO_P (regno))
     return VALID_FP_MODE_P (mode);
+  if (MASK_REGNO_P (regno))
+    return VALID_MASK_REG_MODE (mode);
   if (SSE_REGNO_P (regno))
     {
       /* We implement the move patterns for all vector modes into and
@@ -35230,6 +35246,10 @@  x86_order_regs_for_local_alloc (void)
    for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
      reg_alloc_order [pos++] = i;
 
+   /* Mask register.  */
+   for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
+     reg_alloc_order [pos++] = i;
+
    /* x87 registers.  */
    if (TARGET_SSE_MATH)
      for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index e820aa6..13572bf2 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -893,7 +893,7 @@  enum target_cpu_default
    eliminated during reloading in favor of either the stack or frame
    pointer.  */
 
-#define FIRST_PSEUDO_REGISTER 69
+#define FIRST_PSEUDO_REGISTER 77
 
 /* Number of hardware registers that go into the DWARF-2 unwind info.
    If not defined, equals FIRST_PSEUDO_REGISTER.  */
@@ -923,7 +923,9 @@  enum target_cpu_default
 /*xmm16,xmm17,xmm18,xmm19,xmm20,xmm21,xmm22,xmm23*/ \
      0,   0,    0,    0,    0,    0,    0,    0, \
 /*xmm24,xmm25,xmm26,xmm27,xmm28,xmm29,xmm30,xmm31*/ \
-     0,   0,    0,    0,    0,    0,    0,    0 }
+     0,   0,    0,    0,    0,    0,    0,    0, \
+/*  k0,  k1, k2, k3, k4, k5, k6, k7*/ \
+     0,  0,   0,  0,  0,  0,  0,  0 }
 
 /* 1 for registers not available across function calls.
    These must include the FIXED_REGISTERS and also any
@@ -955,7 +957,9 @@  enum target_cpu_default
 /*xmm16,xmm17,xmm18,xmm19,xmm20,xmm21,xmm22,xmm23*/ \
      6,    6,     6,    6,    6,    6,    6,    6, \
 /*xmm24,xmm25,xmm26,xmm27,xmm28,xmm29,xmm30,xmm31*/ \
-     6,    6,     6,    6,    6,    6,    6,    6 }
+     6,    6,     6,    6,    6,    6,    6,    6, \
+ /* k0,  k1,  k2,  k3,  k4,  k5,  k6,  k7*/ \
+     1,   1,   1,   1,   1,   1,   1,   1 }
 
 /* Order in which to allocate registers.  Each register must be
    listed once, even those in FIXED_REGISTERS.  List frame pointer
@@ -971,7 +975,7 @@  enum target_cpu_default
    18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, \
    33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,  \
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, \
-   63, 64, 65, 66, 67, 68 }
+   63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76 }
 
 /* ADJUST_REG_ALLOC_ORDER is a macro which permits reg_alloc_order
    to be rearranged based on a particular function.  When using sse math,
@@ -1068,6 +1072,8 @@  enum target_cpu_default
    || (MODE) == V16SImode || (MODE) == V32HImode || (MODE) == V8DFmode \
    || (MODE) == V16SFmode)
 
+#define VALID_MASK_REG_MODE(MODE) ((MODE) == HImode || (MODE) == QImode)
+
 /* Value is 1 if hard register REGNO can hold a value of machine-mode MODE.  */
 
 #define HARD_REGNO_MODE_OK(REGNO, MODE) \
@@ -1093,8 +1099,10 @@  enum target_cpu_default
   (CC_REGNO_P (REGNO) ? VOIDmode \
    : (MODE) == VOIDmode && (NREGS) != 1 ? VOIDmode \
    : (MODE) == VOIDmode ? choose_hard_reg_mode ((REGNO), (NREGS), false) \
-   : (MODE) == HImode && !TARGET_PARTIAL_REG_STALL ? SImode \
-   : (MODE) == QImode && !(TARGET_64BIT || QI_REGNO_P (REGNO)) ? SImode \
+   : (MODE) == HImode && !(TARGET_PARTIAL_REG_STALL \
+   || MASK_REGNO_P (REGNO)) ? SImode \
+   : (MODE) == QImode && !(TARGET_64BIT || QI_REGNO_P (REGNO) \
+   || MASK_REGNO_P (REGNO)) ? SImode \
    : (MODE))
 
 /* The only ABI that saves SSE registers across calls is Win64 (thus no
@@ -1141,6 +1149,9 @@  enum target_cpu_default
 #define FIRST_EXT_REX_SSE_REG  (LAST_REX_SSE_REG + 1) /*53*/
 #define LAST_EXT_REX_SSE_REG   (FIRST_EXT_REX_SSE_REG + 15) /*68*/
 
+#define FIRST_MASK_REG  (LAST_EXT_REX_SSE_REG + 1) /*69*/
+#define LAST_MASK_REG   (FIRST_MASK_REG + 7) /*76*/
+
 /* Override this in other tm.h files to cope with various OS lossage
    requiring a frame pointer.  */
 #ifndef SUBTARGET_FRAME_POINTER_REQUIRED
@@ -1229,6 +1240,8 @@  enum reg_class
   FLOAT_INT_REGS,
   INT_SSE_REGS,
   FLOAT_INT_SSE_REGS,
+  MASK_EVEX_REGS,
+  MASK_REGS,
   ALL_REGS, LIM_REG_CLASSES
 };
 
@@ -1250,6 +1263,8 @@  enum reg_class
   reg_classes_intersect_p ((CLASS), ALL_SSE_REGS)
 #define MAYBE_MMX_CLASS_P(CLASS) \
   reg_classes_intersect_p ((CLASS), MMX_REGS)
+#define MAYBE_MASK_CLASS_P(CLASS) \
+  reg_classes_intersect_p ((CLASS), MASK_REGS)
 
 #define Q_CLASS_P(CLASS) \
   reg_class_subset_p ((CLASS), Q_REGS)
@@ -1282,6 +1297,8 @@  enum reg_class
    "FLOAT_INT_REGS", \
    "INT_SSE_REGS", \
    "FLOAT_INT_SSE_REGS", \
+   "MASK_EVEX_REGS", \
+   "MASK_REGS", \
    "ALL_REGS" }
 
 /* Define which registers fit in which classes.  This is an initializer
@@ -1319,7 +1336,9 @@  enum reg_class
 {   0x11ffff,    0x1fe0,   0x0 },       /* FLOAT_INT_REGS */            \
 { 0x1ff100ff,0xffffffe0,  0x1f },       /* INT_SSE_REGS */              \
 { 0x1ff1ffff,0xffffffe0,  0x1f },       /* FLOAT_INT_SSE_REGS */        \
-{ 0xffffffff,0xffffffff,  0x1f }                                        \
+       { 0x0,       0x0,0x1fc0 },       /* MASK_EVEX_REGS */           \
+       { 0x0,       0x0,0x1fe0 },       /* MASK_REGS */                 \
+{ 0xffffffff,0xffffffff,0x1fff }                                        \
 }
 
 /* The same information, inverted:
@@ -1377,6 +1396,8 @@  enum reg_class
          : (N) <= LAST_REX_SSE_REG ? (FIRST_REX_SSE_REG + (N) - 8) \
                                    : (FIRST_EXT_REX_SSE_REG + (N) - 16))
 
+#define MASK_REGNO_P(N) IN_RANGE ((N), FIRST_MASK_REG, LAST_MASK_REG)
+#define ANY_MASK_REG_P(X) (REG_P (X) && MASK_REGNO_P (REGNO (X)))
 
 #define SSE_FLOAT_MODE_P(MODE) \
   ((TARGET_SSE && (MODE) == SFmode) || (TARGET_SSE2 && (MODE) == DFmode))
@@ -1429,7 +1450,7 @@  enum reg_class
 
 /* Get_secondary_mem widens integral modes to BITS_PER_WORD.
    There is no need to emit full 64 bit move on 64 bit targets
-   for integral modes that can be moved using 32 bit move.  */
+   for integral modes that can be moved using 8 bit move.  */
 #define SECONDARY_MEMORY_NEEDED_MODE(MODE) \
   (GET_MODE_BITSIZE (MODE) < 32 && INTEGRAL_MODE_P (MODE) \
    ? mode_for_size (32, GET_MODE_CLASS (MODE), 0) \
@@ -1933,7 +1954,8 @@  do { \
  "xmm16", "xmm17", "xmm18", "xmm19", \
  "xmm20", "xmm21", "xmm22", "xmm23", \
  "xmm24", "xmm25", "xmm26", "xmm27", \
- "xmm28", "xmm29", "xmm30", "xmm31" }
+ "xmm28", "xmm29", "xmm30", "xmm31", \
+ "k0", "k1", "k2", "k3", "k4", "k5", "k6", "k7" }
 
 #define REGISTER_NAMES HI_REGISTER_NAMES
 
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 3d7533a..4ecfec9 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -328,6 +328,14 @@ 
    (XMM29_REG 66)
    (XMM30_REG 67)
    (XMM31_REG 68)
+   (MASK0_REG 69)
+   (MASK1_REG 70)
+   (MASK2_REG 71)
+   (MASK3_REG 72)
+   (MASK4_REG 73)
+   (MASK5_REG 74)
+   (MASK6_REG 75)
+   (MASK7_REG 76)
   ])
 
 ;; Insns whose names begin with "x86_" are emitted by gen_FOO calls
@@ -360,7 +368,7 @@ 
    sseishft,sseishft1,ssecmp,ssecomi,
    ssecvt,ssecvt1,sseicvt,sseins,
    sseshuf,sseshuf1,ssemuladd,sse4arg,
-   lwp,
+   lwp,mskmov,msklog,
    mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft"
   (const_string "other"))
 
@@ -379,7 +387,7 @@ 
   ssemul,sseimul,ssediv,sselog,sselog1,
   sseishft,sseishft1,ssecmp,ssecomi,
   ssecvt,ssecvt1,sseicvt,sseins,
-  sseshuf,sseshuf1,ssemuladd,sse4arg")
+  sseshuf,sseshuf1,ssemuladd,sse4arg,mskmov")
    (const_string "sse")
  (eq_attr "type" "mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft")
    (const_string "mmx")
@@ -390,7 +398,7 @@ 
 ;; The (bounding maximum) length of an instruction immediate.
 (define_attr "length_immediate" ""
   (cond [(eq_attr "type" "incdec,setcc,icmov,str,lea,other,multi,idiv,leave,
-  bitmanip,imulx")
+  bitmanip,imulx,msklog,mskmov")
    (const_int 0)
  (eq_attr "unit" "i387,sse,mmx")
    (const_int 0)
@@ -451,7 +459,7 @@ 
 ;; Set when 0f opcode prefix is used.
 (define_attr "prefix_0f" ""
   (if_then_else
-    (ior (eq_attr "type" "imovx,setcc,icmov,bitmanip")
+    (ior (eq_attr "type" "imovx,setcc,icmov,bitmanip,msklog,mskmov")
  (eq_attr "unit" "sse,mmx"))
     (const_int 1)
     (const_int 0)))
@@ -651,7 +659,7 @@ 
    fmov,fcmp,fsgn,
    sse,ssemov,ssecmp,ssecomi,ssecvt,ssecvt1,sseicvt,
    sselog1,sseshuf1,sseadd1,sseiadd1,sseishft1,
-   mmx,mmxmov,mmxcmp,mmxcvt")
+   mmx,mmxmov,mmxcmp,mmxcvt,mskmov,msklog")
       (match_operand 2 "memory_operand"))
    (const_string "load")
  (and (eq_attr "type" "icmov,ssemuladd,sse4arg")
@@ -2213,8 +2221,8 @@ 
    (const_string "SI")))])
 
 (define_insn "*movhi_internal"
-  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m")
- (match_operand:HI 1 "general_operand"   "r ,rn,rm,rn"))]
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,Yk,Yk,rm")
+ (match_operand:HI 1 "general_operand"      "r ,rn,rm,rn,rm,Yk,Yk"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   switch (get_attr_type (insn))
@@ -2223,6 +2231,16 @@ 
       /* movzwl is faster than movw on p2 due to partial word stalls,
  though not as fast as an aligned movl.  */
       return "movz{wl|x}\t{%1, %k0|%k0, %1}";
+
+    case TYPE_MSKMOV:
+      switch (which_alternative)
+        {
+ case 4: return "kmovw\t{%k1, %0|%0, %k1}";
+ case 5: return "kmovw\t{%1, %0|%0, %1}";
+ case 6: return "kmovw\t{%1, %k0|%k0, %1}";
+ default: gcc_unreachable ();
+ }
+
     default:
       if (get_attr_mode (insn) == MODE_SI)
         return "mov{l}\t{%k1, %k0|%k0, %k1}";
@@ -2240,11 +2258,17 @@ 
     (and (eq_attr "alternative" "1,2")
  (match_operand:HI 1 "aligned_operand"))
       (const_string "imov")
+    (eq_attr "alternative" "4,5,6")
+      (const_string "mskmov")
     (and (match_test "TARGET_MOVX")
  (eq_attr "alternative" "0,2"))
       (const_string "imovx")
    ]
    (const_string "imov")))
+    (set (attr "prefix")
+      (if_then_else (eq_attr "alternative" "4,5,6")
+ (const_string "vex")
+ (const_string "orig")))
     (set (attr "mode")
       (cond [(eq_attr "type" "imovx")
        (const_string "SI")
@@ -2269,8 +2293,8 @@ 
 ;; register stall machines with, where we use QImode instructions, since
 ;; partial register stall can be caused there.  Then we use movzx.
 (define_insn "*movqi_internal"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=q,q ,q ,r,r ,?r,m")
- (match_operand:QI 1 "general_operand"      "q ,qn,qm,q,rn,qm,qn"))]
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=q,q ,q ,r,r ,?r,m ,Yk,Yk,r")
+ (match_operand:QI 1 "general_operand"      "q ,qn,qm,q,rn,qm,qn,r ,Yk,Yk"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   switch (get_attr_type (insn))
@@ -2278,6 +2302,16 @@ 
     case TYPE_IMOVX:
       gcc_assert (ANY_QI_REG_P (operands[1]) || MEM_P (operands[1]));
       return "movz{bl|x}\t{%1, %k0|%k0, %1}";
+
+    case TYPE_MSKMOV:
+      switch (which_alternative)
+        {
+ case 7: return "kmovw\t{%k1, %0|%0, %k1}";
+ case 8: return "kmovw\t{%1, %0|%0, %1}";
+ case 9: return "kmovw\t{%1, %k0|%k0, %1}";
+ default: gcc_unreachable ();
+ }
+
     default:
       if (get_attr_mode (insn) == MODE_SI)
         return "mov{l}\t{%k1, %k0|%k0, %k1}";
@@ -2297,11 +2331,17 @@ 
       (const_string "imov")
     (eq_attr "alternative" "3,5")
       (const_string "imovx")
+    (eq_attr "alternative" "7,8,9")
+      (const_string "mskmov")
     (and (match_test "TARGET_MOVX")
  (eq_attr "alternative" "2"))
       (const_string "imovx")
    ]
    (const_string "imov")))
+   (set (attr "prefix")
+     (if_then_else (eq_attr "alternative" "7,8,9")
+       (const_string "vex")
+       (const_string "orig")))
    (set (attr "mode")
       (cond [(eq_attr "alternative" "3,4,5")
        (const_string "SI")
@@ -7494,6 +7534,26 @@ 
   operands[3] = gen_lowpart (QImode, operands[3]);
 })
 
+(define_split
+  [(set (match_operand:SWI12 0 "mask_reg_operand")
+ (any_logic:SWI12 (match_operand:SWI12 1 "mask_reg_operand")
+ (match_operand:SWI12 2 "mask_reg_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512F && reload_completed"
+  [(set (match_dup 0)
+ (any_logic:SWI12 (match_dup 1)
+ (match_dup 2)))])
+
+(define_insn "*k<logic><mode>"
+  [(set (match_operand:SWI12 0 "mask_reg_operand" "=Yk")
+ (any_logic:SWI12 (match_operand:SWI12 1 "mask_reg_operand" "Yk")
+ (match_operand:SWI12 2 "mask_reg_operand" "Yk")))]
+  "TARGET_AVX512F && reload_completed"
+  "k<logic>w\t{%2, %1, %0|%0, %1, %2}";
+  [(set_attr "mode" "<MODE>")
+   (set_attr "type" "msklog")
+   (set_attr "prefix" "vex")])
+
 ;; %%% This used to optimize known byte-wide and operations to memory,
 ;; and sometimes to QImode registers.  If this is considered useful,
 ;; it should be done with splitters.
@@ -7616,10 +7676,10 @@ 
   [(set_attr "type" "alu")
    (set_attr "mode" "SI")])
 
-(define_insn "*andhi_1"
-  [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,r,Ya")
- (and:HI (match_operand:HI 1 "nonimmediate_operand" "%0,0,qm")
- (match_operand:HI 2 "general_operand" "rn,rm,L")))
+(define_insn "andhi_1"
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,r,Ya,!Yk")
+ (and:HI (match_operand:HI 1 "nonimmediate_operand" "%0,0,qm,Yk")
+ (match_operand:HI 2 "general_operand" "rn,rm,L,Yk")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (AND, HImode, operands)"
 {
@@ -7628,34 +7688,38 @@ 
     case TYPE_IMOVX:
       return "#";
 
+    case TYPE_MSKLOG:
+      return "kandw\t{%2, %1, %0|%0, %1, %2}";
+
     default:
       gcc_assert (rtx_equal_p (operands[0], operands[1]));
       return "and{w}\t{%2, %0|%0, %2}";
     }
 }
-  [(set_attr "type" "alu,alu,imovx")
-   (set_attr "length_immediate" "*,*,0")
+  [(set_attr "type" "alu,alu,imovx,msklog")
+   (set_attr "length_immediate" "*,*,0,*")
    (set (attr "prefix_rex")
      (if_then_else
        (and (eq_attr "type" "imovx")
     (match_operand 1 "ext_QIreg_operand"))
        (const_string "1")
        (const_string "*")))
-   (set_attr "mode" "HI,HI,SI")])
+   (set_attr "mode" "HI,HI,SI,HI")])
 
 ;; %%% Potential partial reg stall on alternative 2.  What to do?
 (define_insn "*andqi_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r")
- (and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0")
- (match_operand:QI 2 "general_operand" "qn,qmn,rn")))
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,!Yk")
+ (and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,Yk")
+ (match_operand:QI 2 "general_operand" "qn,qmn,rn,Yk")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (AND, QImode, operands)"
   "@
    and{b}\t{%2, %0|%0, %2}
    and{b}\t{%2, %0|%0, %2}
-   and{l}\t{%k2, %k0|%k0, %k2}"
-  [(set_attr "type" "alu")
-   (set_attr "mode" "QI,QI,SI")])
+   and{l}\t{%k2, %k0|%k0, %k2}
+   kandw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "alu,alu,alu,msklog")
+   (set_attr "mode" "QI,QI,SI,HI")])
 
 (define_insn "*andqi_1_slp"
   [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm,q"))
@@ -7668,6 +7732,38 @@ 
   [(set_attr "type" "alu1")
    (set_attr "mode" "QI")])
 
+(define_insn "kandn<mode>"
+  [(set (match_operand:SWI12 0 "register_operand" "=r,&r,!k")
+ (and:SWI12
+  (not:SWI12
+    (match_operand:SWI12 1 "register_operand" "r,0,k"))
+  (match_operand:SWI12 2 "register_operand" "r,r,k")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512F"
+  "@
+   andn\t{%k2, %k1, %k0|%k0, %k1, %k2}
+   #
+   kandnw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "bitmanip,*,msklog")
+   (set_attr "prefix" "*,*,vex")
+   (set_attr "btver2_decode" "direct,*,*")
+   (set_attr "mode" "<MODE>")])
+
+(define_split
+  [(set (match_operand:SWI12 0 "general_reg_operand")
+ (and:SWI12
+  (not:SWI12
+    (match_dup 0))
+  (match_operand:SWI12 1 "general_reg_operand")))]
+  "TARGET_AVX512F && !TARGET_BMI"
+  [(set (match_dup 0)
+ (not:HI (match_dup 0)))
+   (parallel [(set (match_dup 0)
+   (and:HI (match_dup 0)
+   (match_dup 1)))
+      (clobber (reg:CC FLAGS_REG))])]
+  "")
+
 ;; Turn *anddi_1 into *andsi_1_zext if possible.
 (define_split
   [(set (match_operand:DI 0 "register_operand")
@@ -7999,29 +8095,44 @@ 
   "ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); DONE;")
 
 (define_insn "*<code><mode>_1"
-  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=r,rm")
- (any_or:SWI248
- (match_operand:SWI248 1 "nonimmediate_operand" "%0,0")
- (match_operand:SWI248 2 "<general_operand>" "<g>,r<i>")))
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=r,rm")
+ (any_or:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "%0,0")
+ (match_operand:SWI48 2 "<general_operand>" "<g>,r<i>")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
   "<logic>{<imodesuffix>}\t{%2, %0|%0, %2}"
   [(set_attr "type" "alu")
    (set_attr "mode" "<MODE>")])
 
+(define_insn "<code>hi_1"
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,rm,!Yk")
+ (any_or:HI
+ (match_operand:HI 1 "nonimmediate_operand" "%0,0,Yk")
+ (match_operand:HI 2 "general_operand" "<g>,r<i>,Yk")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (<CODE>, HImode, operands)"
+  "@
+  <logic>{w}\t{%2, %0|%0, %2}
+  <logic>{w}\t{%2, %0|%0, %2}
+  k<logic>w\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "alu,alu,msklog")
+   (set_attr "mode" "HI")])
+
 ;; %%% Potential partial reg stall on alternative 2.  What to do?
 (define_insn "*<code>qi_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=q,m,r")
- (any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0")
-   (match_operand:QI 2 "general_operand" "qmn,qn,rn")))
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=q,m,r,!Yk")
+ (any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,Yk")
+   (match_operand:QI 2 "general_operand" "qmn,qn,rn,Yk")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (<CODE>, QImode, operands)"
   "@
    <logic>{b}\t{%2, %0|%0, %2}
    <logic>{b}\t{%2, %0|%0, %2}
-   <logic>{l}\t{%k2, %k0|%k0, %k2}"
-  [(set_attr "type" "alu")
-   (set_attr "mode" "QI,QI,SI")])
+   <logic>{l}\t{%k2, %k0|%k0, %k2}
+   k<logic>w\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "alu,alu,alu,msklog")
+   (set_attr "mode" "QI,QI,SI,HI")])
 
 ;; See comment for addsi_1_zext why we do use nonimmediate_operand
 (define_insn "*<code>si_1_zext"
@@ -8071,6 +8182,74 @@ 
   [(set_attr "type" "alu")
    (set_attr "mode" "<MODE>")])
 
+(define_insn "kxnor<mode>"
+  [(set (match_operand:SWI12 0 "register_operand" "=r,!k")
+ (not:SWI12
+  (xor:SWI12
+    (match_operand:SWI12 1 "register_operand" "0,k")
+    (match_operand:SWI12 2 "register_operand" "r,k"))))]
+  "TARGET_AVX512F"
+  "@
+   #
+   kxnorw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "*,msklog")
+   (set_attr "prefix" "*,vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_split
+  [(set (match_operand:SWI12 0 "register_operand")
+ (not:SWI12
+  (xor:SWI12
+    (match_dup 0)
+    (match_operand:SWI12 1 "register_operand"))))]
+  "TARGET_AVX512F && !ANY_MASK_REG_P (operands [0])"
+   [(parallel [(set (match_dup 0)
+    (xor:HI (match_dup 0)
+    (match_dup 1)))
+       (clobber (reg:CC FLAGS_REG))])
+    (set (match_dup 0)
+ (not:HI (match_dup 0)))]
+  "")
+
+(define_insn "kortestzhi"
+  [(set (reg:CCZ FLAGS_REG)
+ (compare:CCZ
+  (ior:HI
+    (match_operand:HI 0 "register_operand" "%Yk")
+    (match_operand:HI 1 "register_operand" "Yk"))
+  (const_int 0)))]
+  "TARGET_AVX512F && ix86_match_ccmode (insn, CCZmode)"
+  "kortestw\t{%1, %0|%0, %1}"
+  [(set_attr "mode" "HI")
+   (set_attr "type" "msklog")
+   (set_attr "prefix" "vex")])
+
+(define_insn "kortestchi"
+  [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+  (ior:HI
+    (match_operand:HI 0 "register_operand" "%Yk")
+    (match_operand:HI 1 "register_operand" "Yk"))
+  (const_int -1)))]
+  "TARGET_AVX512F && ix86_match_ccmode (insn, CCCmode)"
+  "kortestw\t{%1, %0|%0, %1}"
+  [(set_attr "mode" "HI")
+   (set_attr "type" "msklog")
+   (set_attr "prefix" "vex")])
+
+(define_insn "kunpckhi"
+  [(set (match_operand:HI 0 "register_operand" "=Yk")
+ (ior:HI
+  (ashift:HI
+    (match_operand:HI 1 "register_operand" "Yk")
+    (const_int 8))
+  (zero_extend:HI (match_operand:QI 2 "register_operand" "Yk"))))]
+  "TARGET_AVX512F"
+  "kunpckbw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "mode" "HI")
+   (set_attr "type" "msklog")
+   (set_attr "prefix" "vex")])
+
 ;; See comment for addsi_1_zext why we do use nonimmediate_operand
 ;; ??? Special case for immediate operand is missing - it is tricky.
 (define_insn "*<code>si_2_zext"
@@ -8640,23 +8819,38 @@ 
   "ix86_expand_unary_operator (NOT, <MODE>mode, operands); DONE;")
 
 (define_insn "*one_cmpl<mode>2_1"
-  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm")
- (not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0")))]
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm")
+ (not:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "0")))]
   "ix86_unary_operator_ok (NOT, <MODE>mode, operands)"
   "not{<imodesuffix>}\t%0"
   [(set_attr "type" "negnot")
    (set_attr "mode" "<MODE>")])
 
+(define_insn "*one_cmplhi2_1"
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,!Yk")
+ (not:HI (match_operand:HI 1 "nonimmediate_operand" "0,Yk")))]
+  "ix86_unary_operator_ok (NOT, HImode, operands)"
+  "@
+   not{w}\t%0
+   knotw\t{%1, %0|%0, %1}"
+  [(set_attr "isa" "*,avx512f")
+   (set_attr "type" "negnot,msklog")
+   (set_attr "prefix" "*,vex")
+   (set_attr "mode" "HI")])
+
 ;; %%% Potential partial reg stall on alternative 1.  What to do?
 (define_insn "*one_cmplqi2_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r")
- (not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0")))]
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,!Yk")
+ (not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,Yk")))]
   "ix86_unary_operator_ok (NOT, QImode, operands)"
   "@
    not{b}\t%0
-   not{l}\t%k0"
-  [(set_attr "type" "negnot")
-   (set_attr "mode" "QI,SI")])
+   not{l}\t%k0
+   knotw\t{%1, %0|%0, %1}"
+  [(set_attr "isa" "*,*,avx512f")
+   (set_attr "type" "negnot,negnot,msklog")
+   (set_attr "prefix" "*,*,vex")
+   (set_attr "mode" "QI,SI,QI")])
 
 ;; ??? Currently never generated - xor is used instead.
 (define_insn "*one_cmplsi2_1_zext"
@@ -16380,11 +16574,11 @@ 
 })
 
 ;; Avoid redundant prefixes by splitting HImode arithmetic to SImode.
-
+;; Do not split instructions with mask registers.
 (define_split
-  [(set (match_operand 0 "register_operand")
+  [(set (match_operand 0 "general_reg_operand")
  (match_operator 3 "promotable_binary_operator"
-   [(match_operand 1 "register_operand")
+   [(match_operand 1 "general_reg_operand")
     (match_operand 2 "aligned_operand")]))
    (clobber (reg:CC FLAGS_REG))]
   "! TARGET_PARTIAL_REG_STALL && reload_completed
@@ -16479,6 +16673,7 @@ 
   operands[1] = gen_lowpart (SImode, operands[1]);
 })
 
+;; Do not split instructions with mask regs.
 (define_split
   [(set (match_operand 0 "register_operand")
  (not (match_operand 1 "register_operand")))]
@@ -16486,7 +16681,9 @@ 
    && (GET_MODE (operands[0]) == HImode
        || (GET_MODE (operands[0]) == QImode
    && (TARGET_PROMOTE_QImode
-       || optimize_insn_for_size_p ())))"
+       || optimize_insn_for_size_p ())))
+   && (! ANY_MASK_REG_P (operands[0])
+ || ! ANY_MASK_REG_P (operands[1]))"
   [(set (match_dup 0)
  (not:SI (match_dup 1)))]
 {
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 3959c38..18f425c 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -32,6 +32,11 @@ 
   (and (match_code "reg")
        (not (match_test "ANY_FP_REGNO_P (REGNO (op))"))))
 
+;; True if the operand is a GENERAL class register.
+(define_predicate "general_reg_operand"
+  (and (match_code "reg")
+       (match_test "GENERAL_REG_P (op)")))
+
 ;; Return true if OP is a register operand other than an i387 fp register.
 (define_predicate "register_and_not_fp_reg_operand"
   (and (match_code "reg")
@@ -52,6 +57,10 @@ 
   (and (match_code "reg")
        (match_test "EXT_REX_SSE_REGNO_P (REGNO (op))")))
 
+;; True if the operand is an AVX-512 mask register.
+(define_predicate "mask_reg_operand"
+  (and (match_code "reg")
+       (match_test "MASK_REGNO_P (REGNO (op))")))
 
 ;; True if the operand is a Q_REGS class register.
 (define_predicate "q_regs_operand"