Patchwork [i386,2/8,AVX512] Add mask registers.

login
register
mail settings
Submitter Kirill Yukhin
Date Aug. 28, 2013, 6:38 p.m.
Message ID <20130828183805.GA14518@msticlxl57.ims.intel.com>
Download mbox | patch
Permalink /patch/270616/
State New
Headers show

Comments

Kirill Yukhin - Aug. 28, 2013, 6:38 p.m.
Hello Richard,

On 28 Aug 10:55, Richard Henderson wrote:
> On 08/28/2013 10:45 AM, Kirill Yukhin wrote:
> > Hello Richard,
> > 
> > On 27 Aug 13:07, Richard Henderson wrote:
> >> On 08/27/2013 11:11 AM, Kirill Yukhin wrote:
> >>>>> What happened to the bmi andn alternative we discussed?
> >>> BMI only supported for 4- and 8- byte integers, while
> >>> kandw - for HI/QI
> >>>
> >>
> >> We're talking about values in registers.  Ignoring the high bits of the andn
> >> result still produces the correct results.
> > 
> > However I am not fully understand why do we need this.
> > `kandn' is different from BMI `andn' in clobbering of flags reg.
> > So, having such a pattern we'll make compiler think that `kandn'
> > clobber, which seems to me like opportunity to misoptimization as
> > far as `kandn' doesn't clobber.
> 
> This is no different than ANY OTHER use of the mask logical ops.
> 
> When combine puts the AND and the NOT together, we don't know what registers we
> want the data in.  If we do not supply the general register alternative, with
> the clobber, then we will be FORCED to implement the operation in the mask
> registers, even if this operation had nothing to do with actual vector masks.
> And it ought to come as no surprise that X & ~Y is a fairly common operation.
I agree with all of that. But why put in the BMI alternative as well? Without it
we may have this pattern w/o clobber and add it when doing the split for the GPR constraint.
I am just thinking that the presence of the flags clobber in the `kandn' pattern is not good from
an optimization point of view. Anyway, I don't think this is a big deal...

> I suppose a real question here in how this is written: Does TARGET_AVX512F
> imply TARGET_BMI?  If so, then we can eliminate the second alternative.  If
> not, then you are missing an set_attr isa to restrict the first alternative.

I think that it should be possible to use AVX-512F w/o BMI, so I've added
new isa attribute "bmi". I am testing previous patch with that change:
@@ -703,7 +703,7 @@
 ;; Used to control the "enabled" attribute on a per-instruction basis.
 (define_attr "isa" "base,x64,x64_sse4,x64_sse4_noavx,x64_avx,nox64,
 		    sse2,sse2_noavx,sse3,sse4,sse4_noavx,avx,noavx,
-		    avx2,noavx2,bmi2,fma4,fma,avx512f,noavx512f,fma_avx512f"
+		    avx2,noavx2,bmi,bmi2,fma4,fma,avx512f,noavx512f,fma_avx512f"
   (const_string "base"))
 
 (define_attr "enabled" ""
@@ -726,6 +726,7 @@
 	 (eq_attr "isa" "noavx") (symbol_ref "!TARGET_AVX")
 	 (eq_attr "isa" "avx2") (symbol_ref "TARGET_AVX2")
 	 (eq_attr "isa" "noavx2") (symbol_ref "!TARGET_AVX2")
+	 (eq_attr "isa" "bmi") (symbol_ref "TARGET_BMI")
 	 (eq_attr "isa" "bmi2") (symbol_ref "TARGET_BMI2")
 	 (eq_attr "isa" "fma4") (symbol_ref "TARGET_FMA4")
 	 (eq_attr "isa" "fma") (symbol_ref "TARGET_FMA")
@@ -7744,7 +7745,8 @@
    andn\t{%k2, %k1, %k0|%k0, %k1, %k2}
    #
    kandnw\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "bitmanip,*,msklog")
+  [(set_attr "isa" "bmi,*,avx512f")
+   (set_attr "type" "bitmanip,*,msklog")
    (set_attr "prefix" "*,*,vex")
    (set_attr "btver2_decode" "direct,*,*")
    (set_attr "mode" "<MODE>")])

Full patch below.

Ok if testing passes?

Thanks, K

---
 gcc/config/i386/constraints.md |   8 +-
 gcc/config/i386/i386.c         |  34 ++++-
 gcc/config/i386/i386.h         |  40 ++++--
 gcc/config/i386/i386.md        | 287 ++++++++++++++++++++++++++++++++++-------
 gcc/config/i386/predicates.md  |   9 ++
 5 files changed, 317 insertions(+), 61 deletions(-)
Richard Henderson - Aug. 28, 2013, 8:17 p.m.
On 08/28/2013 11:38 AM, Kirill Yukhin wrote:
>> When combine puts the AND and the NOT together, we don't know what registers we
>> want the data in.  If we do not supply the general register alternative, with
>> the clobber, then we will be FORCED to implement the operation in the mask
>> registers, even if this operation had nothing to do with actual vector masks.
>> And it ought to come as no surprise that X & ~Y is a fairly common operation.
> I agree with all of that. But why to put in BMI alternative as well? Without it
> me may have this pattern w/o clobber and add it when doing split for GPR constraint.

Uh, no, you can't just add it when doing the split.  You could be adding it in
a place that the flags register is live.  You must ALWAYS have the clobber on
the whole pattern when gprs are possible.

> @@ -4219,8 +4225,13 @@ ix86_conditional_register_usage (void)
>  
>    /* If AVX512F is disabled, squash the registers.  */
>    if (! TARGET_AVX512F)
> +  {
>      for (i = FIRST_EXT_REX_SSE_REG; i < LAST_EXT_REX_SSE_REG; i++)
>        fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
> +
> +    for (i = FIRST_MASK_REG; i < LAST_MASK_REG; i++)
> +      fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
> +  }

Fix the indentation.

> @@ -1429,7 +1450,7 @@ enum reg_class
>  
>  /* Get_secondary_mem widens integral modes to BITS_PER_WORD.
>     There is no need to emit full 64 bit move on 64 bit targets
> -   for integral modes that can be moved using 32 bit move.  */
> +   for integral modes that can be moved using 8 bit move.  */
>  #define SECONDARY_MEMORY_NEEDED_MODE(MODE)			\
>    (GET_MODE_BITSIZE (MODE) < 32 && INTEGRAL_MODE_P (MODE)	\
>     ? mode_for_size (32, GET_MODE_CLASS (MODE), 0)		\

Spurious comment change.

> +(define_insn "kandn<mode>"
> +  [(set (match_operand:SWI12 0 "register_operand" "=r,&r,!k")
> +	(and:SWI12
> +	  (not:SWI12
> +	    (match_operand:SWI12 1 "register_operand" "r,0,k"))
> +	  (match_operand:SWI12 2 "register_operand" "r,r,k")))
> +   (clobber (reg:CC FLAGS_REG))]

Yk not k?

> +(define_insn "kxnor<mode>"
> +  [(set (match_operand:SWI12 0 "register_operand" "=r,!k")
> +	(not:SWI12
> +	  (xor:SWI12
> +	    (match_operand:SWI12 1 "register_operand" "0,k")
> +	    (match_operand:SWI12 2 "register_operand" "r,k"))))]
> +  "TARGET_AVX512F"
> +  "@
> +   #
> +   kxnorw\t{%2, %1, %0|%0, %1, %2}"
> +  [(set_attr "type" "*,msklog")
> +   (set_attr "prefix" "*,vex")
> +   (set_attr "mode" "<MODE>")])

Likewise.

> +(define_split
> +  [(set (match_operand:SWI12 0 "register_operand")
> +	(not:SWI12
> +	  (xor:SWI12
> +	    (match_dup 0)
> +	    (match_operand:SWI12 1 "register_operand"))))]
> +  "TARGET_AVX512F && !ANY_MASK_REG_P (operands [0])"
> +   [(parallel [(set (match_dup 0)
> +		    (xor:HI (match_dup 0)
> +			    (match_dup 1)))
> +	       (clobber (reg:CC FLAGS_REG))])
> +    (set (match_dup 0)
> +	 (not:HI (match_dup 0)))]
> +  "")

general_reg_operand.

> +(define_insn "kortestzhi"
> +  [(set (reg:CCZ FLAGS_REG)
> +	(compare:CCZ
> +	  (ior:HI
> +	    (match_operand:HI 0 "register_operand" "%Yk")
> +	    (match_operand:HI 1 "register_operand" "Yk"))

Omit the %; the two operands are identical.

> +(define_insn "kortestchi"
> +  [(set (reg:CCC FLAGS_REG)
> +	(compare:CCC
> +	  (ior:HI
> +	    (match_operand:HI 0 "register_operand" "%Yk")
> +	    (match_operand:HI 1 "register_operand" "Yk"))

Likewise.

> +;; Do not split instructions with mask regs.
>  (define_split
>    [(set (match_operand 0 "register_operand")
>  	(not (match_operand 1 "register_operand")))]
> @@ -16486,7 +16683,9 @@
>     && (GET_MODE (operands[0]) == HImode
>         || (GET_MODE (operands[0]) == QImode
>  	   && (TARGET_PROMOTE_QImode
> -	       || optimize_insn_for_size_p ())))"
> +	       || optimize_insn_for_size_p ())))
> +   && (! ANY_MASK_REG_P (operands[0])
> +	 || ! ANY_MASK_REG_P (operands[1]))"
>    [(set (match_dup 0)
>  	(not:SI (match_dup 1)))]

general_reg_operand.


r~

Patch

diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
index 28e626f..92e0c05 100644
--- a/gcc/config/i386/constraints.md
+++ b/gcc/config/i386/constraints.md
@@ -19,7 +19,7 @@ 
 
 ;;; Unused letters:
 ;;;     B     H           T
-;;;           h jk
+;;;           h j
 
 ;; Integer register constraints.
 ;; It is not necessary to define 'r' here.
@@ -78,6 +78,12 @@ 
  "TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387 ? FP_SECOND_REG : NO_REGS"
  "Second from top of 80387 floating-point stack (@code{%st(1)}).")
 
+(define_register_constraint "k" "TARGET_AVX512F ? MASK_EVEX_REGS : NO_REGS"
+"@internal Any mask register that can be used as predicate, i.e. k1-k7.")
+
+(define_register_constraint "Yk" "TARGET_AVX512F ? MASK_REGS : NO_REGS"
+"@internal Any mask register.")
+
 ;; Vector registers (also used for plain floating point nowadays).
 (define_register_constraint "y" "TARGET_MMX ? MMX_REGS : NO_REGS"
  "Any MMX register.")
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index d05dbf0..8325919 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2032,6 +2032,9 @@  enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
   EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
   EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
   EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
+  /* Mask registers.  */
+  MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
+  MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
 };
 
 /* The "default" register map used in 32bit mode.  */
@@ -2047,6 +2050,7 @@  int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
   -1, -1, -1, -1, -1, -1, -1, -1,	/* extended SSE registers */
   -1, -1, -1, -1, -1, -1, -1, -1,       /* AVX-512 registers 16-23*/
   -1, -1, -1, -1, -1, -1, -1, -1,       /* AVX-512 registers 24-31*/
+  93, 94, 95, 96, 97, 98, 99, 100,      /* Mask registers */
 };
 
 /* The "default" register map used in 64bit mode.  */
@@ -2062,6 +2066,7 @@  int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
   25, 26, 27, 28, 29, 30, 31, 32,	/* extended SSE registers */
   67, 68, 69, 70, 71, 72, 73, 74,       /* AVX-512 registers 16-23 */
   75, 76, 77, 78, 79, 80, 81, 82,       /* AVX-512 registers 24-31 */
+  118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
 };
 
 /* Define the register numbers to be used in Dwarf debugging information.
@@ -2129,6 +2134,7 @@  int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
   -1, -1, -1, -1, -1, -1, -1, -1,	/* extended SSE registers */
   -1, -1, -1, -1, -1, -1, -1, -1,       /* AVX-512 registers 16-23*/
   -1, -1, -1, -1, -1, -1, -1, -1,       /* AVX-512 registers 24-31*/
+  93, 94, 95, 96, 97, 98, 99, 100,      /* Mask registers */
 };
 
 /* Define parameter passing and return registers.  */
@@ -4219,8 +4225,13 @@  ix86_conditional_register_usage (void)
 
   /* If AVX512F is disabled, squash the registers.  */
   if (! TARGET_AVX512F)
+  {
     for (i = FIRST_EXT_REX_SSE_REG; i < LAST_EXT_REX_SSE_REG; i++)
       fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
+
+    for (i = FIRST_MASK_REG; i < LAST_MASK_REG; i++)
+      fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
+  }
 }
 
 
@@ -33889,10 +33900,12 @@  ix86_preferred_reload_class (rtx x, reg_class_t regclass)
     return regclass;
 
   /* Force constants into memory if we are loading a (nonzero) constant into
-     an MMX or SSE register.  This is because there are no MMX/SSE instructions
-     to load from a constant.  */
+     an MMX, SSE or MASK register.  This is because there are no MMX/SSE/MASK
+     instructions to load from a constant.  */
   if (CONSTANT_P (x)
-      && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
+      && (MAYBE_MMX_CLASS_P (regclass)
+	  || MAYBE_SSE_CLASS_P (regclass)
+	  || MAYBE_MASK_CLASS_P (regclass)))
     return NO_REGS;
 
   /* Prefer SSE regs only, if we can use them for math.  */
@@ -33996,10 +34009,11 @@  ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
 
   /* QImode spills from non-QI registers require
      intermediate register on 32bit targets.  */
-  if (!TARGET_64BIT
-      && !in_p && mode == QImode
-      && INTEGER_CLASS_P (rclass)
-      && MAYBE_NON_Q_CLASS_P (rclass))
+  if (mode == QImode
+      && (MAYBE_MASK_CLASS_P (rclass)
+	  || (!TARGET_64BIT && !in_p
+	      && INTEGER_CLASS_P (rclass)
+	      && MAYBE_NON_Q_CLASS_P (rclass))))
     {
       int regno;
 
@@ -34421,6 +34435,8 @@  ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
     return false;
   if (STACK_REGNO_P (regno))
     return VALID_FP_MODE_P (mode);
+  if (MASK_REGNO_P (regno))
+    return VALID_MASK_REG_MODE (mode);
   if (SSE_REGNO_P (regno))
     {
       /* We implement the move patterns for all vector modes into and
@@ -35230,6 +35246,10 @@  x86_order_regs_for_local_alloc (void)
    for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
      reg_alloc_order [pos++] = i;
 
+   /* Mask register.  */
+   for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
+     reg_alloc_order [pos++] = i;
+
    /* x87 registers.  */
    if (TARGET_SSE_MATH)
      for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index e820aa6..13572bf2 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -893,7 +893,7 @@  enum target_cpu_default
    eliminated during reloading in favor of either the stack or frame
    pointer.  */
 
-#define FIRST_PSEUDO_REGISTER 69
+#define FIRST_PSEUDO_REGISTER 77
 
 /* Number of hardware registers that go into the DWARF-2 unwind info.
    If not defined, equals FIRST_PSEUDO_REGISTER.  */
@@ -923,7 +923,9 @@  enum target_cpu_default
 /*xmm16,xmm17,xmm18,xmm19,xmm20,xmm21,xmm22,xmm23*/		\
      0,   0,    0,    0,    0,    0,    0,    0,		\
 /*xmm24,xmm25,xmm26,xmm27,xmm28,xmm29,xmm30,xmm31*/		\
-     0,   0,    0,    0,    0,    0,    0,    0 }
+     0,   0,    0,    0,    0,    0,    0,    0,		\
+/*  k0,  k1, k2, k3, k4, k5, k6, k7*/				\
+     0,  0,   0,  0,  0,  0,  0,  0 }
 
 /* 1 for registers not available across function calls.
    These must include the FIXED_REGISTERS and also any
@@ -955,7 +957,9 @@  enum target_cpu_default
 /*xmm16,xmm17,xmm18,xmm19,xmm20,xmm21,xmm22,xmm23*/		\
      6,    6,     6,    6,    6,    6,    6,    6,		\
 /*xmm24,xmm25,xmm26,xmm27,xmm28,xmm29,xmm30,xmm31*/		\
-     6,    6,     6,    6,    6,    6,    6,    6 }
+     6,    6,     6,    6,    6,    6,    6,    6,		\
+ /* k0,  k1,  k2,  k3,  k4,  k5,  k6,  k7*/			\
+     1,   1,   1,   1,   1,   1,   1,   1 }
 
 /* Order in which to allocate registers.  Each register must be
    listed once, even those in FIXED_REGISTERS.  List frame pointer
@@ -971,7 +975,7 @@  enum target_cpu_default
    18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,	\
    33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,  \
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,	\
-   63, 64, 65, 66, 67, 68 }
+   63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76 }
 
 /* ADJUST_REG_ALLOC_ORDER is a macro which permits reg_alloc_order
    to be rearranged based on a particular function.  When using sse math,
@@ -1068,6 +1072,8 @@  enum target_cpu_default
    || (MODE) == V16SImode || (MODE) == V32HImode || (MODE) == V8DFmode	\
    || (MODE) == V16SFmode)
 
+#define VALID_MASK_REG_MODE(MODE) ((MODE) == HImode || (MODE) == QImode)
+
 /* Value is 1 if hard register REGNO can hold a value of machine-mode MODE.  */
 
 #define HARD_REGNO_MODE_OK(REGNO, MODE)	\
@@ -1093,8 +1099,10 @@  enum target_cpu_default
   (CC_REGNO_P (REGNO) ? VOIDmode					\
    : (MODE) == VOIDmode && (NREGS) != 1 ? VOIDmode			\
    : (MODE) == VOIDmode ? choose_hard_reg_mode ((REGNO), (NREGS), false) \
-   : (MODE) == HImode && !TARGET_PARTIAL_REG_STALL ? SImode		\
-   : (MODE) == QImode && !(TARGET_64BIT || QI_REGNO_P (REGNO)) ? SImode	\
+   : (MODE) == HImode && !(TARGET_PARTIAL_REG_STALL			\
+			   || MASK_REGNO_P (REGNO)) ? SImode		\
+   : (MODE) == QImode && !(TARGET_64BIT || QI_REGNO_P (REGNO)		\
+			   || MASK_REGNO_P (REGNO)) ? SImode		\
    : (MODE))
 
 /* The only ABI that saves SSE registers across calls is Win64 (thus no
@@ -1141,6 +1149,9 @@  enum target_cpu_default
 #define FIRST_EXT_REX_SSE_REG  (LAST_REX_SSE_REG + 1) /*53*/
 #define LAST_EXT_REX_SSE_REG   (FIRST_EXT_REX_SSE_REG + 15) /*68*/
 
+#define FIRST_MASK_REG  (LAST_EXT_REX_SSE_REG + 1) /*69*/
+#define LAST_MASK_REG   (FIRST_MASK_REG + 7) /*76*/
+
 /* Override this in other tm.h files to cope with various OS lossage
    requiring a frame pointer.  */
 #ifndef SUBTARGET_FRAME_POINTER_REQUIRED
@@ -1229,6 +1240,8 @@  enum reg_class
   FLOAT_INT_REGS,
   INT_SSE_REGS,
   FLOAT_INT_SSE_REGS,
+  MASK_EVEX_REGS,
+  MASK_REGS,
   ALL_REGS, LIM_REG_CLASSES
 };
 
@@ -1250,6 +1263,8 @@  enum reg_class
   reg_classes_intersect_p ((CLASS), ALL_SSE_REGS)
 #define MAYBE_MMX_CLASS_P(CLASS) \
   reg_classes_intersect_p ((CLASS), MMX_REGS)
+#define MAYBE_MASK_CLASS_P(CLASS) \
+  reg_classes_intersect_p ((CLASS), MASK_REGS)
 
 #define Q_CLASS_P(CLASS) \
   reg_class_subset_p ((CLASS), Q_REGS)
@@ -1282,6 +1297,8 @@  enum reg_class
    "FLOAT_INT_REGS",			\
    "INT_SSE_REGS",			\
    "FLOAT_INT_SSE_REGS",		\
+   "MASK_EVEX_REGS",			\
+   "MASK_REGS",				\
    "ALL_REGS" }
 
 /* Define which registers fit in which classes.  This is an initializer
@@ -1319,7 +1336,9 @@  enum reg_class
 {   0x11ffff,    0x1fe0,   0x0 },       /* FLOAT_INT_REGS */            \
 { 0x1ff100ff,0xffffffe0,  0x1f },       /* INT_SSE_REGS */              \
 { 0x1ff1ffff,0xffffffe0,  0x1f },       /* FLOAT_INT_SSE_REGS */        \
-{ 0xffffffff,0xffffffff,  0x1f }                                        \
+       { 0x0,       0x0,0x1fc0 },       /* MASK_EVEX_REGS */           \
+       { 0x0,       0x0,0x1fe0 },       /* MASK_REGS */                 \
+{ 0xffffffff,0xffffffff,0x1fff }                                        \
 }
 
 /* The same information, inverted:
@@ -1377,6 +1396,8 @@  enum reg_class
          : (N) <= LAST_REX_SSE_REG ? (FIRST_REX_SSE_REG + (N) - 8) \
                                    : (FIRST_EXT_REX_SSE_REG + (N) - 16))
 
+#define MASK_REGNO_P(N) IN_RANGE ((N), FIRST_MASK_REG, LAST_MASK_REG)
+#define ANY_MASK_REG_P(X) (REG_P (X) && MASK_REGNO_P (REGNO (X)))
 
 #define SSE_FLOAT_MODE_P(MODE) \
   ((TARGET_SSE && (MODE) == SFmode) || (TARGET_SSE2 && (MODE) == DFmode))
@@ -1429,7 +1450,7 @@  enum reg_class
 
 /* Get_secondary_mem widens integral modes to BITS_PER_WORD.
    There is no need to emit full 64 bit move on 64 bit targets
-   for integral modes that can be moved using 32 bit move.  */
+   for integral modes that can be moved using 8 bit move.  */
 #define SECONDARY_MEMORY_NEEDED_MODE(MODE)			\
   (GET_MODE_BITSIZE (MODE) < 32 && INTEGRAL_MODE_P (MODE)	\
    ? mode_for_size (32, GET_MODE_CLASS (MODE), 0)		\
@@ -1933,7 +1954,8 @@  do {							\
  "xmm16", "xmm17", "xmm18", "xmm19",					\
  "xmm20", "xmm21", "xmm22", "xmm23",					\
  "xmm24", "xmm25", "xmm26", "xmm27",					\
- "xmm28", "xmm29", "xmm30", "xmm31" }
+ "xmm28", "xmm29", "xmm30", "xmm31",					\
+ "k0", "k1", "k2", "k3", "k4", "k5", "k6", "k7" }
 
 #define REGISTER_NAMES HI_REGISTER_NAMES
 
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 3d7533a..f41b6b8 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -328,6 +328,14 @@ 
    (XMM29_REG			66)
    (XMM30_REG			67)
    (XMM31_REG			68)
+   (MASK0_REG			69)
+   (MASK1_REG			70)
+   (MASK2_REG			71)
+   (MASK3_REG			72)
+   (MASK4_REG			73)
+   (MASK5_REG			74)
+   (MASK6_REG			75)
+   (MASK7_REG			76)
   ])
 
 ;; Insns whose names begin with "x86_" are emitted by gen_FOO calls
@@ -360,7 +368,7 @@ 
    sseishft,sseishft1,ssecmp,ssecomi,
    ssecvt,ssecvt1,sseicvt,sseins,
    sseshuf,sseshuf1,ssemuladd,sse4arg,
-   lwp,
+   lwp,mskmov,msklog,
    mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft"
   (const_string "other"))
 
@@ -379,7 +387,7 @@ 
 			  ssemul,sseimul,ssediv,sselog,sselog1,
 			  sseishft,sseishft1,ssecmp,ssecomi,
 			  ssecvt,ssecvt1,sseicvt,sseins,
-			  sseshuf,sseshuf1,ssemuladd,sse4arg")
+			  sseshuf,sseshuf1,ssemuladd,sse4arg,mskmov")
 	   (const_string "sse")
 	 (eq_attr "type" "mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft")
 	   (const_string "mmx")
@@ -390,7 +398,7 @@ 
 ;; The (bounding maximum) length of an instruction immediate.
 (define_attr "length_immediate" ""
   (cond [(eq_attr "type" "incdec,setcc,icmov,str,lea,other,multi,idiv,leave,
-			  bitmanip,imulx")
+			  bitmanip,imulx,msklog,mskmov")
 	   (const_int 0)
 	 (eq_attr "unit" "i387,sse,mmx")
 	   (const_int 0)
@@ -451,7 +459,7 @@ 
 ;; Set when 0f opcode prefix is used.
 (define_attr "prefix_0f" ""
   (if_then_else
-    (ior (eq_attr "type" "imovx,setcc,icmov,bitmanip")
+    (ior (eq_attr "type" "imovx,setcc,icmov,bitmanip,msklog,mskmov")
 	 (eq_attr "unit" "sse,mmx"))
     (const_int 1)
     (const_int 0)))
@@ -651,7 +659,7 @@ 
 		   fmov,fcmp,fsgn,
 		   sse,ssemov,ssecmp,ssecomi,ssecvt,ssecvt1,sseicvt,
 		   sselog1,sseshuf1,sseadd1,sseiadd1,sseishft1,
-		   mmx,mmxmov,mmxcmp,mmxcvt")
+		   mmx,mmxmov,mmxcmp,mmxcvt,mskmov,msklog")
 	      (match_operand 2 "memory_operand"))
 	   (const_string "load")
 	 (and (eq_attr "type" "icmov,ssemuladd,sse4arg")
@@ -695,7 +703,7 @@ 
 ;; Used to control the "enabled" attribute on a per-instruction basis.
 (define_attr "isa" "base,x64,x64_sse4,x64_sse4_noavx,x64_avx,nox64,
 		    sse2,sse2_noavx,sse3,sse4,sse4_noavx,avx,noavx,
-		    avx2,noavx2,bmi2,fma4,fma,avx512f,noavx512f,fma_avx512f"
+		    avx2,noavx2,bmi,bmi2,fma4,fma,avx512f,noavx512f,fma_avx512f"
   (const_string "base"))
 
 (define_attr "enabled" ""
@@ -718,6 +726,7 @@ 
 	 (eq_attr "isa" "noavx") (symbol_ref "!TARGET_AVX")
 	 (eq_attr "isa" "avx2") (symbol_ref "TARGET_AVX2")
 	 (eq_attr "isa" "noavx2") (symbol_ref "!TARGET_AVX2")
+	 (eq_attr "isa" "bmi") (symbol_ref "TARGET_BMI")
 	 (eq_attr "isa" "bmi2") (symbol_ref "TARGET_BMI2")
 	 (eq_attr "isa" "fma4") (symbol_ref "TARGET_FMA4")
 	 (eq_attr "isa" "fma") (symbol_ref "TARGET_FMA")
@@ -2213,8 +2222,8 @@ 
 	   (const_string "SI")))])
 
 (define_insn "*movhi_internal"
-  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m")
-	(match_operand:HI 1 "general_operand"	   "r ,rn,rm,rn"))]
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,Yk,Yk,rm")
+	(match_operand:HI 1 "general_operand"      "r ,rn,rm,rn,rm,Yk,Yk"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   switch (get_attr_type (insn))
@@ -2223,6 +2232,16 @@ 
       /* movzwl is faster than movw on p2 due to partial word stalls,
 	 though not as fast as an aligned movl.  */
       return "movz{wl|x}\t{%1, %k0|%k0, %1}";
+
+    case TYPE_MSKMOV:
+      switch (which_alternative)
+        {
+	case 4: return "kmovw\t{%k1, %0|%0, %k1}";
+	case 5: return "kmovw\t{%1, %0|%0, %1}";
+	case 6: return "kmovw\t{%1, %k0|%k0, %1}";
+	default: gcc_unreachable ();
+	}
+
     default:
       if (get_attr_mode (insn) == MODE_SI)
         return "mov{l}\t{%k1, %k0|%k0, %k1}";
@@ -2240,11 +2259,17 @@ 
 	    (and (eq_attr "alternative" "1,2")
 		 (match_operand:HI 1 "aligned_operand"))
 	      (const_string "imov")
+	    (eq_attr "alternative" "4,5,6")
+	      (const_string "mskmov")
 	    (and (match_test "TARGET_MOVX")
 		 (eq_attr "alternative" "0,2"))
 	      (const_string "imovx")
 	   ]
 	   (const_string "imov")))
+    (set (attr "prefix")
+      (if_then_else (eq_attr "alternative" "4,5,6")
+	(const_string "vex")
+	(const_string "orig")))
     (set (attr "mode")
       (cond [(eq_attr "type" "imovx")
 	       (const_string "SI")
@@ -2269,8 +2294,8 @@ 
 ;; register stall machines with, where we use QImode instructions, since
 ;; partial register stall can be caused there.  Then we use movzx.
 (define_insn "*movqi_internal"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=q,q ,q ,r,r ,?r,m")
-	(match_operand:QI 1 "general_operand"      "q ,qn,qm,q,rn,qm,qn"))]
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=q,q ,q ,r,r ,?r,m ,Yk,Yk,r")
+	(match_operand:QI 1 "general_operand"      "q ,qn,qm,q,rn,qm,qn,r ,Yk,Yk"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   switch (get_attr_type (insn))
@@ -2278,6 +2303,16 @@ 
     case TYPE_IMOVX:
       gcc_assert (ANY_QI_REG_P (operands[1]) || MEM_P (operands[1]));
       return "movz{bl|x}\t{%1, %k0|%k0, %1}";
+
+    case TYPE_MSKMOV:
+      switch (which_alternative)
+        {
+	case 7: return "kmovw\t{%k1, %0|%0, %k1}";
+	case 8: return "kmovw\t{%1, %0|%0, %1}";
+	case 9: return "kmovw\t{%1, %k0|%k0, %1}";
+	default: gcc_unreachable ();
+	}
+
     default:
       if (get_attr_mode (insn) == MODE_SI)
         return "mov{l}\t{%k1, %k0|%k0, %k1}";
@@ -2297,11 +2332,17 @@ 
 	      (const_string "imov")
 	    (eq_attr "alternative" "3,5")
 	      (const_string "imovx")
+	    (eq_attr "alternative" "7,8,9")
+	      (const_string "mskmov")
 	    (and (match_test "TARGET_MOVX")
 		 (eq_attr "alternative" "2"))
 	      (const_string "imovx")
 	   ]
 	   (const_string "imov")))
+   (set (attr "prefix")
+     (if_then_else (eq_attr "alternative" "7,8,9")
+       (const_string "vex")
+       (const_string "orig")))
    (set (attr "mode")
       (cond [(eq_attr "alternative" "3,4,5")
 	       (const_string "SI")
@@ -7494,6 +7535,26 @@ 
   operands[3] = gen_lowpart (QImode, operands[3]);
 })
 
+(define_split
+  [(set (match_operand:SWI12 0 "mask_reg_operand")
+	(any_logic:SWI12 (match_operand:SWI12 1 "mask_reg_operand")
+			 (match_operand:SWI12 2 "mask_reg_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512F && reload_completed"
+  [(set (match_dup 0)
+	(any_logic:SWI12 (match_dup 1)
+			 (match_dup 2)))])
+
+(define_insn "*k<logic><mode>"
+  [(set (match_operand:SWI12 0 "mask_reg_operand" "=Yk")
+	(any_logic:SWI12 (match_operand:SWI12 1 "mask_reg_operand" "Yk")
+			 (match_operand:SWI12 2 "mask_reg_operand" "Yk")))]
+  "TARGET_AVX512F && reload_completed"
+  "k<logic>w\t{%2, %1, %0|%0, %1, %2}";
+  [(set_attr "mode" "<MODE>")
+   (set_attr "type" "msklog")
+   (set_attr "prefix" "vex")])
+
 ;; %%% This used to optimize known byte-wide and operations to memory,
 ;; and sometimes to QImode registers.  If this is considered useful,
 ;; it should be done with splitters.
@@ -7616,10 +7677,10 @@ 
   [(set_attr "type" "alu")
    (set_attr "mode" "SI")])
 
-(define_insn "*andhi_1"
-  [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,r,Ya")
-	(and:HI (match_operand:HI 1 "nonimmediate_operand" "%0,0,qm")
-		(match_operand:HI 2 "general_operand" "rn,rm,L")))
+(define_insn "andhi_1"
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,r,Ya,!Yk")
+	(and:HI (match_operand:HI 1 "nonimmediate_operand" "%0,0,qm,Yk")
+		(match_operand:HI 2 "general_operand" "rn,rm,L,Yk")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (AND, HImode, operands)"
 {
@@ -7628,34 +7689,38 @@ 
     case TYPE_IMOVX:
       return "#";
 
+    case TYPE_MSKLOG:
+      return "kandw\t{%2, %1, %0|%0, %1, %2}";
+
     default:
       gcc_assert (rtx_equal_p (operands[0], operands[1]));
       return "and{w}\t{%2, %0|%0, %2}";
     }
 }
-  [(set_attr "type" "alu,alu,imovx")
-   (set_attr "length_immediate" "*,*,0")
+  [(set_attr "type" "alu,alu,imovx,msklog")
+   (set_attr "length_immediate" "*,*,0,*")
    (set (attr "prefix_rex")
      (if_then_else
        (and (eq_attr "type" "imovx")
 	    (match_operand 1 "ext_QIreg_operand"))
        (const_string "1")
        (const_string "*")))
-   (set_attr "mode" "HI,HI,SI")])
+   (set_attr "mode" "HI,HI,SI,HI")])
 
 ;; %%% Potential partial reg stall on alternative 2.  What to do?
 (define_insn "*andqi_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r")
-	(and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0")
-		(match_operand:QI 2 "general_operand" "qn,qmn,rn")))
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,!Yk")
+	(and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,Yk")
+		(match_operand:QI 2 "general_operand" "qn,qmn,rn,Yk")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (AND, QImode, operands)"
   "@
    and{b}\t{%2, %0|%0, %2}
    and{b}\t{%2, %0|%0, %2}
-   and{l}\t{%k2, %k0|%k0, %k2}"
-  [(set_attr "type" "alu")
-   (set_attr "mode" "QI,QI,SI")])
+   and{l}\t{%k2, %k0|%k0, %k2}
+   kandw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "alu,alu,alu,msklog")
+   (set_attr "mode" "QI,QI,SI,HI")])
 
 (define_insn "*andqi_1_slp"
   [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm,q"))
@@ -7668,6 +7733,39 @@ 
   [(set_attr "type" "alu1")
    (set_attr "mode" "QI")])
 
+(define_insn "kandn<mode>"
+  [(set (match_operand:SWI12 0 "register_operand" "=r,&r,!k")
+	(and:SWI12
+	  (not:SWI12
+	    (match_operand:SWI12 1 "register_operand" "r,0,k"))
+	  (match_operand:SWI12 2 "register_operand" "r,r,k")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512F"
+  "@
+   andn\t{%k2, %k1, %k0|%k0, %k1, %k2}
+   #
+   kandnw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "bmi,*,avx512f")
+   (set_attr "type" "bitmanip,*,msklog")
+   (set_attr "prefix" "*,*,vex")
+   (set_attr "btver2_decode" "direct,*,*")
+   (set_attr "mode" "<MODE>")])
+
+(define_split
+  [(set (match_operand:SWI12 0 "general_reg_operand")
+	(and:SWI12
+	  (not:SWI12
+	    (match_dup 0))
+	  (match_operand:SWI12 1 "general_reg_operand")))]
+  "TARGET_AVX512F && !TARGET_BMI"
+  [(set (match_dup 0)
+	(not:HI (match_dup 0)))
+   (parallel [(set (match_dup 0)
+		   (and:HI (match_dup 0)
+			   (match_dup 1)))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "")
+
 ;; Turn *anddi_1 into *andsi_1_zext if possible.
 (define_split
   [(set (match_operand:DI 0 "register_operand")
@@ -7999,29 +8097,44 @@ 
   "ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); DONE;")
 
 (define_insn "*<code><mode>_1"
-  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=r,rm")
-	(any_or:SWI248
-	 (match_operand:SWI248 1 "nonimmediate_operand" "%0,0")
-	 (match_operand:SWI248 2 "<general_operand>" "<g>,r<i>")))
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=r,rm")
+	(any_or:SWI48
+	 (match_operand:SWI48 1 "nonimmediate_operand" "%0,0")
+	 (match_operand:SWI48 2 "<general_operand>" "<g>,r<i>")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
   "<logic>{<imodesuffix>}\t{%2, %0|%0, %2}"
   [(set_attr "type" "alu")
    (set_attr "mode" "<MODE>")])
 
+(define_insn "<code>hi_1"
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,rm,!Yk")
+	(any_or:HI
+	 (match_operand:HI 1 "nonimmediate_operand" "%0,0,Yk")
+	 (match_operand:HI 2 "general_operand" "<g>,r<i>,Yk")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (<CODE>, HImode, operands)"
+  "@
+  <logic>{w}\t{%2, %0|%0, %2}
+  <logic>{w}\t{%2, %0|%0, %2}
+  k<logic>w\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "alu,alu,msklog")
+   (set_attr "mode" "HI")])
+
 ;; %%% Potential partial reg stall on alternative 2.  What to do?
 (define_insn "*<code>qi_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=q,m,r")
-	(any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0")
-		   (match_operand:QI 2 "general_operand" "qmn,qn,rn")))
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=q,m,r,!Yk")
+	(any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,Yk")
+		   (match_operand:QI 2 "general_operand" "qmn,qn,rn,Yk")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (<CODE>, QImode, operands)"
   "@
    <logic>{b}\t{%2, %0|%0, %2}
    <logic>{b}\t{%2, %0|%0, %2}
-   <logic>{l}\t{%k2, %k0|%k0, %k2}"
-  [(set_attr "type" "alu")
-   (set_attr "mode" "QI,QI,SI")])
+   <logic>{l}\t{%k2, %k0|%k0, %k2}
+   k<logic>w\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "alu,alu,alu,msklog")
+   (set_attr "mode" "QI,QI,SI,HI")])
 
 ;; See comment for addsi_1_zext why we do use nonimmediate_operand
 (define_insn "*<code>si_1_zext"
@@ -8071,6 +8184,74 @@ 
   [(set_attr "type" "alu")
    (set_attr "mode" "<MODE>")])
 
+;; XNOR (not (xor ...)) for mask values.  Alternative 0 ("#") has no
+;; single GPR instruction and is split into XOR + NOT below;
+;; alternative 1 uses the mask-register kxnorw instruction.
+;; NOTE(review): this pattern uses the bare "k" constraint while the
+;; rest of the patch uses "Yk" -- confirm both constraints are defined,
+;; or make this consistent.
+;; NOTE(review): the GPR path, once split, clobbers FLAGS even though
+;; this insn declares no CC clobber; see the kandn discussion in the
+;; thread above.
+(define_insn "kxnor<mode>"
+  [(set (match_operand:SWI12 0 "register_operand" "=r,!k")
+	(not:SWI12
+	  (xor:SWI12
+	    (match_operand:SWI12 1 "register_operand" "0,k")
+	    (match_operand:SWI12 2 "register_operand" "r,k"))))]
+  "TARGET_AVX512F"
+  "@
+   #
+   kxnorw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "*,msklog")
+   (set_attr "prefix" "*,vex")
+   (set_attr "mode" "<MODE>")])
+
+;; Split the GPR form of kxnor<mode> (the "#" alternative above) into
+;; XOR + NOT on general registers, adding the FLAGS clobber those real
+;; instructions have.
+;; Fixes vs. the original submission:
+;;  - the inner rtxes hard-coded HImode, producing mode-mismatched RTL
+;;    for the QImode instantiation of the SWI12 iterator;
+;;  - the split could fire on pseudos before register allocation,
+;;    forcing the GPR path and losing the kxnorw opportunity, so it is
+;;    now restricted to hard general registers after reload.
+(define_split
+  [(set (match_operand:SWI12 0 "general_reg_operand")
+	(not:SWI12
+	  (xor:SWI12
+	    (match_dup 0)
+	    (match_operand:SWI12 1 "general_reg_operand"))))]
+  "TARGET_AVX512F && reload_completed"
+  [(parallel [(set (match_dup 0)
+		   (xor:SWI12 (match_dup 0)
+			      (match_dup 1)))
+	      (clobber (reg:CC FLAGS_REG))])
+   (set (match_dup 0)
+	(not:SWI12 (match_dup 0)))]
+  "")
+
+;; kortestw, ZF variant: the flags reflect (op0 | op1) compared against
+;; zero, so ZF is set iff the OR of the two mask registers is 0.
+;; "%" on operand 0 lets the operands commute.
+(define_insn "kortestzhi"
+  [(set (reg:CCZ FLAGS_REG)
+	(compare:CCZ
+	  (ior:HI
+	    (match_operand:HI 0 "register_operand" "%Yk")
+	    (match_operand:HI 1 "register_operand" "Yk"))
+	  (const_int 0)))]
+  "TARGET_AVX512F && ix86_match_ccmode (insn, CCZmode)"
+  "kortestw\t{%1, %0|%0, %1}"
+  [(set_attr "mode" "HI")
+   (set_attr "type" "msklog")
+   (set_attr "prefix" "vex")])
+
+;; kortestw, CF variant: compares (op0 | op1) against all-ones
+;; (const_int -1), i.e. CF is set iff the OR of the two mask registers
+;; has every bit set.
+(define_insn "kortestchi"
+  [(set (reg:CCC FLAGS_REG)
+	(compare:CCC
+	  (ior:HI
+	    (match_operand:HI 0 "register_operand" "%Yk")
+	    (match_operand:HI 1 "register_operand" "Yk"))
+	  (const_int -1)))]
+  "TARGET_AVX512F && ix86_match_ccmode (insn, CCCmode)"
+  "kortestw\t{%1, %0|%0, %1}"
+  [(set_attr "mode" "HI")
+   (set_attr "type" "msklog")
+   (set_attr "prefix" "vex")])
+
+;; kunpckbw: pack two byte-sized masks into one word mask -- op1's low
+;; byte becomes the high byte (via the shift by 8) and op2's QImode
+;; value, zero-extended, becomes the low byte.
+(define_insn "kunpckhi"
+  [(set (match_operand:HI 0 "register_operand" "=Yk")
+	(ior:HI
+	  (ashift:HI
+	    (match_operand:HI 1 "register_operand" "Yk")
+	    (const_int 8))
+	  (zero_extend:HI (match_operand:QI 2 "register_operand" "Yk"))))]
+  "TARGET_AVX512F"
+  "kunpckbw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "mode" "HI")
+   (set_attr "type" "msklog")
+   (set_attr "prefix" "vex")])
+
 ;; See comment for addsi_1_zext why we do use nonimmediate_operand
 ;; ??? Special case for immediate operand is missing - it is tricky.
 (define_insn "*<code>si_2_zext"
@@ -8640,23 +8821,38 @@ 
   "ix86_expand_unary_operator (NOT, <MODE>mode, operands); DONE;")
 
+;; NOTE(review): iterator narrowed from SWI248 to SWI48 because HImode
+;; NOT now has its own pattern below carrying a mask-register
+;; alternative.
 (define_insn "*one_cmpl<mode>2_1"
-  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm")
-	(not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0")))]
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm")
+	(not:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "0")))]
   "ix86_unary_operator_ok (NOT, <MODE>mode, operands)"
   "not{<imodesuffix>}\t%0"
   [(set_attr "type" "negnot")
    (set_attr "mode" "<MODE>")])
 
+;; HImode NOT, with a mask-register knotw alternative.  The "isa"
+;; attribute gates the "!Yk" alternative on avx512f; the GPR form is
+;; available everywhere.  knotw takes separate source and destination,
+;; unlike the one-operand GPR not.
+(define_insn "*one_cmplhi2_1"
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,!Yk")
+	(not:HI (match_operand:HI 1 "nonimmediate_operand" "0,Yk")))]
+  "ix86_unary_operator_ok (NOT, HImode, operands)"
+  "@
+   not{w}\t%0
+   knotw\t{%1, %0|%0, %1}"
+  [(set_attr "isa" "*,avx512f")
+   (set_attr "type" "negnot,msklog")
+   (set_attr "prefix" "*,vex")
+   (set_attr "mode" "HI")])
+
 ;; %%% Potential partial reg stall on alternative 1.  What to do?
+;; NOTE(review): alternative 2 adds a knotw mask-register variant, gated
+;; on avx512f via the "isa" attribute.  knotw inverts all 16 bits of the
+;; mask register; for a QImode value only the low 8 bits are meaningful.
+;; The mode attribute here says "QI" for that alternative while the
+;; analogous *<code>qi_1 pattern above uses "HI" for its mask
+;; alternative -- confirm which is intended.
 (define_insn "*one_cmplqi2_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r")
-	(not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0")))]
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,!Yk")
+	(not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,Yk")))]
   "ix86_unary_operator_ok (NOT, QImode, operands)"
   "@
    not{b}\t%0
-   not{l}\t%k0"
-  [(set_attr "type" "negnot")
-   (set_attr "mode" "QI,SI")])
+   not{l}\t%k0
+   knotw\t{%1, %0|%0, %1}"
+  [(set_attr "isa" "*,*,avx512f")
+   (set_attr "type" "negnot,negnot,msklog")
+   (set_attr "prefix" "*,*,vex")
+   (set_attr "mode" "QI,SI,QI")])
 
 ;; ??? Currently never generated - xor is used instead.
 (define_insn "*one_cmplsi2_1_zext"
@@ -16380,11 +16576,11 @@ 
 })
 
 ;; Avoid redundant prefixes by splitting HImode arithmetic to SImode.
-
+;; Do not split instructions with mask registers.
 (define_split
-  [(set (match_operand 0 "register_operand")
+  [(set (match_operand 0 "general_reg_operand")
 	(match_operator 3 "promotable_binary_operator"
-	   [(match_operand 1 "register_operand")
+	   [(match_operand 1 "general_reg_operand")
 	    (match_operand 2 "aligned_operand")]))
    (clobber (reg:CC FLAGS_REG))]
   "! TARGET_PARTIAL_REG_STALL && reload_completed
@@ -16479,6 +16675,7 @@ 
   operands[1] = gen_lowpart (SImode, operands[1]);
 })
 
+;; Do not split instructions with mask regs.
 (define_split
   [(set (match_operand 0 "register_operand")
 	(not (match_operand 1 "register_operand")))]
@@ -16486,7 +16683,9 @@ 
    && (GET_MODE (operands[0]) == HImode
        || (GET_MODE (operands[0]) == QImode
 	   && (TARGET_PROMOTE_QImode
-	       || optimize_insn_for_size_p ())))"
+	       || optimize_insn_for_size_p ())))
+   && (! ANY_MASK_REG_P (operands[0])
+	 || ! ANY_MASK_REG_P (operands[1]))"
   [(set (match_dup 0)
 	(not:SI (match_dup 1)))]
 {
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 3959c38..18f425c 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -32,6 +32,11 @@ 
   (and (match_code "reg")
        (not (match_test "ANY_FP_REGNO_P (REGNO (op))"))))
 
+;; True if the operand is a GENERAL class register.
+;; NOTE(review): GENERAL_REG_P tests the hard register number, so this
+;; predicate rejects pseudos -- fine for post-reload splitters; confirm
+;; that any pre-reload use really wants hard regs only.
+(define_predicate "general_reg_operand"
+  (and (match_code "reg")
+       (match_test "GENERAL_REG_P (op)")))
+
 ;; Return true if OP is a register operand other than an i387 fp register.
 (define_predicate "register_and_not_fp_reg_operand"
   (and (match_code "reg")
@@ -52,6 +57,10 @@ 
   (and (match_code "reg")
        (match_test "EXT_REX_SSE_REGNO_P (REGNO (op))")))
 
+;; True if the operand is an AVX-512 mask register.
+;; Like general_reg_operand, this checks the hard regno (MASK_REGNO_P),
+;; so pseudo registers never match.
+(define_predicate "mask_reg_operand"
+  (and (match_code "reg")
+       (match_test "MASK_REGNO_P (REGNO (op))")))
 
 ;; True if the operand is a Q_REGS class register.
 (define_predicate "q_regs_operand"