diff mbox series

[4/4,PR,target/88808] Enable bitwise operator for AVX512 masks.

Message ID CAMZc-bw+Fy5YsJKBodYV9Q_W7Ra9xgdngr0bYAS_X4uU8kKZ2A@mail.gmail.com
State New
Headers show
Series [1/4,PR,target/88808] Enable bitwise operator for AVX512 masks. | expand

Commit Message

Hongtao Liu Aug. 14, 2020, 8:27 a.m. UTC
Enable operator or/xor/and/andn/not for mask register, kxnor is not
enabled since there's no corresponding instruction for general
registers.

gcc/
        PR target/88808
        * config/i386/i386.md: (*movsi_internal): Adjust constraints
        for mask registers.
        (*movhi_internal): Ditto.
        (*movqi_internal): Ditto.
        (*anddi_1): Support mask register operations
        (*and<mode>_1): Ditto.
        (*andqi_1): Ditto.
        (*andn<mode>_1): Ditto.
        (*<code><mode>_1): Ditto.
        (*<code>qi_1): Ditto.
        (*one_cmpl<mode>2_1): Ditto.
        (*one_cmplsi2_1_zext): Ditto.
        (*one_cmplqi2_1): Ditto.

gcc/testsuite/
        * gcc.target/i386/bitwise_mask_op-1.c: New test.
        * gcc.target/i386/bitwise_mask_op-2.c: New test.
        * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
        * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
        * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
        * gcc.target/i386/avx512f-kmovw-5.c: Ditto.

Comments

Uros Bizjak Aug. 17, 2020, 10:08 a.m. UTC | #1
On Fri, Aug 14, 2020 at 10:26 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> Enable operator or/xor/and/andn/not for mask register, kxnor is not
> enabled since there's no corresponding instruction for general
> registers.
>
> gcc/
>         PR target/88808
>         * config/i386/i386.md: (*movsi_internal): Adjust constraints
>         for mask registers.
>         (*movhi_internal): Ditto.
>         (*movqi_internal): Ditto.
>         (*anddi_1): Support mask register operations
>         (*and<mode>_1): Ditto.
>         (*andqi_1): Ditto.
>         (*andn<mode>_1): Ditto.
>         (*<code><mode>_1): Ditto.
>         (*<code>qi_1): Ditto.
>         (*one_cmpl<mode>2_1): Ditto.
>         (*one_cmplsi2_1_zext): Ditto.
>         (*one_cmplqi2_1): Ditto.
>
> gcc/testsuite/
>         * gcc.target/i386/bitwise_mask_op-1.c: New test.
>         * gcc.target/i386/bitwise_mask_op-2.c: New test.
>         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
>         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
>         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
>         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.

index 74d207c3711..e8ad79d1b0a 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -2294,7 +2294,7 @@

 (define_insn "*movsi_internal"
   [(set (match_operand:SI 0 "nonimmediate_operand"
-    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
+    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,k")
     (match_operand:SI 1 "general_operand"
     "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"

I'd rather see *k everywhere, also with *movqi_internal and
*movhi_internal patterns. The "*" means that the allocator won't
allocate a mask register by default, but it will be used to optimize
moves. With the above change, you are risking that during integer
register pressure, the register allocator will allocate zero to a mask
register, and later "optimize" the move with a direct maskreg-intreg
move.

The current strategy is that only general registers get allocated for
integer modes. Let's keep it this way for now.

Otherwise, the patchset LGTM, but please test the suggested changes and repost.

BTW: Do you plan to remove mask operations from sse.md? ATM, they are
used to distinguish mask operations, generated from builtins from
generic operations, so I'd like to keep them for a while. The drawback
is, that they are not combined with other operations, but at the end
of the day, this is what the programmer asked for by using builtins.

Uros.
Hongtao Liu Aug. 19, 2020, 2:26 a.m. UTC | #2
On Mon, Aug 17, 2020 at 6:08 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Fri, Aug 14, 2020 at 10:26 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > Enable operator or/xor/and/andn/not for mask register, kxnor is not
> > enabled since there's no corresponding instruction for general
> > registers.
> >
> > gcc/
> >         PR target/88808
> >         * config/i386/i386.md: (*movsi_internal): Adjust constraints
> >         for mask registers.
> >         (*movhi_internal): Ditto.
> >         (*movqi_internal): Ditto.
> >         (*anddi_1): Support mask register operations
> >         (*and<mode>_1): Ditto.
> >         (*andqi_1): Ditto.
> >         (*andn<mode>_1): Ditto.
> >         (*<code><mode>_1): Ditto.
> >         (*<code>qi_1): Ditto.
> >         (*one_cmpl<mode>2_1): Ditto.
> >         (*one_cmplsi2_1_zext): Ditto.
> >         (*one_cmplqi2_1): Ditto.
> >
> > gcc/testsuite/
> >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
>
> index 74d207c3711..e8ad79d1b0a 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -2294,7 +2294,7 @@
>
>  (define_insn "*movsi_internal"
>    [(set (match_operand:SI 0 "nonimmediate_operand"
> -    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
> +    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,k")
>      (match_operand:SI 1 "general_operand"
>      "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
>    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
>
> I'd rather see *k everywhere, also with *movqi_internal and
> *movhi_internal patterns. The "*" means that the allocator won't
> allocate a mask register by default, but it will be used to optimize
> moves. With the above change, you are risking that during integer
> register pressure, the register allocator will allocate zero to a mask
> register, and later "optimize" the move with a direct maskreg-intreg
> move.
>
> The current strategy is that only general registers get allocated for
> integer modes. Let's keep it this way for now.
>

Yes,  though it would fail gcc.target/i386/avx512dq-pr88465.c and
gcc.target/i386/avx512f-pr88465.c, i think it's more reasonable not to
move zero into mask register directly.

> Otherwise, the patchset LGTM, but please test the suggested changes and repost.
>
> BTW: Do you plan to remove mask operations from sse.md? ATM, they are
> used to distinguish mask operations, generated from builtins from
> generic operations, so I'd like to keep them for a while. The drawback
> is, that they are not combined with other operations, but at the end
> of the day, this is what the programmer asked for by using builtins.

Agree, I prefer to keep them.

>
> Uros.

Bootstrap is ok, regression test is ok for i386/x86-64 backend(After
adjusting testcase).

impact for SPEC2017 on SKL.

500.perlbench_r 0.00%
502.gcc_r 1.59%
505.mcf_r 1.49%
520.omnetpp_r 1.91%
523.xalancbmk_r -1.22%
525.x264_r 0.00%
531.deepsjeng_r 0.00%
541.leela_r -0.22%
548.exchange2_r 2.27%
557.xz_r 0.63%
INT geomean 0.64%

503.bwaves_r 3.68%
507.cactuBSSN_r -0.62%
508.namd_r 0.51%
510.parest_r -0.16%
511.povray_r 0.57%
519.lbm_r 0.50%
521.wrf_r 0.00%
526.blender_r 0.00%
527.cam4_r 0.00%
538.imagick_r -0.41%
544.nab_r 0.00%
549.fotonik3d_r -0.20%
554.roms_r 4.19%
FP geomean 0.66%
Uros Bizjak Aug. 19, 2020, 7:05 a.m. UTC | #3
On Wed, Aug 19, 2020 at 4:25 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Mon, Aug 17, 2020 at 6:08 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Fri, Aug 14, 2020 at 10:26 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > Enable operator or/xor/and/andn/not for mask register, kxnor is not
> > > enabled since there's no corresponding instruction for general
> > > registers.
> > >
> > > gcc/
> > >         PR target/88808
> > >         * config/i386/i386.md: (*movsi_internal): Adjust constraints
> > >         for mask registers.
> > >         (*movhi_internal): Ditto.
> > >         (*movqi_internal): Ditto.
> > >         (*anddi_1): Support mask register operations
> > >         (*and<mode>_1): Ditto.
> > >         (*andqi_1): Ditto.
> > >         (*andn<mode>_1): Ditto.
> > >         (*<code><mode>_1): Ditto.
> > >         (*<code>qi_1): Ditto.
> > >         (*one_cmpl<mode>2_1): Ditto.
> > >         (*one_cmplsi2_1_zext): Ditto.
> > >         (*one_cmplqi2_1): Ditto.
> > >
> > > gcc/testsuite/
> > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> >
> > index 74d207c3711..e8ad79d1b0a 100644
> > --- a/gcc/config/i386/i386.md
> > +++ b/gcc/config/i386/i386.md
> > @@ -2294,7 +2294,7 @@
> >
> >  (define_insn "*movsi_internal"
> >    [(set (match_operand:SI 0 "nonimmediate_operand"
> > -    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
> > +    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,k")
> >      (match_operand:SI 1 "general_operand"
> >      "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
> >    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> >
> > I'd rather see *k everywhere, also with *movqi_internal and
> > *movhi_internal patterns. The "*" means that the allocator won't
> > allocate a mask register by default, but it will be used to optimize
> > moves. With the above change, you are risking that during integer
> > register pressure, the register allocator will allocate zero to a mask
> > register, and later "optimize" the move with a direct maskreg-intreg
> > move.
> >
> > The current strategy is that only general registers get allocated for
> > integer modes. Let's keep it this way for now.
> >
>
> Yes,  though it would fail gcc.target/i386/avx512dq-pr88465.c and
> gcc.target/i386/avx512f-pr88465.c, i think it's more reasonable not to
> move zero into mask register directly.

Although it would be nice if the register allocator was smart enough,
the current strategy is to introduce peephole2 patterns to fix these
problems, similar to [1]. These peepholes can be introduced in a
follow-up patch.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2020-August/551744.html

> > Otherwise, the patchset LGTM, but please test the suggested changes and repost.
> >
> > BTW: Do you plan to remove mask operations from sse.md? ATM, they are
> > used to distinguish mask operations, generated from builtins from
> > generic operations, so I'd like to keep them for a while. The drawback
> > is, that they are not combined with other operations, but at the end
> > of the day, this is what the programmer asked for by using builtins.
>
> Agree, I prefer to keep them.

Thinking some more about the approach, it looks to me that the optimal
solution is a post-reload splitter that would convert "generic"
patterns to mask operations from sse.md. The mask operations don't set
flags, so we can substantially improve post reload scheduling of these
instructions by removing flags clobber.

So, simply add "#" to relevant alternatives of logic patterns and add
something like:

--cut here--
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 41c6dbfa668..ad49bdc7583 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1470,6 +1470,18 @@
           ]
           (const_string "<MODE>")))])

+(define_split
+  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
+       (any_logic:SWI1248_AVX512BW
+         (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")
+         (match_operand:SWI1248_AVX512BW 2 "mask_reg_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512F && reload_completed"
+  [(parallel
+     [(set (match_dup 0)
+          (any_logic:SWI1248_AVX512BW (match_dup 1) (match_dup 2)))
+      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
+
 (define_insn "kandn<mode>"
   [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
        (and:SWI1248_AVX512BW
--cut here--

and similar for kandn and knot in sse.md. You will have to add
mask_reg_operand predicate, see e.g. sse_reg_operand in predicates.md
for example.

We don't lose anything, because all important transformations,
propagations and simplifications with these patterns happen before
reload.

Uros.
Hongtao Liu Aug. 20, 2020, 7:24 a.m. UTC | #4
On Wed, Aug 19, 2020 at 3:05 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Wed, Aug 19, 2020 at 4:25 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Mon, Aug 17, 2020 at 6:08 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Fri, Aug 14, 2020 at 10:26 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > >
> > > > Enable operator or/xor/and/andn/not for mask register, kxnor is not
> > > > enabled since there's no corresponding instruction for general
> > > > registers.
> > > >
> > > > gcc/
> > > >         PR target/88808
> > > >         * config/i386/i386.md: (*movsi_internal): Adjust constraints
> > > >         for mask registers.
> > > >         (*movhi_internal): Ditto.
> > > >         (*movqi_internal): Ditto.
> > > >         (*anddi_1): Support mask register operations
> > > >         (*and<mode>_1): Ditto.
> > > >         (*andqi_1): Ditto.
> > > >         (*andn<mode>_1): Ditto.
> > > >         (*<code><mode>_1): Ditto.
> > > >         (*<code>qi_1): Ditto.
> > > >         (*one_cmpl<mode>2_1): Ditto.
> > > >         (*one_cmplsi2_1_zext): Ditto.
> > > >         (*one_cmplqi2_1): Ditto.
> > > >
> > > > gcc/testsuite/
> > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > >
> > > index 74d207c3711..e8ad79d1b0a 100644
> > > --- a/gcc/config/i386/i386.md
> > > +++ b/gcc/config/i386/i386.md
> > > @@ -2294,7 +2294,7 @@
> > >
> > >  (define_insn "*movsi_internal"
> > >    [(set (match_operand:SI 0 "nonimmediate_operand"
> > > -    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
> > > +    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,k")
> > >      (match_operand:SI 1 "general_operand"
> > >      "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
> > >    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> > >
> > > I'd rather see *k everywhere, also with *movqi_internal and
> > > *movhi_internal patterns. The "*" means that the allocator won't
> > > allocate a mask register by default, but it will be used to optimize
> > > moves. With the above change, you are risking that during integer
> > > register pressure, the register allocator will allocate zero to a mask
> > > register, and later "optimize" the move with a direct maskreg-intreg
> > > move.
> > >
> > > The current strategy is that only general registers get allocated for
> > > integer modes. Let's keep it this way for now.
> > >
> >
> > Yes,  though it would fail gcc.target/i386/avx512dq-pr88465.c and
> > gcc.target/i386/avx512f-pr88465.c, i think it's more reasonable not to
> > move zero into mask register directly.
>
> Although it would be nice if the register allocator was smart enough,
> the current strategy is to introduce peephole2 patterns to fix these
> problems, similar to [1]. These peepholes can be introduced in a
> follow-up patch.
>
> [1] https://gcc.gnu.org/pipermail/gcc-patches/2020-August/551744.html
>

peephole2 added.

> > > Otherwise, the patchset LGTM, but please test the suggested changes and repost.
> > >
> > > BTW: Do you plan to remove mask operations from sse.md? ATM, they are
> > > used to distinguish mask operations, generated from builtins from
> > > generic operations, so I'd like to keep them for a while. The drawback
> > > is, that they are not combined with other operations, but at the end
> > > of the day, this is what the programmer asked for by using builtins.
> >
> > Agree, I prefer to keep them.
>
> Thinking some more about the approach, it looks to me that the optimal
> solution is a post-reload splitter that would convert "generic"
> patterns to mask operations from sse.md. The mask operations don't set
> flags, so we can substantially improve post reload scheduling of these
> instructions by removing flags clobber.
>
> So, simply add "#" to relevant alternatives of logic patterns and add
> something like:
>
> --cut here--
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 41c6dbfa668..ad49bdc7583 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -1470,6 +1470,18 @@
>            ]
>            (const_string "<MODE>")))])
>
> +(define_split
> +  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
> +       (any_logic:SWI1248_AVX512BW
> +         (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")
> +         (match_operand:SWI1248_AVX512BW 2 "mask_reg_operand")))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "TARGET_AVX512F && reload_completed"
> +  [(parallel
> +     [(set (match_dup 0)
> +          (any_logic:SWI1248_AVX512BW (match_dup 1) (match_dup 2)))
> +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> +
>  (define_insn "kandn<mode>"
>    [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
>         (and:SWI1248_AVX512BW
> --cut here--
>
> and similar for kandn and knot in sse.md. You will have to add
> mask_reg_operand predicate, see e.g. sse_reg_operand in predicates.md
> for example.
>
> We don't lose anything, because all important transformations,
> propagations and simplifications with these patterns happen before
> reload.

define_splits are added for those bitwise operations.

>
> Uros.

Also add bellow part which will pass gcc.target/i386/bitwise_mask_op-3.c

-     must go into Q_REGS.  */
+     must go into Q_REGS or ALL_MASK_REGS.  */
   if (GET_MODE (x) == QImode && !CONSTANT_P (x))
     {
       if (Q_CLASS_P (regclass))
        return regclass;
       else if (reg_class_subset_p (Q_REGS, regclass))
        return Q_REGS;
+      else if (MASK_CLASS_P (regclass))
+       return regclass;
       else
        return NO_REGS;


Update patch.


--
BR,
Hongtao
Hongtao Liu Aug. 20, 2020, 7:32 a.m. UTC | #5
On Thu, Aug 20, 2020 at 3:24 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Wed, Aug 19, 2020 at 3:05 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Wed, Aug 19, 2020 at 4:25 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Mon, Aug 17, 2020 at 6:08 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >
> > > > On Fri, Aug 14, 2020 at 10:26 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > >
> > > > > Enable operator or/xor/and/andn/not for mask register, kxnor is not
> > > > > enabled since there's no corresponding instruction for general
> > > > > registers.
> > > > >
> > > > > gcc/
> > > > >         PR target/88808
> > > > >         * config/i386/i386.md: (*movsi_internal): Adjust constraints
> > > > >         for mask registers.
> > > > >         (*movhi_internal): Ditto.
> > > > >         (*movqi_internal): Ditto.
> > > > >         (*anddi_1): Support mask register operations
> > > > >         (*and<mode>_1): Ditto.
> > > > >         (*andqi_1): Ditto.
> > > > >         (*andn<mode>_1): Ditto.
> > > > >         (*<code><mode>_1): Ditto.
> > > > >         (*<code>qi_1): Ditto.
> > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > >         (*one_cmplqi2_1): Ditto.
> > > > >
> > > > > gcc/testsuite/
> > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > >
> > > > index 74d207c3711..e8ad79d1b0a 100644
> > > > --- a/gcc/config/i386/i386.md
> > > > +++ b/gcc/config/i386/i386.md
> > > > @@ -2294,7 +2294,7 @@
> > > >
> > > >  (define_insn "*movsi_internal"
> > > >    [(set (match_operand:SI 0 "nonimmediate_operand"
> > > > -    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
> > > > +    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,k")
> > > >      (match_operand:SI 1 "general_operand"
> > > >      "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
> > > >    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> > > >
> > > > I'd rather see *k everywhere, also with *movqi_internal and
> > > > *movhi_internal patterns. The "*" means that the allocator won't
> > > > allocate a mask register by default, but it will be used to optimize
> > > > moves. With the above change, you are risking that during integer
> > > > register pressure, the register allocator will allocate zero to a mask
> > > > register, and later "optimize" the move with a direct maskreg-intreg
> > > > move.
> > > >
> > > > The current strategy is that only general registers get allocated for
> > > > integer modes. Let's keep it this way for now.
> > > >
> > >
> > > Yes,  though it would fail gcc.target/i386/avx512dq-pr88465.c and
> > > gcc.target/i386/avx512f-pr88465.c, i think it's more reasonable not to
> > > move zero into mask register directly.
> >
> > Although it would be nice if the register allocator was smart enough,
> > the current strategy is to introduce peephole2 patterns to fix these
> > problems, similar to [1]. These peepholes can be introduced in a
> > follow-up patch.
> >
> > [1] https://gcc.gnu.org/pipermail/gcc-patches/2020-August/551744.html
> >
>
> peephole2 added.
>
> > > > Otherwise, the patchset LGTM, but please test the suggested changes and repost.
> > > >
> > > > BTW: Do you plan to remove mask operations from sse.md? ATM, they are
> > > > used to distinguish mask operations, generated from builtins from
> > > > generic operations, so I'd like to keep them for a while. The drawback
> > > > is, that they are not combined with other operations, but at the end
> > > > of the day, this is what the programmer asked for by using builtins.
> > >
> > > Agree, I prefer to keep them.
> >
> > Thinking some more about the approach, it looks to me that the optimal
> > solution is a post-reload splitter that would convert "generic"
> > patterns to mask operations from sse.md. The mask operations don't set
> > flags, so we can substantially improve post reload scheduling of these
> > instructions by removing flags clobber.
> >
> > So, simply add "#" to relevant alternatives of logic patterns and add
> > something like:
> >
> > --cut here--
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index 41c6dbfa668..ad49bdc7583 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -1470,6 +1470,18 @@
> >            ]
> >            (const_string "<MODE>")))])
> >
> > +(define_split
> > +  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
> > +       (any_logic:SWI1248_AVX512BW
> > +         (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")
> > +         (match_operand:SWI1248_AVX512BW 2 "mask_reg_operand")))
> > +   (clobber (reg:CC FLAGS_REG))]
> > +  "TARGET_AVX512F && reload_completed"
> > +  [(parallel
> > +     [(set (match_dup 0)
> > +          (any_logic:SWI1248_AVX512BW (match_dup 1) (match_dup 2)))
> > +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> > +
> >  (define_insn "kandn<mode>"
> >    [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
> >         (and:SWI1248_AVX512BW
> > --cut here--
> >
> > and similar for kandn and knot in sse.md. You will have to add
> > mask_reg_operand predicate, see e.g. sse_reg_operand in predicates.md
> > for example.
> >
> > We don't lose anything, because all important transformations,
> > propagations and simplifications with these patterns happen before
> > reload.
>
> define_splits are added for those bitwise operations.
>
> >
> > Uros.
>
> Also add bellow part which will pass gcc.target/i386/bitwise_mask_op-3.c
>
> -     must go into Q_REGS.  */
> +     must go into Q_REGS or ALL_MASK_REGS.  */
>    if (GET_MODE (x) == QImode && !CONSTANT_P (x))
>      {
>        if (Q_CLASS_P (regclass))
>         return regclass;
>        else if (reg_class_subset_p (Q_REGS, regclass))
>         return Q_REGS;
> +      else if (MASK_CLASS_P (regclass))
> +       return regclass;
>        else
>         return NO_REGS;
>
>
> Update patch.
>
>
> --
> BR,
> Hongtao

networking is slow to send out mail with attachment, so i copy the
patch into mail.

gcc/
        PR target/88808
        * config/i386/i386.c (ix86_preferred_reload_class): Allow
        QImode data go into mask registers.
        * config/i386/i386.md: (*movhi_internal): Adjust constraints
        for mask registers.
        (*movqi_internal): Ditto.
        (*anddi_1): Support mask register operations
        (*and<mode>_1): Ditto.
        (*andqi_1): Ditto.
        (*andn<mode>_1): Ditto.
        (*<code><mode>_1): Ditto.
        (*<code>qi_1): Ditto.
        (*one_cmpl<mode>2_1): Ditto.
        (*one_cmplsi2_1_zext): Ditto.
        (*one_cmplqi2_1): Ditto.
        (define_peephole2): Move constant 0/-1 directly into mask
        registers.
        * config/i386/predicates.md (mask_reg_operand): New predicate.
        * config/i386/sse.md (define_split): Add post-reload splitters
        that would convert "generic" patterns to mask patterns.
        (*knotsi_1_zext): New define_insn.

gcc/testsuite/
        * gcc.target/i386/bitwise_mask_op-1.c: New test.
        * gcc.target/i386/bitwise_mask_op-2.c: New test.
        * gcc.target/i386/bitwise_mask_op-3.c: New test.
        * gcc.target/i386/avx512bw-pr88465.c: New testcase.
        * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
        * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
        * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
        * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
---
 gcc/config/i386/i386.c                        |   4 +-
 gcc/config/i386/i386.md                       | 209 ++++++++++++------
 gcc/config/i386/predicates.md                 |   5 +
 gcc/config/i386/sse.md                        |  59 +++++
 .../gcc.target/i386/avx512bw-kunpckwd-1.c     |   2 +-
 .../gcc.target/i386/avx512bw-kunpckwd-3.c     |   2 +-
 .../gcc.target/i386/avx512bw-pr88465.c        |  23 ++
 .../gcc.target/i386/avx512dq-kmovb-5.c        |   2 +-
 .../gcc.target/i386/avx512f-kmovw-5.c         |   2 +-
 .../gcc.target/i386/bitwise_mask_op-1.c       | 178 +++++++++++++++
 .../gcc.target/i386/bitwise_mask_op-2.c       |   8 +
 .../gcc.target/i386/bitwise_mask_op-3.c       |  44 ++++
 12 files changed, 471 insertions(+), 67 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-pr88465.c
 create mode 100644 gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index d71d6d55be6..e8a2182ceb0 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -18407,13 +18407,15 @@ ix86_preferred_reload_class (rtx x,
reg_class_t regclass)
     return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;

   /* QImode constants are easy to load, but non-constant QImode data
-     must go into Q_REGS.  */
+     must go into Q_REGS or ALL_MASK_REGS.  */
   if (GET_MODE (x) == QImode && !CONSTANT_P (x))
     {
       if (Q_CLASS_P (regclass))
         return regclass;
       else if (reg_class_subset_p (Q_REGS, regclass))
         return Q_REGS;
+      else if (MASK_CLASS_P (regclass))
+        return regclass;
       else
         return NO_REGS;
     }
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 3a15941c3e8..676525fbc1f 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -2403,8 +2403,8 @@
            (symbol_ref "true")))])

 (define_insn "*movhi_internal"
-  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,k,k ,r,m,k")
-        (match_operand:HI 1 "general_operand"      "r
,rn,rm,rn,r,km,k,k,CBC"))]
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,*k,*k
,*r,*m,*k")
+        (match_operand:HI 1 "general_operand"      "r
,rn,rm,rn,*r,*km,*k,*k,CBC"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   switch (get_attr_type (insn))
@@ -2491,9 +2491,9 @@

 (define_insn "*movqi_internal"
   [(set (match_operand:QI 0 "nonimmediate_operand"
-                        "=Q,R,r,q,q,r,r ,?r,m ,k,k,r,m,k,k,k")
+                        "=Q,R,r,q,q,r,r ,?r,m ,*k,*k,*r,*m,*k,*k,*k")
         (match_operand:QI 1 "general_operand"
-                        "Q ,R,r,n,m,q,rn, m,qn,r,k,k,k,m,C,BC"))]
+                        "Q ,R,r,n,m,q,rn, m,qn,*r,*k,*k,*k,*m,C,BC"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   char buf[128];
@@ -2624,6 +2624,19 @@
            ]
            (const_string "QI")))])

+/* Reload dislikes loading 0/-1 directly into mask registers.
+   Try to tidy things up here.  */
+(define_peephole2
+  [(set (match_operand:SWI 0 "general_reg_operand")
+        (match_operand:SWI 1 "immediate_operand"))
+   (set (match_operand:SWI 2 "mask_reg_operand")
+        (match_dup 0))]
+  "peep2_reg_dead_p (2, operands[0])
+   && (const0_operand (operands[1], <MODE>mode)
+       || (constm1_operand (operands[1], <MODE>mode)
+           && (<MODE_SIZE> > 1 || TARGET_AVX512DQ)))"
+  [(set (match_dup 2) (match_dup 1))])
+
 ;; Stores and loads of ax to arbitrary constant address.
 ;; We fake an second form of instruction to force reload to load address
 ;; into register when rax is not available
@@ -9044,19 +9057,21 @@
 })

 (define_insn "*anddi_1"
-  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r")
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r,k")
         (and:DI
-         (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm")
-         (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L")))
+         (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm,k")
+         (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L,k")))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_64BIT && ix86_binary_operator_ok (AND, DImode, operands)"
   "@
    and{l}\t{%k2, %k0|%k0, %k2}
    and{q}\t{%2, %0|%0, %2}
    and{q}\t{%2, %0|%0, %2}
+   #
    #"
-  [(set_attr "type" "alu,alu,alu,imovx")
-   (set_attr "length_immediate" "*,*,*,0")
+  [(set_attr "isa" "x64,x64,x64,x64,avx512bw")
+   (set_attr "type" "alu,alu,alu,imovx,msklog")
+   (set_attr "length_immediate" "*,*,*,0,*")
    (set (attr "prefix_rex")
      (if_then_else
        (and (eq_attr "type" "imovx")
@@ -9064,7 +9079,7 @@
                  (match_operand 1 "ext_QIreg_operand")))
        (const_string "1")
        (const_string "*")))
-   (set_attr "mode" "SI,DI,DI,SI")])
+   (set_attr "mode" "SI,DI,DI,SI,DI")])

 (define_insn_and_split "*anddi_1_btr"
   [(set (match_operand:DI 0 "nonimmediate_operand" "=rm")
@@ -9130,17 +9145,25 @@
    (set_attr "mode" "SI")])

 (define_insn "*and<mode>_1"
-  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya")
-        (and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm")
-                   (match_operand:SWI24 2 "<general_operand>" "r<i>,m,L")))
+  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya,k")
+        (and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm,k")
+                   (match_operand:SWI24 2 "<general_operand>" "r<i>,m,L,k")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (AND, <MODE>mode, operands)"
   "@
    and{<imodesuffix>}\t{%2, %0|%0, %2}
    and{<imodesuffix>}\t{%2, %0|%0, %2}
+   #
    #"
-  [(set_attr "type" "alu,alu,imovx")
-   (set_attr "length_immediate" "*,*,0")
+  [(set (attr "isa")
+        (cond [(eq_attr "alternative" "3")
+                 (if_then_else (eq_attr "mode" "SI")
+                   (const_string "avx512bw")
+                   (const_string "avx512f"))
+              ]
+              (const_string "*")))
+   (set_attr "type" "alu,alu,imovx,msklog")
+   (set_attr "length_immediate" "*,*,0,*")
    (set (attr "prefix_rex")
      (if_then_else
        (and (eq_attr "type" "imovx")
@@ -9148,20 +9171,28 @@
                  (match_operand 1 "ext_QIreg_operand")))
        (const_string "1")
        (const_string "*")))
-   (set_attr "mode" "<MODE>,<MODE>,SI")])
+   (set_attr "mode" "<MODE>,<MODE>,SI,<MODE>")])

 (define_insn "*andqi_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r")
-        (and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0")
-                (match_operand:QI 2 "general_operand" "qn,m,rn")))
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k")
+        (and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
+                (match_operand:QI 2 "general_operand" "qn,m,rn,k")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (AND, QImode, operands)"
   "@
    and{b}\t{%2, %0|%0, %2}
    and{b}\t{%2, %0|%0, %2}
-   and{l}\t{%k2, %k0|%k0, %k2}"
-  [(set_attr "type" "alu")
-   (set_attr "mode" "QI,QI,SI")
+   and{l}\t{%k2, %k0|%k0, %k2}
+   #"
+  [(set_attr "type" "alu,alu,alu,msklog")
+   (set (attr "mode")
+        (cond [(eq_attr "alternative" "2")
+                 (const_string "SI")
+                (and (eq_attr "alternative" "3")
+                     (match_test "!TARGET_AVX512DQ"))
+                 (const_string "HI")
+               ]
+               (const_string "QI")))
    ;; Potential partial reg stall on alternative 2.
    (set (attr "preferred_for_speed")
      (cond [(eq_attr "alternative" "2")
@@ -9539,28 +9570,42 @@
 })

 (define_insn "*andn<mode>_1"
-  [(set (match_operand:SWI48 0 "register_operand" "=r,r")
+  [(set (match_operand:SWI48 0 "register_operand" "=r,r,k")
         (and:SWI48
-          (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r"))
-          (match_operand:SWI48 2 "nonimmediate_operand" "r,m")))
+          (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r,k"))
+          (match_operand:SWI48 2 "nonimmediate_operand" "r,m,k")))
    (clobber (reg:CC FLAGS_REG))]
-  "TARGET_BMI"
-  "andn\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "bitmanip")
-   (set_attr "btver2_decode" "direct, double")
+  "TARGET_BMI || TARGET_AVX512BW"
+  "@
+   andn\t{%2, %1, %0|%0, %1, %2}
+   andn\t{%2, %1, %0|%0, %1, %2}
+   #"
+  [(set_attr "isa" "bmi,bmi,avx512bw")
+   (set_attr "type" "bitmanip,bitmanip,msklog")
+   (set_attr "btver2_decode" "direct, double,*")
    (set_attr "mode" "<MODE>")])

 (define_insn "*andn<mode>_1"
-  [(set (match_operand:SWI12 0 "register_operand" "=r")
+  [(set (match_operand:SWI12 0 "register_operand" "=r,k")
         (and:SWI12
-          (not:SWI12 (match_operand:SWI12 1 "register_operand" "r"))
-          (match_operand:SWI12 2 "register_operand" "r")))
+          (not:SWI12 (match_operand:SWI12 1 "register_operand" "r,k"))
+          (match_operand:SWI12 2 "register_operand" "r,k")))
    (clobber (reg:CC FLAGS_REG))]
-  "TARGET_BMI"
-  "andn\t{%k2, %k1, %k0|%k0, %k1, %k2}"
-  [(set_attr "type" "bitmanip")
-   (set_attr "btver2_decode" "direct")
-   (set_attr "mode" "SI")])
+  "TARGET_BMI || TARGET_AVX512BW"
+  "@
+   andn\t{%k2, %k1, %k0|%k0, %k1, %k2}
+   #"
+  [(set_attr "isa" "bmi,avx512f")
+   (set_attr "type" "bitmanip,msklog")
+   (set_attr "btver2_decode" "direct,*")
+   (set (attr "mode")
+        (cond [(eq_attr "alternative" "0")
+                 (const_string "SI")
+               (and (eq_attr "alternative" "1")
+                    (match_test "!TARGET_AVX512DQ"))
+                  (const_string "HI")
+              ]
+              (const_string "<MODE>")))])

 (define_insn "*andn_<mode>_ccno"
   [(set (reg FLAGS_REG)
@@ -9631,14 +9676,24 @@
 })

 (define_insn "*<code><mode>_1"
-  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r")
+  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,k")
         (any_or:SWI248
-         (match_operand:SWI248 1 "nonimmediate_operand" "%0,0")
-         (match_operand:SWI248 2 "<general_operand>" "r<i>,m")))
+         (match_operand:SWI248 1 "nonimmediate_operand" "%0,0,k")
+         (match_operand:SWI248 2 "<general_operand>" "r<i>,m,k")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
-  "<logic>{<imodesuffix>}\t{%2, %0|%0, %2}"
-  [(set_attr "type" "alu")
+  "@
+   <logic>{<imodesuffix>}\t{%2, %0|%0, %2}
+   <logic>{<imodesuffix>}\t{%2, %0|%0, %2}
+   #"
+  [(set (attr "isa")
+        (cond [(eq_attr "alternative" "2")
+                 (if_then_else (eq_attr "mode" "SI,DI")
+                   (const_string "avx512bw")
+                   (const_string "avx512f"))
+              ]
+              (const_string "*")))
+   (set_attr "type" "alu, alu, msklog")
    (set_attr "mode" "<MODE>")])

 (define_insn_and_split "*iordi_1_bts"
@@ -9711,17 +9766,26 @@
    (set_attr "mode" "SI")])

 (define_insn "*<code>qi_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r")
-        (any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0")
-                   (match_operand:QI 2 "general_operand" "qn,m,rn")))
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k")
+        (any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
+                   (match_operand:QI 2 "general_operand" "qn,m,rn,k")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (<CODE>, QImode, operands)"
   "@
    <logic>{b}\t{%2, %0|%0, %2}
    <logic>{b}\t{%2, %0|%0, %2}
-   <logic>{l}\t{%k2, %k0|%k0, %k2}"
-  [(set_attr "type" "alu")
-   (set_attr "mode" "QI,QI,SI")
+   <logic>{l}\t{%k2, %k0|%k0, %k2}
+   #"
+  [(set_attr "isa" "*,*,*,avx512f")
+   (set_attr "type" "alu,alu,alu,msklog")
+   (set (attr "mode")
+        (cond [(eq_attr "alternative" "2")
+                 (const_string "SI")
+                (and (eq_attr "alternative" "3")
+                     (match_test "!TARGET_AVX512DQ"))
+                 (const_string "HI")
+               ]
+               (const_string "QI")))
    ;; Potential partial reg stall on alternative 2.
    (set (attr "preferred_for_speed")
      (cond [(eq_attr "alternative" "2")
@@ -10370,31 +10434,52 @@
   "split_double_mode (DImode, &operands[0], 2, &operands[0], &operands[2]);")

 (define_insn "*one_cmpl<mode>2_1"
-  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm")
-        (not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0")))]
+  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,k")
+        (not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0,k")))]
   "ix86_unary_operator_ok (NOT, <MODE>mode, operands)"
-  "not{<imodesuffix>}\t%0"
-  [(set_attr "type" "negnot")
+  "@
+   not{<imodesuffix>}\t%0
+   #"
+  [(set (attr "isa")
+        (cond [(eq_attr "alternative" "2")
+                 (if_then_else (eq_attr "mode" "SI,DI")
+                   (const_string "avx512bw")
+                   (const_string "avx512f"))
+              ]
+              (const_string "*")))
+   (set_attr "type" "negnot,msklog")
    (set_attr "mode" "<MODE>")])

 (define_insn "*one_cmplsi2_1_zext"
-  [(set (match_operand:DI 0 "register_operand" "=r")
+  [(set (match_operand:DI 0 "register_operand" "=r,k")
         (zero_extend:DI
-          (not:SI (match_operand:SI 1 "register_operand" "0"))))]
+          (not:SI (match_operand:SI 1 "register_operand" "0,k"))))]
   "TARGET_64BIT && ix86_unary_operator_ok (NOT, SImode, operands)"
-  "not{l}\t%k0"
-  [(set_attr "type" "negnot")
-   (set_attr "mode" "SI")])
+  "@
+   not{l}\t%k0
+   #"
+  [(set_attr "isa" "x64,avx512bw")
+   (set_attr "type" "negnot,msklog")
+   (set_attr "mode" "SI,SI")])

 (define_insn "*one_cmplqi2_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r")
-        (not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0")))]
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,k")
+        (not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,k")))]
   "ix86_unary_operator_ok (NOT, QImode, operands)"
   "@
    not{b}\t%0
-   not{l}\t%k0"
-  [(set_attr "type" "negnot")
-   (set_attr "mode" "QI,SI")
+   not{l}\t%k0
+   #"
+  [(set_attr "isa" "*,*,avx512f")
+   (set_attr "type" "negnot,negnot,msklog")
+   (set (attr "mode")
+        (cond [(eq_attr "alternative" "1")
+                 (const_string "SI")
+                (and (eq_attr "alternative" "2")
+                     (match_test "!TARGET_AVX512DQ"))
+                 (const_string "HI")
+               ]
+               (const_string "QI")))
    ;; Potential partial reg stall on alternative 1.
    (set (attr "preferred_for_speed")
      (cond [(eq_attr "alternative" "1")
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 07e69d555c0..dd1b31479f5 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -87,6 +87,11 @@
   (and (match_code "reg")
        (match_test "REGNO (op) == FLAGS_REG")))

+;; True if the operand is a MASK register.
+(define_predicate "mask_reg_operand"
+  (and (match_code "reg")
+       (match_test "MASK_REGNO_P (REGNO (op))")))
+
 ;; Match a DI, SI, HI or QImode nonimmediate_operand.
 (define_special_predicate "int_nonimmediate_operand"
   (and (match_operand 0 "nonimmediate_operand")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index b6348de67cb..4372a9fd785 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1452,6 +1452,18 @@
   "TARGET_AVX512F
    && !(MEM_P (operands[0]) && MEM_P (operands[1]))")

+(define_split
+  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
+        (any_logic:SWI1248_AVX512BW
+          (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")
+          (match_operand:SWI1248_AVX512BW 2 "mask_reg_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512F && reload_completed"
+  [(parallel
+     [(set (match_dup 0)
+           (any_logic:SWI1248_AVX512BW (match_dup 1) (match_dup 2)))
+      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
+
 (define_insn "k<code><mode>"
   [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
         (any_logic:SWI1248_AVX512BW
@@ -1474,6 +1486,21 @@
            ]
            (const_string "<MODE>")))])

+(define_split
+  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
+        (and:SWI1248_AVX512BW
+          (not:SWI1248_AVX512BW
+            (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand"))
+          (match_operand:SWI1248_AVX512BW 2 "mask_reg_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512F && reload_completed"
+  [(parallel
+     [(set (match_dup 0)
+           (and:SWI1248_AVX512BW
+             (not:SWI1248_AVX512BW (match_dup 1))
+             (match_dup 2)))
+      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
+
 (define_insn "kandn<mode>"
   [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
         (and:SWI1248_AVX512BW
@@ -1520,6 +1547,16 @@
            ]
            (const_string "<MODE>")))])

+(define_split
+  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
+        (not:SWI1248_AVX512BW
+          (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")))]
+  "TARGET_AVX512F && reload_completed"
+  [(parallel
+     [(set (match_dup 0)
+           (not:SWI1248_AVX512BW (match_dup 1)))
+      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
+
 (define_insn "knot<mode>"
   [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
         (not:SWI1248_AVX512BW
@@ -1541,6 +1578,28 @@
            ]
            (const_string "<MODE>")))])

+(define_split
+  [(set (match_operand:DI 0 "mask_reg_operand")
+        (zero_extend:DI
+          (not:DI (match_operand:SI 1 "mask_reg_operand"))))]
+  "TARGET_AVX512BW && reload_completed"
+  [(parallel
+     [(set (match_dup 0)
+           (zero_extend:DI
+             (not:SI (match_dup 1))))
+      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
+
+(define_insn "*knotsi_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=k")
+        (zero_extend:DI
+          (not:SI (match_operand:SI 1 "register_operand" "k"))))
+   (unspec [(const_int 0)] UNSPEC_MASKOP)]
+  "TARGET_AVX512BW"
+  "knotd\t{%1, %0|%0, %1}";
+  [(set_attr "type" "msklog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "SI")])
+
 (define_insn "kadd<mode>"
   [(set (match_operand:SWI1248_AVX512BWDQ2 0 "register_operand" "=k")
         (plus:SWI1248_AVX512BWDQ2
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
index 94422f36010..46d9351f275 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512bw -O2" } */
-/* { dg-final { scan-assembler-times "kunpckwd\[
\\t\]+\[^\{\n\]*%k\[1-7\](?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "kunpckwd\[
\\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */

 #include <immintrin.h>

diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
index c68ad8cc1f7..fe13f4f33fc 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512bw -O2" } */
-/* { dg-final { scan-assembler-times "kunpckwd\[
\\t\]+\[^\{\n\]*%k\[1-7\](?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "kunpckwd\[
\\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */

 #include <immintrin.h>

diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr88465.c
b/gcc/testsuite/gcc.target/i386/avx512bw-pr88465.c
new file mode 100644
index 00000000000..8e34bf45365
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr88465.c
@@ -0,0 +1,23 @@
+/* PR target/88465 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mavx512bw" } */
+/* { dg-final { scan-assembler-times "kxor\[qd\]\[ \t]" 2 } } */
+/* { dg-final { scan-assembler-times "kxnor\[dq\]\[ \t]" 2 } } */
+
+void
+foo (void)
+{
+  unsigned int k = 0;
+  __asm volatile ("" : : "k" (k));
+  k = -1;
+  __asm volatile ("" : : "k" (k));
+}
+
+void
+bar (void)
+{
+  unsigned long long k = 0;
+  __asm volatile ("" : : "k" (k));
+  k = -1;
+  __asm volatile ("" : : "k" (k));
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
b/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
index 49817097e26..114e03ee93d 100644
--- a/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
+++ b/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512dq -O2" } */
+/* { dg-options "-mavx512dq -mno-avx512bw -O2" } */
 /* { dg-final { scan-assembler-times "kmovb\[
\\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */

 #include <immintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
b/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
index 7bb34d34d8d..79d37394b36 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512f -O2" } */
+/* { dg-options "-mavx512f -mno-avx512bw -O2" } */
 /* { dg-final { scan-assembler-times "kmovw\[
\\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */

 #include <immintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
new file mode 100644
index 00000000000..61f71ab8b23
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
@@ -0,0 +1,178 @@
+/* PR target/88808  */
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mno-avx512dq -O2" } */
+
+#include <immintrin.h>
+__m512i
+foo_orq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
+  return _mm512_mask_add_epi8 (c, m1 | m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "korq" "1" { target { ! ia32 } } } } */
+
+__m512i
+foo_ord (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
+  return _mm512_mask_add_epi16 (c, m1 | m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "kord" "1" } }  */
+
+__m512i
+foo_orw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
+  return _mm512_mask_add_epi32 (c, m1 | m2, a, d);
+}
+
+__m512i
+foo_orb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
+  return _mm512_mask_add_epi64 (c, m1 | m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "korw" "2" } }  */
+
+__m512i
+foo_xorq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
+  return _mm512_mask_add_epi8 (c, m1 ^ m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "kxorq" "1" { target { ! ia32 } } } }  */
+
+__m512i
+foo_xord (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
+  return _mm512_mask_add_epi16 (c, m1 ^ m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "kxord" "1" } }  */
+
+__m512i
+foo_xorw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
+  return _mm512_mask_add_epi32 (c, m1 ^ m2, a, d);
+}
+
+__m512i
+foo_xorb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
+  return _mm512_mask_add_epi64 (c, m1 ^ m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "korw" "2" } }  */
+
+__m512i
+foo_andq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
+  return _mm512_mask_add_epi8 (c, m1 & m2, a, d);
+}
+
+__m512i
+foo_andd (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
+  return _mm512_mask_add_epi16 (c, m1 & m2, a, d);
+}
+
+__m512i
+foo_andw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
+  return _mm512_mask_add_epi32 (c, m1 & m2, a, d);
+}
+
+__m512i
+foo_andb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
+  return _mm512_mask_add_epi64 (c, m1 & m2, a, d);
+}
+
+__m512i
+foo_andnq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
+  return _mm512_mask_add_epi8 (c, m1 & ~m2, a, d);
+}
+
+__m512i
+foo_andnd (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
+  return _mm512_mask_add_epi16 (c, m1 & ~m2, a, d);
+}
+
+__m512i
+foo_andnw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
+  return _mm512_mask_add_epi32 (c, m1 & ~m2, a, d);
+}
+
+__m512i
+foo_andnb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
+  return _mm512_mask_add_epi64 (c, m1 & ~m2, a, d);
+}
+
+__m512i
+foo_notq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  return _mm512_mask_add_epi8 (c, ~m1, a, d);
+}
+
+/* { dg-final { scan-assembler-times "knotq" "2" { target { ! ia32 } } } }  */
+
+__m512i
+foo_notd (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  return _mm512_mask_add_epi16 (c, ~m1, a, d);
+}
+
+/* { dg-final { scan-assembler-times "knotd" "2" { target { ! ia32 } } } }  */
+
+__m512i
+foo_notw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  return _mm512_mask_add_epi32 (c, ~m1, a, d);
+}
+
+__m512i
+foo_notb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  return _mm512_mask_add_epi64 (c, ~m1, a, d);
+}
+
+/* { dg-final { scan-assembler-times "knotw" "4" } }  */
diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
new file mode 100644
index 00000000000..850f0b42652
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
@@ -0,0 +1,8 @@
+/* PR target/88808  */
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512dq -O2" } */
+/* { dg-final { scan-assembler-times "knotb" "2" } }  */
+/* { dg-final { scan-assembler-times "korb" "1" } }  */
+/* { dg-final { scan-assembler-times "kxorb" "1" } }  */
+#include "bitwise_mask_op-1.c"
+
diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
new file mode 100644
index 00000000000..18bf4f0d768
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
@@ -0,0 +1,44 @@
+/* PR target/88808  */
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512dq -O2" } */
+
+#include <immintrin.h>
+volatile __mmask8 foo;
+void
+foo_orb (__m512i a, __m512i b)
+{
+  __mmask8 m1 = _mm512_cmp_epi64_mask (a, b, 2);
+  __mmask8 m2 = _mm512_cmp_epi64_mask (a, b, 4);
+  foo = m1 | m2;
+}
+
+/* { dg-final { scan-assembler-times "korb\[\t \]" "1" } }  */
+
+void
+foo_xorb (__m512i a, __m512i b)
+{
+  __mmask8 m1 = _mm512_cmp_epi64_mask (a, b, 2);
+  __mmask8 m2 = _mm512_cmp_epi64_mask (a, b, 4);
+  foo = m1 ^ m2;
+}
+
+/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" } }  */
+
+void
+foo_andb (__m512i a, __m512i b)
+{
+  __mmask8 m1 = _mm512_cmp_epi64_mask (a, b, 2);
+  __mmask8 m2 = _mm512_cmp_epi64_mask (a, b, 4);
+  foo = m1 & m2;
+}
+
+void
+foo_andnb (__m512i a, __m512i b)
+{
+  __mmask8 m1 = _mm512_cmp_epi64_mask (a, b, 2);
+  __mmask8 m2 = _mm512_cmp_epi64_mask (a, b, 4);
+  foo = m1 & ~m2;
+}
+
+/* { dg-final { scan-assembler-times "knotb\[\t \]" "1" } }  */
+/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4"} }  */
Uros Bizjak Aug. 20, 2020, 7:40 a.m. UTC | #6
On Thu, Aug 20, 2020 at 9:31 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Thu, Aug 20, 2020 at 3:24 PM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Wed, Aug 19, 2020 at 3:05 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Wed, Aug 19, 2020 at 4:25 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > >
> > > > On Mon, Aug 17, 2020 at 6:08 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > >
> > > > > On Fri, Aug 14, 2020 at 10:26 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > > >
> > > > > > Enable operator or/xor/and/andn/not for mask register, kxnor is not
> > > > > > enabled since there's no corresponding instruction for general
> > > > > > registers.
> > > > > >
> > > > > > gcc/
> > > > > >         PR target/88808
> > > > > >         * config/i386/i386.md: (*movsi_internal): Adjust constraints
> > > > > >         for mask registers.
> > > > > >         (*movhi_internal): Ditto.
> > > > > >         (*movqi_internal): Ditto.
> > > > > >         (*anddi_1): Support mask register operations
> > > > > >         (*and<mode>_1): Ditto.
> > > > > >         (*andqi_1): Ditto.
> > > > > >         (*andn<mode>_1): Ditto.
> > > > > >         (*<code><mode>_1): Ditto.
> > > > > >         (*<code>qi_1): Ditto.
> > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > >
> > > > > > gcc/testsuite/
> > > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > > >
> > > > > index 74d207c3711..e8ad79d1b0a 100644
> > > > > --- a/gcc/config/i386/i386.md
> > > > > +++ b/gcc/config/i386/i386.md
> > > > > @@ -2294,7 +2294,7 @@
> > > > >
> > > > >  (define_insn "*movsi_internal"
> > > > >    [(set (match_operand:SI 0 "nonimmediate_operand"
> > > > > -    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
> > > > > +    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,k")
> > > > >      (match_operand:SI 1 "general_operand"
> > > > >      "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
> > > > >    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> > > > >
> > > > > I'd rather see *k everywhere, also with *movqi_internal and
> > > > > *movhi_internal patterns. The "*" means that the allocator won't
> > > > > allocate a mask register by default, but it will be used to optimize
> > > > > moves. With the above change, you are risking that during integer
> > > > > register pressure, the register allocator will allocate zero to a mask
> > > > > register, and later "optimize" the move with a direct maskreg-intreg
> > > > > move.
> > > > >
> > > > > The current strategy is that only general registers get allocated for
> > > > > integer modes. Let's keep it this way for now.
> > > > >
> > > >
> > > > Yes,  though it would fail gcc.target/i386/avx512dq-pr88465.c and
> > > > gcc.target/i386/avx512f-pr88465.c, i think it's more reasonable not to
> > > > move zero into mask register directly.
> > >
> > > Although it would be nice if the register allocator was smart enough,
> > > the current strategy is to introduce peephole2 patterns to fix these
> > > problems, similar to [1]. These peepholes can be introduced in a
> > > follow-up patch.
> > >
> > > [1] https://gcc.gnu.org/pipermail/gcc-patches/2020-August/551744.html
> > >
> >
> > peephole2 added.
> >
> > > > > Otherwise, the patchset LGTM, but please test the suggested changes and repost.
> > > > >
> > > > > BTW: Do you plan to remove mask operations from sse.md? ATM, they are
> > > > > used to distinguish mask operations, generated from builtins from
> > > > > generic operations, so I'd like to keep them for a while. The drawback
> > > > > is, that they are not combined with other operations, but at the end
> > > > > of the day, this is what the programmer asked for by using builtins.
> > > >
> > > > Agree, I prefer to keep them.
> > >
> > > Thinking some more about the approach, it looks to me that the optimal
> > > solution is a post-reload splitter that would convert "generic"
> > > patterns to mask operations from sse.md. The mask operations don't set
> > > flags, so we can substantially improve post reload scheduling of these
> > > instructions by removing flags clobber.
> > >
> > > So, simply add "#" to relevant alternatives of logic patterns and add
> > > something like:
> > >
> > > --cut here--
> > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > > index 41c6dbfa668..ad49bdc7583 100644
> > > --- a/gcc/config/i386/sse.md
> > > +++ b/gcc/config/i386/sse.md
> > > @@ -1470,6 +1470,18 @@
> > >            ]
> > >            (const_string "<MODE>")))])
> > >
> > > +(define_split
> > > +  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
> > > +       (any_logic:SWI1248_AVX512BW
> > > +         (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")
> > > +         (match_operand:SWI1248_AVX512BW 2 "mask_reg_operand")))
> > > +   (clobber (reg:CC FLAGS_REG))]
> > > +  "TARGET_AVX512F && reload_completed"
> > > +  [(parallel
> > > +     [(set (match_dup 0)
> > > +          (any_logic:SWI1248_AVX512BW (match_dup 1) (match_dup 2)))
> > > +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> > > +
> > >  (define_insn "kandn<mode>"
> > >    [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
> > >         (and:SWI1248_AVX512BW
> > > --cut here--
> > >
> > > and similar for kandn and knot in sse.md. You will have to add
> > > mask_reg_operand predicate, see e.g. sse_reg_operand in predicates.md
> > > for example.
> > >
> > > We don't lose anything, because all important transformations,
> > > propagations and simplifications with these patterns happen before
> > > reload.
> >
> > define_splits are added for those bitwise operations.
> >
> > >
> > > Uros.
> >
> > Also add bellow part which will pass gcc.target/i386/bitwise_mask_op-3.c
> >
> > -     must go into Q_REGS.  */
> > +     must go into Q_REGS or ALL_MASK_REGS.  */
> >    if (GET_MODE (x) == QImode && !CONSTANT_P (x))
> >      {
> >        if (Q_CLASS_P (regclass))
> >         return regclass;
> >        else if (reg_class_subset_p (Q_REGS, regclass))
> >         return Q_REGS;
> > +      else if (MASK_CLASS_P (regclass))
> > +       return regclass;
> >        else
> >         return NO_REGS;
> >
> >
> > Update patch.
> >
> >
> > --
> > BR,
> > Hongtao
>
> networking is slow to send out mail with attachment, so i copy the
> patch into mail.
>
> gcc/
>         PR target/88808
>         * config/i386/i386.c (ix86_preferred_reload_class): Allow
>         QImode data go into mask registers.
>         * config/i386/i386.md: (*movhi_internal): Adjust constraints
>         for mask registers.
>         (*movqi_internal): Ditto.
>         (*anddi_1): Support mask register operations
>         (*and<mode>_1): Ditto.
>         (*andqi_1): Ditto.
>         (*andn<mode>_1): Ditto.
>         (*<code><mode>_1): Ditto.
>         (*<code>qi_1): Ditto.
>         (*one_cmpl<mode>2_1): Ditto.
>         (*one_cmplsi2_1_zext): Ditto.
>         (*one_cmplqi2_1): Ditto.
>         (define_peephole2): Move constant 0/-1 directly into mask
>         registers.
>         * config/i386/predicates.md (mask_reg_operand): New predicate.
>         * config/i386/sse.md (define_split): Add post-reload splitters
>         that would convert "generic" patterns to mask patterns.
>         (*knotsi_1_zext): New define_insn.
>
> gcc/testsuite/
>         * gcc.target/i386/bitwise_mask_op-1.c: New test.
>         * gcc.target/i386/bitwise_mask_op-2.c: New test.
>         * gcc.target/i386/bitwise_mask_op-3.c: New test.
>         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
>         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
>         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
>         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
>         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.

A little nit, please put new splitters after the instruction pattern.

OK for the whole patch set with the above change,

Thanks,
Uros.

> ---
>  gcc/config/i386/i386.c                        |   4 +-
>  gcc/config/i386/i386.md                       | 209 ++++++++++++------
>  gcc/config/i386/predicates.md                 |   5 +
>  gcc/config/i386/sse.md                        |  59 +++++
>  .../gcc.target/i386/avx512bw-kunpckwd-1.c     |   2 +-
>  .../gcc.target/i386/avx512bw-kunpckwd-3.c     |   2 +-
>  .../gcc.target/i386/avx512bw-pr88465.c        |  23 ++
>  .../gcc.target/i386/avx512dq-kmovb-5.c        |   2 +-
>  .../gcc.target/i386/avx512f-kmovw-5.c         |   2 +-
>  .../gcc.target/i386/bitwise_mask_op-1.c       | 178 +++++++++++++++
>  .../gcc.target/i386/bitwise_mask_op-2.c       |   8 +
>  .../gcc.target/i386/bitwise_mask_op-3.c       |  44 ++++
>  12 files changed, 471 insertions(+), 67 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-pr88465.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index d71d6d55be6..e8a2182ceb0 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -18407,13 +18407,15 @@ ix86_preferred_reload_class (rtx x,
> reg_class_t regclass)
>      return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
>
>    /* QImode constants are easy to load, but non-constant QImode data
> -     must go into Q_REGS.  */
> +     must go into Q_REGS or ALL_MASK_REGS.  */
>    if (GET_MODE (x) == QImode && !CONSTANT_P (x))
>      {
>        if (Q_CLASS_P (regclass))
>          return regclass;
>        else if (reg_class_subset_p (Q_REGS, regclass))
>          return Q_REGS;
> +      else if (MASK_CLASS_P (regclass))
> +        return regclass;
>        else
>          return NO_REGS;
>      }
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index 3a15941c3e8..676525fbc1f 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -2403,8 +2403,8 @@
>             (symbol_ref "true")))])
>
>  (define_insn "*movhi_internal"
> -  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,k,k ,r,m,k")
> -        (match_operand:HI 1 "general_operand"      "r
> ,rn,rm,rn,r,km,k,k,CBC"))]
> +  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,*k,*k
> ,*r,*m,*k")
> +        (match_operand:HI 1 "general_operand"      "r
> ,rn,rm,rn,*r,*km,*k,*k,CBC"))]
>    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
>  {
>    switch (get_attr_type (insn))
> @@ -2491,9 +2491,9 @@
>
>  (define_insn "*movqi_internal"
>    [(set (match_operand:QI 0 "nonimmediate_operand"
> -                        "=Q,R,r,q,q,r,r ,?r,m ,k,k,r,m,k,k,k")
> +                        "=Q,R,r,q,q,r,r ,?r,m ,*k,*k,*r,*m,*k,*k,*k")
>          (match_operand:QI 1 "general_operand"
> -                        "Q ,R,r,n,m,q,rn, m,qn,r,k,k,k,m,C,BC"))]
> +                        "Q ,R,r,n,m,q,rn, m,qn,*r,*k,*k,*k,*m,C,BC"))]
>    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
>  {
>    char buf[128];
> @@ -2624,6 +2624,19 @@
>             ]
>             (const_string "QI")))])
>
> +/* Reload dislikes loading 0/-1 directly into mask registers.
> +   Try to tidy things up here.  */
> +(define_peephole2
> +  [(set (match_operand:SWI 0 "general_reg_operand")
> +        (match_operand:SWI 1 "immediate_operand"))
> +   (set (match_operand:SWI 2 "mask_reg_operand")
> +        (match_dup 0))]
> +  "peep2_reg_dead_p (2, operands[0])
> +   && (const0_operand (operands[1], <MODE>mode)
> +       || (constm1_operand (operands[1], <MODE>mode)
> +           && (<MODE_SIZE> > 1 || TARGET_AVX512DQ)))"
> +  [(set (match_dup 2) (match_dup 1))])
> +
>  ;; Stores and loads of ax to arbitrary constant address.
>  ;; We fake an second form of instruction to force reload to load address
>  ;; into register when rax is not available
> @@ -9044,19 +9057,21 @@
>  })
>
>  (define_insn "*anddi_1"
> -  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r")
> +  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r,k")
>          (and:DI
> -         (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm")
> -         (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L")))
> +         (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm,k")
> +         (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L,k")))
>     (clobber (reg:CC FLAGS_REG))]
>    "TARGET_64BIT && ix86_binary_operator_ok (AND, DImode, operands)"
>    "@
>     and{l}\t{%k2, %k0|%k0, %k2}
>     and{q}\t{%2, %0|%0, %2}
>     and{q}\t{%2, %0|%0, %2}
> +   #
>     #"
> -  [(set_attr "type" "alu,alu,alu,imovx")
> -   (set_attr "length_immediate" "*,*,*,0")
> +  [(set_attr "isa" "x64,x64,x64,x64,avx512bw")
> +   (set_attr "type" "alu,alu,alu,imovx,msklog")
> +   (set_attr "length_immediate" "*,*,*,0,*")
>     (set (attr "prefix_rex")
>       (if_then_else
>         (and (eq_attr "type" "imovx")
> @@ -9064,7 +9079,7 @@
>                   (match_operand 1 "ext_QIreg_operand")))
>         (const_string "1")
>         (const_string "*")))
> -   (set_attr "mode" "SI,DI,DI,SI")])
> +   (set_attr "mode" "SI,DI,DI,SI,DI")])
>
>  (define_insn_and_split "*anddi_1_btr"
>    [(set (match_operand:DI 0 "nonimmediate_operand" "=rm")
> @@ -9130,17 +9145,25 @@
>     (set_attr "mode" "SI")])
>
>  (define_insn "*and<mode>_1"
> -  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya")
> -        (and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm")
> -                   (match_operand:SWI24 2 "<general_operand>" "r<i>,m,L")))
> +  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya,k")
> +        (and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm,k")
> +                   (match_operand:SWI24 2 "<general_operand>" "r<i>,m,L,k")))
>     (clobber (reg:CC FLAGS_REG))]
>    "ix86_binary_operator_ok (AND, <MODE>mode, operands)"
>    "@
>     and{<imodesuffix>}\t{%2, %0|%0, %2}
>     and{<imodesuffix>}\t{%2, %0|%0, %2}
> +   #
>     #"
> -  [(set_attr "type" "alu,alu,imovx")
> -   (set_attr "length_immediate" "*,*,0")
> +  [(set (attr "isa")
> +        (cond [(eq_attr "alternative" "3")
> +                 (if_then_else (eq_attr "mode" "SI")
> +                   (const_string "avx512bw")
> +                   (const_string "avx512f"))
> +              ]
> +              (const_string "*")))
> +   (set_attr "type" "alu,alu,imovx,msklog")
> +   (set_attr "length_immediate" "*,*,0,*")
>     (set (attr "prefix_rex")
>       (if_then_else
>         (and (eq_attr "type" "imovx")
> @@ -9148,20 +9171,28 @@
>                   (match_operand 1 "ext_QIreg_operand")))
>         (const_string "1")
>         (const_string "*")))
> -   (set_attr "mode" "<MODE>,<MODE>,SI")])
> +   (set_attr "mode" "<MODE>,<MODE>,SI,<MODE>")])
>
>  (define_insn "*andqi_1"
> -  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r")
> -        (and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0")
> -                (match_operand:QI 2 "general_operand" "qn,m,rn")))
> +  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k")
> +        (and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
> +                (match_operand:QI 2 "general_operand" "qn,m,rn,k")))
>     (clobber (reg:CC FLAGS_REG))]
>    "ix86_binary_operator_ok (AND, QImode, operands)"
>    "@
>     and{b}\t{%2, %0|%0, %2}
>     and{b}\t{%2, %0|%0, %2}
> -   and{l}\t{%k2, %k0|%k0, %k2}"
> -  [(set_attr "type" "alu")
> -   (set_attr "mode" "QI,QI,SI")
> +   and{l}\t{%k2, %k0|%k0, %k2}
> +   #"
> +  [(set_attr "type" "alu,alu,alu,msklog")
> +   (set (attr "mode")
> +        (cond [(eq_attr "alternative" "2")
> +                 (const_string "SI")
> +                (and (eq_attr "alternative" "3")
> +                     (match_test "!TARGET_AVX512DQ"))
> +                 (const_string "HI")
> +               ]
> +               (const_string "QI")))
>     ;; Potential partial reg stall on alternative 2.
>     (set (attr "preferred_for_speed")
>       (cond [(eq_attr "alternative" "2")
> @@ -9539,28 +9570,42 @@
>  })
>
>  (define_insn "*andn<mode>_1"
> -  [(set (match_operand:SWI48 0 "register_operand" "=r,r")
> +  [(set (match_operand:SWI48 0 "register_operand" "=r,r,k")
>          (and:SWI48
> -          (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r"))
> -          (match_operand:SWI48 2 "nonimmediate_operand" "r,m")))
> +          (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r,k"))
> +          (match_operand:SWI48 2 "nonimmediate_operand" "r,m,k")))
>     (clobber (reg:CC FLAGS_REG))]
> -  "TARGET_BMI"
> -  "andn\t{%2, %1, %0|%0, %1, %2}"
> -  [(set_attr "type" "bitmanip")
> -   (set_attr "btver2_decode" "direct, double")
> +  "TARGET_BMI || TARGET_AVX512BW"
> +  "@
> +   andn\t{%2, %1, %0|%0, %1, %2}
> +   andn\t{%2, %1, %0|%0, %1, %2}
> +   #"
> +  [(set_attr "isa" "bmi,bmi,avx512bw")
> +   (set_attr "type" "bitmanip,bitmanip,msklog")
> +   (set_attr "btver2_decode" "direct, double,*")
>     (set_attr "mode" "<MODE>")])
>
>  (define_insn "*andn<mode>_1"
> -  [(set (match_operand:SWI12 0 "register_operand" "=r")
> +  [(set (match_operand:SWI12 0 "register_operand" "=r,k")
>          (and:SWI12
> -          (not:SWI12 (match_operand:SWI12 1 "register_operand" "r"))
> -          (match_operand:SWI12 2 "register_operand" "r")))
> +          (not:SWI12 (match_operand:SWI12 1 "register_operand" "r,k"))
> +          (match_operand:SWI12 2 "register_operand" "r,k")))
>     (clobber (reg:CC FLAGS_REG))]
> -  "TARGET_BMI"
> -  "andn\t{%k2, %k1, %k0|%k0, %k1, %k2}"
> -  [(set_attr "type" "bitmanip")
> -   (set_attr "btver2_decode" "direct")
> -   (set_attr "mode" "SI")])
> +  "TARGET_BMI || TARGET_AVX512BW"
> +  "@
> +   andn\t{%k2, %k1, %k0|%k0, %k1, %k2}
> +   #"
> +  [(set_attr "isa" "bmi,avx512f")
> +   (set_attr "type" "bitmanip,msklog")
> +   (set_attr "btver2_decode" "direct,*")
> +   (set (attr "mode")
> +        (cond [(eq_attr "alternative" "0")
> +                 (const_string "SI")
> +               (and (eq_attr "alternative" "1")
> +                    (match_test "!TARGET_AVX512DQ"))
> +                  (const_string "HI")
> +              ]
> +              (const_string "<MODE>")))])
>
>  (define_insn "*andn_<mode>_ccno"
>    [(set (reg FLAGS_REG)
> @@ -9631,14 +9676,24 @@
>  })
>
>  (define_insn "*<code><mode>_1"
> -  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r")
> +  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,k")
>          (any_or:SWI248
> -         (match_operand:SWI248 1 "nonimmediate_operand" "%0,0")
> -         (match_operand:SWI248 2 "<general_operand>" "r<i>,m")))
> +         (match_operand:SWI248 1 "nonimmediate_operand" "%0,0,k")
> +         (match_operand:SWI248 2 "<general_operand>" "r<i>,m,k")))
>     (clobber (reg:CC FLAGS_REG))]
>    "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
> -  "<logic>{<imodesuffix>}\t{%2, %0|%0, %2}"
> -  [(set_attr "type" "alu")
> +  "@
> +   <logic>{<imodesuffix>}\t{%2, %0|%0, %2}
> +   <logic>{<imodesuffix>}\t{%2, %0|%0, %2}
> +   #"
> +  [(set (attr "isa")
> +        (cond [(eq_attr "alternative" "2")
> +                 (if_then_else (eq_attr "mode" "SI,DI")
> +                   (const_string "avx512bw")
> +                   (const_string "avx512f"))
> +              ]
> +              (const_string "*")))
> +   (set_attr "type" "alu, alu, msklog")
>     (set_attr "mode" "<MODE>")])
>
>  (define_insn_and_split "*iordi_1_bts"
> @@ -9711,17 +9766,26 @@
>     (set_attr "mode" "SI")])
>
>  (define_insn "*<code>qi_1"
> -  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r")
> -        (any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0")
> -                   (match_operand:QI 2 "general_operand" "qn,m,rn")))
> +  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k")
> +        (any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
> +                   (match_operand:QI 2 "general_operand" "qn,m,rn,k")))
>     (clobber (reg:CC FLAGS_REG))]
>    "ix86_binary_operator_ok (<CODE>, QImode, operands)"
>    "@
>     <logic>{b}\t{%2, %0|%0, %2}
>     <logic>{b}\t{%2, %0|%0, %2}
> -   <logic>{l}\t{%k2, %k0|%k0, %k2}"
> -  [(set_attr "type" "alu")
> -   (set_attr "mode" "QI,QI,SI")
> +   <logic>{l}\t{%k2, %k0|%k0, %k2}
> +   #"
> +  [(set_attr "isa" "*,*,*,avx512f")
> +   (set_attr "type" "alu,alu,alu,msklog")
> +   (set (attr "mode")
> +        (cond [(eq_attr "alternative" "2")
> +                 (const_string "SI")
> +                (and (eq_attr "alternative" "3")
> +                     (match_test "!TARGET_AVX512DQ"))
> +                 (const_string "HI")
> +               ]
> +               (const_string "QI")))
>     ;; Potential partial reg stall on alternative 2.
>     (set (attr "preferred_for_speed")
>       (cond [(eq_attr "alternative" "2")
> @@ -10370,31 +10434,52 @@
>    "split_double_mode (DImode, &operands[0], 2, &operands[0], &operands[2]);")
>
>  (define_insn "*one_cmpl<mode>2_1"
> -  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm")
> -        (not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0")))]
> +  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,k")
> +        (not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0,k")))]
>    "ix86_unary_operator_ok (NOT, <MODE>mode, operands)"
> -  "not{<imodesuffix>}\t%0"
> -  [(set_attr "type" "negnot")
> +  "@
> +   not{<imodesuffix>}\t%0
> +   #"
> +  [(set (attr "isa")
> +        (cond [(eq_attr "alternative" "2")
> +                 (if_then_else (eq_attr "mode" "SI,DI")
> +                   (const_string "avx512bw")
> +                   (const_string "avx512f"))
> +              ]
> +              (const_string "*")))
> +   (set_attr "type" "negnot,msklog")
>     (set_attr "mode" "<MODE>")])
>
>  (define_insn "*one_cmplsi2_1_zext"
> -  [(set (match_operand:DI 0 "register_operand" "=r")
> +  [(set (match_operand:DI 0 "register_operand" "=r,k")
>          (zero_extend:DI
> -          (not:SI (match_operand:SI 1 "register_operand" "0"))))]
> +          (not:SI (match_operand:SI 1 "register_operand" "0,k"))))]
>    "TARGET_64BIT && ix86_unary_operator_ok (NOT, SImode, operands)"
> -  "not{l}\t%k0"
> -  [(set_attr "type" "negnot")
> -   (set_attr "mode" "SI")])
> +  "@
> +   not{l}\t%k0
> +   #"
> +  [(set_attr "isa" "x64,avx512bw")
> +   (set_attr "type" "negnot,msklog")
> +   (set_attr "mode" "SI,SI")])
>
>  (define_insn "*one_cmplqi2_1"
> -  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r")
> -        (not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0")))]
> +  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,k")
> +        (not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,k")))]
>    "ix86_unary_operator_ok (NOT, QImode, operands)"
>    "@
>     not{b}\t%0
> -   not{l}\t%k0"
> -  [(set_attr "type" "negnot")
> -   (set_attr "mode" "QI,SI")
> +   not{l}\t%k0
> +   #"
> +  [(set_attr "isa" "*,*,avx512f")
> +   (set_attr "type" "negnot,negnot,msklog")
> +   (set (attr "mode")
> +        (cond [(eq_attr "alternative" "1")
> +                 (const_string "SI")
> +                (and (eq_attr "alternative" "2")
> +                     (match_test "!TARGET_AVX512DQ"))
> +                 (const_string "HI")
> +               ]
> +               (const_string "QI")))
>     ;; Potential partial reg stall on alternative 1.
>     (set (attr "preferred_for_speed")
>       (cond [(eq_attr "alternative" "1")
> diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> index 07e69d555c0..dd1b31479f5 100644
> --- a/gcc/config/i386/predicates.md
> +++ b/gcc/config/i386/predicates.md
> @@ -87,6 +87,11 @@
>    (and (match_code "reg")
>         (match_test "REGNO (op) == FLAGS_REG")))
>
> +;; True if the operand is a MASK register.
> +(define_predicate "mask_reg_operand"
> +  (and (match_code "reg")
> +       (match_test "MASK_REGNO_P (REGNO (op))")))
> +
>  ;; Match a DI, SI, HI or QImode nonimmediate_operand.
>  (define_special_predicate "int_nonimmediate_operand"
>    (and (match_operand 0 "nonimmediate_operand")
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index b6348de67cb..4372a9fd785 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -1452,6 +1452,18 @@
>    "TARGET_AVX512F
>     && !(MEM_P (operands[0]) && MEM_P (operands[1]))")
>
> +(define_split
> +  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
> +        (any_logic:SWI1248_AVX512BW
> +          (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")
> +          (match_operand:SWI1248_AVX512BW 2 "mask_reg_operand")))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "TARGET_AVX512F && reload_completed"
> +  [(parallel
> +     [(set (match_dup 0)
> +           (any_logic:SWI1248_AVX512BW (match_dup 1) (match_dup 2)))
> +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> +
>  (define_insn "k<code><mode>"
>    [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
>          (any_logic:SWI1248_AVX512BW
> @@ -1474,6 +1486,21 @@
>             ]
>             (const_string "<MODE>")))])
>
> +(define_split
> +  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
> +        (and:SWI1248_AVX512BW
> +          (not:SWI1248_AVX512BW
> +            (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand"))
> +          (match_operand:SWI1248_AVX512BW 2 "mask_reg_operand")))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "TARGET_AVX512F && reload_completed"
> +  [(parallel
> +     [(set (match_dup 0)
> +           (and:SWI1248_AVX512BW
> +             (not:SWI1248_AVX512BW (match_dup 1))
> +             (match_dup 2)))
> +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> +
>  (define_insn "kandn<mode>"
>    [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
>          (and:SWI1248_AVX512BW
> @@ -1520,6 +1547,16 @@
>             ]
>             (const_string "<MODE>")))])
>
> +(define_split
> +  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
> +        (not:SWI1248_AVX512BW
> +          (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")))]
> +  "TARGET_AVX512F && reload_completed"
> +  [(parallel
> +     [(set (match_dup 0)
> +           (not:SWI1248_AVX512BW (match_dup 1)))
> +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> +
>  (define_insn "knot<mode>"
>    [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
>          (not:SWI1248_AVX512BW
> @@ -1541,6 +1578,28 @@
>             ]
>             (const_string "<MODE>")))])
>
> +(define_split
> +  [(set (match_operand:DI 0 "mask_reg_operand")
> +        (zero_extend:DI
> +          (not:DI (match_operand:SI 1 "mask_reg_operand"))))]
> +  "TARGET_AVX512BW && reload_completed"
> +  [(parallel
> +     [(set (match_dup 0)
> +           (zero_extend:DI
> +             (not:SI (match_dup 1))))
> +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> +
> +(define_insn "*knotsi_1_zext"
> +  [(set (match_operand:DI 0 "register_operand" "=k")
> +        (zero_extend:DI
> +          (not:SI (match_operand:SI 1 "register_operand" "k"))))
> +   (unspec [(const_int 0)] UNSPEC_MASKOP)]
> +  "TARGET_AVX512BW"
> +  "knotd\t{%1, %0|%0, %1}";
> +  [(set_attr "type" "msklog")
> +   (set_attr "prefix" "vex")
> +   (set_attr "mode" "SI")])
> +
>  (define_insn "kadd<mode>"
>    [(set (match_operand:SWI1248_AVX512BWDQ2 0 "register_operand" "=k")
>          (plus:SWI1248_AVX512BWDQ2
> diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
> b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
> index 94422f36010..46d9351f275 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
> @@ -1,6 +1,6 @@
>  /* { dg-do compile } */
>  /* { dg-options "-mavx512bw -O2" } */
> -/* { dg-final { scan-assembler-times "kunpckwd\[
> \\t\]+\[^\{\n\]*%k\[1-7\](?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "kunpckwd\[
> \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
>
>  #include <immintrin.h>
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
> b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
> index c68ad8cc1f7..fe13f4f33fc 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
> @@ -1,6 +1,6 @@
>  /* { dg-do compile } */
>  /* { dg-options "-mavx512bw -O2" } */
> -/* { dg-final { scan-assembler-times "kunpckwd\[
> \\t\]+\[^\{\n\]*%k\[1-7\](?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "kunpckwd\[
> \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
>
>  #include <immintrin.h>
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr88465.c
> b/gcc/testsuite/gcc.target/i386/avx512bw-pr88465.c
> new file mode 100644
> index 00000000000..8e34bf45365
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr88465.c
> @@ -0,0 +1,23 @@
> +/* PR target/88465 */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mavx512bw" } */
> +/* { dg-final { scan-assembler-times "kxor\[qd\]\[ \t]" 2 } } */
> +/* { dg-final { scan-assembler-times "kxnor\[dq\]\[ \t]" 2 } } */
> +
> +void
> +foo (void)
> +{
> +  unsigned int k = 0;
> +  __asm volatile ("" : : "k" (k));
> +  k = -1;
> +  __asm volatile ("" : : "k" (k));
> +}
> +
> +void
> +bar (void)
> +{
> +  unsigned long long k = 0;
> +  __asm volatile ("" : : "k" (k));
> +  k = -1;
> +  __asm volatile ("" : : "k" (k));
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
> b/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
> index 49817097e26..114e03ee93d 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-mavx512dq -O2" } */
> +/* { dg-options "-mavx512dq -mno-avx512bw -O2" } */
>  /* { dg-final { scan-assembler-times "kmovb\[
> \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
>
>  #include <immintrin.h>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
> b/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
> index 7bb34d34d8d..79d37394b36 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-mavx512f -O2" } */
> +/* { dg-options "-mavx512f -mno-avx512bw -O2" } */
>  /* { dg-final { scan-assembler-times "kmovw\[
> \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
>
>  #include <immintrin.h>
> diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
> b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
> new file mode 100644
> index 00000000000..61f71ab8b23
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
> @@ -0,0 +1,178 @@
> +/* PR target/88808  */
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512bw -mno-avx512dq -O2" } */
> +
> +#include <immintrin.h>
> +__m512i
> +foo_orq (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
> +  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
> +  return _mm512_mask_add_epi8 (c, m1 | m2, a, d);
> +}
> +
> +/* { dg-final { scan-assembler-times "korq" "1" { target { ! ia32 } } } } */
> +
> +__m512i
> +foo_ord (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
> +  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
> +  return _mm512_mask_add_epi16 (c, m1 | m2, a, d);
> +}
> +
> +/* { dg-final { scan-assembler-times "kord" "1" } }  */
> +
> +__m512i
> +foo_orw (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
> +  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
> +  return _mm512_mask_add_epi32 (c, m1 | m2, a, d);
> +}
> +
> +__m512i
> +foo_orb (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
> +  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
> +  return _mm512_mask_add_epi64 (c, m1 | m2, a, d);
> +}
> +
> +/* { dg-final { scan-assembler-times "korw" "2" } }  */
> +
> +__m512i
> +foo_xorq (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
> +  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
> +  return _mm512_mask_add_epi8 (c, m1 ^ m2, a, d);
> +}
> +
> +/* { dg-final { scan-assembler-times "kxorq" "1" { target { ! ia32 } } } }  */
> +
> +__m512i
> +foo_xord (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
> +  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
> +  return _mm512_mask_add_epi16 (c, m1 ^ m2, a, d);
> +}
> +
> +/* { dg-final { scan-assembler-times "kxord" "1" } }  */
> +
> +__m512i
> +foo_xorw (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
> +  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
> +  return _mm512_mask_add_epi32 (c, m1 ^ m2, a, d);
> +}
> +
> +__m512i
> +foo_xorb (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
> +  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
> +  return _mm512_mask_add_epi64 (c, m1 ^ m2, a, d);
> +}
> +
> +/* { dg-final { scan-assembler-times "korw" "2" } }  */
> +
> +__m512i
> +foo_andq (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
> +  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
> +  return _mm512_mask_add_epi8 (c, m1 & m2, a, d);
> +}
> +
> +__m512i
> +foo_andd (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
> +  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
> +  return _mm512_mask_add_epi16 (c, m1 & m2, a, d);
> +}
> +
> +__m512i
> +foo_andw (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
> +  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
> +  return _mm512_mask_add_epi32 (c, m1 & m2, a, d);
> +}
> +
> +__m512i
> +foo_andb (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
> +  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
> +  return _mm512_mask_add_epi64 (c, m1 & m2, a, d);
> +}
> +
> +__m512i
> +foo_andnq (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
> +  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
> +  return _mm512_mask_add_epi8 (c, m1 & ~m2, a, d);
> +}
> +
> +__m512i
> +foo_andnd (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
> +  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
> +  return _mm512_mask_add_epi16 (c, m1 & ~m2, a, d);
> +}
> +
> +__m512i
> +foo_andnw (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
> +  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
> +  return _mm512_mask_add_epi32 (c, m1 & ~m2, a, d);
> +}
> +
> +__m512i
> +foo_andnb (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
> +  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
> +  return _mm512_mask_add_epi64 (c, m1 & ~m2, a, d);
> +}
> +
> +__m512i
> +foo_notq (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
> +  return _mm512_mask_add_epi8 (c, ~m1, a, d);
> +}
> +
> +/* { dg-final { scan-assembler-times "knotq" "2" { target { ! ia32 } } } }  */
> +
> +__m512i
> +foo_notd (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
> +  return _mm512_mask_add_epi16 (c, ~m1, a, d);
> +}
> +
> +/* { dg-final { scan-assembler-times "knotd" "2" { target { ! ia32 } } } }  */
> +
> +__m512i
> +foo_notw (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
> +  return _mm512_mask_add_epi32 (c, ~m1, a, d);
> +}
> +
> +__m512i
> +foo_notb (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
> +  return _mm512_mask_add_epi64 (c, ~m1, a, d);
> +}
> +
> +/* { dg-final { scan-assembler-times "knotw" "4" } }  */
> diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
> b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
> new file mode 100644
> index 00000000000..850f0b42652
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
> @@ -0,0 +1,8 @@
> +/* PR target/88808  */
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512bw -mavx512dq -O2" } */
> +/* { dg-final { scan-assembler-times "knotb" "2" } }  */
> +/* { dg-final { scan-assembler-times "korb" "1" } }  */
> +/* { dg-final { scan-assembler-times "kxorb" "1" } }  */
> +#include "bitwise_mask_op-1.c"
> +
> diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
> b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
> new file mode 100644
> index 00000000000..18bf4f0d768
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
> @@ -0,0 +1,44 @@
> +/* PR target/88808  */
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512bw -mavx512dq -O2" } */
> +
> +#include <immintrin.h>
> +volatile __mmask8 foo;
> +void
> +foo_orb (__m512i a, __m512i b)
> +{
> +  __mmask8 m1 = _mm512_cmp_epi64_mask (a, b, 2);
> +  __mmask8 m2 = _mm512_cmp_epi64_mask (a, b, 4);
> +  foo = m1 | m2;
> +}
> +
> +/* { dg-final { scan-assembler-times "korb\[\t \]" "1" } }  */
> +
> +void
> +foo_xorb (__m512i a, __m512i b)
> +{
> +  __mmask8 m1 = _mm512_cmp_epi64_mask (a, b, 2);
> +  __mmask8 m2 = _mm512_cmp_epi64_mask (a, b, 4);
> +  foo = m1 ^ m2;
> +}
> +
> +/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" } }  */
> +
> +void
> +foo_andb (__m512i a, __m512i b)
> +{
> +  __mmask8 m1 = _mm512_cmp_epi64_mask (a, b, 2);
> +  __mmask8 m2 = _mm512_cmp_epi64_mask (a, b, 4);
> +  foo = m1 & m2;
> +}
> +
> +void
> +foo_andnb (__m512i a, __m512i b)
> +{
> +  __mmask8 m1 = _mm512_cmp_epi64_mask (a, b, 2);
> +  __mmask8 m2 = _mm512_cmp_epi64_mask (a, b, 4);
> +  foo = m1 & ~m2;
> +}
> +
> +/* { dg-final { scan-assembler-times "knotb\[\t \]" "1" } }  */
> +/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4"} }  */
> --
> 2.18.1
>
>
> --
> BR,
> Hongtao
Hongtao Liu Aug. 20, 2020, 7:45 a.m. UTC | #7
On Thu, Aug 20, 2020 at 3:40 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Thu, Aug 20, 2020 at 9:31 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Thu, Aug 20, 2020 at 3:24 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Wed, Aug 19, 2020 at 3:05 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >
> > > > On Wed, Aug 19, 2020 at 4:25 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > >
> > > > > On Mon, Aug 17, 2020 at 6:08 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > > >
> > > > > > On Fri, Aug 14, 2020 at 10:26 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > > > >
> > > > > > > Enable operator or/xor/and/andn/not for mask register, kxnor is not
> > > > > > > enabled since there's no corresponding instruction for general
> > > > > > > registers.
> > > > > > >
> > > > > > > gcc/
> > > > > > >         PR target/88808
> > > > > > >         * config/i386/i386.md: (*movsi_internal): Adjust constraints
> > > > > > >         for mask registers.
> > > > > > >         (*movhi_internal): Ditto.
> > > > > > >         (*movqi_internal): Ditto.
> > > > > > >         (*anddi_1): Support mask register operations
> > > > > > >         (*and<mode>_1): Ditto.
> > > > > > >         (*andqi_1): Ditto.
> > > > > > >         (*andn<mode>_1): Ditto.
> > > > > > >         (*<code><mode>_1): Ditto.
> > > > > > >         (*<code>qi_1): Ditto.
> > > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > > >
> > > > > > > gcc/testsuite/
> > > > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > > > >
> > > > > > index 74d207c3711..e8ad79d1b0a 100644
> > > > > > --- a/gcc/config/i386/i386.md
> > > > > > +++ b/gcc/config/i386/i386.md
> > > > > > @@ -2294,7 +2294,7 @@
> > > > > >
> > > > > >  (define_insn "*movsi_internal"
> > > > > >    [(set (match_operand:SI 0 "nonimmediate_operand"
> > > > > > -    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
> > > > > > +    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,k")
> > > > > >      (match_operand:SI 1 "general_operand"
> > > > > >      "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
> > > > > >    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> > > > > >
> > > > > > I'd rather see *k everywhere, also with *movqi_internal and
> > > > > > *movhi_internal patterns. The "*" means that the allocator won't
> > > > > > allocate a mask register by default, but it will be used to optimize
> > > > > > moves. With the above change, you are risking that during integer
> > > > > > register pressure, the register allocator will allocate zero to a mask
> > > > > > register, and later "optimize" the move with a direct maskreg-intreg
> > > > > > move.
> > > > > >
> > > > > > The current strategy is that only general registers get allocated for
> > > > > > integer modes. Let's keep it this way for now.
> > > > > >
> > > > >
> > > > > Yes,  though it would fail gcc.target/i386/avx512dq-pr88465.c and
> > > > > gcc.target/i386/avx512f-pr88465.c, i think it's more reasonable not to
> > > > > move zero into mask register directly.
> > > >
> > > > Although it would be nice if the register allocator was smart enough,
> > > > the current strategy is to introduce peephole2 patterns to fix these
> > > > problems, similar to [1]. These peepholes can be introduced in a
> > > > follow-up patch.
> > > >
> > > > [1] https://gcc.gnu.org/pipermail/gcc-patches/2020-August/551744.html
> > > >
> > >
> > > peephole2 added.
> > >
> > > > > > Otherwise, the patchset LGTM, but please test the suggested changes and repost.
> > > > > >
> > > > > > BTW: Do you plan to remove mask operations from sse.md? ATM, they are
> > > > > > used to distinguish mask operations, generated from builtins from
> > > > > > generic operations, so I'd like to keep them for a while. The drawback
> > > > > > is, that they are not combined with other operations, but at the end
> > > > > > of the day, this is what the programmer asked for by using builtins.
> > > > >
> > > > > Agree, I prefer to keep them.
> > > >
> > > > Thinking some more about the approach, it looks to me that the optimal
> > > > solution is a post-reload splitter that would convert "generic"
> > > > patterns to mask operations from sse.md. The mask operations don't set
> > > > flags, so we can substantially improve post reload scheduling of these
> > > > instructions by removing flags clobber.
> > > >
> > > > So, simply add "#" to relevant alternatives of logic patterns and add
> > > > something like:
> > > >
> > > > --cut here--
> > > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > > > index 41c6dbfa668..ad49bdc7583 100644
> > > > --- a/gcc/config/i386/sse.md
> > > > +++ b/gcc/config/i386/sse.md
> > > > @@ -1470,6 +1470,18 @@
> > > >            ]
> > > >            (const_string "<MODE>")))])
> > > >
> > > > +(define_split
> > > > +  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
> > > > +       (any_logic:SWI1248_AVX512BW
> > > > +         (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")
> > > > +         (match_operand:SWI1248_AVX512BW 2 "mask_reg_operand")))
> > > > +   (clobber (reg:CC FLAGS_REG))]
> > > > +  "TARGET_AVX512F && reload_completed"
> > > > +  [(parallel
> > > > +     [(set (match_dup 0)
> > > > +          (any_logic:SWI1248_AVX512BW (match_dup 1) (match_dup 2)))
> > > > +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> > > > +
> > > >  (define_insn "kandn<mode>"
> > > >    [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
> > > >         (and:SWI1248_AVX512BW
> > > > --cut here--
> > > >
> > > > and similar for kandn and knot in sse.md. You will have to add
> > > > mask_reg_operand predicate, see e.g. sse_reg_operand in predicates.md
> > > > for example.
> > > >
> > > > We don't lose anything, because all important transformations,
> > > > propagations and simplifications with these patterns happen before
> > > > reload.
> > >
> > > define_splits are added for those bitwise operations.
> > >
> > > >
> > > > Uros.
> > >
> > > Also add bellow part which will pass gcc.target/i386/bitwise_mask_op-3.c
> > >
> > > -     must go into Q_REGS.  */
> > > +     must go into Q_REGS or ALL_MASK_REGS.  */
> > >    if (GET_MODE (x) == QImode && !CONSTANT_P (x))
> > >      {
> > >        if (Q_CLASS_P (regclass))
> > >         return regclass;
> > >        else if (reg_class_subset_p (Q_REGS, regclass))
> > >         return Q_REGS;
> > > +      else if (MASK_CLASS_P (regclass))
> > > +       return regclass;
> > >        else
> > >         return NO_REGS;
> > >
> > >
> > > Update patch.
> > >
> > >
> > > --
> > > BR,
> > > Hongtao
> >
> > networking is slow to send out mail with attachment, so i copy the
> > patch into mail.
> >
> > gcc/
> >         PR target/88808
> >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> >         QImode data go into mask registers.
> >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> >         for mask registers.
> >         (*movqi_internal): Ditto.
> >         (*anddi_1): Support mask register operations
> >         (*and<mode>_1): Ditto.
> >         (*andqi_1): Ditto.
> >         (*andn<mode>_1): Ditto.
> >         (*<code><mode>_1): Ditto.
> >         (*<code>qi_1): Ditto.
> >         (*one_cmpl<mode>2_1): Ditto.
> >         (*one_cmplsi2_1_zext): Ditto.
> >         (*one_cmplqi2_1): Ditto.
> >         (define_peephole2): Move constant 0/-1 directly into mask
> >         registers.
> >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> >         * config/i386/sse.md (define_split): Add post-reload splitters
> >         that would convert "generic" patterns to mask patterns.
> >         (*knotsi_1_zext): New define_insn.
> >
> > gcc/testsuite/
> >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
>
> A little nit, please put new splitters after the instruction pattern.
>
> OK for the whole patch set with the above change,
>

Yes, thanks for the review.

> Thanks,
> Uros.
>
> > ---
> >  gcc/config/i386/i386.c                        |   4 +-
> >  gcc/config/i386/i386.md                       | 209 ++++++++++++------
> >  gcc/config/i386/predicates.md                 |   5 +
> >  gcc/config/i386/sse.md                        |  59 +++++
> >  .../gcc.target/i386/avx512bw-kunpckwd-1.c     |   2 +-
> >  .../gcc.target/i386/avx512bw-kunpckwd-3.c     |   2 +-
> >  .../gcc.target/i386/avx512bw-pr88465.c        |  23 ++
> >  .../gcc.target/i386/avx512dq-kmovb-5.c        |   2 +-
> >  .../gcc.target/i386/avx512f-kmovw-5.c         |   2 +-
> >  .../gcc.target/i386/bitwise_mask_op-1.c       | 178 +++++++++++++++
> >  .../gcc.target/i386/bitwise_mask_op-2.c       |   8 +
> >  .../gcc.target/i386/bitwise_mask_op-3.c       |  44 ++++
> >  12 files changed, 471 insertions(+), 67 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-pr88465.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
> >
> > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > index d71d6d55be6..e8a2182ceb0 100644
> > --- a/gcc/config/i386/i386.c
> > +++ b/gcc/config/i386/i386.c
> > @@ -18407,13 +18407,15 @@ ix86_preferred_reload_class (rtx x,
> > reg_class_t regclass)
> >      return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
> >
> >    /* QImode constants are easy to load, but non-constant QImode data
> > -     must go into Q_REGS.  */
> > +     must go into Q_REGS or ALL_MASK_REGS.  */
> >    if (GET_MODE (x) == QImode && !CONSTANT_P (x))
> >      {
> >        if (Q_CLASS_P (regclass))
> >          return regclass;
> >        else if (reg_class_subset_p (Q_REGS, regclass))
> >          return Q_REGS;
> > +      else if (MASK_CLASS_P (regclass))
> > +        return regclass;
> >        else
> >          return NO_REGS;
> >      }
> > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> > index 3a15941c3e8..676525fbc1f 100644
> > --- a/gcc/config/i386/i386.md
> > +++ b/gcc/config/i386/i386.md
> > @@ -2403,8 +2403,8 @@
> >             (symbol_ref "true")))])
> >
> >  (define_insn "*movhi_internal"
> > -  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,k,k ,r,m,k")
> > -        (match_operand:HI 1 "general_operand"      "r
> > ,rn,rm,rn,r,km,k,k,CBC"))]
> > +  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,*k,*k
> > ,*r,*m,*k")
> > +        (match_operand:HI 1 "general_operand"      "r
> > ,rn,rm,rn,*r,*km,*k,*k,CBC"))]
> >    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> >  {
> >    switch (get_attr_type (insn))
> > @@ -2491,9 +2491,9 @@
> >
> >  (define_insn "*movqi_internal"
> >    [(set (match_operand:QI 0 "nonimmediate_operand"
> > -                        "=Q,R,r,q,q,r,r ,?r,m ,k,k,r,m,k,k,k")
> > +                        "=Q,R,r,q,q,r,r ,?r,m ,*k,*k,*r,*m,*k,*k,*k")
> >          (match_operand:QI 1 "general_operand"
> > -                        "Q ,R,r,n,m,q,rn, m,qn,r,k,k,k,m,C,BC"))]
> > +                        "Q ,R,r,n,m,q,rn, m,qn,*r,*k,*k,*k,*m,C,BC"))]
> >    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> >  {
> >    char buf[128];
> > @@ -2624,6 +2624,19 @@
> >             ]
> >             (const_string "QI")))])
> >
> > +/* Reload dislikes loading 0/-1 directly into mask registers.
> > +   Try to tidy things up here.  */
> > +(define_peephole2
> > +  [(set (match_operand:SWI 0 "general_reg_operand")
> > +        (match_operand:SWI 1 "immediate_operand"))
> > +   (set (match_operand:SWI 2 "mask_reg_operand")
> > +        (match_dup 0))]
> > +  "peep2_reg_dead_p (2, operands[0])
> > +   && (const0_operand (operands[1], <MODE>mode)
> > +       || (constm1_operand (operands[1], <MODE>mode)
> > +           && (<MODE_SIZE> > 1 || TARGET_AVX512DQ)))"
> > +  [(set (match_dup 2) (match_dup 1))])
> > +
> >  ;; Stores and loads of ax to arbitrary constant address.
> >  ;; We fake an second form of instruction to force reload to load address
> >  ;; into register when rax is not available
> > @@ -9044,19 +9057,21 @@
> >  })
> >
> >  (define_insn "*anddi_1"
> > -  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r")
> > +  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r,k")
> >          (and:DI
> > -         (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm")
> > -         (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L")))
> > +         (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm,k")
> > +         (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L,k")))
> >     (clobber (reg:CC FLAGS_REG))]
> >    "TARGET_64BIT && ix86_binary_operator_ok (AND, DImode, operands)"
> >    "@
> >     and{l}\t{%k2, %k0|%k0, %k2}
> >     and{q}\t{%2, %0|%0, %2}
> >     and{q}\t{%2, %0|%0, %2}
> > +   #
> >     #"
> > -  [(set_attr "type" "alu,alu,alu,imovx")
> > -   (set_attr "length_immediate" "*,*,*,0")
> > +  [(set_attr "isa" "x64,x64,x64,x64,avx512bw")
> > +   (set_attr "type" "alu,alu,alu,imovx,msklog")
> > +   (set_attr "length_immediate" "*,*,*,0,*")
> >     (set (attr "prefix_rex")
> >       (if_then_else
> >         (and (eq_attr "type" "imovx")
> > @@ -9064,7 +9079,7 @@
> >                   (match_operand 1 "ext_QIreg_operand")))
> >         (const_string "1")
> >         (const_string "*")))
> > -   (set_attr "mode" "SI,DI,DI,SI")])
> > +   (set_attr "mode" "SI,DI,DI,SI,DI")])
> >
> >  (define_insn_and_split "*anddi_1_btr"
> >    [(set (match_operand:DI 0 "nonimmediate_operand" "=rm")
> > @@ -9130,17 +9145,25 @@
> >     (set_attr "mode" "SI")])
> >
> >  (define_insn "*and<mode>_1"
> > -  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya")
> > -        (and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm")
> > -                   (match_operand:SWI24 2 "<general_operand>" "r<i>,m,L")))
> > +  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya,k")
> > +        (and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm,k")
> > +                   (match_operand:SWI24 2 "<general_operand>" "r<i>,m,L,k")))
> >     (clobber (reg:CC FLAGS_REG))]
> >    "ix86_binary_operator_ok (AND, <MODE>mode, operands)"
> >    "@
> >     and{<imodesuffix>}\t{%2, %0|%0, %2}
> >     and{<imodesuffix>}\t{%2, %0|%0, %2}
> > +   #
> >     #"
> > -  [(set_attr "type" "alu,alu,imovx")
> > -   (set_attr "length_immediate" "*,*,0")
> > +  [(set (attr "isa")
> > +        (cond [(eq_attr "alternative" "3")
> > +                 (if_then_else (eq_attr "mode" "SI")
> > +                   (const_string "avx512bw")
> > +                   (const_string "avx512f"))
> > +              ]
> > +              (const_string "*")))
> > +   (set_attr "type" "alu,alu,imovx,msklog")
> > +   (set_attr "length_immediate" "*,*,0,*")
> >     (set (attr "prefix_rex")
> >       (if_then_else
> >         (and (eq_attr "type" "imovx")
> > @@ -9148,20 +9171,28 @@
> >                   (match_operand 1 "ext_QIreg_operand")))
> >         (const_string "1")
> >         (const_string "*")))
> > -   (set_attr "mode" "<MODE>,<MODE>,SI")])
> > +   (set_attr "mode" "<MODE>,<MODE>,SI,<MODE>")])
> >
> >  (define_insn "*andqi_1"
> > -  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r")
> > -        (and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0")
> > -                (match_operand:QI 2 "general_operand" "qn,m,rn")))
> > +  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k")
> > +        (and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
> > +                (match_operand:QI 2 "general_operand" "qn,m,rn,k")))
> >     (clobber (reg:CC FLAGS_REG))]
> >    "ix86_binary_operator_ok (AND, QImode, operands)"
> >    "@
> >     and{b}\t{%2, %0|%0, %2}
> >     and{b}\t{%2, %0|%0, %2}
> > -   and{l}\t{%k2, %k0|%k0, %k2}"
> > -  [(set_attr "type" "alu")
> > -   (set_attr "mode" "QI,QI,SI")
> > +   and{l}\t{%k2, %k0|%k0, %k2}
> > +   #"
> > +  [(set_attr "type" "alu,alu,alu,msklog")
> > +   (set (attr "mode")
> > +        (cond [(eq_attr "alternative" "2")
> > +                 (const_string "SI")
> > +                (and (eq_attr "alternative" "3")
> > +                     (match_test "!TARGET_AVX512DQ"))
> > +                 (const_string "HI")
> > +               ]
> > +               (const_string "QI")))
> >     ;; Potential partial reg stall on alternative 2.
> >     (set (attr "preferred_for_speed")
> >       (cond [(eq_attr "alternative" "2")
> > @@ -9539,28 +9570,42 @@
> >  })
> >
> >  (define_insn "*andn<mode>_1"
> > -  [(set (match_operand:SWI48 0 "register_operand" "=r,r")
> > +  [(set (match_operand:SWI48 0 "register_operand" "=r,r,k")
> >          (and:SWI48
> > -          (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r"))
> > -          (match_operand:SWI48 2 "nonimmediate_operand" "r,m")))
> > +          (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r,k"))
> > +          (match_operand:SWI48 2 "nonimmediate_operand" "r,m,k")))
> >     (clobber (reg:CC FLAGS_REG))]
> > -  "TARGET_BMI"
> > -  "andn\t{%2, %1, %0|%0, %1, %2}"
> > -  [(set_attr "type" "bitmanip")
> > -   (set_attr "btver2_decode" "direct, double")
> > +  "TARGET_BMI || TARGET_AVX512BW"
> > +  "@
> > +   andn\t{%2, %1, %0|%0, %1, %2}
> > +   andn\t{%2, %1, %0|%0, %1, %2}
> > +   #"
> > +  [(set_attr "isa" "bmi,bmi,avx512bw")
> > +   (set_attr "type" "bitmanip,bitmanip,msklog")
> > +   (set_attr "btver2_decode" "direct, double,*")
> >     (set_attr "mode" "<MODE>")])
> >
> >  (define_insn "*andn<mode>_1"
> > -  [(set (match_operand:SWI12 0 "register_operand" "=r")
> > +  [(set (match_operand:SWI12 0 "register_operand" "=r,k")
> >          (and:SWI12
> > -          (not:SWI12 (match_operand:SWI12 1 "register_operand" "r"))
> > -          (match_operand:SWI12 2 "register_operand" "r")))
> > +          (not:SWI12 (match_operand:SWI12 1 "register_operand" "r,k"))
> > +          (match_operand:SWI12 2 "register_operand" "r,k")))
> >     (clobber (reg:CC FLAGS_REG))]
> > -  "TARGET_BMI"
> > -  "andn\t{%k2, %k1, %k0|%k0, %k1, %k2}"
> > -  [(set_attr "type" "bitmanip")
> > -   (set_attr "btver2_decode" "direct")
> > -   (set_attr "mode" "SI")])
> > +  "TARGET_BMI || TARGET_AVX512BW"
> > +  "@
> > +   andn\t{%k2, %k1, %k0|%k0, %k1, %k2}
> > +   #"
> > +  [(set_attr "isa" "bmi,avx512f")
> > +   (set_attr "type" "bitmanip,msklog")
> > +   (set_attr "btver2_decode" "direct,*")
> > +   (set (attr "mode")
> > +        (cond [(eq_attr "alternative" "0")
> > +                 (const_string "SI")
> > +               (and (eq_attr "alternative" "1")
> > +                    (match_test "!TARGET_AVX512DQ"))
> > +                  (const_string "HI")
> > +              ]
> > +              (const_string "<MODE>")))])
> >
> >  (define_insn "*andn_<mode>_ccno"
> >    [(set (reg FLAGS_REG)
> > @@ -9631,14 +9676,24 @@
> >  })
> >
> >  (define_insn "*<code><mode>_1"
> > -  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r")
> > +  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,k")
> >          (any_or:SWI248
> > -         (match_operand:SWI248 1 "nonimmediate_operand" "%0,0")
> > -         (match_operand:SWI248 2 "<general_operand>" "r<i>,m")))
> > +         (match_operand:SWI248 1 "nonimmediate_operand" "%0,0,k")
> > +         (match_operand:SWI248 2 "<general_operand>" "r<i>,m,k")))
> >     (clobber (reg:CC FLAGS_REG))]
> >    "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
> > -  "<logic>{<imodesuffix>}\t{%2, %0|%0, %2}"
> > -  [(set_attr "type" "alu")
> > +  "@
> > +   <logic>{<imodesuffix>}\t{%2, %0|%0, %2}
> > +   <logic>{<imodesuffix>}\t{%2, %0|%0, %2}
> > +   #"
> > +  [(set (attr "isa")
> > +        (cond [(eq_attr "alternative" "2")
> > +                 (if_then_else (eq_attr "mode" "SI,DI")
> > +                   (const_string "avx512bw")
> > +                   (const_string "avx512f"))
> > +              ]
> > +              (const_string "*")))
> > +   (set_attr "type" "alu, alu, msklog")
> >     (set_attr "mode" "<MODE>")])
> >
> >  (define_insn_and_split "*iordi_1_bts"
> > @@ -9711,17 +9766,26 @@
> >     (set_attr "mode" "SI")])
> >
> >  (define_insn "*<code>qi_1"
> > -  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r")
> > -        (any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0")
> > -                   (match_operand:QI 2 "general_operand" "qn,m,rn")))
> > +  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k")
> > +        (any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
> > +                   (match_operand:QI 2 "general_operand" "qn,m,rn,k")))
> >     (clobber (reg:CC FLAGS_REG))]
> >    "ix86_binary_operator_ok (<CODE>, QImode, operands)"
> >    "@
> >     <logic>{b}\t{%2, %0|%0, %2}
> >     <logic>{b}\t{%2, %0|%0, %2}
> > -   <logic>{l}\t{%k2, %k0|%k0, %k2}"
> > -  [(set_attr "type" "alu")
> > -   (set_attr "mode" "QI,QI,SI")
> > +   <logic>{l}\t{%k2, %k0|%k0, %k2}
> > +   #"
> > +  [(set_attr "isa" "*,*,*,avx512f")
> > +   (set_attr "type" "alu,alu,alu,msklog")
> > +   (set (attr "mode")
> > +        (cond [(eq_attr "alternative" "2")
> > +                 (const_string "SI")
> > +                (and (eq_attr "alternative" "3")
> > +                     (match_test "!TARGET_AVX512DQ"))
> > +                 (const_string "HI")
> > +               ]
> > +               (const_string "QI")))
> >     ;; Potential partial reg stall on alternative 2.
> >     (set (attr "preferred_for_speed")
> >       (cond [(eq_attr "alternative" "2")
> > @@ -10370,31 +10434,52 @@
> >    "split_double_mode (DImode, &operands[0], 2, &operands[0], &operands[2]);")
> >
> >  (define_insn "*one_cmpl<mode>2_1"
> > -  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm")
> > -        (not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0")))]
> > +  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,k")
> > +        (not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0,k")))]
> >    "ix86_unary_operator_ok (NOT, <MODE>mode, operands)"
> > -  "not{<imodesuffix>}\t%0"
> > -  [(set_attr "type" "negnot")
> > +  "@
> > +   not{<imodesuffix>}\t%0
> > +   #"
> > +  [(set (attr "isa")
> > +        (cond [(eq_attr "alternative" "2")
> > +                 (if_then_else (eq_attr "mode" "SI,DI")
> > +                   (const_string "avx512bw")
> > +                   (const_string "avx512f"))
> > +              ]
> > +              (const_string "*")))
> > +   (set_attr "type" "negnot,msklog")
> >     (set_attr "mode" "<MODE>")])
> >
> >  (define_insn "*one_cmplsi2_1_zext"
> > -  [(set (match_operand:DI 0 "register_operand" "=r")
> > +  [(set (match_operand:DI 0 "register_operand" "=r,k")
> >          (zero_extend:DI
> > -          (not:SI (match_operand:SI 1 "register_operand" "0"))))]
> > +          (not:SI (match_operand:SI 1 "register_operand" "0,k"))))]
> >    "TARGET_64BIT && ix86_unary_operator_ok (NOT, SImode, operands)"
> > -  "not{l}\t%k0"
> > -  [(set_attr "type" "negnot")
> > -   (set_attr "mode" "SI")])
> > +  "@
> > +   not{l}\t%k0
> > +   #"
> > +  [(set_attr "isa" "x64,avx512bw")
> > +   (set_attr "type" "negnot,msklog")
> > +   (set_attr "mode" "SI,SI")])
> >
> >  (define_insn "*one_cmplqi2_1"
> > -  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r")
> > -        (not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0")))]
> > +  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,k")
> > +        (not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,k")))]
> >    "ix86_unary_operator_ok (NOT, QImode, operands)"
> >    "@
> >     not{b}\t%0
> > -   not{l}\t%k0"
> > -  [(set_attr "type" "negnot")
> > -   (set_attr "mode" "QI,SI")
> > +   not{l}\t%k0
> > +   #"
> > +  [(set_attr "isa" "*,*,avx512f")
> > +   (set_attr "type" "negnot,negnot,msklog")
> > +   (set (attr "mode")
> > +        (cond [(eq_attr "alternative" "1")
> > +                 (const_string "SI")
> > +                (and (eq_attr "alternative" "2")
> > +                     (match_test "!TARGET_AVX512DQ"))
> > +                 (const_string "HI")
> > +               ]
> > +               (const_string "QI")))
> >     ;; Potential partial reg stall on alternative 1.
> >     (set (attr "preferred_for_speed")
> >       (cond [(eq_attr "alternative" "1")
> > diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> > index 07e69d555c0..dd1b31479f5 100644
> > --- a/gcc/config/i386/predicates.md
> > +++ b/gcc/config/i386/predicates.md
> > @@ -87,6 +87,11 @@
> >    (and (match_code "reg")
> >         (match_test "REGNO (op) == FLAGS_REG")))
> >
> > +;; True if the operand is a MASK register.
> > +(define_predicate "mask_reg_operand"
> > +  (and (match_code "reg")
> > +       (match_test "MASK_REGNO_P (REGNO (op))")))
> > +
> >  ;; Match a DI, SI, HI or QImode nonimmediate_operand.
> >  (define_special_predicate "int_nonimmediate_operand"
> >    (and (match_operand 0 "nonimmediate_operand")
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index b6348de67cb..4372a9fd785 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -1452,6 +1452,18 @@
> >    "TARGET_AVX512F
> >     && !(MEM_P (operands[0]) && MEM_P (operands[1]))")
> >
> > +(define_split
> > +  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
> > +        (any_logic:SWI1248_AVX512BW
> > +          (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")
> > +          (match_operand:SWI1248_AVX512BW 2 "mask_reg_operand")))
> > +   (clobber (reg:CC FLAGS_REG))]
> > +  "TARGET_AVX512F && reload_completed"
> > +  [(parallel
> > +     [(set (match_dup 0)
> > +           (any_logic:SWI1248_AVX512BW (match_dup 1) (match_dup 2)))
> > +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> > +
> >  (define_insn "k<code><mode>"
> >    [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
> >          (any_logic:SWI1248_AVX512BW
> > @@ -1474,6 +1486,21 @@
> >             ]
> >             (const_string "<MODE>")))])
> >
> > +(define_split
> > +  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
> > +        (and:SWI1248_AVX512BW
> > +          (not:SWI1248_AVX512BW
> > +            (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand"))
> > +          (match_operand:SWI1248_AVX512BW 2 "mask_reg_operand")))
> > +   (clobber (reg:CC FLAGS_REG))]
> > +  "TARGET_AVX512F && reload_completed"
> > +  [(parallel
> > +     [(set (match_dup 0)
> > +           (and:SWI1248_AVX512BW
> > +             (not:SWI1248_AVX512BW (match_dup 1))
> > +             (match_dup 2)))
> > +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> > +
> >  (define_insn "kandn<mode>"
> >    [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
> >          (and:SWI1248_AVX512BW
> > @@ -1520,6 +1547,16 @@
> >             ]
> >             (const_string "<MODE>")))])
> >
> > +(define_split
> > +  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
> > +        (not:SWI1248_AVX512BW
> > +          (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")))]
> > +  "TARGET_AVX512F && reload_completed"
> > +  [(parallel
> > +     [(set (match_dup 0)
> > +           (not:SWI1248_AVX512BW (match_dup 1)))
> > +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> > +
> >  (define_insn "knot<mode>"
> >    [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
> >          (not:SWI1248_AVX512BW
> > @@ -1541,6 +1578,28 @@
> >             ]
> >             (const_string "<MODE>")))])
> >
> > +(define_split
> > +  [(set (match_operand:DI 0 "mask_reg_operand")
> > +        (zero_extend:DI
> > +          (not:DI (match_operand:SI 1 "mask_reg_operand"))))]
> > +  "TARGET_AVX512BW && reload_completed"
> > +  [(parallel
> > +     [(set (match_dup 0)
> > +           (zero_extend:DI
> > +             (not:SI (match_dup 1))))
> > +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> > +
> > +(define_insn "*knotsi_1_zext"
> > +  [(set (match_operand:DI 0 "register_operand" "=k")
> > +        (zero_extend:DI
> > +          (not:SI (match_operand:SI 1 "register_operand" "k"))))
> > +   (unspec [(const_int 0)] UNSPEC_MASKOP)]
> > +  "TARGET_AVX512BW"
> > +  "knotd\t{%1, %0|%0, %1}";
> > +  [(set_attr "type" "msklog")
> > +   (set_attr "prefix" "vex")
> > +   (set_attr "mode" "SI")])
> > +
> >  (define_insn "kadd<mode>"
> >    [(set (match_operand:SWI1248_AVX512BWDQ2 0 "register_operand" "=k")
> >          (plus:SWI1248_AVX512BWDQ2
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
> > b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
> > index 94422f36010..46d9351f275 100644
> > --- a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
> > +++ b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
> > @@ -1,6 +1,6 @@
> >  /* { dg-do compile } */
> >  /* { dg-options "-mavx512bw -O2" } */
> > -/* { dg-final { scan-assembler-times "kunpckwd\[
> > \\t\]+\[^\{\n\]*%k\[1-7\](?:\n|\[ \\t\]+#)" 1 } } */
> > +/* { dg-final { scan-assembler-times "kunpckwd\[
> > \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
> >
> >  #include <immintrin.h>
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
> > b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
> > index c68ad8cc1f7..fe13f4f33fc 100644
> > --- a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
> > +++ b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
> > @@ -1,6 +1,6 @@
> >  /* { dg-do compile } */
> >  /* { dg-options "-mavx512bw -O2" } */
> > -/* { dg-final { scan-assembler-times "kunpckwd\[
> > \\t\]+\[^\{\n\]*%k\[1-7\](?:\n|\[ \\t\]+#)" 1 } } */
> > +/* { dg-final { scan-assembler-times "kunpckwd\[
> > \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
> >
> >  #include <immintrin.h>
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr88465.c
> > b/gcc/testsuite/gcc.target/i386/avx512bw-pr88465.c
> > new file mode 100644
> > index 00000000000..8e34bf45365
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr88465.c
> > @@ -0,0 +1,23 @@
> > +/* PR target/88465 */
> > +/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-options "-O2 -mavx512bw" } */
> > +/* { dg-final { scan-assembler-times "kxor\[qd\]\[ \t]" 2 } } */
> > +/* { dg-final { scan-assembler-times "kxnor\[dq\]\[ \t]" 2 } } */
> > +
> > +void
> > +foo (void)
> > +{
> > +  unsigned int k = 0;
> > +  __asm volatile ("" : : "k" (k));
> > +  k = -1;
> > +  __asm volatile ("" : : "k" (k));
> > +}
> > +
> > +void
> > +bar (void)
> > +{
> > +  unsigned long long k = 0;
> > +  __asm volatile ("" : : "k" (k));
> > +  k = -1;
> > +  __asm volatile ("" : : "k" (k));
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
> > b/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
> > index 49817097e26..114e03ee93d 100644
> > --- a/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
> > +++ b/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
> > @@ -1,5 +1,5 @@
> >  /* { dg-do compile } */
> > -/* { dg-options "-mavx512dq -O2" } */
> > +/* { dg-options "-mavx512dq -mno-avx512bw -O2" } */
> >  /* { dg-final { scan-assembler-times "kmovb\[
> > \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
> >
> >  #include <immintrin.h>
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
> > b/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
> > index 7bb34d34d8d..79d37394b36 100644
> > --- a/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
> > +++ b/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
> > @@ -1,5 +1,5 @@
> >  /* { dg-do compile } */
> > -/* { dg-options "-mavx512f -O2" } */
> > +/* { dg-options "-mavx512f -mno-avx512bw -O2" } */
> >  /* { dg-final { scan-assembler-times "kmovw\[
> > \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
> >
> >  #include <immintrin.h>
> > diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
> > b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
> > new file mode 100644
> > index 00000000000..61f71ab8b23
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
> > @@ -0,0 +1,178 @@
> > +/* PR target/88808  */
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512bw -mno-avx512dq -O2" } */
> > +
> > +#include <immintrin.h>
> > +__m512i
> > +foo_orq (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
> > +  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
> > +  return _mm512_mask_add_epi8 (c, m1 | m2, a, d);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "korq" "1" { target { ! ia32 } } } } */
> > +
> > +__m512i
> > +foo_ord (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
> > +  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
> > +  return _mm512_mask_add_epi16 (c, m1 | m2, a, d);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "kord" "1" } }  */
> > +
> > +__m512i
> > +foo_orw (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
> > +  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
> > +  return _mm512_mask_add_epi32 (c, m1 | m2, a, d);
> > +}
> > +
> > +__m512i
> > +foo_orb (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
> > +  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
> > +  return _mm512_mask_add_epi64 (c, m1 | m2, a, d);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "korw" "2" } }  */
> > +
> > +__m512i
> > +foo_xorq (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
> > +  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
> > +  return _mm512_mask_add_epi8 (c, m1 ^ m2, a, d);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "kxorq" "1" { target { ! ia32 } } } }  */
> > +
> > +__m512i
> > +foo_xord (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
> > +  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
> > +  return _mm512_mask_add_epi16 (c, m1 ^ m2, a, d);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "kxord" "1" } }  */
> > +
> > +__m512i
> > +foo_xorw (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
> > +  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
> > +  return _mm512_mask_add_epi32 (c, m1 ^ m2, a, d);
> > +}
> > +
> > +__m512i
> > +foo_xorb (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
> > +  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
> > +  return _mm512_mask_add_epi64 (c, m1 ^ m2, a, d);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "korw" "2" } }  */
> > +
> > +__m512i
> > +foo_andq (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
> > +  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
> > +  return _mm512_mask_add_epi8 (c, m1 & m2, a, d);
> > +}
> > +
> > +__m512i
> > +foo_andd (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
> > +  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
> > +  return _mm512_mask_add_epi16 (c, m1 & m2, a, d);
> > +}
> > +
> > +__m512i
> > +foo_andw (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
> > +  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
> > +  return _mm512_mask_add_epi32 (c, m1 & m2, a, d);
> > +}
> > +
> > +__m512i
> > +foo_andb (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
> > +  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
> > +  return _mm512_mask_add_epi64 (c, m1 & m2, a, d);
> > +}
> > +
> > +__m512i
> > +foo_andnq (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
> > +  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
> > +  return _mm512_mask_add_epi8 (c, m1 & ~m2, a, d);
> > +}
> > +
> > +__m512i
> > +foo_andnd (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
> > +  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
> > +  return _mm512_mask_add_epi16 (c, m1 & ~m2, a, d);
> > +}
> > +
> > +__m512i
> > +foo_andnw (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
> > +  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
> > +  return _mm512_mask_add_epi32 (c, m1 & ~m2, a, d);
> > +}
> > +
> > +__m512i
> > +foo_andnb (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
> > +  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
> > +  return _mm512_mask_add_epi64 (c, m1 & ~m2, a, d);
> > +}
> > +
> > +__m512i
> > +foo_notq (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
> > +  return _mm512_mask_add_epi8 (c, ~m1, a, d);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "knotq" "2" { target { ! ia32 } } } }  */
> > +
> > +__m512i
> > +foo_notd (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
> > +  return _mm512_mask_add_epi16 (c, ~m1, a, d);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "knotd" "2" { target { ! ia32 } } } }  */
> > +
> > +__m512i
> > +foo_notw (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
> > +  return _mm512_mask_add_epi32 (c, ~m1, a, d);
> > +}
> > +
> > +__m512i
> > +foo_notb (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
> > +  return _mm512_mask_add_epi64 (c, ~m1, a, d);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "knotw" "4" } }  */
> > diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
> > b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
> > new file mode 100644
> > index 00000000000..850f0b42652
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
> > @@ -0,0 +1,8 @@
> > +/* PR target/88808  */
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512bw -mavx512dq -O2" } */
> > +/* { dg-final { scan-assembler-times "knotb" "2" } }  */
> > +/* { dg-final { scan-assembler-times "korb" "1" } }  */
> > +/* { dg-final { scan-assembler-times "kxorb" "1" } }  */
> > +#include "bitwise_mask_op-1.c"
> > +
> > diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
> > b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
> > new file mode 100644
> > index 00000000000..18bf4f0d768
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
> > @@ -0,0 +1,44 @@
> > +/* PR target/88808  */
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512bw -mavx512dq -O2" } */
> > +
> > +#include <immintrin.h>
> > +volatile __mmask8 foo;
> > +void
> > +foo_orb (__m512i a, __m512i b)
> > +{
> > +  __mmask8 m1 = _mm512_cmp_epi64_mask (a, b, 2);
> > +  __mmask8 m2 = _mm512_cmp_epi64_mask (a, b, 4);
> > +  foo = m1 | m2;
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "korb\[\t \]" "1" } }  */
> > +
> > +void
> > +foo_xorb (__m512i a, __m512i b)
> > +{
> > +  __mmask8 m1 = _mm512_cmp_epi64_mask (a, b, 2);
> > +  __mmask8 m2 = _mm512_cmp_epi64_mask (a, b, 4);
> > +  foo = m1 ^ m2;
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" } }  */
> > +
> > +void
> > +foo_andb (__m512i a, __m512i b)
> > +{
> > +  __mmask8 m1 = _mm512_cmp_epi64_mask (a, b, 2);
> > +  __mmask8 m2 = _mm512_cmp_epi64_mask (a, b, 4);
> > +  foo = m1 & m2;
> > +}
> > +
> > +void
> > +foo_andnb (__m512i a, __m512i b)
> > +{
> > +  __mmask8 m1 = _mm512_cmp_epi64_mask (a, b, 2);
> > +  __mmask8 m2 = _mm512_cmp_epi64_mask (a, b, 4);
> > +  foo = m1 & ~m2;
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "knotb\[\t \]" "1" } }  */
> > +/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4"} }  */
> > --
> > 2.18.1
> >
> >
> > --
> > BR,
> > Hongtao
Uros Bizjak Aug. 21, 2020, 1:15 p.m. UTC | #8
> > > gcc/
> > >         PR target/88808
> > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > >         QImode data go into mask registers.
> > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > >         for mask registers.
> > >         (*movqi_internal): Ditto.
> > >         (*anddi_1): Support mask register operations
> > >         (*and<mode>_1): Ditto.
> > >         (*andqi_1): Ditto.
> > >         (*andn<mode>_1): Ditto.
> > >         (*<code><mode>_1): Ditto.
> > >         (*<code>qi_1): Ditto.
> > >         (*one_cmpl<mode>2_1): Ditto.
> > >         (*one_cmplsi2_1_zext): Ditto.
> > >         (*one_cmplqi2_1): Ditto.
> > >         (define_peephole2): Move constant 0/-1 directly into mask
> > >         registers.
> > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > >         that would convert "generic" patterns to mask patterns.
> > >         (*knotsi_1_zext): New define_insn.
> > >
> > > gcc/testsuite/
> > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> >
> > A little nit, please put new splitters after the instruction pattern.
> >
> > OK for the whole patch set with the above change,
> >
>
> Yes, thanks for the review.

Please note that your patch introduces several testsuite fails with -m32:

gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c

Program received signal SIGILL, Illegal instruction.
0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
__ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
pointer>,
    __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);

   0x080490a3 <+51>:    cpuid
   0x080490a5 <+53>:    mov    $0x1,%eax
   0x080490aa <+58>:    mov    %ecx,%esi
=> 0x080490ac <+60>:    kmovd  %ebx,%k0
   0x080490b0 <+64>:    mov    %edi,%ecx
   0x080490b2 <+66>:    mov    %edi,%ebx

kmov insn is generated for __cpuid_count function, where the binary
determines, if the new instructions are supported. The binary will
crash in the detection code if the processor lacks AVX512
instructions.

Uros.
Hongtao Liu Aug. 21, 2020, 3:41 p.m. UTC | #9
On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> > > > gcc/
> > > >         PR target/88808
> > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > >         QImode data go into mask registers.
> > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > >         for mask registers.
> > > >         (*movqi_internal): Ditto.
> > > >         (*anddi_1): Support mask register operations
> > > >         (*and<mode>_1): Ditto.
> > > >         (*andqi_1): Ditto.
> > > >         (*andn<mode>_1): Ditto.
> > > >         (*<code><mode>_1): Ditto.
> > > >         (*<code>qi_1): Ditto.
> > > >         (*one_cmpl<mode>2_1): Ditto.
> > > >         (*one_cmplsi2_1_zext): Ditto.
> > > >         (*one_cmplqi2_1): Ditto.
> > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > >         registers.
> > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > >         that would convert "generic" patterns to mask patterns.
> > > >         (*knotsi_1_zext): New define_insn.
> > > >
> > > > gcc/testsuite/
> > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > >
> > > A little nit, please put new splitters after the instruction pattern.
> > >
> > > OK for the whole patch set with the above change,
> > >
> >
> > Yes, thanks for the review.
>
> Please note that your patch introduces several testsuite fails with -m32:
>
> gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
>

I can't reproduce this failure.

> Program received signal SIGILL, Illegal instruction.
> 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> pointer>,
>     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
>
>    0x080490a3 <+51>:    cpuid
>    0x080490a5 <+53>:    mov    $0x1,%eax
>    0x080490aa <+58>:    mov    %ecx,%esi
> => 0x080490ac <+60>:    kmovd  %ebx,%k0
>    0x080490b0 <+64>:    mov    %edi,%ecx
>    0x080490b2 <+66>:    mov    %edi,%ebx
>
> kmov insn is generated for __cpuid_count function, where the binary
> determines, if the new instructions are supported. The binary will
> crash in the detection code if the processor lacks AVX512
> instructions.
>

IMHO, the testcase shouldn't be run on processors without AVX512BW.
Because in  avx512bitalg-vpopcntb-1.c, there's /* {
dg-require-effective-target avx512bw } */.

what's the version of your assembler?

> Uros.
H.J. Lu Aug. 21, 2020, 3:50 p.m. UTC | #10
On Fri, Aug 21, 2020 at 8:41 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > > > > gcc/
> > > > >         PR target/88808
> > > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > > >         QImode data go into mask registers.
> > > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > > >         for mask registers.
> > > > >         (*movqi_internal): Ditto.
> > > > >         (*anddi_1): Support mask register operations
> > > > >         (*and<mode>_1): Ditto.
> > > > >         (*andqi_1): Ditto.
> > > > >         (*andn<mode>_1): Ditto.
> > > > >         (*<code><mode>_1): Ditto.
> > > > >         (*<code>qi_1): Ditto.
> > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > >         (*one_cmplqi2_1): Ditto.
> > > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > > >         registers.
> > > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > > >         that would convert "generic" patterns to mask patterns.
> > > > >         (*knotsi_1_zext): New define_insn.
> > > > >
> > > > > gcc/testsuite/
> > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > >
> > > > A little nit, please put new splitters after the instruction pattern.
> > > >
> > > > OK for the whole patch set with the above change,
> > > >
> > >
> > > Yes, thanks for the review.
> >
> > Please note that your patch introduces several testsuite fails with -m32:
> >
> > gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
> >
>
> I can't reproduce this failure.
>
> > Program received signal SIGILL, Illegal instruction.
> > 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> > __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> > pointer>,
> >     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> > 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
> >
> >    0x080490a3 <+51>:    cpuid
> >    0x080490a5 <+53>:    mov    $0x1,%eax
> >    0x080490aa <+58>:    mov    %ecx,%esi
> > => 0x080490ac <+60>:    kmovd  %ebx,%k0
> >    0x080490b0 <+64>:    mov    %edi,%ecx
> >    0x080490b2 <+66>:    mov    %edi,%ebx
> >
> > kmov insn is generated for __cpuid_count function, where the binary
> > determines, if the new instructions are supported. The binary will
> > crash in the detection code if the processor lacks AVX512
> > instructions.
> >
>
> IMHO, the testcase shouldn't be run on processors without AVX512BW.
> Because in  avx512bitalg-vpopcntb-1.c, there's /* {
> dg-require-effective-target avx512bw } */.
>

dg-require-effective-target avx512bw checks assembler support.
__cpuid_count can't use any mask instructions.   Please run this test
on CPU without AVX512.
Uros Bizjak Aug. 21, 2020, 3:50 p.m. UTC | #11
On Fri, Aug 21, 2020 at 5:41 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > > > > gcc/
> > > > >         PR target/88808
> > > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > > >         QImode data go into mask registers.
> > > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > > >         for mask registers.
> > > > >         (*movqi_internal): Ditto.
> > > > >         (*anddi_1): Support mask register operations
> > > > >         (*and<mode>_1): Ditto.
> > > > >         (*andqi_1): Ditto.
> > > > >         (*andn<mode>_1): Ditto.
> > > > >         (*<code><mode>_1): Ditto.
> > > > >         (*<code>qi_1): Ditto.
> > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > >         (*one_cmplqi2_1): Ditto.
> > > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > > >         registers.
> > > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > > >         that would convert "generic" patterns to mask patterns.
> > > > >         (*knotsi_1_zext): New define_insn.
> > > > >
> > > > > gcc/testsuite/
> > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > >
> > > > A little nit, please put new splitters after the instruction pattern.
> > > >
> > > > OK for the whole patch set with the above change,
> > > >
> > >
> > > Yes, thanks for the review.
> >
> > Please note that your patch introduces several testsuite fails with -m32:
> >
> > gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
> >
>
> I can't reproduce this failure.

Because you are running it on AVX512 enabled target.

> > Program received signal SIGILL, Illegal instruction.
> > 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> > __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> > pointer>,
> >     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> > 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
> >
> >    0x080490a3 <+51>:    cpuid
> >    0x080490a5 <+53>:    mov    $0x1,%eax
> >    0x080490aa <+58>:    mov    %ecx,%esi
> > => 0x080490ac <+60>:    kmovd  %ebx,%k0
> >    0x080490b0 <+64>:    mov    %edi,%ecx
> >    0x080490b2 <+66>:    mov    %edi,%ebx
> >
> > kmov insn is generated for __cpuid_count function, where the binary
> > determines, if the new instructions are supported. The binary will
> > crash in the detection code if the processor lacks AVX512
> > instructions.
> >
>
> IMHO, the testcase shouldn't be run on processors without AVX512BW.

No, it could run, because it checks for AVX512BW at runtime.

> Because in  avx512bitalg-vpopcntb-1.c, there's /*
> dg-require-effective-target avx512bw } */.

This is to check the toolchain for support.

> what's the version of your assembler?

GNU assembler version 2.34-4.fc32

Please add something like
X86_TUNE_INTER_UNIT_MOVES_FROM_MASK/X86_TUNE_INTER_UNIT_MOVES_TO_MASK
and enable them only for m_CORE_AVX512 (or perhaps m_INTEL).

Handle this in inline_secondary_memory_needed to reject direct moves
for all other targets. This should disable direct moves for generic
targets.

Uros.
Hongtao Liu Aug. 21, 2020, 4:29 p.m. UTC | #12
On Fri, Aug 21, 2020 at 11:50 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Fri, Aug 21, 2020 at 5:41 PM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > > > > gcc/
> > > > > >         PR target/88808
> > > > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > > > >         QImode data go into mask registers.
> > > > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > > > >         for mask registers.
> > > > > >         (*movqi_internal): Ditto.
> > > > > >         (*anddi_1): Support mask register operations
> > > > > >         (*and<mode>_1): Ditto.
> > > > > >         (*andqi_1): Ditto.
> > > > > >         (*andn<mode>_1): Ditto.
> > > > > >         (*<code><mode>_1): Ditto.
> > > > > >         (*<code>qi_1): Ditto.
> > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > > > >         registers.
> > > > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > > > >         that would convert "generic" patterns to mask patterns.
> > > > > >         (*knotsi_1_zext): New define_insn.
> > > > > >
> > > > > > gcc/testsuite/
> > > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > > >
> > > > > A little nit, please put new splitters after the instruction pattern.
> > > > >
> > > > > OK for the whole patch set with the above change,
> > > > >
> > > >
> > > > Yes, thanks for the review.
> > >
> > > Please note that your patch introduces several testsuite fails with -m32:
> > >
> > > gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
> > >
> >
> > I can't reproduce this failure.
>
> Because you are running it on AVX512 enabled target.
>
> > > Program received signal SIGILL, Illegal instruction.
> > > 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> > > __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> > > pointer>,
> > >     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> > > 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
> > >
> > >    0x080490a3 <+51>:    cpuid
> > >    0x080490a5 <+53>:    mov    $0x1,%eax
> > >    0x080490aa <+58>:    mov    %ecx,%esi
> > > => 0x080490ac <+60>:    kmovd  %ebx,%k0
> > >    0x080490b0 <+64>:    mov    %edi,%ecx
> > >    0x080490b2 <+66>:    mov    %edi,%ebx
> > >
> > > kmov insn is generated for __cpuid_count function, where the binary
> > > determines, if the new instructions are supported. The binary will
> > > crash in the detection code if the processor lacks AVX512
> > > instructions.
> > >
> >
> > IMHO, the testcase shouldn't be run on processors without AVX512BW.
>
> No, it could run, because it checks for AVX512BW at runtime.
>

Got it.

> > Because in  avx512bitalg-vpopcntb-1.c, there's /*
> > dg-require-effective-target avx512bw } */.
>
> This is to check the toolchain for support.
>
> > what's the version of your assembler?
>
> GNU assembler version 2.34-4.fc32
>

If assembler supports avx512bw, but processor not, the test would pass
condition `dg-require-effective-target avx512bw` and be runned.
then crashed for illegal instruction.

> Please add something like
> X86_TUNE_INTER_UNIT_MOVES_FROM_MASK/X86_TUNE_INTER_UNIT_MOVES_TO_MASK
> and enable them only for m_CORE_AVX512 (or perhaps m_INTEL).
>
> Handle this in inline_secondary_memory_needed to reject direct moves
> for all other targets. This should disable direct moves for generic
> targets.
>

Yes, I'll add it.

> Uros.
H.J. Lu Aug. 21, 2020, 4:35 p.m. UTC | #13
On Fri, Aug 21, 2020 at 9:29 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Fri, Aug 21, 2020 at 11:50 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Fri, Aug 21, 2020 at 5:41 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >
> > > > > > > gcc/
> > > > > > >         PR target/88808
> > > > > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > > > > >         QImode data go into mask registers.
> > > > > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > > > > >         for mask registers.
> > > > > > >         (*movqi_internal): Ditto.
> > > > > > >         (*anddi_1): Support mask register operations
> > > > > > >         (*and<mode>_1): Ditto.
> > > > > > >         (*andqi_1): Ditto.
> > > > > > >         (*andn<mode>_1): Ditto.
> > > > > > >         (*<code><mode>_1): Ditto.
> > > > > > >         (*<code>qi_1): Ditto.
> > > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > > > > >         registers.
> > > > > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > > > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > > > > >         that would convert "generic" patterns to mask patterns.
> > > > > > >         (*knotsi_1_zext): New define_insn.
> > > > > > >
> > > > > > > gcc/testsuite/
> > > > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > > > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > > > >
> > > > > > A little nit, please put new splitters after the instruction pattern.
> > > > > >
> > > > > > OK for the whole patch set with the above change,
> > > > > >
> > > > >
> > > > > Yes, thanks for the review.
> > > >
> > > > Please note that your patch introduces several testsuite fails with -m32:
> > > >
> > > > gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
> > > >
> > >
> > > I can't reproduce this failure.
> >
> > Because you are running it on AVX512 enabled target.
> >
> > > > Program received signal SIGILL, Illegal instruction.
> > > > 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> > > > __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> > > > pointer>,
> > > >     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> > > > 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
> > > >
> > > >    0x080490a3 <+51>:    cpuid
> > > >    0x080490a5 <+53>:    mov    $0x1,%eax
> > > >    0x080490aa <+58>:    mov    %ecx,%esi
> > > > => 0x080490ac <+60>:    kmovd  %ebx,%k0
> > > >    0x080490b0 <+64>:    mov    %edi,%ecx
> > > >    0x080490b2 <+66>:    mov    %edi,%ebx
> > > >
> > > > kmov insn is generated for __cpuid_count function, where the binary
> > > > determines, if the new instructions are supported. The binary will
> > > > crash in the detection code if the processor lacks AVX512
> > > > instructions.
> > > >
> > >
> > > IMHO, the testcase shouldn't be run on processors without AVX512BW.
> >
> > No, it could run, because it checks for AVX512BW at runtime.
> >
>
> Got it.
>
> > > Because in  avx512bitalg-vpopcntb-1.c, there's /*
> > > dg-require-effective-target avx512bw } */.
> >
> > This is to check the toolchain for support.
> >
> > > what's the version of your assembler?
> >
> > GNU assembler version 2.34-4.fc32
> >
>
> If assembler supports avx512bw, but processor not, the test would pass
> condition `dg-require-effective-target avx512bw` and be runned.
> then crashed for illegal instruction.
>
> > Please add something like
> > X86_TUNE_INTER_UNIT_MOVES_FROM_MASK/X86_TUNE_INTER_UNIT_MOVES_TO_MASK
> > and enable them only for m_CORE_AVX512 (or perhaps m_INTEL).
> >
> > Handle this in inline_secondary_memory_needed to reject direct moves
> > for all other targets. This should disable direct moves for generic
> > targets.
> >
>
> Yes, I'll add it.
>


(define_insn "*movsi_internal"
  [(set (match_operand:SI 0 "nonimmediate_operand"
    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
        (match_operand:SI 1 "general_operand"
    "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
  "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
...
 [(set (attr "isa")
     (cond [(eq_attr "alternative" "12,13")
              (const_string "sse2")
           ]
           (const_string "*")))

is wrong.   mask register alternatives should be marked with avx512f.
Please fix it.   Other integer move patterns may have the same issue.
Once these are fixed,

diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h
b/gcc/testsuite/gcc.target/i386/avx512-check.h
index 0a377dba1d5..576e9b390c6 100644
--- a/gcc/testsuite/gcc.target/i386/avx512-check.h
+++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
@@ -25,6 +25,7 @@ do_test (void)
 }
 #endif

+__attribute__((target ("no-avx512f")))
 static int
 check_osxsave (void)
 {
@@ -34,6 +35,7 @@ check_osxsave (void)
   return (ecx & bit_OSXSAVE) != 0;
 }

+__attribute__((target ("no-avx512f")))
 int
 main ()
 {

should work.
H.J. Lu Aug. 21, 2020, 4:45 p.m. UTC | #14
On Fri, Aug 21, 2020 at 9:35 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Aug 21, 2020 at 9:29 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Fri, Aug 21, 2020 at 11:50 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Fri, Aug 21, 2020 at 5:41 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > > >
> > > > On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > >
> > > > > > > > gcc/
> > > > > > > >         PR target/88808
> > > > > > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > > > > > >         QImode data go into mask registers.
> > > > > > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > > > > > >         for mask registers.
> > > > > > > >         (*movqi_internal): Ditto.
> > > > > > > >         (*anddi_1): Support mask register operations
> > > > > > > >         (*and<mode>_1): Ditto.
> > > > > > > >         (*andqi_1): Ditto.
> > > > > > > >         (*andn<mode>_1): Ditto.
> > > > > > > >         (*<code><mode>_1): Ditto.
> > > > > > > >         (*<code>qi_1): Ditto.
> > > > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > > > > > >         registers.
> > > > > > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > > > > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > > > > > >         that would convert "generic" patterns to mask patterns.
> > > > > > > >         (*knotsi_1_zext): New define_insn.
> > > > > > > >
> > > > > > > > gcc/testsuite/
> > > > > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > > > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > > > > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > > > > >
> > > > > > > A little nit, please put new splitters after the instruction pattern.
> > > > > > >
> > > > > > > OK for the whole patch set with the above change,
> > > > > > >
> > > > > >
> > > > > > Yes, thanks for the review.
> > > > >
> > > > > Please note that your patch introduces several testsuite fails with -m32:
> > > > >
> > > > > gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
> > > > >
> > > >
> > > > I can't reproduce this failure.
> > >
> > > Because you are running it on AVX512 enabled target.
> > >
> > > > > Program received signal SIGILL, Illegal instruction.
> > > > > 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> > > > > __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> > > > > pointer>,
> > > > >     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> > > > > 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
> > > > >
> > > > >    0x080490a3 <+51>:    cpuid
> > > > >    0x080490a5 <+53>:    mov    $0x1,%eax
> > > > >    0x080490aa <+58>:    mov    %ecx,%esi
> > > > > => 0x080490ac <+60>:    kmovd  %ebx,%k0
> > > > >    0x080490b0 <+64>:    mov    %edi,%ecx
> > > > >    0x080490b2 <+66>:    mov    %edi,%ebx
> > > > >
> > > > > kmov insn is generated for __cpuid_count function, where the binary
> > > > > determines, if the new instructions are supported. The binary will
> > > > > crash in the detection code if the processor lacks AVX512
> > > > > instructions.
> > > > >
> > > >
> > > > IMHO, the testcase shouldn't be run on processors without AVX512BW.
> > >
> > > No, it could run, because it checks for AVX512BW at runtime.
> > >
> >
> > Got it.
> >
> > > > Because in  avx512bitalg-vpopcntb-1.c, there's /*
> > > > dg-require-effective-target avx512bw } */.
> > >
> > > This is to check the toolchain for support.
> > >
> > > > what's the version of your assembler?
> > >
> > > GNU assembler version 2.34-4.fc32
> > >
> >
> > If assembler supports avx512bw, but processor not, the test would pass
> > condition `dg-require-effective-target avx512bw` and be runned.
> > then crashed for illegal instruction.
> >
> > > Please add something like
> > > X86_TUNE_INTER_UNIT_MOVES_FROM_MASK/X86_TUNE_INTER_UNIT_MOVES_TO_MASK
> > > and enable them only for m_CORE_AVX512 (or perhaps m_INTEL).
> > >
> > > Handle this in inline_secondary_memory_needed to reject direct moves
> > > for all other targets. This should disable direct moves for generic
> > > targets.
> > >
> >
> > Yes, I'll add it.
> >
>
>
> (define_insn "*movsi_internal"
>   [(set (match_operand:SI 0 "nonimmediate_operand"
>     "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
>         (match_operand:SI 1 "general_operand"
>     "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
>   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> ...
>  [(set (attr "isa")
>      (cond [(eq_attr "alternative" "12,13")
>               (const_string "sse2")
>            ]
>            (const_string "*")))
>
> is wrong.   mask register alternatives should be marked with avx512f.
> Please fix it.   Other integer move patterns may have the same issue.
> Once these are fixed,
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h
> b/gcc/testsuite/gcc.target/i386/avx512-check.h
> index 0a377dba1d5..576e9b390c6 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512-check.h
> +++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
> @@ -25,6 +25,7 @@ do_test (void)
>  }
>  #endif
>
> +__attribute__((target ("no-avx512f")))
>  static int
>  check_osxsave (void)
>  {
> @@ -34,6 +35,7 @@ check_osxsave (void)
>    return (ecx & bit_OSXSAVE) != 0;
>  }
>
> +__attribute__((target ("no-avx512f")))
>  int
>  main ()
>  {
>
> should work.
>

Like this.  You need to check all integer patterns with mskmov and msklog.
Hongtao Liu Aug. 21, 2020, 4:46 p.m. UTC | #15
On Sat, Aug 22, 2020 at 12:36 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Aug 21, 2020 at 9:29 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Fri, Aug 21, 2020 at 11:50 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Fri, Aug 21, 2020 at 5:41 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > > >
> > > > On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > >
> > > > > > > > gcc/
> > > > > > > >         PR target/88808
> > > > > > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > > > > > >         QImode data go into mask registers.
> > > > > > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > > > > > >         for mask registers.
> > > > > > > >         (*movqi_internal): Ditto.
> > > > > > > >         (*anddi_1): Support mask register operations
> > > > > > > >         (*and<mode>_1): Ditto.
> > > > > > > >         (*andqi_1): Ditto.
> > > > > > > >         (*andn<mode>_1): Ditto.
> > > > > > > >         (*<code><mode>_1): Ditto.
> > > > > > > >         (*<code>qi_1): Ditto.
> > > > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > > > > > >         registers.
> > > > > > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > > > > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > > > > > >         that would convert "generic" patterns to mask patterns.
> > > > > > > >         (*knotsi_1_zext): New define_insn.
> > > > > > > >
> > > > > > > > gcc/testsuite/
> > > > > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > > > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > > > > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > > > > >
> > > > > > > A little nit, please put new splitters after the instruction pattern.
> > > > > > >
> > > > > > > OK for the whole patch set with the above change,
> > > > > > >
> > > > > >
> > > > > > Yes, thanks for the review.
> > > > >
> > > > > Please note that your patch introduces several testsuite fails with -m32:
> > > > >
> > > > > gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
> > > > >
> > > >
> > > > I can't reproduce this failure.
> > >
> > > Because you are running it on AVX512 enabled target.
> > >
> > > > > Program received signal SIGILL, Illegal instruction.
> > > > > 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> > > > > __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> > > > > pointer>,
> > > > >     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> > > > > 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
> > > > >
> > > > >    0x080490a3 <+51>:    cpuid
> > > > >    0x080490a5 <+53>:    mov    $0x1,%eax
> > > > >    0x080490aa <+58>:    mov    %ecx,%esi
> > > > > => 0x080490ac <+60>:    kmovd  %ebx,%k0
> > > > >    0x080490b0 <+64>:    mov    %edi,%ecx
> > > > >    0x080490b2 <+66>:    mov    %edi,%ebx
> > > > >
> > > > > kmov insn is generated for __cpuid_count function, where the binary
> > > > > determines, if the new instructions are supported. The binary will
> > > > > crash in the detection code if the processor lacks AVX512
> > > > > instructions.
> > > > >
> > > >
> > > > IMHO, the testcase shouldn't be run on processors without AVX512BW.
> > >
> > > No, it could run, because it checks for AVX512BW at runtime.
> > >
> >
> > Got it.
> >
> > > > Because in  avx512bitalg-vpopcntb-1.c, there's /*
> > > > dg-require-effective-target avx512bw } */.
> > >
> > > This is to check the toolchain for support.
> > >
> > > > what's the version of your assembler?
> > >
> > > GNU assembler version 2.34-4.fc32
> > >
> >
> > If assembler supports avx512bw, but processor not, the test would pass
> > condition `dg-require-effective-target avx512bw` and be runned.
> > then crashed for illegal instruction.
> >
> > > Please add something like
> > > X86_TUNE_INTER_UNIT_MOVES_FROM_MASK/X86_TUNE_INTER_UNIT_MOVES_TO_MASK
> > > and enable them only for m_CORE_AVX512 (or perhaps m_INTEL).
> > >
> > > Handle this in inline_secondary_memory_needed to reject direct moves
> > > for all other targets. This should disable direct moves for generic
> > > targets.
> > >
> >
> > Yes, I'll add it.
> >
>
>
> (define_insn "*movsi_internal"
>   [(set (match_operand:SI 0 "nonimmediate_operand"
>     "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
>         (match_operand:SI 1 "general_operand"
>     "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
>   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> ...
>  [(set (attr "isa")
>      (cond [(eq_attr "alternative" "12,13")
>               (const_string "sse2")
>            ]
>            (const_string "*")))
>
> is wrong.   mask register alternatives should be marked with avx512f.
> Please fix it.   Other integer move patterns may have the same issue.
> Once these are fixed,
>

It is restricted by ix86_hard_regno_mode_ok
---
18976      return ((TARGET_AVX512F && VALID_MASK_REG_MODE (mode))
18977              || (TARGET_AVX512BW
18978                  && VALID_MASK_AVX512BW_MODE (mode)));
18979    }
---

> diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h
> b/gcc/testsuite/gcc.target/i386/avx512-check.h
> index 0a377dba1d5..576e9b390c6 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512-check.h
> +++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
> @@ -25,6 +25,7 @@ do_test (void)
>  }
>  #endif
>
> +__attribute__((target ("no-avx512f")))
>  static int
>  check_osxsave (void)
>  {
> @@ -34,6 +35,7 @@ check_osxsave (void)
>    return (ecx & bit_OSXSAVE) != 0;
>  }
>
> +__attribute__((target ("no-avx512f")))
>  int
>  main ()
>  {
>
> should work.
>

That's what i thought right now.
The real problem is we use -mavx512bw to build a binary to determine
whether the processor support AVX512BW

> --
> H.J.
H.J. Lu Aug. 21, 2020, 5:02 p.m. UTC | #16
On Fri, Aug 21, 2020 at 9:46 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Sat, Aug 22, 2020 at 12:36 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Fri, Aug 21, 2020 at 9:29 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Fri, Aug 21, 2020 at 11:50 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >
> > > > On Fri, Aug 21, 2020 at 5:41 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > >
> > > > > On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > > >
> > > > > > > > > gcc/
> > > > > > > > >         PR target/88808
> > > > > > > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > > > > > > >         QImode data go into mask registers.
> > > > > > > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > > > > > > >         for mask registers.
> > > > > > > > >         (*movqi_internal): Ditto.
> > > > > > > > >         (*anddi_1): Support mask register operations
> > > > > > > > >         (*and<mode>_1): Ditto.
> > > > > > > > >         (*andqi_1): Ditto.
> > > > > > > > >         (*andn<mode>_1): Ditto.
> > > > > > > > >         (*<code><mode>_1): Ditto.
> > > > > > > > >         (*<code>qi_1): Ditto.
> > > > > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > > > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > > > > > > >         registers.
> > > > > > > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > > > > > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > > > > > > >         that would convert "generic" patterns to mask patterns.
> > > > > > > > >         (*knotsi_1_zext): New define_insn.
> > > > > > > > >
> > > > > > > > > gcc/testsuite/
> > > > > > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > > > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > > > > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > > > > > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > > > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > > > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > > > > > >
> > > > > > > > A little nit, please put new splitters after the instruction pattern.
> > > > > > > >
> > > > > > > > OK for the whole patch set with the above change,
> > > > > > > >
> > > > > > >
> > > > > > > Yes, thanks for the review.
> > > > > >
> > > > > > Please note that your patch introduces several testsuite fails with -m32:
> > > > > >
> > > > > > gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
> > > > > >
> > > > >
> > > > > I can't reproduce this failure.
> > > >
> > > > Because you are running it on AVX512 enabled target.
> > > >
> > > > > > Program received signal SIGILL, Illegal instruction.
> > > > > > 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> > > > > > __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> > > > > > pointer>,
> > > > > >     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> > > > > > 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
> > > > > >
> > > > > >    0x080490a3 <+51>:    cpuid
> > > > > >    0x080490a5 <+53>:    mov    $0x1,%eax
> > > > > >    0x080490aa <+58>:    mov    %ecx,%esi
> > > > > > => 0x080490ac <+60>:    kmovd  %ebx,%k0
> > > > > >    0x080490b0 <+64>:    mov    %edi,%ecx
> > > > > >    0x080490b2 <+66>:    mov    %edi,%ebx
> > > > > >
> > > > > > kmov insn is generated for __cpuid_count function, where the binary
> > > > > > determines, if the new instructions are supported. The binary will
> > > > > > crash in the detection code if the processor lacks AVX512
> > > > > > instructions.
> > > > > >
> > > > >
> > > > > IMHO, the testcase shouldn't be run on processors without AVX512BW.
> > > >
> > > > No, it could run, because it checks for AVX512BW at runtime.
> > > >
> > >
> > > Got it.
> > >
> > > > > Because in  avx512bitalg-vpopcntb-1.c, there's /*
> > > > > dg-require-effective-target avx512bw } */.
> > > >
> > > > This is to check the toolchain for support.
> > > >
> > > > > what's the version of your assembler?
> > > >
> > > > GNU assembler version 2.34-4.fc32
> > > >
> > >
> > > If assembler supports avx512bw, but processor not, the test would pass
> > > condition `dg-require-effective-target avx512bw` and be runned.
> > > then crashed for illegal instruction.
> > >
> > > > Please add something like
> > > > X86_TUNE_INTER_UNIT_MOVES_FROM_MASK/X86_TUNE_INTER_UNIT_MOVES_TO_MASK
> > > > and enable them only for m_CORE_AVX512 (or perhaps m_INTEL).
> > > >
> > > > Handle this in inline_secondary_memory_needed to reject direct moves
> > > > for all other targets. This should disable direct moves for generic
> > > > targets.
> > > >
> > >
> > > Yes, I'll add it.
> > >
> >
> >
> > (define_insn "*movsi_internal"
> >   [(set (match_operand:SI 0 "nonimmediate_operand"
> >     "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
> >         (match_operand:SI 1 "general_operand"
> >     "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
> >   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> > ...
> >  [(set (attr "isa")
> >      (cond [(eq_attr "alternative" "12,13")
> >               (const_string "sse2")
> >            ]
> >            (const_string "*")))
> >
> > is wrong.   mask register alternatives should be marked with avx512f.
> > Please fix it.   Other integer move patterns may have the same issue.
> > Once these are fixed,
> >
>
> It is restricted by ix86_hard_regno_mode_ok
> ---
> 18976      return ((TARGET_AVX512F && VALID_MASK_REG_MODE (mode))
> 18977              || (TARGET_AVX512BW
> 18978                  && VALID_MASK_AVX512BW_MODE (mode)));
> 18979    }
> ---

It may not be checked by RA.

> > diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h
> > b/gcc/testsuite/gcc.target/i386/avx512-check.h
> > index 0a377dba1d5..576e9b390c6 100644
> > --- a/gcc/testsuite/gcc.target/i386/avx512-check.h
> > +++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
> > @@ -25,6 +25,7 @@ do_test (void)
> >  }
> >  #endif
> >
> > +__attribute__((target ("no-avx512f")))
> >  static int
> >  check_osxsave (void)
> >  {
> > @@ -34,6 +35,7 @@ check_osxsave (void)
> >    return (ecx & bit_OSXSAVE) != 0;
> >  }
> >
> > +__attribute__((target ("no-avx512f")))
> >  int
> >  main ()
> >  {
> >
> > should work.
> >
>
> That's what i thought right now.
> The real problem is we use -mavx512bw to build a binary to determine
> whether the processor support AVX512BW
>

This isn't a problem as long as AVX512BW isn't used in AVX512BW detection
codes.   All CPUID codes should have

#pragma GCC push_options
#pragma GCC target("no-avx")  // no-sse?

Otherwise we will run into problems with integer register spill with
AVX instructions.
H.J. Lu Aug. 21, 2020, 5:07 p.m. UTC | #17
On Fri, Aug 21, 2020 at 10:02 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Aug 21, 2020 at 9:46 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Sat, Aug 22, 2020 at 12:36 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Fri, Aug 21, 2020 at 9:29 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > >
> > > > On Fri, Aug 21, 2020 at 11:50 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > >
> > > > > On Fri, Aug 21, 2020 at 5:41 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > > >
> > > > > > On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > > > >
> > > > > > > > > > gcc/
> > > > > > > > > >         PR target/88808
> > > > > > > > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > > > > > > > >         QImode data go into mask registers.
> > > > > > > > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > > > > > > > >         for mask registers.
> > > > > > > > > >         (*movqi_internal): Ditto.
> > > > > > > > > >         (*anddi_1): Support mask register operations
> > > > > > > > > >         (*and<mode>_1): Ditto.
> > > > > > > > > >         (*andqi_1): Ditto.
> > > > > > > > > >         (*andn<mode>_1): Ditto.
> > > > > > > > > >         (*<code><mode>_1): Ditto.
> > > > > > > > > >         (*<code>qi_1): Ditto.
> > > > > > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > > > > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > > > > > > > >         registers.
> > > > > > > > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > > > > > > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > > > > > > > >         that would convert "generic" patterns to mask patterns.
> > > > > > > > > >         (*knotsi_1_zext): New define_insn.
> > > > > > > > > >
> > > > > > > > > > gcc/testsuite/
> > > > > > > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > > > > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > > > > > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > > > > > > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > > > > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > > > > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > > > > > > >
> > > > > > > > > A little nit, please put new splitters after the instruction pattern.
> > > > > > > > >
> > > > > > > > > OK for the whole patch set with the above change,
> > > > > > > > >
> > > > > > > >
> > > > > > > > Yes, thanks for the review.
> > > > > > >
> > > > > > > Please note that your patch introduces several testsuite fails with -m32:
> > > > > > >
> > > > > > > gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
> > > > > > >
> > > > > >
> > > > > > I can't reproduce this failure.
> > > > >
> > > > > Because you are running it on AVX512 enabled target.
> > > > >
> > > > > > > Program received signal SIGILL, Illegal instruction.
> > > > > > > 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> > > > > > > __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> > > > > > > pointer>,
> > > > > > >     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> > > > > > > 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
> > > > > > >
> > > > > > >    0x080490a3 <+51>:    cpuid
> > > > > > >    0x080490a5 <+53>:    mov    $0x1,%eax
> > > > > > >    0x080490aa <+58>:    mov    %ecx,%esi
> > > > > > > => 0x080490ac <+60>:    kmovd  %ebx,%k0
> > > > > > >    0x080490b0 <+64>:    mov    %edi,%ecx
> > > > > > >    0x080490b2 <+66>:    mov    %edi,%ebx
> > > > > > >
> > > > > > > kmov insn is generated for __cpuid_count function, where the binary
> > > > > > > determines, if the new instructions are supported. The binary will
> > > > > > > crash in the detection code if the processor lacks AVX512
> > > > > > > instructions.
> > > > > > >
> > > > > >
> > > > > > IMHO, the testcase shouldn't be run on processors without AVX512BW.
> > > > >
> > > > > No, it could run, because it checks for AVX512BW at runtime.
> > > > >
> > > >
> > > > Got it.
> > > >
> > > > > > Because in  avx512bitalg-vpopcntb-1.c, there's /*
> > > > > > dg-require-effective-target avx512bw } */.
> > > > >
> > > > > This is to check the toolchain for support.
> > > > >
> > > > > > what's the version of your assembler?
> > > > >
> > > > > GNU assembler version 2.34-4.fc32
> > > > >
> > > >
> > > > If assembler supports avx512bw, but processor not, the test would pass
> > > > condition `dg-require-effective-target avx512bw` and be runned.
> > > > then crashed for illegal instruction.
> > > >
> > > > > Please add something like
> > > > > X86_TUNE_INTER_UNIT_MOVES_FROM_MASK/X86_TUNE_INTER_UNIT_MOVES_TO_MASK
> > > > > and enable them only for m_CORE_AVX512 (or perhaps m_INTEL).
> > > > >
> > > > > Handle this in inline_secondary_memory_needed to reject direct moves
> > > > > for all other targets. This should disable direct moves for generic
> > > > > targets.
> > > > >
> > > >
> > > > Yes, I'll add it.
> > > >
> > >
> > >
> > > (define_insn "*movsi_internal"
> > >   [(set (match_operand:SI 0 "nonimmediate_operand"
> > >     "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
> > >         (match_operand:SI 1 "general_operand"
> > >     "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
> > >   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> > > ...
> > >  [(set (attr "isa")
> > >      (cond [(eq_attr "alternative" "12,13")
> > >               (const_string "sse2")
> > >            ]
> > >            (const_string "*")))
> > >
> > > is wrong.   mask register alternatives should be marked with avx512f.
> > > Please fix it.   Other integer move patterns may have the same issue.
> > > Once these are fixed,
> > >
> >
> > It is restricted by ix86_hard_regno_mode_ok
> > ---
> > 18976      return ((TARGET_AVX512F && VALID_MASK_REG_MODE (mode))
> > 18977              || (TARGET_AVX512BW
> > 18978                  && VALID_MASK_AVX512BW_MODE (mode)));
> > 18979    }
> > ---
>
> It may not be checked by RA.

See:

https://gcc.gnu.org/pipermail/gcc-patches/2020-April/543838.html

> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h
> > > b/gcc/testsuite/gcc.target/i386/avx512-check.h
> > > index 0a377dba1d5..576e9b390c6 100644
> > > --- a/gcc/testsuite/gcc.target/i386/avx512-check.h
> > > +++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
> > > @@ -25,6 +25,7 @@ do_test (void)
> > >  }
> > >  #endif
> > >
> > > +__attribute__((target ("no-avx512f")))
> > >  static int
> > >  check_osxsave (void)
> > >  {
> > > @@ -34,6 +35,7 @@ check_osxsave (void)
> > >    return (ecx & bit_OSXSAVE) != 0;
> > >  }
> > >
> > > +__attribute__((target ("no-avx512f")))
> > >  int
> > >  main ()
> > >  {
> > >
> > > should work.
> > >
> >
> > That's what i thought right now.
> > The real problem is we use -mavx512bw to build a binary to determine
> > whether the processor support AVX512BW
> >
>
> This isn't a problem as long as AVX512BW isn't used in AVX512BW detection
> codes.   All CPUID codes should have
>
> #pragma GCC push_options
> #pragma GCC target("no-avx")  // no-sse?
>
> Otherwise we will run into problems with integer register spill with
> AVX instructions.
>
>
> --
> H.J.
Hongtao Liu Aug. 21, 2020, 5:29 p.m. UTC | #18
On Sat, Aug 22, 2020 at 1:08 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Aug 21, 2020 at 10:02 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Fri, Aug 21, 2020 at 9:46 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Sat, Aug 22, 2020 at 12:36 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Fri, Aug 21, 2020 at 9:29 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > >
> > > > > On Fri, Aug 21, 2020 at 11:50 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > > >
> > > > > > On Fri, Aug 21, 2020 at 5:41 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > > > >
> > > > > > > On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > > > > >
> > > > > > > > > > > gcc/
> > > > > > > > > > >         PR target/88808
> > > > > > > > > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > > > > > > > > >         QImode data go into mask registers.
> > > > > > > > > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > > > > > > > > >         for mask registers.
> > > > > > > > > > >         (*movqi_internal): Ditto.
> > > > > > > > > > >         (*anddi_1): Support mask register operations
> > > > > > > > > > >         (*and<mode>_1): Ditto.
> > > > > > > > > > >         (*andqi_1): Ditto.
> > > > > > > > > > >         (*andn<mode>_1): Ditto.
> > > > > > > > > > >         (*<code><mode>_1): Ditto.
> > > > > > > > > > >         (*<code>qi_1): Ditto.
> > > > > > > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > > > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > > > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > > > > > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > > > > > > > > >         registers.
> > > > > > > > > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > > > > > > > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > > > > > > > > >         that would convert "generic" patterns to mask patterns.
> > > > > > > > > > >         (*knotsi_1_zext): New define_insn.
> > > > > > > > > > >
> > > > > > > > > > > gcc/testsuite/
> > > > > > > > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > > > > > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > > > > > > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > > > > > > > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > > > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > > > > > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > > > > > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > > > > > > > >
> > > > > > > > > > A little nit, please put new splitters after the instruction pattern.
> > > > > > > > > >
> > > > > > > > > > OK for the whole patch set with the above change,
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > Yes, thanks for the review.
> > > > > > > >
> > > > > > > > Please note that your patch introduces several testsuite fails with -m32:
> > > > > > > >
> > > > > > > > gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
> > > > > > > >
> > > > > > >
> > > > > > > I can't reproduce this failure.
> > > > > >
> > > > > > Because you are running it on AVX512 enabled target.
> > > > > >
> > > > > > > > Program received signal SIGILL, Illegal instruction.
> > > > > > > > 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> > > > > > > > __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> > > > > > > > pointer>,
> > > > > > > >     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> > > > > > > > 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
> > > > > > > >
> > > > > > > >    0x080490a3 <+51>:    cpuid
> > > > > > > >    0x080490a5 <+53>:    mov    $0x1,%eax
> > > > > > > >    0x080490aa <+58>:    mov    %ecx,%esi
> > > > > > > > => 0x080490ac <+60>:    kmovd  %ebx,%k0
> > > > > > > >    0x080490b0 <+64>:    mov    %edi,%ecx
> > > > > > > >    0x080490b2 <+66>:    mov    %edi,%ebx
> > > > > > > >
> > > > > > > > kmov insn is generated for __cpuid_count function, where the binary
> > > > > > > > determines, if the new instructions are supported. The binary will
> > > > > > > > crash in the detection code if the processor lacks AVX512
> > > > > > > > instructions.
> > > > > > > >
> > > > > > >
> > > > > > > IMHO, the testcase shouldn't be run on processors without AVX512BW.
> > > > > >
> > > > > > No, it could run, because it checks for AVX512BW at runtime.
> > > > > >
> > > > >
> > > > > Got it.
> > > > >
> > > > > > > Because in  avx512bitalg-vpopcntb-1.c, there's /*
> > > > > > > dg-require-effective-target avx512bw } */.
> > > > > >
> > > > > > This is to check the toolchain for support.
> > > > > >
> > > > > > > what's the version of your assembler?
> > > > > >
> > > > > > GNU assembler version 2.34-4.fc32
> > > > > >
> > > > >
> > > > > If assembler supports avx512bw, but processor not, the test would pass
> > > > > condition `dg-require-effective-target avx512bw` and be runned.
> > > > > then crashed for illegal instruction.
> > > > >
> > > > > > Please add something like
> > > > > > X86_TUNE_INTER_UNIT_MOVES_FROM_MASK/X86_TUNE_INTER_UNIT_MOVES_TO_MASK
> > > > > > and enable them only for m_CORE_AVX512 (or perhaps m_INTEL).
> > > > > >
> > > > > > Handle this in inline_secondary_memory_needed to reject direct moves
> > > > > > for all other targets. This should disable direct moves for generic
> > > > > > targets.
> > > > > >
> > > > >
> > > > > Yes, I'll add it.
> > > > >
> > > >
> > > >
> > > > (define_insn "*movsi_internal"
> > > >   [(set (match_operand:SI 0 "nonimmediate_operand"
> > > >     "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
> > > >         (match_operand:SI 1 "general_operand"
> > > >     "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
> > > >   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> > > > ...
> > > >  [(set (attr "isa")
> > > >      (cond [(eq_attr "alternative" "12,13")
> > > >               (const_string "sse2")
> > > >            ]
> > > >            (const_string "*")))
> > > >
> > > > is wrong.   mask register alternatives should be marked with avx512f.
> > > > Please fix it.   Other integer move patterns may have the same issue.
> > > > Once these are fixed,
> > > >
> > >
> > > It is restricted by ix86_hard_regno_mode_ok
> > > ---
> > > 18976      return ((TARGET_AVX512F && VALID_MASK_REG_MODE (mode))
> > > 18977              || (TARGET_AVX512BW
> > > 18978                  && VALID_MASK_AVX512BW_MODE (mode)));
> > > 18979    }
> > > ---
> >
> > It may not be checked by RA.
>

Also restricted by constraint.md
(define_register_constraint "k" "TARGET_AVX512F ? ALL_MASK_REGS : NO_REGS"
"@internal Any mask register.")

> See:
>
> https://gcc.gnu.org/pipermail/gcc-patches/2020-April/543838.html
>

(define_register_constraint "v" "TARGET_SSE ? ALL_SSE_REGS : NO_REGS"
 "Any EVEX encodable SSE register (@code{%xmm0-%xmm31}).")
Need adjustment?

> > > > diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h
> > > > b/gcc/testsuite/gcc.target/i386/avx512-check.h
> > > > index 0a377dba1d5..576e9b390c6 100644
> > > > --- a/gcc/testsuite/gcc.target/i386/avx512-check.h
> > > > +++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
> > > > @@ -25,6 +25,7 @@ do_test (void)
> > > >  }
> > > >  #endif
> > > >
> > > > +__attribute__((target ("no-avx512f")))
> > > >  static int
> > > >  check_osxsave (void)
> > > >  {
> > > > @@ -34,6 +35,7 @@ check_osxsave (void)
> > > >    return (ecx & bit_OSXSAVE) != 0;
> > > >  }
> > > >
> > > > +__attribute__((target ("no-avx512f")))
> > > >  int
> > > >  main ()
> > > >  {
> > > >
> > > > should work.
> > > >
> > >
> > > That's what i thought right now.
> > > The real problem is we use -mavx512bw to build a binary to determine
> > > whether the processor support AVX512BW
> > >
> >
> > This isn't a problem as long as AVX512BW isn't used in AVX512BW detection
> > codes.   All CPUID codes should have
> >
> > #pragma GCC push_options
> > #pragma GCC target("no-avx")  // no-sse?
> >
> > Otherwise we will run into problems with integer register spill with
> > AVX instructions.
> >
> >
> > --
> > H.J.
>
>
>
> --
> H.J.
Uros Bizjak Aug. 21, 2020, 6:25 p.m. UTC | #19
On Fri, Aug 21, 2020 at 6:29 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Fri, Aug 21, 2020 at 11:50 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Fri, Aug 21, 2020 at 5:41 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >
> > > > > > > gcc/
> > > > > > >         PR target/88808
> > > > > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > > > > >         QImode data go into mask registers.
> > > > > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > > > > >         for mask registers.
> > > > > > >         (*movqi_internal): Ditto.
> > > > > > >         (*anddi_1): Support mask register operations
> > > > > > >         (*and<mode>_1): Ditto.
> > > > > > >         (*andqi_1): Ditto.
> > > > > > >         (*andn<mode>_1): Ditto.
> > > > > > >         (*<code><mode>_1): Ditto.
> > > > > > >         (*<code>qi_1): Ditto.
> > > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > > > > >         registers.
> > > > > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > > > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > > > > >         that would convert "generic" patterns to mask patterns.
> > > > > > >         (*knotsi_1_zext): New define_insn.
> > > > > > >
> > > > > > > gcc/testsuite/
> > > > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > > > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > > > >
> > > > > > A little nit, please put new splitters after the instruction pattern.
> > > > > >
> > > > > > OK for the whole patch set with the above change,
> > > > > >
> > > > >
> > > > > Yes, thanks for the review.
> > > >
> > > > Please note that your patch introduces several testsuite fails with -m32:
> > > >
> > > > gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
> > > >
> > >
> > > I can't reproduce this failure.
> >
> > Because you are running it on AVX512 enabled target.
> >
> > > > Program received signal SIGILL, Illegal instruction.
> > > > 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> > > > __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> > > > pointer>,
> > > >     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> > > > 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
> > > >
> > > >    0x080490a3 <+51>:    cpuid
> > > >    0x080490a5 <+53>:    mov    $0x1,%eax
> > > >    0x080490aa <+58>:    mov    %ecx,%esi
> > > > => 0x080490ac <+60>:    kmovd  %ebx,%k0
> > > >    0x080490b0 <+64>:    mov    %edi,%ecx
> > > >    0x080490b2 <+66>:    mov    %edi,%ebx
> > > >
> > > > kmov insn is generated for __cpuid_count function, where the binary
> > > > determines, if the new instructions are supported. The binary will
> > > > crash in the detection code if the processor lacks AVX512
> > > > instructions.
> > > >
> > >
> > > IMHO, the testcase shouldn't be run on processors without AVX512BW.
> >
> > No, it could run, because it checks for AVX512BW at runtime.
> >
>
> Got it.
>
> > > Because in  avx512bitalg-vpopcntb-1.c, there's /*
> > > dg-require-effective-target avx512bw } */.
> >
> > This is to check the toolchain for support.
> >
> > > what's the version of your assembler?
> >
> > GNU assembler version 2.34-4.fc32
> >
>
> If assembler supports avx512bw, but processor not, the test would pass
> condition `dg-require-effective-target avx512bw` and be runned.
> then crashed for illegal instruction.

Please look at avx512-check.h. This is where main function lives.

> > Please add something like
> > X86_TUNE_INTER_UNIT_MOVES_FROM_MASK/X86_TUNE_INTER_UNIT_MOVES_TO_MASK
> > and enable them only for m_CORE_AVX512 (or perhaps m_INTEL).
> >
> > Handle this in inline_secondary_memory_needed to reject direct moves
> > for all other targets. This should disable direct moves for generic
> > targets.
> >
>
> Yes, I'll add it.

Thanks.

Uros.
diff mbox series

Patch

From df816952e6e76e3dccd53b6384075c41eed1a0f9 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Thu, 13 Aug 2020 14:20:43 +0800
Subject: [PATCH 4/4] Enable bitwise operation for type mask.

Enable operator or/xor/and/andn/not for mask register, kxnor is not
enabled since there's no corresponding instruction for general
registers.

gcc/
	PR target/88808
	* config/i386/i386.md: (*movsi_internal): Adjust constraints
	for mask registers.
	(*movhi_internal): Ditto.
	(*movqi_internal): Ditto.
	(*anddi_1): Support mask register operations
	(*and<mode>_1): Ditto.
	(*andqi_1): Ditto.
	(*andn<mode>_1): Ditto.
	(*<code><mode>_1): Ditto.
	(*<code>qi_1): Ditto.
	(*one_cmpl<mode>2_1): Ditto.
	(*one_cmplsi2_1_zext): Ditto.
	(*one_cmplqi2_1): Ditto.

gcc/testsuite/
	* gcc.target/i386/bitwise_mask_op-1.c: New test.
	* gcc.target/i386/bitwise_mask_op-2.c: New test.
	* gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
	* gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
	* gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
	* gcc.target/i386/avx512f-kmovw-5.c: Ditto.
---
 gcc/config/i386/i386.md                       | 262 +++++++++++++-----
 .../gcc.target/i386/avx512bw-kunpckwd-1.c     |   2 +-
 .../gcc.target/i386/avx512bw-kunpckwd-3.c     |   2 +-
 .../gcc.target/i386/avx512dq-kmovb-5.c        |   2 +-
 .../gcc.target/i386/avx512f-kmovw-5.c         |   2 +-
 .../gcc.target/i386/bitwise_mask_op-1.c       | 177 ++++++++++++
 .../gcc.target/i386/bitwise_mask_op-2.c       |   7 +
 7 files changed, 377 insertions(+), 77 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 74d207c3711..e8ad79d1b0a 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -2294,7 +2294,7 @@ 
 
 (define_insn "*movsi_internal"
   [(set (match_operand:SI 0 "nonimmediate_operand"
-    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
+    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,k")
 	(match_operand:SI 1 "general_operand"
     "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
@@ -2403,8 +2403,8 @@ 
 	   (symbol_ref "true")))])
 
 (define_insn "*movhi_internal"
-  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,k,k ,r,m,k")
-	(match_operand:HI 1 "general_operand"      "r ,rn,rm,rn,r,km,k,k,CBC"))]
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,*k,*k ,*r,*m,k")
+	(match_operand:HI 1 "general_operand"      "r ,rn,rm,rn,*r,*km,*k,*k,CBC"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   switch (get_attr_type (insn))
@@ -2491,9 +2491,9 @@ 
 
 (define_insn "*movqi_internal"
   [(set (match_operand:QI 0 "nonimmediate_operand"
-			"=Q,R,r,q,q,r,r ,?r,m ,k,k,r,m,k,k,k")
+			"=Q,R,r,q,q,r,r ,?r,m ,*k,*k,*r,*m,*k,k,k")
 	(match_operand:QI 1 "general_operand"
-			"Q ,R,r,n,m,q,rn, m,qn,r,k,k,k,m,C,BC"))]
+			"Q ,R,r,n,m,q,rn, m,qn,*r,*k,*k,*k,*m,C,BC"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   char buf[128];
@@ -9044,19 +9044,21 @@ 
 })
 
 (define_insn "*anddi_1"
-  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r")
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r,k")
 	(and:DI
-	 (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm")
-	 (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L")))
+	 (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm,k")
+	 (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L,k")))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_64BIT && ix86_binary_operator_ok (AND, DImode, operands)"
   "@
    and{l}\t{%k2, %k0|%k0, %k2}
    and{q}\t{%2, %0|%0, %2}
    and{q}\t{%2, %0|%0, %2}
-   #"
-  [(set_attr "type" "alu,alu,alu,imovx")
-   (set_attr "length_immediate" "*,*,*,0")
+   and{q}\t{%2, %0|%0, %2}
+   kandq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "x64,x64,x64,x64,avx512bw")
+   (set_attr "type" "alu,alu,alu,imovx,msklog")
+   (set_attr "length_immediate" "*,*,*,0,*")
    (set (attr "prefix_rex")
      (if_then_else
        (and (eq_attr "type" "imovx")
@@ -9064,7 +9066,7 @@ 
 		 (match_operand 1 "ext_QIreg_operand")))
        (const_string "1")
        (const_string "*")))
-   (set_attr "mode" "SI,DI,DI,SI")])
+   (set_attr "mode" "SI,DI,DI,SI,DI")])
 
 (define_insn_and_split "*anddi_1_btr"
   [(set (match_operand:DI 0 "nonimmediate_operand" "=rm")
@@ -9130,17 +9132,25 @@ 
    (set_attr "mode" "SI")])
 
 (define_insn "*and<mode>_1"
-  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya")
-	(and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm")
-		   (match_operand:SWI24 2 "<general_operand>" "r<i>,m,L")))
+  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya,k")
+	(and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm,k")
+		   (match_operand:SWI24 2 "<general_operand>" "r<i>,m,L,k")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (AND, <MODE>mode, operands)"
   "@
    and{<imodesuffix>}\t{%2, %0|%0, %2}
    and{<imodesuffix>}\t{%2, %0|%0, %2}
-   #"
-  [(set_attr "type" "alu,alu,imovx")
-   (set_attr "length_immediate" "*,*,0")
+   and{<imodesuffix>}\t{%2, %0|%0, %2}
+   kand<mskmodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set (attr "isa")
+	(cond [(eq_attr "alternative" "3")
+		 (if_then_else (eq_attr "mode" "SI")
+		   (const_string "avx512bw")
+		   (const_string "avx512f"))
+	      ]
+	      (const_string "*")))
+   (set_attr "type" "alu,alu,imovx,msklog")
+   (set_attr "length_immediate" "*,*,0,*")
    (set (attr "prefix_rex")
      (if_then_else
        (and (eq_attr "type" "imovx")
@@ -9148,20 +9158,39 @@ 
 		 (match_operand 1 "ext_QIreg_operand")))
        (const_string "1")
        (const_string "*")))
-   (set_attr "mode" "<MODE>,<MODE>,SI")])
+   (set_attr "mode" "<MODE>,<MODE>,SI,<MODE>")])
 
 (define_insn "*andqi_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r")
-	(and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0")
-		(match_operand:QI 2 "general_operand" "qn,m,rn")))
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k")
+	(and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
+		(match_operand:QI 2 "general_operand" "qn,m,rn,k")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (AND, QImode, operands)"
-  "@
-   and{b}\t{%2, %0|%0, %2}
-   and{b}\t{%2, %0|%0, %2}
-   and{l}\t{%k2, %k0|%k0, %k2}"
-  [(set_attr "type" "alu")
-   (set_attr "mode" "QI,QI,SI")
+{
+  switch (which_alternative)
+    {
+     case 0:
+     case 1:
+       return "and{b}\t{%2, %0|%0, %2}";
+     case 2:
+       return "and{l}\t{%k2, %k0|%k0, %k2}";
+     case 3:
+       if (TARGET_AVX512DQ)
+	 return "kandb\t{%2, %1, %0|%0, %1, %2}";
+       return "kandw\t{%2, %1, %0|%0, %1, %2}";
+     default:
+       gcc_unreachable ();
+     }
+}
+  [(set_attr "type" "alu,alu,alu,msklog")
+   (set (attr "mode")
+	(cond [(eq_attr "alternative" "2")
+		 (const_string "SI")
+		(and (eq_attr "alternative" "3")
+		     (match_test "!TARGET_AVX512DQ"))
+		 (const_string "HI")
+	       ]
+	       (const_string "QI")))
    ;; Potential partial reg stall on alternative 2.
    (set (attr "preferred_for_speed")
      (cond [(eq_attr "alternative" "2")
@@ -9539,28 +9568,53 @@ 
 })
 
 (define_insn "*andn<mode>_1"
-  [(set (match_operand:SWI48 0 "register_operand" "=r,r")
+  [(set (match_operand:SWI48 0 "register_operand" "=r,r,k")
 	(and:SWI48
-	  (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r"))
-	  (match_operand:SWI48 2 "nonimmediate_operand" "r,m")))
+	  (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r,k"))
+	  (match_operand:SWI48 2 "nonimmediate_operand" "r,m,k")))
    (clobber (reg:CC FLAGS_REG))]
-  "TARGET_BMI"
-  "andn\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "bitmanip")
-   (set_attr "btver2_decode" "direct, double")
+  "TARGET_BMI || TARGET_AVX512BW"
+  "@
+    andn\t{%2, %1, %0|%0, %1, %2}
+    andn\t{%2, %1, %0|%0, %1, %2}
+    kandn<mskmodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "bmi,bmi,avx512bw")
+   (set_attr "type" "bitmanip,bitmanip,msklog")
+   (set_attr "btver2_decode" "direct, double,*")
    (set_attr "mode" "<MODE>")])
 
 (define_insn "*andn<mode>_1"
-  [(set (match_operand:SWI12 0 "register_operand" "=r")
+  [(set (match_operand:SWI12 0 "register_operand" "=r,k")
 	(and:SWI12
-	  (not:SWI12 (match_operand:SWI12 1 "register_operand" "r"))
-	  (match_operand:SWI12 2 "register_operand" "r")))
+	  (not:SWI12 (match_operand:SWI12 1 "register_operand" "r,k"))
+	  (match_operand:SWI12 2 "register_operand" "r,k")))
    (clobber (reg:CC FLAGS_REG))]
-  "TARGET_BMI"
-  "andn\t{%k2, %k1, %k0|%k0, %k1, %k2}"
-  [(set_attr "type" "bitmanip")
-   (set_attr "btver2_decode" "direct")
-   (set_attr "mode" "SI")])
+  "TARGET_BMI || TARGET_AVX512BW"
+{
+  switch (which_alternative)
+    {
+    case 0:
+      return "andn\t{%k2, %k1, %k0|%k0, %k1, %k2}";
+    case 1:
+      if (TARGET_AVX512DQ)
+	return "kandn<mskmodesuffix>\t{%2, %1, %0|%0, %1, %2}";
+      return "kandnw\t{%2, %1, %0|%0, %1, %2}";
+    default:
+      gcc_unreachable ();
+    }
+}
+
+  [(set_attr "isa" "bmi,avx512f")
+   (set_attr "type" "bitmanip,msklog")
+   (set_attr "btver2_decode" "direct,*")
+   (set (attr "mode")
+	(cond [(eq_attr "alternative" "0")
+		 (const_string "SI")
+	       (and (eq_attr "alternative" "1")
+		    (match_test "!TARGET_AVX512DQ"))
+		  (const_string "HI")
+	      ]
+	      (const_string "<MODE>")))])
 
 (define_insn "*andn_<mode>_ccno"
   [(set (reg FLAGS_REG)
@@ -9631,14 +9685,24 @@ 
 })
 
 (define_insn "*<code><mode>_1"
-  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r")
+  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,k")
 	(any_or:SWI248
-	 (match_operand:SWI248 1 "nonimmediate_operand" "%0,0")
-	 (match_operand:SWI248 2 "<general_operand>" "r<i>,m")))
+	 (match_operand:SWI248 1 "nonimmediate_operand" "%0,0,k")
+	 (match_operand:SWI248 2 "<general_operand>" "r<i>,m,k")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
-  "<logic>{<imodesuffix>}\t{%2, %0|%0, %2}"
-  [(set_attr "type" "alu")
+  "@
+    <logic>{<imodesuffix>}\t{%2, %0|%0, %2}
+    <logic>{<imodesuffix>}\t{%2, %0|%0, %2}
+    k<logic><mskmodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set (attr "isa")
+	(cond [(eq_attr "alternative" "2")
+		 (if_then_else (eq_attr "mode" "SI,DI")
+		   (const_string "avx512bw")
+		   (const_string "avx512f"))
+	      ]
+	      (const_string "*")))
+   (set_attr "type" "alu, alu, msklog")
    (set_attr "mode" "<MODE>")])
 
 (define_insn_and_split "*iordi_1_bts"
@@ -9711,17 +9775,37 @@ 
    (set_attr "mode" "SI")])
 
 (define_insn "*<code>qi_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r")
-	(any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0")
-		   (match_operand:QI 2 "general_operand" "qn,m,rn")))
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k")
+	(any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
+		   (match_operand:QI 2 "general_operand" "qn,m,rn,k")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (<CODE>, QImode, operands)"
-  "@
-   <logic>{b}\t{%2, %0|%0, %2}
-   <logic>{b}\t{%2, %0|%0, %2}
-   <logic>{l}\t{%k2, %k0|%k0, %k2}"
-  [(set_attr "type" "alu")
-   (set_attr "mode" "QI,QI,SI")
+{
+  switch (which_alternative)
+    {
+    case 0:
+    case 1:
+      return "<logic>{b}\t{%2, %0|%0, %2}";
+    case 2:
+      return "<logic>{l}\t{%k2, %k0|%k0, %k2}";
+    case 3:
+      if (TARGET_AVX512DQ)
+	return "k<logic>b\t{%2, %1, %0|%0, %1, %2}";
+      return "k<logic>w\t{%2, %1, %0|%0, %1, %2}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "*,*,*,avx512f")
+   (set_attr "type" "alu,alu,alu,msklog")
+   (set (attr "mode")
+	(cond [(eq_attr "alternative" "2")
+		 (const_string "SI")
+		(and (eq_attr "alternative" "3")
+		     (match_test "!TARGET_AVX512DQ"))
+		 (const_string "HI")
+	       ]
+	       (const_string "QI")))
    ;; Potential partial reg stall on alternative 2.
    (set (attr "preferred_for_speed")
      (cond [(eq_attr "alternative" "2")
@@ -10370,31 +10454,63 @@ 
   "split_double_mode (DImode, &operands[0], 2, &operands[0], &operands[2]);")
 
 (define_insn "*one_cmpl<mode>2_1"
-  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm")
-	(not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0")))]
+  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,k")
+	(not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0,k")))]
   "ix86_unary_operator_ok (NOT, <MODE>mode, operands)"
-  "not{<imodesuffix>}\t%0"
-  [(set_attr "type" "negnot")
+  "@
+  not{<imodesuffix>}\t%0
+  knot<mskmodesuffix>\t{%1, %0|%0, %1}"
+  [(set (attr "isa")
+	(cond [(eq_attr "alternative" "2")
+		 (if_then_else (eq_attr "mode" "SI,DI")
+		   (const_string "avx512bw")
+		   (const_string "avx512f"))
+	      ]
+	      (const_string "*")))
+   (set_attr "type" "negnot,msklog")
    (set_attr "mode" "<MODE>")])
 
 (define_insn "*one_cmplsi2_1_zext"
-  [(set (match_operand:DI 0 "register_operand" "=r")
+  [(set (match_operand:DI 0 "register_operand" "=r,k")
 	(zero_extend:DI
-	  (not:SI (match_operand:SI 1 "register_operand" "0"))))]
+	  (not:SI (match_operand:SI 1 "register_operand" "0,k"))))]
   "TARGET_64BIT && ix86_unary_operator_ok (NOT, SImode, operands)"
-  "not{l}\t%k0"
-  [(set_attr "type" "negnot")
-   (set_attr "mode" "SI")])
+  "@
+    not{l}\t%k0
+    knotd\t{%1, %0|%0, %1}"
+  [(set_attr "isa" "x64,avx512bw")
+   (set_attr "type" "negnot,msklog")
+   (set_attr "mode" "SI,SI")])
 
 (define_insn "*one_cmplqi2_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r")
-	(not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0")))]
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,k")
+	(not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,k")))]
   "ix86_unary_operator_ok (NOT, QImode, operands)"
-  "@
-   not{b}\t%0
-   not{l}\t%k0"
-  [(set_attr "type" "negnot")
-   (set_attr "mode" "QI,SI")
+{
+  switch (which_alternative)
+    {
+    case 0:
+      return "not{b}\t%0";
+    case 1:
+      return "not{l}\t%k0";
+    case 2:
+      if (TARGET_AVX512DQ)
+	return "knotb\t{%1, %0|%0, %1}";
+      return "knotw\t{%1, %0|%0, %1}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "*,*,avx512f")
+   (set_attr "type" "negnot,negnot,msklog")
+   (set (attr "mode")
+	(cond [(eq_attr "alternative" "1")
+		 (const_string "SI")
+		(and (eq_attr "alternative" "2")
+		     (match_test "!TARGET_AVX512DQ"))
+		 (const_string "HI")
+	       ]
+	       (const_string "QI")))
    ;; Potential partial reg stall on alternative 1.
    (set (attr "preferred_for_speed")
      (cond [(eq_attr "alternative" "1")
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
index 94422f36010..46d9351f275 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
@@ -1,6 +1,6 @@ 
 /* { dg-do compile } */
 /* { dg-options "-mavx512bw -O2" } */
-/* { dg-final { scan-assembler-times "kunpckwd\[ \\t\]+\[^\{\n\]*%k\[1-7\](?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "kunpckwd\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
 
 #include <immintrin.h>
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
index c68ad8cc1f7..fe13f4f33fc 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
@@ -1,6 +1,6 @@ 
 /* { dg-do compile } */
 /* { dg-options "-mavx512bw -O2" } */
-/* { dg-final { scan-assembler-times "kunpckwd\[ \\t\]+\[^\{\n\]*%k\[1-7\](?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "kunpckwd\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
 
 #include <immintrin.h>
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c b/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
index 49817097e26..114e03ee93d 100644
--- a/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
+++ b/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
@@ -1,5 +1,5 @@ 
 /* { dg-do compile } */
-/* { dg-options "-mavx512dq -O2" } */
+/* { dg-options "-mavx512dq -mno-avx512bw -O2" } */
 /* { dg-final { scan-assembler-times "kmovb\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
 
 #include <immintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c b/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
index 7bb34d34d8d..79d37394b36 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
@@ -1,5 +1,5 @@ 
 /* { dg-do compile } */
-/* { dg-options "-mavx512f -O2" } */
+/* { dg-options "-mavx512f -mno-avx512bw -O2" } */
 /* { dg-final { scan-assembler-times "kmovw\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
 
 #include <immintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
new file mode 100644
index 00000000000..2757bcaaf50
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
@@ -0,0 +1,177 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mno-avx512dq -O2" } */
+
+#include <immintrin.h>
+__m512i
+foo_orq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
+  return _mm512_mask_add_epi8 (c, m1 | m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "korq" "1" { target { ! ia32 } } } } */
+
+__m512i
+foo_ord (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
+  return _mm512_mask_add_epi16 (c, m1 | m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "kord" "1" } }  */
+
+__m512i
+foo_orw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
+  return _mm512_mask_add_epi32 (c, m1 | m2, a, d);
+}
+
+__m512i
+foo_orb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
+  return _mm512_mask_add_epi64 (c, m1 | m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "korw" "2" } }  */
+
+__m512i
+foo_xorq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
+  return _mm512_mask_add_epi8 (c, m1 ^ m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "kxorq" "1" { target { ! ia32 } } } }  */
+
+__m512i
+foo_xord (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
+  return _mm512_mask_add_epi16 (c, m1 ^ m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "kxord" "1" } }  */
+
+__m512i
+foo_xorw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
+  return _mm512_mask_add_epi32 (c, m1 ^ m2, a, d);
+}
+
+__m512i
+foo_xorb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
+  return _mm512_mask_add_epi64 (c, m1 ^ m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "korw" "2" } }  */
+
+__m512i
+foo_andq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
+  return _mm512_mask_add_epi8 (c, m1 & m2, a, d);
+}
+
+__m512i
+foo_andd (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
+  return _mm512_mask_add_epi16 (c, m1 & m2, a, d);
+}
+
+__m512i
+foo_andw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
+  return _mm512_mask_add_epi32 (c, m1 & m2, a, d);
+}
+
+__m512i
+foo_andb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
+  return _mm512_mask_add_epi64 (c, m1 & m2, a, d);
+}
+
+__m512i
+foo_andnq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
+  return _mm512_mask_add_epi8 (c, m1 & ~m2, a, d);
+}
+
+__m512i
+foo_andnd (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
+  return _mm512_mask_add_epi16 (c, m1 & ~m2, a, d);
+}
+
+__m512i
+foo_andnw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
+  return _mm512_mask_add_epi32 (c, m1 & ~m2, a, d);
+}
+
+__m512i
+foo_andnb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
+  return _mm512_mask_add_epi64 (c, m1 & ~m2, a, d);
+}
+
+__m512i
+foo_notq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  return _mm512_mask_add_epi8 (c, ~m1, a, d);
+}
+
+/* { dg-final { scan-assembler-times "knotq" "2" { target { ! ia32 } } } }  */
+
+__m512i
+foo_notd (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  return _mm512_mask_add_epi16 (c, ~m1, a, d);
+}
+
+/* { dg-final { scan-assembler-times "knotd" "2" { target { ! ia32 } } } }  */
+
+__m512i
+foo_notw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  return _mm512_mask_add_epi32 (c, ~m1, a, d);
+}
+
+__m512i
+foo_notb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  return _mm512_mask_add_epi64 (c, ~m1, a, d);
+}
+
+/* { dg-final { scan-assembler-times "knotw" "4" } }  */
diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
new file mode 100644
index 00000000000..277c5a98079
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
@@ -0,0 +1,7 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512dq -O2" } */
+/* { dg-final { scan-assembler-times "knotb" "2" } }  */
+/* { dg-final { scan-assembler-times "korb" "1" } }  */
+/* { dg-final { scan-assembler-times "kxorb" "1" } }  */
+#include "bitwise_mask_op-1.c"
+
-- 
2.18.1