diff mbox series

Enable GCC support for AVX512_VP2INTERSECT.

Message ID CAMZc-by58aYYN1eEwb2ug=raF=A4TFEJCxJPiy7RW4+CZ3L4Ug@mail.gmail.com
State New
Headers show
Series Enable GCC support for AVX512_VP2INTERSECT. | expand

Commit Message

Hongtao Liu June 6, 2019, 5:56 a.m. UTC
Hi Uros and all:
  This patch is about to enable support for AVX512_VP2INTERSECT which will
be in Willow Cove. There are two instructions for AVX512_VP2INTERSECT:
VP2INTERSECTD and VP2INTERSECTQ. More details please refer to
https://software.intel.com/sites/default/files/managed/c5/15/architecture-instruction-set-extensions-programming-reference.pdf

  Bootstrap is ok, and no regressions for i386/x86-64 testsuite.

Changelog:

gcc/
+2019-06-06  Hongtao Liu  <hongtao.liu@intel.com>
+     H.J. Lu  <hongjiu.lu@intel.com>
+     Olga Makhotina  <olga.makhotina@intel.com>
+
+ * common/config/i386/i386-common.c
+ (OPTION_MASK_ISA_AVX512VP2INTERSECT_SET,
+ OPTION_MASK_ISA_AVX512VP2INTERSECT_UNSET): New macros.
+ (OPTION_MASK_ISA2_AVX512F_UNSET): Add
+ OPTION_MASK_ISA_AVX512VP2INTERSECT_UNSET.
+ (ix86_handle_option): Handle -mavx512vp2intersect.
+ * config/i386/avx512vp2intersectintrin.h: New.
+ * config/i386/avx512vp2intersectvlintrin.h: New.
+ * config/i386/cpuid.h (bit_AVX512VP2INTERSECT): New.
+ * config/i386/driver-i386.c (host_detect_local_cpu): Detect
+ AVX512VP2INTERSECT.
+ * config/i386/i386-builtin-types.def: Add new types.
+ * config/i386/i386-builtin.def: Add new builtins.
+ * config/i386/i386-builtins.c: (enum processor_features): Add
+ F_AVX512VP2INTERSECT.
+ (static const _isa_names_table isa_names_table): Ditto.
+ * config/i386/i386-c.c (ix86_target_macros_internal): Define
+ __AVX512VP2INTERSECT__.
+ * config/i386/i386-expand.c (ix86_expand_builtin): Expand
+ IX86_BUILTIN_2INTERSECTD512, IX86_BUILTIN_2INTERSECTQ512,
+ IX86_BUILTIN_2INTERSECTD256, IX86_BUILTIN_2INTERSECTQ256,
+ IX86_BUILTIN_2INTERSECTD128, IX86_BUILTIN_2INTERSECTQ128.
+ * config/i386/i386-modes.def (P2QI, P2HI): New modes.
+ * config/i386/i386-options.c (ix86_target_string): Add
+ -mavx512vp2intersect.
+ (ix86_option_override_internal): Handle AVX512VP2INTERSECT.
+ * config/i386/i386.c (ix86_hard_regno_nregs): Allocate two regs for
+ P2HImode and P2QImode.
+ (ix86_hard_regno_mode_ok): Register pair only starts at even hardreg
+ number for P2QImode and P2HImode.
+ * config/i386/i386.h (TARGET_AVX512VP2INTERSECT,
+ TARGET_AVX512VP2INTERSECT_P): New.
+ (PTA_AVX512VP2INTERSECT): Ditto.
+ * config/i386/i386.opt: Add -mavx512vp2intersect.
+ * config/i386/immintrin.h: Include avx512vp2intersectintrin.h and
+ avx512vp2intersectvlintrin.h.
+ * config/i386/sse.md (define_c_enum "unspec"): Add UNSPEC_VP2INTERSECT.
+ (define_mode_iterator VI48_AVX512VP2VL): New.
+ (avx512vp2intersect_2intersect<mode>,
+ avx512vp2intersect_2intersectv16si): New define_insn patterns.
+ (*vec_extractp2hi, *vec_extractp2qi): New define_insn_and_split
+ patterns.
+ * config.gcc: Add avx512vp2intersectvlintrin.h and
+ avx512vp2intersectintrin.h to extra_headers.
+ * doc/invoke.texi: Document -mavx512vp2intersect.
+

gcc/testsuite/
+2019-06-06  Hongtao Liu  <hongtao.liu@intel.com>
+     Olga Makhotina  <olga.makhotina@intel.com>
+
+ * gcc.target/i386/avx512-check.h: Handle bit_AVX512VP2INTERSECT.
+ * gcc.target/i386/avx512vp2intersect-2intersect-1a.c: New test.
+ * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Likewise.
+ * gcc.target/i386/avx512vp2intersect-2intersectvl-1a.c: Likewise.
+ * gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c: Likewise.
+ * gcc.target/i386/sse-12.c: Add -mavx512vp2intersect.
+ * gcc.target/i386/sse-13.c: Likewsie.
+ * gcc.target/i386/sse-14.c: Likewise.
+ * gcc.target/i386/sse-22.c: Likewise.
+ * gcc.target/i386/sse-23.c: Likewise.
+ * g++.dg/other/i386-2.C: Likewise.
+ * g++.dg/other/i386-3.C: Likewise.
+

Comments

Uros Bizjak June 6, 2019, 12:12 p.m. UTC | #1
On Thu, Jun 6, 2019 at 7:54 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> Hi Uros and all:
>   This patch is about to enable support for AVX512_VP2INTERSECT which will
> be in Willow Cove. There are two instructions for AVX512_VP2INTERSECT:
> VP2INTERSECTD and VP2INTERSECTQ. More details please refer to
> https://software.intel.com/sites/default/files/managed/c5/15/architecture-instruction-set-extensions-programming-reference.pdf
>
>   Bootstrap is ok, and no regressions for i386/x86-64 testsuite.
>
> Changelog:
>
> gcc/
> +2019-06-06  Hongtao Liu  <hongtao.liu@intel.com>
> +     H.J. Lu  <hongjiu.lu@intel.com>
> +     Olga Makhotina  <olga.makhotina@intel.com>
> +
> + * common/config/i386/i386-common.c
> + (OPTION_MASK_ISA_AVX512VP2INTERSECT_SET,
> + OPTION_MASK_ISA_AVX512VP2INTERSECT_UNSET): New macros.
> + (OPTION_MASK_ISA2_AVX512F_UNSET): Add
> + OPTION_MASK_ISA_AVX512VP2INTERSECT_UNSET.
> + (ix86_handle_option): Handle -mavx512vp2intersect.
> + * config/i386/avx512vp2intersectintrin.h: New.
> + * config/i386/avx512vp2intersectvlintrin.h: New.
> + * config/i386/cpuid.h (bit_AVX512VP2INTERSECT): New.
> + * config/i386/driver-i386.c (host_detect_local_cpu): Detect
> + AVX512VP2INTERSECT.
> + * config/i386/i386-builtin-types.def: Add new types.
> + * config/i386/i386-builtin.def: Add new builtins.
> + * config/i386/i386-builtins.c: (enum processor_features): Add
> + F_AVX512VP2INTERSECT.
> + (static const _isa_names_table isa_names_table): Ditto.
> + * config/i386/i386-c.c (ix86_target_macros_internal): Define
> + __AVX512VP2INTERSECT__.
> + * config/i386/i386-expand.c (ix86_expand_builtin): Expand
> + IX86_BUILTIN_2INTERSECTD512, IX86_BUILTIN_2INTERSECTQ512,
> + IX86_BUILTIN_2INTERSECTD256, IX86_BUILTIN_2INTERSECTQ256,
> + IX86_BUILTIN_2INTERSECTD128, IX86_BUILTIN_2INTERSECTQ128.
> + * config/i386/i386-modes.def (P2QI, P2HI): New modes.
> + * config/i386/i386-options.c (ix86_target_string): Add
> + -mavx512vp2intersect.
> + (ix86_option_override_internal): Handle AVX512VP2INTERSECT.
> + * config/i386/i386.c (ix86_hard_regno_nregs): Allocate two regs for
> + P2HImode and P2QImode.
> + (ix86_hard_regno_mode_ok): Register pair only starts at even hardreg
> + number for P2QImode and P2HImode.
> + * config/i386/i386.h (TARGET_AVX512VP2INTERSECT,
> + TARGET_AVX512VP2INTERSECT_P): New.
> + (PTA_AVX512VP2INTERSECT): Ditto.
> + * config/i386/i386.opt: Add -mavx512vp2intersect.
> + * config/i386/immintrin.h: Include avx512vp2intersectintrin.h and
> + avx512vp2intersectvlintrin.h.
> + * config/i386/sse.md (define_c_enum "unspec"): Add UNSPEC_VP2INTERSECT.
> + (define_mode_iterator VI48_AVX512VP2VL): New.
> + (avx512vp2intersect_2intersect<mode>,
> + avx512vp2intersect_2intersectv16si): New define_insn patterns.
> + (*vec_extractp2hi, *vec_extractp2qi): New define_insn_and_split
> + patterns.
> + * config.gcc: Add avx512vp2intersectvlintrin.h and
> + avx512vp2intersectintrin.h to extra_headers.
> + * doc/invoke.texi: Document -mavx512vp2intersect.
> +
>
> gcc/testsuite/
> +2019-06-06  Hongtao Liu  <hongtao.liu@intel.com>
> +     Olga Makhotina  <olga.makhotina@intel.com>
> +
> + * gcc.target/i386/avx512-check.h: Handle bit_AVX512VP2INTERSECT.
> + * gcc.target/i386/avx512vp2intersect-2intersect-1a.c: New test.
> + * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Likewise.
> + * gcc.target/i386/avx512vp2intersect-2intersectvl-1a.c: Likewise.
> + * gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c: Likewise.
> + * gcc.target/i386/sse-12.c: Add -mavx512vp2intersect.
> + * gcc.target/i386/sse-13.c: Likewsie.
> + * gcc.target/i386/sse-14.c: Likewise.
> + * gcc.target/i386/sse-22.c: Likewise.
> + * gcc.target/i386/sse-23.c: Likewise.
> + * g++.dg/other/i386-2.C: Likewise.
> + * g++.dg/other/i386-3.C: Likewise.
> +

+    case OPT_mavx512vp2intersect:
+      if (value)
+        {
+          opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512VP2INTERSECT_SET;
+          opts->x_ix86_isa_flags2_explicit |=
OPTION_MASK_ISA_AVX512VP2INTERSECT_SET;
+  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F_SET;
+  opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX512F_SET;
+        }

some space/tab mixup here.

+(define_mode_iterator VI48_AVX512VP2VL
+  [V8DI
+  (V4DI  "TARGET_AVX512VL") (V2DI  "TARGET_AVX512VL")
+  (V8SI "TARGET_AVX512VL") (V4SI  "TARGET_AVX512VL")])

also here (or maybe a vertical alignment issue).

+      op2 = copy_to_reg (op2);
+      op3 = copy_to_reg (op3);

The predicate says that this one can be memory operand as well. I
suggest you use

if (!insn_data[icode].operand[X].predicate (opX, modeX))
  opX = copy_to_mode_reg (modeX, opX);

This would also handle eventual VOIDmode vector 0 operand.

+
+      op4 = gen_reg_rtx (mode4);
+      emit_insn (GEN_FCN (icode) (op4, op2, op3));
+      mode0 = GET_MODE_INNER (GET_MODE (op4));
+      pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (0)));
+      pat2 = gen_rtx_VEC_SELECT (mode0, op4, pat);
+      emit_move_insn (gen_rtx_MEM (mode0, op0), pat2);
+      pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1)));
+      pat2 = gen_rtx_VEC_SELECT (mode0, op4, pat);
+      emit_move_insn (gen_rtx_MEM (mode0, op1), pat2);
+

You should probably emit a subreg here (using simplify_gen_subreg) and
leave to the register allocator to emit correct hard register out of a
register pair. Using this approach, *vec_extractp2hi and
*vec_extractp2hi should not be necessary anymore; RA will reduce the
subreg RTX to a movqi/movhi by itself.

Uros.
Uros Bizjak June 6, 2019, 12:26 p.m. UTC | #2
On Thu, Jun 6, 2019 at 2:12 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Thu, Jun 6, 2019 at 7:54 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > Hi Uros and all:
> >   This patch is about to enable support for AVX512_VP2INTERSECT which will
> > be in Willow Cove. There are two instructions for AVX512_VP2INTERSECT:
> > VP2INTERSECTD and VP2INTERSECTQ. More details please refer to
> > https://software.intel.com/sites/default/files/managed/c5/15/architecture-instruction-set-extensions-programming-reference.pdf
> >
> >   Bootstrap is ok, and no regressions for i386/x86-64 testsuite.
> >
> > Changelog:
> >
> > gcc/
> > +2019-06-06  Hongtao Liu  <hongtao.liu@intel.com>
> > +     H.J. Lu  <hongjiu.lu@intel.com>
> > +     Olga Makhotina  <olga.makhotina@intel.com>
> > +
> > + * common/config/i386/i386-common.c
> > + (OPTION_MASK_ISA_AVX512VP2INTERSECT_SET,
> > + OPTION_MASK_ISA_AVX512VP2INTERSECT_UNSET): New macros.
> > + (OPTION_MASK_ISA2_AVX512F_UNSET): Add
> > + OPTION_MASK_ISA_AVX512VP2INTERSECT_UNSET.
> > + (ix86_handle_option): Handle -mavx512vp2intersect.
> > + * config/i386/avx512vp2intersectintrin.h: New.
> > + * config/i386/avx512vp2intersectvlintrin.h: New.
> > + * config/i386/cpuid.h (bit_AVX512VP2INTERSECT): New.
> > + * config/i386/driver-i386.c (host_detect_local_cpu): Detect
> > + AVX512VP2INTERSECT.
> > + * config/i386/i386-builtin-types.def: Add new types.
> > + * config/i386/i386-builtin.def: Add new builtins.
> > + * config/i386/i386-builtins.c: (enum processor_features): Add
> > + F_AVX512VP2INTERSECT.
> > + (static const _isa_names_table isa_names_table): Ditto.
> > + * config/i386/i386-c.c (ix86_target_macros_internal): Define
> > + __AVX512VP2INTERSECT__.
> > + * config/i386/i386-expand.c (ix86_expand_builtin): Expand
> > + IX86_BUILTIN_2INTERSECTD512, IX86_BUILTIN_2INTERSECTQ512,
> > + IX86_BUILTIN_2INTERSECTD256, IX86_BUILTIN_2INTERSECTQ256,
> > + IX86_BUILTIN_2INTERSECTD128, IX86_BUILTIN_2INTERSECTQ128.
> > + * config/i386/i386-modes.def (P2QI, P2HI): New modes.
> > + * config/i386/i386-options.c (ix86_target_string): Add
> > + -mavx512vp2intersect.
> > + (ix86_option_override_internal): Handle AVX512VP2INTERSECT.
> > + * config/i386/i386.c (ix86_hard_regno_nregs): Allocate two regs for
> > + P2HImode and P2QImode.
> > + (ix86_hard_regno_mode_ok): Register pair only starts at even hardreg
> > + number for P2QImode and P2HImode.
> > + * config/i386/i386.h (TARGET_AVX512VP2INTERSECT,
> > + TARGET_AVX512VP2INTERSECT_P): New.
> > + (PTA_AVX512VP2INTERSECT): Ditto.
> > + * config/i386/i386.opt: Add -mavx512vp2intersect.
> > + * config/i386/immintrin.h: Include avx512vp2intersectintrin.h and
> > + avx512vp2intersectvlintrin.h.
> > + * config/i386/sse.md (define_c_enum "unspec"): Add UNSPEC_VP2INTERSECT.
> > + (define_mode_iterator VI48_AVX512VP2VL): New.
> > + (avx512vp2intersect_2intersect<mode>,
> > + avx512vp2intersect_2intersectv16si): New define_insn patterns.
> > + (*vec_extractp2hi, *vec_extractp2qi): New define_insn_and_split
> > + patterns.
> > + * config.gcc: Add avx512vp2intersectvlintrin.h and
> > + avx512vp2intersectintrin.h to extra_headers.
> > + * doc/invoke.texi: Document -mavx512vp2intersect.
> > +
> >
> > gcc/testsuite/
> > +2019-06-06  Hongtao Liu  <hongtao.liu@intel.com>
> > +     Olga Makhotina  <olga.makhotina@intel.com>
> > +
> > + * gcc.target/i386/avx512-check.h: Handle bit_AVX512VP2INTERSECT.
> > + * gcc.target/i386/avx512vp2intersect-2intersect-1a.c: New test.
> > + * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Likewise.
> > + * gcc.target/i386/avx512vp2intersect-2intersectvl-1a.c: Likewise.
> > + * gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c: Likewise.
> > + * gcc.target/i386/sse-12.c: Add -mavx512vp2intersect.
> > + * gcc.target/i386/sse-13.c: Likewsie.
> > + * gcc.target/i386/sse-14.c: Likewise.
> > + * gcc.target/i386/sse-22.c: Likewise.
> > + * gcc.target/i386/sse-23.c: Likewise.
> > + * g++.dg/other/i386-2.C: Likewise.
> > + * g++.dg/other/i386-3.C: Likewise.
> > +
>
> +    case OPT_mavx512vp2intersect:
> +      if (value)
> +        {
> +          opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512VP2INTERSECT_SET;
> +          opts->x_ix86_isa_flags2_explicit |=
> OPTION_MASK_ISA_AVX512VP2INTERSECT_SET;
> +  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F_SET;
> +  opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX512F_SET;
> +        }
>
> some space/tab mixup here.
>
> +(define_mode_iterator VI48_AVX512VP2VL
> +  [V8DI
> +  (V4DI  "TARGET_AVX512VL") (V2DI  "TARGET_AVX512VL")
> +  (V8SI "TARGET_AVX512VL") (V4SI  "TARGET_AVX512VL")])
>
> also here (or maybe a vertical alignment issue).
>
> +      op2 = copy_to_reg (op2);
> +      op3 = copy_to_reg (op3);
>
> The predicate says that this one can be memory operand as well. I
> suggest you use
>
> if (!insn_data[icode].operand[X].predicate (opX, modeX))
>   opX = copy_to_mode_reg (modeX, opX);
>
> This would also handle eventual VOIDmode vector 0 operand.
>
> +
> +      op4 = gen_reg_rtx (mode4);
> +      emit_insn (GEN_FCN (icode) (op4, op2, op3));
> +      mode0 = GET_MODE_INNER (GET_MODE (op4));
> +      pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (0)));
> +      pat2 = gen_rtx_VEC_SELECT (mode0, op4, pat);
> +      emit_move_insn (gen_rtx_MEM (mode0, op0), pat2);
> +      pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1)));
> +      pat2 = gen_rtx_VEC_SELECT (mode0, op4, pat);
> +      emit_move_insn (gen_rtx_MEM (mode0, op1), pat2);
> +
>
> You should probably emit a subreg here (using simplify_gen_subreg) and
> leave to the register allocator to emit correct hard register out of a
> register pair. Using this approach, *vec_extractp2hi and
> *vec_extractp2hi should not be necessary anymore; RA will reduce the
> subreg RTX to a movqi/movhi by itself.

+/* Register pair.  */
+VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI */
+VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI P4QI */

I think

INT_MODE (P2QI, 16);
INT_MODE (P2HI, 32);

with the above subreg approach should work.

Uros.
H.J. Lu June 7, 2019, 3:04 p.m. UTC | #3
On Thu, Jun 6, 2019 at 5:26 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Thu, Jun 6, 2019 at 2:12 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Thu, Jun 6, 2019 at 7:54 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > Hi Uros and all:
> > >   This patch is about to enable support for AVX512_VP2INTERSECT which will
> > > be in Willow Cove. There are two instructions for AVX512_VP2INTERSECT:
> > > VP2INTERSECTD and VP2INTERSECTQ. More details please refer to
> > > https://software.intel.com/sites/default/files/managed/c5/15/architecture-instruction-set-extensions-programming-reference.pdf
> > >
> > >   Bootstrap is ok, and no regressions for i386/x86-64 testsuite.
> > >
> > > Changelog:
> > >
> > > gcc/
> > > +2019-06-06  Hongtao Liu  <hongtao.liu@intel.com>
> > > +     H.J. Lu  <hongjiu.lu@intel.com>
> > > +     Olga Makhotina  <olga.makhotina@intel.com>
> > > +
> > > + * common/config/i386/i386-common.c
> > > + (OPTION_MASK_ISA_AVX512VP2INTERSECT_SET,
> > > + OPTION_MASK_ISA_AVX512VP2INTERSECT_UNSET): New macros.
> > > + (OPTION_MASK_ISA2_AVX512F_UNSET): Add
> > > + OPTION_MASK_ISA_AVX512VP2INTERSECT_UNSET.
> > > + (ix86_handle_option): Handle -mavx512vp2intersect.
> > > + * config/i386/avx512vp2intersectintrin.h: New.
> > > + * config/i386/avx512vp2intersectvlintrin.h: New.
> > > + * config/i386/cpuid.h (bit_AVX512VP2INTERSECT): New.
> > > + * config/i386/driver-i386.c (host_detect_local_cpu): Detect
> > > + AVX512VP2INTERSECT.
> > > + * config/i386/i386-builtin-types.def: Add new types.
> > > + * config/i386/i386-builtin.def: Add new builtins.
> > > + * config/i386/i386-builtins.c: (enum processor_features): Add
> > > + F_AVX512VP2INTERSECT.
> > > + (static const _isa_names_table isa_names_table): Ditto.
> > > + * config/i386/i386-c.c (ix86_target_macros_internal): Define
> > > + __AVX512VP2INTERSECT__.
> > > + * config/i386/i386-expand.c (ix86_expand_builtin): Expand
> > > + IX86_BUILTIN_2INTERSECTD512, IX86_BUILTIN_2INTERSECTQ512,
> > > + IX86_BUILTIN_2INTERSECTD256, IX86_BUILTIN_2INTERSECTQ256,
> > > + IX86_BUILTIN_2INTERSECTD128, IX86_BUILTIN_2INTERSECTQ128.
> > > + * config/i386/i386-modes.def (P2QI, P2HI): New modes.
> > > + * config/i386/i386-options.c (ix86_target_string): Add
> > > + -mavx512vp2intersect.
> > > + (ix86_option_override_internal): Handle AVX512VP2INTERSECT.
> > > + * config/i386/i386.c (ix86_hard_regno_nregs): Allocate two regs for
> > > + P2HImode and P2QImode.
> > > + (ix86_hard_regno_mode_ok): Register pair only starts at even hardreg
> > > + number for P2QImode and P2HImode.
> > > + * config/i386/i386.h (TARGET_AVX512VP2INTERSECT,
> > > + TARGET_AVX512VP2INTERSECT_P): New.
> > > + (PTA_AVX512VP2INTERSECT): Ditto.
> > > + * config/i386/i386.opt: Add -mavx512vp2intersect.
> > > + * config/i386/immintrin.h: Include avx512vp2intersectintrin.h and
> > > + avx512vp2intersectvlintrin.h.
> > > + * config/i386/sse.md (define_c_enum "unspec"): Add UNSPEC_VP2INTERSECT.
> > > + (define_mode_iterator VI48_AVX512VP2VL): New.
> > > + (avx512vp2intersect_2intersect<mode>,
> > > + avx512vp2intersect_2intersectv16si): New define_insn patterns.
> > > + (*vec_extractp2hi, *vec_extractp2qi): New define_insn_and_split
> > > + patterns.
> > > + * config.gcc: Add avx512vp2intersectvlintrin.h and
> > > + avx512vp2intersectintrin.h to extra_headers.
> > > + * doc/invoke.texi: Document -mavx512vp2intersect.
> > > +
> > >
> > > gcc/testsuite/
> > > +2019-06-06  Hongtao Liu  <hongtao.liu@intel.com>
> > > +     Olga Makhotina  <olga.makhotina@intel.com>
> > > +
> > > + * gcc.target/i386/avx512-check.h: Handle bit_AVX512VP2INTERSECT.
> > > + * gcc.target/i386/avx512vp2intersect-2intersect-1a.c: New test.
> > > + * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Likewise.
> > > + * gcc.target/i386/avx512vp2intersect-2intersectvl-1a.c: Likewise.
> > > + * gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c: Likewise.
> > > + * gcc.target/i386/sse-12.c: Add -mavx512vp2intersect.
> > > + * gcc.target/i386/sse-13.c: Likewsie.
> > > + * gcc.target/i386/sse-14.c: Likewise.
> > > + * gcc.target/i386/sse-22.c: Likewise.
> > > + * gcc.target/i386/sse-23.c: Likewise.
> > > + * g++.dg/other/i386-2.C: Likewise.
> > > + * g++.dg/other/i386-3.C: Likewise.
> > > +
> >
> > +    case OPT_mavx512vp2intersect:
> > +      if (value)
> > +        {
> > +          opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512VP2INTERSECT_SET;
> > +          opts->x_ix86_isa_flags2_explicit |=
> > OPTION_MASK_ISA_AVX512VP2INTERSECT_SET;
> > +  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F_SET;
> > +  opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX512F_SET;
> > +        }
> >
> > some space/tab mixup here.
> >
> > +(define_mode_iterator VI48_AVX512VP2VL
> > +  [V8DI
> > +  (V4DI  "TARGET_AVX512VL") (V2DI  "TARGET_AVX512VL")
> > +  (V8SI "TARGET_AVX512VL") (V4SI  "TARGET_AVX512VL")])
> >
> > also here (or maybe a vertical alignment issue).
> >
> > +      op2 = copy_to_reg (op2);
> > +      op3 = copy_to_reg (op3);
> >
> > The predicate says that this one can be memory operand as well. I
> > suggest you use
> >
> > if (!insn_data[icode].operand[X].predicate (opX, modeX))
> >   opX = copy_to_mode_reg (modeX, opX);
> >
> > This would also handle eventual VOIDmode vector 0 operand.
> >
> > +
> > +      op4 = gen_reg_rtx (mode4);
> > +      emit_insn (GEN_FCN (icode) (op4, op2, op3));
> > +      mode0 = GET_MODE_INNER (GET_MODE (op4));
> > +      pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (0)));
> > +      pat2 = gen_rtx_VEC_SELECT (mode0, op4, pat);
> > +      emit_move_insn (gen_rtx_MEM (mode0, op0), pat2);
> > +      pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1)));
> > +      pat2 = gen_rtx_VEC_SELECT (mode0, op4, pat);
> > +      emit_move_insn (gen_rtx_MEM (mode0, op1), pat2);
> > +
> >
> > You should probably emit a subreg here (using simplify_gen_subreg) and
> > leave to the register allocator to emit correct hard register out of a
> > register pair. Using this approach, *vec_extractp2hi and
> > *vec_extractp2hi should not be necessary anymore; RA will reduce the
> > subreg RTX to a movqi/movhi by itself.
>
> +/* Register pair.  */
> +VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI */
> +VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI P4QI */
>
> I think
>
> INT_MODE (P2QI, 16);
> INT_MODE (P2HI, 32);
>
> with the above subreg approach should work.
>

I don't think subreg works on pseudo registers with non-zero
offset.  validate_subreg has

 if (maybe_lt (osize, regsize)
      && ! (lra_in_progress && (FLOAT_MODE_P (imode) || FLOAT_MODE_P (omode))))
    {
      /* It is invalid for the target to pick a register size for a mode
         that isn't ordered wrt to the size of that mode.  */
      poly_uint64 block_size = ordered_min (isize, regsize);
      unsigned int start_reg;
      poly_uint64 offset_within_reg;
      if (!can_div_trunc_p (offset, block_size, &start_reg, &offset_within_reg)
          || (BYTES_BIG_ENDIAN
              ? maybe_ne (offset_within_reg, block_size - osize)
              : maybe_ne (offset_within_reg, 0U)))
        return false;
Uros Bizjak June 7, 2019, 3:50 p.m. UTC | #4
On Fri, Jun 7, 2019 at 5:05 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Jun 6, 2019 at 5:26 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Thu, Jun 6, 2019 at 2:12 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Thu, Jun 6, 2019 at 7:54 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > >
> > > > Hi Uros and all:
> > > >   This patch is about to enable support for AVX512_VP2INTERSECT which will
> > > > be in Willow Cove. There are two instructions for AVX512_VP2INTERSECT:
> > > > VP2INTERSECTD and VP2INTERSECTQ. More details please refer to
> > > > https://software.intel.com/sites/default/files/managed/c5/15/architecture-instruction-set-extensions-programming-reference.pdf
> > > >
> > > >   Bootstrap is ok, and no regressions for i386/x86-64 testsuite.
> > > >
> > > > Changelog:
> > > >
> > > > gcc/
> > > > +2019-06-06  Hongtao Liu  <hongtao.liu@intel.com>
> > > > +     H.J. Lu  <hongjiu.lu@intel.com>
> > > > +     Olga Makhotina  <olga.makhotina@intel.com>
> > > > +
> > > > + * common/config/i386/i386-common.c
> > > > + (OPTION_MASK_ISA_AVX512VP2INTERSECT_SET,
> > > > + OPTION_MASK_ISA_AVX512VP2INTERSECT_UNSET): New macros.
> > > > + (OPTION_MASK_ISA2_AVX512F_UNSET): Add
> > > > + OPTION_MASK_ISA_AVX512VP2INTERSECT_UNSET.
> > > > + (ix86_handle_option): Handle -mavx512vp2intersect.
> > > > + * config/i386/avx512vp2intersectintrin.h: New.
> > > > + * config/i386/avx512vp2intersectvlintrin.h: New.
> > > > + * config/i386/cpuid.h (bit_AVX512VP2INTERSECT): New.
> > > > + * config/i386/driver-i386.c (host_detect_local_cpu): Detect
> > > > + AVX512VP2INTERSECT.
> > > > + * config/i386/i386-builtin-types.def: Add new types.
> > > > + * config/i386/i386-builtin.def: Add new builtins.
> > > > + * config/i386/i386-builtins.c: (enum processor_features): Add
> > > > + F_AVX512VP2INTERSECT.
> > > > + (static const _isa_names_table isa_names_table): Ditto.
> > > > + * config/i386/i386-c.c (ix86_target_macros_internal): Define
> > > > + __AVX512VP2INTERSECT__.
> > > > + * config/i386/i386-expand.c (ix86_expand_builtin): Expand
> > > > + IX86_BUILTIN_2INTERSECTD512, IX86_BUILTIN_2INTERSECTQ512,
> > > > + IX86_BUILTIN_2INTERSECTD256, IX86_BUILTIN_2INTERSECTQ256,
> > > > + IX86_BUILTIN_2INTERSECTD128, IX86_BUILTIN_2INTERSECTQ128.
> > > > + * config/i386/i386-modes.def (P2QI, P2HI): New modes.
> > > > + * config/i386/i386-options.c (ix86_target_string): Add
> > > > + -mavx512vp2intersect.
> > > > + (ix86_option_override_internal): Handle AVX512VP2INTERSECT.
> > > > + * config/i386/i386.c (ix86_hard_regno_nregs): Allocate two regs for
> > > > + P2HImode and P2QImode.
> > > > + (ix86_hard_regno_mode_ok): Register pair only starts at even hardreg
> > > > + number for P2QImode and P2HImode.
> > > > + * config/i386/i386.h (TARGET_AVX512VP2INTERSECT,
> > > > + TARGET_AVX512VP2INTERSECT_P): New.
> > > > + (PTA_AVX512VP2INTERSECT): Ditto.
> > > > + * config/i386/i386.opt: Add -mavx512vp2intersect.
> > > > + * config/i386/immintrin.h: Include avx512vp2intersectintrin.h and
> > > > + avx512vp2intersectvlintrin.h.
> > > > + * config/i386/sse.md (define_c_enum "unspec"): Add UNSPEC_VP2INTERSECT.
> > > > + (define_mode_iterator VI48_AVX512VP2VL): New.
> > > > + (avx512vp2intersect_2intersect<mode>,
> > > > + avx512vp2intersect_2intersectv16si): New define_insn patterns.
> > > > + (*vec_extractp2hi, *vec_extractp2qi): New define_insn_and_split
> > > > + patterns.
> > > > + * config.gcc: Add avx512vp2intersectvlintrin.h and
> > > > + avx512vp2intersectintrin.h to extra_headers.
> > > > + * doc/invoke.texi: Document -mavx512vp2intersect.
> > > > +
> > > >
> > > > gcc/testsuite/
> > > > +2019-06-06  Hongtao Liu  <hongtao.liu@intel.com>
> > > > +     Olga Makhotina  <olga.makhotina@intel.com>
> > > > +
> > > > + * gcc.target/i386/avx512-check.h: Handle bit_AVX512VP2INTERSECT.
> > > > + * gcc.target/i386/avx512vp2intersect-2intersect-1a.c: New test.
> > > > + * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Likewise.
> > > > + * gcc.target/i386/avx512vp2intersect-2intersectvl-1a.c: Likewise.
> > > > + * gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c: Likewise.
> > > > + * gcc.target/i386/sse-12.c: Add -mavx512vp2intersect.
> > > > + * gcc.target/i386/sse-13.c: Likewsie.
> > > > + * gcc.target/i386/sse-14.c: Likewise.
> > > > + * gcc.target/i386/sse-22.c: Likewise.
> > > > + * gcc.target/i386/sse-23.c: Likewise.
> > > > + * g++.dg/other/i386-2.C: Likewise.
> > > > + * g++.dg/other/i386-3.C: Likewise.
> > > > +
> > >
> > > +    case OPT_mavx512vp2intersect:
> > > +      if (value)
> > > +        {
> > > +          opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512VP2INTERSECT_SET;
> > > +          opts->x_ix86_isa_flags2_explicit |=
> > > OPTION_MASK_ISA_AVX512VP2INTERSECT_SET;
> > > +  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F_SET;
> > > +  opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX512F_SET;
> > > +        }
> > >
> > > some space/tab mixup here.
> > >
> > > +(define_mode_iterator VI48_AVX512VP2VL
> > > +  [V8DI
> > > +  (V4DI  "TARGET_AVX512VL") (V2DI  "TARGET_AVX512VL")
> > > +  (V8SI "TARGET_AVX512VL") (V4SI  "TARGET_AVX512VL")])
> > >
> > > also here (or maybe a vertical alignment issue).
> > >
> > > +      op2 = copy_to_reg (op2);
> > > +      op3 = copy_to_reg (op3);
> > >
> > > The predicate says that this one can be memory operand as well. I
> > > suggest you use
> > >
> > > if (!insn_data[icode].operand[X].predicate (opX, modeX))
> > >   opX = copy_to_mode_reg (modeX, opX);
> > >
> > > This would also handle eventual VOIDmode vector 0 operand.
> > >
> > > +
> > > +      op4 = gen_reg_rtx (mode4);
> > > +      emit_insn (GEN_FCN (icode) (op4, op2, op3));
> > > +      mode0 = GET_MODE_INNER (GET_MODE (op4));
> > > +      pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (0)));
> > > +      pat2 = gen_rtx_VEC_SELECT (mode0, op4, pat);
> > > +      emit_move_insn (gen_rtx_MEM (mode0, op0), pat2);
> > > +      pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1)));
> > > +      pat2 = gen_rtx_VEC_SELECT (mode0, op4, pat);
> > > +      emit_move_insn (gen_rtx_MEM (mode0, op1), pat2);
> > > +
> > >
> > > You should probably emit a subreg here (using simplify_gen_subreg) and
> > > leave to the register allocator to emit correct hard register out of a
> > > register pair. Using this approach, *vec_extractp2hi and
> > > *vec_extractp2hi should not be necessary anymore; RA will reduce the
> > > subreg RTX to a movqi/movhi by itself.
> >
> > +/* Register pair.  */
> > +VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI */
> > +VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI P4QI */
> >
> > I think
> >
> > INT_MODE (P2QI, 16);
> > INT_MODE (P2HI, 32);
> >
> > with the above subreg approach should work.
> >
>
> I don't think subreg works on pseudo registers with non-zero
> offset.  validate_subreg has
>
>  if (maybe_lt (osize, regsize)
>       && ! (lra_in_progress && (FLOAT_MODE_P (imode) || FLOAT_MODE_P (omode))))
>     {
>       /* It is invalid for the target to pick a register size for a mode
>          that isn't ordered wrt to the size of that mode.  */
>       poly_uint64 block_size = ordered_min (isize, regsize);
>       unsigned int start_reg;
>       poly_uint64 offset_within_reg;
>       if (!can_div_trunc_p (offset, block_size, &start_reg, &offset_within_reg)
>           || (BYTES_BIG_ENDIAN
>               ? maybe_ne (offset_within_reg, block_size - osize)
>               : maybe_ne (offset_within_reg, 0U)))
>         return false;

It works with SImode subregs of DImode values on 32bit targets. Please
look for calls to gen_highpart, one concrete example is in
atomic_compare_and_swap<mode>.

Uros.
Uros Bizjak June 7, 2019, 3:58 p.m. UTC | #5
On Thu, Jun 6, 2019 at 7:54 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> Hi Uros and all:
>   This patch is about to enable support for AVX512_VP2INTERSECT which will
> be in Willow Cove. There are two instructions for AVX512_VP2INTERSECT:
> VP2INTERSECTD and VP2INTERSECTQ. More details please refer to
> https://software.intel.com/sites/default/files/managed/c5/15/architecture-instruction-set-extensions-programming-reference.pdf
>
>   Bootstrap is ok, and no regressions for i386/x86-64 testsuite.
>
> Changelog:
>
> gcc/
> +2019-06-06  Hongtao Liu  <hongtao.liu@intel.com>
> +     H.J. Lu  <hongjiu.lu@intel.com>
> +     Olga Makhotina  <olga.makhotina@intel.com>
> +
> + * common/config/i386/i386-common.c
> + (OPTION_MASK_ISA_AVX512VP2INTERSECT_SET,
> + OPTION_MASK_ISA_AVX512VP2INTERSECT_UNSET): New macros.
> + (OPTION_MASK_ISA2_AVX512F_UNSET): Add
> + OPTION_MASK_ISA_AVX512VP2INTERSECT_UNSET.
> + (ix86_handle_option): Handle -mavx512vp2intersect.
> + * config/i386/avx512vp2intersectintrin.h: New.
> + * config/i386/avx512vp2intersectvlintrin.h: New.
> + * config/i386/cpuid.h (bit_AVX512VP2INTERSECT): New.
> + * config/i386/driver-i386.c (host_detect_local_cpu): Detect
> + AVX512VP2INTERSECT.
> + * config/i386/i386-builtin-types.def: Add new types.
> + * config/i386/i386-builtin.def: Add new builtins.
> + * config/i386/i386-builtins.c: (enum processor_features): Add
> + F_AVX512VP2INTERSECT.
> + (static const _isa_names_table isa_names_table): Ditto.
> + * config/i386/i386-c.c (ix86_target_macros_internal): Define
> + __AVX512VP2INTERSECT__.
> + * config/i386/i386-expand.c (ix86_expand_builtin): Expand
> + IX86_BUILTIN_2INTERSECTD512, IX86_BUILTIN_2INTERSECTQ512,
> + IX86_BUILTIN_2INTERSECTD256, IX86_BUILTIN_2INTERSECTQ256,
> + IX86_BUILTIN_2INTERSECTD128, IX86_BUILTIN_2INTERSECTQ128.
> + * config/i386/i386-modes.def (P2QI, P2HI): New modes.
> + * config/i386/i386-options.c (ix86_target_string): Add
> + -mavx512vp2intersect.
> + (ix86_option_override_internal): Handle AVX512VP2INTERSECT.
> + * config/i386/i386.c (ix86_hard_regno_nregs): Allocate two regs for
> + P2HImode and P2QImode.
> + (ix86_hard_regno_mode_ok): Register pair only starts at even hardreg
> + number for P2QImode and P2HImode.
> + * config/i386/i386.h (TARGET_AVX512VP2INTERSECT,
> + TARGET_AVX512VP2INTERSECT_P): New.
> + (PTA_AVX512VP2INTERSECT): Ditto.
> + * config/i386/i386.opt: Add -mavx512vp2intersect.
> + * config/i386/immintrin.h: Include avx512vp2intersectintrin.h and
> + avx512vp2intersectvlintrin.h.
> + * config/i386/sse.md (define_c_enum "unspec"): Add UNSPEC_VP2INTERSECT.
> + (define_mode_iterator VI48_AVX512VP2VL): New.
> + (avx512vp2intersect_2intersect<mode>,
> + avx512vp2intersect_2intersectv16si): New define_insn patterns.
> + (*vec_extractp2hi, *vec_extractp2qi): New define_insn_and_split
> + patterns.
> + * config.gcc: Add avx512vp2intersectvlintrin.h and
> + avx512vp2intersectintrin.h to extra_headers.
> + * doc/invoke.texi: Document -mavx512vp2intersect.
> +
>
> gcc/testsuite/
> +2019-06-06  Hongtao Liu  <hongtao.liu@intel.com>
> +     Olga Makhotina  <olga.makhotina@intel.com>
> +
> + * gcc.target/i386/avx512-check.h: Handle bit_AVX512VP2INTERSECT.
> + * gcc.target/i386/avx512vp2intersect-2intersect-1a.c: New test.
> + * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Likewise.
> + * gcc.target/i386/avx512vp2intersect-2intersectvl-1a.c: Likewise.
> + * gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c: Likewise.
> + * gcc.target/i386/sse-12.c: Add -mavx512vp2intersect.
> + * gcc.target/i386/sse-13.c: Likewsie.
> + * gcc.target/i386/sse-14.c: Likewise.
> + * gcc.target/i386/sse-22.c: Likewise.
> + * gcc.target/i386/sse-23.c: Likewise.
> + * g++.dg/other/i386-2.C: Likewise.
> + * g++.dg/other/i386-3.C: Likewise.

@@ -18702,9 +18705,16 @@
   if (STACK_REGNO_P (regno))
     return VALID_FP_MODE_P (mode);
   if (MASK_REGNO_P (regno))
-    return (VALID_MASK_REG_MODE (mode)
-    || (TARGET_AVX512BW
- && VALID_MASK_AVX512BW_MODE (mode)));
+    {
+      /* Register pair only starts at even register number.  */
+      if ((mode == P2QImode || mode == P2HImode))
+ return (regno & 1) == 0;
+
+      return (VALID_MASK_REG_MODE (mode)
+      || (TARGET_AVX512BW
+  && VALID_MASK_AVX512BW_MODE (mode)));
+    }
+
   if (SSE_REGNO_P (regno))

There is no guarantee that the first regno of the mask register set
will be odd number. Please rather spell out appropriate mask
registers, following the example of MOD4_SSE_REGNO_P.

Uros.
H.J. Lu June 7, 2019, 5:29 p.m. UTC | #6
On Fri, Jun 7, 2019 at 8:50 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Fri, Jun 7, 2019 at 5:05 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Thu, Jun 6, 2019 at 5:26 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Thu, Jun 6, 2019 at 2:12 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >
> > > > On Thu, Jun 6, 2019 at 7:54 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > >
> > > > > Hi Uros and all:
> > > > >   This patch is about to enable support for AVX512_VP2INTERSECT which will
> > > > > be in Willow Cove. There are two instructions for AVX512_VP2INTERSECT:
> > > > > VP2INTERSECTD and VP2INTERSECTQ. More details please refer to
> > > > > https://software.intel.com/sites/default/files/managed/c5/15/architecture-instruction-set-extensions-programming-reference.pdf
> > > > >
> > > > >   Bootstrap is ok, and no regressions for i386/x86-64 testsuite.
> > > > >
> > > > > Changelog:
> > > > >
> > > > > gcc/
> > > > > +2019-06-06  Hongtao Liu  <hongtao.liu@intel.com>
> > > > > +     H.J. Lu  <hongjiu.lu@intel.com>
> > > > > +     Olga Makhotina  <olga.makhotina@intel.com>
> > > > > +
> > > > > + * common/config/i386/i386-common.c
> > > > > + (OPTION_MASK_ISA_AVX512VP2INTERSECT_SET,
> > > > > + OPTION_MASK_ISA_AVX512VP2INTERSECT_UNSET): New macros.
> > > > > + (OPTION_MASK_ISA2_AVX512F_UNSET): Add
> > > > > + OPTION_MASK_ISA_AVX512VP2INTERSECT_UNSET.
> > > > > + (ix86_handle_option): Handle -mavx512vp2intersect.
> > > > > + * config/i386/avx512vp2intersectintrin.h: New.
> > > > > + * config/i386/avx512vp2intersectvlintrin.h: New.
> > > > > + * config/i386/cpuid.h (bit_AVX512VP2INTERSECT): New.
> > > > > + * config/i386/driver-i386.c (host_detect_local_cpu): Detect
> > > > > + AVX512VP2INTERSECT.
> > > > > + * config/i386/i386-builtin-types.def: Add new types.
> > > > > + * config/i386/i386-builtin.def: Add new builtins.
> > > > > + * config/i386/i386-builtins.c: (enum processor_features): Add
> > > > > + F_AVX512VP2INTERSECT.
> > > > > + (static const _isa_names_table isa_names_table): Ditto.
> > > > > + * config/i386/i386-c.c (ix86_target_macros_internal): Define
> > > > > + __AVX512VP2INTERSECT__.
> > > > > + * config/i386/i386-expand.c (ix86_expand_builtin): Expand
> > > > > + IX86_BUILTIN_2INTERSECTD512, IX86_BUILTIN_2INTERSECTQ512,
> > > > > + IX86_BUILTIN_2INTERSECTD256, IX86_BUILTIN_2INTERSECTQ256,
> > > > > + IX86_BUILTIN_2INTERSECTD128, IX86_BUILTIN_2INTERSECTQ128.
> > > > > + * config/i386/i386-modes.def (P2QI, P2HI): New modes.
> > > > > + * config/i386/i386-options.c (ix86_target_string): Add
> > > > > + -mavx512vp2intersect.
> > > > > + (ix86_option_override_internal): Handle AVX512VP2INTERSECT.
> > > > > + * config/i386/i386.c (ix86_hard_regno_nregs): Allocate two regs for
> > > > > + P2HImode and P2QImode.
> > > > > + (ix86_hard_regno_mode_ok): Register pair only starts at even hardreg
> > > > > + number for P2QImode and P2HImode.
> > > > > + * config/i386/i386.h (TARGET_AVX512VP2INTERSECT,
> > > > > + TARGET_AVX512VP2INTERSECT_P): New.
> > > > > + (PTA_AVX512VP2INTERSECT): Ditto.
> > > > > + * config/i386/i386.opt: Add -mavx512vp2intersect.
> > > > > + * config/i386/immintrin.h: Include avx512vp2intersectintrin.h and
> > > > > + avx512vp2intersectvlintrin.h.
> > > > > + * config/i386/sse.md (define_c_enum "unspec"): Add UNSPEC_VP2INTERSECT.
> > > > > + (define_mode_iterator VI48_AVX512VP2VL): New.
> > > > > + (avx512vp2intersect_2intersect<mode>,
> > > > > + avx512vp2intersect_2intersectv16si): New define_insn patterns.
> > > > > + (*vec_extractp2hi, *vec_extractp2qi): New define_insn_and_split
> > > > > + patterns.
> > > > > + * config.gcc: Add avx512vp2intersectvlintrin.h and
> > > > > + avx512vp2intersectintrin.h to extra_headers.
> > > > > + * doc/invoke.texi: Document -mavx512vp2intersect.
> > > > > +
> > > > >
> > > > > gcc/testsuite/
> > > > > +2019-06-06  Hongtao Liu  <hongtao.liu@intel.com>
> > > > > +     Olga Makhotina  <olga.makhotina@intel.com>
> > > > > +
> > > > > + * gcc.target/i386/avx512-check.h: Handle bit_AVX512VP2INTERSECT.
> > > > > + * gcc.target/i386/avx512vp2intersect-2intersect-1a.c: New test.
> > > > > + * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Likewise.
> > > > > + * gcc.target/i386/avx512vp2intersect-2intersectvl-1a.c: Likewise.
> > > > > + * gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c: Likewise.
> > > > > + * gcc.target/i386/sse-12.c: Add -mavx512vp2intersect.
> > > > > + * gcc.target/i386/sse-13.c: Likewsie.
> > > > > + * gcc.target/i386/sse-14.c: Likewise.
> > > > > + * gcc.target/i386/sse-22.c: Likewise.
> > > > > + * gcc.target/i386/sse-23.c: Likewise.
> > > > > + * g++.dg/other/i386-2.C: Likewise.
> > > > > + * g++.dg/other/i386-3.C: Likewise.
> > > > > +
> > > >
> > > > +    case OPT_mavx512vp2intersect:
> > > > +      if (value)
> > > > +        {
> > > > +          opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512VP2INTERSECT_SET;
> > > > +          opts->x_ix86_isa_flags2_explicit |=
> > > > OPTION_MASK_ISA_AVX512VP2INTERSECT_SET;
> > > > +  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F_SET;
> > > > +  opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX512F_SET;
> > > > +        }
> > > >
> > > > some space/tab mixup here.
> > > >
> > > > +(define_mode_iterator VI48_AVX512VP2VL
> > > > +  [V8DI
> > > > +  (V4DI  "TARGET_AVX512VL") (V2DI  "TARGET_AVX512VL")
> > > > +  (V8SI "TARGET_AVX512VL") (V4SI  "TARGET_AVX512VL")])
> > > >
> > > > also here (or maybe a vertical alignment issue).
> > > >
> > > > +      op2 = copy_to_reg (op2);
> > > > +      op3 = copy_to_reg (op3);
> > > >
> > > > The predicate says that this one can be memory operand as well. I
> > > > suggest you use
> > > >
> > > > if (!insn_data[icode].operand[X].predicate (opX, modeX))
> > > >   opX = copy_to_mode_reg (modeX, opX);
> > > >
> > > > This would also handle eventual VOIDmode vector 0 operand.
> > > >
> > > > +
> > > > +      op4 = gen_reg_rtx (mode4);
> > > > +      emit_insn (GEN_FCN (icode) (op4, op2, op3));
> > > > +      mode0 = GET_MODE_INNER (GET_MODE (op4));
> > > > +      pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (0)));
> > > > +      pat2 = gen_rtx_VEC_SELECT (mode0, op4, pat);
> > > > +      emit_move_insn (gen_rtx_MEM (mode0, op0), pat2);
> > > > +      pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1)));
> > > > +      pat2 = gen_rtx_VEC_SELECT (mode0, op4, pat);
> > > > +      emit_move_insn (gen_rtx_MEM (mode0, op1), pat2);
> > > > +
> > > >
> > > > You should probably emit a subreg here (using simplify_gen_subreg) and
> > > > leave to the register allocator to emit correct hard register out of a
> > > > register pair. Using this approach, *vec_extractp2hi and
> > > > *vec_extractp2hi should not be necessary anymore; RA will reduce the
> > > > subreg RTX to a movqi/movhi by itself.
> > >
> > > +/* Register pair.  */
> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI */
> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI P4QI */
> > >
> > > I think
> > >
> > > INT_MODE (P2QI, 16);
> > > INT_MODE (P2HI, 32);
> > >
> > > with the above subreg approach should work.
> > >
> >
> > I don't think subreg works on pseudo registers with non-zero
> > offset.  validate_subreg has
> >
> >  if (maybe_lt (osize, regsize)
> >       && ! (lra_in_progress && (FLOAT_MODE_P (imode) || FLOAT_MODE_P (omode))))
> >     {
> >       /* It is invalid for the target to pick a register size for a mode
> >          that isn't ordered wrt to the size of that mode.  */
> >       poly_uint64 block_size = ordered_min (isize, regsize);
> >       unsigned int start_reg;
> >       poly_uint64 offset_within_reg;
> >       if (!can_div_trunc_p (offset, block_size, &start_reg, &offset_within_reg)
> >           || (BYTES_BIG_ENDIAN
> >               ? maybe_ne (offset_within_reg, block_size - osize)
> >               : maybe_ne (offset_within_reg, 0U)))
> >         return false;
>
> It works with SImode subregs of DImode values on 32bit targets. Please
> look for calls to gen_highpart, one concrete example is in
> atomic_compare_and_swap<mode>.
>

It works because of

#define REGMODE_NATURAL_SIZE(MODE) UNITS_PER_WORD

and only works for the high part of SImode of DImode.

P2QI and P2HI are 2 special modes of mask register pair for
2 instructions.   Do we want to make them more generic?
H.J. Lu June 7, 2019, 5:42 p.m. UTC | #7
On Fri, Jun 7, 2019 at 8:59 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Thu, Jun 6, 2019 at 7:54 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > Hi Uros and all:
> >   This patch is about to enable support for AVX512_VP2INTERSECT which will
> > be in Willow Cove. There are two instructions for AVX512_VP2INTERSECT:
> > VP2INTERSECTD and VP2INTERSECTQ. More details please refer to
> > https://software.intel.com/sites/default/files/managed/c5/15/architecture-instruction-set-extensions-programming-reference.pdf
> >
> >   Bootstrap is ok, and no regressions for i386/x86-64 testsuite.
> >
> > Changelog:
> >
> > gcc/
> > +2019-06-06  Hongtao Liu  <hongtao.liu@intel.com>
> > +     H.J. Lu  <hongjiu.lu@intel.com>
> > +     Olga Makhotina  <olga.makhotina@intel.com>
> > +
> > + * common/config/i386/i386-common.c
> > + (OPTION_MASK_ISA_AVX512VP2INTERSECT_SET,
> > + OPTION_MASK_ISA_AVX512VP2INTERSECT_UNSET): New macros.
> > + (OPTION_MASK_ISA2_AVX512F_UNSET): Add
> > + OPTION_MASK_ISA_AVX512VP2INTERSECT_UNSET.
> > + (ix86_handle_option): Handle -mavx512vp2intersect.
> > + * config/i386/avx512vp2intersectintrin.h: New.
> > + * config/i386/avx512vp2intersectvlintrin.h: New.
> > + * config/i386/cpuid.h (bit_AVX512VP2INTERSECT): New.
> > + * config/i386/driver-i386.c (host_detect_local_cpu): Detect
> > + AVX512VP2INTERSECT.
> > + * config/i386/i386-builtin-types.def: Add new types.
> > + * config/i386/i386-builtin.def: Add new builtins.
> > + * config/i386/i386-builtins.c: (enum processor_features): Add
> > + F_AVX512VP2INTERSECT.
> > + (static const _isa_names_table isa_names_table): Ditto.
> > + * config/i386/i386-c.c (ix86_target_macros_internal): Define
> > + __AVX512VP2INTERSECT__.
> > + * config/i386/i386-expand.c (ix86_expand_builtin): Expand
> > + IX86_BUILTIN_2INTERSECTD512, IX86_BUILTIN_2INTERSECTQ512,
> > + IX86_BUILTIN_2INTERSECTD256, IX86_BUILTIN_2INTERSECTQ256,
> > + IX86_BUILTIN_2INTERSECTD128, IX86_BUILTIN_2INTERSECTQ128.
> > + * config/i386/i386-modes.def (P2QI, P2HI): New modes.
> > + * config/i386/i386-options.c (ix86_target_string): Add
> > + -mavx512vp2intersect.
> > + (ix86_option_override_internal): Handle AVX512VP2INTERSECT.
> > + * config/i386/i386.c (ix86_hard_regno_nregs): Allocate two regs for
> > + P2HImode and P2QImode.
> > + (ix86_hard_regno_mode_ok): Register pair only starts at even hardreg
> > + number for P2QImode and P2HImode.
> > + * config/i386/i386.h (TARGET_AVX512VP2INTERSECT,
> > + TARGET_AVX512VP2INTERSECT_P): New.
> > + (PTA_AVX512VP2INTERSECT): Ditto.
> > + * config/i386/i386.opt: Add -mavx512vp2intersect.
> > + * config/i386/immintrin.h: Include avx512vp2intersectintrin.h and
> > + avx512vp2intersectvlintrin.h.
> > + * config/i386/sse.md (define_c_enum "unspec"): Add UNSPEC_VP2INTERSECT.
> > + (define_mode_iterator VI48_AVX512VP2VL): New.
> > + (avx512vp2intersect_2intersect<mode>,
> > + avx512vp2intersect_2intersectv16si): New define_insn patterns.
> > + (*vec_extractp2hi, *vec_extractp2qi): New define_insn_and_split
> > + patterns.
> > + * config.gcc: Add avx512vp2intersectvlintrin.h and
> > + avx512vp2intersectintrin.h to extra_headers.
> > + * doc/invoke.texi: Document -mavx512vp2intersect.
> > +
> >
> > gcc/testsuite/
> > +2019-06-06  Hongtao Liu  <hongtao.liu@intel.com>
> > +     Olga Makhotina  <olga.makhotina@intel.com>
> > +
> > + * gcc.target/i386/avx512-check.h: Handle bit_AVX512VP2INTERSECT.
> > + * gcc.target/i386/avx512vp2intersect-2intersect-1a.c: New test.
> > + * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Likewise.
> > + * gcc.target/i386/avx512vp2intersect-2intersectvl-1a.c: Likewise.
> > + * gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c: Likewise.
> > + * gcc.target/i386/sse-12.c: Add -mavx512vp2intersect.
> > + * gcc.target/i386/sse-13.c: Likewsie.
> > + * gcc.target/i386/sse-14.c: Likewise.
> > + * gcc.target/i386/sse-22.c: Likewise.
> > + * gcc.target/i386/sse-23.c: Likewise.
> > + * g++.dg/other/i386-2.C: Likewise.
> > + * g++.dg/other/i386-3.C: Likewise.
>
> @@ -18702,9 +18705,16 @@
>    if (STACK_REGNO_P (regno))
>      return VALID_FP_MODE_P (mode);
>    if (MASK_REGNO_P (regno))
> -    return (VALID_MASK_REG_MODE (mode)
> -    || (TARGET_AVX512BW
> - && VALID_MASK_AVX512BW_MODE (mode)));
> +    {
> +      /* Register pair only starts at even register number.  */
> +      if ((mode == P2QImode || mode == P2HImode))
> + return (regno & 1) == 0;
> +
> +      return (VALID_MASK_REG_MODE (mode)
> +      || (TARGET_AVX512BW
> +  && VALID_MASK_AVX512BW_MODE (mode)));
> +    }
> +
>    if (SSE_REGNO_P (regno))
>
> There is no guarantee that the first regno of the mask register set
> will be odd number. Please rather spell out appropriate mask
> registers, following the example of MOD4_SSE_REGNO_P.
>

We can use

 #define MASK_REG_P(X) (REG_P (X) && MASK_REGNO_P (REGNO (X)))
 #define MASK_REGNO_P(N) IN_RANGE ((N), FIRST_MASK_REG, LAST_MASK_REG)
#define MASK_PAIR_REGNO_P(N) ((((N) - FIRST_MASK_REG) & 1) == 0)

BTW,

      /* For AVX-5124FMAPS or AVX-5124VNNIW
         allow V64SF and V64SI modes for special regnos.  */
      if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
          && (mode == V64SFmode || mode == V64SImode)
          && MOD4_SSE_REGNO_P (regno))
        return true;

can be optimized to

      /* For AVX-5124FMAPS or AVX-5124VNNIW
         allow V64SF and V64SI modes for special regnos.  */
      if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
          && (mode == V64SFmode || mode == V64SImode))
         return MOD4_SSE_REGNO_P (regno);
Uros Bizjak June 7, 2019, 8:12 p.m. UTC | #8
On 6/7/19, H.J. Lu <hjl.tools@gmail.com> wrote:

>> > > +/* Register pair.  */
>> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI */
>> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI P4QI */
>> > >
>> > > I think
>> > >
>> > > INT_MODE (P2QI, 16);
>> > > INT_MODE (P2HI, 32);
>> > >
>> > > with the above subreg approach should work.
>> > >
>> >
>> > I don't think subreg works on pseudo registers with non-zero
>> > offset.  validate_subreg has
>> >
>> >  if (maybe_lt (osize, regsize)
>> >       && ! (lra_in_progress && (FLOAT_MODE_P (imode) || FLOAT_MODE_P
>> > (omode))))
>> >     {
>> >       /* It is invalid for the target to pick a register size for a
>> > mode
>> >          that isn't ordered wrt to the size of that mode.  */
>> >       poly_uint64 block_size = ordered_min (isize, regsize);
>> >       unsigned int start_reg;
>> >       poly_uint64 offset_within_reg;
>> >       if (!can_div_trunc_p (offset, block_size, &start_reg,
>> > &offset_within_reg)
>> >           || (BYTES_BIG_ENDIAN
>> >               ? maybe_ne (offset_within_reg, block_size - osize)
>> >               : maybe_ne (offset_within_reg, 0U)))
>> >         return false;
>>
>> It works with SImode subregs of DImode values on 32bit targets. Please
>> look for calls to gen_highpart, one concrete example is in
>> atomic_compare_and_swap<mode>.
>>
>
> It works because of
>
> #define REGMODE_NATURAL_SIZE(MODE) UNITS_PER_WORD
>
> and only works for the high part of SImode of DImode.
>
> P2QI and P2HI are 2 special modes of mask register pair for
> 2 instructions.   Do we want to make them more generic?

If enhancing the referred define means that we don't need two
artificial instructions and leave all heavy lifting to the existing
generic functionality, then this is the way to go.

Uros.
Uros Bizjak June 7, 2019, 8:16 p.m. UTC | #9
On 6/7/19, H.J. Lu <hjl.tools@gmail.com> wrote:

>> @@ -18702,9 +18705,16 @@
>>    if (STACK_REGNO_P (regno))
>>      return VALID_FP_MODE_P (mode);
>>    if (MASK_REGNO_P (regno))
>> -    return (VALID_MASK_REG_MODE (mode)
>> -    || (TARGET_AVX512BW
>> - && VALID_MASK_AVX512BW_MODE (mode)));
>> +    {
>> +      /* Register pair only starts at even register number.  */
>> +      if ((mode == P2QImode || mode == P2HImode))
>> + return (regno & 1) == 0;
>> +
>> +      return (VALID_MASK_REG_MODE (mode)
>> +      || (TARGET_AVX512BW
>> +  && VALID_MASK_AVX512BW_MODE (mode)));
>> +    }
>> +
>>    if (SSE_REGNO_P (regno))
>>
>> There is no guarantee that the first regno of the mask register set
>> will be odd number. Please rather spell out appropriate mask
>> registers, following the example of MOD4_SSE_REGNO_P.
>>
>
> We can use
>
>  #define MASK_REG_P(X) (REG_P (X) && MASK_REGNO_P (REGNO (X)))
>  #define MASK_REGNO_P(N) IN_RANGE ((N), FIRST_MASK_REG, LAST_MASK_REG)
> #define MASK_PAIR_REGNO_P(N) ((((N) - FIRST_MASK_REG) & 1) == 0)

Yes this would work.

> BTW,
>
>       /* For AVX-5124FMAPS or AVX-5124VNNIW
>          allow V64SF and V64SI modes for special regnos.  */
>       if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
>           && (mode == V64SFmode || mode == V64SImode)
>           && MOD4_SSE_REGNO_P (regno))
>         return true;
>
> can be optimized to
>
>       /* For AVX-5124FMAPS or AVX-5124VNNIW
>          allow V64SF and V64SI modes for special regnos.  */
>       if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
>           && (mode == V64SFmode || mode == V64SImode))
>          return MOD4_SSE_REGNO_P (regno);

Sure, this would be an obvious patch.

Uros.
Hongtao Liu June 20, 2019, 5:38 a.m. UTC | #10
On Sat, Jun 8, 2019 at 4:12 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On 6/7/19, H.J. Lu <hjl.tools@gmail.com> wrote:
>
> >> > > +/* Register pair.  */
> >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI */
> >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI P4QI */
> >> > >
> >> > > I think
> >> > >
> >> > > INT_MODE (P2QI, 16);
> >> > > INT_MODE (P2HI, 32);
> >> > >
> >> > > with the above subreg approach should work.
> >> > >
> >> >
> >> > I don't think subreg works on pseudo registers with non-zero
> >> > offset.  validate_subreg has
> >> >
> >> >  if (maybe_lt (osize, regsize)
> >> >       && ! (lra_in_progress && (FLOAT_MODE_P (imode) || FLOAT_MODE_P
> >> > (omode))))
> >> >     {
> >> >       /* It is invalid for the target to pick a register size for a
> >> > mode
> >> >          that isn't ordered wrt to the size of that mode.  */
> >> >       poly_uint64 block_size = ordered_min (isize, regsize);
> >> >       unsigned int start_reg;
> >> >       poly_uint64 offset_within_reg;
> >> >       if (!can_div_trunc_p (offset, block_size, &start_reg,
> >> > &offset_within_reg)
> >> >           || (BYTES_BIG_ENDIAN
> >> >               ? maybe_ne (offset_within_reg, block_size - osize)
> >> >               : maybe_ne (offset_within_reg, 0U)))
> >> >         return false;
> >>
> >> It works with SImode subregs of DImode values on 32bit targets. Please
> >> look for calls to gen_highpart, one concrete example is in
> >> atomic_compare_and_swap<mode>.
> >>
> >
> > It works because of
> >
> > #define REGMODE_NATURAL_SIZE(MODE) UNITS_PER_WORD
> >
> > and only works for the high part of SImode of DImode.
> >
> > P2QI and P2HI are 2 special modes of mask register pair for
> > 2 instructions.   Do we want to make them more generic?
>
> If enhancing the referred define means that we don't need two
> artificial instructions and leave all heavy lifting to the existing
Do you mean that we take P2HI and P2QI as normal vector modes,
and reuse ix86_expand_vector_* things?
But still two artificial instructions can't be avoided.
> generic functionality, then this is the way to go.
>
> Uros.
Uros Bizjak June 20, 2019, 6:12 a.m. UTC | #11
On Thu, Jun 20, 2019 at 7:36 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Sat, Jun 8, 2019 at 4:12 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On 6/7/19, H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > >> > > +/* Register pair.  */
> > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI */
> > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI P4QI */
> > >> > >
> > >> > > I think
> > >> > >
> > >> > > INT_MODE (P2QI, 16);
> > >> > > INT_MODE (P2HI, 32);
> > >> > >
> > >> > > with the above subreg approach should work.
> > >> > >
> > >> >
> > >> > I don't think subreg works on pseudo registers with non-zero
> > >> > offset.  validate_subreg has
> > >> >
> > >> >  if (maybe_lt (osize, regsize)
> > >> >       && ! (lra_in_progress && (FLOAT_MODE_P (imode) || FLOAT_MODE_P
> > >> > (omode))))
> > >> >     {
> > >> >       /* It is invalid for the target to pick a register size for a
> > >> > mode
> > >> >          that isn't ordered wrt to the size of that mode.  */
> > >> >       poly_uint64 block_size = ordered_min (isize, regsize);
> > >> >       unsigned int start_reg;
> > >> >       poly_uint64 offset_within_reg;
> > >> >       if (!can_div_trunc_p (offset, block_size, &start_reg,
> > >> > &offset_within_reg)
> > >> >           || (BYTES_BIG_ENDIAN
> > >> >               ? maybe_ne (offset_within_reg, block_size - osize)
> > >> >               : maybe_ne (offset_within_reg, 0U)))
> > >> >         return false;
> > >>
> > >> It works with SImode subregs of DImode values on 32bit targets. Please
> > >> look for calls to gen_highpart, one concrete example is in
> > >> atomic_compare_and_swap<mode>.
> > >>
> > >
> > > It works because of
> > >
> > > #define REGMODE_NATURAL_SIZE(MODE) UNITS_PER_WORD
> > >
> > > and only works for the high part of SImode of DImode.
> > >
> > > P2QI and P2HI are 2 special modes of mask register pair for
> > > 2 instructions.   Do we want to make them more generic?
> >
> > If enhancing the referred define means that we don't need two
> > artificial instructions and leave all heavy lifting to the existing
> Do you mean that we take P2HI and P2QI as normal vector modes,
> and reuse ix86_expand_vector_* things?
> But still two artificial instructions can't be avoided.
> > generic functionality, then this is the way to go.

No, declare them as integer modes and use subregs to access high and
low register. This should work in the same way as SImode hard
registers are accessed in DImode pair for 32bit targets.

Uros.
Hongtao Liu June 20, 2019, 10:55 a.m. UTC | #12
On Thu, Jun 20, 2019 at 2:13 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Thu, Jun 20, 2019 at 7:36 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Sat, Jun 8, 2019 at 4:12 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On 6/7/19, H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > >> > > +/* Register pair.  */
> > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI */
> > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI P4QI */
> > > >> > >
> > > >> > > I think
> > > >> > >
> > > >> > > INT_MODE (P2QI, 16);
> > > >> > > INT_MODE (P2HI, 32);
> > > >> > >
> > > >> > > with the above subreg approach should work.
Yes, it works.

But i didn't figure out how did pass_reload correctly handle such subreg,
do you have suggestions such as "which function i can dig into first" or
"which piece of codes handle subreg"?
> > > >> > >
> > > >> >
> > > >> > I don't think subreg works on pseudo registers with non-zero
> > > >> > offset.  validate_subreg has
> > > >> >
> > > >> >  if (maybe_lt (osize, regsize)
> > > >> >       && ! (lra_in_progress && (FLOAT_MODE_P (imode) || FLOAT_MODE_P
> > > >> > (omode))))
> > > >> >     {
> > > >> >       /* It is invalid for the target to pick a register size for a
> > > >> > mode
> > > >> >          that isn't ordered wrt to the size of that mode.  */
> > > >> >       poly_uint64 block_size = ordered_min (isize, regsize);
> > > >> >       unsigned int start_reg;
> > > >> >       poly_uint64 offset_within_reg;
> > > >> >       if (!can_div_trunc_p (offset, block_size, &start_reg,
> > > >> > &offset_within_reg)
> > > >> >           || (BYTES_BIG_ENDIAN
> > > >> >               ? maybe_ne (offset_within_reg, block_size - osize)
> > > >> >               : maybe_ne (offset_within_reg, 0U)))
> > > >> >         return false;
> > > >>
> > > >> It works with SImode subregs of DImode values on 32bit targets. Please
> > > >> look for calls to gen_highpart, one concrete example is in
> > > >> atomic_compare_and_swap<mode>.
> > > >>
> > > >
> > > > It works because of
> > > >
> > > > #define REGMODE_NATURAL_SIZE(MODE) UNITS_PER_WORD
> > > >
> > > > and only works for the high part of SImode of DImode.
> > > >
> > > > P2QI and P2HI are 2 special modes of mask register pair for
> > > > 2 instructions.   Do we want to make them more generic?
> > >
> > > If enhancing the referred define means that we don't need two
> > > artificial instructions and leave all heavy lifting to the existing
> > Do you mean that we take P2HI and P2QI as normal vector modes,
> > and reuse ix86_expand_vector_* things?
> > But still two artificial instructions can't be avoided.
> > > generic functionality, then this is the way to go.
>
> No, declare them as integer modes and use subregs to access high and
> low register. This should work in the same way as SImode hard
> registers are accessed in DImode pair for 32bit targets.
>
> Uros.

Update patch.
Uros Bizjak June 20, 2019, 11:37 a.m. UTC | #13
On Thu, Jun 20, 2019 at 12:54 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Thu, Jun 20, 2019 at 2:13 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Thu, Jun 20, 2019 at 7:36 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Sat, Jun 8, 2019 at 4:12 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >
> > > > On 6/7/19, H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > >> > > +/* Register pair.  */
> > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI */
> > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI P4QI */
> > > > >> > >
> > > > >> > > I think
> > > > >> > >
> > > > >> > > INT_MODE (P2QI, 16);
> > > > >> > > INT_MODE (P2HI, 32);
> > > > >> > >
> > > > >> > > with the above subreg approach should work.
> Yes, it works.
>
> But i didn't figure out how did pass_reload correctly handle such subreg,
> do you have suggestions such as "which function i can dig into first" or
> "which piece of codes handle subreg"?

I'm really not an expert in this part of the compiler, so I'll leave
the answer for someone else.

> > > > >> > >
> > > > >> >
> > > > >> > I don't think subreg works on pseudo registers with non-zero
> > > > >> > offset.  validate_subreg has
> > > > >> >
> > > > >> >  if (maybe_lt (osize, regsize)
> > > > >> >       && ! (lra_in_progress && (FLOAT_MODE_P (imode) || FLOAT_MODE_P
> > > > >> > (omode))))
> > > > >> >     {
> > > > >> >       /* It is invalid for the target to pick a register size for a
> > > > >> > mode
> > > > >> >          that isn't ordered wrt to the size of that mode.  */
> > > > >> >       poly_uint64 block_size = ordered_min (isize, regsize);
> > > > >> >       unsigned int start_reg;
> > > > >> >       poly_uint64 offset_within_reg;
> > > > >> >       if (!can_div_trunc_p (offset, block_size, &start_reg,
> > > > >> > &offset_within_reg)
> > > > >> >           || (BYTES_BIG_ENDIAN
> > > > >> >               ? maybe_ne (offset_within_reg, block_size - osize)
> > > > >> >               : maybe_ne (offset_within_reg, 0U)))
> > > > >> >         return false;
> > > > >>
> > > > >> It works with SImode subregs of DImode values on 32bit targets. Please
> > > > >> look for calls to gen_highpart, one concrete example is in
> > > > >> atomic_compare_and_swap<mode>.
> > > > >>
> > > > >
> > > > > It works because of
> > > > >
> > > > > #define REGMODE_NATURAL_SIZE(MODE) UNITS_PER_WORD
> > > > >
> > > > > and only works for the high part of SImode of DImode.
> > > > >
> > > > > P2QI and P2HI are 2 special modes of mask register pair for
> > > > > 2 instructions.   Do we want to make them more generic?
> > > >
> > > > If enhancing the referred define means that we don't need two
> > > > artificial instructions and leave all heavy lifting to the existing
> > > Do you mean that we take P2HI and P2QI as normal vector modes,
> > > and reuse ix86_expand_vector_* things?
> > > But still two artificial instructions can't be avoided.
> > > > generic functionality, then this is the way to go.
> >
> > No, declare them as integer modes and use subregs to access high and
> > low register. This should work in the same way as SImode hard
> > registers are accessed in DImode pair for 32bit targets.
> >
> > Uros.
>
> Update patch.

Does gen_lowpart/gen_higpart instead of simplify_gen_subreg work?
These two are just a handy wrapper for simplify_gen_subreg. Other than
that, patch LGTM.

Uros.
H.J. Lu June 20, 2019, 2:58 p.m. UTC | #14
On Thu, Jun 20, 2019 at 3:54 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Thu, Jun 20, 2019 at 2:13 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Thu, Jun 20, 2019 at 7:36 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Sat, Jun 8, 2019 at 4:12 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >
> > > > On 6/7/19, H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > >> > > +/* Register pair.  */
> > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI */
> > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI P4QI */
> > > > >> > >
> > > > >> > > I think
> > > > >> > >
> > > > >> > > INT_MODE (P2QI, 16);
> > > > >> > > INT_MODE (P2HI, 32);
> > > > >> > >
> > > > >> > > with the above subreg approach should work.
> Yes, it works.
>
> But i didn't figure out how did pass_reload correctly handle such subreg,
> do you have suggestions such as "which function i can dig into first" or
> "which piece of codes handle subreg"?

You need to define REGMODE_NATURAL_SIZE.
Hongtao Liu June 21, 2019, 2:21 a.m. UTC | #15
On Thu, Jun 20, 2019 at 10:58 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Jun 20, 2019 at 3:54 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Thu, Jun 20, 2019 at 2:13 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Thu, Jun 20, 2019 at 7:36 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > >
> > > > On Sat, Jun 8, 2019 at 4:12 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > >
> > > > > On 6/7/19, H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > >> > > +/* Register pair.  */
> > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI */
> > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI P4QI */
> > > > > >> > >
> > > > > >> > > I think
> > > > > >> > >
> > > > > >> > > INT_MODE (P2QI, 16);
> > > > > >> > > INT_MODE (P2HI, 32);
Why P2QI need 16 bytes but not 2 bytes?
Same question with P2HI.
> > > > > >> > >
> > > > > >> > > with the above subreg approach should work.
> > Yes, it works.
> >
> > But i didn't figure out how did pass_reload correctly handle such subreg,
> > do you have suggestions such as "which function i can dig into first" or
> > "which piece of codes handle subreg"?
>
> You need to define REGMODE_NATURAL_SIZE.
>
> --
> H.J.
Hongtao Liu June 21, 2019, 5:55 a.m. UTC | #16
On Thu, Jun 20, 2019 at 7:37 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Thu, Jun 20, 2019 at 12:54 PM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Thu, Jun 20, 2019 at 2:13 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Thu, Jun 20, 2019 at 7:36 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > >
> > > > On Sat, Jun 8, 2019 at 4:12 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > >
> > > > > On 6/7/19, H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > >> > > +/* Register pair.  */
> > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI */
> > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI P4QI */
> > > > > >> > >
> > > > > >> > > I think
> > > > > >> > >
> > > > > >> > > INT_MODE (P2QI, 16);
> > > > > >> > > INT_MODE (P2HI, 32);
> > > > > >> > >
> > > > > >> > > with the above subreg approach should work.
> > Yes, it works.
> >
> > But i didn't figure out how did pass_reload correctly handle such subreg,
> > do you have suggestions such as "which function i can dig into first" or
> > "which piece of codes handle subreg"?
>
> I'm really not an expert in this part of the compiler, so I'll leave
> the answer for someone else.
>
> > > > > >> > >
> > > > > >> >
> > > > > >> > I don't think subreg works on pseudo registers with non-zero
> > > > > >> > offset.  validate_subreg has
> > > > > >> >
> > > > > >> >  if (maybe_lt (osize, regsize)
> > > > > >> >       && ! (lra_in_progress && (FLOAT_MODE_P (imode) || FLOAT_MODE_P
> > > > > >> > (omode))))
> > > > > >> >     {
> > > > > >> >       /* It is invalid for the target to pick a register size for a
> > > > > >> > mode
> > > > > >> >          that isn't ordered wrt to the size of that mode.  */
> > > > > >> >       poly_uint64 block_size = ordered_min (isize, regsize);
> > > > > >> >       unsigned int start_reg;
> > > > > >> >       poly_uint64 offset_within_reg;
> > > > > >> >       if (!can_div_trunc_p (offset, block_size, &start_reg,
> > > > > >> > &offset_within_reg)
> > > > > >> >           || (BYTES_BIG_ENDIAN
> > > > > >> >               ? maybe_ne (offset_within_reg, block_size - osize)
> > > > > >> >               : maybe_ne (offset_within_reg, 0U)))
> > > > > >> >         return false;
> > > > > >>
> > > > > >> It works with SImode subregs of DImode values on 32bit targets. Please
> > > > > >> look for calls to gen_highpart, one concrete example is in
> > > > > >> atomic_compare_and_swap<mode>.
> > > > > >>
> > > > > >
> > > > > > It works because of
> > > > > >
> > > > > > #define REGMODE_NATURAL_SIZE(MODE) UNITS_PER_WORD
> > > > > >
> > > > > > and only works for the high part of SImode of DImode.
> > > > > >
> > > > > > P2QI and P2HI are 2 special modes of mask register pair for
> > > > > > 2 instructions.   Do we want to make them more generic?
> > > > >
> > > > > If enhancing the referred define means that we don't need two
> > > > > artificial instructions and leave all heavy lifting to the existing
> > > > Do you mean that we take P2HI and P2QI as normal vector modes,
> > > > and reuse ix86_expand_vector_* things?
> > > > But still two artificial instructions can't be avoided.
> > > > > generic functionality, then this is the way to go.
> > >
> > > No, declare them as integer modes and use subregs to access high and
> > > low register. This should work in the same way as SImode hard
> > > registers are accessed in DImode pair for 32bit targets.
> > >
> > > Uros.
> >
> > Update patch.
>
> Does gen_lowpart/gen_higpart instead of simplify_gen_subreg work?
Nope.
gen_highpart(QImode, op) calls simpliy_gen_subreg (QImode, op, P2QImode, 15)
which failed to produce subreg operand.
> These two are just a handy wrapper for simplify_gen_subreg. Other than
> that, patch LGTM.
>
> Uros.
Uros Bizjak June 21, 2019, 5:56 a.m. UTC | #17
On Fri, Jun 21, 2019 at 4:21 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Thu, Jun 20, 2019 at 10:58 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Thu, Jun 20, 2019 at 3:54 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Thu, Jun 20, 2019 at 2:13 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >
> > > > On Thu, Jun 20, 2019 at 7:36 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > >
> > > > > On Sat, Jun 8, 2019 at 4:12 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > > >
> > > > > > On 6/7/19, H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > >
> > > > > > >> > > +/* Register pair.  */
> > > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI */
> > > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI P4QI */
> > > > > > >> > >
> > > > > > >> > > I think
> > > > > > >> > >
> > > > > > >> > > INT_MODE (P2QI, 16);
> > > > > > >> > > INT_MODE (P2HI, 32);
> Why P2QI need 16 bytes but not 2 bytes?
> Same question with P2HI.

Because we made a mistake. It should be 2 and 4, since these arguments
are bytes, not bits.

This will also fix gen_highpart issue.

Uros.

> > > > > > >> > >
> > > > > > >> > > with the above subreg approach should work.
> > > Yes, it works.
> > >
> > > But i didn't figure out how did pass_reload correctly handle such subreg,
> > > do you have suggestions such as "which function i can dig into first" or
> > > "which piece of codes handle subreg"?
> >
> > You need to define REGMODE_NATURAL_SIZE.
> >
> > --
> > H.J.
>
>
>
> --
> BR,
> Hongtao
Hongtao Liu June 21, 2019, 6:53 a.m. UTC | #18
On Fri, Jun 21, 2019 at 1:56 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Fri, Jun 21, 2019 at 4:21 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Thu, Jun 20, 2019 at 10:58 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Thu, Jun 20, 2019 at 3:54 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > >
> > > > On Thu, Jun 20, 2019 at 2:13 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > >
> > > > > On Thu, Jun 20, 2019 at 7:36 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > > >
> > > > > > On Sat, Jun 8, 2019 at 4:12 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > > > >
> > > > > > > On 6/7/19, H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > >
> > > > > > > >> > > +/* Register pair.  */
> > > > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI */
> > > > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI P4QI */
> > > > > > > >> > >
> > > > > > > >> > > I think
> > > > > > > >> > >
> > > > > > > >> > > INT_MODE (P2QI, 16);
> > > > > > > >> > > INT_MODE (P2HI, 32);
> > Why P2QI need 16 bytes but not 2 bytes?
> > Same question with P2HI.
>
> Because we made a mistake. It should be 2 and 4, since these arguments
Then it will run into internal comiler error when building libgcc.
I'm still invertigating it.
> are bytes, not bits.
>
> This will also fix gen_highpart issue.
>
> Uros.
>
> > > > > > > >> > >
> > > > > > > >> > > with the above subreg approach should work.
> > > > Yes, it works.
> > > >
> > > > But i didn't figure out how did pass_reload correctly handle such subreg,
> > > > do you have suggestions such as "which function i can dig into first" or
> > > > "which piece of codes handle subreg"?
> > >
> > > You need to define REGMODE_NATURAL_SIZE.
> > >
> > > --
> > > H.J.
> >
> >
> >
> > --
> > BR,
> > Hongtao
Uros Bizjak June 21, 2019, 6:59 a.m. UTC | #19
On Thu, Jun 20, 2019 at 4:58 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> > > > > >> > > +/* Register pair.  */
> > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI */
> > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI P4QI */
> > > > > >> > >
> > > > > >> > > I think
> > > > > >> > >
> > > > > >> > > INT_MODE (P2QI, 16);
> > > > > >> > > INT_MODE (P2HI, 32);
> > > > > >> > >
> > > > > >> > > with the above subreg approach should work.
> > Yes, it works.
> >
> > But i didn't figure out how did pass_reload correctly handle such subreg,
> > do you have suggestions such as "which function i can dig into first" or
> > "which piece of codes handle subreg"?
>
> You need to define REGMODE_NATURAL_SIZE.

It looks to me that this define needs to be updated for vector modes.
We can't access all parts of xmm regs in word_mode.

Uros.
H.J. Lu June 21, 2019, 6:38 p.m. UTC | #20
On Thu, Jun 20, 2019 at 11:51 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Fri, Jun 21, 2019 at 1:56 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Fri, Jun 21, 2019 at 4:21 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Thu, Jun 20, 2019 at 10:58 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Thu, Jun 20, 2019 at 3:54 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > >
> > > > > On Thu, Jun 20, 2019 at 2:13 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > > >
> > > > > > On Thu, Jun 20, 2019 at 7:36 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > > > >
> > > > > > > On Sat, Jun 8, 2019 at 4:12 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > > > > >
> > > > > > > > On 6/7/19, H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > >
> > > > > > > > >> > > +/* Register pair.  */
> > > > > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI */
> > > > > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI P4QI */
> > > > > > > > >> > >
> > > > > > > > >> > > I think
> > > > > > > > >> > >
> > > > > > > > >> > > INT_MODE (P2QI, 16);
> > > > > > > > >> > > INT_MODE (P2HI, 32);
> > > Why P2QI need 16 bytes but not 2 bytes?
> > > Same question with P2HI.
> >
> > Because we made a mistake. It should be 2 and 4, since these arguments
> Then it will run into internal comiler error when building libgcc.
> I'm still invertigating it.
> > are bytes, not bits.

I don't think we can have 2 integer modes with the same number of bytes since
it breaks things like

scalar_int_mode wider_mode = GET_MODE_WIDER_MODE (mode).require ();

We can get

(gdb) p mode
$2 = {m_mode = E_SImode}
(gdb) p wider_mode
$3 = {m_mode = E_P2HImode}
(gdb)

Neither middle-end nor backend support it.
Uros Bizjak June 22, 2019, 7:38 a.m. UTC | #21
On Fri, Jun 21, 2019 at 8:38 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> > > > > > > > > >> > > +/* Register pair.  */
> > > > > > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI */
> > > > > > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI P4QI */
> > > > > > > > > >> > >
> > > > > > > > > >> > > I think
> > > > > > > > > >> > >
> > > > > > > > > >> > > INT_MODE (P2QI, 16);
> > > > > > > > > >> > > INT_MODE (P2HI, 32);
> > > > Why P2QI need 16 bytes but not 2 bytes?
> > > > Same question with P2HI.
> > >
> > > Because we made a mistake. It should be 2 and 4, since these arguments
> > Then it will run into internal comiler error when building libgcc.
> > I'm still invertigating it.
> > > are bytes, not bits.
>
> I don't think we can have 2 integer modes with the same number of bytes since
> it breaks things like
>
> scalar_int_mode wider_mode = GET_MODE_WIDER_MODE (mode).require ();
>
> We can get
>
> (gdb) p mode
> $2 = {m_mode = E_SImode}
> (gdb) p wider_mode
> $3 = {m_mode = E_P2HImode}
> (gdb)
>
> Neither middle-end nor backend support it.

Ouch... It looks we hit the limitation of the middle end (which should
at least warn/error out if two modes of the same width are declared).

OTOH, we can't solve this problem by using two HI/QImode registers,
since a consecutive register pair has to be allocated It is also not
possible to overload existing SI/HImode mode with different
requirements w.r.t register pair allocation (e.g. sometimes the whole
register is allocated, and sometimes a register pair is allocated).

I think we have to invent something like SPECIAL_INT_MODE, which would
avoid mode promotion functionality (basically, it should not be listed
in mode_wider and similar arrays). This would prevent mode promotion
issues, while it would still allow to have mode, having the same width
as existing mode, but with special properties.

I'm adding Jeff and Jakub to the discussion about SPECIAL_INT_MODE.

Uros.
Hongtao Liu June 25, 2019, 2:46 a.m. UTC | #22
On Sat, Jun 22, 2019 at 3:38 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Fri, Jun 21, 2019 at 8:38 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> > > > > > > > > > >> > > +/* Register pair.  */
> > > > > > > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI */
> > > > > > > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI P4QI */
> > > > > > > > > > >> > >
> > > > > > > > > > >> > > I think
> > > > > > > > > > >> > >
> > > > > > > > > > >> > > INT_MODE (P2QI, 16);
> > > > > > > > > > >> > > INT_MODE (P2HI, 32);
> > > > > Why P2QI need 16 bytes but not 2 bytes?
> > > > > Same question with P2HI.
> > > >
> > > > Because we made a mistake. It should be 2 and 4, since these arguments
> > > Then it will run into internal comiler error when building libgcc.
> > > I'm still invertigating it.
> > > > are bytes, not bits.
> >
> > I don't think we can have 2 integer modes with the same number of bytes since
> > it breaks things like
> >
> > scalar_int_mode wider_mode = GET_MODE_WIDER_MODE (mode).require ();
> >
> > We can get
> >
> > (gdb) p mode
> > $2 = {m_mode = E_SImode}
> > (gdb) p wider_mode
> > $3 = {m_mode = E_P2HImode}
> > (gdb)
> >
> > Neither middle-end nor backend support it.
>
> Ouch... It looks we hit the limitation of the middle end (which should
> at least warn/error out if two modes of the same width are declared).
>
> OTOH, we can't solve this problem by using two HI/QImode registers,
> since a consecutive register pair has to be allocated It is also not
> possible to overload existing SI/HImode mode with different
> requirements w.r.t register pair allocation (e.g. sometimes the whole
> register is allocated, and sometimes a register pair is allocated).
>
> I think we have to invent something like SPECIAL_INT_MODE, which would
> avoid mode promotion functionality (basically, it should not be listed
> in mode_wider and similar arrays). This would prevent mode promotion
> issues, while it would still allow to have mode, having the same width
> as existing mode, but with special properties.
>
> I'm adding Jeff and Jakub to the discussion about SPECIAL_INT_MODE.
>
> Uros.

Patch from H.J using PARTIAL_INT_MODE fixed this issue.

+/* Register pair.  */
+PARTIAL_INT_MODE (HI, 16, P2QI);
+PARTIAL_INT_MODE (SI, 32, P2HI);
+

Here is updated patch.
Uros Bizjak June 25, 2019, 7:58 a.m. UTC | #23
On 6/25/19, Hongtao Liu <crazylht@gmail.com> wrote:
> On Sat, Jun 22, 2019 at 3:38 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>>
>> On Fri, Jun 21, 2019 at 8:38 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>>
>> > > > > > > > > > >> > > +/* Register pair.  */
>> > > > > > > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI
>> > > > > > > > > > >> > > */
>> > > > > > > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI
>> > > > > > > > > > >> > > P4QI */
>> > > > > > > > > > >> > >
>> > > > > > > > > > >> > > I think
>> > > > > > > > > > >> > >
>> > > > > > > > > > >> > > INT_MODE (P2QI, 16);
>> > > > > > > > > > >> > > INT_MODE (P2HI, 32);
>> > > > > Why P2QI need 16 bytes but not 2 bytes?
>> > > > > Same question with P2HI.
>> > > >
>> > > > Because we made a mistake. It should be 2 and 4, since these
>> > > > arguments
>> > > Then it will run into internal comiler error when building libgcc.
>> > > I'm still invertigating it.
>> > > > are bytes, not bits.
>> >
>> > I don't think we can have 2 integer modes with the same number of bytes
>> > since
>> > it breaks things like
>> >
>> > scalar_int_mode wider_mode = GET_MODE_WIDER_MODE (mode).require ();
>> >
>> > We can get
>> >
>> > (gdb) p mode
>> > $2 = {m_mode = E_SImode}
>> > (gdb) p wider_mode
>> > $3 = {m_mode = E_P2HImode}
>> > (gdb)
>> >
>> > Neither middle-end nor backend support it.
>>
>> Ouch... It looks we hit the limitation of the middle end (which should
>> at least warn/error out if two modes of the same width are declared).
>>
>> OTOH, we can't solve this problem by using two HI/QImode registers,
>> since a consecutive register pair has to be allocated It is also not
>> possible to overload existing SI/HImode mode with different
>> requirements w.r.t register pair allocation (e.g. sometimes the whole
>> register is allocated, and sometimes a register pair is allocated).
>>
>> I think we have to invent something like SPECIAL_INT_MODE, which would
>> avoid mode promotion functionality (basically, it should not be listed
>> in mode_wider and similar arrays). This would prevent mode promotion
>> issues, while it would still allow to have mode, having the same width
>> as existing mode, but with special properties.
>>
>> I'm adding Jeff and Jakub to the discussion about SPECIAL_INT_MODE.
>>
>> Uros.
>
> Patch from H.J using PARTIAL_INT_MODE fixed this issue.
>
> +/* Register pair.  */
> +PARTIAL_INT_MODE (HI, 16, P2QI);
> +PARTIAL_INT_MODE (SI, 32, P2HI);
> +

I don't think this approach is correct (the mode is not partial), and
it could work by chance. The documentation is very brief with the
details of different mode types, so let's ask middle-end and RTL
experts.

Uros.
H.J. Lu June 25, 2019, 2:34 p.m. UTC | #24
On Tue, Jun 25, 2019 at 12:58 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On 6/25/19, Hongtao Liu <crazylht@gmail.com> wrote:
> > On Sat, Jun 22, 2019 at 3:38 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >>
> >> On Fri, Jun 21, 2019 at 8:38 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >>
> >> > > > > > > > > > >> > > +/* Register pair.  */
> >> > > > > > > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI
> >> > > > > > > > > > >> > > */
> >> > > > > > > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI
> >> > > > > > > > > > >> > > P4QI */
> >> > > > > > > > > > >> > >
> >> > > > > > > > > > >> > > I think
> >> > > > > > > > > > >> > >
> >> > > > > > > > > > >> > > INT_MODE (P2QI, 16);
> >> > > > > > > > > > >> > > INT_MODE (P2HI, 32);
> >> > > > > Why P2QI need 16 bytes but not 2 bytes?
> >> > > > > Same question with P2HI.
> >> > > >
> >> > > > Because we made a mistake. It should be 2 and 4, since these
> >> > > > arguments
> >> > > Then it will run into internal comiler error when building libgcc.
> >> > > I'm still invertigating it.
> >> > > > are bytes, not bits.
> >> >
> >> > I don't think we can have 2 integer modes with the same number of bytes
> >> > since
> >> > it breaks things like
> >> >
> >> > scalar_int_mode wider_mode = GET_MODE_WIDER_MODE (mode).require ();
> >> >
> >> > We can get
> >> >
> >> > (gdb) p mode
> >> > $2 = {m_mode = E_SImode}
> >> > (gdb) p wider_mode
> >> > $3 = {m_mode = E_P2HImode}
> >> > (gdb)
> >> >
> >> > Neither middle-end nor backend support it.
> >>
> >> Ouch... It looks we hit the limitation of the middle end (which should
> >> at least warn/error out if two modes of the same width are declared).
> >>
> >> OTOH, we can't solve this problem by using two HI/QImode registers,
> >> since a consecutive register pair has to be allocated It is also not
> >> possible to overload existing SI/HImode mode with different
> >> requirements w.r.t register pair allocation (e.g. sometimes the whole
> >> register is allocated, and sometimes a register pair is allocated).
> >>
> >> I think we have to invent something like SPECIAL_INT_MODE, which would
> >> avoid mode promotion functionality (basically, it should not be listed
> >> in mode_wider and similar arrays). This would prevent mode promotion
> >> issues, while it would still allow to have mode, having the same width
> >> as existing mode, but with special properties.
> >>
> >> I'm adding Jeff and Jakub to the discussion about SPECIAL_INT_MODE.
> >>
> >> Uros.
> >
> > Patch from H.J using PARTIAL_INT_MODE fixed this issue.
> >
> > +/* Register pair.  */
> > +PARTIAL_INT_MODE (HI, 16, P2QI);
> > +PARTIAL_INT_MODE (SI, 32, P2HI);
> > +
>
> I don't think this approach is correct (the mode is not partial), and
> it could work by chance. The documentation is very brief with the
> details of different mode types, so let's ask middle-end and RTL
> experts.
>

It is used by powerpc backend for similar purpose:

:/* Replacement for TImode that only is allowed in GPRs.  We also use PTImode
   for quad memory atomic operations to force getting an even/odd register
   combination.  */
PARTIAL_INT_MODE (TI, 128, PTI);
Richard Sandiford June 25, 2019, 2:51 p.m. UTC | #25
"H.J. Lu" <hjl.tools@gmail.com> writes:
> On Tue, Jun 25, 2019 at 12:58 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>>
>> On 6/25/19, Hongtao Liu <crazylht@gmail.com> wrote:
>> > On Sat, Jun 22, 2019 at 3:38 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>> >>
>> >> On Fri, Jun 21, 2019 at 8:38 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>> >>
>> >> > > > > > > > > > >> > > +/* Register pair.  */
>> >> > > > > > > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI
>> >> > > > > > > > > > >> > > */
>> >> > > > > > > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI
>> >> > > > > > > > > > >> > > P4QI */
>> >> > > > > > > > > > >> > >
>> >> > > > > > > > > > >> > > I think
>> >> > > > > > > > > > >> > >
>> >> > > > > > > > > > >> > > INT_MODE (P2QI, 16);
>> >> > > > > > > > > > >> > > INT_MODE (P2HI, 32);
>> >> > > > > Why P2QI need 16 bytes but not 2 bytes?
>> >> > > > > Same question with P2HI.
>> >> > > >
>> >> > > > Because we made a mistake. It should be 2 and 4, since these
>> >> > > > arguments
>> >> > > Then it will run into internal comiler error when building libgcc.
>> >> > > I'm still invertigating it.
>> >> > > > are bytes, not bits.
>> >> >
>> >> > I don't think we can have 2 integer modes with the same number of bytes
>> >> > since
>> >> > it breaks things like
>> >> >
>> >> > scalar_int_mode wider_mode = GET_MODE_WIDER_MODE (mode).require ();
>> >> >
>> >> > We can get
>> >> >
>> >> > (gdb) p mode
>> >> > $2 = {m_mode = E_SImode}
>> >> > (gdb) p wider_mode
>> >> > $3 = {m_mode = E_P2HImode}
>> >> > (gdb)
>> >> >
>> >> > Neither middle-end nor backend support it.
>> >>
>> >> Ouch... It looks we hit the limitation of the middle end (which should
>> >> at least warn/error out if two modes of the same width are declared).
>> >>
>> >> OTOH, we can't solve this problem by using two HI/QImode registers,
>> >> since a consecutive register pair has to be allocated It is also not
>> >> possible to overload existing SI/HImode mode with different
>> >> requirements w.r.t register pair allocation (e.g. sometimes the whole
>> >> register is allocated, and sometimes a register pair is allocated).
>> >>
>> >> I think we have to invent something like SPECIAL_INT_MODE, which would
>> >> avoid mode promotion functionality (basically, it should not be listed
>> >> in mode_wider and similar arrays). This would prevent mode promotion
>> >> issues, while it would still allow to have mode, having the same width
>> >> as existing mode, but with special properties.
>> >>
>> >> I'm adding Jeff and Jakub to the discussion about SPECIAL_INT_MODE.
>> >>
>> >> Uros.
>> >
>> > Patch from H.J using PARTIAL_INT_MODE fixed this issue.
>> >
>> > +/* Register pair.  */
>> > +PARTIAL_INT_MODE (HI, 16, P2QI);
>> > +PARTIAL_INT_MODE (SI, 32, P2HI);
>> > +
>>
>> I don't think this approach is correct (the mode is not partial), and
>> it could work by chance. The documentation is very brief with the
>> details of different mode types, so let's ask middle-end and RTL
>> experts.

Agree your SPECIAL_INT_MODE sounds cleaner FWIW.  Having PARTIAL_INT_MODEs
that aren't actually partial seems pretty grim, but...

> It is used by powerpc backend for similar purpose:
>
> :/* Replacement for TImode that only is allowed in GPRs.  We also use PTImode
>    for quad memory atomic operations to force getting an even/odd register
>    combination.  */
> PARTIAL_INT_MODE (TI, 128, PTI);

...I guess this means that it's correct through usage.

Richard
Jeff Law June 25, 2019, 2:55 p.m. UTC | #26
On 6/25/19 8:34 AM, H.J. Lu wrote:
> On Tue, Jun 25, 2019 at 12:58 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>>
>> On 6/25/19, Hongtao Liu <crazylht@gmail.com> wrote:
>>> On Sat, Jun 22, 2019 at 3:38 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>>>>
>>>> On Fri, Jun 21, 2019 at 8:38 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>>>>
>>>>>>>>>>>>>>>>> +/* Register pair.  */
>>>>>>>>>>>>>>>>> +VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI
>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>> +VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI
>>>>>>>>>>>>>>>>> P4QI */
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> I think
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> INT_MODE (P2QI, 16);
>>>>>>>>>>>>>>>>> INT_MODE (P2HI, 32);
>>>>>>>> Why P2QI need 16 bytes but not 2 bytes?
>>>>>>>> Same question with P2HI.
>>>>>>>
>>>>>>> Because we made a mistake. It should be 2 and 4, since these
>>>>>>> arguments
>>>>>> Then it will run into internal comiler error when building libgcc.
>>>>>> I'm still invertigating it.
>>>>>>> are bytes, not bits.
>>>>>
>>>>> I don't think we can have 2 integer modes with the same number of bytes
>>>>> since
>>>>> it breaks things like
>>>>>
>>>>> scalar_int_mode wider_mode = GET_MODE_WIDER_MODE (mode).require ();
>>>>>
>>>>> We can get
>>>>>
>>>>> (gdb) p mode
>>>>> $2 = {m_mode = E_SImode}
>>>>> (gdb) p wider_mode
>>>>> $3 = {m_mode = E_P2HImode}
>>>>> (gdb)
>>>>>
>>>>> Neither middle-end nor backend support it.
>>>>
>>>> Ouch... It looks we hit the limitation of the middle end (which should
>>>> at least warn/error out if two modes of the same width are declared).
>>>>
>>>> OTOH, we can't solve this problem by using two HI/QImode registers,
>>>> since a consecutive register pair has to be allocated It is also not
>>>> possible to overload existing SI/HImode mode with different
>>>> requirements w.r.t register pair allocation (e.g. sometimes the whole
>>>> register is allocated, and sometimes a register pair is allocated).
>>>>
>>>> I think we have to invent something like SPECIAL_INT_MODE, which would
>>>> avoid mode promotion functionality (basically, it should not be listed
>>>> in mode_wider and similar arrays). This would prevent mode promotion
>>>> issues, while it would still allow to have mode, having the same width
>>>> as existing mode, but with special properties.
>>>>
>>>> I'm adding Jeff and Jakub to the discussion about SPECIAL_INT_MODE.
>>>>
>>>> Uros.
>>>
>>> Patch from H.J using PARTIAL_INT_MODE fixed this issue.
>>>
>>> +/* Register pair.  */
>>> +PARTIAL_INT_MODE (HI, 16, P2QI);
>>> +PARTIAL_INT_MODE (SI, 32, P2HI);
>>> +
>>
>> I don't think this approach is correct (the mode is not partial), and
>> it could work by chance. The documentation is very brief with the
>> details of different mode types, so let's ask middle-end and RTL
>> experts.
>>
> 
> It is used by powerpc backend for similar purpose:
> 
> :/* Replacement for TImode that only is allowed in GPRs.  We also use PTImode
>    for quad memory atomic operations to force getting an even/odd register
>    combination.  */
> PARTIAL_INT_MODE (TI, 128, PTI);
The partial modes were designed to handle things like targets with
register sizes that aren't 2**n bits in size.  A port can certainly
support something like SImode and PSImode side by side and they can have
the same underlying size.

Essentially the partial modes represent a mode where the compiler does
not necessarily know the exact size, but instead knows a maximum size of
the object.  You'll have to define suitable movXX patterns and any other
operations you might want to perform.  THe compiler will generally not
convert between the partial mode and any other modes without an explicit
conversion (again it can't because it doesn't know how big the partial
mode really is).

I don't see anything inherently wrong with using the partial modes, but
we need to be aware that they're not stressed all that hard and we could
well run into under-specified cases and missed optimizations.
Jeff
H.J. Lu June 25, 2019, 3:10 p.m. UTC | #27
On Tue, Jun 25, 2019 at 7:55 AM Jeff Law <law@redhat.com> wrote:
>
> On 6/25/19 8:34 AM, H.J. Lu wrote:
> > On Tue, Jun 25, 2019 at 12:58 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> >>
> >> On 6/25/19, Hongtao Liu <crazylht@gmail.com> wrote:
> >>> On Sat, Jun 22, 2019 at 3:38 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >>>>
> >>>> On Fri, Jun 21, 2019 at 8:38 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >>>>
> >>>>>>>>>>>>>>>>> +/* Register pair.  */
> >>>>>>>>>>>>>>>>> +VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI
> >>>>>>>>>>>>>>>>> */
> >>>>>>>>>>>>>>>>> +VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI
> >>>>>>>>>>>>>>>>> P4QI */
> >>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>> I think
> >>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>> INT_MODE (P2QI, 16);
> >>>>>>>>>>>>>>>>> INT_MODE (P2HI, 32);
> >>>>>>>> Why P2QI need 16 bytes but not 2 bytes?
> >>>>>>>> Same question with P2HI.
> >>>>>>>
> >>>>>>> Because we made a mistake. It should be 2 and 4, since these
> >>>>>>> arguments
> >>>>>> Then it will run into internal comiler error when building libgcc.
> >>>>>> I'm still invertigating it.
> >>>>>>> are bytes, not bits.
> >>>>>
> >>>>> I don't think we can have 2 integer modes with the same number of bytes
> >>>>> since
> >>>>> it breaks things like
> >>>>>
> >>>>> scalar_int_mode wider_mode = GET_MODE_WIDER_MODE (mode).require ();
> >>>>>
> >>>>> We can get
> >>>>>
> >>>>> (gdb) p mode
> >>>>> $2 = {m_mode = E_SImode}
> >>>>> (gdb) p wider_mode
> >>>>> $3 = {m_mode = E_P2HImode}
> >>>>> (gdb)
> >>>>>
> >>>>> Neither middle-end nor backend support it.
> >>>>
> >>>> Ouch... It looks we hit the limitation of the middle end (which should
> >>>> at least warn/error out if two modes of the same width are declared).
> >>>>
> >>>> OTOH, we can't solve this problem by using two HI/QImode registers,
> >>>> since a consecutive register pair has to be allocated It is also not
> >>>> possible to overload existing SI/HImode mode with different
> >>>> requirements w.r.t register pair allocation (e.g. sometimes the whole
> >>>> register is allocated, and sometimes a register pair is allocated).
> >>>>
> >>>> I think we have to invent something like SPECIAL_INT_MODE, which would
> >>>> avoid mode promotion functionality (basically, it should not be listed
> >>>> in mode_wider and similar arrays). This would prevent mode promotion
> >>>> issues, while it would still allow to have mode, having the same width
> >>>> as existing mode, but with special properties.
> >>>>
> >>>> I'm adding Jeff and Jakub to the discussion about SPECIAL_INT_MODE.
> >>>>
> >>>> Uros.
> >>>
> >>> Patch from H.J using PARTIAL_INT_MODE fixed this issue.
> >>>
> >>> +/* Register pair.  */
> >>> +PARTIAL_INT_MODE (HI, 16, P2QI);
> >>> +PARTIAL_INT_MODE (SI, 32, P2HI);
> >>> +
> >>
> >> I don't think this approach is correct (the mode is not partial), and
> >> it could work by chance. The documentation is very brief with the
> >> details of different mode types, so let's ask middle-end and RTL
> >> experts.
> >>
> >
> > It is used by powerpc backend for similar purpose:
> >
> > :/* Replacement for TImode that only is allowed in GPRs.  We also use PTImode
> >    for quad memory atomic operations to force getting an even/odd register
> >    combination.  */
> > PARTIAL_INT_MODE (TI, 128, PTI);
> The partial modes were designed to handle things like targets with
> register sizes that aren't 2**n bits in size.  A port can certainly
> support something like SImode and PSImode side by side and they can have
> the same underlying size.
>
> Essentially the partial modes represent a mode where the compiler does
> not necessarily know the exact size, but instead knows a maximum size of
> the object.  You'll have to define suitable movXX patterns and any other
> operations you might want to perform.  THe compiler will generally not
> convert between the partial mode and any other modes without an explicit
> conversion (again it can't because it doesn't know how big the partial
> mode really is).

These are all what we need here.  We generate an instruction to set a
P2HI/P2QI register and immediately extract it to HI/QI registers.  No other
operations in P2HI/P2QI modes are generated nor needed.

[hjl@gnu-cfl-1 vp2intersect]$ cat 2.i
typedef int __v16si __attribute__ ((__vector_size__ (64)));

typedef unsigned char  __mmask8;
typedef unsigned short __mmask16;

__mmask16
foo (__v16si x, __v16si y, __mmask16 *b)
{
  __mmask16 a;
  __builtin_ia32_2intersectd512 (&a, b, x, y);
  return a;
}
[hjl@gnu-cfl-1 vp2intersect]$ make 2.s
/export/build/gnu/tools-build/gcc-intel/build-x86_64-linux/gcc/xgcc
-B/export/build/gnu/tools-build/gcc-intel/build-x86_64-linux/gcc/
-mavx512vp2intersect -O2 -S 2.i
[hjl@gnu-cfl-1 vp2intersect]$ cat 2.s
.file "2.i"
.text
.p2align 4
.globl foo
.type foo, @function
foo:
.LFB0:
.cfi_startproc
vp2intersectd %zmm1, %zmm0, %k0
kmovw %k0, %eax
kmovw %k1, (%rdi)
ret
.cfi_endproc
.LFE0:
.size foo, .-foo
.ident "GCC: (GNU) 10.0.0 20190620 (experimental)"
.section .note.GNU-stack,"",@progbits
[hjl@gnu-cfl-1 vp2intersect]$


> I don't see anything inherently wrong with using the partial modes, but
> we need to be aware that they're not stressed all that hard and we could
> well run into under-specified cases and missed optimizations.
> Jeff
Uros Bizjak June 25, 2019, 5:13 p.m. UTC | #28
On Tue, Jun 25, 2019 at 4:44 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Sat, Jun 22, 2019 at 3:38 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Fri, Jun 21, 2019 at 8:38 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > > > > > > > > > > >> > > +/* Register pair.  */
> > > > > > > > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI */
> > > > > > > > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI P4QI */
> > > > > > > > > > > >> > >
> > > > > > > > > > > >> > > I think
> > > > > > > > > > > >> > >
> > > > > > > > > > > >> > > INT_MODE (P2QI, 16);
> > > > > > > > > > > >> > > INT_MODE (P2HI, 32);
> > > > > > Why P2QI need 16 bytes but not 2 bytes?
> > > > > > Same question with P2HI.
> > > > >
> > > > > Because we made a mistake. It should be 2 and 4, since these arguments
> > > > Then it will run into internal comiler error when building libgcc.
> > > > I'm still invertigating it.
> > > > > are bytes, not bits.
> > >
> > > I don't think we can have 2 integer modes with the same number of bytes since
> > > it breaks things like
> > >
> > > scalar_int_mode wider_mode = GET_MODE_WIDER_MODE (mode).require ();
> > >
> > > We can get
> > >
> > > (gdb) p mode
> > > $2 = {m_mode = E_SImode}
> > > (gdb) p wider_mode
> > > $3 = {m_mode = E_P2HImode}
> > > (gdb)
> > >
> > > Neither middle-end nor backend support it.
> >
> > Ouch... It looks we hit the limitation of the middle end (which should
> > at least warn/error out if two modes of the same width are declared).
> >
> > OTOH, we can't solve this problem by using two HI/QImode registers,
> > since a consecutive register pair has to be allocated It is also not
> > possible to overload existing SI/HImode mode with different
> > requirements w.r.t register pair allocation (e.g. sometimes the whole
> > register is allocated, and sometimes a register pair is allocated).
> >
> > I think we have to invent something like SPECIAL_INT_MODE, which would
> > avoid mode promotion functionality (basically, it should not be listed
> > in mode_wider and similar arrays). This would prevent mode promotion
> > issues, while it would still allow to have mode, having the same width
> > as existing mode, but with special properties.
> >
> > I'm adding Jeff and Jakub to the discussion about SPECIAL_INT_MODE.
> >
> > Uros.
>
> Patch from H.J using PARTIAL_INT_MODE fixed this issue.
>
> +/* Register pair.  */
> +PARTIAL_INT_MODE (HI, 16, P2QI);
> +PARTIAL_INT_MODE (SI, 32, P2HI);
> +
>
> Here is updated patch.

OK for mainline, but please add the comment about the reason to use
PARTIAL_INT_MODE.

Thanks,
Uros.
Hongtao Liu June 26, 2019, 4:54 a.m. UTC | #29
On Wed, Jun 26, 2019 at 1:13 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Tue, Jun 25, 2019 at 4:44 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Sat, Jun 22, 2019 at 3:38 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Fri, Jun 21, 2019 at 8:38 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > > > > > > > > > > >> > > +/* Register pair.  */
> > > > > > > > > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 2); /* P2QI */
> > > > > > > > > > > > >> > > +VECTOR_MODES_WITH_PREFIX (P, INT, 4); /* P2HI P4QI */
> > > > > > > > > > > > >> > >
> > > > > > > > > > > > >> > > I think
> > > > > > > > > > > > >> > >
> > > > > > > > > > > > >> > > INT_MODE (P2QI, 16);
> > > > > > > > > > > > >> > > INT_MODE (P2HI, 32);
> > > > > > > Why P2QI need 16 bytes but not 2 bytes?
> > > > > > > Same question with P2HI.
> > > > > >
> > > > > > Because we made a mistake. It should be 2 and 4, since these arguments
> > > > > Then it will run into internal comiler error when building libgcc.
> > > > > I'm still invertigating it.
> > > > > > are bytes, not bits.
> > > >
> > > > I don't think we can have 2 integer modes with the same number of bytes since
> > > > it breaks things like
> > > >
> > > > scalar_int_mode wider_mode = GET_MODE_WIDER_MODE (mode).require ();
> > > >
> > > > We can get
> > > >
> > > > (gdb) p mode
> > > > $2 = {m_mode = E_SImode}
> > > > (gdb) p wider_mode
> > > > $3 = {m_mode = E_P2HImode}
> > > > (gdb)
> > > >
> > > > Neither middle-end nor backend support it.
> > >
> > > Ouch... It looks we hit the limitation of the middle end (which should
> > > at least warn/error out if two modes of the same width are declared).
> > >
> > > OTOH, we can't solve this problem by using two HI/QImode registers,
> > > since a consecutive register pair has to be allocated It is also not
> > > possible to overload existing SI/HImode mode with different
> > > requirements w.r.t register pair allocation (e.g. sometimes the whole
> > > register is allocated, and sometimes a register pair is allocated).
> > >
> > > I think we have to invent something like SPECIAL_INT_MODE, which would
> > > avoid mode promotion functionality (basically, it should not be listed
> > > in mode_wider and similar arrays). This would prevent mode promotion
> > > issues, while it would still allow to have mode, having the same width
> > > as existing mode, but with special properties.
> > >
> > > I'm adding Jeff and Jakub to the discussion about SPECIAL_INT_MODE.
> > >
> > > Uros.
> >
> > Patch from H.J using PARTIAL_INT_MODE fixed this issue.
> >
> > +/* Register pair.  */
> > +PARTIAL_INT_MODE (HI, 16, P2QI);
> > +PARTIAL_INT_MODE (SI, 32, P2HI);
> > +
> >
> > Here is updated patch.
>
> OK for mainline, but please add the comment about the reason to use
> PARTIAL_INT_MODE.
>
Done.
> Thanks,
> Uros.

Commited in r272668.


--
BR,
Hongtao
Martin Liška June 26, 2019, 9:21 a.m. UTC | #30
Hi.

Started from r272668 I see:

/tmp/ccqxwVjt.s: Assembler messages:

/tmp/ccqxwVjt.s:22: Error: no such instruction: `vp2intersectq .LC1(%rip),%zmm0,%k0'

/tmp/ccqxwVjt.s:33: Error: no such instruction: `vp2intersectd .LC3(%rip),%zmm0,%k0'

compiler exited with status 1
FAIL: gcc.target/i386/avx512vp2intersect-2intersect-1b.c (test for excess errors)
Excess errors:
/tmp/ccqxwVjt.s:22: Error: no such instruction: `vp2intersectq .LC1(%rip),%zmm0,%k0'
/tmp/ccqxwVjt.s:33: Error: no such instruction: `vp2intersectd .LC3(%rip),%zmm0,%k0'

You'll need a dg-require detection I guess.

Thanks,
Martin
Hongtao Liu June 26, 2019, 10:10 a.m. UTC | #31
On Wed, Jun 26, 2019 at 5:21 PM Martin Liška <mliska@suse.cz> wrote:
>
> Hi.
>
> Started from r272668 I see:
>
> /tmp/ccqxwVjt.s: Assembler messages:
>
> /tmp/ccqxwVjt.s:22: Error: no such instruction: `vp2intersectq .LC1(%rip),%zmm0,%k0'
>
> /tmp/ccqxwVjt.s:33: Error: no such instruction: `vp2intersectd .LC3(%rip),%zmm0,%k0'
>
> compiler exited with status 1
> FAIL: gcc.target/i386/avx512vp2intersect-2intersect-1b.c (test for excess errors)
> Excess errors:
> /tmp/ccqxwVjt.s:22: Error: no such instruction: `vp2intersectq .LC1(%rip),%zmm0,%k0'
> /tmp/ccqxwVjt.s:33: Error: no such instruction: `vp2intersectd .LC3(%rip),%zmm0,%k0'
>
> You'll need a dg-require detection I guess.
Yes, thank you.

>
> Thanks,
> Martin

Patch:
Index: testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c
===================================================================
--- testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c
(revision 272668)
+++ testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c (working copy)
@@ -1,5 +1,6 @@
 /* { dg-do run } */
 /* { dg-options "-O2 -mavx512vp2intersect" } */
+/* { dg-require-effective-target "avx512vp2intersect" } */

 #define AVX512F
 #include <x86intrin.h>
Index: testsuite/gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c
===================================================================
--- testsuite/gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c
(revision 272668)
+++ testsuite/gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c
(working copy)
@@ -1,5 +1,6 @@
 /* { dg-do run } */
 /* { dg-options "-O2 -mavx512vp2intersect -mavx512vl" } */
+/* { dg-require-effective-target "avx512vp2intersect" } */

 #define AVX512F
 #include <x86intrin.h>
Index: testsuite/lib/target-supports.exp
===================================================================
--- testsuite/lib/target-supports.exp (revision 272667)
+++ testsuite/lib/target-supports.exp (working copy)
@@ -7963,6 +7963,20 @@
     } "-mavx512bw" ]
 }

+# Return 1 if avx512vp2intersect instructions can be compiled.
+proc check_effective_target_avx512vp2intersect { } {
+    return [check_no_compiler_messages avx512vp2intersect object {
+ typedef int __v16si __attribute__ ((__vector_size__ (64)));
+ typedef short __mmask16;
+ void
+ _mm512_2intersect_epi32 (__v16si __A, __v16si __B, __mmask16 *__U,
+ __mmask16 *__M)
+ {
+     __builtin_ia32_2intersectd512 (__U, __M, (__v16si) __A, (__v16si) __B);
+ }
+    } "-mavx512vp2intersect" ]
+}
+
 # Return 1 if avx512ifma instructions can be compiled.
 proc check_effective_target_avx512ifma { } {
     return [check_no_compiler_messages avx512ifma object {
Rainer Orth June 26, 2019, 10:14 a.m. UTC | #32
Hi Hongtao,

> Index: testsuite/lib/target-supports.exp
> ===================================================================
> --- testsuite/lib/target-supports.exp (revision 272667)
> +++ testsuite/lib/target-supports.exp (working copy)
> @@ -7963,6 +7963,20 @@
>      } "-mavx512bw" ]
>  }
>
> +# Return 1 if avx512vp2intersect instructions can be compiled.
> +proc check_effective_target_avx512vp2intersect { } {
> +    return [check_no_compiler_messages avx512vp2intersect object {
> + typedef int __v16si __attribute__ ((__vector_size__ (64)));
> + typedef short __mmask16;
> + void
> + _mm512_2intersect_epi32 (__v16si __A, __v16si __B, __mmask16 *__U,
> + __mmask16 *__M)
> + {
> +     __builtin_ia32_2intersectd512 (__U, __M, (__v16si) __A, (__v16si) __B);
> + }
> +    } "-mavx512vp2intersect" ]
> +}
> +
>  # Return 1 if avx512ifma instructions can be compiled.
>  proc check_effective_target_avx512ifma { } {
>      return [check_no_compiler_messages avx512ifma object {

as usual, the new effective-target keyword needs documenting in
sourcebuild.texi.

	Rainer
Hongtao Liu June 27, 2019, 2:33 a.m. UTC | #33
On Wed, Jun 26, 2019 at 6:14 PM Rainer Orth <ro@cebitec.uni-bielefeld.de> wrote:
>
> Hi Hongtao,
>
> > Index: testsuite/lib/target-supports.exp
> > ===================================================================
> > --- testsuite/lib/target-supports.exp (revision 272667)
> > +++ testsuite/lib/target-supports.exp (working copy)
> > @@ -7963,6 +7963,20 @@
> >      } "-mavx512bw" ]
> >  }
> >
> > +# Return 1 if avx512vp2intersect instructions can be compiled.
> > +proc check_effective_target_avx512vp2intersect { } {
> > +    return [check_no_compiler_messages avx512vp2intersect object {
> > + typedef int __v16si __attribute__ ((__vector_size__ (64)));
> > + typedef short __mmask16;
> > + void
> > + _mm512_2intersect_epi32 (__v16si __A, __v16si __B, __mmask16 *__U,
> > + __mmask16 *__M)
> > + {
> > +     __builtin_ia32_2intersectd512 (__U, __M, (__v16si) __A, (__v16si) __B);
> > + }
> > +    } "-mavx512vp2intersect" ]
> > +}
> > +
> >  # Return 1 if avx512ifma instructions can be compiled.
> >  proc check_effective_target_avx512ifma { } {
> >      return [check_no_compiler_messages avx512ifma object {
>
> as usual, the new effective-target keyword needs documenting in
> sourcebuild.texi.
Like this?

Index: ChangeLog
===================================================================
--- ChangeLog (revision 272668)
+++ ChangeLog (working copy)
@@ -1,3 +1,8 @@
+2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
+
+ * doc/sourcebuild.texi: Document new effective target keyword
+ avx512vp2intersect.
+
 2019-06-25  Hongtao Liu  <hongtao.liu@intel.com>
      H.J. Lu  <hongjiu.lu@intel.com>
      Olga Makhotina  <olga.makhotina@intel.com>
Index: doc/sourcebuild.texi
===================================================================
--- doc/sourcebuild.texi (revision 272667)
+++ doc/sourcebuild.texi (working copy)
@@ -2046,6 +2046,9 @@
 @item avx512f_runtime
 Target supports the execution of @code{avx512f} instructions.

+@item avx512vp2intersect
+Target supports the execution of @code{avx512vp2intersect} instructions.
+
 @item cell_hw
 Test system can execute AltiVec and Cell PPU instructions.

Index: testsuite/ChangeLog
===================================================================
--- testsuite/ChangeLog (revision 272668)
+++ testsuite/ChangeLog (working copy)
@@ -1,3 +1,11 @@
+2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
+
+ * lib/target-supports.exp: Add
+ check_effective_target_avx512vp2intersect.
+ * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Add
+ dg-require-effective-target avx512vp2intersect.
+ * gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c: Ditto.
+
 2019-06-06  Hongtao Liu  <hongtao.liu@intel.com>
      Olga Makhotina  <olga.makhotina@intel.com>

Index: testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c
===================================================================
--- testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c
(revision 272668)
+++ testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c (working copy)
@@ -1,5 +1,6 @@
 /* { dg-do run } */
 /* { dg-options "-O2 -mavx512vp2intersect" } */
+/* { dg-require-effective-target "avx512vp2intersect" } */

 #define AVX512F
 #include <x86intrin.h>
Index: testsuite/gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c
===================================================================
--- testsuite/gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c
(revision 272668)
+++ testsuite/gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c
(working copy)
@@ -1,5 +1,6 @@
 /* { dg-do run } */
 /* { dg-options "-O2 -mavx512vp2intersect -mavx512vl" } */
+/* { dg-require-effective-target "avx512vp2intersect" } */

 #define AVX512F
 #include <x86intrin.h>
Index: testsuite/lib/target-supports.exp
===================================================================
--- testsuite/lib/target-supports.exp (revision 272667)
+++ testsuite/lib/target-supports.exp (working copy)
@@ -7963,6 +7963,20 @@
     } "-mavx512bw" ]
 }

+# Return 1 if avx512vp2intersect instructions can be compiled.
+proc check_effective_target_avx512vp2intersect { } {
+    return [check_no_compiler_messages avx512vp2intersect object {
+ typedef int __v16si __attribute__ ((__vector_size__ (64)));
+ typedef short __mmask16;
+ void
+ _mm512_2intersect_epi32 (__v16si __A, __v16si __B, __mmask16 *__U,
+ __mmask16 *__M)
+ {
+     __builtin_ia32_2intersectd512 (__U, __M, (__v16si) __A, (__v16si) __B);
+ }
+    } "-mavx512vp2intersect" ]
+}
+
 # Return 1 if avx512ifma instructions can be compiled.
 proc check_effective_target_avx512ifma { } {
     return [check_no_compiler_messages avx512ifma object {
>
>         Rainer
>
> --
> -----------------------------------------------------------------------------
> Rainer Orth, Center for Biotechnology, Bielefeld University
Rainer Orth June 27, 2019, 8:57 a.m. UTC | #34
Hi Hongtao,

>> as usual, the new effective-target keyword needs documenting in
>> sourcebuild.texi.
> Like this?

a couple of nits: first of all, your mailer seems to replace tabs by a
single space.  Please fix this or attach the patch instead.

> Index: ChangeLog
> ===================================================================
> --- ChangeLog (revision 272668)
> +++ ChangeLog (working copy)
> @@ -1,3 +1,8 @@
> +2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
> +
> + * doc/sourcebuild.texi: Document new effective target keyword
> + avx512vp2intersect.

Please include the sections you're modifying, something like

	* doc/sourcebuild.texi (Effective-Target Keywords, Other
	hardware attributes): Document avx512vp2intersect.

And please don't include the ChangeLog in the patch, but include it in
the mail proper: it won't apply due to date and context changes anyway.
Best review https://gcc.gnu.org/contribute.html where this is documented
besides other points of patch submission.

Besides, it's most likely useful to also review the GNU Coding
Standards, too, not only for ChangeLog formatting.

> Index: testsuite/ChangeLog
> ===================================================================
> --- testsuite/ChangeLog (revision 272668)
> +++ testsuite/ChangeLog (working copy)
> @@ -1,3 +1,11 @@
> +2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
> +
> + * lib/target-supports.exp: Add
> + check_effective_target_avx512vp2intersect.

Use

	* lib/target-supports.exp
	(check_effective_target_avx512vp2intersect): New proc.

> + * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Add
> + dg-require-effective-target avx512vp2intersect.

Better:

	* gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Require
	avx512vp2intersect.

but that's a matter of preference.

> Index: testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c
> ===================================================================
> --- testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c
> (revision 272668)
> +++ testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c (working copy)
> @@ -1,5 +1,6 @@
>  /* { dg-do run } */
>  /* { dg-options "-O2 -mavx512vp2intersect" } */
> +/* { dg-require-effective-target "avx512vp2intersect" } */

No need to quote avx512vp2intersect here and in the next test.

Ok with those nits fixed.

Thanks.
        Rainer
Hongtao Liu June 27, 2019, 9:20 a.m. UTC | #35
On Thu, Jun 27, 2019 at 5:02 PM Rainer Orth <ro@cebitec.uni-bielefeld.de> wrote:
>
> Hi Hongtao,
>
> >> as usual, the new effective-target keyword needs documenting in
> >> sourcebuild.texi.
> > Like this?
>
> a couple of nits: first of all, your mailer seems to replace tabs by a
> single space.  Please fix this or attach the patch instead.
>
> > Index: ChangeLog
> > ===================================================================
> > --- ChangeLog (revision 272668)
> > +++ ChangeLog (working copy)
> > @@ -1,3 +1,8 @@
> > +2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
> > +
> > + * doc/sourcebuild.texi: Document new effective target keyword
> > + avx512vp2intersect.
>
> Please include the sections you're modifying, something like
>
>         * doc/sourcebuild.texi (Effective-Target Keywords, Other
>         hardware attributes): Document avx512vp2intersect.
>
> And please don't include the ChangeLog in the patch, but include it in
> the mail proper: it won't apply due to date and context changes anyway.
> Best review https://gcc.gnu.org/contribute.html where this is documented
> besides other points of patch submission.
>
> Besides, it's most likely useful to also review the GNU Coding
> Standards, too, not only for ChangeLog formatting.
>
> > Index: testsuite/ChangeLog
> > ===================================================================
> > --- testsuite/ChangeLog (revision 272668)
> > +++ testsuite/ChangeLog (working copy)
> > @@ -1,3 +1,11 @@
> > +2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
> > +
> > + * lib/target-supports.exp: Add
> > + check_effective_target_avx512vp2intersect.
>
> Use
>
>         * lib/target-supports.exp
>         (check_effective_target_avx512vp2intersect): New proc.
>
> > + * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Add
> > + dg-require-effective-target avx512vp2intersect.
>
> Better:
>
>         * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Require
>         avx512vp2intersect.
>
> but that's a matter of preference.
>
> > Index: testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c
> > ===================================================================
> > --- testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c
> > (revision 272668)
> > +++ testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c (working copy)
> > @@ -1,5 +1,6 @@
> >  /* { dg-do run } */
> >  /* { dg-options "-O2 -mavx512vp2intersect" } */
> > +/* { dg-require-effective-target "avx512vp2intersect" } */
>
> No need to quote avx512vp2intersect here and in the next test.
>
> Ok with those nits fixed.
>
Yes, thanks a lot.
> Thanks.
>         Rainer
>
> --
> -----------------------------------------------------------------------------
> Rainer Orth, Center for Biotechnology, Bielefeld University

Ok for trunk?
Rainer Orth June 27, 2019, 9:38 a.m. UTC | #36
Hi Hongtao,

> On Thu, Jun 27, 2019 at 5:02 PM Rainer Orth <ro@cebitec.uni-bielefeld.de> wrote:
>>
>> Hi Hongtao,
>>
>> >> as usual, the new effective-target keyword needs documenting in
>> >> sourcebuild.texi.
>> > Like this?
>>
>> a couple of nits: first of all, your mailer seems to replace tabs by a
>> single space.  Please fix this or attach the patch instead.
>>
>> > Index: ChangeLog
>> > ===================================================================
>> > --- ChangeLog (revision 272668)
>> > +++ ChangeLog (working copy)
>> > @@ -1,3 +1,8 @@
>> > +2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
>> > +
>> > + * doc/sourcebuild.texi: Document new effective target keyword
>> > + avx512vp2intersect.
>>
>> Please include the sections you're modifying, something like
>>
>>         * doc/sourcebuild.texi (Effective-Target Keywords, Other
>>         hardware attributes): Document avx512vp2intersect.
>>
>> And please don't include the ChangeLog in the patch, but include it in
>> the mail proper: it won't apply due to date and context changes anyway.
>> Best review https://gcc.gnu.org/contribute.html where this is documented
>> besides other points of patch submission.
>>
>> Besides, it's most likely useful to also review the GNU Coding
>> Standards, too, not only for ChangeLog formatting.
>>
>> > Index: testsuite/ChangeLog
>> > ===================================================================
>> > --- testsuite/ChangeLog (revision 272668)
>> > +++ testsuite/ChangeLog (working copy)
>> > @@ -1,3 +1,11 @@
>> > +2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
>> > +
>> > + * lib/target-supports.exp: Add
>> > + check_effective_target_avx512vp2intersect.
>>
>> Use
>>
>>         * lib/target-supports.exp
>>         (check_effective_target_avx512vp2intersect): New proc.
>>
>> > + * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Add
>> > + dg-require-effective-target avx512vp2intersect.
>>
>> Better:
>>
>>         * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Require
>>         avx512vp2intersect.
>>
>> but that's a matter of preference.
>>
>> > Index: testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c
>> > ===================================================================
>> > --- testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c
>> > (revision 272668)
>> > +++ testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c
>> > (working copy)
>> > @@ -1,5 +1,6 @@
>> >  /* { dg-do run } */
>> >  /* { dg-options "-O2 -mavx512vp2intersect" } */
>> > +/* { dg-require-effective-target "avx512vp2intersect" } */
>>
>> No need to quote avx512vp2intersect here and in the next test.
>>
>> Ok with those nits fixed.
>>
> Yes, thanks a lot.
>> Thanks.
>>         Rainer
>>
>> --
>> -----------------------------------------------------------------------------
>> Rainer Orth, Center for Biotechnology, Bielefeld University
>
> Ok for trunk?

ENOPATCH
Hongtao Liu June 28, 2019, 1:23 a.m. UTC | #37
On Thu, Jun 27, 2019 at 5:38 PM Rainer Orth <ro@cebitec.uni-bielefeld.de> wrote:
>
> Hi Hongtao,
>
> > On Thu, Jun 27, 2019 at 5:02 PM Rainer Orth <ro@cebitec.uni-bielefeld.de> wrote:
> >>
> >> Hi Hongtao,
> >>
> >> >> as usual, the new effective-target keyword needs documenting in
> >> >> sourcebuild.texi.
> >> > Like this?
> >>
> >> a couple of nits: first of all, your mailer seems to replace tabs by a
> >> single space.  Please fix this or attach the patch instead.
> >>
> >> > Index: ChangeLog
> >> > ===================================================================
> >> > --- ChangeLog (revision 272668)
> >> > +++ ChangeLog (working copy)
> >> > @@ -1,3 +1,8 @@
> >> > +2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
> >> > +
> >> > + * doc/sourcebuild.texi: Document new effective target keyword
> >> > + avx512vp2intersect.
> >>
> >> Please include the sections you're modifying, something like
> >>
> >>         * doc/sourcebuild.texi (Effective-Target Keywords, Other
> >>         hardware attributes): Document avx512vp2intersect.
> >>
> >> And please don't include the ChangeLog in the patch, but include it in
> >> the mail proper: it won't apply due to date and context changes anyway.
> >> Best review https://gcc.gnu.org/contribute.html where this is documented
> >> besides other points of patch submission.
> >>
> >> Besides, it's most likely useful to also review the GNU Coding
> >> Standards, too, not only for ChangeLog formatting.
> >>
> >> > Index: testsuite/ChangeLog
> >> > ===================================================================
> >> > --- testsuite/ChangeLog (revision 272668)
> >> > +++ testsuite/ChangeLog (working copy)
> >> > @@ -1,3 +1,11 @@
> >> > +2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
> >> > +
> >> > + * lib/target-supports.exp: Add
> >> > + check_effective_target_avx512vp2intersect.
> >>
> >> Use
> >>
> >>         * lib/target-supports.exp
> >>         (check_effective_target_avx512vp2intersect): New proc.
> >>
> >> > + * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Add
> >> > + dg-require-effective-target avx512vp2intersect.
> >>
> >> Better:
> >>
> >>         * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Require
> >>         avx512vp2intersect.
> >>
> >> but that's a matter of preference.
> >>
> >> > Index: testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c
> >> > ===================================================================
> >> > --- testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c
> >> > (revision 272668)
> >> > +++ testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c
> >> > (working copy)
> >> > @@ -1,5 +1,6 @@
> >> >  /* { dg-do run } */
> >> >  /* { dg-options "-O2 -mavx512vp2intersect" } */
> >> > +/* { dg-require-effective-target "avx512vp2intersect" } */
> >>
> >> No need to quote avx512vp2intersect here and in the next test.
> >>
> >> Ok with those nits fixed.
> >>
> > Yes, thanks a lot.
> >> Thanks.
> >>         Rainer
> >>
> >> --
> >> -----------------------------------------------------------------------------
> >> Rainer Orth, Center for Biotechnology, Bielefeld University
> >
> > Ok for trunk?
>
> ENOPATCH
Sorry, Here is the patch.
>
> --
> -----------------------------------------------------------------------------
> Rainer Orth, Center for Biotechnology, Bielefeld University


Changelog

gcc/
+2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
+
+ * doc/sourcebuild.texi (Effective-Target Keywords, Other
+ hardware attributes): Document avx512vp2intersect.
+

gcc/testsuite/
+2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
+
+ * lib/target-supports.exp
+ (check_effective_target_avx512vp2intersect): New proc.
+ * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Add
+ dg-require-effective-target avx512vp2intersect.
+ * gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c: Ditto.
+
Rainer Orth June 28, 2019, 7:50 a.m. UTC | #38
Hi Hongtao,

>> > Ok for trunk?
>>
>> ENOPATCH
> Sorry, Here is the patch.

> Changelog
>
> gcc/
> +2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
> +
> + * doc/sourcebuild.texi (Effective-Target Keywords, Other
> + hardware attributes): Document avx512vp2intersect.
> +
>
> gcc/testsuite/
> +2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
> +
> + * lib/target-supports.exp
> + (check_effective_target_avx512vp2intersect): New proc.
> + * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Add
> + dg-require-effective-target avx512vp2intersect.
> + * gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c: Ditto.
> +

Ok.

Thanks.
        Rainer
Rainer Orth Aug. 6, 2019, 11:16 a.m. UTC | #39
Hi Hongtao,

> On Thu, Jun 27, 2019 at 5:38 PM Rainer Orth <ro@cebitec.uni-bielefeld.de> wrote:
>>
>> Hi Hongtao,
>>
>> > On Thu, Jun 27, 2019 at 5:02 PM Rainer Orth
>> > <ro@cebitec.uni-bielefeld.de> wrote:
>> >>
>> >> Hi Hongtao,
>> >>
>> >> >> as usual, the new effective-target keyword needs documenting in
>> >> >> sourcebuild.texi.
>> >> > Like this?
>> >>
>> >> a couple of nits: first of all, your mailer seems to replace tabs by a
>> >> single space.  Please fix this or attach the patch instead.
>> >>
>> >> > Index: ChangeLog
>> >> > ===================================================================
>> >> > --- ChangeLog (revision 272668)
>> >> > +++ ChangeLog (working copy)
>> >> > @@ -1,3 +1,8 @@
>> >> > +2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
>> >> > +
>> >> > + * doc/sourcebuild.texi: Document new effective target keyword
>> >> > + avx512vp2intersect.
>> >>
>> >> Please include the sections you're modifying, something like
>> >>
>> >>         * doc/sourcebuild.texi (Effective-Target Keywords, Other
>> >>         hardware attributes): Document avx512vp2intersect.
>> >>
>> >> And please don't include the ChangeLog in the patch, but include it in
>> >> the mail proper: it won't apply due to date and context changes anyway.
>> >> Best review https://gcc.gnu.org/contribute.html where this is documented
>> >> besides other points of patch submission.
>> >>
>> >> Besides, it's most likely useful to also review the GNU Coding
>> >> Standards, too, not only for ChangeLog formatting.
>> >>
>> >> > Index: testsuite/ChangeLog
>> >> > ===================================================================
>> >> > --- testsuite/ChangeLog (revision 272668)
>> >> > +++ testsuite/ChangeLog (working copy)
>> >> > @@ -1,3 +1,11 @@
>> >> > +2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
>> >> > +
>> >> > + * lib/target-supports.exp: Add
>> >> > + check_effective_target_avx512vp2intersect.
>> >>
>> >> Use
>> >>
>> >>         * lib/target-supports.exp
>> >>         (check_effective_target_avx512vp2intersect): New proc.
>> >>
>> >> > + * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Add
>> >> > + dg-require-effective-target avx512vp2intersect.
>> >>
>> >> Better:
>> >>
>> >>         * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Require
>> >>         avx512vp2intersect.
>> >>
>> >> but that's a matter of preference.
>> >>
>> >> > Index: testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c
>> >> > ===================================================================
>> >> > --- testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c
>> >> > (revision 272668)
>> >> > +++ testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c
>> >> > (working copy)
>> >> > @@ -1,5 +1,6 @@
>> >> >  /* { dg-do run } */
>> >> >  /* { dg-options "-O2 -mavx512vp2intersect" } */
>> >> > +/* { dg-require-effective-target "avx512vp2intersect" } */
>> >>
>> >> No need to quote avx512vp2intersect here and in the next test.
>> >>
>> >> Ok with those nits fixed.
>> >>
>> > Yes, thanks a lot.
>> >> Thanks.
>> >>         Rainer
>> >>
>> >> --
>> >> -----------------------------------------------------------------------------
>> >> Rainer Orth, Center for Biotechnology, Bielefeld University
>> >
>> > Ok for trunk?
>>
>> ENOPATCH
> Sorry, Here is the patch.
>>
>> --
>> -----------------------------------------------------------------------------
>> Rainer Orth, Center for Biotechnology, Bielefeld University
>
>
> Changelog
>
> gcc/
> +2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
> +
> + * doc/sourcebuild.texi (Effective-Target Keywords, Other
> + hardware attributes): Document avx512vp2intersect.
> +
>
> gcc/testsuite/
> +2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
> +
> + * lib/target-supports.exp
> + (check_effective_target_avx512vp2intersect): New proc.
> + * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Add
> + dg-require-effective-target avx512vp2intersect.
> + * gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c: Ditto.
> +

unfortunately, the testcases are still not right.  While with gas 2.32
they now come up as UNSUPPORTED, I've now tried a mainline bootstrap on
i386-pc-solaris2.11 with gas from binutils master.  Doing so, I get

+FAIL: gcc.target/i386/avx512vp2intersect-2intersect-1b.c execution test
+FAIL: gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c execution test

for both 32 and 64-bit, and there are similar results on
gcc-testresults for x86_64-pc-linux-gnu.

Running one of the testcases under gdb shows

Thread 2 received signal SIGILL, Illegal instruction.
[Switching to Thread 1 (LWP 1)]
0x08050d89 in do_test ()
1: x/i $pc
=> 0x8050d89 <do_test+25>:	(bad)  

or with objdump 2.32:

 8050d89:       62 f2 ff 48 68          (bad)  

Using objdump from binutils master shows

 8050d89:       62 f2 ff 48 68 05 80    vp2intersectq 0x8050a80,%zmm0,%k0

Currently, the testcases only check for AVX512F (which the machine in
question supports: Xeon Gold 6132), while they need to check for
AVX512VP2INTERSECT to avoid this.

The following patch does this; tested on i386-pc-solaris2.11 with gas
2.32.51 both 32 and 64-bit where the tests PASS.

Ok for mainline?

	Rainer
Uros Bizjak Aug. 6, 2019, 3:02 p.m. UTC | #40
On Tue, Aug 6, 2019 at 1:16 PM Rainer Orth <ro@cebitec.uni-bielefeld.de> wrote:
>
> Hi Hongtao,
>
> > On Thu, Jun 27, 2019 at 5:38 PM Rainer Orth <ro@cebitec.uni-bielefeld.de> wrote:
> >>
> >> Hi Hongtao,
> >>
> >> > On Thu, Jun 27, 2019 at 5:02 PM Rainer Orth
> >> > <ro@cebitec.uni-bielefeld.de> wrote:
> >> >>
> >> >> Hi Hongtao,
> >> >>
> >> >> >> as usual, the new effective-target keyword needs documenting in
> >> >> >> sourcebuild.texi.
> >> >> > Like this?
> >> >>
> >> >> a couple of nits: first of all, your mailer seems to replace tabs by a
> >> >> single space.  Please fix this or attach the patch instead.
> >> >>
> >> >> > Index: ChangeLog
> >> >> > ===================================================================
> >> >> > --- ChangeLog (revision 272668)
> >> >> > +++ ChangeLog (working copy)
> >> >> > @@ -1,3 +1,8 @@
> >> >> > +2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
> >> >> > +
> >> >> > + * doc/sourcebuild.texi: Document new effective target keyword
> >> >> > + avx512vp2intersect.
> >> >>
> >> >> Please include the sections you're modifying, something like
> >> >>
> >> >>         * doc/sourcebuild.texi (Effective-Target Keywords, Other
> >> >>         hardware attributes): Document avx512vp2intersect.
> >> >>
> >> >> And please don't include the ChangeLog in the patch, but include it in
> >> >> the mail proper: it won't apply due to date and context changes anyway.
> >> >> Best review https://gcc.gnu.org/contribute.html where this is documented
> >> >> besides other points of patch submission.
> >> >>
> >> >> Besides, it's most likely useful to also review the GNU Coding
> >> >> Standards, too, not only for ChangeLog formatting.
> >> >>
> >> >> > Index: testsuite/ChangeLog
> >> >> > ===================================================================
> >> >> > --- testsuite/ChangeLog (revision 272668)
> >> >> > +++ testsuite/ChangeLog (working copy)
> >> >> > @@ -1,3 +1,11 @@
> >> >> > +2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
> >> >> > +
> >> >> > + * lib/target-supports.exp: Add
> >> >> > + check_effective_target_avx512vp2intersect.
> >> >>
> >> >> Use
> >> >>
> >> >>         * lib/target-supports.exp
> >> >>         (check_effective_target_avx512vp2intersect): New proc.
> >> >>
> >> >> > + * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Add
> >> >> > + dg-require-effective-target avx512vp2intersect.
> >> >>
> >> >> Better:
> >> >>
> >> >>         * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Require
> >> >>         avx512vp2intersect.
> >> >>
> >> >> but that's a matter of preference.
> >> >>
> >> >> > Index: testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c
> >> >> > ===================================================================
> >> >> > --- testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c
> >> >> > (revision 272668)
> >> >> > +++ testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c
> >> >> > (working copy)
> >> >> > @@ -1,5 +1,6 @@
> >> >> >  /* { dg-do run } */
> >> >> >  /* { dg-options "-O2 -mavx512vp2intersect" } */
> >> >> > +/* { dg-require-effective-target "avx512vp2intersect" } */
> >> >>
> >> >> No need to quote avx512vp2intersect here and in the next test.
> >> >>
> >> >> Ok with those nits fixed.
> >> >>
> >> > Yes, thanks a lot.
> >> >> Thanks.
> >> >>         Rainer
> >> >>
> >> >> --
> >> >> -----------------------------------------------------------------------------
> >> >> Rainer Orth, Center for Biotechnology, Bielefeld University
> >> >
> >> > Ok for trunk?
> >>
> >> ENOPATCH
> > Sorry, Here is the patch.
> >>
> >> --
> >> -----------------------------------------------------------------------------
> >> Rainer Orth, Center for Biotechnology, Bielefeld University
> >
> >
> > Changelog
> >
> > gcc/
> > +2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
> > +
> > + * doc/sourcebuild.texi (Effective-Target Keywords, Other
> > + hardware attributes): Document avx512vp2intersect.
> > +
> >
> > gcc/testsuite/
> > +2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
> > +
> > + * lib/target-supports.exp
> > + (check_effective_target_avx512vp2intersect): New proc.
> > + * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Add
> > + dg-require-effective-target avx512vp2intersect.
> > + * gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c: Ditto.
> > +
>
> unfortunately, the testcases are still not right.  While with gas 2.32
> they now come up as UNSUPPORTED, I've now tried a mainline bootstrap on
> i386-pc-solaris2.11 with gas from binutils master.  Doing so, I get
>
> +FAIL: gcc.target/i386/avx512vp2intersect-2intersect-1b.c execution test
> +FAIL: gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c execution test
>
> for both 32 and 64-bit, and there are similar results on
> gcc-testresults for x86_64-pc-linux-gnu.
>
> Running one of the testcases under gdb shows
>
> Thread 2 received signal SIGILL, Illegal instruction.
> [Switching to Thread 1 (LWP 1)]
> 0x08050d89 in do_test ()
> 1: x/i $pc
> => 0x8050d89 <do_test+25>:      (bad)
>
> or with objdump 2.32:
>
>  8050d89:       62 f2 ff 48 68          (bad)
>
> Using objdump from binutils master shows
>
>  8050d89:       62 f2 ff 48 68 05 80    vp2intersectq 0x8050a80,%zmm0,%k0
>
> Currently, the testcases only check for AVX512F (which the machine in
> question supports: Xeon Gold 6132), while they need to check for
> AVX512VP2INTERSECT to avoid this.
>
> The following patch does this; tested on i386-pc-solaris2.11 with gas
> 2.32.51 both 32 and 64-bit where the tests PASS.
>
> Ok for mainline?

OK.

Thanks,
Uros.

>
>         Rainer
>
> --
> -----------------------------------------------------------------------------
> Rainer Orth, Center for Biotechnology, Bielefeld University
>
>
> 2019-08-06  Rainer Orth  <ro@CeBiTec.Uni-Bielefeld.DE>
>
>         * gcc.target/i386/avx512vp2intersect-2intersect-1b.c (AVX512F):
>         Remove.
>         (AVX512VP2INTERSECT): Define.
>         * gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c (AVX512F):
>         Remove.
>         (AVX512VP2INTERSECT): Define.
>
Hongtao Liu Aug. 7, 2019, 3:18 a.m. UTC | #41
On Tue, Aug 6, 2019 at 11:02 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Tue, Aug 6, 2019 at 1:16 PM Rainer Orth <ro@cebitec.uni-bielefeld.de> wrote:
> >
> > Hi Hongtao,
> >
> > > On Thu, Jun 27, 2019 at 5:38 PM Rainer Orth <ro@cebitec.uni-bielefeld.de> wrote:
> > >>
> > >> Hi Hongtao,
> > >>
> > >> > On Thu, Jun 27, 2019 at 5:02 PM Rainer Orth
> > >> > <ro@cebitec.uni-bielefeld.de> wrote:
> > >> >>
> > >> >> Hi Hongtao,
> > >> >>
> > >> >> >> as usual, the new effective-target keyword needs documenting in
> > >> >> >> sourcebuild.texi.
> > >> >> > Like this?
> > >> >>
> > >> >> a couple of nits: first of all, your mailer seems to replace tabs by a
> > >> >> single space.  Please fix this or attach the patch instead.
> > >> >>
> > >> >> > Index: ChangeLog
> > >> >> > ===================================================================
> > >> >> > --- ChangeLog (revision 272668)
> > >> >> > +++ ChangeLog (working copy)
> > >> >> > @@ -1,3 +1,8 @@
> > >> >> > +2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
> > >> >> > +
> > >> >> > + * doc/sourcebuild.texi: Document new effective target keyword
> > >> >> > + avx512vp2intersect.
> > >> >>
> > >> >> Please include the sections you're modifying, something like
> > >> >>
> > >> >>         * doc/sourcebuild.texi (Effective-Target Keywords, Other
> > >> >>         hardware attributes): Document avx512vp2intersect.
> > >> >>
> > >> >> And please don't include the ChangeLog in the patch, but include it in
> > >> >> the mail proper: it won't apply due to date and context changes anyway.
> > >> >> Best review https://gcc.gnu.org/contribute.html where this is documented
> > >> >> besides other points of patch submission.
> > >> >>
> > >> >> Besides, it's most likely useful to also review the GNU Coding
> > >> >> Standards, too, not only for ChangeLog formatting.
> > >> >>
> > >> >> > Index: testsuite/ChangeLog
> > >> >> > ===================================================================
> > >> >> > --- testsuite/ChangeLog (revision 272668)
> > >> >> > +++ testsuite/ChangeLog (working copy)
> > >> >> > @@ -1,3 +1,11 @@
> > >> >> > +2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
> > >> >> > +
> > >> >> > + * lib/target-supports.exp: Add
> > >> >> > + check_effective_target_avx512vp2intersect.
> > >> >>
> > >> >> Use
> > >> >>
> > >> >>         * lib/target-supports.exp
> > >> >>         (check_effective_target_avx512vp2intersect): New proc.
> > >> >>
> > >> >> > + * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Add
> > >> >> > + dg-require-effective-target avx512vp2intersect.
> > >> >>
> > >> >> Better:
> > >> >>
> > >> >>         * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Require
> > >> >>         avx512vp2intersect.
> > >> >>
> > >> >> but that's a matter of preference.
> > >> >>
> > >> >> > Index: testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c
> > >> >> > ===================================================================
> > >> >> > --- testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c
> > >> >> > (revision 272668)
> > >> >> > +++ testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c
> > >> >> > (working copy)
> > >> >> > @@ -1,5 +1,6 @@
> > >> >> >  /* { dg-do run } */
> > >> >> >  /* { dg-options "-O2 -mavx512vp2intersect" } */
> > >> >> > +/* { dg-require-effective-target "avx512vp2intersect" } */
> > >> >>
> > >> >> No need to quote avx512vp2intersect here and in the next test.
> > >> >>
> > >> >> Ok with those nits fixed.
> > >> >>
> > >> > Yes, thanks a lot.
> > >> >> Thanks.
> > >> >>         Rainer
> > >> >>
> > >> >> --
> > >> >> -----------------------------------------------------------------------------
> > >> >> Rainer Orth, Center for Biotechnology, Bielefeld University
> > >> >
> > >> > Ok for trunk?
> > >>
> > >> ENOPATCH
> > > Sorry, Here is the patch.
> > >>
> > >> --
> > >> -----------------------------------------------------------------------------
> > >> Rainer Orth, Center for Biotechnology, Bielefeld University
> > >
> > >
> > > Changelog
> > >
> > > gcc/
> > > +2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
> > > +
> > > + * doc/sourcebuild.texi (Effective-Target Keywords, Other
> > > + hardware attributes): Document avx512vp2intersect.
> > > +
> > >
> > > gcc/testsuite/
> > > +2019-06-27  Hongtao Liu  <hongtao.liu@intel.com>
> > > +
> > > + * lib/target-supports.exp
> > > + (check_effective_target_avx512vp2intersect): New proc.
> > > + * gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Add
> > > + dg-require-effective-target avx512vp2intersect.
> > > + * gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c: Ditto.
> > > +
> >
> > unfortunately, the testcases are still not right.  While with gas 2.32
> > they now come up as UNSUPPORTED, I've now tried a mainline bootstrap on
> > i386-pc-solaris2.11 with gas from binutils master.  Doing so, I get
> >
> > +FAIL: gcc.target/i386/avx512vp2intersect-2intersect-1b.c execution test
> > +FAIL: gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c execution test
> >
> > for both 32 and 64-bit, and there are similar results on
> > gcc-testresults for x86_64-pc-linux-gnu.
> >
> > Running one of the testcases under gdb shows
> >
> > Thread 2 received signal SIGILL, Illegal instruction.
> > [Switching to Thread 1 (LWP 1)]
> > 0x08050d89 in do_test ()
> > 1: x/i $pc
> > => 0x8050d89 <do_test+25>:      (bad)
> >
> > or with objdump 2.32:
> >
> >  8050d89:       62 f2 ff 48 68          (bad)
> >
> > Using objdump from binutils master shows
> >
> >  8050d89:       62 f2 ff 48 68 05 80    vp2intersectq 0x8050a80,%zmm0,%k0
> >
> > Currently, the testcases only check for AVX512F (which the machine in
> > question supports: Xeon Gold 6132), while they need to check for
> > AVX512VP2INTERSECT to avoid this.
> >
> > The following patch does this; tested on i386-pc-solaris2.11 with gas
> > 2.32.51 both 32 and 64-bit where the tests PASS.
> >
> > Ok for mainline?
>
> OK.
>
> Thanks,
> Uros.
>
> >
> >         Rainer
> >
> > --
> > -----------------------------------------------------------------------------
> > Rainer Orth, Center for Biotechnology, Bielefeld University
> >
> >
> > 2019-08-06  Rainer Orth  <ro@CeBiTec.Uni-Bielefeld.DE>
> >
> >         * gcc.target/i386/avx512vp2intersect-2intersect-1b.c (AVX512F):
> >         Remove.
> >         (AVX512VP2INTERSECT): Define.
> >         * gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c (AVX512F):
> >         Remove.
> >         (AVX512VP2INTERSECT): Define.
> >

Sorry for mistake and thanks.
diff mbox series

Patch

Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog	(revision 271984)
+++ gcc/ChangeLog	(working copy)
@@ -1,3 +1,53 @@ 
+2019-06-06  Hongtao Liu  <hongtao.liu@intel.com>
+	    H.J. Lu  <hongjiu.lu@intel.com>
+	    Olga Makhotina  <olga.makhotina@intel.com>
+
+	* common/config/i386/i386-common.c
+	(OPTION_MASK_ISA_AVX512VP2INTERSECT_SET,
+	OPTION_MASK_ISA_AVX512VP2INTERSECT_UNSET): New macros.
+	(OPTION_MASK_ISA2_AVX512F_UNSET): Add
+	OPTION_MASK_ISA_AVX512VP2INTERSECT_UNSET.
+	(ix86_handle_option): Handle -mavx512vp2intersect.
+	* config/i386/avx512vp2intersectintrin.h: New.
+	* config/i386/avx512vp2intersectvlintrin.h: New.
+	* config/i386/cpuid.h (bit_AVX512VP2INTERSECT): New.
+	* config/i386/driver-i386.c (host_detect_local_cpu): Detect
+	AVX512VP2INTERSECT.
+	* config/i386/i386-builtin-types.def: Add new types.
+	* config/i386/i386-builtin.def: Add new builtins.
+	* config/i386/i386-builtins.c: (enum processor_features): Add
+	F_AVX512VP2INTERSECT.
+	(static const _isa_names_table isa_names_table): Ditto.
+	* config/i386/i386-c.c (ix86_target_macros_internal): Define
+	__AVX512VP2INTERSECT__.
+	* config/i386/i386-expand.c (ix86_expand_builtin): Expand
+	IX86_BUILTIN_2INTERSECTD512, IX86_BUILTIN_2INTERSECTQ512,
+	IX86_BUILTIN_2INTERSECTD256, IX86_BUILTIN_2INTERSECTQ256,
+	IX86_BUILTIN_2INTERSECTD128, IX86_BUILTIN_2INTERSECTQ128.
+	* config/i386/i386-modes.def (P2QI, P2HI): New modes.
+	* config/i386/i386-options.c (ix86_target_string): Add
+	-mavx512vp2intersect.
+	(ix86_option_override_internal): Handle AVX512VP2INTERSECT.
+	* config/i386/i386.c (ix86_hard_regno_nregs): Allocate two regs for
+	P2HImode and P2QImode.
+	(ix86_hard_regno_mode_ok): Register pair only starts at even hardreg
+	number for P2QImode and P2HImode.
+	* config/i386/i386.h (TARGET_AVX512VP2INTERSECT,
+	TARGET_AVX512VP2INTERSECT_P): New.
+	(PTA_AVX512VP2INTERSECT): Ditto.
+	* config/i386/i386.opt: Add -mavx512vp2intersect.
+	* config/i386/immintrin.h: Include avx512vp2intersectintrin.h and
+	avx512vp2intersectvlintrin.h.
+	* config/i386/sse.md (define_c_enum "unspec"): Add UNSPEC_VP2INTERSECT.
+	(define_mode_iterator VI48_AVX512VP2VL): New.
+	(avx512vp2intersect_2intersect<mode>,
+	avx512vp2intersect_2intersectv16si): New define_insn patterns.
+	(*vec_extractp2hi, *vec_extractp2qi): New define_insn_and_split
+	patterns.
+	* config.gcc: Add avx512vp2intersectvlintrin.h and
+	avx512vp2intersectintrin.h to extra_headers.
+	* doc/invoke.texi: Document -mavx512vp2intersect.
+
 2019-06-05  Hongtao Liu  <hongtao.liu@intel.com>
 
 	* config/i386/sse.md (define_mode_suffix vecmemsuffix): New.
Index: gcc/common/config/i386/i386-common.c
===================================================================
--- gcc/common/config/i386/i386-common.c	(revision 271984)
+++ gcc/common/config/i386/i386-common.c	(working copy)
@@ -100,6 +100,7 @@ 
 #define OPTION_MASK_ISA_XSAVEC_SET \
   (OPTION_MASK_ISA_XSAVEC | OPTION_MASK_ISA_XSAVE_SET)
 #define OPTION_MASK_ISA_CLWB_SET OPTION_MASK_ISA_CLWB
+#define OPTION_MASK_ISA_AVX512VP2INTERSECT_SET OPTION_MASK_ISA_AVX512VP2INTERSECT
 
 /* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same
    as -msse4.2.  */
@@ -240,6 +241,7 @@ 
 #define OPTION_MASK_ISA_WAITPKG_UNSET OPTION_MASK_ISA_WAITPKG
 #define OPTION_MASK_ISA_CLDEMOTE_UNSET OPTION_MASK_ISA_CLDEMOTE
 #define OPTION_MASK_ISA_ENQCMD_UNSET OPTION_MASK_ISA_ENQCMD
+#define OPTION_MASK_ISA_AVX512VP2INTERSECT_UNSET OPTION_MASK_ISA_AVX512VP2INTERSECT
 
 /* SSE4 includes both SSE4.1 and SSE4.2.  -mno-sse4 should the same
    as -mno-sse4.1. */
@@ -282,7 +284,8 @@ 
 #define OPTION_MASK_ISA2_AVX512F_UNSET \
   (OPTION_MASK_ISA_AVX512BF16_UNSET \
    | OPTION_MASK_ISA_AVX5124FMAPS_UNSET \
-   | OPTION_MASK_ISA_AVX5124VNNIW_UNSET)
+   | OPTION_MASK_ISA_AVX5124VNNIW_UNSET \
+   | OPTION_MASK_ISA_AVX512VP2INTERSECT_UNSET)
 #define OPTION_MASK_ISA2_GENERAL_REGS_ONLY_UNSET \
   (OPTION_MASK_ISA2_AVX512F_UNSET)
 
@@ -880,6 +883,21 @@ 
 	}
       return true;
 
+    case OPT_mavx512vp2intersect:
+      if (value)
+        {
+          opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512VP2INTERSECT_SET;
+          opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA_AVX512VP2INTERSECT_SET;
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F_SET;
+	  opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX512F_SET;
+        }
+      else
+        {
+          opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA_AVX512VP2INTERSECT_UNSET;
+          opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA_AVX512VP2INTERSECT_UNSET;
+        }
+      return true;
+
     case OPT_mfma:
       if (value)
 	{
Index: gcc/config/i386/avx512vp2intersectintrin.h
===================================================================
--- gcc/config/i386/avx512vp2intersectintrin.h	(nonexistent)
+++ gcc/config/i386/avx512vp2intersectintrin.h	(working copy)
@@ -0,0 +1,35 @@ 
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vp2intersectintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VP2INTERSECTINTRIN_H_INCLUDED
+#define _AVX512VP2INTERSECTINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VP2INTERSECT__)
+#pragma GCC push_options
+#pragma GCC target("avx512vp2intersect")
+#define __DISABLE_AVX512VP2INTERSECT__
+#endif /* __AVX512VP2INTERSECT__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_2intersect_epi32 (__m512i __A, __m512i __B, __mmask16 *__U,
+			 __mmask16 *__M)
+{
+  __builtin_ia32_2intersectd512 (__U, __M, (__v16si) __A, (__v16si) __B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_2intersect_epi64 (__m512i __A, __m512i __B, __mmask8 *__U,
+			 __mmask8 *__M)
+{
+  __builtin_ia32_2intersectq512 (__U, __M, (__v8di) __A, (__v8di) __B);
+}
+
+#ifdef __DISABLE_AVX512VP2INTERSECT__
+#undef __DISABLE_AVX512VP2INTERSECT__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VP2INTERSECT__ */
+
+#endif /* _AVX512VP2INTERSECTINTRIN_H_INCLUDED */
Index: gcc/config/i386/avx512vp2intersectvlintrin.h
===================================================================
--- gcc/config/i386/avx512vp2intersectvlintrin.h	(nonexistent)
+++ gcc/config/i386/avx512vp2intersectvlintrin.h	(working copy)
@@ -0,0 +1,49 @@ 
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vp2intersectintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VP2INTERSECTVLINTRIN_H_INCLUDED
+#define _AVX512VP2INTERSECTVLINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VP2INTERSECT__) || !defined(__AVX512VL__)
+#pragma GCC push_options
+#pragma GCC target("avx512vp2intersect,avx512vl")
+#define __DISABLE_AVX512VP2INTERSECTVL__
+#endif /* __AVX512VP2INTERSECTVL__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_2intersect_epi32 (__m128i __A, __m128i __B, __mmask8 *__U, __mmask8 *__M)
+{
+  __builtin_ia32_2intersectd128 (__U, __M, (__v4si) __A, (__v4si) __B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_2intersect_epi32 (__m256i __A, __m256i __B, __mmask8 *__U,
+			 __mmask8 *__M)
+{
+  __builtin_ia32_2intersectd256 (__U, __M, (__v8si) __A, (__v8si) __B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_2intersect_epi64 (__m128i __A, __m128i __B, __mmask8 *__U, __mmask8 *__M)
+{
+  __builtin_ia32_2intersectq128 (__U, __M, (__v2di) __A, (__v2di) __B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_2intersect_epi64 (__m256i __A, __m256i __B, __mmask8 *__U,
+			 __mmask8 *__M)
+{
+  __builtin_ia32_2intersectq256 (__U, __M, (__v4di) __A, (__v4di) __B);
+}
+
+#ifdef __DISABLE_AVX512VP2INTERSECTVL__
+#undef __DISABLE_AVX512VP2INTERSECTVL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VP2INTERSECTVL__ */
+
+#endif /* _AVX512VP2INTERSECTVLINTRIN_H_INCLUDED */
Index: gcc/config/i386/cpuid.h
===================================================================
--- gcc/config/i386/cpuid.h	(revision 271984)
+++ gcc/config/i386/cpuid.h	(working copy)
@@ -119,6 +119,7 @@ 
 /* %edx */
 #define bit_AVX5124VNNIW (1 << 2)
 #define bit_AVX5124FMAPS (1 << 3)
+#define bit_AVX512VP2INTERSECT	(1 << 8)
 #define bit_IBT	(1 << 20)
 #define bit_PCONFIG	(1 << 18)
 /* XFEATURE_ENABLED_MASK register bits (%eax == 13, %ecx == 0) */
Index: gcc/config/i386/driver-i386.c
===================================================================
--- gcc/config/i386/driver-i386.c	(revision 271984)
+++ gcc/config/i386/driver-i386.c	(working copy)
@@ -423,6 +423,7 @@ 
   unsigned int has_shstk = 0;
   unsigned int has_avx512vnni = 0, has_vaes = 0;
   unsigned int has_vpclmulqdq = 0;
+  unsigned int has_avx512vp2intersect = 0;
   unsigned int has_movdiri = 0, has_movdir64b = 0;
   unsigned int has_enqcmd = 0;
   unsigned int has_waitpkg = 0;
@@ -532,6 +533,7 @@ 
 
       has_avx5124vnniw = edx & bit_AVX5124VNNIW;
       has_avx5124fmaps = edx & bit_AVX5124FMAPS;
+      has_avx512vp2intersect = edx & bit_AVX512VP2INTERSECT;
 
       has_shstk = ecx & bit_SHSTK;
       has_pconfig = edx & bit_PCONFIG;
@@ -1143,6 +1145,7 @@ 
       const char *shstk = has_shstk ? " -mshstk" : " -mno-shstk";
       const char *vaes = has_vaes ? " -mvaes" : " -mno-vaes";
       const char *vpclmulqdq = has_vpclmulqdq ? " -mvpclmulqdq" : " -mno-vpclmulqdq";
+      const char *avx512vp2intersect = has_avx512vp2intersect ? " -mavx512vp2intersect" : " -mno-avx512vp2intersect";
       const char *avx512bitalg = has_avx512bitalg ? " -mavx512bitalg" : " -mno-avx512bitalg";
       const char *movdiri = has_movdiri ? " -mmovdiri" : " -mno-movdiri";
       const char *movdir64b = has_movdir64b ? " -mmovdir64b" : " -mno-movdir64b";
@@ -1165,7 +1168,7 @@ 
 			clwb, mwaitx, clzero, pku, rdpid, gfni, shstk,
 			avx512vbmi2, avx512vnni, vaes, vpclmulqdq,
 			avx512bitalg, movdiri, movdir64b, waitpkg, cldemote,
-			ptwrite, avx512bf16, enqcmd,
+			ptwrite, avx512bf16, enqcmd, avx512vp2intersect,
 			NULL);
     }
 
Index: gcc/config/i386/i386-builtin-types.def
===================================================================
--- gcc/config/i386/i386-builtin-types.def	(revision 271984)
+++ gcc/config/i386/i386-builtin-types.def	(working copy)
@@ -975,6 +975,13 @@ 
 DEF_FUNCTION_TYPE (QI, V4SF, INT, UQI)
 DEF_FUNCTION_TYPE (VOID, PV32QI, V32HI, USI)
 
+DEF_FUNCTION_TYPE (VOID, PUHI, PUHI, V16SI, V16SI)
+DEF_FUNCTION_TYPE (VOID, PUQI, PUQI, V8SI, V8SI)
+DEF_FUNCTION_TYPE (VOID, PUQI, PUQI, V4SI, V4SI)
+DEF_FUNCTION_TYPE (VOID, PUQI, PUQI, V8DI, V8DI)
+DEF_FUNCTION_TYPE (VOID, PUQI, PUQI, V4DI, V4DI)
+DEF_FUNCTION_TYPE (VOID, PUQI, PUQI, V2DI, V2DI)
+
 DEF_FUNCTION_TYPE (V2DI, V2DI, V2DI, UINT, UINT)
 DEF_FUNCTION_TYPE (V4HI, HI, HI, HI, HI)
 
Index: gcc/config/i386/i386-builtin.def
===================================================================
--- gcc/config/i386/i386-builtin.def	(revision 271984)
+++ gcc/config/i386/i386-builtin.def	(working copy)
@@ -288,6 +288,14 @@ 
 BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_storev32hi_mask, "__builtin_ia32_storedquhi512_mask", IX86_BUILTIN_STOREDQUHI512_MASK, UNKNOWN, (int) VOID_FTYPE_PSHORT_V32HI_USI)
 BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_storev64qi_mask, "__builtin_ia32_storedquqi512_mask", IX86_BUILTIN_STOREDQUQI512_MASK, UNKNOWN, (int) VOID_FTYPE_PCHAR_V64QI_UDI)
 
+/* AVX512VP2INTERSECT */
+BDESC (0, OPTION_MASK_ISA_AVX512VP2INTERSECT, CODE_FOR_nothing, "__builtin_ia32_2intersectd512", IX86_BUILTIN_2INTERSECTD512, UNKNOWN, (int) VOID_FTYPE_PUHI_PUHI_V16SI_V16SI)
+BDESC (0, OPTION_MASK_ISA_AVX512VP2INTERSECT, CODE_FOR_nothing, "__builtin_ia32_2intersectq512", IX86_BUILTIN_2INTERSECTQ512, UNKNOWN, (int) VOID_FTYPE_PUQI_PUQI_V8DI_V8DI)
+BDESC (0, OPTION_MASK_ISA_AVX512VP2INTERSECT, CODE_FOR_nothing, "__builtin_ia32_2intersectd256", IX86_BUILTIN_2INTERSECTD256, UNKNOWN, (int) VOID_FTYPE_PUQI_PUQI_V8SI_V8SI)
+BDESC (0, OPTION_MASK_ISA_AVX512VP2INTERSECT, CODE_FOR_nothing, "__builtin_ia32_2intersectq256", IX86_BUILTIN_2INTERSECTQ256, UNKNOWN, (int) VOID_FTYPE_PUQI_PUQI_V4DI_V4DI)
+BDESC (0, OPTION_MASK_ISA_AVX512VP2INTERSECT, CODE_FOR_nothing, "__builtin_ia32_2intersectd128", IX86_BUILTIN_2INTERSECTD128, UNKNOWN, (int) VOID_FTYPE_PUQI_PUQI_V4SI_V4SI)
+BDESC (0, OPTION_MASK_ISA_AVX512VP2INTERSECT, CODE_FOR_nothing, "__builtin_ia32_2intersectq128", IX86_BUILTIN_2INTERSECTQ128, UNKNOWN, (int) VOID_FTYPE_PUQI_PUQI_V2DI_V2DI)
+
 /* AVX512VL */
 BDESC (OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_loadv16hi_mask, "__builtin_ia32_loaddquhi256_mask", IX86_BUILTIN_LOADDQUHI256_MASK, UNKNOWN, (int) V16HI_FTYPE_PCSHORT_V16HI_UHI)
 BDESC (OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_loadv8hi_mask, "__builtin_ia32_loaddquhi128_mask", IX86_BUILTIN_LOADDQUHI128_MASK, UNKNOWN, (int) V8HI_FTYPE_PCSHORT_V8HI_UQI)
Index: gcc/config/i386/i386-builtins.c
===================================================================
--- gcc/config/i386/i386-builtins.c	(revision 271984)
+++ gcc/config/i386/i386-builtins.c	(working copy)
@@ -1924,6 +1924,7 @@ 
   F_VPCLMULQDQ,
   F_AVX512VNNI,
   F_AVX512BITALG,
+  F_AVX512VP2INTERSECT,
   F_AVX512BF16,
   F_MAX
 };
@@ -2070,6 +2071,7 @@ 
   {"vpclmulqdq", F_VPCLMULQDQ, P_ZERO},
   {"avx512vnni", F_AVX512VNNI, P_ZERO},
   {"avx512bitalg", F_AVX512BITALG, P_ZERO},
+  {"avx512vp2intersect",F_AVX512VP2INTERSECT, P_ZERO},
   {"avx512bf16", F_AVX512BF16, P_ZERO}
 };
 
Index: gcc/config/i386/i386-c.c
===================================================================
--- gcc/config/i386/i386-c.c	(revision 271984)
+++ gcc/config/i386/i386-c.c	(working copy)
@@ -404,6 +404,8 @@ 
 
   if (isa_flag2 & OPTION_MASK_ISA_WBNOINVD)
     def_or_undef (parse_in, "__WBNOINVD__");
+  if (isa_flag2 & OPTION_MASK_ISA_AVX512VP2INTERSECT)
+    def_or_undef (parse_in, "__AVX512VP2INTERSECT__");
   if (isa_flag & OPTION_MASK_ISA_MMX)
     def_or_undef (parse_in, "__MMX__");
   if (isa_flag & OPTION_MASK_ISA_3DNOW)
Index: gcc/config/i386/i386-expand.c
===================================================================
--- gcc/config/i386/i386-expand.c	(revision 271984)
+++ gcc/config/i386/i386-expand.c	(working copy)
@@ -11357,6 +11357,76 @@ 
       emit_move_insn (target, op0);
       return target;
 
+    case IX86_BUILTIN_2INTERSECTD512:
+    case IX86_BUILTIN_2INTERSECTQ512:
+    case IX86_BUILTIN_2INTERSECTD256:
+    case IX86_BUILTIN_2INTERSECTQ256:
+    case IX86_BUILTIN_2INTERSECTD128:
+    case IX86_BUILTIN_2INTERSECTQ128:
+      arg0 = CALL_EXPR_ARG (exp, 0);
+      arg1 = CALL_EXPR_ARG (exp, 1);
+      arg2 = CALL_EXPR_ARG (exp, 2);
+      arg3 = CALL_EXPR_ARG (exp, 3);
+      op0 = expand_normal (arg0);
+      op1 = expand_normal (arg1);
+      op2 = expand_normal (arg2);
+      op3 = expand_normal (arg3);
+
+      if (!address_operand (op0, VOIDmode))
+	{
+	  op0 = convert_memory_address (Pmode, op0);
+	  op0 = copy_addr_to_reg (op0);
+	}
+      if (!address_operand (op1, VOIDmode))
+	{
+	  op1 = convert_memory_address (Pmode, op1);
+	  op1 = copy_addr_to_reg (op1);
+	}
+      op2 = copy_to_reg (op2);
+      op3 = copy_to_reg (op3);
+
+      switch (fcode)
+	{
+	case IX86_BUILTIN_2INTERSECTD512:
+	  mode4 = P2HImode;
+	  icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
+	  break;
+	case IX86_BUILTIN_2INTERSECTQ512:
+	  mode4 = P2QImode;
+	  icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
+	  break;
+	case IX86_BUILTIN_2INTERSECTD256:
+	  mode4 = P2QImode;
+	  icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
+	  break;
+	case IX86_BUILTIN_2INTERSECTQ256:
+	  mode4 = P2QImode;
+	  icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
+	  break;
+	case IX86_BUILTIN_2INTERSECTD128:
+	  mode4 = P2QImode;
+	  icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
+	  break;
+	case IX86_BUILTIN_2INTERSECTQ128:
+	  mode4 = P2QImode;
+	  icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+
+      op4 = gen_reg_rtx (mode4);
+      emit_insn (GEN_FCN (icode) (op4, op2, op3));
+      mode0 = GET_MODE_INNER (GET_MODE (op4));
+      pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (0)));
+      pat2 = gen_rtx_VEC_SELECT (mode0, op4, pat);
+      emit_move_insn (gen_rtx_MEM (mode0, op0), pat2);
+      pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1)));
+      pat2 = gen_rtx_VEC_SELECT (mode0, op4, pat);
+      emit_move_insn (gen_rtx_MEM (mode0, op1), pat2);
+
+      return 0;
+
     case IX86_BUILTIN_RDPMC:
     case IX86_BUILTIN_RDTSC:
     case IX86_BUILTIN_RDTSCP:
Index: gcc/config/i386/i386-modes.def
===================================================================
--- gcc/config/i386/i386-modes.def	(revision 271984)
+++ gcc/config/i386/i386-modes.def	(working copy)
@@ -101,6 +101,10 @@ 
 INT_MODE (OI, 32);
 INT_MODE (XI, 64);
 
+/* Register pair.  */
+VECTOR_MODES_WITH_PREFIX (P, INT, 2);	/*	P2QI */
+VECTOR_MODES_WITH_PREFIX (P, INT, 4);	/* P2HI P4QI */
+
 /* Keep the OI and XI modes from confusing the compiler into thinking
    that these modes could actually be used for computation.  They are
    only holders for vectors during data movement.  */
Index: gcc/config/i386/i386-options.c
===================================================================
--- gcc/config/i386/i386-options.c	(revision 271984)
+++ gcc/config/i386/i386-options.c	(working copy)
@@ -199,6 +199,7 @@ 
     { "-mrdpid",	OPTION_MASK_ISA_RDPID },
     { "-mpconfig",	OPTION_MASK_ISA_PCONFIG },
     { "-mwbnoinvd",     OPTION_MASK_ISA_WBNOINVD },
+    { "-mavx512vp2intersect", OPTION_MASK_ISA_AVX512VP2INTERSECT },
     { "-msgx",		OPTION_MASK_ISA_SGX },
     { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
     { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
@@ -852,6 +853,7 @@ 
     IX86_ATTR_ISA ("avx512vbmi2", OPT_mavx512vbmi2),
     IX86_ATTR_ISA ("avx512vnni", OPT_mavx512vnni),
     IX86_ATTR_ISA ("avx512bitalg", OPT_mavx512bitalg),
+    IX86_ATTR_ISA ("avx512vp2intersect", OPT_mavx512vp2intersect),
 
     IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
     IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
@@ -2027,6 +2029,10 @@ 
 	    & OPTION_MASK_ISA_AVX512BITALG))
 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BITALG;
 
+	if (((processor_alias_table[i].flags & PTA_AVX512VP2INTERSECT) != 0)
+	    && !(opts->x_ix86_isa_flags2_explicit
+		 & OPTION_MASK_ISA_AVX512VP2INTERSECT))
+	  opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512VP2INTERSECT;
 	if (((processor_alias_table[i].flags & PTA_AVX5124VNNIW) != 0)
 	    && !(opts->x_ix86_isa_flags2_explicit
 		 & OPTION_MASK_ISA_AVX5124VNNIW))
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c	(revision 271984)
+++ gcc/config/i386/i386.c	(working copy)
@@ -18682,6 +18682,9 @@ 
     }
   if (COMPLEX_MODE_P (mode))
     return 2;
+  /* Register pair for mask registers.  */
+  if (mode == P2QImode || mode == P2HImode)
+    return 2;
   if (mode == V64SFmode || mode == V64SImode)
     return 4;
   return 1;
@@ -18702,9 +18705,16 @@ 
   if (STACK_REGNO_P (regno))
     return VALID_FP_MODE_P (mode);
   if (MASK_REGNO_P (regno))
-    return (VALID_MASK_REG_MODE (mode)
-	    || (TARGET_AVX512BW
-		&& VALID_MASK_AVX512BW_MODE (mode)));
+    {
+      /* Register pair only starts at even register number.  */
+      if ((mode == P2QImode || mode == P2HImode))
+	return (regno & 1) == 0;
+
+      return (VALID_MASK_REG_MODE (mode)
+	      || (TARGET_AVX512BW
+		  && VALID_MASK_AVX512BW_MODE (mode)));
+    }
+
   if (SSE_REGNO_P (regno))
     {
       /* We implement the move patterns for all vector modes into and
Index: gcc/config/i386/i386.h
===================================================================
--- gcc/config/i386/i386.h	(revision 271984)
+++ gcc/config/i386/i386.h	(working copy)
@@ -93,6 +93,8 @@ 
 #define TARGET_AVX512VNNI_P(x) TARGET_ISA_AVX512VNNI_P(x)
 #define TARGET_AVX512BITALG	TARGET_ISA_AVX512BITALG
 #define TARGET_AVX512BITALG_P(x) TARGET_ISA_AVX512BITALG_P(x)
+#define TARGET_AVX512VP2INTERSECT	TARGET_ISA_AVX512VP2INTERSECT
+#define TARGET_AVX512VP2INTERSECT_P(x) TARGET_ISA_AVX512VP2INTERSECT_P(x)
 #define TARGET_FMA	TARGET_ISA_FMA
 #define TARGET_FMA_P(x)	TARGET_ISA_FMA_P(x)
 #define TARGET_SSE4A	TARGET_ISA_SSE4A
@@ -2363,6 +2365,7 @@ 
 const wide_int_bitmask PTA_RDPID (0, HOST_WIDE_INT_1U << 6);
 const wide_int_bitmask PTA_PCONFIG (0, HOST_WIDE_INT_1U << 7);
 const wide_int_bitmask PTA_WBNOINVD (0, HOST_WIDE_INT_1U << 8);
+const wide_int_bitmask PTA_AVX512VP2INTERSECT (0, HOST_WIDE_INT_1U << 9);
 const wide_int_bitmask PTA_WAITPKG (0, HOST_WIDE_INT_1U << 9);
 const wide_int_bitmask PTA_PTWRITE (0, HOST_WIDE_INT_1U << 10);
 const wide_int_bitmask PTA_AVX512BF16 (0, HOST_WIDE_INT_1U << 11);
Index: gcc/config/i386/i386.opt
===================================================================
--- gcc/config/i386/i386.opt	(revision 271984)
+++ gcc/config/i386/i386.opt	(working copy)
@@ -749,6 +749,10 @@ 
 Target Report Mask(ISA_AVX512BITALG) Var(ix86_isa_flags) Save
 Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F and AVX512BITALG built-in functions and code generation.
 
+mavx512vp2intersect
+Target Report Mask(ISA_AVX512VP2INTERSECT) Var(ix86_isa_flags2) Save
+Support AVX512VP2INTERSECT built-in functions and code generation.
+
 mfma
 Target Report Mask(ISA_FMA) Var(ix86_isa_flags) Save
 Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and FMA built-in functions and code generation.
Index: gcc/config/i386/immintrin.h
===================================================================
--- gcc/config/i386/immintrin.h	(revision 271984)
+++ gcc/config/i386/immintrin.h	(working copy)
@@ -96,6 +96,10 @@ 
 
 #include <avx512bitalgintrin.h>
 
+#include <avx512vp2intersectintrin.h>
+
+#include <avx512vp2intersectvlintrin.h>
+
 #include <shaintrin.h>
 
 #include <lzcntintrin.h>
Index: gcc/config/i386/sse.md
===================================================================
--- gcc/config/i386/sse.md	(revision 271984)
+++ gcc/config/i386/sse.md	(working copy)
@@ -188,6 +188,9 @@ 
   ;; For AVX512BITALG support
   UNSPEC_VPSHUFBIT
 
+  ;; For VP2INTERSECT support
+  UNSPEC_VP2INTERSECT
+
   ;; For AVX512BF16 support
   UNSPEC_VCVTNE2PS2BF16
   UNSPEC_VCVTNEPS2BF16
@@ -22405,6 +22408,67 @@ 
   [(set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_mode_iterator VI48_AVX512VP2VL
+  [V8DI
+  (V4DI  "TARGET_AVX512VL") (V2DI  "TARGET_AVX512VL")
+  (V8SI "TARGET_AVX512VL") (V4SI  "TARGET_AVX512VL")])
+
+(define_insn "avx512vp2intersect_2intersect<mode>"
+  [(set (match_operand:P2QI 0 "register_operand" "=k")
+	(unspec:P2QI
+	  [(match_operand:VI48_AVX512VP2VL 1 "register_operand" "v")
+	   (match_operand:VI48_AVX512VP2VL 2 "vector_operand" "vm")]
+	  UNSPEC_VP2INTERSECT))]
+  "TARGET_AVX512VP2INTERSECT"
+  "vp2intersect<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr ("prefix") ("evex"))])
+
+(define_insn "avx512vp2intersect_2intersectv16si"
+  [(set (match_operand:P2HI 0 "register_operand" "=k")
+	(unspec:P2HI [(match_operand:V16SI 1 "register_operand" "v")
+		      (match_operand:V16SI 2 "vector_operand" "vm")]
+		UNSPEC_VP2INTERSECT))]
+  "TARGET_AVX512VP2INTERSECT"
+  "vp2intersectd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr ("prefix") ("evex"))])
+
+;; Used for register pair
+;; i.e vp2intersectd k+1, xmm2, xmm3/m128/m32bcst
+;; Store, in an even/odd pair of mask registers,
+;; the indicators of the locations of value
+;; matches between dwords in xmm3/m128/m32bcst and xmm2.
+
+(define_insn_and_split "*vec_extractp2hi"
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=rm")
+        (vec_select:HI
+           (match_operand:P2HI 1 "register_operand" "k")
+           (parallel
+             [(match_operand:SI 2 "const_0_to_1_operand")])))]
+  "TARGET_AVX512F"
+  "#"
+  "reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+{
+  unsigned int regno = REGNO (operands[1]) + INTVAL (operands[2]);
+  operands[1] = gen_rtx_REG (HImode, regno);
+})
+
+(define_insn_and_split "*vec_extractp2qi"
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=rm,r")
+        (vec_select:QI
+           (match_operand:P2QI 1 "register_operand" "k,k")
+           (parallel
+             [(match_operand:SI 2 "const_0_to_1_operand")])))]
+  "TARGET_AVX512F"
+  "#"
+  "reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+{
+  unsigned int regno = REGNO (operands[1]) + INTVAL (operands[2]);
+  operands[1] = gen_rtx_REG (QImode, regno);
+}
+  [(set_attr "isa" "avx512dq,avx512f")])
+
 (define_mode_iterator BF16 [V32HI (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")])
 ;; Converting from BF to SF
 (define_mode_attr bf16_cvt_2sf
Index: gcc/config.gcc
===================================================================
--- gcc/config.gcc	(revision 271984)
+++ gcc/config.gcc	(working copy)
@@ -408,7 +408,7 @@ 
 		       avx512vpopcntdqvlintrin.h avx512bitalgintrin.h
 		       pconfigintrin.h wbnoinvdintrin.h movdirintrin.h
 		       waitpkgintrin.h cldemoteintrin.h avx512bf16vlintrin.h avx512bf16intrin.h
-		       enqcmdintrin.h"
+		       enqcmdintrin.h avx512vp2intersectintrin.h avx512vp2intersectvlintrin.h"
 	;;
 x86_64-*-*)
 	cpu_type=i386
@@ -441,7 +441,7 @@ 
 		       avx512vpopcntdqvlintrin.h avx512bitalgintrin.h
 		       pconfigintrin.h wbnoinvdintrin.h movdirintrin.h
 		       waitpkgintrin.h cldemoteintrin.h avx512bf16vlintrin.h avx512bf16intrin.h
-		       enqcmdintrin.h"
+		       enqcmdintrin.h avx512vp2intersectintrin.h avx512vp2intersectvlintrin.h"
 	;;
 ia64-*-*)
 	extra_headers=ia64intrin.h
Index: gcc/doc/invoke.texi
===================================================================
--- gcc/doc/invoke.texi	(revision 271984)
+++ gcc/doc/invoke.texi	(working copy)
@@ -1278,7 +1278,7 @@ 
 -mshstk -mmanual-endbr -mforce-indirect-call  -mavx512vbmi2 -mavx512bf16 -menqcmd @gol
 -mvpclmulqdq  -mavx512bitalg  -mmovdiri  -mmovdir64b  -mavx512vpopcntdq @gol
 -mavx5124fmaps  -mavx512vnni  -mavx5124vnniw  -mprfchw  -mrdpid @gol
--mrdseed  -msgx @gol
+-mrdseed  -msgx -mavx512vp2intersect@gol
 -mcldemote  -mms-bitfields  -mno-align-stringops  -minline-all-stringops @gol
 -minline-stringops-dynamically  -mstringop-strategy=@var{alg} @gol
 -mmemcpy-strategy=@var{strategy}  -mmemset-strategy=@var{strategy} @gol
@@ -28096,6 +28096,9 @@ 
 @itemx -mavx512vpopcntdq
 @opindex mavx512vpopcntdq
 @need 200
+@itemx -mavx512vp2intersect
+@opindex mavx512vp2intersect
+@need 200
 @itemx -mavx5124fmaps
 @opindex mavx5124fmaps
 @need 200
Index: gcc/testsuite/ChangeLog
===================================================================
--- gcc/testsuite/ChangeLog	(revision 271984)
+++ gcc/testsuite/ChangeLog	(working copy)
@@ -1,3 +1,19 @@ 
+2019-06-06  Hongtao Liu  <hongtao.liu@intel.com>
+	    Olga Makhotina  <olga.makhotina@intel.com>
+
+	* gcc.target/i386/avx512-check.h: Handle bit_AVX512VP2INTERSECT.
+	* gcc.target/i386/avx512vp2intersect-2intersect-1a.c: New test.
+	* gcc.target/i386/avx512vp2intersect-2intersect-1b.c: Likewise.
+	* gcc.target/i386/avx512vp2intersect-2intersectvl-1a.c: Likewise.
+	* gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c: Likewise.
+	* gcc.target/i386/sse-12.c: Add -mavx512vp2intersect.
+	* gcc.target/i386/sse-13.c: Likewsie.
+	* gcc.target/i386/sse-14.c: Likewise.
+	* gcc.target/i386/sse-22.c: Likewise.
+	* gcc.target/i386/sse-23.c: Likewise.
+	* g++.dg/other/i386-2.C: Likewise.
+	* g++.dg/other/i386-3.C: Likewise.
+
 2019-06-05  Hongtao Liu  <hongtao.liu@intel.com>
 
 	* gcc.target/i386/avx512dq-vfpclasspd-1.c: Adjust scan assember
Index: gcc/testsuite/g++.dg/other/i386-2.C
===================================================================
--- gcc/testsuite/g++.dg/other/i386-2.C	(revision 271984)
+++ gcc/testsuite/g++.dg/other/i386-2.C	(working copy)
@@ -1,12 +1,13 @@ 
 /* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt  -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd" } */
+/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt  -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect" } */
 
 /* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
    xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
    popcntintrin.h, fmaintrin.h, pkuintrin.h, avx5124fmapsintrin.h,
    avx5124vnniwintrin.h, avx512vpopcntdqintrin.h gfniintrin.h
-   avx512bitalgintrin.h and mm_malloc.h.h are usable with -O
-   -pedantic-errors.  */
+   avx512bitalgintrin.h, avx512vp2intersectintrin.h,
+   avx512vp2intersectvlintrin.h and mm_malloc.h.h are usable
+   with -O -pedantic-errors.  */
 
 #include <x86intrin.h>
 
Index: gcc/testsuite/g++.dg/other/i386-3.C
===================================================================
--- gcc/testsuite/g++.dg/other/i386-3.C	(revision 271984)
+++ gcc/testsuite/g++.dg/other/i386-3.C	(working copy)
@@ -1,11 +1,12 @@ 
 /* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd" } */
+/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect" } */
 
 /* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
    xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
    popcntintrin.h, fmaintrin.h, pkuintrin.h, avx5124fmapsintrin.h,
    avx5124vnniwintrin.h, avx512vpopcntdqintrin.h gfniintrin.h
-   avx512bitalgintrin.h and mm_malloc.h are usable with -O
-   -fkeep-inline-functions.  */
+   avx512bitalgintrin.h, avx512vp2intersectintrin.h,
+   avx512vp2intersectvlintrin.h and mm_malloc.h are usable
+   with -O -fkeep-inline-functions.  */
 
 #include <x86intrin.h>
Index: gcc/testsuite/gcc.target/i386/avx512-check.h
===================================================================
--- gcc/testsuite/gcc.target/i386/avx512-check.h	(revision 271984)
+++ gcc/testsuite/gcc.target/i386/avx512-check.h	(working copy)
@@ -93,6 +93,9 @@ 
 #ifdef VPCLMULQDQ
       && (ecx & bit_VPCLMULQDQ)
 #endif
+#ifdef AVX512VP2INTERSECT
+      && (edx & bit_AVX512VP2INTERSECT)
+#endif
       && avx512f_os_support ())
     {
       DO_TEST ();
Index: gcc/testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1a.c
===================================================================
--- gcc/testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1a.c	(nonexistent)
+++ gcc/testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1a.c	(working copy)
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512vp2intersect" } */
+/* { dg-final { scan-assembler "vp2intersectq\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%k\[0-7\]"} } */
+/* { dg-final { scan-assembler "vp2intersectd\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%k\[0-7\]"} } */
+
+#include <x86intrin.h>
+
+__m512i a1, b1;
+__m512i a2, b2;
+__mmask8 m8, u8;
+__mmask16 m16, u16;
+
+int foo ()
+{
+  _mm512_2intersect_epi64 (a1, b1, &u8, &m8);
+  _mm512_2intersect_epi32 (a2, b2, &u16, &m16);
+}
+
Index: gcc/testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c
===================================================================
--- gcc/testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c	(nonexistent)
+++ gcc/testsuite/gcc.target/i386/avx512vp2intersect-2intersect-1b.c	(working copy)
@@ -0,0 +1,28 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512vp2intersect" } */
+
+#define AVX512F
+#include <x86intrin.h>
+#include "avx512f-helper.h"
+
+void
+TEST (void)
+{
+  __m512i a1 = _mm512_set_epi64 (10, 43, 253, 3566, 25, -253, -243, 3456);
+  __m512i b1 = _mm512_set_epi64 (43, 100, 3566, 2353, -253, -25, 3456, 243);
+  __m512i a2 = _mm512_set_epi32 (21, 22, 23, 24, 25, 26, 27, 28,
+			       11, 12, 13, 14, 15, 16, 17, 18);
+  __m512i b2 = _mm512_set_epi32 (22, 211, 24, 213, 26, 215, 28, 217,
+				 12, 111, 14, 113, 16, 115, 18, 117);
+  __mmask8 u8 = 0, m8 = 0;
+  __mmask16 u16 = 0, m16 = 0;
+
+  _mm512_2intersect_epi64 (a1, b1, &u8, &m8);
+  /* u8 = 01010101, m8 = 10101010.  */
+  if (u8 != 0x55 || m8 != 0xaa)
+    abort();
+  _mm512_2intersect_epi32 (a2, b2, &u16, &m16);
+  /* u8 = 0101010101010101, m8 = 1010101010101010.  */
+  if (u16 != 0x5555 || m16 != 0xaaaa)
+    abort();
+}
Index: gcc/testsuite/gcc.target/i386/avx512vp2intersect-2intersectvl-1a.c
===================================================================
--- gcc/testsuite/gcc.target/i386/avx512vp2intersect-2intersectvl-1a.c	(nonexistent)
+++ gcc/testsuite/gcc.target/i386/avx512vp2intersect-2intersectvl-1a.c	(working copy)
@@ -0,0 +1,21 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512vp2intersect -mavx512vl" } */
+/* { dg-final { scan-assembler "vp2intersectd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%k\[0-7\]"  } } */
+/* { dg-final { scan-assembler "vp2intersectd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%k\[0-7\]"  } } */
+/* { dg-final { scan-assembler "vp2intersectq\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%k\[0-7\]"  } } */
+/* { dg-final { scan-assembler "vp2intersectq\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%k\[0-7\]"  } } */
+
+#include <x86intrin.h>
+
+__m256i a2, b2;
+__m128i a3, b3;
+__mmask8 m0, m1, m2, m3, m4, m5, m6, m7;
+
+int foo ()
+{
+  _mm_2intersect_epi64 (a3, b3, &m0, &m1);
+  _mm_2intersect_epi32 (a3, b3, &m2, &m3);
+
+  _mm256_2intersect_epi64 (a2, b2, &m4, &m5);
+  _mm256_2intersect_epi32 (a2, b2, &m6, &m7);
+}
Index: gcc/testsuite/gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c
===================================================================
--- gcc/testsuite/gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c	(nonexistent)
+++ gcc/testsuite/gcc.target/i386/avx512vp2intersect-2intersectvl-1b.c	(working copy)
@@ -0,0 +1,41 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512vp2intersect -mavx512vl" } */
+
+#define AVX512F
+#include <x86intrin.h>
+#include "avx512f-helper.h"
+
+ void
+ TEST (void)
+{
+  __m256i a1 = _mm256_set_epi64x (1, 2, 3, 4);
+  __m256i b1 = _mm256_set_epi64x (2, 11, 4, 33);
+  __m256i a2 = _mm256_set_epi32 (1, 2, 3, 4, 5, 6, 7, 8);
+  __m256i b2 = _mm256_set_epi32 (2, 11, 4, 33, 6, 55, 8, 77);
+  __m128i a3 = _mm_set_epi64x (13, 22);
+  __m128i b3 = _mm_set_epi64x (22, 1434);
+  __m128i a4 = _mm_set_epi32 (1, 2, 3, 4);
+  __m128i b4 = _mm_set_epi32 (2, 11, 4, 33);
+  __mmask8 m0, m1, m2, m3, m4, m5, m6, m7;
+  m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0;
+
+  _mm_2intersect_epi64 (a3, b3, &m0, &m1);
+  /* m0 = ******01, m1 = ******10.  */
+  if (m0 != 0x1 || m1 != 0x2)
+    abort();
+
+  _mm_2intersect_epi32 (a4, b4, &m2, &m3);
+  /* m2 = ****0101, m3 = ****1010.  */
+  if (m2  != 0x5 || m3 != 0xa)
+    abort();
+
+  _mm256_2intersect_epi64 (a1, b1, &m4, &m5);
+  /* m4 = ****0101, m5 = ****1010.  */
+  if (m4 != 0x5 || m5 != 0xa)
+    abort();
+
+  _mm256_2intersect_epi32 (a2, b2, &m6, &m7);
+  /* m0 = 01010101, m1 = 10101010.  */
+  if (m6 != 0x55 || m7 != 0xaa)
+    abort();
+}
Index: gcc/testsuite/gcc.target/i386/sse-12.c
===================================================================
--- gcc/testsuite/gcc.target/i386/sse-12.c	(revision 271984)
+++ gcc/testsuite/gcc.target/i386/sse-12.c	(working copy)
@@ -3,7 +3,7 @@ 
    popcntintrin.h gfniintrin.h and mm_malloc.h are usable
    with -O -std=c89 -pedantic-errors.  */
 /* { dg-do compile } */
-/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512bw -mavx512dq -mavx512vl -mavx512vbmi -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd" } */
+/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512bw -mavx512dq -mavx512vl -mavx512vbmi -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect" } */
 
 #include <x86intrin.h>
 
Index: gcc/testsuite/gcc.target/i386/sse-13.c
===================================================================
--- gcc/testsuite/gcc.target/i386/sse-13.c	(revision 271984)
+++ gcc/testsuite/gcc.target/i386/sse-13.c	(working copy)
@@ -1,5 +1,5 @@ 
 /* { dg-do compile } */
-/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd" } */
+/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mavx512vp2intersect -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd" } */
 /* { dg-add-options bind_pic_locally } */
 
 #include <mm_malloc.h>
Index: gcc/testsuite/gcc.target/i386/sse-14.c
===================================================================
--- gcc/testsuite/gcc.target/i386/sse-14.c	(revision 271984)
+++ gcc/testsuite/gcc.target/i386/sse-14.c	(working copy)
@@ -1,5 +1,5 @@ 
 /* { dg-do compile } */
-/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -mavx512vl -mavx512bf16 -menqcmd" } */
+/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -mavx512vl -mavx512bf16 -menqcmd -mavx512vp2intersect" } */
 /* { dg-add-options bind_pic_locally } */
 
 #include <mm_malloc.h>
Index: gcc/testsuite/gcc.target/i386/sse-22.c
===================================================================
--- gcc/testsuite/gcc.target/i386/sse-22.c	(revision 271984)
+++ gcc/testsuite/gcc.target/i386/sse-22.c	(working copy)
@@ -10,8 +10,9 @@ 
    mm3dnow.h, fma4intrin.h, xopintrin.h, abmintrin.h, bmiintrin.h,
    tbmintrin.h, lwpintrin.h, popcntintrin.h, fmaintrin.h,
    avx5124fmapsintrin.h, avx5124vnniwintrin.h, avx512vpopcntdqintrin.h,
-   avx512bitalgintrin.h and mm_malloc.h that reference the proper builtin
-   functions.
+   avx512bitalgintrin.h, avx512vp2intersectintrin.h,
+   avx512vp2intersectvlintrin.h and mm_malloc.h that reference the proper
+   builtin functions.
    Defining away "extern" and "__inline" results in all of them being
    compiled as proper functions.  */
 
@@ -101,7 +102,7 @@ 
 
 
 #ifndef DIFFERENT_PRAGMAS
-#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16")
+#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16,avx512vp2intersect")
 #endif
 
 /* Following intrinsics require immediate arguments.  They
@@ -218,7 +219,7 @@ 
 
 /* immintrin.h (AVX/AVX2/RDRND/FSGSBASE/F16C/RTM/AVX512F/SHA) */
 #ifdef DIFFERENT_PRAGMAS
-#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,avx512f,avx512er,avx512cd,avx512pf,sha,avx512vl,avx512bw,avx512dq,avx512ifma,avx512vbmi,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16")
+#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,avx512f,avx512er,avx512cd,avx512pf,sha,avx512vl,avx512bw,avx512dq,avx512ifma,avx512vbmi,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16,avx512vp2intersect")
 #endif
 #include <immintrin.h>
 test_1 (_cvtss_sh, unsigned short, float, 1)
Index: gcc/testsuite/gcc.target/i386/sse-23.c
===================================================================
--- gcc/testsuite/gcc.target/i386/sse-23.c	(revision 271984)
+++ gcc/testsuite/gcc.target/i386/sse-23.c	(working copy)
@@ -9,8 +9,9 @@ 
    mm3dnow.h, fma4intrin.h, xopintrin.h, abmintrin.h, bmiintrin.h,
    tbmintrin.h, lwpintrin.h, popcntintrin.h, fmaintrin.h,
    avx5124fmapsintrin.h, avx5124vnniwintrin.h, avx512vpopcntdqintrin.h,
-   avx512bitalgintrin.h and mm_malloc.h that reference the proper builtin
-   functions.
+   avx512bitalgintrin.h, avx512vp2intersectintrin.h,
+   avx512vp2intersectvlintrin.h and mm_malloc.h that reference the proper
+   builtin functions.
    Defining away "extern" and "__inline" results in all of them being
    compiled as proper functions.  */
 
@@ -696,6 +697,6 @@ 
 #define __builtin_ia32_vpclmulqdq_v2di(A, B, C)  __builtin_ia32_vpclmulqdq_v2di(A, B, 1) 
 #define __builtin_ia32_vpclmulqdq_v8di(A, B, C)  __builtin_ia32_vpclmulqdq_v8di(A, B, 1) 
 
-#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,xsavec,xsaves,clflushopt,avx512bw,avx512dq,avx512vl,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,avx512vbmi2,vpclmulqdq,avx512bitalg,pconfig,wbnoinvd,avx512bf16,enqcmd")
+#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,xsavec,xsaves,clflushopt,avx512bw,avx512dq,avx512vl,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,avx512vbmi2,vpclmulqdq,avx512bitalg,pconfig,wbnoinvd,avx512bf16,enqcmd,avx512vp2intersect")
 
 #include <x86intrin.h>