diff mbox series

[3/3] vect: support direct conversion under x86-64-v3.

Message ID 20240523063742.2333446-4-lin1.hu@intel.com
State New
Headers show
Series Optimize __builtin_convertvector for x86-64-v4 and | expand

Commit Message

Hu, Lin1 May 23, 2024, 6:37 a.m. UTC
gcc/ChangeLog:

	PR 107432
	* config/i386/i386-expand.cc (ix86_expand_trunc_with_avx2_noavx512f):
	New function for generate a series of suitable insn.
	* config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f):
	Define new function.
	* config/i386/sse.md: Extend trunc<mode><mode>2 for x86-64-v3.

gcc/testsuite/ChangeLog:

	PR 107432
	* gcc.target/i386/pr107432-8.c: New test.
	* gcc.target/i386/pr107432-9.c: Ditto.
	* gcc.target/i386/pr92645-4.c: Modify test.
---
 gcc/config/i386/i386-expand.cc             |  47 +++++++-
 gcc/config/i386/i386-protos.h              |   3 +
 gcc/config/i386/sse.md                     |  87 +++++++++++----
 gcc/testsuite/gcc.target/i386/pr107432-8.c |  73 +++++++++++++
 gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 +++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr92645-4.c  |   2 -
 6 files changed, 304 insertions(+), 29 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c

Comments

Hongtao Liu May 23, 2024, 6:42 a.m. UTC | #1
On Thu, May 23, 2024 at 2:38 PM Hu, Lin1 <lin1.hu@intel.com> wrote:
>
> gcc/ChangeLog:
>
>         PR 107432
>         * config/i386/i386-expand.cc (ix86_expand_trunc_with_avx2_noavx512f):
>         New function for generate a series of suitable insn.
>         * config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f):
>         Define new function.
>         * config/i386/sse.md: Extend trunc<mode><mode>2 for x86-64-v3.
I have some concern for this patch since
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115069, let's hold on to
this patch.
> gcc/testsuite/ChangeLog:
>
>         PR 107432
>         * gcc.target/i386/pr107432-8.c: New test.
>         * gcc.target/i386/pr107432-9.c: Ditto.
>         * gcc.target/i386/pr92645-4.c: Modify test.
> ---
>  gcc/config/i386/i386-expand.cc             |  47 +++++++-
>  gcc/config/i386/i386-protos.h              |   3 +
>  gcc/config/i386/sse.md                     |  87 +++++++++++----
>  gcc/testsuite/gcc.target/i386/pr107432-8.c |  73 +++++++++++++
>  gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 +++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr92645-4.c  |   2 -
>  6 files changed, 304 insertions(+), 29 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index 2f27bfb484c..bca8b85c9d1 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[])
>    emit_insn (gen_xorv4si3 (value, value, large));
>  }
>
> -static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
> -                                                machine_mode mode, rtx target,
> -                                                rtx var, int one_var);
> -
>  /* Convert an unsigned DImode value into a DFmode, using only SSE.
>     Expects the 64-bit DImode to be supplied in a pair of integral
>     registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
> @@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
>     whose ONE_VAR element is VAR, and other elements are zero.  Return true
>     if successful.  */
>
> -static bool
> +bool
>  ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
>                                      rtx target, rtx var, int one_var)
>  {
> @@ -25551,4 +25547,45 @@ ix86_expand_fast_convert_bf_to_sf (rtx val)
>    return ret;
>  }
>
> +/* Trunc a vector to a narrow vector, like v4di -> v4si.  */
> +
> +bool
> +ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input)
> +{
> +  machine_mode out_mode = GET_MODE (output);
> +  machine_mode in_mode = GET_MODE (input);
> +  int len = GET_MODE_SIZE (in_mode);
> +  gcc_assert (len == 16 || len == 32);
> +  machine_mode cvt_mode = (len == 16) ? V16QImode : V32QImode;
> +  int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode));
> +  int out_innersize = GET_MODE_SIZE (GET_MODE_INNER (out_mode));
> +
> +  struct expand_vec_perm_d d;
> +  d.target = gen_reg_rtx (cvt_mode);
> +  d.op0 = lowpart_subreg (cvt_mode, force_reg (in_mode, input), in_mode);
> +  d.op1 = d.op0;
> +  d.vmode = cvt_mode;
> +  d.nelt = len;
> +  d.testing_p = false;
> +  d.one_operand_p = true;
> +
> +  /* Init perm. Put the needed bits of input in order and
> +     fill the rest of bits by default.  */
> +  int tot = 0;
> +  for (int i = 0; i < len; ++i)
> +    {
> +      d.perm[i] = i;
> +      if ((i % in_innersize) < out_innersize)
> +       d.perm[tot++] = i;
> +    }
> +
> +  if (ix86_expand_vec_perm_const_1(&d))
> +    {
> +      emit_move_insn (output, gen_lowpart (out_mode, d.target));
> +      return true;
> +    }
> +
> +  return false;
> +}
> +
>  #include "gt-i386-expand.h"
> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> index dbc861fb1ea..ac29fb34028 100644
> --- a/gcc/config/i386/i386-protos.h
> +++ b/gcc/config/i386/i386-protos.h
> @@ -242,6 +242,7 @@ extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, rtx, enum rtx_code,
>  extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx, rtx, rtx, rtx,
>                                       bool, rtx_code_label *);
>  extern rtx ix86_expand_fast_convert_bf_to_sf (rtx);
> +extern bool ix86_expand_trunc_with_avx2_noavx512f (rtx, rtx);
>  extern rtx ix86_memtag_untagged_pointer (rtx, rtx);
>  extern bool ix86_memtag_can_tag_addresses (void);
>
> @@ -288,6 +289,8 @@ extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
>  extern void ix86_expand_sse2_abs (rtx, rtx);
>  extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx,
>                                                rtx);
> +extern bool ix86_expand_vector_init_one_nonzero (bool, machine_mode, rtx,
> +                                                rtx, int);
>  extern bool ix86_extract_perm_from_pool_constant (int*, rtx);
>
>  /* In i386-c.cc  */
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index f57f36ae380..0b14b3dc1ac 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -14373,14 +14373,25 @@ (define_expand "avx512bw_<code>v32hiv32qi2_mask_store"
>
>  (define_mode_iterator PMOV_DST_MODE_2
>    [V4SI V8HI (V16QI "TARGET_AVX512BW")])
> +(define_mode_iterator PMOV_DST_MODE_2_AVX2
> +  [V4SI V8HI V16QI])
>  (define_mode_attr pmov_suff_2
>    [(V16QI "wb") (V8HI "dw") (V4SI "qd")])
>
>  (define_expand "trunc<ssedoublemodelower><mode>2"
> -  [(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand")
> -       (truncate:PMOV_DST_MODE_2
> +  [(set (match_operand:PMOV_DST_MODE_2_AVX2 0 "nonimmediate_operand")
> +       (truncate:PMOV_DST_MODE_2_AVX2
>           (match_operand:<ssedoublemode> 1 "register_operand")))]
> -  "TARGET_AVX512VL")
> +  "TARGET_AVX2"
> +{
> +  if (!TARGET_AVX512VL
> +      || (<MODE>mode == V16QImode && !TARGET_AVX512BW))
> +    {
> +      bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0], operands[1]);
> +      gcc_assert (ok);
> +      DONE;
> +    }
> +})
>
>  (define_insn "*avx512vl_<code><ssedoublemodelower><mode>2"
>    [(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand" "=v,m")
> @@ -14460,6 +14471,7 @@ (define_expand "<avx512>_<code><ssedoublemodelower><mode>2_mask_store"
>    "TARGET_AVX512VL")
>
>  (define_mode_iterator PMOV_SRC_MODE_3 [V4DI V2DI V8SI V4SI (V8HI "TARGET_AVX512BW")])
> +(define_mode_iterator PMOV_SRC_MODE_3_AVX2 [V4DI V2DI V8SI V4SI V8HI])
>  (define_mode_attr pmov_dst_3_lower
>    [(V4DI "v4qi") (V2DI "v2qi") (V8SI "v8qi") (V4SI "v4qi") (V8HI "v8qi")])
>  (define_mode_attr pmov_dst_3
> @@ -14472,16 +14484,26 @@ (define_mode_attr pmov_suff_3
>  (define_expand "trunc<mode><pmov_dst_3_lower>2"
>    [(set (match_operand:<pmov_dst_3> 0 "register_operand")
>         (truncate:<pmov_dst_3>
> -         (match_operand:PMOV_SRC_MODE_3 1 "register_operand")))]
> -  "TARGET_AVX512VL"
> +         (match_operand:PMOV_SRC_MODE_3_AVX2 1 "register_operand")))]
> +  "TARGET_AVX2"
>  {
> -  rtx op0 = gen_reg_rtx (V16QImode);
> +  if (TARGET_AVX512VL
> +      && (<MODE>mode != V8HImode || TARGET_AVX512BW))
> +    {
> +       rtx op0 = gen_reg_rtx (V16QImode);
>
> -  emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>qi2
> -            (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_3>mode)));
> +       emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>qi2
> +                (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_3>mode)));
> +
> +       emit_move_insn (operands[0],
> +                      lowpart_subreg (<pmov_dst_3>mode, op0, V16QImode));
> +    }
> +  else
> +    {
> +      bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0], operands[1]);
> +      gcc_assert (ok);
> +    }
>
> -  emit_move_insn (operands[0],
> -                 lowpart_subreg (<pmov_dst_3>mode, op0, V16QImode));
>    DONE;
>  })
>
> @@ -14853,15 +14875,24 @@ (define_expand "trunc<mode><pmov_dst_4_lower>2"
>    [(set (match_operand:<pmov_dst_4> 0 "register_operand")
>         (truncate:<pmov_dst_4>
>           (match_operand:PMOV_SRC_MODE_4 1 "register_operand")))]
> -  "TARGET_AVX512VL"
> +  "TARGET_AVX2"
>  {
> -  rtx op0 = gen_reg_rtx (V8HImode);
> +  if (TARGET_AVX512VL)
> +    {
> +      rtx op0 = gen_reg_rtx (V8HImode);
>
> -  emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>hi2
> -            (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_4>mode)));
> +      emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>hi2
> +               (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_4>mode)));
>
> -  emit_move_insn (operands[0],
> -                 lowpart_subreg (<pmov_dst_4>mode, op0, V8HImode));
> +      emit_move_insn (operands[0],
> +                     lowpart_subreg (<pmov_dst_4>mode, op0, V8HImode));
> +      DONE;
> +    }
> +  else
> +    {
> +      bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0], operands[1]);
> +      gcc_assert (ok);
> +    }
>    DONE;
>  })
>
> @@ -15102,15 +15133,27 @@ (define_expand "truncv2div2si2"
>    [(set (match_operand:V2SI 0 "register_operand")
>         (truncate:V2SI
>           (match_operand:V2DI 1 "register_operand")))]
> -  "TARGET_AVX512VL"
> +  "TARGET_AVX2"
>  {
> -  rtx op0 = gen_reg_rtx (V4SImode);
> +  if (TARGET_AVX512VL)
> +    {
> +      rtx op0 = gen_reg_rtx (V4SImode);
>
> -  emit_insn (gen_avx512vl_truncatev2div2si2
> -            (op0, operands[1], CONST0_RTX (V2SImode)));
> +      emit_insn (gen_avx512vl_truncatev2div2si2
> +               (op0, operands[1], CONST0_RTX (V2SImode)));
>
> -  emit_move_insn (operands[0],
> -                 lowpart_subreg (V2SImode, op0, V4SImode));
> +      emit_move_insn (operands[0],
> +                     lowpart_subreg (V2SImode, op0, V4SImode));
> +    }
> +  else
> +    {
> +      rtx tmp = lowpart_subreg (V4SImode,
> +                               force_reg (V2DImode, operands[1]), V2DImode);
> +      rtx op0 = gen_reg_rtx (V4SImode);
> +      emit_insn (gen_sse_shufps_v4si (op0, tmp, tmp, const0_rtx, GEN_INT (2),
> +                                     GEN_INT (6), GEN_INT (7)));
> +      emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0, V4SImode));
> +    }
>    DONE;
>  })
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-8.c b/gcc/testsuite/gcc.target/i386/pr107432-8.c
> new file mode 100644
> index 00000000000..f0d1ab028f7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-8.c
> @@ -0,0 +1,73 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64-v3 -O2" } */
> +/* { dg-final { scan-assembler-times "vshufps" 1 } } */
> +/* { dg-final { scan-assembler-times "vpshufb" 15 } } */
> +/* { dg-final { scan-assembler-times "vpermd" 1 } } */
> +/* { dg-final { scan-assembler-times "vpermq" 5 } } */
> +
> +#include <x86intrin.h>
> +
> +typedef short __v2hi __attribute__ ((__vector_size__ (4)));
> +typedef char __v2qi __attribute__ ((__vector_size__ (2)));
> +typedef char __v4qi __attribute__ ((__vector_size__ (4)));
> +typedef char __v8qi __attribute__ ((__vector_size__ (8)));
> +
> +__v2si mm_cvtepi64_epi32_builtin_convertvector(__v2di a)
> +{
> +  return __builtin_convertvector((__v2di)a, __v2si);
> +}
> +
> +__v4si mm256_cvtepi64_epi32_builtin_convertvector(__v4di a)
> +{
> +  return __builtin_convertvector((__v4di)a, __v4si);
> +}
> +
> +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v2di)a, __v2hi);
> +}
> +
> +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a)
> +{
> +  return __builtin_convertvector((__v4di)a, __v4hi);
> +}
> +
> +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v4si)a, __v4hi);
> +}
> +
> +__v8hi mm256_cvtepi32_epi16_builtin_convertvector(__v8si a)
> +{
> +  return __builtin_convertvector((__v8si)a, __v8hi);
> +}
> +
> +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v2di)a, __v2qi);
> +}
> +
> +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a)
> +{
> +  return __builtin_convertvector((__v4di)a, __v4qi);
> +}
> +
> +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v4si)a, __v4qi);
> +}
> +
> +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a)
> +{
> +  return __builtin_convertvector((__v8si)a, __v8qi);
> +}
> +
> +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v8hi)a, __v8qi);
> +}
> +
> +__v16qi        mm256_cvtepi16_epi8_builtin_convertvector(__v16hi a)
> +{
> +  return __builtin_convertvector((__v16hi)a, __v16qi);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-9.c b/gcc/testsuite/gcc.target/i386/pr107432-9.c
> new file mode 100644
> index 00000000000..650d352b945
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-9.c
> @@ -0,0 +1,121 @@
> +/* { dg-do run } */
> +/* { dg-options "-march=x86-64-v3 -O2 -flax-vector-conversions" } */
> +#include <x86intrin.h>
> +
> +#include "avx-check.h"
> +
> +#ifndef TEST
> +#define TEST avx_test
> +#endif
> +
> +typedef short __v2hi __attribute__ ((__vector_size__ (4)));
> +typedef char __v2qi __attribute__ ((__vector_size__ (2)));
> +typedef char __v4qi __attribute__ ((__vector_size__ (4)));
> +typedef char __v8qi __attribute__ ((__vector_size__ (8)));
> +
> +typedef union
> +{
> +  __v2si x;
> +  int a[2];
> +} union64i_d;
> +
> +typedef union
> +{
> +  __v2hi x;
> +  short a[2];
> +} union32i_w;
> +
> +typedef union
> +{
> +  __v4hi x;
> +  short a[4];
> +} union64i_w;
> +
> +typedef union
> +{
> +  __v2qi x;
> +  char a[2];
> +} union16i_b;
> +
> +typedef union
> +{
> +  __v4qi x;
> +  char a[4];
> +} union32i_b;
> +
> +typedef union
> +{
> +  __v8qi x;
> +  char a[8];
> +} union64i_b;
> +
> +#define CHECK_EXP_LESS128(UNION_TYPE, VALUE_TYPE, FMT)   \
> +static int                                               \
> +__attribute__((noinline, unused))                        \
> +check_##UNION_TYPE (UNION_TYPE u, const VALUE_TYPE * v)          \
> +{                                                        \
> +  int i;                                                 \
> +  int err = 0;                                           \
> +                                                         \
> +  for (i = 0; i < ARRAY_SIZE (u.a); i++)                 \
> +    if (u.a[i] != v[i])                                          \
> +      {                                                          \
> +       err++;                                            \
> +       PRINTF ("%i: " FMT " != " FMT "\n",               \
> +               i, v[i], u.a[i]);                         \
> +      }                                                          \
> +  return err;                                            \
> +}
> +
> +CHECK_EXP_LESS128 (union64i_d, int, "%d");
> +CHECK_EXP_LESS128 (union32i_w, short, "%d");
> +CHECK_EXP_LESS128 (union64i_w, short, "%d");
> +CHECK_EXP_LESS128 (union16i_b, char, "%d");
> +CHECK_EXP_LESS128 (union32i_b, char, "%d");
> +CHECK_EXP_LESS128 (union64i_b, char, "%d");
> +
> +#define SUBTEST(INPUT_TYPE, OUTPUT_TYPE, OUTPUT_INNER, INIT_TYPE, CVT_TYPE) \
> +void do_test##INIT_TYPE##CVT_TYPE ()                     \
> +{                                                        \
> +  INPUT_TYPE s;                                                  \
> +  OUTPUT_TYPE r, ref;                                    \
> +  for (int i = 0; i < ARRAY_SIZE (s.a); i++)             \
> +    {                                                    \
> +      s.a[i] = (i + 23415) * (i + 341);                          \
> +      ref.a[i] = (OUTPUT_INNER) s.a[i];                          \
> +    }                                                    \
> +  r.x = __builtin_convertvector((INIT_TYPE)s.x, CVT_TYPE); \
> +                                                         \
> +  if (check_##OUTPUT_TYPE (r, ref.a))                    \
> +    abort ();                                            \
> +  return;                                                \
> +}
> +
> +SUBTEST(union128i_q, union64i_d, int, __v2di, __v2si);
> +SUBTEST(union256i_q, union128i_d, int, __v4di, __v4si);
> +SUBTEST(union128i_q, union32i_w, short, __v2di, __v2hi);
> +SUBTEST(union256i_q, union64i_w, short, __v4di, __v4hi);
> +SUBTEST(union128i_d, union64i_w, short, __v4si, __v4hi);
> +SUBTEST(union256i_d, union128i_w, short, __v8si, __v8hi);
> +SUBTEST(union128i_q, union16i_b, char, __v2di, __v2qi);
> +SUBTEST(union256i_q, union32i_b, char, __v4di,__v4qi);
> +SUBTEST(union128i_d, union32i_b, char, __v4si, __v4qi);
> +SUBTEST(union256i_d, union64i_b, char, __v8si, __v8qi);
> +SUBTEST(union128i_w, union64i_b, char, __v8hi, __v8qi);
> +SUBTEST(union256i_w, union128i_b, char, __v16hi, __v16qi);
> +
> +void TEST (void)
> +{
> +  do_test__v2di__v2si ();
> +  do_test__v2di__v2hi ();
> +  do_test__v2di__v2qi ();
> +  do_test__v4di__v4si ();
> +  do_test__v4di__v4hi ();
> +  do_test__v4di__v4qi ();
> +  do_test__v4si__v4hi ();
> +  do_test__v4si__v4qi ();
> +  do_test__v8si__v8hi ();
> +  do_test__v8si__v8qi ();
> +  do_test__v8hi__v8qi ();
> +  do_test__v16hi__v16qi ();
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr92645-4.c b/gcc/testsuite/gcc.target/i386/pr92645-4.c
> index 28a3f9a3527..3aa49a3b654 100644
> --- a/gcc/testsuite/gcc.target/i386/pr92645-4.c
> +++ b/gcc/testsuite/gcc.target/i386/pr92645-4.c
> @@ -52,5 +52,3 @@ void f(char *dst, char *src, unsigned long n, unsigned c)
>     a uniform CTOR with a vector promotion to a CTOR on a promoted
>     element.  */
>  /* { dg-final { scan-tree-dump-times "\\(vector\\(16\\) short unsigned int\\)" 2 "optimized" { xfail *-*-* } } } */
> -/* { dg-final { scan-tree-dump-times "VEC_PACK_TRUNC" 1 "optimized" } } */
> -/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "optimized" } } */
> --
> 2.31.1
>
Hu, Lin1 May 23, 2024, 7:17 a.m. UTC | #2
> -----Original Message-----
> From: Hongtao Liu <crazylht@gmail.com>
> Sent: Thursday, May 23, 2024 2:42 PM
> To: Hu, Lin1 <lin1.hu@intel.com>
> Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>;
> ubizjak@gmail.com; rguenther@suse.de
> Subject: Re: [PATCH 3/3] vect: support direct conversion under x86-64-v3.
> 
> On Thu, May 23, 2024 at 2:38 PM Hu, Lin1 <lin1.hu@intel.com> wrote:
> >
> > gcc/ChangeLog:
> >
> >         PR 107432
> >         * config/i386/i386-expand.cc (ix86_expand_trunc_with_avx2_noavx512f):
> >         New function for generate a series of suitable insn.
> >         * config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f):
> >         Define new function.
> >         * config/i386/sse.md: Extend trunc<mode><mode>2 for x86-64-v3.
> I have some concern for this patch since
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115069, let's hold on to this
> patch.

OK, maybe we need to modify ix86_expand_vec_perm_const_1, let it emit some better code. Maybe like clang (https://godbolt.org/z/rTKPq9oj5).
Or we can disable some of the optimization via vpermq. In pr107432-8.c, there are only 5 tests that use vpermq.

BRs,
Lin
 
> > gcc/testsuite/ChangeLog:
> >
> >         PR 107432
> >         * gcc.target/i386/pr107432-8.c: New test.
> >         * gcc.target/i386/pr107432-9.c: Ditto.
> >         * gcc.target/i386/pr92645-4.c: Modify test.
> > ---
> >  gcc/config/i386/i386-expand.cc             |  47 +++++++-
> >  gcc/config/i386/i386-protos.h              |   3 +
> >  gcc/config/i386/sse.md                     |  87 +++++++++++----
> >  gcc/testsuite/gcc.target/i386/pr107432-8.c |  73 +++++++++++++
> > gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 +++++++++++++++++++++
> >  gcc/testsuite/gcc.target/i386/pr92645-4.c  |   2 -
> >  6 files changed, 304 insertions(+), 29 deletions(-)  create mode
> > 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c
> >
> > diff --git a/gcc/config/i386/i386-expand.cc
> > b/gcc/config/i386/i386-expand.cc index 2f27bfb484c..bca8b85c9d1 100644
> > --- a/gcc/config/i386/i386-expand.cc
> > +++ b/gcc/config/i386/i386-expand.cc
> > @@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[])
> >    emit_insn (gen_xorv4si3 (value, value, large));  }
> >
> > -static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
> > -                                                machine_mode mode, rtx target,
> > -                                                rtx var, int one_var);
> > -
> >  /* Convert an unsigned DImode value into a DFmode, using only SSE.
> >     Expects the 64-bit DImode to be supplied in a pair of integral
> >     registers.  Requires SSE2; will use SSE3 if available.  For
> > x86_32, @@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool
> mmx_ok, machine_mode mode,
> >     whose ONE_VAR element is VAR, and other elements are zero.  Return true
> >     if successful.  */
> >
> > -static bool
> > +bool
> >  ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
> >                                      rtx target, rtx var, int one_var)
> > { @@ -25551,4 +25547,45 @@ ix86_expand_fast_convert_bf_to_sf (rtx val)
> >    return ret;
> >  }
> >
> > +/* Trunc a vector to a narrow vector, like v4di -> v4si.  */
> > +
> > +bool
> > +ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input) {
> > +  machine_mode out_mode = GET_MODE (output);
> > +  machine_mode in_mode = GET_MODE (input);
> > +  int len = GET_MODE_SIZE (in_mode);
> > +  gcc_assert (len == 16 || len == 32);
> > +  machine_mode cvt_mode = (len == 16) ? V16QImode : V32QImode;
> > +  int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode));
> > +  int out_innersize = GET_MODE_SIZE (GET_MODE_INNER (out_mode));
> > +
> > +  struct expand_vec_perm_d d;
> > +  d.target = gen_reg_rtx (cvt_mode);
> > +  d.op0 = lowpart_subreg (cvt_mode, force_reg (in_mode, input),
> > + in_mode);
> > +  d.op1 = d.op0;
> > +  d.vmode = cvt_mode;
> > +  d.nelt = len;
> > +  d.testing_p = false;
> > +  d.one_operand_p = true;
> > +
> > +  /* Init perm. Put the needed bits of input in order and
> > +     fill the rest of bits by default.  */  int tot = 0;  for (int i
> > + = 0; i < len; ++i)
> > +    {
> > +      d.perm[i] = i;
> > +      if ((i % in_innersize) < out_innersize)
> > +       d.perm[tot++] = i;
> > +    }
> > +
> > +  if (ix86_expand_vec_perm_const_1(&d))
> > +    {
> > +      emit_move_insn (output, gen_lowpart (out_mode, d.target));
> > +      return true;
> > +    }
> > +
> > +  return false;
> > +}
> > +
> >  #include "gt-i386-expand.h"
> > diff --git a/gcc/config/i386/i386-protos.h
> > b/gcc/config/i386/i386-protos.h index dbc861fb1ea..ac29fb34028 100644
> > --- a/gcc/config/i386/i386-protos.h
> > +++ b/gcc/config/i386/i386-protos.h
> > @@ -242,6 +242,7 @@ extern void ix86_expand_atomic_fetch_op_loop (rtx,
> > rtx, rtx, enum rtx_code,  extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx,
> rtx, rtx, rtx,
> >                                       bool, rtx_code_label *);  extern
> > rtx ix86_expand_fast_convert_bf_to_sf (rtx);
> > +extern bool ix86_expand_trunc_with_avx2_noavx512f (rtx, rtx);
> >  extern rtx ix86_memtag_untagged_pointer (rtx, rtx);  extern bool
> > ix86_memtag_can_tag_addresses (void);
> >
> > @@ -288,6 +289,8 @@ extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx,
> > rtx);  extern void ix86_expand_sse2_abs (rtx, rtx);  extern bool
> > ix86_expand_vector_init_duplicate (bool, machine_mode, rtx,
> >                                                rtx);
> > +extern bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
> rtx,
> > +                                                rtx, int);
> >  extern bool ix86_extract_perm_from_pool_constant (int*, rtx);
> >
> >  /* In i386-c.cc  */
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index
> > f57f36ae380..0b14b3dc1ac 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -14373,14 +14373,25 @@ (define_expand
> "avx512bw_<code>v32hiv32qi2_mask_store"
> >
> >  (define_mode_iterator PMOV_DST_MODE_2
> >    [V4SI V8HI (V16QI "TARGET_AVX512BW")])
> > +(define_mode_iterator PMOV_DST_MODE_2_AVX2
> > +  [V4SI V8HI V16QI])
> >  (define_mode_attr pmov_suff_2
> >    [(V16QI "wb") (V8HI "dw") (V4SI "qd")])
> >
> >  (define_expand "trunc<ssedoublemodelower><mode>2"
> > -  [(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand")
> > -       (truncate:PMOV_DST_MODE_2
> > +  [(set (match_operand:PMOV_DST_MODE_2_AVX2 0
> "nonimmediate_operand")
> > +       (truncate:PMOV_DST_MODE_2_AVX2
> >           (match_operand:<ssedoublemode> 1 "register_operand")))]
> > -  "TARGET_AVX512VL")
> > +  "TARGET_AVX2"
> > +{
> > +  if (!TARGET_AVX512VL
> > +      || (<MODE>mode == V16QImode && !TARGET_AVX512BW))
> > +    {
> > +      bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0],
> operands[1]);
> > +      gcc_assert (ok);
> > +      DONE;
> > +    }
> > +})
> >
> >  (define_insn "*avx512vl_<code><ssedoublemodelower><mode>2"
> >    [(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand"
> > "=v,m") @@ -14460,6 +14471,7 @@ (define_expand
> "<avx512>_<code><ssedoublemodelower><mode>2_mask_store"
> >    "TARGET_AVX512VL")
> >
> >  (define_mode_iterator PMOV_SRC_MODE_3 [V4DI V2DI V8SI V4SI (V8HI
> > "TARGET_AVX512BW")])
> > +(define_mode_iterator PMOV_SRC_MODE_3_AVX2 [V4DI V2DI V8SI V4SI
> > +V8HI])
> >  (define_mode_attr pmov_dst_3_lower
> >    [(V4DI "v4qi") (V2DI "v2qi") (V8SI "v8qi") (V4SI "v4qi") (V8HI
> > "v8qi")])  (define_mode_attr pmov_dst_3 @@ -14472,16 +14484,26 @@
> > (define_mode_attr pmov_suff_3  (define_expand
> > "trunc<mode><pmov_dst_3_lower>2"
> >    [(set (match_operand:<pmov_dst_3> 0 "register_operand")
> >         (truncate:<pmov_dst_3>
> > -         (match_operand:PMOV_SRC_MODE_3 1 "register_operand")))]
> > -  "TARGET_AVX512VL"
> > +         (match_operand:PMOV_SRC_MODE_3_AVX2 1 "register_operand")))]
> > + "TARGET_AVX2"
> >  {
> > -  rtx op0 = gen_reg_rtx (V16QImode);
> > +  if (TARGET_AVX512VL
> > +      && (<MODE>mode != V8HImode || TARGET_AVX512BW))
> > +    {
> > +       rtx op0 = gen_reg_rtx (V16QImode);
> >
> > -  emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>qi2
> > -            (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_3>mode)));
> > +       emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>qi2
> > +                (op0, operands[1], CONST0_RTX
> > + (<pmov_dst_zeroed_3>mode)));
> > +
> > +       emit_move_insn (operands[0],
> > +                      lowpart_subreg (<pmov_dst_3>mode, op0, V16QImode));
> > +    }
> > +  else
> > +    {
> > +      bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0],
> operands[1]);
> > +      gcc_assert (ok);
> > +    }
> >
> > -  emit_move_insn (operands[0],
> > -                 lowpart_subreg (<pmov_dst_3>mode, op0, V16QImode));
> >    DONE;
> >  })
> >
> > @@ -14853,15 +14875,24 @@ (define_expand
> "trunc<mode><pmov_dst_4_lower>2"
> >    [(set (match_operand:<pmov_dst_4> 0 "register_operand")
> >         (truncate:<pmov_dst_4>
> >           (match_operand:PMOV_SRC_MODE_4 1 "register_operand")))]
> > -  "TARGET_AVX512VL"
> > +  "TARGET_AVX2"
> >  {
> > -  rtx op0 = gen_reg_rtx (V8HImode);
> > +  if (TARGET_AVX512VL)
> > +    {
> > +      rtx op0 = gen_reg_rtx (V8HImode);
> >
> > -  emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>hi2
> > -            (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_4>mode)));
> > +      emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>hi2
> > +               (op0, operands[1], CONST0_RTX
> > + (<pmov_dst_zeroed_4>mode)));
> >
> > -  emit_move_insn (operands[0],
> > -                 lowpart_subreg (<pmov_dst_4>mode, op0, V8HImode));
> > +      emit_move_insn (operands[0],
> > +                     lowpart_subreg (<pmov_dst_4>mode, op0, V8HImode));
> > +      DONE;
> > +    }
> > +  else
> > +    {
> > +      bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0],
> operands[1]);
> > +      gcc_assert (ok);
> > +    }
> >    DONE;
> >  })
> >
> > @@ -15102,15 +15133,27 @@ (define_expand "truncv2div2si2"
> >    [(set (match_operand:V2SI 0 "register_operand")
> >         (truncate:V2SI
> >           (match_operand:V2DI 1 "register_operand")))]
> > -  "TARGET_AVX512VL"
> > +  "TARGET_AVX2"
> >  {
> > -  rtx op0 = gen_reg_rtx (V4SImode);
> > +  if (TARGET_AVX512VL)
> > +    {
> > +      rtx op0 = gen_reg_rtx (V4SImode);
> >
> > -  emit_insn (gen_avx512vl_truncatev2div2si2
> > -            (op0, operands[1], CONST0_RTX (V2SImode)));
> > +      emit_insn (gen_avx512vl_truncatev2div2si2
> > +               (op0, operands[1], CONST0_RTX (V2SImode)));
> >
> > -  emit_move_insn (operands[0],
> > -                 lowpart_subreg (V2SImode, op0, V4SImode));
> > +      emit_move_insn (operands[0],
> > +                     lowpart_subreg (V2SImode, op0, V4SImode));
> > +    }
> > +  else
> > +    {
> > +      rtx tmp = lowpart_subreg (V4SImode,
> > +                               force_reg (V2DImode, operands[1]), V2DImode);
> > +      rtx op0 = gen_reg_rtx (V4SImode);
> > +      emit_insn (gen_sse_shufps_v4si (op0, tmp, tmp, const0_rtx, GEN_INT (2),
> > +                                     GEN_INT (6), GEN_INT (7)));
> > +      emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0,
> V4SImode));
> > +    }
> >    DONE;
> >  })
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-8.c
> > b/gcc/testsuite/gcc.target/i386/pr107432-8.c
> > new file mode 100644
> > index 00000000000..f0d1ab028f7
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr107432-8.c
> > @@ -0,0 +1,73 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-march=x86-64-v3 -O2" } */
> > +/* { dg-final { scan-assembler-times "vshufps" 1 } } */
> > +/* { dg-final { scan-assembler-times "vpshufb" 15 } } */
> > +/* { dg-final { scan-assembler-times "vpermd" 1 } } */
> > +/* { dg-final { scan-assembler-times "vpermq" 5 } } */
> > +
> > +#include <x86intrin.h>
> > +
> > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef
> > +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char
> > +__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi
> > +__attribute__ ((__vector_size__ (8)));
> > +
> > +__v2si mm_cvtepi64_epi32_builtin_convertvector(__v2di a) {
> > +  return __builtin_convertvector((__v2di)a, __v2si); }
> > +
> > +__v4si mm256_cvtepi64_epi32_builtin_convertvector(__v4di a) {
> > +  return __builtin_convertvector((__v4di)a, __v4si); }
> > +
> > +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) {
> > +  return __builtin_convertvector((__v2di)a, __v2hi); }
> > +
> > +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) {
> > +  return __builtin_convertvector((__v4di)a, __v4hi); }
> > +
> > +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) {
> > +  return __builtin_convertvector((__v4si)a, __v4hi); }
> > +
> > +__v8hi mm256_cvtepi32_epi16_builtin_convertvector(__v8si a) {
> > +  return __builtin_convertvector((__v8si)a, __v8hi); }
> > +
> > +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) {
> > +  return __builtin_convertvector((__v2di)a, __v2qi); }
> > +
> > +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) {
> > +  return __builtin_convertvector((__v4di)a, __v4qi); }
> > +
> > +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a) {
> > +  return __builtin_convertvector((__v4si)a, __v4qi); }
> > +
> > +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) {
> > +  return __builtin_convertvector((__v8si)a, __v8qi); }
> > +
> > +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) {
> > +  return __builtin_convertvector((__v8hi)a, __v8qi); }
> > +
> > +__v16qi        mm256_cvtepi16_epi8_builtin_convertvector(__v16hi a)
> > +{
> > +  return __builtin_convertvector((__v16hi)a, __v16qi); }
> > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-9.c
> > b/gcc/testsuite/gcc.target/i386/pr107432-9.c
> > new file mode 100644
> > index 00000000000..650d352b945
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr107432-9.c
> > @@ -0,0 +1,121 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-march=x86-64-v3 -O2 -flax-vector-conversions" } */
> > +#include <x86intrin.h>
> > +
> > +#include "avx-check.h"
> > +
> > +#ifndef TEST
> > +#define TEST avx_test
> > +#endif
> > +
> > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef
> > +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char
> > +__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi
> > +__attribute__ ((__vector_size__ (8)));
> > +
> > +typedef union
> > +{
> > +  __v2si x;
> > +  int a[2];
> > +} union64i_d;
> > +
> > +typedef union
> > +{
> > +  __v2hi x;
> > +  short a[2];
> > +} union32i_w;
> > +
> > +typedef union
> > +{
> > +  __v4hi x;
> > +  short a[4];
> > +} union64i_w;
> > +
> > +typedef union
> > +{
> > +  __v2qi x;
> > +  char a[2];
> > +} union16i_b;
> > +
> > +typedef union
> > +{
> > +  __v4qi x;
> > +  char a[4];
> > +} union32i_b;
> > +
> > +typedef union
> > +{
> > +  __v8qi x;
> > +  char a[8];
> > +} union64i_b;
> > +
> > +#define CHECK_EXP_LESS128(UNION_TYPE, VALUE_TYPE, FMT)   \
> > +static int                                               \
> > +__attribute__((noinline, unused))                        \
> > +check_##UNION_TYPE (UNION_TYPE u, const VALUE_TYPE * v)          \
> > +{                                                        \
> > +  int i;                                                 \
> > +  int err = 0;                                           \
> > +                                                         \
> > +  for (i = 0; i < ARRAY_SIZE (u.a); i++)                 \
> > +    if (u.a[i] != v[i])                                          \
> > +      {                                                          \
> > +       err++;                                            \
> > +       PRINTF ("%i: " FMT " != " FMT "\n",               \
> > +               i, v[i], u.a[i]);                         \
> > +      }                                                          \
> > +  return err;                                            \
> > +}
> > +
> > +CHECK_EXP_LESS128 (union64i_d, int, "%d");
> > +CHECK_EXP_LESS128 (union32i_w, short, "%d");
> > +CHECK_EXP_LESS128 (union64i_w, short, "%d");
> > +CHECK_EXP_LESS128 (union16i_b, char, "%d");
> > +CHECK_EXP_LESS128 (union32i_b, char, "%d");
> > +CHECK_EXP_LESS128 (union64i_b, char, "%d");
> > +
> > +#define SUBTEST(INPUT_TYPE, OUTPUT_TYPE, OUTPUT_INNER, INIT_TYPE,
> CVT_TYPE) \
> > +void do_test##INIT_TYPE##CVT_TYPE ()                     \
> > +{                                                        \
> > +  INPUT_TYPE s;                                                  \
> > +  OUTPUT_TYPE r, ref;                                    \
> > +  for (int i = 0; i < ARRAY_SIZE (s.a); i++)             \
> > +    {                                                    \
> > +      s.a[i] = (i + 23415) * (i + 341);                          \
> > +      ref.a[i] = (OUTPUT_INNER) s.a[i];                          \
> > +    }                                                    \
> > +  r.x = __builtin_convertvector((INIT_TYPE)s.x, CVT_TYPE); \
> > +                                                         \
> > +  if (check_##OUTPUT_TYPE (r, ref.a))                    \
> > +    abort ();                                            \
> > +  return;                                                \
> > +}
> > +
> > +SUBTEST(union128i_q, union64i_d, int, __v2di, __v2si);
> > +SUBTEST(union256i_q, union128i_d, int, __v4di, __v4si);
> > +SUBTEST(union128i_q, union32i_w, short, __v2di, __v2hi);
> > +SUBTEST(union256i_q, union64i_w, short, __v4di, __v4hi);
> > +SUBTEST(union128i_d, union64i_w, short, __v4si, __v4hi);
> > +SUBTEST(union256i_d, union128i_w, short, __v8si, __v8hi);
> > +SUBTEST(union128i_q, union16i_b, char, __v2di, __v2qi);
> > +SUBTEST(union256i_q, union32i_b, char, __v4di,__v4qi);
> > +SUBTEST(union128i_d, union32i_b, char, __v4si, __v4qi);
> > +SUBTEST(union256i_d, union64i_b, char, __v8si, __v8qi);
> > +SUBTEST(union128i_w, union64i_b, char, __v8hi, __v8qi);
> > +SUBTEST(union256i_w, union128i_b, char, __v16hi, __v16qi);
> > +
> > +void TEST (void)
> > +{
> > +  do_test__v2di__v2si ();
> > +  do_test__v2di__v2hi ();
> > +  do_test__v2di__v2qi ();
> > +  do_test__v4di__v4si ();
> > +  do_test__v4di__v4hi ();
> > +  do_test__v4di__v4qi ();
> > +  do_test__v4si__v4hi ();
> > +  do_test__v4si__v4qi ();
> > +  do_test__v8si__v8hi ();
> > +  do_test__v8si__v8qi ();
> > +  do_test__v8hi__v8qi ();
> > +  do_test__v16hi__v16qi ();
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr92645-4.c
> > b/gcc/testsuite/gcc.target/i386/pr92645-4.c
> > index 28a3f9a3527..3aa49a3b654 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr92645-4.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr92645-4.c
> > @@ -52,5 +52,3 @@ void f(char *dst, char *src, unsigned long n, unsigned c)
> >     a uniform CTOR with a vector promotion to a CTOR on a promoted
> >     element.  */
> >  /* { dg-final { scan-tree-dump-times "\\(vector\\(16\\) short
> > unsigned int\\)" 2 "optimized" { xfail *-*-* } } } */
> > -/* { dg-final { scan-tree-dump-times "VEC_PACK_TRUNC" 1 "optimized" }
> > } */
> > -/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "optimized" }
> > } */
> > --
> > 2.31.1
> >
> 
> 
> --
> BR,
> Hongtao
Hongtao Liu May 23, 2024, 8:05 a.m. UTC | #3
On Thu, May 23, 2024 at 3:17 PM Hu, Lin1 <lin1.hu@intel.com> wrote:
>
> > -----Original Message-----
> > From: Hongtao Liu <crazylht@gmail.com>
> > Sent: Thursday, May 23, 2024 2:42 PM
> > To: Hu, Lin1 <lin1.hu@intel.com>
> > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>;
> > ubizjak@gmail.com; rguenther@suse.de
> > Subject: Re: [PATCH 3/3] vect: support direct conversion under x86-64-v3.
> >
> > On Thu, May 23, 2024 at 2:38 PM Hu, Lin1 <lin1.hu@intel.com> wrote:
> > >
> > > gcc/ChangeLog:
> > >
> > >         PR 107432
> > >         * config/i386/i386-expand.cc (ix86_expand_trunc_with_avx2_noavx512f):
> > >         New function for generate a series of suitable insn.
> > >         * config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f):
> > >         Define new function.
> > >         * config/i386/sse.md: Extend trunc<mode><mode>2 for x86-64-v3.
> > I have some concern for this patch since
> > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115069, let's hold on to this
> > patch.
>
> OK, maybe we need to modify ix86_expand_vec_perm_const_1, let it emit some better code. Maybe like clang (https://godbolt.org/z/rTKPq9oj5).
> Or we can disable some of the optimization via vpermq. In pr107432-8.c, there are only 5 tests that use vpermq.
After a second thought, we may go ahead with the patch, for PR115069,
there's an alternative to avoid cross-lane truncation.
But for this one, there's no alternative. Although cross-lane
permutation is not very efficient, it should still be better than
original code.
>
> BRs,
> Lin
>
> > > gcc/testsuite/ChangeLog:
> > >
> > >         PR 107432
> > >         * gcc.target/i386/pr107432-8.c: New test.
> > >         * gcc.target/i386/pr107432-9.c: Ditto.
> > >         * gcc.target/i386/pr92645-4.c: Modify test.
> > > ---
> > >  gcc/config/i386/i386-expand.cc             |  47 +++++++-
> > >  gcc/config/i386/i386-protos.h              |   3 +
> > >  gcc/config/i386/sse.md                     |  87 +++++++++++----
> > >  gcc/testsuite/gcc.target/i386/pr107432-8.c |  73 +++++++++++++
> > > gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 +++++++++++++++++++++
> > >  gcc/testsuite/gcc.target/i386/pr92645-4.c  |   2 -
> > >  6 files changed, 304 insertions(+), 29 deletions(-)  create mode
> > > 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c
> > >
> > > diff --git a/gcc/config/i386/i386-expand.cc
> > > b/gcc/config/i386/i386-expand.cc index 2f27bfb484c..bca8b85c9d1 100644
> > > --- a/gcc/config/i386/i386-expand.cc
> > > +++ b/gcc/config/i386/i386-expand.cc
> > > @@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[])
> > >    emit_insn (gen_xorv4si3 (value, value, large));  }
> > >
> > > -static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
> > > -                                                machine_mode mode, rtx target,
> > > -                                                rtx var, int one_var);
> > > -
> > >  /* Convert an unsigned DImode value into a DFmode, using only SSE.
> > >     Expects the 64-bit DImode to be supplied in a pair of integral
> > >     registers.  Requires SSE2; will use SSE3 if available.  For
> > > x86_32, @@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool
> > mmx_ok, machine_mode mode,
> > >     whose ONE_VAR element is VAR, and other elements are zero.  Return true
> > >     if successful.  */
> > >
> > > -static bool
> > > +bool
> > >  ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
> > >                                      rtx target, rtx var, int one_var)
> > > { @@ -25551,4 +25547,45 @@ ix86_expand_fast_convert_bf_to_sf (rtx val)
> > >    return ret;
> > >  }
> > >
> > > +/* Trunc a vector to a narrow vector, like v4di -> v4si.  */
> > > +
> > > +bool
> > > +ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input) {
> > > +  machine_mode out_mode = GET_MODE (output);
> > > +  machine_mode in_mode = GET_MODE (input);
> > > +  int len = GET_MODE_SIZE (in_mode);
> > > +  gcc_assert (len == 16 || len == 32);
> > > +  machine_mode cvt_mode = (len == 16) ? V16QImode : V32QImode;
> > > +  int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode));
> > > +  int out_innersize = GET_MODE_SIZE (GET_MODE_INNER (out_mode));
> > > +
> > > +  struct expand_vec_perm_d d;
> > > +  d.target = gen_reg_rtx (cvt_mode);
> > > +  d.op0 = lowpart_subreg (cvt_mode, force_reg (in_mode, input),
> > > + in_mode);
> > > +  d.op1 = d.op0;
> > > +  d.vmode = cvt_mode;
> > > +  d.nelt = len;
> > > +  d.testing_p = false;
> > > +  d.one_operand_p = true;
> > > +
> > > +  /* Init perm. Put the needed bits of input in order and
> > > +     fill the rest of bits by default.  */  int tot = 0;  for (int i
> > > + = 0; i < len; ++i)
> > > +    {
> > > +      d.perm[i] = i;
> > > +      if ((i % in_innersize) < out_innersize)
> > > +       d.perm[tot++] = i;
> > > +    }
> > > +
> > > +  if (ix86_expand_vec_perm_const_1(&d))
> > > +    {
> > > +      emit_move_insn (output, gen_lowpart (out_mode, d.target));
> > > +      return true;
> > > +    }
> > > +
> > > +  return false;
> > > +}
> > > +
> > >  #include "gt-i386-expand.h"
> > > diff --git a/gcc/config/i386/i386-protos.h
> > > b/gcc/config/i386/i386-protos.h index dbc861fb1ea..ac29fb34028 100644
> > > --- a/gcc/config/i386/i386-protos.h
> > > +++ b/gcc/config/i386/i386-protos.h
> > > @@ -242,6 +242,7 @@ extern void ix86_expand_atomic_fetch_op_loop (rtx,
> > > rtx, rtx, enum rtx_code,  extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx,
> > rtx, rtx, rtx,
> > >                                       bool, rtx_code_label *);  extern
> > > rtx ix86_expand_fast_convert_bf_to_sf (rtx);
> > > +extern bool ix86_expand_trunc_with_avx2_noavx512f (rtx, rtx);
> > >  extern rtx ix86_memtag_untagged_pointer (rtx, rtx);  extern bool
> > > ix86_memtag_can_tag_addresses (void);
> > >
> > > @@ -288,6 +289,8 @@ extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx,
> > > rtx);  extern void ix86_expand_sse2_abs (rtx, rtx);  extern bool
> > > ix86_expand_vector_init_duplicate (bool, machine_mode, rtx,
> > >                                                rtx);
> > > +extern bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
> > rtx,
> > > +                                                rtx, int);
> > >  extern bool ix86_extract_perm_from_pool_constant (int*, rtx);
> > >
> > >  /* In i386-c.cc  */
> > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index
> > > f57f36ae380..0b14b3dc1ac 100644
> > > --- a/gcc/config/i386/sse.md
> > > +++ b/gcc/config/i386/sse.md
> > > @@ -14373,14 +14373,25 @@ (define_expand
> > "avx512bw_<code>v32hiv32qi2_mask_store"
> > >
> > >  (define_mode_iterator PMOV_DST_MODE_2
> > >    [V4SI V8HI (V16QI "TARGET_AVX512BW")])
> > > +(define_mode_iterator PMOV_DST_MODE_2_AVX2
> > > +  [V4SI V8HI V16QI])
> > >  (define_mode_attr pmov_suff_2
> > >    [(V16QI "wb") (V8HI "dw") (V4SI "qd")])
> > >
> > >  (define_expand "trunc<ssedoublemodelower><mode>2"
> > > -  [(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand")
> > > -       (truncate:PMOV_DST_MODE_2
> > > +  [(set (match_operand:PMOV_DST_MODE_2_AVX2 0
> > "nonimmediate_operand")
> > > +       (truncate:PMOV_DST_MODE_2_AVX2
> > >           (match_operand:<ssedoublemode> 1 "register_operand")))]
> > > -  "TARGET_AVX512VL")
> > > +  "TARGET_AVX2"
> > > +{
> > > +  if (!TARGET_AVX512VL
> > > +      || (<MODE>mode == V16QImode && !TARGET_AVX512BW))
> > > +    {
> > > +      bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0],
> > operands[1]);
> > > +      gcc_assert (ok);
> > > +      DONE;
> > > +    }
> > > +})
> > >
> > >  (define_insn "*avx512vl_<code><ssedoublemodelower><mode>2"
> > >    [(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand"
> > > "=v,m") @@ -14460,6 +14471,7 @@ (define_expand
> > "<avx512>_<code><ssedoublemodelower><mode>2_mask_store"
> > >    "TARGET_AVX512VL")
> > >
> > >  (define_mode_iterator PMOV_SRC_MODE_3 [V4DI V2DI V8SI V4SI (V8HI
> > > "TARGET_AVX512BW")])
> > > +(define_mode_iterator PMOV_SRC_MODE_3_AVX2 [V4DI V2DI V8SI V4SI
> > > +V8HI])
> > >  (define_mode_attr pmov_dst_3_lower
> > >    [(V4DI "v4qi") (V2DI "v2qi") (V8SI "v8qi") (V4SI "v4qi") (V8HI
> > > "v8qi")])  (define_mode_attr pmov_dst_3 @@ -14472,16 +14484,26 @@
> > > (define_mode_attr pmov_suff_3  (define_expand
> > > "trunc<mode><pmov_dst_3_lower>2"
> > >    [(set (match_operand:<pmov_dst_3> 0 "register_operand")
> > >         (truncate:<pmov_dst_3>
> > > -         (match_operand:PMOV_SRC_MODE_3 1 "register_operand")))]
> > > -  "TARGET_AVX512VL"
> > > +         (match_operand:PMOV_SRC_MODE_3_AVX2 1 "register_operand")))]
> > > + "TARGET_AVX2"
> > >  {
> > > -  rtx op0 = gen_reg_rtx (V16QImode);
> > > +  if (TARGET_AVX512VL
> > > +      && (<MODE>mode != V8HImode || TARGET_AVX512BW))
> > > +    {
> > > +       rtx op0 = gen_reg_rtx (V16QImode);
> > >
> > > -  emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>qi2
> > > -            (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_3>mode)));
> > > +       emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>qi2
> > > +                (op0, operands[1], CONST0_RTX
> > > + (<pmov_dst_zeroed_3>mode)));
> > > +
> > > +       emit_move_insn (operands[0],
> > > +                      lowpart_subreg (<pmov_dst_3>mode, op0, V16QImode));
> > > +    }
> > > +  else
> > > +    {
> > > +      bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0],
> > operands[1]);
> > > +      gcc_assert (ok);
> > > +    }
> > >
> > > -  emit_move_insn (operands[0],
> > > -                 lowpart_subreg (<pmov_dst_3>mode, op0, V16QImode));
> > >    DONE;
> > >  })
> > >
> > > @@ -14853,15 +14875,24 @@ (define_expand
> > "trunc<mode><pmov_dst_4_lower>2"
> > >    [(set (match_operand:<pmov_dst_4> 0 "register_operand")
> > >         (truncate:<pmov_dst_4>
> > >           (match_operand:PMOV_SRC_MODE_4 1 "register_operand")))]
> > > -  "TARGET_AVX512VL"
> > > +  "TARGET_AVX2"
> > >  {
> > > -  rtx op0 = gen_reg_rtx (V8HImode);
> > > +  if (TARGET_AVX512VL)
> > > +    {
> > > +      rtx op0 = gen_reg_rtx (V8HImode);
> > >
> > > -  emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>hi2
> > > -            (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_4>mode)));
> > > +      emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>hi2
> > > +               (op0, operands[1], CONST0_RTX
> > > + (<pmov_dst_zeroed_4>mode)));
> > >
> > > -  emit_move_insn (operands[0],
> > > -                 lowpart_subreg (<pmov_dst_4>mode, op0, V8HImode));
> > > +      emit_move_insn (operands[0],
> > > +                     lowpart_subreg (<pmov_dst_4>mode, op0, V8HImode));
> > > +      DONE;
> > > +    }
> > > +  else
> > > +    {
> > > +      bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0],
> > operands[1]);
> > > +      gcc_assert (ok);
> > > +    }
> > >    DONE;
> > >  })
> > >
> > > @@ -15102,15 +15133,27 @@ (define_expand "truncv2div2si2"
> > >    [(set (match_operand:V2SI 0 "register_operand")
> > >         (truncate:V2SI
> > >           (match_operand:V2DI 1 "register_operand")))]
> > > -  "TARGET_AVX512VL"
> > > +  "TARGET_AVX2"
> > >  {
> > > -  rtx op0 = gen_reg_rtx (V4SImode);
> > > +  if (TARGET_AVX512VL)
> > > +    {
> > > +      rtx op0 = gen_reg_rtx (V4SImode);
> > >
> > > -  emit_insn (gen_avx512vl_truncatev2div2si2
> > > -            (op0, operands[1], CONST0_RTX (V2SImode)));
> > > +      emit_insn (gen_avx512vl_truncatev2div2si2
> > > +               (op0, operands[1], CONST0_RTX (V2SImode)));
> > >
> > > -  emit_move_insn (operands[0],
> > > -                 lowpart_subreg (V2SImode, op0, V4SImode));
> > > +      emit_move_insn (operands[0],
> > > +                     lowpart_subreg (V2SImode, op0, V4SImode));
> > > +    }
> > > +  else
> > > +    {
> > > +      rtx tmp = lowpart_subreg (V4SImode,
> > > +                               force_reg (V2DImode, operands[1]), V2DImode);
> > > +      rtx op0 = gen_reg_rtx (V4SImode);
> > > +      emit_insn (gen_sse_shufps_v4si (op0, tmp, tmp, const0_rtx, GEN_INT (2),
> > > +                                     GEN_INT (6), GEN_INT (7)));
> > > +      emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0,
> > V4SImode));
> > > +    }
> > >    DONE;
> > >  })
> > >
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-8.c
> > > b/gcc/testsuite/gcc.target/i386/pr107432-8.c
> > > new file mode 100644
> > > index 00000000000..f0d1ab028f7
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr107432-8.c
> > > @@ -0,0 +1,73 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-march=x86-64-v3 -O2" } */
> > > +/* { dg-final { scan-assembler-times "vshufps" 1 } } */
> > > +/* { dg-final { scan-assembler-times "vpshufb" 15 } } */
> > > +/* { dg-final { scan-assembler-times "vpermd" 1 } } */
> > > +/* { dg-final { scan-assembler-times "vpermq" 5 } } */
> > > +
> > > +#include <x86intrin.h>
> > > +
> > > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef
> > > +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char
> > > +__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi
> > > +__attribute__ ((__vector_size__ (8)));
> > > +
> > > +__v2si mm_cvtepi64_epi32_builtin_convertvector(__v2di a) {
> > > +  return __builtin_convertvector((__v2di)a, __v2si); }
> > > +
> > > +__v4si mm256_cvtepi64_epi32_builtin_convertvector(__v4di a) {
> > > +  return __builtin_convertvector((__v4di)a, __v4si); }
> > > +
> > > +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) {
> > > +  return __builtin_convertvector((__v2di)a, __v2hi); }
> > > +
> > > +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) {
> > > +  return __builtin_convertvector((__v4di)a, __v4hi); }
> > > +
> > > +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) {
> > > +  return __builtin_convertvector((__v4si)a, __v4hi); }
> > > +
> > > +__v8hi mm256_cvtepi32_epi16_builtin_convertvector(__v8si a) {
> > > +  return __builtin_convertvector((__v8si)a, __v8hi); }
> > > +
> > > +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) {
> > > +  return __builtin_convertvector((__v2di)a, __v2qi); }
> > > +
> > > +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) {
> > > +  return __builtin_convertvector((__v4di)a, __v4qi); }
> > > +
> > > +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a) {
> > > +  return __builtin_convertvector((__v4si)a, __v4qi); }
> > > +
> > > +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) {
> > > +  return __builtin_convertvector((__v8si)a, __v8qi); }
> > > +
> > > +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) {
> > > +  return __builtin_convertvector((__v8hi)a, __v8qi); }
> > > +
> > > +__v16qi        mm256_cvtepi16_epi8_builtin_convertvector(__v16hi a)
> > > +{
> > > +  return __builtin_convertvector((__v16hi)a, __v16qi); }
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-9.c
> > > b/gcc/testsuite/gcc.target/i386/pr107432-9.c
> > > new file mode 100644
> > > index 00000000000..650d352b945
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr107432-9.c
> > > @@ -0,0 +1,121 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-march=x86-64-v3 -O2 -flax-vector-conversions" } */
> > > +#include <x86intrin.h>
> > > +
> > > +#include "avx-check.h"
> > > +
> > > +#ifndef TEST
> > > +#define TEST avx_test
> > > +#endif
> > > +
> > > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef
> > > +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char
> > > +__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi
> > > +__attribute__ ((__vector_size__ (8)));
> > > +
> > > +typedef union
> > > +{
> > > +  __v2si x;
> > > +  int a[2];
> > > +} union64i_d;
> > > +
> > > +typedef union
> > > +{
> > > +  __v2hi x;
> > > +  short a[2];
> > > +} union32i_w;
> > > +
> > > +typedef union
> > > +{
> > > +  __v4hi x;
> > > +  short a[4];
> > > +} union64i_w;
> > > +
> > > +typedef union
> > > +{
> > > +  __v2qi x;
> > > +  char a[2];
> > > +} union16i_b;
> > > +
> > > +typedef union
> > > +{
> > > +  __v4qi x;
> > > +  char a[4];
> > > +} union32i_b;
> > > +
> > > +typedef union
> > > +{
> > > +  __v8qi x;
> > > +  char a[8];
> > > +} union64i_b;
> > > +
> > > +#define CHECK_EXP_LESS128(UNION_TYPE, VALUE_TYPE, FMT)   \
> > > +static int                                               \
> > > +__attribute__((noinline, unused))                        \
> > > +check_##UNION_TYPE (UNION_TYPE u, const VALUE_TYPE * v)          \
> > > +{                                                        \
> > > +  int i;                                                 \
> > > +  int err = 0;                                           \
> > > +                                                         \
> > > +  for (i = 0; i < ARRAY_SIZE (u.a); i++)                 \
> > > +    if (u.a[i] != v[i])                                          \
> > > +      {                                                          \
> > > +       err++;                                            \
> > > +       PRINTF ("%i: " FMT " != " FMT "\n",               \
> > > +               i, v[i], u.a[i]);                         \
> > > +      }                                                          \
> > > +  return err;                                            \
> > > +}
> > > +
> > > +CHECK_EXP_LESS128 (union64i_d, int, "%d");
> > > +CHECK_EXP_LESS128 (union32i_w, short, "%d");
> > > +CHECK_EXP_LESS128 (union64i_w, short, "%d");
> > > +CHECK_EXP_LESS128 (union16i_b, char, "%d");
> > > +CHECK_EXP_LESS128 (union32i_b, char, "%d");
> > > +CHECK_EXP_LESS128 (union64i_b, char, "%d");
> > > +
> > > +#define SUBTEST(INPUT_TYPE, OUTPUT_TYPE, OUTPUT_INNER, INIT_TYPE,
> > CVT_TYPE) \
> > > +void do_test##INIT_TYPE##CVT_TYPE ()                     \
> > > +{                                                        \
> > > +  INPUT_TYPE s;                                                  \
> > > +  OUTPUT_TYPE r, ref;                                    \
> > > +  for (int i = 0; i < ARRAY_SIZE (s.a); i++)             \
> > > +    {                                                    \
> > > +      s.a[i] = (i + 23415) * (i + 341);                          \
> > > +      ref.a[i] = (OUTPUT_INNER) s.a[i];                          \
> > > +    }                                                    \
> > > +  r.x = __builtin_convertvector((INIT_TYPE)s.x, CVT_TYPE); \
> > > +                                                         \
> > > +  if (check_##OUTPUT_TYPE (r, ref.a))                    \
> > > +    abort ();                                            \
> > > +  return;                                                \
> > > +}
> > > +
> > > +SUBTEST(union128i_q, union64i_d, int, __v2di, __v2si);
> > > +SUBTEST(union256i_q, union128i_d, int, __v4di, __v4si);
> > > +SUBTEST(union128i_q, union32i_w, short, __v2di, __v2hi);
> > > +SUBTEST(union256i_q, union64i_w, short, __v4di, __v4hi);
> > > +SUBTEST(union128i_d, union64i_w, short, __v4si, __v4hi);
> > > +SUBTEST(union256i_d, union128i_w, short, __v8si, __v8hi);
> > > +SUBTEST(union128i_q, union16i_b, char, __v2di, __v2qi);
> > > +SUBTEST(union256i_q, union32i_b, char, __v4di,__v4qi);
> > > +SUBTEST(union128i_d, union32i_b, char, __v4si, __v4qi);
> > > +SUBTEST(union256i_d, union64i_b, char, __v8si, __v8qi);
> > > +SUBTEST(union128i_w, union64i_b, char, __v8hi, __v8qi);
> > > +SUBTEST(union256i_w, union128i_b, char, __v16hi, __v16qi);
> > > +
> > > +void TEST (void)
> > > +{
> > > +  do_test__v2di__v2si ();
> > > +  do_test__v2di__v2hi ();
> > > +  do_test__v2di__v2qi ();
> > > +  do_test__v4di__v4si ();
> > > +  do_test__v4di__v4hi ();
> > > +  do_test__v4di__v4qi ();
> > > +  do_test__v4si__v4hi ();
> > > +  do_test__v4si__v4qi ();
> > > +  do_test__v8si__v8hi ();
> > > +  do_test__v8si__v8qi ();
> > > +  do_test__v8hi__v8qi ();
> > > +  do_test__v16hi__v16qi ();
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr92645-4.c
> > > b/gcc/testsuite/gcc.target/i386/pr92645-4.c
> > > index 28a3f9a3527..3aa49a3b654 100644
> > > --- a/gcc/testsuite/gcc.target/i386/pr92645-4.c
> > > +++ b/gcc/testsuite/gcc.target/i386/pr92645-4.c
> > > @@ -52,5 +52,3 @@ void f(char *dst, char *src, unsigned long n, unsigned c)
> > >     a uniform CTOR with a vector promotion to a CTOR on a promoted
> > >     element.  */
> > >  /* { dg-final { scan-tree-dump-times "\\(vector\\(16\\) short
> > > unsigned int\\)" 2 "optimized" { xfail *-*-* } } } */
> > > -/* { dg-final { scan-tree-dump-times "VEC_PACK_TRUNC" 1 "optimized" }
> > > } */
> > > -/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "optimized" }
> > > } */
> > > --
> > > 2.31.1
> > >
> >
> >
> > --
> > BR,
> > Hongtao
diff mbox series

Patch

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 2f27bfb484c..bca8b85c9d1 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -1896,10 +1896,6 @@  ix86_split_convert_uns_si_sse (rtx operands[])
   emit_insn (gen_xorv4si3 (value, value, large));
 }
 
-static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
-						 machine_mode mode, rtx target,
-						 rtx var, int one_var);
-
 /* Convert an unsigned DImode value into a DFmode, using only SSE.
    Expects the 64-bit DImode to be supplied in a pair of integral
    registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
@@ -16418,7 +16414,7 @@  ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
    whose ONE_VAR element is VAR, and other elements are zero.  Return true
    if successful.  */
 
-static bool
+bool
 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
 				     rtx target, rtx var, int one_var)
 {
@@ -25551,4 +25547,45 @@  ix86_expand_fast_convert_bf_to_sf (rtx val)
   return ret;
 }
 
+/* Trunc a vector to a narrow vector, like v4di -> v4si.  */
+
+bool
+ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input)
+{
+  machine_mode out_mode = GET_MODE (output);
+  machine_mode in_mode = GET_MODE (input);
+  int len = GET_MODE_SIZE (in_mode);
+  gcc_assert (len == 16 || len == 32);
+  machine_mode cvt_mode = (len == 16) ? V16QImode : V32QImode;
+  int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode));
+  int out_innersize = GET_MODE_SIZE (GET_MODE_INNER (out_mode));
+
+  struct expand_vec_perm_d d;
+  d.target = gen_reg_rtx (cvt_mode);
+  d.op0 = lowpart_subreg (cvt_mode, force_reg (in_mode, input), in_mode);
+  d.op1 = d.op0;
+  d.vmode = cvt_mode;
+  d.nelt = len;
+  d.testing_p = false;
+  d.one_operand_p = true;
+
+  /* Init perm. Put the needed bits of input in order and
+     fill the rest of bits by default.  */
+  int tot = 0;
+  for (int i = 0; i < len; ++i)
+    {
+      d.perm[i] = i;
+      if ((i % in_innersize) < out_innersize)
+	d.perm[tot++] = i;
+    }
+
+  if (ix86_expand_vec_perm_const_1(&d))
+    {
+      emit_move_insn (output, gen_lowpart (out_mode, d.target));
+      return true;
+    }
+
+  return false;
+}
+
 #include "gt-i386-expand.h"
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index dbc861fb1ea..ac29fb34028 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -242,6 +242,7 @@  extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, rtx, enum rtx_code,
 extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx, rtx, rtx, rtx,
 				      bool, rtx_code_label *);
 extern rtx ix86_expand_fast_convert_bf_to_sf (rtx);
+extern bool ix86_expand_trunc_with_avx2_noavx512f (rtx, rtx);
 extern rtx ix86_memtag_untagged_pointer (rtx, rtx);
 extern bool ix86_memtag_can_tag_addresses (void);
 
@@ -288,6 +289,8 @@  extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
 extern void ix86_expand_sse2_abs (rtx, rtx);
 extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx,
 					       rtx);
+extern bool ix86_expand_vector_init_one_nonzero (bool, machine_mode, rtx,
+						 rtx, int);
 extern bool ix86_extract_perm_from_pool_constant (int*, rtx);
 
 /* In i386-c.cc  */
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index f57f36ae380..0b14b3dc1ac 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -14373,14 +14373,25 @@  (define_expand "avx512bw_<code>v32hiv32qi2_mask_store"
 
 (define_mode_iterator PMOV_DST_MODE_2
   [V4SI V8HI (V16QI "TARGET_AVX512BW")])
+(define_mode_iterator PMOV_DST_MODE_2_AVX2
+  [V4SI V8HI V16QI])
 (define_mode_attr pmov_suff_2
   [(V16QI "wb") (V8HI "dw") (V4SI "qd")])
 
 (define_expand "trunc<ssedoublemodelower><mode>2"
-  [(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand")
-	(truncate:PMOV_DST_MODE_2
+  [(set (match_operand:PMOV_DST_MODE_2_AVX2 0 "nonimmediate_operand")
+	(truncate:PMOV_DST_MODE_2_AVX2
 	  (match_operand:<ssedoublemode> 1 "register_operand")))]
-  "TARGET_AVX512VL")
+  "TARGET_AVX2"
+{
+  if (!TARGET_AVX512VL
+      || (<MODE>mode == V16QImode && !TARGET_AVX512BW))
+    {
+      bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0], operands[1]);
+      gcc_assert (ok);
+      DONE;
+    }
+})
 
 (define_insn "*avx512vl_<code><ssedoublemodelower><mode>2"
   [(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand" "=v,m")
@@ -14460,6 +14471,7 @@  (define_expand "<avx512>_<code><ssedoublemodelower><mode>2_mask_store"
   "TARGET_AVX512VL")
 
 (define_mode_iterator PMOV_SRC_MODE_3 [V4DI V2DI V8SI V4SI (V8HI "TARGET_AVX512BW")])
+(define_mode_iterator PMOV_SRC_MODE_3_AVX2 [V4DI V2DI V8SI V4SI V8HI])
 (define_mode_attr pmov_dst_3_lower
   [(V4DI "v4qi") (V2DI "v2qi") (V8SI "v8qi") (V4SI "v4qi") (V8HI "v8qi")])
 (define_mode_attr pmov_dst_3
@@ -14472,16 +14484,26 @@  (define_mode_attr pmov_suff_3
 (define_expand "trunc<mode><pmov_dst_3_lower>2"
   [(set (match_operand:<pmov_dst_3> 0 "register_operand")
 	(truncate:<pmov_dst_3>
-	  (match_operand:PMOV_SRC_MODE_3 1 "register_operand")))]
-  "TARGET_AVX512VL"
+	  (match_operand:PMOV_SRC_MODE_3_AVX2 1 "register_operand")))]
+  "TARGET_AVX2"
 {
-  rtx op0 = gen_reg_rtx (V16QImode);
+  if (TARGET_AVX512VL
+      && (<MODE>mode != V8HImode || TARGET_AVX512BW))
+    {
+       rtx op0 = gen_reg_rtx (V16QImode);
 
-  emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>qi2
-	     (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_3>mode)));
+       emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>qi2
+	         (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_3>mode)));
+
+       emit_move_insn (operands[0],
+		       lowpart_subreg (<pmov_dst_3>mode, op0, V16QImode));
+    }
+  else
+    {
+      bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0], operands[1]);
+      gcc_assert (ok);
+    }
 
-  emit_move_insn (operands[0],
-		  lowpart_subreg (<pmov_dst_3>mode, op0, V16QImode));
   DONE;
 })
 
@@ -14853,15 +14875,24 @@  (define_expand "trunc<mode><pmov_dst_4_lower>2"
   [(set (match_operand:<pmov_dst_4> 0 "register_operand")
 	(truncate:<pmov_dst_4>
 	  (match_operand:PMOV_SRC_MODE_4 1 "register_operand")))]
-  "TARGET_AVX512VL"
+  "TARGET_AVX2"
 {
-  rtx op0 = gen_reg_rtx (V8HImode);
+  if (TARGET_AVX512VL)
+    {
+      rtx op0 = gen_reg_rtx (V8HImode);
 
-  emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>hi2
-	     (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_4>mode)));
+      emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>hi2
+		(op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_4>mode)));
 
-  emit_move_insn (operands[0],
-		  lowpart_subreg (<pmov_dst_4>mode, op0, V8HImode));
+      emit_move_insn (operands[0],
+		      lowpart_subreg (<pmov_dst_4>mode, op0, V8HImode));
+      DONE;
+    }
+  else
+    {
+      bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0], operands[1]);
+      gcc_assert (ok);
+    }
   DONE;
 })
 
@@ -15102,15 +15133,27 @@  (define_expand "truncv2div2si2"
   [(set (match_operand:V2SI 0 "register_operand")
 	(truncate:V2SI
 	  (match_operand:V2DI 1 "register_operand")))]
-  "TARGET_AVX512VL"
+  "TARGET_AVX2"
 {
-  rtx op0 = gen_reg_rtx (V4SImode);
+  if (TARGET_AVX512VL)
+    {
+      rtx op0 = gen_reg_rtx (V4SImode);
 
-  emit_insn (gen_avx512vl_truncatev2div2si2
-	     (op0, operands[1], CONST0_RTX (V2SImode)));
+      emit_insn (gen_avx512vl_truncatev2div2si2
+		(op0, operands[1], CONST0_RTX (V2SImode)));
 
-  emit_move_insn (operands[0],
-		  lowpart_subreg (V2SImode, op0, V4SImode));
+      emit_move_insn (operands[0],
+		      lowpart_subreg (V2SImode, op0, V4SImode));
+    }
+  else
+    {
+      rtx tmp = lowpart_subreg (V4SImode,
+				force_reg (V2DImode, operands[1]), V2DImode);
+      rtx op0 = gen_reg_rtx (V4SImode);
+      emit_insn (gen_sse_shufps_v4si (op0, tmp, tmp, const0_rtx, GEN_INT (2),
+				      GEN_INT (6), GEN_INT (7)));
+      emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0, V4SImode));
+    }
   DONE;
 })
 
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-8.c b/gcc/testsuite/gcc.target/i386/pr107432-8.c
new file mode 100644
index 00000000000..f0d1ab028f7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-8.c
@@ -0,0 +1,73 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -O2" } */
+/* { dg-final { scan-assembler-times "vshufps" 1 } } */
+/* { dg-final { scan-assembler-times "vpshufb" 15 } } */
+/* { dg-final { scan-assembler-times "vpermd" 1 } } */
+/* { dg-final { scan-assembler-times "vpermq" 5 } } */
+
+#include <x86intrin.h>
+
+typedef short __v2hi __attribute__ ((__vector_size__ (4)));
+typedef char __v2qi __attribute__ ((__vector_size__ (2)));
+typedef char __v4qi __attribute__ ((__vector_size__ (4)));
+typedef char __v8qi __attribute__ ((__vector_size__ (8)));
+
+__v2si mm_cvtepi64_epi32_builtin_convertvector(__v2di a)
+{
+  return __builtin_convertvector((__v2di)a, __v2si);
+}
+
+__v4si	mm256_cvtepi64_epi32_builtin_convertvector(__v4di a)
+{
+  return __builtin_convertvector((__v4di)a, __v4si);
+}
+
+__v2hi	mm_cvtepi64_epi16_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2hi);
+}
+
+__v4hi	mm256_cvtepi64_epi16_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v4di)a, __v4hi);
+}
+
+__v4hi	mm_cvtepi32_epi16_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v4si)a, __v4hi);
+}
+
+__v8hi	mm256_cvtepi32_epi16_builtin_convertvector(__v8si a)
+{
+  return __builtin_convertvector((__v8si)a, __v8hi);
+}
+
+__v2qi	mm_cvtepi64_epi8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2qi);
+}
+
+__v4qi	mm256_cvtepi64_epi8_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v4di)a, __v4qi);
+}
+
+__v4qi	mm_cvtepi32_epi8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v4si)a, __v4qi);
+}
+
+__v8qi	mm256_cvtepi32_epi8_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v8si)a, __v8qi);
+}
+
+__v8qi	mm_cvtepi16_epi8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v8hi)a, __v8qi);
+}
+
+__v16qi	mm256_cvtepi16_epi8_builtin_convertvector(__v16hi a)
+{
+  return __builtin_convertvector((__v16hi)a, __v16qi);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-9.c b/gcc/testsuite/gcc.target/i386/pr107432-9.c
new file mode 100644
index 00000000000..650d352b945
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-9.c
@@ -0,0 +1,121 @@ 
+/* { dg-do run } */
+/* { dg-options "-march=x86-64-v3 -O2 -flax-vector-conversions" } */
+#include <x86intrin.h>
+
+#include "avx-check.h"
+
+#ifndef TEST
+#define TEST avx_test
+#endif
+
+typedef short __v2hi __attribute__ ((__vector_size__ (4)));
+typedef char __v2qi __attribute__ ((__vector_size__ (2)));
+typedef char __v4qi __attribute__ ((__vector_size__ (4)));
+typedef char __v8qi __attribute__ ((__vector_size__ (8)));
+
+typedef union
+{
+  __v2si x;
+  int a[2];
+} union64i_d;
+
+typedef union
+{
+  __v2hi x;
+  short a[2];
+} union32i_w;
+
+typedef union
+{
+  __v4hi x;
+  short a[4];
+} union64i_w;
+
+typedef union
+{
+  __v2qi x;
+  char a[2];
+} union16i_b;
+
+typedef union
+{
+  __v4qi x;
+  char a[4];
+} union32i_b;
+
+typedef union
+{
+  __v8qi x;
+  char a[8];
+} union64i_b;
+
+#define CHECK_EXP_LESS128(UNION_TYPE, VALUE_TYPE, FMT)	  \
+static int						  \
+__attribute__((noinline, unused))			  \
+check_##UNION_TYPE (UNION_TYPE u, const VALUE_TYPE * v)	  \
+{							  \
+  int i;						  \
+  int err = 0;						  \
+							  \
+  for (i = 0; i < ARRAY_SIZE (u.a); i++)		  \
+    if (u.a[i] != v[i])					  \
+      {							  \
+	err++;						  \
+	PRINTF ("%i: " FMT " != " FMT "\n",		  \
+		i, v[i], u.a[i]);			  \
+      }							  \
+  return err;						  \
+}
+
+CHECK_EXP_LESS128 (union64i_d, int, "%d");
+CHECK_EXP_LESS128 (union32i_w, short, "%d");
+CHECK_EXP_LESS128 (union64i_w, short, "%d");
+CHECK_EXP_LESS128 (union16i_b, char, "%d");
+CHECK_EXP_LESS128 (union32i_b, char, "%d");
+CHECK_EXP_LESS128 (union64i_b, char, "%d");
+
+#define SUBTEST(INPUT_TYPE, OUTPUT_TYPE, OUTPUT_INNER, INIT_TYPE, CVT_TYPE) \
+void do_test##INIT_TYPE##CVT_TYPE ()			  \
+{							  \
+  INPUT_TYPE s;						  \
+  OUTPUT_TYPE r, ref;					  \
+  for (int i = 0; i < ARRAY_SIZE (s.a); i++)		  \
+    {							  \
+      s.a[i] = (i + 23415) * (i + 341);			  \
+      ref.a[i] = (OUTPUT_INNER) s.a[i];			  \
+    }							  \
+  r.x = __builtin_convertvector((INIT_TYPE)s.x, CVT_TYPE); \
+							  \
+  if (check_##OUTPUT_TYPE (r, ref.a))			  \
+    abort ();						  \
+  return;						  \
+}
+
+SUBTEST(union128i_q, union64i_d, int, __v2di, __v2si);
+SUBTEST(union256i_q, union128i_d, int, __v4di, __v4si);
+SUBTEST(union128i_q, union32i_w, short, __v2di, __v2hi);
+SUBTEST(union256i_q, union64i_w, short, __v4di, __v4hi);
+SUBTEST(union128i_d, union64i_w, short, __v4si, __v4hi);
+SUBTEST(union256i_d, union128i_w, short, __v8si, __v8hi);
+SUBTEST(union128i_q, union16i_b, char, __v2di, __v2qi);
+SUBTEST(union256i_q, union32i_b, char, __v4di,__v4qi);
+SUBTEST(union128i_d, union32i_b, char, __v4si, __v4qi);
+SUBTEST(union256i_d, union64i_b, char, __v8si, __v8qi);
+SUBTEST(union128i_w, union64i_b, char, __v8hi, __v8qi);
+SUBTEST(union256i_w, union128i_b, char, __v16hi, __v16qi);
+
+void TEST (void)
+{
+  do_test__v2di__v2si ();
+  do_test__v2di__v2hi ();
+  do_test__v2di__v2qi ();
+  do_test__v4di__v4si ();
+  do_test__v4di__v4hi ();
+  do_test__v4di__v4qi ();
+  do_test__v4si__v4hi ();
+  do_test__v4si__v4qi ();
+  do_test__v8si__v8hi ();
+  do_test__v8si__v8qi ();
+  do_test__v8hi__v8qi ();
+  do_test__v16hi__v16qi ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92645-4.c b/gcc/testsuite/gcc.target/i386/pr92645-4.c
index 28a3f9a3527..3aa49a3b654 100644
--- a/gcc/testsuite/gcc.target/i386/pr92645-4.c
+++ b/gcc/testsuite/gcc.target/i386/pr92645-4.c
@@ -52,5 +52,3 @@  void f(char *dst, char *src, unsigned long n, unsigned c)
    a uniform CTOR with a vector promotion to a CTOR on a promoted
    element.  */
 /* { dg-final { scan-tree-dump-times "\\(vector\\(16\\) short unsigned int\\)" 2 "optimized" { xfail *-*-* } } } */
-/* { dg-final { scan-tree-dump-times "VEC_PACK_TRUNC" 1 "optimized" } } */
-/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "optimized" } } */