Message ID | 20240523063742.2333446-4-lin1.hu@intel.com |
---|---|
State | New |
Headers | show |
Series | Optimize __builtin_convertvector for x86-64-v4 and | expand |
On Thu, May 23, 2024 at 2:38 PM Hu, Lin1 <lin1.hu@intel.com> wrote: > > gcc/ChangeLog: > > PR 107432 > * config/i386/i386-expand.cc (ix86_expand_trunc_with_avx2_noavx512f): > New function for generate a series of suitable insn. > * config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f): > Define new function. > * config/i386/sse.md: Extend trunc<mode><mode>2 for x86-64-v3. I have some concern for this patch since https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115069, let's hold on to this patch. > gcc/testsuite/ChangeLog: > > PR 107432 > * gcc.target/i386/pr107432-8.c: New test. > * gcc.target/i386/pr107432-9.c: Ditto. > * gcc.target/i386/pr92645-4.c: Modify test. > --- > gcc/config/i386/i386-expand.cc | 47 +++++++- > gcc/config/i386/i386-protos.h | 3 + > gcc/config/i386/sse.md | 87 +++++++++++---- > gcc/testsuite/gcc.target/i386/pr107432-8.c | 73 +++++++++++++ > gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 +++++++++++++++++++++ > gcc/testsuite/gcc.target/i386/pr92645-4.c | 2 - > 6 files changed, 304 insertions(+), 29 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc > index 2f27bfb484c..bca8b85c9d1 100644 > --- a/gcc/config/i386/i386-expand.cc > +++ b/gcc/config/i386/i386-expand.cc > @@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[]) > emit_insn (gen_xorv4si3 (value, value, large)); > } > > -static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok, > - machine_mode mode, rtx target, > - rtx var, int one_var); > - > /* Convert an unsigned DImode value into a DFmode, using only SSE. > Expects the 64-bit DImode to be supplied in a pair of integral > registers. Requires SSE2; will use SSE3 if available. For x86_32, > @@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, > whose ONE_VAR element is VAR, and other elements are zero. Return true > if successful. */ > > -static bool > +bool > ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, > rtx target, rtx var, int one_var) > { > @@ -25551,4 +25547,45 @@ ix86_expand_fast_convert_bf_to_sf (rtx val) > return ret; > } > > +/* Trunc a vector to a narrow vector, like v4di -> v4si. */ > + > +bool > +ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input) > +{ > + machine_mode out_mode = GET_MODE (output); > + machine_mode in_mode = GET_MODE (input); > + int len = GET_MODE_SIZE (in_mode); > + gcc_assert (len == 16 || len == 32); > + machine_mode cvt_mode = (len == 16) ? V16QImode : V32QImode; > + int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode)); > + int out_innersize = GET_MODE_SIZE (GET_MODE_INNER (out_mode)); > + > + struct expand_vec_perm_d d; > + d.target = gen_reg_rtx (cvt_mode); > + d.op0 = lowpart_subreg (cvt_mode, force_reg (in_mode, input), in_mode); > + d.op1 = d.op0; > + d.vmode = cvt_mode; > + d.nelt = len; > + d.testing_p = false; > + d.one_operand_p = true; > + > + /* Init perm. Put the needed bits of input in order and > + fill the rest of bits by default. */ > + int tot = 0; > + for (int i = 0; i < len; ++i) > + { > + d.perm[i] = i; > + if ((i % in_innersize) < out_innersize) > + d.perm[tot++] = i; > + } > + > + if (ix86_expand_vec_perm_const_1(&d)) > + { > + emit_move_insn (output, gen_lowpart (out_mode, d.target)); > + return true; > + } > + > + return false; > +} > + > #include "gt-i386-expand.h" > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h > index dbc861fb1ea..ac29fb34028 100644 > --- a/gcc/config/i386/i386-protos.h > +++ b/gcc/config/i386/i386-protos.h > @@ -242,6 +242,7 @@ extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, rtx, enum rtx_code, > extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx, rtx, rtx, rtx, > bool, rtx_code_label *); > extern rtx ix86_expand_fast_convert_bf_to_sf (rtx); > +extern bool ix86_expand_trunc_with_avx2_noavx512f (rtx, rtx); > extern rtx ix86_memtag_untagged_pointer (rtx, rtx); > extern bool ix86_memtag_can_tag_addresses (void); > > @@ -288,6 +289,8 @@ extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx); > extern void ix86_expand_sse2_abs (rtx, rtx); > extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx, > rtx); > +extern bool ix86_expand_vector_init_one_nonzero (bool, machine_mode, rtx, > + rtx, int); > extern bool ix86_extract_perm_from_pool_constant (int*, rtx); > > /* In i386-c.cc */ > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index f57f36ae380..0b14b3dc1ac 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -14373,14 +14373,25 @@ (define_expand "avx512bw_<code>v32hiv32qi2_mask_store" > > (define_mode_iterator PMOV_DST_MODE_2 > [V4SI V8HI (V16QI "TARGET_AVX512BW")]) > +(define_mode_iterator PMOV_DST_MODE_2_AVX2 > + [V4SI V8HI V16QI]) > (define_mode_attr pmov_suff_2 > [(V16QI "wb") (V8HI "dw") (V4SI "qd")]) > > (define_expand "trunc<ssedoublemodelower><mode>2" > - [(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand") > - (truncate:PMOV_DST_MODE_2 > + [(set (match_operand:PMOV_DST_MODE_2_AVX2 0 "nonimmediate_operand") > + (truncate:PMOV_DST_MODE_2_AVX2 > (match_operand:<ssedoublemode> 1 "register_operand")))] > - "TARGET_AVX512VL") > + "TARGET_AVX2" > +{ > + if (!TARGET_AVX512VL > + || (<MODE>mode == V16QImode && !TARGET_AVX512BW)) > + { > + bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0], operands[1]); > + gcc_assert (ok); > + DONE; > + } > +}) > > (define_insn "*avx512vl_<code><ssedoublemodelower><mode>2" > [(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand" "=v,m") > @@ -14460,6 +14471,7 @@ (define_expand "<avx512>_<code><ssedoublemodelower><mode>2_mask_store" > "TARGET_AVX512VL") > > (define_mode_iterator PMOV_SRC_MODE_3 [V4DI V2DI V8SI V4SI (V8HI "TARGET_AVX512BW")]) > +(define_mode_iterator PMOV_SRC_MODE_3_AVX2 [V4DI V2DI V8SI V4SI V8HI]) > (define_mode_attr pmov_dst_3_lower > [(V4DI "v4qi") (V2DI "v2qi") (V8SI "v8qi") (V4SI "v4qi") (V8HI "v8qi")]) > (define_mode_attr pmov_dst_3 > @@ -14472,16 +14484,26 @@ (define_mode_attr pmov_suff_3 > (define_expand "trunc<mode><pmov_dst_3_lower>2" > [(set (match_operand:<pmov_dst_3> 0 "register_operand") > (truncate:<pmov_dst_3> > - (match_operand:PMOV_SRC_MODE_3 1 "register_operand")))] > - "TARGET_AVX512VL" > + (match_operand:PMOV_SRC_MODE_3_AVX2 1 "register_operand")))] > + "TARGET_AVX2" > { > - rtx op0 = gen_reg_rtx (V16QImode); > + if (TARGET_AVX512VL > + && (<MODE>mode != V8HImode || TARGET_AVX512BW)) > + { > + rtx op0 = gen_reg_rtx (V16QImode); > > - emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>qi2 > - (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_3>mode))); > + emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>qi2 > + (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_3>mode))); > + > + emit_move_insn (operands[0], > + lowpart_subreg (<pmov_dst_3>mode, op0, V16QImode)); > + } > + else > + { > + bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0], operands[1]); > + gcc_assert (ok); > + } > > - emit_move_insn (operands[0], > - lowpart_subreg (<pmov_dst_3>mode, op0, V16QImode)); > DONE; > }) > > @@ -14853,15 +14875,24 @@ (define_expand "trunc<mode><pmov_dst_4_lower>2" > [(set (match_operand:<pmov_dst_4> 0 "register_operand") > (truncate:<pmov_dst_4> > (match_operand:PMOV_SRC_MODE_4 1 "register_operand")))] > - "TARGET_AVX512VL" > + "TARGET_AVX2" > { > - rtx op0 = gen_reg_rtx (V8HImode); > + if (TARGET_AVX512VL) > + { > + rtx op0 = gen_reg_rtx (V8HImode); > > - emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>hi2 > - (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_4>mode))); > + emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>hi2 > + (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_4>mode))); > > - emit_move_insn (operands[0], > - lowpart_subreg (<pmov_dst_4>mode, op0, V8HImode)); > + emit_move_insn (operands[0], > + lowpart_subreg (<pmov_dst_4>mode, op0, V8HImode)); > + DONE; > + } > + else > + { > + bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0], operands[1]); > + gcc_assert (ok); > + } > DONE; > }) > > @@ -15102,15 +15133,27 @@ (define_expand "truncv2div2si2" > [(set (match_operand:V2SI 0 "register_operand") > (truncate:V2SI > (match_operand:V2DI 1 "register_operand")))] > - "TARGET_AVX512VL" > + "TARGET_AVX2" > { > - rtx op0 = gen_reg_rtx (V4SImode); > + if (TARGET_AVX512VL) > + { > + rtx op0 = gen_reg_rtx (V4SImode); > > - emit_insn (gen_avx512vl_truncatev2div2si2 > - (op0, operands[1], CONST0_RTX (V2SImode))); > + emit_insn (gen_avx512vl_truncatev2div2si2 > + (op0, operands[1], CONST0_RTX (V2SImode))); > > - emit_move_insn (operands[0], > - lowpart_subreg (V2SImode, op0, V4SImode)); > + emit_move_insn (operands[0], > + lowpart_subreg (V2SImode, op0, V4SImode)); > + } > + else > + { > + rtx tmp = lowpart_subreg (V4SImode, > + force_reg (V2DImode, operands[1]), V2DImode); > + rtx op0 = gen_reg_rtx (V4SImode); > + emit_insn (gen_sse_shufps_v4si (op0, tmp, tmp, const0_rtx, GEN_INT (2), > + GEN_INT (6), GEN_INT (7))); > + emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0, V4SImode)); > + } > DONE; > }) > > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-8.c b/gcc/testsuite/gcc.target/i386/pr107432-8.c > new file mode 100644 > index 00000000000..f0d1ab028f7 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-8.c > @@ -0,0 +1,73 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64-v3 -O2" } */ > +/* { dg-final { scan-assembler-times "vshufps" 1 } } */ > +/* { dg-final { scan-assembler-times "vpshufb" 15 } } */ > +/* { dg-final { scan-assembler-times "vpermd" 1 } } */ > +/* { dg-final { scan-assembler-times "vpermq" 5 } } */ > + > +#include <x86intrin.h> > + > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); > +typedef char __v2qi __attribute__ ((__vector_size__ (2))); > +typedef char __v4qi __attribute__ ((__vector_size__ (4))); > +typedef char __v8qi __attribute__ ((__vector_size__ (8))); > + > +__v2si mm_cvtepi64_epi32_builtin_convertvector(__v2di a) > +{ > + return __builtin_convertvector((__v2di)a, __v2si); > +} > + > +__v4si mm256_cvtepi64_epi32_builtin_convertvector(__v4di a) > +{ > + return __builtin_convertvector((__v4di)a, __v4si); > +} > + > +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2di)a, __v2hi); > +} > + > +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v4di)a, __v4hi); > +} > + > +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v4si)a, __v4hi); > +} > + > +__v8hi mm256_cvtepi32_epi16_builtin_convertvector(__v8si a) > +{ > + return __builtin_convertvector((__v8si)a, __v8hi); > +} > + > +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2di)a, __v2qi); > +} > + > +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v4di)a, __v4qi); > +} > + > +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v4si)a, __v4qi); > +} > + > +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v8si)a, __v8qi); > +} > + > +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v8hi)a, __v8qi); > +} > + > +__v16qi mm256_cvtepi16_epi8_builtin_convertvector(__v16hi a) > +{ > + return __builtin_convertvector((__v16hi)a, __v16qi); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-9.c b/gcc/testsuite/gcc.target/i386/pr107432-9.c > new file mode 100644 > index 00000000000..650d352b945 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-9.c > @@ -0,0 +1,121 @@ > +/* { dg-do run } */ > +/* { dg-options "-march=x86-64-v3 -O2 -flax-vector-conversions" } */ > +#include <x86intrin.h> > + > +#include "avx-check.h" > + > +#ifndef TEST > +#define TEST avx_test > +#endif > + > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); > +typedef char __v2qi __attribute__ ((__vector_size__ (2))); > +typedef char __v4qi __attribute__ ((__vector_size__ (4))); > +typedef char __v8qi __attribute__ ((__vector_size__ (8))); > + > +typedef union > +{ > + __v2si x; > + int a[2]; > +} union64i_d; > + > +typedef union > +{ > + __v2hi x; > + short a[2]; > +} union32i_w; > + > +typedef union > +{ > + __v4hi x; > + short a[4]; > +} union64i_w; > + > +typedef union > +{ > + __v2qi x; > + char a[2]; > +} union16i_b; > + > +typedef union > +{ > + __v4qi x; > + char a[4]; > +} union32i_b; > + > +typedef union > +{ > + __v8qi x; > + char a[8]; > +} union64i_b; > + > +#define CHECK_EXP_LESS128(UNION_TYPE, VALUE_TYPE, FMT) \ > +static int \ > +__attribute__((noinline, unused)) \ > +check_##UNION_TYPE (UNION_TYPE u, const VALUE_TYPE * v) \ > +{ \ > + int i; \ > + int err = 0; \ > + \ > + for (i = 0; i < ARRAY_SIZE (u.a); i++) \ > + if (u.a[i] != v[i]) \ > + { \ > + err++; \ > + PRINTF ("%i: " FMT " != " FMT "\n", \ > + i, v[i], u.a[i]); \ > + } \ > + return err; \ > +} > + > +CHECK_EXP_LESS128 (union64i_d, int, "%d"); > +CHECK_EXP_LESS128 (union32i_w, short, "%d"); > +CHECK_EXP_LESS128 (union64i_w, short, "%d"); > +CHECK_EXP_LESS128 (union16i_b, char, "%d"); > +CHECK_EXP_LESS128 (union32i_b, char, "%d"); > +CHECK_EXP_LESS128 (union64i_b, char, "%d"); > + > +#define SUBTEST(INPUT_TYPE, OUTPUT_TYPE, OUTPUT_INNER, INIT_TYPE, CVT_TYPE) \ > +void do_test##INIT_TYPE##CVT_TYPE () \ > +{ \ > + INPUT_TYPE s; \ > + OUTPUT_TYPE r, ref; \ > + for (int i = 0; i < ARRAY_SIZE (s.a); i++) \ > + { \ > + s.a[i] = (i + 23415) * (i + 341); \ > + ref.a[i] = (OUTPUT_INNER) s.a[i]; \ > + } \ > + r.x = __builtin_convertvector((INIT_TYPE)s.x, CVT_TYPE); \ > + \ > + if (check_##OUTPUT_TYPE (r, ref.a)) \ > + abort (); \ > + return; \ > +} > + > +SUBTEST(union128i_q, union64i_d, int, __v2di, __v2si); > +SUBTEST(union256i_q, union128i_d, int, __v4di, __v4si); > +SUBTEST(union128i_q, union32i_w, short, __v2di, __v2hi); > +SUBTEST(union256i_q, union64i_w, short, __v4di, __v4hi); > +SUBTEST(union128i_d, union64i_w, short, __v4si, __v4hi); > +SUBTEST(union256i_d, union128i_w, short, __v8si, __v8hi); > +SUBTEST(union128i_q, union16i_b, char, __v2di, __v2qi); > +SUBTEST(union256i_q, union32i_b, char, __v4di,__v4qi); > +SUBTEST(union128i_d, union32i_b, char, __v4si, __v4qi); > +SUBTEST(union256i_d, union64i_b, char, __v8si, __v8qi); > +SUBTEST(union128i_w, union64i_b, char, __v8hi, __v8qi); > +SUBTEST(union256i_w, union128i_b, char, __v16hi, __v16qi); > + > +void TEST (void) > +{ > + do_test__v2di__v2si (); > + do_test__v2di__v2hi (); > + do_test__v2di__v2qi (); > + do_test__v4di__v4si (); > + do_test__v4di__v4hi (); > + do_test__v4di__v4qi (); > + do_test__v4si__v4hi (); > + do_test__v4si__v4qi (); > + do_test__v8si__v8hi (); > + do_test__v8si__v8qi (); > + do_test__v8hi__v8qi (); > + do_test__v16hi__v16qi (); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr92645-4.c b/gcc/testsuite/gcc.target/i386/pr92645-4.c > index 28a3f9a3527..3aa49a3b654 100644 > --- a/gcc/testsuite/gcc.target/i386/pr92645-4.c > +++ b/gcc/testsuite/gcc.target/i386/pr92645-4.c > @@ -52,5 +52,3 @@ void f(char *dst, char *src, unsigned long n, unsigned c) > a uniform CTOR with a vector promotion to a CTOR on a promoted > element. */ > /* { dg-final { scan-tree-dump-times "\\(vector\\(16\\) short unsigned int\\)" 2 "optimized" { xfail *-*-* } } } */ > -/* { dg-final { scan-tree-dump-times "VEC_PACK_TRUNC" 1 "optimized" } } */ > -/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "optimized" } } */ > -- > 2.31.1 >
> -----Original Message----- > From: Hongtao Liu <crazylht@gmail.com> > Sent: Thursday, May 23, 2024 2:42 PM > To: Hu, Lin1 <lin1.hu@intel.com> > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>; > ubizjak@gmail.com; rguenther@suse.de > Subject: Re: [PATCH 3/3] vect: support direct conversion under x86-64-v3. > > On Thu, May 23, 2024 at 2:38 PM Hu, Lin1 <lin1.hu@intel.com> wrote: > > > > gcc/ChangeLog: > > > > PR 107432 > > * config/i386/i386-expand.cc (ix86_expand_trunc_with_avx2_noavx512f): > > New function for generate a series of suitable insn. > > * config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f): > > Define new function. > > * config/i386/sse.md: Extend trunc<mode><mode>2 for x86-64-v3. > I have some concern for this patch since > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115069, let's hold on to this > patch. OK, maybe we need to modify ix86_expand_vec_perm_const_1, let it emit some better code. Maybe like clang (https://godbolt.org/z/rTKPq9oj5). Or we can disable some of the optimization via vpermq. In pr107432-8.c, there are only 5 tests that use vpermq. BRs, Lin > > gcc/testsuite/ChangeLog: > > > > PR 107432 > > * gcc.target/i386/pr107432-8.c: New test. > > * gcc.target/i386/pr107432-9.c: Ditto. > > * gcc.target/i386/pr92645-4.c: Modify test. > > --- > > gcc/config/i386/i386-expand.cc | 47 +++++++- > > gcc/config/i386/i386-protos.h | 3 + > > gcc/config/i386/sse.md | 87 +++++++++++---- > > gcc/testsuite/gcc.target/i386/pr107432-8.c | 73 +++++++++++++ > > gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 +++++++++++++++++++++ > > gcc/testsuite/gcc.target/i386/pr92645-4.c | 2 - > > 6 files changed, 304 insertions(+), 29 deletions(-) create mode > > 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c > > > > diff --git a/gcc/config/i386/i386-expand.cc > > b/gcc/config/i386/i386-expand.cc index 2f27bfb484c..bca8b85c9d1 100644 > > --- a/gcc/config/i386/i386-expand.cc > > +++ b/gcc/config/i386/i386-expand.cc > > @@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[]) > > emit_insn (gen_xorv4si3 (value, value, large)); } > > > > -static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok, > > - machine_mode mode, rtx target, > > - rtx var, int one_var); > > - > > /* Convert an unsigned DImode value into a DFmode, using only SSE. > > Expects the 64-bit DImode to be supplied in a pair of integral > > registers. Requires SSE2; will use SSE3 if available. For > > x86_32, @@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool > mmx_ok, machine_mode mode, > > whose ONE_VAR element is VAR, and other elements are zero. Return true > > if successful. */ > > > > -static bool > > +bool > > ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, > > rtx target, rtx var, int one_var) > > { @@ -25551,4 +25547,45 @@ ix86_expand_fast_convert_bf_to_sf (rtx val) > > return ret; > > } > > > > +/* Trunc a vector to a narrow vector, like v4di -> v4si. */ > > + > > +bool > > +ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input) { > > + machine_mode out_mode = GET_MODE (output); > > + machine_mode in_mode = GET_MODE (input); > > + int len = GET_MODE_SIZE (in_mode); > > + gcc_assert (len == 16 || len == 32); > > + machine_mode cvt_mode = (len == 16) ? V16QImode : V32QImode; > > + int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode)); > > + int out_innersize = GET_MODE_SIZE (GET_MODE_INNER (out_mode)); > > + > > + struct expand_vec_perm_d d; > > + d.target = gen_reg_rtx (cvt_mode); > > + d.op0 = lowpart_subreg (cvt_mode, force_reg (in_mode, input), > > + in_mode); > > + d.op1 = d.op0; > > + d.vmode = cvt_mode; > > + d.nelt = len; > > + d.testing_p = false; > > + d.one_operand_p = true; > > + > > + /* Init perm. Put the needed bits of input in order and > > + fill the rest of bits by default. */ int tot = 0; for (int i > > + = 0; i < len; ++i) > > + { > > + d.perm[i] = i; > > + if ((i % in_innersize) < out_innersize) > > + d.perm[tot++] = i; > > + } > > + > > + if (ix86_expand_vec_perm_const_1(&d)) > > + { > > + emit_move_insn (output, gen_lowpart (out_mode, d.target)); > > + return true; > > + } > > + > > + return false; > > +} > > + > > #include "gt-i386-expand.h" > > diff --git a/gcc/config/i386/i386-protos.h > > b/gcc/config/i386/i386-protos.h index dbc861fb1ea..ac29fb34028 100644 > > --- a/gcc/config/i386/i386-protos.h > > +++ b/gcc/config/i386/i386-protos.h > > @@ -242,6 +242,7 @@ extern void ix86_expand_atomic_fetch_op_loop (rtx, > > rtx, rtx, enum rtx_code, extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx, > rtx, rtx, rtx, > > bool, rtx_code_label *); extern > > rtx ix86_expand_fast_convert_bf_to_sf (rtx); > > +extern bool ix86_expand_trunc_with_avx2_noavx512f (rtx, rtx); > > extern rtx ix86_memtag_untagged_pointer (rtx, rtx); extern bool > > ix86_memtag_can_tag_addresses (void); > > > > @@ -288,6 +289,8 @@ extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, > > rtx); extern void ix86_expand_sse2_abs (rtx, rtx); extern bool > > ix86_expand_vector_init_duplicate (bool, machine_mode, rtx, > > rtx); > > +extern bool ix86_expand_vector_init_one_nonzero (bool, machine_mode, > rtx, > > + rtx, int); > > extern bool ix86_extract_perm_from_pool_constant (int*, rtx); > > > > /* In i386-c.cc */ > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index > > f57f36ae380..0b14b3dc1ac 100644 > > --- a/gcc/config/i386/sse.md > > +++ b/gcc/config/i386/sse.md > > @@ -14373,14 +14373,25 @@ (define_expand > "avx512bw_<code>v32hiv32qi2_mask_store" > > > > (define_mode_iterator PMOV_DST_MODE_2 > > [V4SI V8HI (V16QI "TARGET_AVX512BW")]) > > +(define_mode_iterator PMOV_DST_MODE_2_AVX2 > > + [V4SI V8HI V16QI]) > > (define_mode_attr pmov_suff_2 > > [(V16QI "wb") (V8HI "dw") (V4SI "qd")]) > > > > (define_expand "trunc<ssedoublemodelower><mode>2" > > - [(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand") > > - (truncate:PMOV_DST_MODE_2 > > + [(set (match_operand:PMOV_DST_MODE_2_AVX2 0 > "nonimmediate_operand") > > + (truncate:PMOV_DST_MODE_2_AVX2 > > (match_operand:<ssedoublemode> 1 "register_operand")))] > > - "TARGET_AVX512VL") > > + "TARGET_AVX2" > > +{ > > + if (!TARGET_AVX512VL > > + || (<MODE>mode == V16QImode && !TARGET_AVX512BW)) > > + { > > + bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0], > operands[1]); > > + gcc_assert (ok); > > + DONE; > > + } > > +}) > > > > (define_insn "*avx512vl_<code><ssedoublemodelower><mode>2" > > [(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand" > > "=v,m") @@ -14460,6 +14471,7 @@ (define_expand > "<avx512>_<code><ssedoublemodelower><mode>2_mask_store" > > "TARGET_AVX512VL") > > > > (define_mode_iterator PMOV_SRC_MODE_3 [V4DI V2DI V8SI V4SI (V8HI > > "TARGET_AVX512BW")]) > > +(define_mode_iterator PMOV_SRC_MODE_3_AVX2 [V4DI V2DI V8SI V4SI > > +V8HI]) > > (define_mode_attr pmov_dst_3_lower > > [(V4DI "v4qi") (V2DI "v2qi") (V8SI "v8qi") (V4SI "v4qi") (V8HI > > "v8qi")]) (define_mode_attr pmov_dst_3 @@ -14472,16 +14484,26 @@ > > (define_mode_attr pmov_suff_3 (define_expand > > "trunc<mode><pmov_dst_3_lower>2" > > [(set (match_operand:<pmov_dst_3> 0 "register_operand") > > (truncate:<pmov_dst_3> > > - (match_operand:PMOV_SRC_MODE_3 1 "register_operand")))] > > - "TARGET_AVX512VL" > > + (match_operand:PMOV_SRC_MODE_3_AVX2 1 "register_operand")))] > > + "TARGET_AVX2" > > { > > - rtx op0 = gen_reg_rtx (V16QImode); > > + if (TARGET_AVX512VL > > + && (<MODE>mode != V8HImode || TARGET_AVX512BW)) > > + { > > + rtx op0 = gen_reg_rtx (V16QImode); > > > > - emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>qi2 > > - (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_3>mode))); > > + emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>qi2 > > + (op0, operands[1], CONST0_RTX > > + (<pmov_dst_zeroed_3>mode))); > > + > > + emit_move_insn (operands[0], > > + lowpart_subreg (<pmov_dst_3>mode, op0, V16QImode)); > > + } > > + else > > + { > > + bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0], > operands[1]); > > + gcc_assert (ok); > > + } > > > > - emit_move_insn (operands[0], > > - lowpart_subreg (<pmov_dst_3>mode, op0, V16QImode)); > > DONE; > > }) > > > > @@ -14853,15 +14875,24 @@ (define_expand > "trunc<mode><pmov_dst_4_lower>2" > > [(set (match_operand:<pmov_dst_4> 0 "register_operand") > > (truncate:<pmov_dst_4> > > (match_operand:PMOV_SRC_MODE_4 1 "register_operand")))] > > - "TARGET_AVX512VL" > > + "TARGET_AVX2" > > { > > - rtx op0 = gen_reg_rtx (V8HImode); > > + if (TARGET_AVX512VL) > > + { > > + rtx op0 = gen_reg_rtx (V8HImode); > > > > - emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>hi2 > > - (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_4>mode))); > > + emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>hi2 > > + (op0, operands[1], CONST0_RTX > > + (<pmov_dst_zeroed_4>mode))); > > > > - emit_move_insn (operands[0], > > - lowpart_subreg (<pmov_dst_4>mode, op0, V8HImode)); > > + emit_move_insn (operands[0], > > + lowpart_subreg (<pmov_dst_4>mode, op0, V8HImode)); > > + DONE; > > + } > > + else > > + { > > + bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0], > operands[1]); > > + gcc_assert (ok); > > + } > > DONE; > > }) > > > > @@ -15102,15 +15133,27 @@ (define_expand "truncv2div2si2" > > [(set (match_operand:V2SI 0 "register_operand") > > (truncate:V2SI > > (match_operand:V2DI 1 "register_operand")))] > > - "TARGET_AVX512VL" > > + "TARGET_AVX2" > > { > > - rtx op0 = gen_reg_rtx (V4SImode); > > + if (TARGET_AVX512VL) > > + { > > + rtx op0 = gen_reg_rtx (V4SImode); > > > > - emit_insn (gen_avx512vl_truncatev2div2si2 > > - (op0, operands[1], CONST0_RTX (V2SImode))); > > + emit_insn (gen_avx512vl_truncatev2div2si2 > > + (op0, operands[1], CONST0_RTX (V2SImode))); > > > > - emit_move_insn (operands[0], > > - lowpart_subreg (V2SImode, op0, V4SImode)); > > + emit_move_insn (operands[0], > > + lowpart_subreg (V2SImode, op0, V4SImode)); > > + } > > + else > > + { > > + rtx tmp = lowpart_subreg (V4SImode, > > + force_reg (V2DImode, operands[1]), V2DImode); > > + rtx op0 = gen_reg_rtx (V4SImode); > > + emit_insn (gen_sse_shufps_v4si (op0, tmp, tmp, const0_rtx, GEN_INT (2), > > + GEN_INT (6), GEN_INT (7))); > > + emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0, > V4SImode)); > > + } > > DONE; > > }) > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-8.c > > b/gcc/testsuite/gcc.target/i386/pr107432-8.c > > new file mode 100644 > > index 00000000000..f0d1ab028f7 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr107432-8.c > > @@ -0,0 +1,73 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-march=x86-64-v3 -O2" } */ > > +/* { dg-final { scan-assembler-times "vshufps" 1 } } */ > > +/* { dg-final { scan-assembler-times "vpshufb" 15 } } */ > > +/* { dg-final { scan-assembler-times "vpermd" 1 } } */ > > +/* { dg-final { scan-assembler-times "vpermq" 5 } } */ > > + > > +#include <x86intrin.h> > > + > > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef > > +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char > > +__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi > > +__attribute__ ((__vector_size__ (8))); > > + > > +__v2si mm_cvtepi64_epi32_builtin_convertvector(__v2di a) { > > + return __builtin_convertvector((__v2di)a, __v2si); } > > + > > +__v4si mm256_cvtepi64_epi32_builtin_convertvector(__v4di a) { > > + return __builtin_convertvector((__v4di)a, __v4si); } > > + > > +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) { > > + return __builtin_convertvector((__v2di)a, __v2hi); } > > + > > +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) { > > + return __builtin_convertvector((__v4di)a, __v4hi); } > > + > > +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) { > > + return __builtin_convertvector((__v4si)a, __v4hi); } > > + > > +__v8hi mm256_cvtepi32_epi16_builtin_convertvector(__v8si a) { > > + return __builtin_convertvector((__v8si)a, __v8hi); } > > + > > +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) { > > + return __builtin_convertvector((__v2di)a, __v2qi); } > > + > > +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) { > > + return __builtin_convertvector((__v4di)a, __v4qi); } > > + > > +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a) { > > + return __builtin_convertvector((__v4si)a, __v4qi); } > > + > > +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) { > > + return __builtin_convertvector((__v8si)a, __v8qi); } > > + > > +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) { > > + return __builtin_convertvector((__v8hi)a, __v8qi); } > > + > > +__v16qi mm256_cvtepi16_epi8_builtin_convertvector(__v16hi a) > > +{ > > + return __builtin_convertvector((__v16hi)a, __v16qi); } > > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-9.c > > b/gcc/testsuite/gcc.target/i386/pr107432-9.c > > new file mode 100644 > > index 00000000000..650d352b945 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr107432-9.c > > @@ -0,0 +1,121 @@ > > +/* { dg-do run } */ > > +/* { dg-options "-march=x86-64-v3 -O2 -flax-vector-conversions" } */ > > +#include <x86intrin.h> > > + > > +#include "avx-check.h" > > + > > +#ifndef TEST > > +#define TEST avx_test > > +#endif > > + > > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef > > +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char > > +__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi > > +__attribute__ ((__vector_size__ (8))); > > + > > +typedef union > > +{ > > + __v2si x; > > + int a[2]; > > +} union64i_d; > > + > > +typedef union > > +{ > > + __v2hi x; > > + short a[2]; > > +} union32i_w; > > + > > +typedef union > > +{ > > + __v4hi x; > > + short a[4]; > > +} union64i_w; > > + > > +typedef union > > +{ > > + __v2qi x; > > + char a[2]; > > +} union16i_b; > > + > > +typedef union > > +{ > > + __v4qi x; > > + char a[4]; > > +} union32i_b; > > + > > +typedef union > > +{ > > + __v8qi x; > > + char a[8]; > > +} union64i_b; > > + > > +#define CHECK_EXP_LESS128(UNION_TYPE, VALUE_TYPE, FMT) \ > > +static int \ > > +__attribute__((noinline, unused)) \ > > +check_##UNION_TYPE (UNION_TYPE u, const VALUE_TYPE * v) \ > > +{ \ > > + int i; \ > > + int err = 0; \ > > + \ > > + for (i = 0; i < ARRAY_SIZE (u.a); i++) \ > > + if (u.a[i] != v[i]) \ > > + { \ > > + err++; \ > > + PRINTF ("%i: " FMT " != " FMT "\n", \ > > + i, v[i], u.a[i]); \ > > + } \ > > + return err; \ > > +} > > + > > +CHECK_EXP_LESS128 (union64i_d, int, "%d"); > > +CHECK_EXP_LESS128 (union32i_w, short, "%d"); > > +CHECK_EXP_LESS128 (union64i_w, short, "%d"); > > +CHECK_EXP_LESS128 (union16i_b, char, "%d"); > > +CHECK_EXP_LESS128 (union32i_b, char, "%d"); > > +CHECK_EXP_LESS128 (union64i_b, char, "%d"); > > + > > +#define SUBTEST(INPUT_TYPE, OUTPUT_TYPE, OUTPUT_INNER, INIT_TYPE, > CVT_TYPE) \ > > +void do_test##INIT_TYPE##CVT_TYPE () \ > > +{ \ > > + INPUT_TYPE s; \ > > + OUTPUT_TYPE r, ref; \ > > + for (int i = 0; i < ARRAY_SIZE (s.a); i++) \ > > + { \ > > + s.a[i] = (i + 23415) * (i + 341); \ > > + ref.a[i] = (OUTPUT_INNER) s.a[i]; \ > > + } \ > > + r.x = __builtin_convertvector((INIT_TYPE)s.x, CVT_TYPE); \ > > + \ > > + if (check_##OUTPUT_TYPE (r, ref.a)) \ > > + abort (); \ > > + return; \ > > +} > > + > > +SUBTEST(union128i_q, union64i_d, int, __v2di, __v2si); > > +SUBTEST(union256i_q, union128i_d, int, __v4di, __v4si); > > +SUBTEST(union128i_q, union32i_w, short, __v2di, __v2hi); > > +SUBTEST(union256i_q, union64i_w, short, __v4di, __v4hi); > > +SUBTEST(union128i_d, union64i_w, short, __v4si, __v4hi); > > +SUBTEST(union256i_d, union128i_w, short, __v8si, __v8hi); > > +SUBTEST(union128i_q, union16i_b, char, __v2di, __v2qi); > > +SUBTEST(union256i_q, union32i_b, char, __v4di,__v4qi); > > +SUBTEST(union128i_d, union32i_b, char, __v4si, __v4qi); > > +SUBTEST(union256i_d, union64i_b, char, __v8si, __v8qi); > > +SUBTEST(union128i_w, union64i_b, char, __v8hi, __v8qi); > > +SUBTEST(union256i_w, union128i_b, char, __v16hi, __v16qi); > > + > > +void TEST (void) > > +{ > > + do_test__v2di__v2si (); > > + do_test__v2di__v2hi (); > > + do_test__v2di__v2qi (); > > + do_test__v4di__v4si (); > > + do_test__v4di__v4hi (); > > + do_test__v4di__v4qi (); > > + do_test__v4si__v4hi (); > > + do_test__v4si__v4qi (); > > + do_test__v8si__v8hi (); > > + do_test__v8si__v8qi (); > > + do_test__v8hi__v8qi (); > > + do_test__v16hi__v16qi (); > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr92645-4.c > > b/gcc/testsuite/gcc.target/i386/pr92645-4.c > > index 28a3f9a3527..3aa49a3b654 100644 > > --- a/gcc/testsuite/gcc.target/i386/pr92645-4.c > > +++ b/gcc/testsuite/gcc.target/i386/pr92645-4.c > > @@ -52,5 +52,3 @@ void f(char *dst, char *src, unsigned long n, unsigned c) > > a uniform CTOR with a vector promotion to a CTOR on a promoted > > element. */ > > /* { dg-final { scan-tree-dump-times "\\(vector\\(16\\) short > > unsigned int\\)" 2 "optimized" { xfail *-*-* } } } */ > > -/* { dg-final { scan-tree-dump-times "VEC_PACK_TRUNC" 1 "optimized" } > > } */ > > -/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "optimized" } > > } */ > > -- > > 2.31.1 > > > > > -- > BR, > Hongtao
On Thu, May 23, 2024 at 3:17 PM Hu, Lin1 <lin1.hu@intel.com> wrote: > > > -----Original Message----- > > From: Hongtao Liu <crazylht@gmail.com> > > Sent: Thursday, May 23, 2024 2:42 PM > > To: Hu, Lin1 <lin1.hu@intel.com> > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>; > > ubizjak@gmail.com; rguenther@suse.de > > Subject: Re: [PATCH 3/3] vect: support direct conversion under x86-64-v3. > > > > On Thu, May 23, 2024 at 2:38 PM Hu, Lin1 <lin1.hu@intel.com> wrote: > > > > > > gcc/ChangeLog: > > > > > > PR 107432 > > > * config/i386/i386-expand.cc (ix86_expand_trunc_with_avx2_noavx512f): > > > New function for generate a series of suitable insn. > > > * config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f): > > > Define new function. > > > * config/i386/sse.md: Extend trunc<mode><mode>2 for x86-64-v3. > > I have some concern for this patch since > > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115069, let's hold on to this > > patch. > > OK, maybe we need to modify ix86_expand_vec_perm_const_1, let it emit some better code. Maybe like clang (https://godbolt.org/z/rTKPq9oj5). > Or we can disable some of the optimization via vpermq. In pr107432-8.c, there are only 5 tests that use vpermq. After a second thought, we may go ahead with the patch, for PR115069, there's an alternative to avoid cross-lane truncation. But for this one, there's no alternative. Although cross-lane permutation is not very efficient, it should still be better than original code. > > BRs, > Lin > > > > gcc/testsuite/ChangeLog: > > > > > > PR 107432 > > > * gcc.target/i386/pr107432-8.c: New test. > > > * gcc.target/i386/pr107432-9.c: Ditto. > > > * gcc.target/i386/pr92645-4.c: Modify test. > > > --- > > > gcc/config/i386/i386-expand.cc | 47 +++++++- > > > gcc/config/i386/i386-protos.h | 3 + > > > gcc/config/i386/sse.md | 87 +++++++++++---- > > > gcc/testsuite/gcc.target/i386/pr107432-8.c | 73 +++++++++++++ > > > gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 +++++++++++++++++++++ > > > gcc/testsuite/gcc.target/i386/pr92645-4.c | 2 - > > > 6 files changed, 304 insertions(+), 29 deletions(-) create mode > > > 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c > > > > > > diff --git a/gcc/config/i386/i386-expand.cc > > > b/gcc/config/i386/i386-expand.cc index 2f27bfb484c..bca8b85c9d1 100644 > > > --- a/gcc/config/i386/i386-expand.cc > > > +++ b/gcc/config/i386/i386-expand.cc > > > @@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[]) > > > emit_insn (gen_xorv4si3 (value, value, large)); } > > > > > > -static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok, > > > - machine_mode mode, rtx target, > > > - rtx var, int one_var); > > > - > > > /* Convert an unsigned DImode value into a DFmode, using only SSE. > > > Expects the 64-bit DImode to be supplied in a pair of integral > > > registers. Requires SSE2; will use SSE3 if available. For > > > x86_32, @@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool > > mmx_ok, machine_mode mode, > > > whose ONE_VAR element is VAR, and other elements are zero. Return true > > > if successful. */ > > > > > > -static bool > > > +bool > > > ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, > > > rtx target, rtx var, int one_var) > > > { @@ -25551,4 +25547,45 @@ ix86_expand_fast_convert_bf_to_sf (rtx val) > > > return ret; > > > } > > > > > > +/* Trunc a vector to a narrow vector, like v4di -> v4si. */ > > > + > > > +bool > > > +ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input) { > > > + machine_mode out_mode = GET_MODE (output); > > > + machine_mode in_mode = GET_MODE (input); > > > + int len = GET_MODE_SIZE (in_mode); > > > + gcc_assert (len == 16 || len == 32); > > > + machine_mode cvt_mode = (len == 16) ? V16QImode : V32QImode; > > > + int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode)); > > > + int out_innersize = GET_MODE_SIZE (GET_MODE_INNER (out_mode)); > > > + > > > + struct expand_vec_perm_d d; > > > + d.target = gen_reg_rtx (cvt_mode); > > > + d.op0 = lowpart_subreg (cvt_mode, force_reg (in_mode, input), > > > + in_mode); > > > + d.op1 = d.op0; > > > + d.vmode = cvt_mode; > > > + d.nelt = len; > > > + d.testing_p = false; > > > + d.one_operand_p = true; > > > + > > > + /* Init perm. Put the needed bits of input in order and > > > + fill the rest of bits by default. */ int tot = 0; for (int i > > > + = 0; i < len; ++i) > > > + { > > > + d.perm[i] = i; > > > + if ((i % in_innersize) < out_innersize) > > > + d.perm[tot++] = i; > > > + } > > > + > > > + if (ix86_expand_vec_perm_const_1(&d)) > > > + { > > > + emit_move_insn (output, gen_lowpart (out_mode, d.target)); > > > + return true; > > > + } > > > + > > > + return false; > > > +} > > > + > > > #include "gt-i386-expand.h" > > > diff --git a/gcc/config/i386/i386-protos.h > > > b/gcc/config/i386/i386-protos.h index dbc861fb1ea..ac29fb34028 100644 > > > --- a/gcc/config/i386/i386-protos.h > > > +++ b/gcc/config/i386/i386-protos.h > > > @@ -242,6 +242,7 @@ extern void ix86_expand_atomic_fetch_op_loop (rtx, > > > rtx, rtx, enum rtx_code, extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx, > > rtx, rtx, rtx, > > > bool, rtx_code_label *); extern > > > rtx ix86_expand_fast_convert_bf_to_sf (rtx); > > > +extern bool ix86_expand_trunc_with_avx2_noavx512f (rtx, rtx); > > > extern rtx ix86_memtag_untagged_pointer (rtx, rtx); extern bool > > > ix86_memtag_can_tag_addresses (void); > > > > > > @@ -288,6 +289,8 @@ extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, > > > rtx); extern void ix86_expand_sse2_abs (rtx, rtx); extern bool > > > ix86_expand_vector_init_duplicate (bool, machine_mode, rtx, > > > rtx); > > > +extern bool ix86_expand_vector_init_one_nonzero (bool, machine_mode, > > rtx, > > > + rtx, int); > > > extern bool ix86_extract_perm_from_pool_constant (int*, rtx); > > > > > > /* In i386-c.cc */ > > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index > > > f57f36ae380..0b14b3dc1ac 100644 > > > --- a/gcc/config/i386/sse.md > > > +++ b/gcc/config/i386/sse.md > > > @@ -14373,14 +14373,25 @@ (define_expand > > "avx512bw_<code>v32hiv32qi2_mask_store" > > > > > > (define_mode_iterator PMOV_DST_MODE_2 > > > [V4SI V8HI (V16QI "TARGET_AVX512BW")]) > > > +(define_mode_iterator PMOV_DST_MODE_2_AVX2 > > > + [V4SI V8HI V16QI]) > > > (define_mode_attr pmov_suff_2 > > > [(V16QI "wb") (V8HI "dw") (V4SI "qd")]) > > > > > > (define_expand "trunc<ssedoublemodelower><mode>2" > > > - [(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand") > > > - (truncate:PMOV_DST_MODE_2 > > > + [(set (match_operand:PMOV_DST_MODE_2_AVX2 0 > > "nonimmediate_operand") > > > + (truncate:PMOV_DST_MODE_2_AVX2 > > > (match_operand:<ssedoublemode> 1 "register_operand")))] > > > - "TARGET_AVX512VL") > > > + "TARGET_AVX2" > > > +{ > > > + if (!TARGET_AVX512VL > > > + || (<MODE>mode == V16QImode && !TARGET_AVX512BW)) > > > + { > > > + bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0], > > operands[1]); > > > + gcc_assert (ok); > > > + DONE; > > > + } > > > +}) > > > > > > (define_insn "*avx512vl_<code><ssedoublemodelower><mode>2" > > > [(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand" > > > "=v,m") @@ -14460,6 +14471,7 @@ (define_expand > > "<avx512>_<code><ssedoublemodelower><mode>2_mask_store" > > > "TARGET_AVX512VL") > > > > > > (define_mode_iterator PMOV_SRC_MODE_3 [V4DI V2DI V8SI V4SI (V8HI > > > "TARGET_AVX512BW")]) > > > +(define_mode_iterator PMOV_SRC_MODE_3_AVX2 [V4DI V2DI V8SI V4SI > > > +V8HI]) > > > (define_mode_attr pmov_dst_3_lower > > > [(V4DI "v4qi") (V2DI "v2qi") (V8SI "v8qi") (V4SI "v4qi") (V8HI > > > "v8qi")]) (define_mode_attr pmov_dst_3 @@ -14472,16 +14484,26 @@ > > > (define_mode_attr pmov_suff_3 (define_expand > > > "trunc<mode><pmov_dst_3_lower>2" > > > [(set (match_operand:<pmov_dst_3> 0 "register_operand") > > > (truncate:<pmov_dst_3> > > > - (match_operand:PMOV_SRC_MODE_3 1 "register_operand")))] > > > - "TARGET_AVX512VL" > > > + (match_operand:PMOV_SRC_MODE_3_AVX2 1 "register_operand")))] > > > + "TARGET_AVX2" > > > { > > > - rtx op0 = gen_reg_rtx (V16QImode); > > > + if (TARGET_AVX512VL > > > + && (<MODE>mode != V8HImode || TARGET_AVX512BW)) > > > + { > > > + rtx op0 = gen_reg_rtx (V16QImode); > > > > > > - emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>qi2 > > > - (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_3>mode))); > > > + emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>qi2 > > > + (op0, operands[1], CONST0_RTX > > > + (<pmov_dst_zeroed_3>mode))); > > > + > > > + emit_move_insn (operands[0], > > > + lowpart_subreg (<pmov_dst_3>mode, op0, V16QImode)); > > > + } > > > + else > > > + { > > > + bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0], > > operands[1]); > > > + gcc_assert (ok); > > > + } > > > > > > - emit_move_insn (operands[0], > > > - lowpart_subreg (<pmov_dst_3>mode, op0, V16QImode)); > > > DONE; > > > }) > > > > > > @@ -14853,15 +14875,24 @@ (define_expand > > "trunc<mode><pmov_dst_4_lower>2" > > > [(set (match_operand:<pmov_dst_4> 0 "register_operand") > > > (truncate:<pmov_dst_4> > > > (match_operand:PMOV_SRC_MODE_4 1 "register_operand")))] > > > - "TARGET_AVX512VL" > > > + "TARGET_AVX2" > > > { > > > - rtx op0 = gen_reg_rtx (V8HImode); > > > + if (TARGET_AVX512VL) > > > + { > > > + rtx op0 = gen_reg_rtx (V8HImode); > > > > > > - emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>hi2 > > > - (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_4>mode))); > > > + emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>hi2 > > > + (op0, operands[1], CONST0_RTX > > > + (<pmov_dst_zeroed_4>mode))); > > > > > > - emit_move_insn (operands[0], > > > - lowpart_subreg (<pmov_dst_4>mode, op0, V8HImode)); > > > + emit_move_insn (operands[0], > > > + lowpart_subreg (<pmov_dst_4>mode, op0, V8HImode)); > > > + DONE; > > > + } > > > + else > > > + { > > > + bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0], > > operands[1]); > > > + gcc_assert (ok); > > > + } > > > DONE; > > > }) > > > > > > @@ -15102,15 +15133,27 @@ (define_expand "truncv2div2si2" > > > [(set (match_operand:V2SI 0 "register_operand") > > > (truncate:V2SI > > > (match_operand:V2DI 1 "register_operand")))] > > > - "TARGET_AVX512VL" > > > + "TARGET_AVX2" > > > { > > > - rtx op0 = gen_reg_rtx (V4SImode); > > > + if (TARGET_AVX512VL) > > > + { > > > + rtx op0 = gen_reg_rtx (V4SImode); > > > > > > - emit_insn (gen_avx512vl_truncatev2div2si2 > > > - (op0, operands[1], CONST0_RTX (V2SImode))); > > > + emit_insn (gen_avx512vl_truncatev2div2si2 > > > + (op0, operands[1], CONST0_RTX (V2SImode))); > > > > > > - emit_move_insn (operands[0], > > > - lowpart_subreg (V2SImode, op0, V4SImode)); > > > + emit_move_insn (operands[0], > > > + lowpart_subreg (V2SImode, op0, V4SImode)); > > > + } > > > + else > > > + { > > > + rtx tmp = lowpart_subreg (V4SImode, > > > + force_reg (V2DImode, operands[1]), V2DImode); > > > + rtx op0 = gen_reg_rtx (V4SImode); > > > + emit_insn (gen_sse_shufps_v4si (op0, tmp, tmp, const0_rtx, GEN_INT (2), > > > + GEN_INT (6), GEN_INT (7))); > > > + emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0, > > V4SImode)); > > > + } > > > DONE; > > > }) > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-8.c > > > b/gcc/testsuite/gcc.target/i386/pr107432-8.c > > > new file mode 100644 > > > index 00000000000..f0d1ab028f7 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/i386/pr107432-8.c > > > @@ -0,0 +1,73 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-options "-march=x86-64-v3 -O2" } */ > > > +/* { dg-final { scan-assembler-times "vshufps" 1 } } */ > > > +/* { dg-final { scan-assembler-times "vpshufb" 15 } } */ > > > +/* { dg-final { scan-assembler-times "vpermd" 1 } } */ > > > +/* { dg-final { scan-assembler-times "vpermq" 5 } } */ > > > + > > > +#include <x86intrin.h> > > > + > > > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef > > > +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char > > > +__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi > > > +__attribute__ ((__vector_size__ (8))); > > > + > > > +__v2si mm_cvtepi64_epi32_builtin_convertvector(__v2di a) { > > > + return __builtin_convertvector((__v2di)a, __v2si); } > > > + > > > +__v4si mm256_cvtepi64_epi32_builtin_convertvector(__v4di a) { > > > + return __builtin_convertvector((__v4di)a, __v4si); } > > > + > > > +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) { > > > + return __builtin_convertvector((__v2di)a, __v2hi); } > > > + > > > +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) { > > > + return __builtin_convertvector((__v4di)a, __v4hi); } > > > + > > > +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) { > > > + return __builtin_convertvector((__v4si)a, __v4hi); } > > > + > > > +__v8hi mm256_cvtepi32_epi16_builtin_convertvector(__v8si a) { > > > + return __builtin_convertvector((__v8si)a, __v8hi); } > > > + > > > +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) { > > > + return __builtin_convertvector((__v2di)a, __v2qi); } > > > + > > > +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) { > > > + return __builtin_convertvector((__v4di)a, __v4qi); } > > > + > > > +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a) { > > > + return __builtin_convertvector((__v4si)a, __v4qi); } > > > + > > > +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) { > > > + return __builtin_convertvector((__v8si)a, __v8qi); } > > > + > > > +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) { > > > + return __builtin_convertvector((__v8hi)a, __v8qi); } > > > + > > > +__v16qi mm256_cvtepi16_epi8_builtin_convertvector(__v16hi a) > > > +{ > > > + return __builtin_convertvector((__v16hi)a, __v16qi); } > > > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-9.c > > > b/gcc/testsuite/gcc.target/i386/pr107432-9.c > > > new file mode 100644 > > > index 00000000000..650d352b945 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/i386/pr107432-9.c > > > @@ -0,0 +1,121 @@ > > > +/* { dg-do run } */ > > > +/* { dg-options "-march=x86-64-v3 -O2 -flax-vector-conversions" } */ > > > +#include <x86intrin.h> > > > + > > > +#include "avx-check.h" > > > + > > > +#ifndef TEST > > > +#define TEST avx_test > > > +#endif > > > + > > > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef > > > +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char > > > +__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi > > > +__attribute__ ((__vector_size__ (8))); > > > + > > > +typedef union > > > +{ > > > + __v2si x; > > > + int a[2]; > > > +} union64i_d; > > > + > > > +typedef union > > > +{ > > > + __v2hi x; > > > + short a[2]; > > > +} union32i_w; > > > + > > > +typedef union > > > +{ > > > + __v4hi x; > > > + short a[4]; > > > +} union64i_w; > > > + > > > +typedef union > > > +{ > > > + __v2qi x; > > > + char a[2]; > > > +} union16i_b; > > > + > > > +typedef union > > > +{ > > > + __v4qi x; > > > + char a[4]; > > > +} union32i_b; > > > + > > > +typedef union > > > +{ > > > + __v8qi x; > > > + char a[8]; > > > +} union64i_b; > > > + > > > +#define CHECK_EXP_LESS128(UNION_TYPE, VALUE_TYPE, FMT) \ > > > +static int \ > > > +__attribute__((noinline, unused)) \ > > > +check_##UNION_TYPE (UNION_TYPE u, const VALUE_TYPE * v) \ > > > +{ \ > > > + int i; \ > > > + int err = 0; \ > > > + \ > > > + for (i = 0; i < ARRAY_SIZE (u.a); i++) \ > > > + if (u.a[i] != v[i]) \ > > > + { \ > > > + err++; \ > > > + PRINTF ("%i: " FMT " != " FMT "\n", \ > > > + i, v[i], u.a[i]); \ > > > + } \ > > > + return err; \ > > > +} > > > + > > > +CHECK_EXP_LESS128 (union64i_d, int, "%d"); > > > +CHECK_EXP_LESS128 (union32i_w, short, "%d"); > > > +CHECK_EXP_LESS128 (union64i_w, short, "%d"); > > > +CHECK_EXP_LESS128 (union16i_b, char, "%d"); > > > +CHECK_EXP_LESS128 (union32i_b, char, "%d"); > > > +CHECK_EXP_LESS128 (union64i_b, char, "%d"); > > > + > > > +#define SUBTEST(INPUT_TYPE, OUTPUT_TYPE, OUTPUT_INNER, INIT_TYPE, > > CVT_TYPE) \ > > > +void do_test##INIT_TYPE##CVT_TYPE () \ > > > +{ \ > > > + INPUT_TYPE s; \ > > > + OUTPUT_TYPE r, ref; \ > > > + for (int i = 0; i < ARRAY_SIZE (s.a); i++) \ > > > + { \ > > > + s.a[i] = (i + 23415) * (i + 341); \ > > > + ref.a[i] = (OUTPUT_INNER) s.a[i]; \ > > > + } \ > > > + r.x = __builtin_convertvector((INIT_TYPE)s.x, CVT_TYPE); \ > > > + \ > > > + if (check_##OUTPUT_TYPE (r, ref.a)) \ > > > + abort (); \ > > > + return; \ > > > +} > > > + > > > +SUBTEST(union128i_q, union64i_d, int, __v2di, __v2si); > > > +SUBTEST(union256i_q, union128i_d, int, __v4di, __v4si); > > > +SUBTEST(union128i_q, union32i_w, short, __v2di, __v2hi); > > > +SUBTEST(union256i_q, union64i_w, short, __v4di, __v4hi); > > > +SUBTEST(union128i_d, union64i_w, short, __v4si, __v4hi); > > > +SUBTEST(union256i_d, union128i_w, short, __v8si, __v8hi); > > > +SUBTEST(union128i_q, union16i_b, char, __v2di, __v2qi); > > > +SUBTEST(union256i_q, union32i_b, char, __v4di,__v4qi); > > > +SUBTEST(union128i_d, union32i_b, char, __v4si, __v4qi); > > > +SUBTEST(union256i_d, union64i_b, char, __v8si, __v8qi); > > > +SUBTEST(union128i_w, union64i_b, char, __v8hi, __v8qi); > > > +SUBTEST(union256i_w, union128i_b, char, __v16hi, __v16qi); > > > + > > > +void TEST (void) > > > +{ > > > + do_test__v2di__v2si (); > > > + do_test__v2di__v2hi (); > > > + do_test__v2di__v2qi (); > > > + do_test__v4di__v4si (); > > > + do_test__v4di__v4hi (); > > > + do_test__v4di__v4qi (); > > > + do_test__v4si__v4hi (); > > > + do_test__v4si__v4qi (); > > > + do_test__v8si__v8hi (); > > > + do_test__v8si__v8qi (); > > > + do_test__v8hi__v8qi (); > > > + do_test__v16hi__v16qi (); > > > +} > > > diff --git a/gcc/testsuite/gcc.target/i386/pr92645-4.c > > > b/gcc/testsuite/gcc.target/i386/pr92645-4.c > > > index 28a3f9a3527..3aa49a3b654 100644 > > > --- a/gcc/testsuite/gcc.target/i386/pr92645-4.c > > > +++ b/gcc/testsuite/gcc.target/i386/pr92645-4.c > > > @@ -52,5 +52,3 @@ void f(char *dst, char *src, unsigned long n, unsigned c) > > > a uniform CTOR with a vector promotion to a CTOR on a promoted > > > element. */ > > > /* { dg-final { scan-tree-dump-times "\\(vector\\(16\\) short > > > unsigned int\\)" 2 "optimized" { xfail *-*-* } } } */ > > > -/* { dg-final { scan-tree-dump-times "VEC_PACK_TRUNC" 1 "optimized" } > > > } */ > > > -/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "optimized" } > > > } */ > > > -- > > > 2.31.1 > > > > > > > > > -- > > BR, > > Hongtao
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 2f27bfb484c..bca8b85c9d1 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[]) emit_insn (gen_xorv4si3 (value, value, large)); } -static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok, - machine_mode mode, rtx target, - rtx var, int one_var); - /* Convert an unsigned DImode value into a DFmode, using only SSE. Expects the 64-bit DImode to be supplied in a pair of integral registers. Requires SSE2; will use SSE3 if available. For x86_32, @@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, whose ONE_VAR element is VAR, and other elements are zero. Return true if successful. */ -static bool +bool ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, rtx target, rtx var, int one_var) { @@ -25551,4 +25547,45 @@ ix86_expand_fast_convert_bf_to_sf (rtx val) return ret; } +/* Trunc a vector to a narrow vector, like v4di -> v4si. */ + +bool +ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input) +{ + machine_mode out_mode = GET_MODE (output); + machine_mode in_mode = GET_MODE (input); + int len = GET_MODE_SIZE (in_mode); + gcc_assert (len == 16 || len == 32); + machine_mode cvt_mode = (len == 16) ? V16QImode : V32QImode; + int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode)); + int out_innersize = GET_MODE_SIZE (GET_MODE_INNER (out_mode)); + + struct expand_vec_perm_d d; + d.target = gen_reg_rtx (cvt_mode); + d.op0 = lowpart_subreg (cvt_mode, force_reg (in_mode, input), in_mode); + d.op1 = d.op0; + d.vmode = cvt_mode; + d.nelt = len; + d.testing_p = false; + d.one_operand_p = true; + + /* Init perm. Put the needed bits of input in order and + fill the rest of bits by default. */ + int tot = 0; + for (int i = 0; i < len; ++i) + { + d.perm[i] = i; + if ((i % in_innersize) < out_innersize) + d.perm[tot++] = i; + } + + if (ix86_expand_vec_perm_const_1(&d)) + { + emit_move_insn (output, gen_lowpart (out_mode, d.target)); + return true; + } + + return false; +} + #include "gt-i386-expand.h" diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index dbc861fb1ea..ac29fb34028 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -242,6 +242,7 @@ extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, rtx, enum rtx_code, extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx, rtx, rtx, rtx, bool, rtx_code_label *); extern rtx ix86_expand_fast_convert_bf_to_sf (rtx); +extern bool ix86_expand_trunc_with_avx2_noavx512f (rtx, rtx); extern rtx ix86_memtag_untagged_pointer (rtx, rtx); extern bool ix86_memtag_can_tag_addresses (void); @@ -288,6 +289,8 @@ extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx); extern void ix86_expand_sse2_abs (rtx, rtx); extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx, rtx); +extern bool ix86_expand_vector_init_one_nonzero (bool, machine_mode, rtx, + rtx, int); extern bool ix86_extract_perm_from_pool_constant (int*, rtx); /* In i386-c.cc */ diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index f57f36ae380..0b14b3dc1ac 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -14373,14 +14373,25 @@ (define_expand "avx512bw_<code>v32hiv32qi2_mask_store" (define_mode_iterator PMOV_DST_MODE_2 [V4SI V8HI (V16QI "TARGET_AVX512BW")]) +(define_mode_iterator PMOV_DST_MODE_2_AVX2 + [V4SI V8HI V16QI]) (define_mode_attr pmov_suff_2 [(V16QI "wb") (V8HI "dw") (V4SI "qd")]) (define_expand "trunc<ssedoublemodelower><mode>2" - [(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand") - (truncate:PMOV_DST_MODE_2 + [(set (match_operand:PMOV_DST_MODE_2_AVX2 0 "nonimmediate_operand") + (truncate:PMOV_DST_MODE_2_AVX2 (match_operand:<ssedoublemode> 1 "register_operand")))] - "TARGET_AVX512VL") + "TARGET_AVX2" +{ + if (!TARGET_AVX512VL + || (<MODE>mode == V16QImode && !TARGET_AVX512BW)) + { + bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0], operands[1]); + gcc_assert (ok); + DONE; + } +}) (define_insn "*avx512vl_<code><ssedoublemodelower><mode>2" [(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand" "=v,m") @@ -14460,6 +14471,7 @@ (define_expand "<avx512>_<code><ssedoublemodelower><mode>2_mask_store" "TARGET_AVX512VL") (define_mode_iterator PMOV_SRC_MODE_3 [V4DI V2DI V8SI V4SI (V8HI "TARGET_AVX512BW")]) +(define_mode_iterator PMOV_SRC_MODE_3_AVX2 [V4DI V2DI V8SI V4SI V8HI]) (define_mode_attr pmov_dst_3_lower [(V4DI "v4qi") (V2DI "v2qi") (V8SI "v8qi") (V4SI "v4qi") (V8HI "v8qi")]) (define_mode_attr pmov_dst_3 @@ -14472,16 +14484,26 @@ (define_mode_attr pmov_suff_3 (define_expand "trunc<mode><pmov_dst_3_lower>2" [(set (match_operand:<pmov_dst_3> 0 "register_operand") (truncate:<pmov_dst_3> - (match_operand:PMOV_SRC_MODE_3 1 "register_operand")))] - "TARGET_AVX512VL" + (match_operand:PMOV_SRC_MODE_3_AVX2 1 "register_operand")))] + "TARGET_AVX2" { - rtx op0 = gen_reg_rtx (V16QImode); + if (TARGET_AVX512VL + && (<MODE>mode != V8HImode || TARGET_AVX512BW)) + { + rtx op0 = gen_reg_rtx (V16QImode); - emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>qi2 - (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_3>mode))); + emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>qi2 + (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_3>mode))); + + emit_move_insn (operands[0], + lowpart_subreg (<pmov_dst_3>mode, op0, V16QImode)); + } + else + { + bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0], operands[1]); + gcc_assert (ok); + } - emit_move_insn (operands[0], - lowpart_subreg (<pmov_dst_3>mode, op0, V16QImode)); DONE; }) @@ -14853,15 +14875,24 @@ (define_expand "trunc<mode><pmov_dst_4_lower>2" [(set (match_operand:<pmov_dst_4> 0 "register_operand") (truncate:<pmov_dst_4> (match_operand:PMOV_SRC_MODE_4 1 "register_operand")))] - "TARGET_AVX512VL" + "TARGET_AVX2" { - rtx op0 = gen_reg_rtx (V8HImode); + if (TARGET_AVX512VL) + { + rtx op0 = gen_reg_rtx (V8HImode); - emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>hi2 - (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_4>mode))); + emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>hi2 + (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_4>mode))); - emit_move_insn (operands[0], - lowpart_subreg (<pmov_dst_4>mode, op0, V8HImode)); + emit_move_insn (operands[0], + lowpart_subreg (<pmov_dst_4>mode, op0, V8HImode)); + DONE; + } + else + { + bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0], operands[1]); + gcc_assert (ok); + } DONE; }) @@ -15102,15 +15133,27 @@ (define_expand "truncv2div2si2" [(set (match_operand:V2SI 0 "register_operand") (truncate:V2SI (match_operand:V2DI 1 "register_operand")))] - "TARGET_AVX512VL" + "TARGET_AVX2" { - rtx op0 = gen_reg_rtx (V4SImode); + if (TARGET_AVX512VL) + { + rtx op0 = gen_reg_rtx (V4SImode); - emit_insn (gen_avx512vl_truncatev2div2si2 - (op0, operands[1], CONST0_RTX (V2SImode))); + emit_insn (gen_avx512vl_truncatev2div2si2 + (op0, operands[1], CONST0_RTX (V2SImode))); - emit_move_insn (operands[0], - lowpart_subreg (V2SImode, op0, V4SImode)); + emit_move_insn (operands[0], + lowpart_subreg (V2SImode, op0, V4SImode)); + } + else + { + rtx tmp = lowpart_subreg (V4SImode, + force_reg (V2DImode, operands[1]), V2DImode); + rtx op0 = gen_reg_rtx (V4SImode); + emit_insn (gen_sse_shufps_v4si (op0, tmp, tmp, const0_rtx, GEN_INT (2), + GEN_INT (6), GEN_INT (7))); + emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0, V4SImode)); + } DONE; }) diff --git a/gcc/testsuite/gcc.target/i386/pr107432-8.c b/gcc/testsuite/gcc.target/i386/pr107432-8.c new file mode 100644 index 00000000000..f0d1ab028f7 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-8.c @@ -0,0 +1,73 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64-v3 -O2" } */ +/* { dg-final { scan-assembler-times "vshufps" 1 } } */ +/* { dg-final { scan-assembler-times "vpshufb" 15 } } */ +/* { dg-final { scan-assembler-times "vpermd" 1 } } */ +/* { dg-final { scan-assembler-times "vpermq" 5 } } */ + +#include <x86intrin.h> + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtepi64_epi32_builtin_convertvector(__v2di a) +{ + return __builtin_convertvector((__v2di)a, __v2si); +} + +__v4si mm256_cvtepi64_epi32_builtin_convertvector(__v4di a) +{ + return __builtin_convertvector((__v4di)a, __v4si); +} + +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2hi); +} + +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4hi); +} + +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4hi); +} + +__v8hi mm256_cvtepi32_epi16_builtin_convertvector(__v8si a) +{ + return __builtin_convertvector((__v8si)a, __v8hi); +} + +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2qi); +} + +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4qi); +} + +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4qi); +} + +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v8si)a, __v8qi); +} + +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v8hi)a, __v8qi); +} + +__v16qi mm256_cvtepi16_epi8_builtin_convertvector(__v16hi a) +{ + return __builtin_convertvector((__v16hi)a, __v16qi); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-9.c b/gcc/testsuite/gcc.target/i386/pr107432-9.c new file mode 100644 index 00000000000..650d352b945 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-9.c @@ -0,0 +1,121 @@ +/* { dg-do run } */ +/* { dg-options "-march=x86-64-v3 -O2 -flax-vector-conversions" } */ +#include <x86intrin.h> + +#include "avx-check.h" + +#ifndef TEST +#define TEST avx_test +#endif + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +typedef union +{ + __v2si x; + int a[2]; +} union64i_d; + +typedef union +{ + __v2hi x; + short a[2]; +} union32i_w; + +typedef union +{ + __v4hi x; + short a[4]; +} union64i_w; + +typedef union +{ + __v2qi x; + char a[2]; +} union16i_b; + +typedef union +{ + __v4qi x; + char a[4]; +} union32i_b; + +typedef union +{ + __v8qi x; + char a[8]; +} union64i_b; + +#define CHECK_EXP_LESS128(UNION_TYPE, VALUE_TYPE, FMT) \ +static int \ +__attribute__((noinline, unused)) \ +check_##UNION_TYPE (UNION_TYPE u, const VALUE_TYPE * v) \ +{ \ + int i; \ + int err = 0; \ + \ + for (i = 0; i < ARRAY_SIZE (u.a); i++) \ + if (u.a[i] != v[i]) \ + { \ + err++; \ + PRINTF ("%i: " FMT " != " FMT "\n", \ + i, v[i], u.a[i]); \ + } \ + return err; \ +} + +CHECK_EXP_LESS128 (union64i_d, int, "%d"); +CHECK_EXP_LESS128 (union32i_w, short, "%d"); +CHECK_EXP_LESS128 (union64i_w, short, "%d"); +CHECK_EXP_LESS128 (union16i_b, char, "%d"); +CHECK_EXP_LESS128 (union32i_b, char, "%d"); +CHECK_EXP_LESS128 (union64i_b, char, "%d"); + +#define SUBTEST(INPUT_TYPE, OUTPUT_TYPE, OUTPUT_INNER, INIT_TYPE, CVT_TYPE) \ +void do_test##INIT_TYPE##CVT_TYPE () \ +{ \ + INPUT_TYPE s; \ + OUTPUT_TYPE r, ref; \ + for (int i = 0; i < ARRAY_SIZE (s.a); i++) \ + { \ + s.a[i] = (i + 23415) * (i + 341); \ + ref.a[i] = (OUTPUT_INNER) s.a[i]; \ + } \ + r.x = __builtin_convertvector((INIT_TYPE)s.x, CVT_TYPE); \ + \ + if (check_##OUTPUT_TYPE (r, ref.a)) \ + abort (); \ + return; \ +} + +SUBTEST(union128i_q, union64i_d, int, __v2di, __v2si); +SUBTEST(union256i_q, union128i_d, int, __v4di, __v4si); +SUBTEST(union128i_q, union32i_w, short, __v2di, __v2hi); +SUBTEST(union256i_q, union64i_w, short, __v4di, __v4hi); +SUBTEST(union128i_d, union64i_w, short, __v4si, __v4hi); +SUBTEST(union256i_d, union128i_w, short, __v8si, __v8hi); +SUBTEST(union128i_q, union16i_b, char, __v2di, __v2qi); +SUBTEST(union256i_q, union32i_b, char, __v4di,__v4qi); +SUBTEST(union128i_d, union32i_b, char, __v4si, __v4qi); +SUBTEST(union256i_d, union64i_b, char, __v8si, __v8qi); +SUBTEST(union128i_w, union64i_b, char, __v8hi, __v8qi); +SUBTEST(union256i_w, union128i_b, char, __v16hi, __v16qi); + +void TEST (void) +{ + do_test__v2di__v2si (); + do_test__v2di__v2hi (); + do_test__v2di__v2qi (); + do_test__v4di__v4si (); + do_test__v4di__v4hi (); + do_test__v4di__v4qi (); + do_test__v4si__v4hi (); + do_test__v4si__v4qi (); + do_test__v8si__v8hi (); + do_test__v8si__v8qi (); + do_test__v8hi__v8qi (); + do_test__v16hi__v16qi (); +} diff --git a/gcc/testsuite/gcc.target/i386/pr92645-4.c b/gcc/testsuite/gcc.target/i386/pr92645-4.c index 28a3f9a3527..3aa49a3b654 100644 --- a/gcc/testsuite/gcc.target/i386/pr92645-4.c +++ b/gcc/testsuite/gcc.target/i386/pr92645-4.c @@ -52,5 +52,3 @@ void f(char *dst, char *src, unsigned long n, unsigned c) a uniform CTOR with a vector promotion to a CTOR on a promoted element. */ /* { dg-final { scan-tree-dump-times "\\(vector\\(16\\) short unsigned int\\)" 2 "optimized" { xfail *-*-* } } } */ -/* { dg-final { scan-tree-dump-times "VEC_PACK_TRUNC" 1 "optimized" } } */ -/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "optimized" } } */