Message ID | 20211129074616.78603-1-hongtao.liu@intel.com |
---|---|
State | New |
Headers | show |
Series | Optimize _Float16 usage for non AVX512FP16. | expand |
On Mon, Nov 29, 2021 at 8:46 AM liuhongt <hongtao.liu@intel.com> wrote: > > As discussed in PR, this patch does the following optimizations: > 1. No memory is needed to move HI/HFmode between GPR and SSE registers > under TARGET_SSE2 and above, pinsrw/pextrw are used for them w/o > AVX512FP16. > 2. Use gen_sse2_pinsrph/gen_vec_setv4sf_0 to replace > ix86_expand_vector_set in extendhfsf2/truncsfhf2 so that redundant > initialization could be eliminated. > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} and > x86_64-pc-linux-gnu{-m32\ -march=cascadelake,\ -march=cascadelake} > Ok for trunk? > > gcc/ChangeLog: > > PR target/102811 > * config/i386/i386.c (inline_secondary_memory_needed): HImode > move between GPR and SSE registers is supported under > TARGET_SSE2 and above. > * config/i386/i386.md (extendhfsf2): Optimize expander. > (truncsfhf2): Ditto. > * config/i386/sse.md (sse2p4_1): Adjust attr for V8HFmode to > align with V8HImode. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/pr102811-2.c: New test. > * gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c: Add new > scan-assembler-times. > --- > gcc/config/i386/i386.c | 5 +++-- > gcc/config/i386/i386.md | 18 +++++++++++---- > gcc/config/i386/sse.md | 2 +- > .../i386/avx512vl-vcvtps2ph-pr102811.c | 2 +- > gcc/testsuite/gcc.target/i386/pr102811-2.c | 22 +++++++++++++++++++ > 5 files changed, 41 insertions(+), 8 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr102811-2.c > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index 7cf599f57f7..2657e7817ae 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -19437,8 +19437,9 @@ inline_secondary_memory_needed (machine_mode mode, reg_class_t class1, > if (msize > UNITS_PER_WORD) > return true; > > - /* In addition to SImode moves, AVX512FP16 also enables HImode moves. */ > - int minsize = GET_MODE_SIZE (TARGET_AVX512FP16 ? 
HImode : SImode); > + /* In addition to SImode moves, HImode moves are supported for SSE2 and above, > + Use vmovw with AVX512FP16, or pinsrw/pextrw without AVX512FP16. */ > + int minsize = GET_MODE_SIZE (TARGET_SSE2 ? HImode : SImode); > > if (msize < minsize) > return true; > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > index 2cb3e727588..070758edb66 100644 > --- a/gcc/config/i386/i386.md > +++ b/gcc/config/i386/i386.md > @@ -4617,9 +4617,18 @@ (define_expand "extendhfsf2" > if (!TARGET_AVX512FP16) > { > rtx res = gen_reg_rtx (V4SFmode); > - rtx tmp = force_reg (V8HFmode, CONST0_RTX (V8HFmode)); > + rtx tmp = gen_reg_rtx (V8HFmode); > + rtx zero = force_reg (V8HFmode, CONST0_RTX (V8HFmode)); > > - ix86_expand_vector_set (false, tmp, operands[1], 0); > + if (TARGET_AVX2) > + { > + rtx dup = gen_reg_rtx (V8HFmode); > + emit_move_insn (dup, gen_rtx_VEC_DUPLICATE (V8HFmode, operands[1])); > + emit_move_insn (tmp, gen_rtx_VEC_MERGE (V8HFmode, dup, > + zero, const1_rtx)); > + } > + else > + emit_insn (gen_sse2_pinsrph (tmp, zero, operands[1], const1_rtx)); > emit_insn (gen_vcvtph2ps (res, gen_lowpart (V8HImode, tmp))); > emit_move_insn (operands[0], gen_lowpart (SFmode, res)); > DONE; > @@ -4833,9 +4842,10 @@ (define_expand "truncsfhf2" > if (!TARGET_AVX512FP16) > { > rtx res = gen_reg_rtx (V8HFmode); > - rtx tmp = force_reg (V4SFmode, CONST0_RTX (V4SFmode)); > + rtx tmp = gen_reg_rtx (V4SFmode); > + rtx zero = force_reg (V4SFmode, CONST0_RTX (V4SFmode)); > > - ix86_expand_vector_set (false, tmp, operands[1], 0); > + emit_insn (gen_vec_setv4sf_0 (tmp, zero, operands[1])); > emit_insn (gen_vcvtps2ph (gen_lowpart (V8HImode, res), tmp, GEN_INT (4))); > emit_move_insn (operands[0], gen_lowpart (HFmode, res)); > DONE; > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index 5229b23af98..b371b140eb1 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -17272,7 +17272,7 @@ (define_mode_iterator PINSR_MODE > (V2DI 
"TARGET_SSE4_1 && TARGET_64BIT")]) > > (define_mode_attr sse2p4_1 > - [(V16QI "sse4_1") (V8HI "sse2") (V8HF "sse4_1") > + [(V16QI "sse4_1") (V8HI "sse2") (V8HF "sse2") > (V4SI "sse4_1") (V2DI "sse4_1")]) > > (define_mode_attr pinsr_evex_isa > diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c b/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c > index dfbfb167953..9a6c432c866 100644 > --- a/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c > +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c > @@ -1,6 +1,6 @@ > /* { dg-do compile } */ > /* { dg-options "-O2 -mf16c -mno-avx512fp16" } */ > -/* { dg-final { scan-assembler-times "vpxor\[ \\t\]" 2 } } */ > +/* { dg-final { scan-assembler-times "vpxor\[ \\t\]" 1 } } */ > /* { dg-final { scan-assembler-times "vcvtph2ps\[ \\t\]" 2 } } */ > /* { dg-final { scan-assembler-times "vcvtps2ph\[ \\t\]" 1 } } */ > /* { dg-final { scan-assembler-not "__truncsfhf2\[ \\t\]"} } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr102811-2.c b/gcc/testsuite/gcc.target/i386/pr102811-2.c > new file mode 100644 > index 00000000000..e511c665ae8 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr102811-2.c > @@ -0,0 +1,22 @@ > +/* { dg-do compile { target { ! ia32 } } } */ > +/* { dg-options "-O2 -mf16c -mno-avx512fp16" } */ > +/* { dg-final { scan-assembler-times "pextrw" 1 } } */ > +/* { dg-final { scan-assembler-times "pinsrw" 1 } } */ > +/* { dg-final { scan-assembler-not "\\\(%rsp\\\)"} } */ The above scan-assembler-not is maybe too broad, but I have no better solution to detect spills. OK. Thanks, Uros. > +short test (_Float16 a) > +{ > + union{ > + short b; > + _Float16 a;}u; > + u.a = a; > + return u.b; > +} > + > +_Float16 test1 (short a) > +{ > + union{ > + _Float16 b; > + short a;}u; > + u.a = a; > + return u.b; > +} > -- > 2.18.1 >
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 7cf599f57f7..2657e7817ae 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -19437,8 +19437,9 @@ inline_secondary_memory_needed (machine_mode mode, reg_class_t class1, if (msize > UNITS_PER_WORD) return true; - /* In addition to SImode moves, AVX512FP16 also enables HImode moves. */ - int minsize = GET_MODE_SIZE (TARGET_AVX512FP16 ? HImode : SImode); + /* In addition to SImode moves, HImode moves are supported for SSE2 and above, + Use vmovw with AVX512FP16, or pinsrw/pextrw without AVX512FP16. */ + int minsize = GET_MODE_SIZE (TARGET_SSE2 ? HImode : SImode); if (msize < minsize) return true; diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 2cb3e727588..070758edb66 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -4617,9 +4617,18 @@ (define_expand "extendhfsf2" if (!TARGET_AVX512FP16) { rtx res = gen_reg_rtx (V4SFmode); - rtx tmp = force_reg (V8HFmode, CONST0_RTX (V8HFmode)); + rtx tmp = gen_reg_rtx (V8HFmode); + rtx zero = force_reg (V8HFmode, CONST0_RTX (V8HFmode)); - ix86_expand_vector_set (false, tmp, operands[1], 0); + if (TARGET_AVX2) + { + rtx dup = gen_reg_rtx (V8HFmode); + emit_move_insn (dup, gen_rtx_VEC_DUPLICATE (V8HFmode, operands[1])); + emit_move_insn (tmp, gen_rtx_VEC_MERGE (V8HFmode, dup, + zero, const1_rtx)); + } + else + emit_insn (gen_sse2_pinsrph (tmp, zero, operands[1], const1_rtx)); emit_insn (gen_vcvtph2ps (res, gen_lowpart (V8HImode, tmp))); emit_move_insn (operands[0], gen_lowpart (SFmode, res)); DONE; @@ -4833,9 +4842,10 @@ (define_expand "truncsfhf2" if (!TARGET_AVX512FP16) { rtx res = gen_reg_rtx (V8HFmode); - rtx tmp = force_reg (V4SFmode, CONST0_RTX (V4SFmode)); + rtx tmp = gen_reg_rtx (V4SFmode); + rtx zero = force_reg (V4SFmode, CONST0_RTX (V4SFmode)); - ix86_expand_vector_set (false, tmp, operands[1], 0); + emit_insn (gen_vec_setv4sf_0 (tmp, zero, operands[1])); emit_insn (gen_vcvtps2ph (gen_lowpart 
(V8HImode, res), tmp, GEN_INT (4))); emit_move_insn (operands[0], gen_lowpart (HFmode, res)); DONE; diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 5229b23af98..b371b140eb1 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -17272,7 +17272,7 @@ (define_mode_iterator PINSR_MODE (V2DI "TARGET_SSE4_1 && TARGET_64BIT")]) (define_mode_attr sse2p4_1 - [(V16QI "sse4_1") (V8HI "sse2") (V8HF "sse4_1") + [(V16QI "sse4_1") (V8HI "sse2") (V8HF "sse2") (V4SI "sse4_1") (V2DI "sse4_1")]) (define_mode_attr pinsr_evex_isa diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c b/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c index dfbfb167953..9a6c432c866 100644 --- a/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c @@ -1,6 +1,6 @@ /* { dg-do compile } */ /* { dg-options "-O2 -mf16c -mno-avx512fp16" } */ -/* { dg-final { scan-assembler-times "vpxor\[ \\t\]" 2 } } */ +/* { dg-final { scan-assembler-times "vpxor\[ \\t\]" 1 } } */ /* { dg-final { scan-assembler-times "vcvtph2ps\[ \\t\]" 2 } } */ /* { dg-final { scan-assembler-times "vcvtps2ph\[ \\t\]" 1 } } */ /* { dg-final { scan-assembler-not "__truncsfhf2\[ \\t\]"} } */ diff --git a/gcc/testsuite/gcc.target/i386/pr102811-2.c b/gcc/testsuite/gcc.target/i386/pr102811-2.c new file mode 100644 index 00000000000..e511c665ae8 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr102811-2.c @@ -0,0 +1,22 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2 -mf16c -mno-avx512fp16" } */ +/* { dg-final { scan-assembler-times "pextrw" 1 } } */ +/* { dg-final { scan-assembler-times "pinsrw" 1 } } */ +/* { dg-final { scan-assembler-not "\\\(%rsp\\\)"} } */ +short test (_Float16 a) +{ + union{ + short b; + _Float16 a;}u; + u.a = a; + return u.b; +} + +_Float16 test1 (short a) +{ + union{ + _Float16 b; + short a;}u; + u.a = a; + return u.b; +}