diff mbox series

Optimize _Float16 usage for non AVX512FP16.

Message ID 20211129074616.78603-1-hongtao.liu@intel.com
State New
Headers show
Series Optimize _Float16 usage for non AVX512FP16. | expand

Commit Message

Liu, Hongtao Nov. 29, 2021, 7:46 a.m. UTC
As discussed in the PR, this patch makes the following optimizations:
1. No memory is needed to move HI/HFmode between GPR and SSE registers
under TARGET_SSE2 and above, pinsrw/pextrw are used for them w/o
AVX512FP16.
2. Use gen_sse2_pinsrph/gen_vec_setv4sf_0 to replace
ix86_expand_vector_set in extendhfsf2/truncsfhf2 so that redundant
initialization can be eliminated.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} and
x86_64-pc-linux-gnu{-m32\ -march=cascadelake,\ -march=cascadelake}
Ok for trunk?

gcc/ChangeLog:

	PR target/102811
	* config/i386/i386.c (inline_secondary_memory_needed): HImode
	move between GPR and SSE registers is supported under
	TARGET_SSE2 and above.
	* config/i386/i386.md (extendhfsf2): Optimize expander.
	(truncsfhf2): Ditto.
	* config/i386/sse.md (sse2p4_1): Adjust attr for V8HFmode to
	align with V8HImode.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr102811-2.c: New test.
	* gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c: Add new
	scan-assembler-times.
---
 gcc/config/i386/i386.c                        |  5 +++--
 gcc/config/i386/i386.md                       | 18 +++++++++++----
 gcc/config/i386/sse.md                        |  2 +-
 .../i386/avx512vl-vcvtps2ph-pr102811.c        |  2 +-
 gcc/testsuite/gcc.target/i386/pr102811-2.c    | 22 +++++++++++++++++++
 5 files changed, 41 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102811-2.c

Comments

Uros Bizjak Nov. 29, 2021, 7:57 a.m. UTC | #1
On Mon, Nov 29, 2021 at 8:46 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> As discussed in PR, this patch do optimizations:
> 1. No memory is needed to move HI/HFmode between GPR and SSE registers
> under TARGET_SSE2 and above, pinsrw/pextrw are used for them w/o
> AVX512FP16.
> 2. Use gen_sse2_pinsrph/gen_vec_setv4sf_0 to replace
> ix86_expand_vector_set in extendhfsf2/truncsfhf2 so that redundant
> initialization cound be eliminated.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} and
> x86_64-pc-linux-gnu{-m32\ -march=cadcadelake,\ -march=cascadelake}
> Ok for trunk?
>
> gcc/ChangeLog:
>
>         PR target/102811
>         * config/i386/i386.c (inline_secondary_memory_needed): HImode
>         move between GPR and SSE registers is supported under
>         TARGET_SSE2 and above.
>         * config/i386/i386.md (extendhfsf2): Optimize expander.
>         (truncsfhf2): Ditto.
>         * config/i386/sse.md (sse2p4_1): Adjust attr for V8HFmode to
>         align with V8HImode.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr102811-2.c: New test.
>         * gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c: Add new
>         scan-assembler-times.
> ---
>  gcc/config/i386/i386.c                        |  5 +++--
>  gcc/config/i386/i386.md                       | 18 +++++++++++----
>  gcc/config/i386/sse.md                        |  2 +-
>  .../i386/avx512vl-vcvtps2ph-pr102811.c        |  2 +-
>  gcc/testsuite/gcc.target/i386/pr102811-2.c    | 22 +++++++++++++++++++
>  5 files changed, 41 insertions(+), 8 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr102811-2.c
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 7cf599f57f7..2657e7817ae 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -19437,8 +19437,9 @@ inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
>        if (msize > UNITS_PER_WORD)
>         return true;
>
> -      /* In addition to SImode moves, AVX512FP16 also enables HImode moves.  */
> -      int minsize = GET_MODE_SIZE (TARGET_AVX512FP16 ? HImode : SImode);
> +      /* In addition to SImode moves, HImode moves are supported for SSE2 and above,
> +        Use vmovw with AVX512FP16, or pinsrw/pextrw without AVX512FP16.  */
> +      int minsize = GET_MODE_SIZE (TARGET_SSE2 ? HImode : SImode);
>
>        if (msize < minsize)
>         return true;
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index 2cb3e727588..070758edb66 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -4617,9 +4617,18 @@ (define_expand "extendhfsf2"
>    if (!TARGET_AVX512FP16)
>      {
>        rtx res = gen_reg_rtx (V4SFmode);
> -      rtx tmp = force_reg (V8HFmode, CONST0_RTX (V8HFmode));
> +      rtx tmp = gen_reg_rtx (V8HFmode);
> +      rtx zero = force_reg (V8HFmode, CONST0_RTX (V8HFmode));
>
> -      ix86_expand_vector_set (false, tmp, operands[1], 0);
> +      if (TARGET_AVX2)
> +       {
> +         rtx dup = gen_reg_rtx (V8HFmode);
> +         emit_move_insn (dup, gen_rtx_VEC_DUPLICATE (V8HFmode, operands[1]));
> +         emit_move_insn (tmp, gen_rtx_VEC_MERGE (V8HFmode, dup,
> +                                                 zero, const1_rtx));
> +       }
> +      else
> +       emit_insn (gen_sse2_pinsrph (tmp, zero, operands[1], const1_rtx));
>        emit_insn (gen_vcvtph2ps (res, gen_lowpart (V8HImode, tmp)));
>        emit_move_insn (operands[0], gen_lowpart (SFmode, res));
>        DONE;
> @@ -4833,9 +4842,10 @@ (define_expand "truncsfhf2"
>      if (!TARGET_AVX512FP16)
>      {
>        rtx res = gen_reg_rtx (V8HFmode);
> -      rtx tmp = force_reg (V4SFmode, CONST0_RTX (V4SFmode));
> +      rtx tmp = gen_reg_rtx (V4SFmode);
> +      rtx zero = force_reg (V4SFmode, CONST0_RTX (V4SFmode));
>
> -      ix86_expand_vector_set (false, tmp, operands[1], 0);
> +      emit_insn (gen_vec_setv4sf_0 (tmp, zero, operands[1]));
>        emit_insn (gen_vcvtps2ph (gen_lowpart (V8HImode, res), tmp, GEN_INT (4)));
>        emit_move_insn (operands[0], gen_lowpart (HFmode, res));
>        DONE;
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 5229b23af98..b371b140eb1 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -17272,7 +17272,7 @@ (define_mode_iterator PINSR_MODE
>     (V2DI "TARGET_SSE4_1 && TARGET_64BIT")])
>
>  (define_mode_attr sse2p4_1
> -  [(V16QI "sse4_1") (V8HI "sse2") (V8HF "sse4_1")
> +  [(V16QI "sse4_1") (V8HI "sse2") (V8HF "sse2")
>     (V4SI "sse4_1") (V2DI "sse4_1")])
>
>  (define_mode_attr pinsr_evex_isa
> diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c b/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c
> index dfbfb167953..9a6c432c866 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c
> @@ -1,6 +1,6 @@
>  /* { dg-do compile } */
>  /* { dg-options "-O2 -mf16c -mno-avx512fp16" } */
> -/* { dg-final { scan-assembler-times "vpxor\[ \\t\]" 2 } } */
> +/* { dg-final { scan-assembler-times "vpxor\[ \\t\]" 1 } } */
>  /* { dg-final { scan-assembler-times "vcvtph2ps\[ \\t\]" 2 } } */
>  /* { dg-final { scan-assembler-times "vcvtps2ph\[ \\t\]" 1 } } */
>  /* { dg-final { scan-assembler-not "__truncsfhf2\[ \\t\]"} } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr102811-2.c b/gcc/testsuite/gcc.target/i386/pr102811-2.c
> new file mode 100644
> index 00000000000..e511c665ae8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr102811-2.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mf16c -mno-avx512fp16" } */
> +/* { dg-final { scan-assembler-times "pextrw" 1 } } */
> +/* { dg-final { scan-assembler-times "pinsrw" 1 } } */
> +/* { dg-final { scan-assembler-not "\\\(%rsp\\\)"} } */

The above scan-assembler-not is maybe too broad, but I have no better
solution to detect spills.

OK.

Thanks,
Uros.

> +short test (_Float16 a)
> +{
> +  union{
> +    short b;
> +    _Float16 a;}u;
> +  u.a = a;
> +  return u.b;
> +}
> +
> +_Float16 test1 (short a)
> +{
> +  union{
> +    _Float16 b;
> +    short a;}u;
> +  u.a = a;
> +  return u.b;
> +}
> --
> 2.18.1
>
diff mbox series

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 7cf599f57f7..2657e7817ae 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19437,8 +19437,9 @@  inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
       if (msize > UNITS_PER_WORD)
 	return true;
 
-      /* In addition to SImode moves, AVX512FP16 also enables HImode moves.  */
-      int minsize = GET_MODE_SIZE (TARGET_AVX512FP16 ? HImode : SImode);
+      /* In addition to SImode moves, HImode moves are supported for SSE2 and above,
+	 Use vmovw with AVX512FP16, or pinsrw/pextrw without AVX512FP16.  */
+      int minsize = GET_MODE_SIZE (TARGET_SSE2 ? HImode : SImode);
 
       if (msize < minsize)
 	return true;
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 2cb3e727588..070758edb66 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -4617,9 +4617,18 @@  (define_expand "extendhfsf2"
   if (!TARGET_AVX512FP16)
     {
       rtx res = gen_reg_rtx (V4SFmode);
-      rtx tmp = force_reg (V8HFmode, CONST0_RTX (V8HFmode));
+      rtx tmp = gen_reg_rtx (V8HFmode);
+      rtx zero = force_reg (V8HFmode, CONST0_RTX (V8HFmode));
 
-      ix86_expand_vector_set (false, tmp, operands[1], 0);
+      if (TARGET_AVX2)
+	{
+	  rtx dup = gen_reg_rtx (V8HFmode);
+	  emit_move_insn (dup, gen_rtx_VEC_DUPLICATE (V8HFmode, operands[1]));
+	  emit_move_insn (tmp, gen_rtx_VEC_MERGE (V8HFmode, dup,
+						  zero, const1_rtx));
+	}
+      else
+	emit_insn (gen_sse2_pinsrph (tmp, zero, operands[1], const1_rtx));
       emit_insn (gen_vcvtph2ps (res, gen_lowpart (V8HImode, tmp)));
       emit_move_insn (operands[0], gen_lowpart (SFmode, res));
       DONE;
@@ -4833,9 +4842,10 @@  (define_expand "truncsfhf2"
     if (!TARGET_AVX512FP16)
     {
       rtx res = gen_reg_rtx (V8HFmode);
-      rtx tmp = force_reg (V4SFmode, CONST0_RTX (V4SFmode));
+      rtx tmp = gen_reg_rtx (V4SFmode);
+      rtx zero = force_reg (V4SFmode, CONST0_RTX (V4SFmode));
 
-      ix86_expand_vector_set (false, tmp, operands[1], 0);
+      emit_insn (gen_vec_setv4sf_0 (tmp, zero, operands[1]));
       emit_insn (gen_vcvtps2ph (gen_lowpart (V8HImode, res), tmp, GEN_INT (4)));
       emit_move_insn (operands[0], gen_lowpart (HFmode, res));
       DONE;
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 5229b23af98..b371b140eb1 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -17272,7 +17272,7 @@  (define_mode_iterator PINSR_MODE
    (V2DI "TARGET_SSE4_1 && TARGET_64BIT")])
 
 (define_mode_attr sse2p4_1
-  [(V16QI "sse4_1") (V8HI "sse2") (V8HF "sse4_1")
+  [(V16QI "sse4_1") (V8HI "sse2") (V8HF "sse2")
    (V4SI "sse4_1") (V2DI "sse4_1")])
 
 (define_mode_attr pinsr_evex_isa
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c b/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c
index dfbfb167953..9a6c432c866 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c
@@ -1,6 +1,6 @@ 
 /* { dg-do compile } */
 /* { dg-options "-O2 -mf16c -mno-avx512fp16" } */
-/* { dg-final { scan-assembler-times "vpxor\[ \\t\]" 2 } } */
+/* { dg-final { scan-assembler-times "vpxor\[ \\t\]" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtph2ps\[ \\t\]" 2 } } */
 /* { dg-final { scan-assembler-times "vcvtps2ph\[ \\t\]" 1 } } */
 /* { dg-final { scan-assembler-not "__truncsfhf2\[ \\t\]"} } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102811-2.c b/gcc/testsuite/gcc.target/i386/pr102811-2.c
new file mode 100644
index 00000000000..e511c665ae8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102811-2.c
@@ -0,0 +1,22 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mf16c -mno-avx512fp16" } */
+/* { dg-final { scan-assembler-times "pextrw" 1 } } */
+/* { dg-final { scan-assembler-times "pinsrw" 1 } } */
+/* { dg-final { scan-assembler-not "\\\(%rsp\\\)"} } */
+short test (_Float16 a)
+{
+  union{
+    short b;
+    _Float16 a;}u;
+  u.a = a;
+  return u.b;
+}
+
+_Float16 test1 (short a)
+{
+  union{
+    _Float16 b;
+    short a;}u;
+  u.a = a;
+  return u.b;
+}