diff mbox series

[v2] AVX512FP16: Fix wrong code for _mm_mask_f[c]madd.*sch [PR 104978]

Message ID 20220321115221.3438-1-hongyu.wang@intel.com
State New
Headers show
Series [v2] AVX512FP16: Fix wrong code for _mm_mask_f[c]madd.*sch [PR 104978] | expand

Commit Message

Hongyu Wang March 21, 2022, 11:52 a.m. UTC
Hi,

For complex scalar intrinsic like _mm_mask_fcmadd_sch, the
mask should be and by 1 to ensure the mask is bind to lowest byte.
Use masked vmovss to perform same operation which omits higher bits
of mask.

Bootstraped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.

Ok for master?

gcc/ChangeLog:

	PR target/104978
	* config/i386/sse.md
	(avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name):
	Use avx512f_movsf_mask instead of vmovaps or vblend.
	(avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name): Likewise.

gcc/testsuite/ChangeLog:

	PR target/104978
	* gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c: Adjust scan.
	* gcc.target/i386/avx512fp16-vfmaddcsh-1a.c: Ditto.
	* gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c: Removed.
	* gcc.target/i386/avx512fp16-vfmaddcsh-1c.c: Ditto.
	* gcc.target/i386/pr104978.c: New test.
---
 gcc/config/i386/sse.md                        | 48 ++++---------------
 .../i386/avx512fp16-vfcmaddcsh-1a.c           |  4 +-
 .../i386/avx512fp16-vfcmaddcsh-1c.c           | 13 -----
 .../gcc.target/i386/avx512fp16-vfmaddcsh-1a.c |  4 +-
 .../gcc.target/i386/avx512fp16-vfmaddcsh-1c.c | 13 -----
 gcc/testsuite/gcc.target/i386/pr104978.c      | 18 +++++++
 6 files changed, 30 insertions(+), 70 deletions(-)
 delete mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c
 delete mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr104978.c

Comments

Hongtao Liu March 21, 2022, 12:52 p.m. UTC | #1
On Mon, Mar 21, 2022 at 7:52 PM Hongyu Wang via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi,
>
> For complex scalar intrinsic like _mm_mask_fcmadd_sch, the
> mask should be and by 1 to ensure the mask is bind to lowest byte.
> Use masked vmovss to perform same operation which omits higher bits
> of mask.
>
> Bootstraped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.
>
> Ok for master?
>
> gcc/ChangeLog:
>
>         PR target/104978
>         * config/i386/sse.md
>         (avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name):
>         Use avx512f_movsf_mask instead of vmovaps or vblend.
>         (avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name): Likewise.
>
> gcc/testsuite/ChangeLog:
>
>         PR target/104978
>         * gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c: Adjust scan.
>         * gcc.target/i386/avx512fp16-vfmaddcsh-1a.c: Ditto.
>         * gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c: Removed.
>         * gcc.target/i386/avx512fp16-vfmaddcsh-1c.c: Ditto.
>         * gcc.target/i386/pr104978.c: New test.
> ---
>  gcc/config/i386/sse.md                        | 48 ++++---------------
>  .../i386/avx512fp16-vfcmaddcsh-1a.c           |  4 +-
>  .../i386/avx512fp16-vfcmaddcsh-1c.c           | 13 -----
>  .../gcc.target/i386/avx512fp16-vfmaddcsh-1a.c |  4 +-
>  .../gcc.target/i386/avx512fp16-vfmaddcsh-1c.c | 13 -----
>  gcc/testsuite/gcc.target/i386/pr104978.c      | 18 +++++++
>  6 files changed, 30 insertions(+), 70 deletions(-)
>  delete mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c
>  delete mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr104978.c
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 21bf3c55c95..1087a37812f 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -6586,26 +6586,10 @@ (define_expand "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>"
>      emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask (operands[0],
>        operands[1], operands[2], operands[3], operands[4]));
>
> -  if (TARGET_AVX512VL)
> -  {
> -    op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> -    op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> -    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
> -  }
> -  else
> -  {
> -    rtx mask, tmp, vec_mask;
> -    mask = lowpart_subreg (SImode, operands[4], QImode),
> -    tmp = gen_reg_rtx (SImode);
> -    emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
> -    vec_mask = gen_reg_rtx (V4SImode);
> -    emit_insn (gen_rtx_SET (vec_mask, CONST0_RTX (V4SImode)));
> -    emit_insn (gen_vec_setv4si_0 (vec_mask, vec_mask, tmp));
> -    vec_mask = lowpart_subreg (V4SFmode, vec_mask, V4SImode);
> -    op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> -    op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> -    emit_insn (gen_sse4_1_blendvps (op0, op1, op0, vec_mask));
> -  }
> +  op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> +  op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> +  emit_insn (gen_avx512f_movsf_mask (op1, op1, op0, op1, operands[4]));
> +  emit_move_insn (op0, op1);
Considering ICE in PR104976, it's better to force_reg before lowpart_subreg.
i.e.
op0 = lowpart_subreg (V4SFmode, force_reg (V8HFmode, operands[0]), V8HFmode);
if (!MEM_P (operands[1]))
  operands[1] = force_reg (V8HFmode, operands[1]);
op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
rtx dest = gen_reg_rtx (V4SFmode);
emit_insn (gen_avx512f_movsf_mask (dest, op1, op0, op1, operands[4]));
emit_move_insn (operands[0], lowpart_subreg (V8HFmode, dest, V4SFmode);

>    DONE;
>  })
>
> @@ -6641,26 +6625,10 @@ (define_expand "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>"
>      emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask (operands[0],
>        operands[1], operands[2], operands[3], operands[4]));
>
> -  if (TARGET_AVX512VL)
> -  {
> -    op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> -    op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> -    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
> -  }
> -  else
> -  {
> -    rtx mask, tmp, vec_mask;
> -    mask = lowpart_subreg (SImode, operands[4], QImode),
> -    tmp = gen_reg_rtx (SImode);
> -    emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
> -    vec_mask = gen_reg_rtx (V4SImode);
> -    emit_insn (gen_rtx_SET (vec_mask, CONST0_RTX (V4SImode)));
> -    emit_insn (gen_vec_setv4si_0 (vec_mask, vec_mask, tmp));
> -    vec_mask = lowpart_subreg (V4SFmode, vec_mask, V4SImode);
> -    op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> -    op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> -    emit_insn (gen_sse4_1_blendvps (op0, op1, op0, vec_mask));
> -  }
> +  op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> +  op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> +  emit_insn (gen_avx512f_movsf_mask (op1, op1, op0, op1, operands[4]));
> +  emit_move_insn (op0, op1);
>    DONE;
>  })
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c
> index eb96588df39..0f87861f09b 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c
> @@ -1,13 +1,13 @@
>  /* { dg-do compile } */
> -/* { dg-options "-mavx512fp16 -mno-avx512vl -O2" } */
> +/* { dg-options "-mavx512fp16 -O2" } */
>  /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
>  /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
>  /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vblendvps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
>  /* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}+(?:\n|\[ \\t\]+#)" 2 } } */
>
>  #include <immintrin.h>
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c
> deleted file mode 100644
> index 79a295f722c..00000000000
> --- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c
> +++ /dev/null
> @@ -1,13 +0,0 @@
> -/* { dg-do compile } */
> -/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */
> -/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
> -/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
> -/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vblendmps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
> -/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> -
> -#include "avx512fp16-vfcmaddcsh-1a.c"
> -
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c
> index 288d1c12a10..6b07957a8bb 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c
> @@ -1,13 +1,13 @@
>  /* { dg-do compile } */
> -/* { dg-options "-mavx512fp16 -mno-avx512vl -O2" } */
> +/* { dg-options "-mavx512fp16 -O2" } */
>  /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
>  /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
>  /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vblendvps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
>  /* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}+(?:\n|\[ \\t\]+#)" 2 } } */
>
>  #include <immintrin.h>
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c
> deleted file mode 100644
> index 7863f8f9af9..00000000000
> --- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c
> +++ /dev/null
> @@ -1,13 +0,0 @@
> -/* { dg-do compile } */
> -/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */
> -/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
> -/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
> -/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vblendmps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
> -/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> -
> -#include "avx512fp16-vfmaddcsh-1a.c"
> -
> diff --git a/gcc/testsuite/gcc.target/i386/pr104978.c b/gcc/testsuite/gcc.target/i386/pr104978.c
> new file mode 100644
> index 00000000000..54788171aff
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr104978.c
> @@ -0,0 +1,18 @@
> +/* PR target/104978 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx512fp16" } */
> +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}+(?:\n|\[ \\t\]+#)" 2 } } */
> +
> +#include<immintrin.h>
> +
> +__m128h
> +foo (__m128h a, __m128h b, __m128h c, __mmask8 m)
> +{
> +  return _mm_mask_fmadd_round_sch (a, m, b, c, 8);
> +}
> +
> +__m128h
> +foo2 (__m128h a, __m128h b, __m128h c, __mmask8 m)
> +{
> +  return _mm_mask_fcmadd_round_sch (a, m, b, c, 8);
> +}
> --
> 2.18.1
>
Hongyu Wang March 21, 2022, 2:14 p.m. UTC | #2
> Considering ICE in PR104976, it's better to force_reg before lowpart_subreg.
> i.e.
> op0 = lowpart_subreg (V4SFmode, force_reg (V8HFmode, operands[0]), V8HFmode);
> if (!MEM_P (operands[1]))
>   operands[1] = force_reg (V8HFmode, operands[1]);
> op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> rtx dest = gen_reg_rtx (V4SFmode);
> emit_insn (gen_avx512f_movsf_mask (dest, op1, op0, op1, operands[4]));
> emit_move_insn (operands[0], lowpart_subreg (V8HFmode, dest, V4SFmode);

I think this is different from PR104976, since operands[0] and
operands[1] here are strictly V8HF operands from builtin input.
I suppose there should be no chance to input a different size subreg
for the expander, otherwise (__v8hf) convert in builtin would fail
first.

Hongtao Liu via Gcc-patches <gcc-patches@gcc.gnu.org> 于2022年3月21日周一 20:53写道:

>
> On Mon, Mar 21, 2022 at 7:52 PM Hongyu Wang via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > Hi,
> >
> > For complex scalar intrinsic like _mm_mask_fcmadd_sch, the
> > mask should be and by 1 to ensure the mask is bind to lowest byte.
> > Use masked vmovss to perform same operation which omits higher bits
> > of mask.
> >
> > Bootstraped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.
> >
> > Ok for master?
> >
> > gcc/ChangeLog:
> >
> >         PR target/104978
> >         * config/i386/sse.md
> >         (avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name):
> >         Use avx512f_movsf_mask instead of vmovaps or vblend.
> >         (avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name): Likewise.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         PR target/104978
> >         * gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c: Adjust scan.
> >         * gcc.target/i386/avx512fp16-vfmaddcsh-1a.c: Ditto.
> >         * gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c: Removed.
> >         * gcc.target/i386/avx512fp16-vfmaddcsh-1c.c: Ditto.
> >         * gcc.target/i386/pr104978.c: New test.
> > ---
> >  gcc/config/i386/sse.md                        | 48 ++++---------------
> >  .../i386/avx512fp16-vfcmaddcsh-1a.c           |  4 +-
> >  .../i386/avx512fp16-vfcmaddcsh-1c.c           | 13 -----
> >  .../gcc.target/i386/avx512fp16-vfmaddcsh-1a.c |  4 +-
> >  .../gcc.target/i386/avx512fp16-vfmaddcsh-1c.c | 13 -----
> >  gcc/testsuite/gcc.target/i386/pr104978.c      | 18 +++++++
> >  6 files changed, 30 insertions(+), 70 deletions(-)
> >  delete mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c
> >  delete mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr104978.c
> >
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index 21bf3c55c95..1087a37812f 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -6586,26 +6586,10 @@ (define_expand "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>"
> >      emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask (operands[0],
> >        operands[1], operands[2], operands[3], operands[4]));
> >
> > -  if (TARGET_AVX512VL)
> > -  {
> > -    op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> > -    op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> > -    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
> > -  }
> > -  else
> > -  {
> > -    rtx mask, tmp, vec_mask;
> > -    mask = lowpart_subreg (SImode, operands[4], QImode),
> > -    tmp = gen_reg_rtx (SImode);
> > -    emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
> > -    vec_mask = gen_reg_rtx (V4SImode);
> > -    emit_insn (gen_rtx_SET (vec_mask, CONST0_RTX (V4SImode)));
> > -    emit_insn (gen_vec_setv4si_0 (vec_mask, vec_mask, tmp));
> > -    vec_mask = lowpart_subreg (V4SFmode, vec_mask, V4SImode);
> > -    op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> > -    op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> > -    emit_insn (gen_sse4_1_blendvps (op0, op1, op0, vec_mask));
> > -  }
> > +  op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> > +  op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> > +  emit_insn (gen_avx512f_movsf_mask (op1, op1, op0, op1, operands[4]));
> > +  emit_move_insn (op0, op1);
> Considering ICE in PR104976, it's better to force_reg before lowpart_subreg.
> i.e.
> op0 = lowpart_subreg (V4SFmode, force_reg (V8HFmode, operands[0]), V8HFmode);
> if (!MEM_P (operands[1]))
>   operands[1] = force_reg (V8HFmode, operands[1]);
> op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> rtx dest = gen_reg_rtx (V4SFmode);
> emit_insn (gen_avx512f_movsf_mask (dest, op1, op0, op1, operands[4]));
> emit_move_insn (operands[0], lowpart_subreg (V8HFmode, dest, V4SFmode);
>
> >    DONE;
> >  })
> >
> > @@ -6641,26 +6625,10 @@ (define_expand "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>"
> >      emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask (operands[0],
> >        operands[1], operands[2], operands[3], operands[4]));
> >
> > -  if (TARGET_AVX512VL)
> > -  {
> > -    op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> > -    op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> > -    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
> > -  }
> > -  else
> > -  {
> > -    rtx mask, tmp, vec_mask;
> > -    mask = lowpart_subreg (SImode, operands[4], QImode),
> > -    tmp = gen_reg_rtx (SImode);
> > -    emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
> > -    vec_mask = gen_reg_rtx (V4SImode);
> > -    emit_insn (gen_rtx_SET (vec_mask, CONST0_RTX (V4SImode)));
> > -    emit_insn (gen_vec_setv4si_0 (vec_mask, vec_mask, tmp));
> > -    vec_mask = lowpart_subreg (V4SFmode, vec_mask, V4SImode);
> > -    op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> > -    op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> > -    emit_insn (gen_sse4_1_blendvps (op0, op1, op0, vec_mask));
> > -  }
> > +  op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> > +  op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> > +  emit_insn (gen_avx512f_movsf_mask (op1, op1, op0, op1, operands[4]));
> > +  emit_move_insn (op0, op1);
> >    DONE;
> >  })
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c
> > index eb96588df39..0f87861f09b 100644
> > --- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c
> > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c
> > @@ -1,13 +1,13 @@
> >  /* { dg-do compile } */
> > -/* { dg-options "-mavx512fp16 -mno-avx512vl -O2" } */
> > +/* { dg-options "-mavx512fp16 -O2" } */
> >  /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
> >  /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
> >  /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> > -/* { dg-final { scan-assembler-times "vblendvps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> >  /* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> > +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}+(?:\n|\[ \\t\]+#)" 2 } } */
> >
> >  #include <immintrin.h>
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c
> > deleted file mode 100644
> > index 79a295f722c..00000000000
> > --- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c
> > +++ /dev/null
> > @@ -1,13 +0,0 @@
> > -/* { dg-do compile } */
> > -/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */
> > -/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > -/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
> > -/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> > -/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > -/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
> > -/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> > -/* { dg-final { scan-assembler-times "vblendmps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
> > -/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> > -
> > -#include "avx512fp16-vfcmaddcsh-1a.c"
> > -
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c
> > index 288d1c12a10..6b07957a8bb 100644
> > --- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c
> > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c
> > @@ -1,13 +1,13 @@
> >  /* { dg-do compile } */
> > -/* { dg-options "-mavx512fp16 -mno-avx512vl -O2" } */
> > +/* { dg-options "-mavx512fp16 -O2" } */
> >  /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
> >  /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
> >  /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> > -/* { dg-final { scan-assembler-times "vblendvps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> >  /* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> > +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}+(?:\n|\[ \\t\]+#)" 2 } } */
> >
> >  #include <immintrin.h>
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c
> > deleted file mode 100644
> > index 7863f8f9af9..00000000000
> > --- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c
> > +++ /dev/null
> > @@ -1,13 +0,0 @@
> > -/* { dg-do compile } */
> > -/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */
> > -/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > -/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
> > -/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> > -/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > -/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
> > -/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> > -/* { dg-final { scan-assembler-times "vblendmps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
> > -/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> > -
> > -#include "avx512fp16-vfmaddcsh-1a.c"
> > -
> > diff --git a/gcc/testsuite/gcc.target/i386/pr104978.c b/gcc/testsuite/gcc.target/i386/pr104978.c
> > new file mode 100644
> > index 00000000000..54788171aff
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr104978.c
> > @@ -0,0 +1,18 @@
> > +/* PR target/104978 */
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx512fp16" } */
> > +/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}+(?:\n|\[ \\t\]+#)" 2 } } */
> > +
> > +#include<immintrin.h>
> > +
> > +__m128h
> > +foo (__m128h a, __m128h b, __m128h c, __mmask8 m)
> > +{
> > +  return _mm_mask_fmadd_round_sch (a, m, b, c, 8);
> > +}
> > +
> > +__m128h
> > +foo2 (__m128h a, __m128h b, __m128h c, __mmask8 m)
> > +{
> > +  return _mm_mask_fcmadd_round_sch (a, m, b, c, 8);
> > +}
> > --
> > 2.18.1
> >
>
>
> --
> BR,
> Hongtao
diff mbox series

Patch

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 21bf3c55c95..1087a37812f 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -6586,26 +6586,10 @@  (define_expand "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>"
     emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask (operands[0],
       operands[1], operands[2], operands[3], operands[4]));
 
-  if (TARGET_AVX512VL)
-  {
-    op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
-    op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
-    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
-  }
-  else
-  {
-    rtx mask, tmp, vec_mask;
-    mask = lowpart_subreg (SImode, operands[4], QImode),
-    tmp = gen_reg_rtx (SImode);
-    emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
-    vec_mask = gen_reg_rtx (V4SImode);
-    emit_insn (gen_rtx_SET (vec_mask, CONST0_RTX (V4SImode)));
-    emit_insn (gen_vec_setv4si_0 (vec_mask, vec_mask, tmp));
-    vec_mask = lowpart_subreg (V4SFmode, vec_mask, V4SImode);
-    op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
-    op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
-    emit_insn (gen_sse4_1_blendvps (op0, op1, op0, vec_mask));
-  }
+  op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
+  op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
+  emit_insn (gen_avx512f_movsf_mask (op1, op1, op0, op1, operands[4]));
+  emit_move_insn (op0, op1);
   DONE;
 })
 
@@ -6641,26 +6625,10 @@  (define_expand "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>"
     emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask (operands[0],
       operands[1], operands[2], operands[3], operands[4]));
 
-  if (TARGET_AVX512VL)
-  {
-    op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
-    op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
-    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
-  }
-  else
-  {
-    rtx mask, tmp, vec_mask;
-    mask = lowpart_subreg (SImode, operands[4], QImode),
-    tmp = gen_reg_rtx (SImode);
-    emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
-    vec_mask = gen_reg_rtx (V4SImode);
-    emit_insn (gen_rtx_SET (vec_mask, CONST0_RTX (V4SImode)));
-    emit_insn (gen_vec_setv4si_0 (vec_mask, vec_mask, tmp));
-    vec_mask = lowpart_subreg (V4SFmode, vec_mask, V4SImode);
-    op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
-    op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
-    emit_insn (gen_sse4_1_blendvps (op0, op1, op0, vec_mask));
-  }
+  op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
+  op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
+  emit_insn (gen_avx512f_movsf_mask (op1, op1, op0, op1, operands[4]));
+  emit_move_insn (op0, op1);
   DONE;
 })
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c
index eb96588df39..0f87861f09b 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c
@@ -1,13 +1,13 @@ 
 /* { dg-do compile } */
-/* { dg-options "-mavx512fp16 -mno-avx512vl -O2" } */
+/* { dg-options "-mavx512fp16 -O2" } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vblendvps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
 /* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}+(?:\n|\[ \\t\]+#)" 2 } } */
 
 #include <immintrin.h>
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c
deleted file mode 100644
index 79a295f722c..00000000000
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c
+++ /dev/null
@@ -1,13 +0,0 @@ 
-/* { dg-do compile } */
-/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
-/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
-/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vblendmps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
-/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
-
-#include "avx512fp16-vfcmaddcsh-1a.c"
-
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c
index 288d1c12a10..6b07957a8bb 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c
@@ -1,13 +1,13 @@ 
 /* { dg-do compile } */
-/* { dg-options "-mavx512fp16 -mno-avx512vl -O2" } */
+/* { dg-options "-mavx512fp16 -O2" } */
 /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
 /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
 /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vblendvps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
 /* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}+(?:\n|\[ \\t\]+#)" 2 } } */
 
 #include <immintrin.h>
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c
deleted file mode 100644
index 7863f8f9af9..00000000000
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c
+++ /dev/null
@@ -1,13 +0,0 @@ 
-/* { dg-do compile } */
-/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
-/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
-/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vblendmps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
-/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
-
-#include "avx512fp16-vfmaddcsh-1a.c"
-
diff --git a/gcc/testsuite/gcc.target/i386/pr104978.c b/gcc/testsuite/gcc.target/i386/pr104978.c
new file mode 100644
index 00000000000..54788171aff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr104978.c
@@ -0,0 +1,18 @@ 
+/* PR target/104978 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512fp16" } */
+/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}+(?:\n|\[ \\t\]+#)" 2 } } */
+
+#include<immintrin.h>
+
+__m128h
+foo (__m128h a, __m128h b, __m128h c, __mmask8 m)
+{ 
+  return _mm_mask_fmadd_round_sch (a, m, b, c, 8);
+}
+
+__m128h
+foo2 (__m128h a, __m128h b, __m128h c, __mmask8 m)
+{ 
+  return _mm_mask_fcmadd_round_sch (a, m, b, c, 8);
+}