diff mbox series

AVX512FP16: Fix wrong code for _mm_mask_f[c]madd.*sch [PR 104978]

Message ID 20220319000857.75054-1-hongyu.wang@intel.com
State New
Headers show
Series AVX512FP16: Fix wrong code for _mm_mask_f[c]madd.*sch [PR 104978] | expand

Commit Message

Hongyu Wang March 19, 2022, 12:08 a.m. UTC
Hi,

For complex scalar intrinsic like _mm_mask_fcmadd_sch, the
mask should be and by 1 to ensure the mask is bind to lowest byte.

Bootstraped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.

Ok for master?

gcc/ChangeLog:

	PR target/104978
	* config/i386/sse.md
	(avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name):
	Generate mask & 1 before move to dest under TARGET_AVX512VL.
	(avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name): Likewise.

gcc/testsuite/ChangeLog:

	PR target/104978
	* gcc.target/i386/pr104978.c: New test.
---
 gcc/config/i386/sse.md                   | 16 ++++++++++------
 gcc/testsuite/gcc.target/i386/pr104978.c | 18 ++++++++++++++++++
 2 files changed, 28 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr104978.c

Comments

Hongtao Liu March 21, 2022, 1:07 a.m. UTC | #1
On Sat, Mar 19, 2022 at 8:09 AM Hongyu Wang via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi,
>
> For complex scalar intrinsic like _mm_mask_fcmadd_sch, the
> mask should be and by 1 to ensure the mask is bind to lowest byte.
>
> Bootstraped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.
>
> Ok for master?
>
> gcc/ChangeLog:
>
>         PR target/104978
>         * config/i386/sse.md
>         (avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name):
>         Generate mask & 1 before move to dest under TARGET_AVX512VL.
>         (avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name): Likewise.
>
> gcc/testsuite/ChangeLog:
>
>         PR target/104978
>         * gcc.target/i386/pr104978.c: New test.
> ---
>  gcc/config/i386/sse.md                   | 16 ++++++++++------
>  gcc/testsuite/gcc.target/i386/pr104978.c | 18 ++++++++++++++++++
>  2 files changed, 28 insertions(+), 6 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr104978.c
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index ed98120be59..cc4c5542ee6 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -6576,7 +6576,7 @@ (define_expand "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>"
>     (match_operand:QI 4 "register_operand")]
>    "TARGET_AVX512FP16 && <round_mode512bit_condition>"
>  {
> -  rtx op0, op1;
> +  rtx op0, op1, mask;
>
>    if (<round_embedded_complex>)
>      emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask<round_expand_name> (
> @@ -6590,11 +6590,13 @@ (define_expand "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>"
>    {
>      op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
>      op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> -    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
> +    mask = gen_reg_rtx (QImode);
> +    emit_insn (gen_andqi3 (mask, operands[4], GEN_INT (1)));
> +    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, mask));
>    }
>    else
>    {
> -    rtx mask, tmp, vec_mask;
> +    rtx tmp, vec_mask;
>      mask = lowpart_subreg (SImode, operands[4], QImode),
>      tmp = gen_reg_rtx (SImode);
>      emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
> @@ -6631,7 +6633,7 @@ (define_expand "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>"
>     (match_operand:QI 4 "register_operand")]
>    "TARGET_AVX512FP16 && <round_mode512bit_condition>"
>  {
> -  rtx op0, op1;
> +  rtx op0, op1, mask;
>
>    if (<round_embedded_complex>)
>      emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask<round_expand_name> (
> @@ -6645,11 +6647,13 @@ (define_expand "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>"
>    {
>      op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
>      op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> -    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
> +    mask = gen_reg_rtx (QImode);
> +    emit_insn (gen_andqi3 (mask, operands[4], GEN_INT (1)));
> +    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, mask));
Would it be better to use vmovss under avx512vl without & 1 for mask.
>    }
>    else
>    {
> -    rtx mask, tmp, vec_mask;
> +    rtx tmp, vec_mask;
>      mask = lowpart_subreg (SImode, operands[4], QImode),
>      tmp = gen_reg_rtx (SImode);
>      emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
> diff --git a/gcc/testsuite/gcc.target/i386/pr104978.c b/gcc/testsuite/gcc.target/i386/pr104978.c
> new file mode 100644
> index 00000000000..fd22a6c3f43
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr104978.c
> @@ -0,0 +1,18 @@
> +/* PR target/104978 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
> +/* { dg-final { scan-assembler-times "and\[^\\n\\r\]*\\\$1" 2 } } */
> +
> +#include<immintrin.h>
> +
> +__m128h
> +foo (__m128h a, __m128h b, __m128h c, __mmask8 m)
> +{
> +  return _mm_mask_fmadd_round_sch (a, m, b, c, 8);
> +}
> +
> +__m128h
> +foo2 (__m128h a, __m128h b, __m128h c, __mmask8 m)
> +{
> +  return _mm_mask_fcmadd_round_sch (a, m, b, c, 8);
> +}
> --
> 2.18.1
>
Hongyu Wang March 21, 2022, 1:18 a.m. UTC | #2
> Would it be better to use vmovss under avx512vl without & 1 for mask.

vmovss clears the upper bits, but the intrinsic requires src1. We
still need either a mask move or blend for the high part.

LLVM generates mask & 1 for these intrinsics.

Hongtao Liu via Gcc-patches <gcc-patches@gcc.gnu.org> 于2022年3月21日周一 09:08写道:
>
> On Sat, Mar 19, 2022 at 8:09 AM Hongyu Wang via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > Hi,
> >
> > For complex scalar intrinsic like _mm_mask_fcmadd_sch, the
> > mask should be and by 1 to ensure the mask is bind to lowest byte.
> >
> > Bootstraped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.
> >
> > Ok for master?
> >
> > gcc/ChangeLog:
> >
> >         PR target/104978
> >         * config/i386/sse.md
> >         (avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name):
> >         Generate mask & 1 before move to dest under TARGET_AVX512VL.
> >         (avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name): Likewise.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         PR target/104978
> >         * gcc.target/i386/pr104978.c: New test.
> > ---
> >  gcc/config/i386/sse.md                   | 16 ++++++++++------
> >  gcc/testsuite/gcc.target/i386/pr104978.c | 18 ++++++++++++++++++
> >  2 files changed, 28 insertions(+), 6 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr104978.c
> >
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index ed98120be59..cc4c5542ee6 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -6576,7 +6576,7 @@ (define_expand "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>"
> >     (match_operand:QI 4 "register_operand")]
> >    "TARGET_AVX512FP16 && <round_mode512bit_condition>"
> >  {
> > -  rtx op0, op1;
> > +  rtx op0, op1, mask;
> >
> >    if (<round_embedded_complex>)
> >      emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask<round_expand_name> (
> > @@ -6590,11 +6590,13 @@ (define_expand "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>"
> >    {
> >      op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> >      op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> > -    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
> > +    mask = gen_reg_rtx (QImode);
> > +    emit_insn (gen_andqi3 (mask, operands[4], GEN_INT (1)));
> > +    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, mask));
> >    }
> >    else
> >    {
> > -    rtx mask, tmp, vec_mask;
> > +    rtx tmp, vec_mask;
> >      mask = lowpart_subreg (SImode, operands[4], QImode),
> >      tmp = gen_reg_rtx (SImode);
> >      emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
> > @@ -6631,7 +6633,7 @@ (define_expand "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>"
> >     (match_operand:QI 4 "register_operand")]
> >    "TARGET_AVX512FP16 && <round_mode512bit_condition>"
> >  {
> > -  rtx op0, op1;
> > +  rtx op0, op1, mask;
> >
> >    if (<round_embedded_complex>)
> >      emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask<round_expand_name> (
> > @@ -6645,11 +6647,13 @@ (define_expand "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>"
> >    {
> >      op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> >      op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> > -    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
> > +    mask = gen_reg_rtx (QImode);
> > +    emit_insn (gen_andqi3 (mask, operands[4], GEN_INT (1)));
> > +    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, mask));
> Would it be better to use vmovss under avx512vl without & 1 for mask.
> >    }
> >    else
> >    {
> > -    rtx mask, tmp, vec_mask;
> > +    rtx tmp, vec_mask;
> >      mask = lowpart_subreg (SImode, operands[4], QImode),
> >      tmp = gen_reg_rtx (SImode);
> >      emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
> > diff --git a/gcc/testsuite/gcc.target/i386/pr104978.c b/gcc/testsuite/gcc.target/i386/pr104978.c
> > new file mode 100644
> > index 00000000000..fd22a6c3f43
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr104978.c
> > @@ -0,0 +1,18 @@
> > +/* PR target/104978 */
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
> > +/* { dg-final { scan-assembler-times "and\[^\\n\\r\]*\\\$1" 2 } } */
> > +
> > +#include<immintrin.h>
> > +
> > +__m128h
> > +foo (__m128h a, __m128h b, __m128h c, __mmask8 m)
> > +{
> > +  return _mm_mask_fmadd_round_sch (a, m, b, c, 8);
> > +}
> > +
> > +__m128h
> > +foo2 (__m128h a, __m128h b, __m128h c, __mmask8 m)
> > +{
> > +  return _mm_mask_fcmadd_round_sch (a, m, b, c, 8);
> > +}
> > --
> > 2.18.1
> >
>
>
> --
> BR,
> Hongtao
Hongtao Liu March 21, 2022, 1:48 a.m. UTC | #3
On Mon, Mar 21, 2022 at 9:22 AM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
>
> > Would it be better to use vmovss under avx512vl without & 1 for mask.
>
> vmovss clears the upper bits, but the intrinsic requires src1. We
> still need either a mask move or blend for the high part.
not for __m128 _mm_mask_move_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=vmovss&ig_expand=3807,3081,3082,3084,3083,4837,4838
>
> LLVM generates mask & 1 for these intrinsics.
>
> Hongtao Liu via Gcc-patches <gcc-patches@gcc.gnu.org> 于2022年3月21日周一 09:08写道:
> >
> > On Sat, Mar 19, 2022 at 8:09 AM Hongyu Wang via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> > >
> > > Hi,
> > >
> > > For complex scalar intrinsic like _mm_mask_fcmadd_sch, the
> > > mask should be and by 1 to ensure the mask is bind to lowest byte.
> > >
> > > Bootstraped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.
> > >
> > > Ok for master?
> > >
> > > gcc/ChangeLog:
> > >
> > >         PR target/104978
> > >         * config/i386/sse.md
> > >         (avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name):
> > >         Generate mask & 1 before move to dest under TARGET_AVX512VL.
> > >         (avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name): Likewise.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         PR target/104978
> > >         * gcc.target/i386/pr104978.c: New test.
> > > ---
> > >  gcc/config/i386/sse.md                   | 16 ++++++++++------
> > >  gcc/testsuite/gcc.target/i386/pr104978.c | 18 ++++++++++++++++++
> > >  2 files changed, 28 insertions(+), 6 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr104978.c
> > >
> > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > > index ed98120be59..cc4c5542ee6 100644
> > > --- a/gcc/config/i386/sse.md
> > > +++ b/gcc/config/i386/sse.md
> > > @@ -6576,7 +6576,7 @@ (define_expand "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>"
> > >     (match_operand:QI 4 "register_operand")]
> > >    "TARGET_AVX512FP16 && <round_mode512bit_condition>"
> > >  {
> > > -  rtx op0, op1;
> > > +  rtx op0, op1, mask;
> > >
> > >    if (<round_embedded_complex>)
> > >      emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask<round_expand_name> (
> > > @@ -6590,11 +6590,13 @@ (define_expand "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>"
> > >    {
> > >      op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> > >      op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> > > -    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
> > > +    mask = gen_reg_rtx (QImode);
> > > +    emit_insn (gen_andqi3 (mask, operands[4], GEN_INT (1)));
> > > +    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, mask));
> > >    }
> > >    else
> > >    {
> > > -    rtx mask, tmp, vec_mask;
> > > +    rtx tmp, vec_mask;
> > >      mask = lowpart_subreg (SImode, operands[4], QImode),
> > >      tmp = gen_reg_rtx (SImode);
> > >      emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
> > > @@ -6631,7 +6633,7 @@ (define_expand "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>"
> > >     (match_operand:QI 4 "register_operand")]
> > >    "TARGET_AVX512FP16 && <round_mode512bit_condition>"
> > >  {
> > > -  rtx op0, op1;
> > > +  rtx op0, op1, mask;
> > >
> > >    if (<round_embedded_complex>)
> > >      emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask<round_expand_name> (
> > > @@ -6645,11 +6647,13 @@ (define_expand "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>"
> > >    {
> > >      op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> > >      op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> > > -    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
> > > +    mask = gen_reg_rtx (QImode);
> > > +    emit_insn (gen_andqi3 (mask, operands[4], GEN_INT (1)));
> > > +    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, mask));
> > Would it be better to use vmovss under avx512vl without & 1 for mask.
> > >    }
> > >    else
> > >    {
> > > -    rtx mask, tmp, vec_mask;
> > > +    rtx tmp, vec_mask;
> > >      mask = lowpart_subreg (SImode, operands[4], QImode),
> > >      tmp = gen_reg_rtx (SImode);
> > >      emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr104978.c b/gcc/testsuite/gcc.target/i386/pr104978.c
> > > new file mode 100644
> > > index 00000000000..fd22a6c3f43
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr104978.c
> > > @@ -0,0 +1,18 @@
> > > +/* PR target/104978 */
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
> > > +/* { dg-final { scan-assembler-times "and\[^\\n\\r\]*\\\$1" 2 } } */
> > > +
> > > +#include<immintrin.h>
> > > +
> > > +__m128h
> > > +foo (__m128h a, __m128h b, __m128h c, __mmask8 m)
> > > +{
> > > +  return _mm_mask_fmadd_round_sch (a, m, b, c, 8);
> > > +}
> > > +
> > > +__m128h
> > > +foo2 (__m128h a, __m128h b, __m128h c, __mmask8 m)
> > > +{
> > > +  return _mm_mask_fcmadd_round_sch (a, m, b, c, 8);
> > > +}
> > > --
> > > 2.18.1
> > >
> >
> >
> > --
> > BR,
> > Hongtao
Hongyu Wang March 21, 2022, 1:59 a.m. UTC | #4
> > > Would it be better to use vmovss under avx512vl without & 1 for mask.
> >
> > vmovss clears the upper bits, but the intrinsic requires src1. We
> > still need either a mask move or blend for the high part.
> not for __m128 _mm_mask_move_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
> https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=vmovss&ig_expand=3807,3081,3082,3084,3083,4837,4838

Oh, if this works, the non-avx512vl part could also be adjusted. Will
try this, thanks.

Hongtao Liu <crazylht@gmail.com> 于2022年3月21日周一 09:48写道:
>
> On Mon, Mar 21, 2022 at 9:22 AM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
> >
> > > Would it be better to use vmovss under avx512vl without & 1 for mask.
> >
> > vmovss clears the upper bits, but the intrinsic requires src1. We
> > still need either a mask move or blend for the high part.
> not for __m128 _mm_mask_move_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
> https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=vmovss&ig_expand=3807,3081,3082,3084,3083,4837,4838
> >
> > LLVM generates mask & 1 for these intrinsics.
> >
> > Hongtao Liu via Gcc-patches <gcc-patches@gcc.gnu.org> 于2022年3月21日周一 09:08写道:
> > >
> > > On Sat, Mar 19, 2022 at 8:09 AM Hongyu Wang via Gcc-patches
> > > <gcc-patches@gcc.gnu.org> wrote:
> > > >
> > > > Hi,
> > > >
> > > > For complex scalar intrinsic like _mm_mask_fcmadd_sch, the
> > > > mask should be and by 1 to ensure the mask is bind to lowest byte.
> > > >
> > > > Bootstraped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.
> > > >
> > > > Ok for master?
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > >         PR target/104978
> > > >         * config/i386/sse.md
> > > >         (avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name):
> > > >         Generate mask & 1 before move to dest under TARGET_AVX512VL.
> > > >         (avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name): Likewise.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > >         PR target/104978
> > > >         * gcc.target/i386/pr104978.c: New test.
> > > > ---
> > > >  gcc/config/i386/sse.md                   | 16 ++++++++++------
> > > >  gcc/testsuite/gcc.target/i386/pr104978.c | 18 ++++++++++++++++++
> > > >  2 files changed, 28 insertions(+), 6 deletions(-)
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr104978.c
> > > >
> > > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > > > index ed98120be59..cc4c5542ee6 100644
> > > > --- a/gcc/config/i386/sse.md
> > > > +++ b/gcc/config/i386/sse.md
> > > > @@ -6576,7 +6576,7 @@ (define_expand "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>"
> > > >     (match_operand:QI 4 "register_operand")]
> > > >    "TARGET_AVX512FP16 && <round_mode512bit_condition>"
> > > >  {
> > > > -  rtx op0, op1;
> > > > +  rtx op0, op1, mask;
> > > >
> > > >    if (<round_embedded_complex>)
> > > >      emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask<round_expand_name> (
> > > > @@ -6590,11 +6590,13 @@ (define_expand "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>"
> > > >    {
> > > >      op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> > > >      op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> > > > -    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
> > > > +    mask = gen_reg_rtx (QImode);
> > > > +    emit_insn (gen_andqi3 (mask, operands[4], GEN_INT (1)));
> > > > +    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, mask));
> > > >    }
> > > >    else
> > > >    {
> > > > -    rtx mask, tmp, vec_mask;
> > > > +    rtx tmp, vec_mask;
> > > >      mask = lowpart_subreg (SImode, operands[4], QImode),
> > > >      tmp = gen_reg_rtx (SImode);
> > > >      emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
> > > > @@ -6631,7 +6633,7 @@ (define_expand "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>"
> > > >     (match_operand:QI 4 "register_operand")]
> > > >    "TARGET_AVX512FP16 && <round_mode512bit_condition>"
> > > >  {
> > > > -  rtx op0, op1;
> > > > +  rtx op0, op1, mask;
> > > >
> > > >    if (<round_embedded_complex>)
> > > >      emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask<round_expand_name> (
> > > > @@ -6645,11 +6647,13 @@ (define_expand "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>"
> > > >    {
> > > >      op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> > > >      op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> > > > -    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
> > > > +    mask = gen_reg_rtx (QImode);
> > > > +    emit_insn (gen_andqi3 (mask, operands[4], GEN_INT (1)));
> > > > +    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, mask));
> > > Would it be better to use vmovss under avx512vl without & 1 for mask.
> > > >    }
> > > >    else
> > > >    {
> > > > -    rtx mask, tmp, vec_mask;
> > > > +    rtx tmp, vec_mask;
> > > >      mask = lowpart_subreg (SImode, operands[4], QImode),
> > > >      tmp = gen_reg_rtx (SImode);
> > > >      emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr104978.c b/gcc/testsuite/gcc.target/i386/pr104978.c
> > > > new file mode 100644
> > > > index 00000000000..fd22a6c3f43
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr104978.c
> > > > @@ -0,0 +1,18 @@
> > > > +/* PR target/104978 */
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
> > > > +/* { dg-final { scan-assembler-times "and\[^\\n\\r\]*\\\$1" 2 } } */
> > > > +
> > > > +#include<immintrin.h>
> > > > +
> > > > +__m128h
> > > > +foo (__m128h a, __m128h b, __m128h c, __mmask8 m)
> > > > +{
> > > > +  return _mm_mask_fmadd_round_sch (a, m, b, c, 8);
> > > > +}
> > > > +
> > > > +__m128h
> > > > +foo2 (__m128h a, __m128h b, __m128h c, __mmask8 m)
> > > > +{
> > > > +  return _mm_mask_fcmadd_round_sch (a, m, b, c, 8);
> > > > +}
> > > > --
> > > > 2.18.1
> > > >
> > >
> > >
> > > --
> > > BR,
> > > Hongtao
>
>
>
> --
> BR,
> Hongtao
diff mbox series

Patch

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index ed98120be59..cc4c5542ee6 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -6576,7 +6576,7 @@  (define_expand "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>"
    (match_operand:QI 4 "register_operand")]
   "TARGET_AVX512FP16 && <round_mode512bit_condition>"
 {
-  rtx op0, op1;
+  rtx op0, op1, mask;
 
   if (<round_embedded_complex>)
     emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask<round_expand_name> (
@@ -6590,11 +6590,13 @@  (define_expand "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>"
   {
     op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
     op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
-    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
+    mask = gen_reg_rtx (QImode);
+    emit_insn (gen_andqi3 (mask, operands[4], GEN_INT (1)));
+    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, mask));
   }
   else
   {
-    rtx mask, tmp, vec_mask;
+    rtx tmp, vec_mask;
     mask = lowpart_subreg (SImode, operands[4], QImode),
     tmp = gen_reg_rtx (SImode);
     emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
@@ -6631,7 +6633,7 @@  (define_expand "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>"
    (match_operand:QI 4 "register_operand")]
   "TARGET_AVX512FP16 && <round_mode512bit_condition>"
 {
-  rtx op0, op1;
+  rtx op0, op1, mask;
 
   if (<round_embedded_complex>)
     emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask<round_expand_name> (
@@ -6645,11 +6647,13 @@  (define_expand "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>"
   {
     op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
     op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
-    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
+    mask = gen_reg_rtx (QImode);
+    emit_insn (gen_andqi3 (mask, operands[4], GEN_INT (1)));
+    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, mask));
   }
   else
   {
-    rtx mask, tmp, vec_mask;
+    rtx tmp, vec_mask;
     mask = lowpart_subreg (SImode, operands[4], QImode),
     tmp = gen_reg_rtx (SImode);
     emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
diff --git a/gcc/testsuite/gcc.target/i386/pr104978.c b/gcc/testsuite/gcc.target/i386/pr104978.c
new file mode 100644
index 00000000000..fd22a6c3f43
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr104978.c
@@ -0,0 +1,18 @@ 
+/* PR target/104978 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+/* { dg-final { scan-assembler-times "and\[^\\n\\r\]*\\\$1" 2 } } */
+
+#include<immintrin.h>
+
+__m128h
+foo (__m128h a, __m128h b, __m128h c, __mmask8 m)
+{ 
+  return _mm_mask_fmadd_round_sch (a, m, b, c, 8);
+}
+
+__m128h
+foo2 (__m128h a, __m128h b, __m128h c, __mmask8 m)
+{ 
+  return _mm_mask_fcmadd_round_sch (a, m, b, c, 8);
+}