diff mbox series

x86: Broadcast from integer to a pseudo vector register

Message ID 20210824010054.336214-1-hjl.tools@gmail.com
State New
Headers show
Series x86: Broadcast from integer to a pseudo vector register | expand

Commit Message

H.J. Lu Aug. 24, 2021, 1 a.m. UTC
Broadcast from integer to a pseudo vector register instead of a hard
vector register to allow LRA to remove redundant move instruction after
broadcast.

gcc/

	PR target/102021
	* config/i386/i386-expand.c (ix86_expand_vector_move): Broadcast
	from integer to a pseudo vector register.

gcc/testsuite/

	PR target/102021
	* gcc.target/i386/pr100865-10b.c: Expect vzeroupper.
	* gcc.target/i386/pr100865-4b.c: Likewise.
	* gcc.target/i386/pr100865-6b.c: Expect vmovdqu and vzeroupper.
	* gcc.target/i386/pr100865-7b.c: Likewise.
	* gcc.target/i386/pr102021.c: New test.
---
 gcc/config/i386/i386-expand.c                |  8 +-------
 gcc/testsuite/gcc.target/i386/pr100865-10b.c |  1 -
 gcc/testsuite/gcc.target/i386/pr100865-4b.c  |  3 +--
 gcc/testsuite/gcc.target/i386/pr100865-6b.c  |  6 ++----
 gcc/testsuite/gcc.target/i386/pr100865-7b.c  |  6 ++----
 gcc/testsuite/gcc.target/i386/pr102021.c     | 15 +++++++++++++++
 6 files changed, 21 insertions(+), 18 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102021.c

Comments

Hongtao Liu Aug. 24, 2021, 1:23 a.m. UTC | #1
On Tue, Aug 24, 2021 at 9:01 AM H.J. Lu via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Broadcast from integer to a pseudo vector register instead of a hard
> vector register to allow LRA to remove redundant move instruction after
> broadcast.
>
> gcc/
>
>         PR target/102021
>         * config/i386/i386-expand.c (ix86_expand_vector_move): Broadcast
>         from integer to a pseudo vector register.
>
> gcc/testsuite/
>
>         PR target/102021
>         * gcc.target/i386/pr100865-10b.c: Expect vzeroupper.
>         * gcc.target/i386/pr100865-4b.c: Likewise.
>         * gcc.target/i386/pr100865-6b.c: Expect vmovdqu and vzeroupper.
>         * gcc.target/i386/pr100865-7b.c: Likewise.
>         * gcc.target/i386/pr102021.c: New test.
> ---
>  gcc/config/i386/i386-expand.c                |  8 +-------
>  gcc/testsuite/gcc.target/i386/pr100865-10b.c |  1 -
>  gcc/testsuite/gcc.target/i386/pr100865-4b.c  |  3 +--
>  gcc/testsuite/gcc.target/i386/pr100865-6b.c  |  6 ++----
>  gcc/testsuite/gcc.target/i386/pr100865-7b.c  |  6 ++----
>  gcc/testsuite/gcc.target/i386/pr102021.c     | 15 +++++++++++++++
>  6 files changed, 21 insertions(+), 18 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr102021.c
>
> diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
> index 9bf13dbfa92..12db742e5f4 100644
> --- a/gcc/config/i386/i386-expand.c
> +++ b/gcc/config/i386/i386-expand.c
> @@ -579,13 +579,7 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
>         {
>           /* Broadcast to XMM/YMM/ZMM register from an integer
>              constant or scalar mem.  */
> -         /* Hard registers are used for 2 purposes:
> -            1. Prevent stack realignment when the original code
> -            doesn't use vector registers, which is the same for
> -            memcpy and memset.
> -            2. Prevent combine to convert constant broadcast to
> -            load from constant pool.  */
> -         op1 = ix86_gen_scratch_sse_rtx (mode);
> +         op1 = gen_reg_rtx (mode);
>           if (FLOAT_MODE_P (mode)
>               || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
>             {
    if (FLOAT_MODE_P (mode)
        || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
-     {
-       first = force_const_mem (GET_MODE_INNER (mode), first);
-       op1 = gen_reg_rtx (mode);
-     }
+     first = force_const_mem (GET_MODE_INNER (mode), first);
Could you also apply the upper, others LGTM.
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10b.c b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
> index 77ace86ffe8..e5616d8d258 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-10b.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
> @@ -5,4 +5,3 @@
>
>  /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
>  /* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 8 } } */
> -/* { dg-final { scan-assembler-not "vzeroupper" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> index 80e9fdb12ea..6d9cb91b8e9 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> @@ -5,7 +5,6 @@
>
>  /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
>  /* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 2 } } */
> -/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */
> -/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
>  /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" } } */
>  /* { dg-final { scan-assembler-not "vmovdqa" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-6b.c b/gcc/testsuite/gcc.target/i386/pr100865-6b.c
> index 35f2e961d25..9588249cb02 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-6b.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-6b.c
> @@ -4,9 +4,7 @@
>  #include "pr100865-6a.c"
>
>  /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
> -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 { target ia32 } } } */
> -/* { dg-final { scan-assembler-times "vmovdqu32\[\\t \]%ymm\[0-9\]+, " 8 { target { ! ia32 } } } } */
> -/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */
> -/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 } } */
> +/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
>  /* { dg-final { scan-assembler-not "vpbroadcastd\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" } } */
>  /* { dg-final { scan-assembler-not "vmovdqa" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-7b.c b/gcc/testsuite/gcc.target/i386/pr100865-7b.c
> index ad267c43891..3b20c680521 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-7b.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-7b.c
> @@ -5,8 +5,6 @@
>
>  /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %ymm\[0-9\]+" 1 { target { ! ia32 } } } } */
>  /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+\[^\n\]*, %ymm\[0-9\]+" 1 { target ia32 } } } */
> -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 16 { target ia32 } } } */
> -/* { dg-final { scan-assembler-times "vmovdqu64\[\\t \]%ymm\[0-9\]+, " 16 { target { ! ia32 } } } } */
> -/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */
> -/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 16 } } */
> +/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
>  /* { dg-final { scan-assembler-not "vmovdqa" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr102021.c b/gcc/testsuite/gcc.target/i386/pr102021.c
> new file mode 100644
> index 00000000000..6db3f57dc76
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr102021.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=skylake-avx512" } */
> +
> +#include<immintrin.h>
> +
> +__m256i
> +foo ()
> +{
> +  return _mm256_set1_epi16 (12);
> +}
> +
> +/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %ymm\[0-9\]+" 1 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+\[^\n\]*, %ymm\[0-9\]+" 1 { target ia32 } } } */
> +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> +/* { dg-final { scan-assembler-not "vzeroupper" } } */
> --
> 2.31.1
>
H.J. Lu Aug. 24, 2021, 1:42 a.m. UTC | #2
On Mon, Aug 23, 2021 at 6:17 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Tue, Aug 24, 2021 at 9:01 AM H.J. Lu via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > Broadcast from integer to a pseudo vector register instead of a hard
> > vector register to allow LRA to remove redundant move instruction after
> > broadcast.
> >
> > gcc/
> >
> >         PR target/102021
> >         * config/i386/i386-expand.c (ix86_expand_vector_move): Broadcast
> >         from integer to a pseudo vector register.
> >
> > gcc/testsuite/
> >
> >         PR target/102021
> >         * gcc.target/i386/pr100865-10b.c: Expect vzeroupper.
> >         * gcc.target/i386/pr100865-4b.c: Likewise.
> >         * gcc.target/i386/pr100865-6b.c: Expect vmovdqu and vzeroupper.
> >         * gcc.target/i386/pr100865-7b.c: Likewise.
> >         * gcc.target/i386/pr102021.c: New test.
> > ---
> >  gcc/config/i386/i386-expand.c                |  8 +-------
> >  gcc/testsuite/gcc.target/i386/pr100865-10b.c |  1 -
> >  gcc/testsuite/gcc.target/i386/pr100865-4b.c  |  3 +--
> >  gcc/testsuite/gcc.target/i386/pr100865-6b.c  |  6 ++----
> >  gcc/testsuite/gcc.target/i386/pr100865-7b.c  |  6 ++----
> >  gcc/testsuite/gcc.target/i386/pr102021.c     | 15 +++++++++++++++
> >  6 files changed, 21 insertions(+), 18 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr102021.c
> >
> > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
> > index 9bf13dbfa92..12db742e5f4 100644
> > --- a/gcc/config/i386/i386-expand.c
> > +++ b/gcc/config/i386/i386-expand.c
> > @@ -579,13 +579,7 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
> >         {
> >           /* Broadcast to XMM/YMM/ZMM register from an integer
> >              constant or scalar mem.  */
> > -         /* Hard registers are used for 2 purposes:
> > -            1. Prevent stack realignment when the original code
> > -            doesn't use vector registers, which is the same for
> > -            memcpy and memset.
> > -            2. Prevent combine to convert constant broadcast to
> > -            load from constant pool.  */
> > -         op1 = ix86_gen_scratch_sse_rtx (mode);
> > +         op1 = gen_reg_rtx (mode);
> >           if (FLOAT_MODE_P (mode)
> >               || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
> >             {
>     if (FLOAT_MODE_P (mode)
>         || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
> -     {
> -       first = force_const_mem (GET_MODE_INNER (mode), first);
> -       op1 = gen_reg_rtx (mode);
> -     }
> +     first = force_const_mem (GET_MODE_INNER (mode), first);
> Could you also apply the upper, others LGTM.

Like this?

Thanks.

> > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10b.c b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
> > index 77ace86ffe8..e5616d8d258 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr100865-10b.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
> > @@ -5,4 +5,3 @@
> >
> >  /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
> >  /* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 8 } } */
> > -/* { dg-final { scan-assembler-not "vzeroupper" } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> > index 80e9fdb12ea..6d9cb91b8e9 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> > @@ -5,7 +5,6 @@
> >
> >  /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
> >  /* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 2 } } */
> > -/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */
> > -/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */
> > +/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
> >  /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" } } */
> >  /* { dg-final { scan-assembler-not "vmovdqa" } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-6b.c b/gcc/testsuite/gcc.target/i386/pr100865-6b.c
> > index 35f2e961d25..9588249cb02 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr100865-6b.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr100865-6b.c
> > @@ -4,9 +4,7 @@
> >  #include "pr100865-6a.c"
> >
> >  /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
> > -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 { target ia32 } } } */
> > -/* { dg-final { scan-assembler-times "vmovdqu32\[\\t \]%ymm\[0-9\]+, " 8 { target { ! ia32 } } } } */
> > -/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */
> > -/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */
> > +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 } } */
> > +/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
> >  /* { dg-final { scan-assembler-not "vpbroadcastd\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" } } */
> >  /* { dg-final { scan-assembler-not "vmovdqa" } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-7b.c b/gcc/testsuite/gcc.target/i386/pr100865-7b.c
> > index ad267c43891..3b20c680521 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr100865-7b.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr100865-7b.c
> > @@ -5,8 +5,6 @@
> >
> >  /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %ymm\[0-9\]+" 1 { target { ! ia32 } } } } */
> >  /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+\[^\n\]*, %ymm\[0-9\]+" 1 { target ia32 } } } */
> > -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 16 { target ia32 } } } */
> > -/* { dg-final { scan-assembler-times "vmovdqu64\[\\t \]%ymm\[0-9\]+, " 16 { target { ! ia32 } } } } */
> > -/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */
> > -/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */
> > +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 16 } } */
> > +/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
> >  /* { dg-final { scan-assembler-not "vmovdqa" } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/pr102021.c b/gcc/testsuite/gcc.target/i386/pr102021.c
> > new file mode 100644
> > index 00000000000..6db3f57dc76
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr102021.c
> > @@ -0,0 +1,15 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=skylake-avx512" } */
> > +
> > +#include<immintrin.h>
> > +
> > +__m256i
> > +foo ()
> > +{
> > +  return _mm256_set1_epi16 (12);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %ymm\[0-9\]+" 1 { target { ! ia32 } } } } */
> > +/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+\[^\n\]*, %ymm\[0-9\]+" 1 { target ia32 } } } */
> > +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> > +/* { dg-final { scan-assembler-not "vzeroupper" } } */
> > --
> > 2.31.1
> >
>
>
> --
> BR,
> Hongtao
Hongtao Liu Aug. 24, 2021, 1:59 a.m. UTC | #3
On Tue, Aug 24, 2021 at 9:43 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Aug 23, 2021 at 6:17 PM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Tue, Aug 24, 2021 at 9:01 AM H.J. Lu via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> > >
> > > Broadcast from integer to a pseudo vector register instead of a hard
> > > vector register to allow LRA to remove redundant move instruction after
> > > broadcast.
> > >
> > > gcc/
> > >
> > >         PR target/102021
> > >         * config/i386/i386-expand.c (ix86_expand_vector_move): Broadcast
> > >         from integer to a pseudo vector register.
> > >
> > > gcc/testsuite/
> > >
> > >         PR target/102021
> > >         * gcc.target/i386/pr100865-10b.c: Expect vzeroupper.
> > >         * gcc.target/i386/pr100865-4b.c: Likewise.
> > >         * gcc.target/i386/pr100865-6b.c: Expect vmovdqu and vzeroupper.
> > >         * gcc.target/i386/pr100865-7b.c: Likewise.
> > >         * gcc.target/i386/pr102021.c: New test.
> > > ---
> > >  gcc/config/i386/i386-expand.c                |  8 +-------
> > >  gcc/testsuite/gcc.target/i386/pr100865-10b.c |  1 -
> > >  gcc/testsuite/gcc.target/i386/pr100865-4b.c  |  3 +--
> > >  gcc/testsuite/gcc.target/i386/pr100865-6b.c  |  6 ++----
> > >  gcc/testsuite/gcc.target/i386/pr100865-7b.c  |  6 ++----
> > >  gcc/testsuite/gcc.target/i386/pr102021.c     | 15 +++++++++++++++
> > >  6 files changed, 21 insertions(+), 18 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr102021.c
> > >
> > > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
> > > index 9bf13dbfa92..12db742e5f4 100644
> > > --- a/gcc/config/i386/i386-expand.c
> > > +++ b/gcc/config/i386/i386-expand.c
> > > @@ -579,13 +579,7 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
> > >         {
> > >           /* Broadcast to XMM/YMM/ZMM register from an integer
> > >              constant or scalar mem.  */
> > > -         /* Hard registers are used for 2 purposes:
> > > -            1. Prevent stack realignment when the original code
> > > -            doesn't use vector registers, which is the same for
> > > -            memcpy and memset.
> > > -            2. Prevent combine to convert constant broadcast to
> > > -            load from constant pool.  */
> > > -         op1 = ix86_gen_scratch_sse_rtx (mode);
> > > +         op1 = gen_reg_rtx (mode);
> > >           if (FLOAT_MODE_P (mode)
> > >               || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
> > >             {
> >     if (FLOAT_MODE_P (mode)
> >         || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
> > -     {
> > -       first = force_const_mem (GET_MODE_INNER (mode), first);
> > -       op1 = gen_reg_rtx (mode);
> > -     }
> > +     first = force_const_mem (GET_MODE_INNER (mode), first);
> > Could you also apply the upper, others LGTM.
>
> Like this?
Yes.
>
> Thanks.
>
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10b.c b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
> > > index 77ace86ffe8..e5616d8d258 100644
> > > --- a/gcc/testsuite/gcc.target/i386/pr100865-10b.c
> > > +++ b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
> > > @@ -5,4 +5,3 @@
> > >
> > >  /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 8 } } */
> > > -/* { dg-final { scan-assembler-not "vzeroupper" } } */
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> > > index 80e9fdb12ea..6d9cb91b8e9 100644
> > > --- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> > > +++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> > > @@ -5,7 +5,6 @@
> > >
> > >  /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 2 } } */
> > > -/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */
> > > -/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */
> > > +/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
> > >  /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" } } */
> > >  /* { dg-final { scan-assembler-not "vmovdqa" } } */
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-6b.c b/gcc/testsuite/gcc.target/i386/pr100865-6b.c
> > > index 35f2e961d25..9588249cb02 100644
> > > --- a/gcc/testsuite/gcc.target/i386/pr100865-6b.c
> > > +++ b/gcc/testsuite/gcc.target/i386/pr100865-6b.c
> > > @@ -4,9 +4,7 @@
> > >  #include "pr100865-6a.c"
> > >
> > >  /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
> > > -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 { target ia32 } } } */
> > > -/* { dg-final { scan-assembler-times "vmovdqu32\[\\t \]%ymm\[0-9\]+, " 8 { target { ! ia32 } } } } */
> > > -/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */
> > > -/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */
> > > +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 } } */
> > > +/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
> > >  /* { dg-final { scan-assembler-not "vpbroadcastd\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" } } */
> > >  /* { dg-final { scan-assembler-not "vmovdqa" } } */
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-7b.c b/gcc/testsuite/gcc.target/i386/pr100865-7b.c
> > > index ad267c43891..3b20c680521 100644
> > > --- a/gcc/testsuite/gcc.target/i386/pr100865-7b.c
> > > +++ b/gcc/testsuite/gcc.target/i386/pr100865-7b.c
> > > @@ -5,8 +5,6 @@
> > >
> > >  /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %ymm\[0-9\]+" 1 { target { ! ia32 } } } } */
> > >  /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+\[^\n\]*, %ymm\[0-9\]+" 1 { target ia32 } } } */
> > > -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 16 { target ia32 } } } */
> > > -/* { dg-final { scan-assembler-times "vmovdqu64\[\\t \]%ymm\[0-9\]+, " 16 { target { ! ia32 } } } } */
> > > -/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */
> > > -/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */
> > > +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 16 } } */
> > > +/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
> > >  /* { dg-final { scan-assembler-not "vmovdqa" } } */
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr102021.c b/gcc/testsuite/gcc.target/i386/pr102021.c
> > > new file mode 100644
> > > index 00000000000..6db3f57dc76
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr102021.c
> > > @@ -0,0 +1,15 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O3 -march=skylake-avx512" } */
> > > +
> > > +#include<immintrin.h>
> > > +
> > > +__m256i
> > > +foo ()
> > > +{
> > > +  return _mm256_set1_epi16 (12);
> > > +}
> > > +
> > > +/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %ymm\[0-9\]+" 1 { target { ! ia32 } } } } */
> > > +/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+\[^\n\]*, %ymm\[0-9\]+" 1 { target ia32 } } } */
> > > +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> > > +/* { dg-final { scan-assembler-not "vzeroupper" } } */
> > > --
> > > 2.31.1
> > >
> >
> >
> > --
> > BR,
> > Hongtao
>
>
>
> --
> H.J.
diff mbox series

Patch

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 9bf13dbfa92..12db742e5f4 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -579,13 +579,7 @@  ix86_expand_vector_move (machine_mode mode, rtx operands[])
 	{
 	  /* Broadcast to XMM/YMM/ZMM register from an integer
 	     constant or scalar mem.  */
-	  /* Hard registers are used for 2 purposes:
-	     1. Prevent stack realignment when the original code
-	     doesn't use vector registers, which is the same for
-	     memcpy and memset.
-	     2. Prevent combine to convert constant broadcast to
-	     load from constant pool.  */
-	  op1 = ix86_gen_scratch_sse_rtx (mode);
+	  op1 = gen_reg_rtx (mode);
 	  if (FLOAT_MODE_P (mode)
 	      || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
 	    {
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10b.c b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
index 77ace86ffe8..e5616d8d258 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-10b.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
@@ -5,4 +5,3 @@ 
 
 /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 8 } } */
-/* { dg-final { scan-assembler-not "vzeroupper" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
index 80e9fdb12ea..6d9cb91b8e9 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
@@ -5,7 +5,6 @@ 
 
 /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 2 } } */
-/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */
-/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
 /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" } } */
 /* { dg-final { scan-assembler-not "vmovdqa" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-6b.c b/gcc/testsuite/gcc.target/i386/pr100865-6b.c
index 35f2e961d25..9588249cb02 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-6b.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-6b.c
@@ -4,9 +4,7 @@ 
 #include "pr100865-6a.c"
 
 /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 { target ia32 } } } */
-/* { dg-final { scan-assembler-times "vmovdqu32\[\\t \]%ymm\[0-9\]+, " 8 { target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */
-/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 } } */
+/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
 /* { dg-final { scan-assembler-not "vpbroadcastd\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" } } */
 /* { dg-final { scan-assembler-not "vmovdqa" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-7b.c b/gcc/testsuite/gcc.target/i386/pr100865-7b.c
index ad267c43891..3b20c680521 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-7b.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-7b.c
@@ -5,8 +5,6 @@ 
 
 /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %ymm\[0-9\]+" 1 { target { ! ia32 } } } } */
 /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+\[^\n\]*, %ymm\[0-9\]+" 1 { target ia32 } } } */
-/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 16 { target ia32 } } } */
-/* { dg-final { scan-assembler-times "vmovdqu64\[\\t \]%ymm\[0-9\]+, " 16 { target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */
-/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 16 } } */
+/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
 /* { dg-final { scan-assembler-not "vmovdqa" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102021.c b/gcc/testsuite/gcc.target/i386/pr102021.c
new file mode 100644
index 00000000000..6db3f57dc76
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102021.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=skylake-avx512" } */
+
+#include<immintrin.h>
+
+__m256i
+foo ()
+{
+  return _mm256_set1_epi16 (12);
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %ymm\[0-9\]+" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+\[^\n\]*, %ymm\[0-9\]+" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovdqa" } } */
+/* { dg-final { scan-assembler-not "vzeroupper" } } */