diff mbox series

i386: Disable TARGET_SSE_TYPELESS_STORES for TARGET_AVX

Message ID 20200127182324.3856-1-hjl.tools@gmail.com
State New
Headers show
Series i386: Disable TARGET_SSE_TYPELESS_STORES for TARGET_AVX | expand

Commit Message

H.J. Lu Jan. 27, 2020, 6:23 p.m. UTC
movaps/movups is one byte shorter than movdaq/movdqu.  But it isn't the
case for AVX nor AVX512.  We should disable TARGET_SSE_TYPELESS_STORES
for TARGET_AVX.

gcc/

	PR target/91461
	* config/i386/i386.h (TARGET_SSE_TYPELESS_STORES): Disable for
	TARGET_AVX.
	* config/i386/i386.md (*movoi_internal_avx): Remove
	TARGET_SSE_TYPELESS_STORES check.

gcc/testsuite/

	PR target/91461
	* gcc.target/i386/pr91461-1.c: New test.
	* gcc.target/i386/pr91461-2.c: Likewise.
	* gcc.target/i386/pr91461-3.c: Likewise.
	* gcc.target/i386/pr91461-4.c: Likewise.
	* gcc.target/i386/pr91461-5.c: Likewise.
---
 gcc/config/i386/i386.h                    |  4 +-
 gcc/config/i386/i386.md                   |  4 +-
 gcc/testsuite/gcc.target/i386/pr91461-1.c | 66 ++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr91461-2.c | 19 ++++++
 gcc/testsuite/gcc.target/i386/pr91461-3.c | 76 +++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr91461-4.c | 21 +++++++
 gcc/testsuite/gcc.target/i386/pr91461-5.c | 17 +++++
 7 files changed, 203 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-5.c

Comments

Uros Bizjak Jan. 27, 2020, 8:26 p.m. UTC | #1
On Mon, Jan 27, 2020 at 7:23 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> movaps/movups is one byte shorter than movdaq/movdqu.  But it isn't the
> case for AVX nor AVX512.  We should disable TARGET_SSE_TYPELESS_STORES
> for TARGET_AVX.
>
> gcc/
>
>         PR target/91461
>         * config/i386/i386.h (TARGET_SSE_TYPELESS_STORES): Disable for
>         TARGET_AVX.
>         * config/i386/i386.md (*movoi_internal_avx): Remove
>         TARGET_SSE_TYPELESS_STORES check.
>
> gcc/testsuite/
>
>         PR target/91461
>         * gcc.target/i386/pr91461-1.c: New test.
>         * gcc.target/i386/pr91461-2.c: Likewise.
>         * gcc.target/i386/pr91461-3.c: Likewise.
>         * gcc.target/i386/pr91461-4.c: Likewise.
>         * gcc.target/i386/pr91461-5.c: Likewise.
> ---
>  gcc/config/i386/i386.h                    |  4 +-
>  gcc/config/i386/i386.md                   |  4 +-
>  gcc/testsuite/gcc.target/i386/pr91461-1.c | 66 ++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr91461-2.c | 19 ++++++
>  gcc/testsuite/gcc.target/i386/pr91461-3.c | 76 +++++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr91461-4.c | 21 +++++++
>  gcc/testsuite/gcc.target/i386/pr91461-5.c | 17 +++++
>  7 files changed, 203 insertions(+), 4 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-4.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-5.c
>
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index 943e9a5c783..c134b04c5c4 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -516,8 +516,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
>  #define TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL \
>         ix86_tune_features[X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL]
>  #define TARGET_SSE_SPLIT_REGS  ix86_tune_features[X86_TUNE_SSE_SPLIT_REGS]
> +/* NB: movaps/movups is one byte shorter than movdaq/movdqu.  But it
> +   isn't the case for AVX nor AVX512.  */
>  #define TARGET_SSE_TYPELESS_STORES \
> -       ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES]
> +       (!TARGET_AVX && ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES])

This is wrong place to disable the feature.

Uros.

>  #define TARGET_SSE_LOAD0_BY_PXOR ix86_tune_features[X86_TUNE_SSE_LOAD0_BY_PXOR]
>  #define TARGET_MEMORY_MISMATCH_STALL \
>         ix86_tune_features[X86_TUNE_MEMORY_MISMATCH_STALL]
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index 6e9c9bd2fb6..bb096133880 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -1980,9 +1980,7 @@
>                (and (eq_attr "alternative" "1")
>                     (match_test "TARGET_AVX512VL"))
>                  (const_string "XI")
> -              (ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
> -                   (and (eq_attr "alternative" "3")
> -                        (match_test "TARGET_SSE_TYPELESS_STORES")))
> +              (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
>                  (const_string "V8SF")
>               ]
>               (const_string "OI")))])
> diff --git a/gcc/testsuite/gcc.target/i386/pr91461-1.c b/gcc/testsuite/gcc.target/i386/pr91461-1.c
> new file mode 100644
> index 00000000000..0c94b8e2b76
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr91461-1.c
> @@ -0,0 +1,66 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler "\tvmovdqa\t" } } */
> +/* { dg-final { scan-assembler "\tvmovdqu\t" } } */
> +/* { dg-final { scan-assembler "\tvmovapd\t" } } */
> +/* { dg-final { scan-assembler "\tvmovupd\t" } } */
> +/* { dg-final { scan-assembler-not "\tvmovaps\t" } } */
> +/* { dg-final { scan-assembler-not "\tvmovups\t" } } */
> +
> +#include <immintrin.h>
> +
> +void
> +foo1 (__m128i *p, __m128i x)
> +{
> +  *p = x;
> +}
> +
> +void
> +foo2 (__m128d *p, __m128d x)
> +{
> +  *p = x;
> +}
> +
> +void
> +foo3 (__float128 *p, __float128 x)
> +{
> +  *p = x;
> +}
> +
> +void
> +foo4 (__m128i_u *p, __m128i x)
> +{
> +  *p = x;
> +}
> +
> +void
> +foo5 (__m128d_u *p, __m128d x)
> +{
> +  *p = x;
> +}
> +
> +typedef __float128 __float128_u __attribute__ ((__aligned__ (1)));
> +
> +void
> +foo6 (__float128_u *p, __float128 x)
> +{
> +  *p = x;
> +}
> +
> +#ifdef __x86_64__
> +typedef __int128 __int128_u __attribute__ ((__aligned__ (1)));
> +
> +extern __int128 int128;
> +
> +void
> +foo7 (__int128 *p)
> +{
> +  *p = int128;
> +}
> +
> +void
> +foo8 (__int128_u *p)
> +{
> +  *p = int128;
> +}
> +#endif
> diff --git a/gcc/testsuite/gcc.target/i386/pr91461-2.c b/gcc/testsuite/gcc.target/i386/pr91461-2.c
> new file mode 100644
> index 00000000000..921cfaf9780
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr91461-2.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler "\tvmovdqa\t" } } */
> +/* { dg-final { scan-assembler "\tvmovapd\t" } } */
> +/* { dg-final { scan-assembler-not "\tvmovaps\t" } } */
> +
> +#include <immintrin.h>
> +
> +void
> +foo1 (__m256i *p, __m256i x)
> +{
> +  *p = x;
> +}
> +
> +void
> +foo2 (__m256d *p, __m256d x)
> +{
> +  *p = x;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr91461-3.c b/gcc/testsuite/gcc.target/i386/pr91461-3.c
> new file mode 100644
> index 00000000000..c67a48063bf
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr91461-3.c
> @@ -0,0 +1,76 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mavx512f -mavx512vl" } */
> +/* { dg-final { scan-assembler-not "\tvmovaps\t" } } */
> +/* { dg-final { scan-assembler-not "\tvmovups\t" } } */
> +
> +#include <immintrin.h>
> +
> +void
> +foo1 (__m128i *p, __m128i a)
> +{
> +  register __m128i x __asm ("xmm16") = a;
> +  asm volatile ("" : "+v" (x));
> +  *p = x;
> +}
> +
> +void
> +foo2 (__m128d *p, __m128d a)
> +{
> +  register __m128d x __asm ("xmm16") = a;
> +  asm volatile ("" : "+v" (x));
> +  *p = x;
> +}
> +
> +void
> +foo3 (__float128 *p, __float128 a)
> +{
> +  register __float128 x __asm ("xmm16") = a;
> +  asm volatile ("" : "+v" (x));
> +  *p = x;
> +}
> +
> +void
> +foo4 (__m128i_u *p, __m128i a)
> +{
> +  register __m128i x __asm ("xmm16") = a;
> +  asm volatile ("" : "+v" (x));
> +  *p = x;
> +}
> +
> +void
> +foo5 (__m128d_u *p, __m128d a)
> +{
> +  register __m128d x __asm ("xmm16") = a;
> +  asm volatile ("" : "+v" (x));
> +  *p = x;
> +}
> +
> +typedef __float128 __float128_u __attribute__ ((__aligned__ (1)));
> +
> +void
> +foo6 (__float128_u *p, __float128 a)
> +{
> +  register __float128 x __asm ("xmm16") = a;
> +  asm volatile ("" : "+v" (x));
> +  *p = x;
> +}
> +
> +typedef __int128 __int128_u __attribute__ ((__aligned__ (1)));
> +
> +extern __int128 int128;
> +
> +void
> +foo7 (__int128 *p)
> +{
> +  register __int128 x __asm ("xmm16") = int128;
> +  asm volatile ("" : "+v" (x));
> +  *p = x;
> +}
> +
> +void
> +foo8 (__int128_u *p)
> +{
> +  register __int128 x __asm ("xmm16") = int128;
> +  asm volatile ("" : "+v" (x));
> +  *p = x;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr91461-4.c b/gcc/testsuite/gcc.target/i386/pr91461-4.c
> new file mode 100644
> index 00000000000..69df590de3a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr91461-4.c
> @@ -0,0 +1,21 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mavx512f -mavx512vl" } */
> +/* { dg-final { scan-assembler-not "\tvmovaps\t" } } */
> +
> +#include <immintrin.h>
> +
> +void
> +foo1 (__m256i *p, __m256i a)
> +{
> +  register __m256i x __asm ("xmm16") = a;
> +  asm volatile ("" : "+v" (x));
> +  *p = x;
> +}
> +
> +void
> +foo2 (__m256d *p, __m256d a)
> +{
> +  register __m256d x __asm ("xmm16") = a;
> +  asm volatile ("" : "+v" (x));
> +  *p = x;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr91461-5.c b/gcc/testsuite/gcc.target/i386/pr91461-5.c
> new file mode 100644
> index 00000000000..974263042f3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr91461-5.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx512f" } */
> +/* { dg-final { scan-assembler-not "\tvmovaps\t" } } */
> +
> +#include <immintrin.h>
> +
> +void
> +foo1 (__m512i *p, __m512i x)
> +{
> +  *p = x;
> +}
> +
> +void
> +foo2 (__m512d *p, __m512d x)
> +{
> +  *p = x;
> +}
> --
> 2.24.1
>
H.J. Lu Jan. 27, 2020, 10:17 p.m. UTC | #2
On Mon, Jan 27, 2020 at 12:26 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Mon, Jan 27, 2020 at 7:23 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > movaps/movups is one byte shorter than movdaq/movdqu.  But it isn't the
> > case for AVX nor AVX512.  We should disable TARGET_SSE_TYPELESS_STORES
> > for TARGET_AVX.
> >
> > gcc/
> >
> >         PR target/91461
> >         * config/i386/i386.h (TARGET_SSE_TYPELESS_STORES): Disable for
> >         TARGET_AVX.
> >         * config/i386/i386.md (*movoi_internal_avx): Remove
> >         TARGET_SSE_TYPELESS_STORES check.
> >
> > gcc/testsuite/
> >
> >         PR target/91461
> >         * gcc.target/i386/pr91461-1.c: New test.
> >         * gcc.target/i386/pr91461-2.c: Likewise.
> >         * gcc.target/i386/pr91461-3.c: Likewise.
> >         * gcc.target/i386/pr91461-4.c: Likewise.
> >         * gcc.target/i386/pr91461-5.c: Likewise.
> > ---
> >  gcc/config/i386/i386.h                    |  4 +-
> >  gcc/config/i386/i386.md                   |  4 +-
> >  gcc/testsuite/gcc.target/i386/pr91461-1.c | 66 ++++++++++++++++++++
> >  gcc/testsuite/gcc.target/i386/pr91461-2.c | 19 ++++++
> >  gcc/testsuite/gcc.target/i386/pr91461-3.c | 76 +++++++++++++++++++++++
> >  gcc/testsuite/gcc.target/i386/pr91461-4.c | 21 +++++++
> >  gcc/testsuite/gcc.target/i386/pr91461-5.c | 17 +++++
> >  7 files changed, 203 insertions(+), 4 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-2.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-3.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-4.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-5.c
> >
> > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > index 943e9a5c783..c134b04c5c4 100644
> > --- a/gcc/config/i386/i386.h
> > +++ b/gcc/config/i386/i386.h
> > @@ -516,8 +516,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
> >  #define TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL \
> >         ix86_tune_features[X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL]
> >  #define TARGET_SSE_SPLIT_REGS  ix86_tune_features[X86_TUNE_SSE_SPLIT_REGS]
> > +/* NB: movaps/movups is one byte shorter than movdaq/movdqu.  But it
> > +   isn't the case for AVX nor AVX512.  */
> >  #define TARGET_SSE_TYPELESS_STORES \
> > -       ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES]
> > +       (!TARGET_AVX && ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES])
>
> This is wrong place to disable the feature.

Like this?

diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
index 2acc9fb0cfe..639969d736d 100644
--- a/gcc/config/i386/i386-options.c
+++ b/gcc/config/i386/i386-options.c
@@ -1597,6 +1597,11 @@ set_ix86_tune_features (enum processor_type
ix86_tune, bool dump)
     = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
     }

+  /* NB: movaps/movups is one byte shorter than movdaq/movdqu.  But it
+     isn't the case for AVX nor AVX512.  */
+  if (TARGET_AVX)
+    ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES] = 0;
+
   if (dump)
     {
       fprintf (stderr, "List of x86 specific tuning parameter names:\n");
H.J. Lu Jan. 28, 2020, 12:53 a.m. UTC | #3
On Mon, Jan 27, 2020 at 2:17 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Jan 27, 2020 at 12:26 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Mon, Jan 27, 2020 at 7:23 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > movaps/movups is one byte shorter than movdaq/movdqu.  But it isn't the
> > > case for AVX nor AVX512.  We should disable TARGET_SSE_TYPELESS_STORES
> > > for TARGET_AVX.
> > >
> > > gcc/
> > >
> > >         PR target/91461
> > >         * config/i386/i386.h (TARGET_SSE_TYPELESS_STORES): Disable for
> > >         TARGET_AVX.
> > >         * config/i386/i386.md (*movoi_internal_avx): Remove
> > >         TARGET_SSE_TYPELESS_STORES check.
> > >
> > > gcc/testsuite/
> > >
> > >         PR target/91461
> > >         * gcc.target/i386/pr91461-1.c: New test.
> > >         * gcc.target/i386/pr91461-2.c: Likewise.
> > >         * gcc.target/i386/pr91461-3.c: Likewise.
> > >         * gcc.target/i386/pr91461-4.c: Likewise.
> > >         * gcc.target/i386/pr91461-5.c: Likewise.
> > > ---
> > >  gcc/config/i386/i386.h                    |  4 +-
> > >  gcc/config/i386/i386.md                   |  4 +-
> > >  gcc/testsuite/gcc.target/i386/pr91461-1.c | 66 ++++++++++++++++++++
> > >  gcc/testsuite/gcc.target/i386/pr91461-2.c | 19 ++++++
> > >  gcc/testsuite/gcc.target/i386/pr91461-3.c | 76 +++++++++++++++++++++++
> > >  gcc/testsuite/gcc.target/i386/pr91461-4.c | 21 +++++++
> > >  gcc/testsuite/gcc.target/i386/pr91461-5.c | 17 +++++
> > >  7 files changed, 203 insertions(+), 4 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-1.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-2.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-3.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-4.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-5.c
> > >
> > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > > index 943e9a5c783..c134b04c5c4 100644
> > > --- a/gcc/config/i386/i386.h
> > > +++ b/gcc/config/i386/i386.h
> > > @@ -516,8 +516,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
> > >  #define TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL \
> > >         ix86_tune_features[X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL]
> > >  #define TARGET_SSE_SPLIT_REGS  ix86_tune_features[X86_TUNE_SSE_SPLIT_REGS]
> > > +/* NB: movaps/movups is one byte shorter than movdaq/movdqu.  But it
> > > +   isn't the case for AVX nor AVX512.  */
> > >  #define TARGET_SSE_TYPELESS_STORES \
> > > -       ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES]
> > > +       (!TARGET_AVX && ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES])
> >
> > This is wrong place to disable the feature.
>

Here is the updated patch on top of

https://gcc.gnu.org/ml/gcc-patches/2020-01/msg01742.html

so that set_ix86_tune_features can access per function setting.

OK for master branch?

Thanks.
Uros Bizjak Jan. 28, 2020, 7:04 a.m. UTC | #4
On Mon, Jan 27, 2020 at 11:17 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Jan 27, 2020 at 12:26 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Mon, Jan 27, 2020 at 7:23 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > movaps/movups is one byte shorter than movdaq/movdqu.  But it isn't the
> > > case for AVX nor AVX512.  We should disable TARGET_SSE_TYPELESS_STORES
> > > for TARGET_AVX.
> > >
> > > gcc/
> > >
> > >         PR target/91461
> > >         * config/i386/i386.h (TARGET_SSE_TYPELESS_STORES): Disable for
> > >         TARGET_AVX.
> > >         * config/i386/i386.md (*movoi_internal_avx): Remove
> > >         TARGET_SSE_TYPELESS_STORES check.
> > >
> > > gcc/testsuite/
> > >
> > >         PR target/91461
> > >         * gcc.target/i386/pr91461-1.c: New test.
> > >         * gcc.target/i386/pr91461-2.c: Likewise.
> > >         * gcc.target/i386/pr91461-3.c: Likewise.
> > >         * gcc.target/i386/pr91461-4.c: Likewise.
> > >         * gcc.target/i386/pr91461-5.c: Likewise.
> > > ---
> > >  gcc/config/i386/i386.h                    |  4 +-
> > >  gcc/config/i386/i386.md                   |  4 +-
> > >  gcc/testsuite/gcc.target/i386/pr91461-1.c | 66 ++++++++++++++++++++
> > >  gcc/testsuite/gcc.target/i386/pr91461-2.c | 19 ++++++
> > >  gcc/testsuite/gcc.target/i386/pr91461-3.c | 76 +++++++++++++++++++++++
> > >  gcc/testsuite/gcc.target/i386/pr91461-4.c | 21 +++++++
> > >  gcc/testsuite/gcc.target/i386/pr91461-5.c | 17 +++++
> > >  7 files changed, 203 insertions(+), 4 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-1.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-2.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-3.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-4.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-5.c
> > >
> > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > > index 943e9a5c783..c134b04c5c4 100644
> > > --- a/gcc/config/i386/i386.h
> > > +++ b/gcc/config/i386/i386.h
> > > @@ -516,8 +516,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
> > >  #define TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL \
> > >         ix86_tune_features[X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL]
> > >  #define TARGET_SSE_SPLIT_REGS  ix86_tune_features[X86_TUNE_SSE_SPLIT_REGS]
> > > +/* NB: movaps/movups is one byte shorter than movdaq/movdqu.  But it
> > > +   isn't the case for AVX nor AVX512.  */
> > >  #define TARGET_SSE_TYPELESS_STORES \
> > > -       ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES]
> > > +       (!TARGET_AVX && ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES])
> >
> > This is wrong place to disable the feature.
>
> Like this?

No.

There is a mode attribute in i386.md/sse.md for relevant patterns.
Please adapt calculation of mode attributes instead.

Uros.

> diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
> index 2acc9fb0cfe..639969d736d 100644
> --- a/gcc/config/i386/i386-options.c
> +++ b/gcc/config/i386/i386-options.c
> @@ -1597,6 +1597,11 @@ set_ix86_tune_features (enum processor_type
> ix86_tune, bool dump)
>      = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
>      }
>
> +  /* NB: movaps/movups is one byte shorter than movdaq/movdqu.  But it
> +     isn't the case for AVX nor AVX512.  */
> +  if (TARGET_AVX)
> +    ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES] = 0;
> +
>    if (dump)
>      {
>        fprintf (stderr, "List of x86 specific tuning parameter names:\n");
>
>
> --
> H.J.
H.J. Lu Jan. 28, 2020, 2:32 p.m. UTC | #5
On Mon, Jan 27, 2020 at 11:04 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Mon, Jan 27, 2020 at 11:17 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, Jan 27, 2020 at 12:26 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Mon, Jan 27, 2020 at 7:23 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > movaps/movups is one byte shorter than movdaq/movdqu.  But it isn't the
> > > > case for AVX nor AVX512.  We should disable TARGET_SSE_TYPELESS_STORES
> > > > for TARGET_AVX.
> > > >
> > > > gcc/
> > > >
> > > >         PR target/91461
> > > >         * config/i386/i386.h (TARGET_SSE_TYPELESS_STORES): Disable for
> > > >         TARGET_AVX.
> > > >         * config/i386/i386.md (*movoi_internal_avx): Remove
> > > >         TARGET_SSE_TYPELESS_STORES check.
> > > >
> > > > gcc/testsuite/
> > > >
> > > >         PR target/91461
> > > >         * gcc.target/i386/pr91461-1.c: New test.
> > > >         * gcc.target/i386/pr91461-2.c: Likewise.
> > > >         * gcc.target/i386/pr91461-3.c: Likewise.
> > > >         * gcc.target/i386/pr91461-4.c: Likewise.
> > > >         * gcc.target/i386/pr91461-5.c: Likewise.
> > > > ---
> > > >  gcc/config/i386/i386.h                    |  4 +-
> > > >  gcc/config/i386/i386.md                   |  4 +-
> > > >  gcc/testsuite/gcc.target/i386/pr91461-1.c | 66 ++++++++++++++++++++
> > > >  gcc/testsuite/gcc.target/i386/pr91461-2.c | 19 ++++++
> > > >  gcc/testsuite/gcc.target/i386/pr91461-3.c | 76 +++++++++++++++++++++++
> > > >  gcc/testsuite/gcc.target/i386/pr91461-4.c | 21 +++++++
> > > >  gcc/testsuite/gcc.target/i386/pr91461-5.c | 17 +++++
> > > >  7 files changed, 203 insertions(+), 4 deletions(-)
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-1.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-2.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-3.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-4.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-5.c
> > > >
> > > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > > > index 943e9a5c783..c134b04c5c4 100644
> > > > --- a/gcc/config/i386/i386.h
> > > > +++ b/gcc/config/i386/i386.h
> > > > @@ -516,8 +516,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
> > > >  #define TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL \
> > > >         ix86_tune_features[X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL]
> > > >  #define TARGET_SSE_SPLIT_REGS  ix86_tune_features[X86_TUNE_SSE_SPLIT_REGS]
> > > > +/* NB: movaps/movups is one byte shorter than movdaq/movdqu.  But it
> > > > +   isn't the case for AVX nor AVX512.  */
> > > >  #define TARGET_SSE_TYPELESS_STORES \
> > > > -       ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES]
> > > > +       (!TARGET_AVX && ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES])
> > >
> > > This is wrong place to disable the feature.
> >
> > Like this?
>
> No.
>
> There is a mode attribute in i386.md/sse.md for relevant patterns.
> Please adapt calculation of mode attributes instead.
>

Like this?
Uros Bizjak Jan. 28, 2020, 2:45 p.m. UTC | #6
On Tue, Jan 28, 2020 at 3:32 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Jan 27, 2020 at 11:04 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Mon, Jan 27, 2020 at 11:17 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Mon, Jan 27, 2020 at 12:26 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >
> > > > On Mon, Jan 27, 2020 at 7:23 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > movaps/movups is one byte shorter than movdaq/movdqu.  But it isn't the
> > > > > case for AVX nor AVX512.  We should disable TARGET_SSE_TYPELESS_STORES
> > > > > for TARGET_AVX.
> > > > >
> > > > > gcc/
> > > > >
> > > > >         PR target/91461
> > > > >         * config/i386/i386.h (TARGET_SSE_TYPELESS_STORES): Disable for
> > > > >         TARGET_AVX.
> > > > >         * config/i386/i386.md (*movoi_internal_avx): Remove
> > > > >         TARGET_SSE_TYPELESS_STORES check.
> > > > >
> > > > > gcc/testsuite/
> > > > >
> > > > >         PR target/91461
> > > > >         * gcc.target/i386/pr91461-1.c: New test.
> > > > >         * gcc.target/i386/pr91461-2.c: Likewise.
> > > > >         * gcc.target/i386/pr91461-3.c: Likewise.
> > > > >         * gcc.target/i386/pr91461-4.c: Likewise.
> > > > >         * gcc.target/i386/pr91461-5.c: Likewise.
> > > > > ---
> > > > >  gcc/config/i386/i386.h                    |  4 +-
> > > > >  gcc/config/i386/i386.md                   |  4 +-
> > > > >  gcc/testsuite/gcc.target/i386/pr91461-1.c | 66 ++++++++++++++++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr91461-2.c | 19 ++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr91461-3.c | 76 +++++++++++++++++++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr91461-4.c | 21 +++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr91461-5.c | 17 +++++
> > > > >  7 files changed, 203 insertions(+), 4 deletions(-)
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-1.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-2.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-3.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-4.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-5.c
> > > > >
> > > > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > > > > index 943e9a5c783..c134b04c5c4 100644
> > > > > --- a/gcc/config/i386/i386.h
> > > > > +++ b/gcc/config/i386/i386.h
> > > > > @@ -516,8 +516,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
> > > > >  #define TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL \
> > > > >         ix86_tune_features[X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL]
> > > > >  #define TARGET_SSE_SPLIT_REGS  ix86_tune_features[X86_TUNE_SSE_SPLIT_REGS]
> > > > > +/* NB: movaps/movups is one byte shorter than movdaq/movdqu.  But it
> > > > > +   isn't the case for AVX nor AVX512.  */
> > > > >  #define TARGET_SSE_TYPELESS_STORES \
> > > > > -       ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES]
> > > > > +       (!TARGET_AVX && ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES])
> > > >
> > > > This is wrong place to disable the feature.
> > >
> > > Like this?
> >
> > No.
> >
> > There is a mode attribute in i386.md/sse.md for relevant patterns.
> > Please adapt calculation of mode attributes instead.
> >
>
> Like this?

Still no.

You could move

(match_test "TARGET_AVX")
  (const_string "TI")

up to bypass the cases below.

Uros.


Uros.

>
> --
> H.J.
H.J. Lu Jan. 28, 2020, 3:33 p.m. UTC | #7
On Tue, Jan 28, 2020 at 6:45 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Tue, Jan 28, 2020 at 3:32 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, Jan 27, 2020 at 11:04 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Mon, Jan 27, 2020 at 11:17 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Mon, Jan 27, 2020 at 12:26 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > >
> > > > > On Mon, Jan 27, 2020 at 7:23 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > >
> > > > > > movaps/movups is one byte shorter than movdaq/movdqu.  But it isn't the
> > > > > > case for AVX nor AVX512.  We should disable TARGET_SSE_TYPELESS_STORES
> > > > > > for TARGET_AVX.
> > > > > >
> > > > > > gcc/
> > > > > >
> > > > > >         PR target/91461
> > > > > >         * config/i386/i386.h (TARGET_SSE_TYPELESS_STORES): Disable for
> > > > > >         TARGET_AVX.
> > > > > >         * config/i386/i386.md (*movoi_internal_avx): Remove
> > > > > >         TARGET_SSE_TYPELESS_STORES check.
> > > > > >
> > > > > > gcc/testsuite/
> > > > > >
> > > > > >         PR target/91461
> > > > > >         * gcc.target/i386/pr91461-1.c: New test.
> > > > > >         * gcc.target/i386/pr91461-2.c: Likewise.
> > > > > >         * gcc.target/i386/pr91461-3.c: Likewise.
> > > > > >         * gcc.target/i386/pr91461-4.c: Likewise.
> > > > > >         * gcc.target/i386/pr91461-5.c: Likewise.
> > > > > > ---
> > > > > >  gcc/config/i386/i386.h                    |  4 +-
> > > > > >  gcc/config/i386/i386.md                   |  4 +-
> > > > > >  gcc/testsuite/gcc.target/i386/pr91461-1.c | 66 ++++++++++++++++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr91461-2.c | 19 ++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr91461-3.c | 76 +++++++++++++++++++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr91461-4.c | 21 +++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr91461-5.c | 17 +++++
> > > > > >  7 files changed, 203 insertions(+), 4 deletions(-)
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-1.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-2.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-3.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-4.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-5.c
> > > > > >
> > > > > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > > > > > index 943e9a5c783..c134b04c5c4 100644
> > > > > > --- a/gcc/config/i386/i386.h
> > > > > > +++ b/gcc/config/i386/i386.h
> > > > > > @@ -516,8 +516,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
> > > > > >  #define TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL \
> > > > > >         ix86_tune_features[X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL]
> > > > > >  #define TARGET_SSE_SPLIT_REGS  ix86_tune_features[X86_TUNE_SSE_SPLIT_REGS]
> > > > > > +/* NB: movaps/movups is one byte shorter than movdaq/movdqu.  But it
> > > > > > +   isn't the case for AVX nor AVX512.  */
> > > > > >  #define TARGET_SSE_TYPELESS_STORES \
> > > > > > -       ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES]
> > > > > > +       (!TARGET_AVX && ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES])
> > > > >
> > > > > This is wrong place to disable the feature.
> > > >
> > > > Like this?
> > >
> > > No.
> > >
> > > There is a mode attribute in i386.md/sse.md for relevant patterns.
> > > Please adapt calculation of mode attributes instead.
> > >
> >
> > Like this?
>
> Still no.
>
> You could move
>
> (match_test "TARGET_AVX")
>   (const_string "TI")
>
> up to bypass the cases below.
>

I don't think we can do that.   There are 2 cases where we prefer movaps/movups:

/* Use packed single precision instructions where posisble.  I.e.
movups instead   of movupd.  */
DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL,
"sse_packed_single_insn_optimal",
          m_BDVER | m_ZNVER)

/* X86_TUNE_SSE_TYPELESS_STORES: Always movaps/movups for 128bit stores.   */
DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
          m_AMD_MULTIPLE | m_CORE_ALL | m_GENERIC)

We should always use movaps/movups for TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL.
It is wrong to bypass TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL with TARGET_AVX
as m_BDVER | m_ZNVER support AVX.
Uros Bizjak Jan. 28, 2020, 5:12 p.m. UTC | #8
On Tue, Jan 28, 2020 at 4:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> > You could move
> >
> > (match_test "TARGET_AVX")
> >   (const_string "TI")
> >
> > up to bypass the cases below.
> >
>
> I don't think we can do that.   There are 2 cases where we prefer movaps/movups:
>
> /* Use packed single precision instructions where posisble.  I.e.
> movups instead   of movupd.  */
> DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL,
> "sse_packed_single_insn_optimal",
>           m_BDVER | m_ZNVER)
>
> /* X86_TUNE_SSE_TYPELESS_STORES: Always movaps/movups for 128bit stores.   */
> DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
>           m_AMD_MULTIPLE | m_CORE_ALL | m_GENERIC)
>
> We should always use movaps/movups for TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL.
> It is wrong to bypass TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL with TARGET_AVX
> as m_BDVER | m_ZNVER support AVX.

The reason for TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL on AMD target is
only insn size, as advised in e.g. Software Optimization Guide for the
AMD Family 15h Processors [1], section 7.1.2, where it is said:

--quote--
7.1.2 Reduce Instruction SizeOptimization

Reduce the size of instructions when possible.

Rationale

Using smaller instruction sizes improves instruction fetch throughput.
Specific examples include the following:

*In SIMD code, use the single-precision (PS) form of instructions
instead of the double-precision (PD) form. For example, for register
to register moves, MOVAPS achieves the same result as MOVAPD, but uses
one less byte to encode the instruction and has no prefix byte. Other
examples in which single-precision forms can be substituted for
double-precision forms include MOVUPS, MOVNTPS, XORPS, ORPS, ANDPS,
and SHUFPS.
...
--/quote--

Please note that this optimization applies only to non-AVX forms, as
demonstrated by:

   0:   0f 28 c8                movaps %xmm0,%xmm1
   3:   66 0f 28 c8             movapd %xmm0,%xmm1
   7:   c5 f8 28 d1             vmovaps %xmm1,%xmm2
   b:   c5 f9 28 d1             vmovapd %xmm1,%xmm2

Also note that MOVDQA is missing in the above optimization. It is
harmful to substitute MOVDQA with MOVAPS, as it can (and does)
introduce +1 cycle forwarding penalty between FLT (FPA/FPM) and INT
(VALU) FP clusters.

Following the above optimization, it is obvious that
TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL handling was cargo-culted from
one pattern to another. Its use should be reviewed and fixed where not
appropriate.

[1] https://www.amd.com/system/files/TechDocs/47414_15h_sw_opt_guide.pdf

Uros.
H.J. Lu Jan. 28, 2020, 5:51 p.m. UTC | #9
On Tue, Jan 28, 2020 at 9:12 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Tue, Jan 28, 2020 at 4:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> > > You could move
> > >
> > > (match_test "TARGET_AVX")
> > >   (const_string "TI")
> > >
> > > up to bypass the cases below.
> > >
> >
> > I don't think we can do that.   There are 2 cases where we prefer movaps/movups:
> >
> > /* Use packed single precision instructions where posisble.  I.e.
> > movups instead   of movupd.  */
> > DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL,
> > "sse_packed_single_insn_optimal",
> >           m_BDVER | m_ZNVER)
> >
> > /* X86_TUNE_SSE_TYPELESS_STORES: Always movaps/movups for 128bit stores.   */
> > DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
> >           m_AMD_MULTIPLE | m_CORE_ALL | m_GENERIC)
> >
> > We should always use movaps/movups for TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL.
> > It is wrong to bypass TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL with TARGET_AVX
> > as m_BDVER | m_ZNVER support AVX.
>
> The reason for TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL on AMD target is
> only insn size, as advised in e.g. Software Optimization Guide for the
> AMD Family 15h Processors [1], section 7.1.2, where it is said:
>
> --quote--
> 7.1.2 Reduce Instruction SizeOptimization
>
> Reduce the size of instructions when possible.
>
> Rationale
>
> Using smaller instruction sizes improves instruction fetch throughput.
> Specific examples include the following:
>
> *In SIMD code, use the single-precision (PS) form of instructions
> instead of the double-precision (PD) form. For example, for register
> to register moves, MOVAPS achieves the same result as MOVAPD, but uses
> one less byte to encode the instruction and has no prefix byte. Other
> examples in which single-precision forms can be substituted for
> double-precision forms include MOVUPS, MOVNTPS, XORPS, ORPS, ANDPS,
> and SHUFPS.
> ...
> --/quote--
>
> Please note that this optimization applies only to non-AVX forms, as
> demonstrated by:
>
>    0:   0f 28 c8                movaps %xmm0,%xmm1
>    3:   66 0f 28 c8             movapd %xmm0,%xmm1
>    7:   c5 f8 28 d1             vmovaps %xmm1,%xmm2
>    b:   c5 f9 28 d1             vmovapd %xmm1,%xmm2
>
> Also note that MOVDQA is missing in the above optimization. It is
> harmful to substitute MOVDQA with MOVAPS, as it can (and does)
> introduce +1 cycle forwarding penalty between FLT (FPA/FPM) and INT
> (VALU) FP clusters.
>
> Following the above optimization, it is obvious that
> TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL handling was cargo-culted from
> one pattern to another. Its use should be reviewed and fixed where not
> appropriate.
>
> [1] https://www.amd.com/system/files/TechDocs/47414_15h_sw_opt_guide.pdf
>
> Uros.

Here is the updated patch which moves TARGET_AVX before
TARGET_SSE_TYPELESS_STORES.   OK for master if there is
no regression?

Thanks.
Uros Bizjak Jan. 28, 2020, 6:04 p.m. UTC | #10
On Tue, Jan 28, 2020 at 6:51 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Jan 28, 2020 at 9:12 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Tue, Jan 28, 2020 at 4:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > > > You could move
> > > >
> > > > (match_test "TARGET_AVX")
> > > >   (const_string "TI")
> > > >
> > > > up to bypass the cases below.
> > > >
> > >
> > > I don't think we can do that.   There are 2 cases where we prefer movaps/movups:
> > >
> > > /* Use packed single precision instructions where posisble.  I.e.
> > > movups instead   of movupd.  */
> > > DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL,
> > > "sse_packed_single_insn_optimal",
> > >           m_BDVER | m_ZNVER)
> > >
> > > /* X86_TUNE_SSE_TYPELESS_STORES: Always movaps/movups for 128bit stores.   */
> > > DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
> > >           m_AMD_MULTIPLE | m_CORE_ALL | m_GENERIC)
> > >
> > > We should always use movaps/movups for TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL.
> > > It is wrong to bypass TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL with TARGET_AVX
> > > as m_BDVER | m_ZNVER support AVX.
> >
> > The reason for TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL on AMD target is
> > only insn size, as advised in e.g. Software Optimization Guide for the
> > AMD Family 15h Processors [1], section 7.1.2, where it is said:
> >
> > --quote--
> > 7.1.2 Reduce Instruction SizeOptimization
> >
> > Reduce the size of instructions when possible.
> >
> > Rationale
> >
> > Using smaller instruction sizes improves instruction fetch throughput.
> > Specific examples include the following:
> >
> > *In SIMD code, use the single-precision (PS) form of instructions
> > instead of the double-precision (PD) form. For example, for register
> > to register moves, MOVAPS achieves the same result as MOVAPD, but uses
> > one less byte to encode the instruction and has no prefix byte. Other
> > examples in which single-precision forms can be substituted for
> > double-precision forms include MOVUPS, MOVNTPS, XORPS, ORPS, ANDPS,
> > and SHUFPS.
> > ...
> > --/quote--
> >
> > Please note that this optimization applies only to non-AVX forms, as
> > demonstrated by:
> >
> >    0:   0f 28 c8                movaps %xmm0,%xmm1
> >    3:   66 0f 28 c8             movapd %xmm0,%xmm1
> >    7:   c5 f8 28 d1             vmovaps %xmm1,%xmm2
> >    b:   c5 f9 28 d1             vmovapd %xmm1,%xmm2
> >
> > Also note that MOVDQA is missing in the above optimization. It is
> > harmful to substitute MOVDQA with MOVAPS, as it can (and does)
> > introduce +1 cycle forwarding penalty between FLT (FPA/FPM) and INT
> > (VALU) FP clusters.
> >
> > Following the above optimization, it is obvious that
> > TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL handling was cargo-culted from
> > one pattern to another. Its use should be reviewed and fixed where not
> > appropriate.
> >
> > [1] https://www.amd.com/system/files/TechDocs/47414_15h_sw_opt_guide.pdf
> >
> > Uros.
>
> Here is the updated patch which moves TARGET_AVX before
> TARGET_SSE_TYPELESS_STORES.   OK for master if there is
> no regression?
>
> Thanks.


+       (match_test "TARGET_AVX")
+ (const_string "<sseinsnmode>")
        (and (match_test "<MODE_SIZE> == 16")

Only MODE_SIZE == 16 cases will be left here, since TARGET_AVX is
necessary for MODE_SIZE > 16. This test can be removed.

OK with the above change.

Thanks,
Uros.

> --
> H.J.
H.J. Lu Jan. 28, 2020, 6:20 p.m. UTC | #11
On Tue, Jan 28, 2020 at 10:04 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Tue, Jan 28, 2020 at 6:51 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Tue, Jan 28, 2020 at 9:12 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Tue, Jan 28, 2020 at 4:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > > > You could move
> > > > >
> > > > > (match_test "TARGET_AVX")
> > > > >   (const_string "TI")
> > > > >
> > > > > up to bypass the cases below.
> > > > >
> > > >
> > > > I don't think we can do that.   There are 2 cases where we prefer movaps/movups:
> > > >
> > > > /* Use packed single precision instructions where posisble.  I.e.
> > > > movups instead   of movupd.  */
> > > > DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL,
> > > > "sse_packed_single_insn_optimal",
> > > >           m_BDVER | m_ZNVER)
> > > >
> > > > /* X86_TUNE_SSE_TYPELESS_STORES: Always movaps/movups for 128bit stores.   */
> > > > DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
> > > >           m_AMD_MULTIPLE | m_CORE_ALL | m_GENERIC)
> > > >
> > > > We should always use movaps/movups for TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL.
> > > > It is wrong to bypass TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL with TARGET_AVX
> > > > as m_BDVER | m_ZNVER support AVX.
> > >
> > > The reason for TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL on AMD target is
> > > only insn size, as advised in e.g. Software Optimization Guide for the
> > > AMD Family 15h Processors [1], section 7.1.2, where it is said:
> > >
> > > --quote--
> > > 7.1.2 Reduce Instruction SizeOptimization
> > >
> > > Reduce the size of instructions when possible.
> > >
> > > Rationale
> > >
> > > Using smaller instruction sizes improves instruction fetch throughput.
> > > Specific examples include the following:
> > >
> > > *In SIMD code, use the single-precision (PS) form of instructions
> > > instead of the double-precision (PD) form. For example, for register
> > > to register moves, MOVAPS achieves the same result as MOVAPD, but uses
> > > one less byte to encode the instruction and has no prefix byte. Other
> > > examples in which single-precision forms can be substituted for
> > > double-precision forms include MOVUPS, MOVNTPS, XORPS, ORPS, ANDPS,
> > > and SHUFPS.
> > > ...
> > > --/quote--
> > >
> > > Please note that this optimization applies only to non-AVX forms, as
> > > demonstrated by:
> > >
> > >    0:   0f 28 c8                movaps %xmm0,%xmm1
> > >    3:   66 0f 28 c8             movapd %xmm0,%xmm1
> > >    7:   c5 f8 28 d1             vmovaps %xmm1,%xmm2
> > >    b:   c5 f9 28 d1             vmovapd %xmm1,%xmm2
> > >
> > > Also note that MOVDQA is missing in the above optimization. It is
> > > harmful to substitute MOVDQA with MOVAPS, as it can (and does)
> > > introduce +1 cycle forwarding penalty between FLT (FPA/FPM) and INT
> > > (VALU) FP clusters.
> > >
> > > Following the above optimization, it is obvious that
> > > TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL handling was cargo-culted from
> > > one pattern to another. Its use should be reviewed and fixed where not
> > > appropriate.
> > >
> > > [1] https://www.amd.com/system/files/TechDocs/47414_15h_sw_opt_guide.pdf
> > >
> > > Uros.
> >
> > Here is the updated patch which moves TARGET_AVX before
> > TARGET_SSE_TYPELESS_STORES.   OK for master if there is
> > no regression?
> >
> > Thanks.
>
>
> +       (match_test "TARGET_AVX")
> + (const_string "<sseinsnmode>")
>         (and (match_test "<MODE_SIZE> == 16")
>
> Only MODE_SIZE == 16 cases will be left here, since TARGET_AVX is
> necessary for MODE_SIZE > 16. This test can be removed.
>
> OK with the above change.
>

This is the patch I am going to check in.

Thanks.
Jakub Jelinek Jan. 28, 2020, 6:57 p.m. UTC | #12
On Tue, Jan 28, 2020 at 10:20:36AM -0800, H.J. Lu wrote:
> From 66c534dedc7a9a632aa38c32e3f7c251b8f2c778 Mon Sep 17 00:00:00 2001
> From: "H.J. Lu" <hjl.tools@gmail.com>
> Date: Mon, 27 Jan 2020 09:35:11 -0800
> Subject: [PATCH] i386: Prefer TARGET_AVX over TARGET_SSE_TYPELESS_STORES
> 
> movaps/movups is one byte shorter than movdaq/movdqu.  But it isn't the
> case for AVX nor AVX512.  This patch prefers TARGET_AVX over
> TARGET_SSE_TYPELESS_STORES and adjust vmovups checks in assembly ouputs.

If you haven't committed yet, please fix the movdaq typo in the description
(to movdqa).

	Jakub
H.J. Lu Jan. 28, 2020, 7:06 p.m. UTC | #13
On Tue, Jan 28, 2020 at 10:58 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> On Tue, Jan 28, 2020 at 10:20:36AM -0800, H.J. Lu wrote:
> > From 66c534dedc7a9a632aa38c32e3f7c251b8f2c778 Mon Sep 17 00:00:00 2001
> > From: "H.J. Lu" <hjl.tools@gmail.com>
> > Date: Mon, 27 Jan 2020 09:35:11 -0800
> > Subject: [PATCH] i386: Prefer TARGET_AVX over TARGET_SSE_TYPELESS_STORES
> >
> > movaps/movups is one byte shorter than movdaq/movdqu.  But it isn't the
> > case for AVX nor AVX512.  This patch prefers TARGET_AVX over
> > TARGET_SSE_TYPELESS_STORES and adjust vmovups checks in assembly ouputs.
>
> If you haven't committed yet, please fix the movdaq typo in the description
> (to movdqa).
>

Will do.

Thanks.
diff mbox series

Patch

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 943e9a5c783..c134b04c5c4 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -516,8 +516,10 @@  extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 #define TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL \
 	ix86_tune_features[X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL]
 #define TARGET_SSE_SPLIT_REGS	ix86_tune_features[X86_TUNE_SSE_SPLIT_REGS]
+/* NB: movaps/movups is one byte shorter than movdaq/movdqu.  But it
+   isn't the case for AVX nor AVX512.  */
 #define TARGET_SSE_TYPELESS_STORES \
-	ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES]
+	(!TARGET_AVX && ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES])
 #define TARGET_SSE_LOAD0_BY_PXOR ix86_tune_features[X86_TUNE_SSE_LOAD0_BY_PXOR]
 #define TARGET_MEMORY_MISMATCH_STALL \
 	ix86_tune_features[X86_TUNE_MEMORY_MISMATCH_STALL]
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 6e9c9bd2fb6..bb096133880 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -1980,9 +1980,7 @@ 
 	       (and (eq_attr "alternative" "1")
 		    (match_test "TARGET_AVX512VL"))
 		 (const_string "XI")
-	       (ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
-		    (and (eq_attr "alternative" "3")
-			 (match_test "TARGET_SSE_TYPELESS_STORES")))
+	       (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
 		 (const_string "V8SF")
 	      ]
 	      (const_string "OI")))])
diff --git a/gcc/testsuite/gcc.target/i386/pr91461-1.c b/gcc/testsuite/gcc.target/i386/pr91461-1.c
new file mode 100644
index 00000000000..0c94b8e2b76
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr91461-1.c
@@ -0,0 +1,66 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler "\tvmovdqa\t" } } */
+/* { dg-final { scan-assembler "\tvmovdqu\t" } } */
+/* { dg-final { scan-assembler "\tvmovapd\t" } } */
+/* { dg-final { scan-assembler "\tvmovupd\t" } } */
+/* { dg-final { scan-assembler-not "\tvmovaps\t" } } */
+/* { dg-final { scan-assembler-not "\tvmovups\t" } } */
+
+#include <immintrin.h>
+
+void
+foo1 (__m128i *p, __m128i x)
+{
+  *p = x;
+}
+
+void
+foo2 (__m128d *p, __m128d x)
+{
+  *p = x;
+}
+
+void
+foo3 (__float128 *p, __float128 x)
+{
+  *p = x;
+}
+
+void
+foo4 (__m128i_u *p, __m128i x)
+{
+  *p = x;
+}
+
+void
+foo5 (__m128d_u *p, __m128d x)
+{
+  *p = x;
+}
+
+typedef __float128 __float128_u __attribute__ ((__aligned__ (1)));
+
+void
+foo6 (__float128_u *p, __float128 x)
+{
+  *p = x;
+}
+
+#ifdef __x86_64__
+typedef __int128 __int128_u __attribute__ ((__aligned__ (1)));
+
+extern __int128 int128;
+
+void
+foo7 (__int128 *p)
+{
+  *p = int128;
+}
+
+void
+foo8 (__int128_u *p)
+{
+  *p = int128;
+}
+#endif
diff --git a/gcc/testsuite/gcc.target/i386/pr91461-2.c b/gcc/testsuite/gcc.target/i386/pr91461-2.c
new file mode 100644
index 00000000000..921cfaf9780
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr91461-2.c
@@ -0,0 +1,19 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler "\tvmovdqa\t" } } */
+/* { dg-final { scan-assembler "\tvmovapd\t" } } */
+/* { dg-final { scan-assembler-not "\tvmovaps\t" } } */
+
+#include <immintrin.h>
+
+void
+foo1 (__m256i *p, __m256i x)
+{
+  *p = x;
+}
+
+void
+foo2 (__m256d *p, __m256d x)
+{
+  *p = x;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr91461-3.c b/gcc/testsuite/gcc.target/i386/pr91461-3.c
new file mode 100644
index 00000000000..c67a48063bf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr91461-3.c
@@ -0,0 +1,76 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mavx512f -mavx512vl" } */
+/* { dg-final { scan-assembler-not "\tvmovaps\t" } } */
+/* { dg-final { scan-assembler-not "\tvmovups\t" } } */
+
+#include <immintrin.h>
+
+void
+foo1 (__m128i *p, __m128i a)
+{
+  register __m128i x __asm ("xmm16") = a;
+  asm volatile ("" : "+v" (x));
+  *p = x;
+}
+
+void
+foo2 (__m128d *p, __m128d a)
+{
+  register __m128d x __asm ("xmm16") = a;
+  asm volatile ("" : "+v" (x));
+  *p = x;
+}
+
+void
+foo3 (__float128 *p, __float128 a)
+{
+  register __float128 x __asm ("xmm16") = a;
+  asm volatile ("" : "+v" (x));
+  *p = x;
+}
+
+void
+foo4 (__m128i_u *p, __m128i a)
+{
+  register __m128i x __asm ("xmm16") = a;
+  asm volatile ("" : "+v" (x));
+  *p = x;
+}
+
+void
+foo5 (__m128d_u *p, __m128d a)
+{
+  register __m128d x __asm ("xmm16") = a;
+  asm volatile ("" : "+v" (x));
+  *p = x;
+}
+
+typedef __float128 __float128_u __attribute__ ((__aligned__ (1)));
+
+void
+foo6 (__float128_u *p, __float128 a)
+{
+  register __float128 x __asm ("xmm16") = a;
+  asm volatile ("" : "+v" (x));
+  *p = x;
+}
+
+typedef __int128 __int128_u __attribute__ ((__aligned__ (1)));
+
+extern __int128 int128;
+
+void
+foo7 (__int128 *p)
+{
+  register __int128 x __asm ("xmm16") = int128;
+  asm volatile ("" : "+v" (x));
+  *p = x;
+}
+
+void
+foo8 (__int128_u *p)
+{
+  register __int128 x __asm ("xmm16") = int128;
+  asm volatile ("" : "+v" (x));
+  *p = x;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr91461-4.c b/gcc/testsuite/gcc.target/i386/pr91461-4.c
new file mode 100644
index 00000000000..69df590de3a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr91461-4.c
@@ -0,0 +1,21 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mavx512f -mavx512vl" } */
+/* { dg-final { scan-assembler-not "\tvmovaps\t" } } */
+
+#include <immintrin.h>
+
+void
+foo1 (__m256i *p, __m256i a)
+{
+  register __m256i x __asm ("xmm16") = a;
+  asm volatile ("" : "+v" (x));
+  *p = x;
+}
+
+void
+foo2 (__m256d *p, __m256d a)
+{
+  register __m256d x __asm ("xmm16") = a;
+  asm volatile ("" : "+v" (x));
+  *p = x;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr91461-5.c b/gcc/testsuite/gcc.target/i386/pr91461-5.c
new file mode 100644
index 00000000000..974263042f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr91461-5.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-final { scan-assembler-not "\tvmovaps\t" } } */
+
+#include <immintrin.h>
+
+void
+foo1 (__m512i *p, __m512i x)
+{
+  *p = x;
+}
+
+void
+foo2 (__m512d *p, __m512d x)
+{
+  *p = x;
+}