diff mbox series

[v7,03/10] x86: Update piecewise move and store

Message ID CAMe9rOrOnCWMBrdLVnQzr=s0Mi8kXDpEaDmFyueULr-jvseWjA@mail.gmail.com
State New
Headers show
Series None | expand

Commit Message

H.J. Lu Aug. 2, 2021, 2:56 p.m. UTC
On Mon, Aug 2, 2021 at 4:20 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Fri, Jul 30, 2021 at 11:32 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > We can use TImode/OImode/XImode integers for piecewise move and store.
> >
> > 1. Define MAX_MOVE_MAX to 64, which is the constant maximum number of
> > bytes that a single instruction can move quickly between memory and
> > registers or between two memory locations.
> > 2. Define MOVE_MAX to MOVE_MAX_PIECES, which is the maximum number of
> > bytes we can move from memory to memory in one reasonably fast instruction.
> > The difference between MAX_MOVE_MAX and MOVE_MAX is that MAX_MOVE_MAX
> > must be a constant, independent of compiler options, since it is used in
> > reload.h to define struct target_reload and MOVE_MAX can vary, depending
> > on compiler options.
> > 3. When a vector register is used for piecewise move and store, we don't
> > increase stack_alignment_needed since a vector register spill isn't
> > required for piecewise move and store.  Since stack_realign_needed is
> > set to true by checking stack_alignment_estimated, which is set by pseudo
> > vector register usage, we also need to check stack_realign_needed to
> > eliminate the frame pointer.
> >
> > gcc/
> >
> >         * config/i386/i386.c (ix86_finalize_stack_frame_flags): Also
> >         check stack_realign_needed for stack realignment.
> >         (ix86_legitimate_constant_p): Always allow CONST_WIDE_INT smaller
> >         than the largest integer supported by vector register.
> >         * config/i386/i386.h (MAX_MOVE_MAX): New.  Set to 64.
> >         (MOVE_MAX_PIECES): Set to bytes of the largest integer supported
> >         by vector register.
> >         (MOVE_MAX): Defined to MOVE_MAX_PIECES.
> >         (STORE_MAX_PIECES): New.
> >
> > gcc/testsuite/
> >
> >         * gcc.target/i386/pr90773-1.c: Adjust to expect movq for 32-bit.
> >         * gcc.target/i386/pr90773-4.c: Also run for 32-bit.
> >         * gcc.target/i386/pr90773-15.c: Likewise.
> >         * gcc.target/i386/pr90773-16.c: Likewise.
> >         * gcc.target/i386/pr90773-17.c: Likewise.
> >         * gcc.target/i386/pr90773-24.c: Likewise.
> >         * gcc.target/i386/pr90773-25.c: Likewise.
> >         * gcc.target/i386/pr100865-1.c: Likewise.
> >         * gcc.target/i386/pr100865-2.c: Likewise.
> >         * gcc.target/i386/pr100865-3.c: Likewise.
> >         * gcc.target/i386/pr90773-14.c: Also run for 32-bit and expect
> >         XMM movd to store 4 bytes.
> >         * gcc.target/i386/pr100865-4a.c: Also run for 32-bit and expect
> >         YMM registers.
> >         * gcc.target/i386/pr100865-4b.c: Likewise.
> >         * gcc.target/i386/pr100865-10a.c: Expect YMM registers.
> >         * gcc.target/i386/pr100865-10b.c: Likewise.
> > ---
> >  gcc/config/i386/i386.c                       | 21 ++++++++--
> >  gcc/config/i386/i386.h                       | 40 ++++++++++++++++----
> >  gcc/testsuite/gcc.target/i386/pr100865-1.c   |  2 +-
> >  gcc/testsuite/gcc.target/i386/pr100865-10a.c |  4 +-
> >  gcc/testsuite/gcc.target/i386/pr100865-10b.c |  4 +-
> >  gcc/testsuite/gcc.target/i386/pr100865-2.c   |  2 +-
> >  gcc/testsuite/gcc.target/i386/pr100865-3.c   |  2 +-
> >  gcc/testsuite/gcc.target/i386/pr100865-4a.c  |  6 +--
> >  gcc/testsuite/gcc.target/i386/pr100865-4b.c  |  8 ++--
> >  gcc/testsuite/gcc.target/i386/pr90773-1.c    | 10 ++---
> >  gcc/testsuite/gcc.target/i386/pr90773-14.c   |  2 +-
> >  gcc/testsuite/gcc.target/i386/pr90773-15.c   |  6 +--
> >  gcc/testsuite/gcc.target/i386/pr90773-16.c   |  2 +-
> >  gcc/testsuite/gcc.target/i386/pr90773-17.c   |  2 +-
> >  gcc/testsuite/gcc.target/i386/pr90773-24.c   |  2 +-
> >  gcc/testsuite/gcc.target/i386/pr90773-25.c   |  2 +-
> >  gcc/testsuite/gcc.target/i386/pr90773-4.c    |  2 +-
> >  17 files changed, 76 insertions(+), 41 deletions(-)
> >
> > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > index 5d20ca2067f..842eb0e6786 100644
> > --- a/gcc/config/i386/i386.c
> > +++ b/gcc/config/i386/i386.c
> > @@ -7953,8 +7953,17 @@ ix86_finalize_stack_frame_flags (void)
> >       assumed stack realignment might be needed or -fno-omit-frame-pointer
> >       is used, but in the end nothing that needed the stack alignment had
> >       been spilled nor stack access, clear frame_pointer_needed and say we
> > -     don't need stack realignment.  */
> > -  if ((stack_realign || (!flag_omit_frame_pointer && optimize))
> > +     don't need stack realignment.
> > +
> > +     When vector register is used for piecewise move and store, we don't
> > +     increase stack_alignment_needed as there is no register spill for
> > +     piecewise move and store.  Since stack_realign_needed is set to true
> > +     by checking stack_alignment_estimated which is updated by pseudo
> > +     vector register usage, we also need to check stack_realign_needed to
> > +     eliminate frame pointer.  */
> > +  if ((stack_realign
> > +       || (!flag_omit_frame_pointer && optimize)
> > +       || crtl->stack_realign_needed)
> >        && frame_pointer_needed
> >        && crtl->is_leaf
> >        && crtl->sp_is_unchanging
> > @@ -10418,7 +10427,13 @@ ix86_legitimate_constant_p (machine_mode mode, rtx x)
> >           /* FALLTHRU */
> >         case E_OImode:
> >         case E_XImode:
> > -         if (!standard_sse_constant_p (x, mode))
> > +         if (!standard_sse_constant_p (x, mode)
> > +             && GET_MODE_SIZE (TARGET_AVX512F
> > +                               ? XImode
> > +                               : (TARGET_AVX
> > +                                  ? OImode
> > +                                  : (TARGET_SSE2
> > +                                     ? TImode : DImode))) < GET_MODE_SIZE (mode))
> >             return false;
> >         default:
> >           break;
> > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > index d1e1c225990..50418a0cc9b 100644
> > --- a/gcc/config/i386/i386.h
> > +++ b/gcc/config/i386/i386.h
> > @@ -1757,9 +1757,10 @@ typedef struct ix86_args {
> >  /* Define this as 1 if `char' should by default be signed; else as 0.  */
> >  #define DEFAULT_SIGNED_CHAR 1
> >
> > -/* Max number of bytes we can move from memory to memory
> > -   in one reasonably fast instruction.  */
> > -#define MOVE_MAX 16
> > +/* The constant maximum number of bytes that a single instruction can
> > +   move quickly between memory and registers or between two memory
> > +   locations.  */
> > +#define MAX_MOVE_MAX 64
> >
> >  /* MOVE_MAX_PIECES is the number of bytes at a time which we can
> >     move efficiently, as opposed to  MOVE_MAX which is the maximum
>
> The comment here is now totally wrong.

Fixed.

> > @@ -1770,11 +1771,34 @@ typedef struct ix86_args {
> >     widest mode with MAX_FIXED_MODE_SIZE, we can only use TImode in
> >     64-bit mode.  */
> >  #define MOVE_MAX_PIECES \
> > -  ((TARGET_64BIT \
> > -    && TARGET_SSE2 \
> > -    && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
> > -    && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> > -   ? GET_MODE_SIZE (TImode) : UNITS_PER_WORD)
> > +  ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> > +   ? 64 \
> > +   : ((TARGET_AVX \
> > +       && !TARGET_PREFER_AVX128 \
> > +       && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD \
> > +       && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> > +      ? 32 \
> > +      : ((TARGET_SSE2 \
> > +         && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
> > +         && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> > +        ? 16 : UNITS_PER_WORD)))
> > +
> > +/* Max number of bytes we can move from memory to memory in one
> > +   reasonably fast instruction.  */
> > +#define MOVE_MAX MOVE_MAX_PIECES
>
> Isn't this a bit backward now? Instead of the above define, we should
> define MOVE_MAX rather than MOVE_MAX_PIECES, since defaults.h has:

Here is the v7 patch, which changes this to:

/* Max number of bytes we can move from memory to memory in one
   reasonably fast instruction, as opposed to MOVE_MAX_PIECES which
   is the number of bytes at a time which we can move efficiently.
   MOVE_MAX_PIECES defaults to MOVE_MAX.  */

#define MOVE_MAX \
  ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
   ? 64 \
   : ((TARGET_AVX \
       && !TARGET_PREFER_AVX128 \
       && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD \
       && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
      ? 32 \
      : ((TARGET_SSE2 \
          && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
          && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
         ? 16 : UNITS_PER_WORD)))

OK for master?  Thanks.

> defaults.h:#ifndef MOVE_MAX_PIECES
> defaults.h:#define MOVE_MAX_PIECES   MOVE_MAX
>
> Uros.
>
> > +
> > +/* STORE_MAX_PIECES is the number of bytes at a time that we can
> > +   store efficiently.  */
> > +#define STORE_MAX_PIECES \
> > +  ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> > +   ? 64 \
> > +   : ((TARGET_AVX \
> > +       && !TARGET_PREFER_AVX128 \
> > +       && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> > +      ? 32 \
> > +      : ((TARGET_SSE2 \
> > +         && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> > +        ? 16 : UNITS_PER_WORD)))
> >
> >  /* If a memory-to-memory move would take MOVE_RATIO or more simple
> >     move-instruction pairs, we will do a cpymem or libcall instead.
> > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-1.c b/gcc/testsuite/gcc.target/i386/pr100865-1.c
> > index 6c3097fb2a6..949dd5c337a 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr100865-1.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr100865-1.c
> > @@ -1,4 +1,4 @@
> > -/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-do compile } */
> >  /* { dg-options "-O2 -march=x86-64" } */
> >
> >  extern char *dst;
> > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10a.c b/gcc/testsuite/gcc.target/i386/pr100865-10a.c
> > index 7ffc19e56a8..98b6dfb16f3 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr100865-10a.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr100865-10a.c
> > @@ -29,5 +29,5 @@ foo (void)
> >      array[i] = MK_CONST128_BROADCAST (0x1f);
> >  }
> >
> > -/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
> > -/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
> > +/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+\[^\n\]*, %ymm\[0-9\]+" 1 } } */
> > +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10b.c b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
> > index edf52765c60..e5616d8d258 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr100865-10b.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
> > @@ -3,5 +3,5 @@
> >
> >  #include "pr100865-10a.c"
> >
> > -/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */
> > -/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
> > +/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
> > +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 8 } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-2.c b/gcc/testsuite/gcc.target/i386/pr100865-2.c
> > index 17efe2d72a3..f3ea7753abe 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr100865-2.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr100865-2.c
> > @@ -1,4 +1,4 @@
> > -/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-do compile } */
> >  /* { dg-options "-O2 -march=skylake" } */
> >
> >  extern char *dst;
> > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-3.c b/gcc/testsuite/gcc.target/i386/pr100865-3.c
> > index 007e79f91b0..714c43e12c9 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr100865-3.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr100865-3.c
> > @@ -1,4 +1,4 @@
> > -/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-do compile } */
> >  /* { dg-options "-O2 -march=skylake-avx512" } */
> >
> >  extern char *dst;
> > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4a.c b/gcc/testsuite/gcc.target/i386/pr100865-4a.c
> > index f55883598f9..365487337ae 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr100865-4a.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr100865-4a.c
> > @@ -1,4 +1,4 @@
> > -/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-do compile } */
> >  /* { dg-options "-O2 -march=skylake" } */
> >
> >  extern char array[64];
> > @@ -11,6 +11,6 @@ foo (void)
> >      array[i] = -45;
> >  }
> >
> > -/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" 1 } } */
> > -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, " 4 } } */
> > +/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" 1 } } */
> > +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 2 } } */
> >  /* { dg-final { scan-assembler-not "vmovdqa" } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> > index 1e50dc842bc..8e8a7eaaaff 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> > @@ -1,9 +1,9 @@
> > -/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-do compile } */
> >  /* { dg-options "-O2 -march=skylake-avx512" } */
> >
> >  #include "pr100865-4a.c"
> >
> > -/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */
> > -/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, " 4 } } */
> > -/* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */
> > +/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
> > +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 2 } } */
> > +/* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" } } */
> >  /* { dg-final { scan-assembler-not "vmovdqa" } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/pr90773-1.c b/gcc/testsuite/gcc.target/i386/pr90773-1.c
> > index 1d9f282dc0d..4fd5a40d99d 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr90773-1.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr90773-1.c
> > @@ -1,5 +1,5 @@
> >  /* { dg-do compile } */
> > -/* { dg-options "-O2 -mtune=generic" } */
> > +/* { dg-options "-O2 -msse2 -mtune=generic" } */
> >
> >  extern char *dst, *src;
> >
> > @@ -9,9 +9,5 @@ foo (void)
> >    __builtin_memcpy (dst, src, 15);
> >  }
> >
> > -/* { dg-final { scan-assembler-times "movq\[\\t \]+\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
> > -/* { dg-final { scan-assembler-times "movq\[\\t \]+7\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
> > -/* { dg-final { scan-assembler-times "movl\[\\t \]+\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
> > -/* { dg-final { scan-assembler-times "movl\[\\t \]+4\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
> > -/* { dg-final { scan-assembler-times "movl\[\\t \]+8\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
> > -/* { dg-final { scan-assembler-times "movl\[\\t \]+11\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
> > +/* { dg-final { scan-assembler-times "movq\[\\t \]+\\(%\[\^,\]+\\)," 1 } } */
> > +/* { dg-final { scan-assembler-times "movq\[\\t \]+7\\(%\[\^,\]+\\)," 1 } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/pr90773-14.c b/gcc/testsuite/gcc.target/i386/pr90773-14.c
> > index e5c19f49cf5..96ee5cb08c1 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr90773-14.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr90773-14.c
> > @@ -1,4 +1,4 @@
> > -/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-do compile } */
> >  /* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
> >
> >  extern char *dst;
> > diff --git a/gcc/testsuite/gcc.target/i386/pr90773-15.c b/gcc/testsuite/gcc.target/i386/pr90773-15.c
> > index 185ea60e1d2..403cdb248a2 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr90773-15.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr90773-15.c
> > @@ -1,4 +1,4 @@
> > -/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-do compile } */
> >  /* { dg-options "-O2 -march=skylake-avx512" } */
> >
> >  extern char *dst;
> > @@ -9,6 +9,6 @@ foo (int c)
> >    __builtin_memset (dst, c, 17);
> >  }
> >
> > -/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%edi, %xmm\[0-9\]+" 1 } } */
> > +/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%.*, %xmm\[0-9\]+" 1 } } */
> >  /* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
> > -/* { dg-final { scan-assembler-times "movb\[\\t \]+%dil, 16\\(%\[\^,\]+\\)" 1 } } */
> > +/* { dg-final { scan-assembler-times "movb\[\\t \]+%.*, 16\\(%\[\^,\]+\\)" 1 } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/pr90773-16.c b/gcc/testsuite/gcc.target/i386/pr90773-16.c
> > index d820cc318c3..bb0aadbc77e 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr90773-16.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr90773-16.c
> > @@ -1,4 +1,4 @@
> > -/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-do compile } */
> >  /* { dg-options "-O2 -march=skylake-avx512" } */
> >
> >  extern char *dst;
> > diff --git a/gcc/testsuite/gcc.target/i386/pr90773-17.c b/gcc/testsuite/gcc.target/i386/pr90773-17.c
> > index f6f179e9b5b..73d5d5abaee 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr90773-17.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr90773-17.c
> > @@ -1,4 +1,4 @@
> > -/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-do compile } */
> >  /* { dg-options "-O2 -march=skylake-avx512" } */
> >
> >  extern char *dst;
> > diff --git a/gcc/testsuite/gcc.target/i386/pr90773-24.c b/gcc/testsuite/gcc.target/i386/pr90773-24.c
> > index 7b2ea66dcfc..71f1fd8c4df 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr90773-24.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr90773-24.c
> > @@ -1,4 +1,4 @@
> > -/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-do compile } */
> >  /* { dg-options "-O2 -march=x86-64" } */
> >
> >  struct S
> > diff --git a/gcc/testsuite/gcc.target/i386/pr90773-25.c b/gcc/testsuite/gcc.target/i386/pr90773-25.c
> > index 57642ea8d2d..ad19a88c883 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr90773-25.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr90773-25.c
> > @@ -1,4 +1,4 @@
> > -/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-do compile } */
> >  /* { dg-options "-O2 -march=x86-64" } */
> >
> >  struct S
> > diff --git a/gcc/testsuite/gcc.target/i386/pr90773-4.c b/gcc/testsuite/gcc.target/i386/pr90773-4.c
> > index ec0bc0100ae..ee4c04678d1 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr90773-4.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr90773-4.c
> > @@ -1,4 +1,4 @@
> > -/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-do compile } */
> >  /* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
> >
> >  extern char *dst;
> > --
> > 2.31.1
> >

Comments

Uros Bizjak Aug. 2, 2021, 3:53 p.m. UTC | #1
On Mon, Aug 2, 2021 at 4:57 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Aug 2, 2021 at 4:20 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Fri, Jul 30, 2021 at 11:32 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > We can use TImode/OImode/XImode integers for piecewise move and store.
> > >
> > > 1. Define MAX_MOVE_MAX to 64, which is the constant maximum number of
> > > bytes that a single instruction can move quickly between memory and
> > > registers or between two memory locations.
> > > 2. Define MOVE_MAX to MOVE_MAX_PIECES, which is the maximum number of
> > > bytes we can move from memory to memory in one reasonably fast instruction.
> > > The difference between MAX_MOVE_MAX and MOVE_MAX is that MAX_MOVE_MAX
> > > must be a constant, independent of compiler options, since it is used in
> > > reload.h to define struct target_reload and MOVE_MAX can vary, depending
> > > on compiler options.
> > > 3. When a vector register is used for piecewise move and store, we don't
> > > increase stack_alignment_needed since a vector register spill isn't
> > > required for piecewise move and store.  Since stack_realign_needed is
> > > set to true by checking stack_alignment_estimated, which is set by pseudo
> > > vector register usage, we also need to check stack_realign_needed to
> > > eliminate the frame pointer.
> > >
> > > gcc/
> > >
> > >         * config/i386/i386.c (ix86_finalize_stack_frame_flags): Also
> > >         check stack_realign_needed for stack realignment.
> > >         (ix86_legitimate_constant_p): Always allow CONST_WIDE_INT smaller
> > >         than the largest integer supported by vector register.
> > >         * config/i386/i386.h (MAX_MOVE_MAX): New.  Set to 64.
> > >         (MOVE_MAX_PIECES): Set to bytes of the largest integer supported
> > >         by vector register.
> > >         (MOVE_MAX): Defined to MOVE_MAX_PIECES.
> > >         (STORE_MAX_PIECES): New.
> > >
> > > gcc/testsuite/
> > >
> > >         * gcc.target/i386/pr90773-1.c: Adjust to expect movq for 32-bit.
> > >         * gcc.target/i386/pr90773-4.c: Also run for 32-bit.
> > >         * gcc.target/i386/pr90773-15.c: Likewise.
> > >         * gcc.target/i386/pr90773-16.c: Likewise.
> > >         * gcc.target/i386/pr90773-17.c: Likewise.
> > >         * gcc.target/i386/pr90773-24.c: Likewise.
> > >         * gcc.target/i386/pr90773-25.c: Likewise.
> > >         * gcc.target/i386/pr100865-1.c: Likewise.
> > >         * gcc.target/i386/pr100865-2.c: Likewise.
> > >         * gcc.target/i386/pr100865-3.c: Likewise.
> > >         * gcc.target/i386/pr90773-14.c: Also run for 32-bit and expect
> > >         XMM movd to store 4 bytes.
> > >         * gcc.target/i386/pr100865-4a.c: Also run for 32-bit and expect
> > >         YMM registers.
> > >         * gcc.target/i386/pr100865-4b.c: Likewise.
> > >         * gcc.target/i386/pr100865-10a.c: Expect YMM registers.
> > >         * gcc.target/i386/pr100865-10b.c: Likewise.
> > > ---
> > >  gcc/config/i386/i386.c                       | 21 ++++++++--
> > >  gcc/config/i386/i386.h                       | 40 ++++++++++++++++----
> > >  gcc/testsuite/gcc.target/i386/pr100865-1.c   |  2 +-
> > >  gcc/testsuite/gcc.target/i386/pr100865-10a.c |  4 +-
> > >  gcc/testsuite/gcc.target/i386/pr100865-10b.c |  4 +-
> > >  gcc/testsuite/gcc.target/i386/pr100865-2.c   |  2 +-
> > >  gcc/testsuite/gcc.target/i386/pr100865-3.c   |  2 +-
> > >  gcc/testsuite/gcc.target/i386/pr100865-4a.c  |  6 +--
> > >  gcc/testsuite/gcc.target/i386/pr100865-4b.c  |  8 ++--
> > >  gcc/testsuite/gcc.target/i386/pr90773-1.c    | 10 ++---
> > >  gcc/testsuite/gcc.target/i386/pr90773-14.c   |  2 +-
> > >  gcc/testsuite/gcc.target/i386/pr90773-15.c   |  6 +--
> > >  gcc/testsuite/gcc.target/i386/pr90773-16.c   |  2 +-
> > >  gcc/testsuite/gcc.target/i386/pr90773-17.c   |  2 +-
> > >  gcc/testsuite/gcc.target/i386/pr90773-24.c   |  2 +-
> > >  gcc/testsuite/gcc.target/i386/pr90773-25.c   |  2 +-
> > >  gcc/testsuite/gcc.target/i386/pr90773-4.c    |  2 +-
> > >  17 files changed, 76 insertions(+), 41 deletions(-)
> > >
> > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > > index 5d20ca2067f..842eb0e6786 100644
> > > --- a/gcc/config/i386/i386.c
> > > +++ b/gcc/config/i386/i386.c
> > > @@ -7953,8 +7953,17 @@ ix86_finalize_stack_frame_flags (void)
> > >       assumed stack realignment might be needed or -fno-omit-frame-pointer
> > >       is used, but in the end nothing that needed the stack alignment had
> > >       been spilled nor stack access, clear frame_pointer_needed and say we
> > > -     don't need stack realignment.  */
> > > -  if ((stack_realign || (!flag_omit_frame_pointer && optimize))
> > > +     don't need stack realignment.
> > > +
> > > +     When vector register is used for piecewise move and store, we don't
> > > +     increase stack_alignment_needed as there is no register spill for
> > > +     piecewise move and store.  Since stack_realign_needed is set to true
> > > +     by checking stack_alignment_estimated which is updated by pseudo
> > > +     vector register usage, we also need to check stack_realign_needed to
> > > +     eliminate frame pointer.  */
> > > +  if ((stack_realign
> > > +       || (!flag_omit_frame_pointer && optimize)
> > > +       || crtl->stack_realign_needed)
> > >        && frame_pointer_needed
> > >        && crtl->is_leaf
> > >        && crtl->sp_is_unchanging
> > > @@ -10418,7 +10427,13 @@ ix86_legitimate_constant_p (machine_mode mode, rtx x)
> > >           /* FALLTHRU */
> > >         case E_OImode:
> > >         case E_XImode:
> > > -         if (!standard_sse_constant_p (x, mode))
> > > +         if (!standard_sse_constant_p (x, mode)
> > > +             && GET_MODE_SIZE (TARGET_AVX512F
> > > +                               ? XImode
> > > +                               : (TARGET_AVX
> > > +                                  ? OImode
> > > +                                  : (TARGET_SSE2
> > > +                                     ? TImode : DImode))) < GET_MODE_SIZE (mode))
> > >             return false;
> > >         default:
> > >           break;
> > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > > index d1e1c225990..50418a0cc9b 100644
> > > --- a/gcc/config/i386/i386.h
> > > +++ b/gcc/config/i386/i386.h
> > > @@ -1757,9 +1757,10 @@ typedef struct ix86_args {
> > >  /* Define this as 1 if `char' should by default be signed; else as 0.  */
> > >  #define DEFAULT_SIGNED_CHAR 1
> > >
> > > -/* Max number of bytes we can move from memory to memory
> > > -   in one reasonably fast instruction.  */
> > > -#define MOVE_MAX 16
> > > +/* The constant maximum number of bytes that a single instruction can
> > > +   move quickly between memory and registers or between two memory
> > > +   locations.  */
> > > +#define MAX_MOVE_MAX 64
> > >
> > >  /* MOVE_MAX_PIECES is the number of bytes at a time which we can
> > >     move efficiently, as opposed to  MOVE_MAX which is the maximum
> >
> > The comment here is now totally wrong.
>
> Fixed.
>
> > > @@ -1770,11 +1771,34 @@ typedef struct ix86_args {
> > >     widest mode with MAX_FIXED_MODE_SIZE, we can only use TImode in
> > >     64-bit mode.  */
> > >  #define MOVE_MAX_PIECES \
> > > -  ((TARGET_64BIT \
> > > -    && TARGET_SSE2 \
> > > -    && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
> > > -    && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> > > -   ? GET_MODE_SIZE (TImode) : UNITS_PER_WORD)
> > > +  ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> > > +   ? 64 \
> > > +   : ((TARGET_AVX \
> > > +       && !TARGET_PREFER_AVX128 \
> > > +       && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD \
> > > +       && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> > > +      ? 32 \
> > > +      : ((TARGET_SSE2 \
> > > +         && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
> > > +         && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> > > +        ? 16 : UNITS_PER_WORD)))
> > > +
> > > +/* Max number of bytes we can move from memory to memory in one
> > > +   reasonably fast instruction.  */
> > > +#define MOVE_MAX MOVE_MAX_PIECES
> >
> > Isn't this a bit backward now? Instead of the above define, we should
> > define MOVE_MAX rather than MOVE_MAX_PIECES, since defaults.h has:
>
> Here is the v7 patch, which changes this to:
>
> /* Max number of bytes we can move from memory to memory in one
>    reasonably fast instruction, as opposed to MOVE_MAX_PIECES which
>    is the number of bytes at a time which we can move efficiently.
>    MOVE_MAX_PIECES defaults to MOVE_MAX.  */
>
> #define MOVE_MAX \
>   ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
>    ? 64 \
>    : ((TARGET_AVX \
>        && !TARGET_PREFER_AVX128 \
>        && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD \
>        && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
>       ? 32 \
>       : ((TARGET_SSE2 \
>           && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
>           && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
>          ? 16 : UNITS_PER_WORD)))
>
> OK for master?  Thanks.

OK.

Thanks,
Uros.

>
> > defaults.h:#ifndef MOVE_MAX_PIECES
> > defaults.h:#define MOVE_MAX_PIECES   MOVE_MAX
> >
> > Uros.
> >
> > > +
> > > +/* STORE_MAX_PIECES is the number of bytes at a time that we can
> > > +   store efficiently.  */
> > > +#define STORE_MAX_PIECES \
> > > +  ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> > > +   ? 64 \
> > > +   : ((TARGET_AVX \
> > > +       && !TARGET_PREFER_AVX128 \
> > > +       && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> > > +      ? 32 \
> > > +      : ((TARGET_SSE2 \
> > > +         && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> > > +        ? 16 : UNITS_PER_WORD)))
> > >
> > >  /* If a memory-to-memory move would take MOVE_RATIO or more simple
> > >     move-instruction pairs, we will do a cpymem or libcall instead.
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-1.c b/gcc/testsuite/gcc.target/i386/pr100865-1.c
> > > index 6c3097fb2a6..949dd5c337a 100644
> > > --- a/gcc/testsuite/gcc.target/i386/pr100865-1.c
> > > +++ b/gcc/testsuite/gcc.target/i386/pr100865-1.c
> > > @@ -1,4 +1,4 @@
> > > -/* { dg-do compile { target { ! ia32 } } } */
> > > +/* { dg-do compile } */
> > >  /* { dg-options "-O2 -march=x86-64" } */
> > >
> > >  extern char *dst;
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10a.c b/gcc/testsuite/gcc.target/i386/pr100865-10a.c
> > > index 7ffc19e56a8..98b6dfb16f3 100644
> > > --- a/gcc/testsuite/gcc.target/i386/pr100865-10a.c
> > > +++ b/gcc/testsuite/gcc.target/i386/pr100865-10a.c
> > > @@ -29,5 +29,5 @@ foo (void)
> > >      array[i] = MK_CONST128_BROADCAST (0x1f);
> > >  }
> > >
> > > -/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
> > > -/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
> > > +/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+\[^\n\]*, %ymm\[0-9\]+" 1 } } */
> > > +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 } } */
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10b.c b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
> > > index edf52765c60..e5616d8d258 100644
> > > --- a/gcc/testsuite/gcc.target/i386/pr100865-10b.c
> > > +++ b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
> > > @@ -3,5 +3,5 @@
> > >
> > >  #include "pr100865-10a.c"
> > >
> > > -/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */
> > > -/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
> > > +/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
> > > +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 8 } } */
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-2.c b/gcc/testsuite/gcc.target/i386/pr100865-2.c
> > > index 17efe2d72a3..f3ea7753abe 100644
> > > --- a/gcc/testsuite/gcc.target/i386/pr100865-2.c
> > > +++ b/gcc/testsuite/gcc.target/i386/pr100865-2.c
> > > @@ -1,4 +1,4 @@
> > > -/* { dg-do compile { target { ! ia32 } } } */
> > > +/* { dg-do compile } */
> > >  /* { dg-options "-O2 -march=skylake" } */
> > >
> > >  extern char *dst;
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-3.c b/gcc/testsuite/gcc.target/i386/pr100865-3.c
> > > index 007e79f91b0..714c43e12c9 100644
> > > --- a/gcc/testsuite/gcc.target/i386/pr100865-3.c
> > > +++ b/gcc/testsuite/gcc.target/i386/pr100865-3.c
> > > @@ -1,4 +1,4 @@
> > > -/* { dg-do compile { target { ! ia32 } } } */
> > > +/* { dg-do compile } */
> > >  /* { dg-options "-O2 -march=skylake-avx512" } */
> > >
> > >  extern char *dst;
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4a.c b/gcc/testsuite/gcc.target/i386/pr100865-4a.c
> > > index f55883598f9..365487337ae 100644
> > > --- a/gcc/testsuite/gcc.target/i386/pr100865-4a.c
> > > +++ b/gcc/testsuite/gcc.target/i386/pr100865-4a.c
> > > @@ -1,4 +1,4 @@
> > > -/* { dg-do compile { target { ! ia32 } } } */
> > > +/* { dg-do compile } */
> > >  /* { dg-options "-O2 -march=skylake" } */
> > >
> > >  extern char array[64];
> > > @@ -11,6 +11,6 @@ foo (void)
> > >      array[i] = -45;
> > >  }
> > >
> > > -/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" 1 } } */
> > > -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, " 4 } } */
> > > +/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" 1 } } */
> > > +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 2 } } */
> > >  /* { dg-final { scan-assembler-not "vmovdqa" } } */
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> > > index 1e50dc842bc..8e8a7eaaaff 100644
> > > --- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> > > +++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> > > @@ -1,9 +1,9 @@
> > > -/* { dg-do compile { target { ! ia32 } } } */
> > > +/* { dg-do compile } */
> > >  /* { dg-options "-O2 -march=skylake-avx512" } */
> > >
> > >  #include "pr100865-4a.c"
> > >
> > > -/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */
> > > -/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, " 4 } } */
> > > -/* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */
> > > +/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
> > > +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 2 } } */
> > > +/* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" } } */
> > >  /* { dg-final { scan-assembler-not "vmovdqa" } } */
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr90773-1.c b/gcc/testsuite/gcc.target/i386/pr90773-1.c
> > > index 1d9f282dc0d..4fd5a40d99d 100644
> > > --- a/gcc/testsuite/gcc.target/i386/pr90773-1.c
> > > +++ b/gcc/testsuite/gcc.target/i386/pr90773-1.c
> > > @@ -1,5 +1,5 @@
> > >  /* { dg-do compile } */
> > > -/* { dg-options "-O2 -mtune=generic" } */
> > > +/* { dg-options "-O2 -msse2 -mtune=generic" } */
> > >
> > >  extern char *dst, *src;
> > >
> > > @@ -9,9 +9,5 @@ foo (void)
> > >    __builtin_memcpy (dst, src, 15);
> > >  }
> > >
> > > -/* { dg-final { scan-assembler-times "movq\[\\t \]+\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
> > > -/* { dg-final { scan-assembler-times "movq\[\\t \]+7\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
> > > -/* { dg-final { scan-assembler-times "movl\[\\t \]+\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
> > > -/* { dg-final { scan-assembler-times "movl\[\\t \]+4\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
> > > -/* { dg-final { scan-assembler-times "movl\[\\t \]+8\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
> > > -/* { dg-final { scan-assembler-times "movl\[\\t \]+11\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
> > > +/* { dg-final { scan-assembler-times "movq\[\\t \]+\\(%\[\^,\]+\\)," 1 } } */
> > > +/* { dg-final { scan-assembler-times "movq\[\\t \]+7\\(%\[\^,\]+\\)," 1 } } */
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr90773-14.c b/gcc/testsuite/gcc.target/i386/pr90773-14.c
> > > index e5c19f49cf5..96ee5cb08c1 100644
> > > --- a/gcc/testsuite/gcc.target/i386/pr90773-14.c
> > > +++ b/gcc/testsuite/gcc.target/i386/pr90773-14.c
> > > @@ -1,4 +1,4 @@
> > > -/* { dg-do compile { target { ! ia32 } } } */
> > > +/* { dg-do compile } */
> > >  /* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
> > >
> > >  extern char *dst;
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr90773-15.c b/gcc/testsuite/gcc.target/i386/pr90773-15.c
> > > index 185ea60e1d2..403cdb248a2 100644
> > > --- a/gcc/testsuite/gcc.target/i386/pr90773-15.c
> > > +++ b/gcc/testsuite/gcc.target/i386/pr90773-15.c
> > > @@ -1,4 +1,4 @@
> > > -/* { dg-do compile { target { ! ia32 } } } */
> > > +/* { dg-do compile } */
> > >  /* { dg-options "-O2 -march=skylake-avx512" } */
> > >
> > >  extern char *dst;
> > > @@ -9,6 +9,6 @@ foo (int c)
> > >    __builtin_memset (dst, c, 17);
> > >  }
> > >
> > > -/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%edi, %xmm\[0-9\]+" 1 } } */
> > > +/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%.*, %xmm\[0-9\]+" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
> > > -/* { dg-final { scan-assembler-times "movb\[\\t \]+%dil, 16\\(%\[\^,\]+\\)" 1 } } */
> > > +/* { dg-final { scan-assembler-times "movb\[\\t \]+%.*, 16\\(%\[\^,\]+\\)" 1 } } */
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr90773-16.c b/gcc/testsuite/gcc.target/i386/pr90773-16.c
> > > index d820cc318c3..bb0aadbc77e 100644
> > > --- a/gcc/testsuite/gcc.target/i386/pr90773-16.c
> > > +++ b/gcc/testsuite/gcc.target/i386/pr90773-16.c
> > > @@ -1,4 +1,4 @@
> > > -/* { dg-do compile { target { ! ia32 } } } */
> > > +/* { dg-do compile } */
> > >  /* { dg-options "-O2 -march=skylake-avx512" } */
> > >
> > >  extern char *dst;
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr90773-17.c b/gcc/testsuite/gcc.target/i386/pr90773-17.c
> > > index f6f179e9b5b..73d5d5abaee 100644
> > > --- a/gcc/testsuite/gcc.target/i386/pr90773-17.c
> > > +++ b/gcc/testsuite/gcc.target/i386/pr90773-17.c
> > > @@ -1,4 +1,4 @@
> > > -/* { dg-do compile { target { ! ia32 } } } */
> > > +/* { dg-do compile } */
> > >  /* { dg-options "-O2 -march=skylake-avx512" } */
> > >
> > >  extern char *dst;
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr90773-24.c b/gcc/testsuite/gcc.target/i386/pr90773-24.c
> > > index 7b2ea66dcfc..71f1fd8c4df 100644
> > > --- a/gcc/testsuite/gcc.target/i386/pr90773-24.c
> > > +++ b/gcc/testsuite/gcc.target/i386/pr90773-24.c
> > > @@ -1,4 +1,4 @@
> > > -/* { dg-do compile { target { ! ia32 } } } */
> > > +/* { dg-do compile } */
> > >  /* { dg-options "-O2 -march=x86-64" } */
> > >
> > >  struct S
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr90773-25.c b/gcc/testsuite/gcc.target/i386/pr90773-25.c
> > > index 57642ea8d2d..ad19a88c883 100644
> > > --- a/gcc/testsuite/gcc.target/i386/pr90773-25.c
> > > +++ b/gcc/testsuite/gcc.target/i386/pr90773-25.c
> > > @@ -1,4 +1,4 @@
> > > -/* { dg-do compile { target { ! ia32 } } } */
> > > +/* { dg-do compile } */
> > >  /* { dg-options "-O2 -march=x86-64" } */
> > >
> > >  struct S
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr90773-4.c b/gcc/testsuite/gcc.target/i386/pr90773-4.c
> > > index ec0bc0100ae..ee4c04678d1 100644
> > > --- a/gcc/testsuite/gcc.target/i386/pr90773-4.c
> > > +++ b/gcc/testsuite/gcc.target/i386/pr90773-4.c
> > > @@ -1,4 +1,4 @@
> > > -/* { dg-do compile { target { ! ia32 } } } */
> > > +/* { dg-do compile } */
> > >  /* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
> > >
> > >  extern char *dst;
> > > --
> > > 2.31.1
> > >
>
>
>
> --
> H.J.
diff mbox series

Patch

From ea40e16bfc6c2eca5f861d802360e9d015f3630c Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Sat, 5 Mar 2016 07:17:09 -0800
Subject: [PATCH v7 03/10] x86: Update piecewise move and store

We can use TImode/OImode/XImode integers for piecewise move and store.

1. Define MAX_MOVE_MAX to 64, which is the constant maximum number of
bytes that a single instruction can move quickly between memory and
registers or between two memory locations.
2. Define MOVE_MAX to the maximum number of bytes we can move from memory
to memory in one reasonably fast instruction.  The difference between
MAX_MOVE_MAX and MOVE_MAX is that MAX_MOVE_MAX must be a constant,
independent of compiler options, since it is used in reload.h to define
struct target_reload, whereas MOVE_MAX can vary depending on compiler
options.
3. When a vector register is used for piecewise move and store, we don't
increase stack_alignment_needed, since no vector register spill is
required for piecewise move and store.  Since stack_realign_needed is
set to true by checking stack_alignment_estimated, which is set by pseudo
vector register usage, we also need to check stack_realign_needed in
order to eliminate the frame pointer.

gcc/

	* config/i386/i386.c (ix86_finalize_stack_frame_flags): Also
	check stack_realign_needed for stack realignment.
	(ix86_legitimate_constant_p): Always allow CONST_WIDE_INT smaller
	than the largest integer supported by vector register.
	* config/i386/i386.h (MAX_MOVE_MAX): New.  Set to 64.
	(MOVE_MAX): Set to bytes of the largest integer supported by
	vector register.
	(STORE_MAX_PIECES): New.

gcc/testsuite/

	* gcc.target/i386/pr90773-1.c: Adjust to expect movq for 32-bit.
	* gcc.target/i386/pr90773-4.c: Also run for 32-bit.
	* gcc.target/i386/pr90773-15.c: Likewise.
	* gcc.target/i386/pr90773-16.c: Likewise.
	* gcc.target/i386/pr90773-17.c: Likewise.
	* gcc.target/i386/pr90773-24.c: Likewise.
	* gcc.target/i386/pr90773-25.c: Likewise.
	* gcc.target/i386/pr100865-1.c: Likewise.
	* gcc.target/i386/pr100865-2.c: Likewise.
	* gcc.target/i386/pr100865-3.c: Likewise.
	* gcc.target/i386/pr90773-14.c: Also run for 32-bit and expect
	XMM movd to store 4 bytes.
	* gcc.target/i386/pr100865-4a.c: Also run for 32-bit and expect
	YMM registers.
	* gcc.target/i386/pr100865-4b.c: Likewise.
	* gcc.target/i386/pr100865-10a.c: Expect YMM registers.
	* gcc.target/i386/pr100865-10b.c: Likewise.

Fix x86: Update piecewise move and store

MOVE_MAX_PIECES -> MOVE_MAX.
---
 gcc/config/i386/i386.c                       | 21 ++++++--
 gcc/config/i386/i386.h                       | 53 +++++++++++++-------
 gcc/testsuite/gcc.target/i386/pr100865-1.c   |  2 +-
 gcc/testsuite/gcc.target/i386/pr100865-10a.c |  4 +-
 gcc/testsuite/gcc.target/i386/pr100865-10b.c |  4 +-
 gcc/testsuite/gcc.target/i386/pr100865-2.c   |  2 +-
 gcc/testsuite/gcc.target/i386/pr100865-3.c   |  2 +-
 gcc/testsuite/gcc.target/i386/pr100865-4a.c  |  6 +--
 gcc/testsuite/gcc.target/i386/pr100865-4b.c  |  8 +--
 gcc/testsuite/gcc.target/i386/pr90773-1.c    | 10 ++--
 gcc/testsuite/gcc.target/i386/pr90773-14.c   |  2 +-
 gcc/testsuite/gcc.target/i386/pr90773-15.c   |  6 +--
 gcc/testsuite/gcc.target/i386/pr90773-16.c   |  2 +-
 gcc/testsuite/gcc.target/i386/pr90773-17.c   |  2 +-
 gcc/testsuite/gcc.target/i386/pr90773-24.c   |  2 +-
 gcc/testsuite/gcc.target/i386/pr90773-25.c   |  2 +-
 gcc/testsuite/gcc.target/i386/pr90773-4.c    |  2 +-
 17 files changed, 79 insertions(+), 51 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 5d20ca2067f..842eb0e6786 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -7953,8 +7953,17 @@  ix86_finalize_stack_frame_flags (void)
      assumed stack realignment might be needed or -fno-omit-frame-pointer
      is used, but in the end nothing that needed the stack alignment had
      been spilled nor stack access, clear frame_pointer_needed and say we
-     don't need stack realignment.  */
-  if ((stack_realign || (!flag_omit_frame_pointer && optimize))
+     don't need stack realignment.
+
+     When vector register is used for piecewise move and store, we don't
+     increase stack_alignment_needed as there is no register spill for
+     piecewise move and store.  Since stack_realign_needed is set to true
+     by checking stack_alignment_estimated which is updated by pseudo
+     vector register usage, we also need to check stack_realign_needed to
+     eliminate frame pointer.  */
+  if ((stack_realign
+       || (!flag_omit_frame_pointer && optimize)
+       || crtl->stack_realign_needed)
       && frame_pointer_needed
       && crtl->is_leaf
       && crtl->sp_is_unchanging
@@ -10418,7 +10427,13 @@  ix86_legitimate_constant_p (machine_mode mode, rtx x)
 	  /* FALLTHRU */
 	case E_OImode:
 	case E_XImode:
-	  if (!standard_sse_constant_p (x, mode))
+	  if (!standard_sse_constant_p (x, mode)
+	      && GET_MODE_SIZE (TARGET_AVX512F
+				? XImode
+				: (TARGET_AVX
+				   ? OImode
+				   : (TARGET_SSE2
+				      ? TImode : DImode))) < GET_MODE_SIZE (mode))
 	    return false;
 	default:
 	  break;
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index d1e1c225990..bed9cd9da18 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1757,24 +1757,41 @@  typedef struct ix86_args {
 /* Define this as 1 if `char' should by default be signed; else as 0.  */
 #define DEFAULT_SIGNED_CHAR 1
 
-/* Max number of bytes we can move from memory to memory
-   in one reasonably fast instruction.  */
-#define MOVE_MAX 16
-
-/* MOVE_MAX_PIECES is the number of bytes at a time which we can
-   move efficiently, as opposed to  MOVE_MAX which is the maximum
-   number of bytes we can move with a single instruction.
-
-   ??? We should use TImode in 32-bit mode and use OImode or XImode
-   if they are available.  But since by_pieces_ninsns determines the
-   widest mode with MAX_FIXED_MODE_SIZE, we can only use TImode in
-   64-bit mode.  */
-#define MOVE_MAX_PIECES \
-  ((TARGET_64BIT \
-    && TARGET_SSE2 \
-    && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
-    && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
-   ? GET_MODE_SIZE (TImode) : UNITS_PER_WORD)
+/* The constant maximum number of bytes that a single instruction can
+   move quickly between memory and registers or between two memory
+   locations.  */
+#define MAX_MOVE_MAX 64
+
+/* Max number of bytes we can move from memory to memory in one
+   reasonably fast instruction, as opposed to MOVE_MAX_PIECES which
+   is the number of bytes at a time which we can move efficiently.
+   MOVE_MAX_PIECES defaults to MOVE_MAX.  */
+
+#define MOVE_MAX \
+  ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
+   ? 64 \
+   : ((TARGET_AVX \
+       && !TARGET_PREFER_AVX128 \
+       && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD \
+       && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
+      ? 32 \
+      : ((TARGET_SSE2 \
+	  && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
+	  && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
+	 ? 16 : UNITS_PER_WORD)))
+
+/* STORE_MAX_PIECES is the number of bytes at a time that we can
+   store efficiently.  */
+#define STORE_MAX_PIECES \
+  ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
+   ? 64 \
+   : ((TARGET_AVX \
+       && !TARGET_PREFER_AVX128 \
+       && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
+      ? 32 \
+      : ((TARGET_SSE2 \
+	  && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
+	 ? 16 : UNITS_PER_WORD)))
 
 /* If a memory-to-memory move would take MOVE_RATIO or more simple
    move-instruction pairs, we will do a cpymem or libcall instead.
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-1.c b/gcc/testsuite/gcc.target/i386/pr100865-1.c
index 6c3097fb2a6..949dd5c337a 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-1.c
@@ -1,4 +1,4 @@ 
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -march=x86-64" } */
 
 extern char *dst;
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10a.c b/gcc/testsuite/gcc.target/i386/pr100865-10a.c
index 7ffc19e56a8..98b6dfb16f3 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-10a.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-10a.c
@@ -29,5 +29,5 @@  foo (void)
     array[i] = MK_CONST128_BROADCAST (0x1f);
 }
 
-/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+\[^\n\]*, %ymm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10b.c b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
index edf52765c60..e5616d8d258 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-10b.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
@@ -3,5 +3,5 @@ 
 
 #include "pr100865-10a.c"
 
-/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 8 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-2.c b/gcc/testsuite/gcc.target/i386/pr100865-2.c
index 17efe2d72a3..f3ea7753abe 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-2.c
@@ -1,4 +1,4 @@ 
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -march=skylake" } */
 
 extern char *dst;
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-3.c b/gcc/testsuite/gcc.target/i386/pr100865-3.c
index 007e79f91b0..714c43e12c9 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-3.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-3.c
@@ -1,4 +1,4 @@ 
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -march=skylake-avx512" } */
 
 extern char *dst;
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4a.c b/gcc/testsuite/gcc.target/i386/pr100865-4a.c
index f55883598f9..365487337ae 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-4a.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-4a.c
@@ -1,4 +1,4 @@ 
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -march=skylake" } */
 
 extern char array[64];
@@ -11,6 +11,6 @@  foo (void)
     array[i] = -45;
 }
 
-/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, " 4 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 2 } } */
 /* { dg-final { scan-assembler-not "vmovdqa" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
index 1e50dc842bc..8e8a7eaaaff 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
@@ -1,9 +1,9 @@ 
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -march=skylake-avx512" } */
 
 #include "pr100865-4a.c"
 
-/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, " 4 } } */
-/* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */
+/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 2 } } */
+/* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" } } */
 /* { dg-final { scan-assembler-not "vmovdqa" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-1.c b/gcc/testsuite/gcc.target/i386/pr90773-1.c
index 1d9f282dc0d..4fd5a40d99d 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-1.c
@@ -1,5 +1,5 @@ 
 /* { dg-do compile } */
-/* { dg-options "-O2 -mtune=generic" } */
+/* { dg-options "-O2 -msse2 -mtune=generic" } */
 
 extern char *dst, *src;
 
@@ -9,9 +9,5 @@  foo (void)
   __builtin_memcpy (dst, src, 15);
 }
 
-/* { dg-final { scan-assembler-times "movq\[\\t \]+\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-times "movq\[\\t \]+7\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-times "movl\[\\t \]+\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
-/* { dg-final { scan-assembler-times "movl\[\\t \]+4\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
-/* { dg-final { scan-assembler-times "movl\[\\t \]+8\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
-/* { dg-final { scan-assembler-times "movl\[\\t \]+11\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movq\[\\t \]+\\(%\[\^,\]+\\)," 1 } } */
+/* { dg-final { scan-assembler-times "movq\[\\t \]+7\\(%\[\^,\]+\\)," 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-14.c b/gcc/testsuite/gcc.target/i386/pr90773-14.c
index e5c19f49cf5..96ee5cb08c1 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-14.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-14.c
@@ -1,4 +1,4 @@ 
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
 
 extern char *dst;
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-15.c b/gcc/testsuite/gcc.target/i386/pr90773-15.c
index 185ea60e1d2..403cdb248a2 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-15.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-15.c
@@ -1,4 +1,4 @@ 
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -march=skylake-avx512" } */
 
 extern char *dst;
@@ -9,6 +9,6 @@  foo (int c)
   __builtin_memset (dst, c, 17);
 }
 
-/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%edi, %xmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%.*, %xmm\[0-9\]+" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
-/* { dg-final { scan-assembler-times "movb\[\\t \]+%dil, 16\\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movb\[\\t \]+%.*, 16\\(%\[\^,\]+\\)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-16.c b/gcc/testsuite/gcc.target/i386/pr90773-16.c
index d820cc318c3..bb0aadbc77e 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-16.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-16.c
@@ -1,4 +1,4 @@ 
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -march=skylake-avx512" } */
 
 extern char *dst;
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-17.c b/gcc/testsuite/gcc.target/i386/pr90773-17.c
index f6f179e9b5b..73d5d5abaee 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-17.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-17.c
@@ -1,4 +1,4 @@ 
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -march=skylake-avx512" } */
 
 extern char *dst;
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-24.c b/gcc/testsuite/gcc.target/i386/pr90773-24.c
index 7b2ea66dcfc..71f1fd8c4df 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-24.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-24.c
@@ -1,4 +1,4 @@ 
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -march=x86-64" } */
 
 struct S
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-25.c b/gcc/testsuite/gcc.target/i386/pr90773-25.c
index 57642ea8d2d..ad19a88c883 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-25.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-25.c
@@ -1,4 +1,4 @@ 
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -march=x86-64" } */
 
 struct S
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-4.c b/gcc/testsuite/gcc.target/i386/pr90773-4.c
index ec0bc0100ae..ee4c04678d1 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-4.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-4.c
@@ -1,4 +1,4 @@ 
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
 
 extern char *dst;
-- 
2.31.1