i386: Properly encode xmm16-xmm31/ymm16-ymm31 for vector move

Message ID: 20190222162451.GA18480@intel.com
State: New
Series: i386: Properly encode xmm16-xmm31/ymm16-ymm31 for vector move

Commit Message

H.J. Lu Feb. 22, 2019, 4:24 p.m. UTC
Hi Jan, Uros,

This patch fixes the wrong code bug:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89229

Tested on AVX2 and AVX512 with and without --with-arch=native.

OK for trunk?

Thanks.

H.J.
--
The i386 backend has

INT_MODE (OI, 32);
INT_MODE (XI, 64);

So XImode represents 64 integer bytes, i.e. a 64 * 8 = 512 bit
operation; in the case of const_1, all 512 bits are set.

We can load zeros with a narrower instruction (e.g. 256 bits, via the
inherent zeroing of the high part by a 128 bit xor), so TImode is used
in this case.

Some targets prefer V4SF mode, so they will emit a float xorps for
zeroing.
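
As an illustration (these instructions are not part of the patch),
either form zeroes the whole register, since VEX- and EVEX-encoded
instructions clear the upper bits of the destination register:

	vpxor	%xmm0, %xmm0, %xmm0	# 128 bit integer xor; bits 128-511 of zmm0 are cleared too
	vxorps	%xmm0, %xmm0, %xmm0	# float variant preferred by some targets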

sse.md has

(define_insn "mov<mode>_internal"
  [(set (match_operand:VMOVE 0 "nonimmediate_operand"
         "=v,v ,v ,m")
        (match_operand:VMOVE 1 "nonimmediate_or_sse_const_operand"
         " C,BC,vm,v"))]
....
      /* There is no evex-encoded vmov* for sizes smaller than 64-bytes
         in avx512f, so we need to use workarounds, to access sse registers
         16-31, which are evex-only. In avx512vl we don't need workarounds.  */
      if (TARGET_AVX512F && <MODE_SIZE> < 64 && !TARGET_AVX512VL
          && (EXT_REX_SSE_REG_P (operands[0])
              || EXT_REX_SSE_REG_P (operands[1])))
        {
          if (memory_operand (operands[0], <MODE>mode))
            {
              if (<MODE_SIZE> == 32)
                return "vextract<shuffletype>64x4\t{$0x0, %g1, %0|%0, %g1, 0x0}";
              else if (<MODE_SIZE> == 16)
                return "vextract<shuffletype>32x4\t{$0x0, %g1, %0|%0, %g1, 0x0}";
              else
                gcc_unreachable ();
            }
...

However, since ix86_hard_regno_mode_ok has

     /* TODO check for QI/HI scalars.  */
      /* AVX512VL allows sse regs16+ for 128/256 bit modes.  */
      if (TARGET_AVX512VL
          && (mode == OImode
              || mode == TImode
              || VALID_AVX256_REG_MODE (mode)
              || VALID_AVX512VL_128_REG_MODE (mode)))
        return true;

      /* xmm16-xmm31 are only available for AVX-512.  */
      if (EXT_REX_SSE_REGNO_P (regno))
        return false;

      if (TARGET_AVX512F && <MODE_SIZE> < 64 && !TARGET_AVX512VL
          && (EXT_REX_SSE_REG_P (operands[0])
              || EXT_REX_SSE_REG_P (operands[1])))

is dead code: ix86_hard_regno_mode_ok only allows xmm16-xmm31 in
128/256 bit modes when AVX512VL is enabled, so without AVX512VL the
operands can never be in those registers.

Also for

#include <immintrin.h>

long long *p;
volatile __m256i yy;

void
foo (void)
{
  _mm256_store_epi64 (p, yy);
}
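
A hypothetical command to reproduce (options and file name assumed,
not taken from this mail):

	gcc -O2 -mavx512vl -S pr89229.c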

with AVX512VL, we should generate

	vmovdqa		%ymm0, (%rax)

not

	vmovdqa64	%ymm0, (%rax)

since the VEX-encoded vmovdqa has a shorter encoding than the
EVEX-encoded vmovdqa64.

All TYPE_SSEMOV vector moves are consolidated into ix86_output_ssemov:

1. If xmm16-xmm31/ymm16-ymm31 registers aren't used, SSE/AVX vector
moves will be generated.
2. If xmm16-xmm31/ymm16-ymm31 registers are used:
   a. With AVX512VL, AVX512VL vector moves will be generated.
   b. Without AVX512VL, an xmm16-xmm31/ymm16-ymm31 register-to-register
      move will be done with a zmm register move, as in the example below.
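
For instance (illustrative output based on the new ix86_get_ssemov
logic; the register numbers are made up), a V8SF-mode move from ymm16
to ymm17 without AVX512VL is emitted as a full zmm move:

	vmovaps	%zmm16, %zmm17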

ext_sse_reg_operand is removed since it is no longer needed.

Tested on AVX2 and AVX512 with and without --with-arch=native.

gcc/

	PR target/89229
	PR target/89346
	* config/i386/i386-protos.h (ix86_output_ssemov): New prototype.
	* config/i386/i386.c (ix86_get_ssemov): New function.
	(ix86_output_ssemov): Likewise.
	* config/i386/i386.md (*movxi_internal_avx512f): Call
	ix86_output_ssemov for TYPE_SSEMOV.
	(*movoi_internal_avx): Call ix86_output_ssemov for TYPE_SSEMOV.
	Remove ext_sse_reg_operand and TARGET_AVX512VL check.
	(*movti_internal): Likewise.
	(*movdi_internal): Call ix86_output_ssemov for TYPE_SSEMOV.
	Remove ext_sse_reg_operand check.
	(*movsi_internal): Likewise.
	(*movtf_internal): Call ix86_output_ssemov for TYPE_SSEMOV.
	(*movdf_internal): Call ix86_output_ssemov for TYPE_SSEMOV.
	Remove TARGET_AVX512F, TARGET_PREFER_AVX256, TARGET_AVX512VL
	and ext_sse_reg_operand check.
	(*movsf_internal_avx): Call ix86_output_ssemov for TYPE_SSEMOV.
	Remove TARGET_PREFER_AVX256, TARGET_AVX512VL and
	ext_sse_reg_operand check.
	* config/i386/mmx.md (MMXMODE:*mov<mode>_internal): Call
	ix86_output_ssemov for TYPE_SSEMOV.  Remove ext_sse_reg_operand
	check.
	* config/i386/sse.md (VMOVE:mov<mode>_internal): Call
	ix86_output_ssemov for TYPE_SSEMOV.  Remove TARGET_AVX512VL
	check.
	* config/i386/predicates.md (ext_sse_reg_operand): Removed.

gcc/testsuite/

	PR target/89229
	PR target/89346
	* gcc.target/i386/avx512vl-vmovdqa64-1.c: Updated.
	* gcc.target/i386/pr89229-2a.c: New test.
	* gcc.target/i386/pr89229-2b.c: Likewise.
	* gcc.target/i386/pr89229-2c.c: Likewise.
	* gcc.target/i386/pr89229-3a.c: Likewise.
	* gcc.target/i386/pr89229-3b.c: Likewise.
	* gcc.target/i386/pr89229-3c.c: Likewise.
	* gcc.target/i386/pr89229-4a.c: Likewise.
	* gcc.target/i386/pr89229-4b.c: Likewise.
	* gcc.target/i386/pr89229-4c.c: Likewise.
	* gcc.target/i386/pr89229-5a.c: Likewise.
	* gcc.target/i386/pr89229-5b.c: Likewise.
	* gcc.target/i386/pr89229-5c.c: Likewise.
	* gcc.target/i386/pr89229-6a.c: Likewise.
	* gcc.target/i386/pr89229-6b.c: Likewise.
	* gcc.target/i386/pr89229-6c.c: Likewise.
	* gcc.target/i386/pr89229-7a.c: Likewise.
	* gcc.target/i386/pr89229-7b.c: Likewise.
	* gcc.target/i386/pr89229-7c.c: Likewise.
---
 gcc/config/i386/i386-protos.h                 |   2 +
 gcc/config/i386/i386.c                        | 274 ++++++++++++++++++
 gcc/config/i386/i386.md                       | 212 +-------------
 gcc/config/i386/mmx.md                        |  29 +-
 gcc/config/i386/predicates.md                 |   5 -
 gcc/config/i386/sse.md                        |  98 +------
 .../gcc.target/i386/avx512vl-vmovdqa64-1.c    |   6 +-
 gcc/testsuite/gcc.target/i386/pr89229-2a.c    |  15 +
 gcc/testsuite/gcc.target/i386/pr89229-2b.c    |  13 +
 gcc/testsuite/gcc.target/i386/pr89229-2c.c    |   6 +
 gcc/testsuite/gcc.target/i386/pr89229-3a.c    |  17 ++
 gcc/testsuite/gcc.target/i386/pr89229-3b.c    |   6 +
 gcc/testsuite/gcc.target/i386/pr89229-3c.c    |   7 +
 gcc/testsuite/gcc.target/i386/pr89229-4a.c    |  17 ++
 gcc/testsuite/gcc.target/i386/pr89229-4b.c    |   6 +
 gcc/testsuite/gcc.target/i386/pr89229-4c.c    |   7 +
 gcc/testsuite/gcc.target/i386/pr89229-5a.c    |  16 +
 gcc/testsuite/gcc.target/i386/pr89229-5b.c    |   6 +
 gcc/testsuite/gcc.target/i386/pr89229-5c.c    |   6 +
 gcc/testsuite/gcc.target/i386/pr89229-6a.c    |  16 +
 gcc/testsuite/gcc.target/i386/pr89229-6b.c    |   6 +
 gcc/testsuite/gcc.target/i386/pr89229-6c.c    |   6 +
 gcc/testsuite/gcc.target/i386/pr89229-7a.c    |  16 +
 gcc/testsuite/gcc.target/i386/pr89229-7b.c    |  12 +
 gcc/testsuite/gcc.target/i386/pr89229-7c.c    |   6 +
 25 files changed, 480 insertions(+), 330 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89229-2a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89229-2b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89229-2c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89229-3a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89229-3b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89229-3c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89229-4a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89229-4b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89229-4c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89229-5a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89229-5b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89229-5c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89229-6a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89229-6b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89229-6c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89229-7a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89229-7b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89229-7c.c

Comments

H.J. Lu May 21, 2019, 9:43 p.m. UTC | #1
On Fri, Feb 22, 2019 at 8:25 AM H.J. Lu <hongjiu.lu@intel.com> wrote:
> [quoted patch description and ChangeLog snipped]

PING:

https://gcc.gnu.org/ml/gcc-patches/2019-02/msg01841.html
H.J. Lu May 31, 2019, 5:38 p.m. UTC | #2
On Tue, May 21, 2019 at 2:43 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> [quoted text snipped]

PING.
H.J. Lu June 18, 2019, 3:59 p.m. UTC | #3
On Fri, May 31, 2019 at 10:38 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> [quoted text snipped]

PING.
H.J. Lu July 8, 2019, 3:19 p.m. UTC | #4
On Tue, Jun 18, 2019 at 8:59 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> [quoted text snipped]

PING.
Jeff Law July 22, 2019, 11:11 p.m. UTC | #5
On 2/22/19 9:24 AM, H.J. Lu wrote:
> [quoted patch description and ChangeLog snipped]
I've tried to follow what you're doing here, but frankly all this code
is an absolute mess.  Some comments about the different cases would
likely help me and anyone else who needs to look at this in the future.

I like that we're consolidating things, but it's just damn hard to map
from what we do now to what you're doing in this patch and verify that
you're just changing the cases that you really want to be changing.

Is there any way to break this down into more manageable hunks?  Perhaps
changing one pattern from the md file at a time and walking through any
changes in code generation for that change (as part of the patch
discussion, not necessarily as comments in the patch)?

Again, what I'm trying to do is cut this down into something that is
understandable to someone that isn't intimately familiar with the code
and what you're trying to change.

Just as an example, I'm having trouble following how this affects the
one pattern in sse.md you're changing.  I can't see that the cases that
should stay the same are staying the same, nor is it easy to tease out
which cases you want to change for that pattern.




>  
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 81dfed12837..80ebc187041 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -10286,6 +10286,280 @@ ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
>    return true;
>  }
>  
> +/* Return the opcode of the TYPE_SSEMOV instruction.  To move from
> +   or to xmm16-xmm31/ymm16-ymm31 registers, we either require
> +   TARGET_AVX512VL or it is a register to register move which can
> +   be done with zmm register move. */
> +
> +static const char *
> +ix86_get_ssemov (rtx *operands, unsigned size,
> +		 enum attr_mode insn_mode, machine_mode mode)
> +{
> +  char buf[128];
> +  bool misaligned_p = (misaligned_operand (operands[0], mode)
> +		       || misaligned_operand (operands[1], mode));
> +  bool evex_reg_p = (EXT_REX_SSE_REG_P (operands[0])
> +		     || EXT_REX_SSE_REG_P (operands[1]));
> +  machine_mode scalar_mode;
> +
> +  const char *opcode = NULL;
> +  enum
> +    {
> +      opcode_int,
> +      opcode_float,
> +      opcode_double
> +    } type = opcode_int;
> +
> +  switch (insn_mode)
> +    {
> +    case MODE_V16SF:
> +    case MODE_V8SF:
> +    case MODE_V4SF:
> +      scalar_mode = E_SFmode;
> +      break;
> +    case MODE_V8DF:
> +    case MODE_V4DF:
> +    case MODE_V2DF:
> +      scalar_mode = E_DFmode;
> +      break;
> +    case MODE_XI:
> +    case MODE_OI:
> +    case MODE_TI:
> +      scalar_mode = GET_MODE_INNER (mode);
> +      break;
> +    default:
> +      gcc_unreachable ();
> +    }
So why are the switches split across functions?  Is there some reason
why you don't have output_ssemov first compute the size with its
existing switch, then a switch like the one above to compute the scalar
mode to pass down to get_ssemov?  Or put the two switches in get_ssemov?




Jeff

Patch

diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 2d600173917..27f5cc13abf 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -38,6 +38,8 @@  extern void ix86_expand_split_stack_prologue (void);
 extern void ix86_output_addr_vec_elt (FILE *, int);
 extern void ix86_output_addr_diff_elt (FILE *, int, int);
 
+extern const char *ix86_output_ssemov (rtx_insn *, rtx *);
+
 extern enum calling_abi ix86_cfun_abi (void);
 extern enum calling_abi ix86_function_type_abi (const_tree);
 
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 81dfed12837..80ebc187041 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -10286,6 +10286,280 @@  ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
   return true;
 }
 
+/* Return the opcode of the TYPE_SSEMOV instruction.  To move from
+   or to xmm16-xmm31/ymm16-ymm31 registers, we either require
+   TARGET_AVX512VL or it is a register to register move which can
+   be done with zmm register move. */
+
+static const char *
+ix86_get_ssemov (rtx *operands, unsigned size,
+		 enum attr_mode insn_mode, machine_mode mode)
+{
+  char buf[128];
+  bool misaligned_p = (misaligned_operand (operands[0], mode)
+		       || misaligned_operand (operands[1], mode));
+  bool evex_reg_p = (EXT_REX_SSE_REG_P (operands[0])
+		     || EXT_REX_SSE_REG_P (operands[1]));
+  machine_mode scalar_mode;
+
+  const char *opcode = NULL;
+  enum
+    {
+      opcode_int,
+      opcode_float,
+      opcode_double
+    } type = opcode_int;
+
+  switch (insn_mode)
+    {
+    case MODE_V16SF:
+    case MODE_V8SF:
+    case MODE_V4SF:
+      scalar_mode = E_SFmode;
+      break;
+    case MODE_V8DF:
+    case MODE_V4DF:
+    case MODE_V2DF:
+      scalar_mode = E_DFmode;
+      break;
+    case MODE_XI:
+    case MODE_OI:
+    case MODE_TI:
+      scalar_mode = GET_MODE_INNER (mode);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  if (SCALAR_FLOAT_MODE_P (scalar_mode))
+    {
+      switch (scalar_mode)
+	{
+	case E_SFmode:
+	  if (size == 64 || !evex_reg_p || TARGET_AVX512VL)
+	    opcode = misaligned_p ? "%vmovups" : "%vmovaps";
+	  else
+	    type = opcode_float;
+	  break;
+	case E_DFmode:
+	  if (size == 64 || !evex_reg_p || TARGET_AVX512VL)
+	    opcode = misaligned_p ? "%vmovupd" : "%vmovapd";
+	  else
+	    type = opcode_double;
+	  break;
+	case E_TFmode:
+	  if (size == 64)
+	    opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
+	  else if (evex_reg_p)
+	    {
+	      if (TARGET_AVX512VL)
+		opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
+	    }
+	  else
+	    opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+    }
+  else if (SCALAR_INT_MODE_P (scalar_mode))
+    {
+      switch (scalar_mode)
+	{
+	case E_QImode:
+	  if (size == 64)
+	    opcode = (misaligned_p
+		      ? (TARGET_AVX512BW
+			 ? "vmovdqu8"
+			 : "vmovdqu64")
+		      : "vmovdqa64");
+	  else if (evex_reg_p)
+	    {
+	      if (TARGET_AVX512VL)
+		opcode = (misaligned_p
+			  ? (TARGET_AVX512BW
+			     ? "vmovdqu8"
+			     : "vmovdqu64")
+			  : "vmovdqa64");
+	    }
+	  else
+	    opcode = (misaligned_p
+		      ? (TARGET_AVX512BW
+			 ? "vmovdqu8"
+			 : "%vmovdqu")
+		      : "%vmovdqa");
+	  break;
+	case E_HImode:
+	  if (size == 64)
+	    opcode = (misaligned_p
+		      ? (TARGET_AVX512BW
+			 ? "vmovdqu16"
+			 : "vmovdqu64")
+		      : "vmovdqa64");
+	  else if (evex_reg_p)
+	    {
+	      if (TARGET_AVX512VL)
+		opcode = (misaligned_p
+			  ? (TARGET_AVX512BW
+			     ? "vmovdqu16"
+			     : "vmovdqu64")
+			  : "vmovdqa64");
+	    }
+	  else
+	    opcode = (misaligned_p
+		      ? (TARGET_AVX512BW
+			 ? "vmovdqu16"
+			 : "%vmovdqu")
+		      : "%vmovdqa");
+	  break;
+	case E_SImode:
+	  if (size == 64)
+	    opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
+	  else if (evex_reg_p)
+	    {
+	      if (TARGET_AVX512VL)
+		opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
+	    }
+	  else
+	    opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
+	  break;
+	case E_DImode:
+	case E_TImode:
+	case E_OImode:
+	  if (size == 64)
+	    opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
+	  else if (evex_reg_p)
+	    {
+	      if (TARGET_AVX512VL)
+		opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
+	    }
+	  else
+	    opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
+	  break;
+	case E_XImode:
+	  opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+    }
+  else
+    gcc_unreachable ();
+
+  if (!opcode)
+    {
+      /* NB: We get here only because we move xmm16-xmm31/ymm16-ymm31
+         registers without AVX512VL by using zmm register move.  */
+      if (!evex_reg_p
+	  || TARGET_AVX512VL
+	  || memory_operand (operands[0], mode)
+	  || memory_operand (operands[1], mode))
+	gcc_unreachable ();
+      size = 64;
+      switch (type)
+	{
+	case opcode_int:
+	  opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
+	  break;
+	case opcode_float:
+	  opcode = misaligned_p ? "%vmovups" : "%vmovaps";
+	  break;
+	case opcode_double:
+	  opcode = misaligned_p ? "%vmovupd" : "%vmovapd";
+	  break;
+	}
+    }
+
+  switch (size)
+    {
+    case 64:
+      snprintf (buf, sizeof (buf), "%s\t{%%g1, %%g0|%%g0, %%g1}",
+		opcode);
+      break;
+    case 32:
+      snprintf (buf, sizeof (buf), "%s\t{%%t1, %%t0|%%t0, %%t1}",
+		opcode);
+      break;
+    case 16:
+      snprintf (buf, sizeof (buf), "%s\t{%%x1, %%x0|%%x0, %%x1}",
+		opcode);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+  output_asm_insn (buf, operands);
+  return "";
+}
+
+/* Return the template of the TYPE_SSEMOV instruction to move
+   operands[1] into operands[0].  */
+
+const char *
+ix86_output_ssemov (rtx_insn *insn, rtx *operands)
+{
+  machine_mode mode = GET_MODE (operands[0]);
+  if (get_attr_type (insn) != TYPE_SSEMOV
+      || mode != GET_MODE (operands[1]))
+    gcc_unreachable ();
+
+  enum attr_mode insn_mode = get_attr_mode (insn);
+
+  switch (insn_mode)
+    {
+    case MODE_XI:
+    case MODE_V8DF:
+    case MODE_V16SF:
+      return ix86_get_ssemov (operands, 64, insn_mode, mode);
+
+    case MODE_OI:
+    case MODE_V4DF:
+    case MODE_V8SF:
+      return ix86_get_ssemov (operands, 32, insn_mode, mode);
+
+    case MODE_TI:
+    case MODE_V2DF:
+    case MODE_V4SF:
+      return ix86_get_ssemov (operands, 16, insn_mode, mode);
+
+    case MODE_DI:
+      /* Handle broken assemblers that require movd instead of movq. */
+      if (!HAVE_AS_IX86_INTERUNIT_MOVQ
+	  && (GENERAL_REG_P (operands[0])
+	      || GENERAL_REG_P (operands[1])))
+	return "%vmovd\t{%1, %0|%0, %1}";
+      else
+	return "%vmovq\t{%1, %0|%0, %1}";
+
+    case MODE_V2SF:
+      if (TARGET_AVX && REG_P (operands[0]))
+	return "vmovlps\t{%1, %d0|%d0, %1}";
+      else
+	return "%vmovlps\t{%1, %0|%0, %1}";
+
+    case MODE_DF:
+      if (TARGET_AVX && REG_P (operands[0]) && REG_P (operands[1]))
+	return "vmovsd\t{%d1, %0|%0, %d1}";
+      else
+	return "%vmovsd\t{%1, %0|%0, %1}";
+
+    case MODE_V1DF:
+      gcc_assert (!TARGET_AVX);
+       return "movlpd\t{%1, %0|%0, %1}";
+
+    case MODE_SI:
+      return "%vmovd\t{%1, %0|%0, %1}";
+
+    case MODE_SF:
+      if (TARGET_AVX && REG_P (operands[0]) && REG_P (operands[1]))
+	return "vmovss\t{%d1, %0|%0, %d1}";
+      else
+	return "%vmovss\t{%1, %0|%0, %1}";
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
 /* Returns true if OP contains a symbol reference */
 
 bool
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index b1ae88c400f..240384917df 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -1878,11 +1878,7 @@ 
       return standard_sse_constant_opcode (insn, operands);
 
     case TYPE_SSEMOV:
-      if (misaligned_operand (operands[0], XImode)
-	  || misaligned_operand (operands[1], XImode))
-	return "vmovdqu32\t{%1, %0|%0, %1}";
-      else
-	return "vmovdqa32\t{%1, %0|%0, %1}";
+      return ix86_output_ssemov (insn, operands);
 
     default:
       gcc_unreachable ();
@@ -1905,25 +1901,7 @@ 
       return standard_sse_constant_opcode (insn, operands);
 
     case TYPE_SSEMOV:
-      if (misaligned_operand (operands[0], OImode)
-	  || misaligned_operand (operands[1], OImode))
-	{
-	  if (get_attr_mode (insn) == MODE_V8SF)
-	    return "vmovups\t{%1, %0|%0, %1}";
-	  else if (get_attr_mode (insn) == MODE_XI)
-	    return "vmovdqu32\t{%1, %0|%0, %1}";
-	  else
-	    return "vmovdqu\t{%1, %0|%0, %1}";
-	}
-      else
-	{
-	  if (get_attr_mode (insn) == MODE_V8SF)
-	    return "vmovaps\t{%1, %0|%0, %1}";
-	  else if (get_attr_mode (insn) == MODE_XI)
-	    return "vmovdqa32\t{%1, %0|%0, %1}";
-	  else
-	    return "vmovdqa\t{%1, %0|%0, %1}";
-	}
+      return ix86_output_ssemov (insn, operands);
 
     default:
       gcc_unreachable ();
@@ -1933,13 +1911,7 @@ 
    (set_attr "type" "sselog1,sselog1,ssemov,ssemov")
    (set_attr "prefix" "vex")
    (set (attr "mode")
-	(cond [(ior (match_operand 0 "ext_sse_reg_operand")
-		    (match_operand 1 "ext_sse_reg_operand"))
-		 (const_string "XI")
-	       (and (eq_attr "alternative" "1")
-		    (match_test "TARGET_AVX512VL"))
-		 (const_string "XI")
-	       (ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+	(cond [(ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
 		    (and (eq_attr "alternative" "3")
 			 (match_test "TARGET_SSE_TYPELESS_STORES")))
 		 (const_string "V8SF")
@@ -1965,27 +1937,7 @@ 
       return standard_sse_constant_opcode (insn, operands);
 
     case TYPE_SSEMOV:
-      /* TDmode values are passed as TImode on the stack.  Moving them
-	 to stack may result in unaligned memory access.  */
-      if (misaligned_operand (operands[0], TImode)
-	  || misaligned_operand (operands[1], TImode))
-	{
-	  if (get_attr_mode (insn) == MODE_V4SF)
-	    return "%vmovups\t{%1, %0|%0, %1}";
-	  else if (get_attr_mode (insn) == MODE_XI)
-	    return "vmovdqu32\t{%1, %0|%0, %1}";
-	  else
-	    return "%vmovdqu\t{%1, %0|%0, %1}";
-	}
-      else
-	{
-	  if (get_attr_mode (insn) == MODE_V4SF)
-	    return "%vmovaps\t{%1, %0|%0, %1}";
-	  else if (get_attr_mode (insn) == MODE_XI)
-	    return "vmovdqa32\t{%1, %0|%0, %1}";
-	  else
-	    return "%vmovdqa\t{%1, %0|%0, %1}";
-	}
+      return ix86_output_ssemov (insn, operands);
 
     default:
       gcc_unreachable ();
@@ -2012,12 +1964,6 @@ 
    (set (attr "mode")
 	(cond [(eq_attr "alternative" "0,1")
 		 (const_string "DI")
-	       (ior (match_operand 0 "ext_sse_reg_operand")
-		    (match_operand 1 "ext_sse_reg_operand"))
-		 (const_string "XI")
-	       (and (eq_attr "alternative" "3")
-		    (match_test "TARGET_AVX512VL"))
-		 (const_string "XI")
 	       (ior (not (match_test "TARGET_SSE2"))
 		    (ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
 			 (and (eq_attr "alternative" "5")
@@ -2091,31 +2037,7 @@ 
       return standard_sse_constant_opcode (insn, operands);
 
     case TYPE_SSEMOV:
-      switch (get_attr_mode (insn))
-	{
-	case MODE_DI:
-	  /* Handle broken assemblers that require movd instead of movq.  */
-	  if (!HAVE_AS_IX86_INTERUNIT_MOVQ
-	      && (GENERAL_REG_P (operands[0]) || GENERAL_REG_P (operands[1])))
-	    return "%vmovd\t{%1, %0|%0, %1}";
-	  return "%vmovq\t{%1, %0|%0, %1}";
-
-	case MODE_TI:
-	  /* Handle AVX512 registers set.  */
-	  if (EXT_REX_SSE_REG_P (operands[0])
-	      || EXT_REX_SSE_REG_P (operands[1]))
-	    return "vmovdqa64\t{%1, %0|%0, %1}";
-	  return "%vmovdqa\t{%1, %0|%0, %1}";
-
-	case MODE_V2SF:
-	  gcc_assert (!TARGET_AVX);
-	  return "movlps\t{%1, %0|%0, %1}";
-	case MODE_V4SF:
-	  return "%vmovaps\t{%1, %0|%0, %1}";
-
-	default:
-	  gcc_unreachable ();
-	}
+      return ix86_output_ssemov (insn, operands);
 
     case TYPE_SSECVT:
       if (SSE_REG_P (operands[0]))
@@ -2201,10 +2123,7 @@ 
      (cond [(eq_attr "alternative" "2")
 	      (const_string "SI")
 	    (eq_attr "alternative" "12,13")
-	      (cond [(ior (match_operand 0 "ext_sse_reg_operand")
-			  (match_operand 1 "ext_sse_reg_operand"))
-		       (const_string "TI")
-		     (ior (not (match_test "TARGET_SSE2"))
+	      (cond [(ior (not (match_test "TARGET_SSE2"))
 			  (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
 		       (const_string "V4SF")
 		     (match_test "TARGET_AVX")
@@ -2327,25 +2246,7 @@ 
       gcc_unreachable ();
 
     case TYPE_SSEMOV:
-      switch (get_attr_mode (insn))
-	{
-	case MODE_SI:
-          return "%vmovd\t{%1, %0|%0, %1}";
-	case MODE_TI:
-	  return "%vmovdqa\t{%1, %0|%0, %1}";
-	case MODE_XI:
-	  return "vmovdqa32\t{%g1, %g0|%g0, %g1}";
-
-	case MODE_V4SF:
-	  return "%vmovaps\t{%1, %0|%0, %1}";
-
-	case MODE_SF:
-	  gcc_assert (!TARGET_AVX);
-          return "movss\t{%1, %0|%0, %1}";
-
-	default:
-	  gcc_unreachable ();
-	}
+      return ix86_output_ssemov (insn, operands);
 
     case TYPE_MMX:
       return "pxor\t%0, %0";
@@ -2411,10 +2312,7 @@ 
      (cond [(eq_attr "alternative" "2,3")
 	      (const_string "DI")
 	    (eq_attr "alternative" "8,9")
-	      (cond [(ior (match_operand 0 "ext_sse_reg_operand")
-			  (match_operand 1 "ext_sse_reg_operand"))
-		       (const_string "XI")
-		     (ior (not (match_test "TARGET_SSE2"))
+	      (cond [(ior (not (match_test "TARGET_SSE2"))
 			  (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
 		       (const_string "V4SF")
 		     (match_test "TARGET_AVX")
@@ -3235,31 +3133,7 @@ 
       return standard_sse_constant_opcode (insn, operands);
 
     case TYPE_SSEMOV:
-      /* Handle misaligned load/store since we
-         don't have movmisaligntf pattern. */
-      if (misaligned_operand (operands[0], TFmode)
-	  || misaligned_operand (operands[1], TFmode))
-	{
-	  if (get_attr_mode (insn) == MODE_V4SF)
-	    return "%vmovups\t{%1, %0|%0, %1}";
-	  else if (TARGET_AVX512VL
-		   && (EXT_REX_SSE_REG_P (operands[0])
-		       || EXT_REX_SSE_REG_P (operands[1])))
-	    return "vmovdqu64\t{%1, %0|%0, %1}";
-	  else
-	    return "%vmovdqu\t{%1, %0|%0, %1}";
-	}
-      else
-	{
-	  if (get_attr_mode (insn) == MODE_V4SF)
-	    return "%vmovaps\t{%1, %0|%0, %1}";
-	  else if (TARGET_AVX512VL
-		   && (EXT_REX_SSE_REG_P (operands[0])
-		       || EXT_REX_SSE_REG_P (operands[1])))
-	    return "vmovdqa64\t{%1, %0|%0, %1}";
-	  else
-	    return "%vmovdqa\t{%1, %0|%0, %1}";
-	}
+      return ix86_output_ssemov (insn, operands);
 
     case TYPE_MULTI:
 	return "#";
@@ -3412,37 +3286,7 @@ 
       return standard_sse_constant_opcode (insn, operands);
 
     case TYPE_SSEMOV:
-      switch (get_attr_mode (insn))
-	{
-	case MODE_DF:
-	  if (TARGET_AVX && REG_P (operands[0]) && REG_P (operands[1]))
-	    return "vmovsd\t{%d1, %0|%0, %d1}";
-	  return "%vmovsd\t{%1, %0|%0, %1}";
-
-	case MODE_V4SF:
-	  return "%vmovaps\t{%1, %0|%0, %1}";
-	case MODE_V8DF:
-	  return "vmovapd\t{%g1, %g0|%g0, %g1}";
-	case MODE_V2DF:
-	  return "%vmovapd\t{%1, %0|%0, %1}";
-
-	case MODE_V2SF:
-	  gcc_assert (!TARGET_AVX);
-	  return "movlps\t{%1, %0|%0, %1}";
-	case MODE_V1DF:
-	  gcc_assert (!TARGET_AVX);
-	  return "movlpd\t{%1, %0|%0, %1}";
-
-	case MODE_DI:
-	  /* Handle broken assemblers that require movd instead of movq.  */
-	  if (!HAVE_AS_IX86_INTERUNIT_MOVQ
-	      && (GENERAL_REG_P (operands[0]) || GENERAL_REG_P (operands[1])))
-	    return "%vmovd\t{%1, %0|%0, %1}";
-	  return "%vmovq\t{%1, %0|%0, %1}";
-
-	default:
-	  gcc_unreachable ();
-	}
+      return ix86_output_ssemov (insn, operands);
 
     default:
       gcc_unreachable ();
@@ -3498,9 +3342,6 @@ 
 	       (eq_attr "alternative" "12,16")
 		 (cond [(not (match_test "TARGET_SSE2"))
 		 	  (const_string "V4SF")
-			(and (match_test "TARGET_AVX512F")
-			  (not (match_test "TARGET_PREFER_AVX256")))
-			  (const_string "XI")
 			(match_test "TARGET_AVX")
 			  (const_string "V2DF")
 			(match_test "optimize_function_for_size_p (cfun)")
@@ -3516,12 +3357,7 @@ 
 
 	       /* movaps is one byte shorter for non-AVX targets.  */
 	       (eq_attr "alternative" "13,17")
-		 (cond [(and (ior (not (match_test "TARGET_PREFER_AVX256"))
-				  (not (match_test "TARGET_AVX512VL")))
-			     (ior (match_operand 0 "ext_sse_reg_operand")
-				  (match_operand 1 "ext_sse_reg_operand")))
-			  (const_string "V8DF")
-			(ior (not (match_test "TARGET_SSE2"))
+		 (cond [(ior (not (match_test "TARGET_SSE2"))
 			     (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
 			  (const_string "V4SF")
 			(match_test "TARGET_SSE_PARTIAL_REG_DEPENDENCY")
@@ -3613,24 +3449,7 @@ 
       return standard_sse_constant_opcode (insn, operands);
 
     case TYPE_SSEMOV:
-      switch (get_attr_mode (insn))
-	{
-	case MODE_SF:
-	  if (TARGET_AVX && REG_P (operands[0]) && REG_P (operands[1]))
-	    return "vmovss\t{%d1, %0|%0, %d1}";
-	  return "%vmovss\t{%1, %0|%0, %1}";
-
-	case MODE_V16SF:
-	  return "vmovaps\t{%g1, %g0|%g0, %g1}";
-	case MODE_V4SF:
-	  return "%vmovaps\t{%1, %0|%0, %1}";
-
-	case MODE_SI:
-	  return "%vmovd\t{%1, %0|%0, %1}";
-
-	default:
-	  gcc_unreachable ();
-	}
+      return ix86_output_ssemov (insn, operands);
 
     case TYPE_MMXMOV:
       switch (get_attr_mode (insn))
@@ -3703,12 +3522,7 @@ 
 		  better to maintain the whole registers in single format
 		  to avoid problems on using packed logical operations.  */
 	       (eq_attr "alternative" "6")
-		 (cond [(and (ior (not (match_test "TARGET_PREFER_AVX256"))
-				  (not (match_test "TARGET_AVX512VL")))
-			     (ior (match_operand 0 "ext_sse_reg_operand")
-				  (match_operand 1 "ext_sse_reg_operand")))
-			  (const_string "V16SF")
-			(ior (match_test "TARGET_SSE_PARTIAL_REG_DEPENDENCY")
+		 (cond [(ior (match_test "TARGET_SSE_PARTIAL_REG_DEPENDENCY")
 			     (match_test "TARGET_SSE_SPLIT_REGS"))
 			  (const_string "V4SF")
 		       ]
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index b566cc80020..57060b9d233 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -115,29 +115,7 @@ 
       return standard_sse_constant_opcode (insn, operands);
 
     case TYPE_SSEMOV:
-      switch (get_attr_mode (insn))
-	{
-	case MODE_DI:
-	  /* Handle broken assemblers that require movd instead of movq.  */
-	  if (!HAVE_AS_IX86_INTERUNIT_MOVQ
-	      && (GENERAL_REG_P (operands[0]) || GENERAL_REG_P (operands[1])))
-	    return "%vmovd\t{%1, %0|%0, %1}";
-	  return "%vmovq\t{%1, %0|%0, %1}";
-	case MODE_TI:
-	  return "%vmovdqa\t{%1, %0|%0, %1}";
-	case MODE_XI:
-	  return "vmovdqa64\t{%g1, %g0|%g0, %g1}";
-
-	case MODE_V2SF:
-	  if (TARGET_AVX && REG_P (operands[0]))
-	    return "vmovlps\t{%1, %0, %0|%0, %0, %1}";
-	  return "%vmovlps\t{%1, %0|%0, %1}";
-	case MODE_V4SF:
-	  return "%vmovaps\t{%1, %0|%0, %1}";
-
-	default:
-	  gcc_unreachable ();
-	}
+      return ix86_output_ssemov (insn, operands);
 
     default:
       gcc_unreachable ();
@@ -186,10 +164,7 @@ 
      (cond [(eq_attr "alternative" "2")
 	      (const_string "SI")
 	    (eq_attr "alternative" "11,12")
-	      (cond [(ior (match_operand 0 "ext_sse_reg_operand")
-			  (match_operand 1 "ext_sse_reg_operand"))
-			(const_string "XI")
-		     (match_test "<MODE>mode == V2SFmode")
+	      (cond [(match_test "<MODE>mode == V2SFmode")
 		       (const_string "V4SF")
 		     (ior (not (match_test "TARGET_SSE2"))
 			  (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 865947debcc..99226e86436 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -54,11 +54,6 @@ 
   (and (match_code "reg")
        (match_test "SSE_REGNO_P (REGNO (op))")))
 
-;; True if the operand is an AVX-512 new register.
-(define_predicate "ext_sse_reg_operand"
-  (and (match_code "reg")
-       (match_test "EXT_REX_SSE_REGNO_P (REGNO (op))")))
-
 ;; Return true if op is a QImode register.
 (define_predicate "any_QIreg_operand"
   (and (match_code "reg")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index ac299495b2c..9bd190298d7 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -982,98 +982,7 @@ 
       return standard_sse_constant_opcode (insn, operands);
 
     case TYPE_SSEMOV:
-      /* There is no evex-encoded vmov* for sizes smaller than 64-bytes
-	 in avx512f, so we need to use workarounds, to access sse registers
-	 16-31, which are evex-only. In avx512vl we don't need workarounds.  */
-      if (TARGET_AVX512F && <MODE_SIZE> < 64 && !TARGET_AVX512VL
-	  && (EXT_REX_SSE_REG_P (operands[0])
-	      || EXT_REX_SSE_REG_P (operands[1])))
-	{
-	  if (memory_operand (operands[0], <MODE>mode))
-	    {
-	      if (<MODE_SIZE> == 32)
-		return "vextract<shuffletype>64x4\t{$0x0, %g1, %0|%0, %g1, 0x0}";
-	      else if (<MODE_SIZE> == 16)
-		return "vextract<shuffletype>32x4\t{$0x0, %g1, %0|%0, %g1, 0x0}";
-	      else
-		gcc_unreachable ();
-	    }
-	  else if (memory_operand (operands[1], <MODE>mode))
-	    {
-	      if (<MODE_SIZE> == 32)
-		return "vbroadcast<shuffletype>64x4\t{%1, %g0|%g0, %1}";
-	      else if (<MODE_SIZE> == 16)
-		return "vbroadcast<shuffletype>32x4\t{%1, %g0|%g0, %1}";
-	      else
-		gcc_unreachable ();
-	    }
-	  else
-	    /* Reg -> reg move is always aligned.  Just use wider move.  */
-	    switch (get_attr_mode (insn))
-	      {
-	      case MODE_V8SF:
-	      case MODE_V4SF:
-		return "vmovaps\t{%g1, %g0|%g0, %g1}";
-	      case MODE_V4DF:
-	      case MODE_V2DF:
-		return "vmovapd\t{%g1, %g0|%g0, %g1}";
-	      case MODE_OI:
-	      case MODE_TI:
-		return "vmovdqa64\t{%g1, %g0|%g0, %g1}";
-	      default:
-		gcc_unreachable ();
-	      }
-	}
-
-      switch (get_attr_mode (insn))
-	{
-	case MODE_V16SF:
-	case MODE_V8SF:
-	case MODE_V4SF:
-	  if (misaligned_operand (operands[0], <MODE>mode)
-	      || misaligned_operand (operands[1], <MODE>mode))
-	    return "%vmovups\t{%1, %0|%0, %1}";
-	  else
-	    return "%vmovaps\t{%1, %0|%0, %1}";
-
-	case MODE_V8DF:
-	case MODE_V4DF:
-	case MODE_V2DF:
-	  if (misaligned_operand (operands[0], <MODE>mode)
-	      || misaligned_operand (operands[1], <MODE>mode))
-	    return "%vmovupd\t{%1, %0|%0, %1}";
-	  else
-	    return "%vmovapd\t{%1, %0|%0, %1}";
-
-	case MODE_OI:
-	case MODE_TI:
-	  if (misaligned_operand (operands[0], <MODE>mode)
-	      || misaligned_operand (operands[1], <MODE>mode))
-	    return TARGET_AVX512VL
-		   && (<MODE>mode == V4SImode
-		       || <MODE>mode == V2DImode
-		       || <MODE>mode == V8SImode
-		       || <MODE>mode == V4DImode
-		       || TARGET_AVX512BW)
-		   ? "vmovdqu<ssescalarsize>\t{%1, %0|%0, %1}"
-		   : "%vmovdqu\t{%1, %0|%0, %1}";
-	  else
-	    return TARGET_AVX512VL ? "vmovdqa64\t{%1, %0|%0, %1}"
-				   : "%vmovdqa\t{%1, %0|%0, %1}";
-	case MODE_XI:
-	  if (misaligned_operand (operands[0], <MODE>mode)
-	      || misaligned_operand (operands[1], <MODE>mode))
-	    return (<MODE>mode == V16SImode
-		    || <MODE>mode == V8DImode
-		    || TARGET_AVX512BW)
-		   ? "vmovdqu<ssescalarsize>\t{%1, %0|%0, %1}"
-		   : "vmovdqu64\t{%1, %0|%0, %1}";
-	  else
-	    return "vmovdqa64\t{%1, %0|%0, %1}";
-
-	default:
-	  gcc_unreachable ();
-	}
+      return ix86_output_ssemov (insn, operands);
 
     default:
       gcc_unreachable ();
@@ -1082,10 +991,7 @@ 
   [(set_attr "type" "sselog1,sselog1,ssemov,ssemov")
    (set_attr "prefix" "maybe_vex")
    (set (attr "mode")
-	(cond [(and (eq_attr "alternative" "1")
-		    (match_test "TARGET_AVX512VL"))
-		 (const_string "<sseinsnmode>")
-	       (and (match_test "<MODE_SIZE> == 16")
+	(cond [(and (match_test "<MODE_SIZE> == 16")
 		    (ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
 			 (and (eq_attr "alternative" "3")
 			      (match_test "TARGET_SSE_TYPELESS_STORES"))))
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa64-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa64-1.c
index 14fe4b84544..cf57517d6df 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa64-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa64-1.c
@@ -4,13 +4,13 @@ 
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\\(\[^\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { target nonpic } } } */
-/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\\(\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { target nonpic } } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[ \\t\]+\\(\[^\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { target nonpic } } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[ \\t\]+\\(\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { target nonpic } } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\nxy\]*\\(.{5,6}(?:\n|\[ \\t\]+#)" 1 { target nonpic } } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\nxy\]*\\(.{5,6}(?:\n|\[ \\t\]+#)" 1 { target nonpic } } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\nxy\]*\\((?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr89229-2a.c b/gcc/testsuite/gcc.target/i386/pr89229-2a.c
new file mode 100644
index 00000000000..0cf78039481
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr89229-2a.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+typedef __int128 __m128t __attribute__ ((__vector_size__ (16),
+					 __may_alias__));
+
+__m128t
+foo1 (void)
+{
+  register __int128 xmm16 __asm ("xmm16") = (__int128) -1;
+  asm volatile ("" : "+v" (xmm16));
+  return (__m128t) xmm16;
+}
+
+/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
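
The -2a test pins a TImode value in xmm16.  Since -march=skylake-avx512
implies AVX512VL, the 128-bit EVEX encoding is available and no zmm
workaround is needed; the register-to-register move is presumably
something like

	vmovdqa64	%xmm16, %xmm0

and the scan-assembler-not pattern only checks that no 512-bit zmm
register shows up.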
diff --git a/gcc/testsuite/gcc.target/i386/pr89229-2b.c b/gcc/testsuite/gcc.target/i386/pr89229-2b.c
new file mode 100644
index 00000000000..8d5d6c41d30
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr89229-2b.c
@@ -0,0 +1,13 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512 -mno-avx512vl" } */
+
+typedef __int128 __m128t __attribute__ ((__vector_size__ (16),
+					 __may_alias__));
+
+__m128t
+foo1 (void)
+{
+  register __int128 xmm16 __asm ("xmm16") = (__int128) -1; /* { dg-error "register specified for 'xmm16'" } */
+  asm volatile ("" : "+v" (xmm16));
+  return (__m128t) xmm16;
+}
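
Without AVX512VL there is no 128-bit EVEX encoding at all, so a TImode
value cannot live in xmm16 and the -2b test expects the register asm
variable to be rejected with a 'register specified for ...' error
rather than silently falling back to a zmm move.  The -2c variant below
checks that -mprefer-vector-width=512 does not change the -2a result.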
diff --git a/gcc/testsuite/gcc.target/i386/pr89229-2c.c b/gcc/testsuite/gcc.target/i386/pr89229-2c.c
new file mode 100644
index 00000000000..218da46dcd0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr89229-2c.c
@@ -0,0 +1,6 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512 -mprefer-vector-width=512" } */
+
+#include "pr89229-2a.c"
+
+/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr89229-3a.c b/gcc/testsuite/gcc.target/i386/pr89229-3a.c
new file mode 100644
index 00000000000..fd56f447016
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr89229-3a.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+extern int i;
+
+int
+foo1 (void)
+{
+  register int xmm16 __asm ("xmm16") = i;
+  asm volatile ("" : "+v" (xmm16));
+  register int xmm17 __asm ("xmm17") = xmm16;
+  asm volatile ("" : "+v" (xmm17));
+  return xmm17;
+}
+
+/* { dg-final { scan-assembler-times "vmovdqa32\[^\n\r]*xmm1\[67]\[^\n\r]*xmm1\[67]" 1 } } */
+/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
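
For an SImode scalar the -3a test expects the xmm16 -> xmm17 copy to
use the EVEX encoding directly when AVX512VL is available:

	vmovdqa32	%xmm16, %xmm17

with no zmm register involved.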
diff --git a/gcc/testsuite/gcc.target/i386/pr89229-3b.c b/gcc/testsuite/gcc.target/i386/pr89229-3b.c
new file mode 100644
index 00000000000..9265fc0354b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr89229-3b.c
@@ -0,0 +1,6 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512 -mno-avx512vl" } */
+
+#include "pr89229-3a.c"
+
+/* { dg-final { scan-assembler-times "vmovdqa32\[^\n\r]*zmm1\[67]\[^\n\r]*zmm1\[67]" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr89229-3c.c b/gcc/testsuite/gcc.target/i386/pr89229-3c.c
new file mode 100644
index 00000000000..d3fdf1ee273
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr89229-3c.c
@@ -0,0 +1,7 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512 -mprefer-vector-width=512" } */
+
+#include "pr89229-3a.c"
+
+/* { dg-final { scan-assembler-times "vmovdqa32\[^\n\r]*xmm1\[67]\[^\n\r]*xmm1\[67]" 1 } } */
+/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr89229-4a.c b/gcc/testsuite/gcc.target/i386/pr89229-4a.c
new file mode 100644
index 00000000000..cb9b071e873
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr89229-4a.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512 -mprefer-vector-width=512" } */
+
+extern long long i;
+
+long long
+foo1 (void)
+{
+  register long long xmm16 __asm ("xmm16") = i;
+  asm volatile ("" : "+v" (xmm16));
+  register long long xmm17 __asm ("xmm17") = xmm16;
+  asm volatile ("" : "+v" (xmm17));
+  return xmm17;
+}
+
+/* { dg-final { scan-assembler-times "vmovdqa64\[^\n\r]*xmm1\[67]\[^\n\r]*xmm1\[67]" 1 } } */
+/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
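
The DImode variant follows the same scheme: with AVX512VL the copy
stays in xmm registers as

	vmovdqa64	%xmm16, %xmm17

while the -4b test below expects the no-AVX512VL fallback to be a
vmovdqa32 zmm move.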
diff --git a/gcc/testsuite/gcc.target/i386/pr89229-4b.c b/gcc/testsuite/gcc.target/i386/pr89229-4b.c
new file mode 100644
index 00000000000..023e81253a0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr89229-4b.c
@@ -0,0 +1,6 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512 -mno-avx512vl" } */
+
+#include "pr89229-4a.c"
+
+/* { dg-final { scan-assembler-times "vmovdqa32\[^\n\r]*zmm1\[67]\[^\n\r]*zmm1\[67]" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr89229-4c.c b/gcc/testsuite/gcc.target/i386/pr89229-4c.c
new file mode 100644
index 00000000000..e02eb37c16d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr89229-4c.c
@@ -0,0 +1,7 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512 -mprefer-vector-width=512" } */
+
+#include "pr89229-4a.c"
+
+/* { dg-final { scan-assembler-times "vmovdqa64\[^\n\r]*xmm1\[67]\[^\n\r]*xmm1\[67]" 1 } } */
+/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr89229-5a.c b/gcc/testsuite/gcc.target/i386/pr89229-5a.c
new file mode 100644
index 00000000000..856115b2f5a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr89229-5a.c
@@ -0,0 +1,16 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+extern float d;
+
+void
+foo1 (float x)
+{
+  register float xmm16 __asm ("xmm16") = x;
+  asm volatile ("" : "+v" (xmm16));
+  register float xmm17 __asm ("xmm17") = xmm16;
+  asm volatile ("" : "+v" (xmm17));
+  d = xmm17;
+}
+
+/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
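
The -5 and -6 tests cover SFmode and DFmode scalars in xmm16/xmm17:
with AVX512VL the copies stay in xmm registers, and without it the
-5b/-6b scans below expect the widened forms

	vmovaps	%zmm16, %zmm17
	vmovapd	%zmm16, %zmm17

respectively.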
diff --git a/gcc/testsuite/gcc.target/i386/pr89229-5b.c b/gcc/testsuite/gcc.target/i386/pr89229-5b.c
new file mode 100644
index 00000000000..cb0f3b55ccc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr89229-5b.c
@@ -0,0 +1,6 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512 -mno-avx512vl" } */
+
+#include "pr89229-5a.c"
+
+/* { dg-final { scan-assembler-times "vmovaps\[^\n\r]*zmm1\[67]\[^\n\r]*zmm1\[67]" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr89229-5c.c b/gcc/testsuite/gcc.target/i386/pr89229-5c.c
new file mode 100644
index 00000000000..529a520133c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr89229-5c.c
@@ -0,0 +1,6 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512 -mprefer-vector-width=512" } */
+
+#include "pr89229-5a.c"
+
+/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr89229-6a.c b/gcc/testsuite/gcc.target/i386/pr89229-6a.c
new file mode 100644
index 00000000000..f88d7c8d74c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr89229-6a.c
@@ -0,0 +1,16 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+extern double d;
+
+void
+foo1 (double x)
+{
+  register double xmm16 __asm ("xmm16") = x;
+  asm volatile ("" : "+v" (xmm16));
+  register double xmm17 __asm ("xmm17") = xmm16;
+  asm volatile ("" : "+v" (xmm17));
+  d = xmm17;
+}
+
+/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr89229-6b.c b/gcc/testsuite/gcc.target/i386/pr89229-6b.c
new file mode 100644
index 00000000000..316d85d921e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr89229-6b.c
@@ -0,0 +1,6 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512 -mno-avx512vl" } */
+
+#include "pr89229-6a.c"
+
+/* { dg-final { scan-assembler-times "vmovapd\[^\n\r]*zmm1\[67]\[^\n\r]*zmm1\[67]" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr89229-6c.c b/gcc/testsuite/gcc.target/i386/pr89229-6c.c
new file mode 100644
index 00000000000..7a4d254670c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr89229-6c.c
@@ -0,0 +1,6 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512 -mprefer-vector-width=512" } */
+
+#include "pr89229-6a.c"
+
+/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr89229-7a.c b/gcc/testsuite/gcc.target/i386/pr89229-7a.c
new file mode 100644
index 00000000000..fcb85c366b6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr89229-7a.c
@@ -0,0 +1,16 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+extern __float128 d;
+
+void
+foo1 (__float128 x)
+{
+  register __float128 xmm16 __asm ("xmm16") = x;
+  asm volatile ("" : "+v" (xmm16));
+  register __float128 xmm17 __asm ("xmm17") = xmm16;
+  asm volatile ("" : "+v" (xmm17));
+  d = xmm17;
+}
+
+/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
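
TFmode behaves like TImode: with AVX512VL the value can be kept in
xmm16 without touching zmm registers, while the -7b test below expects
the register asm to be rejected, just as in the -2b case, because
xmm16 cannot hold a 128-bit mode without AVX512VL.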
diff --git a/gcc/testsuite/gcc.target/i386/pr89229-7b.c b/gcc/testsuite/gcc.target/i386/pr89229-7b.c
new file mode 100644
index 00000000000..37eb83c783b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr89229-7b.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512 -mno-avx512vl" } */
+
+extern __float128 d;
+
+void
+foo1 (__float128 x)
+{
+  register __float128 xmm16 __asm ("xmm16") = x; /* { dg-error "register specified for 'xmm16'" } */
+  asm volatile ("" : "+v" (xmm16));
+  d = xmm16;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr89229-7c.c b/gcc/testsuite/gcc.target/i386/pr89229-7c.c
new file mode 100644
index 00000000000..e37ff2bf5bd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr89229-7c.c
@@ -0,0 +1,6 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512 -mprefer-vector-width=512" } */
+
+#include "pr89229-7a.c"
+
+/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */