[01/10] i386: Properly encode vector registers in vector move

Message ID 20200215152628.32068-2-hjl.tools@gmail.com
State New
Series i386: Properly encode xmm16-xmm31/ymm16-ymm31 for vector move

Commit Message

H.J. Lu Feb. 15, 2020, 3:26 p.m. UTC
On x86, when AVX and AVX512 are enabled, vector move instructions can
be encoded with either 2-byte/3-byte VEX (AVX) or 4-byte EVEX (AVX512):

   0:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
   4:	62 f1 fd 08 6f d1    	vmovdqa64 %xmm1,%xmm2

We prefer the VEX encoding over EVEX since VEX is shorter.  Also,
AVX512F alone only supports 512-bit vector moves; AVX512F + AVX512VL
supports 128-bit and 256-bit vector moves as well.  Mode attributes on
the x86 vector move patterns indicate the target's preferred encoding
for vector moves.  For a vector register to vector register move, we
can use a 512-bit vector move instruction to move a 128-bit/256-bit
vector if AVX512VL isn't available.  With AVX512F and AVX512VL, we
should use the VEX encoding for 128-bit/256-bit vector moves when the
upper 16 vector registers aren't used.  This patch adds a function,
ix86_output_ssemov, to generate vector moves:

1. If zmm registers are used, use EVEX encoding.
2. If xmm16-xmm31/ymm16-ymm31 registers aren't used, SSE or VEX encoding
will be generated.
3. If xmm16-xmm31/ymm16-ymm31 registers are used:
   a. With AVX512VL, AVX512VL vector moves will be generated.
   b. Without AVX512VL, xmm16-xmm31/ymm16-ymm31 register to register
      move will be done with zmm register move.
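
As a minimal standalone illustration of this preference (not part of
the patch; the function and variable names below are made up, and the
register-asm trick mirrors a test posted later in this thread, which
needs AVX512VL to be accepted):

  /* Compile with e.g. -O2 -mavx512vl.  The copy in low_regs only
     touches xmm0-xmm15, so it should use the shorter VEX "vmovdqa";
     needlessly emitting the EVEX "vmovdqa64" here is essentially what
     PR 89346 reports.  The copy in high_reg is forced through xmm16,
     which has no VEX encoding, so an EVEX form is required.  */
  #include <immintrin.h>

  extern __m128i a, b;

  void
  low_regs (void)
  {
    __m128i t = _mm_load_si128 (&a);
    _mm_store_si128 (&b, t);
  }

  void
  high_reg (__m128i x)
  {
    register __m128i t __asm__ ("xmm16") = x;
    __asm__ ("" : "+v" (t));    /* keep t in xmm16 */
    b = t;
  }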

Tested on AVX2 and AVX512 with and without --with-arch=native.

gcc/

	PR target/89229
	PR target/89346
	* config/i386/i386-protos.h (ix86_output_ssemov): New prototype.
	* config/i386/i386.c (ix86_get_ssemov): New function.
	(ix86_output_ssemov): Likewise.
	* config/i386/sse.md (VMOVE:mov<mode>_internal): Call
	ix86_output_ssemov for TYPE_SSEMOV.  Remove TARGET_AVX512VL
	check.

gcc/testsuite/

	PR target/89229
	PR target/89346
	* gcc.target/i386/avx512vl-vmovdqa64-1.c: Updated.
	* gcc.target/i386/pr89346.c: New test.
---
 gcc/config/i386/i386-protos.h                 |   2 +
 gcc/config/i386/i386.c                        | 274 ++++++++++++++++++
 gcc/config/i386/sse.md                        |  98 +------
 .../gcc.target/i386/avx512vl-vmovdqa64-1.c    |   7 +-
 gcc/testsuite/gcc.target/i386/pr89346.c       |  15 +
 5 files changed, 296 insertions(+), 100 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89346.c

Comments

Jeff Law Feb. 26, 2020, 10:41 p.m. UTC | #1
On Sat, 2020-02-15 at 07:26 -0800, H.J. Lu wrote:
> On x86, when AVX and AVX512 are enabled, vector move instructions can
> be encoded with either 2-byte/3-byte VEX (AVX) or 4-byte EVEX (AVX512):
> 
>    0:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
>    4:	62 f1 fd 08 6f d1    	vmovdqa64 %xmm1,%xmm2
> 
> We prefer VEX encoding over EVEX since VEX is shorter.  Also AVX512F
> only supports 512-bit vector moves.  AVX512F + AVX512VL supports 128-bit
> and 256-bit vector moves.  Mode attributes on x86 vector move patterns
> indicate target preferences of vector move encoding.  For vector register
> to vector register move, we can use 512-bit vector move instructions to
> move 128-bit/256-bit vector if AVX512VL isn't available.  With AVX512F
> and AVX512VL, we should use VEX encoding for 128-bit/256-bit vector moves
> if upper 16 vector registers aren't used.  This patch adds a function,
> ix86_output_ssemov, to generate vector moves:
> 
> 1. If zmm registers are used, use EVEX encoding.
> 2. If xmm16-xmm31/ymm16-ymm31 registers aren't used, SSE or VEX encoding
> will be generated.
> 3. If xmm16-xmm31/ymm16-ymm31 registers are used:
>    a. With AVX512VL, AVX512VL vector moves will be generated.
>    b. Without AVX512VL, xmm16-xmm31/ymm16-ymm31 register to register
>       move will be done with zmm register move.
> 
> 
[ ... ]

>  
> +/* Return the opcode of the TYPE_SSEMOV instruction.  To move from
> +   or to xmm16-xmm31/ymm16-ymm31 registers, we either require
> +   TARGET_AVX512VL or it is a register to register move which can
> +   be done with zmm register move. */
> +
> +static const char *
> +ix86_get_ssemov (rtx *operands, unsigned size,
> +		 enum attr_mode insn_mode, machine_mode mode)
> +{
> +  char buf[128];
> +  bool misaligned_p = (misaligned_operand (operands[0], mode)
> +		       || misaligned_operand (operands[1], mode));
> +  bool evex_reg_p = (EXT_REX_SSE_REG_P (operands[0])
> +		     || EXT_REX_SSE_REG_P (operands[1]));
> +  machine_mode scalar_mode;
> +
> +  else if (SCALAR_INT_MODE_P (scalar_mode))
> +    {
> +      switch (scalar_mode)
> +	{
> +	case E_QImode:
> +	  if (size == 64)
> +	    opcode = (misaligned_p
> +		      ? (TARGET_AVX512BW
> +			 ? "vmovdqu8"
> +			 : "vmovdqu64")
> +		      : "vmovdqa64");
> +	  else if (evex_reg_p)
> +	    {
> +	      if (TARGET_AVX512VL)
> +		opcode = (misaligned_p
> +			  ? (TARGET_AVX512BW
> +			     ? "vmovdqu8"
> +			     : "vmovdqu64")
> +			  : "vmovdqa64");
> +	    }
> +	  else
> +	    opcode = (misaligned_p
> +		      ? (TARGET_AVX512BW
> +			 ? "vmovdqu8"
> +			 : "%vmovdqu")
> +		      : "%vmovdqa");
> +	  break;
> +	case E_HImode:
> +	  if (size == 64)
> +	    opcode = (misaligned_p
> +		      ? (TARGET_AVX512BW
> +			 ? "vmovdqu16"
> +			 : "vmovdqu64")
> +		      : "vmovdqa64");
> +	  else if (evex_reg_p)
> +	    {
> +	      if (TARGET_AVX512VL)
> +		opcode = (misaligned_p
> +			  ? (TARGET_AVX512BW
> +			     ? "vmovdqu16"
> +			     : "vmovdqu64")
> +			  : "vmovdqa64");
> +	    }
> +	  else
> +	    opcode = (misaligned_p
> +		      ? (TARGET_AVX512BW
> +			 ? "vmovdqu16"
> +			 : "%vmovdqu")
> +		      : "%vmovdqa");
> +	  break;
> +	case E_SImode:
> +	  if (size == 64)
> +	    opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
> +	  else if (evex_reg_p)
> +	    {
> +	      if (TARGET_AVX512VL)
> +		opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
> +	    }
> +	  else
> +	    opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> +	  break;
> +	case E_DImode:
> +	case E_TImode:
> +	case E_OImode:
> +	  if (size == 64)
> +	    opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> +	  else if (evex_reg_p)
> +	    {
> +	      if (TARGET_AVX512VL)
> +		opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> +	    }
> +	  else
> +	    opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> +	  break;
> +	case E_XImode:
> +	  opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> +	  break;
> +	default:
> +	  gcc_unreachable ();
> +	}
> +    }
> +  else
> +    gcc_unreachable ();
> +
> +  if (!opcode)
> +    {
> +      /* NB: We get here only because we move xmm16-xmm31/ymm16-ymm31
> +	 registers without AVX512VL by using zmm register move.  */
So the overall flow control in here is rather convoluted.  I hate the way you
don't set OPCODE above and then do it down here.  I would suggest breaking 
the !opcode bits into its own little function.  Then above in those places
where you do

if (TARGET_AVX512VL)
   opcode = <whatever>;


Instead change those to something like

if (TARGET_AVX512VL)
   opcode = <whatever>;
else
   opcode = new_function (...)

That way opcode is set on every path through the major if-else in this
function.
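
A minimal sketch of that shape (the helper name, its signature and the
comments below are invented for illustration; they are not taken from
the actual patches):

  /* Hypothetical helper: return the opcode used when an
     xmm16-xmm31/ymm16-ymm31 operand must be moved without AVX512VL,
     which the patch handles as a full zmm register-to-register move
     (the caller then forces the %g, i.e. zmm, operand modifier).
     Only the integer-domain case is shown; the float and double cases
     would return "%vmovups"/"%vmovaps" and "%vmovupd"/"%vmovapd".  */
  static const char *
  ix86_get_ssemov_noavx512vl (bool misaligned_p)
  {
    return misaligned_p ? "vmovdqu32" : "vmovdqa32";
  }

Each "if (TARGET_AVX512VL) opcode = ...;" arm would then gain an
"else opcode = ix86_get_ssemov_noavx512vl (...);", so OPCODE is
assigned on every path as suggested above.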

Second when I suggested you break the patch up on a per-pattern basis, I
probably should have also said that I would start with the minimal support in
ix86_get_ssemov and ix86_output_ssemov to support the pattern you just
converted.  That way the mapping from current code to new code is more obvious.
 

As it stands the breaking into separate patches didn't really help much because
we still have all the complexity in ix86_get_ssemov and ix86_output_ssemov in
patch #1 and that's the code I'm most worried about verifying we get right,
particularly at this stage.  I literally can't take any patch and map from the
old code to the new code without having to understand all of patch #1.



Jeff
H.J. Lu Feb. 27, 2020, 12:02 a.m. UTC | #2
On Wed, Feb 26, 2020 at 2:42 PM Jeff Law <law@redhat.com> wrote:
>
> On Sat, 2020-02-15 at 07:26 -0800, H.J. Lu wrote:
> > On x86, when AVX and AVX512 are enabled, vector move instructions can
> > be encoded with either 2-byte/3-byte VEX (AVX) or 4-byte EVEX (AVX512):
> >
> >    0: c5 f9 6f d1             vmovdqa %xmm1,%xmm2
> >    4: 62 f1 fd 08 6f d1       vmovdqa64 %xmm1,%xmm2
> >
> > We prefer VEX encoding over EVEX since VEX is shorter.  Also AVX512F
> > only supports 512-bit vector moves.  AVX512F + AVX512VL supports 128-bit
> > and 256-bit vector moves.  Mode attributes on x86 vector move patterns
> > indicate target preferences of vector move encoding.  For vector register
> > to vector register move, we can use 512-bit vector move instructions to
> > move 128-bit/256-bit vector if AVX512VL isn't available.  With AVX512F
> > and AVX512VL, we should use VEX encoding for 128-bit/256-bit vector moves
> > if upper 16 vector registers aren't used.  This patch adds a function,
> > ix86_output_ssemov, to generate vector moves:
> >
> > 1. If zmm registers are used, use EVEX encoding.
> > 2. If xmm16-xmm31/ymm16-ymm31 registers aren't used, SSE or VEX encoding
> > will be generated.
> > 3. If xmm16-xmm31/ymm16-ymm31 registers are used:
> >    a. With AVX512VL, AVX512VL vector moves will be generated.
> >    b. Without AVX512VL, xmm16-xmm31/ymm16-ymm31 register to register
> >       move will be done with zmm register move.
> >
> >
> [ ... ]
>
> >
> > +/* Return the opcode of the TYPE_SSEMOV instruction.  To move from
> > +   or to xmm16-xmm31/ymm16-ymm31 registers, we either require
> > +   TARGET_AVX512VL or it is a register to register move which can
> > +   be done with zmm register move. */
> > +
> > +static const char *
> > +ix86_get_ssemov (rtx *operands, unsigned size,
> > +              enum attr_mode insn_mode, machine_mode mode)
> > +{
> > +  char buf[128];
> > +  bool misaligned_p = (misaligned_operand (operands[0], mode)
> > +                    || misaligned_operand (operands[1], mode));
> > +  bool evex_reg_p = (EXT_REX_SSE_REG_P (operands[0])
> > +                  || EXT_REX_SSE_REG_P (operands[1]));
> > +  machine_mode scalar_mode;
> > +
> > +  else if (SCALAR_INT_MODE_P (scalar_mode))
> > +    {
> > +      switch (scalar_mode)
> > +     {
> > +     case E_QImode:
> > +       if (size == 64)
> > +         opcode = (misaligned_p
> > +                   ? (TARGET_AVX512BW
> > +                      ? "vmovdqu8"
> > +                      : "vmovdqu64")
> > +                   : "vmovdqa64");
> > +       else if (evex_reg_p)
> > +         {
> > +           if (TARGET_AVX512VL)
> > +             opcode = (misaligned_p
> > +                       ? (TARGET_AVX512BW
> > +                          ? "vmovdqu8"
> > +                          : "vmovdqu64")
> > +                       : "vmovdqa64");
> > +         }
> > +       else
> > +         opcode = (misaligned_p
> > +                   ? (TARGET_AVX512BW
> > +                      ? "vmovdqu8"
> > +                      : "%vmovdqu")
> > +                   : "%vmovdqa");
> > +       break;
> > +     case E_HImode:
> > +       if (size == 64)
> > +         opcode = (misaligned_p
> > +                   ? (TARGET_AVX512BW
> > +                      ? "vmovdqu16"
> > +                      : "vmovdqu64")
> > +                   : "vmovdqa64");
> > +       else if (evex_reg_p)
> > +         {
> > +           if (TARGET_AVX512VL)
> > +             opcode = (misaligned_p
> > +                       ? (TARGET_AVX512BW
> > +                          ? "vmovdqu16"
> > +                          : "vmovdqu64")
> > +                       : "vmovdqa64");
> > +         }
> > +       else
> > +         opcode = (misaligned_p
> > +                   ? (TARGET_AVX512BW
> > +                      ? "vmovdqu16"
> > +                      : "%vmovdqu")
> > +                   : "%vmovdqa");
> > +       break;
> > +     case E_SImode:
> > +       if (size == 64)
> > +         opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
> > +       else if (evex_reg_p)
> > +         {
> > +           if (TARGET_AVX512VL)
> > +             opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
> > +         }
> > +       else
> > +         opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> > +       break;
> > +     case E_DImode:
> > +     case E_TImode:
> > +     case E_OImode:
> > +       if (size == 64)
> > +         opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> > +       else if (evex_reg_p)
> > +         {
> > +           if (TARGET_AVX512VL)
> > +             opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> > +         }
> > +       else
> > +         opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> > +       break;
> > +     case E_XImode:
> > +       opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> > +       break;
> > +     default:
> > +       gcc_unreachable ();
> > +     }
> > +    }
> > +  else
> > +    gcc_unreachable ();
> > +
> > +  if (!opcode)
> > +    {
> > +      /* NB: We get here only because we move xmm16-xmm31/ymm16-ymm31
> > +      registers without AVX512VL by using zmm register move.  */
> So the overall flow control in here is rather convoluted.  I hate the way you
> don't set OPCODE above and then do it down here.  I would suggest breaking
> the !opcode bits into its own little function.  Then above in those places
> where you do
>
> if (TARGET_AVX512VL)
>    opcode = <whatever>;
>
>
> Instead change those to something like
>
> if (TARGET_AVX512VL)
>    opcode = <whatever>;
> else
>    opcode = new_function (...)
>
> That way opcode is set on every path through the major if-else in this
> function.
>
> Second when I suggested you break the patch up on a per-pattern basis, I
> probably should have also said that I would start with the minimal support in
> ix86_get_ssemov and ix86_output_ssemov to support the pattern you just
> converted.  That way the mapping from current code to new code is more obvious.

I will do these.  On x86, different instructions can move vector
registers.  They all do the same thing.  But some are preferred over
others, depending on tuning options.
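
For instance (an illustration only; the names below are made up), the
single aligned store here is one vector move in every configuration,
but it can come out as "movdqa" with plain SSE2, "vmovdqa" once AVX is
enabled, or "vmovdqa64" with AVX512VL before this patch set; all of
them have the same effect:

  #include <emmintrin.h>

  extern __m128i value;

  void
  store_value (__m128i *p)
  {
    _mm_store_si128 (p, value);   /* one aligned 128-bit vector store */
  }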

>
> As it stands the breaking into separate patches didn't really help much because
> we still have all the complexity in ix86_get_ssemov and ix86_output_ssemov in
> patch #1 and that's the code I'm most worried about verifying we get right,
> particularly at this stage.  I literally can't take any patch and map from the
> old code to the new code without having to understand all of patch #1.

The old code is very convoluted and wrong in some cases.  I am trying to
clean it up.  I will update my patches based on your feedback.
Jeff Law Feb. 27, 2020, 12:24 a.m. UTC | #3
On Wed, 2020-02-26 at 16:02 -0800, H.J. Lu wrote:
> On Wed, Feb 26, 2020 at 2:42 PM Jeff Law <law@redhat.com> wrote:
> > On Sat, 2020-02-15 at 07:26 -0800, H.J. Lu wrote:
> > > On x86, when AVX and AVX512 are enabled, vector move instructions can
> > > be encoded with either 2-byte/3-byte VEX (AVX) or 4-byte EVEX (AVX512):
> > > 
> > >    0: c5 f9 6f d1             vmovdqa %xmm1,%xmm2
> > >    4: 62 f1 fd 08 6f d1       vmovdqa64 %xmm1,%xmm2
> > > 
> > > We prefer VEX encoding over EVEX since VEX is shorter.  Also AVX512F
> > > only supports 512-bit vector moves.  AVX512F + AVX512VL supports 128-bit
> > > and 256-bit vector moves.  Mode attributes on x86 vector move patterns
> > > indicate target preferences of vector move encoding.  For vector register
> > > to vector register move, we can use 512-bit vector move instructions to
> > > move 128-bit/256-bit vector if AVX512VL isn't available.  With AVX512F
> > > and AVX512VL, we should use VEX encoding for 128-bit/256-bit vector moves
> > > if upper 16 vector registers aren't used.  This patch adds a function,
> > > ix86_output_ssemov, to generate vector moves:
> > > 
> > > 1. If zmm registers are used, use EVEX encoding.
> > > 2. If xmm16-xmm31/ymm16-ymm31 registers aren't used, SSE or VEX encoding
> > > will be generated.
> > > 3. If xmm16-xmm31/ymm16-ymm31 registers are used:
> > >    a. With AVX512VL, AVX512VL vector moves will be generated.
> > >    b. Without AVX512VL, xmm16-xmm31/ymm16-ymm31 register to register
> > >       move will be done with zmm register move.
> > > 
> > > 
> > [ ... ]
> > 
> > > +/* Return the opcode of the TYPE_SSEMOV instruction.  To move from
> > > +   or to xmm16-xmm31/ymm16-ymm31 registers, we either require
> > > +   TARGET_AVX512VL or it is a register to register move which can
> > > +   be done with zmm register move. */
> > > +
> > > +static const char *
> > > +ix86_get_ssemov (rtx *operands, unsigned size,
> > > +              enum attr_mode insn_mode, machine_mode mode)
> > > +{
> > > +  char buf[128];
> > > +  bool misaligned_p = (misaligned_operand (operands[0], mode)
> > > +                    || misaligned_operand (operands[1], mode));
> > > +  bool evex_reg_p = (EXT_REX_SSE_REG_P (operands[0])
> > > +                  || EXT_REX_SSE_REG_P (operands[1]));
> > > +  machine_mode scalar_mode;
> > > +
> > > +  else if (SCALAR_INT_MODE_P (scalar_mode))
> > > +    {
> > > +      switch (scalar_mode)
> > > +     {
> > > +     case E_QImode:
> > > +       if (size == 64)
> > > +         opcode = (misaligned_p
> > > +                   ? (TARGET_AVX512BW
> > > +                      ? "vmovdqu8"
> > > +                      : "vmovdqu64")
> > > +                   : "vmovdqa64");
> > > +       else if (evex_reg_p)
> > > +         {
> > > +           if (TARGET_AVX512VL)
> > > +             opcode = (misaligned_p
> > > +                       ? (TARGET_AVX512BW
> > > +                          ? "vmovdqu8"
> > > +                          : "vmovdqu64")
> > > +                       : "vmovdqa64");
> > > +         }
> > > +       else
> > > +         opcode = (misaligned_p
> > > +                   ? (TARGET_AVX512BW
> > > +                      ? "vmovdqu8"
> > > +                      : "%vmovdqu")
> > > +                   : "%vmovdqa");
> > > +       break;
> > > +     case E_HImode:
> > > +       if (size == 64)
> > > +         opcode = (misaligned_p
> > > +                   ? (TARGET_AVX512BW
> > > +                      ? "vmovdqu16"
> > > +                      : "vmovdqu64")
> > > +                   : "vmovdqa64");
> > > +       else if (evex_reg_p)
> > > +         {
> > > +           if (TARGET_AVX512VL)
> > > +             opcode = (misaligned_p
> > > +                       ? (TARGET_AVX512BW
> > > +                          ? "vmovdqu16"
> > > +                          : "vmovdqu64")
> > > +                       : "vmovdqa64");
> > > +         }
> > > +       else
> > > +         opcode = (misaligned_p
> > > +                   ? (TARGET_AVX512BW
> > > +                      ? "vmovdqu16"
> > > +                      : "%vmovdqu")
> > > +                   : "%vmovdqa");
> > > +       break;
> > > +     case E_SImode:
> > > +       if (size == 64)
> > > +         opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
> > > +       else if (evex_reg_p)
> > > +         {
> > > +           if (TARGET_AVX512VL)
> > > +             opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
> > > +         }
> > > +       else
> > > +         opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> > > +       break;
> > > +     case E_DImode:
> > > +     case E_TImode:
> > > +     case E_OImode:
> > > +       if (size == 64)
> > > +         opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> > > +       else if (evex_reg_p)
> > > +         {
> > > +           if (TARGET_AVX512VL)
> > > +             opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> > > +         }
> > > +       else
> > > +         opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> > > +       break;
> > > +     case E_XImode:
> > > +       opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> > > +       break;
> > > +     default:
> > > +       gcc_unreachable ();
> > > +     }
> > > +    }
> > > +  else
> > > +    gcc_unreachable ();
> > > +
> > > +  if (!opcode)
> > > +    {
> > > +      /* NB: We get here only because we move xmm16-xmm31/ymm16-ymm31
> > > +      registers without AVX512VL by using zmm register move.  */
> > So the overall flow control in here is rather convoluted.  I hate the way
> > you
> > don't set OPCODE above and then do it down here.  I would suggest breaking
> > the !opcode bits into its own little function.  Then above in those places
> > where you do
> > 
> > if (TARGET_AVX512VL)
> >    opcode = <whatever>;
> > 
> > 
> > Instead change those to something like
> > 
> > if (TARGET_AVX512VL)
> >    opcode = <whatever>;
> > else
> >    opcode = new_function (...)
> > 
> > That way opcode is set on every path through the major if-else in this
> > function.
> > 
> > Second when I suggested you break the patch up on a per-pattern basis, I
> > probably should have also said that I would start with the minimal support
> > in
> > ix86_get_ssemov and ix86_output_ssemov to support the pattern you just
> > converted.  That way the mapping from current code to new code is more
> > obvious.
> 
> I will do these.   On x86,  different instructions can move vector
> registers.  They all
> do the same thing.  But some are preferred over others, depending on
> tuning options.
I know.

> 
> > As it stands the breaking into separate patches didn't really help much
> > because
> > we still have all the complexity in ix86_get_ssemov and ix86_output_ssemov
> > in
> > patch #1 and that's the code I'm most worried about verifying we get right,
> > particularly at this stage.  I literally can't take any patch and map from
> > the
> > old code to the new code without having to understand all of patch #1.
> 
> The old code is very convoluted and wrong in some cases.  I am trying to
> clean it up.  I will update my patches based on your feedback.
Thanks.  I was going to try and break those two functions down on my own, but
you're more likely to get it right than I am :-)

jeff
>
H.J. Lu Feb. 27, 2020, 2:50 p.m. UTC | #4
On Wed, Feb 26, 2020 at 4:24 PM Jeff Law <law@redhat.com> wrote:
>
> On Wed, 2020-02-26 at 16:02 -0800, H.J. Lu wrote:
> > On Wed, Feb 26, 2020 at 2:42 PM Jeff Law <law@redhat.com> wrote:
> > > On Sat, 2020-02-15 at 07:26 -0800, H.J. Lu wrote:
> > > > On x86, when AVX and AVX512 are enabled, vector move instructions can
> > > > be encoded with either 2-byte/3-byte VEX (AVX) or 4-byte EVEX (AVX512):
> > > >
> > > >    0: c5 f9 6f d1             vmovdqa %xmm1,%xmm2
> > > >    4: 62 f1 fd 08 6f d1       vmovdqa64 %xmm1,%xmm2
> > > >
> > > > We prefer VEX encoding over EVEX since VEX is shorter.  Also AVX512F
> > > > only supports 512-bit vector moves.  AVX512F + AVX512VL supports 128-bit
> > > > and 256-bit vector moves.  Mode attributes on x86 vector move patterns
> > > > indicate target preferences of vector move encoding.  For vector register
> > > > to vector register move, we can use 512-bit vector move instructions to
> > > > move 128-bit/256-bit vector if AVX512VL isn't available.  With AVX512F
> > > > and AVX512VL, we should use VEX encoding for 128-bit/256-bit vector moves
> > > > if upper 16 vector registers aren't used.  This patch adds a function,
> > > > ix86_output_ssemov, to generate vector moves:
> > > >
> > > > 1. If zmm registers are used, use EVEX encoding.
> > > > 2. If xmm16-xmm31/ymm16-ymm31 registers aren't used, SSE or VEX encoding
> > > > will be generated.
> > > > 3. If xmm16-xmm31/ymm16-ymm31 registers are used:
> > > >    a. With AVX512VL, AVX512VL vector moves will be generated.
> > > >    b. Without AVX512VL, xmm16-xmm31/ymm16-ymm31 register to register
> > > >       move will be done with zmm register move.
> > > >
> > > >
> > > [ ... ]
> > >
> > > > +/* Return the opcode of the TYPE_SSEMOV instruction.  To move from
> > > > +   or to xmm16-xmm31/ymm16-ymm31 registers, we either require
> > > > +   TARGET_AVX512VL or it is a register to register move which can
> > > > +   be done with zmm register move. */
> > > > +
> > > > +static const char *
> > > > +ix86_get_ssemov (rtx *operands, unsigned size,
> > > > +              enum attr_mode insn_mode, machine_mode mode)
> > > > +{
> > > > +  char buf[128];
> > > > +  bool misaligned_p = (misaligned_operand (operands[0], mode)
> > > > +                    || misaligned_operand (operands[1], mode));
> > > > +  bool evex_reg_p = (EXT_REX_SSE_REG_P (operands[0])
> > > > +                  || EXT_REX_SSE_REG_P (operands[1]));
> > > > +  machine_mode scalar_mode;
> > > > +
> > > > +  else if (SCALAR_INT_MODE_P (scalar_mode))
> > > > +    {
> > > > +      switch (scalar_mode)
> > > > +     {
> > > > +     case E_QImode:
> > > > +       if (size == 64)
> > > > +         opcode = (misaligned_p
> > > > +                   ? (TARGET_AVX512BW
> > > > +                      ? "vmovdqu8"
> > > > +                      : "vmovdqu64")
> > > > +                   : "vmovdqa64");
> > > > +       else if (evex_reg_p)
> > > > +         {
> > > > +           if (TARGET_AVX512VL)
> > > > +             opcode = (misaligned_p
> > > > +                       ? (TARGET_AVX512BW
> > > > +                          ? "vmovdqu8"
> > > > +                          : "vmovdqu64")
> > > > +                       : "vmovdqa64");
> > > > +         }
> > > > +       else
> > > > +         opcode = (misaligned_p
> > > > +                   ? (TARGET_AVX512BW
> > > > +                      ? "vmovdqu8"
> > > > +                      : "%vmovdqu")
> > > > +                   : "%vmovdqa");
> > > > +       break;
> > > > +     case E_HImode:
> > > > +       if (size == 64)
> > > > +         opcode = (misaligned_p
> > > > +                   ? (TARGET_AVX512BW
> > > > +                      ? "vmovdqu16"
> > > > +                      : "vmovdqu64")
> > > > +                   : "vmovdqa64");
> > > > +       else if (evex_reg_p)
> > > > +         {
> > > > +           if (TARGET_AVX512VL)
> > > > +             opcode = (misaligned_p
> > > > +                       ? (TARGET_AVX512BW
> > > > +                          ? "vmovdqu16"
> > > > +                          : "vmovdqu64")
> > > > +                       : "vmovdqa64");
> > > > +         }
> > > > +       else
> > > > +         opcode = (misaligned_p
> > > > +                   ? (TARGET_AVX512BW
> > > > +                      ? "vmovdqu16"
> > > > +                      : "%vmovdqu")
> > > > +                   : "%vmovdqa");
> > > > +       break;
> > > > +     case E_SImode:
> > > > +       if (size == 64)
> > > > +         opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
> > > > +       else if (evex_reg_p)
> > > > +         {
> > > > +           if (TARGET_AVX512VL)
> > > > +             opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
> > > > +         }
> > > > +       else
> > > > +         opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> > > > +       break;
> > > > +     case E_DImode:
> > > > +     case E_TImode:
> > > > +     case E_OImode:
> > > > +       if (size == 64)
> > > > +         opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> > > > +       else if (evex_reg_p)
> > > > +         {
> > > > +           if (TARGET_AVX512VL)
> > > > +             opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> > > > +         }
> > > > +       else
> > > > +         opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> > > > +       break;
> > > > +     case E_XImode:
> > > > +       opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> > > > +       break;
> > > > +     default:
> > > > +       gcc_unreachable ();
> > > > +     }
> > > > +    }
> > > > +  else
> > > > +    gcc_unreachable ();
> > > > +
> > > > +  if (!opcode)
> > > > +    {
> > > > +      /* NB: We get here only because we move xmm16-xmm31/ymm16-ymm31
> > > > +      registers without AVX512VL by using zmm register move.  */
> > > So the overall flow control in here is rather convoluted.  I hate the way
> > > you
> > > don't set OPCODE above and then do it down here.  I would suggest breaking
> > > the !opcode bits into its own little function.  Then above in those places
> > > where you do
> > >
> > > if (TARGET_AVX512VL)
> > >    opcode = <whatever>;
> > >
> > >
> > > Instead change those to something like
> > >
> > > if (TARGET_AVX512VL)
> > >    opcode = <whatever>;
> > > else
> > >    opcode = new_function (...)
> > >
> > > That way opcode is set on every path through the major if-else in this
> > > function.
> > >
> > > Second when I suggested you break the patch up on a per-pattern basis, I
> > > probably should have also said that I would start with the minimal support
> > > in
> > > ix86_get_ssemov and ix86_output_ssemov to support the pattern you just
> > > converted.  That way the mapping from current code to new code is more
> > > obvious.
> >
> > I will do these.   On x86,  different instructions can move vector
> > registers.  They all
> > do the same thing.  But some are preferred over others, depending on
> > tuning options.
> I know.
>
> >
> > > As it stands the breaking into separate patches didn't really help much
> > > because
> > > we still have all the complexity in ix86_get_ssemov and ix86_output_ssemov
> > > in
> > > patch #1 and that's the code I'm most worried about verifying we get right,
> > > particularly at this stage.  I literally can't take any patch and map from
> > > the
> > > old code to the new code without having to understand all of patch #1.
> >
> > The old code is very convoluted and wrong in some cases.  I am trying to
> > clean it up.  I will update my patches based on your feedback.
> Thanks.  I was going to try and break those two functions down on my own, but
> you're more likely to get it right than I am :-)
>

How about this?  If it looks OK, I will post the whole patch set.

Thanks.
Jeff Law Feb. 29, 2020, 12:16 a.m. UTC | #5
On Thu, 2020-02-27 at 06:50 -0800, H.J. Lu wrote:
> 
> How about this?  If it looks OK, I will post the whole patch set.
It's better.  I'm guessing the two cases that were previously handled with
vextract/vbroadcast aren't supposed to happen?  They're caught here IIUC:

> +  /* NB: To move xmm16-xmm31/ymm16-ymm31 registers without AVX512VL,
> +     we can only use zmm register move without memory operand.  */
> +   if (evex_reg_p
> +       && !TARGET_AVX512VL
> +       && GET_MODE_SIZE (mode) < 64)
> +     {
> +       if (memory_operand (operands[0], mode)
> +	   || memory_operand (operands[1], mode))
> +	gcc_unreachable ();
> 

If they truly can't happen, that's fine.  My worry is I don't see changes to
the operand predicates or constraints which would avoid this case.   Is it
prevented by the mode iterator on the operands?  Again, just want to make sure
I understand why the vextract/vbroadcast stuff isn't in the new code.

I'm doing a little assuming that the <ssescalarsize> bits in the old code are
mapped correctly to the 32/64 suffixes on the opcodes in the new version.

I'm also assuming that mapping of "size" in the argument to ix86_get_ssemov to
the operand modifiers g, t, and x are right.  I'm guessing the operand
modifiers weren't needed in the original because we had the actual operand and
could look at it to get the right modifier.  In the evex, but not avx512vl case
those are forced to a g modifier which seems to match the original.
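
For reference, those modifiers print the xmm, ymm and zmm name of the
same register, which is how the templates built in ix86_get_ssemov
select the operand width.  A tiny standalone check (illustrative only;
compile with -mavx2 or later, function name made up):

  #include <immintrin.h>

  void
  show_modifiers (__m256i v)
  {
    /* The asm body is only an assembly comment, so this just shows
       what %x0, %t0 and %g0 expand to, e.g. "# %xmm0 %ymm0 %zmm0"
       if V is allocated to register 0.  */
    __asm__ volatile ("# %x0 %t0 %g0" : : "x" (v));
  }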

Are we going to need further refinements to ix86_output_ssemov/ix86_get_ssemov?
If so, then I'd suggest the next patch be those patterns which don't require
further refinements to ix86_output_ssemov.

If no further refinements to ix86_output_ssemov/ix86_get_ssemov are required,
then I think you can just send the rest of the pattern changes in a single
unit.

jeff
H.J. Lu Feb. 29, 2020, 2:15 a.m. UTC | #6
On Fri, Feb 28, 2020 at 4:16 PM Jeff Law <law@redhat.com> wrote:
>
> On Thu, 2020-02-27 at 06:50 -0800, H.J. Lu wrote:
> >
> > How about this?  If it looks OK, I will post the whole patch set.
> It's better.  I'm guessing the two cases that were previously handled with
> vextract/vbroadcast aren't supposed to happen?  They're caught here IIUC:
>
> > +  /* NB: To move xmm16-xmm31/ymm16-ymm31 registers without AVX512VL,
> > +     we can only use zmm register move without memory operand.  */
> > +   if (evex_reg_p
> > +       && !TARGET_AVX512VL
> > +       && GET_MODE_SIZE (mode) < 64)
> > +     {
> > +       if (memory_operand (operands[0], mode)
> > +        || memory_operand (operands[1], mode))
> > +     gcc_unreachable ();
> >
>
> If they truly can't happen, that's fine.  My worry is I don't see changes to
> the operand predicates or constraints which would avoid this case.   Is it
> prevented by the mode iterator on the operands?  Again, just want to make sure
> I understand why the vextract/vbroadcast stuff isn't in the new code.

There are no GCC testcases to show that they are actually ever used.   That is
why I removed them and added gcc_unreachable ().

> I'm doing a little assuming that the <ssescalarsize> bits in the old code are
> mapped correctly to the 32/64 suffixes on the opcodes in the new version.
>
> I'm also assuming that mapping of "size" in the argument to ix86_get_ssemov to
> the operand modifiers g, t, and x are right.  I'm guessing the operand
> modifiers weren't needed in the original because we had the actual operand and
> could look at it to get the right modifier.  In the evex, but not avx512vl case
> those are forced to a g modifier which seems to match the original.
>
> Are we going to need further refinements to ix86_output_ssemov/ix86_get_ssemov?
> If so, then I'd suggest the next patch be those patterns which don't require
> further refinements to ix86_output_ssemov.

4 patches don't require changes in ix86_output_ssemov/ix86_get_ssemov:

https://gitlab.com/x86-gcc/gcc/-/commit/426f2464abb80b97b8533f9efa15bbe72e6aa888
https://gitlab.com/x86-gcc/gcc/-/commit/ec5b40d77f7a4424935275f1a7ccedbce83b6f54
https://gitlab.com/x86-gcc/gcc/-/commit/92fdd98234984f86b66fb5403dd828661cd7999f
https://gitlab.com/x86-gcc/gcc/-/commit/f8fa5e571caf6740b36d042d631b4ace11683cd7

I can combine them into a single patch.

The other 5 patches contain a small change to ix86_output_ssemov:

https://gitlab.com/x86-gcc/gcc/-/commit/b1746392e1d350d689a80fb71b2c72f909c20f30
https://gitlab.com/x86-gcc/gcc/-/commit/14c3cbdbdcc36fa1edea4572b89a039726a4e2bc
https://gitlab.com/x86-gcc/gcc/-/commit/69c8c928b26242116cc261a9d2f6b1265218f1d3
https://gitlab.com/x86-gcc/gcc/-/commit/04335f582f0b281d5f357185d154087997fd7cfd
https://gitlab.com/x86-gcc/gcc/-/commit/64f6a5d6d3405331d9c02aaae0faccf449d6647a

Should I make the change and submit them for review?

> If no further refinements to ix86_output_ssemov/ix86_get_ssemov are required,
> then I think you can just send the rest of the pattern changes in a single
> unit.
>
> jeff
>

Thanks.
H.J. Lu Feb. 29, 2020, 11:33 a.m. UTC | #7
On Fri, Feb 28, 2020 at 6:15 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Feb 28, 2020 at 4:16 PM Jeff Law <law@redhat.com> wrote:
> >
> > On Thu, 2020-02-27 at 06:50 -0800, H.J. Lu wrote:
> > >
> > > How about this?  If it looks OK, I will post the whole patch set.
> > It's better.  I'm guessing the two cases that were previously handled with
> > vextract/vbroadcast aren't supposed to happen?  They're caught here IIUC:
> >
> > > +  /* NB: To move xmm16-xmm31/ymm16-ymm31 registers without AVX512VL,
> > > +     we can only use zmm register move without memory operand.  */
> > > +   if (evex_reg_p
> > > +       && !TARGET_AVX512VL
> > > +       && GET_MODE_SIZE (mode) < 64)
> > > +     {
> > > +       if (memory_operand (operands[0], mode)
> > > +        || memory_operand (operands[1], mode))
> > > +     gcc_unreachable ();
> > >
> >
> > If they truly can't happen, that's fine.  My worry is I don't see changes to
> > the operand predicates or constraints which would avoid this case.   Is it
> > prevented by the mode iterator on the operands?  Again, just want to make sure
> > I understand why the vextract/vbroadcast stuff isn't in the new code.
>
> There are no GCC testcases to show that they are actually ever used.   That is
> why I removed them and added gcc_unreachable ().

This is covered by the testcases I added:

[hjl@gnu-cfl-2 gcc]$ cat /tmp/x.c
#include <x86intrin.h>

extern __m128 d;

void
foo1 (__m128 x)
{
  register __m128 xmm16 __asm ("xmm16") = x;
  asm volatile ("" : "+v" (xmm16));
  d = xmm16;
}
[hjl@gnu-cfl-2 gcc]$ gcc -O2 -march=skylake-avx512  /tmp/x.c -S
[hjl@gnu-cfl-2 gcc]$ gcc -O2 -march=skylake-avx512 -mno-avx512vl  /tmp/x.c -S
/tmp/x.c: In function ‘foo1’:
/tmp/x.c:8:19: error: register specified for ‘xmm16’ isn’t suitable
for data type
    8 |   register __m128 xmm16 __asm ("xmm16") = x;
      |                   ^~~~~
[hjl@gnu-cfl-2 gcc]$

GCC doesn't allow xmm16-xmm31/ymm16-ymm31 without AVX512VL since
ix86_hard_regno_mode_ok has

     /* AVX512VL allows sse regs16+ for 128/256 bit modes.  */
      if (TARGET_AVX512VL
          && (mode == OImode
              || mode == TImode
              || VALID_AVX256_REG_MODE (mode)
              || VALID_AVX512VL_128_REG_MODE (mode)))
        return true;

      /* xmm16-xmm31 are only available for AVX-512.  */
      if (EXT_REX_SSE_REGNO_P (regno))
        return false;

The vextract/vbroadcast stuff is dead code.

> > I'm doing a little assuming that the <ssescalarsize> bits in the old code are
> > mapped correctly to the 32/64 suffixes on the opcodes in the new version.
> >
> > I'm also assuming that mapping of "size" in the argument to ix86_get_ssemov to
> > the operand modifiers g, t, and x are right.  I'm guessing the operand
> > modifiers weren't needed in the original because we had the actual operand and
> > could look at it to get the right modifier.  In the evex, but not avx512vl case
> > those are forced to a g modifier which seems to match the original.
> >
> > Are we going to need further refinements to ix86_output_ssemov/ix86_get_ssemov?
> > If so, then I'd suggest the next patch be those patterns which don't require
> > further refinements to ix86_output_ssemov.
>
> 4 patches don't require changes in ix86_output_ssemov/ix86_get_ssemov:
>
> https://gitlab.com/x86-gcc/gcc/-/commit/426f2464abb80b97b8533f9efa15bbe72e6aa888
> https://gitlab.com/x86-gcc/gcc/-/commit/ec5b40d77f7a4424935275f1a7ccedbce83b6f54
> https://gitlab.com/x86-gcc/gcc/-/commit/92fdd98234984f86b66fb5403dd828661cd7999f
> https://gitlab.com/x86-gcc/gcc/-/commit/f8fa5e571caf6740b36d042d631b4ace11683cd7
>
> I can combine them into a single patch.
>
> Other 5 patches contain a small change to  ix86_output_ssemov:
>
> https://gitlab.com/x86-gcc/gcc/-/commit/b1746392e1d350d689a80fb71b2c72f909c20f30
> https://gitlab.com/x86-gcc/gcc/-/commit/14c3cbdbdcc36fa1edea4572b89a039726a4e2bc
> https://gitlab.com/x86-gcc/gcc/-/commit/69c8c928b26242116cc261a9d2f6b1265218f1d3
> https://gitlab.com/x86-gcc/gcc/-/commit/04335f582f0b281d5f357185d154087997fd7cfd
> https://gitlab.com/x86-gcc/gcc/-/commit/64f6a5d6d3405331d9c02aaae0faccf449d6647a
>
> Should I made the change and submit them for review?

I am preparing the new patch set.

> > If no further refinements to ix86_output_ssemov/ix86_get_ssemov are required,
> > then I think you can just send the rest of the pattern changes in a single
> > unit.
> >
> > jeff
> >
Jeff Law March 5, 2020, 11:43 p.m. UTC | #8
On Fri, 2020-02-28 at 18:15 -0800, H.J. Lu wrote:
> On Fri, Feb 28, 2020 at 4:16 PM Jeff Law <law@redhat.com> wrote:
> > On Thu, 2020-02-27 at 06:50 -0800, H.J. Lu wrote:
> > > How about this?  If it looks OK, I will post the whole patch set.
> > It's better.  I'm guessing the two cases that were previously handled with
> > vextract/vbroadcast aren't supposed to happen?  They're caught here IIUC:
> > 
> > > +  /* NB: To move xmm16-xmm31/ymm16-ymm31 registers without AVX512VL,
> > > +     we can only use zmm register move without memory operand.  */
> > > +   if (evex_reg_p
> > > +       && !TARGET_AVX512VL
> > > +       && GET_MODE_SIZE (mode) < 64)
> > > +     {
> > > +       if (memory_operand (operands[0], mode)
> > > +        || memory_operand (operands[1], mode))
> > > +     gcc_unreachable ();
> > > 
> > 
> > If they truly can't happen, that's fine.  My worry is I don't see changes to
> > the operand predicates or constraints which would avoid this case.   Is it
> > prevented by the mode iterator on the operands?  Again, just want to make
> > sure
> > I understand why the vextract/vbroadcast stuff isn't in the new code.
> 
> There are no GCC testcases to show that they are actually ever used.   That is
> why I removed them and added gcc_unreachable ().
Understood.   

> 
> 4 patches don't require changes in ix86_output_ssemov/ix86_get_ssemov:
> 
> https://gitlab.com/x86-gcc/gcc/-/commit/426f2464abb80b97b8533f9efa15bbe72e6aa888
> https://gitlab.com/x86-gcc/gcc/-/commit/ec5b40d77f7a4424935275f1a7ccedbce83b6f54
> https://gitlab.com/x86-gcc/gcc/-/commit/92fdd98234984f86b66fb5403dd828661cd7999f
> https://gitlab.com/x86-gcc/gcc/-/commit/f8fa5e571caf6740b36d042d631b4ace11683cd7
> 
> I can combine them into a single patch.
That sounds reasonable -- it should be trivial to review.  Then we can work
through the patches that require changes to ix86_output_ssemov.

Thanks for your patience.  I'm juggling a fair amount of stuff right now.

jeff

Patch

diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 266381ca5a6..39fcaa0ad5f 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -38,6 +38,8 @@  extern void ix86_expand_split_stack_prologue (void);
 extern void ix86_output_addr_vec_elt (FILE *, int);
 extern void ix86_output_addr_diff_elt (FILE *, int, int);
 
+extern const char *ix86_output_ssemov (rtx_insn *, rtx *);
+
 extern enum calling_abi ix86_cfun_abi (void);
 extern enum calling_abi ix86_function_type_abi (const_tree);
 
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index dac7a3fc5fd..26f8c9494b9 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -4915,6 +4915,280 @@  ix86_pre_reload_split (void)
 	  && !(cfun->curr_properties & PROP_rtl_split_insns));
 }
 
+/* Return the opcode of the TYPE_SSEMOV instruction.  To move from
+   or to xmm16-xmm31/ymm16-ymm31 registers, we either require
+   TARGET_AVX512VL or it is a register to register move which can
+   be done with zmm register move. */
+
+static const char *
+ix86_get_ssemov (rtx *operands, unsigned size,
+		 enum attr_mode insn_mode, machine_mode mode)
+{
+  char buf[128];
+  bool misaligned_p = (misaligned_operand (operands[0], mode)
+		       || misaligned_operand (operands[1], mode));
+  bool evex_reg_p = (EXT_REX_SSE_REG_P (operands[0])
+		     || EXT_REX_SSE_REG_P (operands[1]));
+  machine_mode scalar_mode;
+
+  const char *opcode = NULL;
+  enum
+    {
+      opcode_int,
+      opcode_float,
+      opcode_double
+    } type = opcode_int;
+
+  switch (insn_mode)
+    {
+    case MODE_V16SF:
+    case MODE_V8SF:
+    case MODE_V4SF:
+      scalar_mode = E_SFmode;
+      break;
+    case MODE_V8DF:
+    case MODE_V4DF:
+    case MODE_V2DF:
+      scalar_mode = E_DFmode;
+      break;
+    case MODE_XI:
+    case MODE_OI:
+    case MODE_TI:
+      scalar_mode = GET_MODE_INNER (mode);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  if (SCALAR_FLOAT_MODE_P (scalar_mode))
+    {
+      switch (scalar_mode)
+	{
+	case E_SFmode:
+	  if (size == 64 || !evex_reg_p || TARGET_AVX512VL)
+	    opcode = misaligned_p ? "%vmovups" : "%vmovaps";
+	  else
+	    type = opcode_float;
+	  break;
+	case E_DFmode:
+	  if (size == 64 || !evex_reg_p || TARGET_AVX512VL)
+	    opcode = misaligned_p ? "%vmovupd" : "%vmovapd";
+	  else
+	    type = opcode_double;
+	  break;
+	case E_TFmode:
+	  if (size == 64)
+	    opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
+	  else if (evex_reg_p)
+	    {
+	      if (TARGET_AVX512VL)
+		opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
+	    }
+	  else
+	    opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+    }
+  else if (SCALAR_INT_MODE_P (scalar_mode))
+    {
+      switch (scalar_mode)
+	{
+	case E_QImode:
+	  if (size == 64)
+	    opcode = (misaligned_p
+		      ? (TARGET_AVX512BW
+			 ? "vmovdqu8"
+			 : "vmovdqu64")
+		      : "vmovdqa64");
+	  else if (evex_reg_p)
+	    {
+	      if (TARGET_AVX512VL)
+		opcode = (misaligned_p
+			  ? (TARGET_AVX512BW
+			     ? "vmovdqu8"
+			     : "vmovdqu64")
+			  : "vmovdqa64");
+	    }
+	  else
+	    opcode = (misaligned_p
+		      ? (TARGET_AVX512BW
+			 ? "vmovdqu8"
+			 : "%vmovdqu")
+		      : "%vmovdqa");
+	  break;
+	case E_HImode:
+	  if (size == 64)
+	    opcode = (misaligned_p
+		      ? (TARGET_AVX512BW
+			 ? "vmovdqu16"
+			 : "vmovdqu64")
+		      : "vmovdqa64");
+	  else if (evex_reg_p)
+	    {
+	      if (TARGET_AVX512VL)
+		opcode = (misaligned_p
+			  ? (TARGET_AVX512BW
+			     ? "vmovdqu16"
+			     : "vmovdqu64")
+			  : "vmovdqa64");
+	    }
+	  else
+	    opcode = (misaligned_p
+		      ? (TARGET_AVX512BW
+			 ? "vmovdqu16"
+			 : "%vmovdqu")
+		      : "%vmovdqa");
+	  break;
+	case E_SImode:
+	  if (size == 64)
+	    opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
+	  else if (evex_reg_p)
+	    {
+	      if (TARGET_AVX512VL)
+		opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
+	    }
+	  else
+	    opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
+	  break;
+	case E_DImode:
+	case E_TImode:
+	case E_OImode:
+	  if (size == 64)
+	    opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
+	  else if (evex_reg_p)
+	    {
+	      if (TARGET_AVX512VL)
+		opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
+	    }
+	  else
+	    opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
+	  break;
+	case E_XImode:
+	  opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+    }
+  else
+    gcc_unreachable ();
+
+  if (!opcode)
+    {
+      /* NB: We get here only because we move xmm16-xmm31/ymm16-ymm31
+	 registers without AVX512VL by using zmm register move.  */
+      if (!evex_reg_p
+	  || TARGET_AVX512VL
+	  || memory_operand (operands[0], mode)
+	  || memory_operand (operands[1], mode))
+	gcc_unreachable ();
+      size = 64;
+      switch (type)
+	{
+	case opcode_int:
+	  opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
+	  break;
+	case opcode_float:
+	  opcode = misaligned_p ? "%vmovups" : "%vmovaps";
+	  break;
+	case opcode_double:
+	  opcode = misaligned_p ? "%vmovupd" : "%vmovapd";
+	  break;
+	}
+    }
+
+  switch (size)
+    {
+    case 64:
+      snprintf (buf, sizeof (buf), "%s\t{%%g1, %%g0|%%g0, %%g1}",
+		opcode);
+      break;
+    case 32:
+      snprintf (buf, sizeof (buf), "%s\t{%%t1, %%t0|%%t0, %%t1}",
+		opcode);
+      break;
+    case 16:
+      snprintf (buf, sizeof (buf), "%s\t{%%x1, %%x0|%%x0, %%x1}",
+		opcode);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+  output_asm_insn (buf, operands);
+  return "";
+}
+
+/* Return the template of the TYPE_SSEMOV instruction to move
+   operands[1] into operands[0].  */
+
+const char *
+ix86_output_ssemov (rtx_insn *insn, rtx *operands)
+{
+  machine_mode mode = GET_MODE (operands[0]);
+  if (get_attr_type (insn) != TYPE_SSEMOV
+      || mode != GET_MODE (operands[1]))
+    gcc_unreachable ();
+
+  enum attr_mode insn_mode = get_attr_mode (insn);
+
+  switch (insn_mode)
+    {
+    case MODE_XI:
+    case MODE_V8DF:
+    case MODE_V16SF:
+      return ix86_get_ssemov (operands, 64, insn_mode, mode);
+
+    case MODE_OI:
+    case MODE_V4DF:
+    case MODE_V8SF:
+      return ix86_get_ssemov (operands, 32, insn_mode, mode);
+
+    case MODE_TI:
+    case MODE_V2DF:
+    case MODE_V4SF:
+      return ix86_get_ssemov (operands, 16, insn_mode, mode);
+
+    case MODE_DI:
+      /* Handle broken assemblers that require movd instead of movq. */
+      if (!HAVE_AS_IX86_INTERUNIT_MOVQ
+	  && (GENERAL_REG_P (operands[0])
+	      || GENERAL_REG_P (operands[1])))
+	return "%vmovd\t{%1, %0|%0, %1}";
+      else
+	return "%vmovq\t{%1, %0|%0, %1}";
+
+    case MODE_V2SF:
+      if (TARGET_AVX && REG_P (operands[0]))
+	return "vmovlps\t{%1, %d0|%d0, %1}";
+      else
+	return "%vmovlps\t{%1, %0|%0, %1}";
+
+    case MODE_DF:
+      if (TARGET_AVX && REG_P (operands[0]) && REG_P (operands[1]))
+	return "vmovsd\t{%d1, %0|%0, %d1}";
+      else
+	return "%vmovsd\t{%1, %0|%0, %1}";
+
+    case MODE_V1DF:
+      gcc_assert (!TARGET_AVX);
+       return "movlpd\t{%1, %0|%0, %1}";
+
+    case MODE_SI:
+      return "%vmovd\t{%1, %0|%0, %1}";
+
+    case MODE_SF:
+      if (TARGET_AVX && REG_P (operands[0]) && REG_P (operands[1]))
+	return "vmovss\t{%d1, %0|%0, %d1}";
+      else
+	return "%vmovss\t{%1, %0|%0, %1}";
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
 /* Returns true if OP contains a symbol reference */
 
 bool
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index ee1f138d1af..8f5902292c6 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1013,98 +1013,7 @@  (define_insn "mov<mode>_internal"
       return standard_sse_constant_opcode (insn, operands);
 
     case TYPE_SSEMOV:
-      /* There is no evex-encoded vmov* for sizes smaller than 64-bytes
-	 in avx512f, so we need to use workarounds, to access sse registers
-	 16-31, which are evex-only. In avx512vl we don't need workarounds.  */
-      if (TARGET_AVX512F && <MODE_SIZE> < 64 && !TARGET_AVX512VL
-	  && (EXT_REX_SSE_REG_P (operands[0])
-	      || EXT_REX_SSE_REG_P (operands[1])))
-	{
-	  if (memory_operand (operands[0], <MODE>mode))
-	    {
-	      if (<MODE_SIZE> == 32)
-		return "vextract<shuffletype>64x4\t{$0x0, %g1, %0|%0, %g1, 0x0}";
-	      else if (<MODE_SIZE> == 16)
-		return "vextract<shuffletype>32x4\t{$0x0, %g1, %0|%0, %g1, 0x0}";
-	      else
-		gcc_unreachable ();
-	    }
-	  else if (memory_operand (operands[1], <MODE>mode))
-	    {
-	      if (<MODE_SIZE> == 32)
-		return "vbroadcast<shuffletype>64x4\t{%1, %g0|%g0, %1}";
-	      else if (<MODE_SIZE> == 16)
-		return "vbroadcast<shuffletype>32x4\t{%1, %g0|%g0, %1}";
-	      else
-		gcc_unreachable ();
-	    }
-	  else
-	    /* Reg -> reg move is always aligned.  Just use wider move.  */
-	    switch (get_attr_mode (insn))
-	      {
-	      case MODE_V8SF:
-	      case MODE_V4SF:
-		return "vmovaps\t{%g1, %g0|%g0, %g1}";
-	      case MODE_V4DF:
-	      case MODE_V2DF:
-		return "vmovapd\t{%g1, %g0|%g0, %g1}";
-	      case MODE_OI:
-	      case MODE_TI:
-		return "vmovdqa64\t{%g1, %g0|%g0, %g1}";
-	      default:
-		gcc_unreachable ();
-	      }
-	}
-
-      switch (get_attr_mode (insn))
-	{
-	case MODE_V16SF:
-	case MODE_V8SF:
-	case MODE_V4SF:
-	  if (misaligned_operand (operands[0], <MODE>mode)
-	      || misaligned_operand (operands[1], <MODE>mode))
-	    return "%vmovups\t{%1, %0|%0, %1}";
-	  else
-	    return "%vmovaps\t{%1, %0|%0, %1}";
-
-	case MODE_V8DF:
-	case MODE_V4DF:
-	case MODE_V2DF:
-	  if (misaligned_operand (operands[0], <MODE>mode)
-	      || misaligned_operand (operands[1], <MODE>mode))
-	    return "%vmovupd\t{%1, %0|%0, %1}";
-	  else
-	    return "%vmovapd\t{%1, %0|%0, %1}";
-
-	case MODE_OI:
-	case MODE_TI:
-	  if (misaligned_operand (operands[0], <MODE>mode)
-	      || misaligned_operand (operands[1], <MODE>mode))
-	    return TARGET_AVX512VL
-		   && (<MODE>mode == V4SImode
-		       || <MODE>mode == V2DImode
-		       || <MODE>mode == V8SImode
-		       || <MODE>mode == V4DImode
-		       || TARGET_AVX512BW)
-		   ? "vmovdqu<ssescalarsize>\t{%1, %0|%0, %1}"
-		   : "%vmovdqu\t{%1, %0|%0, %1}";
-	  else
-	    return TARGET_AVX512VL ? "vmovdqa64\t{%1, %0|%0, %1}"
-				   : "%vmovdqa\t{%1, %0|%0, %1}";
-	case MODE_XI:
-	  if (misaligned_operand (operands[0], <MODE>mode)
-	      || misaligned_operand (operands[1], <MODE>mode))
-	    return (<MODE>mode == V16SImode
-		    || <MODE>mode == V8DImode
-		    || TARGET_AVX512BW)
-		   ? "vmovdqu<ssescalarsize>\t{%1, %0|%0, %1}"
-		   : "vmovdqu64\t{%1, %0|%0, %1}";
-	  else
-	    return "vmovdqa64\t{%1, %0|%0, %1}";
-
-	default:
-	  gcc_unreachable ();
-	}
+      return ix86_output_ssemov (insn, operands);
 
     default:
       gcc_unreachable ();
@@ -1113,10 +1022,7 @@  (define_insn "mov<mode>_internal"
   [(set_attr "type" "sselog1,sselog1,ssemov,ssemov")
    (set_attr "prefix" "maybe_vex")
    (set (attr "mode")
-	(cond [(and (eq_attr "alternative" "1")
-		    (match_test "TARGET_AVX512VL"))
-		 (const_string "<sseinsnmode>")
-	       (match_test "TARGET_AVX")
+	(cond [(match_test "TARGET_AVX")
 		 (const_string "<sseinsnmode>")
 	       (ior (not (match_test "TARGET_SSE2"))
 		    (match_test "optimize_function_for_size_p (cfun)"))
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa64-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa64-1.c
index 14fe4b84544..db4d9d14875 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa64-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa64-1.c
@@ -4,14 +4,13 @@ 
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\\(\[^\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { target nonpic } } } */
-/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\\(\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { target nonpic } } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[ \\t\]+\\(\[^\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { target nonpic } } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[ \\t\]+\\(\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { target nonpic } } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\nxy\]*\\(.{5,6}(?:\n|\[ \\t\]+#)" 1 { target nonpic } } } */
-/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\nxy\]*\\((?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\nxy\]*\\(.{5,6}(?:\n|\[ \\t\]+#)" 1 { target nonpic } } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 
diff --git a/gcc/testsuite/gcc.target/i386/pr89346.c b/gcc/testsuite/gcc.target/i386/pr89346.c
new file mode 100644
index 00000000000..cdc9accf521
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr89346.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+#include <immintrin.h>
+
+long long *p;
+volatile __m256i y;
+
+void
+foo (void)
+{
+   _mm256_store_epi64 (p, y);
+}
+
+/* { dg-final { scan-assembler-not "vmovdqa64" } } */