i386: prefer vpermilpd over vpermpd [PR93395]
diff mbox series

Message ID 20200123214816.GJ10088@tucnak
State New
Headers show
Series
  • i386: prefer vpermilpd over vpermpd [PR93395]
Related show

Commit Message

Jakub Jelinek Jan. 23, 2020, 9:48 p.m. UTC
Hi!

In Agner Fog's tables, vpermilp[sd] with immediates seem to be
much faster than vpermpd with immediate, for a good reason,
the former only permute something within the lanes and don't do anything
intra-lane, while vpermpd can.  So, functionality-wise, vpermilpd
is more efficient subset of vpermpd.  We use the same RTL for those
though (and also for certain broadcast).

Now, the problem was that the vpermpd pattern appeared first in sse.md,
followed by the broadcast patterns, followed by the vpermilp[sd].
Which means unless -mavx -mno-avx2, we'd emit vpermpd instead of the
more efficient alternatives.

The following patch reorders them, so that vpermpd comes last, if we
can match a broadcast, we do, if we can match a vpermilp[sd] that is not a
broadcast, we will, otherwise fall back (of course only if -mavx2) to
vpermpd.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2020-01-23  Jakub Jelinek  <jakub@redhat.com>

	PR target/93395
	* config/i386/sse.md (*avx_vperm_broadcast_v4sf,
	*avx_vperm_broadcast_<mode>,
	<sse2_avx_avx512f>_vpermil<mode><mask_name>,
	*<sse2_avx_avx512f>_vpermilp<mode><mask_name>):
	Move before avx2_perm<mode>/avx512f_perm<mode>.

	* gcc.target/i386/pr93395.c: New test.
	* gcc.target/i386/avx512vl-vpermilpdi-1.c: Remove xfail.


	Jakub

Comments

Uros Bizjak Jan. 24, 2020, 6:54 a.m. UTC | #1
On Thu, Jan 23, 2020 at 10:48 PM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> In Agner Fog's tables, vpermilp[sd] with immediates seem to be
> much faster than vpermpd with immediate, for a good reason,
> the former only permute something within the lanes and don't do anything
> intra-lane, while vpermpd can.  So, functionality-wise, vpermilpd
> is more efficient subset of vpermpd.  We use the same RTL for those
> though (and also for certain broadcast).
>
> Now, the problem was that the vpermpd pattern appeared first in sse.md,
> followed by the broadcast patterns, followed by the vpermilp[sd].
> Which means unless -mavx -mno-avx2, we'd emit vpermpd instead of the
> more efficient alternatives.
>
> The following patch reorders them, so that vpermpd comes last, if we
> can match a broadcast, we do, if we can match a vpermilp[sd] that is not a
> broadcast, we will, otherwise fall back (of course only if -mavx2) to
> vpermpd.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2020-01-23  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/93395
>         * config/i386/sse.md (*avx_vperm_broadcast_v4sf,
>         *avx_vperm_broadcast_<mode>,
>         <sse2_avx_avx512f>_vpermil<mode><mask_name>,
>         *<sse2_avx_avx512f>_vpermilp<mode><mask_name>):
>         Move before avx2_perm<mode>/avx512f_perm<mode>.
>
>         * gcc.target/i386/pr93395.c: New test.
>         * gcc.target/i386/avx512vl-vpermilpdi-1.c: Remove xfail.

LGTM.

Thanks,
Uros.

> --- gcc/config/i386/sse.md.jj   2020-01-23 19:24:14.851423969 +0100
> +++ gcc/config/i386/sse.md      2020-01-23 19:41:58.729091766 +0100
> @@ -19875,6 +19875,164 @@ (define_insn "<avx512>_permvar<mode><mas
>     (set_attr "prefix" "<mask_prefix2>")
>     (set_attr "mode" "<sseinsnmode>")])
>
> +;; Recognize broadcast as a vec_select as produced by builtin_vec_perm.
> +;; If it so happens that the input is in memory, use vbroadcast.
> +;; Otherwise use vpermilp (and in the case of 256-bit modes, vperm2f128).
> +(define_insn "*avx_vperm_broadcast_v4sf"
> +  [(set (match_operand:V4SF 0 "register_operand" "=v,v,v")
> +       (vec_select:V4SF
> +         (match_operand:V4SF 1 "nonimmediate_operand" "m,o,v")
> +         (match_parallel 2 "avx_vbroadcast_operand"
> +           [(match_operand 3 "const_int_operand" "C,n,n")])))]
> +  "TARGET_AVX"
> +{
> +  int elt = INTVAL (operands[3]);
> +  switch (which_alternative)
> +    {
> +    case 0:
> +    case 1:
> +      operands[1] = adjust_address_nv (operands[1], SFmode, elt * 4);
> +      return "vbroadcastss\t{%1, %0|%0, %k1}";
> +    case 2:
> +      operands[2] = GEN_INT (elt * 0x55);
> +      return "vpermilps\t{%2, %1, %0|%0, %1, %2}";
> +    default:
> +      gcc_unreachable ();
> +    }
> +}
> +  [(set_attr "type" "ssemov,ssemov,sselog1")
> +   (set_attr "prefix_extra" "1")
> +   (set_attr "length_immediate" "0,0,1")
> +   (set_attr "prefix" "maybe_evex")
> +   (set_attr "mode" "SF,SF,V4SF")])
> +
> +(define_insn_and_split "*avx_vperm_broadcast_<mode>"
> +  [(set (match_operand:VF_256 0 "register_operand" "=v,v,v")
> +       (vec_select:VF_256
> +         (match_operand:VF_256 1 "nonimmediate_operand" "m,o,?v")
> +         (match_parallel 2 "avx_vbroadcast_operand"
> +           [(match_operand 3 "const_int_operand" "C,n,n")])))]
> +  "TARGET_AVX"
> +  "#"
> +  "&& reload_completed && (<MODE>mode != V4DFmode || !TARGET_AVX2)"
> +  [(set (match_dup 0) (vec_duplicate:VF_256 (match_dup 1)))]
> +{
> +  rtx op0 = operands[0], op1 = operands[1];
> +  int elt = INTVAL (operands[3]);
> +
> +  if (REG_P (op1))
> +    {
> +      int mask;
> +
> +      if (TARGET_AVX2 && elt == 0)
> +       {
> +         emit_insn (gen_vec_dup<mode> (op0, gen_lowpart (<ssescalarmode>mode,
> +                                                         op1)));
> +         DONE;
> +       }
> +
> +      /* Shuffle element we care about into all elements of the 128-bit lane.
> +        The other lane gets shuffled too, but we don't care.  */
> +      if (<MODE>mode == V4DFmode)
> +       mask = (elt & 1 ? 15 : 0);
> +      else
> +       mask = (elt & 3) * 0x55;
> +      emit_insn (gen_avx_vpermil<mode> (op0, op1, GEN_INT (mask)));
> +
> +      /* Shuffle the lane we care about into both lanes of the dest.  */
> +      mask = (elt / (<ssescalarnum> / 2)) * 0x11;
> +      if (EXT_REX_SSE_REG_P (op0))
> +       {
> +         /* There is no EVEX VPERM2F128, but we can use either VBROADCASTSS
> +            or VSHUFF128.  */
> +         gcc_assert (<MODE>mode == V8SFmode);
> +         if ((mask & 1) == 0)
> +           emit_insn (gen_avx2_vec_dupv8sf (op0,
> +                                            gen_lowpart (V4SFmode, op0)));
> +         else
> +           emit_insn (gen_avx512vl_shuf_f32x4_1 (op0, op0, op0,
> +                                                 GEN_INT (4), GEN_INT (5),
> +                                                 GEN_INT (6), GEN_INT (7),
> +                                                 GEN_INT (12), GEN_INT (13),
> +                                                 GEN_INT (14), GEN_INT (15)));
> +         DONE;
> +       }
> +
> +      emit_insn (gen_avx_vperm2f128<mode>3 (op0, op0, op0, GEN_INT (mask)));
> +      DONE;
> +    }
> +
> +  operands[1] = adjust_address (op1, <ssescalarmode>mode,
> +                               elt * GET_MODE_SIZE (<ssescalarmode>mode));
> +})
> +
> +(define_expand "<sse2_avx_avx512f>_vpermil<mode><mask_name>"
> +  [(set (match_operand:VF2 0 "register_operand")
> +       (vec_select:VF2
> +         (match_operand:VF2 1 "nonimmediate_operand")
> +         (match_operand:SI 2 "const_0_to_255_operand")))]
> +  "TARGET_AVX && <mask_mode512bit_condition>"
> +{
> +  int mask = INTVAL (operands[2]);
> +  rtx perm[<ssescalarnum>];
> +
> +  int i;
> +  for (i = 0; i < <ssescalarnum>; i = i + 2)
> +    {
> +      perm[i]     = GEN_INT (((mask >> i)       & 1) + i);
> +      perm[i + 1] = GEN_INT (((mask >> (i + 1)) & 1) + i);
> +    }
> +
> +  operands[2]
> +    = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (<ssescalarnum>, perm));
> +})
> +
> +(define_expand "<sse2_avx_avx512f>_vpermil<mode><mask_name>"
> +  [(set (match_operand:VF1 0 "register_operand")
> +       (vec_select:VF1
> +         (match_operand:VF1 1 "nonimmediate_operand")
> +         (match_operand:SI 2 "const_0_to_255_operand")))]
> +  "TARGET_AVX && <mask_mode512bit_condition>"
> +{
> +  int mask = INTVAL (operands[2]);
> +  rtx perm[<ssescalarnum>];
> +
> +  int i;
> +  for (i = 0; i < <ssescalarnum>; i = i + 4)
> +    {
> +      perm[i]     = GEN_INT (((mask >> 0) & 3) + i);
> +      perm[i + 1] = GEN_INT (((mask >> 2) & 3) + i);
> +      perm[i + 2] = GEN_INT (((mask >> 4) & 3) + i);
> +      perm[i + 3] = GEN_INT (((mask >> 6) & 3) + i);
> +    }
> +
> +  operands[2]
> +    = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (<ssescalarnum>, perm));
> +})
> +
> +;; This pattern needs to come before the avx2_perm*/avx512f_perm*
> +;; patterns, as they have the same RTL representation (vpermilp*
> +;; being a subset of what vpermp* can do), but vpermilp* has shorter
> +;; latency as it never crosses lanes.
> +(define_insn "*<sse2_avx_avx512f>_vpermilp<mode><mask_name>"
> +  [(set (match_operand:VF 0 "register_operand" "=v")
> +       (vec_select:VF
> +         (match_operand:VF 1 "nonimmediate_operand" "vm")
> +         (match_parallel 2 ""
> +           [(match_operand 3 "const_int_operand")])))]
> +  "TARGET_AVX && <mask_mode512bit_condition>
> +   && avx_vpermilp_parallel (operands[2], <MODE>mode)"
> +{
> +  int mask = avx_vpermilp_parallel (operands[2], <MODE>mode) - 1;
> +  operands[2] = GEN_INT (mask);
> +  return "vpermil<ssemodesuffix>\t{%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2}";
> +}
> +  [(set_attr "type" "sselog")
> +   (set_attr "prefix_extra" "1")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "<mask_prefix>")
> +   (set_attr "mode" "<sseinsnmode>")])
> +
>  (define_expand "avx2_perm<mode>"
>    [(match_operand:VI8F_256 0 "register_operand")
>     (match_operand:VI8F_256 1 "nonimmediate_operand")
> @@ -20376,160 +20534,6 @@ (define_insn "avx512cd_maskw_vec_dup<mod
>     (set_attr "prefix" "evex")
>     (set_attr "mode" "XI")])
>
> -;; Recognize broadcast as a vec_select as produced by builtin_vec_perm.
> -;; If it so happens that the input is in memory, use vbroadcast.
> -;; Otherwise use vpermilp (and in the case of 256-bit modes, vperm2f128).
> -(define_insn "*avx_vperm_broadcast_v4sf"
> -  [(set (match_operand:V4SF 0 "register_operand" "=v,v,v")
> -       (vec_select:V4SF
> -         (match_operand:V4SF 1 "nonimmediate_operand" "m,o,v")
> -         (match_parallel 2 "avx_vbroadcast_operand"
> -           [(match_operand 3 "const_int_operand" "C,n,n")])))]
> -  "TARGET_AVX"
> -{
> -  int elt = INTVAL (operands[3]);
> -  switch (which_alternative)
> -    {
> -    case 0:
> -    case 1:
> -      operands[1] = adjust_address_nv (operands[1], SFmode, elt * 4);
> -      return "vbroadcastss\t{%1, %0|%0, %k1}";
> -    case 2:
> -      operands[2] = GEN_INT (elt * 0x55);
> -      return "vpermilps\t{%2, %1, %0|%0, %1, %2}";
> -    default:
> -      gcc_unreachable ();
> -    }
> -}
> -  [(set_attr "type" "ssemov,ssemov,sselog1")
> -   (set_attr "prefix_extra" "1")
> -   (set_attr "length_immediate" "0,0,1")
> -   (set_attr "prefix" "maybe_evex")
> -   (set_attr "mode" "SF,SF,V4SF")])
> -
> -(define_insn_and_split "*avx_vperm_broadcast_<mode>"
> -  [(set (match_operand:VF_256 0 "register_operand" "=v,v,v")
> -       (vec_select:VF_256
> -         (match_operand:VF_256 1 "nonimmediate_operand" "m,o,?v")
> -         (match_parallel 2 "avx_vbroadcast_operand"
> -           [(match_operand 3 "const_int_operand" "C,n,n")])))]
> -  "TARGET_AVX"
> -  "#"
> -  "&& reload_completed && (<MODE>mode != V4DFmode || !TARGET_AVX2)"
> -  [(set (match_dup 0) (vec_duplicate:VF_256 (match_dup 1)))]
> -{
> -  rtx op0 = operands[0], op1 = operands[1];
> -  int elt = INTVAL (operands[3]);
> -
> -  if (REG_P (op1))
> -    {
> -      int mask;
> -
> -      if (TARGET_AVX2 && elt == 0)
> -       {
> -         emit_insn (gen_vec_dup<mode> (op0, gen_lowpart (<ssescalarmode>mode,
> -                                                         op1)));
> -         DONE;
> -       }
> -
> -      /* Shuffle element we care about into all elements of the 128-bit lane.
> -        The other lane gets shuffled too, but we don't care.  */
> -      if (<MODE>mode == V4DFmode)
> -       mask = (elt & 1 ? 15 : 0);
> -      else
> -       mask = (elt & 3) * 0x55;
> -      emit_insn (gen_avx_vpermil<mode> (op0, op1, GEN_INT (mask)));
> -
> -      /* Shuffle the lane we care about into both lanes of the dest.  */
> -      mask = (elt / (<ssescalarnum> / 2)) * 0x11;
> -      if (EXT_REX_SSE_REG_P (op0))
> -       {
> -         /* There is no EVEX VPERM2F128, but we can use either VBROADCASTSS
> -            or VSHUFF128.  */
> -         gcc_assert (<MODE>mode == V8SFmode);
> -         if ((mask & 1) == 0)
> -           emit_insn (gen_avx2_vec_dupv8sf (op0,
> -                                            gen_lowpart (V4SFmode, op0)));
> -         else
> -           emit_insn (gen_avx512vl_shuf_f32x4_1 (op0, op0, op0,
> -                                                 GEN_INT (4), GEN_INT (5),
> -                                                 GEN_INT (6), GEN_INT (7),
> -                                                 GEN_INT (12), GEN_INT (13),
> -                                                 GEN_INT (14), GEN_INT (15)));
> -         DONE;
> -       }
> -
> -      emit_insn (gen_avx_vperm2f128<mode>3 (op0, op0, op0, GEN_INT (mask)));
> -      DONE;
> -    }
> -
> -  operands[1] = adjust_address (op1, <ssescalarmode>mode,
> -                               elt * GET_MODE_SIZE (<ssescalarmode>mode));
> -})
> -
> -(define_expand "<sse2_avx_avx512f>_vpermil<mode><mask_name>"
> -  [(set (match_operand:VF2 0 "register_operand")
> -       (vec_select:VF2
> -         (match_operand:VF2 1 "nonimmediate_operand")
> -         (match_operand:SI 2 "const_0_to_255_operand")))]
> -  "TARGET_AVX && <mask_mode512bit_condition>"
> -{
> -  int mask = INTVAL (operands[2]);
> -  rtx perm[<ssescalarnum>];
> -
> -  int i;
> -  for (i = 0; i < <ssescalarnum>; i = i + 2)
> -    {
> -      perm[i]     = GEN_INT (((mask >> i)       & 1) + i);
> -      perm[i + 1] = GEN_INT (((mask >> (i + 1)) & 1) + i);
> -    }
> -
> -  operands[2]
> -    = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (<ssescalarnum>, perm));
> -})
> -
> -(define_expand "<sse2_avx_avx512f>_vpermil<mode><mask_name>"
> -  [(set (match_operand:VF1 0 "register_operand")
> -       (vec_select:VF1
> -         (match_operand:VF1 1 "nonimmediate_operand")
> -         (match_operand:SI 2 "const_0_to_255_operand")))]
> -  "TARGET_AVX && <mask_mode512bit_condition>"
> -{
> -  int mask = INTVAL (operands[2]);
> -  rtx perm[<ssescalarnum>];
> -
> -  int i;
> -  for (i = 0; i < <ssescalarnum>; i = i + 4)
> -    {
> -      perm[i]     = GEN_INT (((mask >> 0) & 3) + i);
> -      perm[i + 1] = GEN_INT (((mask >> 2) & 3) + i);
> -      perm[i + 2] = GEN_INT (((mask >> 4) & 3) + i);
> -      perm[i + 3] = GEN_INT (((mask >> 6) & 3) + i);
> -    }
> -
> -  operands[2]
> -    = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (<ssescalarnum>, perm));
> -})
> -
> -(define_insn "*<sse2_avx_avx512f>_vpermilp<mode><mask_name>"
> -  [(set (match_operand:VF 0 "register_operand" "=v")
> -       (vec_select:VF
> -         (match_operand:VF 1 "nonimmediate_operand" "vm")
> -         (match_parallel 2 ""
> -           [(match_operand 3 "const_int_operand")])))]
> -  "TARGET_AVX && <mask_mode512bit_condition>
> -   && avx_vpermilp_parallel (operands[2], <MODE>mode)"
> -{
> -  int mask = avx_vpermilp_parallel (operands[2], <MODE>mode) - 1;
> -  operands[2] = GEN_INT (mask);
> -  return "vpermil<ssemodesuffix>\t{%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2}";
> -}
> -  [(set_attr "type" "sselog")
> -   (set_attr "prefix_extra" "1")
> -   (set_attr "length_immediate" "1")
> -   (set_attr "prefix" "<mask_prefix>")
> -   (set_attr "mode" "<sseinsnmode>")])
> -
>  (define_insn "<sse2_avx_avx512f>_vpermilvar<mode>3<mask_name>"
>    [(set (match_operand:VF 0 "register_operand" "=v")
>         (unspec:VF
> --- gcc/testsuite/gcc.target/i386/pr93395.c.jj  2020-01-23 19:33:06.649854297 +0100
> +++ gcc/testsuite/gcc.target/i386/pr93395.c     2020-01-23 19:33:06.648854311 +0100
> @@ -0,0 +1,44 @@
> +/* PR target/93395 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx512f -masm=att" } */
> +/* { dg-final { scan-assembler-times "vpermilpd\t.5, %ymm" 3 } } */
> +/* { dg-final { scan-assembler-times "vpermilpd\t.85, %zmm" 3 } } */
> +/* { dg-final { scan-assembler-not "vpermpd\t" } } */
> +
> +#include <immintrin.h>
> +
> +__m256d
> +foo1 (__m256d a)
> +{
> +  return _mm256_permute4x64_pd (a, 177);
> +}
> +
> +__m256d
> +foo2 (__m256d a)
> +{
> +  return _mm256_permute_pd (a, 5);
> +}
> +
> +__m256d
> +foo3 (__m256d a)
> +{
> +  return __builtin_shuffle (a, (__v4di) { 1, 0, 3, 2 });
> +}
> +
> +__m512d
> +foo4 (__m512d a)
> +{
> +  return _mm512_permutex_pd (a, 177);
> +}
> +
> +__m512d
> +foo5 (__m512d a)
> +{
> +  return _mm512_permute_pd (a, 85);
> +}
> +
> +__m512d
> +foo6 (__m512d a)
> +{
> +  return __builtin_shuffle (a, (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 });
> +}
> --- gcc/testsuite/gcc.target/i386/avx512vl-vpermilpdi-1.c.jj    2020-01-12 11:54:37.929390537 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512vl-vpermilpdi-1.c       2020-01-23 19:35:46.068553312 +0100
> @@ -1,7 +1,7 @@
>  /* { dg-do compile } */
>  /* { dg-options "-mavx512vl -O2" } */
> -/* { dg-final { scan-assembler-times "vpermilpd\[ \\t\]+\[^\{\n\]*13\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> -/* { dg-final { scan-assembler-times "vpermilpd\[ \\t\]+\[^\{\n\]*13\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "vpermilpd\[ \\t\]+\[^\{\n\]*13\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vpermilpd\[ \\t\]+\[^\{\n\]*13\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vpermilpd\[ \\t\]+\[^\{\n\]*3\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vpermilpd\[ \\t\]+\[^\{\n\]*3\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
>
>
>         Jakub
>

Patch
diff mbox series

--- gcc/config/i386/sse.md.jj	2020-01-23 19:24:14.851423969 +0100
+++ gcc/config/i386/sse.md	2020-01-23 19:41:58.729091766 +0100
@@ -19875,6 +19875,164 @@  (define_insn "<avx512>_permvar<mode><mas
    (set_attr "prefix" "<mask_prefix2>")
    (set_attr "mode" "<sseinsnmode>")])
 
+;; Recognize broadcast as a vec_select as produced by builtin_vec_perm.
+;; If it so happens that the input is in memory, use vbroadcast.
+;; Otherwise use vpermilp (and in the case of 256-bit modes, vperm2f128).
+(define_insn "*avx_vperm_broadcast_v4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=v,v,v")
+	(vec_select:V4SF
+	  (match_operand:V4SF 1 "nonimmediate_operand" "m,o,v")
+	  (match_parallel 2 "avx_vbroadcast_operand"
+	    [(match_operand 3 "const_int_operand" "C,n,n")])))]
+  "TARGET_AVX"
+{
+  int elt = INTVAL (operands[3]);
+  switch (which_alternative)
+    {
+    case 0:
+    case 1:
+      operands[1] = adjust_address_nv (operands[1], SFmode, elt * 4);
+      return "vbroadcastss\t{%1, %0|%0, %k1}";
+    case 2:
+      operands[2] = GEN_INT (elt * 0x55);
+      return "vpermilps\t{%2, %1, %0|%0, %1, %2}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type" "ssemov,ssemov,sselog1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "0,0,1")
+   (set_attr "prefix" "maybe_evex")
+   (set_attr "mode" "SF,SF,V4SF")])
+
+(define_insn_and_split "*avx_vperm_broadcast_<mode>"
+  [(set (match_operand:VF_256 0 "register_operand" "=v,v,v")
+	(vec_select:VF_256
+	  (match_operand:VF_256 1 "nonimmediate_operand" "m,o,?v")
+	  (match_parallel 2 "avx_vbroadcast_operand"
+	    [(match_operand 3 "const_int_operand" "C,n,n")])))]
+  "TARGET_AVX"
+  "#"
+  "&& reload_completed && (<MODE>mode != V4DFmode || !TARGET_AVX2)"
+  [(set (match_dup 0) (vec_duplicate:VF_256 (match_dup 1)))]
+{
+  rtx op0 = operands[0], op1 = operands[1];
+  int elt = INTVAL (operands[3]);
+
+  if (REG_P (op1))
+    {
+      int mask;
+
+      if (TARGET_AVX2 && elt == 0)
+	{
+	  emit_insn (gen_vec_dup<mode> (op0, gen_lowpart (<ssescalarmode>mode,
+							  op1)));
+	  DONE;
+	}
+
+      /* Shuffle element we care about into all elements of the 128-bit lane.
+	 The other lane gets shuffled too, but we don't care.  */
+      if (<MODE>mode == V4DFmode)
+	mask = (elt & 1 ? 15 : 0);
+      else
+	mask = (elt & 3) * 0x55;
+      emit_insn (gen_avx_vpermil<mode> (op0, op1, GEN_INT (mask)));
+
+      /* Shuffle the lane we care about into both lanes of the dest.  */
+      mask = (elt / (<ssescalarnum> / 2)) * 0x11;
+      if (EXT_REX_SSE_REG_P (op0))
+	{
+	  /* There is no EVEX VPERM2F128, but we can use either VBROADCASTSS
+	     or VSHUFF128.  */
+	  gcc_assert (<MODE>mode == V8SFmode);
+	  if ((mask & 1) == 0)
+	    emit_insn (gen_avx2_vec_dupv8sf (op0,
+					     gen_lowpart (V4SFmode, op0)));
+	  else
+	    emit_insn (gen_avx512vl_shuf_f32x4_1 (op0, op0, op0,
+						  GEN_INT (4), GEN_INT (5),
+						  GEN_INT (6), GEN_INT (7),
+						  GEN_INT (12), GEN_INT (13),
+						  GEN_INT (14), GEN_INT (15)));
+	  DONE;
+	}
+
+      emit_insn (gen_avx_vperm2f128<mode>3 (op0, op0, op0, GEN_INT (mask)));
+      DONE;
+    }
+
+  operands[1] = adjust_address (op1, <ssescalarmode>mode,
+				elt * GET_MODE_SIZE (<ssescalarmode>mode));
+})
+
+(define_expand "<sse2_avx_avx512f>_vpermil<mode><mask_name>"
+  [(set (match_operand:VF2 0 "register_operand")
+	(vec_select:VF2
+	  (match_operand:VF2 1 "nonimmediate_operand")
+	  (match_operand:SI 2 "const_0_to_255_operand")))]
+  "TARGET_AVX && <mask_mode512bit_condition>"
+{
+  int mask = INTVAL (operands[2]);
+  rtx perm[<ssescalarnum>];
+
+  int i;
+  for (i = 0; i < <ssescalarnum>; i = i + 2)
+    {
+      perm[i]     = GEN_INT (((mask >> i)       & 1) + i);
+      perm[i + 1] = GEN_INT (((mask >> (i + 1)) & 1) + i);
+    }
+
+  operands[2]
+    = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (<ssescalarnum>, perm));
+})
+
+(define_expand "<sse2_avx_avx512f>_vpermil<mode><mask_name>"
+  [(set (match_operand:VF1 0 "register_operand")
+	(vec_select:VF1
+	  (match_operand:VF1 1 "nonimmediate_operand")
+	  (match_operand:SI 2 "const_0_to_255_operand")))]
+  "TARGET_AVX && <mask_mode512bit_condition>"
+{
+  int mask = INTVAL (operands[2]);
+  rtx perm[<ssescalarnum>];
+
+  int i;
+  for (i = 0; i < <ssescalarnum>; i = i + 4)
+    {
+      perm[i]     = GEN_INT (((mask >> 0) & 3) + i);
+      perm[i + 1] = GEN_INT (((mask >> 2) & 3) + i);
+      perm[i + 2] = GEN_INT (((mask >> 4) & 3) + i);
+      perm[i + 3] = GEN_INT (((mask >> 6) & 3) + i);
+    }
+
+  operands[2]
+    = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (<ssescalarnum>, perm));
+})
+
+;; This pattern needs to come before the avx2_perm*/avx512f_perm*
+;; patterns, as they have the same RTL representation (vpermilp*
+;; being a subset of what vpermp* can do), but vpermilp* has shorter
+;; latency as it never crosses lanes.
+(define_insn "*<sse2_avx_avx512f>_vpermilp<mode><mask_name>"
+  [(set (match_operand:VF 0 "register_operand" "=v")
+	(vec_select:VF
+	  (match_operand:VF 1 "nonimmediate_operand" "vm")
+	  (match_parallel 2 ""
+	    [(match_operand 3 "const_int_operand")])))]
+  "TARGET_AVX && <mask_mode512bit_condition>
+   && avx_vpermilp_parallel (operands[2], <MODE>mode)"
+{
+  int mask = avx_vpermilp_parallel (operands[2], <MODE>mode) - 1;
+  operands[2] = GEN_INT (mask);
+  return "vpermil<ssemodesuffix>\t{%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "<mask_prefix>")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_expand "avx2_perm<mode>"
   [(match_operand:VI8F_256 0 "register_operand")
    (match_operand:VI8F_256 1 "nonimmediate_operand")
@@ -20376,160 +20534,6 @@  (define_insn "avx512cd_maskw_vec_dup<mod
    (set_attr "prefix" "evex")
    (set_attr "mode" "XI")])
 
-;; Recognize broadcast as a vec_select as produced by builtin_vec_perm.
-;; If it so happens that the input is in memory, use vbroadcast.
-;; Otherwise use vpermilp (and in the case of 256-bit modes, vperm2f128).
-(define_insn "*avx_vperm_broadcast_v4sf"
-  [(set (match_operand:V4SF 0 "register_operand" "=v,v,v")
-	(vec_select:V4SF
-	  (match_operand:V4SF 1 "nonimmediate_operand" "m,o,v")
-	  (match_parallel 2 "avx_vbroadcast_operand"
-	    [(match_operand 3 "const_int_operand" "C,n,n")])))]
-  "TARGET_AVX"
-{
-  int elt = INTVAL (operands[3]);
-  switch (which_alternative)
-    {
-    case 0:
-    case 1:
-      operands[1] = adjust_address_nv (operands[1], SFmode, elt * 4);
-      return "vbroadcastss\t{%1, %0|%0, %k1}";
-    case 2:
-      operands[2] = GEN_INT (elt * 0x55);
-      return "vpermilps\t{%2, %1, %0|%0, %1, %2}";
-    default:
-      gcc_unreachable ();
-    }
-}
-  [(set_attr "type" "ssemov,ssemov,sselog1")
-   (set_attr "prefix_extra" "1")
-   (set_attr "length_immediate" "0,0,1")
-   (set_attr "prefix" "maybe_evex")
-   (set_attr "mode" "SF,SF,V4SF")])
-
-(define_insn_and_split "*avx_vperm_broadcast_<mode>"
-  [(set (match_operand:VF_256 0 "register_operand" "=v,v,v")
-	(vec_select:VF_256
-	  (match_operand:VF_256 1 "nonimmediate_operand" "m,o,?v")
-	  (match_parallel 2 "avx_vbroadcast_operand"
-	    [(match_operand 3 "const_int_operand" "C,n,n")])))]
-  "TARGET_AVX"
-  "#"
-  "&& reload_completed && (<MODE>mode != V4DFmode || !TARGET_AVX2)"
-  [(set (match_dup 0) (vec_duplicate:VF_256 (match_dup 1)))]
-{
-  rtx op0 = operands[0], op1 = operands[1];
-  int elt = INTVAL (operands[3]);
-
-  if (REG_P (op1))
-    {
-      int mask;
-
-      if (TARGET_AVX2 && elt == 0)
-	{
-	  emit_insn (gen_vec_dup<mode> (op0, gen_lowpart (<ssescalarmode>mode,
-							  op1)));
-	  DONE;
-	}
-
-      /* Shuffle element we care about into all elements of the 128-bit lane.
-	 The other lane gets shuffled too, but we don't care.  */
-      if (<MODE>mode == V4DFmode)
-	mask = (elt & 1 ? 15 : 0);
-      else
-	mask = (elt & 3) * 0x55;
-      emit_insn (gen_avx_vpermil<mode> (op0, op1, GEN_INT (mask)));
-
-      /* Shuffle the lane we care about into both lanes of the dest.  */
-      mask = (elt / (<ssescalarnum> / 2)) * 0x11;
-      if (EXT_REX_SSE_REG_P (op0))
-	{
-	  /* There is no EVEX VPERM2F128, but we can use either VBROADCASTSS
-	     or VSHUFF128.  */
-	  gcc_assert (<MODE>mode == V8SFmode);
-	  if ((mask & 1) == 0)
-	    emit_insn (gen_avx2_vec_dupv8sf (op0,
-					     gen_lowpart (V4SFmode, op0)));
-	  else
-	    emit_insn (gen_avx512vl_shuf_f32x4_1 (op0, op0, op0,
-						  GEN_INT (4), GEN_INT (5),
-						  GEN_INT (6), GEN_INT (7),
-						  GEN_INT (12), GEN_INT (13),
-						  GEN_INT (14), GEN_INT (15)));
-	  DONE;
-	}
-
-      emit_insn (gen_avx_vperm2f128<mode>3 (op0, op0, op0, GEN_INT (mask)));
-      DONE;
-    }
-
-  operands[1] = adjust_address (op1, <ssescalarmode>mode,
-				elt * GET_MODE_SIZE (<ssescalarmode>mode));
-})
-
-(define_expand "<sse2_avx_avx512f>_vpermil<mode><mask_name>"
-  [(set (match_operand:VF2 0 "register_operand")
-	(vec_select:VF2
-	  (match_operand:VF2 1 "nonimmediate_operand")
-	  (match_operand:SI 2 "const_0_to_255_operand")))]
-  "TARGET_AVX && <mask_mode512bit_condition>"
-{
-  int mask = INTVAL (operands[2]);
-  rtx perm[<ssescalarnum>];
-
-  int i;
-  for (i = 0; i < <ssescalarnum>; i = i + 2)
-    {
-      perm[i]     = GEN_INT (((mask >> i)       & 1) + i);
-      perm[i + 1] = GEN_INT (((mask >> (i + 1)) & 1) + i);
-    }
-
-  operands[2]
-    = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (<ssescalarnum>, perm));
-})
-
-(define_expand "<sse2_avx_avx512f>_vpermil<mode><mask_name>"
-  [(set (match_operand:VF1 0 "register_operand")
-	(vec_select:VF1
-	  (match_operand:VF1 1 "nonimmediate_operand")
-	  (match_operand:SI 2 "const_0_to_255_operand")))]
-  "TARGET_AVX && <mask_mode512bit_condition>"
-{
-  int mask = INTVAL (operands[2]);
-  rtx perm[<ssescalarnum>];
-
-  int i;
-  for (i = 0; i < <ssescalarnum>; i = i + 4)
-    {
-      perm[i]     = GEN_INT (((mask >> 0) & 3) + i);
-      perm[i + 1] = GEN_INT (((mask >> 2) & 3) + i);
-      perm[i + 2] = GEN_INT (((mask >> 4) & 3) + i);
-      perm[i + 3] = GEN_INT (((mask >> 6) & 3) + i);
-    }
-
-  operands[2]
-    = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (<ssescalarnum>, perm));
-})
-
-(define_insn "*<sse2_avx_avx512f>_vpermilp<mode><mask_name>"
-  [(set (match_operand:VF 0 "register_operand" "=v")
-	(vec_select:VF
-	  (match_operand:VF 1 "nonimmediate_operand" "vm")
-	  (match_parallel 2 ""
-	    [(match_operand 3 "const_int_operand")])))]
-  "TARGET_AVX && <mask_mode512bit_condition>
-   && avx_vpermilp_parallel (operands[2], <MODE>mode)"
-{
-  int mask = avx_vpermilp_parallel (operands[2], <MODE>mode) - 1;
-  operands[2] = GEN_INT (mask);
-  return "vpermil<ssemodesuffix>\t{%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2}";
-}
-  [(set_attr "type" "sselog")
-   (set_attr "prefix_extra" "1")
-   (set_attr "length_immediate" "1")
-   (set_attr "prefix" "<mask_prefix>")
-   (set_attr "mode" "<sseinsnmode>")])
-
 (define_insn "<sse2_avx_avx512f>_vpermilvar<mode>3<mask_name>"
   [(set (match_operand:VF 0 "register_operand" "=v")
 	(unspec:VF
--- gcc/testsuite/gcc.target/i386/pr93395.c.jj	2020-01-23 19:33:06.649854297 +0100
+++ gcc/testsuite/gcc.target/i386/pr93395.c	2020-01-23 19:33:06.648854311 +0100
@@ -0,0 +1,44 @@ 
+/* PR target/93395 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -masm=att" } */
+/* { dg-final { scan-assembler-times "vpermilpd\t.5, %ymm" 3 } } */
+/* { dg-final { scan-assembler-times "vpermilpd\t.85, %zmm" 3 } } */
+/* { dg-final { scan-assembler-not "vpermpd\t" } } */
+
+#include <immintrin.h>
+
+__m256d
+foo1 (__m256d a)
+{
+  return _mm256_permute4x64_pd (a, 177);
+}
+
+__m256d
+foo2 (__m256d a)
+{
+  return _mm256_permute_pd (a, 5);
+}
+
+__m256d
+foo3 (__m256d a)
+{
+  return __builtin_shuffle (a, (__v4di) { 1, 0, 3, 2 });
+}
+
+__m512d
+foo4 (__m512d a)
+{
+  return _mm512_permutex_pd (a, 177);
+}
+
+__m512d
+foo5 (__m512d a)
+{
+  return _mm512_permute_pd (a, 85);
+}
+
+__m512d
+foo6 (__m512d a)
+{
+  return __builtin_shuffle (a, (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 });
+}
--- gcc/testsuite/gcc.target/i386/avx512vl-vpermilpdi-1.c.jj	2020-01-12 11:54:37.929390537 +0100
+++ gcc/testsuite/gcc.target/i386/avx512vl-vpermilpdi-1.c	2020-01-23 19:35:46.068553312 +0100
@@ -1,7 +1,7 @@ 
 /* { dg-do compile } */
 /* { dg-options "-mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "vpermilpd\[ \\t\]+\[^\{\n\]*13\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vpermilpd\[ \\t\]+\[^\{\n\]*13\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vpermilpd\[ \\t\]+\[^\{\n\]*13\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vpermilpd\[ \\t\]+\[^\{\n\]*13\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vpermilpd\[ \\t\]+\[^\{\n\]*3\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vpermilpd\[ \\t\]+\[^\{\n\]*3\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */