[6/6] arm: Implement vec_perm and vec_perm_const for NEON.

Message ID: 1323378383-9824-7-git-send-email-rth@redhat.com
State: New

Commit Message

Richard Henderson Dec. 8, 2011, 9:06 p.m. UTC
---
 gcc/config/arm/arm-protos.h           |    3 +
 gcc/config/arm/arm.c                  |  527 ++++++++++++++++++++++++++++++++-
 gcc/config/arm/neon.md                |   59 ++++
 gcc/config/arm/vec-common.md          |   26 ++
 gcc/testsuite/lib/target-supports.exp |    9 +-
 5 files changed, 620 insertions(+), 4 deletions(-)

Comments

Ramana Radhakrishnan Dec. 9, 2011, 6:02 p.m. UTC | #1
On 8 December 2011 21:06, Richard Henderson <rth@redhat.com> wrote:
> ---
>  gcc/config/arm/arm-protos.h           |    3 +
>  gcc/config/arm/arm.c                  |  527 ++++++++++++++++++++++++++++++++-
>  gcc/config/arm/neon.md                |   59 ++++
>  gcc/config/arm/vec-common.md          |   26 ++
>  gcc/testsuite/lib/target-supports.exp |    9 +-
>  5 files changed, 620 insertions(+), 4 deletions(-)

I haven't been following the vector permute work closely, and I must
say I haven't yet read this patch series in great detail.

For Neon, a further optimization to consider might be to use the vext
instruction, which can implement permute masks that are monotonically
increasing constants.  While I expect the latency of a vext or a vtbl
instruction to be about the same (your mileage may vary depending on
the core), using vext gives us the freedom of not needing a register
for the permute mask:

a = vec_shuffle (b, c, mask), where mask is { n + 7, n + 6, n + 5, n + 4,
n + 3, n + 2, n + 1, n }, could just be vext.8 A, B, C, #n

If the mask being provided is the reverse of the mask above, it's
probably not worth it.
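
For concreteness, here is a sketch of that correspondence using GCC's
generic __builtin_shuffle, with hypothetical values (n = 3, eight byte
elements); note that the selector below is written in GCC's low-to-high
lane order, whereas the masks above appear to be listed high to low:

/* Illustration only -- the function and values are hypothetical.
   The selector picks b[3..7] followed by c[0..2], which is exactly
   what "vext.8 <a>, <b>, <c>, #3" produces, with no register needed
   to hold the mask.  */
typedef unsigned char v8qi __attribute__ ((vector_size (8)));

v8qi
ext3 (v8qi b, v8qi c)
{
  const v8qi mask = { 3, 4, 5, 6, 7, 8, 9, 10 };
  return __builtin_shuffle (b, c, mask);
}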


Additionally, can we also detect rotate rights?  Unless of course
there's a different interface:

   a = vec_shuffle (vec, {0, 7, 6, 5, 4, 3, 2, 1}) => vext.8 a, vec, vec, #1


Masks doing rotate lefts are probably not worth it in this case.

regards,
Ramana

Michael Meissner Dec. 9, 2011, 6:17 p.m. UTC | #2
On Fri, Dec 09, 2011 at 06:02:21PM +0000, Ramana Radhakrishnan wrote:
> I haven't been following the vector permute work closely, and I must
> say I haven't yet read this patch series in great detail.
> 
> For Neon, a further optimization to consider might be to use the vext
> instruction, which can implement permute masks that are monotonically
> increasing constants.  While I expect the latency of a vext or a vtbl
> instruction to be about the same (your mileage may vary depending on
> the core), using vext gives us the freedom of not needing a register
> for the permute mask:
> 
> a = vec_shuffle (b, c, mask), where mask is { n + 7, n + 6, n + 5, n + 4,
> n + 3, n + 2, n + 1, n }, could just be vext.8 A, B, C, #n
> 
> If the mask being provided is the reverse of the mask above, it's
> probably not worth it.
> 
> 
> Additionally, can we also detect rotate rights?  Unless of course
> there's a different interface:
> 
>    a = vec_shuffle (vec, {0, 7, 6, 5, 4, 3, 2, 1}) => vext.8 a, vec, vec, #1
> 
> 
> Masks doing rotate lefts are probably not worth it in this case.

Richard and I were discussing this last night on IRC, and it is certainly
possible.  Somebody would just have to write a predicate to recognize the
case.  We do wonder how frequently it will occur, and whether people doing
this would just use a whole-vector shift instead of a shuffle.
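
For illustration, such a predicate might look like the following — a
sketch only, written in the style of the arm_evpc_neon_* routines in the
patch.  The name arm_evpc_neon_vext and the use of gen_neon_vextv8qi are
assumptions, not part of this patch; it would be tried from
arm_expand_vec_perm_const_1 alongside the vuzp/vzip/vrev/vtrn recognizers.

/* Sketch: recognize masks expressible as a single VEXT.  Both the
   function and the generator name are hypothetical.  */

static bool
arm_evpc_neon_vext (struct expand_vec_perm_d *d)
{
  unsigned int i, nelt = d->nelt, ofs = d->perm[0];

  /* VEXT #ofs selects NELT consecutive elements starting at OFS from
     the concatenation of the two inputs.  OFS == 0 is a plain move.  */
  if (ofs == 0 || ofs >= nelt)
    return false;

  for (i = 1; i < nelt; ++i)
    {
      unsigned int elt = ofs + i;
      /* With one input vector, VEXT of a register with itself is the
         rotate case: indices wrap modulo NELT.  */
      if (d->one_vector_p)
        elt &= (nelt - 1);
      if (d->perm[i] != elt)
        return false;
    }

  if (d->testing_p)
    return true;

  /* A real implementation would switch on d->vmode as the other
     recognizers do; V8QImode stands in for the general case here.  */
  emit_insn (gen_neon_vextv8qi (d->target, d->op0, d->op1,
                                GEN_INT (ofs)));
  return true;
}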
Richard Henderson Dec. 9, 2011, 7:38 p.m. UTC | #3
On 12/09/2011 10:02 AM, Ramana Radhakrishnan wrote:
> For Neon, a further optimization to consider might be to use the vext
> instruction, which can implement permute masks that are monotonically
> increasing constants.  While I expect the latency of a vext or a vtbl
> instruction to be about the same (your mileage may vary depending on
> the core), using vext gives us the freedom of not needing a register
> for the permute mask:
> 
> a = vec_shuffle (b, c, mask), where mask is { n + 7, n + 6, n + 5, n + 4,
> n + 3, n + 2, n + 1, n }, could just be vext.8 A, B, C, #n

Good to know.  I missed that one in my reading of the manual.

> Additionally, can we also detect rotate rights?  Unless of course
> there's a different interface:
> 
>    a = vec_shuffle (vec, {0, 7, 6, 5, 4, 3, 2, 1}) => vext.8 a, vec, vec, #1

Certainly we can.


r~

Patch

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 296550a..8c3e412 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -244,4 +244,7 @@  extern const struct tune_params *current_tune;
 extern int vfp3_const_double_for_fract_bits (rtx);
 #endif /* RTX_CODE */
 
+extern void arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel);
+extern bool arm_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel);
+
 #endif /* ! GCC_ARM_PROTOS_H */
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 65b4e9d..0395a41 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -267,6 +267,9 @@  static unsigned int arm_autovectorize_vector_sizes (void);
 static int arm_default_branch_cost (bool, bool);
 static int arm_cortex_a5_branch_cost (bool, bool);
 
+static bool arm_vectorize_vec_perm_const_ok (enum machine_mode vmode,
+					     const unsigned char *sel);
+
 
 /* Table of machine attributes.  */
 static const struct attribute_spec arm_attribute_table[] =
@@ -604,6 +607,10 @@  static const struct attribute_spec arm_attribute_table[] =
 #define TARGET_PREFERRED_RENAME_CLASS \
   arm_preferred_rename_class
 
+#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
+#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
+  arm_vectorize_vec_perm_const_ok
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 /* Obstack for minipool constant handling.  */
@@ -25064,6 +25071,524 @@  vfp3_const_double_for_fract_bits (rtx operand)
     }
   return 0;
 }
+
+#define MAX_VECT_LEN 16
 
-#include "gt-arm.h"
+struct expand_vec_perm_d
+{
+  rtx target, op0, op1;
+  unsigned char perm[MAX_VECT_LEN];
+  enum machine_mode vmode;
+  unsigned char nelt;
+  bool one_vector_p;
+  bool testing_p;
+};
+
+/* Generate a variable permutation.  */
+
+static void
+arm_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
+{
+  enum machine_mode vmode = GET_MODE (target);
+  bool one_vector_p = rtx_equal_p (op0, op1);
+
+  gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
+  gcc_checking_assert (GET_MODE (op0) == vmode);
+  gcc_checking_assert (GET_MODE (op1) == vmode);
+  gcc_checking_assert (GET_MODE (sel) == vmode);
+  gcc_checking_assert (TARGET_NEON);
+
+  if (one_vector_p)
+    {
+      if (vmode == V8QImode)
+	emit_insn (gen_neon_vtbl1v8qi (target, op0, sel));
+      else
+	emit_insn (gen_neon_vtbl1v16qi (target, op0, sel));
+    }
+  else
+    {
+      enum machine_mode mode1, mode2;
+      rtx pair, part;
+
+      if (vmode == V8QImode)
+	mode1 = DImode, mode2 = TImode;
+      else
+	mode1 = TImode, mode2 = OImode;
+
+      pair = gen_reg_rtx (mode2);
+      emit_insn (gen_rtx_CLOBBER (VOIDmode, pair));
+
+      part = simplify_gen_subreg (mode1, pair, mode2,
+				  subreg_lowpart_offset (mode1, mode2));
+      emit_move_insn (part, gen_lowpart (mode1, op0));
+
+      part = simplify_gen_subreg (mode1, pair, mode2,
+				  subreg_highpart_offset (mode1, mode2));
+      emit_move_insn (part, gen_lowpart (mode1, op1));
+
+      if (vmode == V8QImode)
+	emit_insn (gen_neon_vtbl2v8qi (target, pair, sel));
+      else
+	emit_insn (gen_neon_vtbl2v16qi (target, pair, sel));
+    }
+}
+
+void
+arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
+{
+  enum machine_mode vmode = GET_MODE (target);
+  unsigned int i, nelt = GET_MODE_NUNITS (vmode);
+  bool one_vector_p = rtx_equal_p (op0, op1);
+  rtx rmask[MAX_VECT_LEN], mask;
+
+  /* TODO: ARM's VTBL indexing is little-endian.  In order to handle GCC's
+     numbering of elements for big-endian, we must reverse the order.  */
+  gcc_checking_assert (!BYTES_BIG_ENDIAN);
+
+  /* The VTBL instruction does not use a modulo index, so we must take care
+     of that ourselves.  */
+  mask = GEN_INT (one_vector_p ? nelt - 1 : 2 * nelt - 1);
+  for (i = 0; i < nelt; ++i)
+    rmask[i] = mask;
+  mask = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rmask));
+  sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
+
+  arm_expand_vec_perm_1 (target, op0, op1, sel);
+}
+
+/* Generate or test for an insn that supports a constant permutation.  */
+
+/* Recognize patterns for the VUZP insns.  */
+
+static bool
+arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
+{
+  unsigned int i, odd, mask, nelt = d->nelt;
+  rtx out0, out1, in0, in1, x;
+  rtx (*gen)(rtx, rtx, rtx, rtx);
+
+  if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
+    return false;
+
+  /* Note that these are little-endian tests.  Adjust for big-endian later.  */
+  if (d->perm[0] == 0)
+    odd = 0;
+  else if (d->perm[0] == 1)
+    odd = 1;
+  else
+    return false;
+  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
+
+  for (i = 0; i < nelt; i++)
+    {
+      unsigned elt = (i * 2 + odd) & mask;
+      if (d->perm[i] != elt)
+	return false;
+    }
+
+  /* Success!  */
+  if (d->testing_p)
+    return true;
+
+  switch (d->vmode)
+    {
+    case V16QImode: gen = gen_neon_vuzpv16qi_internal; break;
+    case V8QImode:  gen = gen_neon_vuzpv8qi_internal;  break;
+    case V8HImode:  gen = gen_neon_vuzpv8hi_internal;  break;
+    case V4HImode:  gen = gen_neon_vuzpv4hi_internal;  break;
+    case V4SImode:  gen = gen_neon_vuzpv4si_internal;  break;
+    case V2SImode:  gen = gen_neon_vuzpv2si_internal;  break;
+    case V2SFmode:  gen = gen_neon_vuzpv2sf_internal;  break;
+    case V4SFmode:  gen = gen_neon_vuzpv4sf_internal;  break;
+    default:
+      gcc_unreachable ();
+    }
+
+  in0 = d->op0;
+  in1 = d->op1;
+  if (BYTES_BIG_ENDIAN)
+    {
+      x = in0, in0 = in1, in1 = x;
+      odd = !odd;
+    }
+
+  out0 = d->target;
+  out1 = gen_reg_rtx (d->vmode);
+  if (odd)
+    x = out0, out0 = out1, out1 = x;
+
+  emit_insn (gen (out0, in0, in1, out1));
+  return true;
+}
+
+/* Recognize patterns for the VZIP insns.  */
+
+static bool
+arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
+{
+  unsigned int i, high, mask, nelt = d->nelt;
+  rtx out0, out1, in0, in1, x;
+  rtx (*gen)(rtx, rtx, rtx, rtx);
+
+  if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
+    return false;
+
+  /* Note that these are little-endian tests.  Adjust for big-endian later.  */
+  high = nelt / 2;
+  if (d->perm[0] == high)
+    ;
+  else if (d->perm[0] == 0)
+    high = 0;
+  else
+    return false;
+  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
+
+  for (i = 0; i < nelt / 2; i++)
+    {
+      unsigned elt = (i + high) & mask;
+      if (d->perm[i * 2] != elt)
+	return false;
+      elt = (elt + nelt) & mask;
+      if (d->perm[i * 2 + 1] != elt)
+	return false;
+    }
+
+  /* Success!  */
+  if (d->testing_p)
+    return true;
+
+  switch (d->vmode)
+    {
+    case V16QImode: gen = gen_neon_vzipv16qi_internal; break;
+    case V8QImode:  gen = gen_neon_vzipv8qi_internal;  break;
+    case V8HImode:  gen = gen_neon_vzipv8hi_internal;  break;
+    case V4HImode:  gen = gen_neon_vzipv4hi_internal;  break;
+    case V4SImode:  gen = gen_neon_vzipv4si_internal;  break;
+    case V2SImode:  gen = gen_neon_vzipv2si_internal;  break;
+    case V2SFmode:  gen = gen_neon_vzipv2sf_internal;  break;
+    case V4SFmode:  gen = gen_neon_vzipv4sf_internal;  break;
+    default:
+      gcc_unreachable ();
+    }
+
+  in0 = d->op0;
+  in1 = d->op1;
+  if (BYTES_BIG_ENDIAN)
+    {
+      x = in0, in0 = in1, in1 = x;
+      high = !high;
+    }
+
+  out0 = d->target;
+  out1 = gen_reg_rtx (d->vmode);
+  if (high)
+    x = out0, out0 = out1, out1 = x;
+
+  emit_insn (gen (out0, in0, in1, out1));
+  return true;
+}
+
+/* Recognize patterns for the VREV insns.  */
+
+static bool
+arm_evpc_neon_vrev (struct expand_vec_perm_d *d)
+{
+  unsigned int i, j, diff, nelt = d->nelt;
+  rtx (*gen)(rtx, rtx, rtx);
+
+  if (!d->one_vector_p)
+    return false;
+
+  diff = d->perm[0];
+  switch (diff)
+    {
+    case 7:
+      switch (d->vmode)
+	{
+	case V16QImode: gen = gen_neon_vrev64v16qi; break;
+	case V8QImode:  gen = gen_neon_vrev64v8qi;  break;
+	default:
+	  return false;
+	}
+      break;
+    case 3:
+      switch (d->vmode)
+	{
+	case V16QImode: gen = gen_neon_vrev32v16qi; break;
+	case V8QImode:  gen = gen_neon_vrev32v8qi;  break;
+	case V8HImode:  gen = gen_neon_vrev64v8hi;  break;
+	case V4HImode:  gen = gen_neon_vrev64v4hi;  break;
+	default:
+	  return false;
+	}
+      break;
+    case 1:
+      switch (d->vmode)
+	{
+	case V16QImode: gen = gen_neon_vrev16v16qi; break;
+	case V8QImode:  gen = gen_neon_vrev16v8qi;  break;
+	case V8HImode:  gen = gen_neon_vrev32v8hi;  break;
+	case V4HImode:  gen = gen_neon_vrev32v4hi;  break;
+	case V4SImode:  gen = gen_neon_vrev64v4si;  break;
+	case V2SImode:  gen = gen_neon_vrev64v2si;  break;
+	case V4SFmode:  gen = gen_neon_vrev64v4sf;  break;
+	case V2SFmode:  gen = gen_neon_vrev64v2sf;  break;
+	default:
+	  return false;
+	}
+      break;
+    default:
+      return false;
+    }
+
+  for (i = 0; i < nelt; i += diff)
+    for (j = 0; j <= diff; j += 1)
+      if (d->perm[i + j] != i + diff - j)
+	return false;
+
+  /* Success! */
+  if (d->testing_p)
+    return true;
+
+  /* ??? The third operand is an artifact of the builtin infrastructure
+     and is ignored by the actual instruction.  */
+  emit_insn (gen (d->target, d->op0, const0_rtx));
+  return true;
+}
+
+/* Recognize patterns for the VTRN insns.  */
+
+static bool
+arm_evpc_neon_vtrn (struct expand_vec_perm_d *d)
+{
+  unsigned int i, odd, nelt = d->nelt;
+  rtx out0, out1, in0, in1, x;
+  rtx (*gen)(rtx, rtx, rtx, rtx);
+
+  if (d->one_vector_p)
+    return false;
+  if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
+    return false;
+
+  /* Note that these are little-endian tests.  Adjust for big-endian later.  */
+  if (d->perm[0] == 0)
+    odd = 0;
+  else if (d->perm[0] == 1)
+    odd = 1;
+  else
+    return false;
+
+  for (i = 0; i < nelt; i += 2)
+    {
+      if (d->perm[i] != i + odd)
+	return false;
+      if (d->perm[i + 1] != i + nelt + odd)
+	return false;
+    }
+
+  /* Success!  */
+  if (d->testing_p)
+    return true;
+
+  switch (d->vmode)
+    {
+    case V16QImode: gen = gen_neon_vtrnv16qi_internal; break;
+    case V8QImode:  gen = gen_neon_vtrnv8qi_internal;  break;
+    case V8HImode:  gen = gen_neon_vtrnv8hi_internal;  break;
+    case V4HImode:  gen = gen_neon_vtrnv4hi_internal;  break;
+    case V4SImode:  gen = gen_neon_vtrnv4si_internal;  break;
+    case V2SImode:  gen = gen_neon_vtrnv2si_internal;  break;
+    case V2SFmode:  gen = gen_neon_vtrnv2sf_internal;  break;
+    case V4SFmode:  gen = gen_neon_vtrnv4sf_internal;  break;
+    default:
+      gcc_unreachable ();
+    }
+
+  in0 = d->op0;
+  in1 = d->op1;
+  if (BYTES_BIG_ENDIAN)
+    {
+      x = in0, in0 = in1, in1 = x;
+      odd = !odd;
+    }
+
+  out0 = d->target;
+  out1 = gen_reg_rtx (d->vmode);
+  if (odd)
+    x = out0, out0 = out1, out1 = x;
+
+  emit_insn (gen (out0, in0, in1, out1));
+  return true;
+}
+
+/* The NEON VTBL instruction is a fully variable permutation that's even
+   stronger than what we expose via VEC_PERM_EXPR.  What it doesn't do
+   is mask the index operand as VEC_PERM_EXPR requires.  Therefore we
+   can do slightly better by expanding this as a constant where we don't
+   have to apply a mask.  */
+
+static bool
+arm_evpc_neon_vtbl (struct expand_vec_perm_d *d)
+{
+  rtx rperm[MAX_VECT_LEN], sel;
+  enum machine_mode vmode = d->vmode;
+  unsigned int i, nelt = d->nelt;
 
+  /* TODO: ARM's VTBL indexing is little-endian.  In order to handle GCC's
+     numbering of elements for big-endian, we must reverse the order.  */
+  if (BYTES_BIG_ENDIAN)
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  /* Generic code will try constant permutation twice.  Once with the
+     original mode and again with the elements lowered to QImode.
+     So wait and don't do the selector expansion ourselves.  */
+  if (vmode != V8QImode && vmode != V16QImode)
+    return false;
+
+  for (i = 0; i < nelt; ++i)
+    rperm[i] = GEN_INT (d->perm[i]);
+  sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
+  sel = force_reg (vmode, sel);
+
+  arm_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
+  return true;
+}
+
+static bool
+arm_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
+{
+  /* The pattern matching functions above are written to look for a small
+     number to begin the sequence (0, 1, N/2).  If we begin with an index
+     from the second operand, we can swap the operands.  */
+  if (d->perm[0] >= d->nelt)
+    {
+      unsigned i, nelt = d->nelt;
+      rtx x;
+
+      for (i = 0; i < nelt; ++i)
+	d->perm[i] = (d->perm[i] + nelt) & (2 * nelt - 1);
+
+      x = d->op0;
+      d->op0 = d->op1;
+      d->op1 = x;
+    }
+
+  if (TARGET_NEON)
+    {
+      if (arm_evpc_neon_vuzp (d))
+	return true;
+      if (arm_evpc_neon_vzip (d))
+	return true;
+      if (arm_evpc_neon_vrev (d))
+	return true;
+      if (arm_evpc_neon_vtrn (d))
+	return true;
+      return arm_evpc_neon_vtbl (d);
+    }
+  return false;
+}
+
+/* Expand a vec_perm_const pattern.  */
+
+bool
+arm_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
+{
+  struct expand_vec_perm_d d;
+  int i, nelt, which;
+
+  d.target = target;
+  d.op0 = op0;
+  d.op1 = op1;
+
+  d.vmode = GET_MODE (target);
+  gcc_assert (VECTOR_MODE_P (d.vmode));
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = false;
+
+  for (i = which = 0; i < nelt; ++i)
+    {
+      rtx e = XVECEXP (sel, 0, i);
+      int ei = INTVAL (e) & (2 * nelt - 1);
+      which |= (ei < nelt ? 1 : 2);
+      d.perm[i] = ei;
+    }
+
+  switch (which)
+    {
+    default:
+      gcc_unreachable();
+
+    case 3:
+      d.one_vector_p = false;
+      if (!rtx_equal_p (op0, op1))
+	break;
+
+      /* The elements of PERM do not suggest that only the first operand
+	 is used, but both operands are identical.  Allow easier matching
+	 of the permutation by folding the permutation into the single
+	 input vector.  */
+      /* FALLTHRU */
+    case 2:
+      for (i = 0; i < nelt; ++i)
+        d.perm[i] &= nelt - 1;
+      d.op0 = op1;
+      d.one_vector_p = true;
+      break;
+
+    case 1:
+      d.op1 = op0;
+      d.one_vector_p = true;
+      break;
+    }
+
+  return arm_expand_vec_perm_const_1 (&d);
+}
+
+/* Implement TARGET_VECTORIZE_VEC_PERM_CONST_OK.  */
+
+static bool
+arm_vectorize_vec_perm_const_ok (enum machine_mode vmode,
+				 const unsigned char *sel)
+{
+  struct expand_vec_perm_d d;
+  unsigned int i, nelt, which;
+  bool ret;
+
+  d.vmode = vmode;
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = true;
+  memcpy (d.perm, sel, nelt);
+
+  /* Categorize the set of elements in the selector.  */
+  for (i = which = 0; i < nelt; ++i)
+    {
+      unsigned char e = d.perm[i];
+      gcc_assert (e < 2 * nelt);
+      which |= (e < nelt ? 1 : 2);
+    }
+
+  /* For all elements from second vector, fold the elements to first.  */
+  if (which == 2)
+    for (i = 0; i < nelt; ++i)
+      d.perm[i] -= nelt;
+
+  /* Check whether the mask can be applied to the vector type.  */
+  d.one_vector_p = (which != 3);
+
+  d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
+  d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
+  if (!d.one_vector_p)
+    d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
+
+  start_sequence ();
+  ret = arm_expand_vec_perm_const_1 (&d);
+  end_sequence ();
+
+  return ret;
+}
+
+
+#include "gt-arm.h"
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 94e0a5f..bd68d39 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -3876,6 +3876,65 @@ 
   [(set_attr "neon_type" "neon_bp_3cycle")]
 )
 
+;; These two are used by the vec_perm infrastructure for V16QImode.
+(define_insn_and_split "neon_vtbl1v16qi"
+  [(set (match_operand:V16QI 0 "s_register_operand" "=w")
+	(unspec:V16QI [(match_operand:V16QI 1 "s_register_operand" "w")
+		       (match_operand:V16QI 2 "s_register_operand" "w")]
+		      UNSPEC_VTBL))]
+  "TARGET_NEON"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rtx op0, op1, op2, part0, part2;
+  unsigned ofs;
+
+  op0 = operands[0];
+  op1 = gen_lowpart (TImode, operands[1]);
+  op2 = operands[2];
+
+  ofs = subreg_lowpart_offset (V8QImode, V16QImode);
+  part0 = simplify_subreg (V8QImode, op0, V16QImode, ofs);
+  part2 = simplify_subreg (V8QImode, op2, V16QImode, ofs);
+  emit_insn (gen_neon_vtbl2v8qi (part0, op1, part2));
+
+  ofs = subreg_highpart_offset (V8QImode, V16QImode);
+  part0 = simplify_subreg (V8QImode, op0, V16QImode, ofs);
+  part2 = simplify_subreg (V8QImode, op2, V16QImode, ofs);
+  emit_insn (gen_neon_vtbl2v8qi (part0, op1, part2));
+  DONE;
+})
+
+(define_insn_and_split "neon_vtbl2v16qi"
+  [(set (match_operand:V16QI 0 "s_register_operand" "=w")
+	(unspec:V16QI [(match_operand:OI 1 "s_register_operand" "w")
+		       (match_operand:V16QI 2 "s_register_operand" "w")]
+		      UNSPEC_VTBL))]
+  "TARGET_NEON"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rtx op0, op1, op2, part0, part2;
+  unsigned ofs;
+
+  op0 = operands[0];
+  op1 = operands[1];
+  op2 = operands[2];
+
+  ofs = subreg_lowpart_offset (V8QImode, V16QImode);
+  part0 = simplify_subreg (V8QImode, op0, V16QImode, ofs);
+  part2 = simplify_subreg (V8QImode, op2, V16QImode, ofs);
+  emit_insn (gen_neon_vtbl2v8qi (part0, op1, part2));
+
+  ofs = subreg_highpart_offset (V8QImode, V16QImode);
+  part0 = simplify_subreg (V8QImode, op0, V16QImode, ofs);
+  part2 = simplify_subreg (V8QImode, op2, V16QImode, ofs);
+  emit_insn (gen_neon_vtbl2v8qi (part0, op1, part2));
+  DONE;
+})
+
 (define_insn "neon_vtbx1v8qi"
   [(set (match_operand:V8QI 0 "s_register_operand" "=w")
 	(unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "0")
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index c27c414..eb29900 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -108,3 +108,29 @@ 
    || (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (<MODE>mode))"
 {
 })
+
+(define_expand "vec_perm_const<mode>"
+  [(match_operand:VALL 0 "s_register_operand" "")
+   (match_operand:VALL 1 "s_register_operand" "")
+   (match_operand:VALL 2 "s_register_operand" "")
+   (match_operand:<V_cmp_result> 3 "" "")]
+  "TARGET_NEON
+   || (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (<MODE>mode))"
+{
+  if (arm_expand_vec_perm_const (operands[0], operands[1],
+				 operands[2], operands[3]))
+    DONE;
+  else
+    FAIL;
+})
+
+(define_expand "vec_perm<mode>"
+  [(match_operand:VE 0 "s_register_operand" "")
+   (match_operand:VE 1 "s_register_operand" "")
+   (match_operand:VE 2 "s_register_operand" "")
+   (match_operand:VE 3 "s_register_operand" "")]
+  "TARGET_NEON && !BYTES_BIG_ENDIAN"
+{
+  arm_expand_vec_perm (operands[0], operands[1], operands[2], operands[3]);
+  DONE;
+})
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 78223af..d99a0b3 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -2725,7 +2725,8 @@  proc check_effective_target_vect_perm { } {
         verbose "check_effective_target_vect_perm: using cached result" 2
     } else {
         set et_vect_perm_saved 0
-        if { [istarget powerpc*-*-*]
+        if { [is-effective-target arm_neon_ok]
+	     || [istarget powerpc*-*-*]
              || [istarget spu-*-*]
 	     || [istarget i?86-*-*]
 	     || [istarget x86_64-*-*] } {
@@ -2748,7 +2749,8 @@  proc check_effective_target_vect_perm_byte { } {
         verbose "check_effective_target_vect_perm_byte: using cached result" 2
     } else {
         set et_vect_perm_byte_saved 0
-        if { [istarget powerpc*-*-*]
+        if { [is-effective-target arm_neon_ok]
+	     || [istarget powerpc*-*-*]
              || [istarget spu-*-*] } {
             set et_vect_perm_byte_saved 1
         }
@@ -2769,7 +2771,8 @@  proc check_effective_target_vect_perm_short { } {
         verbose "check_effective_target_vect_perm_short: using cached result" 2
     } else {
         set et_vect_perm_short_saved 0
-        if { [istarget powerpc*-*-*]
+        if { [is-effective-target arm_neon_ok]
+	     || [istarget powerpc*-*-*]
              || [istarget spu-*-*] } {
             set et_vect_perm_short_saved 1
         }
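
For reference, a sketch of the kind of source these patterns handle,
using GCC's generic vector extension; the function and selector are
illustrative examples, not taken from the testsuite:

/* Hypothetical example: a constant byte shuffle whose selector matches
   one of the recognized patterns is emitted as a single instruction.
   This selector is the low-half interleave that arm_evpc_neon_vzip
   recognizes, so it should become a vzip.8 rather than a vtbl.  */
typedef unsigned char v8qi __attribute__ ((vector_size (8)));

v8qi
interleave_lo (v8qi a, v8qi b)
{
  const v8qi sel = { 0, 8, 1, 9, 2, 10, 3, 11 };
  return __builtin_shuffle (a, b, sel);
}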