diff mbox

[2/2,x86] Add palignr support for AVX2.

Message ID CAOvf_xzXNYBAAMdZr8d-6PLnQnvJyZaDaZ7LSXnoBDy7opmuPw@mail.gmail.com
State New
Headers show

Commit Message

Evgeny Stupachenko April 29, 2014, 5:13 p.m. UTC
Thanks. The path is fixed according to your comments:


On Tue, Apr 29, 2014 at 6:50 PM, Richard Henderson <rth@redhat.com> wrote:
> On 04/29/2014 06:50 AM, Evgeny Stupachenko wrote:
>> +  if (d->one_operand_p != true)
>> +    return false;
>
> This looks odd.  Better as !d->one_operand_p.
>
>> +
>> +  /* For an in order permutation with one operand like: {5 6 7 0 1 2 3 4}
>> +     PALIGNR is better than PSHUFB.  Check for an order in permutation.  */
>
> FWIW, "in order permutation" sounds like a contradiction in terms.
> Better to describe this as a rotate.
>
>> +  in_order_length = 0;
>> +  in_order_length_max = 0;
>> +  if (d->one_operand_p == true)
>
> You've just tested one_operand above.
>
>> +    for (i = 0; i < 2 * nelt; ++i)
>
> Which means that 2*nelt is doing twice as much work as needed.
>
>> +      {
>> +       if ((d->perm[(i + 1) & (nelt - 1)] -
>> +            d->perm[i & (nelt - 1)]) != 1)
>
> Surely we can avoid re-reading the comparison value...
>
>   next = (d->perm[0] + 1) & (nelt - 1);
>   for (i = 1; i < nelt; ++i)
>     {
>       if (d->perm[i] != next)
>         return false;
>       next = (next + 1) & (nelt - 1);
>     }
>
>> +         {
>> +           if (in_order_length > in_order_length_max)
>> +               in_order_length_max = in_order_length;
>> +             in_order_length = 0;
>> +         }
>> +       else
>> +         in_order_length++;
>> +      }
>> +
>> +  /* If not an ordered permutation then try something else.  */
>> +  if (in_order_length_max != nelt - 1)
>> +    return false;
>
> I don't understand what this length and max stuff is trying to accomplish.
>
>> +
>> +  min = d->perm[0];
>> +
>> +  shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
>> +  shift1 = GEN_INT ((min - nelt / 2) *
>> +          GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
>> +
>> +  if (GET_MODE_SIZE (d->vmode) != 32)
>
> Positive tests are almost always clearer: == 16.
>
>
> r~

Comments

Richard Henderson April 29, 2014, 5:39 p.m. UTC | #1
On 04/29/2014 10:13 AM, Evgeny Stupachenko wrote:
> +  /* For a rotaion permutation with one operand like: {5 6 7 0 1 2 3 4}
> +     PALIGNR is better than PSHUFB.  Check for a rotation in permutation.  */
> +  for (i = 0; i < nelt; ++i)
> +    if ((((d->perm[(i + 1) & (nelt - 1)] - d->perm[i])) & (nelt - 1)) != 1)
> +      return false;
> +
> +  min = d->perm[0];

Why are you running this loop NELT times instead of NELT-1 like I suggested?
Why is that test expression so complicated?

Obviously d->perm[0] is the anchor and everything else can be computed relative
to that.  I'd expect no more than

  min = d->perm[0];
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != ((min + i) & (nelt - 1)))
      return false;

Now that I think of it,

> +  /* PALIGNR of 2 128-bits registers takes only 1 instrucion.
> +     Requires SSSE3.  */
> +  if (GET_MODE_SIZE (d->vmode) == 16)
> +    {
> +      if (!TARGET_SSSE3)
> +       return false;
> +    }
> +  /* PALIGNR of 2 256-bits registers on AVX2 costs only 2 instructions:
> +     PERM and PALIGNR.  It is more profitable than 2 PSHUFB and PERM.  */
> +  else if (GET_MODE_SIZE (d->vmode) == 32)
> +    {
> +      if (!TARGET_AVX2)
> +       return false;
> +    }
> +  else
> +    return false;
> +
> +  if (!d->one_operand_p)
> +    return false;

The comments are wrong.  Move the operand_p test to the top,
where it should be, and they're more obviously wrong:

  "must have one operand"
  "palignr of two operands..."

I think your comment

  /* For a rotaion permutation with one operand like: {5 6 7 0 1 2 3 4}
     we want to use PALIGNR.  */

belongs up there instead of those two incorrect comments.



r~
Evgeny Stupachenko April 30, 2014, 2:25 p.m. UTC | #2
On Tue, Apr 29, 2014 at 9:39 PM, Richard Henderson <rth@redhat.com> wrote:
> On 04/29/2014 10:13 AM, Evgeny Stupachenko wrote:
>> +  /* For a rotaion permutation with one operand like: {5 6 7 0 1 2 3 4}
>> +     PALIGNR is better than PSHUFB.  Check for a rotation in permutation.  */
>> +  for (i = 0; i < nelt; ++i)
>> +    if ((((d->perm[(i + 1) & (nelt - 1)] - d->perm[i])) & (nelt - 1)) != 1)
>> +      return false;
>> +
>> +  min = d->perm[0];
>
> Why are you running this loop NELT times instead of NELT-1 like I suggested?
> Why is that test expression so complicated?
>
> Obviously d->perm[0] is the anchor and everything else can be computed relative
> to that.  I'd expect no more than
>
>   min = d->perm[0];
>   for (i = 1; i < nelt; ++i)
>     if (d->perm[i] != ((min + i) & (nelt - 1)))
>       return false;

Agree on this. The loop is less complicated.

>
> Now that I think of it,
>
>> +  /* PALIGNR of 2 128-bits registers takes only 1 instrucion.
>> +     Requires SSSE3.  */
>> +  if (GET_MODE_SIZE (d->vmode) == 16)
>> +    {
>> +      if (!TARGET_SSSE3)
>> +       return false;
>> +    }
>> +  /* PALIGNR of 2 256-bits registers on AVX2 costs only 2 instructions:
>> +     PERM and PALIGNR.  It is more profitable than 2 PSHUFB and PERM.  */
>> +  else if (GET_MODE_SIZE (d->vmode) == 32)
>> +    {
>> +      if (!TARGET_AVX2)
>> +       return false;
>> +    }
>> +  else
>> +    return false;
>> +
>> +  if (!d->one_operand_p)
>> +    return false;
>
> The comments are wrong.  Move the operand_p test to the top,
> where it should be, and they're more obviously wrong:
>
>   "must have one operand"
>   "palignr of two operands..."
>
> I think your comment
>
>   /* For a rotaion permutation with one operand like: {5 6 7 0 1 2 3 4}
>      we want to use PALIGNR.  */
>
> belongs up there instead of those two incorrect comments.
>

What I mean in the comments
  16 bytes case:
    For 1 operand permutation
      Rotation will cost "palignr" which is better than "pshufb" as
has smaller opcode (6 vs 9) and always costs 1 tick (pshufb takes 3-5
ticks on some x86 archs).
    For 2 operands permutation
      If "palignr" is applicable it reduces instructions from: "2
pshufb and or" to "palignr and pshufb". Profitable for the same
reasons as above.
  32 bytes case:
    For 1 operand permutation
      Rotation will cost only 2 instruction "palignr and perm" which
is better than "2 pshufb and perm".
    For 2 operands permutation
      If palignr is applicable it reduces instructions from: "4 pshufb
2 perm and or" to "palignr, 2 pshufb, perm and or" and profitable for
the same reasons as above.

So the final reason is the same for 1 and 2 operands case. However I
agree to extend the comments as they are not clear.
Maybe we should unite one and two operand case into 1 function? I can
submit such patch when patch 1/2 is committed.

Thanks,
Evgeny

>
>
> r~
diff mbox

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 002d295..aa6372a 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -42807,6 +42807,79 @@  expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
   return true;
 }

+/* A subroutine of ix86_expand_vec_perm_1.  Try to use just palignr
+   instruction for one operand permutation.  This is better than pshufb
+   as does not require to pass big constant and faster on some x86
+   architectures.  */
+
+static bool
+expand_vec_perm_palignr_one_operand (struct expand_vec_perm_d *d)
+{
+  unsigned i, nelt = d->nelt;
+  unsigned min;
+  rtx shift, shift1, target, tmp;
+
+  /* PALIGNR of 2 128-bits registers takes only 1 instrucion.
+     Requires SSSE3.  */
+  if (GET_MODE_SIZE (d->vmode) == 16)
+    {
+      if (!TARGET_SSSE3)
+       return false;
+    }
+  /* PALIGNR of 2 256-bits registers on AVX2 costs only 2 instructions:
+     PERM and PALIGNR.  It is more profitable than 2 PSHUFB and PERM.  */
+  else if (GET_MODE_SIZE (d->vmode) == 32)
+    {
+      if (!TARGET_AVX2)
+       return false;
+    }
+  else
+    return false;
+
+  if (!d->one_operand_p)
+    return false;
+
+  /* For a rotaion permutation with one operand like: {5 6 7 0 1 2 3 4}
+     PALIGNR is better than PSHUFB.  Check for a rotation in permutation.  */
+  for (i = 0; i < nelt; ++i)
+    if ((((d->perm[(i + 1) & (nelt - 1)] - d->perm[i])) & (nelt - 1)) != 1)
+      return false;
+
+  min = d->perm[0];
+  shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
+  shift1 = GEN_INT ((min - nelt / 2) *
+          GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
+
+  if (GET_MODE_SIZE (d->vmode) == 16)
+    {
+      target = gen_reg_rtx (TImode);
+      emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
+                                     gen_lowpart (TImode, d->op0), shift));
+    }
+  else
+    {
+      target = gen_reg_rtx (V2TImode);
+      tmp = gen_reg_rtx (V4DImode);
+      emit_insn (gen_avx2_permv2ti (tmp,
+                                   gen_lowpart (V4DImode, d->op0),
+                                   gen_lowpart (V4DImode, d->op1),
+                                   GEN_INT (33)));
+      if (min < nelt / 2)
+        emit_insn (gen_avx2_palignrv2ti (target,
+                                        gen_lowpart (V2TImode, tmp),
+                                        gen_lowpart (V2TImode, d->op0),
+                                        shift));
+      else
+       emit_insn (gen_avx2_palignrv2ti (target,
+                                        gen_lowpart (V2TImode, d->op1),
+                                        gen_lowpart (V2TImode, tmp),
+                                        shift1));
+    }
+  emit_move_insn (d->target, gen_lowpart (d->vmode, target));
+
+  return true;
+}
+
 static bool expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d);

 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to instantiate D
@@ -42943,6 +43016,10 @@  expand_vec_perm_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_vpermil (d))
     return true;

+  /* Try palignr on one operand.  */
+  if (expand_vec_perm_palignr_one_operand (d))
+    return true;
+
   /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
      vpshufb, vpermd, vpermps or vpermq variable permutation.  */
   if (expand_vec_perm_pshufb (d))