diff mbox

[2/2,x86] Add palignr support for AVX2.

Message ID CAOvf_xwRM17xGzaLoqxHXJ9U=iWJMq26ZyC4f1sp3_TUctnTVA@mail.gmail.com
State New
Headers show

Commit Message

Evgeny Stupachenko May 5, 2014, 4:54 p.m. UTC
Assuming first part of the patch is committed. Is the following patch
ok? It passes bootstrap and make check.


On Wed, Apr 30, 2014 at 6:25 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
> On Tue, Apr 29, 2014 at 9:39 PM, Richard Henderson <rth@redhat.com> wrote:
>> On 04/29/2014 10:13 AM, Evgeny Stupachenko wrote:
>>> +  /* For a rotaion permutation with one operand like: {5 6 7 0 1 2 3 4}
>>> +     PALIGNR is better than PSHUFB.  Check for a rotation in permutation.  */
>>> +  for (i = 0; i < nelt; ++i)
>>> +    if ((((d->perm[(i + 1) & (nelt - 1)] - d->perm[i])) & (nelt - 1)) != 1)
>>> +      return false;
>>> +
>>> +  min = d->perm[0];
>>
>> Why are you running this loop NELT times instead of NELT-1 like I suggested?
>> Why is that test expression so complicated?
>>
>> Obviously d->perm[0] is the anchor and everything else can be computed relative
>> to that.  I'd expect no more than
>>
>>   min = d->perm[0];
>>   for (i = 1; i < nelt; ++i)
>>     if (d->perm[i] != ((min + i) & (nelt - 1)))
>>       return false;
>
> Agree on this. The loop is less complicated.
>
>>
>> Now that I think of it,
>>
>>> +  /* PALIGNR of 2 128-bits registers takes only 1 instrucion.
>>> +     Requires SSSE3.  */
>>> +  if (GET_MODE_SIZE (d->vmode) == 16)
>>> +    {
>>> +      if (!TARGET_SSSE3)
>>> +       return false;
>>> +    }
>>> +  /* PALIGNR of 2 256-bits registers on AVX2 costs only 2 instructions:
>>> +     PERM and PALIGNR.  It is more profitable than 2 PSHUFB and PERM.  */
>>> +  else if (GET_MODE_SIZE (d->vmode) == 32)
>>> +    {
>>> +      if (!TARGET_AVX2)
>>> +       return false;
>>> +    }
>>> +  else
>>> +    return false;
>>> +
>>> +  if (!d->one_operand_p)
>>> +    return false;
>>
>> The comments are wrong.  Move the operand_p test to the top,
>> where it should be, and they're more obviously wrong:
>>
>>   "must have one operand"
>>   "palignr of two operands..."
>>
>> I think your comment
>>
>>   /* For a rotaion permutation with one operand like: {5 6 7 0 1 2 3 4}
>>      we want to use PALIGNR.  */
>>
>> belongs up there instead of those two incorrect comments.
>>
>
> What I mean in the comments
>   16 bytes case:
>     For 1 operand permutation
>       Rotation will cost "palignr" which is better than "pshufb" as
> has smaller opcode (6 vs 9) and always costs 1 tick (pshufb takes 3-5
> ticks on some x86 archs).
>     For 2 operands permutation
>       If "palignr" is applicable it reduces instructions from: "2
> pshufb and or" to "palignr and pshufb". Profitable for the same
> reasons as above.
>   32 bytes case:
>     For 1 operand permutation
>       Rotation will cost only 2 instruction "palignr and perm" which
> is better than "2 pshufb and perm".
>     For 2 operands permutation
>       If palignr is applicable it reduces instructions from: "4 pshufb
> 2 perm and or" to "palignr, 2 pshufb, perm and or" and profitable for
> the same reasons as above.
>
> So the final reason is the same for 1 and 2 operands case. However I
> agree to extend the comments as they are not clear.
> Maybe we should unite one and two operand case into 1 function? I can
> submit such patch when patch 1/2 is committed.
>
> Thanks,
> Evgeny
>
>>
>>
>> r~

Comments

Evgeny Stupachenko May 19, 2014, 10:43 a.m. UTC | #1
Ping.

On Mon, May 5, 2014 at 8:54 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
> Assuming first part of the patch is committed. Is the following patch
> ok? It passes bootstrap and make check.
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 91f6f21..475448e 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -42808,6 +42808,7 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
>  }
>
>  static bool expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d);
> +static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d);
>
>  /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to instantiate D
>     in a single instruction.  */
> @@ -42943,6 +42944,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
>    if (expand_vec_perm_vpermil (d))
>      return true;
>
> +  /* Try palignr on one operand.  */
> +  if (d->one_operand_p && expand_vec_perm_palignr (d))
> +    return true;
> +
>    /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
>       vpshufb, vpermd, vpermps or vpermq variable permutation.  */
>    if (expand_vec_perm_pshufb (d))
> @@ -43040,22 +43045,36 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d)
>    else
>      return false;
>
> -  min = 2 * nelt, max = 0;
> -  for (i = 0; i < nelt; ++i)
> +  /* For a rotaion permutation with one operand like: {5 6 7 0 1 2 3 4}
> +     PALIGNR is better than any other permutaion expand.
> +     Check for a rotation in permutation.  */
> +  if (d->one_operand_p)
>      {
> -      unsigned e = d->perm[i];
> -      if (e < min)
> -       min = e;
> -      if (e > max)
> -       max = e;
> +      min = d->perm[0];
> +      for (i = 1; i < nelt; ++i)
> +       if (d->perm[i] != ((min + i) & (nelt - 1)))
> +         return false;
>      }
> -  if (min == 0 || max - min >= nelt)
> -    return false;
> +  /* For a 2 operand permutaion we check if elements fit within one vector.  */
> +  else
> +    {
> +      min = 2 * nelt, max = 0;
> +      for (i = 0; i < nelt; ++i)
> +       {
> +         unsigned e = d->perm[i];
> +         if (e < min)
> +           min = e;
> +         if (e > max)
> +           max = e;
> +       }
> +      if (min == 0 || max - min >= nelt)
> +       return false;
>
> -  /* Given that we have SSSE3, we know we'll be able to implement the
> -     single operand permutation after the palignr with pshufb.  */
> -  if (d->testing_p)
> -    return true;
> +      /* Given that we have SSSE3, we know we'll be able to implement the
> +        single operand permutation after the palignr with pshufb.  */
> +      if (d->testing_p)
> +       return true;
> +    }
>
>    dcopy = *d;
>    shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
> @@ -43089,6 +43108,14 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d)
>      }
>
>
> +  /* For a rotaion permutation with one operand like: {5 6 7 0 1 2 3 4}
> +     the rest single operand permutation is just move.  */
> +  if (d->one_operand_p)
> +    {
> +      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
> +      return true;
> +    }
> +
>    dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
>    dcopy.one_operand_p = true;
>
>
> On Wed, Apr 30, 2014 at 6:25 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
>> On Tue, Apr 29, 2014 at 9:39 PM, Richard Henderson <rth@redhat.com> wrote:
>>> On 04/29/2014 10:13 AM, Evgeny Stupachenko wrote:
>>>> +  /* For a rotaion permutation with one operand like: {5 6 7 0 1 2 3 4}
>>>> +     PALIGNR is better than PSHUFB.  Check for a rotation in permutation.  */
>>>> +  for (i = 0; i < nelt; ++i)
>>>> +    if ((((d->perm[(i + 1) & (nelt - 1)] - d->perm[i])) & (nelt - 1)) != 1)
>>>> +      return false;
>>>> +
>>>> +  min = d->perm[0];
>>>
>>> Why are you running this loop NELT times instead of NELT-1 like I suggested?
>>> Why is that test expression so complicated?
>>>
>>> Obviously d->perm[0] is the anchor and everything else can be computed relative
>>> to that.  I'd expect no more than
>>>
>>>   min = d->perm[0];
>>>   for (i = 1; i < nelt; ++i)
>>>     if (d->perm[i] != ((min + i) & (nelt - 1)))
>>>       return false;
>>
>> Agree on this. The loop is less complicated.
>>
>>>
>>> Now that I think of it,
>>>
>>>> +  /* PALIGNR of 2 128-bits registers takes only 1 instrucion.
>>>> +     Requires SSSE3.  */
>>>> +  if (GET_MODE_SIZE (d->vmode) == 16)
>>>> +    {
>>>> +      if (!TARGET_SSSE3)
>>>> +       return false;
>>>> +    }
>>>> +  /* PALIGNR of 2 256-bits registers on AVX2 costs only 2 instructions:
>>>> +     PERM and PALIGNR.  It is more profitable than 2 PSHUFB and PERM.  */
>>>> +  else if (GET_MODE_SIZE (d->vmode) == 32)
>>>> +    {
>>>> +      if (!TARGET_AVX2)
>>>> +       return false;
>>>> +    }
>>>> +  else
>>>> +    return false;
>>>> +
>>>> +  if (!d->one_operand_p)
>>>> +    return false;
>>>
>>> The comments are wrong.  Move the operand_p test to the top,
>>> where it should be, and they're more obviously wrong:
>>>
>>>   "must have one operand"
>>>   "palignr of two operands..."
>>>
>>> I think your comment
>>>
>>>   /* For a rotaion permutation with one operand like: {5 6 7 0 1 2 3 4}
>>>      we want to use PALIGNR.  */
>>>
>>> belongs up there instead of those two incorrect comments.
>>>
>>
>> What I mean in the comments
>>   16 bytes case:
>>     For 1 operand permutation
>>       Rotation will cost "palignr" which is better than "pshufb" as
>> has smaller opcode (6 vs 9) and always costs 1 tick (pshufb takes 3-5
>> ticks on some x86 archs).
>>     For 2 operands permutation
>>       If "palignr" is applicable it reduces instructions from: "2
>> pshufb and or" to "palignr and pshufb". Profitable for the same
>> reasons as above.
>>   32 bytes case:
>>     For 1 operand permutation
>>       Rotation will cost only 2 instruction "palignr and perm" which
>> is better than "2 pshufb and perm".
>>     For 2 operands permutation
>>       If palignr is applicable it reduces instructions from: "4 pshufb
>> 2 perm and or" to "palignr, 2 pshufb, perm and or" and profitable for
>> the same reasons as above.
>>
>> So the final reason is the same for 1 and 2 operands case. However I
>> agree to extend the comments as they are not clear.
>> Maybe we should unite one and two operand case into 1 function? I can
>> submit such patch when patch 1/2 is committed.
>>
>> Thanks,
>> Evgeny
>>
>>>
>>>
>>> r~
Richard Henderson May 19, 2014, 4 p.m. UTC | #2
On 05/05/2014 09:54 AM, Evgeny Stupachenko wrote:
> @@ -42943,6 +42944,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
>    if (expand_vec_perm_vpermil (d))
>      return true;
> 
> +  /* Try palignr on one operand.  */
> +  if (d->one_operand_p && expand_vec_perm_palignr (d))
> +    return true;

No, because unless in_order and SSSE3, expand_vec_perm_palignr generates at
least 2 insns, and by contract expand_vec_perm_1 must generate only one.

I think what might help you out is to have the rotate permutation matched
directly, rather than have to have it converted to a shift.

Thus I think you'd do well to start this series with a patch that adds a
pattern of the form

(define_insn "*ssse3_palignr<mode>_perm"
  [(set (match_operand:V_128 0 "register_operand" "=x,x")
        (vec_select:V_128
           (match_operand:V_128 1 "register_operand" "0,x")
           (match_operand:V_128 2 "nonimmediate_operand" "xm,xm")
           (match_parallel 3 "palign_operand"
             [(match_operand 4 "const_int_operand" "")]
  "TARGET_SSSE3"
{
  enum machine_mode imode = GET_INNER_MODE (GET_MODE (operands[0]));
  operands[3] = GEN_INT (INTVAL (operands[4]) * GET_MODE_SIZE (imode));

  switch (which_alternative)
    {
    case 0:
      return "palignr\t{%3, %2, %0|%0, %2, %3}";
    case 1:
      return "vpalignr\t{%3, %2, %1, %0|%0, %1, %2, %3}";
    default:
      gcc_unreachable ();
    }
}
  [(set_attr "isa" "noavx,avx")
   (set_attr "type" "sseishft")
   (set_attr "atom_unit" "sishuf")
   (set_attr "prefix_data16" "1,*")
   (set_attr "prefix_extra" "1")
   (set_attr "length_immediate" "1")
   (set_attr "prefix" "orig,vex")])

where the palign_operand function verifies that the constants are all in order.
 This is very similar to the way we define the broadcast type patterns.

You'll need a similar pattern with a different predicate for the avx2 palignr,
since it's not a simple increment, but also verifying the cross-lane constraint.

With that as patch 1/1, I believe that will significantly tidy up what else
you're attempting to change with this series.



r~
diff mbox

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 91f6f21..475448e 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -42808,6 +42808,7 @@  expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
 }

 static bool expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d);
+static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d);

 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to instantiate D
    in a single instruction.  */
@@ -42943,6 +42944,10 @@  expand_vec_perm_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_vpermil (d))
     return true;

+  /* Try palignr on one operand.  */
+  if (d->one_operand_p && expand_vec_perm_palignr (d))
+    return true;
+
   /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
      vpshufb, vpermd, vpermps or vpermq variable permutation.  */
   if (expand_vec_perm_pshufb (d))
@@ -43040,22 +43045,36 @@  expand_vec_perm_palignr (struct expand_vec_perm_d *d)
   else
     return false;

-  min = 2 * nelt, max = 0;
-  for (i = 0; i < nelt; ++i)
+  /* For a rotaion permutation with one operand like: {5 6 7 0 1 2 3 4}
+     PALIGNR is better than any other permutaion expand.
+     Check for a rotation in permutation.  */
+  if (d->one_operand_p)
     {
-      unsigned e = d->perm[i];
-      if (e < min)
-       min = e;
-      if (e > max)
-       max = e;
+      min = d->perm[0];
+      for (i = 1; i < nelt; ++i)
+       if (d->perm[i] != ((min + i) & (nelt - 1)))
+         return false;
     }
-  if (min == 0 || max - min >= nelt)
-    return false;
+  /* For a 2 operand permutaion we check if elements fit within one vector.  */
+  else
+    {
+      min = 2 * nelt, max = 0;
+      for (i = 0; i < nelt; ++i)
+       {
+         unsigned e = d->perm[i];
+         if (e < min)
+           min = e;
+         if (e > max)
+           max = e;
+       }
+      if (min == 0 || max - min >= nelt)
+       return false;

-  /* Given that we have SSSE3, we know we'll be able to implement the
-     single operand permutation after the palignr with pshufb.  */
-  if (d->testing_p)
-    return true;
+      /* Given that we have SSSE3, we know we'll be able to implement the
+        single operand permutation after the palignr with pshufb.  */
+      if (d->testing_p)
+       return true;
+    }

   dcopy = *d;
   shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
@@ -43089,6 +43108,14 @@  expand_vec_perm_palignr (struct expand_vec_perm_d *d)
     }


+  /* For a rotaion permutation with one operand like: {5 6 7 0 1 2 3 4}
+     the rest single operand permutation is just move.  */
+  if (d->one_operand_p)
+    {
+      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
+      return true;
+    }
+
   dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
   dcopy.one_operand_p = true;