diff mbox

[1/2,x86] Add palignr support for AVX2.

Message ID CAOvf_xxq6PNN2KLpA8yoB2jjgmy-UAOvij_ghgsKPkZnsu2Rkg@mail.gmail.com
State New
Headers show

Commit Message

Evgeny Stupachenko May 5, 2014, 4:49 p.m. UTC
Is the following patch ok? It passes bootstrap and make check.

    chance to succeed.  */
@@ -43015,14 +43021,26 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d)
   unsigned i, nelt = d->nelt;
   unsigned min, max;
   bool in_order, ok;
-  rtx shift, target;
+  rtx shift, shift1, target, tmp;
   struct expand_vec_perm_d dcopy;

-  /* Even with AVX, palignr only operates on 128-bit vectors.  */
-  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
+  /* SSSE3 is required to apply PALIGNR on 16 bytes operands.  */
+  if (GET_MODE_SIZE (d->vmode) == 16)
+    {
+      if (!TARGET_SSSE3)
+       return false;
+    }
+  /* AVX2 is required to apply PALIGNR on 32 bytes operands.  */
+  else if (GET_MODE_SIZE (d->vmode) == 32)
+    {
+      if (!TARGET_AVX2)
+       return false;
+    }
+  /* Other sizes are not supported.  */
+  else
     return false;

-  min = nelt, max = 0;
+  min = 2 * nelt, max = 0;
   for (i = 0; i < nelt; ++i)
     {
       unsigned e = d->perm[i];
@@ -43041,9 +43059,35 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d)

   dcopy = *d;
   shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
-  target = gen_reg_rtx (TImode);
-  emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
-                                 gen_lowpart (TImode, d->op0), shift));
+  shift1 = GEN_INT ((min - nelt / 2) *
+          GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
+
+  if (GET_MODE_SIZE (d->vmode) != 32)
+    {
+      target = gen_reg_rtx (TImode);
+      emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
+                                     gen_lowpart (TImode, d->op0), shift));
+    }
+  else
+    {
+      target = gen_reg_rtx (V2TImode);
+      tmp = gen_reg_rtx (V4DImode);
+      emit_insn (gen_avx2_permv2ti (tmp,
+                                   gen_lowpart (V4DImode, d->op0),
+                                   gen_lowpart (V4DImode, d->op1),
+                                   GEN_INT (33)));
+      if (min < nelt / 2)
+        emit_insn (gen_avx2_palignrv2ti (target,
+                                        gen_lowpart (V2TImode, tmp),
+                                        gen_lowpart (V2TImode, d->op0),
+                                        shift));
+      else
+       emit_insn (gen_avx2_palignrv2ti (target,
+                                        gen_lowpart (V2TImode, d->op1),
+                                        gen_lowpart (V2TImode, tmp),
+                                        shift1));
+    }
+

   dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
   dcopy.one_operand_p = true;

On Tue, Apr 29, 2014 at 1:03 AM, Richard Henderson <rth@redhat.com> wrote:
> On 04/28/2014 01:43 PM, Evgeny Stupachenko wrote:
>> Agree on checks:
>>
>>   /* PALIGNR of 2 128-bits registers takes only 1 instrucion.
>>      Requires SSSE3.  */
>>   if (GET_MODE_SIZE (d->vmode) == 16)
>>     {
>>       if(!TARGET_SSSE3)
>>         return false;
>>     }
>>   /* PALIGNR of 2 256-bits registers on AVX2 costs only 2 instructions:
>>      PERM and PALIGNR.  It is more profitable than 2 PSHUFB and PERM.  */
>>   else if (GET_MODE_SIZE (d->vmode) == 32)
>>     {
>>       if(!TARGET_AVX2)
>>         return false;
>>     }
>>   else
>>     return false;
>
> Thanks, much better.
>
>
> r~

Comments

Evgeny Stupachenko May 19, 2014, 10:42 a.m. UTC | #1
Ping.

On Mon, May 5, 2014 at 8:49 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
> Is the following patch ok? It passes bootstrap and make check.
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 88142a8..91f6f21 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -42807,6 +42807,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
>    return true;
>  }
>
> +static bool expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d);
> +
>  /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to instantiate D
>     in a single instruction.  */
>
> @@ -42946,6 +42948,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
>    if (expand_vec_perm_pshufb (d))
>      return true;
>
> +  /* Try the AVX2 vpshufb.  */
> +  if (expand_vec_perm_vpshufb2_vpermq (d))
> +    return true;
> +
>    /* Try the AVX512F vpermi2 instructions.  */
>    rtx vec[64];
>    enum machine_mode mode = d->vmode;
> @@ -43004,7 +43010,7 @@ expand_vec_perm_pshuflw_pshufhw (struct
> expand_vec_perm_d *d)
>  }
>
>  /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
> -   the permutation using the SSSE3 palignr instruction.  This succeeds
> +   the permutation using the SSSE3/AVX2 palignr instruction.  This succeeds
>     when all of the elements in PERM fit within one vector and we merely
>     need to shift them down so that a single vector permutation has a
>     chance to succeed.  */
> @@ -43015,14 +43021,26 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d)
>    unsigned i, nelt = d->nelt;
>    unsigned min, max;
>    bool in_order, ok;
> -  rtx shift, target;
> +  rtx shift, shift1, target, tmp;
>    struct expand_vec_perm_d dcopy;
>
> -  /* Even with AVX, palignr only operates on 128-bit vectors.  */
> -  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
> +  /* SSSE3 is required to apply PALIGNR on 16 bytes operands.  */
> +  if (GET_MODE_SIZE (d->vmode) == 16)
> +    {
> +      if (!TARGET_SSSE3)
> +       return false;
> +    }
> +  /* AVX2 is required to apply PALIGNR on 32 bytes operands.  */
> +  else if (GET_MODE_SIZE (d->vmode) == 32)
> +    {
> +      if (!TARGET_AVX2)
> +       return false;
> +    }
> +  /* Other sizes are not supported.  */
> +  else
>      return false;
>
> -  min = nelt, max = 0;
> +  min = 2 * nelt, max = 0;
>    for (i = 0; i < nelt; ++i)
>      {
>        unsigned e = d->perm[i];
> @@ -43041,9 +43059,35 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d)
>
>    dcopy = *d;
>    shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
> -  target = gen_reg_rtx (TImode);
> -  emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
> -                                 gen_lowpart (TImode, d->op0), shift));
> +  shift1 = GEN_INT ((min - nelt / 2) *
> +          GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
> +
> +  if (GET_MODE_SIZE (d->vmode) != 32)
> +    {
> +      target = gen_reg_rtx (TImode);
> +      emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
> +                                     gen_lowpart (TImode, d->op0), shift));
> +    }
> +  else
> +    {
> +      target = gen_reg_rtx (V2TImode);
> +      tmp = gen_reg_rtx (V4DImode);
> +      emit_insn (gen_avx2_permv2ti (tmp,
> +                                   gen_lowpart (V4DImode, d->op0),
> +                                   gen_lowpart (V4DImode, d->op1),
> +                                   GEN_INT (33)));
> +      if (min < nelt / 2)
> +        emit_insn (gen_avx2_palignrv2ti (target,
> +                                        gen_lowpart (V2TImode, tmp),
> +                                        gen_lowpart (V2TImode, d->op0),
> +                                        shift));
> +      else
> +       emit_insn (gen_avx2_palignrv2ti (target,
> +                                        gen_lowpart (V2TImode, d->op1),
> +                                        gen_lowpart (V2TImode, tmp),
> +                                        shift1));
> +    }
> +
>
>    dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
>    dcopy.one_operand_p = true;
>
> On Tue, Apr 29, 2014 at 1:03 AM, Richard Henderson <rth@redhat.com> wrote:
>> On 04/28/2014 01:43 PM, Evgeny Stupachenko wrote:
>>> Agree on checks:
>>>
>>>   /* PALIGNR of 2 128-bits registers takes only 1 instrucion.
>>>      Requires SSSE3.  */
>>>   if (GET_MODE_SIZE (d->vmode) == 16)
>>>     {
>>>       if(!TARGET_SSSE3)
>>>         return false;
>>>     }
>>>   /* PALIGNR of 2 256-bits registers on AVX2 costs only 2 instructions:
>>>      PERM and PALIGNR.  It is more profitable than 2 PSHUFB and PERM.  */
>>>   else if (GET_MODE_SIZE (d->vmode) == 32)
>>>     {
>>>       if(!TARGET_AVX2)
>>>         return false;
>>>     }
>>>   else
>>>     return false;
>>
>> Thanks, much better.
>>
>>
>> r~
Richard Henderson May 19, 2014, 3:32 p.m. UTC | #2
On 05/05/2014 09:49 AM, Evgeny Stupachenko wrote:
> @@ -42946,6 +42948,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
>    if (expand_vec_perm_pshufb (d))
>      return true;
> 
> +  /* Try the AVX2 vpshufb.  */
> +  if (expand_vec_perm_vpshufb2_vpermq (d))
> +    return true;

Why is this here?  It doesn't expand to 1 insn, which is
what expand_vec_perm_1 is intended to check.

It's already called from ix86_expand_vec_perm_const_1.
If things need to be shuffled around in that function,
then that's the right place to do so.

It's also clearly unrelated to "palignr", so has no
business at all within this patch.

> -  min = nelt, max = 0;
> +  min = 2 * nelt, max = 0;

This change to min is wrong for this patch.  It probably
belonged in your 2/2 patch.

> +  shift1 = GEN_INT ((min - nelt / 2) *
> +          GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));

Coding convention sez

  shift1 = GEN_INT ((min - nelt / 2)
                    * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));


r~
diff mbox

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 88142a8..91f6f21 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -42807,6 +42807,8 @@  expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
   return true;
 }

+static bool expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d);
+
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to instantiate D
    in a single instruction.  */

@@ -42946,6 +42948,10 @@  expand_vec_perm_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_pshufb (d))
     return true;

+  /* Try the AVX2 vpshufb.  */
+  if (expand_vec_perm_vpshufb2_vpermq (d))
+    return true;
+
   /* Try the AVX512F vpermi2 instructions.  */
   rtx vec[64];
   enum machine_mode mode = d->vmode;
@@ -43004,7 +43010,7 @@  expand_vec_perm_pshuflw_pshufhw (struct
expand_vec_perm_d *d)
 }

 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
-   the permutation using the SSSE3 palignr instruction.  This succeeds
+   the permutation using the SSSE3/AVX2 palignr instruction.  This succeeds
    when all of the elements in PERM fit within one vector and we merely
    need to shift them down so that a single vector permutation has a