diff mbox

[1/2,x86] Add palignr support for AVX2.

Message ID CAOvf_xwYJpQ0H9=MrLzUd40JDksZ8Tfsks2=8ZW4MfR_wCjU1A@mail.gmail.com
State New
Headers show

Commit Message

Evgeny Stupachenko July 3, 2014, 9:53 a.m. UTC
The "expand_vec_perm_palignr" is similar for SSSE3 and AVX2 cases,
  but AVX2 requires more instructions to complete the scheme.

The patch below adds AVX2 support for six instructions, leaving SSSE3 for two.
Is it ok?

   min = nelt, max = 0;
@@ -43168,9 +43183,34 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d)

   dcopy = *d;
   shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
-  target = gen_reg_rtx (TImode);
-  emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
-                                 gen_lowpart (TImode, d->op0), shift));
+  shift1 = GEN_INT ((min - nelt / 2)
+          * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
+
+  if (GET_MODE_SIZE (d->vmode) != 32)
+    {
+      target = gen_reg_rtx (TImode);
+      emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
+                                     gen_lowpart (TImode, d->op0), shift));
+    }
+  else
+    {
+      target = gen_reg_rtx (V2TImode);
+      tmp = gen_reg_rtx (V4DImode);
+      emit_insn (gen_avx2_permv2ti (tmp,
+                                   gen_lowpart (V4DImode, d->op0),
+                                   gen_lowpart (V4DImode, d->op1),
+                                   GEN_INT (33)));
+      if (min < nelt / 2)
+        emit_insn (gen_avx2_palignrv2ti (target,
+                                        gen_lowpart (V2TImode, tmp),
+                                        gen_lowpart (V2TImode, d->op0),
+                                        shift));
+      else
+       emit_insn (gen_avx2_palignrv2ti (target,
+                                        gen_lowpart (V2TImode, d->op1),
+                                        gen_lowpart (V2TImode, tmp),
+                                        shift1));
+    }

   dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
   dcopy.one_operand_p = true;
@@ -43192,9 +43232,22 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d)
       return true;
     }

-  ok = expand_vec_perm_1 (&dcopy);
-  gcc_assert (ok);
-
+  /* For SSSE3 we need 1 instruction for palignr plus 1 for one
+     operand permutaoin.  */
+  if (insn_num == 2)
+    {
+      ok = expand_vec_perm_1 (&dcopy);
+      gcc_assert (ok);
+    }
+  /* For AVX2 we need 2 instructions for the shift: vpalignr and
+     vperm plus 4 instructions for one operand permutation.  */
+  else if (insn_num == 6)
+    {
+      ok = expand_vec_perm_vpshufb2_vpermq (&dcopy);
+      gcc_assert (ok);
+    }
+  else
+    ok = false;
   return ok;
 }

@@ -44627,7 +44680,7 @@ ix86_expand_vec_perm_const_1 (struct
expand_vec_perm_d *d)
   if (expand_vec_perm_pshuflw_pshufhw (d))
     return true;

-  if (expand_vec_perm_palignr (d))
+  if (expand_vec_perm_palignr (d, 2))
     return true;

   if (expand_vec_perm_interleave2 (d))
@@ -44680,6 +44733,10 @@ ix86_expand_vec_perm_const_1 (struct
expand_vec_perm_d *d)
   if (expand_vec_perm_even_odd (d))
     return true;

+  /* Try sequences of six instructions.  */
+  if (expand_vec_perm_palignr (d, 6))
+    return true;
+
   /* Even longer sequences.  */
   if (expand_vec_perm_vpshufb4_vpermq2 (d))
     return true;

On Mon, May 19, 2014 at 7:32 PM, Richard Henderson <rth@redhat.com> wrote:
> On 05/05/2014 09:49 AM, Evgeny Stupachenko wrote:
>> @@ -42946,6 +42948,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
>>    if (expand_vec_perm_pshufb (d))
>>      return true;
>>
>> +  /* Try the AVX2 vpshufb.  */
>> +  if (expand_vec_perm_vpshufb2_vpermq (d))
>> +    return true;
>
> Why is this here?  It doesn't expand to 1 insn, which is
> what expand_vec_perm_1 is intended to check.
>
> It's already called from ix86_expand_vec_perm_const_1.
> If things need to be shuffled around in that function,
> then that's the right place to do so.
>
> It's also clearly unrelated to "palignr", so has no
> business at all within this patch.
>
>> -  min = nelt, max = 0;
>> +  min = 2 * nelt, max = 0;
>
> This change to min is wrong for this patch.  It probably
> belonged in your 2/2 patch.
>
>> +  shift1 = GEN_INT ((min - nelt / 2) *
>> +          GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
>
> Coding convention sez
>
>   shift1 = GEN_INT ((min - nelt / 2)
>                     * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
>
>
> r~
>

Comments

Uros Bizjak July 4, 2014, 11:22 a.m. UTC | #1
On Thu, Jul 3, 2014 at 11:53 AM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
> The "expand_vec_perm_palignr" is similar for SSSE3 and AVX2 cases,
>   but AVX2 requires more instructions to complete the scheme.
>
> The patch below adds AVX2 support for six instructions, leaving SSSE3 for two.
> Is it ok?

ChangeLog entry is missing.

Uros.
Evgeny Stupachenko July 4, 2014, 12:47 p.m. UTC | #2
2014-07-04  Evgeny Stupachenko  <evstupac@gmail.com>

        * config/i386/i386.c (expand_vec_perm_palignr): Extend to use AVX2
        PALINGR instruction.
        * config/i386/i386.c (ix86_expand_vec_perm_const_1): Add palignr try
        for AVX2.

On Fri, Jul 4, 2014 at 3:22 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Thu, Jul 3, 2014 at 11:53 AM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
>> The "expand_vec_perm_palignr" is similar for SSSE3 and AVX2 cases,
>>   but AVX2 requires more instructions to complete the scheme.
>>
>> The patch below adds AVX2 support for six instructions, leaving SSSE3 for two.
>> Is it ok?
>
> ChangeLog entry is missing.
>
> Uros.
Richard Henderson July 7, 2014, 2:40 p.m. UTC | #3
On 07/03/2014 02:53 AM, Evgeny Stupachenko wrote:
> -expand_vec_perm_palignr (struct expand_vec_perm_d *d)
> +expand_vec_perm_palignr (struct expand_vec_perm_d *d, int insn_num)

insn_num might as well be "bool avx2", since it's only ever set to two values.

> -  /* Even with AVX, palignr only operates on 128-bit vectors.  */
> -  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
> +  /* SSSE3 is required to apply PALIGNR on 16 bytes operands.  */
> +  if (GET_MODE_SIZE (d->vmode) == 16)
> +    {
> +      if (!TARGET_SSSE3)
> +       return false;
> +    }
> +  /* AVX2 is required to apply PALIGNR on 32 bytes operands.  */
> +  else if (GET_MODE_SIZE (d->vmode) == 32)
> +    {
> +      if (!TARGET_AVX2)
> +       return false;
> +    }
> +  /* Other sizes are not supported.  */
> +  else
>      return false;

And you'd better check it up here because...

> +  /* For SSSE3 we need 1 instruction for palignr plus 1 for one
> +     operand permutaoin.  */
> +  if (insn_num == 2)
> +    {
> +      ok = expand_vec_perm_1 (&dcopy);
> +      gcc_assert (ok);
> +    }
> +  /* For AVX2 we need 2 instructions for the shift: vpalignr and
> +     vperm plus 4 instructions for one operand permutation.  */
> +  else if (insn_num == 6)
> +    {
> +      ok = expand_vec_perm_vpshufb2_vpermq (&dcopy);
> +      gcc_assert (ok);
> +    }
> +  else
> +    ok = false;
>    return ok;

... down here you'll simply ICE from the gcc_assert.


r~
Evgeny Stupachenko July 10, 2014, 3:29 p.m. UTC | #4
On Mon, Jul 7, 2014 at 6:40 PM, Richard Henderson <rth@redhat.com> wrote:
> On 07/03/2014 02:53 AM, Evgeny Stupachenko wrote:
>> -expand_vec_perm_palignr (struct expand_vec_perm_d *d)
>> +expand_vec_perm_palignr (struct expand_vec_perm_d *d, int insn_num)
>
> insn_num might as well be "bool avx2", since it's only ever set to two values.

Agree. However:
 after the alignment, one operand permutation could be just move and
take 2 instructions for AVX2 as well
 for AVX2 there could be other cases when the scheme takes 4 or 5 instructions
 we can leave it for potential avx512 extension

>
>> -  /* Even with AVX, palignr only operates on 128-bit vectors.  */
>> -  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
>> +  /* SSSE3 is required to apply PALIGNR on 16 bytes operands.  */
>> +  if (GET_MODE_SIZE (d->vmode) == 16)
>> +    {
>> +      if (!TARGET_SSSE3)
>> +       return false;
>> +    }
>> +  /* AVX2 is required to apply PALIGNR on 32 bytes operands.  */
>> +  else if (GET_MODE_SIZE (d->vmode) == 32)
>> +    {
>> +      if (!TARGET_AVX2)
>> +       return false;
>> +    }
>> +  /* Other sizes are not supported.  */
>> +  else
>>      return false;
>
> And you'd better check it up here because...
>

Correct. The following should resolve the issue:
  /* For AVX2 we need more than 2 instructions when the alignment
     by itself does not produce the desired permutation.  */
  if (TARGET_AVX2 && insn_num <= 2)
    return false;

>> +  /* For SSSE3 we need 1 instruction for palignr plus 1 for one
>> +     operand permutaoin.  */
>> +  if (insn_num == 2)
>> +    {
>> +      ok = expand_vec_perm_1 (&dcopy);
>> +      gcc_assert (ok);
>> +    }
>> +  /* For AVX2 we need 2 instructions for the shift: vpalignr and
>> +     vperm plus 4 instructions for one operand permutation.  */
>> +  else if (insn_num == 6)
>> +    {
>> +      ok = expand_vec_perm_vpshufb2_vpermq (&dcopy);
>> +      gcc_assert (ok);
>> +    }
>> +  else
>> +    ok = false;
>>    return ok;
>
> ... down here you'll simply ICE from the gcc_assert.




>
>
> r~
Evgeny Stupachenko Aug. 14, 2014, 8:08 a.m. UTC | #5
Ping.

On Thu, Jul 10, 2014 at 7:29 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
> On Mon, Jul 7, 2014 at 6:40 PM, Richard Henderson <rth@redhat.com> wrote:
>> On 07/03/2014 02:53 AM, Evgeny Stupachenko wrote:
>>> -expand_vec_perm_palignr (struct expand_vec_perm_d *d)
>>> +expand_vec_perm_palignr (struct expand_vec_perm_d *d, int insn_num)
>>
>> insn_num might as well be "bool avx2", since it's only ever set to two values.
>
> Agree. However:
>  after the alignment, one operand permutation could be just move and
> take 2 instructions for AVX2 as well
>  for AVX2 there could be other cases when the scheme takes 4 or 5 instructions
>  we can leave it for potential avx512 extension
>
>>
>>> -  /* Even with AVX, palignr only operates on 128-bit vectors.  */
>>> -  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
>>> +  /* SSSE3 is required to apply PALIGNR on 16 bytes operands.  */
>>> +  if (GET_MODE_SIZE (d->vmode) == 16)
>>> +    {
>>> +      if (!TARGET_SSSE3)
>>> +       return false;
>>> +    }
>>> +  /* AVX2 is required to apply PALIGNR on 32 bytes operands.  */
>>> +  else if (GET_MODE_SIZE (d->vmode) == 32)
>>> +    {
>>> +      if (!TARGET_AVX2)
>>> +       return false;
>>> +    }
>>> +  /* Other sizes are not supported.  */
>>> +  else
>>>      return false;
>>
>> And you'd better check it up here because...
>>
>
> Correct. The following should resolve the issue:
>   /* For AVX2 we need more than 2 instructions when the alignment
>      by itself does not produce the desired permutation.  */
>   if (TARGET_AVX2 && insn_num <= 2)
>     return false;
>
>>> +  /* For SSSE3 we need 1 instruction for palignr plus 1 for one
>>> +     operand permutaoin.  */
>>> +  if (insn_num == 2)
>>> +    {
>>> +      ok = expand_vec_perm_1 (&dcopy);
>>> +      gcc_assert (ok);
>>> +    }
>>> +  /* For AVX2 we need 2 instructions for the shift: vpalignr and
>>> +     vperm plus 4 instructions for one operand permutation.  */
>>> +  else if (insn_num == 6)
>>> +    {
>>> +      ok = expand_vec_perm_vpshufb2_vpermq (&dcopy);
>>> +      gcc_assert (ok);
>>> +    }
>>> +  else
>>> +    ok = false;
>>>    return ok;
>>
>> ... down here you'll simply ICE from the gcc_assert.
>
>
>
>
>>
>>
>> r~
H.J. Lu Aug. 14, 2014, 2:55 p.m. UTC | #6
On Thu, Aug 14, 2014 at 1:08 AM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
> Ping.
>
> On Thu, Jul 10, 2014 at 7:29 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
>> On Mon, Jul 7, 2014 at 6:40 PM, Richard Henderson <rth@redhat.com> wrote:
>>> On 07/03/2014 02:53 AM, Evgeny Stupachenko wrote:
>>>> -expand_vec_perm_palignr (struct expand_vec_perm_d *d)
>>>> +expand_vec_perm_palignr (struct expand_vec_perm_d *d, int insn_num)
>>>
>>> insn_num might as well be "bool avx2", since it's only ever set to two values.
>>
>> Agree. However:
>>  after the alignment, one operand permutation could be just move and
>> take 2 instructions for AVX2 as well
>>  for AVX2 there could be other cases when the scheme takes 4 or 5 instructions
>>  we can leave it for potential avx512 extension
>>
>>>
>>>> -  /* Even with AVX, palignr only operates on 128-bit vectors.  */
>>>> -  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
>>>> +  /* SSSE3 is required to apply PALIGNR on 16 bytes operands.  */
>>>> +  if (GET_MODE_SIZE (d->vmode) == 16)
>>>> +    {
>>>> +      if (!TARGET_SSSE3)
>>>> +       return false;
>>>> +    }
>>>> +  /* AVX2 is required to apply PALIGNR on 32 bytes operands.  */
>>>> +  else if (GET_MODE_SIZE (d->vmode) == 32)
>>>> +    {
>>>> +      if (!TARGET_AVX2)
>>>> +       return false;
>>>> +    }
>>>> +  /* Other sizes are not supported.  */
>>>> +  else
>>>>      return false;
>>>
>>> And you'd better check it up here because...
>>>
>>
>> Correct. The following should resolve the issue:
>>   /* For AVX2 we need more than 2 instructions when the alignment
>>      by itself does not produce the desired permutation.  */
>>   if (TARGET_AVX2 && insn_num <= 2)
>>     return false;
>>
>>>> +  /* For SSSE3 we need 1 instruction for palignr plus 1 for one
>>>> +     operand permutaoin.  */
>>>> +  if (insn_num == 2)
>>>> +    {
>>>> +      ok = expand_vec_perm_1 (&dcopy);
>>>> +      gcc_assert (ok);
>>>> +    }
>>>> +  /* For AVX2 we need 2 instructions for the shift: vpalignr and
>>>> +     vperm plus 4 instructions for one operand permutation.  */
>>>> +  else if (insn_num == 6)
>>>> +    {
>>>> +      ok = expand_vec_perm_vpshufb2_vpermq (&dcopy);
>>>> +      gcc_assert (ok);
>>>> +    }
>>>> +  else
>>>> +    ok = false;
>>>>    return ok;
>>>
>>> ... down here you'll simply ICE from the gcc_assert.
>>

Can you modify your patch to fix

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=62128

with a testcase?
diff mbox

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 2cffcef..70fc832 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -43130,23 +43130,38 @@  expand_vec_perm_pshuflw_pshufhw (struct
expand_vec_perm_d *d)
   return true;
 }

+static bool
+expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d);
+
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
-   the permutation using the SSSE3 palignr instruction.  This succeeds
+   the permutation using the SSSE3/AVX2 palignr instruction.  This succeeds
    when all of the elements in PERM fit within one vector and we merely
    need to shift them down so that a single vector permutation has a
    chance to succeed.  */

 static bool
-expand_vec_perm_palignr (struct expand_vec_perm_d *d)
+expand_vec_perm_palignr (struct expand_vec_perm_d *d, int insn_num)
 {
   unsigned i, nelt = d->nelt;
   unsigned min, max;
   bool in_order, ok;
-  rtx shift, target;
+  rtx shift, shift1, target, tmp;
   struct expand_vec_perm_d dcopy;

-  /* Even with AVX, palignr only operates on 128-bit vectors.  */
-  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
+  /* SSSE3 is required to apply PALIGNR on 16 bytes operands.  */
+  if (GET_MODE_SIZE (d->vmode) == 16)
+    {
+      if (!TARGET_SSSE3)
+       return false;
+    }
+  /* AVX2 is required to apply PALIGNR on 32 bytes operands.  */
+  else if (GET_MODE_SIZE (d->vmode) == 32)
+    {
+      if (!TARGET_AVX2)
+       return false;
+    }
+  /* Other sizes are not supported.  */
+  else
     return false;