diff mbox

Extend shift permutations on power of 2 cases

Message ID CAOvf_xxeRzHv9nHmXxWhjsOw9sto43a1mtd3aV_KYy_1OLGpHQ@mail.gmail.com
State New
Headers show

Commit Message

Evgeny Stupachenko Nov. 12, 2014, 11:15 a.m. UTC
Committed r217359.
However, it turned out that AVX2 uses vperm2i128 for the shift here
(instead of palignr, which is used for SSSE3/AVX). To handle the AVX2
case we need to modify the test case (see the patch at the end of this page):


On Tue, Nov 11, 2014 at 5:28 PM, Richard Biener
<richard.guenther@gmail.com> wrote:
> On Tue, Nov 11, 2014 at 3:21 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
>> Hi,
>>
>> The patch extends the shift permutation technique to power-of-2 cases
>> (previously even/odd transformations were used unconditionally).
>> Basically the patch just adds a loop for the load group of length 2, as
>> is done in the "vect_permute_load_chain" function.
>>
>> For Silvermont it reduces the insn sequence for a load group of length 4
>> from 31 to 20 insns.
>> Performance for the test in the patch improved by ~20%.
>>
>> Bootstrap passed.
>> Make check in progress.
>>
>> Is it ok?
>
> Ok.
>
> Thanks,
> Richard.
>
>> 2014-11-11  Evgeny Stupachenko  <evstupac@gmail.com>
>>
>> gcc/testsuite
>>         * gcc.target/i386/pr52252-atom-1.c: New.
>>
>> gcc/
>>         * tree-vect-data-refs.c (vect_shift_permute_load_chain): Extend shift
>>         permutations on power of 2 cases.
>>
>> diff --git a/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c b/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c
>> new file mode 100644
>> index 0000000..1fbd258
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c
>> @@ -0,0 +1,22 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target ssse3 } */
>> +/* { dg-options "-O2 -ftree-vectorize -mssse3 -mtune=slm" } */
>> +#define byte unsigned char
>> +
>> +void
>> +pair_mul_sum(byte *in, byte *out, int size)
>> +{
>> +  int j;
>> +  for(j = 0; j < size; j++)
>> +    {
>> +      byte a = in[0];
>> +      byte b = in[1];
>> +      byte c = in[2];
>> +      byte d = in[3];
>> +      out[0] = (byte)(a * b) + (byte)(b * c) + (byte)(c * d) + (byte)(d * a);
>> +      in += 4;
>> +      out += 1;
>> +    }
>> +}
>> +
>> +/* { dg-final { scan-assembler "palignr" } } */
>> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
>> index 0bc0356..d2e0e93 100644
>> --- a/gcc/tree-vect-data-refs.c
>> +++ b/gcc/tree-vect-data-refs.c
>> @@ -5379,8 +5379,9 @@ vect_shift_permute_load_chain (vec<tree> dr_chain,
>>    memcpy (result_chain->address (), dr_chain.address (),
>>           length * sizeof (tree));
>>
>> -  if (length == 2 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
>> +  if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
>>      {
>> +      unsigned int j, log_length = exact_log2 (length);
>>        for (i = 0; i < nelt / 2; ++i)
>>         sel[i] = i * 2;
>>        for (i = 0; i < nelt / 2; ++i)
>> @@ -5441,37 +5442,44 @@ vect_shift_permute_load_chain (vec<tree> dr_chain,
>>        select_mask = vect_gen_perm_mask (vectype, sel);
>>        gcc_assert (select_mask != NULL);
>>
>> -      first_vect = dr_chain[0];
>> -      second_vect = dr_chain[1];
>> -
>> -      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
>> -      perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>> -                                               first_vect, first_vect,
>> -                                               perm2_mask1);
>> -      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>> -      vect[0] = data_ref;
>> +      for (i = 0; i < log_length; i++)
>> +       {
>> +         for (j = 0; j < length; j += 2)
>> +           {
>> +             first_vect = dr_chain[j];
>> +             second_vect = dr_chain[j + 1];
>>
>> -      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
>> -      perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>> -                                               second_vect, second_vect,
>> -                                               perm2_mask2);
>> -      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>> -      vect[1] = data_ref;
>> +             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
>> +             perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>> +                                                       first_vect, first_vect,
>> +                                                       perm2_mask1);
>> +             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>> +             vect[0] = data_ref;
>>
>> -      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
>> -      perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>> -                                               vect[0], vect[1],
>> -                                               shift1_mask);
>> -      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>> -      (*result_chain)[1] = data_ref;
>> +             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
>> +             perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>> +                                                       second_vect, second_vect,
>> +                                                       perm2_mask2);
>> +             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>> +             vect[1] = data_ref;
>>
>> -      data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
>> -      perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>> -                                               vect[0], vect[1],
>> -                                               select_mask);
>> -      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>> -      (*result_chain)[0] = data_ref;
>> +             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
>> +             perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>> +                                                       vect[0], vect[1],
>> +                                                       shift1_mask);
>> +             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>> +             (*result_chain)[j/2 + length/2] = data_ref;
>>
>> +             data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
>> +             perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>> +                                                       vect[0], vect[1],
>> +                                                       select_mask);
>> +             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>> +             (*result_chain)[j/2] = data_ref;
>> +           }
>> +         memcpy (dr_chain.address (), result_chain->address (),
>> +                 length * sizeof (tree));
>> +       }
>>        return true;
>>      }
>>    if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)

Comments

Evgeny Stupachenko Nov. 12, 2014, 1:15 p.m. UTC | #1
To avoid any misunderstanding: I haven't yet committed this obvious fix.
Is it OK?

On Wed, Nov 12, 2014 at 2:15 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
> Committed r217359.
> However, it turned out that AVX2 uses vperm2i128 for the shift here
> (instead of palignr, which is used for SSSE3/AVX). To handle the AVX2
> case we need to modify the test case:
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c b/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c
> index 1fbd258..020e983 100644
> --- a/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c
> +++ b/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c
> @@ -19,4 +19,4 @@ pair_mul_sum(byte *in, byte *out, int size)
>      }
>  }
>
> -/* { dg-final { scan-assembler "palignr" } } */
> +/* { dg-final { scan-assembler "perm2i128|palignr" } } */
>
> On Tue, Nov 11, 2014 at 5:28 PM, Richard Biener
> <richard.guenther@gmail.com> wrote:
>> On Tue, Nov 11, 2014 at 3:21 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
>>> Hi,
>>>
>>> The patch extends the shift permutation technique to power-of-2 cases
>>> (previously even/odd transformations were used unconditionally).
>>> Basically the patch just adds a loop for the load group of length 2, as
>>> is done in the "vect_permute_load_chain" function.
>>>
>>> For Silvermont it reduces the insn sequence for a load group of length 4
>>> from 31 to 20 insns.
>>> Performance for the test in the patch improved by ~20%.
>>>
>>> Bootstrap passed.
>>> Make check in progress.
>>>
>>> Is it ok?
>>
>> Ok.
>>
>> Thanks,
>> Richard.
>>
>>> 2014-11-11  Evgeny Stupachenko  <evstupac@gmail.com>
>>>
>>> gcc/testsuite
>>>         * gcc.target/i386/pr52252-atom-1.c: New.
>>>
>>> gcc/
>>>         * tree-vect-data-refs.c (vect_shift_permute_load_chain): Extend shift
>>>         permutations on power of 2 cases.
>>>
>>> diff --git a/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c b/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c
>>> new file mode 100644
>>> index 0000000..1fbd258
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c
>>> @@ -0,0 +1,22 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-require-effective-target ssse3 } */
>>> +/* { dg-options "-O2 -ftree-vectorize -mssse3 -mtune=slm" } */
>>> +#define byte unsigned char
>>> +
>>> +void
>>> +pair_mul_sum(byte *in, byte *out, int size)
>>> +{
>>> +  int j;
>>> +  for(j = 0; j < size; j++)
>>> +    {
>>> +      byte a = in[0];
>>> +      byte b = in[1];
>>> +      byte c = in[2];
>>> +      byte d = in[3];
>>> +      out[0] = (byte)(a * b) + (byte)(b * c) + (byte)(c * d) + (byte)(d * a);
>>> +      in += 4;
>>> +      out += 1;
>>> +    }
>>> +}
>>> +
>>> +/* { dg-final { scan-assembler "palignr" } } */
>>> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
>>> index 0bc0356..d2e0e93 100644
>>> --- a/gcc/tree-vect-data-refs.c
>>> +++ b/gcc/tree-vect-data-refs.c
>>> @@ -5379,8 +5379,9 @@ vect_shift_permute_load_chain (vec<tree> dr_chain,
>>>    memcpy (result_chain->address (), dr_chain.address (),
>>>           length * sizeof (tree));
>>>
>>> -  if (length == 2 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
>>> +  if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
>>>      {
>>> +      unsigned int j, log_length = exact_log2 (length);
>>>        for (i = 0; i < nelt / 2; ++i)
>>>         sel[i] = i * 2;
>>>        for (i = 0; i < nelt / 2; ++i)
>>> @@ -5441,37 +5442,44 @@ vect_shift_permute_load_chain (vec<tree> dr_chain,
>>>        select_mask = vect_gen_perm_mask (vectype, sel);
>>>        gcc_assert (select_mask != NULL);
>>>
>>> -      first_vect = dr_chain[0];
>>> -      second_vect = dr_chain[1];
>>> -
>>> -      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
>>> -      perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>>> -                                               first_vect, first_vect,
>>> -                                               perm2_mask1);
>>> -      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>>> -      vect[0] = data_ref;
>>> +      for (i = 0; i < log_length; i++)
>>> +       {
>>> +         for (j = 0; j < length; j += 2)
>>> +           {
>>> +             first_vect = dr_chain[j];
>>> +             second_vect = dr_chain[j + 1];
>>>
>>> -      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
>>> -      perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>>> -                                               second_vect, second_vect,
>>> -                                               perm2_mask2);
>>> -      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>>> -      vect[1] = data_ref;
>>> +             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
>>> +             perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>>> +                                                       first_vect, first_vect,
>>> +                                                       perm2_mask1);
>>> +             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>>> +             vect[0] = data_ref;
>>>
>>> -      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
>>> -      perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>>> -                                               vect[0], vect[1],
>>> -                                               shift1_mask);
>>> -      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>>> -      (*result_chain)[1] = data_ref;
>>> +             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
>>> +             perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>>> +                                                       second_vect, second_vect,
>>> +                                                       perm2_mask2);
>>> +             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>>> +             vect[1] = data_ref;
>>>
>>> -      data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
>>> -      perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>>> -                                               vect[0], vect[1],
>>> -                                               select_mask);
>>> -      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>>> -      (*result_chain)[0] = data_ref;
>>> +             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
>>> +             perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>>> +                                                       vect[0], vect[1],
>>> +                                                       shift1_mask);
>>> +             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>>> +             (*result_chain)[j/2 + length/2] = data_ref;
>>>
>>> +             data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
>>> +             perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>>> +                                                       vect[0], vect[1],
>>> +                                                       select_mask);
>>> +             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>>> +             (*result_chain)[j/2] = data_ref;
>>> +           }
>>> +         memcpy (dr_chain.address (), result_chain->address (),
>>> +                 length * sizeof (tree));
>>> +       }
>>>        return true;
>>>      }
>>>    if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
Uros Bizjak Nov. 12, 2014, 1:16 p.m. UTC | #2
On Wed, Nov 12, 2014 at 2:15 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
> To avoid misunderstanding.
> I haven't yet committed this obvious fix.
> Is it ok?

If it is obvious, then it doesn't need an approval.

So, OK.

Thanks,
Uros.
diff mbox

Patch

diff --git a/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c b/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c
index 1fbd258..020e983 100644
--- a/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c
@@ -19,4 +19,4 @@  pair_mul_sum(byte *in, byte *out, int size)
     }
 }

-/* { dg-final { scan-assembler "palignr" } } */
+/* { dg-final { scan-assembler "perm2i128|palignr" } } */