diff mbox

Extend shift permutations on power of 2 cases

Message ID CAOvf_xwSEVJfhvvKETqSOkkt3oHCZN9Ek15q=0OUOmqyv0JooA@mail.gmail.com
State New
Headers show

Commit Message

Evgeny Stupachenko Nov. 11, 2014, 2:21 p.m. UTC
Hi,

The patch extends the shift permutation technique to power-of-2 cases
(previously even/odd transformations were used unconditionally).
Basically, the patch just adds a loop around the handling of a load group
of length 2, as is done in the "vect_permute_load_chain" function.

For Silvermont, it reduces the insn sequence for a load group of length 4
from 31 to 20 insns.
Performance for the test in the patch improved by ~20%.

Bootstrap passed.
Make check in progress.

Is it ok?

2014-11-11  Evgeny Stupachenko  <evstupac@gmail.com>

gcc/testsuite
        * gcc.target/i386/pr52252-atom-1.c: New.

gcc/
        * tree-vect-data-refs.c (vect_shift_permute_load_chain): Extend shift
        permutations on power of 2 cases.

   if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)

Comments

Richard Biener Nov. 11, 2014, 2:28 p.m. UTC | #1
On Tue, Nov 11, 2014 at 3:21 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
> Hi,
>
> The patch extends shift permutations technique on power of 2 cases
> (previously even/odd transformations was used unconditionally).
> Basically the patch just add loop for load group of length 2, like it
> is done in "vect_permute_load_chain" function.
>
> For Silvermont it reduces insn sequence for load group of length 4
> from 31 to 20 insns.
> Performance for the test in the patch improved by ~20%.
>
> Bootstrap passed.
> Make check in progress.
>
> Is it ok?

Ok.

Thanks,
Richard.

> 2014-11-11  Evgeny Stupachenko  <evstupac@gmail.com>
>
> gcc/testsuite
>         * gcc.target/i386/pr52252-atom-1.c: New.
>
> gcc/
>         * tree-vect-data-refs.c (vect_shift_permute_load_chain): Extend shift
>         permutations on power of 2 cases.
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c b/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c
> new file mode 100644
> index 0000000..1fbd258
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target ssse3 } */
> +/* { dg-options "-O2 -ftree-vectorize -mssse3 -mtune=slm" } */
> +#define byte unsigned char
> +
> +void
> +pair_mul_sum(byte *in, byte *out, int size)
> +{
> +  int j;
> +  for(j = 0; j < size; j++)
> +    {
> +      byte a = in[0];
> +      byte b = in[1];
> +      byte c = in[2];
> +      byte d = in[3];
> +      out[0] = (byte)(a * b) + (byte)(b * c) + (byte)(c * d) + (byte)(d * a);
> +      in += 4;
> +      out += 1;
> +    }
> +}
> +
> +/* { dg-final { scan-assembler "palignr" } } */
> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
> index 0bc0356..d2e0e93 100644
> --- a/gcc/tree-vect-data-refs.c
> +++ b/gcc/tree-vect-data-refs.c
> @@ -5379,8 +5379,9 @@ vect_shift_permute_load_chain (vec<tree> dr_chain,
>    memcpy (result_chain->address (), dr_chain.address (),
>           length * sizeof (tree));
>
> -  if (length == 2 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
> +  if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
>      {
> +      unsigned int j, log_length = exact_log2 (length);
>        for (i = 0; i < nelt / 2; ++i)
>         sel[i] = i * 2;
>        for (i = 0; i < nelt / 2; ++i)
> @@ -5441,37 +5442,44 @@ vect_shift_permute_load_chain (vec<tree> dr_chain,
>        select_mask = vect_gen_perm_mask (vectype, sel);
>        gcc_assert (select_mask != NULL);
>
> -      first_vect = dr_chain[0];
> -      second_vect = dr_chain[1];
> -
> -      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
> -      perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
> -                                               first_vect, first_vect,
> -                                               perm2_mask1);
> -      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
> -      vect[0] = data_ref;
> +      for (i = 0; i < log_length; i++)
> +       {
> +         for (j = 0; j < length; j += 2)
> +           {
> +             first_vect = dr_chain[j];
> +             second_vect = dr_chain[j + 1];
>
> -      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
> -      perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
> -                                               second_vect, second_vect,
> -                                               perm2_mask2);
> -      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
> -      vect[1] = data_ref;
> +             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
> +             perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
> +                                                       first_vect, first_vect,
> +                                                       perm2_mask1);
> +             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
> +             vect[0] = data_ref;
>
> -      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
> -      perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
> -                                               vect[0], vect[1],
> -                                               shift1_mask);
> -      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
> -      (*result_chain)[1] = data_ref;
> +             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
> +             perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
> +                                                       second_vect, second_vect,
> +                                                       perm2_mask2);
> +             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
> +             vect[1] = data_ref;
>
> -      data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
> -      perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
> -                                               vect[0], vect[1],
> -                                               select_mask);
> -      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
> -      (*result_chain)[0] = data_ref;
> +             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
> +             perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
> +                                                       vect[0], vect[1],
> +                                                       shift1_mask);
> +             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
> +             (*result_chain)[j/2 + length/2] = data_ref;
>
> +             data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
> +             perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
> +                                                       vect[0], vect[1],
> +                                                       select_mask);
> +             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
> +             (*result_chain)[j/2] = data_ref;
> +           }
> +         memcpy (dr_chain.address (), result_chain->address (),
> +                 length * sizeof (tree));
> +       }
>        return true;
>      }
>    if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
diff mbox

Patch

diff --git a/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c b/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c
new file mode 100644
index 0000000..1fbd258
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c
@@ -0,0 +1,22 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target ssse3 } */
+/* { dg-options "-O2 -ftree-vectorize -mssse3 -mtune=slm" } */
+#define byte unsigned char
+
+void
+pair_mul_sum(byte *in, byte *out, int size)
+{
+  int j;
+  for(j = 0; j < size; j++)
+    {
+      byte a = in[0];
+      byte b = in[1];
+      byte c = in[2];
+      byte d = in[3];
+      out[0] = (byte)(a * b) + (byte)(b * c) + (byte)(c * d) + (byte)(d * a);
+      in += 4;
+      out += 1;
+    }
+}
+
+/* { dg-final { scan-assembler "palignr" } } */
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 0bc0356..d2e0e93 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -5379,8 +5379,9 @@  vect_shift_permute_load_chain (vec<tree> dr_chain,
   memcpy (result_chain->address (), dr_chain.address (),
          length * sizeof (tree));

-  if (length == 2 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
+  if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
     {
+      unsigned int j, log_length = exact_log2 (length);
       for (i = 0; i < nelt / 2; ++i)
        sel[i] = i * 2;
       for (i = 0; i < nelt / 2; ++i)
@@ -5441,37 +5442,44 @@  vect_shift_permute_load_chain (vec<tree> dr_chain,
       select_mask = vect_gen_perm_mask (vectype, sel);
       gcc_assert (select_mask != NULL);

-      first_vect = dr_chain[0];
-      second_vect = dr_chain[1];
-
-      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
-      perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
-                                               first_vect, first_vect,
-                                               perm2_mask1);
-      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
-      vect[0] = data_ref;
+      for (i = 0; i < log_length; i++)
+       {
+         for (j = 0; j < length; j += 2)
+           {
+             first_vect = dr_chain[j];
+             second_vect = dr_chain[j + 1];

-      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
-      perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
-                                               second_vect, second_vect,
-                                               perm2_mask2);
-      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
-      vect[1] = data_ref;
+             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
+             perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+                                                       first_vect, first_vect,
+                                                       perm2_mask1);
+             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+             vect[0] = data_ref;

-      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
-      perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
-                                               vect[0], vect[1],
-                                               shift1_mask);
-      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
-      (*result_chain)[1] = data_ref;
+             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
+             perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+                                                       second_vect, second_vect,
+                                                       perm2_mask2);
+             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+             vect[1] = data_ref;

-      data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
-      perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
-                                               vect[0], vect[1],
-                                               select_mask);
-      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
-      (*result_chain)[0] = data_ref;
+             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
+             perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+                                                       vect[0], vect[1],
+                                                       shift1_mask);
+             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+             (*result_chain)[j/2 + length/2] = data_ref;

+             data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
+             perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+                                                       vect[0], vect[1],
+                                                       select_mask);
+             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+             (*result_chain)[j/2] = data_ref;
+           }
+         memcpy (dr_chain.address (), result_chain->address (),
+                 length * sizeof (tree));
+       }
       return true;
     }