@@ -71,6 +71,8 @@ using namespace gcc;
The variable current_pass is also used for statistics and plugins. */
opt_pass *current_pass;
+unsigned int pending_TODOs = 0;
+
/* Most passes are single-instance (within their context) and thus don't
need to implement cloning, but passes that support multiple instances
*must* provide their own implementation of the clone method.
@@ -731,7 +733,54 @@ make_pass_late_compilation (gcc::context *ctxt)
return new pass_late_compilation (ctxt);
}
+/* Pre-SLP scalar cleanup: it runs several cleanup passes such as FRE and DSE.  */
+
+namespace {
+
+const pass_data pass_data_pre_slp_scalar_cleanup =
+{
+ GIMPLE_PASS, /* type */
+ "*pre_slp_scalar_cleanup", /* name */
+ OPTGROUP_LOOP, /* optinfo_flags */
+ TV_SCALAR_CLEANUP, /* tv_id */
+ ( PROP_cfg | PROP_ssa ), /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ 0, /* todo_flags_finish */
+};
+
+class pass_pre_slp_scalar_cleanup : public gimple_opt_pass
+{
+public:
+ pass_pre_slp_scalar_cleanup (gcc::context *ctxt)
+ : gimple_opt_pass (pass_data_pre_slp_scalar_cleanup, ctxt)
+ {
+ }
+
+ virtual bool
+ gate (function *)
+ {
+ return flag_tree_slp_vectorize
+ && (pending_TODOs & TODO_force_next_scalar_cleanup);
+ }
+
+ virtual unsigned int
+ execute (function *)
+ {
+ pending_TODOs &= ~TODO_force_next_scalar_cleanup;
+ return 0;
+ }
+}; // class pass_pre_slp_scalar_cleanup
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_pre_slp_scalar_cleanup (gcc::context *ctxt)
+{
+ return new pass_pre_slp_scalar_cleanup (ctxt);
+}
/* Set the static pass number of pass PASS to ID and record that
in the mapping from static pass number to pass. */
@@ -2538,6 +2587,12 @@ execute_one_pass (opt_pass *pass)
return true;
}
+ if (todo_after & TODO_force_next_scalar_cleanup)
+ {
+ todo_after &= ~TODO_force_next_scalar_cleanup;
+ pending_TODOs |= TODO_force_next_scalar_cleanup;
+ }
+
do_per_function (clear_last_verified, NULL);
do_per_function (update_properties_after_pass, pass);
@@ -288,11 +288,16 @@ along with GCC; see the file COPYING3. If not see
/* pass_vectorize must immediately follow pass_if_conversion.
Please do not add any other passes in between. */
NEXT_PASS (pass_vectorize);
- PUSH_INSERT_PASSES_WITHIN (pass_vectorize)
+ PUSH_INSERT_PASSES_WITHIN (pass_vectorize)
NEXT_PASS (pass_dce);
- POP_INSERT_PASSES ()
- NEXT_PASS (pass_predcom);
+ POP_INSERT_PASSES ()
+ NEXT_PASS (pass_predcom);
NEXT_PASS (pass_complete_unroll);
+ NEXT_PASS (pass_pre_slp_scalar_cleanup);
+ PUSH_INSERT_PASSES_WITHIN (pass_pre_slp_scalar_cleanup)
+ NEXT_PASS (pass_fre, false /* may_iterate */);
+ NEXT_PASS (pass_dse);
+ POP_INSERT_PASSES ()
NEXT_PASS (pass_slp_vectorize);
NEXT_PASS (pass_loop_prefetch);
/* Run IVOPTs after the last pass that uses data-reference analysis
new file mode 100644
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -funroll-loops -ftree-vectorize -fdump-tree-dse-details" } */
+
+/* Test that the scalar cleanup pass takes effect; mainly check
+   that its secondary DSE pass can remove the dead stores to the
+   array tmp.  */
+
+#include "stdint.h"
+
+static inline void
+foo (int16_t *diff, int i_size, uint8_t *val1, int i_val1, uint8_t *val2,
+ int i_val2)
+{
+ for (int y = 0; y < i_size; y++)
+ {
+ for (int x = 0; x < i_size; x++)
+ diff[x + y * i_size] = val1[x] - val2[x];
+ val1 += i_val1;
+ val2 += i_val2;
+ }
+}
+
+void
+bar (int16_t res[16], uint8_t *val1, uint8_t *val2)
+{
+ int16_t d[16];
+ int16_t tmp[16];
+
+ foo (d, 4, val1, 16, val2, 32);
+
+ for (int i = 0; i < 4; i++)
+ {
+ int s03 = d[i * 4 + 0] + d[i * 4 + 3];
+ int s12 = d[i * 4 + 1] + d[i * 4 + 2];
+ int d03 = d[i * 4 + 0] - d[i * 4 + 3];
+ int d12 = d[i * 4 + 1] - d[i * 4 + 2];
+
+ tmp[0 * 4 + i] = s03 + s12;
+ tmp[1 * 4 + i] = 2 * d03 + d12;
+ tmp[2 * 4 + i] = s03 - s12;
+ tmp[3 * 4 + i] = d03 - 2 * d12;
+ }
+
+ for (int i = 0; i < 4; i++)
+ {
+ int s03 = tmp[i * 4 + 0] + tmp[i * 4 + 3];
+ int s12 = tmp[i * 4 + 1] + tmp[i * 4 + 2];
+ int d03 = tmp[i * 4 + 0] - tmp[i * 4 + 3];
+ int d12 = tmp[i * 4 + 1] - tmp[i * 4 + 2];
+
+ res[i * 4 + 0] = s03 + s12;
+ res[i * 4 + 1] = 2 * d03 + d12;
+ res[i * 4 + 2] = s03 - s12;
+ res[i * 4 + 3] = d03 - 2 * d12;
+ }
+}
+
+/* { dg-final { scan-tree-dump {Deleted dead store:.*tmp} "dse3" } } */
@@ -17,5 +17,5 @@ int foo (int *p, int b)
/* { dg-final { scan-tree-dump-not "Deleted dead store" "dse1"} } */
/* { dg-final { scan-tree-dump-not "Deleted dead store" "dse2"} } */
-/* { dg-final { scan-tree-dump-not "Deleted dead store" "dse3"} } */
+/* { dg-final { scan-tree-dump-not "Deleted dead store" "dse4"} } */
@@ -22,5 +22,5 @@ foo(int cond, struct z *s)
/* { dg-final { scan-tree-dump-times "Deleted dead store" 3 "dse1"} } */
/* { dg-final { scan-tree-dump-not "Deleted dead store" "dse2"} } */
-/* { dg-final { scan-tree-dump-not "Deleted dead store" "dse3"} } */
+/* { dg-final { scan-tree-dump-not "Deleted dead store" "dse4"} } */
@@ -10,7 +10,10 @@ foo (int *a, int *b)
a[i] = b[0] + b[1] + b[i+1] + b[i+2];
}
-void bar (int *a, int *b)
+/* Disable pre-SLP FRE to avoid unexpected SLP on the epilogue
+   of the first loop.  */
+void __attribute__((optimize("-fno-tree-fre")))
+bar (int *a, int *b)
{
int i;
for (i = 0; i < (ARR_SIZE - 2); ++i)
@@ -193,6 +193,7 @@ DEFTIMEVAR (TV_TREE_LOOP_UNSWITCH , "tree loop unswitching")
DEFTIMEVAR (TV_LOOP_SPLIT , "loop splitting")
DEFTIMEVAR (TV_LOOP_JAM , "unroll and jam")
DEFTIMEVAR (TV_COMPLETE_UNROLL , "complete unrolling")
+DEFTIMEVAR (TV_SCALAR_CLEANUP , "scalar cleanup")
DEFTIMEVAR (TV_TREE_PARALLELIZE_LOOPS, "tree parallelize loops")
DEFTIMEVAR (TV_TREE_VECTORIZATION , "tree vectorization")
DEFTIMEVAR (TV_TREE_SLP_VECTORIZATION, "tree slp vectorization")
@@ -301,6 +301,11 @@ protected:
/* Release function body and stop pass manager. */
#define TODO_discard_function (1 << 23)
+/* Used as a pending action; the next scalar cleanup pass that
+   encounters it is expected to clear it and perform the cleanup
+   work.  */
+#define TODO_force_next_scalar_cleanup (1 << 24)
+
/* Internally used in execute_function_todo(). */
#define TODO_update_ssa_any \
(TODO_update_ssa \
@@ -380,6 +385,7 @@ extern gimple_opt_pass *make_pass_simduid_cleanup (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_slp_vectorize (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_complete_unroll (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_complete_unrolli (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_pre_slp_scalar_cleanup (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_parallelize_loops (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_loop_prefetch (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt);
@@ -627,6 +633,12 @@ extern gimple_opt_pass *make_pass_convert_switch (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_lower_vaarg (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_gimple_isel (gcc::context *ctxt);
+/* Unlike normal TODO_flags, which are handled right at the beginning
+   or the end of one pass's execution, pending TODO_flags are passed
+   down the pipeline until one of their consumers is able to take
+   them over.  */
+extern unsigned int pending_TODOs;
+
/* Current optimization pass. */
extern opt_pass *current_pass;
@@ -1404,13 +1404,14 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
computations; otherwise, the size might blow up before the
iteration is complete and the IR eventually cleaned up. */
if (loop_outer (loop_father))
- {
- /* Once we process our father we will have processed
- the fathers of our children as well, so avoid doing
- redundant work and clear fathers we've gathered sofar. */
- bitmap_clear (father_bbs);
- bitmap_set_bit (father_bbs, loop_father->header->index);
- }
+ /* Once we process our father we will have processed
+ the fathers of our children as well, so avoid doing
+ redundant work and clear the fathers we've gathered so far.
+ But don't clear it for an outermost loop, since its
+ propagation doesn't necessarily run every time. */
+ bitmap_clear (father_bbs);
+
+ bitmap_set_bit (father_bbs, loop_father->header->index);
return true;
}
@@ -1429,6 +1430,8 @@ tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
bool changed;
int iteration = 0;
bool irred_invalidated = false;
+ int retval = 0;
+ bool outermost_unrolled = false;
estimate_numbers_of_iterations (cfun);
@@ -1472,6 +1475,8 @@ tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
if (loop_outer (unrolled_loop_bb->loop_father))
bitmap_set_bit (fathers,
unrolled_loop_bb->loop_father->num);
+ else if (!outermost_unrolled)
+ outermost_unrolled = true;
}
bitmap_clear (father_bbs);
/* Propagate the constants within the new basic blocks. */
@@ -1509,11 +1514,15 @@ tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
BITMAP_FREE (father_bbs);
+ /* Trigger scalar cleanup once any outermost loop gets unrolled. */
+ if (outermost_unrolled && unroll_outer)
+ retval |= TODO_force_next_scalar_cleanup;
+
if (irred_invalidated
&& loops_state_satisfies_p (LOOPS_HAVE_MARKED_IRREDUCIBLE_REGIONS))
mark_irreducible_loops ();
- return 0;
+ return retval;
}
/* Canonical induction variable creation pass. */