@@ -1,3 +1,13 @@
+2013-11-17 Martin Liska <marxin.liska@gmail.com>
+ Jan Hubicka <jh@suse.cz>
+
+ * cgraphunit.c (node_cmp): New function.
+ (expand_all_functions): Function ordering added.
+ * common.opt: New profile based function reordering flag introduced.
+ * lto-partition.c: Support for time profile added.
+ * lto.c: Likewise.
+ * predict.c (handle_missing_profiles): Time profile handled in
+ missing profiles.
2013-11-16 Joern Rennecke <joern.rennecke@embecosm.com>
* config/arc/arc.c (arc_predicate_delay_insns): New function.
@@ -1824,6 +1824,19 @@ expand_function (struct cgraph_node *node)
ipa_remove_all_references (&node->ref_list);
}
+/* Node comparer that is responsible for the order that corresponds
+ to time when a function was launched for the first time. */
+
+static int
+node_cmp (const void *pa, const void *pb)
+{
+ const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
+ const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;
+
+ return a->tp_first_run != b->tp_first_run
+ ? b->tp_first_run - a->tp_first_run
+ : b->order - a->order;
+}
/* Expand all functions that must be output.
@@ -1835,11 +1848,14 @@ expand_function (struct cgraph_node *node)
to use subsections to make the output functions appear in top-down
order). */
+
static void
expand_all_functions (void)
{
struct cgraph_node *node;
struct cgraph_node **order = XCNEWVEC (struct cgraph_node *, cgraph_n_nodes);
+
+ unsigned int expanded_func_count = 0, profiled_func_count = 0;
int order_pos, new_order_pos = 0;
int i;
@@ -1852,19 +1868,35 @@ expand_all_functions (void)
if (order[i]->process)
order[new_order_pos++] = order[i];
+ if (flag_profile_reorder_functions)
+ qsort (order, new_order_pos, sizeof (struct cgraph_node *), node_cmp);
+
for (i = new_order_pos - 1; i >= 0; i--)
{
node = order[i];
+
if (node->process)
{
+ expanded_func_count++;
+ if (node->tp_first_run)
+ profiled_func_count++;
+
node->process = 0;
expand_function (node);
}
}
+
+ if (in_lto_p && dump_file)
+ fprintf (dump_file, "Expanded functions with time profile (%s):%u/%u\n",
+ main_input_filename, profiled_func_count, expanded_func_count);
+
+ if (cgraph_dump_file && flag_profile_reorder_functions && in_lto_p)
+ fprintf (cgraph_dump_file, "Expanded functions with time profile:%u/%u\n",
+ profiled_func_count, expanded_func_count);
+
cgraph_process_new_functions ();
free (order);
-
}
/* This is used to sort the node types by the cgraph order number. */
@@ -2189,6 +2221,7 @@ compile (void)
#endif
cgraph_state = CGRAPH_STATE_EXPANSION;
+
if (!flag_toplevel_reorder)
output_in_order ();
else
@@ -1708,6 +1708,10 @@ fprofile-report
Common Report Var(profile_report)
Report on consistency of profile
+fprofile-reorder-functions
+Common Report Var(flag_profile_reorder_functions)
+Enable function reordering that improves code placement
+
frandom-seed
Common Var(common_deferred_options) Defer
@@ -393,7 +393,7 @@ Objective-C and Objective-C++ Dialects}.
-fprefetch-loop-arrays -fprofile-report @gol
-fprofile-correction -fprofile-dir=@var{path} -fprofile-generate @gol
-fprofile-generate=@var{path} @gol
--fprofile-use -fprofile-use=@var{path} -fprofile-values @gol
+-fprofile-use -fprofile-use=@var{path} -fprofile-values -fprofile-reorder-functions @gol
-freciprocal-math -free -frename-registers -freorder-blocks @gol
-freorder-blocks-and-partition -freorder-functions @gol
-frerun-cse-after-loop -freschedule-modulo-scheduled-loops @gol
@@ -8645,7 +8645,7 @@ profile useful for later recompilation with profile feedback based
optimization. You must use @option{-fprofile-generate} both when
compiling and when linking your program.
-The following options are enabled: @code{-fprofile-arcs}, @code{-fprofile-values}, @code{-fvpt}.
+The following options are enabled: @code{-fprofile-arcs}, @code{-fprofile-values}, @code{-fprofile-reorder-functions}, @code{-fvpt}.
If @var{path} is specified, GCC looks at the @var{path} to find
the profile feedback data files. See @option{-fprofile-dir}.
@@ -8933,6 +8933,14 @@ from profiling values of expressions for usage in optimizations.
Enabled with @option{-fprofile-generate} and @option{-fprofile-use}.
+@item -fprofile-reorder-functions
+@opindex fprofile-reorder-functions
+Function reordering based on profile instrumentation collects
+the first execution time of each function and orders these
+functions in ascending order.
+
+Enabled with @option{-fprofile-generate} and @option{-fprofile-use}.
+
@item -fvpt
@opindex fvpt
If combined with @option{-fprofile-arcs}, this option instructs the compiler
@@ -280,9 +280,11 @@ add_symbol_to_partition (ltrans_partition part, symtab_node *node)
Be lax about comdats; they may or may not be duplicated and we may
end up in need to duplicate keyed comdat because it has unkeyed alias. */
+
gcc_assert (get_symbol_class (node) == SYMBOL_DUPLICATE
|| DECL_COMDAT (node->decl)
|| !symbol_partitioned_p (node));
+
add_symbol_to_partition_1 (part, node);
}
@@ -395,6 +397,25 @@ node_cmp (const void *pa, const void *pb)
{
const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;
+
+ /* Profile reorder flag enables function reordering based on first execution
+ of a function. All functions with profile are placed in ascending
+ order at the beginning. */
+
+ if (flag_profile_reorder_functions)
+ {
+ /* Functions with time profile are sorted in ascending order. */
+ if (a->tp_first_run && b->tp_first_run)
+ return a->tp_first_run != b->tp_first_run
+ ? a->tp_first_run - b->tp_first_run
+ : a->order - b->order;
+
+ /* Functions with time profile are sorted before the functions
+ that do not have the profile. */
+ if (a->tp_first_run || b->tp_first_run)
+ return b->tp_first_run - a->tp_first_run;
+ }
+
return b->order - a->order;
}
@@ -449,7 +470,7 @@ void
lto_balanced_map (void)
{
int n_nodes = 0;
- int n_varpool_nodes = 0, varpool_pos = 0, best_varpool_pos = 0;
+ int n_varpool_nodes = 0, varpool_pos = 0;
struct cgraph_node **order = XNEWVEC (struct cgraph_node *, cgraph_max_uid);
struct varpool_node **varpool_order = NULL;
int i;
@@ -481,10 +502,13 @@ lto_balanced_map (void)
get better about minimizing the function bounday, but until that
things works smoother if we order in source order. */
qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp);
+
+ if (cgraph_dump_file)
+ for (i = 0; i < n_nodes; i++)
+ fprintf (cgraph_dump_file, "Balanced map symbol order:%s:%u\n", cgraph_node_asm_name (order[i]), order[i]->tp_first_run);
+
if (!flag_toplevel_reorder)
{
- qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp);
-
FOR_EACH_VARIABLE (vnode)
if (get_symbol_class (vnode) == SYMBOL_PARTITION)
n_varpool_nodes++;
@@ -683,7 +707,6 @@ lto_balanced_map (void)
best_i = i;
best_n_nodes = lto_symtab_encoder_size (partition->encoder);
best_total_size = total_size;
- best_varpool_pos = varpool_pos;
}
if (cgraph_dump_file)
fprintf (cgraph_dump_file, "Step %i: added %s/%i, size %i, cost %i/%i "
@@ -701,7 +724,6 @@ lto_balanced_map (void)
fprintf (cgraph_dump_file, "Unwinding %i insertions to step %i\n",
i - best_i, best_i);
undo_partition (partition, best_n_nodes);
- varpool_pos = best_varpool_pos;
}
i = best_i;
/* When we are finished, avoid creating empty partition. */
@@ -849,7 +871,7 @@ may_need_named_section_p (lto_symtab_encoder_t encoder, symtab_node *node)
of the same name in partition ENCODER (or in whole compilation unit if
ENCODER is NULL) and if so, mangle the statics. Always mangle all
conflicting statics, so we reduce changes of silently miscompiling
- asm statemnets referring to them by symbol name. */
+ asm statements referring to them by symbol name. */
static void
rename_statics (lto_symtab_encoder_t encoder, symtab_node *node)
@@ -2453,9 +2453,12 @@ lto_wpa_write_files (void)
/* Sort partitions by size so small ones are compiled last.
FIXME: Even when not reordering we may want to output one list for parallel make
and other for final link command. */
- ltrans_partitions.qsort (flag_toplevel_reorder
+
+ if (!flag_profile_reorder_functions || !flag_profile_use)
+ ltrans_partitions.qsort (flag_toplevel_reorder
? cmp_partitions_size
: cmp_partitions_order);
+
for (i = 0; i < n_sets; i++)
{
size_t len;
@@ -1690,6 +1690,8 @@ common_handle_option (struct gcc_options *opts,
opts->x_flag_vect_cost_model = VECT_COST_MODEL_DYNAMIC;
if (!opts_set->x_flag_tree_loop_distribute_patterns)
opts->x_flag_tree_loop_distribute_patterns = value;
+ if (!opts_set->x_flag_profile_reorder_functions)
+ opts->x_flag_profile_reorder_functions = value;
/* Indirect call profiling should do all useful transformations
speculative devirutalization does. */
if (!opts_set->x_flag_devirtualize_speculatively
@@ -1708,6 +1710,8 @@ common_handle_option (struct gcc_options *opts,
opts->x_flag_profile_values = value;
if (!opts_set->x_flag_inline_functions)
opts->x_flag_inline_functions = value;
+ if (!opts_set->x_flag_profile_reorder_functions)
+ opts->x_flag_profile_reorder_functions = value;
/* FIXME: Instrumentation we insert makes ipa-reference bitmaps
quadratic. Disable the pass until better memory representation
is done. */
@@ -2840,12 +2840,24 @@ handle_missing_profiles (void)
{
struct cgraph_edge *e;
gcov_type call_count = 0;
+ gcov_type max_tp_first_run = 0;
struct function *fn = DECL_STRUCT_FUNCTION (node->decl);
if (node->count)
continue;
for (e = node->callers; e; e = e->next_caller)
+ {
call_count += e->count;
+
+ if (e->caller->tp_first_run > max_tp_first_run)
+ max_tp_first_run = e->caller->tp_first_run;
+ }
+
+ /* If time profile is missing, assign the maximum that comes from
+ caller functions. */
+ if (!node->tp_first_run)
+ node->tp_first_run = max_tp_first_run;
+
if (call_count
&& fn && fn->cfg
&& (call_count * unlikely_count_fraction >= profile_info->runs))