Message ID | 20150217210307.GB13234@kam.mff.cuni.cz |
---|---|
State | New |
Headers | show |
On 02/17/2015 10:03 PM, Jan Hubicka wrote: > Hi, > this patch should chase away the expensive thunk and alias walks from most > of the analysis code. I think the only real use left is the local_p predicate, which needs to > stay because i386 expects the local flag to match between caller and callee when > expanding an assembler thunk. I at least optimized it by first making the walk > conditional on nonlocal functions only and then reorganizing > call_for_symbol_thunks_and_aliases to first inspect aliases (that is cheap) and > only then work on thunks. Most likely this will find the non-local thunk/alias > faster. The other cases were leftovers from the conversion of thunks from aliases > to functions. > > I also noticed a bug in ipa-profile, which did not disable all the > transforms when !flag_ipa_profile is set on an OPTIMIZATION_NODE, and fixed it. > > Bootstrapped/regtested x86_64-linux, committed. I would be interested to > know if call_for_symbol_thunks_and_aliases is now off your oprofiles > (sorry, easier to type than perf-profiles) > > Honza > > * ipa-visibility.c (function_and_variable_visibility): Only > check locality if node is not already local. > * ipa-inline.c (want_inline_function_to_all_callers_p): Use > call_for_symbol_and_aliases instead of > call_for_symbol_thunks_and_aliases. > (ipa_inline): Likewise. > * cgraph.c (cgraph_node::call_for_symbol_thunks_and_aliases): > First walk aliases. > * ipa.c (symbol_table::remove_unreachable_nodes): Use > call_for_symbol_and_aliases. > * ipa-profile.c (ipa_propagate_frequency_data): Add function_symbol. > (ipa_propagate_frequency_1): Use it; use opt_for_fn. > (ipa_propagate_frequency): Update. > (ipa_profile): Add opt_for_fn guards. 
> Index: ipa-visibility.c > =================================================================== > --- ipa-visibility.c (revision 220741) > +++ ipa-visibility.c (working copy) > @@ -595,7 +595,8 @@ function_and_variable_visibility (bool w > } > FOR_EACH_DEFINED_FUNCTION (node) > { > - node->local.local |= node->local_p (); > + if (!node->local.local) > + node->local.local |= node->local_p (); > > /* If we know that function can not be overwritten by a different semantics > and moreover its section can not be discarded, replace all direct calls > Index: ipa-inline.c > =================================================================== > --- ipa-inline.c (revision 220741) > +++ ipa-inline.c (working copy) > @@ -975,14 +975,14 @@ want_inline_function_to_all_callers_p (s > if (node->global.inlined_to) > return false; > /* Does it have callers? */ > - if (!node->call_for_symbol_thunks_and_aliases (has_caller_p, NULL, true)) > + if (!node->call_for_symbol_and_aliases (has_caller_p, NULL, true)) > return false; > /* Inlining into all callers would increase size? */ > if (estimate_growth (node) > 0) > return false; > /* All inlines must be possible. 
*/ > - if (node->call_for_symbol_thunks_and_aliases (check_callers, &has_hot_call, > - true)) > + if (node->call_for_symbol_and_aliases (check_callers, &has_hot_call, > + true)) > return false; > if (!cold && !has_hot_call) > return false; > @@ -2359,9 +2359,9 @@ ipa_inline (void) > if (want_inline_function_to_all_callers_p (node, cold)) > { > int num_calls = 0; > - node->call_for_symbol_thunks_and_aliases (sum_callers, &num_calls, > - true); > - while (node->call_for_symbol_thunks_and_aliases > + node->call_for_symbol_and_aliases (sum_callers, &num_calls, > + true); > + while (node->call_for_symbol_and_aliases > (inline_to_all_callers, &num_calls, true)) > ; > remove_functions = true; > Index: cgraph.c > =================================================================== > --- cgraph.c (revision 220741) > +++ cgraph.c (working copy) > @@ -2191,6 +2191,16 @@ cgraph_node::call_for_symbol_thunks_and_ > > if (callback (this, data)) > return true; > + FOR_EACH_ALIAS (this, ref) > + { > + cgraph_node *alias = dyn_cast <cgraph_node *> (ref->referring); > + if (include_overwritable > + || alias->get_availability () > AVAIL_INTERPOSABLE) > + if (alias->call_for_symbol_thunks_and_aliases (callback, data, > + include_overwritable, > + exclude_virtual_thunks)) > + return true; > + } > for (e = callers; e; e = e->next_caller) > if (e->caller->thunk.thunk_p > && (include_overwritable > @@ -2202,16 +2212,6 @@ cgraph_node::call_for_symbol_thunks_and_ > exclude_virtual_thunks)) > return true; > > - FOR_EACH_ALIAS (this, ref) > - { > - cgraph_node *alias = dyn_cast <cgraph_node *> (ref->referring); > - if (include_overwritable > - || alias->get_availability () > AVAIL_INTERPOSABLE) > - if (alias->call_for_symbol_thunks_and_aliases (callback, data, > - include_overwritable, > - exclude_virtual_thunks)) > - return true; > - } > return false; > } > > Index: ipa.c > =================================================================== > --- ipa.c (revision 220741) > +++ ipa.c (working 
copy) > @@ -661,7 +661,7 @@ symbol_table::remove_unreachable_nodes ( > if (node->address_taken > && !node->used_from_other_partition) > { > - if (!node->call_for_symbol_thunks_and_aliases > + if (!node->call_for_symbol_and_aliases > (has_addr_references_p, NULL, true) > && (!node->instrumentation_clone > || !node->instrumented_version > Index: ipa-profile.c > =================================================================== > --- ipa-profile.c (revision 220741) > +++ ipa-profile.c (working copy) > @@ -322,6 +322,7 @@ ipa_profile_read_summary (void) > > struct ipa_propagate_frequency_data > { > + cgraph_node *function_symbol; > bool maybe_unlikely_executed; > bool maybe_executed_once; > bool only_called_at_startup; > @@ -342,7 +343,7 @@ ipa_propagate_frequency_1 (struct cgraph > || d->only_called_at_startup || d->only_called_at_exit); > edge = edge->next_caller) > { > - if (edge->caller != node) > + if (edge->caller != d->function_symbol) > { > d->only_called_at_startup &= edge->caller->only_called_at_startup; > /* It makes sense to put main() together with the static constructors. > @@ -358,7 +359,11 @@ ipa_propagate_frequency_1 (struct cgraph > errors can make us to push function into unlikely section even when > it is executed by the train run. Transfer the function only if all > callers are unlikely executed. */ > - if (profile_info && flag_branch_probabilities > + if (profile_info > + && opt_for_fn (d->function_symbol->decl, flag_branch_probabilities) > + /* Thunks are not profiled. This is more or less implementation > + bug. 
*/ > + && !d->function_symbol->thunk.thunk_p > && (edge->caller->frequency != NODE_FREQUENCY_UNLIKELY_EXECUTED > || (edge->caller->global.inlined_to > && edge->caller->global.inlined_to->frequency > @@ -418,7 +423,7 @@ contains_hot_call_p (struct cgraph_node > bool > ipa_propagate_frequency (struct cgraph_node *node) > { > - struct ipa_propagate_frequency_data d = {true, true, true, true}; > + struct ipa_propagate_frequency_data d = {node, true, true, true, true}; > bool changed = false; > > /* We can not propagate anything useful about externally visible functions > @@ -432,8 +437,8 @@ ipa_propagate_frequency (struct cgraph_n > if (dump_file && (dump_flags & TDF_DETAILS)) > fprintf (dump_file, "Processing frequency %s\n", node->name ()); > > - node->call_for_symbol_thunks_and_aliases (ipa_propagate_frequency_1, &d, > - true); > + node->call_for_symbol_and_aliases (ipa_propagate_frequency_1, &d, > + true); > > if ((d.only_called_at_startup && !d.only_called_at_exit) > && !node->only_called_at_startup) > @@ -597,6 +602,9 @@ ipa_profile (void) > { > bool update = false; > > + if (!opt_for_fn (n->decl, flag_ipa_profile)) > + continue; > + > for (e = n->indirect_calls; e; e = e->next_callee) > { > if (n->count) > @@ -697,7 +705,9 @@ ipa_profile (void) > order_pos = ipa_reverse_postorder (order); > for (i = order_pos - 1; i >= 0; i--) > { > - if (order[i]->local.local && ipa_propagate_frequency (order[i])) > + if (order[i]->local.local > + && opt_for_fn (order[i]->decl, flag_ipa_profile) > + && ipa_propagate_frequency (order[i])) > { > for (e = order[i]->callees; e; e = e->next_callee) > if (e->callee->local.local && !e->callee->aux) > @@ -714,7 +724,9 @@ ipa_profile (void) > something_changed = false; > for (i = order_pos - 1; i >= 0; i--) > { > - if (order[i]->aux && ipa_propagate_frequency (order[i])) > + if (order[i]->aux > + && opt_for_fn (order[i]->decl, flag_ipa_profile) > + && ipa_propagate_frequency (order[i])) > { > for (e = order[i]->callees; e; e = 
e->next_callee) > if (e->callee->local.local && !e->callee->aux) > Hi. Here are the perf report and the -ftime-report output for the WPA phase. Martin Execution times (seconds) phase setup : 0.00 ( 0%) usr 0.00 ( 0%) sys 0.00 ( 0%) wall 1977 kB ( 0%) ggc phase opt and generate : 171.18 (65%) usr 2.29 (47%) sys 173.40 (65%) wall 2682609 kB (13%) ggc phase stream in : 92.09 (35%) usr 2.55 (53%) sys 94.61 (35%) wall18738048 kB (87%) ggc callgraph optimization : 0.72 ( 0%) usr 0.00 ( 0%) sys 0.73 ( 0%) wall 16 kB ( 0%) ggc ipa dead code removal : 5.12 ( 2%) usr 0.05 ( 1%) sys 5.07 ( 2%) wall 0 kB ( 0%) ggc ipa virtual call target : 2.93 ( 1%) usr 0.03 ( 1%) sys 3.02 ( 1%) wall 0 kB ( 0%) ggc ipa devirtualization : 0.26 ( 0%) usr 0.01 ( 0%) sys 0.34 ( 0%) wall 32646 kB ( 0%) ggc ipa cp : 4.29 ( 2%) usr 0.48 (10%) sys 4.86 ( 2%) wall 851380 kB ( 4%) ggc ipa inlining heuristics : 122.37 (46%) usr 0.42 ( 9%) sys 122.72 (46%) wall 807997 kB ( 4%) ggc ipa comdats : 0.53 ( 0%) usr 0.00 ( 0%) sys 0.53 ( 0%) wall 0 kB ( 0%) ggc ipa lto gimple in : 5.16 ( 2%) usr 1.09 (23%) sys 6.64 ( 2%) wall 1370302 kB ( 6%) ggc ipa lto decl in : 79.11 (30%) usr 1.58 (33%) sys 80.64 (30%) wall16957092 kB (79%) ggc ipa lto constructors in : 0.37 ( 0%) usr 0.06 ( 1%) sys 0.37 ( 0%) wall 22897 kB ( 0%) ggc ipa lto cgraph I/O : 1.44 ( 1%) usr 0.24 ( 5%) sys 1.69 ( 1%) wall 901960 kB ( 4%) ggc ipa lto decl merge : 3.27 ( 1%) usr 0.01 ( 0%) sys 3.26 ( 1%) wall 16383 kB ( 0%) ggc ipa lto cgraph merge : 4.63 ( 2%) usr 0.04 ( 1%) sys 4.68 ( 2%) wall 20432 kB ( 0%) ggc whopr wpa : 1.70 ( 1%) usr 0.00 ( 0%) sys 1.71 ( 1%) wall 2 kB ( 0%) ggc whopr partitioning : 4.72 ( 2%) usr 0.02 ( 0%) sys 4.73 ( 2%) wall 7796 kB ( 0%) ggc ipa reference : 2.70 ( 1%) usr 0.10 ( 2%) sys 2.80 ( 1%) wall 0 kB ( 0%) ggc ipa profile : 0.53 ( 0%) usr 0.03 ( 1%) sys 0.58 ( 0%) wall 0 kB ( 0%) ggc ipa pure const : 3.13 ( 1%) usr 0.09 ( 2%) sys 3.21 ( 1%) wall 0 kB ( 0%) ggc ipa icf : 16.96 ( 6%) usr 0.17 ( 4%) sys 17.06 ( 6%) wall 3087 kB ( 0%) ggc 
inline parameters : 0.01 ( 0%) usr 0.00 ( 0%) sys 0.00 ( 0%) wall 0 kB ( 0%) ggc tree SSA rewrite : 0.39 ( 0%) usr 0.05 ( 1%) sys 0.27 ( 0%) wall 51205 kB ( 0%) ggc tree SSA other : 0.00 ( 0%) usr 0.00 ( 0%) sys 0.01 ( 0%) wall 0 kB ( 0%) ggc tree SSA incremental : 0.50 ( 0%) usr 0.08 ( 2%) sys 0.50 ( 0%) wall 33556 kB ( 0%) ggc tree operand scan : 0.45 ( 0%) usr 0.11 ( 2%) sys 0.47 ( 0%) wall 343892 kB ( 2%) ggc dominance frontiers : 0.05 ( 0%) usr 0.00 ( 0%) sys 0.04 ( 0%) wall 0 kB ( 0%) ggc dominance computation : 0.51 ( 0%) usr 0.08 ( 2%) sys 0.58 ( 0%) wall 0 kB ( 0%) ggc varconst : 0.02 ( 0%) usr 0.06 ( 1%) sys 0.05 ( 0%) wall 0 kB ( 0%) ggc loop fini : 0.12 ( 0%) usr 0.00 ( 0%) sys 0.13 ( 0%) wall 0 kB ( 0%) ggc unaccounted todo : 1.19 ( 0%) usr 0.00 ( 0%) sys 1.15 ( 0%) wall 0 kB ( 0%) ggc TOTAL : 263.27 4.84 268.01 21422636 kB [ perf record: Woken up 254 times to write data ] [ perf record: Captured and wrote 63.481 MB perf.data (~2773530 samples) ] marxin@marxinbox:~/Programming/chromium/src/out/Release> perf report --stdio | sed 's/\ *$//' | head -n50# To display the perf.data header info, please use --header/--header-only options. # # Samples: 1M of event 'cycles' # Event count (approx.): 945739511218 # # Overhead Command Shared Object # ........ ........ ................. .................................................................................................................................................................................................................................................................................................. # 19.88% lto1-wpa lto1 [.] nonremovable_p(cgraph_node*, void*) 9.17% lto1-wpa lto1 [.] cgraph_node::used_from_object_file_p_worker(cgraph_node*, void*) 7.93% lto1-wpa lto1 [.] cgraph_node::call_for_symbol_and_aliases_1(bool (*)(cgraph_node*, void*), void*, bool) 6.37% lto1-wpa lto1 [.] inflate_fast 2.23% lto1-wpa lto1 [.] 
compare_tree_sccs_1(tree_node*, tree_node*, tree_node***) 2.14% lto1-wpa lto1 [.] streamer_read_uhwi(lto_input_block*) 1.96% lto1-wpa lto1 [.] ht_lookup_with_hash(ht*, unsigned char const*, unsigned long, unsigned int, ht_lookup_option) 1.83% lto1-wpa lto1 [.] unify_scc(streamer_tree_cache_d*, unsigned int, unsigned int, unsigned int, unsigned int) 1.61% lto1-wpa lto1 [.] streamer_read_tree_bitfields(lto_input_block*, data_in*, tree_node*) 1.23% lto1-wpa lto1 [.] lto_cgraph_replace_node(cgraph_node*, cgraph_node*) 1.21% lto1-wpa libc-2.19.so [.] msort_with_tmp.part.0 1.19% lto1-wpa lto1 [.] streamer_get_pickled_tree(lto_input_block*, data_in*) 1.14% lto1-wpa lto1 [.] symbol_table::remove_unreachable_nodes(_IO_FILE*) 1.08% lto1-wpa libc-2.19.so [.] _int_malloc 1.02% lto1-wpa lto1 [.] ipa_icf::sem_variable::equals(tree_node*, tree_node*) 0.96% lto1-wpa lto1 [.] lto_input_tree_1(lto_input_block*, data_in*, LTO_tags, unsigned int) 0.84% lto1-wpa lto1 [.] inflate 0.74% lto1-wpa lto1 [.] adler32 0.71% lto1-wpa lto1 [.] lto_input_tree(lto_input_block*, data_in*) 0.68% lto1-wpa lto1 [.] cgraph_node::call_for_symbol_thunks_and_aliases(bool (*)(cgraph_node*, void*), void*, bool, bool) 0.66% lto1-wpa lto1 [.] streamer_read_tree_body(lto_input_block*, data_in*, tree_node*) 0.64% lto1-wpa lto1 [.] estimate_calls_size_and_time(cgraph_node*, int*, int*, int*, int*, unsigned int, vec<tree_node*, va_heap, vl_ptr>, vec<ipa_polymorphic_call_context, va_heap, vl_ptr>, vec<ipa_agg_jump_function*, va_heap, vl_ptr>) [clone .isra.129] 0.63% lto1-wpa lto1 [.] lto_input_location(bitpack_d*, data_in*)
On 02/18/2015 02:58 PM, Martin Liška wrote: > On 02/17/2015 10:03 PM, Jan Hubicka wrote: >> Hi, >> this patch should chase away the expensive thunks and aliases walks from most >> of analysis code. I think only real use left is local_p predicate that needs to >> stay because i386 expect local flag to match between caller and callee when >> expanding assembler thunk. I at least optimized it by first moving the walk to >> be conditional for nonlocal functions only and then reorganizing >> call_for_symbol_thunks_and_aliases to first inspect aliases (that is cheap) and >> only then work on thunks. Most likely this will find the non-local thunk/alias >> faster. Other cases was leftovers from the conversion of thunks from aliases >> to functions. >> >> I also noticed a bug in ipa-profile that does not disable all the >> transofrms with !ipa_profile_flag used on OPTIMIZTION_NODE and fixed it. >> >> Bootstrapped/regtested x86_64-linux, comitted. I would be interested to >> know if the call_for_symbol_thunks_and_aliases is now off your oprofiles >> (sorry, easier to type than perf-profiles) >> >> Honza >> >> * ipa-visibility.c (function_and_variable_visibility): Only >> check locality if node is not already local. >> * ipa-inline.c (want_inline_function_to_all_callers_p): Use >> call_for_symbol_and_aliases instead of >> call_for_symbol_thunks_and_aliases. >> (ipa_inline): Likewise. >> * cgraph.c (cgraph_node::call_for_symbol_thunks_and_aliases): >> first walk aliases. >> * ipa.c (symbol_table::remove_unreachable_nodes): Use >> call_for_symbol_and_aliases. >> * ipa-profile.c (ipa_propagate_frequency_data): Add function_symbol. >> (ipa_propagate_frequency_1): Use it; use opt_for_fn >> (ipa_propagate_frequency): Update. >> (ipa_profile): Add opt_for_fn gueards. 
>> Index: ipa-visibility.c >> =================================================================== >> --- ipa-visibility.c (revision 220741) >> +++ ipa-visibility.c (working copy) >> @@ -595,7 +595,8 @@ function_and_variable_visibility (bool w >> } >> FOR_EACH_DEFINED_FUNCTION (node) >> { >> - node->local.local |= node->local_p (); >> + if (!node->local.local) >> + node->local.local |= node->local_p (); >> >> /* If we know that function can not be overwritten by a different semantics >> and moreover its section can not be discarded, replace all direct calls >> Index: ipa-inline.c >> =================================================================== >> --- ipa-inline.c (revision 220741) >> +++ ipa-inline.c (working copy) >> @@ -975,14 +975,14 @@ want_inline_function_to_all_callers_p (s >> if (node->global.inlined_to) >> return false; >> /* Does it have callers? */ >> - if (!node->call_for_symbol_thunks_and_aliases (has_caller_p, NULL, true)) >> + if (!node->call_for_symbol_and_aliases (has_caller_p, NULL, true)) >> return false; >> /* Inlining into all callers would increase size? */ >> if (estimate_growth (node) > 0) >> return false; >> /* All inlines must be possible. 
*/ >> - if (node->call_for_symbol_thunks_and_aliases (check_callers, &has_hot_call, >> - true)) >> + if (node->call_for_symbol_and_aliases (check_callers, &has_hot_call, >> + true)) >> return false; >> if (!cold && !has_hot_call) >> return false; >> @@ -2359,9 +2359,9 @@ ipa_inline (void) >> if (want_inline_function_to_all_callers_p (node, cold)) >> { >> int num_calls = 0; >> - node->call_for_symbol_thunks_and_aliases (sum_callers, &num_calls, >> - true); >> - while (node->call_for_symbol_thunks_and_aliases >> + node->call_for_symbol_and_aliases (sum_callers, &num_calls, >> + true); >> + while (node->call_for_symbol_and_aliases >> (inline_to_all_callers, &num_calls, true)) >> ; >> remove_functions = true; >> Index: cgraph.c >> =================================================================== >> --- cgraph.c (revision 220741) >> +++ cgraph.c (working copy) >> @@ -2191,6 +2191,16 @@ cgraph_node::call_for_symbol_thunks_and_ >> >> if (callback (this, data)) >> return true; >> + FOR_EACH_ALIAS (this, ref) >> + { >> + cgraph_node *alias = dyn_cast <cgraph_node *> (ref->referring); >> + if (include_overwritable >> + || alias->get_availability () > AVAIL_INTERPOSABLE) >> + if (alias->call_for_symbol_thunks_and_aliases (callback, data, >> + include_overwritable, >> + exclude_virtual_thunks)) >> + return true; >> + } >> for (e = callers; e; e = e->next_caller) >> if (e->caller->thunk.thunk_p >> && (include_overwritable >> @@ -2202,16 +2212,6 @@ cgraph_node::call_for_symbol_thunks_and_ >> exclude_virtual_thunks)) >> return true; >> >> - FOR_EACH_ALIAS (this, ref) >> - { >> - cgraph_node *alias = dyn_cast <cgraph_node *> (ref->referring); >> - if (include_overwritable >> - || alias->get_availability () > AVAIL_INTERPOSABLE) >> - if (alias->call_for_symbol_thunks_and_aliases (callback, data, >> - include_overwritable, >> - exclude_virtual_thunks)) >> - return true; >> - } >> return false; >> } >> >> Index: ipa.c >> 
=================================================================== >> --- ipa.c (revision 220741) >> +++ ipa.c (working copy) >> @@ -661,7 +661,7 @@ symbol_table::remove_unreachable_nodes ( >> if (node->address_taken >> && !node->used_from_other_partition) >> { >> - if (!node->call_for_symbol_thunks_and_aliases >> + if (!node->call_for_symbol_and_aliases >> (has_addr_references_p, NULL, true) >> && (!node->instrumentation_clone >> || !node->instrumented_version >> Index: ipa-profile.c >> =================================================================== >> --- ipa-profile.c (revision 220741) >> +++ ipa-profile.c (working copy) >> @@ -322,6 +322,7 @@ ipa_profile_read_summary (void) >> >> struct ipa_propagate_frequency_data >> { >> + cgraph_node *function_symbol; >> bool maybe_unlikely_executed; >> bool maybe_executed_once; >> bool only_called_at_startup; >> @@ -342,7 +343,7 @@ ipa_propagate_frequency_1 (struct cgraph >> || d->only_called_at_startup || d->only_called_at_exit); >> edge = edge->next_caller) >> { >> - if (edge->caller != node) >> + if (edge->caller != d->function_symbol) >> { >> d->only_called_at_startup &= edge->caller->only_called_at_startup; >> /* It makes sense to put main() together with the static constructors. >> @@ -358,7 +359,11 @@ ipa_propagate_frequency_1 (struct cgraph >> errors can make us to push function into unlikely section even when >> it is executed by the train run. Transfer the function only if all >> callers are unlikely executed. */ >> - if (profile_info && flag_branch_probabilities >> + if (profile_info >> + && opt_for_fn (d->function_symbol->decl, flag_branch_probabilities) >> + /* Thunks are not profiled. This is more or less implementation >> + bug. 
*/ >> + && !d->function_symbol->thunk.thunk_p >> && (edge->caller->frequency != NODE_FREQUENCY_UNLIKELY_EXECUTED >> || (edge->caller->global.inlined_to >> && edge->caller->global.inlined_to->frequency >> @@ -418,7 +423,7 @@ contains_hot_call_p (struct cgraph_node >> bool >> ipa_propagate_frequency (struct cgraph_node *node) >> { >> - struct ipa_propagate_frequency_data d = {true, true, true, true}; >> + struct ipa_propagate_frequency_data d = {node, true, true, true, true}; >> bool changed = false; >> >> /* We can not propagate anything useful about externally visible functions >> @@ -432,8 +437,8 @@ ipa_propagate_frequency (struct cgraph_n >> if (dump_file && (dump_flags & TDF_DETAILS)) >> fprintf (dump_file, "Processing frequency %s\n", node->name ()); >> >> - node->call_for_symbol_thunks_and_aliases (ipa_propagate_frequency_1, &d, >> - true); >> + node->call_for_symbol_and_aliases (ipa_propagate_frequency_1, &d, >> + true); >> >> if ((d.only_called_at_startup && !d.only_called_at_exit) >> && !node->only_called_at_startup) >> @@ -597,6 +602,9 @@ ipa_profile (void) >> { >> bool update = false; >> >> + if (!opt_for_fn (n->decl, flag_ipa_profile)) >> + continue; >> + >> for (e = n->indirect_calls; e; e = e->next_callee) >> { >> if (n->count) >> @@ -697,7 +705,9 @@ ipa_profile (void) >> order_pos = ipa_reverse_postorder (order); >> for (i = order_pos - 1; i >= 0; i--) >> { >> - if (order[i]->local.local && ipa_propagate_frequency (order[i])) >> + if (order[i]->local.local >> + && opt_for_fn (order[i]->decl, flag_ipa_profile) >> + && ipa_propagate_frequency (order[i])) >> { >> for (e = order[i]->callees; e; e = e->next_callee) >> if (e->callee->local.local && !e->callee->aux) >> @@ -714,7 +724,9 @@ ipa_profile (void) >> something_changed = false; >> for (i = order_pos - 1; i >= 0; i--) >> { >> - if (order[i]->aux && ipa_propagate_frequency (order[i])) >> + if (order[i]->aux >> + && opt_for_fn (order[i]->decl, flag_ipa_profile) >> + && ipa_propagate_frequency 
(order[i])) >> { >> for (e = order[i]->callees; e; e = e->next_callee) >> if (e->callee->local.local && !e->callee->aux) >> > > Hi. > > There's perf report and -ftime report of WPA phase. > > Martin Hm, using the same compiler, the Firefox LTO time statistics and perf report are very different. I'm wondering how that can be possible? Martin Execution times (seconds) phase setup : 0.00 ( 0%) usr 0.00 ( 0%) sys 0.01 ( 0%) wall 1988 kB ( 0%) ggc phase opt and generate : 42.32 (70%) usr 0.85 (56%) sys 43.16 (69%) wall 1387464 kB (28%) ggc phase stream in : 18.50 (30%) usr 0.68 (44%) sys 19.17 (31%) wall 3528077 kB (72%) ggc garbage collection : 2.24 ( 4%) usr 0.00 ( 0%) sys 2.24 ( 4%) wall 0 kB ( 0%) ggc callgraph optimization : 0.37 ( 1%) usr 0.00 ( 0%) sys 0.37 ( 1%) wall 38 kB ( 0%) ggc ipa dead code removal : 3.06 ( 5%) usr 0.01 ( 1%) sys 2.88 ( 5%) wall 0 kB ( 0%) ggc ipa virtual call target : 5.72 ( 9%) usr 0.06 ( 4%) sys 5.87 ( 9%) wall 0 kB ( 0%) ggc ipa devirtualization : 0.18 ( 0%) usr 0.00 ( 0%) sys 0.23 ( 0%) wall 22382 kB ( 0%) ggc ipa cp : 2.88 ( 5%) usr 0.09 ( 6%) sys 2.97 ( 5%) wall 515623 kB (10%) ggc ipa inlining heuristics : 13.96 (23%) usr 0.13 ( 8%) sys 14.12 (23%) wall 471848 kB (10%) ggc ipa comdats : 0.12 ( 0%) usr 0.00 ( 0%) sys 0.12 ( 0%) wall 0 kB ( 0%) ggc ipa lto gimple in : 2.54 ( 4%) usr 0.48 (31%) sys 3.23 ( 5%) wall 645652 kB (13%) ggc ipa lto decl in : 12.64 (21%) usr 0.37 (24%) sys 13.01 (21%) wall 2592737 kB (53%) ggc ipa lto constructors in : 0.17 ( 0%) usr 0.01 ( 1%) sys 0.20 ( 0%) wall 16493 kB ( 0%) ggc ipa lto cgraph I/O : 0.58 ( 1%) usr 0.09 ( 6%) sys 0.67 ( 1%) wall 437504 kB ( 9%) ggc ipa lto decl merge : 1.90 ( 3%) usr 0.00 ( 0%) sys 1.90 ( 3%) wall 8191 kB ( 0%) ggc ipa lto cgraph merge : 1.30 ( 2%) usr 0.00 ( 0%) sys 1.29 ( 2%) wall 14989 kB ( 0%) ggc whopr wpa : 0.91 ( 1%) usr 0.00 ( 0%) sys 0.88 ( 1%) wall 2 kB ( 0%) ggc whopr partitioning : 2.66 ( 4%) usr 0.00 ( 0%) sys 2.67 ( 4%) wall 6081 kB ( 0%) ggc ipa reference : 1.38 
( 2%) usr 0.01 ( 1%) sys 1.40 ( 2%) wall 0 kB ( 0%) ggc ipa profile : 0.21 ( 0%) usr 0.01 ( 1%) sys 0.21 ( 0%) wall 0 kB ( 0%) ggc ipa pure const : 1.61 ( 3%) usr 0.01 ( 1%) sys 1.61 ( 3%) wall 0 kB ( 0%) ggc ipa icf : 4.99 ( 8%) usr 0.06 ( 4%) sys 5.00 ( 8%) wall 1120 kB ( 0%) ggc tree SSA rewrite : 0.12 ( 0%) usr 0.02 ( 1%) sys 0.12 ( 0%) wall 23170 kB ( 0%) ggc tree SSA incremental : 0.23 ( 0%) usr 0.05 ( 3%) sys 0.21 ( 0%) wall 14434 kB ( 0%) ggc tree operand scan : 0.14 ( 0%) usr 0.03 ( 2%) sys 0.22 ( 0%) wall 145252 kB ( 3%) ggc dominance frontiers : 0.04 ( 0%) usr 0.00 ( 0%) sys 0.01 ( 0%) wall 0 kB ( 0%) ggc dominance computation : 0.14 ( 0%) usr 0.05 ( 3%) sys 0.11 ( 0%) wall 0 kB ( 0%) ggc varconst : 0.01 ( 0%) usr 0.02 ( 1%) sys 0.03 ( 0%) wall 0 kB ( 0%) ggc loop fini : 0.07 ( 0%) usr 0.00 ( 0%) sys 0.03 ( 0%) wall 0 kB ( 0%) ggc unaccounted todo : 0.62 ( 1%) usr 0.00 ( 0%) sys 0.65 ( 1%) wall 0 kB ( 0%) ggc TOTAL : 60.82 1.53 62.34 4917531 kB [ perf record: Woken up 59 times to write data ] [ perf record: Captured and wrote 14.722 MB perf.data (~643202 samples) ] marxin@marxinbox:~/Programming/gecko-dev/obj-x86_64-unknown-linux-gnu/toolkit/library> perf report marxin@marxinbox:~/Programming/gecko-dev/obj-x86_64-unknown-linux-gnu/toolkit/library> gcc -v Using built-in specs. 
COLLECT_GCC=gcc COLLECT_LTO_WRAPPER=/home/marxin/Programming/bin/gcc2/lib/gcc/x86_64-unknown-linux-gnu/5.0.0/lto-wrapper Target: x86_64-unknown-linux-gnu Configured with: ../configure --enable-languages=c,c++ --disable-libsanitizer --prefix=/home/marxin/Programming/bin/gcc2 --disable-bootstrap --enable-checking=release Thread model: posix gcc version 5.0.0 20150218 (experimental) (GCC) marxin@marxinbox:~/Programming/gecko-dev/obj-x86_64-unknown-linux-gnu/toolkit/library> perf report marxin@marxinbox:~/Programming/gecko-dev/obj-x86_64-unknown-linux-gnu/toolkit/library> perf report --stdio | sed 's/\ *$//' | head -n50 # To display the perf.data header info, please use --header/--header-only options. # # Samples: 245K of event 'cycles' # Event count (approx.): 216467422123 # # Overhead Command Shared Object # ........ ........ ................. .................................................................................................................................................................................................................................................................................................. # 4.97% lto1-wpa lto1 [.] inflate_fast 2.78% lto1-wpa lto1 [.] symbol_table::remove_unreachable_nodes(_IO_FILE*) 2.37% lto1-wpa libc-2.19.so [.] _int_malloc 1.77% lto1-wpa lto1 [.] record_target_from_binfo(vec<cgraph_node*, va_heap, vl_ptr>&, vec<tree_node*, va_heap, vl_ptr>*, tree_node*, tree_node*, vec<tree_node*, va_heap, vl_ptr>&, long, tree_node*, long, hash_set<tree_node*, default_hashset_traits>*, hash_set<tree_node*, default_hashset_traits>*, bool, bool*) 1.57% lto1-wpa lto1 [.] ht_lookup_with_hash(ht*, unsigned char const*, unsigned long, unsigned int, ht_lookup_option) 1.56% lto1-wpa lto1 [.] streamer_read_uhwi(lto_input_block*) 1.48% lto1-wpa lto1 [.] 
estimate_calls_size_and_time(cgraph_node*, int*, int*, int*, int*, unsigned int, vec<tree_node*, va_heap, vl_ptr>, vec<ipa_polymorphic_call_context, va_heap, vl_ptr>, vec<ipa_agg_jump_function*, va_heap, vl_ptr>) [clone .isra.129] 1.48% lto1-wpa lto1 [.] unify_scc(streamer_tree_cache_d*, unsigned int, unsigned int, unsigned int, unsigned int) 1.40% lto1-wpa lto1 [.] lto_cgraph_replace_node(cgraph_node*, cgraph_node*) 1.38% lto1-wpa lto1 [.] ggc_set_mark(void const*) 1.30% lto1-wpa libc-2.19.so [.] malloc_consolidate 1.28% lto1-wpa lto1 [.] htab_hash_string 1.25% lto1-wpa lto1 [.] compare_tree_sccs_1(tree_node*, tree_node*, tree_node***) 1.23% lto1-wpa lto1 [.] fibonacci_heap<sreal, cgraph_edge>::consolidate() 1.19% lto1-wpa lto1 [.] splay_tree_splay 1.15% lto1-wpa lto1 [.] can_inline_edge_p(cgraph_edge*, bool, bool) 1.14% lto1-wpa lto1 [.] cgraph_node::get_availability() 1.14% lto1-wpa lto1 [.] evaluate_properties_for_edge(cgraph_edge*, bool, unsigned int*, vec<tree_node*, va_heap, vl_ptr>*, vec<ipa_polymorphic_call_context, va_heap, vl_ptr>*, vec<ipa_agg_jump_function*, va_heap, vl_ptr>*) [clone .constprop.131] 1.13% lto1-wpa lto1 [.] gimple_get_virt_method_for_vtable(long, tree_node*, unsigned long, bool*) 1.10% lto1-wpa lto1 [.] types_same_for_odr(tree_node const*, tree_node const*) 1.08% lto1-wpa lto1 [.] gt_ggc_mx_lang_tree_node(void*) 1.05% lto1-wpa lto1 [.] streamer_read_tree_bitfields(lto_input_block*, data_in*, tree_node*) 0.99% lto1-wpa lto1 [.] type_in_anonymous_namespace_p(tree_node const*) 0.99% lto1-wpa lto1 [.] gimple_has_body_p(tree_node*) 0.95% lto1-wpa lto1 [.] decl_assembler_name(tree_node*) 0.93% lto1-wpa lto1 [.] do_per_function(void (*)(function*, void*), void*) 0.82% lto1-wpa libc-2.19.so [.] _int_free 0.81% lto1-wpa lto1 [.] 
possible_polymorphic_call_targets_1(vec<cgraph_node*, va_heap, vl_ptr>&, hash_set<tree_node*, default_hashset_traits>*, hash_set<tree_node*, default_hashset_traits>*, tree_node*, odr_type_d*, long, tree_node*, long, bool*, vec<tree_node*, va_heap, vl_ptr>&, bool) 0.81% lto1-wpa lto1 [.] searchc(searchc_env*, cgraph_node*, bool (*)(cgraph_edge*)) 0.80% lto1-wpa lto1 [.] streamer_get_pickled_tree(lto_input_block*, data_in*) 0.78% lto1-wpa lto1 [.] edge_badness(cgraph_edge*, bool) 0.77% lto1-wpa lto1 [.] hash_table<asmname_hasher, xcallocator, true>::find_slot_with_hash(tree_node const* const&, unsigned int, insert_option) 0.77% lto1-wpa lto1 [.] update_callee_keys(fibonacci_heap<sreal, cgraph_edge>*, cgraph_node*, bitmap_head*) 0.76% lto1-wpa lto1 [.] ggc_internal_alloc(unsigned long, void (*)(void*), unsigned long, unsigned long) 0.75% lto1-wpa lto1 [.] fibonacci_heap<sreal, cgraph_edge>::extract_minimum_node() 0.75% lto1-wpa lto1 [.] execute_one_pass(opt_pass*) 0.74% lto1-wpa lto1 [.] inflate 0.71% lto1-wpa lto1 [.] contains_polymorphic_type_p(tree_node const*) 0.67% lto1-wpa lto1 [.] get_binfo_at_offset(tree_node*, long, tree_node*) 0.64% lto1-wpa lto1 [.] symbol_table::decl_assembler_name_equal(tree_node*, tree_node const*) 0.61% lto1-wpa lto1 [.] lto_balanced_map(int) 0.61% lto1-wpa lto1 [.] ipa_icf::sem_item_optimizer::do_congruence_step_for_index(ipa_icf::congruence_class*, unsigned int)
Index: ipa-visibility.c
===================================================================
--- ipa-visibility.c (revision 220741)
+++ ipa-visibility.c (working copy)
@@ -595,7 +595,8 @@ function_and_variable_visibility (bool w
     }
   FOR_EACH_DEFINED_FUNCTION (node)
     {
-      node->local.local |= node->local_p ();
+      if (!node->local.local)
+        node->local.local |= node->local_p ();
 
       /* If we know that function can not be overwritten by a different semantics
          and moreover its section can not be discarded, replace all direct calls
Index: ipa-inline.c
===================================================================
--- ipa-inline.c (revision 220741)
+++ ipa-inline.c (working copy)
@@ -975,14 +975,14 @@ want_inline_function_to_all_callers_p (s
   if (node->global.inlined_to)
     return false;
   /* Does it have callers? */
-  if (!node->call_for_symbol_thunks_and_aliases (has_caller_p, NULL, true))
+  if (!node->call_for_symbol_and_aliases (has_caller_p, NULL, true))
     return false;
   /* Inlining into all callers would increase size? */
   if (estimate_growth (node) > 0)
     return false;
   /* All inlines must be possible. */
-  if (node->call_for_symbol_thunks_and_aliases (check_callers, &has_hot_call,
-                                                true))
+  if (node->call_for_symbol_and_aliases (check_callers, &has_hot_call,
+                                         true))
     return false;
   if (!cold && !has_hot_call)
     return false;
@@ -2359,9 +2359,9 @@ ipa_inline (void)
           if (want_inline_function_to_all_callers_p (node, cold))
             {
               int num_calls = 0;
-              node->call_for_symbol_thunks_and_aliases (sum_callers, &num_calls,
-                                                        true);
-              while (node->call_for_symbol_thunks_and_aliases
+              node->call_for_symbol_and_aliases (sum_callers, &num_calls,
+                                                 true);
+              while (node->call_for_symbol_and_aliases
                        (inline_to_all_callers, &num_calls, true))
                 ;
               remove_functions = true;
Index: cgraph.c
===================================================================
--- cgraph.c (revision 220741)
+++ cgraph.c (working copy)
@@ -2191,6 +2191,16 @@ cgraph_node::call_for_symbol_thunks_and_
 
   if (callback (this, data))
     return true;
+  FOR_EACH_ALIAS (this, ref)
+    {
+      cgraph_node *alias = dyn_cast <cgraph_node *> (ref->referring);
+      if (include_overwritable
+          || alias->get_availability () > AVAIL_INTERPOSABLE)
+        if (alias->call_for_symbol_thunks_and_aliases (callback, data,
+                                                       include_overwritable,
+                                                       exclude_virtual_thunks))
+          return true;
+    }
   for (e = callers; e; e = e->next_caller)
     if (e->caller->thunk.thunk_p
         && (include_overwritable
@@ -2202,16 +2212,6 @@ cgraph_node::call_for_symbol_thunks_and_
                                                        exclude_virtual_thunks))
         return true;
 
-  FOR_EACH_ALIAS (this, ref)
-    {
-      cgraph_node *alias = dyn_cast <cgraph_node *> (ref->referring);
-      if (include_overwritable
-          || alias->get_availability () > AVAIL_INTERPOSABLE)
-        if (alias->call_for_symbol_thunks_and_aliases (callback, data,
-                                                       include_overwritable,
-                                                       exclude_virtual_thunks))
-          return true;
-    }
   return false;
 }
 
Index: ipa.c
===================================================================
--- ipa.c (revision 220741)
+++ ipa.c (working copy)
@@ -661,7 +661,7 @@ symbol_table::remove_unreachable_nodes (
       if (node->address_taken
           && !node->used_from_other_partition)
         {
-          if (!node->call_for_symbol_thunks_and_aliases
+          if (!node->call_for_symbol_and_aliases
                 (has_addr_references_p, NULL, true)
               && (!node->instrumentation_clone
                   || !node->instrumented_version
Index: ipa-profile.c
===================================================================
--- ipa-profile.c (revision 220741)
+++ ipa-profile.c (working copy)
@@ -322,6 +322,7 @@ ipa_profile_read_summary (void)
 
 struct ipa_propagate_frequency_data
 {
+  cgraph_node *function_symbol;
   bool maybe_unlikely_executed;
   bool maybe_executed_once;
   bool only_called_at_startup;
@@ -342,7 +343,7 @@ ipa_propagate_frequency_1 (struct cgraph
                || d->only_called_at_startup || d->only_called_at_exit);
        edge = edge->next_caller)
     {
-      if (edge->caller != node)
+      if (edge->caller != d->function_symbol)
         {
           d->only_called_at_startup &= edge->caller->only_called_at_startup;
           /* It makes sense to put main() together with the static constructors.
@@ -358,7 +359,11 @@ ipa_propagate_frequency_1 (struct cgraph
              errors can make us to push function into unlikely section even when
              it is executed by the train run. Transfer the function only if all
              callers are unlikely executed. */
-          if (profile_info && flag_branch_probabilities
+          if (profile_info
+              && opt_for_fn (d->function_symbol->decl, flag_branch_probabilities)
+              /* Thunks are not profiled. This is more or less implementation
+                 bug. */
+              && !d->function_symbol->thunk.thunk_p
               && (edge->caller->frequency != NODE_FREQUENCY_UNLIKELY_EXECUTED
                   || (edge->caller->global.inlined_to
                       && edge->caller->global.inlined_to->frequency
@@ -418,7 +423,7 @@ contains_hot_call_p (struct cgraph_node
 bool
 ipa_propagate_frequency (struct cgraph_node *node)
 {
-  struct ipa_propagate_frequency_data d = {true, true, true, true};
+  struct ipa_propagate_frequency_data d = {node, true, true, true, true};
   bool changed = false;
 
   /* We can not propagate anything useful about externally visible functions
@@ -432,8 +437,8 @@ ipa_propagate_frequency (struct cgraph_n
   if (dump_file && (dump_flags & TDF_DETAILS))
     fprintf (dump_file, "Processing frequency %s\n", node->name ());
 
-  node->call_for_symbol_thunks_and_aliases (ipa_propagate_frequency_1, &d,
-                                            true);
+  node->call_for_symbol_and_aliases (ipa_propagate_frequency_1, &d,
+                                     true);
 
   if ((d.only_called_at_startup && !d.only_called_at_exit)
       && !node->only_called_at_startup)
@@ -597,6 +602,9 @@ ipa_profile (void)
     {
       bool update = false;
 
+      if (!opt_for_fn (n->decl, flag_ipa_profile))
+        continue;
+
       for (e = n->indirect_calls; e; e = e->next_callee)
         {
           if (n->count)
@@ -697,7 +705,9 @@ ipa_profile (void)
   order_pos = ipa_reverse_postorder (order);
   for (i = order_pos - 1; i >= 0; i--)
     {
-      if (order[i]->local.local && ipa_propagate_frequency (order[i]))
+      if (order[i]->local.local
+          && opt_for_fn (order[i]->decl, flag_ipa_profile)
+          && ipa_propagate_frequency (order[i]))
         {
           for (e = order[i]->callees; e; e = e->next_callee)
             if (e->callee->local.local && !e->callee->aux)
@@ -714,7 +724,9 @@ ipa_profile (void)
       something_changed = false;
       for (i = order_pos - 1; i >= 0; i--)
         {
-          if (order[i]->aux && ipa_propagate_frequency (order[i]))
+          if (order[i]->aux
+              && opt_for_fn (order[i]->decl, flag_ipa_profile)
+              && ipa_propagate_frequency (order[i]))
             {
               for (e = order[i]->callees; e; e = e->next_callee)
                 if (e->callee->local.local && !e->callee->aux)