diff mbox

[RFC] LTO: IPA inline speed up for large apps (Chrome)

Message ID 20150217210307.GB13234@kam.mff.cuni.cz
State New
Headers show

Commit Message

Jan Hubicka Feb. 17, 2015, 9:03 p.m. UTC
Hi,
this patch should chase away the expensive thunk and alias walks from most
of the analysis code.  I think the only real use left is the local_p predicate,
which needs to stay because i386 expects the local flag to match between caller
and callee when expanding an assembler thunk.  I at least optimized it by first
making the walk conditional, so it runs for nonlocal functions only, and then
reorganizing call_for_symbol_thunks_and_aliases to inspect aliases first (that
is cheap) and only then work on thunks.  Most likely this will find the
non-local thunk/alias faster.  The other cases were leftovers from the
conversion of thunks from aliases to functions.

I also noticed a bug in ipa-profile: it does not disable all the
transforms with !flag_ipa_profile used on OPTIMIZATION_NODE; I fixed it.

Bootstrapped/regtested x86_64-linux, committed.  I would be interested to
know if call_for_symbol_thunks_and_aliases is now off your oprofiles
(sorry, easier to type than perf-profiles).

Honza

	* ipa-visibility.c (function_and_variable_visibility): Only
	check locality if node is not already local.
	* ipa-inline.c (want_inline_function_to_all_callers_p): Use
	call_for_symbol_and_aliases instead of
	call_for_symbol_thunks_and_aliases.
	(ipa_inline): Likewise.
	* cgraph.c (cgraph_node::call_for_symbol_thunks_and_aliases):
	first walk aliases.
	* ipa.c (symbol_table::remove_unreachable_nodes): Use
	call_for_symbol_and_aliases.
	* ipa-profile.c (ipa_propagate_frequency_data): Add function_symbol.
	(ipa_propagate_frequency_1): Use it; use opt_for_fn.
	(ipa_propagate_frequency): Update.
	(ipa_profile): Add opt_for_fn guards.

Comments

Martin Liška Feb. 18, 2015, 1:58 p.m. UTC | #1
On 02/17/2015 10:03 PM, Jan Hubicka wrote:
> Hi,
> this patch should chase away the expensive thunks and aliases walks from most
> of analysis code. I think only real use left is local_p predicate that needs to
> stay because i386 expect local flag to match between caller and callee when
> expanding assembler thunk. I at least optimized it by first moving the walk to
> be conditional for nonlocal functions only and then reorganizing
> call_for_symbol_thunks_and_aliases to first inspect aliases (that is cheap) and
> only then work on thunks.  Most likely this will find the non-local thunk/alias
> faster.  Other cases was leftovers from the conversion of thunks from aliases
> to functions.
>
> I also noticed a bug in ipa-profile that does not disable all the
> transofrms with !ipa_profile_flag used on OPTIMIZTION_NODE and fixed it.
>
> Bootstrapped/regtested x86_64-linux, comitted.  I would be interested to
> know if the call_for_symbol_thunks_and_aliases is now off your oprofiles
> (sorry, easier to type than perf-profiles)
>
> Honza
>
> 	* ipa-visibility.c (function_and_variable_visibility): Only
> 	check locality if node is not already local.
> 	* ipa-inline.c (want_inline_function_to_all_callers_p): Use
> 	call_for_symbol_and_aliases instead of
> 	call_for_symbol_thunks_and_aliases.
> 	(ipa_inline): Likewise.
> 	* cgraph.c (cgraph_node::call_for_symbol_thunks_and_aliases):
> 	first walk aliases.
> 	* ipa.c (symbol_table::remove_unreachable_nodes): Use
> 	call_for_symbol_and_aliases.
> 	* ipa-profile.c (ipa_propagate_frequency_data): Add function_symbol.
> 	(ipa_propagate_frequency_1): Use it; use opt_for_fn
> 	(ipa_propagate_frequency): Update.
> 	(ipa_profile): Add opt_for_fn gueards.
> Index: ipa-visibility.c
> ===================================================================
> --- ipa-visibility.c	(revision 220741)
> +++ ipa-visibility.c	(working copy)
> @@ -595,7 +595,8 @@ function_and_variable_visibility (bool w
>       }
>     FOR_EACH_DEFINED_FUNCTION (node)
>       {
> -      node->local.local |= node->local_p ();
> +      if (!node->local.local)
> +        node->local.local |= node->local_p ();
>
>         /* If we know that function can not be overwritten by a different semantics
>   	 and moreover its section can not be discarded, replace all direct calls
> Index: ipa-inline.c
> ===================================================================
> --- ipa-inline.c	(revision 220741)
> +++ ipa-inline.c	(working copy)
> @@ -975,14 +975,14 @@ want_inline_function_to_all_callers_p (s
>     if (node->global.inlined_to)
>       return false;
>     /* Does it have callers?  */
> -  if (!node->call_for_symbol_thunks_and_aliases (has_caller_p, NULL, true))
> +  if (!node->call_for_symbol_and_aliases (has_caller_p, NULL, true))
>       return false;
>     /* Inlining into all callers would increase size?  */
>     if (estimate_growth (node) > 0)
>       return false;
>     /* All inlines must be possible.  */
> -  if (node->call_for_symbol_thunks_and_aliases (check_callers, &has_hot_call,
> -						true))
> +  if (node->call_for_symbol_and_aliases (check_callers, &has_hot_call,
> +					 true))
>       return false;
>     if (!cold && !has_hot_call)
>       return false;
> @@ -2359,9 +2359,9 @@ ipa_inline (void)
>   	  if (want_inline_function_to_all_callers_p (node, cold))
>   	    {
>   	      int num_calls = 0;
> -	      node->call_for_symbol_thunks_and_aliases (sum_callers, &num_calls,
> -						      true);
> -	      while (node->call_for_symbol_thunks_and_aliases
> +	      node->call_for_symbol_and_aliases (sum_callers, &num_calls,
> +						 true);
> +	      while (node->call_for_symbol_and_aliases
>   		       (inline_to_all_callers, &num_calls, true))
>   		;
>   	      remove_functions = true;
> Index: cgraph.c
> ===================================================================
> --- cgraph.c	(revision 220741)
> +++ cgraph.c	(working copy)
> @@ -2191,6 +2191,16 @@ cgraph_node::call_for_symbol_thunks_and_
>
>     if (callback (this, data))
>       return true;
> +  FOR_EACH_ALIAS (this, ref)
> +    {
> +      cgraph_node *alias = dyn_cast <cgraph_node *> (ref->referring);
> +      if (include_overwritable
> +	  || alias->get_availability () > AVAIL_INTERPOSABLE)
> +	if (alias->call_for_symbol_thunks_and_aliases (callback, data,
> +						     include_overwritable,
> +						     exclude_virtual_thunks))
> +	  return true;
> +    }
>     for (e = callers; e; e = e->next_caller)
>       if (e->caller->thunk.thunk_p
>   	&& (include_overwritable
> @@ -2202,16 +2212,6 @@ cgraph_node::call_for_symbol_thunks_and_
>   						       exclude_virtual_thunks))
>   	return true;
>
> -  FOR_EACH_ALIAS (this, ref)
> -    {
> -      cgraph_node *alias = dyn_cast <cgraph_node *> (ref->referring);
> -      if (include_overwritable
> -	  || alias->get_availability () > AVAIL_INTERPOSABLE)
> -	if (alias->call_for_symbol_thunks_and_aliases (callback, data,
> -						     include_overwritable,
> -						     exclude_virtual_thunks))
> -	  return true;
> -    }
>     return false;
>   }
>
> Index: ipa.c
> ===================================================================
> --- ipa.c	(revision 220741)
> +++ ipa.c	(working copy)
> @@ -661,7 +661,7 @@ symbol_table::remove_unreachable_nodes (
>       if (node->address_taken
>   	&& !node->used_from_other_partition)
>         {
> -	if (!node->call_for_symbol_thunks_and_aliases
> +	if (!node->call_for_symbol_and_aliases
>   	    (has_addr_references_p, NULL, true)
>   	    && (!node->instrumentation_clone
>   		|| !node->instrumented_version
> Index: ipa-profile.c
> ===================================================================
> --- ipa-profile.c	(revision 220741)
> +++ ipa-profile.c	(working copy)
> @@ -322,6 +322,7 @@ ipa_profile_read_summary (void)
>
>   struct ipa_propagate_frequency_data
>   {
> +  cgraph_node *function_symbol;
>     bool maybe_unlikely_executed;
>     bool maybe_executed_once;
>     bool only_called_at_startup;
> @@ -342,7 +343,7 @@ ipa_propagate_frequency_1 (struct cgraph
>   	        || d->only_called_at_startup || d->only_called_at_exit);
>          edge = edge->next_caller)
>       {
> -      if (edge->caller != node)
> +      if (edge->caller != d->function_symbol)
>   	{
>             d->only_called_at_startup &= edge->caller->only_called_at_startup;
>   	  /* It makes sense to put main() together with the static constructors.
> @@ -358,7 +359,11 @@ ipa_propagate_frequency_1 (struct cgraph
>   	 errors can make us to push function into unlikely section even when
>   	 it is executed by the train run.  Transfer the function only if all
>   	 callers are unlikely executed.  */
> -      if (profile_info && flag_branch_probabilities
> +      if (profile_info
> +	  && opt_for_fn (d->function_symbol->decl, flag_branch_probabilities)
> +	  /* Thunks are not profiled.  This is more or less implementation
> +	     bug.  */
> +	  && !d->function_symbol->thunk.thunk_p
>   	  && (edge->caller->frequency != NODE_FREQUENCY_UNLIKELY_EXECUTED
>   	      || (edge->caller->global.inlined_to
>   		  && edge->caller->global.inlined_to->frequency
> @@ -418,7 +423,7 @@ contains_hot_call_p (struct cgraph_node
>   bool
>   ipa_propagate_frequency (struct cgraph_node *node)
>   {
> -  struct ipa_propagate_frequency_data d = {true, true, true, true};
> +  struct ipa_propagate_frequency_data d = {node, true, true, true, true};
>     bool changed = false;
>
>     /* We can not propagate anything useful about externally visible functions
> @@ -432,8 +437,8 @@ ipa_propagate_frequency (struct cgraph_n
>     if (dump_file && (dump_flags & TDF_DETAILS))
>       fprintf (dump_file, "Processing frequency %s\n", node->name ());
>
> -  node->call_for_symbol_thunks_and_aliases (ipa_propagate_frequency_1, &d,
> -					    true);
> +  node->call_for_symbol_and_aliases (ipa_propagate_frequency_1, &d,
> +				     true);
>
>     if ((d.only_called_at_startup && !d.only_called_at_exit)
>         && !node->only_called_at_startup)
> @@ -597,6 +602,9 @@ ipa_profile (void)
>       {
>         bool update = false;
>
> +      if (!opt_for_fn (n->decl, flag_ipa_profile))
> +	continue;
> +
>         for (e = n->indirect_calls; e; e = e->next_callee)
>   	{
>   	  if (n->count)
> @@ -697,7 +705,9 @@ ipa_profile (void)
>     order_pos = ipa_reverse_postorder (order);
>     for (i = order_pos - 1; i >= 0; i--)
>       {
> -      if (order[i]->local.local && ipa_propagate_frequency (order[i]))
> +      if (order[i]->local.local
> +	  && opt_for_fn (order[i]->decl, flag_ipa_profile)
> +	  && ipa_propagate_frequency (order[i]))
>   	{
>   	  for (e = order[i]->callees; e; e = e->next_callee)
>   	    if (e->callee->local.local && !e->callee->aux)
> @@ -714,7 +724,9 @@ ipa_profile (void)
>         something_changed = false;
>         for (i = order_pos - 1; i >= 0; i--)
>   	{
> -	  if (order[i]->aux && ipa_propagate_frequency (order[i]))
> +	  if (order[i]->aux
> +	      && opt_for_fn (order[i]->decl, flag_ipa_profile)
> +	      && ipa_propagate_frequency (order[i]))
>   	    {
>   	      for (e = order[i]->callees; e; e = e->next_callee)
>   		if (e->callee->local.local && !e->callee->aux)
>

Hi.

Here are the perf report and the -ftime-report output for the WPA phase.

Martin
Execution times (seconds)
 phase setup             :   0.00 ( 0%) usr   0.00 ( 0%) sys   0.00 ( 0%) wall    1977 kB ( 0%) ggc
 phase opt and generate  : 171.18 (65%) usr   2.29 (47%) sys 173.40 (65%) wall 2682609 kB (13%) ggc
 phase stream in         :  92.09 (35%) usr   2.55 (53%) sys  94.61 (35%) wall18738048 kB (87%) ggc
 callgraph optimization  :   0.72 ( 0%) usr   0.00 ( 0%) sys   0.73 ( 0%) wall      16 kB ( 0%) ggc
 ipa dead code removal   :   5.12 ( 2%) usr   0.05 ( 1%) sys   5.07 ( 2%) wall       0 kB ( 0%) ggc
 ipa virtual call target :   2.93 ( 1%) usr   0.03 ( 1%) sys   3.02 ( 1%) wall       0 kB ( 0%) ggc
 ipa devirtualization    :   0.26 ( 0%) usr   0.01 ( 0%) sys   0.34 ( 0%) wall   32646 kB ( 0%) ggc
 ipa cp                  :   4.29 ( 2%) usr   0.48 (10%) sys   4.86 ( 2%) wall  851380 kB ( 4%) ggc
 ipa inlining heuristics : 122.37 (46%) usr   0.42 ( 9%) sys 122.72 (46%) wall  807997 kB ( 4%) ggc
 ipa comdats             :   0.53 ( 0%) usr   0.00 ( 0%) sys   0.53 ( 0%) wall       0 kB ( 0%) ggc
 ipa lto gimple in       :   5.16 ( 2%) usr   1.09 (23%) sys   6.64 ( 2%) wall 1370302 kB ( 6%) ggc
 ipa lto decl in         :  79.11 (30%) usr   1.58 (33%) sys  80.64 (30%) wall16957092 kB (79%) ggc
 ipa lto constructors in :   0.37 ( 0%) usr   0.06 ( 1%) sys   0.37 ( 0%) wall   22897 kB ( 0%) ggc
 ipa lto cgraph I/O      :   1.44 ( 1%) usr   0.24 ( 5%) sys   1.69 ( 1%) wall  901960 kB ( 4%) ggc
 ipa lto decl merge      :   3.27 ( 1%) usr   0.01 ( 0%) sys   3.26 ( 1%) wall   16383 kB ( 0%) ggc
 ipa lto cgraph merge    :   4.63 ( 2%) usr   0.04 ( 1%) sys   4.68 ( 2%) wall   20432 kB ( 0%) ggc
 whopr wpa               :   1.70 ( 1%) usr   0.00 ( 0%) sys   1.71 ( 1%) wall       2 kB ( 0%) ggc
 whopr partitioning      :   4.72 ( 2%) usr   0.02 ( 0%) sys   4.73 ( 2%) wall    7796 kB ( 0%) ggc
 ipa reference           :   2.70 ( 1%) usr   0.10 ( 2%) sys   2.80 ( 1%) wall       0 kB ( 0%) ggc
 ipa profile             :   0.53 ( 0%) usr   0.03 ( 1%) sys   0.58 ( 0%) wall       0 kB ( 0%) ggc
 ipa pure const          :   3.13 ( 1%) usr   0.09 ( 2%) sys   3.21 ( 1%) wall       0 kB ( 0%) ggc
 ipa icf                 :  16.96 ( 6%) usr   0.17 ( 4%) sys  17.06 ( 6%) wall    3087 kB ( 0%) ggc
 inline parameters       :   0.01 ( 0%) usr   0.00 ( 0%) sys   0.00 ( 0%) wall       0 kB ( 0%) ggc
 tree SSA rewrite        :   0.39 ( 0%) usr   0.05 ( 1%) sys   0.27 ( 0%) wall   51205 kB ( 0%) ggc
 tree SSA other          :   0.00 ( 0%) usr   0.00 ( 0%) sys   0.01 ( 0%) wall       0 kB ( 0%) ggc
 tree SSA incremental    :   0.50 ( 0%) usr   0.08 ( 2%) sys   0.50 ( 0%) wall   33556 kB ( 0%) ggc
 tree operand scan       :   0.45 ( 0%) usr   0.11 ( 2%) sys   0.47 ( 0%) wall  343892 kB ( 2%) ggc
 dominance frontiers     :   0.05 ( 0%) usr   0.00 ( 0%) sys   0.04 ( 0%) wall       0 kB ( 0%) ggc
 dominance computation   :   0.51 ( 0%) usr   0.08 ( 2%) sys   0.58 ( 0%) wall       0 kB ( 0%) ggc
 varconst                :   0.02 ( 0%) usr   0.06 ( 1%) sys   0.05 ( 0%) wall       0 kB ( 0%) ggc
 loop fini               :   0.12 ( 0%) usr   0.00 ( 0%) sys   0.13 ( 0%) wall       0 kB ( 0%) ggc
 unaccounted todo        :   1.19 ( 0%) usr   0.00 ( 0%) sys   1.15 ( 0%) wall       0 kB ( 0%) ggc
 TOTAL                 : 263.27             4.84           268.01           21422636 kB
[ perf record: Woken up 254 times to write data ]
[ perf record: Captured and wrote 63.481 MB perf.data (~2773530 samples) ]
marxin@marxinbox:~/Programming/chromium/src/out/Release> perf report --stdio | sed 's/\ *$//' | head -n50
# To display the perf.data header info, please use --header/--header-only options.
#
# Samples: 1M of event 'cycles'
# Event count (approx.): 945739511218
#
# Overhead   Command      Shared Object
# ........  ........  .................  ..................................................................................................................................................................................................................................................................................................
#
    19.88%  lto1-wpa  lto1               [.] nonremovable_p(cgraph_node*, void*)
     9.17%  lto1-wpa  lto1               [.] cgraph_node::used_from_object_file_p_worker(cgraph_node*, void*)
     7.93%  lto1-wpa  lto1               [.] cgraph_node::call_for_symbol_and_aliases_1(bool (*)(cgraph_node*, void*), void*, bool)
     6.37%  lto1-wpa  lto1               [.] inflate_fast
     2.23%  lto1-wpa  lto1               [.] compare_tree_sccs_1(tree_node*, tree_node*, tree_node***)
     2.14%  lto1-wpa  lto1               [.] streamer_read_uhwi(lto_input_block*)
     1.96%  lto1-wpa  lto1               [.] ht_lookup_with_hash(ht*, unsigned char const*, unsigned long, unsigned int, ht_lookup_option)
     1.83%  lto1-wpa  lto1               [.] unify_scc(streamer_tree_cache_d*, unsigned int, unsigned int, unsigned int, unsigned int)
     1.61%  lto1-wpa  lto1               [.] streamer_read_tree_bitfields(lto_input_block*, data_in*, tree_node*)
     1.23%  lto1-wpa  lto1               [.] lto_cgraph_replace_node(cgraph_node*, cgraph_node*)
     1.21%  lto1-wpa  libc-2.19.so       [.] msort_with_tmp.part.0
     1.19%  lto1-wpa  lto1               [.] streamer_get_pickled_tree(lto_input_block*, data_in*)
     1.14%  lto1-wpa  lto1               [.] symbol_table::remove_unreachable_nodes(_IO_FILE*)
     1.08%  lto1-wpa  libc-2.19.so       [.] _int_malloc
     1.02%  lto1-wpa  lto1               [.] ipa_icf::sem_variable::equals(tree_node*, tree_node*)
     0.96%  lto1-wpa  lto1               [.] lto_input_tree_1(lto_input_block*, data_in*, LTO_tags, unsigned int)
     0.84%  lto1-wpa  lto1               [.] inflate
     0.74%  lto1-wpa  lto1               [.] adler32
     0.71%  lto1-wpa  lto1               [.] lto_input_tree(lto_input_block*, data_in*)
     0.68%  lto1-wpa  lto1               [.] cgraph_node::call_for_symbol_thunks_and_aliases(bool (*)(cgraph_node*, void*), void*, bool, bool)
     0.66%  lto1-wpa  lto1               [.] streamer_read_tree_body(lto_input_block*, data_in*, tree_node*)
     0.64%  lto1-wpa  lto1               [.] estimate_calls_size_and_time(cgraph_node*, int*, int*, int*, int*, unsigned int, vec<tree_node*, va_heap, vl_ptr>, vec<ipa_polymorphic_call_context, va_heap, vl_ptr>, vec<ipa_agg_jump_function*, va_heap, vl_ptr>) [clone .isra.129]
     0.63%  lto1-wpa  lto1               [.] lto_input_location(bitpack_d*, data_in*)
Martin Liška Feb. 18, 2015, 2:13 p.m. UTC | #2
On 02/18/2015 02:58 PM, Martin Liška wrote:
> On 02/17/2015 10:03 PM, Jan Hubicka wrote:
>> Hi,
>> this patch should chase away the expensive thunks and aliases walks from most
>> of analysis code. I think only real use left is local_p predicate that needs to
>> stay because i386 expect local flag to match between caller and callee when
>> expanding assembler thunk. I at least optimized it by first moving the walk to
>> be conditional for nonlocal functions only and then reorganizing
>> call_for_symbol_thunks_and_aliases to first inspect aliases (that is cheap) and
>> only then work on thunks.  Most likely this will find the non-local thunk/alias
>> faster.  Other cases was leftovers from the conversion of thunks from aliases
>> to functions.
>>
>> I also noticed a bug in ipa-profile that does not disable all the
>> transofrms with !ipa_profile_flag used on OPTIMIZTION_NODE and fixed it.
>>
>> Bootstrapped/regtested x86_64-linux, comitted.  I would be interested to
>> know if the call_for_symbol_thunks_and_aliases is now off your oprofiles
>> (sorry, easier to type than perf-profiles)
>>
>> Honza
>>
>>     * ipa-visibility.c (function_and_variable_visibility): Only
>>     check locality if node is not already local.
>>     * ipa-inline.c (want_inline_function_to_all_callers_p): Use
>>     call_for_symbol_and_aliases instead of
>>     call_for_symbol_thunks_and_aliases.
>>     (ipa_inline): Likewise.
>>     * cgraph.c (cgraph_node::call_for_symbol_thunks_and_aliases):
>>     first walk aliases.
>>     * ipa.c (symbol_table::remove_unreachable_nodes): Use
>>     call_for_symbol_and_aliases.
>>     * ipa-profile.c (ipa_propagate_frequency_data): Add function_symbol.
>>     (ipa_propagate_frequency_1): Use it; use opt_for_fn
>>     (ipa_propagate_frequency): Update.
>>     (ipa_profile): Add opt_for_fn gueards.
>> Index: ipa-visibility.c
>> ===================================================================
>> --- ipa-visibility.c    (revision 220741)
>> +++ ipa-visibility.c    (working copy)
>> @@ -595,7 +595,8 @@ function_and_variable_visibility (bool w
>>       }
>>     FOR_EACH_DEFINED_FUNCTION (node)
>>       {
>> -      node->local.local |= node->local_p ();
>> +      if (!node->local.local)
>> +        node->local.local |= node->local_p ();
>>
>>         /* If we know that function can not be overwritten by a different semantics
>>        and moreover its section can not be discarded, replace all direct calls
>> Index: ipa-inline.c
>> ===================================================================
>> --- ipa-inline.c    (revision 220741)
>> +++ ipa-inline.c    (working copy)
>> @@ -975,14 +975,14 @@ want_inline_function_to_all_callers_p (s
>>     if (node->global.inlined_to)
>>       return false;
>>     /* Does it have callers?  */
>> -  if (!node->call_for_symbol_thunks_and_aliases (has_caller_p, NULL, true))
>> +  if (!node->call_for_symbol_and_aliases (has_caller_p, NULL, true))
>>       return false;
>>     /* Inlining into all callers would increase size?  */
>>     if (estimate_growth (node) > 0)
>>       return false;
>>     /* All inlines must be possible.  */
>> -  if (node->call_for_symbol_thunks_and_aliases (check_callers, &has_hot_call,
>> -                        true))
>> +  if (node->call_for_symbol_and_aliases (check_callers, &has_hot_call,
>> +                     true))
>>       return false;
>>     if (!cold && !has_hot_call)
>>       return false;
>> @@ -2359,9 +2359,9 @@ ipa_inline (void)
>>         if (want_inline_function_to_all_callers_p (node, cold))
>>           {
>>             int num_calls = 0;
>> -          node->call_for_symbol_thunks_and_aliases (sum_callers, &num_calls,
>> -                              true);
>> -          while (node->call_for_symbol_thunks_and_aliases
>> +          node->call_for_symbol_and_aliases (sum_callers, &num_calls,
>> +                         true);
>> +          while (node->call_for_symbol_and_aliases
>>                  (inline_to_all_callers, &num_calls, true))
>>           ;
>>             remove_functions = true;
>> Index: cgraph.c
>> ===================================================================
>> --- cgraph.c    (revision 220741)
>> +++ cgraph.c    (working copy)
>> @@ -2191,6 +2191,16 @@ cgraph_node::call_for_symbol_thunks_and_
>>
>>     if (callback (this, data))
>>       return true;
>> +  FOR_EACH_ALIAS (this, ref)
>> +    {
>> +      cgraph_node *alias = dyn_cast <cgraph_node *> (ref->referring);
>> +      if (include_overwritable
>> +      || alias->get_availability () > AVAIL_INTERPOSABLE)
>> +    if (alias->call_for_symbol_thunks_and_aliases (callback, data,
>> +                             include_overwritable,
>> +                             exclude_virtual_thunks))
>> +      return true;
>> +    }
>>     for (e = callers; e; e = e->next_caller)
>>       if (e->caller->thunk.thunk_p
>>       && (include_overwritable
>> @@ -2202,16 +2212,6 @@ cgraph_node::call_for_symbol_thunks_and_
>>                                  exclude_virtual_thunks))
>>       return true;
>>
>> -  FOR_EACH_ALIAS (this, ref)
>> -    {
>> -      cgraph_node *alias = dyn_cast <cgraph_node *> (ref->referring);
>> -      if (include_overwritable
>> -      || alias->get_availability () > AVAIL_INTERPOSABLE)
>> -    if (alias->call_for_symbol_thunks_and_aliases (callback, data,
>> -                             include_overwritable,
>> -                             exclude_virtual_thunks))
>> -      return true;
>> -    }
>>     return false;
>>   }
>>
>> Index: ipa.c
>> ===================================================================
>> --- ipa.c    (revision 220741)
>> +++ ipa.c    (working copy)
>> @@ -661,7 +661,7 @@ symbol_table::remove_unreachable_nodes (
>>       if (node->address_taken
>>       && !node->used_from_other_partition)
>>         {
>> -    if (!node->call_for_symbol_thunks_and_aliases
>> +    if (!node->call_for_symbol_and_aliases
>>           (has_addr_references_p, NULL, true)
>>           && (!node->instrumentation_clone
>>           || !node->instrumented_version
>> Index: ipa-profile.c
>> ===================================================================
>> --- ipa-profile.c    (revision 220741)
>> +++ ipa-profile.c    (working copy)
>> @@ -322,6 +322,7 @@ ipa_profile_read_summary (void)
>>
>>   struct ipa_propagate_frequency_data
>>   {
>> +  cgraph_node *function_symbol;
>>     bool maybe_unlikely_executed;
>>     bool maybe_executed_once;
>>     bool only_called_at_startup;
>> @@ -342,7 +343,7 @@ ipa_propagate_frequency_1 (struct cgraph
>>               || d->only_called_at_startup || d->only_called_at_exit);
>>          edge = edge->next_caller)
>>       {
>> -      if (edge->caller != node)
>> +      if (edge->caller != d->function_symbol)
>>       {
>>             d->only_called_at_startup &= edge->caller->only_called_at_startup;
>>         /* It makes sense to put main() together with the static constructors.
>> @@ -358,7 +359,11 @@ ipa_propagate_frequency_1 (struct cgraph
>>        errors can make us to push function into unlikely section even when
>>        it is executed by the train run.  Transfer the function only if all
>>        callers are unlikely executed.  */
>> -      if (profile_info && flag_branch_probabilities
>> +      if (profile_info
>> +      && opt_for_fn (d->function_symbol->decl, flag_branch_probabilities)
>> +      /* Thunks are not profiled.  This is more or less implementation
>> +         bug.  */
>> +      && !d->function_symbol->thunk.thunk_p
>>         && (edge->caller->frequency != NODE_FREQUENCY_UNLIKELY_EXECUTED
>>             || (edge->caller->global.inlined_to
>>             && edge->caller->global.inlined_to->frequency
>> @@ -418,7 +423,7 @@ contains_hot_call_p (struct cgraph_node
>>   bool
>>   ipa_propagate_frequency (struct cgraph_node *node)
>>   {
>> -  struct ipa_propagate_frequency_data d = {true, true, true, true};
>> +  struct ipa_propagate_frequency_data d = {node, true, true, true, true};
>>     bool changed = false;
>>
>>     /* We can not propagate anything useful about externally visible functions
>> @@ -432,8 +437,8 @@ ipa_propagate_frequency (struct cgraph_n
>>     if (dump_file && (dump_flags & TDF_DETAILS))
>>       fprintf (dump_file, "Processing frequency %s\n", node->name ());
>>
>> -  node->call_for_symbol_thunks_and_aliases (ipa_propagate_frequency_1, &d,
>> -                        true);
>> +  node->call_for_symbol_and_aliases (ipa_propagate_frequency_1, &d,
>> +                     true);
>>
>>     if ((d.only_called_at_startup && !d.only_called_at_exit)
>>         && !node->only_called_at_startup)
>> @@ -597,6 +602,9 @@ ipa_profile (void)
>>       {
>>         bool update = false;
>>
>> +      if (!opt_for_fn (n->decl, flag_ipa_profile))
>> +    continue;
>> +
>>         for (e = n->indirect_calls; e; e = e->next_callee)
>>       {
>>         if (n->count)
>> @@ -697,7 +705,9 @@ ipa_profile (void)
>>     order_pos = ipa_reverse_postorder (order);
>>     for (i = order_pos - 1; i >= 0; i--)
>>       {
>> -      if (order[i]->local.local && ipa_propagate_frequency (order[i]))
>> +      if (order[i]->local.local
>> +      && opt_for_fn (order[i]->decl, flag_ipa_profile)
>> +      && ipa_propagate_frequency (order[i]))
>>       {
>>         for (e = order[i]->callees; e; e = e->next_callee)
>>           if (e->callee->local.local && !e->callee->aux)
>> @@ -714,7 +724,9 @@ ipa_profile (void)
>>         something_changed = false;
>>         for (i = order_pos - 1; i >= 0; i--)
>>       {
>> -      if (order[i]->aux && ipa_propagate_frequency (order[i]))
>> +      if (order[i]->aux
>> +          && opt_for_fn (order[i]->decl, flag_ipa_profile)
>> +          && ipa_propagate_frequency (order[i]))
>>           {
>>             for (e = order[i]->callees; e; e = e->next_callee)
>>           if (e->callee->local.local && !e->callee->aux)
>>
>
> Hi.
>
> There's perf report and -ftime report of WPA phase.
>
> Martin

Hm, using the same compiler, the Firefox LTO time statistics and perf report are very different.
I'm wondering how that can be possible?

Martin
Execution times (seconds)
 phase setup             :   0.00 ( 0%) usr   0.00 ( 0%) sys   0.01 ( 0%) wall    1988 kB ( 0%) ggc
 phase opt and generate  :  42.32 (70%) usr   0.85 (56%) sys  43.16 (69%) wall 1387464 kB (28%) ggc
 phase stream in         :  18.50 (30%) usr   0.68 (44%) sys  19.17 (31%) wall 3528077 kB (72%) ggc
 garbage collection      :   2.24 ( 4%) usr   0.00 ( 0%) sys   2.24 ( 4%) wall       0 kB ( 0%) ggc
 callgraph optimization  :   0.37 ( 1%) usr   0.00 ( 0%) sys   0.37 ( 1%) wall      38 kB ( 0%) ggc
 ipa dead code removal   :   3.06 ( 5%) usr   0.01 ( 1%) sys   2.88 ( 5%) wall       0 kB ( 0%) ggc
 ipa virtual call target :   5.72 ( 9%) usr   0.06 ( 4%) sys   5.87 ( 9%) wall       0 kB ( 0%) ggc
 ipa devirtualization    :   0.18 ( 0%) usr   0.00 ( 0%) sys   0.23 ( 0%) wall   22382 kB ( 0%) ggc
 ipa cp                  :   2.88 ( 5%) usr   0.09 ( 6%) sys   2.97 ( 5%) wall  515623 kB (10%) ggc
 ipa inlining heuristics :  13.96 (23%) usr   0.13 ( 8%) sys  14.12 (23%) wall  471848 kB (10%) ggc
 ipa comdats             :   0.12 ( 0%) usr   0.00 ( 0%) sys   0.12 ( 0%) wall       0 kB ( 0%) ggc
 ipa lto gimple in       :   2.54 ( 4%) usr   0.48 (31%) sys   3.23 ( 5%) wall  645652 kB (13%) ggc
 ipa lto decl in         :  12.64 (21%) usr   0.37 (24%) sys  13.01 (21%) wall 2592737 kB (53%) ggc
 ipa lto constructors in :   0.17 ( 0%) usr   0.01 ( 1%) sys   0.20 ( 0%) wall   16493 kB ( 0%) ggc
 ipa lto cgraph I/O      :   0.58 ( 1%) usr   0.09 ( 6%) sys   0.67 ( 1%) wall  437504 kB ( 9%) ggc
 ipa lto decl merge      :   1.90 ( 3%) usr   0.00 ( 0%) sys   1.90 ( 3%) wall    8191 kB ( 0%) ggc
 ipa lto cgraph merge    :   1.30 ( 2%) usr   0.00 ( 0%) sys   1.29 ( 2%) wall   14989 kB ( 0%) ggc
 whopr wpa               :   0.91 ( 1%) usr   0.00 ( 0%) sys   0.88 ( 1%) wall       2 kB ( 0%) ggc
 whopr partitioning      :   2.66 ( 4%) usr   0.00 ( 0%) sys   2.67 ( 4%) wall    6081 kB ( 0%) ggc
 ipa reference           :   1.38 ( 2%) usr   0.01 ( 1%) sys   1.40 ( 2%) wall       0 kB ( 0%) ggc
 ipa profile             :   0.21 ( 0%) usr   0.01 ( 1%) sys   0.21 ( 0%) wall       0 kB ( 0%) ggc
 ipa pure const          :   1.61 ( 3%) usr   0.01 ( 1%) sys   1.61 ( 3%) wall       0 kB ( 0%) ggc
 ipa icf                 :   4.99 ( 8%) usr   0.06 ( 4%) sys   5.00 ( 8%) wall    1120 kB ( 0%) ggc
 tree SSA rewrite        :   0.12 ( 0%) usr   0.02 ( 1%) sys   0.12 ( 0%) wall   23170 kB ( 0%) ggc
 tree SSA incremental    :   0.23 ( 0%) usr   0.05 ( 3%) sys   0.21 ( 0%) wall   14434 kB ( 0%) ggc
 tree operand scan       :   0.14 ( 0%) usr   0.03 ( 2%) sys   0.22 ( 0%) wall  145252 kB ( 3%) ggc
 dominance frontiers     :   0.04 ( 0%) usr   0.00 ( 0%) sys   0.01 ( 0%) wall       0 kB ( 0%) ggc
 dominance computation   :   0.14 ( 0%) usr   0.05 ( 3%) sys   0.11 ( 0%) wall       0 kB ( 0%) ggc
 varconst                :   0.01 ( 0%) usr   0.02 ( 1%) sys   0.03 ( 0%) wall       0 kB ( 0%) ggc
 loop fini               :   0.07 ( 0%) usr   0.00 ( 0%) sys   0.03 ( 0%) wall       0 kB ( 0%) ggc
 unaccounted todo        :   0.62 ( 1%) usr   0.00 ( 0%) sys   0.65 ( 1%) wall       0 kB ( 0%) ggc
 TOTAL                 :  60.82             1.53            62.34            4917531 kB
[ perf record: Woken up 59 times to write data ]
[ perf record: Captured and wrote 14.722 MB perf.data (~643202 samples) ]
marxin@marxinbox:~/Programming/gecko-dev/obj-x86_64-unknown-linux-gnu/toolkit/library> perf report
marxin@marxinbox:~/Programming/gecko-dev/obj-x86_64-unknown-linux-gnu/toolkit/library> gcc -v
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/home/marxin/Programming/bin/gcc2/lib/gcc/x86_64-unknown-linux-gnu/5.0.0/lto-wrapper
Target: x86_64-unknown-linux-gnu
Configured with: ../configure --enable-languages=c,c++ --disable-libsanitizer --prefix=/home/marxin/Programming/bin/gcc2 --disable-bootstrap --enable-checking=release
Thread model: posix
gcc version 5.0.0 20150218 (experimental) (GCC) 
marxin@marxinbox:~/Programming/gecko-dev/obj-x86_64-unknown-linux-gnu/toolkit/library> perf report
marxin@marxinbox:~/Programming/gecko-dev/obj-x86_64-unknown-linux-gnu/toolkit/library> perf report --stdio | sed 's/\ *$//' | head -n50 
# To display the perf.data header info, please use --header/--header-only options.
#
# Samples: 245K of event 'cycles'
# Event count (approx.): 216467422123
#
# Overhead   Command      Shared Object
# ........  ........  .................  ..................................................................................................................................................................................................................................................................................................
#
     4.97%  lto1-wpa  lto1               [.] inflate_fast
     2.78%  lto1-wpa  lto1               [.] symbol_table::remove_unreachable_nodes(_IO_FILE*)
     2.37%  lto1-wpa  libc-2.19.so       [.] _int_malloc
     1.77%  lto1-wpa  lto1               [.] record_target_from_binfo(vec<cgraph_node*, va_heap, vl_ptr>&, vec<tree_node*, va_heap, vl_ptr>*, tree_node*, tree_node*, vec<tree_node*, va_heap, vl_ptr>&, long, tree_node*, long, hash_set<tree_node*, default_hashset_traits>*, hash_set<tree_node*, default_hashset_traits>*, bool, bool*)
     1.57%  lto1-wpa  lto1               [.] ht_lookup_with_hash(ht*, unsigned char const*, unsigned long, unsigned int, ht_lookup_option)
     1.56%  lto1-wpa  lto1               [.] streamer_read_uhwi(lto_input_block*)
     1.48%  lto1-wpa  lto1               [.] estimate_calls_size_and_time(cgraph_node*, int*, int*, int*, int*, unsigned int, vec<tree_node*, va_heap, vl_ptr>, vec<ipa_polymorphic_call_context, va_heap, vl_ptr>, vec<ipa_agg_jump_function*, va_heap, vl_ptr>) [clone .isra.129]
     1.48%  lto1-wpa  lto1               [.] unify_scc(streamer_tree_cache_d*, unsigned int, unsigned int, unsigned int, unsigned int)
     1.40%  lto1-wpa  lto1               [.] lto_cgraph_replace_node(cgraph_node*, cgraph_node*)
     1.38%  lto1-wpa  lto1               [.] ggc_set_mark(void const*)
     1.30%  lto1-wpa  libc-2.19.so       [.] malloc_consolidate
     1.28%  lto1-wpa  lto1               [.] htab_hash_string
     1.25%  lto1-wpa  lto1               [.] compare_tree_sccs_1(tree_node*, tree_node*, tree_node***)
     1.23%  lto1-wpa  lto1               [.] fibonacci_heap<sreal, cgraph_edge>::consolidate()
     1.19%  lto1-wpa  lto1               [.] splay_tree_splay
     1.15%  lto1-wpa  lto1               [.] can_inline_edge_p(cgraph_edge*, bool, bool)
     1.14%  lto1-wpa  lto1               [.] cgraph_node::get_availability()
     1.14%  lto1-wpa  lto1               [.] evaluate_properties_for_edge(cgraph_edge*, bool, unsigned int*, vec<tree_node*, va_heap, vl_ptr>*, vec<ipa_polymorphic_call_context, va_heap, vl_ptr>*, vec<ipa_agg_jump_function*, va_heap, vl_ptr>*) [clone .constprop.131]
     1.13%  lto1-wpa  lto1               [.] gimple_get_virt_method_for_vtable(long, tree_node*, unsigned long, bool*)
     1.10%  lto1-wpa  lto1               [.] types_same_for_odr(tree_node const*, tree_node const*)
     1.08%  lto1-wpa  lto1               [.] gt_ggc_mx_lang_tree_node(void*)
     1.05%  lto1-wpa  lto1               [.] streamer_read_tree_bitfields(lto_input_block*, data_in*, tree_node*)
     0.99%  lto1-wpa  lto1               [.] type_in_anonymous_namespace_p(tree_node const*)
     0.99%  lto1-wpa  lto1               [.] gimple_has_body_p(tree_node*)
     0.95%  lto1-wpa  lto1               [.] decl_assembler_name(tree_node*)
     0.93%  lto1-wpa  lto1               [.] do_per_function(void (*)(function*, void*), void*)
     0.82%  lto1-wpa  libc-2.19.so       [.] _int_free
     0.81%  lto1-wpa  lto1               [.] possible_polymorphic_call_targets_1(vec<cgraph_node*, va_heap, vl_ptr>&, hash_set<tree_node*, default_hashset_traits>*, hash_set<tree_node*, default_hashset_traits>*, tree_node*, odr_type_d*, long, tree_node*, long, bool*, vec<tree_node*, va_heap, vl_ptr>&, bool)
     0.81%  lto1-wpa  lto1               [.] searchc(searchc_env*, cgraph_node*, bool (*)(cgraph_edge*))
     0.80%  lto1-wpa  lto1               [.] streamer_get_pickled_tree(lto_input_block*, data_in*)
     0.78%  lto1-wpa  lto1               [.] edge_badness(cgraph_edge*, bool)
     0.77%  lto1-wpa  lto1               [.] hash_table<asmname_hasher, xcallocator, true>::find_slot_with_hash(tree_node const* const&, unsigned int, insert_option)
     0.77%  lto1-wpa  lto1               [.] update_callee_keys(fibonacci_heap<sreal, cgraph_edge>*, cgraph_node*, bitmap_head*)
     0.76%  lto1-wpa  lto1               [.] ggc_internal_alloc(unsigned long, void (*)(void*), unsigned long, unsigned long)
     0.75%  lto1-wpa  lto1               [.] fibonacci_heap<sreal, cgraph_edge>::extract_minimum_node()
     0.75%  lto1-wpa  lto1               [.] execute_one_pass(opt_pass*)
     0.74%  lto1-wpa  lto1               [.] inflate
     0.71%  lto1-wpa  lto1               [.] contains_polymorphic_type_p(tree_node const*)
     0.67%  lto1-wpa  lto1               [.] get_binfo_at_offset(tree_node*, long, tree_node*)
     0.64%  lto1-wpa  lto1               [.] symbol_table::decl_assembler_name_equal(tree_node*, tree_node const*)
     0.61%  lto1-wpa  lto1               [.] lto_balanced_map(int)
     0.61%  lto1-wpa  lto1               [.] ipa_icf::sem_item_optimizer::do_congruence_step_for_index(ipa_icf::congruence_class*, unsigned int)
diff mbox

Patch

Index: ipa-visibility.c
===================================================================
--- ipa-visibility.c	(revision 220741)
+++ ipa-visibility.c	(working copy)
@@ -595,7 +595,8 @@  function_and_variable_visibility (bool w
     }
   FOR_EACH_DEFINED_FUNCTION (node)
     {
-      node->local.local |= node->local_p ();
+      if (!node->local.local)
+        node->local.local |= node->local_p ();
 
       /* If we know that function can not be overwritten by a different semantics
 	 and moreover its section can not be discarded, replace all direct calls
Index: ipa-inline.c
===================================================================
--- ipa-inline.c	(revision 220741)
+++ ipa-inline.c	(working copy)
@@ -975,14 +975,14 @@  want_inline_function_to_all_callers_p (s
   if (node->global.inlined_to)
     return false;
   /* Does it have callers?  */
-  if (!node->call_for_symbol_thunks_and_aliases (has_caller_p, NULL, true))
+  if (!node->call_for_symbol_and_aliases (has_caller_p, NULL, true))
     return false;
   /* Inlining into all callers would increase size?  */
   if (estimate_growth (node) > 0)
     return false;
   /* All inlines must be possible.  */
-  if (node->call_for_symbol_thunks_and_aliases (check_callers, &has_hot_call,
-						true))
+  if (node->call_for_symbol_and_aliases (check_callers, &has_hot_call,
+					 true))
     return false;
   if (!cold && !has_hot_call)
     return false;
@@ -2359,9 +2359,9 @@  ipa_inline (void)
 	  if (want_inline_function_to_all_callers_p (node, cold))
 	    {
 	      int num_calls = 0;
-	      node->call_for_symbol_thunks_and_aliases (sum_callers, &num_calls,
-						      true);
-	      while (node->call_for_symbol_thunks_and_aliases
+	      node->call_for_symbol_and_aliases (sum_callers, &num_calls,
+						 true);
+	      while (node->call_for_symbol_and_aliases
 		       (inline_to_all_callers, &num_calls, true))
 		;
 	      remove_functions = true;
Index: cgraph.c
===================================================================
--- cgraph.c	(revision 220741)
+++ cgraph.c	(working copy)
@@ -2191,6 +2191,16 @@  cgraph_node::call_for_symbol_thunks_and_
 
   if (callback (this, data))
     return true;
+  FOR_EACH_ALIAS (this, ref)
+    {
+      cgraph_node *alias = dyn_cast <cgraph_node *> (ref->referring);
+      if (include_overwritable
+	  || alias->get_availability () > AVAIL_INTERPOSABLE)
+	if (alias->call_for_symbol_thunks_and_aliases (callback, data,
+						     include_overwritable,
+						     exclude_virtual_thunks))
+	  return true;
+    }
   for (e = callers; e; e = e->next_caller)
     if (e->caller->thunk.thunk_p
 	&& (include_overwritable
@@ -2202,16 +2212,6 @@  cgraph_node::call_for_symbol_thunks_and_
 						       exclude_virtual_thunks))
 	return true;
 
-  FOR_EACH_ALIAS (this, ref)
-    {
-      cgraph_node *alias = dyn_cast <cgraph_node *> (ref->referring);
-      if (include_overwritable
-	  || alias->get_availability () > AVAIL_INTERPOSABLE)
-	if (alias->call_for_symbol_thunks_and_aliases (callback, data,
-						     include_overwritable,
-						     exclude_virtual_thunks))
-	  return true;
-    }
   return false;
 }
 
Index: ipa.c
===================================================================
--- ipa.c	(revision 220741)
+++ ipa.c	(working copy)
@@ -661,7 +661,7 @@  symbol_table::remove_unreachable_nodes (
     if (node->address_taken
 	&& !node->used_from_other_partition)
       {
-	if (!node->call_for_symbol_thunks_and_aliases
+	if (!node->call_for_symbol_and_aliases
 	    (has_addr_references_p, NULL, true)
 	    && (!node->instrumentation_clone
 		|| !node->instrumented_version
Index: ipa-profile.c
===================================================================
--- ipa-profile.c	(revision 220741)
+++ ipa-profile.c	(working copy)
@@ -322,6 +322,7 @@  ipa_profile_read_summary (void)
 
 struct ipa_propagate_frequency_data
 {
+  cgraph_node *function_symbol;
   bool maybe_unlikely_executed;
   bool maybe_executed_once;
   bool only_called_at_startup;
@@ -342,7 +343,7 @@  ipa_propagate_frequency_1 (struct cgraph
 	        || d->only_called_at_startup || d->only_called_at_exit);
        edge = edge->next_caller)
     {
-      if (edge->caller != node)
+      if (edge->caller != d->function_symbol)
 	{
           d->only_called_at_startup &= edge->caller->only_called_at_startup;
 	  /* It makes sense to put main() together with the static constructors.
@@ -358,7 +359,11 @@  ipa_propagate_frequency_1 (struct cgraph
 	 errors can make us to push function into unlikely section even when
 	 it is executed by the train run.  Transfer the function only if all
 	 callers are unlikely executed.  */
-      if (profile_info && flag_branch_probabilities
+      if (profile_info
+	  && opt_for_fn (d->function_symbol->decl, flag_branch_probabilities)
+	  /* Thunks are not profiled.  This is more or less implementation
+	     bug.  */
+	  && !d->function_symbol->thunk.thunk_p
 	  && (edge->caller->frequency != NODE_FREQUENCY_UNLIKELY_EXECUTED
 	      || (edge->caller->global.inlined_to
 		  && edge->caller->global.inlined_to->frequency
@@ -418,7 +423,7 @@  contains_hot_call_p (struct cgraph_node
 bool
 ipa_propagate_frequency (struct cgraph_node *node)
 {
-  struct ipa_propagate_frequency_data d = {true, true, true, true};
+  struct ipa_propagate_frequency_data d = {node, true, true, true, true};
   bool changed = false;
 
   /* We can not propagate anything useful about externally visible functions
@@ -432,8 +437,8 @@  ipa_propagate_frequency (struct cgraph_n
   if (dump_file && (dump_flags & TDF_DETAILS))
     fprintf (dump_file, "Processing frequency %s\n", node->name ());
 
-  node->call_for_symbol_thunks_and_aliases (ipa_propagate_frequency_1, &d,
-					    true);
+  node->call_for_symbol_and_aliases (ipa_propagate_frequency_1, &d,
+				     true);
 
   if ((d.only_called_at_startup && !d.only_called_at_exit)
       && !node->only_called_at_startup)
@@ -597,6 +602,9 @@  ipa_profile (void)
     {
       bool update = false;
 
+      if (!opt_for_fn (n->decl, flag_ipa_profile))
+	continue;
+
       for (e = n->indirect_calls; e; e = e->next_callee)
 	{
 	  if (n->count)
@@ -697,7 +705,9 @@  ipa_profile (void)
   order_pos = ipa_reverse_postorder (order);
   for (i = order_pos - 1; i >= 0; i--)
     {
-      if (order[i]->local.local && ipa_propagate_frequency (order[i]))
+      if (order[i]->local.local
+	  && opt_for_fn (order[i]->decl, flag_ipa_profile)
+	  && ipa_propagate_frequency (order[i]))
 	{
 	  for (e = order[i]->callees; e; e = e->next_callee)
 	    if (e->callee->local.local && !e->callee->aux)
@@ -714,7 +724,9 @@  ipa_profile (void)
       something_changed = false;
       for (i = order_pos - 1; i >= 0; i--)
 	{
-	  if (order[i]->aux && ipa_propagate_frequency (order[i]))
+	  if (order[i]->aux
+	      && opt_for_fn (order[i]->decl, flag_ipa_profile)
+	      && ipa_propagate_frequency (order[i]))
 	    {
 	      for (e = order[i]->callees; e; e = e->next_callee)
 		if (e->callee->local.local && !e->callee->aux)