diff mbox series

Remove overall growth from badness metrics

Message ID 20190106235208.tqx4jqylljlbk3gk@kam.mff.cuni.cz
State New
Headers show
Series Remove overall growth from badness metrics | expand

Commit Message

Jan Hubicka Jan. 6, 2019, 11:52 p.m. UTC
Hi,
this patch removes overall growth from the badness metrics.  This code was
added at a time when the inliner was purely function-size based, to give a
hint that inlining more different functions into all callers is better than
inlining one called very many times.

With profiles we now have more fine-grained information, and in all
tests I did this heuristic seems to be counter-productive now and
harmful, especially on large units where the growth estimate gets out of
date.

I plan to commit the patch tomorrow after re-testing everything after
the bugfixes from today and yesterday.  In addition to this I have found
that the current inline-unit-growth is too small for LTO of large programs
(especially Firefox:) and there are important improvements when it is
increased from 20 to 30 or 40.  I am re-running C++ benchmarks and other
tests to decide about the precise setting.  Finally I plan to increase
the new parameters for a bit more inlining at -O2 and -Os.

Bootstrapped/regtested x86_64-linux, will commit it tomorrow.

	* ipa-inline.c (edge_badness): Do not account overall_growth into
	badness metrics.

Comments

Qing Zhao Jan. 7, 2019, 5:37 p.m. UTC | #1
> On Jan 6, 2019, at 5:52 PM, Jan Hubicka <hubicka@ucw.cz> wrote:
> 
> Hi,
> this patch removes overall growth from badness metrics.  This code was
> added at a time inliner was purely function size based to give a hint
> that inlining more different functions into all callers is better than
> inlining one called very many times.
> 
> With profile we now have more fine grained information and in all
> tests I did this heuristics seems to be counter-productive now and
> harmful especially on large units where growth estimate gets out of
> date.
> 
> I plan to commit the patch tomorrow after re-testing everything after
> the bugfixes from today and yesterday.  In addition to this have found
> that current inline-unit-growth is too small for LTO of large programs
> (especially Firefox:) and there are important improvements when
> increased from 20 to 30 or 40.  I am re-running C++ benchmarks and other
> tests to decide about precise setting.  Finally I plan to increase
> the new parameters for bit more inlining at -O2 and -Os.

Usually increasing these parameters might increase the compilation time and the 
final code size, do you have any data for compilation time and code size impact from
these parameter change?

thanks.

Qing
> 
> Bootstrapped/regtested x86_64-linux, will commit it tomorrow.
> 
> 	* ipa-inline.c (edge_badness): Do not account overall_growth into
> 	badness metrics.
> Index: ipa-inline.c
> ===================================================================
> --- ipa-inline.c	(revision 267612)
> +++ ipa-inline.c	(working copy)
> @@ -1082,8 +1082,8 @@ edge_badness (struct cgraph_edge *edge,
>   /* When profile is available. Compute badness as:
> 
>                  time_saved * caller_count
> -     goodness =  -------------------------------------------------
> -	         growth_of_caller * overall_growth * combined_size
> +     goodness =  --------------------------------
> +	         growth_of_caller * combined_size
> 
>      badness = - goodness
> 
> @@ -1094,7 +1094,6 @@ edge_badness (struct cgraph_edge *edge,
> 	   || caller->count.ipa ().nonzero_p ())
>     {
>       sreal numerator, denominator;
> -      int overall_growth;
>       sreal inlined_time = compute_inlined_call_time (edge, edge_time);
> 
>       numerator = (compute_uninlined_call_time (edge, unspec_edge_time)
> @@ -1106,73 +1105,6 @@ edge_badness (struct cgraph_edge *edge,
>       else if (caller->count.ipa ().initialized_p ())
> 	numerator = numerator >> 11;
>       denominator = growth;
> -
> -      overall_growth = callee_info->growth;
> -
> -      /* Look for inliner wrappers of the form:
> -
> -	 inline_caller ()
> -	   {
> -	     do_fast_job...
> -	     if (need_more_work)
> -	       noninline_callee ();
> -	   }
> -	 Withhout panilizing this case, we usually inline noninline_callee
> -	 into the inline_caller because overall_growth is small preventing
> -	 further inlining of inline_caller.
> -
> -	 Penalize only callgraph edges to functions with small overall
> -	 growth ...
> -	*/
> -      if (growth > overall_growth
> -	  /* ... and having only one caller which is not inlined ... */
> -	  && callee_info->single_caller
> -	  && !edge->caller->global.inlined_to
> -	  /* ... and edges executed only conditionally ... */
> -	  && edge->sreal_frequency () < 1
> -	  /* ... consider case where callee is not inline but caller is ... */
> -	  && ((!DECL_DECLARED_INLINE_P (edge->callee->decl)
> -	       && DECL_DECLARED_INLINE_P (caller->decl))
> -	      /* ... or when early optimizers decided to split and edge
> -		 frequency still indicates splitting is a win ... */
> -	      || (callee->split_part && !caller->split_part
> -		  && edge->sreal_frequency () * 100
> -		     < PARAM_VALUE
> -			  (PARAM_PARTIAL_INLINING_ENTRY_PROBABILITY)
> -		  /* ... and do not overwrite user specified hints.   */
> -		  && (!DECL_DECLARED_INLINE_P (edge->callee->decl)
> -		      || DECL_DECLARED_INLINE_P (caller->decl)))))
> -	{
> -	  ipa_fn_summary *caller_info = ipa_fn_summaries->get (caller);
> -	  int caller_growth = caller_info->growth;
> -
> -	  /* Only apply the penalty when caller looks like inline candidate,
> -	     and it is not called once and.  */
> -	  if (!caller_info->single_caller && overall_growth < caller_growth
> -	      && caller_info->inlinable
> -	      && caller_info->size
> -		 < (DECL_DECLARED_INLINE_P (caller->decl)
> -		    ? MAX_INLINE_INSNS_SINGLE : MAX_INLINE_INSNS_AUTO))
> -	    {
> -	      if (dump)
> -		fprintf (dump_file,
> -			 "     Wrapper penalty. Increasing growth %i to %i\n",
> -			 overall_growth, caller_growth);
> -	      overall_growth = caller_growth;
> -	    }
> -	}
> -      if (overall_growth > 0)
> -        {
> -	  /* Strongly preffer functions with few callers that can be inlined
> -	     fully.  The square root here leads to smaller binaries at average.
> -	     Watch however for extreme cases and return to linear function
> -	     when growth is large.  */
> -	  if (overall_growth < 256)
> -	    overall_growth *= overall_growth;
> -	  else
> -	    overall_growth += 256 * 256 - 256;
> -	  denominator *= overall_growth;
> -        }
>       denominator *= ipa_fn_summaries->get (caller)->self_size + growth;
> 
>       badness = - numerator / denominator;
> @@ -1182,18 +1114,14 @@ edge_badness (struct cgraph_edge *edge,
> 	  fprintf (dump_file,
> 		   "      %f: guessed profile. frequency %f, count %" PRId64
> 		   " caller count %" PRId64
> -		   " time w/o inlining %f, time with inlining %f"
> -		   " overall growth %i (current) %i (original)"
> -		   " %i (compensated)\n",
> +		   " time w/o inlining %f, time with inlining %f\n",
> 		   badness.to_double (),
> 		   edge->sreal_frequency ().to_double (),
> 		   edge->count.ipa ().initialized_p () ? edge->count.ipa ().to_gcov_type () : -1,
> 		   caller->count.ipa ().initialized_p () ? caller->count.ipa ().to_gcov_type () : -1,
> 		   compute_uninlined_call_time (edge,
> 						unspec_edge_time).to_double (),
> -		   inlined_time.to_double (),
> -		   estimate_growth (callee),
> -		   callee_info->growth, overall_growth);
> +		   inlined_time.to_double ());
> 	}
>     }
>   /* When function local profile is not available or it does not give
Jan Hubicka Jan. 8, 2019, 11:46 a.m. UTC | #2
> > I plan to commit the patch tomorrow after re-testing everything after
> > the bugfixes from today and yesterday.  In addition to this have found
> > that current inline-unit-growth is too small for LTO of large programs
> > (especially Firefox:) and there are important improvements when
> > increased from 20 to 30 or 40.  I am re-running C++ benchmarks and other
> > tests to decide about precise setting.  Finally I plan to increase
> > the new parameters for bit more inlining at -O2 and -Os.
> 
> Usually increasing these parameters might increase the compilation time and the 
> final code size, do you have any data for compilation time and code size impact from
> these parameter change?

Yes, currently LNT is down because some machines apparently ran out of
disk space after Christmas, so I cannot show you data on that, but I
can show Firefox.  I will make a summary of LNT too once it restarts.

In general this parameter affects primarily -O3 builds, because -O2
hardly hits the limit. With -O3 only programs with very large units are
affected (-O2 units hit the limit only if you have a lot of inline
hints in the code).

In my test bed this included Firefox with or without LTO, because they do
a "poor man's" LTO by #including multiple .cpp files into a single unified
source, which makes the average units large.  Also tramp3d and DLV from our
C++ benchmark are affected.

I have some data on Firefox and I will build the remaining ones:

 growth		LTO+PGO	   PGO	     LTO        none      -finline-functions
 20 (default)   83752215   94390023  93085455  103437191  94351191
 40             85299111   97220935  101600151 108910311  115311719
 clang          111520431            114863807 108437807

Build times are within noise of my setup, but they are less pronounced
than the code size difference. I think at most 1 minute out of 100.
Note that Firefox consists of 6% Rust code that is not built by GCC and
and building that consumes over half of the build time.

The problem I am trying to solve here is to get consistent LTO
performance improvements compared to non-LTO. Currently there are
some regressions:
 https://treeherder.mozilla.org/perf.html#/compare?originalProject=try&originalRevision=b6ba1ebfe913d152989495d8cb450bce02f27d44&newProject=try&newRevision=c7bd18804e328ed490eab707072b3cf59da91042&framework=1&showOnlyComparable=1&showOnlyImportant=1
All those regressions go away with the limit increase.

I tracked them down to the fact that we no longer inline some very small
functions (such as IsHTMLWhitespace).  In the GCC 5 timeframe I
tuned this parameter to 20% based on Firefox LTO benchmarks, but I was
not that serious about performance since my setup was not giving very
reproducible results for sub-5% differences on tp5o. Since we plan to
enable LTO by default for Tumbleweed, I need to find something that does
not cause too many regressions while keeping the code size advantage of
non-LTO.

Honza
> 
> thanks.
> 
> Qing
> > 
> > Bootstrapped/regtested x86_64-linux, will commit it tomorrow.
> > 
> > 	* ipa-inline.c (edge_badness): Do not account overall_growth into
> > 	badness metrics.
> > Index: ipa-inline.c
> > ===================================================================
> > --- ipa-inline.c	(revision 267612)
> > +++ ipa-inline.c	(working copy)
> > @@ -1082,8 +1082,8 @@ edge_badness (struct cgraph_edge *edge,
> >   /* When profile is available. Compute badness as:
> > 
> >                  time_saved * caller_count
> > -     goodness =  -------------------------------------------------
> > -	         growth_of_caller * overall_growth * combined_size
> > +     goodness =  --------------------------------
> > +	         growth_of_caller * combined_size
> > 
> >      badness = - goodness
> > 
> > @@ -1094,7 +1094,6 @@ edge_badness (struct cgraph_edge *edge,
> > 	   || caller->count.ipa ().nonzero_p ())
> >     {
> >       sreal numerator, denominator;
> > -      int overall_growth;
> >       sreal inlined_time = compute_inlined_call_time (edge, edge_time);
> > 
> >       numerator = (compute_uninlined_call_time (edge, unspec_edge_time)
> > @@ -1106,73 +1105,6 @@ edge_badness (struct cgraph_edge *edge,
> >       else if (caller->count.ipa ().initialized_p ())
> > 	numerator = numerator >> 11;
> >       denominator = growth;
> > -
> > -      overall_growth = callee_info->growth;
> > -
> > -      /* Look for inliner wrappers of the form:
> > -
> > -	 inline_caller ()
> > -	   {
> > -	     do_fast_job...
> > -	     if (need_more_work)
> > -	       noninline_callee ();
> > -	   }
> > -	 Withhout panilizing this case, we usually inline noninline_callee
> > -	 into the inline_caller because overall_growth is small preventing
> > -	 further inlining of inline_caller.
> > -
> > -	 Penalize only callgraph edges to functions with small overall
> > -	 growth ...
> > -	*/
> > -      if (growth > overall_growth
> > -	  /* ... and having only one caller which is not inlined ... */
> > -	  && callee_info->single_caller
> > -	  && !edge->caller->global.inlined_to
> > -	  /* ... and edges executed only conditionally ... */
> > -	  && edge->sreal_frequency () < 1
> > -	  /* ... consider case where callee is not inline but caller is ... */
> > -	  && ((!DECL_DECLARED_INLINE_P (edge->callee->decl)
> > -	       && DECL_DECLARED_INLINE_P (caller->decl))
> > -	      /* ... or when early optimizers decided to split and edge
> > -		 frequency still indicates splitting is a win ... */
> > -	      || (callee->split_part && !caller->split_part
> > -		  && edge->sreal_frequency () * 100
> > -		     < PARAM_VALUE
> > -			  (PARAM_PARTIAL_INLINING_ENTRY_PROBABILITY)
> > -		  /* ... and do not overwrite user specified hints.   */
> > -		  && (!DECL_DECLARED_INLINE_P (edge->callee->decl)
> > -		      || DECL_DECLARED_INLINE_P (caller->decl)))))
> > -	{
> > -	  ipa_fn_summary *caller_info = ipa_fn_summaries->get (caller);
> > -	  int caller_growth = caller_info->growth;
> > -
> > -	  /* Only apply the penalty when caller looks like inline candidate,
> > -	     and it is not called once and.  */
> > -	  if (!caller_info->single_caller && overall_growth < caller_growth
> > -	      && caller_info->inlinable
> > -	      && caller_info->size
> > -		 < (DECL_DECLARED_INLINE_P (caller->decl)
> > -		    ? MAX_INLINE_INSNS_SINGLE : MAX_INLINE_INSNS_AUTO))
> > -	    {
> > -	      if (dump)
> > -		fprintf (dump_file,
> > -			 "     Wrapper penalty. Increasing growth %i to %i\n",
> > -			 overall_growth, caller_growth);
> > -	      overall_growth = caller_growth;
> > -	    }
> > -	}
> > -      if (overall_growth > 0)
> > -        {
> > -	  /* Strongly preffer functions with few callers that can be inlined
> > -	     fully.  The square root here leads to smaller binaries at average.
> > -	     Watch however for extreme cases and return to linear function
> > -	     when growth is large.  */
> > -	  if (overall_growth < 256)
> > -	    overall_growth *= overall_growth;
> > -	  else
> > -	    overall_growth += 256 * 256 - 256;
> > -	  denominator *= overall_growth;
> > -        }
> >       denominator *= ipa_fn_summaries->get (caller)->self_size + growth;
> > 
> >       badness = - numerator / denominator;
> > @@ -1182,18 +1114,14 @@ edge_badness (struct cgraph_edge *edge,
> > 	  fprintf (dump_file,
> > 		   "      %f: guessed profile. frequency %f, count %" PRId64
> > 		   " caller count %" PRId64
> > -		   " time w/o inlining %f, time with inlining %f"
> > -		   " overall growth %i (current) %i (original)"
> > -		   " %i (compensated)\n",
> > +		   " time w/o inlining %f, time with inlining %f\n",
> > 		   badness.to_double (),
> > 		   edge->sreal_frequency ().to_double (),
> > 		   edge->count.ipa ().initialized_p () ? edge->count.ipa ().to_gcov_type () : -1,
> > 		   caller->count.ipa ().initialized_p () ? caller->count.ipa ().to_gcov_type () : -1,
> > 		   compute_uninlined_call_time (edge,
> > 						unspec_edge_time).to_double (),
> > -		   inlined_time.to_double (),
> > -		   estimate_growth (callee),
> > -		   callee_info->growth, overall_growth);
> > +		   inlined_time.to_double ());
> > 	}
> >     }
> >   /* When function local profile is not available or it does not give
>
Qing Zhao Jan. 8, 2019, 5:26 p.m. UTC | #3
> On Jan 8, 2019, at 5:46 AM, Jan Hubicka <hubicka@ucw.cz> wrote:
> 
>>> I plan to commit the patch tomorrow after re-testing everything after
>>> the bugfixes from today and yesterday.  In addition to this have found
>>> that current inline-unit-growth is too small for LTO of large programs
>>> (especially Firefox:) and there are important improvements when
>>> increased from 20 to 30 or 40.  I am re-running C++ benchmarks and other
>>> tests to decide about precise setting.  Finally I plan to increase
>>> the new parameters for bit more inlining at -O2 and -Os.
>> 
>> Usually increasing these parameters might increase the compilation time and the 
>> final code size, do you have any data for compilation time and code size impact from
>> these parameter change?
> 
> Yes, currently LNT is down because some machines apparently ran out of
> disk space after christmas, so I can not show you data on that, but I
> can show Firefox.  Will make summary of LNT too once it restarts.
Okay, thanks.
> 
> In general this parameter affects primarily -O3 builds, becuase -O2
> hardly hits the limit. From -O3 only programs with very large units are
> affected (-O2 units hits the limit only if you do have a lot of inline
> hints in the code).
I don’t quite understand here: what’s the major difference for inlining between -O3 and -O2?
(I see -finline-functions is enabled for both -O3 and -O2.)

> 
> In my test bed this included Firefox with or without LTO becuase they do
> "poor man's" LTO by #including multiple .cpp files into single unified
> source which makes average units large.  Also tramp3d, DLV from our C++
> benhcmark is affected. 
> 
> I have some data on Firefox and I will build remainin ones:
In the following, are the data for code size? Is the optimization level -O3?
What does PGO mean?
> 
> growth		LTO+PGO	   PGO	     LTO        none      -finline-functions
> 20 (default)   83752215   94390023  93085455  103437191  94351191
> 40             85299111   97220935  101600151 108910311  115311719
> clang          111520431            114863807 108437807
> 
> Build times are within noise of my setup, but they are less pronounced
> than the code size difference. I think at most 1 minute out of 100.
> Note that Firefox consists of 6% Rust code that is not built by GCC and
> and building that consumes over half of the build time.
> 
> Problem I am trying to solve here are is to get consistent LTO
> performance improvements compared to non-LTO. Currently there are
> some regressions:
> https://treeherder.mozilla.org/perf.html#/compare?originalProject=try&originalRevision=b6ba1ebfe913d152989495d8cb450bce02f27d44&newProject=try&newRevision=c7bd18804e328ed490eab707072b3cf59da91042&framework=1&showOnlyComparable=1&showOnlyImportant=1
> All those regressions goes away with limit increase.


> I tracked them down to the fact that we do not inline some very small
> functions already (such as IsHTMLWhitespace .  In GCC 5 timeframe I
> tuned this parameter to 20% based on Firefox LTO benchmarks but I was
> not that serious about performance since my setup was not giving very
> reproducible results for sub 5% differences on tp5o. Since we plan to
> enable LTO by default for Tumbleweed I need to find something that does
> not cause too many regression while keeping code size advantage of
> non-LTO.

From my understanding, the performance regression of LTO relative to non-LTO is caused
by the fact that some small and important functions can no longer be inlined with LTO, because more
functions are eligible to be inlined for LTO, and therefore the original value of inline-unit-growth becomes relatively smaller.

While increasing the value of inline-unit-growth for LTO is one approach to resolve this issue, might adjusting
the sorting heuristic to give those important and smaller routines higher priority to be inlined be
another, better approach?

Qing

> 
> Honza
>> 
>> thanks.
>> 
>> Qing
>>> 
>>> Bootstrapped/regtested x86_64-linux, will commit it tomorrow.
>>> 
>>> 	* ipa-inline.c (edge_badness): Do not account overall_growth into
>>> 	badness metrics.
>>> Index: ipa-inline.c
>>> ===================================================================
>>> --- ipa-inline.c	(revision 267612)
>>> +++ ipa-inline.c	(working copy)
>>> @@ -1082,8 +1082,8 @@ edge_badness (struct cgraph_edge *edge,
>>>  /* When profile is available. Compute badness as:
>>> 
>>>                 time_saved * caller_count
>>> -     goodness =  -------------------------------------------------
>>> -	         growth_of_caller * overall_growth * combined_size
>>> +     goodness =  --------------------------------
>>> +	         growth_of_caller * combined_size
>>> 
>>>     badness = - goodness
>>> 
>>> @@ -1094,7 +1094,6 @@ edge_badness (struct cgraph_edge *edge,
>>> 	   || caller->count.ipa ().nonzero_p ())
>>>    {
>>>      sreal numerator, denominator;
>>> -      int overall_growth;
>>>      sreal inlined_time = compute_inlined_call_time (edge, edge_time);
>>> 
>>>      numerator = (compute_uninlined_call_time (edge, unspec_edge_time)
>>> @@ -1106,73 +1105,6 @@ edge_badness (struct cgraph_edge *edge,
>>>      else if (caller->count.ipa ().initialized_p ())
>>> 	numerator = numerator >> 11;
>>>      denominator = growth;
>>> -
>>> -      overall_growth = callee_info->growth;
>>> -
>>> -      /* Look for inliner wrappers of the form:
>>> -
>>> -	 inline_caller ()
>>> -	   {
>>> -	     do_fast_job...
>>> -	     if (need_more_work)
>>> -	       noninline_callee ();
>>> -	   }
>>> -	 Withhout panilizing this case, we usually inline noninline_callee
>>> -	 into the inline_caller because overall_growth is small preventing
>>> -	 further inlining of inline_caller.
>>> -
>>> -	 Penalize only callgraph edges to functions with small overall
>>> -	 growth ...
>>> -	*/
>>> -      if (growth > overall_growth
>>> -	  /* ... and having only one caller which is not inlined ... */
>>> -	  && callee_info->single_caller
>>> -	  && !edge->caller->global.inlined_to
>>> -	  /* ... and edges executed only conditionally ... */
>>> -	  && edge->sreal_frequency () < 1
>>> -	  /* ... consider case where callee is not inline but caller is ... */
>>> -	  && ((!DECL_DECLARED_INLINE_P (edge->callee->decl)
>>> -	       && DECL_DECLARED_INLINE_P (caller->decl))
>>> -	      /* ... or when early optimizers decided to split and edge
>>> -		 frequency still indicates splitting is a win ... */
>>> -	      || (callee->split_part && !caller->split_part
>>> -		  && edge->sreal_frequency () * 100
>>> -		     < PARAM_VALUE
>>> -			  (PARAM_PARTIAL_INLINING_ENTRY_PROBABILITY)
>>> -		  /* ... and do not overwrite user specified hints.   */
>>> -		  && (!DECL_DECLARED_INLINE_P (edge->callee->decl)
>>> -		      || DECL_DECLARED_INLINE_P (caller->decl)))))
>>> -	{
>>> -	  ipa_fn_summary *caller_info = ipa_fn_summaries->get (caller);
>>> -	  int caller_growth = caller_info->growth;
>>> -
>>> -	  /* Only apply the penalty when caller looks like inline candidate,
>>> -	     and it is not called once and.  */
>>> -	  if (!caller_info->single_caller && overall_growth < caller_growth
>>> -	      && caller_info->inlinable
>>> -	      && caller_info->size
>>> -		 < (DECL_DECLARED_INLINE_P (caller->decl)
>>> -		    ? MAX_INLINE_INSNS_SINGLE : MAX_INLINE_INSNS_AUTO))
>>> -	    {
>>> -	      if (dump)
>>> -		fprintf (dump_file,
>>> -			 "     Wrapper penalty. Increasing growth %i to %i\n",
>>> -			 overall_growth, caller_growth);
>>> -	      overall_growth = caller_growth;
>>> -	    }
>>> -	}
>>> -      if (overall_growth > 0)
>>> -        {
>>> -	  /* Strongly preffer functions with few callers that can be inlined
>>> -	     fully.  The square root here leads to smaller binaries at average.
>>> -	     Watch however for extreme cases and return to linear function
>>> -	     when growth is large.  */
>>> -	  if (overall_growth < 256)
>>> -	    overall_growth *= overall_growth;
>>> -	  else
>>> -	    overall_growth += 256 * 256 - 256;
>>> -	  denominator *= overall_growth;
>>> -        }
>>>      denominator *= ipa_fn_summaries->get (caller)->self_size + growth;
>>> 
>>>      badness = - numerator / denominator;
>>> @@ -1182,18 +1114,14 @@ edge_badness (struct cgraph_edge *edge,
>>> 	  fprintf (dump_file,
>>> 		   "      %f: guessed profile. frequency %f, count %" PRId64
>>> 		   " caller count %" PRId64
>>> -		   " time w/o inlining %f, time with inlining %f"
>>> -		   " overall growth %i (current) %i (original)"
>>> -		   " %i (compensated)\n",
>>> +		   " time w/o inlining %f, time with inlining %f\n",
>>> 		   badness.to_double (),
>>> 		   edge->sreal_frequency ().to_double (),
>>> 		   edge->count.ipa ().initialized_p () ? edge->count.ipa ().to_gcov_type () : -1,
>>> 		   caller->count.ipa ().initialized_p () ? caller->count.ipa ().to_gcov_type () : -1,
>>> 		   compute_uninlined_call_time (edge,
>>> 						unspec_edge_time).to_double (),
>>> -		   inlined_time.to_double (),
>>> -		   estimate_growth (callee),
>>> -		   callee_info->growth, overall_growth);
>>> +		   inlined_time.to_double ());
>>> 	}
>>>    }
>>>  /* When function local profile is not available or it does not give
>>
Jan Hubicka Jan. 8, 2019, 5:53 p.m. UTC | #4
> > 
> > In general this parameter affects primarily -O3 builds, becuase -O2
> > hardly hits the limit. From -O3 only programs with very large units are
> > affected (-O2 units hits the limit only if you do have a lot of inline
> > hints in the code).
> don’t quite understand here, what’s the major difference for inlining between -O3 and -O2? 
> (I see -finline-functions is enabled for both O3 and O2).

-O2 has -finline-small-functions where we inline only when function is
declared inline or code size is expected to shrink after the inlining.
-O3 has -finline-functions where we auto-inline a lot more.
> 
> > 
> > In my test bed this included Firefox with or without LTO becuase they do
> > "poor man's" LTO by #including multiple .cpp files into single unified
> > source which makes average units large.  Also tramp3d, DLV from our C++
> > benhcmark is affected. 
> > 
> > I have some data on Firefox and I will build remainin ones:
> in the following, are the data for code size? are the optimization level O3?
> what’s PGO mean?  

Those are sizes of libxul, which is the largest library of Firefox.
PGO is profile guided optimization.
> > 
> > growth		LTO+PGO	   PGO	     LTO        none      -finline-functions
> > 20 (default)   83752215   94390023  93085455  103437191  94351191
> > 40             85299111   97220935  101600151 108910311  115311719
> > clang          111520431            114863807 108437807
> > 
> > Build times are within noise of my setup, but they are less pronounced
> > than the code size difference. I think at most 1 minute out of 100.
> > Note that Firefox consists of 6% Rust code that is not built by GCC and
> > and building that consumes over half of the build time.
> > 
> > Problem I am trying to solve here are is to get consistent LTO
> > performance improvements compared to non-LTO. Currently there are
> > some regressions:
> > https://treeherder.mozilla.org/perf.html#/compare?originalProject=try&originalRevision=b6ba1ebfe913d152989495d8cb450bce02f27d44&newProject=try&newRevision=c7bd18804e328ed490eab707072b3cf59da91042&framework=1&showOnlyComparable=1&showOnlyImportant=1
> > All those regressions goes away with limit increase.
> 
> 
> > I tracked them down to the fact that we do not inline some very small
> > functions already (such as IsHTMLWhitespace .  In GCC 5 timeframe I
> > tuned this parameter to 20% based on Firefox LTO benchmarks but I was
> > not that serious about performance since my setup was not giving very
> > reproducible results for sub 5% differences on tp5o. Since we plan to
> > enable LTO by default for Tumbleweed I need to find something that does
> > not cause too many regression while keeping code size advantage of
> > non-LTO.
> 
> from my understanding, the performance regression from LTO to non-LTO is caused 
> by some small and important functions cannot be inlined anymore with LTO due to more functions are
> eligible to be inlined for LTO, therefore the original value for inline-unit-growth becomes relatively smaller.

Yes, with whole-program optimization most function calls are inlinable,
while in a normal non-LTO build most function calls are external.
Since there are a lot of small functions called cross-module in modern
C++ programs, this simply makes the inliner run out of the limits before
getting to some of the useful inline decisions.

I was poking at this for a while, but did not really have very good
testcases available, making it difficult to judge code size/performance
tradeoffs here.  With Firefox I can measure things better now, and
it is clear that 20% growth is just too small. It is small even with
profile feedback, where the compiler knows quite well what calls to inline,
and even more so without.
> 
> When increasing the value of inline-unit-growth for LTO is one approach to resolve this issue, adjusting
> the sorting heuristic to sort those important and smaller routines as higher priority to be inlined might be
> another and better approach? 

Yes, I have also reworked the inline metrics somewhat and spent quite
some time looking into dumps to see that it behaves reasonably.  There
were two ages-old bugs I fixed in the last two weeks, and I also added some
extra tricks like penalizing cross-module inlines some time ago. Given
the fact that even with profile feedback I am not able to sort the
priority queue well, and neither can Clang do the job, I think it is good
motivation to adjust the parameter, which I had set somewhat arbitrarily
at a time when I was not able to test it well.

Honza
Qing Zhao Jan. 8, 2019, 11:55 p.m. UTC | #5
> On Jan 8, 2019, at 11:53 AM, Jan Hubicka <hubicka@ucw.cz> wrote:
> 
>>> 
>>> In general this parameter affects primarily -O3 builds, becuase -O2
>>> hardly hits the limit. From -O3 only programs with very large units are
>>> affected (-O2 units hits the limit only if you do have a lot of inline
>>> hints in the code).
>> don’t quite understand here, what’s the major difference for inlining between -O3 and -O2? 
>> (I see -finline-functions is enabled for both O3 and O2).
> 
> -O2 has -finline-small-functions where we inline only when function is
> declared inline or code size is expected to shrink after the inlining.
> -O3 has -finline-functions where we auto-inline a lot more.

Looks like our current documentation has a bug in the below:

https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html <https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html>

-finline-functions
Consider all functions for inlining, even if they are not declared inline. The compiler heuristically decides which functions are worth integrating in this way.
If all calls to a given function are integrated, and the function is declared static, then the function is normally not output as assembler code in its own right.
Enabled at levels -O2, -O3, -Os. Also enabled by -fprofile-use and -fauto-profile.

It clearly mentions that -finline-functions is enabled at -O2, -O3, and -Os.

And I checked the gcc9 source code, opts.c:

    /* -O3 and -Os optimizations.  */
    /* Inlining of functions reducing size is a good idea with -Os
       regardless of them being declared inline.  */
    { OPT_LEVELS_3_PLUS_AND_SIZE, OPT_finline_functions, NULL, 1 },

It looks like -finline-functions is ONLY enabled at -O3 and -Os, not at -O2.
(However, I am confused about why -finline-functions should be enabled for -Os.)

>> 
>>> 
>>> In my test bed this included Firefox with or without LTO becuase they do
>>> "poor man's" LTO by #including multiple .cpp files into single unified
>>> source which makes average units large.  Also tramp3d, DLV from our C++
>>> benhcmark is affected. 
>>> 
>>> I have some data on Firefox and I will build remainin ones:
>> in the following, are the data for code size? are the optimization level O3?
>> what’s PGO mean?  
> 
> Those are sizes of libxul, which is the largest library of Firefox.
> PGO is profile guided optimization.

Okay.  I see. 

looks like for LTO,  the code size increase with profiling is much smaller than
that without profiling when growth is increased from 20% to 40%.  

for Non-LTO, the code size increase is minimal when growth is increased from 20% to 40%.

However, I do not quite understand the last column; could you please explain a little bit
about the last column (-finline-functions)?

>>> 
>>> growth		LTO+PGO	   PGO	     LTO        none      -finline-functions
>>> 20 (default)   83752215   94390023  93085455  103437191  94351191
>>> 40             85299111   97220935  101600151 108910311  115311719
>>> clang          111520431            114863807 108437807
>>> 
>>> Build times are within noise of my setup, but they are less pronounced
>>> than the code size difference. I think at most 1 minute out of 100.
>>> Note that Firefox consists of 6% Rust code that is not built by GCC and
>>> and building that consumes over half of the build time.
>>> 
>>> Problem I am trying to solve here are is to get consistent LTO
>>> performance improvements compared to non-LTO. Currently there are
>>> some regressions:
>>> https://treeherder.mozilla.org/perf.html#/compare?originalProject=try&originalRevision=b6ba1ebfe913d152989495d8cb450bce02f27d44&newProject=try&newRevision=c7bd18804e328ed490eab707072b3cf59da91042&framework=1&showOnlyComparable=1&showOnlyImportant=1
>>> All those regressions goes away with limit increase.
>> 
>> 
>>> I tracked them down to the fact that we do not inline some very small
>>> functions already (such as IsHTMLWhitespace .  In GCC 5 timeframe I
>>> tuned this parameter to 20% based on Firefox LTO benchmarks but I was
>>> not that serious about performance since my setup was not giving very
>>> reproducible results for sub 5% differences on tp5o. Since we plan to
>>> enable LTO by default for Tumbleweed I need to find something that does
>>> not cause too many regression while keeping code size advantage of
>>> non-LTO.
>> 
>> from my understanding, the performance regression from LTO to non-LTO is caused 
>> by some small and important functions cannot be inlined anymore with LTO due to more functions are
>> eligible to be inlined for LTO, therefore the original value for inline-unit-growth becomes relatively smaller.
> 
> Yes, with whole program optimization most functions calls are inlinable,
> while when in normal non-LTO build most function calls are external.
> Since there are a lot of small functions called cross-module in modern
> C++ programs it simply makes inliner to run out of the limits before
> getting some of useful inline decisions.
> 
> I was poking about this for a while, but did not really have very good
> testcases available making it difficult to judge code size/performance
> tradeoffs here.  With Firefox I can measure things better now and
> it is clear that 20% growth is just too small. It is small even with
> profile feedback where compiler knows quite well what calls to inline
> and more so without.

Yes, for C++, 20% might be too small, especially for cross-file inlining.
and C++ applications usually benefit more from inlining. 

>> 
>> When increasing the value of inline-unit-growth for LTO is one approach to resolve this issue, adjusting
>> the sorting heuristic to sort those important and smaller routines as higher priority to be inlined might be
>> another and better approach? 
> 
> Yes, i have also reworked the inline metrics somehwat and spent quite
> some time looking into dumps to see that it behaves reasonably.  There
> was two ages old bugs I fixed in last two weeks and also added some
> extra tricks like penalizing cross-module inlines some time ago. Given
> the fact that even with profile feedback I am not able to sort the
> priority queue well and neither can Clang do the job, I think it is good
> motivation to adjust the parameter which I have set somewhat arbitrarily
> at a time I was not able to test it well.

where is the code for your current heuristic for sorting the inlinable candidates?

Thanks.

Qing
> 
> Honza
Jan Hubicka Jan. 9, 2019, 2:31 p.m. UTC | #6
> Looks like that our current documentation has a bug in the below:
> 
> https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html <https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html>
> 
> -finline-functions
> Consider all functions for inlining, even if they are not declared inline. The compiler heuristically decides which functions are worth integrating in this way.
> If all calls to a given function are integrated, and the function is declared static, then the function is normally not output as assembler code in its own right.
> Enabled at levels -O2, -O3, -Os. Also enabled by -fprofile-use and -fauto-profile.
> 
> It clearly mentioned that -finline-functions is enabled at -O2, O3, -Os. 
> 
> And I checked the gcc9 source code, opts.c:
> 
>     /* -O3 and -Os optimizations.  */
>     /* Inlining of functions reducing size is a good idea with -Os
>        regardless of them being declared inline.  */
>     { OPT_LEVELS_3_PLUS_AND_SIZE, OPT_finline_functions, NULL, 1 },
> 
> looks like that -finline-functions is ONLY enabled at -O3 and -Os, not for O2.
> (However, I am confused with why -finline-functions should be enabled for -Os?)

yes, it is a documentation bug. Seems like Eric beat me to fixing it.
> 
> >> 
> >>> 
> >>> In my test bed this included Firefox with or without LTO becuase they do
> >>> "poor man's" LTO by #including multiple .cpp files into single unified
> >>> source which makes average units large.  Also tramp3d, DLV from our C++
> >>> benhcmark is affected. 
> >>> 
> >>> I have some data on Firefox and I will build remainin ones:
> >> in the following, are the data for code size? are the optimization level O3?
> >> what’s PGO mean?  
> > 
> > Those are sizes of libxul, which is the largest library of Firefox.
> > PGO is profile guided optimization.
> 
> Okay.  I see. 
> 
> looks like for LTO,  the code size increase with profiling is much smaller than
> that without profiling when growth is increased from 20% to 40%.  

With LTO the growth is about 9%, while for non-LTO it is about 4% and
with PGO it is about 3%.  This is expected.

For non-LTO most of the translation units do not hit the limit because most
of the calls are external. Firefox is a bit special here by using the #include
based unified build that gets it closer to LTO, but not quite.

With LTO there is only one translation unit that hits the 20% code size
growth that after optimization translates to that 9%

With profile feedback, code is partitioned into cold and hot sections
where only the hot section grows by the given percentage. For Firefox
about 15% of the binary is trained and the rest is cold.
> 
> for Non-LTO, the code size increase is minimal when growth is increased fro 20% to 40%.
> 
> However, not quite understand the last column, could you please explain a little bit
> on last column (-finline-functions)?

It is a non-LTO build but with additional -finline-functions.

GCC build machinery uses -O2 by default and -O3 for some files. Adding
-finline-functions enables aggressive inlining everywhere.  But double
checking the numbers, I must have cut&pasted wrong data here.  For
growth 20 -finline-functions non-LTO non-PGO I get 107272791 (so table
is wrong) and increasing growth to 40 gets me 115311719 (which is
correct in the table)
> 
> >>> 
> >>> growth		LTO+PGO	   PGO	     LTO        none      -finline-functions
> >>> 20 (default)   83752215   94390023  93085455  103437191  94351191
> >>> 40             85299111   97220935  101600151 108910311  115311719
> >>> clang          111520431            114863807 108437807

It should be:
growth		LTO+PGO	   PGO	     LTO        none      -finline-functions
20 (default)   83752215   94390023  93085455  103437191  107272791
40             85299111   97220935  101600151 108910311  115311719
clang          111520431            114863807 108437807

So 7.5% growth.
> > I was poking about this for a while, but did not really have very good
> > testcases available making it difficult to judge code size/performance
> > tradeoffs here.  With Firefox I can measure things better now and
> > it is clear that 20% growth is just too small. It is small even with
> > profile feedback where compiler knows quite well what calls to inline
> > and more so without.
> 
> Yes, for C++, 20% might be too small, especially for cross-file inlining.
> and C++ applications usually benefit more from inlining. 

Yep, that is my conclusion too.
> 
> >> 
> >> When increasing the value of inline-unit-growth for LTO is one approach to resolve this issue, adjusting
> >> the sorting heuristic to sort those important and smaller routines as higher priority to be inlined might be
> >> another and better approach? 
> > 
> > Yes, i have also reworked the inline metrics somehwat and spent quite
> > some time looking into dumps to see that it behaves reasonably.  There
> > was two ages old bugs I fixed in last two weeks and also added some
> > extra tricks like penalizing cross-module inlines some time ago. Given
> > the fact that even with profile feedback I am not able to sort the
> > priority queue well and neither can Clang do the job, I think it is good
> > motivation to adjust the parameter which I have set somewhat arbitrarily
> > at a time I was not able to test it well.
> 
> where is the code for your current heuristic to sorting the inlinable candidates?

It is in ipa-inline.c:edge_badness.
If you use -fdump-ipa-inline-details you can search for "Considering" in
the dump file to find record about every inline decision. It dumps the
badness value and also the individual values used to compute it.

Honza
> 
> Thanks.
> 
> Qing
> > 
> > Honza
>
Qing Zhao Jan. 9, 2019, 6:04 p.m. UTC | #7
>>> 
>>> Those are sizes of libxul, which is the largest library of Firefox.
>>> PGO is profile guided optimization.
>> 
>> Okay.  I see. 
>> 
>> looks like for LTO,  the code size increase with profiling is much smaller than
>> that without profiling when growth is increased from 20% to 40%.  
> 
> With LTo the growth is about 9%, while for non-LTO is about about 4% and
> with PGO it is about 3%.  This is expected.
> 
> For non-LTO most of translation units do not hit the limit becuase most
> of calls are external. Firefox is bit special here by using the #include
> based unified build that gets it closer to LTO, but not quite.
> 
> With LTO there is only one translation unit that hits the 20% code size
> growth that after optimization translates to that 9%
> 
> With profilef feedback code is partitioned into cold and hot sections
> where only hot section growths by the given percentage. For firefox
> about 15% of the binary is trained and rest is cold.
>> 
>> for Non-LTO, the code size increase is minimal when growth is increased fro 20% to 40%.
>> 
>> However, not quite understand the last column, could you please explain a little bit
>> on last column (-finline-functions)?
> 
> It is non-lto build but with additional -finline-functions.
> 
> GCC build machinery uses -O2 by default and -O3 for some files. Adding
> -finline-functions enables agressive inlining everywhere.  But double
> checking the numbers, I must have cut&pasted wrong data here.  For
> growth 20 -finline-functions non-LTO non-PGO I get 107272791 (so table
> is wrong) and increasing growth to 40 gets me 115311719 (which is
> correct in the table)
>> 
>>>>> 
>>>>> growth		LTO+PGO	   PGO	     LTO        none      -finline-functions
>>>>> 20 (default)   83752215   94390023  93085455  103437191  94351191
>>>>> 40             85299111   97220935  101600151 108910311  115311719
>>>>> clang          111520431            114863807 108437807
> 
> It should be:
> growth		LTO+PGO	   PGO	     LTO        none      -finline-functions
> 20 (default)   83752215   94390023  93085455  103437191  107272791
> 40             85299111   97220935  101600151 108910311  115311719
> clang          111520431            114863807 108437807
> 
> So 7.5% growth.
Okay, I see.

>>> 
>>> Yes, i have also reworked the inline metrics somehwat and spent quite
>>> some time looking into dumps to see that it behaves reasonably.  There
>>> was two ages old bugs I fixed in last two weeks and also added some
>>> extra tricks like penalizing cross-module inlines some time ago. Given
>>> the fact that even with profile feedback I am not able to sort the
>>> priority queue well and neither can Clang do the job, I think it is good
>>> motivation to adjust the parameter which I have set somewhat arbitrarily
>>> at a time I was not able to test it well.
>> 
>> where is the code for your current heuristic to sorting the inlinable candidates?
> 
> It is in ipa-inline.c:edge-badness
> If you use -fdump-ipa-inline-details you can search for "Considering" in
> the dump file to find record about every inline decision. It dumps the
> badness value and also the individual values used to compute it.

thanks, will take a look on it.

Qing
>
diff mbox series

Patch

Index: ipa-inline.c
===================================================================
--- ipa-inline.c	(revision 267612)
+++ ipa-inline.c	(working copy)
@@ -1082,8 +1082,8 @@  edge_badness (struct cgraph_edge *edge,
   /* When profile is available. Compute badness as:
      
                  time_saved * caller_count
-     goodness =  -------------------------------------------------
-	         growth_of_caller * overall_growth * combined_size
+     goodness =  --------------------------------
+	         growth_of_caller * combined_size
 
      badness = - goodness
 
@@ -1094,7 +1094,6 @@  edge_badness (struct cgraph_edge *edge,
 	   || caller->count.ipa ().nonzero_p ())
     {
       sreal numerator, denominator;
-      int overall_growth;
       sreal inlined_time = compute_inlined_call_time (edge, edge_time);
 
       numerator = (compute_uninlined_call_time (edge, unspec_edge_time)
@@ -1106,73 +1105,6 @@  edge_badness (struct cgraph_edge *edge,
       else if (caller->count.ipa ().initialized_p ())
 	numerator = numerator >> 11;
       denominator = growth;
-
-      overall_growth = callee_info->growth;
-
-      /* Look for inliner wrappers of the form:
-
-	 inline_caller ()
-	   {
-	     do_fast_job...
-	     if (need_more_work)
-	       noninline_callee ();
-	   }
-	 Withhout panilizing this case, we usually inline noninline_callee
-	 into the inline_caller because overall_growth is small preventing
-	 further inlining of inline_caller.
-
-	 Penalize only callgraph edges to functions with small overall
-	 growth ...
-	*/
-      if (growth > overall_growth
-	  /* ... and having only one caller which is not inlined ... */
-	  && callee_info->single_caller
-	  && !edge->caller->global.inlined_to
-	  /* ... and edges executed only conditionally ... */
-	  && edge->sreal_frequency () < 1
-	  /* ... consider case where callee is not inline but caller is ... */
-	  && ((!DECL_DECLARED_INLINE_P (edge->callee->decl)
-	       && DECL_DECLARED_INLINE_P (caller->decl))
-	      /* ... or when early optimizers decided to split and edge
-		 frequency still indicates splitting is a win ... */
-	      || (callee->split_part && !caller->split_part
-		  && edge->sreal_frequency () * 100
-		     < PARAM_VALUE
-			  (PARAM_PARTIAL_INLINING_ENTRY_PROBABILITY)
-		  /* ... and do not overwrite user specified hints.   */
-		  && (!DECL_DECLARED_INLINE_P (edge->callee->decl)
-		      || DECL_DECLARED_INLINE_P (caller->decl)))))
-	{
-	  ipa_fn_summary *caller_info = ipa_fn_summaries->get (caller);
-	  int caller_growth = caller_info->growth;
-
-	  /* Only apply the penalty when caller looks like inline candidate,
-	     and it is not called once and.  */
-	  if (!caller_info->single_caller && overall_growth < caller_growth
-	      && caller_info->inlinable
-	      && caller_info->size
-		 < (DECL_DECLARED_INLINE_P (caller->decl)
-		    ? MAX_INLINE_INSNS_SINGLE : MAX_INLINE_INSNS_AUTO))
-	    {
-	      if (dump)
-		fprintf (dump_file,
-			 "     Wrapper penalty. Increasing growth %i to %i\n",
-			 overall_growth, caller_growth);
-	      overall_growth = caller_growth;
-	    }
-	}
-      if (overall_growth > 0)
-        {
-	  /* Strongly preffer functions with few callers that can be inlined
-	     fully.  The square root here leads to smaller binaries at average.
-	     Watch however for extreme cases and return to linear function
-	     when growth is large.  */
-	  if (overall_growth < 256)
-	    overall_growth *= overall_growth;
-	  else
-	    overall_growth += 256 * 256 - 256;
-	  denominator *= overall_growth;
-        }
       denominator *= ipa_fn_summaries->get (caller)->self_size + growth;
 
       badness = - numerator / denominator;
@@ -1182,18 +1114,14 @@  edge_badness (struct cgraph_edge *edge,
 	  fprintf (dump_file,
 		   "      %f: guessed profile. frequency %f, count %" PRId64
 		   " caller count %" PRId64
-		   " time w/o inlining %f, time with inlining %f"
-		   " overall growth %i (current) %i (original)"
-		   " %i (compensated)\n",
+		   " time w/o inlining %f, time with inlining %f\n",
 		   badness.to_double (),
 		   edge->sreal_frequency ().to_double (),
 		   edge->count.ipa ().initialized_p () ? edge->count.ipa ().to_gcov_type () : -1,
 		   caller->count.ipa ().initialized_p () ? caller->count.ipa ().to_gcov_type () : -1,
 		   compute_uninlined_call_time (edge,
 						unspec_edge_time).to_double (),
-		   inlined_time.to_double (),
-		   estimate_growth (callee),
-		   callee_info->growth, overall_growth);
+		   inlined_time.to_double ());
 	}
     }
   /* When function local profile is not available or it does not give