
Another inline heuristic tweak

Message ID 20150331073144.GA62830@kam.mff.cuni.cz
State New

Commit Message

Jan Hubicka March 31, 2015, 7:31 a.m. UTC
Hi,
I was investigating regressions relative to GCC 4.9 on some of the Firefox
Talos benchmarks.  They turned out to be caused by wrong handling of
functions that have a fast inline path plus an offline slow implementation.
I.e.:
 inline_caller ()
   {
     do_fast_job...
     if (need_more_work)
       noninline_callee ();
   }

Our inline heuristics give quite a lot of priority to inlinable functions
called once, because we know the code size will shrink.  They thus almost
consistently inline noninline_callee first, before even considering
inline_caller, which is way too large at that point.  This seems to have
got worse with the new inline metric.  LLVM also seems to get lucky here,
because the offline function is often placed in another compilation unit.
LLVM inlines quite aggressively at compile time even with LTO, and thus it
usually inlines the caller before even seeing the callee's body.

This patch adjusts the badness metric to recognize this specific pattern
and to use the caller's overall_growth instead of the callee's.  The long
conditional is there to make this fire only in quite controlled situations
(either because the user declared the wrapper inline or because the
function was produced by ipa-split).  I manually inspected a fairly
representative sample of the cases where this triggers on Firefox and GCC
and they all seem quite sane.  Next stage1 I may turn this into a simple
propagation that will probably lead to better results.  The patch as it is
fixes the regression and improves significantly over 4.9 (by up to 32% on
individual parts of dromaeo).
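
Schematically, the compensation amounts to the following (a standalone
sketch only, not the GCC internals; wrapper_pattern_p is a hypothetical
placeholder for the long conditional in edge_badness below):

 /* Pick the growth used in the badness denominator.  Illustrative
    model; the real code works on inline_summary data.  */
 static int
 compensated_growth (int callee_growth, int caller_growth,
                     int wrapper_pattern_p)
 {
   int overall_growth = callee_growth;
   /* The wrapper penalty: rate the edge by the caller's growth when
      the callee is the slow path of an inline wrapper.  */
   if (wrapper_pattern_p && caller_growth > overall_growth)
     overall_growth = caller_growth;
   return overall_growth;
 }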

The patch also adds an explanation to yesterday's change suggested by
Richard, and I added a cap of 64 to the squaring - it seems to matter only
for small functions, and I am concerned about overflows and about the
effect of extremely large values.  The periodic testers did not show any
noticeable regressions caused by the patch, and I have verified that all
of the speedup remains with the capping.
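
For reference, the capped square behaves as follows (again only a model
of the hunk in edge_badness; the two branches agree at the boundary,
since 64*64 == 64 + 64*64 - 64, so e.g. a growth of 100 contributes
4132 to the denominator rather than 10000):

 /* Quadratic penalty for small growth, linear past 64, so overflows
    and extreme values do not dominate the badness.  */
 static int
 capped_square (int overall_growth)
 {
   if (overall_growth < 64)
     return overall_growth * overall_growth;
   return overall_growth + 64 * 64 - 64;
 }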

I am running additional benchmarks overnight and intend to commit the
patch tomorrow if testing succeeds.

Bootstrapped/regtested x86_64-linux.

	* lto-cgraph.c (lto_output_node, input_overwrite_node): Stream
	split_part.
	* ipa-inline.c (edge_badness): Add wrapper penalty.
	(sum_callers): Move up.
	(inline_small_functions): Set single_caller.
	* ipa-inline.h (inline_summary): Add single_caller.
	* ipa-split.c (split_function): Set split_part.
	* cgraphclones.c (cgraph_node::create_clone): Do not shadow decl;
	copy split_part.
	* cgraph.h (cgraph_node): Add split_part.

	* gcc.dg/ipa/inlinehint-4.c: New testcase.

Patch

Index: lto-cgraph.c
===================================================================
--- lto-cgraph.c	(revision 221777)
+++ lto-cgraph.c	(working copy)
@@ -578,6 +578,7 @@  lto_output_node (struct lto_simple_outpu
   bp_pack_enum (&bp, ld_plugin_symbol_resolution,
 	        LDPR_NUM_KNOWN, node->resolution);
   bp_pack_value (&bp, node->instrumentation_clone, 1);
+  bp_pack_value (&bp, node->split_part, 1);
   streamer_write_bitpack (&bp);
   streamer_write_data_stream (ob->main_stream, section, strlen (section) + 1);
 
@@ -1214,6 +1216,7 @@  input_overwrite_node (struct lto_file_de
   node->resolution = bp_unpack_enum (bp, ld_plugin_symbol_resolution,
 				     LDPR_NUM_KNOWN);
   node->instrumentation_clone = bp_unpack_value (bp, 1);
+  node->split_part = bp_unpack_value (bp, 1);
   gcc_assert (flag_ltrans
 	      || (!node->in_other_partition
 		  && !node->used_from_other_partition));
Index: ipa-inline.c
===================================================================
--- ipa-inline.c	(revision 221777)
+++ ipa-inline.c	(working copy)
@@ -1088,6 +1088,7 @@  edge_badness (struct cgraph_edge *edge,
   else if (opt_for_fn (caller->decl, flag_guess_branch_prob) || caller->count)
     {
       sreal numerator, denominator;
+      int overall_growth;
 
       numerator = (compute_uninlined_call_time (callee_info, edge)
 		   - compute_inlined_call_time (edge, edge_time));
@@ -1098,8 +1099,74 @@  edge_badness (struct cgraph_edge *edge,
       else if (opt_for_fn (caller->decl, flag_branch_probabilities))
 	numerator = numerator >> 11;
       denominator = growth;
-      if (callee_info->growth > 0)
-	denominator *= callee_info->growth * callee_info->growth;
+
+      overall_growth = callee_info->growth;
+
+      /* Look for inliner wrappers of the form:
+
+	 inline_caller ()
+	   {
+	     do_fast_job...
+	     if (need_more_work)
+	       noninline_callee ();
+	   }
+	 Without penalizing this case, we usually inline noninline_callee
+	 into the inline_caller because overall_growth is small, which
+	 prevents further inlining of inline_caller.
+
+	 Penalize only callgraph edges to functions with small overall
+	 growth ...
+	*/
+      if (growth > overall_growth
+	  /* ... and having only one caller which is not inlined ... */
+	  && callee_info->single_caller
+	  && !edge->caller->global.inlined_to
+	  /* ... and edges executed only conditionally ... */
+	  && edge->frequency < CGRAPH_FREQ_BASE
+	  /* ... consider case where callee is not inline but caller is ... */
+	  && ((!DECL_DECLARED_INLINE_P (edge->callee->decl)
+	       && DECL_DECLARED_INLINE_P (caller->decl))
+	      /* ... or when early optimizers decided to split and edge
+		 frequency still indicates splitting is a win ... */
+	      || (callee->split_part && !caller->split_part
+		  && edge->frequency
+		     < CGRAPH_FREQ_BASE
+		       * PARAM_VALUE
+			  (PARAM_PARTIAL_INLINING_ENTRY_PROBABILITY) / 100
+		  /* ... and do not overwrite user specified hints.   */
+		  && (!DECL_DECLARED_INLINE_P (edge->callee->decl)
+		      || DECL_DECLARED_INLINE_P (caller->decl)))))
+	{
+	  struct inline_summary *caller_info = inline_summaries->get (caller);
+	  int caller_growth = caller_info->growth;
+
+	  /* Only apply the penalty when the caller looks like an inline
+	     candidate and is not itself called just once.  */
+	  if (!caller_info->single_caller && overall_growth < caller_growth
+	      && caller_info->inlinable
+	      && caller_info->size
+		 < (DECL_DECLARED_INLINE_P (caller->decl)
+		    ? MAX_INLINE_INSNS_SINGLE : MAX_INLINE_INSNS_AUTO))
+	    {
+	      if (dump)
+		fprintf (dump_file,
+			 "     Wrapper penalty. Increasing growth %i to %i\n",
+			 overall_growth, caller_growth);
+	      overall_growth = caller_growth;
+	    }
+	}
+      if (overall_growth > 0)
+        {
+	  /* Strongly prefer functions with few callers that can be inlined
+	     fully.  The squaring here leads to smaller binaries on average.
+	     Watch however for extreme cases and return to linear function
+	     when growth is large.  */
+	  if (overall_growth < 64)
+	    overall_growth *= overall_growth;
+	  else
+	    overall_growth += 64 * 64 - 64;
+	  denominator *= overall_growth;
+        }
 
       badness = - numerator / denominator;
 
@@ -1109,13 +1176,15 @@  edge_badness (struct cgraph_edge *edge,
 		   "      %f: guessed profile. frequency %f, count %"PRId64
 		   " caller count %"PRId64
 		   " time w/o inlining %f, time w inlining %f"
-		   " overall growth %i (current) %i (original)\n",
-		   badness.to_double (), (double)edge->frequency / CGRAPH_FREQ_BASE,
+		   " overall growth %i (current) %i (original)"
+		   " %i (compensated)\n",
+		   badness.to_double (),
+		  (double)edge->frequency / CGRAPH_FREQ_BASE,
 		   edge->count, caller->count,
 		   compute_uninlined_call_time (callee_info, edge).to_double (),
 		   compute_inlined_call_time (edge, edge_time).to_double (),
 		   estimate_growth (callee),
-		   callee_info->growth);
+		   callee_info->growth, overall_growth);
 	}
     }
   /* When function local profile is not available or it does not give
@@ -1133,8 +1202,8 @@  edge_badness (struct cgraph_edge *edge,
       else
 	badness = badness << nest;
       if (dump)
-	fprintf (dump_file, "      %f: no profile. nest %i\n", badness.to_double (),
-		 nest);
+	fprintf (dump_file, "      %f: no profile. nest %i\n",
+		 badness.to_double (), nest);
     }
   gcc_checking_assert (badness != 0);
 
@@ -1649,6 +1718,20 @@  inline_account_function_p (struct cgraph
 	   && node->frequency != NODE_FREQUENCY_UNLIKELY_EXECUTED);
 }
 
+/* Count the number of callers of NODE and store it into DATA (which
+   points to an int).  Worker for cgraph_for_node_and_aliases.  */
+
+static bool
+sum_callers (struct cgraph_node *node, void *data)
+{
+  struct cgraph_edge *e;
+  int *num_calls = (int *)data;
+
+  for (e = node->callers; e; e = e->next_caller)
+    (*num_calls)++;
+  return false;
+}
+
 /* We use greedy algorithm for inlining of small functions:
    All inline candidates are put into prioritized heap ordered in
    increasing badness.
@@ -1693,6 +1776,12 @@  inline_small_functions (void)
 	    if (inline_account_function_p (node))
 	      initial_size += info->size;
 	    info->growth = estimate_growth (node);
+
+	    int num_calls = 0;
+	    node->call_for_symbol_and_aliases (sum_callers, &num_calls,
+					       true);
+	    if (num_calls == 1)
+	      info->single_caller = true;
 	    if (dfs && dfs->next_cycle)
 	      {
 		struct cgraph_node *n2;
@@ -2085,20 +2174,6 @@  flatten_function (struct cgraph_node *no
     inline_update_overall_summary (node);
 }
 
-/* Count number of callers of NODE and store it into DATA (that
-   points to int.  Worker for cgraph_for_node_and_aliases.  */
-
-static bool
-sum_callers (struct cgraph_node *node, void *data)
-{
-  struct cgraph_edge *e;
-  int *num_calls = (int *)data;
-
-  for (e = node->callers; e; e = e->next_caller)
-    (*num_calls)++;
-  return false;
-}
-
 /* Inline NODE to all callers.  Worker for cgraph_for_node_and_aliases.
    DATA points to number of calls originally found so we avoid infinite
    recursion.  */
Index: ipa-inline.h
===================================================================
--- ipa-inline.h	(revision 221777)
+++ ipa-inline.h	(working copy)
@@ -129,6 +129,9 @@  struct GTY(()) inline_summary
   /* True when function contains cilk spawn (and thus we can not inline
      into it).  */
   unsigned contains_cilk_spawn : 1;
+  /* True when there is only one caller of the function before small
+     function inlining.  */
+  unsigned int single_caller : 1;
 
   /* Information about function that will result after applying all the
      inline decisions present in the callgraph.  Generally kept up to
Index: ipa-split.c
===================================================================
--- ipa-split.c	(revision 221777)
+++ ipa-split.c	(working copy)
@@ -1402,6 +1402,8 @@  split_function (basic_block return_bb, s
     (vNULL, NULL, args_to_skip, !split_part_return_p, split_point->split_bbs,
      split_point->entry_bb, "part");
 
+  node->split_part = true;
+
   /* Let's take a time profile for splitted function.  */
   node->tp_first_run = cur_node->tp_first_run + 1;
 
Index: cgraphclones.c
===================================================================
--- cgraphclones.c	(revision 221777)
+++ cgraphclones.c	(working copy)
@@ -437,7 +437,7 @@  cgraph_node::expand_all_artificial_thunk
    node is not inlined.  */
 
 cgraph_node *
-cgraph_node::create_clone (tree decl, gcov_type gcov_count, int freq,
+cgraph_node::create_clone (tree new_decl, gcov_type gcov_count, int freq,
 			   bool update_original,
 			   vec<cgraph_edge *> redirect_callers,
 			   bool call_duplication_hook,
@@ -449,7 +449,7 @@  cgraph_node::create_clone (tree decl, gc
   gcov_type count_scale;
   unsigned i;
 
-  new_node->decl = decl;
+  new_node->decl = new_decl;
   new_node->register_symbol ();
   new_node->origin = origin;
   new_node->lto_file_data = lto_file_data;
@@ -476,6 +476,7 @@  cgraph_node::create_clone (tree decl, gc
 
   new_node->clone.tree_map = NULL;
   new_node->clone.args_to_skip = args_to_skip;
+  new_node->split_part = split_part;
   if (!args_to_skip)
     new_node->clone.combined_args_to_skip = clone.combined_args_to_skip;
   else if (clone.combined_args_to_skip)
Index: testsuite/gcc.dg/ipa/inlinehint-4.c
===================================================================
--- testsuite/gcc.dg/ipa/inlinehint-4.c	(revision 0)
+++ testsuite/gcc.dg/ipa/inlinehint-4.c	(revision 0)
@@ -0,0 +1,40 @@ 
+/* { dg-options "-O3 -fdump-ipa-inline-details -fno-early-inlining --param large-unit-insns=1"  } */
+/* { dg-add-options bind_pic_locally } */
+int *hashval;
+int *hash;
+int hash_size;
+
+static int
+lookup_slow (int val)
+{
+  int i = val % hash_size;
+  while (hashval[i] && hashval[i] != val)
+    i++;
+  return hash[i];
+}
+
+static inline int
+lookup (int val)
+{
+  static int cache, cache_val;
+  if (val == cache_val)
+    return cache;
+  else
+    {
+      cache_val = val;
+      cache = lookup_slow (val);
+      return cache;
+    }
+}
+
+int
+test (int i)
+{
+  return lookup (i) + lookup (2 * i) + lookup (3 * i) + lookup (4 * i) +
+    lookup (5 * i) + lookup (6 * i) + lookup (7 * i) + lookup (8 * i) +
+    lookup (9 * i);
+}
+/* { dg-final { scan-ipa-dump "Wrapper penalty"  "inline"  } } */
+/* { dg-final { scan-ipa-dump-not "Inlining lookup_slow to lookup"  "inline"  } } */
+/* { dg-final { scan-ipa-dump "Inlining lookup to test"  "inline"  } } */
+/* { dg-final { cleanup-ipa-dump "inline" } } */
Index: cgraph.h
===================================================================
--- cgraph.h	(revision 221777)
+++ cgraph.h	(working copy)
@@ -1319,6 +1319,8 @@  public:
   unsigned merged : 1;
   /* True if function was created to be executed in parallel.  */
   unsigned parallelized_function : 1;
+  /* True if the function is a part split out by ipa-split.  */
+  unsigned split_part : 1;
 
 private:
   /* Worker for call_for_symbol_and_aliases.  */