Implement ipa_inline_call_context cache
diff mbox series

Message ID 20191103163549.vknanxugsfzj7cal@kam.mff.cuni.cz
State New
Headers show
Series
  • Implement ipa_inline_call_context cache
Related show

Commit Message

Jan Hubicka Nov. 3, 2019, 4:35 p.m. UTC
Hi,
this patch implements simple single entry LRU cache for computing size/time of
callgraph edges.   My original patch contained a hashtable for all contexts but
since I became more careful about memory allocation I decided to first go with
this simple patch.  For Firefox I get:

node context cache: 1266834 hits, 871097 misses, 345654 initializations

So we save about 51% of calculations which are the usual bottlenecks of the
greedy inliner. Calculating the estimate involves walking all edges out of
the given function and since we compute it for each caller, this may get
quadratic.

In the followup patches I will increase the effectivity of cache by dropping
unnecesary information and then I will evaluate what size of cahce works best.

Current perf profile of WPA on firefox looks as follows:
   2.98%  lto1-wpa-stream  libc-2.24.so       [.] _int_malloc
   2.75%  lto1-wpa-stream  lto1               [.] streamer_tree_cache_lookup
   2.16%  lto1-wpa         libc-2.24.so       [.] _int_malloc
   2.13%  lto1-wpa         lto1               [.] inflate_fast
   1.96%  lto1-wpa         lto1               [.] record_target_from_binfo
   1.90%  lto1-wpa         lto1               [.] types_same_for_odr
   1.37%  lto1-wpa         lto1               [.] estimate_calls_size_and_time
   1.37%  lto1-wpa         lto1               [.] symbol_table::remove_unreachable_nodes
   1.23%  lto1-wpa-stream  lto1               [.] streamer_write_uhwi_stream
   1.07%  lto1-wpa         lto1               [.] type_possibly_instantiated_p
   1.07%  lto1-wpa         libc-2.24.so       [.] malloc_consolidate
   1.02%  lto1-wpa-stream  lto1               [.] DFS::DFS
   1.02%  lto1-wpa         lto1               [.] unify_scc
   1.02%  lto1-wpa         lto1               [.] sreal::operator/
   1.01%  lto1-wpa-stream  [kernel.kallsyms]  [k] copy_page
   0.97%  lto1-wpa         lto1               [.] fibonacci_heap<sreal, cgraph_edge>::consolidate
   0.89%  lto1-wpa         lto1               [.] evaluate_properties_for_edge
   0.81%  lto1-wpa         [kernel.kallsyms]  [k] copy_page_range
   0.81%  lto1-wpa         lto1               [.] splay_tree_splay.part.0
   0.80%  lto1-wpa-stream  lto1               [.] lto_output_tree
   0.79%  lto1-wpa         lto1               [.] streamer_read_uhwi
   0.75%  lto1-wpa         lto1               [.] gimple_get_virt_method_for_vtable
   0.74%  lto1-wpa         lto1               [.] edge_badness
   0.74%  lto1-wpa         libc-2.24.so       [.] int_mallinfo
   0.74%  lto1-wpa         lto1               [.] rename_statics
   0.74%  lto1-wpa         lto1               [.] possible_polymorphic_call_targets_1
   0.69%  lto1-wpa         lto1               [.] compare_tree_sccs_1
   0.67%  lto1-wpa         libc-2.24.so       [.] _int_free
   0.65%  lto1-wpa         lto1               [.] fibonacci_heap<sreal, cgraph_edge>::extract_minimum_node
   0.61%  lto1-wpa         lto1               [.] htab_hash_string
   0.58%  lto1-wpa         lto1               [.] ht_lookup_with_hash
   0.58%  lto1-wpa-stream  lto1               [.] linemap_ordinary_map_lookup
   0.57%  lto1-wpa         lto1               [.] streamer_read_tree_bitfields
   0.56%  lto1-wpa         lto1               [.] cgraph_node::get_availability
   0.56%  lto1-wpa         lto1               [.] hash_table<hash_map<int_hash<int, 0, -1>, edge_growth_cache_entry*, simple_hashmap_traits<default_hash_traits<int_hash<int, 0, -1> >, edge_gr

Cutting down record_target_from_binfo and types_same_for_odr is relatively easy
(we do not need to walk may edges in unreachable function removal each time).
estimate_calls_size_and_time was reduced by this patch (it used to be 1st of
lto1-wpa).


Bootstrapped/regtested x86_64-linux, will commit it shortly.

Honza
	* ipa-fnsummary.c (ipa_call_context::duplicate_from): New
	member function.
	(ipa_call_context::release): Add ALL parameter.
	(ipa_call_context::equal_to): New member function.
	* ipa-fnsummary.h (ipa_call_context): Add empty constructor;
	duplicate_form, release, equal_to and exists_p member functoins.
	* ipa-inline-analysis.c (node_context_cache_entry): New
	class.
	(node_context_summary): Likewise.
	(node_context_cache, node_context_cache_hit, node_context_cache_miss,
	node_context_clear): New static vars.
	(initialize_growth_caches): New function.
	(free_growth_caches): Also delete node_context_cache; output stats.
	(do_estimate_edge_time): Cache contexts.
	(reset_node_cache): New function.
	* ipa-inline.c (reset_edge_caches): Reset also node cache.
	(inline_small_functions): Initialize growth caches.
	* ipa-inline.h (reset_node_cache, initialize_growth_caches):
	Declare.
	* ipa-predicate.h (inline_param_summary::equal_to): New.
	* ipa-prop.c (ipa_agg_jf_item::equal_to): New.
	* ipa-prop.h (ipa_agg_jf_item): Declare equal_to member function.
	(ipa_agg_jump_function): Implement equal_to member function.

Patch
diff mbox series

Index: ipa-fnsummary.c
===================================================================
--- ipa-fnsummary.c	(revision 277755)
+++ ipa-fnsummary.c	(working copy)
@@ -2964,14 +2964,103 @@  ipa_call_context::ipa_call_context (cgra
 {
 }
 
-/* Release memory used by known_vals/contexts/aggs vectors.  */
+void
+ipa_call_context::duplicate_from (const ipa_call_context &ctx)
+{
+  m_node = ctx.m_node;
+  m_possible_truths = ctx.m_possible_truths;
+  m_nonspec_possible_truths = ctx.m_nonspec_possible_truths;
+
+  if (ctx.m_inline_param_summary.exists ())
+    m_inline_param_summary = ctx.m_inline_param_summary.copy ();
+  else
+    m_inline_param_summary = vNULL;
+  if (ctx.m_known_vals.exists ())
+    m_known_vals = ctx.m_known_vals.copy ();
+  else
+    m_known_vals = vNULL;
+  if (ctx.m_known_contexts.exists ())
+    m_known_contexts = ctx.m_known_contexts.copy ();
+  else
+    m_known_contexts = vNULL;
+  if (ctx.m_known_aggs.exists ())
+    m_known_aggs = ctx.m_known_aggs.copy ();
+  else
+    m_known_aggs = vNULL;
+}
+
+/* Release memory used by known_vals/contexts/aggs vectors.
+   If ALL is true release also inline_param_summary.
+   This happens when context was previously duplciated to be stored
+   into cache.  */
 
 void
-ipa_call_context::release ()
+ipa_call_context::release (bool all)
 {
+  /* See if context is initialized at first place.  */
+  if (!m_node)
+    return;
   m_known_vals.release ();
   m_known_contexts.release ();
   m_known_aggs.release ();
+  if (all)
+    m_inline_param_summary.release ();
+}
+
+/* Return true if CTX describes the same call context as THIS.  */
+
+bool
+ipa_call_context::equal_to (const ipa_call_context &ctx)
+{
+  if (m_node != ctx.m_node
+      || m_possible_truths != ctx.m_possible_truths
+      || m_nonspec_possible_truths != ctx.m_nonspec_possible_truths)
+    return false;
+  if (m_inline_param_summary.exists () != ctx.m_inline_param_summary.exists ()
+      || m_known_vals.exists () != ctx.m_known_vals.exists()
+      || m_known_contexts.exists () != ctx.m_known_contexts.exists ()
+      || m_known_aggs.exists () != ctx.m_known_aggs.exists ())
+    return false;
+  if (m_inline_param_summary.exists ())
+    {
+      if (m_inline_param_summary.length () != ctx.m_inline_param_summary.length ())
+	return false;
+      for (unsigned int i = 0; i < m_inline_param_summary.length (); i++)
+	if (!m_inline_param_summary[i].equal_to (ctx.m_inline_param_summary[i]))
+	  return false;
+    }
+  if (m_known_vals.exists ())
+    {
+      if (m_known_vals.length () != ctx.m_known_vals.length ())
+	return false;
+      for (unsigned int i = 0; i < m_known_vals.length (); i++)
+	{
+	  tree t1 = m_known_vals[i];
+	  tree t2 = ctx.m_known_vals[i];
+
+	  if (t1 != t2
+	      && (!t1 || !t2 || !operand_equal_p (m_known_vals[i],
+						  ctx.m_known_vals[i], 0)))
+	    return false;
+	}
+    }
+  if (m_known_contexts.exists ())
+    {
+      if (m_known_contexts.length () != ctx.m_known_contexts.length ())
+	return false;
+      for (unsigned int i = 0; i < m_known_contexts.length (); i++)
+	if (!m_known_contexts[i].equal_to (ctx.m_known_contexts[i]))
+	  return false;
+    }
+  if (m_known_aggs.exists ())
+    {
+      if (m_known_aggs.length () != ctx.m_known_aggs.length ())
+	return false;
+      for (unsigned int i = 0; i < m_known_aggs.length (); i++)
+	if (!m_known_aggs[i]->equal_to (*ctx.m_known_aggs[i]))
+	  return false;
+    }
+  return true;
 }
 
 /* Estimate size and time needed to execute call in the given context.
Index: ipa-fnsummary.h
===================================================================
--- ipa-fnsummary.h	(revision 277755)
+++ ipa-fnsummary.h	(working copy)
@@ -295,11 +295,21 @@  public:
 		    vec<ipa_polymorphic_call_context> known_contexts,
 		    vec<ipa_agg_jump_function_p> known_aggs,
 		    vec<inline_param_summary> m_inline_param_summary);
+  ipa_call_context ()
+  : m_node(NULL)
+  {
+  }
   void estimate_size_and_time (int *ret_size, int *ret_min_size,
 			       sreal *ret_time,
 			       sreal *ret_nonspecialized_time,
 			       ipa_hints *ret_hints);
-  void release ();
+  void duplicate_from (const ipa_call_context &ctx);
+  void release (bool all = false);
+  bool equal_to (const ipa_call_context &);
+  bool exists_p ()
+  {
+    return m_node != NULL;
+  }
 private:
   /* Called function.  */
   cgraph_node *m_node;
Index: ipa-inline-analysis.c
===================================================================
--- ipa-inline-analysis.c	(revision 277755)
+++ ipa-inline-analysis.c	(working copy)
@@ -53,6 +53,48 @@  along with GCC; see the file COPYING3.
 /* Cached node/edge growths.  */
 call_summary<edge_growth_cache_entry *> *edge_growth_cache = NULL;
 
+/* The context cache remembers estimated time/size and hints for given
+   ipa_call_context of a call.  */
+class node_context_cache_entry
+{
+public:
+  ipa_call_context ctx;
+  sreal time, nonspec_time;
+  int size;
+  ipa_hints hints;
+
+  node_context_cache_entry ()
+  : ctx ()
+  {
+  }
+  ~node_context_cache_entry ()
+  {
+    ctx.release ();
+  }
+};
+
+/* At the moment we implement primitive single entry LRU cache.  */
+class node_context_summary
+{
+public:
+  node_context_cache_entry entry;
+
+  node_context_summary ()
+  : entry ()
+  {
+  }
+  ~node_context_summary ()
+  {
+  }
+};
+
+/* Summary holding the context cache.  */
+static fast_function_summary <node_context_summary *, va_heap>
+	*node_context_cache = NULL;
+/* Statistics about the context cache effectivity.  */
+static long node_context_cache_hit, node_context_cache_miss,
+	    node_context_cache_clear;
+
 /* Give initial reasons why inlining would fail on EDGE.  This gets either
    nullified or usually overwritten by more precise reasons later.  */
 
@@ -77,6 +119,16 @@  initialize_inline_failed (struct cgraph_
 			    == CIF_FINAL_ERROR);
 }
 
+/* Allocate edge growth caches.  */
+
+void
+initialize_growth_caches ()
+{
+  edge_growth_cache
+    = new call_summary<edge_growth_cache_entry *> (symtab, false);
+  node_context_cache
+    = new fast_function_summary<node_context_summary *, va_heap> (symtab);
+}
 
 /* Free growth caches.  */
 
@@ -84,7 +136,17 @@  void
 free_growth_caches (void)
 {
   delete edge_growth_cache;
+  delete node_context_cache;
   edge_growth_cache = NULL;
+  node_context_cache = NULL;
+  if (dump_file)
+    fprintf (dump_file, "node context cache: %li hits, %li misses,"
+		   	" %li initializations\n",
+	     node_context_cache_hit, node_context_cache_miss,
+	     node_context_cache_clear);
+  node_context_cache_hit = 0;
+  node_context_cache_miss = 0;
+  node_context_cache_clear = 0;
 }
 
 /* Return hints derrived from EDGE.   */
@@ -129,7 +191,7 @@  do_estimate_edge_time (struct cgraph_edg
   vec<ipa_polymorphic_call_context> known_contexts;
   vec<ipa_agg_jump_function_p> known_aggs;
   class ipa_call_summary *es = ipa_call_summaries->get (edge);
-  int min_size;
+  int min_size = -1;
 
   callee = edge->callee->ultimate_alias_target ();
 
@@ -139,8 +201,37 @@  do_estimate_edge_time (struct cgraph_edg
 				&known_contexts, &known_aggs);
   ipa_call_context ctx (callee, clause, nonspec_clause, known_vals,
 		  	known_contexts, known_aggs, es->param);
-  ctx.estimate_size_and_time (&size, &min_size,
-			      &time, &nonspec_time, &hints);
+  if (node_context_cache != NULL)
+    {
+      node_context_summary *e = node_context_cache->get_create (callee);
+      if (e->entry.ctx.equal_to (ctx))
+	{
+	  node_context_cache_hit++;
+	  size = e->entry.size;
+	  time = e->entry.time;
+	  nonspec_time = e->entry.nonspec_time;
+	  hints = e->entry.hints;
+	}
+      else
+	{
+	  if (e->entry.ctx.exists_p ())
+	    node_context_cache_miss++;
+	  else
+	    node_context_cache_clear++;
+	  e->entry.ctx.release (true);
+	  e->entry.ctx = ctx;
+	  ctx.estimate_size_and_time (&size, &min_size,
+				      &time, &nonspec_time, &hints);
+	  e->entry.size = size;
+	  e->entry.time = time;
+	  e->entry.nonspec_time = nonspec_time;
+	  e->entry.hints = hints;
+	  e->entry.ctx.duplicate_from (ctx);
+	}
+    }
+  else
+    ctx.estimate_size_and_time (&size, &min_size,
+				&time, &nonspec_time, &hints);
 
   /* When we have profile feedback, we can quite safely identify hot
      edges and for those we disable size limits.  Don't do that when
@@ -160,8 +251,9 @@  do_estimate_edge_time (struct cgraph_edg
   /* When caching, update the cache entry.  */
   if (edge_growth_cache != NULL)
     {
-      ipa_fn_summaries->get (edge->callee->function_symbol ())->min_size
-	 = min_size;
+      if (min_size >= 0)
+        ipa_fn_summaries->get (edge->callee->function_symbol ())->min_size
+	   = min_size;
       edge_growth_cache_entry *entry
 	= edge_growth_cache->get_create (edge);
       entry->time = time;
@@ -174,6 +266,14 @@  do_estimate_edge_time (struct cgraph_edg
   return time;
 }
 
+/* Reset cache for NODE.
+   This must be done each time NODE body is modified.  */
+void
+reset_node_cache (struct cgraph_node *node)
+{
+  if (node_context_cache)
+    node_context_cache->remove (node);
+}
 
 /* Return estimated callee growth after inlining EDGE.
    Only to be called via estimate_edge_size.  */
Index: ipa-inline.c
===================================================================
--- ipa-inline.c	(revision 277753)
+++ ipa-inline.c	(working copy)
@@ -1368,6 +1368,8 @@  reset_edge_caches (struct cgraph_node *n
   if (where->inlined_to)
     where = where->inlined_to;
 
+  reset_node_cache (where);
+
   if (edge_growth_cache != NULL)
     for (edge = where->callers; edge; edge = edge->next_caller)
       if (edge->inline_failed)
@@ -1900,8 +1902,7 @@  inline_small_functions (void)
 	  max_count = max_count.max (edge->count.ipa ());
       }
   ipa_free_postorder_info ();
-  edge_growth_cache
-    = new call_summary<edge_growth_cache_entry *> (symtab, false);
+  initialize_growth_caches ();
 
   if (dump_file)
     fprintf (dump_file,
Index: ipa-inline.h
===================================================================
--- ipa-inline.h	(revision 277753)
+++ ipa-inline.h	(working copy)
@@ -48,6 +48,8 @@  bool growth_likely_positive (struct cgra
 int do_estimate_edge_size (struct cgraph_edge *edge);
 sreal do_estimate_edge_time (struct cgraph_edge *edge);
 ipa_hints do_estimate_edge_hints (struct cgraph_edge *edge);
+void reset_node_cache (struct cgraph_node *node);
+void initialize_growth_caches ();
 void free_growth_caches (void);
 
 /* In ipa-inline.c  */
Index: ipa-predicate.h
===================================================================
--- ipa-predicate.h	(revision 277753)
+++ ipa-predicate.h	(working copy)
@@ -77,6 +77,10 @@  struct inline_param_summary
 
      Value 0 is reserved for compile time invariants. */
   int change_prob;
+  bool equal_to (const inline_param_summary &other)
+  {
+    return change_prob == other.change_prob;
+  }
 };
 
 typedef vec<condition, va_gc> *conditions;
Index: ipa-prop.c
===================================================================
--- ipa-prop.c	(revision 277753)
+++ ipa-prop.c	(working copy)
@@ -5293,4 +5293,12 @@  ipcp_transform_function (struct cgraph_n
   return TODO_update_ssa_only_virtuals;
 }
 
+
+/* Return true if OTHER describes same agg item.  */
+bool
+ipa_agg_jf_item::equal_to (const ipa_agg_jf_item &other)
+{
+  return offset == other.offset
+	 && operand_equal_p (value, other.value, 0);
+}
 #include "gt-ipa-prop.h"
Index: ipa-prop.h
===================================================================
--- ipa-prop.h	(revision 277753)
+++ ipa-prop.h	(working copy)
@@ -127,6 +127,9 @@  struct GTY(()) ipa_agg_jf_item
 
   /* The known constant or type if this is a clobber.  */
   tree value;
+
+  /* Return true if OTHER describes same agg item.  */
+  bool equal_to (const ipa_agg_jf_item &other);
 };
 
 
@@ -139,6 +142,23 @@  struct GTY(()) ipa_agg_jump_function
   vec<ipa_agg_jf_item, va_gc> *items;
   /* True if the data was passed by reference (as opposed to by value). */
   bool by_ref;
+
+  /* Return true if OTHER describes same agg items.  */
+  bool equal_to (const ipa_agg_jump_function &other)
+  {
+    if (by_ref != other.by_ref)
+      return false;
+    if (items != NULL && other.items == NULL)
+      return false;
+    if (!items)
+      return other.items == NULL;
+    if (items->length () != other.items->length ())
+      return false;
+    for (unsigned int i = 0; i < items->length (); i++)
+      if (!(*items)[i].equal_to ((*other.items)[i]))
+	return false;
+    return true;
+  }
 };
 
 typedef struct ipa_agg_jump_function *ipa_agg_jump_function_p;