Patchwork Inliner speedup

login
register
mail settings
Submitter Jan Hubicka
Date July 3, 2010, 12:38 p.m.
Message ID <20100703123807.GF6378@kam.mff.cuni.cz>
Download mbox | patch
Permalink /patch/57805/
State New
Headers show

Comments

Jan Hubicka - July 3, 2010, 12:38 p.m.
Hi,
this patch reduces inlining time at WHOPR bootstrap from 24%:
Execution times (seconds)
 garbage collection    :   0.29 ( 1%) usr   0.00 ( 0%) sys   0.29 ( 1%) wall       0 kB ( 0%) ggc
 callgraph optimization:   0.02 ( 0%) usr   0.00 ( 0%) sys   0.01 ( 0%) wall       0 kB ( 0%) ggc
 varpool construction  :   0.15 ( 0%) usr   0.00 ( 0%) sys   0.15 ( 0%) wall    1872 kB ( 0%) ggc
 ipa cp                :   0.25 ( 1%) usr   0.00 ( 0%) sys   0.26 ( 1%) wall   18162 kB ( 4%) ggc
 ipa lto gimple I/O    :   3.82 (13%) usr   0.17 (13%) sys   3.96 (12%) wall       0 kB ( 0%) ggc
 ipa lto decl I/O      :  13.00 (43%) usr   0.18 (14%) sys  13.31 (42%) wall  248867 kB (59%) ggc
 ipa lto decl init I/O :   0.38 ( 1%) usr   0.00 ( 0%) sys   0.39 ( 1%) wall   55385 kB (13%) ggc
 ipa lto cgraph I/O    :   0.15 ( 0%) usr   0.03 ( 2%) sys   0.18 ( 1%) wall   50794 kB (12%) ggc
 ipa lto decl merge    :   1.69 ( 6%) usr   0.05 ( 4%) sys   1.74 ( 5%) wall      29 kB ( 0%) ggc
 ipa lto cgraph merge  :   0.10 ( 0%) usr   0.01 ( 1%) sys   0.10 ( 0%) wall    6833 kB ( 2%) ggc
 whopr wpa             :   1.54 ( 5%) usr   0.03 ( 2%) sys   1.56 ( 5%) wall    6940 kB ( 2%) ggc
 whopr wpa I/O         :   0.68 ( 2%) usr   0.75 (60%) sys   1.32 ( 4%) wall       0 kB ( 0%) ggc
 ipa reference         :   0.64 ( 2%) usr   0.02 ( 2%) sys   0.64 ( 2%) wall       0 kB ( 0%) ggc
 ipa profile           :   0.02 ( 0%) usr   0.00 ( 0%) sys   0.03 ( 0%) wall       0 kB ( 0%) ggc
 ipa pure const        :   0.39 ( 1%) usr   0.00 ( 0%) sys   0.39 ( 1%) wall       0 kB ( 0%) ggc
 inline heuristics     :   7.23 (24%) usr   0.00 ( 0%) sys   7.24 (23%) wall   29591 kB ( 7%) ggc
 callgraph verifier    :   0.05 ( 0%) usr   0.00 ( 0%) sys   0.06 ( 0%) wall       0 kB ( 0%) ggc
 varconst              :   0.00 ( 0%) usr   0.00 ( 0%) sys   0.02 ( 0%) wall       0 kB ( 0%) ggc
 TOTAL                 :  30.48             1.26            31.75             419615 kB

to

 garbage collection    :   0.32 ( 1%) usr   0.00 ( 0%) sys   0.32 ( 1%) wall       0 kB ( 0%) ggc
 callgraph optimization:   0.02 ( 0%) usr   0.00 ( 0%) sys   0.01 ( 0%) wall       0 kB ( 0%) ggc
 varpool construction  :   0.18 ( 1%) usr   0.00 ( 0%) sys   0.17 ( 1%) wall    1872 kB ( 0%) ggc
 ipa cp                :   0.28 ( 1%) usr   0.00 ( 0%) sys   0.28 ( 1%) wall   18176 kB ( 4%) ggc
 ipa lto gimple I/O    :   3.95 (15%) usr   0.16 (13%) sys   4.17 (15%) wall       0 kB ( 0%) ggc
 ipa lto decl I/O      :  14.10 (52%) usr   0.23 (18%) sys  14.37 (51%) wall  249135 kB (59%) ggc
 ipa lto decl init I/O :   0.40 ( 1%) usr   0.00 ( 0%) sys   0.40 ( 1%) wall   55385 kB (13%) ggc
 ipa lto cgraph I/O    :   0.16 ( 1%) usr   0.03 ( 2%) sys   0.20 ( 1%) wall   50861 kB (12%) ggc
 ipa lto decl merge    :   1.96 ( 7%) usr   0.09 ( 7%) sys   2.04 ( 7%) wall      29 kB ( 0%) ggc
 ipa lto cgraph merge  :   0.11 ( 0%) usr   0.01 ( 1%) sys   0.12 ( 0%) wall    6834 kB ( 2%) ggc
 whopr wpa             :   1.67 ( 6%) usr   0.01 ( 1%) sys   1.69 ( 6%) wall    7054 kB ( 2%) ggc
 whopr wpa I/O         :   0.75 ( 3%) usr   0.69 (55%) sys   1.42 ( 5%) wall       0 kB ( 0%) ggc
 ipa reference         :   0.77 ( 3%) usr   0.02 ( 2%) sys   0.76 ( 3%) wall       0 kB ( 0%) ggc
 ipa profile           :   0.02 ( 0%) usr   0.00 ( 0%) sys   0.03 ( 0%) wall       0 kB ( 0%) ggc
 ipa pure const        :   0.48 ( 2%) usr   0.00 ( 0%) sys   0.47 ( 2%) wall       0 kB ( 0%) ggc
 inline heuristics     :   1.66 ( 6%) usr   0.00 ( 0%) sys   1.66 ( 6%) wall   29870 kB ( 7%) ggc
 callgraph verifier    :   0.09 ( 0%) usr   0.00 ( 0%) sys   0.12 ( 0%) wall       0 kB ( 0%) ggc
 varconst              :   0.01 ( 0%) usr   0.02 ( 2%) sys   0.02 ( 0%) wall       0 kB ( 0%) ggc
 TOTAL                 :  27.02             1.26            28.34             420357 kB

The trick is to avoid recomputing keys when offline copy of the function was not eliminated.
Bootstrapped/regtested x86_64-linux, will commit it shortly.

Honza

	* ipa-inline.c (update_edge_key): Break out from ...
	(update_caller_keys): ... here;
	(update_callee_keys): Update only the edges from caller to callee.
	(update_all_callee_keys): Do what update_callee_keys did.
	(decide_inlining_of_small_functions): Avoid recomputing of all
	callees when badness increases.

Patch

Index: ipa-inline.c
===================================================================
--- ipa-inline.c	(revision 161672)
+++ ipa-inline.c	(working copy)
@@ -661,6 +661,30 @@ 
     return badness;
 }
 
+/* Recompute badness of EDGE and update its key in HEAP if needed.  */
+static void
+update_edge_key (fibheap_t heap, struct cgraph_edge *edge)
+{
+  int badness = cgraph_edge_badness (edge, false);
+  if (edge->aux)
+    {
+      fibnode_t n = (fibnode_t) edge->aux;
+      gcc_checking_assert (n->data == edge);
+
+      /* fibheap_replace_key only decrease the keys.
+	 When we increase the key we do not update heap
+	 and instead re-insert the element once it becomes
+	 a minium of heap.  */
+      if (badness < n->key)
+	{
+	  fibheap_replace_key (heap, n, badness);
+	  gcc_checking_assert (n->key == badness);
+	}
+    }
+  else
+    edge->aux = fibheap_insert (heap, badness, edge);
+}
+
 /* Recompute heap nodes for each of caller edge.  */
 
 static void
@@ -678,8 +702,6 @@ 
   bitmap_set_bit (updated_nodes, node->uid);
   node->global.estimated_growth = INT_MIN;
 
-  if (!node->local.inlinable)
-    return;
   /* See if there is something to do.  */
   for (edge = node->callers; edge; edge = edge->next_caller)
     if (edge->inline_failed)
@@ -702,28 +724,53 @@ 
 
   for (; edge; edge = edge->next_caller)
     if (edge->inline_failed)
+      update_edge_key (heap, edge);
+}
+
+/* Recompute heap nodes for each uninlined call.
+   This is used when we know that edge badnesses are going only to increase
+   (we introduced new call site) and thus all we need is to insert newly
+   created edges into heap.  */
+
+static void
+update_callee_keys (fibheap_t heap, struct cgraph_node *node,
+		    bitmap updated_nodes)
+{
+  struct cgraph_edge *e = node->callees;
+  node->global.estimated_growth = INT_MIN;
+
+  if (!e)
+    return;
+  while (true)
+    if (!e->inline_failed && e->callee->callees)
+      e = e->callee->callees;
+    else
       {
-	int badness = cgraph_edge_badness (edge, false);
-	if (edge->aux)
+	if (e->inline_failed
+	    && e->callee->local.inlinable
+	    && !bitmap_bit_p (updated_nodes, e->callee->uid))
 	  {
-	    fibnode_t n = (fibnode_t) edge->aux;
-	    gcc_assert (n->data == edge);
-	    if (n->key == badness)
-	      continue;
-
-	    /* fibheap_replace_key only decrease the keys.
-	       When we increase the key we do not update heap
-	       and instead re-insert the element once it becomes
-	       a minium of heap.  */
-	    if (badness < n->key)
+	    node->global.estimated_growth = INT_MIN;
+	    /* If function becomes uninlinable, we need to remove it from the heap.  */
+	    if (!cgraph_default_inline_p (e->callee, &e->inline_failed))
+	      update_caller_keys (heap, e->callee, updated_nodes);
+	    else
+	    /* Otherwise update just edge E.  */
+	      update_edge_key (heap, e);
+	  }
+	if (e->next_callee)
+	  e = e->next_callee;
+	else
+	  {
+	    do
 	      {
-		fibheap_replace_key (heap, n, badness);
-		gcc_assert (n->key == badness);
-	        continue;
+		if (e->caller == node)
+		  return;
+		e = e->caller->callers;
 	      }
+	    while (!e->next_callee);
+	    e = e->next_callee;
 	  }
-	else
-	  edge->aux = fibheap_insert (heap, badness, edge);
       }
 }
 
@@ -731,8 +778,8 @@ 
    Walk recursively into all inline clones.  */
 
 static void
-update_callee_keys (fibheap_t heap, struct cgraph_node *node,
-		    bitmap updated_nodes)
+update_all_callee_keys (fibheap_t heap, struct cgraph_node *node,
+			bitmap updated_nodes)
 {
   struct cgraph_edge *e = node->callees;
   node->global.estimated_growth = INT_MIN;
@@ -1166,7 +1213,7 @@ 
 	    continue;
 	  if (flag_indirect_inlining)
 	    add_new_edges_to_heap (heap, new_indirect_edges);
-          update_callee_keys (heap, where, updated_nodes);
+          update_all_callee_keys (heap, where, updated_nodes);
 	}
       else
 	{
@@ -1182,11 +1229,18 @@ 
 	      continue;
 	    }
 	  callee = edge->callee;
+	  gcc_checking_assert (!callee->global.inlined_to);
 	  cgraph_mark_inline_edge (edge, true, &new_indirect_edges);
 	  if (flag_indirect_inlining)
 	    add_new_edges_to_heap (heap, new_indirect_edges);
 
-	  update_callee_keys (heap, callee, updated_nodes);
+	  /* We inlined last offline copy to the body.  This might lead
+	     to callees of function having fewer call sites and thus they
+	     may need updating.  */
+	  if (callee->global.inlined_to)
+	    update_all_callee_keys (heap, callee, updated_nodes);
+	  else
+	    update_callee_keys (heap, edge->callee, updated_nodes);
 	}
       where = edge->caller;
       if (where->global.inlined_to)