diff mbox series

More speedups

Message ID nycvar.YFH.7.76.1911211154530.5566@zhemvz.fhfr.qr
State New
Headers show
Series More speedups | expand

Commit Message

Richard Biener Nov. 21, 2019, 11:58 a.m. UTC
The following avoids heap allocations for setting up the loop iterator
in most cases.  It also avoids re-allocating the PHI vector all the
time in mark_phi_for_rewrite.  Plus it fixes some of the slowness
we observe in compute_idf which uses a) bad iteration order and
b) a worklist implementation not avoiding duplicates.  I see up
to 30% duplicate blocks on the work vector plus we seed the
work vector in a way that later blocks are visited first
(but only ever later blocks are queued so first first is more
efficient).

The vec.h hunk is to aovid a bootstrap fail due to a
-Wfree-nonheap-object warning from the loop iterator change.
IIRC we saw such thing in the past and Honza also mentioned this
recently.

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

Richard.

2019-11-21  Richard Biener  <rguenther@suse.de>

	* cfgloop.h (loop_iterator::~loop_iterator): Remove.
	(loop_iterator::to_visit): Use an auto_vec with internal storage.
	(loop_iterator::loop_iterator): Adjust.
	* cfganal.c (compute_dominance_frontiers_1): Fold into...
	(compute_dominance_frontiers): ... this.  Hoist invariant
	get_immediate_dominator call.
	(compute_idf): Use a work-set instead of a work-list for more
	optimal iteration order and duplicate avoidance.
	* tree-into-ssa.c (mark_phi_for_rewrite): Avoid re-allocating
	the vector all the time, instead pre-allocate the vector only
	once.
	(delete_update_ssa): Simplify.
	* vec.h (va_heap::release): Disable -Wfree-nonheap-object around it.
diff mbox series

Patch

Index: gcc/cfgloop.h
===================================================================
--- gcc/cfgloop.h	(revision 278496)
+++ gcc/cfgloop.h	(working copy)
@@ -661,7 +661,6 @@  class loop_iterator
 {
 public:
   loop_iterator (function *fn, loop_p *loop, unsigned flags);
-  ~loop_iterator ();
 
   inline loop_p next ();
 
@@ -669,7 +668,7 @@  public:
   function *fn;
 
   /* The list of loops to visit.  */
-  vec<int> to_visit;
+  auto_vec<int, 16> to_visit;
 
   /* The index of the actual loop.  */
   unsigned idx;
@@ -702,12 +701,11 @@  loop_iterator::loop_iterator (function *
   this->fn = fn;
   if (!loops_for_fn (fn))
     {
-      this->to_visit.create (0);
       *loop = NULL;
       return;
     }
 
-  this->to_visit.create (number_of_loops (fn));
+  this->to_visit.reserve_exact (number_of_loops (fn));
   mn = (flags & LI_INCLUDE_ROOT) ? 0 : 1;
 
   if (flags & LI_ONLY_INNERMOST)
@@ -769,12 +767,6 @@  loop_iterator::loop_iterator (function *
   *loop = this->next ();
 }
 
-inline
-loop_iterator::~loop_iterator ()
-{
-  this->to_visit.release ();
-}
-
 #define FOR_EACH_LOOP(LOOP, FLAGS) \
   for (loop_iterator li(cfun, &(LOOP), FLAGS); \
        (LOOP); \
Index: gcc/cfganal.c
===================================================================
--- gcc/cfganal.c	(revision 278496)
+++ gcc/cfganal.c	(working copy)
@@ -1326,10 +1327,11 @@  dfs_enumerate_from (basic_block bb, int
    of the dominance frontiers, no more, no less.
 */
 
-
-static void
-compute_dominance_frontiers_1 (bitmap_head *frontiers)
+void
+compute_dominance_frontiers (bitmap_head *frontiers)
 {
+  timevar_push (TV_DOM_FRONTIERS);
+
   edge p;
   edge_iterator ei;
   basic_block b;
@@ -1337,34 +1339,22 @@  compute_dominance_frontiers_1 (bitmap_he
     {
       if (EDGE_COUNT (b->preds) >= 2)
 	{
+	  basic_block domsb = get_immediate_dominator (CDI_DOMINATORS, b);
 	  FOR_EACH_EDGE (p, ei, b->preds)
 	    {
 	      basic_block runner = p->src;
-	      basic_block domsb;
 	      if (runner == ENTRY_BLOCK_PTR_FOR_FN (cfun))
 		continue;
 
-	      domsb = get_immediate_dominator (CDI_DOMINATORS, b);
 	      while (runner != domsb)
 		{
-		  if (!bitmap_set_bit (&frontiers[runner->index],
-				       b->index))
+		  if (!bitmap_set_bit (&frontiers[runner->index], b->index))
 		    break;
-		  runner = get_immediate_dominator (CDI_DOMINATORS,
-						    runner);
+		  runner = get_immediate_dominator (CDI_DOMINATORS, runner);
 		}
 	    }
 	}
     }
-}
-
-
-void
-compute_dominance_frontiers (bitmap_head *frontiers)
-{
-  timevar_push (TV_DOM_FRONTIERS);
-
-  compute_dominance_frontiers_1 (frontiers);
 
   timevar_pop (TV_DOM_FRONTIERS);
 }
@@ -1385,25 +1375,26 @@  compute_idf (bitmap def_blocks, bitmap_h
   unsigned bb_index, i;
   bitmap phi_insertion_points;
 
-  /* Each block can appear at most twice on the work-stack.  */
-  auto_vec<int> work_stack (2 * n_basic_blocks_for_fn (cfun));
   phi_insertion_points = BITMAP_ALLOC (NULL);
 
-  /* Seed the work list with all the blocks in DEF_BLOCKS.  We use
-     vec::quick_push here for speed.  This is safe because we know that
-     the number of definition blocks is no greater than the number of
-     basic blocks, which is the initial capacity of WORK_STACK.  */
-  EXECUTE_IF_SET_IN_BITMAP (def_blocks, 0, bb_index, bi)
-    work_stack.quick_push (bb_index);
+  /* Seed the work set with all the blocks in DEF_BLOCKS.  */
+  auto_bitmap work_set;
+  bitmap_copy (work_set, def_blocks);
+  bitmap_tree_view (work_set);
 
-  /* Pop a block off the worklist, add every block that appears in
+  /* Pop a block off the workset, add every block that appears in
      the original block's DF that we have not already processed to
-     the worklist.  Iterate until the worklist is empty.   Blocks
-     which are added to the worklist are potential sites for
+     the workset.  Iterate until the workset is empty.   Blocks
+     which are added to the workset are potential sites for
      PHI nodes.  */
-  while (work_stack.length () > 0)
+  while (!bitmap_empty_p (work_set))
     {
-      bb_index = work_stack.pop ();
+      /* The dominance frontier of a block is blocks after it so iterating
+         on earlier blocks first is better.
+	 ???  Basic blocks are by no means guaranteed to be ordered in
+	 optimal order for this iteration.  */
+      bb_index = bitmap_first_set_bit (work_set);
+      bitmap_clear_bit (work_set, bb_index);
 
       /* Since the registration of NEW -> OLD name mappings is done
 	 separately from the call to update_ssa, when updating the SSA
@@ -1416,7 +1407,7 @@  compute_idf (bitmap def_blocks, bitmap_h
       EXECUTE_IF_AND_COMPL_IN_BITMAP (&dfs[bb_index], phi_insertion_points,
 	                              0, i, bi)
 	{
-	  work_stack.quick_push (i);
+	  bitmap_set_bit (work_set, i);
 	  bitmap_set_bit (phi_insertion_points, i);
 	}
     }
Index: gcc/tree-into-ssa.c
===================================================================
--- gcc/tree-into-ssa.c	(revision 278496)
+++ gcc/tree-into-ssa.c	(working copy)
@@ -940,14 +940,18 @@  mark_phi_for_rewrite (basic_block bb, gp
   if (!blocks_with_phis_to_rewrite)
     return;
 
-  bitmap_set_bit (blocks_with_phis_to_rewrite, idx);
-
-  n = (unsigned) last_basic_block_for_fn (cfun) + 1;
-  if (phis_to_rewrite.length () < n)
-    phis_to_rewrite.safe_grow_cleared (n);
-
-  phis = phis_to_rewrite[idx];
-  phis.reserve (10);
+  if (bitmap_set_bit (blocks_with_phis_to_rewrite, idx))
+    {
+      n = (unsigned) last_basic_block_for_fn (cfun) + 1;
+      if (phis_to_rewrite.length () < n)
+	phis_to_rewrite.safe_grow_cleared (n);
+
+      phis = phis_to_rewrite[idx];
+      gcc_assert (!phis.exists ());
+      phis.create (10);
+    }
+  else
+    phis = phis_to_rewrite[idx];
 
   phis.safe_push (phi);
   phis_to_rewrite[idx] = phis;
@@ -2937,11 +2941,7 @@  delete_update_ssa (void)
 
   if (blocks_with_phis_to_rewrite)
     EXECUTE_IF_SET_IN_BITMAP (blocks_with_phis_to_rewrite, 0, i, bi)
-      {
-	vec<gphi *> phis = phis_to_rewrite[i];
-	phis.release ();
-	phis_to_rewrite[i].create (0);
-      }
+      phis_to_rewrite[i].release ();
 
   BITMAP_FREE (blocks_with_phis_to_rewrite);
   BITMAP_FREE (blocks_to_update);
Index: gcc/vec.h
===================================================================
--- gcc/vec.h	(revision 278548)
+++ gcc/vec.h	(working copy)
@@ -295,6 +295,11 @@  va_heap::reserve (vec<T, va_heap, vl_emb
 }
 
 
+#if GCC_VERSION >= 4007
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wfree-nonheap-object"
+#endif
+
 /* Free the heap space allocated for vector V.  */
 
 template<typename T>
@@ -312,6 +317,9 @@  va_heap::release (vec<T, va_heap, vl_emb
   v = NULL;
 }
 
+#if GCC_VERSION >= 4007
+#pragma GCC diagnostic pop
+#endif
 
 /* Allocator type for GC vectors.  Notice that we need the structure
    declaration even if GC is not enabled.  */