diff mbox

PR56572 flatten unnecessary nested transactions after inlining

Message ID 52CEEB06.9060503@redhat.com
State New
Headers show

Commit Message

Aldy Hernandez Jan. 9, 2014, 6:31 p.m. UTC
On 01/06/14 13:40, Richard Henderson wrote:
> On 12/19/2013 11:06 AM, Richard Biener wrote:
>> Aldy Hernandez <aldyh@redhat.com> wrote:
>>> I'd still like to catch the common cases, like I do with this patch.
>>>
>>> Perhaps we move this code to the .tmmark pass and handle the
>>> uninstrumented case.  rth?
>
> tmmark is way way later than you'd want.  I believe that ipa_tm is the right
> place.  That's where we generate clones.  The clones know a-priori that they're
> called within a transaction and thus all internal transations may be
> eliminated.  And thus any inlining that would happen after ipa_tm would
> properly inline the clone, and thus no more need be done.

I have a patch (attached for reference) removing the nested transactions 
while we are creating the clones (as suggested), but the uninstrumented 
code path complicates things.  I'm afraid I don't have any good news.

Consider this:

inline void f() {
   __transaction_atomic {
       a = 12345;
   }
}

void g() {
   __transaction_atomic {
     f();
   }
}

The problem is that when we add the uninstrumented code path later in 
tmipa, we end up with the following for g():

g ()
{
   <bb 2>:
   __transaction_atomic  // SUBCODE=[ GTMA_HAVE_LOAD GTMA_HAVE_STORE ]
   goto <bb 3>;

   <bb 5>:
   f ();            /* <---- uninstrumented path */
   __builtin__ITM_commitTransaction ();
   goto <bb 4>;

   <bb 3>:
   f (); [tm-clone]    /* <---- instrumented path */
   __builtin__ITM_commitTransaction ();

   <bb 4>:
   return;

}

Since we only removed the transaction in the clone of f(), plain regular 
f() will still have the additional transaction, so inlining will still 
yield a g() with a nested transaction in the uninstrumented path.

So we're back to square one, needing a separate pass to remove the 
nested transactions, and this pass will unfortunately have to deal with 
the uninstrumented/instrumented paths.

This has taken longer to fix than I expected, so I'm going to put this 
aside for now and concentrate on some P1-P2's.  For the record, since 
you don't like this pass in the .tmmark pass which is WAY late, can we 
have a tree pass right after the IPA passes (thus after inlining)?

I'll add some notes to the PR so we can pick this up later.

Aldy
diff mbox

Patch

diff --git a/gcc/trans-mem.c b/gcc/trans-mem.c
index fe6dc28..59b589c 100644
--- a/gcc/trans-mem.c
+++ b/gcc/trans-mem.c
@@ -1,5 +1,7 @@ 
 /* Passes for transactional memory support.
    Copyright (C) 2008-2014 Free Software Foundation, Inc.
+   Contributed by Richard Henderson <rth@redhat.com> and
+   		  Aldy Hernandez <aldyh@redhat.com>.
 
    This file is part of GCC.
 
@@ -4106,8 +4108,8 @@  maybe_push_queue (struct cgraph_node *node,
    code path.  QUEUE are the basic blocks inside the transaction
    represented in REGION.
 
-   Later in split_code_paths() we will add the conditional to choose
-   between the two alternatives.  */
+   Later in the tmmark pass (expand_transaction) we will add the
+   conditional to choose between the two alternatives.  */
 
 static void
 ipa_uninstrument_transaction (struct tm_region *region,
@@ -4192,29 +4194,11 @@  ipa_tm_scan_calls_transaction (struct tm_ipa_cg_data *d,
       bbs = get_tm_region_blocks (r->entry_block, r->exit_blocks, NULL,
 				  d->transaction_blocks_normal, false);
 
-      // Generate the uninstrumented code path for this transaction.
-      ipa_uninstrument_transaction (r, bbs);
-
       FOR_EACH_VEC_ELT (bbs, i, bb)
 	ipa_tm_scan_calls_block (callees_p, bb, false);
 
       bbs.release ();
     }
-
-  // ??? copy_bbs should maintain cgraph edges for the blocks as it is
-  // copying them, rather than forcing us to do this externally.
-  rebuild_cgraph_edges ();
-
-  // ??? In ipa_uninstrument_transaction we don't try to update dominators
-  // because copy_bbs doesn't return a VEC like iterate_fix_dominators expects.
-  // Instead, just release dominators here so update_ssa recomputes them.
-  free_dominance_info (CDI_DOMINATORS);
-
-  // When building the uninstrumented code path, copy_bbs will have invoked
-  // create_new_def_for starting an "ssa update context".  There is only one
-  // instance of this context, so resolve ssa updates before moving on to
-  // the next function.
-  update_ssa (TODO_update_ssa);
 }
 
 /* Scan all calls in NODE as if this is the transactional clone,
@@ -4890,10 +4874,11 @@  ipa_tm_create_version_alias (struct cgraph_node *node, void *data)
   return false;
 }
 
-/* Create a copy of the function (possibly declaration only) of OLD_NODE,
-   appropriate for the transactional clone.  */
+/* Create a copy of the function (possibly declaration only) of
+   OLD_NODE, appropriate for the transactional clone.  Returns the
+   cgraph node for the newly created clone.  */
 
-static void
+static struct cgraph_node *
 ipa_tm_create_version (struct cgraph_node *old_node)
 {
   tree new_decl, old_decl, tm_name;
@@ -4947,13 +4932,12 @@  ipa_tm_create_version (struct cgraph_node *old_node)
     ipa_tm_mark_forced_by_abi_node (new_node);
 
   /* Do the same thing, but for any aliases of the original node.  */
-  {
-    struct create_version_alias_info data;
-    data.old_node = old_node;
-    data.new_decl = new_decl;
-    cgraph_for_node_and_aliases (old_node, ipa_tm_create_version_alias,
-				 &data, true);
-  }
+  struct create_version_alias_info data;
+  data.old_node = old_node;
+  data.new_decl = new_decl;
+  cgraph_for_node_and_aliases (old_node, ipa_tm_create_version_alias,
+			       &data, true);
+  return new_node;
 }
 
 /* Construct a call to TM_IRREVOCABLE and insert it at the beginning of BB.  */
@@ -5275,16 +5259,14 @@  ipa_tm_transform_transaction (struct cgraph_node *node)
   pop_cfun ();
 }
 
-/* Transform the calls within the transactional clone of NODE.  */
+/* Transform the calls within the transactional clone of NODE.  D is
+   the IPA auxiliary data for the NODE.  */
 
 static void
-ipa_tm_transform_clone (struct cgraph_node *node)
+ipa_tm_transform_clone (struct cgraph_node *node, struct tm_ipa_cg_data *d)
 {
-  struct tm_ipa_cg_data *d;
   bool need_ssa_rename;
 
-  d = get_cg_data (&node, true);
-
   /* If this function makes no calls and has no irrevocable blocks,
      then there's nothing to do.  */
   /* ??? Remove non-aborting top-level transactions.  */
@@ -5305,6 +5287,100 @@  ipa_tm_transform_clone (struct cgraph_node *node)
   pop_cfun ();
 }
 
+/* Callback for expand_regions.
+
+   Given a transaction in REGION, remove the GIMPLE_TRANSACTION and
+   its corresponding BUILT_IN_TM_COMMIT*.  The tm_region itself is
+   removed by the caller.  */
+
+static void *
+remove_transaction (struct tm_region *region, void *data ATTRIBUTE_UNUSED)
+{
+  // Remove the transaction.
+  if (dump_file)
+    fprintf (dump_file, "Folding unnecessary inner transaction in BB%d\n",
+	     gimple_bb (region->transaction_stmt)->index);
+  gimple_stmt_iterator gsi = gsi_for_stmt (region->transaction_stmt);
+  gsi_remove (&gsi, true);
+
+  // Find the corresponding BUILT_IN_TM_COMMIT* and remove it as well.
+  if (region->exit_blocks)
+    {
+      bitmap_iterator bi;
+      unsigned i;
+
+      EXECUTE_IF_SET_IN_BITMAP (region->exit_blocks, 0, i, bi)
+	{
+	  gimple_stmt_iterator gsi
+	    = gsi_last_bb (BASIC_BLOCK_FOR_FN (cfun, i));
+	  gimple stmt = gsi_stmt (gsi);
+	  if (gimple_code (stmt) != GIMPLE_CALL)
+	    continue;
+	  tree fndecl = gimple_call_fndecl (stmt);
+	  if (fndecl
+	      && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
+	      && (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_TM_COMMIT
+		  || DECL_FUNCTION_CODE (fndecl) == BUILT_IN_TM_COMMIT_EH))
+	    {
+	      /* We technically don't need to do this because our
+		 caller (ipa_tm_execute) is brain dead and calls
+		 rebuild_cgraph_edges, but it's proper form.
+		 Eventually we'd like not to call
+		 rebuild_cgraph_edges() and have everyone clean up
+		 their own mess, like we do here.  */
+	      struct cgraph_node *cfun_node
+		= cgraph_get_node (current_function_decl);
+	      cgraph_remove_edge (cgraph_edge (cfun_node, stmt));
+
+	      tree vdef = gimple_vdef (stmt);
+	      tree vuse = gimple_vuse (stmt);
+	      if (vdef && vuse)
+		replace_uses_by (vdef, vuse);
+	      gsi_remove (&gsi, true);
+	    }
+	}
+    }
+  return NULL;
+}
+
+/* A helper function for flatten_nested_transactions.
+
+   Starting at TOP, go through all the transactional regions and
+   remove any nested transactions that are redundant, removing both
+   the start/end of a transaction.  */
+
+static void
+flatten_nested_transactions_1 (struct tm_region *top)
+{
+  for (struct tm_region *region = top; region; region = region->next)
+    {
+      if (region->inner)
+	flatten_nested_transactions_1 (region->inner);
+
+      /* Transactions which have no explicit aborts can be removed.  */
+      unsigned int sub = gimple_transaction_subcode (region->transaction_stmt);
+      if (!(sub & GTMA_HAVE_ABORT))
+	expand_regions (region, remove_transaction, NULL, false);
+    }
+}
+
+/* Remove any transactions in the given CLONE that are redundant.
+   This is done by removing the GIMPLE_TRANSACTION +
+   BUILT_IN_TM_COMMIT* pairs.  */
+
+static void
+flatten_nested_transactions (struct cgraph_node *clone)
+{
+  /* Initialize all_tm_regions and friends for this clone.  */
+  push_cfun (DECL_STRUCT_FUNCTION (clone->decl));
+  calculate_dominance_info (CDI_DOMINATORS);
+  tm_region_init (NULL);
+
+  flatten_nested_transactions_1 (all_tm_regions);
+
+  pop_cfun ();
+}
+
 /* Main entry point for the transactional memory IPA pass.  */
 
 static unsigned int
@@ -5352,14 +5428,12 @@  ipa_tm_execute (void)
 
 	push_cfun (DECL_STRUCT_FUNCTION (node->decl));
 	calculate_dominance_info (CDI_DOMINATORS);
-
 	tm_region_init (NULL);
 	if (all_tm_regions)
 	  {
 	    d = get_cg_data (&node, true);
 
-	    /* Scan for calls that are in each transaction, and
-	       generate the uninstrumented code path.  */
+	    /* Scan for calls that are in each transaction.  */
 	    ipa_tm_scan_calls_transaction (d, &tm_callees);
 
 	    /* Put it in the worklist so we can scan the function
@@ -5537,9 +5611,49 @@  ipa_tm_execute (void)
 	doit = true;
 
       if (doit)
-	ipa_tm_create_version (node);
+	{
+	  struct cgraph_node *new_node = ipa_tm_create_version (node);
+	  flatten_nested_transactions (new_node);
+	}
     }
 
+  FOR_EACH_DEFINED_FUNCTION (node)
+    if (node->lowered
+	&& cgraph_function_body_availability (node) >= AVAIL_OVERWRITABLE)
+      {
+	push_cfun (DECL_STRUCT_FUNCTION (node->decl));
+	calculate_dominance_info (CDI_DOMINATORS);
+	tm_region_init (NULL);
+
+	for (struct tm_region *r = all_tm_regions; r; r = r->next)
+	  {
+	    struct tm_ipa_cg_data *d = get_cg_data (&node, true);
+	    vec<basic_block> bbs
+	      = get_tm_region_blocks (r->entry_block, r->exit_blocks, NULL,
+				      d->transaction_blocks_normal, false);
+	    ipa_uninstrument_transaction (r, bbs);
+	  }
+
+	// ??? copy_bbs should maintain cgraph edges for the blocks as
+	// it is copying them, rather than forcing us to do this
+	// externally.
+	rebuild_cgraph_edges ();
+
+	// ??? In ipa_uninstrument_transaction we don't try to update
+	// dominators because copy_bbs doesn't return a VEC like
+	// iterate_fix_dominators expects.  Instead, just release
+	// dominators here so update_ssa recomputes them.
+	free_dominance_info (CDI_DOMINATORS);
+
+	// When building the uninstrumented code path, copy_bbs will
+	// have invoked create_new_def_for starting an "ssa update
+	// context".  There is only one instance of this context, so
+	// resolve ssa updates before moving on to the next function.
+	update_ssa (TODO_update_ssa);
+
+	pop_cfun ();
+      }
+
   /* Redirect calls to the new clones, and insert irrevocable marks.  */
   for (i = 0; i < tm_callees.length (); ++i)
     {
@@ -5548,7 +5662,7 @@  ipa_tm_execute (void)
 	{
 	  d = get_cg_data (&node, true);
 	  if (d->clone)
-	    ipa_tm_transform_clone (node);
+	    ipa_tm_transform_clone (node, d);
 	}
     }
   FOR_EACH_DEFINED_FUNCTION (node)