diff mbox

[gomp4] optimize routine calling

Message ID 55DCA17E.2020301@acm.org
State New
Headers show

Commit Message

Nathan Sidwell Aug. 25, 2015, 5:10 p.m. UTC
When forking to call a partitioned routine there is no need to propagate local 
state from the active thread to the forked threads.

I've committed this patch to implement that optimization.

nathan
diff mbox

Patch

2015-08-25  Nathan Sidwell  <nathan@codesourcery.com>

	* config/nvptx/nvptx.c (nvptx_emit_forking): Add is_call argument,
	propagate it into mask.
	(nvptx_emit_joining): Likewise.
	(nvptx_expand_call): Move emit_forking call to later.  Add is_call
	argument.
	(nvptx_expand_oacc_fork, nvptx_expand_oacc_join): Asjust calls.
	(nvptx_discover_pars): Don't look for predecessor insn in call
	forks and joins.
	(nvptx_process_pars): Don't emit propagation code for a call.

Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c	(revision 227159)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -1047,16 +1047,16 @@  nvptx_expand_compare (rtx compare)
 /* Emit forking instructions for MASK.  */
 
 static void
-nvptx_emit_forking (unsigned mask)
+nvptx_emit_forking (unsigned mask, bool is_call)
 {
   mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
 	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
   if (mask)
     {
-      rtx op = GEN_INT (mask);
+      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
       
       /* Emit fork for worker level.  */
-      if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+      if (!is_call && mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
 	emit_insn (gen_nvptx_fork (op));
       emit_insn (gen_nvptx_forked (op));
     }
@@ -1065,16 +1065,19 @@  nvptx_emit_forking (unsigned mask)
 /* Emit joining instructions for MASK.  */
 
 static void
-nvptx_emit_joining (unsigned mask)
+nvptx_emit_joining (unsigned mask, bool is_call)
 {
   mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
 	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
   if (mask)
     {
-      rtx op = GEN_INT (mask);
+      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
 
-      /* Emit joining for all pars.  */
-      emit_insn (gen_nvptx_joining (op));
+      /* Emit joining for all non-call pars to ensure there's a single
+	 predecessor for the block the join insn ends up in.  This is
+	 needed for skipping entire loops.  */
+      if (!is_call)
+	emit_insn (gen_nvptx_joining (op));
       emit_insn (gen_nvptx_join (op));
     }
 }
@@ -1135,8 +1138,6 @@  nvptx_expand_call (rtx retval, rtx addre
 	}
     }
 
-  nvptx_emit_forking (parallel);
-
   if (cfun->machine->funtype
       /* It's possible to construct testcases where we call a variable.
 	 See compile/20020129-1.c.  stdarg_p will crash so avoid calling it
@@ -1195,11 +1196,12 @@  nvptx_expand_call (rtx retval, rtx addre
 	  write_func_decl_from_insn (func_decls, retval, pat, callee);
 	}
     }
+  nvptx_emit_forking (parallel, true);
   emit_call_insn (pat);
   if (tmp_retval != retval)
     emit_move_insn (retval, tmp_retval);
 
-  nvptx_emit_joining (parallel);
+  nvptx_emit_joining (parallel, true);
 }
 
 /* Expand the oacc fork & join primitive into ptx-required unspecs.  */
@@ -1207,13 +1209,13 @@  nvptx_expand_call (rtx retval, rtx addre
 void
 nvptx_expand_oacc_fork (unsigned mode)
 {
-  nvptx_emit_forking (GOMP_DIM_MASK (mode));
+  nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
 }
 
 void
 nvptx_expand_oacc_join (unsigned mode)
 {
-  nvptx_emit_joining (GOMP_DIM_MASK (mode));
+  nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
 }
 
 /* Expander for reduction locking and unlocking.  We expect SRC to be
@@ -2611,7 +2613,8 @@  nvptx_discover_pars (bb_insn_map_t *map)
 		l = new parallel (l, mask);
 		l->forked_block = block;
 		l->forked_insn = end;
-		if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+		if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
+		    && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
 		  l->fork_insn
 		    = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
 	      }
@@ -2626,7 +2629,8 @@  nvptx_discover_pars (bb_insn_map_t *map)
 		gcc_assert (l->mask == mask);
 		l->join_block = block;
 		l->join_insn = end;
-		if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+		if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
+		    && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
 		  l->joining_insn
 		    = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
 		l = l->parent;
@@ -3013,7 +3017,9 @@  nvptx_process_pars (parallel *par)
       inner_mask |= par->inner_mask;
     }
 
-  if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+  if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
+    { /* No propagation needed for a call.  */ }
+  else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
     {
       nvptx_wpropagate (false, par->forked_block, par->forked_insn);
       nvptx_wpropagate (true, par->forked_block, par->fork_insn);