diff mbox

[gomp4] More openacc loop indirection

Message ID 561CFCF2.2070705@acm.org
State New
Headers show

Commit Message

Nathan Sidwell Oct. 13, 2015, 12:45 p.m. UTC
I've committed this next patch in my series to move loop partitioning decisions
to the target compiler.

It introduces 2 more IFN_UNIQUE cases, marking the head and tail sequences of an
OpenACC loop.  These are added around the reduction and fork/join regions.  In
the oacc_device_lower pass we use these markers to reconstruct the OpenACC
partitioning regions (their unique property permits this, in the same way the
PTX backend uses the fork/join markers themselves).  Then we scan over the head
and tail sequences, setting the partitioning level.

This patch still doesn't actually defer the partitioning decision -- it's putting
in place the machinery to allow such deferral.  I expect the next patch to complete
the transition.

nathan
diff mbox

Patch

Index: gcc/internal-fn.def
===================================================================
--- gcc/internal-fn.def	(revision 228713)
+++ gcc/internal-fn.def	(working copy)
@@ -78,6 +78,10 @@  DEF_INTERNAL_FN (UNIQUE, ECF_NOTHROW | E
    indicating the axis of forking or joining and return nothing.  */
 #define IFN_UNIQUE_OACC_FORK 1
 #define IFN_UNIQUE_OACC_JOIN 2
+/* HEAD_MARK and TAIL_MARK are used to demark the sequence entering or
+   leaving partitioned execution.  */
+#define IFN_UNIQUE_OACC_HEAD_MARK 3
+#define IFN_UNIQUE_OACC_TAIL_MARK 4
 
 /* DIM_SIZE and DIM_POS return the size of a particular compute
    dimension and the executing thread's position within that
Index: gcc/omp-low.c
===================================================================
--- gcc/omp-low.c	(revision 228713)
+++ gcc/omp-low.c	(working copy)
@@ -236,6 +236,26 @@  struct omp_for_data
   struct omp_for_data_loop *loops;
 };
 
+/* Describe the OpenACC looping structure of a function.  The entire
+   function is held in a 'NULL' loop.  */
+
+struct oacc_loop
+{
+  oacc_loop *parent; /* Containing loop.  */
+
+  oacc_loop *child; /* First inner loop.  */
+
+  oacc_loop *sibling; /* Next loop within same parent.  */
+
+  location_t loc; /* Location of the loop start.  */
+
+  /* Start of head and tail.  */
+  gcall *head;  /* Head marker function. */
+  gcall *tail;  /* Tail marker function.  */
+
+  /* Partitioning level.  */
+  unsigned level;
+};
 
 static splay_tree all_contexts;
 static int taskreg_nesting_level;
@@ -4737,11 +4757,12 @@  expand_oacc_get_thread_num (gimple_seq *
   return res;
 }
 
-/* Lower the OpenACC reductions of CLAUSES for compute axis DIM.  INNER
-   is true if this is an inner axis of a multi-axis loop.  FORK and
-   JOIN are (optional) fork and join markers.  Generate the
-   before-loop forking sequence in FORK_SEQ and the after-loop joining
-   sequence to JOIN_SEQ.  The general form of these sequences is
+/* Lower the OpenACC reductions of CLAUSES for compute axis LEVEL
+   (which might be a placeholder).  INNER is true if this is an inner
+   axis of a multi-axis loop.  FORK and JOIN are (optional) fork and
+   join markers.  Generate the before-loop forking sequence in
+   FORK_SEQ and the after-loop joining sequence to JOIN_SEQ.  The
+   general form of these sequences is
 
      GOACC_REDUCTION_SETUP
      GOACC_FORK
@@ -4752,7 +4773,7 @@  expand_oacc_get_thread_num (gimple_seq *
      GOACC_REDUCTION_TEARDOWN.  */
 
 static void
-lower_oacc_reductions (location_t loc, tree clauses, unsigned dim, bool inner,
+lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
 		       gcall *fork, gcall *join, gimple_seq *fork_seq,
 		       gimple_seq *join_seq, omp_context *ctx)
 {
@@ -4764,7 +4785,6 @@  lower_oacc_reductions (location_t loc, t
   gimple_seq after_join = NULL;
   unsigned count = 0;
   tree lid = build_int_cst (unsigned_type_node, oacc_lid++);
-  tree level = build_int_cst (unsigned_type_node, dim);
 
   for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
     if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION)
@@ -4866,6 +4886,22 @@  lower_oacc_reductions (location_t loc, t
   gimple_seq_add_seq (join_seq, after_join);
 }
 
+/* Emit an OpenACC loop head (if HEAD) or tail marker to SEQ.  LEVEL, if
+   not NULL_TREE, is the partitioning level of the enclosed region.  */
+
+static void
+lower_oacc_loop_marker (location_t loc, bool head, tree level,
+			gimple_seq *seq)
+{
+  tree marker = build_int_cst
+    (integer_type_node, (head ? IFN_UNIQUE_OACC_HEAD_MARK
+			 : IFN_UNIQUE_OACC_TAIL_MARK));
+  gcall *call = gimple_build_call_internal
+    (IFN_UNIQUE, 1 + (level != NULL_TREE), marker, level);
+  gimple_set_location (call, loc);
+  gimple_seq_add_stmt (seq, call);
+}
+
 /* Generate the before and after OpenACC loop sequences.  CLAUSES are
    the loop clauses, from which we extract reductions.  Initialize
    HEAD and TAIL.  */
@@ -4884,19 +4920,25 @@  lower_oacc_head_tail (location_t loc, tr
   for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
     if (mask & GOMP_DIM_MASK (ix))
       {
-	tree level = build_int_cst (unsigned_type_node, ix);
+	tree place = build_int_cst (integer_type_node, -1);
+	tree level = build_int_cst (integer_type_node, ix);
 	gcall *fork = gimple_build_call_internal
 	  (IFN_UNIQUE, 2,
-	   build_int_cst (unsigned_type_node, IFN_UNIQUE_OACC_FORK), level);
+	   build_int_cst (unsigned_type_node, IFN_UNIQUE_OACC_FORK), place);
 	gcall *join = gimple_build_call_internal
 	  (IFN_UNIQUE, 2,
-	   build_int_cst (unsigned_type_node, IFN_UNIQUE_OACC_JOIN), level);
+	   build_int_cst (unsigned_type_node, IFN_UNIQUE_OACC_JOIN), place);
 	gimple_seq fork_seq = NULL;
 	gimple_seq join_seq = NULL;
 
 	gimple_set_location (fork, loc);
 	gimple_set_location (join, loc);
-	lower_oacc_reductions (loc, clauses, ix, inner,
+
+	/* Mark the beginning of this level sequence.  */
+	lower_oacc_loop_marker (loc, true, level, &fork_seq);
+	lower_oacc_loop_marker (loc, false, level, &join_seq);
+
+	lower_oacc_reductions (loc, clauses, place, inner,
 			       fork, join, &fork_seq, &join_seq,  ctx);
 
 	/* Append this level to head. */
@@ -4907,6 +4949,10 @@  lower_oacc_head_tail (location_t loc, tr
 
 	inner = true;
       }
+
+  /* Mark the end of the sequence.  */
+  lower_oacc_loop_marker (loc, true, NULL_TREE, head);
+  lower_oacc_loop_marker (loc, false, NULL_TREE, tail);
 }
 
 /* Generate code to implement the REDUCTION clauses.  OpenACC reductions
@@ -12613,9 +12659,9 @@  lower_omp_target (gimple_stmt_iterator *
     /* If there are reductions on the offloaded region itself, treat
        them as a dummy GANG loop.  */
     lower_oacc_reductions (gimple_location (ctx->stmt), clauses,
-			   GOMP_DIM_GANG, false, NULL, NULL,
-			   &irlist, &orlist, ctx);
-  
+			   build_int_cst (unsigned_type_node, GOMP_DIM_GANG),
+			   false, NULL, NULL, &irlist, &orlist, ctx);
+
   if (offloaded)
     {
       /* Declare all the variables created by mapping and the variables
@@ -15522,6 +15568,286 @@  oacc_validate_dims (tree fn, tree attrs,
   return fn_level;
 }
 
+/* Create an empty OpenACC loop structure at LOC.  */
+
+static oacc_loop *
+new_oacc_loop_raw (oacc_loop *parent, location_t loc)
+{
+  oacc_loop *loop = XCNEW (oacc_loop);
+
+  loop->parent = parent;
+  loop->child = loop->sibling = NULL;
+
+  if (parent)
+    {
+      loop->sibling = parent->child;
+      parent->child = loop;
+    }
+
+  loop->head = loop->tail = NULL;
+  
+  loop->loc = loc;
+  
+  loop->level = 0;
+
+  return loop;
+}
+
+/* Create an outermost, dummy OpenACC loop for offloaded function
+   DECL.  */
+
+static oacc_loop *
+new_oacc_loop_outer (tree decl)
+{
+  return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
+}
+
+/* Start a new OpenACC loop structure beginning at head marker HEAD.
+   Link into PARENT loop.  Return the new loop.  */
+
+static oacc_loop *
+new_oacc_loop (oacc_loop *parent, gcall *head)
+{
+  oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (head));
+
+  loop->head = head;
+
+  loop->level = TREE_INT_CST_LOW (gimple_call_arg (head, 1));
+
+  return loop;
+}
+
+/* Finish off the current OpenACC loop ending at tail marker TAIL.
+   Return the parent loop.  */
+
+static oacc_loop *
+finish_oacc_loop (oacc_loop *loop, gcall *tail)
+{
+  loop->tail = tail;
+
+  gcc_assert (TREE_INT_CST_LOW (gimple_call_arg (tail, 1)) == loop->level);
+
+  return loop->parent;
+}
+
+/* Free all OpenACC loop structures within LOOP (inclusive).  */
+
+static void
+free_oacc_loop (oacc_loop *loop)
+{
+  if (loop->sibling)
+    free_oacc_loop (loop->sibling);
+  if (loop->child)
+    free_oacc_loop (loop->child);
+
+  free (loop);
+}
+
+/* Dump out the OpenACC loop head or tail beginning at FROM.  */
+
+static void
+dump_oacc_loop_part (FILE *file, gcall *from, int depth,  const char *title)
+{
+  gimple_stmt_iterator gsi = gsi_for_stmt (from);
+  unsigned code = TREE_INT_CST_LOW (gimple_call_arg (from, 0));
+
+  fprintf (file, "%*s%s:\n", depth * 2, "", title);
+  for (gimple *stmt = from; ;)
+    {
+      print_gimple_stmt (file, stmt, depth * 2 + 2, 0);
+      gsi_next (&gsi);
+      stmt = gsi_stmt (gsi);
+
+      if (!is_gimple_call (stmt))
+	continue;
+
+      gcall *call = as_a <gcall *> (stmt);
+      
+      if (gimple_call_internal_p (call)
+	  && gimple_call_internal_fn (call) == IFN_UNIQUE
+	  && code == TREE_INT_CST_LOW (gimple_call_arg (call, 0)))
+	break;
+    }
+}
+
+/* Dump OpenACC loops LOOP, its siblings and its children.  */
+
+static void
+dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
+{
+  fprintf (file, "%*sLoop %d %s:%u\n", depth * 2, "",
+	   loop->level,
+	   LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));
+
+  if (loop->head)
+    dump_oacc_loop_part (file, loop->head, depth, "Head");
+  if (loop->tail)
+    dump_oacc_loop_part (file, loop->tail, depth, "Tail");
+  
+  if (loop->child)
+    dump_oacc_loop (file, loop->child, depth + 1);
+  if (loop->sibling)
+    dump_oacc_loop (file, loop->sibling, depth);
+}
+
+void debug_oacc_loop (oacc_loop *);
+
+/* Dump loops to stderr.  */
+
+DEBUG_FUNCTION void
+debug_oacc_loop (oacc_loop *loop)
+{
+  dump_oacc_loop (stderr, loop, 0);
+}
+
+/* DFS walk of basic blocks BB onwards, creating OpenACC loop
+   structures as we go.  By construction these loops are properly
+   nested.  */
+
+static void
+oacc_loop_walk (oacc_loop *loop, basic_block bb)
+{
+  if (bb->flags & BB_VISITED)
+    return;
+  bb->flags |= BB_VISITED;
+
+  /* Scan for loop markers.  */
+  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
+       gsi_next (&gsi))
+    {
+      gimple *stmt = gsi_stmt (gsi);
+
+      if (!is_gimple_call (stmt))
+	continue;
+
+      gcall *call = as_a <gcall *> (stmt);
+      
+      if (!gimple_call_internal_p (call))
+	continue;
+
+      if (gimple_call_internal_fn (call) != IFN_UNIQUE)
+	continue;
+
+      if (gimple_call_num_args (call) == 1)
+	continue;
+
+      unsigned code = TREE_INT_CST_LOW (gimple_call_arg (call, 0));
+      switch (code)
+	{
+	case IFN_UNIQUE_OACC_HEAD_MARK:
+	  loop = new_oacc_loop (loop, call);
+	  break;
+	case IFN_UNIQUE_OACC_TAIL_MARK:
+	  loop = finish_oacc_loop (loop, call);
+	default: break;
+	}
+    }
+
+  /* Walk successor blocks.  */
+  edge e;
+  edge_iterator ei;
+
+  FOR_EACH_EDGE (e, ei, bb->succs)
+    oacc_loop_walk (loop, e->dest);
+}
+
+/* Discover the OpenACC loops marked up by HEAD and TAIL markers for
+   the current function.  */
+
+static oacc_loop *
+oacc_loop_discovery ()
+{
+  basic_block bb;
+  
+  oacc_loop *top = new_oacc_loop_outer (current_function_decl);
+  oacc_loop_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));
+
+  /* Reset the visited flags.  */
+  FOR_ALL_BB_FN (bb, cfun)
+    bb->flags &= ~BB_VISITED;
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "OpenACC loops\n");
+      dump_oacc_loop (dump_file, top, 0);
+      fprintf (dump_file, "\n");
+    }
+
+  return top;
+}
+
+/* Transform the abstract internal function markers starting at FROM
+   to be for partitioning level LEVEL.  Stop when we meet another
+   marker of the same kind (HEAD or TAIL) as FROM.  */
+
+static void
+oacc_loop_transform (gcall *from, int level)
+{
+  gimple_stmt_iterator gsi = gsi_for_stmt (from);
+  unsigned code = TREE_INT_CST_LOW (gimple_call_arg (from, 0));
+  tree replacement  = build_int_cst (unsigned_type_node, level);
+
+  for (gimple *stmt = from; ;)
+    {
+      gsi_next (&gsi);
+      stmt = gsi_stmt (gsi);
+
+      if (!is_gimple_call (stmt))
+	continue;
+
+      gcall *call = as_a <gcall *> (stmt);
+      
+      if (!gimple_call_internal_p (call))
+	continue;
+
+      switch (gimple_call_internal_fn (call))
+	{
+	case IFN_UNIQUE:
+	  {
+	    unsigned c = TREE_INT_CST_LOW (gimple_call_arg (call, 0));
+
+	    if (c == code)
+	      goto break2;
+
+	    if (c == IFN_UNIQUE_OACC_FORK || c == IFN_UNIQUE_OACC_JOIN)
+	      *gimple_call_arg_ptr (call, 1) = replacement;
+	  }
+	  break;
+
+	case IFN_GOACC_REDUCTION_SETUP:
+	case IFN_GOACC_REDUCTION_INIT:
+	case IFN_GOACC_REDUCTION_FINI:
+	case IFN_GOACC_REDUCTION_TEARDOWN:
+	  *gimple_call_arg_ptr (call, 2) = replacement;
+	  break;
+
+	default:
+	  break;
+	}
+    }
+
+ break2:;
+}
+
+/* Process the discovered OpenACC loops, setting the correct
+   partitioning level etc.  */
+
+static void
+oacc_loop_process (oacc_loop *loop)
+{
+  if (loop->child)
+    oacc_loop_process (loop->child);
+
+  if (loop->head)
+    {
+      oacc_loop_transform (loop->head, loop->level);
+      oacc_loop_transform (loop->tail, loop->level);
+    }
+
+  if (loop->sibling)
+    oacc_loop_process (loop->sibling);
+}
+
 /* Main entry point for oacc transformations which run on the device
    compiler after LTO, so we know what the target device is at this
    point (including the host fallback).  */
@@ -15537,11 +15863,17 @@  execute_oacc_device_lower ()
     return 0;
 
   oacc_validate_dims (current_function_decl, attrs, dims);
-  
+
+  /* Discover and process the loops.  */
+  oacc_loop *loops = oacc_loop_discovery ();
+  oacc_loop_process (loops);
+
   /* Offloaded targets may introduce new basic blocks, which require
      dominance information to update SSA.  */
   calculate_dominance_info (CDI_DOMINATORS);
 
+  /* Now lower internal loop functions to target-specific code
+     sequences.  */
   basic_block bb;
   FOR_ALL_BB_FN (bb, cfun)
     for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
@@ -15599,6 +15931,9 @@  execute_oacc_device_lower ()
 		  && (targetm.goacc.fork_join
 		      (call, dims, code == IFN_UNIQUE_OACC_FORK)))
 		rescan = -1;
+	      else if (code == IFN_UNIQUE_OACC_HEAD_MARK
+		       || code == IFN_UNIQUE_OACC_TAIL_MARK)
+		rescan = -1;
 	      break;
 	    }
 	  }
@@ -15621,6 +15956,8 @@  execute_oacc_device_lower ()
 	  }
       }
 
+  free_oacc_loop (loops);
+  
   return 0;
 }