diff mbox

openacc vector variable broadcasting

Message ID 556FB18B.9060108@codesourcery.com
State New
Headers show

Commit Message

Cesar Philippidis June 4, 2015, 2:01 a.m. UTC
This patch adds support for broadcasting private variables to vector
threads inside acc loops. The algorithm I'm using is extremely
conservative. Basically, it walks each basic block inside an acc loop
and builds a USE and GEN set of decls. The GEN set contains all of the
variables declared inside of the lexical scope containing the basic
block. The USE set are all of the variables used inside that basic block
excluding those inside the GEN set.

This patch still needs a little more polishing. E.g., I'm using an STL
set for USE and GEN, when I probably should be using an existing gcc
container. Also, I'm broadcasting variables during expand-omp,
specifically during expand_omp_for_*. Looking at cfgloops, I wonder if I
can add an acc_loop member to struct loop so that I could defer variable
broadcasting to a later stage. That's something to consider later.

There are still some limitation with variable broadcasting. As of now,
it ignores arrays and structures. It also only works on integral types,
floats and doubles.

Worker variable broadcasting isn't working yet. The infrastructure is in
place, but it's not predicating the 'spill-and-load' from shared memory
sequence. As a consequence, the value being spilled is clobbered by all
of the threads, not just the worker with tid=0.

I applied this patch to gomp-4_0-branch.

Cesar
diff mbox

Patch

2015-06-03  Cesar Philippidis  <cesar@codesourcery.com>

	gcc/	
	* builtins.c (expand_builtin_oacc_thread_broadcast): Allow
	constant value operads.
	* omp-low.c (expand_omp_for_static_nochunk): Broadcast variables
	to openacc vector threads when necessary.
	(expand_omp_for_static_chunk): Likewise.
	(generate_vector_broadcast): Teach how to handle non-integral typed
	variables.
	(populate_loop_use): New function.
	(oacc_broadcast_1): New function.
	(oacc_broadcast): New function.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/vector-broadcast.c: New test.
	

diff --git a/gcc/builtins.c b/gcc/builtins.c
index 83d98b5..bfa9832 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -6057,12 +6057,15 @@  expand_builtin_oacc_thread_broadcast (tree exp, rtx target)
   rtx tmp = target;
   machine_mode mode0 = insn_data[icode].operand[0].mode;
   machine_mode mode1 = insn_data[icode].operand[1].mode;
-  if (!REG_P (tmp) || GET_MODE (tmp) != mode0)
+  if (!tmp || !REG_P (tmp) || GET_MODE (tmp) != mode0)
     tmp = gen_reg_rtx (mode0);
   rtx op1 = expand_expr (arg0, NULL_RTX, mode1, EXPAND_NORMAL);
   if (GET_MODE (op1) != mode1)
     op1 = convert_to_mode (mode1, op1, 0);
 
+  /* op1 might be an immediate, place it inside a register.  */
+  op1 = force_reg (mode1, op1);
+
   rtx insn = GEN_FCN (icode) (tmp, op1);
   if (insn != NULL_RTX)
     {
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 2ad7f00..b1aa603 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -109,6 +109,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "lto-section-names.h"
 #include "gomp-constants.h"
 #include "gimple-pretty-print.h"
+#include "set"
 
 
 /* Lowering of OMP parallel and workshare constructs proceeds in two
@@ -291,6 +292,8 @@  static vec<omp_context *> taskreg_contexts;
 
 static void scan_omp (gimple_seq *, omp_context *);
 static tree scan_omp_1_op (tree *, int *, void *);
+static void oacc_broadcast (basic_block, basic_block, struct omp_region *,
+			    struct omp_for_data *);
 
 #define WALK_SUBSTMTS  \
     case GIMPLE_BIND: \
@@ -7131,6 +7134,9 @@  expand_omp_for_static_nochunk (struct omp_region *region,
     }
   exit_bb = region->exit;
 
+  /* Broadcast variables to OpenACC threads.  */
+  oacc_broadcast (entry_bb, fin_bb, region, fd);
+
   /* Iteration space partitioning goes in ENTRY_BB.  */
   gsi = gsi_last_bb (entry_bb);
   gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_FOR);
@@ -7535,6 +7541,10 @@  expand_omp_for_static_chunk (struct omp_region *region,
   cont_bb = region->cont;
   gcc_assert (EDGE_COUNT (iter_part_bb->succs) == 2);
   fin_bb = BRANCH_EDGE (iter_part_bb)->dest;
+
+  /* Broadcast variables to OpenACC threads.  */
+  oacc_broadcast (entry_bb, fin_bb, region, fd);
+
   gcc_assert (broken_loop
 	      || fin_bb == FALLTHRU_EDGE (cont_bb)->dest);
   seq_start_bb = split_edge (FALLTHRU_EDGE (iter_part_bb));
@@ -10320,12 +10330,24 @@  generate_vector_broadcast (tree dest_var, tree var,
       fn = BUILT_IN_GOACC_THREAD_BROADCAST_LL;
       call_arg_type = long_long_unsigned_type_node;
     }
+
   bool need_conversion = !types_compatible_p (vartype, call_arg_type);
   tree casted_var = var;
+
   if (need_conversion)
     {
+      gassign *conv1 = NULL;
       casted_var = create_tmp_var (call_arg_type);
-      gassign *conv1 = gimple_build_assign (casted_var, NOP_EXPR, var);
+
+      /* Handle floats and doubles.  */
+      if (!INTEGRAL_TYPE_P (vartype))
+	{
+	  tree t = fold_build1 (VIEW_CONVERT_EXPR, call_arg_type, var);
+	  conv1 = gimple_build_assign (casted_var, t);
+	}
+      else
+	conv1 = gimple_build_assign (casted_var, NOP_EXPR, var);
+
       gsi_insert_after (&where, conv1, GSI_CONTINUE_LINKING);
     }
 
@@ -10333,14 +10355,23 @@  generate_vector_broadcast (tree dest_var, tree var,
   gimple call = gimple_build_call (decl, 1, casted_var);
   gsi_insert_after (&where, call, GSI_NEW_STMT);
   tree casted_dest = dest_var;
+
   if (need_conversion)
     {
+      gassign *conv2 = NULL;
       casted_dest = create_tmp_var (call_arg_type);
-      create_tmp_var (call_arg_type);
-      gassign *conv2 = gimple_build_assign (dest_var, NOP_EXPR,
-					    casted_dest);
+
+      if (!INTEGRAL_TYPE_P (vartype))
+	{
+	  tree t = fold_build1 (VIEW_CONVERT_EXPR, vartype, casted_dest);
+	  conv2 = gimple_build_assign (dest_var, t);
+	}
+      else
+	conv2 = gimple_build_assign (dest_var, NOP_EXPR, casted_dest);
+
       gsi_insert_after (&where, conv2, GSI_CONTINUE_LINKING);
     }
+
   gimple_call_set_lhs (call, casted_dest);
   return retval;
 }
@@ -10631,6 +10662,145 @@  predicate_omp_regions (basic_block head_bb)
     }
 }
 
+/* USE and GET sets for variable broadcasting.  */
+static std::set<tree> use, gen;
+
+static tree
+populate_loop_use (tree *tp, int *walk_subtrees, void *data_)
+{
+  struct walk_stmt_info *wi = (struct walk_stmt_info *) data_;
+  std::set<tree>::iterator it;
+
+  /* There isn't much to do for LHS ops. There shouldn't be any pointers
+     or references here.  */
+  if (wi && wi->is_lhs)
+    return NULL_TREE;
+
+  if (VAR_P (*tp))
+    {
+      tree type;
+
+      *walk_subtrees = 0;
+
+      /* Filter out incompatible decls.  */
+      if (INDIRECT_REF_P (*tp) || is_global_var (*tp))
+	return NULL_TREE;
+
+      type = TREE_TYPE (*tp);
+
+      /* Aggregate types aren't supported either.  */
+      if (AGGREGATE_TYPE_P (type))
+	return NULL_TREE;
+
+      /* Filter out decls inside GEN.  */
+      it = gen.find (*tp);
+      if (it == gen.end ())
+	use.insert (*tp);
+    }
+  else if (IS_TYPE_OR_DECL_P (*tp))
+    *walk_subtrees = 0;
+
+  return NULL_TREE;
+}
+
+/* INIT is true if this is the first time this function is called.  */
+
+static void
+oacc_broadcast_1 (basic_block entry_bb, basic_block exit_bb, bool init,
+		  int mask)
+{
+  basic_block son;
+  gimple_stmt_iterator gsi;
+  gimple stmt;
+  tree block, var;
+
+  if (entry_bb == exit_bb)
+    return;
+
+  /* Populate the GEN set.  */
+
+  gsi = gsi_start_bb (entry_bb);
+  stmt = gsi_stmt (gsi);
+
+  /* There's nothing to do if stmt is empty or if this is the entry basic
+     block to the vector loop.  The entry basic block to pre-expanded loops
+     do not have an entry label.  As such, the scope containing the initial
+     entry_bb should not be added to the gen set.  */
+  if (stmt != NULL && !init && (block = gimple_block (stmt)) != NULL)
+    for (var = BLOCK_VARS (block); var; var = DECL_CHAIN (var))
+      gen.insert(var);
+
+  /* Populate the USE set.  */
+
+  for (gsi = gsi_start_bb (entry_bb); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      struct walk_stmt_info wi;
+
+      memset (&wi, 0, sizeof (wi));
+      stmt = gsi_stmt (gsi);
+
+      walk_gimple_op (stmt, populate_loop_use, &wi);
+    }
+
+  /* Continue processing the children of this basic block.  */
+  for (son = first_dom_son (CDI_DOMINATORS, entry_bb);
+       son;
+       son = next_dom_son (CDI_DOMINATORS, son))
+    oacc_broadcast_1 (son, exit_bb, false, mask);
+}
+
+/* Broadcast variables to OpenACC vector loops.  This function scans
+   all of the basic blocks withing an acc vector loop.  It maintains
+   two sets of decls, a GEN set and a USE set.  The GEN set contains
+   all of the decls in the the basic block's scope.  The USE set
+   consists of decls used in current basic block, but are not in the
+   GEN set, globally defined or were transferred into the the accelerator
+   via a data movement clause.
+
+   The vector loop begins at ENTRY_BB and end at EXIT_BB, where EXIT_BB
+   is a latch back to ENTRY_BB.  Once a set of used variables have been
+   determined, they will get broadcasted in a pre-header to ENTRY_BB.  */
+
+static void
+oacc_broadcast (basic_block entry_bb, basic_block exit_bb, omp_region *region,
+		struct omp_for_data *fd)
+{
+  gimple_stmt_iterator gsi;
+  std::set<tree>::iterator it;
+  int mask = 0;
+
+  if (fd->worker == integer_one_node)
+    mask = MASK_WORKER;
+  if (fd->vector == integer_one_node)
+    mask |= MASK_VECTOR;
+
+  /* Nothing to do if this isn't an acc worker or vector loop.  */
+  if (mask == 0)
+    return;
+
+  use.empty ();
+  gen.empty ();
+
+  /* Currently, subroutines aren't supported.  */
+  gcc_assert (!lookup_attribute ("oacc function",
+				 DECL_ATTRIBUTES (current_function_decl)));
+
+  /* Populate the set of used decls.  */
+  oacc_broadcast_1 (entry_bb, exit_bb, true, mask);
+
+  /* Broadcast all decls in USE right before the last instruction in
+     entry_bb.  */
+  gsi = gsi_last_bb (entry_bb);
+
+  gimple_seq seq = NULL;
+  gimple_stmt_iterator g2 = gsi_start (seq);
+
+  for (it = use.begin (); it != use.end (); it++)
+    generate_oacc_broadcast (region, *it, *it, g2, mask);
+
+  gsi_insert_seq_before (&gsi, seq, GSI_CONTINUE_LINKING);
+}
+
 /* Main entry point for expanding OMP-GIMPLE into runtime calls.  */
 
 static unsigned int
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-broadcast.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-broadcast.c
new file mode 100644
index 0000000..2e1893b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-broadcast.c
@@ -0,0 +1,38 @@ 
+/* Check if worker-single variables get broadcastd to vectors.  */
+
+/* { dg-do run } */
+
+#include <assert.h>
+#include <math.h>
+
+#define N 32
+
+#pragma acc routine
+float
+some_val ()
+{
+  return 2.71;
+}
+
+int
+main ()
+{
+  float threads[N], v1 = 3.14;
+
+  for (int i = 0; i < N; i++)
+    threads[i] = -1;
+
+#pragma acc parallel num_gangs (1) vector_length (32) copy (v1)
+  {
+    float val = some_val ();
+
+#pragma acc loop vector
+    for (int i = 0; i < N; i++)
+      threads[i] = val + v1*i;
+  }
+
+  for (int i = 0; i < N; i++)
+    assert (fabs (threads[i] - (some_val () + v1*i)) < 0.0001);
+
+  return 0;
+}