diff mbox

[gomp4] reorganize reduction target lowering

Message ID 562EA371.2080903@acm.org
State New
Headers show

Commit Message

Nathan Sidwell Oct. 26, 2015, 10:04 p.m. UTC
I've committed this to gomp4.  It changes the target reduction lowering to:

1) use a single internal function with an initial discriminator argument, in
the same manner as IFN_UNIQUE and IFN_GOACC_LOOP

2) Rather than identifying reductions with a loop-id and a reduction-id, which
would cause difficulties with inlining, we simply identify them with an offset
determined when generating the lowering.  This offset is useful in our case for
worker-level reductions, but could be used for other purposes by different backends.

nathan
diff mbox

Patch

2015-10-26  Nathan Sidwell  <nathan@codesourcery.com>

	* doc/tm.texi: Rebuilt.
	* internal-fn.c (expand_GOACC_REDUCTION_SETUP,
	expand_GOACC_REDUCTION_INIT, expand_GOACC_REDUCTION_FINI,
	expand_GOACC_REDUCTION_TEARDOWN): Replace with ...
	(expand_GOACC_REDUCTION): ... this.
	* internal-fn.def (GOACC_REDUCTION_SETUP,
	GOACC_REDUCTION_INIT, GOACC_REDUCTION_FINI,
	GOACC_REDUCTION_TEARDOWN): Replace with ...
	(GOACC_REDUCTION): ... this.
	* internal-fn.h (enum ifn_goacc_reduction_kind): New.
	* omp-low.c (lower_rec_input_clauses): Adjust OpenACC comment.
	(lower_oacc_reductions): Remove RID & LID, calculate
	offset. Adjust for IFN_GOACC_REDUCTION change.
	(default_goacc_reduction): Don't return bool.  Adjust for argument
	shift.
	(execute_oacc_device_lower): Adjust for IFN_GOACC_REDUCTION
	change.
	* target.def (goacc_reduction): Adjust hook.
	* targhooks.h (default_goacc_reduction): Return void.
	* config/nvptx/nvptx.c (worker_red_hwm): Rename to ...
	(worker_red_size): ... here.
	(var_red_t, struct loop_red, loop_reds): Delete.
	(nvptx_reorg_reductions): Delete.
	(nvptx_reorg): Don't reorg reductions.
	(nvptx_file_end): Adjust worker reduction size name.
	(nvptx_expand_worker_addr): Reimplement.
	(nvptx_init_builtins): Adjust WORKER_ADDR prototype.
	(nvptx_get_worker_red_addr): Reimplement.
	(nvptx_goacc_reduction_setup, nvptx_goacc_reduction_init,
	nvptx_goacc_reduction_fini, nvptx_goacc_reduction_teardown): Don't
	return bool.  Adjust for argument shift & worker offset
	processing.
	(nvptx_goacc_reduction): Adjust.

Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c	(revision 229392)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -119,40 +119,13 @@  static unsigned worker_bcast_align;
 static GTY(()) rtx worker_bcast_sym;
 
 /* Size of buffer needed for worker reductions.  This has to be
-   disjoing from the worker broadcast array, as both may be live
+   distinct from the worker broadcast array, as both may be live
    concurrently.  */
-static unsigned worker_red_hwm;
+static unsigned worker_red_size;
 static unsigned worker_red_align;
 #define worker_red_name "__worker_red"
 static GTY(()) rtx worker_red_sym;
 
-/* To process worker-level reductions we need a buffer in CTA local
-   (.shared) memory.  As the number of loops per function and number
-   of reductions per loop are likely to be small numbers, we use
-   simple unsorted vectors to hold the mappings.  */
-
-/* Mapping from a reduction to an offset within the worker reduction
-   array.  */
-typedef std::pair<unsigned, unsigned> var_red_t;
-
-/* Mapping from loops within a function to lists of reductions on that
-   loop.  */
-struct loop_red
-{
-  unsigned id;  /* Loop ID.  */
-  unsigned hwm;  /* Allocated worker buffer for this loop.  */
-  auto_vec<var_red_t> vars;   /* Reduction variables of the loop.  */
-
-  loop_red (unsigned id_)
-  :id (id_), hwm (0) 
-  {
-  }
-};
-
-/* It would be nice to put this intp machine_function, but auto_vec
-   pulls in too much other stuff.   */
-static auto_vec<loop_red> loop_reds;
-
 /* Allocate a new, cleared machine_function structure.  */
 
 static struct machine_function *
@@ -3785,21 +3758,7 @@  nvptx_neuter_pars (parallel *par, unsign
     nvptx_neuter_pars (par->next, modes, outer);
 }
 
-static void
-nvptx_reorg_reductions (void)
-{
-  unsigned ix;
-
-  for (ix = loop_reds.length (); ix--;)
-    {
-      if (loop_reds[ix].hwm > worker_red_hwm)
-	worker_red_hwm = loop_reds[ix].hwm;
-      loop_reds.pop ();
-    }
-}
-
 /* PTX-specific reorganization
-   - Scan and release reduction buffers
    - Split blocks at fork and join instructions
    - Compute live registers
    - Mark now-unused registers, so function begin doesn't declare
@@ -3812,8 +3771,6 @@  nvptx_reorg_reductions (void)
 static void
 nvptx_reorg (void)
 {
-  nvptx_reorg_reductions ();
-  
   /* We are freeing block_for_insn in the toplev to keep compatibility
      with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
   compute_bb_for_insn ();
@@ -4023,17 +3980,17 @@  nvptx_file_end (void)
 	       worker_bcast_name, worker_bcast_hwm);
     }
 
-  if (worker_red_hwm)
+  if (worker_red_size)
     {
       /* Define the reduction buffer.  */
 
-      worker_red_hwm = (worker_red_hwm + worker_red_align - 1)
+      worker_red_size = (worker_red_size + worker_red_align - 1)
 	& ~(worker_red_align - 1);
       
       fprintf (asm_out_file, "// BEGIN VAR DEF: %s\n", worker_red_name);
       fprintf (asm_out_file, ".shared .align %d .u8 %s[%d];\n",
 	       worker_red_align,
-	       worker_red_name, worker_red_hwm);
+	       worker_red_name, worker_red_size);
     }
 }
 
@@ -4074,44 +4031,21 @@  nvptx_expand_worker_addr (tree exp, rtx
   if (ignore)
     return target;
 
-  unsigned lid = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
-  unsigned rid = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 3));
-  unsigned ix;
-
-  for (ix = 0; ix != loop_reds.length (); ix++)
-    if (loop_reds[ix].id == lid)
-      goto found_lid;
-  /* Allocate a new loop.  */
-  loop_reds.safe_push (loop_red (lid));
- found_lid:
-  loop_red &loop = loop_reds[ix];
-  for (ix = 0; ix != loop.vars.length (); ix++)
-    if (loop.vars[ix].first == rid)
-      goto found_rid;
+  unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
+  if (align > worker_red_align)
+    worker_red_align = align;
+
+  unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
+  unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
+  if (size + offset > worker_red_size)
+    worker_red_size = size + offset;
 
-  /* Allocate a new var. */
-  {
-    unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
-    unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
-    unsigned off = loop.hwm;
-
-    if (align > worker_red_align)
-      worker_red_align = align;
-    off = (off + align - 1) & ~(align -1);
-    loop.hwm = off + size;
-    loop.vars.safe_push (var_red_t (rid, off));
-  }
- found_rid:
-
-  /* Return offset into worker reduction array.  */
-  unsigned offset = loop.vars[ix].second;
-  
   emit_insn (gen_rtx_SET (target, worker_red_sym));
 
   if (offset)
     emit_insn (gen_rtx_SET (target,
 			    gen_rtx_PLUS (Pmode, target, GEN_INT (offset))));
-	       
+
   emit_insn (gen_rtx_SET (target,
 			  gen_rtx_UNSPEC (Pmode, gen_rtvec (1, target),
 					  UNSPEC_FROM_SHARED)));
@@ -4167,6 +4101,7 @@  enum nvptx_builtins
 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
 
 /* Return the NVPTX builtin for CODE.  */
+
 static tree
 nvptx_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
 {
@@ -4177,6 +4112,7 @@  nvptx_builtin_decl (unsigned code, bool
 }
 
 /* Set up all builtin functions for this target.  */
+
 static void
 nvptx_init_builtins (void)
 {
@@ -4185,6 +4121,7 @@  nvptx_init_builtins (void)
    add_builtin_function ("__builtin_nvptx_" NAME,			\
 			 build_function_type_list T,			\
 			 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
+#define ST sizetype
 #define UINT unsigned_type_node
 #define LLUINT long_long_unsigned_type_node
 #define PTRVOID ptr_type_node
@@ -4192,11 +4129,12 @@  nvptx_init_builtins (void)
   DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
   DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
   DEF (WORKER_ADDR, "worker_addr",
-       (PTRVOID, UINT, UINT, UINT, UINT, NULL_TREE));
+       (PTRVOID, ST, UINT, UINT, NULL_TREE));
   DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
   DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
 
 #undef DEF
+#undef ST
 #undef UINT
 #undef LLUINT
 #undef PTRVOID
@@ -4209,10 +4147,8 @@  nvptx_init_builtins (void)
    IGNORE is nonzero if the value is to be ignored.  */
 
 static rtx
-nvptx_expand_builtin (tree exp, rtx target,
-		     rtx subtarget ATTRIBUTE_UNUSED,
-		     machine_mode mode,
-		     int ignore)
+nvptx_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
+		      machine_mode mode, int ignore)
 {
   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
   switch (DECL_FUNCTION_CODE (fndecl))
@@ -4232,7 +4168,7 @@  nvptx_expand_builtin (tree exp, rtx targ
     }
 }
 
-/* Define vector size for known hardware.  */
+/* Define dimension sizes for known hardware.  */
 #define PTX_VECTOR_LENGTH 32
 #define PTX_WORKER_LENGTH 32
 
@@ -4311,16 +4247,16 @@  nvptx_goacc_fork_join (gcall *call, cons
 }
 
 static tree
-nvptx_get_worker_red_addr (tree type, tree rid, tree lid)
+nvptx_get_worker_red_addr (tree type, tree offset)
 {
   machine_mode mode = TYPE_MODE (type);
   tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
   tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
   tree align = build_int_cst (unsigned_type_node,
 			      GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
-  tree call = build_call_expr (fndecl, 4, size, align, lid, rid);
+  tree call = build_call_expr (fndecl, 3, offset, size, align);
 
-  return fold_build1 (NOP_EXPR, build_pointer_type (type), call);
+  return fold_convert (build_pointer_type (type), call);
 }
 
 /* Emit a SHFL.DOWN using index SHFL of VAR into DEST_VAR.  This function
@@ -4454,24 +4390,21 @@  nvptx_lockless_update (location_t loc, g
 
 /* NVPTX implementation of GOACC_REDUCTION_SETUP.  */
 
-static bool
+static void
 nvptx_goacc_reduction_setup (gcall *call)
 {
   gimple_stmt_iterator gsi = gsi_for_stmt (call);
   tree lhs = gimple_call_lhs (call);
-  tree var = gimple_call_arg (call, 1);
-  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
-  tree lid = gimple_call_arg (call, 4);
-  tree rid = gimple_call_arg (call, 5);
+  tree var = gimple_call_arg (call, 2);
+  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
   gimple_seq seq = NULL;
-  tree r = NULL_TREE;
 
   push_gimplify_context (true);
 
   if (level != GOMP_DIM_GANG)
     {
       /* Copy the receiver object.  */
-      tree ref_to_res = gimple_call_arg (call, 0);
+      tree ref_to_res = gimple_call_arg (call, 1);
 
       if (!integer_zerop (ref_to_res))
 	var = build_simple_mem_ref (ref_to_res);
@@ -4480,40 +4413,36 @@  nvptx_goacc_reduction_setup (gcall *call
   if (level == GOMP_DIM_WORKER)
     {
       /* Store incoming value to worker reduction buffer.  */
-      tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), rid, lid);
+      tree offset = gimple_call_arg (call, 5);
+      tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
       tree ptr = make_ssa_name (TREE_TYPE (call));
 
       gimplify_assign (ptr, call, &seq);
       tree ref = build_simple_mem_ref (ptr);
       TREE_THIS_VOLATILE (ref) = 1;
       gimplify_assign (ref, var, &seq);
-      r = var;
     }
-  else
-    r = var;
 
   if (lhs)
-    gimplify_assign (lhs, r, &seq);
+    gimplify_assign (lhs, var, &seq);
 
   pop_gimplify_context (NULL);
   gsi_replace_with_seq (&gsi, seq, true);
-
-  return false;
 }
 
 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
 
-static bool
+static void
 nvptx_goacc_reduction_init (gcall *call)
 {
   gimple_stmt_iterator gsi = gsi_for_stmt (call);
   tree lhs = gimple_call_lhs (call);
-  tree var = gimple_call_arg (call, 1);
-  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
-  tree init = omp_reduction_init_op
-    (gimple_location (call),
-     (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 3)),
-     TREE_TYPE (var));
+  tree var = gimple_call_arg (call, 2);
+  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
+  enum tree_code rcode
+    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
+  tree init = omp_reduction_init_op (gimple_location (call), rcode,
+				     TREE_TYPE (var));
   gimple_seq seq = NULL;
   
   push_gimplify_context (true);
@@ -4522,7 +4451,7 @@  nvptx_goacc_reduction_init (gcall *call)
     {
       /* Initialize vector-non-zeroes to INIT_VAL (OP).  */
       tree tid = make_ssa_name (integer_type_node);
-      tree dim_vector = gimple_call_arg (call, 2);
+      tree dim_vector = gimple_call_arg (call, 3);
       gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
 						     dim_vector);
       gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
@@ -4567,42 +4496,33 @@  nvptx_goacc_reduction_init (gcall *call)
       if (level == GOMP_DIM_GANG)
 	{
 	  /* If there's no receiver object, propagate the incoming VAR.  */
-	  tree ref_to_res = gimple_call_arg (call, 0);
+	  tree ref_to_res = gimple_call_arg (call, 1);
 	  if (integer_zerop (ref_to_res))
 	    init = var;
 	}
-      
+
       gimplify_assign (lhs, init, &seq);
     }
 
   pop_gimplify_context (NULL);
   gsi_replace_with_seq (&gsi, seq, true);
-
-  return false;
 }
 
 /* NVPTX implementation of GOACC_REDUCTION_FINI.  */
 
-static bool
+static void
 nvptx_goacc_reduction_fini (gcall *call)
 {
   gimple_stmt_iterator gsi = gsi_for_stmt (call);
   tree lhs = gimple_call_lhs (call);
-  tree ref_to_res = gimple_call_arg (call, 0);
-  tree var = gimple_call_arg (call, 1);
-  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
+  tree ref_to_res = gimple_call_arg (call, 1);
+  tree var = gimple_call_arg (call, 2);
+  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
   enum tree_code op
-    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 3));
-  tree lid = gimple_call_arg (call, 4);
-  tree rid = gimple_call_arg (call, 5);
+    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
   gimple_seq seq = NULL;
   tree r = NULL_TREE;;
 
-  if (op == TRUTH_ANDIF_EXPR)
-    op = BIT_AND_EXPR;
-  else if (op == TRUTH_ORIF_EXPR)
-    op = BIT_IOR_EXPR;
-
   push_gimplify_context (true);
 
   if (level == GOMP_DIM_VECTOR)
@@ -4629,7 +4549,8 @@  nvptx_goacc_reduction_fini (gcall *call)
       if (level == GOMP_DIM_WORKER)
 	{
 	  /* Get reduction buffer address.  */
-	  tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), rid, lid);
+	  tree offset = gimple_call_arg (call, 5);
+	  tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
 	  tree ptr = make_ssa_name (TREE_TYPE (call));
 
 	  gimplify_assign (ptr, call, &seq);
@@ -4655,75 +4576,73 @@  nvptx_goacc_reduction_fini (gcall *call)
   pop_gimplify_context (NULL);
 
   gsi_replace_with_seq (&gsi, seq, true);
-
-  return false;
 }
 
 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN.  */
 
-static bool
+static void
 nvptx_goacc_reduction_teardown (gcall *call)
 {
   gimple_stmt_iterator gsi = gsi_for_stmt (call);
   tree lhs = gimple_call_lhs (call);
-  tree var = gimple_call_arg (call, 1);
-  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
-  tree lid = gimple_call_arg (call, 4);
-  tree rid = gimple_call_arg (call, 5);
+  tree var = gimple_call_arg (call, 2);
+  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
   gimple_seq seq = NULL;
-  tree r = NULL_TREE;
   
   push_gimplify_context (true);
   if (level == GOMP_DIM_WORKER)
     {
       /* Read the worker reduction buffer.  */
-      tree call = nvptx_get_worker_red_addr(TREE_TYPE (var), rid, lid);
+      tree offset = gimple_call_arg (call, 5);
+      tree call = nvptx_get_worker_red_addr(TREE_TYPE (var), offset);
       tree ptr = make_ssa_name (TREE_TYPE (call));
 
       gimplify_assign (ptr, call, &seq);
-      r = build_simple_mem_ref (ptr);
-      TREE_THIS_VOLATILE (r) = 1;
+      var = build_simple_mem_ref (ptr);
+      TREE_THIS_VOLATILE (var) = 1;
     }
-  else
-    r = var;
 
   if (level != GOMP_DIM_GANG)
     {
       /* Write to the receiver object.  */
-      tree ref_to_res = gimple_call_arg (call, 0);
+      tree ref_to_res = gimple_call_arg (call, 1);
 
       if (!integer_zerop (ref_to_res))
-	gimplify_assign (build_simple_mem_ref (ref_to_res), r, &seq);
+	gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
     }
 
   if (lhs)
-    gimplify_assign (lhs, r, &seq);
+    gimplify_assign (lhs, var, &seq);
   
   pop_gimplify_context (NULL);
 
   gsi_replace_with_seq (&gsi, seq, true);
-
-  return false;
 }
 
-/* Default goacc.reduction early expander.  */
+/* NVPTX reduction expander.  */
 
-bool
+void
 nvptx_goacc_reduction (gcall *call)
 {
-  switch (gimple_call_internal_fn (call))
+  unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
+
+  switch (code)
     {
     case IFN_GOACC_REDUCTION_SETUP:
-      return nvptx_goacc_reduction_setup (call);
+      nvptx_goacc_reduction_setup (call);
+      break;
 
     case IFN_GOACC_REDUCTION_INIT:
-      return nvptx_goacc_reduction_init (call);
+      nvptx_goacc_reduction_init (call);
+      break;
 
     case IFN_GOACC_REDUCTION_FINI:
-      return nvptx_goacc_reduction_fini (call);
+      nvptx_goacc_reduction_fini (call);
+      break;
 
     case IFN_GOACC_REDUCTION_TEARDOWN:
-      return nvptx_goacc_reduction_teardown (call);
+      nvptx_goacc_reduction_teardown (call);
+      break;
 
     default:
       gcc_unreachable ();
Index: gcc/doc/tm.texi
===================================================================
--- gcc/doc/tm.texi	(revision 229392)
+++ gcc/doc/tm.texi	(working copy)
@@ -5778,17 +5778,13 @@  pass.  It should return true, if the fun
 default hook returns true, if there are no RTL expanders for them.
 @end deftypefn
 
-@deftypefn {Target Hook} bool TARGET_GOACC_REDUCTION (gcall *@var{call})
+@deftypefn {Target Hook} void TARGET_GOACC_REDUCTION (gcall *@var{call})
 This hook is used by the oacc_transform pass to expand calls to the
-internal functions @var{GOACC_REDUCTION_SETUP},
-@var{GOACC_REDUCTION_INIT},
-@var{GOACC_REDUCTION_FINI} and
-@var{GOACC_REDUCTION_TEARDOWN} into a sequence of gimple instructions.
-@var{call} is gimple statement containing the call to the function.  This
-hook removes statement @var{call} after the expanded sequence has been
-inserted.  This hook is also responsible for allocating any storage for
-reductions when necessary.  It returns @var{true} if the expanded
-sequence introduces any calls to OpenACC-specific internal functions.
+@var{GOACC_REDUCTION} internal function, into a sequence of gimple
+instructions.  @var{call} is gimple statement containing the call to
+the function.  This hook removes statement @var{call} after the
+expanded sequence has been inserted.  This hook is also responsible
+for allocating any storage for reductions when necessary.
 @end deftypefn
 
 @node Anchored Addresses
Index: gcc/internal-fn.c
===================================================================
--- gcc/internal-fn.c	(revision 229392)
+++ gcc/internal-fn.c	(working copy)
@@ -2053,28 +2053,10 @@  expand_GOACC_DIM_POS (gcall *stmt)
     gcc_unreachable ();
 }
 
-/* All the GOACC_REDUCTION variants  get expanded in oacc_device_lower.  */
-
-static void
-expand_GOACC_REDUCTION_SETUP (gcall *stmt ATTRIBUTE_UNUSED)
-{
-  gcc_unreachable ();
-}
-
-static void
-expand_GOACC_REDUCTION_INIT (gcall *stmt ATTRIBUTE_UNUSED)
-{
-  gcc_unreachable ();
-}
-
-static void
-expand_GOACC_REDUCTION_FINI (gcall *stmt ATTRIBUTE_UNUSED)
-{
-  gcc_unreachable ();
-}
+/* This is expanded by oacc_device_lower pass.  */
 
 static void
-expand_GOACC_REDUCTION_TEARDOWN (gcall *stmt ATTRIBUTE_UNUSED)
+expand_GOACC_LOOP (gcall *stmt ATTRIBUTE_UNUSED)
 {
   gcc_unreachable ();
 }
@@ -2082,7 +2064,7 @@  expand_GOACC_REDUCTION_TEARDOWN (gcall *
 /* This is expanded by oacc_device_lower pass.  */
 
 static void
-expand_GOACC_LOOP (gcall *stmt ATTRIBUTE_UNUSED)
+expand_GOACC_REDUCTION (gcall *stmt ATTRIBUTE_UNUSED)
 {
   gcc_unreachable ();
 }
Index: gcc/internal-fn.def
===================================================================
--- gcc/internal-fn.def	(revision 229392)
+++ gcc/internal-fn.def	(working copy)
@@ -82,23 +82,8 @@  DEF_INTERNAL_FN (UNIQUE, ECF_NOTHROW, NU
 DEF_INTERNAL_FN (GOACC_DIM_SIZE, ECF_CONST | ECF_NOTHROW | ECF_LEAF, ".")
 DEF_INTERNAL_FN (GOACC_DIM_POS, ECF_PURE | ECF_NOTHROW | ECF_LEAF, ".")
 
-/* REDUCTION_SETUP, REDUCTION_INIT, REDUCTION_FINI and REDUCTION_TEARDOWN
-   together define a generic interface to support gang, worker and vector
-   reductions. All of the functions take the following form
-
-     V = goacc_reduction_foo (REF_TO_RES, LOCAL_VAR, LEVEL, OP, LID, RID)
-
-   where REF_TO_RES is a reference to the original reduction variable for
-   that particular reduction, LOCAL_VAR is the intermediate reduction
-   variable. LEVEL corresponds to the GOMP_DIM of the reduction, OP is a
-   tree code of the reduction operation. LID is a unique identifier of the
-   loop within a TU and RID is a unique id for a reduction within a loop.
-   V is the resulting intermediate reduction variable returned by the
-   function.  In general, V should equal LOCAL_VAR.  */
-DEF_INTERNAL_FN (GOACC_REDUCTION_SETUP, ECF_NOTHROW, NULL)
-DEF_INTERNAL_FN (GOACC_REDUCTION_INIT, ECF_NOTHROW, NULL)
-DEF_INTERNAL_FN (GOACC_REDUCTION_FINI, ECF_NOTHROW, NULL)
-DEF_INTERNAL_FN (GOACC_REDUCTION_TEARDOWN, ECF_NOTHROW, NULL)
-
 /* OpenACC looping abstraction.  See internal-fn.h for usage.  */
 DEF_INTERNAL_FN (GOACC_LOOP, ECF_PURE | ECF_NOTHROW, NULL)
+
+/* OpenACC reduction abstraction.  See internal-fn.h for usage.  */
+DEF_INTERNAL_FN (GOACC_REDUCTION, ECF_NOTHROW | ECF_LEAF, NULL)
Index: gcc/internal-fn.h
===================================================================
--- gcc/internal-fn.h	(revision 229392)
+++ gcc/internal-fn.h	(working copy)
@@ -66,6 +66,28 @@  enum ifn_goacc_loop_kind {
   IFN_GOACC_LOOP_BOUND    /* Limit of iteration value.  */
 };
 
+/* The GOACC_REDUCTION function defines a generic interface to support
+   gang, worker and vector reductions.  All calls are of the following
+   form:
+
+     V = REDUCTION (CODE, REF_TO_RES, LOCAL_VAR, LEVEL, OP, OFFSET)
+
+   REF_TO_RES is a reference to the original reduction variable, may be NULL
+   LOCAL_VAR is the intermediate reduction variable
+   LEVEL corresponds to the GOMP_DIM of the reduction
+   OP is the tree code of the reduction operation
+   OFFSET may be used as an offset into a reduction array for the
+          reductions occurring at this level.
+   In general the return value is LOCAL_VAR, which creates a data
+   dependency between calls operating on the same reduction.  */
+
+enum ifn_goacc_reduction_kind {
+  IFN_GOACC_REDUCTION_SETUP,
+  IFN_GOACC_REDUCTION_INIT,
+  IFN_GOACC_REDUCTION_FINI,
+  IFN_GOACC_REDUCTION_TEARDOWN
+};
+
 /* Initialize internal function tables.  */
 
 extern void init_internal_fns ();
Index: gcc/omp-low.c
===================================================================
--- gcc/omp-low.c	(revision 229392)
+++ gcc/omp-low.c	(working copy)
@@ -4940,8 +4940,8 @@  lower_rec_input_clauses (tree clauses, g
 	      break;
 
 	    case OMP_CLAUSE_REDUCTION:
-	      /* OpenACC reductions are initialized using the internal
-		 functions GOACC_REDUCTION_SETUP and GOACC_REDUCTION_INIT.  */
+	      /* OpenACC reductions are initialized using the
+		 GOACC_REDUCTION internal function.  */
 	      if (is_gimple_omp_oacc (ctx->stmt))
 		break;
 	      if (OMP_CLAUSE_REDUCTION_PLACEHOLDER (c))
@@ -5401,14 +5401,13 @@  lower_oacc_reductions (location_t loc, t
 		       gcall *fork, gcall *join, gimple_seq *fork_seq,
 		       gimple_seq *join_seq, omp_context *ctx)
 {
-  static unsigned oacc_lid = 0;
-  
   gimple_seq before_fork = NULL;
   gimple_seq after_fork = NULL;
   gimple_seq before_join = NULL;
   gimple_seq after_join = NULL;
-  unsigned count = 0;
-  tree lid = build_int_cst (unsigned_type_node, oacc_lid++);
+  tree init_code = NULL_TREE, fini_code = NULL_TREE,
+    setup_code = NULL_TREE, teardown_code = NULL_TREE;
+  unsigned offset = 0;
 
   for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
     if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION)
@@ -5473,30 +5472,59 @@  lower_oacc_reductions (location_t loc, t
 	else if (is_reference (orig))
 	  ref_to_res = build_simple_mem_ref (ref_to_res);
 
-	unsigned rcode = OMP_CLAUSE_REDUCTION_CODE (c);
+	enum tree_code rcode = OMP_CLAUSE_REDUCTION_CODE (c);
 	if (rcode == MINUS_EXPR)
 	  rcode = PLUS_EXPR;
+	else if (rcode == TRUTH_ANDIF_EXPR)
+	  rcode = BIT_AND_EXPR;
+	else if (rcode == TRUTH_ORIF_EXPR)
+	  rcode = BIT_IOR_EXPR;
 	tree op = build_int_cst (unsigned_type_node, rcode);
-	tree rid = build_int_cst (unsigned_type_node, count);	
 
-	tree setup = build_call_expr_internal_loc
-	  (loc, IFN_GOACC_REDUCTION_SETUP, TREE_TYPE (var), 6,
-	   unshare_expr (ref_to_res), var, level, op, lid, rid);
-	tree init = build_call_expr_internal_loc
-	  (loc, IFN_GOACC_REDUCTION_INIT, TREE_TYPE (var), 6,
-	   unshare_expr (ref_to_res), var, level, op, lid, rid);
-	tree fini = build_call_expr_internal_loc
-	  (loc, IFN_GOACC_REDUCTION_FINI, TREE_TYPE (var), 6,
-	   unshare_expr (ref_to_res), var, level, op, lid, rid);
-	tree teardown = build_call_expr_internal_loc
-	  (loc, IFN_GOACC_REDUCTION_TEARDOWN, TREE_TYPE (var), 6,
-	   ref_to_res, var, level, op, lid, rid);
-
-	gimplify_assign (var, setup, &before_fork);
-	gimplify_assign (var, init, &after_fork);
-	gimplify_assign (var, fini, &before_join);
-	gimplify_assign (var, teardown, &after_join);
-	count++;
+	/* Determine position in reduction buffer, which may be used
+	   by target.  */
+	enum machine_mode mode = TYPE_MODE (TREE_TYPE (var));
+	unsigned align = GET_MODE_ALIGNMENT (mode) /  BITS_PER_UNIT;
+	offset = (offset + align - 1) & ~(align - 1);
+	tree off = build_int_cst (sizetype, offset);
+	offset += GET_MODE_SIZE (mode);
+
+	if (!init_code)
+	  {
+	    init_code = build_int_cst (integer_type_node,
+				       IFN_GOACC_REDUCTION_INIT);
+	    fini_code = build_int_cst (integer_type_node,
+				       IFN_GOACC_REDUCTION_FINI);
+	    setup_code = build_int_cst (integer_type_node,
+					IFN_GOACC_REDUCTION_SETUP);
+	    teardown_code = build_int_cst (integer_type_node,
+					   IFN_GOACC_REDUCTION_TEARDOWN);
+	  }
+
+	tree setup_call
+	  = build_call_expr_internal_loc (loc, IFN_GOACC_REDUCTION,
+					  TREE_TYPE (var), 6, setup_code,
+					  unshare_expr (ref_to_res),
+					  var, level, op, off);
+	tree init_call
+	  = build_call_expr_internal_loc (loc, IFN_GOACC_REDUCTION,
+					  TREE_TYPE (var), 6, init_code,
+					  unshare_expr (ref_to_res),
+					  var, level, op, off);
+	tree fini_call
+	  = build_call_expr_internal_loc (loc, IFN_GOACC_REDUCTION,
+					  TREE_TYPE (var), 6, fini_code,
+					  unshare_expr (ref_to_res),
+					  var, level, op, off);
+	tree teardown_call
+	  = build_call_expr_internal_loc (loc, IFN_GOACC_REDUCTION,
+					  TREE_TYPE (var), 6, teardown_code,
+					  ref_to_res, var, level, op, off);
+
+	gimplify_assign (var, setup_call, &before_fork);
+	gimplify_assign (var, init_call, &after_fork);
+	gimplify_assign (var, fini_call, &before_join);
+	gimplify_assign (var, teardown_call, &after_join);
       }
 
   /* Now stitch things together.  */
@@ -19464,11 +19492,8 @@  oacc_loop_xform_head_tail (gcall *from,
 	  else if (c == code && stmt != from)
 	    break;
 	}
-      else if (gimple_call_internal_fn (stmt) == IFN_GOACC_REDUCTION_SETUP
-	       || gimple_call_internal_fn (stmt) == IFN_GOACC_REDUCTION_INIT
-	       || gimple_call_internal_fn (stmt) == IFN_GOACC_REDUCTION_FINI
-	       || gimple_call_internal_fn (stmt) == IFN_GOACC_REDUCTION_TEARDOWN)
-	*gimple_call_arg_ptr (stmt, 2) = replacement;
+      else if (gimple_call_internal_fn (stmt) == IFN_GOACC_REDUCTION)
+	*gimple_call_arg_ptr (stmt, 3) = replacement;
 
       gsi_next (&gsi);
       while (gsi_end_p (gsi))
@@ -19788,13 +19813,13 @@  default_goacc_fork_join (gcall *ARG_UNUS
    If LHS is not NULL
        emit 'LHS = VAR'   */
 
-bool
+void
 default_goacc_reduction (gcall *call)
 {
+  unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
   gimple_stmt_iterator gsi = gsi_for_stmt (call);
   tree lhs = gimple_call_lhs (call);
-  tree var = gimple_call_arg (call, 1);
-  unsigned code = gimple_call_internal_fn (call);
+  tree var = gimple_call_arg (call, 2);
   gimple_seq seq = NULL;
 
   if (code == IFN_GOACC_REDUCTION_SETUP
@@ -19802,7 +19827,7 @@  default_goacc_reduction (gcall *call)
     {
       /* Setup and Teardown need to copy from/to the receiver object,
 	 if there is one.  */
-      tree ref_to_res = gimple_call_arg (call, 0);
+      tree ref_to_res = gimple_call_arg (call, 1);
       
       if (!integer_zerop (ref_to_res))
 	{
@@ -19824,8 +19849,6 @@  default_goacc_reduction (gcall *call)
     gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));
 
   gsi_replace_with_seq (&gsi, seq, true);
-
-  return false;
 }
 
 /* Main entry point for oacc transformations which run on the device
@@ -19902,16 +19925,13 @@  execute_oacc_device_lower ()
 	    rescan = true;
 	    break;
 
-	  case IFN_GOACC_REDUCTION_SETUP:
-	  case IFN_GOACC_REDUCTION_INIT:
-	  case IFN_GOACC_REDUCTION_FINI:
-	  case IFN_GOACC_REDUCTION_TEARDOWN:
+	  case IFN_GOACC_REDUCTION:
 	    /* Mark the function for SSA renaming.  */
 	    mark_virtual_operands_for_renaming (cfun);
 
 	    /* If the level is -1, this ended up being an unused
 	       axis.  Handle as a default.  */
-	    if (integer_minus_onep (gimple_call_arg (call, 2)))
+	    if (integer_minus_onep (gimple_call_arg (call, 3)))
 	      default_goacc_reduction (call);
 	    else
 	      targetm.goacc.reduction (call);
Index: gcc/target.def
===================================================================
--- gcc/target.def	(revision 229392)
+++ gcc/target.def	(working copy)
@@ -1677,16 +1677,12 @@  default_goacc_fork_join)
 DEFHOOK
 (reduction,
 "This hook is used by the oacc_transform pass to expand calls to the\n\
-internal functions @var{GOACC_REDUCTION_SETUP},\n\
-@var{GOACC_REDUCTION_INIT},\n\
-@var{GOACC_REDUCTION_FINI} and\n\
-@var{GOACC_REDUCTION_TEARDOWN} into a sequence of gimple instructions.\n\
-@var{call} is gimple statement containing the call to the function.  This\n\
-hook removes statement @var{call} after the expanded sequence has been\n\
-inserted.  This hook is also responsible for allocating any storage for\n\
-reductions when necessary.  It returns @var{true} if the expanded\n\
-sequence introduces any calls to OpenACC-specific internal functions.",
-bool, (gcall *call),
+@var{GOACC_REDUCTION} internal function, into a sequence of gimple\n\
+instructions.  @var{call} is gimple statement containing the call to\n\
+the function.  This hook removes statement @var{call} after the\n\
+expanded sequence has been inserted.  This hook is also responsible\n\
+for allocating any storage for reductions when necessary.",
+void, (gcall *call),
 default_goacc_reduction)
 
 HOOK_VECTOR_END (goacc)
Index: gcc/targhooks.h
===================================================================
--- gcc/targhooks.h	(revision 229392)
+++ gcc/targhooks.h	(working copy)
@@ -109,7 +109,7 @@  extern void default_finish_cost (void *,
 extern void default_destroy_cost_data (void *);
 
 /* OpenACC hooks.  */
-extern bool default_goacc_reduction (gcall *);
+extern void default_goacc_reduction (gcall *);
 extern bool default_goacc_validate_dims (tree, int [], int);
 extern unsigned default_goacc_dim_limit (unsigned);
 extern bool default_goacc_fork_join (gcall *, const int [], bool);