diff mbox

[gomp4] Worker reduction builtin

Message ID 55C0A70B.8050309@acm.org
State New
Headers show

Commit Message

Nathan Sidwell Aug. 4, 2015, 11:50 a.m. UTC
I've committed this to the gomp4 branch.  It creates a new builtin to be used for
worker-level reductions that Cesar is working on.  When the builtin is expanded 
it allocates a slot in a new .shared array to hold the reduction variable.  This 
array is reused for reductions on different loops.

I also realized the lock and unlock expanders needed to emit memory barriers so
that writes made by one thread in the protected region could be seen by other
threads in the region.

nathan
diff mbox

Patch

2015-08-04  Nathan Sidwell  <nathan@codesourcery.com>

	* config/nvptx/nvptx.md (UNSPECV_MEMBAR): New.
	(nvptx_membar): New insn.
	* config/nvptx/nvptx.c (BARRIER_SHARED, BARRIER_GLOBAL,
	BARRIER_SYS): New.
	(lock_names, lock_space): Constify.
	(lock_level): New.
	(worker_red_hwm, worker_red_align, worker_red_name,
	worker_red_sym): New.
	(var_red_t, struct loop_red): New types.
	(loop_red): New.
	(nvptx_print_operand): Add 'B' case.
	(nvptx_reorg_reductions): New.
	(nvptx_reorg): Call it.
	(nvptx_file_end): Emit worker reduction array.
	(struct builtin_description): Remove builtin pointer from
	expander.
	(nvptx_expand_shuffle_down, nvptx_expand_lock_unlock,
	nvptx_expand_lock, nvptx_expand_unlock): Adjust.
	(nvptx_expand_lock_unlock): Emit barrier too.
	(nvptx_expand_work_red_addr): New.
	(NT_UINTPTR_UINT_UINT, NT_ULLPTR_UINT_UINT, NT_FLTPTR_UINT_UINT,
	NT_DBLPTR_UINT_UINT): New.
	(builtins): Add new builtins.
	(nvptx_init_builtins): Create new types.
	(nvptx_expand_builtin): Adjust expander call.

Index: gcc/config/nvptx/nvptx.md
===================================================================
--- gcc/config/nvptx/nvptx.md	(revision 226539)
+++ gcc/config/nvptx/nvptx.md	(working copy)
@@ -65,6 +65,7 @@ 
    UNSPECV_CAS
    UNSPECV_XCHG
    UNSPECV_BARSYNC
+   UNSPECV_MEMBAR
    UNSPECV_DIM_POS
 
    UNSPECV_FORK
@@ -1564,6 +1565,11 @@ 
   ""
   "bar.sync\\t%0;")
 
+(define_insn "nvptx_membar"
+  [(unspec_volatile [(match_operand:SI 0 "const_int_operand" "")]
+		    UNSPECV_MEMBAR)]
+  ""
+  "membar%B0;") ;; %B prints the BARRIER_* level in operand 0 as .cta/.gl/.sys
 
 ;; spinlock and unlock
 (define_insn "nvptx_spinlock"
Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c	(revision 226539)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -69,6 +69,11 @@ 
 #define SHUFFLE_BFLY 2
 #define SHUFFLE_IDX 3
 
+/* Memory barrier levels.  */
+#define BARRIER_SHARED 0
+#define BARRIER_GLOBAL 1
+#define BARRIER_SYS 2
+
 /* Record the function decls we've written, and the libfuncs and function
    decls corresponding to them.  */
 static std::stringstream func_decls;
@@ -107,15 +112,47 @@  static GTY(()) rtx worker_bcast_sym;
 #define LOCK_GLOBAL 0
 #define LOCK_SHARED 1
 #define LOCK_MAX    2
-static const char *const lock_names[] = 
-  {"__global_lock", "__shared_lock"};
-static const char *const lock_regions[] = 
-  {"global", "shared"};
-static unsigned lock_space[] =
-  {ADDR_SPACE_GLOBAL, ADDR_SPACE_SHARED};
+static const char *const lock_names[] = {"__global_lock", "__shared_lock"};
+static const unsigned lock_space[] = {ADDR_SPACE_GLOBAL, ADDR_SPACE_SHARED};
+static const unsigned lock_level[] = {BARRIER_GLOBAL, BARRIER_SHARED};
 static GTY(()) rtx lock_syms[LOCK_MAX];
 static bool lock_used[LOCK_MAX];
 
+/* Size of buffer needed for worker reductions.  This has to be
+   disjoint from the worker broadcast array, as both may be live
+   concurrently.  */
+static unsigned worker_red_hwm;
+static unsigned worker_red_align;
+#define worker_red_name "__worker_red"
+static GTY(()) rtx worker_red_sym;
+
+/* To process worker-level reductions we need a buffer in CTA local
+   (.shared) memory.  As the number of loops per function and number
+   of reductions per loop are likely to be small numbers, we use
+   simple unsorted vectors to hold the mappings.  */
+
+/* Mapping from a reduction to an offset within the worker reduction
+   array.  */
+typedef std::pair<unsigned, unsigned> var_red_t;
+
+/* Mapping from loops within a function to lists of reductions on that
+   loop.  */
+struct loop_red
+{
+  unsigned id;  /* Loop ID.  */
+  unsigned hwm;  /* Allocated worker buffer for this loop.  */
+  auto_vec<var_red_t> vars;   /* Reduction variables of the loop.  */
+
+  loop_red (unsigned id_)
+  :id (id_), hwm (0) 
+  {
+  }
+};
+
+/* It would be nice to put this into machine_function, but auto_vec
+   pulls in too much other stuff.   */
+static auto_vec<loop_red> loop_reds;
+
 /* Allocate a new, cleared machine_function structure.  */
 
 static struct machine_function *
@@ -147,6 +184,9 @@  nvptx_option_override (void)
   worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, worker_bcast_name);
   worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
 
+  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, worker_red_name);
+  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
+
   for (unsigned ix = LOCK_MAX; ix--;)
     lock_syms[ix] = gen_rtx_SYMBOL_REF (Pmode, lock_names[ix]);
 }
@@ -1893,6 +1933,7 @@  nvptx_print_operand_address (FILE *file,
    A -- print an address space identifier for a MEM
    c -- print an opcode suffix for a comparison operator, including a type code
    f -- print a full reg even for something that must always be split
+   B -- print a memory barrier level specified by CONST_INT
    R -- print an address space specified by CONST_INT
    S -- print a shuffle kind specified by CONST_INT
    t -- print a type opcode suffix, promoting QImode to 32 bits
@@ -1936,6 +1977,15 @@  nvptx_print_operand (FILE *file, rtx x,
       }
       break;
 
+    case 'B':
+      {
+	unsigned kind = UINTVAL (x);
+	static const char *const kinds[] = 
+	  {"cta", "gl", "sys"};
+	fprintf (file, ".%s", kinds[kind]);
+      }
+      break;
+      
     case 't':
       op_mode = nvptx_underlying_object_mode (x);
       fprintf (file, "%s", nvptx_ptx_type_from_mode (op_mode, true));
@@ -2945,6 +2995,19 @@  nvptx_neuter_pars (parallel *par, unsign
     nvptx_neuter_pars (par->next, modes, outer);
 }
 
+static void
+nvptx_reorg_reductions (void)
+{
+  unsigned ix;
+
+  for (ix = loop_reds.length (); ix--;)
+    {
+      if (loop_reds[ix].hwm > worker_red_hwm)
+	worker_red_hwm = loop_reds[ix].hwm;
+      loop_reds.pop ();
+    }
+}
+
 /* NVPTX machine dependent reorg.
    Insert vector and worker single neutering code and state
    propagation when entering partioned mode.  Fixup subregs.  */
@@ -2952,6 +3015,8 @@  nvptx_neuter_pars (parallel *par, unsign
 static void
 nvptx_reorg (void)
 {
+  nvptx_reorg_reductions ();
+  
   /* We are freeing block_for_insn in the toplev to keep compatibility
      with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
   compute_bb_for_insn ();
@@ -3171,13 +3236,27 @@  nvptx_file_end (void)
 	       worker_bcast_name, worker_bcast_hwm);
     }
 
+  if (worker_red_hwm)
+    {
+      /* Define the reduction buffer.  */
+
+      worker_red_hwm = (worker_red_hwm + worker_red_align - 1)
+	& ~(worker_red_align - 1);
+      
+      fprintf (asm_out_file, "// BEGIN VAR DEF: %s\n", worker_red_name);
+      fprintf (asm_out_file, ".shared .align %d .u8 %s[%d];\n",
+	       worker_red_align,
+	       worker_red_name, worker_red_hwm);
+    }
+
   /* Emit lock variables.  */
   for (unsigned ix = LOCK_MAX; ix--;)
     if (lock_used[ix])
       {
 	fprintf (asm_out_file, "// BEGIN VAR DEF: %s\n", lock_names[ix]);
-	fprintf (asm_out_file, ".%s .u32 %s;\n",
-		 lock_regions[ix], lock_names[ix]);
+	fprintf (asm_out_file, "%s .u32 %s;\n",
+		 nvptx_section_from_addr_space (lock_space[ix]),
+		 lock_names[ix]);
       }
 }
 
@@ -3187,15 +3266,12 @@  struct builtin_description
 {
   const char *name;
   unsigned short type;
-  rtx (*expander) (const struct builtin_description *, tree,
-		   rtx, machine_mode, int);
+  rtx (*expander) (tree, rtx, machine_mode, int);
 };
 
-
 /* Expander for the shuffle down builtins.  */
 static rtx
-nvptx_expand_shuffle_down (const struct builtin_description *ARG_UNUSED (desc),
-			   tree exp, rtx target, machine_mode mode, int ignore)
+nvptx_expand_shuffle_down (tree exp, rtx target, machine_mode mode, int ignore)
 {
   if (ignore)
     return target;
@@ -3222,8 +3298,7 @@  nvptx_expand_shuffle_down (const struct
 
 /* Expander for locking and unlocking.  */
 static rtx
-nvptx_expand_lock_unlock (const struct builtin_description *desc,
-			   tree exp, bool lock)
+nvptx_expand_lock_unlock (tree exp, bool lock)
 {
   rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
 			 NULL_RTX, SImode, EXPAND_NORMAL);
@@ -3232,41 +3307,103 @@  nvptx_expand_lock_unlock (const struct b
   
   kind = GET_CODE (src) == CONST_INT ? INTVAL  (src) : LOCK_MAX;
   if (kind >= LOCK_MAX)
-    error ("builtin %<%s%> requires constant argument less than %u",
-	   desc->name, LOCK_MAX);
+    error ("builtin %D requires constant argument less than %u",
+	   get_callee_fndecl (exp), LOCK_MAX);
   lock_used[kind] = true;
 
   rtx mem = gen_rtx_MEM (SImode, lock_syms[kind]);
   rtx space = GEN_INT (lock_space[kind]);
-  
+  rtx barrier = gen_nvptx_membar (GEN_INT (lock_level[kind]));
+
+  if (!lock)
+    emit_insn (barrier);
   if (lock)
     pat = gen_nvptx_spinlock (mem, space,
 			      gen_reg_rtx (SImode), gen_reg_rtx (BImode));
   else
     pat = gen_nvptx_spinunlock (mem, space);
-  if (pat)
-    emit_insn (pat);
+  emit_insn (pat);
+  if (lock)
+    emit_insn (barrier);
   return const0_rtx;
 }
 
 /* Lock expander.  */
 
 static rtx
-nvptx_expand_lock (const struct builtin_description *desc,
-		   tree exp, rtx ARG_UNUSED (target),
+nvptx_expand_lock (tree exp, rtx ARG_UNUSED (target),
 		   machine_mode ARG_UNUSED (mode), int ARG_UNUSED (ignore))
 {
-  return nvptx_expand_lock_unlock (desc, exp, true);
+  return nvptx_expand_lock_unlock (exp, true);
 }
 
 /* Unlock expander.  */
 
 static rtx
-nvptx_expand_unlock (const struct builtin_description *desc,
-		   tree exp, rtx ARG_UNUSED (target),
-		   machine_mode ARG_UNUSED (mode), int ARG_UNUSED (ignore))
+nvptx_expand_unlock (tree exp, rtx ARG_UNUSED (target),
+		     machine_mode ARG_UNUSED (mode), int ARG_UNUSED (ignore))
+{
+  return nvptx_expand_lock_unlock (exp, false);
+}
+
+/* Worker reduction address expander.  */
+static rtx
+nvptx_expand_work_red_addr (tree exp, rtx target,
+			    machine_mode ARG_UNUSED (mode),
+			    int ignore)
 {
-  return nvptx_expand_lock_unlock (desc, exp, false);
+  if (ignore)
+    return target;
+  
+  rtx loop_id = expand_expr (CALL_EXPR_ARG (exp, 0),
+			     NULL_RTX, mode, EXPAND_NORMAL);
+  rtx red_id = expand_expr (CALL_EXPR_ARG (exp, 1),
+			     NULL_RTX, mode, EXPAND_NORMAL);
+  gcc_assert (GET_CODE (loop_id) == CONST_INT
+	      && GET_CODE (red_id) == CONST_INT);
+  gcc_assert (REG_P (target));
+
+  unsigned lid = (unsigned)UINTVAL (loop_id);
+  unsigned rid = (unsigned)UINTVAL (red_id);
+
+  unsigned ix;
+
+  for (ix = 0; ix != loop_reds.length (); ix++)
+    if (loop_reds[ix].id == lid)
+      goto found_lid;
+  /* Allocate a new loop.  */
+  loop_reds.safe_push (loop_red (lid));
+ found_lid:
+  loop_red &loop = loop_reds[ix];
+  for (ix = 0; ix != loop.vars.length (); ix++)
+    if (loop.vars[ix].first == rid)
+      goto found_rid;
+
+  /* Allocate a new var. */
+  {
+    tree type = TREE_TYPE (TREE_TYPE (exp));
+    enum machine_mode mode = TYPE_MODE (type);
+    unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
+    unsigned off = loop.hwm;
+
+    if (align > worker_red_align)
+      worker_red_align = align;
+    off = (off + align - 1) & ~(align -1);
+    loop.hwm = off + GET_MODE_SIZE (mode);
+    loop.vars.safe_push (var_red_t (rid, off));
+  }
+ found_rid:
+
+  /* Return offset into worker reduction array.  */
+  unsigned offset = loop.vars[ix].second;
+  
+  rtx addr = gen_reg_rtx (Pmode);
+  emit_move_insn (addr,
+		  gen_rtx_PLUS (Pmode, worker_red_sym, GEN_INT (offset)));
+  emit_insn (gen_rtx_SET (target,
+			  gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
+					  UNSPEC_FROM_SHARED)));
+  return target;
 }
 
 enum nvptx_types
@@ -3276,7 +3413,10 @@  enum nvptx_types
     NT_FLT_FLT_INT,
     NT_DBL_DBL_INT,
     NT_VOID_UINT,
-
+    NT_UINTPTR_UINT_UINT,
+    NT_ULLPTR_UINT_UINT,
+    NT_FLTPTR_UINT_UINT,
+    NT_DBLPTR_UINT_UINT,
     NT_MAX
   };
 
@@ -3292,6 +3432,14 @@  static const struct builtin_description
    nvptx_expand_shuffle_down},
   {"__builtin_nvptx_lock", NT_VOID_UINT, nvptx_expand_lock},
   {"__builtin_nvptx_unlock", NT_VOID_UINT, nvptx_expand_unlock},
+  {"__builtin_nvptx_work_red_addr", NT_UINTPTR_UINT_UINT,
+   nvptx_expand_work_red_addr},
+  {"__builtin_nvptx_work_red_addrll", NT_ULLPTR_UINT_UINT,
+   nvptx_expand_work_red_addr},
+  {"__builtin_nvptx_work_red_addrf", NT_FLTPTR_UINT_UINT,
+   nvptx_expand_work_red_addr},
+  {"__builtin_nvptx_work_red_addrd", NT_DBLPTR_UINT_UINT,
+   nvptx_expand_work_red_addr},
 };
 
 #define NVPTX_BUILTIN_MAX (sizeof (builtins) / sizeof (builtins[0]))
@@ -3331,10 +3479,31 @@  nvptx_init_builtins (void)
   types[NT_VOID_UINT]
     = build_function_type_list (void_type_node, unsigned_type_node, NULL_TREE);
 
+  types[NT_UINTPTR_UINT_UINT]
+    = build_function_type_list (build_pointer_type (unsigned_type_node),
+				unsigned_type_node, unsigned_type_node,
+				NULL_TREE);
+
+  types[NT_ULLPTR_UINT_UINT]
+    = build_function_type_list (build_pointer_type
+				(long_long_unsigned_type_node),
+				unsigned_type_node, unsigned_type_node,
+				NULL_TREE);
+
+  types[NT_FLTPTR_UINT_UINT]
+    = build_function_type_list (build_pointer_type (float_type_node),
+				unsigned_type_node, unsigned_type_node,
+				NULL_TREE);
+
+  types[NT_DBLPTR_UINT_UINT]
+    = build_function_type_list (build_pointer_type (double_type_node),
+				unsigned_type_node, unsigned_type_node,
+				NULL_TREE);
+
   for (ix = 0; ix != NVPTX_BUILTIN_MAX; ix++)
     nvptx_builtin_decls[ix]
-      =  add_builtin_function (builtins[ix].name, types[builtins[ix].type],
-			       ix, BUILT_IN_MD, NULL, NULL_TREE);
+      = add_builtin_function (builtins[ix].name, types[builtins[ix].type],
+			      ix, BUILT_IN_MD, NULL, NULL);
 }
 
 /* Expand an expression EXP that calls a built-in function,
@@ -3352,7 +3521,7 @@  nvptx_expand_builtin (tree exp, rtx targ
   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
   const struct builtin_description *d = &builtins[DECL_FUNCTION_CODE (fndecl)];
 
-  return d->expander (d, exp, target, mode, ignore);
+  return d->expander (exp, target, mode, ignore);
 }
 
 #undef TARGET_OPTION_OVERRIDE