diff mbox series

[11/11,nvptx] Generalize state propagation and synchronization

Message ID ece7531fec1a616f90999edaa9af0adc75f6cc50.1532464999.git.cesar@codesourcery.com
State New
Headers show
Series Initial vector length changes | expand

Commit Message

Cesar Philippidis July 24, 2018, 8:48 p.m. UTC
From: Tom de Vries <tom@codesourcery.com>

As the title mentions, this patch generalizes the state propagation and
synchronization code. Note that while the patch makes reference to
large_vectors, they are not enabled in nvptx_goacc_validate_dims.
Therefore, only the worker case is exercised in this patch.

2018-XX-YY  Tom de Vries  <tdevries@suse.de>
	    Cesar Philippidis  <cesar@codesourcery.com>

	gcc/
	* config/nvptx/nvptx.c (oacc_bcast_partition): Declare.
	(nvptx_option_override): Init oacc_bcast_partition.
	(nvptx_init_oacc_workers): New function.
	(nvptx_declare_function_name): Call nvptx_init_oacc_workers.
	(nvptx_needs_shared_bcast): New function.
	(nvptx_find_par): Generalize to enable vectors to use shared-memory
	to propagate state.
	(nvptx_shared_propagate): Initialize vector bcast partition and
	synchronization state.
	(nvptx_single):  Generalize to enable vectors to use shared-memory
	to propagate state.
	(nvptx_process_pars): Likewise.
	(nvptx_set_current_function): Initialize oacc_broadcast_partition.
	* config/nvptx/nvptx.h (struct machine_function): Add
	bcast_partition and sync_bar members.

(cherry picked from openacc-gcc-7-branch commit
628f439f33ed6f689656a1ed8ff74db97e7ec3ed, and commit
293e415e04d6b407e59118253e5fdfe539000cfe)
diff mbox series

Patch

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 7d49b4f..abd47ac 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -136,6 +136,7 @@  static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
    memory.  It'd be nice if PTX supported common blocks, because then
    this could be shared across TUs (taking the largest size).  */
 static unsigned oacc_bcast_size;
+static unsigned oacc_bcast_partition;
 static unsigned oacc_bcast_align;
 static GTY(()) rtx oacc_bcast_sym;
 
@@ -154,6 +155,8 @@  static bool need_softstack_decl;
 /* True if any function references __nvptx_uni.  */
 static bool need_unisimt_decl;
 
+static int nvptx_mach_max_workers ();
+
 /* Allocate a new, cleared machine_function structure.  */
 
 static struct machine_function *
@@ -213,6 +216,7 @@  nvptx_option_override (void)
   oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
   SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
   oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
+  oacc_bcast_partition = 0;
 
   worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
   SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
@@ -1101,6 +1105,40 @@  nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
   fprintf (file, "\t}\n");
 }
 
+/* Emit code to initialize OpenACC worker broadcast and synchronization
+   registers.  */
+
+static void
+nvptx_init_oacc_workers (FILE *file)
+{
+  fprintf (file, "\t{\n");
+  fprintf (file, "\t\t.reg.u32\t%%tidy;\n");
+  if (cfun->machine->bcast_partition)
+    {
+      fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n");
+      fprintf (file, "\t\t.reg.u64\t%%y64;\n");
+    }
+  fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
+  if (cfun->machine->bcast_partition)
+    {
+      fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
+      fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
+      fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
+      fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
+	       "// vector broadcast offset\n",
+	       REGNO (cfun->machine->bcast_partition),
+	       oacc_bcast_partition);
+    }
+  /* Verify oacc_bcast_size.  */
+  gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1)
+	      <= oacc_bcast_size);
+  if (cfun->machine->sync_bar)
+    fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
+	     "// vector synchronization barrier\n",
+	     REGNO (cfun->machine->sync_bar));
+  fprintf (file, "\t}\n");
+}
+
 /* Emit code to initialize predicate and master lane index registers for
    -muniform-simt code generation variant.  */
 
@@ -1327,6 +1365,8 @@  nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
   if (cfun->machine->unisimt_predicate
       || (cfun->machine->has_simtreg && !crtl->is_leaf))
     nvptx_init_unisimt_predicate (file);
+  if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
+    nvptx_init_oacc_workers (file);
 }
 
 /* Output code for switching uniform-simt state.  ENTERING indicates whether
@@ -3045,6 +3085,19 @@  nvptx_split_blocks (bb_insn_map_t *map)
     }
 }
 
+/* Return true if MASK contains parallelism that requires shared
+   memory to broadcast.  */
+
+static bool
+nvptx_needs_shared_bcast (unsigned mask)
+{
+  bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
+  bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
+    && nvptx_mach_vector_length () != PTX_WARP_SIZE;
+
+  return worker || large_vector;
+}
+
 /* BLOCK is a basic block containing a head or tail instruction.
    Locate the associated prehead or pretail instruction, which must be
    in the single predecessor block.  */
@@ -3120,7 +3173,7 @@  nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
 	    par = new parallel (par, mask);
 	    par->forked_block = block;
 	    par->forked_insn = end;
-	    if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+	    if (nvptx_needs_shared_bcast (mask))
 	      par->fork_insn
 		= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
 	  }
@@ -3135,7 +3188,7 @@  nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
 	    gcc_assert (par->mask == mask);
 	    par->join_block = block;
 	    par->join_insn = end;
-	    if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+	    if (nvptx_needs_shared_bcast (mask))
 	      par->joining_insn
 		= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
 	    par = par->parent;
@@ -3992,11 +4045,33 @@  nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
   gcc_assert (empty == !data.offset);
   if (data.offset)
     {
+      rtx bcast_sym = oacc_bcast_sym;
+
       /* Stuff was emitted, initialize the base pointer now.  */
-      rtx init = gen_rtx_SET (data.base, oacc_bcast_sym);
+      if (vector && nvptx_mach_max_workers () > 1)
+	{
+	  if (!cfun->machine->bcast_partition)
+	    {
+	      /* It would be nice to place this register in
+		 DATA_AREA_SHARED.  */
+	      cfun->machine->bcast_partition = gen_reg_rtx (DImode);
+	    }
+	  if (!cfun->machine->sync_bar)
+	    cfun->machine->sync_bar = gen_reg_rtx (SImode);
+
+	  bcast_sym = cfun->machine->bcast_partition;
+	}
+
+      rtx init = gen_rtx_SET (data.base, bcast_sym);
       emit_insn_after (init, insn);
 
-      oacc_bcast_size = MAX (oacc_bcast_size, data.offset);
+      unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
+      unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
+			   ? nvptx_mach_max_workers () + 1
+			   : 1);
+
+      oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
+      oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
     }
   return empty;
 }
@@ -4301,7 +4376,8 @@  nvptx_single (unsigned mask, basic_block from, basic_block to)
     {
       rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
 
-      if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
+      if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
+	  && nvptx_mach_vector_length () == PTX_WARP_SIZE)
 	{
 	  /* Vector mode only, do a shuffle.  */
 #if WORKAROUND_PTXJIT_BUG
@@ -4368,23 +4444,51 @@  nvptx_single (unsigned mask, basic_block from, basic_block to)
 	  /* Includes worker mode, do spill & fill.  By construction
 	     we should never have worker mode only. */
 	  broadcast_data_t data;
+	  unsigned size = GET_MODE_SIZE (SImode);
+	  bool vector = true;
 	  rtx barrier = GEN_INT (0);
 	  int threads = 0;
 
+	  if (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask)
+	    vector = false;
+
 	  data.base = oacc_bcast_sym;
 	  data.ptr = 0;
 
-	  oacc_bcast_size = MAX (oacc_bcast_size, GET_MODE_SIZE (SImode));
+	  if (vector
+	      && nvptx_mach_max_workers () > 1
+	      && cfun->machine->bcast_partition)
+	    data.base = cfun->machine->bcast_partition;
+
+	  gcc_assert (data.base != NULL);
+
+	  unsigned int psize = ROUND_UP (size, oacc_bcast_align);
+	  unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
+			       ? nvptx_mach_max_workers () + 1
+			       : 1);
+
+	  oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
+	  oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
 
 	  data.offset = 0;
 	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
-						    false),
+						    vector),
 			    before);
+
+	  if (vector
+	      && nvptx_mach_max_workers () > 1
+	      && cfun->machine->sync_bar)
+	    {
+	      barrier = cfun->machine->sync_bar;
+	      threads = nvptx_mach_vector_length ();
+	    }
+
 	  /* Barrier so other workers can see the write.  */
 	  emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
 	  data.offset = 0;
 	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
-						    false), tail);
+						    vector),
+			    tail);
 	  /* This barrier is needed to avoid worker zero clobbering
 	     the broadcast buffer before all the other workers have
 	     had a chance to read this instance of it.  */
@@ -4502,17 +4606,26 @@  nvptx_process_pars (parallel *par)
     }
 
   bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
+  bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
+  bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
+		      && nvptx_mach_vector_length () > PTX_WARP_SIZE);
 
-  if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+  if (worker || large_vector)
     {
       nvptx_shared_propagate (false, is_call, par->forked_block,
-			      par->forked_insn, false);
+			      par->forked_insn, !worker);
       bool empty = nvptx_shared_propagate (true, is_call,
 					   par->forked_block, par->fork_insn,
-					   false);
+					   !worker);
       rtx barrier = GEN_INT (0);
       int threads = 0;
 
+      if (!worker && cfun->machine->sync_bar)
+	{
+	  barrier = cfun->machine->sync_bar;
+	  threads = nvptx_mach_vector_length ();
+	}
+
       if (!empty || !is_call)
 	{
 	  /* Insert begin and end synchronizations.  */
@@ -6013,6 +6126,7 @@  nvptx_set_current_function (tree fndecl)
     return;
 
   nvptx_previous_fndecl = fndecl;
+  oacc_bcast_partition = 0;
 }
 
 #undef TARGET_OPTION_OVERRIDE
diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h
index 90fb2c9..b923560 100644
--- a/gcc/config/nvptx/nvptx.h
+++ b/gcc/config/nvptx/nvptx.h
@@ -212,6 +212,10 @@  struct GTY(()) machine_function
   rtx axis_predicate[2]; /* Neutering predicates.  */
   int axis_dim[2]; /* Maximum number of threads on each axis, dim[0] is
 		      vector_length, dim[1] is num_workers.  */
+  rtx bcast_partition; /* Register containing the size of each
+			  vector's partition of share-memory used to
+			  broadcast state.  */
+  rtx sync_bar; /* Synchronization barrier ID for vectors.  */
   rtx unisimt_master; /* 'Master lane index' for -muniform-simt.  */
   rtx unisimt_predicate; /* Predicate for -muniform-simt.  */
   rtx unisimt_location; /* Mask location for -muniform-simt.  */