diff mbox

[PTX] OpenACC complex double reductions

Message ID 564C82DE.1050100@acm.org
State New
Headers show

Commit Message

Nathan Sidwell Nov. 18, 2015, 1:53 p.m. UTC
Here's the version of the complex double reduction patch I've committed to trunk.

There's no atomic cmp&swap larger than 64 bits, so we have to do something else. 
  I started with a patch to synthesize such an operation using a global lock, 
and fitted it into the current scheme.  But that (a) ended up looking 
complicated and (b) had a lock.  As we have to use a lock, one might as well go 
for a mutex scheme.

The lock variable has to be in global memory, even if it's protecting .shared 
state.  Locking in .shared memory can introduce resource starvation as there's 
then no descheduling of the thread attempting to get the lock.  Nvidia have 
confirmed that global locks do not suffer this problem.

nathan
diff mbox

Patch

2015-11-18  Nathan Sidwell  <nathan@codesourcery.com>

	gcc/
	* config/nvptx/nvptx.c (global_lock_var): New.
	(nvptx_global_lock_addr): New.
	(nvptx_lockless_update): Recomment and adjust for clarity.
	(nvptx_lockfull_update): New.
	(nvptx_reduction_update): New.
	(nvptx_goacc_reduction_fini): Call it.

	libgcc/
	* config/nvptx/reduction.c: New.
	* config/nvptx/t-nvptx (LIB2ADD): Add it.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c: Add
	worker & gang cases.
	* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c: Likewise.

Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c	(revision 230544)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -114,6 +114,9 @@  static unsigned worker_red_align;
 #define worker_red_name "__worker_red"
 static GTY(()) rtx worker_red_sym;
 
+/* Global lock variable, needed for 128bit worker & gang reductions.  */
+static GTY(()) tree global_lock_var;
+
 /* Allocate a new, cleared machine_function structure.  */
 
 static struct machine_function *
@@ -3681,8 +3684,45 @@  nvptx_generate_vector_shuffle (location_
   gimplify_assign (dest_var, expr, seq);
 }
 
-/* Insert code to locklessly update  *PTR with *PTR OP VAR just before
-   GSI.  */
+/* Lazily generate the global lock var decl and return its address.  */
+
+static tree
+nvptx_global_lock_addr ()
+{
+  tree v = global_lock_var;
+  
+  if (!v)
+    {
+      tree name = get_identifier ("__reduction_lock");
+      tree type = build_qualified_type (unsigned_type_node,
+					TYPE_QUAL_VOLATILE);
+      v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
+      global_lock_var = v;
+      DECL_ARTIFICIAL (v) = 1;
+      DECL_EXTERNAL (v) = 1;
+      TREE_STATIC (v) = 1;
+      TREE_PUBLIC (v) = 1;
+      TREE_USED (v) = 1;
+      mark_addressable (v);
+      mark_decl_referenced (v);
+    }
+
+  return build_fold_addr_expr (v);
+}
+
+/* Insert code to locklessly update *PTR with *PTR OP VAR just before
+   GSI.  We use a lockless scheme for nearly all cases, which looks
+   like:
+     actual = initval(OP);
+     do {
+       guess = actual;
+       write = guess OP myval;
+       actual = cmp&swap (ptr, guess, write)
+     } while (actual bit-different-to guess);
+   return write;
+
+   This relies on a cmp&swap instruction, which is available for 32-
+   and 64-bit types.  Larger types must use a locking scheme.  */
 
 static tree
 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
@@ -3690,46 +3730,30 @@  nvptx_lockless_update (location_t loc, g
 {
   unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
   tree_code code = NOP_EXPR;
-  tree type = unsigned_type_node;
-
-  enum machine_mode mode = TYPE_MODE (TREE_TYPE (var));
+  tree arg_type = unsigned_type_node;
+  tree var_type = TREE_TYPE (var);
 
-  if (!INTEGRAL_MODE_P (mode))
+  if (TREE_CODE (var_type) == COMPLEX_TYPE
+      || TREE_CODE (var_type) == REAL_TYPE)
     code = VIEW_CONVERT_EXPR;
-  if (GET_MODE_SIZE (mode) == GET_MODE_SIZE (DImode))
+
+  if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
     {
+      arg_type = long_long_unsigned_type_node;
       fn = NVPTX_BUILTIN_CMP_SWAPLL;
-      type = long_long_unsigned_type_node;
     }
 
+  tree swap_fn = nvptx_builtin_decl (fn, true);
+
   gimple_seq init_seq = NULL;
-  tree init_var = make_ssa_name (type);
-  tree init_expr = omp_reduction_init_op (loc, op, TREE_TYPE (var));
-  init_expr = fold_build1 (code, type, init_expr);
+  tree init_var = make_ssa_name (arg_type);
+  tree init_expr = omp_reduction_init_op (loc, op, var_type);
+  init_expr = fold_build1 (code, arg_type, init_expr);
   gimplify_assign (init_var, init_expr, &init_seq);
   gimple *init_end = gimple_seq_last (init_seq);
 
   gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
   
-  gimple_seq loop_seq = NULL;
-  tree expect_var = make_ssa_name (type);
-  tree actual_var = make_ssa_name (type);
-  tree write_var = make_ssa_name (type);
-  
-  tree write_expr = fold_build1 (code, TREE_TYPE (var), expect_var);
-  write_expr = fold_build2 (op, TREE_TYPE (var), write_expr, var);
-  write_expr = fold_build1 (code, type, write_expr);
-  gimplify_assign (write_var, write_expr, &loop_seq);
-
-  tree swap_expr = nvptx_builtin_decl (fn, true);
-  swap_expr = build_call_expr_loc (loc, swap_expr, 3,
-				   ptr, expect_var, write_var);
-  gimplify_assign (actual_var, swap_expr, &loop_seq);
-
-  gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
-				   NULL_TREE, NULL_TREE);
-  gimple_seq_add_stmt (&loop_seq, cond);
-
   /* Split the block just after the init stmts.  */
   basic_block pre_bb = gsi_bb (*gsi);
   edge pre_edge = split_block (pre_bb, init_end);
@@ -3738,12 +3762,34 @@  nvptx_lockless_update (location_t loc, g
   /* Reset the iterator.  */
   *gsi = gsi_for_stmt (gsi_stmt (*gsi));
 
-  /* Insert the loop statements.  */
-  gimple *loop_end = gimple_seq_last (loop_seq);
-  gsi_insert_seq_before (gsi, loop_seq, GSI_SAME_STMT);
+  tree expect_var = make_ssa_name (arg_type);
+  tree actual_var = make_ssa_name (arg_type);
+  tree write_var = make_ssa_name (arg_type);
+  
+  /* Build and insert the reduction calculation.  */
+  gimple_seq red_seq = NULL;
+  tree write_expr = fold_build1 (code, var_type, expect_var);
+  write_expr = fold_build2 (op, var_type, write_expr, var);
+  write_expr = fold_build1 (code, arg_type, write_expr);
+  gimplify_assign (write_var, write_expr, &red_seq);
+
+  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
+
+  /* Build & insert the cmp&swap sequence.  */
+  gimple_seq latch_seq = NULL;
+  tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
+					ptr, expect_var, write_var);
+  gimplify_assign (actual_var, swap_expr, &latch_seq);
+
+  gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
+				   NULL_TREE, NULL_TREE);
+  gimple_seq_add_stmt (&latch_seq, cond);
+
+  gimple *latch_end = gimple_seq_last (latch_seq);
+  gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
 
-  /* Split the block just after the loop stmts.  */
-  edge post_edge = split_block (loop_bb, loop_end);
+  /* Split the block just after the latch stmts.  */
+  edge post_edge = split_block (loop_bb, latch_end);
   basic_block post_bb = post_edge->dest;
   loop_bb = post_edge->src;
   *gsi = gsi_for_stmt (gsi_stmt (*gsi));
@@ -3762,7 +3808,123 @@  nvptx_lockless_update (location_t loc, g
   loop->latch = loop_bb;
   add_loop (loop, loop_bb->loop_father);
 
-  return fold_build1 (code, TREE_TYPE (var), write_var);
+  return fold_build1 (code, var_type, write_var);
+}
+
+/* Insert code to lockfully update *PTR with *PTR OP VAR just before
+   GSI.  This is necessary for types larger than 64 bits, where there
+   is no cmp&swap instruction to implement a lockless scheme.  We use
+   a lock variable in global memory.
+
+   while (cmp&swap (&lock_var, 0, 1))
+     continue;
+   T accum = *ptr;
+   accum = accum OP var;
+   *ptr = accum;
+   cmp&swap (&lock_var, 1, 0);
+   return accum;
+
+   A lock in global memory is necessary to force execution engine
+   descheduling and avoid resource starvation that can occur if the
+   lock is in .shared memory.  */
+
+static tree
+nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
+		       tree ptr, tree var, tree_code op)
+{
+  tree var_type = TREE_TYPE (var);
+  tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
+  tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
+  tree uns_locked = build_int_cst (unsigned_type_node, 1);
+
+  /* Split the block just before the gsi.  Insert a gimple nop to make
+     this easier.  */
+  gimple *nop = gimple_build_nop ();
+  gsi_insert_before (gsi, nop, GSI_SAME_STMT);
+  basic_block entry_bb = gsi_bb (*gsi);
+  edge entry_edge = split_block (entry_bb, nop);
+  basic_block lock_bb = entry_edge->dest;
+  /* Reset the iterator.  */
+  *gsi = gsi_for_stmt (gsi_stmt (*gsi));
+
+  /* Build and insert the locking sequence.  */
+  gimple_seq lock_seq = NULL;
+  tree lock_var = make_ssa_name (unsigned_type_node);
+  tree lock_expr = nvptx_global_lock_addr ();
+  lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
+				   uns_unlocked, uns_locked);
+  gimplify_assign (lock_var, lock_expr, &lock_seq);
+  gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
+				   NULL_TREE, NULL_TREE);
+  gimple_seq_add_stmt (&lock_seq, cond);
+  gimple *lock_end = gimple_seq_last (lock_seq);
+  gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
+
+  /* Split the block just after the lock sequence.  */
+  edge locked_edge = split_block (lock_bb, lock_end);
+  basic_block update_bb = locked_edge->dest;
+  lock_bb = locked_edge->src;
+  *gsi = gsi_for_stmt (gsi_stmt (*gsi));
+  
+  /* Create the lock loop ... */
+  locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
+  make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
+  set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
+  set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
+
+  /* ... and the loop structure.  */
+  loop *lock_loop = alloc_loop ();
+  lock_loop->header = lock_bb;
+  lock_loop->latch = lock_bb;
+  lock_loop->nb_iterations_estimate = 1;
+  lock_loop->any_estimate = true;
+  add_loop (lock_loop, entry_bb->loop_father);
+
+  /* Build and insert the reduction calculation.  */
+  gimple_seq red_seq = NULL;
+  tree acc_in = make_ssa_name (var_type);
+  tree ref_in = build_simple_mem_ref (ptr);
+  TREE_THIS_VOLATILE (ref_in) = 1;
+  gimplify_assign (acc_in, ref_in, &red_seq);
+  
+  tree acc_out = make_ssa_name (var_type);
+  tree update_expr = fold_build2 (op, var_type, ref_in, var);
+  gimplify_assign (acc_out, update_expr, &red_seq);
+  
+  tree ref_out = build_simple_mem_ref (ptr);
+  TREE_THIS_VOLATILE (ref_out) = 1;
+  gimplify_assign (ref_out, acc_out, &red_seq);
+
+  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
+
+  /* Build & insert the unlock sequence.  */
+  gimple_seq unlock_seq = NULL;
+  tree unlock_expr = nvptx_global_lock_addr ();
+  unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
+				     uns_locked, uns_unlocked);
+  gimplify_and_add (unlock_expr, &unlock_seq);
+  gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
+
+  return acc_out;
+}
+
+/* Emit a sequence to update a reduction accumulator at *PTR with the
+   value held in VAR using operator OP.  Return the updated value.
+
+   TODO: optimize for atomic ops and independent complex ops.  */
+
+static tree
+nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
+			tree ptr, tree var, tree_code op)
+{
+  tree type = TREE_TYPE (var);
+  tree size = TYPE_SIZE (type);
+
+  if (size == TYPE_SIZE (unsigned_type_node)
+      || size == TYPE_SIZE (long_long_unsigned_type_node))
+    return nvptx_lockless_update (loc, gsi, ptr, var, op);
+  else
+    return nvptx_lockfull_update (loc, gsi, ptr, var, op);
 }
 
 /* NVPTX implementation of GOACC_REDUCTION_SETUP.  */
@@ -3944,11 +4106,11 @@  nvptx_goacc_reduction_fini (gcall *call)
 
       if (accum)
 	{
-	  /* Locklessly update the accumulator.  */
+	  /* UPDATE the accumulator.  */
 	  gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
 	  seq = NULL;
-	  r = nvptx_lockless_update (gimple_location (call), &gsi,
-				     accum, var, op);
+	  r = nvptx_reduction_update (gimple_location (call), &gsi,
+				      accum, var, op);
 	}
     }
 
Index: libgcc/config/nvptx/reduction.c
===================================================================
--- libgcc/config/nvptx/reduction.c	(revision 0)
+++ libgcc/config/nvptx/reduction.c	(working copy)
@@ -0,0 +1,31 @@ 
+/* Oversized reductions lock variable
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   Contributed by Mentor Graphics.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+
+/* We use a global lock variable for reductions on objects larger than
+   64 bits.  Until and unless proven that lock contention for
+   different reductions is a problem, a single lock will suffice.  */
+
+unsigned volatile __reduction_lock = 0;
Index: libgcc/config/nvptx/t-nvptx
===================================================================
--- libgcc/config/nvptx/t-nvptx	(revision 230544)
+++ libgcc/config/nvptx/t-nvptx	(working copy)
@@ -1,6 +1,7 @@ 
 LIB2ADD=$(srcdir)/config/nvptx/malloc.asm \
 	$(srcdir)/config/nvptx/free.asm \
-	$(srcdir)/config/nvptx/realloc.c
+	$(srcdir)/config/nvptx/realloc.c \
+	$(srcdir)/config/nvptx/reduction.c
 
 LIB2ADDEH=
 LIB2FUNCS_EXCLUDE=__main
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c	(revision 230544)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c	(working copy)
@@ -14,28 +14,41 @@  int close_enough (double _Complex a, dou
   return mag2_diff / mag2_a < (FRAC * FRAC);
 }
 
-int main (void)
-{
 #define N 100
-  double _Complex ary[N], sum, prod, tsum, tprod;
-  int ix;
 
-  sum = tsum = 0;
-  prod = tprod = 1;
-  
-  for (ix = 0; ix < N;  ix++)
-    {
-      double frac = ix * (1.0 / 1024) + 1.0;
-      
-      ary[ix] = frac + frac * 2.0i - 1.0i;
-      sum += ary[ix];
-      prod *= ary[ix];
-    }
+static int __attribute__ ((noinline))
+vector (double _Complex ary[N], double _Complex sum, double _Complex prod)
+{
+  double _Complex tsum = 0, tprod = 1;
 
-#pragma acc parallel vector_length(32) copyin(ary) copy (tsum, tprod)
+#pragma acc parallel vector_length(32) copyin(ary[0:N]) copy (tsum, tprod)
   {
 #pragma acc loop vector reduction(+:tsum) reduction (*:tprod)
-    for (ix = 0; ix < N; ix++)
+    for (int ix = 0; ix < N; ix++)
+      {
+	tsum += ary[ix];
+	tprod *= ary[ix];
+      }
+  }
+
+  if (!close_enough (sum, tsum))
+    return 1;
+
+  if (!close_enough (prod, tprod))
+    return 1;
+
+  return 0;
+}
+
+static int __attribute__ ((noinline))
+worker (double _Complex ary[N], double _Complex sum, double _Complex prod)
+{
+  double _Complex tsum = 0, tprod = 1;
+
+#pragma acc parallel num_workers(32) copyin(ary[0:N]) copy (tsum, tprod)
+  {
+#pragma acc loop worker reduction(+:tsum) reduction (*:tprod)
+    for (int ix = 0; ix < N; ix++)
       {
 	tsum += ary[ix];
 	tprod *= ary[ix];
@@ -49,4 +62,53 @@  int main (void)
     return 1;
 
   return 0;
+}
+
+static int __attribute__ ((noinline))
+gang (double _Complex ary[N], double _Complex sum, double _Complex prod)
+{
+  double _Complex tsum = 0, tprod = 1;
+
+#pragma acc parallel num_gangs (32) copyin(ary[0:N]) copy (tsum, tprod)
+  {
+#pragma acc loop gang reduction(+:tsum) reduction (*:tprod)
+    for (int ix = 0; ix < N; ix++)
+      {
+	tsum += ary[ix];
+	tprod *= ary[ix];
+      }
+  }
+
+  if (!close_enough (sum, tsum))
+    return 1;
+
+  if (!close_enough (prod, tprod))
+    return 1;
+
+  return 0;
+}
+
+int main (void)
+{
+  double _Complex ary[N], sum = 0, prod = 1;
+
+  for (int ix = 0; ix < N;  ix++)
+    {
+      double frac = ix * (1.0 / 1024) + 1.0;
+      
+      ary[ix] = frac + frac * 2.0i - 1.0i;
+      sum += ary[ix];
+      prod *= ary[ix];
+    }
+
+  if (vector (ary, sum, prod))
+    return 1;
+  
+  if (worker (ary, sum, prod))
+    return 1;
+
+  if (gang (ary, sum, prod))
+    return 1;
+
+  return 0;
 }
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c	(revision 230544)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c	(working copy)
@@ -14,28 +14,41 @@  int close_enough (float _Complex a, floa
   return mag2_diff / mag2_a < (FRAC * FRAC);
 }
 
-int main (void)
-{
 #define N 100
-  float _Complex ary[N], sum, prod, tsum, tprod;
-  int ix;
 
-  sum = tsum = 0;
-  prod = tprod = 1;
-  
-  for (ix = 0; ix < N;  ix++)
-    {
-      float frac = ix * (1.0f / 1024) + 1.0f;
-      
-      ary[ix] = frac + frac * 2.0i - 1.0i;
-      sum += ary[ix];
-      prod *= ary[ix];
-    }
+static int __attribute__ ((noinline))
+vector (float _Complex ary[N], float _Complex sum, float _Complex prod)
+{
+  float _Complex tsum = 0, tprod = 1;
 
-#pragma acc parallel vector_length(32) copyin(ary) copy (tsum, tprod)
+#pragma acc parallel vector_length(32) copyin(ary[0:N]) copy (tsum, tprod)
   {
 #pragma acc loop vector reduction(+:tsum) reduction (*:tprod)
-    for (ix = 0; ix < N; ix++)
+    for (int ix = 0; ix < N; ix++)
+      {
+	tsum += ary[ix];
+	tprod *= ary[ix];
+      }
+  }
+
+  if (!close_enough (sum, tsum))
+    return 1;
+
+  if (!close_enough (prod, tprod))
+    return 1;
+
+  return 0;
+}
+
+static int __attribute__ ((noinline))
+worker (float _Complex ary[N], float _Complex sum, float _Complex prod)
+{
+  float _Complex tsum = 0, tprod = 1;
+
+#pragma acc parallel num_workers(32) copyin(ary[0:N]) copy (tsum, tprod)
+  {
+#pragma acc loop worker reduction(+:tsum) reduction (*:tprod)
+    for (int ix = 0; ix < N; ix++)
       {
 	tsum += ary[ix];
 	tprod *= ary[ix];
@@ -49,4 +62,53 @@  int main (void)
     return 1;
 
   return 0;
+}
+
+static int __attribute__ ((noinline))
+gang (float _Complex ary[N], float _Complex sum, float _Complex prod)
+{
+  float _Complex tsum = 0, tprod = 1;
+
+#pragma acc parallel num_gangs (32) copyin(ary[0:N]) copy (tsum, tprod)
+  {
+#pragma acc loop gang reduction(+:tsum) reduction (*:tprod)
+    for (int ix = 0; ix < N; ix++)
+      {
+	tsum += ary[ix];
+	tprod *= ary[ix];
+      }
+  }
+
+  if (!close_enough (sum, tsum))
+    return 1;
+
+  if (!close_enough (prod, tprod))
+    return 1;
+
+  return 0;
+}
+
+int main (void)
+{
+  float _Complex ary[N], sum = 0, prod = 1;
+
+  for (int ix = 0; ix < N;  ix++)
+    {
+      float frac = ix * (1.0f / 1024) + 1.0f;
+      
+      ary[ix] = frac + frac * 2.0i - 1.0i;
+      sum += ary[ix];
+      prod *= ary[ix];
+    }
+
+  if (vector (ary, sum, prod))
+    return 1;
+  
+  if (worker (ary, sum, prod))
+    return 1;
+
+  if (gang (ary, sum, prod))
+    return 1;
+
+  return 0;
 }