diff mbox series

[4/4] openacc: Reference-typed reduction and private variable rewriting

Message ID 7deb63194542d4f00fe6488dbcdd137b0bcf211c.1614685766.git.julian@codesourcery.com
State New
Headers show
Series openacc: Worker partitioning in the middle end | expand

Commit Message

Julian Brown March 2, 2021, 12:20 p.m. UTC
A version of this patch was previously posted for mainline here:

  https://gcc.gnu.org/pipermail/gcc-patches/2019-November/534552.html

Reference-type private variables or reference-type variables used as
reduction targets do not work well with the scheme to implement worker
partitioning on AMD GCN. This patch (originally by Cesar Philippidis, but
modified somewhat since) provides support for replacing such variables
with new non-reference-typed temporary versions within partitioned
offload regions.

In more detail, the problem with reductions is as follows.  The expansion
of reduction operations (or similarly use of private variables) may
cause the bits of a reference variable (i.e. a pointer to a stack slot)
formed in worker-single mode to be broadcast to worker-partitioned mode,
and then dereferenced. Thus all workers will try to access the same
variable on worker 0's stack, which is not what was intended -- rather,
the reference in each worker should have been a pointer to a slot in
that worker's own stack.

A better solution to this problem might be to avoid trying to broadcast
pointers formed by taking the address of a stack slot somehow, but that
could prove tricky or perhaps impossible in the general case.

(I noticed during testing that Tobias has a couple of follow-up patches
to this one on the og10 branch. It might make sense to fold those into
this one too, else they'll need applying separately.)

Tested with offloading to AMD GCN (and separately to NVPTX). OK for
stage 1?

Julian

2021-03-02  Cesar Philippidis  <cesar@codesourcery.com>
	    Julian Brown  <julian@codesourcery.com>
	    Kwok Cheung Yeung  <kcy@codesourcery.com>

gcc/
	* gimplify.c (privatize_reduction): New struct.
	(localize_reductions_r, localize_reductions): New functions.
	(gimplify_omp_for): Call localize_reductions.
	(gimplify_omp_workshare): Likewise.
	* omp-low.c (lower_oacc_reductions): Handle localized reductions.
	Create fewer temp vars.
	* tree-core.h (omp_clause_code): Add OMP_CLAUSE_REDUCTION_PRIVATE_DECL
	documentation.
	* tree.c (omp_clause_num_ops): Bump number of ops for
	OMP_CLAUSE_REDUCTION to 6.
	(walk_tree_1): Adjust accordingly.
	* tree.h (OMP_CLAUSE_REDUCTION_PRIVATE_DECL): Add macro.

libgomp/
	* testsuite/libgomp.oacc-fortran/privatized-ref-1.f95: New test.
	* testsuite/libgomp.oacc-c++/privatized-ref-2.C: New test.
	* testsuite/libgomp.oacc-c++/privatized-ref-3.C: New test.
---
 gcc/gimplify.c                                | 117 ++++++++++++++++++
 gcc/omp-low.c                                 |  47 +++----
 gcc/tree-core.h                               |   4 +-
 gcc/tree.c                                    |  11 +-
 gcc/tree.h                                    |   2 +
 .../libgomp.oacc-c++/privatized-ref-2.C       |  64 ++++++++++
 .../libgomp.oacc-c++/privatized-ref-3.C       |  64 ++++++++++
 .../libgomp.oacc-fortran/privatized-ref-1.f95 |  71 +++++++++++
 8 files changed, 343 insertions(+), 37 deletions(-)
 create mode 100644 libgomp/testsuite/libgomp.oacc-c++/privatized-ref-2.C
 create mode 100644 libgomp/testsuite/libgomp.oacc-c++/privatized-ref-3.C
 create mode 100644 libgomp/testsuite/libgomp.oacc-fortran/privatized-ref-1.f95
diff mbox series

Patch

diff --git a/gcc/gimplify.c b/gcc/gimplify.c
index caf25ccdd5c..e092b7be723 100644
--- a/gcc/gimplify.c
+++ b/gcc/gimplify.c
@@ -236,6 +236,11 @@  struct gimplify_omp_ctx
   int defaultmap[4];
 };
 
+struct privatize_reduction
+{
+  tree ref_var, local_var;
+};
+
 static struct gimplify_ctx *gimplify_ctxp;
 static struct gimplify_omp_ctx *gimplify_omp_ctxp;
 static bool in_omp_construct;
@@ -11381,6 +11386,95 @@  gimplify_omp_taskloop_expr (tree type, tree *tp, gimple_seq *pre_p,
   OMP_FOR_CLAUSES (orig_for_stmt) = c;
 }
 
+/* Helper function for localize_reductions.  Replace all uses of REF_VAR with
+   LOCAL_VAR.  */
+
+static tree
+localize_reductions_r (tree *tp, int *walk_subtrees, void *data)
+{
+  enum tree_code tc = TREE_CODE (*tp);
+  struct privatize_reduction *pr = (struct privatize_reduction *) data;
+
+  if (TYPE_P (*tp))
+    *walk_subtrees = 0;
+
+  switch (tc)
+    {
+    case INDIRECT_REF:
+    case MEM_REF:
+      if (TREE_OPERAND (*tp, 0) == pr->ref_var)
+	*tp = pr->local_var;
+
+      *walk_subtrees = 0;
+      break;
+
+    case VAR_DECL:
+    case PARM_DECL:
+    case RESULT_DECL:
+      if (*tp == pr->ref_var)
+	*tp = pr->local_var;
+
+      *walk_subtrees = 0;
+      break;
+
+    default:
+      break;
+    }
+
+  return NULL_TREE;
+}
+
+/* OpenACC worker and vector loop state propagation requires reductions
+   to be inside local variables.  This function replaces all reference-type
+   reductions variables associated with the loop with a local copy.  It is
+   also used to create private copies of reduction variables for those
+   which are not associated with acc loops.  */
+
+static void
+localize_reductions (tree clauses, tree body)
+{
+  tree c, var, type, new_var;
+  struct privatize_reduction pr;
+
+  for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+    if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION)
+      {
+	var = OMP_CLAUSE_DECL (c);
+
+	if (!lang_hooks.decls.omp_privatize_by_reference (var))
+	  {
+	    OMP_CLAUSE_REDUCTION_PRIVATE_DECL (c) = NULL;
+	    continue;
+	  }
+
+	type = TREE_TYPE (TREE_TYPE (var));
+	new_var = create_tmp_var (type, IDENTIFIER_POINTER (DECL_NAME (var)));
+
+	pr.ref_var = var;
+	pr.local_var = new_var;
+
+	walk_tree (&body, localize_reductions_r, &pr, NULL);
+
+	OMP_CLAUSE_REDUCTION_PRIVATE_DECL (c) = new_var;
+      }
+    else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_PRIVATE)
+      {
+	var = OMP_CLAUSE_DECL (c);
+
+	if (!lang_hooks.decls.omp_privatize_by_reference (var))
+	  continue;
+
+	type = TREE_TYPE (TREE_TYPE (var));
+	new_var = create_tmp_var (type, IDENTIFIER_POINTER (DECL_NAME (var)));
+
+	pr.ref_var = var;
+	pr.local_var = new_var;
+
+	walk_tree (&body, localize_reductions_r, &pr, NULL);
+      }
+}
+
+
 /* Gimplify the gross structure of an OMP_FOR statement.  */
 
 static enum gimplify_status
@@ -11607,6 +11701,24 @@  gimplify_omp_for (tree *expr_p, gimple_seq *pre_p)
       gcc_unreachable ();
     }
 
+  if (ort == ORT_ACC)
+    {
+      gimplify_omp_ctx *outer = gimplify_omp_ctxp;
+
+      while (outer
+	     && outer->region_type != ORT_ACC_PARALLEL
+	     && outer->region_type != ORT_ACC_KERNELS)
+	outer = outer->outer_context;
+
+      /* FIXME: Reductions only work in parallel regions at present.  We avoid
+	 doing the reduction localization transformation in kernels regions
+	 here, because the code to remove reductions in kernels regions cannot
+	 handle that.  */
+      if (outer && outer->region_type == ORT_ACC_PARALLEL)
+	localize_reductions (OMP_FOR_CLAUSES (for_stmt),
+			     OMP_FOR_BODY (for_stmt));
+    }
+
   /* Set OMP_CLAUSE_LINEAR_NO_COPYIN flag on explicit linear
      clause for the IV.  */
   if (ort == ORT_SIMD && TREE_VEC_LENGTH (OMP_FOR_INIT (for_stmt)) == 1)
@@ -13265,6 +13377,11 @@  gimplify_omp_workshare (tree *expr_p, gimple_seq *pre_p)
       || (ort & ORT_HOST_TEAMS) == ORT_HOST_TEAMS)
     {
       push_gimplify_context ();
+
+      /* FIXME: Reductions are not supported in kernels regions yet.  */
+      if (/*ort == ORT_ACC_KERNELS ||*/ ort == ORT_ACC_PARALLEL)
+        localize_reductions (OMP_CLAUSES (expr), OMP_BODY (expr));
+
       gimple *g = gimplify_and_return_first (OMP_BODY (expr), &body);
       if (gimple_code (g) == GIMPLE_BIND)
 	pop_gimplify_context (g);
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index fd8025e0e3f..7fd3b33d41d 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -7072,9 +7072,9 @@  lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
 	gcc_checking_assert (!is_oacc_kernels_decomposed_part (ctx));
 
 	tree orig = OMP_CLAUSE_DECL (c);
-	tree var = maybe_lookup_decl (orig, ctx);
+	tree var;
 	tree ref_to_res = NULL_TREE;
-	tree incoming, outgoing, v1, v2, v3;
+	tree incoming, outgoing;
 	bool is_private = false;
 
 	enum tree_code rcode = OMP_CLAUSE_REDUCTION_CODE (c);
@@ -7086,6 +7086,9 @@  lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
 	  rcode = BIT_IOR_EXPR;
 	tree op = build_int_cst (unsigned_type_node, rcode);
 
+	var = OMP_CLAUSE_REDUCTION_PRIVATE_DECL (c);
+	if (!var)
+	  var = maybe_lookup_decl (orig, ctx);
 	if (!var)
 	  var = orig;
 
@@ -7176,36 +7179,13 @@  lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
 	if (!ref_to_res)
 	  ref_to_res = integer_zero_node;
 
-	if (omp_is_reference (orig))
+	if (omp_is_reference (outgoing))
 	  {
-	    tree type = TREE_TYPE (var);
-	    const char *id = IDENTIFIER_POINTER (DECL_NAME (var));
-
-	    if (!inner)
-	      {
-		tree x = create_tmp_var (TREE_TYPE (type), id);
-		gimplify_assign (var, build_fold_addr_expr (x), fork_seq);
-	      }
-
-	    v1 = create_tmp_var (type, id);
-	    v2 = create_tmp_var (type, id);
-	    v3 = create_tmp_var (type, id);
-
-	    gimplify_assign (v1, var, fork_seq);
-	    gimplify_assign (v2, var, fork_seq);
-	    gimplify_assign (v3, var, fork_seq);
-
-	    var = build_simple_mem_ref (var);
-	    v1 = build_simple_mem_ref (v1);
-	    v2 = build_simple_mem_ref (v2);
-	    v3 = build_simple_mem_ref (v3);
 	    outgoing = build_simple_mem_ref (outgoing);
 
 	    if (!TREE_CONSTANT (incoming))
 	      incoming = build_simple_mem_ref (incoming);
 	  }
-	else
-	  v1 = v2 = v3 = var;
 
 	/* Determine position in reduction buffer, which may be used
 	   by target.  The parser has ensured that this is not a
@@ -7238,20 +7218,21 @@  lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
 	  = build_call_expr_internal_loc (loc, IFN_GOACC_REDUCTION,
 					  TREE_TYPE (var), 6, init_code,
 					  unshare_expr (ref_to_res),
-					  v1, level, op, off);
+					  var, level, op, off);
 	tree fini_call
 	  = build_call_expr_internal_loc (loc, IFN_GOACC_REDUCTION,
 					  TREE_TYPE (var), 6, fini_code,
 					  unshare_expr (ref_to_res),
-					  v2, level, op, off);
+					  var, level, op, off);
 	tree teardown_call
 	  = build_call_expr_internal_loc (loc, IFN_GOACC_REDUCTION,
-					  TREE_TYPE (var), 6, teardown_code,
-					  ref_to_res, v3, level, op, off);
+					  TREE_TYPE (var), 6,
+					  teardown_code, ref_to_res, var,
+					  level, op, off);
 
-	gimplify_assign (v1, setup_call, &before_fork);
-	gimplify_assign (v2, init_call, &after_fork);
-	gimplify_assign (v3, fini_call, &before_join);
+	gimplify_assign (var, setup_call, &before_fork);
+	gimplify_assign (var, init_call, &after_fork);
+	gimplify_assign (var, fini_call, &before_join);
 	gimplify_assign (outgoing, teardown_call, &after_join);
       }
 
diff --git a/gcc/tree-core.h b/gcc/tree-core.h
index d2e6c895e42..01b106b81d7 100644
--- a/gcc/tree-core.h
+++ b/gcc/tree-core.h
@@ -259,7 +259,9 @@  enum omp_clause_code {
                 placeholder used in OMP_CLAUSE_REDUCTION_{INIT,MERGE}.
      Operand 4: OMP_CLAUSE_REDUCTION_DECL_PLACEHOLDER: Another dummy
 		VAR_DECL placeholder, used like the above for C/C++ array
-		reductions.  */
+		reductions.
+     Operand 5: OMP_CLAUSE_REDUCTION_PRIVATE_DECL: A private VAR_DECL of
+                the original DECL associated with the reduction clause.  */
   OMP_CLAUSE_REDUCTION,
 
   /* OpenMP clause: task_reduction (operator:variable_list).  */
diff --git a/gcc/tree.c b/gcc/tree.c
index c09434d7293..7ff82b91892 100644
--- a/gcc/tree.c
+++ b/gcc/tree.c
@@ -284,7 +284,7 @@  unsigned const char omp_clause_num_ops[] =
   1, /* OMP_CLAUSE_SHARED  */
   1, /* OMP_CLAUSE_FIRSTPRIVATE  */
   2, /* OMP_CLAUSE_LASTPRIVATE  */
-  5, /* OMP_CLAUSE_REDUCTION  */
+  6, /* OMP_CLAUSE_REDUCTION  */
   5, /* OMP_CLAUSE_TASK_REDUCTION  */
   5, /* OMP_CLAUSE_IN_REDUCTION  */
   1, /* OMP_CLAUSE_COPYIN  */
@@ -12326,11 +12326,16 @@  walk_tree_1 (tree *tp, walk_tree_fn func, void *data,
 	  WALK_SUBTREE_TAIL (OMP_CLAUSE_CHAIN (*tp));
 
 	case OMP_CLAUSE_REDUCTION:
+	  {
+	    for (int i = 0; i < 6; i++)
+	      WALK_SUBTREE (OMP_CLAUSE_OPERAND (*tp, i));
+	    WALK_SUBTREE_TAIL (OMP_CLAUSE_CHAIN (*tp));
+	  }
+
 	case OMP_CLAUSE_TASK_REDUCTION:
 	case OMP_CLAUSE_IN_REDUCTION:
 	  {
-	    int i;
-	    for (i = 0; i < 5; i++)
+	    for (int i = 0; i < 5; i++)
 	      WALK_SUBTREE (OMP_CLAUSE_OPERAND (*tp, i));
 	    WALK_SUBTREE_TAIL (OMP_CLAUSE_CHAIN (*tp));
 	  }
diff --git a/gcc/tree.h b/gcc/tree.h
index 4f33868e8e1..baef6a75fa6 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -1685,6 +1685,8 @@  class auto_suppress_location_wrappers
 #define OMP_CLAUSE_REDUCTION_DECL_PLACEHOLDER(NODE) \
   OMP_CLAUSE_OPERAND (OMP_CLAUSE_RANGE_CHECK (NODE, OMP_CLAUSE_REDUCTION, \
 					      OMP_CLAUSE_IN_REDUCTION), 4)
+#define OMP_CLAUSE_REDUCTION_PRIVATE_DECL(NODE) \
+  OMP_CLAUSE_OPERAND (OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE_REDUCTION), 5)
 
 /* True if a REDUCTION clause may reference the original list item (omp_orig)
    in its OMP_CLAUSE_REDUCTION_{,GIMPLE_}INIT.  */
diff --git a/libgomp/testsuite/libgomp.oacc-c++/privatized-ref-2.C b/libgomp/testsuite/libgomp.oacc-c++/privatized-ref-2.C
new file mode 100644
index 00000000000..052ccc51d6a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c++/privatized-ref-2.C
@@ -0,0 +1,64 @@ 
+/* { dg-do run } */
+
+#include <stdlib.h>
+
+void workers (void)
+{
+  double res[65536];
+  int i;
+
+#pragma acc parallel copyout(res) num_gangs(64) num_workers(16)
+  {
+    int i, j;
+#pragma acc loop gang
+    for (i = 0; i < 256; i++)
+      {
+#pragma acc loop worker
+	for (j = 0; j < 256; j++)
+	  {
+	    int tmpvar;
+	    int &tmpref = tmpvar;
+	    tmpref = (i * 256 + j) * 99;
+	    res[i * 256 + j] = tmpref;
+	  }
+      }
+  }
+
+  for (i = 0; i < 65536; i++)
+    if (res[i] != i * 99)
+      abort ();
+}
+
+void vectors (void)
+{
+  double res[65536];
+  int i;
+
+#pragma acc parallel copyout(res) num_gangs(64) num_workers(16)
+  {
+    int i, j;
+#pragma acc loop gang worker
+    for (i = 0; i < 256; i++)
+      {
+#pragma acc loop vector
+	for (j = 0; j < 256; j++)
+	  {
+	    int tmpvar;
+	    int &tmpref = tmpvar;
+	    tmpref = (i * 256 + j) * 101;
+	    res[i * 256 + j] = tmpref;
+	  }
+      }
+  }
+
+  for (i = 0; i < 65536; i++)
+    if (res[i] != i * 101)
+      abort ();
+}
+
+int main (int argc, char *argv[])
+{
+  workers ();
+  vectors ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c++/privatized-ref-3.C b/libgomp/testsuite/libgomp.oacc-c++/privatized-ref-3.C
new file mode 100644
index 00000000000..d887178d507
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c++/privatized-ref-3.C
@@ -0,0 +1,64 @@ 
+/* { dg-do run } */
+
+#include <stdlib.h>
+
+void workers (void)
+{
+  double res[65536];
+  int i;
+
+#pragma acc parallel copyout(res) num_gangs(64) num_workers(16)
+  {
+    int i, j;
+    int tmpvar;
+    int &tmpref = tmpvar;
+#pragma acc loop gang
+    for (i = 0; i < 256; i++)
+      {
+#pragma acc loop worker private(tmpref)
+	for (j = 0; j < 256; j++)
+	  {
+	    tmpref = (i * 256 + j) * 99;
+	    res[i * 256 + j] = tmpref;
+	  }
+      }
+  }
+
+  for (i = 0; i < 65536; i++)
+    if (res[i] != i * 99)
+      abort ();
+}
+
+void vectors (void)
+{
+  double res[65536];
+  int i;
+
+#pragma acc parallel copyout(res) num_gangs(64) num_workers(16)
+  {
+    int i, j;
+    int tmpvar;
+    int &tmpref = tmpvar;
+#pragma acc loop gang worker
+    for (i = 0; i < 256; i++)
+      {
+#pragma acc loop vector private(tmpref)
+	for (j = 0; j < 256; j++)
+	  {
+	    tmpref = (i * 256 + j) * 101;
+	    res[i * 256 + j] = tmpref;
+	  }
+      }
+  }
+
+  for (i = 0; i < 65536; i++)
+    if (res[i] != i * 101)
+      abort ();
+}
+
+int main (int argc, char *argv[])
+{
+  workers ();
+  vectors ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/privatized-ref-1.f95 b/libgomp/testsuite/libgomp.oacc-fortran/privatized-ref-1.f95
new file mode 100644
index 00000000000..e4b85206cc1
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/privatized-ref-1.f95
@@ -0,0 +1,71 @@ 
+! { dg-do run }
+
+program main
+  implicit none
+  integer :: myint
+  integer :: i
+  real :: res(65536), tmp
+
+  res(:) = 0.0
+
+  myint = 5
+  call workers(myint, res)
+
+  do i=1,65536
+    tmp = i * 99
+    if (res(i) .ne. tmp) stop 1
+  end do
+
+  res(:) = 0.0
+
+  myint = 7
+  call vectors(myint, res)
+
+  do i=1,65536
+    tmp = i * 101
+    if (res(i) .ne. tmp) stop 2
+  end do
+
+contains
+
+  subroutine workers(t1, res)
+    implicit none
+    integer :: t1
+    integer :: i, j
+    real, intent(out) :: res(:)
+
+    !$acc parallel copyout(res) num_gangs(64) num_workers(16)
+
+    !$acc loop gang
+    do i=0,255
+      !$acc loop worker private(t1)
+      do j=1,256
+        t1 = (i * 256 + j) * 99
+        res(i * 256 + j) = t1
+      end do
+    end do
+
+    !$acc end parallel
+  end subroutine workers
+
+  subroutine vectors(t1, res)
+    implicit none
+    integer :: t1
+    integer :: i, j
+    real, intent(out) :: res(:)
+
+    !$acc parallel copyout(res) num_gangs(64) num_workers(16)
+
+    !$acc loop gang worker
+    do i=0,255
+      !$acc loop vector private(t1)
+      do j=1,256
+        t1 = (i * 256 + j) * 101
+        res(i * 256 + j) = t1
+      end do
+    end do
+
+    !$acc end parallel
+  end subroutine vectors
+
+end program main