diff mbox

[gomp4] partially enable GOMP_MAP_FIRSTPRIVATE_POINTER in gfortran

Message ID 23548788-8508-dda9-f559-b4e588e9c644@codesourcery.com
State New
Headers show

Commit Message

Cesar Philippidis Jan. 27, 2017, 5:13 p.m. UTC
This patch partially enables GOMP_MAP_FIRSTPRIVATE_POINTER in gfortran.
gfortran still falls back to GOMP_MAP_POINTER for arrays with
descriptors and derived types. The limitation on derived types is there
because we don't have much test coverage for it, and this patch series
was more exploratory for performance enhancements. With that in mind,
there are a couple of shortcomings with this patch.

 1) Dummy reduction variables fallback to GOMP_MAP_POINTER because of a
    pointer dereferencing bug. The state of debugging such problems on
    PTX targets leaves something to be desired, especially since print
    isn't working on nvptx targets currently.

 2) Apparently, firstprivate pointers negatively affects the alias
    analysis used by ACC KERNELS and parloops, so a couple of more
    execution tests fail to generate offloaded code.

I plan to resolve issue 1) in a follow up patch later on (but maybe not
in the immediate future). Regarding 2), ACC KERNELS are eventually going
to need a significant rework, but that's not going to happen in the near
future either. I've been pushing to get the performance of ACC PARALLEL
regions on par to other OpenACC compilers first, and hopefully that
won't be too far way.

With this patch, I'm observing an approximate 0.6s reduction in
CloverLeaf's original 0.9s execution time (it takes approximate 0.9s
after the GOMP_MAP_FIRSTPRIVATE_INT and GOMP_MAP_TO_PSET patches), to
yield a final execution time somewhere in the neighborhood of 0.3s.
That's about a one second savings from the unpatched version of GCC.

There's still quite a bit of room left for improvement, with respect to
data movement. Specifically, the nvptx runtime populates the struct
containing the addresses of all of the remapped offloaded variables, and
uploads it immediately before launching the PTX kernel. Clearly this
works, however is requires a synchronous host2dev data copy. And while
those synchronous barriers are generally quick, if you have thousands of
them, like CloverLeaf, they end up becoming a bottleneck. One way to get
around this is to pass in the remapped decl pointers as arguments to
cuLaunchKernel directly. Maybe we can do that during oaccdevlow for
nvptx targets. The next big thing, however, probably should be updating
the default launch geometry again.

This patch has been committed to gomp-4_0-branch.

Cesar
diff mbox

Patch

2017-01-27  Cesar Philippidis  <cesar@codesourcery.com>

	gcc/fortran/
	* trans-openmp.c (gfc_omp_finish_clause): Use GOMP_MAP_POINTER
	for POINTER_TYPE decls.
	(gfc_trans_omp_clauses_1): Likewise.

	gcc/
	* gimplify.c (demote_firstprivate_pointer): New function.
	(gimplify_scan_omp_clauses): Enable target_map_pointers_as_0len_arrays
	and target_map_scalars_firstprivate in OpenACC and gfortran.
	(gimplify_adjust_omp_clauses): Demote FIRSTPRIVATE_POINTERS for OpenACC
	retuction variables. 
	* omp-low.c (lower_omp_target): Adjust receiver reference of decls for
	fortran dummy arguments.

	gcc/testsuite/
	* gfortran.dg/goacc/kernels-loop-n.f95: Xfail test.

	libgomp/
	* testsuite/libgomp.oacc-fortran/deviceptr-1.f90: Add -foffload-force.
	* testsuite/libgomp.oacc-fortran/non-scalar-data.f90: Likewise.

diff --git a/gcc/fortran/trans-openmp.c b/gcc/fortran/trans-openmp.c
index b586475..7826e1c 100644
--- a/gcc/fortran/trans-openmp.c
+++ b/gcc/fortran/trans-openmp.c
@@ -1042,7 +1042,7 @@  gfc_omp_finish_clause (tree c, gimple_seq *pre_p)
 	return;
       tree orig_decl = decl;
       c4 = build_omp_clause (OMP_CLAUSE_LOCATION (c), OMP_CLAUSE_MAP);
-      OMP_CLAUSE_SET_MAP_KIND (c4, GOMP_MAP_POINTER);
+      OMP_CLAUSE_SET_MAP_KIND (c4, GOMP_MAP_FIRSTPRIVATE_POINTER);
       OMP_CLAUSE_DECL (c4) = decl;
       OMP_CLAUSE_SIZE (c4) = size_int (0);
       decl = build_fold_indirect_ref (decl);
@@ -2005,9 +2005,12 @@  gfc_trans_omp_clauses_1 (stmtblock_t *block, gfc_omp_clauses *clauses,
 					(TREE_TYPE (TREE_TYPE (decl)))))
 		    {
 		      tree orig_decl = decl;
+		      enum gomp_map_kind gmk = GOMP_MAP_FIRSTPRIVATE_POINTER;
+		      if (n->u.map_op == OMP_MAP_FORCE_DEVICEPTR)
+			gmk = GOMP_MAP_POINTER;
 		      node4 = build_omp_clause (input_location,
 						OMP_CLAUSE_MAP);
-		      OMP_CLAUSE_SET_MAP_KIND (node4, GOMP_MAP_POINTER);
+		      OMP_CLAUSE_SET_MAP_KIND (node4, gmk);
 		      OMP_CLAUSE_DECL (node4) = decl;
 		      OMP_CLAUSE_SIZE (node4) = size_int (0);
 		      decl = build_fold_indirect_ref (decl);
diff --git a/gcc/gimplify.c b/gcc/gimplify.c
index 49d79fb..23e9ce8 100644
--- a/gcc/gimplify.c
+++ b/gcc/gimplify.c
@@ -6589,6 +6589,37 @@  find_decl_expr (tree *tp, int *walk_subtrees, void *data)
   return NULL_TREE;
 }
 
+static void
+demote_firstprivate_pointer (tree decl, gimplify_omp_ctx *ctx)
+{
+  if (!lang_GNU_Fortran ())
+    return;
+
+  while (ctx)
+    {
+      if (ctx->region_type == ORT_ACC_PARALLEL
+	  || ctx->region_type == ORT_ACC_KERNELS)
+	break;
+      ctx = ctx->outer_context;
+    }
+
+  if (ctx == NULL)
+    return;
+
+  tree clauses = ctx->clauses;
+
+  for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+    {
+      if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP
+	  && OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_FIRSTPRIVATE_POINTER
+	  && OMP_CLAUSE_DECL (c) == decl)
+	{
+	  OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_POINTER);
+	  return;
+	}
+    }
+}
+
 /* Scan the OMP clauses in *LIST_P, installing mappings into a new
    and previous omp contexts.  */
 
@@ -6605,11 +6636,10 @@  gimplify_scan_omp_clauses (tree *list_p, gimple_seq *pre_p,
   ctx = new_omp_context (region_type);
   ctx->clauses = *list_p;
   outer_ctx = ctx->outer_context;
-  if (code == OMP_TARGET && !lang_GNU_Fortran ())
+  if (code == OMP_TARGET && !(lang_GNU_Fortran () && !(region_type & ORT_ACC)))
     {
-      ctx->target_map_pointers_as_0len_arrays = true;
-      /* FIXME: For Fortran we want to set this too, when
-	 the Fortran FE is updated to OpenMP 4.5.  */
+      if (!lang_GNU_Fortran () || region_type & ORT_ACC)
+	ctx->target_map_pointers_as_0len_arrays = true;
       ctx->target_map_scalars_firstprivate = true;
     }
   if (!lang_GNU_Fortran ())
@@ -6717,6 +6747,7 @@  gimplify_scan_omp_clauses (tree *list_p, gimple_seq *pre_p,
 	  if (!(region_type & ORT_ACC))
 	    check_non_private = "reduction";
 	  decl = OMP_CLAUSE_DECL (c);
+	  demote_firstprivate_pointer (decl, ctx->outer_context);
 	  if (TREE_CODE (decl) == MEM_REF)
 	    {
 	      tree type = TREE_TYPE (decl);
@@ -8254,11 +8285,16 @@  gimplify_adjust_omp_clauses (gimple_seq *pre_p, gimple_seq body, tree *list_p,
 		      && kind != GOMP_MAP_FORCE_PRESENT
 		      && kind != GOMP_MAP_POINTER)
 		    {
-		      warning_at (OMP_CLAUSE_LOCATION (c), 0,
-				  "incompatible data clause with reduction "
-				  "on %qE; promoting to present_or_copy",
-				  DECL_NAME (t));
-		      OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_TOFROM);
+		      if (lang_hooks.decls.omp_privatize_by_reference (decl))
+			OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_POINTER);
+		      else
+			{
+			  warning_at (OMP_CLAUSE_LOCATION (c), 0,
+				      "incompatible data clause with reduction "
+				      "on %qE; promoting to present_or_copy",
+				      DECL_NAME (t));
+			  OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_TOFROM);
+			}
 		    }
 		}
 	    }
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index adde8de..142d928 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -17547,7 +17547,8 @@  lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
 		  }
 		else
 		  is_ref = is_reference (var);
-		if (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_FIRSTPRIVATE_REFERENCE)
+		if (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_FIRSTPRIVATE_REFERENCE
+		    || (lang_GNU_Fortran () && TREE_CODE (var) == PARM_DECL))
 		  is_ref = false;
 		bool ref_to_array = false;
 		if (is_ref)
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-n.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-n.f95
index 2736a1b..bdfebde 100644
--- a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-n.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-n.f95
@@ -37,4 +37,6 @@  end module test
 ! Check that the loop has been split off into a function.
 ! { dg-final { scan-tree-dump-times "(?n);; Function __test_MOD_foo._omp_fn.0 " 1 "optimized" } }
 
-! { dg-final { scan-tree-dump-times "(?n)oacc function \\(0," 1 "parloops1" } }
+! This failure was introduced with the GOMP_MAP_POINTER ->
+! GOMP_MAP_FIRSTPRIVATE_POINTER conversion.
+! { dg-final { scan-tree-dump-times "(?n)oacc function \\(0," 1 "parloops1" { xfail *-*-* } } }
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/deviceptr-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/deviceptr-1.f90
index 322f7dc..5831077 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/deviceptr-1.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/deviceptr-1.f90
@@ -3,6 +3,7 @@ 
 ! the deviceptr variable is implied.
 
 ! { dg-do run }
+! { dg-additional-options "-foffload-force" }
 
 subroutine subr1 (a, b)
   implicit none
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/non-scalar-data.f90 b/libgomp/testsuite/libgomp.oacc-fortran/non-scalar-data.f90
index 94e4228..ab5771e 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/non-scalar-data.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/non-scalar-data.f90
@@ -2,6 +2,7 @@ 
 ! offloaded regions are properly mapped using present_or_copy.
 
 ! { dg-do run }
+! { dg-additional-options "-foffload-force" }
 
 program main
   implicit none