[gomp4] (NVPTX) thread barriers after OpenACC worker loops
diff mbox

Message ID 20150608123758.2dfa7ebd@octopus
State New
Headers show

Commit Message

Julian Brown June 8, 2015, 11:37 a.m. UTC
Hi,

This patch adds a thread barrier after worker loops for OpenACC, in
accordance with OpenACC 2.0a section 2.7.3 (worker loops): "All workers
will complete execution of their assigned iterations before any worker
proceeds beyond the end of the loop.". (This is quite target-specific:
work to alleviate that is still ongoing.)

Barriers are "special" in that they should not be cloned or subject to
excessive code motion: to that end, barriers placed after loops have
their (outgoing) edge set to EDGE_ABNORMAL. That seems to suffice to
keep the barriers in the right places.

This passes libgomp testing when applied on gomp4 branch, and fixes the
previously-broken worker-partn-5.c and worker-partn-6.c tests, on top
of my previous patches:

https://gcc.gnu.org/ml/gcc-patches/2015-05/msg02612.html
https://gcc.gnu.org/ml/gcc-patches/2015-06/msg00307.html

(ping!), but unfortunately (again, with the above patches) appears to
interact badly with Cesar's patch for vector state propagation:

https://gcc.gnu.org/ml/gcc-patches/2015-06/msg00371.html

I haven't yet investigated why (I reverted that patch in my local
series in order to test the attached patch).

FYI,

Julian

ChangeLog

    gcc/
    * omp-low.c (build_oacc_threadbarrier): New function.
    (oacc_loop_needs_threadbarrier_p): New function.
    (expand_omp_for_static_nochunk, expand_omp_for_static_chunk):
    Insert threadbarrier after worker loops.
    (find_omp_for_region_data): Rename to...
    (find_omp_for_region_gwv): This. Return mask, rather than modifying
    REGION structure.
    (build_omp_regions_1): Move modification of REGION structure to
    here, after calling above function with new name.
    (generate_oacc_broadcast): Use new build_oacc_threadbarrier
    function.
    (make_gimple_omp_edges): Make edges out of OpenACC worker loop exit
    block abnormal.
    * tree-ssa-alias.c (ref_maybe_used_by_call_p_1): Add
    BUILT_IN_GOACC_THREADBARRIER.

    libgomp/
    * testsuite/libgomp.oacc-c-c++-common/worker-partn-5.c: Remove
    XFAIL.
    * testsuite/libgomp.oacc-c-c++-common/worker-partn-6.c: Likewise.

Patch
diff mbox

commit e46fbc68b7bc7e705417475fcfb8e203056b5a51
Author: Julian Brown <julian@codesourcery.com>
Date:   Fri Jun 5 10:01:01 2015 -0700

    Threadbarrier after worker and vector loops.

diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 55a2a12..45ff05a 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -3691,6 +3691,15 @@  build_omp_barrier (tree lhs)
   return g;
 }
 
+/* Build a call to GOACC_threadbarrier.  */
+
+static gcall *
+build_oacc_threadbarrier (void)
+{
+  tree fndecl = builtin_decl_explicit (BUILT_IN_GOACC_THREADBARRIER);
+  return gimple_build_call (fndecl, 0);
+}
+
 /* If a context was created for STMT when it was scanned, return it.  */
 
 static omp_context *
@@ -7181,6 +7190,20 @@  expand_omp_for_generic (struct omp_region *region,
 }
 
 
+/* True if a barrier is needed after a loop partitioned over
+   gangs/workers/vectors as specified by GWV_BITS.  OpenACC semantics specify
+   that a (conceptual) barrier is needed after worker and vector-partitioned
+   loops, but not after gang-partitioned loops.  Currently we are relying on
+   warp reconvergence to synchronise threads within a warp after vector loops,
+   so an explicit barrier is not helpful after those.  */
+
+static bool
+oacc_loop_needs_threadbarrier_p (int gwv_bits)
+{
+  return (gwv_bits & (MASK_GANG | MASK_WORKER)) == MASK_WORKER;
+}
+
+
 /* A subroutine of expand_omp_for.  Generate code for a parallel
    loop with static schedule and no specified chunk size.  Given
    parameters:
@@ -7523,7 +7546,11 @@  expand_omp_for_static_nochunk (struct omp_region *region,
     {
       t = gimple_omp_return_lhs (gsi_stmt (gsi));
       if (gimple_omp_for_kind (fd->for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
-	gcc_checking_assert (t == NULL_TREE);
+	{
+	  gcc_checking_assert (t == NULL_TREE);
+	  if (oacc_loop_needs_threadbarrier_p (region->gwv_this))
+	    gsi_insert_after (&gsi, build_oacc_threadbarrier (), GSI_SAME_STMT);
+	}
       else
 	gsi_insert_after (&gsi, build_omp_barrier (t), GSI_SAME_STMT);
     }
@@ -7956,7 +7983,11 @@  expand_omp_for_static_chunk (struct omp_region *region,
     {
       t = gimple_omp_return_lhs (gsi_stmt (gsi));
       if (gimple_omp_for_kind (fd->for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
-	gcc_checking_assert (t == NULL_TREE);
+        {
+	  gcc_checking_assert (t == NULL_TREE);
+	  if (oacc_loop_needs_threadbarrier_p (region->gwv_this))
+	    gsi_insert_after (&gsi, build_oacc_threadbarrier (), GSI_SAME_STMT);
+	}
       else
 	gsi_insert_after (&gsi, build_omp_barrier (t), GSI_SAME_STMT);
     }
@@ -10270,22 +10301,26 @@  expand_omp (struct omp_region *region)
 /* Map each basic block to an omp_region.  */
 static hash_map<basic_block, omp_region *> *bb_region_map;
 
-/* Fill in additional data for a region REGION associated with an
+/* Return a mask of GWV bits for region REGION associated with an
    OMP_FOR STMT.  */
 
-static void
-find_omp_for_region_data (struct omp_region *region, gimple stmt)
+static int
+find_omp_for_region_gwv (gimple stmt)
 {
+  int tmp = 0;
+
   if (!is_gimple_omp_oacc (stmt))
-    return;
+    return 0;
 
   tree clauses = gimple_omp_for_clauses (stmt);
   if (find_omp_clause (clauses, OMP_CLAUSE_GANG))
-    region->gwv_this |= MASK_GANG;
+    tmp |= MASK_GANG;
   if (find_omp_clause (clauses, OMP_CLAUSE_WORKER))
-    region->gwv_this |= MASK_WORKER;
+    tmp |= MASK_WORKER;
   if (find_omp_clause (clauses, OMP_CLAUSE_VECTOR))
-    region->gwv_this |= MASK_VECTOR;
+    tmp |= MASK_VECTOR;
+
+  return tmp;
 }
 
 /* Fill in additional data for a region REGION associated with an
@@ -10391,7 +10426,7 @@  build_omp_regions_1 (basic_block bb, struct omp_region *parent,
 		}
 	    }
 	  else if (code == GIMPLE_OMP_FOR)
-	    find_omp_for_region_data (region, stmt);
+	    region->gwv_this = find_omp_for_region_gwv (stmt);
 	  /* ..., this directive becomes the parent for a new region.  */
 	  if (region)
 	    parent = region;
@@ -10560,9 +10595,7 @@  generate_oacc_broadcast (omp_region *region, tree dest_var, tree var,
   gassign *st = gimple_build_assign (build_simple_mem_ref (ptr), var);
   gsi_insert_after (&where, st, GSI_NEW_STMT);
 
-  tree fndecl = builtin_decl_explicit (BUILT_IN_GOACC_THREADBARRIER);
-  gcall *sync_bar = gimple_build_call (fndecl, 0);
-  gsi_insert_after (&where, sync_bar, GSI_NEW_STMT);
+  gsi_insert_after (&where, build_oacc_threadbarrier (), GSI_NEW_STMT);
 
   gassign *cast2 = gimple_build_assign (ptr, NOP_EXPR,
 					parent->broadcast_array);
@@ -13877,11 +13910,29 @@  make_gimple_omp_edges (basic_block bb, struct omp_region **region,
        break;
 
     case GIMPLE_OMP_RETURN:
-      /* In the case of a GIMPLE_OMP_SECTION, the edge will go
-	 somewhere other than the next block.  This will be
-	 created later.  */
       cur_region->exit = bb;
-      fallthru = cur_region->type != GIMPLE_OMP_SECTION;
+      gimple for_stmt;
+      if (cur_region->type == GIMPLE_OMP_FOR
+	  && gimple_omp_for_kind (as_a <gomp_for *>
+				  ((for_stmt = last_stmt (cur_region->entry))))
+	     == GF_OMP_FOR_KIND_OACC_LOOP)
+        {
+	  /* Called before OMP expansion, so this information has not been
+	     recorded in cur_region->gwv_this yet.  */
+	  int gwv_bits = find_omp_for_region_gwv (for_stmt);
+	  if (oacc_loop_needs_threadbarrier_p (gwv_bits))
+	    {
+	      make_edge (bb, bb->next_bb, EDGE_FALLTHRU | EDGE_ABNORMAL);
+	      fallthru = false;
+	    }
+	  else
+	    fallthru = true;
+	}
+      else
+	/* In the case of a GIMPLE_OMP_SECTION, the edge will go
+	   somewhere other than the next block.  This will be
+	   created later.  */
+	fallthru = cur_region->type != GIMPLE_OMP_SECTION;
       cur_region = cur_region->outer;
       break;
 
diff --git a/gcc/tree-ssa-alias.c b/gcc/tree-ssa-alias.c
index fa6caef..249ac45 100644
--- a/gcc/tree-ssa-alias.c
+++ b/gcc/tree-ssa-alias.c
@@ -1764,6 +1764,7 @@  ref_maybe_used_by_call_p_1 (gcall *call, ao_ref *ref)
 	case BUILT_IN_GOMP_ATOMIC_END:
 	case BUILT_IN_GOMP_BARRIER:
 	case BUILT_IN_GOMP_BARRIER_CANCEL:
+	case BUILT_IN_GOACC_THREADBARRIER:
 	case BUILT_IN_GOMP_TASKWAIT:
 	case BUILT_IN_GOMP_TASKGROUP_END:
 	case BUILT_IN_GOMP_CRITICAL_START:
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-5.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-5.c
index fc66b04..a1d2a72 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-5.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-5.c
@@ -1,5 +1,3 @@ 
-/* { dg-xfail-run-if "TODO" { openacc_nvidia_accel_selected } { "*" } { "" } } */
-
 #include <assert.h>
 
 /* Test correct synchronisation between worker-partitioned loops.  */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-6.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-6.c
index 0f3a1d7..cfbcd17 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-6.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-6.c
@@ -1,5 +1,3 @@ 
-/* { dg-xfail-run-if "TODO" { openacc_nvidia_accel_selected } { "*" } { "" } } */
-
 #include <assert.h>
 
 /* Test correct synchronisation between worker+vector-partitioned loops.  */