diff mbox series

[1/3,og9] Wait at end of OpenACC asynchronous kernels regions

Message ID 79cc9084f24fec88df02daa5b099c8288ee06626.1565729221.git.julian@codesourcery.com
State New
Headers show
Series OpenACC async fixes for AMD GCN | expand

Commit Message

Julian Brown Aug. 13, 2019, 9:37 p.m. UTC
This patch provides a workaround for unreliable operation of asynchronous
kernels regions on AMD GCN. At present, kernels regions are decomposed
into a series of parallel regions surrounded by a data region capturing
the data-movement clauses needed by the region as a whole:

  #pragma acc kernels async(n)
  { ... }

is translated to:

  #pragma acc data copyin(...) copyout(...)
  {
    #pragma acc parallel async(n) present(...)
    { ... }
    #pragma acc parallel async(n) present(...)
    { ... }
  }

This is however problematic for two reasons:

 - Variables mapped by the data clause will be unmapped immediately at the end
   of the data region, regardless of whether the inner asynchronous
   parallels have completed. (This causes crashes for GCN.)

 - Even if the "present" clause caused the reference count to stay above zero
   at the end of the data region -- which it doesn't -- the "present"
   clauses on the inner parallel regions would not cause "copyout"
   variables to be transferred back to the host at the appropriate time,
   i.e. when the async parallel region had completed.

There is no "async" data construct in OpenACC, so the correct solution
(which I am deferring on for now) is probably to use asynchronous
"enter data" and "exit data" directives when translating asynchronous
kernels regions instead.

The attached patch just adds a "wait" operation before the end of
the enclosing data region. This works, but introduces undesirable
synchronisation with the host.

Julian

ChangeLog

	gcc/
	* omp-oacc-kernels.c (add_wait): New function, split out of...
	(add_async_clauses_and_wait): ...here. Call new outlined function.
	(decompose_kernels_region_body): Add wait at the end of
	explicitly-asynchronous kernels regions.
---
 gcc/ChangeLog.openacc  |  7 +++++++
 gcc/omp-oacc-kernels.c | 28 +++++++++++++++++++++-------
 2 files changed, 28 insertions(+), 7 deletions(-)
diff mbox series

Patch

diff --git a/gcc/ChangeLog.openacc b/gcc/ChangeLog.openacc
index 84d80511603..a22f07c817c 100644
--- a/gcc/ChangeLog.openacc
+++ b/gcc/ChangeLog.openacc
@@ -1,3 +1,10 @@ 
+2019-08-13  Julian Brown  <julian@codesourcery.com>
+
+	* omp-oacc-kernels.c (add_wait): New function, split out of...
+	(add_async_clauses_and_wait): ...here. Call new outlined function.
+	(decompose_kernels_region_body): Add wait at the end of
+	explicitly-asynchronous kernels regions.
+
 2019-08-08  Julian Brown  <julian@codesourcery.com>
 
 	* config/gcn/gcn.c (gcn_goacc_validate_dims): Ensure
diff --git a/gcc/omp-oacc-kernels.c b/gcc/omp-oacc-kernels.c
index 20913859c12..a6c4220f472 100644
--- a/gcc/omp-oacc-kernels.c
+++ b/gcc/omp-oacc-kernels.c
@@ -900,6 +900,18 @@  maybe_build_inner_data_region (location_t loc, gimple *body,
   return body;
 }
 
+static void
+add_wait (location_t loc, gimple_seq *region_body)
+{
+  /* A "#pragma acc wait" is just a call GOACC_wait (acc_async_sync, 0).  */
+  tree wait_fn = builtin_decl_explicit (BUILT_IN_GOACC_WAIT);
+  tree sync_arg = build_int_cst (integer_type_node, GOMP_ASYNC_SYNC);
+  gimple *wait_call = gimple_build_call (wait_fn, 2,
+                                         sync_arg, integer_zero_node);
+  gimple_set_location (wait_call, loc);
+  gimple_seq_add_stmt (region_body, wait_call);
+}
+
 /* Helper function of decompose_kernels_region_body.  The statements in
    REGION_BODY are expected to be decomposed parallel regions; add an
    "async" clause to each.  Also add a "wait" pragma at the end of the
@@ -923,13 +935,7 @@  add_async_clauses_and_wait (location_t loc, gimple_seq *region_body)
       gimple_omp_target_set_clauses (as_a <gomp_target *> (stmt),
                                      target_clauses);
     }
-  /* A "#pragma acc wait" is just a call GOACC_wait (acc_async_sync, 0).  */
-  tree wait_fn = builtin_decl_explicit (BUILT_IN_GOACC_WAIT);
-  tree sync_arg = build_int_cst (integer_type_node, GOMP_ASYNC_SYNC);
-  gimple *wait_call = gimple_build_call (wait_fn, 2,
-                                         sync_arg, integer_zero_node);
-  gimple_set_location (wait_call, loc);
-  gimple_seq_add_stmt (region_body, wait_call);
+  add_wait (loc, region_body);
 }
 
 /* Auxiliary analysis of the body of a kernels region, to determine for each
@@ -1378,6 +1384,14 @@  decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses)
      a wait directive at the end.  */
   if (async_clause == NULL)
     add_async_clauses_and_wait (loc, &region_body);
+  else
+    /* !!! If we have asynchronous parallel blocks inside a (synchronous) data
+       region, then target memory will get unmapped at the point the data
+       region ends, even if the inner asynchronous parallels have not yet
+       completed.  For kernels marked "async", we might want to use "enter data
+       async(...)" and "exit data async(...)" instead.
+       For now, insert a (synchronous) wait at the end of the block.  */
+    add_wait (loc, &region_body);
 
   tree kernels_locals = gimple_bind_vars (as_a <gbind *> (kernels_body));
   gimple *body = gimple_build_bind (kernels_locals, region_body,