diff mbox

[gomp4] Asynchronous data unmapping & wait fixes for OpenACC

Message ID 20141017170540.3c0c4a41@octopus
State New
Headers show

Commit Message

Julian Brown Oct. 17, 2014, 4:05 p.m. UTC
Hi,

This patch introduces a new plugin hook in libgomp to register a
callback function to clean up host-side bookkeeping data after an
asynchronous operation has completed (replacing the previous ad-hoc
method used in the NVPTX backend), and adds code to ensure that same
cleanup is done reliably in the NVPTX backend when the user program
hits a "wait" directive, or equivalent.

OK for the gomp4 branch?

Thanks,

Julian

ChangeLog

    libgomp/
    * oacc-host.c (openacc_register_async_cleanup): New.
    (host_dispatch): Initialise register_async_cleanup_func entry.
    * oacc-int.h (struct ACC_dispatch_t): Add
    register_async_cleanup_func hook.
    * oacc-parallel.c (GOACC_parallel): Call
    register_async_cleanup_func hook after queuing asynchronous
    copy-back.
    * plugin-nvptx.c (enum PTX_event_type): Add PTX_EVT_ASYNC_CLEANUP.
    (struct PTX_event): Remove tgt field.
    (event_gc): Don't do async cleanup in PTX_EVT_KNL, do it in
    PTX_EVT_ASYNC_CLEANUP instead.
    (event_add): Remove tgt argument. Support PTX_EVT_ASYNC_CLEANUP
    events.
    (PTX_exec, PTX_host2dev, PTX_dev2host, PTX_wait_async)
    (PTX_wait_all_async): Update calls to event_add.
    (openacc_register_async_cleanup): New.
    (PTX_async_test): Call event_gc on success path.
    (PTX_async_test_all): Likewise.
    * target.c (gomp_load_plugin_for_device): Initialise
    register_async_cleanup hook.

Comments

Thomas Schwinge Oct. 20, 2014, 9:47 a.m. UTC | #1
Hi Julian!

On Fri, 17 Oct 2014 17:05:40 +0100, Julian Brown <julian@codesourcery.com> wrote:
> This patch introduces a new plugin hook in libgomp to register a
> callback function to clean up host-side bookkeeping data after an
> asynchronous operation has completed (replacing the previous ad-hoc
> method used in the NVPTX backend), and adds code to ensure that same
> cleanup is done reliably in the NVPTX backend when the user program
> hits a "wait" directive, or equivalent.
> 
> OK for the gomp4 branch?

Yes, thanks.

>     libgomp/
>     * oacc-host.c (openacc_register_async_cleanup): New.
>     (host_dispatch): Initialise register_async_cleanup_func entry.
>     * oacc-int.h (struct ACC_dispatch_t): Add
>     register_async_cleanup_func hook.
>     * oacc-parallel.c (GOACC_parallel): Call
>     register_async_cleanup_func hook after queuing asynchronous
>     copy-back.
>     * plugin-nvptx.c (enum PTX_event_type): Add PTX_EVT_ASYNC_CLEANUP.
>     (struct PTX_event): Remove tgt field.
>     (event_gc): Don't do async cleanup in PTX_EVT_KNL, do it in
>     PTX_EVT_ASYNC_CLEANUP instead.
>     (event_add): Remove tgt argument. Support PTX_EVT_ASYNC_CLEANUP
>     events.
>     (PTX_exec, PTX_host2dev, PTX_dev2host, PTX_wait_async)
>     (PTX_wait_all_async): Update calls to event_add.
>     (openacc_register_async_cleanup): New.
>     (PTX_async_test): Call event_gc on success path.
>     (PTX_async_test_all): Likewise.
>     * target.c (gomp_load_plugin_for_device): Initialise
>     register_async_cleanup hook.


Grüße,
 Thomas
diff mbox

Patch

commit 78d6b16bf258106282f791f2e7b3010bf75f2a86
Author: Julian Brown <julian@codesourcery.com>
Date:   Wed Oct 15 02:10:00 2014 -0700

    Async fixes/improvements.

diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c
index a47617a..f44ca5e 100644
--- a/libgomp/oacc-host.c
+++ b/libgomp/oacc-host.c
@@ -294,6 +294,16 @@  openacc_parallel (void (*fn) (void *), size_t mapnum __attribute__((unused)),
 }
 
 STATIC void
+openacc_register_async_cleanup (void *targ_mem_desc)
+{
+#ifdef HOST_NONSHM_PLUGIN
+  /* "Asynchronous" launches are executed synchronously on the (non-SHM) host,
+     so there's no point in delaying host-side cleanup -- just do it now.  */
+  GOMP_PLUGIN_async_unmap_vars (targ_mem_desc);
+#endif
+}
+
+STATIC void
 openacc_async_set_async (int async __attribute__((unused)))
 {
 #ifdef DEBUG
@@ -397,6 +407,8 @@  static struct gomp_device_descr host_dispatch =
 
       .exec_func = openacc_parallel,
 
+      .register_async_cleanup_func = openacc_register_async_cleanup,
+
       .async_set_async_func = openacc_async_set_async,
       .async_test_func = openacc_async_test,
       .async_test_all_func = openacc_async_test_all,
diff --git a/libgomp/oacc-int.h b/libgomp/oacc-int.h
index e1d2e32..03529cc 100644
--- a/libgomp/oacc-int.h
+++ b/libgomp/oacc-int.h
@@ -64,6 +64,9 @@  typedef struct ACC_dispatch_t
   void (*exec_func) (void (*) (void *), size_t, void **, void **, size_t *,
 		     unsigned short *, int, int, int, int, void *);
 
+  /* async cleanup callback registration */
+  void (*register_async_cleanup_func) (void *);
+
   /* asynchronous routines  */
   int (*async_test_func) (int);
   int (*async_test_all_func) (void);
diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c
index 57ac8de..e3f156c 100644
--- a/libgomp/oacc-parallel.c
+++ b/libgomp/oacc-parallel.c
@@ -213,7 +213,10 @@  GOACC_parallel (int device, void (*fn) (void *), const void *openmp_target,
   if (async < acc_async_noval)
     gomp_unmap_vars (tgt, true);
   else
-    gomp_copy_from_async (tgt);
+    {
+      gomp_copy_from_async (tgt);
+      ACC_dev->openacc.register_async_cleanup_func (tgt);
+    }
 
   ACC_dev->openacc.async_set_async_func (acc_async_sync);
 }
diff --git a/libgomp/plugin-nvptx.c b/libgomp/plugin-nvptx.c
index e163f3a..f193229 100644
--- a/libgomp/plugin-nvptx.c
+++ b/libgomp/plugin-nvptx.c
@@ -317,7 +317,8 @@  enum PTX_event_type
 {
   PTX_EVT_MEM,
   PTX_EVT_KNL,
-  PTX_EVT_SYNC
+  PTX_EVT_SYNC,
+  PTX_EVT_ASYNC_CLEANUP
 };
 
 struct PTX_event
@@ -325,7 +326,6 @@  struct PTX_event
   CUevent *evt;
   int type;
   void *addr;
-  void *tgt;
   int ord;
   SLIST_ENTRY(PTX_event) next;
 };
@@ -946,6 +946,10 @@  event_gc (bool memmap_lockable)
 	      break;
 	    
 	    case PTX_EVT_KNL:
+              map_pop (ptx_event->addr);
+	      break;
+
+	    case PTX_EVT_ASYNC_CLEANUP:
               {
 	        /* The function GOMP_PLUGIN_async_unmap_vars needs to claim the
 		   memory-map splay tree lock for the current device, so we
@@ -955,9 +959,7 @@  event_gc (bool memmap_lockable)
 	        if (!memmap_lockable)
 		  goto next_event;
 
-        	map_pop (ptx_event->addr);
-		if (ptx_event->tgt)
-		  GOMP_PLUGIN_async_unmap_vars (ptx_event->tgt);
+		GOMP_PLUGIN_async_unmap_vars (ptx_event->addr);
               }
 	      break;
 	    }
@@ -978,17 +980,17 @@  event_gc (bool memmap_lockable)
 }
 
 static void
-event_add (enum PTX_event_type type, CUevent *e, void *h, void *tgt)
+event_add (enum PTX_event_type type, CUevent *e, void *h)
 {
   struct PTX_event *ptx_event;
 
-  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC);
+  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
+	  || type == PTX_EVT_ASYNC_CLEANUP);
 
   ptx_event = GOMP_PLUGIN_malloc (sizeof (struct PTX_event));
   ptx_event->type = type;
   ptx_event->evt = e;
   ptx_event->addr = h;
-  ptx_event->tgt = tgt;
   ptx_event->ord = PTX_dev->ord;
 
   GOMP_PLUGIN_mutex_lock (&PTX_event_lock);
@@ -1092,7 +1094,7 @@  PTX_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
       if (r != CUDA_SUCCESS)
         GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
 
-      event_add (PTX_EVT_KNL, e, (void *)dev_str, targ_mem_desc);
+      event_add (PTX_EVT_KNL, e, (void *)dev_str);
     }
 #else
   r = cuCtxSynchronize ();
@@ -1194,7 +1196,7 @@  PTX_host2dev (void *d, const void *h, size_t s)
       if (r != CUDA_SUCCESS)
         GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
 
-      event_add (PTX_EVT_MEM, e, (void *)h, NULL);
+      event_add (PTX_EVT_MEM, e, (void *)h);
     }
   else
 #endif
@@ -1257,7 +1259,7 @@  PTX_dev2host (void *h, const void *d, size_t s)
       if (r != CUDA_SUCCESS)
         GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
 
-      event_add (PTX_EVT_MEM, e, (void *)h, NULL);
+      event_add (PTX_EVT_MEM, e, (void *)h);
     }
   else
 #endif
@@ -1289,7 +1291,15 @@  PTX_async_test (int async)
 
   r = cuStreamQuery (s->stream);
   if (r == CUDA_SUCCESS)
-    return 1;
+    {
+      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
+	 whether all work has completed on this stream, and if so omits the call
+	 to the wait hook.  If that happens, event_gc might not get called
+	 (which prevents variables from getting unmapped and their associated
+	 device storage freed), so call it here.  */
+      event_gc (true);
+      return 1;
+    }
   else if (r == CUDA_ERROR_NOT_READY)
     return 0;
 
@@ -1318,6 +1328,8 @@  PTX_async_test_all (void)
 
   GOMP_PLUGIN_mutex_unlock (&PTX_dev->stream_lock);
 
+  event_gc (true);
+
   return 1;
 }
 
@@ -1370,7 +1382,7 @@  PTX_wait_async (int async1, int async2)
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
 
-  event_add (PTX_EVT_SYNC, e, NULL, NULL);
+  event_add (PTX_EVT_SYNC, e, NULL);
 
   r = cuStreamWaitEvent (s2->stream, *e, 0);
   if (r != CUDA_SUCCESS)
@@ -1448,7 +1460,7 @@  PTX_wait_all_async (int async)
       if (r != CUDA_SUCCESS)
 	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
 
-      event_add (PTX_EVT_SYNC, e, NULL, NULL);
+      event_add (PTX_EVT_SYNC, e, NULL);
 
       r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
       if (r != CUDA_SUCCESS)
@@ -1771,6 +1783,30 @@  openacc_avail (void)
   return PTX_avail ();
 }
 
+void
+openacc_register_async_cleanup (void *targ_mem_desc)
+{
+  CUevent *e;
+  CUresult r;
+
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__,
+          targ_mem_desc);
+#endif
+
+  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
+
+  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r));
+
+  r = cuEventRecord (*e, current_stream->stream);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
+
+  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
+}
+
 int
 openacc_async_test (int async)
 {
diff --git a/libgomp/target.c b/libgomp/target.c
index 418ba61..79b252d 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -1067,6 +1067,8 @@  gomp_load_plugin_for_device (struct gomp_device_descr *device,
       DLSYM_OPT (openacc.get_device_num, openacc_get_device_num);
       DLSYM_OPT (openacc.set_device_num, openacc_set_device_num);
       DLSYM_OPT (openacc.avail, openacc_avail);
+      DLSYM_OPT (openacc.register_async_cleanup,
+		 openacc_register_async_cleanup);
       DLSYM_OPT (openacc.async_test, openacc_async_test);
       DLSYM_OPT (openacc.async_test_all, openacc_async_test_all);
       DLSYM_OPT (openacc.async_wait, openacc_async_wait);