commit 78d6b16bf258106282f791f2e7b3010bf75f2a86
Author: Julian Brown <julian@codesourcery.com>
Date: Wed Oct 15 02:10:00 2014 -0700
Async fixes/improvements.
@@ -294,6 +294,16 @@ openacc_parallel (void (*fn) (void *), size_t mapnum __attribute__((unused)),
}
+/* Host implementation of the register_async_cleanup hook.  When
+   HOST_NONSHM_PLUGIN is defined, TARG_MEM_DESC is handed straight to
+   GOMP_PLUGIN_async_unmap_vars; otherwise this function does nothing.
+   TARG_MEM_DESC is therefore unused in some configurations, so it is
+   marked "unused" like the parameters of the neighbouring hooks.  */
STATIC void
+openacc_register_async_cleanup (void *targ_mem_desc __attribute__((unused)))
+{
+#ifdef HOST_NONSHM_PLUGIN
+  /* "Asynchronous" launches are executed synchronously on the (non-SHM) host,
+     so there's no point in delaying host-side cleanup -- just do it now.  */
+  GOMP_PLUGIN_async_unmap_vars (targ_mem_desc);
+#endif
+}
+
+STATIC void
openacc_async_set_async (int async __attribute__((unused)))
{
#ifdef DEBUG
@@ -397,6 +407,8 @@ static struct gomp_device_descr host_dispatch =
.exec_func = openacc_parallel,
+ .register_async_cleanup_func = openacc_register_async_cleanup,
+
.async_set_async_func = openacc_async_set_async,
.async_test_func = openacc_async_test,
.async_test_all_func = openacc_async_test_all,
@@ -64,6 +64,9 @@ typedef struct ACC_dispatch_t
void (*exec_func) (void (*) (void *), size_t, void **, void **, size_t *,
unsigned short *, int, int, int, int, void *);
+ /* async cleanup callback registration */
+ void (*register_async_cleanup_func) (void *);
+
/* asynchronous routines */
int (*async_test_func) (int);
int (*async_test_all_func) (void);
@@ -213,7 +213,10 @@ GOACC_parallel (int device, void (*fn) (void *), const void *openmp_target,
if (async < acc_async_noval)
gomp_unmap_vars (tgt, true);
else
- gomp_copy_from_async (tgt);
+ {
+ gomp_copy_from_async (tgt);
+ ACC_dev->openacc.register_async_cleanup_func (tgt);
+ }
ACC_dev->openacc.async_set_async_func (acc_async_sync);
}
@@ -317,7 +317,8 @@ enum PTX_event_type
{
PTX_EVT_MEM,
PTX_EVT_KNL,
- PTX_EVT_SYNC
+ PTX_EVT_SYNC,
+ PTX_EVT_ASYNC_CLEANUP
};
struct PTX_event
@@ -325,7 +326,6 @@ struct PTX_event
CUevent *evt;
int type;
void *addr;
- void *tgt;
int ord;
SLIST_ENTRY(PTX_event) next;
};
@@ -946,6 +946,10 @@ event_gc (bool memmap_lockable)
break;
case PTX_EVT_KNL:
+ map_pop (ptx_event->addr);
+ break;
+
+ case PTX_EVT_ASYNC_CLEANUP:
{
/* The function GOMP_PLUGIN_async_unmap_vars needs to claim the
memory-map splay tree lock for the current device, so we
@@ -955,9 +959,7 @@ event_gc (bool memmap_lockable)
if (!memmap_lockable)
goto next_event;
- map_pop (ptx_event->addr);
- if (ptx_event->tgt)
- GOMP_PLUGIN_async_unmap_vars (ptx_event->tgt);
+ GOMP_PLUGIN_async_unmap_vars (ptx_event->addr);
}
break;
}
@@ -978,17 +980,17 @@ event_gc (bool memmap_lockable)
}
static void
-event_add (enum PTX_event_type type, CUevent *e, void *h, void *tgt)
+event_add (enum PTX_event_type type, CUevent *e, void *h)
{
struct PTX_event *ptx_event;
- assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC);
+ assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
+ || type == PTX_EVT_ASYNC_CLEANUP);
ptx_event = GOMP_PLUGIN_malloc (sizeof (struct PTX_event));
ptx_event->type = type;
ptx_event->evt = e;
ptx_event->addr = h;
- ptx_event->tgt = tgt;
ptx_event->ord = PTX_dev->ord;
GOMP_PLUGIN_mutex_lock (&PTX_event_lock);
@@ -1092,7 +1094,7 @@ PTX_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
- event_add (PTX_EVT_KNL, e, (void *)dev_str, targ_mem_desc);
+ event_add (PTX_EVT_KNL, e, (void *)dev_str);
}
#else
r = cuCtxSynchronize ();
@@ -1194,7 +1196,7 @@ PTX_host2dev (void *d, const void *h, size_t s)
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
- event_add (PTX_EVT_MEM, e, (void *)h, NULL);
+ event_add (PTX_EVT_MEM, e, (void *)h);
}
else
#endif
@@ -1257,7 +1259,7 @@ PTX_dev2host (void *h, const void *d, size_t s)
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
- event_add (PTX_EVT_MEM, e, (void *)h, NULL);
+ event_add (PTX_EVT_MEM, e, (void *)h);
}
else
#endif
@@ -1289,7 +1291,15 @@ PTX_async_test (int async)
r = cuStreamQuery (s->stream);
if (r == CUDA_SUCCESS)
- return 1;
+ {
+ /* The oacc-parallel.c:goacc_wait function calls this hook to determine
+ whether all work has completed on this stream, and if so omits the call
+ to the wait hook. If that happens, event_gc might not get called
+ (which prevents variables from getting unmapped and their associated
+ device storage freed), so call it here. */
+ event_gc (true);
+ return 1;
+ }
else if (r == CUDA_ERROR_NOT_READY)
return 0;
@@ -1318,6 +1328,8 @@ PTX_async_test_all (void)
GOMP_PLUGIN_mutex_unlock (&PTX_dev->stream_lock);
+ event_gc (true);
+
return 1;
}
@@ -1370,7 +1382,7 @@ PTX_wait_async (int async1, int async2)
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
- event_add (PTX_EVT_SYNC, e, NULL, NULL);
+ event_add (PTX_EVT_SYNC, e, NULL);
r = cuStreamWaitEvent (s2->stream, *e, 0);
if (r != CUDA_SUCCESS)
@@ -1448,7 +1460,7 @@ PTX_wait_all_async (int async)
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
- event_add (PTX_EVT_SYNC, e, NULL, NULL);
+ event_add (PTX_EVT_SYNC, e, NULL);
r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
if (r != CUDA_SUCCESS)
@@ -1771,6 +1783,30 @@ openacc_avail (void)
return PTX_avail ();
}
+/* PTX implementation of the register_async_cleanup hook.  Creates a CUDA
+   event, records it on current_stream->stream, and registers it with
+   event_add as a PTX_EVT_ASYNC_CLEANUP entry carrying TARG_MEM_DESC.  A
+   failing CUDA call is reported through GOMP_PLUGIN_fatal.  */
+void
+openacc_register_async_cleanup (void *targ_mem_desc)
+{
+  CUresult status;
+  CUevent *cleanup_evt;
+
+#ifdef DEBUG
+  fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__,
+           targ_mem_desc);
+#endif
+
+  cleanup_evt = (CUevent *) GOMP_PLUGIN_malloc (sizeof *cleanup_evt);
+
+  status = cuEventCreate (cleanup_evt, CU_EVENT_DISABLE_TIMING);
+  if (status != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (status));
+
+  status = cuEventRecord (*cleanup_evt, current_stream->stream);
+  if (status != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (status));
+
+  /* event_gc's PTX_EVT_ASYNC_CLEANUP case passes the address stored here
+     to GOMP_PLUGIN_async_unmap_vars.  */
+  event_add (PTX_EVT_ASYNC_CLEANUP, cleanup_evt, targ_mem_desc);
+}
+
int
openacc_async_test (int async)
{
@@ -1067,6 +1067,8 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
DLSYM_OPT (openacc.get_device_num, openacc_get_device_num);
DLSYM_OPT (openacc.set_device_num, openacc_set_device_num);
DLSYM_OPT (openacc.avail, openacc_avail);
+ DLSYM_OPT (openacc.register_async_cleanup,
+ openacc_register_async_cleanup);
DLSYM_OPT (openacc.async_test, openacc_async_test);
DLSYM_OPT (openacc.async_test_all, openacc_async_test_all);
DLSYM_OPT (openacc.async_wait, openacc_async_wait);