Message ID | 2437ac99-433b-9bdb-6948-bc40b2e6cfd3@codesourcery.com |
---|---|
State | New |
Headers | show |
Series | [nvptx] Remove use of CUDA unified memory in libgomp | expand |
On 07/31/2018 05:27 PM, Cesar Philippidis wrote: > At present, libgomp is using CUDA unified memory only as a buffer pass > to the struct containing the pointers to the data mappings to the > offloaded functions. I'm not sure why unified memory is needed here if > it is still being managed explicitly by the driver. > > This patch removes the use of CUDA unified memory from the driver. I > don't recall observing any reduction in performance. Besides, > eventually, we'd like to eliminate the struct containing all pointers to > the offloaded data mappings and pass those pointers as individual > function arguments to cuLaunchKernel directly. > > Is this patch OK for trunk? I bootstrapped and regression tested it for > x86_64 with nvptx offloading. > > Thanks, > Cesar > > > 0001-nvptx-Remove-use-of-CUDA-unified-memory-in-libgomp.patch > > > [PATCH] [nvptx] Remove use of CUDA unified memory in libgomp > > 2018-XX-YY Cesar Philippidis <cesar@codesourcery.com> > > libgomp/ > * plugin/plugin-nvptx.c (struct cuda_map): New. > (struct ptx_stream): Replace d, h, h_begin, h_end, h_next, h_prev, > h_tail with (cuda_map *) map. > (cuda_map_create): New function. > (cuda_map_destroy): New function. > (map_init): Update to use a linked list of cuda_map objects. > (map_fini): Likewise. > (map_pop): Likewise. > (map_push): Likewise. Return CUdeviceptr instead of void. > (init_streams_for_device): Remove stales references to ptx_stream > members. > (select_stream_for_async): Likewise. > (nvptx_exec): Update call to map_init. 
> > (cherry picked from gomp-4_0-branch r242614) > --- > libgomp/plugin/plugin-nvptx.c | 167 +++++++++++++++++++++++------------------- > 1 file changed, 90 insertions(+), 77 deletions(-) > > diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c > index 1237ea10..d79ddf1 100644 > --- a/libgomp/plugin/plugin-nvptx.c > +++ b/libgomp/plugin/plugin-nvptx.c > @@ -200,20 +200,20 @@ cuda_error (CUresult r) > static unsigned int instantiated_devices = 0; > static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER; > > +struct cuda_map > +{ > + CUdeviceptr d; > + size_t size; > + bool active; > + struct cuda_map *next; > +}; > + > struct ptx_stream > { > CUstream stream; > pthread_t host_thread; > bool multithreaded; > - > - CUdeviceptr d; > - void *h; > - void *h_begin; > - void *h_end; > - void *h_next; > - void *h_prev; > - void *h_tail; > - > + struct cuda_map *map; > struct ptx_stream *next; > }; > > @@ -225,101 +225,114 @@ struct nvptx_thread > struct ptx_device *ptx_dev; > }; > > +static struct cuda_map * > +cuda_map_create (size_t size) > +{ > + struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map)); > + > + assert (map); > + > + map->next = NULL; > + map->size = size; > + map->active = false; > + > + CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size); > + assert (map->d); > + > + return map; > +} > + > +static void > +cuda_map_destroy (struct cuda_map *map) > +{ > + CUDA_CALL_ASSERT (cuMemFree, map->d); > + free (map); > +} > + > +/* The following map_* routines manage the CUDA device memory that > + contains the data mapping arguments for cuLaunchKernel. Each > + asynchronous PTX stream may have multiple pending kernel > + invocations, which are launched in a FIFO order. As such, the map > + routines maintains a queue of cuLaunchKernel arguments. > + > + Calls to map_push and map_pop must be guarded by ptx_event_lock. 
> + Likewise, calls to map_init and map_fini are guarded by > + ptx_dev_lock inside GOMP_OFFLOAD_init_device and > + GOMP_OFFLOAD_fini_device, respectively. */ > + > static bool > map_init (struct ptx_stream *s) > { > int size = getpagesize (); > > assert (s); > - assert (!s->d); > - assert (!s->h); > - > - CUDA_CALL (cuMemAllocHost, &s->h, size); > - CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0); > > - assert (s->h); > + s->map = cuda_map_create (size); > > - s->h_begin = s->h; > - s->h_end = s->h_begin + size; > - s->h_next = s->h_prev = s->h_tail = s->h_begin; > - > - assert (s->h_next); > - assert (s->h_end); > return true; > } > > static bool > map_fini (struct ptx_stream *s) > { > - CUDA_CALL (cuMemFreeHost, s->h); > + assert (s->map->next == NULL); > + assert (!s->map->active); > + > + cuda_map_destroy (s->map); > + > return true; > } > > static void > map_pop (struct ptx_stream *s) > { > - assert (s != NULL); > - assert (s->h_next); > - assert (s->h_prev); > - assert (s->h_tail); > - > - s->h_tail = s->h_next; > - > - if (s->h_tail >= s->h_end) > - s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end); > + struct cuda_map *next; > > - if (s->h_next == s->h_tail) > - s->h_prev = s->h_next; > + assert (s != NULL); > > - assert (s->h_next >= s->h_begin); > - assert (s->h_tail >= s->h_begin); > - assert (s->h_prev >= s->h_begin); > + if (s->map->next == NULL) > + { > + s->map->active = false; > + return; > + } > > - assert (s->h_next <= s->h_end); > - assert (s->h_tail <= s->h_end); > - assert (s->h_prev <= s->h_end); > + next = s->map->next; > + cuda_map_destroy (s->map); > + s->map = next; > } > > -static void > -map_push (struct ptx_stream *s, size_t size, void **h, void **d) > +static CUdeviceptr > +map_push (struct ptx_stream *s, size_t size) > { > - int left; > - int offset; > + struct cuda_map *map = NULL, *t = NULL; > > - assert (s != NULL); > + assert (s); > + assert (s->map); > > - left = s->h_end - s->h_next; > + /* Each PTX stream requires a 
separate data region to store the > + launch arguments for cuLaunchKernel. Allocate a new > + cuda_map and push it to the end of the list. */ > + if (s->map->active) > + { > + map = cuda_map_create (size); > > - assert (s->h_prev); > - assert (s->h_next); > + for (t = s->map; t->next != NULL; t = t->next) > + ; > > - if (size >= left) > + t->next = map; > + } > + else if (s->map->size < size) > { > - assert (s->h_next == s->h_prev); > - s->h_next = s->h_prev = s->h_tail = s->h_begin; > + cuda_map_destroy (s->map); > + map = cuda_map_create (size); > } > + else > + map = s->map; > > - assert (s->h_next); > - > - offset = s->h_next - s->h; > - > - *d = (void *)(s->d + offset); > - *h = (void *)(s->h + offset); > - > - s->h_prev = s->h_next; > - s->h_next += size; > - > - assert (s->h_prev); > - assert (s->h_next); > - > - assert (s->h_next >= s->h_begin); > - assert (s->h_tail >= s->h_begin); > - assert (s->h_prev >= s->h_begin); > - assert (s->h_next <= s->h_end); > - assert (s->h_tail <= s->h_end); > - assert (s->h_prev <= s->h_end); > + s->map = map; > + s->map->active = true; > > - return; > + return s->map->d; > } > > /* Target data function launch information. 
*/ > @@ -451,8 +464,6 @@ init_streams_for_device (struct ptx_device *ptx_dev, int concurrency) > null_stream->stream = NULL; > null_stream->host_thread = pthread_self (); > null_stream->multithreaded = true; > - null_stream->d = (CUdeviceptr) NULL; > - null_stream->h = NULL; > if (!map_init (null_stream)) > return false; > > @@ -587,8 +598,6 @@ select_stream_for_async (int async, pthread_t thread, bool create, > s->host_thread = thread; > s->multithreaded = false; > > - s->d = (CUdeviceptr) NULL; > - s->h = NULL; > if (!map_init (s)) > { > pthread_mutex_unlock (&ptx_dev->stream_lock); > @@ -1123,7 +1132,8 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, > int i; > struct ptx_stream *dev_str; > void *kargs[1]; > - void *hp, *dp; > + void *hp; > + CUdeviceptr dp; > struct nvptx_thread *nvthd = nvptx_thread (); > int warp_size = nvthd->ptx_dev->warp_size; > const char *maybe_abort_msg = "(perhaps abort was called)"; > @@ -1270,17 +1280,20 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, > /* This reserves a chunk of a pre-allocated page of memory mapped on both > the host and the device. HP is a host pointer to the new chunk, and DP is > the corresponding device pointer. */ > - map_push (dev_str, mapnum * sizeof (void *), &hp, &dp); > + pthread_mutex_lock (&ptx_event_lock); > + dp = map_push (dev_str, mapnum * sizeof (void *)); > + pthread_mutex_unlock (&ptx_event_lock); > > GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); > > /* Copy the array of arguments to the mapped page. */ > + hp = alloca(sizeof(void *) * mapnum); > for (i = 0; i < mapnum; i++) > ((void **) hp)[i] = devaddrs[i]; > > /* Copy the (device) pointers to arguments to the device (dp and hp might in > fact have the same value on a unified-memory system). */ This comment needs to be updated, right? 
> - CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp, > + CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp, > mapnum * sizeof (void *)); > GOMP_PLUGIN_debug (0, " %s: kernel %s: launch" > " gangs=%u, workers=%u, vectors=%u\n", > -- 2.7.4 > Otherwise OK. Thanks, - Tom
On 08/01/2018 04:12 AM, Tom de Vries wrote: > On 07/31/2018 05:27 PM, Cesar Philippidis wrote: >> /* Copy the (device) pointers to arguments to the device (dp and hp might in >> fact have the same value on a unified-memory system). */ > > This comment needs to be updated, right? > >> - CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp, >> + CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp, >> mapnum * sizeof (void *)); >> GOMP_PLUGIN_debug (0, " %s: kernel %s: launch" >> " gangs=%u, workers=%u, vectors=%u\n", >> -- 2.7.4 >> > > Otherwise OK. Thanks. I've committed the attached patch to trunk. Cesar [nvptx] Remove use of CUDA unified memory in libgomp 2018-09-18 Cesar Philippidis <cesar@codesourcery.com> libgomp/ * plugin/plugin-nvptx.c (struct cuda_map): New. (struct ptx_stream): Replace d, h, h_begin, h_end, h_next, h_prev, h_tail with (cuda_map *) map. (cuda_map_create): New function. (cuda_map_destroy): New function. (map_init): Update to use a linked list of cuda_map objects. (map_fini): Likewise. (map_pop): Likewise. (map_push): Likewise. Return CUdeviceptr instead of void. (init_streams_for_device): Remove stale references to ptx_stream members. (select_stream_for_async): Likewise. (nvptx_exec): Update call to map_init. 
(cherry picked from gomp-4_0-branch r242614) --- libgomp/plugin/plugin-nvptx.c | 170 ++++++++++++++++++++++-------------------- 1 file changed, 91 insertions(+), 79 deletions(-) diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index bae1b05..6492e5f 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -192,20 +192,20 @@ cuda_error (CUresult r) static unsigned int instantiated_devices = 0; static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER; +struct cuda_map +{ + CUdeviceptr d; + size_t size; + bool active; + struct cuda_map *next; +}; + struct ptx_stream { CUstream stream; pthread_t host_thread; bool multithreaded; - - CUdeviceptr d; - void *h; - void *h_begin; - void *h_end; - void *h_next; - void *h_prev; - void *h_tail; - + struct cuda_map *map; struct ptx_stream *next; }; @@ -217,101 +217,114 @@ struct nvptx_thread struct ptx_device *ptx_dev; }; +static struct cuda_map * +cuda_map_create (size_t size) +{ + struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map)); + + assert (map); + + map->next = NULL; + map->size = size; + map->active = false; + + CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size); + assert (map->d); + + return map; +} + +static void +cuda_map_destroy (struct cuda_map *map) +{ + CUDA_CALL_ASSERT (cuMemFree, map->d); + free (map); +} + +/* The following map_* routines manage the CUDA device memory that + contains the data mapping arguments for cuLaunchKernel. Each + asynchronous PTX stream may have multiple pending kernel + invocations, which are launched in a FIFO order. As such, the map + routines maintains a queue of cuLaunchKernel arguments. + + Calls to map_push and map_pop must be guarded by ptx_event_lock. + Likewise, calls to map_init and map_fini are guarded by + ptx_dev_lock inside GOMP_OFFLOAD_init_device and + GOMP_OFFLOAD_fini_device, respectively. 
*/ + static bool map_init (struct ptx_stream *s) { int size = getpagesize (); assert (s); - assert (!s->d); - assert (!s->h); - - CUDA_CALL (cuMemAllocHost, &s->h, size); - CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0); - assert (s->h); + s->map = cuda_map_create (size); - s->h_begin = s->h; - s->h_end = s->h_begin + size; - s->h_next = s->h_prev = s->h_tail = s->h_begin; - - assert (s->h_next); - assert (s->h_end); return true; } static bool map_fini (struct ptx_stream *s) { - CUDA_CALL (cuMemFreeHost, s->h); + assert (s->map->next == NULL); + assert (!s->map->active); + + cuda_map_destroy (s->map); + return true; } static void map_pop (struct ptx_stream *s) { - assert (s != NULL); - assert (s->h_next); - assert (s->h_prev); - assert (s->h_tail); - - s->h_tail = s->h_next; - - if (s->h_tail >= s->h_end) - s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end); + struct cuda_map *next; - if (s->h_next == s->h_tail) - s->h_prev = s->h_next; + assert (s != NULL); - assert (s->h_next >= s->h_begin); - assert (s->h_tail >= s->h_begin); - assert (s->h_prev >= s->h_begin); + if (s->map->next == NULL) + { + s->map->active = false; + return; + } - assert (s->h_next <= s->h_end); - assert (s->h_tail <= s->h_end); - assert (s->h_prev <= s->h_end); + next = s->map->next; + cuda_map_destroy (s->map); + s->map = next; } -static void -map_push (struct ptx_stream *s, size_t size, void **h, void **d) +static CUdeviceptr +map_push (struct ptx_stream *s, size_t size) { - int left; - int offset; + struct cuda_map *map = NULL, *t = NULL; - assert (s != NULL); + assert (s); + assert (s->map); - left = s->h_end - s->h_next; + /* Each PTX stream requires a separate data region to store the + launch arguments for cuLaunchKernel. Allocate a new + cuda_map and push it to the end of the list. 
*/ + if (s->map->active) + { + map = cuda_map_create (size); - assert (s->h_prev); - assert (s->h_next); + for (t = s->map; t->next != NULL; t = t->next) + ; - if (size >= left) + t->next = map; + } + else if (s->map->size < size) { - assert (s->h_next == s->h_prev); - s->h_next = s->h_prev = s->h_tail = s->h_begin; + cuda_map_destroy (s->map); + map = cuda_map_create (size); } + else + map = s->map; - assert (s->h_next); - - offset = s->h_next - s->h; - - *d = (void *)(s->d + offset); - *h = (void *)(s->h + offset); - - s->h_prev = s->h_next; - s->h_next += size; - - assert (s->h_prev); - assert (s->h_next); - - assert (s->h_next >= s->h_begin); - assert (s->h_tail >= s->h_begin); - assert (s->h_prev >= s->h_begin); - assert (s->h_next <= s->h_end); - assert (s->h_tail <= s->h_end); - assert (s->h_prev <= s->h_end); + s->map = map; + s->map->active = true; - return; + return s->map->d; } /* Target data function launch information. */ @@ -442,8 +455,6 @@ init_streams_for_device (struct ptx_device *ptx_dev, int concurrency) null_stream->stream = NULL; null_stream->host_thread = pthread_self (); null_stream->multithreaded = true; - null_stream->d = (CUdeviceptr) NULL; - null_stream->h = NULL; if (!map_init (null_stream)) return false; @@ -578,8 +589,6 @@ select_stream_for_async (int async, pthread_t thread, bool create, s->host_thread = thread; s->multithreaded = false; - s->d = (CUdeviceptr) NULL; - s->h = NULL; if (!map_init (s)) { pthread_mutex_unlock (&ptx_dev->stream_lock); @@ -1120,7 +1129,8 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, int i; struct ptx_stream *dev_str; void *kargs[1]; - void *hp, *dp; + void *hp; + CUdeviceptr dp; struct nvptx_thread *nvthd = nvptx_thread (); int warp_size = nvthd->ptx_dev->warp_size; const char *maybe_abort_msg = "(perhaps abort was called)"; @@ -1295,17 +1305,19 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, /* This reserves a chunk of a pre-allocated page of 
memory mapped on both the host and the device. HP is a host pointer to the new chunk, and DP is the corresponding device pointer. */ - map_push (dev_str, mapnum * sizeof (void *), &hp, &dp); + pthread_mutex_lock (&ptx_event_lock); + dp = map_push (dev_str, mapnum * sizeof (void *)); + pthread_mutex_unlock (&ptx_event_lock); GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); /* Copy the array of arguments to the mapped page. */ + hp = alloca(sizeof(void *) * mapnum); for (i = 0; i < mapnum; i++) ((void **) hp)[i] = devaddrs[i]; - /* Copy the (device) pointers to arguments to the device (dp and hp might in - fact have the same value on a unified-memory system). */ - CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp, + /* Copy the (device) pointers to arguments to the device */ + CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp, mapnum * sizeof (void *)); GOMP_PLUGIN_debug (0, " %s: kernel %s: launch" " gangs=%u, workers=%u, vectors=%u\n",
[PATCH] [nvptx] Remove use of CUDA unified memory in libgomp 2018-XX-YY Cesar Philippidis <cesar@codesourcery.com> libgomp/ * plugin/plugin-nvptx.c (struct cuda_map): New. (struct ptx_stream): Replace d, h, h_begin, h_end, h_next, h_prev, h_tail with (cuda_map *) map. (cuda_map_create): New function. (cuda_map_destroy): New function. (map_init): Update to use a linked list of cuda_map objects. (map_fini): Likewise. (map_pop): Likewise. (map_push): Likewise. Return CUdeviceptr instead of void. (init_streams_for_device): Remove stales references to ptx_stream members. (select_stream_for_async): Likewise. (nvptx_exec): Update call to map_init. (cherry picked from gomp-4_0-branch r242614) --- libgomp/plugin/plugin-nvptx.c | 167 +++++++++++++++++++++++------------------- 1 file changed, 90 insertions(+), 77 deletions(-) diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 1237ea10..d79ddf1 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -200,20 +200,20 @@ cuda_error (CUresult r) static unsigned int instantiated_devices = 0; static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER; +struct cuda_map +{ + CUdeviceptr d; + size_t size; + bool active; + struct cuda_map *next; +}; + struct ptx_stream { CUstream stream; pthread_t host_thread; bool multithreaded; - - CUdeviceptr d; - void *h; - void *h_begin; - void *h_end; - void *h_next; - void *h_prev; - void *h_tail; - + struct cuda_map *map; struct ptx_stream *next; }; @@ -225,101 +225,114 @@ struct nvptx_thread struct ptx_device *ptx_dev; }; +static struct cuda_map * +cuda_map_create (size_t size) +{ + struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map)); + + assert (map); + + map->next = NULL; + map->size = size; + map->active = false; + + CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size); + assert (map->d); + + return map; +} + +static void +cuda_map_destroy (struct cuda_map *map) +{ + CUDA_CALL_ASSERT (cuMemFree, map->d); + free (map); 
+} + +/* The following map_* routines manage the CUDA device memory that + contains the data mapping arguments for cuLaunchKernel. Each + asynchronous PTX stream may have multiple pending kernel + invocations, which are launched in a FIFO order. As such, the map + routines maintains a queue of cuLaunchKernel arguments. + + Calls to map_push and map_pop must be guarded by ptx_event_lock. + Likewise, calls to map_init and map_fini are guarded by + ptx_dev_lock inside GOMP_OFFLOAD_init_device and + GOMP_OFFLOAD_fini_device, respectively. */ + static bool map_init (struct ptx_stream *s) { int size = getpagesize (); assert (s); - assert (!s->d); - assert (!s->h); - - CUDA_CALL (cuMemAllocHost, &s->h, size); - CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0); - assert (s->h); + s->map = cuda_map_create (size); - s->h_begin = s->h; - s->h_end = s->h_begin + size; - s->h_next = s->h_prev = s->h_tail = s->h_begin; - - assert (s->h_next); - assert (s->h_end); return true; } static bool map_fini (struct ptx_stream *s) { - CUDA_CALL (cuMemFreeHost, s->h); + assert (s->map->next == NULL); + assert (!s->map->active); + + cuda_map_destroy (s->map); + return true; } static void map_pop (struct ptx_stream *s) { - assert (s != NULL); - assert (s->h_next); - assert (s->h_prev); - assert (s->h_tail); - - s->h_tail = s->h_next; - - if (s->h_tail >= s->h_end) - s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end); + struct cuda_map *next; - if (s->h_next == s->h_tail) - s->h_prev = s->h_next; + assert (s != NULL); - assert (s->h_next >= s->h_begin); - assert (s->h_tail >= s->h_begin); - assert (s->h_prev >= s->h_begin); + if (s->map->next == NULL) + { + s->map->active = false; + return; + } - assert (s->h_next <= s->h_end); - assert (s->h_tail <= s->h_end); - assert (s->h_prev <= s->h_end); + next = s->map->next; + cuda_map_destroy (s->map); + s->map = next; } -static void -map_push (struct ptx_stream *s, size_t size, void **h, void **d) +static CUdeviceptr +map_push (struct 
ptx_stream *s, size_t size) { - int left; - int offset; + struct cuda_map *map = NULL, *t = NULL; - assert (s != NULL); + assert (s); + assert (s->map); - left = s->h_end - s->h_next; + /* Each PTX stream requires a separate data region to store the + launch arguments for cuLaunchKernel. Allocate a new + cuda_map and push it to the end of the list. */ + if (s->map->active) + { + map = cuda_map_create (size); - assert (s->h_prev); - assert (s->h_next); + for (t = s->map; t->next != NULL; t = t->next) + ; - if (size >= left) + t->next = map; + } + else if (s->map->size < size) { - assert (s->h_next == s->h_prev); - s->h_next = s->h_prev = s->h_tail = s->h_begin; + cuda_map_destroy (s->map); + map = cuda_map_create (size); } + else + map = s->map; - assert (s->h_next); - - offset = s->h_next - s->h; - - *d = (void *)(s->d + offset); - *h = (void *)(s->h + offset); - - s->h_prev = s->h_next; - s->h_next += size; - - assert (s->h_prev); - assert (s->h_next); - - assert (s->h_next >= s->h_begin); - assert (s->h_tail >= s->h_begin); - assert (s->h_prev >= s->h_begin); - assert (s->h_next <= s->h_end); - assert (s->h_tail <= s->h_end); - assert (s->h_prev <= s->h_end); + s->map = map; + s->map->active = true; - return; + return s->map->d; } /* Target data function launch information. 
*/ @@ -451,8 +464,6 @@ init_streams_for_device (struct ptx_device *ptx_dev, int concurrency) null_stream->stream = NULL; null_stream->host_thread = pthread_self (); null_stream->multithreaded = true; - null_stream->d = (CUdeviceptr) NULL; - null_stream->h = NULL; if (!map_init (null_stream)) return false; @@ -587,8 +598,6 @@ select_stream_for_async (int async, pthread_t thread, bool create, s->host_thread = thread; s->multithreaded = false; - s->d = (CUdeviceptr) NULL; - s->h = NULL; if (!map_init (s)) { pthread_mutex_unlock (&ptx_dev->stream_lock); @@ -1123,7 +1132,8 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, int i; struct ptx_stream *dev_str; void *kargs[1]; - void *hp, *dp; + void *hp; + CUdeviceptr dp; struct nvptx_thread *nvthd = nvptx_thread (); int warp_size = nvthd->ptx_dev->warp_size; const char *maybe_abort_msg = "(perhaps abort was called)"; @@ -1270,17 +1280,20 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, /* This reserves a chunk of a pre-allocated page of memory mapped on both the host and the device. HP is a host pointer to the new chunk, and DP is the corresponding device pointer. */ - map_push (dev_str, mapnum * sizeof (void *), &hp, &dp); + pthread_mutex_lock (&ptx_event_lock); + dp = map_push (dev_str, mapnum * sizeof (void *)); + pthread_mutex_unlock (&ptx_event_lock); GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); /* Copy the array of arguments to the mapped page. */ + hp = alloca(sizeof(void *) * mapnum); for (i = 0; i < mapnum; i++) ((void **) hp)[i] = devaddrs[i]; /* Copy the (device) pointers to arguments to the device (dp and hp might in fact have the same value on a unified-memory system). */ - CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp, + CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp, mapnum * sizeof (void *)); GOMP_PLUGIN_debug (0, " %s: kernel %s: launch" " gangs=%u, workers=%u, vectors=%u\n", -- 2.7.4