diff mbox series

[nvptx] Remove use of CUDA unified memory in libgomp

Message ID 2437ac99-433b-9bdb-6948-bc40b2e6cfd3@codesourcery.com
State New
Headers show
Series [nvptx] Remove use of CUDA unified memory in libgomp | expand

Commit Message

Cesar Philippidis July 31, 2018, 3:27 p.m. UTC
At present, libgomp is using CUDA unified memory only as a buffer pass
to the struct containing the pointers to the data mappings to the
offloaded functions. I'm not sure why unified memory is needed here if
it is still being managed explicitly by the driver.

This patch removes the use of CUDA unified memory from the driver. I
don't recall observing any reduction in performance. Besides,
eventually, we'd like to eliminate the struct containing all pointers to
the offloaded data mappings and pass those pointers as individual
function arguments to cuLaunchKernel directly.

Is this patch OK for trunk? I bootstrapped and regression tested it for
x86_64 with nvptx offloading.

Thanks,
Cesar

Comments

Tom de Vries Aug. 1, 2018, 11:12 a.m. UTC | #1
On 07/31/2018 05:27 PM, Cesar Philippidis wrote:
> At present, libgomp is using CUDA unified memory only as a buffer pass
> to the struct containing the pointers to the data mappings to the
> offloaded functions. I'm not sure why unified memory is needed here if
> it is still being managed explicitly by the driver.
> 
> This patch removes the use of CUDA unified memory from the driver. I
> don't recall observing any reduction in performance. Besides,
> eventually, we'd like to eliminate the struct containing all pointers to
> the offloaded data mappings and pass those pointers as individual
> function arguments to cuLaunchKernel directly.
> 
> Is this patch OK for trunk? I bootstrapped and regression tested it for
> x86_64 with nvptx offloading.
> 
> Thanks,
> Cesar
> 
> 
> 0001-nvptx-Remove-use-of-CUDA-unified-memory-in-libgomp.patch
> 
> 
> [PATCH] [nvptx] Remove use of CUDA unified memory in libgomp
> 
> 2018-XX-YY  Cesar Philippidis  <cesar@codesourcery.com>
> 
> 	libgomp/
> 	* plugin/plugin-nvptx.c (struct cuda_map): New.
> 	(struct ptx_stream): Replace d, h, h_begin, h_end, h_next, h_prev,
> 	h_tail with (cuda_map *) map.
> 	(cuda_map_create): New function.
> 	(cuda_map_destroy): New function.
> 	(map_init): Update to use a linked list of cuda_map objects.
> 	(map_fini): Likewise.
> 	(map_pop): Likewise.
> 	(map_push): Likewise.  Return CUdeviceptr instead of void.
> 	(init_streams_for_device): Remove stales references to ptx_stream
> 	members.
> 	(select_stream_for_async): Likewise.
> 	(nvptx_exec): Update call to map_init.
> 
> (cherry picked from gomp-4_0-branch r242614)
> ---
>  libgomp/plugin/plugin-nvptx.c | 167 +++++++++++++++++++++++-------------------
>  1 file changed, 90 insertions(+), 77 deletions(-)
> 
> diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
> index 1237ea10..d79ddf1 100644
> --- a/libgomp/plugin/plugin-nvptx.c
> +++ b/libgomp/plugin/plugin-nvptx.c
> @@ -200,20 +200,20 @@ cuda_error (CUresult r)
>  static unsigned int instantiated_devices = 0;
>  static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
>  
> +struct cuda_map
> +{
> +  CUdeviceptr d;
> +  size_t size;
> +  bool active;
> +  struct cuda_map *next;
> +};
> +
>  struct ptx_stream
>  {
>    CUstream stream;
>    pthread_t host_thread;
>    bool multithreaded;
> -
> -  CUdeviceptr d;
> -  void *h;
> -  void *h_begin;
> -  void *h_end;
> -  void *h_next;
> -  void *h_prev;
> -  void *h_tail;
> -
> +  struct cuda_map *map;
>    struct ptx_stream *next;
>  };
>  
> @@ -225,101 +225,114 @@ struct nvptx_thread
>    struct ptx_device *ptx_dev;
>  };
>  
> +static struct cuda_map *
> +cuda_map_create (size_t size)
> +{
> +  struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map));
> +
> +  assert (map);
> +
> +  map->next = NULL;
> +  map->size = size;
> +  map->active = false;
> +
> +  CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size);
> +  assert (map->d);
> +
> +  return map;
> +}
> +
> +static void
> +cuda_map_destroy (struct cuda_map *map)
> +{
> +  CUDA_CALL_ASSERT (cuMemFree, map->d);
> +  free (map);
> +}
> +
> +/* The following map_* routines manage the CUDA device memory that
> +   contains the data mapping arguments for cuLaunchKernel.  Each
> +   asynchronous PTX stream may have multiple pending kernel
> +   invocations, which are launched in a FIFO order.  As such, the map
> +   routines maintains a queue of cuLaunchKernel arguments.
> +
> +   Calls to map_push and map_pop must be guarded by ptx_event_lock.
> +   Likewise, calls to map_init and map_fini are guarded by
> +   ptx_dev_lock inside GOMP_OFFLOAD_init_device and
> +   GOMP_OFFLOAD_fini_device, respectively.  */
> +
>  static bool
>  map_init (struct ptx_stream *s)
>  {
>    int size = getpagesize ();
>  
>    assert (s);
> -  assert (!s->d);
> -  assert (!s->h);
> -
> -  CUDA_CALL (cuMemAllocHost, &s->h, size);
> -  CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
>  
> -  assert (s->h);
> +  s->map = cuda_map_create (size);
>  
> -  s->h_begin = s->h;
> -  s->h_end = s->h_begin + size;
> -  s->h_next = s->h_prev = s->h_tail = s->h_begin;
> -
> -  assert (s->h_next);
> -  assert (s->h_end);
>    return true;
>  }
>  
>  static bool
>  map_fini (struct ptx_stream *s)
>  {
> -  CUDA_CALL (cuMemFreeHost, s->h);
> +  assert (s->map->next == NULL);
> +  assert (!s->map->active);
> +
> +  cuda_map_destroy (s->map);
> +
>    return true;
>  }
>  
>  static void
>  map_pop (struct ptx_stream *s)
>  {
> -  assert (s != NULL);
> -  assert (s->h_next);
> -  assert (s->h_prev);
> -  assert (s->h_tail);
> -
> -  s->h_tail = s->h_next;
> -
> -  if (s->h_tail >= s->h_end)
> -    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
> +  struct cuda_map *next;
>  
> -  if (s->h_next == s->h_tail)
> -    s->h_prev = s->h_next;
> +  assert (s != NULL);
>  
> -  assert (s->h_next >= s->h_begin);
> -  assert (s->h_tail >= s->h_begin);
> -  assert (s->h_prev >= s->h_begin);
> +  if (s->map->next == NULL)
> +    {
> +      s->map->active = false;
> +      return;
> +    }
>  
> -  assert (s->h_next <= s->h_end);
> -  assert (s->h_tail <= s->h_end);
> -  assert (s->h_prev <= s->h_end);
> +  next = s->map->next;
> +  cuda_map_destroy (s->map);
> +  s->map = next;
>  }
>  
> -static void
> -map_push (struct ptx_stream *s, size_t size, void **h, void **d)
> +static CUdeviceptr
> +map_push (struct ptx_stream *s, size_t size)
>  {
> -  int left;
> -  int offset;
> +  struct cuda_map *map = NULL, *t = NULL;
>  
> -  assert (s != NULL);
> +  assert (s);
> +  assert (s->map);
>  
> -  left = s->h_end - s->h_next;
> +  /* Each PTX stream requires a separate data region to store the
> +     launch arguments for cuLaunchKernel.  Allocate a new
> +     cuda_map and push it to the end of the list.  */
> +  if (s->map->active)
> +    {
> +      map = cuda_map_create (size);
>  
> -  assert (s->h_prev);
> -  assert (s->h_next);
> +      for (t = s->map; t->next != NULL; t = t->next)
> +	;
>  
> -  if (size >= left)
> +      t->next = map;
> +    }
> +  else if (s->map->size < size)
>      {
> -      assert (s->h_next == s->h_prev);
> -      s->h_next = s->h_prev = s->h_tail = s->h_begin;
> +      cuda_map_destroy (s->map);
> +      map = cuda_map_create (size);
>      }
> +  else
> +    map = s->map;
>  
> -  assert (s->h_next);
> -
> -  offset = s->h_next - s->h;
> -
> -  *d = (void *)(s->d + offset);
> -  *h = (void *)(s->h + offset);
> -
> -  s->h_prev = s->h_next;
> -  s->h_next += size;
> -
> -  assert (s->h_prev);
> -  assert (s->h_next);
> -
> -  assert (s->h_next >= s->h_begin);
> -  assert (s->h_tail >= s->h_begin);
> -  assert (s->h_prev >= s->h_begin);
> -  assert (s->h_next <= s->h_end);
> -  assert (s->h_tail <= s->h_end);
> -  assert (s->h_prev <= s->h_end);
> +  s->map = map;
> +  s->map->active = true;
>  
> -  return;
> +  return s->map->d;
>  }
>  
>  /* Target data function launch information.  */
> @@ -451,8 +464,6 @@ init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
>    null_stream->stream = NULL;
>    null_stream->host_thread = pthread_self ();
>    null_stream->multithreaded = true;
> -  null_stream->d = (CUdeviceptr) NULL;
> -  null_stream->h = NULL;
>    if (!map_init (null_stream))
>      return false;
>  
> @@ -587,8 +598,6 @@ select_stream_for_async (int async, pthread_t thread, bool create,
>  	  s->host_thread = thread;
>  	  s->multithreaded = false;
>  
> -	  s->d = (CUdeviceptr) NULL;
> -	  s->h = NULL;
>  	  if (!map_init (s))
>  	    {
>  	      pthread_mutex_unlock (&ptx_dev->stream_lock);
> @@ -1123,7 +1132,8 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
>    int i;
>    struct ptx_stream *dev_str;
>    void *kargs[1];
> -  void *hp, *dp;
> +  void *hp;
> +  CUdeviceptr dp;
>    struct nvptx_thread *nvthd = nvptx_thread ();
>    int warp_size = nvthd->ptx_dev->warp_size;
>    const char *maybe_abort_msg = "(perhaps abort was called)";
> @@ -1270,17 +1280,20 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
>    /* This reserves a chunk of a pre-allocated page of memory mapped on both
>       the host and the device. HP is a host pointer to the new chunk, and DP is
>       the corresponding device pointer.  */
> -  map_push (dev_str, mapnum * sizeof (void *), &hp, &dp);
> +  pthread_mutex_lock (&ptx_event_lock);
> +  dp = map_push (dev_str, mapnum * sizeof (void *));
> +  pthread_mutex_unlock (&ptx_event_lock);
>  
>    GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
>  
>    /* Copy the array of arguments to the mapped page.  */
> +  hp = alloca(sizeof(void *) * mapnum);
>    for (i = 0; i < mapnum; i++)
>      ((void **) hp)[i] = devaddrs[i];
>  
>    /* Copy the (device) pointers to arguments to the device (dp and hp might in
>       fact have the same value on a unified-memory system).  */

This comment needs to be updated, right?

> -  CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
> +  CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp,
>  		    mapnum * sizeof (void *));
>    GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
>  		     " gangs=%u, workers=%u, vectors=%u\n",
> -- 2.7.4
> 

Otherwise OK.

Thanks,
- Tom
Cesar Philippidis Sept. 18, 2018, 3:42 p.m. UTC | #2
On 08/01/2018 04:12 AM, Tom de Vries wrote:
> On 07/31/2018 05:27 PM, Cesar Philippidis wrote:

>>    /* Copy the (device) pointers to arguments to the device (dp and hp might in
>>       fact have the same value on a unified-memory system).  */
> 
> This comment needs to be updated, right?
> 
>> -  CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
>> +  CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp,
>>  		    mapnum * sizeof (void *));
>>    GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
>>  		     " gangs=%u, workers=%u, vectors=%u\n",
>> -- 2.7.4
>>
> 
> Otherwise OK.

Thanks. I've committed the attach patch to trunk.

Cesar
[nvptx] Remove use of CUDA unified memory in libgomp

2018-09-18  Cesar Philippidis  <cesar@codesourcery.com>

	libgomp/
	* plugin/plugin-nvptx.c (struct cuda_map): New.
	(struct ptx_stream): Replace d, h, h_begin, h_end, h_next, h_prev,
	h_tail with (cuda_map *) map.
	(cuda_map_create): New function.
	(cuda_map_destroy): New function.
	(map_init): Update to use a linked list of cuda_map objects.
	(map_fini): Likewise.
	(map_pop): Likewise.
	(map_push): Likewise.  Return CUdeviceptr instead of void.
	(init_streams_for_device): Remove stales references to ptx_stream
	members.
	(select_stream_for_async): Likewise.
	(nvptx_exec): Update call to map_init.

(cherry picked from gomp-4_0-branch r242614)
---
 libgomp/plugin/plugin-nvptx.c | 170 ++++++++++++++++++++++--------------------
 1 file changed, 91 insertions(+), 79 deletions(-)

diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index bae1b05..6492e5f 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -192,20 +192,20 @@ cuda_error (CUresult r)
 static unsigned int instantiated_devices = 0;
 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
 
+struct cuda_map
+{
+  CUdeviceptr d;
+  size_t size;
+  bool active;
+  struct cuda_map *next;
+};
+
 struct ptx_stream
 {
   CUstream stream;
   pthread_t host_thread;
   bool multithreaded;
-
-  CUdeviceptr d;
-  void *h;
-  void *h_begin;
-  void *h_end;
-  void *h_next;
-  void *h_prev;
-  void *h_tail;
-
+  struct cuda_map *map;
   struct ptx_stream *next;
 };
 
@@ -217,101 +217,114 @@ struct nvptx_thread
   struct ptx_device *ptx_dev;
 };
 
+static struct cuda_map *
+cuda_map_create (size_t size)
+{
+  struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map));
+
+  assert (map);
+
+  map->next = NULL;
+  map->size = size;
+  map->active = false;
+
+  CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size);
+  assert (map->d);
+
+  return map;
+}
+
+static void
+cuda_map_destroy (struct cuda_map *map)
+{
+  CUDA_CALL_ASSERT (cuMemFree, map->d);
+  free (map);
+}
+
+/* The following map_* routines manage the CUDA device memory that
+   contains the data mapping arguments for cuLaunchKernel.  Each
+   asynchronous PTX stream may have multiple pending kernel
+   invocations, which are launched in a FIFO order.  As such, the map
+   routines maintains a queue of cuLaunchKernel arguments.
+
+   Calls to map_push and map_pop must be guarded by ptx_event_lock.
+   Likewise, calls to map_init and map_fini are guarded by
+   ptx_dev_lock inside GOMP_OFFLOAD_init_device and
+   GOMP_OFFLOAD_fini_device, respectively.  */
+
 static bool
 map_init (struct ptx_stream *s)
 {
   int size = getpagesize ();
 
   assert (s);
-  assert (!s->d);
-  assert (!s->h);
-
-  CUDA_CALL (cuMemAllocHost, &s->h, size);
-  CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
 
-  assert (s->h);
+  s->map = cuda_map_create (size);
 
-  s->h_begin = s->h;
-  s->h_end = s->h_begin + size;
-  s->h_next = s->h_prev = s->h_tail = s->h_begin;
-
-  assert (s->h_next);
-  assert (s->h_end);
   return true;
 }
 
 static bool
 map_fini (struct ptx_stream *s)
 {
-  CUDA_CALL (cuMemFreeHost, s->h);
+  assert (s->map->next == NULL);
+  assert (!s->map->active);
+
+  cuda_map_destroy (s->map);
+
   return true;
 }
 
 static void
 map_pop (struct ptx_stream *s)
 {
-  assert (s != NULL);
-  assert (s->h_next);
-  assert (s->h_prev);
-  assert (s->h_tail);
-
-  s->h_tail = s->h_next;
-
-  if (s->h_tail >= s->h_end)
-    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
+  struct cuda_map *next;
 
-  if (s->h_next == s->h_tail)
-    s->h_prev = s->h_next;
+  assert (s != NULL);
 
-  assert (s->h_next >= s->h_begin);
-  assert (s->h_tail >= s->h_begin);
-  assert (s->h_prev >= s->h_begin);
+  if (s->map->next == NULL)
+    {
+      s->map->active = false;
+      return;
+    }
 
-  assert (s->h_next <= s->h_end);
-  assert (s->h_tail <= s->h_end);
-  assert (s->h_prev <= s->h_end);
+  next = s->map->next;
+  cuda_map_destroy (s->map);
+  s->map = next;
 }
 
-static void
-map_push (struct ptx_stream *s, size_t size, void **h, void **d)
+static CUdeviceptr
+map_push (struct ptx_stream *s, size_t size)
 {
-  int left;
-  int offset;
+  struct cuda_map *map = NULL, *t = NULL;
 
-  assert (s != NULL);
+  assert (s);
+  assert (s->map);
 
-  left = s->h_end - s->h_next;
+  /* Each PTX stream requires a separate data region to store the
+     launch arguments for cuLaunchKernel.  Allocate a new
+     cuda_map and push it to the end of the list.  */
+  if (s->map->active)
+    {
+      map = cuda_map_create (size);
 
-  assert (s->h_prev);
-  assert (s->h_next);
+      for (t = s->map; t->next != NULL; t = t->next)
+	;
 
-  if (size >= left)
+      t->next = map;
+    }
+  else if (s->map->size < size)
     {
-      assert (s->h_next == s->h_prev);
-      s->h_next = s->h_prev = s->h_tail = s->h_begin;
+      cuda_map_destroy (s->map);
+      map = cuda_map_create (size);
     }
+  else
+    map = s->map;
 
-  assert (s->h_next);
-
-  offset = s->h_next - s->h;
-
-  *d = (void *)(s->d + offset);
-  *h = (void *)(s->h + offset);
-
-  s->h_prev = s->h_next;
-  s->h_next += size;
-
-  assert (s->h_prev);
-  assert (s->h_next);
-
-  assert (s->h_next >= s->h_begin);
-  assert (s->h_tail >= s->h_begin);
-  assert (s->h_prev >= s->h_begin);
-  assert (s->h_next <= s->h_end);
-  assert (s->h_tail <= s->h_end);
-  assert (s->h_prev <= s->h_end);
+  s->map = map;
+  s->map->active = true;
 
-  return;
+  return s->map->d;
 }
 
 /* Target data function launch information.  */
@@ -442,8 +455,6 @@ init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
   null_stream->stream = NULL;
   null_stream->host_thread = pthread_self ();
   null_stream->multithreaded = true;
-  null_stream->d = (CUdeviceptr) NULL;
-  null_stream->h = NULL;
   if (!map_init (null_stream))
     return false;
 
@@ -578,8 +589,6 @@ select_stream_for_async (int async, pthread_t thread, bool create,
 	  s->host_thread = thread;
 	  s->multithreaded = false;
 
-	  s->d = (CUdeviceptr) NULL;
-	  s->h = NULL;
 	  if (!map_init (s))
 	    {
 	      pthread_mutex_unlock (&ptx_dev->stream_lock);
@@ -1120,7 +1129,8 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
   int i;
   struct ptx_stream *dev_str;
   void *kargs[1];
-  void *hp, *dp;
+  void *hp;
+  CUdeviceptr dp;
   struct nvptx_thread *nvthd = nvptx_thread ();
   int warp_size = nvthd->ptx_dev->warp_size;
   const char *maybe_abort_msg = "(perhaps abort was called)";
@@ -1295,17 +1305,19 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
   /* This reserves a chunk of a pre-allocated page of memory mapped on both
      the host and the device. HP is a host pointer to the new chunk, and DP is
      the corresponding device pointer.  */
-  map_push (dev_str, mapnum * sizeof (void *), &hp, &dp);
+  pthread_mutex_lock (&ptx_event_lock);
+  dp = map_push (dev_str, mapnum * sizeof (void *));
+  pthread_mutex_unlock (&ptx_event_lock);
 
   GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
 
   /* Copy the array of arguments to the mapped page.  */
+  hp = alloca(sizeof(void *) * mapnum);
   for (i = 0; i < mapnum; i++)
     ((void **) hp)[i] = devaddrs[i];
 
-  /* Copy the (device) pointers to arguments to the device (dp and hp might in
-     fact have the same value on a unified-memory system).  */
-  CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
+  /* Copy the (device) pointers to arguments to the device */
+  CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp,
 		    mapnum * sizeof (void *));
   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
 		     " gangs=%u, workers=%u, vectors=%u\n",
diff mbox series

Patch

[PATCH] [nvptx] Remove use of CUDA unified memory in libgomp

2018-XX-YY  Cesar Philippidis  <cesar@codesourcery.com>

	libgomp/
	* plugin/plugin-nvptx.c (struct cuda_map): New.
	(struct ptx_stream): Replace d, h, h_begin, h_end, h_next, h_prev,
	h_tail with (cuda_map *) map.
	(cuda_map_create): New function.
	(cuda_map_destroy): New function.
	(map_init): Update to use a linked list of cuda_map objects.
	(map_fini): Likewise.
	(map_pop): Likewise.
	(map_push): Likewise.  Return CUdeviceptr instead of void.
	(init_streams_for_device): Remove stales references to ptx_stream
	members.
	(select_stream_for_async): Likewise.
	(nvptx_exec): Update call to map_init.

(cherry picked from gomp-4_0-branch r242614)
---
 libgomp/plugin/plugin-nvptx.c | 167 +++++++++++++++++++++++-------------------
 1 file changed, 90 insertions(+), 77 deletions(-)

diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 1237ea10..d79ddf1 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -200,20 +200,20 @@  cuda_error (CUresult r)
 static unsigned int instantiated_devices = 0;
 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
 
+struct cuda_map
+{
+  CUdeviceptr d;
+  size_t size;
+  bool active;
+  struct cuda_map *next;
+};
+
 struct ptx_stream
 {
   CUstream stream;
   pthread_t host_thread;
   bool multithreaded;
-
-  CUdeviceptr d;
-  void *h;
-  void *h_begin;
-  void *h_end;
-  void *h_next;
-  void *h_prev;
-  void *h_tail;
-
+  struct cuda_map *map;
   struct ptx_stream *next;
 };
 
@@ -225,101 +225,114 @@  struct nvptx_thread
   struct ptx_device *ptx_dev;
 };
 
+static struct cuda_map *
+cuda_map_create (size_t size)
+{
+  struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map));
+
+  assert (map);
+
+  map->next = NULL;
+  map->size = size;
+  map->active = false;
+
+  CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size);
+  assert (map->d);
+
+  return map;
+}
+
+static void
+cuda_map_destroy (struct cuda_map *map)
+{
+  CUDA_CALL_ASSERT (cuMemFree, map->d);
+  free (map);
+}
+
+/* The following map_* routines manage the CUDA device memory that
+   contains the data mapping arguments for cuLaunchKernel.  Each
+   asynchronous PTX stream may have multiple pending kernel
+   invocations, which are launched in a FIFO order.  As such, the map
+   routines maintains a queue of cuLaunchKernel arguments.
+
+   Calls to map_push and map_pop must be guarded by ptx_event_lock.
+   Likewise, calls to map_init and map_fini are guarded by
+   ptx_dev_lock inside GOMP_OFFLOAD_init_device and
+   GOMP_OFFLOAD_fini_device, respectively.  */
+
 static bool
 map_init (struct ptx_stream *s)
 {
   int size = getpagesize ();
 
   assert (s);
-  assert (!s->d);
-  assert (!s->h);
-
-  CUDA_CALL (cuMemAllocHost, &s->h, size);
-  CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
 
-  assert (s->h);
+  s->map = cuda_map_create (size);
 
-  s->h_begin = s->h;
-  s->h_end = s->h_begin + size;
-  s->h_next = s->h_prev = s->h_tail = s->h_begin;
-
-  assert (s->h_next);
-  assert (s->h_end);
   return true;
 }
 
 static bool
 map_fini (struct ptx_stream *s)
 {
-  CUDA_CALL (cuMemFreeHost, s->h);
+  assert (s->map->next == NULL);
+  assert (!s->map->active);
+
+  cuda_map_destroy (s->map);
+
   return true;
 }
 
 static void
 map_pop (struct ptx_stream *s)
 {
-  assert (s != NULL);
-  assert (s->h_next);
-  assert (s->h_prev);
-  assert (s->h_tail);
-
-  s->h_tail = s->h_next;
-
-  if (s->h_tail >= s->h_end)
-    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
+  struct cuda_map *next;
 
-  if (s->h_next == s->h_tail)
-    s->h_prev = s->h_next;
+  assert (s != NULL);
 
-  assert (s->h_next >= s->h_begin);
-  assert (s->h_tail >= s->h_begin);
-  assert (s->h_prev >= s->h_begin);
+  if (s->map->next == NULL)
+    {
+      s->map->active = false;
+      return;
+    }
 
-  assert (s->h_next <= s->h_end);
-  assert (s->h_tail <= s->h_end);
-  assert (s->h_prev <= s->h_end);
+  next = s->map->next;
+  cuda_map_destroy (s->map);
+  s->map = next;
 }
 
-static void
-map_push (struct ptx_stream *s, size_t size, void **h, void **d)
+static CUdeviceptr
+map_push (struct ptx_stream *s, size_t size)
 {
-  int left;
-  int offset;
+  struct cuda_map *map = NULL, *t = NULL;
 
-  assert (s != NULL);
+  assert (s);
+  assert (s->map);
 
-  left = s->h_end - s->h_next;
+  /* Each PTX stream requires a separate data region to store the
+     launch arguments for cuLaunchKernel.  Allocate a new
+     cuda_map and push it to the end of the list.  */
+  if (s->map->active)
+    {
+      map = cuda_map_create (size);
 
-  assert (s->h_prev);
-  assert (s->h_next);
+      for (t = s->map; t->next != NULL; t = t->next)
+	;
 
-  if (size >= left)
+      t->next = map;
+    }
+  else if (s->map->size < size)
     {
-      assert (s->h_next == s->h_prev);
-      s->h_next = s->h_prev = s->h_tail = s->h_begin;
+      cuda_map_destroy (s->map);
+      map = cuda_map_create (size);
     }
+  else
+    map = s->map;
 
-  assert (s->h_next);
-
-  offset = s->h_next - s->h;
-
-  *d = (void *)(s->d + offset);
-  *h = (void *)(s->h + offset);
-
-  s->h_prev = s->h_next;
-  s->h_next += size;
-
-  assert (s->h_prev);
-  assert (s->h_next);
-
-  assert (s->h_next >= s->h_begin);
-  assert (s->h_tail >= s->h_begin);
-  assert (s->h_prev >= s->h_begin);
-  assert (s->h_next <= s->h_end);
-  assert (s->h_tail <= s->h_end);
-  assert (s->h_prev <= s->h_end);
+  s->map = map;
+  s->map->active = true;
 
-  return;
+  return s->map->d;
 }
 
 /* Target data function launch information.  */
@@ -451,8 +464,6 @@  init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
   null_stream->stream = NULL;
   null_stream->host_thread = pthread_self ();
   null_stream->multithreaded = true;
-  null_stream->d = (CUdeviceptr) NULL;
-  null_stream->h = NULL;
   if (!map_init (null_stream))
     return false;
 
@@ -587,8 +598,6 @@  select_stream_for_async (int async, pthread_t thread, bool create,
 	  s->host_thread = thread;
 	  s->multithreaded = false;
 
-	  s->d = (CUdeviceptr) NULL;
-	  s->h = NULL;
 	  if (!map_init (s))
 	    {
 	      pthread_mutex_unlock (&ptx_dev->stream_lock);
@@ -1123,7 +1132,8 @@  nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
   int i;
   struct ptx_stream *dev_str;
   void *kargs[1];
-  void *hp, *dp;
+  void *hp;
+  CUdeviceptr dp;
   struct nvptx_thread *nvthd = nvptx_thread ();
   int warp_size = nvthd->ptx_dev->warp_size;
   const char *maybe_abort_msg = "(perhaps abort was called)";
@@ -1270,17 +1280,20 @@  nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
   /* This reserves a chunk of a pre-allocated page of memory mapped on both
      the host and the device. HP is a host pointer to the new chunk, and DP is
      the corresponding device pointer.  */
-  map_push (dev_str, mapnum * sizeof (void *), &hp, &dp);
+  pthread_mutex_lock (&ptx_event_lock);
+  dp = map_push (dev_str, mapnum * sizeof (void *));
+  pthread_mutex_unlock (&ptx_event_lock);
 
   GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
 
   /* Copy the array of arguments to the mapped page.  */
+  hp = alloca(sizeof(void *) * mapnum);
   for (i = 0; i < mapnum; i++)
     ((void **) hp)[i] = devaddrs[i];
 
   /* Copy the (device) pointers to arguments to the device (dp and hp might in
      fact have the same value on a unified-memory system).  */
-  CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
+  CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp,
 		    mapnum * sizeof (void *));
   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
 		     " gangs=%u, workers=%u, vectors=%u\n",
-- 
2.7.4