
[OpenACC,libgomp,v6,stage1] Async-rework update

Message ID 4ea36c00-5e38-d92b-e7cd-5809dfb79934@mentor.com
State New
Series [OpenACC,libgomp,v6,stage1] Async-rework update

Commit Message

Chung-Lin Tang Feb. 25, 2019, 5:49 p.m. UTC
Hi Thomas,
I have incorporated all the patches you included in your last mail (with
some modifications, though pretty minor I think).

The default_async, GOMP_PLUGIN_IF_VERSION, and testsuite changes have all been removed.
We can work on them later as we clarify more things.

Thanks,
Chung-Lin

Comments

Thomas Schwinge Feb. 26, 2019, 1:51 p.m. UTC | #1
Hi Chung-Lin!

On Tue, 26 Feb 2019 01:49:09 +0800, Chung-Lin Tang <chunglin_tang@mentor.com> wrote:
> I have incorporated all the patches you included in your last mail (with
> some modifications, though pretty minor I think).

OK, thanks, that's good for the next GCC development stage 1 as far as I'm
concerned.  Tom has already approved the libgomp 'nvptx' plugin changes,
and I suppose you've addressed Jakub's requests -- so you're good to
go!  :-)


> The default_async, GOMP_PLUGIN_IF_VERSION, and testsuite changes have all been removed.
> We can work on them later as we clarify more things.

ACK, and likewise for the remaining "TODO" items (either comments in the
source code, or points discussed in email), once we've clarified these.


As I mentioned before, the 'GOMP_PLUGIN_IF_VERSION' changes could go into
trunk right now, given they boil down to just a documentation update.
(And the value then incremented as part of the "async re-work" commit;
pre-approved.)


Regards
 Thomas


> Index: libgomp/oacc-async.c
> ===================================================================
> --- libgomp/oacc-async.c	(revision 269183)
> +++ libgomp/oacc-async.c	(working copy)
> @@ -27,49 +27,162 @@
>     <http://www.gnu.org/licenses/>.  */
>  
>  #include <assert.h>
> +#include <string.h>
>  #include "openacc.h"
>  #include "libgomp.h"
>  #include "oacc-int.h"
>  
> -int
> -acc_async_test (int async)
> +static struct goacc_thread *
> +get_goacc_thread (void)
>  {
> -  if (!async_valid_p (async))
> -    gomp_fatal ("invalid async argument: %d", async);
> -
>    struct goacc_thread *thr = goacc_thread ();
>  
>    if (!thr || !thr->dev)
>      gomp_fatal ("no device active");
>  
> -  return thr->dev->openacc.async_test_func (async);
> +  return thr;
>  }
>  
> -int
> -acc_async_test_all (void)
> +static struct gomp_device_descr *
> +get_goacc_thread_device (void)
>  {
>    struct goacc_thread *thr = goacc_thread ();
>  
>    if (!thr || !thr->dev)
>      gomp_fatal ("no device active");
>  
> -  return thr->dev->openacc.async_test_all_func ();
> +  return thr->dev;
>  }
>  
> -void
> -acc_wait (int async)
> +static int
> +validate_async_val (int async)
>  {
>    if (!async_valid_p (async))
> -    gomp_fatal ("invalid async argument: %d", async);
> +    gomp_fatal ("invalid async-argument: %d", async);
>  
> +  if (async == acc_async_sync)
> +    return -1;
> +
> +  if (async == acc_async_noval)
> +    return 0;
> +
> +  if (async >= 0)
> +    /* TODO: we reserve 0 for acc_async_noval before we can clarify the
> +       semantics of "default_async".  */
> +    return 1 + async;
> +  else
> +    __builtin_unreachable ();
> +}
> +
> +/* Return the asyncqueue to be used for OpenACC async-argument ASYNC.  This
> +   might return NULL if no asyncqueue is to be used.  Otherwise, if CREATE,
> +   create the asyncqueue if it doesn't exist yet.  */
> +
> +attribute_hidden struct goacc_asyncqueue *
> +lookup_goacc_asyncqueue (struct goacc_thread *thr, bool create, int async)
> +{
> +  async = validate_async_val (async);
> +  if (async < 0)
> +    return NULL;
> +
> +  struct goacc_asyncqueue *ret_aq = NULL;
> +  struct gomp_device_descr *dev = thr->dev;
> +
> +  gomp_mutex_lock (&dev->openacc.async.lock);
> +
> +  if (!create
> +      && (async >= dev->openacc.async.nasyncqueue
> +	  || !dev->openacc.async.asyncqueue[async]))
> +    goto end;
> +
> +  if (async >= dev->openacc.async.nasyncqueue)
> +    {
> +      int diff = async + 1 - dev->openacc.async.nasyncqueue;
> +      dev->openacc.async.asyncqueue
> +	= gomp_realloc (dev->openacc.async.asyncqueue,
> +			sizeof (goacc_aq) * (async + 1));
> +      memset (dev->openacc.async.asyncqueue + dev->openacc.async.nasyncqueue,
> +	      0, sizeof (goacc_aq) * diff);
> +      dev->openacc.async.nasyncqueue = async + 1;
> +    }
> +
> +  if (!dev->openacc.async.asyncqueue[async])
> +    {
> +      dev->openacc.async.asyncqueue[async] = dev->openacc.async.construct_func ();
> +
> +      if (!dev->openacc.async.asyncqueue[async])
> +	{
> +	  gomp_mutex_unlock (&dev->openacc.async.lock);
> +	  gomp_fatal ("async %d creation failed", async);
> +	}
> +      
> +      /* Link new async queue into active list.  */
> +      goacc_aq_list n = gomp_malloc (sizeof (struct goacc_asyncqueue_list));
> +      n->aq = dev->openacc.async.asyncqueue[async];
> +      n->next = dev->openacc.async.active;
> +      dev->openacc.async.active = n;
> +    }
> +
> +  ret_aq = dev->openacc.async.asyncqueue[async];
> +
> + end:
> +  gomp_mutex_unlock (&dev->openacc.async.lock);
> +  return ret_aq;
> +}
> +
> +/* Return the asyncqueue to be used for OpenACC async-argument ASYNC.  This
> +   might return NULL if no asyncqueue is to be used.  Otherwise, create the
> +   asyncqueue if it doesn't exist yet.  */
> +
> +attribute_hidden struct goacc_asyncqueue *
> +get_goacc_asyncqueue (int async)
> +{
> +  struct goacc_thread *thr = get_goacc_thread ();
> +  return lookup_goacc_asyncqueue (thr, true, async);
> +}
> +
> +int
> +acc_async_test (int async)
> +{
>    struct goacc_thread *thr = goacc_thread ();
>  
>    if (!thr || !thr->dev)
>      gomp_fatal ("no device active");
>  
> -  thr->dev->openacc.async_wait_func (async);
> +  goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
> +  if (!aq)
> +    return 1;
> +  else
> +    return thr->dev->openacc.async.test_func (aq);
>  }
>  
> +int
> +acc_async_test_all (void)
> +{
> +  struct goacc_thread *thr = get_goacc_thread ();
> +
> +  int ret = 1;
> +  gomp_mutex_lock (&thr->dev->openacc.async.lock);
> +  for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
> +    if (!thr->dev->openacc.async.test_func (l->aq))
> +      {
> +	ret = 0;
> +	break;
> +      }
> +  gomp_mutex_unlock (&thr->dev->openacc.async.lock);
> +  return ret;
> +}
> +
> +void
> +acc_wait (int async)
> +{
> +  struct goacc_thread *thr = get_goacc_thread ();
> +
> +  goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
> +  if (aq && !thr->dev->openacc.async.synchronize_func (aq))
> +    gomp_fatal ("wait on %d failed", async);
> +}
> +
>  /* acc_async_wait is an OpenACC 1.0 compatibility name for acc_wait.  */
>  #ifdef HAVE_ATTRIBUTE_ALIAS
>  strong_alias (acc_wait, acc_async_wait)
> @@ -84,23 +197,46 @@ acc_async_wait (int async)
>  void
>  acc_wait_async (int async1, int async2)
>  {
> -  struct goacc_thread *thr = goacc_thread ();
> +  struct goacc_thread *thr = get_goacc_thread ();
>  
> -  if (!thr || !thr->dev)
> -    gomp_fatal ("no device active");
> +  goacc_aq aq1 = lookup_goacc_asyncqueue (thr, false, async1);
> +  /* TODO: Is this also correct for acc_async_sync, assuming that in this case,
> +     we'll always be synchronous anyways?  */
> +  if (!aq1)
> +    return;
>  
> -  thr->dev->openacc.async_wait_async_func (async1, async2);
> +  goacc_aq aq2 = lookup_goacc_asyncqueue (thr, true, async2);
> +  /* An async queue is always synchronized with itself.  */
> +  if (aq1 == aq2)
> +    return;
> +
> +  if (aq2)
> +    {
> +      if (!thr->dev->openacc.async.serialize_func (aq1, aq2))
> +	gomp_fatal ("ordering of async ids %d and %d failed", async1, async2);
> +    }
> +  else
> +    {
> +      /* TODO: Local thread synchronization.
> +	 Necessary for the "async2 == acc_async_sync" case, or can just skip?  */
> +      if (!thr->dev->openacc.async.synchronize_func (aq1))
> +	gomp_fatal ("wait on %d failed", async1);
> +    }
>  }
>  
>  void
>  acc_wait_all (void)
>  {
> -  struct goacc_thread *thr = goacc_thread ();
> +  struct gomp_device_descr *dev = get_goacc_thread_device ();
>  
> -  if (!thr || !thr->dev)
> -    gomp_fatal ("no device active");
> +  bool ret = true;
> +  gomp_mutex_lock (&dev->openacc.async.lock);
> +  for (goacc_aq_list l = dev->openacc.async.active; l; l = l->next)
> +    ret &= dev->openacc.async.synchronize_func (l->aq);
> +  gomp_mutex_unlock (&dev->openacc.async.lock);
>  
> -  thr->dev->openacc.async_wait_all_func ();
> +  if (!ret)
> +    gomp_fatal ("wait all failed");
>  }
>  
>  /* acc_async_wait_all is an OpenACC 1.0 compatibility name for acc_wait_all.  */
> @@ -117,13 +253,73 @@ acc_async_wait_all (void)
>  void
>  acc_wait_all_async (int async)
>  {
> -  if (!async_valid_p (async))
> -    gomp_fatal ("invalid async argument: %d", async);
> +  struct goacc_thread *thr = get_goacc_thread ();
>  
> -  struct goacc_thread *thr = goacc_thread ();
> +  goacc_aq waiting_queue = lookup_goacc_asyncqueue (thr, true, async);
>  
> -  if (!thr || !thr->dev)
> -    gomp_fatal ("no device active");
> +  bool ret = true;
> +  gomp_mutex_lock (&thr->dev->openacc.async.lock);
> +  for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
> +    {
> +      if (waiting_queue)
> +	ret &= thr->dev->openacc.async.serialize_func (l->aq, waiting_queue);
> +      else
> +	/* TODO: Local thread synchronization.
> +	   Necessary for the "async2 == acc_async_sync" case, or can just skip?  */
> +	ret &= thr->dev->openacc.async.synchronize_func (l->aq);
> +    }
> +  gomp_mutex_unlock (&thr->dev->openacc.async.lock);
>  
> -  thr->dev->openacc.async_wait_all_async_func (async);
> +  if (!ret)
> +    gomp_fatal ("wait all async(%d) failed", async);
>  }
> +
> +attribute_hidden void
> +goacc_async_free (struct gomp_device_descr *devicep,
> +		  struct goacc_asyncqueue *aq, void *ptr)
> +{
> +  if (!aq)
> +    free (ptr);
> +  else
> +    devicep->openacc.async.queue_callback_func (aq, free, ptr);
> +}
> +
> +/* This function initializes the asyncqueues for the device specified by
> +   DEVICEP.  TODO DEVICEP must be locked on entry, and remains locked on
> +   return.  */
> +
> +attribute_hidden void
> +goacc_init_asyncqueues (struct gomp_device_descr *devicep)
> +{
> +  devicep->openacc.async.nasyncqueue = 0;
> +  devicep->openacc.async.asyncqueue = NULL;
> +  devicep->openacc.async.active = NULL;
> +  gomp_mutex_init (&devicep->openacc.async.lock);
> +}
> +
> +/* This function finalizes the asyncqueues for the device specified by DEVICEP.
> +   TODO DEVICEP must be locked on entry, and remains locked on return.  */
> +
> +attribute_hidden bool
> +goacc_fini_asyncqueues (struct gomp_device_descr *devicep)
> +{
> +  bool ret = true;
> +  gomp_mutex_lock (&devicep->openacc.async.lock);
> +  if (devicep->openacc.async.nasyncqueue > 0)
> +    {
> +      goacc_aq_list next;
> +      for (goacc_aq_list l = devicep->openacc.async.active; l; l = next)
> +	{
> +	  ret &= devicep->openacc.async.destruct_func (l->aq);
> +	  next = l->next;
> +	  free (l);
> +	}
> +      free (devicep->openacc.async.asyncqueue);
> +      devicep->openacc.async.nasyncqueue = 0;
> +      devicep->openacc.async.asyncqueue = NULL;
> +      devicep->openacc.async.active = NULL;
> +    }
> +  gomp_mutex_unlock (&devicep->openacc.async.lock);
> +  gomp_mutex_destroy (&devicep->openacc.async.lock);
> +  return ret;
> +}
> Index: libgomp/oacc-plugin.c
> ===================================================================
> --- libgomp/oacc-plugin.c	(revision 269183)
> +++ libgomp/oacc-plugin.c	(working copy)
> @@ -30,15 +30,12 @@
>  #include "oacc-plugin.h"
>  #include "oacc-int.h"
>  
> +/* This plugin function is now obsolete.  */
>  void
> -GOMP_PLUGIN_async_unmap_vars (void *ptr, int async)
> +GOMP_PLUGIN_async_unmap_vars (void *ptr __attribute__((unused)),
> +			      int async __attribute__((unused)))
>  {
> -  struct target_mem_desc *tgt = ptr;
> -  struct gomp_device_descr *devicep = tgt->device_descr;
> -
> -  devicep->openacc.async_set_async_func (async);
> -  gomp_unmap_vars (tgt, true);
> -  devicep->openacc.async_set_async_func (acc_async_sync);
> +  gomp_fatal ("invalid plugin function");
>  }
>  
>  /* Return the target-specific part of the TLS data for the current thread.  */
> Index: libgomp/plugin/cuda/cuda.h
> ===================================================================
> --- libgomp/plugin/cuda/cuda.h	(revision 269183)
> +++ libgomp/plugin/cuda/cuda.h	(working copy)
> @@ -54,7 +54,11 @@ typedef enum {
>    CUDA_ERROR_INVALID_CONTEXT = 201,
>    CUDA_ERROR_NOT_FOUND = 500,
>    CUDA_ERROR_NOT_READY = 600,
> -  CUDA_ERROR_LAUNCH_FAILED = 719
> +  CUDA_ERROR_LAUNCH_FAILED = 719,
> +  CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720,
> +  CUDA_ERROR_NOT_PERMITTED = 800,
> +  CUDA_ERROR_NOT_SUPPORTED = 801,
> +  CUDA_ERROR_UNKNOWN = 999
>  } CUresult;
>  
>  typedef enum {
> @@ -173,6 +177,8 @@ CUresult cuModuleLoadData (CUmodule *, const void
>  CUresult cuModuleUnload (CUmodule);
>  CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
>  					  CUoccupancyB2DSize, size_t, int);
> +typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
> +CUresult cuStreamAddCallback(CUstream, CUstreamCallback, void *, unsigned int);
>  CUresult cuStreamCreate (CUstream *, unsigned);
>  #define cuStreamDestroy cuStreamDestroy_v2
>  CUresult cuStreamDestroy (CUstream);
> Index: libgomp/plugin/plugin-nvptx.c
> ===================================================================
> --- libgomp/plugin/plugin-nvptx.c	(revision 269183)
> +++ libgomp/plugin/plugin-nvptx.c	(working copy)
> @@ -192,175 +192,30 @@ cuda_error (CUresult r)
>  static unsigned int instantiated_devices = 0;
>  static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
>  
> -struct cuda_map
> +/* NVPTX/CUDA specific definition of asynchronous queues.  */
> +struct goacc_asyncqueue
>  {
> -  CUdeviceptr d;
> -  size_t size;
> -  bool active;
> -  struct cuda_map *next;
> +  CUstream cuda_stream;
>  };
>  
> -struct ptx_stream
> +struct nvptx_callback
>  {
> -  CUstream stream;
> -  pthread_t host_thread;
> -  bool multithreaded;
> -  struct cuda_map *map;
> -  struct ptx_stream *next;
> +  void (*fn) (void *);
> +  void *ptr;
> +  struct goacc_asyncqueue *aq;
> +  struct nvptx_callback *next;
>  };
>  
>  /* Thread-specific data for PTX.  */
>  
>  struct nvptx_thread
>  {
> -  struct ptx_stream *current_stream;
> +  /* We currently have this embedded inside the plugin because libgomp manages
> +     devices through integer target_ids.  This might be better if using an
> +     opaque target-specific pointer directly from gomp_device_descr.  */
>    struct ptx_device *ptx_dev;
>  };
>  
> -static struct cuda_map *
> -cuda_map_create (size_t size)
> -{
> -  struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map));
> -
> -  assert (map);
> -
> -  map->next = NULL;
> -  map->size = size;
> -  map->active = false;
> -
> -  CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size);
> -  assert (map->d);
> -
> -  return map;
> -}
> -
> -static void
> -cuda_map_destroy (struct cuda_map *map)
> -{
> -  if (map->active)
> -    /* Possible reasons for the map to be still active:
> -       - the associated async kernel might still be running.
> -       - the associated async kernel might have finished, but the
> -         corresponding event that should trigger the pop_map has not been
> -	 processed by event_gc.
> -       - the associated sync kernel might have aborted
> -
> -       The async cases could happen if the user specified an async region
> -       without adding a corresponding wait that is guaranteed to be executed
> -       (before returning from main, or in an atexit handler).
> -       We do not want to deallocate a device pointer that is still being
> -       used, so skip it.
> -
> -       In the sync case, the device pointer is no longer used, but deallocating
> -       it using cuMemFree will not succeed, so skip it.
> -
> -       TODO: Handle this in a more constructive way, by f.i. waiting for streams
> -       to finish before de-allocating them (PR88981), or by ensuring the CUDA
> -       lib atexit handler is called before rather than after the libgomp plugin
> -       atexit handler (PR83795).  */
> -    ;
> -  else
> -    CUDA_CALL_NOCHECK (cuMemFree, map->d);
> -
> -  free (map);
> -}
> -
> -/* The following map_* routines manage the CUDA device memory that
> -   contains the data mapping arguments for cuLaunchKernel.  Each
> -   asynchronous PTX stream may have multiple pending kernel
> -   invocations, which are launched in a FIFO order.  As such, the map
> -   routines maintains a queue of cuLaunchKernel arguments.
> -
> -   Calls to map_push and map_pop must be guarded by ptx_event_lock.
> -   Likewise, calls to map_init and map_fini are guarded by
> -   ptx_dev_lock inside GOMP_OFFLOAD_init_device and
> -   GOMP_OFFLOAD_fini_device, respectively.  */
> -
> -static bool
> -map_init (struct ptx_stream *s)
> -{
> -  int size = getpagesize ();
> -
> -  assert (s);
> -
> -  s->map = cuda_map_create (size);
> -
> -  return true;
> -}
> -
> -static bool
> -map_fini (struct ptx_stream *s)
> -{
> -  assert (s->map->next == NULL);
> -
> -  cuda_map_destroy (s->map);
> -
> -  return true;
> -}
> -
> -static void
> -map_pop (struct ptx_stream *s)
> -{
> -  struct cuda_map *next;
> -
> -  assert (s != NULL);
> -
> -  if (s->map->next == NULL)
> -    {
> -      s->map->active = false;
> -      return;
> -    }
> -
> -  next = s->map->next;
> -  cuda_map_destroy (s->map);
> -  s->map = next;
> -}
> -
> -static CUdeviceptr
> -map_push (struct ptx_stream *s, size_t size)
> -{
> -  struct cuda_map *map = NULL;
> -  struct cuda_map **t;
> -
> -  assert (s);
> -  assert (s->map);
> -
> -  /* Select an element to push.  */
> -  if (s->map->active)
> -    map = cuda_map_create (size);
> -  else
> -    {
> -      /* Pop the inactive front element.  */
> -      struct cuda_map *pop = s->map;
> -      s->map = pop->next;
> -      pop->next = NULL;
> -
> -      if (pop->size < size)
> -	{
> -	  cuda_map_destroy (pop);
> -
> -	  map = cuda_map_create (size);
> -	}
> -      else
> -	map = pop;
> -    }
> -
> -  /* Check that the element is as expected.  */
> -  assert (map->next == NULL);
> -  assert (!map->active);
> -
> -  /* Mark the element active.  */
> -  map->active = true;
> -
> -  /* Push the element to the back of the list.  */
> -  for (t = &s->map; (*t) != NULL; t = &(*t)->next)
> -    ;
> -  assert (t != NULL && *t == NULL);
> -  *t = map;
> -
> -  return map->d;
> -}
> -
>  /* Target data function launch information.  */
>  
>  struct targ_fn_launch
> @@ -412,22 +267,18 @@ struct ptx_image_data
>    struct ptx_image_data *next;
>  };
>  
> +struct ptx_free_block
> +{
> +  void *ptr;
> +  struct ptx_free_block *next;
> +};
> +
>  struct ptx_device
>  {
>    CUcontext ctx;
>    bool ctx_shared;
>    CUdevice dev;
> -  struct ptx_stream *null_stream;
> -  /* All non-null streams associated with this device (actually context),
> -     either created implicitly or passed in from the user (via
> -     acc_set_cuda_stream).  */
> -  struct ptx_stream *active_streams;
> -  struct {
> -    struct ptx_stream **arr;
> -    int size;
> -  } async_streams;
> -  /* A lock for use when manipulating the above stream list and array.  */
> -  pthread_mutex_t stream_lock;
> +
>    int ord;
>    bool overlap;
>    bool map;
> @@ -445,32 +296,13 @@ struct ptx_device
>  
>    struct ptx_image_data *images;  /* Images loaded on device.  */
>    pthread_mutex_t image_lock;     /* Lock for above list.  */
> -  
> -  struct ptx_device *next;
> -};
>  
> -enum ptx_event_type
> -{
> -  PTX_EVT_MEM,
> -  PTX_EVT_KNL,
> -  PTX_EVT_SYNC,
> -  PTX_EVT_ASYNC_CLEANUP
> -};
> +  struct ptx_free_block *free_blocks;
> +  pthread_mutex_t free_blocks_lock;
>  
> -struct ptx_event
> -{
> -  CUevent *evt;
> -  int type;
> -  void *addr;
> -  int ord;
> -  int val;
> -
> -  struct ptx_event *next;
> +  struct ptx_device *next;
>  };
>  
> -static pthread_mutex_t ptx_event_lock;
> -static struct ptx_event *ptx_events;
> -
>  static struct ptx_device **ptx_devices;
>  
>  static inline struct nvptx_thread *
> @@ -479,193 +311,6 @@ nvptx_thread (void)
>    return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
>  }
>  
> -static bool
> -init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
> -{
> -  int i;
> -  struct ptx_stream *null_stream
> -    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
> -
> -  null_stream->stream = NULL;
> -  null_stream->host_thread = pthread_self ();
> -  null_stream->multithreaded = true;
> -  if (!map_init (null_stream))
> -    return false;
> -
> -  ptx_dev->null_stream = null_stream;
> -  ptx_dev->active_streams = NULL;
> -  pthread_mutex_init (&ptx_dev->stream_lock, NULL);
> -
> -  if (concurrency < 1)
> -    concurrency = 1;
> -
> -  /* This is just a guess -- make space for as many async streams as the
> -     current device is capable of concurrently executing.  This can grow
> -     later as necessary.  No streams are created yet.  */
> -  ptx_dev->async_streams.arr
> -    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
> -  ptx_dev->async_streams.size = concurrency;
> -
> -  for (i = 0; i < concurrency; i++)
> -    ptx_dev->async_streams.arr[i] = NULL;
> -
> -  return true;
> -}
> -
> -static bool
> -fini_streams_for_device (struct ptx_device *ptx_dev)
> -{
> -  free (ptx_dev->async_streams.arr);
> -
> -  bool ret = true;
> -  while (ptx_dev->active_streams != NULL)
> -    {
> -      struct ptx_stream *s = ptx_dev->active_streams;
> -      ptx_dev->active_streams = ptx_dev->active_streams->next;
> -
> -      ret &= map_fini (s);
> -
> -      CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
> -      if (r != CUDA_SUCCESS)
> -	{
> -	  GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
> -	  ret = false;
> -	}
> -      free (s);
> -    }
> -
> -  ret &= map_fini (ptx_dev->null_stream);
> -  free (ptx_dev->null_stream);
> -  return ret;
> -}
> -
> -/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
> -   thread THREAD (and also current device/context).  If CREATE is true, create
> -   the stream if it does not exist (or use EXISTING if it is non-NULL), and
> -   associate the stream with the same thread argument.  Returns stream to use
> -   as result.  */
> -
> -static struct ptx_stream *
> -select_stream_for_async (int async, pthread_t thread, bool create,
> -			 CUstream existing)
> -{
> -  struct nvptx_thread *nvthd = nvptx_thread ();
> -  /* Local copy of TLS variable.  */
> -  struct ptx_device *ptx_dev = nvthd->ptx_dev;
> -  struct ptx_stream *stream = NULL;
> -  int orig_async = async;
> -
> -  /* The special value acc_async_noval (-1) maps (for now) to an
> -     implicitly-created stream, which is then handled the same as any other
> -     numbered async stream.  Other options are available, e.g. using the null
> -     stream for anonymous async operations, or choosing an idle stream from an
> -     active set.  But, stick with this for now.  */
> -  if (async > acc_async_sync)
> -    async++;
> -
> -  if (create)
> -    pthread_mutex_lock (&ptx_dev->stream_lock);
> -
> -  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
> -     null stream, and in fact better performance may be obtainable if it doesn't
> -     (because the null stream enforces overly-strict synchronisation with
> -     respect to other streams for legacy reasons, and that's probably not
> -     needed with OpenACC).  Maybe investigate later.  */
> -  if (async == acc_async_sync)
> -    stream = ptx_dev->null_stream;
> -  else if (async >= 0 && async < ptx_dev->async_streams.size
> -	   && ptx_dev->async_streams.arr[async] && !(create && existing))
> -    stream = ptx_dev->async_streams.arr[async];
> -  else if (async >= 0 && create)
> -    {
> -      if (async >= ptx_dev->async_streams.size)
> -	{
> -	  int i, newsize = ptx_dev->async_streams.size * 2;
> -
> -	  if (async >= newsize)
> -	    newsize = async + 1;
> -
> -	  ptx_dev->async_streams.arr
> -	    = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
> -				   newsize * sizeof (struct ptx_stream *));
> -
> -	  for (i = ptx_dev->async_streams.size; i < newsize; i++)
> -	    ptx_dev->async_streams.arr[i] = NULL;
> -
> -	  ptx_dev->async_streams.size = newsize;
> -	}
> -
> -      /* Create a new stream on-demand if there isn't one already, or if we're
> -	 setting a particular async value to an existing (externally-provided)
> -	 stream.  */
> -      if (!ptx_dev->async_streams.arr[async] || existing)
> -        {
> -	  CUresult r;
> -	  struct ptx_stream *s
> -	    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
> -
> -	  if (existing)
> -	    s->stream = existing;
> -	  else
> -	    {
> -	      r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
> -				     CU_STREAM_DEFAULT);
> -	      if (r != CUDA_SUCCESS)
> -		{
> -		  pthread_mutex_unlock (&ptx_dev->stream_lock);
> -		  GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
> -				     cuda_error (r));
> -		}
> -	    }
> -
> -	  /* If CREATE is true, we're going to be queueing some work on this
> -	     stream.  Associate it with the current host thread.  */
> -	  s->host_thread = thread;
> -	  s->multithreaded = false;
> -
> -	  if (!map_init (s))
> -	    {
> -	      pthread_mutex_unlock (&ptx_dev->stream_lock);
> -	      GOMP_PLUGIN_fatal ("map_init fail");
> -	    }
> -
> -	  s->next = ptx_dev->active_streams;
> -	  ptx_dev->active_streams = s;
> -	  ptx_dev->async_streams.arr[async] = s;
> -	}
> -
> -      stream = ptx_dev->async_streams.arr[async];
> -    }
> -  else if (async < 0)
> -    {
> -      if (create)
> -	pthread_mutex_unlock (&ptx_dev->stream_lock);
> -      GOMP_PLUGIN_fatal ("bad async %d", async);
> -    }
> -
> -  if (create)
> -    {
> -      assert (stream != NULL);
> -
> -      /* If we're trying to use the same stream from different threads
> -	 simultaneously, set stream->multithreaded to true.  This affects the
> -	 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
> -	 only wait for asynchronous launches from the same host thread they are
> -	 invoked on.  If multiple threads use the same async value, we make note
> -	 of that here and fall back to testing/waiting for all threads in those
> -	 functions.  */
> -      if (thread != stream->host_thread)
> -        stream->multithreaded = true;
> -
> -      pthread_mutex_unlock (&ptx_dev->stream_lock);
> -    }
> -  else if (stream && !stream->multithreaded
> -	   && !pthread_equal (stream->host_thread, thread))
> -    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
> -
> -  return stream;
> -}
> -
>  /* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
>     should be locked on entry and remains locked on exit.  */
>  
> @@ -677,9 +322,6 @@ nvptx_init (void)
>    if (instantiated_devices != 0)
>      return true;
>  
> -  ptx_events = NULL;
> -  pthread_mutex_init (&ptx_event_lock, NULL);
> -
>    if (!init_cuda_lib ())
>      return false;
>  
> @@ -703,6 +345,11 @@ nvptx_attach_host_thread_to_device (int n)
>    CUcontext thd_ctx;
>  
>    r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
> +  if (r == CUDA_ERROR_NOT_PERMITTED)
> +    {
> +      /* Assume we're in a CUDA callback, just return true.  */
> +      return true;
> +    }
>    if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
>      {
>        GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
> @@ -847,8 +494,8 @@ nvptx_open_device (int n)
>    ptx_dev->images = NULL;
>    pthread_mutex_init (&ptx_dev->image_lock, NULL);
>  
> -  if (!init_streams_for_device (ptx_dev, async_engines))
> -    return NULL;
> +  ptx_dev->free_blocks = NULL;
> +  pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
>  
>    return ptx_dev;
>  }
> @@ -859,9 +506,15 @@ nvptx_close_device (struct ptx_device *ptx_dev)
>    if (!ptx_dev)
>      return true;
>  
> -  if (!fini_streams_for_device (ptx_dev))
> -    return false;
> -  
> +  for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
> +    {
> +      struct ptx_free_block *b_next = b->next;
> +      CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
> +      free (b);
> +      b = b_next;
> +    }
> +
> +  pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
>    pthread_mutex_destroy (&ptx_dev->image_lock);
>  
>    if (!ptx_dev->ctx_shared)
> @@ -1041,139 +694,19 @@ link_ptx (CUmodule *module, const struct targ_ptx_
>  }
>  
>  static void
> -event_gc (bool memmap_lockable)
> -{
> -  struct ptx_event *ptx_event = ptx_events;
> -  struct ptx_event *async_cleanups = NULL;
> -  struct nvptx_thread *nvthd = nvptx_thread ();
> -
> -  pthread_mutex_lock (&ptx_event_lock);
> -
> -  while (ptx_event != NULL)
> -    {
> -      CUresult r;
> -      struct ptx_event *e = ptx_event;
> -
> -      ptx_event = ptx_event->next;
> -
> -      if (e->ord != nvthd->ptx_dev->ord)
> -	continue;
> -
> -      r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
> -      if (r == CUDA_SUCCESS)
> -	{
> -	  bool append_async = false;
> -	  CUevent *te;
> -
> -	  te = e->evt;
> -
> -	  switch (e->type)
> -	    {
> -	    case PTX_EVT_MEM:
> -	    case PTX_EVT_SYNC:
> -	      break;
> -
> -	    case PTX_EVT_KNL:
> -	      map_pop (e->addr);
> -	      break;
> -
> -	    case PTX_EVT_ASYNC_CLEANUP:
> -	      {
> -		/* The function gomp_plugin_async_unmap_vars needs to claim the
> -		   memory-map splay tree lock for the current device, so we
> -		   can't call it when one of our callers has already claimed
> -		   the lock.  In that case, just delay the GC for this event
> -		   until later.  */
> -		if (!memmap_lockable)
> -		  continue;
> -
> -		append_async = true;
> -	      }
> -	      break;
> -	    }
> -
> -	  CUDA_CALL_NOCHECK (cuEventDestroy, *te);
> -	  free ((void *)te);
> -
> -	  /* Unlink 'e' from ptx_events list.  */
> -	  if (ptx_events == e)
> -	    ptx_events = ptx_events->next;
> -	  else
> -	    {
> -	      struct ptx_event *e_ = ptx_events;
> -	      while (e_->next != e)
> -		e_ = e_->next;
> -	      e_->next = e_->next->next;
> -	    }
> -
> -	  if (append_async)
> -	    {
> -	      e->next = async_cleanups;
> -	      async_cleanups = e;
> -	    }
> -	  else
> -	    free (e);
> -	}
> -    }
> -
> -  pthread_mutex_unlock (&ptx_event_lock);
> -
> -  /* We have to do these here, after ptx_event_lock is released.  */
> -  while (async_cleanups)
> -    {
> -      struct ptx_event *e = async_cleanups;
> -      async_cleanups = async_cleanups->next;
> -
> -      GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
> -      free (e);
> -    }
> -}
> -
> -static void
> -event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
> -{
> -  struct ptx_event *ptx_event;
> -  struct nvptx_thread *nvthd = nvptx_thread ();
> -
> -  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
> -	  || type == PTX_EVT_ASYNC_CLEANUP);
> -
> -  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
> -  ptx_event->type = type;
> -  ptx_event->evt = e;
> -  ptx_event->addr = h;
> -  ptx_event->ord = nvthd->ptx_dev->ord;
> -  ptx_event->val = val;
> -
> -  pthread_mutex_lock (&ptx_event_lock);
> -
> -  ptx_event->next = ptx_events;
> -  ptx_events = ptx_event;
> -
> -  pthread_mutex_unlock (&ptx_event_lock);
> -}
> -
> -static void
>  nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
> -	    int async, unsigned *dims, void *targ_mem_desc)
> +	    unsigned *dims, void *targ_mem_desc,
> +	    CUdeviceptr dp, CUstream stream)
>  {
>    struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
>    CUfunction function;
> -  CUresult r;
>    int i;
> -  struct ptx_stream *dev_str;
>    void *kargs[1];
> -  void *hp;
> -  CUdeviceptr dp = 0;
>    struct nvptx_thread *nvthd = nvptx_thread ();
>    int warp_size = nvthd->ptx_dev->warp_size;
> -  const char *maybe_abort_msg = "(perhaps abort was called)";
>  
>    function = targ_fn->fn;
>  
> -  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
> -  assert (dev_str == nvthd->current_stream);
> -
>    /* Initialize the launch dimensions.  Typically this is constant,
>       provided by the device compiler, but we must permit runtime
>       values.  */
> @@ -1361,27 +894,6 @@ nvptx_exec (void (*fn), size_t mapnum, void **host
>  			   dims[GOMP_DIM_VECTOR]);
>      }
>  
> -  if (mapnum > 0)
> -    {
> -      /* This reserves a chunk of a pre-allocated page of memory mapped on both
> -	 the host and the device. HP is a host pointer to the new chunk, and DP is
> -	 the corresponding device pointer.  */
> -      pthread_mutex_lock (&ptx_event_lock);
> -      dp = map_push (dev_str, mapnum * sizeof (void *));
> -      pthread_mutex_unlock (&ptx_event_lock);
> -
> -      GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
> -
> -      /* Copy the array of arguments to the mapped page.  */
> -      hp = alloca(sizeof(void *) * mapnum);
> -      for (i = 0; i < mapnum; i++)
> -	((void **) hp)[i] = devaddrs[i];
> -
> -      /* Copy the (device) pointers to arguments to the device */
> -      CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp,
> -			mapnum * sizeof (void *));
> -    }
> -
>    GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
>  		     " gangs=%u, workers=%u, vectors=%u\n",
>  		     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
> @@ -1392,62 +904,14 @@ nvptx_exec (void (*fn), size_t mapnum, void **host
>    // num_gangs		nctaid.x
>    // num_workers	ntid.y
>    // vector length	ntid.x
> -
>    kargs[0] = &dp;
>    CUDA_CALL_ASSERT (cuLaunchKernel, function,
>  		    dims[GOMP_DIM_GANG], 1, 1,
>  		    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
> -		    0, dev_str->stream, kargs, 0);
> +		    0, stream, kargs, 0);
>  
> -#ifndef DISABLE_ASYNC
> -  if (async < acc_async_noval)
> -    {
> -      r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
> -      if (r == CUDA_ERROR_LAUNCH_FAILED)
> -	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
> -			   maybe_abort_msg);
> -      else if (r != CUDA_SUCCESS)
> -        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
> -    }
> -  else
> -    {
> -      CUevent *e;
> -
> -      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
> -
> -      r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
> -      if (r == CUDA_ERROR_LAUNCH_FAILED)
> -	GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
> -			   maybe_abort_msg);
> -      else if (r != CUDA_SUCCESS)
> -        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
> -
> -      event_gc (true);
> -
> -      CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
> -
> -      if (mapnum > 0)
> -	event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
> -    }
> -#else
> -  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
> -  if (r == CUDA_ERROR_LAUNCH_FAILED)
> -    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
> -		       maybe_abort_msg);
> -  else if (r != CUDA_SUCCESS)
> -    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
> -#endif
> -
>    GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
>  		     targ_fn->launch->fn);
> -
> -#ifndef DISABLE_ASYNC
> -  if (async < acc_async_noval)
> -#endif
> -    {
> -      if (mapnum > 0)
> -	map_pop (dev_str);
> -    }
>  }
>  
>  void * openacc_get_current_cuda_context (void);
> @@ -1462,8 +926,21 @@ nvptx_alloc (size_t s)
>  }
>  
>  static bool
> -nvptx_free (void *p)
> +nvptx_free (void *p, struct ptx_device *ptx_dev)
>  {
> +  /* Assume callback context if this is null.  */
> +  if (GOMP_PLUGIN_acc_thread () == NULL)
> +    {
> +      struct ptx_free_block *n
> +	= GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
> +      n->ptr = p;
> +      pthread_mutex_lock (&ptx_dev->free_blocks_lock);
> +      n->next = ptx_dev->free_blocks;
> +      ptx_dev->free_blocks = n;
> +      pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
> +      return true;
> +    }
> +
>    CUdeviceptr pb;
>    size_t ps;
>  
> @@ -1478,305 +955,6 @@ static bool
>    return true;
>  }
>  
> -
> -static bool
> -nvptx_host2dev (void *d, const void *h, size_t s)
> -{
> -  CUdeviceptr pb;
> -  size_t ps;
> -  struct nvptx_thread *nvthd = nvptx_thread ();
> -
> -  if (!s)
> -    return true;
> -  if (!d)
> -    {
> -      GOMP_PLUGIN_error ("invalid device address");
> -      return false;
> -    }
> -
> -  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
> -
> -  if (!pb)
> -    {
> -      GOMP_PLUGIN_error ("invalid device address");
> -      return false;
> -    }
> -  if (!h)
> -    {
> -      GOMP_PLUGIN_error ("invalid host address");
> -      return false;
> -    }
> -  if (d == h)
> -    {
> -      GOMP_PLUGIN_error ("invalid host or device address");
> -      return false;
> -    }
> -  if ((void *)(d + s) > (void *)(pb + ps))
> -    {
> -      GOMP_PLUGIN_error ("invalid size");
> -      return false;
> -    }
> -
> -#ifndef DISABLE_ASYNC
> -  if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
> -    {
> -      CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
> -      CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
> -      event_gc (false);
> -      CUDA_CALL (cuMemcpyHtoDAsync,
> -		 (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
> -      CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
> -      event_add (PTX_EVT_MEM, e, (void *)h, 0);
> -    }
> -  else
> -#endif
> -    CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
> -
> -  return true;
> -}
> -
> -static bool
> -nvptx_dev2host (void *h, const void *d, size_t s)
> -{
> -  CUdeviceptr pb;
> -  size_t ps;
> -  struct nvptx_thread *nvthd = nvptx_thread ();
> -
> -  if (!s)
> -    return true;
> -  if (!d)
> -    {
> -      GOMP_PLUGIN_error ("invalid device address");
> -      return false;
> -    }
> -
> -  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
> -
> -  if (!pb)
> -    {
> -      GOMP_PLUGIN_error ("invalid device address");
> -      return false;
> -    }
> -  if (!h)
> -    {
> -      GOMP_PLUGIN_error ("invalid host address");
> -      return false;
> -    }
> -  if (d == h)
> -    {
> -      GOMP_PLUGIN_error ("invalid host or device address");
> -      return false;
> -    }
> -  if ((void *)(d + s) > (void *)(pb + ps))
> -    {
> -      GOMP_PLUGIN_error ("invalid size");
> -      return false;
> -    }
> -
> -#ifndef DISABLE_ASYNC
> -  if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
> -    {
> -      CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
> -      CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
> -      event_gc (false);
> -      CUDA_CALL (cuMemcpyDtoHAsync,
> -		 h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
> -      CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
> -      event_add (PTX_EVT_MEM, e, (void *)h, 0);
> -    }
> -  else
> -#endif
> -    CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
> -
> -  return true;
> -}
> -
> -static void
> -nvptx_set_async (int async)
> -{
> -  struct nvptx_thread *nvthd = nvptx_thread ();
> -  nvthd->current_stream
> -    = select_stream_for_async (async, pthread_self (), true, NULL);
> -}
> -
> -static int
> -nvptx_async_test (int async)
> -{
> -  CUresult r;
> -  struct ptx_stream *s;
> -
> -  s = select_stream_for_async (async, pthread_self (), false, NULL);
> -  if (!s)
> -    return 1;
> -
> -  r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
> -  if (r == CUDA_SUCCESS)
> -    {
> -      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
> -	 whether all work has completed on this stream, and if so omits the call
> -	 to the wait hook.  If that happens, event_gc might not get called
> -	 (which prevents variables from getting unmapped and their associated
> -	 device storage freed), so call it here.  */
> -      event_gc (true);
> -      return 1;
> -    }
> -  else if (r == CUDA_ERROR_NOT_READY)
> -    return 0;
> -
> -  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
> -
> -  return 0;
> -}
> -
> -static int
> -nvptx_async_test_all (void)
> -{
> -  struct ptx_stream *s;
> -  pthread_t self = pthread_self ();
> -  struct nvptx_thread *nvthd = nvptx_thread ();
> -
> -  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
> -
> -  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
> -    {
> -      if ((s->multithreaded || pthread_equal (s->host_thread, self))
> -	  && CUDA_CALL_NOCHECK (cuStreamQuery,
> -				s->stream) == CUDA_ERROR_NOT_READY)
> -	{
> -	  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
> -	  return 0;
> -	}
> -    }
> -
> -  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
> -
> -  event_gc (true);
> -
> -  return 1;
> -}
> -
> -static void
> -nvptx_wait (int async)
> -{
> -  struct ptx_stream *s;
> -
> -  s = select_stream_for_async (async, pthread_self (), false, NULL);
> -  if (!s)
> -    return;
> -
> -  CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
> -
> -  event_gc (true);
> -}
> -
> -static void
> -nvptx_wait_async (int async1, int async2)
> -{
> -  CUevent *e;
> -  struct ptx_stream *s1, *s2;
> -  pthread_t self = pthread_self ();
> -
> -  s1 = select_stream_for_async (async1, self, false, NULL);
> -  if (!s1)
> -    return;
> -
> -  /* The stream that is waiting (rather than being waited for) doesn't
> -     necessarily have to exist already.  */
> -  s2 = select_stream_for_async (async2, self, true, NULL);
> -
> -  /* A stream is always synchronized with itself.  */
> -  if (s1 == s2)
> -    return;
> -
> -  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
> -
> -  CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
> -
> -  event_gc (true);
> -
> -  CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
> -
> -  event_add (PTX_EVT_SYNC, e, NULL, 0);
> -
> -  CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
> -}
> -
> -static void
> -nvptx_wait_all (void)
> -{
> -  CUresult r;
> -  struct ptx_stream *s;
> -  pthread_t self = pthread_self ();
> -  struct nvptx_thread *nvthd = nvptx_thread ();
> -
> -  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
> -
> -  /* Wait for active streams initiated by this thread (or by multiple threads)
> -     to complete.  */
> -  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
> -    {
> -      if (s->multithreaded || pthread_equal (s->host_thread, self))
> -	{
> -	  r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
> -	  if (r == CUDA_SUCCESS)
> -	    continue;
> -	  else if (r != CUDA_ERROR_NOT_READY)
> -	    GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
> -
> -	  CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
> -	}
> -    }
> -
> -  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
> -
> -  event_gc (true);
> -}
> -
> -static void
> -nvptx_wait_all_async (int async)
> -{
> -  struct ptx_stream *waiting_stream, *other_stream;
> -  CUevent *e;
> -  struct nvptx_thread *nvthd = nvptx_thread ();
> -  pthread_t self = pthread_self ();
> -
> -  /* The stream doing the waiting.  This could be the first mention of the
> -     stream, so create it if necessary.  */
> -  waiting_stream
> -    = select_stream_for_async (async, pthread_self (), true, NULL);
> -
> -  /* Launches on the null stream already block on other streams in the
> -     context.  */
> -  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
> -    return;
> -
> -  event_gc (true);
> -
> -  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
> -
> -  for (other_stream = nvthd->ptx_dev->active_streams;
> -       other_stream != NULL;
> -       other_stream = other_stream->next)
> -    {
> -      if (!other_stream->multithreaded
> -	  && !pthread_equal (other_stream->host_thread, self))
> -	continue;
> -
> -      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
> -
> -      CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
> -
> -      /* Record an event on the waited-for stream.  */
> -      CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
> -
> -      event_add (PTX_EVT_SYNC, e, NULL, 0);
> -
> -      CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
> -   }
> -
> -  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
> -}
> -
>  static void *
>  nvptx_get_current_cuda_device (void)
>  {
> @@ -1799,75 +977,6 @@ nvptx_get_current_cuda_context (void)
>    return nvthd->ptx_dev->ctx;
>  }
>  
> -static void *
> -nvptx_get_cuda_stream (int async)
> -{
> -  struct ptx_stream *s;
> -  struct nvptx_thread *nvthd = nvptx_thread ();
> -
> -  if (!nvthd || !nvthd->ptx_dev)
> -    return NULL;
> -
> -  s = select_stream_for_async (async, pthread_self (), false, NULL);
> -
> -  return s ? s->stream : NULL;
> -}
> -
> -static int
> -nvptx_set_cuda_stream (int async, void *stream)
> -{
> -  struct ptx_stream *oldstream;
> -  pthread_t self = pthread_self ();
> -  struct nvptx_thread *nvthd = nvptx_thread ();
> -
> -  /* Due to the "null_stream" usage for "acc_async_sync", this cannot be used
> -     to change the stream handle associated with "acc_async_sync".  */
> -  if (async == acc_async_sync)
> -    {
> -      GOMP_PLUGIN_debug (0, "Refusing request to set CUDA stream associated"
> -			 " with \"acc_async_sync\"\n");
> -      return 0;
> -    }
> -
> -  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
> -
> -  /* We have a list of active streams and an array mapping async values to
> -     entries of that list.  We need to take "ownership" of the passed-in stream,
> -     and add it to our list, removing the previous entry also (if there was one)
> -     in order to prevent resource leaks.  Note the potential for surprise
> -     here: maybe we should keep track of passed-in streams and leave it up to
> -     the user to tidy those up, but that doesn't work for stream handles
> -     returned from acc_get_cuda_stream above...  */
> -
> -  oldstream = select_stream_for_async (async, self, false, NULL);
> -
> -  if (oldstream)
> -    {
> -      if (nvthd->ptx_dev->active_streams == oldstream)
> -	nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
> -      else
> -	{
> -	  struct ptx_stream *s = nvthd->ptx_dev->active_streams;
> -	  while (s->next != oldstream)
> -	    s = s->next;
> -	  s->next = s->next->next;
> -	}
> -
> -      CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);
> -
> -      if (!map_fini (oldstream))
> -	GOMP_PLUGIN_fatal ("error when freeing host memory");
> -
> -      free (oldstream);
> -    }
> -
> -  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
> -
> -  (void) select_stream_for_async (async, self, true, (CUstream) stream);
> -
> -  return 1;
> -}
> -
>  /* Plugin entry points.  */
>  
>  const char *
> @@ -2107,6 +1216,23 @@ GOMP_OFFLOAD_alloc (int ord, size_t size)
>  {
>    if (!nvptx_attach_host_thread_to_device (ord))
>      return NULL;
> +
> +  struct ptx_device *ptx_dev = ptx_devices[ord];
> +  struct ptx_free_block *blocks, *tmp;
> +
> +  pthread_mutex_lock (&ptx_dev->free_blocks_lock);
> +  blocks = ptx_dev->free_blocks;
> +  ptx_dev->free_blocks = NULL;
> +  pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
> +
> +  while (blocks)
> +    {
> +      tmp = blocks->next;
> +      nvptx_free (blocks->ptr, ptx_dev);
> +      free (blocks);
> +      blocks = tmp;
> +    }
> +
>    return nvptx_alloc (size);
>  }
>  
> @@ -2114,93 +1240,92 @@ bool
>  GOMP_OFFLOAD_free (int ord, void *ptr)
>  {
>    return (nvptx_attach_host_thread_to_device (ord)
> -	  && nvptx_free (ptr));
> +	  && nvptx_free (ptr, ptx_devices[ord]));
>  }
>  
> -bool
> -GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
> -{
> -  return (nvptx_attach_host_thread_to_device (ord)
> -	  && nvptx_dev2host (dst, src, n));
> -}
> -
> -bool
> -GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
> -{
> -  return (nvptx_attach_host_thread_to_device (ord)
> -	  && nvptx_host2dev (dst, src, n));
> -}
> -
> -bool
> -GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
> -{
> -  struct ptx_device *ptx_dev = ptx_devices[ord];
> -  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
> -				ptx_dev->null_stream->stream);
> -  return true;
> -}
> -
> -void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
> -
>  void
>  GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
>  			   void **hostaddrs, void **devaddrs,
> -			   int async, unsigned *dims, void *targ_mem_desc)
> +			   unsigned *dims, void *targ_mem_desc)
>  {
> -  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
> -}
> +  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
>  
> -void
> -GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
> -{
> -  struct nvptx_thread *nvthd = nvptx_thread ();
> -  CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
> +  void **hp = NULL;
> +  CUdeviceptr dp = 0;
>  
> -  CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
> -  CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
> -  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
> -}
> +  if (mapnum > 0)
> +    {
> +      hp = alloca (mapnum * sizeof (void *));
> +      for (int i = 0; i < mapnum; i++)
> +	hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
> +      CUDA_CALL_ASSERT (cuMemAlloc, &dp, mapnum * sizeof (void *));
> +    }
>  
> -int
> -GOMP_OFFLOAD_openacc_async_test (int async)
> -{
> -  return nvptx_async_test (async);
> -}
> +  /* Copy the (device) pointers to arguments to the device (dp and hp might in
> +     fact have the same value on a unified-memory system).  */
> +  if (mapnum > 0)
> +    CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
> +		      mapnum * sizeof (void *));
>  
> -int
> -GOMP_OFFLOAD_openacc_async_test_all (void)
> -{
> -  return nvptx_async_test_all ();
> -}
> +  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
> +	      dp, NULL);
>  
> -void
> -GOMP_OFFLOAD_openacc_async_wait (int async)
> -{
> -  nvptx_wait (async);
> +  CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
> +  const char *maybe_abort_msg = "(perhaps abort was called)";
> +  if (r == CUDA_ERROR_LAUNCH_FAILED)
> +    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
> +		       maybe_abort_msg);
> +  else if (r != CUDA_SUCCESS)
> +    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
> +  CUDA_CALL_ASSERT (cuMemFree, dp);
>  }
>  
> -void
> -GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
> +static void
> +cuda_free_argmem (void *ptr)
>  {
> -  nvptx_wait_async (async1, async2);
> +  void **block = (void **) ptr;
> +  nvptx_free (block[0], (struct ptx_device *) block[1]);
> +  free (block);
>  }
>  
>  void
> -GOMP_OFFLOAD_openacc_async_wait_all (void)
> +GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
> +				 void **hostaddrs, void **devaddrs,
> +				 unsigned *dims, void *targ_mem_desc,
> +				 struct goacc_asyncqueue *aq)
>  {
> -  nvptx_wait_all ();
> -}
> +  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
>  
> -void
> -GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
> -{
> -  nvptx_wait_all_async (async);
> -}
> +  void **hp = NULL;
> +  CUdeviceptr dp = 0;
> +  void **block = NULL;
>  
> -void
> -GOMP_OFFLOAD_openacc_async_set_async (int async)
> -{
> -  nvptx_set_async (async);
> +  if (mapnum > 0)
> +    {
> +      block = (void **) GOMP_PLUGIN_malloc ((mapnum + 2) * sizeof (void *));
> +      hp = block + 2;
> +      for (int i = 0; i < mapnum; i++)
> +	hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
> +      CUDA_CALL_ASSERT (cuMemAlloc, &dp, mapnum * sizeof (void *));
> +    }
> +
> +  /* Copy the (device) pointers to arguments to the device (dp and hp might in
> +     fact have the same value on a unified-memory system).  */
> +  if (mapnum > 0)
> +    {
> +      CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
> +			mapnum * sizeof (void *), aq->cuda_stream);
> +      block[0] = (void *) dp;
> +
> +      struct nvptx_thread *nvthd =
> +	(struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
> +      block[1] = (void *) nvthd->ptx_dev;
> +    }
> +  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
> +	      dp, aq->cuda_stream);
> +
> +  if (mapnum > 0)
> +    GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
>  }
>  
>  void *
> @@ -2222,7 +1347,6 @@ GOMP_OFFLOAD_openacc_create_thread_data (int ord)
>    if (!thd_ctx)
>      CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
>  
> -  nvthd->current_stream = ptx_dev->null_stream;
>    nvthd->ptx_dev = ptx_dev;
>  
>    return (void *) nvthd;
> @@ -2246,22 +1370,186 @@ GOMP_OFFLOAD_openacc_cuda_get_current_context (voi
>    return nvptx_get_current_cuda_context ();
>  }
>  
> -/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */
> -
> +/* This returns a CUstream.  */
>  void *
> -GOMP_OFFLOAD_openacc_cuda_get_stream (int async)
> +GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
>  {
> -  return nvptx_get_cuda_stream (async);
> +  return (void *) aq->cuda_stream;
>  }
>  
> -/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */
> +/* This takes a CUstream.  */
> +int
> +GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
> +{
> +  if (aq->cuda_stream)
> +    {
> +      CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
> +      CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
> +    }
>  
> +  aq->cuda_stream = (CUstream) stream;
> +  return 1;
> +}
> +
> +struct goacc_asyncqueue *
> +GOMP_OFFLOAD_openacc_async_construct (void)
> +{
> +  CUstream stream = NULL;
> +  CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
> +
> +  struct goacc_asyncqueue *aq
> +    = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
> +  aq->cuda_stream = stream;
> +  return aq;
> +}
> +
> +bool
> +GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
> +{
> +  CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
> +  free (aq);
> +  return true;
> +}
> +
>  int
> -GOMP_OFFLOAD_openacc_cuda_set_stream (int async, void *stream)
> +GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
>  {
> -  return nvptx_set_cuda_stream (async, stream);
> +  CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
> +  if (r == CUDA_SUCCESS)
> +    return 1;
> +  if (r == CUDA_ERROR_NOT_READY)
> +    return 0;
> +
> +  GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
> +  return -1;
>  }
>  
> +bool
> +GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
> +{
> +  CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
> +  return true;
> +}
> +
> +bool
> +GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
> +				      struct goacc_asyncqueue *aq2)
> +{
> +  CUevent e;
> +  CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
> +  CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
> +  CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
> +  return true;
> +}
> +
> +static void
> +cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
> +{
> +  if (res != CUDA_SUCCESS)
> +    GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
> +  struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
> +  cb->fn (cb->ptr);
> +  free (ptr);
> +}
> +
> +void
> +GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
> +					   void (*callback_fn)(void *),
> +					   void *userptr)
> +{
> +  struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
> +  b->fn = callback_fn;
> +  b->ptr = userptr;
> +  b->aq = aq;
> +  CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
> +		    cuda_callback_wrapper, (void *) b, 0);
> +}
> +
> +static bool
> +cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
> +{
> +  CUdeviceptr pb;
> +  size_t ps;
> +  if (!s)
> +    return true;
> +  if (!d)
> +    {
> +      GOMP_PLUGIN_error ("invalid device address");
> +      return false;
> +    }
> +  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
> +  if (!pb)
> +    {
> +      GOMP_PLUGIN_error ("invalid device address");
> +      return false;
> +    }
> +  if (!h)
> +    {
> +      GOMP_PLUGIN_error ("invalid host address");
> +      return false;
> +    }
> +  if (d == h)
> +    {
> +      GOMP_PLUGIN_error ("invalid host or device address");
> +      return false;
> +    }
> +  if ((void *)(d + s) > (void *)(pb + ps))
> +    {
> +      GOMP_PLUGIN_error ("invalid size");
> +      return false;
> +    }
> +  return true;
> +}
> +
> +bool
> +GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
> +{
> +  if (!nvptx_attach_host_thread_to_device (ord)
> +      || !cuda_memcpy_sanity_check (src, dst, n))
> +    return false;
> +  CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
> +  return true;
> +}
> +
> +bool
> +GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
> +{
> +  if (!nvptx_attach_host_thread_to_device (ord)
> +      || !cuda_memcpy_sanity_check (dst, src, n))
> +    return false;
> +  CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
> +  return true;
> +}
> +
> +bool
> +GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
> +{
> +  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
> +  return true;
> +}
> +
> +bool
> +GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
> +				     size_t n, struct goacc_asyncqueue *aq)
> +{
> +  if (!nvptx_attach_host_thread_to_device (ord)
> +      || !cuda_memcpy_sanity_check (src, dst, n))
> +    return false;
> +  CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
> +  return true;
> +}
> +
> +bool
> +GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
> +				     size_t n, struct goacc_asyncqueue *aq)
> +{
> +  if (!nvptx_attach_host_thread_to_device (ord)
> +      || !cuda_memcpy_sanity_check (dst, src, n))
> +    return false;
> +  CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
> +  return true;
> +}
> +
>  /* Adjust launch dimensions: pick good values for number of blocks and warps
>     and ensure that number of warps does not exceed CUDA limits as well as GCC's
>     own limits.  */
> @@ -2360,8 +1648,7 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt
>      CU_LAUNCH_PARAM_END
>    };
>    r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
> -			 32, threads, 1, 0, ptx_dev->null_stream->stream,
> -			 NULL, config);
> +			 32, threads, 1, 0, NULL, NULL, config);
>    if (r != CUDA_SUCCESS)
>      GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
>  
> Index: libgomp/plugin/cuda-lib.def
> ===================================================================
> --- libgomp/plugin/cuda-lib.def	(revision 269183)
> +++ libgomp/plugin/cuda-lib.def	(working copy)
> @@ -42,6 +42,7 @@ CUDA_ONE_CALL (cuModuleLoad)
>  CUDA_ONE_CALL (cuModuleLoadData)
>  CUDA_ONE_CALL (cuModuleUnload)
>  CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize)
> +CUDA_ONE_CALL (cuStreamAddCallback)
>  CUDA_ONE_CALL (cuStreamCreate)
>  CUDA_ONE_CALL (cuStreamDestroy)
>  CUDA_ONE_CALL (cuStreamQuery)
> Index: libgomp/oacc-mem.c
> ===================================================================
> --- libgomp/oacc-mem.c	(revision 269183)
> +++ libgomp/oacc-mem.c	(working copy)
> @@ -172,18 +172,11 @@ memcpy_tofrom_device (bool from, void *d, void *h,
>        return;
>      }
>  
> -  if (async > acc_async_sync)
> -    thr->dev->openacc.async_set_async_func (async);
> -
> -  bool ret = (from
> -	      ? thr->dev->dev2host_func (thr->dev->target_id, h, d, s)
> -	      : thr->dev->host2dev_func (thr->dev->target_id, d, h, s));
> -
> -  if (async > acc_async_sync)
> -    thr->dev->openacc.async_set_async_func (acc_async_sync);
> -
> -  if (!ret)
> -    gomp_fatal ("error in %s", libfnname);
> +  goacc_aq aq = get_goacc_asyncqueue (async);
> +  if (from)
> +    gomp_copy_dev2host (thr->dev, aq, h, d, s);
> +  else
> +    gomp_copy_host2dev (thr->dev, aq, d, h, s, /* TODO: cbuf? */ NULL);
>  }
>  
>  void
> @@ -509,17 +502,13 @@ present_create_copy (unsigned f, void *h, size_t s
>  
>        gomp_mutex_unlock (&acc_dev->lock);
>  
> -      if (async > acc_async_sync)
> -	acc_dev->openacc.async_set_async_func (async);
> +      goacc_aq aq = get_goacc_asyncqueue (async);
>  
> -      tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, NULL, &s, &kinds, true,
> -			   GOMP_MAP_VARS_OPENACC);
> +      tgt = gomp_map_vars_async (acc_dev, aq, mapnum, &hostaddrs, NULL, &s,
> +				 &kinds, true, GOMP_MAP_VARS_OPENACC);
>        /* Initialize dynamic refcount.  */
>        tgt->list[0].key->dynamic_refcount = 1;
>  
> -      if (async > acc_async_sync)
> -	acc_dev->openacc.async_set_async_func (acc_async_sync);
> -
>        gomp_mutex_lock (&acc_dev->lock);
>  
>        d = tgt->to_free;
> @@ -676,13 +665,9 @@ delete_copyout (unsigned f, void *h, size_t s, int
>  
>        if (f & FLAG_COPYOUT)
>  	{
> -	  if (async > acc_async_sync)
> -	    acc_dev->openacc.async_set_async_func (async);
> -	  acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
> -	  if (async > acc_async_sync)
> -	    acc_dev->openacc.async_set_async_func (acc_async_sync);
> +	  goacc_aq aq = get_goacc_asyncqueue (async);
> +	  gomp_copy_dev2host (acc_dev, aq, h, d, s);
>  	}
> -
>        gomp_remove_var (acc_dev, n);
>      }
>  
> @@ -765,17 +750,13 @@ update_dev_host (int is_dev, void *h, size_t s, in
>    d = (void *) (n->tgt->tgt_start + n->tgt_offset
>  		+ (uintptr_t) h - n->host_start);
>  
> -  if (async > acc_async_sync)
> -    acc_dev->openacc.async_set_async_func (async);
> +  goacc_aq aq = get_goacc_asyncqueue (async);
>  
>    if (is_dev)
> -    acc_dev->host2dev_func (acc_dev->target_id, d, h, s);
> +    gomp_copy_host2dev (acc_dev, aq, d, h, s, /* TODO: cbuf? */ NULL);
>    else
> -    acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
> +    gomp_copy_dev2host (acc_dev, aq, h, d, s);
>  
> -  if (async > acc_async_sync)
> -    acc_dev->openacc.async_set_async_func (acc_async_sync);
> -
>    gomp_mutex_unlock (&acc_dev->lock);
>  }
>  
> @@ -805,7 +786,7 @@ acc_update_self_async (void *h, size_t s, int asyn
>  
>  void
>  gomp_acc_insert_pointer (size_t mapnum, void **hostaddrs, size_t *sizes,
> -			 void *kinds)
> +			 void *kinds, int async)
>  {
>    struct target_mem_desc *tgt;
>    struct goacc_thread *thr = goacc_thread ();
> @@ -835,8 +816,9 @@ gomp_acc_insert_pointer (size_t mapnum, void **hos
>      }
>  
>    gomp_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
> -  tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs,
> -		       NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC);
> +  goacc_aq aq = get_goacc_asyncqueue (async);
> +  tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs,
> +			     NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC);
>    gomp_debug (0, "  %s: mappings prepared\n", __FUNCTION__);
>  
>    /* Initialize dynamic refcount.  */
> @@ -930,7 +912,10 @@ gomp_acc_remove_pointer (void *h, size_t s, bool f
>        if (async < acc_async_noval)
>  	gomp_unmap_vars (t, true);
>        else
> -	t->device_descr->openacc.register_async_cleanup_func (t, async);
> +	{
> +	  goacc_aq aq = get_goacc_asyncqueue (async);
> +	  gomp_unmap_vars_async (t, true, aq);
> +	}
>      }
>  
>    gomp_mutex_unlock (&acc_dev->lock);
> Index: libgomp/oacc-parallel.c
> ===================================================================
> --- libgomp/oacc-parallel.c	(revision 269183)
> +++ libgomp/oacc-parallel.c	(working copy)
> @@ -217,8 +217,6 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (voi
>      }
>    va_end (ap);
>    
> -  acc_dev->openacc.async_set_async_func (async);
> -
>    if (!(acc_dev->capabilities & GOMP_OFFLOAD_CAP_NATIVE_EXEC))
>      {
>        k.host_start = (uintptr_t) fn;
> @@ -235,44 +233,29 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (voi
>    else
>      tgt_fn = (void (*)) fn;
>  
> -  tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs, NULL, sizes, kinds, true,
> -		       GOMP_MAP_VARS_OPENACC);
> +  goacc_aq aq = get_goacc_asyncqueue (async);
>  
> +  tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs, NULL, sizes, kinds,
> +			     true, GOMP_MAP_VARS_OPENACC);
> +  
>    devaddrs = gomp_alloca (sizeof (void *) * mapnum);
>    for (i = 0; i < mapnum; i++)
>      devaddrs[i] = (void *) (tgt->list[i].key->tgt->tgt_start
>  			    + tgt->list[i].key->tgt_offset
>  			    + tgt->list[i].offset);
> -
> -  acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
> -			      async, dims, tgt);
> -
> -  /* If running synchronously, unmap immediately.  */
> -  bool copyfrom = true;
> -  if (async_synchronous_p (async))
> -    gomp_unmap_vars (tgt, true);
> +  if (aq == NULL)
> +    {
> +      acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
> +				  dims, tgt);
> +      /* If running synchronously, unmap immediately.  */
> +      gomp_unmap_vars (tgt, true);
> +    }
>    else
>      {
> -      bool async_unmap = false;
> -      for (size_t i = 0; i < tgt->list_count; i++)
> -	{
> -	  splay_tree_key k = tgt->list[i].key;
> -	  if (k && k->refcount == 1)
> -	    {
> -	      async_unmap = true;
> -	      break;
> -	    }
> -	}
> -      if (async_unmap)
> -	tgt->device_descr->openacc.register_async_cleanup_func (tgt, async);
> -      else
> -	{
> -	  copyfrom = false;
> -	  gomp_unmap_vars (tgt, copyfrom);
> -	}
> +      acc_dev->openacc.async.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
> +					dims, tgt, aq);
> +      gomp_unmap_vars_async (tgt, true, aq);
>      }
> -
> -  acc_dev->openacc.async_set_async_func (acc_async_sync);
>  }
>  
>  /* Legacy entry point, only provide host execution.  */
> @@ -383,8 +366,6 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
>  	finalize = true;
>      }
>  
> -  acc_dev->openacc.async_set_async_func (async);
> -
>    /* Determine if this is an "acc enter data".  */
>    for (i = 0; i < mapnum; ++i)
>      {
> @@ -437,11 +418,11 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
>  		{
>  		case GOMP_MAP_ALLOC:
>  		case GOMP_MAP_FORCE_ALLOC:
> -		  acc_create (hostaddrs[i], sizes[i]);
> +		  acc_create_async (hostaddrs[i], sizes[i], async);
>  		  break;
>  		case GOMP_MAP_TO:
>  		case GOMP_MAP_FORCE_TO:
> -		  acc_copyin (hostaddrs[i], sizes[i]);
> +		  acc_copyin_async (hostaddrs[i], sizes[i], async);
>  		  break;
>  		default:
>  		  gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
> @@ -452,7 +433,7 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
>  	  else
>  	    {
>  	      gomp_acc_insert_pointer (pointer, &hostaddrs[i],
> -				       &sizes[i], &kinds[i]);
> +				       &sizes[i], &kinds[i], async);
>  	      /* Increment 'i' by two because OpenACC requires fortran
>  		 arrays to be contiguous, so each PSET is associated with
>  		 one of MAP_FORCE_ALLOC/MAP_FORCE_PRESET/MAP_FORCE_TO, and
> @@ -477,17 +458,17 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
>  		if (acc_is_present (hostaddrs[i], sizes[i]))
>  		  {
>  		    if (finalize)
> -		      acc_delete_finalize (hostaddrs[i], sizes[i]);
> +		      acc_delete_finalize_async (hostaddrs[i], sizes[i], async);
>  		    else
> -		      acc_delete (hostaddrs[i], sizes[i]);
> +		      acc_delete_async (hostaddrs[i], sizes[i], async);
>  		  }
>  		break;
>  	      case GOMP_MAP_FROM:
>  	      case GOMP_MAP_FORCE_FROM:
>  		if (finalize)
> -		  acc_copyout_finalize (hostaddrs[i], sizes[i]);
> +		  acc_copyout_finalize_async (hostaddrs[i], sizes[i], async);
>  		else
> -		  acc_copyout (hostaddrs[i], sizes[i]);
> +		  acc_copyout_async (hostaddrs[i], sizes[i], async);
>  		break;
>  	      default:
>  		gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
> @@ -505,8 +486,6 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
>  	    i += pointer - 1;
>  	  }
>        }
> -
> -  acc_dev->openacc.async_set_async_func (acc_async_sync);
>  }
>  
>  static void
> @@ -532,9 +511,10 @@ goacc_wait (int async, int num_waits, va_list *ap)
>        if (async == acc_async_sync)
>  	acc_wait (qid);
>        else if (qid == async)
> -	;/* If we're waiting on the same asynchronous queue as we're
> -	    launching on, the queue itself will order work as
> -	    required, so there's no need to wait explicitly.  */
> +	/* If we're waiting on the same asynchronous queue as we're
> +	   launching on, the queue itself will order work as
> +	   required, so there's no need to wait explicitly.  */
> +	;
>        else
>  	acc_wait_async (qid, async);
>      }
> @@ -567,8 +547,6 @@ GOACC_update (int flags_m, size_t mapnum,
>        va_end (ap);
>      }
>  
> -  acc_dev->openacc.async_set_async_func (async);
> -
>    bool update_device = false;
>    for (i = 0; i < mapnum; ++i)
>      {
> @@ -591,6 +569,8 @@ GOACC_update (int flags_m, size_t mapnum,
>  		 the value of the allocated device memory in the
>  		 previous pointer.  */
>  	      *(uintptr_t *) hostaddrs[i] = (uintptr_t)dptr;
> +	      /* TODO: verify that we really cannot use acc_update_device_async
> +		 here.  */
>  	      acc_update_device (hostaddrs[i], sizeof (uintptr_t));
>  
>  	      /* Restore the host pointer.  */
> @@ -608,7 +588,7 @@ GOACC_update (int flags_m, size_t mapnum,
>  	  /* Fallthru  */
>  	case GOMP_MAP_FORCE_TO:
>  	  update_device = true;
> -	  acc_update_device (hostaddrs[i], sizes[i]);
> +	  acc_update_device_async (hostaddrs[i], sizes[i], async);
>  	  break;
>  
>  	case GOMP_MAP_FROM:
> @@ -620,7 +600,7 @@ GOACC_update (int flags_m, size_t mapnum,
>  	  /* Fallthru  */
>  	case GOMP_MAP_FORCE_FROM:
>  	  update_device = false;
> -	  acc_update_self (hostaddrs[i], sizes[i]);
> +	  acc_update_self_async (hostaddrs[i], sizes[i], async);
>  	  break;
>  
>  	default:
> @@ -628,8 +608,6 @@ GOACC_update (int flags_m, size_t mapnum,
>  	  break;
>  	}
>      }
> -
> -  acc_dev->openacc.async_set_async_func (acc_async_sync);
>  }
>  
>  void
> Index: libgomp/oacc-init.c
> ===================================================================
> --- libgomp/oacc-init.c	(revision 269183)
> +++ libgomp/oacc-init.c	(working copy)
> @@ -309,7 +309,7 @@ acc_shutdown_1 (acc_device_t d)
>        if (acc_dev->state == GOMP_DEVICE_INITIALIZED)
>          {
>  	  devices_active = true;
> -	  ret &= acc_dev->fini_device_func (acc_dev->target_id);
> +	  ret &= gomp_fini_device (acc_dev);
>  	  acc_dev->state = GOMP_DEVICE_UNINITIALIZED;
>  	}
>        gomp_mutex_unlock (&acc_dev->lock);
> @@ -426,8 +426,6 @@ goacc_attach_host_thread_to_device (int ord)
>    
>    thr->target_tls
>      = acc_dev->openacc.create_thread_data_func (ord);
> -  
> -  acc_dev->openacc.async_set_async_func (acc_async_sync);
>  }
>  
>  /* OpenACC 2.0a (3.2.12, 3.2.13) doesn't specify whether the serialization of
> Index: libgomp/oacc-cuda.c
> ===================================================================
> --- libgomp/oacc-cuda.c	(revision 269183)
> +++ libgomp/oacc-cuda.c	(working copy)
> @@ -30,6 +30,7 @@
>  #include "config.h"
>  #include "libgomp.h"
>  #include "oacc-int.h"
> +#include <assert.h>
>  
>  void *
>  acc_get_current_cuda_device (void)
> @@ -62,7 +63,11 @@ acc_get_cuda_stream (int async)
>      return NULL;
>  
>    if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func)
> -    return thr->dev->openacc.cuda.get_stream_func (async);
> +    {
> +      goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
> +      if (aq)
> +	return thr->dev->openacc.cuda.get_stream_func (aq);
> +    }
>   
>    return NULL;
>  }
> @@ -79,8 +84,23 @@ acc_set_cuda_stream (int async, void *stream)
>  
>    thr = goacc_thread ();
>  
> +  int ret = -1;
>    if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func)
> -    return thr->dev->openacc.cuda.set_stream_func (async, stream);
> +    {
> +      goacc_aq aq = get_goacc_asyncqueue (async);
> +      /* Due to not using an asyncqueue for "acc_async_sync", this cannot be
> +	 used to change the CUDA stream associated with "acc_async_sync".  */
> +      if (!aq)
> +	{
> +	  assert (async == acc_async_sync);
> +	  gomp_debug (0, "Refusing request to set CUDA stream associated"
> +		      " with \"acc_async_sync\"\n");
> +	  return 0;
> +	}
> +      gomp_mutex_lock (&thr->dev->openacc.async.lock);
> +      ret = thr->dev->openacc.cuda.set_stream_func (aq, stream);
> +      gomp_mutex_unlock (&thr->dev->openacc.async.lock);
> +    }
>  
> -  return -1;
> +  return ret;
>  }
> Index: libgomp/target.c
> ===================================================================
> --- libgomp/target.c	(revision 269183)
> +++ libgomp/target.c	(working copy)
> @@ -177,6 +177,22 @@ gomp_device_copy (struct gomp_device_descr *device
>      }
>  }
>  
> +static inline void
> +goacc_device_copy_async (struct gomp_device_descr *devicep,
> +			 bool (*copy_func) (int, void *, const void *, size_t,
> +					    struct goacc_asyncqueue *),
> +			 const char *dst, void *dstaddr,
> +			 const char *src, const void *srcaddr,
> +			 size_t size, struct goacc_asyncqueue *aq)
> +{
> +  if (!copy_func (devicep->target_id, dstaddr, srcaddr, size, aq))
> +    {
> +      gomp_mutex_unlock (&devicep->lock);
> +      gomp_fatal ("Copying of %s object [%p..%p) to %s object [%p..%p) failed",
> +		  src, srcaddr, srcaddr + size, dst, dstaddr, dstaddr + size);
> +    }
> +}
> +
>  /* Infrastructure for coalescing adjacent or nearly adjacent (in device addresses)
>     host to device memory transfers.  */
>  
> @@ -269,8 +285,9 @@ gomp_to_device_kind_p (int kind)
>      }
>  }
>  
> -static void
> +attribute_hidden void
>  gomp_copy_host2dev (struct gomp_device_descr *devicep,
> +		    struct goacc_asyncqueue *aq,
>  		    void *d, const void *h, size_t sz,
>  		    struct gomp_coalesce_buf *cbuf)
>  {
> @@ -299,14 +316,23 @@ gomp_copy_host2dev (struct gomp_device_descr *devi
>  	    }
>  	}
>      }
> -  gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
> +  if (__builtin_expect (aq != NULL, 0))
> +    goacc_device_copy_async (devicep, devicep->openacc.async.host2dev_func,
> +			     "dev", d, "host", h, sz, aq);
> +  else
> +    gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
>  }
>  
> -static void
> +attribute_hidden void
>  gomp_copy_dev2host (struct gomp_device_descr *devicep,
> +		    struct goacc_asyncqueue *aq,
>  		    void *h, const void *d, size_t sz)
>  {
> -  gomp_device_copy (devicep, devicep->dev2host_func, "host", h, "dev", d, sz);
> +  if (__builtin_expect (aq != NULL, 0))
> +    goacc_device_copy_async (devicep, devicep->openacc.async.dev2host_func,
> +			     "host", h, "dev", d, sz, aq);
> +  else
> +    gomp_device_copy (devicep, devicep->dev2host_func, "host", h, "dev", d, sz);
>  }
>  
>  static void
> @@ -324,7 +350,8 @@ gomp_free_device_memory (struct gomp_device_descr
>     Helper function of gomp_map_vars.  */
>  
>  static inline void
> -gomp_map_vars_existing (struct gomp_device_descr *devicep, splay_tree_key oldn,
> +gomp_map_vars_existing (struct gomp_device_descr *devicep,
> +			struct goacc_asyncqueue *aq, splay_tree_key oldn,
>  			splay_tree_key newn, struct target_var_desc *tgt_var,
>  			unsigned char kind, struct gomp_coalesce_buf *cbuf)
>  {
> @@ -346,7 +373,7 @@ static inline void
>      }
>  
>    if (GOMP_MAP_ALWAYS_TO_P (kind))
> -    gomp_copy_host2dev (devicep,
> +    gomp_copy_host2dev (devicep, aq,
>  			(void *) (oldn->tgt->tgt_start + oldn->tgt_offset
>  				  + newn->host_start - oldn->host_start),
>  			(void *) newn->host_start,
> @@ -364,8 +391,8 @@ get_kind (bool short_mapkind, void *kinds, int idx
>  }
>  
>  static void
> -gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr,
> -		  uintptr_t target_offset, uintptr_t bias,
> +gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
> +		  uintptr_t host_ptr, uintptr_t target_offset, uintptr_t bias,
>  		  struct gomp_coalesce_buf *cbuf)
>  {
>    struct gomp_device_descr *devicep = tgt->device_descr;
> @@ -376,7 +403,7 @@ static void
>    if (cur_node.host_start == (uintptr_t) NULL)
>      {
>        cur_node.tgt_offset = (uintptr_t) NULL;
> -      gomp_copy_host2dev (devicep,
> +      gomp_copy_host2dev (devicep, aq,
>  			  (void *) (tgt->tgt_start + target_offset),
>  			  (void *) &cur_node.tgt_offset,
>  			  sizeof (void *), cbuf);
> @@ -398,12 +425,13 @@ static void
>       array section.  Now subtract bias to get what we want
>       to initialize the pointer with.  */
>    cur_node.tgt_offset -= bias;
> -  gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + target_offset),
> +  gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + target_offset),
>  		      (void *) &cur_node.tgt_offset, sizeof (void *), cbuf);
>  }
>  
>  static void
> -gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
> +gomp_map_fields_existing (struct target_mem_desc *tgt,
> +			  struct goacc_asyncqueue *aq, splay_tree_key n,
>  			  size_t first, size_t i, void **hostaddrs,
>  			  size_t *sizes, void *kinds,
>  			  struct gomp_coalesce_buf *cbuf)
> @@ -423,7 +451,7 @@ static void
>        && n2->tgt == n->tgt
>        && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
>      {
> -      gomp_map_vars_existing (devicep, n2, &cur_node,
> +      gomp_map_vars_existing (devicep, aq, n2, &cur_node,
>  			      &tgt->list[i], kind & typemask, cbuf);
>        return;
>      }
> @@ -439,8 +467,8 @@ static void
>  	      && n2->host_start - n->host_start
>  		 == n2->tgt_offset - n->tgt_offset)
>  	    {
> -	      gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
> -				      kind & typemask, cbuf);
> +	      gomp_map_vars_existing (devicep, aq, n2, &cur_node,
> +				      &tgt->list[i], kind & typemask, cbuf);
>  	      return;
>  	    }
>  	}
> @@ -451,7 +479,7 @@ static void
>  	  && n2->tgt == n->tgt
>  	  && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
>  	{
> -	  gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
> +	  gomp_map_vars_existing (devicep, aq, n2, &cur_node, &tgt->list[i],
>  				  kind & typemask, cbuf);
>  	  return;
>  	}
> @@ -483,10 +511,12 @@ gomp_map_val (struct target_mem_desc *tgt, void **
>    return tgt->tgt_start + tgt->list[i].offset;
>  }
>  
> -attribute_hidden struct target_mem_desc *
> -gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
> -	       void **hostaddrs, void **devaddrs, size_t *sizes, void *kinds,
> -	       bool short_mapkind, enum gomp_map_vars_kind pragma_kind)
> +static inline __attribute__((always_inline)) struct target_mem_desc *
> +gomp_map_vars_internal (struct gomp_device_descr *devicep,
> +			struct goacc_asyncqueue *aq, size_t mapnum,
> +			void **hostaddrs, void **devaddrs, size_t *sizes,
> +			void *kinds, bool short_mapkind,
> +			enum gomp_map_vars_kind pragma_kind)
>  {
>    size_t i, tgt_align, tgt_size, not_found_cnt = 0;
>    bool has_firstprivate = false;
> @@ -600,7 +630,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **
>  	      continue;
>  	    }
>  	  for (i = first; i <= last; i++)
> -	    gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
> +	    gomp_map_fields_existing (tgt, aq, n, first, i, hostaddrs,
>  				      sizes, kinds, NULL);
>  	  i--;
>  	  continue;
> @@ -645,7 +675,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **
>        else
>  	n = splay_tree_lookup (mem_map, &cur_node);
>        if (n && n->refcount != REFCOUNT_LINK)
> -	gomp_map_vars_existing (devicep, n, &cur_node, &tgt->list[i],
> +	gomp_map_vars_existing (devicep, aq, n, &cur_node, &tgt->list[i],
>  				kind & typemask, NULL);
>        else
>  	{
> @@ -756,7 +786,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **
>  		tgt_size = (tgt_size + align - 1) & ~(align - 1);
>  		tgt->list[i].offset = tgt_size;
>  		len = sizes[i];
> -		gomp_copy_host2dev (devicep,
> +		gomp_copy_host2dev (devicep, aq,
>  				    (void *) (tgt->tgt_start + tgt_size),
>  				    (void *) hostaddrs[i], len, cbufp);
>  		tgt_size += len;
> @@ -790,7 +820,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **
>  		    continue;
>  		  }
>  		for (i = first; i <= last; i++)
> -		  gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
> +		  gomp_map_fields_existing (tgt, aq, n, first, i, hostaddrs,
>  					    sizes, kinds, cbufp);
>  		i--;
>  		continue;
> @@ -810,7 +840,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **
>  		  cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i - 1);
>  		if (cur_node.tgt_offset)
>  		  cur_node.tgt_offset -= sizes[i];
> -		gomp_copy_host2dev (devicep,
> +		gomp_copy_host2dev (devicep, aq,
>  				    (void *) (n->tgt->tgt_start
>  					      + n->tgt_offset
>  					      + cur_node.host_start
> @@ -831,7 +861,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **
>  	      k->host_end = k->host_start + sizeof (void *);
>  	    splay_tree_key n = splay_tree_lookup (mem_map, k);
>  	    if (n && n->refcount != REFCOUNT_LINK)
> -	      gomp_map_vars_existing (devicep, n, k, &tgt->list[i],
> +	      gomp_map_vars_existing (devicep, aq, n, k, &tgt->list[i],
>  				      kind & typemask, cbufp);
>  	    else
>  	      {
> @@ -884,18 +914,19 @@ gomp_map_val (struct target_mem_desc *tgt, void **
>  		  case GOMP_MAP_FORCE_TOFROM:
>  		  case GOMP_MAP_ALWAYS_TO:
>  		  case GOMP_MAP_ALWAYS_TOFROM:
> -		    gomp_copy_host2dev (devicep,
> +		    gomp_copy_host2dev (devicep, aq,
>  					(void *) (tgt->tgt_start
>  						  + k->tgt_offset),
>  					(void *) k->host_start,
>  					k->host_end - k->host_start, cbufp);
>  		    break;
>  		  case GOMP_MAP_POINTER:
> -		    gomp_map_pointer (tgt, (uintptr_t) *(void **) k->host_start,
> +		    gomp_map_pointer (tgt, aq,
> +				      (uintptr_t) *(void **) k->host_start,
>  				      k->tgt_offset, sizes[i], cbufp);
>  		    break;
>  		  case GOMP_MAP_TO_PSET:
> -		    gomp_copy_host2dev (devicep,
> +		    gomp_copy_host2dev (devicep, aq,
>  					(void *) (tgt->tgt_start
>  						  + k->tgt_offset),
>  					(void *) k->host_start,
> @@ -917,7 +948,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **
>  			  tgt->list[j].always_copy_from = false;
>  			  if (k->refcount != REFCOUNT_INFINITY)
>  			    k->refcount++;
> -			  gomp_map_pointer (tgt,
> +			  gomp_map_pointer (tgt, aq,
>  					    (uintptr_t) *(void **) hostaddrs[j],
>  					    k->tgt_offset
>  					    + ((uintptr_t) hostaddrs[j]
> @@ -946,7 +977,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **
>  		    break;
>  		  case GOMP_MAP_FORCE_DEVICEPTR:
>  		    assert (k->host_end - k->host_start == sizeof (void *));
> -		    gomp_copy_host2dev (devicep,
> +		    gomp_copy_host2dev (devicep, aq,
>  					(void *) (tgt->tgt_start
>  						  + k->tgt_offset),
>  					(void *) k->host_start,
> @@ -965,7 +996,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **
>  		    void *tgt_addr = (void *) (tgt->tgt_start + k->tgt_offset);
>  		    /* We intentionally do not use coalescing here, as it's not
>  		       data allocated by the current call to this function.  */
> -		    gomp_copy_host2dev (devicep, (void *) n->tgt_offset,
> +		    gomp_copy_host2dev (devicep, aq, (void *) n->tgt_offset,
>  					&tgt_addr, sizeof (void *), NULL);
>  		  }
>  		array++;
> @@ -978,7 +1009,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **
>        for (i = 0; i < mapnum; i++)
>  	{
>  	  cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
> -	  gomp_copy_host2dev (devicep,
> +	  gomp_copy_host2dev (devicep, aq,
>  			      (void *) (tgt->tgt_start + i * sizeof (void *)),
>  			      (void *) &cur_node.tgt_offset, sizeof (void *),
>  			      cbufp);
> @@ -989,7 +1020,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **
>      {
>        long c = 0;
>        for (c = 0; c < cbuf.chunk_cnt; ++c)
> -	gomp_copy_host2dev (devicep,
> +	gomp_copy_host2dev (devicep, aq,
>  			    (void *) (tgt->tgt_start + cbuf.chunks[c].start),
>  			    (char *) cbuf.buf + (cbuf.chunks[c].start
>  						 - cbuf.chunks[0].start),
> @@ -1012,7 +1043,27 @@ gomp_map_val (struct target_mem_desc *tgt, void **
>    return tgt;
>  }
>  
> -static void
> +attribute_hidden struct target_mem_desc *
> +gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
> +	       void **hostaddrs, void **devaddrs, size_t *sizes, void *kinds,
> +	       bool short_mapkind, enum gomp_map_vars_kind pragma_kind)
> +{
> +  return gomp_map_vars_internal (devicep, NULL, mapnum, hostaddrs, devaddrs,
> +				 sizes, kinds, short_mapkind, pragma_kind);
> +}
> +
> +attribute_hidden struct target_mem_desc *
> +gomp_map_vars_async (struct gomp_device_descr *devicep,
> +		     struct goacc_asyncqueue *aq, size_t mapnum,
> +		     void **hostaddrs, void **devaddrs, size_t *sizes,
> +		     void *kinds, bool short_mapkind,
> +		     enum gomp_map_vars_kind pragma_kind)
> +{
> +  return gomp_map_vars_internal (devicep, aq, mapnum, hostaddrs, devaddrs,
> +				 sizes, kinds, short_mapkind, pragma_kind);
> +}
> +
> +attribute_hidden void
>  gomp_unmap_tgt (struct target_mem_desc *tgt)
>  {
>    /* Deallocate on target the tgt->tgt_start .. tgt->tgt_end region.  */
> @@ -1040,12 +1091,24 @@ gomp_remove_var (struct gomp_device_descr *devicep
>    return is_tgt_unmapped;
>  }
>  
> +static void
> +gomp_unref_tgt (void *ptr)
> +{
> +  struct target_mem_desc *tgt = (struct target_mem_desc *) ptr;
> +
> +  if (tgt->refcount > 1)
> +    tgt->refcount--;
> +  else
> +    gomp_unmap_tgt (tgt);
> +}
> +
>  /* Unmap variables described by TGT.  If DO_COPYFROM is true, copy relevant
>     variables back from device to host: if it is false, it is assumed that this
>     has been done already.  */
>  
> -attribute_hidden void
> -gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
> +static inline __attribute__((always_inline)) void
> +gomp_unmap_vars_internal (struct target_mem_desc *tgt, bool do_copyfrom,
> +			  struct goacc_asyncqueue *aq)
>  {
>    struct gomp_device_descr *devicep = tgt->device_descr;
>  
> @@ -1082,7 +1145,7 @@ gomp_remove_var (struct gomp_device_descr *devicep
>  
>        if ((do_unmap && do_copyfrom && tgt->list[i].copy_from)
>  	  || tgt->list[i].always_copy_from)
> -	gomp_copy_dev2host (devicep,
> +	gomp_copy_dev2host (devicep, aq,
>  			    (void *) (k->host_start + tgt->list[i].offset),
>  			    (void *) (k->tgt->tgt_start + k->tgt_offset
>  				      + tgt->list[i].offset),
> @@ -1091,14 +1154,28 @@ gomp_remove_var (struct gomp_device_descr *devicep
>  	gomp_remove_var (devicep, k);
>      }
>  
> -  if (tgt->refcount > 1)
> -    tgt->refcount--;
> +  if (aq)
> +    devicep->openacc.async.queue_callback_func (aq, gomp_unref_tgt,
> +						(void *) tgt);
>    else
> -    gomp_unmap_tgt (tgt);
> +    gomp_unref_tgt ((void *) tgt);
>  
>    gomp_mutex_unlock (&devicep->lock);
>  }
>  
> +attribute_hidden void
> +gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
> +{
> +  gomp_unmap_vars_internal (tgt, do_copyfrom, NULL);
> +}
> +
> +attribute_hidden void
> +gomp_unmap_vars_async (struct target_mem_desc *tgt, bool do_copyfrom,
> +		       struct goacc_asyncqueue *aq)
> +{
> +  gomp_unmap_vars_internal (tgt, do_copyfrom, aq);
> +}
> +
>  static void
>  gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs,
>  	     size_t *sizes, void *kinds, bool short_mapkind)
> @@ -1148,9 +1225,10 @@ gomp_update (struct gomp_device_descr *devicep, si
>  	    size_t size = cur_node.host_end - cur_node.host_start;
>  
>  	    if (GOMP_MAP_COPY_TO_P (kind & typemask))
> -	      gomp_copy_host2dev (devicep, devaddr, hostaddr, size, NULL);
> +	      gomp_copy_host2dev (devicep, NULL, devaddr, hostaddr, size,
> +				  NULL);
>  	    if (GOMP_MAP_COPY_FROM_P (kind & typemask))
> -	      gomp_copy_dev2host (devicep, hostaddr, devaddr, size);
> +	      gomp_copy_dev2host (devicep, NULL, hostaddr, devaddr, size);
>  	  }
>        }
>    gomp_mutex_unlock (&devicep->lock);
> @@ -1443,9 +1521,24 @@ gomp_init_device (struct gomp_device_descr *device
>  				   false);
>      }
>  
> +  /* Initialize OpenACC asynchronous queues.  */
> +  goacc_init_asyncqueues (devicep);
> +
>    devicep->state = GOMP_DEVICE_INITIALIZED;
>  }
>  
> +/* This function finalizes the target device, specified by DEVICEP.  DEVICEP
> +   must be locked on entry, and remains locked on return.  */
> +
> +attribute_hidden bool
> +gomp_fini_device (struct gomp_device_descr *devicep)
> +{
> +  bool ret = goacc_fini_asyncqueues (devicep);
> +  ret &= devicep->fini_device_func (devicep->target_id);
> +  devicep->state = GOMP_DEVICE_FINALIZED;
> +  return ret;
> +}
> +
>  attribute_hidden void
>  gomp_unload_device (struct gomp_device_descr *devicep)
>  {
> @@ -1954,7 +2047,7 @@ gomp_exit_data (struct gomp_device_descr *devicep,
>  
>  	  if ((kind == GOMP_MAP_FROM && k->refcount == 0)
>  	      || kind == GOMP_MAP_ALWAYS_FROM)
> -	    gomp_copy_dev2host (devicep, (void *) cur_node.host_start,
> +	    gomp_copy_dev2host (devicep, NULL, (void *) cur_node.host_start,
>  				(void *) (k->tgt->tgt_start + k->tgt_offset
>  					  + cur_node.host_start
>  					  - k->host_start),
> @@ -2636,20 +2729,20 @@ gomp_load_plugin_for_device (struct gomp_device_de
>    if (device->capabilities & GOMP_OFFLOAD_CAP_OPENACC_200)
>      {
>        if (!DLSYM_OPT (openacc.exec, openacc_exec)
> -	  || !DLSYM_OPT (openacc.register_async_cleanup,
> -			 openacc_register_async_cleanup)
> -	  || !DLSYM_OPT (openacc.async_test, openacc_async_test)
> -	  || !DLSYM_OPT (openacc.async_test_all, openacc_async_test_all)
> -	  || !DLSYM_OPT (openacc.async_wait, openacc_async_wait)
> -	  || !DLSYM_OPT (openacc.async_wait_async, openacc_async_wait_async)
> -	  || !DLSYM_OPT (openacc.async_wait_all, openacc_async_wait_all)
> -	  || !DLSYM_OPT (openacc.async_wait_all_async,
> -			 openacc_async_wait_all_async)
> -	  || !DLSYM_OPT (openacc.async_set_async, openacc_async_set_async)
>  	  || !DLSYM_OPT (openacc.create_thread_data,
>  			 openacc_create_thread_data)
>  	  || !DLSYM_OPT (openacc.destroy_thread_data,
> -			 openacc_destroy_thread_data))
> +			 openacc_destroy_thread_data)
> +	  || !DLSYM_OPT (openacc.async.construct, openacc_async_construct)
> +	  || !DLSYM_OPT (openacc.async.destruct, openacc_async_destruct)
> +	  || !DLSYM_OPT (openacc.async.test, openacc_async_test)
> +	  || !DLSYM_OPT (openacc.async.synchronize, openacc_async_synchronize)
> +	  || !DLSYM_OPT (openacc.async.serialize, openacc_async_serialize)
> +	  || !DLSYM_OPT (openacc.async.queue_callback,
> +			 openacc_async_queue_callback)
> +	  || !DLSYM_OPT (openacc.async.exec, openacc_async_exec)
> +	  || !DLSYM_OPT (openacc.async.dev2host, openacc_async_dev2host)
> +	  || !DLSYM_OPT (openacc.async.host2dev, openacc_async_host2dev))
>  	{
>  	  /* Require all the OpenACC handlers if we have
>  	     GOMP_OFFLOAD_CAP_OPENACC_200.  */
> @@ -2700,10 +2793,7 @@ gomp_target_fini (void)
>        struct gomp_device_descr *devicep = &devices[i];
>        gomp_mutex_lock (&devicep->lock);
>        if (devicep->state == GOMP_DEVICE_INITIALIZED)
> -	{
> -	  ret = devicep->fini_device_func (devicep->target_id);
> -	  devicep->state = GOMP_DEVICE_FINALIZED;
> -	}
> +	ret = gomp_fini_device (devicep);
>        gomp_mutex_unlock (&devicep->lock);
>        if (!ret)
>  	gomp_fatal ("device finalization failed");
> Index: libgomp/libgomp.h
> ===================================================================
> --- libgomp/libgomp.h	(revision 269183)
> +++ libgomp/libgomp.h	(working copy)
> @@ -949,25 +949,32 @@ typedef struct acc_dispatch_t
>    /* Execute.  */
>    __typeof (GOMP_OFFLOAD_openacc_exec) *exec_func;
>  
> -  /* Async cleanup callback registration.  */
> -  __typeof (GOMP_OFFLOAD_openacc_register_async_cleanup)
> -    *register_async_cleanup_func;
> -
> -  /* Asynchronous routines.  */
> -  __typeof (GOMP_OFFLOAD_openacc_async_test) *async_test_func;
> -  __typeof (GOMP_OFFLOAD_openacc_async_test_all) *async_test_all_func;
> -  __typeof (GOMP_OFFLOAD_openacc_async_wait) *async_wait_func;
> -  __typeof (GOMP_OFFLOAD_openacc_async_wait_async) *async_wait_async_func;
> -  __typeof (GOMP_OFFLOAD_openacc_async_wait_all) *async_wait_all_func;
> -  __typeof (GOMP_OFFLOAD_openacc_async_wait_all_async)
> -    *async_wait_all_async_func;
> -  __typeof (GOMP_OFFLOAD_openacc_async_set_async) *async_set_async_func;
> -
>    /* Create/destroy TLS data.  */
>    __typeof (GOMP_OFFLOAD_openacc_create_thread_data) *create_thread_data_func;
>    __typeof (GOMP_OFFLOAD_openacc_destroy_thread_data)
>      *destroy_thread_data_func;
> +  
> +  struct {
> +    /* Once created and put into the "active" list, asyncqueues are then never
> +       destructed and removed from the "active" list, other than if the TODO
> +       device is shut down.  */
> +    gomp_mutex_t lock;
> +    int nasyncqueue;
> +    struct goacc_asyncqueue **asyncqueue;
> +    struct goacc_asyncqueue_list *active;
>  
> +    __typeof (GOMP_OFFLOAD_openacc_async_construct) *construct_func;
> +    __typeof (GOMP_OFFLOAD_openacc_async_destruct) *destruct_func;
> +    __typeof (GOMP_OFFLOAD_openacc_async_test) *test_func;
> +    __typeof (GOMP_OFFLOAD_openacc_async_synchronize) *synchronize_func;
> +    __typeof (GOMP_OFFLOAD_openacc_async_serialize) *serialize_func;
> +    __typeof (GOMP_OFFLOAD_openacc_async_queue_callback) *queue_callback_func;
> +
> +    __typeof (GOMP_OFFLOAD_openacc_async_exec) *exec_func;
> +    __typeof (GOMP_OFFLOAD_openacc_async_dev2host) *dev2host_func;
> +    __typeof (GOMP_OFFLOAD_openacc_async_host2dev) *host2dev_func;
> +  } async;
> +
>    /* NVIDIA target specific routines.  */
>    struct {
>      __typeof (GOMP_OFFLOAD_openacc_cuda_get_current_device)
> @@ -1053,17 +1060,33 @@ enum gomp_map_vars_kind
>    GOMP_MAP_VARS_ENTER_DATA
>  };
>  
> -extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *);
> +extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *, int);
>  extern void gomp_acc_remove_pointer (void *, size_t, bool, int, int, int);
>  extern void gomp_acc_declare_allocate (bool, size_t, void **, size_t *,
>  				       unsigned short *);
> +struct gomp_coalesce_buf;
> +extern void gomp_copy_host2dev (struct gomp_device_descr *,
> +				struct goacc_asyncqueue *, void *, const void *,
> +				size_t, struct gomp_coalesce_buf *);
> +extern void gomp_copy_dev2host (struct gomp_device_descr *,
> +				struct goacc_asyncqueue *, void *, const void *,
> +				size_t);
>  
>  extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *,
>  					      size_t, void **, void **,
>  					      size_t *, void *, bool,
>  					      enum gomp_map_vars_kind);
> +extern struct target_mem_desc *gomp_map_vars_async (struct gomp_device_descr *,
> +						    struct goacc_asyncqueue *,
> +						    size_t, void **, void **,
> +						    size_t *, void *, bool,
> +						    enum gomp_map_vars_kind);
> +extern void gomp_unmap_tgt (struct target_mem_desc *);
>  extern void gomp_unmap_vars (struct target_mem_desc *, bool);
> +extern void gomp_unmap_vars_async (struct target_mem_desc *, bool,
> +				   struct goacc_asyncqueue *);
>  extern void gomp_init_device (struct gomp_device_descr *);
> +extern bool gomp_fini_device (struct gomp_device_descr *);
>  extern void gomp_free_memmap (struct splay_tree_s *);
>  extern void gomp_unload_device (struct gomp_device_descr *);
>  extern bool gomp_remove_var (struct gomp_device_descr *, splay_tree_key);
> Index: libgomp/oacc-int.h
> ===================================================================
> --- libgomp/oacc-int.h	(revision 269183)
> +++ libgomp/oacc-int.h	(working copy)
> @@ -99,6 +99,13 @@ void goacc_restore_bind (void);
>  void goacc_lazy_initialize (void);
>  void goacc_host_init (void);
>  
> +void goacc_init_asyncqueues (struct gomp_device_descr *);
> +bool goacc_fini_asyncqueues (struct gomp_device_descr *);
> +void goacc_async_free (struct gomp_device_descr *, struct goacc_asyncqueue *,
> +		       void *);
> +struct goacc_asyncqueue *get_goacc_asyncqueue (int);
> +struct goacc_asyncqueue *lookup_goacc_asyncqueue (struct goacc_thread *, bool,
> +						  int);
>  static inline bool
>  async_valid_stream_id_p (int async)
>  {
> Index: libgomp/oacc-host.c
> ===================================================================
> --- libgomp/oacc-host.c	(revision 269183)
> +++ libgomp/oacc-host.c	(working copy)
> @@ -140,57 +140,91 @@ host_openacc_exec (void (*fn) (void *),
>  		   size_t mapnum __attribute__ ((unused)),
>  		   void **hostaddrs,
>  		   void **devaddrs __attribute__ ((unused)),
> -		   int async __attribute__ ((unused)),
> -		   unsigned *dims __attribute ((unused)),
> +		   unsigned *dims __attribute__ ((unused)),
>  		   void *targ_mem_desc __attribute__ ((unused)))
>  {
>    fn (hostaddrs);
>  }
>  
>  static void
> -host_openacc_register_async_cleanup (void *targ_mem_desc __attribute__ ((unused)),
> -				     int async __attribute__ ((unused)))
> +host_openacc_async_exec (void (*fn) (void *),
> +			 size_t mapnum __attribute__ ((unused)),
> +			 void **hostaddrs,
> +			 void **devaddrs __attribute__ ((unused)),
> +			 unsigned *dims __attribute__ ((unused)),
> +			 void *targ_mem_desc __attribute__ ((unused)),
> +			 struct goacc_asyncqueue *aq __attribute__ ((unused)))
>  {
> +  fn (hostaddrs);
>  }
>  
>  static int
> -host_openacc_async_test (int async __attribute__ ((unused)))
> +host_openacc_async_test (struct goacc_asyncqueue *aq __attribute__ ((unused)))
>  {
>    return 1;
>  }
>  
> -static int
> -host_openacc_async_test_all (void)
> +static bool
> +host_openacc_async_synchronize (struct goacc_asyncqueue *aq
> +				__attribute__ ((unused)))
>  {
> -  return 1;
> +  return true;
>  }
>  
> -static void
> -host_openacc_async_wait (int async __attribute__ ((unused)))
> +static bool
> +host_openacc_async_serialize (struct goacc_asyncqueue *aq1
> +			      __attribute__ ((unused)),
> +			      struct goacc_asyncqueue *aq2
> +			      __attribute__ ((unused)))
>  {
> +  return true;
>  }
>  
> -static void
> -host_openacc_async_wait_async (int async1 __attribute__ ((unused)),
> -			       int async2 __attribute__ ((unused)))
> +static bool
> +host_openacc_async_host2dev (int ord __attribute__ ((unused)),
> +			     void *dst __attribute__ ((unused)),
> +			     const void *src __attribute__ ((unused)),
> +			     size_t n __attribute__ ((unused)),
> +			     struct goacc_asyncqueue *aq
> +			     __attribute__ ((unused)))
>  {
> +  return true;
>  }
>  
> -static void
> -host_openacc_async_wait_all (void)
> +static bool
> +host_openacc_async_dev2host (int ord __attribute__ ((unused)),
> +			     void *dst __attribute__ ((unused)),
> +			     const void *src __attribute__ ((unused)),
> +			     size_t n __attribute__ ((unused)),
> +			     struct goacc_asyncqueue *aq
> +			     __attribute__ ((unused)))
>  {
> +  return true;
>  }
>  
>  static void
> -host_openacc_async_wait_all_async (int async __attribute__ ((unused)))
> +host_openacc_async_queue_callback (struct goacc_asyncqueue *aq
> +				   __attribute__ ((unused)),
> +				   void (*callback_fn)(void *)
> +				   __attribute__ ((unused)),
> +				   void *userptr __attribute__ ((unused)))
>  {
>  }
>  
> -static void
> -host_openacc_async_set_async (int async __attribute__ ((unused)))
> +static struct goacc_asyncqueue *
> +host_openacc_async_construct (void)
>  {
> +  /* Non-NULL 0xffff... value as opaque dummy.  */
> +  return (struct goacc_asyncqueue *) -1;
>  }
>  
> +static bool
> +host_openacc_async_destruct (struct goacc_asyncqueue *aq
> +			     __attribute__ ((unused)))
> +{
> +  return true;
> +}
> +
>  static void *
>  host_openacc_create_thread_data (int ord __attribute__ ((unused)))
>  {
> @@ -235,19 +269,21 @@ static struct gomp_device_descr host_dispatch =
>  
>        .exec_func = host_openacc_exec,
>  
> -      .register_async_cleanup_func = host_openacc_register_async_cleanup,
> -
> -      .async_test_func = host_openacc_async_test,
> -      .async_test_all_func = host_openacc_async_test_all,
> -      .async_wait_func = host_openacc_async_wait,
> -      .async_wait_async_func = host_openacc_async_wait_async,
> -      .async_wait_all_func = host_openacc_async_wait_all,
> -      .async_wait_all_async_func = host_openacc_async_wait_all_async,
> -      .async_set_async_func = host_openacc_async_set_async,
> -
>        .create_thread_data_func = host_openacc_create_thread_data,
>        .destroy_thread_data_func = host_openacc_destroy_thread_data,
>  
> +      .async = {
> +	.construct_func = host_openacc_async_construct,
> +	.destruct_func = host_openacc_async_destruct,
> +	.test_func = host_openacc_async_test,
> +	.synchronize_func = host_openacc_async_synchronize,
> +	.serialize_func = host_openacc_async_serialize,
> +	.queue_callback_func = host_openacc_async_queue_callback,
> +	.exec_func = host_openacc_async_exec,
> +	.dev2host_func = host_openacc_async_dev2host,
> +	.host2dev_func = host_openacc_async_host2dev,
> +      },
> +
>        .cuda = {
>  	.get_current_device_func = NULL,
>  	.get_current_context_func = NULL,
> Index: libgomp/libgomp-plugin.h
> ===================================================================
> --- libgomp/libgomp-plugin.h	(revision 269183)
> +++ libgomp/libgomp-plugin.h	(working copy)
> @@ -53,6 +53,20 @@ enum offload_target_type
>    OFFLOAD_TARGET_TYPE_HSA = 7
>  };
>  
> +/* Opaque type to represent plugin-dependent implementation of an
> +   OpenACC asynchronous queue.  */
> +struct goacc_asyncqueue;
> +
> +/* Used to keep a list of active asynchronous queues.  */
> +struct goacc_asyncqueue_list
> +{
> +  struct goacc_asyncqueue *aq;
> +  struct goacc_asyncqueue_list *next;
> +};
> +
> +typedef struct goacc_asyncqueue *goacc_aq;
> +typedef struct goacc_asyncqueue_list *goacc_aq_list;
> +
>  /* Auxiliary struct, used for transferring pairs of addresses from plugin
>     to libgomp.  */
>  struct addr_pair
> @@ -93,22 +107,31 @@ extern bool GOMP_OFFLOAD_dev2dev (int, void *, con
>  extern bool GOMP_OFFLOAD_can_run (void *);
>  extern void GOMP_OFFLOAD_run (int, void *, void *, void **);
>  extern void GOMP_OFFLOAD_async_run (int, void *, void *, void **, void *);
> +
>  extern void GOMP_OFFLOAD_openacc_exec (void (*) (void *), size_t, void **,
> -				       void **, int, unsigned *, void *);
> -extern void GOMP_OFFLOAD_openacc_register_async_cleanup (void *, int);
> -extern int GOMP_OFFLOAD_openacc_async_test (int);
> -extern int GOMP_OFFLOAD_openacc_async_test_all (void);
> -extern void GOMP_OFFLOAD_openacc_async_wait (int);
> -extern void GOMP_OFFLOAD_openacc_async_wait_async (int, int);
> -extern void GOMP_OFFLOAD_openacc_async_wait_all (void);
> -extern void GOMP_OFFLOAD_openacc_async_wait_all_async (int);
> -extern void GOMP_OFFLOAD_openacc_async_set_async (int);
> +				       void **, unsigned *, void *);
>  extern void *GOMP_OFFLOAD_openacc_create_thread_data (int);
>  extern void GOMP_OFFLOAD_openacc_destroy_thread_data (void *);
> +extern struct goacc_asyncqueue *GOMP_OFFLOAD_openacc_async_construct (void);
> +extern bool GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *);
> +extern int GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *);
> +extern bool GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *);
> +extern bool GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *,
> +						  struct goacc_asyncqueue *);
> +extern void GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *,
> +						       void (*)(void *), void *);
> +extern void GOMP_OFFLOAD_openacc_async_exec (void (*) (void *), size_t, void **,
> +					     void **, unsigned *, void *,
> +					     struct goacc_asyncqueue *);
> +extern bool GOMP_OFFLOAD_openacc_async_dev2host (int, void *, const void *, size_t,
> +						 struct goacc_asyncqueue *);
> +extern bool GOMP_OFFLOAD_openacc_async_host2dev (int, void *, const void *, size_t,
> +						 struct goacc_asyncqueue *);
>  extern void *GOMP_OFFLOAD_openacc_cuda_get_current_device (void);
>  extern void *GOMP_OFFLOAD_openacc_cuda_get_current_context (void);
> -extern void *GOMP_OFFLOAD_openacc_cuda_get_stream (int);
> -extern int GOMP_OFFLOAD_openacc_cuda_set_stream (int, void *);
> +extern void *GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *);
> +extern int GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *,
> +						 void *);
>  
>  #ifdef __cplusplus
>  }
Chung-Lin Tang May 13, 2019, 1:33 p.m. UTC | #2
On 2019/2/26 9:51 PM, Thomas Schwinge wrote:
> On Tue, 26 Feb 2019 01:49:09 +0800, Chung-Lin Tang <chunglin_tang@mentor.com> wrote:
>> I have incorporated all your patches you've included in the last mail (with
>> some modifications, though pretty minor I think).
> OK, thanks, that's good for next GCC development stage 1 as far as I'm
> concerned, and Tom has already approved the libgomp 'nvptx' plugin
> changes, and I suppose you've addressed Jakub's requests -- so you're
> good to go!  :-)

Thanks, Thomas, just committed. I also remembered to add your Reviewed-By tag in the commit log.
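
For the archive, a minimal (hypothetical, not from the testsuite) example of
the user-level OpenACC async API that these changes now back with per-queue
goacc_asyncqueue objects; all routines used here exist in openacc.h, the
rest is illustrative only:

  /* Illustrative sketch; assumes an offloading-enabled libgomp.  */
  #include <openacc.h>
  #include <stdlib.h>

  int
  main (void)
  {
    size_t n = 1024 * sizeof (float);
    float *a = malloc (n);
    float *b = malloc (n);

    /* Each non-negative async value gets its own asyncqueue, created
       lazily by lookup_goacc_asyncqueue.  */
    acc_copyin_async (a, n, 1);
    acc_copyin_async (b, n, 2);

    /* Queue 2 waits for queue 1; internally the serialize_func hook.  */
    acc_wait_async (1, 2);

    /* Synchronize all active queues; internally synchronize_func.  */
    acc_wait_all ();

    acc_delete (a, n);
    acc_delete (b, n);
    free (a);
    free (b);
    return 0;
  }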

Thanks,
Chung-Lin
Thomas Schwinge Dec. 18, 2019, 5:05 p.m. UTC | #3
Hi!

On 2019-05-13T21:33:20+0800, Chung-Lin Tang <chunglin_tang@mentor.com> wrote:
> committed

(... in r271128.)

As an obvious fix, see the attached "Make 'libgomp/target.c:gomp_unmap_tgt'
'static' again"; committed to trunk in r279529.


Grüße
 Thomas

Patch

Index: libgomp/oacc-async.c
===================================================================
--- libgomp/oacc-async.c	(revision 269183)
+++ libgomp/oacc-async.c	(working copy)
@@ -27,49 +27,162 @@ 
    <http://www.gnu.org/licenses/>.  */
 
 #include <assert.h>
+#include <string.h>
 #include "openacc.h"
 #include "libgomp.h"
 #include "oacc-int.h"
 
-int
-acc_async_test (int async)
+static struct goacc_thread *
+get_goacc_thread (void)
 {
-  if (!async_valid_p (async))
-    gomp_fatal ("invalid async argument: %d", async);
-
   struct goacc_thread *thr = goacc_thread ();
 
   if (!thr || !thr->dev)
     gomp_fatal ("no device active");
 
-  return thr->dev->openacc.async_test_func (async);
+  return thr;
 }
 
-int
-acc_async_test_all (void)
+static struct gomp_device_descr *
+get_goacc_thread_device (void)
 {
   struct goacc_thread *thr = goacc_thread ();
 
   if (!thr || !thr->dev)
     gomp_fatal ("no device active");
 
-  return thr->dev->openacc.async_test_all_func ();
+  return thr->dev;
 }
 
-void
-acc_wait (int async)
+static int
+validate_async_val (int async)
 {
   if (!async_valid_p (async))
-    gomp_fatal ("invalid async argument: %d", async);
+    gomp_fatal ("invalid async-argument: %d", async);
 
+  if (async == acc_async_sync)
+    return -1;
+
+  if (async == acc_async_noval)
+    return 0;
+
+  if (async >= 0)
+    /* TODO: we reserve 0 for acc_async_noval before we can clarify the
+       semantics of "default_async".  */
+    return 1 + async;
+  else
+    __builtin_unreachable ();
+}
+
+/* Return the asyncqueue to be used for OpenACC async-argument ASYNC.  This
+   might return NULL if no asyncqueue is to be used.  Otherwise, if CREATE,
+   create the asyncqueue if it doesn't exist yet.  */
+
+attribute_hidden struct goacc_asyncqueue *
+lookup_goacc_asyncqueue (struct goacc_thread *thr, bool create, int async)
+{
+  async = validate_async_val (async);
+  if (async < 0)
+    return NULL;
+
+  struct goacc_asyncqueue *ret_aq = NULL;
+  struct gomp_device_descr *dev = thr->dev;
+
+  gomp_mutex_lock (&dev->openacc.async.lock);
+
+  if (!create
+      && (async >= dev->openacc.async.nasyncqueue
+	  || !dev->openacc.async.asyncqueue[async]))
+    goto end;
+
+  if (async >= dev->openacc.async.nasyncqueue)
+    {
+      int diff = async + 1 - dev->openacc.async.nasyncqueue;
+      dev->openacc.async.asyncqueue
+	= gomp_realloc (dev->openacc.async.asyncqueue,
+			sizeof (goacc_aq) * (async + 1));
+      memset (dev->openacc.async.asyncqueue + dev->openacc.async.nasyncqueue,
+	      0, sizeof (goacc_aq) * diff);
+      dev->openacc.async.nasyncqueue = async + 1;
+    }
+
+  if (!dev->openacc.async.asyncqueue[async])
+    {
+      dev->openacc.async.asyncqueue[async] = dev->openacc.async.construct_func ();
+
+      if (!dev->openacc.async.asyncqueue[async])
+	{
+	  gomp_mutex_unlock (&dev->openacc.async.lock);
+	  gomp_fatal ("async %d creation failed", async);
+	}
+      
+      /* Link new async queue into active list.  */
+      goacc_aq_list n = gomp_malloc (sizeof (struct goacc_asyncqueue_list));
+      n->aq = dev->openacc.async.asyncqueue[async];
+      n->next = dev->openacc.async.active;
+      dev->openacc.async.active = n;
+    }
+
+  ret_aq = dev->openacc.async.asyncqueue[async];
+
+ end:
+  gomp_mutex_unlock (&dev->openacc.async.lock);
+  return ret_aq;
+}
+
+/* Return the asyncqueue to be used for OpenACC async-argument ASYNC.  This
+   might return NULL if no asyncqueue is to be used.  Otherwise, create the
+   asyncqueue if it doesn't exist yet.  */
+
+attribute_hidden struct goacc_asyncqueue *
+get_goacc_asyncqueue (int async)
+{
+  struct goacc_thread *thr = get_goacc_thread ();
+  return lookup_goacc_asyncqueue (thr, true, async);
+}
+
+int
+acc_async_test (int async)
+{
   struct goacc_thread *thr = goacc_thread ();
 
   if (!thr || !thr->dev)
     gomp_fatal ("no device active");
 
-  thr->dev->openacc.async_wait_func (async);
+  goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
+  if (!aq)
+    return 1;
+  else
+    return thr->dev->openacc.async.test_func (aq);
 }
 
+int
+acc_async_test_all (void)
+{
+  struct goacc_thread *thr = get_goacc_thread ();
+
+  int ret = 1;
+  gomp_mutex_lock (&thr->dev->openacc.async.lock);
+  for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
+    if (!thr->dev->openacc.async.test_func (l->aq))
+      {
+	ret = 0;
+	break;
+      }
+  gomp_mutex_unlock (&thr->dev->openacc.async.lock);
+  return ret;
+}
+
+void
+acc_wait (int async)
+{
+  struct goacc_thread *thr = get_goacc_thread ();
+
+  goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
+  if (aq && !thr->dev->openacc.async.synchronize_func (aq))
+    gomp_fatal ("wait on %d failed", async);
+}
+
 /* acc_async_wait is an OpenACC 1.0 compatibility name for acc_wait.  */
 #ifdef HAVE_ATTRIBUTE_ALIAS
 strong_alias (acc_wait, acc_async_wait)
@@ -84,23 +197,46 @@  acc_async_wait (int async)
 void
 acc_wait_async (int async1, int async2)
 {
-  struct goacc_thread *thr = goacc_thread ();
+  struct goacc_thread *thr = get_goacc_thread ();
 
-  if (!thr || !thr->dev)
-    gomp_fatal ("no device active");
+  goacc_aq aq1 = lookup_goacc_asyncqueue (thr, false, async1);
+  /* TODO: Is this also correct for acc_async_sync, assuming that in this case,
+     we'll always be synchronous anyways?  */
+  if (!aq1)
+    return;
 
-  thr->dev->openacc.async_wait_async_func (async1, async2);
+  goacc_aq aq2 = lookup_goacc_asyncqueue (thr, true, async2);
+  /* An async queue is always synchronized with itself.  */
+  if (aq1 == aq2)
+    return;
+
+  if (aq2)
+    {
+      if (!thr->dev->openacc.async.serialize_func (aq1, aq2))
+	gomp_fatal ("ordering of async ids %d and %d failed", async1, async2);
+    }
+  else
+    {
+      /* TODO: Local thread synchronization.
+	 Necessary for the "async2 == acc_async_sync" case, or can just skip?  */
+      if (!thr->dev->openacc.async.synchronize_func (aq1))
+	gomp_fatal ("wait on %d failed", async1);
+    }
 }
 
 void
 acc_wait_all (void)
 {
-  struct goacc_thread *thr = goacc_thread ();
+  struct gomp_device_descr *dev = get_goacc_thread_device ();
 
-  if (!thr || !thr->dev)
-    gomp_fatal ("no device active");
+  bool ret = true;
+  gomp_mutex_lock (&dev->openacc.async.lock);
+  for (goacc_aq_list l = dev->openacc.async.active; l; l = l->next)
+    ret &= dev->openacc.async.synchronize_func (l->aq);
+  gomp_mutex_unlock (&dev->openacc.async.lock);
 
-  thr->dev->openacc.async_wait_all_func ();
+  if (!ret)
+    gomp_fatal ("wait all failed");
 }
 
 /* acc_async_wait_all is an OpenACC 1.0 compatibility name for acc_wait_all.  */
@@ -117,13 +253,73 @@  acc_async_wait_all (void)
 void
 acc_wait_all_async (int async)
 {
-  if (!async_valid_p (async))
-    gomp_fatal ("invalid async argument: %d", async);
+  struct goacc_thread *thr = get_goacc_thread ();
 
-  struct goacc_thread *thr = goacc_thread ();
+  goacc_aq waiting_queue = lookup_goacc_asyncqueue (thr, true, async);
 
-  if (!thr || !thr->dev)
-    gomp_fatal ("no device active");
+  bool ret = true;
+  gomp_mutex_lock (&thr->dev->openacc.async.lock);
+  for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
+    {
+      if (waiting_queue)
+	ret &= thr->dev->openacc.async.serialize_func (l->aq, waiting_queue);
+      else
+	/* TODO: Local thread synchronization.
+	   Necessary for the "async2 == acc_async_sync" case, or can just skip?  */
+	ret &= thr->dev->openacc.async.synchronize_func (l->aq);
+    }
+  gomp_mutex_unlock (&thr->dev->openacc.async.lock);
 
-  thr->dev->openacc.async_wait_all_async_func (async);
+  if (!ret)
+    gomp_fatal ("wait all async(%d) failed", async);
 }
+
+attribute_hidden void
+goacc_async_free (struct gomp_device_descr *devicep,
+		  struct goacc_asyncqueue *aq, void *ptr)
+{
+  if (!aq)
+    free (ptr);
+  else
+    devicep->openacc.async.queue_callback_func (aq, free, ptr);
+}
+
+/* This function initializes the asyncqueues for the device specified by
+   DEVICEP.  TODO DEVICEP must be locked on entry, and remains locked on
+   return.  */
+
+attribute_hidden void
+goacc_init_asyncqueues (struct gomp_device_descr *devicep)
+{
+  devicep->openacc.async.nasyncqueue = 0;
+  devicep->openacc.async.asyncqueue = NULL;
+  devicep->openacc.async.active = NULL;
+  gomp_mutex_init (&devicep->openacc.async.lock);
+}
+
+/* This function finalizes the asyncqueues for the device specified by DEVICEP.
+   TODO DEVICEP must be locked on entry, and remains locked on return.  */
+
+attribute_hidden bool
+goacc_fini_asyncqueues (struct gomp_device_descr *devicep)
+{
+  bool ret = true;
+  gomp_mutex_lock (&devicep->openacc.async.lock);
+  if (devicep->openacc.async.nasyncqueue > 0)
+    {
+      goacc_aq_list next;
+      for (goacc_aq_list l = devicep->openacc.async.active; l; l = next)
+	{
+	  ret &= devicep->openacc.async.destruct_func (l->aq);
+	  next = l->next;
+	  free (l);
+	}
+      free (devicep->openacc.async.asyncqueue);
+      devicep->openacc.async.nasyncqueue = 0;
+      devicep->openacc.async.asyncqueue = NULL;
+      devicep->openacc.async.active = NULL;
+    }
+  gomp_mutex_unlock (&devicep->openacc.async.lock);
+  gomp_mutex_destroy (&devicep->openacc.async.lock);
+  return ret;
+}
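
A minimal sketch (not part of the patch; the "stage_and_copy" helper is
hypothetical) of how the deferred-free pattern above is meant to be used: a
caller stages host data for an asynchronous transfer and hands the staging
buffer to goacc_async_free, so it is released immediately on the synchronous
path, but only once the queue has drained past the copy on the asynchronous
path.

static void
stage_and_copy (struct gomp_device_descr *devicep, struct goacc_asyncqueue *aq,
		void *d, const void *h, size_t s)
{
  /* Snapshot the host data so the caller may reuse H right away.  */
  void *staging = gomp_malloc (s);
  memcpy (staging, h, s);

  /* Enqueue the transfer on AQ, or copy synchronously if AQ is NULL.  */
  if (aq)
    devicep->openacc.async.host2dev_func (devicep->target_id, d, staging, s,
					  aq);
  else
    devicep->host2dev_func (devicep->target_id, d, staging, s);

  /* Freed now on the synchronous path; on the asynchronous path the free is
     registered as a queue callback and runs only after the copy completes.  */
  goacc_async_free (devicep, aq, staging);
}
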
Index: libgomp/oacc-plugin.c
===================================================================
--- libgomp/oacc-plugin.c	(revision 269183)
+++ libgomp/oacc-plugin.c	(working copy)
@@ -30,15 +30,12 @@ 
 #include "oacc-plugin.h"
 #include "oacc-int.h"
 
+/* This plugin function is now obsolete.  */
 void
-GOMP_PLUGIN_async_unmap_vars (void *ptr, int async)
+GOMP_PLUGIN_async_unmap_vars (void *ptr __attribute__((unused)),
+			      int async __attribute__((unused)))
 {
-  struct target_mem_desc *tgt = ptr;
-  struct gomp_device_descr *devicep = tgt->device_descr;
-
-  devicep->openacc.async_set_async_func (async);
-  gomp_unmap_vars (tgt, true);
-  devicep->openacc.async_set_async_func (acc_async_sync);
+  gomp_fatal ("invalid plugin function");
 }
 
 /* Return the target-specific part of the TLS data for the current thread.  */
Index: libgomp/plugin/cuda/cuda.h
===================================================================
--- libgomp/plugin/cuda/cuda.h	(revision 269183)
+++ libgomp/plugin/cuda/cuda.h	(working copy)
@@ -54,7 +54,11 @@  typedef enum {
   CUDA_ERROR_INVALID_CONTEXT = 201,
   CUDA_ERROR_NOT_FOUND = 500,
   CUDA_ERROR_NOT_READY = 600,
-  CUDA_ERROR_LAUNCH_FAILED = 719
+  CUDA_ERROR_LAUNCH_FAILED = 719,
+  CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720,
+  CUDA_ERROR_NOT_PERMITTED = 800,
+  CUDA_ERROR_NOT_SUPPORTED = 801,
+  CUDA_ERROR_UNKNOWN = 999
 } CUresult;
 
 typedef enum {
@@ -173,6 +177,8 @@  CUresult cuModuleLoadData (CUmodule *, const void
 CUresult cuModuleUnload (CUmodule);
 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
 					  CUoccupancyB2DSize, size_t, int);
+typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
+CUresult cuStreamAddCallback(CUstream, CUstreamCallback, void *, unsigned int);
 CUresult cuStreamCreate (CUstream *, unsigned);
 #define cuStreamDestroy cuStreamDestroy_v2
 CUresult cuStreamDestroy (CUstream);
Index: libgomp/plugin/plugin-nvptx.c
===================================================================
--- libgomp/plugin/plugin-nvptx.c	(revision 269183)
+++ libgomp/plugin/plugin-nvptx.c	(working copy)
@@ -192,175 +192,30 @@  cuda_error (CUresult r)
 static unsigned int instantiated_devices = 0;
 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
 
-struct cuda_map
+/* NVPTX/CUDA specific definition of asynchronous queues.  */
+struct goacc_asyncqueue
 {
-  CUdeviceptr d;
-  size_t size;
-  bool active;
-  struct cuda_map *next;
+  CUstream cuda_stream;
 };
 
-struct ptx_stream
+struct nvptx_callback
 {
-  CUstream stream;
-  pthread_t host_thread;
-  bool multithreaded;
-  struct cuda_map *map;
-  struct ptx_stream *next;
+  void (*fn) (void *);
+  void *ptr;
+  struct goacc_asyncqueue *aq;
+  struct nvptx_callback *next;
 };
 
 /* Thread-specific data for PTX.  */
 
 struct nvptx_thread
 {
-  struct ptx_stream *current_stream;
+  /* We currently have this embedded inside the plugin because libgomp manages
+     devices through integer target_ids.  It might be better to use an opaque
+     target-specific pointer obtained directly from gomp_device_descr.  */
   struct ptx_device *ptx_dev;
 };
 
-static struct cuda_map *
-cuda_map_create (size_t size)
-{
-  struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map));
-
-  assert (map);
-
-  map->next = NULL;
-  map->size = size;
-  map->active = false;
-
-  CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size);
-  assert (map->d);
-
-  return map;
-}
-
-static void
-cuda_map_destroy (struct cuda_map *map)
-{
-  if (map->active)
-    /* Possible reasons for the map to be still active:
-       - the associated async kernel might still be running.
-       - the associated async kernel might have finished, but the
-         corresponding event that should trigger the pop_map has not been
-	 processed by event_gc.
-       - the associated sync kernel might have aborted
-
-       The async cases could happen if the user specified an async region
-       without adding a corresponding wait that is guaranteed to be executed
-       (before returning from main, or in an atexit handler).
-       We do not want to deallocate a device pointer that is still being
-       used, so skip it.
-
-       In the sync case, the device pointer is no longer used, but deallocating
-       it using cuMemFree will not succeed, so skip it.
-
-       TODO: Handle this in a more constructive way, by f.i. waiting for streams
-       to finish before de-allocating them (PR88981), or by ensuring the CUDA
-       lib atexit handler is called before rather than after the libgomp plugin
-       atexit handler (PR83795).  */
-    ;
-  else
-    CUDA_CALL_NOCHECK (cuMemFree, map->d);
-
-  free (map);
-}
-
-/* The following map_* routines manage the CUDA device memory that
-   contains the data mapping arguments for cuLaunchKernel.  Each
-   asynchronous PTX stream may have multiple pending kernel
-   invocations, which are launched in a FIFO order.  As such, the map
-   routines maintains a queue of cuLaunchKernel arguments.
-
-   Calls to map_push and map_pop must be guarded by ptx_event_lock.
-   Likewise, calls to map_init and map_fini are guarded by
-   ptx_dev_lock inside GOMP_OFFLOAD_init_device and
-   GOMP_OFFLOAD_fini_device, respectively.  */
-
-static bool
-map_init (struct ptx_stream *s)
-{
-  int size = getpagesize ();
-
-  assert (s);
-
-  s->map = cuda_map_create (size);
-
-  return true;
-}
-
-static bool
-map_fini (struct ptx_stream *s)
-{
-  assert (s->map->next == NULL);
-
-  cuda_map_destroy (s->map);
-
-  return true;
-}
-
-static void
-map_pop (struct ptx_stream *s)
-{
-  struct cuda_map *next;
-
-  assert (s != NULL);
-
-  if (s->map->next == NULL)
-    {
-      s->map->active = false;
-      return;
-    }
-
-  next = s->map->next;
-  cuda_map_destroy (s->map);
-  s->map = next;
-}
-
-static CUdeviceptr
-map_push (struct ptx_stream *s, size_t size)
-{
-  struct cuda_map *map = NULL;
-  struct cuda_map **t;
-
-  assert (s);
-  assert (s->map);
-
-  /* Select an element to push.  */
-  if (s->map->active)
-    map = cuda_map_create (size);
-  else
-    {
-      /* Pop the inactive front element.  */
-      struct cuda_map *pop = s->map;
-      s->map = pop->next;
-      pop->next = NULL;
-
-      if (pop->size < size)
-	{
-	  cuda_map_destroy (pop);
-
-	  map = cuda_map_create (size);
-	}
-      else
-	map = pop;
-    }
-
-  /* Check that the element is as expected.  */
-  assert (map->next == NULL);
-  assert (!map->active);
-
-  /* Mark the element active.  */
-  map->active = true;
-
-  /* Push the element to the back of the list.  */
-  for (t = &s->map; (*t) != NULL; t = &(*t)->next)
-    ;
-  assert (t != NULL && *t == NULL);
-  *t = map;
-
-  return map->d;
-}
-
 /* Target data function launch information.  */
 
 struct targ_fn_launch
@@ -412,22 +267,18 @@  struct ptx_image_data
   struct ptx_image_data *next;
 };
 
+struct ptx_free_block
+{
+  void *ptr;
+  struct ptx_free_block *next;
+};
+
 struct ptx_device
 {
   CUcontext ctx;
   bool ctx_shared;
   CUdevice dev;
-  struct ptx_stream *null_stream;
-  /* All non-null streams associated with this device (actually context),
-     either created implicitly or passed in from the user (via
-     acc_set_cuda_stream).  */
-  struct ptx_stream *active_streams;
-  struct {
-    struct ptx_stream **arr;
-    int size;
-  } async_streams;
-  /* A lock for use when manipulating the above stream list and array.  */
-  pthread_mutex_t stream_lock;
+
   int ord;
   bool overlap;
   bool map;
@@ -445,32 +296,13 @@  struct ptx_device
 
   struct ptx_image_data *images;  /* Images loaded on device.  */
   pthread_mutex_t image_lock;     /* Lock for above list.  */
-  
-  struct ptx_device *next;
-};
 
-enum ptx_event_type
-{
-  PTX_EVT_MEM,
-  PTX_EVT_KNL,
-  PTX_EVT_SYNC,
-  PTX_EVT_ASYNC_CLEANUP
-};
+  struct ptx_free_block *free_blocks;
+  pthread_mutex_t free_blocks_lock;
 
-struct ptx_event
-{
-  CUevent *evt;
-  int type;
-  void *addr;
-  int ord;
-  int val;
-
-  struct ptx_event *next;
+  struct ptx_device *next;
 };
 
-static pthread_mutex_t ptx_event_lock;
-static struct ptx_event *ptx_events;
-
 static struct ptx_device **ptx_devices;
 
 static inline struct nvptx_thread *
@@ -479,193 +311,6 @@  nvptx_thread (void)
   return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
 }
 
-static bool
-init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
-{
-  int i;
-  struct ptx_stream *null_stream
-    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
-
-  null_stream->stream = NULL;
-  null_stream->host_thread = pthread_self ();
-  null_stream->multithreaded = true;
-  if (!map_init (null_stream))
-    return false;
-
-  ptx_dev->null_stream = null_stream;
-  ptx_dev->active_streams = NULL;
-  pthread_mutex_init (&ptx_dev->stream_lock, NULL);
-
-  if (concurrency < 1)
-    concurrency = 1;
-
-  /* This is just a guess -- make space for as many async streams as the
-     current device is capable of concurrently executing.  This can grow
-     later as necessary.  No streams are created yet.  */
-  ptx_dev->async_streams.arr
-    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
-  ptx_dev->async_streams.size = concurrency;
-
-  for (i = 0; i < concurrency; i++)
-    ptx_dev->async_streams.arr[i] = NULL;
-
-  return true;
-}
-
-static bool
-fini_streams_for_device (struct ptx_device *ptx_dev)
-{
-  free (ptx_dev->async_streams.arr);
-
-  bool ret = true;
-  while (ptx_dev->active_streams != NULL)
-    {
-      struct ptx_stream *s = ptx_dev->active_streams;
-      ptx_dev->active_streams = ptx_dev->active_streams->next;
-
-      ret &= map_fini (s);
-
-      CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
-      if (r != CUDA_SUCCESS)
-	{
-	  GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
-	  ret = false;
-	}
-      free (s);
-    }
-
-  ret &= map_fini (ptx_dev->null_stream);
-  free (ptx_dev->null_stream);
-  return ret;
-}
-
-/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
-   thread THREAD (and also current device/context).  If CREATE is true, create
-   the stream if it does not exist (or use EXISTING if it is non-NULL), and
-   associate the stream with the same thread argument.  Returns stream to use
-   as result.  */
-
-static struct ptx_stream *
-select_stream_for_async (int async, pthread_t thread, bool create,
-			 CUstream existing)
-{
-  struct nvptx_thread *nvthd = nvptx_thread ();
-  /* Local copy of TLS variable.  */
-  struct ptx_device *ptx_dev = nvthd->ptx_dev;
-  struct ptx_stream *stream = NULL;
-  int orig_async = async;
-
-  /* The special value acc_async_noval (-1) maps (for now) to an
-     implicitly-created stream, which is then handled the same as any other
-     numbered async stream.  Other options are available, e.g. using the null
-     stream for anonymous async operations, or choosing an idle stream from an
-     active set.  But, stick with this for now.  */
-  if (async > acc_async_sync)
-    async++;
-
-  if (create)
-    pthread_mutex_lock (&ptx_dev->stream_lock);
-
-  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
-     null stream, and in fact better performance may be obtainable if it doesn't
-     (because the null stream enforces overly-strict synchronisation with
-     respect to other streams for legacy reasons, and that's probably not
-     needed with OpenACC).  Maybe investigate later.  */
-  if (async == acc_async_sync)
-    stream = ptx_dev->null_stream;
-  else if (async >= 0 && async < ptx_dev->async_streams.size
-	   && ptx_dev->async_streams.arr[async] && !(create && existing))
-    stream = ptx_dev->async_streams.arr[async];
-  else if (async >= 0 && create)
-    {
-      if (async >= ptx_dev->async_streams.size)
-	{
-	  int i, newsize = ptx_dev->async_streams.size * 2;
-
-	  if (async >= newsize)
-	    newsize = async + 1;
-
-	  ptx_dev->async_streams.arr
-	    = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
-				   newsize * sizeof (struct ptx_stream *));
-
-	  for (i = ptx_dev->async_streams.size; i < newsize; i++)
-	    ptx_dev->async_streams.arr[i] = NULL;
-
-	  ptx_dev->async_streams.size = newsize;
-	}
-
-      /* Create a new stream on-demand if there isn't one already, or if we're
-	 setting a particular async value to an existing (externally-provided)
-	 stream.  */
-      if (!ptx_dev->async_streams.arr[async] || existing)
-        {
-	  CUresult r;
-	  struct ptx_stream *s
-	    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
-
-	  if (existing)
-	    s->stream = existing;
-	  else
-	    {
-	      r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
-				     CU_STREAM_DEFAULT);
-	      if (r != CUDA_SUCCESS)
-		{
-		  pthread_mutex_unlock (&ptx_dev->stream_lock);
-		  GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
-				     cuda_error (r));
-		}
-	    }
-
-	  /* If CREATE is true, we're going to be queueing some work on this
-	     stream.  Associate it with the current host thread.  */
-	  s->host_thread = thread;
-	  s->multithreaded = false;
-
-	  if (!map_init (s))
-	    {
-	      pthread_mutex_unlock (&ptx_dev->stream_lock);
-	      GOMP_PLUGIN_fatal ("map_init fail");
-	    }
-
-	  s->next = ptx_dev->active_streams;
-	  ptx_dev->active_streams = s;
-	  ptx_dev->async_streams.arr[async] = s;
-	}
-
-      stream = ptx_dev->async_streams.arr[async];
-    }
-  else if (async < 0)
-    {
-      if (create)
-	pthread_mutex_unlock (&ptx_dev->stream_lock);
-      GOMP_PLUGIN_fatal ("bad async %d", async);
-    }
-
-  if (create)
-    {
-      assert (stream != NULL);
-
-      /* If we're trying to use the same stream from different threads
-	 simultaneously, set stream->multithreaded to true.  This affects the
-	 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
-	 only wait for asynchronous launches from the same host thread they are
-	 invoked on.  If multiple threads use the same async value, we make note
-	 of that here and fall back to testing/waiting for all threads in those
-	 functions.  */
-      if (thread != stream->host_thread)
-        stream->multithreaded = true;
-
-      pthread_mutex_unlock (&ptx_dev->stream_lock);
-    }
-  else if (stream && !stream->multithreaded
-	   && !pthread_equal (stream->host_thread, thread))
-    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
-
-  return stream;
-}
-
 /* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
    should be locked on entry and remains locked on exit.  */
 
@@ -677,9 +322,6 @@  nvptx_init (void)
   if (instantiated_devices != 0)
     return true;
 
-  ptx_events = NULL;
-  pthread_mutex_init (&ptx_event_lock, NULL);
-
   if (!init_cuda_lib ())
     return false;
 
@@ -703,6 +345,11 @@  nvptx_attach_host_thread_to_device (int n)
   CUcontext thd_ctx;
 
   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
+  if (r == CUDA_ERROR_NOT_PERMITTED)
+    {
+      /* Assume we're in a CUDA callback; just return true.  */
+      return true;
+    }
   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
     {
       GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
@@ -847,8 +494,8 @@  nvptx_open_device (int n)
   ptx_dev->images = NULL;
   pthread_mutex_init (&ptx_dev->image_lock, NULL);
 
-  if (!init_streams_for_device (ptx_dev, async_engines))
-    return NULL;
+  ptx_dev->free_blocks = NULL;
+  pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
 
   return ptx_dev;
 }
@@ -859,9 +506,15 @@  nvptx_close_device (struct ptx_device *ptx_dev)
   if (!ptx_dev)
     return true;
 
-  if (!fini_streams_for_device (ptx_dev))
-    return false;
-  
+  for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
+    {
+      struct ptx_free_block *b_next = b->next;
+      CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
+      free (b);
+      b = b_next;
+    }
+
+  pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
   pthread_mutex_destroy (&ptx_dev->image_lock);
 
   if (!ptx_dev->ctx_shared)
@@ -1041,139 +694,19 @@  link_ptx (CUmodule *module, const struct targ_ptx_
 }
 
 static void
-event_gc (bool memmap_lockable)
-{
-  struct ptx_event *ptx_event = ptx_events;
-  struct ptx_event *async_cleanups = NULL;
-  struct nvptx_thread *nvthd = nvptx_thread ();
-
-  pthread_mutex_lock (&ptx_event_lock);
-
-  while (ptx_event != NULL)
-    {
-      CUresult r;
-      struct ptx_event *e = ptx_event;
-
-      ptx_event = ptx_event->next;
-
-      if (e->ord != nvthd->ptx_dev->ord)
-	continue;
-
-      r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
-      if (r == CUDA_SUCCESS)
-	{
-	  bool append_async = false;
-	  CUevent *te;
-
-	  te = e->evt;
-
-	  switch (e->type)
-	    {
-	    case PTX_EVT_MEM:
-	    case PTX_EVT_SYNC:
-	      break;
-
-	    case PTX_EVT_KNL:
-	      map_pop (e->addr);
-	      break;
-
-	    case PTX_EVT_ASYNC_CLEANUP:
-	      {
-		/* The function gomp_plugin_async_unmap_vars needs to claim the
-		   memory-map splay tree lock for the current device, so we
-		   can't call it when one of our callers has already claimed
-		   the lock.  In that case, just delay the GC for this event
-		   until later.  */
-		if (!memmap_lockable)
-		  continue;
-
-		append_async = true;
-	      }
-	      break;
-	    }
-
-	  CUDA_CALL_NOCHECK (cuEventDestroy, *te);
-	  free ((void *)te);
-
-	  /* Unlink 'e' from ptx_events list.  */
-	  if (ptx_events == e)
-	    ptx_events = ptx_events->next;
-	  else
-	    {
-	      struct ptx_event *e_ = ptx_events;
-	      while (e_->next != e)
-		e_ = e_->next;
-	      e_->next = e_->next->next;
-	    }
-
-	  if (append_async)
-	    {
-	      e->next = async_cleanups;
-	      async_cleanups = e;
-	    }
-	  else
-	    free (e);
-	}
-    }
-
-  pthread_mutex_unlock (&ptx_event_lock);
-
-  /* We have to do these here, after ptx_event_lock is released.  */
-  while (async_cleanups)
-    {
-      struct ptx_event *e = async_cleanups;
-      async_cleanups = async_cleanups->next;
-
-      GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
-      free (e);
-    }
-}
-
-static void
-event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
-{
-  struct ptx_event *ptx_event;
-  struct nvptx_thread *nvthd = nvptx_thread ();
-
-  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
-	  || type == PTX_EVT_ASYNC_CLEANUP);
-
-  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
-  ptx_event->type = type;
-  ptx_event->evt = e;
-  ptx_event->addr = h;
-  ptx_event->ord = nvthd->ptx_dev->ord;
-  ptx_event->val = val;
-
-  pthread_mutex_lock (&ptx_event_lock);
-
-  ptx_event->next = ptx_events;
-  ptx_events = ptx_event;
-
-  pthread_mutex_unlock (&ptx_event_lock);
-}
-
-static void
 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
-	    int async, unsigned *dims, void *targ_mem_desc)
+	    unsigned *dims, void *targ_mem_desc,
+	    CUdeviceptr dp, CUstream stream)
 {
   struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
   CUfunction function;
-  CUresult r;
   int i;
-  struct ptx_stream *dev_str;
   void *kargs[1];
-  void *hp;
-  CUdeviceptr dp = 0;
   struct nvptx_thread *nvthd = nvptx_thread ();
   int warp_size = nvthd->ptx_dev->warp_size;
-  const char *maybe_abort_msg = "(perhaps abort was called)";
 
   function = targ_fn->fn;
 
-  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
-  assert (dev_str == nvthd->current_stream);
-
   /* Initialize the launch dimensions.  Typically this is constant,
      provided by the device compiler, but we must permit runtime
      values.  */
@@ -1361,27 +894,6 @@  nvptx_exec (void (*fn), size_t mapnum, void **host
 			   dims[GOMP_DIM_VECTOR]);
     }
 
-  if (mapnum > 0)
-    {
-      /* This reserves a chunk of a pre-allocated page of memory mapped on both
-	 the host and the device. HP is a host pointer to the new chunk, and DP is
-	 the corresponding device pointer.  */
-      pthread_mutex_lock (&ptx_event_lock);
-      dp = map_push (dev_str, mapnum * sizeof (void *));
-      pthread_mutex_unlock (&ptx_event_lock);
-
-      GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
-
-      /* Copy the array of arguments to the mapped page.  */
-      hp = alloca(sizeof(void *) * mapnum);
-      for (i = 0; i < mapnum; i++)
-	((void **) hp)[i] = devaddrs[i];
-
-      /* Copy the (device) pointers to arguments to the device */
-      CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp,
-			mapnum * sizeof (void *));
-    }
-
   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
 		     " gangs=%u, workers=%u, vectors=%u\n",
 		     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
@@ -1392,62 +904,14 @@  nvptx_exec (void (*fn), size_t mapnum, void **host
   // num_gangs		nctaid.x
   // num_workers	ntid.y
   // vector length	ntid.x
-
   kargs[0] = &dp;
   CUDA_CALL_ASSERT (cuLaunchKernel, function,
 		    dims[GOMP_DIM_GANG], 1, 1,
 		    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
-		    0, dev_str->stream, kargs, 0);
+		    0, stream, kargs, 0);
 
-#ifndef DISABLE_ASYNC
-  if (async < acc_async_noval)
-    {
-      r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
-      if (r == CUDA_ERROR_LAUNCH_FAILED)
-	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
-			   maybe_abort_msg);
-      else if (r != CUDA_SUCCESS)
-        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
-    }
-  else
-    {
-      CUevent *e;
-
-      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
-
-      r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
-      if (r == CUDA_ERROR_LAUNCH_FAILED)
-	GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
-			   maybe_abort_msg);
-      else if (r != CUDA_SUCCESS)
-        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
-
-      event_gc (true);
-
-      CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
-
-      if (mapnum > 0)
-	event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
-    }
-#else
-  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
-  if (r == CUDA_ERROR_LAUNCH_FAILED)
-    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
-		       maybe_abort_msg);
-  else if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
-#endif
-
   GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
 		     targ_fn->launch->fn);
-
-#ifndef DISABLE_ASYNC
-  if (async < acc_async_noval)
-#endif
-    {
-      if (mapnum > 0)
-	map_pop (dev_str);
-    }
 }
 
 void * openacc_get_current_cuda_context (void);
@@ -1462,8 +926,21 @@  nvptx_alloc (size_t s)
 }
 
 static bool
-nvptx_free (void *p)
+nvptx_free (void *p, struct ptx_device *ptx_dev)
 {
+  /* Assume we are in a callback context if there is no OpenACC thread.  */
+  if (GOMP_PLUGIN_acc_thread () == NULL)
+    {
+      struct ptx_free_block *n
+	= GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
+      n->ptr = p;
+      pthread_mutex_lock (&ptx_dev->free_blocks_lock);
+      n->next = ptx_dev->free_blocks;
+      ptx_dev->free_blocks = n;
+      pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
+      return true;
+    }
+
   CUdeviceptr pb;
   size_t ps;
 
@@ -1478,305 +955,6 @@  static bool
   return true;
 }
 
-
-static bool
-nvptx_host2dev (void *d, const void *h, size_t s)
-{
-  CUdeviceptr pb;
-  size_t ps;
-  struct nvptx_thread *nvthd = nvptx_thread ();
-
-  if (!s)
-    return true;
-  if (!d)
-    {
-      GOMP_PLUGIN_error ("invalid device address");
-      return false;
-    }
-
-  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
-
-  if (!pb)
-    {
-      GOMP_PLUGIN_error ("invalid device address");
-      return false;
-    }
-  if (!h)
-    {
-      GOMP_PLUGIN_error ("invalid host address");
-      return false;
-    }
-  if (d == h)
-    {
-      GOMP_PLUGIN_error ("invalid host or device address");
-      return false;
-    }
-  if ((void *)(d + s) > (void *)(pb + ps))
-    {
-      GOMP_PLUGIN_error ("invalid size");
-      return false;
-    }
-
-#ifndef DISABLE_ASYNC
-  if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
-    {
-      CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
-      CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
-      event_gc (false);
-      CUDA_CALL (cuMemcpyHtoDAsync,
-		 (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
-      CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
-      event_add (PTX_EVT_MEM, e, (void *)h, 0);
-    }
-  else
-#endif
-    CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
-
-  return true;
-}
-
-static bool
-nvptx_dev2host (void *h, const void *d, size_t s)
-{
-  CUdeviceptr pb;
-  size_t ps;
-  struct nvptx_thread *nvthd = nvptx_thread ();
-
-  if (!s)
-    return true;
-  if (!d)
-    {
-      GOMP_PLUGIN_error ("invalid device address");
-      return false;
-    }
-
-  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
-
-  if (!pb)
-    {
-      GOMP_PLUGIN_error ("invalid device address");
-      return false;
-    }
-  if (!h)
-    {
-      GOMP_PLUGIN_error ("invalid host address");
-      return false;
-    }
-  if (d == h)
-    {
-      GOMP_PLUGIN_error ("invalid host or device address");
-      return false;
-    }
-  if ((void *)(d + s) > (void *)(pb + ps))
-    {
-      GOMP_PLUGIN_error ("invalid size");
-      return false;
-    }
-
-#ifndef DISABLE_ASYNC
-  if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
-    {
-      CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
-      CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
-      event_gc (false);
-      CUDA_CALL (cuMemcpyDtoHAsync,
-		 h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
-      CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
-      event_add (PTX_EVT_MEM, e, (void *)h, 0);
-    }
-  else
-#endif
-    CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
-
-  return true;
-}
-
-static void
-nvptx_set_async (int async)
-{
-  struct nvptx_thread *nvthd = nvptx_thread ();
-  nvthd->current_stream
-    = select_stream_for_async (async, pthread_self (), true, NULL);
-}
-
-static int
-nvptx_async_test (int async)
-{
-  CUresult r;
-  struct ptx_stream *s;
-
-  s = select_stream_for_async (async, pthread_self (), false, NULL);
-  if (!s)
-    return 1;
-
-  r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
-  if (r == CUDA_SUCCESS)
-    {
-      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
-	 whether all work has completed on this stream, and if so omits the call
-	 to the wait hook.  If that happens, event_gc might not get called
-	 (which prevents variables from getting unmapped and their associated
-	 device storage freed), so call it here.  */
-      event_gc (true);
-      return 1;
-    }
-  else if (r == CUDA_ERROR_NOT_READY)
-    return 0;
-
-  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
-
-  return 0;
-}
-
-static int
-nvptx_async_test_all (void)
-{
-  struct ptx_stream *s;
-  pthread_t self = pthread_self ();
-  struct nvptx_thread *nvthd = nvptx_thread ();
-
-  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
-
-  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
-    {
-      if ((s->multithreaded || pthread_equal (s->host_thread, self))
-	  && CUDA_CALL_NOCHECK (cuStreamQuery,
-				s->stream) == CUDA_ERROR_NOT_READY)
-	{
-	  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
-	  return 0;
-	}
-    }
-
-  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
-
-  event_gc (true);
-
-  return 1;
-}
-
-static void
-nvptx_wait (int async)
-{
-  struct ptx_stream *s;
-
-  s = select_stream_for_async (async, pthread_self (), false, NULL);
-  if (!s)
-    return;
-
-  CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
-
-  event_gc (true);
-}
-
-static void
-nvptx_wait_async (int async1, int async2)
-{
-  CUevent *e;
-  struct ptx_stream *s1, *s2;
-  pthread_t self = pthread_self ();
-
-  s1 = select_stream_for_async (async1, self, false, NULL);
-  if (!s1)
-    return;
-
-  /* The stream that is waiting (rather than being waited for) doesn't
-     necessarily have to exist already.  */
-  s2 = select_stream_for_async (async2, self, true, NULL);
-
-  /* A stream is always synchronized with itself.  */
-  if (s1 == s2)
-    return;
-
-  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
-
-  CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
-
-  event_gc (true);
-
-  CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
-
-  event_add (PTX_EVT_SYNC, e, NULL, 0);
-
-  CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
-}
-
-static void
-nvptx_wait_all (void)
-{
-  CUresult r;
-  struct ptx_stream *s;
-  pthread_t self = pthread_self ();
-  struct nvptx_thread *nvthd = nvptx_thread ();
-
-  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
-
-  /* Wait for active streams initiated by this thread (or by multiple threads)
-     to complete.  */
-  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
-    {
-      if (s->multithreaded || pthread_equal (s->host_thread, self))
-	{
-	  r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
-	  if (r == CUDA_SUCCESS)
-	    continue;
-	  else if (r != CUDA_ERROR_NOT_READY)
-	    GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
-
-	  CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
-	}
-    }
-
-  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
-
-  event_gc (true);
-}
-
-static void
-nvptx_wait_all_async (int async)
-{
-  struct ptx_stream *waiting_stream, *other_stream;
-  CUevent *e;
-  struct nvptx_thread *nvthd = nvptx_thread ();
-  pthread_t self = pthread_self ();
-
-  /* The stream doing the waiting.  This could be the first mention of the
-     stream, so create it if necessary.  */
-  waiting_stream
-    = select_stream_for_async (async, pthread_self (), true, NULL);
-
-  /* Launches on the null stream already block on other streams in the
-     context.  */
-  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
-    return;
-
-  event_gc (true);
-
-  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
-
-  for (other_stream = nvthd->ptx_dev->active_streams;
-       other_stream != NULL;
-       other_stream = other_stream->next)
-    {
-      if (!other_stream->multithreaded
-	  && !pthread_equal (other_stream->host_thread, self))
-	continue;
-
-      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
-
-      CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
-
-      /* Record an event on the waited-for stream.  */
-      CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
-
-      event_add (PTX_EVT_SYNC, e, NULL, 0);
-
-      CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
-   }
-
-  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
-}
-
 static void *
 nvptx_get_current_cuda_device (void)
 {
@@ -1799,75 +977,6 @@  nvptx_get_current_cuda_context (void)
   return nvthd->ptx_dev->ctx;
 }
 
-static void *
-nvptx_get_cuda_stream (int async)
-{
-  struct ptx_stream *s;
-  struct nvptx_thread *nvthd = nvptx_thread ();
-
-  if (!nvthd || !nvthd->ptx_dev)
-    return NULL;
-
-  s = select_stream_for_async (async, pthread_self (), false, NULL);
-
-  return s ? s->stream : NULL;
-}
-
-static int
-nvptx_set_cuda_stream (int async, void *stream)
-{
-  struct ptx_stream *oldstream;
-  pthread_t self = pthread_self ();
-  struct nvptx_thread *nvthd = nvptx_thread ();
-
-  /* Due to the "null_stream" usage for "acc_async_sync", this cannot be used
-     to change the stream handle associated with "acc_async_sync".  */
-  if (async == acc_async_sync)
-    {
-      GOMP_PLUGIN_debug (0, "Refusing request to set CUDA stream associated"
-			 " with \"acc_async_sync\"\n");
-      return 0;
-    }
-
-  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
-
-  /* We have a list of active streams and an array mapping async values to
-     entries of that list.  We need to take "ownership" of the passed-in stream,
-     and add it to our list, removing the previous entry also (if there was one)
-     in order to prevent resource leaks.  Note the potential for surprise
-     here: maybe we should keep track of passed-in streams and leave it up to
-     the user to tidy those up, but that doesn't work for stream handles
-     returned from acc_get_cuda_stream above...  */
-
-  oldstream = select_stream_for_async (async, self, false, NULL);
-
-  if (oldstream)
-    {
-      if (nvthd->ptx_dev->active_streams == oldstream)
-	nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
-      else
-	{
-	  struct ptx_stream *s = nvthd->ptx_dev->active_streams;
-	  while (s->next != oldstream)
-	    s = s->next;
-	  s->next = s->next->next;
-	}
-
-      CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);
-
-      if (!map_fini (oldstream))
-	GOMP_PLUGIN_fatal ("error when freeing host memory");
-
-      free (oldstream);
-    }
-
-  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
-
-  (void) select_stream_for_async (async, self, true, (CUstream) stream);
-
-  return 1;
-}
-
 /* Plugin entry points.  */
 
 const char *
@@ -2107,6 +1216,23 @@  GOMP_OFFLOAD_alloc (int ord, size_t size)
 {
   if (!nvptx_attach_host_thread_to_device (ord))
     return NULL;
+
+  struct ptx_device *ptx_dev = ptx_devices[ord];
+  struct ptx_free_block *blocks, *tmp;
+
+  pthread_mutex_lock (&ptx_dev->free_blocks_lock);
+  blocks = ptx_dev->free_blocks;
+  ptx_dev->free_blocks = NULL;
+  pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
+
+  while (blocks)
+    {
+      tmp = blocks->next;
+      nvptx_free (blocks->ptr, ptx_dev);
+      free (blocks);
+      blocks = tmp;
+    }
+
   return nvptx_alloc (size);
 }
 
@@ -2114,93 +1240,92 @@  bool
 GOMP_OFFLOAD_free (int ord, void *ptr)
 {
   return (nvptx_attach_host_thread_to_device (ord)
-	  && nvptx_free (ptr));
+	  && nvptx_free (ptr, ptx_devices[ord]));
 }
 
-bool
-GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
-{
-  return (nvptx_attach_host_thread_to_device (ord)
-	  && nvptx_dev2host (dst, src, n));
-}
-
-bool
-GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
-{
-  return (nvptx_attach_host_thread_to_device (ord)
-	  && nvptx_host2dev (dst, src, n));
-}
-
-bool
-GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
-{
-  struct ptx_device *ptx_dev = ptx_devices[ord];
-  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
-				ptx_dev->null_stream->stream);
-  return true;
-}
-
-void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
-
 void
 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
 			   void **hostaddrs, void **devaddrs,
-			   int async, unsigned *dims, void *targ_mem_desc)
+			   unsigned *dims, void *targ_mem_desc)
 {
-  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
-}
+  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
 
-void
-GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
-{
-  struct nvptx_thread *nvthd = nvptx_thread ();
-  CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
+  void **hp = NULL;
+  CUdeviceptr dp = 0;
 
-  CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
-  CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
-  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
-}
+  if (mapnum > 0)
+    {
+      hp = alloca (mapnum * sizeof (void *));
+      for (int i = 0; i < mapnum; i++)
+	hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
+      CUDA_CALL_ASSERT (cuMemAlloc, &dp, mapnum * sizeof (void *));
+    }
 
-int
-GOMP_OFFLOAD_openacc_async_test (int async)
-{
-  return nvptx_async_test (async);
-}
+  /* Copy the (device) pointers to arguments to the device (dp and hp might in
+     fact have the same value on a unified-memory system).  */
+  if (mapnum > 0)
+    CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
+		      mapnum * sizeof (void *));
 
-int
-GOMP_OFFLOAD_openacc_async_test_all (void)
-{
-  return nvptx_async_test_all ();
-}
+  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
+	      dp, NULL);
 
-void
-GOMP_OFFLOAD_openacc_async_wait (int async)
-{
-  nvptx_wait (async);
+  CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
+  const char *maybe_abort_msg = "(perhaps abort was called)";
+  if (r == CUDA_ERROR_LAUNCH_FAILED)
+    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
+		       maybe_abort_msg);
+  else if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
+  CUDA_CALL_ASSERT (cuMemFree, dp);
 }
 
-void
-GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
+static void
+cuda_free_argmem (void *ptr)
 {
-  nvptx_wait_async (async1, async2);
+  void **block = (void **) ptr;
+  nvptx_free (block[0], (struct ptx_device *) block[1]);
+  free (block);
 }
 
 void
-GOMP_OFFLOAD_openacc_async_wait_all (void)
+GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
+				 void **hostaddrs, void **devaddrs,
+				 unsigned *dims, void *targ_mem_desc,
+				 struct goacc_asyncqueue *aq)
 {
-  nvptx_wait_all ();
-}
+  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
 
-void
-GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
-{
-  nvptx_wait_all_async (async);
-}
+  void **hp = NULL;
+  CUdeviceptr dp = 0;
+  void **block = NULL;
 
-void
-GOMP_OFFLOAD_openacc_async_set_async (int async)
-{
-  nvptx_set_async (async);
+  if (mapnum > 0)
+    {
+      block = (void **) GOMP_PLUGIN_malloc ((mapnum + 2) * sizeof (void *));
+      hp = block + 2;
+      for (int i = 0; i < mapnum; i++)
+	hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
+      CUDA_CALL_ASSERT (cuMemAlloc, &dp, mapnum * sizeof (void *));
+    }
+
+  /* Copy the (device) pointers to arguments to the device (dp and hp might in
+     fact have the same value on a unified-memory system).  */
+  if (mapnum > 0)
+    {
+      CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
+			mapnum * sizeof (void *), aq->cuda_stream);
+      block[0] = (void *) dp;
+
+      struct nvptx_thread *nvthd =
+	(struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
+      block[1] = (void *) nvthd->ptx_dev;
+    }
+  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
+	      dp, aq->cuda_stream);
+
+  if (mapnum > 0)
+    GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
 }
 
 void *
@@ -2222,7 +1347,6 @@  GOMP_OFFLOAD_openacc_create_thread_data (int ord)
   if (!thd_ctx)
     CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
 
-  nvthd->current_stream = ptx_dev->null_stream;
   nvthd->ptx_dev = ptx_dev;
 
   return (void *) nvthd;
@@ -2246,22 +1370,186 @@  GOMP_OFFLOAD_openacc_cuda_get_current_context (voi
   return nvptx_get_current_cuda_context ();
 }
 
-/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */
-
+/* This returns a CUstream.  */
 void *
-GOMP_OFFLOAD_openacc_cuda_get_stream (int async)
+GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
 {
-  return nvptx_get_cuda_stream (async);
+  return (void *) aq->cuda_stream;
 }
 
-/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */
+/* This takes a CUstream.  */
+int
+GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
+{
+  if (aq->cuda_stream)
+    {
+      CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
+      CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
+    }
 
+  aq->cuda_stream = (CUstream) stream;
+  return 1;
+}
+
+struct goacc_asyncqueue *
+GOMP_OFFLOAD_openacc_async_construct (void)
+{
+  CUstream stream = NULL;
+  CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
+
+  struct goacc_asyncqueue *aq
+    = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
+  aq->cuda_stream = stream;
+  return aq;
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
+{
+  CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
+  free (aq);
+  return true;
+}
+
 int
-GOMP_OFFLOAD_openacc_cuda_set_stream (int async, void *stream)
+GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
 {
-  return nvptx_set_cuda_stream (async, stream);
+  CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
+  if (r == CUDA_SUCCESS)
+    return 1;
+  if (r == CUDA_ERROR_NOT_READY)
+    return 0;
+
+  GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
+  return -1;
 }
 
+bool
+GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
+{
+  CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
+  return true;
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
+				      struct goacc_asyncqueue *aq2)
+{
+  CUevent e;
+  CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
+  CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
+  CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
+  return true;
+}
+
+static void
+cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
+{
+  if (res != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
+  struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
+  cb->fn (cb->ptr);
+  free (ptr);
+}
+
+void
+GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
+					   void (*callback_fn)(void *),
+					   void *userptr)
+{
+  struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
+  b->fn = callback_fn;
+  b->ptr = userptr;
+  b->aq = aq;
+  CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
+		    cuda_callback_wrapper, (void *) b, 0);
+}
+
+static bool
+cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
+{
+  CUdeviceptr pb;
+  size_t ps;
+  if (!s)
+    return true;
+  if (!d)
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
+  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
+  if (!pb)
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
+  if (!h)
+    {
+      GOMP_PLUGIN_error ("invalid host address");
+      return false;
+    }
+  if (d == h)
+    {
+      GOMP_PLUGIN_error ("invalid host or device address");
+      return false;
+    }
+  if ((void *)(d + s) > (void *)(pb + ps))
+    {
+      GOMP_PLUGIN_error ("invalid size");
+      return false;
+    }
+  return true;
+}
+
+bool
+GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
+{
+  if (!nvptx_attach_host_thread_to_device (ord)
+      || !cuda_memcpy_sanity_check (src, dst, n))
+    return false;
+  CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
+  return true;
+}
+
+bool
+GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
+{
+  if (!nvptx_attach_host_thread_to_device (ord)
+      || !cuda_memcpy_sanity_check (dst, src, n))
+    return false;
+  CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
+  return true;
+}
+
+bool
+GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
+{
+  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
+  return true;
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
+				     size_t n, struct goacc_asyncqueue *aq)
+{
+  if (!nvptx_attach_host_thread_to_device (ord)
+      || !cuda_memcpy_sanity_check (src, dst, n))
+    return false;
+  CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
+  return true;
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
+				     size_t n, struct goacc_asyncqueue *aq)
+{
+  if (!nvptx_attach_host_thread_to_device (ord)
+      || !cuda_memcpy_sanity_check (dst, src, n))
+    return false;
+  CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
+  return true;
+}
+
 /* Adjust launch dimensions: pick good values for number of blocks and warps
    and ensure that number of warps does not exceed CUDA limits as well as GCC's
    own limits.  */
@@ -2360,8 +1648,7 @@  GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt
     CU_LAUNCH_PARAM_END
   };
   r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
-			 32, threads, 1, 0, ptx_dev->null_stream->stream,
-			 NULL, config);
+			 32, threads, 1, 0, NULL, NULL, config);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
 
Index: libgomp/plugin/cuda-lib.def
===================================================================
--- libgomp/plugin/cuda-lib.def	(revision 269183)
+++ libgomp/plugin/cuda-lib.def	(working copy)
@@ -42,6 +42,7 @@  CUDA_ONE_CALL (cuModuleLoad)
 CUDA_ONE_CALL (cuModuleLoadData)
 CUDA_ONE_CALL (cuModuleUnload)
 CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize)
+CUDA_ONE_CALL (cuStreamAddCallback)
 CUDA_ONE_CALL (cuStreamCreate)
 CUDA_ONE_CALL (cuStreamDestroy)
 CUDA_ONE_CALL (cuStreamQuery)
Index: libgomp/oacc-mem.c
===================================================================
--- libgomp/oacc-mem.c	(revision 269183)
+++ libgomp/oacc-mem.c	(working copy)
@@ -172,18 +172,11 @@  memcpy_tofrom_device (bool from, void *d, void *h,
       return;
     }
 
-  if (async > acc_async_sync)
-    thr->dev->openacc.async_set_async_func (async);
-
-  bool ret = (from
-	      ? thr->dev->dev2host_func (thr->dev->target_id, h, d, s)
-	      : thr->dev->host2dev_func (thr->dev->target_id, d, h, s));
-
-  if (async > acc_async_sync)
-    thr->dev->openacc.async_set_async_func (acc_async_sync);
-
-  if (!ret)
-    gomp_fatal ("error in %s", libfnname);
+  goacc_aq aq = get_goacc_asyncqueue (async);
+  if (from)
+    gomp_copy_dev2host (thr->dev, aq, h, d, s);
+  else
+    gomp_copy_host2dev (thr->dev, aq, d, h, s, /* TODO: cbuf? */ NULL);
 }
 
 void
@@ -509,17 +502,13 @@  present_create_copy (unsigned f, void *h, size_t s
 
       gomp_mutex_unlock (&acc_dev->lock);
 
-      if (async > acc_async_sync)
-	acc_dev->openacc.async_set_async_func (async);
+      goacc_aq aq = get_goacc_asyncqueue (async);
 
-      tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, NULL, &s, &kinds, true,
-			   GOMP_MAP_VARS_OPENACC);
+      tgt = gomp_map_vars_async (acc_dev, aq, mapnum, &hostaddrs, NULL, &s,
+				 &kinds, true, GOMP_MAP_VARS_OPENACC);
       /* Initialize dynamic refcount.  */
       tgt->list[0].key->dynamic_refcount = 1;
 
-      if (async > acc_async_sync)
-	acc_dev->openacc.async_set_async_func (acc_async_sync);
-
       gomp_mutex_lock (&acc_dev->lock);
 
       d = tgt->to_free;
@@ -676,13 +665,9 @@  delete_copyout (unsigned f, void *h, size_t s, int
 
       if (f & FLAG_COPYOUT)
 	{
-	  if (async > acc_async_sync)
-	    acc_dev->openacc.async_set_async_func (async);
-	  acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
-	  if (async > acc_async_sync)
-	    acc_dev->openacc.async_set_async_func (acc_async_sync);
+	  goacc_aq aq = get_goacc_asyncqueue (async);
+	  gomp_copy_dev2host (acc_dev, aq, h, d, s);
 	}
-
       gomp_remove_var (acc_dev, n);
     }
 
@@ -765,17 +750,13 @@  update_dev_host (int is_dev, void *h, size_t s, in
   d = (void *) (n->tgt->tgt_start + n->tgt_offset
 		+ (uintptr_t) h - n->host_start);
 
-  if (async > acc_async_sync)
-    acc_dev->openacc.async_set_async_func (async);
+  goacc_aq aq = get_goacc_asyncqueue (async);
 
   if (is_dev)
-    acc_dev->host2dev_func (acc_dev->target_id, d, h, s);
+    gomp_copy_host2dev (acc_dev, aq, d, h, s, /* TODO: cbuf? */ NULL);
   else
-    acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
+    gomp_copy_dev2host (acc_dev, aq, h, d, s);
 
-  if (async > acc_async_sync)
-    acc_dev->openacc.async_set_async_func (acc_async_sync);
-
   gomp_mutex_unlock (&acc_dev->lock);
 }
 
@@ -805,7 +786,7 @@  acc_update_self_async (void *h, size_t s, int asyn
 
 void
 gomp_acc_insert_pointer (size_t mapnum, void **hostaddrs, size_t *sizes,
-			 void *kinds)
+			 void *kinds, int async)
 {
   struct target_mem_desc *tgt;
   struct goacc_thread *thr = goacc_thread ();
@@ -835,8 +816,9 @@  gomp_acc_insert_pointer (size_t mapnum, void **hos
     }
 
   gomp_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
-  tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs,
-		       NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC);
+  goacc_aq aq = get_goacc_asyncqueue (async);
+  tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs,
+			     NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC);
   gomp_debug (0, "  %s: mappings prepared\n", __FUNCTION__);
 
   /* Initialize dynamic refcount.  */
@@ -930,7 +912,10 @@  gomp_acc_remove_pointer (void *h, size_t s, bool f
       if (async < acc_async_noval)
 	gomp_unmap_vars (t, true);
       else
-	t->device_descr->openacc.register_async_cleanup_func (t, async);
+	{
+	  goacc_aq aq = get_goacc_asyncqueue (async);
+	  gomp_unmap_vars_async (t, true, aq);
+	}
     }
 
   gomp_mutex_unlock (&acc_dev->lock);
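
A minimal sketch (not part of the patch; "finish_data_region" is hypothetical)
of the sync-versus-async teardown shape this series converges on, as in
GOACC_parallel_keyed below: look up the asyncqueue for the async-argument,
unmap immediately when there is none, and otherwise enqueue the unmap on that
queue.

static void
finish_data_region (struct target_mem_desc *tgt, int async)
{
  goacc_aq aq = get_goacc_asyncqueue (async);
  if (!aq)
    /* Synchronous: copy back and unmap before returning to the user.  */
    gomp_unmap_vars (tgt, true);
  else
    /* Asynchronous: the unmap, and any device-to-host copies it implies,
       are enqueued on AQ and this call returns immediately.  */
    gomp_unmap_vars_async (tgt, true, aq);
}
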
Index: libgomp/oacc-parallel.c
===================================================================
--- libgomp/oacc-parallel.c	(revision 269183)
+++ libgomp/oacc-parallel.c	(working copy)
@@ -217,8 +217,6 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (voi
     }
   va_end (ap);
   
-  acc_dev->openacc.async_set_async_func (async);
-
   if (!(acc_dev->capabilities & GOMP_OFFLOAD_CAP_NATIVE_EXEC))
     {
       k.host_start = (uintptr_t) fn;
@@ -235,44 +233,29 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (voi
   else
     tgt_fn = (void (*)) fn;
 
-  tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs, NULL, sizes, kinds, true,
-		       GOMP_MAP_VARS_OPENACC);
+  goacc_aq aq = get_goacc_asyncqueue (async);
 
+  tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs, NULL, sizes, kinds,
+			     true, GOMP_MAP_VARS_OPENACC);
+  
   devaddrs = gomp_alloca (sizeof (void *) * mapnum);
   for (i = 0; i < mapnum; i++)
     devaddrs[i] = (void *) (tgt->list[i].key->tgt->tgt_start
 			    + tgt->list[i].key->tgt_offset
 			    + tgt->list[i].offset);
-
-  acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
-			      async, dims, tgt);
-
-  /* If running synchronously, unmap immediately.  */
-  bool copyfrom = true;
-  if (async_synchronous_p (async))
-    gomp_unmap_vars (tgt, true);
+  if (aq == NULL)
+    {
+      acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
+				  dims, tgt);
+      /* If running synchronously, unmap immediately.  */
+      gomp_unmap_vars (tgt, true);
+    }
   else
     {
-      bool async_unmap = false;
-      for (size_t i = 0; i < tgt->list_count; i++)
-	{
-	  splay_tree_key k = tgt->list[i].key;
-	  if (k && k->refcount == 1)
-	    {
-	      async_unmap = true;
-	      break;
-	    }
-	}
-      if (async_unmap)
-	tgt->device_descr->openacc.register_async_cleanup_func (tgt, async);
-      else
-	{
-	  copyfrom = false;
-	  gomp_unmap_vars (tgt, copyfrom);
-	}
+      acc_dev->openacc.async.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
+					dims, tgt, aq);
+      gomp_unmap_vars_async (tgt, true, aq);
     }
-
-  acc_dev->openacc.async_set_async_func (acc_async_sync);
 }
 
 /* Legacy entry point, only provide host execution.  */
@@ -383,8 +366,6 @@  GOACC_enter_exit_data (int flags_m, size_t mapnum,
 	finalize = true;
     }
 
-  acc_dev->openacc.async_set_async_func (async);
-
   /* Determine if this is an "acc enter data".  */
   for (i = 0; i < mapnum; ++i)
     {
@@ -437,11 +418,11 @@  GOACC_enter_exit_data (int flags_m, size_t mapnum,
 		{
 		case GOMP_MAP_ALLOC:
 		case GOMP_MAP_FORCE_ALLOC:
-		  acc_create (hostaddrs[i], sizes[i]);
+		  acc_create_async (hostaddrs[i], sizes[i], async);
 		  break;
 		case GOMP_MAP_TO:
 		case GOMP_MAP_FORCE_TO:
-		  acc_copyin (hostaddrs[i], sizes[i]);
+		  acc_copyin_async (hostaddrs[i], sizes[i], async);
 		  break;
 		default:
 		  gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
@@ -452,7 +433,7 @@  GOACC_enter_exit_data (int flags_m, size_t mapnum,
 	  else
 	    {
 	      gomp_acc_insert_pointer (pointer, &hostaddrs[i],
-				       &sizes[i], &kinds[i]);
+				       &sizes[i], &kinds[i], async);
 	      /* Increment 'i' by two because OpenACC requires fortran
 		 arrays to be contiguous, so each PSET is associated with
 		 one of MAP_FORCE_ALLOC/MAP_FORCE_PRESET/MAP_FORCE_TO, and
@@ -477,17 +458,17 @@  GOACC_enter_exit_data (int flags_m, size_t mapnum,
 		if (acc_is_present (hostaddrs[i], sizes[i]))
 		  {
 		    if (finalize)
-		      acc_delete_finalize (hostaddrs[i], sizes[i]);
+		      acc_delete_finalize_async (hostaddrs[i], sizes[i], async);
 		    else
-		      acc_delete (hostaddrs[i], sizes[i]);
+		      acc_delete_async (hostaddrs[i], sizes[i], async);
 		  }
 		break;
 	      case GOMP_MAP_FROM:
 	      case GOMP_MAP_FORCE_FROM:
 		if (finalize)
-		  acc_copyout_finalize (hostaddrs[i], sizes[i]);
+		  acc_copyout_finalize_async (hostaddrs[i], sizes[i], async);
 		else
-		  acc_copyout (hostaddrs[i], sizes[i]);
+		  acc_copyout_async (hostaddrs[i], sizes[i], async);
 		break;
 	      default:
 		gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
@@ -505,8 +486,6 @@  GOACC_enter_exit_data (int flags_m, size_t mapnum,
 	    i += pointer - 1;
 	  }
       }
-
-  acc_dev->openacc.async_set_async_func (acc_async_sync);
 }
 
 static void
@@ -532,9 +511,10 @@  goacc_wait (int async, int num_waits, va_list *ap)
       if (async == acc_async_sync)
 	acc_wait (qid);
       else if (qid == async)
-	;/* If we're waiting on the same asynchronous queue as we're
-	    launching on, the queue itself will order work as
-	    required, so there's no need to wait explicitly.  */
+	/* If we're waiting on the same asynchronous queue as we're
+	   launching on, the queue itself will order work as
+	   required, so there's no need to wait explicitly.  */
+	;
       else
 	acc_wait_async (qid, async);
     }
@@ -567,8 +547,6 @@  GOACC_update (int flags_m, size_t mapnum,
       va_end (ap);
     }
 
-  acc_dev->openacc.async_set_async_func (async);
-
   bool update_device = false;
   for (i = 0; i < mapnum; ++i)
     {
@@ -591,6 +569,8 @@  GOACC_update (int flags_m, size_t mapnum,
 		 the value of the allocated device memory in the
 		 previous pointer.  */
 	      *(uintptr_t *) hostaddrs[i] = (uintptr_t)dptr;
+	      /* TODO: verify that we really cannot use acc_update_device_async
+		 here.  */
 	      acc_update_device (hostaddrs[i], sizeof (uintptr_t));
 
 	      /* Restore the host pointer.  */
@@ -608,7 +588,7 @@  GOACC_update (int flags_m, size_t mapnum,
 	  /* Fallthru  */
 	case GOMP_MAP_FORCE_TO:
 	  update_device = true;
-	  acc_update_device (hostaddrs[i], sizes[i]);
+	  acc_update_device_async (hostaddrs[i], sizes[i], async);
 	  break;
 
 	case GOMP_MAP_FROM:
@@ -620,7 +600,7 @@  GOACC_update (int flags_m, size_t mapnum,
 	  /* Fallthru  */
 	case GOMP_MAP_FORCE_FROM:
 	  update_device = false;
-	  acc_update_self (hostaddrs[i], sizes[i]);
+	  acc_update_self_async (hostaddrs[i], sizes[i], async);
 	  break;
 
 	default:
@@ -628,8 +608,6 @@  GOACC_update (int flags_m, size_t mapnum,
 	  break;
 	}
     }
-
-  acc_dev->openacc.async_set_async_func (acc_async_sync);
 }
 
 void
Index: libgomp/oacc-init.c
===================================================================
--- libgomp/oacc-init.c	(revision 269183)
+++ libgomp/oacc-init.c	(working copy)
@@ -309,7 +309,7 @@  acc_shutdown_1 (acc_device_t d)
       if (acc_dev->state == GOMP_DEVICE_INITIALIZED)
         {
 	  devices_active = true;
-	  ret &= acc_dev->fini_device_func (acc_dev->target_id);
+	  ret &= gomp_fini_device (acc_dev);
 	  acc_dev->state = GOMP_DEVICE_UNINITIALIZED;
 	}
       gomp_mutex_unlock (&acc_dev->lock);
@@ -426,8 +426,6 @@  goacc_attach_host_thread_to_device (int ord)
   
   thr->target_tls
     = acc_dev->openacc.create_thread_data_func (ord);
-  
-  acc_dev->openacc.async_set_async_func (acc_async_sync);
 }
 
 /* OpenACC 2.0a (3.2.12, 3.2.13) doesn't specify whether the serialization of
Index: libgomp/oacc-cuda.c
===================================================================
--- libgomp/oacc-cuda.c	(revision 269183)
+++ libgomp/oacc-cuda.c	(working copy)
@@ -30,6 +30,7 @@ 
 #include "config.h"
 #include "libgomp.h"
 #include "oacc-int.h"
+#include <assert.h>
 
 void *
 acc_get_current_cuda_device (void)
@@ -62,7 +63,11 @@  acc_get_cuda_stream (int async)
     return NULL;
 
   if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func)
-    return thr->dev->openacc.cuda.get_stream_func (async);
+    {
+      goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
+      if (aq)
+	return thr->dev->openacc.cuda.get_stream_func (aq);
+    }
  
   return NULL;
 }
@@ -79,8 +84,23 @@  acc_set_cuda_stream (int async, void *stream)
 
   thr = goacc_thread ();
 
+  int ret = -1;
   if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func)
-    return thr->dev->openacc.cuda.set_stream_func (async, stream);
+    {
+      goacc_aq aq = get_goacc_asyncqueue (async);
+      /* Due to not using an asyncqueue for "acc_async_sync", this cannot be
+	 used to change the CUDA stream associated with "acc_async_sync".  */
+      if (!aq)
+	{
+	  assert (async == acc_async_sync);
+	  gomp_debug (0, "Refusing request to set CUDA stream associated"
+		      " with \"acc_async_sync\"\n");
+	  return 0;
+	}
+      gomp_mutex_lock (&thr->dev->openacc.async.lock);
+      ret = thr->dev->openacc.cuda.set_stream_func (aq, stream);
+      gomp_mutex_unlock (&thr->dev->openacc.async.lock);
+    }
 
-  return -1;
+  return ret;
 }
Index: libgomp/target.c
===================================================================
--- libgomp/target.c	(revision 269183)
+++ libgomp/target.c	(working copy)
@@ -177,6 +177,22 @@  gomp_device_copy (struct gomp_device_descr *device
     }
 }
 
+static inline void
+goacc_device_copy_async (struct gomp_device_descr *devicep,
+			 bool (*copy_func) (int, void *, const void *, size_t,
+					    struct goacc_asyncqueue *),
+			 const char *dst, void *dstaddr,
+			 const char *src, const void *srcaddr,
+			 size_t size, struct goacc_asyncqueue *aq)
+{
+  if (!copy_func (devicep->target_id, dstaddr, srcaddr, size, aq))
+    {
+      gomp_mutex_unlock (&devicep->lock);
+      gomp_fatal ("Copying of %s object [%p..%p) to %s object [%p..%p) failed",
+		  src, srcaddr, srcaddr + size, dst, dstaddr, dstaddr + size);
+    }
+}
+
 /* Infrastructure for coalescing adjacent or nearly adjacent (in device addresses)
    host to device memory transfers.  */
 
@@ -269,8 +285,9 @@  gomp_to_device_kind_p (int kind)
     }
 }
 
-static void
+attribute_hidden void
 gomp_copy_host2dev (struct gomp_device_descr *devicep,
+		    struct goacc_asyncqueue *aq,
 		    void *d, const void *h, size_t sz,
 		    struct gomp_coalesce_buf *cbuf)
 {
@@ -299,14 +316,23 @@  gomp_copy_host2dev (struct gomp_device_descr *devi
 	    }
 	}
     }
-  gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
+  if (__builtin_expect (aq != NULL, 0))
+    goacc_device_copy_async (devicep, devicep->openacc.async.host2dev_func,
+			     "dev", d, "host", h, sz, aq);
+  else
+    gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
 }
 
-static void
+attribute_hidden void
 gomp_copy_dev2host (struct gomp_device_descr *devicep,
+		    struct goacc_asyncqueue *aq,
 		    void *h, const void *d, size_t sz)
 {
-  gomp_device_copy (devicep, devicep->dev2host_func, "host", h, "dev", d, sz);
+  if (__builtin_expect (aq != NULL, 0))
+    goacc_device_copy_async (devicep, devicep->openacc.async.dev2host_func,
+			     "host", h, "dev", d, sz, aq);
+  else
+    gomp_device_copy (devicep, devicep->dev2host_func, "host", h, "dev", d, sz);
 }
 
 static void
@@ -324,7 +350,8 @@  gomp_free_device_memory (struct gomp_device_descr
    Helper function of gomp_map_vars.  */
 
 static inline void
-gomp_map_vars_existing (struct gomp_device_descr *devicep, splay_tree_key oldn,
+gomp_map_vars_existing (struct gomp_device_descr *devicep,
+			struct goacc_asyncqueue *aq, splay_tree_key oldn,
 			splay_tree_key newn, struct target_var_desc *tgt_var,
 			unsigned char kind, struct gomp_coalesce_buf *cbuf)
 {
@@ -346,7 +373,7 @@  static inline void
     }
 
   if (GOMP_MAP_ALWAYS_TO_P (kind))
-    gomp_copy_host2dev (devicep,
+    gomp_copy_host2dev (devicep, aq,
 			(void *) (oldn->tgt->tgt_start + oldn->tgt_offset
 				  + newn->host_start - oldn->host_start),
 			(void *) newn->host_start,
@@ -364,8 +391,8 @@  get_kind (bool short_mapkind, void *kinds, int idx
 }
 
 static void
-gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr,
-		  uintptr_t target_offset, uintptr_t bias,
+gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
+		  uintptr_t host_ptr, uintptr_t target_offset, uintptr_t bias,
 		  struct gomp_coalesce_buf *cbuf)
 {
   struct gomp_device_descr *devicep = tgt->device_descr;
@@ -376,7 +403,7 @@  static void
   if (cur_node.host_start == (uintptr_t) NULL)
     {
       cur_node.tgt_offset = (uintptr_t) NULL;
-      gomp_copy_host2dev (devicep,
+      gomp_copy_host2dev (devicep, aq,
 			  (void *) (tgt->tgt_start + target_offset),
 			  (void *) &cur_node.tgt_offset,
 			  sizeof (void *), cbuf);
@@ -398,12 +425,13 @@  static void
      array section.  Now subtract bias to get what we want
      to initialize the pointer with.  */
   cur_node.tgt_offset -= bias;
-  gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + target_offset),
+  gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + target_offset),
 		      (void *) &cur_node.tgt_offset, sizeof (void *), cbuf);
 }
 
 static void
-gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
+gomp_map_fields_existing (struct target_mem_desc *tgt,
+			  struct goacc_asyncqueue *aq, splay_tree_key n,
 			  size_t first, size_t i, void **hostaddrs,
 			  size_t *sizes, void *kinds,
 			  struct gomp_coalesce_buf *cbuf)
@@ -423,7 +451,7 @@  static void
       && n2->tgt == n->tgt
       && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
     {
-      gomp_map_vars_existing (devicep, n2, &cur_node,
+      gomp_map_vars_existing (devicep, aq, n2, &cur_node,
 			      &tgt->list[i], kind & typemask, cbuf);
       return;
     }
@@ -439,8 +467,8 @@  static void
 	      && n2->host_start - n->host_start
 		 == n2->tgt_offset - n->tgt_offset)
 	    {
-	      gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
-				      kind & typemask, cbuf);
+	      gomp_map_vars_existing (devicep, aq, n2, &cur_node,
+				      &tgt->list[i], kind & typemask, cbuf);
 	      return;
 	    }
 	}
@@ -451,7 +479,7 @@  static void
 	  && n2->tgt == n->tgt
 	  && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
 	{
-	  gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
+	  gomp_map_vars_existing (devicep, aq, n2, &cur_node, &tgt->list[i],
 				  kind & typemask, cbuf);
 	  return;
 	}
@@ -483,10 +511,12 @@  gomp_map_val (struct target_mem_desc *tgt, void **
   return tgt->tgt_start + tgt->list[i].offset;
 }
 
-attribute_hidden struct target_mem_desc *
-gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
-	       void **hostaddrs, void **devaddrs, size_t *sizes, void *kinds,
-	       bool short_mapkind, enum gomp_map_vars_kind pragma_kind)
+static inline __attribute__((always_inline)) struct target_mem_desc *
+gomp_map_vars_internal (struct gomp_device_descr *devicep,
+			struct goacc_asyncqueue *aq, size_t mapnum,
+			void **hostaddrs, void **devaddrs, size_t *sizes,
+			void *kinds, bool short_mapkind,
+			enum gomp_map_vars_kind pragma_kind)
 {
   size_t i, tgt_align, tgt_size, not_found_cnt = 0;
   bool has_firstprivate = false;
@@ -600,7 +630,7 @@  gomp_map_val (struct target_mem_desc *tgt, void **
 	      continue;
 	    }
 	  for (i = first; i <= last; i++)
-	    gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
+	    gomp_map_fields_existing (tgt, aq, n, first, i, hostaddrs,
 				      sizes, kinds, NULL);
 	  i--;
 	  continue;
@@ -645,7 +675,7 @@  gomp_map_val (struct target_mem_desc *tgt, void **
       else
 	n = splay_tree_lookup (mem_map, &cur_node);
       if (n && n->refcount != REFCOUNT_LINK)
-	gomp_map_vars_existing (devicep, n, &cur_node, &tgt->list[i],
+	gomp_map_vars_existing (devicep, aq, n, &cur_node, &tgt->list[i],
 				kind & typemask, NULL);
       else
 	{
@@ -756,7 +786,7 @@  gomp_map_val (struct target_mem_desc *tgt, void **
 		tgt_size = (tgt_size + align - 1) & ~(align - 1);
 		tgt->list[i].offset = tgt_size;
 		len = sizes[i];
-		gomp_copy_host2dev (devicep,
+		gomp_copy_host2dev (devicep, aq,
 				    (void *) (tgt->tgt_start + tgt_size),
 				    (void *) hostaddrs[i], len, cbufp);
 		tgt_size += len;
@@ -790,7 +820,7 @@  gomp_map_val (struct target_mem_desc *tgt, void **
 		    continue;
 		  }
 		for (i = first; i <= last; i++)
-		  gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
+		  gomp_map_fields_existing (tgt, aq, n, first, i, hostaddrs,
 					    sizes, kinds, cbufp);
 		i--;
 		continue;
@@ -810,7 +840,7 @@  gomp_map_val (struct target_mem_desc *tgt, void **
 		  cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i - 1);
 		if (cur_node.tgt_offset)
 		  cur_node.tgt_offset -= sizes[i];
-		gomp_copy_host2dev (devicep,
+		gomp_copy_host2dev (devicep, aq,
 				    (void *) (n->tgt->tgt_start
 					      + n->tgt_offset
 					      + cur_node.host_start
@@ -831,7 +861,7 @@  gomp_map_val (struct target_mem_desc *tgt, void **
 	      k->host_end = k->host_start + sizeof (void *);
 	    splay_tree_key n = splay_tree_lookup (mem_map, k);
 	    if (n && n->refcount != REFCOUNT_LINK)
-	      gomp_map_vars_existing (devicep, n, k, &tgt->list[i],
+	      gomp_map_vars_existing (devicep, aq, n, k, &tgt->list[i],
 				      kind & typemask, cbufp);
 	    else
 	      {
@@ -884,18 +914,19 @@  gomp_map_val (struct target_mem_desc *tgt, void **
 		  case GOMP_MAP_FORCE_TOFROM:
 		  case GOMP_MAP_ALWAYS_TO:
 		  case GOMP_MAP_ALWAYS_TOFROM:
-		    gomp_copy_host2dev (devicep,
+		    gomp_copy_host2dev (devicep, aq,
 					(void *) (tgt->tgt_start
 						  + k->tgt_offset),
 					(void *) k->host_start,
 					k->host_end - k->host_start, cbufp);
 		    break;
 		  case GOMP_MAP_POINTER:
-		    gomp_map_pointer (tgt, (uintptr_t) *(void **) k->host_start,
+		    gomp_map_pointer (tgt, aq,
+				      (uintptr_t) *(void **) k->host_start,
 				      k->tgt_offset, sizes[i], cbufp);
 		    break;
 		  case GOMP_MAP_TO_PSET:
-		    gomp_copy_host2dev (devicep,
+		    gomp_copy_host2dev (devicep, aq,
 					(void *) (tgt->tgt_start
 						  + k->tgt_offset),
 					(void *) k->host_start,
@@ -917,7 +948,7 @@  gomp_map_val (struct target_mem_desc *tgt, void **
 			  tgt->list[j].always_copy_from = false;
 			  if (k->refcount != REFCOUNT_INFINITY)
 			    k->refcount++;
-			  gomp_map_pointer (tgt,
+			  gomp_map_pointer (tgt, aq,
 					    (uintptr_t) *(void **) hostaddrs[j],
 					    k->tgt_offset
 					    + ((uintptr_t) hostaddrs[j]
@@ -946,7 +977,7 @@  gomp_map_val (struct target_mem_desc *tgt, void **
 		    break;
 		  case GOMP_MAP_FORCE_DEVICEPTR:
 		    assert (k->host_end - k->host_start == sizeof (void *));
-		    gomp_copy_host2dev (devicep,
+		    gomp_copy_host2dev (devicep, aq,
 					(void *) (tgt->tgt_start
 						  + k->tgt_offset),
 					(void *) k->host_start,
@@ -965,7 +996,7 @@  gomp_map_val (struct target_mem_desc *tgt, void **
 		    void *tgt_addr = (void *) (tgt->tgt_start + k->tgt_offset);
 		    /* We intentionally do not use coalescing here, as it's not
 		       data allocated by the current call to this function.  */
-		    gomp_copy_host2dev (devicep, (void *) n->tgt_offset,
+		    gomp_copy_host2dev (devicep, aq, (void *) n->tgt_offset,
 					&tgt_addr, sizeof (void *), NULL);
 		  }
 		array++;
@@ -978,7 +1009,7 @@  gomp_map_val (struct target_mem_desc *tgt, void **
       for (i = 0; i < mapnum; i++)
 	{
 	  cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
-	  gomp_copy_host2dev (devicep,
+	  gomp_copy_host2dev (devicep, aq,
 			      (void *) (tgt->tgt_start + i * sizeof (void *)),
 			      (void *) &cur_node.tgt_offset, sizeof (void *),
 			      cbufp);
@@ -989,7 +1020,7 @@  gomp_map_val (struct target_mem_desc *tgt, void **
     {
       long c = 0;
       for (c = 0; c < cbuf.chunk_cnt; ++c)
-	gomp_copy_host2dev (devicep,
+	gomp_copy_host2dev (devicep, aq,
 			    (void *) (tgt->tgt_start + cbuf.chunks[c].start),
 			    (char *) cbuf.buf + (cbuf.chunks[c].start
 						 - cbuf.chunks[0].start),
@@ -1012,7 +1043,27 @@  gomp_map_val (struct target_mem_desc *tgt, void **
   return tgt;
 }
 
-static void
+attribute_hidden struct target_mem_desc *
+gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
+	       void **hostaddrs, void **devaddrs, size_t *sizes, void *kinds,
+	       bool short_mapkind, enum gomp_map_vars_kind pragma_kind)
+{
+  return gomp_map_vars_internal (devicep, NULL, mapnum, hostaddrs, devaddrs,
+				 sizes, kinds, short_mapkind, pragma_kind);
+}
+
+attribute_hidden struct target_mem_desc *
+gomp_map_vars_async (struct gomp_device_descr *devicep,
+		     struct goacc_asyncqueue *aq, size_t mapnum,
+		     void **hostaddrs, void **devaddrs, size_t *sizes,
+		     void *kinds, bool short_mapkind,
+		     enum gomp_map_vars_kind pragma_kind)
+{
+  return gomp_map_vars_internal (devicep, aq, mapnum, hostaddrs, devaddrs,
+				 sizes, kinds, short_mapkind, pragma_kind);
+}
+
+attribute_hidden void
 gomp_unmap_tgt (struct target_mem_desc *tgt)
 {
   /* Deallocate on target the tgt->tgt_start .. tgt->tgt_end region.  */
@@ -1040,12 +1091,24 @@  gomp_remove_var (struct gomp_device_descr *devicep
   return is_tgt_unmapped;
 }
 
+static void
+gomp_unref_tgt (void *ptr)
+{
+  struct target_mem_desc *tgt = (struct target_mem_desc *) ptr;
+
+  if (tgt->refcount > 1)
+    tgt->refcount--;
+  else
+    gomp_unmap_tgt (tgt);
+}
+
 /* Unmap variables described by TGT.  If DO_COPYFROM is true, copy relevant
    variables back from device to host: if it is false, it is assumed that this
    has been done already.  */
 
-attribute_hidden void
-gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
+static inline __attribute__((always_inline)) void
+gomp_unmap_vars_internal (struct target_mem_desc *tgt, bool do_copyfrom,
+			  struct goacc_asyncqueue *aq)
 {
   struct gomp_device_descr *devicep = tgt->device_descr;
 
@@ -1082,7 +1145,7 @@  gomp_remove_var (struct gomp_device_descr *devicep
 
       if ((do_unmap && do_copyfrom && tgt->list[i].copy_from)
 	  || tgt->list[i].always_copy_from)
-	gomp_copy_dev2host (devicep,
+	gomp_copy_dev2host (devicep, aq,
 			    (void *) (k->host_start + tgt->list[i].offset),
 			    (void *) (k->tgt->tgt_start + k->tgt_offset
 				      + tgt->list[i].offset),
@@ -1091,14 +1154,28 @@  gomp_remove_var (struct gomp_device_descr *devicep
 	gomp_remove_var (devicep, k);
     }
 
-  if (tgt->refcount > 1)
-    tgt->refcount--;
+  if (aq)
+    devicep->openacc.async.queue_callback_func (aq, gomp_unref_tgt,
+						(void *) tgt);
   else
-    gomp_unmap_tgt (tgt);
+    gomp_unref_tgt ((void *) tgt);
 
   gomp_mutex_unlock (&devicep->lock);
 }
 
+attribute_hidden void
+gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
+{
+  gomp_unmap_vars_internal (tgt, do_copyfrom, NULL);
+}
+
+attribute_hidden void
+gomp_unmap_vars_async (struct target_mem_desc *tgt, bool do_copyfrom,
+		       struct goacc_asyncqueue *aq)
+{
+  gomp_unmap_vars_internal (tgt, do_copyfrom, aq);
+}
+
 static void
 gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs,
 	     size_t *sizes, void *kinds, bool short_mapkind)
@@ -1148,9 +1225,10 @@  gomp_update (struct gomp_device_descr *devicep, si
 	    size_t size = cur_node.host_end - cur_node.host_start;
 
 	    if (GOMP_MAP_COPY_TO_P (kind & typemask))
-	      gomp_copy_host2dev (devicep, devaddr, hostaddr, size, NULL);
+	      gomp_copy_host2dev (devicep, NULL, devaddr, hostaddr, size,
+				  NULL);
 	    if (GOMP_MAP_COPY_FROM_P (kind & typemask))
-	      gomp_copy_dev2host (devicep, hostaddr, devaddr, size);
+	      gomp_copy_dev2host (devicep, NULL, hostaddr, devaddr, size);
 	  }
       }
   gomp_mutex_unlock (&devicep->lock);
@@ -1443,9 +1521,24 @@  gomp_init_device (struct gomp_device_descr *device
 				   false);
     }
 
+  /* Initialize OpenACC asynchronous queues.  */
+  goacc_init_asyncqueues (devicep);
+
   devicep->state = GOMP_DEVICE_INITIALIZED;
 }
 
+/* This function finalizes the target device, specified by DEVICEP.  DEVICEP
+   must be locked on entry, and remains locked on return.  */
+
+attribute_hidden bool
+gomp_fini_device (struct gomp_device_descr *devicep)
+{
+  bool ret = goacc_fini_asyncqueues (devicep);
+  ret &= devicep->fini_device_func (devicep->target_id);
+  devicep->state = GOMP_DEVICE_FINALIZED;
+  return ret;
+}
+
 attribute_hidden void
 gomp_unload_device (struct gomp_device_descr *devicep)
 {
@@ -1954,7 +2047,7 @@  gomp_exit_data (struct gomp_device_descr *devicep,
 
 	  if ((kind == GOMP_MAP_FROM && k->refcount == 0)
 	      || kind == GOMP_MAP_ALWAYS_FROM)
-	    gomp_copy_dev2host (devicep, (void *) cur_node.host_start,
+	    gomp_copy_dev2host (devicep, NULL, (void *) cur_node.host_start,
 				(void *) (k->tgt->tgt_start + k->tgt_offset
 					  + cur_node.host_start
 					  - k->host_start),
@@ -2636,20 +2729,20 @@  gomp_load_plugin_for_device (struct gomp_device_de
   if (device->capabilities & GOMP_OFFLOAD_CAP_OPENACC_200)
     {
       if (!DLSYM_OPT (openacc.exec, openacc_exec)
-	  || !DLSYM_OPT (openacc.register_async_cleanup,
-			 openacc_register_async_cleanup)
-	  || !DLSYM_OPT (openacc.async_test, openacc_async_test)
-	  || !DLSYM_OPT (openacc.async_test_all, openacc_async_test_all)
-	  || !DLSYM_OPT (openacc.async_wait, openacc_async_wait)
-	  || !DLSYM_OPT (openacc.async_wait_async, openacc_async_wait_async)
-	  || !DLSYM_OPT (openacc.async_wait_all, openacc_async_wait_all)
-	  || !DLSYM_OPT (openacc.async_wait_all_async,
-			 openacc_async_wait_all_async)
-	  || !DLSYM_OPT (openacc.async_set_async, openacc_async_set_async)
 	  || !DLSYM_OPT (openacc.create_thread_data,
 			 openacc_create_thread_data)
 	  || !DLSYM_OPT (openacc.destroy_thread_data,
-			 openacc_destroy_thread_data))
+			 openacc_destroy_thread_data)
+	  || !DLSYM_OPT (openacc.async.construct, openacc_async_construct)
+	  || !DLSYM_OPT (openacc.async.destruct, openacc_async_destruct)
+	  || !DLSYM_OPT (openacc.async.test, openacc_async_test)
+	  || !DLSYM_OPT (openacc.async.synchronize, openacc_async_synchronize)
+	  || !DLSYM_OPT (openacc.async.serialize, openacc_async_serialize)
+	  || !DLSYM_OPT (openacc.async.queue_callback,
+			 openacc_async_queue_callback)
+	  || !DLSYM_OPT (openacc.async.exec, openacc_async_exec)
+	  || !DLSYM_OPT (openacc.async.dev2host, openacc_async_dev2host)
+	  || !DLSYM_OPT (openacc.async.host2dev, openacc_async_host2dev))
 	{
 	  /* Require all the OpenACC handlers if we have
 	     GOMP_OFFLOAD_CAP_OPENACC_200.  */
@@ -2700,10 +2793,7 @@  gomp_target_fini (void)
       struct gomp_device_descr *devicep = &devices[i];
       gomp_mutex_lock (&devicep->lock);
       if (devicep->state == GOMP_DEVICE_INITIALIZED)
-	{
-	  ret = devicep->fini_device_func (devicep->target_id);
-	  devicep->state = GOMP_DEVICE_FINALIZED;
-	}
+	ret = gomp_fini_device (devicep);
       gomp_mutex_unlock (&devicep->lock);
       if (!ret)
 	gomp_fatal ("device finalization failed");
Index: libgomp/libgomp.h
===================================================================
--- libgomp/libgomp.h	(revision 269183)
+++ libgomp/libgomp.h	(working copy)
@@ -949,25 +949,32 @@  typedef struct acc_dispatch_t
   /* Execute.  */
   __typeof (GOMP_OFFLOAD_openacc_exec) *exec_func;
 
-  /* Async cleanup callback registration.  */
-  __typeof (GOMP_OFFLOAD_openacc_register_async_cleanup)
-    *register_async_cleanup_func;
-
-  /* Asynchronous routines.  */
-  __typeof (GOMP_OFFLOAD_openacc_async_test) *async_test_func;
-  __typeof (GOMP_OFFLOAD_openacc_async_test_all) *async_test_all_func;
-  __typeof (GOMP_OFFLOAD_openacc_async_wait) *async_wait_func;
-  __typeof (GOMP_OFFLOAD_openacc_async_wait_async) *async_wait_async_func;
-  __typeof (GOMP_OFFLOAD_openacc_async_wait_all) *async_wait_all_func;
-  __typeof (GOMP_OFFLOAD_openacc_async_wait_all_async)
-    *async_wait_all_async_func;
-  __typeof (GOMP_OFFLOAD_openacc_async_set_async) *async_set_async_func;
-
   /* Create/destroy TLS data.  */
   __typeof (GOMP_OFFLOAD_openacc_create_thread_data) *create_thread_data_func;
   __typeof (GOMP_OFFLOAD_openacc_destroy_thread_data)
     *destroy_thread_data_func;
+
+  struct {
+    /* Once created and put into the "active" list, asyncqueues are never
+       destructed or removed from the "active" list, except (TODO) when the
+       device is shut down.  */
+    gomp_mutex_t lock;
+    int nasyncqueue;
+    struct goacc_asyncqueue **asyncqueue;
+    struct goacc_asyncqueue_list *active;
 
+    __typeof (GOMP_OFFLOAD_openacc_async_construct) *construct_func;
+    __typeof (GOMP_OFFLOAD_openacc_async_destruct) *destruct_func;
+    __typeof (GOMP_OFFLOAD_openacc_async_test) *test_func;
+    __typeof (GOMP_OFFLOAD_openacc_async_synchronize) *synchronize_func;
+    __typeof (GOMP_OFFLOAD_openacc_async_serialize) *serialize_func;
+    __typeof (GOMP_OFFLOAD_openacc_async_queue_callback) *queue_callback_func;
+
+    __typeof (GOMP_OFFLOAD_openacc_async_exec) *exec_func;
+    __typeof (GOMP_OFFLOAD_openacc_async_dev2host) *dev2host_func;
+    __typeof (GOMP_OFFLOAD_openacc_async_host2dev) *host2dev_func;
+  } async;
+
   /* NVIDIA target specific routines.  */
   struct {
     __typeof (GOMP_OFFLOAD_openacc_cuda_get_current_device)
@@ -1053,17 +1060,33 @@  enum gomp_map_vars_kind
   GOMP_MAP_VARS_ENTER_DATA
 };
 
-extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *);
+extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *, int);
 extern void gomp_acc_remove_pointer (void *, size_t, bool, int, int, int);
 extern void gomp_acc_declare_allocate (bool, size_t, void **, size_t *,
 				       unsigned short *);
+struct gomp_coalesce_buf;
+extern void gomp_copy_host2dev (struct gomp_device_descr *,
+				struct goacc_asyncqueue *, void *, const void *,
+				size_t, struct gomp_coalesce_buf *);
+extern void gomp_copy_dev2host (struct gomp_device_descr *,
+				struct goacc_asyncqueue *, void *, const void *,
+				size_t);
 
 extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *,
 					      size_t, void **, void **,
 					      size_t *, void *, bool,
 					      enum gomp_map_vars_kind);
+extern struct target_mem_desc *gomp_map_vars_async (struct gomp_device_descr *,
+						    struct goacc_asyncqueue *,
+						    size_t, void **, void **,
+						    size_t *, void *, bool,
+						    enum gomp_map_vars_kind);
+extern void gomp_unmap_tgt (struct target_mem_desc *);
 extern void gomp_unmap_vars (struct target_mem_desc *, bool);
+extern void gomp_unmap_vars_async (struct target_mem_desc *, bool,
+				   struct goacc_asyncqueue *);
 extern void gomp_init_device (struct gomp_device_descr *);
+extern bool gomp_fini_device (struct gomp_device_descr *);
 extern void gomp_free_memmap (struct splay_tree_s *);
 extern void gomp_unload_device (struct gomp_device_descr *);
 extern bool gomp_remove_var (struct gomp_device_descr *, splay_tree_key);
Index: libgomp/oacc-int.h
===================================================================
--- libgomp/oacc-int.h	(revision 269183)
+++ libgomp/oacc-int.h	(working copy)
@@ -99,6 +99,13 @@  void goacc_restore_bind (void);
 void goacc_lazy_initialize (void);
 void goacc_host_init (void);
 
+void goacc_init_asyncqueues (struct gomp_device_descr *);
+bool goacc_fini_asyncqueues (struct gomp_device_descr *);
+void goacc_async_free (struct gomp_device_descr *, struct goacc_asyncqueue *,
+		       void *);
+struct goacc_asyncqueue *get_goacc_asyncqueue (int);
+struct goacc_asyncqueue *lookup_goacc_asyncqueue (struct goacc_thread *, bool,
+						  int);
 static inline bool
 async_valid_stream_id_p (int async)
 {
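
Sketch (not part of the patch): roughly how an async-aware entry point is expected to combine these helpers with the asyncqueue-aware copy routines; the function name and body are invented for illustration:

static void
update_device_sketch (struct gomp_device_descr *acc_dev, void *devaddr,
                      void *hostaddr, size_t size, int async)
{
  /* Resolve the async-argument; NULL (for "acc_async_sync") selects the
     synchronous copy path inside gomp_copy_host2dev.  */
  struct goacc_asyncqueue *aq = get_goacc_asyncqueue (async);
  gomp_copy_host2dev (acc_dev, aq, devaddr, hostaddr, size, NULL);
}
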
Index: libgomp/oacc-host.c
===================================================================
--- libgomp/oacc-host.c	(revision 269183)
+++ libgomp/oacc-host.c	(working copy)
@@ -140,57 +140,91 @@  host_openacc_exec (void (*fn) (void *),
 		   size_t mapnum __attribute__ ((unused)),
 		   void **hostaddrs,
 		   void **devaddrs __attribute__ ((unused)),
-		   int async __attribute__ ((unused)),
-		   unsigned *dims __attribute ((unused)),
+		   unsigned *dims __attribute__ ((unused)),
 		   void *targ_mem_desc __attribute__ ((unused)))
 {
   fn (hostaddrs);
 }
 
 static void
-host_openacc_register_async_cleanup (void *targ_mem_desc __attribute__ ((unused)),
-				     int async __attribute__ ((unused)))
+host_openacc_async_exec (void (*fn) (void *),
+			 size_t mapnum __attribute__ ((unused)),
+			 void **hostaddrs,
+			 void **devaddrs __attribute__ ((unused)),
+			 unsigned *dims __attribute__ ((unused)),
+			 void *targ_mem_desc __attribute__ ((unused)),
+			 struct goacc_asyncqueue *aq __attribute__ ((unused)))
 {
+  fn (hostaddrs);
 }
 
 static int
-host_openacc_async_test (int async __attribute__ ((unused)))
+host_openacc_async_test (struct goacc_asyncqueue *aq __attribute__ ((unused)))
 {
   return 1;
 }
 
-static int
-host_openacc_async_test_all (void)
+static bool
+host_openacc_async_synchronize (struct goacc_asyncqueue *aq
+				__attribute__ ((unused)))
 {
-  return 1;
+  return true;
 }
 
-static void
-host_openacc_async_wait (int async __attribute__ ((unused)))
+static bool
+host_openacc_async_serialize (struct goacc_asyncqueue *aq1
+			      __attribute__ ((unused)),
+			      struct goacc_asyncqueue *aq2
+			      __attribute__ ((unused)))
 {
+  return true;
 }
 
-static void
-host_openacc_async_wait_async (int async1 __attribute__ ((unused)),
-			       int async2 __attribute__ ((unused)))
+static bool
+host_openacc_async_host2dev (int ord __attribute__ ((unused)),
+			     void *dst __attribute__ ((unused)),
+			     const void *src __attribute__ ((unused)),
+			     size_t n __attribute__ ((unused)),
+			     struct goacc_asyncqueue *aq
+			     __attribute__ ((unused)))
 {
+  return true;
 }
 
-static void
-host_openacc_async_wait_all (void)
+static bool
+host_openacc_async_dev2host (int ord __attribute__ ((unused)),
+			     void *dst __attribute__ ((unused)),
+			     const void *src __attribute__ ((unused)),
+			     size_t n __attribute__ ((unused)),
+			     struct goacc_asyncqueue *aq
+			     __attribute__ ((unused)))
 {
+  return true;
 }
 
 static void
-host_openacc_async_wait_all_async (int async __attribute__ ((unused)))
+host_openacc_async_queue_callback (struct goacc_asyncqueue *aq
+				   __attribute__ ((unused)),
+				   void (*callback_fn)(void *)
+				   __attribute__ ((unused)),
+				   void *userptr __attribute__ ((unused)))
 {
 }
 
-static void
-host_openacc_async_set_async (int async __attribute__ ((unused)))
+static struct goacc_asyncqueue *
+host_openacc_async_construct (void)
 {
+  /* Non-NULL 0xffff... value as opaque dummy.  */
+  return (struct goacc_asyncqueue *) -1;
 }
 
+static bool
+host_openacc_async_destruct (struct goacc_asyncqueue *aq
+			     __attribute__ ((unused)))
+{
+  return true;
+}
+
 static void *
 host_openacc_create_thread_data (int ord __attribute__ ((unused)))
 {
@@ -235,19 +269,21 @@  static struct gomp_device_descr host_dispatch =
 
       .exec_func = host_openacc_exec,
 
-      .register_async_cleanup_func = host_openacc_register_async_cleanup,
-
-      .async_test_func = host_openacc_async_test,
-      .async_test_all_func = host_openacc_async_test_all,
-      .async_wait_func = host_openacc_async_wait,
-      .async_wait_async_func = host_openacc_async_wait_async,
-      .async_wait_all_func = host_openacc_async_wait_all,
-      .async_wait_all_async_func = host_openacc_async_wait_all_async,
-      .async_set_async_func = host_openacc_async_set_async,
-
       .create_thread_data_func = host_openacc_create_thread_data,
       .destroy_thread_data_func = host_openacc_destroy_thread_data,
 
+      .async = {
+	.construct_func = host_openacc_async_construct,
+	.destruct_func = host_openacc_async_destruct,
+	.test_func = host_openacc_async_test,
+	.synchronize_func = host_openacc_async_synchronize,
+	.serialize_func = host_openacc_async_serialize,
+	.queue_callback_func = host_openacc_async_queue_callback,
+	.exec_func = host_openacc_async_exec,
+	.dev2host_func = host_openacc_async_dev2host,
+	.host2dev_func = host_openacc_async_host2dev,
+      },
+
       .cuda = {
 	.get_current_device_func = NULL,
 	.get_current_context_func = NULL,
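
Illustration only (not part of the patch): with every slot of the new "async" table filled in, libgomp proper can call through it uniformly, along these lines; the helper is made up:

/* Sketch: query whether a queue has drained, via the plugin hook.  */
static int
queue_is_idle (struct gomp_device_descr *devicep,
               struct goacc_asyncqueue *aq)
{
  /* NULL stands for the synchronous path: nothing can be pending.  */
  return aq == NULL || devicep->openacc.async.test_func (aq);
}
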
Index: libgomp/libgomp-plugin.h
===================================================================
--- libgomp/libgomp-plugin.h	(revision 269183)
+++ libgomp/libgomp-plugin.h	(working copy)
@@ -53,6 +53,20 @@  enum offload_target_type
   OFFLOAD_TARGET_TYPE_HSA = 7
 };
 
+/* Opaque type to represent plugin-dependent implementation of an
+   OpenACC asynchronous queue.  */
+struct goacc_asyncqueue;
+
+/* Used to keep a list of active asynchronous queues.  */
+struct goacc_asyncqueue_list
+{
+  struct goacc_asyncqueue *aq;
+  struct goacc_asyncqueue_list *next;
+};
+
+typedef struct goacc_asyncqueue *goacc_aq;
+typedef struct goacc_asyncqueue_list *goacc_aq_list;
+
 /* Auxiliary struct, used for transferring pairs of addresses from plugin
    to libgomp.  */
 struct addr_pair
@@ -93,22 +107,31 @@  extern bool GOMP_OFFLOAD_dev2dev (int, void *, con
 extern bool GOMP_OFFLOAD_can_run (void *);
 extern void GOMP_OFFLOAD_run (int, void *, void *, void **);
 extern void GOMP_OFFLOAD_async_run (int, void *, void *, void **, void *);
+
 extern void GOMP_OFFLOAD_openacc_exec (void (*) (void *), size_t, void **,
-				       void **, int, unsigned *, void *);
-extern void GOMP_OFFLOAD_openacc_register_async_cleanup (void *, int);
-extern int GOMP_OFFLOAD_openacc_async_test (int);
-extern int GOMP_OFFLOAD_openacc_async_test_all (void);
-extern void GOMP_OFFLOAD_openacc_async_wait (int);
-extern void GOMP_OFFLOAD_openacc_async_wait_async (int, int);
-extern void GOMP_OFFLOAD_openacc_async_wait_all (void);
-extern void GOMP_OFFLOAD_openacc_async_wait_all_async (int);
-extern void GOMP_OFFLOAD_openacc_async_set_async (int);
+				       void **, unsigned *, void *);
 extern void *GOMP_OFFLOAD_openacc_create_thread_data (int);
 extern void GOMP_OFFLOAD_openacc_destroy_thread_data (void *);
+extern struct goacc_asyncqueue *GOMP_OFFLOAD_openacc_async_construct (void);
+extern bool GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *);
+extern int GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *);
+extern bool GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *);
+extern bool GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *,
+						  struct goacc_asyncqueue *);
+extern void GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *,
+						       void (*)(void *), void *);
+extern void GOMP_OFFLOAD_openacc_async_exec (void (*) (void *), size_t, void **,
+					     void **, unsigned *, void *,
+					     struct goacc_asyncqueue *);
+extern bool GOMP_OFFLOAD_openacc_async_dev2host (int, void *, const void *, size_t,
+						 struct goacc_asyncqueue *);
+extern bool GOMP_OFFLOAD_openacc_async_host2dev (int, void *, const void *, size_t,
+						 struct goacc_asyncqueue *);
 extern void *GOMP_OFFLOAD_openacc_cuda_get_current_device (void);
 extern void *GOMP_OFFLOAD_openacc_cuda_get_current_context (void);
-extern void *GOMP_OFFLOAD_openacc_cuda_get_stream (int);
-extern int GOMP_OFFLOAD_openacc_cuda_set_stream (int, void *);
+extern void *GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *);
+extern int GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *,
+						 void *);
 
 #ifdef __cplusplus
 }