[gomp4] Rationalise thread-local variables in libgomp OpenACC support
diff mbox

Message ID 20141029152017.4829ad2f@octopus
State New
Headers show

Commit Message

Julian Brown Oct. 29, 2014, 3:20 p.m. UTC
On Tue, 28 Oct 2014 11:16:19 +0000
Julian Brown <julian@codesourcery.com> wrote:

> Hi,
> 
> This patch rationalises TLS support by moving all thread-local
> variables into a single structure. Because this meant interfering with
> how per-thread/per-device initialisation was done, I took the
> opportunity to tidy up a couple of other bits along the way.
> Highlights are:

Here's a slightly-updated version of the patch, adjusted for Thomas's
removal of the queue.h list-handling functions. ChangeLog as before.

Thanks,

Julian

Patch
diff mbox

commit ab4e9ff7a52e43418d6d2fc5b5e76e0065e130d5
Author: Julian Brown <julian@codesourcery.com>
Date:   Mon Oct 27 08:43:07 2014 -0700

    TLS rework

diff --git a/libgomp/env.c b/libgomp/env.c
index 32fb92c..8b22e6f 100644
--- a/libgomp/env.c
+++ b/libgomp/env.c
@@ -28,6 +28,7 @@ 
 #include "libgomp.h"
 #include "libgomp_f.h"
 #include "target.h"
+#include "oacc-int.h"
 #include <ctype.h>
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index e31573c..1496437 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -50,8 +50,4 @@  extern void GOMP_PLUGIN_mutex_destroy (gomp_mutex_t *mutex);
 extern void GOMP_PLUGIN_mutex_lock (gomp_mutex_t *mutex);
 extern void GOMP_PLUGIN_mutex_unlock (gomp_mutex_t *mutex);
 
-/* target.c */
-
-extern void GOMP_PLUGIN_async_unmap_vars (void *ptr);
-
 #endif
diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map
index 538aabb..c6a88a2 100644
--- a/libgomp/libgomp.map
+++ b/libgomp/libgomp.map
@@ -337,4 +337,5 @@  PLUGIN_1.0 {
 	GOMP_PLUGIN_mutex_lock;
 	GOMP_PLUGIN_mutex_unlock;
 	GOMP_PLUGIN_async_unmap_vars;
+	GOMP_PLUGIN_acc_thread;
 };
diff --git a/libgomp/oacc-async.c b/libgomp/oacc-async.c
index 08b6b95..dddfe05 100644
--- a/libgomp/oacc-async.c
+++ b/libgomp/oacc-async.c
@@ -29,6 +29,7 @@ 
 #include "openacc.h"
 #include "libgomp.h"
 #include "target.h"
+#include "oacc-int.h"
 
 int
 acc_async_test (int async)
@@ -36,13 +37,13 @@  acc_async_test (int async)
   if (async < acc_async_sync)
     gomp_fatal ("invalid async argument: %d", async);
 
-  return ACC_dev->openacc.async_test_func (async);
+  return base_dev->openacc.async_test_func (async);
 }
 
 int
 acc_async_test_all (void)
 {
-  return ACC_dev->openacc.async_test_all_func ();
+  return base_dev->openacc.async_test_all_func ();
 }
 
 void
@@ -51,22 +52,19 @@  acc_wait (int async)
   if (async < acc_async_sync)
     gomp_fatal ("invalid async argument: %d", async);
 
-  ACC_dev->openacc.async_wait_func (async);
-  return;
+  base_dev->openacc.async_wait_func (async);
 }
 
 void
 acc_wait_async (int async1, int async2)
 {
-  ACC_dev->openacc.async_wait_async_func (async1, async2);
-  return;
+  base_dev->openacc.async_wait_async_func (async1, async2);
 }
 
 void
 acc_wait_all (void)
 {
-  ACC_dev->openacc.async_wait_all_func ();
-  return;
+  base_dev->openacc.async_wait_all_func ();
 }
 
 void
@@ -75,6 +73,5 @@  acc_wait_all_async (int async)
   if (async < acc_async_sync)
     gomp_fatal ("invalid async argument: %d", async);
 
-  ACC_dev->openacc.async_wait_all_async_func (async);
-  return;
+  base_dev->openacc.async_wait_all_async_func (async);
 }
diff --git a/libgomp/oacc-cuda.c b/libgomp/oacc-cuda.c
index f587325..3daf5b1 100644
--- a/libgomp/oacc-cuda.c
+++ b/libgomp/oacc-cuda.c
@@ -29,14 +29,15 @@ 
 #include "config.h"
 #include "libgomp.h"
 #include "target.h"
+#include "oacc-int.h"
 
 void *
 acc_get_current_cuda_device (void)
 {
   void *p = NULL;
 
-  if (ACC_dev && ACC_dev->openacc.cuda.get_current_device_func)
-    p = ACC_dev->openacc.cuda.get_current_device_func ();
+  if (base_dev && base_dev->openacc.cuda.get_current_device_func)
+    p = base_dev->openacc.cuda.get_current_device_func ();
 
   return p;
 }
@@ -46,8 +47,8 @@  acc_get_current_cuda_context (void)
 {
   void *p = NULL;
 
-  if (ACC_dev && ACC_dev->openacc.cuda.get_current_context_func)
-    p = ACC_dev->openacc.cuda.get_current_context_func ();
+  if (base_dev && base_dev->openacc.cuda.get_current_context_func)
+    p = base_dev->openacc.cuda.get_current_context_func ();
 
   return p;
 }
@@ -60,8 +61,8 @@  acc_get_cuda_stream (int async)
   if (async < 0)
     return p;
 
-  if (ACC_dev && ACC_dev->openacc.cuda.get_stream_func)
-    p = ACC_dev->openacc.cuda.get_stream_func (async);
+  if (base_dev && base_dev->openacc.cuda.get_stream_func)
+    p = base_dev->openacc.cuda.get_stream_func (async);
 
   return p;
 }
@@ -73,9 +74,11 @@  acc_set_cuda_stream (int async, void *stream)
 
   if (async < 0 || stream == NULL)
     return 0;
+  
+  ACC_lazy_initialize ();
 
-  if (ACC_dev && ACC_dev->openacc.cuda.set_stream_func)
-    s = ACC_dev->openacc.cuda.set_stream_func (async, stream);
+  if (base_dev && base_dev->openacc.cuda.set_stream_func)
+    s = base_dev->openacc.cuda.set_stream_func (async, stream);
 
   return s;
 }
diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c
index f44ca5e..6fe8f6c 100644
--- a/libgomp/oacc-host.c
+++ b/libgomp/oacc-host.c
@@ -35,6 +35,9 @@ 
 #include "target.h"
 #ifdef HOST_NONSHM_PLUGIN
 #include "libgomp-plugin.h"
+#include "oacc-plugin.h"
+#else
+#include "oacc-int.h"
 #endif
 
 #include <stdint.h>
@@ -365,6 +368,17 @@  openacc_async_wait_all_async (int async __attribute__((unused)))
 #endif
 }
 
+STATIC void *
+openacc_create_thread_data (void *targ_data __attribute__((unused)))
+{
+  return NULL;
+}
+
+STATIC void
+openacc_destroy_thread_data (void *tls_data __attribute__((unused)))
+{
+}
+
 #ifndef HOST_NONSHM_PLUGIN
 static struct gomp_device_descr host_dispatch =
   {
@@ -416,7 +430,10 @@  static struct gomp_device_descr host_dispatch =
       .async_wait_async_func = openacc_async_wait_async,
       .async_wait_all_func = openacc_async_wait_all,
       .async_wait_all_async_func = openacc_async_wait_all_async,
-      
+
+      .create_thread_data_func = openacc_create_thread_data,
+      .destroy_thread_data_func = openacc_destroy_thread_data,
+
       .cuda = {
 	.get_current_device_func = NULL,
 	.get_current_context_func = NULL,
diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c
index ffa9ad8..f08bc38 100644
--- a/libgomp/oacc-init.c
+++ b/libgomp/oacc-init.c
@@ -27,44 +27,50 @@ 
 
 #include "libgomp.h"
 #include "target.h"
+#include "oacc-int.h"
 #include <assert.h>
 #include <stdlib.h>
 #include <strings.h>
 #include <stdbool.h>
 #include <stdio.h>
 
-gomp_mutex_t acc_device_lock;
+static gomp_mutex_t acc_device_lock;
 
-/* Current dispatcher, and how it was initialized */
-static acc_device_t init_key = _ACC_device_hwm;
-
-/* The dispatch table for the current accelerator device.  This is currently
-   global, so you can only have one type of device open at any given time in a
-   program.  */
-struct gomp_device_descr const *ACC_dev;
+/* The dispatch table for the current accelerator device.  This is global, so
+   you can only have one type of device open at any given time in a program. 
+   This is the "base" device in that several devices that use the same
+   dispatch table may be active concurrently: this one (the "zeroth") is used
+   for overall initialisation/shutdown, and other instances -- not necessarily
+   including this one -- may be opened and closed once the base device has
+   been initialized.  */
+struct gomp_device_descr const *base_dev;
 
-/* Handle for current thread.  */
-__thread  void *ACC_handle;
-static __thread int handle_num = -1;
+#ifdef HAVE_TLS
+__thread struct goacc_thread *goacc_tls_data;
+#else
+pthread_key_t goacc_tls_key;
+#endif
+static pthread_key_t goacc_cleanup_key;
 
-/* This context structure associates the handle for a physical device with
-   memory-mapping information for that device, and is used to associate new
-   host threads with previously-opened devices.  Note that it's not directly
-   connected with the CUDA "context" concept as used by the NVidia plugin.  */
-struct ACC_context {
-  struct memmap_t *ACC_memmap;
-  void *ACC_handle;
-
-  struct ACC_context *next;
-};
+/* Current dispatcher, and how it was initialized */
+static acc_device_t init_key = _ACC_device_hwm;
 
-static struct ACC_context *ACC_contexts;
+static struct goacc_thread *goacc_threads;
+static gomp_mutex_t goacc_thread_lock;
 
+/* An array of dispatchers for device types, indexed by the type.  This array
+   only references "base" devices, and other instances of the same type are
+   found by simply indexing from each such device (which are stored linearly,
+   grouped by device in target.c:devices).  */
 static struct gomp_device_descr const *dispatchers[_ACC_device_hwm] = { 0 };
 
 void
 ACC_register (struct gomp_device_descr const *disp)
 {
+  /* Only register the 0th device here.  */
+  if (disp->ord != 0)
+    return;
+
   gomp_mutex_lock (&acc_device_lock);
 
   assert (acc_device_type (disp->type) != acc_device_none
@@ -76,21 +82,6 @@  ACC_register (struct gomp_device_descr const *disp)
   gomp_mutex_unlock (&acc_device_lock);
 }
 
-static void
-close_handle (void)
-{
-  if (ACC_memmap)
-    {
-      if (ACC_mem_close (ACC_handle, ACC_memmap))
-        {
-          if (ACC_dev->openacc.close_device_func (ACC_handle) < 0)
-            gomp_fatal ("failed to close device");
-        }
-
-      ACC_memmap = 0;
-    }
-}
-
 static struct gomp_device_descr const *
 resolve_device (acc_device_t d)
 {
@@ -148,79 +139,135 @@  resolve_device (acc_device_t d)
   return dispatchers[d];
 }
 
+/* This is called when plugins have been initialized, and serves to call
+   (indirectly) the target's device_init hook.  Calling multiple times without
+   an intervening _acc_shutdown call is an error.  */
+
 static struct gomp_device_descr const *
 _acc_init (acc_device_t d)
 {
   struct gomp_device_descr const *acc_dev;
 
-  if (ACC_dev)
-    gomp_fatal ("device already active");
-
-  init_key = d;  /* We need to remember what we were intialized as, to
-		    check shutdown etc.  */
-
   acc_dev = resolve_device (d);
+
   if (!acc_dev || !acc_dev->openacc.avail_func ())
     gomp_fatal ("device %u not supported", (unsigned)d);
 
-  if (!acc_dev->is_initialized)
-    gomp_init_device ((struct gomp_device_descr *) acc_dev);
+  if (acc_dev->is_initialized)
+    gomp_fatal ("device already active");
+
+  /* We need to remember what we were initialized as, to check shutdown etc.  */
+  init_key = d;  
+
+  gomp_init_device ((struct gomp_device_descr *) acc_dev);
 
   return acc_dev;
 }
 
-/* Open the ORD'th device of the currently-active type (ACC_dev must be
+static struct goacc_thread *
+goacc_new_thread (void)
+{
+  struct goacc_thread *thr = gomp_malloc (sizeof (struct goacc_thread));
+  /* Allocate this thread's OpenACC state and publish it for goacc_thread.  */
+#ifdef HAVE_TLS
+  goacc_tls_data = thr;
+#else
+  pthread_setspecific (goacc_tls_key, thr);
+#endif
+  /* Have the goacc_cleanup_key destructor run on THR at thread exit.  */
+  pthread_setspecific (goacc_cleanup_key, thr);
+  /* Push THR onto the global thread list, under its lock.  */
+  gomp_mutex_lock (&goacc_thread_lock);
+  thr->next = goacc_threads;
+  goacc_threads = thr;
+  gomp_mutex_unlock (&goacc_thread_lock);
+  /* Only the NEXT field is assigned here; the caller fills in the rest.  */
+  return thr;
+}
+
+static void
+goacc_destroy_thread (void *data)
+{
+  struct goacc_thread *thr = data, *walk, *prev;
+  
+  gomp_mutex_lock (&goacc_thread_lock);
+  
+  if (thr)
+    {
+      if (base_dev && thr->target_tls)
+	{
+	  base_dev->openacc.destroy_thread_data_func (thr->target_tls);
+	  thr->target_tls = NULL;
+	}
+
+      assert (!thr->mapped_data);
+
+      /* Remove from thread list.  */
+      for (prev = NULL, walk = goacc_threads; walk;
+	   prev = walk, walk = walk->next)
+	if (walk == thr)
+	  {
+	    if (prev == NULL)
+	      goacc_threads = walk->next;
+	    else
+	      prev->next = walk->next;
+
+	    free (thr);
+
+	    break;
+	  }
+
+      assert (walk);
+    }
+
+  gomp_mutex_unlock (&goacc_thread_lock);
+}
+
+/* Open the ORD'th device of the currently-active type (base_dev must be
    initialised before calling).  If ORD is < 0, open the default-numbered
    device (set by the ACC_DEVICE_NUM environment variable or a call to
    acc_set_device_num), or leave any currently-opened device as is.  "Opening"
-   consists of  calling the device's open_device_func hook, and either creating
-   a new memory mapping or associating a new thread with an existing such
-   mapping (that matches ACC_handle, i.e. which corresponds to the same
-   physical device).  */
+   consists of calling the device's open_device_func hook, and setting up
+   thread-local data (maybe allocating, then initializing with information
+   pertaining to the newly-opened or previously-opened device).  */
 
 static void
 lazy_open (int ord)
 {
-  struct ACC_context *acc_ctx;
+  struct goacc_thread *thr = goacc_thread ();
+  struct gomp_device_descr *acc_dev;
 
-  if (ACC_memmap)
+  if (thr && thr->dev)
     {
-      assert (ord < 0 || ord == handle_num);
+      assert (ord < 0 || ord == thr->dev->ord);
       return;
     }
 
-  assert (ACC_dev);
+  assert (base_dev);
 
   if (ord < 0)
     ord = goacc_device_num;
 
-  ACC_handle = ACC_dev->openacc.open_device_func (ord);
-  handle_num = ord;
+  if (!thr)
+    thr = goacc_new_thread ();
 
-  for (acc_ctx = ACC_contexts; acc_ctx != NULL; acc_ctx = acc_ctx->next)
-    {
-      if (acc_ctx->ACC_handle == ACC_handle)
-        {
-          ACC_memmap = acc_ctx->ACC_memmap;
-	  ACC_dev->openacc.async_set_async_func (acc_async_sync);
+  acc_dev = thr->dev = (struct gomp_device_descr *) &base_dev[ord];
 
-          return;
-        }
-    }
+  assert (acc_dev->ord == ord);
 
-  ACC_memmap = ACC_mem_open (ACC_handle, NULL, handle_num);
+  thr->saved_bound_dev = NULL;
+  thr->mapped_data = NULL;
 
-  ACC_dev->openacc.async_set_async_func (acc_async_sync);
+  if (!acc_dev->target_data)
+    acc_dev->target_data = acc_dev->openacc.open_device_func (ord);
 
-  acc_ctx = gomp_malloc (sizeof (struct ACC_context));
-  acc_ctx->ACC_handle = ACC_handle;
-  acc_ctx->ACC_memmap = ACC_memmap;
+  thr->target_tls
+    = acc_dev->openacc.create_thread_data_func (acc_dev->target_data);
 
-  if (!ACC_memmap->mem_map.is_initialized)
-    gomp_init_tables (ACC_dev, &ACC_memmap->mem_map);
+  acc_dev->openacc.async_set_async_func (acc_async_sync);
 
-  acc_ctx->next = ACC_contexts;
-  ACC_contexts = acc_ctx;
+  if (!acc_dev->mem_map.is_initialized)
+    gomp_init_tables (acc_dev, &acc_dev->mem_map);
 }
 
 /* OpenACC 2.0a (3.2.12, 3.2.13) doesn't specify whether the serialization of
@@ -229,12 +276,12 @@  lazy_open (int ord)
 void
 acc_init (acc_device_t d)
 {
-  if (!ACC_dev)
+  if (!base_dev)
     gomp_init_targets_once ();
 
   gomp_mutex_lock (&acc_device_lock);
 
-  ACC_dev = _acc_init (d);
+  base_dev = _acc_init (d);
 
   lazy_open (-1);
 
@@ -246,31 +293,52 @@  ialias (acc_init)
 void
 _acc_shutdown (acc_device_t d)
 {
+  struct goacc_thread *walk;
+
   /* We don't check whether d matches the actual device found, because
      OpenACC 2.0 (3.2.12) says the parameters to the init and this
      call must match (for the shutdown call anyway, it's silent on
      others).  */
 
-  if (!ACC_dev)
+  if (!base_dev)
     gomp_fatal ("no device initialized");
-  if (init_key != d)
+  if (d != init_key)
     gomp_fatal ("device %u(%u) is initialized",
-	       (unsigned)init_key, (unsigned)ACC_dev->type);
+		(unsigned) init_key, (unsigned) base_dev->type);
 
-  close_handle ();
+  gomp_mutex_lock (&goacc_thread_lock);
 
-  while (ACC_contexts != NULL)
+  /* Free target-specific TLS data and close all devices.  */
+  for (walk = goacc_threads; walk != NULL; walk = walk->next)
     {
-      struct ACC_context *c = ACC_contexts;
-      ACC_contexts = ACC_contexts->next;
-      free (c);
+      if (walk->target_tls)
+	base_dev->openacc.destroy_thread_data_func (walk->target_tls);
+
+      walk->target_tls = NULL;
+
+      /* This would mean the user is shutting down OpenACC in the middle of an
+         "acc data" pragma.  Likely not intentional.  */
+      if (walk->mapped_data)
+	gomp_fatal ("shutdown in 'acc data' region");
+
+      if (walk->dev)
+	{
+          if (walk->dev->openacc.close_device_func (walk->dev->target_data) < 0)
+	    gomp_fatal ("failed to close device");
+
+	  walk->dev->target_data = NULL;
+
+	  gomp_free_memmap (walk->dev);
+
+	  walk->dev = NULL;
+	}
     }
 
-  gomp_fini_device ((struct gomp_device_descr *) ACC_dev);
+  gomp_mutex_unlock (&goacc_thread_lock);
 
-  ACC_dev = 0;
-  ACC_handle = 0;
-  handle_num = -1;
+  gomp_fini_device ((struct gomp_device_descr *) base_dev);
+
+  base_dev = NULL;
 }
 
 void
@@ -285,32 +353,42 @@  acc_shutdown (acc_device_t d)
 
 ialias (acc_shutdown)
 
+/* This function is called after plugins have been initialized.  It deals with
+   the "base" device, and is used to prepare the runtime for dealing with a
+   number of such devices (as implemented by some particular plugin).  If the
+   argument device type D matches a previous call to the function, return the
+   current base device, else shut the old device down and re-initialize with
+   the new device type.  */
+
 static struct gomp_device_descr const *
 lazy_init (acc_device_t d)
 {
-  if (ACC_dev)
+  if (base_dev)
     {
       /* Re-initializing the same device, do nothing.  */
       if (d == init_key)
-	return ACC_dev;
+	return base_dev;
 
       _acc_shutdown (init_key);
     }
 
-  assert (!ACC_dev);
+  assert (!base_dev);
 
   return _acc_init (d);
 }
 
+/* Ensure that plugins are loaded, initialize and open the (default-numbered)
+   device.  */
+
 static void
 lazy_init_and_open (acc_device_t d)
 {
-  if (!ACC_dev)
+  if (!base_dev)
     gomp_init_targets_once ();
 
   gomp_mutex_lock (&acc_device_lock);
 
-  ACC_dev = lazy_init (d);
+  base_dev = lazy_init (d);
 
   lazy_open (-1);
 
@@ -326,7 +404,7 @@  acc_get_num_devices (acc_device_t d)
   if (d == acc_device_none)
     return 0;
 
-  if (!ACC_dev)
+  if (!base_dev)
     gomp_init_targets_once ();
 
   acc_dev = resolve_device (d);
@@ -356,8 +434,8 @@  acc_get_device_type (void)
   acc_device_t res = acc_device_none;
   const struct gomp_device_descr *dev;
 
-  if (ACC_dev)
-    res = acc_device_type (ACC_dev->type);
+  if (base_dev)
+    res = acc_device_type (base_dev->type);
   else
     {
       gomp_init_targets_once ();
@@ -383,7 +461,7 @@  acc_get_device_num (acc_device_t d)
   if (d >= _ACC_device_hwm)
     gomp_fatal ("device %u out of range", (unsigned)d);
 
-  if (!ACC_dev)
+  if (!base_dev)
     gomp_init_targets_once ();
 
   dev = resolve_device (d);
@@ -407,7 +485,7 @@  acc_set_device_num (int n, acc_device_t d)
   const struct gomp_device_descr *dev;
   int num_devices;
 
-  if (!ACC_dev)
+  if (!base_dev)
     gomp_init_targets_once ();
   
   if ((int) d == 0)
@@ -429,17 +507,22 @@  acc_set_device_num (int n, acc_device_t d)
     }
   else
     {
+      struct goacc_thread *thr = goacc_thread ();
+
       gomp_mutex_lock (&acc_device_lock);
 
-      ACC_dev = lazy_init (d);
+      base_dev = lazy_init (d);
 
-      num_devices = ACC_dev->get_num_devices_func ();
+      num_devices = base_dev->get_num_devices_func ();
 
       if (n >= num_devices)
         gomp_fatal ("device %u out of range", n);
 
-      if (n != handle_num)
-	close_handle ();
+      /* If we're changing the device number, dissociate this thread from
+	 the device (but don't close the device, since it may be in use by
+	 other threads).  */
+      if (thr && thr->dev && n != thr->dev->ord)
+	thr->dev = NULL;
 
       lazy_open (n);
 
@@ -452,7 +535,10 @@  ialias (acc_set_device_num)
 int
 acc_on_device (acc_device_t dev)
 {
-  if (ACC_dev && acc_device_type (ACC_dev->type) == acc_device_host_nonshm)
+  struct goacc_thread *thr = goacc_thread ();
+
+  if (thr && thr->dev
+      && acc_device_type (thr->dev->type) == acc_device_host_nonshm)
     return dev == acc_device_host_nonshm || dev == acc_device_not_host;
     
   /* Just rely on the compiler builtin.  */
@@ -465,27 +551,38 @@  ACC_runtime_initialize (void)
 {
   gomp_mutex_init (&acc_device_lock);
 
-  ACC_contexts = NULL;
+#ifndef HAVE_TLS
+  pthread_key_create (&goacc_tls_key, NULL);
+#endif
+
+  pthread_key_create (&goacc_cleanup_key, goacc_destroy_thread);
+
+  base_dev = NULL;
+
+  goacc_threads = NULL;
+  gomp_mutex_init (&goacc_thread_lock);
 }
 
 /* Compiler helper functions */
 
-static __thread struct gomp_device_descr const *saved_bound_dev;
-
 void
 ACC_save_and_set_bind (acc_device_t d)
 {
-  assert (!saved_bound_dev);
+  struct goacc_thread *thr = goacc_thread ();
+
+  assert (!thr->saved_bound_dev);
 
-  saved_bound_dev = ACC_dev;
-  ACC_dev = dispatchers[d];
+  thr->saved_bound_dev = thr->dev;
+  thr->dev = (struct gomp_device_descr *) dispatchers[d];
 }
 
 void
 ACC_restore_bind (void)
 {
-  ACC_dev = saved_bound_dev;
-  saved_bound_dev = NULL;
+  struct goacc_thread *thr = goacc_thread ();
+
+  thr->dev = thr->saved_bound_dev;
+  thr->saved_bound_dev = NULL;
 }
 
 /* This is called from any OpenACC support function that may need to implicitly
@@ -496,10 +593,12 @@  ACC_restore_bind (void)
 void
 ACC_lazy_initialize (void)
 {
-  if (ACC_dev && ACC_memmap)
+  struct goacc_thread *thr = goacc_thread ();
+
+  if (thr && thr->dev)
     return;
 
-  if (!ACC_dev)
+  if (!base_dev)
     lazy_init_and_open (acc_device_default);
   else
     {
diff --git a/libgomp/oacc-int.h b/libgomp/oacc-int.h
index 03529cc..e3da7b3 100644
--- a/libgomp/oacc-int.h
+++ b/libgomp/oacc-int.h
@@ -47,74 +47,52 @@ 
 # pragma GCC visibility push(hidden)
 #endif
 
-typedef struct ACC_dispatch_t
+static inline enum acc_device_t
+acc_device_type (enum target_type type)
 {
-  /* open or close a device instance.  */
-  void *(*open_device_func) (int n);
-  int (*close_device_func) (void *h);
-
-  /* set or get the device number.  */
-  int (*get_device_num_func) (void);
-  void (*set_device_num_func) (int);
-
-  /* availability */
-  bool (*avail_func) (void);
-
-  /* execute */
-  void (*exec_func) (void (*) (void *), size_t, void **, void **, size_t *,
-		     unsigned short *, int, int, int, int, void *);
-
-  /* async cleanup callback registration */
-  void (*register_async_cleanup_func) (void *);
-
-  /* asynchronous routines  */
-  int (*async_test_func) (int);
-  int (*async_test_all_func) (void);
-  void (*async_wait_func) (int);
-  void (*async_wait_async_func) (int, int);
-  void (*async_wait_all_func) (void);
-  void (*async_wait_all_async_func) (int);
-  void (*async_set_async_func) (int);
-
-  /* NVIDIA target specific routines  */
-  struct {
-    void *(*get_current_device_func) (void);
-    void *(*get_current_context_func) (void);
-    void *(*get_stream_func) (int);
-    int (*set_stream_func) (int, void *);
-  } cuda;
-} ACC_dispatch_t;
-
-typedef enum ACC_dispatch_f
-  {
-    ACC_unified_mem_f = 1 << 0,
-  }
-ACC_dispatch_f;
+  return (enum acc_device_t) type;
+}
+
+struct goacc_thread
+{
+  /* The device for the current thread.  */
+  struct gomp_device_descr *dev;
+  
+  struct gomp_device_descr *saved_bound_dev;
+
+  /* This is a linked list of data mapped by the "acc data" pragma, following
+     strictly push/pop semantics according to lexical scope.  */
+  struct target_mem_desc *mapped_data;
+    
+  /* These structures form a list: this is the next thread in that list.  */
+  struct goacc_thread *next;
+  
+  /* Target-specific data (used by plugin).  */
+  void *target_tls;
+};
+
+#ifdef HAVE_TLS
+extern __thread struct goacc_thread *goacc_tls_data;
+static inline struct goacc_thread *
+goacc_thread (void)
+{
+  return goacc_tls_data;
+}
+#else
+extern pthread_key_t goacc_tls_key;
+static inline struct goacc_thread *
+goacc_thread (void)
+{
+  return pthread_getspecific (goacc_tls_key);
+}
+#endif
 
 struct gomp_device_descr;
 
 void ACC_register (struct gomp_device_descr const *) __GOACC_NOTHROW;
 
-/* Memory routines.  */
-struct memmap_t *ACC_mem_open (void *, struct memmap_t *, int) __GOACC_NOTHROW;
-bool ACC_mem_close (void *, struct memmap_t *) __GOACC_NOTHROW;
-struct gomp_device_descr *ACC_resolve_device(int) __GOACC_NOTHROW;
-
-/* Current dispatcher */
-extern struct gomp_device_descr const *ACC_dev;
-
-/* Device handle for current thread.  */
-extern __thread void *ACC_handle;
-
-typedef struct memmap_t
-{
-  unsigned live;
-  struct target_mem_desc *tlist;
-  struct gomp_memory_mapping mem_map;
-} memmap_t;
-
-/* Memory mapping */
-extern __thread struct memmap_t *ACC_memmap;
+/* Current dispatcher.  */
+extern struct gomp_device_descr const *base_dev;
 
 void ACC_runtime_initialize (void);
 void ACC_save_and_set_bind (acc_device_t);
diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c
index f60599f..d812f72 100644
--- a/libgomp/oacc-mem.c
+++ b/libgomp/oacc-mem.c
@@ -30,70 +30,20 @@ 
 #include "libgomp.h"
 #include "gomp-constants.h"
 #include "target.h"
+#include "oacc-int.h"
 #include <stdio.h>
 #include <stdint.h>
+#include <assert.h>
 
 #include "splay-tree.h"
 
-/* Although this pointer is local to each host thread, it points to a memmap_t
-   that is stored per-context (different host threads may be associated with
-   different contexts, and each context is associated with a physical
-   device).  */
-__thread struct memmap_t *ACC_memmap;
-
-memmap_t *
-ACC_mem_open (void *handle, memmap_t *src, int handle_num)
-{
-  if (!src)
-    {
-      src = gomp_malloc (sizeof (*src));
-      src->live = 0;
-      src->mem_map.splay_tree.root = NULL;
-      src->tlist = NULL;
-      gomp_mutex_init (&src->mem_map.lock);
-      src->mem_map.is_initialized = false;
-    }
-
-  src->live++;
-
-  return src;
-}
-
-bool
-ACC_mem_close (void *handle, memmap_t *mm)
-{
-  bool closed = 0;
-
-  if (!--mm->live)
-    {
-      struct target_mem_desc *t;
-
-      for (t = mm->tlist; t != NULL; t = t->prev)
-        {
-          ACC_dev->device_free_func (t->to_free);
-
-          t->tgt_end = 0;
-          t->to_free = 0;
-
-          gomp_unmap_vars (t, true);
-        }
-
-       closed = 1;
-    }
-
-  gomp_mutex_destroy (&mm->mem_map.lock);
-
-  return closed;
-}
-
 /* Return block containing [H->S), or NULL if not contained.  */
 
 attribute_hidden splay_tree_key
-lookup_host (memmap_t *mm, void *h, size_t s)
+lookup_host (struct gomp_memory_mapping *mem_map, void *h, size_t s)
 {
   struct splay_tree_key_s node;
   splay_tree_key key;
-  struct gomp_memory_mapping *mem_map = &mm->mem_map;
 
   node.host_start = (uintptr_t) h;
   node.host_end = (uintptr_t) h + s;
@@ -113,25 +63,31 @@  lookup_host (memmap_t *mm, void *h, size_t s)
    operation.  */
 
 static splay_tree_key
-lookup_dev (memmap_t *b, void *d, size_t s)
+lookup_dev (struct target_mem_desc *tgt, void *d, size_t s)
 {
   int i;
   struct target_mem_desc *t;
+  struct gomp_memory_mapping *mem_map;
+  
+  if (!tgt)
+    return NULL;
+  
+  mem_map = tgt->mem_map;
 
-  gomp_mutex_lock (&b->mem_map.lock);
+  gomp_mutex_lock (&mem_map->lock);
 
-  for (t = b->tlist; t != NULL; t = t->prev)
+  for (t = tgt; t != NULL; t = t->prev)
     {
       if (t->tgt_start <= (uintptr_t) d && t->tgt_end >= (uintptr_t) d + s)
         break;
     }
 
-  gomp_mutex_unlock (&b->mem_map.lock);
+  gomp_mutex_unlock (&mem_map->lock);
 
   if (!t)
     return NULL;
 
-  for (i = 0; i < t->refcount; i++)
+  for (i = 0; i < t->list_count; i++)
     {
       void * offset;
 
@@ -156,7 +112,7 @@  acc_malloc (size_t s)
 
   ACC_lazy_initialize ();
 
-  return ACC_dev->device_alloc_func (s);
+  return base_dev->device_alloc_func (s);
 }
 
 /* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event
@@ -166,6 +122,7 @@  void
 acc_free (void *d)
 {
   splay_tree_key k;
+  struct goacc_thread *thr = goacc_thread ();
 
   if (!d)
     return;
@@ -173,16 +130,16 @@  acc_free (void *d)
   /* We don't have to call lazy open here, as the ptr value must have
      been returned by acc_malloc.  It's not permitted to pass NULL in
      (unless you got that null from acc_malloc).  */
-  if ((k = lookup_dev (ACC_memmap, d, 1)))
+  if ((k = lookup_dev (thr->dev->openacc.data_environ, d, 1)))
    {
      void *offset;
 
      offset = d - k->tgt->tgt_start + k->tgt_offset;
 
-     acc_unmap_data((void *)(k->host_start + offset));
+     acc_unmap_data ((void *)(k->host_start + offset));
    }
 
-  ACC_dev->device_free_func (d);
+  base_dev->device_free_func (d);
 }
 
 void
@@ -190,7 +147,7 @@  acc_memcpy_to_device (void *d, void *h, size_t s)
 {
   /* No need to call lazy open here, as the device pointer must have
      been obtained from a routine that did that.  */
-  ACC_dev->device_host2dev_func (d, h, s);
+  base_dev->device_host2dev_func (d, h, s);
 }
 
 void
@@ -198,7 +155,7 @@  acc_memcpy_from_device (void *h, void *d, size_t s)
 {
   /* No need to call lazy open here, as the device pointer must have
      been obtained from a routine that did that.  */
-  ACC_dev->device_dev2host_func (h, d, s);
+  base_dev->device_dev2host_func (h, d, s);
 }
 
 /* Return the device pointer that corresponds to host data H.  Or NULL
@@ -213,7 +170,9 @@  acc_deviceptr (void *h)
 
   ACC_lazy_initialize ();
 
-  n = lookup_host (ACC_memmap, h, 1);
+  struct goacc_thread *thr = goacc_thread ();
+
+  n = lookup_host (&thr->dev->mem_map, h, 1);
 
   if (!n)
     return NULL;
@@ -237,7 +196,9 @@  acc_hostptr (void *d)
 
   ACC_lazy_initialize ();
 
-  n = lookup_dev (ACC_memmap, d, 1);
+  struct goacc_thread *thr = goacc_thread ();
+
+  n = lookup_dev (thr->dev->openacc.data_environ, d, 1);
 
   if (!n)
     return NULL;
@@ -261,10 +222,14 @@  acc_is_present (void *h, size_t s)
 
   ACC_lazy_initialize ();
 
-  n = lookup_host (ACC_memmap, h, s);
+  struct goacc_thread *thr = goacc_thread ();
+  struct gomp_device_descr *acc_dev = thr->dev;
+
+  n = lookup_host (&acc_dev->mem_map, h, s);
 
-  if (n && (((uintptr_t)h < n->host_start) ||
-	((uintptr_t)h + s > n->host_end) || (s > n->host_end - n->host_start)))
+  if (n && ((uintptr_t)h < n->host_start
+	    || (uintptr_t)h + s > n->host_end
+	    || s > n->host_end - n->host_start))
     n = NULL;
 
   return n != NULL;
@@ -284,7 +249,10 @@  acc_map_data (void *h, void *d, size_t s)
 
   ACC_lazy_initialize ();
 
-  if (ACC_dev->capabilities & TARGET_CAP_SHARED_MEM)
+  struct goacc_thread *thr = goacc_thread ();
+  struct gomp_device_descr *acc_dev = thr->dev;
+
+  if (acc_dev->capabilities & TARGET_CAP_SHARED_MEM)
     {
       if (d != h)
         gomp_fatal ("cannot map data on shared-memory system");
@@ -293,35 +261,39 @@  acc_map_data (void *h, void *d, size_t s)
     }
   else
     {
+      struct goacc_thread *thr = goacc_thread ();
+
       if (!d || !h || !s)
 	gomp_fatal ("[%p,+%d]->[%p,+%d] is a bad map",
                     (void *)h, (int)s, (void *)d, (int)s);
 
-      if (lookup_host (ACC_memmap, h, s))
+      if (lookup_host (&acc_dev->mem_map, h, s))
 	gomp_fatal ("host address [%p, +%d] is already mapped", (void *)h,
 		    (int)s);
 
-      if (lookup_dev (ACC_memmap, d, s))
+      if (lookup_dev (thr->dev->openacc.data_environ, d, s))
 	gomp_fatal ("device address [%p, +%d] is already mapped", (void *)d,
 		    (int)s);
 
-      tgt = gomp_map_vars ((struct gomp_device_descr *) ACC_dev,
-			   &ACC_memmap->mem_map, mapnum, &hostaddrs,
+      tgt = gomp_map_vars ((struct gomp_device_descr *) acc_dev,
+			   &acc_dev->mem_map, mapnum, &hostaddrs,
 			   &devaddrs, &sizes, &kinds, true, false);
     }
 
-  tgt->prev = ACC_memmap->tlist;
-  ACC_memmap->tlist = tgt;
+  tgt->prev = acc_dev->openacc.data_environ;
+  acc_dev->openacc.data_environ = tgt;
 }
 
 void
 acc_unmap_data (void *h)
 {
-  /* No need to call lazy open, as the address must have been mapped.
-   */
+  struct goacc_thread *thr = goacc_thread ();
+  struct gomp_device_descr *acc_dev = thr->dev;
+
+  /* No need to call lazy open, as the address must have been mapped.  */
 
   size_t host_size;
-  splay_tree_key n = lookup_host (ACC_memmap, h, 1);
+  splay_tree_key n = lookup_host (&acc_dev->mem_map, h, 1);
   struct target_mem_desc *t;
 
   if (!n)
@@ -331,7 +303,7 @@  acc_unmap_data (void *h)
 
   if (n->host_start != (uintptr_t) h)
     gomp_fatal ("[%p,%d] surrounds1 %p",
-            (void *)n->host_start, (int)host_size, (void *)h);
+        	(void *) n->host_start, (int) host_size, (void *) h);
 
   t = n->tgt;
 
@@ -345,24 +317,23 @@  acc_unmap_data (void *h)
       t->tgt_end = 0;
       t->to_free = 0;
 
-      gomp_mutex_lock (&ACC_memmap->mem_map.lock);
+      gomp_mutex_lock (&acc_dev->mem_map.lock);
 
-      for (tp = NULL, t = ACC_memmap->tlist; t != NULL; tp = t, t = t->prev)
-        {
-          if (n->tgt == t)
-            {
-              if (tp)
-                tp->prev = t->prev;
-              else
-                ACC_memmap->tlist = t->prev;
+      for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL;
+	   tp = t, t = t->prev)
+        if (n->tgt == t)
+          {
+            if (tp)
+              tp->prev = t->prev;
+            else
+              acc_dev->openacc.data_environ = t->prev;
 
-              break; 
-            }
-        }
+            break; 
+          }
 
-      gomp_mutex_unlock (&ACC_memmap->mem_map.lock);
+      gomp_mutex_unlock (&acc_dev->mem_map.lock);
     }
-
+  
   gomp_unmap_vars (t, true);
 }
 
@@ -381,7 +352,10 @@  present_create_copy (unsigned f, void *h, size_t s)
 
   ACC_lazy_initialize ();
 
-  n = lookup_host (ACC_memmap, h, s);
+  struct goacc_thread *thr = goacc_thread ();
+  struct gomp_device_descr *acc_dev = thr->dev;
+
+  n = lookup_host (&acc_dev->mem_map, h, s);
   if (n)
     {
       /* Present. */
@@ -409,13 +383,17 @@  present_create_copy (unsigned f, void *h, size_t s)
       else
         kinds = GOMP_MAP_ALLOC;
 
-      tgt = gomp_map_vars ((struct gomp_device_descr *) ACC_dev,
-			   &ACC_memmap->mem_map, mapnum, &hostaddrs,
+      tgt = gomp_map_vars ((struct gomp_device_descr *) acc_dev,
+			   &acc_dev->mem_map, mapnum, &hostaddrs,
 			   NULL, &s, &kinds, true, false);
 
+      gomp_mutex_lock (&acc_dev->mem_map.lock);
+
       d = tgt->to_free;
-      tgt->prev = ACC_memmap->tlist;
-      ACC_memmap->tlist = tgt;
+      tgt->prev = acc_dev->openacc.data_environ;
+      acc_dev->openacc.data_environ = tgt;
+
+      gomp_mutex_unlock (&acc_dev->mem_map.lock);
     }
   
   return d;
@@ -453,8 +431,10 @@  delete_copyout (unsigned f, void *h, size_t s)
   size_t host_size;
   splay_tree_key n;
   void *d;
+  struct goacc_thread *thr = goacc_thread ();
+  struct gomp_device_descr *acc_dev = thr->dev;
 
-  n = lookup_host (ACC_memmap, h, s);
+  n = lookup_host (&acc_dev->mem_map, h, s);
 
   /* No need to call lazy open, as the data must already have been
      mapped.  */
@@ -468,14 +448,14 @@  delete_copyout (unsigned f, void *h, size_t s)
 
   if (n->host_start != (uintptr_t) h || host_size != s)
     gomp_fatal ("[%p,%d] surrounds2 [%p,+%d]",
-            (void *)n->host_start, (int)host_size, (void *)h, (int)s);
+        	(void *) n->host_start, (int) host_size, (void *) h, (int) s);
 
   if (f & DC_Copyout)
-    ACC_dev->device_dev2host_func (h, d, s);
+    acc_dev->device_dev2host_func (h, d, s);
   
-  acc_unmap_data(h);
+  acc_unmap_data (h);
 
-  ACC_dev->device_free_func (d);
+  acc_dev->device_free_func (d);
 }
 
 void
@@ -494,11 +474,10 @@  update_dev_host (int is_dev, void *h, size_t s)
 {
   splay_tree_key n;
   void *d;
+  struct goacc_thread *thr = goacc_thread ();
+  struct gomp_device_descr *acc_dev = thr->dev;
 
-  if (!ACC_memmap)
-    gomp_fatal ("[%p,%d] is not mapped", h, (int)s);
-
-  n = lookup_host (ACC_memmap, h, s);
+  n = lookup_host (&acc_dev->mem_map, h, s);
 
   /* No need to call lazy open, as the data must already have been
      mapped.  */
@@ -509,10 +488,9 @@  update_dev_host (int is_dev, void *h, size_t s)
   d = (void *) (n->tgt->tgt_start + n->tgt_offset);
 
   if (is_dev)
-    ACC_dev->device_host2dev_func (d, h, s);
+    acc_dev->device_host2dev_func (d, h, s);
   else
-    ACC_dev->device_dev2host_func (h, d, s);
-
+    acc_dev->device_dev2host_func (h, d, s);
 }
 
 void
diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c
index e3f156c..b787df7 100644
--- a/libgomp/oacc-parallel.c
+++ b/libgomp/oacc-parallel.c
@@ -30,43 +30,15 @@ 
 #include "libgomp_g.h"
 #include "gomp-constants.h"
 #include "target.h"
+#include "oacc-int.h"
 #include <stdio.h>
 #include <string.h>
 #include <stdarg.h>
 #include <assert.h>
 #include <alloca.h>
 
-#ifdef FUTURE
-// device geometry per device type
-struct devgeom
-{
-  int gangs;
-  int workers;
-  int vectors;
-};
-  
-
-// XXX: acceptable defaults?
-static __thread struct devgeom devgeom = { 1, 1, 1 };
-#endif
-
-#ifdef LATER
-static void
-dump_devaddrs(void)
-{
-  int i;
-  struct devaddr *dp;
-
-  gomp_notify("++++ num_devaddrs %d\n", num_devaddrs);
-  for (dp = devaddrs, i = 1; dp != 0; dp = dp->next, i++)
-    {
-      gomp_notify("++++ %.02d) %p\n", i, dp->d);
-    }
-}
-#endif
-
 static void
-dump_var(char *s, size_t idx, void *hostaddr, size_t size, unsigned char kind)
+dump_var (char *s, size_t idx, void *hostaddr, size_t size, unsigned char kind)
 {
   gomp_notify(" %2zi: %3s 0x%.2x -", idx, s, kind & 0xff);
 
@@ -108,6 +80,8 @@  dump_var(char *s, size_t idx, void *hostaddr, size_t size, unsigned char kind)
 attribute_hidden void
 select_acc_device (int device_type)
 {
+  ACC_lazy_initialize ();
+
   if (device_type == GOMP_IF_CLAUSE_FALSE)
     return;
 
@@ -121,8 +95,6 @@  select_acc_device (int device_type)
 	 know what they're doing...  */
       acc_set_device_type (device_type);
     }
-
-  ACC_lazy_initialize ();
 }
 
 void goacc_wait (int async, int num_waits, va_list ap);
@@ -136,6 +108,8 @@  GOACC_parallel (int device, void (*fn) (void *), const void *openmp_target,
 {
   bool if_clause_condition_value = device != GOMP_IF_CLAUSE_FALSE;
   va_list ap;
+  struct goacc_thread *thr;
+  struct gomp_device_descr *acc_dev;
   struct target_mem_desc *tgt;
   void **devaddrs;
   unsigned int i;
@@ -155,6 +129,9 @@  GOACC_parallel (int device, void (*fn) (void *), const void *openmp_target,
 
   select_acc_device (device);
 
+  thr = goacc_thread ();
+  acc_dev = thr->dev;
+
   /* Host fallback if "if" clause is false or if the current device is set to
      the host.  */
   if (!if_clause_condition_value)
@@ -164,7 +141,7 @@  GOACC_parallel (int device, void (*fn) (void *), const void *openmp_target,
       ACC_restore_bind ();
       return;
     }
-  else if (acc_device_type (ACC_dev->type) == acc_device_host)
+  else if (acc_device_type (acc_dev->type) == acc_device_host)
     {
       fn (hostaddrs);
       return;
@@ -177,15 +154,15 @@  GOACC_parallel (int device, void (*fn) (void *), const void *openmp_target,
 
   va_end (ap);
 
-  ACC_dev->openacc.async_set_async_func (async);
+  acc_dev->openacc.async_set_async_func (async);
 
-  if (!(ACC_dev->capabilities & TARGET_CAP_NATIVE_EXEC))
+  if (!(acc_dev->capabilities & TARGET_CAP_NATIVE_EXEC))
     {
       k.host_start = (uintptr_t) fn;
       k.host_end = k.host_start + 1;
-      gomp_mutex_lock (&ACC_memmap->mem_map.lock);
-      tgt_fn_key = splay_tree_lookup (&ACC_memmap->mem_map.splay_tree, &k);
-      gomp_mutex_unlock (&ACC_memmap->mem_map.lock);
+      gomp_mutex_lock (&acc_dev->mem_map.lock);
+      tgt_fn_key = splay_tree_lookup (&acc_dev->mem_map.splay_tree, &k);
+      gomp_mutex_unlock (&acc_dev->mem_map.lock);
 
       if (tgt_fn_key == NULL)
 	gomp_fatal ("target function wasn't mapped: perhaps -fopenacc was "
@@ -196,8 +173,8 @@  GOACC_parallel (int device, void (*fn) (void *), const void *openmp_target,
   else
     tgt_fn = (void (*)) fn;
 
-  tgt = gomp_map_vars ((struct gomp_device_descr *) ACC_dev,
-		       &ACC_memmap->mem_map, mapnum, hostaddrs,
+  tgt = gomp_map_vars ((struct gomp_device_descr *) acc_dev,
+		       &acc_dev->mem_map, mapnum, hostaddrs,
 		       NULL, sizes, kinds, true, false);
 
   devaddrs = alloca (sizeof (void *) * mapnum);
@@ -205,7 +182,7 @@  GOACC_parallel (int device, void (*fn) (void *), const void *openmp_target,
     devaddrs[i] = (void *) (tgt->list[i]->tgt->tgt_start
 			    + tgt->list[i]->tgt_offset);
 
-  ACC_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs, sizes, kinds,
+  acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs, sizes, kinds,
 			      num_gangs, num_workers, vector_length, async,
 			      tgt);
 
@@ -215,14 +192,12 @@  GOACC_parallel (int device, void (*fn) (void *), const void *openmp_target,
   else
     {
       gomp_copy_from_async (tgt);
-      ACC_dev->openacc.register_async_cleanup_func (tgt);
+      acc_dev->openacc.register_async_cleanup_func (tgt);
     }
 
-  ACC_dev->openacc.async_set_async_func (acc_async_sync);
+  acc_dev->openacc.async_set_async_func (acc_async_sync);
 }
 
-static __thread struct target_mem_desc *mapped_data = NULL;
-
 void
 GOACC_data_start (int device, const void *openmp_target, size_t mapnum,
 		  void **hostaddrs, size_t *sizes, unsigned short *kinds)
@@ -235,33 +210,37 @@  GOACC_data_start (int device, const void *openmp_target, size_t mapnum,
 
   select_acc_device (device);
 
+  struct goacc_thread *thr = goacc_thread ();
+  struct gomp_device_descr *acc_dev = thr->dev;
+
   /* Host fallback or 'do nothing'.  */
-  if ((ACC_dev->capabilities & TARGET_CAP_SHARED_MEM)
+  if ((acc_dev->capabilities & TARGET_CAP_SHARED_MEM)
       || !if_clause_condition_value)
     {
       tgt = gomp_map_vars (NULL, NULL, 0, NULL, NULL, NULL, NULL, true, false);
-      tgt->prev = mapped_data;
-      mapped_data = tgt;
+      tgt->prev = thr->mapped_data;
+      thr->mapped_data = tgt;
 
       return;
     }
 
   gomp_notify ("  %s: prepare mappings\n", __FUNCTION__);
-  tgt = gomp_map_vars ((struct gomp_device_descr *) ACC_dev,
-		       &ACC_memmap->mem_map, mapnum, hostaddrs,
+  tgt = gomp_map_vars ((struct gomp_device_descr *) acc_dev,
+		       &acc_dev->mem_map, mapnum, hostaddrs,
 		       NULL, sizes, kinds, true, false);
   gomp_notify ("  %s: mappings prepared\n", __FUNCTION__);
-  tgt->prev = mapped_data;
-  mapped_data = tgt;
+  tgt->prev = thr->mapped_data;
+  thr->mapped_data = tgt;
 }
 
 void
 GOACC_data_end (void)
 {
-  struct target_mem_desc *tgt = mapped_data;
+  struct goacc_thread *thr = goacc_thread ();
+  struct target_mem_desc *tgt = thr->mapped_data;
 
   gomp_notify ("  %s: restore mappings\n", __FUNCTION__);
-  mapped_data = tgt->prev;
+  thr->mapped_data = tgt->prev;
   gomp_unmap_vars (tgt, true);
   gomp_notify ("  %s: mappings restored\n", __FUNCTION__);
 }
@@ -296,6 +275,8 @@  GOACC_kernels (int device, void (*fn) (void *), const void *openmp_target,
 void
 goacc_wait (int async, int num_waits, va_list ap)
 {
+  struct goacc_thread *thr = goacc_thread ();
+  struct gomp_device_descr *acc_dev = thr->dev;
   int i;
 
   assert (num_waits >= 0);
@@ -322,7 +303,7 @@  goacc_wait (int async, int num_waits, va_list ap)
 
   if (async == acc_async_noval && num_waits == 0)
     {
-      ACC_dev->openacc.async_wait_all_async_func (acc_async_noval);
+      acc_dev->openacc.async_wait_all_async_func (acc_async_noval);
       return;
     }
 
@@ -337,7 +318,7 @@  goacc_wait (int async, int num_waits, va_list ap)
          the queue itself will order work as required, so there's no need to
 	 wait explicitly.  */
       if (qid != async)
-	ACC_dev->openacc.async_wait_async_func (qid, async);
+	acc_dev->openacc.async_wait_async_func (qid, async);
     }
 }
 
@@ -351,7 +332,10 @@  GOACC_update (int device, const void *openmp_target, size_t mapnum,
 
   select_acc_device (device);
 
-  if ((ACC_dev->capabilities & TARGET_CAP_SHARED_MEM)
+  struct goacc_thread *thr = goacc_thread ();
+  struct gomp_device_descr *acc_dev = thr->dev;
+
+  if ((acc_dev->capabilities & TARGET_CAP_SHARED_MEM)
       || !if_clause_condition_value)
     return;
 
@@ -366,34 +350,34 @@  GOACC_update (int device, const void *openmp_target, size_t mapnum,
       va_end (ap);
     }
 
-  ACC_dev->openacc.async_set_async_func (async);
+  acc_dev->openacc.async_set_async_func (async);
 
   for (i = 0; i < mapnum; ++i)
     {
       unsigned char kind = kinds[i] & 0xff;
 
-      dump_var("UPD", i, hostaddrs[i], sizes[i], kinds[i]);
+      dump_var ("UPD", i, hostaddrs[i], sizes[i], kinds[i]);
 
       switch (kind)
 	{
-	  case GOMP_MAP_POINTER:
-	     break;
+	case GOMP_MAP_POINTER:
+	  break;
 
-	  case GOMP_MAP_FORCE_TO:
-	     acc_update_device (hostaddrs[i], sizes[i]);
-	     break;
+	case GOMP_MAP_FORCE_TO:
+	  acc_update_device (hostaddrs[i], sizes[i]);
+	  break;
 
-	  case GOMP_MAP_FORCE_FROM:
-	     acc_update_self (hostaddrs[i], sizes[i]);
-	     break;
+	case GOMP_MAP_FORCE_FROM:
+	  acc_update_self (hostaddrs[i], sizes[i]);
+	  break;
 
-	  default:
-	     gomp_fatal (">>>> GOACC_update UNHANDLED kind 0x%.2x", kind);
-	     break;
+	default:
+	  gomp_fatal (">>>> GOACC_update UNHANDLED kind 0x%.2x", kind);
+	  break;
 	}
     }
 
-  ACC_dev->openacc.async_set_async_func (acc_async_sync);
+  acc_dev->openacc.async_set_async_func (acc_async_sync);
 }
 
 void
diff --git a/libgomp/oacc-plugin.c b/libgomp/oacc-plugin.c
index 6abb43c..67c452b 100644
--- a/libgomp/oacc-plugin.c
+++ b/libgomp/oacc-plugin.c
@@ -28,13 +28,7 @@ 
 #include "libgomp.h"
 #include "oacc-plugin.h"
 #include "target.h"
-
-void
-ACC_plugin_register (struct gomp_device_descr *device)
-{
-  ACC_register (device);
-}
-
+#include "oacc-int.h"
 
 void
 GOMP_PLUGIN_async_unmap_vars (void *ptr)
@@ -43,3 +37,12 @@  GOMP_PLUGIN_async_unmap_vars (void *ptr)
   
   gomp_unmap_vars (tgt, false);
 }
+
+/* Return the target-specific part of the TLS data for the current thread.  */
+
+void *
+GOMP_PLUGIN_acc_thread (void)
+{
+  struct goacc_thread *thr = goacc_thread ();
+  return thr ? thr->target_tls : NULL;
+}
diff --git a/libgomp/oacc-plugin.h b/libgomp/oacc-plugin.h
index ca919f9..d05a28f 100644
--- a/libgomp/oacc-plugin.h
+++ b/libgomp/oacc-plugin.h
@@ -26,8 +26,7 @@ 
 #ifndef _OACC_PLUGIN_H
 #define _OACC_PLUGIN_H 1
 
-#include "target.h"
-
-extern void ACC_plugin_register (struct gomp_device_descr *dev);
+extern void GOMP_PLUGIN_async_unmap_vars (void *ptr);
+extern void *GOMP_PLUGIN_acc_thread (void);
 
 #endif
diff --git a/libgomp/plugin-nvptx.c b/libgomp/plugin-nvptx.c
index 33f868a..8ed81ec 100644
--- a/libgomp/plugin-nvptx.c
+++ b/libgomp/plugin-nvptx.c
@@ -38,6 +38,7 @@ 
 #include "libgomp.h"
 #include "target.h"
 #include "libgomp-plugin.h"
+#include "oacc-plugin.h"
 
 #include <cuda.h>
 #include <stdint.h>
@@ -151,8 +152,13 @@  struct PTX_stream
   struct PTX_stream *next;
 };
 
-/* Each thread may select a stream (also specific to a device/context).  */
-static __thread struct PTX_stream *current_stream;
+/* Thread-specific data for PTX.  */
+
+struct nvptx_thread
+{
+  struct PTX_stream *current_stream;
+  struct PTX_device *ptx_dev;
+};
 
 struct map
 {
@@ -307,9 +313,6 @@  struct PTX_device
   struct PTX_device *next;
 };
 
-static __thread struct PTX_device *PTX_dev;
-static struct PTX_device *PTX_devices;
-
 enum PTX_event_type
 {
   PTX_EVT_MEM,
@@ -399,6 +402,12 @@  verify_device_library (void)
   return 0;
 }
 
+static inline struct nvptx_thread *
+nvptx_thread (void)
+{
+  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
+}
+
 static void
 init_streams_for_device (struct PTX_device *ptx_dev, int concurrency)
 {
@@ -460,8 +469,9 @@  static struct PTX_stream *
 select_stream_for_async (int async, pthread_t thread, bool create,
 			 CUstream existing)
 {
+  struct nvptx_thread *nvthd = nvptx_thread ();
   /* Local copy of TLS variable.  */
-  struct PTX_device *ptx_dev = PTX_dev;
+  struct PTX_device *ptx_dev = nvthd->ptx_dev;
   struct PTX_stream *stream = NULL;
   int orig_async = async;
   
@@ -591,7 +601,6 @@  PTX_init (void)
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuInit error: %s", cuErrorMsg (r));
 
-  PTX_devices = NULL;
   PTX_events = NULL;
 
   GOMP_PLUGIN_mutex_init (&PTX_event_lock);
@@ -612,133 +621,93 @@  PTX_fini (void)
 static void *
 PTX_open_device (int n)
 {
+  struct PTX_device *ptx_dev;
   CUdevice dev;
   CUresult r;
   int async_engines, pi;
 
-  if (PTX_devices)
-    {
-      struct PTX_device *ptx_device;
-
-      for (ptx_device = PTX_devices;
-	   ptx_device != NULL;
-	   ptx_device = ptx_device->next)
-        {
-          if (ptx_device->ord == n)
-            {
-              PTX_dev = ptx_device;
-
-              if (PTX_dev->ctx)
-                {
-                  r = cuCtxPushCurrent (PTX_dev->ctx);
-                  if (r != CUDA_SUCCESS)
-                    GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s",
-				       cuErrorMsg (r));
-                }
-
-              return (void *)PTX_dev;
-            }
-        }
-    }
-
   r = cuDeviceGet (&dev, n);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuErrorMsg (r));
 
-  PTX_dev = GOMP_PLUGIN_malloc (sizeof (struct PTX_device));
-  PTX_dev->ord = n;
-  PTX_dev->dev = dev;
-  PTX_dev->ctx_shared = false;
+  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct PTX_device));
 
-  PTX_dev->next = PTX_devices;
-  PTX_devices = PTX_dev;
+  ptx_dev->ord = n;
+  ptx_dev->dev = dev;
+  ptx_dev->ctx_shared = false;
 
-  r = cuCtxGetCurrent (&PTX_dev->ctx);
+  r = cuCtxGetCurrent (&ptx_dev->ctx);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuErrorMsg (r));
 
-  if (!PTX_dev->ctx)
+  if (!ptx_dev->ctx)
     {
-      r = cuCtxCreate (&PTX_dev->ctx, CU_CTX_SCHED_AUTO, dev);
+      r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
       if (r != CUDA_SUCCESS)
 	GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuErrorMsg (r));
     }
   else
-    {
-      PTX_dev->ctx_shared = true;
-    }
-   
+    ptx_dev->ctx_shared = true;
+
   r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));
 
-  PTX_dev->overlap = pi;
+  ptx_dev->overlap = pi;
 
   r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));
 
-  PTX_dev->map = pi;
+  ptx_dev->map = pi;
 
   r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));
 
-  PTX_dev->concur = pi;
+  ptx_dev->concur = pi;
 
   r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));
 
-  PTX_dev->mode = pi;
+  ptx_dev->mode = pi;
 
   r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));
 
-  PTX_dev->mkern = pi;
+  ptx_dev->mkern = pi;
 
   r = cuDeviceGetAttribute (&async_engines,
 			    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
   if (r != CUDA_SUCCESS)
     async_engines = 1;
 
-  init_streams_for_device (PTX_dev, async_engines);
-
-  current_stream = PTX_dev->null_stream;
+  init_streams_for_device (ptx_dev, async_engines);
 
-  return (void *)PTX_dev;
+  return (void *) ptx_dev;
 }
 
 static int
-PTX_close_device (void *h __attribute__((unused)))
+PTX_close_device (void *targ_data)
 {
   CUresult r;
+  struct PTX_device *ptx_dev = targ_data;
 
-  if (!PTX_dev)
+  if (!ptx_dev)
     return 0;
   
-  fini_streams_for_device (PTX_dev);
+  fini_streams_for_device (ptx_dev);
 
-  if (!PTX_dev->ctx_shared)
+  if (!ptx_dev->ctx_shared)
     {
-      r = cuCtxDestroy (PTX_dev->ctx);
+      r = cuCtxDestroy (ptx_dev->ctx);
       if (r != CUDA_SUCCESS)
 	GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuErrorMsg (r));
     }
 
-  if (PTX_devices == PTX_dev)
-    PTX_devices = PTX_devices->next;
-  else
-    {
-      struct PTX_device* d = PTX_devices;
-      while (d->next != PTX_dev)
-	d = d->next;
-      d->next = d->next->next;
-    }
-  free (PTX_dev);
-
-  PTX_dev = NULL;
+  free (ptx_dev);
 
   return 0;
 }
@@ -749,7 +718,12 @@  PTX_get_num_devices (void)
   int n;
   CUresult r;
 
-  assert (PTX_inited);
+  /* This function will be called before the plugin has been initialized in
+     order to enumerate available devices, but CUDA API routines can't be used
+     until cuInit has been called.  Just call it now (but don't yet do any
+     further initialization).  */
+  if (!PTX_inited)
+    cuInit (0);
 
   r = cuDeviceGetCount (&n);
   if (r!= CUDA_SUCCESS)
@@ -927,6 +901,7 @@  static void
 event_gc (bool memmap_lockable)
 {
   struct PTX_event *ptx_event = PTX_events;
+  struct nvptx_thread *nvthd = nvptx_thread ();
 
   GOMP_PLUGIN_mutex_lock (&PTX_event_lock);
 
@@ -937,7 +912,7 @@  event_gc (bool memmap_lockable)
 
       ptx_event = ptx_event->next;
 
-      if (e->ord != PTX_dev->ord)
+      if (e->ord != nvthd->ptx_dev->ord)
 	continue;
 
       r = cuEventQuery (*e->evt);
@@ -996,6 +971,7 @@  static void
 event_add (enum PTX_event_type type, CUevent *e, void *h)
 {
   struct PTX_event *ptx_event;
+  struct nvptx_thread *nvthd = nvptx_thread ();
 
   assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
 	  || type == PTX_EVT_ASYNC_CLEANUP);
@@ -1004,7 +980,7 @@  event_add (enum PTX_event_type type, CUevent *e, void *h)
   ptx_event->type = type;
   ptx_event->evt = e;
   ptx_event->addr = h;
-  ptx_event->ord = PTX_dev->ord;
+  ptx_event->ord = nvthd->ptx_dev->ord;
 
   GOMP_PLUGIN_mutex_lock (&PTX_event_lock);
 
@@ -1027,11 +1003,12 @@  PTX_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
   void *kargs[1];
   void *hp, *dp;
   unsigned int nthreads_in_block;
+  struct nvptx_thread *nvthd = nvptx_thread ();
 
   function = targ_fn->fn;
   
   dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
-  assert (dev_str == current_stream);
+  assert (dev_str == nvthd->current_stream);
 
   /* This reserves a chunk of a pre-allocated page of memory mapped on both
      the host and the device. HP is a host pointer to the new chunk, and DP is
@@ -1166,6 +1143,7 @@  PTX_host2dev (void *d, const void *h, size_t s)
   CUresult r;
   CUdeviceptr pb;
   size_t ps;
+  struct nvptx_thread *nvthd = nvptx_thread ();
 
   if (!s)
     return 0;
@@ -1190,7 +1168,7 @@  PTX_host2dev (void *d, const void *h, size_t s)
     GOMP_PLUGIN_fatal ("invalid size");
 
 #ifndef DISABLE_ASYNC
-  if (current_stream != PTX_dev->null_stream)
+  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
     {
       CUevent *e;
 
@@ -1202,11 +1180,12 @@  PTX_host2dev (void *d, const void *h, size_t s)
 
       event_gc (false);
 
-      r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s, current_stream->stream);
+      r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
+			     nvthd->current_stream->stream);
       if (r != CUDA_SUCCESS)
         GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuErrorMsg (r));
 
-      r = cuEventRecord (*e, current_stream->stream);
+      r = cuEventRecord (*e, nvthd->current_stream->stream);
       if (r != CUDA_SUCCESS)
         GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
 
@@ -1229,6 +1208,7 @@  PTX_dev2host (void *h, const void *d, size_t s)
   CUresult r;
   CUdeviceptr pb;
   size_t ps;
+  struct nvptx_thread *nvthd = nvptx_thread ();
 
   if (!s)
     return 0;
@@ -1253,7 +1233,7 @@  PTX_dev2host (void *h, const void *d, size_t s)
     GOMP_PLUGIN_fatal ("invalid size");
 
 #ifndef DISABLE_ASYNC
-  if (current_stream != PTX_dev->null_stream)
+  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
     {
       CUevent *e;
 
@@ -1265,11 +1245,12 @@  PTX_dev2host (void *h, const void *d, size_t s)
 
       event_gc (false);
 
-      r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s, current_stream->stream);
+      r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
+			     nvthd->current_stream->stream);
       if (r != CUDA_SUCCESS)
         GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuErrorMsg (r));
 
-      r = cuEventRecord (*e, current_stream->stream);
+      r = cuEventRecord (*e, nvthd->current_stream->stream);
       if (r != CUDA_SUCCESS)
         GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
 
@@ -1289,7 +1270,9 @@  PTX_dev2host (void *h, const void *d, size_t s)
 static void
 PTX_set_async (int async)
 {
-  current_stream = select_stream_for_async (async, pthread_self (), true, NULL);
+  struct nvptx_thread *nvthd = nvptx_thread ();
+  nvthd->current_stream
+    = select_stream_for_async (async, pthread_self (), true, NULL);
 }
 
 static int
@@ -1327,20 +1310,21 @@  PTX_async_test_all (void)
 {
   struct PTX_stream *s;
   pthread_t self = pthread_self ();
+  struct nvptx_thread *nvthd = nvptx_thread ();
 
-  GOMP_PLUGIN_mutex_lock (&PTX_dev->stream_lock);
+  GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock);
 
-  for (s = PTX_dev->active_streams; s != NULL; s = s->next)
+  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
     {
       if ((s->multithreaded || pthread_equal (s->host_thread, self))
 	  && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
 	{
-	  GOMP_PLUGIN_mutex_unlock (&PTX_dev->stream_lock);
+	  GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
 	  return 0;
 	}
     }
 
-  GOMP_PLUGIN_mutex_unlock (&PTX_dev->stream_lock);
+  GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
 
   event_gc (true);
 
@@ -1409,12 +1393,13 @@  PTX_wait_all (void)
   CUresult r;
   struct PTX_stream *s;
   pthread_t self = pthread_self ();
+  struct nvptx_thread *nvthd = nvptx_thread ();
 
-  GOMP_PLUGIN_mutex_lock (&PTX_dev->stream_lock);
+  GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock);
 
   /* Wait for active streams initiated by this thread (or by multiple threads)
      to complete.  */
-  for (s = PTX_dev->active_streams; s != NULL; s = s->next)
+  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
     {
       if (s->multithreaded || pthread_equal (s->host_thread, self))
         {
@@ -1430,7 +1415,7 @@  PTX_wait_all (void)
 	}
     }
 
-  GOMP_PLUGIN_mutex_unlock (&PTX_dev->stream_lock);
+  GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
 
   event_gc (true);
 }
@@ -1441,6 +1426,7 @@  PTX_wait_all_async (int async)
   CUresult r;
   struct PTX_stream *waiting_stream, *other_stream;
   CUevent *e;
+  struct nvptx_thread *nvthd = nvptx_thread ();
   pthread_t self = pthread_self ();
   
   /* The stream doing the waiting.  This could be the first mention of the
@@ -1450,14 +1436,14 @@  PTX_wait_all_async (int async)
   
   /* Launches on the null stream already block on other streams in the
      context.  */
-  if (!waiting_stream || waiting_stream == PTX_dev->null_stream)
+  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
     return;
 
   event_gc (true);
 
-  GOMP_PLUGIN_mutex_lock (&PTX_dev->stream_lock);
+  GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock);
 
-  for (other_stream = PTX_dev->active_streams;
+  for (other_stream = nvthd->ptx_dev->active_streams;
        other_stream != NULL;
        other_stream = other_stream->next)
     {
@@ -1483,33 +1469,38 @@  PTX_wait_all_async (int async)
 	GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuErrorMsg (r));
    }
 
-  GOMP_PLUGIN_mutex_unlock (&PTX_dev->stream_lock);
+  GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
 }
 
 static void *
 PTX_get_current_cuda_device (void)
 {
-  if (!PTX_dev)
+  struct nvptx_thread *nvthd = nvptx_thread ();
+
+  if (!nvthd || !nvthd->ptx_dev)
     return NULL;
 
-  return &PTX_dev->dev;
+  return &nvthd->ptx_dev->dev;
 }
 
 static void *
 PTX_get_current_cuda_context (void)
 {
-  if (!PTX_dev)
+  struct nvptx_thread *nvthd = nvptx_thread ();
+
+  if (!nvthd || !nvthd->ptx_dev)
     return NULL;
 
-  return PTX_dev->ctx;
+  return nvthd->ptx_dev->ctx;
 }
 
 static void *
 PTX_get_cuda_stream (int async)
 {
   struct PTX_stream *s;
+  struct nvptx_thread *nvthd = nvptx_thread ();
 
-  if (!PTX_dev)
+  if (!nvthd || !nvthd->ptx_dev)
     return NULL;
 
   s = select_stream_for_async (async, pthread_self (), false, NULL);
@@ -1522,8 +1513,9 @@  PTX_set_cuda_stream (int async, void *stream)
 {
   struct PTX_stream *oldstream;
   pthread_t self = pthread_self ();
+  struct nvptx_thread *nvthd = nvptx_thread ();
 
-  GOMP_PLUGIN_mutex_lock (&PTX_dev->stream_lock);
+  GOMP_PLUGIN_mutex_lock (&nvthd->ptx_dev->stream_lock);
 
   if (async < 0)
     GOMP_PLUGIN_fatal ("bad async %d", async);
@@ -1540,11 +1532,11 @@  PTX_set_cuda_stream (int async, void *stream)
   
   if (oldstream)
     {
-      if (PTX_dev->active_streams == oldstream)
-	PTX_dev->active_streams = PTX_dev->active_streams->next;
+      if (nvthd->ptx_dev->active_streams == oldstream)
+	nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
       else
 	{
-	  struct PTX_stream *s = PTX_dev->active_streams;
+	  struct PTX_stream *s = nvthd->ptx_dev->active_streams;
 	  while (s->next != oldstream)
 	    s = s->next;
 	  s->next = s->next->next;
@@ -1555,7 +1547,7 @@  PTX_set_cuda_stream (int async, void *stream)
       free (oldstream);
     }
 
-  GOMP_PLUGIN_mutex_unlock (&PTX_dev->stream_lock);
+  GOMP_PLUGIN_mutex_unlock (&nvthd->ptx_dev->stream_lock);
 
   (void) select_stream_for_async (async, self, true, (CUstream) stream);
 
@@ -1778,9 +1770,11 @@  openacc_close_device (void *h)
 void
 openacc_set_device_num (int n)
 {
+  struct nvptx_thread *nvthd = nvptx_thread ();
+
   assert (n >= 0);
 
-  if (!PTX_dev || PTX_dev->ord != n)
+  if (!nvthd || !nvthd->ptx_dev || nvthd->ptx_dev->ord != n)
     (void) PTX_open_device (n);
 }
 
@@ -1792,8 +1786,10 @@  openacc_set_device_num (int n)
 int
 openacc_get_device_num (void)
 {
-  if (PTX_dev)
-    return PTX_dev->ord;
+  struct nvptx_thread *nvthd = nvptx_thread ();
+
+  if (nvthd && nvthd->ptx_dev)
+    return nvthd->ptx_dev->ord;
   else
     return -1;
 }
@@ -1812,6 +1808,7 @@  openacc_register_async_cleanup (void *targ_mem_desc)
 {
   CUevent *e;
   CUresult r;
+  struct nvptx_thread *nvthd = nvptx_thread ();
 
 #ifdef DEBUG
   fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__,
@@ -1824,7 +1821,7 @@  openacc_register_async_cleanup (void *targ_mem_desc)
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r));
 
-  r = cuEventRecord (*e, current_stream->stream);
+  r = cuEventRecord (*e, nvthd->current_stream->stream);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
 
@@ -1900,6 +1897,40 @@  openacc_async_set_async (int async)
 }
 
 void *
+openacc_create_thread_data (void *targ_data)
+{
+  struct PTX_device *ptx_dev = (struct PTX_device *) targ_data;
+  struct nvptx_thread *nvthd
+    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
+  CUresult r;
+  CUcontext thd_ctx;
+
+  r = cuCtxGetCurrent (&thd_ctx);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuErrorMsg (r));
+
+  assert (ptx_dev->ctx);
+
+  if (!thd_ctx)
+    {
+      r = cuCtxPushCurrent (ptx_dev->ctx);
+      if (r != CUDA_SUCCESS)
+	GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuErrorMsg (r));
+    }
+
+  nvthd->current_stream = ptx_dev->null_stream;
+  nvthd->ptx_dev = ptx_dev;
+
+  return (void *) nvthd;
+}
+
+void
+openacc_destroy_thread_data (void *data)
+{
+  free (data);
+}
+
+void *
 openacc_get_current_cuda_device (void)
 {
 #ifdef DEBUG
diff --git a/libgomp/target.c b/libgomp/target.c
index 79b252d..bce8ca6 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -80,6 +80,7 @@  splay_compare (splay_tree_key x, splay_tree_key y)
 }
 
 #include "target.h"
+#include "oacc-int.h"
 
 attribute_hidden void
 gomp_init_targets_once (void)
@@ -815,21 +816,28 @@  gomp_init_dev_tables (struct gomp_device_descr *devicep)
 }
 
 attribute_hidden void
-gomp_fini_device (struct gomp_device_descr *devicep)
+gomp_free_memmap (struct gomp_device_descr *devicep) /* Free every mapping recorded in DEVICEP's mem_map.  */
 {
   struct gomp_memory_mapping *mm = &devicep->mem_map;
 
-  if (devicep->is_initialized)
-    devicep->device_fini_func ();
-
   while (mm->splay_tree.root)
     {
       struct target_mem_desc *tgt = mm->splay_tree.root->key.tgt;
+      /* Unlink the root before freeing; presumably the node lives in tgt->array (TODO confirm).  */
+      splay_tree_remove (&mm->splay_tree, &mm->splay_tree.root->key);
       free (tgt->array);
       free (tgt);
-      splay_tree_remove (&mm->splay_tree, &mm->splay_tree.root->key);
     }
 
+  mm->is_initialized = false; /* Mark the mapping as uninitialised.  */
+}
+
+attribute_hidden void
+gomp_fini_device (struct gomp_device_descr *devicep) /* Shut DEVICEP down; mappings are freed separately by gomp_free_memmap.  */
+{
+  if (devicep->is_initialized) /* Only call the plugin's fini hook for an initialised device.  */
+    devicep->device_fini_func ();
+
   devicep->is_initialized = false;
 }
 
@@ -1076,6 +1084,8 @@  gomp_load_plugin_for_device (struct gomp_device_descr *device,
       DLSYM_OPT (openacc.async_wait_all, openacc_async_wait_all);
       DLSYM_OPT (openacc.async_wait_all_async, openacc_async_wait_all_async);
       DLSYM_OPT (openacc.async_set_async, openacc_async_set_async);
+      DLSYM_OPT (openacc.create_thread_data, openacc_create_thread_data);
+      DLSYM_OPT (openacc.destroy_thread_data, openacc_destroy_thread_data);
       /* Require all the OpenACC handlers if we have TARGET_CAP_OPENACC_200.  */
       if (optional_present != optional_total)
 	{
@@ -1155,6 +1165,8 @@  gomp_find_available_plugins (void)
   while ((ent = readdir (dir)) != NULL)
     {
       struct gomp_device_descr current_device, *devicep;
+      unsigned int i;
+
       if (!gomp_check_plugin_file_name (ent->d_name))
 	continue;
       if (strlen (plugin_path) + 1 + strlen (ent->d_name) >= PATH_MAX)
@@ -1172,18 +1184,24 @@  gomp_find_available_plugins (void)
 	  goto out;
 	}
 
-      devices[num_devices] = current_device;
-      devicep = &devices[num_devices];
-
-      devicep->is_initialized = false;
-      devicep->offload_regions_registered = false;
-      devicep->mem_map.splay_tree.root = NULL;
-      devicep->mem_map.is_initialized = false;
-      devicep->type = devicep->get_type_func ();
-      devicep->name = devicep->get_name_func ();
-      devicep->capabilities = devicep->get_caps_func ();
-      gomp_mutex_init (&devicep->mem_map.lock);
-      devicep->id = ++num_devices;
+      for (i = 0; i < current_device.get_num_devices_func (); i++)
+        {
+	  devices[num_devices] = current_device;
+	  devicep = &devices[num_devices];
+
+	  devicep->is_initialized = false;
+	  devicep->offload_regions_registered = false;
+	  devicep->mem_map.splay_tree.root = NULL;
+	  devicep->mem_map.is_initialized = false;
+	  devicep->type = devicep->get_type_func ();
+	  devicep->name = devicep->get_name_func ();
+	  devicep->capabilities = devicep->get_caps_func ();
+	  gomp_mutex_init (&devicep->mem_map.lock);
+	  devicep->ord = i;
+	  devicep->target_data = NULL;
+	  devicep->openacc.data_environ = NULL;
+	  devicep->id = ++num_devices;
+	}
     }
   /* Prefer a device with TARGET_CAP_OPENMP_400 for ICV default-device-var.  */
   if (num_devices > 1)
@@ -1219,7 +1237,7 @@  gomp_find_available_plugins (void)
 	 found all the plugins, so registering with the OpenACC runtime (which
 	 takes a copy of the pointer argument) must be delayed until now.  */
       if (devices[i].capabilities & TARGET_CAP_OPENACC_200)
-	ACC_plugin_register (&devices[i]);
+	ACC_register (&devices[i]);
     }
 
  out:
diff --git a/libgomp/target.h b/libgomp/target.h
index 635cc52..b5773e2 100644
--- a/libgomp/target.h
+++ b/libgomp/target.h
@@ -87,13 +87,53 @@  struct gomp_memory_mapping
   bool is_initialized;
 };
 
-#include "oacc-int.h"
-
-static inline enum acc_device_t
-acc_device_type (enum target_type type)
+typedef struct ACC_dispatch_t /* OpenACC-specific state and hooks held per gomp_device_descr.  */
 {
-  return (enum acc_device_t) type;
-}
+  /* This is a linked list of data mapped using the
+     acc_map_data/acc_unmap_data or "acc enter data"/"acc exit data" pragmas
+     (TODO).  Unlike mapped_data in the goacc_thread struct, unmapping can
+     happen out-of-order with respect to mapping.  */
+  struct target_mem_desc *data_environ;
+
+  /* Open or close a device instance.  */
+  void *(*open_device_func) (int n);
+  int (*close_device_func) (void *h);
+
+  /* Set or get the device number.  */
+  int (*get_device_num_func) (void);
+  void (*set_device_num_func) (int);
+
+  /* Availability.  */
+  bool (*avail_func) (void);
+
+  /* Execute.  */
+  void (*exec_func) (void (*) (void *), size_t, void **, void **, size_t *,
+		     unsigned short *, int, int, int, int, void *);
+
+  /* Async cleanup callback registration.  */
+  void (*register_async_cleanup_func) (void *);
+
+  /* Asynchronous routines.  */
+  int (*async_test_func) (int);
+  int (*async_test_all_func) (void);
+  void (*async_wait_func) (int);
+  void (*async_wait_async_func) (int, int);
+  void (*async_wait_all_func) (void);
+  void (*async_wait_all_async_func) (int);
+  void (*async_set_async_func) (int);
+
+  /* Create/destroy TLS data.  */
+  void *(*create_thread_data_func) (void *); /* Presumably bound to the plugin's openacc_create_thread_data -- confirm.  */
+  void (*destroy_thread_data_func) (void *); /* Presumably bound to the plugin's openacc_destroy_thread_data -- confirm.  */
+
+  /* NVIDIA target specific routines.  */
+  struct {
+    void *(*get_current_device_func) (void);
+    void *(*get_current_context_func) (void);
+    void *(*get_stream_func) (int);
+    int (*set_stream_func) (int, void *);
+  } cuda;
+} ACC_dispatch_t;
 
 struct mapping_table {
   uintptr_t host_start;
@@ -118,6 +158,9 @@  struct gomp_device_descr
      TARGET construct.  */
   int id;
 
+  /* The number of the device for this particular device type.  */
+  int ord;
+
   /* This is the TYPE of device.  */
   enum target_type type;
 
@@ -148,9 +191,11 @@  struct gomp_device_descr
   /* OpenACC-specific functions.  */
   ACC_dispatch_t openacc;
   
-  /* Memory-mapping info (only for OpenMP -- mappings are stored per-thread
-     for OpenACC. It's not clear if that's a useful distinction).  */
+  /* Memory-mapping info for this device instance.  */
   struct gomp_memory_mapping mem_map;
+
+  /* Extra information required for a device instance by a given target.  */
+  void *target_data;
 };
 
 extern struct target_mem_desc *
@@ -175,4 +220,7 @@  gomp_init_tables (const struct gomp_device_descr *devicep,
 extern attribute_hidden void
 gomp_fini_device (struct gomp_device_descr *devicep);
 
+extern attribute_hidden void
+gomp_free_memmap (struct gomp_device_descr *devicep);
+
 #endif /* _TARGET_H */