libgomp nvptx plugin: rework initialisation and support the proposed load/unload hooks (was: Merge current set of OpenACC changes from gomp-4_0-branch)
diff mbox

Message ID 20150408153142.128b97b9@octopus
State New
Headers show

Commit Message

Julian Brown April 8, 2015, 2:31 p.m. UTC
On Tue, 7 Apr 2015 17:26:45 +0200
Jakub Jelinek <jakub@redhat.com> wrote:

> On Mon, Apr 06, 2015 at 03:45:57PM +0300, Ilya Verbin wrote:
> > On Wed, Apr 01, 2015 at 15:20:25 +0200, Jakub Jelinek wrote:
> > > LGTM with proper ChangeLog entry.
> > 
> > I've committed this patch into trunk.
> > 
> > Julian, you probably want to update the nvptx plugin.
> 
> Note that as the number of P1s without posted fixes is now zero, it is
> likely RC1 will be done this week, so if you want nvptx working in
> GCC 5, please post a fix as soon as possible.

This version is mostly the same as the last posted version but has a
tweak in GOACC_parallel to account for the new splay tree arrangement
for target functions:

-      tgt_fn = (void (*)) tgt_fn_key->tgt->tgt_start;
+      tgt_fn = (void (*)) tgt_fn_key->tgt_offset;

Have there been any other changes I might have missed?

It passes libgomp testing on NVPTX. OK?

Thanks,

Julian

Comments

Jakub Jelinek April 8, 2015, 2:34 p.m. UTC | #1
On Wed, Apr 08, 2015 at 03:31:42PM +0100, Julian Brown wrote:
> It passes libgomp testing on NVPTX. OK?

Please write a proper ChangeLog entry for it.
Ok with that.

	Jakub
Ilya Verbin April 8, 2015, 2:58 p.m. UTC | #2
On Wed, Apr 08, 2015 at 15:31:42 +0100, Julian Brown wrote:
> This version is mostly the same as the last posted version but has a
> tweak in GOACC_parallel to account for the new splay tree arrangement
> for target functions:
> 
> -      tgt_fn = (void (*)) tgt_fn_key->tgt->tgt_start;
> +      tgt_fn = (void (*)) tgt_fn_key->tgt_offset;
> 
> Have there been any other changes I might have missed?

No.

> It passes libgomp testing on NVPTX. OK?

Have you tested it with disabled offloading?

I see several regressions:
FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/acc_on_device-1.c -DACC_DEVICE_TYPE_host_nonshm=1 -DACC_MEM_SHARED=0 execution test
FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/if-1.c -DACC_DEVICE_TYPE_host_nonshm=1 -DACC_MEM_SHARED=0 execution test

  -- Ilya
Julian Brown April 8, 2015, 4:14 p.m. UTC | #3
On Wed, 8 Apr 2015 17:58:56 +0300
Ilya Verbin <iverbin@gmail.com> wrote:

> Have you tested it with disabled offloading?
> 
> I see several regressions:
> FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/acc_on_device-1.c
> -DACC_DEVICE_TYPE_host_nonshm=1 -DACC_MEM_SHARED=0 execution test
> FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/if-1.c
> -DACC_DEVICE_TYPE_host_nonshm=1 -DACC_MEM_SHARED=0 execution test

No -- thanks for the note. I've committed the patch now, but I'll try
to get to looking at these in the next day or two (it's probably
something relatively minor, I guess).

Julian

Patch
diff mbox

commit ac06b5e25e170061bb9855b9ea4b8e5696816bf1
Author: Julian Brown <julian@codesourcery.com>
Date:   Tue Apr 7 09:23:58 2015 -0700

    NVPTX load/unload and init-rework patch.

diff --git a/gcc/config/nvptx/mkoffload.c b/gcc/config/nvptx/mkoffload.c
index 02c44b6..dbc68bc 100644
--- a/gcc/config/nvptx/mkoffload.c
+++ b/gcc/config/nvptx/mkoffload.c
@@ -839,6 +839,7 @@  process (FILE *in, FILE *out)
 {
   const char *input = read_file (in);
   Token *tok = tokenize (input);
+  unsigned int nvars = 0, nfuncs = 0;
 
   do
     tok = parse_file (tok);
@@ -850,16 +851,17 @@  process (FILE *in, FILE *out)
   write_stmts (out, rev_stmts (fns));
   fprintf (out, ";\n\n");
   fprintf (out, "static const char *var_mappings[] = {\n");
-  for (id_map *id = var_ids; id; id = id->next)
+  for (id_map *id = var_ids; id; id = id->next, nvars++)
     fprintf (out, "\t\"%s\"%s\n", id->ptx_name, id->next ? "," : "");
   fprintf (out, "};\n\n");
   fprintf (out, "static const char *func_mappings[] = {\n");
-  for (id_map *id = func_ids; id; id = id->next)
+  for (id_map *id = func_ids; id; id = id->next, nfuncs++)
     fprintf (out, "\t\"%s\"%s\n", id->ptx_name, id->next ? "," : "");
   fprintf (out, "};\n\n");
 
   fprintf (out, "static const void *target_data[] = {\n");
-  fprintf (out, "  ptx_code, var_mappings, func_mappings\n");
+  fprintf (out, "  ptx_code, (void*) %u, var_mappings, (void*) %u, "
+		"func_mappings\n", nvars, nfuncs);
   fprintf (out, "};\n\n");
 
   fprintf (out, "extern void GOMP_offload_register (const void *, int, void *);\n");
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index a1d42c5..5272f01 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -655,9 +655,6 @@  struct target_mem_desc {
   /* Corresponding target device descriptor.  */
   struct gomp_device_descr *device_descr;
 
-  /* Memory mapping info for the thread that created this descriptor.  */
-  struct splay_tree_s *mem_map;
-
   /* List of splay keys to remove (or decrease refcount)
      at the end of region.  */
   splay_tree_key list[];
@@ -691,18 +688,6 @@  typedef struct acc_dispatch_t
   /* This is guarded by the lock in the "outer" struct gomp_device_descr.  */
   struct target_mem_desc *data_environ;
 
-  /* Extra information required for a device instance by a given target.  */
-  /* This is guarded by the lock in the "outer" struct gomp_device_descr.  */
-  void *target_data;
-
-  /* Open or close a device instance.  */
-  void *(*open_device_func) (int n);
-  int (*close_device_func) (void *h);
-
-  /* Set or get the device number.  */
-  int (*get_device_num_func) (void);
-  void (*set_device_num_func) (int);
-
   /* Execute.  */
   void (*exec_func) (void (*) (void *), size_t, void **, void **, size_t *,
 		     unsigned short *, int, int, int, int, void *);
@@ -720,7 +705,7 @@  typedef struct acc_dispatch_t
   void (*async_set_async_func) (int);
 
   /* Create/destroy TLS data.  */
-  void *(*create_thread_data_func) (void *);
+  void *(*create_thread_data_func) (int);
   void (*destroy_thread_data_func) (void *);
 
   /* NVIDIA target specific routines.  */
diff --git a/libgomp/oacc-async.c b/libgomp/oacc-async.c
index 08b7c5e..1f5827e 100644
--- a/libgomp/oacc-async.c
+++ b/libgomp/oacc-async.c
@@ -26,7 +26,7 @@ 
    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
    <http://www.gnu.org/licenses/>.  */
 
-
+#include <assert.h>
 #include "openacc.h"
 #include "libgomp.h"
 #include "oacc-int.h"
@@ -37,13 +37,23 @@  acc_async_test (int async)
   if (async < acc_async_sync)
     gomp_fatal ("invalid async argument: %d", async);
 
-  return base_dev->openacc.async_test_func (async);
+  struct goacc_thread *thr = goacc_thread ();
+
+  if (!thr || !thr->dev)
+    gomp_fatal ("no device active");
+
+  return thr->dev->openacc.async_test_func (async);
 }
 
 int
 acc_async_test_all (void)
 {
-  return base_dev->openacc.async_test_all_func ();
+  struct goacc_thread *thr = goacc_thread ();
+
+  if (!thr || !thr->dev)
+    gomp_fatal ("no device active");
+
+  return thr->dev->openacc.async_test_all_func ();
 }
 
 void
@@ -52,19 +62,34 @@  acc_wait (int async)
   if (async < acc_async_sync)
     gomp_fatal ("invalid async argument: %d", async);
 
-  base_dev->openacc.async_wait_func (async);
+  struct goacc_thread *thr = goacc_thread ();
+
+  if (!thr || !thr->dev)
+    gomp_fatal ("no device active");
+
+  thr->dev->openacc.async_wait_func (async);
 }
 
 void
 acc_wait_async (int async1, int async2)
 {
-  base_dev->openacc.async_wait_async_func (async1, async2);
+  struct goacc_thread *thr = goacc_thread ();
+
+  if (!thr || !thr->dev)
+    gomp_fatal ("no device active");
+
+  thr->dev->openacc.async_wait_async_func (async1, async2);
 }
 
 void
 acc_wait_all (void)
 {
-  base_dev->openacc.async_wait_all_func ();
+  struct goacc_thread *thr = goacc_thread ();
+
+  if (!thr || !thr->dev)
+    gomp_fatal ("no device active");
+
+  thr->dev->openacc.async_wait_all_func ();
 }
 
 void
@@ -73,5 +98,10 @@  acc_wait_all_async (int async)
   if (async < acc_async_sync)
     gomp_fatal ("invalid async argument: %d", async);
 
-  base_dev->openacc.async_wait_all_async_func (async);
+  struct goacc_thread *thr = goacc_thread ();
+
+  if (!thr || !thr->dev)
+    gomp_fatal ("no device active");
+
+  thr->dev->openacc.async_wait_all_async_func (async);
 }
diff --git a/libgomp/oacc-cuda.c b/libgomp/oacc-cuda.c
index c8ef376..4aab422 100644
--- a/libgomp/oacc-cuda.c
+++ b/libgomp/oacc-cuda.c
@@ -34,51 +34,53 @@ 
 void *
 acc_get_current_cuda_device (void)
 {
-  void *p = NULL;
+  struct goacc_thread *thr = goacc_thread ();
 
-  if (base_dev && base_dev->openacc.cuda.get_current_device_func)
-    p = base_dev->openacc.cuda.get_current_device_func ();
+  if (thr && thr->dev && thr->dev->openacc.cuda.get_current_device_func)
+    return thr->dev->openacc.cuda.get_current_device_func ();
 
-  return p;
+  return NULL;
 }
 
 void *
 acc_get_current_cuda_context (void)
 {
-  void *p = NULL;
+  struct goacc_thread *thr = goacc_thread ();
 
-  if (base_dev && base_dev->openacc.cuda.get_current_context_func)
-    p = base_dev->openacc.cuda.get_current_context_func ();
-
-  return p;
+  if (thr && thr->dev && thr->dev->openacc.cuda.get_current_context_func)
+    return thr->dev->openacc.cuda.get_current_context_func ();
+ 
+  return NULL;
 }
 
 void *
 acc_get_cuda_stream (int async)
 {
-  void *p = NULL;
+  struct goacc_thread *thr = goacc_thread ();
 
   if (async < 0)
-    return p;
-
-  if (base_dev && base_dev->openacc.cuda.get_stream_func)
-    p = base_dev->openacc.cuda.get_stream_func (async);
+    return NULL;
 
-  return p;
+  if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func)
+    return thr->dev->openacc.cuda.get_stream_func (async);
+ 
+  return NULL;
 }
 
 int
 acc_set_cuda_stream (int async, void *stream)
 {
-  int s = -1;
+  struct goacc_thread *thr;
 
   if (async < 0 || stream == NULL)
     return 0;
 
   goacc_lazy_initialize ();
 
-  if (base_dev && base_dev->openacc.cuda.set_stream_func)
-    s = base_dev->openacc.cuda.set_stream_func (async, stream);
+  thr = goacc_thread ();
+
+  if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func)
+    return thr->dev->openacc.cuda.set_stream_func (async, stream);
 
-  return s;
+  return -1;
 }
diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c
index e4756b6..6dcdbf3 100644
--- a/libgomp/oacc-host.c
+++ b/libgomp/oacc-host.c
@@ -53,16 +53,9 @@  static struct gomp_device_descr host_dispatch =
     .host2dev_func = GOMP_OFFLOAD_host2dev,
     .run_func = GOMP_OFFLOAD_run,
 
-    .mem_map.root = NULL,
     .is_initialized = false,
 
     .openacc = {
-      .open_device_func = GOMP_OFFLOAD_openacc_open_device,
-      .close_device_func = GOMP_OFFLOAD_openacc_close_device,
-
-      .get_device_num_func = GOMP_OFFLOAD_openacc_get_device_num,
-      .set_device_num_func = GOMP_OFFLOAD_openacc_set_device_num,
-
       .exec_func = GOMP_OFFLOAD_openacc_parallel,
 
       .register_async_cleanup_func
diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c
index 1e0243e..dc40fb6 100644
--- a/libgomp/oacc-init.c
+++ b/libgomp/oacc-init.c
@@ -37,14 +37,13 @@ 
 
 static gomp_mutex_t acc_device_lock;
 
-/* The dispatch table for the current accelerator device.  This is global, so
-   you can only have one type of device open at any given time in a program.
-   This is the "base" device in that several devices that use the same
-   dispatch table may be active concurrently: this one (the "zeroth") is used
-   for overall initialisation/shutdown, and other instances -- not necessarily
-   including this one -- may be opened and closed once the base device has
-   been initialized.  */
-struct gomp_device_descr *base_dev;
+/* A cached version of the dispatcher for the global "current" accelerator type,
+   e.g. used as the default when creating new host threads.  This is the
+   device-type equivalent of goacc_device_num (which specifies which device to
+   use out of potentially several of the same type).  If there are several
+   devices of a given type, this points at the first one.  */
+
+static struct gomp_device_descr *cached_base_dev = NULL;
 
 #if defined HAVE_TLS || defined USE_EMUTLS
 __thread struct goacc_thread *goacc_tls_data;
@@ -53,9 +52,6 @@  pthread_key_t goacc_tls_key;
 #endif
 static pthread_key_t goacc_cleanup_key;
 
-/* Current dispatcher, and how it was initialized */
-static acc_device_t init_key = _ACC_device_hwm;
-
 static struct goacc_thread *goacc_threads;
 static gomp_mutex_t goacc_thread_lock;
 
@@ -94,6 +90,21 @@  get_openacc_name (const char *name)
     return name;
 }
 
+static const char *
+name_of_acc_device_t (enum acc_device_t type)
+{
+  switch (type)
+    {
+    case acc_device_none: return "none";
+    case acc_device_default: return "default";
+    case acc_device_host: return "host";
+    case acc_device_host_nonshm: return "host_nonshm";
+    case acc_device_not_host: return "not_host";
+    case acc_device_nvidia: return "nvidia";
+    default: gomp_fatal ("unknown device type %u", (unsigned) type);
+    }
+}
+
 static struct gomp_device_descr *
 resolve_device (acc_device_t d)
 {
@@ -159,22 +170,87 @@  resolve_device (acc_device_t d)
 static struct gomp_device_descr *
 acc_init_1 (acc_device_t d)
 {
-  struct gomp_device_descr *acc_dev;
+  struct gomp_device_descr *base_dev, *acc_dev;
+  int ndevs;
 
-  acc_dev = resolve_device (d);
+  base_dev = resolve_device (d);
+
+  ndevs = base_dev->get_num_devices_func ();
+
+  if (!base_dev || ndevs <= 0 || goacc_device_num >= ndevs)
+    gomp_fatal ("device %s not supported", name_of_acc_device_t (d));
 
-  if (!acc_dev || acc_dev->get_num_devices_func () <= 0)
-    gomp_fatal ("device %u not supported", (unsigned)d);
+  acc_dev = &base_dev[goacc_device_num];
 
   if (acc_dev->is_initialized)
     gomp_fatal ("device already active");
 
-  /* We need to remember what we were intialized as, to check shutdown etc.  */
-  init_key = d;
-
   gomp_init_device (acc_dev);
 
-  return acc_dev;
+  return base_dev;
+}
+
+static void
+acc_shutdown_1 (acc_device_t d)
+{
+  struct gomp_device_descr *base_dev;
+  struct goacc_thread *walk;
+  int ndevs, i;
+  bool devices_active = false;
+
+  /* Get the base device for this device type.  */
+  base_dev = resolve_device (d);
+
+  if (!base_dev)
+    gomp_fatal ("device %s not supported", name_of_acc_device_t (d));
+
+  gomp_mutex_lock (&goacc_thread_lock);
+
+  /* Free target-specific TLS data and close all devices.  */
+  for (walk = goacc_threads; walk != NULL; walk = walk->next)
+    {
+      if (walk->target_tls)
+	base_dev->openacc.destroy_thread_data_func (walk->target_tls);
+
+      walk->target_tls = NULL;
+
+      /* This would mean the user is shutting down OpenACC in the middle of an
+         "acc data" pragma.  Likely not intentional.  */
+      if (walk->mapped_data)
+	gomp_fatal ("shutdown in 'acc data' region");
+
+      /* Similarly, if this happens then user code has done something weird.  */
+      if (walk->saved_bound_dev)
+        gomp_fatal ("shutdown during host fallback");
+
+      if (walk->dev)
+	{
+	  gomp_mutex_lock (&walk->dev->lock);
+	  gomp_free_memmap (&walk->dev->mem_map);
+	  gomp_mutex_unlock (&walk->dev->lock);
+
+	  walk->dev = NULL;
+	  walk->base_dev = NULL;
+	}
+    }
+
+  gomp_mutex_unlock (&goacc_thread_lock);
+
+  ndevs = base_dev->get_num_devices_func ();
+
+  /* Close all the devices of this type that have been opened.  */
+  for (i = 0; i < ndevs; i++)
+    {
+      struct gomp_device_descr *acc_dev = &base_dev[i];
+      if (acc_dev->is_initialized)
+        {
+	  devices_active = true;
+	  gomp_fini_device (acc_dev);
+	}
+    }
+
+  if (!devices_active)
+    gomp_fatal ("no device initialized");
 }
 
 static struct goacc_thread *
@@ -207,9 +283,11 @@  goacc_destroy_thread (void *data)
 
   if (thr)
     {
-      if (base_dev && thr->target_tls)
+      struct gomp_device_descr *acc_dev = thr->dev;
+
+      if (acc_dev && thr->target_tls)
 	{
-	  base_dev->openacc.destroy_thread_data_func (thr->target_tls);
+	  acc_dev->openacc.destroy_thread_data_func (thr->target_tls);
 	  thr->target_tls = NULL;
 	}
 
@@ -236,53 +314,49 @@  goacc_destroy_thread (void *data)
   gomp_mutex_unlock (&goacc_thread_lock);
 }
 
-/* Open the ORD'th device of the currently-active type (base_dev must be
-   initialised before calling).  If ORD is < 0, open the default-numbered
-   device (set by the ACC_DEVICE_NUM environment variable or a call to
-   acc_set_device_num), or leave any currently-opened device as is.  "Opening"
-   consists of calling the device's open_device_func hook, and setting up
-   thread-local data (maybe allocating, then initializing with information
-   pertaining to the newly-opened or previously-opened device).  */
+/* Use the ORD'th device instance for the current host thread (or -1 for the
+   current global default).  The device (and the runtime) must be initialised
+   before calling this function.  */
 
-static void
-lazy_open (int ord)
+void
+goacc_attach_host_thread_to_device (int ord)
 {
   struct goacc_thread *thr = goacc_thread ();
-  struct gomp_device_descr *acc_dev;
-
-  if (thr && thr->dev)
-    {
-      assert (ord < 0 || ord == thr->dev->target_id);
-      return;
-    }
-
-  assert (base_dev);
-
+  struct gomp_device_descr *acc_dev = NULL, *base_dev = NULL;
+  int num_devices;
+  
+  if (thr && thr->dev && (thr->dev->target_id == ord || ord < 0))
+    return;
+  
   if (ord < 0)
     ord = goacc_device_num;
-
-  /* The OpenACC 2.0 spec leaves the runtime's behaviour when an out-of-range
-     device is requested as implementation-defined (4.2 ACC_DEVICE_NUM).
-     We choose to raise an error in such a case.  */
-  if (ord >= base_dev->get_num_devices_func ())
-    gomp_fatal ("device %u does not exist", ord);
-
+  
+  /* Decide which type of device to use.  If the current thread has a device
+     type already (e.g. set by acc_set_device_type), use that, else use the
+     global default.  */
+  if (thr && thr->base_dev)
+    base_dev = thr->base_dev;
+  else
+    {
+      assert (cached_base_dev);
+      base_dev = cached_base_dev;
+    }
+  
+  num_devices = base_dev->get_num_devices_func ();
+  if (num_devices <= 0 || ord >= num_devices)
+    gomp_fatal ("device %u out of range", ord);
+  
   if (!thr)
     thr = goacc_new_thread ();
-
-  acc_dev = thr->dev = &base_dev[ord];
-
-  assert (acc_dev->target_id == ord);
-
+  
+  thr->base_dev = base_dev;
+  thr->dev = acc_dev = &base_dev[ord];
   thr->saved_bound_dev = NULL;
   thr->mapped_data = NULL;
-
-  if (!acc_dev->openacc.target_data)
-    acc_dev->openacc.target_data = acc_dev->openacc.open_device_func (ord);
-
+  
   thr->target_tls
-    = acc_dev->openacc.create_thread_data_func (acc_dev->openacc.target_data);
-
+    = acc_dev->openacc.create_thread_data_func (ord);
+  
   acc_dev->openacc.async_set_async_func (acc_async_sync);
 }
 
@@ -292,74 +366,20 @@  lazy_open (int ord)
 void
 acc_init (acc_device_t d)
 {
-  if (!base_dev)
+  if (!cached_base_dev)
     gomp_init_targets_once ();
 
   gomp_mutex_lock (&acc_device_lock);
 
-  base_dev = acc_init_1 (d);
-
-  lazy_open (-1);
+  cached_base_dev = acc_init_1 (d);
 
   gomp_mutex_unlock (&acc_device_lock);
+  
+  goacc_attach_host_thread_to_device (-1);
 }
 
 ialias (acc_init)
 
-static void
-acc_shutdown_1 (acc_device_t d)
-{
-  struct goacc_thread *walk;
-
-  /* We don't check whether d matches the actual device found, because
-     OpenACC 2.0 (3.2.12) says the parameters to the init and this
-     call must match (for the shutdown call anyway, it's silent on
-     others).  */
-
-  if (!base_dev)
-    gomp_fatal ("no device initialized");
-  if (d != init_key)
-    gomp_fatal ("device %u(%u) is initialized",
-		(unsigned) init_key, (unsigned) base_dev->type);
-
-  gomp_mutex_lock (&goacc_thread_lock);
-
-  /* Free target-specific TLS data and close all devices.  */
-  for (walk = goacc_threads; walk != NULL; walk = walk->next)
-    {
-      if (walk->target_tls)
-	base_dev->openacc.destroy_thread_data_func (walk->target_tls);
-
-      walk->target_tls = NULL;
-
-      /* This would mean the user is shutting down OpenACC in the middle of an
-         "acc data" pragma.  Likely not intentional.  */
-      if (walk->mapped_data)
-	gomp_fatal ("shutdown in 'acc data' region");
-
-      if (walk->dev)
-	{
-	  void *target_data = walk->dev->openacc.target_data;
-	  if (walk->dev->openacc.close_device_func (target_data) < 0)
-	    gomp_fatal ("failed to close device");
-
-	  walk->dev->openacc.target_data = target_data = NULL;
-
-	  gomp_mutex_lock (&walk->dev->lock);
-	  gomp_free_memmap (&walk->dev->mem_map);
-	  gomp_mutex_unlock (&walk->dev->lock);
-
-	  walk->dev = NULL;
-	}
-    }
-
-  gomp_mutex_unlock (&goacc_thread_lock);
-
-  gomp_fini_device (base_dev);
-
-  base_dev = NULL;
-}
-
 void
 acc_shutdown (acc_device_t d)
 {
@@ -372,59 +392,16 @@  acc_shutdown (acc_device_t d)
 
 ialias (acc_shutdown)
 
-/* This function is called after plugins have been initialized.  It deals with
-   the "base" device, and is used to prepare the runtime for dealing with a
-   number of such devices (as implemented by some particular plugin).  If the
-   argument device type D matches a previous call to the function, return the
-   current base device, else shut the old device down and re-initialize with
-   the new device type.  */
-
-static struct gomp_device_descr *
-lazy_init (acc_device_t d)
-{
-  if (base_dev)
-    {
-      /* Re-initializing the same device, do nothing.  */
-      if (d == init_key)
-	return base_dev;
-
-      acc_shutdown_1 (init_key);
-    }
-
-  assert (!base_dev);
-
-  return acc_init_1 (d);
-}
-
-/* Ensure that plugins are loaded, initialize and open the (default-numbered)
-   device.  */
-
-static void
-lazy_init_and_open (acc_device_t d)
-{
-  if (!base_dev)
-    gomp_init_targets_once ();
-
-  gomp_mutex_lock (&acc_device_lock);
-
-  base_dev = lazy_init (d);
-
-  lazy_open (-1);
-
-  gomp_mutex_unlock (&acc_device_lock);
-}
-
 int
 acc_get_num_devices (acc_device_t d)
 {
   int n = 0;
-  const struct gomp_device_descr *acc_dev;
+  struct gomp_device_descr *acc_dev;
 
   if (d == acc_device_none)
     return 0;
 
-  if (!base_dev)
-    gomp_init_targets_once ();
+  gomp_init_targets_once ();
 
   acc_dev = resolve_device (d);
   if (!acc_dev)
@@ -439,10 +416,39 @@  acc_get_num_devices (acc_device_t d)
 
 ialias (acc_get_num_devices)
 
+/* Set the device type for the current thread only (using the current global
+   default device number), initialising that device if necessary.  Also set the
+   default device type for new threads to D.  */
+
 void
 acc_set_device_type (acc_device_t d)
 {
-  lazy_init_and_open (d);
+  struct gomp_device_descr *base_dev, *acc_dev;
+  struct goacc_thread *thr = goacc_thread ();
+
+  gomp_mutex_lock (&acc_device_lock);
+
+  if (!cached_base_dev)
+    gomp_init_targets_once ();
+
+  cached_base_dev = base_dev = resolve_device (d);
+  acc_dev = &base_dev[goacc_device_num];
+
+  if (!acc_dev->is_initialized)
+    gomp_init_device (acc_dev);
+
+  gomp_mutex_unlock (&acc_device_lock);
+
+  /* We're changing device type: invalidate the current thread's dev and
+     base_dev pointers.  */
+  if (thr && thr->base_dev != base_dev)
+    {
+      thr->base_dev = thr->dev = NULL;
+      if (thr->mapped_data)
+        gomp_fatal ("acc_set_device_type in 'acc data' region");
+    }
+
+  goacc_attach_host_thread_to_device (-1);
 }
 
 ialias (acc_set_device_type)
@@ -451,10 +457,11 @@  acc_device_t
 acc_get_device_type (void)
 {
   acc_device_t res = acc_device_none;
-  const struct gomp_device_descr *dev;
+  struct gomp_device_descr *dev;
+  struct goacc_thread *thr = goacc_thread ();
 
-  if (base_dev)
-    res = acc_device_type (base_dev->type);
+  if (thr && thr->base_dev)
+    res = acc_device_type (thr->base_dev->type);
   else
     {
       gomp_init_targets_once ();
@@ -475,78 +482,65 @@  int
 acc_get_device_num (acc_device_t d)
 {
   const struct gomp_device_descr *dev;
-  int num;
+  struct goacc_thread *thr = goacc_thread ();
 
   if (d >= _ACC_device_hwm)
     gomp_fatal ("device %u out of range", (unsigned)d);
 
-  if (!base_dev)
+  if (!cached_base_dev)
     gomp_init_targets_once ();
 
   dev = resolve_device (d);
   if (!dev)
-    gomp_fatal ("no devices of type %u", d);
+    gomp_fatal ("device %s not supported", name_of_acc_device_t (d));
 
-  /* We might not have called lazy_open for this host thread yet, in which case
-     the get_device_num_func hook will return -1.  */
-  num = dev->openacc.get_device_num_func ();
-  if (num < 0)
-    num = goacc_device_num;
+  if (thr && thr->base_dev == dev && thr->dev)
+    return thr->dev->target_id;
 
-  return num;
+  return goacc_device_num;
 }
 
 ialias (acc_get_device_num)
 
 void
-acc_set_device_num (int n, acc_device_t d)
+acc_set_device_num (int ord, acc_device_t d)
 {
-  const struct gomp_device_descr *dev;
+  struct gomp_device_descr *base_dev, *acc_dev;
   int num_devices;
 
-  if (!base_dev)
+  if (!cached_base_dev)
     gomp_init_targets_once ();
 
-  if ((int) d == 0)
-    {
-      int i;
-
-      /* A device setting of zero sets all device types on the system to use
-         the Nth instance of that device type.  Only attempt it for initialized
-	 devices though.  */
-      for (i = acc_device_not_host + 1; i < _ACC_device_hwm; i++)
-        {
-	  dev = resolve_device (d);
-	  if (dev && dev->is_initialized)
-	    dev->openacc.set_device_num_func (n);
-	}
+  if (ord < 0)
+    ord = goacc_device_num;
 
-      /* ...and for future calls to acc_init/acc_set_device_type, etc.  */
-      goacc_device_num = n;
-    }
+  if ((int) d == 0)
+    /* Set whatever device is being used by the current host thread to use
+       device instance ORD.  It's unclear if this is supposed to affect other
+       host threads too (OpenACC 2.0 (3.2.4) acc_set_device_num).  */
+    goacc_attach_host_thread_to_device (ord);
   else
     {
-      struct goacc_thread *thr = goacc_thread ();
-
       gomp_mutex_lock (&acc_device_lock);
 
-      base_dev = lazy_init (d);
+      cached_base_dev = base_dev = resolve_device (d);
 
       num_devices = base_dev->get_num_devices_func ();
 
-      if (n >= num_devices)
-        gomp_fatal ("device %u out of range", n);
+      if (ord >= num_devices)
+        gomp_fatal ("device %u out of range", ord);
 
-      /* If we're changing the device number, de-associate this thread with
-	 the device (but don't close the device, since it may be in use by
-	 other threads).  */
-      if (thr && thr->dev && n != thr->dev->target_id)
-	thr->dev = NULL;
+      acc_dev = &base_dev[ord];
 
-      lazy_open (n);
+      if (!acc_dev->is_initialized)
+        gomp_init_device (acc_dev);
 
       gomp_mutex_unlock (&acc_device_lock);
+
+      goacc_attach_host_thread_to_device (ord);
     }
+  
+  goacc_device_num = ord;
 }
 
 ialias (acc_set_device_num)
@@ -554,10 +548,7 @@  ialias (acc_set_device_num)
 int
 acc_on_device (acc_device_t dev)
 {
-  struct goacc_thread *thr = goacc_thread ();
-
-  if (thr && thr->dev
-      && acc_device_type (thr->dev->type) == acc_device_host_nonshm)
+  if (acc_get_device_type () == acc_device_host_nonshm)
     return dev == acc_device_host_nonshm || dev == acc_device_not_host;
 
   /* Just rely on the compiler builtin.  */
@@ -577,7 +568,7 @@  goacc_runtime_initialize (void)
 
   pthread_key_create (&goacc_cleanup_key, goacc_destroy_thread);
 
-  base_dev = NULL;
+  cached_base_dev = NULL;
 
   goacc_threads = NULL;
   gomp_mutex_init (&goacc_thread_lock);
@@ -606,9 +597,8 @@  goacc_restore_bind (void)
 }
 
 /* This is called from any OpenACC support function that may need to implicitly
-   initialize the libgomp runtime.  On exit all such initialization will have
-   been done, and both the global ACC_dev and the per-host-thread ACC_memmap
-   pointers will be valid.  */
+   initialize the libgomp runtime, either globally or from a new host thread. 
+   On exit "goacc_thread" will return a valid & populated thread block.  */
 
 attribute_hidden void
 goacc_lazy_initialize (void)
@@ -618,12 +608,8 @@  goacc_lazy_initialize (void)
   if (thr && thr->dev)
     return;
 
-  if (!base_dev)
-    lazy_init_and_open (acc_device_default);
+  if (!cached_base_dev)
+    acc_init (acc_device_default);
   else
-    {
-      gomp_mutex_lock (&acc_device_lock);
-      lazy_open (-1);
-      gomp_mutex_unlock (&acc_device_lock);
-    }
+    goacc_attach_host_thread_to_device (-1);
 }
diff --git a/libgomp/oacc-int.h b/libgomp/oacc-int.h
index 85619c8..0ace737 100644
--- a/libgomp/oacc-int.h
+++ b/libgomp/oacc-int.h
@@ -56,6 +56,9 @@  acc_device_type (enum offload_target_type type)
 
 struct goacc_thread
 {
+  /* The base device for the current thread.  */
+  struct gomp_device_descr *base_dev;
+
   /* The device for the current thread.  */
   struct gomp_device_descr *dev;
 
@@ -89,10 +92,7 @@  goacc_thread (void)
 #endif
 
 void goacc_register (struct gomp_device_descr *) __GOACC_NOTHROW;
-
-/* Current dispatcher.  */
-extern struct gomp_device_descr *base_dev;
-
+void goacc_attach_host_thread_to_device (int);
 void goacc_runtime_initialize (void);
 void goacc_save_and_set_bind (acc_device_t);
 void goacc_restore_bind (void);
diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c
index fdc82e6..89ef5fc 100644
--- a/libgomp/oacc-mem.c
+++ b/libgomp/oacc-mem.c
@@ -107,7 +107,9 @@  acc_malloc (size_t s)
 
   struct goacc_thread *thr = goacc_thread ();
 
-  return base_dev->alloc_func (thr->dev->target_id, s);
+  assert (thr->dev);
+
+  return thr->dev->alloc_func (thr->dev->target_id, s);
 }
 
 /* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event
@@ -122,6 +124,8 @@  acc_free (void *d)
   if (!d)
     return;
 
+  assert (thr && thr->dev);
+
   /* We don't have to call lazy open here, as the ptr value must have
      been returned by acc_malloc.  It's not permitted to pass NULL in
      (unless you got that null from acc_malloc).  */
@@ -134,7 +138,7 @@  acc_free (void *d)
      acc_unmap_data ((void *)(k->host_start + offset));
    }
 
-  base_dev->free_func (thr->dev->target_id, d);
+  thr->dev->free_func (thr->dev->target_id, d);
 }
 
 void
@@ -144,7 +148,9 @@  acc_memcpy_to_device (void *d, void *h, size_t s)
      been obtained from a routine that did that.  */
   struct goacc_thread *thr = goacc_thread ();
 
-  base_dev->host2dev_func (thr->dev->target_id, d, h, s);
+  assert (thr && thr->dev);
+
+  thr->dev->host2dev_func (thr->dev->target_id, d, h, s);
 }
 
 void
@@ -154,7 +160,9 @@  acc_memcpy_from_device (void *h, void *d, size_t s)
      been obtained from a routine that did that.  */
   struct goacc_thread *thr = goacc_thread ();
 
-  base_dev->dev2host_func (thr->dev->target_id, h, d, s);
+  assert (thr && thr->dev);
+
+  thr->dev->dev2host_func (thr->dev->target_id, h, d, s);
 }
 
 /* Return the device pointer that corresponds to host data H.  Or NULL
diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c
index 563f9bb..d899946 100644
--- a/libgomp/oacc-parallel.c
+++ b/libgomp/oacc-parallel.c
@@ -49,32 +49,6 @@  find_pset (int pos, size_t mapnum, unsigned short *kinds)
   return kind == GOMP_MAP_TO_PSET;
 }
 
-
-/* Ensure that the target device for DEVICE_TYPE is initialised (and that
-   plugins have been loaded if appropriate).  The ACC_dev variable for the
-   current thread will be set appropriately for the given device type on
-   return.  */
-
-attribute_hidden void
-select_acc_device (int device_type)
-{
-  goacc_lazy_initialize ();
-
-  if (device_type == GOMP_DEVICE_HOST_FALLBACK)
-    return;
-
-  if (device_type == acc_device_none)
-    device_type = acc_device_host;
-
-  if (device_type >= 0)
-    {
-      /* NOTE: this will go badly if the surrounding data environment is set up
-         to use a different device type.  We'll just have to trust that users
-	 know what they're doing...  */
-      acc_set_device_type (device_type);
-    }
-}
-
 static void goacc_wait (int async, int num_waits, va_list ap);
 
 void
@@ -111,7 +85,7 @@  GOACC_parallel (int device, void (*fn) (void *),
 	      __FUNCTION__, (unsigned long) mapnum, hostaddrs, sizes, kinds,
 	      async);
 #endif
-  select_acc_device (device);
+  goacc_lazy_initialize ();
 
   thr = goacc_thread ();
   acc_dev = thr->dev;
@@ -151,7 +125,7 @@  GOACC_parallel (int device, void (*fn) (void *),
       if (tgt_fn_key == NULL)
 	gomp_fatal ("target function wasn't mapped");
 
-      tgt_fn = (void (*)) tgt_fn_key->tgt->tgt_start;
+      tgt_fn = (void (*)) tgt_fn_key->tgt_offset;
     }
   else
     tgt_fn = (void (*)) fn;
@@ -195,7 +169,7 @@  GOACC_data_start (int device, size_t mapnum,
 	      __FUNCTION__, (unsigned long) mapnum, hostaddrs, sizes, kinds);
 #endif
 
-  select_acc_device (device);
+  goacc_lazy_initialize ();
 
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
@@ -242,7 +216,7 @@  GOACC_enter_exit_data (int device, size_t mapnum,
   bool data_enter = false;
   size_t i;
 
-  select_acc_device (device);
+  goacc_lazy_initialize ();
 
   thr = goacc_thread ();
   acc_dev = thr->dev;
@@ -429,7 +403,7 @@  GOACC_update (int device, size_t mapnum,
   bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK;
   size_t i;
 
-  select_acc_device (device);
+  goacc_lazy_initialize ();
 
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
diff --git a/libgomp/plugin/plugin-host.c b/libgomp/plugin/plugin-host.c
index bc60f72..1faf5bc 100644
--- a/libgomp/plugin/plugin-host.c
+++ b/libgomp/plugin/plugin-host.c
@@ -119,31 +119,6 @@  GOMP_OFFLOAD_unload_image (int n __attribute__ ((unused)),
 }
 
 STATIC void *
-GOMP_OFFLOAD_openacc_open_device (int n)
-{
-  return (void *) (intptr_t) n;
-}
-
-STATIC int
-GOMP_OFFLOAD_openacc_close_device (void *hnd)
-{
-  return 0;
-}
-
-STATIC int
-GOMP_OFFLOAD_openacc_get_device_num (void)
-{
-  return 0;
-}
-
-STATIC void
-GOMP_OFFLOAD_openacc_set_device_num (int n)
-{
-  if (n > 0)
-    GOMP (fatal) ("device number %u out of range for host execution", n);
-}
-
-STATIC void *
 GOMP_OFFLOAD_alloc (int n __attribute__ ((unused)), size_t s)
 {
   return GOMP (malloc) (s);
@@ -254,7 +229,7 @@  GOMP_OFFLOAD_openacc_async_wait_all_async (int async __attribute__ ((unused)))
 }
 
 STATIC void *
-GOMP_OFFLOAD_openacc_create_thread_data (void *targ_data
+GOMP_OFFLOAD_openacc_create_thread_data (int ord
 					 __attribute__ ((unused)))
 {
   return NULL;
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 483cb75..583ec87 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -133,7 +133,8 @@  struct targ_fn_descriptor
   const char *name;
 };
 
-static bool ptx_inited = false;
+static unsigned int instantiated_devices = 0;
+static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
 
 struct ptx_stream
 {
@@ -331,9 +332,21 @@  struct ptx_event
   struct ptx_event *next;
 };
 
+struct ptx_image_data
+{
+  void *target_data;
+  CUmodule module;
+  struct ptx_image_data *next;
+};
+
 static pthread_mutex_t ptx_event_lock;
 static struct ptx_event *ptx_events;
 
+static struct ptx_device **ptx_devices;
+
+static struct ptx_image_data *ptx_images = NULL;
+static pthread_mutex_t ptx_image_lock = PTHREAD_MUTEX_INITIALIZER;
+
 #define _XSTR(s) _STR(s)
 #define _STR(s) #s
 
@@ -450,8 +463,8 @@  fini_streams_for_device (struct ptx_device *ptx_dev)
       struct ptx_stream *s = ptx_dev->active_streams;
       ptx_dev->active_streams = ptx_dev->active_streams->next;
 
-      cuStreamDestroy (s->stream);
       map_fini (s);
+      cuStreamDestroy (s->stream);
       free (s);
     }
 
@@ -575,21 +588,21 @@  select_stream_for_async (int async, pthread_t thread, bool create,
   return stream;
 }
 
-static int nvptx_get_num_devices (void);
-
-/* Initialize the device.  */
-static int
+/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
+   should be locked on entry and remains locked on exit.  */
+static bool
 nvptx_init (void)
 {
   CUresult r;
   int rc;
+  int ndevs;
 
-  if (ptx_inited)
-    return nvptx_get_num_devices ();
+  if (instantiated_devices != 0)
+    return true;
 
   rc = verify_device_library ();
   if (rc < 0)
-    return -1;
+    return false;
 
   r = cuInit (0);
   if (r != CUDA_SUCCESS)
@@ -599,22 +612,64 @@  nvptx_init (void)
 
   pthread_mutex_init (&ptx_event_lock, NULL);
 
-  ptx_inited = true;
+  r = cuDeviceGetCount (&ndevs);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));
 
-  return nvptx_get_num_devices ();
+  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
+					    * ndevs);
+
+  return true;
 }
 
+/* Select the N'th PTX device for the current host thread.  The device must
+   have been opened before calling this function.  */
+
 static void
-nvptx_fini (void)
+nvptx_attach_host_thread_to_device (int n)
 {
-  ptx_inited = false;
+  CUdevice dev;
+  CUresult r;
+  struct ptx_device *ptx_dev;
+  CUcontext thd_ctx;
+
+  r = cuCtxGetDevice (&dev);
+  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
+    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));
+
+  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
+    return;
+  else
+    {
+      CUcontext old_ctx;
+
+      ptx_dev = ptx_devices[n];
+      assert (ptx_dev);
+
+      r = cuCtxGetCurrent (&thd_ctx);
+      if (r != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
+
+      /* We don't necessarily have a current context (e.g. if it has been
+         destroyed).  Pop it if we do though.  */
+      if (thd_ctx != NULL)
+	{
+	  r = cuCtxPopCurrent (&old_ctx);
+	  if (r != CUDA_SUCCESS)
+            GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
+	}
+
+      r = cuCtxPushCurrent (ptx_dev->ctx);
+      if (r != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
+    }
 }
 
-static void *
+static struct ptx_device *
 nvptx_open_device (int n)
 {
   struct ptx_device *ptx_dev;
-  CUdevice dev;
+  CUdevice dev, ctx_dev;
   CUresult r;
   int async_engines, pi;
 
@@ -628,6 +683,21 @@  nvptx_open_device (int n)
   ptx_dev->dev = dev;
   ptx_dev->ctx_shared = false;
 
+  r = cuCtxGetDevice (&ctx_dev);
+  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
+    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));
+  
+  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
+    {
+      /* The current host thread has an active context for a different device.
+         Detach it.  */
+      CUcontext old_ctx;
+      
+      r = cuCtxPopCurrent (&old_ctx);
+      if (r != CUDA_SUCCESS)
+	GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
+    }
+
   r = cuCtxGetCurrent (&ptx_dev->ctx);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
@@ -678,17 +748,16 @@  nvptx_open_device (int n)
 
   init_streams_for_device (ptx_dev, async_engines);
 
-  return (void *) ptx_dev;
+  return ptx_dev;
 }
 
-static int
-nvptx_close_device (void *targ_data)
+static void
+nvptx_close_device (struct ptx_device *ptx_dev)
 {
   CUresult r;
-  struct ptx_device *ptx_dev = targ_data;
 
   if (!ptx_dev)
-    return 0;
+    return;
 
   fini_streams_for_device (ptx_dev);
 
@@ -700,8 +769,6 @@  nvptx_close_device (void *targ_data)
     }
 
   free (ptx_dev);
-
-  return 0;
 }
 
 static int
@@ -714,7 +781,7 @@  nvptx_get_num_devices (void)
      order to enumerate available devices, but CUDA API routines can't be used
      until cuInit has been called.  Just call it now (but don't yet do any
      further initialization).  */
-  if (!ptx_inited)
+  if (instantiated_devices == 0)
     cuInit (0);
 
   r = cuDeviceGetCount (&n);
@@ -1507,64 +1574,84 @@  GOMP_OFFLOAD_get_num_devices (void)
   return nvptx_get_num_devices ();
 }
 
-static void **kernel_target_data;
-static void **kernel_host_table;
-
 void
-GOMP_OFFLOAD_register_image (void *host_table, void *target_data)
+GOMP_OFFLOAD_init_device (int n)
 {
-  kernel_target_data = target_data;
-  kernel_host_table = host_table;
-}
+  pthread_mutex_lock (&ptx_dev_lock);
 
-void
-GOMP_OFFLOAD_init_device (int n __attribute__ ((unused)))
-{
-  (void) nvptx_init ();
+  if (!nvptx_init () || ptx_devices[n] != NULL)
+    {
+      pthread_mutex_unlock (&ptx_dev_lock);
+      return;
+    }
+
+  ptx_devices[n] = nvptx_open_device (n);
+  instantiated_devices++;
+
+  pthread_mutex_unlock (&ptx_dev_lock);
 }
 
 void
-GOMP_OFFLOAD_fini_device (int n __attribute__ ((unused)))
+GOMP_OFFLOAD_fini_device (int n)
 {
-  nvptx_fini ();
+  pthread_mutex_lock (&ptx_dev_lock);
+
+  if (ptx_devices[n] != NULL)
+    {
+      nvptx_attach_host_thread_to_device (n);
+      nvptx_close_device (ptx_devices[n]);
+      ptx_devices[n] = NULL;
+      instantiated_devices--;
+    }
+
+  pthread_mutex_unlock (&ptx_dev_lock);
 }
 
 int
-GOMP_OFFLOAD_get_table (int n __attribute__ ((unused)),
-			struct mapping_table **tablep)
+GOMP_OFFLOAD_load_image (int ord, void *target_data,
+			 struct addr_pair **target_table)
 {
   CUmodule module;
-  void **fn_table;
-  char **fn_names;
-  int fn_entries, i;
+  char **fn_names, **var_names;
+  unsigned int fn_entries, var_entries, i, j;
   CUresult r;
   struct targ_fn_descriptor *targ_fns;
+  void **img_header = (void **) target_data;
+  struct ptx_image_data *new_image;
 
-  if (nvptx_init () <= 0)
-    return 0;
+  GOMP_OFFLOAD_init_device (ord);
 
-  /* This isn't an error, because an image may legitimately have no offloaded
-     regions and so will not call GOMP_offload_register.  */
-  if (kernel_target_data == NULL)
-    return 0;
+  nvptx_attach_host_thread_to_device (ord);
+
+  link_ptx (&module, img_header[0]);
 
-  link_ptx (&module, kernel_target_data[0]);
+  pthread_mutex_lock (&ptx_image_lock);
+  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
+  new_image->target_data = target_data;
+  new_image->module = module;
+  new_image->next = ptx_images;
+  ptx_images = new_image;
+  pthread_mutex_unlock (&ptx_image_lock);
 
-  /* kernel_target_data[0] -> ptx code
-     kernel_target_data[1] -> variable mappings
-     kernel_target_data[2] -> array of kernel names in ascii
+  /* The mkoffload utility emits a table of pointers/integers at the start of
+     each offload image:
 
-     kernel_host_table[0] -> start of function addresses (__offload_func_table)
-     kernel_host_table[1] -> end of function addresses (__offload_funcs_end)
+     img_header[0] -> ptx code
+     img_header[1] -> number of variables
+     img_header[2] -> array of variable names (pointers to strings)
+     img_header[3] -> number of kernels
+     img_header[4] -> array of kernel names (pointers to strings)
 
      The array of kernel names and the functions addresses form a
      one-to-one correspondence.  */
 
-  fn_table = kernel_host_table[0];
-  fn_names = (char **) kernel_target_data[2];
-  fn_entries = (kernel_host_table[1] - kernel_host_table[0]) / sizeof (void *);
+  var_entries = (uintptr_t) img_header[1];
+  var_names = (char **) img_header[2];
+  fn_entries = (uintptr_t) img_header[3];
+  fn_names = (char **) img_header[4];
 
-  *tablep = GOMP_PLUGIN_malloc (sizeof (struct mapping_table) * fn_entries);
+  *target_table = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
+				      * (fn_entries + var_entries));
   targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
 				 * fn_entries);
 
@@ -1579,38 +1666,86 @@  GOMP_OFFLOAD_get_table (int n __attribute__ ((unused)),
       targ_fns[i].fn = function;
       targ_fns[i].name = (const char *) fn_names[i];
 
-      (*tablep)[i].host_start = (uintptr_t) fn_table[i];
-      (*tablep)[i].host_end = (*tablep)[i].host_start + 1;
-      (*tablep)[i].tgt_start = (uintptr_t) &targ_fns[i];
-      (*tablep)[i].tgt_end = (*tablep)[i].tgt_start + 1;
+      (*target_table)[i].start = (uintptr_t) &targ_fns[i];
+      (*target_table)[i].end = (*target_table)[i].start + 1;
     }
 
-  return fn_entries;
+  for (j = 0; j < var_entries; j++, i++)
+    {
+      CUdeviceptr var;
+      size_t bytes;
+
+      r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
+      if (r != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
+
+      (*target_table)[i].start = (uintptr_t) var;
+      (*target_table)[i].end = (*target_table)[i].start + bytes;
+    }
+
+  return i;
+}
+
+/* Unload the offload image TARGET_DATA: every PTX_IMAGES entry recorded for
+   it has its CUDA module unloaded and is unlinked from the list.  TID is
+   unused.  Element 0 of the image header is the PTX code passed to link_ptx,
+   not the function descriptor array, so it must not be passed to free.  */
+
+void
+GOMP_OFFLOAD_unload_image (int tid __attribute__((unused)), void *target_data)
+{
+  struct ptx_image_data *image, *prev = NULL, *newhd = NULL;
+
+  pthread_mutex_lock (&ptx_image_lock);
+  for (image = ptx_images; image != NULL;)
+    {
+      struct ptx_image_data *next = image->next;
+
+      if (image->target_data == target_data)
+	{
+	  cuModuleUnload (image->module);
+	  free (image);
+	  if (prev)
+	    prev->next = next;
+	}
+      else
+	{
+	  prev = image;
+	  if (!newhd)
+	    newhd = image;
+	}
+
+      image = next;
+    }
+  ptx_images = newhd;
+  pthread_mutex_unlock (&ptx_image_lock);
+}
 
 void *
-GOMP_OFFLOAD_alloc (int n __attribute__ ((unused)), size_t size)
+GOMP_OFFLOAD_alloc (int ord, size_t size)
 {
+  nvptx_attach_host_thread_to_device (ord);
   return nvptx_alloc (size);
 }
 
 void
-GOMP_OFFLOAD_free (int n __attribute__ ((unused)), void *ptr)
+GOMP_OFFLOAD_free (int ord, void *ptr)
 {
+  nvptx_attach_host_thread_to_device (ord);
   nvptx_free (ptr);
 }
 
 void *
-GOMP_OFFLOAD_dev2host (int ord __attribute__ ((unused)), void *dst,
-		       const void *src, size_t n)
+GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
 {
+  nvptx_attach_host_thread_to_device (ord);
   return nvptx_dev2host (dst, src, n);
 }
 
 void *
-GOMP_OFFLOAD_host2dev (int ord __attribute__ ((unused)), void *dst,
-		       const void *src, size_t n)
+GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
 {
+  nvptx_attach_host_thread_to_device (ord);
   return nvptx_host2dev (dst, src, n);
 }
 
@@ -1627,45 +1762,6 @@  GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
 	    num_workers, vector_length, async, targ_mem_desc);
 }
 
-void *
-GOMP_OFFLOAD_openacc_open_device (int n)
-{
-  return nvptx_open_device (n);
-}
-
-int
-GOMP_OFFLOAD_openacc_close_device (void *h)
-{
-  return nvptx_close_device (h);
-}
-
-void
-GOMP_OFFLOAD_openacc_set_device_num (int n)
-{
-  struct nvptx_thread *nvthd = nvptx_thread ();
-
-  assert (n >= 0);
-
-  if (!nvthd->ptx_dev || nvthd->ptx_dev->ord != n)
-    (void) nvptx_open_device (n);
-}
-
-/* This can be called before the device is "opened" for the current thread, in
-   which case we can't tell which device number should be returned.  We don't
-   actually want to open the device here, so just return -1 and let the caller
-   (oacc-init.c:acc_get_device_num) handle it.  */
-
-int
-GOMP_OFFLOAD_openacc_get_device_num (void)
-{
-  struct nvptx_thread *nvthd = nvptx_thread ();
-
-  if (nvthd && nvthd->ptx_dev)
-    return nvthd->ptx_dev->ord;
-  else
-    return -1;
-}
-
 void
 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
 {
@@ -1729,14 +1825,18 @@  GOMP_OFFLOAD_openacc_async_set_async (int async)
 }
 
 void *
-GOMP_OFFLOAD_openacc_create_thread_data (void *targ_data)
+GOMP_OFFLOAD_openacc_create_thread_data (int ord)
 {
-  struct ptx_device *ptx_dev = (struct ptx_device *) targ_data;
+  struct ptx_device *ptx_dev;
   struct nvptx_thread *nvthd
     = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
   CUresult r;
   CUcontext thd_ctx;
 
+  ptx_dev = ptx_devices[ord];
+
+  assert (ptx_dev);
+
   r = cuCtxGetCurrent (&thd_ctx);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
diff --git a/libgomp/target.c b/libgomp/target.c
index dfe7fb9..d8da783 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -178,7 +178,6 @@  gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
   tgt->list_count = mapnum;
   tgt->refcount = 1;
   tgt->device_descr = devicep;
-  tgt->mem_map = mem_map;
 
   if (mapnum == 0)
     return tgt;
@@ -597,7 +596,7 @@  gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
 	  devicep->dev2host_func (devicep->target_id, (void *) k->host_start,
 				  (void *) (k->tgt->tgt_start + k->tgt_offset),
 				  k->host_end - k->host_start);
-	splay_tree_remove (tgt->mem_map, k);
+	splay_tree_remove (&devicep->mem_map, k);
 	if (k->tgt->refcount > 1)
 	  k->tgt->refcount--;
 	else
@@ -1159,10 +1158,6 @@  gomp_load_plugin_for_device (struct gomp_device_descr *device,
     {
       optional_present = optional_total = 0;
       DLSYM_OPT (openacc.exec, openacc_parallel);
-      DLSYM_OPT (openacc.open_device, openacc_open_device);
-      DLSYM_OPT (openacc.close_device, openacc_close_device);
-      DLSYM_OPT (openacc.get_device_num, openacc_get_device_num);
-      DLSYM_OPT (openacc.set_device_num, openacc_set_device_num);
       DLSYM_OPT (openacc.register_async_cleanup,
 		 openacc_register_async_cleanup);
       DLSYM_OPT (openacc.async_test, openacc_async_test);
@@ -1271,7 +1266,6 @@  gomp_target_init (void)
 		current_device.mem_map.root = NULL;
 		current_device.is_initialized = false;
 		current_device.openacc.data_environ = NULL;
-		current_device.openacc.target_data = NULL;
 		for (i = 0; i < new_num_devices; i++)
 		  {
 		    current_device.target_id = i;
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-9.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-9.c
index 84045db..a4cf7f2 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-9.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-9.c
@@ -58,7 +58,7 @@  main (int argc, char **argv)
       acc_set_device_num (1, (acc_device_t) 0);
 
       devnum = acc_get_device_num (devtype);
-      if (devnum != 0)
+      if (devnum != 1)
 	abort ();
   }