[1/2,og8] Further OpenACC reference-counting improvements
diff mbox series

Message ID 293170ece8a8b198373849be97b3950e8fbcccc8.1543438190.git.julian@codesourcery.com
State New
Headers show
Series
  • Further OpenACC/libgomp refcounting fixes
Related show

Commit Message

Julian Brown Nov. 28, 2018, 9:22 p.m. UTC
This is the main set of improvements to reference-counting behaviour
(see parent email for further details).

ChangeLog

	libgomp/
	* libgomp.h (splay_tree_key_s): Substitute dynamic_refcount field for
	virtual_refcount.
	(acc_dispatch_t): Remove data_environ field.
	(gomp_acc_insert_pointer, gomp_acc_data_env_remove_tgt): Remove
	prototypes.
	(gomp_acc_remove_pointer): Update prototype.
	* oacc-async.c (goacc_remove_var_async): New function.
	* oacc-host.c (host_dispatch): Don't initialise removed data_environ
	field.
	* oacc-init.c (acc_shutdown_1): Use gomp_remove_var instead of
	gomp_unmap_vars to remove mappings by splay tree key instead of target
	memory descriptor.
	* oacc-int.h (splay_tree_key_s): Add forward declaration.
	(goacc_remove_car_async): Add prototype.
	* oacc-mem.c (gomp_acc_data_env_remove, gomp_acc_data_env_remove_tgt):
	Remove functions.
	(present_create_copy): Use virtual_refcount instead of dynamic_refcount,
	and don't modify after calling gomp_map_vars_async.  Don't create dummy
	target_mem_desc.  Fix target pointer return value.
	(delete_copyout): Update for virtual_refcount semantics.  Use
	goacc_remove_var_async for asynchronous delete/copyouts.
	(gomp_acc_insert_pointer): Remove function.
	(gomp_acc_remove_pointer): Use virtual_refcount semantics.
	* oacc-parallel.c (find_pointer): Add missing GOMP_MAP_FORCE_DETACH
	case.
	(GOACC_enter_exit_data): Fix struct mapping/unmapping for
	virtual_refcount semantics.  Fix attach/detach behaviour.  Don't call
	gomp_acc_insert_pointer.
	* target.c (gomp_map_vars_existing): Fix initialisation of do_detach
	field.
	(gomp_map_vars_async): Handle GOMP_MAP_VARS_OPENACC_ENTER_DATA.  Update
	for virtual_refcount semantics.  Add some missing initialisations in
	dynamic array code paths.
	(gomp_unmap_tgt): Don't call gomp_acc_data_env_remove_tgt.
	(gomp_remove_var): Fix use-after-free.
	(gomp_unmap_vars_async): Update for virtual_refcount semantics.
	(gomp_load_image_to_device): Don't use tgt's variable list to store
	static function and variable mappings. Initialise virtual refcount.
	(gomp_target_init): Don't initialise removed data_environ field.
	* testsuite/libgomp.oacc-c-c++-common/deep-copy-7.c: Update test for
	fixed refcount behaviour.
	* testsuite/libgomp.oacc-c-c++-common/deep-copy-8.c: Likewise.
---
 libgomp/libgomp.h                                  |   22 +--
 libgomp/oacc-async.c                               |   18 ++
 libgomp/oacc-host.c                                |    2 -
 libgomp/oacc-init.c                                |    6 +-
 libgomp/oacc-int.h                                 |    5 +
 libgomp/oacc-mem.c                                 |  206 +++++---------------
 libgomp/oacc-parallel.c                            |  127 ++++++-------
 libgomp/target.c                                   |   63 ++++---
 .../libgomp.oacc-c-c++-common/deep-copy-7.c        |   11 +-
 .../libgomp.oacc-c-c++-common/deep-copy-8.c        |    1 +
 10 files changed, 189 insertions(+), 272 deletions(-)

Patch
diff mbox series

diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 568e260..ea44afc 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -860,8 +860,11 @@  struct splay_tree_key_s {
   uintptr_t tgt_offset;
   /* Reference count.  */
   uintptr_t refcount;
-  /* Dynamic reference count.  */
-  uintptr_t dynamic_refcount;
+  /* Reference counts beyond those that represent genuine references in the
+     linked splay tree key/target memory structures, e.g. for multiple OpenACC
+     "present increment" operations (via "acc enter data") refering to the same
+     host-memory block.  */
+  uintptr_t virtual_refcount;
   /* For a block with attached pointers, the attachment counters for each.  */
   unsigned short *attach_count;
   /* Pointer to the original mapping of "omp declare target link" object.  */
@@ -887,13 +890,6 @@  splay_compare (splay_tree_key x, splay_tree_key y)
 
 typedef struct acc_dispatch_t
 {
-  /* This is a linked list of data mapped using the
-     acc_map_data/acc_unmap_data or "acc enter data"/"acc exit data" pragmas.
-     Unlike mapped_data in the goacc_thread struct, unmapping can
-     happen out-of-order with respect to mapping.  */
-  /* This is guarded by the lock in the "outer" struct gomp_device_descr.  */
-  struct target_mem_desc *data_environ;
-
   /* Execute.  */
   __typeof (GOMP_OFFLOAD_openacc_exec) *exec_func;
   __typeof (GOMP_OFFLOAD_openacc_exec_params) *exec_params_func;
@@ -1010,9 +1006,9 @@  enum gomp_map_vars_kind
 
 struct gomp_coalesce_buf;
 
-extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *, int);
-extern void gomp_acc_remove_pointer (void **, size_t *, unsigned short *,
-				     int, void *, bool, int);
+extern void gomp_acc_remove_pointer (struct gomp_device_descr *, void **,
+				     size_t *, unsigned short *, int, bool,
+				     int);
 extern void gomp_acc_declare_allocate (bool, size_t, void **, size_t *,
 				       unsigned short *);
 struct gomp_coalesce_buf;
@@ -1041,8 +1037,6 @@  extern struct target_mem_desc *gomp_map_vars_async (struct gomp_device_descr *,
 						    size_t, void **, void **,
 						    size_t *, void *, bool,
 						    enum gomp_map_vars_kind);
-extern void gomp_acc_data_env_remove_tgt (struct target_mem_desc **,
-					  struct target_mem_desc *);
 extern void gomp_unmap_tgt (struct target_mem_desc *);
 extern void gomp_unmap_vars (struct target_mem_desc *, bool);
 extern void gomp_unmap_vars_async (struct target_mem_desc *, bool,
diff --git a/libgomp/oacc-async.c b/libgomp/oacc-async.c
index bb00279..be47222 100644
--- a/libgomp/oacc-async.c
+++ b/libgomp/oacc-async.c
@@ -385,6 +385,24 @@  goacc_async_copyout_unmap_vars (struct target_mem_desc *tgt,
 					      (void *) tgt);
 }
 
+/* Remove a variable asynchronously.  This actually removes the variable
+   mapping immediately, but retains the linked target_mem_desc until the
+   asynchronous operation has completed (as it may still refer to target
+   memory).  The device lock must be held before entry, and remains locked on
+   exit.  */
+
+attribute_hidden void
+goacc_remove_var_async (struct gomp_device_descr *devicep, splay_tree_key n,
+                       struct goacc_asyncqueue *aq)
+{
+  struct target_mem_desc *tgt = n->tgt;
+  assert (tgt);
+  tgt->refcount++;
+  gomp_remove_var (devicep, n);
+  devicep->openacc.async.queue_callback_func (aq, goacc_async_unmap_tgt,
+                                             (void *) tgt);
+}
+
 attribute_hidden void
 goacc_async_free (struct gomp_device_descr *devicep,
 		  struct goacc_asyncqueue *aq, void *ptr)
diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c
index 4521fff..00bc5f6 100644
--- a/libgomp/oacc-host.c
+++ b/libgomp/oacc-host.c
@@ -286,8 +286,6 @@  static struct gomp_device_descr host_dispatch =
     .state = GOMP_DEVICE_UNINITIALIZED,
 
     .openacc = {
-      .data_environ = NULL,
-
       .exec_func = host_openacc_exec,
       .exec_params_func = host_openacc_exec_params,
 
diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c
index 48c9646..5e38d01 100644
--- a/libgomp/oacc-init.c
+++ b/libgomp/oacc-init.c
@@ -389,9 +389,9 @@  acc_shutdown_1 (acc_device_t d)
 	{
 	  while (walk->dev->mem_map.root)
 	    {
-	      struct target_mem_desc *tgt = walk->dev->mem_map.root->key.tgt;
-
-	      gomp_unmap_vars (tgt, false);
+	      splay_tree_key k = &walk->dev->mem_map.root->key;
+	      k->link_key = NULL;
+	      gomp_remove_var (walk->dev, k);
 	    }
 
 	  walk->dev = NULL;
diff --git a/libgomp/oacc-int.h b/libgomp/oacc-int.h
index 1f6c62c..d903065 100644
--- a/libgomp/oacc-int.h
+++ b/libgomp/oacc-int.h
@@ -109,10 +109,15 @@  void goacc_restore_bind (void);
 void goacc_lazy_initialize (void);
 void goacc_host_init (void);
 
+struct splay_tree_key_s;
+
 void goacc_init_asyncqueues (struct gomp_device_descr *);
 bool goacc_fini_asyncqueues (struct gomp_device_descr *);
 void goacc_async_copyout_unmap_vars (struct target_mem_desc *,
 				     struct goacc_asyncqueue *);
+void goacc_remove_var_async (struct gomp_device_descr *devicep,
+			     struct splay_tree_key_s *n,
+			     struct goacc_asyncqueue *aq);
 void goacc_async_free (struct gomp_device_descr *,
 		       struct goacc_asyncqueue *, void *);
 struct goacc_asyncqueue *get_goacc_asyncqueue (int);
diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c
index 3202f06..9b70820 100644
--- a/libgomp/oacc-mem.c
+++ b/libgomp/oacc-mem.c
@@ -439,77 +439,6 @@  acc_map_data (void *h, void *d, size_t s)
     }
 }
 
-/* Remove the target_mem_desc holding the mapping for MAPNUM HOSTADDRS from
-   the OpenACC data environment pointed to by DATA_ENV.  The device lock
-   should be held before calling, and remains locked on exit.  */
-
-static void
-gomp_acc_data_env_remove (struct gomp_device_descr *acc_dev,
-			  struct target_mem_desc **data_env, void **hostaddrs,
-			  int mapnum)
-{
-  struct target_mem_desc *t, *tp;
-
-  for (tp = NULL, t = *data_env; t != NULL; tp = t, t = t->prev)
-    {
-      bool all_match = true;
-
-      /* We must locate the target descriptor by "value", matching each
-	 hostaddr that it describes.  */
-      if (t->list_count != mapnum)
-        continue;
-
-      for (int i = 0; i < t->list_count; i++)
-	if (t->list[i].key
-	    && (t->list[i].key->host_start + t->list[i].offset
-		!= (uintptr_t) hostaddrs[i]))
-	  {
-	    all_match = false;
-	    break;
-	  }
-
-      if (all_match)
-	{
-	  if (t->refcount > 1)
-	    t->refcount--;
-	  else
-	    {
-	      if (tp)
-		tp->prev = t->prev;
-	      else
-		*data_env = t->prev;
-	    }
-	  return;
-	}
-    }
-
-  gomp_mutex_unlock (&acc_dev->lock);
-  gomp_fatal ("cannot find data mapping to remove in data environment");
-}
-
-/* Similar, but removes target_mem_desc REMOVE from the DATA_ENV, in case its
-   reference count drops to zero resulting in it being unmapped (in
-   target.c:gomp_unmap_tgt).  Unlike the above function it is not an error if
-   REMOVE is not present in the environment.  The device lock should be held
-   before calling, and remains locked on exit.  */
-
-attribute_hidden void
-gomp_acc_data_env_remove_tgt (struct target_mem_desc **data_env,
-			      struct target_mem_desc *remove)
-{
-  struct target_mem_desc *t, *tp;
-
-  for (tp = NULL, t = *data_env; t != NULL; tp = t, t = t->prev)
-    if (t == remove)
-      {
-	if (tp)
-	  tp->prev = t->prev;
-	else
-	  *data_env = t->prev;
-	return;
-      }
-}
-
 void
 acc_unmap_data (void *h)
 {
@@ -626,26 +555,9 @@  present_create_copy (unsigned f, void *h, size_t s, int async)
       if (n->refcount != REFCOUNT_INFINITY)
 	{
 	  n->refcount++;
-	  n->dynamic_refcount++;
+	  n->virtual_refcount++;
 	}
 
-      struct target_mem_desc *tgt = gomp_malloc (sizeof (*tgt)
-						 + sizeof (tgt->list[0]));
-      tgt->refcount = 1;
-      tgt->tgt_start = 0;
-      tgt->tgt_end = 0;
-      tgt->to_free = NULL;
-      tgt->prev = acc_dev->openacc.data_environ;
-      tgt->list_count = 1;
-      tgt->device_descr = acc_dev;
-      tgt->list[0].key = n;
-      tgt->list[0].copy_from = false;
-      tgt->list[0].always_copy_from = false;
-      tgt->list[0].do_detach = false;
-      tgt->list[0].offset = (uintptr_t) h - n->host_start;
-      tgt->list[0].length = 0;
-      acc_dev->openacc.data_environ = tgt;
-
       gomp_mutex_unlock (&acc_dev->lock);
     }
   else if (!(f & FLAG_CREATE))
@@ -655,7 +567,6 @@  present_create_copy (unsigned f, void *h, size_t s, int async)
     }
   else
     {
-      struct target_mem_desc *tgt;
       size_t mapnum = 1;
       unsigned short kinds;
       void *hostaddrs = h;
@@ -669,20 +580,15 @@  present_create_copy (unsigned f, void *h, size_t s, int async)
 
       goacc_aq aq = get_goacc_asyncqueue (async);
 
-      tgt = gomp_map_vars_async (acc_dev, aq, mapnum, &hostaddrs, NULL, &s,
-				 &kinds, true,
-				 GOMP_MAP_VARS_OPENACC_ENTER_DATA);
-
-      for (int i = 0; i < tgt->list_count; i++)
-        if (tgt->list[i].key)
-	  tgt->list[i].key->dynamic_refcount++;
+      gomp_map_vars_async (acc_dev, aq, mapnum, &hostaddrs, NULL, &s, &kinds,
+			   true, GOMP_MAP_VARS_OPENACC_ENTER_DATA);
 
       gomp_mutex_lock (&acc_dev->lock);
-      tgt->prev = acc_dev->openacc.data_environ;
-      acc_dev->openacc.data_environ = tgt;
+      n = lookup_host (acc_dev, h, s);
+      assert (n != NULL);
+      d = (void *) (n->tgt->tgt_start + n->tgt_offset + (uintptr_t) h
+		    - n->host_start);
       gomp_mutex_unlock (&acc_dev->lock);
-
-      d = tgt->to_free;
     }
 
   if (profiling_setup_p)
@@ -765,7 +671,6 @@  delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
 {
   size_t host_size;
   splay_tree_key n;
-  void *d;
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
 
@@ -797,9 +702,6 @@  delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
       gomp_fatal ("[%p,%d] is not mapped", (void *)h, (int)s);
     }
 
-  d = (void *) (n->tgt->tgt_start + n->tgt_offset
-		+ (uintptr_t) h - n->host_start);
-
   host_size = n->host_end - n->host_start;
 
   if (n->host_start != (uintptr_t) h || host_size != s)
@@ -812,29 +714,37 @@  delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
   if (n->refcount == REFCOUNT_INFINITY)
     {
       n->refcount = 0;
-      n->dynamic_refcount = 0;
-      n->attach_count = NULL;
+      n->virtual_refcount = 0;
     }
 
   if (f & FLAG_FINALIZE)
     {
-      n->refcount -= n->dynamic_refcount;
-      n->dynamic_refcount = 0;
+      n->refcount -= n->virtual_refcount;
+      n->virtual_refcount = 0;
     }
-  else if (n->dynamic_refcount)
+
+  if (n->virtual_refcount > 0)
     {
-      n->dynamic_refcount--;
       n->refcount--;
+      n->virtual_refcount--;
     }
+  else if (n->refcount > 0)
+    n->refcount--;
 
   if (n->refcount == 0)
     {
+      goacc_aq aq = get_goacc_asyncqueue (async);
+
       if (f & FLAG_COPYOUT)
-	{
-	  goacc_aq aq = get_goacc_asyncqueue (async);
+        {
+	  void *d = (void *) (n->tgt->tgt_start + n->tgt_offset
+			      + (uintptr_t) h - n->host_start);
 	  gomp_copy_dev2host (acc_dev, aq, h, d, s);
 	}
-      gomp_remove_var (acc_dev, n);
+      if (aq)
+	goacc_remove_var_async (acc_dev, n, aq);
+      else
+	gomp_remove_var (acc_dev, n);
     }
 
   gomp_mutex_unlock (&acc_dev->lock);
@@ -1003,53 +913,15 @@  gomp_acc_declare_allocate (bool allocate, size_t mapnum, void **hostaddrs,
 }
 
 void
-gomp_acc_insert_pointer (size_t mapnum, void **hostaddrs, size_t *sizes,
-			 void *kinds, int async)
+gomp_acc_remove_pointer (struct gomp_device_descr *acc_dev, void **hostaddrs,
+			 size_t *sizes, unsigned short *kinds, int async,
+			 bool finalize, int mapnum)
 {
-  struct target_mem_desc *tgt;
-  struct goacc_thread *thr = goacc_thread ();
-  struct gomp_device_descr *acc_dev = thr->dev;
-
-  gomp_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
-  goacc_aq aq = get_goacc_asyncqueue (async);
-  tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs,
-			     NULL, sizes, kinds, true,
-			     GOMP_MAP_VARS_OPENACC_ENTER_DATA);
-  gomp_debug (0, "  %s: mappings prepared\n", __FUNCTION__);
-
-  for (size_t i = 0; i < tgt->list_count; i++)
-    if (tgt->list[i].key)
-      tgt->list[i].key->dynamic_refcount++;
-
-  gomp_mutex_lock (&acc_dev->lock);
-  tgt->prev = acc_dev->openacc.data_environ;
-  acc_dev->openacc.data_environ = tgt;
-  gomp_mutex_unlock (&acc_dev->lock);
-}
-
-void
-gomp_acc_remove_pointer (void **hostaddrs, size_t *sizes, unsigned short *kinds,
-			 int async, void *detach_from, bool finalize,
-			 int mapnum)
-{
-  struct goacc_thread *thr = goacc_thread ();
-  struct gomp_device_descr *acc_dev = thr->dev;
   struct splay_tree_key_s cur_node;
   splay_tree_key n;
 
   gomp_mutex_lock (&acc_dev->lock);
 
-  if (detach_from)
-    {
-      splay_tree_key n2 = lookup_host (acc_dev, detach_from, 1);
-      goacc_aq aq = get_goacc_asyncqueue (async);
-      gomp_detach_pointer (acc_dev, aq, n2, (uintptr_t) detach_from, finalize,
-			   NULL);
-    }
-
-  gomp_acc_data_env_remove (acc_dev, &acc_dev->openacc.data_environ, hostaddrs,
-			    mapnum);
-
   for (int i = 0; i < mapnum; i++)
     {
       int kind = kinds[i] & 0xff;
@@ -1062,6 +934,7 @@  gomp_acc_remove_pointer (void **hostaddrs, size_t *sizes, unsigned short *kinds,
 	case GOMP_MAP_ALWAYS_FROM:
 	  copyfrom = true;
 	  /* Fallthrough.  */
+
 	case GOMP_MAP_TO_PSET:
 	case GOMP_MAP_POINTER:
 	case GOMP_MAP_DELETE:
@@ -1075,27 +948,41 @@  gomp_acc_remove_pointer (void **hostaddrs, size_t *sizes, unsigned short *kinds,
 				  || kind == GOMP_MAP_POINTER)
 				 ? sizeof (void *) : sizes[i]);
 	  n = splay_tree_lookup (&acc_dev->mem_map, &cur_node);
+
 	  if (n == NULL)
 	    continue;
+
+	  if (n->refcount == REFCOUNT_INFINITY)
+	    {
+	      n->refcount = 1;
+	      n->virtual_refcount = 0;
+	    }
+
 	  if (finalize)
 	    {
-	      n->refcount -= n->dynamic_refcount;
-	      n->dynamic_refcount = 0;
+	      n->refcount -= n->virtual_refcount;
+	      n->virtual_refcount = 0;
 	    }
-	  else if (n->refcount > 0 && n->refcount != REFCOUNT_INFINITY)
+
+	  if (n->virtual_refcount > 0)
 	    {
 	      n->refcount--;
-	      n->dynamic_refcount--;
+	      n->virtual_refcount--;
 	    }
+	  else if (n->refcount > 0)
+	    n->refcount--;
+
 	  if (copyfrom)
 	    gomp_copy_dev2host (acc_dev, NULL, (void *) cur_node.host_start,
 				(void *) (n->tgt->tgt_start + n->tgt_offset
 					  + cur_node.host_start
 					  - n->host_start),
 				cur_node.host_end - cur_node.host_start);
+
 	  if (n->refcount == 0)
 	    gomp_remove_var (acc_dev, n);
 	  break;
+
 	default:
 	  gomp_mutex_unlock (&acc_dev->lock);
 	  gomp_fatal ("gomp_acc_remove_pointer unhandled kind 0x%.2x",
@@ -1103,7 +990,6 @@  gomp_acc_remove_pointer (void **hostaddrs, size_t *sizes, unsigned short *kinds,
 	}
     }
 
-
   gomp_mutex_unlock (&acc_dev->lock);
 }
 
diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c
index 8a3c65b..a4487b8 100644
--- a/libgomp/oacc-parallel.c
+++ b/libgomp/oacc-parallel.c
@@ -69,7 +69,8 @@  find_pointer (int pos, size_t mapnum, unsigned short *kinds)
 	if (kind1 == GOMP_MAP_POINTER
 	    || kind1 == GOMP_MAP_ALWAYS_POINTER
 	    || kind1 == GOMP_MAP_ATTACH
-	    || kind1 == GOMP_MAP_DETACH)
+	    || kind1 == GOMP_MAP_DETACH
+	    || kind1 == GOMP_MAP_FORCE_DETACH)
 	  return 2;
 	else if (kind1 == GOMP_MAP_TO_PSET)
 	  return 3;
@@ -847,42 +848,10 @@  GOACC_enter_exit_data (int device, size_t mapnum,
 		case GOMP_MAP_STRUCT:
 		  {
 		    int elems = sizes[i];
-		    struct splay_tree_key_s k;
-		    splay_tree_key str;
-		    uintptr_t elems_lo = (uintptr_t) hostaddrs[i + 1];
-		    uintptr_t elems_hi = (uintptr_t) hostaddrs[i + elems]
-					 + sizes[i + elems];
-		    k.host_start = elems_lo;
-		    k.host_end = elems_hi;
-		    gomp_mutex_lock (&acc_dev->lock);
-		    str = splay_tree_lookup (&acc_dev->mem_map, &k);
-		    gomp_mutex_unlock (&acc_dev->lock);
-		    if (str == NULL)
-		      {
-		        size_t mapsize = elems_hi - elems_lo;
-			goacc_aq aq = get_goacc_asyncqueue (async);
-			struct target_mem_desc *tgt;
-			unsigned short thiskind = GOMP_MAP_ALLOC;
-			int j;
-			for (j = 0; j < elems; j++)
-			  if ((kinds[i + j] & 0xff) != GOMP_MAP_ALLOC)
-			    {
-			      thiskind = GOMP_MAP_TO;
-			      break;
-			    }
-			tgt = gomp_map_vars_async (acc_dev, aq, 1,
-				&hostaddrs[i + 1], NULL, &mapsize, &thiskind,
-				true, GOMP_MAP_VARS_OPENACC_ENTER_DATA);
-
-			for (j = 0; j < tgt->list_count; j++)
-			  if (tgt->list[j].key)
-			    tgt->list[j].key->dynamic_refcount++;
-
-			gomp_mutex_lock (&acc_dev->lock);
-			tgt->prev = acc_dev->openacc.data_environ;
-			acc_dev->openacc.data_environ = tgt;
-			gomp_mutex_unlock (&acc_dev->lock);
-		      }
+		    goacc_aq aq = get_goacc_asyncqueue (async);
+		    gomp_map_vars_async (acc_dev, aq, elems + 1, &hostaddrs[i],
+					 NULL, &sizes[i], &kinds[i], true,
+					 GOMP_MAP_VARS_OPENACC_ENTER_DATA);
 		    i += elems;
 		  }
 		  break;
@@ -898,8 +867,15 @@  GOACC_enter_exit_data (int device, size_t mapnum,
 		gomp_acc_declare_allocate (true, pointer, &hostaddrs[i],
 					   &sizes[i], &kinds[i]);
 	      else
-		gomp_acc_insert_pointer (pointer, &hostaddrs[i],
-					 &sizes[i], &kinds[i], async);
+	        {
+		  goacc_aq aq = get_goacc_asyncqueue (async);
+	          for (int j = 0; j < 2; j++)
+		    gomp_map_vars_async (acc_dev, aq,
+					 (j == 0 || pointer == 2) ? 1 : 2,
+					 &hostaddrs[i + j], NULL,
+					 &sizes[i + j], &kinds[i + j], true,
+					 GOMP_MAP_VARS_OPENACC_ENTER_DATA);
+		}
 	      /* Increment 'i' by two because OpenACC requires fortran
 		 arrays to be contiguous, so each PSET is associated with
 		 one of MAP_FORCE_ALLOC/MAP_FORCE_PRESET/MAP_FORCE_TO, and
@@ -930,8 +906,7 @@  GOACC_enter_exit_data (int device, size_t mapnum,
     }
   else
     {
-      /* This loop only handles explicit "detach" clauses that are not an
-	 implicit part of a copy{,in,out}, etc. mapping.  */
+      /* Handle "detach" before copyback/deletion of mapped data.  */
       for (i = 0; i < mapnum; i++)
         {
 	  unsigned char kind = kinds[i] & 0xff;
@@ -948,7 +923,16 @@  GOACC_enter_exit_data (int device, size_t mapnum,
 	        i += sizes[i];
 	    }
 	  else
-	    i += pointer - 1;
+	    {
+	      unsigned char kind2 = kinds[i + pointer - 1] & 0xff;
+
+	      if (kind2 == GOMP_MAP_DETACH)
+		acc_detach (hostaddrs[i + pointer - 1]);
+	      else if (kind2 == GOMP_MAP_FORCE_DETACH)
+	        acc_detach_finalize (hostaddrs[i + pointer - 1]);
+
+	      i += pointer - 1;
+	    }
 	}
 
       for (i = 0; i < mapnum; ++i)
@@ -985,19 +969,39 @@  GOACC_enter_exit_data (int device, size_t mapnum,
 	      case GOMP_MAP_STRUCT:
 		{
 		  int elems = sizes[i];
-		  struct splay_tree_key_s k;
-		  splay_tree_key str;
-		  uintptr_t elems_lo = (uintptr_t) hostaddrs[i + 1];
-		  uintptr_t elems_hi = (uintptr_t) hostaddrs[i + elems]
-				       + sizes[i + elems];
-		  k.host_start = elems_lo;
-		  k.host_end = elems_hi;
-		  gomp_mutex_lock (&acc_dev->lock);
-		  str = splay_tree_lookup (&acc_dev->mem_map, &k);
-		  gomp_mutex_unlock (&acc_dev->lock);
-		  if (str == NULL)
-		    gomp_fatal ("[%p,%ld] is not mapped", (void *) elems_lo,
-				(unsigned long) (elems_hi - elems_lo));
+		  goacc_aq aq = get_goacc_asyncqueue (async);
+		  for (int j = 1; j <= elems; j++)
+		    {
+		      struct splay_tree_key_s k;
+		      k.host_start = (uintptr_t) hostaddrs[i + j];
+		      k.host_end = k.host_start + sizes[i + j];
+		      splay_tree_key str;
+		      gomp_mutex_lock (&acc_dev->lock);
+		      str = splay_tree_lookup (&acc_dev->mem_map, &k);
+		      gomp_mutex_unlock (&acc_dev->lock);
+		      if (str)
+		        {
+			  if (finalize)
+			    {
+			      str->refcount -= str->virtual_refcount;
+			      str->virtual_refcount = 0;
+			    }
+			  if (str->virtual_refcount > 0)
+			    {
+			      str->refcount--;
+			      str->virtual_refcount--;
+			    }
+			  else if (str->refcount > 0)
+			    str->refcount--;
+			  if (str->refcount == 0)
+			    {
+			      if (aq)
+				goacc_remove_var_async (acc_dev, str, aq);
+			      else
+				gomp_remove_var (acc_dev, str);
+			    }
+			}
+		    }
 		  i += elems;
 		}
 		break;
@@ -1012,17 +1016,8 @@  GOACC_enter_exit_data (int device, size_t mapnum,
 		gomp_acc_declare_allocate (false, pointer, &hostaddrs[i],
 					   &sizes[i], &kinds[i]);
 	      else
-		{
-		  unsigned short ptrkind = kinds[i + pointer - 1] & 0xff;
-		  bool detach = (ptrkind == GOMP_MAP_DETACH
-				 || ptrkind == GOMP_MAP_FORCE_DETACH);
-		  void *detach_from = detach ? hostaddrs[i + pointer - 1]
-					     : NULL;
-		  gomp_acc_remove_pointer (&hostaddrs[i], &sizes[i], &kinds[i],
-					   async, detach_from, finalize,
-					   pointer);
-		  /* See the above comment.  */
-		}
+		gomp_acc_remove_pointer (acc_dev, &hostaddrs[i], &sizes[i],
+					 &kinds[i], async, finalize, pointer);
 	      i += pointer - 1;
 	    }
 	}
diff --git a/libgomp/target.c b/libgomp/target.c
index bb5e1e9..91139a6 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -374,7 +374,7 @@  gomp_map_vars_existing (struct gomp_device_descr *devicep,
   tgt_var->key = oldn;
   tgt_var->copy_from = GOMP_MAP_COPY_FROM_P (kind);
   tgt_var->always_copy_from = GOMP_MAP_ALWAYS_FROM_P (kind);
-  tgt_var->do_detach = false;
+  tgt_var->do_detach = kind == GOMP_MAP_ATTACH;
   tgt_var->offset = newn->host_start - oldn->host_start;
   tgt_var->length = newn->host_end - newn->host_start;
 
@@ -841,8 +841,9 @@  gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
 attribute_hidden struct target_mem_desc *
 gomp_map_vars_async (struct gomp_device_descr *devicep,
 		     struct goacc_asyncqueue *aq, size_t mapnum,
-		     void **hostaddrs, void **devaddrs, size_t *sizes, void *kinds,
-		     bool short_mapkind, enum gomp_map_vars_kind pragma_kind)
+		     void **hostaddrs, void **devaddrs, size_t *sizes,
+		     void *kinds, bool short_mapkind,
+		     enum gomp_map_vars_kind pragma_kind)
 {
   size_t i, tgt_align, tgt_size, not_found_cnt = 0;
   bool has_firstprivate = false;
@@ -873,7 +874,8 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
   tgt = gomp_malloc (sizeof (*tgt)
 		     + sizeof (tgt->list[0]) * (mapnum + da_data_row_num));
   tgt->list_count = mapnum + da_data_row_num;
-  tgt->refcount = pragma_kind == GOMP_MAP_VARS_ENTER_DATA ? 0 : 1;
+  tgt->refcount = (pragma_kind == GOMP_MAP_VARS_ENTER_DATA
+		   || pragma_kind == GOMP_MAP_VARS_OPENACC_ENTER_DATA) ? 0 : 1;
   tgt->device_descr = devicep;
   struct gomp_coalesce_buf cbuf, *cbufp = NULL;
 
@@ -1307,6 +1309,10 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
 	      {
 		tgt->list[i].key = &array->key;
 		tgt->list[i].key->tgt = tgt;
+		tgt->list[i].key->refcount = REFCOUNT_INFINITY;
+		tgt->list[i].key->virtual_refcount = 0;
+		tgt->list[i].key->attach_count = NULL;
+		tgt->list[i].key->link_key = NULL;
 		array++;
 		continue;
 	      }
@@ -1356,7 +1362,7 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
 		tgt->list[i].offset = 0;
 		tgt->list[i].length = k->host_end - k->host_start;
 		k->refcount = 1;
-		k->dynamic_refcount = 0;
+		k->virtual_refcount = 0;
 		k->attach_count = NULL;
 		tgt->refcount++;
 		array->left = NULL;
@@ -1528,7 +1534,7 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
 
 		  k->tgt = tgt;
 		  k->refcount = 1;
-		  k->dynamic_refcount = 0;
+		  k->virtual_refcount = 0;
 		  k->attach_count = NULL;
 		  k->link_key = NULL;
 		  tgt_size = (tgt_size + align - 1) & ~(align - 1);
@@ -1611,8 +1617,20 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
   /* If the variable from "omp target enter data" map-list was already mapped,
      tgt is not needed.  Otherwise tgt will be freed by gomp_unmap_vars or
      gomp_exit_data.  */
-  if (pragma_kind == GOMP_MAP_VARS_ENTER_DATA && tgt->refcount == 0)
-    {
+  if ((pragma_kind == GOMP_MAP_VARS_ENTER_DATA
+       || pragma_kind == GOMP_MAP_VARS_OPENACC_ENTER_DATA)
+      && tgt->refcount == 0)
+    {
+      /* If we're about to discard a target_mem_desc with no "structural"
+	 references (tgt->refcount == 0), any splay keys linked in the tgt's
+	 list must have their virtual refcount incremented to represent that
+	 "lost" reference in order to implement the semantics of the OpenACC
+	 "present increment" operation properly.  */
+      if (pragma_kind == GOMP_MAP_VARS_OPENACC_ENTER_DATA)
+	for (i = 0; i < tgt->list_count; i++)
+	  if (tgt->list[i].key)
+	    tgt->list[i].key->virtual_refcount++;
+
       free (tgt);
       tgt = NULL;
     }
@@ -1628,8 +1646,6 @@  gomp_unmap_tgt (struct target_mem_desc *tgt)
   if (tgt->tgt_end)
     gomp_free_device_memory (tgt->device_descr, tgt->to_free);
 
-  gomp_acc_data_env_remove_tgt (&tgt->device_descr->openacc.data_environ, tgt);
-
   free (tgt->array);
   free (tgt);
 }
@@ -1641,6 +1657,8 @@  gomp_remove_var (struct gomp_device_descr *devicep, splay_tree_key k)
   splay_tree_remove (&devicep->mem_map, k);
   if (k->link_key)
     splay_tree_insert (&devicep->mem_map, (splay_tree_node) k->link_key);
+  if (k->attach_count)
+    free (k->attach_count);
   if (k->tgt->refcount > 1)
     k->tgt->refcount--;
   else
@@ -1648,8 +1666,6 @@  gomp_remove_var (struct gomp_device_descr *devicep, splay_tree_key k)
       is_tgt_unmapped = true;
       gomp_unmap_tgt (k->tgt);
     }
-  if (k->attach_count)
-    free (k->attach_count);
   return is_tgt_unmapped;
 }
 
@@ -1706,7 +1722,14 @@  gomp_unmap_vars_async (struct target_mem_desc *tgt, bool do_copyfrom,
 	continue;
 
       bool do_unmap = false;
-      if (k->refcount > 1 && k->refcount != REFCOUNT_INFINITY)
+      if (k->tgt == tgt
+	  && k->virtual_refcount > 0
+	  && k->refcount != REFCOUNT_INFINITY)
+        {
+	  k->virtual_refcount--;
+	  k->refcount--;
+	}
+      else if (k->refcount > 1 && k->refcount != REFCOUNT_INFINITY)
 	k->refcount--;
       else if (k->refcount == 1)
 	{
@@ -1830,17 +1853,14 @@  gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
     }
 
   /* Insert host-target address mapping into splay tree.  */
-  struct target_mem_desc *tgt =
-	  gomp_malloc (sizeof (*tgt)
-		       + sizeof (tgt->list[0])
-		       * (num_funcs + num_vars) * sizeof (*tgt->array));
+  struct target_mem_desc *tgt = gomp_malloc (sizeof (*tgt));
   tgt->array = gomp_malloc ((num_funcs + num_vars) * sizeof (*tgt->array));
   tgt->refcount = REFCOUNT_INFINITY;
   tgt->tgt_start = 0;
   tgt->tgt_end = 0;
   tgt->to_free = NULL;
   tgt->prev = NULL;
-  tgt->list_count = num_funcs + num_vars;
+  tgt->list_count = 0;
   tgt->device_descr = devicep;
   splay_tree_node array = tgt->array;
 
@@ -1852,10 +1872,9 @@  gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
       k->tgt = tgt;
       k->tgt_offset = target_table[i].start;
       k->refcount = REFCOUNT_INFINITY;
+      k->virtual_refcount = 0;
       k->attach_count = NULL;
       k->link_key = NULL;
-      tgt->list[i].key = k;
-      tgt->refcount++;
       array->left = NULL;
       array->right = NULL;
       splay_tree_insert (&devicep->mem_map, array);
@@ -1887,10 +1906,9 @@  gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
       k->tgt = tgt;
       k->tgt_offset = target_var->start;
       k->refcount = target_size & link_bit ? REFCOUNT_LINK : REFCOUNT_INFINITY;
+      k->virtual_refcount = 0;
       k->attach_count = NULL;
       k->link_key = NULL;
-      tgt->list[i].key = k;
-      tgt->refcount++;
       array->left = NULL;
       array->right = NULL;
       splay_tree_insert (&devicep->mem_map, array);
@@ -3604,7 +3622,6 @@  gomp_target_init (void)
 		current_device.type = current_device.get_type_func ();
 		current_device.mem_map.root = NULL;
 		current_device.state = GOMP_DEVICE_UNINITIALIZED;
-		current_device.openacc.data_environ = NULL;
 
 		/* Augment DEVICES and NUM_DEVICES.  */
 		devices = gomp_realloc (devices,
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-7.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-7.c
index 3a970a0..a59047a 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-7.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-7.c
@@ -20,16 +20,19 @@  main ()
 
   for (k = 0; k < 16; k++)
     {
+      /* Here, we do not explicitly copy the enclosing structure, but work
+	 with fields directly.  Make sure attachment counters and reference
+	 counters work properly in that case.  */
 #pragma acc enter data copyin(v.a, v.b[0:n])
-
+#pragma acc enter data pcopyin(v.b[0:n])
 #pragma acc enter data pcopyin(v.b[0:n])
 
-#pragma acc parallel loop attach(v.b)
+#pragma acc parallel loop present(v.a, v.b)
       for (i = 0; i < n; i++)
 	v.b[i] = v.a + i;
 
-#pragma acc exit data copyout(v.b[:n])
-#pragma acc exit data delete(v) finalize
+#pragma acc exit data copyout(v.b[:n]) finalize
+#pragma acc exit data delete(v.a)
 
       for (i = 0; i < n; i++)
 	assert (v.b[i] == v.a + i);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-8.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-8.c
index 54f553b..0ca5990 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-8.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-8.c
@@ -35,6 +35,7 @@  main ()
 #pragma acc exit data copyout(v.b[:n])
 #pragma acc exit data copyout(v.c[:n])
 #pragma acc exit data copyout(v.d[:n])
+#pragma acc exit data copyout(v.a)
 
       for (i = 0; i < n; i++)
 	assert (v.b[i] == v.a + i);