diff mbox series

[og7] Backport libgomp gomp_copy_host2dev coalesce optimization from trunk

Message ID b8776164-731e-23cd-8a6a-909347251ed0@codesourcery.com
State New
Headers show
Series [og7] Backport libgomp gomp_copy_host2dev coalesce optimization from trunk | expand

Commit Message

Cesar Philippidis May 7, 2018, 3:53 p.m. UTC
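This patch backports Jakub's gomp_copy_host2dev coalescing optimization from
<https://gcc.gnu.org/ml/gcc-patches/2017-10/msg01800.html>. A couple of
adjustments were required for the new async infrastructure in og7: the
mapping work is done in gomp_map_vars_async, and gomp_copy_host2dev already
takes a goacc_asyncqueue argument, so the new gomp_coalesce_buf argument is
added after the existing ones and the oacc-mem.c callers are updated to
pass NULL.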

I've applied this patch to og7.

Cesar
diff mbox series

Patch

2018-05-07  Thomas Schwinge <thomas@codesourcery.com>
	    Cesar Philippidis  <cesar@codesourcery.com>

	libgomp/
	* libgomp.h (gomp_coalesce_buf): Declare.
	(gomp_copy_host2dev): Add gomp_coalesce_buf argument.
	* oacc-mem.c (memcpy_tofrom_device): Update call to gomp_copy_host2dev.
	(update_dev_host): Likewise.
	* target.c (gomp_map_vars_async): Coalesce host2dev args.

	Backport from trunk:
	2017-10-28  Jakub Jelinek  <jakub@redhat.com>

	* target.c (struct gomp_coalesce_buf): New type.
	(MAX_COALESCE_BUF_SIZE, MAX_COALESCE_BUF_GAP): Define.
	(gomp_coalesce_buf_add, gomp_to_device_kind_p): New functions.
	(gomp_copy_host2dev): Add CBUF argument, if copying into
	the cached ranges, memcpy into buffer instead of copying
	into device.
	(gomp_map_vars_existing, gomp_map_pointer, gomp_map_fields_existing):
	Add CBUF argument, pass it through to other calls.
	(gomp_map_vars): Aggregate copies from host to device if small enough
	and with small enough gaps in between into memcpy into a buffer and
	fewer host to device copies from the buffer.
	(gomp_update): Adjust gomp_copy_host2dev caller.


diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index a31c83cc656..6ece45cf95c 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -1000,9 +1000,11 @@  extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *, int);
 extern void gomp_acc_remove_pointer (void *, size_t, bool, int, int, int);
 extern void gomp_acc_declare_allocate (bool, size_t, void **, size_t *,
 				       unsigned short *);
+struct gomp_coalesce_buf;
 extern void gomp_copy_host2dev (struct gomp_device_descr *,
 				struct goacc_asyncqueue *,
-				void *, const void *, size_t);
+				void *, const void *, size_t,
+				struct gomp_coalesce_buf *);
 extern void gomp_copy_dev2host (struct gomp_device_descr *,
 				struct goacc_asyncqueue *,
 				void *, const void *, size_t);
diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c
index d749491cbf5..e72bd35fc9d 100644
--- a/libgomp/oacc-mem.c
+++ b/libgomp/oacc-mem.c
@@ -228,7 +228,7 @@  memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
   if (from)
     gomp_copy_dev2host (thr->dev, aq, h, d, s);
   else
-    gomp_copy_host2dev (thr->dev, aq, d, h, s);
+    gomp_copy_host2dev (thr->dev, aq, d, h, s, NULL);
 
  out:
   if (profiling_setup_p)
@@ -893,7 +893,7 @@  update_dev_host (int is_dev, void *h, size_t s, int async)
   goacc_aq aq = get_goacc_asyncqueue (async);
 
   if (is_dev)
-    gomp_copy_host2dev (acc_dev, aq, d, h, s);
+    gomp_copy_host2dev (acc_dev, aq, d, h, s, NULL);
   else
     gomp_copy_dev2host (acc_dev, aq, h, d, s);
 
diff --git a/libgomp/target.c b/libgomp/target.c
index 10c5e34f378..aa27dc85894 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -203,11 +203,123 @@  goacc_device_copy_async (struct gomp_device_descr *devicep,
     }
 }
 
+/* Infrastructure for coalescing adjacent or nearly adjacent (in device addresses)
+   host to device memory transfers.  */
+
+struct gomp_coalesce_buf
+{
+  /* Buffer into which gomp_copy_host2dev will memcpy data and from which
+     it will be copied to the device.  */
+  void *buf;
+  struct target_mem_desc *tgt;
+  /* Array with offsets, chunks[2 * i] is the starting offset and
+     chunks[2 * i + 1] ending offset relative to tgt->tgt_start device address
+     of chunks which are to be copied to buf and later copied to device.  */
+  size_t *chunks;
+  /* Number of chunks in chunks array, or -1 if coalesce buffering should not
+     be performed.  */
+  long chunk_cnt;
+  /* During construction of chunks array, how many memory regions are within
+     the last chunk.  If there is just one memory region for a chunk, we copy
+     it directly to device rather than going through buf.  */
+  long use_cnt;
+};
+
+/* Maximum size of memory region considered for coalescing.  Larger copies
+   are performed directly.  */
+#define MAX_COALESCE_BUF_SIZE	(32 * 1024)
+
+/* Maximum size of a gap in between regions to consider them being copied
+   within the same chunk.  All the device offsets considered are within
+   newly allocated device memory, so it isn't fatal if we copy some padding
+   in between from host to device.  The gaps come either from alignment
+   padding or from memory regions which are not supposed to be copied from
+   host to device (e.g. map(alloc:), map(from:) etc.).  */
+#define MAX_COALESCE_BUF_GAP	(4 * 1024)
+
+/* Add region with device tgt_start relative offset and length to CBUF.  */
+
+static inline void
+gomp_coalesce_buf_add (struct gomp_coalesce_buf *cbuf, size_t start, size_t len)
+{
+  if (len > MAX_COALESCE_BUF_SIZE || len == 0)
+    return;
+  if (cbuf->chunk_cnt)
+    {
+      if (cbuf->chunk_cnt < 0)
+	return;
+      if (start < cbuf->chunks[2 * cbuf->chunk_cnt - 1])
+	{
+	  cbuf->chunk_cnt = -1;
+	  return;
+	}
+      if (start < cbuf->chunks[2 * cbuf->chunk_cnt - 1] + MAX_COALESCE_BUF_GAP)
+	{
+	  cbuf->chunks[2 * cbuf->chunk_cnt - 1] = start + len;
+	  cbuf->use_cnt++;
+	  return;
+	}
+      /* If the last chunk is only used by one mapping, discard it,
+	 as it will be one host to device copy anyway and
+	 memcpying it around will only waste cycles.  */
+      if (cbuf->use_cnt == 1)
+	cbuf->chunk_cnt--;
+    }
+  cbuf->chunks[2 * cbuf->chunk_cnt] = start;
+  cbuf->chunks[2 * cbuf->chunk_cnt + 1] = start + len;
+  cbuf->chunk_cnt++;
+  cbuf->use_cnt = 1;
+}
+
+/* Return true for mapping kinds which need to copy data from the
+   host to device for regions that weren't previously mapped.  */
+
+static inline bool
+gomp_to_device_kind_p (int kind)
+{
+  switch (kind)
+    {
+    case GOMP_MAP_ALLOC:
+    case GOMP_MAP_FROM:
+    case GOMP_MAP_FORCE_ALLOC:
+    case GOMP_MAP_ALWAYS_FROM:
+      return false;
+    default:
+      return true;
+    }
+}
+
 attribute_hidden void
 gomp_copy_host2dev (struct gomp_device_descr *devicep,
 		    struct goacc_asyncqueue *aq,
-		    void *d, const void *h, size_t sz)
+		    void *d, const void *h, size_t sz,
+		    struct gomp_coalesce_buf *cbuf)
 {
+  if (cbuf)
+    {
+      uintptr_t doff = (uintptr_t) d - cbuf->tgt->tgt_start;
+      if (doff < cbuf->chunks[2 * cbuf->chunk_cnt - 1])
+	{
+	  long first = 0;
+	  long last = cbuf->chunk_cnt - 1;
+	  while (first <= last)
+	    {
+	      long middle = (first + last) >> 1;
+	      if (cbuf->chunks[2 * middle + 1] <= doff)
+		first = middle + 1;
+	      else if (cbuf->chunks[2 * middle] <= doff)
+		{
+		  if (doff + sz > cbuf->chunks[2 * middle + 1])
+		    gomp_fatal ("internal libgomp cbuf error");
+		  memcpy ((char *) cbuf->buf + (doff - cbuf->chunks[0]),
+			  h, sz);
+		  return;
+		}
+	      else
+		last = middle - 1;
+	    }
+	}
+    }
   if (aq)
     goacc_device_copy_async (devicep, devicep->openacc.async.host2dev_func,
 			     "dev", d, "host", h, sz, aq);
@@ -245,7 +357,7 @@  static inline void
 gomp_map_vars_existing (struct gomp_device_descr *devicep,
 			struct goacc_asyncqueue *aq, splay_tree_key oldn,
 			splay_tree_key newn, struct target_var_desc *tgt_var,
-			unsigned char kind)
+			unsigned char kind, struct gomp_coalesce_buf *cbuf)
 {
   tgt_var->key = oldn;
   tgt_var->copy_from = GOMP_MAP_COPY_FROM_P (kind);
@@ -269,7 +381,7 @@  gomp_map_vars_existing (struct gomp_device_descr *devicep,
 			(void *) (oldn->tgt->tgt_start + oldn->tgt_offset
 				  + newn->host_start - oldn->host_start),
 			(void *) newn->host_start,
-			newn->host_end - newn->host_start);
+			newn->host_end - newn->host_start, cbuf);
 
   if (oldn->refcount != REFCOUNT_INFINITY)
     oldn->refcount++;
@@ -284,7 +396,8 @@  get_kind (bool short_mapkind, void *kinds, int idx)
 
 static void
 gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
-		  uintptr_t host_ptr, uintptr_t target_offset, uintptr_t bias)
+		  uintptr_t host_ptr, uintptr_t target_offset, uintptr_t bias,
+		  struct gomp_coalesce_buf *cbuf)
 {
   struct gomp_device_descr *devicep = tgt->device_descr;
   struct splay_tree_s *mem_map = &devicep->mem_map;
@@ -294,11 +407,10 @@  gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
   if (cur_node.host_start == (uintptr_t) NULL)
     {
       cur_node.tgt_offset = (uintptr_t) NULL;
-      /* FIXME: see comment about coalescing host/dev transfers below.  */
       gomp_copy_host2dev (devicep, aq,
 			  (void *) (tgt->tgt_start + target_offset),
 			  (void *) &cur_node.tgt_offset,
-			  sizeof (void *));
+			  sizeof (void *), cbuf);
       return;
     }
   /* Add bias to the pointer value.  */
@@ -317,9 +429,8 @@  gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
      array section.  Now subtract bias to get what we want
      to initialize the pointer with.  */
   cur_node.tgt_offset -= bias;
-  /* FIXME: see comment about coalescing host/dev transfers below.  */
   gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + target_offset),
-		      (void *) &cur_node.tgt_offset, sizeof (void *));
+		      (void *) &cur_node.tgt_offset, sizeof (void *), cbuf);
 }
 
 static uintptr_t
@@ -358,7 +469,8 @@  gomp_map_pset (struct target_mem_desc *tgt, uintptr_t host_ptr,
 static void
 gomp_map_fields_existing (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
 			  splay_tree_key n, size_t first, size_t i,
-			  void **hostaddrs, size_t *sizes, void *kinds)
+			  void **hostaddrs, size_t *sizes, void *kinds,
+			  struct gomp_coalesce_buf *cbuf)
 {
   struct gomp_device_descr *devicep = tgt->device_descr;
   struct splay_tree_s *mem_map = &devicep->mem_map;
@@ -376,7 +488,7 @@  gomp_map_fields_existing (struct target_mem_desc *tgt, struct goacc_asyncqueue *
       && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
     {
       gomp_map_vars_existing (devicep, aq, n2, &cur_node,
-			      &tgt->list[i], kind & typemask);
+			      &tgt->list[i], kind & typemask, cbuf);
       return;
     }
   if (sizes[i] == 0)
@@ -392,7 +504,7 @@  gomp_map_fields_existing (struct target_mem_desc *tgt, struct goacc_asyncqueue *
 		 == n2->tgt_offset - n->tgt_offset)
 	    {
 	      gomp_map_vars_existing (devicep, aq, n2, &cur_node, &tgt->list[i],
-				      kind & typemask);
+				      kind & typemask, cbuf);
 	      return;
 	    }
 	}
@@ -404,7 +516,7 @@  gomp_map_fields_existing (struct target_mem_desc *tgt, struct goacc_asyncqueue *
 	  && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
 	{
 	  gomp_map_vars_existing (devicep, aq, n2, &cur_node, &tgt->list[i],
-				  kind & typemask);
+				  kind & typemask, cbuf);
 	  return;
 	}
     }
@@ -617,6 +729,7 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
   tgt->list_count = mapnum + da_data_row_num;
   tgt->refcount = pragma_kind == GOMP_MAP_VARS_ENTER_DATA ? 0 : 1;
   tgt->device_descr = devicep;
+  struct gomp_coalesce_buf cbuf, *cbufp = NULL;
 
   if (mapnum == 0)
     {
@@ -635,11 +748,25 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
 
   tgt_align = sizeof (void *);
   tgt_size = 0;
+  cbuf.chunks = NULL;
+  cbuf.chunk_cnt = -1;
+  cbuf.use_cnt = 0;
+  cbuf.buf = NULL;
+  if (mapnum > 1 || pragma_kind == GOMP_MAP_VARS_TARGET)
+    {
+      cbuf.chunks
+	= (size_t *) gomp_alloca ((2 * mapnum + 2) * sizeof (size_t));
+      cbuf.chunk_cnt = 0;
+    }
   if (pragma_kind == GOMP_MAP_VARS_TARGET)
     {
       size_t align = 4 * sizeof (void *);
       tgt_align = align;
       tgt_size = mapnum * sizeof (void *);
+      cbuf.chunk_cnt = 1;
+      cbuf.use_cnt = 1 + (mapnum > 1);
+      cbuf.chunks[0] = 0;
+      cbuf.chunks[1] = tgt_size;
     }
 
   gomp_mutex_lock (&devicep->lock);
@@ -693,19 +820,26 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
 	      size_t align = (size_t) 1 << (kind >> rshift);
 	      if (tgt_align < align)
 		tgt_align = align;
-	      tgt_size -= (uintptr_t) hostaddrs[first]
-			  - (uintptr_t) hostaddrs[i];
+	      tgt_size -= (uintptr_t) hostaddrs[first] - cur_node.host_start;
 	      tgt_size = (tgt_size + align - 1) & ~(align - 1);
-	      tgt_size += cur_node.host_end - (uintptr_t) hostaddrs[i];
+	      tgt_size += cur_node.host_end - cur_node.host_start;
 	      not_found_cnt += last - i;
 	      for (i = first; i <= last; i++)
-		tgt->list[i].key = NULL;
+		{
+		  tgt->list[i].key = NULL;
+		  if (gomp_to_device_kind_p (get_kind (short_mapkind, kinds, i)
+					     & typemask))
+		    gomp_coalesce_buf_add (&cbuf,
+					   tgt_size - cur_node.host_end
+					   + (uintptr_t) hostaddrs[i],
+					   sizes[i]);
+		}
 	      i--;
 	      continue;
 	    }
 	  for (i = first; i <= last; i++)
 	    gomp_map_fields_existing (tgt, aq, n, first, i, hostaddrs,
-				      sizes, kinds);
+				      sizes, kinds, NULL);
 	  i--;
 	  continue;
 	}
@@ -743,6 +877,8 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
 	  if (tgt_align < align)
 	    tgt_align = align;
 	  tgt_size = (tgt_size + align - 1) & ~(align - 1);
+	  gomp_coalesce_buf_add (&cbuf, tgt_size,
+				 cur_node.host_end - cur_node.host_start);
 	  tgt_size += cur_node.host_end - cur_node.host_start;
 	  has_firstprivate = true;
 	  continue;
@@ -762,7 +898,7 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
 	n = splay_tree_lookup (mem_map, &cur_node);
       if (n && n->refcount != REFCOUNT_LINK)
 	gomp_map_vars_existing (devicep, aq, n, &cur_node, &tgt->list[i],
-				kind & typemask);
+				kind & typemask, NULL);
       else
 	{
 	  tgt->list[i].key = NULL;
@@ -772,6 +908,9 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
 	  if (tgt_align < align)
 	    tgt_align = align;
 	  tgt_size = (tgt_size + align - 1) & ~(align - 1);
+	  if (gomp_to_device_kind_p (kind & typemask))
+	    gomp_coalesce_buf_add (&cbuf, tgt_size,
+				   cur_node.host_end - cur_node.host_start);
 	  tgt_size += cur_node.host_end - cur_node.host_start;
 	  if ((kind & typemask) == GOMP_MAP_TO_PSET)
 	    {
@@ -830,7 +969,7 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
 	    {
 	      assert (n->refcount != REFCOUNT_LINK);
 	      gomp_map_vars_existing (devicep, aq, n, &cur_node, row_desc,
-				      kind & typemask);	      
+				      kind & typemask, NULL);
 	    }
 	  else
 	    {
@@ -869,6 +1008,19 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
       tgt->tgt_start = (uintptr_t) tgt->to_free;
       tgt->tgt_start = (tgt->tgt_start + tgt_align - 1) & ~(tgt_align - 1);
       tgt->tgt_end = tgt->tgt_start + tgt_size;
+
+      if (cbuf.use_cnt == 1)
+	cbuf.chunk_cnt--;
+      if (cbuf.chunk_cnt > 0)
+	{
+	  cbuf.buf
+	    = malloc (cbuf.chunks[2 * cbuf.chunk_cnt - 1] - cbuf.chunks[0]);
+	  if (cbuf.buf)
+	    {
+	      cbuf.tgt = tgt;
+	      cbufp = &cbuf;
+	    }
+	}
     }
   else
     {
@@ -907,7 +1059,7 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
 		len = sizes[i];
 		gomp_copy_host2dev (devicep, aq,
 				    (void *) (tgt->tgt_start + tgt_size),
-				    (void *) hostaddrs[i], len);
+				    (void *) hostaddrs[i], len, cbufp);
 		tgt_size += len;
 		continue;
 	      case GOMP_MAP_FIRSTPRIVATE_INT:
@@ -940,7 +1092,7 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
 		  }
 		for (i = first; i <= last; i++)
 		  gomp_map_fields_existing (tgt, aq, n, first, i, hostaddrs,
-					    sizes, kinds);
+					    sizes, kinds, cbufp);
 		i--;
 		continue;
 	      case GOMP_MAP_ALWAYS_POINTER:
@@ -965,7 +1117,7 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
 					      + cur_node.host_start
 					      - n->host_start),
 				    (void *) &cur_node.tgt_offset,
-				    sizeof (void *));
+				    sizeof (void *), cbufp);
 		cur_node.tgt_offset = n->tgt->tgt_start + n->tgt_offset
 				      + cur_node.host_start - n->host_start;
 		continue;
@@ -990,7 +1142,7 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
 	    splay_tree_key n = splay_tree_lookup (mem_map, k);
 	    if (n && n->refcount != REFCOUNT_LINK)
 	      gomp_map_vars_existing (devicep, aq, n, k, &tgt->list[i],
-				      kind & typemask);
+				      kind & typemask, cbufp);
 	    else
 	      {
 		k->link_key = NULL;
@@ -1042,22 +1194,18 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
 		  case GOMP_MAP_FORCE_TOFROM:
 		  case GOMP_MAP_ALWAYS_TO:
 		  case GOMP_MAP_ALWAYS_TOFROM:
-		    /* FIXME: Perhaps add some smarts, like if copying
-		       several adjacent fields from host to target, use some
-		       host buffer to avoid sending each var individually.  */
 		    gomp_copy_host2dev (devicep, aq,
 					(void *) (tgt->tgt_start
 						  + k->tgt_offset),
 					(void *) k->host_start,
-					k->host_end - k->host_start);
+					k->host_end - k->host_start, cbufp);
 		    break;
 		  case GOMP_MAP_POINTER:
 		    gomp_map_pointer (tgt, aq,
 				      (uintptr_t) *(void **) k->host_start,
-				      k->tgt_offset, sizes[i]);
+				      k->tgt_offset, sizes[i], cbufp);
 		    break;
 		  case GOMP_MAP_TO_PSET:
-		    /* FIXME: see above FIXME comment.  */
 		    {
 		      bool found_pointer = false;
 		      for (j = i + 1; j < mapnum; j++)
@@ -1086,7 +1234,8 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
 						(void *) (tgt->tgt_start
 							  + k->tgt_offset),
 						(void *) k->host_start,
-						k->host_end - k->host_start);
+						k->host_end - k->host_start,
+						cbufp);
 			    *(uintptr_t *) hostaddrs[i] = tptr;
 			    i++;
 			    found_pointer = true;
@@ -1096,7 +1245,8 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
 					    (void *) (tgt->tgt_start
 						      + k->tgt_offset),
 					    (void *) k->host_start,
-					    k->host_end - k->host_start);
+					    k->host_end - k->host_start,
+					    cbufp);
 		    }
 		    break;
 		  case GOMP_MAP_FORCE_PRESENT:
@@ -1123,7 +1273,7 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
 					(void *) (tgt->tgt_start
 						  + k->tgt_offset),
 					(void *) k->host_start,
-					sizeof (void *));
+					sizeof (void *), cbufp);
 		    break;
 		  default:
 		    gomp_mutex_unlock (&devicep->lock);
@@ -1137,7 +1287,7 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
 		       mapped object.  */
 		    void *tgt_addr = (void *) (tgt->tgt_start + k->tgt_offset);
 		    gomp_copy_host2dev (devicep, aq, (void *) n->tgt_offset,
-					&tgt_addr, sizeof (void *));
+					&tgt_addr, sizeof (void *), cbufp);
 		  }
 		array++;
 	      }
@@ -1182,7 +1332,7 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
 		{
 		  assert (n->refcount != REFCOUNT_LINK);
 		  gomp_map_vars_existing (devicep, aq, n, &cur_node, row_desc,
-					  kind & typemask);
+					  kind & typemask, cbufp);
 		  target_row_addr = n->tgt->tgt_start + n->tgt_offset;
 		}
 	      else
@@ -1217,7 +1367,7 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
 		    gomp_copy_host2dev (devicep, aq,
 					(void *) tgt->tgt_start + k->tgt_offset,
 					(void *) k->host_start,
-					da->data_row_size);
+					da->data_row_size, cbufp);
 		  array++;
 		}
 	      target_data_rows[row_start + j] = (void *) target_row_addr;
@@ -1231,7 +1381,7 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
 	      void *ptrblock = gomp_dynamic_array_create_ptrblock
 		(da, target_ptrblock, target_data_rows + row_start);
 	      gomp_copy_host2dev (devicep, aq, target_ptrblock, ptrblock,
-				  da->ptrblock_size);
+				  da->ptrblock_size, cbufp);
 	      /* Freeing of the ptrblock must be scheduled after the host2dev
 		 copy completes.  */
 	      goacc_async_free (devicep, aq, ptrblock);
@@ -1253,13 +1403,24 @@  gomp_map_vars_async (struct gomp_device_descr *devicep,
       for (i = 0; i < mapnum; i++)
 	{
 	  cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
-	  /* FIXME: see above FIXME comment.  */
 	  gomp_copy_host2dev (devicep, aq,
 			      (void *) (tgt->tgt_start + i * sizeof (void *)),
-			      (void *) &cur_node.tgt_offset, sizeof (void *));
+			      (void *) &cur_node.tgt_offset, sizeof (void *),
+			      cbufp);
 	}
     }
 
+  if (cbufp)
+    {
+      /* Flush the staged chunks; free BUF only once the copies are done.  */
+      for (long c = 0; c < cbuf.chunk_cnt; ++c)
+	gomp_copy_host2dev (devicep, aq,
+			    (void *) (tgt->tgt_start + cbuf.chunks[2 * c]),
+			    (char *) cbuf.buf + (cbuf.chunks[2 * c] - cbuf.chunks[0]),
+			    cbuf.chunks[2 * c + 1] - cbuf.chunks[2 * c], NULL);
+      goacc_async_free (devicep, aq, cbuf.buf);
+    }
+
   /* If the variable from "omp target enter data" map-list was already mapped,
      tgt is not needed.  Otherwise tgt will be freed by gomp_unmap_vars or
      gomp_exit_data.  */
@@ -1416,7 +1577,8 @@  gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs,
 	    size_t size = cur_node.host_end - cur_node.host_start;
 
 	    if (GOMP_MAP_COPY_TO_P (kind & typemask))
-	      gomp_copy_host2dev (devicep, NULL, devaddr, hostaddr, size);
+	      gomp_copy_host2dev (devicep, NULL, devaddr, hostaddr, size,
+				  NULL);
 	    if (GOMP_MAP_COPY_FROM_P (kind & typemask))
 	      gomp_copy_dev2host (devicep, NULL, hostaddr, devaddr, size);
 	  }