
[3/3,og13] OpenMP: Support accelerated 2D/3D memory copies for AMD GCN

Message ID 56a30c3c201c87bc8e59bac048afc9c911be32b0.1695207771.git.julian@codesourcery.com
State New
Series OpenMP: Accelerated 2D/3D host<->target memory copies

Commit Message

Julian Brown Sept. 20, 2023, 11:14 a.m. UTC
This patch adds support for 2D/3D memory copies for omp_target_memcpy_rect
and "target update", using AMD extensions to the HSA API.  This follows
Tobias's similar patch for NVPTX (already present on mainline and higher
up this patch series for og13).
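
For context, a call that this new path accelerates looks something like the
sketch below (the array shape, variable names and device numbers are
illustrative only, not taken from the patch; on GCN the transfer goes through
the new GOMP_OFFLOAD_memcpy2d hook when the hsa_amd_* entry points are
available, and falls back to the existing element-wise copy otherwise):

#include <omp.h>
#include <stdlib.h>

#define ROWS 10
#define COLS 16

int
main (void)
{
  int dev = omp_get_default_device ();
  int host = omp_get_initial_device ();
  double (*h)[COLS] = malloc (sizeof (double[ROWS][COLS]));  /* host array */
  double *d = omp_target_alloc (sizeof (double[ROWS][COLS]), dev);

  /* ... initialize h ... */

  /* Copy a 4x8 sub-rectangle starting at row 2, column 3 of the host array
     to the same position in the device array.  */
  size_t volume[2]      = { 4, 8 };
  size_t dst_offsets[2] = { 2, 3 };
  size_t src_offsets[2] = { 2, 3 };
  size_t dst_dims[2]    = { ROWS, COLS };
  size_t src_dims[2]    = { ROWS, COLS };

  int err = omp_target_memcpy_rect (d, h, sizeof (double), 2, volume,
                                    dst_offsets, src_offsets, dst_dims,
                                    src_dims, dev, host);

  omp_target_free (d, dev);
  free (h);
  return err;
}

With this patch the plugin pins just the host region spanning the requested
rectangle and issues a single hsa_amd_memory_async_copy_rect call, rather than
transferring the rectangle row by row.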

Support is also added for 1-dimensional strided accesses: these are
treated as a special case of 2-dimensional transfers, where the whole-array
innermost dimension (the row pitch) is taken to be the stride length in
bytes and a single element is copied from each row.
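
Concretely, the mapping used by the target.c hunk below amounts to something
like this (an illustrative fragment as it would read inside libgomp; the
helper name is made up, but the argument order follows the
GOMP_OFFLOAD_memcpy2d interface added by this patch):

/* Copy VOLUME elements of ELEMENT_SIZE bytes each, spaced STRIDE bytes apart,
   by describing them as a 2-D rectangle: VOLUME rows of ELEMENT_SIZE bytes,
   taken from arrays whose row pitch is STRIDE bytes.  DST and SRC already
   point at the first element to be transferred, so all offsets are zero.  */

static int
strided_1d_as_2d (struct gomp_device_descr *devp, int dst_id, int src_id,
                  void *dst, const void *src, size_t element_size,
                  size_t volume, size_t stride)
{
  return devp->memcpy2d_func (dst_id, src_id,
                              element_size,       /* dim1_size: bytes per row.  */
                              volume,             /* dim0_len: number of rows.  */
                              dst, 0, 0, stride,  /* dst offsets and pitch.  */
                              src, 0, 0, stride); /* src offsets and pitch.  */
}

A return value of -1 from the hook means the plugin could not (or chose not
to) perform the transfer, e.g. because of odd alignment or missing hsa_amd_*
entry points, and the caller then falls back to the existing element-wise
loop.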

2023-09-19  Julian Brown  <julian@codesourcery.com>

libgomp/
	* plugin/plugin-gcn.c (hsa_runtime_fn_info): Add hsa_amd_memory_lock_fn,
	hsa_amd_memory_unlock_fn, hsa_amd_memory_async_copy_rect_fn function
	pointers.
	(init_hsa_runtime_functions): Add above functions, with DLSYM_OPT_FN.
	(GOMP_OFFLOAD_memcpy2d, GOMP_OFFLOAD_memcpy3d): New functions.
	* target.c (omp_target_memcpy_rect_worker): Add 1D strided transfer
	support.
---
 libgomp/plugin/plugin-gcn.c | 359 ++++++++++++++++++++++++++++++++++++
 libgomp/target.c            |  31 ++++
 2 files changed, 390 insertions(+)

Patch

diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 3df9fbe8d80..b8b95d75214 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -211,6 +211,16 @@  struct hsa_runtime_fn_info
   hsa_status_t (*hsa_amd_svm_attributes_set_fn)
     (void* ptr, size_t size, hsa_amd_svm_attribute_pair_t* attribute_list,
      size_t attribute_count);
+  hsa_status_t (*hsa_amd_memory_lock_fn)
+    (void *host_ptr, size_t size, hsa_agent_t *agents, int num_agent,
+     void **agent_ptr);
+  hsa_status_t (*hsa_amd_memory_unlock_fn) (void *host_ptr);
+  hsa_status_t (*hsa_amd_memory_async_copy_rect_fn)
+    (const hsa_pitched_ptr_t *dst, const hsa_dim3_t *dst_offset,
+     const hsa_pitched_ptr_t *src, const hsa_dim3_t *src_offset,
+     const hsa_dim3_t *range, hsa_agent_t copy_agent,
+     hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
+     const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);
 };
 
 /* Structure describing the run-time and grid properties of an HSA kernel
@@ -1439,6 +1449,9 @@  init_hsa_runtime_functions (void)
   DLSYM_FN (hsa_queue_destroy)
   DLSYM_FN (hsa_code_object_deserialize)
   DLSYM_OPT_FN (hsa_amd_svm_attributes_set)
+  DLSYM_OPT_FN (hsa_amd_memory_lock)
+  DLSYM_OPT_FN (hsa_amd_memory_unlock)
+  DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
   return true;
 #undef DLSYM_FN
 }
@@ -3948,6 +3961,352 @@  GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
   return true;
 }
 
+/* Here <quantity>_size refers to <quantity> multiplied by size -- i.e.
+   measured in bytes.  So we have:
+
+   dim1_size: number of bytes to copy on innermost dimension ("row")
+   dim0_len: number of rows to copy
+   dst: base pointer for destination of copy
+   dst_offset1_size: innermost row offset (for dest), in bytes
+   dst_offset0_len: offset, number of rows (for dest)
+   dst_dim1_size: whole-array dest row length, in bytes (pitch)
+   src: base pointer for source of copy
+   src_offset1_size: innermost row offset (for source), in bytes
+   src_offset0_len: offset, number of rows (for source)
+   src_dim1_size: whole-array source row length, in bytes (pitch)
+*/
+
+int
+GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
+		       size_t dim0_len, void *dst, size_t dst_offset1_size,
+		       size_t dst_offset0_len, size_t dst_dim1_size,
+		       const void *src, size_t src_offset1_size,
+		       size_t src_offset0_len, size_t src_dim1_size)
+{
+  if (!hsa_fns.hsa_amd_memory_lock_fn
+      || !hsa_fns.hsa_amd_memory_unlock_fn
+      || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
+    return -1;
+
+  /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
+     out quietly if we have anything oddly-aligned rather than letting the
+     driver raise an error.  */
+  if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
+    return -1;
+
+  if ((dst_dim1_size & 3) != 0 || (src_dim1_size & 3) != 0)
+    return -1;
+
+  /* Only handle host to device or device to host transfers here.  */
+  if ((dst_ord == -1 && src_ord == -1)
+      || (dst_ord != -1 && src_ord != -1))
+    return -1;
+
+  hsa_amd_copy_direction_t dir
+    = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
+  hsa_agent_t copy_agent;
+
+  /* We need to pin (lock) host memory before we start the transfer.  Try to
+     lock the minimum size necessary, i.e. using partial first/last rows of the
+     whole array.  Something like this:
+
+	 rows -->
+	 ..............
+     c | ..#######+++++ <- first row apart from {src,dst}_offset1_size
+     o | ++#######+++++ <- whole row
+     l | ++#######+++++ <-     "
+     s v ++#######..... <- last row apart from trailing remainder
+	 ..............
+
+     We could split very large transfers into several rectangular copies, but
+     that is unimplemented for now.  */
+
+  size_t bounded_size_host, first_elem_offset_host;
+  void *host_ptr;
+  if (dir == hsaHostToDevice)
+    {
+      bounded_size_host = src_dim1_size * (dim0_len - 1) + dim1_size;
+      first_elem_offset_host = src_offset0_len * src_dim1_size
+			       + src_offset1_size;
+      host_ptr = (void *) src;
+      struct agent_info *agent = get_agent_info (dst_ord);
+      copy_agent = agent->id;
+    }
+  else
+    {
+      bounded_size_host = dst_dim1_size * (dim0_len - 1) + dim1_size;
+      first_elem_offset_host = dst_offset0_len * dst_dim1_size
+			       + dst_offset1_size;
+      host_ptr = dst;
+      struct agent_info *agent = get_agent_info (src_ord);
+      copy_agent = agent->id;
+    }
+
+  void *agent_ptr;
+
+  hsa_status_t status
+    = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
+				      bounded_size_host, NULL, 0, &agent_ptr);
+  /* We can't lock the host memory: don't give up though, we might still be
+     able to use the slow path in our caller.  So, don't make this an
+     error.  */
+  if (status != HSA_STATUS_SUCCESS)
+    return -1;
+
+  hsa_pitched_ptr_t dstpp, srcpp;
+  hsa_dim3_t dst_offsets, src_offsets, ranges;
+
+  int retval = 1;
+
+  hsa_signal_t completion_signal;
+  status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      retval = -1;
+      goto unlock;
+    }
+
+  if (dir == hsaHostToDevice)
+    {
+      srcpp.base = agent_ptr - first_elem_offset_host;
+      dstpp.base = dst;
+    }
+  else
+    {
+      srcpp.base = (void *) src;
+      dstpp.base = agent_ptr - first_elem_offset_host;
+    }
+
+  srcpp.pitch = src_dim1_size;
+  srcpp.slice = 0;
+
+  src_offsets.x = src_offset1_size;
+  src_offsets.y = src_offset0_len;
+  src_offsets.z = 0;
+
+  dstpp.pitch = dst_dim1_size;
+  dstpp.slice = 0;
+
+  dst_offsets.x = dst_offset1_size;
+  dst_offsets.y = dst_offset0_len;
+  dst_offsets.z = 0;
+
+  ranges.x = dim1_size;
+  ranges.y = dim0_len;
+  ranges.z = 1;
+
+  status
+    = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
+						 &src_offsets, &ranges,
+						 copy_agent, dir, 0, NULL,
+						 completion_signal);
+  /* If the rectangular copy fails, we might still be able to use the slow
+     path.  We need to unlock the host memory though, so don't return
+     immediately.  */
+  if (status != HSA_STATUS_SUCCESS)
+    retval = -1;
+  else
+    hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
+					HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
+					HSA_WAIT_STATE_ACTIVE);
+
+  hsa_fns.hsa_signal_destroy_fn (completion_signal);
+
+unlock:
+  status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not unlock host memory", status);
+
+  return retval;
+}
+
+/* As above, <quantity>_size refers to <quantity> multiplied by size -- i.e.
+   measured in bytes.  So we have:
+
+   dim2_size: number of bytes to copy on innermost dimension ("row")
+   dim1_len: number of rows per slice to copy
+   dim0_len: number of slices to copy
+   dst: base pointer for destination of copy
+   dst_offset2_size: innermost row offset (for dest), in bytes
+   dst_offset1_len: offset, number of rows (for dest)
+   dst_offset0_len: offset, number of slices (for dest)
+   dst_dim2_size: whole-array dest row length, in bytes (pitch)
+   dst_dim1_len: whole-array number of rows in slice (for dest)
+   src: base pointer for source of copy
+   src_offset2_size: innermost row offset (for source), in bytes
+   src_offset1_len: offset, number of rows (for source)
+   src_offset0_len: offset, number of slices (for source)
+   src_dim2_size: whole-array source row length, in bytes (pitch)
+   src_dim1_len: whole-array number of rows in slice (for source)
+*/
+
+int
+GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
+		       size_t dim1_len, size_t dim0_len, void *dst,
+		       size_t dst_offset2_size, size_t dst_offset1_len,
+		       size_t dst_offset0_len, size_t dst_dim2_size,
+		       size_t dst_dim1_len, const void *src,
+		       size_t src_offset2_size, size_t src_offset1_len,
+		       size_t src_offset0_len, size_t src_dim2_size,
+		       size_t src_dim1_len)
+{
+  if (!hsa_fns.hsa_amd_memory_lock_fn
+      || !hsa_fns.hsa_amd_memory_unlock_fn
+      || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
+    return -1;
+
+  /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
+     out quietly if we have anything oddly-aligned rather than letting the
+     driver raise an error.  */
+  if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
+    return -1;
+
+  if ((dst_dim2_size & 3) != 0 || (src_dim2_size & 3) != 0)
+    return -1;
+
+  /* Only handle host to device or device to host transfers here.  */
+  if ((dst_ord == -1 && src_ord == -1)
+      || (dst_ord != -1 && src_ord != -1))
+    return -1;
+
+  hsa_amd_copy_direction_t dir
+    = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
+  hsa_agent_t copy_agent;
+
+  /* We need to pin (lock) host memory before we start the transfer.  Try to
+     lock the minimum size necessary, i.e. using partial first/last slices of
+     the whole 3D array.  Something like this:
+
+	 slice 0:          slice 1:          slice 2:
+	     __________        __________        __________
+       ^    /+++++++++/    :  /+++++++++/    :  /         /
+    column /+++##++++/|    | /+++##++++/|    | /+++##    /  # = subarray
+     /	  /   ##++++/ |    |/+++##++++/ |    |/+++##++++/   + = area to pin
+	 /_________/  :    /_________/  :    /_________/
+	row --->
+
+     We could split very large transfers into several rectangular copies, but
+     that is unimplemented for now.  */
+
+  size_t bounded_size_host, first_elem_offset_host;
+  void *host_ptr;
+  if (dir == hsaHostToDevice)
+    {
+      size_t slice_bytes = src_dim2_size * src_dim1_len;
+      bounded_size_host = slice_bytes * (dim0_len - 1)
+			  + src_dim2_size * (dim1_len - 1)
+			  + dim2_size;
+      first_elem_offset_host = src_offset0_len * slice_bytes
+			       + src_offset1_len * src_dim2_size
+			       + src_offset2_size;
+      host_ptr = (void *) src;
+      struct agent_info *agent = get_agent_info (dst_ord);
+      copy_agent = agent->id;
+    }
+  else
+    {
+      size_t slice_bytes = dst_dim2_size * dst_dim1_len;
+      bounded_size_host = slice_bytes * (dim0_len - 1)
+			  + dst_dim2_size * (dim1_len - 1)
+			  + dim2_size;
+      first_elem_offset_host = dst_offset0_len * slice_bytes
+			       + dst_offset1_len * dst_dim2_size
+			       + dst_offset2_size;
+      host_ptr = dst;
+      struct agent_info *agent = get_agent_info (src_ord);
+      copy_agent = agent->id;
+    }
+
+  void *agent_ptr;
+
+  hsa_status_t status
+    = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
+				      bounded_size_host, NULL, 0, &agent_ptr);
+  /* We can't lock the host memory: don't give up though, we might still be
+     able to use the slow path in our caller (maybe even with iterated memcpy2d
+     calls).  So, don't make this an error.  */
+  if (status != HSA_STATUS_SUCCESS)
+    return -1;
+
+  hsa_pitched_ptr_t dstpp, srcpp;
+  hsa_dim3_t dst_offsets, src_offsets, ranges;
+
+  int retval = 1;
+
+  hsa_signal_t completion_signal;
+  status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      retval = -1;
+      goto unlock;
+    }
+
+  if (dir == hsaHostToDevice)
+    {
+      srcpp.base = agent_ptr - first_elem_offset_host;
+      dstpp.base = dst;
+    }
+  else
+    {
+      srcpp.base = (void *) src;
+      dstpp.base = agent_ptr - first_elem_offset_host;
+    }
+
+  /* Pitch is measured in bytes.  */
+  srcpp.pitch = src_dim2_size;
+  /* Slice is also measured in bytes (i.e. total per-slice).  */
+  srcpp.slice = src_dim2_size * src_dim1_len;
+
+  src_offsets.x = src_offset2_size;
+  src_offsets.y = src_offset1_len;
+  src_offsets.z = src_offset0_len;
+
+  /* As above.  */
+  dstpp.pitch = dst_dim2_size;
+  dstpp.slice = dst_dim2_size * dst_dim1_len;
+
+  dst_offsets.x = dst_offset2_size;
+  dst_offsets.y = dst_offset1_len;
+  dst_offsets.z = dst_offset0_len;
+
+  ranges.x = dim2_size;
+  ranges.y = dim1_len;
+  ranges.z = dim0_len;
+
+  status
+    = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
+						 &src_offsets, &ranges,
+						 copy_agent, dir, 0, NULL,
+						 completion_signal);
+  /* If the rectangular copy fails, we might still be able to use the slow
+     path.  We need to unlock the host memory though, so don't return
+     immediately.  */
+  if (status != HSA_STATUS_SUCCESS)
+    retval = -1;
+  else
+    {
+      hsa_signal_value_t sv
+	= hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
+					      HSA_SIGNAL_CONDITION_LT, 1,
+					      UINT64_MAX,
+					      HSA_WAIT_STATE_ACTIVE);
+      if (sv < 0)
+	{
+	  GCN_WARNING ("async copy rect failure");
+	  retval = -1;
+	}
+    }
+
+  hsa_fns.hsa_signal_destroy_fn (completion_signal);
+
+unlock:
+  status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not unlock host memory", status);
+
+  return retval;
+}
+
 /* }}}  */
 /* {{{ OpenMP Plugin API  */
 
diff --git a/libgomp/target.c b/libgomp/target.c
index 9bdf3412c04..114a0375806 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -5893,6 +5893,37 @@  omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
       if (__builtin_mul_overflow (span, strides[0], &stride))
 	return EINVAL;
 
+      if (((src_devicep && src_devicep->memcpy2d_func)
+	   || (dst_devicep && dst_devicep->memcpy2d_func))
+	  && (stride % element_size) == 0)
+	{
+	  /* Try using memcpy2d for a 1-dimensional strided access.  Here we
+	     treat the transfer as a 2-dimensional array, where the inner
+	     dimension is calculated to be (stride in bytes) / element_size.
+	     Indices/offsets are adjusted so the source/destination pointers
+	     point to the first element to be transferred, to make the sums
+	     easier.  (There are some configurations of 2D strided accesses
+	     that memcpy3d could handle similarly, but those are probably rare
+	     and are unimplemented for now.)   */
+
+	  /* If stride is element size, this is a contiguous transfer and
+	     should have been handled above.  */
+	  assert (stride > element_size);
+
+	  int dst_id = dst_devicep ? dst_devicep->target_id : -1;
+	  int src_id = src_devicep ? src_devicep->target_id : -1;
+	  void *subarray_src = (char *) src + src_off;
+	  void *subarray_dst = (char *) dst + dst_off;
+
+	  struct gomp_device_descr *devp = dst_devicep ? dst_devicep
+						       : src_devicep;
+	  ret = devp->memcpy2d_func (dst_id, src_id, element_size, volume[0],
+				     subarray_dst, 0, 0, stride, subarray_src,
+				     0, 0, stride);
+	  if (ret != -1)
+	    return ret ? 0 : EINVAL;
+	}
+
       for (i = 0, ret = 1; i < volume[0] && ret; i++)
 	{
 	  if (src_devicep == NULL)