diff mbox series

OpenMP: Call cuMemcpy2D/cuMemcpy3D for nvptx for omp_target_memcpy_rect

Message ID 3bd8a5c7-3256-0e63-b7c4-1e969f6bba86@codesourcery.com
State New
Headers show
Series OpenMP: Call cuMemcpy2D/cuMemcpy3D for nvptx for omp_target_memcpy_rect | expand

Commit Message

Tobias Burnus July 25, 2023, 9:45 p.m. UTC
The attached patch calls CUDA's cuMemcopy2D and cuMemcpy3D
for omp_target_memcpy_rect[,_async} for dim=2/dim=3. This should
speed up the data transfer for noncontiguous data.

While being there, I ended up adding support for device to other device
copying; while potentially slow, it is still better than not being able to
copy - and with shared-memory, it shouldn't be that bad.

Comments, suggestions, remarks?
If there are none, will commit it...

Disclaimer: While I have done correctness tests (system with two nvptx GPUs,
I have not done any performance tests. (I also tested it without offloading
configured, but that's rather boring.)

Tobias
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955

Comments

Thomas Schwinge July 27, 2023, 9 p.m. UTC | #1
Hi Tobias!

On 2023-07-25T23:45:54+0200, Tobias Burnus <tobias@codesourcery.com> wrote:
> The attached patch calls CUDA's cuMemcopy2D and cuMemcpy3D
> for omp_target_memcpy_rect[,_async} for dim=2/dim=3. This should
> speed up the data transfer for noncontiguous data.

ACK, thanks.

> While being there, I ended up adding support for device to other device
> copying; while potentially slow, it is still better than not being able to
> copy - and with shared-memory, it shouldn't be that bad.

Makes sense, I guess.

> Comments, suggestions, remarks?
> If there are none, will commit it...

You're so quick -- I'm so slow...  ;-)

I've not verified all the logic in here, but I've got a few comments.

> Disclaimer: While I have done correctness tests (system with two nvptx GPUs,
> I have not done any performance tests.

Well, we should, eventually.

> (I also tested it without offloading
> configured, but that's rather boring.)

> OpenMP: Call cuMemcpy2D/cuMemcpy3D for nvptx for omp_target_memcpy_rect
>
> When copying a 2D or 3D rectangular memmory block, the performance is
> better when using CUDA's cuMemcpy2D/cuMemcpy3D instead of copying the
> data one by one. That's what this commit does.

So you've actually done some performance verification?

> Additionally, it permits device-to-device copies, if neccessary using a
> temporary variable on the host.

> --- a/include/cuda/cuda.h
> +++ b/include/cuda/cuda.h

I note that you're not actually using everything you're adding here.
(..., but I understand you're simply adding everying that relates to
these 'cuMemcpy[...]' routines -- OK as far as I'm concerned.)

> @@ -47,6 +47,7 @@ typedef void *CUevent;
>  typedef void *CUfunction;
>  typedef void *CUlinkState;
>  typedef void *CUmodule;
> +typedef void *CUarray;
>  typedef size_t (*CUoccupancyB2DSize)(int);
>  typedef void *CUstream;
>
> @@ -54,7 +55,10 @@ typedef enum {
>    CUDA_SUCCESS = 0,
>    CUDA_ERROR_INVALID_VALUE = 1,
>    CUDA_ERROR_OUT_OF_MEMORY = 2,
> +  CUDA_ERROR_NOT_INITIALIZED = 3,
> +  CUDA_ERROR_DEINITIALIZED = 4,
>    CUDA_ERROR_INVALID_CONTEXT = 201,
> +  CUDA_ERROR_INVALID_HANDLE = 400,
>    CUDA_ERROR_NOT_FOUND = 500,
>    CUDA_ERROR_NOT_READY = 600,
>    CUDA_ERROR_LAUNCH_FAILED = 719,
> @@ -126,6 +130,75 @@ typedef enum {
>    CU_LIMIT_MALLOC_HEAP_SIZE = 0x02,
>  } CUlimit;
>
> +typedef enum {
> +  CU_MEMORYTYPE_HOST = 0x01,
> +  CU_MEMORYTYPE_DEVICE = 0x02,
> +  CU_MEMORYTYPE_ARRAY = 0x03,
> +  CU_MEMORYTYPE_UNIFIED = 0x04
> +} CUmemorytype;
> +
> +typedef struct {
> +  size_t srcXInBytes, srcY;
> +  CUmemorytype srcMemoryType;
> +  const void *srcHost;
> +  CUdeviceptr srcDevice;
> +  CUarray srcArray;
> +  size_t srcPitch;
> +
> +  size_t dstXInBytes, dstY;
> +  CUmemorytype dstMemoryType;
> +  const void *dstHost;

That last one isn't 'const'.  ;-)

> +  CUdeviceptr dstDevice;
> +  CUarray dstArray;
> +  size_t dstPitch;
> +
> +  size_t WidthInBytes, Height;
> +} CUDA_MEMCPY2D;
> +
> +typedef struct {
> +  size_t srcXInBytes, srcY, srcZ;
> +  size_t srcLOD;
> +  CUmemorytype srcMemoryType;
> +  const void *srcHost;
> +  CUdeviceptr srcDevice;
> +  CUarray srcArray;
> +  void *dummy;

A 'cuda.h' that I looked at calls that last one 'reserved0', with comment
"Must be NULL".

> +  size_t srcPitch, srcHeight;
> +
> +  size_t dstXInBytes, dstY, dstZ;
> +  size_t dstLOD;
> +  CUmemorytype dstMemoryType;
> +  const void *dstHost;

Again, not 'const'.

> +  CUdeviceptr dstDevice;
> +  CUarray dstArray;
> +  void *dummy2;

Similar to above: 'reserved1', with comment "Must be NULL".

> +  size_t dstPitch, dstHeight;
> +
> +  size_t WidthInBytes, Height, Depth;
> +} CUDA_MEMCPY3D;
> +
> +typedef struct {
> +  size_t srcXInBytes, srcY, srcZ;
> +  size_t srcLOD;
> +  CUmemorytype srcMemoryType;
> +  const void *srcHost;
> +  CUdeviceptr srcDevice;
> +  CUarray srcArray;
> +  CUcontext srcContext;
> +  size_t srcPitch, srcHeight;
> +
> +  size_t dstXInBytes, dstY, dstZ;
> +  size_t dstLOD;
> +  CUmemorytype dstMemoryType;
> +  const void *dstHost;
> +  CUdeviceptr dstDevice;
> +  CUarray dstArray;
> +  CUcontext dstContext;
> +  size_t dstPitch, dstHeight;
> +
> +  size_t WidthInBytes, Height, Depth;
> +} CUDA_MEMCPY3D_PEER;
> +
>  #define cuCtxCreate cuCtxCreate_v2
>  CUresult cuCtxCreate (CUcontext *, unsigned, CUdevice);
>  #define cuCtxDestroy cuCtxDestroy_v2
> @@ -183,6 +256,18 @@ CUresult cuMemcpyDtoHAsync (void *, CUdeviceptr, size_t, CUstream);
>  CUresult cuMemcpyHtoD (CUdeviceptr, const void *, size_t);
>  #define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2
>  CUresult cuMemcpyHtoDAsync (CUdeviceptr, const void *, size_t, CUstream);
> +#define cuMemcpy2D cuMemcpy2D_v2
> +CUresult cuMemcpy2D (const CUDA_MEMCPY2D *);
> +#define cuMemcpy2DAsync cuMemcpy2DAsync_v2
> +CUresult cuMemcpy2DAsync (const CUDA_MEMCPY2D *, CUstream);
> +#define cuMemcpy2DUnaligned cuMemcpy2DUnaligned_v2
> +CUresult cuMemcpy2DUnaligned (const CUDA_MEMCPY2D *);
> +#define cuMemcpy3D cuMemcpy3D_v2
> +CUresult cuMemcpy3D (const CUDA_MEMCPY3D *);
> +#define cuMemcpy3DAsync cuMemcpy3DAsync_v2
> +CUresult cuMemcpy3DAsync (const CUDA_MEMCPY3D *, CUstream);
> +CUresult cuMemcpy3DPeer (const CUDA_MEMCPY3D_PEER *);
> +CUresult cuMemcpy3DPeerAsync (const CUDA_MEMCPY3D_PEER *, CUstream);

So these are the 'v2' variants, unconditionally.  (Accompanied by the
'v2' data types defined above.)  But that'll be OK, as I see these
already in CUDA 6.5 'cuda.h'.

> --- a/libgomp/libgomp-plugin.h
> +++ b/libgomp/libgomp-plugin.h

> +extern int GOMP_OFFLOAD_memcpy2d (int, int, size_t, size_t,
> +                               void*, size_t, size_t, size_t,
> +                               const void*, size_t, size_t, size_t);
> +extern int GOMP_OFFLOAD_memcpy3d (int, int, size_t, size_t, size_t, void *,
> +                               size_t, size_t, size_t, size_t, size_t,
> +                               const void *, size_t, size_t, size_t, size_t,
> +                               size_t);

Oh, wow.  ;-)

> --- a/libgomp/plugin/plugin-nvptx.c
> +++ b/libgomp/plugin/plugin-nvptx.c
> @@ -1781,6 +1781,122 @@ GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
>    return true;
>  }
>
> +int
> +GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
> +                    size_t dim0_len, void *dst, size_t dst_offset1_size,
> +                    size_t dst_offset0_len, size_t dst_dim1_size,
> +                    const void *src, size_t src_offset1_size,
> +                    size_t src_offset0_len, size_t src_dim1_size)
> +{
> +  if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : dst_ord))
> +    return false;
> +
> +  /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported.  */
> +
> +  CUDA_MEMCPY2D data;
> +  data.WidthInBytes = dim1_size;
> +  data.Height = dim0_len;
> +
> +  if (dst_ord == -1)
> +    {
> +      data.dstMemoryType = CU_MEMORYTYPE_HOST;
> +      data.dstHost = dst;
> +    }
> +  else
> +    {
> +      data.dstMemoryType = CU_MEMORYTYPE_DEVICE;
> +      data.dstDevice = (CUdeviceptr) dst;
> +    }
> +  data.dstPitch = dst_dim1_size;
> +  data.dstXInBytes = dst_offset1_size;
> +  data.dstY = dst_offset0_len;
> +
> +  if (src_ord == -1)
> +    {
> +      data.srcMemoryType = CU_MEMORYTYPE_HOST;
> +      data.srcHost = src;
> +    }
> +  else
> +    {
> +      data.srcMemoryType = CU_MEMORYTYPE_DEVICE;
> +      data.srcDevice = (CUdeviceptr) src;
> +    }
> +  data.srcPitch = src_dim1_size;
> +  data.srcXInBytes = src_offset1_size;
> +  data.srcY = src_offset0_len;
> +
> +  CUresult res = CUDA_CALL_NOCHECK (cuMemcpy2D, &data);
> +  if (res == CUDA_ERROR_INVALID_VALUE)
> +    /* If pitch > CU_DEVICE_ATTRIBUTE_MAX_PITCH or for device-to-device
> +       for (some) memory not allocated by cuMemAllocPitch, cuMemcpy2D fails
> +       with an error; try the slower cuMemcpy2DUnaligned now.  */
> +    CUDA_CALL (cuMemcpy2DUnaligned, &data);
> +  else if (res != CUDA_SUCCESS)
> +    {
> +      GOMP_PLUGIN_error ("cuMemcpy2D error: %s", cuda_error (res));
> +      return false;
> +    }
> +  return true;
> +}
> +
> +int
> +GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
> +                    size_t dim1_len, size_t dim0_len, void *dst,
> +                    size_t dst_offset2_size, size_t dst_offset1_len,
> +                    size_t dst_offset0_len, size_t dst_dim2_size,
> +                    size_t dst_dim1_len, const void *src,
> +                    size_t src_offset2_size, size_t src_offset1_len,
> +                    size_t src_offset0_len, size_t src_dim2_size,
> +                    size_t src_dim1_len)
> +{
> +  if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : dst_ord))
> +    return false;
> +
> +  /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported.  */
> +
> +  CUDA_MEMCPY3D data;

I note that this doesn't adhere to the two "Must be NULL" remarks from
above -- but I'm confused, because, for example, on
<https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4b5238975579f002c0199a3800ca44df>
"cuMemcpy3D", there also are no such remarks.  (... in contast to:
"srcLOD and dstLOD members of the CUDA_MEMCPY3D structure must be set to 0",
which you do set accordingly.)

Maybe just 'memset' the whole thing to '0', for good measure?

> +  data.WidthInBytes = dim2_size;
> +  data.Height = dim1_len;
> +  data.Depth = dim0_len;
> +
> +  if (dst_ord == -1)
> +    {
> +      data.dstMemoryType = CU_MEMORYTYPE_HOST;
> +      data.dstHost = dst;
> +    }
> +  else
> +    {
> +      data.dstMemoryType = CU_MEMORYTYPE_DEVICE;
> +      data.dstDevice = (CUdeviceptr) dst;
> +    }
> +  data.dstPitch = dst_dim2_size;
> +  data.dstHeight = dst_dim1_len;
> +  data.dstXInBytes = dst_offset2_size;
> +  data.dstY = dst_offset1_len;
> +  data.dstZ = dst_offset0_len;
> +  data.dstLOD = 0;
> +
> +  if (src_ord == -1)
> +    {
> +      data.srcMemoryType = CU_MEMORYTYPE_HOST;
> +      data.srcHost = src;
> +    }
> +  else
> +    {
> +      data.srcMemoryType = CU_MEMORYTYPE_DEVICE;
> +      data.srcDevice = (CUdeviceptr) src;
> +    }
> +  data.srcPitch = src_dim2_size;
> +  data.srcHeight = src_dim1_len;
> +  data.srcXInBytes = src_offset2_size;
> +  data.srcY = src_offset1_len;
> +  data.srcZ = src_offset0_len;
> +  data.srcLOD = 0;
> +
> +  CUDA_CALL (cuMemcpy3D, &data);
> +  return true;
> +}

> --- a/libgomp/target.c
> +++ b/libgomp/target.c
> @@ -4526,7 +4526,8 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
>                              const size_t *dst_dimensions,
>                              const size_t *src_dimensions,
>                              struct gomp_device_descr *dst_devicep,
> -                            struct gomp_device_descr *src_devicep)
> +                            struct gomp_device_descr *src_devicep,
> +                            size_t *tmp_size, void **tmp)
>  {
>    size_t dst_slice = element_size;
>    size_t src_slice = element_size;
> @@ -4539,36 +4540,120 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
>         || __builtin_mul_overflow (element_size, dst_offsets[0], &dst_off)
>         || __builtin_mul_overflow (element_size, src_offsets[0], &src_off))
>       return EINVAL;
> -      if (dst_devicep == NULL && src_devicep == NULL)
> -     {
> -       memcpy ((char *) dst + dst_off, (const char *) src + src_off,
> -               length);
> -       ret = 1;
> -     }
> -      else if (src_devicep == NULL)
> -     ret = dst_devicep->host2dev_func (dst_devicep->target_id,
> +      if (src_devicep != NULL && src_devicep == dst_devicep)
> +     ret = src_devicep->dev2dev_func (src_devicep->target_id,
> +                                      (char *) dst + dst_off,
> +                                      (const char *) src + src_off,
> +                                      length);

So you moved up the intra-device case up here...

> +      else if (src_devicep != NULL
> +            && (dst_devicep == NULL
> +                || (dst_devicep->capabilities
> +                    & GOMP_OFFLOAD_CAP_SHARED_MEM)))

Are these 'GOMP_OFFLOAD_CAP_SHARED_MEM' actually reachable, given that
'omp_target_memcpy_check' (via 'omp_target_memcpy_rect_check') clears out
the device to 'NULL' for 'GOMP_OFFLOAD_CAP_SHARED_MEM'?  In other words,
was the original code already doing what you've implemented here?

> +     ret = src_devicep->dev2host_func (src_devicep->target_id,
>                                         (char *) dst + dst_off,
>                                         (const char *) src + src_off,
>                                         length);
> -      else if (dst_devicep == NULL)
> -     ret = src_devicep->dev2host_func (src_devicep->target_id,
> +      else if (dst_devicep != NULL
> +            && (src_devicep == NULL
> +                || (src_devicep->capabilities
> +                     & GOMP_OFFLOAD_CAP_SHARED_MEM)))
> +     ret = dst_devicep->host2dev_func (dst_devicep->target_id,
>                                         (char *) dst + dst_off,
>                                         (const char *) src + src_off,
>                                         length);
> +      else if (dst_devicep == NULL && src_devicep == NULL)
> +     {
> +       memcpy ((char *) dst + dst_off, (const char *) src + src_off,
> +               length);
> +       ret = 1;
> +     }
>        else if (src_devicep == dst_devicep)
>       ret = src_devicep->dev2dev_func (src_devicep->target_id,
>                                        (char *) dst + dst_off,
>                                        (const char *) src + src_off,
>                                        length);

..., but also left the intra-device case here -- which should now be dead
code here?

>        else
> -     ret = 0;
> +     {
> +       if (*tmp_size == 0)
> +         {
> +           *tmp_size = length;
> +           *tmp = malloc (length);
> +           if (*tmp == NULL)
> +             return ENOMEM;
> +         }
> +       else if (*tmp_size < length)
> +         {
> +           *tmp_size = length;
> +           *tmp = realloc (*tmp, length);
> +           if (*tmp == NULL)
> +             return ENOMEM;

If 'realloc' returns 'NULL', we should 'free' the original '*tmp'?

Do we really need here the property here that if the re-allocation can't
be done in-place, 'realloc' copies the original content to the new?  In
other words, should we just unconditionally 'free' and re-'malloc' here,
instead of 'realloc'?

I haven't looked whether the re-use of 'tmp' for multiple calls to this
is then actually useful, or whether we should just always 'malloc', use,
'free' the buffer here?

> +         }
> +       ret = src_devicep->dev2host_func (src_devicep->target_id, *tmp,
> +                                         (const char *) src + src_off,
> +                                         length);
> +       if (ret == 1)
> +         ret = dst_devicep->host2dev_func (dst_devicep->target_id,
> +                                           (char *) dst + dst_off, *tmp,
> +                                           length);
> +     }
>        return ret ? 0 : EINVAL;

(For later...)  Is that what
<https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1ge1f5c7771544fee150ada8853c7cbf4a>
"cuMemcpyPeer" would be used for?

>      }
>
> -  /* FIXME: it would be nice to have some plugin function to handle
> -     num_dims == 2 and num_dims == 3 more efficiently.  Larger ones can
> -     be handled in the generic recursion below, and for host-host it
> -     should be used even for any num_dims >= 2.  */
> +  /* host->device, device->host and same-device device->device.  */
> +  if (num_dims == 2
> +      && ((src_devicep
> +        && src_devicep == dst_devicep
> +        && src_devicep->memcpy2d_func)
> +       || (!src_devicep != !dst_devicep
> +           && ((src_devicep && src_devicep->memcpy2d_func)
> +               || (dst_devicep && dst_devicep->memcpy2d_func)))))
> +    {
> +      size_t vol_sz1, dst_sz1, src_sz1, dst_off_sz1, src_off_sz1;
> +      int dst_id = dst_devicep ? dst_devicep->target_id : -1;
> +      int src_id = src_devicep ? src_devicep->target_id : -1;
> +      struct gomp_device_descr *devp = dst_devicep ? dst_devicep : src_devicep;
> +
> +      if (__builtin_mul_overflow (volume[1], element_size, &vol_sz1)
> +       || __builtin_mul_overflow (dst_dimensions[1], element_size, &dst_sz1)
> +       || __builtin_mul_overflow (src_dimensions[1], element_size, &src_sz1)
> +       || __builtin_mul_overflow (dst_offsets[1], element_size, &dst_off_sz1)
> +       || __builtin_mul_overflow (src_offsets[1], element_size,
> +                                  &src_off_sz1))
> +     return EINVAL;
> +      ret = devp->memcpy2d_func (dst_id, src_id, vol_sz1, volume[0],
> +                              dst, dst_off_sz1, dst_offsets[0], dst_sz1,
> +                              src, src_off_sz1, src_offsets[0], src_sz1);
> +      if (ret != -1)
> +     return ret ? 0 : EINVAL;
> +    }
> +  else if (num_dims == 3
> +        && ((src_devicep
> +             && src_devicep == dst_devicep
> +             && src_devicep->memcpy3d_func)
> +            || (!src_devicep != !dst_devicep
> +                && ((src_devicep && src_devicep->memcpy3d_func)
> +                    || (dst_devicep && dst_devicep->memcpy3d_func)))))
> +    {
> +      size_t vol_sz2, dst_sz2, src_sz2, dst_off_sz2, src_off_sz2;
> +      int dst_id = dst_devicep ? dst_devicep->target_id : -1;
> +      int src_id = src_devicep ? src_devicep->target_id : -1;
> +      struct gomp_device_descr *devp = dst_devicep ? dst_devicep : src_devicep;
> +
> +      if (__builtin_mul_overflow (volume[2], element_size, &vol_sz2)
> +       || __builtin_mul_overflow (dst_dimensions[2], element_size, &dst_sz2)
> +       || __builtin_mul_overflow (src_dimensions[2], element_size, &src_sz2)
> +       || __builtin_mul_overflow (dst_offsets[2], element_size, &dst_off_sz2)
> +       || __builtin_mul_overflow (src_offsets[2], element_size,
> +                                  &src_off_sz2))
> +     return EINVAL;
> +      ret = devp->memcpy3d_func (dst_id, src_id, vol_sz2, volume[1], volume[0],
> +                              dst, dst_off_sz2, dst_offsets[1],
> +                              dst_offsets[0], dst_sz2, dst_dimensions[1],
> +                              src, src_off_sz2, src_offsets[1],
> +                              src_offsets[0], src_sz2, src_dimensions[1]);
> +      if (ret != -1)
> +     return ret ? 0 : EINVAL;
> +    }
>
>    for (i = 1; i < num_dims; i++)
>      if (__builtin_mul_overflow (dst_slice, dst_dimensions[i], &dst_slice)
> @@ -4585,7 +4670,7 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
>                                          volume + 1, dst_offsets + 1,
>                                          src_offsets + 1, dst_dimensions + 1,
>                                          src_dimensions + 1, dst_devicep,
> -                                        src_devicep);
> +                                        src_devicep, tmp_size, tmp);
>        if (ret)
>       return ret;
>        dst_off += dst_slice;
> @@ -4608,9 +4693,6 @@ omp_target_memcpy_rect_check (void *dst, const void *src, int dst_device_num,
>    if (ret)
>      return ret;
>
> -  if (*src_devicep != NULL && *dst_devicep != NULL && *src_devicep != *dst_devicep)
> -    return EINVAL;
> -
>    return 0;
>  }
>
> @@ -4624,18 +4706,36 @@ omp_target_memcpy_rect_copy (void *dst, const void *src,
>                            struct gomp_device_descr *dst_devicep,
>                            struct gomp_device_descr *src_devicep)
>  {
> -  if (src_devicep)
> +  size_t tmp_size = 0;
> +  void *tmp = NULL;
> +  bool lock_src;
> +  bool lock_dst;
> +
> +  lock_src = (src_devicep
> +           && (!dst_devicep
> +               || src_devicep == dst_devicep
> +               || !(src_devicep->capabilities
> +                    & GOMP_OFFLOAD_CAP_SHARED_MEM)));

Similar doubt than above re "'GOMP_OFFLOAD_CAP_SHARED_MEM' actually
reachable"?

> +  lock_dst = (dst_devicep
> +           && (!lock_src
> +               || (src_devicep != dst_devicep
> +                   && !(dst_devicep->capabilities
> +                        & GOMP_OFFLOAD_CAP_SHARED_MEM))));
> +  if (lock_src)
>      gomp_mutex_lock (&src_devicep->lock);
> -  else if (dst_devicep)
> +  if (lock_dst)
>      gomp_mutex_lock (&dst_devicep->lock);

(Pre-existing issue, and I've not myself tried to figure out the details
at this time -- why do we actually lock the devices here, and in similar
other places?)

>    int ret = omp_target_memcpy_rect_worker (dst, src, element_size, num_dims,
>                                          volume, dst_offsets, src_offsets,
>                                          dst_dimensions, src_dimensions,
> -                                        dst_devicep, src_devicep);
> -  if (src_devicep)
> +                                        dst_devicep, src_devicep,
> +                                        &tmp_size, &tmp);
> +  if (lock_src)
>      gomp_mutex_unlock (&src_devicep->lock);
> -  else if (dst_devicep)
> +  if (lock_dst)
>      gomp_mutex_unlock (&dst_devicep->lock);
> +  if (tmp)
> +    free (tmp);
>
>    return ret;
>  }
> @@ -4976,6 +5076,8 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
>    DLSYM (free);
>    DLSYM (dev2host);
>    DLSYM (host2dev);
> +  DLSYM (memcpy2d);
> +  DLSYM (memcpy3d);

With 'DLSYM' used here, won't that fail if these symbols don't actually
exist (like for 'libgomp/plugin/plugin-gcn.c')?


I'm attaching the humble beginnings of a follow-on patch; feel free to
use.


Grüße
 Thomas


-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
diff mbox series

Patch

OpenMP: Call cuMemcpy2D/cuMemcpy3D for nvptx for omp_target_memcpy_rect

When copying a 2D or 3D rectangular memmory block, the performance is
better when using CUDA's cuMemcpy2D/cuMemcpy3D instead of copying the
data one by one. That's what this commit does.

Additionally, it permits device-to-device copies, if neccessary using a
temporary variable on the host.

include/ChangeLog:

	* cuda/cuda.h (CUlimit): Add CUDA_ERROR_NOT_INITIALIZED,
	CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_INVALID_HANDLE.
	(CUarray, CUmemorytype, CUDA_MEMCPY2D, CUDA_MEMCPY3D,
	CUDA_MEMCPY3D_PEER): New typdefs.
	(cuMemcpy2D, cuMemcpy2DAsync, cuMemcpy2DUnaligned,
	cuMemcpy3D, cuMemcpy3DAsync, cuMemcpy3DPeer,
	cuMemcpy3DPeerAsync): New prototypes.

libgomp/ChangeLog:

	* libgomp-plugin.h (GOMP_OFFLOAD_memcpy2d,
	GOMP_OFFLOAD_memcpy3d): New prototypes.
	* libgomp.h (struct gomp_device_descr): Add memcpy2d_func
	and memcpy3d_func.
	* libgomp.texi (5.1 Impl. Status): Add 'defaultmap(:all)' with 'N'.
	(nvtpx): Document when cuMemcpy2D/cuMemcpy3D is used.
	* oacc-host.c (memcpy2d_func, .memcpy3d_func): Init with NULL.
	* plugin/cuda-lib.def (cuMemcpy2D, cuMemcpy2DUnaligned,
	cuMemcpy3D): Invoke via CUDA_ONE_CALL.
	* plugin/plugin-nvptx.c (GOMP_OFFLOAD_memcpy2d,
	GOMP_OFFLOAD_memcpy3d): New.
	* target.c (omp_target_memcpy_rect_worker):
	(omp_target_memcpy_rect_check, omp_target_memcpy_rect_copy):
	Permit all device-to-device copyies; invoke new plugins for
	2D and 3D copying when available.
	(gomp_load_plugin_for_device): DLSYM the new plugin functions.
	* testsuite/libgomp.c/target-12.c: Fix dimension bug.
	* testsuite/libgomp.fortran/target-12.f90: Likewise.
	* testsuite/libgomp.fortran/target-memcpy-rect-1.f90: New test.

 include/cuda/cuda.h                                |  85 ++++
 libgomp/libgomp-plugin.h                           |   7 +
 libgomp/libgomp.h                                  |   2 +
 libgomp/libgomp.texi                               |   6 +
 libgomp/oacc-host.c                                |   2 +
 libgomp/plugin/cuda-lib.def                        |   3 +
 libgomp/plugin/plugin-nvptx.c                      | 116 +++++
 libgomp/target.c                                   | 152 +++++-
 libgomp/testsuite/libgomp.c/target-12.c            |   6 +-
 libgomp/testsuite/libgomp.fortran/target-12.f90    |   6 +-
 .../libgomp.fortran/target-memcpy-rect-1.f90       | 531 +++++++++++++++++++++
 11 files changed, 885 insertions(+), 31 deletions(-)

diff --git a/include/cuda/cuda.h b/include/cuda/cuda.h
index 338626fb6dc..09c3c2b8dbe 100644
--- a/include/cuda/cuda.h
+++ b/include/cuda/cuda.h
@@ -47,6 +47,7 @@  typedef void *CUevent;
 typedef void *CUfunction;
 typedef void *CUlinkState;
 typedef void *CUmodule;
+typedef void *CUarray;
 typedef size_t (*CUoccupancyB2DSize)(int);
 typedef void *CUstream;
 
@@ -54,7 +55,10 @@  typedef enum {
   CUDA_SUCCESS = 0,
   CUDA_ERROR_INVALID_VALUE = 1,
   CUDA_ERROR_OUT_OF_MEMORY = 2,
+  CUDA_ERROR_NOT_INITIALIZED = 3,
+  CUDA_ERROR_DEINITIALIZED = 4,
   CUDA_ERROR_INVALID_CONTEXT = 201,
+  CUDA_ERROR_INVALID_HANDLE = 400,
   CUDA_ERROR_NOT_FOUND = 500,
   CUDA_ERROR_NOT_READY = 600,
   CUDA_ERROR_LAUNCH_FAILED = 719,
@@ -126,6 +130,75 @@  typedef enum {
   CU_LIMIT_MALLOC_HEAP_SIZE = 0x02,
 } CUlimit;
 
+typedef enum {
+  CU_MEMORYTYPE_HOST = 0x01,
+  CU_MEMORYTYPE_DEVICE = 0x02,
+  CU_MEMORYTYPE_ARRAY = 0x03,
+  CU_MEMORYTYPE_UNIFIED = 0x04
+} CUmemorytype;
+
+typedef struct {
+  size_t srcXInBytes, srcY;
+  CUmemorytype srcMemoryType;
+  const void *srcHost;
+  CUdeviceptr srcDevice;
+  CUarray srcArray;
+  size_t srcPitch;
+
+  size_t dstXInBytes, dstY;
+  CUmemorytype dstMemoryType;
+  const void *dstHost;
+  CUdeviceptr dstDevice;
+  CUarray dstArray;
+  size_t dstPitch;
+
+  size_t WidthInBytes, Height;
+} CUDA_MEMCPY2D;
+
+typedef struct {
+  size_t srcXInBytes, srcY, srcZ;
+  size_t srcLOD;
+  CUmemorytype srcMemoryType;
+  const void *srcHost;
+  CUdeviceptr srcDevice;
+  CUarray srcArray;
+  void *dummy;
+  size_t srcPitch, srcHeight;
+
+  size_t dstXInBytes, dstY, dstZ;
+  size_t dstLOD;
+  CUmemorytype dstMemoryType;
+  const void *dstHost;
+  CUdeviceptr dstDevice;
+  CUarray dstArray;
+  void *dummy2;
+  size_t dstPitch, dstHeight;
+
+  size_t WidthInBytes, Height, Depth;
+} CUDA_MEMCPY3D;
+
+typedef struct {
+  size_t srcXInBytes, srcY, srcZ;
+  size_t srcLOD;
+  CUmemorytype srcMemoryType;
+  const void *srcHost;
+  CUdeviceptr srcDevice;
+  CUarray srcArray;
+  CUcontext srcContext;
+  size_t srcPitch, srcHeight;
+
+  size_t dstXInBytes, dstY, dstZ;
+  size_t dstLOD;
+  CUmemorytype dstMemoryType;
+  const void *dstHost;
+  CUdeviceptr dstDevice;
+  CUarray dstArray;
+  CUcontext dstContext;
+  size_t dstPitch, dstHeight;
+
+  size_t WidthInBytes, Height, Depth;
+} CUDA_MEMCPY3D_PEER;
+
 #define cuCtxCreate cuCtxCreate_v2
 CUresult cuCtxCreate (CUcontext *, unsigned, CUdevice);
 #define cuCtxDestroy cuCtxDestroy_v2
@@ -183,6 +256,18 @@  CUresult cuMemcpyDtoHAsync (void *, CUdeviceptr, size_t, CUstream);
 CUresult cuMemcpyHtoD (CUdeviceptr, const void *, size_t);
 #define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2
 CUresult cuMemcpyHtoDAsync (CUdeviceptr, const void *, size_t, CUstream);
+#define cuMemcpy2D cuMemcpy2D_v2
+CUresult cuMemcpy2D (const CUDA_MEMCPY2D *);
+#define cuMemcpy2DAsync cuMemcpy2DAsync_v2
+CUresult cuMemcpy2DAsync (const CUDA_MEMCPY2D *, CUstream);
+#define cuMemcpy2DUnaligned cuMemcpy2DUnaligned_v2
+CUresult cuMemcpy2DUnaligned (const CUDA_MEMCPY2D *);
+#define cuMemcpy3D cuMemcpy3D_v2
+CUresult cuMemcpy3D (const CUDA_MEMCPY3D *);
+#define cuMemcpy3DAsync cuMemcpy3DAsync_v2
+CUresult cuMemcpy3DAsync (const CUDA_MEMCPY3D *, CUstream);
+CUresult cuMemcpy3DPeer (const CUDA_MEMCPY3D_PEER *);
+CUresult cuMemcpy3DPeerAsync (const CUDA_MEMCPY3D_PEER *, CUstream);
 #define cuMemFree cuMemFree_v2
 CUresult cuMemFree (CUdeviceptr);
 CUresult cuMemFreeHost (void *);
diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index 42ee3d6c7f9..dc993882c3b 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -139,6 +139,13 @@  extern bool GOMP_OFFLOAD_free (int, void *);
 extern bool GOMP_OFFLOAD_dev2host (int, void *, const void *, size_t);
 extern bool GOMP_OFFLOAD_host2dev (int, void *, const void *, size_t);
 extern bool GOMP_OFFLOAD_dev2dev (int, void *, const void *, size_t);
+extern int GOMP_OFFLOAD_memcpy2d (int, int, size_t, size_t,
+				  void*, size_t, size_t, size_t,
+				  const void*, size_t, size_t, size_t);
+extern int GOMP_OFFLOAD_memcpy3d (int, int, size_t, size_t, size_t, void *,
+				  size_t, size_t, size_t, size_t, size_t,
+				  const void *, size_t, size_t, size_t, size_t,
+				  size_t);
 extern bool GOMP_OFFLOAD_can_run (void *);
 extern void GOMP_OFFLOAD_run (int, void *, void *, void **);
 extern void GOMP_OFFLOAD_async_run (int, void *, void *, void **, void *);
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 4d2bfab4b71..68f20651fbf 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -1388,6 +1388,8 @@  struct gomp_device_descr
   __typeof (GOMP_OFFLOAD_free) *free_func;
   __typeof (GOMP_OFFLOAD_dev2host) *dev2host_func;
   __typeof (GOMP_OFFLOAD_host2dev) *host2dev_func;
+  __typeof (GOMP_OFFLOAD_memcpy2d) *memcpy2d_func;
+  __typeof (GOMP_OFFLOAD_memcpy3d) *memcpy3d_func;
   __typeof (GOMP_OFFLOAD_dev2dev) *dev2dev_func;
   __typeof (GOMP_OFFLOAD_can_run) *can_run_func;
   __typeof (GOMP_OFFLOAD_run) *run_func;
diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index 9d3b2ae54cb..00b91ae96de 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -424,6 +424,7 @@  to address of matching mapped list item per 5.1, Sect. 2.21.7.2 @tab N @tab
 @item For Fortran, optional comma between directive and clause @tab N @tab
 @item Conforming device numbers and @code{omp_initial_device} and
       @code{omp_invalid_device} enum/PARAMETER @tab Y @tab
+@item @code{all} as @emph{implicit-behavior} for @code{defaultmap} @tab N @tab
 @item Initial value of @var{default-device-var} ICV with
       @code{OMP_TARGET_OFFLOAD=mandatory} @tab Y @tab
 @item @emph{interop_types} in any position of the modifier list for the @code{init} clause
@@ -5033,6 +5034,11 @@  The implementation remark:
       list of available devices (``host fallback'').
 @item The default per-warp stack size is 128 kiB; see also @code{-msoft-stack}
       in the GCC manual.
+@item The OpenMP routines @code{omp_target_memcpy_rect} and
+      @code{omp_target_memcpy_rect_async} and the @code{target update}
+      directive for non-contiguous list items will use the 2D and 3D
+      memory-copy functions of the CUDA library.  Higher dimensions will
+      call those functions in a loop and are therefore supported.
 @end itemize
 
 
diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c
index b75e8bead39..5980d510838 100644
--- a/libgomp/oacc-host.c
+++ b/libgomp/oacc-host.c
@@ -281,6 +281,8 @@  static struct gomp_device_descr host_dispatch =
     .free_func = host_free,
     .dev2host_func = host_dev2host,
     .host2dev_func = host_host2dev,
+    .memcpy2d_func = NULL,
+    .memcpy3d_func = NULL,
     .run_func = host_run,
 
     .mem_map = { NULL },
diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index dff42d6a9ac..007c6e0f4df 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -36,6 +36,9 @@  CUDA_ONE_CALL (cuMemcpyDtoH)
 CUDA_ONE_CALL (cuMemcpyDtoHAsync)
 CUDA_ONE_CALL (cuMemcpyHtoD)
 CUDA_ONE_CALL (cuMemcpyHtoDAsync)
+CUDA_ONE_CALL (cuMemcpy2D)
+CUDA_ONE_CALL (cuMemcpy2DUnaligned)
+CUDA_ONE_CALL (cuMemcpy3D)
 CUDA_ONE_CALL (cuMemFree)
 CUDA_ONE_CALL (cuMemFreeHost)
 CUDA_ONE_CALL (cuMemGetAddressRange)
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index ffc8e2d79d1..9cdc55cac6b 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -1781,6 +1781,122 @@  GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
   return true;
 }
 
+int
+GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
+		       size_t dim0_len, void *dst, size_t dst_offset1_size,
+		       size_t dst_offset0_len, size_t dst_dim1_size,
+		       const void *src, size_t src_offset1_size,
+		       size_t src_offset0_len, size_t src_dim1_size)
+{
+  if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : dst_ord))
+    return false;
+
+  /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported.  */
+
+  CUDA_MEMCPY2D data;
+  data.WidthInBytes = dim1_size;
+  data.Height = dim0_len;
+
+  if (dst_ord == -1)
+    {
+      data.dstMemoryType = CU_MEMORYTYPE_HOST;
+      data.dstHost = dst;
+    }
+  else
+    {
+      data.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+      data.dstDevice = (CUdeviceptr) dst;
+    }
+  data.dstPitch = dst_dim1_size;
+  data.dstXInBytes = dst_offset1_size;
+  data.dstY = dst_offset0_len;
+
+  if (src_ord == -1)
+    {
+      data.srcMemoryType = CU_MEMORYTYPE_HOST;
+      data.srcHost = src;
+    }
+  else
+    {
+      data.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+      data.srcDevice = (CUdeviceptr) src;
+    }
+  data.srcPitch = src_dim1_size;
+  data.srcXInBytes = src_offset1_size;
+  data.srcY = src_offset0_len;
+
+  CUresult res = CUDA_CALL_NOCHECK (cuMemcpy2D, &data);
+  if (res == CUDA_ERROR_INVALID_VALUE)
+    /* If pitch > CU_DEVICE_ATTRIBUTE_MAX_PITCH or for device-to-device
+       for (some) memory not allocated by cuMemAllocPitch, cuMemcpy2D fails
+       with an error; try the slower cuMemcpy2DUnaligned now.  */
+    CUDA_CALL (cuMemcpy2DUnaligned, &data);
+  else if (res != CUDA_SUCCESS)
+    {
+      GOMP_PLUGIN_error ("cuMemcpy2D error: %s", cuda_error (res));
+      return false;
+    }
+  return true;
+}
+
+int
+GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
+		       size_t dim1_len, size_t dim0_len, void *dst,
+		       size_t dst_offset2_size, size_t dst_offset1_len,
+		       size_t dst_offset0_len, size_t dst_dim2_size,
+		       size_t dst_dim1_len, const void *src,
+		       size_t src_offset2_size, size_t src_offset1_len,
+		       size_t src_offset0_len, size_t src_dim2_size,
+		       size_t src_dim1_len)
+{
+  if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : dst_ord))
+    return false;
+
+  /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported.  */
+
+  CUDA_MEMCPY3D data;
+  data.WidthInBytes = dim2_size;
+  data.Height = dim1_len;
+  data.Depth = dim0_len;
+
+  if (dst_ord == -1)
+    {
+      data.dstMemoryType = CU_MEMORYTYPE_HOST;
+      data.dstHost = dst;
+    }
+  else
+    {
+      data.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+      data.dstDevice = (CUdeviceptr) dst;
+    }
+  data.dstPitch = dst_dim2_size;
+  data.dstHeight = dst_dim1_len;
+  data.dstXInBytes = dst_offset2_size;
+  data.dstY = dst_offset1_len;
+  data.dstZ = dst_offset0_len;
+  data.dstLOD = 0;
+
+  if (src_ord == -1)
+    {
+      data.srcMemoryType = CU_MEMORYTYPE_HOST;
+      data.srcHost = src;
+    }
+  else
+    {
+      data.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+      data.srcDevice = (CUdeviceptr) src;
+    }
+  data.srcPitch = src_dim2_size;
+  data.srcHeight = src_dim1_len;
+  data.srcXInBytes = src_offset2_size;
+  data.srcY = src_offset1_len;
+  data.srcZ = src_offset0_len;
+  data.srcLOD = 0;
+
+  CUDA_CALL (cuMemcpy3D, &data);
+  return true;
+}
+
 bool
 GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
 				     size_t n, struct goacc_asyncqueue *aq)
diff --git a/libgomp/target.c b/libgomp/target.c
index 80c25a16f1e..5cf2e8dce37 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -4526,7 +4526,8 @@  omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
 			       const size_t *dst_dimensions,
 			       const size_t *src_dimensions,
 			       struct gomp_device_descr *dst_devicep,
-			       struct gomp_device_descr *src_devicep)
+			       struct gomp_device_descr *src_devicep,
+			       size_t *tmp_size, void **tmp)
 {
   size_t dst_slice = element_size;
   size_t src_slice = element_size;
@@ -4539,36 +4540,120 @@  omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
 	  || __builtin_mul_overflow (element_size, dst_offsets[0], &dst_off)
 	  || __builtin_mul_overflow (element_size, src_offsets[0], &src_off))
 	return EINVAL;
-      if (dst_devicep == NULL && src_devicep == NULL)
-	{
-	  memcpy ((char *) dst + dst_off, (const char *) src + src_off,
-		  length);
-	  ret = 1;
-	}
-      else if (src_devicep == NULL)
-	ret = dst_devicep->host2dev_func (dst_devicep->target_id,
+      if (src_devicep != NULL && src_devicep == dst_devicep)
+	ret = src_devicep->dev2dev_func (src_devicep->target_id,
+					 (char *) dst + dst_off,
+					 (const char *) src + src_off,
+					 length);
+      else if (src_devicep != NULL
+	       && (dst_devicep == NULL
+		   || (dst_devicep->capabilities
+		       & GOMP_OFFLOAD_CAP_SHARED_MEM)))
+	ret = src_devicep->dev2host_func (src_devicep->target_id,
 					  (char *) dst + dst_off,
 					  (const char *) src + src_off,
 					  length);
-      else if (dst_devicep == NULL)
-	ret = src_devicep->dev2host_func (src_devicep->target_id,
+      else if (dst_devicep != NULL
+	       && (src_devicep == NULL
+		   || (src_devicep->capabilities
+			& GOMP_OFFLOAD_CAP_SHARED_MEM)))
+	ret = dst_devicep->host2dev_func (dst_devicep->target_id,
 					  (char *) dst + dst_off,
 					  (const char *) src + src_off,
 					  length);
+      else if (dst_devicep == NULL && src_devicep == NULL)
+	{
+	  memcpy ((char *) dst + dst_off, (const char *) src + src_off,
+		  length);
+	  ret = 1;
+	}
       else if (src_devicep == dst_devicep)
 	ret = src_devicep->dev2dev_func (src_devicep->target_id,
 					 (char *) dst + dst_off,
 					 (const char *) src + src_off,
 					 length);
       else
-	ret = 0;
+	{
+	  if (*tmp_size == 0)
+	    {
+	      *tmp_size = length;
+	      *tmp = malloc (length);
+	      if (*tmp == NULL)
+		return ENOMEM;
+	    }
+	  else if (*tmp_size < length)
+	    {
+	      *tmp_size = length;
+	      *tmp = realloc (*tmp, length);
+	      if (*tmp == NULL)
+		return ENOMEM;
+	    }
+	  ret = src_devicep->dev2host_func (src_devicep->target_id, *tmp,
+					    (const char *) src + src_off,
+					    length);
+	  if (ret == 1)
+	    ret = dst_devicep->host2dev_func (dst_devicep->target_id,
+					      (char *) dst + dst_off, *tmp,
+					      length);
+	}
       return ret ? 0 : EINVAL;
     }
 
-  /* FIXME: it would be nice to have some plugin function to handle
-     num_dims == 2 and num_dims == 3 more efficiently.  Larger ones can
-     be handled in the generic recursion below, and for host-host it
-     should be used even for any num_dims >= 2.  */
+  /* host->device, device->host and same-device device->device.  */
+  if (num_dims == 2
+      && ((src_devicep
+	   && src_devicep == dst_devicep
+	   && src_devicep->memcpy2d_func)
+	  || (!src_devicep != !dst_devicep
+	      && ((src_devicep && src_devicep->memcpy2d_func)
+		  || (dst_devicep && dst_devicep->memcpy2d_func)))))
+    {
+      size_t vol_sz1, dst_sz1, src_sz1, dst_off_sz1, src_off_sz1;
+      int dst_id = dst_devicep ? dst_devicep->target_id : -1;
+      int src_id = src_devicep ? src_devicep->target_id : -1;
+      struct gomp_device_descr *devp = dst_devicep ? dst_devicep : src_devicep;
+
+      if (__builtin_mul_overflow (volume[1], element_size, &vol_sz1)
+	  || __builtin_mul_overflow (dst_dimensions[1], element_size, &dst_sz1)
+	  || __builtin_mul_overflow (src_dimensions[1], element_size, &src_sz1)
+	  || __builtin_mul_overflow (dst_offsets[1], element_size, &dst_off_sz1)
+	  || __builtin_mul_overflow (src_offsets[1], element_size,
+				     &src_off_sz1))
+	return EINVAL;
+      ret = devp->memcpy2d_func (dst_id, src_id, vol_sz1, volume[0],
+				 dst, dst_off_sz1, dst_offsets[0], dst_sz1,
+				 src, src_off_sz1, src_offsets[0], src_sz1);
+      if (ret != -1)
+	return ret ? 0 : EINVAL;
+    }
+  else if (num_dims == 3
+	   && ((src_devicep
+		&& src_devicep == dst_devicep
+		&& src_devicep->memcpy3d_func)
+	       || (!src_devicep != !dst_devicep
+		   && ((src_devicep && src_devicep->memcpy3d_func)
+		       || (dst_devicep && dst_devicep->memcpy3d_func)))))
+    {
+      size_t vol_sz2, dst_sz2, src_sz2, dst_off_sz2, src_off_sz2;
+      int dst_id = dst_devicep ? dst_devicep->target_id : -1;
+      int src_id = src_devicep ? src_devicep->target_id : -1;
+      struct gomp_device_descr *devp = dst_devicep ? dst_devicep : src_devicep;
+
+      if (__builtin_mul_overflow (volume[2], element_size, &vol_sz2)
+	  || __builtin_mul_overflow (dst_dimensions[2], element_size, &dst_sz2)
+	  || __builtin_mul_overflow (src_dimensions[2], element_size, &src_sz2)
+	  || __builtin_mul_overflow (dst_offsets[2], element_size, &dst_off_sz2)
+	  || __builtin_mul_overflow (src_offsets[2], element_size,
+				     &src_off_sz2))
+	return EINVAL;
+      ret = devp->memcpy3d_func (dst_id, src_id, vol_sz2, volume[1], volume[0],
+				 dst, dst_off_sz2, dst_offsets[1],
+				 dst_offsets[0], dst_sz2, dst_dimensions[1],
+				 src, src_off_sz2, src_offsets[1],
+				 src_offsets[0], src_sz2, src_dimensions[1]);
+      if (ret != -1)
+	return ret ? 0 : EINVAL;
+    }
 
   for (i = 1; i < num_dims; i++)
     if (__builtin_mul_overflow (dst_slice, dst_dimensions[i], &dst_slice)
@@ -4585,7 +4670,7 @@  omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
 					   volume + 1, dst_offsets + 1,
 					   src_offsets + 1, dst_dimensions + 1,
 					   src_dimensions + 1, dst_devicep,
-					   src_devicep);
+					   src_devicep, tmp_size, tmp);
       if (ret)
 	return ret;
       dst_off += dst_slice;
@@ -4608,9 +4693,6 @@  omp_target_memcpy_rect_check (void *dst, const void *src, int dst_device_num,
   if (ret)
     return ret;
 
-  if (*src_devicep != NULL && *dst_devicep != NULL && *src_devicep != *dst_devicep)
-    return EINVAL;
-
   return 0;
 }
 
@@ -4624,18 +4706,36 @@  omp_target_memcpy_rect_copy (void *dst, const void *src,
 			     struct gomp_device_descr *dst_devicep,
 			     struct gomp_device_descr *src_devicep)
 {
-  if (src_devicep)
+  size_t tmp_size = 0;
+  void *tmp = NULL;
+  bool lock_src;
+  bool lock_dst;
+
+  lock_src = (src_devicep
+	      && (!dst_devicep
+		  || src_devicep == dst_devicep
+		  || !(src_devicep->capabilities
+		       & GOMP_OFFLOAD_CAP_SHARED_MEM)));
+  lock_dst = (dst_devicep
+	      && (!lock_src
+		  || (src_devicep != dst_devicep
+		      && !(dst_devicep->capabilities
+			   & GOMP_OFFLOAD_CAP_SHARED_MEM))));
+  if (lock_src)
     gomp_mutex_lock (&src_devicep->lock);
-  else if (dst_devicep)
+  if (lock_dst)
     gomp_mutex_lock (&dst_devicep->lock);
   int ret = omp_target_memcpy_rect_worker (dst, src, element_size, num_dims,
 					   volume, dst_offsets, src_offsets,
 					   dst_dimensions, src_dimensions,
-					   dst_devicep, src_devicep);
-  if (src_devicep)
+					   dst_devicep, src_devicep,
+					   &tmp_size, &tmp);
+  if (lock_src)
     gomp_mutex_unlock (&src_devicep->lock);
-  else if (dst_devicep)
+  if (lock_dst)
     gomp_mutex_unlock (&dst_devicep->lock);
+  if (tmp)
+    free (tmp);
 
   return ret;
 }
@@ -4976,6 +5076,8 @@  gomp_load_plugin_for_device (struct gomp_device_descr *device,
   DLSYM (free);
   DLSYM (dev2host);
   DLSYM (host2dev);
+  DLSYM (memcpy2d);
+  DLSYM (memcpy3d);
   device->capabilities = device->get_caps_func ();
   if (device->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
     {
diff --git a/libgomp/testsuite/libgomp.c/target-12.c b/libgomp/testsuite/libgomp.c/target-12.c
index b439e56577c..4fbfae19eda 100644
--- a/libgomp/testsuite/libgomp.c/target-12.c
+++ b/libgomp/testsuite/libgomp.c/target-12.c
@@ -80,12 +80,12 @@  main ()
       src_offsets[2] = 1;
       src_offsets[1] = 0;
       src_offsets[0] = 3;
-      dst_dimensions[2] = 2;
+      dst_dimensions[2] = 3;
       dst_dimensions[1] = 3;
       dst_dimensions[0] = 6;
       src_dimensions[2] = 3;
       src_dimensions[1] = 4;
-      src_dimensions[0] = 6;
+      src_dimensions[0] = 9;
       if (omp_target_memcpy_rect (p, q, sizeof (int), 3, volume,
 				  dst_offsets, src_offsets, dst_dimensions,
 				  src_dimensions, d, id) != 0)
@@ -98,7 +98,7 @@  main ()
 	for (j = 0; j < 6; j++)
 	  for (k = 0; k < 3; k++)
 	    for (l = 0; l < 2; l++)
-	      if (q[j * 6 + k * 2 + l] != 3 * 12 + 4 + 1 + l + k * 3 + j * 12)
+	      if (q[j * 9 + k * 3 + l] != 3 * 12 + 4 + 1 + l + k * 3 + j * 12)
 		err = 1;
       }
 
diff --git a/libgomp/testsuite/libgomp.fortran/target-12.f90 b/libgomp/testsuite/libgomp.fortran/target-12.f90
index 17c78f18f9b..eba4b62a18e 100644
--- a/libgomp/testsuite/libgomp.fortran/target-12.f90
+++ b/libgomp/testsuite/libgomp.fortran/target-12.f90
@@ -93,12 +93,12 @@  program main
     src_offsets(2) = 1
     src_offsets(1) = 0
     src_offsets(0) = 3
-    dst_dimensions(2) = 2
+    dst_dimensions(2) = 3
     dst_dimensions(1) = 3
     dst_dimensions(0) = 6
     src_dimensions(2) = 3
     src_dimensions(1) = 4
-    src_dimensions(0) = 6
+    src_dimensions(0) = 9
 
     if (omp_target_memcpy_rect (p, c_loc (q), sizeof (q(0)), 3, volume, &
                                 dst_offsets, src_offsets, dst_dimensions, &
@@ -112,7 +112,7 @@  program main
       do j = 0, 5
         do k = 0, 2
           do l = 0, 1
-            if (q(j * 6 + k * 2 + l) /= 3 * 12 + 4 + 1 + l + k * 3 + j * 12) &
+            if (q(j * 9 + k * 3 + l) /= 3 * 12 + 4 + 1 + l + k * 3 + j * 12) &
               err = .true.
           end do
         end do
diff --git a/libgomp/testsuite/libgomp.fortran/target-memcpy-rect-1.f90 b/libgomp/testsuite/libgomp.fortran/target-memcpy-rect-1.f90
new file mode 100644
index 00000000000..406c23b8408
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/target-memcpy-rect-1.f90
@@ -0,0 +1,531 @@ 
+program main
+  use iso_c_binding
+  use omp_lib
+  implicit none (type, external)
+
+  integer(c_size_t), parameter :: sizeof_int = 4
+  integer, parameter :: sk = c_size_t
+  logical, allocatable :: isshared(:)
+  integer, allocatable :: maxdim(:,:)
+  integer :: ndev
+
+  ndev = omp_get_num_devices()
+  call init_isshared
+  call init_maxdim
+
+  call one
+  call two
+  call three
+  call four
+
+  deallocate(isshared, maxdim)
+contains
+
+  subroutine init_maxdim
+    integer :: dev, dev2, r
+    integer(c_size_t), parameter :: nl = 0
+
+    allocate(maxdim(0:ndev,0:ndev))
+    do dev = 0, ndev
+      do dev2 = 0, ndev
+        r = omp_target_memcpy_rect (c_null_ptr, c_null_ptr, nl, &
+                                    num_dims=1_c_int, volume=[nl], &
+                                    dst_offsets=[nl], src_offsets=[nl], &
+                                    dst_dimensions=[nl], src_dimensions=[nl], &
+                                    dst_device_num=dev, src_device_num=omp_initial_device)
+        if (r < 3) stop 1             ! OpenMP requirement
+        if (r < huge(0_c_int)) stop 2 ! GCC implementation
+        maxdim(dev2,dev) = r
+      end do
+    end do
+  end subroutine
+
+  subroutine init_isshared
+    integer :: dev
+    logical :: dev_isshared
+
+    allocate(isshared(0:ndev))
+    do dev = 0, ndev
+      dev_isshared = .false.
+      !$omp target device(dev) map(to: dev_isshared)
+        dev_isshared = .true.
+      !$omp end target
+      isshared(dev) = dev_isshared
+    end do
+  end subroutine
+
+
+  subroutine one
+    integer(c_size_t), parameter :: N1 = 30
+    integer, target :: host_data(N1)
+    type(c_ptr) :: dev_cptr(0:ndev), cptr, tmp_cptr
+    integer :: dev, dev2, i, r
+
+    do dev = 0, ndev
+      dev_cptr(dev) = omp_target_alloc (N1*sizeof_int, dev)
+      if (.not. c_associated (dev_cptr(dev))) stop 11
+    end do
+
+    do i = 1, N1
+      host_data(i) = i
+    end do
+
+    ! copy full array host -> all devices + check value + set per-device value
+    do dev = 0, ndev
+      r = omp_target_memcpy_rect (dev_cptr(dev), c_loc(host_data), sizeof_int, &
+                                  num_dims=1_c_int, volume=[N1], &
+                                  dst_offsets=[0_sk], src_offsets=[0_sk], &
+                                  dst_dimensions=[N1], src_dimensions=[N1], &
+                                  dst_device_num=dev, src_device_num=omp_initial_device)
+      if (r /= 0) stop 12
+      cptr = dev_cptr(dev)
+      !$omp target device(dev) is_device_ptr(cptr)
+      block
+        integer, pointer, contiguous :: fptr(:)
+        call c_f_pointer(cptr, fptr, [N1])
+        do i = 1, N1
+           if (fptr(i) /= i) stop 13
+           fptr(i) = i*100 + 10000 * (dev+3)
+        end do
+      end block
+    end do
+
+    ! Test strided data - forth and back - same array sizes
+    do dev = 0, ndev
+      do dev2 = 0, ndev
+        tmp_cptr = omp_target_alloc (N1*sizeof_int, dev)
+        if (.not. c_associated (tmp_cptr)) stop 14
+
+        !$omp target device(dev) is_device_ptr(tmp_cptr)
+        block
+          integer, pointer, contiguous :: fptr(:)
+          call c_f_pointer(tmp_cptr, fptr, [N1])
+          do i = 1, N1
+            fptr(i) = i*100 + 10000*(dev+1)
+          end do
+        end block
+
+        if (N1-17 > N1 - max(12,13)) stop 18
+        r = omp_target_memcpy_rect (dev_cptr(dev2), tmp_cptr, sizeof_int, &
+                                    num_dims=1_c_int, volume=[N1-17], &
+                                    dst_offsets=[12_sk], src_offsets=[13_sk], &
+                                    dst_dimensions=[N1], src_dimensions=[N1], &
+                                    dst_device_num=dev2, src_device_num=dev)
+        if (r /= 0) stop 15
+
+        cptr = dev_cptr(dev2)
+        !$omp target device(dev2) is_device_ptr(cptr)
+        block
+          logical :: checked(N1)
+          integer, pointer, contiguous :: fptr(:)
+          call c_f_pointer(cptr, fptr, [N1])
+          checked = .false.
+          do i = 1, N1-17
+            if (fptr(i+12) /= (i+13)*100 + 10000 * (dev+1))  stop 16
+             checked(i+12) = .true.
+          end do
+          ! original device value
+          do i = 1, N1
+            if (.not. checked(i)) then
+              if (fptr(i) /= i*100 + 10000 * (dev2+3)) stop 17
+            end if
+          end do
+        end block
+        call omp_target_free (tmp_cptr, dev)
+      end do
+
+      ! reset to original value
+      do dev2 = 0, ndev
+        cptr = dev_cptr(dev2)
+        !$omp target device(dev2) is_device_ptr(cptr)
+        block
+          integer, pointer, contiguous :: fptr(:)
+          call c_f_pointer(cptr, fptr, [N1])
+          do i = 1, N1
+                fptr(i) = i*100 + 10000 * (dev2+3)
+          end do
+        end block
+      end do
+    end do
+
+    do dev = 0, ndev
+      call omp_target_free (dev_cptr(dev), dev)
+    end do
+  end subroutine
+
+
+  subroutine two
+    integer(c_size_t), parameter :: N = 10, M = 30
+    integer, target :: host_data(N,M)
+    type(c_ptr) :: dev_cptr(0:ndev), cptr, tmp_cptr
+    integer :: dev, dev2, i, j, r
+
+    do dev = 0, ndev
+      dev_cptr(dev) = omp_target_alloc (N*M*sizeof_int, dev)
+      if (.not. c_associated (dev_cptr(dev))) stop 21
+    end do
+
+    do i = 1, M
+      do j = 1, N
+        host_data(j,i) = i*100 + j
+      end do
+    end do
+
+    ! copy full array host -> all devices + check value + set per-device value
+    do dev = 0, ndev
+      r = omp_target_memcpy_rect (dev_cptr(dev), c_loc(host_data), sizeof_int, &
+                                  num_dims=2_c_int, volume=[M, N], &
+                                  dst_offsets=[0_sk, 0_sk], src_offsets=[0_sk, 0_sk], &
+                                  dst_dimensions=[M, N], src_dimensions=[M,N], &
+                                  dst_device_num=dev, src_device_num=omp_initial_device)
+      if (r /= 0) stop 22
+      cptr = dev_cptr(dev)
+      !$omp target device(dev) is_device_ptr(cptr)
+      block
+        integer, pointer, contiguous :: fptr(:,:)
+        call c_f_pointer(cptr, fptr, [N,M])
+        do i = 1, M
+          do j = 1, N
+            if (fptr(j,i) /= i*100 + j) stop 23
+            fptr(j,i) = i*100 + j + 1000 * dev
+          end do
+        end do
+      end block
+    end do
+
+    ! Test strided data - forth and back - same array sizes
+    do dev = 0, ndev
+      do dev2 = 0, ndev
+        tmp_cptr = omp_target_alloc (N*M*sizeof_int, dev)
+        if (.not. c_associated (tmp_cptr)) stop 24
+
+        !$omp target device(dev) is_device_ptr(tmp_cptr)
+        block
+          integer, pointer, contiguous :: fptr(:,:)
+          call c_f_pointer(tmp_cptr, fptr, [N,M])
+          do i = 1, M
+            do j = 1, N
+              fptr(j,i) = i*100 + j + 100000 * (dev+1)
+            end do
+          end do
+        end block
+
+        if (M-14 > M - max(5,2) &
+            .or. N-3 > N - max(2,1)) stop 28
+        r = omp_target_memcpy_rect (dev_cptr(dev2), tmp_cptr, sizeof_int, &
+                                    num_dims=2_c_int, volume=[M-14, N-3], &
+                                    dst_offsets=[5_sk, 3_sk], src_offsets=[2_sk, 1_sk], &
+                                    dst_dimensions=[M, N], src_dimensions=[M,N], &
+                                    dst_device_num=dev2, src_device_num=dev)
+        if (r /= 0) stop 25
+
+        cptr = dev_cptr(dev2)
+        !$omp target device(dev2) is_device_ptr(cptr)
+        block
+          logical :: checked(N,M)
+          integer, pointer, contiguous :: fptr(:,:)
+          call c_f_pointer(cptr, fptr, [N,M])
+          checked = .false.
+          do i = 1, M-14
+            do j = 1, N-3
+              if (fptr(j+3, i+5) /= (i+2)*100 + (j+1) + 100000 * (dev+1)) stop 26
+              checked(j+3, i+5) = .true.
+            end do
+          end do
+          ! original device value
+          do i = 1, M
+            do j = 1, N
+              if (.not. checked(j,i)) then
+                if (fptr(j,i) /= i*100 + j + 1000 * dev2) stop 27
+              end if
+            end do
+          end do
+        end block
+        call omp_target_free (tmp_cptr, dev)
+      end do
+
+      ! reset to original value
+      do dev2 = 0, ndev
+        cptr = dev_cptr(dev2)
+        !$omp target device(dev2) is_device_ptr(cptr)
+        block
+          integer, pointer, contiguous :: fptr(:,:)
+          call c_f_pointer(cptr, fptr, [N,M])
+          do i = 1, M
+            do j = 1, N
+              fptr(j,i) = i*100 + j + 1000 * dev2
+            end do
+          end do
+        end block
+      end do
+    end do
+
+    do dev = 0, ndev
+      call omp_target_free (dev_cptr(dev), dev)
+    end do
+  end subroutine
+
+
+  subroutine three
+    integer(c_size_t), parameter :: N1 = 10, N2 = 30, N3 = 15
+    integer, target :: host_data(N3,N2,N1)
+    type(c_ptr) :: dev_cptr(0:ndev), cptr, tmp_cptr
+    integer :: dev, dev2, i, j, k, r
+
+    do dev = 0, ndev
+      dev_cptr(dev) = omp_target_alloc (N1*N2*N3*sizeof_int, dev)
+      if (.not. c_associated (dev_cptr(dev))) stop 31
+    end do
+
+    do i = 1, N1
+      do j = 1, N2
+        do k = 1, N3
+          host_data(k, j,i) = i*1000 + 100*j + k
+        end do
+      end do
+    end do
+
+    ! copy full array host -> all devices + check value + set per-device value
+    do dev = 0, ndev
+      r = omp_target_memcpy_rect (dev_cptr(dev), c_loc(host_data), sizeof_int, &
+                                  num_dims=3_c_int, volume=[N1, N2, N3], &
+                                  dst_offsets=[0_sk, 0_sk, 0_sk], src_offsets=[0_sk, 0_sk, 0_sk], &
+                                  dst_dimensions=[N1, N2, N3], src_dimensions=[N1, N2, N3], &
+                                  dst_device_num=dev, src_device_num=omp_initial_device)
+      if (r /= 0) stop 32
+      cptr = dev_cptr(dev)
+      !$omp target device(dev) is_device_ptr(cptr)
+      block
+        integer, pointer, contiguous :: fptr(:,:,:)
+        call c_f_pointer(cptr, fptr, [N3,N2,N1])
+        do i = 1, N1
+          do j = 1, N2
+            do k = 1, N3
+              if (fptr(k, j,i) /= i*1000 + 100*j + k) stop 33
+              fptr(k,j,i) = i*1000 + 100*j + k + 1000 * dev
+            end do
+          end do
+        end do
+      end block
+    end do
+
+    ! Test strided data - forth and back - same array sizes
+    do dev = 0, ndev
+      do dev2 = 0, ndev
+        tmp_cptr = omp_target_alloc (N1*N2*N3*sizeof_int, dev)
+        if (.not. c_associated (tmp_cptr)) stop 34
+
+        !$omp target device(dev) is_device_ptr(tmp_cptr)
+        block
+          integer, pointer, contiguous :: fptr(:,:,:)
+          call c_f_pointer(tmp_cptr, fptr, [N3,N2,N1])
+          do i = 1, N1
+            do j = 1, N2
+              do k = 1, N3
+               fptr(k,j,i) = i*1000 + 100*j + k + 100000 * (dev+1)
+              end do
+            end do
+          end do
+        end block
+
+        if (N1-5 > N1 - max(5,2) &
+            .or. N2-13 > N2 - max(3,1) &
+            .or. N3-5  > N3 - max(2,4)) stop 38
+        r = omp_target_memcpy_rect (dev_cptr(dev2), tmp_cptr, sizeof_int, &
+                                    num_dims=3_c_int, volume=[N1-5, N2-13,N3-5], &
+                                    dst_offsets=[5_sk, 3_sk,2_sk], src_offsets=[2_sk, 1_sk,4_sk], &
+                                    dst_dimensions=[N1,N2,N3], src_dimensions=[N1,N2,N3], &
+                                    dst_device_num=dev2, src_device_num=dev)
+        if (r /= 0) stop 35
+
+        cptr = dev_cptr(dev2)
+        !$omp target device(dev2) is_device_ptr(cptr)
+        block
+          logical :: checked(N3,N2,N1)
+          integer, pointer, contiguous :: fptr(:,:,:)
+          call c_f_pointer(cptr, fptr, [N3,N2,N1])
+          checked = .false.
+          do i = 1, N1-5
+            do j = 1, N2-13
+              do k = 1, N3-5
+                if (fptr(k+2, j+3, i+5) /= (i+2)*1000 + 100*(j+1) + (k+4) + 100000 * (dev+1))  stop 36
+                checked(k+2, j+3, i+5) = .true.
+              end do
+            end do
+          end do
+          ! original device value
+          do i = 1, N1
+            do j = 1, N2
+              do k = 1, N3
+                if (.not. checked(k,j,i)) then
+                  if (fptr(k,j,i) /= i*1000 + 100*j + k + 1000 * dev2) stop 37
+                end if
+              end do
+            end do
+          end do
+        end block
+        call omp_target_free (tmp_cptr, dev)
+      end do
+
+      ! reset to original value
+      do dev2 = 0, ndev
+        cptr = dev_cptr(dev2)
+        !$omp target device(dev2) is_device_ptr(cptr)
+        block
+          integer, pointer, contiguous :: fptr(:,:,:)
+          call c_f_pointer(cptr, fptr, [N3,N2,N1])
+          do i = 1, N1
+            do j = 1, N2
+              do k = 1, N3
+                fptr(k,j,i) = i*1000 + 100*j + k + 1000 * dev2
+              end do
+            end do
+          end do
+        end block
+      end do
+    end do
+
+    do dev = 0, ndev
+      call omp_target_free (dev_cptr(dev), dev)
+    end do
+  end subroutine
+
+
+  subroutine four
+    integer(c_size_t), parameter :: N1 = 10, N2 = 30, N3 = 15, N4 = 25
+    integer, target :: host_data(N4, N3,N2,N1)
+    type(c_ptr) :: dev_cptr(0:ndev), cptr, tmp_cptr
+    integer :: dev, dev2, i, j, k, ll, r
+
+    do dev = 0, ndev
+      dev_cptr(dev) = omp_target_alloc (N1*N2*N3*N4*sizeof_int, dev)
+      if (.not. c_associated (dev_cptr(dev))) stop 41
+    end do
+
+    do i = 1, N1
+      do j = 1, N2
+        do k = 1, N3
+          do ll = 1, N4
+            host_data(ll, k, j,i) = i*1000 + 100*j + k*10 + ll
+          end do
+        end do
+      end do
+    end do
+
+    ! copy full array host -> all devices + check value + set per-device value
+    do dev = 0, ndev
+      r = omp_target_memcpy_rect (dev_cptr(dev), c_loc(host_data), sizeof_int, &
+                                  num_dims=4_c_int, volume=[N1, N2, N3, N4], &
+                                  dst_offsets=[0_sk, 0_sk, 0_sk, 0_sk], src_offsets=[0_sk, 0_sk, 0_sk, 0_sk], &
+                                  dst_dimensions=[N1, N2, N3, N4], src_dimensions=[N1, N2, N3, N4], &
+                                  dst_device_num=dev, src_device_num=omp_initial_device)
+      if (r /= 0) stop 42
+      cptr = dev_cptr(dev)
+      !$omp target device(dev) is_device_ptr(cptr)
+      block
+        integer, pointer, contiguous :: fptr(:,:,:,:)
+        call c_f_pointer(cptr, fptr, [N4,N3,N2,N1])
+        do i = 1, N1
+          do j = 1, N2
+            do k = 1, N3
+              do ll = 1, N4
+                if (fptr(ll, k, j,i) /= i*1000 + 100*j + k*10 + ll) stop 43
+                fptr(ll,k,j,i) = i*1000 + 100*j + k*10 + ll + 1000 * dev
+              end do
+            end do
+          end do
+        end do
+      end block
+    end do
+
+    ! Test strided data - forth and back - same array sizes
+    do dev = 0, ndev
+      do dev2 = 0, ndev
+        tmp_cptr = omp_target_alloc (N1*N2*N3*N4*sizeof_int, dev)
+        if (.not. c_associated (tmp_cptr)) stop 44
+
+        !$omp target device(dev) is_device_ptr(tmp_cptr)
+        block
+          integer, pointer, contiguous :: fptr(:,:,:,:)
+          call c_f_pointer(tmp_cptr, fptr, [N4,N3,N2,N1])
+          do i = 1, N1
+            do j = 1, N2
+              do k = 1, N3
+                do ll = 1, N4
+                  fptr(ll,k,j,i) = i*1000 + 100*j + k*10 + ll + 100000 * (dev+1)
+                end do
+              end do
+            end do
+          end do
+        end block
+
+        if (N1-5 > N1 - max(5,2) &
+            .or. N2-13 > N2 - max(3,1) &
+            .or. N3-5  > N3 - max(2,4) &
+            .or. N4-11  > N4 - max(7,5)) stop 48
+        r = omp_target_memcpy_rect (dev_cptr(dev2), tmp_cptr, sizeof_int, &
+                                    num_dims=4_c_int, volume=[N1-5, N2-13,N3-5,N4-11], &
+                                    dst_offsets=[5_sk, 3_sk,2_sk,7_sk], src_offsets=[2_sk, 1_sk,4_sk,5_sk], &
+                                    dst_dimensions=[N1,N2,N3,N4], src_dimensions=[N1,N2,N3,N4], &
+                                    dst_device_num=dev2, src_device_num=dev)
+        if (r /= 0) stop 45
+
+        cptr = dev_cptr(dev2)
+        !$omp target device(dev2) is_device_ptr(cptr)
+        block
+          logical, allocatable :: checked(:,:,:,:) ! allocatble to reduce stack size
+          integer, pointer, contiguous :: fptr(:,:,:,:)
+          call c_f_pointer(cptr, fptr, [N4,N3,N2,N1])
+          allocate (checked(N4,N3,N2,N1), source=.false.)
+          do i = 1, N1-5
+            do j = 1, N2-13
+              do k = 1, N3-5
+                do ll = 1, N4-11
+                  if (fptr(ll+7, k+2, j+3, i+5) /= (i+2)*1000 + 100*(j+1) + (k+4)*10 + ll+5 + 100000 * (dev+1))  stop 46
+                  checked(ll+7, k+2, j+3, i+5) = .true.
+                end do
+              end do
+            end do
+          end do
+          ! original device value
+          do i = 1, N1
+            do j = 1, N2
+              do k = 1, N3
+                do ll = 1, N4
+                  if (.not. checked(ll,k,j,i)) then
+                    if (fptr(ll,k,j,i) /= i*1000 + 100*j + k*10 + ll + 1000 * dev2) stop 47
+                  end if
+                end do
+              end do
+            end do
+          end do
+          deallocate (checked)
+        end block
+        call omp_target_free (tmp_cptr, dev)
+      end do
+
+      ! reset to original value
+      do dev2 = 0, ndev
+        cptr = dev_cptr(dev2)
+        !$omp target device(dev2) is_device_ptr(cptr)
+        block
+          integer, pointer, contiguous :: fptr(:,:,:,:)
+          call c_f_pointer(cptr, fptr, [N4,N3,N2,N1])
+          do i = 1, N1
+            do j = 1, N2
+              do k = 1, N3
+                do ll = 1, N4
+                  fptr(ll,k,j,i) = i*1000 + 100*j + k*10 + ll + 1000 * dev2
+                end do
+              end do
+            end do
+          end do
+        end block
+      end do
+    end do
+
+    do dev = 0, ndev
+      call omp_target_free (dev_cptr(dev), dev)
+    end do
+  end subroutine
+end program