diff mbox series

[1/5] OpenMP, NVPTX: memcpy[23]D bias correction

Message ID c83c9f9f05bf5577eeaf3633c5c2e494ac0a11fd.1693991759.git.julian@codesourcery.com
State New
Headers show
Series OpenMP: Array-shaping operator and strided/rectangular 'target update' support | expand

Commit Message

Julian Brown Sept. 6, 2023, 9:34 a.m. UTC
This patch works around behaviour of the 2D and 3D memcpy operations in
the CUDA driver runtime.  Particularly in Fortran, the "base pointer"
of an array (used for either source or destination of a host/device copy)
may lie outside of data that is actually stored on the device.  The fix
is to make sure that we use the first element of data to be transferred
instead, and adjust parameters accordingly.

2023-09-05  Julian Brown  <julian@codesourcery.com>

libgomp/
	* plugin/plugin-nvptx.c (GOMP_OFFLOAD_memcpy2d): Adjust parameters to
	avoid out-of-bounds array checks in CUDA runtime.
	(GOMP_OFFLOAD_memcpy3d): Likewise.
---
 libgomp/plugin/plugin-nvptx.c | 67 +++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

Comments

Thomas Schwinge Sept. 26, 2023, 10:57 p.m. UTC | #1
Hi Julian!

On 2023-09-06T02:34:30-0700, Julian Brown <julian@codesourcery.com> wrote:
> This patch works around behaviour of the 2D and 3D memcpy operations in
> the CUDA driver runtime.  Particularly in Fortran, the "base pointer"
> of an array (used for either source or destination of a host/device copy)
> may lie outside of data that is actually stored on the device.  The fix
> is to make sure that we use the first element of data to be transferred
> instead, and adjust parameters accordingly.

Do you (a) have a stand-alone test case for this (that is, not depending
on your other pending patches, so that this could go in directly --
together with the before-FAIL test case).  Do you (b) know if is this a
bug in our use of the CUDA Driver API or rather in CUDA itself?  If the
latter, have you reported this to Nvidia?

(I didn't quickly understand
<https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g27f885b30c34cc20a663a671dbf6fc27>
"cuMemcpy2D" etc.)


Grüße
 Thomas


> 2023-09-05  Julian Brown  <julian@codesourcery.com>
>
> libgomp/
>       * plugin/plugin-nvptx.c (GOMP_OFFLOAD_memcpy2d): Adjust parameters to
>       avoid out-of-bounds array checks in CUDA runtime.
>       (GOMP_OFFLOAD_memcpy3d): Likewise.
> ---
>  libgomp/plugin/plugin-nvptx.c | 67 +++++++++++++++++++++++++++++++++++
>  1 file changed, 67 insertions(+)
>
> diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
> index 00d4241ae02b..cefe288a8aab 100644
> --- a/libgomp/plugin/plugin-nvptx.c
> +++ b/libgomp/plugin/plugin-nvptx.c
> @@ -1827,6 +1827,35 @@ GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
>    data.srcXInBytes = src_offset1_size;
>    data.srcY = src_offset0_len;
>
> +  if (data.srcXInBytes != 0 || data.srcY != 0)
> +    {
> +      /* Adjust origin to the actual array data, else the CUDA 2D memory
> +      copy API calls below may fail to validate source/dest pointers
> +      correctly (especially for Fortran where the "virtual origin" of an
> +      array is often outside the stored data).  */
> +      if (src_ord == -1)
> +     data.srcHost = (const void *) ((const char *) data.srcHost
> +                                   + data.srcY * data.srcPitch
> +                                   + data.srcXInBytes);
> +      else
> +     data.srcDevice += data.srcY * data.srcPitch + data.srcXInBytes;
> +      data.srcXInBytes = 0;
> +      data.srcY = 0;
> +    }
> +
> +  if (data.dstXInBytes != 0 || data.dstY != 0)
> +    {
> +      /* As above.  */
> +      if (dst_ord == -1)
> +     data.dstHost = (void *) ((char *) data.dstHost
> +                              + data.dstY * data.dstPitch
> +                              + data.dstXInBytes);
> +      else
> +     data.dstDevice += data.dstY * data.dstPitch + data.dstXInBytes;
> +      data.dstXInBytes = 0;
> +      data.dstY = 0;
> +    }
> +
>    CUresult res = CUDA_CALL_NOCHECK (cuMemcpy2D, &data);
>    if (res == CUDA_ERROR_INVALID_VALUE)
>      /* If pitch > CU_DEVICE_ATTRIBUTE_MAX_PITCH or for device-to-device
> @@ -1895,6 +1924,44 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
>    data.srcY = src_offset1_len;
>    data.srcZ = src_offset0_len;
>
> +  if (data.srcXInBytes != 0 || data.srcY != 0 || data.srcZ != 0)
> +    {
> +      /* Adjust origin to the actual array data, else the CUDA 3D memory
> +      copy API call below may fail to validate source/dest pointers
> +      correctly (especially for Fortran where the "virtual origin" of an
> +      array is often outside the stored data).  */
> +      if (src_ord == -1)
> +     data.srcHost
> +       = (const void *) ((const char *) data.srcHost
> +                         + (data.srcZ * data.srcHeight + data.srcY)
> +                           * data.srcPitch
> +                         + data.srcXInBytes);
> +      else
> +     data.srcDevice
> +       += (data.srcZ * data.srcHeight + data.srcY) * data.srcPitch
> +          + data.srcXInBytes;
> +      data.srcXInBytes = 0;
> +      data.srcY = 0;
> +      data.srcZ = 0;
> +    }
> +
> +  if (data.dstXInBytes != 0 || data.dstY != 0 || data.dstZ != 0)
> +    {
> +      /* As above.  */
> +      if (dst_ord == -1)
> +     data.dstHost = (void *) ((char *) data.dstHost
> +                              + (data.dstZ * data.dstHeight + data.dstY)
> +                                * data.dstPitch
> +                              + data.dstXInBytes);
> +      else
> +     data.dstDevice
> +       += (data.dstZ * data.dstHeight + data.dstY) * data.dstPitch
> +          + data.dstXInBytes;
> +      data.dstXInBytes = 0;
> +      data.dstY = 0;
> +      data.dstZ = 0;
> +    }
> +
>    CUDA_CALL (cuMemcpy3D, &data);
>    return true;
>  }
> --
> 2.41.0
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
Julian Brown Oct. 2, 2023, 2:53 p.m. UTC | #2
On Wed, 27 Sep 2023 00:57:58 +0200
Thomas Schwinge <thomas@codesourcery.com> wrote:

> On 2023-09-06T02:34:30-0700, Julian Brown <julian@codesourcery.com>
> wrote:
> > This patch works around behaviour of the 2D and 3D memcpy
> > operations in the CUDA driver runtime.  Particularly in Fortran,
> > the "base pointer" of an array (used for either source or
> > destination of a host/device copy) may lie outside of data that is
> > actually stored on the device.  The fix is to make sure that we use
> > the first element of data to be transferred instead, and adjust
> > parameters accordingly.  
> 
> Do you (a) have a stand-alone test case for this (that is, not
> depending on your other pending patches, so that this could go in
> directly -- together with the before-FAIL test case).

Thanks for the reply! Here's a version with a stand-alone test case.

> Do you (b)
> know if is this a bug in our use of the CUDA Driver API or rather in
> CUDA itself?  If the latter, have you reported this to Nvidia?

I don't think the CUDA behaviour is *wrong*, as such -- at least to the
C/C++ way of thinking (or indeed a graphics-oriented way of thinking),
one would normally think of an array as having a zero-based origin, and
these 2D/3D memory copies would be intended as a way of updating just a
part of an array (or texture) that has full duplicate copies on both
the host and device.  Our use-case just happens to be a bit different,
both because Fortran (internally) represents an array by a zero-based
origin but may use 1-based (or whatever-based) indices, and because we
support partial mappings of host arrays on the device in all three
supported languages -- which amounts to much the same thing, actually.

That said, it *could* be fixed in CUDA, though probably not in all the
versions currently deployed out there in the world.  So I guess we'd
still need a patch like this anyway.

Julian
Tobias Burnus Dec. 19, 2023, 8:45 p.m. UTC | #3
Hi Julian & Thomas,

the patch LGTM - and seemingly also Thomas is somewhat fine with it -
and it includes the stand-alone testcase.

* * *

I guess, you don't know the answer to Thomas question, i.e. whether
that's a bug in CUDA or in our use of the CUDA API?

CUDA's spec itself,
https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html has
for cuMemcpy2D

‎  void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);

and for cuMemcpy3D

‎  void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes);

Thus, I assume we use it "properly", except that the CUDA writers
probably assumed that one allocates a big chunk of memory and work with
that memory and not just maps a subset.

This might or might not be stated in the manual in the following:
"Memory regions spanning over allocations that are both registered and
not registered with CUDA are not supported and will return
CUDA_ERROR_INVALID_VALUE." – where the question is whether everything
until 'start' really counts as "spanning".

Tobias


On 02.10.23 16:53, Julian Brown wrote:
> On Wed, 27 Sep 2023 00:57:58 +0200
> Thomas Schwinge <thomas@codesourcery.com> wrote:
>
>> On 2023-09-06T02:34:30-0700, Julian Brown <julian@codesourcery.com>
>> wrote:
>>> This patch works around behaviour of the 2D and 3D memcpy
>>> operations in the CUDA driver runtime.  Particularly in Fortran,
>>> the "base pointer" of an array (used for either source or
>>> destination of a host/device copy) may lie outside of data that is
>>> actually stored on the device.  The fix is to make sure that we use
>>> the first element of data to be transferred instead, and adjust
>>> parameters accordingly.
>> Do you (a) have a stand-alone test case for this (that is, not
>> depending on your other pending patches, so that this could go in
>> directly -- together with the before-FAIL test case).
> Thanks for the reply! Here's a version with a stand-alone test case.
>
>> Do you (b)
>> know if is this a bug in our use of the CUDA Driver API or rather in
>> CUDA itself?  If the latter, have you reported this to Nvidia?
> I don't think the CUDA behaviour is *wrong*, as such -- at least to the
> C/C++ way of thinking (or indeed a graphics-oriented way of thinking),
> one would normally think of an array as having a zero-based origin, and
> these 2D/3D memory copies would be intended as a way of updating just a
> part of an array (or texture) that has full duplicate copies on both
> the host and device.  Our use-case just happens to be a bit different,
> both because Fortran (internally) represents an array by a zero-based
> origin but may use 1-based (or whatever-based) indices, and because we
> support partial mappings of host arrays on the device in all three
> supported languages -- which amounts to much the same thing, actually.
>
> That said, it *could* be fixed in CUDA, though probably not in all the
> versions currently deployed out there in the world.  So I guess we'd
> still need a patch like this anyway.
>
> Julian
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
diff mbox series

Patch

diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 00d4241ae02b..cefe288a8aab 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -1827,6 +1827,35 @@  GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
   data.srcXInBytes = src_offset1_size;
   data.srcY = src_offset0_len;
 
+  if (data.srcXInBytes != 0 || data.srcY != 0)
+    {
+      /* Adjust origin to the actual array data, else the CUDA 2D memory
+	 copy API calls below may fail to validate source/dest pointers
+	 correctly (especially for Fortran where the "virtual origin" of an
+	 array is often outside the stored data).  */
+      if (src_ord == -1)
+	data.srcHost = (const void *) ((const char *) data.srcHost
+				      + data.srcY * data.srcPitch
+				      + data.srcXInBytes);
+      else
+	data.srcDevice += data.srcY * data.srcPitch + data.srcXInBytes;
+      data.srcXInBytes = 0;
+      data.srcY = 0;
+    }
+
+  if (data.dstXInBytes != 0 || data.dstY != 0)
+    {
+      /* As above.  */
+      if (dst_ord == -1)
+	data.dstHost = (void *) ((char *) data.dstHost
+				 + data.dstY * data.dstPitch
+				 + data.dstXInBytes);
+      else
+	data.dstDevice += data.dstY * data.dstPitch + data.dstXInBytes;
+      data.dstXInBytes = 0;
+      data.dstY = 0;
+    }
+
   CUresult res = CUDA_CALL_NOCHECK (cuMemcpy2D, &data);
   if (res == CUDA_ERROR_INVALID_VALUE)
     /* If pitch > CU_DEVICE_ATTRIBUTE_MAX_PITCH or for device-to-device
@@ -1895,6 +1924,44 @@  GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
   data.srcY = src_offset1_len;
   data.srcZ = src_offset0_len;
 
+  if (data.srcXInBytes != 0 || data.srcY != 0 || data.srcZ != 0)
+    {
+      /* Adjust origin to the actual array data, else the CUDA 3D memory
+	 copy API call below may fail to validate source/dest pointers
+	 correctly (especially for Fortran where the "virtual origin" of an
+	 array is often outside the stored data).  */
+      if (src_ord == -1)
+	data.srcHost
+	  = (const void *) ((const char *) data.srcHost
+			    + (data.srcZ * data.srcHeight + data.srcY)
+			      * data.srcPitch
+			    + data.srcXInBytes);
+      else
+	data.srcDevice
+	  += (data.srcZ * data.srcHeight + data.srcY) * data.srcPitch
+	     + data.srcXInBytes;
+      data.srcXInBytes = 0;
+      data.srcY = 0;
+      data.srcZ = 0;
+    }
+
+  if (data.dstXInBytes != 0 || data.dstY != 0 || data.dstZ != 0)
+    {
+      /* As above.  */
+      if (dst_ord == -1)
+	data.dstHost = (void *) ((char *) data.dstHost
+				 + (data.dstZ * data.dstHeight + data.dstY)
+				   * data.dstPitch
+				 + data.dstXInBytes);
+      else
+	data.dstDevice
+	  += (data.dstZ * data.dstHeight + data.dstY) * data.dstPitch
+	     + data.dstXInBytes;
+      data.dstXInBytes = 0;
+      data.dstY = 0;
+      data.dstZ = 0;
+    }
+
   CUDA_CALL (cuMemcpy3D, &data);
   return true;
 }