diff mbox

[fortran] Implement blocked eoshift for eoshift0

Message ID 7f484ee3-4cbf-4373-8f5f-d30197e1e7c6@netcologne.de
State New
Headers show

Commit Message

Thomas Koenig July 1, 2017, 1:48 p.m. UTC
Hello world,

the attached patch implements the blocked algorithm for
constant shift for dim > 1 for eoshift0 (which handles
the case of constant shift and constant fill value).

Speedup, as for cshift, is large.  Moving a 500*500*500
array by -3 with eo_bench.f90 (also attached):

$ gfortran -O3 eo_bench.f90 && ./a.out
  dim =            1  t =   0.451796889
  dim =            2  t =   0.183514118
  dim =            3  t =   0.184015989
$ gfortran-7 -static-libgfortran -O3 eo_bench.f90 && ./a.out
  dim =            1  t =   0.955736041
  dim =            2  t =    1.42228103
  dim =            3  t =    3.00043702

Regression-tested.  OK for trunk?

Regards

	Thomas

2017-07-01  Thomas Koenig  <tkoenig@gcc.gnu.org>

         * intrinsics/eoshift0.c:  For contiguous arrays, use
         block algorithm.  Use memcpy where possible.

2017-07-01  Thomas Koenig  <tkoenig@gcc.gnu.org>

         * gfortran/eoshift_3.f90:  New test.

Comments

Paul Richard Thomas July 2, 2017, 11:53 a.m. UTC | #1
Hi Thomas,

The timings are impressive! OK for trunk.

Thanks

Paul

On 1 July 2017 at 14:48, Thomas Koenig <tkoenig@netcologne.de> wrote:
> Hello world,
>
> the attached patch implements the blocked algorithm for
> constant shift for dim > 1 for eoshift0 (which handles
> the case of constant shift and constant fill value).
>
> Speedup, as for cshift, is large.  Moving a 500*500*500
> array by -3 with eo_bench.f90 (also attached):
>
> $ gfortran -O3 eo_bench.f90 && ./a.out
>  dim =            1  t =   0.451796889
>  dim =            2  t =   0.183514118
>  dim =            3  t =   0.184015989
> $ gfortran-7 -static-libgfortran -O3 eo_bench.f90 && ./a.out
>  dim =            1  t =   0.955736041
>  dim =            2  t =    1.42228103
>  dim =            3  t =    3.00043702
>
> Regression-tested.  OK for trunk?
>
> Regards
>
>         Thomas
>
> 2017-07-01  Thomas Koenig  <tkoenig@gcc.gnu.org>
>
>         * intrinsics/eoshift0.c:  For contiguous arrays, use
>         block algorithm.  Use memcpy where possible.
>
> 2017-07-01  Thomas Koenig  <tkoenig@gcc.gnu.org>
>
>         * gfortran/eoshift_3.f90:  New test.
diff mbox

Patch

Index: intrinsics/eoshift0.c
===================================================================
--- intrinsics/eoshift0.c	(Revision 249632)
+++ intrinsics/eoshift0.c	(Arbeitskopie)
@@ -53,7 +53,8 @@ 
   index_type len;
   index_type n;
   index_type arraysize;
-
+  bool do_blocked;
+  
   /* The compiler cannot figure out that these are set, initialize
      them to avoid warnings.  */
   len = 0;
@@ -102,39 +103,94 @@ 
   count[0] = 0;
   sstride[0] = -1;
   rstride[0] = -1;
+
+  if (which > 0)
+    {
+      /* Test if both ret and array are contiguous.  */
+      size_t r_ex, a_ex;
+      r_ex = 1;
+      a_ex = 1;
+      do_blocked = true;
+      dim = GFC_DESCRIPTOR_RANK (array);
+      for (n = 0; n < dim; n ++)
+	{
+	  index_type rs, as;
+	  rs = GFC_DESCRIPTOR_STRIDE (ret, n);
+	  if (rs != r_ex)
+	    {
+	      do_blocked = false;
+	      break;
+	    }
+	  as = GFC_DESCRIPTOR_STRIDE (array, n);
+	  if (as != a_ex)
+	    {
+	      do_blocked = false;
+	      break;
+	    }
+	  r_ex *= GFC_DESCRIPTOR_EXTENT (ret, n);
+	  a_ex *= GFC_DESCRIPTOR_EXTENT (array, n);
+	}
+    }
+  else
+    do_blocked = false;
+
   n = 0;
-  for (dim = 0; dim < GFC_DESCRIPTOR_RANK (array); dim++)
+
+  if (do_blocked)
     {
-      if (dim == which)
-        {
-          roffset = GFC_DESCRIPTOR_STRIDE_BYTES(ret,dim);
-          if (roffset == 0)
-            roffset = size;
-          soffset = GFC_DESCRIPTOR_STRIDE_BYTES(array,dim);
-          if (soffset == 0)
-            soffset = size;
-          len = GFC_DESCRIPTOR_EXTENT(array,dim);
-        }
-      else
-        {
-          count[n] = 0;
-          extent[n] = GFC_DESCRIPTOR_EXTENT(array,dim);
-          rstride[n] = GFC_DESCRIPTOR_STRIDE_BYTES(ret,dim);
-          sstride[n] = GFC_DESCRIPTOR_STRIDE_BYTES(array,dim);
-          n++;
-        }
+      /* For contiguous arrays, use the relationship that
+
+         dimension(n1,n2,n3) :: a, b
+	 b = eoshift(a,sh,3)
+
+         can be dealt with as if
+
+	 dimension(n1*n2*n3) :: an, bn
+	 bn = eoshift(a,sh*n1*n2,1)
+
+	 so a block move can be used for dim>1.  */
+      len = GFC_DESCRIPTOR_STRIDE(array, which)
+	* GFC_DESCRIPTOR_EXTENT(array, which);
+      shift *= GFC_DESCRIPTOR_STRIDE(array, which);
+      roffset = size;
+      soffset = size;
+      for (dim = which + 1; dim < GFC_DESCRIPTOR_RANK (array); dim++)
+	{
+	  count[n] = 0;
+	  extent[n] = GFC_DESCRIPTOR_EXTENT(array,dim);
+	  rstride[n] = GFC_DESCRIPTOR_STRIDE_BYTES(ret,dim);
+	  sstride[n] = GFC_DESCRIPTOR_STRIDE_BYTES(array,dim);
+	  n++;
+	}
+      count[n] = 0;
+      dim = GFC_DESCRIPTOR_RANK (array) - which;
     }
-  if (sstride[0] == 0)
-    sstride[0] = size;
-  if (rstride[0] == 0)
-    rstride[0] = size;
+  else
+    {
+      for (dim = 0; dim < GFC_DESCRIPTOR_RANK (array); dim++)
+	{
+	  if (dim == which)
+	    {
+	      roffset = GFC_DESCRIPTOR_STRIDE_BYTES(ret,dim);
+	      if (roffset == 0)
+		roffset = size;
+	      soffset = GFC_DESCRIPTOR_STRIDE_BYTES(array,dim);
+	      if (soffset == 0)
+		soffset = size;
+	      len = GFC_DESCRIPTOR_EXTENT(array,dim);
+	    }
+	  else
+	    {
+	      count[n] = 0;
+	      extent[n] = GFC_DESCRIPTOR_EXTENT(array,dim);
+	      rstride[n] = GFC_DESCRIPTOR_STRIDE_BYTES(ret,dim);
+	      sstride[n] = GFC_DESCRIPTOR_STRIDE_BYTES(array,dim);
+	      n++;
+	    }
+	}
+      dim = GFC_DESCRIPTOR_RANK (array);
+    }
 
-  dim = GFC_DESCRIPTOR_RANK (array);
-  rstride0 = rstride[0];
-  sstride0 = sstride[0];
-  rptr = ret->base_addr;
-  sptr = array->base_addr;
-
   if ((shift >= 0 ? shift : -shift) > len)
     {
       shift = len;
@@ -148,6 +204,11 @@ 
 	len = len + shift;
     }
 
+  rstride0 = rstride[0];
+  sstride0 = sstride[0];
+  rptr = ret->base_addr;
+  sptr = array->base_addr;
+
   while (rptr)
     {
       /* Do the shift for this dimension.  */
@@ -161,12 +222,23 @@ 
           src = sptr;
           dest = &rptr[-shift * roffset];
         }
-      for (n = 0; n < len; n++)
-        {
-          memcpy (dest, src, size);
-          dest += roffset;
-          src += soffset;
-        }
+      /* If the elements are contiguous, perform a single block move.  */
+
+      if (soffset == size && roffset == size)
+	{
+	  size_t chunk = size * len;
+	  memcpy (dest, src, chunk);
+	  dest += chunk;
+	}
+      else
+	{
+	  for (n = 0; n < len; n++)
+	    {
+	      memcpy (dest, src, size);
+	      dest += roffset;
+	      src += soffset;
+	    }
+	}
       if (shift >= 0)
         {
           n = shift;