diff mbox series

[fortran,RFC] Interchange indices for FORALL and DO CONCURRENT if profitable

Message ID 6f2efdb3-c45d-18c7-0b0e-89e91ab32eb4@netcologne.de
State New
Headers show
Series [fortran,RFC] Interchange indices for FORALL and DO CONCURRENT if profitable | expand

Commit Message

Thomas Koenig Oct. 27, 2017, 10:03 p.m. UTC
Hello world,

this is a draft patch which interchanges the indices for FORALL and
DO CONCURRENT loops for cases like PR 82471, where code like

   DO CONCURRENT( K=1:N, J=1:M, I=1:L)
      C(I,J,K) = A(I,J,K) + B(I,J,K)
   END DO

led to very poor code because of stride issues.  Currently,
Graphite is not able to do this.

Without the patch, the code above is translated as


         i.7 = 1;
         count.10 = 512;
         while (1)
           {
             if (ANNOTATE_EXPR <count.10 <= 0, ivdep>) goto L.4;
             j.6 = 1;
             count.9 = 512;
             while (1)
               {
                 if (ANNOTATE_EXPR <count.9 <= 0, ivdep>) goto L.3;
                 k.5 = 1;
                 count.8 = 512;
                 while (1)
                   {
                     if (ANNOTATE_EXPR <count.8 <= 0, ivdep>) goto L.2;
                     (*(real(kind=4)[0:] * restrict) c.data)[((c.offset 
+ (integer(kind=8)) k.5 * c.dim[2].stride) + (integer(kind=8)) j.6 * 
c.dim[1].stride) + (integer(kind=8)) i.7] = (*(real(kind=4)[0:] * 
restrict) a.data)[((a.offset + (integer(kind=8)) k.5 * a.dim[2].stride) 
+ (integer(kind=8)) j.6 * a.dim[1].stride) + (integer(kind=8)) i.7] + 
(*(real(kind=4)[0:] * restrict) b.data)[((b.offset + (integer(kind=8)) 
k.5 * b.dim[2].stride) + (integer(kind=8)) j.6 * b.dim[1].stride) + 
(integer(kind=8)) i.7];
                     L.1:;
                     k.5 = k.5 + 1;
                     count.8 = count.8 + -1;
                   }
                 L.2:;
                 j.6 = j.6 + 1;
                 count.9 = count.9 + -1;
               }
             L.3:;
             i.7 = i.7 + 1;
             count.10 = count.10 + -1;
           }
         L.4:;

so the innermost loop has the biggest stride. With the patch
(and front-end optimization turned on), this is turned into

         k.7 = 1;
         count.10 = 512;
         while (1)
           {
             if (ANNOTATE_EXPR <count.10 <= 0, ivdep>) goto L.4;
             j.6 = 1;
             count.9 = 512;
             while (1)
               {
                 if (ANNOTATE_EXPR <count.9 <= 0, ivdep>) goto L.3;
                 i.5 = 1;
                 count.8 = 512;
                 while (1)
                   {
                     if (ANNOTATE_EXPR <count.8 <= 0, ivdep>) goto L.2;
                     (*(real(kind=4)[0:] * restrict) c.data)[((c.offset 
+ (integer(kind=8)) k.7 * c.dim[2].stride) + (integer(kind=8)) j.6 * 
c.dim[1].stride) + (integer(kind=8)) i.5] = (*(real(kind=4)[0:] * 
restrict) a.data)[((a.offset + (integer(kind=8)) k.7 * a.dim[2].stride) 
+ (integer(kind=8)) j.6 * a.dim[1].stride) + (integer(kind=8)) i.5] + 
(*(real(kind=4)[0:] * restrict) b.data)[((b.offset + (integer(kind=8)) 
k.7 * b.dim[2].stride) + (integer(kind=8)) j.6 * b.dim[1].stride) + 
(integer(kind=8)) i.5];
                     L.1:;
                     i.5 = i.5 + 1;
                     count.8 = count.8 + -1;
                   }
                 L.2:;
                 j.6 = j.6 + 1;
                 count.9 = count.9 + -1;
               }
             L.3:;
             k.7 = k.7 + 1;
             count.10 = count.10 + -1;
           }
         L.4:;

so the innermost loop is the one that gets traversed with unity stride
(the way it should have been done).

Although the algorithm here is quite simple, it resolves the issues
raised in the PR so far, and it definitely worth fixing.

If we do this kind of thing, it might also be possible to
interchange normal DO loops in a similar way (which Graphite also
cannot do at the moment, at least not if the bounds are variables).

So, comments? Suggestions? Other ideas? Any ideas how to write
a test case? Any volunteers to re-implement Graphite in the
Fortran front end (didn't think so) or to make Graphite catch
this sort of pattern (which it currently doesn't) instead?

Regards

	Thomas

2017-10-27  Thomas Koenig  <tkoenig@gcc.gnu.org>

         * frontend-passes.c (index_interchange): New funciton,
         prototype.
         (optimize_namespace): Call index_interchange.
         (ind_type): New function.
         (has_var): New function.
         (index_cost): New function.
         (loop_comp): New function.

Comments

Steve Kargl Oct. 27, 2017, 10:38 p.m. UTC | #1
Hi Thomas,

In general, I like the idea.  I have some minor suggestions below.


On Sat, Oct 28, 2017 at 12:03:58AM +0200, Thomas Koenig wrote:
> +/* Callback function to determine if an expression is the 
> +   corresponding variable.  */
> +
> +static int

static bool

> +has_var (gfc_expr **e, int *walk_subtrees ATTRIBUTE_UNUSED, void *data)
> +{
> +  gfc_expr *expr = *e;
> +  gfc_symbol *sym;
> +
> +  if (expr->expr_type != EXPR_VARIABLE)
> +    return 0;

return false;

> +
> +  sym = (gfc_symbol *) data;
> +  return sym == expr->symtree->n.sym;
> +}
> +
> +/* Callback function to calculate the cost of a certain index.  */

This function always returns 0, so

> +static int

static void

> +index_cost (gfc_expr **e, int *walk_subtrees ATTRIBUTE_UNUSED,
> +	    void *data)
> +{
> +  ind_type *ind;
> +  gfc_expr *expr;
> +  gfc_array_ref *ar;
> +  gfc_ref *ref;
> +  int i,j;
> +
> +  expr = *e;
> +  if (expr->expr_type != EXPR_VARIABLE)
> +    return 0;

return;

> +
> +  ar = NULL;
> +  for (ref = expr->ref; ref; ref = ref->next)
> +    {
> +      if (ref->type == REF_ARRAY)
> +	{
> +	  ar = &ref->u.ar;
> +	  break;
> +	}
> +    }
> +  if (ar == NULL || ar->type != AR_ELEMENT)
> +    return 0;

return;

> +
> +  ind = (ind_type *) data;
> +  for (i = 0; i < ar->dimen; i++)
> +    {
> +      for (j=0; ind[j].sym != NULL; j++)
> +	{
> +	  if (gfc_expr_walker (&ar->start[i], has_var, (void *) (ind[j].sym)))
> +	      ind[j].n[i]++;
> +	}
> +    }
> +  return 0;

Delete this return as a void function that reaches its
end will return;

> +}
> +
> +/* Callback function for qsort, to sort the loop indices. */
> +
> +static int
> +loop_comp (const void *e1, const void *e2)
> +{
> +  const ind_type *i1 = (const ind_type *) e1;
> +  const ind_type *i2 = (const ind_type *) e2;
> +  int i;
> +
> +  for (i=GFC_MAX_DIMENSIONS-1; i >= 0; i--)
> +    {
> +      if (i1->n[i] != i2->n[i])
> +	return i1->n[i] - i2->n[i];
> +    }
> +  /* All other things being equal, let's not change the ordering.  */
> +  return i2->num - i1->num;
> +}
> +
> +/* Main function to do the index interchange.  */
> +

This function always returns 0, so

> +static int

static void

> +index_interchange (gfc_code **c, int *walk_subtrees ATTRIBUTE_UNUSED,
> +		  void *data ATTRIBUTE_UNUSED)
> +{
> +  gfc_code *co;
> +  co = *c;
> +  int n_iter;
> +  gfc_forall_iterator *fa;
> +  ind_type *ind;
> +  int i, j;
> +  
> +  if (co->op != EXEC_FORALL && co->op != EXEC_DO_CONCURRENT)
> +    return 0;

return;

> +
> +  n_iter = 0;
> +  for (fa = co->ext.forall_iterator; fa; fa = fa->next)
> +    n_iter ++;
> +
> +  /* Nothing to reorder. */
> +  if (n_iter < 2)
> +    return 0;

return;

> +
> +  ind = XALLOCAVEC (ind_type, n_iter + 1);
> +
> +  i = 0;
> +  for (fa = co->ext.forall_iterator; fa; fa = fa->next)
> +    {
> +      ind[i].sym = fa->var->symtree->n.sym;
> +      ind[i].fa = fa;
> +      for (j=0; j<GFC_MAX_DIMENSIONS; j++)
> +	ind[i].n[j] = 0;
> +      ind[i].num = i;
> +      i++;
> +    }
> +  ind[n_iter].sym = NULL;
> +  ind[n_iter].fa = NULL;
> +
> +  gfc_code_walker (c, gfc_dummy_code_callback, index_cost, (void *) ind);
> +  qsort ((void *) ind, n_iter, sizeof (ind_type), loop_comp);
> +
> +  /* Do the actual index interchange.  */
> +  co->ext.forall_iterator = fa = ind[0].fa;
> +  for (i=1; i<n_iter; i++)
> +    {
> +      fa->next = ind[i].fa;
> +      fa = fa->next;
> +    }
> +  fa->next = NULL;
> +
> +  return 0;

Delete this return.
Thomas Koenig Oct. 28, 2017, 11:23 a.m. UTC | #2
Hi Steve,

> On Sat, Oct 28, 2017 at 12:03:58AM +0200, Thomas Koenig wrote:
>> +/* Callback function to determine if an expression is the
>> +   corresponding variable.  */
>> +
>> +static int
> static bool

Most of the functions in the patch are callback functions for
gfc_code_walker or gfc_expr_walker, respectively. Their
function arguments are given as

typedef int (*walk_code_fn_t) (gfc_code **, int *, void *);
typedef int (*walk_expr_fn_t) (gfc_expr **, int *, void *);

respectively, so the types of the functions are fixed.

Regards

	Thomas
Richard Biener Oct. 28, 2017, 5:13 p.m. UTC | #3
On October 28, 2017 12:03:58 AM GMT+02:00, Thomas Koenig <tkoenig@netcologne.de> wrote:
>Hello world,
>
>this is a draft patch which interchanges the indices for FORALL and
>DO CONCURRENT loops for cases like PR 82471, where code like
>
>   DO CONCURRENT( K=1:N, J=1:M, I=1:L)
>      C(I,J,K) = A(I,J,K) + B(I,J,K)
>   END DO
>
>led to very poor code because of stride issues.  Currently,
>Graphite is not able to do this.
>
>Without the patch, the code above is translated as
>
>
>         i.7 = 1;
>         count.10 = 512;
>         while (1)
>           {
>             if (ANNOTATE_EXPR <count.10 <= 0, ivdep>) goto L.4;
>             j.6 = 1;
>             count.9 = 512;
>             while (1)
>               {
>                 if (ANNOTATE_EXPR <count.9 <= 0, ivdep>) goto L.3;
>                 k.5 = 1;
>                 count.8 = 512;
>                 while (1)
>                   {
>                     if (ANNOTATE_EXPR <count.8 <= 0, ivdep>) goto L.2;
>                    (*(real(kind=4)[0:] * restrict) c.data)[((c.offset 
>+ (integer(kind=8)) k.5 * c.dim[2].stride) + (integer(kind=8)) j.6 * 
>c.dim[1].stride) + (integer(kind=8)) i.7] = (*(real(kind=4)[0:] * 
>restrict) a.data)[((a.offset + (integer(kind=8)) k.5 * a.dim[2].stride)
>
>+ (integer(kind=8)) j.6 * a.dim[1].stride) + (integer(kind=8)) i.7] + 
>(*(real(kind=4)[0:] * restrict) b.data)[((b.offset + (integer(kind=8)) 
>k.5 * b.dim[2].stride) + (integer(kind=8)) j.6 * b.dim[1].stride) + 
>(integer(kind=8)) i.7];
>                     L.1:;
>                     k.5 = k.5 + 1;
>                     count.8 = count.8 + -1;
>                   }
>                 L.2:;
>                 j.6 = j.6 + 1;
>                 count.9 = count.9 + -1;
>               }
>             L.3:;
>             i.7 = i.7 + 1;
>             count.10 = count.10 + -1;
>           }
>         L.4:;
>
>so the innermost loop has the biggest stride. With the patch
>(and front-end optimization turned on), this is turned into
>
>         k.7 = 1;
>         count.10 = 512;
>         while (1)
>           {
>             if (ANNOTATE_EXPR <count.10 <= 0, ivdep>) goto L.4;
>             j.6 = 1;
>             count.9 = 512;
>             while (1)
>               {
>                 if (ANNOTATE_EXPR <count.9 <= 0, ivdep>) goto L.3;
>                 i.5 = 1;
>                 count.8 = 512;
>                 while (1)
>                   {
>                     if (ANNOTATE_EXPR <count.8 <= 0, ivdep>) goto L.2;
>                    (*(real(kind=4)[0:] * restrict) c.data)[((c.offset 
>+ (integer(kind=8)) k.7 * c.dim[2].stride) + (integer(kind=8)) j.6 * 
>c.dim[1].stride) + (integer(kind=8)) i.5] = (*(real(kind=4)[0:] * 
>restrict) a.data)[((a.offset + (integer(kind=8)) k.7 * a.dim[2].stride)
>
>+ (integer(kind=8)) j.6 * a.dim[1].stride) + (integer(kind=8)) i.5] + 
>(*(real(kind=4)[0:] * restrict) b.data)[((b.offset + (integer(kind=8)) 
>k.7 * b.dim[2].stride) + (integer(kind=8)) j.6 * b.dim[1].stride) + 
>(integer(kind=8)) i.5];
>                     L.1:;
>                     i.5 = i.5 + 1;
>                     count.8 = count.8 + -1;
>                   }
>                 L.2:;
>                 j.6 = j.6 + 1;
>                 count.9 = count.9 + -1;
>               }
>             L.3:;
>             k.7 = k.7 + 1;
>             count.10 = count.10 + -1;
>           }
>         L.4:;
>
>so the innermost loop is the one that gets traversed with unity stride
>(the way it should have been done).
>
>Although the algorithm here is quite simple, it resolves the issues
>raised in the PR so far, and it definitely worth fixing.
>
>If we do this kind of thing, it might also be possible to
>interchange normal DO loops in a similar way (which Graphite also
>cannot do at the moment, at least not if the bounds are variables).
>
>So, comments? Suggestions? Other ideas? Any ideas how to write
>a test case? Any volunteers to re-implement Graphite in the
>Fortran front end (didn't think so) or to make Graphite catch
>this sort of pattern (which it currently doesn't) instead?

Graphite has major issues with the scalarization of multidimensional arrays to a single array dimension. Indeed doing graphite optimization on the Fortran FE IL would be an interesting experiment.  Maybe list that as summer of code project idea. 

Richard. 

>Regards
>
>	Thomas
>
>2017-10-27  Thomas Koenig  <tkoenig@gcc.gnu.org>
>
>         * frontend-passes.c (index_interchange): New funciton,
>         prototype.
>         (optimize_namespace): Call index_interchange.
>         (ind_type): New function.
>         (has_var): New function.
>         (index_cost): New function.
>         (loop_comp): New function.
Thomas Koenig Oct. 29, 2017, 5:26 p.m. UTC | #4
Am 28.10.2017 um 00:03 schrieb Thomas Koenig:
> +typepedef struct {

That should have been typdef, obviously - the typo
must have slipped in after testing.

Regards

	Thomas
Steve Kargl Oct. 30, 2017, 8:39 p.m. UTC | #5
On Sat, Oct 28, 2017 at 01:23:30PM +0200, Thomas Koenig wrote:
> Hi Steve,
> 
> > On Sat, Oct 28, 2017 at 12:03:58AM +0200, Thomas Koenig wrote:
> >> +/* Callback function to determine if an expression is the
> >> +   corresponding variable.  */
> >> +
> >> +static int
> > static bool
> 
> Most of the functions in the patch are callback functions for
> gfc_code_walker or gfc_expr_walker, respectively. Their
> function arguments are given as
> 
> typedef int (*walk_code_fn_t) (gfc_code **, int *, void *);
> typedef int (*walk_expr_fn_t) (gfc_expr **, int *, void *);
> 
> respectively, so the types of the functions are fixed.
> 

Whoops, I didn't realize that the prototypes were related
to being call back functions.  I noticed the functions were
declared as int, but only returned a single value of 0.

In any event, if you have already applied the patch, it looks
ok to me.
diff mbox series

Patch

Index: frontend-passes.c
===================================================================
--- frontend-passes.c	(Revision 253872)
+++ frontend-passes.c	(Arbeitskopie)
@@ -55,6 +55,7 @@  static gfc_expr* check_conjg_transpose_variable (g
 						 bool *);
 static bool has_dimen_vector_ref (gfc_expr *);
 static int matmul_temp_args (gfc_code **, int *,void *data);
+static int index_interchange (gfc_code **, int*, void *);
 
 #ifdef CHECKING_P
 static void check_locus (gfc_namespace *);
@@ -1385,6 +1386,9 @@  optimize_namespace (gfc_namespace *ns)
 		       NULL);
     }
 
+  gfc_code_walker (&ns->code, index_interchange, dummy_expr_callback,
+		   NULL);
+
   /* BLOCKs are handled in the expression walker below.  */
   for (ns = ns->contained; ns; ns = ns->sibling)
     {
@@ -4225,6 +4229,157 @@  inline_matmul_assign (gfc_code **c, int *walk_subt
   return 0;
 }
 
+
+/* Code for index interchange for loops which are grouped together in DO
+   CONCURRENT or FORALL statements.  This is currently only applied if the
+   iterations are grouped together in a single statement.
+
+   For this transformation, tt is assumed that memory access in strides is
+   expensive, and that loops which access later indices (which access memory
+   in bigger strides) shoud be moved to the first loops.
+
+   For this, a loop over all the statements is executed, counting the times
+   that the loop iteration values are acessed in each index.  The loop
+   indices are then sorted to minimize access to later indces from inner
+   loops.  */
+
+/* Type for holding index information.  */
+
+typepedef struct {
+  gfc_symbol *sym;
+  gfc_forall_iterator *fa;
+  int num;
+  int n[GFC_MAX_DIMENSIONS];
+} ind_type;
+
+/* Callback function to determine if an expression is the 
+   corresponding variable.  */
+
+static int
+has_var (gfc_expr **e, int *walk_subtrees ATTRIBUTE_UNUSED, void *data)
+{
+  gfc_expr *expr = *e;
+  gfc_symbol *sym;
+
+  if (expr->expr_type != EXPR_VARIABLE)
+    return 0;
+
+  sym = (gfc_symbol *) data;
+  return sym == expr->symtree->n.sym;
+}
+
+/* Callback function to calculate the cost of a certain index.  */
+
+static int
+index_cost (gfc_expr **e, int *walk_subtrees ATTRIBUTE_UNUSED,
+	    void *data)
+{
+  ind_type *ind;
+  gfc_expr *expr;
+  gfc_array_ref *ar;
+  gfc_ref *ref;
+  int i,j;
+
+  expr = *e;
+  if (expr->expr_type != EXPR_VARIABLE)
+    return 0;
+
+  ar = NULL;
+  for (ref = expr->ref; ref; ref = ref->next)
+    {
+      if (ref->type == REF_ARRAY)
+	{
+	  ar = &ref->u.ar;
+	  break;
+	}
+    }
+  if (ar == NULL || ar->type != AR_ELEMENT)
+    return 0;
+
+  ind = (ind_type *) data;
+  for (i = 0; i < ar->dimen; i++)
+    {
+      for (j=0; ind[j].sym != NULL; j++)
+	{
+	  if (gfc_expr_walker (&ar->start[i], has_var, (void *) (ind[j].sym)))
+	      ind[j].n[i]++;
+	}
+    }
+  return 0;
+}
+
+/* Callback function for qsort, to sort the loop indices. */
+
+static int
+loop_comp (const void *e1, const void *e2)
+{
+  const ind_type *i1 = (const ind_type *) e1;
+  const ind_type *i2 = (const ind_type *) e2;
+  int i;
+
+  for (i=GFC_MAX_DIMENSIONS-1; i >= 0; i--)
+    {
+      if (i1->n[i] != i2->n[i])
+	return i1->n[i] - i2->n[i];
+    }
+  /* All other things being equal, let's not change the ordering.  */
+  return i2->num - i1->num;
+}
+
+/* Main function to do the index interchange.  */
+
+static int
+index_interchange (gfc_code **c, int *walk_subtrees ATTRIBUTE_UNUSED,
+		  void *data ATTRIBUTE_UNUSED)
+{
+  gfc_code *co;
+  co = *c;
+  int n_iter;
+  gfc_forall_iterator *fa;
+  ind_type *ind;
+  int i, j;
+  
+  if (co->op != EXEC_FORALL && co->op != EXEC_DO_CONCURRENT)
+    return 0;
+
+  n_iter = 0;
+  for (fa = co->ext.forall_iterator; fa; fa = fa->next)
+    n_iter ++;
+
+  /* Nothing to reorder. */
+  if (n_iter < 2)
+    return 0;
+
+  ind = XALLOCAVEC (ind_type, n_iter + 1);
+
+  i = 0;
+  for (fa = co->ext.forall_iterator; fa; fa = fa->next)
+    {
+      ind[i].sym = fa->var->symtree->n.sym;
+      ind[i].fa = fa;
+      for (j=0; j<GFC_MAX_DIMENSIONS; j++)
+	ind[i].n[j] = 0;
+      ind[i].num = i;
+      i++;
+    }
+  ind[n_iter].sym = NULL;
+  ind[n_iter].fa = NULL;
+
+  gfc_code_walker (c, gfc_dummy_code_callback, index_cost, (void *) ind);
+  qsort ((void *) ind, n_iter, sizeof (ind_type), loop_comp);
+
+  /* Do the actual index interchange.  */
+  co->ext.forall_iterator = fa = ind[0].fa;
+  for (i=1; i<n_iter; i++)
+    {
+      fa->next = ind[i].fa;
+      fa = fa->next;
+    }
+  fa->next = NULL;
+
+  return 0;
+}
+
 #define WALK_SUBEXPR(NODE) \
   do							\
     {							\