diff mbox

[gomp4] Update openacc loop iteration partitioning

Message ID 562676A3.5010804@acm.org
State New
Headers show

Commit Message

Nathan Sidwell Oct. 20, 2015, 5:15 p.m. UTC
In preparing this patch set for trunk, I discovered I'd flubbed the calculations 
for default contiguous looping.  This fixes the calculation in the target-side 
loop transformation code.  I also realized that the calculation appropriate for 
an accelerator is not the best for the host.  For the latter we want this to 
expand to a regular loop iterator.

Applied to gomp4 branch.

nathan
diff mbox

Patch

2015-10-20  Nathan Sidwell  <nathan@codesourcery.com>

	gcc/
	* omp-low.c (expand_oacc_for): Use -1 for unspecified static
	chunking.  Remove unnecessary gimple forcing.
	(oacc_xform_loop): Adjust chunk size calculation.  Don't chunk on
	host.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/loop-g-1.c: New.
	* testsuite/libgomp.oacc-c-c++-common/loop-g-2.c: New.
	* testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c: New.
	* testsuite/libgomp.oacc-c-c++-common/loop-v-1.c: New.
	* testsuite/libgomp.oacc-c-c++-common/loop-w-1.c: New.
	* testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c: New.

Index: gcc/omp-low.c
===================================================================
--- gcc/omp-low.c	(revision 229092)
+++ gcc/omp-low.c	(working copy)
@@ -9962,7 +9962,7 @@  expand_oacc_for (struct omp_region *regi
   enum tree_code cond_code = fd->loop.cond_code;
   enum tree_code plus_code = PLUS_EXPR;
 
-  tree chunk_size = integer_one_node;
+  tree chunk_size = integer_minus_one_node;
   tree gwv = integer_zero_node;
   tree iter_type = TREE_TYPE (v);
   tree diff_type = iter_type;
@@ -10110,10 +10110,6 @@  expand_oacc_for (struct omp_region *regi
       ass = gimple_build_assign (chunk_no, expr);
       gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
 
-      expr = fold_convert (diff_type, chunk_size);
-      chunk_size = force_gimple_operand_gsi (&gsi, expr, true,
-					     NULL_TREE, true, GSI_SAME_STMT);
-
       call = gimple_build_call_internal (IFN_GOACC_LOOP, 6,
 					 build_int_cst (integer_type_node,
 							IFN_GOACC_LOOP_CHUNKS),
@@ -16892,25 +16888,26 @@  oacc_xform_loop (gcall *call)
   tree dir = gimple_call_arg (call, 1);
   tree range = gimple_call_arg (call, 2);
   tree step = gimple_call_arg (call, 3);
-  tree chunk_size = gimple_call_arg (call, 4);
+  tree chunk_size = NULL_TREE;
   unsigned mask = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 5));
   tree lhs = gimple_call_lhs (call);
   tree type = TREE_TYPE (lhs);
   tree diff_type = TREE_TYPE (range);
   tree r = NULL_TREE;
   gimple_seq seq = NULL;
-  bool chunking, striding;
+  bool chunking = false, striding = true;
   unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
   unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
 
-  if (integer_zerop (chunk_size))
-    {
-      /* If we're at the gang or (worker with vector), we want each to
-	 execute a contiguous run of iterations.  Otherwise we want
-	 each element to stride.  */
-      striding = !((outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
-		   || ((outer_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
-		       && (outer_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))));
+#ifdef ACCEL_COMPILER
+  chunk_size = gimple_call_arg (call, 4);
+  if (integer_minus_onep (chunk_size)  /* Force static allocation.  */
+      || integer_zerop (chunk_size))   /* Default (also static).  */
+    {
+      /* If we're at the gang level, we want each to execute a
+	 contiguous run of iterations.  Otherwise we want each element
+	 to stride.  */
+      striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
       chunking = false;
     }
   else
@@ -16919,7 +16916,16 @@  oacc_xform_loop (gcall *call)
       striding = integer_onep (chunk_size);
       chunking = !striding;
     }
+#endif
 
+  /* striding=true, chunking=true
+       -> invalid.
+     striding=true, chunking=false
+       -> chunks=1
+     striding=false,chunking=true
+       -> chunks=ceil (range/(chunksize*threads*step))
+     striding=false,chunking=false
+       -> chunk_size=ceil(range/(threads*step)),chunks=1  */
   push_gimplify_context (true);
 
   switch (code)
@@ -16963,31 +16969,25 @@  oacc_xform_loop (gcall *call)
 	}
       else
 	{
-	  tree span;
 	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
 	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
 	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
 				     inner_size, outer_size);
 
+	  volume = fold_convert (diff_type, volume);
 	  if (chunking)
-	    {
-	      chunk_size = fold_convert (diff_type, chunk_size);
-
-	      span = inner_size;
-	      span = fold_convert (diff_type, span);
-	      span = fold_build2 (MULT_EXPR, diff_type, span, chunk_size);
-	    }
+	    chunk_size = fold_convert (diff_type, chunk_size);
 	  else
 	    {
-	      tree per = fold_convert (diff_type, volume);
-	      per = fold_build2 (MULT_EXPR, diff_type, per, step);
+	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
 
-	      span = build2 (MINUS_EXPR, diff_type, range, dir);
-	      span = build2 (PLUS_EXPR, diff_type, span, per);
-	      span = build2 (TRUNC_DIV_EXPR, diff_type, span, per);
-	      span = build2 (MULT_EXPR, diff_type, span, inner_size);
+	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
+	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
+	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
 	    }
 
+	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
+			      fold_convert (diff_type, inner_size));
 	  r = oacc_thread_numbers (true, outer_mask, &seq);
 	  r = fold_convert (diff_type, r);
 	  r = build2 (MULT_EXPR, diff_type, r, span);
@@ -16998,9 +16998,9 @@  oacc_xform_loop (gcall *call)
 
 	  if (chunking)
 	    {
-	      tree chunk = gimple_call_arg (call, 6);
-	      tree per = fold_convert (diff_type, volume);
-	      per = fold_build2 (MULT_EXPR, diff_type, per, chunk_size);
+	      tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
+	      tree per
+		= fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
 	      per = build2 (MULT_EXPR, diff_type, per, chunk);
 
 	      r = build2 (PLUS_EXPR, diff_type, r, per);
@@ -17016,29 +17016,29 @@  oacc_xform_loop (gcall *call)
 	r = range;
       else
 	{
-	  tree offset = gimple_call_arg (call, 6);
-	  tree span;
-	  
-	  if (chunking)
-	    {
-	      chunk_size = fold_convert (diff_type, chunk_size);
+	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
+	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
+	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
+				     inner_size, outer_size);
 
-	      span = oacc_thread_numbers (false, inner_mask, &seq);
-	      span = fold_convert (diff_type, span);
-	      span = fold_build2 (MULT_EXPR, diff_type, span, chunk_size);
-	    }
+	  volume = fold_convert (diff_type, volume);
+	  if (chunking)
+	    chunk_size = fold_convert (diff_type, chunk_size);
 	  else
 	    {
-	      tree per = oacc_thread_numbers (false, mask, &seq);
-	      per = fold_convert (diff_type, per);
-	      per = build2 (MULT_EXPR, diff_type, per, step);
-	      span = build2 (MINUS_EXPR, diff_type, range, dir);
-	      span = build2 (PLUS_EXPR, diff_type, span, per);
-	      span = build2 (TRUNC_DIV_EXPR, diff_type, span, per);
+	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+
+	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
+	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
+	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
 	    }
 
+	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
+			      fold_convert (diff_type, inner_size));
+
 	  r = fold_build2 (MULT_EXPR, diff_type, span, step);
 
+	  tree offset = gimple_call_arg (call, 6);
 	  r = build2 (PLUS_EXPR, diff_type, r,
 		      fold_convert (diff_type, offset));
 	  r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-1.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-1.c	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-1.c	(working copy)
@@ -0,0 +1,57 @@ 
+/* { dg-do run } */
+/* { dg-additional-options "-O2" */
+
+#include <stdio.h>
+
+#define N (32*32*32+17)
+int main ()
+{
+  int ary[N];
+  int ix;
+  int exit = 0;
+  int ondev = 0;
+
+  for (ix = 0; ix < N;ix++)
+    ary[ix] = -1;
+  
+#pragma acc parallel num_gangs(32) vector_length(32) copy(ary) copy(ondev)
+  {
+#pragma acc loop gang
+    for (unsigned ix = 0; ix < N; ix++)
+      {
+	if (__builtin_acc_on_device (5))
+	  {
+	    int g = 0, w = 0, v = 0;
+
+	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
+	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
+	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    ary[ix] = (g << 16) | (w << 8) | v;
+	    ondev = 1;
+	  }
+	else
+	  ary[ix] = ix;
+      }
+  }
+
+  for (ix = 0; ix < N; ix++)
+    {
+      int expected = ix;
+      if(ondev)
+	{
+	  int g = ix / ((N + 31) / 32);
+	  int w = 0;
+	  int v = 0;
+
+	  expected = (g << 16) | (w << 8) | v;
+	}
+      
+      if (ary[ix] != expected)
+	{
+	  exit = 1;
+	  printf ("ary[%d]=%x expected %x\n", ix, ary[ix], expected);
+	}
+    }
+  
+  return exit;
+}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-2.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-2.c	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-2.c	(working copy)
@@ -0,0 +1,57 @@ 
+/* { dg-do run } */
+/* { dg-additional-options "-O2" */
+
+#include <stdio.h>
+
+#define N (32*32*32+17)
+int main ()
+{
+  int ary[N];
+  int ix;
+  int exit = 0;
+  int ondev = 0;
+
+  for (ix = 0; ix < N;ix++)
+    ary[ix] = -1;
+  
+#pragma acc parallel num_gangs(32) vector_length(32) copy(ary) copy(ondev)
+  {
+#pragma acc loop gang (static:1)
+    for (unsigned ix = 0; ix < N; ix++)
+      {
+	if (__builtin_acc_on_device (5))
+	  {
+	    int g = 0, w = 0, v = 0;
+
+	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
+	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
+	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    ary[ix] = (g << 16) | (w << 8) | v;
+	    ondev = 1;
+	  }
+	else
+	  ary[ix] = ix;
+      }
+  }
+
+  for (ix = 0; ix < N; ix++)
+    {
+      int expected = ix;
+      if(ondev)
+	{
+	  int g = ix % 32;
+	  int w = 0;
+	  int v = 0;
+
+	  expected = (g << 16) | (w << 8) | v;
+	}
+      
+      if (ary[ix] != expected)
+	{
+	  exit = 1;
+	  printf ("ary[%d]=%x expected %x\n", ix, ary[ix], expected);
+	}
+    }
+  
+  return exit;
+}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c	(working copy)
@@ -0,0 +1,59 @@ 
+/* { dg-do run } */
+/* { dg-additional-options "-O2" */
+
+#include <stdio.h>
+
+#define N (32*32*32+17)
+int main ()
+{
+  int ary[N];
+  int ix;
+  int exit = 0;
+  int ondev = 0;
+
+  for (ix = 0; ix < N;ix++)
+    ary[ix] = -1;
+  
+#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ary) copy(ondev)
+  {
+#pragma acc loop gang worker vector
+    for (unsigned ix = 0; ix < N; ix++)
+      {
+	if (__builtin_acc_on_device (5))
+	  {
+	    int g = 0, w = 0, v = 0;
+
+	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
+	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
+	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    ary[ix] = (g << 16) | (w << 8) | v;
+	    ondev = 1;
+	  }
+	else
+	  ary[ix] = ix;
+      }
+  }
+
+  for (ix = 0; ix < N; ix++)
+    {
+      int expected = ix;
+      if(ondev)
+	{
+	  int chunk_size = (N + 32*32*32 - 1) / (32*32*32);
+	  
+	  int g = ix / (chunk_size * 32 * 32);
+	  int w = ix / 32 % 32;
+	  int v = ix % 32;
+
+	  expected = (g << 16) | (w << 8) | v;
+	}
+      
+      if (ary[ix] != expected)
+	{
+	  exit = 1;
+	  printf ("ary[%d]=%x expected %x\n", ix, ary[ix], expected);
+	}
+    }
+  
+  return exit;
+}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c	(working copy)
@@ -0,0 +1,57 @@ 
+/* { dg-do run } */
+/* { dg-additional-options "-O2" */
+
+#include <stdio.h>
+
+#define N (32*32*32+17)
+int main ()
+{
+  int ary[N];
+  int ix;
+  int exit = 0;
+  int ondev = 0;
+
+  for (ix = 0; ix < N;ix++)
+    ary[ix] = -1;
+  
+#pragma acc parallel vector_length(32) copy(ary) copy(ondev)
+  {
+#pragma acc loop vector
+    for (unsigned ix = 0; ix < N; ix++)
+      {
+	if (__builtin_acc_on_device (5))
+	  {
+	    int g = 0, w = 0, v = 0;
+
+	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
+	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
+	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    ary[ix] = (g << 16) | (w << 8) | v;
+	    ondev = 1;
+	  }
+	else
+	  ary[ix] = ix;
+      }
+  }
+
+  for (ix = 0; ix < N; ix++)
+    {
+      int expected = ix;
+      if(ondev)
+	{
+	  int g = 0;
+	  int w = 0;
+	  int v = ix % 32;
+
+	  expected = (g << 16) | (w << 8) | v;
+	}
+      
+      if (ary[ix] != expected)
+	{
+	  exit = 1;
+	  printf ("ary[%d]=%x expected %x\n", ix, ary[ix], expected);
+	}
+    }
+  
+  return exit;
+}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c	(working copy)
@@ -0,0 +1,57 @@ 
+/* { dg-do run } */
+/* { dg-additional-options "-O2" */
+
+#include <stdio.h>
+
+#define N (32*32*32+17)
+int main ()
+{
+  int ary[N];
+  int ix;
+  int exit = 0;
+  int ondev = 0;
+
+  for (ix = 0; ix < N;ix++)
+    ary[ix] = -1;
+  
+#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev)
+  {
+#pragma acc loop worker
+    for (unsigned ix = 0; ix < N; ix++)
+      {
+	if (__builtin_acc_on_device (5))
+	  {
+	    int g = 0, w = 0, v = 0;
+
+	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
+	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
+	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    ary[ix] = (g << 16) | (w << 8) | v;
+	    ondev = 1;
+	  }
+	else
+	  ary[ix] = ix;
+      }
+  }
+
+  for (ix = 0; ix < N; ix++)
+    {
+      int expected = ix;
+      if(ondev)
+	{
+	  int g = 0;
+	  int w = ix % 32;
+	  int v = 0;
+
+	  expected = (g << 16) | (w << 8) | v;
+	}
+      
+      if (ary[ix] != expected)
+	{
+	  exit = 1;
+	  printf ("ary[%d]=%x expected %x\n", ix, ary[ix], expected);
+	}
+    }
+  
+  return exit;
+}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c	(working copy)
@@ -0,0 +1,57 @@ 
+/* { dg-do run } */
+/* { dg-additional-options "-O2" */
+
+#include <stdio.h>
+
+#define N (32*32*32+17)
+int main ()
+{
+  int ary[N];
+  int ix;
+  int exit = 0;
+  int ondev = 0;
+
+  for (ix = 0; ix < N;ix++)
+    ary[ix] = -1;
+  
+#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev)
+  {
+#pragma acc loop worker vector
+    for (unsigned ix = 0; ix < N; ix++)
+      {
+	if (__builtin_acc_on_device (5))
+	  {
+	    int g = 0, w = 0, v = 0;
+
+	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
+	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
+	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    ary[ix] = (g << 16) | (w << 8) | v;
+	    ondev = 1;
+	  }
+	else
+	  ary[ix] = ix;
+      }
+  }
+
+  for (ix = 0; ix < N; ix++)
+    {
+      int expected = ix;
+      if(ondev)
+	{
+	  int g = 0;
+	  int w = (ix / 32) % 32;
+	  int v = ix % 32;
+
+	  expected = (g << 16) | (w << 8) | v;
+	}
+      
+      if (ary[ix] != expected)
+	{
+	  exit = 1;
+	  printf ("ary[%d]=%x expected %x\n", ix, ary[ix], expected);
+	}
+    }
+  
+  return exit;
+}