diff mbox

Automatic openacc loop partitioning

Message ID 5645F1F7.2020902@acm.org
State New
Headers show

Commit Message

Nathan Sidwell Nov. 13, 2015, 2:21 p.m. UTC
Jakub,
this patch applies automatic loop partitioning to loops that are marked 'auto' 
and 'independent'. 'independent' is implicit inside a parallel region.

We were unnecessarily still emitting a sorry for the auto, seq and independent 
clauses in omp lowering.  The main event is in the target compiler, when we know 
which partitioning axes are available.  A simple DFS walk of the loops assigns 
the innermost available partition to such loops.

ok?

nathan

Comments

Bernd Schmidt Nov. 13, 2015, 9:40 p.m. UTC | #1
> +      this_mask = (this_mask & -this_mask);

Unnecessary parens.

> +      if (!this_mask && noisy)
> +	warning_at (loop->loc, 0,
> +		    "insufficient partitioning available to parallelize loop");

Should this really be an unconditional warning? Isn't sequential 
execution a valid result of "auto"? (The spec implies that this is a 
possible outcome for loops inside kernels that can't be determined to be 
independent.)

Speaking of kernels, the testcase doesn't cover it, but maybe that's 
because that needs something else before it works?

Otherwise LGTM.


Bernd
Nathan Sidwell Nov. 13, 2015, 9:46 p.m. UTC | #2
On 11/13/15 16:40, Bernd Schmidt wrote:
>> +      this_mask = (this_mask & -this_mask);
>
> Unnecessary parens.
>
>> +      if (!this_mask && noisy)
>> +    warning_at (loop->loc, 0,
>> +            "insufficient partitioning available to parallelize loop");
>
> Should this really be an unconditional warning? Isn't sequential execution a
> valid result of "auto"? (The spec implies that this is a possible outcome for
> loops inside kernels that can't be determined to be independent.)

This piece of  code is only active when loops have the INDEPENDENT clause. 
That's implicit for (marked) loops inside parallel.  But for kernels you need to 
say so explicitly.  IMO the user should be told that such loops fail to be 
parallelized.

> Speaking of kernels, the testcase doesn't cover it, but maybe that's because
> that needs something else before it works?

Going to wait for Tom's kernels patch set to land.


nathan
diff mbox

Patch

2015-11-13  Nathan Sidwell  <nathan@codesourcery.com>

	gcc/
	* gcc/omp-low.c (scan_sharing_clauses): Accept INDEPENDENT, AUTO &
	SEQ.
	(oacc_loop_fixed_partitions): Correct return type to bool.
	(oacc_loop_auto_partitions): New.
	(oacc_loop_partition): Take mask argument, call
	oacc_loop_auto_partitions.
	(execute_oacc_device_lower): Provide mask to oacc_loop_partition.

	gcc/testsuite/
	* gcc/testsuite/c-c++-common/goacc/loop-auto-1.c: New.

	libgomp/
	* libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: New.

Index: gcc/omp-low.c
===================================================================
--- gcc/omp-low.c	(revision 230283)
+++ gcc/omp-low.c	(working copy)
@@ -2124,6 +2124,9 @@  scan_sharing_clauses (tree clauses, omp_
 	case OMP_CLAUSE_WORKER:
 	case OMP_CLAUSE_VECTOR:
 	case OMP_CLAUSE_TILE:
+	case OMP_CLAUSE_INDEPENDENT:
+	case OMP_CLAUSE_AUTO:
+	case OMP_CLAUSE_SEQ:
 	  break;
 
 	case OMP_CLAUSE_ALIGNED:
@@ -2136,9 +2139,6 @@  scan_sharing_clauses (tree clauses, omp_
 	case OMP_CLAUSE_DEVICE_RESIDENT:
 	case OMP_CLAUSE_USE_DEVICE:
 	case OMP_CLAUSE__CACHE_:
-	case OMP_CLAUSE_INDEPENDENT:
-	case OMP_CLAUSE_AUTO:
-	case OMP_CLAUSE_SEQ:
 	  sorry ("Clause not supported yet");
 	  break;
 
@@ -2299,14 +2299,14 @@  scan_sharing_clauses (tree clauses, omp_
 	case OMP_CLAUSE_WORKER:
 	case OMP_CLAUSE_VECTOR:
 	case OMP_CLAUSE_TILE:
+	case OMP_CLAUSE_INDEPENDENT:
+	case OMP_CLAUSE_AUTO:
+	case OMP_CLAUSE_SEQ:
 	  break;
 
 	case OMP_CLAUSE_DEVICE_RESIDENT:
 	case OMP_CLAUSE_USE_DEVICE:
 	case OMP_CLAUSE__CACHE_:
-	case OMP_CLAUSE_INDEPENDENT:
-	case OMP_CLAUSE_AUTO:
-	case OMP_CLAUSE_SEQ:
 	  sorry ("Clause not supported yet");
 	  break;
 
@@ -19230,10 +19230,10 @@  oacc_loop_process (oacc_loop *loop)
 
 /* Walk the OpenACC loop heirarchy checking and assigning the
    programmer-specified partitionings.  OUTER_MASK is the partitioning
-   this loop is contained within.  Return partitiong mask used within
-   this loop nest.  */
+   this loop is contained within.  Return true if we contain an
+   auto-partitionable loop.  */
 
-static unsigned
+static bool
 oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
 {
   unsigned this_mask = loop->mask;
@@ -19337,18 +19337,63 @@  oacc_loop_fixed_partitions (oacc_loop *l
   return has_auto;
 }
 
+/* Walk the OpenACC loop heirarchy to assign auto-partitioned loops.
+   OUTER_MASK is the partitioning this loop is contained within.
+   Return the cumulative partitioning used by this loop, siblings and
+   children.  */
+
+static unsigned
+oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask)
+{
+  unsigned inner_mask = 0;
+  bool noisy = true;
+
+#ifdef ACCEL_COMPILER
+  /* When device_type is supported, we want the device compiler to be
+     noisy, if the loop parameters are device_type-specific.  */
+  noisy = false;
+#endif
+
+  if (loop->child)
+    inner_mask |= oacc_loop_auto_partitions (loop->child,
+					     outer_mask | loop->mask);
+
+  if ((loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT))
+    {
+      unsigned this_mask = 0;
+      
+      /* Determine the outermost partitioning used within this loop. */
+      this_mask = inner_mask | GOMP_DIM_MASK (GOMP_DIM_MAX);
+      this_mask = (this_mask & -this_mask);
+
+      /* Pick the partitioning just inside that one.  */
+      this_mask >>= 1;
+
+      /* And avoid picking one use by an outer loop. */
+      this_mask &= ~outer_mask;
+
+      if (!this_mask && noisy)
+	warning_at (loop->loc, 0,
+		    "insufficient partitioning available to parallelize loop");
+
+      loop->mask = this_mask;
+    }
+  inner_mask |= loop->mask;
+  
+  if (loop->sibling)
+    inner_mask |= oacc_loop_auto_partitions (loop->sibling, outer_mask);
+
+  return inner_mask;
+}
+
 /* Walk the OpenACC loop heirarchy to check and assign partitioning
    axes.  */
 
 static void
-oacc_loop_partition (oacc_loop *loop, int fn_level)
+oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
 {
-  unsigned outer_mask = 0;
-
-  if (fn_level >= 0)
-    outer_mask = GOMP_DIM_MASK (fn_level) - 1;
-
-  oacc_loop_fixed_partitions (loop, outer_mask);
+  if (oacc_loop_fixed_partitions (loop, outer_mask))
+    oacc_loop_auto_partitions (loop, outer_mask);
 }
 
 /* Default fork/join early expander.  Delete the function calls if
@@ -19429,7 +19474,8 @@  execute_oacc_device_lower ()
 
   /* Discover, partition and process the loops.  */
   oacc_loop *loops = oacc_loop_discovery ();
-  oacc_loop_partition (loops, fn_level);
+  unsigned outer_mask = fn_level >= 0 ? GOMP_DIM_MASK (fn_level) - 1 : 0;
+  oacc_loop_partition (loops, outer_mask);
   oacc_loop_process (loops);
   if (dump_file)
     {
Index: gcc/testsuite/c-c++-common/goacc/loop-auto-1.c
===================================================================
--- gcc/testsuite/c-c++-common/goacc/loop-auto-1.c	(revision 0)
+++ gcc/testsuite/c-c++-common/goacc/loop-auto-1.c	(working copy)
@@ -0,0 +1,230 @@ 
+
+void Foo ()
+{
+  
+#pragma acc parallel num_gangs(10) num_workers(32) vector_length(32)
+  {
+#pragma acc loop vector
+    for (int ix = 0; ix < 10; ix++)
+      {
+#pragma acc loop seq
+	for (int jx = 0; jx < 10; jx++) {}
+
+#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+	for (int jx = 0; jx < 10; jx++) {}
+      }
+
+#pragma acc loop worker
+    for (int ix = 0; ix < 10; ix++)
+      {
+#pragma acc loop auto
+	for (int jx = 0; jx < 10; jx++) {}
+
+#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+	for (int jx = 0; jx < 10; jx++)
+	  {
+#pragma acc loop vector
+	    for (int kx = 0; kx < 10; kx++) {}
+	  }
+      }
+
+#pragma acc loop gang
+    for (int ix = 0; ix < 10; ix++)
+      {
+#pragma acc loop auto
+	for (int jx = 0; jx < 10; jx++) {}
+
+#pragma acc loop auto
+	for (int jx = 0; jx < 10; jx++)
+	  {
+#pragma acc loop auto
+	    for (int kx = 0; kx < 10; kx++) {}
+	  }
+
+#pragma acc loop worker
+	for (int jx = 0; jx < 10; jx++)
+	  {
+#pragma acc loop auto
+	    for (int kx = 0; kx < 10; kx++) {}
+	  }
+
+#pragma acc loop vector
+	for (int jx = 0; jx < 10; jx++)
+	  {
+#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+	    for (int kx = 0; kx < 10; kx++) {}
+	  }
+
+#pragma acc loop auto
+	for (int jx = 0; jx < 10; jx++)
+	  {
+#pragma acc loop vector
+	    for (int kx = 0; kx < 10; kx++) {}
+	  }
+
+      }
+    
+#pragma acc loop auto
+    for (int ix = 0; ix < 10; ix++)
+      {
+#pragma acc loop auto
+	for (int jx = 0; jx < 10; jx++)
+	  {
+#pragma acc loop auto
+	    for (int kx = 0; kx < 10; kx++) {}
+	  }
+      }
+  }
+}
+
+#pragma acc routine gang
+void Gang (void)
+{
+#pragma acc loop vector
+    for (int ix = 0; ix < 10; ix++)
+      {
+#pragma acc loop seq
+	for (int jx = 0; jx < 10; jx++) {}
+
+#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+	for (int jx = 0; jx < 10; jx++) {}
+      }
+
+#pragma acc loop worker
+    for (int ix = 0; ix < 10; ix++)
+      {
+#pragma acc loop auto
+	for (int jx = 0; jx < 10; jx++) {}
+
+#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+	for (int jx = 0; jx < 10; jx++)
+	  {
+#pragma acc loop vector
+	    for (int kx = 0; kx < 10; kx++) {}
+	  }
+      }
+
+#pragma acc loop gang
+    for (int ix = 0; ix < 10; ix++)
+      {
+#pragma acc loop auto
+	for (int jx = 0; jx < 10; jx++) {}
+
+#pragma acc loop auto
+	for (int jx = 0; jx < 10; jx++)
+	  {
+#pragma acc loop auto
+	    for (int kx = 0; kx < 10; kx++) {}
+	  }
+
+#pragma acc loop worker
+	for (int jx = 0; jx < 10; jx++)
+	  {
+#pragma acc loop auto
+	    for (int kx = 0; kx < 10; kx++) {}
+	  }
+
+#pragma acc loop vector
+	for (int jx = 0; jx < 10; jx++)
+	  {
+#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+	    for (int kx = 0; kx < 10; kx++) {}
+	  }
+
+#pragma acc loop auto
+	for (int jx = 0; jx < 10; jx++)
+	  {
+#pragma acc loop vector
+	    for (int kx = 0; kx < 10; kx++) {}
+	  }
+
+      }
+    
+#pragma acc loop auto
+    for (int ix = 0; ix < 10; ix++)
+      {
+#pragma acc loop auto
+	for (int jx = 0; jx < 10; jx++)
+	  {
+#pragma acc loop auto
+	    for (int kx = 0; kx < 10; kx++) {}
+	  }
+      }
+}
+
+#pragma acc routine worker
+void Worker (void)
+{
+#pragma acc loop vector
+    for (int ix = 0; ix < 10; ix++)
+      {
+#pragma acc loop seq
+	for (int jx = 0; jx < 10; jx++) {}
+
+#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+	for (int jx = 0; jx < 10; jx++) {}
+      }
+
+#pragma acc loop worker
+    for (int ix = 0; ix < 10; ix++)
+      {
+#pragma acc loop auto
+	for (int jx = 0; jx < 10; jx++) {}
+
+#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+	for (int jx = 0; jx < 10; jx++)
+	  {
+#pragma acc loop vector
+	    for (int kx = 0; kx < 10; kx++) {}
+	  }
+      }
+
+#pragma acc loop auto
+    for (int ix = 0; ix < 10; ix++)
+      {
+#pragma acc loop auto
+	for (int jx = 0; jx < 10; jx++) {}
+      }
+
+#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+    for (int ix = 0; ix < 10; ix++)
+      {
+#pragma acc loop auto
+	for (int jx = 0; jx < 10; jx++)
+	  {
+#pragma acc loop auto
+	    for (int kx = 0; kx < 10; kx++) {}
+	  }
+      }
+}
+
+#pragma acc routine vector
+void Vector (void)
+{
+#pragma acc loop vector
+    for (int ix = 0; ix < 10; ix++)
+      {
+#pragma acc loop seq
+	for (int jx = 0; jx < 10; jx++) {}
+
+#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+	for (int jx = 0; jx < 10; jx++) {}
+      }
+
+#pragma acc loop auto
+    for (int ix = 0; ix < 10; ix++) {}
+
+#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+    for (int ix = 0; ix < 10; ix++)
+      {
+#pragma acc loop auto
+	for (int jx = 0; jx < 10; jx++) {}
+      }
+}
+
+#pragma acc routine seq
+void Seq (void)
+{
+#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+    for (int ix = 0; ix < 10; ix++) {}
+}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c	(working copy)
@@ -0,0 +1,225 @@ 
+/* { dg-do run } */
+/* { dg-additional-options "-O2" */
+
+#include <stdio.h>
+#include <openacc.h>
+
+int check (const int *ary, int size, int gp, int wp, int vp)
+{
+  int exit = 0;
+  int ix;
+  int gangs[32], workers[32], vectors[32];
+
+  for (ix = 0; ix < 32; ix++)
+    gangs[ix] = workers[ix] = vectors[ix] = 0;
+  
+  for (ix = 0; ix < size; ix++)
+    {
+      vectors[ary[ix] & 0xff]++;
+      workers[(ary[ix] >> 8) & 0xff]++;
+      gangs[(ary[ix] >> 16) & 0xff]++;
+    }
+
+  for (ix = 0; ix < 32; ix++)
+    {
+      if (gp)
+	{
+	  int expect = gangs[0];
+	  if (gangs[ix] != expect)
+	    {
+	      exit = 1;
+	      printf ("gang %d not used %d times\n", ix, expect);
+	    }
+	}
+      else if (ix && gangs[ix])
+	{
+	  exit = 1;
+	  printf ("gang %d unexpectedly used\n", ix);
+	}
+
+      if (wp)
+	{
+	  int expect = workers[0];
+	  if (workers[ix] != expect)
+	    {
+	      exit = 1;
+	      printf ("worker %d not used %d times\n", ix, expect);
+	    }
+	}
+      else if (ix && workers[ix])
+	{
+	  exit = 1;
+	  printf ("worker %d unexpectedly used\n", ix);
+	}
+
+      if (vp)
+	{
+	  int expect = vectors[0];
+	  if (vectors[ix] != expect)
+	    {
+	      exit = 1;
+	      printf ("vector %d not used %d times\n", ix, expect);
+	    }
+	}
+      else if (ix && vectors[ix])
+	{
+	  exit = 1;
+	  printf ("vector %d unexpectedly used\n", ix);
+	}
+      
+    }
+  return exit;
+}
+
+#pragma acc routine seq
+static int __attribute__((noinline)) place ()
+{
+  int r = 0;
+
+  if (acc_on_device (acc_device_nvidia))
+    {
+      int g = 0, w = 0, v = 0;
+
+      __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
+      __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
+      __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+      r = (g << 16) | (w << 8) | v;
+    }
+  return r;
+}
+
+static void clear (int *ary, int size)
+{
+  int ix;
+
+  for (ix = 0; ix < size; ix++)
+    ary[ix] = -1;
+}
+
+int vector_1 (int *ary, int size)
+{
+  clear (ary, size);
+  
+#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
+  {
+#pragma acc loop auto
+    for (int ix = 0; ix < size; ix++)
+      ary[ix] = place ();
+  }
+
+  return check (ary, size, 0, 0, 1);
+}
+
+int vector_2 (int *ary, int size)
+{
+  clear (ary, size);
+  
+#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
+  {
+#pragma acc loop worker
+    for (int jx = 0; jx <  size  / 64; jx++)
+#pragma acc loop auto
+      for (int ix = 0; ix < 64; ix++)
+	ary[ix + jx * 64] = place ();
+  }
+
+  return check (ary, size, 0, 1, 1);
+}
+
+int worker_1 (int *ary, int size)
+{
+  clear (ary, size);
+  
+#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
+  {
+#pragma acc loop auto
+    for (int jx = 0; jx <  size  / 64; jx++)
+#pragma acc loop vector
+      for (int ix = 0; ix < 64; ix++)
+	ary[ix + jx * 64] = place ();
+  }
+
+  return check (ary, size, 0, 1, 1);
+}
+
+int worker_2 (int *ary, int size)
+{
+  clear (ary, size);
+  
+#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
+  {
+#pragma acc loop auto
+    for (int jx = 0; jx <  size  / 64; jx++)
+#pragma acc loop auto
+      for (int ix = 0; ix < 64; ix++)
+	ary[ix + jx * 64] = place ();
+  }
+
+  return check (ary, size, 0, 1, 1);
+}
+
+int gang_1 (int *ary, int size)
+{
+  clear (ary, size);
+  
+#pragma acc parallel num_gangs (32) num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
+  {
+#pragma acc loop auto
+    for (int jx = 0; jx <  size  / 64; jx++)
+#pragma acc loop worker
+      for (int ix = 0; ix < 64; ix++)
+	ary[ix + jx * 64] = place ();
+  }
+
+  return check (ary, size, 1, 1, 0);
+}
+
+int gang_2 (int *ary, int size)
+{
+  clear (ary, size);
+  
+#pragma acc parallel num_gangs (32) num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
+  {
+#pragma acc loop auto
+    for (int kx = 0; kx < size / (32 * 32); kx++)
+#pragma acc loop auto
+      for (int jx = 0; jx <  32; jx++)
+#pragma acc loop auto
+	for (int ix = 0; ix < 32; ix++)
+	  ary[ix + jx * 32 + kx * 32 * 32] = place ();
+  }
+
+  return check (ary, size, 1, 1, 1);
+}
+
+#define N (32*32*32)
+int main ()
+{
+  int ondev = 0;
+
+#pragma acc parallel copy(ondev)
+  {
+    ondev = acc_on_device (acc_device_not_host);
+  }
+  if (!ondev)
+    return 0;
+  
+  int ary[N];
+
+  if (vector_1 (ary,  N))
+    return 1;
+  if (vector_2 (ary,  N))
+    return 1;
+
+  if (worker_1 (ary,  N))
+    return 1;
+  if (worker_2 (ary,  N))
+    return 1;
+  
+  if (gang_1 (ary,  N))
+    return 1;
+  if (gang_2 (ary,  N))
+    return 1;
+
+  return 0;
+}