diff mbox

[gomp4] assign unused gwv clauses to auto/independent parallel acc loops

Message ID 55F05F7E.40402@codesourcery.com
State New
Headers show

Commit Message

Cesar Philippidis Sept. 9, 2015, 4:34 p.m. UTC
This patch assigns any available gang, worker or vector level
parallelism to auto and independent loops inside acc parallel regions.
This is done in omplower for two reasons:

  1. At the moment, it's too late to do this in oacc-xform because
     ompexpand is responsible for partitioning loops. This will likely
     get revisited later when we add support for kernels.

  2. omplower already has several tree walkers to scan for nesting
     errors and data mappings, etc. This is just another tree walk
     for acc parallel regions.

There are a couple of problems with this patch. First, I make no attempt
to determine the optimal work-sharing clause for a particular loop.
Instead, I assign the lowest (i.e. gang before worker before vector)
available parallelism to the outermost loop. At this point, that's
better than nothing. The second issue is, while adding clauses does let
ompexpand partition acc loops, we are not setting default values for
num_gangs, num_workers and vector_length yet (although we do set
vector_length to 32 when num_workers != 1).

It should be noted that this optimization only applies to acc loops
inside parallel regions. I probably could expand it to acc loops inside
acc routines, but technically acc routines are only supposed to have one
level of parallelism anyway. It also probably could be expanded to
handle independent loops inside kernels regions too.

Is this patch ok for gomp-4_0-branch or should I hold off until the
kernels situation gets resolved?

Cesar
diff mbox

Patch

2015-09-09  Cesar Philippidis  <cesar@codesourcery.com>

	gcc/
	* omp-low.c (struct oacc_gwv): New struct.
	(filter_omp_clause): New function.
	(set_oacc_parallel_loop_gwv_1): New function.
	(set_oacc_parallel_loop_gwv): New function.
	(scan_omp_for): Use filer_omp_clause to remove the stale reductions.
	(scan_omp_target): Automatically assign gang, worker and vector
	clauses to auto and independent loop with any worksharing clauses
	inside parallel regions.

	gcc/testsuite/
	* gfortran.dg/goacc/dtype-1.f95: Update xfails to account for the
	automatic parallelism in acc parallel regions.
	* c-c++-common/goacc/dtype-1.c: Likewise.
	* c-c++-common/goacc/par-auto-1.c: New test.
	* c-c++-common/goacc/par-auto-2.c: New test.
	* c-c++-common/goacc/par-auto-3.c: New test.


diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index bfef298..2d79ad1 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -237,6 +237,13 @@  struct omp_for_data
   struct omp_for_data_loop *loops;
 };
 
+/* A structure for automatically adding parallelism to OpenACC loops.  */
+
+struct oacc_gwv
+{
+  short gwv;
+  bool update;
+};
 
 static splay_tree all_contexts;
 static int taskreg_nesting_level;
@@ -2596,6 +2603,191 @@  oacc_loop_or_target_p (gimple stmt)
 	      && gimple_omp_for_kind (stmt) == GF_OMP_FOR_KIND_OACC_LOOP));
 }
 
+/* Remove all clauses of type CODE from the chain of omp CLAUSES.  */
+static tree
+filter_omp_clause (omp_clause_code code, tree clauses)
+{
+  /* First filter out the clauses at the beginning of the chain.  */
+  while (clauses
+	 && OMP_CLAUSE_CODE (clauses) == code)
+    {
+      clauses = OMP_CLAUSE_CHAIN (clauses);
+    }
+
+  if (clauses != NULL)
+    {
+      /* Filter out the remaining clauses.  */
+      for (tree c = OMP_CLAUSE_CHAIN (clauses), prev = clauses;
+	   c; c = OMP_CLAUSE_CHAIN (c))
+	{
+	  if (OMP_CLAUSE_CODE (c) == code)
+	    {
+	      tree t = OMP_CLAUSE_CHAIN (c);
+	      OMP_CLAUSE_CHAIN (prev) = t;
+	    }
+	  else
+	    prev = c;
+	}
+    }
+
+  return clauses;
+}
+
+/* Callback for walk_gimple_seq.  Set the appropriate level of parallelism
+   for an acc loop when possible.  Also remove a reduction clause if the
+   a loop doesn't have any parallelism associated with it.  */
+
+static tree
+set_oacc_parallel_loop_gwv_1 (gimple_stmt_iterator *gsi_p,
+			      bool *handled_ops_p,
+			      struct walk_stmt_info *wi)
+{
+  struct oacc_gwv *outer = (struct oacc_gwv *) wi->info;
+  struct oacc_gwv nested = { 0, false };
+  int local_gwv = 0, dim = 0, nested_dim = GOMP_DIM_MAX;
+  gimple stmt = gsi_stmt (*gsi_p);
+  bool is_seq = false;
+  tree clauses, c;
+
+  *handled_ops_p = true;
+
+  switch (gimple_code (stmt))
+    {
+    WALK_SUBSTMTS;
+
+    case GIMPLE_CALL:
+      {
+	tree fndecl = gimple_call_fndecl (stmt);
+	if (fndecl)
+	  {
+	    int call_gwv = extract_oacc_routine_gwv (fndecl);
+	    outer->gwv |= call_gwv;
+	  }
+      }
+      break;
+
+    case GIMPLE_OMP_FOR:
+      clauses = gimple_omp_for_clauses (stmt);
+
+      /* First pass of the clauses: extract the gwv parallelism associated
+	 with this loop.  */
+      for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+	switch (OMP_CLAUSE_CODE (c))
+	  {
+	  case OMP_CLAUSE_GANG:
+	    local_gwv |= GOMP_DIM_MASK (GOMP_DIM_GANG);
+	    break;
+	  case OMP_CLAUSE_WORKER:
+	    local_gwv |= GOMP_DIM_MASK (GOMP_DIM_WORKER);
+	    break;
+	  case OMP_CLAUSE_VECTOR:
+	    local_gwv |= GOMP_DIM_MASK (GOMP_DIM_VECTOR);
+	    break;
+	  case OMP_CLAUSE_SEQ:
+	    is_seq = true;
+	  default:
+	    ;
+	  }
+
+      outer->gwv |= local_gwv;
+
+      if (!outer->update)
+	break;
+
+      /* Loops with a non-zero gwv or seq clause don't need any additional
+	 parallelism.  */
+      if (!is_seq && local_gwv == 0)
+	{
+	  struct walk_stmt_info wi_nested;
+
+	  memset (&wi_nested, 0, sizeof (wi_nested));
+	  wi_nested.info = &nested;
+	  wi_nested.want_locations = true;
+
+	  walk_gimple_seq (gimple_omp_for_pre_body (stmt),
+			   set_oacc_parallel_loop_gwv_1, NULL, &wi_nested);
+	  walk_gimple_seq (gimple_omp_body (stmt),
+			   set_oacc_parallel_loop_gwv_1, NULL, &wi_nested);
+
+	  for (dim = GOMP_DIM_MAX;
+	       dim > 0 && (outer->gwv & GOMP_DIM_MASK (dim-1)) == 0;
+	       dim--)
+	    ;
+
+	  nested_dim = nested.gwv == 0 ? GOMP_DIM_MAX : ffs (nested.gwv)-1;
+
+	  if (dim < nested_dim)
+	    {
+	      switch (dim)
+		{
+		case GOMP_DIM_GANG:
+		  c = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE_GANG);
+		  break;
+		case GOMP_DIM_WORKER:
+		  c = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE_WORKER);
+		  break;
+		case GOMP_DIM_VECTOR:
+		  c = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE_VECTOR);
+		  break;
+		default:
+		  c = NULL_TREE;
+		}
+
+	      if (c)
+		{
+		  OMP_CLAUSE_CHAIN (c) = clauses;
+		  clauses = c;
+		  outer->gwv |= GOMP_DIM_MASK (dim);
+		}
+	    }
+	}
+
+      /* Remove any reductions associated with this loop since there
+	 isn't anymore available parallelism for it.  */
+      if (dim == GOMP_DIM_MAX || dim >= nested_dim || is_seq)
+	clauses = filter_omp_clause (OMP_CLAUSE_REDUCTION, clauses);
+      else if (c)
+	clauses = filter_omp_clause (OMP_CLAUSE_AUTO, clauses);
+
+      gimple_omp_for_set_clauses (stmt, clauses);
+
+      /* gimple_omp_for_{index,initial,final} are all DECLs; no need to
+	 walk them.  */
+      walk_gimple_seq (gimple_omp_for_pre_body (stmt),
+	  	       set_oacc_parallel_loop_gwv_1, NULL, wi);
+      walk_gimple_seq (gimple_omp_body (stmt), set_oacc_parallel_loop_gwv_1,
+		       NULL, wi);
+      wi->info = outer;
+      break;
+    default:
+      break;
+    }
+  return NULL;
+}
+
+/* Scan all of the statements inside the current OpenACC parallel
+   region for acc loops.  Partition loops with the lowest level of
+   available parallelism from gangs (lowest) to vectors (highest).  */
+
+static void
+set_oacc_parallel_loop_gwv (gimple_seq *body_p, omp_context *ctx)
+{
+  if (!is_oacc_parallel (ctx))
+    return;
+
+  location_t saved_location;
+  struct walk_stmt_info wi;
+  struct oacc_gwv gwv = { 0, true };
+
+  memset (&wi, 0, sizeof (wi));
+  wi.info = &gwv;
+  wi.want_locations = true;
+
+  saved_location = input_location;
+  walk_gimple_seq_mod (body_p, set_oacc_parallel_loop_gwv_1, NULL, &wi);
+  input_location = saved_location;
+}
+
 /* Scan a GIMPLE_OMP_FOR.  */
 
 static void
@@ -2684,28 +2876,7 @@  scan_omp_for (gomp_for *stmt, omp_context *outer_ctx)
 	 gangs, workers or vectors.  Such reductions are no-ops.  */
       if (extract_oacc_loop_mask (ctx) == 0)
 	{
-	  /* First filter out the clauses at the beginning of the chain.  */
-	  while (clauses && OMP_CLAUSE_CODE (clauses) == OMP_CLAUSE_REDUCTION)
-	    {
-	      clauses = OMP_CLAUSE_CHAIN (clauses);
-	    }
-
-	  if (clauses != NULL)
-	    {
-	      /* Filter out the remaining clauses.  */
-	      for (tree c = OMP_CLAUSE_CHAIN (clauses), prev = clauses;
-		   c; c = OMP_CLAUSE_CHAIN (c))
-		{
-		  if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION)
-		    {
-		      tree t = OMP_CLAUSE_CHAIN (c);
-		      OMP_CLAUSE_CHAIN (prev) = t;
-		    }
-		  else
-		    prev = c;
-		}
-	    }
-
+	  clauses = filter_omp_clause (OMP_CLAUSE_REDUCTION, clauses);
 	  gimple_omp_for_set_clauses (stmt, clauses);
 	}
     }
@@ -2824,6 +2995,7 @@  scan_omp_target (gomp_target *stmt, omp_context *outer_ctx)
 	}
     }
 
+  set_oacc_parallel_loop_gwv (gimple_omp_body_ptr (stmt), ctx);
   scan_sharing_clauses (clauses, ctx);
   scan_omp (gimple_omp_body_ptr (stmt), ctx);
 
diff --git a/gcc/testsuite/c-c++-common/goacc/dtype-1.c b/gcc/testsuite/c-c++-common/goacc/dtype-1.c
index d133766..9aa781f 100644
--- a/gcc/testsuite/c-c++-common/goacc/dtype-1.c
+++ b/gcc/testsuite/c-c++-common/goacc/dtype-1.c
@@ -102,13 +102,13 @@  test ()
 
 /* { dg-final { scan-tree-dump-times "oacc_kernels device_type\\(\\*\\) \\\[ wait\\(0\\) async\\(0\\) \\\] device_type\\(nvidia\\) \\\[ wait\\(2\\) async\\(2\\) \\\] async\\(-1\\)" 1 "omplower" } } */
 
-/* { dg-final { scan-tree-dump-times "acc loop device_type\\(nvidia\\) \\\[ tile\\(1\\) gang \\\] private\\(i1\\.0\\) private\\(i1\\)" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "acc loop gang device_type\\(nvidia\\) \\\[ tile\\(1\\) gang \\\] private\\(i1\\.0\\) private\\(i1\\)" 1 "omplower" } } */
 
-/* { dg-final { scan-tree-dump-times "acc loop device_type\\(\\*\\) \\\[ seq \\\] device_type\\(nvidia\\) \\\[ tile\\(1\\) gang \\\] private\\(i1\\.1\\) private\\(i1\\)" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "acc loop gang device_type\\(\\*\\) \\\[ seq \\\] device_type\\(nvidia\\) \\\[ tile\\(1\\) gang \\\] private\\(i1\\.1\\) private\\(i1\\)" 1 "omplower" } } */
 
-/* { dg-final { scan-tree-dump-times "acc loop device_type\\(nvidia\\) \\\[ collapse\\(1\\) worker \\\] private\\(i2\\)" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "acc loop worker device_type\\(nvidia\\) \\\[ collapse\\(1\\) worker \\\] private\\(i2\\)" 1 "omplower" } } */
 
-/* { dg-final { scan-tree-dump-times "acc loop device_type\\(nvidia\\) \\\[ vector \\\] private\\(i3\\)" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "acc loop vector device_type\\(nvidia\\) \\\[ vector \\\] private\\(i3\\)" 1 "omplower" } } */
 
 /* { dg-final { scan-tree-dump-times "acc loop device_type\\(nvidia\\) \\\[ auto \\\] private\\(i4\\)" 1 "omplower" } } */
 
@@ -116,9 +116,9 @@  test ()
 
 /* { dg-final { scan-tree-dump-times "acc loop device_type\\(nvidia\\) \\\[ seq \\\] private\\(i6\\)" 2 "omplower" } } */
 
-/* { dg-final { scan-tree-dump-times "acc loop device_type\\(\\*\\) \\\[ seq \\\] device_type\\(nvidia\\) \\\[ collapse\\(1\\) worker \\\] private\\(i2\\)" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "acc loop worker device_type\\(\\*\\) \\\[ seq \\\] device_type\\(nvidia\\) \\\[ collapse\\(1\\) worker \\\] private\\(i2\\)" 1 "omplower" } } */
 
-/* { dg-final { scan-tree-dump-times "acc loop device_type\\(\\*\\) \\\[ seq \\\] device_type\\(nvidia\\) \\\[ vector \\\] private\\(i3\\)" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "acc loop vector device_type\\(\\*\\) \\\[ seq \\\] device_type\\(nvidia\\) \\\[ vector \\\] private\\(i3\\)" 1 "omplower" } } */
 
 /* { dg-final { scan-tree-dump-times "acc loop device_type\\(\\*\\) \\\[ seq \\\] device_type\\(nvidia\\) \\\[ auto \\\] private\\(i4\\)" 1 "omplower" } } */
 
diff --git a/gcc/testsuite/c-c++-common/goacc/par-auto-1.c b/gcc/testsuite/c-c++-common/goacc/par-auto-1.c
new file mode 100644
index 0000000..f13a1ed
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/par-auto-1.c
@@ -0,0 +1,24 @@ 
+/* { dg-do compile } */
+/* { dg-options "-fopenacc -fdump-tree-omplower" } */
+
+int
+main ()
+{
+  int red = 0;
+#pragma acc parallel copy (red)
+  {
+#pragma acc loop reduction (+:red) gang
+    for (int i = 0; i < 10; i++)
+#pragma acc loop reduction (+:red)
+      for (int j = 0; j < 10; j++)
+#pragma acc loop reduction (+:red)
+	for (int k = 0; k < 10; k++)
+	  red ++;
+  }
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "acc loop gang" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "acc loop worker" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "acc loop vector" 1 "omplower" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/par-auto-2.c b/gcc/testsuite/c-c++-common/goacc/par-auto-2.c
new file mode 100644
index 0000000..d40b34b
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/par-auto-2.c
@@ -0,0 +1,24 @@ 
+/* { dg-do compile } */
+/* { dg-options "-fopenacc -fdump-tree-omplower" } */
+
+int
+main ()
+{
+  int red = 0;
+#pragma acc parallel copy (red)
+  {
+#pragma acc loop reduction (+:red)
+    for (int i = 0; i < 10; i++)
+#pragma acc loop reduction (+:red) gang
+      for (int j = 0; j < 10; j++)
+#pragma acc loop reduction (+:red)
+	for (int k = 0; k < 10; k++)
+	  red ++;
+  }
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "acc loop private\\(i\\)" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "acc loop gang" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "acc loop worker" 1 "omplower" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/par-auto-3.c b/gcc/testsuite/c-c++-common/goacc/par-auto-3.c
new file mode 100644
index 0000000..dafc792
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/par-auto-3.c
@@ -0,0 +1,24 @@ 
+/* { dg-do compile } */
+/* { dg-options "-fopenacc -fdump-tree-omplower" } */
+
+int
+main ()
+{
+  int red = 0;
+#pragma acc parallel copy (red)
+  {
+#pragma acc loop reduction (+:red)
+    for (int i = 0; i < 10; i++)
+#pragma acc loop reduction (+:red) worker
+      for (int j = 0; j < 10; j++)
+#pragma acc loop reduction (+:red)
+	for (int k = 0; k < 10; k++)
+	  red ++;
+  }
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "acc loop gang" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "acc loop worker" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "acc loop vector" 1 "omplower" } } */
diff --git a/gcc/testsuite/gfortran.dg/goacc/dtype-1.f95 b/gcc/testsuite/gfortran.dg/goacc/dtype-1.f95
index 5919ae4..fe20cbb 100644
--- a/gcc/testsuite/gfortran.dg/goacc/dtype-1.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/dtype-1.f95
@@ -183,13 +183,13 @@  end subroutine sr5b
 
 ! { dg-final { scan-tree-dump-times "oacc_kernels device_type\\(\\*\\) \\\[ async\\(0\\) wait\\(0\\) \\\] device_type\\(nvidia_ptx\\) \\\[ async\\(1\\) wait\\(1\\) \\\] async\\(-1\\)" 1 "omplower" } }
 
-! { dg-final { scan-tree-dump-times "acc loop device_type\\(nvidia\\) \\\[ tile\\(1\\) gang \\\] private\\(i1\\) private\\(i1\\.1\\)" 1 "omplower" } }
+! { dg-final { scan-tree-dump-times "acc loop gang device_type\\(nvidia\\) \\\[ tile\\(1\\) gang \\\] private\\(i1\\) private\\(i1\\.1\\)" 1 "omplower" } }
 
-! { dg-final { scan-tree-dump-times "acc loop device_type\\(\\*\\) \\\[ seq \\\] device_type\\(nvidia\\) \\\[ tile\\(1\\) gang \\\] private\\(i1\\) private\\(i1\\.2\\)" 1 "omplower" } }
+! { dg-final { scan-tree-dump-times "acc loop gang device_type\\(\\*\\) \\\[ seq \\\] device_type\\(nvidia\\) \\\[ tile\\(1\\) gang \\\] private\\(i1\\) private\\(i1\\.2\\)" 1 "omplower" } }
 
-! { dg-final { scan-tree-dump-times "acc loop device_type\\(nvidia\\) \\\[ collapse\\(1\\) worker \\\] private\\(i2\\)" 1 "omplower" } }
+! { dg-final { scan-tree-dump-times "acc loop worker device_type\\(nvidia\\) \\\[ collapse\\(1\\) worker \\\] private\\(i2\\)" 1 "omplower" } }
 
-! { dg-final { scan-tree-dump-times "acc loop device_type\\(nvidia\\) \\\[ vector \\\] private\\(i3\\)" 1 "omplower" } }
+! { dg-final { scan-tree-dump-times "acc loop vector device_type\\(nvidia\\) \\\[ vector \\\] private\\(i3\\)" 1 "omplower" } }
 
 ! { dg-final { scan-tree-dump-times "acc loop device_type\\(nvidia\\) \\\[ auto \\\] private\\(i4\\)" 1 "omplower" } }
 
@@ -197,7 +197,7 @@  end subroutine sr5b
 
 ! { dg-final { scan-tree-dump-times "acc loop device_type\\(nvidia\\) \\\[ seq \\\] private\\(i6\\)" 2 "omplower" } }
 
-! { dg-final { scan-tree-dump-times "acc loop device_type\\(\\*\\) \\\[ seq \\\] device_type\\(nvidia\\) \\\[ collapse\\(1\\) worker \\\] private\\(i2\\)" 1 "omplower" } }
+! { dg-final { scan-tree-dump-times "acc loop worker device_type\\(\\*\\) \\\[ seq \\\] device_type\\(nvidia\\) \\\[ collapse\\(1\\) worker \\\] private\\(i2\\)" 1 "omplower" } }
 
 ! { dg-final { scan-tree-dump-times "acc loop device_type\\(\\*\\) \\\[ seq \\\] device_type\\(nvidia\\) \\\[ auto \\\] private\\(i4\\)" 1 "omplower" } }