2015-09-09 Cesar Philippidis <cesar@codesourcery.com>
gcc/
* omp-low.c (struct oacc_gwv): New struct.
(filter_omp_clause): New function.
(set_oacc_parallel_loop_gwv_1): New function.
(set_oacc_parallel_loop_gwv): New function.
(scan_omp_for): Use filer_omp_clause to remove the stale reductions.
(scan_omp_target): Automatically assign gang, worker and vector
clauses to auto and independent loop with any worksharing clauses
inside parallel regions.
gcc/testsuite/
* gfortran.dg/goacc/dtype-1.f95: Update xfails to account for the
automatic parallelism in acc parallel regions.
* c-c++-common/goacc/dtype-1.c: Likewise.
* c-c++-common/goacc/par-auto-1.c: New test.
* c-c++-common/goacc/par-auto-2.c: New test.
* c-c++-common/goacc/par-auto-3.c: New test.
@@ -237,6 +237,13 @@ struct omp_for_data
struct omp_for_data_loop *loops;
};
+/* A structure for automatically adding parallelism to OpenACC loops. */
+
+struct oacc_gwv
+{
+ short gwv;
+ bool update;
+};
static splay_tree all_contexts;
static int taskreg_nesting_level;
@@ -2596,6 +2603,191 @@ oacc_loop_or_target_p (gimple stmt)
&& gimple_omp_for_kind (stmt) == GF_OMP_FOR_KIND_OACC_LOOP));
}
+/* Remove all clauses of type CODE from the chain of omp CLAUSES. */
+static tree
+filter_omp_clause (omp_clause_code code, tree clauses)
+{
+ /* First filter out the clauses at the beginning of the chain. */
+ while (clauses
+ && OMP_CLAUSE_CODE (clauses) == code)
+ {
+ clauses = OMP_CLAUSE_CHAIN (clauses);
+ }
+
+ if (clauses != NULL)
+ {
+ /* Filter out the remaining clauses. */
+ for (tree c = OMP_CLAUSE_CHAIN (clauses), prev = clauses;
+ c; c = OMP_CLAUSE_CHAIN (c))
+ {
+ if (OMP_CLAUSE_CODE (c) == code)
+ {
+ tree t = OMP_CLAUSE_CHAIN (c);
+ OMP_CLAUSE_CHAIN (prev) = t;
+ }
+ else
+ prev = c;
+ }
+ }
+
+ return clauses;
+}
+
+/* Callback for walk_gimple_seq. Set the appropriate level of parallelism
+ for an acc loop when possible. Also remove a reduction clause if the
+ a loop doesn't have any parallelism associated with it. */
+
+static tree
+set_oacc_parallel_loop_gwv_1 (gimple_stmt_iterator *gsi_p,
+ bool *handled_ops_p,
+ struct walk_stmt_info *wi)
+{
+ struct oacc_gwv *outer = (struct oacc_gwv *) wi->info;
+ struct oacc_gwv nested = { 0, false };
+ int local_gwv = 0, dim = 0, nested_dim = GOMP_DIM_MAX;
+ gimple stmt = gsi_stmt (*gsi_p);
+ bool is_seq = false;
+ tree clauses, c;
+
+ *handled_ops_p = true;
+
+ switch (gimple_code (stmt))
+ {
+ WALK_SUBSTMTS;
+
+ case GIMPLE_CALL:
+ {
+ tree fndecl = gimple_call_fndecl (stmt);
+ if (fndecl)
+ {
+ int call_gwv = extract_oacc_routine_gwv (fndecl);
+ outer->gwv |= call_gwv;
+ }
+ }
+ break;
+
+ case GIMPLE_OMP_FOR:
+ clauses = gimple_omp_for_clauses (stmt);
+
+ /* First pass of the clauses: extract the gwv parallelism associated
+ with this loop. */
+ for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+ switch (OMP_CLAUSE_CODE (c))
+ {
+ case OMP_CLAUSE_GANG:
+ local_gwv |= GOMP_DIM_MASK (GOMP_DIM_GANG);
+ break;
+ case OMP_CLAUSE_WORKER:
+ local_gwv |= GOMP_DIM_MASK (GOMP_DIM_WORKER);
+ break;
+ case OMP_CLAUSE_VECTOR:
+ local_gwv |= GOMP_DIM_MASK (GOMP_DIM_VECTOR);
+ break;
+ case OMP_CLAUSE_SEQ:
+ is_seq = true;
+ default:
+ ;
+ }
+
+ outer->gwv |= local_gwv;
+
+ if (!outer->update)
+ break;
+
+ /* Loops with a non-zero gwv or seq clause don't need any additional
+ parallelism. */
+ if (!is_seq && local_gwv == 0)
+ {
+ struct walk_stmt_info wi_nested;
+
+ memset (&wi_nested, 0, sizeof (wi_nested));
+ wi_nested.info = &nested;
+ wi_nested.want_locations = true;
+
+ walk_gimple_seq (gimple_omp_for_pre_body (stmt),
+ set_oacc_parallel_loop_gwv_1, NULL, &wi_nested);
+ walk_gimple_seq (gimple_omp_body (stmt),
+ set_oacc_parallel_loop_gwv_1, NULL, &wi_nested);
+
+ for (dim = GOMP_DIM_MAX;
+ dim > 0 && (outer->gwv & GOMP_DIM_MASK (dim-1)) == 0;
+ dim--)
+ ;
+
+ nested_dim = nested.gwv == 0 ? GOMP_DIM_MAX : ffs (nested.gwv)-1;
+
+ if (dim < nested_dim)
+ {
+ switch (dim)
+ {
+ case GOMP_DIM_GANG:
+ c = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE_GANG);
+ break;
+ case GOMP_DIM_WORKER:
+ c = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE_WORKER);
+ break;
+ case GOMP_DIM_VECTOR:
+ c = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE_VECTOR);
+ break;
+ default:
+ c = NULL_TREE;
+ }
+
+ if (c)
+ {
+ OMP_CLAUSE_CHAIN (c) = clauses;
+ clauses = c;
+ outer->gwv |= GOMP_DIM_MASK (dim);
+ }
+ }
+ }
+
+ /* Remove any reductions associated with this loop since there
+ isn't anymore available parallelism for it. */
+ if (dim == GOMP_DIM_MAX || dim >= nested_dim || is_seq)
+ clauses = filter_omp_clause (OMP_CLAUSE_REDUCTION, clauses);
+ else if (c)
+ clauses = filter_omp_clause (OMP_CLAUSE_AUTO, clauses);
+
+ gimple_omp_for_set_clauses (stmt, clauses);
+
+ /* gimple_omp_for_{index,initial,final} are all DECLs; no need to
+ walk them. */
+ walk_gimple_seq (gimple_omp_for_pre_body (stmt),
+ set_oacc_parallel_loop_gwv_1, NULL, wi);
+ walk_gimple_seq (gimple_omp_body (stmt), set_oacc_parallel_loop_gwv_1,
+ NULL, wi);
+ wi->info = outer;
+ break;
+ default:
+ break;
+ }
+ return NULL;
+}
+
+/* Scan all of the statements inside the current OpenACC parallel
+ region for acc loops. Partition loops with the lowest level of
+ available parallelism from gangs (lowest) to vectors (highest). */
+
+static void
+set_oacc_parallel_loop_gwv (gimple_seq *body_p, omp_context *ctx)
+{
+ if (!is_oacc_parallel (ctx))
+ return;
+
+ location_t saved_location;
+ struct walk_stmt_info wi;
+ struct oacc_gwv gwv = { 0, true };
+
+ memset (&wi, 0, sizeof (wi));
+ wi.info = &gwv;
+ wi.want_locations = true;
+
+ saved_location = input_location;
+ walk_gimple_seq_mod (body_p, set_oacc_parallel_loop_gwv_1, NULL, &wi);
+ input_location = saved_location;
+}
+
/* Scan a GIMPLE_OMP_FOR. */
static void
@@ -2684,28 +2876,7 @@ scan_omp_for (gomp_for *stmt, omp_context *outer_ctx)
gangs, workers or vectors. Such reductions are no-ops. */
if (extract_oacc_loop_mask (ctx) == 0)
{
- /* First filter out the clauses at the beginning of the chain. */
- while (clauses && OMP_CLAUSE_CODE (clauses) == OMP_CLAUSE_REDUCTION)
- {
- clauses = OMP_CLAUSE_CHAIN (clauses);
- }
-
- if (clauses != NULL)
- {
- /* Filter out the remaining clauses. */
- for (tree c = OMP_CLAUSE_CHAIN (clauses), prev = clauses;
- c; c = OMP_CLAUSE_CHAIN (c))
- {
- if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION)
- {
- tree t = OMP_CLAUSE_CHAIN (c);
- OMP_CLAUSE_CHAIN (prev) = t;
- }
- else
- prev = c;
- }
- }
-
+ clauses = filter_omp_clause (OMP_CLAUSE_REDUCTION, clauses);
gimple_omp_for_set_clauses (stmt, clauses);
}
}
@@ -2824,6 +2995,7 @@ scan_omp_target (gomp_target *stmt, omp_context *outer_ctx)
}
}
+ set_oacc_parallel_loop_gwv (gimple_omp_body_ptr (stmt), ctx);
scan_sharing_clauses (clauses, ctx);
scan_omp (gimple_omp_body_ptr (stmt), ctx);
@@ -102,13 +102,13 @@ test ()
/* { dg-final { scan-tree-dump-times "oacc_kernels device_type\\(\\*\\) \\\[ wait\\(0\\) async\\(0\\) \\\] device_type\\(nvidia\\) \\\[ wait\\(2\\) async\\(2\\) \\\] async\\(-1\\)" 1 "omplower" } } */
-/* { dg-final { scan-tree-dump-times "acc loop device_type\\(nvidia\\) \\\[ tile\\(1\\) gang \\\] private\\(i1\\.0\\) private\\(i1\\)" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "acc loop gang device_type\\(nvidia\\) \\\[ tile\\(1\\) gang \\\] private\\(i1\\.0\\) private\\(i1\\)" 1 "omplower" } } */
-/* { dg-final { scan-tree-dump-times "acc loop device_type\\(\\*\\) \\\[ seq \\\] device_type\\(nvidia\\) \\\[ tile\\(1\\) gang \\\] private\\(i1\\.1\\) private\\(i1\\)" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "acc loop gang device_type\\(\\*\\) \\\[ seq \\\] device_type\\(nvidia\\) \\\[ tile\\(1\\) gang \\\] private\\(i1\\.1\\) private\\(i1\\)" 1 "omplower" } } */
-/* { dg-final { scan-tree-dump-times "acc loop device_type\\(nvidia\\) \\\[ collapse\\(1\\) worker \\\] private\\(i2\\)" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "acc loop worker device_type\\(nvidia\\) \\\[ collapse\\(1\\) worker \\\] private\\(i2\\)" 1 "omplower" } } */
-/* { dg-final { scan-tree-dump-times "acc loop device_type\\(nvidia\\) \\\[ vector \\\] private\\(i3\\)" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "acc loop vector device_type\\(nvidia\\) \\\[ vector \\\] private\\(i3\\)" 1 "omplower" } } */
/* { dg-final { scan-tree-dump-times "acc loop device_type\\(nvidia\\) \\\[ auto \\\] private\\(i4\\)" 1 "omplower" } } */
@@ -116,9 +116,9 @@ test ()
/* { dg-final { scan-tree-dump-times "acc loop device_type\\(nvidia\\) \\\[ seq \\\] private\\(i6\\)" 2 "omplower" } } */
-/* { dg-final { scan-tree-dump-times "acc loop device_type\\(\\*\\) \\\[ seq \\\] device_type\\(nvidia\\) \\\[ collapse\\(1\\) worker \\\] private\\(i2\\)" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "acc loop worker device_type\\(\\*\\) \\\[ seq \\\] device_type\\(nvidia\\) \\\[ collapse\\(1\\) worker \\\] private\\(i2\\)" 1 "omplower" } } */
-/* { dg-final { scan-tree-dump-times "acc loop device_type\\(\\*\\) \\\[ seq \\\] device_type\\(nvidia\\) \\\[ vector \\\] private\\(i3\\)" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "acc loop vector device_type\\(\\*\\) \\\[ seq \\\] device_type\\(nvidia\\) \\\[ vector \\\] private\\(i3\\)" 1 "omplower" } } */
/* { dg-final { scan-tree-dump-times "acc loop device_type\\(\\*\\) \\\[ seq \\\] device_type\\(nvidia\\) \\\[ auto \\\] private\\(i4\\)" 1 "omplower" } } */
new file mode 100644
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-fopenacc -fdump-tree-omplower" } */
+
+int
+main ()
+{
+ int red = 0;
+#pragma acc parallel copy (red)
+ {
+#pragma acc loop reduction (+:red) gang
+ for (int i = 0; i < 10; i++)
+#pragma acc loop reduction (+:red)
+ for (int j = 0; j < 10; j++)
+#pragma acc loop reduction (+:red)
+ for (int k = 0; k < 10; k++)
+ red ++;
+ }
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "acc loop gang" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "acc loop worker" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "acc loop vector" 1 "omplower" } } */
new file mode 100644
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-fopenacc -fdump-tree-omplower" } */
+
+int
+main ()
+{
+ int red = 0;
+#pragma acc parallel copy (red)
+ {
+#pragma acc loop reduction (+:red)
+ for (int i = 0; i < 10; i++)
+#pragma acc loop reduction (+:red) gang
+ for (int j = 0; j < 10; j++)
+#pragma acc loop reduction (+:red)
+ for (int k = 0; k < 10; k++)
+ red ++;
+ }
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "acc loop private\\(i\\)" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "acc loop gang" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "acc loop worker" 1 "omplower" } } */
new file mode 100644
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-fopenacc -fdump-tree-omplower" } */
+
+int
+main ()
+{
+ int red = 0;
+#pragma acc parallel copy (red)
+ {
+#pragma acc loop reduction (+:red)
+ for (int i = 0; i < 10; i++)
+#pragma acc loop reduction (+:red) worker
+ for (int j = 0; j < 10; j++)
+#pragma acc loop reduction (+:red)
+ for (int k = 0; k < 10; k++)
+ red ++;
+ }
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "acc loop gang" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "acc loop worker" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "acc loop vector" 1 "omplower" } } */
@@ -183,13 +183,13 @@ end subroutine sr5b
! { dg-final { scan-tree-dump-times "oacc_kernels device_type\\(\\*\\) \\\[ async\\(0\\) wait\\(0\\) \\\] device_type\\(nvidia_ptx\\) \\\[ async\\(1\\) wait\\(1\\) \\\] async\\(-1\\)" 1 "omplower" } }
-! { dg-final { scan-tree-dump-times "acc loop device_type\\(nvidia\\) \\\[ tile\\(1\\) gang \\\] private\\(i1\\) private\\(i1\\.1\\)" 1 "omplower" } }
+! { dg-final { scan-tree-dump-times "acc loop gang device_type\\(nvidia\\) \\\[ tile\\(1\\) gang \\\] private\\(i1\\) private\\(i1\\.1\\)" 1 "omplower" } }
-! { dg-final { scan-tree-dump-times "acc loop device_type\\(\\*\\) \\\[ seq \\\] device_type\\(nvidia\\) \\\[ tile\\(1\\) gang \\\] private\\(i1\\) private\\(i1\\.2\\)" 1 "omplower" } }
+! { dg-final { scan-tree-dump-times "acc loop gang device_type\\(\\*\\) \\\[ seq \\\] device_type\\(nvidia\\) \\\[ tile\\(1\\) gang \\\] private\\(i1\\) private\\(i1\\.2\\)" 1 "omplower" } }
-! { dg-final { scan-tree-dump-times "acc loop device_type\\(nvidia\\) \\\[ collapse\\(1\\) worker \\\] private\\(i2\\)" 1 "omplower" } }
+! { dg-final { scan-tree-dump-times "acc loop worker device_type\\(nvidia\\) \\\[ collapse\\(1\\) worker \\\] private\\(i2\\)" 1 "omplower" } }
-! { dg-final { scan-tree-dump-times "acc loop device_type\\(nvidia\\) \\\[ vector \\\] private\\(i3\\)" 1 "omplower" } }
+! { dg-final { scan-tree-dump-times "acc loop vector device_type\\(nvidia\\) \\\[ vector \\\] private\\(i3\\)" 1 "omplower" } }
! { dg-final { scan-tree-dump-times "acc loop device_type\\(nvidia\\) \\\[ auto \\\] private\\(i4\\)" 1 "omplower" } }
@@ -197,7 +197,7 @@ end subroutine sr5b
! { dg-final { scan-tree-dump-times "acc loop device_type\\(nvidia\\) \\\[ seq \\\] private\\(i6\\)" 2 "omplower" } }
-! { dg-final { scan-tree-dump-times "acc loop device_type\\(\\*\\) \\\[ seq \\\] device_type\\(nvidia\\) \\\[ collapse\\(1\\) worker \\\] private\\(i2\\)" 1 "omplower" } }
+! { dg-final { scan-tree-dump-times "acc loop worker device_type\\(\\*\\) \\\[ seq \\\] device_type\\(nvidia\\) \\\[ collapse\\(1\\) worker \\\] private\\(i2\\)" 1 "omplower" } }
! { dg-final { scan-tree-dump-times "acc loop device_type\\(\\*\\) \\\[ seq \\\] device_type\\(nvidia\\) \\\[ auto \\\] private\\(i4\\)" 1 "omplower" } }