@@ -39,6 +39,7 @@ along with GCC; see the file COPYING3. If not see
#include "gimple-iterator.h"
#include "gimple-walk.h"
#include "gomp-constants.h"
+#include "omp-general.h"
/* This is a preprocessing pass to be run immediately before lower_omp. It
will convert OpenACC "kernels" regions into sequences of "parallel"
@@ -135,6 +136,95 @@ top_level_omp_for_in_stmt (gimple *stmt)
return NULL;
}
+/* Helper for adjust_region_code: evaluate the statement at GSI_P. */
+
+static tree
+adjust_region_code_walk_stmt_fn (gimple_stmt_iterator *gsi_p,
+ bool *handled_ops_p,
+ struct walk_stmt_info *wi)
+{
+ int *region_code = (int *) wi->info;
+
+ gimple *stmt = gsi_stmt (*gsi_p);
+ switch (gimple_code (stmt))
+ {
+ case GIMPLE_OMP_FOR:
+ {
+ tree clauses = gimple_omp_for_clauses (stmt);
+ if (omp_find_clause (clauses, OMP_CLAUSE_INDEPENDENT))
+ {
+ /* Explicit 'independent' clause. */
+ /* Keep going; recurse into loop body. */
+ break;
+ }
+ else if (omp_find_clause (clauses, OMP_CLAUSE_SEQ))
+ {
+ /* Explicit 'seq' clause. */
+ /* We'll "parallelize" if at some level a loop construct has been
+ marked up by the user as unparallelizable ('seq' clause; we'll
+ respect that in the later processing). Given that the user has
+ explicitly marked it up, this loop construct cannot be
+ performance-critical (and we thus don't have to "avoid
+ offloading"), and in this case it's also fine to "parallelize"
+ instead of "gang-single", because any outer or inner loops may
+ still exploit the available parallelism. */
+ /* Keep going; recurse into loop body. */
+ break;
+ }
+ else
+ {
+ /* Explicit or implicit 'auto' clause. */
+ /* The user would like this loop analyzed ('auto' clause) and
+ typically parallelized, but we don't have available yet the
+ compiler logic to analyze this, so can't parallelize it here, so
+ we'd very likely be running into a performance problem if we
+ were to execute this unparallelized, thus forward the whole loop
+ nest to "parloops". */
+ *region_code = GF_OMP_TARGET_KIND_OACC_KERNELS;
+ /* Terminate: final decision for this region. */
+ *handled_ops_p = true;
+ return integer_zero_node;
+ }
+ gcc_unreachable ();
+ }
+
+ case GIMPLE_COND:
+ case GIMPLE_GOTO:
+ case GIMPLE_SWITCH:
+ case GIMPLE_ASM:
+ case GIMPLE_TRANSACTION:
+ case GIMPLE_RETURN:
+ /* Statement that might constitute some looping/control flow pattern. */
+ /* The user would like this code analyzed (implicit inside a 'kernels'
+ region) and typically parallelized, but we don't have available yet
+ the compiler logic to analyze this, so can't parallelize it here, so
+ we'd very likely be running into a performance problem if we were to
+ execute this unparallelized, thus forward the whole thing to
+ "parloops". */
+ *region_code = GF_OMP_TARGET_KIND_OACC_KERNELS;
+ /* Terminate: final decision for this region. */
+ *handled_ops_p = true;
+ return integer_zero_node;
+
+ default:
+ /* Keep going. */
+ break;
+ }
+
+ return NULL;
+}
+
+/* Adjust the REGION_CODE for the region in GS. */
+
+static void
+adjust_region_code (gimple_seq gs, int *region_code)
+{
+ struct walk_stmt_info wi;
+ memset (&wi, 0, sizeof (wi));
+ wi.info = region_code;
+ walk_gimple_seq (gs, adjust_region_code_walk_stmt_fn, NULL, &wi);
+}
+
/* Helper function for make_loops_gang_single for walking the tree. If the
statement indicated by GSI_P is an OpenACC for loop with a gang clause,
issue a warning and remove the clause. */
@@ -174,6 +264,7 @@ visit_loops_in_gang_single_region (gimple_stmt_iterator *gsi_p,
gimple_omp_for_set_clauses (stmt, clauses);
/* No need to recurse into nested statements; no loop nested inside
this loop can be gang-partitioned. */
+ sorry ("'gang' loop in \"gang-single\" region");
*handled_ops_p = true;
break;
@@ -184,16 +275,16 @@ visit_loops_in_gang_single_region (gimple_stmt_iterator
*gsi_p,
return NULL;
}
-/* Visit all nested OpenACC loops in the statement indicated by GSI. This
+/* Visit all nested OpenACC loops in the sequence indicated by GS. This
statement is expected to be inside a gang-single region. Issue a warning
for any loops inside it that have gang clauses and remove the clauses. */
static void
-make_loops_gang_single (gimple_stmt_iterator gsi)
+make_loops_gang_single (gimple_seq gs)
{
struct walk_stmt_info wi;
memset (&wi, 0, sizeof (wi));
- walk_gimple_stmt (&gsi, visit_loops_in_gang_single_region, NULL, &wi);
+ walk_gimple_seq (gs, visit_loops_in_gang_single_region, NULL, &wi);
}
/* Construct a "gang-single" OpenACC parallel region at LOC containing the
@@ -202,21 +293,75 @@ make_loops_gang_single (gimple_stmt_iterator gsi)
to force gang-single execution. */
static gimple *
-make_gang_single_region (location_t loc, gimple_seq stmts, tree clauses)
+make_region_seq (location_t loc, gimple_seq stmts,
+ tree num_gangs_clause,
+ tree num_workers_clause,
+ tree vector_length_clause,
+ tree clauses)
{
/* This correctly unshares the entire clause chain rooted here. */
clauses = unshare_expr (clauses);
- /* Make a num_gangs(1) clause. */
- tree gang_single_clause = build_omp_clause (loc, OMP_CLAUSE_NUM_GANGS);
- OMP_CLAUSE_OPERAND (gang_single_clause, 0) = integer_one_node;
- OMP_CLAUSE_CHAIN (gang_single_clause) = clauses;
+
+ dump_user_location_t loc_stmts_first = gimple_seq_first (stmts);
+
+ /* Figure out the region code for this region. */
+ /* Optimistic default: assume "setup code", no looping; thus not
+ performance-critical. */
+ int region_code = GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE;
+ adjust_region_code (stmts, ®ion_code);
+
+ if (region_code == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc_stmts_first,
+ "beginning \"gang-single\" region in OpenACC"
+ " 'kernels' construct\n");
+
+ /* Make a num_gangs(1) clause. */
+ tree gang_single_clause = build_omp_clause (loc, OMP_CLAUSE_NUM_GANGS);
+ OMP_CLAUSE_OPERAND (gang_single_clause, 0) = integer_one_node;
+ OMP_CLAUSE_CHAIN (gang_single_clause) = clauses;
+ clauses = gang_single_clause;
+
+ /* Remove and issue warnings about gang clauses on any OpenACC
+ loops nested inside this sequentially executed statement. */
+ make_loops_gang_single (stmts);
+ }
+ else if (region_code == GF_OMP_TARGET_KIND_OACC_KERNELS)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc_stmts_first,
+ "beginning \"parloops\" region in OpenACC"
+ " 'kernels' construct\n");
+
+ /* As we're transforming a "GF_OMP_TARGET_KIND_OACC_KERNELS" into another
+ "GF_OMP_TARGET_KIND_OACC_KERNELS", this isn't doing any of the clauses
+ mangling that "make_region_loop_nest" is doing. */
+ /* Re-assemble the clauses stripped off earlier. */
+ if (num_gangs_clause != NULL)
+ {
+ tree c = unshare_expr (num_gangs_clause);
+ OMP_CLAUSE_CHAIN (c) = clauses;
+ clauses = c;
+ }
+ if (num_workers_clause != NULL)
+ {
+ tree c = unshare_expr (num_workers_clause);
+ OMP_CLAUSE_CHAIN (c) = clauses;
+ clauses = c;
+ }
+ if (vector_length_clause != NULL)
+ {
+ tree c = unshare_expr (vector_length_clause);
+ OMP_CLAUSE_CHAIN (c) = clauses;
+ clauses = c;
+ }
+ }
+ else
+ gcc_unreachable ();
/* Build the gang-single region. */
- gimple *single_region
- = gimple_build_omp_target (
- NULL,
- GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE,
- gang_single_clause);
+ gimple *single_region = gimple_build_omp_target (NULL, region_code, clauses);
gimple_set_location (single_region, loc);
gbind *single_body = gimple_build_bind (NULL, stmts, make_node (BLOCK));
gimple_omp_set_body (single_region, single_body);
@@ -224,7 +369,7 @@ make_gang_single_region (location_t loc, gimple_seq stmts,
tree clauses)
return single_region;
}
-/* Helper function for make_gang_parallel_loop_region. Adds a num_gangs
+/* Helper function for make_region_loop_nest. Adds a num_gangs
(num_workers, vector_length) clause to the given CLAUSES, either the one
from the parent region (PARENT_CLAUSE) or a new one based on the loop's
own LOOP_CLAUSE ("gang(num: N)" or similar for workers or vectors) with
@@ -256,7 +401,7 @@ add_parent_or_loop_num_clause (tree parent_clause, tree
loop_clause,
return clauses;
}
-/* Helper for make_gang_parallel_loop_region, looking for "worker(num: N)"
+/* Helper for make_region_loop_nest, looking for "worker(num: N)"
or "vector(length: N)" clauses in nested loops. Removes the numeric
argument, transferring it to the enclosing parallel region (via
WI->INFO). If numeric arguments within the same loop nest conflict,
@@ -493,32 +638,65 @@ transform_kernels_loop_clauses (gimple *omp_for,
adjust_nested_loop_clauses function. */
static gimple *
-make_gang_parallel_loop_region (gimple *omp_for, gimple *stmt,
- tree num_gangs_clause,
- tree num_workers_clause,
- tree vector_length_clause,
- tree clauses)
+make_region_loop_nest (gimple *omp_for, gimple_seq stmts,
+ tree num_gangs_clause,
+ tree num_workers_clause,
+ tree vector_length_clause,
+ tree clauses)
{
/* This correctly unshares the entire clause chain rooted here. */
clauses = unshare_expr (clauses);
- clauses = transform_kernels_loop_clauses (omp_for,
- num_gangs_clause,
- num_workers_clause,
- vector_length_clause,
- clauses);
+ /* Figure out the region code for this region. */
+ /* Optimistic default: assume that the loop nest is parallelizable
+ (essentially, no GIMPLE_OMP_FOR with (explicit or implicit) 'auto' clause,
+ and no un-annotated loops). */
+ int region_code = GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED;
+ adjust_region_code (stmts, ®ion_code);
+
+ if (region_code == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, omp_for,
+ "parallelized loop nest in OpenACC 'kernels'"
+ " construct\n");
+
+ clauses = transform_kernels_loop_clauses (omp_for,
+ num_gangs_clause,
+ num_workers_clause,
+ vector_length_clause,
+ clauses);
+ }
+ else if (region_code == GF_OMP_TARGET_KIND_OACC_KERNELS)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, omp_for,
+ "forwarded loop nest in OpenACC 'kernels' construct"
+ " to \"parloops\" for analysis\n");
+
+ /* We're transforming one "GF_OMP_TARGET_KIND_OACC_KERNELS" into another
+ "GF_OMP_TARGET_KIND_OACC_KERNELS", so don't have to
+ "transform_kernels_loop_clauses". */
+ /* Re-assemble the clauses stripped off earlier. */
+ clauses
+ = add_parent_or_loop_num_clause (num_gangs_clause, NULL,
+ OMP_CLAUSE_NUM_GANGS, clauses);
+ clauses
+ = add_parent_or_loop_num_clause (num_workers_clause, NULL,
+ OMP_CLAUSE_NUM_WORKERS, clauses);
+ clauses
+ = add_parent_or_loop_num_clause (vector_length_clause, NULL,
+ OMP_CLAUSE_VECTOR_LENGTH, clauses);
+ }
+ else
+ gcc_unreachable ();
/* Now build the parallel region containing this loop. */
- gimple_seq parallel_body = NULL;
- gimple_seq_add_stmt (¶llel_body, stmt);
gimple *parallel_body_bind
- = gimple_build_bind (NULL, parallel_body, make_node (BLOCK));
+ = gimple_build_bind (NULL, stmts, make_node (BLOCK));
gimple *parallel_region
- = gimple_build_omp_target (
- parallel_body_bind,
- GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED,
- clauses);
- gimple_set_location (parallel_region, gimple_location (stmt));
+ = gimple_build_omp_target (parallel_body_bind, region_code, clauses);
+ gimple_set_location (parallel_region, gimple_location (omp_for));
return parallel_region;
}
@@ -744,9 +922,9 @@ class control_flow_regions
information for the statement sequence SEQ. */
control_flow_regions (gimple_seq seq);
- /* Return true if the STMT with the given index IDX in the analyzed
+ /* Return true if the statement with the given index IDX in the analyzed
statement sequence is an unconditionally executed OpenACC loop. */
- bool is_unconditional_oacc_for_loop (gimple *stmt, size_t idx);
+ bool is_unconditional_oacc_for_loop (size_t idx);
private:
/* Find the region representative for the statement identified by index
@@ -777,11 +955,8 @@ control_flow_regions::control_flow_regions (gimple_seq seq)
}
bool
-control_flow_regions::is_unconditional_oacc_for_loop (gimple *stmt, size_t idx)
+control_flow_regions::is_unconditional_oacc_for_loop (size_t idx)
{
- if (top_level_omp_for_in_stmt (stmt) == NULL)
- /* Not an OpenACC for loop. */
- return false;
if (idx == 0 || idx == representatives.length () - 1)
/* The first or last statement in the kernels region. This means that
there is no room before or after it for a jump or a label. Thus
@@ -917,7 +1092,7 @@ control_flow_regions::compute_regions (gimple_seq seq)
}
/* Decompose the body of the KERNELS_REGION, which was originally annotated
- with the KERNELS_CLAUSES, into a series of parallel regions. */
+ with the KERNELS_CLAUSES, into a series of regions. */
static gimple *
decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses)
@@ -1057,17 +1232,24 @@ decompose_kernels_region_body (gimple *kernels_region,
tree kernels_clauses)
gimple *stmt = gsi_stmt (gsi);
gimple *omp_for = top_level_omp_for_in_stmt (stmt);
+ bool is_unconditional_oacc_for_loop = false;
+ if (omp_for != NULL)
+ is_unconditional_oacc_for_loop
+ = cf_regions.is_unconditional_oacc_for_loop (idx);
if (omp_for != NULL
- && cf_regions.is_unconditional_oacc_for_loop (stmt, idx))
+ && is_unconditional_oacc_for_loop)
{
- /* This is an OMP for statement, put it into a parallel region.
+ /* This is an OMP for statement, put it into a separate region.
But first, construct a gang-single region containing any
complex sequential statements we may have seen. */
if (gang_single_seq != NULL && !only_simple_assignments)
{
gimple *single_region
- = make_gang_single_region (loc, gang_single_seq,
- kernels_clauses);
+ = make_region_seq (loc, gang_single_seq,
+ num_gangs_clause,
+ num_workers_clause,
+ vector_length_clause,
+ kernels_clauses);
gimple_seq_add_stmt (®ion_body, single_region);
}
else if (gang_single_seq != NULL && only_simple_assignments)
@@ -1085,8 +1267,10 @@ decompose_kernels_region_body (gimple *kernels_region,
tree kernels_clauses)
gang_single_seq = NULL;
only_simple_assignments = true;
+ gimple_seq parallel_seq = NULL;
+ gimple_seq_add_stmt (¶llel_seq, stmt);
gimple *parallel_region
- = make_gang_parallel_loop_region (omp_for, stmt,
+ = make_region_loop_nest (omp_for, parallel_seq,
num_gangs_clause,
num_workers_clause,
vector_length_clause,
@@ -1095,6 +1279,16 @@ decompose_kernels_region_body (gimple *kernels_region,
tree kernels_clauses)
}
else
{
+ if (omp_for != NULL)
+ {
+ gcc_checking_assert (!is_unconditional_oacc_for_loop);
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, omp_for,
+ "unparallelized loop nest in OpenACC"
+ " 'kernels' region: it's executed"
+ " conditionally\n");
+ }
+
/* This is not an unconditional OMP for statement, so it will be
put into a gang-single region. */
gimple_seq_add_stmt (&gang_single_seq, stmt);
@@ -1107,9 +1301,6 @@ decompose_kernels_region_body (gimple *kernels_region,
tree kernels_clauses)
&& DECL_ARTIFICIAL (gimple_assign_lhs (stmt)));
if (!is_simple_assignment)
only_simple_assignments = false;
- /* Remove and issue warnings about gang clauses on any OpenACC
- loops nested inside this sequentially executed statement. */
- make_loops_gang_single (gsi);
}
}
@@ -1128,7 +1319,11 @@ decompose_kernels_region_body (gimple *kernels_region,
tree kernels_clauses)
if (gang_single_seq != NULL)
{
gimple *single_region
- = make_gang_single_region (loc, gang_single_seq, kernels_clauses);
+ = make_region_seq (loc, gang_single_seq,
+ num_gangs_clause,
+ num_workers_clause,
+ vector_length_clause,
+ kernels_clauses);
gimple_seq_add_stmt (®ion_body, single_region);
}
b/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c
@@ -12,19 +12,22 @@ main (void)
unsigned int sum = 1;
#pragma acc kernels copyin(a[0:N]) copy(sum)
- /* { dg-bogus "region contains gang partitoned code but is not gang
partitioned" "gang partitioned" { xfail *-*-* } .-1 } */
{
+ /* converted to "oacc_kernels" */
#pragma acc loop
for (i = 0; i < N; ++i)
sum += a[i];
+ /* converted to "oacc_parallel_kernels_gang_single" */
sum++;
a[0]++;
- #pragma acc loop
+ /* converted to "oacc_parallel_kernels_parallelized" */
+ #pragma acc loop independent
for (i = 0; i < N; ++i)
sum += a[i];
+ /* converted to "oacc_kernels" */
if (sum > 10)
{
#pragma acc loop
@@ -32,7 +35,8 @@ main (void)
sum += a[i];
}
- #pragma acc loop
+ /* converted to "oacc_kernels" */
+ #pragma acc loop auto
for (i = 0; i < N; ++i)
sum += a[i];
}
@@ -44,10 +48,11 @@ main (void)
parallel regions. */
/* { dg-final { scan-tree-dump-times "oacc_data_kernels" 1
"convert_oacc_kernels" } } */
-/* The three unconditional loop regions are parallelized, the sequential
- part in between and the conditional loop are made gang-single. */
-/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized" 3
"convert_oacc_kernels" } } */
-/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single" 2
"convert_oacc_kernels" } } */
+/* As noted in the comments above, we get one gang-single serial region; one
+ parallelized loop region; and three "old-style" kernel regions. */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single" 1
"convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized" 1
"convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "oacc_kernels" 3 "convert_oacc_kernels" } } */
/* Each of the parallel regions is async, and there is a final call to
__builtin_GOACC_wait. */
b/gcc/testsuite/c-c++-common/goacc/kernels-decompose-1.c
new file mode 100644
@@ -0,0 +1,123 @@
+/* Test OpenACC 'kernels' construct decomposition. */
+
+/* { dg-additional-options "-fopenacc-kernels=split" } */
+/* { dg-additional-options "-fopt-info-optimized-omp" } */
+/* { dg-additional-options "-O2" } for "parloops". */
+
+/* See also "../../gfortran.dg/goacc/kernels-decompose-1.f95". */
+
+#pragma acc routine gang
+extern int
+f_g (int);
+
+#pragma acc routine worker
+extern int
+f_w (int);
+
+#pragma acc routine vector
+extern int
+f_v (int);
+
+#pragma acc routine seq
+extern int
+f_s (int);
+
+int
+main ()
+{
+ int x, y, z;
+#define N 10
+ int a[N], b[N], c[N];
+
+#pragma acc kernels
+ {
+ x = 0; /* { dg-message "note: beginning .gang-single. region in OpenACC
.kernels. construct" } */
+ y = x < 10;
+ z = x++;
+ ;
+ }
+
+#pragma acc kernels
+ for (int i = 0; i < N; i++) /* { dg-message "note: beginning .parloops.
region in OpenACC .kernels. construct" } */
+ a[i] = 0;
+
+#pragma acc kernels loop
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. construct to
.parloops. for analysis" "" { target *-*-* } .-1 } */
+ for (int i = 0; i < N; i++)
+ b[i] = a[N - i - 1];
+
+#pragma acc kernels
+ {
+#pragma acc loop
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. construct
to .parloops. for analysis" "" { target *-*-* } .-1 } */
+ for (int i = 0; i < N; i++)
+ b[i] = a[N - i - 1];
+
+#pragma acc loop
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. construct
to .parloops. for analysis" "" { target *-*-* } .-1 } */
+ for (int i = 0; i < N; i++)
+ c[i] = a[i] * b[i];
+
+ a[z] = 0; /* { dg-message "note: beginning .gang-single. region in OpenACC
.kernels. construct" } */
+
+#pragma acc loop
+ /* { dg-message "note: forwarded loop nest in OpenACC .kernels. construct
to .parloops. for analysis" "" { target *-*-* } .-1 } */
+ for (int i = 0; i < N; i++)
+ c[i] += a[i];
+
+#pragma acc loop seq /* { dg-message "note: assigned OpenACC seq loop
parallelism" } */
+ /* { dg-message "note: parallelized loop nest in OpenACC .kernels.
construct" "" { target *-*-* } .-1 } */
+ for (int i = 0 + 1; i < N; i++)
+ c[i] += c[i - 1];
+ }
+
+#pragma acc kernels
+ {
+#pragma acc loop independent /* { dg-message "note: assigned OpenACC gang loop
parallelism" } */
+ /* { dg-message "note: parallelized loop nest in OpenACC .kernels.
construct" "" { target *-*-* } .-1 } */
+ for (int i = 0; i < N; ++i)
+#pragma acc loop independent /* { dg-message "note: assigned OpenACC worker
loop parallelism" } */
+ for (int j = 0; j < N; ++j)
+#pragma acc loop independent /* { dg-message "note: assigned OpenACC seq loop
parallelism" } */
+ /* { dg-warning "insufficient partitioning available to parallelize loop" ""
{ target *-*-* } .-1 } */
+ for (int k = 0; k < N; ++k)
+ a[(i + j + k) % N]
+ = b[j]
+ + f_v (c[k]); /* { dg-message "note: assigned OpenACC vector loop
parallelism" } */
+
+ //TODO Should the following turn into "gang-single" instead of "parloops"?
+ //TODO The problem is that the first STMT is "if (y <= 4) goto <D.2547>;
else goto <D.2548>;", thus "parloops".
+ if (y < 5) /* { dg-message "note: beginning .parloops. region in OpenACC
.kernels. construct" } */
+#pragma acc loop independent /* { dg-message "note: unparallelized loop nest in
OpenACC .kernels. region: it's executed conditionally" } */
+ for (int j = 0; j < N; ++j)
+ b[j] = f_w (c[j]);
+ }
+
+#pragma acc kernels /* { dg-warning "region contains gang partitoned code but
is not gang partitioned" } */
+ {
+ /* { dg-message "note: beginning .gang-single. region in OpenACC .kernels.
construct" "" { target *-*-* } .+1 } */
+ y = f_g (a[5]); /* { dg-message "note: assigned OpenACC gang worker vector
loop parallelism" } */
+
+#pragma acc loop independent /* { dg-message "note: assigned OpenACC gang loop
parallelism" } */
+ /* { dg-message "note: parallelized loop nest in OpenACC .kernels.
construct" "" { target *-*-* } .-1 } */
+ for (int j = 0; j < N; ++j)
+ b[j] = y + f_w (c[j]); /* { dg-message "note: assigned OpenACC worker
vector loop parallelism" } */
+ }
+
+#pragma acc kernels
+ {
+ y = 3; /* { dg-message "note: beginning .gang-single. region in OpenACC
.kernels. construct" } */
+
+#pragma acc loop independent /* { dg-message "note: assigned OpenACC gang
worker loop parallelism" } */
+ /* { dg-message "note: parallelized loop nest in OpenACC .kernels.
construct" "" { target *-*-* } .-1 } */
+ for (int j = 0; j < N; ++j)
+ b[j] = y + f_v (c[j]); /* { dg-message "note: assigned OpenACC vector
loop parallelism" } */
+
+ z = 2; /* { dg-message "note: beginning .gang-single. region in OpenACC
.kernels. construct" } */
+ }
+
+#pragma acc kernels /* { dg-message "note: beginning .gang-single. region in
OpenACC .kernels. construct" } */
+ ;
+
+ return 0;
+}
b/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95
@@ -9,19 +9,23 @@ program main
!$acc kernels copyin(a(1:N)) copy(sum)
+ ! converted to "oacc_kernels"
!$acc loop
do i = 1, N
sum = sum + a(i)
end do
+ ! converted to "oacc_parallel_kernels_gang_single"
sum = sum + 1
a(1) = a(1) + 1
- !$acc loop
+ ! converted to "oacc_parallel_kernels_parallelized"
+ !$acc loop independent
do i = 1, N
sum = sum + a(i)
end do
+ ! converted to "oacc_kernels"
if (sum .gt. 10) then
!$acc loop
do i = 1, N
@@ -29,8 +33,8 @@ program main
end do
end if
- !$acc loop
- ! { dg-bogus "region contains gang partitoned code but is not gang
partitioned" "gang partitioned" { xfail *-*-* } .-1 }
+ ! converted to "oacc_kernels"
+ !$acc loop auto
do i = 1, N
sum = sum + a(i)
end do
@@ -42,15 +46,13 @@ end program main
! parallel regions.
! { dg-final { scan-tree-dump-times "oacc_data_kernels" 1
"convert_oacc_kernels" } }
-! The three unconditional loop regions are parallelized, the sequential part
-! in between and the conditional loop are made gang-single.
-! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized" 3
"convert_oacc_kernels" } }
-! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single" 2
"convert_oacc_kernels" } }
+! As noted in the comments above, we get one gang-single serial region; one
+! parallelized loop region; and three "old-style" kernel regions.
+! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single" 1
"convert_oacc_kernels" } }
+! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized" 1
"convert_oacc_kernels" } }
+! { dg-final { scan-tree-dump-times "oacc_kernels" 3 "convert_oacc_kernels" } }
! Each of the parallel regions is async, and there is a final call to
! __builtin_GOACC_wait.
! { dg-final { scan-tree-dump-times "oacc_parallel_kernels.* async\(-1\)" 5
"convert_oacc_kernels" } }
! { dg-final { scan-tree-dump-times "__builtin_GOACC_wait" 1
"convert_oacc_kernels" } }
-
-! Check that the original kernels region is removed.
-! { dg-final { scan-tree-dump-not "oacc_kernels" "convert_oacc_kernels" } }
b/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-1.f95
new file mode 100644
@@ -0,0 +1,132 @@
+! Test OpenACC 'kernels' construct decomposition.
+
+! { dg-additional-options "-fopenacc-kernels=split" }
+! { dg-additional-options "-fopt-info-optimized-omp" }
+! { dg-additional-options "-O2" } for "parloops".
+
+! See also "../../c-c++-common/goacc/kernels-decompose-1.c".
+
+program main
+ implicit none
+
+ integer, external :: f_g
+ !$acc routine (f_g) gang
+ integer, external :: f_w
+ !$acc routine (f_w) worker
+ integer, external :: f_v
+ !$acc routine (f_v) vector
+ integer, external :: f_s
+ !$acc routine (f_s) seq
+
+ integer :: i, j, k
+ integer :: x, y, z
+ logical :: y_l
+ integer, parameter :: N = 10
+ integer :: a(N), b(N), c(N)
+
+ !$acc kernels
+ x = 0 ! { dg-message "note: beginning .gang-single. region in OpenACC
.kernels. construct" }
+ y = 0
+ y_l = x < 10
+ z = x
+ x = x + 1
+ ;
+ !$acc end kernels
+
+ !$acc kernels ! { dg-message "note: assigned OpenACC gang loop parallelism" }
+ do i = 1, N ! { dg-message "note: beginning .parloops. region in OpenACC
.kernels. construct" }
+ a(i) = 0
+ end do
+ !$acc end kernels
+
+ !$acc kernels loop ! { dg-message "note: assigned OpenACC gang loop
parallelism" }
+ ! { dg-message "note: forwarded loop nest in OpenACC .kernels. construct to
.parloops. for analysis" "" { target *-*-* } .-1 }
+ do i = 1, N
+ b(i) = a(N - i + 1)
+ end do
+
+ !$acc kernels
+ !$acc loop ! { dg-message "note: assigned OpenACC gang loop parallelism" }
+ ! { dg-message "note: forwarded loop nest in OpenACC .kernels. construct to
.parloops. for analysis" "" { target *-*-* } .-1 }
+ do i = 1, N
+ b(i) = a(N - i + 1)
+ end do
+
+ !$acc loop ! { dg-message "note: assigned OpenACC gang loop parallelism" }
+ ! { dg-message "note: forwarded loop nest in OpenACC .kernels. construct to
.parloops. for analysis" "" { target *-*-* } .-1 }
+ do i = 1, N
+ c(i) = a(i) * b(i)
+ end do
+
+ a(z) = 0 ! { dg-message "note: beginning .gang-single. region in OpenACC
.kernels. construct" }
+
+ !$acc loop ! { dg-message "note: assigned OpenACC gang loop parallelism" }
+ ! { dg-message "note: forwarded loop nest in OpenACC .kernels. construct to
.parloops. for analysis" "" { target *-*-* } .-1 }
+ do i = 1, N
+ c(i) = c(i) + a(i)
+ end do
+
+ !$acc loop seq ! { dg-message "note: assigned OpenACC seq loop parallelism" }
+ ! { dg-message "note: parallelized loop nest in OpenACC .kernels. construct"
"" { target *-*-* } .-1 }
+ do i = 1 + 1, N
+ c(i) = c(i) + c(i - 1)
+ end do
+ !$acc end kernels
+
+ !$acc kernels ! { dg-bogus "note: assigned OpenACC seq loop parallelism"
"TODO" { xfail *-*-* } }
+ !$acc loop independent ! { dg-message "note: assigned OpenACC gang loop
parallelism" }
+ ! { dg-message "note: parallelized loop nest in OpenACC .kernels. construct"
"" { target *-*-* } .-1 }
+ do i = 1, N
+ !$acc loop independent ! { dg-message "note: assigned OpenACC worker loop
parallelism" }
+ do j = 1, N
+ !$acc loop independent ! { dg-message "note: assigned OpenACC seq loop
parallelism" "TODO" { xfail *-*-* } }
+ ! { dg-warning "insufficient partitioning available to parallelize
loop" "TODO" { xfail *-*-* } .-1 }
+ ! { dg-bogus "note: assigned OpenACC vector loop parallelism" "TODO" {
xfail *-*-* } .-2 }
+ do k = 1, N
+ a(1 + mod(i + j + k, N)) &
+ = b(j) &
+ + f_v (c(k)) ! { dg-message "note: assigned OpenACC vector loop
parallelism" "TODO" { xfail *-*-* } .-1 }
+ end do
+ end do
+ end do
+
+ !TODO Should the following turn into "gang-single" instead of "parloops"?
+ !TODO The problem is that the first STMT is "if (y <= 4) goto <D.2547>; else
goto <D.2548>;", thus "parloops".
+ if (y < 5) then ! { dg-message "note: beginning .parloops. region in OpenACC
.kernels. construct" }
+ !$acc loop independent ! { dg-message "note: unparallelized loop nest in
OpenACC .kernels. region: it's executed conditionally" }
+ do j = 1, N
+ b(j) = f_w (c(j))
+ end do
+ end if
+ !$acc end kernels
+
+ !$acc kernels
+ !TODO This refers to the "gang-single" "f_g" call.
+ ! { dg-warning "region contains gang partitoned code but is not gang
partitioned" "TODO" { xfail *-*-* } .-2 }
+ ! { dg-message "note: beginning .gang-single. region in OpenACC .kernels.
construct" "" { target *-*-* } .+1 }
+ y = f_g (a(5)) ! { dg-message "note: assigned OpenACC gang worker vector loop
parallelism" "TODO" { xfail *-*-* } }
+
+ !$acc loop independent ! { dg-message "note: assigned OpenACC gang loop
parallelism" "TODO" { xfail *-*-* } }
+ ! { dg-message "note: parallelized loop nest in OpenACC .kernels. construct"
"" { target *-*-* } .-1 }
+ ! { dg-bogus "note: assigned OpenACC gang vector loop parallelism" "TODO" {
xfail *-*-* } .-2 }
+ do j = 1, N
+ b(j) = y + f_w (c(j)) ! { dg-message "note: assigned OpenACC worker vector
loop parallelism" "TODO" { xfail *-*-* } }
+ end do
+ !$acc end kernels
+
+ !$acc kernels
+ y = 3 ! { dg-message "note: beginning .gang-single. region in OpenACC
.kernels. construct" }
+
+ !$acc loop independent ! { dg-message "note: assigned OpenACC gang worker
loop parallelism" "TODO" { xfail *-*-* } }
+ ! { dg-message "note: parallelized loop nest in OpenACC .kernels. construct"
"" { target *-*-* } .-1 }
+ ! { dg-bogus "note: assigned OpenACC gang vector loop parallelism" "TODO" {
xfail *-*-* } .-2 }
+ do j = 1, N
+ b(j) = y + f_v (c(j)) ! { dg-message "note: assigned OpenACC vector loop
parallelism" "TODO" { xfail *-*-* } }
+ end do
+
+ z = 2 ! { dg-message "note: beginning .gang-single. region in OpenACC
.kernels. construct" }
+ !$acc end kernels
+
+ !$acc kernels ! { dg-message "note: beginning .gang-single. region in OpenACC
.kernels. construct" }
+ !$acc end kernels
+end program main
b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-decompose-1.c
new file mode 100644
@@ -0,0 +1,30 @@
+/* { dg-additional-options "-fopenacc-kernels=split" } */
+/* { dg-additional-options "-fopt-info-optimized-omp" } */
+
+#undef NDEBUG
+#include <assert.h>
+
+int main()
+{
+ int a = 0;
+#define N 123
+ int b[N] = { 0 };
+
+#pragma acc kernels
+ {
+ int c = 234; /* { dg-warning "note: beginning .gang-single. region in
OpenACC .kernels. construct" } */
+
+#pragma acc loop independent gang /* { dg-warning "note: assigned OpenACC gang
loop parallelism" } */
+ /* { dg-warning "note: parallelized loop nest in OpenACC .kernels.
construct" "" { target *-*-* } 17 } */
+ for (int i = 0; i < N; ++i)
+ b[i] = c;
+
+ a = c; /* { dg-warning "note: beginning .gang-single. region in OpenACC
.kernels. construct" } */
+ }
+
+ for (int i = 0; i < N; ++i)
+ assert (b[i] == 234);