@@ -1747,7 +1747,7 @@ Specify default OpenACC compute dimensions.
fopenacc-kernels=
C ObjC C++ ObjC++ RejectNegative Joined Enum(openacc_kernels) Var(flag_openacc_kernels) Init(OPENACC_KERNELS_SPLIT)
--fopenacc-kernels=[split|parloops] Configure OpenACC 'kernels' constructs handling.
+-fopenacc-kernels=[split|split-parloops|parloops] Configure OpenACC 'kernels' constructs handling.
Enum
Name(openacc_kernels) Type(enum openacc_kernels)
@@ -1755,6 +1755,9 @@ Name(openacc_kernels) Type(enum openacc_kernels)
EnumValue
Enum(openacc_kernels) String(split) Value(OPENACC_KERNELS_SPLIT)
+EnumValue
+Enum(openacc_kernels) String(split-parloops) Value(OPENACC_KERNELS_SPLIT_PARLOOPS)
+
EnumValue
Enum(openacc_kernels) String(parloops) Value(OPENACC_KERNELS_PARLOOPS)
@@ -2266,12 +2266,20 @@ permitted.
@opindex fopenacc-kernels
@cindex OpenACC accelerator programming
Configure OpenACC 'kernels' constructs handling.
+
With @option{-fopenacc-kernels=split}, OpenACC 'kernels' constructs
are split into a sequence of compute constructs, each then handled
-individually.
+individually. The data dependence analysis that is necessary to
+determine if loops can be parallelized is performed by the Graphite
+pass.
This is the default.
+With @option{-fopenacc-kernels=split-parloops}, OpenACC 'kernels' constructs
+are split into a sequence of compute constructs, each then handled
+individually.
+This is deprecated.
With @option{-fopenacc-kernels=parloops}, the whole OpenACC
'kernels' constructs is handled by the @samp{parloops} pass.
+This is deprecated.
@item -fopenmp
@opindex fopenmp
@@ -248,9 +248,9 @@ constraints in order to generate the points-to sets. It is located in
This is a pass group for processing OpenACC kernels regions. It is a
subpass of the IPA OpenACC pass group that runs on offloaded functions
-containing OpenACC kernels loops. It is located in
-@file{tree-ssa-loop.c} and is described by
-@code{pass_ipa_oacc_kernels}.
+containing OpenACC kernels loops if @samp{parloops} based handling of
+kernels regions is used. It is located in @file{tree-ssa-loop.c} and
+is described by @code{pass_ipa_oacc_kernels}.
@item Target clone
@@ -376,6 +376,7 @@ enum cf_protection_level
enum openacc_kernels
{
OPENACC_KERNELS_SPLIT,
+ OPENACC_KERNELS_SPLIT_PARLOOPS,
OPENACC_KERNELS_PARLOOPS
};
@@ -1701,6 +1701,9 @@ dump_gimple_omp_target (pretty_printer *buffer, const gomp_target *gs,
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
kind = " oacc_parallel_kernels_gang_single";
break;
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
+ kind = " oacc_parallel_kernels_graphite";
+ break;
case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
kind = " oacc_data_kernels";
break;
@@ -169,7 +169,7 @@ enum gf_mask {
loop statements. */
GF_OMP_FOR_GRID_INTRA_GROUP = 1 << 5,
GF_OMP_FOR_GRID_GROUP_ITER = 1 << 6,
- GF_OMP_TARGET_KIND_MASK = (1 << 4) - 1,
+ GF_OMP_TARGET_KIND_MASK = (1 << 5) - 1,
GF_OMP_TARGET_KIND_REGION = 0,
GF_OMP_TARGET_KIND_DATA = 1,
GF_OMP_TARGET_KIND_UPDATE = 2,
@@ -189,9 +189,12 @@ enum gf_mask {
/* A GF_OMP_TARGET_KIND_OACC_PARALLEL that originates from a 'kernels'
construct, "gang-single". */
GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE = 14,
+ /* A GF_OMP_TARGET_KIND_OACC_PARALLEL that originates from a 'kernels'
+ construct, for Graphite to analyze. */
+ GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE = 15,
/* A GF_OMP_TARGET_KIND_OACC_DATA that originates from a 'kernels'
construct. */
- GF_OMP_TARGET_KIND_OACC_DATA_KERNELS = 15,
+ GF_OMP_TARGET_KIND_OACC_DATA_KERNELS = 16,
GF_OMP_TEAMS_GRID_PHONY = 1 << 0,
GF_OMP_TEAMS_HOST = 1 << 1,
@@ -6610,6 +6613,7 @@ is_gimple_omp_oacc (const gimple *stmt)
case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
return true;
default:
@@ -6638,6 +6642,7 @@ is_gimple_omp_offloaded (const gimple *stmt)
case GF_OMP_TARGET_KIND_OACC_SERIAL:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
return true;
default:
return false;
@@ -13103,6 +13103,7 @@ gimplify_omp_workshare (tree *expr_p, gimple_seq *pre_p)
{
push_gimplify_context ();
+ //TODO-kernels: What needs to be done here?
/* FIXME: Reductions are not supported in kernels regions yet. */
if (/*ort == ORT_ACC_KERNELS ||*/ ort == ORT_ACC_PARALLEL)
localize_reductions (OMP_CLAUSES (expr), OMP_BODY (expr));
@@ -108,7 +108,8 @@ struct omp_region
a depend clause. */
gomp_ordered *ord_stmt;
- /* True if this is nested inside an OpenACC kernels construct. */
+ /* True if this is nested inside an OpenACC kernels construct that
+ will be handled by the "parloops" pass. */
bool inside_kernels_p;
};
@@ -6579,12 +6580,36 @@ expand_omp_for (struct omp_region *region, gimple *inner_stmt)
loops_state_set (LOOPS_NEED_FIXUP);
if (region->inside_kernels_p)
- expand_omp_for_generic (region, &fd, BUILT_IN_NONE, BUILT_IN_NONE,
- NULL_TREE, inner_stmt);
+ {
+ gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+ || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
+ expand_omp_for_generic (region, &fd, BUILT_IN_NONE, BUILT_IN_NONE,
+ NULL_TREE, inner_stmt);
+ }
else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_SIMD)
expand_omp_simd (region, &fd);
else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
{
+ struct omp_region *target_region;
+ for (target_region = region->outer; target_region;
+ target_region = target_region->outer)
+ {
+ if (region->type == GIMPLE_OMP_TARGET)
+ {
+ gomp_target *entry_stmt
+ = as_a <gomp_target *> (last_stmt (target_region->entry));
+
+ if (gimple_omp_target_kind (entry_stmt) == GF_OMP_TARGET_KIND_OACC_KERNELS)
+ gcc_checking_assert (flag_openacc_kernels != OPENACC_KERNELS_SPLIT_PARLOOPS
+ && flag_openacc_kernels != OPENACC_KERNELS_PARLOOPS);
+
+ }
+
+ }
+
+
+
gcc_assert (!inner_stmt);
expand_oacc_for (region, &fd);
}
@@ -7674,6 +7699,9 @@ static void
mark_loops_in_oacc_kernels_region (basic_block region_entry,
basic_block region_exit)
{
+ gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+ || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
class loop *outer = region_entry->loop_father;
gcc_assert (region_exit == NULL || outer == region_exit->loop_father);
@@ -7955,6 +7983,10 @@ expand_omp_target (struct omp_region *region)
entry_stmt = as_a <gomp_target *> (last_stmt (region->entry));
target_kind = gimple_omp_target_kind (entry_stmt);
+ if (!(flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+ || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS))
+ gcc_checking_assert (target_kind != GF_OMP_TARGET_KIND_OACC_KERNELS);
+
new_bb = region->entry;
oacc_explode_args = false;
@@ -7964,6 +7996,7 @@ expand_omp_target (struct omp_region *region)
case GF_OMP_TARGET_KIND_OACC_PARALLEL:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
case GF_OMP_TARGET_KIND_OACC_SERIAL:
if (targetm.goacc.explode_args ())
oacc_explode_args = true;
@@ -8010,6 +8043,9 @@ expand_omp_target (struct omp_region *region)
switch (target_kind)
{
case GF_OMP_TARGET_KIND_OACC_KERNELS:
+ gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+ || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
mark_loops_in_oacc_kernels_region (region->entry, region->exit);
DECL_ATTRIBUTES (child_fn)
@@ -8031,6 +8067,11 @@ expand_omp_target (struct omp_region *region)
= tree_cons (get_identifier ("oacc parallel_kernels_gang_single"),
NULL_TREE, DECL_ATTRIBUTES (child_fn));
break;
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
+ DECL_ATTRIBUTES (child_fn)
+ = tree_cons (get_identifier ("oacc parallel_kernels_graphite"),
+ NULL_TREE, DECL_ATTRIBUTES (child_fn));
+ break;
default:
break;
}
@@ -8240,6 +8281,7 @@ expand_omp_target (struct omp_region *region)
case GF_OMP_TARGET_KIND_OACC_SERIAL:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
start_ix = BUILT_IN_GOACC_PARALLEL;
break;
case GF_OMP_TARGET_KIND_OACC_DATA:
@@ -8885,6 +8927,9 @@ expand_omp (struct omp_region *region)
{
grid_expand_target_grid_body (region);
+ if (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+ || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS)
+ {
if (region->inner)
{
gomp_target *entry
@@ -8894,6 +8939,7 @@ expand_omp (struct omp_region *region)
== GF_OMP_TARGET_KIND_OACC_KERNELS))
region->inner->inside_kernels_p = true;
}
+ }
}
if (region->type == GIMPLE_OMP_FOR
@@ -9046,6 +9092,7 @@ build_omp_regions_1 (basic_block bb, struct omp_region *parent,
case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
break;
case GF_OMP_TARGET_KIND_UPDATE:
@@ -9232,9 +9279,14 @@ public:
/* opt_pass methods: */
virtual bool gate (function *fun)
{
- return !(fun->curr_properties & PROP_gimple_eomp);
+ return !(fun->curr_properties & PROP_gimple_eomp)
+ && (!oacc_get_kernels_attrib (cfun->decl)
+ || flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+ || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
+
}
- virtual unsigned int execute (function *) { return execute_expand_omp (); }
+ virtual unsigned int execute (function *) {return execute_expand_omp ();}
opt_pass * clone () { return new pass_expand_omp_ssa (m_ctxt); }
}; // class pass_expand_omp_ssa
@@ -9304,6 +9356,7 @@ omp_make_gimple_edges (basic_block bb, struct omp_region **region,
case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
break;
case GF_OMP_TARGET_KIND_UPDATE:
@@ -1962,6 +1962,16 @@ oacc_get_fn_attrib (tree fn)
return lookup_attribute (OACC_FN_ATTRIB, DECL_ATTRIBUTES (fn));
}
+/* Retrieve the oacc kernels attrib and return it. Non-oacc
+ functions will return NULL. */
+
+tree
+oacc_get_kernels_attrib (tree fn)
+{
+ return lookup_attribute ("oacc kernels", DECL_ATTRIBUTES (fn));
+}
+
+
/* Return true if FN is an OpenMP or OpenACC offloading function. */
bool
@@ -1988,10 +1998,15 @@ oacc_get_fn_dim_size (tree fn, int axis)
dims = TREE_CHAIN (dims);
tree v = TREE_VALUE (dims);
- /* TODO With 'pass_oacc_device_lower' moved "later", this is necessary to
+ /* TODO-kernels With 'pass_oacc_device_lower' moved "later", this is necessary to
avoid ICE for some OpenACC 'kernels' ("parloops") constructs. */
if (v == NULL_TREE)
- return 0;
+ {
+ gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+ || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
+ return 0;
+ }
int size = TREE_INT_CST_LOW (v);
@@ -99,6 +99,7 @@ extern int oacc_verify_routine_clauses (tree, tree *, location_t,
const char *);
extern tree oacc_build_routine_dims (tree clauses);
extern tree oacc_get_fn_attrib (tree fn);
+extern tree oacc_get_kernels_attrib (tree fn);
extern bool offloading_function_p (tree fn);
extern int oacc_get_fn_dim_size (tree fn, int axis);
extern int oacc_get_ifn_dim_arg (const gimple *stmt);
@@ -210,7 +210,9 @@ is_oacc_parallel_or_serial (omp_context *ctx)
|| (gimple_omp_target_kind (ctx->stmt)
== GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED)
|| (gimple_omp_target_kind (ctx->stmt)
- == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE)));
+ == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE)
+ || (gimple_omp_target_kind (ctx->stmt)
+ == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE)));
}
/* Return true if CTX corresponds to an oacc kernels region. */
@@ -236,6 +238,8 @@ was_originally_oacc_kernels (omp_context *ctx)
== GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED)
|| (gimple_omp_target_kind (ctx->stmt)
== GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE)
+ || (gimple_omp_target_kind (ctx->stmt)
+ == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE)
|| (gimple_omp_target_kind (ctx->stmt)
== GF_OMP_TARGET_KIND_OACC_DATA_KERNELS)));
}
@@ -2566,11 +2570,19 @@ enclosing_target_ctx (omp_context *ctx)
static bool
ctx_in_oacc_kernels_region (omp_context *ctx)
{
+ gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT
+ || flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+ || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
for (;ctx != NULL; ctx = ctx->outer)
{
gimple *stmt = ctx->stmt;
- if (gimple_code (stmt) == GIMPLE_OMP_TARGET
- && gimple_omp_target_kind (stmt) == GF_OMP_TARGET_KIND_OACC_KERNELS)
+ if (gimple_code (stmt) != GIMPLE_OMP_TARGET)
+ continue;
+
+ int target_kind = gimple_omp_target_kind (stmt);
+ if (target_kind == GF_OMP_TARGET_KIND_OACC_KERNELS
+ || target_kind == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE)
return true;
}
@@ -2584,6 +2596,10 @@ ctx_in_oacc_kernels_region (omp_context *ctx)
static unsigned
check_oacc_kernel_gwv (gomp_for *stmt, omp_context *ctx)
{
+ gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+ || flag_openacc_kernels == OPENACC_KERNELS_SPLIT
+ || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
bool checking = true;
unsigned outer_mask = 0;
unsigned this_mask = 0;
@@ -2656,7 +2672,7 @@ scan_omp_for (gomp_for *stmt, omp_context *outer_ctx)
omp_context *tgt = enclosing_target_ctx (outer_ctx);
if (!tgt || (is_oacc_parallel_or_serial (tgt)
- && !was_originally_oacc_kernels (tgt)))
+ && !was_originally_oacc_kernels (tgt)))
for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
{
char const *check = NULL;
@@ -2685,8 +2701,12 @@ scan_omp_for (gomp_for *stmt, omp_context *outer_ctx)
" OpenACC %<parallel%> or %<serial%>", check);
}
- if (tgt && is_oacc_kernels (tgt))
+ if (tgt && (is_oacc_kernels (tgt)))
{
+ gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+ || flag_openacc_kernels == OPENACC_KERNELS_SPLIT
+ || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
/* Strip out reductions, as they are not handled yet. */
tree *prev_ptr = &clauses;
@@ -3183,14 +3203,18 @@ check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx)
&& is_gimple_omp_oacc (stmt))
/* Except for atomic codes that we share with OpenMP. */
&& !(gimple_code (stmt) == GIMPLE_OMP_ATOMIC_LOAD
- || gimple_code (stmt) == GIMPLE_OMP_ATOMIC_STORE))
+ || gimple_code (stmt) == GIMPLE_OMP_ATOMIC_STORE)
+ /* Except for target regions introduced for kernels. */
+ && (gimple_code (stmt) != GIMPLE_OMP_TARGET
+ || gimple_omp_target_kind (stmt) != GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE))
{
if (oacc_get_fn_attrib (cfun->decl) != NULL)
{
error_at (gimple_location (stmt),
"non-OpenACC construct inside of OpenACC routine");
+
return false;
- }
+}
else
for (omp_context *octx = ctx; octx != NULL; octx = octx->outer)
if (is_gimple_omp (octx->stmt)
@@ -3336,6 +3360,7 @@ check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx)
case GF_OMP_TARGET_KIND_OACC_SERIAL:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
ok = true;
break;
@@ -3794,6 +3819,7 @@ check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx)
break;
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
/* These three cases arise from kernels conversion. */
stmt_name = "kernels"; break;
@@ -3814,6 +3840,7 @@ check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx)
ctx_stmt_name = "host_data"; break;
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
/* These three cases arise from kernels conversion. */
ctx_stmt_name = "kernels"; break;
@@ -3822,7 +3849,9 @@ check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx)
/* OpenACC/OpenMP mismatch? */
if (is_gimple_omp_oacc (stmt)
- != is_gimple_omp_oacc (ctx->stmt))
+ != is_gimple_omp_oacc (ctx->stmt)
+ && (gimple_code (stmt) != GIMPLE_OMP_TARGET
+ || gimple_omp_target_kind (stmt) != GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE))
{
error_at (gimple_location (stmt),
"%s %qs construct inside of %s %qs region",
@@ -7143,7 +7172,9 @@ lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
&& (gimple_omp_target_kind (probe->stmt)
!= GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED)
&& (gimple_omp_target_kind (probe->stmt)
- != GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE))
+ != GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE)
+ && (gimple_omp_target_kind (probe->stmt)
+ != GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE))
goto do_lookup;
cls = gimple_omp_target_clauses (probe->stmt);
@@ -7225,7 +7256,7 @@ lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
if (!ref_to_res)
ref_to_res = integer_zero_node;
- if (omp_is_reference (outgoing))
+ if (omp_is_reference (outgoing) && !omp_is_reference (var))
{
outgoing = build_simple_mem_ref (outgoing);
@@ -7954,7 +7985,15 @@ lower_oacc_head_mark (location_t loc, tree ddvar, tree clauses,
omp_context *tgt = enclosing_target_ctx (ctx);
if ((!tgt || is_oacc_parallel_or_serial (tgt))
&& !(tag & (OLF_SEQ | OLF_AUTO)))
- tag |= OLF_INDEPENDENT;
+ {
+ tag |= OLF_INDEPENDENT;
+
+ gcc_checking_assert (gimple_code (ctx->stmt) != GIMPLE_OMP_TARGET
+ /* Loops in kernels regions that will be handled by Graphite should
+ have been made 'auto' by "pass_convert_oacc_kernels". */
+ || gimple_omp_target_kind (ctx->stmt)
+ != GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE);
+ }
if (tag & OLF_TILE)
/* Tiling could use all 3 levels. */
@@ -11112,11 +11151,17 @@ lower_omp_for (gimple_stmt_iterator *gsi_p, omp_context *ctx)
/* Once lowered, extract the bounds and clauses. */
omp_extract_for_data (stmt, &fd, NULL);
+ bool oacc_kernels_parloops = false;
+ if (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+ || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS)
+ oacc_kernels_parloops = ctx_in_oacc_kernels_region (ctx);
if (is_gimple_omp_oacc (ctx->stmt)
- && !ctx_in_oacc_kernels_region (ctx))
- lower_oacc_head_tail (gimple_location (stmt),
- gimple_omp_for_clauses (stmt), private_marker,
- &oacc_head, &oacc_tail, ctx);
+ && !oacc_kernels_parloops)
+ {
+ lower_oacc_head_tail (gimple_location (stmt),
+ gimple_omp_for_clauses (stmt), private_marker,
+ &oacc_head, &oacc_tail, ctx);
+ }
/* Add OpenACC partitioning and reduction markers just before the loop. */
if (oacc_head)
@@ -12003,6 +12048,7 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
case GF_OMP_TARGET_KIND_OACC_DECLARE:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+ case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE:
data_region = false;
break;
case GF_OMP_TARGET_KIND_DATA:
@@ -179,8 +179,13 @@ adjust_region_code_walk_stmt_fn (gimple_stmt_iterator *gsi_p,
compiler logic to analyze this, so can't parallelize it here, so
we'd very likely be running into a performance problem if we
were to execute this unparallelized, thus forward the whole loop
- nest to "parloops". */
- *region_code = GF_OMP_TARGET_KIND_OACC_KERNELS;
+ nest to Graphite/"parloops". */
+ if (flag_openacc_kernels == OPENACC_KERNELS_SPLIT)
+ *region_code = GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE;
+ else if (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS)
+ *region_code = GF_OMP_TARGET_KIND_OACC_KERNELS;
+ else
+ gcc_unreachable ();
/* Terminate: final decision for this region. */
*handled_ops_p = true;
return integer_zero_node;
@@ -200,8 +205,15 @@ adjust_region_code_walk_stmt_fn (gimple_stmt_iterator *gsi_p,
the compiler logic to analyze this, so can't parallelize it here, so
we'd very likely be running into a performance problem if we were to
execute this unparallelized, thus forward the whole thing to
- "parloops". */
- *region_code = GF_OMP_TARGET_KIND_OACC_KERNELS;
+ Graphite/"parloops". */
+ // TODO-kernels Is Graphite already able to handle this?
+ // Is this covered by tests?
+ if (flag_openacc_kernels == OPENACC_KERNELS_SPLIT)
+ *region_code = GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE;
+ else if (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS)
+ *region_code = GF_OMP_TARGET_KIND_OACC_KERNELS;
+ else
+ gcc_unreachable ();
/* Terminate: final decision for this region. */
*handled_ops_p = true;
return integer_zero_node;
@@ -327,6 +339,13 @@ make_region_seq (location_t loc, gimple_seq stmts,
loops nested inside this sequentially executed statement. */
make_loops_gang_single (stmts);
}
+ else if (region_code == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc_stmts_first,
+ "beginning \"Graphite\" region in OpenACC"
+ " 'kernels' construct\n");
+ }
else if (region_code == GF_OMP_TARGET_KIND_OACC_KERNELS)
{
if (dump_enabled_p ())
@@ -492,6 +511,11 @@ adjust_nested_loop_clauses (gimple_stmt_iterator *gsi_p, bool *,
= build_omp_clause (gimple_location (stmt), OMP_CLAUSE_AUTO);
OMP_CLAUSE_CHAIN (auto_clause) = loop_clauses;
gimple_omp_for_set_clauses (stmt, auto_clause);
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ dump_printf_loc (MSG_NOTE,
+ stmt,
+ "Added 'auto' clause to loop.\n");
}
}
@@ -580,6 +604,11 @@ transform_kernels_loop_clauses (gimple *omp_for,
OMP_CLAUSE_AUTO);
OMP_CLAUSE_CHAIN (auto_clause) = loop_clauses;
loop_clauses = auto_clause;
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ dump_printf_loc (MSG_NOTE,
+ omp_for,
+ "Added 'auto' clause to loop.\n");
}
gimple_omp_for_set_clauses (omp_for, loop_clauses);
/* We must also recurse into the loop; it might contain nested loops
@@ -661,6 +690,19 @@ make_region_loop_nest (gimple *omp_for, gimple_seq stmts,
"parallelized loop nest in OpenACC 'kernels'"
" construct\n");
+ clauses = transform_kernels_loop_clauses (omp_for,
+ num_gangs_clause,
+ num_workers_clause,
+ vector_length_clause,
+ clauses);
+ }
+ else if (region_code == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GRAPHITE)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, omp_for,
+ "forwarded loop nest in OpenACC 'kernels' construct"
+ " to \"Graphite\" for analysis\n");
+
clauses = transform_kernels_loop_clauses (omp_for,
num_gangs_clause,
num_workers_clause,
@@ -1644,8 +1686,13 @@ public:
/* opt_pass methods: */
virtual bool gate (function *)
{
- return (flag_openacc
- && flag_openacc_kernels == OPENACC_KERNELS_SPLIT);
+ if (flag_openacc_kernels == OPENACC_KERNELS_SPLIT
+ || flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS)
+ return flag_openacc;
+ else if (flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS)
+ return false;
+ else
+ gcc_unreachable ();
}
virtual unsigned int execute (function *)
{
@@ -1863,8 +1863,8 @@ default_goacc_reduction (gcall *call)
gimple_seq_add_stmt (&seq, gimple_build_assign (t, expr));
ref_to_res = t;
}
- tree dst = build_simple_mem_ref (ref_to_res);
tree src = var;
+ tree dst = ref_to_res;
if (code == IFN_GOACC_REDUCTION_SETUP)
{
@@ -1872,6 +1872,14 @@ default_goacc_reduction (gcall *call)
dst = lhs;
lhs = NULL;
}
+
+ if (TREE_TYPE (TREE_TYPE (dst)) == TREE_TYPE (src))
+ dst = build_simple_mem_ref (dst);
+
+ if (TREE_TYPE (TREE_TYPE (src)) == TREE_TYPE (dst))
+ src = build_simple_mem_ref (src);
+
+
gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
}
}
@@ -2030,11 +2038,22 @@ execute_oacc_loop_designation ()
bool is_oacc_kernels
= (lookup_attribute ("oacc kernels",
DECL_ATTRIBUTES (current_function_decl)) != NULL);
+ if (is_oacc_kernels)
+ gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+ || flag_openacc_kernels == OPENACC_KERNELS_SPLIT
+ || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
bool is_oacc_kernels_parallelized
= (lookup_attribute ("oacc kernels parallelized",
DECL_ATTRIBUTES (current_function_decl)) != NULL);
if (is_oacc_kernels_parallelized)
- gcc_checking_assert (is_oacc_kernels);
+ {
+ gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+ || flag_openacc_kernels == OPENACC_KERNELS_SPLIT
+ || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
+ gcc_checking_assert (is_oacc_kernels);
+ }
bool is_oacc_parallel_kernels_parallelized
= (lookup_attribute ("oacc parallel_kernels_parallelized",
DECL_ATTRIBUTES (current_function_decl)) != NULL);
@@ -2047,6 +2066,12 @@ execute_oacc_loop_designation ()
gcc_checking_assert (!is_oacc_kernels);
gcc_checking_assert (!(is_oacc_parallel_kernels_parallelized
&& is_oacc_parallel_kernels_gang_single));
+ bool is_oacc_parallel_kernels_graphite
+ = (lookup_attribute ("oacc parallel_kernels_graphite",
+ DECL_ATTRIBUTES (current_function_decl)) != NULL);
+ if (is_oacc_parallel_kernels_graphite)
+ gcc_checking_assert (!is_oacc_kernels
+ && !is_oacc_parallel_kernels_gang_single);
/* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
kernels, so remove the parallelism dimensions function attributes
@@ -2076,6 +2101,9 @@ execute_oacc_loop_designation ()
else if (is_oacc_parallel_kernels_gang_single)
fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
"parallel_kernels_gang_single");
+ else if (is_oacc_parallel_kernels_graphite)
+ fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
+ "parallel_kernels_graphite");
else
fprintf (dump_file, "Function is OpenACC parallel offload\n");
}
@@ -2088,11 +2116,25 @@ execute_oacc_loop_designation ()
generic oacc_loop infrastructure and attribute/dimension processing. */
if (is_oacc_kernels && is_oacc_kernels_parallelized)
{
+ //TODO-kernels We should not really end up here with KERNELS_SPLIT!?
+ gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+ || flag_openacc_kernels == OPENACC_KERNELS_SPLIT
+ || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
/* Parallelized OpenACC kernels constructs use gang parallelism. See
also tree-parloops.c:create_parallel_loop. */
used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
}
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fprintf (dump_file, " [execute_oacc_loop_designation]: (attr = ");
+ print_generic_expr (dump_file, attr);
+ fprintf (dump_file, ")\n");
+ }
+
+
+
int dims[GOMP_DIM_MAX];
oacc_validate_dims (current_function_decl, attr, dims, fn_level, used_mask);
@@ -2118,6 +2160,10 @@ execute_oacc_loop_designation ()
generic oacc_loop infrastructure. */
if (is_oacc_kernels)
{
+ //TODO-kernels: how to handle KERNELS_SPLIT
+ /* gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS */
+ /* || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS); */
+
/* Create a fake oacc_loop for diagnostic purposes. */
l = new_oacc_loop_raw (NULL,
DECL_SOURCE_LOCATION (current_function_decl));
new file mode 100644
@@ -0,0 +1,61 @@
+/* { dg-additional-options "-fopenacc-kernels=split-parloops -fdump-tree-convert_oacc_kernels" } */
+
+#define N 1024
+
+unsigned int a[N];
+
+int
+main (void)
+{
+ int i;
+ unsigned int sum = 1;
+
+#pragma acc kernels copyin(a[0:N]) copy(sum)
+ {
+ /* converted to "oacc_kernels" */
+ #pragma acc loop
+ for (i = 0; i < N; ++i)
+ sum += a[i];
+
+ /* converted to "oacc_parallel_kernels_gang_single" */
+ sum++;
+ a[0]++;
+
+ /* converted to "oacc_parallel_kernels_parallelized" */
+ #pragma acc loop independent
+ for (i = 0; i < N; ++i)
+ sum += a[i];
+
+ /* converted to "oacc_kernels" */
+ if (sum > 10)
+ {
+ #pragma acc loop
+ for (i = 0; i < N; ++i)
+ sum += a[i];
+ }
+
+ /* converted to "oacc_kernels" */
+ #pragma acc loop auto
+ for (i = 0; i < N; ++i)
+ sum += a[i];
+ }
+
+ return 0;
+}
+
+/* Check that the kernels region is split into a data region and enclosed
+ parallel regions. */
+/* { dg-final { scan-tree-dump-times "oacc_data_kernels" 1 "convert_oacc_kernels" } } */
+
+/* As noted in the comments above, we get one gang-single serial region; one
+ parallelized loop region; and three "old-style" kernel regions. */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single" 1 "convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized" 1 "convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "oacc_kernels " 3 "convert_oacc_kernels" } } */
+
+/* Each of the parallel regions is async, and there is a final call to
+ __builtin_GOACC_wait. */
+/* { dg-final { scan-tree-dump-times "oacc_kernels async\\(-1\\)" 3 "convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single async\\(-1\\)" 1 "convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized async\\(-1\\)" 1 "convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_GOACC_wait" 1 "convert_oacc_kernels" } } */
@@ -12,7 +12,7 @@ main (void)
#pragma acc kernels copyin(a[0:N]) copy(sum)
{
- /* converted to "oacc_kernels" */
+ /* converted to "oacc_parallel_kernels_graphite" */
#pragma acc loop
for (i = 0; i < N; ++i)
sum += a[i];
@@ -26,7 +26,7 @@ main (void)
for (i = 0; i < N; ++i)
sum += a[i];
- /* converted to "oacc_kernels" */
+ /* converted to "oacc_parallel_kernels_graphite" */
if (sum > 10)
{
#pragma acc loop
@@ -34,7 +34,7 @@ main (void)
sum += a[i];
}
- /* converted to "oacc_kernels" */
+ /* converted to "oacc_parallel_kernels_graphite" */
#pragma acc loop auto
for (i = 0; i < N; ++i)
sum += a[i];
@@ -48,14 +48,14 @@ main (void)
/* { dg-final { scan-tree-dump-times "oacc_data_kernels" 1 "convert_oacc_kernels" } } */
/* As noted in the comments above, we get one gang-single serial region; one
- parallelized loop region; and three "old-style" kernel regions. */
+ parallelized loop region; and three "graphite" kernel regions. */
/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single" 1 "convert_oacc_kernels" } } */
/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized" 1 "convert_oacc_kernels" } } */
-/* { dg-final { scan-tree-dump-times "oacc_kernels " 3 "convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_graphite " 3 "convert_oacc_kernels" } } */
/* Each of the parallel regions is async, and there is a final call to
__builtin_GOACC_wait. */
-/* { dg-final { scan-tree-dump-times "oacc_kernels async\\(-1\\)" 3 "convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_graphite async\\(-1\\)" 3 "convert_oacc_kernels" } } */
/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single async\\(-1\\)" 1 "convert_oacc_kernels" } } */
/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized async\\(-1\\)" 1 "convert_oacc_kernels" } } */
/* { dg-final { scan-tree-dump-times "__builtin_GOACC_wait" 1 "convert_oacc_kernels" } } */
new file mode 100644
@@ -0,0 +1,37 @@
+! { dg-additional-options "-fopenacc-kernels=split" }
+
+! A regression test checking that the reduction clause lowering does
+! not fail if a subroutine argument is used as a reduction variable in
+! a kernels region.
+
+! This was fine ...
+subroutine reduction_var_not_argument(res)
+ real res
+ real tmp
+ integer i
+
+ !$acc kernels
+ !$acc loop reduction(+:tmp)
+ do i=0,n-1
+ tmp = tmp + 1
+ end do
+ !$acc end kernels
+
+ res = tmp
+end subroutine reduction_var_not_argument
+
+! ... but this led to problems because ARG
+! was a pointer type that did not get dereferenced.
+subroutine reduction_var_as_argument(arg)
+ real arg
+ integer i
+
+ !$acc kernels
+ !$acc loop reduction(+:arg)
+ do i=0,n-1
+ arg = arg + 1
+ end do
+ !$acc end kernels
+end subroutine reduction_var_as_argument
+
+
@@ -4174,7 +4174,15 @@ public:
virtual bool gate (function *)
{
if (oacc_kernels_p)
- return flag_openacc;
+ {
+ if (flag_openacc_kernels == OPENACC_KERNELS_SPLIT)
+ return false;
+
+ gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+ || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
+ return flag_openacc;
+ }
else
return flag_tree_parallelize_loops > 1;
}
@@ -4193,6 +4201,12 @@ public:
unsigned
pass_parallelize_loops::execute (function *fun)
{
+ if (oacc_kernels_p)
+ {
+ gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+ || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+ }
+
tree nthreads = builtin_decl_explicit (BUILT_IN_OMP_GET_NUM_THREADS);
if (nthreads == NULL_TREE)
return 0;
@@ -155,6 +155,12 @@ make_pass_tree_loop (gcc::context *ctxt)
static bool
gate_oacc_kernels (function *fn)
{
+ if (flag_openacc_kernels == OPENACC_KERNELS_SPLIT)
+ return false;
+
+ gcc_checking_assert (flag_openacc_kernels == OPENACC_KERNELS_SPLIT_PARLOOPS
+ || flag_openacc_kernels == OPENACC_KERNELS_PARLOOPS);
+
if (!flag_openacc)
return false;
@@ -324,6 +330,10 @@ public:
/* opt_pass methods: */
virtual bool gate (function *)
{
+ if (flag_openacc_kernels != OPENACC_KERNELS_SPLIT_PARLOOPS
+ && flag_openacc_kernels != OPENACC_KERNELS_PARLOOPS)
+ return false;
+
return (optimize
&& flag_openacc
/* Don't bother doing anything if the program has errors. */