2014-09-16 Cesar Philippidis <cesar@codesourcery.com>
gcc/
* builtins.c (expand_builtin_acc_on_device): New function.
(expand_oacc_builtin): New function.
(expand_builtin): Handle BUILT_IN_GOACC_NCTAID, BUILT_IN_GOACC_CTAID,
BUILT_IN_GOACC_NTID, BUILT_IN_GOACC_TID, BUILT_IN_GOACC_GET_THREAD_NUM
and BUILT_IN_GOACC_GET_NUM_THREADS.
(is_simple_builtin): Handle BUILT_IN_GOACC_NTID and BUILT_IN_GOACC_TID.
(is_inexpensive_builtin): Handle BUILT_IN_ACC_ON_DEVICE.
* gcc/builtins.def (DEF_GOACC_BUILTIN): Temporarily make COND always
true.
(DEF_GOACC_BUILTIN_COMPILER): New.
* gcc/oacc-builtins.def (BUILT_IN_GOACC_NTID, BUILT_IN_GOACC_TID,
BUILT_IN_GOACC_NCTAID, BUILT_IN_GOACC_CTAID, BUILT_IN_ACC_ON_DEVICE,
BUILT_IN_GOACC_GET_THREAD_NUM, BUILT_IN_GOACC_GET_NUM_THREADS): New
built-ins.
* gcc/omp-low.c (finish_reduction_on_host): New function.
(oacc_host_nthreads): New function.
(lower_reduction_clauses): Process the array of partial reductions
on the accelerator if num_gangs = 1.
(expand_omp_for_static_nochunk): Use BUILT_IN_GOACC_GET_NUM_THREADS and
BUILT_IN_GOACC_GET_THREAD_NUM for nthreads and threadid, respectively,
with GF_OMP_FOR_KIND_OACC_LOOP.
(expand_omp_for_static_chunk): Likewise.
(expand_omp_target): Likewise.
(initialize_reduction_data): Adjust memory maps for the case where
the partial reductions are processed on the accelerator.
(finalize_reduction_data): Handle reductions on the accelerator.
(process_reduction_data): Likewise.
gcc/fortran/
* f95-lang.c (gfc_init_builtin_functions): Define
DEF_GOACC_BUILTIN_COMPILER.
* types.def (BT_FN_INT_INT, BT_FN_VOID_INT_PTR_INT): New function
types.
gcc/testsuite/
* c-c++-common/goacc/goacc_builtins.c: New test.
@@ -5747,6 +5747,131 @@ expand_stack_save (void)
return ret;
}
+
+/* Expand OpenACC acc_on_device.
+
+ This has to happen late (that is, not in early folding; expand_builtin_*,
+ rather than fold_builtin_*), as we have to act differently for host and
+ acceleration device. */
+
+static rtx
+expand_builtin_acc_on_device (tree exp, rtx target ATTRIBUTE_UNUSED)
+{
+  /* Anything but a single integer argument is left for the caller to
+     deal with.  */
+  if (!validate_arglist (exp, INTEGER_TYPE, VOID_TYPE))
+    return NULL_RTX;
+
+  tree dev = builtin_save_expr (CALL_EXPR_ARG (exp, 0));
+  tree dev_type = TREE_TYPE (dev);
+  location_t loc = EXPR_LOCATION (exp);
+
+  /* The two device codes for which this compiler answers 1.  */
+#ifdef ACCEL_COMPILER
+  tree code_a = build_int_cst (dev_type, /* TODO: acc_device_not_host */ 3);
+  tree code_b = build_int_cst (dev_type, ACCEL_COMPILER_acc_device);
+#else
+  tree code_a = build_int_cst (dev_type, /* TODO: acc_device_none */ 0);
+  tree code_b = build_int_cst (dev_type, /* TODO: acc_device_host */ 2);
+#endif
+
+  /* Build: (dev == code_a || dev == code_b) ? 1 : 0.  TRUTH_ORIF_EXPR is
+     not supported by expand_expr_real*, so the disjunction is written as
+     "eq_a ? eq_a : eq_b".  */
+  tree eq_a = fold_build2_loc (loc, EQ_EXPR, integer_type_node, dev, code_a);
+  tree eq_b = fold_build2_loc (loc, EQ_EXPR, integer_type_node, dev, code_b);
+  tree either
+    = fold_build3_loc (loc, COND_EXPR, integer_type_node, eq_a, eq_a, eq_b);
+  tree flag
+    = fold_build3_loc (loc, COND_EXPR, integer_type_node, either,
+                       integer_one_node, integer_zero_node);
+
+  return expand_normal (flag);
+}
+
+
+/* Expand a thread-id/thread-count builtin for OpenACC.  Without a usable
+   target insn for FCODE, the result is 1 for the counts and 0 for the ids.  */
+static rtx
+expand_oacc_builtin (enum built_in_function fcode, tree exp, rtx target)
+{
+  bool has_arg0 = false;
+  rtx result = const0_rtx;
+  rtx arg = NULL_RTX;
+  enum insn_code icode = CODE_FOR_nothing;
+
+  switch (fcode)
+    {
+    case BUILT_IN_GOACC_NTID:
+#ifdef HAVE_oacc_ntid
+      icode = CODE_FOR_oacc_ntid;
+#endif
+      has_arg0 = true;
+      result = const1_rtx;
+      break;
+    case BUILT_IN_GOACC_TID:
+#ifdef HAVE_oacc_tid
+      icode = CODE_FOR_oacc_tid;
+#endif
+      has_arg0 = true;
+      break;
+    case BUILT_IN_GOACC_NCTAID:
+#ifdef HAVE_oacc_nctaid
+      icode = CODE_FOR_oacc_nctaid;
+#endif
+      has_arg0 = true;
+      result = const1_rtx;
+      break;
+    case BUILT_IN_GOACC_CTAID:
+#ifdef HAVE_oacc_ctaid
+      icode = CODE_FOR_oacc_ctaid;
+#endif
+      has_arg0 = true;
+      break;
+    case BUILT_IN_GOACC_GET_THREAD_NUM:
+#ifdef HAVE_oacc_threadnum
+      icode = CODE_FOR_oacc_threadnum;
+#endif
+      result = const0_rtx;
+      break;
+    case BUILT_IN_GOACC_GET_NUM_THREADS:
+#ifdef HAVE_oacc_numthreads
+      icode = CODE_FOR_oacc_numthreads;
+#endif
+      result = const1_rtx;
+      break;
+    default:
+      break;
+    }
+
+  if (has_arg0)
+    {
+      tree arg0 = CALL_EXPR_ARG (exp, 0);
+      gcc_assert (TREE_CODE (arg0) == INTEGER_CST);
+      arg = expand_normal (arg0);
+    }
+  if (icode != CODE_FOR_nothing)
+    {
+      enum machine_mode mode = insn_data[icode].operand[0].mode;
+      rtx tmp = target;
+      rtx insn;
+      /* TARGET is only a hint and may be null or unsuitable.  */
+      if (!tmp || !REG_P (tmp) || GET_MODE (tmp) != mode)
+        tmp = gen_reg_rtx (mode);
+      if (has_arg0)
+        insn = GEN_FCN (icode) (tmp, arg);
+      else
+        insn = GEN_FCN (icode) (tmp);
+      if (insn != NULL_RTX)
+        {
+          emit_insn (insn);
+          return tmp;
+        }
+    }
+
+  return result;
+}
+
/* Expand an expression EXP that calls a built-in function,
with result going to TARGET if that's convenient
(and in mode MODE if that's convenient).
@@ -6816,6 +6941,20 @@ expand_builtin (tree exp, rtx target, rtx subtarget, enum machine_mode mode,
expand_builtin_cilk_pop_frame (exp);
return const0_rtx;
+ case BUILT_IN_ACC_ON_DEVICE:
+ target = expand_builtin_acc_on_device (exp, target);
+ if (target)
+ return target;
+ break;
+
+ case BUILT_IN_GOACC_NCTAID:
+ case BUILT_IN_GOACC_CTAID:
+ case BUILT_IN_GOACC_NTID:
+ case BUILT_IN_GOACC_TID:
+ case BUILT_IN_GOACC_GET_THREAD_NUM:
+ case BUILT_IN_GOACC_GET_NUM_THREADS:
+ return expand_oacc_builtin (fcode, exp, target);
+
default: /* just do library call, if unknown builtin */
break;
}
@@ -12663,6 +12802,9 @@ is_simple_builtin (tree decl)
case BUILT_IN_EH_FILTER:
case BUILT_IN_EH_POINTER:
case BUILT_IN_EH_COPY_VALUES:
+ /* Just a special register access. */
+ case BUILT_IN_GOACC_NTID:
+ case BUILT_IN_GOACC_TID:
return true;
default:
@@ -12748,6 +12890,7 @@ is_inexpensive_builtin (tree decl)
case BUILT_IN_LABS:
case BUILT_IN_LLABS:
case BUILT_IN_PREFETCH:
+ case BUILT_IN_ACC_ON_DEVICE:
return true;
default:
@@ -146,12 +146,18 @@ along with GCC; see the file COPYING3. If not see
DEF_BUILTIN (ENUM, NAME, BUILT_IN_NORMAL, BT_LAST, BT_LAST, false, false, \
false, ATTR_LAST, false, false)
-/* Builtin used by the implementation of GNU OpenACC. None of these are
- actually implemented in the compiler; they're all in libgomp. */
+/* Builtin used by the implementation of GNU OpenACC. Few of these are
+ actually implemented in the compiler; most are in libgomp. */
#undef DEF_GOACC_BUILTIN
#define DEF_GOACC_BUILTIN(ENUM, NAME, TYPE, ATTRS) \
DEF_BUILTIN (ENUM, "__builtin_" NAME, BUILT_IN_NORMAL, TYPE, TYPE, \
- false, true, true, ATTRS, false, flag_openacc)
+ false, true, true, ATTRS, false, \
+ (/* TODO */ true || flag_openacc))
+#undef DEF_GOACC_BUILTIN_COMPILER
+#define DEF_GOACC_BUILTIN_COMPILER(ENUM, NAME, TYPE, ATTRS) \
+ DEF_BUILTIN (ENUM, "__builtin_" NAME, BUILT_IN_NORMAL, TYPE, TYPE, \
+ true, true, true, ATTRS, false, \
+ (/* TODO */ true || flag_openacc))
/* Builtin used by the implementation of GNU OpenMP. None of these are
actually implemented in the compiler; they're all in libgomp. */
@@ -159,7 +165,7 @@ along with GCC; see the file COPYING3. If not see
#define DEF_GOMP_BUILTIN(ENUM, NAME, TYPE, ATTRS) \
DEF_BUILTIN (ENUM, "__builtin_" NAME, BUILT_IN_NORMAL, TYPE, TYPE, \
false, true, true, ATTRS, false, \
- (flag_openmp || flag_tree_parallelize_loops))
+ (/* TODO */ true || flag_openmp || flag_tree_parallelize_loops))
/* Builtin used by implementation of Cilk Plus. Most of these are decomposed
by the compiler but a few are implemented in libcilkrts. */
@@ -1093,7 +1093,11 @@ gfc_init_builtin_functions (void)
#define DEF_GOACC_BUILTIN(code, name, type, attr) \
gfc_define_builtin ("__builtin_" name, builtin_types[type], \
code, name, attr);
+#undef DEF_GOACC_BUILTIN_COMPILER
+#define DEF_GOACC_BUILTIN_COMPILER(code, name, type, attr) \
+ gfc_define_builtin (name, builtin_types[type], code, name, attr);
#include "../oacc-builtins.def"
+#undef DEF_GOACC_BUILTIN_COMPILER
#undef DEF_GOACC_BUILTIN
}
@@ -82,6 +82,7 @@ DEF_FUNCTION_TYPE_0 (BT_FN_VOID, BT_VOID)
DEF_FUNCTION_TYPE_1 (BT_FN_VOID_PTR, BT_VOID, BT_PTR)
DEF_FUNCTION_TYPE_1 (BT_FN_VOID_PTRPTR, BT_VOID, BT_PTR_PTR)
DEF_FUNCTION_TYPE_1 (BT_FN_VOID_VPTR, BT_VOID, BT_VOLATILE_PTR)
+DEF_FUNCTION_TYPE_1 (BT_FN_INT_INT, BT_INT, BT_INT)
DEF_FUNCTION_TYPE_1 (BT_FN_UINT_UINT, BT_UINT, BT_UINT)
DEF_FUNCTION_TYPE_1 (BT_FN_PTR_PTR, BT_PTR, BT_PTR)
DEF_FUNCTION_TYPE_1 (BT_FN_VOID_INT, BT_VOID, BT_INT)
@@ -144,6 +145,7 @@ DEF_FUNCTION_TYPE_3 (BT_FN_VOID_VPTR_I2_INT, BT_VOID, BT_VOLATILE_PTR, BT_I2, BT
DEF_FUNCTION_TYPE_3 (BT_FN_VOID_VPTR_I4_INT, BT_VOID, BT_VOLATILE_PTR, BT_I4, BT_INT)
DEF_FUNCTION_TYPE_3 (BT_FN_VOID_VPTR_I8_INT, BT_VOID, BT_VOLATILE_PTR, BT_I8, BT_INT)
DEF_FUNCTION_TYPE_3 (BT_FN_VOID_VPTR_I16_INT, BT_VOID, BT_VOLATILE_PTR, BT_I16, BT_INT)
+DEF_FUNCTION_TYPE_3 (BT_FN_VOID_INT_PTR_INT, BT_VOID, BT_INT, BT_PTR, BT_INT)
DEF_FUNCTION_TYPE_4 (BT_FN_VOID_OMPFN_PTR_UINT_UINT,
BT_VOID, BT_PTR_FN_VOID_PTR, BT_PTR, BT_UINT, BT_UINT)
@@ -39,3 +39,17 @@ DEF_GOACC_BUILTIN (BUILT_IN_GOACC_PARALLEL, "GOACC_parallel",
ATTR_NOTHROW_LIST)
DEF_GOACC_BUILTIN (BUILT_IN_GOACC_UPDATE, "GOACC_update",
BT_FN_VOID_INT_PTR_SIZE_PTR_PTR_PTR, ATTR_NOTHROW_LIST)
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_NTID, "GOACC_ntid",
+ BT_FN_UINT_UINT, ATTR_CONST_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_TID, "GOACC_tid",
+ BT_FN_UINT_UINT, ATTR_CONST_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_NCTAID, "GOACC_nctaid",
+ BT_FN_UINT_UINT, ATTR_CONST_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_CTAID, "GOACC_ctaid",
+ BT_FN_UINT_UINT, ATTR_CONST_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN_COMPILER (BUILT_IN_ACC_ON_DEVICE, "acc_on_device",
+ BT_FN_INT_INT, ATTR_CONST_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_GET_THREAD_NUM, "GOACC_get_thread_num",
+ BT_FN_INT, ATTR_CONST_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_GET_NUM_THREADS, "GOACC_get_num_threads",
+ BT_FN_INT, ATTR_CONST_NOTHROW_LEAF_LIST)
@@ -236,6 +236,3 @@ DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TARGET_UPDATE, "GOMP_target_update",
BT_FN_VOID_INT_PTR_SIZE_PTR_PTR_PTR, ATTR_NOTHROW_LIST)
DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TEAMS, "GOMP_teams",
BT_FN_VOID_UINT_UINT, ATTR_NOTHROW_LIST)
-
-DEF_GOMP_BUILTIN (BUILT_IN_OMP_SET_NUM_THREADS, "omp_set_num_threads",
- BT_FN_VOID_INT, ATTR_CONST_NOTHROW_LEAF_LIST)
@@ -238,6 +238,88 @@ omp_get_id (tree node)
return IDENTIFIER_POINTER(get_identifier (temp_name));
}
+/* Determine whether the array of partial reduction results has to be
+   combined on the host (true) or can be finished by the kernel itself
+   (false).  Finishing on the host avoids launching multiple kernels to
+   synchronize threads across PTX Cooperative Thread Arrays.  */
+static bool
+finish_reduction_on_host (omp_context *ctx)
+{
+  /* Only the innermost enclosing OpenACC parallel construct matters.  */
+  omp_context *par = ctx;
+  while (par && gimple_code (par->stmt) != GIMPLE_OACC_PARALLEL)
+    par = par->outer;
+
+  if (!par)
+    return false;
+
+  /* Without a num_gangs clause nothing requests more than one gang.  */
+  tree par_clauses = gimple_oacc_parallel_clauses (par->stmt);
+  tree gang_clause = find_omp_clause (par_clauses, OMP_CLAUSE_NUM_GANGS);
+  if (!gang_clause)
+    return false;
+
+  /* Currently, OpenACC gangs are mapped onto PTX CTAs.  Return true if
+     num_gangs may be set to something other than one, i.e. when it is
+     not a compile-time constant or is a constant greater than one.  */
+  tree count = fold_convert_loc (OMP_CLAUSE_LOCATION (gang_clause),
+                                 integer_type_node,
+                                 OMP_CLAUSE_NUM_GANGS_EXPR (gang_clause));
+
+  if (TREE_CODE (count) != INTEGER_CST)
+    return true;
+
+  int gangs = TREE_INT_CST_LOW (count);
+  if (gangs > 1)
+    return true;
+
+  return false;
+}
+
+/* Determine the number of OpenACC threads as an expression of type
+   sizetype.  Currently, this is num_gangs * vector_length, each taken
+   from the matching clause of the enclosing OpenACC parallel construct
+   and defaulting to one when the clause or the construct is missing.  */
+
+static tree
+oacc_host_nthreads (omp_context *ctx)
+{
+  tree one = fold_convert (sizetype, integer_one_node);
+  tree num_gangs = one;
+  tree vec_len = one;
+
+  /* The reduction clause may be nested inside a loop directive, so walk
+     outwards until the innermost parallel construct is reached.  */
+  omp_context *par = ctx;
+  while (par && gimple_code (par->stmt) != GIMPLE_OACC_PARALLEL)
+    par = par->outer;
+
+  if (par)
+    {
+      tree par_clauses = gimple_oacc_parallel_clauses (par->stmt);
+      tree c;
+
+      /* vector_length, if given.  */
+      c = find_omp_clause (par_clauses, OMP_CLAUSE_VECTOR_LENGTH);
+      vec_len = (c
+                 ? fold_convert_loc (OMP_CLAUSE_LOCATION (c), sizetype,
+                                     OMP_CLAUSE_VECTOR_LENGTH_EXPR (c))
+                 : one);
+
+      /* num_gangs, if given.  */
+      c = find_omp_clause (par_clauses, OMP_CLAUSE_NUM_GANGS);
+      num_gangs = (c
+                   ? fold_convert_loc (OMP_CLAUSE_LOCATION (c), sizetype,
+                                       OMP_CLAUSE_NUM_GANGS_EXPR (c))
+                   : one);
+    }
+
+  /* Multiply the two factors; both are already of type sizetype.  */
+  tree nthreads = fold_build2 (MULT_EXPR, sizetype, num_gangs, vec_len);
+
+  return nthreads;
+}
+
/* Holds a decl for __OPENMP_TARGET__. */
static GTY(()) tree offload_symbol_decl;
@@ -4356,6 +4438,10 @@ lower_lastprivate_clauses (tree clauses, tree predicate, gimple_seq *stmt_list,
}
+static void
+finalize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
+ omp_context *ctx, bool receiver = false);
+
/* Generate code to implement the REDUCTION clauses. */
static void
@@ -4433,61 +4519,26 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx)
tree t = NULL_TREE, array, nthreads;
tree type = get_base_type (var);
- /* First ensure that the current tid is less than vector_length. */
- tree exit_label = create_artificial_label (UNKNOWN_LOCATION);
- tree reduction_label = create_artificial_label (UNKNOWN_LOCATION);
-
/* Get the current thread id. */
- tree call = builtin_decl_explicit (BUILT_IN_OMP_GET_THREAD_NUM);
- gimple stmt = gimple_build_call (call, 1, integer_zero_node);
- tree fntype = gimple_call_fntype (stmt);
- tree tid = create_tmp_var (TREE_TYPE (fntype), NULL);
+ tree call = builtin_decl_explicit (BUILT_IN_GOACC_GET_THREAD_NUM);
+ tree tid = create_tmp_var (TREE_TYPE (TREE_TYPE (call)), NULL);
+ gimple stmt = gimple_build_call (call, 0);
gimple_call_set_lhs (stmt, tid);
gimple_seq_add_stmt (stmt_seqp, stmt);
/* Find the total number of threads. A reduction clause
only appears inside a loop construction or a combined
parallel and loop construct. */
- tree c;
-
- if (gimple_code (ctx->stmt) == GIMPLE_OMP_FOR)
- c = gimple_oacc_parallel_clauses (ctx->outer->stmt);
- else
- c = gimple_oacc_parallel_clauses (ctx->stmt);
- t = find_omp_clause (c, OMP_CLAUSE_VECTOR_LENGTH);
-
- if (t)
- {
- t = fold_convert_loc (OMP_CLAUSE_LOCATION (t),
- integer_type_node,
- OMP_CLAUSE_VECTOR_LENGTH_EXPR (t));
- }
-
- if (!t)
- t = integer_one_node;
+ call = builtin_decl_explicit (BUILT_IN_GOACC_GET_NUM_THREADS);
+ t = create_tmp_var (TREE_TYPE (TREE_TYPE (call)), NULL);
+ stmt = gimple_build_call (call, 0);
+ gimple_call_set_lhs (stmt, t);
+ gimple_seq_add_stmt (stmt_seqp, stmt);
- /* Extract the number of threads. */
nthreads = create_tmp_var (sizetype, NULL);
gimplify_assign (nthreads, fold_build1 (NOP_EXPR, sizetype, t),
stmt_seqp);
- stmt = gimple_build_assign_with_ops (MINUS_EXPR, nthreads, nthreads,
- fold_build1 (NOP_EXPR, sizetype,
- integer_one_node));
- gimple_seq_add_stmt (stmt_seqp, stmt);
-
- /* If tid >= nthreads, goto exit_label. */
- t = create_tmp_var (sizetype, NULL);
- gimplify_assign (t, fold_build1 (NOP_EXPR, sizetype, tid),
- stmt_seqp);
- stmt = gimple_build_cond (GT_EXPR, t, nthreads, exit_label,
- reduction_label);
- gimple_seq_add_stmt (stmt_seqp, stmt);
-
- /* Place the reduction_label here. */
-
- gimple_seq_add_stmt (stmt_seqp,
- gimple_build_label (reduction_label));
/* Now insert the partial reductions into the array. */
@@ -4510,9 +4561,11 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx)
gimplify_assign (offset, TYPE_SIZE_UNIT (type),
stmt_seqp);
t = create_tmp_var (sizetype, NULL);
- gimplify_assign (t, unshare_expr (fold_build1 (NOP_EXPR, sizetype,
- tid)),
- stmt_seqp);
+
+ /* Calculate the stack offset to be array[tid+1]. */
+ x = fold_build2 (PLUS_EXPR, sizetype, build_int_cst (sizetype, 1),
+ fold_build1 (NOP_EXPR, sizetype, tid));
+ gimplify_assign (t, unshare_expr (x), stmt_seqp);
stmt = gimple_build_assign_with_ops (MULT_EXPR, offset, offset, t);
gimple_seq_add_stmt (stmt_seqp, stmt);
@@ -4528,8 +4581,50 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx)
x = unshare_expr (build_simple_mem_ref (ptr));
stmt = gimplify_assign (x, new_var, stmt_seqp);
- /* Place exit label here. */
- gimple_seq_add_stmt (stmt_seqp, gimple_build_label (exit_label));
+ /* Synchronize the threads and finish up the reduction. */
+
+ tree next = create_artificial_label (UNKNOWN_LOCATION);
+ tree reduction_exit = create_artificial_label (UNKNOWN_LOCATION);
+
+ /* Synchronize all of the threads. */
+ call = builtin_decl_explicit (BUILT_IN_SYNC_SYNCHRONIZE);
+ stmt = gimple_build_call (call, 0);
+ gimple_seq_add_stmt (stmt_seqp, stmt);
+
+ /* Jump to the exit label if tid != 0. */
+ tree t1 = create_tmp_var (sizetype, NULL);
+ tree t2 = create_tmp_var (sizetype, NULL);
+ gimplify_assign (t1, fold_build1 (NOP_EXPR, sizetype, tid),
+ stmt_seqp);
+ gimplify_assign (t2, fold_build1 (NOP_EXPR, sizetype,
+ integer_zero_node),
+ stmt_seqp);
+ stmt = gimple_build_cond (NE_EXPR, t1, t2, reduction_exit, next);
+ gimple_seq_add_stmt (stmt_seqp, stmt);
+ gimple_seq_add_stmt (stmt_seqp, gimple_build_label (next));
+
+ if (finish_reduction_on_host (ctx))
+ {
+ /* Set the last element of the array to be 1 if this kernel
+ is executed on the accelerator. */
+ call = builtin_decl_explicit (BUILT_IN_ACC_ON_DEVICE);
+ tree lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (call)), NULL);
+ stmt = gimple_build_call (call, 1, build_int_cst
+ (integer_type_node, 2));
+ gimple_call_set_lhs (stmt, lhs);
+ gimple_seq_add_stmt (stmt_seqp, stmt);
+
+ x = unshare_expr (build_simple_mem_ref (array));
+ stmt = gimplify_assign (x, convert (TREE_TYPE (new_var),
+ fold_build1 (TRUTH_NOT_EXPR,
+ sizetype, lhs)),
+ stmt_seqp);
+ }
+ else
+ finalize_reduction_data (clauses, nthreads, stmt_seqp, ctx,
+ true);
+
+ gimple_seq_add_stmt (stmt_seqp, gimple_build_label (reduction_exit));
return;
}
@@ -5644,9 +5739,9 @@ expand_oacc_offload (struct omp_region *region)
tree openmp_target = get_offload_symbol_decl ();
tree fnaddr = build_fold_addr_expr (child_fn);
g = gimple_build_call (builtin_decl_explicit (start_ix), 10, device,
- fnaddr, build_fold_addr_expr (openmp_target),
- t1, t2, t3, t4,
- t_num_gangs, t_num_workers, t_vector_length);
+ fnaddr, build_fold_addr_expr (openmp_target),
+ t1, t2, t3, t4,
+ t_num_gangs, t_num_workers, t_vector_length);
gimple_set_location (g, gimple_location (entry_stmt));
gsi_insert_before (&gsi, g, GSI_SAME_STMT);
}
@@ -6913,8 +7008,10 @@ expand_omp_for_static_nochunk (struct omp_region *region,
threadid = build_call_expr (threadid, 0);
break;
case GF_OMP_FOR_KIND_OACC_LOOP:
- nthreads = integer_one_node;
- threadid = integer_zero_node;
+ nthreads = builtin_decl_explicit (BUILT_IN_GOACC_GET_NUM_THREADS);
+ nthreads = build_call_expr (nthreads, 0);
+ threadid = builtin_decl_explicit (BUILT_IN_GOACC_GET_THREAD_NUM);
+ threadid = build_call_expr (threadid, 0);
break;
default:
gcc_unreachable ();
@@ -6922,6 +7019,15 @@ expand_omp_for_static_nochunk (struct omp_region *region,
nthreads = fold_convert (itype, nthreads);
nthreads = force_gimple_operand_gsi (&gsi, nthreads, true, NULL_TREE,
true, GSI_SAME_STMT);
+
+ /* Ensure nthreads is at least 1. BUILT_IN_GOACC_NTID returns 0 for a target
+ that does not have a specific expansion. */
+ nthreads
+ = fold_build2 (MAX_EXPR, itype, nthreads,
+ fold_convert (TREE_TYPE (nthreads), integer_one_node));
+ nthreads = force_gimple_operand_gsi (&gsi, nthreads, true, NULL_TREE,
+ true, GSI_SAME_STMT);
+
threadid = fold_convert (itype, threadid);
threadid = force_gimple_operand_gsi (&gsi, threadid, true, NULL_TREE,
true, GSI_SAME_STMT);
@@ -7317,8 +7423,10 @@ expand_omp_for_static_chunk (struct omp_region *region,
threadid = build_call_expr (threadid, 0);
break;
case GF_OMP_FOR_KIND_OACC_LOOP:
- nthreads = integer_one_node;
- threadid = integer_zero_node;
+ nthreads = builtin_decl_explicit (BUILT_IN_GOACC_GET_NUM_THREADS);
+ nthreads = build_call_expr (nthreads, 0);
+ threadid = builtin_decl_explicit (BUILT_IN_GOACC_GET_THREAD_NUM);
+ threadid = build_call_expr (threadid, 0);
break;
default:
gcc_unreachable ();
@@ -7326,6 +7434,15 @@ expand_omp_for_static_chunk (struct omp_region *region,
nthreads = fold_convert (itype, nthreads);
nthreads = force_gimple_operand_gsi (&gsi, nthreads, true, NULL_TREE,
true, GSI_SAME_STMT);
+
+ /* Ensure nthreads is at least 1. BUILT_IN_GOACC_NTID returns 0 for a target
+ that does not have a specific expansion. */
+ nthreads
+ = fold_build2 (MAX_EXPR, itype, nthreads,
+ fold_convert (TREE_TYPE (nthreads), integer_one_node));
+ nthreads = force_gimple_operand_gsi (&gsi, nthreads, true, NULL_TREE,
+ true, GSI_SAME_STMT);
+
threadid = fold_convert (itype, threadid);
threadid = force_gimple_operand_gsi (&gsi, threadid, true, NULL_TREE,
true, GSI_SAME_STMT);
@@ -9390,6 +9507,7 @@ expand_omp_target (struct omp_region *region)
g = gimple_build_call (builtin_decl_explicit (start_ix), 6, device,
build_fold_addr_expr (openmp_target),
t1, t2, t3, t4);
+
gimple_set_location (g, gimple_location (entry_stmt));
gsi_insert_before (&gsi, g, GSI_SAME_STMT);
if (kind != GF_OMP_TARGET_KIND_REGION)
@@ -9782,6 +9900,14 @@ initialize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
tree (*gimple_omp_clauses) (const_gimple);
void (*gimple_omp_set_clauses) (gimple, tree);
+ /* Increment nthreads by one, so the kernel can return the host type
+ in the last element of the array. */
+ t = create_tmp_var (sizetype, NULL);
+ gimplify_assign (t, fold_build2 (PLUS_EXPR, sizetype,
+ fold_build1 (NOP_EXPR, sizetype, nthreads),
+ build_int_cst (sizetype, 1)), stmt_seqp);
+ nthreads = t;
+
/* Find the innermost PARALLEL openmp context. FIXME: OpenACC kernels
may require extra care unless they are converted to openmp for loops. */
@@ -9817,7 +9943,6 @@ initialize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
fold_convert (TREE_TYPE (nthreads),
TYPE_SIZE_UNIT (type)));
gimple_seq_add_stmt (stmt_seqp, stmt);
-
size = create_tmp_var (sizetype, NULL);
gimplify_assign (size, fold_build1 (NOP_EXPR, sizetype, t), stmt_seqp);
@@ -9837,7 +9962,8 @@ initialize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
most clause so that copy-out works. */
tree x = array;
t = build_omp_clause (gimple_location (ctx->stmt), OMP_CLAUSE_MAP);
- OMP_CLAUSE_MAP_KIND (t) = OMP_CLAUSE_MAP_FORCE_FROM;
+ OMP_CLAUSE_MAP_KIND (t) = finish_reduction_on_host (ctx) ?
+ OMP_CLAUSE_MAP_FORCE_FROM : OMP_CLAUSE_MAP_FORCE_ALLOC;
OMP_CLAUSE_DECL (t) = x;
OMP_CLAUSE_CHAIN (t) = NULL;
if (oc)
@@ -9857,53 +9983,103 @@ initialize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
static void
finalize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
- omp_context *ctx)
+ omp_context *ctx, bool receiver)
{
gcc_assert (is_gimple_omp_oacc_specifically (ctx->stmt));
- tree c, var, array, loop_header, loop_body, loop_exit, type;
+ tree c, x, var, array, loop_header, loop_body, loop_exit, type, ptype;
gimple stmt;
+ /* Update nthreads in case the reduction kernel was executed on the
+ host. */
+ if (!receiver)
+ {
+ for (c = clauses; c && OMP_CLAUSE_CODE (c) != OMP_CLAUSE_REDUCTION;
+ c = OMP_CLAUSE_CHAIN (c));
+
+ /* Set up reduction variable, var. Because it's not gimple register,
+ it needs to be treated as a reference. */
+ var = OMP_CLAUSE_DECL (c);
+ type = get_base_type (var);
+ ptype = build_pointer_type (type);
+ if (receiver)
+ var = lookup_decl_in_outer_ctx (var, ctx);
+
+ /* Extract array[0] into mem. */
+ array = lookup_reduction (omp_get_id (OMP_CLAUSE_DECL (c)), ctx);
+ tree mem = create_tmp_var (type, NULL);
+ gimplify_assign (mem, build_simple_mem_ref (array), stmt_seqp);
+
+ tree l1 = create_artificial_label (UNKNOWN_LOCATION);
+ tree l2 = create_artificial_label (UNKNOWN_LOCATION);
+
+ x = create_tmp_var (integer_type_node, NULL);
+ gimplify_assign (x, convert (integer_type_node, mem), stmt_seqp);
+ stmt = gimple_build_cond (EQ_EXPR, x,
+ integer_zero_node, l1, l2);
+
+ gimple_seq_add_stmt (stmt_seqp, stmt);
+ gimple_seq_add_stmt (stmt_seqp, gimple_build_label (l1));
+ gimplify_assign (nthreads, build_int_cst (sizetype, 1), stmt_seqp);
+ gimple_seq_add_stmt (stmt_seqp, gimple_build_label (l2));
+ }
+
/* Create for loop.
let var = the original reduction variable
let array = reduction variable array
- var = array[0]
- for (i = 1; i < nthreads; i++)
+ var = array[1]
+ for (i = 2; i <= nthreads; i++)
var op= array[i]
- */
+ */
loop_header = create_artificial_label (UNKNOWN_LOCATION);
loop_body = create_artificial_label (UNKNOWN_LOCATION);
loop_exit = create_artificial_label (UNKNOWN_LOCATION);
/* Initialize the reduction variables to be value of the first array
- element. */
+ element. FIXME: A parallel loop should use the original reduction
+ variable as the initial value. */
for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
{
if (OMP_CLAUSE_CODE (c) != OMP_CLAUSE_REDUCTION)
continue;
- tree_code reduction_code = OMP_CLAUSE_REDUCTION_CODE (c);
-
- /* reduction(-:var) sums up the partial results, so it acts
- identically to reduction(+:var). */
- if (reduction_code == MINUS_EXPR)
- reduction_code = PLUS_EXPR;
-
/* Set up reduction variable, var. Becuase it's not gimple register,
it needs to be treated as a reference. */
var = OMP_CLAUSE_DECL (c);
type = get_base_type (var);
- tree ptr = lookup_reduction (omp_get_id (OMP_CLAUSE_DECL (c)), ctx);
+ ptype = build_pointer_type (type);
+ if (receiver)
+ var = lookup_decl_in_outer_ctx (var, ctx);
+ array = lookup_reduction (omp_get_id (OMP_CLAUSE_DECL (c)), ctx);
- /* Extract array[0] into mem. */
+ if (receiver)
+ {
+ tree t = create_tmp_var (ptype, NULL);
+ array = build_receiver_ref (array, false, ctx->outer);
+ gimplify_assign (t, array, stmt_seqp);
+ array = t;
+ }
+
+ /* Calculate the array offset. */
+ tree offset = create_tmp_var (sizetype, NULL);
+ gimplify_assign (offset, TYPE_SIZE_UNIT (type), stmt_seqp);
+ stmt = gimple_build_assign_with_ops (MULT_EXPR, offset, offset,
+ build_int_cst (sizetype, 1));
+ gimple_seq_add_stmt (stmt_seqp, stmt);
+
+ tree ptr = create_tmp_var (TREE_TYPE (array), NULL);
+ stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, ptr, array,
+ offset);
+ gimple_seq_add_stmt (stmt_seqp, stmt);
+
+ /* Extract array[1] into mem. */
tree mem = create_tmp_var (type, NULL);
gimplify_assign (mem, build_simple_mem_ref (ptr), stmt_seqp);
/* Find the original reduction variable. */
- tree x = build_outer_var_ref (var, ctx);
if (is_reference (var))
var = build_simple_mem_ref (var);
@@ -9913,16 +10089,15 @@ finalize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
/* Create an index variable and set it to one. */
tree ix = create_tmp_var (sizetype, NULL);
- gimplify_assign (ix, fold_build1 (NOP_EXPR, sizetype, integer_one_node),
- stmt_seqp);
+ gimplify_assign (ix, build_int_cst (sizetype, 2), stmt_seqp);
/* Insert the loop header label here. */
gimple_seq_add_stmt (stmt_seqp, gimple_build_label (loop_header));
- /* Loop if ix >= nthreads. */
- tree x = create_tmp_var (sizetype, NULL);
+ /* Exit the loop once ix > nthreads. */
+ x = create_tmp_var (sizetype, NULL);
gimplify_assign (x, fold_build1 (NOP_EXPR, sizetype, nthreads), stmt_seqp);
- stmt = gimple_build_cond (GE_EXPR, ix, x, loop_exit, loop_body);
+ stmt = gimple_build_cond (GT_EXPR, ix, x, loop_exit, loop_body);
gimple_seq_add_stmt (stmt_seqp, stmt);
/* Insert the loop body label here. */
@@ -9944,8 +10119,19 @@ finalize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
/* Set up reduction variable var. */
var = OMP_CLAUSE_DECL (c);
type = get_base_type (var);
+ ptype = build_pointer_type (type);
+ if (receiver)
+ var = lookup_decl_in_outer_ctx (var, ctx);
array = lookup_reduction (omp_get_id (OMP_CLAUSE_DECL (c)), ctx);
+ if (receiver)
+ {
+ tree t = create_tmp_var (ptype, NULL);
+ array = build_receiver_ref (array, false, ctx->outer);
+ gimplify_assign (t, array, stmt_seqp);
+ array = t;
+ }
+
/* Calculate the array offset. */
tree offset = create_tmp_var (sizetype, NULL);
gimplify_assign (offset, TYPE_SIZE_UNIT (type), stmt_seqp);
@@ -9962,7 +10148,6 @@ finalize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
gimplify_assign (mem, build_simple_mem_ref (ptr), stmt_seqp);
/* Find the original reduction variable. */
- tree x = build_outer_var_ref (var, ctx);
if (is_reference (var))
var = build_simple_mem_ref (var);
@@ -10026,7 +10211,6 @@ process_reduction_data (gimple_seq *body, gimple_seq *in_stmt_seqp,
for (gsi = gsi_start (*body); !gsi_end_p (gsi); gsi_next (&gsi))
{
- tree call;
tree clauses, nthreads, t, c;
bool reduction_found = false;
@@ -10034,6 +10218,7 @@ process_reduction_data (gimple_seq *body, gimple_seq *in_stmt_seqp,
switch (gimple_code (stmt))
{
+ /* FIXME: A reduction may also appear in an oacc parallel. */
case GIMPLE_OMP_FOR:
clauses = gimple_omp_for_clauses (stmt);
@@ -10051,55 +10236,15 @@ process_reduction_data (gimple_seq *body, gimple_seq *in_stmt_seqp,
ctx = maybe_lookup_ctx (stmt);
t = NULL_TREE;
- /* The reduction clause may be nested inside a loop directive.
- Scan for the innermost vector_length clause. */
- for (omp_context *oc = ctx; oc; oc = oc->outer)
- {
- switch (gimple_code (oc->stmt))
- {
- case GIMPLE_OACC_PARALLEL:
- c = gimple_oacc_parallel_clauses (oc->stmt);
- break;
- case GIMPLE_OMP_FOR:
- c = gimple_omp_for_clauses (oc->stmt);
- break;
- default:
- c = NULL_TREE;
- break;
- }
-
- if (c && gimple_code (oc->stmt) == GIMPLE_OACC_PARALLEL)
- {
- t = find_omp_clause (c, OMP_CLAUSE_VECTOR_LENGTH);
- if (t)
- t = fold_convert_loc (OMP_CLAUSE_LOCATION (t),
- integer_type_node,
- OMP_CLAUSE_VECTOR_LENGTH_EXPR (t));
- break;
- }
- }
-
- if (!t)
- t = integer_one_node;
-
/* Extract the number of threads. */
- nthreads = create_tmp_var (TREE_TYPE (t), NULL);
+ nthreads = create_tmp_var (sizetype, NULL);
+ t = oacc_host_nthreads (ctx);
gimplify_assign (nthreads, t, in_stmt_seqp);
- /* Ensure nthreads >= 1. */
- stmt = gimple_build_assign_with_ops (MAX_EXPR, nthreads, nthreads,
- fold_convert(TREE_TYPE (nthreads),
- integer_one_node));
- gimple_seq_add_stmt (in_stmt_seqp, stmt);
-
- /* Set the number of threads. */
- /* FIXME: This needs to handle accelerators */
- call = builtin_decl_explicit (BUILT_IN_OMP_SET_NUM_THREADS);
- stmt = gimple_build_call (call, 1, nthreads);
- gimple_seq_add_stmt (in_stmt_seqp, stmt);
-
initialize_reduction_data (clauses, nthreads, in_stmt_seqp, ctx);
- finalize_reduction_data (clauses, nthreads, out_stmt_seqp, ctx);
+
+ if (finish_reduction_on_host (ctx))
+ finalize_reduction_data (clauses, nthreads, out_stmt_seqp, ctx);
break;
default:
// Scan for other directives which support reduction here.
new file mode 100644
@@ -0,0 +1,69 @@
+/* { dg-do compile } */
+
+/* Each builtin below is expected to be expanded by the compiler itself, so
+   no reference to its name may be left in the assembler output.  */
+
+void
+ntid (void)
+{
+  const int ntid_x = __builtin_GOACC_ntid (0);
+  const int ntid_y = __builtin_GOACC_ntid (1);
+  const int ntid_z = __builtin_GOACC_ntid (2);
+
+  /* { dg-final { scan-assembler-not "__builtin_GOACC_ntid" } } */
+}
+
+void
+tid (void)
+{
+  const int tid_x = __builtin_GOACC_tid (0);
+  const int tid_y = __builtin_GOACC_tid (1);
+  const int tid_z = __builtin_GOACC_tid (2);
+
+  /* { dg-final { scan-assembler-not "__builtin_GOACC_tid" } } */
+}
+
+void
+nctaid (void)
+{
+  const int nctaid_x = __builtin_GOACC_nctaid (0);
+  const int nctaid_y = __builtin_GOACC_nctaid (1);
+  const int nctaid_z = __builtin_GOACC_nctaid (2);
+
+  /* { dg-final { scan-assembler-not "__builtin_GOACC_nctaid" } } */
+}
+
+void
+ctaid (void)
+{
+  const int ctaid_x = __builtin_GOACC_ctaid (0);
+  const int ctaid_y = __builtin_GOACC_ctaid (1);
+  const int ctaid_z = __builtin_GOACC_ctaid (2);
+
+  /* { dg-final { scan-assembler-not "__builtin_GOACC_ctaid" } } */
+}
+
+void
+on_device (void)
+{
+  const int on_host = __builtin_acc_on_device (0);
+  const int on_accelerator = __builtin_acc_on_device (1);
+
+  /* { dg-final { scan-assembler-not "__builtin_acc_on_device" } } */
+}
+
+void
+acc_get_thread_num (void)
+{
+  const int thread_num = __builtin_GOACC_get_thread_num ();
+
+  /* { dg-final { scan-assembler-not "__builtin_GOACC_get_thread_num" } } */
+}
+
+void
+acc_get_num_threads (void)
+{
+  const int num_threads = __builtin_GOACC_get_num_threads ();
+
+  /* { dg-final { scan-assembler-not "__builtin_GOACC_get_num_threads" } } */
+}