2014-10-31 Cesar Philippidis <cesar@codesourcery.com>
Thomas Schwinge <thomas@codesourcery.com>
gcc/
* oacc-builtins.def (BUILT_IN_GOACC_GET_NUM_THREADS): New built-in
function.
(BUILT_IN_ACC_GET_DEVICE_TYPE): Likewise.
* omp-low.c (oacc_max_threads): New function.
(lower_reduction_clauses): Use the GOACC thread builtin functions to
determine the thread count. Handle multiple reduction variables.
(expand_omp_for_static_nochunk): Likewise.
(expand_omp_for_static_chunk): Likewise.
(finalize_reduction_data): General cleanups.
(process_reduction_data): Use acc_get_device_type to determine nthreads
at runtime.
libgomp/
* libgomp.map (GOACC_get_thread_num): Declare.
(GOACC_get_num_threads): Declare.
* libgomp_g.h (GOACC_get_thread_num): Declare.
(GOACC_get_num_threads): Declare.
* oacc-parallel.c (GOACC_parallel): Handle num_gangs.
(GOACC_get_num_threads): New function.
(GOACC_get_thread_num): New function.
* oacc-ptx.h: New file.
* plugin-nvptx.c (ABORT_PTX): Remove macro. Move to oacc-ptx.h.
(ACC_ON_DEVICE_PTX): Likewise.
(link_ptx): Also link ptx code defined by GOACC_INTERNAL_PTX.
(PTX_exec): Handle gangs/CTAs.
* testsuite/libgomp.oacc-c/reduction-1.c: New test.
* testsuite/libgomp.oacc-c/reduction-2.c: New test.
* testsuite/libgomp.oacc-c/reduction-3.c: New test.
* testsuite/libgomp.oacc-c/reduction-4.c: New test.
* testsuite/libgomp.oacc-c/reduction-5.: New test.
* testsuite/libgomp.oacc-c/reduction-initial-1.c: New test.
* testsuite/libgomp.oacc-fortran/reduction-1.f90: New test.
* testsuite/libgomp.oacc-fortran/reduction-2.f90: New test.
* testsuite/libgomp.oacc-fortran/reduction-3.f90: New test.
* testsuite/libgomp.oacc-fortran/reduction-4.f90: New test.
* testsuite/libgomp.oacc-fortran/reduction-5.f90: New test.
* testsuite/libgomp.oacc-fortran/reduction-6.f90: New test.
@@ -27,6 +27,8 @@ along with GCC; see the file COPYING3. If not see
See builtins.def for details. */
+DEF_GOACC_BUILTIN (BUILT_IN_ACC_GET_DEVICE_TYPE, "acc_get_device_type",
+ BT_FN_INT, ATTR_NOTHROW_LIST)
DEF_GOACC_BUILTIN (BUILT_IN_GOACC_DATA_START, "GOACC_data_start",
BT_FN_VOID_INT_PTR_SIZE_PTR_PTR_PTR, ATTR_NOTHROW_LIST)
DEF_GOACC_BUILTIN (BUILT_IN_GOACC_DATA_END, "GOACC_data_end",
@@ -45,3 +47,7 @@ DEF_GOACC_BUILTIN (BUILT_IN_GOACC_WAIT, "GOACC_wait",
ATTR_NOTHROW_LIST)
DEF_GOACC_BUILTIN_COMPILER (BUILT_IN_ACC_ON_DEVICE, "acc_on_device",
BT_FN_INT_INT, ATTR_CONST_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_GET_THREAD_NUM, "GOACC_get_thread_num",
+ BT_FN_INT, ATTR_CONST_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_GET_NUM_THREADS, "GOACC_get_num_threads",
+ BT_FN_INT, ATTR_CONST_NOTHROW_LEAF_LIST)
@@ -253,6 +253,52 @@ omp_get_id (tree node)
return IDENTIFIER_POINTER(get_identifier (temp_name));
}
+/* Determine the number of threads OpenACC threads used to determine the
+ size of the array of partial reductions. Currently, this is num_gangs
+ * vector_length. This value may be different than GOACC_GET_NUM_THREADS,
+ because it is independed of the device used. */
+
+static tree
+oacc_max_threads (omp_context *ctx)
+{
+ tree nthreads, vector_length, gangs, clauses;
+
+ gangs = fold_convert (sizetype, integer_one_node);
+ vector_length = gangs;
+
+ /* The reduction clause may be nested inside a loop directive.
+ Scan for the innermost vector_length clause. */
+ for (omp_context *oc = ctx; oc; oc = oc->outer)
+ {
+ if (gimple_code (oc->stmt) != GIMPLE_OACC_PARALLEL)
+ continue;
+
+ clauses = gimple_oacc_parallel_clauses (oc->stmt);
+
+ vector_length = find_omp_clause (clauses, OMP_CLAUSE_VECTOR_LENGTH);
+ if (vector_length)
+ vector_length = fold_convert_loc (OMP_CLAUSE_LOCATION (vector_length),
+ sizetype,
+ OMP_CLAUSE_VECTOR_LENGTH_EXPR
+ (vector_length));
+ else
+ vector_length = fold_convert (sizetype, integer_one_node);
+
+ gangs = find_omp_clause (clauses, OMP_CLAUSE_NUM_GANGS);
+ if (gangs)
+ gangs = fold_convert_loc (OMP_CLAUSE_LOCATION (gangs), sizetype,
+ OMP_CLAUSE_NUM_GANGS_EXPR (gangs));
+ else
+ gangs = fold_convert (sizetype, integer_one_node);
+
+ break;
+ }
+
+ nthreads = fold_build2 (MULT_EXPR, sizetype, gangs, vector_length);
+
+ return nthreads;
+}
+
/* Holds a decl for __OPENMP_TARGET__. */
static GTY(()) tree offload_symbol_decl;
@@ -4429,6 +4475,57 @@ lower_lastprivate_clauses (tree clauses, tree predicate, gimple_seq *stmt_list,
gimple_seq_add_stmt (stmt_list, gimple_build_label (label));
}
+static void
+lower_reduction_var_helper (gimple_seq *stmt_seqp, omp_context *ctx, tree tid,
+ tree var, tree new_var)
+{
+ /* The atomic add at the end of the sum creates unnecessary
+ write contention on accelerators. To work around this,
+ create an array to store the partial reductions. Later, in
+ lower_omp_for (for openacc), the values of array will be
+ combined. */
+
+ tree t = NULL_TREE, array, x;
+ tree type = get_base_type (var);
+ gimple stmt;
+
+ /* Now insert the partial reductions into the array. */
+
+ /* Find the reduction array. */
+
+ tree ptype = build_pointer_type (type);
+
+ t = lookup_reduction (omp_get_id (var), ctx);
+ t = build_receiver_ref (t, false, ctx->outer);
+
+ array = create_tmp_var (ptype, NULL);
+ gimplify_assign (array, t, stmt_seqp);
+
+ tree ptr = create_tmp_var (TREE_TYPE (array), NULL);
+
+ /* Find the reduction array. */
+
+ /* testing a unary conversion. */
+ tree offset = create_tmp_var (sizetype, NULL);
+ gimplify_assign (offset, TYPE_SIZE_UNIT (type),
+ stmt_seqp);
+ t = create_tmp_var (sizetype, NULL);
+ gimplify_assign (t, unshare_expr (fold_build1 (NOP_EXPR, sizetype, tid)),
+ stmt_seqp);
+ stmt = gimple_build_assign_with_ops (MULT_EXPR, offset, offset, t);
+ gimple_seq_add_stmt (stmt_seqp, stmt);
+
+ /* Offset expression. Does the POINTER_PLUS_EXPR take care
+ of adding sizeof(var) to the array? */
+ ptr = create_tmp_var (ptype, NULL);
+ stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, unshare_expr(ptr),
+ array, offset);
+ gimple_seq_add_stmt (stmt_seqp, stmt);
+
+ /* Move the local sum to gfc$sum[i]. */
+ x = unshare_expr (build_simple_mem_ref (ptr));
+ stmt = gimplify_assign (x, new_var, stmt_seqp);
+}
/* Generate code to implement the REDUCTION clauses. */
@@ -4437,7 +4534,7 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx)
{
gimple_seq sub_seq = NULL;
gimple stmt;
- tree x, c;
+ tree x, c, tid;
int count = 0;
/* SIMD reductions are handled in lower_rec_input_clauses. */
@@ -4462,6 +4559,17 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx)
if (count == 0)
return;
+ /* Initialize thread info for OpenACC. */
+ if (is_gimple_omp_oacc_specifically (ctx->stmt))
+ {
+ /* Get the current thread id. */
+ tree call = builtin_decl_explicit (BUILT_IN_GOACC_GET_THREAD_NUM);
+ tid = create_tmp_var (TREE_TYPE (TREE_TYPE (call)), NULL);
+ gimple stmt = gimple_build_call (call, 0);
+ gimple_call_set_lhs (stmt, tid);
+ gimple_seq_add_stmt (stmt_seqp, stmt);
+ }
+
for (c = clauses; c ; c = OMP_CLAUSE_CHAIN (c))
{
tree var, ref, new_var;
@@ -4498,114 +4606,8 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx)
}
else
{
- /* The atomic add at the end of the sum creates unnecessary
- write contention on accelerators. To work around this,
- create an array to store the partial reductions. Later, in
- lower_omp_for (for openacc), the values of array will be
- combined. */
-
- tree t = NULL_TREE, array, nthreads;
- tree type = get_base_type (var);
-
- /* First ensure that the current tid is less than vector_length. */
- tree exit_label = create_artificial_label (UNKNOWN_LOCATION);
- tree reduction_label = create_artificial_label (UNKNOWN_LOCATION);
-
- /* Get the current thread id. */
- tree call = builtin_decl_explicit (BUILT_IN_OMP_GET_THREAD_NUM);
- gimple stmt = gimple_build_call (call, 1, integer_zero_node);
- tree fntype = gimple_call_fntype (stmt);
- tree tid = create_tmp_var (TREE_TYPE (fntype), NULL);
- gimple_call_set_lhs (stmt, tid);
- gimple_seq_add_stmt (stmt_seqp, stmt);
-
- /* Find the total number of threads. A reduction clause
- only appears inside a loop construction or a combined
- parallel and loop construct. */
- tree c;
-
- if (gimple_code (ctx->stmt) == GIMPLE_OMP_FOR)
- c = gimple_oacc_parallel_clauses (ctx->outer->stmt);
- else
- c = gimple_oacc_parallel_clauses (ctx->stmt);
-
- t = find_omp_clause (c, OMP_CLAUSE_VECTOR_LENGTH);
-
- if (t)
- {
- t = fold_convert_loc (OMP_CLAUSE_LOCATION (t),
- integer_type_node,
- OMP_CLAUSE_VECTOR_LENGTH_EXPR (t));
- }
-
- if (!t)
- t = integer_one_node;
-
- /* Extract the number of threads. */
- nthreads = create_tmp_var (sizetype, NULL);
- gimplify_assign (nthreads, fold_build1 (NOP_EXPR, sizetype, t),
- stmt_seqp);
- stmt = gimple_build_assign_with_ops (MINUS_EXPR, nthreads, nthreads,
- fold_build1 (NOP_EXPR, sizetype,
- integer_one_node));
- gimple_seq_add_stmt (stmt_seqp, stmt);
-
- /* If tid >= nthreads, goto exit_label. */
- t = create_tmp_var (sizetype, NULL);
- gimplify_assign (t, fold_build1 (NOP_EXPR, sizetype, tid),
- stmt_seqp);
- stmt = gimple_build_cond (GT_EXPR, t, nthreads, exit_label,
- reduction_label);
- gimple_seq_add_stmt (stmt_seqp, stmt);
-
- /* Place the reduction_label here. */
-
- gimple_seq_add_stmt (stmt_seqp,
- gimple_build_label (reduction_label));
-
- /* Now insert the partial reductions into the array. */
-
- /* Find the reduction array. */
-
- tree ptype = build_pointer_type (type);
-
- t = lookup_reduction (omp_get_id (var), ctx);
- t = build_receiver_ref (t, false, ctx->outer);
-
- array = create_tmp_var (ptype, NULL);
- gimplify_assign (array, t, stmt_seqp);
-
- tree ptr = create_tmp_var (TREE_TYPE (array), NULL);
-
- /* Find the reduction array. */
-
- /* testing a unary conversion. */
- tree offset = create_tmp_var (sizetype, NULL);
- gimplify_assign (offset, TYPE_SIZE_UNIT (type),
- stmt_seqp);
- t = create_tmp_var (sizetype, NULL);
- gimplify_assign (t, unshare_expr (fold_build1 (NOP_EXPR, sizetype,
- tid)),
- stmt_seqp);
- stmt = gimple_build_assign_with_ops (MULT_EXPR, offset, offset, t);
- gimple_seq_add_stmt (stmt_seqp, stmt);
-
- /* Offset expression. Does the POINTER_PLUS_EXPR take care
- of adding sizeof(var) to the array? */
- ptr = create_tmp_var (ptype, NULL);
- stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR,
- unshare_expr(ptr),
- array, offset);
- gimple_seq_add_stmt (stmt_seqp, stmt);
-
- /* Move the local sum to gfc$sum[i]. */
- x = unshare_expr (build_simple_mem_ref (ptr));
- stmt = gimplify_assign (x, new_var, stmt_seqp);
-
- /* Place exit label here. */
- gimple_seq_add_stmt (stmt_seqp, gimple_build_label (exit_label));
-
- return;
+ lower_reduction_var_helper (stmt_seqp, ctx, tid, var, new_var);
+ return;
}
}
@@ -4626,12 +4628,22 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx)
}
else
{
- x = build2 (code, TREE_TYPE (ref), ref, new_var);
- ref = build_outer_var_ref (var, ctx);
- gimplify_assign (ref, x, &sub_seq);
+ if (is_gimple_omp_oacc_specifically (ctx->stmt))
+ {
+ lower_reduction_var_helper (stmt_seqp, ctx, tid, var, new_var);
+ }
+ else
+ {
+ x = build2 (code, TREE_TYPE (ref), ref, new_var);
+ ref = build_outer_var_ref (var, ctx);
+ gimplify_assign (ref, x, &sub_seq);
+ }
}
}
+ if (is_gimple_omp_oacc_specifically (ctx->stmt))
+ return;
+
stmt = gimple_build_call (builtin_decl_explicit (BUILT_IN_GOMP_ATOMIC_START),
0);
gimple_seq_add_stmt (stmt_seqp, stmt);
@@ -7045,8 +7057,10 @@ expand_omp_for_static_nochunk (struct omp_region *region,
threadid = build_call_expr (threadid, 0);
break;
case GF_OMP_FOR_KIND_OACC_LOOP:
- nthreads = integer_one_node;
- threadid = integer_zero_node;
+ nthreads = builtin_decl_explicit (BUILT_IN_GOACC_GET_NUM_THREADS);
+ nthreads = build_call_expr (nthreads, 0);
+ threadid = builtin_decl_explicit (BUILT_IN_GOACC_GET_THREAD_NUM);
+ threadid = build_call_expr (threadid, 0);
break;
default:
gcc_unreachable ();
@@ -7449,8 +7463,10 @@ expand_omp_for_static_chunk (struct omp_region *region,
threadid = build_call_expr (threadid, 0);
break;
case GF_OMP_FOR_KIND_OACC_LOOP:
- nthreads = integer_one_node;
- threadid = integer_zero_node;
+ nthreads = builtin_decl_explicit (BUILT_IN_GOACC_GET_NUM_THREADS);
+ nthreads = build_call_expr (nthreads, 0);
+ threadid = builtin_decl_explicit (BUILT_IN_GOACC_GET_THREAD_NUM);
+ threadid = build_call_expr (threadid, 0);
break;
default:
gcc_unreachable ();
@@ -10044,11 +10060,10 @@ initialize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
}
}
-/* Helper function to finalize local data for the reduction arrays. The
- reduction array needs to be reduced to the original reduction variable.
- FIXME: This function assumes that there are vector_length threads in
- total. Also, it assumes that there are at least vector_length iterations
- in the for loop. */
+/* Helper function to process the array of partial reductions. Nthreads
+ indicates the number of threads. Unfortunately, GOACC_GET_NUM_THREADS
+ cannot be used here, because nthreads on the host may be different than
+ on the accelerator. */
static void
finalize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
@@ -10056,7 +10071,7 @@ finalize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
{
gcc_assert (is_gimple_omp_oacc_specifically (ctx->stmt));
- tree c, var, array, loop_header, loop_body, loop_exit, type;
+ tree c, x, var, array, loop_header, loop_body, loop_exit, type;
gimple stmt;
/* Create for loop.
@@ -10080,8 +10095,8 @@ finalize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
/* Insert the loop header label here. */
gimple_seq_add_stmt (stmt_seqp, gimple_build_label (loop_header));
- /* Loop if ix >= nthreads. */
- tree x = create_tmp_var (sizetype, NULL);
+ /* Exit loop if ix >= nthreads. */
+ x = create_tmp_var (sizetype, NULL);
gimplify_assign (x, fold_build1 (NOP_EXPR, sizetype, nthreads), stmt_seqp);
stmt = gimple_build_cond (GE_EXPR, ix, x, loop_exit, loop_body);
gimple_seq_add_stmt (stmt_seqp, stmt);
@@ -10123,7 +10138,6 @@ finalize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
gimplify_assign (mem, build_simple_mem_ref (ptr), stmt_seqp);
/* Find the original reduction variable. */
- tree x = build_outer_var_ref (var, ctx);
if (is_reference (var))
var = build_simple_mem_ref (var);
@@ -10196,14 +10210,15 @@ process_reduction_data (gimple_seq *body, gimple_seq *in_stmt_seqp,
for (gsi = gsi_start (*body); !gsi_end_p (gsi); gsi_next (&gsi))
{
- tree call;
- tree clauses, nthreads, t, c;
+ tree clauses, nthreads, t, c, acc_device, acc_device_host, call,
+ enter, exit;
bool reduction_found = false;
stmt = gsi_stmt (gsi);
switch (gimple_code (stmt))
{
+ /* FIXME: A reduction may also appear in an oacc parallel. */
case GIMPLE_OMP_FOR:
clauses = gimple_omp_for_clauses (stmt);
@@ -10221,52 +10236,53 @@ process_reduction_data (gimple_seq *body, gimple_seq *in_stmt_seqp,
ctx = maybe_lookup_ctx (stmt);
t = NULL_TREE;
- /* The reduction clause may be nested inside a loop directive.
- Scan for the innermost vector_length clause. */
- for (omp_context *oc = ctx; oc; oc = oc->outer)
- {
- switch (gimple_code (oc->stmt))
- {
- case GIMPLE_OACC_PARALLEL:
- c = gimple_oacc_parallel_clauses (oc->stmt);
- break;
- case GIMPLE_OMP_FOR:
- c = gimple_omp_for_clauses (oc->stmt);
- break;
- default:
- c = NULL_TREE;
- break;
- }
-
- if (c && gimple_code (oc->stmt) == GIMPLE_OACC_PARALLEL)
- {
- t = find_omp_clause (c, OMP_CLAUSE_VECTOR_LENGTH);
- if (t)
- t = fold_convert_loc (OMP_CLAUSE_LOCATION (t),
- integer_type_node,
- OMP_CLAUSE_VECTOR_LENGTH_EXPR (t));
- break;
- }
- }
-
- if (!t)
- t = integer_one_node;
-
/* Extract the number of threads. */
- nthreads = create_tmp_var (TREE_TYPE (t), NULL);
+ nthreads = create_tmp_var (sizetype, NULL);
+ t = oacc_max_threads (ctx);
gimplify_assign (nthreads, t, in_stmt_seqp);
- /* Ensure nthreads >= 1. */
- stmt = gimple_build_assign_with_ops (MAX_EXPR, nthreads, nthreads,
- fold_convert(TREE_TYPE (nthreads),
- integer_one_node));
+ /* Determine if this is kernel will be executed on the host. */
+ call = builtin_decl_explicit (BUILT_IN_ACC_GET_DEVICE_TYPE);
+ acc_device = create_tmp_var (integer_type_node, ".acc_device_type");
+ stmt = gimple_build_call (call, 0);
+ gimple_call_set_lhs (stmt, acc_device);
gimple_seq_add_stmt (in_stmt_seqp, stmt);
- /* Set the number of threads. */
- /* FIXME: This needs to handle accelerators */
- call = builtin_decl_explicit (BUILT_IN_OMP_SET_NUM_THREADS);
- stmt = gimple_build_call (call, 1, nthreads);
+ /* Set nthreads = 1 for ACC_DEVICE_TYPE=host. */
+ acc_device_host = create_tmp_var (integer_type_node,
+ ".acc_device_host");
+ gimplify_assign (acc_device_host, build_int_cst (integer_type_node,
+ 2),
+ in_stmt_seqp);
+
+ enter = create_artificial_label (UNKNOWN_LOCATION);
+ exit = create_artificial_label (UNKNOWN_LOCATION);
+
+ stmt = gimple_build_cond (EQ_EXPR, acc_device, acc_device_host,
+ enter, exit);
+ gimple_seq_add_stmt (in_stmt_seqp, stmt);
+ gimple_seq_add_stmt (in_stmt_seqp, gimple_build_label (enter));
+ gimplify_assign (nthreads, fold_build1 (NOP_EXPR, sizetype,
+ integer_one_node),
+ in_stmt_seqp);
+ gimple_seq_add_stmt (in_stmt_seqp, gimple_build_label (exit));
+
+ /* Also, set nthreads = 1 for ACC_DEVICE_TYPE=host_nonshm. */
+ gimplify_assign (acc_device_host, build_int_cst (integer_type_node,
+ 3),
+ in_stmt_seqp);
+
+ enter = create_artificial_label (UNKNOWN_LOCATION);
+ exit = create_artificial_label (UNKNOWN_LOCATION);
+
+ stmt = gimple_build_cond (EQ_EXPR, acc_device, acc_device_host,
+ enter, exit);
gimple_seq_add_stmt (in_stmt_seqp, stmt);
+ gimple_seq_add_stmt (in_stmt_seqp, gimple_build_label (enter));
+ gimplify_assign (nthreads, fold_build1 (NOP_EXPR, sizetype,
+ integer_one_node),
+ in_stmt_seqp);
+ gimple_seq_add_stmt (in_stmt_seqp, gimple_build_label (exit));
initialize_reduction_data (clauses, nthreads, in_stmt_seqp, ctx);
finalize_reduction_data (clauses, nthreads, out_stmt_seqp, ctx);
@@ -321,6 +321,8 @@ GOACC_2.0 {
GOACC_parallel;
GOACC_update;
GOACC_wait;
+ GOACC_get_thread_num;
+ GOACC_get_num_threads;
};
# FIXME: Hygiene/grouping/naming?
@@ -230,5 +230,7 @@ extern void GOACC_update (int device, const void *openmp_target, size_t mapnum,
unsigned short *kinds, int async,
int num_waits, ...);
extern void GOACC_wait (int, int, ...);
+extern int GOACC_get_num_threads (void);
+extern int GOACC_get_thread_num (void);
#endif /* LIBGOMP_G_H */
@@ -117,9 +117,6 @@ GOACC_parallel (int device, void (*fn) (void *), const void *openmp_target,
splay_tree_key tgt_fn_key;
void (*tgt_fn);
- if (num_gangs != 1)
- gomp_fatal ("num_gangs (%d) different from one is not yet supported",
- num_gangs);
if (num_workers != 1)
gomp_fatal ("num_workers (%d) different from one is not yet supported",
num_workers);
@@ -389,3 +386,15 @@ GOACC_wait (int async, int num_waits, ...)
va_end (ap);
}
+
+int
+GOACC_get_num_threads (void)
+{
+ return 1;
+}
+
+int
+GOACC_get_thread_num (void)
+{
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,176 @@
+#define ABORT_PTX \
+ ".version 3.1\n" \
+ ".target sm_30\n" \
+ ".address_size 64\n" \
+ ".visible .func abort;\n" \
+ ".visible .func abort\n" \
+ "{\n" \
+ "trap;\n" \
+ "ret;\n" \
+ "}\n" \
+ ".visible .func _gfortran_abort;\n" \
+ ".visible .func _gfortran_abort\n" \
+ "{\n" \
+ "trap;\n" \
+ "ret;\n" \
+ "}\n" \
+
+/* Generated with:
+
+ $ echo 'int acc_on_device(int d) { return __builtin_acc_on_device(d); } int acc_on_device_h_(int *d) { return acc_on_device(*d); }' | accel-gcc/xgcc -Baccel-gcc -x c - -o - -S -m64 -O3 -fno-builtin-acc_on_device -fno-inline
+*/
+#define ACC_ON_DEVICE_PTX \
+ " .version 3.1\n" \
+ " .target sm_30\n" \
+ " .address_size 64\n" \
+ ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1);\n" \
+ ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1)\n" \
+ "{\n" \
+ " .reg.u32 %ar1;\n" \
+ ".reg.u32 %retval;\n" \
+ " .reg.u64 %hr10;\n" \
+ " .reg.u32 %r24;\n" \
+ " .reg.u32 %r25;\n" \
+ " .reg.pred %r27;\n" \
+ " .reg.u32 %r30;\n" \
+ " ld.param.u32 %ar1, [%in_ar1];\n" \
+ " mov.u32 %r24, %ar1;\n" \
+ " setp.ne.u32 %r27,%r24,4;\n" \
+ " set.u32.eq.u32 %r30,%r24,5;\n" \
+ " neg.s32 %r25, %r30;\n" \
+ " @%r27 bra $L3;\n" \
+ " mov.u32 %r25, 1;\n" \
+ "$L3:\n" \
+ " mov.u32 %retval, %r25;\n" \
+ " st.param.u32 [%out_retval], %retval;\n" \
+ " ret;\n" \
+ " }\n" \
+ ".visible .func (.param.u32 %out_retval)acc_on_device_h_(.param.u64 %in_ar1);\n" \
+ ".visible .func (.param.u32 %out_retval)acc_on_device_h_(.param.u64 %in_ar1)\n" \
+ "{\n" \
+ " .reg.u64 %ar1;\n" \
+ ".reg.u32 %retval;\n" \
+ " .reg.u64 %hr10;\n" \
+ " .reg.u64 %r25;\n" \
+ " .reg.u32 %r26;\n" \
+ " .reg.u32 %r27;\n" \
+ " ld.param.u64 %ar1, [%in_ar1];\n" \
+ " mov.u64 %r25, %ar1;\n" \
+ " ld.u32 %r26, [%r25];\n" \
+ " {\n" \
+ " .param.u32 %retval_in;\n" \
+ " {\n" \
+ " .param.u32 %out_arg0;\n" \
+ " st.param.u32 [%out_arg0], %r26;\n" \
+ " call (%retval_in), acc_on_device, (%out_arg0);\n" \
+ " }\n" \
+ " ld.param.u32 %r27, [%retval_in];\n" \
+ "}\n" \
+ " mov.u32 %retval, %r27;\n" \
+ " st.param.u32 [%out_retval], %retval;\n" \
+ " ret;\n" \
+ " }"
+
+ #define GOACC_INTERNAL_PTX \
+ ".version 3.1\n" \
+ ".target sm_30\n" \
+ ".address_size 64\n" \
+ ".visible .func (.param .u32 %out_retval) GOACC_get_num_threads;\n" \
+ ".visible .func (.param .u32 %out_retval) GOACC_get_thread_num;\n" \
+ ".extern .func abort;\n" \
+ ".visible .func (.param .u32 %out_retval) GOACC_get_num_threads\n" \
+ "{\n" \
+ ".reg .u32 %retval;\n" \
+ ".reg .u64 %hr10;\n" \
+ ".reg .u32 %r22;\n" \
+ ".reg .u32 %r23;\n" \
+ ".reg .u32 %r24;\n" \
+ ".reg .u32 %r25;\n" \
+ ".reg .u32 %r26;\n" \
+ ".reg .u32 %r27;\n" \
+ ".reg .u32 %r28;\n" \
+ ".reg .u32 %r29;\n" \
+ "mov.u32 %r26,0;\n" \
+ "{\n" \
+ ".param .u32 %retval_in;\n" \
+ "{\n" \
+ ".param .u32 %out_arg0;\n" \
+ "st.param.u32 [%out_arg0],%r26;\n" \
+ "call (%retval_in),GOACC_ntid,(%out_arg0);\n" \
+ "}\n" \
+ "ld.param.u32 %r27,[%retval_in];\n" \
+ "}\n" \
+ "mov.u32 %r22,%r27;\n" \
+ "mov.u32 %r28,0;\n" \
+ "{\n" \
+ ".param .u32 %retval_in;\n" \
+ "{\n" \
+ ".param .u32 %out_arg0;\n" \
+ "st.param.u32 [%out_arg0],%r28;\n" \
+ "call (%retval_in),GOACC_nctaid,(%out_arg0);\n" \
+ "}\n" \
+ "ld.param.u32 %r29,[%retval_in];\n" \
+ "}\n" \
+ "mov.u32 %r23,%r29;\n" \
+ "mul.lo.u32 %r24,%r22,%r23;\n" \
+ "mov.u32 %r25,%r24;\n" \
+ "mov.u32 %retval,%r25;\n" \
+ "st.param.u32 [%out_retval],%retval;\n" \
+ "ret;\n" \
+ "}\n" \
+ ".visible .func (.param .u32 %out_retval) GOACC_get_thread_num\n" \
+ "{\n" \
+ ".reg .u32 %retval;\n" \
+ ".reg .u64 %hr10;\n" \
+ ".reg .u32 %r22;\n" \
+ ".reg .u32 %r23;\n" \
+ ".reg .u32 %r24;\n" \
+ ".reg .u32 %r25;\n" \
+ ".reg .u32 %r26;\n" \
+ ".reg .u32 %r27;\n" \
+ ".reg .u32 %r28;\n" \
+ ".reg .u32 %r29;\n" \
+ ".reg .u32 %r30;\n" \
+ ".reg .u32 %r31;\n" \
+ ".reg .u32 %r32;\n" \
+ ".reg .u32 %r33;\n" \
+ "mov.u32 %r28,0;\n" \
+ "{\n" \
+ ".param .u32 %retval_in;\n" \
+ "{\n" \
+ ".param .u32 %out_arg0;\n" \
+ "st.param.u32 [%out_arg0],%r28;\n" \
+ "call (%retval_in),GOACC_ntid,(%out_arg0);\n" \
+ "}\n" \
+ "ld.param.u32 %r29,[%retval_in];\n" \
+ "}\n" \
+ "mov.u32 %r22,%r29;\n" \
+ "mov.u32 %r30,0;\n" \
+ "{\n" \
+ ".param .u32 %retval_in;\n" \
+ "{\n" \
+ ".param .u32 %out_arg0;\n" \
+ "st.param.u32 [%out_arg0],%r30;\n" \
+ "call (%retval_in),GOACC_ctaid,(%out_arg0);\n" \
+ "}\n" \
+ "ld.param.u32 %r31,[%retval_in];\n" \
+ "}\n" \
+ "mov.u32 %r23,%r31;\n" \
+ "mul.lo.u32 %r24,%r22,%r23;\n" \
+ "mov.u32 %r32,0;\n" \
+ "{\n" \
+ ".param .u32 %retval_in;\n" \
+ "{\n" \
+ ".param .u32 %out_arg0;\n" \
+ "st.param.u32 [%out_arg0],%r32;\n" \
+ "call (%retval_in),GOACC_tid,(%out_arg0);\n" \
+ "}\n" \
+ "ld.param.u32 %r33,[%retval_in];\n" \
+ "}\n" \
+ "mov.u32 %r25,%r33;\n" \
+ "add.u32 %r26,%r24,%r25;\n" \
+ "mov.u32 %r27,%r26;\n" \
+ "mov.u32 %retval,%r27;\n" \
+ "st.param.u32 [%out_retval],%retval;\n" \
+ "ret;\n" \
+ "}\n"
@@ -35,6 +35,7 @@
#include "libgomp.h"
#include "target.h"
#include "libgomp-plugin.h"
+#include "oacc-ptx.h"
#include "oacc-plugin.h"
#include <cuda.h>
@@ -740,79 +741,6 @@ PTX_avail(void)
return avail;
}
-#define ABORT_PTX \
- ".version 3.1\n" \
- ".target sm_30\n" \
- ".address_size 64\n" \
- ".visible .func abort;\n" \
- ".visible .func abort\n" \
- "{\n" \
- "trap;\n" \
- "ret;\n" \
- "}\n" \
- ".visible .func _gfortran_abort;\n" \
- ".visible .func _gfortran_abort\n" \
- "{\n" \
- "trap;\n" \
- "ret;\n" \
- "}\n" \
-
-/* Generated with:
-
- $ echo 'int acc_on_device(int d) { return __builtin_acc_on_device(d); } int acc_on_device_h_(int *d) { return acc_on_device(*d); }' | accel-gcc/xgcc -Baccel-gcc -x c - -o - -S -m64 -O3 -fno-builtin-acc_on_device -fno-inline
-*/
-#define ACC_ON_DEVICE_PTX \
- " .version 3.1\n" \
- " .target sm_30\n" \
- " .address_size 64\n" \
- ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1);\n" \
- ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1)\n" \
- "{\n" \
- " .reg.u32 %ar1;\n" \
- ".reg.u32 %retval;\n" \
- " .reg.u64 %hr10;\n" \
- " .reg.u32 %r24;\n" \
- " .reg.u32 %r25;\n" \
- " .reg.pred %r27;\n" \
- " .reg.u32 %r30;\n" \
- " ld.param.u32 %ar1, [%in_ar1];\n" \
- " mov.u32 %r24, %ar1;\n" \
- " setp.ne.u32 %r27,%r24,4;\n" \
- " set.u32.eq.u32 %r30,%r24,5;\n" \
- " neg.s32 %r25, %r30;\n" \
- " @%r27 bra $L3;\n" \
- " mov.u32 %r25, 1;\n" \
- "$L3:\n" \
- " mov.u32 %retval, %r25;\n" \
- " st.param.u32 [%out_retval], %retval;\n" \
- " ret;\n" \
- " }\n" \
- ".visible .func (.param.u32 %out_retval)acc_on_device_h_(.param.u64 %in_ar1);\n" \
- ".visible .func (.param.u32 %out_retval)acc_on_device_h_(.param.u64 %in_ar1)\n" \
- "{\n" \
- " .reg.u64 %ar1;\n" \
- ".reg.u32 %retval;\n" \
- " .reg.u64 %hr10;\n" \
- " .reg.u64 %r25;\n" \
- " .reg.u32 %r26;\n" \
- " .reg.u32 %r27;\n" \
- " ld.param.u64 %ar1, [%in_ar1];\n" \
- " mov.u64 %r25, %ar1;\n" \
- " ld.u32 %r26, [%r25];\n" \
- " {\n" \
- " .param.u32 %retval_in;\n" \
- " {\n" \
- " .param.u32 %out_arg0;\n" \
- " st.param.u32 [%out_arg0], %r26;\n" \
- " call (%retval_in), acc_on_device, (%out_arg0);\n" \
- " }\n" \
- " ld.param.u32 %r27, [%retval_in];\n" \
- "}\n" \
- " mov.u32 %retval, %r27;\n" \
- " st.param.u32 [%out_retval], %retval;\n" \
- " ret;\n" \
- " }"
-
static void
link_ptx (CUmodule *module, char *ptx_code)
{
@@ -874,6 +802,16 @@ link_ptx (CUmodule *module, char *ptx_code)
cuErrorMsg (r));
}
+ char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
+ r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx,
+ strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0);
+ if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
+ GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
+ cuErrorMsg (r));
+ }
+
r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, ptx_code,
strlen (ptx_code) + 1, 0, 0, 0, 0);
if (r != CUDA_SUCCESS)
@@ -1053,7 +991,7 @@ PTX_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
kargs[0] = &dp;
r = cuLaunchKernel (function,
- 1, 1, 1,
+ num_gangs, 1, 1,
nthreads_in_block, 1, 1,
0, dev_str->stream, kargs, 0);
if (r != CUDA_SUCCESS)
new file mode 100644
@@ -0,0 +1,174 @@
+/* { dg-do run } */
+
+/* Integer reductions. */
+
+#include <stdlib.h>
+#include <stdbool.h>
+
+#define vl 32
+
+int
+main(void)
+{
+ const int n = 1000;
+ int i;
+ int vresult, result, array[n];
+ bool lvresult, lresult;
+
+ for (i = 0; i < n; i++)
+ array[i] = i;
+
+ result = 0;
+ vresult = 0;
+
+ /* '+' reductions. */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (+:result)
+ for (i = 0; i < n; i++)
+ result += array[i];
+
+ /* Verify the reduction. */
+ for (i = 0; i < n; i++)
+ vresult += array[i];
+
+ if (result != vresult)
+ abort ();
+
+ result = 0;
+ vresult = 0;
+
+ /* '*' reductions. */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (*:result)
+ for (i = 0; i < n; i++)
+ result *= array[i];
+
+ /* Verify the reduction. */
+ for (i = 0; i < n; i++)
+ vresult *= array[i];
+
+ if (result != vresult)
+ abort ();
+
+// result = 0;
+// vresult = 0;
+//
+// /* 'max' reductions. */
+// #pragma acc parallel vector_length (vl)
+// #pragma acc loop reduction (+:result)
+// for (i = 0; i < n; i++)
+// result = result > array[i] ? result : array[i];
+//
+// /* Verify the reduction. */
+// for (i = 0; i < n; i++)
+// vresult = vresult > array[i] ? vresult : array[i];
+//
+// printf("%d != %d\n", result, vresult);
+// if (result != vresult)
+// abort ();
+//
+// result = 0;
+// vresult = 0;
+//
+// /* 'min' reductions. */
+// #pragma acc parallel vector_length (vl)
+// #pragma acc loop reduction (+:result)
+// for (i = 0; i < n; i++)
+// result = result < array[i] ? result : array[i];
+//
+// /* Verify the reduction. */
+// for (i = 0; i < n; i++)
+// vresult = vresult < array[i] ? vresult : array[i];
+//
+// printf("%d != %d\n", result, vresult);
+// if (result != vresult)
+// abort ();
+
+ result = 0;
+ vresult = 0;
+
+ /* '&' reductions. */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (&:result)
+ for (i = 0; i < n; i++)
+ result &= array[i];
+
+ /* Verify the reduction. */
+ for (i = 0; i < n; i++)
+ vresult &= array[i];
+
+ if (result != vresult)
+ abort ();
+
+ result = 0;
+ vresult = 0;
+
+ /* '|' reductions. */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (|:result)
+ for (i = 0; i < n; i++)
+ result |= array[i];
+
+ /* Verify the reduction. */
+ for (i = 0; i < n; i++)
+ vresult |= array[i];
+
+ if (result != vresult)
+ abort ();
+
+ result = 0;
+ vresult = 0;
+
+ /* '^' reductions. */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (^:result)
+ for (i = 0; i < n; i++)
+ result ^= array[i];
+
+ /* Verify the reduction. */
+ for (i = 0; i < n; i++)
+ vresult ^= array[i];
+
+ if (result != vresult)
+ abort ();
+
+ result = 5;
+ vresult = 5;
+
+ lresult = false;
+ lvresult = false;
+
+ /* '&&' reductions. */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (&&:lresult)
+ for (i = 0; i < n; i++)
+ lresult = lresult && (result > array[i]);
+
+ /* Verify the reduction. */
+ for (i = 0; i < n; i++)
+ lvresult = lresult && (result > array[i]);
+
+ if (lresult != lvresult)
+ abort ();
+
+ result = 5;
+ vresult = 5;
+
+ lresult = false;
+ lvresult = false;
+
+ /* '||' reductions. */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (||:lresult)
+ for (i = 0; i < n; i++)
+ lresult = lresult || (result > array[i]);
+
+ /* Verify the reduction. */
+ for (i = 0; i < n; i++)
+ lvresult = lresult || (result > array[i]);
+
+ if (lresult != lvresult)
+ abort ();
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,126 @@
+/* { dg-do run } */
+
+/* float reductions. */
+
+#include <stdlib.h>
+#include <stdbool.h>
+#include <math.h>
+
+#define vl 32
+
+int
+main(void)
+{
+ const int n = 1000;
+ int i;
+ float vresult, result, array[n];
+ bool lvresult, lresult;
+
+ for (i = 0; i < n; i++)
+ array[i] = i;
+
+ result = 0;
+ vresult = 0;
+
+ /* '+' reductions. */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (+:result)
+ for (i = 0; i < n; i++)
+ result += array[i];
+
+ /* Verify the reduction. */
+ for (i = 0; i < n; i++)
+ vresult += array[i];
+
+ if (result != vresult)
+ abort ();
+
+ result = 0;
+ vresult = 0;
+
+ /* '*' reductions. */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (*:result)
+ for (i = 0; i < n; i++)
+ result *= array[i];
+
+ /* Verify the reduction. */
+ for (i = 0; i < n; i++)
+ vresult *= array[i];
+
+ if (fabs(result - vresult) > .0001)
+ abort ();
+// result = 0;
+// vresult = 0;
+//
+// /* 'max' reductions. */
+// #pragma acc parallel vector_length (vl)
+// #pragma acc loop reduction (+:result)
+// for (i = 0; i < n; i++)
+// result = result > array[i] ? result : array[i];
+//
+// /* Verify the reduction. */
+// for (i = 0; i < n; i++)
+// vresult = vresult > array[i] ? vresult : array[i];
+//
+// printf("%d != %d\n", result, vresult);
+// if (result != vresult)
+// abort ();
+//
+// result = 0;
+// vresult = 0;
+//
+// /* 'min' reductions. */
+// #pragma acc parallel vector_length (vl)
+// #pragma acc loop reduction (+:result)
+// for (i = 0; i < n; i++)
+// result = result < array[i] ? result : array[i];
+//
+// /* Verify the reduction. */
+// for (i = 0; i < n; i++)
+// vresult = vresult < array[i] ? vresult : array[i];
+//
+// printf("%d != %d\n", result, vresult);
+// if (result != vresult)
+// abort ();
+
+ result = 5;
+ vresult = 5;
+
+ lresult = false;
+ lvresult = false;
+
+ /* '&&' reductions. */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (&&:lresult)
+ for (i = 0; i < n; i++)
+ lresult = lresult && (result > array[i]);
+
+ /* Verify the reduction. */
+ for (i = 0; i < n; i++)
+ lvresult = lresult && (result > array[i]);
+
+ if (lresult != lvresult)
+ abort ();
+
+ result = 5;
+ vresult = 5;
+
+ lresult = false;
+ lvresult = false;
+
+ /* '||' reductions. */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (||:lresult)
+ for (i = 0; i < n; i++)
+ lresult = lresult || (result > array[i]);
+
+ /* Verify the reduction. */
+ for (i = 0; i < n; i++)
+ lvresult = lresult || (result > array[i]);
+
+ if (lresult != lvresult)
+ abort ();
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,126 @@
+/* { dg-do run } */
+
+/* double reductions. */
+
+#include <stdlib.h>
+#include <stdbool.h>
+#include <math.h>
+
+#define vl 32
+
+int
+main(void)
+{
+ const int n = 1000;
+ int i;
+ double vresult, result, array[n];
+ bool lvresult, lresult;
+
+ for (i = 0; i < n; i++)
+ array[i] = i;
+
+ result = 0;
+ vresult = 0;
+
+ /* '+' reductions. */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (+:result)
+ for (i = 0; i < n; i++)
+ result += array[i];
+
+ /* Verify the reduction. */
+ for (i = 0; i < n; i++)
+ vresult += array[i];
+
+ if (result != vresult)
+ abort ();
+
+ result = 0;
+ vresult = 0;
+
+ /* '*' reductions. */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (*:result)
+ for (i = 0; i < n; i++)
+ result *= array[i];
+
+ /* Verify the reduction. */
+ for (i = 0; i < n; i++)
+ vresult *= array[i];
+
+ if (fabs(result - vresult) > .0001)
+ abort ();
+// result = 0;
+// vresult = 0;
+//
+// /* 'max' reductions. */
+// #pragma acc parallel vector_length (vl)
+// #pragma acc loop reduction (+:result)
+// for (i = 0; i < n; i++)
+// result = result > array[i] ? result : array[i];
+//
+// /* Verify the reduction. */
+// for (i = 0; i < n; i++)
+// vresult = vresult > array[i] ? vresult : array[i];
+//
+// printf("%d != %d\n", result, vresult);
+// if (result != vresult)
+// abort ();
+//
+// result = 0;
+// vresult = 0;
+//
+// /* 'min' reductions. */
+// #pragma acc parallel vector_length (vl)
+// #pragma acc loop reduction (+:result)
+// for (i = 0; i < n; i++)
+// result = result < array[i] ? result : array[i];
+//
+// /* Verify the reduction. */
+// for (i = 0; i < n; i++)
+// vresult = vresult < array[i] ? vresult : array[i];
+//
+// printf("%d != %d\n", result, vresult);
+// if (result != vresult)
+// abort ();
+
+ result = 5;
+ vresult = 5;
+
+ lresult = false;
+ lvresult = false;
+
+ /* '&&' reductions. */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (&&:lresult)
+ for (i = 0; i < n; i++)
+ lresult = lresult && (result > array[i]);
+
+ /* Verify the reduction. */
+ for (i = 0; i < n; i++)
+ lvresult = lresult && (result > array[i]);
+
+ if (lresult != lvresult)
+ abort ();
+
+ result = 5;
+ vresult = 5;
+
+ lresult = false;
+ lvresult = false;
+
+ /* '||' reductions. */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (||:lresult)
+ for (i = 0; i < n; i++)
+ lresult = lresult || (result > array[i]);
+
+ /* Verify the reduction. */
+ for (i = 0; i < n; i++)
+ lvresult = lresult || (result > array[i]);
+
+ if (lresult != lvresult)
+ abort ();
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,129 @@
+/* { dg-do run } */
+
+/* complex reductions. */
+
+#include <stdlib.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+
+#define vl 32
+
+int
+main(void)
+{
+ const int n = 1000;
+ int i;
+ double complex vresult, result, array[n];
+ bool lvresult, lresult;
+
+ for (i = 0; i < n; i++)
+ array[i] = i;
+
+ result = 0;
+ vresult = 0;
+
+ /* '+' reductions. */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (+:result)
+ for (i = 0; i < n; i++)
+ result += array[i];
+
+ /* Verify the reduction. */
+ for (i = 0; i < n; i++)
+ vresult += array[i];
+
+ if (result != vresult)
+ abort ();
+
+ result = 0;
+ vresult = 0;
+
+ /* Needs support for complex multiplication. */
+
+// /* '*' reductions. */
+// #pragma acc parallel vector_length (vl)
+// #pragma acc loop reduction (*:result)
+// for (i = 0; i < n; i++)
+// result *= array[i];
+//
+// /* Verify the reduction. */
+// for (i = 0; i < n; i++)
+// vresult *= array[i];
+//
+// if (fabs(result - vresult) > .0001)
+// abort ();
+// result = 0;
+// vresult = 0;
+
+// /* 'max' reductions. */
+// #pragma acc parallel vector_length (vl)
+// #pragma acc loop reduction (+:result)
+// for (i = 0; i < n; i++)
+// result = result > array[i] ? result : array[i];
+//
+// /* Verify the reduction. */
+// for (i = 0; i < n; i++)
+// vresult = vresult > array[i] ? vresult : array[i];
+//
+// printf("%d != %d\n", result, vresult);
+// if (result != vresult)
+// abort ();
+//
+// result = 0;
+// vresult = 0;
+//
+// /* 'min' reductions. */
+// #pragma acc parallel vector_length (vl)
+// #pragma acc loop reduction (+:result)
+// for (i = 0; i < n; i++)
+// result = result < array[i] ? result : array[i];
+//
+// /* Verify the reduction. */
+// for (i = 0; i < n; i++)
+// vresult = vresult < array[i] ? vresult : array[i];
+//
+// printf("%d != %d\n", result, vresult);
+// if (result != vresult)
+// abort ();
+
+ result = 5;
+ vresult = 5;
+
+ lresult = false;
+ lvresult = false;
+
+ /* '&&' reductions. */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (&&:lresult)
+ for (i = 0; i < n; i++)
+ lresult = lresult && (creal(result) > creal(array[i]));
+
+ /* Verify the reduction. */
+ for (i = 0; i < n; i++)
+ lvresult = lresult && (creal(result) > creal(array[i]));
+
+ if (lresult != lvresult)
+ abort ();
+
+ result = 5;
+ vresult = 5;
+
+ lresult = false;
+ lvresult = false;
+
+ /* '||' reductions. */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (||:lresult)
+ for (i = 0; i < n; i++)
+ lresult = lresult || (creal(result) > creal(array[i]));
+
+ /* Verify the reduction. */
+ for (i = 0; i < n; i++)
+ lvresult = lresult || (creal(result) > creal(array[i]));
+
+ if (lresult != lvresult)
+ abort ();
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,32 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+int
+main (void)
+{
+ int s1 = 2, s2 = 5, v1 = 2, v2 = 5;
+ int n = 100;
+ int i;
+
+#pragma acc parallel vector_length (1000)
+#pragma acc loop reduction (+:s1, s2)
+ for (i = 0; i < n; i++)
+ {
+ s1 = s1 + 3;
+ s2 = s2 + 2;
+ }
+
+ for (i = 0; i < n; i++)
+ {
+ v1 = v1 + 3;
+ v2 = v2 + 2;
+ }
+
+ if (s1 != v1)
+ abort ();
+
+ if (s2 != v2)
+ abort ();
+
+ return 0;
+}
\ No newline at end of file
@@ -1,6 +1,4 @@
/* { dg-do run } */
-/* TODO:
- { dg-xfail-run-if "" { *-*-* } { "-DACC_DEVICE_TYPE_host=1" } { "" } } */
int
main(void)
new file mode 100644
@@ -0,0 +1,225 @@
+! { dg-do run }
+
+! Integer reductions
+
+program reduction_1
+ implicit none
+
+ integer, parameter :: n = 10, vl = 2
+ integer :: i, vresult, result
+ logical :: lresult, lvresult
+ integer, dimension (n) :: array
+
+ do i = 1, n
+ array(i) = i
+ end do
+
+ result = 0
+ vresult = 0
+
+ ! '+' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(+:result)
+ do i = 1, n
+ result = result + array(i)
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ vresult = vresult + array(i)
+ end do
+
+ if (result.ne.vresult) call abort
+
+ result = 0
+ vresult = 0
+
+ ! '*' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(*:result)
+ do i = 1, n
+ result = result * array(i)
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ vresult = vresult * array(i)
+ end do
+
+ if (result.ne.vresult) call abort
+
+ result = 0
+ vresult = 0
+
+ ! 'max' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(max:result)
+ do i = 1, n
+ result = max (result, array(i))
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ vresult = max (vresult, array(i))
+ end do
+
+ if (result.ne.vresult) call abort
+
+ result = 1
+ vresult = 1
+
+ ! 'min' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(min:result)
+ do i = 1, n
+ result = min (result, array(i))
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ vresult = min (vresult, array(i))
+ end do
+
+ if (result.ne.vresult) call abort
+
+ result = 1
+ vresult = 1
+
+ ! 'iand' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(iand:result)
+ do i = 1, n
+ result = iand (result, array(i))
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ vresult = iand (vresult, array(i))
+ end do
+
+ if (result.ne.vresult) call abort
+
+ result = 1
+ vresult = 1
+
+ ! 'ior' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(ior:result)
+ do i = 1, n
+ result = ior (result, array(i))
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ vresult = ior (vresult, array(i))
+ end do
+
+ if (result.ne.vresult) call abort
+
+ result = 0
+ vresult = 0
+
+ ! 'ieor' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(ieor:result)
+ do i = 1, n
+ result = ieor (result, array(i))
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ vresult = ieor (vresult, array(i))
+ end do
+
+ if (result.ne.vresult) call abort
+
+ lresult = .false.
+ lvresult = .false.
+
+ ! '.and.' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(.and.:lresult)
+ do i = 1, n
+ lresult = lresult .and. (array(i) .ge. 5)
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ lvresult = lvresult .and. (array(i) .ge. 5)
+ end do
+
+ if (result.ne.vresult) call abort
+
+ lresult = .false.
+ lvresult = .false.
+
+ ! '.or.' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(.or.:lresult)
+ do i = 1, n
+ lresult = lresult .or. (array(i) .ge. 5)
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ lvresult = lvresult .or. (array(i) .ge. 5)
+ end do
+
+ if (result.ne.vresult) call abort
+
+ lresult = .false.
+ lvresult = .false.
+
+ ! '.eqv.' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(.eqv.:lresult)
+ do i = 1, n
+ lresult = lresult .eqv. (array(i) .ge. 5)
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ lvresult = lvresult .eqv. (array(i) .ge. 5)
+ end do
+
+ if (result.ne.vresult) call abort
+
+ lresult = .false.
+ lvresult = .false.
+
+ ! '.neqv.' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(.neqv.:lresult)
+ do i = 1, n
+ lresult = lresult .neqv. (array(i) .ge. 5)
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ lvresult = lvresult .neqv. (array(i) .ge. 5)
+ end do
+
+ if (result.ne.vresult) call abort
+end program reduction_1
new file mode 100644
@@ -0,0 +1,170 @@
+! { dg-do run }
+
+! real reductions
+
+program reduction_2
+ implicit none
+
+ integer, parameter :: n = 10, vl = 2
+ integer :: i
+ real, parameter :: e = .001
+ real :: vresult, result
+ logical :: lresult, lvresult
+ real, dimension (n) :: array
+
+ do i = 1, n
+ array(i) = i
+ end do
+
+ result = 0
+ vresult = 0
+
+ ! '+' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(+:result)
+ do i = 1, n
+ result = result + array(i)
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ vresult = vresult + array(i)
+ end do
+
+ if (abs (result - vresult) .ge. e) call abort
+
+ result = 1
+ vresult = 1
+
+ ! '*' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(*:result)
+ do i = 1, n
+ result = result * array(i)
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ vresult = vresult * array(i)
+ end do
+
+ if (result.ne.vresult) call abort
+
+ result = 0
+ vresult = 0
+
+ ! 'max' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(max:result)
+ do i = 1, n
+ result = max (result, array(i))
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ vresult = max (vresult, array(i))
+ end do
+
+ if (result.ne.vresult) call abort
+
+ result = 1
+ vresult = 1
+
+ ! 'min' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(min:result)
+ do i = 1, n
+ result = min (result, array(i))
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ vresult = min (vresult, array(i))
+ end do
+
+ if (result.ne.vresult) call abort
+
+ result = 1
+ vresult = 1
+
+ ! '.and.' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(.and.:lresult)
+ do i = 1, n
+ lresult = lresult .and. (array(i) .ge. 5)
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ lvresult = lvresult .and. (array(i) .ge. 5)
+ end do
+
+ if (result.ne.vresult) call abort
+
+ lresult = .false.
+ lvresult = .false.
+
+ ! '.or.' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(.or.:lresult)
+ do i = 1, n
+ lresult = lresult .or. (array(i) .ge. 5)
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ lvresult = lvresult .or. (array(i) .ge. 5)
+ end do
+
+ if (result.ne.vresult) call abort
+
+ lresult = .false.
+ lvresult = .false.
+
+ ! '.eqv.' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(.eqv.:lresult)
+ do i = 1, n
+ lresult = lresult .eqv. (array(i) .ge. 5)
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ lvresult = lvresult .eqv. (array(i) .ge. 5)
+ end do
+
+ if (result.ne.vresult) call abort
+
+ lresult = .false.
+ lvresult = .false.
+
+ ! '.neqv.' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(.neqv.:lresult)
+ do i = 1, n
+ lresult = lresult .neqv. (array(i) .ge. 5)
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ lvresult = lvresult .neqv. (array(i) .ge. 5)
+ end do
+
+ if (result.ne.vresult) call abort
+end program reduction_2
new file mode 100644
@@ -0,0 +1,170 @@
+! { dg-do run }
+
+! double precision reductions
+
+program reduction_3
+ implicit none
+
+ integer, parameter :: n = 10, vl = 2
+ integer :: i
+ double precision, parameter :: e = .001
+ double precision :: vresult, result
+ logical :: lresult, lvresult
+ double precision, dimension (n) :: array
+
+ do i = 1, n
+ array(i) = i
+ end do
+
+ result = 0
+ vresult = 0
+
+ ! '+' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(+:result)
+ do i = 1, n
+ result = result + array(i)
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ vresult = vresult + array(i)
+ end do
+
+ if (abs (result - vresult) .ge. e) call abort
+
+ result = 1
+ vresult = 1
+
+ ! '*' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(*:result)
+ do i = 1, n
+ result = result * array(i)
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ vresult = vresult * array(i)
+ end do
+
+ if (result.ne.vresult) call abort
+
+ result = 0
+ vresult = 0
+
+ ! 'max' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(max:result)
+ do i = 1, n
+ result = max (result, array(i))
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ vresult = max (vresult, array(i))
+ end do
+
+ if (result.ne.vresult) call abort
+
+ result = 1
+ vresult = 1
+
+ ! 'min' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(min:result)
+ do i = 1, n
+ result = min (result, array(i))
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ vresult = min (vresult, array(i))
+ end do
+
+ if (result.ne.vresult) call abort
+
+ result = 1
+ vresult = 1
+
+ ! '.and.' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(.and.:lresult)
+ do i = 1, n
+ lresult = lresult .and. (array(i) .ge. 5)
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ lvresult = lvresult .and. (array(i) .ge. 5)
+ end do
+
+ if (result.ne.vresult) call abort
+
+ lresult = .false.
+ lvresult = .false.
+
+ ! '.or.' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(.or.:lresult)
+ do i = 1, n
+ lresult = lresult .or. (array(i) .ge. 5)
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ lvresult = lvresult .or. (array(i) .ge. 5)
+ end do
+
+ if (result.ne.vresult) call abort
+
+ lresult = .false.
+ lvresult = .false.
+
+ ! '.eqv.' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(.eqv.:lresult)
+ do i = 1, n
+ lresult = lresult .eqv. (array(i) .ge. 5)
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ lvresult = lvresult .eqv. (array(i) .ge. 5)
+ end do
+
+ if (result.ne.vresult) call abort
+
+ lresult = .false.
+ lvresult = .false.
+
+ ! '.neqv.' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(.neqv.:lresult)
+ do i = 1, n
+ lresult = lresult .neqv. (array(i) .ge. 5)
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ lvresult = lvresult .neqv. (array(i) .ge. 5)
+ end do
+
+ if (result.ne.vresult) call abort
+end program reduction_3
new file mode 100644
@@ -0,0 +1,54 @@
+! { dg-do run }
+
+! complex reductions
+
+program reduction_4
+ implicit none
+
+ integer, parameter :: n = 10, vl = 32
+ integer :: i
+ complex :: vresult, result
+ complex, dimension (n) :: array
+
+ do i = 1, n
+ array(i) = i
+ end do
+
+ result = 0
+ vresult = 0
+
+ ! '+' reductions
+
+ !$acc parallel vector_length(vl) num_gangs(2)
+ !$acc loop reduction(+:result)
+ do i = 1, n
+ result = result + array(i)
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ vresult = vresult + array(i)
+ end do
+
+ if (result .ne. vresult) call abort
+
+ result = 1
+ vresult = 1
+
+! ! '*' reductions
+!
+! !$acc parallel vector_length(vl)
+! !$acc loop reduction(*:result)
+! do i = 1, n
+! result = result * array(i)
+! end do
+! !$acc end parallel
+!
+! ! Verify the results
+! do i = 1, n
+! vresult = vresult * array(i)
+! end do
+!
+! if (result.ne.vresult) call abort
+end program reduction_4
new file mode 100644
@@ -0,0 +1,32 @@
+! { dg-do run }
+
+! subroutine reduction
+
+program reduction
+ integer, parameter :: n = 40, c = 10
+ integer :: i, vsum, sum
+
+ call redsub (sum, n, c)
+
+ vsum = 0
+
+ ! Verify the results
+ do i = 1, n
+ vsum = vsum + c
+ end do
+
+ if (sum.ne.vsum) call abort ()
+end program reduction
+
+subroutine redsub(sum, n, c)
+ integer :: sum, n, c
+
+ sum = 0
+
+ !$acc parallel vector_length(n) copyin (n, c) num_gangs(2)
+ !$acc loop reduction(+:sum)
+ do i = 1, n
+ sum = sum + c
+ end do
+ !$acc end parallel
+end subroutine redsub
new file mode 100644
@@ -0,0 +1,30 @@
+! { dg-do run }
+
+program reduction
+ implicit none
+
+ integer, parameter :: n = 100
+ integer :: i, s1, s2, vs1, vs2
+
+ s1 = 0
+ s2 = 0
+ vs1 = 0
+ vs2 = 0
+
+ !$acc parallel vector_length (1000)
+ !$acc loop reduction(+:s1, s2)
+ do i = 1, n
+ s1 = s1 + 1
+ s2 = s2 + 2
+ end do
+ !$acc end parallel
+
+ ! Verify the results
+ do i = 1, n
+ vs1 = vs1 + 1
+ vs2 = vs2 + 2
+ end do
+
+ if (s1.ne.vs1) call abort ()
+ if (s2.ne.vs2) call abort ()
+end program reduction