2016-01-05 Nathan Sidwell <nathan@acm.org>
gcc/
* config/nvptx/nvptx.c (PTX_GANG_DEFAULT): New.
(nvptx_goacc_validate_dims): Extend to handle global defaults.
* doc/invoke.texi (fopenacc-dim): Document.
* lto-wrapper.c (merge_and_complain): Add OPT_fopenacc_dim_ case.
(append_compiler_options): Likewise.
* omp-low.c (oacc_default_dims, oacc_min_dims): New.
(oacc_parse_default_dims): New.
(oacc_validate_dims): Add USED arg. Emit warnings about strange
partitioning choices. Select non-unity default when possible.
(oacc_loop_fixed_partitions): Return mask of used partitions.
(oacc_loop_auto_partitions): Emit dump info.
(oacc_loop_partition): Return mask of used partitions.
(execute_oacc_device_lower): Parse default dimension arg. Adjust
loop partitioning and validation calls.
gcc/c-family/
* c.opt (fopenacc-dim=): New option.
gcc/fortran/
* lang.opt (fopenacc-dim=): New option.
gcc/testsuite/
* c-c++-common/goacc/parallel-reduction.c: Add -w.
* c-c++-common/goacc/routine-1.c: Add -w.
* c-c++-common/goacc/routine-3.c: Add -w.
* c-c++-common/goacc/routine-6.c: Add -w.
* g++.dg/goacc/template.C: Add -w.
* gfortran.dg/goacc/parallel-tree.f95: Add -w.
* gfortran.dg/goacc/routine-4.f90: Add -w.
libgomp/
* plugin/plugin-nvptx.c (nvptx_exec): Read GOMP_OPENACC_DIM.
* testsuite/libgomp.oacc-c-c++-common/loop-default-compile.c: New.
* testsuite/libgomp.oacc-c-c++-common/loop-default-runtime.c: New.
* testsuite/libgomp.oacc-c-c++-common/loop-default.h: New.
* testsuite/libgomp.oacc-c-c++-common/loop-warn-1.c: New.
* testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Add -w.
* testsuite/libgomp.oacc-c-c++-common/loop-g-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-g-2.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-red-g-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-reduction-gang-np-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-reduction-gv-np-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-reduction-gw-np-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-reduction-vector-p-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-reduction-worker-p-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-w-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/par-reduction-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/par-reduction-2.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/parallel-reduction.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/private-vars-local-gang-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/private-vars-loop-gang-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/private-vars-loop-gang-2.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/private-vars-loop-gang-3.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/private-vars-loop-gang-4.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/private-vars-loop-gang-5.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/private-vars-loop-gang-6.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/private-vars-loop-worker-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/private-vars-par-gang-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/private-vars-par-gang-2.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/private-vars-par-gang-3.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/reduction-5.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/routine-4.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/routine-g-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/routine-w-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/routine-work-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/vec-single-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/vec-single-2.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/vec-single-3.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/vec-single-4.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/vec-single-5.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/vec-single-6.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/worker-partn-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/worker-partn-4.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/worker-partn-5.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/worker-single-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/worker-single-1a.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/worker-single-2.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/worker-single-3.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/worker-single-4.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/worker-single-5.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/worker-single-6.c: Likewise.
* testsuite/libgomp.oacc-fortran/parallel-reduction.f90: Likewise.
* testsuite/libgomp.oacc-fortran/private-vars-loop-gang-1.f90: Likewise.
* testsuite/libgomp.oacc-fortran/private-vars-loop-gang-2.f90: Likewise.
* testsuite/libgomp.oacc-fortran/private-vars-loop-gang-3.f90: Likewise.
* testsuite/libgomp.oacc-fortran/private-vars-loop-gang-6.f90: Likewise.
* testsuite/libgomp.oacc-fortran/private-vars-loop-worker-1.f90: Likewise.
* testsuite/libgomp.oacc-fortran/private-vars-par-gang-2.f90: Likewise.
* testsuite/libgomp.oacc-fortran/reduction-1.f90: Likewise.
* testsuite/libgomp.oacc-fortran/reduction-5.f90: Likewise.
* testsuite/libgomp.oacc-fortran/reduction-6.f90: Likewise.
* testsuite/libgomp.oacc-fortran/routine-7.f90: Likewise.
===================================================================
@@ -1372,6 +1372,10 @@ fopenacc
C ObjC C++ ObjC++ LTO Var(flag_openacc)
Enable OpenACC.
+fopenacc-dim=
+C ObjC C++ ObjC++ LTO Joined Var(flag_openacc_dims)
+Specify default OpenACC compute dimensions.
+
fopenmp
C ObjC C++ ObjC++ LTO Var(flag_openmp)
Enable OpenMP (implies -frecursive in Fortran).
===================================================================
@@ -4125,10 +4125,12 @@ nvptx_expand_builtin (tree exp, rtx targ
/* Define dimension sizes for known hardware. */
#define PTX_VECTOR_LENGTH 32
#define PTX_WORKER_LENGTH 32
+#define PTX_GANG_DEFAULT 32
/* Validate compute dimensions of an OpenACC offload or routine, fill
in non-unity defaults. FN_LEVEL indicates the level at which a
- routine might spawn a loop. It is negative for non-routines. */
+ routine might spawn a loop. It is negative for non-routines. If
+ DECL is null, we are validating the default dimensions. */
static bool
nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
@@ -4136,11 +4138,12 @@ nvptx_goacc_validate_dims (tree decl, in
bool changed = false;
/* The vector size must be 32, unless this is a SEQ routine. */
- if (fn_level <= GOMP_DIM_VECTOR
+ if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
+ && dims[GOMP_DIM_VECTOR] >= 0
&& dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
{
- if (dims[GOMP_DIM_VECTOR] >= 0 && fn_level < 0)
- warning_at (DECL_SOURCE_LOCATION (decl), 0,
+ if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0)
+ warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
dims[GOMP_DIM_VECTOR]
? "using vector_length (%d), ignoring %d"
: "using vector_length (%d), ignoring runtime setting",
@@ -4152,13 +4155,23 @@ nvptx_goacc_validate_dims (tree decl, in
/* Check the num workers is not too large. */
if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
{
- warning_at (DECL_SOURCE_LOCATION (decl), 0,
+ warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
"using num_workers (%d), ignoring %d",
PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
changed = true;
}
+ if (!decl)
+ {
+ dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
+ if (dims[GOMP_DIM_WORKER] < 0)
+ dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
+ if (dims[GOMP_DIM_GANG] < 0)
+ dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT;
+ changed = true;
+ }
+
return changed;
}
===================================================================
@@ -1936,9 +1936,17 @@ Programming Interface v2.0 @w{@uref{http
implies @option{-pthread}, and thus is only supported on targets that
have support for @option{-pthread}.
-Note that this is an experimental feature, incomplete, and subject to
-change in future versions of GCC. See
-@w{@uref{https://gcc.gnu.org/wiki/OpenACC}} for more information.
+@item -fopenacc-dim=@var{geom}
+@opindex fopenacc-dim
+@cindex OpenACC accelerator programming
+Specify default compute dimensions for parallel offload regions that do
+not explicitly specify them. The @var{geom} value is a triple of
+':'-separated sizes, in order 'gang', 'worker' and 'vector'. A size
+can be given as '-' to defer it until execution, or omitted to use a
+target-specific default value. When deferring to runtime, the
+environment variable @env{GOMP_OPENACC_DIM} can be set. It has the
+same format as the option value, except that '-' is not
+permitted.
@item -fopenmp
@opindex fopenmp
===================================================================
@@ -578,6 +578,10 @@ fopenacc
Fortran LTO
; Documented in C
+fopenacc-dim=
+Fortran LTO Joined Var(flag_openacc_dims)
+; Documented in C
+
fopenmp
Fortran LTO
; Documented in C
===================================================================
@@ -286,12 +286,25 @@ merge_and_complain (struct cl_decoded_op
append_option (decoded_options, decoded_options_count, foption);
/* -fmath-errno > -fno-math-errno,
-fsigned-zeros > -fno-signed-zeros,
- -ftrapping-math -> -fno-trapping-math,
+ -ftrapping-math > -fno-trapping-math,
-fwrapv > -fno-wrapv. */
else if (foption->value > (*decoded_options)[j].value)
(*decoded_options)[j] = *foption;
break;
+ case OPT_fopenacc_dim_:
+ /* Append or check identical. */
+ for (j = 0; j < *decoded_options_count; ++j)
+ if ((*decoded_options)[j].opt_index == foption->opt_index)
+ break;
+ if (j == *decoded_options_count)
+ append_option (decoded_options, decoded_options_count, foption);
+ else if (strcmp ((*decoded_options)[j].arg, foption->arg))
+ fatal_error (input_location,
+ "Option %s with different values",
+ foption->orig_option_with_args_text);
+ break;
+
case OPT_freg_struct_return:
case OPT_fpcc_struct_return:
case OPT_fshort_double:
@@ -505,6 +518,7 @@ append_compiler_options (obstack *argv_o
case OPT_fwrapv:
case OPT_fopenmp:
case OPT_fopenacc:
+ case OPT_fopenacc_dim_:
case OPT_ftrapv:
case OPT_fstrict_overflow:
case OPT_foffload_abi_:
===================================================================
@@ -19295,13 +19295,86 @@ oacc_xform_loop (gcall *call)
gsi_replace_with_seq (&gsi, seq, true);
}
+/* Default partitioned and minimum partitioned dimensions. */
+
+static int oacc_default_dims[GOMP_DIM_MAX];
+static int oacc_min_dims[GOMP_DIM_MAX];
+
+/* Parse the default dimension parameter. This is a set of
+ :-separated optional compute dimensions. Each dimension is either
+ a positive integer, or '-' for a dynamic value computed at
+ runtime. When device type support is added, it is a comma
+ separated list of such compute dimensions, with all but the first
+ prefixed by the colon-terminated device type. */
+
+static void
+oacc_parse_default_dims (const char *dims)
+{
+ int ix;
+
+ for (ix = GOMP_DIM_MAX; ix--;)
+ {
+ oacc_default_dims[ix] = -1;
+ oacc_min_dims[ix] = 1;
+ }
+
+#ifndef ACCEL_COMPILER
+ /* Cannot be overridden on the host. */
+ dims = NULL;
+#endif
+ if (dims)
+ {
+ const char *pos = dims;
+
+ for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
+ {
+ if (ix)
+ {
+ if (*pos != ':')
+ goto malformed;
+ pos++;
+ }
+
+ if (*pos != ':')
+ {
+ long val = 0;
+
+ if (*pos == '-')
+ pos++;
+ else
+ {
+ const char *eptr;
+
+ errno = 0;
+ val = strtol (pos, CONST_CAST (char **, &eptr), 10);
+ if (errno || val <= 0 || (unsigned)val != val)
+ goto malformed;
+ pos = eptr;
+ }
+ oacc_default_dims[ix] = (int)val;
+ }
+ }
+ if (*pos)
+ {
+ malformed:
+ error_at (UNKNOWN_LOCATION,
+ "-fopenacc-dim operand is malformed at '%s'", pos);
+ }
+ }
+
+ /* Allow the backend to validate the dimensions. */
+ targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1);
+ targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2);
+}
+
/* Validate and update the dimensions for offloaded FN. ATTRS is the
raw attribute. DIMS is an array of dimensions, which is filled in.
LEVEL is the partitioning level of a routine, or -1 for an offload
- region itself. */
+ region itself. USED is the mask of partitioned execution in the
+ function. */
static void
-oacc_validate_dims (tree fn, tree attrs, int *dims, int level)
+oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
{
tree purpose[GOMP_DIM_MAX];
unsigned ix;
@@ -19320,13 +19393,61 @@ oacc_validate_dims (tree fn, tree attrs,
pos = TREE_CHAIN (pos);
}
+ bool check = true;
+#ifdef ACCEL_COMPILER
+ /* When device_type is implemented, we should also check on the
+ target, if device_type has been used to affect the partitioning
+ and/or dimensions. */
+ check = false;
+#endif
+ if (!is_kernel && check)
+ {
+ static char const *const axes[] =
+ /* Must be kept in sync with GOMP_DIM enumeration. */
+ {"gang", "worker", "vector" };
+ for (ix = level >= 0 ? level : 0; ix != GOMP_DIM_MAX; ix++)
+ if (dims[ix] < 0)
+ ; /* Defaulting axis. */
+ else if ((used & GOMP_DIM_MASK (ix)) && dims[ix] == 1)
+ /* There is partitioned execution, but the user requested a
+ dimension size of 1. They're probably confused. */
+ warning_at (DECL_SOURCE_LOCATION (fn), 0,
+ "region contains %s partitoned code but"
+ " is not %s partitioned", axes[ix], axes[ix]);
+ else if (!(used & GOMP_DIM_MASK (ix)) && dims[ix] != 1)
+ /* The dimension is explicitly partitioned to non-unity, but
+ no use is made within the region. */
+ warning_at (DECL_SOURCE_LOCATION (fn), 0,
+ "region is %s partitioned but"
+ " does not contain %s partitioned code",
+ axes[ix], axes[ix]);
+ }
+
bool changed = targetm.goacc.validate_dims (fn, dims, level);
- /* Default anything left to 1. */
+ /* Default anything left to 1 or a partitioned default. */
for (ix = 0; ix != GOMP_DIM_MAX; ix++)
if (dims[ix] < 0)
{
- dims[ix] = 1;
+ /* The OpenACC spec says 'If the [num_gangs] clause is not
+ specified, an implementation-defined default will be used;
+ the default may depend on the code within the construct.'
+ (2.5.6). Thus an implementation is free to choose
+ non-unity default for a parallel region that doesn't have
+ any gang-partitioned loops. However, it appears that there
+ is a sufficient body of user code that expects non-gang
+ partitioned regions to not execute in gang-redundant mode.
+ So we (a) don't warn about the non-portability and (b) pick
+ the minimum permissible dimension size when there is no
+ partitioned execution. Otherwise we pick the global
+ default for the dimension, which the user can control. The
+ same wording and logic applies to num_workers and
+ vector_length, however the worker- or vector- single
+ execution doesn't have the same impact as gang-redundant
+ execution. (If the minimum gang-level partitioning is not 1,
+ the target is probably too confusing.) */
+ dims[ix] = (!is_kernel && (used & (GOMP_DIM_MASK (ix)))
+ ? oacc_default_dims[ix] : oacc_min_dims[ix]);
changed = true;
}
@@ -19777,14 +19898,15 @@ oacc_loop_process (oacc_loop *loop)
/* Walk the OpenACC loop heirarchy checking and assigning the
programmer-specified partitionings. OUTER_MASK is the partitioning
- this loop is contained within. Return true if we contain an
- auto-partitionable loop. */
+ this loop is contained within. Return mask of partitioning
+ encountered. If any auto loops are discovered, set GOMP_DIM_MAX
+ bit. */
-static bool
+static unsigned
oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
{
unsigned this_mask = loop->mask;
- bool has_auto = false;
+ unsigned mask_all = 0;
bool noisy = true;
#ifdef ACCEL_COMPILER
@@ -19818,7 +19940,7 @@ oacc_loop_fixed_partitions (oacc_loop *l
}
}
if (auto_par && (loop->flags & OLF_INDEPENDENT))
- has_auto = true;
+ mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
}
if (this_mask & outer_mask)
@@ -19872,16 +19994,16 @@ oacc_loop_fixed_partitions (oacc_loop *l
}
loop->mask = this_mask;
+ mask_all |= this_mask;
+
+ if (loop->child)
+ mask_all |= oacc_loop_fixed_partitions (loop->child,
+ outer_mask | this_mask);
- if (loop->child
- && oacc_loop_fixed_partitions (loop->child, outer_mask | this_mask))
- has_auto = true;
-
- if (loop->sibling
- && oacc_loop_fixed_partitions (loop->sibling, outer_mask))
- has_auto = true;
+ if (loop->sibling)
+ mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);
- return has_auto;
+ return mask_all;
}
/* Walk the OpenACC loop heirarchy to assign auto-partitioned loops.
@@ -19923,6 +20045,11 @@ oacc_loop_auto_partitions (oacc_loop *lo
warning_at (loop->loc, 0,
"insufficient partitioning available to parallelize loop");
+ if (dump_file)
+ fprintf (dump_file, "Auto loop %s:%d assigned %d\n",
+ LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
+ this_mask);
+
loop->mask = this_mask;
}
inner_mask |= loop->mask;
@@ -19934,13 +20061,19 @@ oacc_loop_auto_partitions (oacc_loop *lo
}
/* Walk the OpenACC loop heirarchy to check and assign partitioning
- axes. */
+ axes. Return mask of partitioning. */
-static void
+static unsigned
oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
{
- if (oacc_loop_fixed_partitions (loop, outer_mask))
- oacc_loop_auto_partitions (loop, outer_mask);
+ unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
+
+ if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
+ {
+ mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
+ mask_all |= oacc_loop_auto_partitions (loop, outer_mask);
+ }
+ return mask_all;
}
/* Default fork/join early expander. Delete the function calls if
@@ -20032,8 +20165,8 @@ maybe_discard_oacc_function (tree decl)
static unsigned int
execute_oacc_device_lower ()
{
- tree attr = get_oacc_fn_attrib (current_function_decl);
- if (!attr)
+ tree attrs = get_oacc_fn_attrib (current_function_decl);
+ if (!attrs)
/* Not an offloaded function. */
return 0;
@@ -20045,21 +20178,28 @@ execute_oacc_device_lower ()
return TODO_discard_function;
}
+ /* Parse the default dim argument exactly once. */
+ if ((const void *)flag_openacc_dims != &flag_openacc_dims)
+ {
+ oacc_parse_default_dims (flag_openacc_dims);
+ flag_openacc_dims = (char *)&flag_openacc_dims;
+ }
+
/* Discover, partition and process the loops. */
oacc_loop *loops = oacc_loop_discovery ();
- int fn_level = oacc_fn_attrib_level (attr);
+ int fn_level = oacc_fn_attrib_level (attrs);
if (dump_file)
- fprintf (dump_file, oacc_fn_attrib_kernels_p (attr)
+ fprintf (dump_file, oacc_fn_attrib_kernels_p (attrs)
? "Function is kernels offload\n"
: fn_level < 0 ? "Function is parallel offload\n"
: "Function is routine level %d\n", fn_level);
unsigned outer_mask = fn_level >= 0 ? GOMP_DIM_MASK (fn_level) - 1 : 0;
- oacc_loop_partition (loops, outer_mask);
-
+ unsigned used_mask = oacc_loop_partition (loops, outer_mask);
int dims[GOMP_DIM_MAX];
- oacc_validate_dims (current_function_decl, attr, dims, fn_level);
+
+ oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);
if (dump_file)
{
===================================================================
@@ -6,7 +6,7 @@ main ()
#pragma acc data copy (dummy)
{
-#pragma acc parallel num_gangs (10) copy (sum) reduction (+:sum)
+#pragma acc parallel num_gangs (10) copy (sum) reduction (+:sum) /* { dg-warning "gang partitioned" } */
{
int v = 5;
sum += 10 + v;
===================================================================
@@ -1,17 +1,17 @@
/* Test valid use of clauses with routine. */
#pragma acc routine gang
-void gang (void)
+void gang (void) /* { dg-warning "partitioned" 3 } */
{
}
#pragma acc routine worker
-void worker (void)
+void worker (void) /* { dg-warning "partitioned" 2 } */
{
}
#pragma acc routine vector
-void vector (void)
+void vector (void) /* { dg-warning "partitioned" 1 } */
{
}
===================================================================
@@ -1,16 +1,28 @@
#pragma acc routine gang
void gang (void) /* { dg-message "declared here" 3 } */
{
+ #pragma acc loop gang worker vector
+ for (int i = 0; i < 10; i++)
+ {
+ }
}
#pragma acc routine worker
void worker (void) /* { dg-message "declared here" 2 } */
{
+ #pragma acc loop worker vector
+ for (int i = 0; i < 10; i++)
+ {
+ }
}
#pragma acc routine vector
void vector (void) /* { dg-message "declared here" 1 } */
{
+ #pragma acc loop vector
+ for (int i = 0; i < 10; i++)
+ {
+ }
}
#pragma acc routine seq
===================================================================
@@ -5,6 +5,11 @@
int
gang () /* { dg-message "declared here" 3 } */
{
+ #pragma acc loop gang worker vector
+ for (int i = 0; i < 10; i++)
+ {
+ }
+
return 1;
}
@@ -12,6 +17,11 @@ gang () /* { dg-message "declared here"
int
worker () /* { dg-message "declared here" 2 } */
{
+ #pragma acc loop worker vector
+ for (int i = 0; i < 10; i++)
+ {
+ }
+
return 1;
}
@@ -19,6 +29,11 @@ worker () /* { dg-message "declared here
int
vector () /* { dg-message "declared here" } */
{
+ #pragma acc loop vector
+ for (int i = 0; i < 10; i++)
+ {
+ }
+
return 1;
}
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc -w" } */
+
#pragma acc routine
template <typename T> T
accDouble(int val)
===================================================================
@@ -1,5 +1,5 @@
! { dg-do compile }
-! { dg-additional-options "-fdump-tree-original" }
+! { dg-additional-options "-fdump-tree-original -w" }
! test for tree-dump-original and spaces-commas
===================================================================
@@ -123,6 +123,7 @@ contains
integer, intent (inout) :: a(N)
integer :: i
+ !$acc loop gang worker vector
do i = 1, N
a(i) = a(i) - a(i)
end do
@@ -133,6 +134,7 @@ contains
integer, intent (inout) :: a(N)
integer :: i
+ !$acc loop worker vector
do i = 1, N
a(i) = a(i) - a(i)
end do
@@ -143,6 +145,7 @@ contains
integer, intent (inout) :: a(N)
integer :: i
+ !$acc loop vector
do i = 1, N
a(i) = a(i) - a(i)
end do
@@ -153,6 +156,7 @@ contains
integer, intent (inout) :: a(N)
integer :: i
+ !$acc loop seq
do i = 1, N
a(i) = a(i) - a(i)
end do
===================================================================
@@ -45,6 +45,7 @@
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
+#include <errno.h>
static const char *
cuda_error (CUresult r)
@@ -897,9 +898,68 @@ nvptx_exec (void (*fn), size_t mapnum, v
/* Initialize the launch dimensions. Typically this is constant,
provided by the device compiler, but we must permit runtime
values. */
+ int seen_zero = 0;
for (i = 0; i != GOMP_DIM_MAX; i++)
- if (targ_fn->launch->dim[i])
- dims[i] = targ_fn->launch->dim[i];
+ {
+ if (targ_fn->launch->dim[i])
+ dims[i] = targ_fn->launch->dim[i];
+ if (!dims[i])
+ seen_zero = 1;
+ }
+
+ if (seen_zero)
+ {
+ /* See if the user provided GOMP_OPENACC_DIM environment
+ variable to specify runtime defaults. */
+ static int default_dims[GOMP_DIM_MAX];
+
+ if (!default_dims[0])
+ {
+ /* We only read the environment variable once. You can't
+ change it in the middle of execution. The syntax is
+ the same as for the -fopenacc-dim compilation option. */
+ const char *env_var = getenv ("GOMP_OPENACC_DIM");
+ if (env_var)
+ {
+ const char *pos = env_var;
+
+ for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
+ {
+ if (i && *pos++ != ':')
+ break;
+ if (*pos != ':')
+ {
+ const char *eptr;
+
+ errno = 0;
+ long val = strtol (pos, (char **)&eptr, 10);
+ if (errno || val < 0 || (unsigned)val != val)
+ break;
+ default_dims[i] = (int)val;
+ pos = eptr;
+ }
+ }
+ }
+
+ /* Do some sanity checking. The CUDA API doesn't appear to
+ provide queries to determine these limits. */
+ if (default_dims[GOMP_DIM_GANG] < 1)
+ default_dims[GOMP_DIM_GANG] = 32;
+ if (default_dims[GOMP_DIM_WORKER] < 1
+ || default_dims[GOMP_DIM_WORKER] > 32)
+ default_dims[GOMP_DIM_WORKER] = 32;
+ default_dims[GOMP_DIM_VECTOR] = 32;
+
+ GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
+ default_dims[GOMP_DIM_GANG],
+ default_dims[GOMP_DIM_WORKER],
+ default_dims[GOMP_DIM_VECTOR]);
+ }
+
+ for (i = 0; i != GOMP_DIM_MAX; i++)
+ if (!dims[i])
+ dims[i] = default_dims[i];
+ }
/* This reserves a chunk of a pre-allocated page of memory mapped on both
the host and the device. HP is a host pointer to the new chunk, and DP is
@@ -920,8 +980,8 @@ nvptx_exec (void (*fn), size_t mapnum, v
GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
" gangs=%u, workers=%u, vectors=%u\n",
- __FUNCTION__, targ_fn->launch->fn,
- dims[0], dims[1], dims[2]);
+ __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
+ dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
// OpenACC CUDA
//
===================================================================
@@ -1,5 +1,5 @@
/* { dg-do run } */
-/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-O2 -w" } */
#include <stdio.h>
#include <openacc.h>
===================================================================
@@ -0,0 +1,13 @@
+
+/* { dg-additional-options "-O2 -fopenacc-dim=16:16" } */
+
+#include "loop-default.h"
+#include <stdlib.h>
+
+int main ()
+{
+ /* Environment should be ignored. */
+ setenv ("GOMP_OPENACC_DIM", "8:8", 1);
+
+ return test_1 (16, 16, 32);
+}
===================================================================
@@ -0,0 +1,12 @@
+
+/* { dg-additional-options "-O2 -fopenacc-dim=-:-" } */
+
+#include "loop-default.h"
+#include <stdlib.h>
+
+int main ()
+{
+ setenv ("GOMP_OPENACC_DIM", "8:16", 1);
+
+ return test_1 (8, 16, 32);
+}
===================================================================
@@ -0,0 +1,125 @@
+#include <openacc.h>
+#include <alloca.h>
+#include <string.h>
+#include <stdio.h>
+
+#pragma acc routine
+static int __attribute__ ((noinline)) coord ()
+{
+ int res = 0;
+
+ if (acc_on_device (acc_device_nvidia))
+ {
+ int g = 0, w = 0, v = 0;
+
+ __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
+ __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
+ __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+ res = (1 << 24) | (g << 16) | (w << 8) | v;
+ }
+ return res;
+}
+
+
+int check (const int *ary, int size, int gp, int wp, int vp)
+{
+ int exit = 0;
+ int ix;
+ int *gangs = (int *)alloca (gp * sizeof (int));
+ int *workers = (int *)alloca (wp * sizeof (int));
+ int *vectors = (int *)alloca (vp * sizeof (int));
+ int offloaded = 0;
+
+ memset (gangs, 0, gp * sizeof (int));
+ memset (workers, 0, wp * sizeof (int));
+ memset (vectors, 0, vp * sizeof (int));
+
+ for (ix = 0; ix < size; ix++)
+ {
+ int g = (ary[ix] >> 16) & 0xff;
+ int w = (ary[ix] >> 8) & 0xff;
+ int v = (ary[ix] >> 0) & 0xff;
+
+ if (g >= gp || w >= wp || v >= vp)
+ {
+ printf ("unexpected cpu %#x used\n", ary[ix]);
+ exit = 1;
+ }
+ else
+ {
+ vectors[v]++;
+ workers[w]++;
+ gangs[g]++;
+ }
+ offloaded += ary[ix] >> 24;
+ }
+
+ if (!offloaded)
+ return 0;
+
+ if (offloaded != size)
+ {
+ printf ("offloaded %d times, expected %d\n", offloaded, size);
+ return 1;
+ }
+
+ for (ix = 0; ix < gp; ix++)
+ if (gangs[ix] != gangs[0])
+ {
+ printf ("gang %d not used %d times\n", ix, gangs[0]);
+ exit = 1;
+ }
+
+ for (ix = 0; ix < wp; ix++)
+ if (workers[ix] != workers[0])
+ {
+ printf ("worker %d not used %d times\n", ix, workers[0]);
+ exit = 1;
+ }
+
+ for (ix = 0; ix < vp; ix++)
+ if (vectors[ix] != vectors[0])
+ {
+ printf ("vector %d not used %d times\n", ix, vectors[0]);
+ exit = 1;
+ }
+
+ return exit;
+}
+
+#define N (32 *32*32)
+
+int test_1 (int gp, int wp, int vp)
+{
+ int ary[N];
+ int exit = 0;
+
+#pragma acc parallel copyout (ary)
+ {
+#pragma acc loop gang (static:1)
+ for (int ix = 0; ix < N; ix++)
+ ary[ix] = coord ();
+ }
+
+ exit |= check (ary, N, gp, 1, 1);
+
+#pragma acc parallel copyout (ary)
+ {
+#pragma acc loop worker
+ for (int ix = 0; ix < N; ix++)
+ ary[ix] = coord ();
+ }
+
+ exit |= check (ary, N, 1, wp, 1);
+
+#pragma acc parallel copyout (ary)
+ {
+#pragma acc loop vector
+ for (int ix = 0; ix < N; ix++)
+ ary[ix] = coord ();
+ }
+
+ exit |= check (ary, N, 1, 1, vp);
+
+ return exit;
+}
===================================================================
@@ -1,5 +1,5 @@
/* { dg-do run } */
-/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-O2 -w" } */
#include <stdio.h>
===================================================================
@@ -1,5 +1,5 @@
/* { dg-do run } */
-/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-O2 -w" } */
#include <stdio.h>
===================================================================
@@ -1,5 +1,5 @@
/* { dg-do run } */
-/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-O2 -w" } */
#include <stdio.h>
===================================================================
@@ -1,5 +1,5 @@
/* { dg-do run } */
-/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-O2 -w" } */
#include <stdio.h>
===================================================================
@@ -1,5 +1,5 @@
/* { dg-do run } */
-/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-O2 -w" } */
#include <stdio.h>
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test of reduction on loop directive (gangs, non-private reduction
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test of reduction on loop directive (gangs and vectors, non-private
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test of reduction on loop directive (gangs and workers, non-private
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test of reduction on loop directive (vectors, private reduction
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test of reduction on loop directive (workers, private reduction
===================================================================
@@ -1,5 +1,5 @@
/* { dg-do run } */
-/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-O2 -w" } */
#include <stdio.h>
===================================================================
@@ -0,0 +1,37 @@
+
+/* Check warnings about suboptimal partitioning choices. */
+
+int main ()
+{
+ int ary[10];
+
+#pragma acc parallel copy(ary) num_gangs (1) /* { dg-warning "is not gang partitioned" } */
+ {
+ #pragma acc loop gang
+ for (int i = 0; i < 10; i++)
+ ary[i] = i;
+ }
+
+#pragma acc parallel copy(ary) num_workers (1) /* { dg-warning "is not worker partitioned" } */
+ {
+ #pragma acc loop worker
+ for (int i = 0; i < 10; i++)
+ ary[i] = i;
+ }
+
+#pragma acc parallel copy(ary) num_gangs (8) /* { dg-warning "is gang partitioned" } */
+ {
+ #pragma acc loop worker
+ for (int i = 0; i < 10; i++)
+ ary[i] = i;
+ }
+
+#pragma acc parallel copy(ary) num_workers (8) /* { dg-warning "is worker partitioned" } */
+ {
+ #pragma acc loop gang
+ for (int i = 0; i < 10; i++)
+ ary[i] = i;
+ }
+
+ return 0;
+}
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test of reduction on parallel directive. */
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
#include <openacc.h>
===================================================================
@@ -5,12 +5,20 @@
int main ()
{
+ int dummy[10];
+
#pragma acc parallel num_workers (2<<20) /* { dg-error "using num_workers" } */
{
+#pragma acc loop worker
+ for (int i = 0; i < 10; i++)
+ dummy[i] = i;
}
#pragma acc parallel vector_length (2<<20) /* { dg-error "using vector_length" } */
{
+#pragma acc loop vector
+ for (int i = 0; i < 10; i++)
+ dummy[i] = i;
}
return 0;
===================================================================
@@ -1,4 +1,5 @@
/* { dg-do run } */
+/* { dg-additional-options "-w" } */
#include <stdlib.h>
#include <openacc.h>
===================================================================
@@ -1,13 +1,12 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
+#include <openacc.h>
/* Test of gang-private variables declared in local scope with parallel
directive. */
-#if defined(ACC_DEVICE_TYPE_host)
-#define ACTUAL_GANGS 1
-#else
#define ACTUAL_GANGS 32
-#endif
int
main (int argc, char* argv[])
@@ -28,7 +27,11 @@ main (int argc, char* argv[])
#pragma acc loop gang(static:1)
for (i = 0; i < ACTUAL_GANGS; i++)
- arr[i] += x;
+ {
+ if (acc_on_device (acc_device_host))
+ x = i * 2;
+ arr[i] += x;
+ }
}
for (i = 0; i < ACTUAL_GANGS; i++)
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test of gang-private variables declared on loop directive. */
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test of gang-private variables declared on loop directive, with broadcasting
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test of gang-private variables declared on loop directive, with broadcasting
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test of gang-private addressable variable declared on loop directive, with
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test of gang-private array variable declared on loop directive, with
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test of gang-private aggregate variable declared on loop directive, with
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test of worker-private variables declared on a loop directive. */
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Basic test of firstprivate variable. */
===================================================================
@@ -1,12 +1,11 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
+#include <openacc.h>
/* Test of gang-private variables declared on the parallel directive. */
-#if defined(ACC_DEVICE_TYPE_host)
-#define ACTUAL_GANGS 1
-#else
#define ACTUAL_GANGS 32
-#endif
int
main (int argc, char* argv[])
@@ -25,7 +24,11 @@ main (int argc, char* argv[])
#pragma acc loop gang(static:1)
for (i = 0; i < ACTUAL_GANGS; i++)
- arr[i] += x;
+ {
+ if (acc_on_device (acc_device_host))
+ x = i * 2;
+ arr[i] += x;
+ }
}
for (i = 0; i < ACTUAL_GANGS; i++)
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test of gang-private array variable declared on the parallel directive. */
===================================================================
@@ -1,4 +1,5 @@
/* { dg-do run } */
+/* { dg-additional-options "-w" } */
/* Ignore vector_length warnings for offloaded (nvptx) targets. */
/* { dg-additional-options "-foffload=-w" } */
===================================================================
@@ -1,5 +1,5 @@
-
/* { dg-do run } */
+/* { dg-additional-options "-w" } */
#include <stdlib.h>
===================================================================
@@ -1,5 +1,5 @@
/* { dg-do run } */
-/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-O2 -w" } */
#include <stdio.h>
===================================================================
@@ -1,5 +1,5 @@
/* { dg-do run } */
-/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-O2 -w" } */
#include <stdio.h>
===================================================================
@@ -1,5 +1,5 @@
/* { dg-do run } */
-/* { dg-additional-options "-O1" } */
+/* { dg-additional-options "-O1 -w" } */
#include <stdio.h>
#include <openacc.h>
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test trivial operation of vector-single mode. */
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test vector-single, gang-partitioned mode. */
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test conditions in vector-single mode. */
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test switch in vector-single mode. */
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test switch in vector-single mode, initialise array on device. */
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
#include <stdbool.h>
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test worker-partitioned/vector-single mode. */
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test worker-single/worker-partitioned transitions. */
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test correct synchronisation between worker-partitioned loops. */
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test worker-single/vector-single mode. */
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test worker-single/vector-single mode. */
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test condition in worker-single/vector-single mode. */
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test switch in worker-single/vector-single mode. */
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test worker-single/vector-partitioned mode. */
===================================================================
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
/* Test multiple conditional vector-partitioned loops in worker-single
===================================================================
@@ -1,10 +1,9 @@
+/* { dg-additional-options "-w" } */
+
#include <assert.h>
+#include <openacc.h>
-#if defined(ACC_DEVICE_TYPE_host)
-#define ACTUAL_GANGS 1
-#else
#define ACTUAL_GANGS 8
-#endif
/* Test worker-single, vector-partitioned, gang-redundant mode. */
@@ -12,6 +11,7 @@ int
main (int argc, char *argv[])
{
int n, arr[32], i;
+ int ondev;
for (i = 0; i < 32; i++)
arr[i] = 0;
@@ -19,10 +19,12 @@ main (int argc, char *argv[])
n = 0;
#pragma acc parallel copy(n, arr) num_gangs(ACTUAL_GANGS) num_workers(8) \
- vector_length(32)
+ vector_length(32) copyout(ondev)
{
int j;
+ ondev = acc_on_device (acc_device_not_host);
+
#pragma acc atomic
n++;
@@ -37,10 +39,12 @@ main (int argc, char *argv[])
n++;
}
- assert (n == ACTUAL_GANGS * 2);
+ int m = ondev ? ACTUAL_GANGS : 1;
+
+ assert (n == m * 2);
for (i = 0; i < 32; i++)
- assert (arr[i] == ACTUAL_GANGS);
+ assert (arr[i] == m);
return 0;
}
===================================================================
@@ -1,4 +1,5 @@
! { dg-do run }
+! { dg-additional-options "-w" }
program reduction
integer, parameter :: n = 10
===================================================================
@@ -1,3 +1,5 @@
+! { dg-additional-options "-w" }
+
! Test of gang-private variables declared on loop directive.
program main
===================================================================
@@ -1,3 +1,5 @@
+! { dg-additional-options "-cpp -w" }
+
! Test of gang-private variables declared on loop directive, with broadcasting
! to partitioned workers.
===================================================================
@@ -1,3 +1,5 @@
+! { dg-additional-options "-w" }
+
! Test of gang-private variables declared on loop directive, with broadcasting
! to partitioned vectors.
===================================================================
@@ -1,3 +1,5 @@
+! { dg-additional-options "-w" }
+
! Test of gang-private addressable variable declared on loop directive, with
! broadcasting to partitioned workers.
===================================================================
@@ -1,3 +1,5 @@
+! { dg-additional-options "-cpp -w" }
+
! Test of worker-private variables declared on a loop directive.
program main
===================================================================
@@ -1,16 +1,11 @@
-! { dg-additional-options "-cpp" }
+! { dg-additional-options "-w" }
! Test of gang-private variables declared on the parallel directive.
-#if defined(ACC_DEVICE_TYPE_host)
-#define ACTUAL_GANGS 1
-#else
-#define ACTUAL_GANGS 32
-#endif
-
program main
+ use openacc
integer :: x = 5
- integer, parameter :: n = ACTUAL_GANGS
+ integer, parameter :: n = 32
integer :: arr(n)
do i = 1, n
@@ -25,6 +20,7 @@ program main
!$acc loop gang(static:1)
do i = 1, n
+ if (acc_on_device (acc_device_host) .eqv. .TRUE.) x = i * 2
arr(i) = arr(i) + x
end do
!$acc end parallel
===================================================================
@@ -1,4 +1,5 @@
! { dg-do run }
+! { dg-additional-options "-w" }
! Integer reductions
===================================================================
@@ -1,4 +1,5 @@
! { dg-do run }
+! { dg-additional-options "-w" }
! subroutine reduction
===================================================================
@@ -1,4 +1,5 @@
! { dg-do run }
+! { dg-additional-options "-cpp -w" }
program reduction
implicit none
===================================================================
@@ -1,6 +1,5 @@
-
! { dg-do run }
-! { dg-additional-options "-cpp" }
+! { dg-additional-options "-cpp -w" }
#define M 8
#define N 32
@@ -41,7 +40,7 @@ program main
end do
!$acc parallel copy (b)
- !$acc loop
+ !$acc loop seq
do i = 1, N
call worker (b)
end do
@@ -56,7 +55,7 @@ program main
end do
!$acc parallel copy (a)
- !$acc loop
+ !$acc loop seq
do i = 1, N
call vector (a)
end do