@@ -7071,7 +7071,9 @@ static void
expand_omp_ordered_source (gimple_stmt_iterator *gsi, struct omp_for_data *fd,
tree *counts, location_t loc)
{
- enum built_in_function source_ix = BUILT_IN_GOMP_DOACROSS_POST;
+ enum built_in_function source_ix
+ = fd->iter_type == long_integer_type_node
+ ? BUILT_IN_GOMP_DOACROSS_POST : BUILT_IN_GOMP_DOACROSS_ULL_POST;
gimple g
= gimple_build_call (builtin_decl_explicit (source_ix), 1,
build_fold_addr_expr (counts[fd->ordered]));
@@ -7086,7 +7088,9 @@ expand_omp_ordered_sink (gimple_stmt_ite
tree *counts, tree c, location_t loc)
{
auto_vec<tree, 10> args;
- enum built_in_function sink_ix = BUILT_IN_GOMP_DOACROSS_WAIT;
+ enum built_in_function sink_ix
+ = fd->iter_type == long_integer_type_node
+ ? BUILT_IN_GOMP_DOACROSS_WAIT : BUILT_IN_GOMP_DOACROSS_ULL_WAIT;
tree t, off, coff = NULL_TREE, deps = OMP_CLAUSE_DECL (c), cond = NULL_TREE;
int i;
gimple_stmt_iterator gsi2 = *gsi;
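For orientation (not part of the patch), the construct these two hunks target looks like the following: a doacross loop whose iterator is unsigned long long, so fd->iter_type is not long_integer_type_node and the _ULL post/wait builtins are selected. A minimal sketch; the function name is illustrative.

void
scan (int *a, unsigned long long n)
{
  unsigned long long i;
  #pragma omp parallel
  {
    #pragma omp for ordered(1) schedule(static, 1)
    for (i = 1; i < n; i++)
      {
        #pragma omp ordered depend(sink: i - 1)  /* expands to GOMP_doacross_ull_wait */
        a[i] += a[i - 1];
        #pragma omp ordered depend(source)       /* expands to GOMP_doacross_ull_post */
      }
  }
}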
@@ -7625,11 +7629,11 @@ expand_omp_for_generic (struct omp_regio
gsi_prev (&gsi);
e = split_block (entry_bb, gsi_stmt (gsi));
entry_bb = e->dest;
- make_edge (zero_iter1_bb, entry_bb, EDGE_FALLTHRU);
+ make_edge (zero_iter2_bb, entry_bb, EDGE_FALLTHRU);
gsi = gsi_last_bb (entry_bb);
set_immediate_dominator (CDI_DOMINATORS, entry_bb,
get_immediate_dominator
- (CDI_DOMINATORS, zero_iter1_bb));
+ (CDI_DOMINATORS, zero_iter2_bb));
}
}
if (fd->collapse == 1)
@@ -7762,7 +7766,7 @@ expand_omp_for_generic (struct omp_regio
t0 = fold_build2 (PLUS_EXPR, fd->iter_type, t0, bias);
}
}
- if (fd->iter_type == long_integer_type_node)
+ if (fd->iter_type == long_integer_type_node || fd->ordered)
{
if (fd->chunk_size)
{
@@ -7801,14 +7805,8 @@ expand_omp_for_generic (struct omp_regio
tree bfn_decl = builtin_decl_explicit (start_fn);
t = fold_convert (fd->iter_type, fd->chunk_size);
t = omp_adjust_chunk_size (t, fd->simd_schedule);
- if (fd->ordered)
- t = build_call_expr (bfn_decl, 6, t5, t0, t1, t, t3, t4);
- else
- t = build_call_expr (bfn_decl, 7, t5, t0, t1, t2, t, t3, t4);
+ t = build_call_expr (bfn_decl, 7, t5, t0, t1, t2, t, t3, t4);
}
- else if (fd->ordered)
- t = build_call_expr (builtin_decl_explicit (start_fn),
- 5, t5, t0, t1, t3, t4);
else
t = build_call_expr (builtin_decl_explicit (start_fn),
6, t5, t0, t1, t2, t3, t4);
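A note on the hunk above: the doacross start entry points take a dimension count and a counts array, with the same layout for long and unsigned long long iterators, and take neither the leading up flag (t5) nor the step (t2), so the ordered case can be handled in the first branch for both widths and dropped from the ull-only branch. For comparison, the two shapes, matching the libgomp declarations (the doacross one is added to libgomp_g.h later in this patch):

#include <stdbool.h>

/* Plain ull worksharing loop: up flag, bounds, step, chunk; the 7-argument
   call with t5 that remains in the else branch.  */
bool GOMP_loop_ull_static_start (bool up, unsigned long long start,
                                 unsigned long long end,
                                 unsigned long long incr,
                                 unsigned long long chunk_size,
                                 unsigned long long *istart,
                                 unsigned long long *iend);

/* Doacross ull loop: dimension count plus counts array, no up flag or step,
   matching the argument layout of the long doacross variant.  */
bool GOMP_loop_ull_doacross_static_start (unsigned ncounts,
                                          unsigned long long *counts,
                                          unsigned long long chunk_size,
                                          unsigned long long *istart,
                                          unsigned long long *iend);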
@@ -197,6 +197,22 @@ DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL
"GOMP_loop_ull_ordered_runtime_start",
BT_FN_BOOL_BOOL_ULL_ULL_ULL_ULLPTR_ULLPTR,
ATTR_NOTHROW_LEAF_LIST)
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_DOACROSS_STATIC_START,
+ "GOMP_loop_ull_doacross_static_start",
+ BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR,
+ ATTR_NOTHROW_LEAF_LIST)
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_DOACROSS_DYNAMIC_START,
+ "GOMP_loop_ull_doacross_dynamic_start",
+ BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR,
+ ATTR_NOTHROW_LEAF_LIST)
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_DOACROSS_GUIDED_START,
+ "GOMP_loop_ull_doacross_guided_start",
+ BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR,
+ ATTR_NOTHROW_LEAF_LIST)
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_DOACROSS_RUNTIME_START,
+ "GOMP_loop_ull_doacross_runtime_start",
+ BT_FN_BOOL_UINT_ULLPTR_ULLPTR_ULLPTR,
+ ATTR_NOTHROW_LEAF_LIST)
DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_STATIC_NEXT, "GOMP_loop_ull_static_next",
BT_FN_BOOL_ULONGLONGPTR_ULONGLONGPTR, ATTR_NOTHROW_LEAF_LIST)
DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_DYNAMIC_NEXT, "GOMP_loop_ull_dynamic_next",
@@ -250,6 +266,10 @@ DEF_GOMP_BUILTIN (BUILT_IN_GOMP_DOACROSS
BT_FN_VOID_PTR, ATTR_NOTHROW_LEAF_LIST)
DEF_GOMP_BUILTIN (BUILT_IN_GOMP_DOACROSS_WAIT, "GOMP_doacross_wait",
BT_FN_VOID_LONG_VAR, ATTR_NOTHROW_LEAF_LIST)
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_DOACROSS_ULL_POST, "GOMP_doacross_ull_post",
+ BT_FN_VOID_PTR, ATTR_NOTHROW_LEAF_LIST)
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_DOACROSS_ULL_WAIT, "GOMP_doacross_ull_wait",
+ BT_FN_VOID_ULL_VAR, ATTR_NOTHROW_LEAF_LIST)
DEF_GOMP_BUILTIN (BUILT_IN_GOMP_PARALLEL, "GOMP_parallel",
BT_FN_VOID_OMPFN_PTR_UINT_UINT, ATTR_NOTHROW_LIST)
DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TASK, "GOMP_task",
@@ -475,6 +475,9 @@ DEF_FUNCTION_TYPE_4 (BT_FN_VOID_SIZE_CON
BT_CONST_VOLATILE_PTR, BT_PTR, BT_INT)
DEF_FUNCTION_TYPE_4 (BT_FN_BOOL_UINT_LONGPTR_LONGPTR_LONGPTR,
BT_BOOL, BT_UINT, BT_PTR_LONG, BT_PTR_LONG, BT_PTR_LONG)
+DEF_FUNCTION_TYPE_4 (BT_FN_BOOL_UINT_ULLPTR_ULLPTR_ULLPTR,
+ BT_BOOL, BT_UINT, BT_PTR_ULONGLONG, BT_PTR_ULONGLONG,
+ BT_PTR_ULONGLONG)
DEF_FUNCTION_TYPE_5 (BT_FN_INT_STRING_INT_SIZE_CONST_STRING_VALIST_ARG,
BT_INT, BT_STRING, BT_INT, BT_SIZE, BT_CONST_STRING,
@@ -502,6 +505,9 @@ DEF_FUNCTION_TYPE_5 (BT_FN_VOID_OMPFN_PT
DEF_FUNCTION_TYPE_5 (BT_FN_BOOL_UINT_LONGPTR_LONG_LONGPTR_LONGPTR,
BT_BOOL, BT_UINT, BT_PTR_LONG, BT_LONG, BT_PTR_LONG,
BT_PTR_LONG)
+DEF_FUNCTION_TYPE_5 (BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR,
+ BT_BOOL, BT_UINT, BT_PTR_ULONGLONG, BT_ULONGLONG,
+ BT_PTR_ULONGLONG, BT_PTR_ULONGLONG)
DEF_FUNCTION_TYPE_6 (BT_FN_INT_STRING_SIZE_INT_SIZE_CONST_STRING_VALIST_ARG,
BT_INT, BT_STRING, BT_SIZE, BT_INT, BT_SIZE,
@@ -578,6 +584,8 @@ DEF_FUNCTION_TYPE_VAR_1 (BT_FN_UINT32_UI
BT_UINT32, BT_UINT32)
DEF_FUNCTION_TYPE_VAR_1 (BT_FN_VOID_LONG_VAR,
BT_VOID, BT_LONG)
+DEF_FUNCTION_TYPE_VAR_1 (BT_FN_VOID_ULL_VAR,
+ BT_VOID, BT_ULONGLONG)
DEF_FUNCTION_TYPE_VAR_2 (BT_FN_INT_FILEPTR_CONST_STRING_VAR,
BT_INT, BT_FILEPTR, BT_CONST_STRING)
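The variadic type added here is the one GOMP_doacross_ull_wait is declared with further down: the first sink coordinate is the named unsigned long long parameter and the remaining ncounts - 1 coordinates are passed as varargs. A hedged sketch of a call for an ordered(3) nest (wait_on is an illustrative helper, not a real entry point):

/* BT_FN_VOID_ULL_VAR corresponds to void f (unsigned long long, ...).  */
extern void GOMP_doacross_ull_wait (unsigned long long, ...);

static void
wait_on (unsigned long long i, unsigned long long j, unsigned long long k)
{
  /* Roughly what depend(sink: i - 1, j, k + 1) in an ordered(3) loop nest
     turns into.  */
  GOMP_doacross_ull_wait (i - 1, j, k + 1);
}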
@@ -156,6 +156,9 @@ DEF_FUNCTION_TYPE_4 (BT_FN_VOID_SIZE_CON
BT_CONST_VOLATILE_PTR, BT_PTR, BT_INT)
DEF_FUNCTION_TYPE_4 (BT_FN_BOOL_UINT_LONGPTR_LONGPTR_LONGPTR,
BT_BOOL, BT_UINT, BT_PTR_LONG, BT_PTR_LONG, BT_PTR_LONG)
+DEF_FUNCTION_TYPE_4 (BT_FN_BOOL_UINT_ULLPTR_ULLPTR_ULLPTR,
+ BT_BOOL, BT_UINT, BT_PTR_ULONGLONG, BT_PTR_ULONGLONG,
+ BT_PTR_ULONGLONG)
DEF_FUNCTION_TYPE_5 (BT_FN_VOID_OMPFN_PTR_UINT_UINT_UINT,
BT_VOID, BT_PTR_FN_VOID_PTR, BT_PTR, BT_UINT, BT_UINT,
@@ -170,6 +173,9 @@ DEF_FUNCTION_TYPE_5 (BT_FN_VOID_INT_SIZE
DEF_FUNCTION_TYPE_5 (BT_FN_BOOL_UINT_LONGPTR_LONG_LONGPTR_LONGPTR,
BT_BOOL, BT_UINT, BT_PTR_LONG, BT_LONG, BT_PTR_LONG,
BT_PTR_LONG)
+DEF_FUNCTION_TYPE_5 (BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR,
+ BT_BOOL, BT_UINT, BT_PTR_ULONGLONG, BT_ULONGLONG,
+ BT_PTR_ULONGLONG, BT_PTR_ULONGLONG)
DEF_FUNCTION_TYPE_6 (BT_FN_BOOL_LONG_LONG_LONG_LONG_LONGPTR_LONGPTR,
BT_BOOL, BT_LONG, BT_LONG, BT_LONG, BT_LONG,
@@ -232,6 +238,8 @@ DEF_FUNCTION_TYPE_VAR_0 (BT_FN_VOID_VAR,
DEF_FUNCTION_TYPE_VAR_1 (BT_FN_VOID_LONG_VAR,
BT_VOID, BT_LONG)
+DEF_FUNCTION_TYPE_VAR_1 (BT_FN_VOID_ULL_VAR,
+ BT_VOID, BT_ULONGLONG)
DEF_FUNCTION_TYPE_VAR_2 (BT_FN_VOID_INT_INT_VAR, BT_VOID, BT_INT, BT_INT)
@@ -299,6 +299,114 @@ GOMP_loop_ull_ordered_runtime_start (boo
}
}
+/* The *_doacross_*_start routines are similar. The only difference is that
+ this work-share construct is initialized to expect an ORDERED(N) - DOACROSS
+ section, and the worksharing loop always iterates from 0 to COUNTS[0] - 1,
+ while the other COUNTS array elements tell the library the number of
+ iterations in the ordered inner loops. */
+
+static bool
+gomp_loop_ull_doacross_static_start (unsigned ncounts, gomp_ull *counts,
+ gomp_ull chunk_size, gomp_ull *istart,
+ gomp_ull *iend)
+{
+ struct gomp_thread *thr = gomp_thread ();
+
+ thr->ts.static_trip = 0;
+ if (gomp_work_share_start (false))
+ {
+ gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1,
+ GFS_STATIC, chunk_size);
+ gomp_doacross_ull_init (ncounts, counts, chunk_size);
+ gomp_work_share_init_done ();
+ }
+
+ return !gomp_iter_ull_static_next (istart, iend);
+}
+
+static bool
+gomp_loop_ull_doacross_dynamic_start (unsigned ncounts, gomp_ull *counts,
+ gomp_ull chunk_size, gomp_ull *istart,
+ gomp_ull *iend)
+{
+ struct gomp_thread *thr = gomp_thread ();
+ bool ret;
+
+ if (gomp_work_share_start (false))
+ {
+ gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1,
+ GFS_DYNAMIC, chunk_size);
+ gomp_doacross_ull_init (ncounts, counts, chunk_size);
+ gomp_work_share_init_done ();
+ }
+
+#if defined HAVE_SYNC_BUILTINS && defined __LP64__
+ ret = gomp_iter_ull_dynamic_next (istart, iend);
+#else
+ gomp_mutex_lock (&thr->ts.work_share->lock);
+ ret = gomp_iter_ull_dynamic_next_locked (istart, iend);
+ gomp_mutex_unlock (&thr->ts.work_share->lock);
+#endif
+
+ return ret;
+}
+
+static bool
+gomp_loop_ull_doacross_guided_start (unsigned ncounts, gomp_ull *counts,
+ gomp_ull chunk_size, gomp_ull *istart,
+ gomp_ull *iend)
+{
+ struct gomp_thread *thr = gomp_thread ();
+ bool ret;
+
+ if (gomp_work_share_start (false))
+ {
+ gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1,
+ GFS_GUIDED, chunk_size);
+ gomp_doacross_ull_init (ncounts, counts, chunk_size);
+ gomp_work_share_init_done ();
+ }
+
+#if defined HAVE_SYNC_BUILTINS && defined __LP64__
+ ret = gomp_iter_ull_guided_next (istart, iend);
+#else
+ gomp_mutex_lock (&thr->ts.work_share->lock);
+ ret = gomp_iter_ull_guided_next_locked (istart, iend);
+ gomp_mutex_unlock (&thr->ts.work_share->lock);
+#endif
+
+ return ret;
+}
+
+bool
+GOMP_loop_ull_doacross_runtime_start (unsigned ncounts, gomp_ull *counts,
+ gomp_ull *istart, gomp_ull *iend)
+{
+ struct gomp_task_icv *icv = gomp_icv (false);
+ switch (icv->run_sched_var)
+ {
+ case GFS_STATIC:
+ return gomp_loop_ull_doacross_static_start (ncounts, counts,
+ icv->run_sched_chunk_size,
+ istart, iend);
+ case GFS_DYNAMIC:
+ return gomp_loop_ull_doacross_dynamic_start (ncounts, counts,
+ icv->run_sched_chunk_size,
+ istart, iend);
+ case GFS_GUIDED:
+ return gomp_loop_ull_doacross_guided_start (ncounts, counts,
+ icv->run_sched_chunk_size,
+ istart, iend);
+ case GFS_AUTO:
+ /* For now map to schedule(static), later on we could play with feedback
+ driven choice. */
+ return gomp_loop_ull_doacross_static_start (ncounts, counts,
+ 0, istart, iend);
+ default:
+ abort ();
+ }
+}
+
/* The *_next routines are called when the thread completes processing of
the iteration block currently assigned to it. If the work-share
construct is bound directly to a parallel construct, then the iteration
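To show how the new start routines fit together with the post/wait entry points, here is a hand-written approximation of the calling sequence for an ordered(2) loop nest with 64 x 32 iterations, where counts[0] is the workshared outer trip count and counts[1] the inner one, as described in the comment above. Schematic only, with illustrative names; the real compiler expansion also handles chunk sizes, bias and the guards around the sink wait.

extern _Bool GOMP_loop_ull_doacross_static_start (unsigned, unsigned long long *,
                                                  unsigned long long,
                                                  unsigned long long *,
                                                  unsigned long long *);
extern _Bool GOMP_loop_ull_static_next (unsigned long long *,
                                        unsigned long long *);
extern void GOMP_doacross_ull_post (unsigned long long *);
extern void GOMP_doacross_ull_wait (unsigned long long, ...);
extern void GOMP_loop_end (void);
extern void body (unsigned long long, unsigned long long);

void
run (void)
{
  unsigned long long counts[2] = { 64, 32 };
  unsigned long long istart, iend, i, j;
  if (GOMP_loop_ull_doacross_static_start (2, counts, 0, &istart, &iend))
    do
      for (i = istart; i < iend; i++)
        for (j = 0; j < counts[1]; j++)
          {
            unsigned long long vec[2] = { i, j };
            if (i > 0)                      /* depend(sink: i - 1, j) */
              GOMP_doacross_ull_wait (i - 1, j);
            body (i, j);
            GOMP_doacross_ull_post (vec);   /* depend(source) */
          }
    while (GOMP_loop_ull_static_next (&istart, &iend));
  GOMP_loop_end ();
}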
@@ -466,6 +574,13 @@ extern __typeof(gomp_loop_ull_ordered_dy
extern __typeof(gomp_loop_ull_ordered_guided_start) GOMP_loop_ull_ordered_guided_start
__attribute__((alias ("gomp_loop_ull_ordered_guided_start")));
+extern __typeof(gomp_loop_ull_doacross_static_start) GOMP_loop_ull_doacross_static_start
+ __attribute__((alias ("gomp_loop_ull_doacross_static_start")));
+extern __typeof(gomp_loop_ull_doacross_dynamic_start) GOMP_loop_ull_doacross_dynamic_start
+ __attribute__((alias ("gomp_loop_ull_doacross_dynamic_start")));
+extern __typeof(gomp_loop_ull_doacross_guided_start) GOMP_loop_ull_doacross_guided_start
+ __attribute__((alias ("gomp_loop_ull_doacross_guided_start")));
+
extern __typeof(gomp_loop_ull_static_next) GOMP_loop_ull_static_next
__attribute__((alias ("gomp_loop_ull_static_next")));
extern __typeof(gomp_loop_ull_dynamic_next) GOMP_loop_ull_dynamic_next
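These lines use libgomp's usual export pattern: when the target supports attribute alias, the public GOMP_* name is emitted as an alias of the static gomp_* worker, and the plain wrapper definitions further down are used otherwise. A minimal sketch of the pattern with illustrative names:

static int
worker (int x)
{
  return x + 1;
}

/* The exported name is bound directly to the static implementation, so no
   wrapper call is emitted; this is what the __typeof/alias lines above do.  */
extern __typeof (worker) public_entry __attribute__ ((alias ("worker")));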
@@ -535,6 +650,33 @@ GOMP_loop_ull_ordered_guided_start (bool
}
bool
+GOMP_loop_ull_doacross_static_start (unsigned ncounts, gomp_ull *counts,
+ gomp_ull chunk_size, gomp_ull *istart,
+ gomp_ull *iend)
+{
+ return gomp_loop_ull_doacross_static_start (ncounts, counts, chunk_size,
+ istart, iend);
+}
+
+bool
+GOMP_loop_ull_doacross_dynamic_start (unsigned ncounts, gomp_ull *counts,
+ gomp_ull chunk_size, gomp_ull *istart,
+ gomp_ull *iend)
+{
+ return gomp_loop_ull_doacross_dynamic_start (ncounts, counts, chunk_size,
+ istart, iend);
+}
+
+bool
+GOMP_loop_ull_doacross_guided_start (unsigned ncounts, gomp_ull *counts,
+ gomp_ull chunk_size, gomp_ull *istart,
+ gomp_ull *iend)
+{
+ return gomp_loop_ull_doacross_guided_start (ncounts, counts, chunk_size,
+ istart, iend);
+}
+
+bool
GOMP_loop_ull_static_next (gomp_ull *istart, gomp_ull *iend)
{
return gomp_loop_ull_static_next (istart, iend);
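For completeness, the runtime_start variant defers to the schedule ICV, so the schedule of a doacross loop compiled with schedule(runtime) can be changed without recompiling. A small user-level sketch, assuming the loop in question uses schedule(runtime):

#include <omp.h>

int
main (void)
{
  /* Selects the dynamic doacross start path above with a chunk of 4 for
     subsequent schedule(runtime) loops.  */
  omp_set_schedule (omp_sched_dynamic, 4);
  /* ... run the ordered(n) loop ... */
  return 0;
}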
@@ -317,7 +317,6 @@ gomp_doacross_init (unsigned ncounts, lo
doacross->elt_sz = elt_sz;
doacross->ncounts = ncounts;
doacross->flattened = false;
- doacross->boundary = 0;
doacross->array = (unsigned char *)
((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
& ~(uintptr_t) 63);
@@ -479,3 +478,296 @@ GOMP_doacross_wait (long first, ...)
while (1);
__sync_synchronize ();
}
+
+typedef unsigned long long gomp_ull;
+
+void
+gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, gomp_ull chunk_size)
+{
+ struct gomp_thread *thr = gomp_thread ();
+ struct gomp_team *team = thr->ts.team;
+ struct gomp_work_share *ws = thr->ts.work_share;
+ unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
+ unsigned long ent, num_ents, elt_sz, shift_sz;
+ struct gomp_doacross_work_share *doacross;
+
+ if (team == NULL || team->nthreads == 1)
+ return;
+
+ for (i = 0; i < ncounts; i++)
+ {
+ /* If any count is 0, GOMP_doacross_{post,wait} can't be called. */
+ if (counts[i] == 0)
+ return;
+
+ if (num_bits <= MAX_COLLAPSED_BITS)
+ {
+ unsigned int this_bits;
+ if (counts[i] == 1)
+ this_bits = 1;
+ else
+ this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__
+ - __builtin_clzll (counts[i] - 1);
+ if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
+ {
+ bits[i] = this_bits;
+ num_bits += this_bits;
+ }
+ else
+ num_bits = MAX_COLLAPSED_BITS + 1;
+ }
+ }
+
+ if (ws->sched == GFS_STATIC)
+ num_ents = team->nthreads;
+ else
+ num_ents = (counts[0] - 1) / chunk_size + 1;
+ if (num_bits <= MAX_COLLAPSED_BITS)
+ {
+ elt_sz = sizeof (unsigned long);
+ shift_sz = ncounts * sizeof (unsigned int);
+ }
+ else
+ {
+ if (sizeof (gomp_ull) == sizeof (unsigned long))
+ elt_sz = sizeof (gomp_ull) * ncounts;
+ else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long))
+ elt_sz = sizeof (unsigned long) * 2 * ncounts;
+ else
+ abort ();
+ shift_sz = 0;
+ }
+ elt_sz = (elt_sz + 63) & ~63UL;
+
+ doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
+ + shift_sz);
+ doacross->chunk_size_ull = chunk_size;
+ doacross->elt_sz = elt_sz;
+ doacross->ncounts = ncounts;
+ doacross->flattened = false;
+ doacross->boundary = 0;
+ doacross->array = (unsigned char *)
+ ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
+ & ~(uintptr_t) 63);
+ if (num_bits <= MAX_COLLAPSED_BITS)
+ {
+ unsigned int shift_count = 0;
+ doacross->flattened = true;
+ for (i = ncounts; i > 0; i--)
+ {
+ doacross->shift_counts[i - 1] = shift_count;
+ shift_count += bits[i - 1];
+ }
+ for (ent = 0; ent < num_ents; ent++)
+ *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
+ }
+ else
+ for (ent = 0; ent < num_ents; ent++)
+ memset (doacross->array + ent * elt_sz, '\0',
+ sizeof (unsigned long) * ncounts);
+ if (ws->sched == GFS_STATIC && chunk_size == 0)
+ {
+ gomp_ull q = counts[0] / num_ents;
+ gomp_ull t = counts[0] % num_ents;
+ doacross->boundary_ull = t * (q + 1);
+ doacross->q_ull = q;
+ doacross->t = t;
+ }
+ ws->doacross = doacross;
+}
+
+/* DOACROSS POST operation. */
+
+void
+GOMP_doacross_ull_post (gomp_ull *counts)
+{
+ struct gomp_thread *thr = gomp_thread ();
+ struct gomp_work_share *ws = thr->ts.work_share;
+ struct gomp_doacross_work_share *doacross = ws->doacross;
+ unsigned long ent;
+ unsigned int i;
+
+ if (__builtin_expect (doacross == NULL, 0))
+ {
+ __sync_synchronize ();
+ return;
+ }
+
+ if (__builtin_expect (ws->sched == GFS_STATIC, 1))
+ ent = thr->ts.team_id;
+ else
+ ent = counts[0] / doacross->chunk_size_ull;
+
+ if (__builtin_expect (doacross->flattened, 1))
+ {
+ unsigned long *array = (unsigned long *) (doacross->array
+ + ent * doacross->elt_sz);
+ gomp_ull flattened
+ = counts[0] << doacross->shift_counts[0];
+
+ for (i = 1; i < doacross->ncounts; i++)
+ flattened |= counts[i] << doacross->shift_counts[i];
+ flattened++;
+ if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
+ __atomic_thread_fence (MEMMODEL_RELEASE);
+ else
+ __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
+ return;
+ }
+
+ __atomic_thread_fence (MEMMODEL_ACQUIRE);
+ if (sizeof (gomp_ull) == sizeof (unsigned long))
+ {
+ gomp_ull *array = (gomp_ull *) (doacross->array
+ + ent * doacross->elt_sz);
+
+ for (i = doacross->ncounts; i-- > 0; )
+ {
+ if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
+ __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
+ }
+ }
+ else
+ {
+ unsigned long *array = (unsigned long *) (doacross->array
+ + ent * doacross->elt_sz);
+
+ for (i = doacross->ncounts; i-- > 0; )
+ {
+ gomp_ull cull = counts[i] + 1UL;
+ unsigned long c = (unsigned long) cull;
+ if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED))
+ __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE);
+ c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
+ if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED))
+ __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE);
+ }
+ }
+}
+
+/* DOACROSS WAIT operation. */
+
+void
+GOMP_doacross_ull_wait (gomp_ull first, ...)
+{
+ struct gomp_thread *thr = gomp_thread ();
+ struct gomp_work_share *ws = thr->ts.work_share;
+ struct gomp_doacross_work_share *doacross = ws->doacross;
+ va_list ap;
+ unsigned long ent;
+ unsigned int i;
+
+ if (__builtin_expect (doacross == NULL, 0))
+ {
+ __sync_synchronize ();
+ return;
+ }
+
+ if (__builtin_expect (ws->sched == GFS_STATIC, 1))
+ {
+ if (ws->chunk_size_ull == 0)
+ {
+ if (first < doacross->boundary_ull)
+ ent = first / (doacross->q_ull + 1);
+ else
+ ent = (first - doacross->boundary_ull) / doacross->q_ull
+ + doacross->t;
+ }
+ else
+ ent = first / ws->chunk_size_ull % thr->ts.team->nthreads;
+ }
+ else
+ ent = first / doacross->chunk_size_ull;
+
+ if (__builtin_expect (doacross->flattened, 1))
+ {
+ unsigned long *array = (unsigned long *) (doacross->array
+ + ent * doacross->elt_sz);
+ gomp_ull flattened = first << doacross->shift_counts[0];
+ unsigned long cur;
+
+ va_start (ap, first);
+ for (i = 1; i < doacross->ncounts; i++)
+ flattened |= va_arg (ap, gomp_ull)
+ << doacross->shift_counts[i];
+ cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
+ if (flattened < cur)
+ {
+ __atomic_thread_fence (MEMMODEL_RELEASE);
+ va_end (ap);
+ return;
+ }
+ doacross_spin (array, flattened, cur);
+ __atomic_thread_fence (MEMMODEL_RELEASE);
+ va_end (ap);
+ return;
+ }
+
+ if (sizeof (gomp_ull) == sizeof (unsigned long))
+ {
+ gomp_ull *array = (gomp_ull *) (doacross->array
+ + ent * doacross->elt_sz);
+ do
+ {
+ va_start (ap, first);
+ for (i = 0; i < doacross->ncounts; i++)
+ {
+ gomp_ull thisv
+ = (i ? va_arg (ap, gomp_ull) : first) + 1;
+ gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
+ if (thisv < cur)
+ {
+ i = doacross->ncounts;
+ break;
+ }
+ if (thisv > cur)
+ break;
+ }
+ va_end (ap);
+ if (i == doacross->ncounts)
+ break;
+ cpu_relax ();
+ }
+ while (1);
+ }
+ else
+ {
+ unsigned long *array = (unsigned long *) (doacross->array
+ + ent * doacross->elt_sz);
+ do
+ {
+ va_start (ap, first);
+ for (i = 0; i < doacross->ncounts; i++)
+ {
+ gomp_ull thisv
+ = (i ? va_arg (ap, gomp_ull) : first) + 1;
+ unsigned long t
+ = thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
+ unsigned long cur
+ = __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED);
+ if (t < cur)
+ {
+ i = doacross->ncounts;
+ break;
+ }
+ if (t > cur)
+ break;
+ t = thisv;
+ cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED);
+ if (t < cur)
+ {
+ i = doacross->ncounts;
+ break;
+ }
+ if (t > cur)
+ break;
+ }
+ va_end (ap);
+ if (i == doacross->ncounts)
+ break;
+ cpu_relax ();
+ }
+ while (1);
+ }
+ __sync_synchronize ();
+}
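To make the flattened fast path concrete, here is a standalone sketch (not libgomp code; names are illustrative) of the packing that gomp_doacross_ull_init sets up and GOMP_doacross_ull_post performs when the total bit count fits in MAX_COLLAPSED_BITS: each dimension gets just enough bits to hold count - 1, shift counts are assigned from the innermost dimension outwards, and a whole iteration vector then fits in a single unsigned long, so post and wait become one atomic store/load.

#include <stdio.h>

static unsigned int
bits_for (unsigned long long count)
{
  return count == 1 ? 1
         : __SIZEOF_LONG_LONG__ * __CHAR_BIT__ - __builtin_clzll (count - 1);
}

int
main (void)
{
  unsigned long long counts[3] = { 1000, 16, 3 };  /* needs 10 + 4 + 2 bits */
  unsigned long long vec[3] = { 5, 7, 2 };         /* some iteration vector */
  unsigned int shift[3], num_bits = 0, i;
  unsigned long flattened = 0;

  /* Innermost dimension gets shift 0, as in gomp_doacross_ull_init.  */
  for (i = 3; i > 0; i--)
    {
      shift[i - 1] = num_bits;
      num_bits += bits_for (counts[i - 1]);
    }
  /* Pack the iteration vector the way GOMP_doacross_ull_post does.  */
  for (i = 0; i < 3; i++)
    flattened |= vec[i] << shift[i];
  printf ("bits=%u packed+1=%#lx\n", num_bits, flattened + 1);
  return 0;
}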
@@ -280,6 +280,12 @@ GOMP_4.1 {
GOMP_loop_doacross_static_start;
GOMP_doacross_post;
GOMP_doacross_wait;
+ GOMP_loop_ull_doacross_dynamic_start;
+ GOMP_loop_ull_doacross_guided_start;
+ GOMP_loop_ull_doacross_runtime_start;
+ GOMP_loop_ull_doacross_static_start;
+ GOMP_doacross_ull_post;
+ GOMP_doacross_ull_wait;
} GOMP_4.0.1;
OACC_2.0 {
@@ -173,12 +173,34 @@ extern bool GOMP_loop_ull_ordered_guided
extern bool GOMP_loop_ull_ordered_runtime_next (unsigned long long *,
unsigned long long *);
+extern bool GOMP_loop_ull_doacross_static_start (unsigned,
+ unsigned long long *,
+ unsigned long long,
+ unsigned long long *,
+ unsigned long long *);
+extern bool GOMP_loop_ull_doacross_dynamic_start (unsigned,
+ unsigned long long *,
+ unsigned long long,
+ unsigned long long *,
+ unsigned long long *);
+extern bool GOMP_loop_ull_doacross_guided_start (unsigned,
+ unsigned long long *,
+ unsigned long long,
+ unsigned long long *,
+ unsigned long long *);
+extern bool GOMP_loop_ull_doacross_runtime_start (unsigned,
+ unsigned long long *,
+ unsigned long long *,
+ unsigned long long *);
+
/* ordered.c */
extern void GOMP_ordered_start (void);
extern void GOMP_ordered_end (void);
extern void GOMP_doacross_post (long *);
extern void GOMP_doacross_wait (long, ...);
+extern void GOMP_doacross_ull_post (unsigned long long *);
+extern void GOMP_doacross_ull_wait (unsigned long long, ...);
/* parallel.c */
@@ -84,10 +84,14 @@ struct gomp_doacross_work_share
/* chunk_size copy, as ws->chunk_size is multiplied by incr for
GFS_DYNAMIC. */
long chunk_size;
+ /* Likewise, but for the ull implementation. */
+ unsigned long long chunk_size_ull;
/* For schedule(static,0) this is the number
of iterations assigned to the last thread, i.e. number of
iterations / number of threads. */
long q;
+ /* Likewise, but for the ull implementation. */
+ unsigned long long q_ull;
};
/* Size of each array entry (padded to cache line size). */
unsigned long elt_sz;
@@ -102,8 +106,12 @@ struct gomp_doacross_work_share
/* These two are only used for schedule(static,0). */
/* This one is number of iterations % number of threads. */
long t;
- /* And this one is cached t * (q + 1). */
- long boundary;
+ union {
+ /* And this one is cached t * (q + 1). */
+ long boundary;
+ /* Likewise, but for the ull implementation. */
+ unsigned long long boundary_ull;
+ };
/* Array of shift counts for each dimension if they can be flattened. */
unsigned int shift_counts[];
};
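As a worked illustration of the q_ull/t/boundary_ull fields (a sketch, not libgomp code; owner is an illustrative name): for schedule(static) with no chunk size, counts[0] iterations split over nthreads entries give the first T entries Q + 1 iterations and the rest Q, and GOMP_doacross_ull_wait maps an iteration number back to its owning entry with the boundary test added in ordered.c above.

#include <assert.h>

static unsigned long
owner (unsigned long long iter, unsigned long long q,
       unsigned long long t, unsigned long long boundary)
{
  return iter < boundary ? iter / (q + 1) : (iter - boundary) / q + t;
}

int
main (void)
{
  /* counts[0] = 10 iterations over 4 threads: q = 2, t = 2, boundary = 6,
     so the threads own [0,3) [3,6) [6,8) [8,10).  */
  assert (owner (2, 2, 2, 6) == 0);
  assert (owner (5, 2, 2, 6) == 1);
  assert (owner (7, 2, 2, 6) == 2);
  assert (owner (9, 2, 2, 6) == 3);
  return 0;
}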
@@ -683,6 +691,8 @@ extern void gomp_ordered_static_init (vo
extern void gomp_ordered_static_next (void);
extern void gomp_ordered_sync (void);
extern void gomp_doacross_init (unsigned, long *, long);
+extern void gomp_doacross_ull_init (unsigned, unsigned long long *,
+ unsigned long long);
/* parallel.c */
@@ -0,0 +1,225 @@
+extern void abort (void);
+
+#define N 256
+int a[N], b[N / 16][8][4], c[N / 32][8][8], g[N / 16][8][6];
+volatile int d, e;
+volatile unsigned long long f;
+
+int
+main ()
+{
+ unsigned long long i;
+ int j, k, l, m;
+ #pragma omp parallel private (l)
+ {
+ #pragma omp for schedule(static, 1) ordered (1) nowait
+ for (i = 1; i < N + f; i++)
+ {
+ #pragma omp atomic write
+ a[i] = 1;
+ #pragma omp ordered depend(sink: i - 1)
+ if (i > 1)
+ {
+ #pragma omp atomic read
+ l = a[i - 1];
+ if (l < 2)
+ abort ();
+ }
+ #pragma omp atomic write
+ a[i] = 2;
+ if (i < N - 1)
+ {
+ #pragma omp atomic read
+ l = a[i + 1];
+ if (l == 3)
+ abort ();
+ }
+ #pragma omp ordered depend(source)
+ #pragma omp atomic write
+ a[i] = 3;
+ }
+ #pragma omp for schedule(static, 0) ordered (3) nowait
+ for (i = 3; i < N / 16 - 1 + f; i++)
+ for (j = 0; j < 8; j += 2)
+ for (k = 1; k <= 3; k++)
+ {
+ #pragma omp atomic write
+ b[i][j][k] = 1;
+ #pragma omp ordered depend(sink: i, j - 2, k - 1) \
+ depend(sink: i - 2, j - 2, k + 1)
+ #pragma omp ordered depend(sink: i - 3, j + 2, k - 2)
+ if (j >= 2 && k > 1)
+ {
+ #pragma omp atomic read
+ l = b[i][j - 2][k - 1];
+ if (l < 2)
+ abort ();
+ }
+ #pragma omp atomic write
+ b[i][j][k] = 2;
+ if (i >= 5 && j >= 2 && k < 3)
+ {
+ #pragma omp atomic read
+ l = b[i - 2][j - 2][k + 1];
+ if (l < 2)
+ abort ();
+ }
+ if (i >= 6 && j < N / 16 - 3 && k == 3)
+ {
+ #pragma omp atomic read
+ l = b[i - 3][j + 2][k - 2];
+ if (l < 2)
+ abort ();
+ }
+ #pragma omp ordered depend(source)
+ #pragma omp atomic write
+ b[i][j][k] = 3;
+ }
+#define A(n) int n;
+#define B(n) A(n##0) A(n##1) A(n##2) A(n##3)
+#define C(n) B(n##0) B(n##1) B(n##2) B(n##3)
+#define D(n) C(n##0) C(n##1) C(n##2) C(n##3)
+ D(m)
+#undef A
+ #pragma omp for collapse (2) ordered(61) schedule(dynamic, 15)
+ for (i = 2; i < N / 32 + f; i++)
+ for (j = 7; j > 1; j--)
+ for (k = 6; k >= 0; k -= 2)
+#define A(n) for (n = 4; n < 5; n++)
+ D(m)
+#undef A
+ {
+ #pragma omp atomic write
+ c[i][j][k] = 1;
+#define A(n) ,n
+#define E(n) C(n##0) C(n##1) C(n##2) B(n##30) B(n##31) A(n##320) A(n##321)
+ #pragma omp ordered depend (sink: i, j, k + 2 E(m)) \
+ depend (sink:i - 2, j + 1, k - 4 E(m)) \
+ depend(sink: i - 1, j - 2, k - 2 E(m))
+ if (k <= 4)
+ {
+ l = c[i][j][k + 2];
+ if (l < 2)
+ abort ();
+ }
+ #pragma omp atomic write
+ c[i][j][k] = 2;
+ if (i >= 4 && j < 7 && k >= 4)
+ {
+ l = c[i - 2][j + 1][k - 4];
+ if (l < 2)
+ abort ();
+ }
+ if (i >= 3 && j >= 4 && k >= 2)
+ {
+ l = c[i - 1][j - 2][k - 2];
+ if (l < 2)
+ abort ();
+ }
+ #pragma omp ordered depend (source)
+ #pragma omp atomic write
+ c[i][j][k] = 3;
+ }
+ #pragma omp for schedule(static, 0) ordered (3) nowait
+ for (j = 0; j < N / 16 - 1; j++)
+ for (k = 0; k < 8; k += 2)
+ for (i = 3; i <= 5 + f; i++)
+ {
+ #pragma omp atomic write
+ g[j][k][i] = 1;
+ #pragma omp ordered depend(sink: j, k - 2, i - 1) \
+ depend(sink: j - 2, k - 2, i + 1)
+ #pragma omp ordered depend(sink: j - 3, k + 2, i - 2)
+ if (k >= 2 && i > 3)
+ {
+ #pragma omp atomic read
+ l = g[j][k - 2][i - 1];
+ if (l < 2)
+ abort ();
+ }
+ #pragma omp atomic write
+ g[j][k][i] = 2;
+ if (j >= 2 && k >= 2 && i < 5)
+ {
+ #pragma omp atomic read
+ l = g[j - 2][k - 2][i + 1];
+ if (l < 2)
+ abort ();
+ }
+ if (j >= 3 && k < N / 16 - 3 && i == 5)
+ {
+ #pragma omp atomic read
+ l = g[j - 3][k + 2][i - 2];
+ if (l < 2)
+ abort ();
+ }
+ #pragma omp ordered depend(source)
+ #pragma omp atomic write
+ g[j][k][i] = 3;
+ }
+ #pragma omp for collapse(2) ordered(4) lastprivate (i, j, k)
+ for (i = 2; i < f + 3; i++)
+ for (j = d + 1; j >= 0; j--)
+ for (k = 0; k < d; k++)
+ for (l = 0; l < d + 2; l++)
+ {
+ #pragma omp ordered depend (source)
+ #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, l)
+ if (!e)
+ abort ();
+ }
+ #pragma omp single
+ {
+ if (i != 3 || j != -1 || k != 0)
+ abort ();
+ i = 8; j = 9; k = 10;
+ }
+ #pragma omp for collapse(2) ordered(4) lastprivate (i, j, k, m)
+ for (i = 2; i < f + 3; i++)
+ for (j = d + 1; j >= 0; j--)
+ for (k = 0; k < d + 2; k++)
+ for (m = 0; m < d; m++)
+ {
+ #pragma omp ordered depend (source)
+ #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, m)
+ abort ();
+ }
+ #pragma omp single
+ if (i != 3 || j != -1 || k != 2 || m != 0)
+ abort ();
+ #pragma omp for collapse(2) ordered(4) nowait
+ for (i = 2; i < f + 3; i++)
+ for (j = d; j > 0; j--)
+ for (k = 0; k < d + 2; k++)
+ for (l = 0; l < d + 4; l++)
+ {
+ #pragma omp ordered depend (source)
+ #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, l)
+ if (!e)
+ abort ();
+ }
+ #pragma omp for nowait
+ for (i = 0; i < N; i++)
+ if (a[i] != 3 * (i >= 1))
+ abort ();
+ #pragma omp for collapse(2) private(k) nowait
+ for (i = 0; i < N / 16; i++)
+ for (j = 0; j < 8; j++)
+ for (k = 0; k < 4; k++)
+ if (b[i][j][k] != 3 * (i >= 3 && i < N / 16 - 1 && (j & 1) == 0 && k >= 1))
+ abort ();
+ #pragma omp for collapse(3) nowait
+ for (i = 0; i < N / 32; i++)
+ for (j = 0; j < 8; j++)
+ for (k = 0; k < 8; k++)
+ if (c[i][j][k] != 3 * (i >= 2 && j >= 2 && (k & 1) == 0))
+ abort ();
+ #pragma omp for collapse(2) private(k) nowait
+ for (i = 0; i < N / 16; i++)
+ for (j = 0; j < 8; j++)
+ for (k = 0; k < 6; k++)
+ if (g[i][j][k] != 3 * (i < N / 16 - 1 && (j & 1) == 0 && k >= 3))
+ abort ();
+ }
+ return 0;
+}
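A quick self-check (not part of the test) of the dimension bookkeeping in the ordered(61) loop above: D(m) declares 4 * 4 * 4 = 64 one-trip loops and E(m) names 58 of those variables, so each depend vector i, j, k +/- const, E(m) has 61 components, matching ordered(61) on a 67-deep loop nest.

#define A(n) + 1
#define B(n) A(n##0) A(n##1) A(n##2) A(n##3)
#define C(n) B(n##0) B(n##1) B(n##2) B(n##3)
#define D(n) C(n##0) C(n##1) C(n##2) C(n##3)
#define E(n) C(n##0) C(n##1) C(n##2) B(n##30) B(n##31) A(n##320) A(n##321)
_Static_assert (0 D(m) == 64, "number of m loops");
_Static_assert (3 E(m) == 61, "components per sink vector");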