diff mbox

[gomp4.1] Unsigned long long doacross implementation

Message ID 20151002121402.GH28276@tucnak.redhat.com
State New
Headers show

Commit Message

Jakub Jelinek Oct. 2, 2015, 12:14 p.m. UTC
Hi!

The default set of routines use long as the iterator type, if some loops
need either unsigned long, or long long/unsigned long long, they need to use
another implementation (__int128 iterators are not supported).
This patch adds those entry points and fixes some issues on the compiler
side.

2015-10-02  Jakub Jelinek  <jakub@redhat.com>

	* omp-low.c (expand_omp_ordered_source): Use GOMP_doacross_ull_post
	instead of GOMP_doacross_post if iter_type is unsigned long long.
	(expand_omp_ordered_sink): Use GOMP_doacross_ull_wait
	instead of GOMP_doacross_wait if iter_type is unsigned long long.
	(expand_omp_for_generic): Fix up expansion if zero_iter1_bb is
	NULL, but zero_iter2_bb is non-NULL.  Never pass the up bool argument
	to GOMP_loop_ull_doacross_*_start entrypoints.
	* omp-builtins.def (BUILT_IN_GOMP_LOOP_ULL_DOACROSS_STATIC_START,
	BUILT_IN_GOMP_LOOP_ULL_DOACROSS_DYNAMIC_START,
	BUILT_IN_GOMP_LOOP_ULL_DOACROSS_GUIDED_START,
	BUILT_IN_GOMP_LOOP_ULL_DOACROSS_RUNTIME_START,
	BUILT_IN_GOMP_DOACROSS_ULL_POST, BUILT_IN_GOMP_DOACROSS_ULL_WAIT): New
	built-ins.
	* builtin-types.def (BT_FN_BOOL_UINT_ULLPTR_ULLPTR_ULLPTR,
	BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR, BT_FN_VOID_ULL_VAR): New.
gcc/fortran/
	* types.def (BT_FN_BOOL_UINT_ULLPTR_ULLPTR_ULLPTR,
	BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR, BT_FN_VOID_ULL_VAR): New.
libgomp/
	* loop_ull.c (gomp_loop_ull_doacross_static_start,
	gomp_loop_ull_doacross_dynamic_start,
	gomp_loop_ull_doacross_guided_start,
	GOMP_loop_ull_doacross_runtime_start,
	GOMP_loop_ull_doacross_static_start,
	GOMP_loop_ull_doacross_dynamic_start,
	GOMP_loop_ull_doacross_guided_start): New functions.
	* ordered.c (gomp_doacross_init): Don't initialize boundary
	if not static scheduling.
	(gomp_doacross_ull_init, GOMP_doacross_ull_post,
	GOMP_doacross_ull_wait): New functions.
	* libgomp.map (GOMP_4.1): Export GOMP_loop_ull_doacross_dynamic_start,
	GOMP_loop_ull_doacross_guided_start,
	GOMP_loop_ull_doacross_runtime_start,
	GOMP_loop_ull_doacross_static_start,
	GOMP_doacross_ull_post and GOMP_doacross_ull_wait.
	* libgomp_g.h (GOMP_loop_ull_doacross_guided_start,
	GOMP_loop_ull_doacross_runtime_start,
	GOMP_loop_ull_doacross_static_start,
	GOMP_doacross_ull_post, GOMP_doacross_ull_wait): New prototypes.
	* libgomp.h (struct gomp_doacross_work_share): Add
	chunk_size_ull, q_ull and boundary_ull fields.
	(gomp_doacross_ull_init): New prototype.
	* testsuite/libgomp.c/doacross-2.c: New test.


	Jakub
diff mbox

Patch

--- gcc/omp-low.c.jj	2015-10-01 13:20:13.000000000 +0200
+++ gcc/omp-low.c	2015-10-02 11:38:40.140982433 +0200
@@ -7071,7 +7071,9 @@  static void
 expand_omp_ordered_source (gimple_stmt_iterator *gsi, struct omp_for_data *fd,
 			   tree *counts, location_t loc)
 {
-  enum built_in_function source_ix = BUILT_IN_GOMP_DOACROSS_POST;
+  enum built_in_function source_ix
+    = fd->iter_type == long_integer_type_node
+      ? BUILT_IN_GOMP_DOACROSS_POST : BUILT_IN_GOMP_DOACROSS_ULL_POST;
   gimple g
     = gimple_build_call (builtin_decl_explicit (source_ix), 1,
 			 build_fold_addr_expr (counts[fd->ordered]));
@@ -7086,7 +7088,9 @@  expand_omp_ordered_sink (gimple_stmt_ite
 			 tree *counts, tree c, location_t loc)
 {
   auto_vec<tree, 10> args;
-  enum built_in_function sink_ix = BUILT_IN_GOMP_DOACROSS_WAIT;
+  enum built_in_function sink_ix
+    = fd->iter_type == long_integer_type_node
+      ? BUILT_IN_GOMP_DOACROSS_WAIT : BUILT_IN_GOMP_DOACROSS_ULL_WAIT;
   tree t, off, coff = NULL_TREE, deps = OMP_CLAUSE_DECL (c), cond = NULL_TREE;
   int i;
   gimple_stmt_iterator gsi2 = *gsi;
@@ -7625,11 +7629,11 @@  expand_omp_for_generic (struct omp_regio
 	      gsi_prev (&gsi);
 	      e = split_block (entry_bb, gsi_stmt (gsi));
 	      entry_bb = e->dest;
-	      make_edge (zero_iter1_bb, entry_bb, EDGE_FALLTHRU);
+	      make_edge (zero_iter2_bb, entry_bb, EDGE_FALLTHRU);
 	      gsi = gsi_last_bb (entry_bb);
 	      set_immediate_dominator (CDI_DOMINATORS, entry_bb,
 				       get_immediate_dominator
-					 (CDI_DOMINATORS, zero_iter1_bb));
+					 (CDI_DOMINATORS, zero_iter2_bb));
 	    }
 	}
       if (fd->collapse == 1)
@@ -7762,7 +7766,7 @@  expand_omp_for_generic (struct omp_regio
 	      t0 = fold_build2 (PLUS_EXPR, fd->iter_type, t0, bias);
 	    }
 	}
-      if (fd->iter_type == long_integer_type_node)
+      if (fd->iter_type == long_integer_type_node || fd->ordered)
 	{
 	  if (fd->chunk_size)
 	    {
@@ -7801,14 +7805,8 @@  expand_omp_for_generic (struct omp_regio
 	      tree bfn_decl = builtin_decl_explicit (start_fn);
 	      t = fold_convert (fd->iter_type, fd->chunk_size);
 	      t = omp_adjust_chunk_size (t, fd->simd_schedule);
-	      if (fd->ordered)
-		t = build_call_expr (bfn_decl, 6, t5, t0, t1, t, t3, t4);
-	      else
-		t = build_call_expr (bfn_decl, 7, t5, t0, t1, t2, t, t3, t4);
+	      t = build_call_expr (bfn_decl, 7, t5, t0, t1, t2, t, t3, t4);
 	    }
-	  else if (fd->ordered)
-	    t = build_call_expr (builtin_decl_explicit (start_fn),
-				 5, t5, t0, t1, t3, t4);
 	  else
 	    t = build_call_expr (builtin_decl_explicit (start_fn),
 				 6, t5, t0, t1, t2, t3, t4);
--- gcc/omp-builtins.def.jj	2015-09-24 13:33:02.000000000 +0200
+++ gcc/omp-builtins.def	2015-10-01 17:19:13.565016484 +0200
@@ -197,6 +197,22 @@  DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL
 		  "GOMP_loop_ull_ordered_runtime_start",
 		  BT_FN_BOOL_BOOL_ULL_ULL_ULL_ULLPTR_ULLPTR,
 		  ATTR_NOTHROW_LEAF_LIST)
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_DOACROSS_STATIC_START,
+		  "GOMP_loop_ull_doacross_static_start",
+		  BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR,
+		  ATTR_NOTHROW_LEAF_LIST)
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_DOACROSS_DYNAMIC_START,
+		  "GOMP_loop_ull_doacross_dynamic_start",
+		  BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR,
+		  ATTR_NOTHROW_LEAF_LIST)
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_DOACROSS_GUIDED_START,
+		  "GOMP_loop_ull_doacross_guided_start",
+		  BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR,
+		  ATTR_NOTHROW_LEAF_LIST)
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_DOACROSS_RUNTIME_START,
+		  "GOMP_loop_ull_doacross_runtime_start",
+		  BT_FN_BOOL_UINT_ULLPTR_ULLPTR_ULLPTR,
+		  ATTR_NOTHROW_LEAF_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_STATIC_NEXT, "GOMP_loop_ull_static_next",
 		  BT_FN_BOOL_ULONGLONGPTR_ULONGLONGPTR, ATTR_NOTHROW_LEAF_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_DYNAMIC_NEXT, "GOMP_loop_ull_dynamic_next",
@@ -250,6 +266,10 @@  DEF_GOMP_BUILTIN (BUILT_IN_GOMP_DOACROSS
 		  BT_FN_VOID_PTR, ATTR_NOTHROW_LEAF_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_DOACROSS_WAIT, "GOMP_doacross_wait",
 		  BT_FN_VOID_LONG_VAR, ATTR_NOTHROW_LEAF_LIST)
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_DOACROSS_ULL_POST, "GOMP_doacross_ull_post",
+		  BT_FN_VOID_PTR, ATTR_NOTHROW_LEAF_LIST)
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_DOACROSS_ULL_WAIT, "GOMP_doacross_ull_wait",
+		  BT_FN_VOID_ULL_VAR, ATTR_NOTHROW_LEAF_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_PARALLEL, "GOMP_parallel",
 		  BT_FN_VOID_OMPFN_PTR_UINT_UINT, ATTR_NOTHROW_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TASK, "GOMP_task",
--- gcc/builtin-types.def.jj	2015-09-17 09:24:53.000000000 +0200
+++ gcc/builtin-types.def	2015-10-01 17:25:48.760370499 +0200
@@ -475,6 +475,9 @@  DEF_FUNCTION_TYPE_4 (BT_FN_VOID_SIZE_CON
 		     BT_CONST_VOLATILE_PTR, BT_PTR, BT_INT)
 DEF_FUNCTION_TYPE_4 (BT_FN_BOOL_UINT_LONGPTR_LONGPTR_LONGPTR,
 		     BT_BOOL, BT_UINT, BT_PTR_LONG, BT_PTR_LONG, BT_PTR_LONG)
+DEF_FUNCTION_TYPE_4 (BT_FN_BOOL_UINT_ULLPTR_ULLPTR_ULLPTR,
+		     BT_BOOL, BT_UINT, BT_PTR_ULONGLONG, BT_PTR_ULONGLONG,
+		     BT_PTR_ULONGLONG)
 
 DEF_FUNCTION_TYPE_5 (BT_FN_INT_STRING_INT_SIZE_CONST_STRING_VALIST_ARG,
 		     BT_INT, BT_STRING, BT_INT, BT_SIZE, BT_CONST_STRING,
@@ -502,6 +505,9 @@  DEF_FUNCTION_TYPE_5 (BT_FN_VOID_OMPFN_PT
 DEF_FUNCTION_TYPE_5 (BT_FN_BOOL_UINT_LONGPTR_LONG_LONGPTR_LONGPTR,
 		     BT_BOOL, BT_UINT, BT_PTR_LONG, BT_LONG, BT_PTR_LONG,
 		     BT_PTR_LONG)
+DEF_FUNCTION_TYPE_5 (BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR,
+		     BT_BOOL, BT_UINT, BT_PTR_ULONGLONG, BT_ULONGLONG,
+		     BT_PTR_ULONGLONG, BT_PTR_ULONGLONG)
 
 DEF_FUNCTION_TYPE_6 (BT_FN_INT_STRING_SIZE_INT_SIZE_CONST_STRING_VALIST_ARG,
 		     BT_INT, BT_STRING, BT_SIZE, BT_INT, BT_SIZE,
@@ -578,6 +584,8 @@  DEF_FUNCTION_TYPE_VAR_1 (BT_FN_UINT32_UI
 			 BT_UINT32, BT_UINT32)
 DEF_FUNCTION_TYPE_VAR_1 (BT_FN_VOID_LONG_VAR,
 			 BT_VOID, BT_LONG)
+DEF_FUNCTION_TYPE_VAR_1 (BT_FN_VOID_ULL_VAR,
+			 BT_VOID, BT_ULONGLONG)
 
 DEF_FUNCTION_TYPE_VAR_2 (BT_FN_INT_FILEPTR_CONST_STRING_VAR,
 			 BT_INT, BT_FILEPTR, BT_CONST_STRING)
--- gcc/fortran/types.def.jj	2015-09-17 09:31:11.000000000 +0200
+++ gcc/fortran/types.def	2015-10-01 17:30:29.856340476 +0200
@@ -156,6 +156,9 @@  DEF_FUNCTION_TYPE_4 (BT_FN_VOID_SIZE_CON
 		     BT_CONST_VOLATILE_PTR, BT_PTR, BT_INT)
 DEF_FUNCTION_TYPE_4 (BT_FN_BOOL_UINT_LONGPTR_LONGPTR_LONGPTR,
 		     BT_BOOL, BT_UINT, BT_PTR_LONG, BT_PTR_LONG, BT_PTR_LONG)
+DEF_FUNCTION_TYPE_4 (BT_FN_BOOL_UINT_ULLPTR_ULLPTR_ULLPTR,
+		     BT_BOOL, BT_UINT, BT_PTR_ULONGLONG, BT_PTR_ULONGLONG,
+		     BT_PTR_ULONGLONG)
 
 DEF_FUNCTION_TYPE_5 (BT_FN_VOID_OMPFN_PTR_UINT_UINT_UINT,
 		     BT_VOID, BT_PTR_FN_VOID_PTR, BT_PTR, BT_UINT, BT_UINT,
@@ -170,6 +173,9 @@  DEF_FUNCTION_TYPE_5 (BT_FN_VOID_INT_SIZE
 DEF_FUNCTION_TYPE_5 (BT_FN_BOOL_UINT_LONGPTR_LONG_LONGPTR_LONGPTR,
 		     BT_BOOL, BT_UINT, BT_PTR_LONG, BT_LONG, BT_PTR_LONG,
 		     BT_PTR_LONG)
+DEF_FUNCTION_TYPE_5 (BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR,
+		     BT_BOOL, BT_UINT, BT_PTR_ULONGLONG, BT_ULONGLONG,
+		     BT_PTR_ULONGLONG, BT_PTR_ULONGLONG)
 
 DEF_FUNCTION_TYPE_6 (BT_FN_BOOL_LONG_LONG_LONG_LONG_LONGPTR_LONGPTR,
                      BT_BOOL, BT_LONG, BT_LONG, BT_LONG, BT_LONG,
@@ -232,6 +238,8 @@  DEF_FUNCTION_TYPE_VAR_0 (BT_FN_VOID_VAR,
 
 DEF_FUNCTION_TYPE_VAR_1 (BT_FN_VOID_LONG_VAR,
 			 BT_VOID, BT_LONG)
+DEF_FUNCTION_TYPE_VAR_1 (BT_FN_VOID_ULL_VAR,
+			 BT_VOID, BT_ULONGLONG)
 
 DEF_FUNCTION_TYPE_VAR_2 (BT_FN_VOID_INT_INT_VAR, BT_VOID, BT_INT, BT_INT)
 
--- libgomp/loop_ull.c.jj	2015-06-11 10:27:48.000000000 +0200
+++ libgomp/loop_ull.c	2015-10-01 17:08:11.969445359 +0200
@@ -299,6 +299,114 @@  GOMP_loop_ull_ordered_runtime_start (boo
     }
 }
 
+/* The *_doacross_*_start routines are similar.  The only difference is that
+   this work-share construct is initialized to expect an ORDERED(N) - DOACROSS
+   section, and the worksharing loop iterates always from 0 to COUNTS[0] - 1
+   and other COUNTS array elements tell the library number of iterations
+   in the ordered inner loops.  */
+
+static bool
+gomp_loop_ull_doacross_static_start (unsigned ncounts, gomp_ull *counts,
+				     gomp_ull chunk_size, gomp_ull *istart,
+				     gomp_ull *iend)
+{
+  struct gomp_thread *thr = gomp_thread ();
+
+  thr->ts.static_trip = 0;
+  if (gomp_work_share_start (false))
+    {
+      gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1,
+			  GFS_STATIC, chunk_size);
+      gomp_doacross_ull_init (ncounts, counts, chunk_size);
+      gomp_work_share_init_done ();
+    }
+
+  return !gomp_iter_ull_static_next (istart, iend);
+}
+
+static bool
+gomp_loop_ull_doacross_dynamic_start (unsigned ncounts, gomp_ull *counts,
+				      gomp_ull chunk_size, gomp_ull *istart,
+				      gomp_ull *iend)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  bool ret;
+
+  if (gomp_work_share_start (false))
+    {
+      gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1,
+			  GFS_DYNAMIC, chunk_size);
+      gomp_doacross_ull_init (ncounts, counts, chunk_size);
+      gomp_work_share_init_done ();
+    }
+
+#if defined HAVE_SYNC_BUILTINS && defined __LP64__
+  ret = gomp_iter_ull_dynamic_next (istart, iend);
+#else
+  gomp_mutex_lock (&thr->ts.work_share->lock);
+  ret = gomp_iter_ull_dynamic_next_locked (istart, iend);
+  gomp_mutex_unlock (&thr->ts.work_share->lock);
+#endif
+
+  return ret;
+}
+
+static bool
+gomp_loop_ull_doacross_guided_start (unsigned ncounts, gomp_ull *counts,
+				     gomp_ull chunk_size, gomp_ull *istart,
+				     gomp_ull *iend)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  bool ret;
+
+  if (gomp_work_share_start (false))
+    {
+      gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1,
+			  GFS_GUIDED, chunk_size);
+      gomp_doacross_ull_init (ncounts, counts, chunk_size);
+      gomp_work_share_init_done ();
+    }
+
+#if defined HAVE_SYNC_BUILTINS && defined __LP64__
+  ret = gomp_iter_ull_guided_next (istart, iend);
+#else
+  gomp_mutex_lock (&thr->ts.work_share->lock);
+  ret = gomp_iter_ull_guided_next_locked (istart, iend);
+  gomp_mutex_unlock (&thr->ts.work_share->lock);
+#endif
+
+  return ret;
+}
+
+bool
+GOMP_loop_ull_doacross_runtime_start (unsigned ncounts, gomp_ull *counts,
+				      gomp_ull *istart, gomp_ull *iend)
+{
+  struct gomp_task_icv *icv = gomp_icv (false);
+  switch (icv->run_sched_var)
+    {
+    case GFS_STATIC:
+      return gomp_loop_ull_doacross_static_start (ncounts, counts,
+						  icv->run_sched_chunk_size,
+						  istart, iend);
+    case GFS_DYNAMIC:
+      return gomp_loop_ull_doacross_dynamic_start (ncounts, counts,
+						   icv->run_sched_chunk_size,
+						   istart, iend);
+    case GFS_GUIDED:
+      return gomp_loop_ull_doacross_guided_start (ncounts, counts,
+						  icv->run_sched_chunk_size,
+						  istart, iend);
+    case GFS_AUTO:
+      /* For now map to schedule(static), later on we could play with feedback
+	 driven choice.  */
+      return gomp_loop_ull_doacross_static_start (ncounts, counts,
+						  0, istart, iend);
+    default:
+      abort ();
+    }
+}
+
 /* The *_next routines are called when the thread completes processing of
    the iteration block currently assigned to it.  If the work-share
    construct is bound directly to a parallel construct, then the iteration
@@ -466,6 +574,13 @@  extern __typeof(gomp_loop_ull_ordered_dy
 extern __typeof(gomp_loop_ull_ordered_guided_start) GOMP_loop_ull_ordered_guided_start
 	__attribute__((alias ("gomp_loop_ull_ordered_guided_start")));
 
+extern __typeof(gomp_loop_ull_doacross_static_start) GOMP_loop_ull_doacross_static_start
+	__attribute__((alias ("gomp_loop_ull_doacross_static_start")));
+extern __typeof(gomp_loop_ull_doacross_dynamic_start) GOMP_loop_ull_doacross_dynamic_start
+	__attribute__((alias ("gomp_loop_ull_doacross_dynamic_start")));
+extern __typeof(gomp_loop_ull_doacross_guided_start) GOMP_loop_ull_doacross_guided_start
+	__attribute__((alias ("gomp_loop_ull_doacross_guided_start")));
+
 extern __typeof(gomp_loop_ull_static_next) GOMP_loop_ull_static_next
 	__attribute__((alias ("gomp_loop_ull_static_next")));
 extern __typeof(gomp_loop_ull_dynamic_next) GOMP_loop_ull_dynamic_next
@@ -535,6 +650,33 @@  GOMP_loop_ull_ordered_guided_start (bool
 }
 
 bool
+GOMP_loop_ull_doacross_static_start (unsigned ncounts, gomp_ull *counts,
+				     gomp_ull chunk_size, gomp_ull *istart,
+				     gomp_ull *iend)
+{
+  return gomp_loop_ull_doacross_static_start (ncounts, counts, chunk_size,
+					      istart, iend);
+}
+
+bool
+GOMP_loop_ull_doacross_dynamic_start (unsigned ncounts, gomp_ull *counts,
+				      gomp_ull chunk_size, gomp_ull *istart,
+				      gomp_ull *iend)
+{
+  return gomp_loop_ull_doacross_dynamic_start (ncounts, counts, chunk_size,
+					       istart, iend);
+}
+
+bool
+GOMP_loop_ull_doacross_guided_start (unsigned ncounts, gomp_ull *counts,
+				     gomp_ull chunk_size, gomp_ull *istart,
+				     gomp_ull *iend)
+{
+  return gomp_loop_ull_doacross_guided_start (ncounts, counts, chunk_size,
+					      istart, iend);
+}
+
+bool
 GOMP_loop_ull_static_next (gomp_ull *istart, gomp_ull *iend)
 {
   return gomp_loop_ull_static_next (istart, iend);
--- libgomp/ordered.c.jj	2015-09-24 20:20:32.000000000 +0200
+++ libgomp/ordered.c	2015-10-02 13:21:16.675194039 +0200
@@ -317,7 +317,6 @@  gomp_doacross_init (unsigned ncounts, lo
   doacross->elt_sz = elt_sz;
   doacross->ncounts = ncounts;
   doacross->flattened = false;
-  doacross->boundary = 0;
   doacross->array = (unsigned char *)
 		    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
 		     & ~(uintptr_t) 63);
@@ -479,3 +478,296 @@  GOMP_doacross_wait (long first, ...)
   while (1);
   __sync_synchronize ();
 }
+
+typedef unsigned long long gomp_ull;
+
+void
+gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, gomp_ull chunk_size)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+  struct gomp_work_share *ws = thr->ts.work_share;
+  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
+  unsigned long ent, num_ents, elt_sz, shift_sz;
+  struct gomp_doacross_work_share *doacross;
+
+  if (team == NULL || team->nthreads == 1)
+    return;
+
+  for (i = 0; i < ncounts; i++)
+    {
+      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
+      if (counts[i] == 0)
+	return;
+
+      if (num_bits <= MAX_COLLAPSED_BITS)
+	{
+	  unsigned int this_bits;
+	  if (counts[i] == 1)
+	    this_bits = 1;
+	  else
+	    this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__
+			- __builtin_clzll (counts[i] - 1);
+	  if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
+	    {
+	      bits[i] = this_bits;
+	      num_bits += this_bits;
+	    }
+	  else
+	    num_bits = MAX_COLLAPSED_BITS + 1;
+	}
+    }
+
+  if (ws->sched == GFS_STATIC)
+    num_ents = team->nthreads;
+  else
+    num_ents = (counts[0] - 1) / chunk_size + 1;
+  if (num_bits <= MAX_COLLAPSED_BITS)
+    {
+      elt_sz = sizeof (unsigned long);
+      shift_sz = ncounts * sizeof (unsigned int);
+    }
+  else
+    {
+      if (sizeof (gomp_ull) == sizeof (unsigned long))
+	elt_sz = sizeof (gomp_ull) * ncounts;
+      else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long))
+	elt_sz = sizeof (unsigned long) * 2 * ncounts;
+      else
+	abort ();
+      shift_sz = 0;
+    }
+  elt_sz = (elt_sz + 63) & ~63UL;
+
+  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
+			  + shift_sz);
+  doacross->chunk_size_ull = chunk_size;
+  doacross->elt_sz = elt_sz;
+  doacross->ncounts = ncounts;
+  doacross->flattened = false;
+  doacross->boundary = 0;
+  doacross->array = (unsigned char *)
+		    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
+		     & ~(uintptr_t) 63);
+  if (num_bits <= MAX_COLLAPSED_BITS)
+    {
+      unsigned int shift_count = 0;
+      doacross->flattened = true;
+      for (i = ncounts; i > 0; i--)
+	{
+	  doacross->shift_counts[i - 1] = shift_count;
+	  shift_count += bits[i - 1];
+	}
+      for (ent = 0; ent < num_ents; ent++)
+	*(unsigned long *) (doacross->array + ent * elt_sz) = 0;
+    }
+  else
+    for (ent = 0; ent < num_ents; ent++)
+      memset (doacross->array + ent * elt_sz, '\0',
+	      sizeof (unsigned long) * ncounts);
+  if (ws->sched == GFS_STATIC && chunk_size == 0)
+    {
+      gomp_ull q = counts[0] / num_ents;
+      gomp_ull t = counts[0] % num_ents;
+      doacross->boundary_ull = t * (q + 1);
+      doacross->q_ull = q;
+      doacross->t = t;
+    }
+  ws->doacross = doacross;
+}
+
+/* DOACROSS POST operation.  */
+
+void
+GOMP_doacross_ull_post (gomp_ull *counts)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_work_share *ws = thr->ts.work_share;
+  struct gomp_doacross_work_share *doacross = ws->doacross;
+  unsigned long ent;
+  unsigned int i;
+
+  if (__builtin_expect (doacross == NULL, 0))
+    {
+      __sync_synchronize ();
+      return;
+    }
+
+  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
+    ent = thr->ts.team_id;
+  else
+    ent = counts[0] / doacross->chunk_size_ull;
+
+  if (__builtin_expect (doacross->flattened, 1))
+    {
+      unsigned long *array = (unsigned long *) (doacross->array
+			      + ent * doacross->elt_sz);
+      gomp_ull flattened
+	= counts[0] << doacross->shift_counts[0];
+
+      for (i = 1; i < doacross->ncounts; i++)
+	flattened |= counts[i] << doacross->shift_counts[i];
+      flattened++;
+      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
+	__atomic_thread_fence (MEMMODEL_RELEASE);
+      else
+	__atomic_store_n (array, flattened, MEMMODEL_RELEASE);
+      return;
+    }
+
+  __atomic_thread_fence (MEMMODEL_ACQUIRE);
+  if (sizeof (gomp_ull) == sizeof (unsigned long))
+    {
+      gomp_ull *array = (gomp_ull *) (doacross->array
+				      + ent * doacross->elt_sz);
+
+      for (i = doacross->ncounts; i-- > 0; )
+	{
+	  if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
+	    __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
+	}
+    }
+  else
+    {
+      unsigned long *array = (unsigned long *) (doacross->array
+						+ ent * doacross->elt_sz);
+
+      for (i = doacross->ncounts; i-- > 0; )
+	{
+	  gomp_ull cull = counts[i] + 1UL;
+	  unsigned long c = (unsigned long) cull;
+	  if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED))
+	    __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE);
+	  c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
+	  if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED))
+	    __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE);
+	}
+    }
+}
+
+/* DOACROSS WAIT operation.  */
+
+void
+GOMP_doacross_ull_wait (gomp_ull first, ...)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_work_share *ws = thr->ts.work_share;
+  struct gomp_doacross_work_share *doacross = ws->doacross;
+  va_list ap;
+  unsigned long ent;
+  unsigned int i;
+
+  if (__builtin_expect (doacross == NULL, 0))
+    {
+      __sync_synchronize ();
+      return;
+    }
+
+  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
+    {
+      if (ws->chunk_size_ull == 0)
+	{
+	  if (first < doacross->boundary_ull)
+	    ent = first / (doacross->q_ull + 1);
+	  else
+	    ent = (first - doacross->boundary_ull) / doacross->q_ull
+		  + doacross->t;
+	}
+      else
+	ent = first / ws->chunk_size_ull % thr->ts.team->nthreads;
+    }
+  else
+    ent = first / doacross->chunk_size_ull;
+
+  if (__builtin_expect (doacross->flattened, 1))
+    {
+      unsigned long *array = (unsigned long *) (doacross->array
+						+ ent * doacross->elt_sz);
+      gomp_ull flattened = first << doacross->shift_counts[0];
+      unsigned long cur;
+
+      va_start (ap, first);
+      for (i = 1; i < doacross->ncounts; i++)
+	flattened |= va_arg (ap, gomp_ull)
+		     << doacross->shift_counts[i];
+      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
+      if (flattened < cur)
+	{
+	  __atomic_thread_fence (MEMMODEL_RELEASE);
+	  va_end (ap);
+	  return;
+	}
+      doacross_spin (array, flattened, cur);
+      __atomic_thread_fence (MEMMODEL_RELEASE);
+      va_end (ap);
+      return;
+    }
+
+  if (sizeof (gomp_ull) == sizeof (unsigned long))
+    {
+      gomp_ull *array = (gomp_ull *) (doacross->array
+				      + ent * doacross->elt_sz);
+      do
+	{
+	  va_start (ap, first);
+	  for (i = 0; i < doacross->ncounts; i++)
+	    {
+	      gomp_ull thisv
+		= (i ? va_arg (ap, gomp_ull) : first) + 1;
+	      gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
+	      if (thisv < cur)
+		{
+		  i = doacross->ncounts;
+		  break;
+		}
+	      if (thisv > cur)
+		break;
+	    }
+	  va_end (ap);
+	  if (i == doacross->ncounts)
+	    break;
+	  cpu_relax ();
+	}
+      while (1);
+    }
+  else
+    {
+      unsigned long *array = (unsigned long *) (doacross->array
+						+ ent * doacross->elt_sz);
+      do
+	{
+	  va_start (ap, first);
+	  for (i = 0; i < doacross->ncounts; i++)
+	    {
+	      gomp_ull thisv
+		= (i ? va_arg (ap, gomp_ull) : first) + 1;
+	      unsigned long t
+		= thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
+	      unsigned long cur
+		= __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED);
+	      if (t < cur)
+		{
+		  i = doacross->ncounts;
+		  break;
+		}
+	      if (t > cur)
+		break;
+	      t = thisv;
+	      cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED);
+	      if (t < cur)
+		{
+		  i = doacross->ncounts;
+		  break;
+		}
+	      if (t > cur)
+		break;
+	    }
+	  va_end (ap);
+	  if (i == doacross->ncounts)
+	    break;
+	  cpu_relax ();
+	}
+      while (1);
+    }
+  __sync_synchronize ();
+}
--- libgomp/libgomp.map.jj	2015-09-18 18:12:29.000000000 +0200
+++ libgomp/libgomp.map	2015-10-01 17:03:55.649130579 +0200
@@ -280,6 +280,12 @@  GOMP_4.1 {
 	GOMP_loop_doacross_static_start;
 	GOMP_doacross_post;
 	GOMP_doacross_wait;
+	GOMP_loop_ull_doacross_dynamic_start;
+	GOMP_loop_ull_doacross_guided_start;
+	GOMP_loop_ull_doacross_runtime_start;
+	GOMP_loop_ull_doacross_static_start;
+	GOMP_doacross_ull_post;
+	GOMP_doacross_ull_wait;
 } GOMP_4.0.1;
 
 OACC_2.0 {
--- libgomp/libgomp_g.h.jj	2015-09-24 13:33:32.000000000 +0200
+++ libgomp/libgomp_g.h	2015-10-01 14:15:41.195121635 +0200
@@ -173,12 +173,34 @@  extern bool GOMP_loop_ull_ordered_guided
 extern bool GOMP_loop_ull_ordered_runtime_next (unsigned long long *,
 						unsigned long long *);
 
+extern bool GOMP_loop_ull_doacross_static_start (unsigned,
+						 unsigned long long *,
+						 unsigned long long,
+						 unsigned long long *,
+						 unsigned long long *);
+extern bool GOMP_loop_ull_doacross_dynamic_start (unsigned,
+						  unsigned long long *,
+						  unsigned long long,
+						  unsigned long long *,
+						  unsigned long long *);
+extern bool GOMP_loop_ull_doacross_guided_start (unsigned,
+						 unsigned long long *,
+						 unsigned long long,
+						 unsigned long long *,
+						 unsigned long long *);
+extern bool GOMP_loop_ull_doacross_runtime_start (unsigned,
+						  unsigned long long *,
+						  unsigned long long *,
+						  unsigned long long *);
+
 /* ordered.c */
 
 extern void GOMP_ordered_start (void);
 extern void GOMP_ordered_end (void);
 extern void GOMP_doacross_post (long *);
 extern void GOMP_doacross_wait (long, ...);
+extern void GOMP_doacross_ull_post (unsigned long long *);
+extern void GOMP_doacross_ull_wait (unsigned long long, ...);
 
 /* parallel.c */
 
--- libgomp/libgomp.h.jj	2015-09-23 12:25:51.000000000 +0200
+++ libgomp/libgomp.h	2015-10-01 16:24:59.005076396 +0200
@@ -84,10 +84,14 @@  struct gomp_doacross_work_share
     /* chunk_size copy, as ws->chunk_size is multiplied by incr for
        GFS_DYNAMIC.  */
     long chunk_size;
+    /* Likewise, but for ull implementation.  */
+    unsigned long long chunk_size_ull;
     /* For schedule(static,0) this is the number
        of iterations assigned to the last thread, i.e. number of
        iterations / number of threads.  */
     long q;
+    /* Likewise, but for ull implementation.  */
+    unsigned long long q_ull;
   };
   /* Size of each array entry (padded to cache line size).  */
   unsigned long elt_sz;
@@ -102,8 +106,12 @@  struct gomp_doacross_work_share
   /* These two are only used for schedule(static,0).  */
   /* This one is number of iterations % number of threads.  */
   long t;
-  /* And this one is cached t * (q + 1).  */
-  long boundary;
+  union {
+    /* And this one is cached t * (q + 1).  */
+    long boundary;
+    /* Likewise, but for the ull implementation.  */
+    unsigned long long boundary_ull;
+  };
   /* Array of shift counts for each dimension if they can be flattened.  */
   unsigned int shift_counts[];
 };
@@ -683,6 +691,8 @@  extern void gomp_ordered_static_init (vo
 extern void gomp_ordered_static_next (void);
 extern void gomp_ordered_sync (void);
 extern void gomp_doacross_init (unsigned, long *, long);
+extern void gomp_doacross_ull_init (unsigned, unsigned long long *,
+				    unsigned long long);
 
 /* parallel.c */
 
--- libgomp/testsuite/libgomp.c/doacross-2.c.jj	2015-10-02 09:36:19.575951751 +0200
+++ libgomp/testsuite/libgomp.c/doacross-2.c	2015-10-02 10:14:47.098868611 +0200
@@ -0,0 +1,225 @@ 
+extern void abort (void);
+
+#define N 256
+int a[N], b[N / 16][8][4], c[N / 32][8][8], g[N / 16][8][6];
+volatile int d, e;
+volatile unsigned long long f;
+
+int
+main ()
+{
+  unsigned long long i;
+  int j, k, l, m;
+  #pragma omp parallel private (l)
+  {
+    #pragma omp for schedule(static, 1) ordered (1) nowait
+    for (i = 1; i < N + f; i++)
+      {
+	#pragma omp atomic write
+	a[i] = 1;
+	#pragma omp ordered depend(sink: i - 1)
+	if (i > 1)
+	  {
+	    #pragma omp atomic read
+	    l = a[i - 1];
+	    if (l < 2)
+	      abort ();
+	  }
+	#pragma omp atomic write
+	a[i] = 2;
+	if (i < N - 1)
+	  {
+	    #pragma omp atomic read
+	    l = a[i + 1];
+	    if (l == 3)
+	      abort ();
+	  }
+	#pragma omp ordered depend(source)
+	#pragma omp atomic write
+	a[i] = 3;
+      }
+    #pragma omp for schedule(static, 0) ordered (3) nowait
+    for (i = 3; i < N / 16 - 1 + f; i++)
+      for (j = 0; j < 8; j += 2)
+	for (k = 1; k <= 3; k++)
+	  {
+	    #pragma omp atomic write
+	    b[i][j][k] = 1;
+	    #pragma omp ordered depend(sink: i, j - 2, k - 1) \
+				depend(sink: i - 2, j - 2, k + 1)
+	    #pragma omp ordered depend(sink: i - 3, j + 2, k - 2)
+	    if (j >= 2 && k > 1)
+	      {
+		#pragma omp atomic read
+		l = b[i][j - 2][k - 1];
+		if (l < 2)
+		  abort ();
+	      }
+	    #pragma omp atomic write
+	    b[i][j][k] = 2;
+	    if (i >= 5 && j >= 2 && k < 3)
+	      {
+		#pragma omp atomic read
+		l = b[i - 2][j - 2][k + 1];
+		if (l < 2)
+		  abort ();
+	      }
+	    if (i >= 6 && j < N / 16 - 3 && k == 3)
+	      {
+		#pragma omp atomic read
+		l = b[i - 3][j + 2][k - 2];
+		if (l < 2)
+		  abort ();
+	      }
+	    #pragma omp ordered depend(source)
+	    #pragma omp atomic write
+	    b[i][j][k] = 3;
+	  }
+#define A(n) int n;
+#define B(n) A(n##0) A(n##1) A(n##2) A(n##3)
+#define C(n) B(n##0) B(n##1) B(n##2) B(n##3)
+#define D(n) C(n##0) C(n##1) C(n##2) C(n##3)
+    D(m)
+#undef A
+    #pragma omp for collapse (2) ordered(61) schedule(dynamic, 15)
+    for (i = 2; i < N / 32 + f; i++)
+      for (j = 7; j > 1; j--)
+	for (k = 6; k >= 0; k -= 2)
+#define A(n) for (n = 4; n < 5; n++)
+	  D(m)
+#undef A
+	    {
+	      #pragma omp atomic write
+	      c[i][j][k] = 1;
+#define A(n) ,n
+#define E(n) C(n##0) C(n##1) C(n##2) B(n##30) B(n##31) A(n##320) A(n##321)
+	      #pragma omp ordered depend (sink: i, j, k + 2 E(m)) \
+				  depend (sink:i - 2, j + 1, k - 4 E(m)) \
+				  depend(sink: i - 1, j - 2, k - 2 E(m))
+	      if (k <= 4)
+		{
+		  l = c[i][j][k + 2];
+		  if (l < 2)
+		    abort ();
+		}
+	      #pragma omp atomic write
+	      c[i][j][k] = 2;
+	      if (i >= 4 && j < 7 && k >= 4)
+		{
+		  l = c[i - 2][j + 1][k - 4];
+		  if (l < 2)
+		    abort ();
+		}
+	      if (i >= 3 && j >= 4 && k >= 2)
+		{
+		  l = c[i - 1][j - 2][k - 2];
+		  if (l < 2)
+		    abort ();
+		}
+	      #pragma omp ordered depend (source)
+	      #pragma omp atomic write
+	      c[i][j][k] = 3;
+	    }
+    #pragma omp for schedule(static, 0) ordered (3) nowait
+    for (j = 0; j < N / 16 - 1; j++)
+      for (k = 0; k < 8; k += 2)
+	for (i = 3; i <= 5 + f; i++)
+	  {
+	    #pragma omp atomic write
+	    g[j][k][i] = 1;
+	    #pragma omp ordered depend(sink: j, k - 2, i - 1) \
+				depend(sink: j - 2, k - 2, i + 1)
+	    #pragma omp ordered depend(sink: j - 3, k + 2, i - 2)
+	    if (k >= 2 && i > 3)
+	      {
+		#pragma omp atomic read
+		l = g[j][k - 2][i - 1];
+		if (l < 2)
+		  abort ();
+	      }
+	    #pragma omp atomic write
+	    g[j][k][i] = 2;
+	    if (j >= 2 && k >= 2 && i < 5)
+	      {
+		#pragma omp atomic read
+		l = g[j - 2][k - 2][i + 1];
+		if (l < 2)
+		  abort ();
+	      }
+	    if (j >= 3 && k < N / 16 - 3 && i == 5)
+	      {
+		#pragma omp atomic read
+		l = g[j - 3][k + 2][i - 2];
+		if (l < 2)
+		  abort ();
+	      }
+	    #pragma omp ordered depend(source)
+	    #pragma omp atomic write
+	    g[j][k][i] = 3;
+	  }
+    #pragma omp for collapse(2) ordered(4) lastprivate (i, j, k)
+    for (i = 2; i < f + 3; i++)
+      for (j = d + 1; j >= 0; j--)
+	for (k = 0; k < d; k++)
+	  for (l = 0; l < d + 2; l++)
+	    {
+	      #pragma omp ordered depend (source)
+	      #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, l)
+	      if (!e)
+		abort ();
+	    }
+    #pragma omp single
+    {
+      if (i != 3 || j != -1 || k != 0)
+	abort ();
+      i = 8; j = 9; k = 10;
+    }
+    #pragma omp for collapse(2) ordered(4) lastprivate (i, j, k, m)
+    for (i = 2; i < f + 3; i++)
+      for (j = d + 1; j >= 0; j--)
+	for (k = 0; k < d + 2; k++)
+	  for (m = 0; m < d; m++)
+	    {
+	      #pragma omp ordered depend (source)
+	      #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, m)
+	      abort ();
+	    }
+    #pragma omp single
+    if (i != 3 || j != -1 || k != 2 || m != 0)
+      abort ();
+    #pragma omp for collapse(2) ordered(4) nowait
+    for (i = 2; i < f + 3; i++)
+      for (j = d; j > 0; j--)
+	for (k = 0; k < d + 2; k++)
+	  for (l = 0; l < d + 4; l++)
+	    {
+	      #pragma omp ordered depend (source)
+	      #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, l)
+	      if (!e)
+		abort ();
+	    }
+    #pragma omp for nowait
+    for (i = 0; i < N; i++)
+      if (a[i] != 3 * (i >= 1))
+	abort ();
+    #pragma omp for collapse(2) private(k) nowait
+    for (i = 0; i < N / 16; i++)
+      for (j = 0; j < 8; j++)
+	for (k = 0; k < 4; k++)
+	  if (b[i][j][k] != 3 * (i >= 3 && i < N / 16 - 1 && (j & 1) == 0 && k >= 1))
+	    abort ();
+    #pragma omp for collapse(3) nowait
+    for (i = 0; i < N / 32; i++)
+      for (j = 0; j < 8; j++)
+	for (k = 0; k < 8; k++)
+	  if (c[i][j][k] != 3 * (i >= 2 && j >= 2 && (k & 1) == 0))
+	    abort ();
+    #pragma omp for collapse(2) private(k) nowait
+    for (i = 0; i < N / 16; i++)
+      for (j = 0; j < 8; j++)
+	for (k = 0; k < 6; k++)
+	  if (g[i][j][k] != 3 * (i < N / 16 - 1 && (j & 1) == 0 && k >= 3))
+	    abort ();
+  }
+  return 0;
+}