
[5/6] OpenMP 4.0 library part

Message ID 20131008195424.GD30970@tucnak.zalov.cz
State New

Commit Message

Jakub Jelinek Oct. 8, 2013, 7:54 p.m. UTC
Hi!

2013-10-08  Jakub Jelinek  <jakub@redhat.com>
	    Tobias Burnus  <burnus@net-b.de>
	    Richard Henderson  <rth@redhat.com>

libgomp/
	* target.c: New file.
	* Makefile.am (libgomp_la_SOURCES): Add target.c.
	* Makefile.in: Regenerated.
	* libgomp_g.h (GOMP_task): Add depend argument.
	(GOMP_barrier_cancel, GOMP_loop_end_cancel,
	GOMP_sections_end_cancel, GOMP_target, GOMP_target_data,
	GOMP_target_end_data, GOMP_target_update, GOMP_teams,
	GOMP_parallel_loop_static, GOMP_parallel_loop_dynamic,
	GOMP_parallel_loop_guided, GOMP_parallel_loop_runtime,
	GOMP_parallel, GOMP_cancel, GOMP_cancellation_point,
	GOMP_taskgroup_start, GOMP_taskgroup_end,
	GOMP_parallel_sections): New prototypes.
	* fortran.c (omp_is_initial_device): Add ialias_redirect.
	(omp_is_initial_device_): New function.
	(ULP, STR1, STR2, ialias_redirect): Removed.
	(omp_get_cancellation_, omp_get_proc_bind_, omp_set_default_device_,
	omp_set_default_device_8_, omp_get_default_device_,
	omp_get_num_devices_, omp_get_num_teams_, omp_get_team_num_): New
	functions.
	* libgomp.map (GOMP_barrier_cancel, GOMP_loop_end_cancel,
	GOMP_sections_end_cancel, GOMP_target, GOMP_target_data,
	GOMP_target_end_data, GOMP_target_update, GOMP_teams): Export
	@@GOMP_4.0.
	(omp_is_initial_device, omp_is_initial_device_, omp_get_cancellation,
	omp_get_cancellation_, omp_get_proc_bind, omp_get_proc_bind_,
	omp_set_default_device, omp_set_default_device_,
	omp_set_default_device_8_, omp_get_default_device,
	omp_get_default_device_, omp_get_num_devices, omp_get_num_devices_,
	omp_get_num_teams, omp_get_num_teams_, omp_get_team_num,
	omp_get_team_num_): Export @@OMP_4.0.
	* team.c (struct gomp_thread_start_data): Add place field.
	(gomp_thread_start): Clear thr->thread_pool and
	thr->task before returning.  Use gomp_team_barrier_wait_final
	instead of gomp_team_barrier_wait.  Initialize thr->place.
	(gomp_new_team): Initialize work_shares_to_free, work_share_cancelled,
	team_cancelled and task_queued_count fields.
	(gomp_free_pool_helper): Clear thr->thread_pool and thr->task
	before calling pthread_exit.
	(gomp_free_thread): No longer static.  Use
	gomp_managed_threads_lock instead of gomp_remaining_threads_lock.
	(gomp_team_start): Add flags argument.  Set
	thr->thread_pool->threads_busy to nthreads immediately after creating
	new pool.  Use gomp_managed_threads_lock instead of
	gomp_remaining_threads_lock.  Handle OpenMP 4.0 affinity.
	(gomp_team_end): Use gomp_managed_threads_lock instead of
	gomp_remaining_threads_lock.  Use gomp_team_barrier_wait_final instead
	of gomp_team_barrier_wait.  If team->team_cancelled, call
	gomp_fini_work_share on ws chain starting at team->work_shares_to_free
	rather than thr->ts.work_share.
	(initialize_team): Don't call gomp_sem_init here.
	* sections.c (GOMP_parallel_sections_start): Adjust gomp_team_start
	caller.
	(GOMP_parallel_sections, GOMP_sections_end_cancel): New functions.
	* env.c (gomp_global_icv): Add default_device_var, target_data and
	bind_var initializers.
	(gomp_cpu_affinity, gomp_cpu_affinity_len): Remove.
	(gomp_bind_var_list, gomp_bind_var_list_len, gomp_places_list,
	gomp_places_list_len): New variables.
	(parse_bind_var, parse_one_place, parse_places_var): New functions.
	(parse_affinity): Rewritten to construct OMP_PLACES list with
	unit-sized places.
	(gomp_cancel_var): New global variable.
	(parse_int): New function.
	(handle_omp_display_env): New function.
	(initialize_env): Use it.  Initialize default_device_var.
	Parse OMP_CANCELLATION env var.  Use parse_bind_var to parse
	OMP_PROC_BIND instead of parse_boolean.  Use parse_places_var for
	OMP_PLACES parsing.  Don't call parse_affinity if OMP_PLACES has
	been successfully parsed (and call gomp_init_affinity in that case).
	(omp_get_cancellation, omp_get_proc_bind, omp_set_default_device,
	omp_get_default_device, omp_get_num_devices, omp_get_num_teams,
	omp_get_team_num, omp_is_initial_device): New functions.
	* libgomp.h: Include stdlib.h.
	(ialias_ulp, ialias_str1, ialias_str2, ialias_redirect, ialias_call):
	Define.
	(struct target_mem_desc): Forward declare.
	(struct gomp_task_icv): Add default_device_var, target_data, bind_var
	and thread_limit_var fields.
	(gomp_get_num_devices): New prototype.
	(gomp_cancel_var): New extern decl.
	(struct gomp_team): Add work_shares_to_free, work_share_cancelled,
	team_cancelled and task_queued_count fields.  Add comments about
	task_{,queued_,running_}count.
	(gomp_cancel_kind): New enum.
	(gomp_work_share_end_cancel): New prototype.
	(struct gomp_task): Add next_taskgroup, prev_taskgroup, taskgroup,
	copy_ctors_done, dependers, depend_hash, depend_count, num_dependees
	and depend fields.
	(struct gomp_taskgroup): New type.
	(struct gomp_task_depend_entry,
	struct gomp_dependers_vec): New types.
	(gomp_finish_task): Free depend_hash if non-NULL.
	(struct gomp_team_state): Add place_partition_off
	and place_partition_len fields.
	(gomp_bind_var_list, gomp_bind_var_list_len, gomp_places_list,
	gomp_places_list_len): New extern decls.
	(struct gomp_thread): Add place field.
	(gomp_cpu_affinity, gomp_cpu_affinity_len): Remove.
	(gomp_init_thread_affinity): Add place argument.
	(gomp_affinity_alloc, gomp_affinity_init_place, gomp_affinity_add_cpus,
	gomp_affinity_remove_cpu, gomp_affinity_copy_place,
	gomp_affinity_same_place, gomp_affinity_finalize_place_list,
	gomp_affinity_init_level, gomp_affinity_print_place): New
	prototypes.
	(gomp_team_start): Add flags argument.
	(gomp_thread_limit_var, gomp_remaining_threads_count,
	gomp_remaining_threads_lock): Remove.
	(gomp_managed_threads_lock): New variable.
	(struct gomp_thread_pool): Add threads_busy field.
	(gomp_free_thread): New prototype.
	* task.c: Include hashtab.h.
	(hash_entry_type): New typedef.
	(htab_alloc, htab_free, htab_hash, htab_eq): New inlines.
	(gomp_init_task): Clear dependers, depend_hash, depend_count,
	copy_ctors_done and taskgroup fields.
	(GOMP_task): Add depend argument, handle depend clauses.  If
	gomp_team_barrier_cancelled or if its taskgroup has been
	cancelled, don't queue or start new tasks.  Set copy_ctors_done
	field if needed.  Initialize taskgroup field.  If copy_ctors_done
	and already cancelled, don't discard the task.  If taskgroup is
	non-NULL, enqueue the task into taskgroup queue.  Increment
	num_children field in taskgroup.  Increment task_queued_count.
	(gomp_task_run_pre, gomp_task_run_post_remove_parent,
	gomp_task_run_post_remove_taskgroup): New inline functions.
	(gomp_task_run_post_handle_depend_hash,
	gomp_task_run_post_handle_dependers,
	gomp_task_run_post_handle_depend): New functions.
	(GOMP_taskwait): Use them.  If more than one new task
	has been queued, wake other threads if needed.
	(gomp_barrier_handle_tasks): Likewise.  If
	gomp_team_barrier_cancelled, don't start any new tasks, just free
	all tasks.
	(GOMP_taskgroup_start, GOMP_taskgroup_end): New functions.
	* omp_lib.f90.in
	(omp_proc_bind_kind, omp_proc_bind_false,
	omp_proc_bind_true, omp_proc_bind_master, omp_proc_bind_close,
	omp_proc_bind_spread): New params.
	(omp_get_cancellation, omp_get_proc_bind, omp_set_default_device,
	omp_get_default_device, omp_get_num_devices, omp_get_num_teams,
	omp_get_team_num, omp_is_initial_device): New interfaces.
	(omp_get_dynamic, omp_get_nested, omp_in_parallel,
	omp_get_max_threads, omp_get_num_procs, omp_get_num_threads,
	omp_get_thread_num, omp_get_thread_limit, omp_set_max_active_levels,
	omp_get_max_active_levels, omp_get_level, omp_get_ancestor_thread_num,
	omp_get_team_size, omp_get_active_level, omp_in_final): Remove
	useless use omp_lib_kinds.
	* omp.h.in (omp_proc_bind_t): New typedef.
	(omp_get_cancellation, omp_get_proc_bind, omp_set_default_device,
	omp_get_default_device, omp_get_num_devices, omp_get_num_teams,
	omp_get_team_num, omp_is_initial_device): New prototypes.
	* loop.c (gomp_parallel_loop_start): Add flags argument, pass it
	through to gomp_team_start.
	(GOMP_parallel_loop_static_start, GOMP_parallel_loop_dynamic_start,
	GOMP_parallel_loop_guided_start, GOMP_parallel_loop_runtime_start):
	Adjust gomp_parallel_loop_start callers.
	(GOMP_parallel_loop_static, GOMP_parallel_loop_dynamic,
	GOMP_parallel_loop_guided, GOMP_parallel_loop_runtime,
	GOMP_loop_end_cancel): New functions.
	(GOMP_parallel_end): Add ialias_redirect.
	* hashtab.h: New file.
	* libgomp.texi (Environment Variables): Minor cleanup,
	update section refs to OpenMP 4.0rc2.
	(OMP_DISPLAY_ENV, GOMP_SPINCOUNT): Document these
	environment variables.
	* work.c (gomp_work_share_end, gomp_work_share_end_nowait): Set
	team->work_shares_to_free to thr->ts.work_share before calling
	free_work_share.
	(gomp_work_share_end_cancel): New function.
	* config/linux/proc.c: Include errno.h.
	(gomp_get_cpuset_size, gomp_cpuset_size, gomp_cpusetp): New variables.
	(gomp_cpuset_popcount): Add cpusetsize argument, use it instead of
	sizeof (cpu_set_t) to determine number of iterations.  Fix up check
	extern decl.  Use CPU_COUNT_S if available, or CPU_COUNT if
	gomp_cpuset_size is sizeof (cpu_set_t).
	(gomp_init_num_threads): Initialize gomp_cpuset_size,
	gomp_get_cpuset_size and gomp_cpusetp here, use gomp_cpusetp instead
	of &cpuset and pass gomp_cpuset_size instead of sizeof (cpu_set_t)
	to pthread_getaffinity_np.  Free and clear gomp_cpusetp if it didn't
	contain any logical CPUs.
	(get_num_procs): Don't call pthread_getaffinity_np if gomp_cpusetp
	is NULL.  Use gomp_cpusetp instead of &cpuset and pass
	gomp_get_cpuset_size instead of sizeof (cpu_set_t) to
	pthread_getaffinity_np.  Check gomp_places_list instead of
	gomp_cpu_affinity.  Adjust gomp_cpuset_popcount caller.
	* config/linux/bar.c (gomp_barrier_wait_end,
	gomp_barrier_wait_last): Use BAR_* defines.
	(gomp_team_barrier_wait_end): Likewise.  Clear BAR_CANCELLED
	from state where needed.  Set work_share_cancelled to 0 on last
	thread.
	(gomp_team_barrier_wait_final, gomp_team_barrier_wait_cancel_end,
	gomp_team_barrier_wait_cancel, gomp_team_barrier_cancel): New
	functions.
	* config/linux/proc.h (gomp_cpuset_popcount): Add attribute_hidden.
	Add cpusetsize argument.
	(gomp_cpuset_size, gomp_cpusetp): Declare.
	* config/linux/affinity.c: Include errno.h, stdio.h and string.h.
	(affinity_counter): Remove.
	(CPU_ISSET_S, CPU_ZERO_S, CPU_SET_S, CPU_CLR_S): Define
	if CPU_ALLOC_SIZE isn't defined.
	(gomp_init_affinity): Rewritten.  If gomp_places_list is NULL, try
	to silently create OMP_PLACES=threads; if it is non-NULL afterwards,
	bind the current thread to the first place.
	(gomp_init_thread_affinity): Rewritten.  Add place argument, just
	call pthread_setaffinity_np with gomp_places_list[place].
	(gomp_affinity_alloc, gomp_affinity_init_place, gomp_affinity_add_cpus,
	gomp_affinity_remove_cpu, gomp_affinity_copy_place,
	gomp_affinity_same_place, gomp_affinity_finalize_place_list,
	gomp_affinity_init_level, gomp_affinity_print_place): New functions.
	* config/linux/bar.h (BAR_TASK_PENDING, BAR_WAS_LAST,
	BAR_WAITING_FOR_TASK, BAR_INCR, BAR_CANCELLED): Define.
	(gomp_barrier_t): Add awaited_final field.
	(gomp_barrier_init): Initialize awaited_final field.
	(gomp_team_barrier_wait_final, gomp_team_barrier_wait_cancel,
	gomp_team_barrier_wait_cancel_end, gomp_team_barrier_cancel): New
	prototypes.
	(gomp_barrier_wait_start): Preserve BAR_CANCELLED bit.  Use BAR_*
	defines.
	(gomp_barrier_wait_cancel_start, gomp_team_barrier_wait_final_start,
	gomp_team_barrier_cancelled): New inline functions.
	(gomp_barrier_last_thread,
	gomp_team_barrier_set_task_pending,
	gomp_team_barrier_clear_task_pending,
	gomp_team_barrier_set_waiting_for_tasks,
	gomp_team_barrier_waiting_for_tasks,
	gomp_team_barrier_done): Use BAR_* defines.
	* config/posix/bar.c (gomp_barrier_init): Clear cancellable field.
	(gomp_barrier_wait_end): Use BAR_* defines.
	(gomp_team_barrier_wait_end): Clear BAR_CANCELLED from state.
	Set work_share_cancelled to 0 on last thread, use __atomic_load_n.
	Use BAR_* defines.
	(gomp_team_barrier_wait_cancel_end, gomp_team_barrier_wait_cancel,
	gomp_team_barrier_cancel): New functions.
	* config/posix/affinity.c (gomp_init_thread_affinity): Add place
	argument.
	(gomp_affinity_alloc, gomp_affinity_init_place, gomp_affinity_add_cpus,
	gomp_affinity_remove_cpu, gomp_affinity_copy_place,
	gomp_affinity_same_place, gomp_affinity_finalize_place_list,
	gomp_affinity_init_level, gomp_affinity_print_place): New stubs.
	* config/posix/bar.h (BAR_TASK_PENDING, BAR_WAS_LAST,
	BAR_WAITING_FOR_TASK, BAR_INCR, BAR_CANCELLED): Define.
	(gomp_barrier_t): Add cancellable field.
	(gomp_team_barrier_wait_cancel, gomp_team_barrier_wait_cancel_end,
	gomp_team_barrier_cancel): New prototypes.
	(gomp_barrier_wait_start): Preserve BAR_CANCELLED bit.
	(gomp_barrier_wait_cancel_start, gomp_team_barrier_wait_final,
	gomp_team_barrier_cancelled): New inline functions.
	(gomp_barrier_wait_start, gomp_barrier_last_thread,
	gomp_team_barrier_set_task_pending,
	gomp_team_barrier_clear_task_pending,
	gomp_team_barrier_set_waiting_for_tasks,
	gomp_team_barrier_waiting_for_tasks,
	gomp_team_barrier_done): Use BAR_* defines.
	* barrier.c (GOMP_barrier_cancel): New function.
	* omp_lib.h.in (omp_proc_bind_kind, omp_proc_bind_false,
	omp_proc_bind_true, omp_proc_bind_master, omp_proc_bind_close,
	omp_proc_bind_spread): New params.
	(omp_get_cancellation, omp_get_proc_bind, omp_set_default_device,
	omp_get_default_device, omp_get_num_devices, omp_get_num_teams,
	omp_get_team_num, omp_is_initial_device): New externals.
	* parallel.c (GOMP_parallel, GOMP_cancel, GOMP_cancellation_point):
	New functions.
	(gomp_resolve_num_threads): Adjust for thread_limit now being in
	icv->thread_limit_var.  Use UINT_MAX instead of ULONG_MAX as
	infinity.  If not nested, just return the minimum of max_num_threads
	and icv->thread_limit_var and, if thr->thread_pool, set threads_busy
	to the returned value.  Otherwise, atomically update
	thr->thread_pool->threads_busy instead of gomp_remaining_threads_count.
	(GOMP_parallel_end): Adjust for thread_limit now being in
	icv->thread_limit_var.  Use UINT_MAX instead of ULONG_MAX as
	infinity.  Adjust threads_busy in the pool rather than
	gomp_remaining_threads_count.  Remember team->nthreads and call
	gomp_team_end before adjusting threads_busy; if not nested,
	afterwards just set it to 1 non-atomically.  Add ialias.
	(GOMP_parallel_start): Adjust gomp_team_start caller.
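
To illustrate the new entry points, here are a couple of minimal sketches
(illustration only, not part of the patch; file names and output are made
up, and a compiler with the matching OpenMP 4.0 lowering is assumed).
With OMP_CANCELLATION=true in the environment, #pragma omp cancel parallel
is lowered to the new GOMP_cancel entry point, #pragma omp cancellation
point to GOMP_cancellation_point, and the implicit barrier at the end of
the region to GOMP_barrier_cancel:

#include <omp.h>
#include <stdio.h>

int main (void)
{
  /* omp_get_cancellation reflects the new OMP_CANCELLATION env var.  */
  if (!omp_get_cancellation ())
    printf ("cancellation disabled; run with OMP_CANCELLATION=true\n");

  #pragma omp parallel
  {
    int i;
    if (omp_get_thread_num () == 0)
      {
        #pragma omp cancel parallel              /* -> GOMP_cancel */
      }
    for (i = 0; i < 1000; i++)
      {
        #pragma omp cancellation point parallel  /* -> GOMP_cancellation_point */
      }
  }  /* implicit barrier -> GOMP_barrier_cancel */
  printf ("parallel region finished\n");
  return 0;
}

Similarly, the new task depend clause and taskgroup construct map onto the
extended GOMP_task (new depend argument) and the new GOMP_taskgroup_start
and GOMP_taskgroup_end entry points:

#include <stdio.h>

int main (void)
{
  int x = 0;
  #pragma omp parallel
  #pragma omp single
  {
    #pragma omp taskgroup                /* -> GOMP_taskgroup_start */
    {
      #pragma omp task depend(out: x)
      x = 42;                            /* runs first */
      #pragma omp task depend(in: x)
      printf ("x = %d\n", x);            /* scheduled after the out task */
    }                                    /* -> GOMP_taskgroup_end, waits for both */
  }
  return 0;
}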


	Jakub

Patch

--- libgomp/Makefile.am	(.../trunk)	(revision 203241)
+++ libgomp/Makefile.am	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -60,7 +60,7 @@  libgomp_la_LINK = $(LINK) $(libgomp_la_L
 libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \
 	iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c single.c \
 	task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \
-	time.c fortran.c affinity.c
+	time.c fortran.c affinity.c target.c
 
 nodist_noinst_HEADERS = libgomp_f.h
 nodist_libsubinclude_HEADERS = omp.h
--- libgomp/Makefile.in	(.../trunk)	(revision 203241)
+++ libgomp/Makefile.in	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -96,7 +96,7 @@  am_libgomp_la_OBJECTS = alloc.lo barrier
 	error.lo iter.lo iter_ull.lo loop.lo loop_ull.lo ordered.lo \
 	parallel.lo sections.lo single.lo task.lo team.lo work.lo \
 	lock.lo mutex.lo proc.lo sem.lo bar.lo ptrlock.lo time.lo \
-	fortran.lo affinity.lo
+	fortran.lo affinity.lo target.lo
 libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS)
 DEFAULT_INCLUDES = -I.@am__isrc@
 depcomp = $(SHELL) $(top_srcdir)/../depcomp
@@ -317,7 +317,7 @@  libgomp_la_LINK = $(LINK) $(libgomp_la_L
 libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \
 	iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c single.c \
 	task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \
-	time.c fortran.c affinity.c
+	time.c fortran.c affinity.c target.c
 
 nodist_noinst_HEADERS = libgomp_f.h
 nodist_libsubinclude_HEADERS = omp.h
@@ -474,6 +474,7 @@  distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sections.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sem.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/single.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/target.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/task.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/team.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/time.Plo@am__quote@
--- libgomp/barrier.c	(.../trunk)	(revision 203241)
+++ libgomp/barrier.c	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -39,3 +39,15 @@  GOMP_barrier (void)
 
   gomp_team_barrier_wait (&team->barrier);
 }
+
+bool
+GOMP_barrier_cancel (void)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+
+  /* The compiler transforms to barrier_cancel when it sees that the
+     barrier is within a construct that can cancel.  Thus we should
+     never have an orphaned cancellable barrier.  */
+  return gomp_team_barrier_wait_cancel (&team->barrier);
+}
--- libgomp/config/linux/affinity.c	(.../trunk)	(revision 203241)
+++ libgomp/config/linux/affinity.c	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -29,90 +29,327 @@ 
 #endif
 #include "libgomp.h"
 #include "proc.h"
+#include <errno.h>
 #include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
 #include <unistd.h>
 
 #ifdef HAVE_PTHREAD_AFFINITY_NP
 
-static unsigned int affinity_counter;
+#ifndef CPU_ALLOC_SIZE
+#define CPU_ISSET_S(idx, size, set) CPU_ISSET(idx, set)
+#define CPU_ZERO_S(size, set) CPU_ZERO(set)
+#define CPU_SET_S(idx, size, set) CPU_SET(idx, set)
+#define CPU_CLR_S(idx, size, set) CPU_CLR(idx, set)
+#endif
 
 void
 gomp_init_affinity (void)
 {
-  cpu_set_t cpuset, cpusetnew;
-  size_t idx, widx;
-  unsigned long cpus = 0;
-
-  if (pthread_getaffinity_np (pthread_self (), sizeof (cpuset), &cpuset))
-    {
-      gomp_error ("could not get CPU affinity set");
-      free (gomp_cpu_affinity);
-      gomp_cpu_affinity = NULL;
-      gomp_cpu_affinity_len = 0;
-      return;
-    }
-
-  CPU_ZERO (&cpusetnew);
-  if (gomp_cpu_affinity_len == 0)
-    {
-      unsigned long count = gomp_cpuset_popcount (&cpuset);
-      if (count >= 65536)
-	count = 65536;
-      gomp_cpu_affinity = malloc (count * sizeof (unsigned short));
-      if (gomp_cpu_affinity == NULL)
+  if (gomp_places_list == NULL)
+    {
+      if (!gomp_affinity_init_level (1, ULONG_MAX, true))
+	return;
+    }
+
+  struct gomp_thread *thr = gomp_thread ();
+  pthread_setaffinity_np (pthread_self (), gomp_cpuset_size,
+			  (cpu_set_t *) gomp_places_list[0]);
+  thr->place = 1;
+  thr->ts.place_partition_off = 0;
+  thr->ts.place_partition_len = gomp_places_list_len;
+}
+
+void
+gomp_init_thread_affinity (pthread_attr_t *attr, unsigned int place)
+{
+  pthread_attr_setaffinity_np (attr, gomp_cpuset_size,
+			       (cpu_set_t *) gomp_places_list[place]);
+}
+
+void **
+gomp_affinity_alloc (unsigned long count, bool quiet)
+{
+  unsigned long i;
+  void **ret;
+  char *p;
+
+  if (gomp_cpusetp == NULL)
+    {
+      if (!quiet)
+	gomp_error ("Could not get CPU affinity set");
+      return NULL;
+    }
+
+  ret = malloc (count * sizeof (void *) + count * gomp_cpuset_size);
+  if (ret == NULL)
+    {
+      if (!quiet)
+	gomp_error ("Out of memory trying to allocate places list");
+      return NULL;
+    }
+
+  p = (char *) (ret + count);
+  for (i = 0; i < count; i++, p += gomp_cpuset_size)
+    ret[i] = p;
+  return ret;
+}
+
+void
+gomp_affinity_init_place (void *p)
+{
+  cpu_set_t *cpusetp = (cpu_set_t *) p;
+  CPU_ZERO_S (gomp_cpuset_size, cpusetp);
+}
+
+bool
+gomp_affinity_add_cpus (void *p, unsigned long num,
+			unsigned long len, long stride, bool quiet)
+{
+  cpu_set_t *cpusetp = (cpu_set_t *) p;
+  unsigned long max = 8 * gomp_cpuset_size;
+  for (;;)
+    {
+      if (num >= max)
 	{
-	  gomp_error ("not enough memory to store CPU affinity list");
-	  return;
+	  if (!quiet)
+	    gomp_error ("Logical CPU number %lu out of range", num);
+	  return false;
 	}
-      for (widx = idx = 0; widx < count && idx < 65536; idx++)
-	if (CPU_ISSET (idx, &cpuset))
+      CPU_SET_S (num, gomp_cpuset_size, cpusetp);
+      if (--len == 0)
+	return true;
+      if ((stride < 0 && num + stride > num)
+	  || (stride > 0 && num + stride < num))
+	{
+	  if (!quiet)
+	    gomp_error ("Logical CPU number %lu+%ld out of range",
+			num, stride);
+	  return false;
+	}
+      num += stride;
+    }
+}
+
+bool
+gomp_affinity_remove_cpu (void *p, unsigned long num)
+{
+  cpu_set_t *cpusetp = (cpu_set_t *) p;
+  if (num >= 8 * gomp_cpuset_size)
+    {
+      gomp_error ("Logical CPU number %lu out of range", num);
+      return false;
+    }
+  if (!CPU_ISSET_S (num, gomp_cpuset_size, cpusetp))
+    {
+      gomp_error ("Logical CPU %lu to be removed is not in the set", num);
+      return false;
+    }
+  CPU_CLR_S (num, gomp_cpuset_size, cpusetp);
+  return true;
+}
+
+bool
+gomp_affinity_copy_place (void *p, void *q, long stride)
+{
+  unsigned long i, max = 8 * gomp_cpuset_size;
+  cpu_set_t *destp = (cpu_set_t *) p;
+  cpu_set_t *srcp = (cpu_set_t *) q;
+
+  CPU_ZERO_S (gomp_cpuset_size, destp);
+  for (i = 0; i < max; i++)
+    if (CPU_ISSET_S (i, gomp_cpuset_size, srcp))
+      {
+	if ((stride < 0 && i + stride > i)
+	    || (stride > 0 && (i + stride < i || i + stride >= max)))
+	  {
+	    gomp_error ("Logical CPU number %lu+%ld out of range", i, stride);
+	    return false;
+	  }
+	CPU_SET_S (i + stride, gomp_cpuset_size, destp);
+      }
+  return true;
+}
+
+bool
+gomp_affinity_same_place (void *p, void *q)
+{
+#ifdef CPU_EQUAL_S
+  return CPU_EQUAL_S (gomp_cpuset_size, (cpu_set_t *) p, (cpu_set_t *) q);
+#else
+  return memcmp (p, q, gomp_cpuset_size) == 0;
+#endif
+}
+
+bool
+gomp_affinity_finalize_place_list (bool quiet)
+{
+  unsigned long i, j;
+
+  for (i = 0, j = 0; i < gomp_places_list_len; i++)
+    {
+      cpu_set_t *cpusetp = (cpu_set_t *) gomp_places_list[i];
+      bool nonempty = false;
+#ifdef CPU_AND_S
+      CPU_AND_S (gomp_cpuset_size, cpusetp, cpusetp, gomp_cpusetp);
+      nonempty = gomp_cpuset_popcount (gomp_cpuset_size, cpusetp) != 0;
+#else
+      unsigned long k, max = gomp_cpuset_size / sizeof (cpusetp->__bits[0]);
+      for (k = 0; k < max; k++)
+	if ((cpusetp->__bits[k] &= gomp_cpusetp->__bits[k]) != 0)
+	  nonempty = true;
+#endif
+      if (nonempty)
+	gomp_places_list[j++] = gomp_places_list[i];
+    }
+
+  if (j == 0)
+    {
+      if (!quiet)
+	gomp_error ("None of the places contain usable logical CPUs");
+      return false;
+    }
+  else if (j < gomp_places_list_len)
+    {
+      if (!quiet)
+	gomp_error ("Number of places reduced from %ld to %ld because some "
+		    "places didn't contain any usable logical CPUs",
+		    gomp_places_list_len, j);
+      gomp_places_list_len = j;
+    }
+  return true;
+}
+
+bool
+gomp_affinity_init_level (int level, unsigned long count, bool quiet)
+{
+  unsigned long i, max = 8 * gomp_cpuset_size;
+
+  if (gomp_cpusetp)
+    {
+      unsigned long maxcount
+	= gomp_cpuset_popcount (gomp_cpuset_size, gomp_cpusetp);
+      if (count > maxcount)
+	count = maxcount;
+    }
+  gomp_places_list = gomp_affinity_alloc (count, quiet);
+  gomp_places_list_len = 0;
+  if (gomp_places_list == NULL)
+    return false;
+  /* SMT (threads).  */
+  if (level == 1)
+    {
+      for (i = 0; i < max && gomp_places_list_len < count; i++)
+	if (CPU_ISSET_S (i, gomp_cpuset_size, gomp_cpusetp))
 	  {
-	    cpus++;
-	    gomp_cpu_affinity[widx++] = idx;
+	    gomp_affinity_init_place (gomp_places_list[gomp_places_list_len]);
+	    gomp_affinity_add_cpus (gomp_places_list[gomp_places_list_len],
+				    i, 1, 0, true);
+	    ++gomp_places_list_len;
 	  }
+      return true;
     }
   else
-    for (widx = idx = 0; idx < gomp_cpu_affinity_len; idx++)
-      if (gomp_cpu_affinity[idx] < CPU_SETSIZE
-	  && CPU_ISSET (gomp_cpu_affinity[idx], &cpuset))
+    {
+      char name[sizeof ("/sys/devices/system/cpu/cpu/topology/"
+			"thread_siblings_list") + 3 * sizeof (unsigned long)];
+      size_t prefix_len = sizeof ("/sys/devices/system/cpu/cpu") - 1;
+      cpu_set_t *copy = gomp_alloca (gomp_cpuset_size);
+      FILE *f;
+      char *line = NULL;
+      size_t linelen = 0;
+
+      memcpy (name, "/sys/devices/system/cpu/cpu", prefix_len);
+      memcpy (copy, gomp_cpusetp, gomp_cpuset_size);
+      for (i = 0; i < max && gomp_places_list_len < count; i++)
+	if (CPU_ISSET_S (i, gomp_cpuset_size, copy))
+	  {
+	    sprintf (name + prefix_len, "%lu/topology/%s_siblings_list",
+		     i, level == 2 ? "thread" : "core");
+	    f = fopen (name, "r");
+	    if (f != NULL)
+	      {
+		if (getline (&line, &linelen, f) > 0)
+		  {
+		    char *p = line;
+		    bool seen_i = false;
+		    void *pl = gomp_places_list[gomp_places_list_len];
+		    gomp_affinity_init_place (pl);
+		    while (*p && *p != '\n')
+		      {
+			unsigned long first, last;
+			errno = 0;
+			first = strtoul (p, &p, 10);
+			if (errno)
+			  break;
+			last = first;
+			if (*p == '-')
+			  {
+			    errno = 0;
+			    last = strtoul (p + 1, &p, 10);
+			    if (errno || last < first)
+			      break;
+			  }
+			for (; first <= last; first++)
+			  if (CPU_ISSET_S (first, gomp_cpuset_size, copy)
+			      && gomp_affinity_add_cpus (pl, first, 1, 0,
+							 true))
+			    {
+			      CPU_CLR_S (first, gomp_cpuset_size, copy);
+			      if (first == i)
+				seen_i = true;
+			    }
+			if (*p == ',')
+			  ++p;
+		      }
+		    if (seen_i)
+		      gomp_places_list_len++;
+		  }
+		fclose (f);
+	      }
+	  }
+      if (gomp_places_list == 0)
 	{
-	  if (! CPU_ISSET (gomp_cpu_affinity[idx], &cpusetnew))
-	    {
-	      cpus++;
-	      CPU_SET (gomp_cpu_affinity[idx], &cpusetnew);
-	    }
-	  gomp_cpu_affinity[widx++] = gomp_cpu_affinity[idx];
+	  if (!quiet)
+	    gomp_error ("Error reading %s topology",
+			level == 2 ? "core" : "socket");
+	  free (gomp_places_list);
+	  gomp_places_list = NULL;
+	  return false;
 	}
-
-  if (widx == 0)
-    {
-      gomp_error ("no CPUs left for affinity setting");
-      free (gomp_cpu_affinity);
-      gomp_cpu_affinity = NULL;
-      gomp_cpu_affinity_len = 0;
-      return;
-    }
-
-  gomp_cpu_affinity_len = widx;
-  if (cpus < gomp_available_cpus)
-    gomp_available_cpus = cpus;
-  CPU_ZERO (&cpuset);
-  CPU_SET (gomp_cpu_affinity[0], &cpuset);
-  pthread_setaffinity_np (pthread_self (), sizeof (cpuset), &cpuset);
-  affinity_counter = 1;
+      return true;
+    }
+  return false;
 }
 
 void
-gomp_init_thread_affinity (pthread_attr_t *attr)
+gomp_affinity_print_place (void *p)
 {
-  unsigned int cpu;
-  cpu_set_t cpuset;
+  unsigned long i, max = 8 * gomp_cpuset_size, len;
+  cpu_set_t *cpusetp = (cpu_set_t *) p;
+  bool notfirst = false;
 
-  cpu = __atomic_fetch_add (&affinity_counter, 1, MEMMODEL_RELAXED);
-  cpu %= gomp_cpu_affinity_len;
-  CPU_ZERO (&cpuset);
-  CPU_SET (gomp_cpu_affinity[cpu], &cpuset);
-  pthread_attr_setaffinity_np (attr, sizeof (cpu_set_t), &cpuset);
+  for (i = 0, len = 0; i < max; i++)
+    if (CPU_ISSET_S (i, gomp_cpuset_size, cpusetp))
+      {
+	if (len == 0)
+	  {
+	    if (notfirst)
+	      fputc (',', stderr);
+	    notfirst = true;
+	    fprintf (stderr, "%lu", i);
+	  }
+	++len;
+      }
+    else
+      {
+	if (len > 1)
+	  fprintf (stderr, ":%lu", len);
+	len = 0;
+      }
+  if (len > 1)
+    fprintf (stderr, ":%lu", len);
 }
 
 #else
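
As a usage sketch for the place handling above (again illustration only,
not part of the patch), a run such as

  OMP_PLACES="{0,1},{2,3},{4,5},{6,7}" OMP_PROC_BIND=spread ./a.out

or simply OMP_PLACES=cores exercises parse_places_var and
gomp_affinity_init_level, and the new omp_get_proc_bind API reports the
binding policy in effect:

#include <omp.h>
#include <stdio.h>

int main (void)
{
  #pragma omp parallel
  {
    #pragma omp master
    printf ("%d threads, proc_bind = %d\n",
            omp_get_num_threads (), (int) omp_get_proc_bind ());
  }
  return 0;
}
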
--- libgomp/config/linux/bar.c	(.../trunk)	(revision 203241)
+++ libgomp/config/linux/bar.c	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -33,11 +33,11 @@ 
 void
 gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
 {
-  if (__builtin_expect ((state & 1) != 0, 0))
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
     {
       /* Next time we'll be awaiting TOTAL threads again.  */
       bar->awaited = bar->total;
-      __atomic_store_n (&bar->generation, bar->generation + 4,
+      __atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
 			MEMMODEL_RELEASE);
       futex_wake ((int *) &bar->generation, INT_MAX);
     }
@@ -66,7 +66,7 @@  void
 gomp_barrier_wait_last (gomp_barrier_t *bar)
 {
   gomp_barrier_state_t state = gomp_barrier_wait_start (bar);
-  if (state & 1)
+  if (state & BAR_WAS_LAST)
     gomp_barrier_wait_end (bar, state);
 }
 
@@ -81,40 +81,43 @@  gomp_team_barrier_wait_end (gomp_barrier
 {
   unsigned int generation, gen;
 
-  if (__builtin_expect ((state & 1) != 0, 0))
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
     {
       /* Next time we'll be awaiting TOTAL threads again.  */
       struct gomp_thread *thr = gomp_thread ();
       struct gomp_team *team = thr->ts.team;
 
       bar->awaited = bar->total;
+      team->work_share_cancelled = 0;
       if (__builtin_expect (team->task_count, 0))
 	{
 	  gomp_barrier_handle_tasks (state);
-	  state &= ~1;
+	  state &= ~BAR_WAS_LAST;
 	}
       else
 	{
-	  __atomic_store_n (&bar->generation, state + 3, MEMMODEL_RELEASE);
+	  state &= ~BAR_CANCELLED;
+	  state += BAR_INCR - BAR_WAS_LAST;
+	  __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
 	  futex_wake ((int *) &bar->generation, INT_MAX);
 	  return;
 	}
     }
 
   generation = state;
+  state &= ~BAR_CANCELLED;
   do
     {
       do_wait ((int *) &bar->generation, generation);
       gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
-      if (__builtin_expect (gen & 1, 0))
+      if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
 	{
 	  gomp_barrier_handle_tasks (state);
 	  gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
 	}
-      if ((gen & 2) != 0)
-	generation |= 2;
+      generation |= gen & BAR_WAITING_FOR_TASK;
     }
-  while (gen != state + 4);
+  while (gen != state + BAR_INCR);
 }
 
 void
@@ -122,3 +125,86 @@  gomp_team_barrier_wait (gomp_barrier_t *
 {
   gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
 }
+
+void
+gomp_team_barrier_wait_final (gomp_barrier_t *bar)
+{
+  gomp_barrier_state_t state = gomp_barrier_wait_final_start (bar);
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    bar->awaited_final = bar->total;
+  gomp_team_barrier_wait_end (bar, state);
+}
+
+bool
+gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
+				   gomp_barrier_state_t state)
+{
+  unsigned int generation, gen;
+
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    {
+      /* Next time we'll be awaiting TOTAL threads again.  */
+      /* BAR_CANCELLED should never be set in state here, because
+	 cancellation means that at least one of the threads has been
+	 cancelled, thus on a cancellable barrier we should never see
+	 all threads to arrive.  */
+      struct gomp_thread *thr = gomp_thread ();
+      struct gomp_team *team = thr->ts.team;
+
+      bar->awaited = bar->total;
+      team->work_share_cancelled = 0;
+      if (__builtin_expect (team->task_count, 0))
+	{
+	  gomp_barrier_handle_tasks (state);
+	  state &= ~BAR_WAS_LAST;
+	}
+      else
+	{
+	  state += BAR_INCR - BAR_WAS_LAST;
+	  __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
+	  futex_wake ((int *) &bar->generation, INT_MAX);
+	  return false;
+	}
+    }
+
+  if (__builtin_expect (state & BAR_CANCELLED, 0))
+    return true;
+
+  generation = state;
+  do
+    {
+      do_wait ((int *) &bar->generation, generation);
+      gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+      if (__builtin_expect (gen & BAR_CANCELLED, 0))
+	return true;
+      if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
+	{
+	  gomp_barrier_handle_tasks (state);
+	  gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+	}
+      generation |= gen & BAR_WAITING_FOR_TASK;
+    }
+  while (gen != state + BAR_INCR);
+
+  return false;
+}
+
+bool
+gomp_team_barrier_wait_cancel (gomp_barrier_t *bar)
+{
+  return gomp_team_barrier_wait_cancel_end (bar, gomp_barrier_wait_start (bar));
+}
+
+void
+gomp_team_barrier_cancel (struct gomp_team *team)
+{
+  gomp_mutex_lock (&team->task_lock);
+  if (team->barrier.generation & BAR_CANCELLED)
+    {
+      gomp_mutex_unlock (&team->task_lock);
+      return;
+    }
+  team->barrier.generation |= BAR_CANCELLED;
+  gomp_mutex_unlock (&team->task_lock);
+  futex_wake ((int *) &team->barrier.generation, INT_MAX);
+}
--- libgomp/config/linux/bar.h	(.../trunk)	(revision 203241)
+++ libgomp/config/linux/bar.h	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -38,13 +38,25 @@  typedef struct
   unsigned total __attribute__((aligned (64)));
   unsigned generation;
   unsigned awaited __attribute__((aligned (64)));
+  unsigned awaited_final;
 } gomp_barrier_t;
+
 typedef unsigned int gomp_barrier_state_t;
 
+/* The generation field contains a counter in the high bits, with a few
+   low bits dedicated to flags.  Note that TASK_PENDING and WAS_LAST can
+   share space because WAS_LAST is never stored back to generation.  */
+#define BAR_TASK_PENDING	1
+#define BAR_WAS_LAST		1
+#define BAR_WAITING_FOR_TASK	2
+#define BAR_CANCELLED		4
+#define BAR_INCR		8
+
 static inline void gomp_barrier_init (gomp_barrier_t *bar, unsigned count)
 {
   bar->total = count;
   bar->awaited = count;
+  bar->awaited_final = count;
   bar->generation = 0;
 }
 
@@ -62,27 +74,55 @@  extern void gomp_barrier_wait (gomp_barr
 extern void gomp_barrier_wait_last (gomp_barrier_t *);
 extern void gomp_barrier_wait_end (gomp_barrier_t *, gomp_barrier_state_t);
 extern void gomp_team_barrier_wait (gomp_barrier_t *);
+extern void gomp_team_barrier_wait_final (gomp_barrier_t *);
 extern void gomp_team_barrier_wait_end (gomp_barrier_t *,
 					gomp_barrier_state_t);
+extern bool gomp_team_barrier_wait_cancel (gomp_barrier_t *);
+extern bool gomp_team_barrier_wait_cancel_end (gomp_barrier_t *,
+					       gomp_barrier_state_t);
 extern void gomp_team_barrier_wake (gomp_barrier_t *, int);
+struct gomp_team;
+extern void gomp_team_barrier_cancel (struct gomp_team *);
 
 static inline gomp_barrier_state_t
 gomp_barrier_wait_start (gomp_barrier_t *bar)
 {
-  unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE) & ~3;
+  unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+  ret &= -BAR_INCR | BAR_CANCELLED;
   /* A memory barrier is needed before exiting from the various forms
      of gomp_barrier_wait, to satisfy OpenMP API version 3.1 section
      2.8.6 flush Construct, which says there is an implicit flush during
      a barrier region.  This is a convenient place to add the barrier,
      so we use MEMMODEL_ACQ_REL here rather than MEMMODEL_ACQUIRE.  */
-  ret += __atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0;
+  if (__atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0)
+    ret |= BAR_WAS_LAST;
+  return ret;
+}
+
+static inline gomp_barrier_state_t
+gomp_barrier_wait_cancel_start (gomp_barrier_t *bar)
+{
+  return gomp_barrier_wait_start (bar);
+}
+
+/* This is like gomp_barrier_wait_start, except it decrements
+   bar->awaited_final rather than bar->awaited and should be used
+   for the gomp_team_end barrier only.  */
+static inline gomp_barrier_state_t
+gomp_barrier_wait_final_start (gomp_barrier_t *bar)
+{
+  unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+  ret &= -BAR_INCR | BAR_CANCELLED;
+  /* See above gomp_barrier_wait_start comment.  */
+  if (__atomic_add_fetch (&bar->awaited_final, -1, MEMMODEL_ACQ_REL) == 0)
+    ret |= BAR_WAS_LAST;
   return ret;
 }
 
 static inline bool
 gomp_barrier_last_thread (gomp_barrier_state_t state)
 {
-  return state & 1;
+  return state & BAR_WAS_LAST;
 }
 
 /* All the inlines below must be called with team->task_lock
@@ -91,31 +131,37 @@  gomp_barrier_last_thread (gomp_barrier_s
 static inline void
 gomp_team_barrier_set_task_pending (gomp_barrier_t *bar)
 {
-  bar->generation |= 1;
+  bar->generation |= BAR_TASK_PENDING;
 }
 
 static inline void
 gomp_team_barrier_clear_task_pending (gomp_barrier_t *bar)
 {
-  bar->generation &= ~1;
+  bar->generation &= ~BAR_TASK_PENDING;
 }
 
 static inline void
 gomp_team_barrier_set_waiting_for_tasks (gomp_barrier_t *bar)
 {
-  bar->generation |= 2;
+  bar->generation |= BAR_WAITING_FOR_TASK;
 }
 
 static inline bool
 gomp_team_barrier_waiting_for_tasks (gomp_barrier_t *bar)
 {
-  return (bar->generation & 2) != 0;
+  return (bar->generation & BAR_WAITING_FOR_TASK) != 0;
+}
+
+static inline bool
+gomp_team_barrier_cancelled (gomp_barrier_t *bar)
+{
+  return __builtin_expect ((bar->generation & BAR_CANCELLED) != 0, 0);
 }
 
 static inline void
 gomp_team_barrier_done (gomp_barrier_t *bar, gomp_barrier_state_t state)
 {
-  bar->generation = (state & ~3) + 4;
+  bar->generation = (state & -BAR_INCR) + BAR_INCR;
 }
 
 #endif /* GOMP_BARRIER_H */
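
The BAR_* scheme above packs a generation counter into the high bits of
one word with flag bits below it.  A tiny standalone illustration (not
part of the patch) of the masking used by gomp_barrier_wait_start and
gomp_team_barrier_done:

#include <stdio.h>

#define BAR_TASK_PENDING      1
#define BAR_WAS_LAST          1
#define BAR_WAITING_FOR_TASK  2
#define BAR_CANCELLED         4
#define BAR_INCR              8

int main (void)
{
  /* Generation 3 with the cancelled flag set.  */
  unsigned int gen = 3 * BAR_INCR | BAR_CANCELLED;
  /* gomp_barrier_wait_start keeps the counter and the cancel flag.  */
  unsigned int state = gen & (-BAR_INCR | BAR_CANCELLED);
  printf ("counter %u, cancelled %d\n",
          state / BAR_INCR, (state & BAR_CANCELLED) != 0);
  /* gomp_team_barrier_done drops the flags and advances the counter.  */
  gen = (state & -BAR_INCR) + BAR_INCR;
  printf ("next counter %u\n", gen / BAR_INCR);
  return 0;
}
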
--- libgomp/config/linux/proc.c	(.../trunk)	(revision 203241)
+++ libgomp/config/linux/proc.c	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -30,6 +30,7 @@ 
 #endif
 #include "libgomp.h"
 #include "proc.h"
+#include <errno.h>
 #include <stdlib.h>
 #include <unistd.h>
 #ifdef HAVE_GETLOADAVG
@@ -39,19 +40,28 @@ 
 #endif
 
 #ifdef HAVE_PTHREAD_AFFINITY_NP
+unsigned long gomp_cpuset_size;
+static unsigned long gomp_get_cpuset_size;
+cpu_set_t *gomp_cpusetp;
+
 unsigned long
-gomp_cpuset_popcount (cpu_set_t *cpusetp)
+gomp_cpuset_popcount (unsigned long cpusetsize, cpu_set_t *cpusetp)
 {
-#ifdef CPU_COUNT
-  /* glibc 2.6 and above provide a macro for this.  */
-  return CPU_COUNT (cpusetp);
+#ifdef CPU_COUNT_S
+  /* glibc 2.7 and above provide a macro for this.  */
+  return CPU_COUNT_S (cpusetsize, cpusetp);
 #else
+#ifdef CPU_COUNT
+  if (cpusetsize == sizeof (cpu_set_t))
+    /* glibc 2.6 and above provide a macro for this.  */
+    return CPU_COUNT (cpusetp);
+#endif
   size_t i;
   unsigned long ret = 0;
-  extern int check[sizeof (cpusetp->__bits[0]) == sizeof (unsigned long int)];
+  extern int check[sizeof (cpusetp->__bits[0]) == sizeof (unsigned long int)
+		   ? 1 : -1];
 
-  (void) check;
-  for (i = 0; i < sizeof (*cpusetp) / sizeof (cpusetp->__bits[0]); i++)
+  for (i = 0; i < cpusetsize / sizeof (cpusetp->__bits[0]); i++)
     {
       unsigned long int mask = cpusetp->__bits[i];
       if (mask == 0)
@@ -70,16 +80,63 @@  void
 gomp_init_num_threads (void)
 {
 #ifdef HAVE_PTHREAD_AFFINITY_NP
-  cpu_set_t cpuset;
+#if defined (_SC_NPROCESSORS_CONF) && defined (CPU_ALLOC_SIZE)
+  gomp_cpuset_size = sysconf (_SC_NPROCESSORS_CONF);
+  gomp_cpuset_size = CPU_ALLOC_SIZE (gomp_cpuset_size);
+#else
+  gomp_cpuset_size = sizeof (cpu_set_t);
+#endif
 
-  if (pthread_getaffinity_np (pthread_self (), sizeof (cpuset), &cpuset) == 0)
+  gomp_cpusetp = (cpu_set_t *) gomp_malloc (gomp_cpuset_size);
+  do
     {
-      /* Count only the CPUs this process can use.  */
-      gomp_global_icv.nthreads_var = gomp_cpuset_popcount (&cpuset);
-      if (gomp_global_icv.nthreads_var == 0)
-	gomp_global_icv.nthreads_var = 1;
-      return;
+      int ret = pthread_getaffinity_np (pthread_self (), gomp_cpuset_size,
+					gomp_cpusetp);
+      if (ret == 0)
+	{
+	  unsigned long i;
+	  /* Count only the CPUs this process can use.  */
+	  gomp_global_icv.nthreads_var
+	    = gomp_cpuset_popcount (gomp_cpuset_size, gomp_cpusetp);
+	  if (gomp_global_icv.nthreads_var == 0)
+	    break;
+	  gomp_get_cpuset_size = gomp_cpuset_size;
+#ifdef CPU_ALLOC_SIZE
+	  for (i = gomp_cpuset_size * 8; i; i--)
+	    if (CPU_ISSET_S (i - 1, gomp_cpuset_size, gomp_cpusetp))
+	      break;
+	  gomp_cpuset_size = CPU_ALLOC_SIZE (i);
+#endif
+	  return;
+	}
+      if (ret != EINVAL)
+	break;
+#ifdef CPU_ALLOC_SIZE
+      if (gomp_cpuset_size < sizeof (cpu_set_t))
+	gomp_cpuset_size = sizeof (cpu_set_t);
+      else
+	gomp_cpuset_size = gomp_cpuset_size * 2;
+      if (gomp_cpuset_size < 8 * sizeof (cpu_set_t))
+	gomp_cpusetp
+	  = (cpu_set_t *) gomp_realloc (gomp_cpusetp, gomp_cpuset_size);
+      else
+	{
+	  /* Avoid gomp_fatal if too large memory allocation would be
+	     requested, e.g. kernel returning EINVAL all the time.  */
+	  void *p = realloc (gomp_cpusetp, gomp_cpuset_size);
+	  if (p == NULL)
+	    break;
+	  gomp_cpusetp = (cpu_set_t *) p;
+	}
+#else
+      break;
+#endif
     }
+  while (1);
+  gomp_cpuset_size = 0;
+  gomp_global_icv.nthreads_var = 1;
+  free (gomp_cpusetp);
+  gomp_cpusetp = NULL;
 #endif
 #ifdef _SC_NPROCESSORS_ONLN
   gomp_global_icv.nthreads_var = sysconf (_SC_NPROCESSORS_ONLN);
@@ -90,15 +147,14 @@  static int
 get_num_procs (void)
 {
 #ifdef HAVE_PTHREAD_AFFINITY_NP
-  cpu_set_t cpuset;
-
-  if (gomp_cpu_affinity == NULL)
+  if (gomp_places_list == NULL)
     {
       /* Count only the CPUs this process can use.  */
-      if (pthread_getaffinity_np (pthread_self (), sizeof (cpuset),
-				  &cpuset) == 0)
+      if (gomp_cpusetp
+	  && pthread_getaffinity_np (pthread_self (), gomp_get_cpuset_size,
+				     gomp_cpusetp) == 0)
 	{
-	  int ret = gomp_cpuset_popcount (&cpuset);
+	  int ret = gomp_cpuset_popcount (gomp_get_cpuset_size, gomp_cpusetp);
 	  return ret != 0 ? ret : 1;
 	}
     }
--- libgomp/config/linux/proc.h	(.../trunk)	(revision 203241)
+++ libgomp/config/linux/proc.h	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -28,7 +28,10 @@ 
 #include <sched.h>
 
 #ifdef HAVE_PTHREAD_AFFINITY_NP
-extern unsigned long gomp_cpuset_popcount (cpu_set_t *);
+extern unsigned long gomp_cpuset_size attribute_hidden;
+extern cpu_set_t *gomp_cpusetp attribute_hidden;
+extern unsigned long gomp_cpuset_popcount (unsigned long, cpu_set_t *)
+     attribute_hidden;
 #endif
 
 #endif /* GOMP_PROC_H */
--- libgomp/config/posix/affinity.c	(.../trunk)	(revision 203241)
+++ libgomp/config/posix/affinity.c	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -32,7 +32,84 @@  gomp_init_affinity (void)
 }
 
 void
-gomp_init_thread_affinity (pthread_attr_t *attr)
+gomp_init_thread_affinity (pthread_attr_t *attr, unsigned int place)
 {
   (void) attr;
+  (void) place;
+}
+
+void **
+gomp_affinity_alloc (unsigned long count, bool quiet)
+{
+  (void) count;
+  if (!quiet)
+    gomp_error ("Affinity not supported on this configuration");
+  return NULL;
+}
+
+void
+gomp_affinity_init_place (void *p)
+{
+  (void) p;
+}
+
+bool
+gomp_affinity_add_cpus (void *p, unsigned long num,
+			unsigned long len, long stride, bool quiet)
+{
+  (void) p;
+  (void) num;
+  (void) len;
+  (void) stride;
+  (void) quiet;
+  return false;
+}
+
+bool
+gomp_affinity_remove_cpu (void *p, unsigned long num)
+{
+  (void) p;
+  (void) num;
+  return false;
+}
+
+bool
+gomp_affinity_copy_place (void *p, void *q, long stride)
+{
+  (void) p;
+  (void) q;
+  (void) stride;
+  return false;
+}
+
+bool
+gomp_affinity_same_place (void *p, void *q)
+{
+  (void) p;
+  (void) q;
+  return false;
+}
+
+bool
+gomp_affinity_finalize_place_list (bool quiet)
+{
+  (void) quiet;
+  return false;
+}
+
+bool
+gomp_affinity_init_level (int level, unsigned long count, bool quiet)
+{
+  (void) level;
+  (void) count;
+  (void) quiet;
+  if (!quiet)
+    gomp_error ("Affinity not supported on this configuration");
+  return NULL;
+}
+
+void
+gomp_affinity_print_place (void *p)
+{
+  (void) p;
 }
--- libgomp/config/posix/bar.c	(.../trunk)	(revision 203241)
+++ libgomp/config/posix/bar.c	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -42,6 +42,7 @@  gomp_barrier_init (gomp_barrier_t *bar,
   bar->total = count;
   bar->arrived = 0;
   bar->generation = 0;
+  bar->cancellable = false;
 }
 
 void
@@ -72,7 +73,7 @@  gomp_barrier_wait_end (gomp_barrier_t *b
 {
   unsigned int n;
 
-  if (state & 1)
+  if (state & BAR_WAS_LAST)
     {
       n = --bar->arrived;
       if (n > 0)
@@ -113,12 +114,14 @@  gomp_team_barrier_wait_end (gomp_barrier
 {
   unsigned int n;
 
-  if (state & 1)
+  state &= ~BAR_CANCELLED;
+  if (state & BAR_WAS_LAST)
     {
       n = --bar->arrived;
       struct gomp_thread *thr = gomp_thread ();
       struct gomp_team *team = thr->ts.team;
 
+      team->work_share_cancelled = 0;
       if (team->task_count)
 	{
 	  gomp_barrier_handle_tasks (state);
@@ -128,7 +131,7 @@  gomp_team_barrier_wait_end (gomp_barrier
 	  return;
 	}
 
-      bar->generation = state + 3;
+      bar->generation = state + BAR_INCR - BAR_WAS_LAST;
       if (n > 0)
 	{
 	  do
@@ -141,13 +144,18 @@  gomp_team_barrier_wait_end (gomp_barrier
   else
     {
       gomp_mutex_unlock (&bar->mutex1);
+      int gen;
       do
 	{
 	  gomp_sem_wait (&bar->sem1);
-	  if (bar->generation & 1)
-	    gomp_barrier_handle_tasks (state);
+	  gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+	  if (gen & BAR_TASK_PENDING)
+	    {
+	      gomp_barrier_handle_tasks (state);
+	      gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+	    }
 	}
-      while (bar->generation != state + 4);
+      while (gen != state + BAR_INCR);
 
 #ifdef HAVE_SYNC_BUILTINS
       n = __sync_add_and_fetch (&bar->arrived, -1);
@@ -162,6 +170,81 @@  gomp_team_barrier_wait_end (gomp_barrier
     }
 }
 
+bool
+gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
+				   gomp_barrier_state_t state)
+{
+  unsigned int n;
+
+  if (state & BAR_WAS_LAST)
+    {
+      bar->cancellable = false;
+      n = --bar->arrived;
+      struct gomp_thread *thr = gomp_thread ();
+      struct gomp_team *team = thr->ts.team;
+
+      team->work_share_cancelled = 0;
+      if (team->task_count)
+	{
+	  gomp_barrier_handle_tasks (state);
+	  if (n > 0)
+	    gomp_sem_wait (&bar->sem2);
+	  gomp_mutex_unlock (&bar->mutex1);
+	  return false;
+	}
+
+      bar->generation = state + BAR_INCR - BAR_WAS_LAST;
+      if (n > 0)
+	{
+	  do
+	    gomp_sem_post (&bar->sem1);
+	  while (--n != 0);
+	  gomp_sem_wait (&bar->sem2);
+	}
+      gomp_mutex_unlock (&bar->mutex1);
+    }
+  else
+    {
+      if (state & BAR_CANCELLED)
+	{
+	  gomp_mutex_unlock (&bar->mutex1);
+	  return true;
+	}
+      bar->cancellable = true;
+      gomp_mutex_unlock (&bar->mutex1);
+      int gen;
+      do
+	{
+	  gomp_sem_wait (&bar->sem1);
+	  gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+	  if (gen & BAR_CANCELLED)
+	    break;
+	  if (gen & BAR_TASK_PENDING)
+	    {
+	      gomp_barrier_handle_tasks (state);
+	      gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+	      if (gen & BAR_CANCELLED)
+		break;
+	    }
+	}
+      while (gen != state + BAR_INCR);
+
+#ifdef HAVE_SYNC_BUILTINS
+      n = __sync_add_and_fetch (&bar->arrived, -1);
+#else
+      gomp_mutex_lock (&bar->mutex2);
+      n = --bar->arrived;
+      gomp_mutex_unlock (&bar->mutex2);
+#endif
+
+      if (n == 0)
+	gomp_sem_post (&bar->sem2);
+      if (gen & BAR_CANCELLED)
+	return true;
+    }
+  return false;
+}
+
 void
 gomp_team_barrier_wait (gomp_barrier_t *barrier)
 {
@@ -176,3 +259,40 @@  gomp_team_barrier_wake (gomp_barrier_t *
   while (count-- > 0)
     gomp_sem_post (&bar->sem1);
 }
+
+bool
+gomp_team_barrier_wait_cancel (gomp_barrier_t *bar)
+{
+  gomp_barrier_state_t state = gomp_barrier_wait_cancel_start (bar);
+  return gomp_team_barrier_wait_cancel_end (bar, state);
+}
+
+void
+gomp_team_barrier_cancel (struct gomp_team *team)
+{
+  if (team->barrier.generation & BAR_CANCELLED)
+    return;
+  gomp_mutex_lock (&team->barrier.mutex1);
+  gomp_mutex_lock (&team->task_lock);
+  if (team->barrier.generation & BAR_CANCELLED)
+    {
+      gomp_mutex_unlock (&team->task_lock);
+      gomp_mutex_unlock (&team->barrier.mutex1);
+      return;
+    }
+  team->barrier.generation |= BAR_CANCELLED;
+  gomp_mutex_unlock (&team->task_lock);
+  if (team->barrier.cancellable)
+    {
+      int n = team->barrier.arrived;
+      if (n > 0)
+	{
+	  do
+	    gomp_sem_post (&team->barrier.sem1);
+	  while (--n != 0);
+	  gomp_sem_wait (&team->barrier.sem2);
+	}
+      team->barrier.cancellable = false;
+    }
+  gomp_mutex_unlock (&team->barrier.mutex1);
+}
--- libgomp/config/posix/bar.h	(.../trunk)	(revision 203241)
+++ libgomp/config/posix/bar.h	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -43,9 +43,20 @@  typedef struct
   unsigned total;
   unsigned arrived;
   unsigned generation;
+  bool cancellable;
 } gomp_barrier_t;
+
 typedef unsigned int gomp_barrier_state_t;
 
+/* The generation field contains a counter in the high bits, with a few
+   low bits dedicated to flags.  Note that TASK_PENDING and WAS_LAST can
+   share space because WAS_LAST is never stored back to generation.  */
+#define BAR_TASK_PENDING	1
+#define BAR_WAS_LAST		1
+#define BAR_WAITING_FOR_TASK	2
+#define BAR_CANCELLED		4
+#define BAR_INCR		8
+
 extern void gomp_barrier_init (gomp_barrier_t *, unsigned);
 extern void gomp_barrier_reinit (gomp_barrier_t *, unsigned);
 extern void gomp_barrier_destroy (gomp_barrier_t *);
@@ -55,22 +66,47 @@  extern void gomp_barrier_wait_end (gomp_
 extern void gomp_team_barrier_wait (gomp_barrier_t *);
 extern void gomp_team_barrier_wait_end (gomp_barrier_t *,
 					gomp_barrier_state_t);
+extern bool gomp_team_barrier_wait_cancel (gomp_barrier_t *);
+extern bool gomp_team_barrier_wait_cancel_end (gomp_barrier_t *,
+					       gomp_barrier_state_t);
 extern void gomp_team_barrier_wake (gomp_barrier_t *, int);
+struct gomp_team;
+extern void gomp_team_barrier_cancel (struct gomp_team *);
 
 static inline gomp_barrier_state_t
 gomp_barrier_wait_start (gomp_barrier_t *bar)
 {
   unsigned int ret;
   gomp_mutex_lock (&bar->mutex1);
-  ret = bar->generation & ~3;
-  ret += ++bar->arrived == bar->total;
+  ret = bar->generation & (-BAR_INCR | BAR_CANCELLED);
+  if (++bar->arrived == bar->total)
+    ret |= BAR_WAS_LAST;
+  return ret;
+}
+
+static inline gomp_barrier_state_t
+gomp_barrier_wait_cancel_start (gomp_barrier_t *bar)
+{
+  unsigned int ret;
+  gomp_mutex_lock (&bar->mutex1);
+  ret = bar->generation & (-BAR_INCR | BAR_CANCELLED);
+  if (ret & BAR_CANCELLED)
+    return ret;
+  if (++bar->arrived == bar->total)
+    ret |= BAR_WAS_LAST;
   return ret;
 }
 
+static inline void
+gomp_team_barrier_wait_final (gomp_barrier_t *bar)
+{
+  gomp_team_barrier_wait (bar);
+}
+
 static inline bool
 gomp_barrier_last_thread (gomp_barrier_state_t state)
 {
-  return state & 1;
+  return state & BAR_WAS_LAST;
 }
 
 static inline void
@@ -85,31 +121,37 @@  gomp_barrier_wait_last (gomp_barrier_t *
 static inline void
 gomp_team_barrier_set_task_pending (gomp_barrier_t *bar)
 {
-  bar->generation |= 1;
+  bar->generation |= BAR_TASK_PENDING;
 }
 
 static inline void
 gomp_team_barrier_clear_task_pending (gomp_barrier_t *bar)
 {
-  bar->generation &= ~1;
+  bar->generation &= ~BAR_TASK_PENDING;
 }
 
 static inline void
 gomp_team_barrier_set_waiting_for_tasks (gomp_barrier_t *bar)
 {
-  bar->generation |= 2;
+  bar->generation |= BAR_WAITING_FOR_TASK;
 }
 
 static inline bool
 gomp_team_barrier_waiting_for_tasks (gomp_barrier_t *bar)
 {
-  return (bar->generation & 2) != 0;
+  return (bar->generation & BAR_WAITING_FOR_TASK) != 0;
+}
+
+static inline bool
+gomp_team_barrier_cancelled (gomp_barrier_t *bar)
+{
+  return __builtin_expect ((bar->generation & BAR_CANCELLED) != 0, 0);
 }
 
 static inline void
 gomp_team_barrier_done (gomp_barrier_t *bar, gomp_barrier_state_t state)
 {
-  bar->generation = (state & ~3) + 4;
+  bar->generation = (state & -BAR_INCR) + BAR_INCR;
 }
 
 #endif /* GOMP_BARRIER_H */
--- libgomp/env.c	(.../trunk)	(revision 203241)
+++ libgomp/env.c	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -29,6 +29,10 @@ 
 #include "libgomp_f.h"
 #include <ctype.h>
 #include <stdlib.h>
+#include <stdio.h>
+#ifdef HAVE_INTTYPES_H
+# include <inttypes.h>	/* For PRIu64.  */
+#endif
 #ifdef STRING_WITH_STRINGS
 # include <string.h>
 # include <strings.h>
@@ -50,23 +54,28 @@ 
 
 struct gomp_task_icv gomp_global_icv = {
   .nthreads_var = 1,
+  .thread_limit_var = UINT_MAX,
   .run_sched_var = GFS_DYNAMIC,
   .run_sched_modifier = 1,
+  .default_device_var = 0,
   .dyn_var = false,
-  .nest_var = false
+  .nest_var = false,
+  .bind_var = omp_proc_bind_false,
+  .target_data = NULL
 };
 
-unsigned short *gomp_cpu_affinity;
-size_t gomp_cpu_affinity_len;
 unsigned long gomp_max_active_levels_var = INT_MAX;
-unsigned long gomp_thread_limit_var = ULONG_MAX;
-unsigned long gomp_remaining_threads_count;
+bool gomp_cancel_var = false;
 #ifndef HAVE_SYNC_BUILTINS
-gomp_mutex_t gomp_remaining_threads_lock;
+gomp_mutex_t gomp_managed_threads_lock;
 #endif
 unsigned long gomp_available_cpus = 1, gomp_managed_threads = 1;
 unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
 unsigned long *gomp_nthreads_var_list, gomp_nthreads_var_list_len;
+char *gomp_bind_var_list;
+unsigned long gomp_bind_var_list_len;
+void **gomp_places_list;
+unsigned long gomp_places_list_len;
 
 /* Parse the OMP_SCHEDULE environment variable.  */
 
@@ -184,6 +193,24 @@  parse_unsigned_long (const char *name, u
   return false;
 }
 
+/* Parse a positive int environment variable.  Return true if one was
+   present and it was successfully parsed.  */
+
+static bool
+parse_int (const char *name, int *pvalue, bool allow_zero)
+{
+  unsigned long value;
+  if (!parse_unsigned_long (name, &value, allow_zero))
+    return false;
+  if (value > INT_MAX)
+    {
+      gomp_error ("Invalid value for environment variable %s", name);
+      return false;
+    }
+  *pvalue = (int) value;
+  return true;
+}
+
 /* Parse an unsigned long list environment variable.  Return true if one was
    present and it was successfully parsed.  */
 
@@ -273,6 +300,412 @@  parse_unsigned_long_list (const char *na
   return false;
 }
 
+/* Parse environment variable set to a boolean or list of omp_proc_bind_t
+   enum values.  Return true if one was present and it was successfully
+   parsed.  */
+
+static bool
+parse_bind_var (const char *name, char *p1stvalue,
+		char **pvalues, unsigned long *pnvalues)
+{
+  char *env;
+  char value, *values = NULL;
+  int i;
+  static struct proc_bind_kinds
+  {
+    const char name[7];
+    const char len;
+    omp_proc_bind_t kind;
+  } kinds[] =
+  {
+    { "false", 5, omp_proc_bind_false },
+    { "true", 4, omp_proc_bind_true },
+    { "master", 6, omp_proc_bind_master },
+    { "close", 5, omp_proc_bind_close },
+    { "spread", 6, omp_proc_bind_spread }
+  };
+
+  env = getenv (name);
+  if (env == NULL)
+    return false;
+
+  while (isspace ((unsigned char) *env))
+    ++env;
+  if (*env == '\0')
+    goto invalid;
+
+  for (i = 0; i < 5; i++)
+    if (strncasecmp (env, kinds[i].name, kinds[i].len) == 0)
+      {
+	value = kinds[i].kind;
+	env += kinds[i].len;
+	break;
+      }
+  if (i == 5)
+    goto invalid;
+
+  while (isspace ((unsigned char) *env))
+    ++env;
+  if (*env != '\0')
+    {
+      if (*env == ',')
+	{
+	  unsigned long nvalues = 0, nalloced = 0;
+
+	  if (value == omp_proc_bind_false
+	      || value == omp_proc_bind_true)
+	    goto invalid;
+
+	  do
+	    {
+	      env++;
+	      if (nvalues == nalloced)
+		{
+		  char *n;
+		  nalloced = nalloced ? nalloced * 2 : 16;
+		  n = realloc (values, nalloced);
+		  if (n == NULL)
+		    {
+		      free (values);
+		      gomp_error ("Out of memory while trying to parse"
+				  " environment variable %s", name);
+		      return false;
+		    }
+		  values = n;
+		  if (nvalues == 0)
+		    values[nvalues++] = value;
+		}
+
+	      while (isspace ((unsigned char) *env))
+		++env;
+	      if (*env == '\0')
+		goto invalid;
+
+	      for (i = 2; i < 5; i++)
+		if (strncasecmp (env, kinds[i].name, kinds[i].len) == 0)
+		  {
+		    value = kinds[i].kind;
+		    env += kinds[i].len;
+		    break;
+		  }
+	      if (i == 5)
+		goto invalid;
+
+	      values[nvalues++] = value;
+	      while (isspace ((unsigned char) *env))
+		++env;
+	      if (*env == '\0')
+		break;
+	      if (*env != ',')
+		goto invalid;
+	    }
+	  while (1);
+	  *p1stvalue = values[0];
+	  *pvalues = values;
+	  *pnvalues = nvalues;
+	  return true;
+	}
+      goto invalid;
+    }
+
+  *p1stvalue = value;
+  return true;
+
+ invalid:
+  free (values);
+  gomp_error ("Invalid value for environment variable %s", name);
+  return false;
+}
+
+static bool
+parse_one_place (char **envp, bool *negatep, unsigned long *lenp,
+		 long *stridep)
+{
+  char *env = *envp, *start;
+  void *p = gomp_places_list ? gomp_places_list[gomp_places_list_len] : NULL;
+  unsigned long len = 1;
+  long stride = 1;
+  int pass;
+  bool any_negate = false;
+  *negatep = false;
+  while (isspace ((unsigned char) *env))
+    ++env;
+  if (*env == '!')
+    {
+      *negatep = true;
+      ++env;
+      while (isspace ((unsigned char) *env))
+	++env;
+    }
+  if (*env != '{')
+    return false;
+  ++env;
+  while (isspace ((unsigned char) *env))
+    ++env;
+  start = env;
+  for (pass = 0; pass < (any_negate ? 2 : 1); pass++)
+    {
+      env = start;
+      do
+	{
+	  unsigned long this_num, this_len = 1;
+	  long this_stride = 1;
+	  bool this_negate = (*env == '!');
+	  if (this_negate)
+	    {
+	      if (gomp_places_list)
+		any_negate = true;
+	      ++env;
+	      while (isspace ((unsigned char) *env))
+		++env;
+	    }
+
+	  errno = 0;
+	  this_num = strtoul (env, &env, 10);
+	  if (errno)
+	    return false;
+	  while (isspace ((unsigned char) *env))
+	    ++env;
+	  if (*env == ':')
+	    {
+	      ++env;
+	      while (isspace ((unsigned char) *env))
+		++env;
+	      errno = 0;
+	      this_len = strtoul (env, &env, 10);
+	      if (errno || this_len == 0)
+		return false;
+	      while (isspace ((unsigned char) *env))
+		++env;
+	      if (*env == ':')
+		{
+		  ++env;
+		  while (isspace ((unsigned char) *env))
+		    ++env;
+		  errno = 0;
+		  this_stride = strtol (env, &env, 10);
+		  if (errno)
+		    return false;
+		  while (isspace ((unsigned char) *env))
+		    ++env;
+		}
+	    }
+	  if (this_negate && this_len != 1)
+	    return false;
+	  if (gomp_places_list && pass == this_negate)
+	    {
+	      if (this_negate)
+		{
+		  if (!gomp_affinity_remove_cpu (p, this_num))
+		    return false;
+		}
+	      else if (!gomp_affinity_add_cpus (p, this_num, this_len,
+						this_stride, false))
+		return false;
+	    }
+	  if (*env == '}')
+	    break;
+	  if (*env != ',')
+	    return false;
+	  ++env;
+	}
+      while (1);
+    }
+
+  ++env;
+  while (isspace ((unsigned char) *env))
+    ++env;
+  if (*env == ':')
+    {
+      ++env;
+      while (isspace ((unsigned char) *env))
+	++env;
+      errno = 0;
+      len = strtoul (env, &env, 10);
+      if (errno || len == 0 || len >= 65536)
+	return false;
+      while (isspace ((unsigned char) *env))
+	++env;
+      if (*env == ':')
+	{
+	  ++env;
+	  while (isspace ((unsigned char) *env))
+	    ++env;
+	  errno = 0;
+	  stride = strtol (env, &env, 10);
+	  if (errno)
+	    return false;
+	  while (isspace ((unsigned char) *env))
+	    ++env;
+	}
+    }
+  if (*negatep && len != 1)
+    return false;
+  *envp = env;
+  *lenp = len;
+  *stridep = stride;
+  return true;
+}
+
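As a reading aid (not itself part of the patch), the place syntax handled by
parse_one_place is roughly

    [!] { subplace [, subplace ...] } [: len [: stride]]
    subplace:  [!] num [: len [: stride]]

so "{0:4}" describes CPUs 0-3, "{0:4,!2}" the same set minus CPU 2, and a
trailing ":4:4" is not applied here but returned through *LENP and *STRIDEP
for the caller to expand into four places.
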
+static bool
+parse_places_var (const char *name)
+{
+  char *env = getenv (name), *end;
+  bool any_negate = false;
+  int level = 0;
+  unsigned long count = 0;
+  if (env == NULL)
+    return false;
+
+  while (isspace ((unsigned char) *env))
+    ++env;
+  if (*env == '\0')
+    goto invalid;
+
+  if (strncasecmp (env, "threads", 7) == 0)
+    {
+      env += 7;
+      level = 1;
+    }
+  else if (strncasecmp (env, "cores", 5) == 0)
+    {
+      env += 5;
+      level = 2;
+    }
+  else if (strncasecmp (env, "sockets", 7) == 0)
+    {
+      env += 7;
+      level = 3;
+    }
+  if (level)
+    {
+      count = ULONG_MAX;
+      while (isspace ((unsigned char) *env))
+	++env;
+      if (*env != '\0')
+	{
+	  if (*env++ != '(')
+	    goto invalid;
+	  while (isspace ((unsigned char) *env))
+	    ++env;
+
+	  errno = 0;
+	  count = strtoul (env, &end, 10);
+	  if (errno)
+	    goto invalid;
+	  env = end;
+	  while (isspace ((unsigned char) *env))
+	    ++env;
+	  if (*env != ')')
+	    goto invalid;
+	  ++env;
+	  while (isspace ((unsigned char) *env))
+	    ++env;
+	  if (*env != '\0')
+	    goto invalid;
+	}
+      return gomp_affinity_init_level (level, count, false);
+    }
+
+  count = 0;
+  end = env;
+  do
+    {
+      bool negate;
+      unsigned long len;
+      long stride;
+      if (!parse_one_place (&end, &negate, &len, &stride))
+	goto invalid;
+      if (negate)
+	{
+	  if (!any_negate)
+	    count++;
+	  any_negate = true;
+	}
+      else
+	count += len;
+      if (count > 65536)
+	goto invalid;
+      if (*end == '\0')
+	break;
+      if (*end != ',')
+	goto invalid;
+      end++;
+    }
+  while (1);
+
+  if (gomp_global_icv.bind_var == omp_proc_bind_false)
+    return false;
+
+  gomp_places_list_len = 0;
+  gomp_places_list = gomp_affinity_alloc (count, false);
+  if (gomp_places_list == NULL)
+    return false;
+
+  do
+    {
+      bool negate;
+      unsigned long len;
+      long stride;
+      gomp_affinity_init_place (gomp_places_list[gomp_places_list_len]);
+      if (!parse_one_place (&env, &negate, &len, &stride))
+	goto invalid;
+      if (negate)
+	{
+	  void *p;
+	  for (count = 0; count < gomp_places_list_len; count++)
+	    if (gomp_affinity_same_place
+			(gomp_places_list[count],
+			 gomp_places_list[gomp_places_list_len]))
+	      break;
+	  if (count == gomp_places_list_len)
+	    {
+	      gomp_error ("Trying to remove a non-existing place from list "
+			  "of places");
+	      goto invalid;
+	    }
+	  p = gomp_places_list[count];
+	  memmove (&gomp_places_list[count],
+		   &gomp_places_list[count + 1],
+		   (gomp_places_list_len - count - 1) * sizeof (void *));
+	  --gomp_places_list_len;
+	  gomp_places_list[gomp_places_list_len] = p;
+	}
+      else if (len == 1)
+	++gomp_places_list_len;
+      else
+	{
+	  for (count = 0; count < len - 1; count++)
+	    if (!gomp_affinity_copy_place
+			(gomp_places_list[gomp_places_list_len + count + 1],
+			 gomp_places_list[gomp_places_list_len + count],
+			 stride))
+	      goto invalid;
+	  gomp_places_list_len += len;
+	}
+      if (*env == '\0')
+	break;
+      env++;
+    }
+  while (1);
+
+  if (gomp_places_list_len == 0)
+    {
+      gomp_error ("All places have been removed");
+      goto invalid;
+    }
+  if (!gomp_affinity_finalize_place_list (false))
+    goto invalid;
+  return true;
+
+ invalid:
+  free (gomp_places_list);
+  gomp_places_list = NULL;
+  gomp_places_list_len = 0;
+  gomp_error ("Invalid value for environment variable %s", name);
+  return false;
+}
+
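A few OMP_PLACES values accepted by the above, for illustration:

    OMP_PLACES=threads                  # one place per hardware thread
    OMP_PLACES=cores(4)                 # four places, one per core
    OMP_PLACES="{0:4},{4:4},{8:4}"      # three explicit four-CPU places
    OMP_PLACES="{0:4}:4:4"              # {0:4} replicated 4 times, stride 4

Note the ordering of checks: an explicit place list is parsed for validity
first, but then dropped when bind-var is omp_proc_bind_false.
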
 /* Parse the OMP_STACKSIZE environment variable.  Return true if one was
    present and it was successfully parsed.  */
 
@@ -480,84 +913,89 @@  parse_wait_policy (void)
 static bool
 parse_affinity (void)
 {
-  char *env, *end;
+  char *env, *end, *start;
+  int pass;
   unsigned long cpu_beg, cpu_end, cpu_stride;
-  unsigned short *cpus = NULL;
-  size_t allocated = 0, used = 0, needed;
+  size_t count = 0, needed;
 
   env = getenv ("GOMP_CPU_AFFINITY");
   if (env == NULL)
     return false;
 
-  do
+  start = env;
+  for (pass = 0; pass < 2; pass++)
     {
-      while (*env == ' ' || *env == '\t')
-	env++;
-
-      cpu_beg = strtoul (env, &end, 0);
-      cpu_end = cpu_beg;
-      cpu_stride = 1;
-      if (env == end || cpu_beg >= 65536)
-	goto invalid;
-
-      env = end;
-      if (*env == '-')
+      env = start;
+      if (pass == 1)
+	{
+	  gomp_places_list_len = 0;
+	  gomp_places_list = gomp_affinity_alloc (count, true);
+	  if (gomp_places_list == NULL)
+	    return false;
+	}
+      do
 	{
-	  cpu_end = strtoul (++env, &end, 0);
-	  if (env == end || cpu_end >= 65536 || cpu_end < cpu_beg)
+	  while (isspace ((unsigned char) *env))
+	    ++env;
+
+	  errno = 0;
+	  cpu_beg = strtoul (env, &end, 0);
+	  if (errno || cpu_beg >= 65536)
 	    goto invalid;
+	  cpu_end = cpu_beg;
+	  cpu_stride = 1;
 
 	  env = end;
-	  if (*env == ':')
+	  if (*env == '-')
 	    {
-	      cpu_stride = strtoul (++env, &end, 0);
-	      if (env == end || cpu_stride == 0 || cpu_stride >= 65536)
+	      errno = 0;
+	      cpu_end = strtoul (++env, &end, 0);
+	      if (errno || cpu_end >= 65536 || cpu_end < cpu_beg)
 		goto invalid;
 
 	      env = end;
-	    }
-	}
+	      if (*env == ':')
+		{
+		  errno = 0;
+		  cpu_stride = strtoul (++env, &end, 0);
+		  if (errno || cpu_stride == 0 || cpu_stride >= 65536)
+		    goto invalid;
 
-      needed = (cpu_end - cpu_beg) / cpu_stride + 1;
-      if (used + needed >= allocated)
-	{
-	  unsigned short *new_cpus;
+		  env = end;
+		}
+	    }
 
-	  if (allocated < 64)
-	    allocated = 64;
-	  if (allocated > needed)
-	    allocated <<= 1;
+	  needed = (cpu_end - cpu_beg) / cpu_stride + 1;
+	  if (pass == 0)
+	    count += needed;
 	  else
-	    allocated += 2 * needed;
-	  new_cpus = realloc (cpus, allocated * sizeof (unsigned short));
-	  if (new_cpus == NULL)
 	    {
-	      free (cpus);
-	      gomp_error ("not enough memory to store GOMP_CPU_AFFINITY list");
-	      return false;
+	      while (needed--)
+		{
+		  void *p = gomp_places_list[gomp_places_list_len];
+		  gomp_affinity_init_place (p);
+		  if (gomp_affinity_add_cpus (p, cpu_beg, 1, 0, true))
+		    ++gomp_places_list_len;
+		  cpu_beg += cpu_stride;
+		}
 	    }
 
-	  cpus = new_cpus;
-	}
+	  while (isspace ((unsigned char) *env))
+	    ++env;
 
-      while (needed--)
-	{
-	  cpus[used++] = cpu_beg;
-	  cpu_beg += cpu_stride;
+	  if (*env == ',')
+	    env++;
+	  else if (*env == '\0')
+	    break;
 	}
-
-      while (*env == ' ' || *env == '\t')
-	env++;
-
-      if (*env == ',')
-	env++;
-      else if (*env == '\0')
-	break;
+      while (1);
     }
-  while (1);
 
-  gomp_cpu_affinity = cpus;
-  gomp_cpu_affinity_len = used;
+  if (gomp_places_list_len == 0)
+    {
+      free (gomp_places_list);
+      gomp_places_list = NULL;
+    }
   return true;
 
  invalid:
@@ -565,12 +1003,160 @@  parse_affinity (void)
   return false;
 }
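The rewrite makes two passes over the GOMP_CPU_AFFINITY string: the first
counts CPUs so that gomp_affinity_alloc can size the list, the second fills in
one unit-sized place per CPU.  The accepted syntax is unchanged from the
documented one, e.g.

    GOMP_CPU_AFFINITY="0 3 1-2 4-15:2"

binds threads, in order, to CPUs 0, 3, 1, 2, 4, 6, 8, 10, 12 and 14.
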
 
+
+static void
+handle_omp_display_env (unsigned long stacksize, int wait_policy)
+{
+  const char *env;
+  bool display = false;
+  bool verbose = false;
+  int i;
+
+  env = getenv ("OMP_DISPLAY_ENV");
+  if (env == NULL)
+    return;
+
+  while (isspace ((unsigned char) *env))
+    ++env;
+  if (strncasecmp (env, "true", 4) == 0)
+    {
+      display = true;
+      env += 4;
+    }
+  else if (strncasecmp (env, "false", 5) == 0)
+    {
+      display = false;
+      env += 5;
+    }
+  else if (strncasecmp (env, "verbose", 7) == 0)
+    {
+      display = true;
+      verbose = true;
+      env += 7;
+    }
+  else
+    env = "X";
+  while (isspace ((unsigned char) *env))
+    ++env;
+  if (*env != '\0')
+    gomp_error ("Invalid value for environment variable OMP_DISPLAY_ENV");
+
+  if (!display)
+    return;
+
+  fputs ("\nOPENMP DISPLAY ENVIRONMENT BEGIN\n", stderr);
+
+  fputs ("  _OPENMP = '201307'\n", stderr);
+  fprintf (stderr, "  OMP_DYNAMIC = '%s'\n",
+	   gomp_global_icv.dyn_var ? "TRUE" : "FALSE");
+  fprintf (stderr, "  OMP_NESTED = '%s'\n",
+	   gomp_global_icv.nest_var ? "TRUE" : "FALSE");
+
+  fprintf (stderr, "  OMP_NUM_THREADS = '%lu", gomp_global_icv.nthreads_var);
+  for (i = 1; i < gomp_nthreads_var_list_len; i++)
+    fprintf (stderr, ",%lu", gomp_nthreads_var_list[i]);
+  fputs ("'\n", stderr);
+
+  fprintf (stderr, "  OMP_SCHEDULE = '");
+  switch (gomp_global_icv.run_sched_var)
+    {
+    case GFS_RUNTIME:
+      fputs ("RUNTIME", stderr);
+      break;
+    case GFS_STATIC:
+      fputs ("STATIC", stderr);
+      break;
+    case GFS_DYNAMIC:
+      fputs ("DYNAMIC", stderr);
+      break;
+    case GFS_GUIDED:
+      fputs ("GUIDED", stderr);
+      break;
+    case GFS_AUTO:
+      fputs ("AUTO", stderr);
+      break;
+    }
+  fputs ("'\n", stderr);
+
+  fputs ("  OMP_PROC_BIND = '", stderr);
+  switch (gomp_global_icv.bind_var)
+    {
+    case omp_proc_bind_false:
+      fputs ("FALSE", stderr);
+      break;
+    case omp_proc_bind_true:
+      fputs ("TRUE", stderr);
+      break;
+    case omp_proc_bind_master:
+      fputs ("MASTER", stderr);
+      break;
+    case omp_proc_bind_close:
+      fputs ("CLOSE", stderr);
+      break;
+    case omp_proc_bind_spread:
+      fputs ("SPREAD", stderr);
+      break;
+    }
+  for (i = 1; i < gomp_bind_var_list_len; i++)
+    switch (gomp_bind_var_list[i])
+      {
+      case omp_proc_bind_master:
+	fputs (",MASTER", stderr);
+	break;
+      case omp_proc_bind_close:
+	fputs (",CLOSE", stderr);
+	break;
+      case omp_proc_bind_spread:
+	fputs (",SPREAD", stderr);
+	break;
+      }
+  fputs ("'\n", stderr);
+  fputs ("  OMP_PLACES = '", stderr);
+  for (i = 0; i < gomp_places_list_len; i++)
+    {
+      fputs ("{", stderr);
+      gomp_affinity_print_place (gomp_places_list[i]);
+      fputs (i + 1 == gomp_places_list_len ? "}" : "},", stderr);
+    }
+  fputs ("'\n", stderr);
+
+  fprintf (stderr, "  OMP_STACKSIZE = '%lu'\n", stacksize);
+
+  /* GOMP's default value is actually neither active nor passive.  */
+  fprintf (stderr, "  OMP_WAIT_POLICY = '%s'\n",
+	   wait_policy > 0 ? "ACTIVE" : "PASSIVE");
+  fprintf (stderr, "  OMP_THREAD_LIMIT = '%u'\n",
+	   gomp_global_icv.thread_limit_var);
+  fprintf (stderr, "  OMP_MAX_ACTIVE_LEVELS = '%lu'\n",
+	   gomp_max_active_levels_var);
+
+  fprintf (stderr, "  OMP_CANCELLATION = '%s'\n",
+	   gomp_cancel_var ? "TRUE" : "FALSE");
+  fprintf (stderr, "  OMP_DEFAULT_DEVICE = '%d'\n",
+	   gomp_global_icv.default_device_var);
+
+  if (verbose)
+    {
+      fputs ("  GOMP_CPU_AFFINITY = ''\n", stderr);
+      fprintf (stderr, "  GOMP_STACKSIZE = '%lu'\n", stacksize);
+#ifdef HAVE_INTTYPES_H
+      fprintf (stderr, "  GOMP_SPINCOUNT = '%"PRIu64"'\n",
+	       (uint64_t) gomp_spin_count_var);
+#else
+      fprintf (stderr, "  GOMP_SPINCOUNT = '%lu'\n",
+	       (unsigned long) gomp_spin_count_var);
+#endif
+    }
+
+  fputs ("OPENMP DISPLAY ENVIRONMENT END\n", stderr);
+}
+
+
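With OMP_DISPLAY_ENV=true, the output of handle_omp_display_env looks like the
following; the values shown here are only illustrative and depend on the
system and the remaining environment:

    OPENMP DISPLAY ENVIRONMENT BEGIN
      _OPENMP = '201307'
      OMP_DYNAMIC = 'FALSE'
      OMP_NESTED = 'FALSE'
      OMP_NUM_THREADS = '8'
      OMP_SCHEDULE = 'DYNAMIC'
      OMP_PROC_BIND = 'FALSE'
      OMP_PLACES = ''
      OMP_STACKSIZE = '0'
      OMP_WAIT_POLICY = 'PASSIVE'
      OMP_THREAD_LIMIT = '4294967295'
      OMP_MAX_ACTIVE_LEVELS = '2147483647'
      OMP_CANCELLATION = 'FALSE'
      OMP_DEFAULT_DEVICE = '0'
    OPENMP DISPLAY ENVIRONMENT END
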
 static void __attribute__((constructor))
 initialize_env (void)
 {
-  unsigned long stacksize;
+  unsigned long thread_limit_var, stacksize;
   int wait_policy;
-  bool bind_var = false;
 
   /* Do a compile time check that mkomp_h.pl did a good job.  */
   omp_check_defines ();
@@ -578,14 +1164,17 @@  initialize_env (void)
   parse_schedule ();
   parse_boolean ("OMP_DYNAMIC", &gomp_global_icv.dyn_var);
   parse_boolean ("OMP_NESTED", &gomp_global_icv.nest_var);
-  parse_boolean ("OMP_PROC_BIND", &bind_var);
+  parse_boolean ("OMP_CANCELLATION", &gomp_cancel_var);
+  parse_int ("OMP_DEFAULT_DEVICE", &gomp_global_icv.default_device_var, true);
   parse_unsigned_long ("OMP_MAX_ACTIVE_LEVELS", &gomp_max_active_levels_var,
 		       true);
-  parse_unsigned_long ("OMP_THREAD_LIMIT", &gomp_thread_limit_var, false);
-  if (gomp_thread_limit_var != ULONG_MAX)
-    gomp_remaining_threads_count = gomp_thread_limit_var - 1;
+  if (parse_unsigned_long ("OMP_THREAD_LIMIT", &thread_limit_var, false))
+    {
+      gomp_global_icv.thread_limit_var
+	= thread_limit_var > INT_MAX ? UINT_MAX : thread_limit_var;
+    }
 #ifndef HAVE_SYNC_BUILTINS
-  gomp_mutex_init (&gomp_remaining_threads_lock);
+  gomp_mutex_init (&gomp_managed_threads_lock);
 #endif
   gomp_init_num_threads ();
   gomp_available_cpus = gomp_global_icv.nthreads_var;
@@ -594,7 +1183,14 @@  initialize_env (void)
 				 &gomp_nthreads_var_list,
 				 &gomp_nthreads_var_list_len))
     gomp_global_icv.nthreads_var = gomp_available_cpus;
-  if (parse_affinity () || bind_var)
+  if (!parse_bind_var ("OMP_PROC_BIND",
+		       &gomp_global_icv.bind_var,
+		       &gomp_bind_var_list,
+		       &gomp_bind_var_list_len))
+    gomp_global_icv.bind_var = omp_proc_bind_false;
+  if (parse_places_var ("OMP_PLACES")
+      || parse_affinity ()
+      || gomp_global_icv.bind_var)
     gomp_init_affinity ();
   wait_policy = parse_wait_policy ();
   if (!parse_spincount ("GOMP_SPINCOUNT", &gomp_spin_count_var))
@@ -645,6 +1241,8 @@  initialize_env (void)
       if (err != 0)
 	gomp_error ("Stack size change failed: %s", strerror (err));
     }
+
+  handle_omp_display_env (stacksize, wait_policy);
 }
 
 
@@ -728,7 +1326,8 @@  omp_get_max_threads (void)
 int
 omp_get_thread_limit (void)
 {
-  return gomp_thread_limit_var > INT_MAX ? INT_MAX : gomp_thread_limit_var;
+  struct gomp_task_icv *icv = gomp_icv (false);
+  return icv->thread_limit_var > INT_MAX ? INT_MAX : icv->thread_limit_var;
 }
 
 void
@@ -744,6 +1343,60 @@  omp_get_max_active_levels (void)
   return gomp_max_active_levels_var;
 }
 
+int
+omp_get_cancellation (void)
+{
+  return gomp_cancel_var;
+}
+
+omp_proc_bind_t
+omp_get_proc_bind (void)
+{
+  struct gomp_task_icv *icv = gomp_icv (false);
+  return icv->bind_var;
+}
+
+void
+omp_set_default_device (int device_num)
+{
+  struct gomp_task_icv *icv = gomp_icv (true);
+  icv->default_device_var = device_num >= 0 ? device_num : 0;
+}
+
+int
+omp_get_default_device (void)
+{
+  struct gomp_task_icv *icv = gomp_icv (false);
+  return icv->default_device_var;
+}
+
+int
+omp_get_num_devices (void)
+{
+  return gomp_get_num_devices ();
+}
+
+int
+omp_get_num_teams (void)
+{
+  /* Hardcoded to 1 on host, MIC, HSAIL?  Maybe variable on PTX.  */
+  return 1;
+}
+
+int
+omp_get_team_num (void)
+{
+  /* Hardcoded to 0 on host, MIC, HSAIL?  Maybe variable on PTX.  */
+  return 0;
+}
+
+int
+omp_is_initial_device (void)
+{
+  /* Hardcoded to 1 on host, should be 0 on MIC, HSAIL, PTX.  */
+  return 1;
+}
+
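A minimal host-side sketch exercising the new OpenMP 4.0 entry points added
above (compile with -fopenmp; on the host the team and device queries return
the hardcoded values from the stubs):

    #include <stdio.h>
    #include <omp.h>

    int
    main (void)
    {
      printf ("cancellation %d, proc_bind %d\n",
              omp_get_cancellation (), (int) omp_get_proc_bind ());
      printf ("devices %d, default device %d\n",
              omp_get_num_devices (), omp_get_default_device ());
      printf ("teams %d, team %d, initial device %d\n",
              omp_get_num_teams (), omp_get_team_num (),
              omp_is_initial_device ());
      return 0;
    }
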
 ialias (omp_set_dynamic)
 ialias (omp_set_nested)
 ialias (omp_set_num_threads)
@@ -755,3 +1408,11 @@  ialias (omp_get_max_threads)
 ialias (omp_get_thread_limit)
 ialias (omp_set_max_active_levels)
 ialias (omp_get_max_active_levels)
+ialias (omp_get_cancellation)
+ialias (omp_get_proc_bind)
+ialias (omp_set_default_device)
+ialias (omp_get_default_device)
+ialias (omp_get_num_devices)
+ialias (omp_get_num_teams)
+ialias (omp_get_team_num)
+ialias (omp_is_initial_device)
--- libgomp/fortran.c	(.../trunk)	(revision 203241)
+++ libgomp/fortran.c	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -31,11 +31,6 @@ 
 
 #ifdef HAVE_ATTRIBUTE_ALIAS
 /* Use internal aliases if possible.  */
-# define ULP		STR1(__USER_LABEL_PREFIX__)
-# define STR1(x)	STR2(x)
-# define STR2(x)	#x
-# define ialias_redirect(fn) \
-  extern __typeof (fn) fn __asm__ (ULP "gomp_ialias_" #fn) attribute_hidden;
 # ifndef LIBGOMP_GNU_SYMBOL_VERSIONING
 ialias_redirect (omp_init_lock)
 ialias_redirect (omp_init_nest_lock)
@@ -70,6 +65,14 @@  ialias_redirect (omp_get_ancestor_thread
 ialias_redirect (omp_get_team_size)
 ialias_redirect (omp_get_active_level)
 ialias_redirect (omp_in_final)
+ialias_redirect (omp_get_cancellation)
+ialias_redirect (omp_get_proc_bind)
+ialias_redirect (omp_set_default_device)
+ialias_redirect (omp_get_default_device)
+ialias_redirect (omp_get_num_devices)
+ialias_redirect (omp_get_num_teams)
+ialias_redirect (omp_get_team_num)
+ialias_redirect (omp_is_initial_device)
 #endif
 
 #ifndef LIBGOMP_GNU_SYMBOL_VERSIONING
@@ -435,3 +438,57 @@  omp_in_final_ (void)
 {
   return omp_in_final ();
 }
+
+int32_t
+omp_get_cancellation_ (void)
+{
+  return omp_get_cancellation ();
+}
+
+int32_t
+omp_get_proc_bind_ (void)
+{
+  return omp_get_proc_bind ();
+}
+
+void
+omp_set_default_device_ (const int32_t *device_num)
+{
+  return omp_set_default_device (*device_num);
+}
+
+void
+omp_set_default_device_8_ (const int64_t *device_num)
+{
+  return omp_set_default_device (TO_INT (*device_num));
+}
+
+int32_t
+omp_get_default_device_ (void)
+{
+  return omp_get_default_device ();
+}
+
+int32_t
+omp_get_num_devices_ (void)
+{
+  return omp_get_num_devices ();
+}
+
+int32_t
+omp_get_num_teams_ (void)
+{
+  return omp_get_num_teams ();
+}
+
+int32_t
+omp_get_team_num_ (void)
+{
+  return omp_get_team_num ();
+}
+
+int32_t
+omp_is_initial_device_ (void)
+{
+  return omp_is_initial_device ();
+}
--- libgomp/hashtab.h	(.../trunk)	(revision 0)
+++ libgomp/hashtab.h	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -0,0 +1,443 @@ 
+/* An expandable hash table datatype.
+   Copyright (C) 1999-2013
+   Free Software Foundation, Inc.
+   Contributed by Vladimir Makarov <vmakarov@cygnus.com>.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+/* The hash table code is copied from include/hashtab.[hc] and adjusted,
+   so that the hash table entries are in the flexible array at the end
+   of the control structure, no callbacks are used and the elements in the
+   table are of the hash_entry_type type.
+   Before including this file, define hash_entry_type type and
+   htab_alloc and htab_free functions.  After including it, define
+   htab_hash and htab_eq inline functions.   */
+
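The instantiation pattern described above looks roughly like the following in
a client; this is only a sketch with illustrative names (the target and
task-dependence code in this patch are the real users):

    /* Define the entry type and allocators, then include the header.  */
    typedef struct my_entry *hash_entry_type;
    static void *htab_alloc (size_t size) { return gomp_malloc (size); }
    static void htab_free (void *ptr) { free (ptr); }
    #include "hashtab.h"

    /* After inclusion, supply the hash and equality functions.  */
    static inline hashval_t
    htab_hash (hash_entry_type element)
    { return hash_pointer (element); }
    static inline bool
    htab_eq (hash_entry_type x, hash_entry_type y)
    { return x == y; }
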
+/* This package implements basic hash table functionality.  It is possible
+   to search for an entry, create an entry and destroy an entry.
+
+   Elements in the table are generic pointers.
+
+   The size of the table is not fixed; if the occupancy of the table
+   grows too high the hash table will be expanded.
+
+   The abstract data implementation is based on generalized Algorithm D
+   from Knuth's book "The Art of Computer Programming".  The hash table is
+   expanded by creating a new hash table and transferring elements from
+   the old table to the new one.  */
+
+/* The type for a hash code.  */
+typedef unsigned int hashval_t;
+
+static inline hashval_t htab_hash (hash_entry_type);
+static inline bool htab_eq (hash_entry_type, hash_entry_type);
+
+/* This macro defines the reserved value for an empty table entry.  */
+
+#define HTAB_EMPTY_ENTRY    ((hash_entry_type) 0)
+
+/* This macro defines the reserved value for a table entry which
+   contained a deleted element.  */
+
+#define HTAB_DELETED_ENTRY  ((hash_entry_type) 1)
+
+/* Hash tables are of the following type.  The structure
+   (implementation) of this type is not needed for using the hash
+   tables.  All work with a hash table should be done only through the
+   functions mentioned below.  The size of this structure is subject to
+   change.  */
+
+struct htab {
+  /* Current size (in entries) of the hash table.  */
+  size_t size;
+
+  /* Current number of elements including also deleted elements.  */
+  size_t n_elements;
+
+  /* Current number of deleted elements in the table.  */
+  size_t n_deleted;
+
+  /* Current size (in entries) of the hash table, as an index into the
+     table of primes.  */
+  unsigned int size_prime_index;
+
+  /* Table itself.  */
+  hash_entry_type entries[];
+};
+
+typedef struct htab *htab_t;
+
+/* An enum saying whether we insert into the hash table or not.  */
+enum insert_option {NO_INSERT, INSERT};
+
+/* Table of primes and multiplicative inverses.
+
+   Note that these are not minimally reduced inverses.  Unlike when generating
+   code to divide by a constant, we want to be able to use the same algorithm
+   all the time.  All of these inverses (are implied to) have bit 32 set.
+
+   For the record, the function that computed the table is in
+   libiberty/hashtab.c.  */
+
+struct prime_ent
+{
+  hashval_t prime;
+  hashval_t inv;
+  hashval_t inv_m2;	/* inverse of prime-2 */
+  hashval_t shift;
+};
+
+static struct prime_ent const prime_tab[] = {
+  {          7, 0x24924925, 0x9999999b, 2 },
+  {         13, 0x3b13b13c, 0x745d1747, 3 },
+  {         31, 0x08421085, 0x1a7b9612, 4 },
+  {         61, 0x0c9714fc, 0x15b1e5f8, 5 },
+  {        127, 0x02040811, 0x0624dd30, 6 },
+  {        251, 0x05197f7e, 0x073260a5, 7 },
+  {        509, 0x01824366, 0x02864fc8, 8 },
+  {       1021, 0x00c0906d, 0x014191f7, 9 },
+  {       2039, 0x0121456f, 0x0161e69e, 10 },
+  {       4093, 0x00300902, 0x00501908, 11 },
+  {       8191, 0x00080041, 0x00180241, 12 },
+  {      16381, 0x000c0091, 0x00140191, 13 },
+  {      32749, 0x002605a5, 0x002a06e6, 14 },
+  {      65521, 0x000f00e2, 0x00110122, 15 },
+  {     131071, 0x00008001, 0x00018003, 16 },
+  {     262139, 0x00014002, 0x0001c004, 17 },
+  {     524287, 0x00002001, 0x00006001, 18 },
+  {    1048573, 0x00003001, 0x00005001, 19 },
+  {    2097143, 0x00004801, 0x00005801, 20 },
+  {    4194301, 0x00000c01, 0x00001401, 21 },
+  {    8388593, 0x00001e01, 0x00002201, 22 },
+  {   16777213, 0x00000301, 0x00000501, 23 },
+  {   33554393, 0x00001381, 0x00001481, 24 },
+  {   67108859, 0x00000141, 0x000001c1, 25 },
+  {  134217689, 0x000004e1, 0x00000521, 26 },
+  {  268435399, 0x00000391, 0x000003b1, 27 },
+  {  536870909, 0x00000019, 0x00000029, 28 },
+  { 1073741789, 0x0000008d, 0x00000095, 29 },
+  { 2147483647, 0x00000003, 0x00000007, 30 },
+  /* Avoid "decimal constant so large it is unsigned" for 4294967291.  */
+  { 0xfffffffb, 0x00000006, 0x00000008, 31 }
+};
+
+/* The following function returns an index into the above table of the
+   nearest prime number which is greater than N, and near a power of two. */
+
+static unsigned int
+higher_prime_index (unsigned long n)
+{
+  unsigned int low = 0;
+  unsigned int high = sizeof(prime_tab) / sizeof(prime_tab[0]);
+
+  while (low != high)
+    {
+      unsigned int mid = low + (high - low) / 2;
+      if (n > prime_tab[mid].prime)
+	low = mid + 1;
+      else
+	high = mid;
+    }
+
+  /* If we've run out of primes, abort.  */
+  if (n > prime_tab[low].prime)
+    abort ();
+
+  return low;
+}
+
+/* Return the current size of the given hash table.  */
+
+static inline size_t
+htab_size (htab_t htab)
+{
+  return htab->size;
+}
+
+/* Return the current number of elements in the given hash table.  */
+
+static inline size_t
+htab_elements (htab_t htab)
+{
+  return htab->n_elements - htab->n_deleted;
+}
+
+/* Return X % Y.  */
+
+static inline hashval_t
+htab_mod_1 (hashval_t x, hashval_t y, hashval_t inv, int shift)
+{
+  /* The multiplicative inverses computed above are for 32-bit types, and
+     require that we be able to compute a highpart multiply.  */
+  if (sizeof (hashval_t) * __CHAR_BIT__ <= 32)
+    {
+      hashval_t t1, t2, t3, t4, q, r;
+
+      t1 = ((unsigned long long)x * inv) >> 32;
+      t2 = x - t1;
+      t3 = t2 >> 1;
+      t4 = t1 + t3;
+      q  = t4 >> shift;
+      r  = x - (q * y);
+
+      return r;
+    }
+
+  /* Otherwise just use the native division routines.  */
+  return x % y;
+}
+
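Taking the first prime_tab entry above (prime 7, inv 0x24924925, shift 2) and
x = 100, the steps in htab_mod_1 compute:

    t1 = ((unsigned long long) 100 * 0x24924925) >> 32;  /* 14 */
    t2 = 100 - 14;                                       /* 86 */
    t3 = 86 >> 1;                                        /* 43 */
    t4 = 14 + 43;                                        /* 57 */
    q  = 57 >> 2;                                        /* 14 == 100 / 7 */
    r  = 100 - 14 * 7;                                   /*  2 == 100 % 7 */

i.e. the highpart multiply recovers the quotient, and hence the remainder,
without a hardware division.
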
+/* Compute the primary hash for HASH given HTAB's current size.  */
+
+static inline hashval_t
+htab_mod (hashval_t hash, htab_t htab)
+{
+  const struct prime_ent *p = &prime_tab[htab->size_prime_index];
+  return htab_mod_1 (hash, p->prime, p->inv, p->shift);
+}
+
+/* Compute the secondary hash for HASH given HTAB's current size.  */
+
+static inline hashval_t
+htab_mod_m2 (hashval_t hash, htab_t htab)
+{
+  const struct prime_ent *p = &prime_tab[htab->size_prime_index];
+  return 1 + htab_mod_1 (hash, p->prime - 2, p->inv_m2, p->shift);
+}
+
+/* Create hash table of size SIZE.  */
+
+static htab_t
+htab_create (size_t size)
+{
+  htab_t result;
+  unsigned int size_prime_index;
+
+  size_prime_index = higher_prime_index (size);
+  size = prime_tab[size_prime_index].prime;
+
+  result = (htab_t) htab_alloc (sizeof (struct htab)
+				+ size * sizeof (hash_entry_type));
+  result->size = size;
+  result->n_elements = 0;
+  result->n_deleted = 0;
+  result->size_prime_index = size_prime_index;
+  memset (result->entries, 0, size * sizeof (hash_entry_type));
+  return result;
+}
+
+/* Similar to htab_find_slot, but without several unwanted side effects:
+    - Does not call htab_eq when it finds an existing entry.
+    - Does not change the count of elements in the hash table.
+   This function also assumes there are no deleted entries in the table.
+   HASH is the hash value for the element to be inserted.  */
+
+static hash_entry_type *
+find_empty_slot_for_expand (htab_t htab, hashval_t hash)
+{
+  hashval_t index = htab_mod (hash, htab);
+  size_t size = htab_size (htab);
+  hash_entry_type *slot = htab->entries + index;
+  hashval_t hash2;
+
+  if (*slot == HTAB_EMPTY_ENTRY)
+    return slot;
+  else if (*slot == HTAB_DELETED_ENTRY)
+    abort ();
+
+  hash2 = htab_mod_m2 (hash, htab);
+  for (;;)
+    {
+      index += hash2;
+      if (index >= size)
+	index -= size;
+
+      slot = htab->entries + index;
+      if (*slot == HTAB_EMPTY_ENTRY)
+	return slot;
+      else if (*slot == HTAB_DELETED_ENTRY)
+	abort ();
+    }
+}
+
+/* The following function changes the size of the memory allocated for
+   the entries and re-inserts the table elements.  The occupancy
+   of the table after the call will be about 50%.  Naturally the hash
+   table must already exist.  Remember also that the locations of the
+   table entries change.  */
+
+static htab_t
+htab_expand (htab_t htab)
+{
+  htab_t nhtab;
+  hash_entry_type *olimit;
+  hash_entry_type *p;
+  size_t osize, elts;
+
+  osize = htab->size;
+  olimit = htab->entries + osize;
+  elts = htab_elements (htab);
+
+  /* Resize only when the table, after removal of unused elements, is
+     either too full or too empty.  */
+  if (elts * 2 > osize || (elts * 8 < osize && osize > 32))
+    nhtab = htab_create (elts * 2);
+  else
+    nhtab = htab_create (osize - 1);
+  nhtab->n_elements = htab->n_elements - htab->n_deleted;
+
+  p = htab->entries;
+  do
+    {
+      hash_entry_type x = *p;
+
+      if (x != HTAB_EMPTY_ENTRY && x != HTAB_DELETED_ENTRY)
+	*find_empty_slot_for_expand (nhtab, htab_hash (x)) = x;
+
+      p++;
+    }
+  while (p < olimit);
+
+  htab_free (htab);
+  return nhtab;
+}
+
+/* This function searches for a hash table entry equal to the given
+   element.  It cannot be used to insert or delete an element.  */
+
+static hash_entry_type
+htab_find (htab_t htab, const hash_entry_type element)
+{
+  hashval_t index, hash2, hash = htab_hash (element);
+  size_t size;
+  hash_entry_type entry;
+
+  size = htab_size (htab);
+  index = htab_mod (hash, htab);
+
+  entry = htab->entries[index];
+  if (entry == HTAB_EMPTY_ENTRY
+      || (entry != HTAB_DELETED_ENTRY && htab_eq (entry, element)))
+    return entry;
+
+  hash2 = htab_mod_m2 (hash, htab);
+  for (;;)
+    {
+      index += hash2;
+      if (index >= size)
+	index -= size;
+
+      entry = htab->entries[index];
+      if (entry == HTAB_EMPTY_ENTRY
+	  || (entry != HTAB_DELETED_ENTRY && htab_eq (entry, element)))
+	return entry;
+    }
+}
+
+/* This function searches for a hash table slot containing an entry
+   equal to the given element.  To delete an entry, call this with
+   insert=NO_INSERT, then call htab_clear_slot on the slot returned
+   (possibly after doing some checks).  To insert an entry, call this
+   with insert=INSERT, then write the value you want into the returned
+   slot.  */
+
+static hash_entry_type *
+htab_find_slot (htab_t *htabp, const hash_entry_type element,
+		enum insert_option insert)
+{
+  hash_entry_type *first_deleted_slot;
+  hashval_t index, hash2, hash = htab_hash (element);
+  size_t size;
+  hash_entry_type entry;
+  htab_t htab = *htabp;
+
+  size = htab_size (htab);
+  if (insert == INSERT && size * 3 <= htab->n_elements * 4)
+    {
+      htab = *htabp = htab_expand (htab);
+      size = htab_size (htab);
+    }
+
+  index = htab_mod (hash, htab);
+
+  first_deleted_slot = NULL;
+
+  entry = htab->entries[index];
+  if (entry == HTAB_EMPTY_ENTRY)
+    goto empty_entry;
+  else if (entry == HTAB_DELETED_ENTRY)
+    first_deleted_slot = &htab->entries[index];
+  else if (htab_eq (entry, element))
+    return &htab->entries[index];
+
+  hash2 = htab_mod_m2 (hash, htab);
+  for (;;)
+    {
+      index += hash2;
+      if (index >= size)
+	index -= size;
+
+      entry = htab->entries[index];
+      if (entry == HTAB_EMPTY_ENTRY)
+	goto empty_entry;
+      else if (entry == HTAB_DELETED_ENTRY)
+	{
+	  if (!first_deleted_slot)
+	    first_deleted_slot = &htab->entries[index];
+	}
+      else if (htab_eq (entry, element))
+	return &htab->entries[index];
+    }
+
+ empty_entry:
+  if (insert == NO_INSERT)
+    return NULL;
+
+  if (first_deleted_slot)
+    {
+      htab->n_deleted--;
+      *first_deleted_slot = HTAB_EMPTY_ENTRY;
+      return first_deleted_slot;
+    }
+
+  htab->n_elements++;
+  return &htab->entries[index];
+}
+
+/* This function clears a specified slot in a hash table.  It is
+   useful when you've already done the lookup and don't want to do it
+   again.  */
+
+static inline void
+htab_clear_slot (htab_t htab, hash_entry_type *slot)
+{
+  if (slot < htab->entries || slot >= htab->entries + htab_size (htab)
+      || *slot == HTAB_EMPTY_ENTRY || *slot == HTAB_DELETED_ENTRY)
+    abort ();
+
+  *slot = HTAB_DELETED_ENTRY;
+  htab->n_deleted++;
+}
+
+/* Returns a hash code for pointer P.  Simplified version of evahash.  */
+
+static inline hashval_t
+hash_pointer (const void *p)
+{
+  uintptr_t v = (uintptr_t) p;
+  if (sizeof (v) > sizeof (hashval_t))
+    v ^= v >> (sizeof (uintptr_t) / 2 * __CHAR_BIT__);
+  return v;
+}
--- libgomp/libgomp.h	(.../trunk)	(revision 203241)
+++ libgomp/libgomp.h	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -39,6 +39,7 @@ 
 
 #include <pthread.h>
 #include <stdbool.h>
+#include <stdlib.h>
 
 #ifdef HAVE_ATTRIBUTE_VISIBILITY
 # pragma GCC visibility push(hidden)
@@ -201,6 +202,10 @@  struct gomp_team_state
   /* Active nesting level.  Only active parallel regions are counted.  */
   unsigned active_level;
 
+  /* Place-partition-var, offset and length into gomp_places_list array.  */
+  unsigned place_partition_off;
+  unsigned place_partition_len;
+
 #ifdef HAVE_SYNC_BUILTINS
   /* Number of single stmts encountered.  */
   unsigned long single_count;
@@ -214,30 +219,40 @@  struct gomp_team_state
   unsigned long static_trip;
 };
 
-/* These are the OpenMP 3.0 Internal Control Variables described in
+struct target_mem_desc;
+
+/* These are the OpenMP 4.0 Internal Control Variables described in
    section 2.3.1.  Those described as having one copy per task are
    stored within the structure; those described as having one copy
    for the whole program are (naturally) global variables.  */
-
+
 struct gomp_task_icv
 {
   unsigned long nthreads_var;
   enum gomp_schedule_type run_sched_var;
   int run_sched_modifier;
+  int default_device_var;
+  unsigned int thread_limit_var;
   bool dyn_var;
   bool nest_var;
+  char bind_var;
+  /* Internal ICV.  */
+  struct target_mem_desc *target_data;
 };
 
 extern struct gomp_task_icv gomp_global_icv;
-extern unsigned long gomp_thread_limit_var;
-extern unsigned long gomp_remaining_threads_count;
 #ifndef HAVE_SYNC_BUILTINS
-extern gomp_mutex_t gomp_remaining_threads_lock;
+extern gomp_mutex_t gomp_managed_threads_lock;
 #endif
 extern unsigned long gomp_max_active_levels_var;
+extern bool gomp_cancel_var;
 extern unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
 extern unsigned long gomp_available_cpus, gomp_managed_threads;
 extern unsigned long *gomp_nthreads_var_list, gomp_nthreads_var_list_len;
+extern char *gomp_bind_var_list;
+extern unsigned long gomp_bind_var_list_len;
+extern void **gomp_places_list;
+extern unsigned long gomp_places_list_len;
 
 enum gomp_task_kind
 {
@@ -247,6 +262,27 @@  enum gomp_task_kind
   GOMP_TASK_TIED
 };
 
+struct gomp_task;
+struct gomp_taskgroup;
+struct htab;
+
+struct gomp_task_depend_entry
+{
+  void *addr;
+  struct gomp_task_depend_entry *next;
+  struct gomp_task_depend_entry *prev;
+  struct gomp_task *task;
+  bool is_in;
+  bool redundant;
+};
+
+struct gomp_dependers_vec
+{
+  size_t n_elem;
+  size_t allocated;
+  struct gomp_task *elem[];
+};
+
 /* This structure describes a "task" to be run by a thread.  */
 
 struct gomp_task
@@ -257,6 +293,13 @@  struct gomp_task
   struct gomp_task *prev_child;
   struct gomp_task *next_queue;
   struct gomp_task *prev_queue;
+  struct gomp_task *next_taskgroup;
+  struct gomp_task *prev_taskgroup;
+  struct gomp_taskgroup *taskgroup;
+  struct gomp_dependers_vec *dependers;
+  struct htab *depend_hash;
+  size_t depend_count;
+  size_t num_dependees;
   struct gomp_task_icv icv;
   void (*fn) (void *);
   void *fn_data;
@@ -264,7 +307,19 @@  struct gomp_task
   bool in_taskwait;
   bool in_tied_task;
   bool final_task;
+  bool copy_ctors_done;
   gomp_sem_t taskwait_sem;
+  struct gomp_task_depend_entry depend[];
+};
+
+struct gomp_taskgroup
+{
+  struct gomp_taskgroup *prev;
+  struct gomp_task *children;
+  bool in_taskgroup_wait;
+  bool cancelled;
+  gomp_sem_t taskgroup_sem;
+  size_t num_children;
 };
 
 /* This structure describes a "team" of threads.  These are the threads
@@ -293,6 +348,12 @@  struct gomp_team
      of the threads in the team.  */
   gomp_sem_t **ordered_release;
 
+  /* List of work shares on which gomp_fini_work_share hasn't been
+     called yet.  If the team hasn't been cancelled, this should be
+     equal to each thr->ts.work_share, but otherwise it can be a possibly
+     long list of workshares.  */
+  struct gomp_work_share *work_shares_to_free;
+
   /* List of gomp_work_share structs chained through next_free fields.
      This is populated and taken off only by the first thread in the
      team encountering a new work sharing construct, in a critical
@@ -324,8 +385,20 @@  struct gomp_team
 
   gomp_mutex_t task_lock;
   struct gomp_task *task_queue;
-  int task_count;
-  int task_running_count;
+  /* Number of all GOMP_TASK_{WAITING,TIED} tasks in the team.  */
+  unsigned int task_count;
+  /* Number of GOMP_TASK_WAITING tasks currently waiting to be scheduled.  */
+  unsigned int task_queued_count;
+  /* Number of GOMP_TASK_{WAITING,TIED} tasks currently running
+     directly in gomp_barrier_handle_tasks; tasks spawned
+     from e.g. GOMP_taskwait or GOMP_taskgroup_end don't count, even when
+     that is called from a task run from gomp_barrier_handle_tasks.
+     task_running_count should always be <= team->nthreads,
+     and if the current task isn't in_tied_task, then it will be
+     even < team->nthreads.  */
+  unsigned int task_running_count;
+  int work_share_cancelled;
+  int team_cancelled;
 
   /* This array contains structures for implicit tasks.  */
   struct gomp_task implicit_task[];
@@ -350,7 +423,11 @@  struct gomp_thread
   /* This semaphore is used for ordered loops.  */
   gomp_sem_t release;
 
-  /* user pthread thread pool */
+  /* Place this thread is bound to plus one, or zero if not bound
+     to any place.  */
+  unsigned int place;
+
+  /* User pthread thread pool.  */
   struct gomp_thread_pool *thread_pool;
 };
 
@@ -363,11 +440,23 @@  struct gomp_thread_pool
   unsigned threads_size;
   unsigned threads_used;
   struct gomp_team *last_team;
+  /* Number of threads running in this contention group.  */
+  unsigned long threads_busy;
 
   /* This barrier holds and releases threads waiting in threads.  */
   gomp_barrier_t threads_dock;
 };
 
+enum gomp_cancel_kind
+{
+  GOMP_CANCEL_PARALLEL = 1,
+  GOMP_CANCEL_LOOP = 2,
+  GOMP_CANCEL_FOR = GOMP_CANCEL_LOOP,
+  GOMP_CANCEL_DO = GOMP_CANCEL_LOOP,
+  GOMP_CANCEL_SECTIONS = 4,
+  GOMP_CANCEL_TASKGROUP = 8
+};
+
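These are the values the compiler passes as the first argument of the new
GOMP_cancel and GOMP_cancellation_point entry points declared in libgomp_g.h;
for a cancel-for construct one would expect codegen shaped roughly like

    if (GOMP_cancel (GOMP_CANCEL_FOR, if_clause_value))
      goto loop_cancel_exit;  /* hypothetical label */

with the exact sequence left to the compiler-side patches in this series.
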
 /* ... and here is that TLS data.  */
 
 #ifdef HAVE_TLS
@@ -402,17 +491,22 @@  static inline struct gomp_task_icv *gomp
 /* The attributes to be used during thread creation.  */
 extern pthread_attr_t gomp_thread_attr;
 
-/* Other variables.  */
-
-extern unsigned short *gomp_cpu_affinity;
-extern size_t gomp_cpu_affinity_len;
-
 /* Function prototypes.  */
 
 /* affinity.c */
 
 extern void gomp_init_affinity (void);
-extern void gomp_init_thread_affinity (pthread_attr_t *);
+extern void gomp_init_thread_affinity (pthread_attr_t *, unsigned int);
+extern void **gomp_affinity_alloc (unsigned long, bool);
+extern void gomp_affinity_init_place (void *);
+extern bool gomp_affinity_add_cpus (void *, unsigned long, unsigned long,
+				    long, bool);
+extern bool gomp_affinity_remove_cpu (void *, unsigned long);
+extern bool gomp_affinity_copy_place (void *, void *, long);
+extern bool gomp_affinity_same_place (void *, void *);
+extern bool gomp_affinity_finalize_place_list (bool);
+extern bool gomp_affinity_init_level (int, unsigned long, bool);
+extern void gomp_affinity_print_place (void *);
 
 /* alloc.c */
 
@@ -486,6 +580,8 @@  extern void gomp_barrier_handle_tasks (g
 static void inline
 gomp_finish_task (struct gomp_task *task)
 {
+  if (__builtin_expect (task->depend_hash != NULL, 0))
+    free (task->depend_hash);
   gomp_sem_destroy (&task->taskwait_sem);
 }
 
@@ -493,8 +589,13 @@  gomp_finish_task (struct gomp_task *task
 
 extern struct gomp_team *gomp_new_team (unsigned);
 extern void gomp_team_start (void (*) (void *), void *, unsigned,
-			     struct gomp_team *);
+			     unsigned, struct gomp_team *);
 extern void gomp_team_end (void);
+extern void gomp_free_thread (void *);
+
+/* target.c */
+
+extern int gomp_get_num_devices (void);
 
 /* work.c */
 
@@ -502,6 +603,7 @@  extern void gomp_init_work_share (struct
 extern void gomp_fini_work_share (struct gomp_work_share *);
 extern bool gomp_work_share_start (bool);
 extern void gomp_work_share_end (void);
+extern bool gomp_work_share_end_cancel (void);
 extern void gomp_work_share_end_nowait (void);
 
 static inline void
@@ -580,11 +682,19 @@  extern int gomp_test_nest_lock_25 (omp_n
 #endif
 
 #ifdef HAVE_ATTRIBUTE_ALIAS
+# define ialias_ulp	ialias_str1(__USER_LABEL_PREFIX__)
+# define ialias_str1(x)	ialias_str2(x)
+# define ialias_str2(x)	#x
 # define ialias(fn) \
   extern __typeof (fn) gomp_ialias_##fn \
     __attribute__ ((alias (#fn))) attribute_hidden;
+# define ialias_redirect(fn) \
+  extern __typeof (fn) fn __asm__ (ialias_ulp "gomp_ialias_" #fn) attribute_hidden;
+# define ialias_call(fn) gomp_ialias_ ## fn
 #else
 # define ialias(fn)
+# define ialias_redirect(fn)
+# define ialias_call(fn) fn
 #endif
 
 #endif /* LIBGOMP_H */
--- libgomp/libgomp.map	(.../trunk)	(revision 203241)
+++ libgomp/libgomp.map	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -113,6 +113,27 @@  OMP_3.1 {
 	omp_in_final_;
 } OMP_3.0;
 
+OMP_4.0 {
+  global:
+	omp_get_cancellation;
+	omp_get_cancellation_;
+	omp_get_proc_bind;
+	omp_get_proc_bind_;
+	omp_set_default_device;
+	omp_set_default_device_;
+	omp_set_default_device_8_;
+	omp_get_default_device;
+	omp_get_default_device_;
+	omp_get_num_devices;
+	omp_get_num_devices_;
+	omp_get_num_teams;
+	omp_get_num_teams_;
+	omp_get_team_num;
+	omp_get_team_num_;
+	omp_is_initial_device;
+	omp_is_initial_device_;
+} OMP_3.1;
+
 GOMP_1.0 {
   global:
 	GOMP_atomic_end;
@@ -184,3 +205,25 @@  GOMP_3.0 {
   global:
 	GOMP_taskyield;
 } GOMP_2.0;
+
+GOMP_4.0 {
+  global:
+	GOMP_barrier_cancel;
+	GOMP_cancel;
+	GOMP_cancellation_point;
+	GOMP_loop_end_cancel;
+	GOMP_parallel_loop_dynamic;
+	GOMP_parallel_loop_guided;
+	GOMP_parallel_loop_runtime;
+	GOMP_parallel_loop_static;
+	GOMP_parallel_sections;
+	GOMP_parallel;
+	GOMP_sections_end_cancel;
+	GOMP_taskgroup_start;
+	GOMP_taskgroup_end;
+	GOMP_target;
+	GOMP_target_data;
+	GOMP_target_end_data;
+	GOMP_target_update;
+	GOMP_teams;
+} GOMP_3.0;
--- libgomp/libgomp.texi	(.../trunk)	(revision 203241)
+++ libgomp/libgomp.texi	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -1083,12 +1083,9 @@  guaranteed not to change during the exec
 @node Environment Variables
 @chapter Environment Variables
 
-The variables @env{OMP_DYNAMIC}, @env{OMP_MAX_ACTIVE_LEVELS},
-@env{OMP_NESTED}, @env{OMP_NUM_THREADS}, @env{OMP_SCHEDULE},
-@env{OMP_STACKSIZE},@env{OMP_THREAD_LIMIT} and @env{OMP_WAIT_POLICY}
-are defined by section 4 of the OpenMP specifications in version 3.1,
-while @env{GOMP_CPU_AFFINITY} and @env{GOMP_STACKSIZE} are GNU 
-extensions.
+The environment variables beginning with @env{OMP_} are defined by
+section 4 of the OpenMP specification in version 4.0, while those
+beginning with @env{GOMP_} are GNU extensions.
 
 @menu
 * OMP_DYNAMIC::           Dynamic adjustment of threads
@@ -1099,9 +1096,11 @@  extensions.
 * OMP_SCHEDULE::          How threads are scheduled
 * OMP_THREAD_LIMIT::      Set the maximum number of threads
 * OMP_WAIT_POLICY::       How waiting threads are handled
+* OMP_DISPLAY_ENV::       Show OpenMP version and environment variables
 * OMP_PROC_BIND::         Whether threads may be moved between CPUs
 * GOMP_CPU_AFFINITY::     Bind threads to specific CPUs
 * GOMP_STACKSIZE::        Set default thread stack size
+* GOMP_SPINCOUNT::        Set the busy-wait spin count
 @end menu
 
 
@@ -1119,7 +1118,7 @@  disabled by default.
 @ref{omp_set_dynamic}
 
 @item @emph{Reference}: 
-@uref{http://www.openmp.org/, OpenMP specifications v3.1}, section 4.3
+@uref{http://www.openmp.org/, OpenMP specifications v4.0}, section 4.3
 @end table
 
 
@@ -1137,7 +1136,7 @@  If undefined, the number of active level
 @ref{omp_set_max_active_levels}
 
 @item @emph{Reference}: 
-@uref{http://www.openmp.org/, OpenMP specifications v3.1}, section 4.8
+@uref{http://www.openmp.org/, OpenMP specifications v4.0}, section 4.9
 @end table
 
 
@@ -1157,7 +1156,7 @@  regions are disabled by default.
 @ref{omp_set_nested}
 
 @item @emph{Reference}: 
-@uref{http://www.openmp.org/, OpenMP specifications v3.1}, section 4.5
+@uref{http://www.openmp.org/, OpenMP specifications v4.0}, section 4.6
 @end table
 
 
@@ -1177,7 +1176,7 @@  level. If undefined one thread per CPU i
 @ref{omp_set_num_threads}
 
 @item @emph{Reference}: 
-@uref{http://www.openmp.org/, OpenMP specifications v3.1}, section 4.2
+@uref{http://www.openmp.org/, OpenMP specifications v4.0}, section 4.2
 @end table
 
 
@@ -1198,7 +1197,7 @@  dynamic scheduling and a chunk size of 1
 @ref{omp_set_schedule}
 
 @item @emph{Reference}: 
-@uref{http://www.openmp.org/, OpenMP specifications v3.1}, sections 2.5.1 and 4.1
+@uref{http://www.openmp.org/, OpenMP specifications v4.0}, sections 2.7.1 and 4.1
 @end table
 
 
@@ -1218,7 +1217,7 @@  stack size is left unchanged. If undefin
 dependent.
 
 @item @emph{Reference}: 
-@uref{http://www.openmp.org/, OpenMP specifications v3.1}, sections 4.6
+@uref{http://www.openmp.org/, OpenMP specifications v4.0}, section 4.7
 @end table
 
 
@@ -1237,7 +1236,7 @@  the number of threads is not limited.
 @ref{omp_get_thread_limit}
 
 @item @emph{Reference}: 
-@uref{http://www.openmp.org/, OpenMP specifications v3.1}, section 4.9
+@uref{http://www.openmp.org/, OpenMP specifications v4.0}, section 4.10
 @end table
 
 
@@ -1250,10 +1249,14 @@  the number of threads is not limited.
 Specifies whether waiting threads should be active or passive. If
 the value is @code{PASSIVE}, waiting threads should not consume CPU
 power while waiting, while the value @code{ACTIVE} specifies that
-they should.
+they should. If undefined, threads wait actively for a short time
+before waiting passively.
+
+@item @emph{See also}:
+@ref{GOMP_SPINCOUNT}
 
 @item @emph{Reference}: 
-@uref{http://www.openmp.org/, OpenMP specifications v3.1}, sections 4.7
+@uref{http://www.openmp.org/, OpenMP specifications v4.0}, section 4.8
 @end table
 
 
@@ -1264,14 +1267,32 @@  they should.
 @table @asis
 @item @emph{Description}:
 Specifies whether threads may be moved between processors. If set to
-@code{true}, OpenMP theads should not be moved, if set to @code{false}
-they may be moved.
+@code{TRUE}, OpenMP threads should not be moved; if set to @code{FALSE},
+they may be moved. If undefined, threads may move between processors.
 
 @item @emph{See also}:
 @ref{GOMP_CPU_AFFINITY}
 
 @item @emph{Reference}: 
-@uref{http://www.openmp.org/, OpenMP specifications v3.1}, sections 4.4
+@uref{http://www.openmp.org/, OpenMP specifications v4.0}, section 4.4
+@end table
+
+
+
+@node OMP_DISPLAY_ENV
+@section @env{OMP_DISPLAY_ENV} -- Show OpenMP version and environment variables
+@cindex Environment Variable
+@table @asis
+@item @emph{Description}:
+If set to @code{TRUE}, the OpenMP version number and the values
+associated with the OpenMP environment variables are printed to @code{stderr}.
+If set to @code{VERBOSE}, it additionally shows the values of the environment
+variables which are GNU extensions. If undefined or set to @code{FALSE},
+this information will not be shown.
+
+
+@item @emph{Reference}:
+@uref{http://www.openmp.org/, OpenMP specifications v4.0}, section 4.12
 @end table
 
 
@@ -1298,7 +1319,7 @@  Fortran, may be used to query the settin
 environment variable. A defined CPU affinity on startup cannot be changed 
 or disabled during the runtime of the application.
 
-If this environment variable is omitted, the host system will handle the 
+If this environment variable is omitted, the host system will handle the
 assignment of threads to CPUs. 
 
 @item @emph{See also}:
@@ -1330,6 +1351,33 @@  GCC Patches Mailinglist}
 @end table
 
 
+
+@node GOMP_SPINCOUNT
+@section @env{GOMP_SPINCOUNT} -- Set the busy-wait spin count
+@cindex Environment Variable
+@cindex Implementation specific setting
+@table @asis
+@item @emph{Description}:
+Determines how long a thread waits actively, consuming CPU power,
+before waiting passively without consuming CPU power. The value may be
+either @code{INFINITE} or @code{INFINITY} to always wait actively, or
+an integer which gives the number of spins of the busy-wait loop. The
+integer may optionally be followed by one of the following suffixes
+acting as multiplication factors: @code{k} (kilo, thousand), @code{M}
+(mega, million), @code{G} (giga, billion), or @code{T} (tera, trillion).
+If undefined, 0 is used when @env{OMP_WAIT_POLICY} is @code{PASSIVE},
+300,000 is used when @env{OMP_WAIT_POLICY} is undefined, and
+30 billion is used when @env{OMP_WAIT_POLICY} is @code{ACTIVE}.
+If there are more OpenMP threads than available CPUs, 1000 and 100
+spins are used when @env{OMP_WAIT_POLICY} is @code{ACTIVE} or
+undefined, respectively, unless @env{GOMP_SPINCOUNT} is lower
+or @env{OMP_WAIT_POLICY} is @code{PASSIVE}.
+
+@item @emph{See also}:
+@ref{OMP_WAIT_POLICY}
+@end table
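For instance, GOMP_SPINCOUNT=100k requests 100,000 busy-wait spins, while
GOMP_SPINCOUNT=INFINITE always waits actively.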
+
+
 
 @c ---------------------------------------------------------------------
 @c The libgomp ABI
--- libgomp/libgomp_g.h	(.../trunk)	(revision 203241)
+++ libgomp/libgomp_g.h	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -33,6 +33,7 @@ 
 /* barrier.c */
 
 extern void GOMP_barrier (void);
+extern bool GOMP_barrier_cancel (void);
 
 /* critical.c */
 
@@ -76,9 +77,22 @@  extern void GOMP_parallel_loop_guided_st
 					     unsigned, long, long, long, long);
 extern void GOMP_parallel_loop_runtime_start (void (*)(void *), void *,
 					      unsigned, long, long, long);
+extern void GOMP_parallel_loop_static (void (*)(void *), void *,
+				       unsigned, long, long, long, long,
+				       unsigned);
+extern void GOMP_parallel_loop_dynamic (void (*)(void *), void *,
+					unsigned, long, long, long, long,
+					unsigned);
+extern void GOMP_parallel_loop_guided (void (*)(void *), void *,
+				       unsigned, long, long, long, long,
+				       unsigned);
+extern void GOMP_parallel_loop_runtime (void (*)(void *), void *,
+					unsigned, long, long, long,
+					unsigned);
 
 extern void GOMP_loop_end (void);
 extern void GOMP_loop_end_nowait (void);
+extern bool GOMP_loop_end_cancel (void);
 
 /* loop_ull.c */
 
@@ -157,13 +171,18 @@  extern void GOMP_ordered_end (void);
 
 extern void GOMP_parallel_start (void (*) (void *), void *, unsigned);
 extern void GOMP_parallel_end (void);
+extern void GOMP_parallel (void (*) (void *), void *, unsigned, unsigned);
+extern bool GOMP_cancel (int, bool);
+extern bool GOMP_cancellation_point (int);
 
 /* task.c */
 
 extern void GOMP_task (void (*) (void *), void *, void (*) (void *, void *),
-		       long, long, bool, unsigned);
+		       long, long, bool, unsigned, void **);
 extern void GOMP_taskwait (void);
 extern void GOMP_taskyield (void);
+extern void GOMP_taskgroup_start (void);
+extern void GOMP_taskgroup_end (void);
 
 /* sections.c */
 
@@ -171,8 +190,11 @@  extern unsigned GOMP_sections_start (uns
 extern unsigned GOMP_sections_next (void);
 extern void GOMP_parallel_sections_start (void (*) (void *), void *,
 					  unsigned, unsigned);
+extern void GOMP_parallel_sections (void (*) (void *), void *,
+				    unsigned, unsigned, unsigned);
 extern void GOMP_sections_end (void);
 extern void GOMP_sections_end_nowait (void);
+extern bool GOMP_sections_end_cancel (void);
 
 /* single.c */
 
@@ -180,4 +202,15 @@  extern bool GOMP_single_start (void);
 extern void *GOMP_single_copy_start (void);
 extern void GOMP_single_copy_end (void *);
 
+/* target.c */
+
+extern void GOMP_target (int, void (*) (void *), const void *,
+			 size_t, void **, size_t *, unsigned char *);
+extern void GOMP_target_data (int, const void *,
+			      size_t, void **, size_t *, unsigned char *);
+extern void GOMP_target_end_data (void);
+extern void GOMP_target_update (int, const void *,
+				size_t, void **, size_t *, unsigned char *);
+extern void GOMP_teams (unsigned int, unsigned int);
+
 #endif /* LIBGOMP_G_H */
--- libgomp/loop.c	(.../trunk)	(revision 203241)
+++ libgomp/loop.c	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -439,14 +439,14 @@  static void
 gomp_parallel_loop_start (void (*fn) (void *), void *data,
 			  unsigned num_threads, long start, long end,
 			  long incr, enum gomp_schedule_type sched,
-			  long chunk_size)
+			  long chunk_size, unsigned int flags)
 {
   struct gomp_team *team;
 
   num_threads = gomp_resolve_num_threads (num_threads, 0);
   team = gomp_new_team (num_threads);
   gomp_loop_init (&team->work_shares[0], start, end, incr, sched, chunk_size);
-  gomp_team_start (fn, data, num_threads, team);
+  gomp_team_start (fn, data, num_threads, flags, team);
 }
 
 void
@@ -455,7 +455,7 @@  GOMP_parallel_loop_static_start (void (*
 				 long incr, long chunk_size)
 {
   gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
-			    GFS_STATIC, chunk_size);
+			    GFS_STATIC, chunk_size, 0);
 }
 
 void
@@ -464,7 +464,7 @@  GOMP_parallel_loop_dynamic_start (void (
 				  long incr, long chunk_size)
 {
   gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
-			    GFS_DYNAMIC, chunk_size);
+			    GFS_DYNAMIC, chunk_size, 0);
 }
 
 void
@@ -473,7 +473,7 @@  GOMP_parallel_loop_guided_start (void (*
 				 long incr, long chunk_size)
 {
   gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
-			    GFS_GUIDED, chunk_size);
+			    GFS_GUIDED, chunk_size, 0);
 }
 
 void
@@ -483,11 +483,59 @@  GOMP_parallel_loop_runtime_start (void (
 {
   struct gomp_task_icv *icv = gomp_icv (false);
   gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
-			    icv->run_sched_var, icv->run_sched_modifier);
+			    icv->run_sched_var, icv->run_sched_modifier, 0);
+}
+
+ialias_redirect (GOMP_parallel_end)
+
+void
+GOMP_parallel_loop_static (void (*fn) (void *), void *data,
+			   unsigned num_threads, long start, long end,
+			   long incr, long chunk_size, unsigned flags)
+{
+  gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
+			    GFS_STATIC, chunk_size, flags);
+  fn (data);
+  GOMP_parallel_end ();
+}
+
+void
+GOMP_parallel_loop_dynamic (void (*fn) (void *), void *data,
+			    unsigned num_threads, long start, long end,
+			    long incr, long chunk_size, unsigned flags)
+{
+  gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
+			    GFS_DYNAMIC, chunk_size, flags);
+  fn (data);
+  GOMP_parallel_end ();
+}
+
+void
+GOMP_parallel_loop_guided (void (*fn) (void *), void *data,
+			  unsigned num_threads, long start, long end,
+			  long incr, long chunk_size, unsigned flags)
+{
+  gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
+			    GFS_GUIDED, chunk_size, flags);
+  fn (data);
+  GOMP_parallel_end ();
+}
+
+void
+GOMP_parallel_loop_runtime (void (*fn) (void *), void *data,
+			    unsigned num_threads, long start, long end,
+			    long incr, unsigned flags)
+{
+  struct gomp_task_icv *icv = gomp_icv (false);
+  gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
+			    icv->run_sched_var, icv->run_sched_modifier,
+			    flags);
+  fn (data);
+  GOMP_parallel_end ();
 }
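Each of the four new GOMP_parallel_loop_* entry points folds the previous
three-step sequence (GOMP_parallel_loop_*_start, the body call,
GOMP_parallel_end) into a single call that runs the body itself.  For a
schedule(dynamic, 16) parallel loop, a caller would now emit roughly (a
sketch; names are illustrative):

    GOMP_parallel_loop_dynamic (body_fn, &data, 0 /* num_threads */,
                                start, end, incr, 16 /* chunk_size */,
                                0 /* flags */);
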
 
 /* The GOMP_loop_end* routines are called after the thread is told that
-   all loop iterations are complete.  This first version synchronizes
+   all loop iterations are complete.  The first two versions synchronize
    all threads; the nowait version does not.  */
 
 void
@@ -496,6 +544,12 @@  GOMP_loop_end (void)
   gomp_work_share_end ();
 }
 
+bool
+GOMP_loop_end_cancel (void)
+{
+  return gomp_work_share_end_cancel ();
+}
+
 void
 GOMP_loop_end_nowait (void)
 {
--- libgomp/omp.h.in	(.../trunk)	(revision 203241)
+++ libgomp/omp.h.in	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -52,6 +52,15 @@  typedef enum omp_sched_t
   omp_sched_auto = 4
 } omp_sched_t;
 
+typedef enum omp_proc_bind_t
+{
+  omp_proc_bind_false = 0,
+  omp_proc_bind_true = 1,
+  omp_proc_bind_master = 2,
+  omp_proc_bind_close = 3,
+  omp_proc_bind_spread = 4
+} omp_proc_bind_t;
+
 #ifdef __cplusplus
 extern "C" {
 # define __GOMP_NOTHROW throw ()
@@ -88,17 +97,28 @@  extern int omp_test_nest_lock (omp_nest_
 extern double omp_get_wtime (void) __GOMP_NOTHROW;
 extern double omp_get_wtick (void) __GOMP_NOTHROW;
 
-void omp_set_schedule (omp_sched_t, int) __GOMP_NOTHROW;
-void omp_get_schedule (omp_sched_t *, int *) __GOMP_NOTHROW;
-int omp_get_thread_limit (void) __GOMP_NOTHROW;
-void omp_set_max_active_levels (int) __GOMP_NOTHROW;
-int omp_get_max_active_levels (void) __GOMP_NOTHROW;
-int omp_get_level (void) __GOMP_NOTHROW;
-int omp_get_ancestor_thread_num (int) __GOMP_NOTHROW;
-int omp_get_team_size (int) __GOMP_NOTHROW;
-int omp_get_active_level (void) __GOMP_NOTHROW;
+extern void omp_set_schedule (omp_sched_t, int) __GOMP_NOTHROW;
+extern void omp_get_schedule (omp_sched_t *, int *) __GOMP_NOTHROW;
+extern int omp_get_thread_limit (void) __GOMP_NOTHROW;
+extern void omp_set_max_active_levels (int) __GOMP_NOTHROW;
+extern int omp_get_max_active_levels (void) __GOMP_NOTHROW;
+extern int omp_get_level (void) __GOMP_NOTHROW;
+extern int omp_get_ancestor_thread_num (int) __GOMP_NOTHROW;
+extern int omp_get_team_size (int) __GOMP_NOTHROW;
+extern int omp_get_active_level (void) __GOMP_NOTHROW;
+
+extern int omp_in_final (void) __GOMP_NOTHROW;
+
+extern int omp_get_cancellation (void) __GOMP_NOTHROW;
+extern omp_proc_bind_t omp_get_proc_bind (void) __GOMP_NOTHROW;
+
+extern void omp_set_default_device (int) __GOMP_NOTHROW;
+extern int omp_get_default_device (void) __GOMP_NOTHROW;
+extern int omp_get_num_devices (void) __GOMP_NOTHROW;
+extern int omp_get_num_teams (void) __GOMP_NOTHROW;
+extern int omp_get_team_num (void) __GOMP_NOTHROW;
 
-int omp_in_final (void) __GOMP_NOTHROW;
+extern int omp_is_initial_device (void) __GOMP_NOTHROW;
 
 #ifdef __cplusplus
 }
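The new omp.h declarations can be smoke-tested on the host; a minimal
sketch, assuming the omp.h generated from this patch (with no offload
devices yet, all of these report initial-device values):

  #include <stdio.h>
  #include <omp.h>

  int
  main (void)
  {
    printf ("cancellation: %d\n", omp_get_cancellation ());
    printf ("proc_bind: %d\n", (int) omp_get_proc_bind ());
    printf ("default device: %d\n", omp_get_default_device ());
    printf ("num devices: %d\n", omp_get_num_devices ());      /* 0 here */
    printf ("initial device: %d\n", omp_is_initial_device ()); /* 1 here */
    return 0;
  }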
--- libgomp/omp_lib.f90.in	(.../trunk)	(revision 203241)
+++ libgomp/omp_lib.f90.in	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -27,16 +27,22 @@ 
         integer, parameter :: omp_lock_kind = @OMP_LOCK_KIND@
         integer, parameter :: omp_nest_lock_kind = @OMP_NEST_LOCK_KIND@
         integer, parameter :: omp_sched_kind = 4
+        integer, parameter :: omp_proc_bind_kind = 4
+        integer (omp_sched_kind), parameter :: omp_sched_static = 1
+        integer (omp_sched_kind), parameter :: omp_sched_dynamic = 2
+        integer (omp_sched_kind), parameter :: omp_sched_guided = 3
+        integer (omp_sched_kind), parameter :: omp_sched_auto = 4
+        integer (omp_proc_bind_kind), parameter :: omp_proc_bind_false = 0
+        integer (omp_proc_bind_kind), parameter :: omp_proc_bind_true = 1
+        integer (omp_proc_bind_kind), parameter :: omp_proc_bind_master = 2
+        integer (omp_proc_bind_kind), parameter :: omp_proc_bind_close = 3
+        integer (omp_proc_bind_kind), parameter :: omp_proc_bind_spread = 4
       end module
 
       module omp_lib
         use omp_lib_kinds
         implicit none
         integer, parameter :: openmp_version = 201107
-        integer (omp_sched_kind), parameter :: omp_sched_static = 1
-        integer (omp_sched_kind), parameter :: omp_sched_dynamic = 2
-        integer (omp_sched_kind), parameter :: omp_sched_guided = 3
-        integer (omp_sched_kind), parameter :: omp_sched_auto = 4
 
         interface
           subroutine omp_init_lock (svar)
@@ -123,21 +129,18 @@ 
 
         interface
           function omp_get_dynamic ()
-            use omp_lib_kinds
             logical (4) :: omp_get_dynamic
           end function omp_get_dynamic
         end interface
 
         interface
           function omp_get_nested ()
-            use omp_lib_kinds
             logical (4) :: omp_get_nested
           end function omp_get_nested
         end interface
 
         interface
           function omp_in_parallel ()
-            use omp_lib_kinds
             logical (4) :: omp_in_parallel
           end function omp_in_parallel
         end interface
@@ -152,28 +155,24 @@ 
 
         interface
           function omp_get_max_threads ()
-            use omp_lib_kinds
             integer (4) :: omp_get_max_threads
           end function omp_get_max_threads
         end interface
 
         interface
           function omp_get_num_procs ()
-            use omp_lib_kinds
             integer (4) :: omp_get_num_procs
           end function omp_get_num_procs
         end interface
 
         interface
           function omp_get_num_threads ()
-            use omp_lib_kinds
             integer (4) :: omp_get_num_threads
           end function omp_get_num_threads
         end interface
 
         interface
           function omp_get_thread_num ()
-            use omp_lib_kinds
             integer (4) :: omp_get_thread_num
           end function omp_get_thread_num
         end interface
@@ -226,44 +225,37 @@ 
 
         interface
           function omp_get_thread_limit ()
-            use omp_lib_kinds
             integer (4) :: omp_get_thread_limit
           end function omp_get_thread_limit
         end interface
 
         interface omp_set_max_active_levels
           subroutine omp_set_max_active_levels (max_levels)
-            use omp_lib_kinds
             integer (4), intent (in) :: max_levels
           end subroutine omp_set_max_active_levels
           subroutine omp_set_max_active_levels_8 (max_levels)
-            use omp_lib_kinds
             integer (8), intent (in) :: max_levels
           end subroutine omp_set_max_active_levels_8
         end interface
 
         interface
           function omp_get_max_active_levels ()
-            use omp_lib_kinds
             integer (4) :: omp_get_max_active_levels
           end function omp_get_max_active_levels
         end interface
 
         interface
           function omp_get_level ()
-            use omp_lib_kinds
             integer (4) :: omp_get_level
           end function omp_get_level
         end interface
 
         interface omp_get_ancestor_thread_num
           function omp_get_ancestor_thread_num (level)
-            use omp_lib_kinds
             integer (4), intent (in) :: level
             integer (4) :: omp_get_ancestor_thread_num
           end function omp_get_ancestor_thread_num
           function omp_get_ancestor_thread_num_8 (level)
-            use omp_lib_kinds
             integer (8), intent (in) :: level
             integer (4) :: omp_get_ancestor_thread_num_8
           end function omp_get_ancestor_thread_num_8
@@ -271,12 +263,10 @@ 
 
         interface omp_get_team_size
           function omp_get_team_size (level)
-            use omp_lib_kinds
             integer (4), intent (in) :: level
             integer (4) :: omp_get_team_size
           end function omp_get_team_size
           function omp_get_team_size_8 (level)
-            use omp_lib_kinds
             integer (8), intent (in) :: level
             integer (4) :: omp_get_team_size_8
           end function omp_get_team_size_8
@@ -284,16 +274,66 @@ 
 
         interface
           function omp_get_active_level ()
-            use omp_lib_kinds
             integer (4) :: omp_get_active_level
           end function omp_get_active_level
         end interface
 
         interface
           function omp_in_final ()
-            use omp_lib_kinds
             logical (4) :: omp_in_final
           end function omp_in_final
         end interface
 
+        interface
+          function omp_get_cancellation ()
+            logical (4) :: omp_get_cancellation
+          end function omp_get_cancellation
+        end interface
+
+        interface
+          function omp_get_proc_bind ()
+            use omp_lib_kinds
+            integer (omp_proc_bind_kind) :: omp_get_proc_bind
+          end function omp_get_proc_bind
+        end interface
+
+        interface omp_set_default_device
+          subroutine omp_set_default_device (device_num)
+            integer (4), intent (in) :: device_num
+          end subroutine omp_set_default_device
+          subroutine omp_set_default_device_8 (device_num)
+            integer (8), intent (in) :: device_num
+          end subroutine omp_set_default_device_8
+        end interface
+
+        interface
+          function omp_get_default_device ()
+            integer (4) :: omp_get_default_device
+          end function omp_get_default_device
+        end interface
+
+        interface
+          function omp_get_num_devices ()
+            integer (4) :: omp_get_num_devices
+          end function omp_get_num_devices
+        end interface
+
+        interface
+          function omp_get_num_teams ()
+            integer (4) :: omp_get_num_teams
+          end function omp_get_num_teams
+        end interface
+
+        interface
+          function omp_get_team_num ()
+            integer (4) :: omp_get_team_num
+          end function omp_get_team_num
+        end interface
+
+        interface
+          function omp_is_initial_device ()
+            logical (4) :: omp_is_initial_device
+          end function omp_is_initial_device
+        end interface
+
       end module omp_lib
--- libgomp/omp_lib.h.in	(.../trunk)	(revision 203241)
+++ libgomp/omp_lib.h.in	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -33,6 +33,18 @@ 
       parameter (omp_sched_dynamic = 2)
       parameter (omp_sched_guided = 3)
       parameter (omp_sched_auto = 4)
+      integer omp_proc_bind_kind
+      parameter (omp_proc_bind_kind = 4)
+      integer (omp_proc_bind_kind) omp_proc_bind_false
+      integer (omp_proc_bind_kind) omp_proc_bind_true
+      integer (omp_proc_bind_kind) omp_proc_bind_master
+      integer (omp_proc_bind_kind) omp_proc_bind_close
+      integer (omp_proc_bind_kind) omp_proc_bind_spread
+      parameter (omp_proc_bind_false = 0)
+      parameter (omp_proc_bind_true = 1)
+      parameter (omp_proc_bind_master = 2)
+      parameter (omp_proc_bind_close = 3)
+      parameter (omp_proc_bind_spread = 4)
       parameter (openmp_version = 201107)
 
       external omp_init_lock, omp_init_nest_lock
@@ -68,3 +80,18 @@ 
 
       external omp_in_final
       logical(4) omp_in_final
+
+      external omp_get_cancellation
+      logical(4) omp_get_cancellation
+
+      external omp_get_proc_bind
+      integer(omp_proc_bind_kind) omp_get_proc_bind
+
+      external omp_set_default_device, omp_get_default_device
+      external omp_get_num_devices, omp_get_num_teams
+      external omp_get_team_num
+      integer(4) omp_get_default_device, omp_get_num_devices
+      integer(4) omp_get_num_teams, omp_get_team_num
+
+      external omp_is_initial_device
+      logical(4) omp_is_initial_device
--- libgomp/parallel.c	(.../trunk)	(revision 203241)
+++ libgomp/parallel.c	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -37,18 +37,19 @@ 
 unsigned
 gomp_resolve_num_threads (unsigned specified, unsigned count)
 {
-  struct gomp_thread *thread = gomp_thread();
+  struct gomp_thread *thr = gomp_thread ();
   struct gomp_task_icv *icv;
   unsigned threads_requested, max_num_threads, num_threads;
-  unsigned long remaining;
+  unsigned long busy;
+  struct gomp_thread_pool *pool;
 
   icv = gomp_icv (false);
 
   if (specified == 1)
     return 1;
-  else if (thread->ts.active_level >= 1 && !icv->nest_var)
+  else if (thr->ts.active_level >= 1 && !icv->nest_var)
     return 1;
-  else if (thread->ts.active_level >= gomp_max_active_levels_var)
+  else if (thr->ts.active_level >= gomp_max_active_levels_var)
     return 1;
 
   /* If NUM_THREADS not specified, use nthreads_var.  */
@@ -72,30 +73,46 @@  gomp_resolve_num_threads (unsigned speci
 	max_num_threads = count;
     }
 
-  /* ULONG_MAX stands for infinity.  */
-  if (__builtin_expect (gomp_thread_limit_var == ULONG_MAX, 1)
+  /* UINT_MAX stands for infinity.  */
+  if (__builtin_expect (icv->thread_limit_var == UINT_MAX, 1)
       || max_num_threads == 1)
     return max_num_threads;
 
+  /* The threads_busy counter lives in thread_pool; if there
+     isn't a thread_pool yet, there must be just one thread
+     in the contention group.  If thr->ts.team is NULL, this isn't
+     a nested parallel, so there is just one thread in the
+     contention group as well, and there is no need to handle it
+     atomically.  */
+  pool = thr->thread_pool;
+  if (thr->ts.team == NULL)
+    {
+      num_threads = max_num_threads;
+      if (num_threads > icv->thread_limit_var)
+	num_threads = icv->thread_limit_var;
+      if (pool)
+	pool->threads_busy = num_threads;
+      return num_threads;
+    }
+
 #ifdef HAVE_SYNC_BUILTINS
   do
     {
-      remaining = gomp_remaining_threads_count;
+      busy = pool->threads_busy;
       num_threads = max_num_threads;
-      if (num_threads > remaining)
-	num_threads = remaining + 1;
+      if (icv->thread_limit_var - busy + 1 < num_threads)
+	num_threads = icv->thread_limit_var - busy + 1;
     }
-  while (__sync_val_compare_and_swap (&gomp_remaining_threads_count,
-				      remaining, remaining - num_threads + 1)
-	 != remaining);
+  while (__sync_val_compare_and_swap (&pool->threads_busy,
+				      busy, busy + num_threads - 1)
+	 != busy);
 #else
-  gomp_mutex_lock (&gomp_remaining_threads_lock);
+  gomp_mutex_lock (&gomp_managed_threads_lock);
   num_threads = max_num_threads;
-  remaining = gomp_remaining_threads_count;
-  if (num_threads > remaining)
-    num_threads = remaining + 1;
-  gomp_remaining_threads_count -= num_threads - 1;
-  gomp_mutex_unlock (&gomp_remaining_threads_lock);
+  busy = pool->threads_busy;
+  if (icv->thread_limit_var - busy + 1 < num_threads)
+    num_threads = icv->thread_limit_var - busy + 1;
+  pool->threads_busy += num_threads - 1;
+  gomp_mutex_unlock (&gomp_managed_threads_lock);
 #endif
 
   return num_threads;
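A tiny worked example of the clipping above, with made-up numbers:

  /* With thread_limit = 8 and busy = 5 threads already accounted for
     in the contention group, a request for 6 threads is clipped to
     thread_limit - busy + 1 = 4; the +1 is the encountering thread,
     which is already counted in busy.  */
  static unsigned
  clip_to_thread_limit (unsigned thread_limit, unsigned busy,
                        unsigned request)
  {
    unsigned num_threads = request;
    if (thread_limit - busy + 1 < num_threads)
      num_threads = thread_limit - busy + 1;
    return num_threads;   /* clip_to_thread_limit (8, 5, 6) == 4 */
  }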
@@ -105,31 +122,113 @@  void
 GOMP_parallel_start (void (*fn) (void *), void *data, unsigned num_threads)
 {
   num_threads = gomp_resolve_num_threads (num_threads, 0);
-  gomp_team_start (fn, data, num_threads, gomp_new_team (num_threads));
+  gomp_team_start (fn, data, num_threads, 0, gomp_new_team (num_threads));
 }
 
 void
 GOMP_parallel_end (void)
 {
-  if (__builtin_expect (gomp_thread_limit_var != ULONG_MAX, 0))
+  struct gomp_task_icv *icv = gomp_icv (false);
+  if (__builtin_expect (icv->thread_limit_var != UINT_MAX, 0))
     {
       struct gomp_thread *thr = gomp_thread ();
       struct gomp_team *team = thr->ts.team;
-      if (team && team->nthreads > 1)
+      unsigned int nthreads = team ? team->nthreads : 1;
+      gomp_team_end ();
+      if (nthreads > 1)
 	{
+	  /* If not nested, there is just one thread in the
+	     contention group left, so there is no need for atomicity.  */
+	  if (thr->ts.team == NULL)
+	    thr->thread_pool->threads_busy = 1;
+	  else
+	    {
 #ifdef HAVE_SYNC_BUILTINS
-	  __sync_fetch_and_add (&gomp_remaining_threads_count,
-				1UL - team->nthreads);
+	      __sync_fetch_and_add (&thr->thread_pool->threads_busy,
+				    1UL - nthreads);
 #else
-	  gomp_mutex_lock (&gomp_remaining_threads_lock);
-	  gomp_remaining_threads_count -= team->nthreads - 1;
-	  gomp_mutex_unlock (&gomp_remaining_threads_lock);
+	      gomp_mutex_lock (&gomp_managed_threads_lock);
+	      thr->thread_pool->threads_busy -= nthreads - 1;
+	      gomp_mutex_unlock (&gomp_managed_threads_lock);
 #endif
+	    }
 	}
     }
-  gomp_team_end ();
+  else
+    gomp_team_end ();
+}
+ialias (GOMP_parallel_end)
+
+void
+GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads,
+	       unsigned int flags)
+{
+  num_threads = gomp_resolve_num_threads (num_threads, 0);
+  gomp_team_start (fn, data, num_threads, flags, gomp_new_team (num_threads));
+  fn (data);
+  ialias_call (GOMP_parallel_end) ();
+}
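The combined entry point means a parallel construct now lowers to a single
call; a minimal sketch (parallel_fn and spawn are hypothetical names; the
proc_bind kind travels in the low bits of FLAGS, cf. flags & 7 in
gomp_team_start):

  extern void GOMP_parallel (void (*) (void *), void *, unsigned, unsigned);

  static void
  parallel_fn (void *data)      /* hypothetical outlined region body */
  {
    (void) data;
  }

  void
  spawn (void)
  {
    /* #pragma omp parallel num_threads (4) proc_bind (close)
       becomes one call instead of GOMP_parallel_start, fn (data)
       in the master and GOMP_parallel_end.  */
    GOMP_parallel (parallel_fn, (void *) 0, 4, 3 /* omp_proc_bind_close */);
  }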
+
+bool
+GOMP_cancellation_point (int which)
+{
+  if (!gomp_cancel_var)
+    return false;
+
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+  if (which & (GOMP_CANCEL_LOOP | GOMP_CANCEL_SECTIONS))
+    {
+      if (team == NULL)
+	return false;
+      return team->work_share_cancelled != 0;
+    }
+  else if (which & GOMP_CANCEL_TASKGROUP)
+    {
+      if (thr->task->taskgroup && thr->task->taskgroup->cancelled)
+	return true;
+      /* FALLTHRU into the GOMP_CANCEL_PARALLEL case,
+	 as #pragma omp cancel parallel also cancels all explicit
+	 tasks.  */
+    }
+  if (team)
+    return gomp_team_barrier_cancelled (&team->barrier);
+  return false;
 }
+ialias (GOMP_cancellation_point)
+
+bool
+GOMP_cancel (int which, bool do_cancel)
+{
+  if (!gomp_cancel_var)
+    return false;
+
+  if (!do_cancel)
+    return ialias_call (GOMP_cancellation_point) (which);
 
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+  if (which & (GOMP_CANCEL_LOOP | GOMP_CANCEL_SECTIONS))
+    {
+      /* In an orphaned worksharing region, all we want to cancel
+	 is the current thread.  */
+      if (team != NULL)
+	team->work_share_cancelled = 1;
+      return true;
+    }
+  else if (which & GOMP_CANCEL_TASKGROUP)
+    {
+      if (thr->task->taskgroup && !thr->task->taskgroup->cancelled)
+	{
+	  gomp_mutex_lock (&team->task_lock);
+	  thr->task->taskgroup->cancelled = true;
+	  gomp_mutex_unlock (&team->task_lock);
+	}
+      return true;
+    }
+  team->team_cancelled = 1;
+  gomp_team_barrier_cancel (team);
+  return true;
+}
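A minimal sketch of how cancel and cancellation point could lower to these
entry points; step is a hypothetical helper, and the GOMP_CANCEL_LOOP value
below is an assumption for illustration (the real constants live in this
patch's libgomp_g.h):

  extern _Bool GOMP_cancel (int, _Bool);
  extern _Bool GOMP_cancellation_point (int);
  extern int step (int);        /* hypothetical loop body */

  #define GOMP_CANCEL_LOOP 1    /* assumed value, for illustration */

  void
  work (int n)
  {
    for (int i = 0; i < n; i++)
      {
        /* #pragma omp cancellation point for  */
        if (GOMP_cancellation_point (GOMP_CANCEL_LOOP))
          break;        /* branch to the end of the worksharing region */
        /* #pragma omp cancel for if (step (i))  */
        if (GOMP_cancel (GOMP_CANCEL_LOOP, step (i) != 0))
          break;
      }
  }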
 
 /* The public OpenMP API for thread and team related inquiries.  */
 
--- libgomp/sections.c	(.../trunk)	(revision 203241)
+++ libgomp/sections.c	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -139,11 +139,27 @@  GOMP_parallel_sections_start (void (*fn)
   num_threads = gomp_resolve_num_threads (num_threads, count);
   team = gomp_new_team (num_threads);
   gomp_sections_init (&team->work_shares[0], count);
-  gomp_team_start (fn, data, num_threads, team);
+  gomp_team_start (fn, data, num_threads, 0, team);
+}
+
+ialias_redirect (GOMP_parallel_end)
+
+void
+GOMP_parallel_sections (void (*fn) (void *), void *data,
+			unsigned num_threads, unsigned count, unsigned flags)
+{
+  struct gomp_team *team;
+
+  num_threads = gomp_resolve_num_threads (num_threads, count);
+  team = gomp_new_team (num_threads);
+  gomp_sections_init (&team->work_shares[0], count);
+  gomp_team_start (fn, data, num_threads, flags, team);
+  fn (data);
+  GOMP_parallel_end ();
 }
 
 /* The GOMP_sections_end* routines are called after the thread is told
-   that all sections are complete.  This first version synchronizes
+   that all sections are complete.  The first two versions synchronize
    all threads; the nowait version does not.  */
 
 void
@@ -152,6 +168,12 @@  GOMP_sections_end (void)
   gomp_work_share_end ();
 }
 
+bool
+GOMP_sections_end_cancel (void)
+{
+  return gomp_work_share_end_cancel ();
+}
+
 void
 GOMP_sections_end_nowait (void)
 {
--- libgomp/target.c	(.../trunk)	(revision 0)
+++ libgomp/target.c	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -0,0 +1,96 @@ 
+/* Copyright (C) 2013 Free Software Foundation, Inc.
+   Contributed by Jakub Jelinek <jakub@redhat.com>.
+
+   This file is part of the GNU OpenMP Library (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This file contains the host fallback implementation of the OpenMP 4.0
+   target, target data, target update and teams constructs.  */
+
+#include "libgomp.h"
+#include <limits.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+attribute_hidden int
+gomp_get_num_devices (void)
+{
+  return 0;
+}
+
+/* Called when encountering a target directive.  If DEVICE
+   is -1, it means use the device-var ICV.  If it is -2 (or any other
+   value larger than the last available hw device), use host fallback.
+   FN is the address of the host code, OPENMP_TARGET contains the value
+   of the __OPENMP_TARGET__ symbol in the shared library or binary that
+   invokes GOMP_target.  HOSTADDRS, SIZES and KINDS are arrays
+   with MAPNUM entries, containing the addresses of the host objects,
+   the sizes of the host objects (resp. for pointer kinds the pointer
+   bias with an assumed sizeof (void *) size) and the map kinds.  */
+
+void
+GOMP_target (int device, void (*fn) (void *), const void *openmp_target,
+	     size_t mapnum, void **hostaddrs, size_t *sizes,
+	     unsigned char *kinds)
+{
+  /* Host fallback.  */
+  struct gomp_thread old_thr, *thr = gomp_thread ();
+  old_thr = *thr;
+  memset (thr, '\0', sizeof (*thr));
+  if (gomp_places_list)
+    {
+      thr->place = old_thr.place;
+      thr->ts.place_partition_len = gomp_places_list_len;
+    }
+  fn (hostaddrs);
+  gomp_free_thread (thr);
+  *thr = old_thr;
+}
+
+void
+GOMP_target_data (int device, const void *openmp_target, size_t mapnum,
+		  void **hostaddrs, size_t *sizes, unsigned char *kinds)
+{
+}
+
+void
+GOMP_target_end_data (void)
+{
+}
+
+void
+GOMP_target_update (int device, const void *openmp_target, size_t mapnum,
+		    void **hostaddrs, size_t *sizes, unsigned char *kinds)
+{
+}
+
+void
+GOMP_teams (unsigned int num_teams, unsigned int thread_limit)
+{
+  if (thread_limit)
+    {
+      struct gomp_task_icv *icv = gomp_icv (true);
+      icv->thread_limit_var
+	= thread_limit > INT_MAX ? UINT_MAX : thread_limit;
+    }
+  (void) num_teams;
+}
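Since everything above is host fallback for now, a target region
degenerates to a direct call of the outlined function on the current
thread.  A minimal sketch, assuming no map clauses (target_fn and the
calling code are hypothetical):

  #include <stddef.h>

  extern void GOMP_target (int, void (*) (void *), const void *,
                           size_t, void **, size_t *, unsigned char *);

  static int result;

  static void
  target_fn (void *hostaddrs)   /* outlined target region body */
  {
    (void) hostaddrs;           /* no mapped variables in this sketch */
    result = 42;
  }

  int
  main (void)
  {
    /* device -1 selects the default-device ICV; with this patch the
       region always runs on the host, i.e. target_fn is simply called
       on the current thread.  */
    GOMP_target (-1, target_fn, NULL, 0, NULL, NULL, NULL);
    return result != 42;
  }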
--- libgomp/task.c	(.../trunk)	(revision 203241)
+++ libgomp/task.c	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -29,6 +29,33 @@ 
 #include <stdlib.h>
 #include <string.h>
 
+typedef struct gomp_task_depend_entry *hash_entry_type;
+
+static inline void *
+htab_alloc (size_t size)
+{
+  return gomp_malloc (size);
+}
+
+static inline void
+htab_free (void *ptr)
+{
+  free (ptr);
+}
+
+#include "hashtab.h"
+
+static inline hashval_t
+htab_hash (hash_entry_type element)
+{
+  return hash_pointer (element->addr);
+}
+
+static inline bool
+htab_eq (hash_entry_type x, hash_entry_type y)
+{
+  return x->addr == y->addr;
+}
 
 /* Create a new task data structure.  */
 
@@ -42,7 +69,12 @@  gomp_init_task (struct gomp_task *task,
   task->in_taskwait = false;
   task->in_tied_task = false;
   task->final_task = false;
+  task->copy_ctors_done = false;
   task->children = NULL;
+  task->taskgroup = NULL;
+  task->dependers = NULL;
+  task->depend_hash = NULL;
+  task->depend_count = 0;
   gomp_sem_init (&task->taskwait_sem, 0);
 }
 
@@ -78,7 +110,8 @@  gomp_clear_parent (struct gomp_task *chi
 
 void
 GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
-	   long arg_size, long arg_align, bool if_clause, unsigned flags)
+	   long arg_size, long arg_align, bool if_clause, unsigned flags,
+	   void **depend)
 {
   struct gomp_thread *thr = gomp_thread ();
   struct gomp_team *team = thr->ts.team;
@@ -94,17 +127,58 @@  GOMP_task (void (*fn) (void *), void *da
     flags &= ~1;
 #endif
 
+  /* If parallel or taskgroup has been cancelled, don't start new tasks.  */
+  if (team
+      && (gomp_team_barrier_cancelled (&team->barrier)
+	  || (thr->task->taskgroup && thr->task->taskgroup->cancelled)))
+    return;
+
   if (!if_clause || team == NULL
       || (thr->task && thr->task->final_task)
       || team->task_count > 64 * team->nthreads)
     {
       struct gomp_task task;
 
+      /* If there are depend clauses and earlier deferred sibling tasks
+	 with depend clauses, check whether there is a dependency.  If there
+	 is, fall through to the deferred task handling, as we can't
+	 schedule such tasks right away.  There is no need to handle
+	 depend clauses for non-deferred tasks other than this, because
+	 the parent task is suspended until the child task finishes and thus
+	 it can't start further child tasks.  */
+      if ((flags & 8) && thr->task && thr->task->depend_hash)
+	{
+	  struct gomp_task *parent = thr->task;
+	  struct gomp_task_depend_entry elem, *ent = NULL;
+	  size_t ndepend = (uintptr_t) depend[0];
+	  size_t nout = (uintptr_t) depend[1];
+	  size_t i;
+	  gomp_mutex_lock (&team->task_lock);
+	  for (i = 0; i < ndepend; i++)
+	    {
+	      elem.addr = depend[i + 2];
+	      ent = htab_find (parent->depend_hash, &elem);
+	      for (; ent; ent = ent->next)
+		if (i >= nout && ent->is_in)
+		  continue;
+		else
+		  break;
+	      if (ent)
+		break;
+	    }
+	  gomp_mutex_unlock (&team->task_lock);
+	  if (ent)
+	    goto defer;
+	}
+
       gomp_init_task (&task, thr->task, gomp_icv (false));
       task.kind = GOMP_TASK_IFFALSE;
       task.final_task = (thr->task && thr->task->final_task) || (flags & 2);
       if (thr->task)
-	task.in_tied_task = thr->task->in_tied_task;
+	{
+	  task.in_tied_task = thr->task->in_tied_task;
+	  task.taskgroup = thr->task->taskgroup;
+	}
       thr->task = &task;
       if (__builtin_expect (cpyfn != NULL, 0))
 	{
@@ -135,29 +209,161 @@  GOMP_task (void (*fn) (void *), void *da
     }
   else
     {
+     defer:;
       struct gomp_task *task;
       struct gomp_task *parent = thr->task;
+      struct gomp_taskgroup *taskgroup = parent->taskgroup;
       char *arg;
       bool do_wake;
+      size_t depend_size = 0;
 
-      task = gomp_malloc (sizeof (*task) + arg_size + arg_align - 1);
-      arg = (char *) (((uintptr_t) (task + 1) + arg_align - 1)
+      if (flags & 8)
+	depend_size = ((uintptr_t) depend[0]
+		       * sizeof (struct gomp_task_depend_entry));
+      task = gomp_malloc (sizeof (*task) + depend_size
+			  + arg_size + arg_align - 1);
+      arg = (char *) (((uintptr_t) (task + 1) + depend_size + arg_align - 1)
 		      & ~(uintptr_t) (arg_align - 1));
       gomp_init_task (task, parent, gomp_icv (false));
       task->kind = GOMP_TASK_IFFALSE;
       task->in_tied_task = parent->in_tied_task;
+      task->taskgroup = taskgroup;
       thr->task = task;
       if (cpyfn)
-	cpyfn (arg, data);
+	{
+	  cpyfn (arg, data);
+	  task->copy_ctors_done = true;
+	}
       else
 	memcpy (arg, data, arg_size);
       thr->task = parent;
       task->kind = GOMP_TASK_WAITING;
       task->fn = fn;
       task->fn_data = arg;
-      task->in_tied_task = true;
       task->final_task = (flags & 2) >> 1;
       gomp_mutex_lock (&team->task_lock);
+      /* If parallel or taskgroup has been cancelled, don't start new
+	 tasks.  */
+      if (__builtin_expect ((gomp_team_barrier_cancelled (&team->barrier)
+			     || (taskgroup && taskgroup->cancelled))
+			    && !task->copy_ctors_done, 0))
+	{
+	  gomp_mutex_unlock (&team->task_lock);
+	  gomp_finish_task (task);
+	  free (task);
+	  return;
+	}
+      if (taskgroup)
+	taskgroup->num_children++;
+      if (depend_size)
+	{
+	  size_t ndepend = (uintptr_t) depend[0];
+	  size_t nout = (uintptr_t) depend[1];
+	  size_t i;
+	  hash_entry_type ent;
+
+	  task->depend_count = ndepend;
+	  task->num_dependees = 0;
+	  if (parent->depend_hash == NULL)
+	    parent->depend_hash
+	      = htab_create (2 * ndepend > 12 ? 2 * ndepend : 12);
+	  for (i = 0; i < ndepend; i++)
+	    {
+	      task->depend[i].addr = depend[2 + i];
+	      task->depend[i].next = NULL;
+	      task->depend[i].prev = NULL;
+	      task->depend[i].task = task;
+	      task->depend[i].is_in = i >= nout;
+	      task->depend[i].redundant = false;
+
+	      hash_entry_type *slot
+		= htab_find_slot (&parent->depend_hash, &task->depend[i],
+				  INSERT);
+	      hash_entry_type out = NULL;
+	      if (*slot)
+		{
+		  /* If multiple depend clauses of the same task
+		     reference the same address, all but the first
+		     one are redundant.  As inout/out come first,
+		     if any of them is inout/out, it will win,
+		     which is the right semantics.  */
+		  if ((*slot)->task == task)
+		    {
+		      task->depend[i].redundant = true;
+		      continue;
+		    }
+		  for (ent = *slot; ent; ent = ent->next)
+		    {
+		      /* depend(in:...) doesn't depend on earlier
+			 depend(in:...).  */
+		      if (i >= nout && ent->is_in)
+			continue;
+
+		      if (!ent->is_in)
+			out = ent;
+
+		      struct gomp_task *tsk = ent->task;
+		      if (tsk->dependers == NULL)
+			{
+			  tsk->dependers
+			    = gomp_malloc (sizeof (struct gomp_dependers_vec)
+					   + 6 * sizeof (struct gomp_task *));
+			  tsk->dependers->n_elem = 1;
+			  tsk->dependers->allocated = 6;
+			  tsk->dependers->elem[0] = task;
+			  task->num_dependees++;
+			  continue;
+			}
+		      /* We already have some other dependency on tsk
+			 from an earlier depend clause.  */
+		      else if (tsk->dependers->n_elem
+			       && (tsk->dependers->elem[tsk->dependers->n_elem
+							- 1]
+				   == task))
+			continue;
+		      else if (tsk->dependers->n_elem
+			       == tsk->dependers->allocated)
+			{
+			  tsk->dependers->allocated
+			    = tsk->dependers->allocated * 2 + 2;
+			  tsk->dependers
+			    = gomp_realloc (tsk->dependers,
+					    sizeof (struct gomp_dependers_vec)
+					    + (tsk->dependers->allocated
+					       * sizeof (struct gomp_task *)));
+			}
+		      tsk->dependers->elem[tsk->dependers->n_elem++] = task;
+		      task->num_dependees++;
+		    }
+		  task->depend[i].next = *slot;
+		  (*slot)->prev = &task->depend[i];
+		}
+	      *slot = &task->depend[i];
+
+	      /* There is no need to store more than one depend({,in}out:)
+		 task per address in the hash table chain, because each out
+		 depends on all earlier outs, thus it is enough to record
+		 just the last depend({,in}out:).  For depend(in:), we need
+		 to keep all of the previous ones that haven't finished yet,
+		 because a later depend({,in}out:) might need to depend on
+		 all of them.  So, if the new task's clause is
+		 depend({,in}out:), we know there is at most one other
+		 depend({,in}out:) clause in the list (out) and to maintain
+		 the invariant we now need to remove it from the list.  */
+	      if (!task->depend[i].is_in && out)
+		{
+		  if (out->next)
+		    out->next->prev = out->prev;
+		  out->prev->next = out->next;
+		  out->redundant = true;
+		}
+	    }
+	  if (task->num_dependees)
+	    {
+	      gomp_mutex_unlock (&team->task_lock);
+	      return;
+	    }
+	}
       if (parent->children)
 	{
 	  task->next_child = parent->children;
@@ -171,6 +377,22 @@  GOMP_task (void (*fn) (void *), void *da
 	  task->prev_child = task;
 	}
       parent->children = task;
+      if (taskgroup)
+	{
+	  if (taskgroup->children)
+	    {
+	      task->next_taskgroup = taskgroup->children;
+	      task->prev_taskgroup = taskgroup->children->prev_taskgroup;
+	      task->next_taskgroup->prev_taskgroup = task;
+	      task->prev_taskgroup->next_taskgroup = task;
+	    }
+	  else
+	    {
+	      task->next_taskgroup = task;
+	      task->prev_taskgroup = task;
+	    }
+	  taskgroup->children = task;
+	}
       if (team->task_queue)
 	{
 	  task->next_queue = team->task_queue;
@@ -185,6 +407,7 @@  GOMP_task (void (*fn) (void *), void *da
 	  team->task_queue = task;
 	}
       ++team->task_count;
+      ++team->task_queued_count;
       gomp_team_barrier_set_task_pending (&team->barrier);
       do_wake = team->task_running_count + !parent->in_tied_task
 		< team->nthreads;
@@ -194,6 +417,220 @@  GOMP_task (void (*fn) (void *), void *da
     }
 }
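A minimal sketch of how a compiler could encode the depend clause for
GOMP_task; task_fn and spawn_task are hypothetical names, but the array
layout is the one read above (depend[0] = number of entries, depend[1] =
number of out/inout entries, which come first, then the addresses):

  #include <stdint.h>

  extern void GOMP_task (void (*) (void *), void *,
                         void (*) (void *, void *),
                         long, long, _Bool, unsigned, void **);

  extern void task_fn (void *);         /* hypothetical task body */
  int x, y, z;

  void
  spawn_task (void)
  {
    /* #pragma omp task depend(out: x) depend(in: y, z)  */
    void *depend[5];
    static char dummy;
    depend[0] = (void *) (uintptr_t) 3;         /* ndepend */
    depend[1] = (void *) (uintptr_t) 1;         /* nout: just x */
    depend[2] = &x;                             /* out/inout come first */
    depend[3] = &y;
    depend[4] = &z;
    /* flags bit 3 (8) signals the presence of depend, cf. flags & 8.  */
    GOMP_task (task_fn, &dummy, (void (*) (void *, void *)) 0,
               1, 1, 1, 8, depend);
  }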
 
+static inline bool
+gomp_task_run_pre (struct gomp_task *child_task, struct gomp_task *parent,
+		   struct gomp_taskgroup *taskgroup, struct gomp_team *team)
+{
+  if (parent && parent->children == child_task)
+    parent->children = child_task->next_child;
+  if (taskgroup && taskgroup->children == child_task)
+    taskgroup->children = child_task->next_taskgroup;
+  child_task->prev_queue->next_queue = child_task->next_queue;
+  child_task->next_queue->prev_queue = child_task->prev_queue;
+  if (team->task_queue == child_task)
+    {
+      if (child_task->next_queue != child_task)
+	team->task_queue = child_task->next_queue;
+      else
+	team->task_queue = NULL;
+    }
+  child_task->kind = GOMP_TASK_TIED;
+  if (--team->task_queued_count == 0)
+    gomp_team_barrier_clear_task_pending (&team->barrier);
+  if ((gomp_team_barrier_cancelled (&team->barrier)
+       || (taskgroup && taskgroup->cancelled))
+      && !child_task->copy_ctors_done)
+    return true;
+  return false;
+}
+
+static void
+gomp_task_run_post_handle_depend_hash (struct gomp_task *child_task)
+{
+  struct gomp_task *parent = child_task->parent;
+  size_t i;
+
+  for (i = 0; i < child_task->depend_count; i++)
+    if (!child_task->depend[i].redundant)
+      {
+	if (child_task->depend[i].next)
+	  child_task->depend[i].next->prev = child_task->depend[i].prev;
+	if (child_task->depend[i].prev)
+	  child_task->depend[i].prev->next = child_task->depend[i].next;
+	else
+	  {
+	    hash_entry_type *slot
+	      = htab_find_slot (&parent->depend_hash, &child_task->depend[i],
+				NO_INSERT);
+	    if (*slot != &child_task->depend[i])
+	      abort ();
+	    if (child_task->depend[i].next)
+	      *slot = child_task->depend[i].next;
+	    else
+	      htab_clear_slot (parent->depend_hash, slot);
+	  }
+      }
+}
+
+static size_t
+gomp_task_run_post_handle_dependers (struct gomp_task *child_task,
+				     struct gomp_team *team)
+{
+  struct gomp_task *parent = child_task->parent;
+  size_t i, count = child_task->dependers->n_elem, ret = 0;
+  for (i = 0; i < count; i++)
+    {
+      struct gomp_task *task = child_task->dependers->elem[i];
+      if (--task->num_dependees != 0)
+	continue;
+
+      struct gomp_taskgroup *taskgroup = task->taskgroup;
+      if (parent)
+	{
+	  if (parent->children)
+	    {
+	      task->next_child = parent->children;
+	      task->prev_child = parent->children->prev_child;
+	      task->next_child->prev_child = task;
+	      task->prev_child->next_child = task;
+	    }
+	  else
+	    {
+	      task->next_child = task;
+	      task->prev_child = task;
+	    }
+	  parent->children = task;
+	  if (parent->in_taskwait)
+	    {
+	      parent->in_taskwait = false;
+	      gomp_sem_post (&parent->taskwait_sem);
+	    }
+	}
+      if (taskgroup)
+	{
+	  if (taskgroup->children)
+	    {
+	      task->next_taskgroup = taskgroup->children;
+	      task->prev_taskgroup = taskgroup->children->prev_taskgroup;
+	      task->next_taskgroup->prev_taskgroup = task;
+	      task->prev_taskgroup->next_taskgroup = task;
+	    }
+	  else
+	    {
+	      task->next_taskgroup = task;
+	      task->prev_taskgroup = task;
+	    }
+	  taskgroup->children = task;
+	  if (taskgroup->in_taskgroup_wait)
+	    {
+	      taskgroup->in_taskgroup_wait = false;
+	      gomp_sem_post (&taskgroup->taskgroup_sem);
+	    }
+	}
+      if (team->task_queue)
+	{
+	  task->next_queue = team->task_queue;
+	  task->prev_queue = team->task_queue->prev_queue;
+	  task->next_queue->prev_queue = task;
+	  task->prev_queue->next_queue = task;
+	}
+      else
+	{
+	  task->next_queue = task;
+	  task->prev_queue = task;
+	  team->task_queue = task;
+	}
+      ++team->task_count;
+      ++team->task_queued_count;
+      ++ret;
+    }
+  free (child_task->dependers);
+  child_task->dependers = NULL;
+  if (ret > 1)
+    gomp_team_barrier_set_task_pending (&team->barrier);
+  return ret;
+}
+
+static inline size_t
+gomp_task_run_post_handle_depend (struct gomp_task *child_task,
+				  struct gomp_team *team)
+{
+  if (child_task->depend_count == 0)
+    return 0;
+
+  /* If the parent is gone already, the hash table has been freed and
+     nothing will use it anymore, so there is no need to remove anything
+     from it.  */
+  if (child_task->parent != NULL)
+    gomp_task_run_post_handle_depend_hash (child_task);
+
+  if (child_task->dependers == NULL)
+    return 0;
+
+  return gomp_task_run_post_handle_dependers (child_task, team);
+}
+
+static inline void
+gomp_task_run_post_remove_parent (struct gomp_task *child_task)
+{
+  struct gomp_task *parent = child_task->parent;
+  if (parent == NULL)
+    return;
+  child_task->prev_child->next_child = child_task->next_child;
+  child_task->next_child->prev_child = child_task->prev_child;
+  if (parent->children != child_task)
+    return;
+  if (child_task->next_child != child_task)
+    parent->children = child_task->next_child;
+  else
+    {
+      /* We access task->children in GOMP_taskwait
+	 outside of the task lock mutex region, so
+	 we need a release barrier here to ensure memory
+	 written by child_task->fn above is flushed
+	 before the NULL is written.  */
+      __atomic_store_n (&parent->children, NULL, MEMMODEL_RELEASE);
+      if (parent->in_taskwait)
+	{
+	  parent->in_taskwait = false;
+	  gomp_sem_post (&parent->taskwait_sem);
+	}
+    }
+}
+
+static inline void
+gomp_task_run_post_remove_taskgroup (struct gomp_task *child_task)
+{
+  struct gomp_taskgroup *taskgroup = child_task->taskgroup;
+  if (taskgroup == NULL)
+    return;
+  child_task->prev_taskgroup->next_taskgroup = child_task->next_taskgroup;
+  child_task->next_taskgroup->prev_taskgroup = child_task->prev_taskgroup;
+  if (taskgroup->num_children > 1)
+    --taskgroup->num_children;
+  else
+    {
+      /* We access taskgroup->num_children in GOMP_taskgroup_end
+	 outside of the task lock mutex region, so
+	 we need a release barrier here to ensure memory
+	 written by child_task->fn above is flushed
+	 before the 0 is written.  */
+      __atomic_store_n (&taskgroup->num_children, 0, MEMMODEL_RELEASE);
+    }
+  if (taskgroup->children != child_task)
+    return;
+  if (child_task->next_taskgroup != child_task)
+    taskgroup->children = child_task->next_taskgroup;
+  else
+    {
+      taskgroup->children = NULL;
+      if (taskgroup->in_taskgroup_wait)
+	{
+	  taskgroup->in_taskgroup_wait = false;
+	  gomp_sem_post (&taskgroup->taskgroup_sem);
+	}
+    }
+}
+
 void
 gomp_barrier_handle_tasks (gomp_barrier_state_t state)
 {
@@ -202,6 +639,7 @@  gomp_barrier_handle_tasks (gomp_barrier_
   struct gomp_task *task = thr->task;
   struct gomp_task *child_task = NULL;
   struct gomp_task *to_free = NULL;
+  int do_wake = 0;
 
   gomp_mutex_lock (&team->task_lock);
   if (gomp_barrier_last_thread (state))
@@ -218,26 +656,31 @@  gomp_barrier_handle_tasks (gomp_barrier_
 
   while (1)
     {
+      bool cancelled = false;
       if (team->task_queue != NULL)
 	{
-	  struct gomp_task *parent;
-
 	  child_task = team->task_queue;
-	  parent = child_task->parent;
-	  if (parent && parent->children == child_task)
-	    parent->children = child_task->next_child;
-	  child_task->prev_queue->next_queue = child_task->next_queue;
-	  child_task->next_queue->prev_queue = child_task->prev_queue;
-	  if (child_task->next_queue != child_task)
-	    team->task_queue = child_task->next_queue;
-	  else
-	    team->task_queue = NULL;
-	  child_task->kind = GOMP_TASK_TIED;
+	  cancelled = gomp_task_run_pre (child_task, child_task->parent,
+					 child_task->taskgroup, team);
+	  if (__builtin_expect (cancelled, 0))
+	    {
+	      if (to_free)
+		{
+		  gomp_finish_task (to_free);
+		  free (to_free);
+		  to_free = NULL;
+		}
+	      goto finish_cancelled;
+	    }
 	  team->task_running_count++;
-	  if (team->task_count == team->task_running_count)
-	    gomp_team_barrier_clear_task_pending (&team->barrier);
+	  child_task->in_tied_task = true;
 	}
       gomp_mutex_unlock (&team->task_lock);
+      if (do_wake)
+	{
+	  gomp_team_barrier_wake (&team->barrier, do_wake);
+	  do_wake = 0;
+	}
       if (to_free)
 	{
 	  gomp_finish_task (to_free);
@@ -255,33 +698,22 @@  gomp_barrier_handle_tasks (gomp_barrier_
       gomp_mutex_lock (&team->task_lock);
       if (child_task)
 	{
-	  struct gomp_task *parent = child_task->parent;
-	  if (parent)
-	    {
-	      child_task->prev_child->next_child = child_task->next_child;
-	      child_task->next_child->prev_child = child_task->prev_child;
-	      if (parent->children == child_task)
-		{
-		  if (child_task->next_child != child_task)
-		    parent->children = child_task->next_child;
-		  else
-		    {
-		      /* We access task->children in GOMP_taskwait
-			 outside of the task lock mutex region, so
-			 need a release barrier here to ensure memory
-			 written by child_task->fn above is flushed
-			 before the NULL is written.  */
-		      __atomic_store_n (&parent->children, NULL,
-					MEMMODEL_RELEASE);
-		      if (parent->in_taskwait)
-			gomp_sem_post (&parent->taskwait_sem);
-		    }
-		}
-	    }
+	 finish_cancelled:;
+	  size_t new_tasks
+	    = gomp_task_run_post_handle_depend (child_task, team);
+	  gomp_task_run_post_remove_parent (child_task);
 	  gomp_clear_parent (child_task->children);
+	  gomp_task_run_post_remove_taskgroup (child_task);
 	  to_free = child_task;
 	  child_task = NULL;
-	  team->task_running_count--;
+	  if (!cancelled)
+	    team->task_running_count--;
+	  if (new_tasks > 1)
+	    {
+	      do_wake = team->nthreads - team->task_running_count;
+	      if (do_wake > new_tasks)
+		do_wake = new_tasks;
+	    }
 	  if (--team->task_count == 0
 	      && gomp_team_barrier_waiting_for_tasks (&team->barrier))
 	    {
@@ -304,9 +736,10 @@  GOMP_taskwait (void)
   struct gomp_task *task = thr->task;
   struct gomp_task *child_task = NULL;
   struct gomp_task *to_free = NULL;
+  int do_wake = 0;
 
   /* The acquire barrier on load of task->children here synchronizes
-     with the write of a NULL in gomp_barrier_handle_tasks.  It is
+     with the write of a NULL in gomp_task_run_post_remove_parent.  It is
      not necessary that we synchronize with other non-NULL writes at
      this point, but we must ensure that all writes to memory by a
      child thread task work function are seen before we exit from
@@ -318,6 +751,7 @@  GOMP_taskwait (void)
   gomp_mutex_lock (&team->task_lock);
   while (1)
     {
+      bool cancelled = false;
       if (task->children == NULL)
 	{
 	  gomp_mutex_unlock (&team->task_lock);
@@ -331,26 +765,30 @@  GOMP_taskwait (void)
       if (task->children->kind == GOMP_TASK_WAITING)
 	{
 	  child_task = task->children;
-	  task->children = child_task->next_child;
-	  child_task->prev_queue->next_queue = child_task->next_queue;
-	  child_task->next_queue->prev_queue = child_task->prev_queue;
-	  if (team->task_queue == child_task)
+	  cancelled
+	    = gomp_task_run_pre (child_task, task, child_task->taskgroup,
+				 team);
+	  if (__builtin_expect (cancelled, 0))
 	    {
-	      if (child_task->next_queue != child_task)
-		team->task_queue = child_task->next_queue;
-	      else
-		team->task_queue = NULL;
+	      if (to_free)
+		{
+		  gomp_finish_task (to_free);
+		  free (to_free);
+		  to_free = NULL;
+		}
+	      goto finish_cancelled;
 	    }
-	  child_task->kind = GOMP_TASK_TIED;
-	  team->task_running_count++;
-	  if (team->task_count == team->task_running_count)
-	    gomp_team_barrier_clear_task_pending (&team->barrier);
 	}
       else
 	/* All tasks we are waiting for are already running
 	   in other threads.  Wait for them.  */
 	task->in_taskwait = true;
       gomp_mutex_unlock (&team->task_lock);
+      if (do_wake)
+	{
+	  gomp_team_barrier_wake (&team->barrier, do_wake);
+	  do_wake = 0;
+	}
       if (to_free)
 	{
 	  gomp_finish_task (to_free);
@@ -364,14 +802,13 @@  GOMP_taskwait (void)
 	  thr->task = task;
 	}
       else
-	{
-	  gomp_sem_wait (&task->taskwait_sem);
-	  task->in_taskwait = false;
-	  return;
-	}
+	gomp_sem_wait (&task->taskwait_sem);
       gomp_mutex_lock (&team->task_lock);
       if (child_task)
 	{
+	 finish_cancelled:;
+	  size_t new_tasks
+	    = gomp_task_run_post_handle_depend (child_task, team);
 	  child_task->prev_child->next_child = child_task->next_child;
 	  child_task->next_child->prev_child = child_task->prev_child;
 	  if (task->children == child_task)
@@ -382,10 +819,17 @@  GOMP_taskwait (void)
 		task->children = NULL;
 	    }
 	  gomp_clear_parent (child_task->children);
+	  gomp_task_run_post_remove_taskgroup (child_task);
 	  to_free = child_task;
 	  child_task = NULL;
 	  team->task_count--;
-	  team->task_running_count--;
+	  if (new_tasks > 1)
+	    {
+	      do_wake = team->nthreads - team->task_running_count
+			- !task->in_tied_task;
+	      if (do_wake > new_tasks)
+		do_wake = new_tasks;
+	    }
 	}
     }
 }
@@ -398,6 +842,153 @@  GOMP_taskyield (void)
   /* Nothing at the moment.  */
 }
 
+void
+GOMP_taskgroup_start (void)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+  struct gomp_task *task = thr->task;
+  struct gomp_taskgroup *taskgroup;
+
+  /* If team is NULL, all tasks are executed as
+     GOMP_TASK_IFFALSE tasks and thus all child tasks of
+     the taskgroup and their descendant tasks will be finished
+     by the time GOMP_taskgroup_end is called.  */
+  if (team == NULL)
+    return;
+  taskgroup = gomp_malloc (sizeof (struct gomp_taskgroup));
+  taskgroup->prev = task->taskgroup;
+  taskgroup->children = NULL;
+  taskgroup->in_taskgroup_wait = false;
+  taskgroup->cancelled = false;
+  taskgroup->num_children = 0;
+  gomp_sem_init (&taskgroup->taskgroup_sem, 0);
+  task->taskgroup = taskgroup;
+}
+
+void
+GOMP_taskgroup_end (void)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+  struct gomp_task *task = thr->task;
+  struct gomp_taskgroup *taskgroup;
+  struct gomp_task *child_task = NULL;
+  struct gomp_task *to_free = NULL;
+  int do_wake = 0;
+
+  if (team == NULL)
+    return;
+  taskgroup = task->taskgroup;
+
+  /* The acquire barrier on load of taskgroup->num_children here
+     synchronizes with the write of 0 in gomp_task_run_post_remove_taskgroup.
+     It is not necessary that we synchronize with other non-0 writes at
+     this point, but we must ensure that all writes to memory by a
+     child thread task work function are seen before we exit from
+     GOMP_taskgroup_end.  */
+  if (__atomic_load_n (&taskgroup->num_children, MEMMODEL_ACQUIRE) == 0)
+    goto finish;
+
+  gomp_mutex_lock (&team->task_lock);
+  while (1)
+    {
+      bool cancelled = false;
+      if (taskgroup->children == NULL)
+	{
+	  if (taskgroup->num_children)
+	    goto do_wait;
+	  gomp_mutex_unlock (&team->task_lock);
+	  if (to_free)
+	    {
+	      gomp_finish_task (to_free);
+	      free (to_free);
+	    }
+	  goto finish;
+	}
+      if (taskgroup->children->kind == GOMP_TASK_WAITING)
+	{
+	  child_task = taskgroup->children;
+	  cancelled
+	    = gomp_task_run_pre (child_task, child_task->parent, taskgroup,
+				 team);
+	  if (__builtin_expect (cancelled, 0))
+	    {
+	      if (to_free)
+		{
+		  gomp_finish_task (to_free);
+		  free (to_free);
+		  to_free = NULL;
+		}
+	      goto finish_cancelled;
+	    }
+	}
+      else
+	{
+	 do_wait:
+	  /* All tasks we are waiting for are already running
+	     in other threads.  Wait for them.  */
+	  taskgroup->in_taskgroup_wait = true;
+	}
+      gomp_mutex_unlock (&team->task_lock);
+      if (do_wake)
+	{
+	  gomp_team_barrier_wake (&team->barrier, do_wake);
+	  do_wake = 0;
+	}
+      if (to_free)
+	{
+	  gomp_finish_task (to_free);
+	  free (to_free);
+	  to_free = NULL;
+	}
+      if (child_task)
+	{
+	  thr->task = child_task;
+	  child_task->fn (child_task->fn_data);
+	  thr->task = task;
+	}
+      else
+	gomp_sem_wait (&taskgroup->taskgroup_sem);
+      gomp_mutex_lock (&team->task_lock);
+      if (child_task)
+	{
+	 finish_cancelled:;
+	  size_t new_tasks
+	    = gomp_task_run_post_handle_depend (child_task, team);
+	  child_task->prev_taskgroup->next_taskgroup
+	    = child_task->next_taskgroup;
+	  child_task->next_taskgroup->prev_taskgroup
+	    = child_task->prev_taskgroup;
+	  --taskgroup->num_children;
+	  if (taskgroup->children == child_task)
+	    {
+	      if (child_task->next_taskgroup != child_task)
+		taskgroup->children = child_task->next_taskgroup;
+	      else
+		taskgroup->children = NULL;
+	    }
+	  gomp_task_run_post_remove_parent (child_task);
+	  gomp_clear_parent (child_task->children);
+	  to_free = child_task;
+	  child_task = NULL;
+	  team->task_count--;
+	  if (new_tasks > 1)
+	    {
+	      do_wake = team->nthreads - team->task_running_count
+			- !task->in_tied_task;
+	      if (do_wake > new_tasks)
+		do_wake = new_tasks;
+	    }
+	}
+    }
+
+ finish:
+  task->taskgroup = taskgroup->prev;
+  gomp_sem_destroy (&taskgroup->taskgroup_sem);
+  free (taskgroup);
+}
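The taskgroup bracketing is a simple pair; a minimal sketch (group and the
elided GOMP_task calls are hypothetical):

  extern void GOMP_taskgroup_start (void);
  extern void GOMP_taskgroup_end (void);

  void
  group (void)
  {
    /* #pragma omp taskgroup { ... }  */
    GOMP_taskgroup_start ();
    /* ... GOMP_task calls for the tasks in the body ...  */
    GOMP_taskgroup_end ();      /* waits for the group's tasks and all
                                   their descendants, unlike taskwait,
                                   which waits only for direct children */
  }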
+
 int
 omp_in_final (void)
 {
--- libgomp/team.c	(.../trunk)	(revision 203241)
+++ libgomp/team.c	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -53,6 +53,7 @@  struct gomp_thread_start_data
   struct gomp_team_state ts;
   struct gomp_task *task;
   struct gomp_thread_pool *thread_pool;
+  unsigned int place;
   bool nested;
 };
 
@@ -84,6 +85,7 @@  gomp_thread_start (void *xdata)
   thr->thread_pool = data->thread_pool;
   thr->ts = data->ts;
   thr->task = data->task;
+  thr->place = data->place;
 
   thr->ts.team->ordered_release[thr->ts.team_id] = &thr->release;
 
@@ -98,7 +100,7 @@  gomp_thread_start (void *xdata)
       gomp_barrier_wait (&team->barrier);
 
       local_fn (local_data);
-      gomp_team_barrier_wait (&team->barrier);
+      gomp_team_barrier_wait_final (&team->barrier);
       gomp_finish_task (task);
       gomp_barrier_wait_last (&team->barrier);
     }
@@ -113,7 +115,7 @@  gomp_thread_start (void *xdata)
 	  struct gomp_task *task = thr->task;
 
 	  local_fn (local_data);
-	  gomp_team_barrier_wait (&team->barrier);
+	  gomp_team_barrier_wait_final (&team->barrier);
 	  gomp_finish_task (task);
 
 	  gomp_barrier_wait (&pool->threads_dock);
@@ -126,6 +128,8 @@  gomp_thread_start (void *xdata)
     }
 
   gomp_sem_destroy (&thr->release);
+  thr->thread_pool = NULL;
+  thr->task = NULL;
   return NULL;
 }
 
@@ -149,6 +153,7 @@  gomp_new_team (unsigned nthreads)
 #else
   gomp_mutex_init (&team->work_share_list_free_lock);
 #endif
+  team->work_shares_to_free = &team->work_shares[0];
   gomp_init_work_share (&team->work_shares[0], false, nthreads);
   team->work_shares[0].next_alloc = NULL;
   team->work_share_list_free = NULL;
@@ -167,7 +172,10 @@  gomp_new_team (unsigned nthreads)
   gomp_mutex_init (&team->task_lock);
   team->task_queue = NULL;
   team->task_count = 0;
+  team->task_queued_count = 0;
   team->task_running_count = 0;
+  team->work_share_cancelled = 0;
+  team->team_cancelled = 0;
 
   return team;
 }
@@ -199,16 +207,19 @@  static struct gomp_thread_pool *gomp_new
 static void
 gomp_free_pool_helper (void *thread_pool)
 {
+  struct gomp_thread *thr = gomp_thread ();
   struct gomp_thread_pool *pool
     = (struct gomp_thread_pool *) thread_pool;
   gomp_barrier_wait_last (&pool->threads_dock);
-  gomp_sem_destroy (&gomp_thread ()->release);
+  gomp_sem_destroy (&thr->release);
+  thr->thread_pool = NULL;
+  thr->task = NULL;
   pthread_exit (NULL);
 }
 
 /* Free a thread pool and release its threads. */
 
-static void
+void
 gomp_free_thread (void *arg __attribute__((unused)))
 {
   struct gomp_thread *thr = gomp_thread ();
@@ -236,9 +247,9 @@  gomp_free_thread (void *arg __attribute_
 	  __sync_fetch_and_add (&gomp_managed_threads,
 				1L - pool->threads_used);
 #else
-	  gomp_mutex_lock (&gomp_remaining_threads_lock);
+	  gomp_mutex_lock (&gomp_managed_threads_lock);
 	  gomp_managed_threads -= pool->threads_used - 1L;
-	  gomp_mutex_unlock (&gomp_remaining_threads_lock);
+	  gomp_mutex_unlock (&gomp_managed_threads_lock);
 #endif
 	}
       free (pool->threads);
@@ -259,7 +270,7 @@  gomp_free_thread (void *arg __attribute_
 
 void
 gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
-		 struct gomp_team *team)
+		 unsigned flags, struct gomp_team *team)
 {
   struct gomp_thread_start_data *start_data;
   struct gomp_thread *thr, *nthr;
@@ -270,17 +281,24 @@  gomp_team_start (void (*fn) (void *), vo
   unsigned i, n, old_threads_used = 0;
   pthread_attr_t thread_attr, *attr;
   unsigned long nthreads_var;
+  char bind, bind_var;
+  unsigned int s = 0, rest = 0, p = 0, k = 0;
+  unsigned int affinity_count = 0;
+  struct gomp_thread **affinity_thr = NULL;
 
   thr = gomp_thread ();
   nested = thr->ts.team != NULL;
   if (__builtin_expect (thr->thread_pool == NULL, 0))
     {
       thr->thread_pool = gomp_new_thread_pool ();
+      thr->thread_pool->threads_busy = nthreads;
       pthread_setspecific (gomp_thread_destructor, thr);
     }
   pool = thr->thread_pool;
   task = thr->task;
   icv = task ? &task->icv : &gomp_global_icv;
+  if (__builtin_expect (gomp_places_list != NULL, 0) && thr->place == 0)
+    gomp_init_affinity ();
 
   /* Always save the previous state, even if this isn't a nested team.
      In particular, we should save any work share state from an outer
@@ -303,14 +321,95 @@  gomp_team_start (void (*fn) (void *), vo
   if (__builtin_expect (gomp_nthreads_var_list != NULL, 0)
       && thr->ts.level < gomp_nthreads_var_list_len)
     nthreads_var = gomp_nthreads_var_list[thr->ts.level];
+  bind_var = icv->bind_var;
+  if (bind_var != omp_proc_bind_false && (flags & 7) != omp_proc_bind_false)
+    bind_var = flags & 7;
+  bind = bind_var;
+  if (__builtin_expect (gomp_bind_var_list != NULL, 0)
+      && thr->ts.level < gomp_bind_var_list_len)
+    bind_var = gomp_bind_var_list[thr->ts.level];
   gomp_init_task (thr->task, task, icv);
   team->implicit_task[0].icv.nthreads_var = nthreads_var;
+  team->implicit_task[0].icv.bind_var = bind_var;
 
   if (nthreads == 1)
     return;
 
   i = 1;
 
+  if (__builtin_expect (gomp_places_list != NULL, 0))
+    {
+      if (bind == omp_proc_bind_false)
+	bind = omp_proc_bind_true;
+      /* Depending on the chosen proc_bind model, set the subpartition
+	 for the master thread and initialize the helper variables
+	 P and optionally S, K and/or REST, used by the later place
+	 computation for each additional thread.  */
+      p = thr->place - 1;
+      switch (bind)
+	{
+	case omp_proc_bind_false:
+	  bind = omp_proc_bind_true;
+	  /* FALLTHRU */
+	case omp_proc_bind_true:
+	case omp_proc_bind_close:
+	  if (nthreads > thr->ts.place_partition_len)
+	    {
+	      /* T > P.  S threads will be placed in each place,
+		 and the final REST threads placed one by one
+		 into the already occupied places.  */
+	      s = nthreads / thr->ts.place_partition_len;
+	      rest = nthreads % thr->ts.place_partition_len;
+	    }
+	  else
+	    s = 1;
+	  k = 1;
+	  break;
+	case omp_proc_bind_master:
+	  /* Each thread will be bound to master's place.  */
+	  break;
+	case omp_proc_bind_spread:
+	  if (nthreads <= thr->ts.place_partition_len)
+	    {
+	      /* T <= P.  Each subpartition will have between s
+		 and s+1 places (subpartitions starting at or after
+		 rest will have s places, earlier ones s+1 places;
+		 e.g. for 10 places and 4 threads, s = 2, rest = 2,
+		 so the first two subpartitions get 3 places and the
+		 last two get 2).  Each thread will be bound to the
+		 first place in its subpartition (except for the
+		 master thread, which can be bound to another place
+		 in its subpartition).  */
+	      s = thr->ts.place_partition_len / nthreads;
+	      rest = thr->ts.place_partition_len % nthreads;
+	      rest = (s + 1) * rest + thr->ts.place_partition_off;
+	      if (p < rest)
+		{
+		  p -= (p - thr->ts.place_partition_off) % (s + 1);
+		  thr->ts.place_partition_len = s + 1;
+		}
+	      else
+		{
+		  p -= (p - rest) % s;
+		  thr->ts.place_partition_len = s;
+		}
+	      thr->ts.place_partition_off = p;
+	    }
+	  else
+	    {
+	      /* T > P.  Each subpartition will have just a single
+		 place and we'll place between s and s+1
+		 threads into each subpartition.  */
+	      s = nthreads / thr->ts.place_partition_len;
+	      rest = nthreads % thr->ts.place_partition_len;
+	      thr->ts.place_partition_off = p;
+	      thr->ts.place_partition_len = 1;
+	      k = 1;
+	    }
+	  break;
+	}
+    }
+  else
+    bind = omp_proc_bind_false;
+
   /* We only allow the reuse of idle threads for non-nested PARALLEL
      regions.  This appears to be implied by the semantics of
      threadprivate variables, but perhaps that's reading too much into
@@ -341,47 +440,244 @@  gomp_team_start (void (*fn) (void *), vo
 	 team will exit.  */
       pool->threads_used = nthreads;
 
+      /* If necessary, expand the size of the gomp_threads array.  It is
+	 expected that changes in the number of threads are rare, thus we
+	 make no effort to expand gomp_threads_size geometrically.  */
+      if (nthreads >= pool->threads_size)
+	{
+	  pool->threads_size = nthreads + 1;
+	  pool->threads
+	    = gomp_realloc (pool->threads,
+			    pool->threads_size
+			    * sizeof (struct gomp_thread *));
+	}
+
       /* Release existing idle threads.  */
       for (; i < n; ++i)
 	{
-	  nthr = pool->threads[i];
+	  unsigned int place_partition_off = thr->ts.place_partition_off;
+	  unsigned int place_partition_len = thr->ts.place_partition_len;
+	  unsigned int place = 0;
+	  if (__builtin_expect (gomp_places_list != NULL, 0))
+	    {
+	      switch (bind)
+		{
+		case omp_proc_bind_true:
+		case omp_proc_bind_close:
+		  if (k == s)
+		    {
+		      ++p;
+		      if (p == (team->prev_ts.place_partition_off
+				+ team->prev_ts.place_partition_len))
+			p = team->prev_ts.place_partition_off;
+		      k = 1;
+		      if (i == nthreads - rest)
+			s = 1;
+		    }
+		  else
+		    ++k;
+		  break;
+		case omp_proc_bind_master:
+		  break;
+		case omp_proc_bind_spread:
+		  if (k == 0)
+		    {
+		      /* T <= P.  */
+		      if (p < rest)
+			p += s + 1;
+		      else
+			p += s;
+		      if (p == (team->prev_ts.place_partition_off
+				+ team->prev_ts.place_partition_len))
+			p = team->prev_ts.place_partition_off;
+		      place_partition_off = p;
+		      if (p < rest)
+			place_partition_len = s + 1;
+		      else
+			place_partition_len = s;
+		    }
+		  else
+		    {
+		      /* T > P.  */
+		      if (k == s)
+			{
+			  ++p;
+			  if (p == (team->prev_ts.place_partition_off
+				    + team->prev_ts.place_partition_len))
+			    p = team->prev_ts.place_partition_off;
+			  k = 1;
+			  if (i == nthreads - rest)
+			    s = 1;
+			}
+		      else
+			++k;
+		      place_partition_off = p;
+		      place_partition_len = 1;
+		    }
+		  break;
+		}
+	      if (affinity_thr != NULL
+		  || (bind != omp_proc_bind_true
+		      && pool->threads[i]->place != p + 1)
+		  || pool->threads[i]->place <= place_partition_off
+		  || pool->threads[i]->place > (place_partition_off
+						+ place_partition_len))
+		{
+		  unsigned int l;
+		  if (affinity_thr == NULL)
+		    {
+		      unsigned int j;
+
+		      if (team->prev_ts.place_partition_len > 64)
+			affinity_thr
+			  = gomp_malloc (team->prev_ts.place_partition_len
+					 * sizeof (struct gomp_thread *));
+		      else
+			affinity_thr
+			  = gomp_alloca (team->prev_ts.place_partition_len
+					 * sizeof (struct gomp_thread *));
+		      memset (affinity_thr, '\0',
+			      team->prev_ts.place_partition_len
+			      * sizeof (struct gomp_thread *));
+		      for (j = i; j < old_threads_used; j++)
+			{
+			  if (pool->threads[j]->place
+			      > team->prev_ts.place_partition_off
+			      && (pool->threads[j]->place
+				  <= (team->prev_ts.place_partition_off
+				      + team->prev_ts.place_partition_len)))
+			    {
+			      l = pool->threads[j]->place - 1
+				  - team->prev_ts.place_partition_off;
+			      pool->threads[j]->data = affinity_thr[l];
+			      affinity_thr[l] = pool->threads[j];
+			    }
+			  pool->threads[j] = NULL;
+			}
+		      if (nthreads > old_threads_used)
+			memset (&pool->threads[old_threads_used],
+				'\0', ((nthreads - old_threads_used)
+				       * sizeof (struct gomp_thread *)));
+		      n = nthreads;
+		      affinity_count = old_threads_used - i;
+		    }
+		  if (affinity_count == 0)
+		    break;
+		  l = p;
+		  if (affinity_thr[l - team->prev_ts.place_partition_off]
+		      == NULL)
+		    {
+		      if (bind != omp_proc_bind_true)
+			continue;
+		      for (l = place_partition_off;
+			   l < place_partition_off + place_partition_len;
+			   l++)
+			if (affinity_thr[l - team->prev_ts.place_partition_off]
+			    != NULL)
+			  break;
+		      if (l == place_partition_off + place_partition_len)
+			continue;
+		    }
+		  nthr = affinity_thr[l - team->prev_ts.place_partition_off];
+		  affinity_thr[l - team->prev_ts.place_partition_off]
+		    = (struct gomp_thread *) nthr->data;
+		  affinity_count--;
+		  pool->threads[i] = nthr;
+		}
+	      else
+		nthr = pool->threads[i];
+	      place = p + 1;
+	    }
+	  else
+	    nthr = pool->threads[i];
 	  nthr->ts.team = team;
 	  nthr->ts.work_share = &team->work_shares[0];
 	  nthr->ts.last_work_share = NULL;
 	  nthr->ts.team_id = i;
 	  nthr->ts.level = team->prev_ts.level + 1;
 	  nthr->ts.active_level = thr->ts.active_level;
+	  nthr->ts.place_partition_off = place_partition_off;
+	  nthr->ts.place_partition_len = place_partition_len;
 #ifdef HAVE_SYNC_BUILTINS
 	  nthr->ts.single_count = 0;
 #endif
 	  nthr->ts.static_trip = 0;
 	  nthr->task = &team->implicit_task[i];
+	  nthr->place = place;
 	  gomp_init_task (nthr->task, task, icv);
 	  team->implicit_task[i].icv.nthreads_var = nthreads_var;
+	  team->implicit_task[i].icv.bind_var = bind_var;
 	  nthr->fn = fn;
 	  nthr->data = data;
 	  team->ordered_release[i] = &nthr->release;
 	}
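
/* Editorial sketch, not part of the patch: the omp_proc_bind_true/
   close arm of the switch above packs S threads per place, with the
   final REST threads landing in already occupied places.  Standalone
   model with hypothetical counts, wrapping at the partition end: */
#include <stdio.h>

int
main (void)
{
  unsigned int T = 7, P = 3;		/* T > P */
  unsigned int s = T / P, rest = T % P;	/* s = 2, rest = 1 */
  unsigned int p = 0, k = 1, i;

  printf ("thread 0 -> place %u\n", p);	/* master keeps its place */
  for (i = 1; i < T; i++)
    {
      if (k == s)
	{
	  if (++p == P)
	    p = 0;			/* wrap within the partition */
	  k = 1;
	  if (i == T - rest)
	    s = 1;			/* switch to one thread per place */
	}
      else
	++k;
      printf ("thread %u -> place %u\n", i, p);
    }
  return 0;
}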
 
+      if (__builtin_expect (affinity_thr != NULL, 0))
+	{
+	  /* If AFFINITY_THR is non-NULL just because we had to
+	     permute some threads in the pool, but we've managed
+	     to find exactly as many old threads as we would have
+	     found without affinity, we don't need to handle this
+	     specially anymore.  */
+	  if (nthreads <= old_threads_used
+	      ? (affinity_count == old_threads_used - nthreads)
+	      : (i == old_threads_used))
+	    {
+	      if (team->prev_ts.place_partition_len > 64)
+		free (affinity_thr);
+	      affinity_thr = NULL;
+	      affinity_count = 0;
+	    }
+	  else
+	    {
+	      i = 1;
+	      /* We are going to compute the places/subpartitions
+		 again from the beginning, so we need to reinitialize
+		 the variables that the switch (bind) above modified
+		 inside the loop, back to the state they had after
+		 the initial switch (bind).  */
+	      switch (bind)
+		{
+		case omp_proc_bind_true:
+		case omp_proc_bind_close:
+		  if (nthreads > thr->ts.place_partition_len)
+		    /* T > P.  S has been changed, so it needs
+		       to be recomputed.  */
+		    s = nthreads / thr->ts.place_partition_len;
+		  k = 1;
+		  p = thr->place - 1;
+		  break;
+		case omp_proc_bind_master:
+		  /* No vars have been changed.  */
+		  break;
+		case omp_proc_bind_spread:
+		  p = thr->ts.place_partition_off;
+		  if (k != 0)
+		    {
+		      /* T > P.  */
+		      s = nthreads / team->prev_ts.place_partition_len;
+		      k = 1;
+		    }
+		  break;
+		}
+
+	      /* Increase the barrier threshold to make sure all new
+		 threads and all the threads we're going to let die
+		 arrive before the team is released.  */
+	      if (affinity_count)
+		gomp_barrier_reinit (&pool->threads_dock,
+				     nthreads + affinity_count);
+	    }
+	}
+
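
/* Editorial sketch, not part of the patch: the AFFINITY_THR fixup
   above buckets the old pool threads by place, chaining them through
   their DATA field as an intrusive singly-linked list so a thread
   previously bound to a wanted place can be popped in O(1).
   Simplified model (struct thr and the counts are hypothetical): */
#include <stdio.h>
#include <string.h>

struct thr { unsigned int place; void *data; };

int
main (void)
{
  struct thr pool[4] = { { 2, 0 }, { 1, 0 }, { 2, 0 }, { 3, 0 } };
  struct thr *bucket[3];	/* one chain head per place, places 1..3 */
  int j;

  memset (bucket, 0, sizeof bucket);
  for (j = 0; j < 4; j++)
    {
      unsigned int l = pool[j].place - 1;
      pool[j].data = bucket[l];	/* push onto the place's chain */
      bucket[l] = &pool[j];
    }
  /* Pop one thread previously bound to place 2, if any.  */
  struct thr *t = bucket[1];
  if (t != NULL)
    {
      bucket[1] = (struct thr *) t->data;
      printf ("reusing thread %d for place 2\n", (int) (t - pool));
    }
  return 0;
}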
       if (i == nthreads)
 	goto do_release;
 
-      /* If necessary, expand the size of the gomp_threads array.  It is
-	 expected that changes in the number of threads are rare, thus we
-	 make no effort to expand gomp_threads_size geometrically.  */
-      if (nthreads >= pool->threads_size)
-	{
-	  pool->threads_size = nthreads + 1;
-	  pool->threads
-	    = gomp_realloc (pool->threads,
-			    pool->threads_size
-			    * sizeof (struct gomp_thread_data *));
-	}
     }
 
-  if (__builtin_expect (nthreads > old_threads_used, 0))
+  if (__builtin_expect (nthreads + affinity_count > old_threads_used, 0))
     {
-      long diff = (long) nthreads - (long) old_threads_used;
+      long diff = (long) (nthreads + affinity_count) - (long) old_threads_used;
 
       if (old_threads_used == 0)
 	--diff;
@@ -389,14 +685,14 @@  gomp_team_start (void (*fn) (void *), vo
 #ifdef HAVE_SYNC_BUILTINS
       __sync_fetch_and_add (&gomp_managed_threads, diff);
 #else
-      gomp_mutex_lock (&gomp_remaining_threads_lock);
+      gomp_mutex_lock (&gomp_managed_threads_lock);
       gomp_managed_threads += diff;
-      gomp_mutex_unlock (&gomp_remaining_threads_lock);
+      gomp_mutex_unlock (&gomp_managed_threads_lock);
 #endif
     }
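
/* Editorial sketch, not part of the patch: the gomp_managed_threads
   updates above use a GCC atomic builtin when available and fall back
   to a mutex otherwise.  The same pattern, standalone (counter and
   counter_add are hypothetical names): */
#include <pthread.h>

static long counter;
static pthread_mutex_t counter_lock = PTHREAD_MUTEX_INITIALIZER;

static void
counter_add (long diff)
{
#ifdef HAVE_SYNC_BUILTINS
  __sync_fetch_and_add (&counter, diff);
#else
  pthread_mutex_lock (&counter_lock);
  counter += diff;
  pthread_mutex_unlock (&counter_lock);
#endif
}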
 
   attr = &gomp_thread_attr;
-  if (__builtin_expect (gomp_cpu_affinity != NULL, 0))
+  if (__builtin_expect (gomp_places_list != NULL, 0))
     {
       size_t stacksize;
       pthread_attr_init (&thread_attr);
@@ -410,11 +706,78 @@  gomp_team_start (void (*fn) (void *), vo
 			    * (nthreads-i));
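
/* Editorial sketch, not part of the patch: with a places list active,
   a local pthread_attr_t is derived from the global one so each new
   thread can carry its own binding; clone_attr is a hypothetical name,
   and only the stack size is copied here: */
#include <stddef.h>
#include <pthread.h>

static void
clone_attr (pthread_attr_t *dst, const pthread_attr_t *src)
{
  size_t stacksize;

  pthread_attr_init (dst);
  pthread_attr_setdetachstate (dst, PTHREAD_CREATE_DETACHED);
  if (pthread_attr_getstacksize (src, &stacksize) == 0)
    pthread_attr_setstacksize (dst, stacksize);
}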
 
   /* Launch new threads.  */
-  for (; i < nthreads; ++i, ++start_data)
+  for (; i < nthreads; ++i)
     {
       pthread_t pt;
       int err;
 
+      start_data->ts.place_partition_off = thr->ts.place_partition_off;
+      start_data->ts.place_partition_len = thr->ts.place_partition_len;
+      start_data->place = 0;
+      if (__builtin_expect (gomp_places_list != NULL, 0))
+	{
+	  switch (bind)
+	    {
+	    case omp_proc_bind_true:
+	    case omp_proc_bind_close:
+	      if (k == s)
+		{
+		  ++p;
+		  if (p == (team->prev_ts.place_partition_off
+			    + team->prev_ts.place_partition_len))
+		    p = team->prev_ts.place_partition_off;
+		  k = 1;
+		  if (i == nthreads - rest)
+		    s = 1;
+		}
+	      else
+		++k;
+	      break;
+	    case omp_proc_bind_master:
+	      break;
+	    case omp_proc_bind_spread:
+	      if (k == 0)
+		{
+		  /* T <= P.  */
+		  if (p < rest)
+		    p += s + 1;
+		  else
+		    p += s;
+		  if (p == (team->prev_ts.place_partition_off
+			    + team->prev_ts.place_partition_len))
+		    p = team->prev_ts.place_partition_off;
+		  start_data->ts.place_partition_off = p;
+		  if (p < rest)
+		    start_data->ts.place_partition_len = s + 1;
+		  else
+		    start_data->ts.place_partition_len = s;
+		}
+	      else
+		{
+		  /* T > P.  */
+		  if (k == s)
+		    {
+		      ++p;
+		      if (p == (team->prev_ts.place_partition_off
+				+ team->prev_ts.place_partition_len))
+			p = team->prev_ts.place_partition_off;
+		      k = 1;
+		      if (i == nthreads - rest)
+			s = 1;
+		    }
+		  else
+		    ++k;
+		  start_data->ts.place_partition_off = p;
+		  start_data->ts.place_partition_len = 1;
+		}
+	      break;
+	    }
+	  start_data->place = p + 1;
+	  if (affinity_thr != NULL && pool->threads[i] != NULL)
+	    continue;
+	  gomp_init_thread_affinity (attr, p);
+	}
+
       start_data->fn = fn;
       start_data->fn_data = data;
       start_data->ts.team = team;
@@ -430,18 +793,16 @@  gomp_team_start (void (*fn) (void *), vo
       start_data->task = &team->implicit_task[i];
       gomp_init_task (start_data->task, task, icv);
       team->implicit_task[i].icv.nthreads_var = nthreads_var;
+      team->implicit_task[i].icv.bind_var = bind_var;
       start_data->thread_pool = pool;
       start_data->nested = nested;
 
-      if (gomp_cpu_affinity != NULL)
-	gomp_init_thread_affinity (attr);
-
-      err = pthread_create (&pt, attr, gomp_thread_start, start_data);
+      err = pthread_create (&pt, attr, gomp_thread_start, start_data++);
       if (err != 0)
 	gomp_fatal ("Thread creation failed: %s", strerror (err));
     }
 
-  if (__builtin_expect (gomp_cpu_affinity != NULL, 0))
+  if (__builtin_expect (gomp_places_list != NULL, 0))
     pthread_attr_destroy (&thread_attr);
 
  do_release:
@@ -450,21 +811,32 @@  gomp_team_start (void (*fn) (void *), vo
   /* Decrease the barrier threshold to match the number of threads
      that should arrive back at the end of this team.  The extra
      threads should be exiting.  Note that we arrange for this test
-     to never be true for nested teams.  */
-  if (__builtin_expect (nthreads < old_threads_used, 0))
+     to never be true for nested teams.  If AFFINITY_COUNT is non-zero,
+     the barrier as well as gomp_managed_threads was temporarily
+     set to NTHREADS + AFFINITY_COUNT.  For NTHREADS < OLD_THREADS_USED,
+     AFFINITY_COUNT, if non-zero, will always be at least
+     OLD_THREADS_USED - NTHREADS.  */
+  if (__builtin_expect (nthreads < old_threads_used, 0)
+      || __builtin_expect (affinity_count, 0))
     {
       long diff = (long) nthreads - (long) old_threads_used;
 
+      if (affinity_count)
+	diff = -affinity_count;
+
       gomp_barrier_reinit (&pool->threads_dock, nthreads);
 
 #ifdef HAVE_SYNC_BUILTINS
       __sync_fetch_and_add (&gomp_managed_threads, diff);
 #else
-      gomp_mutex_lock (&gomp_remaining_threads_lock);
+      gomp_mutex_lock (&gomp_managed_threads_lock);
       gomp_managed_threads += diff;
-      gomp_mutex_unlock (&gomp_remaining_threads_lock);
+      gomp_mutex_unlock (&gomp_managed_threads_lock);
 #endif
     }
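
/* Editorial sketch, not part of the patch: the threshold bookkeeping
   above in plain arithmetic, with hypothetical counts.  While retiring
   old threads coexist with the new team, the dock barrier waits for
   NTHREADS + AFFINITY_COUNT arrivals; afterwards it drops back to
   NTHREADS.  */
#include <assert.h>

int
main (void)
{
  unsigned int nthreads = 4, old_threads_used = 8;
  unsigned int affinity_count = 5;	/* old threads kept only to exit */

  /* Temporary threshold: covers every old thread still arriving,
     since AFFINITY_COUNT >= OLD_THREADS_USED - NTHREADS.  */
  assert (nthreads + affinity_count >= old_threads_used);
  /* Final threshold once the extra threads have exited.  */
  unsigned int final_threshold = nthreads;
  (void) final_threshold;
  return 0;
}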
+  if (__builtin_expect (affinity_thr != NULL, 0)
+      && team->prev_ts.place_partition_len > 64)
+    free (affinity_thr);
 }
 
 
@@ -477,9 +849,26 @@  gomp_team_end (void)
   struct gomp_thread *thr = gomp_thread ();
   struct gomp_team *team = thr->ts.team;
 
-  /* This barrier handles all pending explicit threads.  */
-  gomp_team_barrier_wait (&team->barrier);
-  gomp_fini_work_share (thr->ts.work_share);
+  /* This barrier handles all pending explicit threads.
+     As #pragma omp cancel parallel might leave the awaited count in
+     team->barrier in an inconsistent state, we need to use a different
+     counter here.  */
+  gomp_team_barrier_wait_final (&team->barrier);
+  if (__builtin_expect (team->team_cancelled, 0))
+    {
+      struct gomp_work_share *ws = team->work_shares_to_free;
+      do
+	{
+	  struct gomp_work_share *next_ws = gomp_ptrlock_get (&ws->next_ws);
+	  if (next_ws == NULL)
+	    gomp_ptrlock_set (&ws->next_ws, ws);
+	  gomp_fini_work_share (ws);
+	  ws = next_ws;
+	}
+      while (ws != NULL);
+    }
+  else
+    gomp_fini_work_share (thr->ts.work_share);
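
/* Editorial sketch, not part of the patch: the cancelled-team path
   above finalizes the whole chain of work shares instead of only the
   current one.  Generic shape of such a walk (struct ws and
   free_chain are hypothetical): */
#include <stdlib.h>

struct ws { struct ws *next; };

static void
free_chain (struct ws *w)
{
  while (w != NULL)
    {
      struct ws *next = w->next;	/* read the link before freeing */
      free (w);
      w = next;
    }
}

int
main (void)
{
  struct ws *a = calloc (1, sizeof *a);
  if (a == NULL)
    return 1;
  a->next = calloc (1, sizeof *a->next);
  free_chain (a);
  return 0;
}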
 
   gomp_end_task ();
   thr->ts = team->prev_ts;
@@ -489,9 +878,9 @@  gomp_team_end (void)
 #ifdef HAVE_SYNC_BUILTINS
       __sync_fetch_and_add (&gomp_managed_threads, 1L - team->nthreads);
 #else
-      gomp_mutex_lock (&gomp_remaining_threads_lock);
+      gomp_mutex_lock (&gomp_managed_threads_lock);
       gomp_managed_threads -= team->nthreads - 1L;
-      gomp_mutex_unlock (&gomp_remaining_threads_lock);
+      gomp_mutex_unlock (&gomp_managed_threads_lock);
 #endif
       /* This barrier has gomp_barrier_wait_last counterparts
 	 and ensures the team can be safely destroyed.  */
@@ -532,8 +921,6 @@  gomp_team_end (void)
 static void __attribute__((constructor))
 initialize_team (void)
 {
-  struct gomp_thread *thr;
-
 #ifndef HAVE_TLS
   static struct gomp_thread initial_thread_tls_data;
 
@@ -543,13 +930,6 @@  initialize_team (void)
 
   if (pthread_key_create (&gomp_thread_destructor, gomp_free_thread) != 0)
     gomp_fatal ("could not create thread pool destructor.");
-
-#ifdef HAVE_TLS
-  thr = &gomp_tls_data;
-#else
-  thr = &initial_thread_tls_data;
-#endif
-  gomp_sem_init (&thr->release, 0);
 }
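
/* Editorial sketch, not part of the patch: the constructor above
   registers a per-thread destructor via pthread_key_create; reduced
   standalone form (tidy and init_key are hypothetical names): */
#include <pthread.h>

static pthread_key_t key;

static void
tidy (void *p)
{
  /* Runs at thread exit for every thread whose key value is
     non-NULL.  */
  (void) p;
}

static void __attribute__((constructor))
init_key (void)
{
  if (pthread_key_create (&key, tidy) != 0)
    __builtin_trap ();
}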
 
 static void __attribute__((destructor))
--- libgomp/work.c	(.../trunk)	(revision 203241)
+++ libgomp/work.c	(.../branches/gomp-4_0-branch)	(revision 203287)
@@ -221,7 +221,10 @@  gomp_work_share_end (void)
   if (gomp_barrier_last_thread (bstate))
     {
       if (__builtin_expect (thr->ts.last_work_share != NULL, 1))
-	free_work_share (team, thr->ts.last_work_share);
+	{
+	  team->work_shares_to_free = thr->ts.work_share;
+	  free_work_share (team, thr->ts.last_work_share);
+	}
     }
 
   gomp_team_barrier_wait_end (&team->barrier, bstate);
@@ -229,6 +232,32 @@  gomp_work_share_end (void)
 }
 
 /* The current thread is done with its current work sharing construct.
+   This version implies a cancellable barrier at the end of the work-share.  */
+
+bool
+gomp_work_share_end_cancel (void)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+  gomp_barrier_state_t bstate;
+
+  /* Cancellable work sharing constructs cannot be orphaned.  */
+  bstate = gomp_barrier_wait_cancel_start (&team->barrier);
+
+  if (gomp_barrier_last_thread (bstate))
+    {
+      if (__builtin_expect (thr->ts.last_work_share != NULL, 1))
+	{
+	  team->work_shares_to_free = thr->ts.work_share;
+	  free_work_share (team, thr->ts.last_work_share);
+	}
+    }
+  thr->ts.last_work_share = NULL;
+
+  return gomp_team_barrier_wait_cancel_end (&team->barrier, bstate);
+}
+
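/* Editorial sketch, not part of the patch: a caller in the shape that
   an entry point such as GOMP_sections_end_cancel might take, under
   the assumption that it simply forwards the cancellation result
   (example_end_cancel is a hypothetical name; bool is available via
   libgomp.h in this file): */
static bool
example_end_cancel (void)
{
  /* True when the team was cancelled at the cancellable barrier.  */
  return gomp_work_share_end_cancel ();
}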
+/* The current thread is done with its current work sharing construct.
    This version does NOT imply a barrier at the end of the work-share.  */
 
 void
@@ -259,6 +288,9 @@  gomp_work_share_end_nowait (void)
 #endif
 
   if (completed == team->nthreads)
-    free_work_share (team, thr->ts.last_work_share);
+    {
+      team->work_shares_to_free = thr->ts.work_share;
+      free_work_share (team, thr->ts.last_work_share);
+    }
   thr->ts.last_work_share = NULL;
 }