[gomp4] Library side of depend clause support

Message ID 20130926183624.GI30970@tucnak.zalov.cz
State New

Commit Message

Jakub Jelinek Sept. 26, 2013, 6:36 p.m. UTC
Hi!

This patch adds depend clause support.
In GOMP_task, before queueing the task, if the task has any depend clauses
we look up the addresses in a hash table (kept in the parent task, because
only sibling tasks are ordered through the depend clause), and if there
are any dependencies on earlier started tasks, the new task
isn't added to the team, parent and taskgroup task queues, but is instead
just added into the earlier tasks' depender vectors.  Each task
also has an integer count of how many other tasks it depends on.
When a task on which something depends finishes, if the parent still exists,
its records are removed from the parent's depend address hash table,
and even if the parent doesn't exist anymore, we decrement num_dependees
of every task mentioned in its dependers vector.  If any of those
counters drops to zero, we insert the corresponding task into all the
relevant task queues.
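
For reference, a minimal example of the ordering the clause guarantees
(essentially the dep () test from the new depend-1.c below): the
depend(in: x) task must not start before the sibling depend(out: x)
task has finished.

  #include <stdlib.h>

  int
  main ()
  {
    int x = 1;
    #pragma omp parallel
    #pragma omp single
    {
      #pragma omp task shared (x) depend(out: x)
      x = 2;
      #pragma omp task shared (x) depend(in: x)
      if (x != 2)
        abort ();
    }
    return 0;
  }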

Tested on x86_64-linux.  Will commit tomorrow unless somebody complains,
but in any case would appreciate review of the changes.

2013-09-26  Jakub Jelinek  <jakub@redhat.com>

	* libgomp.h: Include stdlib.h.
	(struct gomp_task_depend_entry): New type.
	(struct gomp_task): Add dependers, depend_hash, depend_count,
	num_dependees and depend fields.
	(struct gomp_taskgroup): Add num_children field.
	(gomp_finish_task): Free depend_hash if non-NULL.
	* libgomp_g.h (GOMP_task): Add depend argument.
	* hashtab.h: New file.
	* task.c: Include hashtab.h.
	(hash_entry_type): New typedef.
	(htab_alloc, htab_free, htab_hash, htab_eq): New inlines.
	(gomp_init_task): Clear dependers, depend_hash and depend_count
	fields.
	(GOMP_task): Add depend argument, handle depend clauses.  Increment
	num_children field in taskgroup.
	(gomp_task_run_pre): Don't increment task_running_count here,
	nor clear task_pending bit.
	(gomp_task_run_post_handle_depend_hash,
	gomp_task_run_post_handle_dependers,
	gomp_task_run_post_handle_depend): New functions.
	(gomp_task_run_post_remove_parent): Clear in_taskwait before
	signalling corresponding semaphore.
	(gomp_task_run_post_remove_taskgroup): Decrement num_children
	field and make the decrement to 0 a MEMMODEL_RELEASE operation,
	rather than storing NULL to taskgroup->children.  Clear
	in_taskgroup_wait before signalling corresponding semaphore.
	(gomp_barrier_handle_tasks): Move task_running_count increment
	and task_pending bit clearing here.  Call
	gomp_task_run_post_handle_depend.  If more than one new task
	has been queued, wake other threads if needed.
	(GOMP_taskwait): Call gomp_task_run_post_handle_depend.  If more
	than one new task has been queued, wake other threads if needed.
	After waiting on taskwait_sem, enter critical section again.
	(GOMP_taskgroup_start): Initialize num_children field.
	(GOMP_taskgroup_end): Check num_children instead of children
	before critical section.  If children is NULL, but num_children
	is non-zero, wait on taskgroup_sem.  Call
	gomp_task_run_post_handle_depend.  If more than one new task has
	been queued, wake other threads if needed.  After waiting on
	taskgroup_sem, enter critical section again.
	* testsuite/libgomp.c/depend-1.c: New test.
	* testsuite/libgomp.c/depend-2.c: New test.



	Jakub

Comments

Richard Henderson Sept. 26, 2013, 10:54 p.m. UTC | #1
On 09/26/2013 11:36 AM, Jakub Jelinek wrote:
> +struct gomp_task;
>  struct gomp_taskgroup;
> +struct htab;
> +
> +struct gomp_task_depend_entry
> +{
> +  void *addr;
> +  struct gomp_task_depend_entry *next;
> +  struct gomp_task_depend_entry *prev;
> +  struct gomp_task *task;
> +  bool is_in;
> +  bool redundant;
> +};

I'm a bit confused about the combination of linked lists and reallocated
arrays.  When did you decide to use what?

> +      if ((flags & 8) && thr->task && thr->task->depend_hash)
> +	{
> +	  struct gomp_task *parent = thr->task;
> +	  struct gomp_task_depend_entry elem, *ent = NULL;
> +	  size_t ndepend = (uintptr_t) depend[0];
> +	  size_t nout = (uintptr_t) depend[1];
> +	  size_t i;
> +	  gomp_mutex_lock (&team->task_lock);
> +	  for (i = 0; i < ndepend; i++)
> +	    {
> +	      elem.addr = depend[i + 2];
> +	      ent = htab_find (parent->depend_hash, &elem);
> +	      for (; ent; ent = ent->next)
> +		if (i >= nout && ent->is_in)
> +		  continue;
> +		else
> +		  break;

I wonder if we ought to always defer tasks with dependencies and skip this lock
and search?  Unless the taskgroup is truly weird, we *are* going to have
dependencies between the tasks.  Dunno what exactly to do with final_tasks
that have unfulfilled dependencies...

I also think it would significantly clean up the code to declare a struct with
a variable tail for the depend argument.  That would eliminate all of the
casting to uintptr_t and give names to the first two entries.
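
For illustration only (the struct and field names here are made up, not
part of the patch), something along the lines of:

  struct gomp_depend_arg
  {
    uintptr_t ndepend;  /* depend[0]: total number of depend items */
    uintptr_t nout;     /* depend[1]: number of out/inout items, listed first */
    void *addr[];       /* depend[2..]: the addresses themselves */
  };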

> +		      if (tsk->dependers == NULL)
> +			{
> +			  tsk->dependers
> +			    = gomp_malloc (8 * sizeof (struct gomp_task *));
> +			  tsk->dependers[0]
> +			    = (struct gomp_task *) (uintptr_t) 1;
> +			  tsk->dependers[1]
> +			    = (struct gomp_task *) (uintptr_t) (8 - 2);
> +			  tsk->dependers[2] = task;
> +			  task->num_dependees++;
> +			  continue;

Another place for which a struct with variable tail would significantly clean
up the code.  And here's where I wonder why you're using realloc'd arrays here
as opposed to another linked list?

Perhaps what we need are smaller linked list entries like

  struct gomp_task_depend_node {
     struct gomp_task *task;
     struct gomp_task_depend_node *next;
     struct gomp_task_depend_node *prev;
  };

and a different hash table entry like

  struct gomp_task_depend_head {
    void *addr;
    struct gomp_task_depend_node *outs;
    struct gomp_task_depend_node *ins;
    size_t n_ins;
  };

If we scan the ndepend entries twice, we can find out how many nodes we need,
and allocate them with the task like you do now.  Scanning the ndepends array
twice can be sped up by only looking up the hash table entries once -- allocate a
local array of size ndepend entries and cache the lookups.
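
Roughly, as a fragment sketch only (the cached[] array and the node
counting are illustrative, not code from the patch):

  size_t ndepend = (uintptr_t) depend[0], i, nnodes = 0;
  struct gomp_task_depend_entry elem, *cached[ndepend];
  for (i = 0; i < ndepend; i++)
    {
      elem.addr = depend[2 + i];
      cached[i] = htab_find (parent->depend_hash, &elem);
      if (cached[i])
        nnodes++;  /* really: count the nodes this entry will need */
    }
  /* Allocate nnodes nodes together with the task, then walk cached[]
     again and link the nodes in without further hash table lookups.  */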

I'd say we don't need a count of the n_outs because all of them on the list
must be sequentially dependent.  Thus any new task simply depends on the
previous task in the outs list.  Thus imo it makes sense to have ins/outs point
to the tail of the list as opposed to the head.

Is it really worthwhile to detect redundant dependencies?  It seems just as
easy to add multiple dependencies and let them just fall out naturally.

OTOH, perhaps you should just go ahead with this patch and we can evolve it
incrementally.  I don't see anything technically wrong with it.


r~
Jakub Jelinek Sept. 26, 2013, 11:48 p.m. UTC | #2
On Thu, Sep 26, 2013 at 03:54:09PM -0700, Richard Henderson wrote:
> On 09/26/2013 11:36 AM, Jakub Jelinek wrote:
> > +struct gomp_task;
> >  struct gomp_taskgroup;
> > +struct htab;
> > +
> > +struct gomp_task_depend_entry
> > +{
> > +  void *addr;
> > +  struct gomp_task_depend_entry *next;
> > +  struct gomp_task_depend_entry *prev;
> > +  struct gomp_task *task;
> > +  bool is_in;
> > +  bool redundant;
> > +};
> 
> I'm a bit confused about the combination of linked lists and reallocated
> arrays.  When did you decide to use what?

I initially wanted to use linked lists only, but while I can statically
preallocate the chains for the hash table, for the depender -> dependee
chains, where a task may depend on many other tasks, that would mean having
to allocate small structures (or pool allocate them, per team?).

> I wonder if we ought to always defer tasks with dependencies and skip this lock
> and search?  Unless the taskgroup is truly weird, we *are* going to have
> dependencies between the tasks.  Dunno what exactly to do with final_tasks
> that have unfulfilled dependencies...

I think final tasks aren't a problem: if the parent is a final task, then
all the children are non-deferred, thus we never record any dependencies,
and the test for that will be cheap too (because parent->depend_hash will be
NULL).  The problem is if (0) tasks; the spec says that they must be
non-deferred unless they depend on some earlier non-finished task.  But
the cost in that case is primarily in taking the lock/unlock; the search
will stop on the first dependency found: if there aren't any, nothing will
be recorded and we don't jump to the defer label, and if there are some, we
jump there as soon as we discover the first one.

> I also think it would significantly clean up the code to declare a struct with
> a variable tail for the depend argument.  That would eliminate all of the
> casting to uintptr_t and give names to the first two entries.

I agree that if we keep using realloced vectors, a flexible array member
would make it cleaner.

> Perhaps what we need are smaller linked list entries like
> 
>   struct gomp_task_depend_node {
>      struct gomp_task *task;
>      struct gomp_task_depend_node *next;
>      struct gomp_task_depend_node *prev;
>   };

The dependee -> depender vectors (resp. linked lists) are first only pushed
to (the only thing needed during insertion is a cheap check whether the last
inserted task is the current one, to avoid having the same task multiple
times in the vector/linked list), and then walked just once when the
dependee finishes, so no removal is needed there and they can be freed all
at once; thus, for a linked list, a singly linked list would be enough for
that.  For the hash table chains, unless we want to always look up the hash
table and walk the chains for removal, we need a doubly linked list.
> 
> and a different hash table entry like
> 
>   struct gomp_task_depend_head {
>     void *addr;
>     struct gomp_task_depend_node *outs;
>     struct gomp_task_depend_node *ins;
>     size_t n_ins;
>   };

You mean that the hash table would instead contain the structures, or
pointers to these structures?  If the latter (not sure what n_ins would be
for), then we'd again need to pool allocate them.
> 
> If we scan the ndepend entries twice, we can find out how many nodes we need,
> and allocate them with the task like you do now.  Scanning the ndepends array
> twice can be sped up by only looking up the hash table entries once -- allocate a
> local array of size ndepend entries and cache the lookups.
> 
> I'd say we don't need a count of the n_outs because all of them on the list
> must be sequentially dependent.  Thus any new task simply depends on the
> previous task in the outs list.  Thus imo it makes sense to have ins/outs point
> to the tail of the list as opposed to the head.

Ah, I hadn't thought about it this way; yes, you're right that for out/inout
dependencies it is enough to remember only the last one in the hash table,
because the dependencies will form a chain on the same address and serialize
the tasks.

> Is it really worthwhile to detect redundant dependencies?  It seems just as
> easy to add multiple dependencies and let them just fall out naturally.

I just didn't want to have duplicates in the hash table chains; the
redundant flag is just a sign that the entry doesn't need to be removed
from the hash table chains.

> OTOH, perhaps you should just go ahead with this patch and we can evolve it
> incrementally.  I don't see anything technically wrong with it.

Perhaps.  What if I do just minor cleanup (use flexible array members for
the reallocated vectors, and perhaps keep only the last out/inout task
in the hash table chains rather than all of them), retest, commit and then
we can discuss/incrementally improve it?
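
E.g. something along these lines for the dependers vector, instead of
packing the count and allocated size into dependers[0] / dependers[1]
(the struct name is just illustrative):

  struct gomp_dependers_vec
  {
    size_t n_elem;            /* number of depender tasks recorded */
    size_t allocated;         /* allocated length of elem[] */
    struct gomp_task *elem[];
  };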

	Jakub

Patch

--- libgomp/libgomp.h.jj	2013-09-26 09:43:10.903930832 +0200
+++ libgomp/libgomp.h	2013-09-26 17:17:28.267001263 +0200
@@ -39,6 +39,7 @@ 
 
 #include <pthread.h>
 #include <stdbool.h>
+#include <stdlib.h>
 
 #ifdef HAVE_ATTRIBUTE_VISIBILITY
 # pragma GCC visibility push(hidden)
@@ -253,7 +254,19 @@  enum gomp_task_kind
   GOMP_TASK_TIED
 };
 
+struct gomp_task;
 struct gomp_taskgroup;
+struct htab;
+
+struct gomp_task_depend_entry
+{
+  void *addr;
+  struct gomp_task_depend_entry *next;
+  struct gomp_task_depend_entry *prev;
+  struct gomp_task *task;
+  bool is_in;
+  bool redundant;
+};
 
 /* This structure describes a "task" to be run by a thread.  */
 
@@ -268,6 +281,10 @@  struct gomp_task
   struct gomp_task *next_taskgroup;
   struct gomp_task *prev_taskgroup;
   struct gomp_taskgroup *taskgroup;
+  struct gomp_task **dependers;
+  struct htab *depend_hash;
+  size_t depend_count;
+  size_t num_dependees;
   struct gomp_task_icv icv;
   void (*fn) (void *);
   void *fn_data;
@@ -277,6 +294,7 @@  struct gomp_task
   bool final_task;
   bool copy_ctors_done;
   gomp_sem_t taskwait_sem;
+  struct gomp_task_depend_entry depend[];
 };
 
 struct gomp_taskgroup
@@ -286,6 +304,7 @@  struct gomp_taskgroup
   bool in_taskgroup_wait;
   bool cancelled;
   gomp_sem_t taskgroup_sem;
+  size_t num_children;
 };
 
 /* This structure describes a "team" of threads.  These are the threads
@@ -525,6 +544,8 @@  extern void gomp_barrier_handle_tasks (g
 static void inline
 gomp_finish_task (struct gomp_task *task)
 {
+  if (__builtin_expect (task->depend_hash != NULL, 0))
+    free (task->depend_hash);
   gomp_sem_destroy (&task->taskwait_sem);
 }
 
--- libgomp/libgomp_g.h.jj	2013-09-26 09:43:10.902930838 +0200
+++ libgomp/libgomp_g.h	2013-09-26 10:08:44.820160094 +0200
@@ -178,7 +178,7 @@  extern bool GOMP_cancellation_point (int
 /* task.c */
 
 extern void GOMP_task (void (*) (void *), void *, void (*) (void *, void *),
-		       long, long, bool, unsigned);
+		       long, long, bool, unsigned, void **);
 extern void GOMP_taskwait (void);
 extern void GOMP_taskyield (void);
 extern void GOMP_taskgroup_start (void);
--- libgomp/hashtab.h.jj	2013-09-26 10:08:51.031128932 +0200
+++ libgomp/hashtab.h	2013-09-26 18:11:51.241624867 +0200
@@ -0,0 +1,443 @@ 
+/* An expandable hash tables datatype.
+   Copyright (C) 1999-2013
+   Free Software Foundation, Inc.
+   Contributed by Vladimir Makarov <vmakarov@cygnus.com>.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+/* The hash table code copied from include/hashtab.[hc] and adjusted,
+   so that the hash table entries are in the flexible array at the end
+   of the control structure, no callbacks are used and the elements in the
+   table are of the hash_entry_type type.
+   Before including this file, define hash_entry_type type and
+   htab_alloc and htab_free functions.  After including it, define
+   htab_hash and htab_eq inline functions.   */
+
+/* This package implements basic hash table functionality.  It is possible
+   to search for an entry, create an entry and destroy an entry.
+
+   Elements in the table are generic pointers.
+
+   The size of the table is not fixed; if the occupancy of the table
+   grows too high the hash table will be expanded.
+
+   The abstract data implementation is based on generalized Algorithm D
+   from Knuth's book "The art of computer programming".  Hash table is
+   expanded by creation of new hash table and transferring elements from
+   the old table to the new table.  */
+
+/* The type for a hash code.  */
+typedef unsigned int hashval_t;
+
+static inline hashval_t htab_hash (hash_entry_type);
+static inline bool htab_eq (hash_entry_type, hash_entry_type);
+
+/* This macro defines reserved value for empty table entry.  */
+
+#define HTAB_EMPTY_ENTRY    ((hash_entry_type) 0)
+
+/* This macro defines reserved value for table entry which contained
+   a deleted element. */
+
+#define HTAB_DELETED_ENTRY  ((hash_entry_type) 1)
+
+/* Hash tables are of the following type.  The structure
+   (implementation) of this type is not needed for using the hash
+   tables.  All work with hash table should be executed only through
+   functions mentioned below.  The size of this structure is subject to
+   change.  */
+
+struct htab {
+  /* Current size (in entries) of the hash table.  */
+  size_t size;
+
+  /* Current number of elements including also deleted elements.  */
+  size_t n_elements;
+
+  /* Current number of deleted elements in the table.  */
+  size_t n_deleted;
+
+  /* Current size (in entries) of the hash table, as an index into the
+     table of primes.  */
+  unsigned int size_prime_index;
+
+  /* Table itself.  */
+  hash_entry_type entries[];
+};
+
+typedef struct htab *htab_t;
+
+/* An enum saying whether we insert into the hash table or not.  */
+enum insert_option {NO_INSERT, INSERT};
+
+/* Table of primes and multiplicative inverses.
+
+   Note that these are not minimally reduced inverses.  Unlike when generating
+   code to divide by a constant, we want to be able to use the same algorithm
+   all the time.  All of these inverses (are implied to) have bit 32 set.
+
+   For the record, the function that computed the table is in
+   libiberty/hashtab.c.  */
+
+struct prime_ent
+{
+  hashval_t prime;
+  hashval_t inv;
+  hashval_t inv_m2;	/* inverse of prime-2 */
+  hashval_t shift;
+};
+
+static struct prime_ent const prime_tab[] = {
+  {          7, 0x24924925, 0x9999999b, 2 },
+  {         13, 0x3b13b13c, 0x745d1747, 3 },
+  {         31, 0x08421085, 0x1a7b9612, 4 },
+  {         61, 0x0c9714fc, 0x15b1e5f8, 5 },
+  {        127, 0x02040811, 0x0624dd30, 6 },
+  {        251, 0x05197f7e, 0x073260a5, 7 },
+  {        509, 0x01824366, 0x02864fc8, 8 },
+  {       1021, 0x00c0906d, 0x014191f7, 9 },
+  {       2039, 0x0121456f, 0x0161e69e, 10 },
+  {       4093, 0x00300902, 0x00501908, 11 },
+  {       8191, 0x00080041, 0x00180241, 12 },
+  {      16381, 0x000c0091, 0x00140191, 13 },
+  {      32749, 0x002605a5, 0x002a06e6, 14 },
+  {      65521, 0x000f00e2, 0x00110122, 15 },
+  {     131071, 0x00008001, 0x00018003, 16 },
+  {     262139, 0x00014002, 0x0001c004, 17 },
+  {     524287, 0x00002001, 0x00006001, 18 },
+  {    1048573, 0x00003001, 0x00005001, 19 },
+  {    2097143, 0x00004801, 0x00005801, 20 },
+  {    4194301, 0x00000c01, 0x00001401, 21 },
+  {    8388593, 0x00001e01, 0x00002201, 22 },
+  {   16777213, 0x00000301, 0x00000501, 23 },
+  {   33554393, 0x00001381, 0x00001481, 24 },
+  {   67108859, 0x00000141, 0x000001c1, 25 },
+  {  134217689, 0x000004e1, 0x00000521, 26 },
+  {  268435399, 0x00000391, 0x000003b1, 27 },
+  {  536870909, 0x00000019, 0x00000029, 28 },
+  { 1073741789, 0x0000008d, 0x00000095, 29 },
+  { 2147483647, 0x00000003, 0x00000007, 30 },
+  /* Avoid "decimal constant so large it is unsigned" for 4294967291.  */
+  { 0xfffffffb, 0x00000006, 0x00000008, 31 }
+};
+
+/* The following function returns an index into the above table of the
+   nearest prime number which is greater than N, and near a power of two. */
+
+static unsigned int
+higher_prime_index (unsigned long n)
+{
+  unsigned int low = 0;
+  unsigned int high = sizeof(prime_tab) / sizeof(prime_tab[0]);
+
+  while (low != high)
+    {
+      unsigned int mid = low + (high - low) / 2;
+      if (n > prime_tab[mid].prime)
+	low = mid + 1;
+      else
+	high = mid;
+    }
+
+  /* If we've run out of primes, abort.  */
+  if (n > prime_tab[low].prime)
+    abort ();
+
+  return low;
+}
+
+/* Return the current size of given hash table.  */
+
+static inline size_t
+htab_size (htab_t htab)
+{
+  return htab->size;
+}
+
+/* Return the current number of elements in given hash table. */
+
+static inline size_t
+htab_elements (htab_t htab)
+{
+  return htab->n_elements - htab->n_deleted;
+}
+
+/* Return X % Y.  */
+
+static inline hashval_t
+htab_mod_1 (hashval_t x, hashval_t y, hashval_t inv, int shift)
+{
+  /* The multiplicative inverses computed above are for 32-bit types, and
+     requires that we be able to compute a highpart multiply.  */
+  if (sizeof (hashval_t) * __CHAR_BIT__ <= 32)
+    {
+      hashval_t t1, t2, t3, t4, q, r;
+
+      t1 = ((unsigned long long)x * inv) >> 32;
+      t2 = x - t1;
+      t3 = t2 >> 1;
+      t4 = t1 + t3;
+      q  = t4 >> shift;
+      r  = x - (q * y);
+
+      return r;
+    }
+
+  /* Otherwise just use the native division routines.  */
+  return x % y;
+}
+
+/* Compute the primary hash for HASH given HTAB's current size.  */
+
+static inline hashval_t
+htab_mod (hashval_t hash, htab_t htab)
+{
+  const struct prime_ent *p = &prime_tab[htab->size_prime_index];
+  return htab_mod_1 (hash, p->prime, p->inv, p->shift);
+}
+
+/* Compute the secondary hash for HASH given HTAB's current size.  */
+
+static inline hashval_t
+htab_mod_m2 (hashval_t hash, htab_t htab)
+{
+  const struct prime_ent *p = &prime_tab[htab->size_prime_index];
+  return 1 + htab_mod_1 (hash, p->prime - 2, p->inv_m2, p->shift);
+}
+
+/* Create hash table of size SIZE.  */
+
+static htab_t
+htab_create (size_t size)
+{
+  htab_t result;
+  unsigned int size_prime_index;
+
+  size_prime_index = higher_prime_index (size);
+  size = prime_tab[size_prime_index].prime;
+
+  result = (htab_t) htab_alloc (sizeof (struct htab)
+				+ size * sizeof (hash_entry_type));
+  result->size = size;
+  result->n_elements = 0;
+  result->n_deleted = 0;
+  result->size_prime_index = size_prime_index;
+  memset (result->entries, 0, size * sizeof (hash_entry_type));
+  return result;
+}
+
+/* Similar to htab_find_slot, but without several unwanted side effects:
+    - Does not call htab_eq when it finds an existing entry.
+    - Does not change the count of elements in the hash table.
+   This function also assumes there are no deleted entries in the table.
+   HASH is the hash value for the element to be inserted.  */
+
+static hash_entry_type *
+find_empty_slot_for_expand (htab_t htab, hashval_t hash)
+{
+  hashval_t index = htab_mod (hash, htab);
+  size_t size = htab_size (htab);
+  hash_entry_type *slot = htab->entries + index;
+  hashval_t hash2;
+
+  if (*slot == HTAB_EMPTY_ENTRY)
+    return slot;
+  else if (*slot == HTAB_DELETED_ENTRY)
+    abort ();
+
+  hash2 = htab_mod_m2 (hash, htab);
+  for (;;)
+    {
+      index += hash2;
+      if (index >= size)
+	index -= size;
+
+      slot = htab->entries + index;
+      if (*slot == HTAB_EMPTY_ENTRY)
+	return slot;
+      else if (*slot == HTAB_DELETED_ENTRY)
+	abort ();
+    }
+}
+
+/* The following function changes size of memory allocated for the
+   entries and repeatedly inserts the table elements.  The occupancy
+   of the table after the call will be about 50%.  Naturally the hash
+   table must already exist.  Remember also that the place of the
+   table entries is changed.  */
+
+static htab_t
+htab_expand (htab_t htab)
+{
+  htab_t nhtab;
+  hash_entry_type *olimit;
+  hash_entry_type *p;
+  size_t osize, elts;
+
+  osize = htab->size;
+  olimit = htab->entries + osize;
+  elts = htab_elements (htab);
+
+  /* Resize only when table after removal of unused elements is either
+     too full or too empty.  */
+  if (elts * 2 > osize || (elts * 8 < osize && osize > 32))
+    nhtab = htab_create (elts * 2);
+  else
+    nhtab = htab_create (osize - 1);
+  nhtab->n_elements = htab->n_elements - htab->n_deleted;
+
+  p = htab->entries;
+  do
+    {
+      hash_entry_type x = *p;
+
+      if (x != HTAB_EMPTY_ENTRY && x != HTAB_DELETED_ENTRY)
+	*find_empty_slot_for_expand (nhtab, htab_hash (x)) = x;
+
+      p++;
+    }
+  while (p < olimit);
+
+  htab_free (htab);
+  return nhtab;
+}
+
+/* This function searches for a hash table entry equal to the given
+   element.  It cannot be used to insert or delete an element.  */
+
+static hash_entry_type
+htab_find (htab_t htab, const hash_entry_type element)
+{
+  hashval_t index, hash2, hash = htab_hash (element);
+  size_t size;
+  hash_entry_type entry;
+
+  size = htab_size (htab);
+  index = htab_mod (hash, htab);
+
+  entry = htab->entries[index];
+  if (entry == HTAB_EMPTY_ENTRY
+      || (entry != HTAB_DELETED_ENTRY && htab_eq (entry, element)))
+    return entry;
+
+  hash2 = htab_mod_m2 (hash, htab);
+  for (;;)
+    {
+      index += hash2;
+      if (index >= size)
+	index -= size;
+
+      entry = htab->entries[index];
+      if (entry == HTAB_EMPTY_ENTRY
+	  || (entry != HTAB_DELETED_ENTRY && htab_eq (entry, element)))
+	return entry;
+    }
+}
+
+/* This function searches for a hash table slot containing an entry
+   equal to the given element.  To delete an entry, call this with
+   insert=NO_INSERT, then call htab_clear_slot on the slot returned
+   (possibly after doing some checks).  To insert an entry, call this
+   with insert=INSERT, then write the value you want into the returned
+   slot.  */
+
+static hash_entry_type *
+htab_find_slot (htab_t *htabp, const hash_entry_type element,
+		enum insert_option insert)
+{
+  hash_entry_type *first_deleted_slot;
+  hashval_t index, hash2, hash = htab_hash (element);
+  size_t size;
+  hash_entry_type entry;
+  htab_t htab = *htabp;
+
+  size = htab_size (htab);
+  if (insert == INSERT && size * 3 <= htab->n_elements * 4)
+    {
+      htab = *htabp = htab_expand (htab);
+      size = htab_size (htab);
+    }
+
+  index = htab_mod (hash, htab);
+
+  first_deleted_slot = NULL;
+
+  entry = htab->entries[index];
+  if (entry == HTAB_EMPTY_ENTRY)
+    goto empty_entry;
+  else if (entry == HTAB_DELETED_ENTRY)
+    first_deleted_slot = &htab->entries[index];
+  else if (htab_eq (entry, element))
+    return &htab->entries[index];
+
+  hash2 = htab_mod_m2 (hash, htab);
+  for (;;)
+    {
+      index += hash2;
+      if (index >= size)
+	index -= size;
+
+      entry = htab->entries[index];
+      if (entry == HTAB_EMPTY_ENTRY)
+	goto empty_entry;
+      else if (entry == HTAB_DELETED_ENTRY)
+	{
+	  if (!first_deleted_slot)
+	    first_deleted_slot = &htab->entries[index];
+	}
+      else if (htab_eq (entry, element))
+	return &htab->entries[index];
+    }
+
+ empty_entry:
+  if (insert == NO_INSERT)
+    return NULL;
+
+  if (first_deleted_slot)
+    {
+      htab->n_deleted--;
+      *first_deleted_slot = HTAB_EMPTY_ENTRY;
+      return first_deleted_slot;
+    }
+
+  htab->n_elements++;
+  return &htab->entries[index];
+}
+
+/* This function clears a specified slot in a hash table.  It is
+   useful when you've already done the lookup and don't want to do it
+   again.  */
+
+static inline void
+htab_clear_slot (htab_t htab, hash_entry_type *slot)
+{
+  if (slot < htab->entries || slot >= htab->entries + htab_size (htab)
+      || *slot == HTAB_EMPTY_ENTRY || *slot == HTAB_DELETED_ENTRY)
+    abort ();
+
+  *slot = HTAB_DELETED_ENTRY;
+  htab->n_deleted++;
+}
+
+/* Returns a hash code for pointer P. Simplified version of evahash */
+
+static inline hashval_t
+hash_pointer (const void *p)
+{
+  uintptr_t v = (uintptr_t) p;
+  if (sizeof (v) > sizeof (hashval_t))
+    v ^= v >> (sizeof (uintptr_t) / 2 * __CHAR_BIT__);
+  return v;
+}
--- libgomp/task.c.jj	2013-09-26 09:43:10.903930832 +0200
+++ libgomp/task.c	2013-09-26 19:40:17.092019688 +0200
@@ -29,6 +29,33 @@ 
 #include <stdlib.h>
 #include <string.h>
 
+typedef struct gomp_task_depend_entry *hash_entry_type;
+
+static inline void *
+htab_alloc (size_t size)
+{
+  return gomp_malloc (size);
+}
+
+static inline void
+htab_free (void *ptr)
+{
+  free (ptr);
+}
+
+#include "hashtab.h"
+
+static inline hashval_t
+htab_hash (hash_entry_type element)
+{
+  return hash_pointer (element->addr);
+}
+
+static inline bool
+htab_eq (hash_entry_type x, hash_entry_type y)
+{
+  return x->addr == y->addr;
+}
 
 /* Create a new task data structure.  */
 
@@ -45,6 +72,9 @@  gomp_init_task (struct gomp_task *task,
   task->copy_ctors_done = false;
   task->children = NULL;
   task->taskgroup = NULL;
+  task->dependers = NULL;
+  task->depend_hash = NULL;
+  task->depend_count = 0;
   gomp_sem_init (&task->taskwait_sem, 0);
 }
 
@@ -80,7 +110,8 @@  gomp_clear_parent (struct gomp_task *chi
 
 void
 GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
-	   long arg_size, long arg_align, bool if_clause, unsigned flags)
+	   long arg_size, long arg_align, bool if_clause, unsigned flags,
+	   void **depend)
 {
   struct gomp_thread *thr = gomp_thread ();
   struct gomp_team *team = thr->ts.team;
@@ -108,6 +139,38 @@  GOMP_task (void (*fn) (void *), void *da
     {
       struct gomp_task task;
 
+      /* If there are depend clauses and earlier deferred sibling tasks
+	 with depend clauses, check if there isn't a dependency.  If there
+	 is, fall through to the deferred task handling, as we can't
+	 schedule such tasks right away.  There is no need to handle
+	 depend clauses for non-deferred tasks other than this, because
+	 the parent task is suspended until the child task finishes and thus
+	 it can't start further child tasks.  */
+      if ((flags & 8) && thr->task && thr->task->depend_hash)
+	{
+	  struct gomp_task *parent = thr->task;
+	  struct gomp_task_depend_entry elem, *ent = NULL;
+	  size_t ndepend = (uintptr_t) depend[0];
+	  size_t nout = (uintptr_t) depend[1];
+	  size_t i;
+	  gomp_mutex_lock (&team->task_lock);
+	  for (i = 0; i < ndepend; i++)
+	    {
+	      elem.addr = depend[i + 2];
+	      ent = htab_find (parent->depend_hash, &elem);
+	      for (; ent; ent = ent->next)
+		if (i >= nout && ent->is_in)
+		  continue;
+		else
+		  break;
+	      if (ent)
+		break;
+	    }
+	  gomp_mutex_unlock (&team->task_lock);
+	  if (ent)
+	    goto defer;
+	}
+
       gomp_init_task (&task, thr->task, gomp_icv (false));
       task.kind = GOMP_TASK_IFFALSE;
       task.final_task = (thr->task && thr->task->final_task) || (flags & 2);
@@ -146,14 +209,20 @@  GOMP_task (void (*fn) (void *), void *da
     }
   else
     {
+     defer:;
       struct gomp_task *task;
       struct gomp_task *parent = thr->task;
       struct gomp_taskgroup *taskgroup = parent->taskgroup;
       char *arg;
       bool do_wake;
+      size_t depend_size = 0;
 
-      task = gomp_malloc (sizeof (*task) + arg_size + arg_align - 1);
-      arg = (char *) (((uintptr_t) (task + 1) + arg_align - 1)
+      if (flags & 8)
+	depend_size = ((uintptr_t) depend[0]
+		       * sizeof (struct gomp_task_depend_entry));
+      task = gomp_malloc (sizeof (*task) + depend_size
+			  + arg_size + arg_align - 1);
+      arg = (char *) (((uintptr_t) (task + 1) + depend_size + arg_align - 1)
 		      & ~(uintptr_t) (arg_align - 1));
       gomp_init_task (task, parent, gomp_icv (false));
       task->kind = GOMP_TASK_IFFALSE;
@@ -171,7 +240,6 @@  GOMP_task (void (*fn) (void *), void *da
       task->kind = GOMP_TASK_WAITING;
       task->fn = fn;
       task->fn_data = arg;
-      task->in_tied_task = true;
       task->final_task = (flags & 2) >> 1;
       gomp_mutex_lock (&team->task_lock);
       /* If parallel or taskgroup has been cancelled, don't start new
@@ -185,6 +253,99 @@  GOMP_task (void (*fn) (void *), void *da
 	  free (task);
 	  return;
 	}
+      if (taskgroup)
+	taskgroup->num_children++;
+      if (depend_size)
+	{
+	  size_t ndepend = (uintptr_t) depend[0];
+	  size_t nout = (uintptr_t) depend[1];
+	  size_t i;
+	  hash_entry_type ent;
+
+	  task->depend_count = ndepend;
+	  task->num_dependees = 0;
+	  if (parent->depend_hash == NULL)
+	    parent->depend_hash
+	      = htab_create (2 * ndepend > 12 ? 2 * ndepend : 12);
+	  for (i = 0; i < ndepend; i++)
+	    {
+	      task->depend[i].addr = depend[2 + i];
+	      task->depend[i].next = NULL;
+	      task->depend[i].prev = NULL;
+	      task->depend[i].task = task;
+	      task->depend[i].is_in = i >= nout;
+	      task->depend[i].redundant = false;
+	      hash_entry_type *slot
+		= htab_find_slot (&parent->depend_hash, &task->depend[i],
+				  INSERT);
+	      if (*slot)
+		{
+		  /* If multiple depends on the same task are the
+		     same, all but the first one are redundant.
+		     As inout/out come first, if any of them is
+		     inout/out, it will win, which is the right
+		     semantics.  */
+		  if ((*slot)->task == task)
+		    {
+		      task->depend[i].redundant = true;
+		      continue;
+		    }
+		  for (ent = *slot; ent; ent = ent->next)
+		    {
+		      /* depend(in:...) doesn't depend on earlier
+			 depend(in:...).  */
+		      if (i >= nout && ent->is_in)
+			continue;
+		      struct gomp_task *tsk = ent->task;
+		      if (tsk->dependers == NULL)
+			{
+			  tsk->dependers
+			    = gomp_malloc (8 * sizeof (struct gomp_task *));
+			  tsk->dependers[0]
+			    = (struct gomp_task *) (uintptr_t) 1;
+			  tsk->dependers[1]
+			    = (struct gomp_task *) (uintptr_t) (8 - 2);
+			  tsk->dependers[2] = task;
+			  task->num_dependees++;
+			  continue;
+			}
+		      /* We already have some other dependency on tsk
+			 from earlier depend clause.  */
+		      else if (tsk->dependers[0]
+			       && (tsk->dependers[((uintptr_t)
+						   tsk->dependers[0]) + 1]
+				   == task))
+			continue;
+		      else if ((uintptr_t) tsk->dependers[0]
+			       == (uintptr_t) tsk->dependers[1])
+			{
+			  size_t count = ((uintptr_t) tsk->dependers[1]
+					  + 2) * 2;
+			  tsk->dependers
+			    = gomp_realloc (tsk->dependers,
+					    count
+					    * sizeof (struct gomp_task *));
+			  tsk->dependers[1]
+			    = (struct gomp_task *) (uintptr_t) (count - 2);
+			}
+		      tsk->dependers[((uintptr_t) tsk->dependers[0]) + 2]
+			= task;
+		      tsk->dependers[0]
+			= (struct gomp_task *)
+			  (((uintptr_t) tsk->dependers[0]) + 1);
+		      task->num_dependees++;
+		    }
+		  task->depend[i].next = *slot;
+		  (*slot)->prev = &task->depend[i];
+		}
+	      *slot = &task->depend[i];
+	    }
+	  if (task->num_dependees)
+	    {
+	      gomp_mutex_unlock (&team->task_lock);
+	      return;
+	    }
+	}
       if (parent->children)
 	{
 	  task->next_child = parent->children;
@@ -259,12 +420,133 @@  gomp_task_run_pre (struct gomp_task *chi
        || (taskgroup && taskgroup->cancelled))
       && !child_task->copy_ctors_done)
     return true;
-  team->task_running_count++;
-  if (team->task_count == team->task_running_count)
-    gomp_team_barrier_clear_task_pending (&team->barrier);
   return false;
 }
 
+static void
+gomp_task_run_post_handle_depend_hash (struct gomp_task *child_task)
+{
+  struct gomp_task *parent = child_task->parent;
+  size_t i;
+
+  for (i = 0; i < child_task->depend_count; i++)
+    if (!child_task->depend[i].redundant)
+      {
+	if (child_task->depend[i].next)
+	  child_task->depend[i].next->prev = child_task->depend[i].prev;
+	if (child_task->depend[i].prev)
+	  child_task->depend[i].prev->next = child_task->depend[i].next;
+	else
+	  {
+	    hash_entry_type *slot
+	      = htab_find_slot (&parent->depend_hash, &child_task->depend[i],
+				NO_INSERT);
+	    if (*slot != &child_task->depend[i])
+	      abort ();
+	    if (child_task->depend[i].next)
+	      *slot = child_task->depend[i].next;
+	    else
+	      htab_clear_slot (parent->depend_hash, slot);
+	  }
+      }
+}
+
+static size_t
+gomp_task_run_post_handle_dependers (struct gomp_task *child_task,
+				     struct gomp_team *team)
+{
+  struct gomp_task *parent = child_task->parent;
+  size_t i, count = (uintptr_t) child_task->dependers[0], ret = 0;
+  for (i = 0; i < count; i++)
+    {
+      struct gomp_task *task = child_task->dependers[i + 2];
+      if (--task->num_dependees != 0)
+	continue;
+
+      struct gomp_taskgroup *taskgroup = task->taskgroup;
+      if (parent)
+	{
+	  if (parent->children)
+	    {
+	      task->next_child = parent->children;
+	      task->prev_child = parent->children->prev_child;
+	      task->next_child->prev_child = task;
+	      task->prev_child->next_child = task;
+	    }
+	  else
+	    {
+	      task->next_child = task;
+	      task->prev_child = task;
+	    }
+	  parent->children = task;
+	  if (parent->in_taskwait)
+	    {
+	      parent->in_taskwait = false;
+	      gomp_sem_post (&parent->taskwait_sem);
+	    }
+	}
+      if (taskgroup)
+	{
+	  if (taskgroup->children)
+	    {
+	      task->next_taskgroup = taskgroup->children;
+	      task->prev_taskgroup = taskgroup->children->prev_taskgroup;
+	      task->next_taskgroup->prev_taskgroup = task;
+	      task->prev_taskgroup->next_taskgroup = task;
+	    }
+	  else
+	    {
+	      task->next_taskgroup = task;
+	      task->prev_taskgroup = task;
+	    }
+	  taskgroup->children = task;
+	  if (taskgroup->in_taskgroup_wait)
+	    {
+	      taskgroup->in_taskgroup_wait = false;
+	      gomp_sem_post (&taskgroup->taskgroup_sem);
+	    }
+	}
+      if (team->task_queue)
+	{
+	  task->next_queue = team->task_queue;
+	  task->prev_queue = team->task_queue->prev_queue;
+	  task->next_queue->prev_queue = task;
+	  task->prev_queue->next_queue = task;
+	}
+      else
+	{
+	  task->next_queue = task;
+	  task->prev_queue = task;
+	  team->task_queue = task;
+	}
+      ++team->task_count;
+      ++ret;
+    }
+  free (child_task->dependers);
+  child_task->dependers = NULL;
+  if (ret > 1)
+    gomp_team_barrier_set_task_pending (&team->barrier);
+  return ret;
+}
+
+static inline size_t
+gomp_task_run_post_handle_depend (struct gomp_task *child_task,
+				  struct gomp_team *team)
+{
+  if (child_task->depend_count == 0)
+    return 0;
+
+  /* If parent is gone already, the hash table is freed and nothing
+     will use the hash table anymore, no need to remove anything from it.  */
+  if (child_task->parent != NULL)
+    gomp_task_run_post_handle_depend_hash (child_task);
+
+  if (child_task->dependers == NULL)
+    return 0;
+
+  return gomp_task_run_post_handle_dependers (child_task, team);
+}
+
 static inline void
 gomp_task_run_post_remove_parent (struct gomp_task *child_task)
 {
@@ -286,7 +568,10 @@  gomp_task_run_post_remove_parent (struct
 	 before the NULL is written.  */
       __atomic_store_n (&parent->children, NULL, MEMMODEL_RELEASE);
       if (parent->in_taskwait)
-	gomp_sem_post (&parent->taskwait_sem);
+	{
+	  parent->in_taskwait = false;
+	  gomp_sem_post (&parent->taskwait_sem);
+	}
     }
 }
 
@@ -298,20 +583,29 @@  gomp_task_run_post_remove_taskgroup (str
     return;
   child_task->prev_taskgroup->next_taskgroup = child_task->next_taskgroup;
   child_task->next_taskgroup->prev_taskgroup = child_task->prev_taskgroup;
-  if (taskgroup->children != child_task)
-    return;
-  if (child_task->next_taskgroup != child_task)
-    taskgroup->children = child_task->next_taskgroup;
+  if (taskgroup->num_children > 1)
+    --taskgroup->num_children;
   else
     {
-      /* We access task->children in GOMP_taskgroup_end
+      /* We access taskgroup->num_children in GOMP_taskgroup_end
 	 outside of the task lock mutex region, so
 	 need a release barrier here to ensure memory
 	 written by child_task->fn above is flushed
 	 before the NULL is written.  */
-      __atomic_store_n (&taskgroup->children, NULL, MEMMODEL_RELEASE);
+      __atomic_store_n (&taskgroup->num_children, 0, MEMMODEL_RELEASE);
+    }
+  if (taskgroup->children != child_task)
+    return;
+  if (child_task->next_taskgroup != child_task)
+    taskgroup->children = child_task->next_taskgroup;
+  else
+    {
+      taskgroup->children = NULL;
       if (taskgroup->in_taskgroup_wait)
-	gomp_sem_post (&taskgroup->taskgroup_sem);
+	{
+	  taskgroup->in_taskgroup_wait = false;
+	  gomp_sem_post (&taskgroup->taskgroup_sem);
+	}
     }
 }
 
@@ -323,6 +617,7 @@  gomp_barrier_handle_tasks (gomp_barrier_
   struct gomp_task *task = thr->task;
   struct gomp_task *child_task = NULL;
   struct gomp_task *to_free = NULL;
+  int do_wake = 0;
 
   gomp_mutex_lock (&team->task_lock);
   if (gomp_barrier_last_thread (state))
@@ -355,8 +650,17 @@  gomp_barrier_handle_tasks (gomp_barrier_
 		}
 	      goto finish_cancelled;
 	    }
+	  team->task_running_count++;
+	  child_task->in_tied_task = true;
+	  if (team->task_count == team->task_running_count)
+	    gomp_team_barrier_clear_task_pending (&team->barrier);
 	}
       gomp_mutex_unlock (&team->task_lock);
+      if (do_wake)
+	{
+	  gomp_team_barrier_wake (&team->barrier, do_wake);
+	  do_wake = 0;
+	}
       if (to_free)
 	{
 	  gomp_finish_task (to_free);
@@ -374,7 +678,9 @@  gomp_barrier_handle_tasks (gomp_barrier_
       gomp_mutex_lock (&team->task_lock);
       if (child_task)
 	{
-	 finish_cancelled:
+	 finish_cancelled:;
+	  size_t new_tasks
+	    = gomp_task_run_post_handle_depend (child_task, team);
 	  gomp_task_run_post_remove_parent (child_task);
 	  gomp_clear_parent (child_task->children);
 	  gomp_task_run_post_remove_taskgroup (child_task);
@@ -382,6 +688,12 @@  gomp_barrier_handle_tasks (gomp_barrier_
 	  child_task = NULL;
 	  if (!cancelled)
 	    team->task_running_count--;
+	  if (new_tasks > 1)
+	    {
+	      do_wake = team->nthreads - team->task_running_count;
+	      if (do_wake > new_tasks)
+		do_wake = new_tasks;
+	    }
 	  if (--team->task_count == 0
 	      && gomp_team_barrier_waiting_for_tasks (&team->barrier))
 	    {
@@ -404,9 +716,10 @@  GOMP_taskwait (void)
   struct gomp_task *task = thr->task;
   struct gomp_task *child_task = NULL;
   struct gomp_task *to_free = NULL;
+  int do_wake = 0;
 
   /* The acquire barrier on load of task->children here synchronizes
-     with the write of a NULL in gomp_barrier_handle_tasks.  It is
+     with the write of a NULL in gomp_task_run_post_remove_parent.  It is
      not necessary that we synchronize with other non-NULL writes at
      this point, but we must ensure that all writes to memory by a
      child thread task work function are seen before we exit from
@@ -451,6 +764,11 @@  GOMP_taskwait (void)
 	   in other threads.  Wait for them.  */
 	task->in_taskwait = true;
       gomp_mutex_unlock (&team->task_lock);
+      if (do_wake)
+	{
+	  gomp_team_barrier_wake (&team->barrier, do_wake);
+	  do_wake = 0;
+	}
       if (to_free)
 	{
 	  gomp_finish_task (to_free);
@@ -464,15 +782,13 @@  GOMP_taskwait (void)
 	  thr->task = task;
 	}
       else
-	{
-	  gomp_sem_wait (&task->taskwait_sem);
-	  task->in_taskwait = false;
-	  return;
-	}
+	gomp_sem_wait (&task->taskwait_sem);
       gomp_mutex_lock (&team->task_lock);
       if (child_task)
 	{
-	 finish_cancelled:
+	 finish_cancelled:;
+	  size_t new_tasks
+	    = gomp_task_run_post_handle_depend (child_task, team);
 	  child_task->prev_child->next_child = child_task->next_child;
 	  child_task->next_child->prev_child = child_task->prev_child;
 	  if (task->children == child_task)
@@ -487,7 +803,13 @@  GOMP_taskwait (void)
 	  to_free = child_task;
 	  child_task = NULL;
 	  team->task_count--;
-	  team->task_running_count--;
+	  if (new_tasks > 1)
+	    {
+	      do_wake = team->nthreads - team->task_running_count
+			- !task->in_tied_task;
+	      if (do_wake > new_tasks)
+		do_wake = new_tasks;
+	    }
 	}
     }
 }
@@ -519,6 +841,7 @@  GOMP_taskgroup_start (void)
   taskgroup->children = NULL;
   taskgroup->in_taskgroup_wait = false;
   taskgroup->cancelled = false;
+  taskgroup->num_children = 0;
   gomp_sem_init (&taskgroup->taskgroup_sem, 0);
   task->taskgroup = taskgroup;
 }
@@ -532,18 +855,29 @@  GOMP_taskgroup_end (void)
   struct gomp_taskgroup *taskgroup;
   struct gomp_task *child_task = NULL;
   struct gomp_task *to_free = NULL;
+  int do_wake = 0;
 
   if (team == NULL)
     return;
   taskgroup = task->taskgroup;
-  if (__atomic_load_n (&taskgroup->children, MEMMODEL_ACQUIRE) == NULL)
+
+  /* The acquire barrier on load of taskgroup->num_children here
+     synchronizes with the write of 0 in gomp_task_run_post_remove_taskgroup.
+     It is not necessary that we synchronize with other non-0 writes at
+     this point, but we must ensure that all writes to memory by a
+     child thread task work function are seen before we exit from
+     GOMP_taskgroup_end.  */
+  if (__atomic_load_n (&taskgroup->num_children, MEMMODEL_ACQUIRE) == 0)
     goto finish;
+
   gomp_mutex_lock (&team->task_lock);
   while (1)
     {
       bool cancelled = false;
       if (taskgroup->children == NULL)
 	{
+	  if (taskgroup->num_children)
+	    goto do_wait;
 	  gomp_mutex_unlock (&team->task_lock);
 	  if (to_free)
 	    {
@@ -570,10 +904,18 @@  GOMP_taskgroup_end (void)
 	    }
 	}
       else
-	/* All tasks we are waiting for are already running
-	   in other threads.  Wait for them.  */
-	taskgroup->in_taskgroup_wait = true;
+	{
+	 do_wait:
+	  /* All tasks we are waiting for are already running
+	     in other threads.  Wait for them.  */
+	  taskgroup->in_taskgroup_wait = true;
+	}
       gomp_mutex_unlock (&team->task_lock);
+      if (do_wake)
+	{
+	  gomp_team_barrier_wake (&team->barrier, do_wake);
+	  do_wake = 0;
+	}
       if (to_free)
 	{
 	  gomp_finish_task (to_free);
@@ -587,19 +929,18 @@  GOMP_taskgroup_end (void)
 	  thr->task = task;
 	}
       else
-	{
-	  gomp_sem_wait (&taskgroup->taskgroup_sem);
-	  taskgroup->in_taskgroup_wait = false;
-	  goto finish;
-	}
+	gomp_sem_wait (&taskgroup->taskgroup_sem);
       gomp_mutex_lock (&team->task_lock);
       if (child_task)
 	{
-	 finish_cancelled:
+	 finish_cancelled:;
+	  size_t new_tasks
+	    = gomp_task_run_post_handle_depend (child_task, team);
 	  child_task->prev_taskgroup->next_taskgroup
 	    = child_task->next_taskgroup;
 	  child_task->next_taskgroup->prev_taskgroup
 	    = child_task->prev_taskgroup;
+	  --taskgroup->num_children;
 	  if (taskgroup->children == child_task)
 	    {
 	      if (child_task->next_taskgroup != child_task)
@@ -612,7 +953,13 @@  GOMP_taskgroup_end (void)
 	  to_free = child_task;
 	  child_task = NULL;
 	  team->task_count--;
-	  team->task_running_count--;
+	  if (new_tasks > 1)
+	    {
+	      do_wake = team->nthreads - team->task_running_count
+			- !task->in_tied_task;
+	      if (do_wake > new_tasks)
+		do_wake = new_tasks;
+	    }
 	}
     }
 
--- libgomp/testsuite/libgomp.c/depend-1.c.jj	2013-09-26 17:57:26.011983435 +0200
+++ libgomp/testsuite/libgomp.c/depend-1.c	2013-09-26 18:49:55.262270240 +0200
@@ -0,0 +1,154 @@ 
+#include <stdlib.h>
+
+void
+dep (void)
+{
+  int x = 1;
+  #pragma omp parallel
+  #pragma omp single
+  {
+    #pragma omp task shared (x) depend(out: x)
+    x = 2;
+    #pragma omp task shared (x) depend(in: x)
+    if (x != 2)
+      abort ();
+  }
+}
+
+void
+dep2 (void)
+{
+  #pragma omp parallel
+  #pragma omp single
+  {
+    int x = 1;
+    #pragma omp task shared (x) depend(out: x)
+    x = 2;
+    #pragma omp task shared (x) depend(in: x)
+    if (x != 2)
+      abort ();
+    #pragma omp taskwait
+  }
+}
+
+void
+firstpriv (void)
+{
+  #pragma omp parallel
+  #pragma omp single
+  {
+    int x = 1;
+    #pragma omp task depend(out: x)
+    x = 2;
+    #pragma omp task depend(in: x)
+    if (x != 1)
+      abort ();
+  }
+}
+
+void
+antidep (void)
+{
+  int x = 1;
+  #pragma omp parallel
+  #pragma omp single
+  {
+    #pragma omp task shared(x) depend(in: x)
+    if (x != 1)
+      abort ();
+    #pragma omp task shared(x) depend(out: x)
+    x = 2;
+  }
+}
+
+void
+antidep2 (void)
+{
+  #pragma omp parallel
+  #pragma omp single
+  {
+    int x = 1;
+    #pragma omp taskgroup
+    {
+      #pragma omp task shared(x) depend(in: x)
+      if (x != 1)
+	abort ();
+      #pragma omp task shared(x) depend(out: x)
+      x = 2;
+    }
+  }
+}
+
+void
+outdep (void)
+{
+  #pragma omp parallel
+  #pragma omp single
+  {
+    int x = 0;
+    #pragma omp task shared(x) depend(out: x)
+    x = 1;
+    #pragma omp task shared(x) depend(out: x)
+    x = 2;
+    #pragma omp taskwait
+    if (x != 2)
+      abort ();
+  }
+}
+
+void
+concurrent (void)
+{
+  int x = 1;
+  #pragma omp parallel
+  #pragma omp single
+  {
+    #pragma omp task shared (x) depend(out: x)
+    x = 2;
+    #pragma omp task shared (x) depend(in: x)
+    if (x != 2)
+      abort ();
+    #pragma omp task shared (x) depend(in: x)
+    if (x != 2)
+      abort ();
+    #pragma omp task shared (x) depend(in: x)
+    if (x != 2)
+      abort ();
+  }
+}
+
+void
+concurrent2 (void)
+{
+  #pragma omp parallel
+  #pragma omp single
+  {
+    int x = 1;
+    #pragma omp task shared (x) depend(out: x)
+    x = 2;
+    #pragma omp task shared (x) depend(in: x)
+    if (x != 2)
+      abort ();
+    #pragma omp task shared (x) depend(in: x)
+    if (x != 2)
+      abort ();
+    #pragma omp task shared (x) depend(in: x)
+    if (x != 2)
+      abort ();
+    #pragma omp taskwait
+  }
+}
+
+int
+main ()
+{
+  dep ();
+  dep2 ();
+  firstpriv ();
+  antidep ();
+  antidep2 ();
+  outdep ();
+  concurrent ();
+  concurrent2 ();
+  return 0;
+}
--- libgomp/testsuite/libgomp.c/depend-2.c.jj	2013-09-26 18:56:19.808294100 +0200
+++ libgomp/testsuite/libgomp.c/depend-2.c	2013-09-26 19:46:29.732123749 +0200
@@ -0,0 +1,71 @@ 
+#include <stdlib.h>
+#include <unistd.h>
+
+void
+foo (int do_sleep)
+{
+  int a[64], i, *p = a + 4, x = 0;
+  asm volatile ("" : "+r" (p));
+  for (i = 0; i < 64; i++)
+    a[i] = i + 8;
+  #pragma omp parallel private (i)
+  {
+    #pragma omp single nowait
+    {
+      for (i = 0; i < 8; i++)
+	{
+	  #pragma omp task depend(out: a[i * 8 : 4])
+	    a[i * 8] += (i + 2) * 9;
+	  #pragma omp task depend(out: p[i * 8 : 2])
+	    p[i * 8] += (i + 3) * 10;
+	  #pragma omp task depend(out: x)
+	    x = 1;
+	}
+      for (i = 0; i < 8; i++)
+	#pragma omp task depend(in: a[i * 8 : 4]) \
+			 depend(inout: a[i * 8 + 4 : 2]) \
+			 depend(in: a[0 : 4]) depend(in: x)
+	{
+	  if (a[0] != 8 + 2 * 9 || x != 1)
+	    abort ();
+	  if (a[i * 8] != i * 8 + 8 + (i + 2) * 9)
+	    abort ();
+	  if (a[4 + i * 8] != 4 + i * 8 + 8 + (i + 3) * 10)
+	    abort ();
+	  p[i * 8] += a[i * 8];
+	}
+      for (i = 0; i < 8; i++)
+	#pragma omp task depend(inout: a[i * 8 : 4]) \
+			 depend(in: p[i * 8 : 2]) \
+			 depend(in: p[0 : 2], x)
+	{
+	  if (p[0] != 4 + 8 + 3 * 10 + 0 + 8 + 2 * 9 || x != 1)
+	    abort ();
+	  if (a[i * 8] != i * 8 + 8 + (i + 2) * 9)
+	    abort ();
+	  if (a[4 + i * 8] != (4 + i * 8 + 8 + (i + 3) * 10
+			       + i * 8 + 8 + (i + 2) * 9))
+	    abort ();
+	  a[i * 8] += 2;
+	}
+      for (i = 0; i < 4; i++)
+	#pragma omp task depend(in: a[i * 16 : 4], a[i * 16 + 8 : 4], x)
+	{
+	  if (a[i * 16] != i * 16 + 8 + (2 * i + 2) * 9 + 2 || x != 1)
+	    abort ();
+	  if (p[i * 16 + 4] != i * 16 + 8 + 8 + (2 * i + 1 + 2) * 9 + 2)
+	    abort ();
+	}
+    }
+    if (do_sleep)
+      sleep (1);
+  }
+}
+
+int
+main ()
+{
+  foo (1);
+  foo (0);
+  return 0;
+}