Patchwork Fix PR51298, libgomp barrier failure

login
register
mail settings
Submitter Alan Modra
Date Nov. 27, 2011, 11:42 p.m.
Message ID <20111127234209.GC7827@bubble.grove.modra.org>
Download mbox | patch
Permalink /patch/127909/
State New
Headers show

Comments

Alan Modra - Nov. 27, 2011, 11:42 p.m.
This patch cures the remaining libgomp failures I see on power7.  I'll
admit to approaching this fix with a big hammer at first, liberally
using MEMMODEL_SEQ_CST barriers all over bar.h and bar.c, then
gradually reducing the number of places and strictness of the
barriers.  This may still have a few unnecessary barriers, but I'm not
confident in removing any more.

What led me to believe that barriers were missing in the current code
was the nature of the test failures in the first place, and that the
existing code uses atomic_write_barrier in two places between writing
"awaited" and "generation", but does not have corresponding read
barriers.  Without read barriers processors are allowed to
speculatively read "generation" ahead of time, leading to use of a
stale value.

	PR libgomp/51298
	* config/linux/bar.h: Use atomic rather than sync builtins.
	* config/linux/bar.c: Likewise.  Add missing acquire and release
	synchronisation on generation field.
	* task.c (gomp_barrier_handle_tasks): Regain lock so as to not
	double unlock.
Jakub Jelinek - Nov. 28, 2011, 8:13 a.m.
On Mon, Nov 28, 2011 at 10:12:09AM +1030, Alan Modra wrote:
> --- libgomp/config/linux/bar.c	(revision 181718)
> +++ libgomp/config/linux/bar.c	(working copy)
> @@ -36,18 +36,15 @@ gomp_barrier_wait_end (gomp_barrier_t *b
>    if (__builtin_expect ((state & 1) != 0, 0))
>      {
>        /* Next time we'll be awaiting TOTAL threads again.  */
> -      bar->awaited = bar->total;
> -      atomic_write_barrier ();
> -      bar->generation += 4;
> +      __atomic_store_4 (&bar->awaited, bar->total, MEMMODEL_RELEASE);
> +      __atomic_add_fetch (&bar->generation, 4, MEMMODEL_ACQ_REL);
>        futex_wake ((int *) &bar->generation, INT_MAX);
>      }

The above is unfortunate, bar->generation should be only modified from a
single thread at this point, but the __atomic_add_fetch will enforce there
an atomic instruction on it rather than just some kind of barrier.

	Jakub
Alan Modra - Nov. 28, 2011, 9:39 a.m.
On Mon, Nov 28, 2011 at 09:13:27AM +0100, Jakub Jelinek wrote:
> On Mon, Nov 28, 2011 at 10:12:09AM +1030, Alan Modra wrote:
> > --- libgomp/config/linux/bar.c	(revision 181718)
> > +++ libgomp/config/linux/bar.c	(working copy)
> > @@ -36,18 +36,15 @@ gomp_barrier_wait_end (gomp_barrier_t *b
> >    if (__builtin_expect ((state & 1) != 0, 0))
> >      {
> >        /* Next time we'll be awaiting TOTAL threads again.  */
> > -      bar->awaited = bar->total;
> > -      atomic_write_barrier ();
> > -      bar->generation += 4;
> > +      __atomic_store_4 (&bar->awaited, bar->total, MEMMODEL_RELEASE);
> > +      __atomic_add_fetch (&bar->generation, 4, MEMMODEL_ACQ_REL);
> >        futex_wake ((int *) &bar->generation, INT_MAX);
> >      }
> 
> The above is unfortunate, bar->generation should be only modified from a
> single thread at this point, but the __atomic_add_fetch will enforce there
> an atomic instruction on it rather than just some kind of barrier.

I will try without the atomic add and see what happens.

Patch

Index: libgomp/task.c
===================================================================
--- libgomp/task.c	(revision 181718)
+++ libgomp/task.c	(working copy)
@@ -273,6 +273,7 @@  gomp_barrier_handle_tasks (gomp_barrier_
 	      gomp_team_barrier_done (&team->barrier, state);
 	      gomp_mutex_unlock (&team->task_lock);
 	      gomp_team_barrier_wake (&team->barrier, 0);
+	      gomp_mutex_lock (&team->task_lock);
 	    }
 	}
     }
Index: libgomp/config/linux/bar.h
===================================================================
--- libgomp/config/linux/bar.h	(revision 181718)
+++ libgomp/config/linux/bar.h	(working copy)
@@ -50,7 +50,7 @@  static inline void gomp_barrier_init (go
 
 static inline void gomp_barrier_reinit (gomp_barrier_t *bar, unsigned count)
 {
-  __sync_fetch_and_add (&bar->awaited, count - bar->total);
+  __atomic_add_fetch (&bar->awaited, count - bar->total, MEMMODEL_ACQ_REL);
   bar->total = count;
 }
 
@@ -69,10 +69,8 @@  extern void gomp_team_barrier_wake (gomp
 static inline gomp_barrier_state_t
 gomp_barrier_wait_start (gomp_barrier_t *bar)
 {
-  unsigned int ret = bar->generation & ~3;
-  /* Do we need any barrier here or is __sync_add_and_fetch acting
-     as the needed LoadLoad barrier already?  */
-  ret += __sync_add_and_fetch (&bar->awaited, -1) == 0;
+  unsigned int ret = __atomic_load_4 (&bar->generation, MEMMODEL_ACQUIRE) & ~3;
+  ret += __atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0;
   return ret;
 }
 
Index: libgomp/config/linux/bar.c
===================================================================
--- libgomp/config/linux/bar.c	(revision 181718)
+++ libgomp/config/linux/bar.c	(working copy)
@@ -36,18 +36,15 @@  gomp_barrier_wait_end (gomp_barrier_t *b
   if (__builtin_expect ((state & 1) != 0, 0))
     {
       /* Next time we'll be awaiting TOTAL threads again.  */
-      bar->awaited = bar->total;
-      atomic_write_barrier ();
-      bar->generation += 4;
+      __atomic_store_4 (&bar->awaited, bar->total, MEMMODEL_RELEASE);
+      __atomic_add_fetch (&bar->generation, 4, MEMMODEL_ACQ_REL);
       futex_wake ((int *) &bar->generation, INT_MAX);
     }
   else
     {
-      unsigned int generation = state;
-
       do
-	do_wait ((int *) &bar->generation, generation);
-      while (bar->generation == generation);
+	do_wait ((int *) &bar->generation, state);
+      while (__atomic_load_4 (&bar->generation, MEMMODEL_ACQUIRE) == state);
     }
 }
 
@@ -81,15 +78,15 @@  gomp_team_barrier_wake (gomp_barrier_t *
 void
 gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
 {
-  unsigned int generation;
+  unsigned int generation, gen;
 
   if (__builtin_expect ((state & 1) != 0, 0))
     {
       /* Next time we'll be awaiting TOTAL threads again.  */
       struct gomp_thread *thr = gomp_thread ();
       struct gomp_team *team = thr->ts.team;
-      bar->awaited = bar->total;
-      atomic_write_barrier ();
+
+      __atomic_store_4 (&bar->awaited, bar->total, MEMMODEL_RELEASE);
       if (__builtin_expect (team->task_count, 0))
 	{
 	  gomp_barrier_handle_tasks (state);
@@ -97,7 +94,7 @@  gomp_team_barrier_wait_end (gomp_barrier
 	}
       else
 	{
-	  bar->generation = state + 3;
+	  __atomic_store_4 (&bar->generation, state + 3, MEMMODEL_RELEASE);
 	  futex_wake ((int *) &bar->generation, INT_MAX);
 	  return;
 	}
@@ -107,12 +104,14 @@  gomp_team_barrier_wait_end (gomp_barrier
   do
     {
       do_wait ((int *) &bar->generation, generation);
-      if (__builtin_expect (bar->generation & 1, 0))
+      gen = __atomic_load_4 (&bar->generation, MEMMODEL_ACQUIRE);
+      if (__builtin_expect (gen & 1, 0))
 	gomp_barrier_handle_tasks (state);
-      if ((bar->generation & 2))
+      gen = __atomic_load_4 (&bar->generation, MEMMODEL_ACQUIRE);
+      if ((gen & 2) != 0)
 	generation |= 2;
     }
-  while (bar->generation != state + 4);
+  while (gen != state + 4);
 }
 
 void