Index: libgomp/config/linux/sem.h
===================================================================
--- libgomp/config/linux/sem.h	(revision 181718)
+++ libgomp/config/linux/sem.h	(working copy)
@@ -24,34 +24,74 @@
 
 /* This is a Linux specific implementation of a semaphore synchronization
    mechanism for libgomp.  This type is private to the library.  This 
-   implementation uses atomic instructions and the futex syscall.  */
+   counting semaphore implementation uses atomic instructions and the
+   futex syscall, and a single 32-bit int to store semaphore state.
+   The low 31 bits are the count, the top bit is a flag set when some
+   threads may be waiting.  */
 
 #ifndef GOMP_SEM_H
 #define GOMP_SEM_H 1
 
 typedef int gomp_sem_t;
 
-static inline void gomp_sem_init (gomp_sem_t *sem, int value)
+enum memmodel
+{
+  MEMMODEL_RELAXED = 0,
+  MEMMODEL_CONSUME = 1,
+  MEMMODEL_ACQUIRE = 2,
+  MEMMODEL_RELEASE = 3,
+  MEMMODEL_ACQ_REL = 4,
+  MEMMODEL_SEQ_CST = 5,
+  MEMMODEL_LAST = 6
+};
+
+extern void gomp_sem_wait_slow (gomp_sem_t *, int);
+extern void gomp_sem_post_slow (gomp_sem_t *);
+
+static inline void
+gomp_sem_init (gomp_sem_t *sem, int value)
 {
   *sem = value;
 }
 
-extern void gomp_sem_wait_slow (gomp_sem_t *);
-static inline void gomp_sem_wait (gomp_sem_t *sem)
+static inline void
+gomp_sem_destroy (gomp_sem_t *sem)
 {
-  if (!__sync_bool_compare_and_swap (sem, 1, 0))
-    gomp_sem_wait_slow (sem);
 }
 
-extern void gomp_sem_post_slow (gomp_sem_t *);
-static inline void gomp_sem_post (gomp_sem_t *sem)
+static inline void
+gomp_sem_wait (gomp_sem_t *sem)
 {
-  if (!__sync_bool_compare_and_swap (sem, 0, 1))
-    gomp_sem_post_slow (sem);
+  int count = *sem;
+
+  while ((count & 0x7fffffff) != 0)
+    {
+      int oldval = count;
+      __atomic_compare_exchange_4 (sem, &oldval, count - 1,
+				   false, MEMMODEL_ACQUIRE, MEMMODEL_RELAXED);
+      if (__builtin_expect (oldval == count, 1))
+	return;
+      count = oldval;
+    }
+  gomp_sem_wait_slow (sem, count);
 }
 
-static inline void gomp_sem_destroy (gomp_sem_t *sem)
+static inline void
+gomp_sem_post (gomp_sem_t *sem)
 {
-}
+  int count = *sem;
+
+  while (1)
+    {
+      int oldval = count;
+      __atomic_compare_exchange_4 (sem, &oldval, (count + 1) & 0x7fffffff,
+				   false, MEMMODEL_RELEASE, MEMMODEL_RELAXED);
+      if (__builtin_expect (oldval == count, 1))
+	break;
+      count = oldval;
+    }
 
+  if (__builtin_expect (count & 0x80000000, 0))
+    gomp_sem_post_slow (sem);
+}
 #endif /* GOMP_SEM_H */
Index: libgomp/config/linux/sem.c
===================================================================
--- libgomp/config/linux/sem.c	(revision 181718)
+++ libgomp/config/linux/sem.c	(working copy)
@@ -28,34 +28,69 @@
 
 #include "wait.h"
 
-
 void
-gomp_sem_wait_slow (gomp_sem_t *sem)
+gomp_sem_wait_slow (gomp_sem_t *sem, int count)
 {
-  while (1)
+  int oldval, newval;
+
+  /* First loop spins a while.  */
+  while (count == 0)
     {
-      int val = __sync_val_compare_and_swap (sem, 0, -1);
-      if (val > 0)
+      if (do_spin (sem, 0))
+	{
+	  /* Spin timeout, nothing changed.  Set waiting flag.  */
+	  oldval = 0;
+	  __atomic_compare_exchange_4 (sem, &oldval, 0x80000000, false,
+				       MEMMODEL_ACQUIRE, MEMMODEL_RELAXED);
+	  count = oldval;
+	  if (oldval == 0)
+	    {
+	      futex_wait (sem, 0x80000000);
+	      count = *sem;
+	    }
+	  break;
+	}
+      /* Something changed.  If positive, we're good to go.  */
+      else if ((count = *sem) > 0)
 	{
-	  if (__sync_bool_compare_and_swap (sem, val, val - 1))
+	  oldval = count;
+	  __atomic_compare_exchange_4 (sem, &oldval, count - 1, false,
+				       MEMMODEL_ACQUIRE, MEMMODEL_RELAXED);
+	  if (oldval == count)
 	    return;
+	  count = oldval;
 	}
-      do_wait (sem, -1);
+    }
+
+  /* Second loop waits until semaphore is posted.  We always exit this
+     loop with wait flag set, so next post will awaken a thread.  */
+  while (1)
+    {
+      oldval = count;
+      newval = 0x80000000;
+      if ((count & 0x7fffffff) != 0)
+	newval |= count - 1;
+      __atomic_compare_exchange_4 (sem, &oldval, newval, false,
+				   MEMMODEL_ACQUIRE, MEMMODEL_RELAXED);
+      if (oldval == count)
+	{
+	  if ((count & 0x7fffffff) != 0)
+	    {
+	      /* If we can wake more threads, do so now.  */
+	      if ((count & 0x7fffffff) > 1)
+		gomp_sem_post_slow (sem);
+	      break;
+	    }
+	  do_wait (sem, 0x80000000);
+	  count = *sem;
+	}
+      else
+	count = oldval;
     }
 }
 
 void
 gomp_sem_post_slow (gomp_sem_t *sem)
 {
-  int old, tmp = *sem, wake;
-
-  do
-    {
-      old = tmp;
-      wake = old > 0 ? old + 1 : 1;
-      tmp = __sync_val_compare_and_swap (sem, old, wake);
-    }
-  while (old != tmp);
-
-  futex_wake (sem, wake);
+  futex_wake (sem, 1);
 }
