
__sync_swap* with acq/rel/full memory barrier semantics

Message ID 4DDAE516.4010307@redhat.com
State New

Commit Message

Aldy Hernandez May 23, 2011, 10:52 p.m. UTC
This is a patch implementing builtins for an atomic exchange with full,
acquire, and release memory barrier semantics.  It is similar to
__sync_lock_test_and_set(), but the target does not have the option of
providing reduced functionality that only stores the value 1.  Also,
unlike __sync_lock_test_and_set(), all three memory barrier variants
are available.

The compiler will fall back to a full barrier if the user requests an
acquire or release variant that is not available on the target.  If no
exchange variant is available at all, we fall back to a compare-and-swap
loop with a full barrier at the end.
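
For illustration, here is a minimal usage sketch (the builtin names are
the ones added by this patch; the scenario itself is just an example):

  int shared;

  int
  publish (int new_val)
  {
    /* Full-barrier exchange: stores NEW_VAL into SHARED and returns the
       previous contents.  The _acq and _rel variants work the same way
       with acquire/release semantics, falling back as described above
       when the target lacks a matching pattern.  */
    return __sync_swap_full (&shared, new_val);
  }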

The real reason for this patch is to implement atomic stores in the C++
runtime library, whose current implementation can incorrectly let prior
stores move past an atomic store, thus invalidating the happens-before
guarantee of the sequentially consistent model.  I am attaching the
corresponding libstdc++ patch to show how I intend to use the builtin.
This is not an official submission for the C++ library bits, as I have
not yet fully tested the library; I will do so separately.

In a follow-up patch I will implement acq/rel/full variants for all the
__sync_* builtins, which we can then use for atomic loads and for some
of the OpenMP atomics Jakub has been working on.

Oh yeah, I would gladly accept patterns/patches for other architectures :).

Tested on x86-64 Linux.

OK for mainline?

	* c-family/c-common.c (resolve_overloaded_builtin): Add
	BUILT_IN_SWAP_*_N variants.
	* doc/extend.texi: Document the __sync_swap_* variants.
	* libgcc-std.ver: Add __sync_swap_*.
	* optabs.h: Add DOI_sync_swap*.
	Define sync_swap*_optab.
	* optabs.c (expand_sync_swap): New.
	* genopinit.c: Add sync_swap_{acq,rel,full}.
	* config/i386/sync.md ("sync_lock_test_and_set_full<mode>"): New.
	* config/i386/i386.md: Add UNSPECV_SWAP_FULL.
	* builtins.c (expand_builtin_swap): New.
	(expand_builtin): Add cases for BUILT_IN_SWAP_*.
	* sync-builtins.def (BUILT_IN_SWAP_*): New.
	* expr.h (enum membar_mode): New.
	(expand_sync_swap): Protoize.
	(expand_builtin_synchronize): Same.
	* include/bits/atomic_2.h (_ITp<>::store): Use __sync_swap_full.
	(_ITp<>::store volatile): Same.
	(_PTp<>::store): Same.
	(_PTp<>::store volatile): Same.

Index: include/bits/atomic_2.h
===================================================================
--- include/bits/atomic_2.h	(revision 173831)
+++ include/bits/atomic_2.h	(working copy)
@@ -249,14 +249,12 @@ namespace __atomic2
 	__glibcxx_assert(__m != memory_order_acq_rel);
 	__glibcxx_assert(__m != memory_order_consume);
 
-	if (__m == memory_order_relaxed)
-	  _M_i = __i;
+	if (__m == memory_order_seq_cst)
+	  (void)__sync_swap_full (&_M_i, __i);
 	else
 	  {
 	    // write_mem_barrier();
 	    _M_i = __i;
-	    if (__m == memory_order_seq_cst)
-	      __sync_synchronize();
 	  }
       }
 
@@ -267,14 +265,12 @@ namespace __atomic2
 	__glibcxx_assert(__m != memory_order_acq_rel);
 	__glibcxx_assert(__m != memory_order_consume);
 
-	if (__m == memory_order_relaxed)
-	  _M_i = __i;
+	if (__m == memory_order_seq_cst)
+	  (void)__sync_swap_full (&_M_i, __i);
 	else
 	  {
 	    // write_mem_barrier();
 	    _M_i = __i;
-	    if (__m == memory_order_seq_cst)
-	      __sync_synchronize();
 	  }
       }
 
@@ -540,14 +536,12 @@ namespace __atomic2
 	__glibcxx_assert(__m != memory_order_acq_rel);
 	__glibcxx_assert(__m != memory_order_consume);
 
-	if (__m == memory_order_relaxed)
-	  _M_p = __p;
+	if (__m == memory_order_seq_cst)
+	  __sync_swap_full (&_M_p, __p);
 	else
 	  {
 	    // write_mem_barrier();
 	    _M_p = __p;
-	    if (__m == memory_order_seq_cst)
-	      __sync_synchronize();
 	  }
       }
 
@@ -559,14 +553,12 @@ namespace __atomic2
 	__glibcxx_assert(__m != memory_order_acq_rel);
 	__glibcxx_assert(__m != memory_order_consume);
 
-	if (__m == memory_order_relaxed)
-	  _M_p = __p;
+	if (__m == memory_order_seq_cst)
+	  __sync_swap_full (&_M_p, __p);
 	else
 	  {
 	    // write_mem_barrier();
 	    _M_p = __p;
-	    if (__m == memory_order_seq_cst)
-	      __sync_synchronize();
 	  }
       }

Comments

Joseph Myers May 23, 2011, 11:05 p.m. UTC | #1
On Mon, 23 May 2011, Aldy Hernandez wrote:

> This is a patch implementing builtins for an atomic exchange with full,
> acquire, and release memory barrier semantics.  It is similar to
> __sync_lock_test_and_set(), but the target does not have the option of
> implementing a reduced functionality of only implementing a store of 1.  Also,
> unlike __sync_lock_test_and_set(), we have all three memory barrier variants.

What's the reason you've implemented three variants, rather than six (the 
C1X/C++0X atomics have six memory order values) or one built-in function 
taking a memory order parameter?  More generally, what is the underlying 
design here for how built-in functions should cover the whole of the new 
atomics functionality in C1X and C++0X?

Adding functions to libgcc-std.ver seems premature in the absence of any 
library implementations of them.
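
For reference, the six memory order values in the C++0X draft's
std::memory_order (C1X defines matching memory_order_* constants in
<stdatomic.h>) are:

  enum memory_order
  {
    memory_order_relaxed,  // no ordering constraints
    memory_order_consume,  // data-dependency ordering on the load
    memory_order_acquire,  // later reads/writes stay after the load
    memory_order_release,  // earlier reads/writes stay before the store
    memory_order_acq_rel,  // acquire and release, for read-modify-write
    memory_order_seq_cst   // single total order over all seq_cst operations
  };
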
Andrew MacLeod May 30, 2011, 8:07 p.m. UTC | #2
On 05/23/2011 07:05 PM, Joseph S. Myers wrote:
> On Mon, 23 May 2011, Aldy Hernandez wrote:
>
>> This is a patch implementing builtins for an atomic exchange with full,
>> acquire, and release memory barrier semantics.  It is similar to
>> __sync_lock_test_and_set(), but the target does not have the option of
>> implementing a reduced functionality of only implementing a store of 1.  Also,
>> unlike __sync_lock_test_and_set(), we have all three memory barrier variants.
> What's the reason you've implemented three variants, rather than six (the
> C1X/C++0X atomics have six memory order values) or one built-in function
> taking a memory order parameter?  More generally, what is the underlying
> design here for how built-in functions should cover the whole of the new
> atomics functionality in C1X and C++0X?

Aldy was just too excited about working on memory model I think :-)

I've been looking at this, and I propose we go this way :

http://gcc.gnu.org/wiki/Atomic/GCCMM/CodeGen

Please feel free to criticize, comment on,  or ask for clarification.  I 
usually miss something I meant to get across.


Andrew
Jakub Jelinek May 31, 2011, 10:38 a.m. UTC | #3
On Mon, May 30, 2011 at 04:07:09PM -0400, Andrew MacLeod wrote:
> On 05/23/2011 07:05 PM, Joseph S. Myers wrote:
> >On Mon, 23 May 2011, Aldy Hernandez wrote:
> >
> >>This is a patch implementing builtins for an atomic exchange with full,
> >>acquire, and release memory barrier semantics.  It is similar to
> >>__sync_lock_test_and_set(), but the target does not have the option of
> >>implementing a reduced functionality of only implementing a store of 1.  Also,
> >>unlike __sync_lock_test_and_set(), we have all three memory barrier variants.
> >What's the reason you've implemented three variants, rather than six (the
> >C1X/C++0X atomics have six memory order values) or one built-in function
> >taking a memory order parameter?  More generally, what is the underlying
> >design here for how built-in functions should cover the whole of the new
> >atomics functionality in C1X and C++0X?
> 
> Aldy was just too excited about working on memory model I think :-)
> 
> I've been looking at this, and I propose we go this way :
> 
> http://gcc.gnu.org/wiki/Atomic/GCCMM/CodeGen
> 
> Please feel free to criticize, comment on,  or ask for
> clarification.  I usually miss something I meant to get across.

I think the addition of new __sync_* builtins for the different models
is preferable and would generally be more usable, even for users other
than C++ atomics.  On some targets any atomic insn will act as a full
barrier, while on others it could generate different insns or code
sequences that way.  For OpenMP atomics, having a 'none' variant (in
addition to full/acq/rel) would be useful; I think #pragma omp atomic
doesn't impose any ordering on memory accesses other than the memory
being atomically read/written/changed.  I haven't read the C++0x
standard in enough detail to see why it has 6 memory order modes instead
of just 4, but if 6 are really needed (probably even for 4), having new
builtins with just one constant extra argument which says the memory
ordering mode would be best.

	Jakub
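
To make the shape of that suggestion concrete, a call under such a
scheme might look roughly like this (the builtin name and the order
constant here are purely hypothetical, not part of this patch):

  extern int shared;

  int
  swap_acquire (int new_val)
  {
    /* Hypothetical single-builtin form: the last constant argument
       selects the memory ordering mode.  */
    return __sync_swap_explicit (&shared, new_val, __SYNC_MEM_ACQ);
  }
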
Andrew MacLeod May 31, 2011, 1:11 p.m. UTC | #4
On 05/31/2011 06:38 AM, Jakub Jelinek wrote:
>
>> Aldy was just too excited about working on memory model I think :-)
>>
>> I've been looking at this, and I propose we go this way :
>>
>> http://gcc.gnu.org/wiki/Atomic/GCCMM/CodeGen
>>
>> Please feel free to criticize, comment on,  or ask for
>> clarification.  I usually miss something I meant to get across.
> I think the addition of new __sync_* builtins for the different models
> is preferable and would generally be more usable, even for users other
> than C++ atomics.  On some targets any atomic insn will act as a full
> barrier, while on others it could generate different insns or code
> sequences that way.  For OpenMP atomics, having a 'none' variant (in
> addition to full/acq/rel) would be useful; I think #pragma omp atomic
> doesn't impose any ordering on memory accesses other than the memory
> being atomically read/written/changed.  I haven't read the C++0x
> standard in enough detail to see why it has 6 memory order modes
> instead of just 4, but if 6 are really needed (probably even for 4),
> having new builtins with just one constant extra argument which says
> the memory ordering mode would be best.
>
>
I'm not sure if you are agreeing or not, or how much :-)

There are still only the basics of relaxed, consume, release/acquire,
and seq-cst, so there are 4 modes.  C++ gives you two more by separating
release and acquire for loads and stores: loads using 'acquire' mode,
stores using 'release'.  I guess it allows slightly finer control over
instructions that can be loads and/or stores.  It looks like the optimal
powerpc sequence for cmpxchg is slightly more efficient when it's just
an acquire or just a release rather than an acquire/release, for
instance (and all 3 sequences are slightly different).

The table is more or less complete... i.e., a store can't have an
'acquire' mode... and I presume that a consumer which doesn't break
release/acquire down into its component parts would use the 'release'
version of the store as 'release/acquire' mode.

I presume a single builtin with a parameter is the most efficient way
to build them, but that's just an implementation detail.  Presumably you
have each builtin in the table with each of those possible modes as a
valid parameter.  The one thing I would care about is that I would like
to see the relaxed version be 'just an insn' rather than a builtin, if
that's possible...  My understanding is that relaxed (as far as C++ is
concerned) has no synchronization at all, so you can treat it like a
normal operation as far as optimization goes.  That seems the same for
OpenMP; it's just that it's an atomic operation.  So it would be
preferable if we can avoid a builtin in the optimizers for that.  That's
why I left it out of the table.  If all the atomic operations are
already builtins, well, then I guess it doesn't matter :-P

It would be nice to say something like emit_atomic_fetch_add
(memory_order), and if it's relaxed, emit the atomic fetch_add insn (or
builtin, if that's what it is), and if it's something else, emit the
appropriate builtin.  That would make bits/libstdc++v2/atomic_2.h even
easier too.

I think maybe we are more or less saying the same thing? :-)

Andrew
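
As a concrete sketch of that dispatch (illustrative only, assuming the
memory_order values listed earlier), a store could pick between a plain
store and a barriered exchange based on the order argument:

  struct __atomic_int_sketch
  {
    int _M_i;

    void
    store (int __i, memory_order __m)
    {
      // acquire, consume and acq_rel are not valid orders for a store.
      if (__m == memory_order_relaxed)
        _M_i = __i;                       // plain store, no synchronization
      else if (__m == memory_order_release)
        (void) __sync_swap_rel (&_M_i, __i);
      else                                // memory_order_seq_cst
        (void) __sync_swap_full (&_M_i, __i);
    }
  };

The attached libstdc++ patch takes the simpler route: a plain store for
everything except seq_cst, which uses __sync_swap_full.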

Patch

Index: doc/extend.texi
===================================================================
--- doc/extend.texi	(revision 173831)
+++ doc/extend.texi	(working copy)
@@ -6719,6 +6719,22 @@  speculated to) before the builtin, but p
 be globally visible yet, and previous memory loads may not yet be
 satisfied.
 
+@item @var{type} __sync_swap_full (@var{type} *ptr, @var{type} value, ...)
+@itemx @var{type} __sync_swap_acq (@var{type} *ptr, @var{type} value, ...)
+@itemx @var{type} __sync_swap_rel (@var{type} *ptr, @var{type} value, ...)
+@findex __sync_swap_full
+@findex __sync_swap_acq
+@findex __sync_swap_rel
+These builtins implement an atomic exchange operation.  They write
+@var{value} into @code{*@var{ptr}}, and return the previous contents
+of @code{*@var{ptr}}.  The different variants provide a full barrier,
+an acquire barrier, or a release barrier, respectively, depending on
+the suffix.
+
+If the acquire or release variants of these operations are not
+available on the given target, the compiler will fall back to a full
+barrier.
+
 @item void __sync_lock_release (@var{type} *ptr, ...)
 @findex __sync_lock_release
 This builtin releases the lock acquired by @code{__sync_lock_test_and_set}.
Index: c-family/c-common.c
===================================================================
--- c-family/c-common.c	(revision 173831)
+++ c-family/c-common.c	(working copy)
@@ -9035,6 +9035,9 @@  resolve_overloaded_builtin (location_t l
     case BUILT_IN_VAL_COMPARE_AND_SWAP_N:
     case BUILT_IN_LOCK_TEST_AND_SET_N:
     case BUILT_IN_LOCK_RELEASE_N:
+    case BUILT_IN_SWAP_FULL_N:
+    case BUILT_IN_SWAP_ACQ_N:
+    case BUILT_IN_SWAP_REL_N:
       {
 	int n = sync_resolve_size (function, params);
 	tree new_function, first_param, result;
Index: optabs.c
===================================================================
--- optabs.c	(revision 173831)
+++ optabs.c	(working copy)
@@ -6988,6 +6988,70 @@  expand_sync_lock_test_and_set (rtx mem, 
 
   return NULL_RTX;
 }
+
+/* This function expands an atomic exchange operation: atomically store
+   VAL in MEM and return the previous value in MEM.
+
+   TARGET is an optional place to stick the return value.
+   MBMODE is the memory barrier type to use for the operation.  */
+
+rtx
+expand_sync_swap (rtx mem, rtx val, rtx target, enum membar_mode mbmode)
+{
+  enum machine_mode mode = GET_MODE (mem);
+  enum insn_code icode;
+  direct_optab op;
+
+  switch (mbmode)
+    {
+    case MEMBAR_MODE_ACQUIRE:
+      op = sync_swap_acq_optab;
+      break;
+    case MEMBAR_MODE_RELEASE:
+      op = sync_swap_rel_optab;
+      break;
+    case MEMBAR_MODE_FULL:
+      op = sync_swap_full_optab;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+  /* Fall back to the full barrier if the variant is unavailable.  */
+  if (direct_optab_handler (op, mode) == CODE_FOR_nothing)
+    op = sync_swap_full_optab;
+
+  /* If the target supports the swap directly, great.  */
+  icode = direct_optab_handler (op, mode);
+  if (icode != CODE_FOR_nothing)
+    {
+      struct expand_operand ops[3];
+
+      create_output_operand (&ops[0], target, mode);
+      create_fixed_operand (&ops[1], mem);
+      /* VAL may have been promoted to a wider mode.  Shrink it if so.  */
+      create_convert_operand_to (&ops[2], val, mode, true);
+      if (maybe_expand_insn (icode, 3, ops))
+	return ops[0].value;
+    }
+
+  /* Otherwise, use a compare-and-swap loop for the exchange.  */
+  if (direct_optab_handler (sync_compare_and_swap_optab, mode)
+      != CODE_FOR_nothing)
+    {
+      if (!target || !register_operand (target, mode))
+	target = gen_reg_rtx (mode);
+      if (GET_MODE (val) != VOIDmode && GET_MODE (val) != mode)
+	val = convert_modes (mode, GET_MODE (val), val, 1);
+      if (expand_compare_and_swap_loop (mem, target, val, NULL_RTX))
+	{
+	  /* Issue a full barrier.  */
+	  expand_builtin_synchronize ();
+	  return target;
+	}
+    }
+
+  return NULL_RTX;
+}
 
 /* Return true if OPERAND is suitable for operand number OPNO of
    instruction ICODE.  */
Index: optabs.h
===================================================================
--- optabs.h	(revision 173831)
+++ optabs.h	(working copy)
@@ -669,9 +669,19 @@  enum direct_optab_index
   /* Atomic compare and swap.  */
   DOI_sync_compare_and_swap,
 
-  /* Atomic exchange with acquire semantics.  */
+  /* Atomic exchange with acquire semantics.  Exchange not fully
+     guaranteed.  Some targets may only support a store of 1.  */
   DOI_sync_lock_test_and_set,
 
+  /* Atomic exchange with acquire semantics.  */
+  DOI_sync_swap_acq,
+
+  /* Atomic exchange with release semantics.  */
+  DOI_sync_swap_rel,
+
+  /* Atomic exchange with full barrier semantics.  */
+  DOI_sync_swap_full,
+
   /* Atomic clear with release semantics.  */
   DOI_sync_lock_release,
 
@@ -720,6 +730,12 @@  typedef struct direct_optab_d *direct_op
   (&direct_optab_table[(int) DOI_sync_compare_and_swap])
 #define sync_lock_test_and_set_optab \
   (&direct_optab_table[(int) DOI_sync_lock_test_and_set])
+#define sync_swap_acq_optab \
+  (&direct_optab_table[(int) DOI_sync_swap_acq])
+#define sync_swap_rel_optab \
+  (&direct_optab_table[(int) DOI_sync_swap_rel])
+#define sync_swap_full_optab \
+  (&direct_optab_table[(int) DOI_sync_swap_full])
 #define sync_lock_release_optab \
   (&direct_optab_table[(int) DOI_sync_lock_release])
 
Index: genopinit.c
===================================================================
--- genopinit.c	(revision 173831)
+++ genopinit.c	(working copy)
@@ -239,6 +239,9 @@  static const char * const optabs[] =
   "set_direct_optab_handler (sync_new_nand_optab, $A, CODE_FOR_$(sync_new_nand$I$a$))",
   "set_direct_optab_handler (sync_compare_and_swap_optab, $A, CODE_FOR_$(sync_compare_and_swap$I$a$))",
   "set_direct_optab_handler (sync_lock_test_and_set_optab, $A, CODE_FOR_$(sync_lock_test_and_set$I$a$))",
+  "set_direct_optab_handler (sync_swap_acq_optab, $A, CODE_FOR_$(sync_swap_acq$I$a$))",
+  "set_direct_optab_handler (sync_swap_rel_optab, $A, CODE_FOR_$(sync_swap_rel$I$a$))",
+  "set_direct_optab_handler (sync_swap_full_optab, $A, CODE_FOR_$(sync_swap_full$I$a$))",
   "set_direct_optab_handler (sync_lock_release_optab, $A, CODE_FOR_$(sync_lock_release$I$a$))",
   "set_optab_handler (vec_set_optab, $A, CODE_FOR_$(vec_set$a$))",
   "set_optab_handler (vec_extract_optab, $A, CODE_FOR_$(vec_extract$a$))",
Index: builtins.c
===================================================================
--- builtins.c	(revision 173831)
+++ builtins.c	(working copy)
@@ -5682,9 +5682,35 @@  expand_builtin_lock_test_and_set (enum m
   return expand_sync_lock_test_and_set (mem, val, target);
 }
 
+/* Expand the __sync_swap_* intrinsics.
+
+   EXP is the CALL_EXPR.
+   TARGET is an optional place for us to store the results.
+   MBMODE is the memory barrier mode to use.  */
+
+static rtx
+expand_builtin_swap (enum machine_mode mode, tree exp, rtx target,
+		     enum membar_mode mbmode)
+{
+  rtx val, mem;
+  enum machine_mode old_mode;
+
+  /* Expand the operands.  */
+  mem = get_builtin_sync_mem (CALL_EXPR_ARG (exp, 0), mode);
+  val = expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, mode, EXPAND_NORMAL);
+  /* If VAL is promoted to a wider mode, convert it back to MODE.  Take care
+     of CONST_INTs, where we know the old_mode only from the call argument.  */
+  old_mode = GET_MODE (val);
+  if (old_mode == VOIDmode)
+    old_mode = TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (exp, 1)));
+  val = convert_modes (mode, old_mode, val, 1);
+
+  return expand_sync_swap (mem, val, target, mbmode);
+}
+
 /* Expand the __sync_synchronize intrinsic.  */
 
-static void
+void
 expand_builtin_synchronize (void)
 {
   gimple x;
@@ -6495,6 +6521,39 @@  expand_builtin (tree exp, rtx target, rt
 	return target;
       break;
 
+    case BUILT_IN_SWAP_ACQ_1:
+    case BUILT_IN_SWAP_ACQ_2:
+    case BUILT_IN_SWAP_ACQ_4:
+    case BUILT_IN_SWAP_ACQ_8:
+    case BUILT_IN_SWAP_ACQ_16:
+      mode = get_builtin_sync_mode (fcode - BUILT_IN_SWAP_ACQ_1);
+      target = expand_builtin_swap (mode, exp, target, MEMBAR_MODE_ACQUIRE);
+      if (target)
+	return target;
+      break;
+
+    case BUILT_IN_SWAP_REL_1:
+    case BUILT_IN_SWAP_REL_2:
+    case BUILT_IN_SWAP_REL_4:
+    case BUILT_IN_SWAP_REL_8:
+    case BUILT_IN_SWAP_REL_16:
+      mode = get_builtin_sync_mode (fcode - BUILT_IN_SWAP_REL_1);
+      target = expand_builtin_swap (mode, exp, target, MEMBAR_MODE_RELEASE);
+      if (target)
+	return target;
+      break;
+
+    case BUILT_IN_SWAP_FULL_1:
+    case BUILT_IN_SWAP_FULL_2:
+    case BUILT_IN_SWAP_FULL_4:
+    case BUILT_IN_SWAP_FULL_8:
+    case BUILT_IN_SWAP_FULL_16:
+      mode = get_builtin_sync_mode (fcode - BUILT_IN_SWAP_FULL_1);
+      target = expand_builtin_swap (mode, exp, target, MEMBAR_MODE_FULL);
+      if (target)
+	return target;
+      break;
+
     case BUILT_IN_LOCK_TEST_AND_SET_1:
     case BUILT_IN_LOCK_TEST_AND_SET_2:
     case BUILT_IN_LOCK_TEST_AND_SET_4:
Index: sync-builtins.def
===================================================================
--- sync-builtins.def	(revision 173831)
+++ sync-builtins.def	(working copy)
@@ -235,6 +235,63 @@  DEF_SYNC_BUILTIN (BUILT_IN_LOCK_TEST_AND
 DEF_SYNC_BUILTIN (BUILT_IN_LOCK_TEST_AND_SET_16, "__sync_lock_test_and_set_16",
 		  BT_FN_I16_VPTR_I16, ATTR_NOTHROW_LEAF_LIST)
 
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_ACQ_N,
+		  "__sync_swap_acq",
+		  BT_FN_VOID_VAR, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_ACQ_1,
+		  "__sync_swap_acq_1",
+		  BT_FN_I1_VPTR_I1, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_ACQ_2,
+		  "__sync_swap_acq_2",
+		  BT_FN_I2_VPTR_I2, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_ACQ_4,
+		  "__sync_swap_acq_4",
+		  BT_FN_I4_VPTR_I4, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_ACQ_8,
+		  "__sync_swap_acq_8",
+		  BT_FN_I8_VPTR_I8, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_ACQ_16,
+		  "__sync_swap_acq_16",
+		  BT_FN_I16_VPTR_I16, ATTR_NOTHROW_LEAF_LIST)
+
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_REL_N,
+		  "__sync_swap_rel",
+		  BT_FN_VOID_VAR, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_REL_1,
+		  "__sync_swap_rel_1",
+		  BT_FN_I1_VPTR_I1, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_REL_2,
+		  "__sync_swap_rel_2",
+		  BT_FN_I2_VPTR_I2, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_REL_4,
+		  "__sync_swap_rel_4",
+		  BT_FN_I4_VPTR_I4, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_REL_8,
+		  "__sync_swap_rel_8",
+		  BT_FN_I8_VPTR_I8, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_REL_16,
+		  "__sync_swap_rel_16",
+		  BT_FN_I16_VPTR_I16, ATTR_NOTHROW_LEAF_LIST)
+
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_FULL_N,
+		  "__sync_swap_full",
+		  BT_FN_VOID_VAR, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_FULL_1,
+		  "__sync_swap_full_1",
+		  BT_FN_I1_VPTR_I1, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_FULL_2,
+		  "__sync_swap_full_2",
+		  BT_FN_I2_VPTR_I2, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_FULL_4,
+		  "__sync_swap_full_4",
+		  BT_FN_I4_VPTR_I4, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_FULL_8,
+		  "__sync_swap_full_8",
+		  BT_FN_I8_VPTR_I8, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_FULL_16,
+		  "__sync_swap_full_16",
+		  BT_FN_I16_VPTR_I16, ATTR_NOTHROW_LEAF_LIST)
+
 DEF_SYNC_BUILTIN (BUILT_IN_LOCK_RELEASE_N, "__sync_lock_release",
 		  BT_FN_VOID_VAR, ATTR_NOTHROW_LEAF_LIST)
 DEF_SYNC_BUILTIN (BUILT_IN_LOCK_RELEASE_1, "__sync_lock_release_1",
Index: expr.h
===================================================================
--- expr.h	(revision 173831)
+++ expr.h	(working copy)
@@ -161,6 +161,14 @@  enum optab_methods
   OPTAB_MUST_WIDEN
 };
 
+/* Memory barrier type.  */
+enum membar_mode
+{
+  MEMBAR_MODE_RELEASE,
+  MEMBAR_MODE_ACQUIRE,
+  MEMBAR_MODE_FULL
+};
+
 /* Generate code for a simple binary or unary operation.  "Simple" in
    this case means "can be unambiguously described by a (mode, code)
    pair and mapped to a single optab."  */
@@ -217,6 +225,7 @@  rtx expand_bool_compare_and_swap (rtx, r
 rtx expand_sync_operation (rtx, rtx, enum rtx_code);
 rtx expand_sync_fetch_operation (rtx, rtx, enum rtx_code, bool, rtx);
 rtx expand_sync_lock_test_and_set (rtx, rtx, rtx);
+rtx expand_sync_swap (rtx, rtx, rtx, enum membar_mode);
 
 /* Functions from expmed.c:  */
 
@@ -248,6 +257,7 @@  extern void expand_builtin_setjmp_receiv
 extern rtx expand_builtin_saveregs (void);
 extern void expand_builtin_trap (void);
 extern rtx builtin_strncpy_read_str (void *, HOST_WIDE_INT, enum machine_mode);
+extern void expand_builtin_synchronize (void);
 
 /* Functions from expr.c:  */
 
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md	(revision 173831)
+++ config/i386/i386.md	(working copy)
@@ -250,6 +250,7 @@ 
   UNSPECV_MWAIT
   UNSPECV_CMPXCHG
   UNSPECV_XCHG
+  UNSPECV_SWAP_FULL
   UNSPECV_LOCK
   UNSPECV_PROLOGUE_USE
   UNSPECV_CLD
Index: config/i386/sync.md
===================================================================
--- config/i386/sync.md	(revision 173831)
+++ config/i386/sync.md	(working copy)
@@ -232,6 +232,15 @@ 
   return "lock{%;} add{<imodesuffix>}\t{%1, %0|%0, %1}";
 })
 
+(define_insn "sync_swap_full<mode>"
+  [(set (match_operand:SWI 0 "register_operand" "=<r>")
+	(unspec_volatile:SWI
+	  [(match_operand:SWI 1 "memory_operand" "+m")] UNSPECV_SWAP_FULL))
+   (set (match_dup 1)
+	(match_operand:SWI 2 "register_operand" "0"))]
+  ""
+  "xchg{<imodesuffix>}\t{%1, %0|%0, %1}")
+
 ;; Recall that xchg implicitly sets LOCK#, so adding it again wastes space.
 (define_insn "sync_lock_test_and_set<mode>"
   [(set (match_operand:SWI 0 "register_operand" "=<r>")
Index: libgcc-std.ver
===================================================================
--- libgcc-std.ver	(revision 173831)
+++ libgcc-std.ver	(working copy)
@@ -1919,3 +1919,26 @@  GCC_4.6.0 {
   __morestack_initial_sp
   __splitstack_find
 }
+
+%inherit GCC_4.7.0 GCC_4.6.0
+GCC_4.7.0 {
+  __sync_swap_acq_1
+  __sync_swap_rel_1
+  __sync_swap_full_1
+
+  __sync_swap_acq_2
+  __sync_swap_rel_2
+  __sync_swap_full_2
+
+  __sync_swap_acq_4
+  __sync_swap_rel_4
+  __sync_swap_full_4
+
+  __sync_swap_acq_8
+  __sync_swap_rel_8
+  __sync_swap_full_8
+
+  __sync_swap_acq_16
+  __sync_swap_rel_16
+  __sync_swap_full_16
+}