From patchwork Mon May 23 22:52:06 2011 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: __sync_swap* with acq/rel/full memory barrier semantics Date: Mon, 23 May 2011 12:52:06 -0000 From: Aldy Hernandez X-Patchwork-Id: 97106 Message-Id: <4DDAE516.4010307@redhat.com> To: Richard Henderson , gcc-patches Cc: Benjamin De Kosnik , Andrew MacLeod This is a patch implementing builtins for an atomic exchange with full, acquire, and release memory barrier semantics. It is similar to __sync_lock_test_and_set(), but the target does not have the option of implementing a reduced functionality of only implementing a store of 1. Also, unlike __sync_lock_test_and_set(), we have all three memory barrier variants. The compiler will fall back to a full barrier if the user requests an acquire/release and it is not available in the target. Also, if no variant is available, we will fall back to a compare and swap loop with a full barrier at the end. The real reason for this patch is to implement atomic stores in the C++ runtime library, which can currently incorrectly move prior stores past an atomic store, thus invalidating the happens-before promise for the sequentially consistent model. I am attaching the corresponding patch to libstdc++ to show how I intend to use the builtin. This is not an official submission for the C++ library bits, as I have not yet fully tested the library. I will do so separately. In a followup patch I will be implementing acq/rel/full variants for all the __sync_* builtins which we can use for the atomic loads and for some of the OpenMP atomics Jakub has been working on. Oh yeah, I would gladly accept patterns/patches for other architectures :). Tested on x86-64 Linux. OK for mainline? * c-family/c-common.c (resolve_overloaded_builtin): Add BUILT_IN_LOCK_TEST_AND_SET_*_N variants. * doc/extend.texi: Document __sync_lock_test_and_set_* variants. * libgcc-std.ver: Add __sync_swap_*. * optabs.h: Add DOI_sync_swap*. Define sync_swap*_optab. * optabs.c (expand_sync_swap): New. * genopinit.c: Add sync_swap_{acq,rel,full}. * config/i386/sync.md ("sync_lock_test_and_set_full"): New. * config/i386/i386.md: Add UNSPECV_SWAP_FULL. * tree.h (enum membar_mode): Same. * builtins.c (expand_builtin_swap): New. (expand_builtin): Add cases for BUILT_IN_SWAP_*. * sync-builtins.def (BUILT_IN_SWAP_*): New. * expr.h (expand_sync_swap): Protoize. (expand_builtin_synchronize): Same. (__sso_string_base<>::_M_assign): Likewise. * include/bits/atomic_2.h (_ITp<>::store): Use __sync_swap_full. (_ITp<>::store volatile): Same. (_PTp<>::store): Same. (_PTp<>::store volatile): Same. Index: include/bits/atomic_2.h =================================================================== --- include/bits/atomic_2.h (revision 173831) +++ include/bits/atomic_2.h (working copy) @@ -249,14 +249,12 @@ namespace __atomic2 __glibcxx_assert(__m != memory_order_acq_rel); __glibcxx_assert(__m != memory_order_consume); - if (__m == memory_order_relaxed) - _M_i = __i; + if (__m == memory_order_seq_cst) + (void)__sync_swap_full (&_M_i, __i); else { // write_mem_barrier(); _M_i = __i; - if (__m == memory_order_seq_cst) - __sync_synchronize(); } } @@ -267,14 +265,12 @@ namespace __atomic2 __glibcxx_assert(__m != memory_order_acq_rel); __glibcxx_assert(__m != memory_order_consume); - if (__m == memory_order_relaxed) - _M_i = __i; + if (__m == memory_order_seq_cst) + (void)__sync_swap_full (&_M_i, __i); else { // write_mem_barrier(); _M_i = __i; - if (__m == memory_order_seq_cst) - __sync_synchronize(); } } @@ -540,14 +536,12 @@ namespace __atomic2 __glibcxx_assert(__m != memory_order_acq_rel); __glibcxx_assert(__m != memory_order_consume); - if (__m == memory_order_relaxed) - _M_p = __p; + if (__m = memory_order_seq_cst) + __sync_swap_full (&_M_p, __p); else { // write_mem_barrier(); _M_p = __p; - if (__m == memory_order_seq_cst) - __sync_synchronize(); } } @@ -559,14 +553,12 @@ namespace __atomic2 __glibcxx_assert(__m != memory_order_acq_rel); __glibcxx_assert(__m != memory_order_consume); - if (__m == memory_order_relaxed) - _M_p = __p; + if (__m = memory_order_seq_cst) + __sync_swap_full (&_M_p, __p); else { // write_mem_barrier(); _M_p = __p; - if (__m == memory_order_seq_cst) - __sync_synchronize(); } } Index: doc/extend.texi =================================================================== --- doc/extend.texi (revision 173831) +++ doc/extend.texi (working copy) @@ -6719,6 +6719,22 @@ speculated to) before the builtin, but p be globally visible yet, and previous memory loads may not yet be satisfied. +@item @var{type} __sync_swap_full (@var{type} *ptr, @var{type} value, ...) +@itemx @var{type} __sync_swap_acq (@var{type} *ptr, @var{type} value, ...) +@itemx @var{type} __sync_swap_rel (@var{type} *ptr, @var{type} value, ...) +@findex __sync_swap_full +@findex __sync_swap_acq +@findex __sync_swap_rel +These builtins implement an atomic exchange operation. They write +@var{value} into @code{*@var{ptr}}, and return the previous contents +of @code{*@var{ptr}}. The different variants provide a full barrier, +acquire barrier, or a release barrier respectively depending on the +suffix. + +If the acquire or release variants of these operations are not +available on the given target, the compiler will fall back to a full +barrier. + @item void __sync_lock_release (@var{type} *ptr, ...) @findex __sync_lock_release This builtin releases the lock acquired by @code{__sync_lock_test_and_set}. Index: c-family/c-common.c =================================================================== --- c-family/c-common.c (revision 173831) +++ c-family/c-common.c (working copy) @@ -9035,6 +9035,9 @@ resolve_overloaded_builtin (location_t l case BUILT_IN_VAL_COMPARE_AND_SWAP_N: case BUILT_IN_LOCK_TEST_AND_SET_N: case BUILT_IN_LOCK_RELEASE_N: + case BUILT_IN_SWAP_FULL_N: + case BUILT_IN_SWAP_ACQ_N: + case BUILT_IN_SWAP_REL_N: { int n = sync_resolve_size (function, params); tree new_function, first_param, result; Index: optabs.c =================================================================== --- optabs.c (revision 173831) +++ optabs.c (working copy) @@ -6988,6 +6988,70 @@ expand_sync_lock_test_and_set (rtx mem, return NULL_RTX; } + +/* This function expands an atomic exchange operation: atomically store + VAL in MEM and return the previous value in MEM. + + TARGET is an option place to stick the return value. + MBMODE is the memory barrier type to use for the operation. */ + +rtx +expand_sync_swap (rtx mem, rtx val, rtx target, enum membar_mode mbmode) +{ + enum machine_mode mode = GET_MODE (mem); + enum insn_code icode; + direct_optab op; + + switch (mbmode) + { + case MEMBAR_MODE_ACQUIRE: + op = sync_swap_acq_optab; + break; + case MEMBAR_MODE_RELEASE: + op = sync_swap_rel_optab; + break; + case MEMBAR_MODE_FULL: + op = sync_swap_full_optab; + break; + default: + gcc_unreachable (); + } + /* If no variant is found, try the full barrier. */ + if (direct_optab_handler (op, mode) == CODE_FOR_nothing) + op = sync_swap_full_optab; + + /* If the target supports the swap directly, great. */ + icode = direct_optab_handler (op, mode); + if (icode != CODE_FOR_nothing) + { + struct expand_operand ops[3]; + + create_output_operand (&ops[0], target, mode); + create_fixed_operand (&ops[1], mem); + /* VAL may have been promoted to a wider mode. Shrink it if so. */ + create_convert_operand_to (&ops[2], val, mode, true); + if (maybe_expand_insn (icode, 3, ops)) + return ops[0].value; + } + + /* Otherwise, use a compare-and-swap loop for the exchange. */ + if (direct_optab_handler (sync_compare_and_swap_optab, mode) + != CODE_FOR_nothing) + { + if (!target || !register_operand (target, mode)) + target = gen_reg_rtx (mode); + if (GET_MODE (val) != VOIDmode && GET_MODE (val) != mode) + val = convert_modes (mode, GET_MODE (val), val, 1); + if (expand_compare_and_swap_loop (mem, target, val, NULL_RTX)) + { + /* Issue a full barrier. */ + expand_builtin_synchronize (); + return target; + } + } + + return NULL_RTX; +} /* Return true if OPERAND is suitable for operand number OPNO of instruction ICODE. */ Index: optabs.h =================================================================== --- optabs.h (revision 173831) +++ optabs.h (working copy) @@ -669,9 +669,19 @@ enum direct_optab_index /* Atomic compare and swap. */ DOI_sync_compare_and_swap, - /* Atomic exchange with acquire semantics. */ + /* Atomic exchange with acquire semantics. Exchange not fully + guaranteed. Some targets may only support a store of 1. */ DOI_sync_lock_test_and_set, + /* Atomic exchange with acquire semantics. */ + DOI_sync_swap_acq, + + /* Atomic exchange with release semantics. */ + DOI_sync_swap_rel, + + /* Atomic exchange with full barrier semantics. */ + DOI_sync_swap_full, + /* Atomic clear with release semantics. */ DOI_sync_lock_release, @@ -720,6 +730,12 @@ typedef struct direct_optab_d *direct_op (&direct_optab_table[(int) DOI_sync_compare_and_swap]) #define sync_lock_test_and_set_optab \ (&direct_optab_table[(int) DOI_sync_lock_test_and_set]) +#define sync_swap_acq_optab \ + (&direct_optab_table[(int) DOI_sync_swap_acq]) +#define sync_swap_rel_optab \ + (&direct_optab_table[(int) DOI_sync_swap_rel]) +#define sync_swap_full_optab \ + (&direct_optab_table[(int) DOI_sync_swap_full]) #define sync_lock_release_optab \ (&direct_optab_table[(int) DOI_sync_lock_release]) Index: genopinit.c =================================================================== --- genopinit.c (revision 173831) +++ genopinit.c (working copy) @@ -239,6 +239,9 @@ static const char * const optabs[] = "set_direct_optab_handler (sync_new_nand_optab, $A, CODE_FOR_$(sync_new_nand$I$a$))", "set_direct_optab_handler (sync_compare_and_swap_optab, $A, CODE_FOR_$(sync_compare_and_swap$I$a$))", "set_direct_optab_handler (sync_lock_test_and_set_optab, $A, CODE_FOR_$(sync_lock_test_and_set$I$a$))", + "set_direct_optab_handler (sync_swap_acq_optab, $A, CODE_FOR_$(sync_swap_acq$I$a$))", + "set_direct_optab_handler (sync_swap_rel_optab, $A, CODE_FOR_$(sync_swap_rel$I$a$))", + "set_direct_optab_handler (sync_swap_full_optab, $A, CODE_FOR_$(sync_swap_full$I$a$))", "set_direct_optab_handler (sync_lock_release_optab, $A, CODE_FOR_$(sync_lock_release$I$a$))", "set_optab_handler (vec_set_optab, $A, CODE_FOR_$(vec_set$a$))", "set_optab_handler (vec_extract_optab, $A, CODE_FOR_$(vec_extract$a$))", Index: builtins.c =================================================================== --- builtins.c (revision 173831) +++ builtins.c (working copy) @@ -5682,9 +5682,35 @@ expand_builtin_lock_test_and_set (enum m return expand_sync_lock_test_and_set (mem, val, target); } +/* Expand the __sync_swap_* intrinsics. + + EXP is the CALL_EXPR. + TARGET is an optional place for us to store the results. + MBMODE is the memory barrier mode to use. */ + +static rtx +expand_builtin_swap (enum machine_mode mode, tree exp, rtx target, + enum membar_mode mbmode) +{ + rtx val, mem; + enum machine_mode old_mode; + + /* Expand the operands. */ + mem = get_builtin_sync_mem (CALL_EXPR_ARG (exp, 0), mode); + val = expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, mode, EXPAND_NORMAL); + /* If VAL is promoted to a wider mode, convert it back to MODE. Take care + of CONST_INTs, where we know the old_mode only from the call argument. */ + old_mode = GET_MODE (val); + if (old_mode == VOIDmode) + old_mode = TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (exp, 1))); + val = convert_modes (mode, old_mode, val, 1); + + return expand_sync_swap (mem, val, target, mbmode); +} + /* Expand the __sync_synchronize intrinsic. */ -static void +void expand_builtin_synchronize (void) { gimple x; @@ -6495,6 +6521,39 @@ expand_builtin (tree exp, rtx target, rt return target; break; + case BUILT_IN_SWAP_ACQ_1: + case BUILT_IN_SWAP_ACQ_2: + case BUILT_IN_SWAP_ACQ_4: + case BUILT_IN_SWAP_ACQ_8: + case BUILT_IN_SWAP_ACQ_16: + mode = get_builtin_sync_mode (fcode - BUILT_IN_SWAP_ACQ_1); + target = expand_builtin_swap (mode, exp, target, MEMBAR_MODE_ACQUIRE); + if (target) + return target; + break; + + case BUILT_IN_SWAP_REL_1: + case BUILT_IN_SWAP_REL_2: + case BUILT_IN_SWAP_REL_4: + case BUILT_IN_SWAP_REL_8: + case BUILT_IN_SWAP_REL_16: + mode = get_builtin_sync_mode (fcode - BUILT_IN_SWAP_REL_1); + target = expand_builtin_swap (mode, exp, target, MEMBAR_MODE_RELEASE); + if (target) + return target; + break; + + case BUILT_IN_SWAP_FULL_1: + case BUILT_IN_SWAP_FULL_2: + case BUILT_IN_SWAP_FULL_4: + case BUILT_IN_SWAP_FULL_8: + case BUILT_IN_SWAP_FULL_16: + mode = get_builtin_sync_mode (fcode - BUILT_IN_SWAP_FULL_1); + target = expand_builtin_swap (mode, exp, target, MEMBAR_MODE_FULL); + if (target) + return target; + break; + case BUILT_IN_LOCK_TEST_AND_SET_1: case BUILT_IN_LOCK_TEST_AND_SET_2: case BUILT_IN_LOCK_TEST_AND_SET_4: Index: sync-builtins.def =================================================================== --- sync-builtins.def (revision 173831) +++ sync-builtins.def (working copy) @@ -235,6 +235,63 @@ DEF_SYNC_BUILTIN (BUILT_IN_LOCK_TEST_AND DEF_SYNC_BUILTIN (BUILT_IN_LOCK_TEST_AND_SET_16, "__sync_lock_test_and_set_16", BT_FN_I16_VPTR_I16, ATTR_NOTHROW_LEAF_LIST) +DEF_SYNC_BUILTIN (BUILT_IN_SWAP_ACQ_N, + "__sync_swap_acq", + BT_FN_VOID_VAR, ATTR_NOTHROW_LEAF_LIST) +DEF_SYNC_BUILTIN (BUILT_IN_SWAP_ACQ_1, + "__sync_swap_acq_1", + BT_FN_I1_VPTR_I1, ATTR_NOTHROW_LEAF_LIST) +DEF_SYNC_BUILTIN (BUILT_IN_SWAP_ACQ_2, + "__sync_swap_acq_2", + BT_FN_I2_VPTR_I2, ATTR_NOTHROW_LEAF_LIST) +DEF_SYNC_BUILTIN (BUILT_IN_SWAP_ACQ_4, + "__sync_swap_acq_4", + BT_FN_I4_VPTR_I4, ATTR_NOTHROW_LEAF_LIST) +DEF_SYNC_BUILTIN (BUILT_IN_SWAP_ACQ_8, + "__sync_swap_acq_8", + BT_FN_I8_VPTR_I8, ATTR_NOTHROW_LEAF_LIST) +DEF_SYNC_BUILTIN (BUILT_IN_SWAP_ACQ_16, + "__sync_swap_acq_16", + BT_FN_I16_VPTR_I16, ATTR_NOTHROW_LEAF_LIST) + +DEF_SYNC_BUILTIN (BUILT_IN_SWAP_REL_N, + "__sync_swap_rel", + BT_FN_VOID_VAR, ATTR_NOTHROW_LEAF_LIST) +DEF_SYNC_BUILTIN (BUILT_IN_SWAP_REL_1, + "__sync_swap_rel_1", + BT_FN_I1_VPTR_I1, ATTR_NOTHROW_LEAF_LIST) +DEF_SYNC_BUILTIN (BUILT_IN_SWAP_REL_2, + "__sync_swap_rel_2", + BT_FN_I2_VPTR_I2, ATTR_NOTHROW_LEAF_LIST) +DEF_SYNC_BUILTIN (BUILT_IN_SWAP_REL_4, + "__sync_swap_rel_4", + BT_FN_I4_VPTR_I4, ATTR_NOTHROW_LEAF_LIST) +DEF_SYNC_BUILTIN (BUILT_IN_SWAP_REL_8, + "__sync_swap_rel_8", + BT_FN_I8_VPTR_I8, ATTR_NOTHROW_LEAF_LIST) +DEF_SYNC_BUILTIN (BUILT_IN_SWAP_REL_16, + "__sync_swap_rel_16", + BT_FN_I16_VPTR_I16, ATTR_NOTHROW_LEAF_LIST) + +DEF_SYNC_BUILTIN (BUILT_IN_SWAP_FULL_N, + "__sync_swap_full", + BT_FN_VOID_VAR, ATTR_NOTHROW_LEAF_LIST) +DEF_SYNC_BUILTIN (BUILT_IN_SWAP_FULL_1, + "__sync_swap_full_1", + BT_FN_I1_VPTR_I1, ATTR_NOTHROW_LEAF_LIST) +DEF_SYNC_BUILTIN (BUILT_IN_SWAP_FULL_2, + "__sync_swap_full_2", + BT_FN_I2_VPTR_I2, ATTR_NOTHROW_LEAF_LIST) +DEF_SYNC_BUILTIN (BUILT_IN_SWAP_FULL_4, + "__sync_swap_full_4", + BT_FN_I4_VPTR_I4, ATTR_NOTHROW_LEAF_LIST) +DEF_SYNC_BUILTIN (BUILT_IN_SWAP_FULL_8, + "__sync_swap_full_8", + BT_FN_I8_VPTR_I8, ATTR_NOTHROW_LEAF_LIST) +DEF_SYNC_BUILTIN (BUILT_IN_SWAP_FULL_16, + "__sync_swap_full_16", + BT_FN_I16_VPTR_I16, ATTR_NOTHROW_LEAF_LIST) + DEF_SYNC_BUILTIN (BUILT_IN_LOCK_RELEASE_N, "__sync_lock_release", BT_FN_VOID_VAR, ATTR_NOTHROW_LEAF_LIST) DEF_SYNC_BUILTIN (BUILT_IN_LOCK_RELEASE_1, "__sync_lock_release_1", Index: expr.h =================================================================== --- expr.h (revision 173831) +++ expr.h (working copy) @@ -161,6 +161,14 @@ enum optab_methods OPTAB_MUST_WIDEN }; +/* Memory barrier type. */ +enum membar_mode +{ + MEMBAR_MODE_RELEASE, + MEMBAR_MODE_ACQUIRE, + MEMBAR_MODE_FULL +}; + /* Generate code for a simple binary or unary operation. "Simple" in this case means "can be unambiguously described by a (mode, code) pair and mapped to a single optab." */ @@ -217,6 +225,7 @@ rtx expand_bool_compare_and_swap (rtx, r rtx expand_sync_operation (rtx, rtx, enum rtx_code); rtx expand_sync_fetch_operation (rtx, rtx, enum rtx_code, bool, rtx); rtx expand_sync_lock_test_and_set (rtx, rtx, rtx); +rtx expand_sync_swap (rtx, rtx, rtx, enum membar_mode); /* Functions from expmed.c: */ @@ -248,6 +257,7 @@ extern void expand_builtin_setjmp_receiv extern rtx expand_builtin_saveregs (void); extern void expand_builtin_trap (void); extern rtx builtin_strncpy_read_str (void *, HOST_WIDE_INT, enum machine_mode); +extern void expand_builtin_synchronize (void); /* Functions from expr.c: */ Index: config/i386/i386.md =================================================================== --- config/i386/i386.md (revision 173831) +++ config/i386/i386.md (working copy) @@ -250,6 +250,7 @@ UNSPECV_MWAIT UNSPECV_CMPXCHG UNSPECV_XCHG + UNSPECV_SWAP_FULL UNSPECV_LOCK UNSPECV_PROLOGUE_USE UNSPECV_CLD Index: config/i386/sync.md =================================================================== --- config/i386/sync.md (revision 173831) +++ config/i386/sync.md (working copy) @@ -232,6 +232,15 @@ return "lock{%;} add{}\t{%1, %0|%0, %1}"; }) +(define_insn "sync_swap_full" + [(set (match_operand:SWI 0 "register_operand" "=") + (unspec_volatile:SWI + [(match_operand:SWI 1 "memory_operand" "+m")] UNSPECV_SWAP_FULL)) + (set (match_dup 1) + (match_operand:SWI 2 "register_operand" "0"))] + "" + "xchg{}\t{%1, %0|%0, %1}") + ;; Recall that xchg implicitly sets LOCK#, so adding it again wastes space. (define_insn "sync_lock_test_and_set" [(set (match_operand:SWI 0 "register_operand" "=") Index: libgcc-std.ver =================================================================== --- libgcc-std.ver (revision 173831) +++ libgcc-std.ver (working copy) @@ -1919,3 +1919,26 @@ GCC_4.6.0 { __morestack_initial_sp __splitstack_find } + +%inherit GCC_4.7.0 GCC_4.6.0 +GCC_4.7.0 { + __sync_swap_acq_1 + __sync_swap_rel_1 + __sync_swap_full_1 + + __sync_swap_acq_2 + __sync_swap_rel_2 + __sync_swap_full_2 + + __sync_swap_acq_4 + __sync_swap_rel_4 + __sync_swap_full_4 + + __sync_swap_acq_8 + __sync_swap_rel_8 + __sync_swap_full_8 + + __sync_swap_acq_16 + __sync_swap_rel_16 + __sync_swap_full_16 +}