
[v2,09/27] tcg: Add atomic helpers

Message ID 1467392693-22715-10-git-send-email-rth@twiddle.net
State New

Commit Message

Richard Henderson July 1, 2016, 5:04 p.m. UTC
Add all of cmpxchg, op_fetch, fetch_op, and xchg.
Handle both endiannesses, and sizes up to 8 bytes.
Handle expanding non-atomically when emulating in serial.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 Makefile.objs      |   1 -
 Makefile.target    |   1 +
 atomic_template.h  | 220 ++++++++++++++++++++++++++++++++++++
 cputlb.c           |   3 +-
 softmmu_template.h | 178 ++++++++++++++++++++++++++++-
 tcg-runtime.c      |  27 +++--
 tcg/tcg-op.c       | 324 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 tcg/tcg-op.h       |  44 ++++++++
 tcg/tcg-runtime.h  |  75 +++++++++++++
 tcg/tcg.h          |  53 +++++++++
 10 files changed, 909 insertions(+), 17 deletions(-)
 create mode 100644 atomic_template.h
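
Below is a minimal sketch (not part of the patch) of how a target front end
might use the new generator interface for a guest fetch-and-add; the wrapper
name, mmu index and memop choice are illustrative only:

/* Hypothetical front-end fragment.  tcg_gen_atomic_fetch_add_i32() is the
   interface added by this patch: when parallel_cpus is set it calls one of
   the helper_atomic_fetch_add* helpers, otherwise it expands to a plain
   load/add/store sequence.  */
static void gen_guest_fetch_add(TCGv_i32 ret, TCGv addr, TCGv_i32 val,
                                int mem_idx)
{
    /* 32-bit little-endian access through the given mmu index.  */
    tcg_gen_atomic_fetch_add_i32(ret, addr, val, mem_idx, MO_LEUL);
}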

Comments

Alex Bennée Sept. 8, 2016, 1:43 p.m. UTC | #1
Richard Henderson <rth@twiddle.net> writes:

> Add all of cmpxchg, op_fetch, fetch_op, and xchg.
> Handle both endiannesses, and sizes up to 8 bytes.
> Handle expanding non-atomically when emulating in serial.
>
> Signed-off-by: Richard Henderson <rth@twiddle.net>
<snip>
>  /* For the benefit of TCG generated code, we want to avoid the complication
>     of ABI-specific return type promotion and always return a value extended
>     to the register size of the host.  This is tcg_target_long, except in the
> @@ -508,11 +507,184 @@ void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx,
>      }
>  }
>  #endif
> +
> +#if DATA_SIZE == 1
> +# define HE_SUFFIX  _mmu
> +#elif defined(HOST_WORDS_BIGENDIAN)
> +# define HE_SUFFIX  _be_mmu
> +# define RE_SUFFIX  _le_mmu
> +#else
> +# define HE_SUFFIX  _le_mmu
> +# define RE_SUFFIX  _be_mmu
> +#endif
> +
> +#define ATOMIC_MMU_BODY                                                 \
> +    DATA_TYPE *haddr;                                                   \
> +    do {                                                                \
> +        unsigned mmu_idx = get_mmuidx(oi);                              \
> +        int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);    \
> +        CPUTLBEntry *tlbe = &env->tlb_table[mmu_idx][index];            \
> +        target_ulong tlb_addr = tlbe->addr_write;                       \
> +        int a_bits = get_alignment_bits(get_memop(oi));                 \
> +                                                                        \
> +        /* Adjust the given return address.  */                         \
> +        retaddr -= GETPC_ADJ;                                           \
> +                                                                        \
> +        /* Enforce guest required alignment.  */                        \
> +        if (unlikely(a_bits > 0 && (addr & ((1 << a_bits) - 1)))) {     \
> +            /* ??? Maybe indicate atomic op to cpu_unaligned_access */  \
> +            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE, \
> +                                 mmu_idx, retaddr);                     \
> +        }                                                               \
> +        /* Enforce qemu required alignment.  */                         \
> +        if (unlikely(addr & ((1 << SHIFT) - 1))) {                      \
> +            /* We get here if guest alignment was not requested,        \
> +               or was not enforced by cpu_unaligned_access above.       \
> +               We might widen the access and emulate, but for now       \
> +               mark an exception and exit the cpu loop.  */             \
> +            cpu_loop_exit_atomic(ENV_GET_CPU(env), retaddr);            \
> +        }                                                               \
> +                                                                        \
> +        /* Check TLB entry and enforce page permissions.  */            \
> +        if ((addr & TARGET_PAGE_MASK)                                   \
> +            != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {    \
> +            if (!VICTIM_TLB_HIT(addr_write)) {                          \
> +                tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE,        \
> +                         mmu_idx, retaddr);                             \
> +            }                                                           \
> +            tlb_addr = tlbe->addr_write;                                \
> +        } else if (unlikely(tlbe->addr_read != tlb_addr)) {             \
> +            /* Let the guest notice RMW on a write-only page.  */       \
> +            tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_LOAD,             \
> +                     mmu_idx, retaddr);                                 \
> +        }                                                               \
> +                                                                        \
> +        /* Notice an IO access.  */                                     \
> +        if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {                   \
> +            /* There's really nothing that can be done to               \
> +               support this apart from stop-the-world.  */              \
> +            cpu_loop_exit_atomic(ENV_GET_CPU(env), retaddr);            \

Hmm, what are we doing here? I don't think there is a reason to stop the
world, because atomicity should be guaranteed by the BQL. Although I
suspect anything doing atomic access to an I/O address (especially with
LL/SC primitives) is going to be disappointed.

> +        }                                                               \
> +        haddr = (DATA_TYPE *)((uintptr_t)addr + tlbe->addend);          \
> +    } while (0)
> +
> +DATA_TYPE glue(glue(helper_atomic_cmpxchg, SUFFIX), HE_SUFFIX)
> +    (CPUArchState *env, target_ulong addr, DATA_TYPE cmpv, DATA_TYPE newv,
> +     TCGMemOpIdx oi, uintptr_t retaddr)
> +{
> +    ATOMIC_MMU_BODY;
> +    return atomic_cmpxchg(haddr, cmpv, newv);
> +}
> +
> +#define GEN_ATOMIC_HELPER(NAME)                                         \
> +DATA_TYPE glue(glue(glue(helper_atomic_, NAME), SUFFIX), HE_SUFFIX)     \
> +    (CPUArchState *env, target_ulong addr, DATA_TYPE val,               \
> +     TCGMemOpIdx oi, uintptr_t retaddr)                                 \
> +{                                                                       \
> +    ATOMIC_MMU_BODY;                                                    \
> +    return glue(atomic_, NAME)(haddr, val);                             \
> +}
> +
> +GEN_ATOMIC_HELPER(fetch_add)
> +GEN_ATOMIC_HELPER(fetch_and)
> +GEN_ATOMIC_HELPER(fetch_or)
> +GEN_ATOMIC_HELPER(fetch_xor)
> +
> +GEN_ATOMIC_HELPER(add_fetch)
> +GEN_ATOMIC_HELPER(and_fetch)
> +GEN_ATOMIC_HELPER(or_fetch)
> +GEN_ATOMIC_HELPER(xor_fetch)
> +
> +GEN_ATOMIC_HELPER(xchg)
> +
> +#undef GEN_ATOMIC_HELPER
> +
> +#if DATA_SIZE > 1
> +DATA_TYPE glue(glue(helper_atomic_cmpxchg, SUFFIX), RE_SUFFIX)
> +    (CPUArchState *env, target_ulong addr, DATA_TYPE cmpv, DATA_TYPE newv,
> +     TCGMemOpIdx oi, uintptr_t retaddr)
> +{
> +    DATA_TYPE retv;
> +    cmpv = BSWAP(cmpv);
> +    newv = BSWAP(newv);
> +    retv = (glue(glue(helper_atomic_cmpxchg, SUFFIX), HE_SUFFIX)
> +            (env, addr, cmpv, newv, oi, retaddr));
> +    return BSWAP(retv);
> +}
> +
> +#define GEN_ATOMIC_HELPER(NAME)                                         \
> +DATA_TYPE glue(glue(glue(helper_atomic_, NAME), SUFFIX), RE_SUFFIX)     \
> +    (CPUArchState *env, target_ulong addr, DATA_TYPE val,               \
> +     TCGMemOpIdx oi, uintptr_t retaddr)                                 \
> +{                                                                       \
> +    DATA_TYPE ret;                                                      \
> +    val = BSWAP(val);                                                   \
> +    ret = (glue(glue(glue(helper_atomic_, NAME), SUFFIX), HE_SUFFIX)    \
> +           (env, addr, val, oi, retaddr));                              \
> +    return BSWAP(ret);                                                  \
> +}
> +
> +GEN_ATOMIC_HELPER(fetch_and)
> +GEN_ATOMIC_HELPER(fetch_or)
> +GEN_ATOMIC_HELPER(fetch_xor)
> +
> +GEN_ATOMIC_HELPER(and_fetch)
> +GEN_ATOMIC_HELPER(or_fetch)
> +GEN_ATOMIC_HELPER(xor_fetch)
> +
> +GEN_ATOMIC_HELPER(xchg)
> +
> +#undef GEN_ATOMIC_HELPER
> +
> +/* Note that for addition, we need to use a separate cmpxchg loop instead
> +   of bswaps around the host-endian helpers.  */
> +DATA_TYPE glue(glue(helper_atomic_fetch_add, SUFFIX), RE_SUFFIX)
> +    (CPUArchState *env, target_ulong addr, DATA_TYPE val,
> +     TCGMemOpIdx oi, uintptr_t retaddr)
> +{
> +    DATA_TYPE ldo, ldn, ret, sto;
> +    ATOMIC_MMU_BODY;
> +
> +    ldo = *haddr;
> +    while (1) {
> +        ret = BSWAP(ldo);
> +        sto = BSWAP(ret + val);
> +        ldn = atomic_cmpxchg(haddr, ldo, sto);
> +        if (ldn == ldo) {
> +            return ret;
> +        }
> +        ldo = ldn;
> +    }
> +}
> +
> +DATA_TYPE glue(glue(helper_atomic_add_fetch, SUFFIX), RE_SUFFIX)
> +    (CPUArchState *env, target_ulong addr, DATA_TYPE val,
> +     TCGMemOpIdx oi, uintptr_t retaddr)
> +{
> +    DATA_TYPE ldo, ldn, ret, sto;
> +    ATOMIC_MMU_BODY;
> +
> +    ldo = *haddr;
> +    while (1) {
> +        ret = BSWAP(ldo) + val;
> +        sto = BSWAP(ret);
> +        ldn = atomic_cmpxchg(haddr, ldo, sto);
> +        if (ldn == ldo) {
> +            return ret;
> +        }
> +        ldo = ldn;
> +    }
> +}
> +#endif /* DATA_SIZE > 1 */
> +
> +#undef ATOMIC_MMU_BODY
> +
>  #endif /* !defined(SOFTMMU_CODE_ACCESS) */
>
>  #undef READ_ACCESS_TYPE
>  #undef SHIFT
>  #undef DATA_TYPE
> +#undef DATAX_TYPE

Accidental typo?

>  #undef SUFFIX
>  #undef LSUFFIX
>  #undef DATA_SIZE
> @@ -524,8 +696,6 @@ void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx,
>  #undef BSWAP
>  #undef TGT_BE
>  #undef TGT_LE
> -#undef CPU_BE
> -#undef CPU_LE

I suspect these belong in a separate clean-up patch?

>  #undef helper_le_ld_name
>  #undef helper_be_ld_name
>  #undef helper_le_lds_name
> @@ -534,3 +704,5 @@ void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx,
>  #undef helper_be_st_name
>  #undef helper_te_ld_name
>  #undef helper_te_st_name
> +#undef HE_SUFFIX
> +#undef RE_SUFFIX
> diff --git a/tcg-runtime.c b/tcg-runtime.c
> index ea2ad64..63f78fc 100644
> --- a/tcg-runtime.c
> +++ b/tcg-runtime.c
> @@ -23,17 +23,10 @@
>   */
>  #include "qemu/osdep.h"
>  #include "qemu/host-utils.h"
> -
> -/* This file is compiled once, and thus we can't include the standard
> -   "exec/helper-proto.h", which has includes that are target specific.  */
> -
> -#include "exec/helper-head.h"
> -
> -#define DEF_HELPER_FLAGS_2(name, flags, ret, t1, t2) \
> -  dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2));
> -
> -#include "tcg-runtime.h"
> -
> +#include "cpu.h"
> +#include "exec/helper-proto.h"
> +#include "exec/cpu_ldst.h"
> +#include "exec/exec-all.h"
>
>  /* 32-bit helpers */
>
> @@ -107,3 +100,15 @@ int64_t HELPER(mulsh_i64)(int64_t arg1, int64_t arg2)
>      muls64(&l, &h, arg1, arg2);
>      return h;
>  }
> +
> +#define SHIFT 0
> +#include "atomic_template.h"
> +
> +#define SHIFT 1
> +#include "atomic_template.h"
> +
> +#define SHIFT 2
> +#include "atomic_template.h"
> +
> +#define SHIFT 3
> +#include "atomic_template.h"
> diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
> index 293b854..bc72c17 100644
> --- a/tcg/tcg-op.c
> +++ b/tcg/tcg-op.c
> @@ -1958,3 +1958,327 @@ void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg idx, TCGMemOp memop)
>                                 addr, trace_mem_get_info(memop, 1));
>      gen_ldst_i64(INDEX_op_qemu_st_i64, val, addr, memop, idx);
>  }
> +
> +static void tcg_gen_ext_i32(TCGv_i32 ret, TCGv_i32 val, TCGMemOp opc)
> +{
> +    switch (opc & MO_SSIZE) {
> +    case MO_SB:
> +        tcg_gen_ext8s_i32(ret, val);
> +        break;
> +    case MO_UB:
> +        tcg_gen_ext8u_i32(ret, val);
> +        break;
> +    case MO_SW:
> +        tcg_gen_ext16s_i32(ret, val);
> +        break;
> +    case MO_UW:
> +        tcg_gen_ext16u_i32(ret, val);
> +        break;
> +    default:
> +        tcg_gen_mov_i32(ret, val);
> +        break;
> +    }
> +}
> +
> +static void tcg_gen_ext_i64(TCGv_i64 ret, TCGv_i64 val, TCGMemOp opc)
> +{
> +    switch (opc & MO_SSIZE) {
> +    case MO_SB:
> +        tcg_gen_ext8s_i64(ret, val);
> +        break;
> +    case MO_UB:
> +        tcg_gen_ext8u_i64(ret, val);
> +        break;
> +    case MO_SW:
> +        tcg_gen_ext16s_i64(ret, val);
> +        break;
> +    case MO_UW:
> +        tcg_gen_ext16u_i64(ret, val);
> +        break;
> +    case MO_SL:
> +        tcg_gen_ext32s_i64(ret, val);
> +        break;
> +    case MO_UL:
> +        tcg_gen_ext32u_i64(ret, val);
> +        break;
> +    default:
> +        tcg_gen_mov_i64(ret, val);
> +        break;
> +    }
> +}

Should these two be separate patches?

> +
> +#ifdef CONFIG_USER_ONLY
> +typedef void (*gen_atomic_cx_i32)(TCGv_i32, TCGv, TCGv_i32, TCGv_i32);
> +typedef void (*gen_atomic_cx_i64)(TCGv_i64, TCGv, TCGv_i64, TCGv_i64);
> +typedef void (*gen_atomic_op_i32)(TCGv_i32, TCGv, TCGv_i32);
> +typedef void (*gen_atomic_op_i64)(TCGv_i64, TCGv, TCGv_i64);
> +#else
> +typedef void (*gen_atomic_cx_i32)(TCGv_i32, TCGv_env, TCGv,
> +                                  TCGv_i32, TCGv_i32, TCGv_i32);
> +typedef void (*gen_atomic_cx_i64)(TCGv_i64, TCGv_env, TCGv,
> +                                  TCGv_i64, TCGv_i64, TCGv_i32);
> +typedef void (*gen_atomic_op_i32)(TCGv_i32, TCGv_env, TCGv, TCGv_i32, TCGv_i32);
> +typedef void (*gen_atomic_op_i64)(TCGv_i64, TCGv_env, TCGv, TCGv_i64, TCGv_i32);
> +#endif
> +
> +static void * const table_cmpxchg[16] = {
> +    [MO_8] = gen_helper_atomic_cmpxchgb,
> +    [MO_16 | MO_LE] = gen_helper_atomic_cmpxchgw_le,
> +    [MO_16 | MO_BE] = gen_helper_atomic_cmpxchgw_be,
> +    [MO_32 | MO_LE] = gen_helper_atomic_cmpxchgl_le,
> +    [MO_32 | MO_BE] = gen_helper_atomic_cmpxchgl_be,
> +    [MO_64 | MO_LE] = gen_helper_atomic_cmpxchgq_le,
> +    [MO_64 | MO_BE] = gen_helper_atomic_cmpxchgq_be,
> +};
> +
> +void tcg_gen_atomic_cmpxchg_i32(TCGv_i32 retv, TCGv addr, TCGv_i32 cmpv,
> +                                TCGv_i32 newv, TCGArg idx, TCGMemOp memop)
> +{
> +    memop = tcg_canonicalize_memop(memop, 0, 0);
> +
> +    if (!parallel_cpus) {
> +        TCGv_i32 t1 = tcg_temp_new_i32();
> +        TCGv_i32 t2 = tcg_temp_new_i32();
> +
> +        tcg_gen_ext_i32(t2, cmpv, memop & MO_SIZE);
> +
> +        tcg_gen_qemu_ld_i32(t1, addr, idx, memop & ~MO_SIGN);
> +        tcg_gen_movcond_i32(TCG_COND_EQ, t2, t1, t2, newv, t1);
> +        tcg_gen_qemu_st_i32(t2, addr, idx, memop);
> +        tcg_temp_free_i32(t2);
> +
> +        if (memop & MO_SIGN) {
> +            tcg_gen_ext_i32(retv, t1, memop);
> +        } else {
> +            tcg_gen_mov_i32(retv, t1);
> +        }
> +        tcg_temp_free_i32(t1);
> +    } else {
> +        gen_atomic_cx_i32 gen;
> +
> +        gen = table_cmpxchg[memop & (MO_SIZE | MO_BSWAP)];
> +        tcg_debug_assert(gen != NULL);
> +
> +#ifdef CONFIG_USER_ONLY
> +        gen(retv, addr, cmpv, newv);
> +#else
> +        {
> +            TCGv_i32 oi = tcg_const_i32(make_memop_idx(memop & ~MO_SIGN, idx));
> +            gen(retv, tcg_ctx.tcg_env, addr, cmpv, newv, oi);
> +            tcg_temp_free_i32(oi);
> +        }
> +#endif
> +
> +        if (memop & MO_SIGN) {
> +            tcg_gen_ext_i32(retv, retv, memop);
> +        }
> +    }
> +}
> +
> +void tcg_gen_atomic_cmpxchg_i64(TCGv_i64 retv, TCGv addr, TCGv_i64 cmpv,
> +                                TCGv_i64 newv, TCGArg idx, TCGMemOp memop)
> +{
> +    memop = tcg_canonicalize_memop(memop, 1, 0);
> +
> +    if (!parallel_cpus) {
> +        TCGv_i64 t1 = tcg_temp_new_i64();
> +        TCGv_i64 t2 = tcg_temp_new_i64();
> +
> +        tcg_gen_ext_i64(t2, cmpv, memop & MO_SIZE);
> +
> +        tcg_gen_qemu_ld_i64(t1, addr, idx, memop & ~MO_SIGN);
> +        tcg_gen_movcond_i64(TCG_COND_EQ, t2, t1, t2, newv, t1);
> +        tcg_gen_qemu_st_i64(t2, addr, idx, memop);
> +        tcg_temp_free_i64(t2);
> +
> +        if (memop & MO_SIGN) {
> +            tcg_gen_ext_i64(retv, t1, memop);
> +        } else {
> +            tcg_gen_mov_i64(retv, t1);
> +        }
> +        tcg_temp_free_i64(t1);
> +    } else if ((memop & MO_SIZE) == MO_64) {
> +        gen_atomic_cx_i64 gen;
> +
> +        gen = table_cmpxchg[memop & (MO_SIZE | MO_BSWAP)];
> +        tcg_debug_assert(gen != NULL);
> +
> +#ifdef CONFIG_USER_ONLY
> +        gen(retv, addr, cmpv, newv);
> +#else
> +        TCGv_i32 oi = tcg_const_i32(make_memop_idx(memop, idx));
> +        gen(retv, tcg_ctx.tcg_env, addr, cmpv, newv, oi);
> +        tcg_temp_free_i32(oi);
> +#endif
> +    } else {
> +        TCGv_i32 c32 = tcg_temp_new_i32();
> +        TCGv_i32 n32 = tcg_temp_new_i32();
> +        TCGv_i32 r32 = tcg_temp_new_i32();
> +
> +        tcg_gen_extrl_i64_i32(c32, cmpv);
> +        tcg_gen_extrl_i64_i32(n32, newv);
> +        tcg_gen_atomic_cmpxchg_i32(r32, addr, c32, n32, idx, memop & ~MO_SIGN);
> +        tcg_temp_free_i32(c32);
> +        tcg_temp_free_i32(n32);
> +
> +        tcg_gen_extu_i32_i64(retv, r32);
> +        tcg_temp_free_i32(r32);
> +
> +        if (memop & MO_SIGN) {
> +            tcg_gen_ext_i64(retv, retv, memop);
> +        }
> +    }
> +}
> +
> +static void do_nonatomic_op_i32(TCGv_i32 ret, TCGv addr, TCGv_i32 val,
> +                                TCGArg idx, TCGMemOp memop, bool new_val,
> +                                void (*gen)(TCGv_i32, TCGv_i32, TCGv_i32))
> +{
> +    TCGv_i32 t1 = tcg_temp_new_i32();
> +    TCGv_i32 t2 = tcg_temp_new_i32();
> +
> +    memop = tcg_canonicalize_memop(memop, 0, 0);
> +
> +    tcg_gen_qemu_ld_i32(t1, addr, idx, memop & ~MO_SIGN);
> +    gen(t2, t1, val);
> +    tcg_gen_qemu_st_i32(t2, addr, idx, memop);
> +
> +    tcg_gen_ext_i32(ret, (new_val ? t2 : t1), memop);
> +    tcg_temp_free_i32(t1);
> +    tcg_temp_free_i32(t2);
> +}
> +
> +static void do_atomic_op_i32(TCGv_i32 ret, TCGv addr, TCGv_i32 val,
> +                             TCGArg idx, TCGMemOp memop, void * const table[])
> +{
> +    gen_atomic_op_i32 gen;
> +
> +    memop = tcg_canonicalize_memop(memop, 0, 0);
> +
> +    gen = table[memop & (MO_SIZE | MO_BSWAP)];
> +    tcg_debug_assert(gen != NULL);
> +
> +#ifdef CONFIG_USER_ONLY
> +    gen(ret, addr, val);
> +#else
> +    {
> +        TCGv_i32 oi = tcg_const_i32(make_memop_idx(memop & ~MO_SIGN, idx));
> +        gen(ret, tcg_ctx.tcg_env, addr, val, oi);
> +        tcg_temp_free_i32(oi);
> +    }
> +#endif
> +
> +    if (memop & MO_SIGN) {
> +        tcg_gen_ext_i32(ret, ret, memop);
> +    }
> +}
> +
> +static void do_nonatomic_op_i64(TCGv_i64 ret, TCGv addr, TCGv_i64 val,
> +                                TCGArg idx, TCGMemOp memop, bool new_val,
> +                                void (*gen)(TCGv_i64, TCGv_i64, TCGv_i64))
> +{
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +    TCGv_i64 t2 = tcg_temp_new_i64();
> +
> +    memop = tcg_canonicalize_memop(memop, 1, 0);
> +
> +    tcg_gen_qemu_ld_i64(t1, addr, idx, memop & ~MO_SIGN);
> +    gen(t2, t1, val);
> +    tcg_gen_qemu_st_i64(t2, addr, idx, memop);
> +
> +    tcg_gen_ext_i64(ret, (new_val ? t2 : t1), memop);
> +    tcg_temp_free_i64(t1);
> +    tcg_temp_free_i64(t2);
> +}
> +
> +static void do_atomic_op_i64(TCGv_i64 ret, TCGv addr, TCGv_i64 val,
> +                             TCGArg idx, TCGMemOp memop, void * const table[])
> +{
> +    memop = tcg_canonicalize_memop(memop, 1, 0);
> +
> +    if ((memop & MO_SIZE) == MO_64) {
> +        gen_atomic_op_i64 gen;
> +
> +        gen = table[memop & (MO_SIZE | MO_BSWAP)];
> +        tcg_debug_assert(gen != NULL);
> +
> +#ifdef CONFIG_USER_ONLY
> +        gen(ret, addr, val);
> +#else
> +        {
> +            TCGv_i32 oi = tcg_const_i32(make_memop_idx(memop & ~MO_SIGN, idx));
> +            gen(ret, tcg_ctx.tcg_env, addr, val, oi);
> +            tcg_temp_free_i32(oi);
> +        }
> +#endif
> +    } else {
> +        TCGv_i32 v32 = tcg_temp_new_i32();
> +        TCGv_i32 r32 = tcg_temp_new_i32();
> +
> +        tcg_gen_extrl_i64_i32(v32, val);
> +        do_atomic_op_i32(r32, addr, v32, idx, memop & ~MO_SIGN, table);
> +        tcg_temp_free_i32(v32);
> +
> +        tcg_gen_extu_i32_i64(ret, r32);
> +        tcg_temp_free_i32(r32);
> +
> +        if (memop & MO_SIGN) {
> +            tcg_gen_ext_i64(ret, ret, memop);
> +        }
> +    }
> +}
> +
> +#define GEN_ATOMIC_HELPER(NAME, OP, NEW)                                \
> +static void * const table_##NAME[16] = {                                \
> +    [MO_8] = gen_helper_atomic_##NAME##b,                               \
> +    [MO_16 | MO_LE] = gen_helper_atomic_##NAME##w_le,                   \
> +    [MO_16 | MO_BE] = gen_helper_atomic_##NAME##w_be,                   \
> +    [MO_32 | MO_LE] = gen_helper_atomic_##NAME##l_le,                   \
> +    [MO_32 | MO_BE] = gen_helper_atomic_##NAME##l_be,                   \
> +    [MO_64 | MO_LE] = gen_helper_atomic_##NAME##q_le,                   \
> +    [MO_64 | MO_BE] = gen_helper_atomic_##NAME##q_be,                   \
> +};                                                                      \
> +void tcg_gen_atomic_##NAME##_i32                                        \
> +    (TCGv_i32 ret, TCGv addr, TCGv_i32 val, TCGArg idx, TCGMemOp memop) \
> +{                                                                       \
> +    if (parallel_cpus) {                                                \
> +        do_atomic_op_i32(ret, addr, val, idx, memop, table_##NAME);     \
> +    } else {                                                            \
> +        do_nonatomic_op_i32(ret, addr, val, idx, memop, NEW,            \
> +                            tcg_gen_##OP##_i32);                        \
> +    }                                                                   \
> +}                                                                       \
> +void tcg_gen_atomic_##NAME##_i64                                        \
> +    (TCGv_i64 ret, TCGv addr, TCGv_i64 val, TCGArg idx, TCGMemOp memop) \
> +{                                                                       \
> +    if (parallel_cpus) {                                                \
> +        do_atomic_op_i64(ret, addr, val, idx, memop, table_##NAME);     \
> +    } else {                                                            \
> +        do_nonatomic_op_i64(ret, addr, val, idx, memop, NEW,            \
> +                            tcg_gen_##OP##_i64);                        \
> +    }                                                                   \
> +}
> +
> +GEN_ATOMIC_HELPER(fetch_add, add, 0)
> +GEN_ATOMIC_HELPER(fetch_and, and, 0)
> +GEN_ATOMIC_HELPER(fetch_or, or, 0)
> +GEN_ATOMIC_HELPER(fetch_xor, xor, 0)
> +
> +GEN_ATOMIC_HELPER(add_fetch, add, 1)
> +GEN_ATOMIC_HELPER(and_fetch, and, 1)
> +GEN_ATOMIC_HELPER(or_fetch, or, 1)
> +GEN_ATOMIC_HELPER(xor_fetch, xor, 1)
> +
> +static void tcg_gen_mov2_i32(TCGv_i32 r, TCGv_i32 a, TCGv_i32 b)
> +{
> +    tcg_gen_mov_i32(r, b);
> +}
> +
> +static void tcg_gen_mov2_i64(TCGv_i64 r, TCGv_i64 a, TCGv_i64 b)
> +{
> +    tcg_gen_mov_i64(r, b);
> +}
> +
> +GEN_ATOMIC_HELPER(xchg, mov2, 0)
> +
> +#undef GEN_ATOMIC_HELPER
> diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
> index f217e80..2a845ae 100644
> --- a/tcg/tcg-op.h
> +++ b/tcg/tcg-op.h
> @@ -852,6 +852,30 @@ static inline void tcg_gen_qemu_st64(TCGv_i64 arg, TCGv addr, int mem_index)
>      tcg_gen_qemu_st_i64(arg, addr, mem_index, MO_TEQ);
>  }
>
> +void tcg_gen_atomic_cmpxchg_i32(TCGv_i32, TCGv, TCGv_i32, TCGv_i32,
> +                                TCGArg, TCGMemOp);
> +void tcg_gen_atomic_cmpxchg_i64(TCGv_i64, TCGv, TCGv_i64, TCGv_i64,
> +                                TCGArg, TCGMemOp);
> +
> +void tcg_gen_atomic_xchg_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
> +void tcg_gen_atomic_xchg_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
> +void tcg_gen_atomic_fetch_add_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
> +void tcg_gen_atomic_fetch_add_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
> +void tcg_gen_atomic_fetch_and_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
> +void tcg_gen_atomic_fetch_and_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
> +void tcg_gen_atomic_fetch_or_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
> +void tcg_gen_atomic_fetch_or_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
> +void tcg_gen_atomic_fetch_xor_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
> +void tcg_gen_atomic_fetch_xor_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
> +void tcg_gen_atomic_add_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
> +void tcg_gen_atomic_add_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
> +void tcg_gen_atomic_and_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
> +void tcg_gen_atomic_and_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
> +void tcg_gen_atomic_or_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
> +void tcg_gen_atomic_or_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
> +void tcg_gen_atomic_xor_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
> +void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
> +
>  #if TARGET_LONG_BITS == 64
>  #define tcg_gen_movi_tl tcg_gen_movi_i64
>  #define tcg_gen_mov_tl tcg_gen_mov_i64
> @@ -930,6 +954,16 @@ static inline void tcg_gen_qemu_st64(TCGv_i64 arg, TCGv addr, int mem_index)
>  #define tcg_gen_sub2_tl tcg_gen_sub2_i64
>  #define tcg_gen_mulu2_tl tcg_gen_mulu2_i64
>  #define tcg_gen_muls2_tl tcg_gen_muls2_i64
> +#define tcg_gen_atomic_cmpxchg_tl tcg_gen_atomic_cmpxchg_i64
> +#define tcg_gen_atomic_xchg_tl tcg_gen_atomic_xchg_i64
> +#define tcg_gen_atomic_fetch_add_tl tcg_gen_atomic_fetch_add_i64
> +#define tcg_gen_atomic_fetch_and_tl tcg_gen_atomic_fetch_and_i64
> +#define tcg_gen_atomic_fetch_or_tl tcg_gen_atomic_fetch_or_i64
> +#define tcg_gen_atomic_fetch_xor_tl tcg_gen_atomic_fetch_xor_i64
> +#define tcg_gen_atomic_add_fetch_tl tcg_gen_atomic_add_fetch_i64
> +#define tcg_gen_atomic_and_fetch_tl tcg_gen_atomic_and_fetch_i64
> +#define tcg_gen_atomic_or_fetch_tl tcg_gen_atomic_or_fetch_i64
> +#define tcg_gen_atomic_xor_fetch_tl tcg_gen_atomic_xor_fetch_i64
>  #else
>  #define tcg_gen_movi_tl tcg_gen_movi_i32
>  #define tcg_gen_mov_tl tcg_gen_mov_i32
> @@ -1007,6 +1041,16 @@ static inline void tcg_gen_qemu_st64(TCGv_i64 arg, TCGv addr, int mem_index)
>  #define tcg_gen_sub2_tl tcg_gen_sub2_i32
>  #define tcg_gen_mulu2_tl tcg_gen_mulu2_i32
>  #define tcg_gen_muls2_tl tcg_gen_muls2_i32
> +#define tcg_gen_atomic_cmpxchg_tl tcg_gen_atomic_cmpxchg_i32
> +#define tcg_gen_atomic_xchg_tl tcg_gen_atomic_xchg_i32
> +#define tcg_gen_atomic_fetch_add_tl tcg_gen_atomic_fetch_add_i32
> +#define tcg_gen_atomic_fetch_and_tl tcg_gen_atomic_fetch_and_i32
> +#define tcg_gen_atomic_fetch_or_tl tcg_gen_atomic_fetch_or_i32
> +#define tcg_gen_atomic_fetch_xor_tl tcg_gen_atomic_fetch_xor_i32
> +#define tcg_gen_atomic_add_fetch_tl tcg_gen_atomic_add_fetch_i32
> +#define tcg_gen_atomic_and_fetch_tl tcg_gen_atomic_and_fetch_i32
> +#define tcg_gen_atomic_or_fetch_tl tcg_gen_atomic_or_fetch_i32
> +#define tcg_gen_atomic_xor_fetch_tl tcg_gen_atomic_xor_fetch_i32
>  #endif
>
>  #if UINTPTR_MAX == UINT32_MAX
> diff --git a/tcg/tcg-runtime.h b/tcg/tcg-runtime.h
> index 23a0c37..b3accf4 100644
> --- a/tcg/tcg-runtime.h
> +++ b/tcg/tcg-runtime.h
> @@ -14,3 +14,78 @@ DEF_HELPER_FLAGS_2(sar_i64, TCG_CALL_NO_RWG_SE, s64, s64, s64)
>
>  DEF_HELPER_FLAGS_2(mulsh_i64, TCG_CALL_NO_RWG_SE, s64, s64, s64)
>  DEF_HELPER_FLAGS_2(muluh_i64, TCG_CALL_NO_RWG_SE, i64, i64, i64)
> +
> +#ifdef CONFIG_USER_ONLY
> +
> +DEF_HELPER_FLAGS_3(atomic_cmpxchgb, TCG_CALL_NO_WG, i32, tl, i32, i32)
> +DEF_HELPER_FLAGS_3(atomic_cmpxchgw_be, TCG_CALL_NO_WG, i32, tl, i32, i32)
> +DEF_HELPER_FLAGS_3(atomic_cmpxchgl_be, TCG_CALL_NO_WG, i32, tl, i32, i32)
> +DEF_HELPER_FLAGS_3(atomic_cmpxchgq_be, TCG_CALL_NO_WG, i64, tl, i64, i64)
> +DEF_HELPER_FLAGS_3(atomic_cmpxchgw_le, TCG_CALL_NO_WG, i32, tl, i32, i32)
> +DEF_HELPER_FLAGS_3(atomic_cmpxchgl_le, TCG_CALL_NO_WG, i32, tl, i32, i32)
> +DEF_HELPER_FLAGS_3(atomic_cmpxchgq_le, TCG_CALL_NO_WG, i64, tl, i64, i64)
> +
> +#define GEN_ATOMIC_HELPERS(NAME)                        \
> +    DEF_HELPER_FLAGS_2(glue(glue(atomic_, NAME), b),    \
> +                       TCG_CALL_NO_WG, i32, tl, i32)    \
> +    DEF_HELPER_FLAGS_2(glue(glue(atomic_, NAME), w_le), \
> +                       TCG_CALL_NO_WG, i32, tl, i32)    \
> +    DEF_HELPER_FLAGS_2(glue(glue(atomic_, NAME), w_be), \
> +                       TCG_CALL_NO_WG, i32, tl, i32)    \
> +    DEF_HELPER_FLAGS_2(glue(glue(atomic_, NAME), l_le), \
> +                       TCG_CALL_NO_WG, i32, tl, i32)    \
> +    DEF_HELPER_FLAGS_2(glue(glue(atomic_, NAME), l_be), \
> +                       TCG_CALL_NO_WG, i32, tl, i32)    \
> +    DEF_HELPER_FLAGS_2(glue(glue(atomic_, NAME), q_le), \
> +                       TCG_CALL_NO_WG, i64, tl, i64)    \
> +    DEF_HELPER_FLAGS_2(glue(glue(atomic_, NAME), q_be), \
> +                       TCG_CALL_NO_WG, i64, tl, i64)
> +
> +#else
> +
> +DEF_HELPER_FLAGS_5(atomic_cmpxchgb, TCG_CALL_NO_WG, i32, env,
> +                   tl, i32, i32, i32)
> +DEF_HELPER_FLAGS_5(atomic_cmpxchgw_be, TCG_CALL_NO_WG, i32, env,
> +                   tl, i32, i32, i32)
> +DEF_HELPER_FLAGS_5(atomic_cmpxchgl_be, TCG_CALL_NO_WG, i32, env,
> +                   tl, i32, i32, i32)
> +DEF_HELPER_FLAGS_5(atomic_cmpxchgq_be, TCG_CALL_NO_WG, i64, env,
> +                   tl, i64, i64, i32)
> +DEF_HELPER_FLAGS_5(atomic_cmpxchgw_le, TCG_CALL_NO_WG, i32, env,
> +                   tl, i32, i32, i32)
> +DEF_HELPER_FLAGS_5(atomic_cmpxchgl_le, TCG_CALL_NO_WG, i32, env,
> +                   tl, i32, i32, i32)
> +DEF_HELPER_FLAGS_5(atomic_cmpxchgq_le, TCG_CALL_NO_WG, i64, env,
> +                   tl, i64, i64, i32)
> +
> +#define GEN_ATOMIC_HELPERS(NAME)                                \
> +    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), b),            \
> +                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)  \
> +    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), w_le),         \
> +                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)  \
> +    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), w_be),         \
> +                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)  \
> +    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), l_le),         \
> +                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)  \
> +    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), l_be),         \
> +                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)  \
> +    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), q_le),         \
> +                       TCG_CALL_NO_WG, i64, env, tl, i64, i32)  \
> +    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), q_be),         \
> +                       TCG_CALL_NO_WG, i64, env, tl, i64, i32)
> +
> +#endif
> +
> +GEN_ATOMIC_HELPERS(fetch_add)
> +GEN_ATOMIC_HELPERS(fetch_and)
> +GEN_ATOMIC_HELPERS(fetch_or)
> +GEN_ATOMIC_HELPERS(fetch_xor)
> +
> +GEN_ATOMIC_HELPERS(add_fetch)
> +GEN_ATOMIC_HELPERS(and_fetch)
> +GEN_ATOMIC_HELPERS(or_fetch)
> +GEN_ATOMIC_HELPERS(xor_fetch)
> +
> +GEN_ATOMIC_HELPERS(xchg)
> +
> +#undef GEN_ATOMIC_HELPERS
> diff --git a/tcg/tcg.h b/tcg/tcg.h
> index ab67537..4e60498 100644
> --- a/tcg/tcg.h
> +++ b/tcg/tcg.h
> @@ -1163,6 +1163,59 @@ uint64_t helper_be_ldq_cmmu(CPUArchState *env, target_ulong addr,
>  # define helper_ret_ldq_cmmu  helper_le_ldq_cmmu
>  #endif
>
> +uint8_t helper_atomic_cmpxchgb_mmu(CPUArchState *env, target_ulong addr,
> +                                   uint8_t cmpv, uint8_t newv,
> +                                   TCGMemOpIdx oi, uintptr_t retaddr);
> +uint16_t helper_atomic_cmpxchgw_le_mmu(CPUArchState *env, target_ulong addr,
> +                                       uint16_t cmpv, uint16_t newv,
> +                                       TCGMemOpIdx oi, uintptr_t retaddr);
> +uint32_t helper_atomic_cmpxchgl_le_mmu(CPUArchState *env, target_ulong addr,
> +                                       uint32_t cmpv, uint32_t newv,
> +                                       TCGMemOpIdx oi, uintptr_t retaddr);
> +uint64_t helper_atomic_cmpxchgq_le_mmu(CPUArchState *env, target_ulong addr,
> +                                       uint64_t cmpv, uint64_t newv,
> +                                       TCGMemOpIdx oi, uintptr_t retaddr);
> +uint16_t helper_atomic_cmpxchgw_be_mmu(CPUArchState *env, target_ulong addr,
> +                                       uint16_t cmpv, uint16_t newv,
> +                                       TCGMemOpIdx oi, uintptr_t retaddr);
> +uint32_t helper_atomic_cmpxchgl_be_mmu(CPUArchState *env, target_ulong addr,
> +                                       uint32_t cmpv, uint32_t newv,
> +                                       TCGMemOpIdx oi, uintptr_t retaddr);
> +uint64_t helper_atomic_cmpxchgq_be_mmu(CPUArchState *env, target_ulong addr,
> +                                       uint64_t cmpv, uint64_t newv,
> +                                       TCGMemOpIdx oi, uintptr_t retaddr);
> +
> +#define GEN_ATOMIC_HELPER(NAME, TYPE, SUFFIX)         \
> +TYPE helper_atomic_ ## NAME ## SUFFIX ## _mmu         \
> +    (CPUArchState *env, target_ulong addr, TYPE val,  \
> +     TCGMemOpIdx oi, uintptr_t retaddr);
> +
> +#define GEN_ATOMIC_HELPER_ALL(NAME)          \
> +    GEN_ATOMIC_HELPER(NAME, uint8_t, b)      \
> +    GEN_ATOMIC_HELPER(NAME, uint16_t, w_le)  \
> +    GEN_ATOMIC_HELPER(NAME, uint32_t, l_le)  \
> +    GEN_ATOMIC_HELPER(NAME, uint64_t, q_le)  \
> +    GEN_ATOMIC_HELPER(NAME, uint16_t, w_be)  \
> +    GEN_ATOMIC_HELPER(NAME, uint32_t, l_be)  \
> +    GEN_ATOMIC_HELPER(NAME, uint64_t, q_be)
> +
> +GEN_ATOMIC_HELPER_ALL(fetch_add)
> +GEN_ATOMIC_HELPER_ALL(fetch_sub)
> +GEN_ATOMIC_HELPER_ALL(fetch_and)
> +GEN_ATOMIC_HELPER_ALL(fetch_or)
> +GEN_ATOMIC_HELPER_ALL(fetch_xor)
> +
> +GEN_ATOMIC_HELPER_ALL(add_fetch)
> +GEN_ATOMIC_HELPER_ALL(sub_fetch)
> +GEN_ATOMIC_HELPER_ALL(and_fetch)
> +GEN_ATOMIC_HELPER_ALL(or_fetch)
> +GEN_ATOMIC_HELPER_ALL(xor_fetch)
> +
> +GEN_ATOMIC_HELPER_ALL(xchg)
> +
> +#undef GEN_ATOMIC_HELPER_ALL
> +#undef GEN_ATOMIC_HELPER
> +
>  #endif /* CONFIG_SOFTMMU */
>
>  #endif /* TCG_H */


--
Alex Bennée
Richard Henderson Sept. 8, 2016, 4:08 p.m. UTC | #2
On 09/08/2016 06:43 AM, Alex Bennée wrote:
>>  DATA_TYPE
>> > +#undef DATAX_TYPE
> Accidental typo?

It wasn't, actually, but...

>> >  #undef SUFFIX
>> >  #undef LSUFFIX
>> >  #undef DATA_SIZE
>> > @@ -524,8 +696,6 @@ void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx,
>> >  #undef BSWAP
>> >  #undef TGT_BE
>> >  #undef TGT_LE
>> > -#undef CPU_BE
>> > -#undef CPU_LE
> I suspect these belong in a separate clean-up patch?
> 

Please look at the v3 patch set instead.


r~
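
For context on the cpu_loop_exit_atomic() discussion above, here is a rough
sketch of what the stop-the-world fallback amounts to; the function names
around the exclusive section are assumptions about the rest of the series,
not code from this patch:

/* Conceptual only.  The instruction that raised EXCP_ATOMIC is re-executed
   while every other vCPU is parked, so the serial (non-atomic) expansion
   of the operation is safe.  start_exclusive()/end_exclusive() follow the
   existing linux-user exclusive-section pattern; cpu_exec_step() stands in
   for a hypothetical "execute one guest instruction" helper.  */
static void handle_excp_atomic(CPUState *cpu)
{
    start_exclusive();          /* park all other vCPUs                 */
    parallel_cpus = false;      /* force the serial TCG expansion       */
    cpu_exec_step(cpu);         /* run exactly one guest instruction    */
    parallel_cpus = true;
    end_exclusive();            /* let the other vCPUs run again        */
}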

Patch

diff --git a/Makefile.objs b/Makefile.objs
index 7f1f0a3..f40bdfd 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -88,7 +88,6 @@  endif
 
 #######################################################################
 # Target-independent parts used in system and user emulation
-common-obj-y += tcg-runtime.o
 common-obj-y += hw/
 common-obj-y += qom/
 common-obj-y += disas/
diff --git a/Makefile.target b/Makefile.target
index d720b3e..0ca9ed6 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -94,6 +94,7 @@  obj-$(CONFIG_TCG_INTERPRETER) += disas/tci.o
 obj-y += fpu/softfloat.o
 obj-y += target-$(TARGET_BASE_ARCH)/
 obj-y += disas.o
+obj-y += tcg-runtime.o
 obj-$(call notempty,$(TARGET_XML_FILES)) += gdbstub-xml.o
 obj-$(call lnot,$(CONFIG_KVM)) += kvm-stub.o
 
diff --git a/atomic_template.h b/atomic_template.h
new file mode 100644
index 0000000..a755853
--- /dev/null
+++ b/atomic_template.h
@@ -0,0 +1,220 @@ 
+/*
+ * Atomic helper templates
+ * Included from tcg-runtime.c.
+ *
+ * Copyright (c) 2016 Red Hat, Inc
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define DATA_SIZE (1 << SHIFT)
+
+#if DATA_SIZE == 8
+#define SUFFIX     q
+#define DATA_TYPE  uint64_t
+#define ABI_TYPE   uint64_t
+#define BSWAP      bswap64
+#elif DATA_SIZE == 4
+#define SUFFIX     l
+#define DATA_TYPE  uint32_t
+#define ABI_TYPE   uint32_t
+#define BSWAP      bswap32
+#elif DATA_SIZE == 2
+#define SUFFIX     w
+#define DATA_TYPE  uint16_t
+#define ABI_TYPE   uint32_t
+#define BSWAP      bswap16
+#elif DATA_SIZE == 1
+#define SUFFIX     b
+#define DATA_TYPE  uint8_t
+#define ABI_TYPE   uint32_t
+#else
+#error unsupported data size
+#endif
+
+#ifdef CONFIG_USER_ONLY
+
+#if DATA_SIZE == 1
+# define HE_SUFFIX
+#elif defined(HOST_WORDS_BIGENDIAN)
+# define HE_SUFFIX  _be
+# define RE_SUFFIX  _le
+#else
+# define HE_SUFFIX  _le
+# define RE_SUFFIX  _be
+#endif
+
+ABI_TYPE HELPER(glue(glue(atomic_cmpxchg, SUFFIX), HE_SUFFIX))
+    (target_ulong addr, ABI_TYPE cmpv, ABI_TYPE newv)
+{
+    DATA_TYPE *haddr = g2h(addr);
+    return atomic_cmpxchg(haddr, cmpv, newv);
+}
+
+#define GEN_ATOMIC_HELPER_HE(NAME)                                  \
+ABI_TYPE HELPER(glue(glue(atomic_##NAME, SUFFIX), HE_SUFFIX))       \
+    (target_ulong addr, ABI_TYPE val)                               \
+{                                                                   \
+    DATA_TYPE *haddr = g2h(addr);                                   \
+    return atomic_##NAME(haddr, val);                               \
+}                                                                   \
+
+GEN_ATOMIC_HELPER_HE(fetch_add)
+GEN_ATOMIC_HELPER_HE(fetch_and)
+GEN_ATOMIC_HELPER_HE(fetch_or)
+GEN_ATOMIC_HELPER_HE(fetch_xor)
+GEN_ATOMIC_HELPER_HE(add_fetch)
+GEN_ATOMIC_HELPER_HE(and_fetch)
+GEN_ATOMIC_HELPER_HE(or_fetch)
+GEN_ATOMIC_HELPER_HE(xor_fetch)
+GEN_ATOMIC_HELPER_HE(xchg)
+
+#undef GEN_ATOMIC_HELPER_HE
+
+#if DATA_SIZE > 1
+
+ABI_TYPE HELPER(glue(glue(atomic_cmpxchg, SUFFIX), RE_SUFFIX))
+    (target_ulong addr, ABI_TYPE cmpv, ABI_TYPE newv)
+{
+    DATA_TYPE *haddr = g2h(addr);
+    return BSWAP(atomic_cmpxchg(haddr, BSWAP(cmpv), BSWAP(newv)));
+}
+
+#define GEN_ATOMIC_HELPER_RE(NAME)                                  \
+ABI_TYPE HELPER(glue(glue(atomic_##NAME, SUFFIX), RE_SUFFIX))       \
+    (target_ulong addr, ABI_TYPE val)                               \
+{                                                                   \
+    DATA_TYPE *haddr = g2h(addr);                                   \
+    return BSWAP(atomic_##NAME(haddr, BSWAP(val)));                 \
+}
+
+GEN_ATOMIC_HELPER_RE(fetch_and)
+GEN_ATOMIC_HELPER_RE(fetch_or)
+GEN_ATOMIC_HELPER_RE(fetch_xor)
+GEN_ATOMIC_HELPER_RE(and_fetch)
+GEN_ATOMIC_HELPER_RE(or_fetch)
+GEN_ATOMIC_HELPER_RE(xor_fetch)
+GEN_ATOMIC_HELPER_RE(xchg)
+
+/* Note that for addition, we need to use a separate cmpxchg loop instead
+   of bswaps for the reverse-host-endian helpers.  */
+ABI_TYPE HELPER(glue(glue(atomic_fetch_add, SUFFIX), RE_SUFFIX))
+    (target_ulong addr, ABI_TYPE val)
+{
+    DATA_TYPE ldo, ldn, ret, sto;
+    DATA_TYPE *haddr = g2h(addr);
+
+    ldo = *haddr;
+    while (1) {
+        ret = BSWAP(ldo);
+        sto = BSWAP(ret + val);
+        ldn = atomic_cmpxchg(haddr, ldo, sto);
+        if (ldn == ldo) {
+            return ret;
+        }
+        ldo = ldn;
+    }
+}
+
+ABI_TYPE HELPER(glue(glue(atomic_add_fetch, SUFFIX), RE_SUFFIX))
+    (target_ulong addr, ABI_TYPE val)
+{
+    DATA_TYPE ldo, ldn, ret, sto;
+    DATA_TYPE *haddr = g2h(addr);
+
+    ldo = *haddr;
+    while (1) {
+        ret = BSWAP(ldo) + val;
+        sto = BSWAP(ret);
+        ldn = atomic_cmpxchg(haddr, ldo, sto);
+        if (ldn == ldo) {
+            return ret;
+        }
+        ldo = ldn;
+    }
+}
+
+#undef GEN_ATOMIC_HELPER_RE
+#endif /* DATA_SIZE > 1 */
+
+#undef HE_SUFFIX
+#undef RE_SUFFIX
+
+#else /* !CONFIG_USER_ONLY */
+
+#if DATA_SIZE == 1
+#define LE_SUFFIX
+#else
+#define LE_SUFFIX  _le
+#endif
+
+ABI_TYPE HELPER(glue(glue(atomic_cmpxchg, SUFFIX), LE_SUFFIX))
+    (CPUArchState *env, target_ulong addr,
+     ABI_TYPE cmpv, ABI_TYPE newv, uint32_t oi)
+{
+    return glue(glue(glue(helper_atomic_cmpxchg, SUFFIX), LE_SUFFIX), _mmu)
+        (env, addr, cmpv, newv, oi, GETPC());
+}
+
+#define GEN_ATOMIC_HELPER(NAME, S)                                      \
+ABI_TYPE HELPER(glue(glue(atomic_##NAME, SUFFIX), S))                   \
+    (CPUArchState *env, target_ulong addr, ABI_TYPE val, uint32_t oi)   \
+{                                                                       \
+    return glue(glue(glue(helper_atomic_##NAME, SUFFIX), S), _mmu)      \
+        (env, addr, val, oi, GETPC());                                  \
+}
+
+GEN_ATOMIC_HELPER(fetch_add, LE_SUFFIX)
+GEN_ATOMIC_HELPER(fetch_and, LE_SUFFIX)
+GEN_ATOMIC_HELPER(fetch_or, LE_SUFFIX)
+GEN_ATOMIC_HELPER(fetch_xor, LE_SUFFIX)
+GEN_ATOMIC_HELPER(add_fetch, LE_SUFFIX)
+GEN_ATOMIC_HELPER(and_fetch, LE_SUFFIX)
+GEN_ATOMIC_HELPER(or_fetch, LE_SUFFIX)
+GEN_ATOMIC_HELPER(xor_fetch, LE_SUFFIX)
+GEN_ATOMIC_HELPER(xchg, LE_SUFFIX)
+
+#if DATA_SIZE > 1
+
+ABI_TYPE HELPER(glue(glue(atomic_cmpxchg, SUFFIX), _be))
+    (CPUArchState *env, target_ulong addr,
+     ABI_TYPE cmpv, ABI_TYPE newv, uint32_t oi)
+{
+    return glue(glue(helper_atomic_cmpxchg, SUFFIX), _be_mmu)
+        (env, addr, cmpv, newv, oi, GETPC());
+}
+
+GEN_ATOMIC_HELPER(fetch_add, _be)
+GEN_ATOMIC_HELPER(fetch_and, _be)
+GEN_ATOMIC_HELPER(fetch_or, _be)
+GEN_ATOMIC_HELPER(fetch_xor, _be)
+GEN_ATOMIC_HELPER(add_fetch, _be)
+GEN_ATOMIC_HELPER(and_fetch, _be)
+GEN_ATOMIC_HELPER(or_fetch, _be)
+GEN_ATOMIC_HELPER(xor_fetch, _be)
+GEN_ATOMIC_HELPER(xchg, _be)
+
+#endif /* DATA_SIZE > 1 */
+
+#undef GEN_ATOMIC_HELPER
+#undef LE_SUFFIX
+
+#endif /* CONFIG_USER_ONLY */
+
+#undef BSWAP
+#undef ABI_TYPE
+#undef DATA_TYPE
+#undef SUFFIX
+#undef DATA_SIZE
+#undef SHIFT
diff --git a/cputlb.c b/cputlb.c
index 079e497..5272456 100644
--- a/cputlb.c
+++ b/cputlb.c
@@ -23,15 +23,14 @@ 
 #include "exec/memory.h"
 #include "exec/address-spaces.h"
 #include "exec/cpu_ldst.h"
-
 #include "exec/cputlb.h"
-
 #include "exec/memory-internal.h"
 #include "exec/ram_addr.h"
 #include "exec/exec-all.h"
 #include "tcg/tcg.h"
 #include "qemu/error-report.h"
 #include "exec/log.h"
+#include "exec/helper-proto.h"
 
 /* DEBUG defines, enable DEBUG_TLB_LOG to log to the CPU_LOG_MMU target */
 /* #define DEBUG_TLB */
diff --git a/softmmu_template.h b/softmmu_template.h
index 4d378ca..76712b9 100644
--- a/softmmu_template.h
+++ b/softmmu_template.h
@@ -51,7 +51,6 @@ 
 #error unsupported data size
 #endif
 
-
 /* For the benefit of TCG generated code, we want to avoid the complication
    of ABI-specific return type promotion and always return a value extended
    to the register size of the host.  This is tcg_target_long, except in the
@@ -508,11 +507,184 @@  void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx,
     }
 }
 #endif
+
+#if DATA_SIZE == 1
+# define HE_SUFFIX  _mmu
+#elif defined(HOST_WORDS_BIGENDIAN)
+# define HE_SUFFIX  _be_mmu
+# define RE_SUFFIX  _le_mmu
+#else
+# define HE_SUFFIX  _le_mmu
+# define RE_SUFFIX  _be_mmu
+#endif
+
+#define ATOMIC_MMU_BODY                                                 \
+    DATA_TYPE *haddr;                                                   \
+    do {                                                                \
+        unsigned mmu_idx = get_mmuidx(oi);                              \
+        int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);    \
+        CPUTLBEntry *tlbe = &env->tlb_table[mmu_idx][index];            \
+        target_ulong tlb_addr = tlbe->addr_write;                       \
+        int a_bits = get_alignment_bits(get_memop(oi));                 \
+                                                                        \
+        /* Adjust the given return address.  */                         \
+        retaddr -= GETPC_ADJ;                                           \
+                                                                        \
+        /* Enforce guest required alignment.  */                        \
+        if (unlikely(a_bits > 0 && (addr & ((1 << a_bits) - 1)))) {     \
+            /* ??? Maybe indicate atomic op to cpu_unaligned_access */  \
+            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE, \
+                                 mmu_idx, retaddr);                     \
+        }                                                               \
+        /* Enforce qemu required alignment.  */                         \
+        if (unlikely(addr & ((1 << SHIFT) - 1))) {                      \
+            /* We get here if guest alignment was not requested,        \
+               or was not enforced by cpu_unaligned_access above.       \
+               We might widen the access and emulate, but for now       \
+               mark an exception and exit the cpu loop.  */             \
+            cpu_loop_exit_atomic(ENV_GET_CPU(env), retaddr);            \
+        }                                                               \
+                                                                        \
+        /* Check TLB entry and enforce page permissions.  */            \
+        if ((addr & TARGET_PAGE_MASK)                                   \
+            != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {    \
+            if (!VICTIM_TLB_HIT(addr_write)) {                          \
+                tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE,        \
+                         mmu_idx, retaddr);                             \
+            }                                                           \
+            tlb_addr = tlbe->addr_write;                                \
+        } else if (unlikely(tlbe->addr_read != tlb_addr)) {             \
+            /* Let the guest notice RMW on a write-only page.  */       \
+            tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_LOAD,             \
+                     mmu_idx, retaddr);                                 \
+        }                                                               \
+                                                                        \
+        /* Notice an IO access.  */                                     \
+        if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {                   \
+            /* There's really nothing that can be done to               \
+               support this apart from stop-the-world.  */              \
+            cpu_loop_exit_atomic(ENV_GET_CPU(env), retaddr);            \
+        }                                                               \
+        haddr = (DATA_TYPE *)((uintptr_t)addr + tlbe->addend);          \
+    } while (0)
+
+DATA_TYPE glue(glue(helper_atomic_cmpxchg, SUFFIX), HE_SUFFIX)
+    (CPUArchState *env, target_ulong addr, DATA_TYPE cmpv, DATA_TYPE newv,
+     TCGMemOpIdx oi, uintptr_t retaddr)
+{
+    ATOMIC_MMU_BODY;
+    return atomic_cmpxchg(haddr, cmpv, newv);
+}
+
+#define GEN_ATOMIC_HELPER(NAME)                                         \
+DATA_TYPE glue(glue(glue(helper_atomic_, NAME), SUFFIX), HE_SUFFIX)     \
+    (CPUArchState *env, target_ulong addr, DATA_TYPE val,               \
+     TCGMemOpIdx oi, uintptr_t retaddr)                                 \
+{                                                                       \
+    ATOMIC_MMU_BODY;                                                    \
+    return glue(atomic_, NAME)(haddr, val);                             \
+}
+
+GEN_ATOMIC_HELPER(fetch_add)
+GEN_ATOMIC_HELPER(fetch_and)
+GEN_ATOMIC_HELPER(fetch_or)
+GEN_ATOMIC_HELPER(fetch_xor)
+
+GEN_ATOMIC_HELPER(add_fetch)
+GEN_ATOMIC_HELPER(and_fetch)
+GEN_ATOMIC_HELPER(or_fetch)
+GEN_ATOMIC_HELPER(xor_fetch)
+
+GEN_ATOMIC_HELPER(xchg)
+
+#undef GEN_ATOMIC_HELPER
+
+#if DATA_SIZE > 1
+DATA_TYPE glue(glue(helper_atomic_cmpxchg, SUFFIX), RE_SUFFIX)
+    (CPUArchState *env, target_ulong addr, DATA_TYPE cmpv, DATA_TYPE newv,
+     TCGMemOpIdx oi, uintptr_t retaddr)
+{
+    DATA_TYPE retv;
+    cmpv = BSWAP(cmpv);
+    newv = BSWAP(newv);
+    retv = (glue(glue(helper_atomic_cmpxchg, SUFFIX), HE_SUFFIX)
+            (env, addr, cmpv, newv, oi, retaddr));
+    return BSWAP(retv);
+}
+
+#define GEN_ATOMIC_HELPER(NAME)                                         \
+DATA_TYPE glue(glue(glue(helper_atomic_, NAME), SUFFIX), RE_SUFFIX)     \
+    (CPUArchState *env, target_ulong addr, DATA_TYPE val,               \
+     TCGMemOpIdx oi, uintptr_t retaddr)                                 \
+{                                                                       \
+    DATA_TYPE ret;                                                      \
+    val = BSWAP(val);                                                   \
+    ret = (glue(glue(glue(helper_atomic_, NAME), SUFFIX), HE_SUFFIX)    \
+           (env, addr, val, oi, retaddr));                              \
+    return BSWAP(ret);                                                  \
+}
+
+GEN_ATOMIC_HELPER(fetch_and)
+GEN_ATOMIC_HELPER(fetch_or)
+GEN_ATOMIC_HELPER(fetch_xor)
+
+GEN_ATOMIC_HELPER(and_fetch)
+GEN_ATOMIC_HELPER(or_fetch)
+GEN_ATOMIC_HELPER(xor_fetch)
+
+GEN_ATOMIC_HELPER(xchg)
+
+#undef GEN_ATOMIC_HELPER
+
+/* Note that for addition, we need to use a separate cmpxchg loop instead
+   of bswaps around the host-endian helpers.  */
+DATA_TYPE glue(glue(helper_atomic_fetch_add, SUFFIX), RE_SUFFIX)
+    (CPUArchState *env, target_ulong addr, DATA_TYPE val,
+     TCGMemOpIdx oi, uintptr_t retaddr)
+{
+    DATA_TYPE ldo, ldn, ret, sto;
+    ATOMIC_MMU_BODY;
+
+    ldo = *haddr;
+    while (1) {
+        ret = BSWAP(ldo);
+        sto = BSWAP(ret + val);
+        ldn = atomic_cmpxchg(haddr, ldo, sto);
+        if (ldn == ldo) {
+            return ret;
+        }
+        ldo = ldn;
+    }
+}
+
+DATA_TYPE glue(glue(helper_atomic_add_fetch, SUFFIX), RE_SUFFIX)
+    (CPUArchState *env, target_ulong addr, DATA_TYPE val,
+     TCGMemOpIdx oi, uintptr_t retaddr)
+{
+    DATA_TYPE ldo, ldn, ret, sto;
+    ATOMIC_MMU_BODY;
+
+    ldo = *haddr;
+    while (1) {
+        ret = BSWAP(ldo) + val;
+        sto = BSWAP(ret);
+        ldn = atomic_cmpxchg(haddr, ldo, sto);
+        if (ldn == ldo) {
+            return ret;
+        }
+        ldo = ldn;
+    }
+}
+#endif /* DATA_SIZE > 1 */
+
+#undef ATOMIC_MMU_BODY
+
 #endif /* !defined(SOFTMMU_CODE_ACCESS) */
 
 #undef READ_ACCESS_TYPE
 #undef SHIFT
 #undef DATA_TYPE
+#undef DATAX_TYPE
 #undef SUFFIX
 #undef LSUFFIX
 #undef DATA_SIZE
@@ -524,8 +696,6 @@  void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx,
 #undef BSWAP
 #undef TGT_BE
 #undef TGT_LE
-#undef CPU_BE
-#undef CPU_LE
 #undef helper_le_ld_name
 #undef helper_be_ld_name
 #undef helper_le_lds_name
@@ -534,3 +704,5 @@  void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx,
 #undef helper_be_st_name
 #undef helper_te_ld_name
 #undef helper_te_st_name
+#undef HE_SUFFIX
+#undef RE_SUFFIX
diff --git a/tcg-runtime.c b/tcg-runtime.c
index ea2ad64..63f78fc 100644
--- a/tcg-runtime.c
+++ b/tcg-runtime.c
@@ -23,17 +23,10 @@ 
  */
 #include "qemu/osdep.h"
 #include "qemu/host-utils.h"
-
-/* This file is compiled once, and thus we can't include the standard
-   "exec/helper-proto.h", which has includes that are target specific.  */
-
-#include "exec/helper-head.h"
-
-#define DEF_HELPER_FLAGS_2(name, flags, ret, t1, t2) \
-  dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2));
-
-#include "tcg-runtime.h"
-
+#include "cpu.h"
+#include "exec/helper-proto.h"
+#include "exec/cpu_ldst.h"
+#include "exec/exec-all.h"
 
 /* 32-bit helpers */
 
@@ -107,3 +100,15 @@  int64_t HELPER(mulsh_i64)(int64_t arg1, int64_t arg2)
     muls64(&l, &h, arg1, arg2);
     return h;
 }
+
+#define SHIFT 0
+#include "atomic_template.h"
+
+#define SHIFT 1
+#include "atomic_template.h"
+
+#define SHIFT 2
+#include "atomic_template.h"
+
+#define SHIFT 3
+#include "atomic_template.h"
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 293b854..bc72c17 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -1958,3 +1958,327 @@  void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg idx, TCGMemOp memop)
                                addr, trace_mem_get_info(memop, 1));
     gen_ldst_i64(INDEX_op_qemu_st_i64, val, addr, memop, idx);
 }
+
+static void tcg_gen_ext_i32(TCGv_i32 ret, TCGv_i32 val, TCGMemOp opc)
+{
+    switch (opc & MO_SSIZE) {
+    case MO_SB:
+        tcg_gen_ext8s_i32(ret, val);
+        break;
+    case MO_UB:
+        tcg_gen_ext8u_i32(ret, val);
+        break;
+    case MO_SW:
+        tcg_gen_ext16s_i32(ret, val);
+        break;
+    case MO_UW:
+        tcg_gen_ext16u_i32(ret, val);
+        break;
+    default:
+        tcg_gen_mov_i32(ret, val);
+        break;
+    }
+}
+
+static void tcg_gen_ext_i64(TCGv_i64 ret, TCGv_i64 val, TCGMemOp opc)
+{
+    switch (opc & MO_SSIZE) {
+    case MO_SB:
+        tcg_gen_ext8s_i64(ret, val);
+        break;
+    case MO_UB:
+        tcg_gen_ext8u_i64(ret, val);
+        break;
+    case MO_SW:
+        tcg_gen_ext16s_i64(ret, val);
+        break;
+    case MO_UW:
+        tcg_gen_ext16u_i64(ret, val);
+        break;
+    case MO_SL:
+        tcg_gen_ext32s_i64(ret, val);
+        break;
+    case MO_UL:
+        tcg_gen_ext32u_i64(ret, val);
+        break;
+    default:
+        tcg_gen_mov_i64(ret, val);
+        break;
+    }
+}
+
+#ifdef CONFIG_USER_ONLY
+typedef void (*gen_atomic_cx_i32)(TCGv_i32, TCGv, TCGv_i32, TCGv_i32);
+typedef void (*gen_atomic_cx_i64)(TCGv_i64, TCGv, TCGv_i64, TCGv_i64);
+typedef void (*gen_atomic_op_i32)(TCGv_i32, TCGv, TCGv_i32);
+typedef void (*gen_atomic_op_i64)(TCGv_i64, TCGv, TCGv_i64);
+#else
+typedef void (*gen_atomic_cx_i32)(TCGv_i32, TCGv_env, TCGv,
+                                  TCGv_i32, TCGv_i32, TCGv_i32);
+typedef void (*gen_atomic_cx_i64)(TCGv_i64, TCGv_env, TCGv,
+                                  TCGv_i64, TCGv_i64, TCGv_i32);
+typedef void (*gen_atomic_op_i32)(TCGv_i32, TCGv_env, TCGv, TCGv_i32, TCGv_i32);
+typedef void (*gen_atomic_op_i64)(TCGv_i64, TCGv_env, TCGv, TCGv_i64, TCGv_i32);
+#endif
+
+static void * const table_cmpxchg[16] = {
+    [MO_8] = gen_helper_atomic_cmpxchgb,
+    [MO_16 | MO_LE] = gen_helper_atomic_cmpxchgw_le,
+    [MO_16 | MO_BE] = gen_helper_atomic_cmpxchgw_be,
+    [MO_32 | MO_LE] = gen_helper_atomic_cmpxchgl_le,
+    [MO_32 | MO_BE] = gen_helper_atomic_cmpxchgl_be,
+    [MO_64 | MO_LE] = gen_helper_atomic_cmpxchgq_le,
+    [MO_64 | MO_BE] = gen_helper_atomic_cmpxchgq_be,
+};
+
+void tcg_gen_atomic_cmpxchg_i32(TCGv_i32 retv, TCGv addr, TCGv_i32 cmpv,
+                                TCGv_i32 newv, TCGArg idx, TCGMemOp memop)
+{
+    memop = tcg_canonicalize_memop(memop, 0, 0);
+
+    if (!parallel_cpus) {
+        TCGv_i32 t1 = tcg_temp_new_i32();
+        TCGv_i32 t2 = tcg_temp_new_i32();
+
+        tcg_gen_ext_i32(t2, cmpv, memop & MO_SIZE);
+
+        tcg_gen_qemu_ld_i32(t1, addr, idx, memop & ~MO_SIGN);
+        tcg_gen_movcond_i32(TCG_COND_EQ, t2, t1, t2, newv, t1);
+        tcg_gen_qemu_st_i32(t2, addr, idx, memop);
+        tcg_temp_free_i32(t2);
+
+        if (memop & MO_SIGN) {
+            tcg_gen_ext_i32(retv, t1, memop);
+        } else {
+            tcg_gen_mov_i32(retv, t1);
+        }
+        tcg_temp_free_i32(t1);
+    } else {
+        gen_atomic_cx_i32 gen;
+
+        gen = table_cmpxchg[memop & (MO_SIZE | MO_BSWAP)];
+        tcg_debug_assert(gen != NULL);
+
+#ifdef CONFIG_USER_ONLY
+        gen(retv, addr, cmpv, newv);
+#else
+        {
+            TCGv_i32 oi = tcg_const_i32(make_memop_idx(memop & ~MO_SIGN, idx));
+            gen(retv, tcg_ctx.tcg_env, addr, cmpv, newv, oi);
+            tcg_temp_free_i32(oi);
+        }
+#endif
+
+        if (memop & MO_SIGN) {
+            tcg_gen_ext_i32(retv, retv, memop);
+        }
+    }
+}
+
+void tcg_gen_atomic_cmpxchg_i64(TCGv_i64 retv, TCGv addr, TCGv_i64 cmpv,
+                                TCGv_i64 newv, TCGArg idx, TCGMemOp memop)
+{
+    memop = tcg_canonicalize_memop(memop, 1, 0);
+
+    if (!parallel_cpus) {
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        TCGv_i64 t2 = tcg_temp_new_i64();
+
+        tcg_gen_ext_i64(t2, cmpv, memop & MO_SIZE);
+
+        tcg_gen_qemu_ld_i64(t1, addr, idx, memop & ~MO_SIGN);
+        tcg_gen_movcond_i64(TCG_COND_EQ, t2, t1, t2, newv, t1);
+        tcg_gen_qemu_st_i64(t2, addr, idx, memop);
+        tcg_temp_free_i64(t2);
+
+        if (memop & MO_SIGN) {
+            tcg_gen_ext_i64(retv, t1, memop);
+        } else {
+            tcg_gen_mov_i64(retv, t1);
+        }
+        tcg_temp_free_i64(t1);
+    } else if ((memop & MO_SIZE) == MO_64) {
+        gen_atomic_cx_i64 gen;
+
+        gen = table_cmpxchg[memop & (MO_SIZE | MO_BSWAP)];
+        tcg_debug_assert(gen != NULL);
+
+#ifdef CONFIG_USER_ONLY
+        gen(retv, addr, cmpv, newv);
+#else
+        TCGv_i32 oi = tcg_const_i32(make_memop_idx(memop, idx));
+        gen(retv, tcg_ctx.tcg_env, addr, cmpv, newv, oi);
+        tcg_temp_free_i32(oi);
+#endif
+    } else {
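+        /* Operations narrower than 64 bits go through the 32-bit
+           expander; truncate the inputs and widen the result.  */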
+        TCGv_i32 c32 = tcg_temp_new_i32();
+        TCGv_i32 n32 = tcg_temp_new_i32();
+        TCGv_i32 r32 = tcg_temp_new_i32();
+
+        tcg_gen_extrl_i64_i32(c32, cmpv);
+        tcg_gen_extrl_i64_i32(n32, newv);
+        tcg_gen_atomic_cmpxchg_i32(r32, addr, c32, n32, idx, memop & ~MO_SIGN);
+        tcg_temp_free_i32(c32);
+        tcg_temp_free_i32(n32);
+
+        tcg_gen_extu_i32_i64(retv, r32);
+        tcg_temp_free_i32(r32);
+
+        if (memop & MO_SIGN) {
+            tcg_gen_ext_i64(retv, retv, memop);
+        }
+    }
+}
+
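+/* Serial fallback for the read-modify-write operations: load the old
+   value, apply the operation, store the result.  new_val selects whether
+   the value returned is the updated value or the original memory value.  */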
+static void do_nonatomic_op_i32(TCGv_i32 ret, TCGv addr, TCGv_i32 val,
+                                TCGArg idx, TCGMemOp memop, bool new_val,
+                                void (*gen)(TCGv_i32, TCGv_i32, TCGv_i32))
+{
+    TCGv_i32 t1 = tcg_temp_new_i32();
+    TCGv_i32 t2 = tcg_temp_new_i32();
+
+    memop = tcg_canonicalize_memop(memop, 0, 0);
+
+    tcg_gen_qemu_ld_i32(t1, addr, idx, memop & ~MO_SIGN);
+    gen(t2, t1, val);
+    tcg_gen_qemu_st_i32(t2, addr, idx, memop);
+
+    tcg_gen_ext_i32(ret, (new_val ? t2 : t1), memop);
+    tcg_temp_free_i32(t1);
+    tcg_temp_free_i32(t2);
+}
+
+static void do_atomic_op_i32(TCGv_i32 ret, TCGv addr, TCGv_i32 val,
+                             TCGArg idx, TCGMemOp memop, void * const table[])
+{
+    gen_atomic_op_i32 gen;
+
+    memop = tcg_canonicalize_memop(memop, 0, 0);
+
+    gen = table[memop & (MO_SIZE | MO_BSWAP)];
+    tcg_debug_assert(gen != NULL);
+
+#ifdef CONFIG_USER_ONLY
+    gen(ret, addr, val);
+#else
+    {
+        TCGv_i32 oi = tcg_const_i32(make_memop_idx(memop & ~MO_SIGN, idx));
+        gen(ret, tcg_ctx.tcg_env, addr, val, oi);
+        tcg_temp_free_i32(oi);
+    }
+#endif
+
+    if (memop & MO_SIGN) {
+        tcg_gen_ext_i32(ret, ret, memop);
+    }
+}
+
+static void do_nonatomic_op_i64(TCGv_i64 ret, TCGv addr, TCGv_i64 val,
+                                TCGArg idx, TCGMemOp memop, bool new_val,
+                                void (*gen)(TCGv_i64, TCGv_i64, TCGv_i64))
+{
+    TCGv_i64 t1 = tcg_temp_new_i64();
+    TCGv_i64 t2 = tcg_temp_new_i64();
+
+    memop = tcg_canonicalize_memop(memop, 1, 0);
+
+    tcg_gen_qemu_ld_i64(t1, addr, idx, memop & ~MO_SIGN);
+    gen(t2, t1, val);
+    tcg_gen_qemu_st_i64(t2, addr, idx, memop);
+
+    tcg_gen_ext_i64(ret, (new_val ? t2 : t1), memop);
+    tcg_temp_free_i64(t1);
+    tcg_temp_free_i64(t2);
+}
+
+static void do_atomic_op_i64(TCGv_i64 ret, TCGv addr, TCGv_i64 val,
+                             TCGArg idx, TCGMemOp memop, void * const table[])
+{
+    memop = tcg_canonicalize_memop(memop, 1, 0);
+
+    if ((memop & MO_SIZE) == MO_64) {
+        gen_atomic_op_i64 gen;
+
+        gen = table[memop & (MO_SIZE | MO_BSWAP)];
+        tcg_debug_assert(gen != NULL);
+
+#ifdef CONFIG_USER_ONLY
+        gen(ret, addr, val);
+#else
+        {
+            TCGv_i32 oi = tcg_const_i32(make_memop_idx(memop & ~MO_SIGN, idx));
+            gen(ret, tcg_ctx.tcg_env, addr, val, oi);
+            tcg_temp_free_i32(oi);
+        }
+#endif
+    } else {
+        TCGv_i32 v32 = tcg_temp_new_i32();
+        TCGv_i32 r32 = tcg_temp_new_i32();
+
+        tcg_gen_extrl_i64_i32(v32, val);
+        do_atomic_op_i32(r32, addr, v32, idx, memop & ~MO_SIGN, table);
+        tcg_temp_free_i32(v32);
+
+        tcg_gen_extu_i32_i64(ret, r32);
+        tcg_temp_free_i32(r32);
+
+        if (memop & MO_SIGN) {
+            tcg_gen_ext_i64(ret, ret, memop);
+        }
+    }
+}
+
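+/* Emit the helper table and the i32/i64 front ends for one read-modify-write
+   operation.  OP names the plain TCG op used by the serial fallback and NEW
+   selects whether the old (0) or updated (1) value is returned.  */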
+#define GEN_ATOMIC_HELPER(NAME, OP, NEW)                                \
+static void * const table_##NAME[16] = {                                \
+    [MO_8] = gen_helper_atomic_##NAME##b,                               \
+    [MO_16 | MO_LE] = gen_helper_atomic_##NAME##w_le,                   \
+    [MO_16 | MO_BE] = gen_helper_atomic_##NAME##w_be,                   \
+    [MO_32 | MO_LE] = gen_helper_atomic_##NAME##l_le,                   \
+    [MO_32 | MO_BE] = gen_helper_atomic_##NAME##l_be,                   \
+    [MO_64 | MO_LE] = gen_helper_atomic_##NAME##q_le,                   \
+    [MO_64 | MO_BE] = gen_helper_atomic_##NAME##q_be,                   \
+};                                                                      \
+void tcg_gen_atomic_##NAME##_i32                                        \
+    (TCGv_i32 ret, TCGv addr, TCGv_i32 val, TCGArg idx, TCGMemOp memop) \
+{                                                                       \
+    if (parallel_cpus) {                                                \
+        do_atomic_op_i32(ret, addr, val, idx, memop, table_##NAME);     \
+    } else {                                                            \
+        do_nonatomic_op_i32(ret, addr, val, idx, memop, NEW,            \
+                            tcg_gen_##OP##_i32);                        \
+    }                                                                   \
+}                                                                       \
+void tcg_gen_atomic_##NAME##_i64                                        \
+    (TCGv_i64 ret, TCGv addr, TCGv_i64 val, TCGArg idx, TCGMemOp memop) \
+{                                                                       \
+    if (parallel_cpus) {                                                \
+        do_atomic_op_i64(ret, addr, val, idx, memop, table_##NAME);     \
+    } else {                                                            \
+        do_nonatomic_op_i64(ret, addr, val, idx, memop, NEW,            \
+                            tcg_gen_##OP##_i64);                        \
+    }                                                                   \
+}
+
+GEN_ATOMIC_HELPER(fetch_add, add, 0)
+GEN_ATOMIC_HELPER(fetch_and, and, 0)
+GEN_ATOMIC_HELPER(fetch_or, or, 0)
+GEN_ATOMIC_HELPER(fetch_xor, xor, 0)
+
+GEN_ATOMIC_HELPER(add_fetch, add, 1)
+GEN_ATOMIC_HELPER(and_fetch, and, 1)
+GEN_ATOMIC_HELPER(or_fetch, or, 1)
+GEN_ATOMIC_HELPER(xor_fetch, xor, 1)
+
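+/* For xchg, the "operation" of the serial fallback simply selects the new
+   value, so the generic expanders above can be reused.  */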
+static void tcg_gen_mov2_i32(TCGv_i32 r, TCGv_i32 a, TCGv_i32 b)
+{
+    tcg_gen_mov_i32(r, b);
+}
+
+static void tcg_gen_mov2_i64(TCGv_i64 r, TCGv_i64 a, TCGv_i64 b)
+{
+    tcg_gen_mov_i64(r, b);
+}
+
+GEN_ATOMIC_HELPER(xchg, mov2, 0)
+
+#undef GEN_ATOMIC_HELPER
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index f217e80..2a845ae 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -852,6 +852,30 @@ static inline void tcg_gen_qemu_st64(TCGv_i64 arg, TCGv addr, int mem_index)
     tcg_gen_qemu_st_i64(arg, addr, mem_index, MO_TEQ);
 }
 
+void tcg_gen_atomic_cmpxchg_i32(TCGv_i32, TCGv, TCGv_i32, TCGv_i32,
+                                TCGArg, TCGMemOp);
+void tcg_gen_atomic_cmpxchg_i64(TCGv_i64, TCGv, TCGv_i64, TCGv_i64,
+                                TCGArg, TCGMemOp);
+
+void tcg_gen_atomic_xchg_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
+void tcg_gen_atomic_xchg_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
+void tcg_gen_atomic_fetch_add_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
+void tcg_gen_atomic_fetch_add_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
+void tcg_gen_atomic_fetch_and_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
+void tcg_gen_atomic_fetch_and_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
+void tcg_gen_atomic_fetch_or_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
+void tcg_gen_atomic_fetch_or_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
+void tcg_gen_atomic_fetch_xor_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
+void tcg_gen_atomic_fetch_xor_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
+void tcg_gen_atomic_add_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
+void tcg_gen_atomic_add_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
+void tcg_gen_atomic_and_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
+void tcg_gen_atomic_and_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
+void tcg_gen_atomic_or_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
+void tcg_gen_atomic_or_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
+void tcg_gen_atomic_xor_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
+void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
+
 #if TARGET_LONG_BITS == 64
 #define tcg_gen_movi_tl tcg_gen_movi_i64
 #define tcg_gen_mov_tl tcg_gen_mov_i64
@@ -930,6 +954,16 @@ static inline void tcg_gen_qemu_st64(TCGv_i64 arg, TCGv addr, int mem_index)
 #define tcg_gen_sub2_tl tcg_gen_sub2_i64
 #define tcg_gen_mulu2_tl tcg_gen_mulu2_i64
 #define tcg_gen_muls2_tl tcg_gen_muls2_i64
+#define tcg_gen_atomic_cmpxchg_tl tcg_gen_atomic_cmpxchg_i64
+#define tcg_gen_atomic_xchg_tl tcg_gen_atomic_xchg_i64
+#define tcg_gen_atomic_fetch_add_tl tcg_gen_atomic_fetch_add_i64
+#define tcg_gen_atomic_fetch_and_tl tcg_gen_atomic_fetch_and_i64
+#define tcg_gen_atomic_fetch_or_tl tcg_gen_atomic_fetch_or_i64
+#define tcg_gen_atomic_fetch_xor_tl tcg_gen_atomic_fetch_xor_i64
+#define tcg_gen_atomic_add_fetch_tl tcg_gen_atomic_add_fetch_i64
+#define tcg_gen_atomic_and_fetch_tl tcg_gen_atomic_and_fetch_i64
+#define tcg_gen_atomic_or_fetch_tl tcg_gen_atomic_or_fetch_i64
+#define tcg_gen_atomic_xor_fetch_tl tcg_gen_atomic_xor_fetch_i64
 #else
 #define tcg_gen_movi_tl tcg_gen_movi_i32
 #define tcg_gen_mov_tl tcg_gen_mov_i32
@@ -1007,6 +1041,16 @@ static inline void tcg_gen_qemu_st64(TCGv_i64 arg, TCGv addr, int mem_index)
 #define tcg_gen_sub2_tl tcg_gen_sub2_i32
 #define tcg_gen_mulu2_tl tcg_gen_mulu2_i32
 #define tcg_gen_muls2_tl tcg_gen_muls2_i32
+#define tcg_gen_atomic_cmpxchg_tl tcg_gen_atomic_cmpxchg_i32
+#define tcg_gen_atomic_xchg_tl tcg_gen_atomic_xchg_i32
+#define tcg_gen_atomic_fetch_add_tl tcg_gen_atomic_fetch_add_i32
+#define tcg_gen_atomic_fetch_and_tl tcg_gen_atomic_fetch_and_i32
+#define tcg_gen_atomic_fetch_or_tl tcg_gen_atomic_fetch_or_i32
+#define tcg_gen_atomic_fetch_xor_tl tcg_gen_atomic_fetch_xor_i32
+#define tcg_gen_atomic_add_fetch_tl tcg_gen_atomic_add_fetch_i32
+#define tcg_gen_atomic_and_fetch_tl tcg_gen_atomic_and_fetch_i32
+#define tcg_gen_atomic_or_fetch_tl tcg_gen_atomic_or_fetch_i32
+#define tcg_gen_atomic_xor_fetch_tl tcg_gen_atomic_xor_fetch_i32
 #endif
 
 #if UINTPTR_MAX == UINT32_MAX
diff --git a/tcg/tcg-runtime.h b/tcg/tcg-runtime.h
index 23a0c37..b3accf4 100644
--- a/tcg/tcg-runtime.h
+++ b/tcg/tcg-runtime.h
@@ -14,3 +14,78 @@ DEF_HELPER_FLAGS_2(sar_i64, TCG_CALL_NO_RWG_SE, s64, s64, s64)
 
 DEF_HELPER_FLAGS_2(mulsh_i64, TCG_CALL_NO_RWG_SE, s64, s64, s64)
 DEF_HELPER_FLAGS_2(muluh_i64, TCG_CALL_NO_RWG_SE, i64, i64, i64)
+
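+/* Helper declarations for the atomic operations.  The user-only variants
+   take only the address and data arguments; the softmmu variants also take
+   the CPU env and a TCGMemOpIdx so they can perform the TLB lookup.  */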
+#ifdef CONFIG_USER_ONLY
+
+DEF_HELPER_FLAGS_3(atomic_cmpxchgb, TCG_CALL_NO_WG, i32, tl, i32, i32)
+DEF_HELPER_FLAGS_3(atomic_cmpxchgw_be, TCG_CALL_NO_WG, i32, tl, i32, i32)
+DEF_HELPER_FLAGS_3(atomic_cmpxchgl_be, TCG_CALL_NO_WG, i32, tl, i32, i32)
+DEF_HELPER_FLAGS_3(atomic_cmpxchgq_be, TCG_CALL_NO_WG, i64, tl, i64, i64)
+DEF_HELPER_FLAGS_3(atomic_cmpxchgw_le, TCG_CALL_NO_WG, i32, tl, i32, i32)
+DEF_HELPER_FLAGS_3(atomic_cmpxchgl_le, TCG_CALL_NO_WG, i32, tl, i32, i32)
+DEF_HELPER_FLAGS_3(atomic_cmpxchgq_le, TCG_CALL_NO_WG, i64, tl, i64, i64)
+
+#define GEN_ATOMIC_HELPERS(NAME)                        \
+    DEF_HELPER_FLAGS_2(glue(glue(atomic_, NAME), b),    \
+                       TCG_CALL_NO_WG, i32, tl, i32)    \
+    DEF_HELPER_FLAGS_2(glue(glue(atomic_, NAME), w_le), \
+                       TCG_CALL_NO_WG, i32, tl, i32)    \
+    DEF_HELPER_FLAGS_2(glue(glue(atomic_, NAME), w_be), \
+                       TCG_CALL_NO_WG, i32, tl, i32)    \
+    DEF_HELPER_FLAGS_2(glue(glue(atomic_, NAME), l_le), \
+                       TCG_CALL_NO_WG, i32, tl, i32)    \
+    DEF_HELPER_FLAGS_2(glue(glue(atomic_, NAME), l_be), \
+                       TCG_CALL_NO_WG, i32, tl, i32)    \
+    DEF_HELPER_FLAGS_2(glue(glue(atomic_, NAME), q_le), \
+                       TCG_CALL_NO_WG, i64, tl, i64)    \
+    DEF_HELPER_FLAGS_2(glue(glue(atomic_, NAME), q_be), \
+                       TCG_CALL_NO_WG, i64, tl, i64)
+
+#else
+
+DEF_HELPER_FLAGS_5(atomic_cmpxchgb, TCG_CALL_NO_WG, i32, env,
+                   tl, i32, i32, i32)
+DEF_HELPER_FLAGS_5(atomic_cmpxchgw_be, TCG_CALL_NO_WG, i32, env,
+                   tl, i32, i32, i32)
+DEF_HELPER_FLAGS_5(atomic_cmpxchgl_be, TCG_CALL_NO_WG, i32, env,
+                   tl, i32, i32, i32)
+DEF_HELPER_FLAGS_5(atomic_cmpxchgq_be, TCG_CALL_NO_WG, i64, env,
+                   tl, i64, i64, i32)
+DEF_HELPER_FLAGS_5(atomic_cmpxchgw_le, TCG_CALL_NO_WG, i32, env,
+                   tl, i32, i32, i32)
+DEF_HELPER_FLAGS_5(atomic_cmpxchgl_le, TCG_CALL_NO_WG, i32, env,
+                   tl, i32, i32, i32)
+DEF_HELPER_FLAGS_5(atomic_cmpxchgq_le, TCG_CALL_NO_WG, i64, env,
+                   tl, i64, i64, i32)
+
+#define GEN_ATOMIC_HELPERS(NAME)                                \
+    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), b),            \
+                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)  \
+    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), w_le),         \
+                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)  \
+    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), w_be),         \
+                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)  \
+    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), l_le),         \
+                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)  \
+    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), l_be),         \
+                       TCG_CALL_NO_WG, i32, env, tl, i32, i32)  \
+    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), q_le),         \
+                       TCG_CALL_NO_WG, i64, env, tl, i64, i32)  \
+    DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), q_be),         \
+                       TCG_CALL_NO_WG, i64, env, tl, i64, i32)
+
+#endif
+
+GEN_ATOMIC_HELPERS(fetch_add)
+GEN_ATOMIC_HELPERS(fetch_and)
+GEN_ATOMIC_HELPERS(fetch_or)
+GEN_ATOMIC_HELPERS(fetch_xor)
+
+GEN_ATOMIC_HELPERS(add_fetch)
+GEN_ATOMIC_HELPERS(and_fetch)
+GEN_ATOMIC_HELPERS(or_fetch)
+GEN_ATOMIC_HELPERS(xor_fetch)
+
+GEN_ATOMIC_HELPERS(xchg)
+
+#undef GEN_ATOMIC_HELPERS
diff --git a/tcg/tcg.h b/tcg/tcg.h
index ab67537..4e60498 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -1163,6 +1163,59 @@ uint64_t helper_be_ldq_cmmu(CPUArchState *env, target_ulong addr,
 # define helper_ret_ldq_cmmu  helper_le_ldq_cmmu
 #endif
 
+uint8_t helper_atomic_cmpxchgb_mmu(CPUArchState *env, target_ulong addr,
+                                   uint8_t cmpv, uint8_t newv,
+                                   TCGMemOpIdx oi, uintptr_t retaddr);
+uint16_t helper_atomic_cmpxchgw_le_mmu(CPUArchState *env, target_ulong addr,
+                                       uint16_t cmpv, uint16_t newv,
+                                       TCGMemOpIdx oi, uintptr_t retaddr);
+uint32_t helper_atomic_cmpxchgl_le_mmu(CPUArchState *env, target_ulong addr,
+                                       uint32_t cmpv, uint32_t newv,
+                                       TCGMemOpIdx oi, uintptr_t retaddr);
+uint64_t helper_atomic_cmpxchgq_le_mmu(CPUArchState *env, target_ulong addr,
+                                       uint64_t cmpv, uint64_t newv,
+                                       TCGMemOpIdx oi, uintptr_t retaddr);
+uint16_t helper_atomic_cmpxchgw_be_mmu(CPUArchState *env, target_ulong addr,
+                                       uint16_t cmpv, uint16_t newv,
+                                       TCGMemOpIdx oi, uintptr_t retaddr);
+uint32_t helper_atomic_cmpxchgl_be_mmu(CPUArchState *env, target_ulong addr,
+                                       uint32_t cmpv, uint32_t newv,
+                                       TCGMemOpIdx oi, uintptr_t retaddr);
+uint64_t helper_atomic_cmpxchgq_be_mmu(CPUArchState *env, target_ulong addr,
+                                       uint64_t cmpv, uint64_t newv,
+                                       TCGMemOpIdx oi, uintptr_t retaddr);
+
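+/* Declare the remaining read-modify-write _mmu helpers for every size
+   and endianness combination.  */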
+#define GEN_ATOMIC_HELPER(NAME, TYPE, SUFFIX)         \
+TYPE helper_atomic_ ## NAME ## SUFFIX ## _mmu         \
+    (CPUArchState *env, target_ulong addr, TYPE val,  \
+     TCGMemOpIdx oi, uintptr_t retaddr);
+
+#define GEN_ATOMIC_HELPER_ALL(NAME)          \
+    GEN_ATOMIC_HELPER(NAME, uint8_t, b)      \
+    GEN_ATOMIC_HELPER(NAME, uint16_t, w_le)  \
+    GEN_ATOMIC_HELPER(NAME, uint32_t, l_le)  \
+    GEN_ATOMIC_HELPER(NAME, uint64_t, q_le)  \
+    GEN_ATOMIC_HELPER(NAME, uint16_t, w_be)  \
+    GEN_ATOMIC_HELPER(NAME, uint32_t, l_be)  \
+    GEN_ATOMIC_HELPER(NAME, uint64_t, q_be)
+
+GEN_ATOMIC_HELPER_ALL(fetch_add)
+GEN_ATOMIC_HELPER_ALL(fetch_sub)
+GEN_ATOMIC_HELPER_ALL(fetch_and)
+GEN_ATOMIC_HELPER_ALL(fetch_or)
+GEN_ATOMIC_HELPER_ALL(fetch_xor)
+
+GEN_ATOMIC_HELPER_ALL(add_fetch)
+GEN_ATOMIC_HELPER_ALL(sub_fetch)
+GEN_ATOMIC_HELPER_ALL(and_fetch)
+GEN_ATOMIC_HELPER_ALL(or_fetch)
+GEN_ATOMIC_HELPER_ALL(xor_fetch)
+
+GEN_ATOMIC_HELPER_ALL(xchg)
+
+#undef GEN_ATOMIC_HELPER_ALL
+#undef GEN_ATOMIC_HELPER
+
 #endif /* CONFIG_SOFTMMU */
 
 #endif /* TCG_H */