diff mbox

[RFC,v3,05/13] target-arm: translate: implement qemu_ldlink and qemu_stcond ops

Message ID 1436516626-8322-6-git-send-email-a.rigo@virtualopensystems.com
State New
Headers show

Commit Message

Alvise Rigo July 10, 2015, 8:23 a.m. UTC
Implement the strex and ldrex instructions relying on TCG's qemu_ldlink and
qemu_stcond.  For the time being only the 32bit instructions are supported.

Suggested-by: Jani Kokkonen <jani.kokkonen@huawei.com>
Suggested-by: Claudio Fontana <claudio.fontana@huawei.com>
Signed-off-by: Alvise Rigo <a.rigo@virtualopensystems.com>
---
 target-arm/translate.c |  87 ++++++++++++++++++++++++++++++++++-
 tcg/arm/tcg-target.c   | 121 +++++++++++++++++++++++++++++++++++++------------
 2 files changed, 178 insertions(+), 30 deletions(-)

Comments

Alex Bennée July 17, 2015, 12:51 p.m. UTC | #1
Alvise Rigo <a.rigo@virtualopensystems.com> writes:

> Implement strex and ldrex instruction relying on TCG's qemu_ldlink and
> qemu_stcond.  For the time being only the 32bit instructions are supported.
>
> Suggested-by: Jani Kokkonen <jani.kokkonen@huawei.com>
> Suggested-by: Claudio Fontana <claudio.fontana@huawei.com>
> Signed-off-by: Alvise Rigo <a.rigo@virtualopensystems.com>
> ---
>  target-arm/translate.c |  87 ++++++++++++++++++++++++++++++++++-
>  tcg/arm/tcg-target.c   | 121 +++++++++++++++++++++++++++++++++++++------------
>  2 files changed, 178 insertions(+), 30 deletions(-)
>
> diff --git a/target-arm/translate.c b/target-arm/translate.c
> index 80302cd..0366c76 100644
> --- a/target-arm/translate.c
> +++ b/target-arm/translate.c
> @@ -72,6 +72,8 @@ static TCGv_i64 cpu_exclusive_test;
>  static TCGv_i32 cpu_exclusive_info;
>  #endif
>  
> +static TCGv_i32 cpu_ll_sc_context;
> +
>  /* FIXME:  These should be removed.  */
>  static TCGv_i32 cpu_F0s, cpu_F1s;
>  static TCGv_i64 cpu_F0d, cpu_F1d;
> @@ -103,6 +105,8 @@ void arm_translate_init(void)
>          offsetof(CPUARMState, exclusive_addr), "exclusive_addr");
>      cpu_exclusive_val = tcg_global_mem_new_i64(TCG_AREG0,
>          offsetof(CPUARMState, exclusive_val), "exclusive_val");
> +    cpu_ll_sc_context = tcg_global_mem_new_i32(TCG_AREG0,
> +        offsetof(CPUARMState, ll_sc_context), "ll_sc_context");
>  #ifdef CONFIG_USER_ONLY
>      cpu_exclusive_test = tcg_global_mem_new_i64(TCG_AREG0,
>          offsetof(CPUARMState, exclusive_test), "exclusive_test");
> @@ -961,6 +965,18 @@ DO_GEN_ST(8, MO_UB)
>  DO_GEN_ST(16, MO_TEUW)
>  DO_GEN_ST(32, MO_TEUL)
>  
> +/* Load/Store exclusive generators (always unsigned) */
> +static inline void gen_aa32_ldex32(TCGv_i32 val, TCGv_i32 addr, int index)
> +{
> +    tcg_gen_qemu_ldlink_i32(val, addr, index, MO_TEUL | MO_EXCL);
> +}
> +
> +static inline void gen_aa32_stex32(TCGv_i32 is_dirty, TCGv_i32 val,
> +                                   TCGv_i32 addr, int index)
> +{
> +    tcg_gen_qemu_stcond_i32(is_dirty, val, addr, index, MO_TEUL | MO_EXCL);
> +}
> +
>  static inline void gen_set_pc_im(DisasContext *s, target_ulong val)
>  {
>      tcg_gen_movi_i32(cpu_R[15], val);
> @@ -7427,6 +7443,26 @@ static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
>      store_reg(s, rt, tmp);
>  }
>  
> +static void gen_load_exclusive_multi(DisasContext *s, int rt, int rt2,
> +                                     TCGv_i32 addr, int size)
> +{
> +    TCGv_i32 tmp = tcg_temp_new_i32();
> +
> +    switch (size) {
> +    case 0:
> +    case 1:
> +        abort();
> +    case 2:
> +        gen_aa32_ldex32(tmp, addr, get_mem_index(s));
> +        break;
> +    case 3:
> +    default:
> +        abort();
> +    }
> +
> +    store_reg(s, rt, tmp);
> +}
> +
>  static void gen_clrex(DisasContext *s)
>  {
>      gen_helper_atomic_clear(cpu_env);
> @@ -7460,6 +7496,52 @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
>      tcg_temp_free_i64(val);
>      tcg_temp_free_i32(tmp_size);
>  }
> +
> +static void gen_store_exclusive_multi(DisasContext *s, int rd, int rt, int rt2,
> +                                      TCGv_i32 addr, int size)
> +{
> +    TCGv_i32 tmp;
> +    TCGv_i32 is_dirty;
> +    TCGLabel *done_label;
> +    TCGLabel *fail_label;
> +
> +    fail_label = gen_new_label();
> +    done_label = gen_new_label();
> +
> +    tmp = tcg_temp_new_i32();
> +    is_dirty = tcg_temp_new_i32();
> +
> +    /* Fail if we are not in LL/SC context. */
> +    tcg_gen_brcondi_i32(TCG_COND_NE, cpu_ll_sc_context, 1, fail_label);
> +
> +    tmp = load_reg(s, rt);
> +    switch (size) {
> +    case 0:
> +    case 1:
> +        abort();
> +        break;
> +    case 2:
> +        gen_aa32_stex32(is_dirty, tmp, addr, get_mem_index(s));
> +        break;
> +    case 3:
> +    default:
> +        abort();
> +    }
> +
> +    tcg_temp_free_i32(tmp);
> +
> +    /* Check if the store conditional has to fail. */
> +    tcg_gen_brcondi_i32(TCG_COND_EQ, is_dirty, 1, fail_label);
> +    tcg_temp_free_i32(is_dirty);
> +
> +    tcg_temp_free_i32(tmp);
> +
> +    tcg_gen_movi_i32(cpu_R[rd], 0); /* is_dirty = 0 */
> +    tcg_gen_br(done_label);
> +    gen_set_label(fail_label);
> +    tcg_gen_movi_i32(cpu_R[rd], 1); /* is_dirty = 1 */
> +    gen_set_label(done_label);
> +}
>  #endif
>  
>  /* gen_srs:
> @@ -8308,7 +8390,7 @@ static void disas_arm_insn(DisasContext *s, unsigned int insn)
>                          } else if (insn & (1 << 20)) {
>                              switch (op1) {
>                              case 0: /* ldrex */
> -                                gen_load_exclusive(s, rd, 15, addr, 2);
> +                                gen_load_exclusive_multi(s, rd, 15, addr, 2);
>                                  break;
>                              case 1: /* ldrexd */
>                                  gen_load_exclusive(s, rd, rd + 1, addr, 3);
> @@ -8326,7 +8408,8 @@ static void disas_arm_insn(DisasContext *s, unsigned int insn)
>                              rm = insn & 0xf;
>                              switch (op1) {
>                              case 0:  /*  strex */
> -                                gen_store_exclusive(s, rd, rm, 15, addr, 2);
> +                                gen_store_exclusive_multi(s, rd, rm, 15,
> +                                                          addr, 2);
>                                  break;
>                              case 1: /*  strexd */
>                                  gen_store_exclusive(s, rd, rm, rm + 1, addr, 3);
> diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
> index ae2ec7a..f2b69a0 100644
> --- a/tcg/arm/tcg-target.c
> +++ b/tcg/arm/tcg-target.c
> @@ -1069,6 +1069,17 @@ static void * const qemu_ld_helpers[16] = {
>      [MO_BESL] = helper_be_ldul_mmu,
>  };

So I'm guessing we'll be implementing this for every TCG backend? 

>  
> +/* LoadLink helpers, only unsigned. Use the macro below to access them. */
> +static void * const qemu_ldex_helpers[16] = {
> +    [MO_LEUL] = helper_le_ldlinkul_mmu,
> +};
> +
> +#define LDEX_HELPER(mem_op)                                             \
> +({                                                                      \
> +    assert(mem_op & MO_EXCL);                                           \
> +    qemu_ldex_helpers[((int)mem_op - MO_EXCL)];                         \
> +})
> +
>  /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
>   *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
>   */
> @@ -1082,6 +1093,19 @@ static void * const qemu_st_helpers[16] = {
>      [MO_BEQ]  = helper_be_stq_mmu,
>  };
>  
> +/* StoreConditional helpers. Use the macro below to access them. */
> +static void * const qemu_stex_helpers[16] = {
> +    [MO_LEUL] = helper_le_stcondl_mmu,
> +};
> +
> +#define STEX_HELPER(mem_op)                                             \
> +({                                                                      \
> +    assert(mem_op & MO_EXCL);                                           \
> +    qemu_stex_helpers[(int)mem_op - MO_EXCL];                           \
> +})

Can the lookup not be merged with the existing ldst_helpers?

> +
> +
> +
>  /* Helper routines for marshalling helper function arguments into
>   * the correct registers and stack.
>   * argreg is where we want to put this argument, arg is the argument itself.
> @@ -1222,13 +1246,14 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
>     path for a load or store, so that we can later generate the correct
>     helper code.  */
>  static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
> -                                TCGReg datalo, TCGReg datahi, TCGReg addrlo,
> -                                TCGReg addrhi, tcg_insn_unit *raddr,
> -                                tcg_insn_unit *label_ptr)
> +                                TCGReg llsc_success, TCGReg datalo,
> +                                TCGReg datahi, TCGReg addrlo, TCGReg addrhi,
> +                                tcg_insn_unit *raddr, tcg_insn_unit *label_ptr)
>  {
>      TCGLabelQemuLdst *label = new_ldst_label(s);
>  
>      label->is_ld = is_ld;
> +    label->llsc_success = llsc_success;
>      label->oi = oi;
>      label->datalo_reg = datalo;
>      label->datahi_reg = datahi;
> @@ -1259,12 +1284,16 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
>      /* For armv6 we can use the canonical unsigned helpers and minimize
>         icache usage.  For pre-armv6, use the signed helpers since we do
>         not have a single insn sign-extend.  */
> -    if (use_armv6_instructions) {
> -        func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)];
> +    if (opc & MO_EXCL) {
> +        func = LDEX_HELPER(opc);
>      } else {
> -        func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SSIZE)];
> -        if (opc & MO_SIGN) {
> -            opc = MO_UL;
> +        if (use_armv6_instructions) {
> +            func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)];
> +        } else {
> +            func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SSIZE)];
> +            if (opc & MO_SIGN) {
> +                opc = MO_UL;
> +            }
>          }
>      }
>      tcg_out_call(s, func);
> @@ -1336,8 +1365,15 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
>      argreg = tcg_out_arg_imm32(s, argreg, oi);
>      argreg = tcg_out_arg_reg32(s, argreg, TCG_REG_R14);
>  
> -    /* Tail-call to the helper, which will return to the fast path.  */
> -    tcg_out_goto(s, COND_AL, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
> +    if (opc & MO_EXCL) {
> +        tcg_out_call(s, STEX_HELPER(opc));
> +        /* Save the output of the StoreConditional */
> +        tcg_out_mov_reg(s, COND_AL, lb->llsc_success, TCG_REG_R0);
> +        tcg_out_goto(s, COND_AL, lb->raddr);
> +    } else {
> +        /* Tail-call to the helper, which will return to the fast path.  */
> +        tcg_out_goto(s, COND_AL, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
> +    }
>  }
>  #endif /* SOFTMMU */
>  
> @@ -1461,7 +1497,8 @@ static inline void tcg_out_qemu_ld_direct(TCGContext *s, TCGMemOp opc,
>      }
>  }
>  
> -static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
> +static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64,
> +                            bool isLoadLink)
>  {
>      TCGReg addrlo, datalo, datahi, addrhi __attribute__((unused));
>      TCGMemOpIdx oi;
> @@ -1484,13 +1521,20 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
>      addend = tcg_out_tlb_read(s, addrlo, addrhi, opc & MO_SIZE, mem_index, 1);
>  
>      /* This a conditional BL only to load a pointer within this opcode into LR
> -       for the slow path.  We will not be using the value for a tail call.  */
> -    label_ptr = s->code_ptr;
> -    tcg_out_bl_noaddr(s, COND_NE);
> +       for the slow path.  We will not be using the value for a tail call.
> +       In the context of a LoadLink instruction, we don't check the TLB but we
> +       always follow the slow path.  */
> +    if (isLoadLink) {
> +        label_ptr = s->code_ptr;
> +        tcg_out_bl_noaddr(s, COND_AL);
> +    } else {
> +        label_ptr = s->code_ptr;
> +        tcg_out_bl_noaddr(s, COND_NE);

This seems a little redundant, we could set label_ptr outside the
isLoadLink check as it will be the same in both cases. However if we are
always taking the slow path for exclusive accesses then why should we
generate a tcg_out_tlb_read()? Do we need some side effects from calling
it for the slow path?

>  
> -    tcg_out_qemu_ld_index(s, opc, datalo, datahi, addrlo, addend);
> +        tcg_out_qemu_ld_index(s, opc, datalo, datahi, addrlo, addend);
> +    }
>  
> -    add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
> +    add_qemu_ldst_label(s, true, oi, 0, datalo, datahi, addrlo, addrhi,
>                          s->code_ptr, label_ptr);
>  #else /* !CONFIG_SOFTMMU */
>      if (GUEST_BASE) {
> @@ -1592,9 +1636,11 @@ static inline void tcg_out_qemu_st_direct(TCGContext *s, TCGMemOp opc,
>      }
>  }
>  
> -static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
> +static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64,
> +                            bool isStoreCond)
>  {
>      TCGReg addrlo, datalo, datahi, addrhi __attribute__((unused));
> +    TCGReg llsc_success;
>      TCGMemOpIdx oi;
>      TCGMemOp opc;
>  #ifdef CONFIG_SOFTMMU
> @@ -1603,6 +1649,9 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
>      tcg_insn_unit *label_ptr;
>  #endif
>  
> +    /* The stcond variant has one more param */
> +    llsc_success = (isStoreCond ? *args++ : 0);
> +
>      datalo = *args++;
>      datahi = (is64 ? *args++ : 0);
>      addrlo = *args++;
> @@ -1612,16 +1661,24 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
>  
>  #ifdef CONFIG_SOFTMMU
>      mem_index = get_mmuidx(oi);
> -    addend = tcg_out_tlb_read(s, addrlo, addrhi, opc & MO_SIZE, mem_index, 0);
>  
> -    tcg_out_qemu_st_index(s, COND_EQ, opc, datalo, datahi, addrlo, addend);
> +    if (isStoreCond) {
> +        /* Always follow the slow-path for an exclusive access */
> +        label_ptr = s->code_ptr;
> +        tcg_out_bl_noaddr(s, COND_AL);
> +    } else {
> +        addend = tcg_out_tlb_read(s, addrlo, addrhi, opc & MO_SIZE,
> +                                  mem_index, 0);
>  
> -    /* The conditional call must come last, as we're going to return here.  */
> -    label_ptr = s->code_ptr;
> -    tcg_out_bl_noaddr(s, COND_NE);
> +        tcg_out_qemu_st_index(s, COND_EQ, opc, datalo, datahi, addrlo, addend);
>  
> -    add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
> -                        s->code_ptr, label_ptr);
> +        /* The conditional call must come last, as we're going to return here.*/
> +        label_ptr = s->code_ptr;
> +        tcg_out_bl_noaddr(s, COND_NE);
> +    }
> +
> +    add_qemu_ldst_label(s, false, oi, llsc_success, datalo, datahi, addrlo,
> +                        addrhi, s->code_ptr, label_ptr);

Indeed this does what I commented about above ;-)

>  #else /* !CONFIG_SOFTMMU */
>      if (GUEST_BASE) {
>          tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP, GUEST_BASE);
> @@ -1864,16 +1921,22 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>          break;
>  
>      case INDEX_op_qemu_ld_i32:
> -        tcg_out_qemu_ld(s, args, 0);
> +        tcg_out_qemu_ld(s, args, 0, 0);
> +        break;
> +    case INDEX_op_qemu_ldlink_i32:
> +        tcg_out_qemu_ld(s, args, 0, 1); /* LoadLink */
>          break;
>      case INDEX_op_qemu_ld_i64:
> -        tcg_out_qemu_ld(s, args, 1);
> +        tcg_out_qemu_ld(s, args, 1, 0);
>          break;
>      case INDEX_op_qemu_st_i32:
> -        tcg_out_qemu_st(s, args, 0);
> +        tcg_out_qemu_st(s, args, 0, 0);
> +        break;
> +    case INDEX_op_qemu_stcond_i32:
> +        tcg_out_qemu_st(s, args, 0, 1); /* StoreConditional */
>          break;
>      case INDEX_op_qemu_st_i64:
> -        tcg_out_qemu_st(s, args, 1);
> +        tcg_out_qemu_st(s, args, 1, 0);
>          break;
>  
>      case INDEX_op_bswap16_i32:
> @@ -1957,8 +2020,10 @@ static const TCGTargetOpDef arm_op_defs[] = {
>  
>  #if TARGET_LONG_BITS == 32
>      { INDEX_op_qemu_ld_i32, { "r", "l" } },
> +    { INDEX_op_qemu_ldlink_i32, { "r", "l" } },
>      { INDEX_op_qemu_ld_i64, { "r", "r", "l" } },
>      { INDEX_op_qemu_st_i32, { "s", "s" } },
> +    { INDEX_op_qemu_stcond_i32, { "r", "s", "s" } },
>      { INDEX_op_qemu_st_i64, { "s", "s", "s" } },
>  #else
>      { INDEX_op_qemu_ld_i32, { "r", "l", "l" } },
Alvise Rigo July 17, 2015, 1:01 p.m. UTC | #2
On Fri, Jul 17, 2015 at 2:51 PM, Alex Bennée <alex.bennee@linaro.org> wrote:
>
> Alvise Rigo <a.rigo@virtualopensystems.com> writes:
>
>> Implement strex and ldrex instruction relying on TCG's qemu_ldlink and
>> qemu_stcond.  For the time being only the 32bit instructions are supported.
>>
>> Suggested-by: Jani Kokkonen <jani.kokkonen@huawei.com>
>> Suggested-by: Claudio Fontana <claudio.fontana@huawei.com>
>> Signed-off-by: Alvise Rigo <a.rigo@virtualopensystems.com>
>> ---
>>  target-arm/translate.c |  87 ++++++++++++++++++++++++++++++++++-
>>  tcg/arm/tcg-target.c   | 121 +++++++++++++++++++++++++++++++++++++------------
>>  2 files changed, 178 insertions(+), 30 deletions(-)
>>
>> diff --git a/target-arm/translate.c b/target-arm/translate.c
>> index 80302cd..0366c76 100644
>> --- a/target-arm/translate.c
>> +++ b/target-arm/translate.c
>> @@ -72,6 +72,8 @@ static TCGv_i64 cpu_exclusive_test;
>>  static TCGv_i32 cpu_exclusive_info;
>>  #endif
>>
>> +static TCGv_i32 cpu_ll_sc_context;
>> +
>>  /* FIXME:  These should be removed.  */
>>  static TCGv_i32 cpu_F0s, cpu_F1s;
>>  static TCGv_i64 cpu_F0d, cpu_F1d;
>> @@ -103,6 +105,8 @@ void arm_translate_init(void)
>>          offsetof(CPUARMState, exclusive_addr), "exclusive_addr");
>>      cpu_exclusive_val = tcg_global_mem_new_i64(TCG_AREG0,
>>          offsetof(CPUARMState, exclusive_val), "exclusive_val");
>> +    cpu_ll_sc_context = tcg_global_mem_new_i32(TCG_AREG0,
>> +        offsetof(CPUARMState, ll_sc_context), "ll_sc_context");
>>  #ifdef CONFIG_USER_ONLY
>>      cpu_exclusive_test = tcg_global_mem_new_i64(TCG_AREG0,
>>          offsetof(CPUARMState, exclusive_test), "exclusive_test");
>> @@ -961,6 +965,18 @@ DO_GEN_ST(8, MO_UB)
>>  DO_GEN_ST(16, MO_TEUW)
>>  DO_GEN_ST(32, MO_TEUL)
>>
>> +/* Load/Store exclusive generators (always unsigned) */
>> +static inline void gen_aa32_ldex32(TCGv_i32 val, TCGv_i32 addr, int index)
>> +{
>> +    tcg_gen_qemu_ldlink_i32(val, addr, index, MO_TEUL | MO_EXCL);
>> +}
>> +
>> +static inline void gen_aa32_stex32(TCGv_i32 is_dirty, TCGv_i32 val,
>> +                                   TCGv_i32 addr, int index)
>> +{
>> +    tcg_gen_qemu_stcond_i32(is_dirty, val, addr, index, MO_TEUL | MO_EXCL);
>> +}
>> +
>>  static inline void gen_set_pc_im(DisasContext *s, target_ulong val)
>>  {
>>      tcg_gen_movi_i32(cpu_R[15], val);
>> @@ -7427,6 +7443,26 @@ static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
>>      store_reg(s, rt, tmp);
>>  }
>>
>> +static void gen_load_exclusive_multi(DisasContext *s, int rt, int rt2,
>> +                                     TCGv_i32 addr, int size)
>> +{
>> +    TCGv_i32 tmp = tcg_temp_new_i32();
>> +
>> +    switch (size) {
>> +    case 0:
>> +    case 1:
>> +        abort();
>> +    case 2:
>> +        gen_aa32_ldex32(tmp, addr, get_mem_index(s));
>> +        break;
>> +    case 3:
>> +    default:
>> +        abort();
>> +    }
>> +
>> +    store_reg(s, rt, tmp);
>> +}
>> +
>>  static void gen_clrex(DisasContext *s)
>>  {
>>      gen_helper_atomic_clear(cpu_env);
>> @@ -7460,6 +7496,52 @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
>>      tcg_temp_free_i64(val);
>>      tcg_temp_free_i32(tmp_size);
>>  }
>> +
>> +static void gen_store_exclusive_multi(DisasContext *s, int rd, int rt, int rt2,
>> +                                      TCGv_i32 addr, int size)
>> +{
>> +    TCGv_i32 tmp;
>> +    TCGv_i32 is_dirty;
>> +    TCGLabel *done_label;
>> +    TCGLabel *fail_label;
>> +
>> +    fail_label = gen_new_label();
>> +    done_label = gen_new_label();
>> +
>> +    tmp = tcg_temp_new_i32();
>> +    is_dirty = tcg_temp_new_i32();
>> +
>> +    /* Fail if we are not in LL/SC context. */
>> +    tcg_gen_brcondi_i32(TCG_COND_NE, cpu_ll_sc_context, 1, fail_label);
>> +
>> +    tmp = load_reg(s, rt);
>> +    switch (size) {
>> +    case 0:
>> +    case 1:
>> +        abort();
>> +        break;
>> +    case 2:
>> +        gen_aa32_stex32(is_dirty, tmp, addr, get_mem_index(s));
>> +        break;
>> +    case 3:
>> +    default:
>> +        abort();
>> +    }
>> +
>> +    tcg_temp_free_i32(tmp);
>> +
>> +    /* Check if the store conditional has to fail. */
>> +    tcg_gen_brcondi_i32(TCG_COND_EQ, is_dirty, 1, fail_label);
>> +    tcg_temp_free_i32(is_dirty);
>> +
>> +    tcg_temp_free_i32(tmp);
>> +
>> +    tcg_gen_movi_i32(cpu_R[rd], 0); /* is_dirty = 0 */
>> +    tcg_gen_br(done_label);
>> +    gen_set_label(fail_label);
>> +    tcg_gen_movi_i32(cpu_R[rd], 1); /* is_dirty = 1 */
>> +    gen_set_label(done_label);
>> +}
>>  #endif
>>
>>  /* gen_srs:
>> @@ -8308,7 +8390,7 @@ static void disas_arm_insn(DisasContext *s, unsigned int insn)
>>                          } else if (insn & (1 << 20)) {
>>                              switch (op1) {
>>                              case 0: /* ldrex */
>> -                                gen_load_exclusive(s, rd, 15, addr, 2);
>> +                                gen_load_exclusive_multi(s, rd, 15, addr, 2);
>>                                  break;
>>                              case 1: /* ldrexd */
>>                                  gen_load_exclusive(s, rd, rd + 1, addr, 3);
>> @@ -8326,7 +8408,8 @@ static void disas_arm_insn(DisasContext *s, unsigned int insn)
>>                              rm = insn & 0xf;
>>                              switch (op1) {
>>                              case 0:  /*  strex */
>> -                                gen_store_exclusive(s, rd, rm, 15, addr, 2);
>> +                                gen_store_exclusive_multi(s, rd, rm, 15,
>> +                                                          addr, 2);
>>                                  break;
>>                              case 1: /*  strexd */
>>                                  gen_store_exclusive(s, rd, rm, rm + 1, addr, 3);
>> diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
>> index ae2ec7a..f2b69a0 100644
>> --- a/tcg/arm/tcg-target.c
>> +++ b/tcg/arm/tcg-target.c
>> @@ -1069,6 +1069,17 @@ static void * const qemu_ld_helpers[16] = {
>>      [MO_BESL] = helper_be_ldul_mmu,
>>  };
>
> So I'm guessing we'll be implementing this for every TCG backend?

Correct. Depending on the architecture's atomic instructions, a subset
of these helpers will be implemented.

>
>>
>> +/* LoadLink helpers, only unsigned. Use the macro below to access them. */
>> +static void * const qemu_ldex_helpers[16] = {
>> +    [MO_LEUL] = helper_le_ldlinkul_mmu,
>> +};
>> +
>> +#define LDEX_HELPER(mem_op)                                             \
>> +({                                                                      \
>> +    assert(mem_op & MO_EXCL);                                           \
>> +    qemu_ldex_helpers[((int)mem_op - MO_EXCL)];                         \
>> +})
>> +
>>  /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
>>   *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
>>   */
>> @@ -1082,6 +1093,19 @@ static void * const qemu_st_helpers[16] = {
>>      [MO_BEQ]  = helper_be_stq_mmu,
>>  };
>>
>> +/* StoreConditional helpers. Use the macro below to access them. */
>> +static void * const qemu_stex_helpers[16] = {
>> +    [MO_LEUL] = helper_le_stcondl_mmu,
>> +};
>> +
>> +#define STEX_HELPER(mem_op)                                             \
>> +({                                                                      \
>> +    assert(mem_op & MO_EXCL);                                           \
>> +    qemu_stex_helpers[(int)mem_op - MO_EXCL];                           \
>> +})
>
> Can the lookup not be merged with the existing ldst_helpers?

Sure.

>
>> +
>> +
>> +
>>  /* Helper routines for marshalling helper function arguments into
>>   * the correct registers and stack.
>>   * argreg is where we want to put this argument, arg is the argument itself.
>> @@ -1222,13 +1246,14 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
>>     path for a load or store, so that we can later generate the correct
>>     helper code.  */
>>  static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
>> -                                TCGReg datalo, TCGReg datahi, TCGReg addrlo,
>> -                                TCGReg addrhi, tcg_insn_unit *raddr,
>> -                                tcg_insn_unit *label_ptr)
>> +                                TCGReg llsc_success, TCGReg datalo,
>> +                                TCGReg datahi, TCGReg addrlo, TCGReg addrhi,
>> +                                tcg_insn_unit *raddr, tcg_insn_unit *label_ptr)
>>  {
>>      TCGLabelQemuLdst *label = new_ldst_label(s);
>>
>>      label->is_ld = is_ld;
>> +    label->llsc_success = llsc_success;
>>      label->oi = oi;
>>      label->datalo_reg = datalo;
>>      label->datahi_reg = datahi;
>> @@ -1259,12 +1284,16 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
>>      /* For armv6 we can use the canonical unsigned helpers and minimize
>>         icache usage.  For pre-armv6, use the signed helpers since we do
>>         not have a single insn sign-extend.  */
>> -    if (use_armv6_instructions) {
>> -        func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)];
>> +    if (opc & MO_EXCL) {
>> +        func = LDEX_HELPER(opc);
>>      } else {
>> -        func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SSIZE)];
>> -        if (opc & MO_SIGN) {
>> -            opc = MO_UL;
>> +        if (use_armv6_instructions) {
>> +            func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)];
>> +        } else {
>> +            func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SSIZE)];
>> +            if (opc & MO_SIGN) {
>> +                opc = MO_UL;
>> +            }
>>          }
>>      }
>>      tcg_out_call(s, func);
>> @@ -1336,8 +1365,15 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
>>      argreg = tcg_out_arg_imm32(s, argreg, oi);
>>      argreg = tcg_out_arg_reg32(s, argreg, TCG_REG_R14);
>>
>> -    /* Tail-call to the helper, which will return to the fast path.  */
>> -    tcg_out_goto(s, COND_AL, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
>> +    if (opc & MO_EXCL) {
>> +        tcg_out_call(s, STEX_HELPER(opc));
>> +        /* Save the output of the StoreConditional */
>> +        tcg_out_mov_reg(s, COND_AL, lb->llsc_success, TCG_REG_R0);
>> +        tcg_out_goto(s, COND_AL, lb->raddr);
>> +    } else {
>> +        /* Tail-call to the helper, which will return to the fast path.  */
>> +        tcg_out_goto(s, COND_AL, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
>> +    }
>>  }
>>  #endif /* SOFTMMU */
>>
>> @@ -1461,7 +1497,8 @@ static inline void tcg_out_qemu_ld_direct(TCGContext *s, TCGMemOp opc,
>>      }
>>  }
>>
>> -static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
>> +static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64,
>> +                            bool isLoadLink)
>>  {
>>      TCGReg addrlo, datalo, datahi, addrhi __attribute__((unused));
>>      TCGMemOpIdx oi;
>> @@ -1484,13 +1521,20 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
>>      addend = tcg_out_tlb_read(s, addrlo, addrhi, opc & MO_SIZE, mem_index, 1);
>>
>>      /* This a conditional BL only to load a pointer within this opcode into LR
>> -       for the slow path.  We will not be using the value for a tail call.  */
>> -    label_ptr = s->code_ptr;
>> -    tcg_out_bl_noaddr(s, COND_NE);
>> +       for the slow path.  We will not be using the value for a tail call.
>> +       In the context of a LoadLink instruction, we don't check the TLB but we
>> +       always follow the slow path.  */
>> +    if (isLoadLink) {
>> +        label_ptr = s->code_ptr;
>> +        tcg_out_bl_noaddr(s, COND_AL);
>> +    } else {
>> +        label_ptr = s->code_ptr;
>> +        tcg_out_bl_noaddr(s, COND_NE);
>
> This seems a little redundant, we could set label_ptr outside the
> isLoadLink check as it will be the same in both cases. However if we are
> always taking the slow path for exclusive accesses then why should we
> generate a tcg_out_tlb_read()? Do we need some side effects from calling
> it for the slow path?

Good point, we can avoid the TLB read if we are doing an SC/LL.
In fact, the first thing we do in softmmu_llsc_template.h is to update
the corresponding TLB entry if necessary.

Thank you,
alvise

>
>>
>> -    tcg_out_qemu_ld_index(s, opc, datalo, datahi, addrlo, addend);
>> +        tcg_out_qemu_ld_index(s, opc, datalo, datahi, addrlo, addend);
>> +    }
>>
>> -    add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
>> +    add_qemu_ldst_label(s, true, oi, 0, datalo, datahi, addrlo, addrhi,
>>                          s->code_ptr, label_ptr);
>>  #else /* !CONFIG_SOFTMMU */
>>      if (GUEST_BASE) {
>> @@ -1592,9 +1636,11 @@ static inline void tcg_out_qemu_st_direct(TCGContext *s, TCGMemOp opc,
>>      }
>>  }
>>
>> -static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
>> +static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64,
>> +                            bool isStoreCond)
>>  {
>>      TCGReg addrlo, datalo, datahi, addrhi __attribute__((unused));
>> +    TCGReg llsc_success;
>>      TCGMemOpIdx oi;
>>      TCGMemOp opc;
>>  #ifdef CONFIG_SOFTMMU
>> @@ -1603,6 +1649,9 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
>>      tcg_insn_unit *label_ptr;
>>  #endif
>>
>> +    /* The stcond variant has one more param */
>> +    llsc_success = (isStoreCond ? *args++ : 0);
>> +
>>      datalo = *args++;
>>      datahi = (is64 ? *args++ : 0);
>>      addrlo = *args++;
>> @@ -1612,16 +1661,24 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
>>
>>  #ifdef CONFIG_SOFTMMU
>>      mem_index = get_mmuidx(oi);
>> -    addend = tcg_out_tlb_read(s, addrlo, addrhi, opc & MO_SIZE, mem_index, 0);
>>
>> -    tcg_out_qemu_st_index(s, COND_EQ, opc, datalo, datahi, addrlo, addend);
>> +    if (isStoreCond) {
>> +        /* Always follow the slow-path for an exclusive access */
>> +        label_ptr = s->code_ptr;
>> +        tcg_out_bl_noaddr(s, COND_AL);
>> +    } else {
>> +        addend = tcg_out_tlb_read(s, addrlo, addrhi, opc & MO_SIZE,
>> +                                  mem_index, 0);
>>
>> -    /* The conditional call must come last, as we're going to return here.  */
>> -    label_ptr = s->code_ptr;
>> -    tcg_out_bl_noaddr(s, COND_NE);
>> +        tcg_out_qemu_st_index(s, COND_EQ, opc, datalo, datahi, addrlo, addend);
>>
>> -    add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
>> -                        s->code_ptr, label_ptr);
>> +        /* The conditional call must come last, as we're going to return here.*/
>> +        label_ptr = s->code_ptr;
>> +        tcg_out_bl_noaddr(s, COND_NE);
>> +    }
>> +
>> +    add_qemu_ldst_label(s, false, oi, llsc_success, datalo, datahi, addrlo,
>> +                        addrhi, s->code_ptr, label_ptr);
>
> Indeed this does what I commented about above ;-)
>
>>  #else /* !CONFIG_SOFTMMU */
>>      if (GUEST_BASE) {
>>          tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP, GUEST_BASE);
>> @@ -1864,16 +1921,22 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>>          break;
>>
>>      case INDEX_op_qemu_ld_i32:
>> -        tcg_out_qemu_ld(s, args, 0);
>> +        tcg_out_qemu_ld(s, args, 0, 0);
>> +        break;
>> +    case INDEX_op_qemu_ldlink_i32:
>> +        tcg_out_qemu_ld(s, args, 0, 1); /* LoadLink */
>>          break;
>>      case INDEX_op_qemu_ld_i64:
>> -        tcg_out_qemu_ld(s, args, 1);
>> +        tcg_out_qemu_ld(s, args, 1, 0);
>>          break;
>>      case INDEX_op_qemu_st_i32:
>> -        tcg_out_qemu_st(s, args, 0);
>> +        tcg_out_qemu_st(s, args, 0, 0);
>> +        break;
>> +    case INDEX_op_qemu_stcond_i32:
>> +        tcg_out_qemu_st(s, args, 0, 1); /* StoreConditional */
>>          break;
>>      case INDEX_op_qemu_st_i64:
>> -        tcg_out_qemu_st(s, args, 1);
>> +        tcg_out_qemu_st(s, args, 1, 0);
>>          break;
>>
>>      case INDEX_op_bswap16_i32:
>> @@ -1957,8 +2020,10 @@ static const TCGTargetOpDef arm_op_defs[] = {
>>
>>  #if TARGET_LONG_BITS == 32
>>      { INDEX_op_qemu_ld_i32, { "r", "l" } },
>> +    { INDEX_op_qemu_ldlink_i32, { "r", "l" } },
>>      { INDEX_op_qemu_ld_i64, { "r", "r", "l" } },
>>      { INDEX_op_qemu_st_i32, { "s", "s" } },
>> +    { INDEX_op_qemu_stcond_i32, { "r", "s", "s" } },
>>      { INDEX_op_qemu_st_i64, { "s", "s", "s" } },
>>  #else
>>      { INDEX_op_qemu_ld_i32, { "r", "l", "l" } },
>
> --
> Alex Bennée
diff mbox

Patch

diff --git a/target-arm/translate.c b/target-arm/translate.c
index 80302cd..0366c76 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -72,6 +72,8 @@  static TCGv_i64 cpu_exclusive_test;
 static TCGv_i32 cpu_exclusive_info;
 #endif
 
+static TCGv_i32 cpu_ll_sc_context;
+
 /* FIXME:  These should be removed.  */
 static TCGv_i32 cpu_F0s, cpu_F1s;
 static TCGv_i64 cpu_F0d, cpu_F1d;
@@ -103,6 +105,8 @@  void arm_translate_init(void)
         offsetof(CPUARMState, exclusive_addr), "exclusive_addr");
     cpu_exclusive_val = tcg_global_mem_new_i64(TCG_AREG0,
         offsetof(CPUARMState, exclusive_val), "exclusive_val");
+    cpu_ll_sc_context = tcg_global_mem_new_i32(TCG_AREG0,
+        offsetof(CPUARMState, ll_sc_context), "ll_sc_context");
 #ifdef CONFIG_USER_ONLY
     cpu_exclusive_test = tcg_global_mem_new_i64(TCG_AREG0,
         offsetof(CPUARMState, exclusive_test), "exclusive_test");
@@ -961,6 +965,18 @@  DO_GEN_ST(8, MO_UB)
 DO_GEN_ST(16, MO_TEUW)
 DO_GEN_ST(32, MO_TEUL)
 
+/* Load/Store exclusive generators (always unsigned) */
+static inline void gen_aa32_ldex32(TCGv_i32 val, TCGv_i32 addr, int index)
+{
+    /* 32-bit exclusive load: guest-endian 32-bit access (MO_TEUL) with
+       the MO_EXCL flag so the backend emits the LoadLink slow path. */
+    tcg_gen_qemu_ldlink_i32(val, addr, index, MO_TEUL | MO_EXCL);
+}
+
+/* 32-bit exclusive store.  is_dirty receives the StoreConditional
+   outcome: 1 means the store failed (see the brcond on is_dirty in
+   gen_store_exclusive_multi), 0 means it succeeded. */
+static inline void gen_aa32_stex32(TCGv_i32 is_dirty, TCGv_i32 val,
+                                   TCGv_i32 addr, int index)
+{
+    tcg_gen_qemu_stcond_i32(is_dirty, val, addr, index, MO_TEUL | MO_EXCL);
+}
+
 static inline void gen_set_pc_im(DisasContext *s, target_ulong val)
 {
     tcg_gen_movi_i32(cpu_R[15], val);
@@ -7427,6 +7443,26 @@  static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
     store_reg(s, rt, tmp);
 }
 
+/* Translate LDREX using the new qemu_ldlink TCG op.
+   rt is the destination register; rt2 is the second register of the
+   64-bit pair variant and is unused here because only size == 2
+   (32-bit LDREX) is implemented -- every other size aborts at
+   translation time.  The signature mirrors gen_load_exclusive so the
+   two call sites in disas_arm_insn stay parallel. */
+static void gen_load_exclusive_multi(DisasContext *s, int rt, int rt2,
+                                     TCGv_i32 addr, int size)
+{
+    TCGv_i32 tmp = tcg_temp_new_i32();
+
+    switch (size) {
+    case 0:
+    case 1:
+        abort();    /* ldrexb/ldrexh not implemented yet */
+    case 2:
+        gen_aa32_ldex32(tmp, addr, get_mem_index(s));
+        break;
+    case 3:
+    default:
+        abort();    /* ldrexd still goes through gen_load_exclusive */
+    }
+
+    store_reg(s, rt, tmp);    /* store_reg takes ownership of tmp */
+}
+
 static void gen_clrex(DisasContext *s)
 {
     gen_helper_atomic_clear(cpu_env);
@@ -7460,6 +7496,52 @@  static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
     tcg_temp_free_i64(val);
     tcg_temp_free_i32(tmp_size);
 }
+
+/* Translate STREX using the new qemu_stcond TCG op.
+   rd receives 0 on success and 1 on failure, per the ARM strex
+   contract.  rt is the source register; rt2 is unused (only
+   size == 2, i.e. 32-bit STREX, is implemented -- other sizes abort
+   at translation time).  The store fails immediately when we are not
+   in LL/SC context (no prior ldrex set cpu_ll_sc_context). */
+static void gen_store_exclusive_multi(DisasContext *s, int rd, int rt, int rt2,
+                                      TCGv_i32 addr, int size)
+{
+    TCGv_i32 tmp;
+    TCGv_i32 is_dirty;
+    TCGLabel *done_label;
+    TCGLabel *fail_label;
+
+    fail_label = gen_new_label();
+    done_label = gen_new_label();
+
+    is_dirty = tcg_temp_new_i32();
+
+    /* Fail if we are not in LL/SC context. */
+    tcg_gen_brcondi_i32(TCG_COND_NE, cpu_ll_sc_context, 1, fail_label);
+
+    /* load_reg() returns a freshly allocated temporary; allocating tmp
+       beforehand (and freeing it twice afterwards) would leak and then
+       double-free a TCG temp. */
+    tmp = load_reg(s, rt);
+    switch (size) {
+    case 0:
+    case 1:
+        abort();    /* strexb/strexh not implemented yet */
+    case 2:
+        gen_aa32_stex32(is_dirty, tmp, addr, get_mem_index(s));
+        break;
+    case 3:
+    default:
+        abort();    /* strexd still goes through gen_store_exclusive */
+    }
+
+    tcg_temp_free_i32(tmp);
+
+    /* Check if the store conditional has to fail. */
+    tcg_gen_brcondi_i32(TCG_COND_EQ, is_dirty, 1, fail_label);
+    tcg_temp_free_i32(is_dirty);
+
+    tcg_gen_movi_i32(cpu_R[rd], 0); /* is_dirty = 0 */
+    tcg_gen_br(done_label);
+    gen_set_label(fail_label);
+    tcg_gen_movi_i32(cpu_R[rd], 1); /* is_dirty = 1 */
+    gen_set_label(done_label);
+}
 #endif
 
 /* gen_srs:
@@ -8308,7 +8390,7 @@  static void disas_arm_insn(DisasContext *s, unsigned int insn)
                         } else if (insn & (1 << 20)) {
                             switch (op1) {
                             case 0: /* ldrex */
-                                gen_load_exclusive(s, rd, 15, addr, 2);
+                                gen_load_exclusive_multi(s, rd, 15, addr, 2);
                                 break;
                             case 1: /* ldrexd */
                                 gen_load_exclusive(s, rd, rd + 1, addr, 3);
@@ -8326,7 +8408,8 @@  static void disas_arm_insn(DisasContext *s, unsigned int insn)
                             rm = insn & 0xf;
                             switch (op1) {
                             case 0:  /*  strex */
-                                gen_store_exclusive(s, rd, rm, 15, addr, 2);
+                                gen_store_exclusive_multi(s, rd, rm, 15,
+                                                          addr, 2);
                                 break;
                             case 1: /*  strexd */
                                 gen_store_exclusive(s, rd, rm, rm + 1, addr, 3);
diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index ae2ec7a..f2b69a0 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -1069,6 +1069,17 @@  static void * const qemu_ld_helpers[16] = {
     [MO_BESL] = helper_be_ldul_mmu,
 };
 
+/* LoadLink helpers, only unsigned. Use the macro below to access them. */
+/* Indexed by (memop - MO_EXCL); only the 32-bit little-endian unsigned
+   slot is populated, matching the 32-bit-only scope of this patch.  All
+   other slots are NULL, so an unsupported memop yields a NULL helper. */
+static void * const qemu_ldex_helpers[16] = {
+    [MO_LEUL] = helper_le_ldlinkul_mmu,
+};
+
+/* Return the LoadLink slow-path helper for a memop carrying MO_EXCL.
+   Uses a GCC/Clang statement expression so the assert runs inline.
+   NOTE(review): the index strips only MO_EXCL by subtraction; this
+   assumes no other flag bits (e.g. MO_SIGN) can be set here -- confirm,
+   or mask with (MO_BSWAP | MO_SIZE) as the qemu_ld_helpers lookups do. */
+#define LDEX_HELPER(mem_op)                                             \
+({                                                                      \
+    assert(mem_op & MO_EXCL);                                           \
+    qemu_ldex_helpers[((int)mem_op - MO_EXCL)];                         \
+})
+
 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
  *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
  */
@@ -1082,6 +1093,19 @@  static void * const qemu_st_helpers[16] = {
     [MO_BEQ]  = helper_be_stq_mmu,
 };
 
+/* StoreConditional helpers. Use the macro below to access them. */
+/* Indexed by (memop - MO_EXCL); only the 32-bit little-endian slot is
+   populated, matching the 32-bit-only scope of this patch.  All other
+   slots are NULL, so an unsupported memop yields a NULL helper. */
+static void * const qemu_stex_helpers[16] = {
+    [MO_LEUL] = helper_le_stcondl_mmu,
+};
+
+/* Return the StoreConditional slow-path helper for a memop carrying
+   MO_EXCL (statement expression, as LDEX_HELPER above).
+   NOTE(review): same indexing assumption as LDEX_HELPER -- subtraction
+   strips only MO_EXCL; verify no other flag bits can be set, or mask
+   with (MO_BSWAP | MO_SIZE) instead. */
+#define STEX_HELPER(mem_op)                                             \
+({                                                                      \
+    assert(mem_op & MO_EXCL);                                           \
+    qemu_stex_helpers[(int)mem_op - MO_EXCL];                           \
+})
+
+
+
 /* Helper routines for marshalling helper function arguments into
  * the correct registers and stack.
  * argreg is where we want to put this argument, arg is the argument itself.
@@ -1222,13 +1246,14 @@  static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
    path for a load or store, so that we can later generate the correct
    helper code.  */
 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
-                                TCGReg datalo, TCGReg datahi, TCGReg addrlo,
-                                TCGReg addrhi, tcg_insn_unit *raddr,
-                                tcg_insn_unit *label_ptr)
+                                TCGReg llsc_success, TCGReg datalo,
+                                TCGReg datahi, TCGReg addrlo, TCGReg addrhi,
+                                tcg_insn_unit *raddr, tcg_insn_unit *label_ptr)
 {
     TCGLabelQemuLdst *label = new_ldst_label(s);
 
     label->is_ld = is_ld;
+    label->llsc_success = llsc_success;
     label->oi = oi;
     label->datalo_reg = datalo;
     label->datahi_reg = datahi;
@@ -1259,12 +1284,16 @@  static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
     /* For armv6 we can use the canonical unsigned helpers and minimize
        icache usage.  For pre-armv6, use the signed helpers since we do
        not have a single insn sign-extend.  */
-    if (use_armv6_instructions) {
-        func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)];
+    if (opc & MO_EXCL) {
+        func = LDEX_HELPER(opc);
     } else {
-        func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SSIZE)];
-        if (opc & MO_SIGN) {
-            opc = MO_UL;
+        if (use_armv6_instructions) {
+            func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)];
+        } else {
+            func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SSIZE)];
+            if (opc & MO_SIGN) {
+                opc = MO_UL;
+            }
         }
     }
     tcg_out_call(s, func);
@@ -1336,8 +1365,15 @@  static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
     argreg = tcg_out_arg_imm32(s, argreg, oi);
     argreg = tcg_out_arg_reg32(s, argreg, TCG_REG_R14);
 
-    /* Tail-call to the helper, which will return to the fast path.  */
-    tcg_out_goto(s, COND_AL, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
+    if (opc & MO_EXCL) {
+        tcg_out_call(s, STEX_HELPER(opc));
+        /* Save the output of the StoreConditional */
+        tcg_out_mov_reg(s, COND_AL, lb->llsc_success, TCG_REG_R0);
+        tcg_out_goto(s, COND_AL, lb->raddr);
+    } else {
+        /* Tail-call to the helper, which will return to the fast path.  */
+        tcg_out_goto(s, COND_AL, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
+    }
 }
 #endif /* SOFTMMU */
 
@@ -1461,7 +1497,8 @@  static inline void tcg_out_qemu_ld_direct(TCGContext *s, TCGMemOp opc,
     }
 }
 
-static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
+static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64,
+                            bool isLoadLink)
 {
     TCGReg addrlo, datalo, datahi, addrhi __attribute__((unused));
     TCGMemOpIdx oi;
@@ -1484,13 +1521,20 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
     addend = tcg_out_tlb_read(s, addrlo, addrhi, opc & MO_SIZE, mem_index, 1);
 
     /* This a conditional BL only to load a pointer within this opcode into LR
-       for the slow path.  We will not be using the value for a tail call.  */
-    label_ptr = s->code_ptr;
-    tcg_out_bl_noaddr(s, COND_NE);
+       for the slow path.  We will not be using the value for a tail call.
+       In the context of a LoadLink instruction, we don't check the TLB but we
+       always follow the slow path.  */
+    if (isLoadLink) {
+        label_ptr = s->code_ptr;
+        tcg_out_bl_noaddr(s, COND_AL);
+    } else {
+        label_ptr = s->code_ptr;
+        tcg_out_bl_noaddr(s, COND_NE);
 
-    tcg_out_qemu_ld_index(s, opc, datalo, datahi, addrlo, addend);
+        tcg_out_qemu_ld_index(s, opc, datalo, datahi, addrlo, addend);
+    }
 
-    add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
+    add_qemu_ldst_label(s, true, oi, 0, datalo, datahi, addrlo, addrhi,
                         s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
     if (GUEST_BASE) {
@@ -1592,9 +1636,11 @@  static inline void tcg_out_qemu_st_direct(TCGContext *s, TCGMemOp opc,
     }
 }
 
-static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
+static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64,
+                            bool isStoreCond)
 {
     TCGReg addrlo, datalo, datahi, addrhi __attribute__((unused));
+    TCGReg llsc_success;
     TCGMemOpIdx oi;
     TCGMemOp opc;
 #ifdef CONFIG_SOFTMMU
@@ -1603,6 +1649,9 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
     tcg_insn_unit *label_ptr;
 #endif
 
+    /* The stcond variant has one more param */
+    llsc_success = (isStoreCond ? *args++ : 0);
+
     datalo = *args++;
     datahi = (is64 ? *args++ : 0);
     addrlo = *args++;
@@ -1612,16 +1661,24 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
 
 #ifdef CONFIG_SOFTMMU
     mem_index = get_mmuidx(oi);
-    addend = tcg_out_tlb_read(s, addrlo, addrhi, opc & MO_SIZE, mem_index, 0);
 
-    tcg_out_qemu_st_index(s, COND_EQ, opc, datalo, datahi, addrlo, addend);
+    if (isStoreCond) {
+        /* Always follow the slow-path for an exclusive access */
+        label_ptr = s->code_ptr;
+        tcg_out_bl_noaddr(s, COND_AL);
+    } else {
+        addend = tcg_out_tlb_read(s, addrlo, addrhi, opc & MO_SIZE,
+                                  mem_index, 0);
 
-    /* The conditional call must come last, as we're going to return here.  */
-    label_ptr = s->code_ptr;
-    tcg_out_bl_noaddr(s, COND_NE);
+        tcg_out_qemu_st_index(s, COND_EQ, opc, datalo, datahi, addrlo, addend);
 
-    add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
-                        s->code_ptr, label_ptr);
+        /* The conditional call must come last, as we're going to return here.*/
+        label_ptr = s->code_ptr;
+        tcg_out_bl_noaddr(s, COND_NE);
+    }
+
+    add_qemu_ldst_label(s, false, oi, llsc_success, datalo, datahi, addrlo,
+                        addrhi, s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
     if (GUEST_BASE) {
         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP, GUEST_BASE);
@@ -1864,16 +1921,22 @@  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
 
     case INDEX_op_qemu_ld_i32:
-        tcg_out_qemu_ld(s, args, 0);
+        tcg_out_qemu_ld(s, args, 0, 0);
+        break;
+    case INDEX_op_qemu_ldlink_i32:
+        tcg_out_qemu_ld(s, args, 0, 1); /* LoadLink */
         break;
     case INDEX_op_qemu_ld_i64:
-        tcg_out_qemu_ld(s, args, 1);
+        tcg_out_qemu_ld(s, args, 1, 0);
         break;
     case INDEX_op_qemu_st_i32:
-        tcg_out_qemu_st(s, args, 0);
+        tcg_out_qemu_st(s, args, 0, 0);
+        break;
+    case INDEX_op_qemu_stcond_i32:
+        tcg_out_qemu_st(s, args, 0, 1); /* StoreConditional */
         break;
     case INDEX_op_qemu_st_i64:
-        tcg_out_qemu_st(s, args, 1);
+        tcg_out_qemu_st(s, args, 1, 0);
         break;
 
     case INDEX_op_bswap16_i32:
@@ -1957,8 +2020,10 @@  static const TCGTargetOpDef arm_op_defs[] = {
 
 #if TARGET_LONG_BITS == 32
     { INDEX_op_qemu_ld_i32, { "r", "l" } },
+    { INDEX_op_qemu_ldlink_i32, { "r", "l" } },
     { INDEX_op_qemu_ld_i64, { "r", "r", "l" } },
     { INDEX_op_qemu_st_i32, { "s", "s" } },
+    { INDEX_op_qemu_stcond_i32, { "r", "s", "s" } },
     { INDEX_op_qemu_st_i64, { "s", "s", "s" } },
 #else
     { INDEX_op_qemu_ld_i32, { "r", "l", "l" } },