[RFC,v3,12/13] softmmu_llsc_template.h: move to multithreading

Message ID 1436516626-8322-13-git-send-email-a.rigo@virtualopensystems.com
State New

Commit Message

Alvise Rigo July 10, 2015, 8:23 a.m. UTC
Update the TCG LL/SC instructions to work in a multi-threaded environment.

The basic idea remains untouched, but the whole mechanism is improved to
make use of the callback support to queue TLB flush requests and of the
rendezvous callback to synchronize all the currently running vCPUs.

In essence, if a vCPU wants to perform an LL on a page that is not already
set as EXCL, it arranges a rendezvous with all the vCPUs that are executing
a TB and requests a TLB flush for *all* the vCPUs.
Doing so, we make sure that:
- the running vCPUs do not touch the EXCL page while the requesting vCPU
  is marking the page as EXCL
- all the vCPUs will have the EXCL flag set in the TLB entry for that
  specific page *before* entering the next TB

Suggested-by: Jani Kokkonen <jani.kokkonen@huawei.com>
Suggested-by: Claudio Fontana <claudio.fontana@huawei.com>
Signed-off-by: Alvise Rigo <a.rigo@virtualopensystems.com>
---
 cputlb.c                |  2 +
 include/exec/cpu-defs.h |  4 ++
 softmmu_llsc_template.h | 97 ++++++++++++++++++++++++++++++++-----------------
 3 files changed, 69 insertions(+), 34 deletions(-)
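
For readers outside the QEMU tree, the rendezvous pattern the commit message
describes boils down to the following self-contained sketch (plain pthreads,
compile with `cc -pthread`; all names are invented for the example and none of
this is QEMU code): a requester asks every currently running worker to reach a
safe point and waits for all of them before touching the shared EXCL state.

/* Minimal illustration of the rendezvous described above: a requester asks
 * every currently running worker to reach a safe point (here, the end of its
 * current loop iteration, standing in for the end of a TB) and waits for all
 * of them before touching the shared EXCL state. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

#define NR_WORKERS 4

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t all_checked_in = PTHREAD_COND_INITIALIZER;
static int rendezvous_gen;   /* bumped by the requester for each rendezvous */
static int attendees;        /* workers that still have to check in */
static atomic_bool stop_workers;

/* Called by a worker between two "TBs": if a rendezvous it has not yet
 * acknowledged is pending, check in and possibly wake the requester. */
static void worker_safe_point(int *acked_gen)
{
    pthread_mutex_lock(&lock);
    if (attendees > 0 && *acked_gen != rendezvous_gen) {
        *acked_gen = rendezvous_gen;
        if (--attendees == 0) {
            pthread_cond_signal(&all_checked_in);
        }
    }
    pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
    int acked_gen = 0;
    (void)arg;
    while (!atomic_load(&stop_workers)) {
        usleep(1000);        /* stand-in for executing one TB */
        worker_safe_point(&acked_gen);
    }
    return NULL;
}

/* Called by the thread that wants to set a page as EXCL: wait until every
 * worker has passed a safe point, after which the EXCL dirty bit could be
 * cleared and the protected address set up without races. */
static void requester_rendezvous(void)
{
    pthread_mutex_lock(&lock);
    rendezvous_gen++;
    attendees = NR_WORKERS;  /* in QEMU: only the vCPUs currently in a TB */
    while (attendees > 0) {
        pthread_cond_wait(&all_checked_in, &lock);
    }
    pthread_mutex_unlock(&lock);
}

int main(void)
{
    pthread_t th[NR_WORKERS];

    for (int i = 0; i < NR_WORKERS; i++) {
        pthread_create(&th[i], NULL, worker, NULL);
    }
    requester_rendezvous();
    printf("rendezvous complete, EXCL setup can proceed\n");
    atomic_store(&stop_workers, true);
    for (int i = 0; i < NR_WORKERS; i++) {
        pthread_join(th[i], NULL);
    }
    return 0;
}

In the patch itself the same roles are played by cpu_exit_init_rendezvous(),
cpu_exit_rendezvous_add_attendee() and cpu_exit_rendezvous_wait_others(),
with the TLB flush queued separately via cpu_exit_callback_add().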

Comments

Alex Bennée July 17, 2015, 3:27 p.m. UTC | #1
Alvise Rigo <a.rigo@virtualopensystems.com> writes:

> Update the TCG LL/SC instructions to work in a multi-threaded environment.
>
> The basic idea remains untouched, but the whole mechanism is improved to
> make use of the callback support to queue TLB flush requests and of the
> rendezvous callback to synchronize all the currently running vCPUs.
>
> In essence, if a vCPU wants to perform an LL on a page that is not already
> set as EXCL, it arranges a rendezvous with all the vCPUs that are executing
> a TB and requests a TLB flush for *all* the vCPUs.
> Doing so, we make sure that:
> - the running vCPUs do not touch the EXCL page while the requesting vCPU
>   is marking the page as EXCL
> - all the vCPUs will have the EXCL flag set in the TLB entry for that
>   specific page *before* entering the next TB
>
> Suggested-by: Jani Kokkonen <jani.kokkonen@huawei.com>
> Suggested-by: Claudio Fontana <claudio.fontana@huawei.com>
> Signed-off-by: Alvise Rigo <a.rigo@virtualopensystems.com>
> ---
>  cputlb.c                |  2 +
>  include/exec/cpu-defs.h |  4 ++
>  softmmu_llsc_template.h | 97 ++++++++++++++++++++++++++++++++-----------------
>  3 files changed, 69 insertions(+), 34 deletions(-)
>
> diff --git a/cputlb.c b/cputlb.c
> index 66df41a..0566e0f 100644
> --- a/cputlb.c
> +++ b/cputlb.c
> @@ -30,6 +30,8 @@
>  #include "exec/ram_addr.h"
>  #include "tcg/tcg.h"
>  
> +#include "sysemu/cpus.h"
> +
>  void qemu_mutex_lock_iothread(void);
>  void qemu_mutex_unlock_iothread(void);
>  
> diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
> index c73a75f..40742b3 100644
> --- a/include/exec/cpu-defs.h
> +++ b/include/exec/cpu-defs.h
> @@ -169,5 +169,9 @@ typedef struct CPUIOTLBEntry {
>      /* Used for atomic instruction translation. */                      \
>      bool ll_sc_context;                                                 \
>      hwaddr excl_protected_hwaddr;                                       \
> +    /* Used to carry the stcond result and also as a flag to mark a   \
> +     * normal store access made by a stcond. */                         \
> +    int excl_succeeded;                                                 \
> +
>  
>  #endif
> diff --git a/softmmu_llsc_template.h b/softmmu_llsc_template.h
> index 81e9d8e..4105e72 100644
> --- a/softmmu_llsc_template.h
> +++ b/softmmu_llsc_template.h
> @@ -54,7 +54,21 @@
>                   (TARGET_PAGE_MASK | TLB_INVALID_MASK));                     \
>  })                                                                           \
>  
> -#define EXCLUSIVE_RESET_ADDR ULLONG_MAX
> +#define is_read_tlb_entry_set(env, page, index)                              \
> +({                                                                           \
> +    (addr & TARGET_PAGE_MASK)                                                \
> +         == ((env->tlb_table[mmu_idx][index].addr_read) &                    \
> +                 (TARGET_PAGE_MASK | TLB_INVALID_MASK));                     \
> +})                                                                           \
> +
> +/* Whenever an SC operation fails, we add a small delay to reduce the
> + * concurrency among the atomic instruction emulation code. Without this delay,
> + * in very congested situations where plain stores make all the pending LLs
> + * fail, the code could reach a stalling situation in which all the SCs happen
> + * to fail.
> + * TODO: make the delay dynamic according to the SC failure rate.
> + */
> +#define TCG_ATOMIC_INSN_EMUL_DELAY 100

I'd be tempted to split out this sort of chicanery into a separate patch. 

>  
>  WORD_TYPE helper_le_ldlink_name(CPUArchState *env, target_ulong addr,
>                                  TCGMemOpIdx oi, uintptr_t retaddr)
> @@ -65,35 +79,58 @@ WORD_TYPE helper_le_ldlink_name(CPUArchState *env, target_ulong addr,
>      hwaddr hw_addr;
>      unsigned mmu_idx = get_mmuidx(oi);
>  
> -    /* Use the proper load helper from cpu_ldst.h */
> -    ret = helper_ld_legacy(env, addr, mmu_idx, retaddr);
> -
> -    /* The last legacy access ensures that the TLB and IOTLB entry for 'addr'
> -     * have been created. */
>      index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
> +    if (!is_read_tlb_entry_set(env, addr, index)) {
> +        tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
> +    }
>  
>      /* hw_addr = hwaddr of the page (i.e. section->mr->ram_addr + xlat)
>       * plus the offset (i.e. addr & ~TARGET_PAGE_MASK) */
>      hw_addr = (env->iotlb[mmu_idx][index].addr & TARGET_PAGE_MASK) + addr;
>  
>      /* Set the exclusive-protected hwaddr. */
> -    env->excl_protected_hwaddr = hw_addr;
> -    env->ll_sc_context = true;
> +    qemu_mutex_lock(&tcg_excl_access_lock);
> +    if (cpu_physical_memory_excl_is_dirty(hw_addr) && !exit_flush_request) {
> +        exit_flush_request = 1;
>  
> -    /* No need to mask hw_addr with TARGET_PAGE_MASK since
> -     * cpu_physical_memory_excl_is_dirty() will take care of that. */
> -    if (cpu_physical_memory_excl_is_dirty(hw_addr)) {
> -        cpu_physical_memory_clear_excl_dirty(hw_addr);
> +        qemu_mutex_unlock(&tcg_excl_access_lock);
> +
> +        cpu_exit_init_rendezvous();
>  
> -        /* Invalidate the TLB entry for the other processors. The next TLB
> -         * entries for this page will have the TLB_EXCL flag set. */
>          CPU_FOREACH(cpu) {
> -            if (cpu != current_cpu) {
> -                tlb_flush(cpu, 1);
> +            if (cpu->thread_id != qemu_get_thread_id()) {
> +                if (!cpu->pending_tlb_flush) {
> +                    /* Flush the TLB cache before executing the next TB. */
> +                    cpu->pending_tlb_flush = 1;
> +                    cpu_exit_callback_add(cpu, cpu_exit_tlb_flush_all_cb, NULL);
> +                }
> +                if (cpu->tcg_executing) {
> +                    /* We want to wait for all the vCPUs that are running
> +                     * at this exact moment.
> +                     * Add a callback to be executed as soon as the vCPU exits
> +                     * from the current TB. Force it to exit. */
> +                    cpu_exit_rendezvous_add_attendee(cpu);
> +                    qemu_cpu_kick_thread(cpu);
> +                }
>              }
>          }
> +
> +        cpu_exit_rendezvous_wait_others(ENV_GET_CPU(env));
> +
> +        exit_flush_request = 0;
> +
> +        qemu_mutex_lock(&tcg_excl_access_lock);
> +        cpu_physical_memory_clear_excl_dirty(hw_addr);
>      }
>  
> +    env->ll_sc_context = true;
> +
> +    /* Use the proper load helper from cpu_ldst.h */
> +    env->excl_protected_hwaddr = hw_addr;
> +    ret = helper_ld_legacy(env, addr, mmu_idx, retaddr);
> +
> +    qemu_mutex_unlock(&tcg_excl_access_lock);
> +
>      /* For this vCPU, just update the TLB entry, no need to flush. */
>      env->tlb_table[mmu_idx][index].addr_write |= TLB_EXCL;
>  
> @@ -106,7 +143,6 @@ WORD_TYPE helper_le_stcond_name(CPUArchState *env, target_ulong addr,
>  {
>      WORD_TYPE ret;
>      int index;
> -    hwaddr hw_addr;
>      unsigned mmu_idx = get_mmuidx(oi);
>  
>      /* If the TLB entry is not the right one, create it. */
> @@ -115,29 +151,22 @@ WORD_TYPE helper_le_stcond_name(CPUArchState *env, target_ulong addr,
>          tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
>      }
>  
> -    /* hw_addr = hwaddr of the page (i.e. section->mr->ram_addr + xlat)
> -     * plus the offset (i.e. addr & ~TARGET_PAGE_MASK) */
> -    hw_addr = (env->iotlb[mmu_idx][index].addr & TARGET_PAGE_MASK) + addr;
> -
> -    if (!env->ll_sc_context) {
> -        /* No LoakLink has been set, the StoreCond has to fail. */
> -        return 1;
> -    }
> -
>      env->ll_sc_context = 0;
>  
> -    if (cpu_physical_memory_excl_is_dirty(hw_addr)) {
> -        /* Another vCPU has accessed the memory after the LoadLink. */
> -        ret = 1;
> -    } else {
> -        helper_st_legacy(env, addr, val, mmu_idx, retaddr);
> +    /* Set this to true in advance to mark the following legacy access as
> +     * one made by the store conditional wrapper. If the store conditional
> +     * does not succeed, the value will be set to 0. */
> +    env->excl_succeeded = 1;
> +    helper_st_legacy(env, addr, val, mmu_idx, retaddr);
>  
> -        /* The StoreConditional succeeded */
> +    if (env->excl_succeeded) {
> +        env->excl_succeeded = 0;
>          ret = 0;
> +    } else {
> +        g_usleep(TCG_ATOMIC_INSN_EMUL_DELAY);
> +        ret = 1;
>      }
>  
> -    env->tlb_table[mmu_idx][index].addr_write &= ~TLB_EXCL;
> -    env->excl_protected_hwaddr = EXCLUSIVE_RESET_ADDR;
>      /* It's likely that the page will be used again for exclusive accesses,
>       * for this reason we don't flush any TLB cache at the price of some
>       * additional slow paths and we don't set the page bit as dirty.
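
For context on the excl_succeeded handshake used in the stcond hunk above: the
wrapper sets the flag, lets the ordinary store slow path run, and treats the
flag still being set as "the SC succeeded". The slow-path side is modified by
a separate patch of this series and is not shown here, so the failure check in
the following stripped-down model is an invented stand-in; all names are toy
names, not QEMU APIs.

/* Stripped-down model of the excl_succeeded handshake.  ToyCPUState and the
 * failure condition inside the store slow path are invented for the example. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
    uint64_t excl_protected_hwaddr;  /* address protected by the last LL */
    int      excl_succeeded;         /* set by the SC wrapper, cleared by the
                                      * store slow path if the SC must fail */
} ToyCPUState;

static bool page_excl_is_dirty(uint64_t hwaddr)
{
    /* Stand-in for the EXCL dirty bitmap: pretend another vCPU has written
     * to the page iff bit 0 of the address is set. */
    return hwaddr & 1;
}

/* Plain store slow path: when it runs on behalf of an SC (excl_succeeded
 * already set), it vetoes the store by clearing the flag if the exclusive
 * protection has been lost in the meantime. */
static void store_slow_path(ToyCPUState *env, uint64_t hwaddr, uint32_t val)
{
    (void)val;
    if (env->excl_succeeded &&
        (page_excl_is_dirty(hwaddr) || hwaddr != env->excl_protected_hwaddr)) {
        env->excl_succeeded = 0;     /* tell the wrapper the SC failed */
        return;                      /* and skip the store */
    }
    /* ... perform the actual store here ... */
}

/* StoreConditional wrapper: set the flag, do the store, read back the verdict,
 * mirroring the structure of the stcond helper quoted above. */
static uint32_t toy_stcond(ToyCPUState *env, uint64_t hwaddr, uint32_t val)
{
    env->excl_succeeded = 1;
    store_slow_path(env, hwaddr, val);
    if (env->excl_succeeded) {
        env->excl_succeeded = 0;
        return 0;                    /* SC succeeded */
    }
    return 1;                        /* SC failed */
}

int main(void)
{
    ToyCPUState env = { .excl_protected_hwaddr = 0x1000 };

    printf("SC to the protected page: %u\n", toy_stcond(&env, 0x1000, 42)); /* 0 */
    printf("SC to a dirtied page:     %u\n", toy_stcond(&env, 0x1001, 42)); /* 1 */
    return 0;
}
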
Alvise Rigo July 17, 2015, 3:31 p.m. UTC | #2
On Fri, Jul 17, 2015 at 5:27 PM, Alex Bennée <alex.bennee@linaro.org> wrote:
>
> Alvise Rigo <a.rigo@virtualopensystems.com> writes:
>
>> [...]
>>
>> +/* Whenever an SC operation fails, we add a small delay to reduce the
>> + * concurrency among the atomic instruction emulation code. Without this delay,
>> + * in very congested situations where plain stores make all the pending LLs
>> + * fail, the code could reach a stalling situation in which all the SCs happen
>> + * to fail.
>> + * TODO: make the delay dynamic according to the SC failure rate.
>> + */
>> +#define TCG_ATOMIC_INSN_EMUL_DELAY 100
>
> I'd be tempted to split out this sort of chicanery into a separate patch.

OK, I think it's a good idea since it's not strictly required.

Regards,
alvise
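
As an aside on the TODO in the quoted comment ("make the delay dynamic
according to the SC failure rate"): the current value is the argument passed
to g_usleep(), i.e. a fixed delay in microseconds. One possible shape for a
dynamic version, shown purely for illustration with invented names and
constants and not taken from this series, is a per-vCPU exponential backoff:

#include <glib.h>
#include <stdbool.h>

#define SC_DELAY_MIN_US   10
#define SC_DELAY_MAX_US 1000

typedef struct {
    gulong sc_delay_us;  /* current backoff for this vCPU, starts at 0 */
} SCBackoff;

/* To be called with the outcome of each store conditional: back off (and
 * grow the delay) after a failure, reset the delay after a success. */
static void sc_backoff_on_result(SCBackoff *b, bool sc_succeeded)
{
    if (sc_succeeded) {
        b->sc_delay_us = SC_DELAY_MIN_US;   /* contention resolved */
        return;
    }
    g_usleep(b->sc_delay_us ? b->sc_delay_us : SC_DELAY_MIN_US);
    b->sc_delay_us = MIN(b->sc_delay_us * 2 + SC_DELAY_MIN_US, SC_DELAY_MAX_US);
}

The failing branch of the stcond helper would then call something like
sc_backoff_on_result(&backoff, false) in place of the fixed
g_usleep(TCG_ATOMIC_INSN_EMUL_DELAY).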


Patch

diff --git a/cputlb.c b/cputlb.c
index 66df41a..0566e0f 100644
--- a/cputlb.c
+++ b/cputlb.c
@@ -30,6 +30,8 @@ 
 #include "exec/ram_addr.h"
 #include "tcg/tcg.h"
 
+#include "sysemu/cpus.h"
+
 void qemu_mutex_lock_iothread(void);
 void qemu_mutex_unlock_iothread(void);
 
diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index c73a75f..40742b3 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -169,5 +169,9 @@  typedef struct CPUIOTLBEntry {
     /* Used for atomic instruction translation. */                      \
     bool ll_sc_context;                                                 \
     hwaddr excl_protected_hwaddr;                                       \
+    /* Used to carry the stcond result and also as a flag to mark a   \
+     * normal store access made by a stcond. */                         \
+    int excl_succeeded;                                                 \
+
 
 #endif
diff --git a/softmmu_llsc_template.h b/softmmu_llsc_template.h
index 81e9d8e..4105e72 100644
--- a/softmmu_llsc_template.h
+++ b/softmmu_llsc_template.h
@@ -54,7 +54,21 @@ 
                  (TARGET_PAGE_MASK | TLB_INVALID_MASK));                     \
 })                                                                           \
 
-#define EXCLUSIVE_RESET_ADDR ULLONG_MAX
+#define is_read_tlb_entry_set(env, page, index)                              \
+({                                                                           \
+    (addr & TARGET_PAGE_MASK)                                                \
+         == ((env->tlb_table[mmu_idx][index].addr_read) &                    \
+                 (TARGET_PAGE_MASK | TLB_INVALID_MASK));                     \
+})                                                                           \
+
+/* Whenever an SC operation fails, we add a small delay to reduce the
+ * concurrency among the atomic instruction emulation code. Without this delay,
+ * in very congested situations where plain stores make all the pending LLs
+ * fail, the code could reach a stalling situation in which all the SCs happen
+ * to fail.
+ * TODO: make the delay dynamic according to the SC failure rate.
+ */
+#define TCG_ATOMIC_INSN_EMUL_DELAY 100
 
 WORD_TYPE helper_le_ldlink_name(CPUArchState *env, target_ulong addr,
                                 TCGMemOpIdx oi, uintptr_t retaddr)
@@ -65,35 +79,58 @@  WORD_TYPE helper_le_ldlink_name(CPUArchState *env, target_ulong addr,
     hwaddr hw_addr;
     unsigned mmu_idx = get_mmuidx(oi);
 
-    /* Use the proper load helper from cpu_ldst.h */
-    ret = helper_ld_legacy(env, addr, mmu_idx, retaddr);
-
-    /* The last legacy access ensures that the TLB and IOTLB entry for 'addr'
-     * have been created. */
     index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+    if (!is_read_tlb_entry_set(env, addr, index)) {
+        tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
+    }
 
     /* hw_addr = hwaddr of the page (i.e. section->mr->ram_addr + xlat)
      * plus the offset (i.e. addr & ~TARGET_PAGE_MASK) */
     hw_addr = (env->iotlb[mmu_idx][index].addr & TARGET_PAGE_MASK) + addr;
 
     /* Set the exclusive-protected hwaddr. */
-    env->excl_protected_hwaddr = hw_addr;
-    env->ll_sc_context = true;
+    qemu_mutex_lock(&tcg_excl_access_lock);
+    if (cpu_physical_memory_excl_is_dirty(hw_addr) && !exit_flush_request) {
+        exit_flush_request = 1;
 
-    /* No need to mask hw_addr with TARGET_PAGE_MASK since
-     * cpu_physical_memory_excl_is_dirty() will take care of that. */
-    if (cpu_physical_memory_excl_is_dirty(hw_addr)) {
-        cpu_physical_memory_clear_excl_dirty(hw_addr);
+        qemu_mutex_unlock(&tcg_excl_access_lock);
+
+        cpu_exit_init_rendezvous();
 
-        /* Invalidate the TLB entry for the other processors. The next TLB
-         * entries for this page will have the TLB_EXCL flag set. */
         CPU_FOREACH(cpu) {
-            if (cpu != current_cpu) {
-                tlb_flush(cpu, 1);
+            if (cpu->thread_id != qemu_get_thread_id()) {
+                if (!cpu->pending_tlb_flush) {
+                    /* Flush the TLB cache before executing the next TB. */
+                    cpu->pending_tlb_flush = 1;
+                    cpu_exit_callback_add(cpu, cpu_exit_tlb_flush_all_cb, NULL);
+                }
+                if (cpu->tcg_executing) {
+                    /* We want to wait for all the vCPUs that are running
+                     * at this exact moment.
+                     * Add a callback to be executed as soon as the vCPU exits
+                     * from the current TB. Force it to exit. */
+                    cpu_exit_rendezvous_add_attendee(cpu);
+                    qemu_cpu_kick_thread(cpu);
+                }
             }
         }
+
+        cpu_exit_rendezvous_wait_others(ENV_GET_CPU(env));
+
+        exit_flush_request = 0;
+
+        qemu_mutex_lock(&tcg_excl_access_lock);
+        cpu_physical_memory_clear_excl_dirty(hw_addr);
     }
 
+    env->ll_sc_context = true;
+
+    /* Use the proper load helper from cpu_ldst.h */
+    env->excl_protected_hwaddr = hw_addr;
+    ret = helper_ld_legacy(env, addr, mmu_idx, retaddr);
+
+    qemu_mutex_unlock(&tcg_excl_access_lock);
+
     /* For this vCPU, just update the TLB entry, no need to flush. */
     env->tlb_table[mmu_idx][index].addr_write |= TLB_EXCL;
 
@@ -106,7 +143,6 @@  WORD_TYPE helper_le_stcond_name(CPUArchState *env, target_ulong addr,
 {
     WORD_TYPE ret;
     int index;
-    hwaddr hw_addr;
     unsigned mmu_idx = get_mmuidx(oi);
 
     /* If the TLB entry is not the right one, create it. */
@@ -115,29 +151,22 @@  WORD_TYPE helper_le_stcond_name(CPUArchState *env, target_ulong addr,
         tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
     }
 
-    /* hw_addr = hwaddr of the page (i.e. section->mr->ram_addr + xlat)
-     * plus the offset (i.e. addr & ~TARGET_PAGE_MASK) */
-    hw_addr = (env->iotlb[mmu_idx][index].addr & TARGET_PAGE_MASK) + addr;
-
-    if (!env->ll_sc_context) {
-        /* No LoakLink has been set, the StoreCond has to fail. */
-        return 1;
-    }
-
     env->ll_sc_context = 0;
 
-    if (cpu_physical_memory_excl_is_dirty(hw_addr)) {
-        /* Another vCPU has accessed the memory after the LoadLink. */
-        ret = 1;
-    } else {
-        helper_st_legacy(env, addr, val, mmu_idx, retaddr);
+    /* Set this to true in advance to mark the following legacy access as
+     * one made by the store conditional wrapper. If the store conditional
+     * does not succeed, the value will be set to 0. */
+    env->excl_succeeded = 1;
+    helper_st_legacy(env, addr, val, mmu_idx, retaddr);
 
-        /* The StoreConditional succeeded */
+    if (env->excl_succeeded) {
+        env->excl_succeeded = 0;
         ret = 0;
+    } else {
+        g_usleep(TCG_ATOMIC_INSN_EMUL_DELAY);
+        ret = 1;
     }
 
-    env->tlb_table[mmu_idx][index].addr_write &= ~TLB_EXCL;
-    env->excl_protected_hwaddr = EXCLUSIVE_RESET_ADDR;
     /* It's likely that the page will be used again for exclusive accesses,
      * for this reason we don't flush any TLB cache at the price of some
      * additional slow paths and we don't set the page bit as dirty.
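
Finally, the is_read_tlb_entry_set() macro added at the top of this patch
reuses the standard softmmu hit test: the low bits of the virtual page number
select a TLB slot, and the entry matches when its recorded page address (with
the invalid flag folded into the mask) equals the page part of the access
address. A toy, self-contained version of that test, with made-up page-size,
TLB-size and flag constants rather than the real QEMU definitions:

/* Toy model of the softmmu TLB hit test: the low bits of the virtual page
 * number pick a TLB slot, and the entry matches when its recorded page
 * address (plus any flag bits in the mask) equals the page part of the
 * access address.  Constants are made up for the example. */
#include <stdint.h>
#include <stdio.h>

#define TARGET_PAGE_BITS 12
#define TARGET_PAGE_MASK (~((uint64_t)(1 << TARGET_PAGE_BITS) - 1))
#define CPU_TLB_SIZE     256
#define TLB_INVALID_MASK (1 << 3)       /* a low bit reserved for flags */

typedef struct {
    uint64_t addr_read;                 /* page address, or flag bits set */
} ToyTLBEntry;

static ToyTLBEntry tlb[CPU_TLB_SIZE];

static int tlb_hit_read(uint64_t addr)
{
    int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
    return (addr & TARGET_PAGE_MASK)
        == (tlb[index].addr_read & (TARGET_PAGE_MASK | TLB_INVALID_MASK));
}

int main(void)
{
    uint64_t addr = 0x0000123456789abcULL;
    int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);

    tlb[index].addr_read = addr & TARGET_PAGE_MASK;    /* filled entry */
    printf("hit:  %d\n", tlb_hit_read(addr));          /* prints 1 */

    tlb[index].addr_read |= TLB_INVALID_MASK;          /* invalidated */
    printf("miss: %d\n", tlb_hit_read(addr));          /* prints 0 */
    return 0;
}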