[RFC,v5,2/6] softmmu: Add new TLB_EXCL flag

Message ID 1443083566-10994-3-git-send-email-a.rigo@virtualopensystems.com
State New

Commit Message

Alvise Rigo Sept. 24, 2015, 8:32 a.m. UTC
Add a new TLB flag to force all accesses made to a page to follow the
slow path.

When we remove a TLB entry marked as EXCL, we unset the corresponding
exclusive bit in the bitmap.
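
The flag occupies one of the low bits of the TLB entry address word (see
the cpu-all.h hunk below):

    /* Set if TLB entry references a page that requires exclusive access.  */
    #define TLB_EXCL        (1 << 6)

Any TLB_ bit set in addr_write makes the TCG fast-path address comparison
fail, so stores to such a page always reach the slow-path helpers in
softmmu_template.h.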

Suggested-by: Jani Kokkonen <jani.kokkonen@huawei.com>
Suggested-by: Claudio Fontana <claudio.fontana@huawei.com>
Signed-off-by: Alvise Rigo <a.rigo@virtualopensystems.com>
---
 cputlb.c                |  40 ++++++++++++++++-
 include/exec/cpu-all.h  |   8 ++++
 include/exec/cpu-defs.h |  12 ++++++
 softmmu_template.h      | 112 ++++++++++++++++++++++++++++++++++++++----------
 4 files changed, 149 insertions(+), 23 deletions(-)

Comments

Richard Henderson Sept. 30, 2015, 3:34 a.m. UTC | #1
On 09/24/2015 06:32 PM, Alvise Rigo wrote:
> +    if (unlikely(!(te->addr_write & TLB_MMIO) && (te->addr_write & TLB_EXCL))) {
> +        /* We are removing an exclusive entry, set the page to dirty. This
> +         * is not be necessary if the vCPU has performed both SC and LL. */
> +        hwaddr hw_addr = (env->iotlb[mmu_idx][index].addr & TARGET_PAGE_MASK) +
> +                                          (te->addr_write & TARGET_PAGE_MASK);
> +        cpu_physical_memory_set_excl_dirty(hw_addr, cpu->cpu_index);
> +    }

Um... this seems dangerous.

(1) I don't see why EXCL support should differ whether MMIO is set or not. 
Either we support exclusive accesses on memory-mapped io like we do on ram (in 
which case this is wrong) or we don't (in which case this is unnecessary).

(2) Doesn't this prevent a target from accessing a page during a ll/sc sequence 
that aliases within our trivial hash?  Such a case on arm might be

	mov	r3, #0x100000
	ldrex	r0, [r2]
	ldr	r1, [r2, r3]
	add	r0, r0, r1
	strex	r0, [r2]
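
The aliasing comes from the direct-mapped softmmu TLB; a rough sketch of
the index computation (assuming the usual 256-entry TLB):

	/* sketch only -- this is how softmmu_template.h picks the slot */
	index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);

	/* 0x100000 is a multiple of CPU_TLB_SIZE * TARGET_PAGE_SIZE, so
	 * [r2] and [r2 + 0x100000] select the same TLB slot and the ldr
	 * evicts the entry that the ldrex installed. */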

AFAIK, Alpha is the only target we have which specifies that any normal memory 
access during a ll+sc sequence may fail the sc.

(3) I'm finding the "clean/dirty" words less helpful than they could be, 
especially since "clean" implies "some cpu has an excl lock on the page",
which is reverse of what seems natural but understandable given the 
implementation.  Perhaps we could rename these helpers?

> @@ -376,6 +392,28 @@ tb_page_addr_t get_page_addr_code(CPUArchState *env1, target_ulong addr)
>       return qemu_ram_addr_from_host_nofail(p);
>   }
>
> +/* Atomic insn translation TLB support. */
> +#define EXCLUSIVE_RESET_ADDR ULLONG_MAX
> +/* For every vCPU compare the exclusive address and reset it in case of a
> + * match. Since only one vCPU is running at once, no lock has to be held to
> + * guard this operation. */
> +static inline void lookup_and_reset_cpus_ll_addr(hwaddr addr, hwaddr size)
> +{
> +    CPUState *cpu;
> +    CPUArchState *acpu;
> +
> +    CPU_FOREACH(cpu) {
> +        acpu = (CPUArchState *)cpu->env_ptr;
> +
> +        if (acpu->excl_protected_range.begin != EXCLUSIVE_RESET_ADDR &&
> +            ranges_overlap(acpu->excl_protected_range.begin,
> +            acpu->excl_protected_range.end - acpu->excl_protected_range.begin,
> +            addr, size)) {

Watch the indentation here... it ought to line up with the previous argument on 
the line above, just after the (.  This may require you split the subtract 
across the line too but that's ok.
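
i.e. something along these lines (formatting sketch only):

        if (acpu->excl_protected_range.begin != EXCLUSIVE_RESET_ADDR &&
            ranges_overlap(acpu->excl_protected_range.begin,
                           acpu->excl_protected_range.end -
                           acpu->excl_protected_range.begin,
                           addr, size)) {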


>   void dump_exec_info(FILE *f, fprintf_function cpu_fprintf);
>   void dump_opcount_info(FILE *f, fprintf_function cpu_fprintf);
> diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
> index 98b9cff..a67f295 100644
> --- a/include/exec/cpu-defs.h
> +++ b/include/exec/cpu-defs.h
> @@ -27,6 +27,7 @@
>   #include <inttypes.h>
>   #include "qemu/osdep.h"
>   #include "qemu/queue.h"
> +#include "qemu/range.h"
>   #include "tcg-target.h"
>   #ifndef CONFIG_USER_ONLY
>   #include "exec/hwaddr.h"
> @@ -150,5 +151,16 @@ typedef struct CPUIOTLBEntry {
>   #define CPU_COMMON                                                      \
>       /* soft mmu support */                                              \
>       CPU_COMMON_TLB                                                      \
> +                                                                        \
> +    /* Used by the atomic insn translation backend. */                  \
> +    int ll_sc_context;                                                  \
> +    /* vCPU current exclusive addresses range.
> +     * The address is set to EXCLUSIVE_RESET_ADDR if the vCPU is not.
> +     * in the middle of a LL/SC. */                                     \
> +    struct Range excl_protected_range;                                  \
> +    /* Used to carry the SC result but also to flag a normal (legacy)
> +     * store access made by a stcond (see softmmu_template.h). */       \
> +    int excl_succeeded;                                                 \


This seems to be required by softmmu_template.h?  In which case this must be in 
CPU_COMMON_TLB.

Please expand on this "legacy store access" comment here.  I don't understand.
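
For the first point, a minimal sketch of what moving the fields would look
like (reusing the patch's field names, the existing TLB fields elided):

    #define CPU_COMMON_TLB                                                  \
        /* ... existing soft-mmu TLB fields ... */                          \
        /* Used by the atomic insn translation backend. */                  \
        int ll_sc_context;                                                  \
        struct Range excl_protected_range;                                  \
        int excl_succeeded;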

> +
>
>   #endif
> diff --git a/softmmu_template.h b/softmmu_template.h
> index d42d89d..e4431e8 100644
> --- a/softmmu_template.h
> +++ b/softmmu_template.h
> @@ -409,19 +409,53 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>           tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
>       }
>
> -    /* Handle an IO access.  */
> +    /* Handle an IO access or exclusive access.  */
>       if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
> -        CPUIOTLBEntry *iotlbentry;
> -        if ((addr & (DATA_SIZE - 1)) != 0) {
> -            goto do_unaligned_access;
> +        CPUIOTLBEntry *iotlbentry = &env->iotlb[mmu_idx][index];
> +
> +        if ((tlb_addr & ~TARGET_PAGE_MASK) == TLB_EXCL) {
> +            /* The slow-path has been forced since we are writing to
> +             * exclusive-protected memory. */
> +            hwaddr hw_addr = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
> +
> +            /* The function lookup_and_reset_cpus_ll_addr could have reset the
> +             * exclusive address. Fail the SC in this case.
> +             * N.B.: Here excl_succeeded == 0 means that helper_le_st_name has
> +             * not been called by a softmmu_llsc_template.h. */
> +            if(env->excl_succeeded) {
> +                if (env->excl_protected_range.begin != hw_addr) {
> +                    /* The vCPU is SC-ing to an unprotected address. */
> +                    env->excl_protected_range.begin = EXCLUSIVE_RESET_ADDR;
> +                    env->excl_succeeded = 0;
> +
> +                    return;

Huh?  How does a normal store ever fail.  Surely you aren't using the normal 
store path to support true store_conditional?

> +                }
> +
> +                cpu_physical_memory_set_excl_dirty(hw_addr, ENV_GET_CPU(env)->cpu_index);
> +            }
> +
> +            haddr = addr + env->tlb_table[mmu_idx][index].addend;
> +        #if DATA_SIZE == 1
> +            glue(glue(st, SUFFIX), _p)((uint8_t *)haddr, val);
> +        #else
> +            glue(glue(st, SUFFIX), _le_p)((uint8_t *)haddr, val);
> +        #endif

You're assuming that TLB_EXCL can never have any other TLB_ bits set.  We 
definitely have to support TLB_EXCL+TLB_NOTDIRTY, and probably +TLB_MMIO too 
(iirc that's how watchpoints are implemented).
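
A sketch (untested) of the direction that implies -- test the TLB_EXCL bit
rather than comparing the whole low-bits field:

        if (tlb_addr & TLB_EXCL) {
            /* Exclusive-protected page: do the LL/SC bookkeeping ... */
            if (tlb_addr & TLB_NOTDIRTY) {
                /* ... plus the usual notdirty handling ... */
            }
            if (tlb_addr & TLB_MMIO) {
                /* ... and forward the access through io_write instead of
                 * doing a direct host store. */
            }
        } else {
            /* plain IO access, as before */
        }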


r~
Alvise Rigo Sept. 30, 2015, 9:24 a.m. UTC | #2
On Wed, Sep 30, 2015 at 5:34 AM, Richard Henderson <rth@twiddle.net> wrote:
> On 09/24/2015 06:32 PM, Alvise Rigo wrote:
>>
>> +    if (unlikely(!(te->addr_write & TLB_MMIO) && (te->addr_write &
>> TLB_EXCL))) {
>> +        /* We are removing an exclusive entry, set the page to dirty.
>> This
>> +         * is not be necessary if the vCPU has performed both SC and LL.
>> */
>> +        hwaddr hw_addr = (env->iotlb[mmu_idx][index].addr &
>> TARGET_PAGE_MASK) +
>> +                                          (te->addr_write &
>> TARGET_PAGE_MASK);
>> +        cpu_physical_memory_set_excl_dirty(hw_addr, cpu->cpu_index);
>> +    }
>
>
> Um... this seems dangerous.
>
> (1) I don't see why EXCL support should differ whether MMIO is set or not.
> Either we support exclusive accesses on memory-mapped io like we do on ram
> (in which case this is wrong) or we don't (in which case this is
> unnecessary).

I was not sure whether or not we had to support also MMIO memory.
In theory there shouldn't be any issues for including also
memory-mapped io, I will consider this for the next version.

>
> (2) Doesn't this prevent a target from accessing a page during a ll/sc
> sequence that aliases within our trivial hash?  Such a case on arm might be
>
>         mov     r3, #0x100000
>         ldrex   r0, [r2]
>         ldr     r1, [r2, r3]
>         add     r0, r0, r1
>         strex   r0, [r2]
>

I'm not sure I got it. When the CPU issues the ldrex, the page will be
set as "clean" (meaning that all the CPUs will then follow the
slow-path for that page) and the exclusive range - [r2, r2+4] in this
case - is stored in the CPU state.
The forced slow-path is used to check whether a normal store hits the
exclusive range of any CPU; normal loads are not affected.
I don't see any problem in the code above, what am I missing?

> AFAIK, Alpha is the only target we have which specifies that any normal
> memory access during a ll+sc sequence may fail the sc.

I will dig into it because I remember that the Alpha architecture
behaves like ARM in the handling of LDxL/STxC instructions.

>
> (3) I'm finding the "clean/dirty" words less helpful than they could be,
> especially since "clean" implies "some cpu has an excl lock on the page",
> which is reverse of what seems natural but understandable given the
> implementation.  Perhaps we could rename these helpers?
>
>> @@ -376,6 +392,28 @@ tb_page_addr_t get_page_addr_code(CPUArchState *env1,
>> target_ulong addr)
>>       return qemu_ram_addr_from_host_nofail(p);
>>   }
>>
>> +/* Atomic insn translation TLB support. */
>> +#define EXCLUSIVE_RESET_ADDR ULLONG_MAX
>> +/* For every vCPU compare the exclusive address and reset it in case of a
>> + * match. Since only one vCPU is running at once, no lock has to be held
>> to
>> + * guard this operation. */
>> +static inline void lookup_and_reset_cpus_ll_addr(hwaddr addr, hwaddr
>> size)
>> +{
>> +    CPUState *cpu;
>> +    CPUArchState *acpu;
>> +
>> +    CPU_FOREACH(cpu) {
>> +        acpu = (CPUArchState *)cpu->env_ptr;
>> +
>> +        if (acpu->excl_protected_range.begin != EXCLUSIVE_RESET_ADDR &&
>> +            ranges_overlap(acpu->excl_protected_range.begin,
>> +            acpu->excl_protected_range.end -
>> acpu->excl_protected_range.begin,
>> +            addr, size)) {
>
>
> Watch the indentation here... it ought to line up with the previous argument
> on the line above, just after the (.  This may require you split the
> subtract across the line too but that's ok.

OK, I will fix it.

>
>
>
>>   void dump_exec_info(FILE *f, fprintf_function cpu_fprintf);
>>   void dump_opcount_info(FILE *f, fprintf_function cpu_fprintf);
>> diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
>> index 98b9cff..a67f295 100644
>> --- a/include/exec/cpu-defs.h
>> +++ b/include/exec/cpu-defs.h
>> @@ -27,6 +27,7 @@
>>   #include <inttypes.h>
>>   #include "qemu/osdep.h"
>>   #include "qemu/queue.h"
>> +#include "qemu/range.h"
>>   #include "tcg-target.h"
>>   #ifndef CONFIG_USER_ONLY
>>   #include "exec/hwaddr.h"
>> @@ -150,5 +151,16 @@ typedef struct CPUIOTLBEntry {
>>   #define CPU_COMMON
>> \
>>       /* soft mmu support */
>> \
>>       CPU_COMMON_TLB
>> \
>> +                                                                        \
>> +    /* Used by the atomic insn translation backend. */                  \
>> +    int ll_sc_context;                                                  \
>> +    /* vCPU current exclusive addresses range.
>> +     * The address is set to EXCLUSIVE_RESET_ADDR if the vCPU is not.
>> +     * in the middle of a LL/SC. */                                     \
>> +    struct Range excl_protected_range;                                  \
>> +    /* Used to carry the SC result but also to flag a normal (legacy)
>> +     * store access made by a stcond (see softmmu_template.h). */       \
>> +    int excl_succeeded;                                                 \
>
>
>
> This seems to be required by softmmu_template.h?  In which case this must be
> in CPU_COMMON_TLB.
>
> Please expand on this "legacy store access" comment here.  I don't
> understand.

ACK.

>
>
>> +
>>
>>   #endif
>> diff --git a/softmmu_template.h b/softmmu_template.h
>> index d42d89d..e4431e8 100644
>> --- a/softmmu_template.h
>> +++ b/softmmu_template.h
>> @@ -409,19 +409,53 @@ void helper_le_st_name(CPUArchState *env,
>> target_ulong addr, DATA_TYPE val,
>>           tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
>>       }
>>
>> -    /* Handle an IO access.  */
>> +    /* Handle an IO access or exclusive access.  */
>>       if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
>> -        CPUIOTLBEntry *iotlbentry;
>> -        if ((addr & (DATA_SIZE - 1)) != 0) {
>> -            goto do_unaligned_access;
>> +        CPUIOTLBEntry *iotlbentry = &env->iotlb[mmu_idx][index];
>> +
>> +        if ((tlb_addr & ~TARGET_PAGE_MASK) == TLB_EXCL) {
>> +            /* The slow-path has been forced since we are writing to
>> +             * exclusive-protected memory. */
>> +            hwaddr hw_addr = (iotlbentry->addr & TARGET_PAGE_MASK) +
>> addr;
>> +
>> +            /* The function lookup_and_reset_cpus_ll_addr could have
>> reset the
>> +             * exclusive address. Fail the SC in this case.
>> +             * N.B.: Here excl_succeeded == 0 means that
>> helper_le_st_name has
>> +             * not been called by a softmmu_llsc_template.h. */
>> +            if(env->excl_succeeded) {
>> +                if (env->excl_protected_range.begin != hw_addr) {
>> +                    /* The vCPU is SC-ing to an unprotected address. */
>> +                    env->excl_protected_range.begin =
>> EXCLUSIVE_RESET_ADDR;
>> +                    env->excl_succeeded = 0;
>> +
>> +                    return;
>
>
> Huh?  How does a normal store ever fail.  Surely you aren't using the normal
> store path to support true store_conditional?

As written in the comment, we are not doing a normal store here, but the
store for an SC operation.

>
>> +                }
>> +
>> +                cpu_physical_memory_set_excl_dirty(hw_addr,
>> ENV_GET_CPU(env)->cpu_index);
>> +            }
>> +
>> +            haddr = addr + env->tlb_table[mmu_idx][index].addend;
>> +        #if DATA_SIZE == 1
>> +            glue(glue(st, SUFFIX), _p)((uint8_t *)haddr, val);
>> +        #else
>> +            glue(glue(st, SUFFIX), _le_p)((uint8_t *)haddr, val);
>> +        #endif
>
>
> You're assuming that TLB_EXCL can never have any other TLB_ bits set.  We

Yes, I was assuming this...

> definitely have to support TLB_EXCL+TLB_NOTDIRTY, and probably +TLB_MMIO too
> (iirc that's how watchpoints are implemented).

...but indeed I was wrong; we also need to support these combinations.
I will address these points in the next version.

Thank you,
alvise

>
>
> r~
Peter Maydell Sept. 30, 2015, 11:09 a.m. UTC | #3
On 30 September 2015 at 10:24, alvise rigo
<a.rigo@virtualopensystems.com> wrote:
> On Wed, Sep 30, 2015 at 5:34 AM, Richard Henderson <rth@twiddle.net> wrote:
>> (1) I don't see why EXCL support should differ whether MMIO is set or not.
>> Either we support exclusive accesses on memory-mapped io like we do on ram
>> (in which case this is wrong) or we don't (in which case this is
>> unnecessary).
>
> I was not sure whether or not we had to support also MMIO memory.
> In theory there shouldn't be any issues for including also
> memory-mapped io, I will consider this for the next version.

Worth considering the interaction between exclusives and other
cases for which we force the slowpath, notably watchpoints.

>> AFAIK, Alpha is the only target we have which specifies that any normal
>> memory access during a ll+sc sequence may fail the sc.
>
> I will dig into it because I remember that the Alpha architecture
> behaves like ARM in the handling of LDxL/STxC instructions.

ARM semantics are that a non-exclusive store by this CPU between
a ldrex and a strex might result in loss of the (local) monitor,
but non-exclusive loads by this CPU won't. (It's an IMPDEF
choice.)

thanks
-- PMM
Alvise Rigo Sept. 30, 2015, 12:44 p.m. UTC | #4
On Wed, Sep 30, 2015 at 1:09 PM, Peter Maydell <peter.maydell@linaro.org> wrote:
> On 30 September 2015 at 10:24, alvise rigo
> <a.rigo@virtualopensystems.com> wrote:
>> On Wed, Sep 30, 2015 at 5:34 AM, Richard Henderson <rth@twiddle.net> wrote:
>>> (1) I don't see why EXCL support should differ whether MMIO is set or not.
>>> Either we support exclusive accesses on memory-mapped io like we do on ram
>>> (in which case this is wrong) or we don't (in which case this is
>>> unnecessary).
>>
>> I was not sure whether or not we had to support also MMIO memory.
>> In theory there shouldn't be any issues for including also
>> memory-mapped io, I will consider this for the next version.
>
> Worth considering the interaction between exclusives and other
> cases for which we force the slowpath, notably watchpoints.
>
>>> AFAIK, Alpha is the only target we have which specifies that any normal
>>> memory access during a ll+sc sequence may fail the sc.
>>
>> I will dig into it because I remember that the Alpha architecture
>> behaves like ARM in the handling of LDxL/STxC instructions.
>
> ARM semantics are that a non-exclusive store by this CPU between
> a ldrex and a strex might result in loss of the (local) monitor,
> but non-exclusive loads by this CPU won't. (It's an IMPDEF
> choice.)

Indeed, what is implemented by this patch series is one of the
permissible choices - very close to the one implemented by the current
TCG - that could match all the other architectures with similar
semantics (now I'm not sure about Alpha).
In this regard, I was wondering, should these semantics be somehow
target-* dependent?
Like having some per-architecture functions that, for each LoadLink,
set the size of the exclusive memory region to be protected and decide
whether a normal store/load will make one CPU's SC fail.
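
Just to make the idea concrete (the names below are invented purely for
illustration, this is not proposed code):

    typedef struct LLSCTargetOps {
        /* Size of the region protected by a LoadLink at @addr. */
        hwaddr (*excl_size)(CPUArchState *env, target_ulong addr);
        /* Does a normal store by the same vCPU break its monitor? */
        bool store_breaks_monitor;
        /* Does a normal load by the same vCPU break its monitor? */
        bool load_breaks_monitor;
    } LLSCTargetOps;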

Thank you,
alvise

>
> thanks
> -- PMM
Richard Henderson Sept. 30, 2015, 8:37 p.m. UTC | #5
On 09/30/2015 10:44 PM, alvise rigo wrote:
> On Wed, Sep 30, 2015 at 1:09 PM, Peter Maydell <peter.maydell@linaro.org> wrote:
>> On 30 September 2015 at 10:24, alvise rigo
>> <a.rigo@virtualopensystems.com> wrote:
>>> On Wed, Sep 30, 2015 at 5:34 AM, Richard Henderson <rth@twiddle.net> wrote:
>>>> (1) I don't see why EXCL support should differ whether MMIO is set or not.
>>>> Either we support exclusive accesses on memory-mapped io like we do on ram
>>>> (in which case this is wrong) or we don't (in which case this is
>>>> unnecessary).
>>>
>>> I was not sure whether or not we had to support also MMIO memory.
>>> In theory there shouldn't be any issues for including also
>>> memory-mapped io, I will consider this for the next version.
>>
>> Worth considering the interaction between exclusives and other
>> cases for which we force the slowpath, notably watchpoints.
>>
>>>> AFAIK, Alpha is the only target we have which specifies that any normal
>>>> memory access during a ll+sc sequence may fail the sc.
>>>
>>> I will dig into it because I remember that the Alpha architecture
>>> behaves like ARM in the handling of LDxL/STxC instructions.
>>
>> ARM semantics are that a non-exclusive store by this CPU between
>> a ldrex and a strex might result in loss of the (local) monitor,
>> but non-exclusive loads by this CPU won't. (It's an IMPDEF
>> choice.)
>
> Indeed, what is implemented by this patch series is one of the
> permissible choices - very close to the one implemented by the current
> TCG - that could match all the other architectures with similar
> semantics

Except for the case I pointed out -- where a ldr to a page that conflicts with
the ldrex in the tlb results in the replacement of the tlb entry, and thus
you'll see that the outgoing tlb entry has TLB_EXCL set and will set the dirty
bit (i.e. clear the lock bit) on the ldrex page.

I.e. this is a case that *must* pass on arm, but your implementation won't.
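
Spelling that out against the hunks quoted at the top (a sketch of the
flow, not code from the series):

    /* 1. ldrex r0, [r2]    -> the page of [r2] is marked exclusive-"clean"
     *                         and its TLB entry gets TLB_EXCL;
     * 2. ldr r1, [r2, r3]  -> with r3 = 0x100000 this maps to the same TLB
     *                         index, so tlb_set_page_with_attrs() replaces
     *                         the ldrex entry and, seeing TLB_EXCL on it,
     *                         calls cpu_physical_memory_set_excl_dirty();
     * 3. strex r0, [r2]    -> the protection was dropped by an unrelated
     *                         load, so a sequence that must succeed on ARM
     *                         can now misbehave. */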

> (now I'm not sure about Alpha).

Alpha is trivial, as always.  Those guys wrote the spec such that anything hard 
isn't guaranteed to work, but also isn't guaranteed to fail.

The rule is: no memory accesses of any kind, lest the lock flag be lost.  In 
practice, it's a test of the MESI state of the cacheline.  Which, I assume is 
what most other targets use.

> Like having some per-architecture functions that, for each LoadLink,
> set the size of the exclusive memory region to be protected

We probably do need that; I sort of assumed that would simply be part of the
implementation on the instruction side, not done as a side-effect of the
actual load.

> and decide whether a normal store/load will make one CPU's SC fail.

There's likely no need to do this.  And certainly I don't think we should do it 
right away.


r~

Patch

diff --git a/cputlb.c b/cputlb.c
index a506086..1e25a2a 100644
--- a/cputlb.c
+++ b/cputlb.c
@@ -299,6 +299,14 @@  void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
     env->tlb_v_table[mmu_idx][vidx] = *te;
     env->iotlb_v[mmu_idx][vidx] = env->iotlb[mmu_idx][index];
 
+    if (unlikely(!(te->addr_write & TLB_MMIO) && (te->addr_write & TLB_EXCL))) {
+        /* We are removing an exclusive entry, set the page to dirty. This
+         * is not be necessary if the vCPU has performed both SC and LL. */
+        hwaddr hw_addr = (env->iotlb[mmu_idx][index].addr & TARGET_PAGE_MASK) +
+                                          (te->addr_write & TARGET_PAGE_MASK);
+        cpu_physical_memory_set_excl_dirty(hw_addr, cpu->cpu_index);
+    }
+
     /* refill the tlb */
     env->iotlb[mmu_idx][index].addr = iotlb - vaddr;
     env->iotlb[mmu_idx][index].attrs = attrs;
@@ -324,7 +332,15 @@  void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
                                                    + xlat)) {
             te->addr_write = address | TLB_NOTDIRTY;
         } else {
-            te->addr_write = address;
+            if (!(address & TLB_MMIO) &&
+                cpu_physical_memory_excl_atleast_one_clean(section->mr->ram_addr
+                                                           + xlat)) {
+                /* There is at least one vCPU that has flagged the address as
+                 * exclusive. */
+                te->addr_write = address | TLB_EXCL;
+            } else {
+                te->addr_write = address;
+            }
         }
     } else {
         te->addr_write = -1;
@@ -376,6 +392,28 @@  tb_page_addr_t get_page_addr_code(CPUArchState *env1, target_ulong addr)
     return qemu_ram_addr_from_host_nofail(p);
 }
 
+/* Atomic insn translation TLB support. */
+#define EXCLUSIVE_RESET_ADDR ULLONG_MAX
+/* For every vCPU compare the exclusive address and reset it in case of a
+ * match. Since only one vCPU is running at once, no lock has to be held to
+ * guard this operation. */
+static inline void lookup_and_reset_cpus_ll_addr(hwaddr addr, hwaddr size)
+{
+    CPUState *cpu;
+    CPUArchState *acpu;
+
+    CPU_FOREACH(cpu) {
+        acpu = (CPUArchState *)cpu->env_ptr;
+
+        if (acpu->excl_protected_range.begin != EXCLUSIVE_RESET_ADDR &&
+            ranges_overlap(acpu->excl_protected_range.begin,
+            acpu->excl_protected_range.end - acpu->excl_protected_range.begin,
+            addr, size)) {
+            acpu->excl_protected_range.begin = EXCLUSIVE_RESET_ADDR;
+        }
+    }
+}
+
 #define MMUSUFFIX _mmu
 
 #define SHIFT 0
diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
index ea6a9a6..ad6afcb 100644
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@@ -320,6 +320,14 @@  extern RAMList ram_list;
 #define TLB_NOTDIRTY    (1 << 4)
 /* Set if TLB entry is an IO callback.  */
 #define TLB_MMIO        (1 << 5)
+/* Set if TLB entry references a page that requires exclusive access.  */
+#define TLB_EXCL        (1 << 6)
+
+/* Do not allow a TARGET_PAGE_MASK which covers one or more bits defined
+ * above. */
+#if TLB_EXCL >= TARGET_PAGE_SIZE
+#error TARGET_PAGE_MASK covering the low bits of the TLB virtual address
+#endif
 
 void dump_exec_info(FILE *f, fprintf_function cpu_fprintf);
 void dump_opcount_info(FILE *f, fprintf_function cpu_fprintf);
diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index 98b9cff..a67f295 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -27,6 +27,7 @@ 
 #include <inttypes.h>
 #include "qemu/osdep.h"
 #include "qemu/queue.h"
+#include "qemu/range.h"
 #include "tcg-target.h"
 #ifndef CONFIG_USER_ONLY
 #include "exec/hwaddr.h"
@@ -150,5 +151,16 @@  typedef struct CPUIOTLBEntry {
 #define CPU_COMMON                                                      \
     /* soft mmu support */                                              \
     CPU_COMMON_TLB                                                      \
+                                                                        \
+    /* Used by the atomic insn translation backend. */                  \
+    int ll_sc_context;                                                  \
+    /* vCPU current exclusive addresses range.
+     * The address is set to EXCLUSIVE_RESET_ADDR if the vCPU is not.
+     * in the middle of a LL/SC. */                                     \
+    struct Range excl_protected_range;                                  \
+    /* Used to carry the SC result but also to flag a normal (legacy)
+     * store access made by a stcond (see softmmu_template.h). */       \
+    int excl_succeeded;                                                 \
+
 
 #endif
diff --git a/softmmu_template.h b/softmmu_template.h
index d42d89d..e4431e8 100644
--- a/softmmu_template.h
+++ b/softmmu_template.h
@@ -409,19 +409,53 @@  void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
         tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
     }
 
-    /* Handle an IO access.  */
+    /* Handle an IO access or exclusive access.  */
     if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
-        CPUIOTLBEntry *iotlbentry;
-        if ((addr & (DATA_SIZE - 1)) != 0) {
-            goto do_unaligned_access;
+        CPUIOTLBEntry *iotlbentry = &env->iotlb[mmu_idx][index];
+
+        if ((tlb_addr & ~TARGET_PAGE_MASK) == TLB_EXCL) {
+            /* The slow-path has been forced since we are writing to
+             * exclusive-protected memory. */
+            hwaddr hw_addr = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
+
+            /* The function lookup_and_reset_cpus_ll_addr could have reset the
+             * exclusive address. Fail the SC in this case.
+             * N.B.: Here excl_succeeded == 0 means that helper_le_st_name has
+             * not been called by a softmmu_llsc_template.h. */
+            if(env->excl_succeeded) {
+                if (env->excl_protected_range.begin != hw_addr) {
+                    /* The vCPU is SC-ing to an unprotected address. */
+                    env->excl_protected_range.begin = EXCLUSIVE_RESET_ADDR;
+                    env->excl_succeeded = 0;
+
+                    return;
+                }
+
+                cpu_physical_memory_set_excl_dirty(hw_addr, ENV_GET_CPU(env)->cpu_index);
+            }
+
+            haddr = addr + env->tlb_table[mmu_idx][index].addend;
+        #if DATA_SIZE == 1
+            glue(glue(st, SUFFIX), _p)((uint8_t *)haddr, val);
+        #else
+            glue(glue(st, SUFFIX), _le_p)((uint8_t *)haddr, val);
+        #endif
+
+            lookup_and_reset_cpus_ll_addr(hw_addr, DATA_SIZE);
+
+            return;
+        } else {
+            if ((addr & (DATA_SIZE - 1)) != 0) {
+                goto do_unaligned_access;
+            }
+            iotlbentry = &env->iotlb[mmu_idx][index];
+
+            /* ??? Note that the io helpers always read data in the target
+               byte ordering.  We should push the LE/BE request down into io.  */
+            val = TGT_LE(val);
+            glue(io_write, SUFFIX)(env, iotlbentry, val, addr, retaddr);
+            return;
         }
-        iotlbentry = &env->iotlb[mmu_idx][index];
-
-        /* ??? Note that the io helpers always read data in the target
-           byte ordering.  We should push the LE/BE request down into io.  */
-        val = TGT_LE(val);
-        glue(io_write, SUFFIX)(env, iotlbentry, val, addr, retaddr);
-        return;
     }
 
     /* Handle slow unaligned access (it spans two pages or IO).  */
@@ -489,19 +523,53 @@  void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
         tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
     }
 
-    /* Handle an IO access.  */
+    /* Handle an IO access or exclusive access.  */
     if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
-        CPUIOTLBEntry *iotlbentry;
-        if ((addr & (DATA_SIZE - 1)) != 0) {
-            goto do_unaligned_access;
+        CPUIOTLBEntry *iotlbentry = &env->iotlb[mmu_idx][index];
+
+        if ((tlb_addr & ~TARGET_PAGE_MASK) == TLB_EXCL) {
+            /* The slow-path has been forced since we are writing to
+             * exclusive-protected memory. */
+            hwaddr hw_addr = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
+
+            /* The function lookup_and_reset_cpus_ll_addr could have reset the
+             * exclusive address. Fail the SC in this case.
+             * N.B.: Here excl_succeeded == 0 means that helper_le_st_name has
+             * not been called by a softmmu_llsc_template.h. */
+            if(env->excl_succeeded) {
+                if (env->excl_protected_range.begin != hw_addr) {
+                    /* The vCPU is SC-ing to an unprotected address. */
+                    env->excl_protected_range.begin = EXCLUSIVE_RESET_ADDR;
+                    env->excl_succeeded = 0;
+
+                    return;
+                }
+
+                cpu_physical_memory_set_excl_dirty(hw_addr, ENV_GET_CPU(env)->cpu_index);
+            }
+
+            haddr = addr + env->tlb_table[mmu_idx][index].addend;
+        #if DATA_SIZE == 1
+            glue(glue(st, SUFFIX), _p)((uint8_t *)haddr, val);
+        #else
+            glue(glue(st, SUFFIX), _le_p)((uint8_t *)haddr, val);
+        #endif
+
+            lookup_and_reset_cpus_ll_addr(hw_addr, DATA_SIZE);
+
+            return;
+        } else {
+            if ((addr & (DATA_SIZE - 1)) != 0) {
+                goto do_unaligned_access;
+            }
+            iotlbentry = &env->iotlb[mmu_idx][index];
+
+            /* ??? Note that the io helpers always read data in the target
+               byte ordering.  We should push the LE/BE request down into io.  */
+            val = TGT_BE(val);
+            glue(io_write, SUFFIX)(env, iotlbentry, val, addr, retaddr);
+            return;
         }
-        iotlbentry = &env->iotlb[mmu_idx][index];
-
-        /* ??? Note that the io helpers always read data in the target
-           byte ordering.  We should push the LE/BE request down into io.  */
-        val = TGT_BE(val);
-        glue(io_write, SUFFIX)(env, iotlbentry, val, addr, retaddr);
-        return;
     }
 
     /* Handle slow unaligned access (it spans two pages or IO).  */