
[RFC] target-arm: protect cpu_exclusive_*.

Message ID 1418721234-9588-1-git-send-email-fred.konrad@greensocs.com
State New

Commit Message

fred.konrad@greensocs.com Dec. 16, 2014, 9:13 a.m. UTC
From: KONRAD Frederic <fred.konrad@greensocs.com>

This adds a lock to avoid multiple exclusive access at the same time in case of
TCG multithread.

Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
---
 target-arm/cpu.c       | 15 +++++++++++++++
 target-arm/cpu.h       |  3 +++
 target-arm/helper.h    |  3 +++
 target-arm/op_helper.c | 10 ++++++++++
 target-arm/translate.c |  6 ++++++
 5 files changed, 37 insertions(+)

Comments

Paolo Bonzini Dec. 16, 2014, 9:31 a.m. UTC | #1
On 16/12/2014 10:13, fred.konrad@greensocs.com wrote:
> From: KONRAD Frederic <fred.konrad@greensocs.com>
> 
> This adds a lock to avoid multiple exclusive access at the same time in case of
> TCG multithread.
> 
> Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
> ---
>  target-arm/cpu.c       | 15 +++++++++++++++
>  target-arm/cpu.h       |  3 +++
>  target-arm/helper.h    |  3 +++
>  target-arm/op_helper.c | 10 ++++++++++
>  target-arm/translate.c |  6 ++++++
>  5 files changed, 37 insertions(+)
> 
> diff --git a/target-arm/cpu.c b/target-arm/cpu.c
> index 5ce7350..a55017d 100644
> --- a/target-arm/cpu.c
> +++ b/target-arm/cpu.c
> @@ -31,6 +31,19 @@
>  #include "sysemu/kvm.h"
>  #include "kvm_arm.h"
>  
> +/* Protect cpu_exclusive_* variable .*/
> +QemuMutex cpu_exclusive_lock;
> +
> +inline void arm_exclusive_lock(void)
> +{
> +    qemu_mutex_lock(&cpu_exclusive_lock);
> +}
> +
> +inline void arm_exclusive_unlock(void)
> +{
> +    qemu_mutex_unlock(&cpu_exclusive_lock);
> +}
> +
>  static void arm_cpu_set_pc(CPUState *cs, vaddr value)
>  {
>      ARMCPU *cpu = ARM_CPU(cs);
> @@ -365,6 +378,7 @@ static void arm_cpu_initfn(Object *obj)
>          cpu->psci_version = 2; /* TCG implements PSCI 0.2 */
>          if (!inited) {
>              inited = true;
> +            qemu_mutex_init(&cpu_exclusive_lock);
>              arm_translate_init();
>          }
>      }
> @@ -404,6 +418,7 @@ static void arm_cpu_finalizefn(Object *obj)
>  {
>      ARMCPU *cpu = ARM_CPU(obj);
>      g_hash_table_destroy(cpu->cp_regs);
> +    qemu_mutex_destroy(&cpu_exclusive_lock);

No need for this, and for -smp 2 it will cause the same lock to be
destroyed twice.

Paolo

>  }
>  
>  static void arm_cpu_realizefn(DeviceState *dev, Error **errp)
> diff --git a/target-arm/cpu.h b/target-arm/cpu.h
> index 7f80090..f01c9ef 100644
> --- a/target-arm/cpu.h
> +++ b/target-arm/cpu.h
> @@ -1539,4 +1539,7 @@ enum {
>      QEMU_PSCI_CONDUIT_HVC = 2,
>  };
>  
> +void arm_exclusive_lock(void);
> +void arm_exclusive_unlock(void);
> +
>  #endif
> diff --git a/target-arm/helper.h b/target-arm/helper.h
> index dec3728..ce07711 100644
> --- a/target-arm/helper.h
> +++ b/target-arm/helper.h
> @@ -529,6 +529,9 @@ DEF_HELPER_2(dc_zva, void, env, i64)
>  DEF_HELPER_FLAGS_2(neon_pmull_64_lo, TCG_CALL_NO_RWG_SE, i64, i64, i64)
>  DEF_HELPER_FLAGS_2(neon_pmull_64_hi, TCG_CALL_NO_RWG_SE, i64, i64, i64)
>  
> +DEF_HELPER_0(exclusive_lock, void)
> +DEF_HELPER_0(exclusive_unlock, void)
> +
>  #ifdef TARGET_AARCH64
>  #include "helper-a64.h"
>  #endif
> diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
> index 62012c3..916772f 100644
> --- a/target-arm/op_helper.c
> +++ b/target-arm/op_helper.c
> @@ -33,6 +33,16 @@ static void raise_exception(CPUARMState *env, int tt)
>      cpu_loop_exit(cs);
>  }
>  
> +void HELPER(exclusive_lock)(void)
> +{
> +    arm_exclusive_lock();
> +}
> +
> +void HELPER(exclusive_unlock)(void)
> +{
> +    arm_exclusive_unlock();
> +}
> +
>  uint32_t HELPER(neon_tbl)(CPUARMState *env, uint32_t ireg, uint32_t def,
>                            uint32_t rn, uint32_t maxindex)
>  {
> diff --git a/target-arm/translate.c b/target-arm/translate.c
> index af51568..4a82ad5 100644
> --- a/target-arm/translate.c
> +++ b/target-arm/translate.c
> @@ -7377,6 +7377,7 @@ static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
>          abort();
>      }
>  
> +    gen_helper_exclusive_lock();
>      if (size == 3) {
>          TCGv_i32 tmp2 = tcg_temp_new_i32();
>          TCGv_i32 tmp3 = tcg_temp_new_i32();
> @@ -7392,11 +7393,14 @@ static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
>  
>      store_reg(s, rt, tmp);
>      tcg_gen_extu_i32_i64(cpu_exclusive_addr, addr);
> +    gen_helper_exclusive_unlock();
>  }
>  
>  static void gen_clrex(DisasContext *s)
>  {
> +    gen_helper_exclusive_lock();
>      tcg_gen_movi_i64(cpu_exclusive_addr, -1);
> +    gen_helper_exclusive_unlock();
>  }
>  
>  #ifdef CONFIG_USER_ONLY
> @@ -7427,6 +7431,7 @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
>      done_label = gen_new_label();
>      extaddr = tcg_temp_new_i64();
>      tcg_gen_extu_i32_i64(extaddr, addr);
> +    gen_helper_exclusive_lock();
>      tcg_gen_brcond_i64(TCG_COND_NE, extaddr, cpu_exclusive_addr, fail_label);
>      tcg_temp_free_i64(extaddr);
>  
> @@ -7491,6 +7496,7 @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
>      tcg_gen_movi_i32(cpu_R[rd], 1);
>      gen_set_label(done_label);
>      tcg_gen_movi_i64(cpu_exclusive_addr, -1);
> +    gen_helper_exclusive_unlock();
>  }
>  #endif
>  
>
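
For clarity, the fix implied by the comment above amounts to dropping the destroy from the per-CPU finalizer (a sketch against the patched tree; the process-wide mutex is simply reclaimed when QEMU exits):

 static void arm_cpu_finalizefn(Object *obj)
 {
     ARMCPU *cpu = ARM_CPU(obj);
     g_hash_table_destroy(cpu->cp_regs);
-    qemu_mutex_destroy(&cpu_exclusive_lock);
 }
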
fred.konrad@greensocs.com Dec. 16, 2014, 9:36 a.m. UTC | #2
On 16/12/2014 10:31, Paolo Bonzini wrote:
>
> On 16/12/2014 10:13, fred.konrad@greensocs.com wrote:
>> From: KONRAD Frederic <fred.konrad@greensocs.com>
>>
>> This adds a lock to avoid multiple exclusive access at the same time in case of
>> TCG multithread.
>>
>> Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
>> ---
>>   target-arm/cpu.c       | 15 +++++++++++++++
>>   target-arm/cpu.h       |  3 +++
>>   target-arm/helper.h    |  3 +++
>>   target-arm/op_helper.c | 10 ++++++++++
>>   target-arm/translate.c |  6 ++++++
>>   5 files changed, 37 insertions(+)
[..]
>>       g_hash_table_destroy(cpu->cp_regs);
>> +    qemu_mutex_destroy(&cpu_exclusive_lock);
> No need for this, and for -smp 2 it will cause the same lock to be
> destroyed twice.
>
> Paolo
Hi Paolo,

Good point for SMP!
The mutex doesn't need to be destroyed?

Thanks,
Fred
Paolo Bonzini Dec. 16, 2014, 9:49 a.m. UTC | #3
On 16/12/2014 10:36, Frederic Konrad wrote:
> On 16/12/2014 10:31, Paolo Bonzini wrote:
>>
>> On 16/12/2014 10:13, fred.konrad@greensocs.com wrote:
>>> From: KONRAD Frederic <fred.konrad@greensocs.com>
>>>
>>> This adds a lock to avoid multiple exclusive access at the same time
>>> in case of
>>> TCG multithread.
>>>
>>> Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
>>> ---
>>>   target-arm/cpu.c       | 15 +++++++++++++++
>>>   target-arm/cpu.h       |  3 +++
>>>   target-arm/helper.h    |  3 +++
>>>   target-arm/op_helper.c | 10 ++++++++++
>>>   target-arm/translate.c |  6 ++++++
>>>   5 files changed, 37 insertions(+)
> [..]
>>>       g_hash_table_destroy(cpu->cp_regs);
>>> +    qemu_mutex_destroy(&cpu_exclusive_lock);
>> No need for this, and for -smp 2 it will cause the same lock to be
>> destroyed twice.
>>
>> Paolo
> Hi Paolo,
> 
> Good point for SMP!
> The mutex doesn't need to be destroyed?

Not if it's just one for the whole process.  Exiting QEMU will take care
of it.

Paolo
fred.konrad@greensocs.com Dec. 16, 2014, 9:54 a.m. UTC | #4
On 16/12/2014 10:49, Paolo Bonzini wrote:
>
> On 16/12/2014 10:36, Frederic Konrad wrote:
>> On 16/12/2014 10:31, Paolo Bonzini wrote:
>>> On 16/12/2014 10:13, fred.konrad@greensocs.com wrote:
>>>> From: KONRAD Frederic <fred.konrad@greensocs.com>
>>>>
>>>> This adds a lock to avoid multiple exclusive access at the same time
>>>> in case of
>>>> TCG multithread.
>>>>
>>>> Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
>>>> ---
>>>>    target-arm/cpu.c       | 15 +++++++++++++++
>>>>    target-arm/cpu.h       |  3 +++
>>>>    target-arm/helper.h    |  3 +++
>>>>    target-arm/op_helper.c | 10 ++++++++++
>>>>    target-arm/translate.c |  6 ++++++
>>>>    5 files changed, 37 insertions(+)
>> [..]
>>>>        g_hash_table_destroy(cpu->cp_regs);
>>>> +    qemu_mutex_destroy(&cpu_exclusive_lock);
>>> No need for this, and for -smp 2 it will cause the same lock to be
>>> destroyed twice.
>>>
>>> Paolo
>> Hi Paolo,
>>
>> Good point for SMP!
>> The mutex doesn't need to be destroyed?
> Not if it's just one for the whole process.  Exiting QEMU will take care
> of it.
>
> Paolo
Ok fine :).

Thanks,
Fred
Peter Maydell Dec. 16, 2014, 4:37 p.m. UTC | #5
On 16 December 2014 at 09:13,  <fred.konrad@greensocs.com> wrote:
> From: KONRAD Frederic <fred.konrad@greensocs.com>
>
> This adds a lock to avoid multiple exclusive access at the same time in case of
> TCG multithread.

This feels to me like it's not really possible to review on
its own, since you can't see how it fits into the design of
the rest of the multithreading support.

The other approach here rather than having a pile of mutexes
in the target-* code would be to have TCG IR support for
"begin critical section"/"end critical section". Then you
could have the main loop ensure that no other CPU is running
at the same time as the critical-section code. (linux-user
already has an ad-hoc implementation of this for the
exclusives.)

-- PMM
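
As a rough illustration of that suggestion (the op names below are hypothetical; no such TCG ops exist at this point), the translator could emit generic markers and leave the policy to the core:

    /* Hypothetical sketch only: generic critical-section markers in the IR.
     * The core (cpu-exec or the TCG backend) would decide how to implement
     * them: take a global mutex, stop the other vCPUs, or use host
     * transactions where available. */
    static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
                                   TCGv_i32 addr, int size)
    {
        tcg_gen_begin_critical();   /* hypothetical op */
        /* ... existing loads and cpu_exclusive_{addr,val} bookkeeping ... */
        tcg_gen_end_critical();     /* hypothetical op */
    }
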
fred.konrad@greensocs.com Dec. 17, 2014, 10:27 a.m. UTC | #6
On 16/12/2014 17:37, Peter Maydell wrote:
> On 16 December 2014 at 09:13,  <fred.konrad@greensocs.com> wrote:
>> From: KONRAD Frederic <fred.konrad@greensocs.com>
>>
>> This adds a lock to avoid multiple exclusive access at the same time in case of
>> TCG multithread.
Hi Peter,

> This feels to me like it's not really possible to review on
> its own, since you can't see how it fits into the design of
> the rest of the multithreading support.
True, the only thing we observe is that it didn't change anything right now.

> The other approach here rather than having a pile of mutexes
> in the target-* code would be to have TCG IR support for
> "begin critical section"/"end critical section". Then you
> could have the main loop ensure that no other CPU is running
> at the same time as the critical-section code. (linux-user
> already has an ad-hoc implementation of this for the
> exclusives.)
>
> -- PMM
>
What do you mean by TCG IR?

Thanks,
Fred
Alexander Graf Dec. 17, 2014, 10:28 a.m. UTC | #7
On 17.12.14 11:27, Frederic Konrad wrote:
> On 16/12/2014 17:37, Peter Maydell wrote:
>> On 16 December 2014 at 09:13,  <fred.konrad@greensocs.com> wrote:
>>> From: KONRAD Frederic <fred.konrad@greensocs.com>
>>>
>>> This adds a lock to avoid multiple exclusive access at the same time
>>> in case of
>>> TCG multithread.
> Hi Peter,
> 
>> This feels to me like it's not really possible to review on
>> its own, since you can't see how it fits into the design of
>> the rest of the multithreading support.
> true the only thing we observe is that it didn't change anything right now.
> 
>> The other approach here rather than having a pile of mutexes
>> in the target-* code would be to have TCG IR support for
>> "begin critical section"/"end critical section". Then you
>> could have the main loop ensure that no other CPU is running
>> at the same time as the critical-section code. (linux-user
>> already has an ad-hoc implementation of this for the
>> exclusives.)
>>
>> -- PMM
>>
> What do you mean by TCG IR?

TCG ops. The nice thing is that TCG could translate those into
transactions if the host supports them as well.


Alex
Mark Burton Dec. 17, 2014, 10:31 a.m. UTC | #8
> On 17 Dec 2014, at 11:28, Alexander Graf <agraf@suse.de> wrote:
> 
> 
> 
> On 17.12.14 11:27, Frederic Konrad wrote:
>> On 16/12/2014 17:37, Peter Maydell wrote:
>>> On 16 December 2014 at 09:13,  <fred.konrad@greensocs.com> wrote:
>>>> From: KONRAD Frederic <fred.konrad@greensocs.com>
>>>> 
>>>> This adds a lock to avoid multiple exclusive access at the same time
>>>> in case of
>>>> TCG multithread.
>> Hi Peter,
>> 
>>> This feels to me like it's not really possible to review on
>>> its own, since you can't see how it fits into the design of
>>> the rest of the multithreading support.
>> true the only thing we observe is that it didn't change anything right now.
>> 
>>> The other approach here rather than having a pile of mutexes
>>> in the target-* code would be to have TCG IR support for
>>> "begin critical section"/"end critical section". Then you
>>> could have the main loop ensure that no other CPU is running
>>> at the same time as the critical-section code. (linux-user
>>> already has an ad-hoc implementation of this for the
>>> exclusives.)
>>> 
>>> -- PMM
>>> 
>> What do you mean by TCG IR?
> 
> TCP ops. The nice thing is that TCG could translate those into
> transactions if the host supports them as well.
> 

How's that different in reality from what we have now?
Cheers
Mark.

> 
> Alex


Alexander Graf Dec. 17, 2014, 10:45 a.m. UTC | #9
On 17.12.14 11:31, Mark Burton wrote:
> 
>> On 17 Dec 2014, at 11:28, Alexander Graf <agraf@suse.de> wrote:
>>
>>
>>
>> On 17.12.14 11:27, Frederic Konrad wrote:
>>> On 16/12/2014 17:37, Peter Maydell wrote:
>>>> On 16 December 2014 at 09:13,  <fred.konrad@greensocs.com> wrote:
>>>>> From: KONRAD Frederic <fred.konrad@greensocs.com>
>>>>>
>>>>> This adds a lock to avoid multiple exclusive access at the same time
>>>>> in case of
>>>>> TCG multithread.
>>> Hi Peter,
>>>
>>>> This feels to me like it's not really possible to review on
>>>> its own, since you can't see how it fits into the design of
>>>> the rest of the multithreading support.
>>> true the only thing we observe is that it didn't change anything right now.
>>>
>>>> The other approach here rather than having a pile of mutexes
>>>> in the target-* code would be to have TCG IR support for
>>>> "begin critical section"/"end critical section". Then you
>>>> could have the main loop ensure that no other CPU is running
>>>> at the same time as the critical-section code. (linux-user
>>>> already has an ad-hoc implementation of this for the
>>>> exclusives.)
>>>>
>>>> -- PMM
>>>>
>>> What do you mean by TCG IR?
>>
>> TCP ops. The nice thing is that TCG could translate those into
>> transactions if the host supports them as well.
>>
> 
> Hows that different in reality from what we have now?
> Cheers
> Mark.

The current code can't optimize things in TCG. There's a good chance
your TCG host implementation can have an optimization pass that creates
host cmpxchg instructions or maybe even transaction blocks out of the
critical sections.


Alex
Mark Burton Dec. 17, 2014, 11:12 a.m. UTC | #10
> On 17 Dec 2014, at 11:45, Alexander Graf <agraf@suse.de> wrote:
> 
> 
> 
> On 17.12.14 11:31, Mark Burton wrote:
>> 
>>> On 17 Dec 2014, at 11:28, Alexander Graf <agraf@suse.de> wrote:
>>> 
>>> 
>>> 
>>> On 17.12.14 11:27, Frederic Konrad wrote:
>>>> On 16/12/2014 17:37, Peter Maydell wrote:
>>>>> On 16 December 2014 at 09:13,  <fred.konrad@greensocs.com> wrote:
>>>>>> From: KONRAD Frederic <fred.konrad@greensocs.com>
>>>>>> 
>>>>>> This adds a lock to avoid multiple exclusive access at the same time
>>>>>> in case of
>>>>>> TCG multithread.
>>>> Hi Peter,
>>>> 
>>>>> This feels to me like it's not really possible to review on
>>>>> its own, since you can't see how it fits into the design of
>>>>> the rest of the multithreading support.
>>>> true the only thing we observe is that it didn't change anything right now.
>>>> 
>>>>> The other approach here rather than having a pile of mutexes
>>>>> in the target-* code would be to have TCG IR support for
>>>>> "begin critical section"/"end critical section". Then you
>>>>> could have the main loop ensure that no other CPU is running
>>>>> at the same time as the critical-section code. (linux-user
>>>>> already has an ad-hoc implementation of this for the
>>>>> exclusives.)
>>>>> 
>>>>> -- PMM
>>>>> 
>>>> What do you mean by TCG IR?
>>> 
>>> TCP ops. The nice thing is that TCG could translate those into
>>> transactions if the host supports them as well.
>>> 
>> 
>> Hows that different in reality from what we have now?
>> Cheers
>> Mark.
> 
> The current code can't optimize things in TCG. There's a good chance
> your TCG host implementation can have an optimization pass that creates
> host cmpxchg instructions or maybe even transaction blocks out of the
> critical sections.
> 
> 


Ok - I get it - I see the value - so long as it’s possible to do. It would solve a lot of problems...

We were not (yet) trying to fix that; we were simply asking the question: if we add these mutexes, do we have any detrimental impact on anything?
Seems like the answer is that adding the mutexes is fine - it doesn’t seem to have a performance impact or anything. Good.

But - I see what you mean - if we implemented this as an op, then it would be much simpler to optimise/fix properly afterwards - and - that “fix” might not even need to deal with the whole memory chain issue - maybe….. 

Cheers

Mark.
Alexander Graf Dec. 17, 2014, 11:18 a.m. UTC | #11
On 17.12.14 12:12, Mark Burton wrote:
> 
>> On 17 Dec 2014, at 11:45, Alexander Graf <agraf@suse.de> wrote:
>>
>>
>>
>> On 17.12.14 11:31, Mark Burton wrote:
>>>
>>>> On 17 Dec 2014, at 11:28, Alexander Graf <agraf@suse.de> wrote:
>>>>
>>>>
>>>>
>>>> On 17.12.14 11:27, Frederic Konrad wrote:
>>>>> On 16/12/2014 17:37, Peter Maydell wrote:
>>>>>> On 16 December 2014 at 09:13,  <fred.konrad@greensocs.com> wrote:
>>>>>>> From: KONRAD Frederic <fred.konrad@greensocs.com>
>>>>>>>
>>>>>>> This adds a lock to avoid multiple exclusive access at the same time
>>>>>>> in case of
>>>>>>> TCG multithread.
>>>>> Hi Peter,
>>>>>
>>>>>> This feels to me like it's not really possible to review on
>>>>>> its own, since you can't see how it fits into the design of
>>>>>> the rest of the multithreading support.
>>>>> true the only thing we observe is that it didn't change anything right now.
>>>>>
>>>>>> The other approach here rather than having a pile of mutexes
>>>>>> in the target-* code would be to have TCG IR support for
>>>>>> "begin critical section"/"end critical section". Then you
>>>>>> could have the main loop ensure that no other CPU is running
>>>>>> at the same time as the critical-section code. (linux-user
>>>>>> already has an ad-hoc implementation of this for the
>>>>>> exclusives.)
>>>>>>
>>>>>> -- PMM
>>>>>>
>>>>> What do you mean by TCG IR?
>>>>
>>>> TCP ops. The nice thing is that TCG could translate those into
>>>> transactions if the host supports them as well.
>>>>
>>>
>>> Hows that different in reality from what we have now?
>>> Cheers
>>> Mark.
>>
>> The current code can't optimize things in TCG. There's a good chance
>> your TCG host implementation can have an optimization pass that creates
>> host cmpxchg instructions or maybe even transaction blocks out of the
>> critical sections.
>>
>>
> 
> 
> Ok - I get it - I see the value - so long as it’s possible to do. It would solve a lot of problems...
> 
> We were not (yet) trying to fix that, we were simply asking the question, if we add these mutex’s - do we have any detrimental impact on anything.
> Seems like the answer is that adding the mutex’s is fine - it doesn’t seem to have a performance impact or anything. Good.
> 
> But - I see what you mean - if we implemented this as an op, then it would be much simpler to optimise/fix properly afterwards - and - that “fix” might not even need to deal with the whole memory chain issue - maybe….. 

Yes, especially given that transactional memory is getting pretty common
these days (Haswell had it, POWER8 has it), I think it makes a lot of
sense to just base the design here on its concept. It's the easiest
way to make parallel memory accesses fast without taking big locks all
the time.

So I think the best way to go forward would be to add transaction_start
and transaction_end opcodes to TCG and implement them as mutex locks
today. When you get the chance to get yourself a machine that supports
actual TM, try to replace them with transaction start/end blocks and
have the normal mutex code as fallback if the transaction fails.


Alex
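
For reference, a minimal sketch of that "transaction with mutex fallback" pattern in plain C with the Intel RTM intrinsics (x86 host, compile with -mrtm; runtime detection of RTM is omitted and the state is simplified to a single global critical section). This is illustrative lock elision, not QEMU code:

    #include <immintrin.h>
    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t fallback_lock = PTHREAD_MUTEX_INITIALIZER;
    static volatile bool fallback_taken;

    static void transaction_start(void)
    {
        if (_xbegin() == _XBEGIN_STARTED) {
            /* Inside a hardware transaction: abort if someone holds the
             * fallback lock, so the two paths exclude each other. */
            if (fallback_taken) {
                _xabort(0xff);
            }
            return;
        }
        /* Transaction aborted: fall back to the mutex. */
        pthread_mutex_lock(&fallback_lock);
        fallback_taken = true;
    }

    static void transaction_end(void)
    {
        if (_xtest()) {                 /* still inside a transaction */
            _xend();
        } else {
            fallback_taken = false;
            pthread_mutex_unlock(&fallback_lock);
        }
    }
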
Peter Maydell Dec. 17, 2014, 11:19 a.m. UTC | #12
On 17 December 2014 at 11:12, Mark Burton <mark.burton@greensocs.com> wrote:
> We were not (yet) trying to fix that, we were simply asking the
> question, if we add these mutex’s - do we have any detrimental impact
> on anything.
> Seems like the answer is that adding the mutex’s is fine - it doesn’t
> seem to have a performance impact or anything. Good.

Personally I dislike the mutex approach because it means that every
target ends up with its own ad-hoc implementation of something that
would be better implemented in the QEMU core support (even if that
core support is simply taking mutexes in the end).

I also note that the linux-user code for handling exclusives does it
by having a region (effectively) where all other CPUs stop executing
*anything*. Do we need that?

PS: you'll find that we already have some ancient and half-broken
support code for the mutex approach (search for 'spinlock_t')...

-- PMM
Paolo Bonzini Dec. 17, 2014, 11:25 a.m. UTC | #13
On 17/12/2014 12:18, Alexander Graf wrote:
> 
> So I think the best way to go forward would be to add transaction_start
> and transaction_end opcodes to TCG and implement them as mutex locks
> today. When you get the chance to get yourself a machine that supports
> actual TM, try to replace them with transaction start/end blocks and
> have the normal mutex code as fallback if the transaction fails.

Or implement load_locked/store_conditional TCG ops.  They can be
implemented as transactions, hardware ll/sc, or something slow that uses
the MMU.

Paolo
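
A sketch of the "something slow" variant: hypothetical helpers behind such load_locked/store_conditional ops, using a per-vCPU monitor serialised by one global lock. The monitor field and helper names are made up for illustration; only qemu_mutex_*() and the cpu_ldq_data()/cpu_stq_data() accessors are existing APIs, and this is essentially the cpu_exclusive_addr/cpu_exclusive_val bookkeeping target-arm already does, moved behind generic ops:

    typedef struct ExclusiveMonitor {
        uint64_t addr;   /* guest address armed by the last load_locked */
        uint64_t val;    /* value observed by that load */
        bool     armed;
    } ExclusiveMonitor;

    static QemuMutex exclusive_slow_lock;     /* serialises all monitors */

    uint64_t helper_load_locked(CPUArchState *env, uint64_t addr)
    {
        ExclusiveMonitor *m = &env->monitor;  /* hypothetical per-vCPU field */
        uint64_t val;

        qemu_mutex_lock(&exclusive_slow_lock);
        val = cpu_ldq_data(env, addr);
        m->addr = addr;
        m->val = val;
        m->armed = true;
        qemu_mutex_unlock(&exclusive_slow_lock);
        return val;
    }

    /* Returns 0 on success and 1 on failure, like ARM strex. */
    uint64_t helper_store_conditional(CPUArchState *env, uint64_t addr,
                                      uint64_t val)
    {
        ExclusiveMonitor *m = &env->monitor;
        uint64_t fail = 1;

        qemu_mutex_lock(&exclusive_slow_lock);
        if (m->armed && m->addr == addr && cpu_ldq_data(env, addr) == m->val) {
            cpu_stq_data(env, addr, val);
            fail = 0;
        }
        m->armed = false;
        qemu_mutex_unlock(&exclusive_slow_lock);
        return fail;
    }
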
Peter Maydell Dec. 17, 2014, 11:36 a.m. UTC | #14
On 17 December 2014 at 11:25, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 17/12/2014 12:18, Alexander Graf wrote:
>>
>> So I think the best way to go forward would be to add transaction_start
>> and transaction_end opcodes to TCG and implement them as mutex locks
>> today. When you get the chance to get yourself a machine that supports
>> actual TM, try to replace them with transaction start/end blocks and
>> have the normal mutex code as fallback if the transaction fails.
>
> Or implement load_locked/store_conditional TCG ops.  They can be
> implemented as transactions, hardware ll/sc, or something slow that uses
> the MMU.

You'd need to compare the semantics of ll/sc across the various
architectures to find out if there was anything you could
actually meaningfully define as useful TCG op semantics there.

-- PMM
Mark Burton Dec. 17, 2014, 3:52 p.m. UTC | #15
Actually - I don't see any other option.
Playing with the ideas - it seems to me that if we were to implement ‘generic’ lock/unlock instructions which could then somehow be ‘combined’ with loads/stores, then we would be relying on an optimisation step to ‘notice’ that this could be combined into e.g. a store EX on ARM, or whatever. That strikes me as risky.

But then - if we add load/store exclusive type operations - that's great for e.g. ARM on x86, but does it really cover other architectures well?

I am worried that if we go down this path, we will soon end up with a lot of architecture-specific TCG ops…

Cheers

Mark.

> On 17 Dec 2014, at 12:25, Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
> 
> 
> On 17/12/2014 12:18, Alexander Graf wrote:
>> 
>> So I think the best way to go forward would be to add transaction_start
>> and transaction_end opcodes to TCG and implement them as mutex locks
>> today. When you get the chance to get yourself a machine that supports
>> actual TM, try to replace them with transaction start/end blocks and
>> have the normal mutex code as fallback if the transaction fails.
> 
> Or implement load_locked/store_conditional TCG ops.  They can be
> implemented as transactions, hardware ll/sc, or something slow that uses
> the MMU.
> 
> Paolo


Mark Burton Dec. 17, 2014, 4:17 p.m. UTC | #16
Sorry - I should have replied to this, Peter.
I agree with you - I don't know how much overlap we'll find with different architectures.
But if we stick to the more generic ‘lock/unlock’, I don't see how this is going to help us output thread-safe code without going through a mutex - at which point we are back to square 1.

Cheers
Mark.

> On 17 Dec 2014, at 12:36, Peter Maydell <peter.maydell@linaro.org> wrote:
> 
> On 17 December 2014 at 11:25, Paolo Bonzini <pbonzini@redhat.com> wrote:
>> 
>> 
>> On 17/12/2014 12:18, Alexander Graf wrote:
>>> 
>>> So I think the best way to go forward would be to add transaction_start
>>> and transaction_end opcodes to TCG and implement them as mutex locks
>>> today. When you get the chance to get yourself a machine that supports
>>> actual TM, try to replace them with transaction start/end blocks and
>>> have the normal mutex code as fallback if the transaction fails.
>> 
>> Or implement load_locked/store_conditional TCG ops.  They can be
>> implemented as transactions, hardware ll/sc, or something slow that uses
>> the MMU.
> 
> You'd need to compare the semantics of ll/sc across the various
> architectures to find out if there was anything you could
> actually meaningfully define as useful TCG op semantics there.
> 
> -- PMM


Dr. David Alan Gilbert Dec. 17, 2014, 4:20 p.m. UTC | #17
* Mark Burton (mark.burton@greensocs.com) wrote:
> Actually - I dont see any other option.
> Playing with the ideas - it seems to me that if we were to implement ?generic? Lock/unlock instructions which could then somehow we ?combined? with loads/stores then we would be relying on an optimisation step to ?notice? that this could be combined into e.g. a store EX on ARM, or whatever. That strikes me as risky .
> 
> But then - if we add load/store exclusive type operations - thats great for e.g. ARM on X86, but does it really cover other architectures well?
> 
> I am worried that if we go this path, we will soon end up with a lot of architecturally specific TCG ops?.

I'd expect you to end up with two types;
   1) the ARM/MIPS/PPC split load/store,
   2) the x86/s390/ARMv8.1 compare exchange.

The tricky thing is to pick a sane set of TCG ops that is a good fit into each
of the two groups on different targets.

Dave
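
For concreteness, the usual way the split family is lowered onto the compare-exchange family, in plain C with the GCC/Clang __atomic builtins (not QEMU code; a real implementation would keep the linked value per vCPU, and note the A-B-A weakness compared with a real monitor):

    #include <stdint.h>
    #include <stdbool.h>

    static __thread uint32_t linked_val;   /* value remembered by "load linked" */

    static uint32_t load_linked(uint32_t *addr)
    {
        linked_val = __atomic_load_n(addr, __ATOMIC_ACQUIRE);
        return linked_val;
    }

    /* Succeeds only if memory still holds the linked value; unlike real
     * ll/sc, an intervening A-B-A change is not detected. */
    static bool store_conditional(uint32_t *addr, uint32_t newval)
    {
        uint32_t expected = linked_val;
        return __atomic_compare_exchange_n(addr, &expected, newval,
                                           false, __ATOMIC_ACQ_REL,
                                           __ATOMIC_ACQUIRE);
    }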


> 
> Cheers
> 
> Mark.
> 
> > On 17 Dec 2014, at 12:25, Paolo Bonzini <pbonzini@redhat.com> wrote:
> > 
> > 
> > 
> > On 17/12/2014 12:18, Alexander Graf wrote:
> >> 
> >> So I think the best way to go forward would be to add transaction_start
> >> and transaction_end opcodes to TCG and implement them as mutex locks
> >> today. When you get the chance to get yourself a machine that supports
> >> actual TM, try to replace them with transaction start/end blocks and
> >> have the normal mutex code as fallback if the transaction fails.
> > 
> > Or implement load_locked/store_conditional TCG ops.  They can be
> > implemented as transactions, hardware ll/sc, or something slow that uses
> > the MMU.
> > 
> > Paolo
> 
> 
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Peter Maydell Dec. 17, 2014, 4:27 p.m. UTC | #18
On 17 December 2014 at 16:17, Mark Burton <mark.burton@greensocs.com> wrote:
> Sorry - I should have replied to this Peter
> I agree with you - I dont know how much overlap we’ll find with different architectures.
> But if we stick to the more generic ‘lock/unlock’, I dont see how this is going to help us output thread safe code without going thought a mutex - at which point we are back to square 1.

I think a mutex is fine, personally -- I just don't want
to see fifteen hand-hacked mutexes in the target-* code.

-- PMM
Mark Burton Dec. 17, 2014, 4:29 p.m. UTC | #19
> On 17 Dec 2014, at 17:27, Peter Maydell <peter.maydell@linaro.org> wrote:
> 
> On 17 December 2014 at 16:17, Mark Burton <mark.burton@greensocs.com> wrote:
>> Sorry - I should have replied to this Peter
>> I agree with you - I dont know how much overlap we’ll find with different architectures.
>> But if we stick to the more generic ‘lock/unlock’, I dont see how this is going to help us output thread safe code without going thought a mutex - at which point we are back to square 1.
> 
> I think a mutex is fine, personally -- I just don't want
> to see fifteen hand-hacked mutexes in the target-* code.
> 

Which would seem to favour the helper function approach?
Or am I missing something?
If we can’t arrange for the target code to optimise the mutex away and use host-native instructions, then I don’t really see the benefit of complicating the IR and the target code?

Cheers

Mark.

> -- PMM


Peter Maydell Dec. 17, 2014, 4:39 p.m. UTC | #20
On 17 December 2014 at 16:29, Mark Burton <mark.burton@greensocs.com> wrote:
>> On 17 Dec 2014, at 17:27, Peter Maydell <peter.maydell@linaro.org> wrote:
>> I think a mutex is fine, personally -- I just don't want
>> to see fifteen hand-hacked mutexes in the target-* code.
>>
>
> Which would seem to favour the helper function approach?
> Or am I missing something?

You need at least some support from QEMU core -- consider
what happens with this patch if the ldrex takes a data
abort, for instance.

And if you need the "stop all other CPUs while I do this"
semantics linux-user currently uses then that definitely needs
core code support. (Maybe linux-user is being over-zealous
there; I haven't thought about it.)

-- PMM
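
Concretely, the hazard with the patch as posted (a sketch of the generated sequence for ldrex, not a proposed change):

    /*
     *   gen_helper_exclusive_lock();    // cpu_exclusive_lock taken
     *   ...guest load...                // can fault: the softmmu path ends in
     *                                   // raise_exception()/cpu_loop_exit(),
     *                                   // which longjmps back to cpu_exec()
     *   gen_helper_exclusive_unlock();  // never reached on a data abort,
     *                                   // so the lock is left held
     */
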
Peter Maydell Dec. 17, 2014, 4:51 p.m. UTC | #21
On 17 December 2014 at 16:39, Peter Maydell <peter.maydell@linaro.org> wrote:
> You need at least some support from QEMU core -- consider
> what happens with this patch if the ldrex takes a data
> abort, for instance.

More generally, I would rather see a patch which provides
some mechanisms and utility routines in core code which
targets can use for their target-specific purposes, not
a patch which only touches target-arm.

thanks
-- PMM
Mark Burton Dec. 18, 2014, 9:12 a.m. UTC | #22
> On 17 Dec 2014, at 17:39, Peter Maydell <peter.maydell@linaro.org> wrote:
> 
> On 17 December 2014 at 16:29, Mark Burton <mark.burton@greensocs.com> wrote:
>>> On 17 Dec 2014, at 17:27, Peter Maydell <peter.maydell@linaro.org> wrote:
>>> I think a mutex is fine, personally -- I just don't want
>>> to see fifteen hand-hacked mutexes in the target-* code.
>>> 
>> 
>> Which would seem to favour the helper function approach?
>> Or am I missing something?
> 
> You need at least some support from QEMU core -- consider
> what happens with this patch if the ldrex takes a data
> abort, for instance.
> 
> And if you need the "stop all other CPUs while I do this”

It looks like a corner case, but working this through - the ’simple’ put-a-mutex-around-the-atomic-instructions approach would indeed need to ensure that no other core was doing anything - that just happens to be true for qemu today (or - we would have to put a mutex around all writes), in order to handle the case where a store exclusive could potentially fail if a non-atomic instruction wrote (a different value) to the same address. This is currently guaranteed by the implementation in Qemu - how useful it is I don't know, but if we break it, we run the risk that something will fail (at the least, we could not claim to have kept things the same).

This also has implications for the idea of adding TCG ops I think...
The ideal scenario is that we could ‘fall back’ on the same semantics that are there today - allowing specific target/host combinations to be optimised (and to improve their functionality).
But that means, from within the TCG op, we would need to have a mechanism to cause the other TCG vCPUs to take an exit… etc. etc. In the end, I’m sure it’s possible, but it feels so awkward.

To re-cap where we are (for my own benefit if nobody else):
We have several propositions in terms of implementing Atomic instructions

1/ We wrap the atomic instructions in a mutex using helper functions (this is the approach others have taken, it’s simple, but it is not clean, as stated above).

1.5/ We add a mechanism to ensure that when the mutex is taken, all other cores are ‘stopped’.

2/ We add some TCG ops to effectively do the same thing, but this would give us the benefit of being able to provide better implementations. This is attractive, but we would end up needing ops to cover at least exclusive load/store and atomic compare exchange. To me this looks less than elegant (being pulled close to the target, rather than being able to generalise), but it’s not clear how we would implement the operations as we would like, with a machine instruction, unless we did split them out along these lines. This approach also (probably) requires the 1.5 mechanism above.

3/ We have discussed a ‘h/w’ approach to the problem. In this case, all atomic instructions are forced to take the slow path - and additional flags are added to the memory API. We then deal with the issue closer to the memory, where we can record who has a lock on a memory address. For this to work - we would also either
a) need to add an mprotect-type approach to ensure no ‘non-atomic’ writes occur - or
b) need to force all cores to mark the page with the exclusive memory as IO or similar to ensure that all write accesses followed the slow path.

4/ There is an option to implement exclusive operations within the TCG using mprotect (and signal handlers); see the illustrative sketch after this list. I have some concerns on this: would we need to have support for each host O/S? I also think we might end up with a lot of protected regions causing a lot of SIGSEGVs because an errant guest doesn’t behave well - basically we will need to see the impact on performance - finally - this will be really painful to deal with for cases where the exclusive memory is held in what Qemu considers IO space!!!
	In other words - putting the mprotect inside TCG looks to me like it’s mutually exclusive to supporting a memory-based scheme like (3).
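
A very rough sketch of the mechanism behind option 4, in plain C (illustrative only: every name here is made up, the handler would be installed with sigaction() and SA_SIGINFO, a real implementation would be per-OS and per-vCPU, and it still needs a story for exclusives in what QEMU treats as I/O space):

    #include <signal.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static void *monitored_page;          /* host page holding the exclusive address */
    static volatile bool monitor_valid;   /* cleared when another thread writes it */

    static void segv_handler(int sig, siginfo_t *si, void *ctx)
    {
        uintptr_t page = (uintptr_t)si->si_addr & ~((uintptr_t)getpagesize() - 1);

        (void)sig; (void)ctx;
        if ((void *)page == monitored_page) {
            monitor_valid = false;                    /* exclusivity lost */
            mprotect((void *)page, getpagesize(),
                     PROT_READ | PROT_WRITE);         /* let the faulting write retry */
        }
        /* otherwise: a genuine fault, to be re-raised in a real implementation */
    }

    static void arm_monitor(void *host_addr)          /* called on ldrex */
    {
        uintptr_t page = (uintptr_t)host_addr & ~((uintptr_t)getpagesize() - 1);

        monitored_page = (void *)page;
        monitor_valid = true;
        mprotect((void *)page, getpagesize(), PROT_READ);   /* trap all writes */
    }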


My personal preference is for 3b); it is “safe” - it’s where the hardware is.
3a is an optimization of that.
To me, (2) is an optimisation again. We are effectively saying: if you are able to do this directly, then you don’t need to pass via the slow path. Otherwise, you always have the option of reverting to the slow path.

Frankly - 1 and 1.5 are hacks - they are not optimisations, they are just dirty hacks. However - their saving grace is that they are hacks that exist and “work”. I dislike patching the hack, but it did seem to offer the fastest solution to get around this problem - at least for now. I am no longer convinced.

4/ is something I’d like other people’s views on too… Is it a better approach? What about the slow path?

I increasingly begin to feel that we should really approach this from the other end, and provide a ‘correct’ solution using the memory - then worry about making that faster…

Cheers

Mark.








> semantics linux-user currently uses then that definitely needs
> core code support. (Maybe linux-user is being over-zealous
> there; I haven't thought about it.)
> 
> -- PMM


Alexander Graf Dec. 18, 2014, 12:24 p.m. UTC | #23
On 18.12.14 10:12, Mark Burton wrote:
> 
>> On 17 Dec 2014, at 17:39, Peter Maydell <peter.maydell@linaro.org> wrote:
>>
>> On 17 December 2014 at 16:29, Mark Burton <mark.burton@greensocs.com> wrote:
>>>> On 17 Dec 2014, at 17:27, Peter Maydell <peter.maydell@linaro.org> wrote:
>>>> I think a mutex is fine, personally -- I just don't want
>>>> to see fifteen hand-hacked mutexes in the target-* code.
>>>>
>>>
>>> Which would seem to favour the helper function approach?
>>> Or am I missing something?
>>
>> You need at least some support from QEMU core -- consider
>> what happens with this patch if the ldrex takes a data
>> abort, for instance.
>>
>> And if you need the "stop all other CPUs while I do this”
> 
> It looks like a corner case, but working this through - the ’simple’ put a mutex around the atomic instructions approach would indeed need to ensure that no other core was doing anything - that just happens to be true for qemu today (or - we would have to put a mutex around all writes); in order to ensure the case where a store exclusive could potential fail if a non-atomic instruction wrote (a different value) to the same address. This is currently guarantee by the implementation in Qemu - how useful it is I dont know, but if we break it, we run the risk that something will fail (at the least, we could not claim to have kept things the same).
> 
> This also has implications for the idea of adding TCG ops I think...
> The ideal scenario is that we could ‘fallback’ on the same semantics that are there today - allowing specific target/host combinations to be optimised (and to improve their functionality). 
> But that means, from within the TCG Op, we would need to have a mechanism, to cause other TCG’s to take an exit…. etc etc… In the end, I’m sure it’s possible, but it feels so awkward.

That's the nice thing about transactions - they guarantee that no other
CPU accesses the same cache line at the same time. So you're safe
against other vcpus even without blocking them manually.

For the non-transactional implementation we probably would need an "IPI
others and halt them until we're done with the critical section"
approach. But I really wouldn't concentrate on making things fast on old
CPUs.

Also keep in mind that for the UP case we can always omit all the magic
- we only need to detect when we move into an SMP case (linux-user clone
or -smp on system).

> 
> To re-cap where we are (for my own benefit if nobody else):
> We have several propositions in terms of implementing Atomic instructions
> 
> 1/ We wrap the atomic instructions in a mutex using helper functions (this is the approach others have taken, it’s simple, but it is not clean, as stated above).

This is horrible. Imagine you have this split approach with a load
exclusive and then a store, where the load starts mutex usage and the
store stops it. At that point, if the store creates a segfault you'll be
left with a dangling mutex.

This stuff really belongs into the TCG core.

> 
> 1.5/ We add a mechanism to ensure that when the mutex is taken, all other cores are ‘stopped’.
> 
> 2/ We add some TCG ops to effectively do the same thing, but this would give us the benefit of being able to provide better implementations. This is attractive, but we would end up needing ops to cover at least exclusive load/store and atomic compare exchange. To me this looks less than elegant (being pulled close to the target, rather than being able to generalise), but it’s not clear how we would implement the operations as we would like, with a machine instruction, unless we did split them out along these lines. This approach also (probably) requires the 1.5 mechanism above.

I'm still in favor of just forcing the semantics of transactions onto
this. If the host doesn't implement transactions, tough luck - do the
"halt all others" IPI.

> 
> 3/ We have discussed a ‘h/w’ approach to the problem. In this case, all atomic instructions are forced to take the slow path - and a additional flags are added to the memory API. We then deal with the issue closer to the memory where we can record who has a lock on a memory address. For this to work - we would also either
> a) need to add a mprotect type approach to ensure no ‘non atomic’ writes occur - or
> b) need to force all cores to mark the page with the exclusive memory as IO or similar to ensure that all write accesses followed the slow path.
> 
> 4/ There is an option to implement exclusive operations within the TCG using mprotect (and signal handlers). I have some concerns on this : would we need have to have support for each host O/S…. I also think we might end up the a lot of protected regions causing a lot of SIGSEGV’s because an errant guest doesn’t behave well - basically we will need to see the impact on performance - finally - this will be really painful to deal with for cases where the exclusive memory is held in what Qemu considers IO space !!!
> 	In other words - putting the mprotect inside TCG looks to me like it’s mutually exclusive to supporting a memory-based scheme like (3).

Again, I don't think it's worth caring about legacy host systems too
much. In a few years from now transactional memory will be commodity,
just like KVM is today.


Alex

> My personal preference is for 3b) it  is “safe” - its where the hardware is.
> 3a is an optimization of that.
> to me, (2) is an optimisation again. We are effectively saying, if you are able to do this directly, then you dont need to pass via the slow path. Otherwise, you always have the option of reverting to the slow path.
> 
> Frankly - 1 and 1.5 are hacks - they are not optimisations, they are just dirty hacks. However - their saving grace is that they are hacks that exist and “work”. I dislike patching the hack, but it did seem to offer the fastest solution to get around this problem - at least for now. I am no longer convinced.
> 
> 4/ is something I’d like other peoples views on too… Is it a better approach? What about the slow path?
> 
> I increasingly begin to feel that we should really approach this from the other end, and provide a ‘correct’ solution using the memory - then worry about making that faster…
> 
> Cheers
> 
> Mark.
> 
> 
> 
> 
> 
> 
> 
> 
>> semantics linux-user currently uses then that definitely needs
>> core code support. (Maybe linux-user is being over-zealous
>> there; I haven't thought about it.)
>>
>> -- PMM
> 
> 
>
Dr. David Alan Gilbert Dec. 18, 2014, 12:35 p.m. UTC | #24
* Alexander Graf (agraf@suse.de) wrote:
> 
> 
> On 18.12.14 10:12, Mark Burton wrote:
> > 
> >> On 17 Dec 2014, at 17:39, Peter Maydell <peter.maydell@linaro.org> wrote:
> >>
> >> On 17 December 2014 at 16:29, Mark Burton <mark.burton@greensocs.com> wrote:
> >>>> On 17 Dec 2014, at 17:27, Peter Maydell <peter.maydell@linaro.org> wrote:
> >>>> I think a mutex is fine, personally -- I just don't want
> >>>> to see fifteen hand-hacked mutexes in the target-* code.
> >>>>
> >>>
> >>> Which would seem to favour the helper function approach?
> >>> Or am I missing something?
> >>
> >> You need at least some support from QEMU core -- consider
> >> what happens with this patch if the ldrex takes a data
> >> abort, for instance.
> >>
> >> And if you need the "stop all other CPUs while I do this”
> > 
> > It looks like a corner case, but working this through - the ’simple’ put a mutex around the atomic instructions approach would indeed need to ensure that no other core was doing anything - that just happens to be true for qemu today (or - we would have to put a mutex around all writes); in order to ensure the case where a store exclusive could potential fail if a non-atomic instruction wrote (a different value) to the same address. This is currently guarantee by the implementation in Qemu - how useful it is I dont know, but if we break it, we run the risk that something will fail (at the least, we could not claim to have kept things the same).
> > 
> > This also has implications for the idea of adding TCG ops I think...
> > The ideal scenario is that we could ‘fallback’ on the same semantics that are there today - allowing specific target/host combinations to be optimised (and to improve their functionality). 
> > But that means, from within the TCG Op, we would need to have a mechanism, to cause other TCG’s to take an exit…. etc etc… In the end, I’m sure it’s possible, but it feels so awkward.
> 
> That's the nice thing about transactions - they guarantee that no other
> CPU accesses the same cache line at the same time. So you're safe
> against other vcpus even without blocking them manually.
> 
> For the non-transactional implementation we probably would need an "IPI
> others and halt them until we're done with the critical section"
> approach. But I really wouldn't concentrate on making things fast on old
> CPUs.

Hang on; 99% of the world's CPUs don't have (working) transactional memory,
so it's a bit excessive to lump them all under old CPUs.

> Also keep in mind that for the UP case we can always omit all the magic
> - we only need to detect when we move into an SMP case (linux-user clone
> or -smp on system).

Depends on the architecture as to whether IO breaks those types of ops.

Dave

> 
> > 
> > To re-cap where we are (for my own benefit if nobody else):
> > We have several propositions in terms of implementing Atomic instructions
> > 
> > 1/ We wrap the atomic instructions in a mutex using helper functions (this is the approach others have taken, it’s simple, but it is not clean, as stated above).
> 
> This is horrible. Imagine you have this split approach with a load
> exclusive and then store whereas the load starts mutex usage and the
> store stop is. At that point if the store creates a segfault you'll be
> left with a dangling mutex.
> 
> This stuff really belongs into the TCG core.
> 
> > 
> > 1.5/ We add a mechanism to ensure that when the mutex is taken, all other cores are ‘stopped’.
> > 
> > 2/ We add some TCG ops to effectively do the same thing, but this would give us the benefit of being able to provide better implementations. This is attractive, but we would end up needing ops to cover at least exclusive load/store and atomic compare exchange. To me this looks less than elegant (being pulled close to the target, rather than being able to generalise), but it’s not clear how we would implement the operations as we would like, with a machine instruction, unless we did split them out along these lines. This approach also (probably) requires the 1.5 mechanism above.
> 
> I'm still in favor of just forcing the semantics of transactions onto
> this. If the host doesn't implement transactions, tough luck - do the
> "halt all others" IPI.
> 
> > 
> > 3/ We have discussed a ‘h/w’ approach to the problem. In this case, all atomic instructions are forced to take the slow path - and a additional flags are added to the memory API. We then deal with the issue closer to the memory where we can record who has a lock on a memory address. For this to work - we would also either
> > a) need to add a mprotect type approach to ensure no ‘non atomic’ writes occur - or
> > b) need to force all cores to mark the page with the exclusive memory as IO or similar to ensure that all write accesses followed the slow path.
> > 
> > 4/ There is an option to implement exclusive operations within the TCG using mprotect (and signal handlers). I have some concerns on this : would we need have to have support for each host O/S…. I also think we might end up the a lot of protected regions causing a lot of SIGSEGV’s because an errant guest doesn’t behave well - basically we will need to see the impact on performance - finally - this will be really painful to deal with for cases where the exclusive memory is held in what Qemu considers IO space !!!
> > 	In other words - putting the mprotect inside TCG looks to me like it’s mutually exclusive to supporting a memory-based scheme like (3).
> 
> Again, I don't think it's worth caring about legacy host systems too
> much. In a few years from now transactional memory will be commodity,
> just like KVM is today.
> 
> 
> Alex
> 
> > My personal preference is for 3b) it  is “safe” - its where the hardware is.
> > 3a is an optimization of that.
> > to me, (2) is an optimisation again. We are effectively saying, if you are able to do this directly, then you dont need to pass via the slow path. Otherwise, you always have the option of reverting to the slow path.
> > 
> > Frankly - 1 and 1.5 are hacks - they are not optimisations, they are just dirty hacks. However - their saving grace is that they are hacks that exist and “work”. I dislike patching the hack, but it did seem to offer the fastest solution to get around this problem - at least for now. I am no longer convinced.
> > 
> > 4/ is something I’d like other peoples views on too… Is it a better approach? What about the slow path?
> > 
> > I increasingly begin to feel that we should really approach this from the other end, and provide a ‘correct’ solution using the memory - then worry about making that faster…
> > 
> > Cheers
> > 
> > Mark.
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> >> semantics linux-user currently uses then that definitely needs
> >> core code support. (Maybe linux-user is being over-zealous
> >> there; I haven't thought about it.)
> >>
> >> -- PMM
> > 
> > 
> > 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Paolo Bonzini Dec. 18, 2014, 1:28 p.m. UTC | #25
On 18/12/2014 13:24, Alexander Graf wrote:
> That's the nice thing about transactions - they guarantee that no other
> CPU accesses the same cache line at the same time. So you're safe
> against other vcpus even without blocking them manually.
> 
> For the non-transactional implementation we probably would need an "IPI
> others and halt them until we're done with the critical section"
> approach. But I really wouldn't concentrate on making things fast on old
> CPUs.

The non-transactional implementation can use softmmu to trap access to
the page from other VCPUs.  This makes it possible to implement (at the
cost of speed) the same semantics on all hosts.

Paolo
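
A conceptual sketch of that softmmu idea (CPU_FOREACH is the real per-CPU iterator; everything else here is a made-up name): when a vCPU arms its monitor, the page is flagged so that stores from every vCPU leave the fast TLB path, and the common store slow path can then break the other monitors before writing:

    void slow_path_store(CPUState *cpu, uint64_t guest_addr, uint64_t val)
    {
        CPUState *other;

        CPU_FOREACH(other) {
            if (other != cpu &&
                monitor_covers(other, guest_addr)) {   /* hypothetical */
                clear_monitor(other);                  /* hypothetical */
            }
        }
        perform_store(cpu, guest_addr, val);           /* hypothetical */
    }
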
Mark Burton Dec. 18, 2014, 1:56 p.m. UTC | #26
> On 18 Dec 2014, at 13:24, Alexander Graf <agraf@suse.de> wrote:
> 
> 
> 
> On 18.12.14 10:12, Mark Burton wrote:
>> 
>>> On 17 Dec 2014, at 17:39, Peter Maydell <peter.maydell@linaro.org> wrote:
>>> 
>>> On 17 December 2014 at 16:29, Mark Burton <mark.burton@greensocs.com> wrote:
>>>>> On 17 Dec 2014, at 17:27, Peter Maydell <peter.maydell@linaro.org> wrote:
>>>>> I think a mutex is fine, personally -- I just don't want
>>>>> to see fifteen hand-hacked mutexes in the target-* code.
>>>>> 
>>>> 
>>>> Which would seem to favour the helper function approach?
>>>> Or am I missing something?
>>> 
>>> You need at least some support from QEMU core -- consider
>>> what happens with this patch if the ldrex takes a data
>>> abort, for instance.
>>> 
>>> And if you need the "stop all other CPUs while I do this”
>> 
>> It looks like a corner case, but working this through - the ’simple’ put a mutex around the atomic instructions approach would indeed need to ensure that no other core was doing anything - that just happens to be true for qemu today (or - we would have to put a mutex around all writes); in order to ensure the case where a store exclusive could potential fail if a non-atomic instruction wrote (a different value) to the same address. This is currently guarantee by the implementation in Qemu - how useful it is I dont know, but if we break it, we run the risk that something will fail (at the least, we could not claim to have kept things the same).
>> 
>> This also has implications for the idea of adding TCG ops I think...
>> The ideal scenario is that we could ‘fallback’ on the same semantics that are there today - allowing specific target/host combinations to be optimised (and to improve their functionality). 
>> But that means, from within the TCG Op, we would need to have a mechanism, to cause other TCG’s to take an exit…. etc etc… In the end, I’m sure it’s possible, but it feels so awkward.
> 
> That's the nice thing about transactions - they guarantee that no other
> CPU accesses the same cache line at the same time. So you're safe
> against other vcpus even without blocking them manually.
> 
> For the non-transactional implementation we probably would need an "IPI
> others and halt them until we're done with the critical section"
> approach. But I really wouldn't concentrate on making things fast on old
> CPUs.
> 
> Also keep in mind that for the UP case we can always omit all the magic
> - we only need to detect when we move into an SMP case (linux-user clone
> or -smp on system).
> 
>> 
>> To re-cap where we are (for my own benefit if nobody else):
>> We have several propositions in terms of implementing Atomic instructions
>> 
>> 1/ We wrap the atomic instructions in a mutex using helper functions (this is the approach others have taken, it’s simple, but it is not clean, as stated above).
> 
> This is horrible. Imagine you have this split approach with a load
> exclusive and then store whereas the load starts mutex usage and the
> store stop is. At that point if the store creates a segfault you'll be
> left with a dangling mutex.

Sorry - probably a misunderstanding here - the plan for (1) was to simply wrap the individual instructions - where today they set/check the address/value pair used to determine if a store exclusive can succeed. Adding the mutex gives you protection against other atomic operations on the same memory but not against other non-atomic instructions - which is a bad thing.

Cheers
Mark.

> 
> This stuff really belongs into the TCG core.
> 
>> 
>> 1.5/ We add a mechanism to ensure that when the mutex is taken, all other cores are ‘stopped’.
>> 
>> 2/ We add some TCG ops to effectively do the same thing, but this would give us the benefit of being able to provide better implementations. This is attractive, but we would end up needing ops to cover at least exclusive load/store and atomic compare exchange. To me this looks less than elegant (being pulled close to the target, rather than being able to generalise), but it’s not clear how we would implement the operations as we would like, with a machine instruction, unless we did split them out along these lines. This approach also (probably) requires the 1.5 mechanism above.
> 
> I'm still in favor of just forcing the semantics of transactions onto
> this. If the host doesn't implement transactions, tough luck - do the
> "halt all others" IPI.
> 
>> 
>> 3/ We have discussed a ‘h/w’ approach to the problem. In this case, all atomic instructions are forced to take the slow path - and a additional flags are added to the memory API. We then deal with the issue closer to the memory where we can record who has a lock on a memory address. For this to work - we would also either
>> a) need to add a mprotect type approach to ensure no ‘non atomic’ writes occur - or
>> b) need to force all cores to mark the page with the exclusive memory as IO or similar to ensure that all write accesses followed the slow path.
>> 
>> 4/ There is an option to implement exclusive operations within the TCG using mprotect (and signal handlers). I have some concerns on this : would we need have to have support for each host O/S…. I also think we might end up the a lot of protected regions causing a lot of SIGSEGV’s because an errant guest doesn’t behave well - basically we will need to see the impact on performance - finally - this will be really painful to deal with for cases where the exclusive memory is held in what Qemu considers IO space !!!
>> 	In other words - putting the mprotect inside TCG looks to me like it’s mutually exclusive to supporting a memory-based scheme like (3).
> 
> Again, I don't think it's worth caring about legacy host systems too
> much. In a few years from now transactional memory will be commodity,
> just like KVM is today.
> 
> 
> Alex
> 
>> My personal preference is for 3b) it  is “safe” - its where the hardware is.
>> 3a is an optimization of that.
>> to me, (2) is an optimisation again. We are effectively saying, if you are able to do this directly, then you dont need to pass via the slow path. Otherwise, you always have the option of reverting to the slow path.
>> 
>> Frankly - 1 and 1.5 are hacks - they are not optimisations, they are just dirty hacks. However - their saving grace is that they are hacks that exist and “work”. I dislike patching the hack, but it did seem to offer the fastest solution to get around this problem - at least for now. I am no longer convinced.
>> 
>> 4/ is something I’d like other peoples views on too… Is it a better approach? What about the slow path?
>> 
>> I increasingly begin to feel that we should really approach this from the other end, and provide a ‘correct’ solution using the memory - then worry about making that faster…
>> 
>> Cheers
>> 
>> Mark.
>> 
>> 
>> 
>> 
>> 
>> 
>> 
>> 
>>> semantics linux-user currently uses then that definitely needs
>>> core code support. (Maybe linux-user is being over-zealous
>>> there; I haven't thought about it.)
>>> 
>>> -- PMM
>> 
>> 
Mark Burton Dec. 18, 2014, 2:20 p.m. UTC | #27
> On 18/12/2014 13:24, Alexander Graf wrote:
>> That's the nice thing about transactions - they guarantee that no other
>> CPU accesses the same cache line at the same time. So you're safe
>> against other vcpus even without blocking them manually.
>> 
>> For the non-transactional implementation we probably would need an "IPI
>> others and halt them until we're done with the critical section"
>> approach. But I really wouldn't concentrate on making things fast on old
>> CPUs.
> 
> The non-transactional implementation can use softmmu to trap access to
> the page from other VCPUs.  This makes it possible to implement (at the
> cost of speed) the same semantics on all hosts.
> 
> Paolo

I believe what you’re describing - using transactional memory, or using softmmu - amounts to either option 3 below or option 4.
Relying on it totally was option 4.

Seems to me, the problem with that option is that support for some hosts will be a pain, and covering everything will take some time :-(

Option 3 suggests that we build a ‘slow path’ mechanism first - make sure that works (as a backup), and then add optimisations for specific hosts/guests afterwards. To me that still seems preferable?
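
To show the sort of shape I mean for the slow path, here is a very rough reservation-table sketch - hypothetical names, plain pthreads, nothing tied to the actual memory API - in which ordinary stores, once forced onto the slow path, knock out other CPUs' reservations:

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

#define MAX_CPUS 8

/* One outstanding reservation per vCPU: the (guest physical) address it
 * holds exclusive access on, plus a valid flag. */
static pthread_mutex_t resv_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t resv_addr[MAX_CPUS];
static bool resv_valid[MAX_CPUS];

/* Slow-path exclusive load: record a reservation for this CPU. */
void excl_load_slow(int cpu, uint64_t paddr)
{
    pthread_mutex_lock(&resv_lock);
    resv_addr[cpu] = paddr;
    resv_valid[cpu] = true;
    pthread_mutex_unlock(&resv_lock);
}

/* Any ordinary store forced onto the slow path (because the page is
 * flagged) clears other CPUs' reservations on that address before the
 * write itself happens. */
void store_slow(int cpu, uint64_t paddr)
{
    pthread_mutex_lock(&resv_lock);
    for (int i = 0; i < MAX_CPUS; i++) {
        if (i != cpu && resv_valid[i] && resv_addr[i] == paddr) {
            resv_valid[i] = false;
        }
    }
    /* ... the actual write would happen here ... */
    pthread_mutex_unlock(&resv_lock);
}

/* Slow-path store-exclusive: succeeds only if this CPU's reservation
 * survived, i.e. nobody wrote to the address in between. */
bool store_excl_slow(int cpu, uint64_t paddr)
{
    bool ok;
    pthread_mutex_lock(&resv_lock);
    ok = resv_valid[cpu] && resv_addr[cpu] == paddr;
    if (ok) {
        /* ... the actual write would happen here ... */
    }
    resv_valid[cpu] = false;
    pthread_mutex_unlock(&resv_lock);
    return ok;
}

The point is only the shape: the expensive part is forcing every ordinary store onto this path while a reservation exists, which is where the IO-page / mprotect part of option 3 comes in.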

Cheers

Mark.




> On 18 Dec 2014, at 13:24, Alexander Graf <agraf@suse.de> wrote:
> 
> 
> 
> On 18.12.14 10:12, Mark Burton wrote:
>> 
>>> On 17 Dec 2014, at 17:39, Peter Maydell <peter.maydell@linaro.org> wrote:
>>> 
>>> On 17 December 2014 at 16:29, Mark Burton <mark.burton@greensocs.com> wrote:
>>>>> On 17 Dec 2014, at 17:27, Peter Maydell <peter.maydell@linaro.org> wrote:
>>>>> I think a mutex is fine, personally -- I just don't want
>>>>> to see fifteen hand-hacked mutexes in the target-* code.
>>>>> 
>>>> 
>>>> Which would seem to favour the helper function approach?
>>>> Or am I missing something?
>>> 
>>> You need at least some support from QEMU core -- consider
>>> what happens with this patch if the ldrex takes a data
>>> abort, for instance.
>>> 
>>> And if you need the "stop all other CPUs while I do this”
>> 
>> It looks like a corner case, but working this through - the ‘simple’ put-a-mutex-around-the-atomic-instructions approach would indeed need to ensure that no other core was doing anything - that just happens to be true for qemu today (or we would have to put a mutex around all writes) - in order to handle the case where a store exclusive could potentially fail if a non-atomic instruction wrote (a different value) to the same address. This is currently guaranteed by the implementation in Qemu - how useful it is I don’t know, but if we break it, we run the risk that something will fail (at the least, we could not claim to have kept things the same).
>> 
>> This also has implications for the idea of adding TCG ops I think...
>> The ideal scenario is that we could ‘fallback’ on the same semantics that are there today - allowing specific target/host combinations to be optimised (and to improve their functionality). 
>> But that means, from within the TCG Op, we would need to have a mechanism, to cause other TCG’s to take an exit…. etc etc… In the end, I’m sure it’s possible, but it feels so awkward.
> 
> That's the nice thing about transactions - they guarantee that no other
> CPU accesses the same cache line at the same time. So you're safe
> against other vcpus even without blocking them manually.
> 
> For the non-transactional implementation we probably would need an "IPI
> others and halt them until we're done with the critical section"
> approach. But I really wouldn't concentrate on making things fast on old
> CPUs.
> 
> Also keep in mind that for the UP case we can always omit all the magic
> - we only need to detect when we move into an SMP case (linux-user clone
> or -smp on system).
> 
>> 
>> To re-cap where we are (for my own benefit if nobody else):
>> We have several propositions in terms of implementing Atomic instructions
>> 
>> 1/ We wrap the atomic instructions in a mutex using helper functions (this is the approach others have taken, it’s simple, but it is not clean, as stated above).
> 
> This is horrible. Imagine you have this split approach with a load
> exclusive and then a store, where the load starts mutex usage and the
> store stops it. At that point, if the store creates a segfault you'll
> be left with a dangling mutex.
> 
> This stuff really belongs into the TCG core.
> 
>> 
>> 1.5/ We add a mechanism to ensure that when the mutex is taken, all other cores are ‘stopped’.
>> 
>> 2/ We add some TCG ops to effectively do the same thing, but this would give us the benefit of being able to provide better implementations. This is attractive, but we would end up needing ops to cover at least exclusive load/store and atomic compare exchange. To me this looks less than elegant (being pulled close to the target, rather than being able to generalise), but it’s not clear how we would implement the operations as we would like, with a machine instruction, unless we did split them out along these lines. This approach also (probably) requires the 1.5 mechanism above.
> 
> I'm still in favor of just forcing the semantics of transactions onto
> this. If the host doesn't implement transactions, tough luck - do the
> "halt all others" IPI.
> 
>> 
>> 3/ We have discussed a ‘h/w’ approach to the problem. In this case, all atomic instructions are forced to take the slow path - and additional flags are added to the memory API. We then deal with the issue closer to the memory, where we can record who has a lock on a memory address. For this to work - we would also either
>> a) need to add an mprotect-type approach to ensure no ‘non atomic’ writes occur - or
>> b) need to force all cores to mark the page with the exclusive memory as IO or similar, to ensure that all write accesses follow the slow path.
>> 
>> 4/ There is an option to implement exclusive operations within the TCG using mprotect (and signal handlers). I have some concerns about this: would we have to have support for each host O/S…? I also think we might end up with a lot of protected regions causing a lot of SIGSEGV’s because an errant guest doesn’t behave well - basically we will need to see the impact on performance - and finally, this will be really painful to deal with for cases where the exclusive memory is held in what Qemu considers IO space !!!
>> 	In other words - putting the mprotect inside TCG looks to me like it’s mutually exclusive to supporting a memory-based scheme like (3).
> 
> Again, I don't think it's worth caring about legacy host systems too
> much. In a few years from now transactional memory will be commodity,
> just like KVM is today.
> 
> 
> Alex
> 
>> My personal preference is for 3b): it is “safe” - it’s where the hardware is.
>> 3a is an optimisation of that.
>> To me, (2) is an optimisation again. We are effectively saying: if you are able to do this directly, then you don’t need to pass via the slow path. Otherwise, you always have the option of reverting to the slow path.
>> 
>> Frankly - 1 and 1.5 are hacks - they are not optimisations, they are just dirty hacks. However - their saving grace is that they are hacks that exist and “work”. I dislike patching the hack, but it did seem to offer the fastest solution to get around this problem - at least for now. I am no longer convinced.
>> 
>> 4/ is something I’d like other people’s views on too… Is it a better approach? What about the slow path?
>> 
>> I increasingly begin to feel that we should really approach this from the other end, and provide a ‘correct’ solution using the memory - then worry about making that faster…
>> 
>> Cheers
>> 
>> Mark.
>> 
>> 
>> 
>> 
>> 
>> 
>> 
>> 
>>> semantics linux-user currently uses then that definitely needs
>>> core code support. (Maybe linux-user is being over-zealous
>>> there; I haven't thought about it.)
>>> 
>>> -- PMM
>> 
>> 
Alexander Graf Dec. 18, 2014, 2:44 p.m. UTC | #28
On 18.12.14 15:20, Mark Burton wrote:
> 
> 
>> On 18/12/2014 13:24, Alexander Graf wrote:
>>> That's the nice thing about transactions - they guarantee that no other
>>> CPU accesses the same cache line at the same time. So you're safe
>>> against other vcpus even without blocking them manually.
>>>
>>> For the non-transactional implementation we probably would need an "IPI
>>> others and halt them until we're done with the critical section"
>>> approach. But I really wouldn't concentrate on making things fast on old
>>> CPUs.
>>
>> The non-transactional implementation can use softmmu to trap access to
>> the page from other VCPUs.  This makes it possible to implement (at the
>> cost of speed) the same semantics on all hosts.
>>
>> Paolo
> 
> I believe what you’re describing - using transactional memory, or using softmmu - amounts to either option 3 below or option 4.
> Relying on it totally was option 4.
> 
> Seems to me, the problem with that option is that support for some hosts will be a pain, and covering everything will take some time :-(
> 
> Option 3 suggests that we build a ‘slow path’ mechanism first - make sure that works (as a backup), and then add optimisations for specific hosts/guests afterwards. To me that still seems preferable?

Yes, the only thing I'm in favor of here is to align the semantics with
what transactional memory would give you. That way moving to the fast
implementation will be easy and people would actually want to use
multi-threaded TCG ;)


Alex
Mark Burton Dec. 18, 2014, 2:51 p.m. UTC | #29
> On 18 Dec 2014, at 15:44, Alexander Graf <agraf@suse.de> wrote:
> 
> 
> 
> On 18.12.14 15:20, Mark Burton wrote:
>> 
>> 
>>> On 18/12/2014 13:24, Alexander Graf wrote:
>>>> That's the nice thing about transactions - they guarantee that no other
>>>> CPU accesses the same cache line at the same time. So you're safe
>>>> against other vcpus even without blocking them manually.
>>>> 
>>>> For the non-transactional implementation we probably would need an "IPI
>>>> others and halt them until we're done with the critical section"
>>>> approach. But I really wouldn't concentrate on making things fast on old
>>>> CPUs.
>>> 
>>> The non-transactional implementation can use softmmu to trap access to
>>> the page from other VCPUs.  This makes it possible to implement (at the
>>> cost of speed) the same semantics on all hosts.
>>> 
>>> Paolo
>> 
>> I believe what you’re describing - using transactional memory, or using softmmu - amounts to either option 3 below or option 4.
>> Relying on it totally was option 4.
>> 
>> Seems to me, the problem with that option is that support for some hosts will be a pain, and covering everything will take some time :-(
>> 
>> Option 3 suggests that we build a ‘slow path’ mechanism first - make sure that works (as a backup), and then add optimisations for specific hosts/guests afterwards. To me that still seems preferable?
> 
> Yes, the only thing I'm in favor of here is to align the semantics with
> what transactional memory would give you. That way moving to the fast
> implementation will be easy and people would actually want to use
> multi-threaded TCG ;)

In other words — the back-end (slow path) memory interface should look ‘transactional’…?

Cheers

Mark.




> 
> 
> Alex


Alexander Graf Dec. 18, 2014, 3:05 p.m. UTC | #30
On 18.12.14 15:51, Mark Burton wrote:
> 
>> On 18 Dec 2014, at 15:44, Alexander Graf <agraf@suse.de> wrote:
>>
>>
>>
>> On 18.12.14 15:20, Mark Burton wrote:
>>>
>>>
>>>> On 18/12/2014 13:24, Alexander Graf wrote:
>>>>> That's the nice thing about transactions - they guarantee that no other
>>>>> CPU accesses the same cache line at the same time. So you're safe
>>>>> against other vcpus even without blocking them manually.
>>>>>
>>>>> For the non-transactional implementation we probably would need an "IPI
>>>>> others and halt them until we're done with the critical section"
>>>>> approach. But I really wouldn't concentrate on making things fast on old
>>>>> CPUs.
>>>>
>>>> The non-transactional implementation can use softmmu to trap access to
>>>> the page from other VCPUs.  This makes it possible to implement (at the
>>>> cost of speed) the same semantics on all hosts.
>>>>
>>>> Paolo
>>>
>>> I believe what you’re describing - using transactional memory, or using softmmu - amounts to either option 3 below or option 4.
>>> Relying on it totally was option 4.
>>>
>>> Seems to me, the problem with that option is that support for some hosts will be a pain, and covering everything will take some time :-(
>>>
>>> Option 3 suggests that we build a ‘slow path’ mechanism first - make sure that works (as a backup), and then add optimisations for specific hosts/guests afterwards. To me that still seems preferable?
>>
>> Yes, the only thing I'm in favor of here is to align the semantics with
>> what transactional memory would give you. That way moving to the fast
>> implementation will be easy and people would actually want to use
>> multi-threaded TCG ;)
> 
> In other words — the back-end (slow path) memory interface should look ‘transactional’…?

Yeah, the semantics should be tied to what TM would give you. We can
always be more safe than TM in our fallback implementation, but I
wouldn't want to see semantic optimizations tied to the MMIO
implementation put in.

This is mostly theory though, try to write the code and see where things
fall apart, then we'll be in a much better position to rationalize on
where to do things differently.
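
For concreteness, here is a rough host-side sketch of what "strex with TM semantics plus a lock fallback" could look like - Intel RTM intrinsics (compile with -mrtm), a made-up helper name, and none of the TCG plumbing; purely illustrative, not a proposal for the actual implementation:

#include <immintrin.h>
#include <stdbool.h>
#include <stdint.h>

/* Single global fallback lock, kept as a plain word so the transaction
 * can read it and abort if someone is on the slow path (lock elision). */
static volatile int fallback_lock;

/* Emulated store-exclusive with TM semantics: check-and-store as one
 * hardware transaction, a global lock as the fallback path. */
bool emu_strex_tm(uint32_t *addr, uint32_t expected, uint32_t newval)
{
    unsigned status = _xbegin();
    if (status == _XBEGIN_STARTED) {
        if (fallback_lock) {
            _xabort(0xfe);      /* stay coherent with slow-path holders */
        }
        if (*addr != expected) {
            _xabort(0xff);      /* reservation lost: fail the strex */
        }
        *addr = newval;
        _xend();
        return true;
    }

    /* Fallback: same semantics behind the lock.  In the full scheme the
     * other vCPUs' ordinary stores also have to be made visible to this
     * path - which is the option 3 part of the discussion. */
    while (__atomic_exchange_n(&fallback_lock, 1, __ATOMIC_ACQUIRE)) {
        /* spin */
    }
    bool ok = (*addr == expected);
    if (ok) {
        *addr = newval;
    }
    __atomic_store_n(&fallback_lock, 0, __ATOMIC_RELEASE);
    return ok;
}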


Alex
Mark Burton Dec. 18, 2014, 3:09 p.m. UTC | #31
>> 
>> In other words — the back-end (slow path) memory interface should look ‘transactional’…?
> 
> Yeah, the semantics should be tied to what TM would give you. We can
> always be more safe than TM in our fallback implementation, but I
> wouldn't want to see semantic optimizations tied to the MMIO
> implementation put in.
> 
> This is mostly theory though, try to write the code and see where things
> fall apart, then we'll be in a much better position to rationalize on
> where to do things differently.
> 
> 

Absolutely.
Seems like this is a good way forward… thanks, all, for your input.

Cheers
Mark.




> Alex


Paolo Bonzini Dec. 18, 2014, 4:55 p.m. UTC | #32
On 18/12/2014 16:05, Alexander Graf wrote:
> Yeah, the semantics should be tied to what TM would give you. We can
> always be more safe than TM in our fallback implementation, but I
> wouldn't want to see semantic optimizations tied to the MMIO
> implementation put in.
> 
> This is mostly theory though, try to write the code and see where things
> fall apart, then we'll be in a much better position to rationalize on
> where to do things differently.

Yeah, this is why I think LL/SC ops for TCG are more interesting and
important than CMPXCHG.  Also, x86 doesn't have just CMPXCHG; it also
has XADD, which you'd have to implement anyway as an LL/SC or CMPXCHG
loop, so CMPXCHG doesn't get you all the way.
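
For illustration, this is roughly the shape a helper ends up with if compare-and-swap is the only primitive available - a plain C sketch using GCC atomic builtins, nothing TCG-specific:

#include <stdbool.h>
#include <stdint.h>

/* Fetch-and-add built from compare-and-swap: the loop retries until no
 * other CPU modified the word between the load and the CAS.  This is
 * what an XADD-style guest instruction turns into if CMPXCHG is the
 * only primitive the core offers. */
uint32_t xadd_via_cmpxchg(uint32_t *addr, uint32_t inc)
{
    uint32_t old = __atomic_load_n(addr, __ATOMIC_RELAXED);
    while (!__atomic_compare_exchange_n(addr, &old, old + inc,
                                        true,          /* weak CAS */
                                        __ATOMIC_SEQ_CST,
                                        __ATOMIC_RELAXED)) {
        /* 'old' now holds the value that was actually there; retry. */
    }
    return old;
}

With LL/SC ops in the core, a guest ldrex/add/strex sequence maps across directly (or falls back to exactly this loop) without the target code having to re-derive it around a CAS.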

Paolo
diff mbox

Patch

diff --git a/target-arm/cpu.c b/target-arm/cpu.c
index 5ce7350..a55017d 100644
--- a/target-arm/cpu.c
+++ b/target-arm/cpu.c
@@ -31,6 +31,19 @@ 
 #include "sysemu/kvm.h"
 #include "kvm_arm.h"
 
+/* Protect cpu_exclusive_* variable .*/
+QemuMutex cpu_exclusive_lock;
+
+inline void arm_exclusive_lock(void)
+{
+    qemu_mutex_lock(&cpu_exclusive_lock);
+}
+
+inline void arm_exclusive_unlock(void)
+{
+    qemu_mutex_unlock(&cpu_exclusive_lock);
+}
+
 static void arm_cpu_set_pc(CPUState *cs, vaddr value)
 {
     ARMCPU *cpu = ARM_CPU(cs);
@@ -365,6 +378,7 @@  static void arm_cpu_initfn(Object *obj)
         cpu->psci_version = 2; /* TCG implements PSCI 0.2 */
         if (!inited) {
             inited = true;
+            qemu_mutex_init(&cpu_exclusive_lock);
             arm_translate_init();
         }
     }
@@ -404,6 +418,7 @@  static void arm_cpu_finalizefn(Object *obj)
 {
     ARMCPU *cpu = ARM_CPU(obj);
     g_hash_table_destroy(cpu->cp_regs);
+    qemu_mutex_destroy(&cpu_exclusive_lock);
 }
 
 static void arm_cpu_realizefn(DeviceState *dev, Error **errp)
diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index 7f80090..f01c9ef 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -1539,4 +1539,7 @@  enum {
     QEMU_PSCI_CONDUIT_HVC = 2,
 };
 
+void arm_exclusive_lock(void);
+void arm_exclusive_unlock(void);
+
 #endif
diff --git a/target-arm/helper.h b/target-arm/helper.h
index dec3728..ce07711 100644
--- a/target-arm/helper.h
+++ b/target-arm/helper.h
@@ -529,6 +529,9 @@  DEF_HELPER_2(dc_zva, void, env, i64)
 DEF_HELPER_FLAGS_2(neon_pmull_64_lo, TCG_CALL_NO_RWG_SE, i64, i64, i64)
 DEF_HELPER_FLAGS_2(neon_pmull_64_hi, TCG_CALL_NO_RWG_SE, i64, i64, i64)
 
+DEF_HELPER_0(exclusive_lock, void)
+DEF_HELPER_0(exclusive_unlock, void)
+
 #ifdef TARGET_AARCH64
 #include "helper-a64.h"
 #endif
diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
index 62012c3..916772f 100644
--- a/target-arm/op_helper.c
+++ b/target-arm/op_helper.c
@@ -33,6 +33,16 @@  static void raise_exception(CPUARMState *env, int tt)
     cpu_loop_exit(cs);
 }
 
+void HELPER(exclusive_lock)(void)
+{
+    arm_exclusive_lock();
+}
+
+void HELPER(exclusive_unlock)(void)
+{
+    arm_exclusive_unlock();
+}
+
 uint32_t HELPER(neon_tbl)(CPUARMState *env, uint32_t ireg, uint32_t def,
                           uint32_t rn, uint32_t maxindex)
 {
diff --git a/target-arm/translate.c b/target-arm/translate.c
index af51568..4a82ad5 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -7377,6 +7377,7 @@  static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
         abort();
     }
 
+    gen_helper_exclusive_lock();
     if (size == 3) {
         TCGv_i32 tmp2 = tcg_temp_new_i32();
         TCGv_i32 tmp3 = tcg_temp_new_i32();
@@ -7392,11 +7393,14 @@  static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
 
     store_reg(s, rt, tmp);
     tcg_gen_extu_i32_i64(cpu_exclusive_addr, addr);
+    gen_helper_exclusive_unlock();
 }
 
 static void gen_clrex(DisasContext *s)
 {
+    gen_helper_exclusive_lock();
     tcg_gen_movi_i64(cpu_exclusive_addr, -1);
+    gen_helper_exclusive_unlock();
 }
 
 #ifdef CONFIG_USER_ONLY
@@ -7427,6 +7431,7 @@  static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
     done_label = gen_new_label();
     extaddr = tcg_temp_new_i64();
     tcg_gen_extu_i32_i64(extaddr, addr);
+    gen_helper_exclusive_lock();
     tcg_gen_brcond_i64(TCG_COND_NE, extaddr, cpu_exclusive_addr, fail_label);
     tcg_temp_free_i64(extaddr);
 
@@ -7491,6 +7496,7 @@  static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
     tcg_gen_movi_i32(cpu_R[rd], 1);
     gen_set_label(done_label);
     tcg_gen_movi_i64(cpu_exclusive_addr, -1);
+    gen_helper_exclusive_unlock();
 }
 #endif