diff mbox

[RFC,01/10] target-arm: protect cpu_exclusive_*.

Message ID 1421428797-23697-2-git-send-email-fred.konrad@greensocs.com
State New
Headers show

Commit Message

fred.konrad@greensocs.com Jan. 16, 2015, 5:19 p.m. UTC
From: KONRAD Frederic <fred.konrad@greensocs.com>

This adds a lock to avoid multiple exclusive access at the same time in case of
TCG multithread.

Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>

V1 -> V2:
  Removed qemu_mutex_destroy().
---
 target-arm/cpu.c       | 14 ++++++++++++++
 target-arm/cpu.h       |  3 +++
 target-arm/helper.h    |  3 +++
 target-arm/op_helper.c | 10 ++++++++++
 target-arm/translate.c |  6 ++++++
 5 files changed, 36 insertions(+)

Comments

Alex Bennée Jan. 27, 2015, 2:36 p.m. UTC | #1
fred.konrad@greensocs.com writes:

> From: KONRAD Frederic <fred.konrad@greensocs.com>
>
> This adds a lock to avoid multiple exclusive access at the same time in case of
> TCG multithread.
>
> Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
>
> V1 -> V2:
>   Removed qemu_mutex_destroy().
> ---
>  target-arm/cpu.c       | 14 ++++++++++++++
>  target-arm/cpu.h       |  3 +++
>  target-arm/helper.h    |  3 +++
>  target-arm/op_helper.c | 10 ++++++++++
>  target-arm/translate.c |  6 ++++++
>  5 files changed, 36 insertions(+)
>
<snip>
> diff --git a/target-arm/translate.c b/target-arm/translate.c
> index bdfcdf1..513d151 100644
> --- a/target-arm/translate.c
> +++ b/target-arm/translate.c
> @@ -7381,6 +7381,7 @@ static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
>          abort();
>      }
>  
> +    gen_helper_exclusive_lock();
>      if (size == 3) {
>          TCGv_i32 tmp2 = tcg_temp_new_i32();
>          TCGv_i32 tmp3 = tcg_temp_new_i32();
> @@ -7396,11 +7397,14 @@ static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
>  
>      store_reg(s, rt, tmp);
>      tcg_gen_extu_i32_i64(cpu_exclusive_addr, addr);
> +    gen_helper_exclusive_unlock();
>  }
>  
>  static void gen_clrex(DisasContext *s)
>  {
> +    gen_helper_exclusive_lock();
>      tcg_gen_movi_i64(cpu_exclusive_addr, -1);
> +    gen_helper_exclusive_unlock();
>  }
>  
>  #ifdef CONFIG_USER_ONLY
> @@ -7431,6 +7435,7 @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
>      done_label = gen_new_label();
>      extaddr = tcg_temp_new_i64();
>      tcg_gen_extu_i32_i64(extaddr, addr);
> +    gen_helper_exclusive_lock();
>      tcg_gen_brcond_i64(TCG_COND_NE, extaddr, cpu_exclusive_addr, fail_label);
>      tcg_temp_free_i64(extaddr);
>  
> @@ -7495,6 +7500,7 @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
>      tcg_gen_movi_i32(cpu_R[rd], 1);
>      gen_set_label(done_label);
>      tcg_gen_movi_i64(cpu_exclusive_addr, -1);
> +    gen_helper_exclusive_unlock();
>  }
>  #endif

As the aarch64 code shares a lot of helper code with arm it is probably
worth adding the locks to both translate and translate-a64.c.
Peter Maydell Jan. 29, 2015, 3:17 p.m. UTC | #2
On 16 January 2015 at 17:19,  <fred.konrad@greensocs.com> wrote:
> From: KONRAD Frederic <fred.konrad@greensocs.com>
>
> This adds a lock to avoid multiple exclusive access at the same time in case of
> TCG multithread.
>
> Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>

All the same comments I had on this patch earlier still apply:

 * I think adding mutex handling code to all the target-*
   frontends rather than providing facilities in common
   code for them to use is the wrong approach
 * You will fail to unlock the mutex if the ldrex or strex
   takes a data abort
 * This is making no attempt to learn from or unify with
   the existing attempts at handling exclusives in linux-user.
   When we've done this work we should have a single
   mechanism for handling exclusives in a multithreaded
   host environment which is used by both softmmu and useronly
   configs

thanks
-- PMM
fred.konrad@greensocs.com Feb. 2, 2015, 8:31 a.m. UTC | #3
On 29/01/2015 16:17, Peter Maydell wrote:
> On 16 January 2015 at 17:19,  <fred.konrad@greensocs.com> wrote:
>> From: KONRAD Frederic <fred.konrad@greensocs.com>
>>
>> This adds a lock to avoid multiple exclusive access at the same time in case of
>> TCG multithread.
>>
>> Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
Hi Peter,
> All the same comments I had on this patch earlier still apply:
>
>   * I think adding mutex handling code to all the target-*
>     frontends rather than providing facilities in common
>     code for them to use is the wrong approach
Ok we will fix that.
>   * You will fail to unlock the mutex if the ldrex or strex
>     takes a data abort
>   * This is making no attempt to learn from or unify with
>     the existing attempts at handling exclusives in linux-user.
>     When we've done this work we should have a single
>     mechanism for handling exclusives in a multithreaded
>     host environment which is used by both softmmu and useronly
>     configs
Can you point me to the existing attempts in linux-user?

Thanks,
Fred
>
> thanks
> -- PMM
Peter Maydell Feb. 2, 2015, 8:36 a.m. UTC | #4
On 2 February 2015 at 08:31, Frederic Konrad <fred.konrad@greensocs.com> wrote:
>>   * This is making no attempt to learn from or unify with
>>     the existing attempts at handling exclusives in linux-user.
>>     When we've done this work we should have a single
>>     mechanism for handling exclusives in a multithreaded
>>     host environment which is used by both softmmu and useronly
>>     configs
>
> Can you point me to the existing attempts in linux-user?

It's in two parts -- the #ifdefed code in target-arm/translate.c
(gen_store_exclusive), and the main loop in linux-user/main.c which
handles EXCP_STREX to call do_strex(). There's a pair of utility
routines start_exclusive() and end_exclusive() which suspend all
other CPUs (which in linux-user are separate threads) so the
strex code can run, and then resume them all afterwards.

Since "handle strex between multiple threads" is exactly
what you're trying to do, when we're done we should have
a single mechanism for both softmmu and linux-user. Whether
that means "replace linux-user's current code with better
mechanism" or not I don't currently know.

-- PMM
fred.konrad@greensocs.com Feb. 26, 2015, 6:09 p.m. UTC | #5
On 29/01/2015 16:17, Peter Maydell wrote:
> On 16 January 2015 at 17:19,  <fred.konrad@greensocs.com> wrote:
>> From: KONRAD Frederic <fred.konrad@greensocs.com>
>>
>> This adds a lock to avoid multiple exclusive access at the same time in case of
>> TCG multithread.
>>
>> Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
> All the same comments I had on this patch earlier still apply:
>
>   * I think adding mutex handling code to all the target-*
>     frontends rather than providing facilities in common
>     code for them to use is the wrong approach
>   * You will fail to unlock the mutex if the ldrex or strex
>     takes a data abort
>   * This is making no attempt to learn from or unify with
>     the existing attempts at handling exclusives in linux-user.
>     When we've done this work we should have a single
>     mechanism for handling exclusives in a multithreaded
>     host environment which is used by both softmmu and useronly
>     configs
>
> thanks
> -- PMM

Hi,

We see some performance improvement on this SMP, but still have random 
deadlock
in the guest in this function:

static inline void arch_spin_lock(arch_spinlock_t *lock)
{
     unsigned long tmp;
     u32 newval;
     arch_spinlock_t lockval;

     prefetchw(&lock->slock);
     __asm__ __volatile__(
"1:    ldrex    %0, [%3]\n"
"    add    %1, %0, %4\n"
"    strex    %2, %1, [%3]\n"
"    teq    %2, #0\n"
"    bne    1b"
     : "=&r" (lockval), "=&r" (newval), "=&r" (tmp)
     : "r" (&lock->slock), "I" (1 << TICKET_SHIFT)
     : "cc");

     while (lockval.tickets.next != lockval.tickets.owner) {
         wfe();
         lockval.tickets.owner = ACCESS_ONCE(lock->tickets.owner);
     }

     smp_mb();
}

In the last while loop. That's why we thought immediately that it is 
caused by our
implementation of atomic instructions.

We failed to unlock the mutex a lot of times during the boot, not because 
of data
abort but I think because we can be interrupted during a strex (eg: 
there are two
branches)?

We decided to implement the whole atomic instruction inside an helper 
but is that
possible to get the data with eg: cpu_physical_memory_rw instead of the 
normal
generated code?

One other thing which looks suspicious it seems there is one pair of
exclusive_addr/exclusive_val per CPU is that normal?

Thanks,
Fred
Alexander Graf Feb. 26, 2015, 8:36 p.m. UTC | #6
> Am 26.02.2015 um 19:09 schrieb Frederic Konrad <fred.konrad@greensocs.com>:
> 
>> On 29/01/2015 16:17, Peter Maydell wrote:
>>> On 16 January 2015 at 17:19,  <fred.konrad@greensocs.com> wrote:
>>> From: KONRAD Frederic <fred.konrad@greensocs.com>
>>> 
>>> This adds a lock to avoid multiple exclusive access at the same time in case of
>>> TCG multithread.
>>> 
>>> Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
>> All the same comments I had on this patch earlier still apply:
>> 
>>  * I think adding mutex handling code to all the target-*
>>    frontends rather than providing facilities in common
>>    code for them to use is the wrong approach
>>  * You will fail to unlock the mutex if the ldrex or strex
>>    takes a data abort
>>  * This is making no attempt to learn from or unify with
>>    the existing attempts at handling exclusives in linux-user.
>>    When we've done this work we should have a single
>>    mechanism for handling exclusives in a multithreaded
>>    host environment which is used by both softmmu and useronly
>>    configs
>> 
>> thanks
>> -- PMM
> 
> Hi,
> 
> We see some performance improvement on this SMP, but still have random deadlock
> in the guest in this function:
> 
> static inline void arch_spin_lock(arch_spinlock_t *lock)
> {
>    unsigned long tmp;
>    u32 newval;
>    arch_spinlock_t lockval;
> 
>    prefetchw(&lock->slock);
>    __asm__ __volatile__(
> "1:    ldrex    %0, [%3]\n"
> "    add    %1, %0, %4\n"
> "    strex    %2, %1, [%3]\n"
> "    teq    %2, #0\n"
> "    bne    1b"
>    : "=&r" (lockval), "=&r" (newval), "=&r" (tmp)
>    : "r" (&lock->slock), "I" (1 << TICKET_SHIFT)
>    : "cc");
> 
>    while (lockval.tickets.next != lockval.tickets.owner) {
>        wfe();
>        lockval.tickets.owner = ACCESS_ONCE(lock->tickets.owner);
>    }
> 
>    smp_mb();
> }
> 
> In the last while loop. That's why we thought immediately that it is caused by our
> implementation of atomic instructions.
> 
> We failed to unlock the mutex a lot of times during the boot, not because of data
> abort but I think because we can be interrupted during a strex (eg: there are two
> branches)?
> 
> We decided to implement the whole atomic instruction inside an helper but is that
> possible to get the data with eg: cpu_physical_memory_rw instead of the normal
> generated code?

Yes, but make sure you store pc in env before you enter the helper.

Alex

> 
> One other thing which looks suspicious it seems there is one pair of
> exclusive_addr/exclusive_val per CPU is that normal?
> 
> Thanks,
> Fred
Peter Maydell Feb. 26, 2015, 10:56 p.m. UTC | #7
On 27 February 2015 at 03:09, Frederic Konrad <fred.konrad@greensocs.com> wrote:
> On 29/01/2015 16:17, Peter Maydell wrote:
>>
>> On 16 January 2015 at 17:19,  <fred.konrad@greensocs.com> wrote:
>>>
>>> From: KONRAD Frederic <fred.konrad@greensocs.com>
>>>
>>> This adds a lock to avoid multiple exclusive access at the same time in
>>> case of
>>> TCG multithread.

>> All the same comments I had on this patch earlier still apply:
>>
>>   * I think adding mutex handling code to all the target-*
>>     frontends rather than providing facilities in common
>>     code for them to use is the wrong approach
>>   * You will fail to unlock the mutex if the ldrex or strex
>>     takes a data abort
>>   * This is making no attempt to learn from or unify with
>>     the existing attempts at handling exclusives in linux-user.
>>     When we've done this work we should have a single
>>     mechanism for handling exclusives in a multithreaded
>>     host environment which is used by both softmmu and useronly
>>     configs

> We decided to implement the whole atomic instruction inside an helper

...which is a different approach which still isn't really
addressing any of my remarks in the list above...

> but is
> that
> possible to get the data with eg: cpu_physical_memory_rw instead of the
> normal
> generated code?

cpu_physical_memory_rw would bypass the TLB and so be much slower.
Make sure you use the functions which go via the TLB if you do
this in a helper (and remember that they will longjmp out on a
tlb miss!)

> One other thing which looks suspicious it seems there is one pair of
> exclusive_addr/exclusive_val per CPU is that normal?

Pretty sure we've already discussed how the current ldrex/strex
implementation is not architecturally correct. I think this is
another of those areas.

In general I'd be much happier seeing a proper sketch of your
design, what data structures etc you intend to share between
CPUs and which are per-CPU, what generic mechanisms you plan
to provide to allow targets to implement atomic instructions, etc.
It's quite hard to see the whole picture at the moment.

-- PMM
Mark Burton Feb. 27, 2015, 7:54 a.m. UTC | #8
> On 26 Feb 2015, at 23:56, Peter Maydell <peter.maydell@linaro.org> wrote:
> 
> On 27 February 2015 at 03:09, Frederic Konrad <fred.konrad@greensocs.com> wrote:
>> On 29/01/2015 16:17, Peter Maydell wrote:
>>> 
>>> On 16 January 2015 at 17:19,  <fred.konrad@greensocs.com> wrote:
>>>> 
>>>> From: KONRAD Frederic <fred.konrad@greensocs.com>
>>>> 
>>>> This adds a lock to avoid multiple exclusive access at the same time in
>>>> case of
>>>> TCG multithread.
> 
>>> All the same comments I had on this patch earlier still apply:
>>> 
>>>  * I think adding mutex handling code to all the target-*
>>>    frontends rather than providing facilities in common
>>>    code for them to use is the wrong approach
>>>  * You will fail to unlock the mutex if the ldrex or strex
>>>    takes a data abort
>>>  * This is making no attempt to learn from or unify with
>>>    the existing attempts at handling exclusives in linux-user.
>>>    When we've done this work we should have a single
>>>    mechanism for handling exclusives in a multithreaded
>>>    host environment which is used by both softmmu and useronly
>>>    configs
> 
>> We decided to implement the whole atomic instruction inside an helper
> 
> ...which is a different approach which still isn't really
> addressing any of my remarks in the list above…

We agree on the above point. For atomic instructions, I think we discussed at length what to do. However we chose to ‘ignore’ the problem for now and to ‘hack’ something just to get it working initially. Basically at this stage we want something mostly working so we can work on individual bits of the code and address them in more careful detail. Our issue with Atomic-ness is that the ‘hack’ we put in place seems to allow a race condition, and we can’t see why :-(
But - overall, the plan is absolutely to provide a better implementation….

> 
>> but is
>> that
>> possible to get the data with eg: cpu_physical_memory_rw instead of the
>> normal
>> generated code?
> 
> cpu_physical_memory_rw would bypass the TLB and so be much slower.
> Make sure you use the functions which go via the TLB if you do
> this in a helper (and remember that they will longjmp out on a
> tlb miss!)

At this point speed isn’t our main concern - it’s simplicity of implementation - we want it to work, then we can worry about a better implementation (which likely should not go this path at all - as discussed above).
Given that - isn’t it reasonable to pass through cpu_physical_memory_rw - and hence not have to worry about the long jump ? Or am I missing something?

> 
>> One other thing which looks suspicious it seems there is one pair of
>> exclusive_addr/exclusive_val per CPU is that normal?
> 
> Pretty sure we've already discussed how the current ldrex/strex
> implementation is not architecturally correct. I think this is
> another of those areas.

We have indeed discussed this - but this is a surprise. What  we’ve found is that the ‘globals’ (which, when we discussed it, we assumed were indeed global) seem not to be global at all. 
if 2 CPU’s both wanted to write the same strex, and both wrote the address and current values to these variables, right now I believe it would be theoretically possible that both CPU’s would end up successfully completing the strex. If the TB exited on the second branch, I suspect you could have a race condition which would lead to a failure? However I guess this is unlikely.


> 
> In general I'd be much happier seeing a proper sketch of your
> design, what data structures etc you intend to share between
> CPUs and which are per-CPU, what generic mechanisms you plan
> to provide to allow targets to implement atomic instructions, etc.
> It's quite hard to see the whole picture at the moment.


Agreed - at this point - as I say - we are trying to get something to be ‘just' stable enough that we can then use it to move forward from.
But I totally agree - we should be clearer about the overall picture, (once we can see the wood for the trees)

Cheers

Mark.


> 
> -- PMM


	 +44 (0)20 7100 3485 x 210
 +33 (0)5 33 52 01 77x 210

	+33 (0)603762104
	mark.burton
Peter Maydell March 2, 2015, 12:27 p.m. UTC | #9
On 27 February 2015 at 16:54, Mark Burton <mark.burton@greensocs.com> wrote:
>
>> On 26 Feb 2015, at 23:56, Peter Maydell <peter.maydell@linaro.org> wrote:
>> cpu_physical_memory_rw would bypass the TLB and so be much slower.
>> Make sure you use the functions which go via the TLB if you do
>> this in a helper (and remember that they will longjmp out on a
>> tlb miss!)
>
> At this point speed isn’t our main concern - it’s simplicity of implementation - we want it to work, then we can worry about a better implementation (which likely should not go this path at all - as discussed above).
> Given that - isn’t it reasonable to pass through cpu_physical_memory_rw - and hence not have to worry about the long jump ? Or am I missing something?

If you use cpu_physical_memory_rw you need to do the
virt-to-phys translation by hand (preferably via the TLB).
That might be something you needed to do anyway if we want
to have architecturally correct monitors that work on
physaddrs rather than vaddrs, but if not then the two
step process is a bit awkward.

>> Pretty sure we've already discussed how the current ldrex/strex
>> implementation is not architecturally correct. I think this is
>> another of those areas.
>
> We have indeed discussed this - but this is a surprise.

You're right that I didn't specifically realise this exact
part of our incorrectness earlier.

-- PMM
Mark Burton March 3, 2015, 3:29 p.m. UTC | #10
Hi Peter, thanks for the feedback

So - what we have discovered - using the slow path as discussed has a significant performance hit (no surprise), likewise the user-mode exit mechanism. However we are investigating that in parallel...

However, for now, we have left the TCG doing the majority of the work. 
(This was supposed to be the ‘quick an easy’ approach - we have already discussed better approaches to atomic instructions at length - but they are even more invasive!)

we have tried several schemes, and not found anything totally satisfactory:  we have a gremlin. There seems to be a race condition somewhere which effectively means that the guest ‘ticket’ mechanism inside the guest kernel goes off by one, and therefore the guest kernel stalls as it never gets the ticket it’s looking for…. (it’s off by one in the ‘one too few’ sense, both CPU’s end up looking for tickets that are higher than the current finished ticket)...

The mechanism to hand out the tickets is of course a LDREX/STREX, leading us to believe that is the cause of our problems, however I am increasingly sceptical.


Our scheme is actually quite simple now:
We keep the same overall scheme as exits upstream today - we store the address and value.
We provide a lock which is used for LDREX, STREX, CLREX, and in the raise_exception code.
For LDREX we check that no other CPU is tagging the address. Because of the lock, we can be sure no STREX is executing so we are always safe to ‘steal’ a tag from another CPU (which is what the ARM ARM defines).
STREX and CLREX are similar to upstream...
We are also careful to invalidate the address tag if we take an exception.

The drawback of this scheme is of course that we are not totally protected against non exclusive writes, which could potentially fall between our reading a value and checking it against our saved value. But I am not convinced that is our problem (I have checked to make sure we dont get non exclusive writes).

This scheme would seem to be elegant and simple - but it suffers from the one small drawback of still having the issue as above…


Cheers

Mark.

ps. on our bug - we believe somehow the STREX is being marked as failed, but actually succeeds to write something.  There are only 3 ways the strex can fail:
1/ the address doesn't match - in which case no ST will be attempted
2/ the value doesn't match - which means - no ST attempted
3/ the store starts, but causes a TLB fill/exception…

The 3rd has 2 possibilities - the TLB is filled, and the store goes ahead totally normally - there should be no ‘fail’ - or an exception is generated in which case we will long jump away and never return. 

Am I missing something?



>> 
>> 


> On 2 Mar 2015, at 13:27, Peter Maydell <peter.maydell@linaro.org> wrote:
> 
> On 27 February 2015 at 16:54, Mark Burton <mark.burton@greensocs.com> wrote:
>> 
>>> On 26 Feb 2015, at 23:56, Peter Maydell <peter.maydell@linaro.org> wrote:
>>> cpu_physical_memory_rw would bypass the TLB and so be much slower.
>>> Make sure you use the functions which go via the TLB if you do
>>> this in a helper (and remember that they will longjmp out on a
>>> tlb miss!)
>> 
>> At this point speed isn’t our main concern - it’s simplicity of implementation - we want it to work, then we can worry about a better implementation (which likely should not go this path at all - as discussed above).
>> Given that - isn’t it reasonable to pass through cpu_physical_memory_rw - and hence not have to worry about the long jump ? Or am I missing something?
> 
> If you use cpu_physical_memory_rw you need to do the
> virt-to-phys translation by hand (preferably via the TLB).
> That might be something you needed to do anyway if we want
> to have architecturally correct monitors that work on
> physaddrs rather than vaddrs, but if not then the two
> step process is a bit awkward.
> 
>>> Pretty sure we've already discussed how the current ldrex/strex
>>> implementation is not architecturally correct. I think this is
>>> another of those areas.
>> 
>> We have indeed discussed this - but this is a surprise.
> 
> You're right that I didn't specifically realise this exact
> part of our incorrectness earlier.
> 
> -- PMM


	 +44 (0)20 7100 3485 x 210
 +33 (0)5 33 52 01 77x 210

	+33 (0)603762104
	mark.burton
Paolo Bonzini March 3, 2015, 3:32 p.m. UTC | #11
On 03/03/2015 16:29, Mark Burton wrote:
> 
> ps. on our bug - we believe somehow the STREX is being marked as
> failed, but actually succeeds to write something.  There are only 3
> ways the strex can fail: 1/ the address doesn't match - in which case
> no ST will be attempted 2/ the value doesn't match - which means - no
> ST attempted 3/ the store starts, but causes a TLB fill/exception…
> 
> The 3rd has 2 possibilities - the TLB is filled, and the store goes
> ahead totally normally - there should be no ‘fail’ - or an exception
> is generated in which case we will long jump away and never return.

When do you release the lock?

Paolo
Mark Burton March 3, 2015, 3:33 p.m. UTC | #12
> On 3 Mar 2015, at 16:32, Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
> 
> 
> On 03/03/2015 16:29, Mark Burton wrote:
>> 
>> ps. on our bug - we believe somehow the STREX is being marked as
>> failed, but actually succeeds to write something.  There are only 3
>> ways the strex can fail: 1/ the address doesn't match - in which case
>> no ST will be attempted 2/ the value doesn't match - which means - no
>> ST attempted 3/ the store starts, but causes a TLB fill/exception…
>> 
>> The 3rd has 2 possibilities - the TLB is filled, and the store goes
>> ahead totally normally - there should be no ‘fail’ - or an exception
>> is generated in which case we will long jump away and never return.
> 
> When do you release the lock?
> 
(Thanks Paolo!)

We release the lock in either
a) the end of the strex
or
b) in the ‘raise_exception’

Cheers

Mark.

> Paolo


	 +44 (0)20 7100 3485 x 210
 +33 (0)5 33 52 01 77x 210

	+33 (0)603762104
	mark.burton
Paolo Bonzini March 3, 2015, 3:34 p.m. UTC | #13
On 03/03/2015 16:33, Mark Burton wrote:
> 
>> On 3 Mar 2015, at 16:32, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>
>>
>>
>> On 03/03/2015 16:29, Mark Burton wrote:
>>>
>>> ps. on our bug - we believe somehow the STREX is being marked as
>>> failed, but actually succeeds to write something.  There are only 3
>>> ways the strex can fail: 1/ the address doesn't match - in which case
>>> no ST will be attempted 2/ the value doesn't match - which means - no
>>> ST attempted 3/ the store starts, but causes a TLB fill/exception…
>>>
>>> The 3rd has 2 possibilities - the TLB is filled, and the store goes
>>> ahead totally normally - there should be no ‘fail’ - or an exception
>>> is generated in which case we will long jump away and never return.
>>
>> When do you release the lock?
>>
> (Thanks Paolo!)
> 
> We release the lock in either
> a) the end of the strex
> or
> b) in the ‘raise_exception’

That seems right... Can you post the patch?

Paolo
Mark Burton March 3, 2015, 3:41 p.m. UTC | #14
we’ll try and clean a patch up to show just this…….
THANKS!


Cheers
Mark.

> On 3 Mar 2015, at 16:34, Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
> 
> 
> On 03/03/2015 16:33, Mark Burton wrote:
>> 
>>> On 3 Mar 2015, at 16:32, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>> 
>>> 
>>> 
>>> On 03/03/2015 16:29, Mark Burton wrote:
>>>> 
>>>> ps. on our bug - we believe somehow the STREX is being marked as
>>>> failed, but actually succeeds to write something.  There are only 3
>>>> ways the strex can fail: 1/ the address doesn't match - in which case
>>>> no ST will be attempted 2/ the value doesn't match - which means - no
>>>> ST attempted 3/ the store starts, but causes a TLB fill/exception…
>>>> 
>>>> The 3rd has 2 possibilities - the TLB is filled, and the store goes
>>>> ahead totally normally - there should be no ‘fail’ - or an exception
>>>> is generated in which case we will long jump away and never return.
>>> 
>>> When do you release the lock?
>>> 
>> (Thanks Paolo!)
>> 
>> We release the lock in either
>> a) the end of the strex
>> or
>> b) in the ‘raise_exception’
> 
> That seems right... Can you post the patch?
> 
> Paolo


	 +44 (0)20 7100 3485 x 210
 +33 (0)5 33 52 01 77x 210

	+33 (0)603762104
	mark.burton
Dr. David Alan Gilbert March 3, 2015, 3:47 p.m. UTC | #15
* Mark Burton (mark.burton@greensocs.com) wrote:
> 
> > On 3 Mar 2015, at 16:32, Paolo Bonzini <pbonzini@redhat.com> wrote:
> > 
> > 
> > 
> > On 03/03/2015 16:29, Mark Burton wrote:
> >> 
> >> ps. on our bug - we believe somehow the STREX is being marked as
> >> failed, but actually succeeds to write something.  There are only 3
> >> ways the strex can fail: 1/ the address doesn't match - in which case
> >> no ST will be attempted 2/ the value doesn't match - which means - no
> >> ST attempted 3/ the store starts, but causes a TLB fill/exception…
> >> 
> >> The 3rd has 2 possibilities - the TLB is filled, and the store goes
> >> ahead totally normally - there should be no ‘fail’ - or an exception
> >> is generated in which case we will long jump away and never return.
> > 
> > When do you release the lock?
> > 
> (Thanks Paolo!)
> 
> We release the lock in either
> a) the end of the strex
> or
> b) in the ‘raise_exception’

That works for ARM where you have to terminate a ldrex with a strex or clrex,
but not all architectures have the equivalent of a clrex; most as I remember
just let you do an ldrex equivalent, decide you don't want to do the strex equivalent
and get on with life. (Was clrex originally in ARM when ldrex went in?)

Dave

> 
> Cheers
> 
> Mark.
> 
> > Paolo
> 
> 
> 	 +44 (0)20 7100 3485 x 210
>  +33 (0)5 33 52 01 77x 210
> 
> 	+33 (0)603762104
> 	mark.burton
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Richard Henderson March 13, 2015, 7:38 p.m. UTC | #16
On 03/03/2015 07:47 AM, Dr. David Alan Gilbert wrote:
> That works for ARM where you have to terminate a ldrex with a strex or clrex,
> but not all architectures have the equivalent of a clrex; most as I remember
> just let you do an ldrex equivalent, decide you don't want to do the strex equivalent
> and get on with life.


I'm pretty sure that's not the case.  In fact, I can guarantee you that GCC
never issues clrex, but does in fact simply do nothing like you describe for
other architectures if we decide not to do the store.


r~
Dr. David Alan Gilbert March 13, 2015, 8:04 p.m. UTC | #17
* Richard Henderson (rth@twiddle.net) wrote:
> On 03/03/2015 07:47 AM, Dr. David Alan Gilbert wrote:
> > That works for ARM where you have to terminate a ldrex with a strex or clrex,
> > but not all architectures have the equivalent of a clrex; most as I remember
> > just let you do an ldrex equivalent, decide you don't want to do the strex equivalent
> > and get on with life.
> 
> 
> I'm pretty sure that's not the case.  In fact, I can guarantee you that GCC
> never issues clrex, but does in fact simply do nothing like you describe for
> other architectures if we decide not to do the store.

Oh well, that means this technique won't work even for ARM, where I thought
it might stand a chance for ARM but nothing else.
I've still got a vague memory that some ARM docs at one point told you that
you should terminate an LDREX by either an STREX or a CLREX; but it's ~3.5 years
since I did any arm wrestling.

Dave

> r~
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
diff mbox

Patch

diff --git a/target-arm/cpu.c b/target-arm/cpu.c
index 285947f..75bdc5b 100644
--- a/target-arm/cpu.c
+++ b/target-arm/cpu.c
@@ -31,6 +31,19 @@ 
 #include "sysemu/kvm.h"
 #include "kvm_arm.h"
 
+/* Protect cpu_exclusive_* variable .*/
+QemuMutex cpu_exclusive_lock;
+
+inline void arm_exclusive_lock(void)
+{
+    qemu_mutex_lock(&cpu_exclusive_lock);
+}
+
+inline void arm_exclusive_unlock(void)
+{
+    qemu_mutex_unlock(&cpu_exclusive_lock);
+}
+
 static void arm_cpu_set_pc(CPUState *cs, vaddr value)
 {
     ARMCPU *cpu = ARM_CPU(cs);
@@ -374,6 +387,7 @@  static void arm_cpu_initfn(Object *obj)
         cpu->psci_version = 2; /* TCG implements PSCI 0.2 */
         if (!inited) {
             inited = true;
+            qemu_mutex_init(&cpu_exclusive_lock);
             arm_translate_init();
         }
     }
diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index 7ba55f0..2101d85 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -1821,4 +1821,7 @@  enum {
     QEMU_PSCI_CONDUIT_HVC = 2,
 };
 
+void arm_exclusive_lock(void);
+void arm_exclusive_unlock(void);
+
 #endif
diff --git a/target-arm/helper.h b/target-arm/helper.h
index dec3728..ce07711 100644
--- a/target-arm/helper.h
+++ b/target-arm/helper.h
@@ -529,6 +529,9 @@  DEF_HELPER_2(dc_zva, void, env, i64)
 DEF_HELPER_FLAGS_2(neon_pmull_64_lo, TCG_CALL_NO_RWG_SE, i64, i64, i64)
 DEF_HELPER_FLAGS_2(neon_pmull_64_hi, TCG_CALL_NO_RWG_SE, i64, i64, i64)
 
+DEF_HELPER_0(exclusive_lock, void)
+DEF_HELPER_0(exclusive_unlock, void)
+
 #ifdef TARGET_AARCH64
 #include "helper-a64.h"
 #endif
diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
index 2bed914..d830fd8 100644
--- a/target-arm/op_helper.c
+++ b/target-arm/op_helper.c
@@ -33,6 +33,16 @@  static void raise_exception(CPUARMState *env, int tt)
     cpu_loop_exit(cs);
 }
 
+void HELPER(exclusive_lock)(void)
+{
+    arm_exclusive_lock();
+}
+
+void HELPER(exclusive_unlock)(void)
+{
+    arm_exclusive_unlock();
+}
+
 uint32_t HELPER(neon_tbl)(CPUARMState *env, uint32_t ireg, uint32_t def,
                           uint32_t rn, uint32_t maxindex)
 {
diff --git a/target-arm/translate.c b/target-arm/translate.c
index bdfcdf1..513d151 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -7381,6 +7381,7 @@  static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
         abort();
     }
 
+    gen_helper_exclusive_lock();
     if (size == 3) {
         TCGv_i32 tmp2 = tcg_temp_new_i32();
         TCGv_i32 tmp3 = tcg_temp_new_i32();
@@ -7396,11 +7397,14 @@  static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
 
     store_reg(s, rt, tmp);
     tcg_gen_extu_i32_i64(cpu_exclusive_addr, addr);
+    gen_helper_exclusive_unlock();
 }
 
 static void gen_clrex(DisasContext *s)
 {
+    gen_helper_exclusive_lock();
     tcg_gen_movi_i64(cpu_exclusive_addr, -1);
+    gen_helper_exclusive_unlock();
 }
 
 #ifdef CONFIG_USER_ONLY
@@ -7431,6 +7435,7 @@  static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
     done_label = gen_new_label();
     extaddr = tcg_temp_new_i64();
     tcg_gen_extu_i32_i64(extaddr, addr);
+    gen_helper_exclusive_lock();
     tcg_gen_brcond_i64(TCG_COND_NE, extaddr, cpu_exclusive_addr, fail_label);
     tcg_temp_free_i64(extaddr);
 
@@ -7495,6 +7500,7 @@  static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
     tcg_gen_movi_i32(cpu_R[rd], 1);
     gen_set_label(done_label);
     tcg_gen_movi_i64(cpu_exclusive_addr, -1);
+    gen_helper_exclusive_unlock();
 }
 #endif