[RFC,4/4] target-ppc: flush tlb from all the cpu

Message ID 1472797976-24210-5-git-send-email-nikunj@linux.vnet.ibm.com
State New

Commit Message

Nikunj A Dadhania Sept. 2, 2016, 6:32 a.m. UTC
Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
---
 cputlb.c                | 15 +++++++++++++++
 include/exec/exec-all.h |  2 ++
 target-ppc/mmu-hash64.c |  2 +-
 3 files changed, 18 insertions(+), 1 deletion(-)

Comments

Benjamin Herrenschmidt Sept. 2, 2016, 7:22 a.m. UTC | #1
On Fri, 2016-09-02 at 12:02 +0530, Nikunj A Dadhania wrote:
> Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
> ---
>  cputlb.c                | 15 +++++++++++++++
>  include/exec/exec-all.h |  2 ++
>  target-ppc/mmu-hash64.c |  2 +-
>  3 files changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/cputlb.c b/cputlb.c
> index 64faf47..17ff58e 100644
> --- a/cputlb.c
> +++ b/cputlb.c
> @@ -123,6 +123,21 @@ void tlb_flush(CPUState *cpu, int flush_global)
>      }
>  }
>  
> +static void tlb_flush_all_async_work(CPUState *cpu, void *opaque)
> +{
> +    tlb_flush_nocheck(cpu, GPOINTER_TO_INT(opaque));
> +}
> +
> +void tlb_flush_all(CPUState *cpu, int flush_global)
> +{
> +    CPUState *c;
> +
> +    CPU_FOREACH(c) {
> +        async_run_on_cpu(c, tlb_flush_all_async_work,
> +                         GUINT_TO_POINTER(flush_global));
> +    }
> +}

Hrm... this is asynchronous? It probably needs to be synchronous...
We must provide a guarantee that no other processor can see the old
translation when the tlb invalidation sequence completes. With the
current lazy TLB flush, we already delay the invalidation until
we hit that synchronization point so we need to be synchronous.

Cheers,
Ben.

>  static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, void *mmu_bitmask)
>  {
>      CPUArchState *env = cpu->env_ptr;
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index e9f3bcf..55c344b 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -116,6 +116,8 @@ void tlb_flush_page(CPUState *cpu, target_ulong addr);
>   * TLB entries, and the argument is ignored.
>   */
>  void tlb_flush(CPUState *cpu, int flush_global);
> +void tlb_flush_all(CPUState *cpu, int flush_global);
> +
>  /**
>   * tlb_flush_page_by_mmuidx:
>   * @cpu: CPU whose TLB should be flushed
> diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c
> index 8118143..d852c21 100644
> --- a/target-ppc/mmu-hash64.c
> +++ b/target-ppc/mmu-hash64.c
> @@ -912,7 +912,7 @@ void ppc_hash64_tlb_flush_hpte(PowerPCCPU *cpu,
>       * invalidate, and we still don't have a tlb_flush_mask(env, n,
>       * mask) in QEMU, we just invalidate all TLBs
>       */
> -    tlb_flush(CPU(cpu), 1);
> +    tlb_flush_all(CPU(cpu), 1);
>  }
>  
>  void ppc_hash64_update_rmls(CPUPPCState *env)
Nikunj A Dadhania Sept. 2, 2016, 7:34 a.m. UTC | #2
Benjamin Herrenschmidt <benh@kernel.crashing.org> writes:

> On Fri, 2016-09-02 at 12:02 +0530, Nikunj A Dadhania wrote:
>> Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
>> ---
>>  cputlb.c                | 15 +++++++++++++++
>>  include/exec/exec-all.h |  2 ++
>>  target-ppc/mmu-hash64.c |  2 +-
>>  3 files changed, 18 insertions(+), 1 deletion(-)
>> 
>> diff --git a/cputlb.c b/cputlb.c
>> index 64faf47..17ff58e 100644
>> --- a/cputlb.c
>> +++ b/cputlb.c
>> @@ -123,6 +123,21 @@ void tlb_flush(CPUState *cpu, int flush_global)
>>      }
>>  }
>>  
>> +static void tlb_flush_all_async_work(CPUState *cpu, void *opaque)
>> +{
>> +    tlb_flush_nocheck(cpu, GPOINTER_TO_INT(opaque));
>> +}
>> +
>> +void tlb_flush_all(CPUState *cpu, int flush_global)
>> +{
>> +    CPUState *c;
>> +
>> +    CPU_FOREACH(c) {
>> +        async_run_on_cpu(c, tlb_flush_all_async_work,
>> +                         GUINT_TO_POINTER(flush_global));
>> +    }
>> +}
>
> Hrm... this is asynchronous?

Yes.

> It probably needs to be synchronous...

I see run_on_cpu() which seems suitable.
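
Something like this, perhaps (untested sketch, assuming run_on_cpu()
takes the same callback/data pair as async_run_on_cpu()):

    void tlb_flush_all(CPUState *cpu, int flush_global)
    {
        CPUState *c;

        CPU_FOREACH(c) {
            /* unlike async_run_on_cpu(), this does not return until
             * the work item has actually run on 'c' */
            run_on_cpu(c, tlb_flush_all_async_work,
                       GUINT_TO_POINTER(flush_global));
        }
    }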

> We must provide a guarantee that no other processor can see the old
> translation when the tlb invalidation sequence completes. With the
> current lazy TLB flush, we already delay the invalidation until
> we hit that synchronization point so we need to be synchronous.


>> diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c
>> index 8118143..d852c21 100644
>> --- a/target-ppc/mmu-hash64.c
>> +++ b/target-ppc/mmu-hash64.c
>> @@ -912,7 +912,7 @@ void ppc_hash64_tlb_flush_hpte(PowerPCCPU *cpu,
>>       * invalidate, and we still don't have a tlb_flush_mask(env, n,
>>       * mask) in QEMU, we just invalidate all TLBs
>>       */
>> -    tlb_flush(CPU(cpu), 1);
>> +    tlb_flush_all(CPU(cpu), 1);
>>  }
>>  
>>  void ppc_hash64_update_rmls(CPUPPCState *env)

Regards,
Nikunj
Alex Bennée Sept. 4, 2016, 5 p.m. UTC | #3
Nikunj A Dadhania <nikunj@linux.vnet.ibm.com> writes:

> Benjamin Herrenschmidt <benh@kernel.crashing.org> writes:
>
>> On Fri, 2016-09-02 at 12:02 +0530, Nikunj A Dadhania wrote:
>>> Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
>>> ---
>>>  cputlb.c                | 15 +++++++++++++++
>>>  include/exec/exec-all.h |  2 ++
>>>  target-ppc/mmu-hash64.c |  2 +-
>>>  3 files changed, 18 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/cputlb.c b/cputlb.c
>>> index 64faf47..17ff58e 100644
>>> --- a/cputlb.c
>>> +++ b/cputlb.c
>>> @@ -123,6 +123,21 @@ void tlb_flush(CPUState *cpu, int flush_global)
>>>      }
>>>  }
>>> 
>>> +static void tlb_flush_all_async_work(CPUState *cpu, void *opaque)
>>> +{
>>> +    tlb_flush_nocheck(cpu, GPOINTER_TO_INT(opaque));
>>> +}
>>> +
>>> +void tlb_flush_all(CPUState *cpu, int flush_global)
>>> +{
>>> +    CPUState *c;
>>> +
>>> +    CPU_FOREACH(c) {
>>> +        async_run_on_cpu(c, tlb_flush_all_async_work,
>>> +                         GUINT_TO_POINTER(flush_global));
>>> +    }
>>> +}
>>
>> Hrm... this is asynchronous?
>
> Yes.
>
>> It probably needs to be synchronous...
>
> I see run_on_cpu() which seems suitable.

I'm not so happy with run_on_cpu as it involves busy waiting for the
other CPU to finish.

>> We must provide a guarantee that no other processor can see the old
>> translation when the tlb invalidation sequence completes. With the
>> current lazy TLB flush, we already delay the invalidation until
>> we hit that synchronization point so we need to be synchronous.

When is the synchronisation point? On ARM we end the basic block on
system instructions that mess with the cache. As a result the flush is
done as soon as we exit the run loop on the next instruction.
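
(For illustration, a toy model of that scheme; all names here are
hypothetical, none of this is actual QEMU code:)

    #include <stdbool.h>

    static bool pending_flush;

    /* Helper for a TLB-maintenance instruction: it only records the
     * request; the translator has marked this insn as ending the TB. */
    static void helper_tlb_maintenance(void)
    {
        pending_flush = true;
    }

    /* The execution loop runs one TB at a time, so applying the flush
     * between TBs makes it visible before the next instruction. */
    static void cpu_exec_loop(void)
    {
        for (;;) {
            /* run_one_tb(); -- execute translated code */
            if (pending_flush) {
                pending_flush = false;
                /* tlb_flush(cpu, 1); -- the deferred flush */
            }
        }
    }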

>
>
>>> diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c
>>> index 8118143..d852c21 100644
>>> --- a/target-ppc/mmu-hash64.c
>>> +++ b/target-ppc/mmu-hash64.c
>>> @@ -912,7 +912,7 @@ void ppc_hash64_tlb_flush_hpte(PowerPCCPU *cpu,
>>>       * invalidate, and we still don't have a tlb_flush_mask(env, n,
>>>       * mask) in QEMU, we just invalidate all TLBs
>>>       */
>>> -    tlb_flush(CPU(cpu), 1);
>>> +    tlb_flush_all(CPU(cpu), 1);
>>>  }
>>>
>>>  void ppc_hash64_update_rmls(CPUPPCState *env)
>
> Regards,
> Nikunj


--
Alex Bennée
Benjamin Herrenschmidt Sept. 4, 2016, 10:17 p.m. UTC | #4
On Sun, 2016-09-04 at 18:00 +0100, Alex Bennée wrote:
> > 
> > > We must provide a guarantee that no other processor can see the old
> > > translation when the tlb invalidation sequence completes. With the
> > > current lazy TLB flush, we already delay the invalidation until
> > > we hit that synchronization point so we need to be synchronous.
> 
> When is the synchronisation point? On ARM we end the basic block on
> system instructions that mess with the cache. As a result the flush is
> done as soon as we exit the run loop on the next instruction.

Look for gen_check_tlb_flush() in translated code and check_tlb_flush
elsewhere in target-ppc.

Basically, when we hit tlbie or slbie (the TLB and segment invalidation
instructions), we just set a flag indicating that the TLB "needs
flushing".

When we hit an execution synchronizing instruction (isync) or a
ptesync, or if we hit an exception, we do the actual flush.

This isn't 100% architecturally correct but works with every OS out there
and saves quite a bit of churn, especially on context switches when we
invalidate/replace multiple segments or when invalidating ranges of pages.
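
For reference, check_tlb_flush() at this point is roughly (paraphrased
from target-ppc, not a verbatim quote):

    static inline void check_tlb_flush(CPUPPCState *env)
    {
        CPUState *cs = CPU(ppc_env_get_cpu(env));

        /* tlb_need_flush is set by the tlbie/slbie helpers; the real
         * flush happens here, at isync/ptesync or on an exception. */
        if (env->tlb_need_flush) {
            env->tlb_need_flush = 0;
            tlb_flush(cs, 1);
        }
    }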

In any case, ptesync especially needs to be the hard sync point: past that
point all translations must be gone and all accesses using the previous
translation must have completed or been retried on all processors.

Another approach would be to shoot asynchronous events on the actual tlbie/
slbie instructions and synchronize at the end, but I suspect it won't be
any better, especially since the current code structure can't do fine-grained
invalidation of the QEMU TLB anyway; we can only blow it all up.

So better safe than sorry here.

That being said, your statement about basic blocks confuses me a bit. You
mean MTTCG will sync all the threads when exiting a basic block on any CPU?
 
Cheers,
Ben.

> > 
> > 
> > 
> > > 
> > > > 
> > > > diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c
> > > > index 8118143..d852c21 100644
> > > > --- a/target-ppc/mmu-hash64.c
> > > > +++ b/target-ppc/mmu-hash64.c
> > > > @@ -912,7 +912,7 @@ void ppc_hash64_tlb_flush_hpte(PowerPCCPU *cpu,
> > > >       * invalidate, and we still don't have a tlb_flush_mask(env, n,
> > > >       * mask) in QEMU, we just invalidate all TLBs
> > > >       */
> > > > -    tlb_flush(CPU(cpu), 1);
> > > > +    tlb_flush_all(CPU(cpu), 1);
> > > >  }
> > > > 
> > > >  void ppc_hash64_update_rmls(CPUPPCState *env)
> > 
> > Regards,
> > Nikunj
> 
> 
> --
> Alex Bennée

Patch

diff --git a/cputlb.c b/cputlb.c
index 64faf47..17ff58e 100644
--- a/cputlb.c
+++ b/cputlb.c
@@ -123,6 +123,21 @@  void tlb_flush(CPUState *cpu, int flush_global)
     }
 }
 
+static void tlb_flush_all_async_work(CPUState *cpu, void *opaque)
+{
+    tlb_flush_nocheck(cpu, GPOINTER_TO_INT(opaque));
+}
+
+void tlb_flush_all(CPUState *cpu, int flush_global)
+{
+    CPUState *c;
+
+    CPU_FOREACH(c) {
+        async_run_on_cpu(c, tlb_flush_all_async_work,
+                         GUINT_TO_POINTER(flush_global));
+    }
+}
+
 static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, void *mmu_bitmask)
 {
     CPUArchState *env = cpu->env_ptr;
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index e9f3bcf..55c344b 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -116,6 +116,8 @@  void tlb_flush_page(CPUState *cpu, target_ulong addr);
  * TLB entries, and the argument is ignored.
  */
 void tlb_flush(CPUState *cpu, int flush_global);
+void tlb_flush_all(CPUState *cpu, int flush_global);
+
 /**
  * tlb_flush_page_by_mmuidx:
  * @cpu: CPU whose TLB should be flushed
diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c
index 8118143..d852c21 100644
--- a/target-ppc/mmu-hash64.c
+++ b/target-ppc/mmu-hash64.c
@@ -912,7 +912,7 @@  void ppc_hash64_tlb_flush_hpte(PowerPCCPU *cpu,
      * invalidate, and we still don't have a tlb_flush_mask(env, n,
      * mask) in QEMU, we just invalidate all TLBs
      */
-    tlb_flush(CPU(cpu), 1);
+    tlb_flush_all(CPU(cpu), 1);
 }
 
 void ppc_hash64_update_rmls(CPUPPCState *env)