diff mbox

[RFC,35/38] cputlb: use cpu_tcg_sched_work for tlb_flush_all

Message ID 1440375847-17603-36-git-send-email-cota@braap.org
State New
Headers show

Commit Message

Emilio Cota Aug. 24, 2015, 12:24 a.m. UTC
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
 cputlb.c | 41 +++++++++++------------------------------
 1 file changed, 11 insertions(+), 30 deletions(-)

Comments

Paolo Bonzini Aug. 24, 2015, 1:29 a.m. UTC | #1
On 23/08/2015 17:24, Emilio G. Cota wrote:
> Signed-off-by: Emilio G. Cota <cota@braap.org>
> ---
>  cputlb.c | 41 +++++++++++------------------------------
>  1 file changed, 11 insertions(+), 30 deletions(-)

As suggested by me and Peter, synchronization on TLB flushes should be
arch-specific.  CPUs can halt on a dmb if they have pending TLB flush
requests on other CPUs, and the CPU can be woken up from the run_on_cpu
callback with something like:

    if (--caller_cpu->pending_tlb_flush_request) {
        caller_cpu->interrupt_request |= CPU_INTERRUPT_TLB_DONE;
        qemu_cpu_kick(caller_cpu);
    }


...


static bool arm_cpu_has_work(CPUState *cs)
{
    ARMCPU *cpu = ARM_CPU(cs);

    return !cpu->pending_tlb_flush_request && !cpu->powered_off
        && cs->interrupt_request &
        (CPU_INTERRUPT_FIQ | CPU_INTERRUPT_HARD
         | CPU_INTERRUPT_VFIQ | CPU_INTERRUPT_VIRQ
         | CPU_INTERRUPT_EXITTB | CPU_INTERRUPT_TLB_DONE);
}

Paolo

> diff --git a/cputlb.c b/cputlb.c
> index 1b3673e..d81a4eb 100644
> --- a/cputlb.c
> +++ b/cputlb.c
> @@ -73,43 +73,24 @@ void tlb_flush(CPUState *cpu, int flush_global)
>      tlb_flush_count++;
>  }
>  
> -struct TLBFlushParams {
> -    CPUState *cpu;
> -    int flush_global;
> -};
> -
> -static void tlb_flush_async_work(void *opaque)
> +static void __tlb_flush_all(void *arg)
>  {
> -    struct TLBFlushParams *params = opaque;
> +    CPUState *cpu;
> +    int flush_global = *(int *)arg;
>  
> -    tlb_flush(params->cpu, params->flush_global);
> -    g_free(params);
> +    CPU_FOREACH(cpu) {
> +        tlb_flush(cpu, flush_global);
> +    }
> +    g_free(arg);
>  }
>  
>  void tlb_flush_all(int flush_global)
>  {
> -    CPUState *cpu;
> -    struct TLBFlushParams *params;
> +    int *arg = g_malloc(sizeof(*arg));
>  
> -#if 0 /* MTTCG */
> -    CPU_FOREACH(cpu) {
> -        tlb_flush(cpu, flush_global);
> -    }
> -#else
> -    CPU_FOREACH(cpu) {
> -        if (qemu_cpu_is_self(cpu)) {
> -            /* async_run_on_cpu handle this case but this just avoid a malloc
> -             * here.
> -             */
> -            tlb_flush(cpu, flush_global);
> -        } else {
> -            params = g_malloc(sizeof(struct TLBFlushParams));
> -            params->cpu = cpu;
> -            params->flush_global = flush_global;
> -            async_run_on_cpu(cpu, tlb_flush_async_work, params);
> -        }
> -    }
> -#endif /* MTTCG */
> +    *arg = flush_global;
> +    tb_lock();
> +    cpu_tcg_sched_work(current_cpu, __tlb_flush_all, arg);
>  }
>  
>  static inline void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong addr)
>
Emilio Cota Aug. 25, 2015, 10:31 p.m. UTC | #2
On Sun, Aug 23, 2015 at 18:29:33 -0700, Paolo Bonzini wrote:
> 
> 
> On 23/08/2015 17:24, Emilio G. Cota wrote:
> > Signed-off-by: Emilio G. Cota <cota@braap.org>
> > ---
> >  cputlb.c | 41 +++++++++++------------------------------
> >  1 file changed, 11 insertions(+), 30 deletions(-)
> 
> As suggested by me and Peter, synchronization on TLB flushes should be
> arch-specific.  CPUs can halt on a dmb if they have pending TLB flush
> requests on other CPUs,

I'm not sure I understand. With the patches I sent, a CPU that wants
to flush other TLBs does not continue execution until all of those TLBs
are flushed. So dsb/dmb whatever comes next would have nothing to
wait for. What am I missing?

> and the CPU can be woken up from the run_on_cpu
> callback with something like:
> 
>     if (--caller_cpu->pending_tlb_flush_request) {
>         caller_cpu->interrupt_request |= CPU_INTERRUPT_TLB_DONE;
>         qemu_cpu_kick(caller_cpu);
>     }
> 
> 
> ...
> 
> 
> static bool arm_cpu_has_work(CPUState *cs)
> {
>     ARMCPU *cpu = ARM_CPU(cs);
> 
>     return !cpu->pending_tlb_flush_request && !cpu->powered_off
>         && cs->interrupt_request &
>         (CPU_INTERRUPT_FIQ | CPU_INTERRUPT_HARD
>          | CPU_INTERRUPT_VFIQ | CPU_INTERRUPT_VIRQ
>          | CPU_INTERRUPT_EXITTB | CPU_INTERRUPT_TLB_DONE);
> }

Another option, which I tried but my TCG skills fail me, is to
protect each TLB with a seqlock.

The advantage of this is that TLB flushes would always complete
immediately, so there's no need to halt execution.

The disadvantage is the performance hit, but at least on TSO this
seems to me worth a shot.

Thanks,

		Emilio
Paolo Bonzini Aug. 26, 2015, 12:25 a.m. UTC | #3
----- Original Message -----
> From: "Emilio G. Cota" <cota@braap.org>
> To: "Paolo Bonzini" <pbonzini@redhat.com>
> Cc: qemu-devel@nongnu.org, mttcg@greensocs.com, "guillaume delbergue" <guillaume.delbergue@greensocs.com>, "alex
> bennee" <alex.bennee@linaro.org>, "mark burton" <mark.burton@greensocs.com>, "a rigo"
> <a.rigo@virtualopensystems.com>, "Frederic Konrad" <fred.konrad@greensocs.com>
> Sent: Wednesday, August 26, 2015 12:31:22 AM
> Subject: Re: [RFC 35/38] cputlb: use cpu_tcg_sched_work for tlb_flush_all
> 
> On Sun, Aug 23, 2015 at 18:29:33 -0700, Paolo Bonzini wrote:
> > 
> > 
> > On 23/08/2015 17:24, Emilio G. Cota wrote:
> > > Signed-off-by: Emilio G. Cota <cota@braap.org>
> > > ---
> > >  cputlb.c | 41 +++++++++++------------------------------
> > >  1 file changed, 11 insertions(+), 30 deletions(-)
> > 
> > As suggested by me and Peter, synchronization on TLB flushes should be
> > arch-specific.  CPUs can halt on a dmb if they have pending TLB flush
> > requests on other CPUs,
> 
> I'm not sure I understand. With the patches I sent, a CPU that wants
> to flush other TLBs does not continue execution until all of those TLBs
> are flushed. So dsb/dmb whatever comes next would have nothing to
> wait for. What am I missing?

Probably nothing.  Still, I didn't have enough time to study your
cpu_tcg_sched_work patches well, and I'm terribly worried of deadlocks
here. :)  Ensuring that the CPU loop keeps running, and can always be
woken up via halt_cond, is the simplest way to avoid deadlocks.

> Another option, which I tried but my TCG skills fail me, is to
> protect each TLB with a seqlock.
> 
> The advantage of this is that TLB flushes would always complete
> immediately, so there's no need to halt execution.
> 
> The disadvantage is the performance hit, but at least on TSO this
> seems to me worth a shot.

The other disadvantage is that you'd have to modify all TCG backends. :(

Paolo
Alex Bennée Sept. 1, 2015, 4:10 p.m. UTC | #4
Emilio G. Cota <cota@braap.org> writes:

> Signed-off-by: Emilio G. Cota <cota@braap.org>
> ---
>  cputlb.c | 41 +++++++++++------------------------------
>  1 file changed, 11 insertions(+), 30 deletions(-)

I bisected my Jessie boot failure to this commit. Before it boots up
fine, here it just hangs before the kernel starts init.

17:05 alex@zen/x86_64  [qemu.git/bisect:???]
>./arm-softmmu/qemu-system-arm -machine virt -cpu cortex-a15 -machine
type=virt -display none -serial telnet:127.0.0.1:4444 -monitor stdio
-smp 4 -m 4096 -kernel ../images/aarch32-current-linux-kernel-only.img
 --append "console=ttyAMA0 root=/dev/vda1" -drive
 file=../images/jessie-arm32.qcow2,id=myblock,index=0,if=none -device
 virtio-b
 lk-device,drive=myblock -netdev user,id=unet,hostfwd=tcp::2222-:22
 -device virtio-net-device,netdev=unet -D /tmp/qemu.log -d un
 imp -name debug-threads=on

See people.linaro.org/~alex.bennee/images


>
> diff --git a/cputlb.c b/cputlb.c
> index 1b3673e..d81a4eb 100644
> --- a/cputlb.c
> +++ b/cputlb.c
> @@ -73,43 +73,24 @@ void tlb_flush(CPUState *cpu, int flush_global)
>      tlb_flush_count++;
>  }
>  
> -struct TLBFlushParams {
> -    CPUState *cpu;
> -    int flush_global;
> -};
> -
> -static void tlb_flush_async_work(void *opaque)
> +static void __tlb_flush_all(void *arg)
>  {
> -    struct TLBFlushParams *params = opaque;
> +    CPUState *cpu;
> +    int flush_global = *(int *)arg;
>  
> -    tlb_flush(params->cpu, params->flush_global);
> -    g_free(params);
> +    CPU_FOREACH(cpu) {
> +        tlb_flush(cpu, flush_global);
> +    }
> +    g_free(arg);
>  }
>  
>  void tlb_flush_all(int flush_global)
>  {
> -    CPUState *cpu;
> -    struct TLBFlushParams *params;
> +    int *arg = g_malloc(sizeof(*arg));
>  
> -#if 0 /* MTTCG */
> -    CPU_FOREACH(cpu) {
> -        tlb_flush(cpu, flush_global);
> -    }
> -#else
> -    CPU_FOREACH(cpu) {
> -        if (qemu_cpu_is_self(cpu)) {
> -            /* async_run_on_cpu handle this case but this just avoid a malloc
> -             * here.
> -             */
> -            tlb_flush(cpu, flush_global);
> -        } else {
> -            params = g_malloc(sizeof(struct TLBFlushParams));
> -            params->cpu = cpu;
> -            params->flush_global = flush_global;
> -            async_run_on_cpu(cpu, tlb_flush_async_work, params);
> -        }
> -    }
> -#endif /* MTTCG */
> +    *arg = flush_global;
> +    tb_lock();
> +    cpu_tcg_sched_work(current_cpu, __tlb_flush_all, arg);
>  }
>  
>  static inline void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong addr)
Emilio Cota Sept. 1, 2015, 7:38 p.m. UTC | #5
On Tue, Sep 01, 2015 at 17:10:30 +0100, Alex Bennée wrote:
> 
> Emilio G. Cota <cota@braap.org> writes:
> 
> > Signed-off-by: Emilio G. Cota <cota@braap.org>
> > ---
> >  cputlb.c | 41 +++++++++++------------------------------
> >  1 file changed, 11 insertions(+), 30 deletions(-)
> 
> I bisected my Jessie boot failure to this commit. Before it boots up
> fine, here it just hangs before the kernel starts init.
> 
> 17:05 alex@zen/x86_64  [qemu.git/bisect:???]
> >./arm-softmmu/qemu-system-arm -machine virt -cpu cortex-a15 -machine
> type=virt -display none -serial telnet:127.0.0.1:4444 -monitor stdio
> -smp 4 -m 4096 -kernel ../images/aarch32-current-linux-kernel-only.img
>  --append "console=ttyAMA0 root=/dev/vda1" -drive
>  file=../images/jessie-arm32.qcow2,id=myblock,index=0,if=none -device
>  virtio-b
>  lk-device,drive=myblock -netdev user,id=unet,hostfwd=tcp::2222-:22
>  -device virtio-net-device,netdev=unet -D /tmp/qemu.log -d un
>  imp -name debug-threads=on
> 
> See people.linaro.org/~alex.bennee/images

Thanks for testing!

I can replicate it; what's happening is that tlb_flush_all calls
cpu_loop_exit(), then re-enters the cpu loop, performs the
job while other CPUs are asleep(i.e. __tlb_flush_all in this case),
but then when it continues execution it loads the same instruction
(say a TLBIALLIS) again. So we end up with the same CPU calling
tlb_flush_all in an infinite loop.

A possible way to fix this is to finish the TB right after the
helper and then add a flag in cpu_sched_work to not call
cpu_exit_loop, raising an exit interrupt instead.
(Note that cpu_exit_loop is still necessary when doing work
out-of-band wrt to execution, e.g. we *want* to come back
to the same PC when doing a tb_flush.)

I've tried doing this but I can't see an obvious place to insert
the call to tcg_gen_exit_tb()--I see the calls to the TLB helpers
are embedded in structs that I presume are called by some generic
helper code. A little bit of help here would be appreciated, I'm
not very familiar with target-arm.

Thanks,

		Emilio
Peter Maydell Sept. 1, 2015, 8:18 p.m. UTC | #6
On 1 September 2015 at 20:38, Emilio G. Cota <cota@braap.org> wrote:
> I can replicate it; what's happening is that tlb_flush_all calls
> cpu_loop_exit(), then re-enters the cpu loop, performs the
> job while other CPUs are asleep(i.e. __tlb_flush_all in this case),
> but then when it continues execution it loads the same instruction
> (say a TLBIALLIS) again. So we end up with the same CPU calling
> tlb_flush_all in an infinite loop.
>
> A possible way to fix this is to finish the TB right after the
> helper and then add a flag in cpu_sched_work to not call
> cpu_exit_loop, raising an exit interrupt instead.

Sounds like a good idea.

> (Note that cpu_exit_loop is still necessary when doing work
> out-of-band wrt to execution, e.g. we *want* to come back
> to the same PC when doing a tb_flush.)

Really? I haven't looked at any of this code, but that sounds
a bit odd...

> I've tried doing this but I can't see an obvious place to insert
> the call to tcg_gen_exit_tb()--I see the calls to the TLB helpers
> are embedded in structs that I presume are called by some generic
> helper code. A little bit of help here would be appreciated, I'm
> not very familiar with target-arm.

The code (for 32-bit) is in disas_coproc_insn(). Any coprocessor
which isn't a CP_SPECIAL case (ie NOP or WFI) will always be the last
thing in its TB anyway, unless this is suppressed with the
ARM_CP_SUPPRESS_TB_END flag in the reginfo struct.

thanks
-- PMM
diff mbox

Patch

diff --git a/cputlb.c b/cputlb.c
index 1b3673e..d81a4eb 100644
--- a/cputlb.c
+++ b/cputlb.c
@@ -73,43 +73,24 @@  void tlb_flush(CPUState *cpu, int flush_global)
     tlb_flush_count++;
 }
 
-struct TLBFlushParams {
-    CPUState *cpu;
-    int flush_global;
-};
-
-static void tlb_flush_async_work(void *opaque)
+static void __tlb_flush_all(void *arg)
 {
-    struct TLBFlushParams *params = opaque;
+    CPUState *cpu;
+    int flush_global = *(int *)arg;
 
-    tlb_flush(params->cpu, params->flush_global);
-    g_free(params);
+    CPU_FOREACH(cpu) {
+        tlb_flush(cpu, flush_global);
+    }
+    g_free(arg);
 }
 
 void tlb_flush_all(int flush_global)
 {
-    CPUState *cpu;
-    struct TLBFlushParams *params;
+    int *arg = g_malloc(sizeof(*arg));
 
-#if 0 /* MTTCG */
-    CPU_FOREACH(cpu) {
-        tlb_flush(cpu, flush_global);
-    }
-#else
-    CPU_FOREACH(cpu) {
-        if (qemu_cpu_is_self(cpu)) {
-            /* async_run_on_cpu handle this case but this just avoid a malloc
-             * here.
-             */
-            tlb_flush(cpu, flush_global);
-        } else {
-            params = g_malloc(sizeof(struct TLBFlushParams));
-            params->cpu = cpu;
-            params->flush_global = flush_global;
-            async_run_on_cpu(cpu, tlb_flush_async_work, params);
-        }
-    }
-#endif /* MTTCG */
+    *arg = flush_global;
+    tb_lock();
+    cpu_tcg_sched_work(current_cpu, __tlb_flush_all, arg);
 }
 
 static inline void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong addr)