
[03/22] cputlb: bring back tlb_flush_count under !TLB_DEBUG

Message ID 1499586614-20507-4-git-send-email-cota@braap.org
State New

Commit Message

Emilio Cota July 9, 2017, 7:49 a.m. UTC
Commit f0aff0f124 ("cputlb: add assert_cpu_is_self checks") buried
the increment of tlb_flush_count under TLB_DEBUG. This results in
"info jit" always (mis)reporting 0 TLB flushes when !TLB_DEBUG.

Besides, under MTTCG tlb_flush_count is updated by several threads,
so in order not to lose counts we'd either have to use atomic ops
or distribute the counter, which is more scalable.

This patch does the latter by embedding tlb_flush_count in CPUArchState.
The global count is then easily obtained by iterating over the CPU list.

Signed-off-by: Emilio G. Cota <cota@braap.org>
---
 include/exec/cpu-defs.h   |  1 +
 include/exec/cputlb.h     |  3 +--
 accel/tcg/cputlb.c        | 17 ++++++++++++++---
 accel/tcg/translate-all.c |  2 +-
 4 files changed, 17 insertions(+), 6 deletions(-)

Comments

Richard Henderson July 9, 2017, 8 p.m. UTC | #1
On 07/08/2017 09:49 PM, Emilio G. Cota wrote:
> +    atomic_set(&env->tlb_flush_count, env->tlb_flush_count + 1);

Want atomic_read here, so they're all the same.

Otherwise,

Reviewed-by: Richard Henderson <rth@twiddle.net>


r~
Emilio Cota July 9, 2017, 8:56 p.m. UTC | #2
On Sun, Jul 09, 2017 at 10:00:01 -1000, Richard Henderson wrote:
> On 07/08/2017 09:49 PM, Emilio G. Cota wrote:
> >+    atomic_set(&env->tlb_flush_count, env->tlb_flush_count + 1);
> 
> Want atomic_read here, so they're all the same.

It's not needed. Note that this thread is the only one ever writing
to env->tlb_flush_count, so the thread can read this value without
atomic accesses.

You'll see this pattern all across the patchset.
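
A minimal sketch of the pattern, for reference (the helper names are
illustrative only; the real code is in the patch below, and this assumes
QEMU's atomic helpers and CPU_FOREACH): the owning vCPU thread is the only
writer, so it reads its counter with a plain load and publishes updates
with atomic_set(), while any other thread sums the per-vCPU counters with
atomic_read() to avoid torn loads:

    /* owner (vCPU) thread: plain read, atomic publish */
    static inline void tlb_flush_count_inc(CPUArchState *env)
    {
        atomic_set(&env->tlb_flush_count, env->tlb_flush_count + 1);
    }

    /* any thread: sum the per-vCPU counters */
    static size_t tlb_flush_count_sum(void)
    {
        CPUState *cpu;
        size_t count = 0;

        CPU_FOREACH(cpu) {
            CPUArchState *env = cpu->env_ptr;

            count += atomic_read(&env->tlb_flush_count);
        }
        return count;
    }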

Thanks,

		E.
Emilio Cota July 9, 2017, 9:20 p.m. UTC | #3
On Sun, Jul 09, 2017 at 16:56:23 -0400, Emilio G. Cota wrote:
> On Sun, Jul 09, 2017 at 10:00:01 -1000, Richard Henderson wrote:
> > On 07/08/2017 09:49 PM, Emilio G. Cota wrote:
> > >+    atomic_set(&env->tlb_flush_count, env->tlb_flush_count + 1);
> > 
> > Want atomic_read here, so they're all the same.
> 
> It's not needed. Note that this thread is the only one ever writing
> to env->tlb_flush_count, so the thread can read this value without
> atomic accesses.
> 
> You'll see this pattern all across the patchset.

We already have this kind of pattern in QEMU. See this patch and
related discussion:
  https://patchwork.kernel.org/patch/9358939/

		E.
Alex Bennée July 12, 2017, 1:26 p.m. UTC | #4
Emilio G. Cota <cota@braap.org> writes:

> Commit f0aff0f124 ("cputlb: add assert_cpu_is_self checks") buried
> the increment of tlb_flush_count under TLB_DEBUG. This results in
> "info jit" always (mis)reporting 0 TLB flushes when !TLB_DEBUG.
>
> Besides, under MTTCG tlb_flush_count is updated by several threads,
> so in order not to lose counts we'd either have to use atomic ops
> or distribute the counter, which is more scalable.
>
> This patch does the latter by embedding tlb_flush_count in CPUArchState.
> The global count is then easily obtained by iterating over the CPU list.
>
> Signed-off-by: Emilio G. Cota <cota@braap.org>

As it actually fixes unintentional breakage:

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

That said I'm not sure if this number alone is helpful given the range
of flushes we have. Really from a performance point of view we should
differentiate between inline per-vCPU flushes and the cross-vCPU
flushes of both asynchronous and synced varieties.

I had a go at this using QEMU's tracing infrastructure:

  https://lists.gnu.org/archive/html/qemu-devel/2017-05/msg04076.html

But I guess the ideal way would be something that both keeps counters
and optionally enables tracepoints.
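
Something along these lines, as a sketch only (the trace event name, its
arguments and its placement are purely illustrative and not from either
patchset): keep the cheap always-on per-vCPU counter and add an optional
tracepoint next to it, wired up through the usual trace-events machinery:

    /* hypothetical entry in a trace-events file:
     *   tlb_flush_self(int vcpu) "vcpu %d"
     */
    static void tlb_flush_nocheck(CPUState *cpu)
    {
        CPUArchState *env = cpu->env_ptr;

        /* always-on, scalable per-vCPU counter */
        atomic_set(&env->tlb_flush_count, env->tlb_flush_count + 1);
        /* optional detail, enabled via QEMU's tracing infrastructure */
        trace_tlb_flush_self(cpu->cpu_index);

        /* ... rest of the flush as in the patch ... */
    }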

> ---
>  include/exec/cpu-defs.h   |  1 +
>  include/exec/cputlb.h     |  3 +--
>  accel/tcg/cputlb.c        | 17 ++++++++++++++---
>  accel/tcg/translate-all.c |  2 +-
>  4 files changed, 17 insertions(+), 6 deletions(-)
>
> diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
> index bc8e7f8..e43ff83 100644
> --- a/include/exec/cpu-defs.h
> +++ b/include/exec/cpu-defs.h
> @@ -137,6 +137,7 @@ typedef struct CPUIOTLBEntry {
>      CPUTLBEntry tlb_v_table[NB_MMU_MODES][CPU_VTLB_SIZE];               \
>      CPUIOTLBEntry iotlb[NB_MMU_MODES][CPU_TLB_SIZE];                    \
>      CPUIOTLBEntry iotlb_v[NB_MMU_MODES][CPU_VTLB_SIZE];                 \
> +    size_t tlb_flush_count;                                             \
>      target_ulong tlb_flush_addr;                                        \
>      target_ulong tlb_flush_mask;                                        \
>      target_ulong vtlb_index;                                            \
> diff --git a/include/exec/cputlb.h b/include/exec/cputlb.h
> index 3f94178..c91db21 100644
> --- a/include/exec/cputlb.h
> +++ b/include/exec/cputlb.h
> @@ -23,7 +23,6 @@
>  /* cputlb.c */
>  void tlb_protect_code(ram_addr_t ram_addr);
>  void tlb_unprotect_code(ram_addr_t ram_addr);
> -extern int tlb_flush_count;
> -
> +size_t tlb_flush_count(void);
>  #endif
>  #endif
> diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
> index 85635ae..9377110 100644
> --- a/accel/tcg/cputlb.c
> +++ b/accel/tcg/cputlb.c
> @@ -92,8 +92,18 @@ static void flush_all_helper(CPUState *src, run_on_cpu_func fn,
>      }
>  }
>
> -/* statistics */
> -int tlb_flush_count;
> +size_t tlb_flush_count(void)
> +{
> +    CPUState *cpu;
> +    size_t count = 0;
> +
> +    CPU_FOREACH(cpu) {
> +        CPUArchState *env = cpu->env_ptr;
> +
> +        count += atomic_read(&env->tlb_flush_count);
> +    }
> +    return count;
> +}
>
>  /* This is OK because CPU architectures generally permit an
>   * implementation to drop entries from the TLB at any time, so
> @@ -112,7 +122,8 @@ static void tlb_flush_nocheck(CPUState *cpu)
>      }
>
>      assert_cpu_is_self(cpu);
> -    tlb_debug("(count: %d)\n", tlb_flush_count++);
> +    atomic_set(&env->tlb_flush_count, env->tlb_flush_count + 1);
> +    tlb_debug("(count: %zu)\n", tlb_flush_count());
>
>      tb_lock();
>
> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
> index f768681..a936a5f 100644
> --- a/accel/tcg/translate-all.c
> +++ b/accel/tcg/translate-all.c
> @@ -1909,7 +1909,7 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
>              atomic_read(&tcg_ctx.tb_ctx.tb_flush_count));
>      cpu_fprintf(f, "TB invalidate count %d\n",
>              tcg_ctx.tb_ctx.tb_phys_invalidate_count);
> -    cpu_fprintf(f, "TLB flush count     %d\n", tlb_flush_count);
> +    cpu_fprintf(f, "TLB flush count     %zu\n", tlb_flush_count());
>      tcg_dump_info(f, cpu_fprintf);
>
>      tb_unlock();


--
Alex Bennée
Emilio Cota July 12, 2017, 6:19 p.m. UTC | #5
On Wed, Jul 12, 2017 at 14:26:36 +0100, Alex Bennée wrote:
> Emilio G. Cota <cota@braap.org> writes:
(snip)
> > This patch does the latter by embedding tlb_flush_count in CPUArchState.
> > The global count is then easily obtained by iterating over the CPU list.
> >
> > Signed-off-by: Emilio G. Cota <cota@braap.org>
> 
> As it actually fixes unintentional breakage:
> 
> Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
> 
> That said I'm not sure if this number alone is helpful given the range
> of flushes we have. Really from a performance point of view we should
> differentiate between inline per-vCPU flushes and the cross-vCPU
> flushes of both asynchronous and synced varieties.
> 
> I had a go at this using QEMU's tracing infrastructure:
> 
>   https://lists.gnu.org/archive/html/qemu-devel/2017-05/msg04076.html
> 
> But I guess the ideal way would be something that both keeps counters
> and optionally enables tracepoints.

Yeah the counters in my patch are there to fix the breakage while
not hurting scalability in MTTCG.

Having those counters always on + the tracers in your patchset
for more detailed info seems reasonable to me.

Maybe it's time to push to get those tracer changes in?

		Emilio

Patch

diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index bc8e7f8..e43ff83 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -137,6 +137,7 @@  typedef struct CPUIOTLBEntry {
     CPUTLBEntry tlb_v_table[NB_MMU_MODES][CPU_VTLB_SIZE];               \
     CPUIOTLBEntry iotlb[NB_MMU_MODES][CPU_TLB_SIZE];                    \
     CPUIOTLBEntry iotlb_v[NB_MMU_MODES][CPU_VTLB_SIZE];                 \
+    size_t tlb_flush_count;                                             \
     target_ulong tlb_flush_addr;                                        \
     target_ulong tlb_flush_mask;                                        \
     target_ulong vtlb_index;                                            \
diff --git a/include/exec/cputlb.h b/include/exec/cputlb.h
index 3f94178..c91db21 100644
--- a/include/exec/cputlb.h
+++ b/include/exec/cputlb.h
@@ -23,7 +23,6 @@ 
 /* cputlb.c */
 void tlb_protect_code(ram_addr_t ram_addr);
 void tlb_unprotect_code(ram_addr_t ram_addr);
-extern int tlb_flush_count;
-
+size_t tlb_flush_count(void);
 #endif
 #endif
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 85635ae..9377110 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -92,8 +92,18 @@  static void flush_all_helper(CPUState *src, run_on_cpu_func fn,
     }
 }
 
-/* statistics */
-int tlb_flush_count;
+size_t tlb_flush_count(void)
+{
+    CPUState *cpu;
+    size_t count = 0;
+
+    CPU_FOREACH(cpu) {
+        CPUArchState *env = cpu->env_ptr;
+
+        count += atomic_read(&env->tlb_flush_count);
+    }
+    return count;
+}
 
 /* This is OK because CPU architectures generally permit an
  * implementation to drop entries from the TLB at any time, so
@@ -112,7 +122,8 @@  static void tlb_flush_nocheck(CPUState *cpu)
     }
 
     assert_cpu_is_self(cpu);
-    tlb_debug("(count: %d)\n", tlb_flush_count++);
+    atomic_set(&env->tlb_flush_count, env->tlb_flush_count + 1);
+    tlb_debug("(count: %zu)\n", tlb_flush_count());
 
     tb_lock();
 
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index f768681..a936a5f 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -1909,7 +1909,7 @@  void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
             atomic_read(&tcg_ctx.tb_ctx.tb_flush_count));
     cpu_fprintf(f, "TB invalidate count %d\n",
             tcg_ctx.tb_ctx.tb_phys_invalidate_count);
-    cpu_fprintf(f, "TLB flush count     %d\n", tlb_flush_count);
+    cpu_fprintf(f, "TLB flush count     %zu\n", tlb_flush_count());
     tcg_dump_info(f, cpu_fprintf);
 
     tb_unlock();