diff mbox series

[v4,4/7] accel: replacing part of CONFIG_PROFILER with TBStats

Message ID 20190720010235.32444-5-vandersonmr2@gmail.com
State New
Headers show
Series Measure Tiny Code Generation Quality | expand

Commit Message

vandersonmr July 20, 2019, 1:02 a.m. UTC
We add some of the statistics collected in the TCGProfiler
into the TBStats, having the statistics not only for the whole
emulation but for each TB. Then, we removed these stats
from TCGProfiler and reconstruct the information for the
"info jit" using the sum of all TBStats statistics.

The goal is to have one unique and better way of collecting
emulation statistics. Moreover, checking dynamiclly if the
profiling is enabled showed to have an insignificant impact
on the performance:
https://wiki.qemu.org/Internships/ProjectIdeas/TCGCodeQuality#Overheads.

Signed-off-by: Vanderson M. do Rosario <vandersonmr2@gmail.com>
---
 accel/tcg/Makefile.objs   |   2 +-
 accel/tcg/tb-stats.c      | 107 ++++++++++++++++++++++++++++++++++++++
 accel/tcg/translate-all.c |  10 ++--
 include/exec/tb-stats.h   |   9 ++++
 tcg/tcg.c                 |  93 ++++-----------------------------
 tcg/tcg.h                 |  10 ----
 6 files changed, 131 insertions(+), 100 deletions(-)
 create mode 100644 accel/tcg/tb-stats.c

Comments

Alex Bennée July 26, 2019, 3:25 p.m. UTC | #1
vandersonmr <vandersonmr2@gmail.com> writes:

> We add some of the statistics collected in the TCGProfiler
> into the TBStats, having the statistics not only for the whole
> emulation but for each TB. Then, we removed these stats
> from TCGProfiler and reconstruct the information for the
> "info jit" using the sum of all TBStats statistics.
>
> The goal is to have one unique and better way of collecting
> emulation statistics. Moreover, checking dynamiclly if the
> profiling is enabled showed to have an insignificant impact
> on the performance:
> https://wiki.qemu.org/Internships/ProjectIdeas/TCGCodeQuality#Overheads.
>
> Signed-off-by: Vanderson M. do Rosario <vandersonmr2@gmail.com>
> ---
>  accel/tcg/Makefile.objs   |   2 +-
>  accel/tcg/tb-stats.c      | 107 ++++++++++++++++++++++++++++++++++++++
>  accel/tcg/translate-all.c |  10 ++--
>  include/exec/tb-stats.h   |   9 ++++
>  tcg/tcg.c                 |  93 ++++-----------------------------
>  tcg/tcg.h                 |  10 ----
>  6 files changed, 131 insertions(+), 100 deletions(-)
>  create mode 100644 accel/tcg/tb-stats.c
>
> diff --git a/accel/tcg/Makefile.objs b/accel/tcg/Makefile.objs
> index d381a02f34..49ffe81b5d 100644
> --- a/accel/tcg/Makefile.objs
> +++ b/accel/tcg/Makefile.objs
> @@ -2,7 +2,7 @@ obj-$(CONFIG_SOFTMMU) += tcg-all.o
>  obj-$(CONFIG_SOFTMMU) += cputlb.o
>  obj-y += tcg-runtime.o tcg-runtime-gvec.o
>  obj-y += cpu-exec.o cpu-exec-common.o translate-all.o
> -obj-y += translator.o
> +obj-y += translator.o tb-stats.o
>
>  obj-$(CONFIG_USER_ONLY) += user-exec.o
>  obj-$(call lnot,$(CONFIG_SOFTMMU)) += user-exec-stub.o
> diff --git a/accel/tcg/tb-stats.c b/accel/tcg/tb-stats.c
> new file mode 100644
> index 0000000000..44497d4f9b
> --- /dev/null
> +++ b/accel/tcg/tb-stats.c
> @@ -0,0 +1,107 @@
> +#include "qemu/osdep.h"
> +
> +#include "disas/disas.h"
> +#include "exec/exec-all.h"
> +#include "tcg.h"
> +
> +#include "qemu/qemu-print.h"
> +
> +struct jit_profile_info {
> +    uint64_t translations;
> +    uint64_t aborted;
> +    uint64_t ops;
> +    unsigned ops_max;
> +    uint64_t del_ops;
> +    uint64_t temps;
> +    unsigned temps_max;
> +    uint64_t host;
> +    uint64_t guest;
> +    uint64_t host_ins;
> +    uint64_t search_data;
> +};
> +
> +/* accumulate the statistics from all TBs */
> +static void collect_jit_profile_info(void *p, uint32_t hash, void *userp)
> +{
> +    struct jit_profile_info *jpi = userp;
> +    TBStatistics *tbs = p;
> +
> +    jpi->translations += tbs->translations.total;
> +    jpi->ops += tbs->code.num_tcg_ops;
> +    if (tbs->translations.total && tbs->code.num_tcg_ops / tbs->translations.total
> +            > jpi->ops_max) {
> +        jpi->ops_max = tbs->code.num_tcg_ops / tbs->translations.total;
> +    }
> +    jpi->del_ops += tbs->code.deleted_ops;
> +    jpi->temps += tbs->code.temps;
> +    if (tbs->translations.total && tbs->code.temps / tbs->translations.total >
> +            jpi->temps_max) {
> +        jpi->temps_max = tbs->code.temps / tbs->translations.total;
> +    }
> +    jpi->host += tbs->code.out_len;
> +    jpi->guest += tbs->code.in_len;
> +    jpi->host_ins += tbs->code.num_host_inst;
> +    jpi->search_data += tbs->code.search_out_len;
> +}
> +
> +/* dump JIT statisticis using TCGProfile and TBStats */
> +void dump_jit_profile_info(TCGProfile *s)
> +{
> +    if (!tb_stats_collection_enabled()) {
> +        return;
> +    }
> +
> +    struct jit_profile_info *jpi = g_new0(struct jit_profile_info,
>  1);

This leaks and needs to be freed after we are done.

> +
> +    qht_iter(&tb_ctx.tb_stats, collect_jit_profile_info, jpi);
> +
> +    if (jpi->translations) {
> +        qemu_printf("translated TBs      %" PRId64 "\n", jpi->translations);
> +        qemu_printf("avg ops/TB          %0.1f max=%d\n",
> +                jpi->ops / (double) jpi->translations, jpi->ops_max);
> +        qemu_printf("deleted ops/TB      %0.2f\n",
> +                jpi->del_ops / (double) jpi->translations);
> +        qemu_printf("avg temps/TB        %0.2f max=%d\n",
> +                jpi->temps / (double) jpi->translations, jpi->temps_max);
> +        qemu_printf("avg host code/TB    %0.1f\n",
> +                jpi->host / (double) jpi->translations);
> +        qemu_printf("avg host ins/TB     %0.1f\n",
> +                jpi->host_ins / (double) jpi->translations);
> +        qemu_printf("avg search data/TB  %0.1f\n",
> +                jpi->search_data / (double) jpi->translations);
> +
> +        if (s) {
> +            int64_t tot = s->interm_time + s->code_time;
> +            qemu_printf("JIT cycles          %" PRId64 " (%0.3f s at 2.4 GHz)\n",
> +                            tot, tot / 2.4e9);
> +            qemu_printf("cycles/op           %0.1f\n",
> +                        jpi->ops ? (double)tot / jpi->ops : 0);
> +            qemu_printf("cycles/in byte      %0.1f\n",
> +                        jpi->guest ? (double)tot / jpi->guest : 0);
> +            qemu_printf("cycles/out byte     %0.1f\n",
> +                        jpi->host ? (double)tot / jpi->host : 0);
> +            qemu_printf("cycles/out inst     %0.1f\n",
> +                        jpi->host_ins ? (double)tot / jpi->host_ins : 0);
> +            qemu_printf("cycles/search byte     %0.1f\n",
> +                        jpi->search_data ? (double)tot / jpi->search_data : 0);
> +            if (tot == 0) {
> +                tot = 1;
> +            }
> +            qemu_printf("  gen_interm time   %0.1f%%\n",
> +                        (double)s->interm_time / tot * 100.0);
> +            qemu_printf("  gen_code time     %0.1f%%\n",
> +                        (double)s->code_time / tot * 100.0);
> +            qemu_printf("optim./code time    %0.1f%%\n",
> +                        (double)s->opt_time / (s->code_time ? s->code_time : 1)
> +                        * 100.0);
> +            qemu_printf("liveness/code time  %0.1f%%\n",
> +                    (double)s->la_time / (s->code_time ? s->code_time : 1) * 100.0);
> +            qemu_printf("cpu_restore count   %" PRId64 "\n",
> +                    s->restore_count);
> +            qemu_printf("  avg cycles        %0.1f\n",
> +                    s->restore_count ? (double)s->restore_time / s->restore_count : 0);
> +        }
> +    }
> +}
> +
> +
> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
> index 3a47ac6f2c..bbdb0f23a4 100644
> --- a/accel/tcg/translate-all.c
> +++ b/accel/tcg/translate-all.c
> @@ -1773,8 +1773,6 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
>   tb_overflow:
>
>  #ifdef CONFIG_PROFILER
> -    /* includes aborted translations because of exceptions */
> -    atomic_set(&prof->tb_count1, prof->tb_count1 + 1);
>      ti = profile_getclock();
>  #endif

I assume from the conversation on IRC we can add the profile clocks here
as well. I wonder if execution time could also be added (under a new
tbstats flag)? The data might be too noisy given the time to collect
timestamps vs the relatively short block length.

>
> @@ -1822,7 +1820,6 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
>      }
>
>  #ifdef CONFIG_PROFILER
> -    atomic_set(&prof->tb_count, prof->tb_count + 1);
>      atomic_set(&prof->interm_time, prof->interm_time + profile_getclock() - ti);
>      ti = profile_getclock();
>  #endif
> @@ -1869,12 +1866,13 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
>
>  #ifdef CONFIG_PROFILER
>      atomic_set(&prof->code_time, prof->code_time + profile_getclock() - ti);
> -    atomic_set(&prof->code_in_len, prof->code_in_len + tb->size);
> -    atomic_set(&prof->code_out_len, prof->code_out_len + gen_code_size);
> -    atomic_set(&prof->search_out_len, prof->search_out_len + search_size);
>  #endif
>
>      if (tb_stats_enabled(tb, TB_JIT_STATS)) {
> +        atomic_add(&tb->tb_stats->code.in_len, tb->size);
> +        atomic_add(&tb->tb_stats->code.out_len, gen_code_size);

If we are switching num_host_insns to num_host_bytes we don't need a
duplicate here.

> +        atomic_add(&tb->tb_stats->code.search_out_len, search_size);
> +
>          size_t code_size = gen_code_size;
>          if (tcg_ctx->data_gen_ptr) {
>              code_size = tcg_ctx->data_gen_ptr - tb->tc.ptr;
> diff --git a/include/exec/tb-stats.h b/include/exec/tb-stats.h
> index 6584b78c4a..7d775f2c0d 100644
> --- a/include/exec/tb-stats.h
> +++ b/include/exec/tb-stats.h
> @@ -40,6 +40,13 @@ struct TBStatistics {
>          unsigned num_tcg_ops;
>          unsigned num_tcg_ops_opt;
>          unsigned spills;
> +
> +        /* CONFIG_PROFILE */
> +        unsigned temps;
> +        unsigned deleted_ops;
> +        unsigned in_len;
> +        unsigned out_len;
> +        unsigned search_out_len;
>      } code;
>
>      struct {
> @@ -51,4 +58,6 @@ struct TBStatistics {
>
>  bool tb_stats_cmp(const void *ap, const void *bp);
>
> +void dump_jit_profile_info(TCGProfile *s);
> +
>  #endif
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index 446e3d1708..46b31a2f68 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -2295,9 +2295,9 @@ void tcg_op_remove(TCGContext *s, TCGOp *op)
>      QTAILQ_INSERT_TAIL(&s->free_ops, op, link);
>      s->nb_ops--;
>
> -#ifdef CONFIG_PROFILER
> -    atomic_set(&s->prof.del_op_count, s->prof.del_op_count + 1);
> -#endif
> +    if (tb_stats_enabled(s->current_tb, TB_JIT_STATS)) {
> +        atomic_inc(&s->current_tb->tb_stats->code.deleted_ops);
> +    }
>  }
>
>  static TCGOp *tcg_op_alloc(TCGOpcode opc)
> @@ -3914,16 +3914,6 @@ void tcg_profile_snapshot(TCGProfile *prof, bool counters, bool table)
>
>          if (counters) {
>              PROF_ADD(prof, orig, cpu_exec_time);
> -            PROF_ADD(prof, orig, tb_count1);
> -            PROF_ADD(prof, orig, tb_count);
> -            PROF_ADD(prof, orig, op_count);
> -            PROF_MAX(prof, orig, op_count_max);
> -            PROF_ADD(prof, orig, temp_count);
> -            PROF_MAX(prof, orig, temp_count_max);
> -            PROF_ADD(prof, orig, del_op_count);
> -            PROF_ADD(prof, orig, code_in_len);
> -            PROF_ADD(prof, orig, code_out_len);
> -            PROF_ADD(prof, orig, search_out_len);
>              PROF_ADD(prof, orig, interm_time);
>              PROF_ADD(prof, orig, code_time);
>              PROF_ADD(prof, orig, la_time);
> @@ -4003,26 +3993,17 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
>      TCGOp *op;
>
>      s->current_tb = tb;
> -
> -#ifdef CONFIG_PROFILER
> -    {
> +    if (tb_stats_enabled(tb, TB_JIT_STATS)) {
>          int n = 0;
>
>          QTAILQ_FOREACH(op, &s->ops, link) {
>              n++;
>          }
> -        atomic_set(&prof->op_count, prof->op_count + n);
> -        if (n > prof->op_count_max) {
> -            atomic_set(&prof->op_count_max, n);
> -        }
> +        atomic_add(&tb->tb_stats->code.num_tcg_ops, n);
>
>          n = s->nb_temps;
> -        atomic_set(&prof->temp_count, prof->temp_count + n);
> -        if (n > prof->temp_count_max) {
> -            atomic_set(&prof->temp_count_max, n);
> -        }
> +        atomic_add(&tb->tb_stats->code.temps, n);
>      }
> -#endif
>
>  #ifdef DEBUG_DISAS
>      if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP)
> @@ -4229,70 +4210,16 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
>      return tcg_current_code_size(s);
>  }
>
> -#ifdef CONFIG_PROFILER
>  void tcg_dump_info(void)
>  {
> +    TCGProfile *s = NULL;
> +#ifdef CONFIG_PROFILER
>      TCGProfile prof = {};
> -    const TCGProfile *s;
> -    int64_t tb_count;
> -    int64_t tb_div_count;
> -    int64_t tot;
> -
>      tcg_profile_snapshot_counters(&prof);
>      s = &prof;
> -    tb_count = s->tb_count;
> -    tb_div_count = tb_count ? tb_count : 1;
> -    tot = s->interm_time + s->code_time;
> -
> -    qemu_printf("JIT cycles          %" PRId64 " (%0.3f s at 2.4 GHz)\n",
> -                tot, tot / 2.4e9);
> -    qemu_printf("translated TBs      %" PRId64 " (aborted=%" PRId64
> -                " %0.1f%%)\n",
> -                tb_count, s->tb_count1 - tb_count,
> -                (double)(s->tb_count1 - s->tb_count)
> -                / (s->tb_count1 ? s->tb_count1 : 1) * 100.0);
> -    qemu_printf("avg ops/TB          %0.1f max=%d\n",
> -                (double)s->op_count / tb_div_count, s->op_count_max);
> -    qemu_printf("deleted ops/TB      %0.2f\n",
> -                (double)s->del_op_count / tb_div_count);
> -    qemu_printf("avg temps/TB        %0.2f max=%d\n",
> -                (double)s->temp_count / tb_div_count, s->temp_count_max);
> -    qemu_printf("avg host code/TB    %0.1f\n",
> -                (double)s->code_out_len / tb_div_count);
> -    qemu_printf("avg search data/TB  %0.1f\n",
> -                (double)s->search_out_len / tb_div_count);
> -
> -    qemu_printf("cycles/op           %0.1f\n",
> -                s->op_count ? (double)tot / s->op_count : 0);
> -    qemu_printf("cycles/in byte      %0.1f\n",
> -                s->code_in_len ? (double)tot / s->code_in_len : 0);
> -    qemu_printf("cycles/out byte     %0.1f\n",
> -                s->code_out_len ? (double)tot / s->code_out_len : 0);
> -    qemu_printf("cycles/search byte     %0.1f\n",
> -                s->search_out_len ? (double)tot / s->search_out_len : 0);
> -    if (tot == 0) {
> -        tot = 1;
> -    }
> -    qemu_printf("  gen_interm time   %0.1f%%\n",
> -                (double)s->interm_time / tot * 100.0);
> -    qemu_printf("  gen_code time     %0.1f%%\n",
> -                (double)s->code_time / tot * 100.0);
> -    qemu_printf("optim./code time    %0.1f%%\n",
> -                (double)s->opt_time / (s->code_time ? s->code_time : 1)
> -                * 100.0);
> -    qemu_printf("liveness/code time  %0.1f%%\n",
> -                (double)s->la_time / (s->code_time ? s->code_time : 1) * 100.0);
> -    qemu_printf("cpu_restore count   %" PRId64 "\n",
> -                s->restore_count);
> -    qemu_printf("  avg cycles        %0.1f\n",
> -                s->restore_count ? (double)s->restore_time / s->restore_count : 0);
> -}
> -#else
> -void tcg_dump_info(void)
> -{
> -    qemu_printf("[TCG profiler not compiled]\n");
> -}
>  #endif
> +    dump_jit_profile_info(s);
> +}
>
>  #ifdef ELF_HOST_MACHINE
>  /* In order to use this feature, the backend needs to do three things:
> diff --git a/tcg/tcg.h b/tcg/tcg.h
> index bf6f3bcba3..026a066b9a 100644
> --- a/tcg/tcg.h
> +++ b/tcg/tcg.h
> @@ -644,16 +644,6 @@ QEMU_BUILD_BUG_ON(NB_OPS > (1 << 8));
>
>  typedef struct TCGProfile {
>      int64_t cpu_exec_time;
> -    int64_t tb_count1;
> -    int64_t tb_count;
> -    int64_t op_count; /* total insn count */
> -    int op_count_max; /* max insn per TB */
> -    int temp_count_max;
> -    int64_t temp_count;
> -    int64_t del_op_count;
> -    int64_t code_in_len;
> -    int64_t code_out_len;
> -    int64_t search_out_len;
>      int64_t interm_time;
>      int64_t code_time;
>      int64_t la_time;


--
Alex Bennée
diff mbox series

Patch

diff --git a/accel/tcg/Makefile.objs b/accel/tcg/Makefile.objs
index d381a02f34..49ffe81b5d 100644
--- a/accel/tcg/Makefile.objs
+++ b/accel/tcg/Makefile.objs
@@ -2,7 +2,7 @@  obj-$(CONFIG_SOFTMMU) += tcg-all.o
 obj-$(CONFIG_SOFTMMU) += cputlb.o
 obj-y += tcg-runtime.o tcg-runtime-gvec.o
 obj-y += cpu-exec.o cpu-exec-common.o translate-all.o
-obj-y += translator.o
+obj-y += translator.o tb-stats.o
 
 obj-$(CONFIG_USER_ONLY) += user-exec.o
 obj-$(call lnot,$(CONFIG_SOFTMMU)) += user-exec-stub.o
diff --git a/accel/tcg/tb-stats.c b/accel/tcg/tb-stats.c
new file mode 100644
index 0000000000..44497d4f9b
--- /dev/null
+++ b/accel/tcg/tb-stats.c
@@ -0,0 +1,107 @@ 
+#include "qemu/osdep.h"
+
+#include "disas/disas.h"
+#include "exec/exec-all.h"
+#include "tcg.h"
+
+#include "qemu/qemu-print.h"
+
+struct jit_profile_info {
+    uint64_t translations;
+    uint64_t aborted;
+    uint64_t ops;
+    unsigned ops_max;
+    uint64_t del_ops;
+    uint64_t temps;
+    unsigned temps_max;
+    uint64_t host;
+    uint64_t guest;
+    uint64_t host_ins;
+    uint64_t search_data;
+};
+
+/* accumulate the statistics from all TBs */
+static void collect_jit_profile_info(void *p, uint32_t hash, void *userp)
+{
+    struct jit_profile_info *jpi = userp;
+    TBStatistics *tbs = p;
+
+    jpi->translations += tbs->translations.total;
+    jpi->ops += tbs->code.num_tcg_ops;
+    if (tbs->translations.total && tbs->code.num_tcg_ops / tbs->translations.total
+            > jpi->ops_max) {
+        jpi->ops_max = tbs->code.num_tcg_ops / tbs->translations.total;
+    }
+    jpi->del_ops += tbs->code.deleted_ops;
+    jpi->temps += tbs->code.temps;
+    if (tbs->translations.total && tbs->code.temps / tbs->translations.total >
+            jpi->temps_max) {
+        jpi->temps_max = tbs->code.temps / tbs->translations.total;
+    }
+    jpi->host += tbs->code.out_len;
+    jpi->guest += tbs->code.in_len;
+    jpi->host_ins += tbs->code.num_host_inst;
+    jpi->search_data += tbs->code.search_out_len;
+}
+
+/* dump JIT statisticis using TCGProfile and TBStats */
+void dump_jit_profile_info(TCGProfile *s)
+{
+    if (!tb_stats_collection_enabled()) {
+        return;
+    }
+
+    struct jit_profile_info *jpi = g_new0(struct jit_profile_info, 1);
+
+    qht_iter(&tb_ctx.tb_stats, collect_jit_profile_info, jpi);
+
+    if (jpi->translations) {
+        qemu_printf("translated TBs      %" PRId64 "\n", jpi->translations);
+        qemu_printf("avg ops/TB          %0.1f max=%d\n",
+                jpi->ops / (double) jpi->translations, jpi->ops_max);
+        qemu_printf("deleted ops/TB      %0.2f\n",
+                jpi->del_ops / (double) jpi->translations);
+        qemu_printf("avg temps/TB        %0.2f max=%d\n",
+                jpi->temps / (double) jpi->translations, jpi->temps_max);
+        qemu_printf("avg host code/TB    %0.1f\n",
+                jpi->host / (double) jpi->translations);
+        qemu_printf("avg host ins/TB     %0.1f\n",
+                jpi->host_ins / (double) jpi->translations);
+        qemu_printf("avg search data/TB  %0.1f\n",
+                jpi->search_data / (double) jpi->translations);
+
+        if (s) {
+            int64_t tot = s->interm_time + s->code_time;
+            qemu_printf("JIT cycles          %" PRId64 " (%0.3f s at 2.4 GHz)\n",
+                            tot, tot / 2.4e9);
+            qemu_printf("cycles/op           %0.1f\n",
+                        jpi->ops ? (double)tot / jpi->ops : 0);
+            qemu_printf("cycles/in byte      %0.1f\n",
+                        jpi->guest ? (double)tot / jpi->guest : 0);
+            qemu_printf("cycles/out byte     %0.1f\n",
+                        jpi->host ? (double)tot / jpi->host : 0);
+            qemu_printf("cycles/out inst     %0.1f\n",
+                        jpi->host_ins ? (double)tot / jpi->host_ins : 0);
+            qemu_printf("cycles/search byte     %0.1f\n",
+                        jpi->search_data ? (double)tot / jpi->search_data : 0);
+            if (tot == 0) {
+                tot = 1;
+            }
+            qemu_printf("  gen_interm time   %0.1f%%\n",
+                        (double)s->interm_time / tot * 100.0);
+            qemu_printf("  gen_code time     %0.1f%%\n",
+                        (double)s->code_time / tot * 100.0);
+            qemu_printf("optim./code time    %0.1f%%\n",
+                        (double)s->opt_time / (s->code_time ? s->code_time : 1)
+                        * 100.0);
+            qemu_printf("liveness/code time  %0.1f%%\n",
+                    (double)s->la_time / (s->code_time ? s->code_time : 1) * 100.0);
+            qemu_printf("cpu_restore count   %" PRId64 "\n",
+                    s->restore_count);
+            qemu_printf("  avg cycles        %0.1f\n",
+                    s->restore_count ? (double)s->restore_time / s->restore_count : 0);
+        }
+    }
+}
+
+
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index 3a47ac6f2c..bbdb0f23a4 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -1773,8 +1773,6 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
  tb_overflow:
 
 #ifdef CONFIG_PROFILER
-    /* includes aborted translations because of exceptions */
-    atomic_set(&prof->tb_count1, prof->tb_count1 + 1);
     ti = profile_getclock();
 #endif
 
@@ -1822,7 +1820,6 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
     }
 
 #ifdef CONFIG_PROFILER
-    atomic_set(&prof->tb_count, prof->tb_count + 1);
     atomic_set(&prof->interm_time, prof->interm_time + profile_getclock() - ti);
     ti = profile_getclock();
 #endif
@@ -1869,12 +1866,13 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
 
 #ifdef CONFIG_PROFILER
     atomic_set(&prof->code_time, prof->code_time + profile_getclock() - ti);
-    atomic_set(&prof->code_in_len, prof->code_in_len + tb->size);
-    atomic_set(&prof->code_out_len, prof->code_out_len + gen_code_size);
-    atomic_set(&prof->search_out_len, prof->search_out_len + search_size);
 #endif
 
     if (tb_stats_enabled(tb, TB_JIT_STATS)) {
+        atomic_add(&tb->tb_stats->code.in_len, tb->size);
+        atomic_add(&tb->tb_stats->code.out_len, gen_code_size);
+        atomic_add(&tb->tb_stats->code.search_out_len, search_size);
+
         size_t code_size = gen_code_size;
         if (tcg_ctx->data_gen_ptr) {
             code_size = tcg_ctx->data_gen_ptr - tb->tc.ptr;
diff --git a/include/exec/tb-stats.h b/include/exec/tb-stats.h
index 6584b78c4a..7d775f2c0d 100644
--- a/include/exec/tb-stats.h
+++ b/include/exec/tb-stats.h
@@ -40,6 +40,13 @@  struct TBStatistics {
         unsigned num_tcg_ops;
         unsigned num_tcg_ops_opt;
         unsigned spills;
+
+        /* CONFIG_PROFILE */
+        unsigned temps;
+        unsigned deleted_ops;
+        unsigned in_len;
+        unsigned out_len;
+        unsigned search_out_len;
     } code;
 
     struct {
@@ -51,4 +58,6 @@  struct TBStatistics {
 
 bool tb_stats_cmp(const void *ap, const void *bp);
 
+void dump_jit_profile_info(TCGProfile *s);
+
 #endif
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 446e3d1708..46b31a2f68 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -2295,9 +2295,9 @@  void tcg_op_remove(TCGContext *s, TCGOp *op)
     QTAILQ_INSERT_TAIL(&s->free_ops, op, link);
     s->nb_ops--;
 
-#ifdef CONFIG_PROFILER
-    atomic_set(&s->prof.del_op_count, s->prof.del_op_count + 1);
-#endif
+    if (tb_stats_enabled(s->current_tb, TB_JIT_STATS)) {
+        atomic_inc(&s->current_tb->tb_stats->code.deleted_ops);
+    }
 }
 
 static TCGOp *tcg_op_alloc(TCGOpcode opc)
@@ -3914,16 +3914,6 @@  void tcg_profile_snapshot(TCGProfile *prof, bool counters, bool table)
 
         if (counters) {
             PROF_ADD(prof, orig, cpu_exec_time);
-            PROF_ADD(prof, orig, tb_count1);
-            PROF_ADD(prof, orig, tb_count);
-            PROF_ADD(prof, orig, op_count);
-            PROF_MAX(prof, orig, op_count_max);
-            PROF_ADD(prof, orig, temp_count);
-            PROF_MAX(prof, orig, temp_count_max);
-            PROF_ADD(prof, orig, del_op_count);
-            PROF_ADD(prof, orig, code_in_len);
-            PROF_ADD(prof, orig, code_out_len);
-            PROF_ADD(prof, orig, search_out_len);
             PROF_ADD(prof, orig, interm_time);
             PROF_ADD(prof, orig, code_time);
             PROF_ADD(prof, orig, la_time);
@@ -4003,26 +3993,17 @@  int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
     TCGOp *op;
 
     s->current_tb = tb;
-
-#ifdef CONFIG_PROFILER
-    {
+    if (tb_stats_enabled(tb, TB_JIT_STATS)) {
         int n = 0;
 
         QTAILQ_FOREACH(op, &s->ops, link) {
             n++;
         }
-        atomic_set(&prof->op_count, prof->op_count + n);
-        if (n > prof->op_count_max) {
-            atomic_set(&prof->op_count_max, n);
-        }
+        atomic_add(&tb->tb_stats->code.num_tcg_ops, n);
 
         n = s->nb_temps;
-        atomic_set(&prof->temp_count, prof->temp_count + n);
-        if (n > prof->temp_count_max) {
-            atomic_set(&prof->temp_count_max, n);
-        }
+        atomic_add(&tb->tb_stats->code.temps, n);
     }
-#endif
 
 #ifdef DEBUG_DISAS
     if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP)
@@ -4229,70 +4210,16 @@  int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
     return tcg_current_code_size(s);
 }
 
-#ifdef CONFIG_PROFILER
 void tcg_dump_info(void)
 {
+    TCGProfile *s = NULL;
+#ifdef CONFIG_PROFILER
     TCGProfile prof = {};
-    const TCGProfile *s;
-    int64_t tb_count;
-    int64_t tb_div_count;
-    int64_t tot;
-
     tcg_profile_snapshot_counters(&prof);
     s = &prof;
-    tb_count = s->tb_count;
-    tb_div_count = tb_count ? tb_count : 1;
-    tot = s->interm_time + s->code_time;
-
-    qemu_printf("JIT cycles          %" PRId64 " (%0.3f s at 2.4 GHz)\n",
-                tot, tot / 2.4e9);
-    qemu_printf("translated TBs      %" PRId64 " (aborted=%" PRId64
-                " %0.1f%%)\n",
-                tb_count, s->tb_count1 - tb_count,
-                (double)(s->tb_count1 - s->tb_count)
-                / (s->tb_count1 ? s->tb_count1 : 1) * 100.0);
-    qemu_printf("avg ops/TB          %0.1f max=%d\n",
-                (double)s->op_count / tb_div_count, s->op_count_max);
-    qemu_printf("deleted ops/TB      %0.2f\n",
-                (double)s->del_op_count / tb_div_count);
-    qemu_printf("avg temps/TB        %0.2f max=%d\n",
-                (double)s->temp_count / tb_div_count, s->temp_count_max);
-    qemu_printf("avg host code/TB    %0.1f\n",
-                (double)s->code_out_len / tb_div_count);
-    qemu_printf("avg search data/TB  %0.1f\n",
-                (double)s->search_out_len / tb_div_count);
-    
-    qemu_printf("cycles/op           %0.1f\n",
-                s->op_count ? (double)tot / s->op_count : 0);
-    qemu_printf("cycles/in byte      %0.1f\n",
-                s->code_in_len ? (double)tot / s->code_in_len : 0);
-    qemu_printf("cycles/out byte     %0.1f\n",
-                s->code_out_len ? (double)tot / s->code_out_len : 0);
-    qemu_printf("cycles/search byte     %0.1f\n",
-                s->search_out_len ? (double)tot / s->search_out_len : 0);
-    if (tot == 0) {
-        tot = 1;
-    }
-    qemu_printf("  gen_interm time   %0.1f%%\n",
-                (double)s->interm_time / tot * 100.0);
-    qemu_printf("  gen_code time     %0.1f%%\n",
-                (double)s->code_time / tot * 100.0);
-    qemu_printf("optim./code time    %0.1f%%\n",
-                (double)s->opt_time / (s->code_time ? s->code_time : 1)
-                * 100.0);
-    qemu_printf("liveness/code time  %0.1f%%\n",
-                (double)s->la_time / (s->code_time ? s->code_time : 1) * 100.0);
-    qemu_printf("cpu_restore count   %" PRId64 "\n",
-                s->restore_count);
-    qemu_printf("  avg cycles        %0.1f\n",
-                s->restore_count ? (double)s->restore_time / s->restore_count : 0);
-}
-#else
-void tcg_dump_info(void)
-{
-    qemu_printf("[TCG profiler not compiled]\n");
-}
 #endif
+    dump_jit_profile_info(s);
+}
 
 #ifdef ELF_HOST_MACHINE
 /* In order to use this feature, the backend needs to do three things:
diff --git a/tcg/tcg.h b/tcg/tcg.h
index bf6f3bcba3..026a066b9a 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -644,16 +644,6 @@  QEMU_BUILD_BUG_ON(NB_OPS > (1 << 8));
 
 typedef struct TCGProfile {
     int64_t cpu_exec_time;
-    int64_t tb_count1;
-    int64_t tb_count;
-    int64_t op_count; /* total insn count */
-    int op_count_max; /* max insn per TB */
-    int temp_count_max;
-    int64_t temp_count;
-    int64_t del_op_count;
-    int64_t code_in_len;
-    int64_t code_out_len;
-    int64_t search_out_len;
     int64_t interm_time;
     int64_t code_time;
     int64_t la_time;