@@ -25,9 +25,105 @@
#include "qemu/osdep.h"
#include "disas/disas.h"
+#include "exec/exec-all.h"
+#include "tcg.h"
+
+#include "qemu/qemu-print.h"
#include "exec/tb-stats.h"
+struct jit_profile_info { /* totals gathered over every TBStatistics entry */
+ uint64_t translations; /* sum of translations.total */
+ uint64_t aborted; /* NOTE(review): never written or read in the code shown */
+ uint64_t ops; /* sum of code.num_tcg_ops */
+ unsigned ops_max; /* largest per-translation TCG op count of any entry */
+ uint64_t del_ops; /* sum of code.deleted_ops */
+ uint64_t temps; /* sum of code.temps */
+ unsigned temps_max; /* largest per-translation temp count of any entry */
+ uint64_t host; /* sum of code.out_len */
+ uint64_t guest; /* sum of code.in_len */
+ uint64_t search_data; /* sum of code.search_out_len */
+};
+
+/* qht_iter() callback: fold one TBStatistics entry into the running totals */
+static void collect_jit_profile_info(void *p, uint32_t hash, void *userp)
+{
+    TBStatistics *entry = p;
+    struct jit_profile_info *sum = userp;
+    unsigned ops_avg = stat_per_translation(entry, code.num_tcg_ops);
+    unsigned tmp_avg = stat_per_translation(entry, code.temps);
+
+    sum->translations += entry->translations.total;
+    sum->ops += entry->code.num_tcg_ops;
+    sum->del_ops += entry->code.deleted_ops;
+    sum->temps += entry->code.temps;
+    sum->host += entry->code.out_len;
+    sum->guest += entry->code.in_len;
+    sum->search_data += entry->code.search_out_len;
+
+    /* remember the largest per-translation values seen so far */
+    sum->ops_max = ops_avg > sum->ops_max ? ops_avg : sum->ops_max;
+    sum->temps_max = tmp_avg > sum->temps_max ? tmp_avg : sum->temps_max;
+}
+
+/* Dump JIT statistics from TBStats and, if @s is set, TCGProfile timings */
+void dump_jit_profile_info(TCGProfile *s)
+{
+    struct jit_profile_info jpi = { 0 }; /* small enough for the stack */
+
+    if (!tb_stats_collection_enabled()) {
+        return;
+    }
+
+    qht_iter(&tb_ctx.tb_stats, collect_jit_profile_info, &jpi);
+
+    if (jpi.translations) { /* divisor of every per-TB average below */
+        qemu_printf("translated TBs %" PRIu64 "\n", jpi.translations);
+        qemu_printf("avg ops/TB %0.1f max=%u\n",
+                    jpi.ops / (double)jpi.translations, jpi.ops_max);
+        qemu_printf("deleted ops/TB %0.2f\n",
+                    jpi.del_ops / (double)jpi.translations);
+        qemu_printf("avg temps/TB %0.2f max=%u\n",
+                    jpi.temps / (double)jpi.translations, jpi.temps_max);
+        qemu_printf("avg host code/TB %0.1f\n",
+                    jpi.host / (double)jpi.translations);
+        qemu_printf("avg search data/TB %0.1f\n",
+                    jpi.search_data / (double)jpi.translations);
+
+        if (s) {
+            int64_t tot = s->interm_time + s->code_time;
+            int64_t code_div = s->code_time ? s->code_time : 1;
+
+            qemu_printf("JIT cycles %" PRId64 " (%0.3f s at 2.4 GHz)\n",
+                        tot, tot / 2.4e9);
+            qemu_printf("cycles/op %0.1f\n",
+                        jpi.ops ? (double)tot / jpi.ops : 0);
+            qemu_printf("cycles/in byte %0.1f\n",
+                        jpi.guest ? (double)tot / jpi.guest : 0);
+            qemu_printf("cycles/out byte %0.1f\n",
+                        jpi.host ? (double)tot / jpi.host : 0);
+            qemu_printf("cycles/search byte %0.1f\n",
+                        jpi.search_data ? (double)tot / jpi.search_data : 0);
+            if (tot == 0) {
+                tot = 1; /* keep the percentages below well defined */
+            }
+            qemu_printf(" gen_interm time %0.1f%%\n",
+                        (double)s->interm_time / tot * 100.0);
+            qemu_printf(" gen_code time %0.1f%%\n",
+                        (double)s->code_time / tot * 100.0);
+            qemu_printf("optim./code time %0.1f%%\n",
+                        (double)s->opt_time / code_div * 100.0);
+            qemu_printf("liveness/code time %0.1f%%\n",
+                        (double)s->la_time / code_div * 100.0);
+            qemu_printf("cpu_restore count %" PRId64 "\n",
+                        s->restore_count);
+            qemu_printf(" avg cycles %0.1f\n", s->restore_count ?
+                        (double)s->restore_time / s->restore_count : 0);
+        }
+    }
+}
+
+
void init_tb_stats_htable_if_not(void)
{
if (tb_stats_collection_enabled() && !tb_ctx.tb_stats.map) {
@@ -1774,8 +1774,6 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
tb_overflow:
#ifdef CONFIG_PROFILER
- /* includes aborted translations because of exceptions */
- atomic_set(&prof->tb_count1, prof->tb_count1 + 1);
ti = profile_getclock();
#endif
@@ -1823,7 +1821,6 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
}
#ifdef CONFIG_PROFILER
- atomic_set(&prof->tb_count, prof->tb_count + 1);
atomic_set(&prof->interm_time, prof->interm_time + profile_getclock() - ti);
ti = profile_getclock();
#endif
@@ -1870,13 +1867,12 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
#ifdef CONFIG_PROFILER
atomic_set(&prof->code_time, prof->code_time + profile_getclock() - ti);
- atomic_set(&prof->code_in_len, prof->code_in_len + tb->size);
- atomic_set(&prof->code_out_len, prof->code_out_len + gen_code_size);
- atomic_set(&prof->search_out_len, prof->search_out_len + search_size);
#endif
if (tb_stats_enabled(tb, TB_JIT_STATS)) {
+ atomic_add(&tb->tb_stats->code.in_len, tb->size);
atomic_add(&tb->tb_stats->code.out_len, gen_code_size);
+ atomic_add(&tb->tb_stats->code.search_out_len, search_size);
}
#ifdef DEBUG_DISAS
@@ -33,6 +33,9 @@
#define tb_stats_enabled(tb, JIT_STATS) \
(tb && tb->tb_stats && (tb->tb_stats->stats_enabled & JIT_STATS))
+/* @stat's @name counter per translation (integer division), 0 if none */
+#define stat_per_translation(stat, name) \
+    ((stat)->translations.total ? (stat)->name / (stat)->translations.total : 0)
typedef struct TBStatistics TBStatistics;
/*
@@ -63,7 +66,13 @@ struct TBStatistics {
unsigned num_tcg_ops;
unsigned num_tcg_ops_opt;
unsigned spills;
+
+ /* counters formerly collected under CONFIG_PROFILER */
+ unsigned temps;
+ unsigned deleted_ops;
+ unsigned in_len;
unsigned out_len;
+ unsigned search_out_len;
} code;
struct {
@@ -80,6 +89,8 @@ bool tb_stats_cmp(const void *ap, const void *bp);
void init_tb_stats_htable_if_not(void);
+void dump_jit_profile_info(TCGProfile *s);
+
/* TBStatistic collection controls */
enum TBStatsStatus { TB_STATS_RUNNING, TB_STATS_PAUSED, TB_STATS_STOPPED };
@@ -2294,9 +2294,9 @@ void tcg_op_remove(TCGContext *s, TCGOp *op)
QTAILQ_INSERT_TAIL(&s->free_ops, op, link);
s->nb_ops--;
-#ifdef CONFIG_PROFILER
- atomic_set(&s->prof.del_op_count, s->prof.del_op_count + 1);
-#endif
+ if (tb_stats_enabled(s->current_tb, TB_JIT_STATS)) {
+ atomic_inc(&s->current_tb->tb_stats->code.deleted_ops);
+ }
}
static TCGOp *tcg_op_alloc(TCGOpcode opc)
@@ -3913,16 +3913,6 @@ void tcg_profile_snapshot(TCGProfile *prof, bool counters, bool table)
if (counters) {
PROF_ADD(prof, orig, cpu_exec_time);
- PROF_ADD(prof, orig, tb_count1);
- PROF_ADD(prof, orig, tb_count);
- PROF_ADD(prof, orig, op_count);
- PROF_MAX(prof, orig, op_count_max);
- PROF_ADD(prof, orig, temp_count);
- PROF_MAX(prof, orig, temp_count_max);
- PROF_ADD(prof, orig, del_op_count);
- PROF_ADD(prof, orig, code_in_len);
- PROF_ADD(prof, orig, code_out_len);
- PROF_ADD(prof, orig, search_out_len);
PROF_ADD(prof, orig, interm_time);
PROF_ADD(prof, orig, code_time);
PROF_ADD(prof, orig, la_time);
@@ -4002,26 +3992,17 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
TCGOp *op;
s->current_tb = tb;
-
-#ifdef CONFIG_PROFILER
- {
+ if (tb_stats_enabled(tb, TB_JIT_STATS)) {
int n = 0;
QTAILQ_FOREACH(op, &s->ops, link) {
n++;
}
- atomic_set(&prof->op_count, prof->op_count + n);
- if (n > prof->op_count_max) {
- atomic_set(&prof->op_count_max, n);
- }
+ atomic_add(&tb->tb_stats->code.num_tcg_ops, n);
n = s->nb_temps;
- atomic_set(&prof->temp_count, prof->temp_count + n);
- if (n > prof->temp_count_max) {
- atomic_set(&prof->temp_count_max, n);
- }
+ atomic_add(&tb->tb_stats->code.temps, n);
}
-#endif
#ifdef DEBUG_DISAS
if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP)
@@ -4228,70 +4209,16 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
return tcg_current_code_size(s);
}
-#ifdef CONFIG_PROFILER
void tcg_dump_info(void)
{
+ TCGProfile *s = NULL; /* stays NULL when CONFIG_PROFILER is not defined */
+#ifdef CONFIG_PROFILER
TCGProfile prof = {};
- const TCGProfile *s;
- int64_t tb_count;
- int64_t tb_div_count;
- int64_t tot;
-
tcg_profile_snapshot_counters(&prof);
s = &prof;
- tb_count = s->tb_count;
- tb_div_count = tb_count ? tb_count : 1;
- tot = s->interm_time + s->code_time;
-
- qemu_printf("JIT cycles %" PRId64 " (%0.3f s at 2.4 GHz)\n",
- tot, tot / 2.4e9);
- qemu_printf("translated TBs %" PRId64 " (aborted=%" PRId64
- " %0.1f%%)\n",
- tb_count, s->tb_count1 - tb_count,
- (double)(s->tb_count1 - s->tb_count)
- / (s->tb_count1 ? s->tb_count1 : 1) * 100.0);
- qemu_printf("avg ops/TB %0.1f max=%d\n",
- (double)s->op_count / tb_div_count, s->op_count_max);
- qemu_printf("deleted ops/TB %0.2f\n",
- (double)s->del_op_count / tb_div_count);
- qemu_printf("avg temps/TB %0.2f max=%d\n",
- (double)s->temp_count / tb_div_count, s->temp_count_max);
- qemu_printf("avg host code/TB %0.1f\n",
- (double)s->code_out_len / tb_div_count);
- qemu_printf("avg search data/TB %0.1f\n",
- (double)s->search_out_len / tb_div_count);
-
- qemu_printf("cycles/op %0.1f\n",
- s->op_count ? (double)tot / s->op_count : 0);
- qemu_printf("cycles/in byte %0.1f\n",
- s->code_in_len ? (double)tot / s->code_in_len : 0);
- qemu_printf("cycles/out byte %0.1f\n",
- s->code_out_len ? (double)tot / s->code_out_len : 0);
- qemu_printf("cycles/search byte %0.1f\n",
- s->search_out_len ? (double)tot / s->search_out_len : 0);
- if (tot == 0) {
- tot = 1;
- }
- qemu_printf(" gen_interm time %0.1f%%\n",
- (double)s->interm_time / tot * 100.0);
- qemu_printf(" gen_code time %0.1f%%\n",
- (double)s->code_time / tot * 100.0);
- qemu_printf("optim./code time %0.1f%%\n",
- (double)s->opt_time / (s->code_time ? s->code_time : 1)
- * 100.0);
- qemu_printf("liveness/code time %0.1f%%\n",
- (double)s->la_time / (s->code_time ? s->code_time : 1) * 100.0);
- qemu_printf("cpu_restore count %" PRId64 "\n",
- s->restore_count);
- qemu_printf(" avg cycles %0.1f\n",
- s->restore_count ? (double)s->restore_time / s->restore_count : 0);
-}
-#else
-void tcg_dump_info(void)
-{
- qemu_printf("[TCG profiler not compiled]\n");
-}
#endif
+ dump_jit_profile_info(s); /* returns early if TB stats are off */
+}
#ifdef ELF_HOST_MACHINE
/* In order to use this feature, the backend needs to do three things:
@@ -646,16 +646,6 @@ QEMU_BUILD_BUG_ON(NB_OPS > (1 << 8));
typedef struct TCGProfile {
int64_t cpu_exec_time;
- int64_t tb_count1;
- int64_t tb_count;
- int64_t op_count; /* total insn count */
- int op_count_max; /* max insn per TB */
- int temp_count_max;
- int64_t temp_count;
- int64_t del_op_count;
- int64_t code_in_len;
- int64_t code_out_len;
- int64_t search_out_len;
int64_t interm_time;
int64_t code_time;
int64_t la_time;