Message ID | 20190614135332.12777-3-vandersonmr2@gmail.com |
---|---|
State | New |
Headers | show |
Series | Collecting TB Execution Frequency | expand |
On 6/14/19 6:53 AM, vandersonmr wrote: > A new hash map was added to store the accumulated execution > frequency of the TBs even after tb_flush events. A dump > function was also added as a way to visualize these frequencies. > > Signed-off-by: vandersonmr <vandersonmr2@gmail.com> > --- > accel/tcg/translate-all.c | 59 +++++++++++++++++++++++++++++++++++++++ > accel/tcg/translate-all.h | 2 ++ > exec.c | 7 +++++ > include/exec/exec-all.h | 3 ++ > include/exec/tb-context.h | 9 ++++++ > include/qom/cpu.h | 2 ++ > 6 files changed, 82 insertions(+) > > diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c > index 5d1e08b169..0bc670ffad 100644 > --- a/accel/tcg/translate-all.c > +++ b/accel/tcg/translate-all.c > @@ -1118,6 +1118,12 @@ static inline void code_gen_alloc(size_t tb_size) > } > } > > +static bool statistics_cmp(const void* ap, const void *bp) { Watch the formatting. > +static void do_tb_dump_exec_freq(void *p, uint32_t hash, void *userp) > +{ > +#if TARGET_LONG_SIZE == 8 > + TBStatistics *tbs = p; > + printf("%016lx\t%lu\n", tbs->pc, tbs->total_exec_freq); > +#elif TARGET_LONG_SIZE == 4 > + TBStatistics *tbs = p; > + printf("%016x\t%lu\n", tbs->pc, tbs->total_exec_freq); > +#endif > +} TARGET_FMT_lx. > +static void do_tb_read_exec_freq(void *p, uint32_t hash, void *userp) > +{ > + TranslationBlock *tb = p; > + TBStatistics tbscmp; > + tbscmp.pc = tb->pc; > + tbscmp.cs_base = tb->cs_base; > + tbscmp.flags = tb->flags; > + > + TBStatistics *tbs = qht_lookup(userp, &tbscmp, hash); > + > + uint64_t exec_freq = tb_get_and_reset_exec_freq((TranslationBlock*) p); > + > + if (tbs) { > + tbs->total_exec_freq += exec_freq; > + } else { > + void *existing; > + tbs = malloc(sizeof(TBStatistics)); > + tbs->total_exec_freq = exec_freq; > + tbs->pc = tb->pc; > + tbs->cs_base = tb->cs_base; > + tbs->flags = tb->flags; > + qht_insert(userp, tbs, hash, &existing); If you're going to ignore the result, leave the last argument NULL. > + } > +} > + > +void tb_read_exec_freq(void) > +{ > + qht_iter(&tb_ctx.htable, do_tb_read_exec_freq, &tb_ctx.tb_statistics); > +} Perhaps a comment that this is called with mmap_lock held. > +extern bool enable_freq_count; Second declaration. > +uint64_t tb_get_and_reset_exec_freq(TranslationBlock *tb) > +{ > + uint64_t exec_freq = atomic_load_acquire(&tb->exec_freq); > + atomic_store_release(&tb->exec_freq, 0); > + return exec_freq; > +} What are you intending here? Either this needs a comment that it is called with a lock held, and this does not need barriers at all (atomic_read, atomic_set). Or this should use atomic_xchg and do the load and store in one atomic operation. r~
vandersonmr <vandersonmr2@gmail.com> writes: > A new hash map was added to store the accumulated execution > frequency of the TBs even after tb_flush events. A dump > function was also added as a way to visualize these frequencies. > > Signed-off-by: vandersonmr <vandersonmr2@gmail.com> I forgot to mention the formatting looks a little off here. It should be your full name (accents and all) before the email address, e.g: Signed-off-by: Alex Bennée <alex.bennee@linaro.org> > --- > accel/tcg/translate-all.c | 59 +++++++++++++++++++++++++++++++++++++++ > accel/tcg/translate-all.h | 2 ++ > exec.c | 7 +++++ > include/exec/exec-all.h | 3 ++ > include/exec/tb-context.h | 9 ++++++ > include/qom/cpu.h | 2 ++ > 6 files changed, 82 insertions(+) > > diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c > index 5d1e08b169..0bc670ffad 100644 > --- a/accel/tcg/translate-all.c > +++ b/accel/tcg/translate-all.c > @@ -1118,6 +1118,12 @@ static inline void code_gen_alloc(size_t tb_size) > } > } > > +static bool statistics_cmp(const void* ap, const void *bp) { > + const TBStatistics *a = ap; > + const TBStatistics *b = bp; > + return a->pc == b->pc && a->cs_base == b->cs_base && a->flags == > b->flags; Looking at tb_cmp() bellow this I wonder if we should also take into account the page_address values. Some TB's will be translated over a page boundary and in theory that can change with new mappings so we need to ensure they are really equivalent. > +} > + > static bool tb_cmp(const void *ap, const void *bp) > { > const TranslationBlock *a = ap; > @@ -1137,6 +1143,7 @@ static void tb_htable_init(void) > unsigned int mode = QHT_MODE_AUTO_RESIZE; > > qht_init(&tb_ctx.htable, tb_cmp, CODE_GEN_HTABLE_SIZE, mode); > + qht_init(&tb_ctx.tb_statistics, statistics_cmp, CODE_GEN_HTABLE_SIZE, QHT_MODE_AUTO_RESIZE); > } > > /* Must be called before using the QEMU cpus. 'tb_size' is the size > @@ -1228,6 +1235,53 @@ static gboolean tb_host_size_iter(gpointer key, gpointer value, gpointer data) > return false; > } > > +static void do_tb_dump_exec_freq(void *p, uint32_t hash, void *userp) > +{ > +#if TARGET_LONG_SIZE == 8 > + TBStatistics *tbs = p; > + printf("%016lx\t%lu\n", tbs->pc, tbs->total_exec_freq); > +#elif TARGET_LONG_SIZE == 4 > + TBStatistics *tbs = p; > + printf("%016x\t%lu\n", tbs->pc, tbs->total_exec_freq); > +#endif > +} This will need some fixing up so deal with output to the HMP monitor. We don't want to be directly spamming stdout with results. > + > +void tb_dump_all_exec_freq(void) > +{ > + tb_read_exec_freq(); > + qht_iter(&tb_ctx.tb_statistics, do_tb_dump_exec_freq, NULL); > +} I would be tempted to insert these into a sorted GList first so we can dump the first N hotblocks. > + > +static void do_tb_read_exec_freq(void *p, uint32_t hash, void *userp) > +{ > + TranslationBlock *tb = p; > + TBStatistics tbscmp; > + tbscmp.pc = tb->pc; > + tbscmp.cs_base = tb->cs_base; > + tbscmp.flags = tb->flags; > + > + TBStatistics *tbs = qht_lookup(userp, &tbscmp, hash); > + > + uint64_t exec_freq = tb_get_and_reset_exec_freq((TranslationBlock*) p); > + > + if (tbs) { > + tbs->total_exec_freq += exec_freq; > + } else { > + void *existing; > + tbs = malloc(sizeof(TBStatistics)); use g_new0(TBStatistics, 1) > + tbs->total_exec_freq = exec_freq; > + tbs->pc = tb->pc; > + tbs->cs_base = tb->cs_base; > + tbs->flags = tb->flags; > + qht_insert(userp, tbs, hash, &existing); > + } > +} > + Comment here about what we are doing? Maybe tb_copy_old_counts()? > +void tb_read_exec_freq(void) > +{ > + qht_iter(&tb_ctx.htable, do_tb_read_exec_freq, &tb_ctx.tb_statistics); > +} > + > /* flush all the translation blocks */ > static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count) > { > @@ -1252,6 +1306,10 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count) > cpu_tb_jmp_cache_clear(cpu); > } > > + if (enable_freq_count) { > + tb_read_exec_freq(); > + } > + > qht_reset_size(&tb_ctx.htable, CODE_GEN_HTABLE_SIZE); > page_flush_tb(); > > @@ -1723,6 +1781,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu, > tb->flags = flags; > tb->cflags = cflags; > tb->trace_vcpu_dstate = *cpu->trace_dstate; > + tb->exec_freq = 0; > tcg_ctx->tb_cflags = cflags; > tb_overflow: > > diff --git a/accel/tcg/translate-all.h b/accel/tcg/translate-all.h > index 64f5fd9a05..e13088c36d 100644 > --- a/accel/tcg/translate-all.h > +++ b/accel/tcg/translate-all.h > @@ -32,6 +32,8 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end, > int is_cpu_write_access); > void tb_check_watchpoint(CPUState *cpu); > > +extern bool enable_freq_count; > + > #ifdef CONFIG_USER_ONLY > int page_unprotect(target_ulong address, uintptr_t pc); > #endif > diff --git a/exec.c b/exec.c > index e7622d1956..9b64a012ee 100644 > --- a/exec.c > +++ b/exec.c > @@ -1013,6 +1013,13 @@ const char *parse_cpu_option(const char *cpu_option) > return cpu_type; > } > > +uint64_t tb_get_and_reset_exec_freq(TranslationBlock *tb) > +{ > + uint64_t exec_freq = atomic_load_acquire(&tb->exec_freq); > + atomic_store_release(&tb->exec_freq, 0); I'm not sure you need acquire/release semantics here as you are only reading this in an exclusive period when all in-flight updates should be synced (locks are implicit barriers). Folding this up into do_tb_read_exec_freq might make this clearer. > + return exec_freq; > +} > + > #if defined(CONFIG_USER_ONLY) > void tb_invalidate_phys_addr(target_ulong addr) > { > diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h > index a80407622e..5d3d829d18 100644 > --- a/include/exec/exec-all.h > +++ b/include/exec/exec-all.h > @@ -513,4 +513,7 @@ hwaddr memory_region_section_get_iotlb(CPUState *cpu, > /* vl.c */ > extern int singlestep; > > +void tb_read_exec_freq(void); > +void tb_dump_all_exec_freq(void); > + > #endif > diff --git a/include/exec/tb-context.h b/include/exec/tb-context.h > index feb585e0a7..ba518d47a0 100644 > --- a/include/exec/tb-context.h > +++ b/include/exec/tb-context.h > @@ -28,6 +28,14 @@ > > typedef struct TranslationBlock TranslationBlock; > typedef struct TBContext TBContext; > +typedef struct TBStatistics TBStatistics; > + > +struct TBStatistics { > + target_ulong pc; > + target_ulong cs_base; > + uint32_t flags; > + uint64_t total_exec_freq; > +}; > > struct TBContext { > > @@ -35,6 +43,7 @@ struct TBContext { > > /* statistics */ > unsigned tb_flush_count; > + struct qht tb_statistics; > }; > > extern TBContext tb_ctx; > diff --git a/include/qom/cpu.h b/include/qom/cpu.h > index 5ee0046b62..593c1f1137 100644 > --- a/include/qom/cpu.h > +++ b/include/qom/cpu.h > @@ -474,6 +474,8 @@ static inline void cpu_tb_jmp_cache_clear(CPUState *cpu) > } > } > > +uint64_t tb_get_and_reset_exec_freq(struct TranslationBlock*); > + > /** > * qemu_tcg_mttcg_enabled: > * Check whether we are running MultiThread TCG or not. -- Alex Bennée
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c index 5d1e08b169..0bc670ffad 100644 --- a/accel/tcg/translate-all.c +++ b/accel/tcg/translate-all.c @@ -1118,6 +1118,12 @@ static inline void code_gen_alloc(size_t tb_size) } } +static bool statistics_cmp(const void* ap, const void *bp) { + const TBStatistics *a = ap; + const TBStatistics *b = bp; + return a->pc == b->pc && a->cs_base == b->cs_base && a->flags == b->flags; +} + static bool tb_cmp(const void *ap, const void *bp) { const TranslationBlock *a = ap; @@ -1137,6 +1143,7 @@ static void tb_htable_init(void) unsigned int mode = QHT_MODE_AUTO_RESIZE; qht_init(&tb_ctx.htable, tb_cmp, CODE_GEN_HTABLE_SIZE, mode); + qht_init(&tb_ctx.tb_statistics, statistics_cmp, CODE_GEN_HTABLE_SIZE, QHT_MODE_AUTO_RESIZE); } /* Must be called before using the QEMU cpus. 'tb_size' is the size @@ -1228,6 +1235,53 @@ static gboolean tb_host_size_iter(gpointer key, gpointer value, gpointer data) return false; } +static void do_tb_dump_exec_freq(void *p, uint32_t hash, void *userp) +{ +#if TARGET_LONG_SIZE == 8 + TBStatistics *tbs = p; + printf("%016lx\t%lu\n", tbs->pc, tbs->total_exec_freq); +#elif TARGET_LONG_SIZE == 4 + TBStatistics *tbs = p; + printf("%016x\t%lu\n", tbs->pc, tbs->total_exec_freq); +#endif +} + +void tb_dump_all_exec_freq(void) +{ + tb_read_exec_freq(); + qht_iter(&tb_ctx.tb_statistics, do_tb_dump_exec_freq, NULL); +} + +static void do_tb_read_exec_freq(void *p, uint32_t hash, void *userp) +{ + TranslationBlock *tb = p; + TBStatistics tbscmp; + tbscmp.pc = tb->pc; + tbscmp.cs_base = tb->cs_base; + tbscmp.flags = tb->flags; + + TBStatistics *tbs = qht_lookup(userp, &tbscmp, hash); + + uint64_t exec_freq = tb_get_and_reset_exec_freq((TranslationBlock*) p); + + if (tbs) { + tbs->total_exec_freq += exec_freq; + } else { + void *existing; + tbs = malloc(sizeof(TBStatistics)); + tbs->total_exec_freq = exec_freq; + tbs->pc = tb->pc; + tbs->cs_base = tb->cs_base; + tbs->flags = tb->flags; + qht_insert(userp, tbs, hash, &existing); + } +} + +void tb_read_exec_freq(void) +{ + qht_iter(&tb_ctx.htable, do_tb_read_exec_freq, &tb_ctx.tb_statistics); +} + /* flush all the translation blocks */ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count) { @@ -1252,6 +1306,10 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count) cpu_tb_jmp_cache_clear(cpu); } + if (enable_freq_count) { + tb_read_exec_freq(); + } + qht_reset_size(&tb_ctx.htable, CODE_GEN_HTABLE_SIZE); page_flush_tb(); @@ -1723,6 +1781,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu, tb->flags = flags; tb->cflags = cflags; tb->trace_vcpu_dstate = *cpu->trace_dstate; + tb->exec_freq = 0; tcg_ctx->tb_cflags = cflags; tb_overflow: diff --git a/accel/tcg/translate-all.h b/accel/tcg/translate-all.h index 64f5fd9a05..e13088c36d 100644 --- a/accel/tcg/translate-all.h +++ b/accel/tcg/translate-all.h @@ -32,6 +32,8 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end, int is_cpu_write_access); void tb_check_watchpoint(CPUState *cpu); +extern bool enable_freq_count; + #ifdef CONFIG_USER_ONLY int page_unprotect(target_ulong address, uintptr_t pc); #endif diff --git a/exec.c b/exec.c index e7622d1956..9b64a012ee 100644 --- a/exec.c +++ b/exec.c @@ -1013,6 +1013,13 @@ const char *parse_cpu_option(const char *cpu_option) return cpu_type; } +uint64_t tb_get_and_reset_exec_freq(TranslationBlock *tb) +{ + uint64_t exec_freq = atomic_load_acquire(&tb->exec_freq); + atomic_store_release(&tb->exec_freq, 0); + return exec_freq; +} + #if defined(CONFIG_USER_ONLY) void tb_invalidate_phys_addr(target_ulong addr) { diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index a80407622e..5d3d829d18 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -513,4 +513,7 @@ hwaddr memory_region_section_get_iotlb(CPUState *cpu, /* vl.c */ extern int singlestep; +void tb_read_exec_freq(void); +void tb_dump_all_exec_freq(void); + #endif diff --git a/include/exec/tb-context.h b/include/exec/tb-context.h index feb585e0a7..ba518d47a0 100644 --- a/include/exec/tb-context.h +++ b/include/exec/tb-context.h @@ -28,6 +28,14 @@ typedef struct TranslationBlock TranslationBlock; typedef struct TBContext TBContext; +typedef struct TBStatistics TBStatistics; + +struct TBStatistics { + target_ulong pc; + target_ulong cs_base; + uint32_t flags; + uint64_t total_exec_freq; +}; struct TBContext { @@ -35,6 +43,7 @@ struct TBContext { /* statistics */ unsigned tb_flush_count; + struct qht tb_statistics; }; extern TBContext tb_ctx; diff --git a/include/qom/cpu.h b/include/qom/cpu.h index 5ee0046b62..593c1f1137 100644 --- a/include/qom/cpu.h +++ b/include/qom/cpu.h @@ -474,6 +474,8 @@ static inline void cpu_tb_jmp_cache_clear(CPUState *cpu) } } +uint64_t tb_get_and_reset_exec_freq(struct TranslationBlock*); + /** * qemu_tcg_mttcg_enabled: * Check whether we are running MultiThread TCG or not.
A new hash map was added to store the accumulated execution frequency of the TBs even after tb_flush events. A dump function was also added as a way to visualize these frequencies. Signed-off-by: vandersonmr <vandersonmr2@gmail.com> --- accel/tcg/translate-all.c | 59 +++++++++++++++++++++++++++++++++++++++ accel/tcg/translate-all.h | 2 ++ exec.c | 7 +++++ include/exec/exec-all.h | 3 ++ include/exec/tb-context.h | 9 ++++++ include/qom/cpu.h | 2 ++ 6 files changed, 82 insertions(+)