@@ -9922,3 +9922,8 @@ uint32_t HELPER(crc32c)(uint32_t acc, uint32_t val, uint32_t bytes)
/* Linux crc32c converts the output to one's complement. */
return crc32c(acc, buf, bytes) ^ 0xffffffff;
}
+/*
+ * Report whether tb_from_jmp_cache() finds an entry for @vaddr.
+ *
+ * @env:   CPU state, passed through unchanged to tb_from_jmp_cache().
+ * @vaddr: address to look up; the translator passes the destination of
+ *         the jump being generated.
+ *
+ * Returns 1 when the lookup result is non-zero and 0 otherwise, so the
+ * generated code can branch on the value directly.
+ */
+uint32_t HELPER(cross_page_check)(CPUARMState *env, target_ulong vaddr)
+{
+ return !!tb_from_jmp_cache(env, vaddr);
+}
@@ -1,6 +1,8 @@
DEF_HELPER_FLAGS_1(sxtb16, TCG_CALL_NO_RWG_SE, i32, i32)
DEF_HELPER_FLAGS_1(uxtb16, TCG_CALL_NO_RWG_SE, i32, i32)
+DEF_HELPER_2(cross_page_check, i32, env, tl) /* i32 result (0 or 1); args: env, target-long address */
+
DEF_HELPER_3(add_setq, i32, env, i32, i32)
DEF_HELPER_3(add_saturate, i32, env, i32, i32)
DEF_HELPER_3(sub_saturate, i32, env, i32, i32)
@@ -4085,6 +4085,18 @@ static inline void gen_goto_tb(DisasContext *s, int n, target_ulong dest)
gen_set_pc_im(s, dest);
tcg_gen_exit_tb((uintptr_t)s->tb + n);
} else {
+ TCGv vaddr = tcg_const_tl(dest);
+ TCGv_i32 valid = tcg_temp_new_i32();
+ TCGLabel *label = gen_new_label();
+
+ gen_helper_cross_page_check(valid, cpu_env, vaddr);
+ tcg_temp_free(vaddr);
+ tcg_gen_brcondi_i32(TCG_COND_EQ, valid, 0, label);
+ tcg_temp_free_i32(valid);
+ tcg_gen_goto_tb(n);
+ gen_set_pc_im(s, dest);
+ tcg_gen_exit_tb((uintptr_t)s->tb + n);
+ gen_set_label(label);
gen_set_pc_im(s, dest);
tcg_gen_exit_tb(0);
}
Instead of unconditionally exiting to the exec loop, add a helper to
check whether the target TB is valid. As long as the hit rate in
tb_jmp_cache remains high, this improves performance.

Measurements:

- Boot time of ARM Debian jessie on Intel host:

| setup              | ARM debian boot+shutdown time | stddev |
|--------------------+-------------------------------+--------|
| master             |                  10.050247057 | 0.0361 |
| +cross             |                  10.311265443 | 0.0721 |

  That is a 2.58% slowdown when booting. This is reasonable given
  that tb_jmp_cache's hit rate when booting is expected to be low.

- NBench, arm-softmmu. Host: Intel i7-4790K @ 4.00GHz
  (y axis: Speedup over 95b31d70)

1.3x+-+--------------------------------------------------------------+-+
| cross+noinline $$$ |
| cross+inline %%% |
| $$$%% |
1.2x+-+.................$.$.%.......$$$..............................+-+
| $ $ % $ $% |
| $ $ % $ $% |
1.1x+-+.................$.$.%.......$.$%.............................+-+
| $$$%% $ $ % $ $% |
| $ $ % $ $ % $ $% $$$%% $$$%% $$$%% |
| $$$%% $$$%% $ $ % $ $ % $$$%% $ $% $ $ % %%% $ $ % $ $ % |
1x+-$.$B%R$R$A%G$A$H%T$M$_%P$L$i%l$n$%.$.$.%...%.%.$$$%%.$.$.%.$.$.%-+
| $ $ % $ $ % $ $ % $ $ % $ $ % $ $% $ $ % % % $ $ % $ $ % $ $ % |
| $ $ % $ $ % $ $ % $ $ % $ $ % $ $% $ $ % % % $ $ % $ $ % $ $ % |
0.9x+-$.$.%.$.$.%.$.$.%.$.$.%.$.$.%.$.$%.$.$.%...%.%.$.$.%.$.$.%.$.$.%-+
| $ $ % $ $ % $ $ % $ $ % $ $ % $ $% $ $ % % % $ $ % $ $ % $ $ % |
| $ $ % $ $ % $ $ % $ $ % $ $ % $ $% $ $ % $$$ % $ $ % $ $ % $ $ % |
| $ $ % $ $ % $ $ % $ $ % $ $ % $ $% $ $ % $ $ % $ $ % $ $ % $ $ % |
0.8x+-$$$%%-$$$%%-$$$%%-$$$%%-$$$%%-$$$%-$$$%%-$$$%%-$$$%%-$$$%%-$$$%%-+
ASSIGNM BITFIEL FOU FP_EMULAT HUFFMA LU_DECOMP NEURA NUMERIC STRING_SO hmean

  png: http://imgur.com/1rmYSaF

That is, a 4.04% hmean perf improvement over master with
tb_from_jmp_cache not inlined, and a 5.82% hmean perf improvement over
master with tb_from_jmp_cache inlined (i.e. this commit). The largest
improvement is 21% for the FP_EMULATION benchmark.
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
 target/arm/helper.c    |  5 +++++
 target/arm/helper.h    |  2 ++
 target/arm/translate.c | 12 ++++++++++++
 3 files changed, 19 insertions(+)