@@ -47,7 +47,9 @@ void gen_intermediate_code(CPUArchState *env, struct TranslationBlock *tb);
void restore_state_to_opc(CPUArchState *env, struct TranslationBlock *tb,
target_ulong *data);
-void cpu_gen_init(void);
+#ifdef CONFIG_SOFTMMU
+void cpu_gen_init(int cpu_index);
+#endif
bool cpu_restore_state(CPUState *cpu, uintptr_t searched_pc);
void QEMU_NORETURN cpu_loop_exit_noexc(CPUState *cpu);
@@ -76,10 +76,10 @@ typedef struct DisasCompare {
} DisasCompare;
/* Share the TCG temporaries common between 32 and 64 bit modes. */
-extern TCGv_env cpu_env;
-extern TCGv_i32 cpu_NF, cpu_ZF, cpu_CF, cpu_VF;
-extern TCGv_i64 cpu_exclusive_addr;
-extern TCGv_i64 cpu_exclusive_val;
+extern TCG_THREAD TCGv_env cpu_env;
+extern TCG_THREAD TCGv_i32 cpu_NF, cpu_ZF, cpu_CF, cpu_VF;
+extern TCG_THREAD TCGv_i64 cpu_exclusive_addr;
+extern TCG_THREAD TCGv_i64 cpu_exclusive_val;
static inline int arm_dc_feature(DisasContext *dc, int feature)
{
@@ -727,7 +727,13 @@ struct TCGContext {
target_ulong gen_insn_data[TCG_MAX_INSNS][TARGET_INSN_START_WORDS];
};
-extern TCGContext tcg_ctx;
+#ifdef CONFIG_SOFTMMU
+#define TCG_THREAD __thread
+#else
+#define TCG_THREAD
+#endif
+
+extern TCG_THREAD TCGContext tcg_ctx;
extern bool parallel_cpus;
static inline void tcg_set_insn_param(int op_idx, int arg, TCGArg v)
@@ -887,7 +893,7 @@ typedef struct TCGOpDef {
#endif
} TCGOpDef;
-extern TCGOpDef tcg_op_defs[];
+extern TCG_THREAD TCGOpDef tcg_op_defs[];
extern const size_t tcg_op_defs_max;
typedef struct TCGTargetOpDef {
@@ -58,6 +58,7 @@
#include "qemu/main-loop.h"
#include "exec/log.h"
#include "sysemu/cpus.h"
+#include "sysemu/sysemu.h"
/* #define DEBUG_TB_INVALIDATE */
/* #define DEBUG_TB_FLUSH */
@@ -132,9 +133,12 @@ static int v_l2_levels;
static void *l1_map[V_L1_MAX_SIZE];
/* code generation context */
-TCGContext tcg_ctx;
+TCG_THREAD TCGContext tcg_ctx;
TBContext tb_ctx;
bool parallel_cpus;
+#ifdef CONFIG_SOFTMMU
+static TCGContext *tcg_common_ctx;
+#endif
/* translation block context */
__thread int have_tb_lock;
@@ -186,10 +190,35 @@ void tb_lock_reset(void)
static TranslationBlock *tb_find_pc(uintptr_t tc_ptr);
-void cpu_gen_init(void)
+#ifdef CONFIG_SOFTMMU
+
+/* XXX, see below */
+void arm_translate_init(void);
+
+void cpu_gen_init(int cpu_index)
{
- tcg_context_init(&tcg_ctx);
+ uintptr_t addr;
+ size_t size;
+
+ tcg_context_init(&tcg_ctx);
+ size = tcg_common_ctx->code_gen_buffer_size / smp_cpus;
+ assert(!(tcg_common_ctx->code_gen_buffer_size % smp_cpus));
+ addr = (uintptr_t)tcg_common_ctx->code_gen_buffer;
+ addr += size * cpu_index;
+ tcg_ctx.code_gen_buffer = (void *)addr;
+ tcg_ctx.code_gen_buffer_size = size;
+ tcg_prologue_init(&tcg_ctx);
+ /*
+ * XXX find a proper place to init the TCG globals. This should be trivial
+ * once the "generic translation loop" work is finished.
+ *
+ * Note that initialising the TCG globals (that are __thread variables
+ * in full-system mode) from a *_cpu_initfn is not a viable option, since
+ * this function is called before the vCPU threads are created.
+ */
+ arm_translate_init();
}
+#endif
/* Encode VAL as a signed leb128 sequence at P.
Return P incremented past the encoded value. */
@@ -561,6 +590,18 @@ static inline size_t size_code_gen_buffer(size_t tb_size)
if (tb_size > MAX_CODE_GEN_BUFFER_SIZE) {
tb_size = MAX_CODE_GEN_BUFFER_SIZE;
}
+#ifdef CONFIG_SOFTMMU
+ {
+ size_t per_cpu = tb_size / smp_cpus;
+
+ if (per_cpu < MIN_CODE_GEN_BUFFER_SIZE) {
+ tb_size = MIN_CODE_GEN_BUFFER_SIZE * smp_cpus;
+ per_cpu = MIN_CODE_GEN_BUFFER_SIZE;
+ }
+ /* make sure tb_size is a multiple of smp_cpus */
+ tb_size = per_cpu * smp_cpus;
+ }
+#endif
return tb_size;
}
@@ -810,20 +851,21 @@ static void tb_htable_init(void)
size. */
void tcg_exec_init(unsigned long tb_size)
{
- cpu_gen_init();
page_init();
tb_htable_init();
code_gen_alloc(tb_size);
#if defined(CONFIG_SOFTMMU)
- /* There's no guest base to take into account, so go ahead and
- initialize the prologue now. */
- tcg_prologue_init(&tcg_ctx);
+ tcg_common_ctx = &tcg_ctx;
#endif
}
bool tcg_enabled(void)
{
+#ifdef CONFIG_SOFTMMU
+ return tcg_common_ctx->code_gen_buffer != NULL;
+#else
return tcg_ctx.code_gen_buffer != NULL;
+#endif
}
/*
@@ -1307,6 +1307,7 @@ static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
CPUState *cpu = arg;
rcu_register_thread();
+ cpu_gen_init(cpu->cpu_index);
qemu_mutex_lock_iothread();
qemu_thread_get_self(cpu->thread);
@@ -1454,6 +1455,7 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)
g_assert(!use_icount);
rcu_register_thread();
+ cpu_gen_init(cpu->cpu_index);
qemu_mutex_lock_iothread();
qemu_thread_get_self(cpu->thread);
@@ -469,7 +469,7 @@ static void arm_cpu_initfn(Object *obj)
{
CPUState *cs = CPU(obj);
ARMCPU *cpu = ARM_CPU(obj);
- static bool inited;
+ static bool inited __attribute__((unused));
cs->env_ptr = &cpu->env;
cpu->cp_regs = g_hash_table_new_full(g_int_hash, g_int_equal,
@@ -511,10 +511,12 @@ static void arm_cpu_initfn(Object *obj)
if (tcg_enabled()) {
cpu->psci_version = 2; /* TCG implements PSCI 0.2 */
+#ifndef CONFIG_SOFTMMU
if (!inited) {
inited = true;
arm_translate_init();
}
+#endif
}
}
@@ -36,11 +36,11 @@
#include "trace-tcg.h"
-static TCGv_i64 cpu_X[32];
-static TCGv_i64 cpu_pc;
+static TCG_THREAD TCGv_i64 cpu_X[32];
+static TCG_THREAD TCGv_i64 cpu_pc;
/* Load/store exclusive handling */
-static TCGv_i64 cpu_exclusive_high;
+static TCG_THREAD TCGv_i64 cpu_exclusive_high;
static TCGv_i64 cpu_reg(DisasContext *s, int reg);
static const char *regnames[] = {
@@ -58,17 +58,17 @@
#define IS_USER(s) (s->user)
#endif
-TCGv_env cpu_env;
+TCG_THREAD TCGv_env cpu_env;
/* We reuse the same 64-bit temporaries for efficiency. */
-static TCGv_i64 cpu_V0, cpu_V1, cpu_M0;
-static TCGv_i32 cpu_R[16];
-TCGv_i32 cpu_CF, cpu_NF, cpu_VF, cpu_ZF;
-TCGv_i64 cpu_exclusive_addr;
-TCGv_i64 cpu_exclusive_val;
+static TCG_THREAD TCGv_i64 cpu_V0, cpu_V1, cpu_M0;
+static TCG_THREAD TCGv_i32 cpu_R[16];
+TCG_THREAD TCGv_i32 cpu_CF, cpu_NF, cpu_VF, cpu_ZF;
+TCG_THREAD TCGv_i64 cpu_exclusive_addr;
+TCG_THREAD TCGv_i64 cpu_exclusive_val;
/* FIXME: These should be removed. */
-static TCGv_i32 cpu_F0s, cpu_F1s;
-static TCGv_i64 cpu_F0d, cpu_F1d;
+static TCG_THREAD TCGv_i32 cpu_F0s, cpu_F1s;
+static TCG_THREAD TCGv_i64 cpu_F0d, cpu_F1d;
#include "exec/gen-icount.h"
@@ -146,7 +146,7 @@ static bool have_lzcnt;
# define have_lzcnt 0
#endif
-static tcg_insn_unit *tb_ret_addr;
+static TCG_THREAD tcg_insn_unit *tb_ret_addr;
static void patch_reloc(tcg_insn_unit *code_ptr, int type,
intptr_t value, intptr_t addend)
@@ -31,7 +31,7 @@
uintptr_t tci_tb_ptr;
#endif
-TCGOpDef tcg_op_defs[] = {
+TCG_THREAD TCGOpDef tcg_op_defs[] = {
#define DEF(s, oargs, iargs, cargs, flags) \
{ #s, oargs, iargs, cargs, iargs + oargs + cargs, flags },
#include "tcg-opc.h"
@@ -117,8 +117,8 @@ static bool tcg_out_tb_finalize(TCGContext *s);
-static TCGRegSet tcg_target_available_regs[2];
-static TCGRegSet tcg_target_call_clobber_regs;
+static TCG_THREAD TCGRegSet tcg_target_available_regs[2];
+static TCG_THREAD TCGRegSet tcg_target_call_clobber_regs;
#if TCG_TARGET_INSN_UNIT_SIZE == 1
static __attribute__((unused)) inline void tcg_out8(TCGContext *s, uint8_t v)
@@ -320,7 +320,7 @@ static const TCGHelperInfo all_helpers[] = {
#include "exec/helper-tcg.h"
};
-static int indirect_reg_alloc_order[ARRAY_SIZE(tcg_target_reg_alloc_order)];
+static TCG_THREAD int indirect_reg_alloc_order[ARRAY_SIZE(tcg_target_reg_alloc_order)];
static void process_op_defs(TCGContext *s);
void tcg_context_init(TCGContext *s)
This will allow us to generate TCG code in parallel. User-mode is kept out of this: contention due to concurrent translation is more commonly found in full-system mode (e.g. booting a many-core guest). XXX: For now, only convert arm/a64, since these are the only guests that have proper MTTCG support. XXX: arm_translate_init needs to be called from a proper place. XXX: TCG profiling info and statistics are broken by this XXX: This is calling prologue_init once per vCPU, i.e. each TCGContext gets a different prologue/epilogue (all of them with the same contents though). Far from ideal, but for an experiment it "should" work, right? XXX: Giving the same amount of code_gen_buffer to each vCPU is certainly a bad idea. A "page-like" allocation policy would be better, e.g. give chunks of 1MB to each vCPU as they need it. But for now I'm just trying to see whether this can ever work. XXX: After allowing tb_gen_code to run in parallel (see next patch), crashes due to races in TCG code are found very quickly with -smp > 1 (e.g. "tcg/tcg.c:233: tcg_out_label: Assertion `!l->has_value' failed.") Note that with -smp 1 it works fine; with smp > 1 I can make it fail later with "taskset -c 0", so clearly there is a race going on. Signed-off-by: Emilio G. Cota <cota@braap.org> --- include/exec/exec-all.h | 4 +++- target/arm/translate.h | 8 +++---- tcg/tcg.h | 10 +++++++-- accel/tcg/translate-all.c | 56 ++++++++++++++++++++++++++++++++++++++++------ cpus.c | 2 ++ target/arm/cpu.c | 4 +++- target/arm/translate-a64.c | 6 ++--- target/arm/translate.c | 16 ++++++------- tcg/i386/tcg-target.inc.c | 2 +- tcg/tcg-common.c | 2 +- tcg/tcg.c | 6 ++--- 11 files changed, 85 insertions(+), 31 deletions(-)