Message ID | 1443589786-26929-24-git-send-email-rth@twiddle.net |
---|---|
State | New |
Headers | show |
On 2015-09-30 15:09, Richard Henderson wrote: > By putting the prologue at the end, we risk overwriting the > prologue should our estimate of maximum TB size. Given the > two different placements of the call to tcg_prologue_init, > move the high water mark computation into tcg_prologue_init. > > Reviewed-by: Peter Maydell <peter.maydell@linaro.org> > Signed-off-by: Richard Henderson <rth@twiddle.net> > --- > tcg/tcg.c | 35 ++++++++++++++++++++++++++++------- > translate-all.c | 28 +++++++++------------------- > 2 files changed, 37 insertions(+), 26 deletions(-) Good idea to move it. I have done some experiments with putting slow path "helpers" in the prologue, and I ended up going over the 1024-byte limit. > diff --git a/tcg/tcg.c b/tcg/tcg.c > index d3693b1..5609108 100644 > --- a/tcg/tcg.c > +++ b/tcg/tcg.c > @@ -363,17 +363,38 @@ void tcg_context_init(TCGContext *s) > > void tcg_prologue_init(TCGContext *s) > { > - /* init global prologue and epilogue */ > - s->code_buf = s->code_gen_prologue; > - s->code_ptr = s->code_buf; > + size_t prologue_size, total_size; > + void *buf0, *buf1; > + > + /* Put the prologue at the beginning of code_gen_buffer. */ > + buf0 = s->code_gen_buffer; > + s->code_ptr = buf0; > + s->code_buf = buf0; > + s->code_gen_prologue = buf0; > + > + /* Generate the prologue. */ > tcg_target_qemu_prologue(s); > - flush_icache_range((uintptr_t)s->code_buf, (uintptr_t)s->code_ptr); > + buf1 = s->code_ptr; > + flush_icache_range((uintptr_t)buf0, (uintptr_t)buf1); > + > + /* Deduct the prologue from the buffer. */ > + prologue_size = tcg_current_code_size(s); > + s->code_gen_ptr = buf1; > + s->code_gen_buffer = buf1; > + s->code_buf = buf1; > + total_size = s->code_gen_buffer_size - prologue_size; > + s->code_gen_buffer_size = total_size; > + > + /* Compute a high-water mark, at which we voluntarily flush the > + buffer and start over.
*/ > + s->code_gen_buffer_max_size = total_size - TCG_MAX_OP_SIZE * OPC_BUF_SIZE; > + > + tcg_register_jit(s->code_gen_buffer, total_size); I am not sure why you moved these 2 lines there; I think they belong in code_gen_alloc() so that the heuristics stay in the same place. total_size is available in s->code_gen_buffer_size, so that should be doable. > #ifdef DEBUG_DISAS > if (qemu_loglevel_mask(CPU_LOG_TB_OUT_ASM)) { > - size_t size = tcg_current_code_size(s); > - qemu_log("PROLOGUE: [size=%zu]\n", size); > - log_disas(s->code_buf, size); > + qemu_log("PROLOGUE: [size=%zu]\n", prologue_size); > + log_disas(buf0, prologue_size); > qemu_log("\n"); > qemu_log_flush(); > } > diff --git a/translate-all.c b/translate-all.c > index 3454f4e..0e8d176 100644 > --- a/translate-all.c > +++ b/translate-all.c > @@ -690,23 +690,15 @@ static inline void code_gen_alloc(size_t tb_size) > } > > qemu_madvise(tcg_ctx.code_gen_buffer, tcg_ctx.code_gen_buffer_size, > - QEMU_MADV_HUGEPAGE); > - > - /* Steal room for the prologue at the end of the buffer. This ensures > - (via the MAX_CODE_GEN_BUFFER_SIZE limits above) that direct branches > - from TB's to the prologue are going to be in range. It also means > - that we don't need to mark (additional) portions of the data segment > - as executable. */ > - tcg_ctx.code_gen_prologue = tcg_ctx.code_gen_buffer + > - tcg_ctx.code_gen_buffer_size - 1024; > - tcg_ctx.code_gen_buffer_size -= 1024; > - > - tcg_ctx.code_gen_buffer_max_size = tcg_ctx.code_gen_buffer_size - > - (TCG_MAX_OP_SIZE * OPC_BUF_SIZE); > - tcg_ctx.code_gen_max_blocks = tcg_ctx.code_gen_buffer_size / > - CODE_GEN_AVG_BLOCK_SIZE; > - tcg_ctx.tb_ctx.tbs = > - g_malloc(tcg_ctx.code_gen_max_blocks * sizeof(TranslationBlock)); > + QEMU_MADV_HUGEPAGE); > + > + /* Estimate a good size for the number of TBs we can support. We > + still haven't deducted the prologue from the buffer size here, > + but that's minimal and won't affect the estimate much.
*/ > + tcg_ctx.code_gen_max_blocks > + = tcg_ctx.code_gen_buffer_size / CODE_GEN_AVG_BLOCK_SIZE; > + tcg_ctx.tb_ctx.tbs = g_new(TranslationBlock, tcg_ctx.code_gen_max_blocks); > + > qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock); > } > > @@ -717,8 +709,6 @@ void tcg_exec_init(unsigned long tb_size) > { > cpu_gen_init(); > code_gen_alloc(tb_size); > - tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer; > - tcg_register_jit(tcg_ctx.code_gen_buffer, tcg_ctx.code_gen_buffer_size); > page_init(); > #if defined(CONFIG_SOFTMMU) > /* There's no guest base to take into account, so go ahead and Otherwise the patch looks fine to me.
On 10/01/2015 02:17 AM, Aurelien Jarno wrote: > On 2015-09-30 15:09, Richard Henderson wrote: >> By putting the prologue at the end, we risk overwriting the >> prologue should our estimate of maximum TB size. Given the >> two different placements of the call to tcg_prologue_init, >> move the high water mark computation into tcg_prologue_init. >> >> Reviewed-by: Peter Maydell <peter.maydell@linaro.org> >> Signed-off-by: Richard Henderson <rth@twiddle.net> >> --- >> tcg/tcg.c | 35 ++++++++++++++++++++++++++++------- >> translate-all.c | 28 +++++++++------------------- >> 2 files changed, 37 insertions(+), 26 deletions(-) > > Good idea to move it. I have done some experiments with putting slow > path "helpers" in the prologue, and I ended-up going over the 1024 > bytes limits. > >> diff --git a/tcg/tcg.c b/tcg/tcg.c >> index d3693b1..5609108 100644 >> --- a/tcg/tcg.c >> +++ b/tcg/tcg.c >> @@ -363,17 +363,38 @@ void tcg_context_init(TCGContext *s) >> >> void tcg_prologue_init(TCGContext *s) >> { >> - /* init global prologue and epilogue */ >> - s->code_buf = s->code_gen_prologue; >> - s->code_ptr = s->code_buf; >> + size_t prologue_size, total_size; >> + void *buf0, *buf1; >> + >> + /* Put the prologue at the beginning of code_gen_buffer. */ >> + buf0 = s->code_gen_buffer; >> + s->code_ptr = buf0; >> + s->code_buf = buf0; >> + s->code_gen_prologue = buf0; >> + >> + /* Generate the prologue. */ >> tcg_target_qemu_prologue(s); >> - flush_icache_range((uintptr_t)s->code_buf, (uintptr_t)s->code_ptr); >> + buf1 = s->code_ptr; >> + flush_icache_range((uintptr_t)buf0, (uintptr_t)buf1); >> + >> + /* Deduct the prologue from the buffer. */ >> + prologue_size = tcg_current_code_size(s); >> + s->code_gen_ptr = buf1; >> + s->code_gen_buffer = buf1; >> + s->code_buf = buf1; >> + total_size = s->code_gen_buffer_size - prologue_size; >> + s->code_gen_buffer_size = total_size; >> + >> + /* Compute a high-water mark, at which we voluntarily flush the >> + buffer and start over. 
*/ >> + s->code_gen_buffer_max_size = total_size - TCG_MAX_OP_SIZE * OPC_BUF_SIZE; >> + >> + tcg_register_jit(s->code_gen_buffer, total_size); > > I am not sure why you moved this 2 lines there, I think they have more > their place in code_gen_alloc() so that the heuristics stay at the same > place. total_size is available in s->code_gen_buffer_size, so that > should be doable. Because it was done too early for user-only. There, the sequence is code_gen_alloc place guest image, setting GUEST_BASE tcg_prologue_init We need the guest base for generating the prologue on most targets. Although clearly a better ordering is to place the guest image *first* before doing anything else. None of the other large allocations (including tbs) really care where they're placed. Something to fix later, or should I do a v5? r~
diff --git a/tcg/tcg.c b/tcg/tcg.c index d3693b1..5609108 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -363,17 +363,38 @@ void tcg_context_init(TCGContext *s) void tcg_prologue_init(TCGContext *s) { - /* init global prologue and epilogue */ - s->code_buf = s->code_gen_prologue; - s->code_ptr = s->code_buf; + size_t prologue_size, total_size; + void *buf0, *buf1; + + /* Put the prologue at the beginning of code_gen_buffer. */ + buf0 = s->code_gen_buffer; + s->code_ptr = buf0; + s->code_buf = buf0; + s->code_gen_prologue = buf0; + + /* Generate the prologue. */ tcg_target_qemu_prologue(s); - flush_icache_range((uintptr_t)s->code_buf, (uintptr_t)s->code_ptr); + buf1 = s->code_ptr; + flush_icache_range((uintptr_t)buf0, (uintptr_t)buf1); + + /* Deduct the prologue from the buffer. */ + prologue_size = tcg_current_code_size(s); + s->code_gen_ptr = buf1; + s->code_gen_buffer = buf1; + s->code_buf = buf1; + total_size = s->code_gen_buffer_size - prologue_size; + s->code_gen_buffer_size = total_size; + + /* Compute a high-water mark, at which we voluntarily flush the + buffer and start over. */ + s->code_gen_buffer_max_size = total_size - TCG_MAX_OP_SIZE * OPC_BUF_SIZE; + + tcg_register_jit(s->code_gen_buffer, total_size); #ifdef DEBUG_DISAS if (qemu_loglevel_mask(CPU_LOG_TB_OUT_ASM)) { - size_t size = tcg_current_code_size(s); - qemu_log("PROLOGUE: [size=%zu]\n", size); - log_disas(s->code_buf, size); + qemu_log("PROLOGUE: [size=%zu]\n", prologue_size); + log_disas(buf0, prologue_size); qemu_log("\n"); qemu_log_flush(); } diff --git a/translate-all.c b/translate-all.c index 3454f4e..0e8d176 100644 --- a/translate-all.c +++ b/translate-all.c @@ -690,23 +690,15 @@ static inline void code_gen_alloc(size_t tb_size) } qemu_madvise(tcg_ctx.code_gen_buffer, tcg_ctx.code_gen_buffer_size, - QEMU_MADV_HUGEPAGE); - - /* Steal room for the prologue at the end of the buffer. 
This ensures - (via the MAX_CODE_GEN_BUFFER_SIZE limits above) that direct branches - from TB's to the prologue are going to be in range. It also means - that we don't need to mark (additional) portions of the data segment - as executable. */ - tcg_ctx.code_gen_prologue = tcg_ctx.code_gen_buffer + - tcg_ctx.code_gen_buffer_size - 1024; - tcg_ctx.code_gen_buffer_size -= 1024; - - tcg_ctx.code_gen_buffer_max_size = tcg_ctx.code_gen_buffer_size - - (TCG_MAX_OP_SIZE * OPC_BUF_SIZE); - tcg_ctx.code_gen_max_blocks = tcg_ctx.code_gen_buffer_size / - CODE_GEN_AVG_BLOCK_SIZE; - tcg_ctx.tb_ctx.tbs = - g_malloc(tcg_ctx.code_gen_max_blocks * sizeof(TranslationBlock)); + QEMU_MADV_HUGEPAGE); + + /* Estimate a good size for the number of TBs we can support. We + still haven't deducted the prologue from the buffer size here, + but that's minimal and won't affect the estimate much. */ + tcg_ctx.code_gen_max_blocks + = tcg_ctx.code_gen_buffer_size / CODE_GEN_AVG_BLOCK_SIZE; + tcg_ctx.tb_ctx.tbs = g_new(TranslationBlock, tcg_ctx.code_gen_max_blocks); + qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock); } @@ -717,8 +709,6 @@ void tcg_exec_init(unsigned long tb_size) { cpu_gen_init(); code_gen_alloc(tb_size); - tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer; - tcg_register_jit(tcg_ctx.code_gen_buffer, tcg_ctx.code_gen_buffer_size); page_init(); #if defined(CONFIG_SOFTMMU) /* There's no guest base to take into account, so go ahead and