Message ID | 1349202750-16815-11-git-send-email-rth@twiddle.net |
---|---|
State | New |
Headers | show |
On Tue, Oct 02, 2012 at 11:32:30AM -0700, Richard Henderson wrote: > Like add2, do operand ordering, constant folding, and dead operand > elimination. The latter happens about 15% of all mulu2 during an > x86_64 bios boot. > > Signed-off-by: Richard Henderson <rth@twiddle.net> > --- > tcg/optimize.c | 26 ++++++++++++++++++++++++++ > tcg/tcg-op.h | 2 ++ > tcg/tcg.c | 19 +++++++++++++++++++ > 3 files changed, 47 insertions(+) > > diff --git a/tcg/optimize.c b/tcg/optimize.c > index 05891ef..a06c8eb 100644 > --- a/tcg/optimize.c > +++ b/tcg/optimize.c > @@ -543,6 +543,9 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr, > swap_commutative(args[0], &args[2], &args[4]); > swap_commutative(args[1], &args[3], &args[5]); > break; > + case INDEX_op_mulu2_i32: > + swap_commutative(args[0], &args[2], &args[3]); > + break; > case INDEX_op_brcond2_i32: > if (swap_commutative2(&args[0], &args[2])) { > args[4] = tcg_swap_cond(args[4]); > @@ -831,6 +834,29 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr, > } > goto do_default; > > + case INDEX_op_mulu2_i32: > + if (temps[args[2]].state == TCG_TEMP_CONST > + && temps[args[3]].state == TCG_TEMP_CONST) { > + uint32_t a = temps[args[2]].val; > + uint32_t b = temps[args[3]].val; > + uint64_t r = (uint64_t)a * b; > + TCGArg rl, rh; > + > + /* We emit the extra nop when we emit the mulu2. 
*/ > + assert(gen_opc_buf[op_index + 1] == INDEX_op_nop); > + > + rl = args[0]; > + rh = args[1]; > + gen_opc_buf[op_index] = INDEX_op_movi_i32; > + gen_opc_buf[++op_index] = INDEX_op_movi_i32; > + tcg_opt_gen_movi(&gen_args[0], rl, (uint32_t)r); > + tcg_opt_gen_movi(&gen_args[2], rh, (uint32_t)(r >> 32)); > + gen_args += 4; > + args += 4; > + break; > + } > + goto do_default; > + > case INDEX_op_brcond2_i32: > tmp = do_constant_folding_cond2(&args[0], &args[2], args[4]); > if (tmp != 2) { > diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h > index 1f5a021..044e648 100644 > --- a/tcg/tcg-op.h > +++ b/tcg/tcg-op.h > @@ -997,6 +997,8 @@ static inline void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) > > tcg_gen_op4_i32(INDEX_op_mulu2_i32, TCGV_LOW(t0), TCGV_HIGH(t0), > TCGV_LOW(arg1), TCGV_LOW(arg2)); > + /* Allow the optimizer room to replace mulu2 with two moves. */ > + tcg_gen_op0(INDEX_op_nop); > > tcg_gen_mul_i32(t1, TCGV_LOW(arg1), TCGV_HIGH(arg2)); > tcg_gen_add_i32(TCGV_HIGH(t0), TCGV_HIGH(t0), t1); > diff --git a/tcg/tcg.c b/tcg/tcg.c > index 21c1074..8280489 100644 > --- a/tcg/tcg.c > +++ b/tcg/tcg.c > @@ -1337,6 +1337,25 @@ static void tcg_liveness_analysis(TCGContext *s) > } > goto do_not_remove; > > + case INDEX_op_mulu2_i32: > + args -= 4; > + nb_iargs = 2; > + nb_oargs = 2; > + /* Likewise, test for the high part of the operation dead. */ > + if (dead_temps[args[1]]) { > + if (dead_temps[args[0]]) { > + goto do_remove; > + } > + gen_opc_buf[op_index] = op = INDEX_op_mul_i32; Very minor nitpick: you probably don't need to set op there. > + args[1] = args[2]; > + args[2] = args[3]; > + assert(gen_opc_buf[op_index + 1] == INDEX_op_nop); > + tcg_set_nop(s, gen_opc_buf + op_index + 1, args + 3, 1); > + /* Fall through and mark the single-word operation live. */ > + nb_oargs = 1; > + } > + goto do_not_remove; > + > default: > /* XXX: optimize by hardcoding common cases (e.g. 
triadic ops) */ > args -= def->nb_args; Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
On 2012-10-17 09:25, Aurelien Jarno wrote: >> > + gen_opc_buf[op_index] = op = INDEX_op_mul_i32; > Very minor nitpick: you probably don't need to set op there. > Perhaps not, but I prefer to keep the variables in sync as we drop into common code... r~
On 10/17/2012 03:09 AM, Richard Henderson wrote: > On 2012-10-17 09:25, Aurelien Jarno wrote: >>> > + gen_opc_buf[op_index] = op = INDEX_op_mul_i32; >> Very minor nitpick: you probably don't need to set op there. >> > > Perhaps not, but I prefer to keep the variables in sync as we > drop into common code... The compiler should recognize the dead variable anyway. How very meta.
diff --git a/tcg/optimize.c b/tcg/optimize.c index 05891ef..a06c8eb 100644 --- a/tcg/optimize.c +++ b/tcg/optimize.c @@ -543,6 +543,9 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr, swap_commutative(args[0], &args[2], &args[4]); swap_commutative(args[1], &args[3], &args[5]); break; + case INDEX_op_mulu2_i32: + swap_commutative(args[0], &args[2], &args[3]); + break; case INDEX_op_brcond2_i32: if (swap_commutative2(&args[0], &args[2])) { args[4] = tcg_swap_cond(args[4]); @@ -831,6 +834,29 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr, } goto do_default; + case INDEX_op_mulu2_i32: + if (temps[args[2]].state == TCG_TEMP_CONST + && temps[args[3]].state == TCG_TEMP_CONST) { + uint32_t a = temps[args[2]].val; + uint32_t b = temps[args[3]].val; + uint64_t r = (uint64_t)a * b; + TCGArg rl, rh; + + /* We emit the extra nop when we emit the mulu2. */ + assert(gen_opc_buf[op_index + 1] == INDEX_op_nop); + + rl = args[0]; + rh = args[1]; + gen_opc_buf[op_index] = INDEX_op_movi_i32; + gen_opc_buf[++op_index] = INDEX_op_movi_i32; + tcg_opt_gen_movi(&gen_args[0], rl, (uint32_t)r); + tcg_opt_gen_movi(&gen_args[2], rh, (uint32_t)(r >> 32)); + gen_args += 4; + args += 4; + break; + } + goto do_default; + case INDEX_op_brcond2_i32: tmp = do_constant_folding_cond2(&args[0], &args[2], args[4]); if (tmp != 2) { diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h index 1f5a021..044e648 100644 --- a/tcg/tcg-op.h +++ b/tcg/tcg-op.h @@ -997,6 +997,8 @@ static inline void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) tcg_gen_op4_i32(INDEX_op_mulu2_i32, TCGV_LOW(t0), TCGV_HIGH(t0), TCGV_LOW(arg1), TCGV_LOW(arg2)); + /* Allow the optimizer room to replace mulu2 with two moves. 
*/ + tcg_gen_op0(INDEX_op_nop); tcg_gen_mul_i32(t1, TCGV_LOW(arg1), TCGV_HIGH(arg2)); tcg_gen_add_i32(TCGV_HIGH(t0), TCGV_HIGH(t0), t1); diff --git a/tcg/tcg.c b/tcg/tcg.c index 21c1074..8280489 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -1337,6 +1337,25 @@ static void tcg_liveness_analysis(TCGContext *s) } goto do_not_remove; + case INDEX_op_mulu2_i32: + args -= 4; + nb_iargs = 2; + nb_oargs = 2; + /* Likewise, test for the high part of the operation dead. */ + if (dead_temps[args[1]]) { + if (dead_temps[args[0]]) { + goto do_remove; + } + gen_opc_buf[op_index] = op = INDEX_op_mul_i32; + args[1] = args[2]; + args[2] = args[3]; + assert(gen_opc_buf[op_index + 1] == INDEX_op_nop); + tcg_set_nop(s, gen_opc_buf + op_index + 1, args + 3, 1); + /* Fall through and mark the single-word operation live. */ + nb_oargs = 1; + } + goto do_not_remove; + default: /* XXX: optimize by hardcoding common cases (e.g. triadic ops) */ args -= def->nb_args;
Like add2, do operand ordering, constant folding, and dead operand elimination for mulu2. The latter applies to about 15% of all mulu2 ops during an x86_64 BIOS boot. Signed-off-by: Richard Henderson <rth@twiddle.net> --- tcg/optimize.c | 26 ++++++++++++++++++++++++++ tcg/tcg-op.h | 2 ++ tcg/tcg.c | 19 +++++++++++++++++++ 3 files changed, 47 insertions(+)