Message ID | 1400871431-12655-3-git-send-email-rth@twiddle.net |
---|---|
State | New |
Headers | show |
Il 23/05/2014 20:57, Richard Henderson ha scritto: > For a 64-bit host, the high bits of a register after a 32-bit operation > are undefined. Adjust the temps mask for all 32-bit ops to reflect that. > > Signed-off-by: Richard Henderson <rth@twiddle.net> > --- > tcg/optimize.c | 33 ++++++++++++++++++++++++++------- > 1 file changed, 26 insertions(+), 7 deletions(-) > > diff --git a/tcg/optimize.c b/tcg/optimize.c > index 83e1387..19e4831 100644 > --- a/tcg/optimize.c > +++ b/tcg/optimize.c > @@ -166,11 +166,18 @@ static void tcg_opt_gen_mov(TCGContext *s, int op_index, TCGArg *gen_args, > TCGOpcode old_op, TCGArg dst, TCGArg src) > { > TCGOpcode new_op = op_to_mov(old_op); > + tcg_target_ulong mask; > > s->gen_opc_buf[op_index] = new_op; > > reset_temp(dst); > - temps[dst].mask = temps[src].mask; > + mask = temps[src].mask; > + if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) { > + /* High bits of the destination are now garbage. */ Or they are zero on x86_64... perhaps this could be some kind of TCG target hook. > + mask |= ~0xffffffffull; > + } > + temps[dst].mask = mask; > + > assert(temps[src].state != TCG_TEMP_CONST); > > if (s->temps[src].type == s->temps[dst].type) { > @@ -194,13 +201,20 @@ static void tcg_opt_gen_movi(TCGContext *s, int op_index, TCGArg *gen_args, > TCGOpcode old_op, TCGArg dst, TCGArg val) > { > TCGOpcode new_op = op_to_movi(old_op); > + tcg_target_ulong mask; > > s->gen_opc_buf[op_index] = new_op; > > reset_temp(dst); > temps[dst].state = TCG_TEMP_CONST; > temps[dst].val = val; > - temps[dst].mask = val; > + mask = val; > + if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) { > + /* High bits of the destination are now garbage. 
*/ > + mask |= ~0xffffffffull; > + } > + temps[dst].mask = mask; > + > gen_args[0] = dst; > gen_args[1] = val; > } > @@ -539,7 +553,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr, > for (op_index = 0; op_index < nb_ops; op_index++) { > TCGOpcode op = s->gen_opc_buf[op_index]; > const TCGOpDef *def = &tcg_op_defs[op]; > - tcg_target_ulong mask, affected; > + tcg_target_ulong mask, partmask, affected; > int nb_oargs, nb_iargs, nb_args, i; > TCGArg tmp; > > @@ -901,13 +915,18 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr, > break; > } > > - /* 32-bit ops (non 64-bit ops and non load/store ops) generate 32-bit > - results */ > + /* 32-bit ops (non 64-bit ops and non load/store ops) generate > + 32-bit results. For the result is zero test below, we can > + ignore high bits, but for further optimizations we need to > + record that the high bits contain garbage. */ > + partmask = mask; > if (!(def->flags & (TCG_OPF_CALL_CLOBBER | TCG_OPF_64BIT))) { > - mask &= 0xffffffffu; > + mask |= ~(tcg_target_ulong)0xffffffffu; > + partmask &= 0xffffffffu; > + affected &= 0xffffffffu; > } > > - if (mask == 0) { > + if (partmask == 0) { > assert(nb_oargs == 1); > tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], 0); > args += nb_args; > Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
On 05/23/2014 12:46 PM, Paolo Bonzini wrote: >> @@ -166,11 +166,18 @@ static void tcg_opt_gen_mov(TCGContext *s, int >> op_index, TCGArg *gen_args, >> TCGOpcode old_op, TCGArg dst, TCGArg src) >> { >> TCGOpcode new_op = op_to_mov(old_op); >> + tcg_target_ulong mask; >> >> s->gen_opc_buf[op_index] = new_op; >> >> reset_temp(dst); >> - temps[dst].mask = temps[src].mask; >> + mask = temps[src].mask; >> + if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) { >> + /* High bits of the destination are now garbage. */ > > Or they are zero on x86_64... perhaps this could be some kind of TCG target hook. Only if we actually issued a "mov" insn. If the register allocator decided to adjust its tables instead, we won't issue an insn, at which point we really do have garbage in the high bits. I don't think we should be thinking about the target at all at this point. r~
On Fri, May 23, 2014 at 11:57:11AM -0700, Richard Henderson wrote: > For a 64-bit host, the high bits of a register after a 32-bit operation > are undefined. Adjust the temps mask for all 32-bit ops to reflect that. > > Signed-off-by: Richard Henderson <rth@twiddle.net> > --- > tcg/optimize.c | 33 ++++++++++++++++++++++++++------- > 1 file changed, 26 insertions(+), 7 deletions(-) > > diff --git a/tcg/optimize.c b/tcg/optimize.c > index 83e1387..19e4831 100644 > --- a/tcg/optimize.c > +++ b/tcg/optimize.c > @@ -166,11 +166,18 @@ static void tcg_opt_gen_mov(TCGContext *s, int op_index, TCGArg *gen_args, > TCGOpcode old_op, TCGArg dst, TCGArg src) > { > TCGOpcode new_op = op_to_mov(old_op); > + tcg_target_ulong mask; > > s->gen_opc_buf[op_index] = new_op; > > reset_temp(dst); > - temps[dst].mask = temps[src].mask; > + mask = temps[src].mask; > + if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) { > + /* High bits of the destination are now garbage. */ > + mask |= ~0xffffffffull; > + } > + temps[dst].mask = mask; > + > assert(temps[src].state != TCG_TEMP_CONST); > > if (s->temps[src].type == s->temps[dst].type) { > @@ -194,13 +201,20 @@ static void tcg_opt_gen_movi(TCGContext *s, int op_index, TCGArg *gen_args, > TCGOpcode old_op, TCGArg dst, TCGArg val) > { > TCGOpcode new_op = op_to_movi(old_op); > + tcg_target_ulong mask; > > s->gen_opc_buf[op_index] = new_op; > > reset_temp(dst); > temps[dst].state = TCG_TEMP_CONST; > temps[dst].val = val; > - temps[dst].mask = val; > + mask = val; > + if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) { > + /* High bits of the destination are now garbage. 
*/ > + mask |= ~0xffffffffull; > + } > + temps[dst].mask = mask; > + > gen_args[0] = dst; > gen_args[1] = val; > } > @@ -539,7 +553,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr, > for (op_index = 0; op_index < nb_ops; op_index++) { > TCGOpcode op = s->gen_opc_buf[op_index]; > const TCGOpDef *def = &tcg_op_defs[op]; > - tcg_target_ulong mask, affected; > + tcg_target_ulong mask, partmask, affected; > int nb_oargs, nb_iargs, nb_args, i; > TCGArg tmp; > > @@ -901,13 +915,18 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr, > break; > } > > - /* 32-bit ops (non 64-bit ops and non load/store ops) generate 32-bit > - results */ > + /* 32-bit ops (non 64-bit ops and non load/store ops) generate > + 32-bit results. For the result is zero test below, we can > + ignore high bits, but for further optimizations we need to > + record that the high bits contain garbage. */ > + partmask = mask; > if (!(def->flags & (TCG_OPF_CALL_CLOBBER | TCG_OPF_64BIT))) { > - mask &= 0xffffffffu; > + mask |= ~(tcg_target_ulong)0xffffffffu; > + partmask &= 0xffffffffu; > + affected &= 0xffffffffu; > } > > - if (mask == 0) { > + if (partmask == 0) { > assert(nb_oargs == 1); > tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], 0); > args += nb_args; I think the real problem is that we decided to ignore very quickly in the TCG code that a 32-bit zero/sign extension is different than a move from a 32-bit to a 64-bit TCG temp. Your patch is basically recreating this information by setting a flag when a 32-bit operation occurs (that is an operation on 32-bit TCG temps), and clearing it when a 32-bit to 64-bit conversion occurs. If we add real extu_i32_i64 and ext_i32_i64 ops for 64-bit hosts, propagated down to each TCG target, we can use them as a "barrier" in the propagation. 
That means, for example, that extu_i32_i64 will always have to be executed, as the state of the high 32 bits would be unknown before it, while we know that after this op the high 32 bits are zero. Thus we don't lose any possible optimization, while we keep the information. This also means we don't risk the copy propagation mixing 32-bit and 64-bit types (at least for this direction). IIRC we got bugs like that in the past. That said, fixing the real problem is probably quite intrusive, so in the meantime this workaround looks fine to me: Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
diff --git a/tcg/optimize.c b/tcg/optimize.c index 83e1387..19e4831 100644 --- a/tcg/optimize.c +++ b/tcg/optimize.c @@ -166,11 +166,18 @@ static void tcg_opt_gen_mov(TCGContext *s, int op_index, TCGArg *gen_args, TCGOpcode old_op, TCGArg dst, TCGArg src) { TCGOpcode new_op = op_to_mov(old_op); + tcg_target_ulong mask; s->gen_opc_buf[op_index] = new_op; reset_temp(dst); - temps[dst].mask = temps[src].mask; + mask = temps[src].mask; + if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) { + /* High bits of the destination are now garbage. */ + mask |= ~0xffffffffull; + } + temps[dst].mask = mask; + assert(temps[src].state != TCG_TEMP_CONST); if (s->temps[src].type == s->temps[dst].type) { @@ -194,13 +201,20 @@ static void tcg_opt_gen_movi(TCGContext *s, int op_index, TCGArg *gen_args, TCGOpcode old_op, TCGArg dst, TCGArg val) { TCGOpcode new_op = op_to_movi(old_op); + tcg_target_ulong mask; s->gen_opc_buf[op_index] = new_op; reset_temp(dst); temps[dst].state = TCG_TEMP_CONST; temps[dst].val = val; - temps[dst].mask = val; + mask = val; + if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) { + /* High bits of the destination are now garbage. */ + mask |= ~0xffffffffull; + } + temps[dst].mask = mask; + gen_args[0] = dst; gen_args[1] = val; } @@ -539,7 +553,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr, for (op_index = 0; op_index < nb_ops; op_index++) { TCGOpcode op = s->gen_opc_buf[op_index]; const TCGOpDef *def = &tcg_op_defs[op]; - tcg_target_ulong mask, affected; + tcg_target_ulong mask, partmask, affected; int nb_oargs, nb_iargs, nb_args, i; TCGArg tmp; @@ -901,13 +915,18 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr, break; } - /* 32-bit ops (non 64-bit ops and non load/store ops) generate 32-bit - results */ + /* 32-bit ops (non 64-bit ops and non load/store ops) generate + 32-bit results. 
For the result is zero test below, we can + ignore high bits, but for further optimizations we need to + record that the high bits contain garbage. */ + partmask = mask; if (!(def->flags & (TCG_OPF_CALL_CLOBBER | TCG_OPF_64BIT))) { - mask &= 0xffffffffu; + mask |= ~(tcg_target_ulong)0xffffffffu; + partmask &= 0xffffffffu; + affected &= 0xffffffffu; } - if (mask == 0) { + if (partmask == 0) { assert(nb_oargs == 1); tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], 0); args += nb_args;
For a 64-bit host, the high bits of a register after a 32-bit operation are undefined. Adjust the temps mask for all 32-bit ops to reflect that. Signed-off-by: Richard Henderson <rth@twiddle.net> --- tcg/optimize.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-)