
target-mips: apply workaround for TCG optimizations for MFC1

Message ID 1436891912-14742-1-git-send-email-leon.alrae@imgtec.com
State New

Commit Message

Leon Alrae July 14, 2015, 4:38 p.m. UTC
There seems to be an issue when trying to keep a pointer in the bottom
32 bits of a 64-bit floating point register. Load and store instructions
accessing this address for some reason use the whole 64-bit content of
the floating point register rather than the truncated 32-bit value. The
following load uses an incorrect address, which leads to a crash if the
upper 32 bits of $f0 aren't 0:

0x00400c60:  mfc1       t8,$f0
0x00400c64:  lw t9,0(t8)

It can be reproduced with the following Linux userland program when
running on a MIPS32 CPU with CP0.Status.FR=1 (by default the
mips32r5-generic and mips32r6-generic CPUs have this bit set in
linux-user).

int main(int argc, char *argv[])
{
    int tmp = 0x11111111;
    /* Set f0 */
    __asm__ ("mtc1  %0, $f0\n"
             "mthc1 %1, $f0\n"
             : : "r" (&tmp), "r" (tmp));
    /* At this point $f0: w:76fff040 d:1111111176fff040 */
    __asm__ ("mfc1 $t8, $f0\n"
             "lw   $t9, 0($t8)\n"); /* <--- crash! */
    return 0;
}

Running the above program in normal (non-singlestep) mode leads to:

Program received signal SIGSEGV, Segmentation fault.
0x00005555559f6f37 in static_code_gen_buffer ()
(gdb) x/i 0x00005555559f6f37
=> 0x5555559f6f37 <static_code_gen_buffer+78359>:       mov    %gs:0x0(%rbp),%ebp
(gdb) info registers rbp
rbp            0x1111111176fff040       0x1111111176fff040

The program runs fine in singlestep mode, or with disabled TCG
optimizations. Also, I'm not able to reproduce it in system emulation.

Signed-off-by: Leon Alrae <leon.alrae@imgtec.com>
---
I had been investigating this some time ago, but had to move to other
things and haven't managed to get back to it. Now, since the 2.4 release
is relatively close, I think a workaround is better than nothing
(apparently some MIPS32R6 compilers may keep a pointer in a floating
point register, which exposes this problem in QEMU). Ideas and comments
are welcome.

More dumps if anyone is interested (I isolated the TB for these two
instructions by stopping translation after mthc1 and lw):

IN: main
0x00400c60:  mfc1	t8,$f0
0x00400c64:  lw	t9,0(t8)

OP:
 ld_i32 tmp0,env,$0xfffffffffffffffc
 movi_i32 tmp1,$0x0
 brcond_i32 tmp0,tmp1,ne,$L0

 ---- 0x400c60
 mov_i32 tmp1,w0.d0
 mov_i32 tmp0,tmp1
 mov_i32 t8,tmp0

 ---- 0x400c64
 mov_i32 tmp0,t8
 qemu_ld_i32 tmp0,tmp0,un+leul,2
 mov_i32 t9,tmp0
 goto_tb $0x0
 movi_i32 PC,$0x400c68
 exit_tb $0x7ffff35d5d30
 set_label $L0
 exit_tb $0x7ffff35d5d33

OP after optimization and liveness analysis:
 ld_i32 tmp0,env,$0xfffffffffffffffc
 movi_i32 tmp1,$0x0
 brcond_i32 tmp0,tmp1,ne,$L0

 ---- 0x400c60
 mov_i32 tmp1,w0.d0
 mov_i32 tmp0,tmp1
 mov_i32 t8,tmp0

 ---- 0x400c64
 qemu_ld_i32 tmp0,t8,un+leul,2
 mov_i32 t9,tmp0
 goto_tb $0x0
 movi_i32 PC,$0x400c68
 exit_tb $0x7ffff35d5d30
 set_label $L0
 exit_tb $0x7ffff35d5d33

OUT: [size=78]
0x5555559f6f20:  mov    -0x4(%r14),%ebp
0x5555559f6f24:  test   %ebp,%ebp
0x5555559f6f26:  jne    0x5555559f6f5f
0x5555559f6f2c:  mov    0xe8(%r14),%rbp
0x5555559f6f33:  mov    %ebp,0x60(%r14)
0x5555559f6f37:  mov    %gs:0x0(%rbp),%ebp
0x5555559f6f3b:  mov    %ebp,0x64(%r14)
0x5555559f6f3f:  jmpq   0x5555559f6f44
0x5555559f6f44:  mov    $0x400c68,%ebp
0x5555559f6f49:  mov    %ebp,0x80(%r14)
0x5555559f6f50:  mov    $0x7ffff35d5d30,%rax
0x5555559f6f5a:  jmpq   0x5555579e3936
0x5555559f6f5f:  mov    $0x7ffff35d5d33,%rax
0x5555559f6f69:  jmpq   0x5555579e3936
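
(In this host dump, the mov at 0x5555559f6f2c is the REX.W load that
pulls the full 64-bit contents of $f0 from env into %rbp, and the mov at
0x5555559f6f37 then dereferences that un-truncated value as the guest
address; this is exactly the instruction the SIGSEGV backtrace above
points at, with %rbp still holding 0x1111111176fff040.)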

---
 target-mips/translate.c | 6 ++++++
 1 file changed, 6 insertions(+)

Comments

Aurelien Jarno July 14, 2015, 5:09 p.m. UTC | #1
On 2015-07-14 17:38, Leon Alrae wrote:
> There seems to be an issue when trying to keep a pointer in bottom 32-bits
> of a 64-bit floating point register. Load and store instructions accessing
> this address for some reason use the whole 64-bit content of floating point
> register rather than truncated 32-bit value. The following load uses
> incorrect address which leads to a crash if upper 32 bits of $f0 isn't 0:
> 
> 0x00400c60:  mfc1       t8,$f0
> 0x00400c64:  lw t9,0(t8)
> 
> It can be reproduced with the following linux userland program when running
> on a MIPS32 with CP0.Status.FR=1 (by default mips32r5-generic and
> mips32r6-generic CPUs have this bit set in linux-user).
> 
> int main(int argc, char *argv[])
> {
>     int tmp = 0x11111111;
>     /* Set f0 */
>     __asm__ ("mtc1  %0, $f0\n"
>              "mthc1 %1, $f0\n"
>              : : "r" (&tmp), "r" (tmp));
>     /* At this point $f0: w:76fff040 d:1111111176fff040 */
>     __asm__ ("mfc1 $t8, $f0\n"
>              "lw   $t9, 0($t8)\n"); /* <--- crash! */
>     return 0;
> }
> 
> Running above program in normal (non-singlestep mode) leads to:
> 
> Program received signal SIGSEGV, Segmentation fault.
> 0x00005555559f6f37 in static_code_gen_buffer ()
> (gdb) x/i 0x00005555559f6f37
> => 0x5555559f6f37 <static_code_gen_buffer+78359>:       mov    %gs:0x0(%rbp),%ebp
> (gdb) info registers rbp
> rbp            0x1111111176fff040       0x1111111176fff040
> 
> The program runs fine in singlestep mode, or with disabled TCG
> optimizations. Also, I'm not able to reproduce it in system emulation.

I am able to reproduce the problem, but for me disabling the
optimizations doesn't help. That said, the problem is just another issue
with the "let's assume the target supports moves between 32 and 64-bit
registers" approach. At some point we should add a paragraph to
tcg/README to define how to handle 32 vs 64-bit registers and what the
TCG targets should expect. We had to add special code to handle that for
sparc (the trunc_shr_i32 instruction), but also code to the optimizer to
remember about "garbage" high bits. I am not sure anyone has a global
view of how all this code interacts.

In this precise case the problem seems to be related to the following code
in tcg/i386/tcg-target.c:

|         /* ??? We assume all operations have left us with register contents
|            that are zero extended.  So far this appears to be true.  If we
|            want to enforce this, we can either do an explicit zero-extension
|            here, or (if GUEST_BASE == 0, or a segment register is in use)
|            use the ADDR32 prefix.  For now, do nothing.  */
|         if (GUEST_BASE && guest_base_flags) {
|             seg = guest_base_flags;
|             offset = 0;
|         } else if (TCG_TARGET_REG_BITS == 64 && offset != GUEST_BASE) {
|             tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, GUEST_BASE);
|             tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base);
|             base = TCG_REG_L1;
|             offset = 0;
|         }

I guess we are still in time for 2.4 to fix this, but in case it's not
possible we can apply your patch.
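
To visualise the failure mode outside of QEMU, a small standalone C
sketch shows what goes wrong when the guest address is not truncated
before being added to the guest base (this is only an illustration, not
QEMU code; guest_base and the register value are invented):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t guest_base = 0x7f0000000000ULL;      /* made-up base */
    uint64_t reg = 0x1111111176fff040ULL;         /* garbage high bits + guest pointer */

    uint64_t wrong = guest_base + reg;            /* what the generated code computes */
    uint64_t right = guest_base + (uint32_t)reg;  /* with an explicit zero-extension */

    printf("without truncation: 0x%016" PRIx64 "\n", wrong);
    printf("with truncation:    0x%016" PRIx64 "\n", right);
    return 0;
}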
Paolo Bonzini July 14, 2015, 6:20 p.m. UTC | #2
On 14/07/2015 19:09, Aurelien Jarno wrote:
> On 2015-07-14 17:38, Leon Alrae wrote:
>> There seems to be an issue when trying to keep a pointer in bottom 32-bits
>> of a 64-bit floating point register. Load and store instructions accessing
>> this address for some reason use the whole 64-bit content of floating point
>> register rather than truncated 32-bit value. The following load uses
>> incorrect address which leads to a crash if upper 32 bits of $f0 isn't 0:
>>
>> 0x00400c60:  mfc1       t8,$f0
>> 0x00400c64:  lw t9,0(t8)
>>
>> It can be reproduced with the following linux userland program when running
>> on a MIPS32 with CP0.Status.FR=1 (by default mips32r5-generic and
>> mips32r6-generic CPUs have this bit set in linux-user).
>>
>> int main(int argc, char *argv[])
>> {
>>     int tmp = 0x11111111;
>>     /* Set f0 */
>>     __asm__ ("mtc1  %0, $f0\n"
>>              "mthc1 %1, $f0\n"
>>              : : "r" (&tmp), "r" (tmp));
>>     /* At this point $f0: w:76fff040 d:1111111176fff040 */
>>     __asm__ ("mfc1 $t8, $f0\n"
>>              "lw   $t9, 0($t8)\n"); /* <--- crash! */
>>     return 0;
>> }
>>
>> Running above program in normal (non-singlestep mode) leads to:
>>
>> Program received signal SIGSEGV, Segmentation fault.
>> 0x00005555559f6f37 in static_code_gen_buffer ()
>> (gdb) x/i 0x00005555559f6f37
>> => 0x5555559f6f37 <static_code_gen_buffer+78359>:       mov    %gs:0x0(%rbp),%ebp
>> (gdb) info registers rbp
>> rbp            0x1111111176fff040       0x1111111176fff040
>>
>> The program runs fine in singlestep mode, or with disabled TCG
>> optimizations. Also, I'm not able to reproduce it in system emulation.
> 
> I am able to reproduce the problem, but for me disabling the
> optimizations doesn't help. That said the problem is just another issue
> with the "let's assume the target supports move between 32 and 64 bit
> registers". At some point we should add a paragraph to tcg/README, to
> define how handle 32 vs 64 bit registers and what the TCG targets should
> expect. We had to add special code to handle that for sparc
> (trunc_shr_i32 instruction), but also code to the optimizer to remember
> about "garbage" high bits. I am not sure someone has a global view about
> how all this code interacts.

I certainly don't have a global view, so much so that I didn't think at 
all of the optimizer... Instead, it looks to me like a bug in the 
register allocator.  In particular, this code in tcg_reg_alloc_mov:

        if (IS_DEAD_ARG(1) && !ts->fixed_reg && !ots->fixed_reg) {
            /* the mov can be suppressed */
            if (ots->val_type == TEMP_VAL_REG) {
                s->reg_to_temp[ots->reg] = -1;
            }
            ots->reg = ts->reg;
            temp_dead(s, args[1]);
        }

is not covering the "itype != otype" case.  In addition, the 
IS_DEAD_ARG(1) case can be covered above in the

    if (((NEED_SYNC_ARG(0) || ots->fixed_reg) && ts->val_type != TEMP_VAL_REG)
        || ts->val_type == TEMP_VAL_MEM) {

conditional: in this case there's no need at all to go through
itype, and it's possible to load directly into ots.

Paolo
Aurelien Jarno July 14, 2015, 6:37 p.m. UTC | #3
On 2015-07-14 20:20, Paolo Bonzini wrote:
> 
> 
> On 14/07/2015 19:09, Aurelien Jarno wrote:
> > On 2015-07-14 17:38, Leon Alrae wrote:
> >> There seems to be an issue when trying to keep a pointer in bottom 32-bits
> >> of a 64-bit floating point register. Load and store instructions accessing
> >> this address for some reason use the whole 64-bit content of floating point
> >> register rather than truncated 32-bit value. The following load uses
> >> incorrect address which leads to a crash if upper 32 bits of $f0 isn't 0:
> >>
> >> 0x00400c60:  mfc1       t8,$f0
> >> 0x00400c64:  lw t9,0(t8)
> >>
> >> It can be reproduced with the following linux userland program when running
> >> on a MIPS32 with CP0.Status.FR=1 (by default mips32r5-generic and
> >> mips32r6-generic CPUs have this bit set in linux-user).
> >>
> >> int main(int argc, char *argv[])
> >> {
> >>     int tmp = 0x11111111;
> >>     /* Set f0 */
> >>     __asm__ ("mtc1  %0, $f0\n"
> >>              "mthc1 %1, $f0\n"
> >>              : : "r" (&tmp), "r" (tmp));
> >>     /* At this point $f0: w:76fff040 d:1111111176fff040 */
> >>     __asm__ ("mfc1 $t8, $f0\n"
> >>              "lw   $t9, 0($t8)\n"); /* <--- crash! */
> >>     return 0;
> >> }
> >>
> >> Running above program in normal (non-singlestep mode) leads to:
> >>
> >> Program received signal SIGSEGV, Segmentation fault.
> >> 0x00005555559f6f37 in static_code_gen_buffer ()
> >> (gdb) x/i 0x00005555559f6f37
> >> => 0x5555559f6f37 <static_code_gen_buffer+78359>:       mov    %gs:0x0(%rbp),%ebp
> >> (gdb) info registers rbp
> >> rbp            0x1111111176fff040       0x1111111176fff040
> >>
> >> The program runs fine in singlestep mode, or with disabled TCG
> >> optimizations. Also, I'm not able to reproduce it in system emulation.
> > 
> > I am able to reproduce the problem, but for me disabling the
> > optimizations doesn't help. That said the problem is just another issue
> > with the "let's assume the target supports move between 32 and 64 bit
> > registers". At some point we should add a paragraph to tcg/README, to
> > define how handle 32 vs 64 bit registers and what the TCG targets should
> > expect. We had to add special code to handle that for sparc
> > (trunc_shr_i32 instruction), but also code to the optimizer to remember
> > about "garbage" high bits. I am not sure someone has a global view about
> > how all this code interacts.
> 
> I certainly don't have a global view, so much that I didn't think at 
> all of the optimizer... Instead, it looks to me like a bug in the 
> register allocator.  In particular this code in tcg_reg_alloc_mov:

That's exactly my point when I said that nobody has a global view. I
think the fact that we don't check for the type when simplifying moves
in the register allocator is intentional, the same way we simply
transform the trunc op into a mov op (except on sparc). This is done
because it's not needed, for example, on x86 and most other
architectures, given that 32-bit instructions do not care about the high
part of the registers.

Basically, the size-changing ops are trunc_i64_i32, ext_i32_i64 and
extu_i32_i64. We can be conservative and implement all of them as real
instructions in all TCG backends. In that case the mov op never has
to deal with registers of different sizes (just like we enforce at
the TCG frontend level), and the register allocator and the optimizer
do not have to deal with this. However, that's suboptimal on some
architectures, which is why on x86 we decided to just replace
trunc_i64_i32 by a move. But if we do this simplification it should be
done everywhere (in that case, including in the qemu_ld op). And
DOCUMENTED somewhere, given that different choices can be made for
different backends.

As for the optimizer, its goal is to predict the value of the registers
by constant folding. It should be seen as another CPU, with its own
rules. For example, TCG internally stores 32-bit constants
sign-extended. The optimizer should follow the same convention.
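
As a small illustration of that convention (standalone C, not TCG code),
"stored sign-extended" means a 32-bit constant with bit 31 set occupies
the full 64-bit slot like this:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int32_t c32 = (int32_t)0x80000000u;          /* 32-bit constant, bit 31 set */
    uint64_t sign_ext = (uint64_t)(int64_t)c32;  /* 0xffffffff80000000 */
    uint64_t zero_ext = (uint64_t)(uint32_t)c32; /* 0x0000000080000000 */

    printf("sign-extended: 0x%016" PRIx64 "\n", sign_ext);
    printf("zero-extended: 0x%016" PRIx64 "\n", zero_ext);
    return 0;
}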
Paolo Bonzini July 14, 2015, 8:56 p.m. UTC | #4
On 14/07/2015 20:37, Aurelien Jarno wrote:
>> > 
>> > I certainly don't have a global view, so much that I didn't think at 
>> > all of the optimizer... Instead, it looks to me like a bug in the 
>> > register allocator.  In particular this code in tcg_reg_alloc_mov:
> That's exactly my point when I said that someone doesn't have a global
> view. I think the fact that we don't check for type when simplifying
> moves in the register allocator is intentional, the same way we simply
> transform the trunc op into a mov op (except on sparc). This is done
> because it's not needed for example on x86 and most architectures,
> given 32-bit instructions do not care about the high part of the
> registers.
> 
> Basically size changing ops are trunc_i64_i32, ext_i32_i64 and
> extu_i32_i64. We can be conservative and implement all of them as real
> instructions in all TCG backends. In that case the mov op never has
> to deal with registers of different size (just like we enforce that at
> the TCG frotnend level), and the register allocator and the optimizer
> do not have to deal with this. However that's suboptimal on some
> architectures, that's why on x86 we decided to just replace the
> trunc_i64_i32 by a move. But if we do this simplification it should be
> done everywhere (in that case, including in the qemu_ld op). And
> DOCUMENTED somewhere, given different choices can be made for different
> backends.

I think there are four cases:

1) 64-bit processors that do not have loads with 32-bit addresses, and
do not zero extend on 32-bit operations---possibly because 32-bit
operations do not exist at all.

	=> qemu_ld/qemu_st must truncate the address

	ia64, s390, sparc all fall under this group.

2) 64-bit processors that have loads with 32-bit addresses.

	=> qemu_ld/qemu_st can use 32-bit addresses to do the
	   truncation

	aarch64, I think, falls under this group

3) Processors that do not have 32-bit loads, and automatically zero
extend on 32-bit operations

	=> qemu_ld/qemu_st could use 64-bit addresses and no truncation

x86 currently falls under 3, because it doesn't use ADDR32, but the
register allocator is breaking case 3 by forcing 64-bit operations when
loading from a global.
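
As an illustration of the property case 3 relies on (a minimal GNU C
check written for this discussion, not QEMU code): on x86-64, any 32-bit
register write clears the upper 32 bits of the destination, so a plain
32-bit mov acts as a free zero-extension:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t in = 0x1111111176fff040ULL;
    uint64_t out;

    /* A 32-bit mov implicitly zero-extends into the full 64-bit
       register, so the 0x11111111 garbage disappears. */
    __asm__ ("movl %k1, %k0" : "=r" (out) : "r" (in));

    printf("before: 0x%016" PRIx64 "\n", in);
    printf("after:  0x%016" PRIx64 "\n", out);   /* 0x0000000076fff040 */
    return 0;
}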

I am not sure if the optimizer could also break this case, or if it is
working by chance.  So, the simplest fix for 2.4 would be to add the
prefix as suggested in the comment and make x86 fall under 2.

If the optimizer is not breaking this case, fixing the register
allocator would be an option, and then the ADDR32 prefix could be reverted.

Even if the prefix was added, modifying the register allocator to use
32-bit loads would still be useful as an optimization, since on x86
32-bit loads are smaller than 64-bit loads.

Paolo
Aurelien Jarno July 14, 2015, 10:09 p.m. UTC | #5
On 2015-07-14 22:56, Paolo Bonzini wrote:
> 
> 
> On 14/07/2015 20:37, Aurelien Jarno wrote:
> >> > 
> >> > I certainly don't have a global view, so much that I didn't think at 
> >> > all of the optimizer... Instead, it looks to me like a bug in the 
> >> > register allocator.  In particular this code in tcg_reg_alloc_mov:
> > That's exactly my point when I said that someone doesn't have a global
> > view. I think the fact that we don't check for type when simplifying
> > moves in the register allocator is intentional, the same way we simply
> > transform the trunc op into a mov op (except on sparc). This is done
> > because it's not needed for example on x86 and most architectures,
> > given 32-bit instructions do not care about the high part of the
> > registers.
> > 
> > Basically size changing ops are trunc_i64_i32, ext_i32_i64 and
> > extu_i32_i64. We can be conservative and implement all of them as real
> > instructions in all TCG backends. In that case the mov op never has
> > to deal with registers of different size (just like we enforce that at
> > the TCG frotnend level), and the register allocator and the optimizer
> > do not have to deal with this. However that's suboptimal on some
> > architectures, that's why on x86 we decided to just replace the
> > trunc_i64_i32 by a move. But if we do this simplification it should be
> > done everywhere (in that case, including in the qemu_ld op). And
> > DOCUMENTED somewhere, given different choices can be made for different
> > backends.
> 
> I think there are four cases:

Well I think we should not see it in terms of only the qemu_ld/qemu_st
32-bit ops, but 32-bit ops in general.

> 1) 64-bit processors that do not have loads with 32-bit addresses, and
> do not zero extend on 32-bit operations---possibly because 32-bit
> operations do not exist at all.
> 
> 	=> qemu_ld/qemu_st must truncate the address
> 
> 	ia64, s390, sparc all fall under this group.
> 
> 2) 64-bit processors that have loads with 32-bit addresses.
> 
> 	=> qemu_ld/qemu_st can use 32-bit addresses to do the
> 	   truncation
> 
> 	aarch64, I think, falls under this group

I don't think that works. We don't want to get a load with a 32-bit
address. We want a load of (guest_base + address), with guest_base
possibly being 64-bit, address being 32-bit and the result likely
being 64-bit.

> 3) Processors that do not have 32-bit loads, and automatically zero
> extend on 32-bit operations
> 
> 	=> qemu_ld/qemu_st could use 64-bit addresses and no truncation
> 
> x86 currently falls under 3, because it doesn't use ADDR32, but the
> register allocator is breaking case 3 by forcing 64-bit operations when
> loading from a global.

Well, the use of ADDR32 is a bit special: it only works because we can
use %gs to add the guest base address. When we can't use %gs, ADDR32
can't work.

> I am not sure if the optimizer could also break this case, or if it is

Now that we track high bits as "garbage", the optimizer should be safe.

> working by chance.  So, the simplest fix for 2.4 would be to add the
> prefix as suggested in the comment and make x86 fall under 2.

I think it's the way to go, at least until we have a better view of how
the 32 to 64-bit register handling works.

> If the optimizer is not breaking this case, fixing the register
> allocator would be an option, and then the ADDR32 prefix could be reverted.

I don't think the register allocator is at fault at all.
tcg_reg_alloc_mov doesn't check for the register type because a TCG mov
is by definition only between registers of the same size. We have
different ops (trunc, ext, extu) to handle moves between registers of
different sizes.

The problem is that we replace the trunc instruction by a mov (except on
sparc) in tcg_gen_trunc_shr_i64_i32 to get more optimized code:

| ...
|     } else if (count == 0) {
|         tcg_gen_mov_i32(ret, MAKE_TCGV_I32(GET_TCGV_I64(arg)));
|     } else {
|         TCGv_i64 t = tcg_temp_new_i64();
|         tcg_gen_shri_i64(t, arg, count);
|         tcg_gen_mov_i32(ret, MAKE_TCGV_I32(GET_TCGV_I64(t)));
|         tcg_temp_free_i64(t);
|     }
| ...

If we actually implement the trunc_shr_i64_i32 instruction on all
targets, we get rid of this problem without having to tweak the register
allocator. But the generated code is then slightly less optimal, as we
emit an extra x86 mov instruction to do the zero extension.
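
For reference, the value-level semantics of trunc_shr_i64_i32 can be
stated in plain C (illustration only; the function below is not the TCG
implementation):

#include <assert.h>
#include <stdint.h>

/* Shift the 64-bit input right by count, keep the low 32 bits.
   For count == 0 this is a plain truncation, which on x86 can be
   implemented as a single 32-bit mov (implicit zero-extension). */
static uint32_t trunc_shr_i64_i32(uint64_t arg, unsigned count)
{
    return (uint32_t)(arg >> count);
}

int main(void)
{
    assert(trunc_shr_i64_i32(0x1111111176fff040ULL, 0)  == 0x76fff040u);
    assert(trunc_shr_i64_i32(0x1111111176fff040ULL, 32) == 0x11111111u);
    return 0;
}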

> Even if the prefix was added, modifying the register allocator to use
> 32-bit loads would still be useful as an optimization, since on x86
> 32-bit loads are smaller than 64-bit loads.

AFAIK, that's already the case. The REXW prefix is only emitted for
64-bit ops. The user-mode qemu_ld/st is a bit of a different case,
because there you mix values and addresses.
Paolo Bonzini July 15, 2015, 7:31 a.m. UTC | #6
On 15/07/2015 00:09, Aurelien Jarno wrote:
>> > 2) 64-bit processors that have loads with 32-bit addresses.
>> > 
>> > 	=> qemu_ld/qemu_st can use 32-bit addresses to do the
>> > 	   truncation
>> > 
>> > 	aarch64, I think, falls under this group
> I don't think that works. We don't want to get a load with a 32-bit
> address. We want a load of (guest_base + address), with guest_base
> possibly being 64-bit, address being 32-bit and the result likely
> being 64-bit.

aarch64, IIUC, has complicated addressing modes with a 64-bit base and a
32-bit sign- or zero-extended index, which is exactly what you need
here.  However, the backend is not using them, so right now aarch64 is
the same as x86.
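
(For concreteness, and assuming standard AArch64 assembler syntax, that
is the register-offset form along the lines of "ldr w0, [x1, w2, uxtw]":
a 64-bit base in x1 plus a 32-bit index in w2 that the addressing mode
itself zero-extends, or sign-extends with sxtw.)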

> Well the use of ADDR32 is a bit special, it only works because we can't
> use %gs to add the guest base address. When we can't use %gs, ADDR32
> can't work.

Yes.  bsd-user would have to sign extend, in particular.

> I don't think the register allocator is at fault at all. The register
> tcg_reg_alloc_mov doesn't check for the register type because a TCG mov
> is by definition only between registers of the same size.

Ok, I see your point.  If you put it like this :) the fault definitely
lies in the backends.  What I'm proposing would be in a new
tcg_reg_alloc_trunc function, and it would require implementing a
non-noop trunc.

I still believe the register allocator can be improved to do 32-bit
loads, though as an optimization and not as a bugfix:

> > Even if the prefix was added, modifying the register allocator to use
> > 32-bit loads would still be useful as an optimization, since on x86
> > 32-bit loads are smaller than 64-bit loads.
>
> AFAIK, that's already the case. The REXW prefix is only emitted for
> 64-bit ops.

Yes, but a load from a 64-bit register to a 32-bit destination emits
REX.W.  From Leon's dump:

 mov_i32 tmp1,w0.d0  => mov    0xe8(%r14),%rbp
 mov_i32 tmp0,tmp1
 mov_i32 t8,tmp0     => mov    %ebp,0x60(%r14)

Note %rbp as the load destination and %ebp as the source of the store.

Paolo
Aurelien Jarno July 15, 2015, 8:06 a.m. UTC | #7
On 2015-07-15 09:31, Paolo Bonzini wrote:
> Ok, I see your point.  If you put it like this :) the fault definitely
> lies in the backends.  What I'm proposing would be in a new
> tcg_reg_alloc_trunc function, and it would require implementing a
> non-noop trunc.

Why not reuse the existing trunc_shr_i64_i32 op? AFAIU, it has been
designed exactly for that.

Actually I think we should implement the following ops as optional but
*real* TCG ops:
- trunc_shr_i64_i32
- extu_i32_i64
- ext_i32_i64

Then each backend can implement the ones it considers necessary. If one
is not implemented in a backend, it is simply replaced by a mov. This
would also allow us to remove the "remember high bits as garbage" logic
in the optimizer, which I consider more of a band-aid than a real fix.

Note that we might have multiple choices, for example on x86:

1) implement trunc_shr_i64_i32 and ext_i32_i64
This way we make sure that all 32-bit values are always stored
zero-extended (even if a move has been propagated by the register
allocator or by the optimizer). The extu_i32_i64 can therefore always
be considered as a mov op.

2) implement extu_i32_i64 and ext_i32_i64
We have to guarantee that all 32-bit ops ignore the high part of the
registers (which is not the case currently for qemu_ld/st in user mode)
as they might contain garbage. Given that, we have to properly zero- and
sign-extend the value when converting a 32-bit value into a 64-bit value.

> I still believe the register allocator can be improved to do 32-bit
> loads, though as an optimization and not as a bugfix:
> 
> > > Even if the prefix was added, modifying the register allocator to use
> > > 32-bit loads would still be useful as an optimization, since on x86
> > > 32-bit loads are smaller than 64-bit loads.
> >
> > AFAIK, that's already the case. The REXW prefix is only emitted for
> > 64-bit ops.
> 
> Yes, but a load from a 64-bit register to a 32-bit destination emits
> REX.W.  From Leon's dump:
> 
>  mov_i32 tmp1,w0.d0  => mov    0xe8(%r14),%rbp
>  mov_i32 tmp0,tmp1
>  mov_i32 t8,tmp0     => mov    %ebp,0x60(%r14)
> 
> Note %rbp as the load destination and %ebp as the source of the store.

Indeed, that's something we might want to improve (and it is due to the
fact that we have just replaced trunc_shr_i64_i32 by a move on x86). Note
however that this simplification might be target-specific (it is at
least little-endian-specific if we don't adjust the address).
Richard Henderson July 15, 2015, 9:46 a.m. UTC | #8
On 07/14/2015 05:38 PM, Leon Alrae wrote:
> There seems to be an issue when trying to keep a pointer in bottom 32-bits
> of a 64-bit floating point register. Load and store instructions accessing
> this address for some reason use the whole 64-bit content of floating point
> register rather than truncated 32-bit value. The following load uses
> incorrect address which leads to a crash if upper 32 bits of $f0 isn't 0:
>
> 0x00400c60:  mfc1       t8,$f0
> 0x00400c64:  lw t9,0(t8)
>
> It can be reproduced with the following linux userland program when running
> on a MIPS32 with CP0.Status.FR=1 (by default mips32r5-generic and
> mips32r6-generic CPUs have this bit set in linux-user).
>
> int main(int argc, char *argv[])
> {
>      int tmp = 0x11111111;
>      /* Set f0 */
>      __asm__ ("mtc1  %0, $f0\n"
>               "mthc1 %1, $f0\n"
>               : : "r" (&tmp), "r" (tmp));
>      /* At this point $f0: w:76fff040 d:1111111176fff040 */
>      __asm__ ("mfc1 $t8, $f0\n"
>               "lw   $t9, 0($t8)\n"); /* <--- crash! */
>      return 0;
> }

What compilation options, exactly?  I'm having trouble reproducing.
Alternatively, perhaps you can send me a binary.


r~
Aurelien Jarno July 15, 2015, 9:59 a.m. UTC | #9
On 2015-07-15 10:46, Richard Henderson wrote:
> On 07/14/2015 05:38 PM, Leon Alrae wrote:
> >There seems to be an issue when trying to keep a pointer in bottom 32-bits
> >of a 64-bit floating point register. Load and store instructions accessing
> >this address for some reason use the whole 64-bit content of floating point
> >register rather than truncated 32-bit value. The following load uses
> >incorrect address which leads to a crash if upper 32 bits of $f0 isn't 0:
> >
> >0x00400c60:  mfc1       t8,$f0
> >0x00400c64:  lw t9,0(t8)
> >
> >It can be reproduced with the following linux userland program when running
> >on a MIPS32 with CP0.Status.FR=1 (by default mips32r5-generic and
> >mips32r6-generic CPUs have this bit set in linux-user).
> >
> >int main(int argc, char *argv[])
> >{
> >     int tmp = 0x11111111;
> >     /* Set f0 */
> >     __asm__ ("mtc1  %0, $f0\n"
> >              "mthc1 %1, $f0\n"
> >              : : "r" (&tmp), "r" (tmp));
> >     /* At this point $f0: w:76fff040 d:1111111176fff040 */
> >     __asm__ ("mfc1 $t8, $f0\n"
> >              "lw   $t9, 0($t8)\n"); /* <--- crash! */
> >     return 0;
> >}
> 
> What compilation options, exactly?  I'm having trouble reproducing.
> Alternately, perhaps you can send me a binary.

Please find attached the corresponding static binary. You should run it
with:

  qemu-mipsel -cpu mips32r5-generic ./mfc1

Patch

diff --git a/target-mips/translate.c b/target-mips/translate.c
index 3ae09f8..3f6b701 100644
--- a/target-mips/translate.c
+++ b/target-mips/translate.c
@@ -8731,6 +8731,12 @@  static void gen_cp1 (DisasContext *ctx, uint32_t opc, int rt, int fs)
         }
         gen_store_gpr(t0, rt);
         opn = "mfc1";
+#if defined(CONFIG_USER_ONLY)
+        /* FIXME
+           Workaround: end translation to avoid TCG optimization with next
+           instruction. */
+        ctx->bstate = BS_STOP;
+#endif
         break;
     case OPC_MTC1:
         gen_load_gpr(t0, rt);