diff mbox

[3/4] tcg-ppc: Convert to helper_ret_ld/st_mmu

Message ID 1378051658-3393-4-git-send-email-rth@twiddle.net
State New
Headers show

Commit Message

Richard Henderson Sept. 1, 2013, 4:07 p.m. UTC
Drop the ld/st_trampolines, loading the return address into a
parameter register directly.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 include/exec/exec-all.h |   4 +-
 tcg/ppc/tcg-target.c    | 220 +++++++++++++++++++-----------------------------
 2 files changed, 86 insertions(+), 138 deletions(-)

Comments

Paolo Bonzini Aug. 19, 2013, 7:42 p.m. UTC | #1
On 09/01/2013 06:07 PM, Richard Henderson wrote:
> Drop the ld/st_trampolines, loading the return address into a
> parameter register directly.
>
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>   include/exec/exec-all.h |   4 +-
>   tcg/ppc/tcg-target.c    | 220 +++++++++++++++++++-----------------------------
>   2 files changed, 86 insertions(+), 138 deletions(-)
>
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index beb4149..a81e805 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -324,9 +324,7 @@ extern uintptr_t tci_tb_ptr;
>      In some implementations, we pass the "logical" return address manually;
>      in others, we must infer the logical return from the true return.  */
>   #if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
> -# if defined (_ARCH_PPC) && !defined (_ARCH_PPC64)
> -#  define GETRA_LDST(RA)   (*(int32_t *)((RA) - 4))
> -# elif defined(__arm__)
> +# if defined(__arm__)
>   /* We define two insns between the return address and the branch back to
>      straight-line.  Find and decode that branch insn.  */
>   #  define GETRA_LDST(RA)   tcg_getra_ldst(RA)
> diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c
> index b0fbc54..a890319 100644
> --- a/tcg/ppc/tcg-target.c
> +++ b/tcg/ppc/tcg-target.c
> @@ -551,27 +551,26 @@ static void add_qemu_ldst_label (TCGContext *s,
>       label->label_ptr[0] = label_ptr;
>   }
>
> -/* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
> -   int mmu_idx) */
> +/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
> + *                                     int mmu_idx, uintptr_t ra)
> + */
>   static const void * const qemu_ld_helpers[4] = {
> -    helper_ldb_mmu,
> -    helper_ldw_mmu,
> -    helper_ldl_mmu,
> -    helper_ldq_mmu,
> +    helper_ret_ldub_mmu,
> +    helper_ret_lduw_mmu,
> +    helper_ret_ldul_mmu,
> +    helper_ret_ldq_mmu,
>   };
>
> -/* helper signature: helper_st_mmu(CPUState *env, target_ulong addr,
> -   uintxx_t val, int mmu_idx) */
> +/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
> + *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
> + */
>   static const void * const qemu_st_helpers[4] = {
> -    helper_stb_mmu,
> -    helper_stw_mmu,
> -    helper_stl_mmu,
> -    helper_stq_mmu,
> +    helper_ret_stb_mmu,
> +    helper_ret_stw_mmu,
> +    helper_ret_stl_mmu,
> +    helper_ret_stq_mmu,
>   };
>
> -static void *ld_trampolines[4];
> -static void *st_trampolines[4];
> -
>   static void tcg_out_tlb_check (TCGContext *s, int r0, int r1, int r2,
>                                  int addr_reg, int addr_reg2, int s_bits,
>                                  int offset1, int offset2, uint8_t **label_ptr)
> @@ -608,9 +607,14 @@ static void tcg_out_tlb_check (TCGContext *s, int r0, int r1, int r2,
>       tcg_out32 (s, CMP | BF (6) | RA (addr_reg2) | RB (r1));
>       tcg_out32 (s, CRAND | BT (7, CR_EQ) | BA (6, CR_EQ) | BB (7, CR_EQ));
>   #endif
> +
> +    /* Use a conditional branch-and-link so that we load a pointer to
> +       somewhere within the current opcode, for passing on to the helper.
> +       This address cannot be used for a tail call, but it's shorter
> +       than forming an address from scratch.  */
>       *label_ptr = s->code_ptr;
>       retranst = ((uint16_t *) s->code_ptr)[1] & ~3;
> -    tcg_out32 (s, BC | BI (7, CR_EQ) | retranst | BO_COND_FALSE);
> +    tcg_out32(s, BC | BI(7, CR_EQ) | retranst | BO_COND_FALSE | LK);
>
>       /* r0 now contains &env->tlb_table[mem_index][index].addr_x */
>       tcg_out32 (s, (LWZ
> @@ -833,132 +837,99 @@ static void tcg_out_qemu_st (TCGContext *s, const TCGArg *args, int opc)
>   }
>
>   #if defined(CONFIG_SOFTMMU)
> -static void tcg_out_qemu_ld_slow_path (TCGContext *s, TCGLabelQemuLdst *label)
> +static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
>   {
> -    int s_bits;
> -    int ir;
> -    int opc = label->opc;
> -    int mem_index = label->mem_index;
> -    int data_reg = label->datalo_reg;
> -    int data_reg2 = label->datahi_reg;
> -    int addr_reg = label->addrlo_reg;
> -    uint8_t *raddr = label->raddr;
> -    uint8_t **label_ptr = &label->label_ptr[0];
> +    TCGReg ir, datalo, datahi;
> +    int opc = lb->opc;
>
> -    s_bits = opc & 3;
> +    reloc_pc14(lb->label_ptr[0], (uintptr_t)s->code_ptr);
>
> -    /* resolve label address */
> -    reloc_pc14 (label_ptr[0], (tcg_target_long) s->code_ptr);
> +    ir = TCG_REG_R3;
> +    tcg_out_mov(s, TCG_TYPE_PTR, ir++, TCG_AREG0);
>
> -    /* slow path */
> -    ir = 4;
> -#if TARGET_LONG_BITS == 32
> -    tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg);
> -#else
> +    if (TARGET_LONG_BITS == 32) {
> +        tcg_out_mov(s, TCG_TYPE_I32, ir++, lb->addrlo_reg);
> +    } else {
>   #ifdef TCG_TARGET_CALL_ALIGN_ARGS
> -    ir |= 1;
> -#endif
> -    tcg_out_mov (s, TCG_TYPE_I32, ir++, label->addrhi_reg);
> -    tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg);
> +        ir |= 1;
>   #endif
> -    tcg_out_movi (s, TCG_TYPE_I32, ir, mem_index);
> -    tcg_out_call (s, (tcg_target_long) ld_trampolines[s_bits], 1);
> -    tcg_out32 (s, (tcg_target_long) raddr);
> +        tcg_out_mov(s, TCG_TYPE_I32, ir++, lb->addrhi_reg);
> +        tcg_out_mov(s, TCG_TYPE_I32, ir++, lb->addrlo_reg);
> +    }
> +
> +    tcg_out_movi(s, TCG_TYPE_I32, ir++, lb->mem_index);
> +    tcg_out32(s, MFSPR | RT(ir++) | LR);
> +
> +    tcg_out_call(s, (uintptr_t)qemu_ld_helpers[opc & 3], 1);
> +
> +    datalo = lb->datalo_reg;
>       switch (opc) {
>       case 0|4:
> -        tcg_out32 (s, EXTSB | RA (data_reg) | RS (3));
> +        tcg_out32(s, EXTSB | RA(datalo) | RS(TCG_REG_R3));
>           break;
>       case 1|4:
> -        tcg_out32 (s, EXTSH | RA (data_reg) | RS (3));
> +        tcg_out32(s, EXTSH | RA(datalo) | RS(TCG_REG_R3));
>           break;
> -    case 0:
> -    case 1:
> -    case 2:
> -        if (data_reg != 3)
> -            tcg_out_mov (s, TCG_TYPE_I32, data_reg, 3);
> +
> +    default:
> +        tcg_out_mov(s, TCG_TYPE_I32, datalo, TCG_REG_R3);
>           break;
> +
>       case 3:
> -        if (data_reg == 3) {
> -            if (data_reg2 == 4) {
> -                tcg_out_mov (s, TCG_TYPE_I32, 0, 4);
> -                tcg_out_mov (s, TCG_TYPE_I32, 4, 3);
> -                tcg_out_mov (s, TCG_TYPE_I32, 3, 0);
> -            }
> -            else {
> -                tcg_out_mov (s, TCG_TYPE_I32, data_reg2, 3);
> -                tcg_out_mov (s, TCG_TYPE_I32, 3, 4);
> -            }
> -        }
> -        else {
> -            if (data_reg != 4) tcg_out_mov (s, TCG_TYPE_I32, data_reg, 4);
> -            if (data_reg2 != 3) tcg_out_mov (s, TCG_TYPE_I32, data_reg2, 3);
> +        datahi = lb->datahi_reg;
> +        if (datalo != TCG_REG_R3) {
> +            tcg_out_mov(s, TCG_TYPE_I32, datalo, TCG_REG_R4);
> +            tcg_out_mov(s, TCG_TYPE_I32, datahi, TCG_REG_R3);
> +        } else if (datahi != TCG_REG_R4) {
> +            tcg_out_mov(s, TCG_TYPE_I32, datahi, TCG_REG_R3);
> +            tcg_out_mov(s, TCG_TYPE_I32, datalo, TCG_REG_R4);
> +        } else {
> +            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R0, TCG_REG_R4);
> +            tcg_out_mov(s, TCG_TYPE_I32, datahi, TCG_REG_R3);
> +            tcg_out_mov(s, TCG_TYPE_I32, datalo, TCG_REG_R0);
>           }
>           break;
>       }
> +
>       /* Jump to the code corresponding to next IR of qemu_st */
> -    tcg_out_b (s, 0, (tcg_target_long) raddr);
> +    tcg_out_b(s, 0, (uintptr_t)lb->raddr);
>   }
>
> -static void tcg_out_qemu_st_slow_path (TCGContext *s, TCGLabelQemuLdst *label)
> +static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
>   {
> -    int ir;
> -    int opc = label->opc;
> -    int mem_index = label->mem_index;
> -    int data_reg = label->datalo_reg;
> -    int data_reg2 = label->datahi_reg;
> -    int addr_reg = label->addrlo_reg;
> -    uint8_t *raddr = label->raddr;
> -    uint8_t **label_ptr = &label->label_ptr[0];
> -
> -    /* resolve label address */
> -    reloc_pc14 (label_ptr[0], (tcg_target_long) s->code_ptr);
> -
> -    /* slow path */
> -    ir = 4;
> -#if TARGET_LONG_BITS == 32
> -    tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg);
> -#else
> +    TCGReg ir;
> +
> +    reloc_pc14(lb->label_ptr[0], (uintptr_t)s->code_ptr);
> +
> +    ir = TCG_REG_R3;
> +    tcg_out_mov(s, TCG_TYPE_PTR, ir++, TCG_AREG0);
> +
> +    if (TARGET_LONG_BITS == 32) {
> +        tcg_out_mov(s, TCG_TYPE_I32, ir++, lb->addrlo_reg);
> +    } else {
>   #ifdef TCG_TARGET_CALL_ALIGN_ARGS
> -    ir |= 1;
> -#endif
> -    tcg_out_mov (s, TCG_TYPE_I32, ir++, label->addrhi_reg);
> -    tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg);
> +        ir |= 1;
>   #endif
> +        tcg_out_mov(s, TCG_TYPE_I32, ir++, lb->addrhi_reg);
> +        tcg_out_mov(s, TCG_TYPE_I32, ir++, lb->addrlo_reg);
> +    }
>
> -    switch (opc) {
> -    case 0:
> -        tcg_out32 (s, (RLWINM
> -                       | RA (ir)
> -                       | RS (data_reg)
> -                       | SH (0)
> -                       | MB (24)
> -                       | ME (31)));
> -        break;
> -    case 1:
> -        tcg_out32 (s, (RLWINM
> -                       | RA (ir)
> -                       | RS (data_reg)
> -                       | SH (0)
> -                       | MB (16)
> -                       | ME (31)));
> -        break;
> -    case 2:
> -        tcg_out_mov (s, TCG_TYPE_I32, ir, data_reg);
> -        break;
> -    case 3:
> +    if (lb->opc != 3) {
> +        tcg_out_mov(s, TCG_TYPE_I32, ir++, lb->datalo_reg);
> +    } else {
>   #ifdef TCG_TARGET_CALL_ALIGN_ARGS
>           ir |= 1;
>   #endif
> -        tcg_out_mov (s, TCG_TYPE_I32, ir++, data_reg2);
> -        tcg_out_mov (s, TCG_TYPE_I32, ir, data_reg);
> -        break;
> +        tcg_out_mov(s, TCG_TYPE_I32, ir++, lb->datahi_reg);
> +        tcg_out_mov(s, TCG_TYPE_I32, ir++, lb->datalo_reg);
>       }
> -    ir++;
>
> -    tcg_out_movi (s, TCG_TYPE_I32, ir, mem_index);
> -    tcg_out_call (s, (tcg_target_long) st_trampolines[opc], 1);
> -    tcg_out32 (s, (tcg_target_long) raddr);
> -    tcg_out_b (s, 0, (tcg_target_long) raddr);
> +    tcg_out_movi(s, TCG_TYPE_I32, ir++, lb->mem_index);
> +    tcg_out32(s, MFSPR | RT(ir++) | LR);
> +
> +    tcg_out_call(s, (uintptr_t)qemu_st_helpers[lb->opc], 1);
> +
> +    tcg_out_b(s, 0, (uintptr_t)lb->raddr);
>   }
>
>   void tcg_out_tb_finalize(TCGContext *s)
> @@ -979,17 +950,6 @@ void tcg_out_tb_finalize(TCGContext *s)
>   }
>   #endif
>
> -#ifdef CONFIG_SOFTMMU
> -static void emit_ldst_trampoline (TCGContext *s, const void *ptr)
> -{
> -    tcg_out32 (s, MFSPR | RT (3) | LR);
> -    tcg_out32 (s, ADDI | RT (3) | RA (3) | 4);
> -    tcg_out32 (s, MTSPR | RS (3) | LR);
> -    tcg_out_mov (s, TCG_TYPE_I32, 3, TCG_AREG0);
> -    tcg_out_b (s, 0, (tcg_target_long) ptr);
> -}
> -#endif
> -
>   static void tcg_target_qemu_prologue (TCGContext *s)
>   {
>       int i, frame_size;
> @@ -1050,16 +1010,6 @@ static void tcg_target_qemu_prologue (TCGContext *s)
>       tcg_out32 (s, MTSPR | RS (0) | LR);
>       tcg_out32 (s, ADDI | RT (1) | RA (1) | frame_size);
>       tcg_out32 (s, BCLR | BO_ALWAYS);
> -
> -#ifdef CONFIG_SOFTMMU
> -    for (i = 0; i < 4; ++i) {
> -        ld_trampolines[i] = s->code_ptr;
> -        emit_ldst_trampoline (s, qemu_ld_helpers[i]);
> -
> -        st_trampolines[i] = s->code_ptr;
> -        emit_ldst_trampoline (s, qemu_st_helpers[i]);
> -    }
> -#endif
>   }
>
>   static void tcg_out_ld (TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1,
>

Bad news... with this patch, either with or without patch 2, trying to 
execute sieve.flat from kvm-unit-tests (it doesn't matter if it is 
compiled as 32-bit or 64-bit, and with both i386-softmmu and 
x86_64-softmmu targets) fails as follows on my PowerBook:

qemu: fatal: Trying to execute code outside RAM or ROM at 0x70360000

EAX=00006d20 EBX=00007025 ECX=00000000 EDX=00000000
ESI=07fd7bd0 EDI=000f1930 EBP=07fd7b00 ESP=00006e0c
EIP=70270000 EFL=00000002 [-------] CPL=0 II=0 A20=1 SMM=0 HLT=0
ES =0000 00000000 ffffffff 008f9300
CS =f000 000f0000 ffffffff 008f9b00
SS =0000 00000000 ffffffff 008f9300
DS =0000 00000000 ffffffff 008f9300
FS =0000 00000000 ffffffff 008f9300
GS =0000 00000000 ffffffff 008f9300
LDT=0000 00000000 0000ffff 00008200
TR =0000 00000000 0000ffff 00008b00
GDT=     000f69e0 00000037
IDT=     00000000 000003ff
CR0=00000010 CR2=00000000 CR3=00000000 CR4=00000000
DR0=00000000 DR1=00000000 DR2=00000000 DR3=00000000
DR6=ffff0ff0 DR7=00000400
CCS=00000044 CCD=00006df8 CCO=ADDL
EFER=0000000000000000
FCW=037f FSW=0000 [ST=0] FTW=00 MXCSR=00001f80
FPR0=0000000000000000 0000 FPR1=0000000000000000 0000
FPR2=0000000000000000 0000 FPR3=0000000000000000 0000
FPR4=0000000000000000 0000 FPR5=0000000000000000 0000
FPR6=0000000000000000 0000 FPR7=0000000000000000 0000
XMM00=00000000000000000000000000000000 
XMM01=00000000000000000000000000000000
XMM02=00000000000000000000000000000000 
XMM03=00000000000000000000000000000000
XMM04=00000000000000000000000000000000 
XMM05=00000000000000000000000000000000
XMM06=00000000000000000000000000000000 
XMM07=00000000000000000000000000000000
Aborted

The failure happens as soon as the first hardware interrupt is serviced:

Servicing hardware INT=0x08
Servicing hardware INT=0x09
----------------
IN:
0x000fe98f:  push   %es
0x000fe990:  push   %ebp
0x000fe992:  push   %edi
0x000fe994:  push   %esi
0x000fe996:  push   %ebx
0x000fe998:  sub    $0x44,%esp
0x000fe99c:  mov    $0x40,%eax
0x000fe9a2:  mov    %ax,%es
0x000fe9a4:  mov    %es:0x6c,%edx
0x000fe9aa:  inc    %edx
0x000fe9ac:  cmp    $0x1800af,%edx
0x000fe9b3:  jbe    0xfe9c6

----------------
IN:
0x000f77f4:  xor    %edx,%edx
0x000f77f7:  calll  *%ecx

qemu: fatal: Trying to execute code outside RAM or ROM at 0x70360000

Command line:

i386-softmmu/qemu-system-i386 -kernel sieve32.flat -serial stdio -device 
isa-debug-exit,iobase=0xf4 -nographic

My two patches + 2/4 from this series work.  I didn't try 4/4 because it 
doesn't apply cleanly on top of my patches.	

Paolo
Richard Henderson Sept. 9, 2013, 5:42 p.m. UTC | #2
On 08/19/2013 12:42 PM, Paolo Bonzini wrote:
> Bad news... with this patch, either with or without patch 2, trying to execute
> sieve.flat from kvm-unit-tests (it doesn't matter if it is compiled as 32-bit
> or 64-bit, and with both i386-softmmu and x86_64-softmmu targets) fails as
> follows on my PowerBook:
> 
> qemu: fatal: Trying to execute code outside RAM or ROM at 0x70360000

Hum.  Are you sure it's anything related to the ppc backend at all?  This
test doesn't work with an x86_64 host either.

qemu: fatal: Trying to execute code outside RAM or ROM at 0x004001ba

EAX=80000011 EBX=00009500 ECX=c0000080 EDX=00000000
ESI=00000000 EDI=00542000 EBP=00000000 ESP=0044abbc
EIP=004001ba EFL=00000046 [---Z-P-] CPL=0 II=0 A20=1 SMM=0 HLT=0
ES =0010 00000000 ffffffff 00cf9300 DPL=0 DS   [-WA]
CS =0008 00000000 ffffffff 00cf9a00 DPL=0 CS32 [-R-]
SS =0010 00000000 ffffffff 00cf9300 DPL=0 DS   [-WA]
DS =0010 00000000 ffffffff 00cf9300 DPL=0 DS   [-WA]
FS =0010 00000000 ffffffff 00cf9300 DPL=0 DS   [-WA]
GS =0010 00000000 ffffffff 00cf9300 DPL=0 DS   [-WA]
LDT=0000 00000000 0000ffff 00008200 DPL=0 LDT
TR =0000 00000000 0000ffff 00008b00 DPL=0 TSS32-busy
GDT=     0040800a 00000447
IDT=     00000000 000003ff
CR0=80000011 CR2=00000000 CR3=00407000 CR4=00000020
DR0=00000000 DR1=00000000 DR2=00000000 DR3=00000000
DR6=ffff0ff0 DR7=00000400
CCS=00000000 CCD=00000000 CCO=SARL
EFER=0000000000000000
FCW=037f FSW=0000 [ST=0] FTW=00 MXCSR=00001f80
FPR0=0000000000000000 0000 FPR1=0000000000000000 0000
FPR2=0000000000000000 0000 FPR3=0000000000000000 0000
FPR4=0000000000000000 0000 FPR5=0000000000000000 0000
FPR6=0000000000000000 0000 FPR7=0000000000000000 0000
XMM00=00000000000000000000000000000000 XMM01=00000000000000000000000000000000
XMM02=00000000000000000000000000000000 XMM03=00000000000000000000000000000000
XMM04=00000000000000000000000000000000 XMM05=00000000000000000000000000000000
XMM06=00000000000000000000000000000000 XMM07=00000000000000000000000000000000
Aborted (core dumped)

This happens after one of the writes to %cr0.  Of course, the test works with
kvm enabled, so I don't blame the test so much as the target-i386 front end...

This is not new breakage, either.  I've checked back through 1.4.0 and I can't
make it work with any version of TCG.


r~
Paolo Bonzini Sept. 9, 2013, 5:49 p.m. UTC | #3
Il 09/09/2013 19:42, Richard Henderson ha scritto:
> On 08/19/2013 12:42 PM, Paolo Bonzini wrote:
>> Bad news... with this patch, either with or without patch 2, trying to execute
>> sieve.flat from kvm-unit-tests (it doesn't matter if it is compiled as 32-bit
>> or 64-bit, and with both i386-softmmu and x86_64-softmmu targets) fails as
>> follows on my PowerBook:
>>
>> qemu: fatal: Trying to execute code outside RAM or ROM at 0x70360000
> 
> Hum.  Are you sure it's anything related to the ppc backend at all?  This
> test doesn't work with an x86_64 host either.
> 
> qemu: fatal: Trying to execute code outside RAM or ROM at 0x004001ba
> 
> EAX=80000011 EBX=00009500 ECX=c0000080 EDX=00000000
> ESI=00000000 EDI=00542000 EBP=00000000 ESP=0044abbc
> EIP=004001ba EFL=00000046 [---Z-P-] CPL=0 II=0 A20=1 SMM=0 HLT=0
> ES =0010 00000000 ffffffff 00cf9300 DPL=0 DS   [-WA]
> CS =0008 00000000 ffffffff 00cf9a00 DPL=0 CS32 [-R-]
> SS =0010 00000000 ffffffff 00cf9300 DPL=0 DS   [-WA]
> DS =0010 00000000 ffffffff 00cf9300 DPL=0 DS   [-WA]
> FS =0010 00000000 ffffffff 00cf9300 DPL=0 DS   [-WA]
> GS =0010 00000000 ffffffff 00cf9300 DPL=0 DS   [-WA]
> LDT=0000 00000000 0000ffff 00008200 DPL=0 LDT
> TR =0000 00000000 0000ffff 00008b00 DPL=0 TSS32-busy
> GDT=     0040800a 00000447
> IDT=     00000000 000003ff
> CR0=80000011 CR2=00000000 CR3=00407000 CR4=00000020
> DR0=00000000 DR1=00000000 DR2=00000000 DR3=00000000
> DR6=ffff0ff0 DR7=00000400
> CCS=00000000 CCD=00000000 CCO=SARL
> EFER=0000000000000000
> FCW=037f FSW=0000 [ST=0] FTW=00 MXCSR=00001f80
> FPR0=0000000000000000 0000 FPR1=0000000000000000 0000
> FPR2=0000000000000000 0000 FPR3=0000000000000000 0000
> FPR4=0000000000000000 0000 FPR5=0000000000000000 0000
> FPR6=0000000000000000 0000 FPR7=0000000000000000 0000
> XMM00=00000000000000000000000000000000 XMM01=00000000000000000000000000000000
> XMM02=00000000000000000000000000000000 XMM03=00000000000000000000000000000000
> XMM04=00000000000000000000000000000000 XMM05=00000000000000000000000000000000
> XMM06=00000000000000000000000000000000 XMM07=00000000000000000000000000000000
> Aborted (core dumped)
> 
> This happens after one of the writes to %cr0.  Of course, the test works with
> kvm enabled, so I don't blame the test so much as the target-i386 front end...
> 
> This is not new breakage, either.  I've checked back through 1.4.0 and I can't
> make it work with any version of TCG.

Strange... works here with 1.6.0 from Fedora

$ time qemu-system-x86_64 -device isa-debug-exit,iobase=0xf4 -serial
stdio -kernel sieve64.flat
enabling apic
starting sieve
static:78498 out of 1000000
paging enabled
cr0 = 80010011
cr3 = 7fff000
cr4 = 20
mapped:78498 out of 1000000
virtual:5761455 out of 100000000
virtual:5761455 out of 100000000
virtual:5761455 out of 100000000

real	0m50.056s
user	0m49.467s
sys	0m0.415s

I sent you my binaries offlist.

Paolo
Richard Henderson Sept. 9, 2013, 6:20 p.m. UTC | #4
On 09/09/2013 10:49 AM, Paolo Bonzini wrote:
> I sent you my binaries offlist.

And apparently there was something wrong with the binaries I built myself, as
yours work.  I'll now look at my ppc32 changes and see what's what.


r~
diff mbox

Patch

diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index beb4149..a81e805 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -324,9 +324,7 @@  extern uintptr_t tci_tb_ptr;
    In some implementations, we pass the "logical" return address manually;
    in others, we must infer the logical return from the true return.  */
 #if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
-# if defined (_ARCH_PPC) && !defined (_ARCH_PPC64)
-#  define GETRA_LDST(RA)   (*(int32_t *)((RA) - 4))
-# elif defined(__arm__)
+# if defined(__arm__)
 /* We define two insns between the return address and the branch back to
    straight-line.  Find and decode that branch insn.  */
 #  define GETRA_LDST(RA)   tcg_getra_ldst(RA)
diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c
index b0fbc54..a890319 100644
--- a/tcg/ppc/tcg-target.c
+++ b/tcg/ppc/tcg-target.c
@@ -551,27 +551,26 @@  static void add_qemu_ldst_label (TCGContext *s,
     label->label_ptr[0] = label_ptr;
 }
 
-/* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
-   int mmu_idx) */
+/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
+ *                                     int mmu_idx, uintptr_t ra)
+ */
 static const void * const qemu_ld_helpers[4] = {
-    helper_ldb_mmu,
-    helper_ldw_mmu,
-    helper_ldl_mmu,
-    helper_ldq_mmu,
+    helper_ret_ldub_mmu,
+    helper_ret_lduw_mmu,
+    helper_ret_ldul_mmu,
+    helper_ret_ldq_mmu,
 };
 
-/* helper signature: helper_st_mmu(CPUState *env, target_ulong addr,
-   uintxx_t val, int mmu_idx) */
+/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
+ *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
+ */
 static const void * const qemu_st_helpers[4] = {
-    helper_stb_mmu,
-    helper_stw_mmu,
-    helper_stl_mmu,
-    helper_stq_mmu,
+    helper_ret_stb_mmu,
+    helper_ret_stw_mmu,
+    helper_ret_stl_mmu,
+    helper_ret_stq_mmu,
 };
 
-static void *ld_trampolines[4];
-static void *st_trampolines[4];
-
 static void tcg_out_tlb_check (TCGContext *s, int r0, int r1, int r2,
                                int addr_reg, int addr_reg2, int s_bits,
                                int offset1, int offset2, uint8_t **label_ptr)
@@ -608,9 +607,14 @@  static void tcg_out_tlb_check (TCGContext *s, int r0, int r1, int r2,
     tcg_out32 (s, CMP | BF (6) | RA (addr_reg2) | RB (r1));
     tcg_out32 (s, CRAND | BT (7, CR_EQ) | BA (6, CR_EQ) | BB (7, CR_EQ));
 #endif
+
+    /* Use a conditional branch-and-link so that we load a pointer to
+       somewhere within the current opcode, for passing on to the helper.
+       This address cannot be used for a tail call, but it's shorter
+       than forming an address from scratch.  */
     *label_ptr = s->code_ptr;
     retranst = ((uint16_t *) s->code_ptr)[1] & ~3;
-    tcg_out32 (s, BC | BI (7, CR_EQ) | retranst | BO_COND_FALSE);
+    tcg_out32(s, BC | BI(7, CR_EQ) | retranst | BO_COND_FALSE | LK);
 
     /* r0 now contains &env->tlb_table[mem_index][index].addr_x */
     tcg_out32 (s, (LWZ
@@ -833,132 +837,99 @@  static void tcg_out_qemu_st (TCGContext *s, const TCGArg *args, int opc)
 }
 
 #if defined(CONFIG_SOFTMMU)
-static void tcg_out_qemu_ld_slow_path (TCGContext *s, TCGLabelQemuLdst *label)
+static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
-    int s_bits;
-    int ir;
-    int opc = label->opc;
-    int mem_index = label->mem_index;
-    int data_reg = label->datalo_reg;
-    int data_reg2 = label->datahi_reg;
-    int addr_reg = label->addrlo_reg;
-    uint8_t *raddr = label->raddr;
-    uint8_t **label_ptr = &label->label_ptr[0];
+    TCGReg ir, datalo, datahi;
+    int opc = lb->opc;
 
-    s_bits = opc & 3;
+    reloc_pc14(lb->label_ptr[0], (uintptr_t)s->code_ptr);
 
-    /* resolve label address */
-    reloc_pc14 (label_ptr[0], (tcg_target_long) s->code_ptr);
+    ir = TCG_REG_R3;
+    tcg_out_mov(s, TCG_TYPE_PTR, ir++, TCG_AREG0);
 
-    /* slow path */
-    ir = 4;
-#if TARGET_LONG_BITS == 32
-    tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg);
-#else
+    if (TARGET_LONG_BITS == 32) {
+        tcg_out_mov(s, TCG_TYPE_I32, ir++, lb->addrlo_reg);
+    } else {
 #ifdef TCG_TARGET_CALL_ALIGN_ARGS
-    ir |= 1;
-#endif
-    tcg_out_mov (s, TCG_TYPE_I32, ir++, label->addrhi_reg);
-    tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg);
+        ir |= 1;
 #endif
-    tcg_out_movi (s, TCG_TYPE_I32, ir, mem_index);
-    tcg_out_call (s, (tcg_target_long) ld_trampolines[s_bits], 1);
-    tcg_out32 (s, (tcg_target_long) raddr);
+        tcg_out_mov(s, TCG_TYPE_I32, ir++, lb->addrhi_reg);
+        tcg_out_mov(s, TCG_TYPE_I32, ir++, lb->addrlo_reg);
+    }
+
+    tcg_out_movi(s, TCG_TYPE_I32, ir++, lb->mem_index);
+    tcg_out32(s, MFSPR | RT(ir++) | LR);
+
+    tcg_out_call(s, (uintptr_t)qemu_ld_helpers[opc & 3], 1);
+
+    datalo = lb->datalo_reg;
     switch (opc) {
     case 0|4:
-        tcg_out32 (s, EXTSB | RA (data_reg) | RS (3));
+        tcg_out32(s, EXTSB | RA(datalo) | RS(TCG_REG_R3));
         break;
     case 1|4:
-        tcg_out32 (s, EXTSH | RA (data_reg) | RS (3));
+        tcg_out32(s, EXTSH | RA(datalo) | RS(TCG_REG_R3));
         break;
-    case 0:
-    case 1:
-    case 2:
-        if (data_reg != 3)
-            tcg_out_mov (s, TCG_TYPE_I32, data_reg, 3);
+
+    default:
+        tcg_out_mov(s, TCG_TYPE_I32, datalo, TCG_REG_R3);
         break;
+
     case 3:
-        if (data_reg == 3) {
-            if (data_reg2 == 4) {
-                tcg_out_mov (s, TCG_TYPE_I32, 0, 4);
-                tcg_out_mov (s, TCG_TYPE_I32, 4, 3);
-                tcg_out_mov (s, TCG_TYPE_I32, 3, 0);
-            }
-            else {
-                tcg_out_mov (s, TCG_TYPE_I32, data_reg2, 3);
-                tcg_out_mov (s, TCG_TYPE_I32, 3, 4);
-            }
-        }
-        else {
-            if (data_reg != 4) tcg_out_mov (s, TCG_TYPE_I32, data_reg, 4);
-            if (data_reg2 != 3) tcg_out_mov (s, TCG_TYPE_I32, data_reg2, 3);
+        datahi = lb->datahi_reg;
+        if (datalo != TCG_REG_R3) {
+            tcg_out_mov(s, TCG_TYPE_I32, datalo, TCG_REG_R4);
+            tcg_out_mov(s, TCG_TYPE_I32, datahi, TCG_REG_R3);
+        } else if (datahi != TCG_REG_R4) {
+            tcg_out_mov(s, TCG_TYPE_I32, datahi, TCG_REG_R3);
+            tcg_out_mov(s, TCG_TYPE_I32, datalo, TCG_REG_R4);
+        } else {
+            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R0, TCG_REG_R4);
+            tcg_out_mov(s, TCG_TYPE_I32, datahi, TCG_REG_R3);
+            tcg_out_mov(s, TCG_TYPE_I32, datalo, TCG_REG_R0);
         }
         break;
     }
+
     /* Jump to the code corresponding to next IR of qemu_st */
-    tcg_out_b (s, 0, (tcg_target_long) raddr);
+    tcg_out_b(s, 0, (uintptr_t)lb->raddr);
 }
 
-static void tcg_out_qemu_st_slow_path (TCGContext *s, TCGLabelQemuLdst *label)
+static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
-    int ir;
-    int opc = label->opc;
-    int mem_index = label->mem_index;
-    int data_reg = label->datalo_reg;
-    int data_reg2 = label->datahi_reg;
-    int addr_reg = label->addrlo_reg;
-    uint8_t *raddr = label->raddr;
-    uint8_t **label_ptr = &label->label_ptr[0];
-
-    /* resolve label address */
-    reloc_pc14 (label_ptr[0], (tcg_target_long) s->code_ptr);
-
-    /* slow path */
-    ir = 4;
-#if TARGET_LONG_BITS == 32
-    tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg);
-#else
+    TCGReg ir;
+
+    reloc_pc14(lb->label_ptr[0], (uintptr_t)s->code_ptr);
+
+    ir = TCG_REG_R3;
+    tcg_out_mov(s, TCG_TYPE_PTR, ir++, TCG_AREG0);
+
+    if (TARGET_LONG_BITS == 32) {
+        tcg_out_mov(s, TCG_TYPE_I32, ir++, lb->addrlo_reg);
+    } else {
 #ifdef TCG_TARGET_CALL_ALIGN_ARGS
-    ir |= 1;
-#endif
-    tcg_out_mov (s, TCG_TYPE_I32, ir++, label->addrhi_reg);
-    tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg);
+        ir |= 1;
 #endif
+        tcg_out_mov(s, TCG_TYPE_I32, ir++, lb->addrhi_reg);
+        tcg_out_mov(s, TCG_TYPE_I32, ir++, lb->addrlo_reg);
+    }
 
-    switch (opc) {
-    case 0:
-        tcg_out32 (s, (RLWINM
-                       | RA (ir)
-                       | RS (data_reg)
-                       | SH (0)
-                       | MB (24)
-                       | ME (31)));
-        break;
-    case 1:
-        tcg_out32 (s, (RLWINM
-                       | RA (ir)
-                       | RS (data_reg)
-                       | SH (0)
-                       | MB (16)
-                       | ME (31)));
-        break;
-    case 2:
-        tcg_out_mov (s, TCG_TYPE_I32, ir, data_reg);
-        break;
-    case 3:
+    if (lb->opc != 3) {
+        tcg_out_mov(s, TCG_TYPE_I32, ir++, lb->datalo_reg);
+    } else {
 #ifdef TCG_TARGET_CALL_ALIGN_ARGS
         ir |= 1;
 #endif
-        tcg_out_mov (s, TCG_TYPE_I32, ir++, data_reg2);
-        tcg_out_mov (s, TCG_TYPE_I32, ir, data_reg);
-        break;
+        tcg_out_mov(s, TCG_TYPE_I32, ir++, lb->datahi_reg);
+        tcg_out_mov(s, TCG_TYPE_I32, ir++, lb->datalo_reg);
     }
-    ir++;
 
-    tcg_out_movi (s, TCG_TYPE_I32, ir, mem_index);
-    tcg_out_call (s, (tcg_target_long) st_trampolines[opc], 1);
-    tcg_out32 (s, (tcg_target_long) raddr);
-    tcg_out_b (s, 0, (tcg_target_long) raddr);
+    tcg_out_movi(s, TCG_TYPE_I32, ir++, lb->mem_index);
+    tcg_out32(s, MFSPR | RT(ir++) | LR);
+
+    tcg_out_call(s, (uintptr_t)qemu_st_helpers[lb->opc], 1);
+
+    tcg_out_b(s, 0, (uintptr_t)lb->raddr);
 }
 
 void tcg_out_tb_finalize(TCGContext *s)
@@ -979,17 +950,6 @@  void tcg_out_tb_finalize(TCGContext *s)
 }
 #endif
 
-#ifdef CONFIG_SOFTMMU
-static void emit_ldst_trampoline (TCGContext *s, const void *ptr)
-{
-    tcg_out32 (s, MFSPR | RT (3) | LR);
-    tcg_out32 (s, ADDI | RT (3) | RA (3) | 4);
-    tcg_out32 (s, MTSPR | RS (3) | LR);
-    tcg_out_mov (s, TCG_TYPE_I32, 3, TCG_AREG0);
-    tcg_out_b (s, 0, (tcg_target_long) ptr);
-}
-#endif
-
 static void tcg_target_qemu_prologue (TCGContext *s)
 {
     int i, frame_size;
@@ -1050,16 +1010,6 @@  static void tcg_target_qemu_prologue (TCGContext *s)
     tcg_out32 (s, MTSPR | RS (0) | LR);
     tcg_out32 (s, ADDI | RT (1) | RA (1) | frame_size);
     tcg_out32 (s, BCLR | BO_ALWAYS);
-
-#ifdef CONFIG_SOFTMMU
-    for (i = 0; i < 4; ++i) {
-        ld_trampolines[i] = s->code_ptr;
-        emit_ldst_trampoline (s, qemu_ld_helpers[i]);
-
-        st_trampolines[i] = s->code_ptr;
-        emit_ldst_trampoline (s, qemu_st_helpers[i]);
-    }
-#endif
 }
 
 static void tcg_out_ld (TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1,