diff mbox

target-tilegx: Execute _start and reach to __libc_start_main successfully

Message ID 54EC2DEE.8050809@sunrus.com.cn
State New
Headers show

Commit Message

Chen Gang Feb. 24, 2015, 7:53 a.m. UTC
Tilegx Qemu can decode bundle, disassemble code, and generate tcg code
for 1st TB block (__start). Then directly jump to __libc_start_main (2nd
TB block).

In __libc_start_main, tilegx qemu continues executing, and reach to the
indirectly jump statement, and jump to 3rd TB block correctly.

The related test/output for 1st TB block is:

  The disassembly code (have compared with objdump):

    TB block start position: [0000000000010f60] _start

    y0: 00000000500bfdb4    move r52, r54
    y1: 1c06400000000000    fnop
    y2: 0208000007600000    ld r1, r54

    x0: 0000000051483000    fnop
    x1: 180f86c600000000    addi r12, r54, -16

    x0: 00000000403f8336    andi r54, r12, -8
    x1: 286af00680000000    lnk r13

    y0: 00000000500bf005    move r5, r0
    y1: 040046c600000000    addi r12, r54, 8
    y2: 03f8000007600000    st r54, r63

    y0: 00000000500bfff7    move r55, r63
    y1: 0400468100000000    addi r2, r52, 8
    y2: 03f8000004c00000    st r12, r63

    x0: 0000000040110d86    addi r6, r54, 16
    x1: 07ffffe000000000    moveli r0, -1

    x0: 000000007ffff000    shl16insli r0, r0, -1
    x1: 000007e180000000    moveli r3, 0

    x0: 000000007ffa8000    shl16insli r0, r0, -88
    x1: 3800006180000000    shl16insli r3, r3, 0

    x0: 00000000500cd000    add r0, r0, r13
    x1: 3877406180000000    shl16insli r3, r3, 3816

    x0: 0000000010000fcc    moveli r12, 0
    x1: 2806686180000000    add r3, r3, r13

    x0: 000000007000030c    shl16insli r12, r12, 0
    x1: 000007e200000000    moveli r4, 0

    x0: 000000007039030c    shl16insli r12, r12, 912
    x1: 3800008200000000    shl16insli r4, r4, 0

    x0: 00000000500cd30c    add r12, r12, r13
    x1: 3881808200000000    shl16insli r4, r4, 4144

    x0: 00000000500cd104    add r4, r4, r13
    x1: 286a718000000000    jr r12

  The tcg code before optimization is:

    ld_i32 tmp0,env,$0xfffffffffffffff4
    movi_i32 tmp1,$0x0
    brcond_i32 tmp0,tmp1,ne,$0x0

    mov_i64 tmp2,sp
    ld_i64 tmp3,env,$0x1b0
    mov_i64 bp,tmp2
    mov_i64 r1,tmp3

    movi_i64 tmp3,$0xfffffffffffffff0
    add_i64 tmp2,sp,tmp3
    mov_i64 r12,tmp2

    movi_i64 tmp3,$0xfffffffffffffff8
    and_i64 tmp2,r12,tmp3
    movi_i64 tmp3,$0x10f78
    mov_i64 sp,tmp2
    mov_i64 r13,tmp3

    mov_i64 tmp2,r0
    movi_i64 tmp4,$0x8
    add_i64 tmp3,sp,tmp4
    st_i64 sp,env,$0x1c0
    mov_i64 r5,tmp2
    mov_i64 r12,tmp3

    movi_i64 tmp2,$0x0
    movi_i64 tmp4,$0x8
    add_i64 tmp3,bp,tmp4
    st_i64 r12,env,$0x1c0
    mov_i64 lr,tmp2
    mov_i64 r2,tmp3

    movi_i64 tmp3,$0x10
    add_i64 tmp2,sp,tmp3
    movi_i64 tmp3,$0xffffffffffffffff
    mov_i64 r6,tmp2
    mov_i64 r0,tmp3

    movi_i64 tmp3,$0x10
    shl_i64 tmp2,r0,tmp3
    movi_i64 tmp4,$0xffff
    or_i64 tmp3,tmp2,tmp4
    movi_i64 tmp2,$0x0
    mov_i64 r0,tmp3
    mov_i64 r3,tmp2

    movi_i64 tmp3,$0x10
    shl_i64 tmp2,r0,tmp3
    movi_i64 tmp4,$0xffa8
    or_i64 tmp3,tmp2,tmp4
    movi_i64 tmp4,$0x10
    shl_i64 tmp2,r3,tmp4
    mov_i64 tmp4,tmp2
    mov_i64 r0,tmp3
    mov_i64 r3,tmp4

    add_i64 tmp2,r0,r13
    movi_i64 tmp4,$0x10
    shl_i64 tmp3,r3,tmp4
    movi_i64 tmp5,$0xee8
    or_i64 tmp4,tmp3,tmp5
    mov_i64 r0,tmp2
    mov_i64 r3,tmp4

    movi_i64 tmp2,$0x0
    add_i64 tmp3,r3,r13
    mov_i64 r12,tmp2
    mov_i64 r3,tmp3

    movi_i64 tmp3,$0x10
    shl_i64 tmp2,r12,tmp3
    mov_i64 tmp3,tmp2
    movi_i64 tmp2,$0x0
    mov_i64 r12,tmp3
    mov_i64 r4,tmp2

    movi_i64 tmp3,$0x10
    shl_i64 tmp2,r12,tmp3
    movi_i64 tmp4,$0x390
    or_i64 tmp3,tmp2,tmp4
    movi_i64 tmp4,$0x10
    shl_i64 tmp2,r4,tmp4
    mov_i64 tmp4,tmp2
    mov_i64 r12,tmp3
    mov_i64 r4,tmp4

    add_i64 tmp2,r12,r13
    movi_i64 tmp4,$0x10
    shl_i64 tmp3,r4,tmp4
    movi_i64 tmp5,$0x1030
    or_i64 tmp4,tmp3,tmp5
    mov_i64 r12,tmp2
    mov_i64 r4,tmp4

    add_i64 tmp2,r4,r13
    mov_i64 tmp3,r12
    mov_i64 r4,tmp2
    mov_i64 pc,tmp3

    exit_tb $0x0

    set_label $0x0
    exit_tb $0x7fd21f340013

Signed-off-by: Chen Gang <gang.chen.5i5j@gmail.com>
---
 linux-user/main.c         |   1 +
 target-tilegx/cpu-qom.h   |   2 +
 target-tilegx/cpu.c       |   4 -
 target-tilegx/cpu.h       |  21 +-
 target-tilegx/translate.c | 876 +++++++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 885 insertions(+), 19 deletions(-)

Comments

Chen Gang Feb. 24, 2015, 8:07 a.m. UTC | #1
On 2/24/15 15:53, Chen Gang S wrote:

[...]

> diff --git a/target-tilegx/translate.c b/target-tilegx/translate.c
> index 5131fa7..82c751e 100644
> --- a/target-tilegx/translate.c
> +++ b/target-tilegx/translate.c
> @@ -25,18 +25,880 @@
>  #include "exec/cpu_ldst.h"
>  #include "exec/helper-gen.h"
>  
> +#define TILEGX_BUNDLE_SIZE            8  /* Each bundle size in bytes */
> +#define TILEGX_BUNDLE_INSNS           3  /* Maximized insns per bundle */
> +#define TILEGX_BUNDLE_OPCS            10 /* Assume maximized opcs per bundle */
> +
> +/* Check Bundle whether is Y type, else is X type */
> +#define TILEGX_BUNDLE_TYPE_MASK       0xc000000000000000ULL
> +#define TILEGX_BUNDLE_TYPE_Y(bundle)  ((bundle) & TILEGX_BUNDLE_TYPE_MASK)
> +#define TILEGX_BUNDLE_TYPE_X(bundle)  (!TILEGX_BUNDLE_TYPE_Y(bundle))
> +
> +/* Bundle pipe mask, still remain the bundle type */

Oh, this comment is incorrect, it should be: "Bundle pipe mask without
bundle type".

> +#define TILEGX_PIPE_X0(bundle)        ((bundle) & 0x000000007fffffffULL)
> +#define TILEGX_PIPE_X1(bundle)        ((bundle) & 0x3fffffff80000000ULL)
> +#define TILEGX_PIPE_Y0(bundle)        ((bundle) & 0x00000000780fffffULL)
> +#define TILEGX_PIPE_Y1(bundle)        ((bundle) & 0x3c07ffff80000000ULL)
> +#define TILEGX_PIPE_Y2(bundle)        ((bundle) & 0x03f8000007f00000ULL)
> +

[...]

Thanks.
Chris Metcalf Feb. 24, 2015, 2:21 p.m. UTC | #2
On 2/24/2015 2:53 AM, Chen Gang S wrote:
>   typedef struct CPUTLState {
> +    uint64_t regs[TILEGX_R_COUNT]; /* Common used registers by outside */
> +    uint64_t zero;                 /* Zero register */
> +    uint64_t pc;                   /* Current pc */
>       CPU_COMMON
>   } CPUTLState;

I skimmed through this and my only comment is that I was surprised to see "zero" as part of the state, since it's always 0.  :-)  No doubt there is some reason that this makes sense.

> +#define TILEGX_GEN_CHK_BEGIN(x) \
> +    if ((x) == TILEGX_R_ZERO) {
> +#define TILEGX_GEN_CHK_END(x) \
> +        return 0; \
> +    } \
> +    if ((x) >= TILEGX_R_COUNT) { \
> +        return -1; \
> +    }

This macro pattern seems potentially a little confusing and I do wonder if there is some way to avoid having to explicitly check the zero register every time; for example, maybe you make it a legitimate part of the state and declare that there are 64 registers, and then just always finish any register-update phase by re-zeroing that register?  It might yield a smaller code footprint and probably be just as fast, as long as it was easy to know where registers were updated.

Also, note that you should model accesses to registers 56..62 as causing an interrupt to be raised, rather than returning -1.  But you might choose to just put this on your TODO list and continue making forward progress for the time being.  But a FIXME comment here would be appropriate.

> +    case 0x3800000000000000ULL:

There are a lot of constants defined in the tile <arch/opcode.h> header, and presumably you could synthesize these constant values from them.  Perhaps it would make sense to import that header into qemu and then use symbolic values for all of this kind of thing?

I can't really comment on most of the rest of the code, and I only skimmed it quickly, but generally: good work getting as far as you have!
Peter Maydell Feb. 24, 2015, 2:38 p.m. UTC | #3
On 24 February 2015 at 23:21, Chris Metcalf <cmetcalf@ezchip.com> wrote:
> On 2/24/2015 2:53 AM, Chen Gang S wrote:
>>
>>   typedef struct CPUTLState {
>> +    uint64_t regs[TILEGX_R_COUNT]; /* Common used registers by outside */
>> +    uint64_t zero;                 /* Zero register */
>> +    uint64_t pc;                   /* Current pc */
>>       CPU_COMMON
>>   } CPUTLState;
>
>
> I skimmed through this and my only comment is that I was surprised to see
> "zero" as part of the state, since it's always 0.  :-)  No doubt there is
> some reason that this makes sense.

I think that is definitely an error...

>> +#define TILEGX_GEN_CHK_BEGIN(x) \
>> +    if ((x) == TILEGX_R_ZERO) {
>> +#define TILEGX_GEN_CHK_END(x) \
>> +        return 0; \
>> +    } \
>> +    if ((x) >= TILEGX_R_COUNT) { \
>> +        return -1; \
>> +    }
>
>
> This macro pattern seems potentially a little confusing and I do wonder if
> there is some way to avoid having to explicitly check the zero register
> every time; for example, maybe you make it a legitimate part of the state
> and declare that there are 64 registers, and then just always finish any
> register-update phase by re-zeroing that register?  It might yield a smaller
> code footprint and probably be just as fast, as long as it was easy to know
> where registers were updated.

See target-arm/translate-a64.c for one way to handle an
always-reads-as-zero register.

-- PMM
Chen Gang Feb. 24, 2015, 3:39 p.m. UTC | #4
On 2/24/15 22:38, Peter Maydell wrote:
> On 24 February 2015 at 23:21, Chris Metcalf <cmetcalf@ezchip.com> wrote:
>> On 2/24/2015 2:53 AM, Chen Gang S wrote:
>>>
>>>   typedef struct CPUTLState {
>>> +    uint64_t regs[TILEGX_R_COUNT]; /* Common used registers by outside */
>>> +    uint64_t zero;                 /* Zero register */
>>> +    uint64_t pc;                   /* Current pc */
>>>       CPU_COMMON
>>>   } CPUTLState;
>>
>>
>> I skimmed through this and my only comment is that I was surprised to see
>> "zero" as part of the state, since it's always 0.  :-)  No doubt there is
>> some reason that this makes sense.
> 
> I think that is definitely an error...
> 
>>> +#define TILEGX_GEN_CHK_BEGIN(x) \
>>> +    if ((x) == TILEGX_R_ZERO) {
>>> +#define TILEGX_GEN_CHK_END(x) \
>>> +        return 0; \
>>> +    } \
>>> +    if ((x) >= TILEGX_R_COUNT) { \
>>> +        return -1; \
>>> +    }
>>
>>
>> This macro pattern seems potentially a little confusing and I do wonder if
>> there is some way to avoid having to explicitly check the zero register
>> every time; for example, maybe you make it a legitimate part of the state
>> and declare that there are 64 registers, and then just always finish any
>> register-update phase by re-zeroing that register?  It might yield a smaller
>> code footprint and probably be just as fast, as long as it was easy to know
>> where registers were updated.
> 
> See target-arm/translate-a64.c for one way to handle an
> always-reads-as-zero register.
> 

After read through target-arm/translate-a64.c, I guess, the main reason
is: the zero register (r31) shares with the sp register (also r31).

 - So it uses cpu_reg() and cpu_reg_sp() for them.

 - For each zero register access, it will new a tcg temporary variable
   for it, and release it after finish decoding one insn (so it will not
   overwrite sp register.).

For tilegx, zero register (r63) does not share with other registers (sp
is r54), so we needn't use wrap functions for it.


Thanks.
Chen Gang Feb. 24, 2015, 4:31 p.m. UTC | #5
On 2/24/15 22:21, Chris Metcalf wrote:
> On 2/24/2015 2:53 AM, Chen Gang S wrote:
>>   typedef struct CPUTLState {
>> +    uint64_t regs[TILEGX_R_COUNT]; /* Common used registers by outside */
>> +    uint64_t zero;                 /* Zero register */
>> +    uint64_t pc;                   /* Current pc */
>>       CPU_COMMON
>>   } CPUTLState;
> 
> I skimmed through this and my only comment is that I was surprised to see "zero" as part of the state, since it's always 0.  :-)  No doubt there is some reason that this makes sense.
> 

Originally, I did not add zero register, but after think of, for gen_st
operation, tcg_gen_st*() only support from tcg_target_long to register,
so I add zero register for "offsetof(CPUTLState, zero)".

Welcome any better ways for it.
 
>> +#define TILEGX_GEN_CHK_BEGIN(x) \
>> +    if ((x) == TILEGX_R_ZERO) {
>> +#define TILEGX_GEN_CHK_END(x) \
>> +        return 0; \
>> +    } \
>> +    if ((x) >= TILEGX_R_COUNT) { \
>> +        return -1; \
>> +    }
> 
> This macro pattern seems potentially a little confusing and I do wonder if there is some way to avoid having to explicitly check the zero register every time; for example, maybe you make it a legitimate part of the state and declare that there are 64 registers, and then just always finish any register-update phase by re-zeroing that register?  It might yield a smaller code footprint and probably be just as fast, as long as it was easy to know where registers were updated.
> 

Originally, I really used 64 registers, but after tried, I found that I
still had to process TILEGX_R_ZERO specially, e.g.

 - When src is zero, we can use mov operation instead of add operation.

 - When dst is zero, in most cases, we can just skip the insn, but in
   some cases, it may cause exception in user mode (e.g st zero r0).

Originally, I also tried a wrap function for zero register, but I found
for different operands, when they meet zero register, they would need
different operations:

 - For add/addi operation, it will change to mov/movi operation.

 - For mov operation, it will change to movi operation.

 - For shl3add, if srcb is zero register, it will change to shli
   operation.

 - For ld/st operation, it still be ld/st operation, but they need
   "offsetof(CPUTLState, zero)", and in some cases, it should be cause
   exception.

So after think of, for me, I still prefer to use 56 registers with
individual zero register, and use macros for it.

> Also, note that you should model accesses to registers 56..62 as causing an interrupt to be raised, rather than returning -1.  But you might choose to just put this on your TODO list and continue making forward progress for the time being.  But a FIXME comment here would be appropriate.
>

Yeah, thanks. I shall add it when I send patch v2 for it.

Also I forgot to use "offsetof(CPUTLState, zero)" for ld zero register
case, and still "return 0" for "st zero r1" or "ld r1 zero". I shall
fix it in the patch v2.

 
>> +    case 0x3800000000000000ULL:
> 
> There are a lot of constants defined in the tile <arch/opcode.h> header, and presumably you could synthesize these constant values from them.  Perhaps it would make sense to import that header into qemu and then use symbolic values for all of this kind of thing?
>

OK, thanks.  originally I wanted to reuse them, but after think of, I
prefer the 64-bit immediate number instead of.

 - The decoding function may be very long, but it is still a simple
   function, I guess, it is still simple enough for readers.

 - 64-bit immediate number has better performance under 64-bit machine
   (e.g x86-64).

But I guess, there are still quite a few of inline functions (e.g. get
src/dst) in arch/opcode_tilegx.h may be reused by me in the future. :-)


> I can't really comment on most of the rest of the code, and I only skimmed it quickly, but generally: good work getting as far as you have!
> 

Thank you for your work and your encouragement.


Thanks.
Richard Henderson Feb. 24, 2015, 4:42 p.m. UTC | #6
On 02/24/2015 05:39 AM, Chen Gang S wrote:
> After read through target-arm/translate-a64.c, I guess, the main reason
> is: the zero register (r31) shares with the sp register (also r31).
> 
>  - So it uses cpu_reg() and cpu_reg_sp() for them.
> 
>  - For each zero register access, it will new a tcg temporary variable
>    for it, and release it after finish decoding one insn (so it will not
>    overwrite sp register.).
> 
> For tilegx, zero register (r63) does not share with other registers (sp
> is r54), so we needn't use wrap functions for it.

Perhaps aarch64 is confusing for you.  But Alpha also has zero registers, and
also uses wrapper functions.  See load_gpr and dest_gpr.

The very most important reason to use a wrapper, and thus a tcg temporary that
keeps getting re-initialized to zero, is that the tcg optimizer gets to see
that zero and optimize the code accordingly.


r~
Chris Metcalf Feb. 24, 2015, 4:46 p.m. UTC | #7
On 2/24/2015 11:31 AM, Chen Gang S wrote:
>   - When dst is zero, in most cases, we can just skip the insn, but in
>     some cases, it may cause exception in user mode (e.g st zero r0).

Be careful of skipping instructions in general, since for example a "ld zero, r1" may be targeting an MMIO address that we are reading to trigger some device operation, but don't need the result from.

> - For add/addi operation, it will change to mov/movi operation.
>
>   - For mov operation, it will change to movi operation.
>
>   - For shl3add, if srcb is zero register, it will change to shli
>     operation.

I assume you are referring to missed performance optimization opportunities if you don't treat "zero" specially?  You certainly could treat "add r1, r2, zero" as just an "add" instruction anyway, just with a zero addend.  You don't have to convert it to a move instruction.  Likewise with the others.

> OK, thanks.  originally I wanted to reuse them, but after think of, I
> prefer the 64-bit immediate number instead of.
>
>   - The decoding function may be very long, but it is still a simple
>     function, I guess, it is still simple enough for readers.
>
>   - 64-bit immediate number has better performance under 64-bit machine
>     (e.g x86-64).

What I mean is you should just directly use all those accessor functions.  Then your code would look like

switch (get_Opcode_X1(bundle)) {   // this is a 59-bit shift and mask by 0x7
case SHL16INSLI_OPCODE_X1:   // this is the constant 7
   [...]
}

Typically dealing with smaller integers is higher-performance on any platform, I suspect, even on x86 when you can have large inline constants in the code.  The compiler might be smart enough to do this for you, to be fair.  In any case any performance difference of this switch is almost certainly lost in the noise.

More to the point, named constants are almost always better for code maintainability than raw integers in code.

Also, my point is that you should just include a verbatim copy of the kernel header into qemu, and then use those names.  This is helpful to people who are already familiar with the <arch/opcode.h> naming, and reduces the amount of code needed to review qemu in any case.
Chen Gang Feb. 24, 2015, 5:08 p.m. UTC | #8
On 2/25/15 00:42, Richard Henderson wrote:
> On 02/24/2015 05:39 AM, Chen Gang S wrote:
>> After read through target-arm/translate-a64.c, I guess, the main reason
>> is: the zero register (r31) shares with the sp register (also r31).
>>
>>  - So it uses cpu_reg() and cpu_reg_sp() for them.
>>
>>  - For each zero register access, it will new a tcg temporary variable
>>    for it, and release it after finish decoding one insn (so it will not
>>    overwrite sp register.).
>>
>> For tilegx, zero register (r63) does not share with other registers (sp
>> is r54), so we needn't use wrap functions for it.
> 
> Perhaps aarch64 is confusing for you.  But Alpha also has zero registers, and
> also uses wrapper functions.  See load_gpr and dest_gpr.
> 
> The very most important reason to use a wrapper, and thus a tcg temporary that
> keeps getting re-initialized to zero, is that the tcg optimizer gets to see
> that zero and optimize the code accordingly.
> 

OK, thanks, what you said above sounds reasonable to me, I shall use tcg
temporary variable for zero register, when I send patch v2.

Thanks.
Chen Gang Feb. 24, 2015, 5:25 p.m. UTC | #9
On 2/25/15 00:46, Chris Metcalf wrote:
> On 2/24/2015 11:31 AM, Chen Gang S wrote:
>>   - When dst is zero, in most cases, we can just skip the insn, but in
>>     some cases, it may cause exception in user mode (e.g st zero r0).
> 
> Be careful of skipping instructions in general, since for example a "ld zero, r1" may be targeting an MMIO address that we are reading to trigger some device operation, but don't need the result from.

OK, thanks, I shall notice about it, next.

> 
>> - For add/addi operation, it will change to mov/movi operation.
>>
>>   - For mov operation, it will change to movi operation.
>>
>>   - For shl3add, if srcb is zero register, it will change to shli
>>     operation.
> 
> I assume you are referring to missed performance optimization opportunities if you don't treat "zero" specially?  You certainly could treat "add r1, r2, zero" as just an "add" instruction anyway, just with a zero addend.  You don't have to convert it to a move instruction.  Likewise with the others.
> 

OK, thanks. Next, I shall not change the instruction, and will only use
tcg temporary variable instead of zero register, when I send patch v2.

>> OK, thanks.  originally I wanted to reuse them, but after think of, I
>> prefer the 64-bit immediate number instead of.
>>
>>   - The decoding function may be very long, but it is still a simple
>>     function, I guess, it is still simple enough for readers.
>>
>>   - 64-bit immediate number has better performance under 64-bit machine
>>     (e.g x86-64).
> 
> What I mean is you should just directly use all those accessor functions.  Then your code would look like
> 
> switch (get_Opcode_X1(bundle)) {   // this is a 59-bit shift and mask by 0x7
> case SHL16INSLI_OPCODE_X1:   // this is the constant 7
>   [...]
> }
> 
> Typically dealing with smaller integers is higher-performance on any platform, I suspect, even on x86 when you can have large inline constants in the code.  The compiler might be smart enough to do this for you, to be fair.  In any case any performance difference of this switch is almost certainly lost in the noise.
> 

Hmm... maybe what you said is correct, but I am not quite sure.

> More to the point, named constants are almost always better for code maintainability than raw integers in code.
> 

For me, if the raw integer is only used once, we needn't define a macro
for it (instead of, we can give a comment for it).

> Also, my point is that you should just include a verbatim copy of the kernel header into qemu, and then use those names.  This is helpful to people who are already familiar with the <arch/opcode.h> naming, and reduces the amount of code needed to review qemu in any case.
> 

OK, thanks. What you said above sounds reasonable to me, for compatible
reason, I shall use "arch/opcode_tilegx.h" totally -- it will be helpful
for the readers who are already familiar with "arch/opcode_tilegx.h".

Thanks.
Richard Henderson Feb. 24, 2015, 5:55 p.m. UTC | #10
On 02/23/2015 09:53 PM, Chen Gang S wrote:
> +        env->zero = 0;

I replied elsewhere about the zero register.

> +#define TILEGX_GEN_CHK_BEGIN(x) \
> +    if ((x) == TILEGX_R_ZERO) {
> +#define TILEGX_GEN_CHK_END(x) \
> +        return 0; \
> +    } \
> +    if ((x) >= TILEGX_R_COUNT) { \
> +        return -1; \
> +    }
> +
> +#define TILEGX_GEN_CHK_SIMPLE(x) \
> +    TILEGX_GEN_CHK_BEGIN(x) \
> +    TILEGX_GEN_CHK_END(x)
> +
> +#define TILEGX_GEN_SAVE_PREP(rdst) \
> +    dc->tmp_regcur->idx = rdst; \
> +    dc->tmp_regcur->val = tcg_temp_new_i64();
> +
> +#define TILEGX_GEN_SAVE_TMP_1(func, rdst, x1) \
> +    TILEGX_GEN_SAVE_PREP(rdst) \
> +    func(dc->tmp_regcur->val, x1);
> +
> +#define TILEGX_GEN_SAVE_TMP_2(func, rdst, x1, x2) \
> +    TILEGX_GEN_SAVE_PREP(rdst) \
> +    func(dc->tmp_regcur->val, x1, x2);

I don't like these macros at all.  I think that proper functions would be
easier to read.  In particular, if you use helper functions like the load_gpr
and dest_gpr functions from target-alpha, you'll wind up with much more
readable code.

> +static const char *reg_names[] = {

static const char * const reg_names[].

I.e. the array itself should also be const.

Even better is to use

static const char reg_names[][4]

I.e. make an array of 4 byte arrays.  This will actually save space on 64-bit
hosts.

> +     "r0",  "r1",  "r2",  "r3",  "r4",  "r5",  "r6",  "r7",
> +     "r8",  "r9", "r10", "r11", "r12", "r13", "r14", "r15",
> +    "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
> +    "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31",
> +    "r32", "r33", "r34", "r35", "r36", "r37", "r38", "r39",
> +    "r40", "r41", "r42", "r43", "r44", "r45", "r46", "r47",
> +    "r48", "r49", "r50", "r51",  "bp",  "tp",  "sp",  "lr"
> +};


> +/* This is the state at translation time.  */
> +struct DisasContext {
> +    uint64_t pc;                   /* Current pc */
> +    unsigned int cpustate_changed; /* Whether cpu state changed */
> +
> +    struct {
> +        unsigned char idx;         /* index */
> +        TCGv val;                  /* value */
> +    } *tmp_regcur,                 /* Current temporary registers */
> +    tmp_regs[TILEGX_BUNDLE_INSNS]; /* All temporary registers */

I think the pointer is overkill, but whatever.  If you keep it, pull this
structure out and give it a proper name.

Please don't use "unsigned char".  "uint8_t" is better, though perhaps just
plain "int" is best.  Surely you're not trying to save space here...

> +static int gen_move(struct DisasContext *dc,
> +                    unsigned char rdst, unsigned char rsrc)

Use a typedef and not the struct tag.

> +{
> +    qemu_log("move r%d, r%d", rdst, rsrc);
> +    TILEGX_GEN_CHK_SIMPLE(rdst);
> +    TILEGX_GEN_CHK_BEGIN(rsrc);
> +        TILEGX_GEN_SAVE_TMP_1(tcg_gen_movi_i64, rdst, 0);
> +    TILEGX_GEN_CHK_END(rsrc);
> +
> +    TILEGX_GEN_SAVE_TMP_1(tcg_gen_mov_i64, rdst, cpu_regs[rsrc]);
> +    return 0;
> +}

Use the proper log flags.  In this case, CPU_LOG_TB_IN_ASM.

With helpers this becomes

static void gen_move(DisasContext *dc, int rdst, int rsrc)
{
    TCGv tdst = dest_gpr(dc, rdst);
    TCGv tsrc = load_gpr(dc, rsrc);
    tcg_gen_mov_tl(tdst, tsrc);
}

which is clearly much much easier to read than your macros.

> +static int gen_moveli(struct DisasContext *dc, unsigned char rdst, short im16)

Don't use "short", use "int16_t".

> +{
> +    qemu_log("moveli r%d, %d", rdst, im16);
> +    return gen_move_imm(dc, rdst, (int64_t)im16);

The cast is useless, implied by C.

> +static int gen_movei(struct DisasContext *dc, unsigned char rdst, char im8)

Don't use "char", use "int8_t".

> +static int gen_ld(struct DisasContext *dc,
> +                  unsigned char rdst, unsigned char rsrc)
> +{
> +    qemu_log("ld r%d, r%d", rdst, rsrc);
> +    TILEGX_GEN_CHK_SIMPLE(rdst);
> +    TILEGX_GEN_CHK_BEGIN(rsrc);
> +        TILEGX_GEN_SAVE_TMP_2(tcg_gen_ld_i64, rdst, cpu_env, 0);
> +    TILEGX_GEN_CHK_END(rsrc);
> +
> +    TILEGX_GEN_SAVE_TMP_2(tcg_gen_ld_i64, rdst, cpu_env,
> +                          offsetof(CPUTLState, regs[rsrc]));

You're using the wrong kind of load.  Indeed, this is completely confused and
amounts to a broken sort of register-to-register move.

The kind of load you are generating is a host-side load.  What you actually
want is a guest-side load.

    tcg_gen_qemu_ld_i64(tdst, tsrc, dc->mem_idx, MO_TEQ)

You probably want to add a parameter of type TCGMemOp so that you can handle
all of the various loads with the same function.

> +static int gen_st(struct DisasContext *dc,
> +                  unsigned char rdst, unsigned char rsrc)
> +{
> +    qemu_log("st r%d, r%d", rdst, rsrc);
> +    TILEGX_GEN_CHK_SIMPLE(rdst);
> +    TILEGX_GEN_CHK_BEGIN(rsrc);
> +        tcg_gen_st_i64(cpu_regs[rdst], cpu_env, offsetof(CPUTLState, zero));
> +    TILEGX_GEN_CHK_END(rsrc);
> +
> +    tcg_gen_st_i64(cpu_regs[rdst], cpu_env, offsetof(CPUTLState, regs[rsrc]));
> +    return 0;
> +}

Similarly, this should be using tcg_gen_qemu_st_i64.

> +static int gen_shl16insli(struct DisasContext *dc,
> +                          unsigned char rdst, unsigned char rsrc, short im16)

Use uint16_t, as the field is not signed.

> +{
> +    int64_t imm;
> +    TCGv_i64 tmp;
> +
> +    qemu_log("shl16insli r%d, r%d, %d", rdst, rsrc, im16);

which means you print the right value here,

> +    TILEGX_GEN_CHK_SIMPLE(rdst);
> +
> +    imm = (int64_t)im16 & 0x000000000000ffffLL;

and you don't need the mask here.

> +
> +    TILEGX_GEN_CHK_BEGIN(rsrc);
> +        TILEGX_GEN_SAVE_TMP_1(tcg_gen_movi_i64, rdst, imm);
> +    TILEGX_GEN_CHK_END(rsrc);
> +
> +    tmp = tcg_temp_new_i64();
> +    tcg_gen_shli_i64(tmp, cpu_regs[rsrc], 16);
> +    TILEGX_GEN_SAVE_TMP_2(tcg_gen_ori_i64, rdst, tmp, imm);
> +    tcg_temp_free_i64(tmp);

You don't need an extra temporary.  You know that dst will be a new temporary.

    tcg_gen_shli_i64(tdst, tsrc, 16);
    tcg_gen_ori_i64(tdst, tdst, imm);

> +        if (rsrc == 0x3f) {

You might as well use TILEGX_R_ZERO, now that you've defined it.

> +            /* moveli Dest, Imm16 */
> +            if (gen_moveli(dc, rdst, im16)) {
> +                /* FIXME: raise an exception for invalid instruction */
> +                return -1;

Please do not scatter these checks everywhere.  I'm not sure what you're trying
to accomplish with them.  Certainly gen_movli can't fail.  In particular, all
invalid instruction detection should happen here and the "gen" routines should
only be given valid inputs.

> +    case 0x0000000040000000ULL:
> +        switch (TILEGX_CODE_X0_20(bundle)) {
> +        /* andi Dest, SrcA, Imm8 */
> +        case 0x0000000000300000ULL:
> +            rdst = (unsigned char)(bundle & TILEGX_DATA_REGISTER);
> +            rsrc = (unsigned char)((bundle >> 6) & TILEGX_DATA_REGISTER);
> +            im8 = (char)((bundle >> 12) & TILEGX_DATA_IMM8);

Pull these field extracts to the top of the switch -- there are only a limited
amount of them that are possible for the X0 pipeline.

Use the extract64 and sextract64 functions instead of bare shifts and masks.

> +            /* FIXME: raise an exception for invalid instruction */
> +            qemu_log("20 bundle value: %16.16llx", TILEGX_CODE_X0_20(bundle));

You might as well get this right from the very beginning.  It is very easy to
define a helper to throw an exception.  (Harder to consume the exception,
perhaps, but "consume" can simply abort for now.)

> +        case 0x0000000001040000ULL:
> +            rdst = (unsigned char)(bundle & TILEGX_DATA_REGISTER);
> +            rsrc = (unsigned char)((bundle >> 6) & TILEGX_DATA_REGISTER);
> +            rsrcb = (unsigned char)((bundle >> 12) & TILEGX_DATA_REGISTER);
> +            /* move Dest, Src */
> +            if (rsrcb == 0x3f) {
> +                if (gen_move(dc, rdst, rsrc)) {
> +                    /* FIXME: raise an exception for invalid instruction */
> +                    return -1;
> +                }
> +            } else {
> +                qemu_log("invalid rsrcb, not 0x3f for move in x0");
> +            }

Um, what instruction is this supposed to be?  (Sadly, the tilegx architecture
pdf that I have doesn't have instructions indexed by opcode, so it's hard for
me to actually look this up.)

> +    case 0x1800000000000000ULL:
> +        switch (TILEGX_CODE_X1_51(bundle)) {
> +        /* addi Dest, SrcA, Imm8 */
> +        case 0x0008000000000000ULL:
> +            rdst = (unsigned char)((bundle >> 31) & TILEGX_DATA_REGISTER);
> +            rsrc = (unsigned char)((bundle >> 37) & TILEGX_DATA_REGISTER);
> +            im8 = (char)((bundle >> 43) & TILEGX_DATA_IMM8);
> +            if (rsrc == 0x3f) {
> +                if (gen_movei(dc, rdst, im8)) {
> +                    /* FIXME: raise an exception for invalid instruction */
> +                    return -1;
> +                }
> +            } else {
> +                if (gen_addi(dc, rdst, rsrc, im8)) {
> +                    /* FIXME: raise an exception for invalid instruction */
> +                    return -1;
> +                }
> +            }

Since this is the 3rd copy of this, clearly the movei optimization should be
moved into gen_addi, and gen_movei should be removed.

> +static int translate_one_bundle(struct DisasContext *dc, uint64_t bundle)
> +{
> +    int i, ret = 0;
> +    TCGv tmp;
> +
> +    for (i = 0; i < TILEGX_BUNDLE_INSNS; i++) {
> +        dc->tmp_regs[i].idx = TILEGX_R_NOREG;
> +        TCGV_UNUSED_I64(dc->tmp_regs[i].val);
> +    }

Emit

    if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP | CPU_LOG_TB_OP_OPT))) {
        tcg_gen_debug_insn_start(dc->pc);
    }

here.

> +
> +    /* For can regress completely, it must be y0 -> y1 -> y2, or x0 -> x1. */

I don't know what this sentence means.

> +    if (TILEGX_BUNDLE_TYPE_Y(bundle)) {
> +        dc->tmp_regcur = dc->tmp_regs + 0;
> +        ret = translate_y0_bundle(dc, TILEGX_PIPE_Y0(bundle));
> +        if (ret) {
> +            goto err;
> +        }

You don't need these gotos.  If a given pipeline raises an invalid instruction
exception, then it's true that all code emitted afterward is dead.  But that's
ok, since it simply won't be executed.  Invalid instructions are exceedingly
unlikely and it's silly to optimize for that case.

> +    /* FIXME: do not consider about search_pc firstly. */

You have to do this right away.  This path will be used the very first time you
execute a load or store instruction.

> +err:
> +    exit(-1);
>  }

Huh?


r~
Chris Metcalf Feb. 24, 2015, 6:18 p.m. UTC | #11
On 2/24/2015 12:25 PM, Chen Gang S wrote:
> For me, if the raw integer is only used once, we needn't define a macro
> for it (instead of, we can give a comment for it).

The advantage of names even in this case is that you can group all the
macro definitions in one place where they are easy to read and review.
Then later when you use them they are self-documenting.  And if you
are going to use opcode_tilegx.h anyway, you get the names "for free".
Chen Gang Feb. 25, 2015, 1:01 a.m. UTC | #12
On 2/25/15 02:18, Chris Metcalf wrote:
> On 2/24/2015 12:25 PM, Chen Gang S wrote:
>> For me, if the raw integer is only used once, we needn't define a macro
>> for it (instead of, we can give a comment for it).
> 
> The advantage of names even in this case is that you can group all the
> macro definitions in one place where they are easy to read and review.
> Then later when you use them they are self-documenting.

Yeah, what you said sounds reasonable to me.

At present (and originally), I was not quit sure each number's meaning,
so I left them as raw number, now. After I have enough more numbers, I
shall consider of their meanings, together, then use macros in one area.


>                                                         And if you
> are going to use opcode_tilegx.h anyway, you get the names "for free".
> 

OK, thanks. I shall use opcode_tilegx.h, and we needn't consider about
these raw numbers.

That is also one of reason why I am not consider more for these numbers:
since the code is before reviewing, if not quite necessary, I will not
devote more time resources on coding styles. ;-)


Thanks.
Chen Gang Feb. 25, 2015, 3:40 a.m. UTC | #13
Firstly, thank you very much for your details patient work!

And sorry, I guess, I can not finish patch v2 within this month. I shall
try to finish patch v2 within 2015-03-15.

The details reply is below, please check, thanks.

On 2/25/15 01:55, Richard Henderson wrote:
> On 02/23/2015 09:53 PM, Chen Gang S wrote:
>> +        env->zero = 0;
> 
> I replied elsewhere about the zero register.
>
 
OK, thanks.

>> +#define TILEGX_GEN_CHK_BEGIN(x) \
>> +    if ((x) == TILEGX_R_ZERO) {
>> +#define TILEGX_GEN_CHK_END(x) \
>> +        return 0; \
>> +    } \
>> +    if ((x) >= TILEGX_R_COUNT) { \
>> +        return -1; \
>> +    }
>> +
>> +#define TILEGX_GEN_CHK_SIMPLE(x) \
>> +    TILEGX_GEN_CHK_BEGIN(x) \
>> +    TILEGX_GEN_CHK_END(x)
>> +
>> +#define TILEGX_GEN_SAVE_PREP(rdst) \
>> +    dc->tmp_regcur->idx = rdst; \
>> +    dc->tmp_regcur->val = tcg_temp_new_i64();
>> +
>> +#define TILEGX_GEN_SAVE_TMP_1(func, rdst, x1) \
>> +    TILEGX_GEN_SAVE_PREP(rdst) \
>> +    func(dc->tmp_regcur->val, x1);
>> +
>> +#define TILEGX_GEN_SAVE_TMP_2(func, rdst, x1, x2) \
>> +    TILEGX_GEN_SAVE_PREP(rdst) \
>> +    func(dc->tmp_regcur->val, x1, x2);
> 
> I don't like these macros at all.  I think that proper functions would be
> easier to read.  In particular, if you use helper functions like the load_gpr
> and dest_gpr functions from target-alpha, you'll wind up with much more
> readable code.
> 

Yeah, load_gpr and dest_gpr in target-alpha are really cool!!

>> +static const char *reg_names[] = {
> 
> static const char * const reg_names[].
> 
> I.e. the array itself should also be const.
> 

OK, thanks.

> Even better is to use
> 
> static const char reg_names[][4]
> 
> I.e. make an array of 4 byte arrays.  This will actually save space on 64-bit
> hosts.
> 

OK, thanks, what you said above sound reasonable to me.

>> +     "r0",  "r1",  "r2",  "r3",  "r4",  "r5",  "r6",  "r7",
>> +     "r8",  "r9", "r10", "r11", "r12", "r13", "r14", "r15",
>> +    "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
>> +    "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31",
>> +    "r32", "r33", "r34", "r35", "r36", "r37", "r38", "r39",
>> +    "r40", "r41", "r42", "r43", "r44", "r45", "r46", "r47",
>> +    "r48", "r49", "r50", "r51",  "bp",  "tp",  "sp",  "lr"
>> +};
> 
> 
>> +/* This is the state at translation time.  */
>> +struct DisasContext {
>> +    uint64_t pc;                   /* Current pc */
>> +    unsigned int cpustate_changed; /* Whether cpu state changed */
>> +
>> +    struct {
>> +        unsigned char idx;         /* index */
>> +        TCGv val;                  /* value */
>> +    } *tmp_regcur,                 /* Current temporary registers */
>> +    tmp_regs[TILEGX_BUNDLE_INSNS]; /* All temporary registers */
> 
> I think the pointer is overkill, but whatever.  If you keep it, pull this
> structure out and give it a proper name.
> 

For me, it is a simple way to let the structure only as a name space,
and tmp_regcur is always related with tmp_regs[], so we can let them
together, the code is still simple and clear enough.

> Please don't use "unsigned char".  "uint8_t" is better, though perhaps just
> plain "int" is best.  Surely you're not trying to save space here...
> 

OK, thanks. For my taste, "uint8_t" is still better, since the idx
should always be within 0x3f.

>> +static int gen_move(struct DisasContext *dc,
>> +                    unsigned char rdst, unsigned char rsrc)
> 
> Use a typedef and not the struct tag.
> 

OK, thanks. In my taste, I dislike "typedef struct foo foo;", but I
shall follow qemu styles for DisasContext.

>> +{
>> +    qemu_log("move r%d, r%d", rdst, rsrc);
>> +    TILEGX_GEN_CHK_SIMPLE(rdst);
>> +    TILEGX_GEN_CHK_BEGIN(rsrc);
>> +        TILEGX_GEN_SAVE_TMP_1(tcg_gen_movi_i64, rdst, 0);
>> +    TILEGX_GEN_CHK_END(rsrc);
>> +
>> +    TILEGX_GEN_SAVE_TMP_1(tcg_gen_mov_i64, rdst, cpu_regs[rsrc]);
>> +    return 0;
>> +}
> 
> Use the proper log flags.  In this case, CPU_LOG_TB_IN_ASM.
> 

OK, thanks. I shall use it next

> With helpers this becomes
> 
> static void gen_move(DisasContext *dc, int rdst, int rsrc)
> {
>     TCGv tdst = dest_gpr(dc, rdst);
>     TCGv tsrc = load_gpr(dc, rsrc);
>     tcg_gen_mov_tl(tdst, tsrc);
> }
> 
> which is clearly much much easier to read than your macros.
> 

Yeah.

>> +static int gen_moveli(struct DisasContext *dc, unsigned char rdst, short im16)
> 
> Don't use "short", use "int16_t".
> 

OK.

>> +{
>> +    qemu_log("moveli r%d, %d", rdst, im16);
>> +    return gen_move_imm(dc, rdst, (int64_t)im16);
> 
> The cast is useless, implied by C.
> 

OK.

>> +static int gen_movei(struct DisasContext *dc, unsigned char rdst, char im8)
> 
> Don't use "char", use "int8_t".
> 

OK.

>> +static int gen_ld(struct DisasContext *dc,
>> +                  unsigned char rdst, unsigned char rsrc)
>> +{
>> +    qemu_log("ld r%d, r%d", rdst, rsrc);
>> +    TILEGX_GEN_CHK_SIMPLE(rdst);
>> +    TILEGX_GEN_CHK_BEGIN(rsrc);
>> +        TILEGX_GEN_SAVE_TMP_2(tcg_gen_ld_i64, rdst, cpu_env, 0);
>> +    TILEGX_GEN_CHK_END(rsrc);
>> +
>> +    TILEGX_GEN_SAVE_TMP_2(tcg_gen_ld_i64, rdst, cpu_env,
>> +                          offsetof(CPUTLState, regs[rsrc]));
> 
> You're using the wrong kind of load.  Indeed, this is completely confused and
> amounts to a broken sort of register-to-register move.
> 
> The kind of load you are generating is a host-side load.  What you actually
> want is a guest-side load.
> 
>     tcg_gen_qemu_ld_i64(tdst, tsrc, dc->mem_idx, MO_TEQ)
> 
> You probably want to add a parameter of type TCGMemOp so that you can handle
> all of the various loads with the same function.
> 

OK, thanks. I shall read the details next, and implement it.

>> +static int gen_st(struct DisasContext *dc,
>> +                  unsigned char rdst, unsigned char rsrc)
>> +{
>> +    qemu_log("st r%d, r%d", rdst, rsrc);
>> +    TILEGX_GEN_CHK_SIMPLE(rdst);
>> +    TILEGX_GEN_CHK_BEGIN(rsrc);
>> +        tcg_gen_st_i64(cpu_regs[rdst], cpu_env, offsetof(CPUTLState, zero));
>> +    TILEGX_GEN_CHK_END(rsrc);
>> +
>> +    tcg_gen_st_i64(cpu_regs[rdst], cpu_env, offsetof(CPUTLState, regs[rsrc]));
>> +    return 0;
>> +}
> 
> Similarly, this should be using tcg_gen_qemu_st_i64.
> 

Yeah.

>> +static int gen_shl16insli(struct DisasContext *dc,
>> +                          unsigned char rdst, unsigned char rsrc, short im16)
> 
> Use uint16_t, as the field is not signed.
> 

objdump print it as signed, e.g. "shl16insli r32, r32, -30680", and
tcg_gen_ori_i64 also use int64_t for it.

>> +{
>> +    int64_t imm;
>> +    TCGv_i64 tmp;
>> +
>> +    qemu_log("shl16insli r%d, r%d, %d", rdst, rsrc, im16);
> 
> which means you print the right value here,
> 

objdump print it as signed.


>> +    TILEGX_GEN_CHK_SIMPLE(rdst);
>> +
>> +    imm = (int64_t)im16 & 0x000000000000ffffLL;
> 
> and you don't need the mask here.
> 

If im16 is signed, we have to use the mask here.

>> +
>> +    TILEGX_GEN_CHK_BEGIN(rsrc);
>> +        TILEGX_GEN_SAVE_TMP_1(tcg_gen_movi_i64, rdst, imm);
>> +    TILEGX_GEN_CHK_END(rsrc);
>> +
>> +    tmp = tcg_temp_new_i64();
>> +    tcg_gen_shli_i64(tmp, cpu_regs[rsrc], 16);
>> +    TILEGX_GEN_SAVE_TMP_2(tcg_gen_ori_i64, rdst, tmp, imm);
>> +    tcg_temp_free_i64(tmp);
> 
> You don't need an extra temporary.  You know that dst will be a new temporary.
> 
>     tcg_gen_shli_i64(tdst, tsrc, 16);
>     tcg_gen_ori_i64(tdst, tdst, imm);
> 

Yeah, thanks.

>> +        if (rsrc == 0x3f) {
> 
> You might as well use TILEGX_R_ZERO, now that you've defined it.
> 

Yeah, thanks.

>> +            /* moveli Dest, Imm16 */
>> +            if (gen_moveli(dc, rdst, im16)) {
>> +                /* FIXME: raise an exception for invalid instruction */
>> +                return -1;
> 
> Please do not scatter these checks everywhere.  I'm not sure what you're trying
> to accomplish with them.  Certainly gen_movli can't fail.  In particular, all
> invalid instruction detection should happen here and the "gen" routines should
> only be given valid inputs.
> 

OK, thanks. What you said above sounds reasonable to me.

>> +    case 0x0000000040000000ULL:
>> +        switch (TILEGX_CODE_X0_20(bundle)) {
>> +        /* andi Dest, SrcA, Imm8 */
>> +        case 0x0000000000300000ULL:
>> +            rdst = (unsigned char)(bundle & TILEGX_DATA_REGISTER);
>> +            rsrc = (unsigned char)((bundle >> 6) & TILEGX_DATA_REGISTER);
>> +            im8 = (char)((bundle >> 12) & TILEGX_DATA_IMM8);
> 
> Pull these field extracts to the top of the switch -- there are only a limited
> amount of them that are possible for the X0 pipeline.
> 
> Use the extract64 and sextract64 functions instead of bare shifts and masks.
> 

OK, thanks What you said above sounds reasonable to me. But if we use
"arch/opcode_tilegx.h" next, we can use their own related inline
functions instead of.

>> +            /* FIXME: raise an exception for invalid instruction */
>> +            qemu_log("20 bundle value: %16.16llx", TILEGX_CODE_X0_20(bundle));
> 
> You might as well get this right from the very beginning.  It is very easy to
> define a helper to throw an exception.  (Harder to consume the exception,
> perhaps, but "consume" can simply abort for now.)
> 

OK, thanks. I shall try for it.

>> +        case 0x0000000001040000ULL:
>> +            rdst = (unsigned char)(bundle & TILEGX_DATA_REGISTER);
>> +            rsrc = (unsigned char)((bundle >> 6) & TILEGX_DATA_REGISTER);
>> +            rsrcb = (unsigned char)((bundle >> 12) & TILEGX_DATA_REGISTER);
>> +            /* move Dest, Src */
>> +            if (rsrcb == 0x3f) {
>> +                if (gen_move(dc, rdst, rsrc)) {
>> +                    /* FIXME: raise an exception for invalid instruction */
>> +                    return -1;
>> +                }
>> +            } else {
>> +                qemu_log("invalid rsrcb, not 0x3f for move in x0");
>> +            }
> 
> Um, what instruction is this supposed to be?  (Sadly, the tilegx architecture
> pdf that I have doesn't have instructions indexed by opcode, so it's hard for
> me to actually look this up.)
> 

For me, the most efficient way is: when we meet it, then fix it.

>> +    case 0x1800000000000000ULL:
>> +        switch (TILEGX_CODE_X1_51(bundle)) {
>> +        /* addi Dest, SrcA, Imm8 */
>> +        case 0x0008000000000000ULL:
>> +            rdst = (unsigned char)((bundle >> 31) & TILEGX_DATA_REGISTER);
>> +            rsrc = (unsigned char)((bundle >> 37) & TILEGX_DATA_REGISTER);
>> +            im8 = (char)((bundle >> 43) & TILEGX_DATA_IMM8);
>> +            if (rsrc == 0x3f) {
>> +                if (gen_movei(dc, rdst, im8)) {
>> +                    /* FIXME: raise an exception for invalid instruction */
>> +                    return -1;
>> +                }
>> +            } else {
>> +                if (gen_addi(dc, rdst, rsrc, im8)) {
>> +                    /* FIXME: raise an exception for invalid instruction */
>> +                    return -1;
>> +                }
>> +            }
> 
> Since this is the 3rd copy of this, clearly the movei optimization should be
> moved into gen_addi, and gen_movei should be removed.
> 

Oh, I forgot to provide related comment for them, in ISA.pdf, for x1,
movei is the pseudo instruction for addi when src is zero register.

>> +static int translate_one_bundle(struct DisasContext *dc, uint64_t bundle)
>> +{
>> +    int i, ret = 0;
>> +    TCGv tmp;
>> +
>> +    for (i = 0; i < TILEGX_BUNDLE_INSNS; i++) {
>> +        dc->tmp_regs[i].idx = TILEGX_R_NOREG;
>> +        TCGV_UNUSED_I64(dc->tmp_regs[i].val);
>> +    }
> 
> Emit
> 
>     if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP | CPU_LOG_TB_OP_OPT))) {
>         tcg_gen_debug_insn_start(dc->pc);
>     }
> 
> here.
> 

OK, thanks.

>> +
>> +    /* For can regress completely, it must be y0 -> y1 -> y2, or x0 -> x1. */
> 
> I don't know what this sentence means.
> 

I guess, it needs more contents:

  y2 and x1 may contents st instruction (x1 may also contents 'jal'),
  which should be executed at last, or when exception raised, we can
  not be sure each bundle must be atomic (none pipes should execute).

>> +    if (TILEGX_BUNDLE_TYPE_Y(bundle)) {
>> +        dc->tmp_regcur = dc->tmp_regs + 0;
>> +        ret = translate_y0_bundle(dc, TILEGX_PIPE_Y0(bundle));
>> +        if (ret) {
>> +            goto err;
>> +        }
> 
> You don't need these gotos.  If a given pipeline raises an invalid instruction
> exception, then it's true that all code emitted afterward is dead.  But that's
> ok, since it simply won't be executed.  Invalid instructions are exceedingly
> unlikely and it's silly to optimize for that case.
> 

OK, thanks. I guess your meaning is: we need not check the return value
for each pipe, just raise exception is enough. e.g.

 - Add a flag in DisasContext.

 - When exception happens, set the flag.

 - In main looping gen_intermediate_code_internal(), if the flag is set,
   we need break the tb block, and return.

>> +    /* FIXME: do not consider about search_pc firstly. */
> 
> You have to do this right away.  This path will be used the very first time you
> execute a load or store instruction.
> 

OK, thanks. What you said above sounds reasonable to me.


Thanks.
Richard Henderson Feb. 25, 2015, 5:19 p.m. UTC | #14
On 02/24/2015 05:40 PM, Chen Gang S wrote:
>>> +static int gen_shl16insli(struct DisasContext *dc,
>>> +                          unsigned char rdst, unsigned char rsrc, short im16)
>>
>> Use uint16_t, as the field is not signed.
>>
> 
> objdump print it as signed, e.g. "shl16insli r32, r32, -30680"

I think you've just found a bug in objdump.  ;-)

>> Use the extract64 and sextract64 functions instead of bare shifts and masks.
>>
> 
> OK, thanks What you said above sounds reasonable to me. But if we use
> "arch/opcode_tilegx.h" next, we can use their own related inline
> functions instead of.

Yes, using the opcode_tilegx.h functions is a good idea.  I didn't know about
those before.  I've looked through that file now and using them will make
reviewing this code much easier.

>>> +    /* For can regress completely, it must be y0 -> y1 -> y2, or x0 -> x1. */
>>
>> I don't know what this sentence means.
>>
> 
> I guess, it needs more contents:
> 
>   y2 and x1 may contents st instruction (x1 may also contents 'jal'),
>   which should be executed at last, or when exception raised, we can
>   not be sure each bundle must be atomic (none pipes should execute).

How about something like:

	Only pipelines X1 and Y2 can issue branches, memory ops, or
	issue trapping instructions like system calls.  As these
	operations cannot be buffered, it is convenient to translate
	those pipelines last.

After all, we're already buffering the output of all of the pipes.  If the
memory op succeeds, then all of the rest of the arithmetic will succeed, so it
doesn't really matter that much what ordering we give the pipes.  No state will
change until we reach the end of the bundle and write back the results.

>> You don't need these gotos.  If a given pipeline raises an invalid instruction
>> exception, then it's true that all code emitted afterward is dead.  But that's
>> ok, since it simply won't be executed.  Invalid instructions are exceedingly
>> unlikely and it's silly to optimize for that case.
>>
> 
> OK, thanks. I guess your meaning is: we need not check the return value
> for each pipe, just raise exception is enough. e.g.
> 
>  - Add a flag in DisasContext.
> 
>  - When exception happens, set the flag.
> 
>  - In main looping gen_intermediate_code_internal(), if the flag is set,
>    we need break the tb block, and return.

Yes, that will be fine.


r~
Chen Gang Feb. 26, 2015, 1:44 a.m. UTC | #15
On 02/26/2015 01:19 AM, Richard Henderson wrote:
> On 02/24/2015 05:40 PM, Chen Gang S wrote:
>>>> +static int gen_shl16insli(struct DisasContext *dc,
>>>> +                          unsigned char rdst, unsigned char rsrc, short im16)
>>>
>>> Use uint16_t, as the field is not signed.
>>>
>>
>> objdump print it as signed, e.g. "shl16insli r32, r32, -30680"
> 
> I think you've just found a bug in objdump.  ;-)
> 

According to the ISA document, I guess, it assumes imm16 is unsigned
number (although ISA document does not mention about it, directly). But
for tcg_gen_ori_i64, it assumes imm16 is signed number:

  void tcg_gen_ori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);

So for me, I am not sure that objdump must be a bug. For safety reason,
I still prefer to treat imm16 as signed number.

>>> Use the extract64 and sextract64 functions instead of bare shifts and masks.
>>>
>>
>> OK, thanks What you said above sounds reasonable to me. But if we use
>> "arch/opcode_tilegx.h" next, we can use their own related inline
>> functions instead of.
> 
> Yes, using the opcode_tilegx.h functions is a good idea.  I didn't know about
> those before.  I've looked through that file now and using them will make
> reviewing this code much easier.
> 

OK, thanks.

>>>> +    /* For can regress completely, it must be y0 -> y1 -> y2, or x0 -> x1. */
>>>
>>> I don't know what this sentence means.
>>>
>>
>> I guess, it needs more contents:
>>
>>   y2 and x1 may contents st instruction (x1 may also contents 'jal'),
>>   which should be executed at last, or when exception raised, we can
>>   not be sure each bundle must be atomic (none pipes should execute).
> 
> How about something like:
> 
> 	Only pipelines X1 and Y2 can issue branches, memory ops, or
> 	issue trapping instructions like system calls.  As these

Y1 can issue branches, which can be buffered. And Y1 also can issue call
(which will store pc to sp stack) -- e.g. jalr, jalrp.

> 	operations cannot be buffered, it is convenient to translate
> 	those pipelines last.
> 
> After all, we're already buffering the output of all of the pipes.  If the
> memory op succeeds, then all of the rest of the arithmetic will succeed, so it
> doesn't really matter that much what ordering we give the pipes.  No state will
> change until we reach the end of the bundle and write back the results.
> 

OK, thanks. After check ISA document again, for me, we have to still use
"y0, y1, y2", e.g. de5e598034ac3000 { fnop ; jalr r12 ; st r10, r11 }

 If y0 -> y1 -> y2:

 - if jalr succeeds, it will write pc to sp stack, but sp is not changed
   (just like lr, pc, they are buffered to tcg temporary variables).

 - if st fails, as the result, we can still say the whole bundle is not
   execute (it has already written pc to sp stack, but sp isn't changed,
   so it is still OK).

 If y0 -> y2 -> y1:

 - if st succeeds, it will write data to the useful memory.

 - if jalr fails (e.g. sp stack is full, which may cause memory access
   issue), we can not restore the bundle.


If what I said is correct, for me, we still need some comments, but the
comments really need to be improved, it may like this:

  Must be sure of pipe y1 before pipe y2, or the bundle { ; jalr ... ;
  st ...} may not be restored if failure occurs. For common habbit, we
  can still use "y0, y1, y2" and "x0, x1", it is OK (y1 before y2).
  


>>> You don't need these gotos.  If a given pipeline raises an invalid instruction
>>> exception, then it's true that all code emitted afterward is dead.  But that's
>>> ok, since it simply won't be executed.  Invalid instructions are exceedingly
>>> unlikely and it's silly to optimize for that case.
>>>
>>
>> OK, thanks. I guess your meaning is: we need not check the return value
>> for each pipe, just raise exception is enough. e.g.
>>
>>  - Add a flag in DisasContext.
>>
>>  - When exception happens, set the flag.
>>
>>  - In main looping gen_intermediate_code_internal(), if the flag is set,
>>    we need break the tb block, and return.
> 
> Yes, that will be fine.

OK, thanks.


Thanks.
Richard Henderson Feb. 26, 2015, 4:31 p.m. UTC | #16
On 02/25/2015 03:44 PM, Chen Gang S wrote:
> OK, thanks. After check ISA document again, for me, we have to still use
> "y0, y1, y2", e.g. de5e598034ac3000 { fnop ; jalr r12 ; st r10, r11 }
> 
>  If y0 -> y1 -> y2:
> 
>  - if jalr succeeds, it will write pc to sp stack, but sp is not changed
>    (just like lr, pc, they are buffered to tcg temporary variables).
> 
>  - if st fails, as the result, we can still say the whole bundle is not
>    execute (it has already written pc to sp stack, but sp isn't changed,
>    so it is still OK).
> 
>  If y0 -> y2 -> y1:
> 
>  - if st succeeds, it will write data to the useful memory.
> 
>  - if jalr fails (e.g. sp stack is full, which may cause memory access
>    issue), we can not restore the bundle.

You need to re-check the ISA document.  JALR does not write to the "real" stack
at all, and cannot raise any kind of exception.

Section 2.1.2.3 clearly defines pushReturnStack as part of the branch
prediction mechanism on the cpu.  It can be completely ignored for QEMU.


r~
Chen Gang Feb. 26, 2015, 11:30 p.m. UTC | #17
On 02/27/2015 12:31 AM, Richard Henderson wrote:
> On 02/25/2015 03:44 PM, Chen Gang S wrote:
>> OK, thanks. After check ISA document again, for me, we have to still use
>> "y0, y1, y2", e.g. de5e598034ac3000 { fnop ; jalr r12 ; st r10, r11 }
>>
>>  If y0 -> y1 -> y2:
>>
>>  - if jalr succeeds, it will write pc to sp stack, but sp is not changed
>>    (just like lr, pc, they are buffered to tcg temporary variables).
>>
>>  - if st fails, as the result, we can still say the whole bundle is not
>>    execute (it has already written pc to sp stack, but sp isn't changed,
>>    so it is still OK).
>>
>>  If y0 -> y2 -> y1:
>>
>>  - if st succeeds, it will write data to the useful memory.
>>
>>  - if jalr fails (e.g. sp stack is full, which may cause memory access
>>    issue), we can not restore the bundle.
> 
> You need to re-check the ISA document.  JALR does not write to the "real" stack
> at all, and cannot raise any kind of exception.
> 
> Section 2.1.2.3 clearly defines pushReturnStack as part of the branch
> prediction mechanism on the cpu.  It can be completely ignored for QEMU.
> 

OK, thanks. What you said above sounds reasonable to me.


Thanks.
Chris Metcalf Feb. 27, 2015, 3:01 a.m. UTC | #18
On 2/25/2015 8:44 PM, Chen Gang S wrote:
> On 02/26/2015 01:19 AM, Richard Henderson wrote:
>> On 02/24/2015 05:40 PM, Chen Gang S wrote:
>>>>> +static int gen_shl16insli(struct DisasContext *dc,
>>>>> +                          unsigned char rdst, unsigned char rsrc, short im16)
>>>> Use uint16_t, as the field is not signed.
>>>>
>>> objdump print it as signed, e.g. "shl16insli r32, r32, -30680"
>> I think you've just found a bug in objdump.  ;-)
>>
> According to the ISA document, I guess, it assumes imm16 is unsigned
> number (although ISA document does not mention about it, directly). But
> for tcg_gen_ori_i64, it assumes imm16 is signed number:
>
>    void tcg_gen_ori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
>
> So for me, I am not sure that objdump must be a bug. For safety reason,
> I still prefer to treat imm16 as signed number.

You have to consider the immediate value as having the high 48 bits zero
when you "or" it into the result, so it's hard to consider it as signed in
any meaningful way.  Richard is right.

> Y1 can issue branches, which can be buffered. And Y1 also can issue call
> (which will store pc to sp stack) -- e.g. jalr, jalrp.

Richard already covered this, but yes, jal/jalr etc are not memory ops.
There is only one memory op possible per bundle.
Chen Gang Feb. 27, 2015, 3:41 a.m. UTC | #19
On 2/27/15 11:01, Chris Metcalf wrote:
> On 2/25/2015 8:44 PM, Chen Gang S wrote:
>> On 02/26/2015 01:19 AM, Richard Henderson wrote:
>>> On 02/24/2015 05:40 PM, Chen Gang S wrote:
>>>>>> +static int gen_shl16insli(struct DisasContext *dc,
>>>>>> +                          unsigned char rdst, unsigned char rsrc, short im16)
>>>>> Use uint16_t, as the field is not signed.
>>>>>
>>>> objdump print it as signed, e.g. "shl16insli r32, r32, -30680"
>>> I think you've just found a bug in objdump.  ;-)
>>>
>> According to the ISA document, I guess, it assumes imm16 is unsigned
>> number (although ISA document does not mention about it, directly). But
>> for tcg_gen_ori_i64, it assumes imm16 is signed number:
>>
>>    void tcg_gen_ori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
>>
>> So for me, I am not sure that objdump must be a bug. For safety reason,
>> I still prefer to treat imm16 as signed number.
> 
> You have to consider the immediate value as having the high 48 bits zero
> when you "or" it into the result, so it's hard to consider it as signed in
> any meaningful way.  Richard is right.
> 

That the reason why I have to use:

  imm = (int64_t)im16 & 0x000000000000ffffLL;

But I don't know why both objdump and tcg_gen_ori_i64 treate imm16 as
signed integer. I guess:

 - For tcg_gen_ori_i64, I don't know why, does it need improvement?

 - For objdump, for me, the main reason is that the opcode only has
   TILEGX_OP_TYPE_IMMEDIATE which is only for signed immediate number,
   no one for unsigned immediate number (the same in Linux kernel).

If I have to use unsigned immediate number, I guess, at least, we also
need fix the related issue in both binutils and Linux kernel (if
possible, I should do it, within next month).


>> Y1 can issue branches, which can be buffered. And Y1 also can issue call
>> (which will store pc to sp stack) -- e.g. jalr, jalrp.
> 
> Richard already covered this, but yes, jal/jalr etc are not memory ops.
> There is only one memory op possible per bundle.
> 

OK, thanks.


Thanks.
diff mbox

Patch

diff --git a/linux-user/main.c b/linux-user/main.c
index 76e5e80..ef89bfd 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -4464,6 +4464,7 @@  int main(int argc, char **argv, char **envp)
         env->regs[53] = regs->tp;  /* TILEGX_R_TP */
         env->regs[54] = regs->sp;  /* TILEGX_R_SP */
         env->regs[55] = regs->lr;  /* TILEGX_R_LR */
+        env->zero = 0;
         env->pc = regs->lr;
     }
 #else
diff --git a/target-tilegx/cpu-qom.h b/target-tilegx/cpu-qom.h
index e15a8b8..866a77d 100644
--- a/target-tilegx/cpu-qom.h
+++ b/target-tilegx/cpu-qom.h
@@ -69,4 +69,6 @@  static inline TilegxCPU *tilegx_env_get_cpu(CPUTLState *env)
 
 #define ENV_GET_CPU(e) CPU(tilegx_env_get_cpu(e))
 
+#define ENV_OFFSET offsetof(TilegxCPU, env)
+
 #endif
diff --git a/target-tilegx/cpu.c b/target-tilegx/cpu.c
index 3dd66b5..a10cc24 100644
--- a/target-tilegx/cpu.c
+++ b/target-tilegx/cpu.c
@@ -69,10 +69,6 @@  static void tilegx_cpu_realizefn(DeviceState *dev, Error **errp)
     mcc->parent_realize(dev, errp);
 }
 
-static void tilegx_tcg_init(void)
-{
-}
-
 static void tilegx_cpu_initfn(Object *obj)
 {
     CPUState *cs = CPU(obj);
diff --git a/target-tilegx/cpu.h b/target-tilegx/cpu.h
index 09a2b26..73794ab 100644
--- a/target-tilegx/cpu.h
+++ b/target-tilegx/cpu.h
@@ -30,16 +30,20 @@ 
 #include "fpu/softfloat.h"
 
 /* Tilegx register alias */
-#define TILEGX_R_RE  0   /*  0 register, for function/syscall return value */
-#define TILEGX_R_NR  10  /* 10 register, for syscall number */
-#define TILEGX_R_BP  52  /* 52 register, optional frame pointer */
-#define TILEGX_R_TP  53  /* TP register, thread local storage data */
-#define TILEGX_R_SP  54  /* SP register, stack pointer */
-#define TILEGX_R_LR  55  /* LR register, may save pc, but it is not pc */
+#define TILEGX_R_RE    0   /*  0 register, for function/syscall return value */
+#define TILEGX_R_NR    10  /* 10 register, for syscall number */
+#define TILEGX_R_BP    52  /* 52 register, optional frame pointer */
+#define TILEGX_R_TP    53  /* TP register, thread local storage data */
+#define TILEGX_R_SP    54  /* SP register, stack pointer */
+#define TILEGX_R_LR    55  /* LR register, may save pc, but it is not pc */
+#define TILEGX_R_ZERO  63  /* Zero register, always zero */
+#define TILEGX_R_COUNT 56  /* Only 56 registers are really useful */
+#define TILEGX_R_NOREG 255 /* Invalid register value */
 
 typedef struct CPUTLState {
-    uint64_t regs[56];
-    uint64_t pc;
+    uint64_t regs[TILEGX_R_COUNT]; /* Common used registers by outside */
+    uint64_t zero;                 /* Zero register */
+    uint64_t pc;                   /* Current pc */
     CPU_COMMON
 } CPUTLState;
 
@@ -54,6 +58,7 @@  typedef struct CPUTLState {
 
 #include "exec/cpu-all.h"
 
+void tilegx_tcg_init(void);
 int cpu_tilegx_exec(CPUTLState *s);
 int cpu_tilegx_signal_handler(int host_signum, void *pinfo, void *puc);
 
diff --git a/target-tilegx/translate.c b/target-tilegx/translate.c
index 5131fa7..82c751e 100644
--- a/target-tilegx/translate.c
+++ b/target-tilegx/translate.c
@@ -25,18 +25,880 @@ 
 #include "exec/cpu_ldst.h"
 #include "exec/helper-gen.h"
 
+#define TILEGX_BUNDLE_SIZE            8  /* Each bundle size in bytes */
+#define TILEGX_BUNDLE_INSNS           3  /* Maximized insns per bundle */
+#define TILEGX_BUNDLE_OPCS            10 /* Assume maximized opcs per bundle */
+
+/* Check Bundle whether is Y type, else is X type */
+#define TILEGX_BUNDLE_TYPE_MASK       0xc000000000000000ULL
+#define TILEGX_BUNDLE_TYPE_Y(bundle)  ((bundle) & TILEGX_BUNDLE_TYPE_MASK)
+#define TILEGX_BUNDLE_TYPE_X(bundle)  (!TILEGX_BUNDLE_TYPE_Y(bundle))
+
+/* Bundle pipe mask, still remain the bundle type */
+#define TILEGX_PIPE_X0(bundle)        ((bundle) & 0x000000007fffffffULL)
+#define TILEGX_PIPE_X1(bundle)        ((bundle) & 0x3fffffff80000000ULL)
+#define TILEGX_PIPE_Y0(bundle)        ((bundle) & 0x00000000780fffffULL)
+#define TILEGX_PIPE_Y1(bundle)        ((bundle) & 0x3c07ffff80000000ULL)
+#define TILEGX_PIPE_Y2(bundle)        ((bundle) & 0x03f8000007f00000ULL)
+
+/* Code mask */
+#define TILEGX_CODE_X0(bundle)        ((bundle) & 0x0000000070000000ULL)
+#define TILEGX_CODE_X0_18(bundle)     ((bundle) & 0x000000000ffc0000ULL)
+#define TILEGX_CODE_X0_20(bundle)     ((bundle) & 0x000000000ff00000ULL)
+#define TILEGX_CODE_X1(bundle)        ((bundle) & 0x3800000000000000ULL)
+#define TILEGX_CODE_X1_49(bundle)     ((bundle) & 0x07fe000000000000ULL)
+#define TILEGX_CODE_X1_49_43(bundle)  ((bundle) & 0x0001f80000000000ULL)
+#define TILEGX_CODE_X1_51(bundle)     ((bundle) & 0x07f8000000000000ULL)
+#define TILEGX_CODE_X1_54(bundle)     ((bundle) & 0x07c0000000000000ULL)
+#define TILEGX_CODE_Y0(bundle)        ((bundle) & 0x0000000078000000ULL)
+#define TILEGX_CODE_Y0_E(bundle)      ((bundle) & 0x00000000000c0000ULL)
+#define TILEGX_CODE_Y1(bundle)        ((bundle) & 0x3c00000000000000ULL)
+#define TILEGX_CODE_Y1_E(bundle)      ((bundle) & 0x0006000000000000ULL)
+#define TILEGX_CODE_Y2(bundle)        ((bundle) & 0x0200000004000000ULL)
+/* No Y2_E */
+
+/* (F)Nop operation, only have effect within their own pipe */
+#define TILEGX_OPCX0_FNOP             0x0000000051483000ULL
+#define TILEGX_OPCX0_NOP              0x0000000051485000ULL
+#define TILEGX_OPCX1_FNOP             0x286a300000000000ULL
+#define TILEGX_OPCX1_NOP              0x286b080000000000ULL
+#define TILEGX_OPCY0_FNOP             0x00000000300c3000ULL
+#define TILEGX_OPCY0_NOP              0x00000000300c5000ULL
+#define TILEGX_OPCY1_FNOP             0x1c06400000000000ULL
+#define TILEGX_OPCY1_NOP              0x1c06780000000000ULL
+/* No Y2 (F)NOP */
+
+/* Data width mask */
+#define TILEGX_DATA_REGISTER          0x3f    /* Register data width mask */
+#define TILEGX_DATA_IMM6              0x3f    /* Imm6 data width mask */
+#define TILEGX_DATA_IMM8              0xff    /* Imm8 data width mask */
+#define TILEGX_DATA_IMM11             0x7ff   /* Imm11 data width mask */
+#define TILEGX_DATA_IMM16             0xffff  /* Imm16 data width mask */
+
+#define TILEGX_GEN_CHK_BEGIN(x) \
+    if ((x) == TILEGX_R_ZERO) {
+#define TILEGX_GEN_CHK_END(x) \
+        return 0; \
+    } \
+    if ((x) >= TILEGX_R_COUNT) { \
+        return -1; \
+    }
+
+#define TILEGX_GEN_CHK_SIMPLE(x) \
+    TILEGX_GEN_CHK_BEGIN(x) \
+    TILEGX_GEN_CHK_END(x)
+
+#define TILEGX_GEN_SAVE_PREP(rdst) \
+    dc->tmp_regcur->idx = rdst; \
+    dc->tmp_regcur->val = tcg_temp_new_i64();
+
+#define TILEGX_GEN_SAVE_TMP_1(func, rdst, x1) \
+    TILEGX_GEN_SAVE_PREP(rdst) \
+    func(dc->tmp_regcur->val, x1);
+
+#define TILEGX_GEN_SAVE_TMP_2(func, rdst, x1, x2) \
+    TILEGX_GEN_SAVE_PREP(rdst) \
+    func(dc->tmp_regcur->val, x1, x2);
+
+static TCGv_ptr cpu_env;
+static TCGv_i64 cpu_pc;
+static TCGv_i64 cpu_regs[TILEGX_R_COUNT];
+
+static const char *reg_names[] = {
+     "r0",  "r1",  "r2",  "r3",  "r4",  "r5",  "r6",  "r7",
+     "r8",  "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+    "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
+    "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31",
+    "r32", "r33", "r34", "r35", "r36", "r37", "r38", "r39",
+    "r40", "r41", "r42", "r43", "r44", "r45", "r46", "r47",
+    "r48", "r49", "r50", "r51",  "bp",  "tp",  "sp",  "lr"
+};
+
+#include "exec/gen-icount.h"
+
+/* This is the state at translation time.  */
+struct DisasContext {
+    uint64_t pc;                   /* Current pc */
+    unsigned int cpustate_changed; /* Whether cpu state changed */
+
+    struct {
+        unsigned char idx;         /* index */
+        TCGv val;                  /* value */
+    } *tmp_regcur,                 /* Current temporary registers */
+    tmp_regs[TILEGX_BUNDLE_INSNS]; /* All temporary registers */
+
+    struct {
+        TCGCond cond;              /* Branch condition */
+        TCGv dest;                 /* pc jump destination, if will jump */
+        TCGv val1;                 /* Firt value for condition comparing */
+        TCGv val2;                 /* Second value for condition comparing */
+    } jmp;                         /* Jump object, only once in each TB block */
+};
+
+void tilegx_tcg_init(void)
+{
+    int i;
+
+    cpu_env = tcg_global_reg_new_ptr(TCG_AREG0, "env");
+    cpu_pc = tcg_global_mem_new_i64(TCG_AREG0, offsetof(CPUTLState, pc), "pc");
+    for (i = 0; i < TILEGX_R_COUNT; i++) {
+        cpu_regs[i] = tcg_global_mem_new_i64(TCG_AREG0,
+                                             offsetof(CPUTLState, regs[i]),
+                                             reg_names[i]);
+    }
+}
+
+static int gen_move(struct DisasContext *dc,
+                    unsigned char rdst, unsigned char rsrc)
+{
+    qemu_log("move r%d, r%d", rdst, rsrc);
+    TILEGX_GEN_CHK_SIMPLE(rdst);
+    TILEGX_GEN_CHK_BEGIN(rsrc);
+        TILEGX_GEN_SAVE_TMP_1(tcg_gen_movi_i64, rdst, 0);
+    TILEGX_GEN_CHK_END(rsrc);
+
+    TILEGX_GEN_SAVE_TMP_1(tcg_gen_mov_i64, rdst, cpu_regs[rsrc]);
+    return 0;
+}
+
+static int gen_move_imm(struct DisasContext *dc,
+                        unsigned char rdst, int64_t imm)
+{
+    TILEGX_GEN_CHK_SIMPLE(rdst);
+    TILEGX_GEN_SAVE_TMP_1(tcg_gen_movi_i64, rdst, imm);
+    return 0;
+}
+
+static int gen_moveli(struct DisasContext *dc, unsigned char rdst, short im16)
+{
+    qemu_log("moveli r%d, %d", rdst, im16);
+    return gen_move_imm(dc, rdst, (int64_t)im16);
+}
+
+static int gen_movei(struct DisasContext *dc, unsigned char rdst, char im8)
+{
+    qemu_log("movei r%d, %d", rdst, im8);
+    return gen_move_imm(dc, rdst, (int64_t)im8);
+}
+
+static int gen_ld(struct DisasContext *dc,
+                  unsigned char rdst, unsigned char rsrc)
+{
+    qemu_log("ld r%d, r%d", rdst, rsrc);
+    TILEGX_GEN_CHK_SIMPLE(rdst);
+    TILEGX_GEN_CHK_BEGIN(rsrc);
+        TILEGX_GEN_SAVE_TMP_2(tcg_gen_ld_i64, rdst, cpu_env, 0);
+    TILEGX_GEN_CHK_END(rsrc);
+
+    TILEGX_GEN_SAVE_TMP_2(tcg_gen_ld_i64, rdst, cpu_env,
+                          offsetof(CPUTLState, regs[rsrc]));
+    return 0;
+}
+
+static int gen_st(struct DisasContext *dc,
+                  unsigned char rdst, unsigned char rsrc)
+{
+    qemu_log("st r%d, r%d", rdst, rsrc);
+    TILEGX_GEN_CHK_SIMPLE(rdst);
+    TILEGX_GEN_CHK_BEGIN(rsrc);
+        tcg_gen_st_i64(cpu_regs[rdst], cpu_env, offsetof(CPUTLState, zero));
+    TILEGX_GEN_CHK_END(rsrc);
+
+    tcg_gen_st_i64(cpu_regs[rdst], cpu_env, offsetof(CPUTLState, regs[rsrc]));
+    return 0;
+}
+
+static int gen_addimm(struct DisasContext *dc,
+                      unsigned char rdst, unsigned char rsrc, int64_t imm)
+{
+    TILEGX_GEN_CHK_SIMPLE(rdst);
+    TILEGX_GEN_CHK_BEGIN(rsrc);
+        TILEGX_GEN_SAVE_TMP_1(tcg_gen_movi_i64, rdst, imm);
+    TILEGX_GEN_CHK_END(rsrc);
+
+    TILEGX_GEN_SAVE_TMP_2(tcg_gen_addi_i64, rdst, cpu_regs[rsrc], imm);
+    return 0;
+}
+
+static int gen_addi(struct DisasContext *dc,
+                    unsigned char rdst, unsigned char rsrc, char im8)
+{
+    qemu_log("addi r%d, r%d, %d", rdst, rsrc, im8);
+    return gen_addimm(dc, rdst, rsrc, (int64_t)im8);
+}
+
+static int gen_addli(struct DisasContext *dc,
+                     unsigned char rdst, unsigned char rsrc, short im16)
+{
+    qemu_log("addli r%d, r%d, %d", rdst, rsrc, im16);
+    return gen_addimm(dc, rdst, rsrc, (int64_t)im16);
+}
+
+static int gen_add(struct DisasContext *dc,
+                   unsigned char rdst, unsigned char rsrc, unsigned char rsrcb)
+{
+    qemu_log("add r%d, r%d, r%d", rdst, rsrc, rsrcb);
+    TILEGX_GEN_CHK_SIMPLE(rdst);
+    TILEGX_GEN_CHK_BEGIN(rsrc);
+        TILEGX_GEN_CHK_BEGIN(rsrcb);
+            TILEGX_GEN_SAVE_TMP_1(tcg_gen_movi_i64, rdst, 0);
+        TILEGX_GEN_CHK_END(rsrcb);
+        TILEGX_GEN_SAVE_TMP_1(tcg_gen_mov_i64, rdst, cpu_regs[rsrcb]);
+    TILEGX_GEN_CHK_END(rsrc);
+
+    TILEGX_GEN_CHK_BEGIN(rsrcb);
+        TILEGX_GEN_SAVE_TMP_1(tcg_gen_mov_i64, rdst, cpu_regs[rsrc]);
+    TILEGX_GEN_CHK_END(rsrcb);
+
+    TILEGX_GEN_SAVE_TMP_2(tcg_gen_add_i64, rdst,
+                          cpu_regs[rsrc], cpu_regs[rsrcb]);
+    return 0;
+}
+
+static int gen_andimm(struct DisasContext *dc,
+                      unsigned char rdst, unsigned char rsrc, int64_t imm)
+{
+    TILEGX_GEN_CHK_SIMPLE(rdst);
+    TILEGX_GEN_CHK_BEGIN(rsrc);
+        TILEGX_GEN_SAVE_TMP_1(tcg_gen_movi_i64, rdst, 0);
+    TILEGX_GEN_CHK_END(rsrc);
+
+    TILEGX_GEN_SAVE_TMP_2(tcg_gen_andi_i64, rdst,
+                          cpu_regs[rsrc], (uint64_t)imm);
+    return 0;
+}
+
+static int gen_andi(struct DisasContext *dc,
+                    unsigned char rdst, unsigned char rsrc, char im8)
+{
+    qemu_log("andi r%d, r%d, %d", rdst, rsrc, im8);
+    return gen_andimm(dc, rdst, rsrc, (int64_t)im8);
+}
+
+static int gen_lnk(struct DisasContext *dc, unsigned char rdst)
+{
+    qemu_log("lnk r%d", rdst);
+    TILEGX_GEN_CHK_SIMPLE(rdst);
+    TILEGX_GEN_SAVE_TMP_1(tcg_gen_movi_i64, rdst, dc->pc + TILEGX_BUNDLE_SIZE);
+    return 0;
+}
+
+static int gen_shl16insli(struct DisasContext *dc,
+                          unsigned char rdst, unsigned char rsrc, short im16)
+{
+    int64_t imm;
+    TCGv_i64 tmp;
+
+    qemu_log("shl16insli r%d, r%d, %d", rdst, rsrc, im16);
+    TILEGX_GEN_CHK_SIMPLE(rdst);
+
+    imm = (int64_t)im16 & 0x000000000000ffffLL;
+
+    TILEGX_GEN_CHK_BEGIN(rsrc);
+        TILEGX_GEN_SAVE_TMP_1(tcg_gen_movi_i64, rdst, imm);
+    TILEGX_GEN_CHK_END(rsrc);
+
+    tmp = tcg_temp_new_i64();
+    tcg_gen_shli_i64(tmp, cpu_regs[rsrc], 16);
+    TILEGX_GEN_SAVE_TMP_2(tcg_gen_ori_i64, rdst, tmp, imm);
+    tcg_temp_free_i64(tmp);
+    return 0;
+}
+
+static int gen_jr(struct DisasContext *dc, unsigned char rsrc)
+{
+    qemu_log("jr r%d", rsrc);
+    TILEGX_GEN_CHK_SIMPLE(rsrc);
+
+    dc->jmp.dest = tcg_temp_new_i64();
+
+    dc->jmp.cond = TCG_COND_ALWAYS;
+    tcg_gen_mov_i64(dc->jmp.dest, cpu_regs[rsrc]);
+    return 0;
+}
+
+static int gen_beqz(struct DisasContext *dc, unsigned char rsrc, int off)
+{
+    qemu_log("beqz r%d, %d", rsrc, off);
+    TILEGX_GEN_CHK_SIMPLE(rsrc);
+
+    dc->jmp.dest = tcg_temp_new_i64();
+    dc->jmp.val1 = tcg_temp_new_i64();
+    dc->jmp.val2 = tcg_temp_new_i64();
+
+    dc->jmp.cond = TCG_COND_EQ;
+    tcg_gen_movi_i64(dc->jmp.dest, dc->pc + (int64_t)off * TILEGX_BUNDLE_SIZE);
+    tcg_gen_mov_i64(dc->jmp.val1, cpu_regs[rsrc]);
+    tcg_gen_movi_i64(dc->jmp.val2, 0);
+
+    return 0;
+}
+
+static int translate_x0_bundle(struct DisasContext *dc, uint64_t bundle)
+{
+    unsigned char rdst, rsrc, rsrcb;
+    char im8;
+    short im16;
+
+    qemu_log("\nx0: %16.16lx\t", bundle);
+    if (bundle == TILEGX_OPCX0_FNOP) {
+        qemu_log("fnop");
+        return 0;
+    }
+
+    if (bundle == TILEGX_OPCX0_NOP) {
+        qemu_log("nop");
+        return 0;
+    }
+
+    switch (TILEGX_CODE_X0(bundle)) {
+    case 0x0000000010000000ULL:
+        rdst = (unsigned char)(bundle & TILEGX_DATA_REGISTER);
+        rsrc = (unsigned char)((bundle >> 6) & TILEGX_DATA_REGISTER);
+        im16 = (short)((bundle >> 12) & TILEGX_DATA_IMM16);
+        if (rsrc == 0x3f) {
+            /* moveli Dest, Imm16 */
+            if (gen_moveli(dc, rdst, im16)) {
+                /* FIXME: raise an exception for invalid instruction */
+                return -1;
+            }
+        } else {
+            /* addli Dest, SrcA, Imm16 */
+            if (gen_addli(dc, rdst, rsrc, im16)) {
+                /* FIXME: raise an exception for invalid instruction */
+                return -1;
+            }
+        }
+        break;
+
+    case 0x0000000040000000ULL:
+        switch (TILEGX_CODE_X0_20(bundle)) {
+        /* andi Dest, SrcA, Imm8 */
+        case 0x0000000000300000ULL:
+            rdst = (unsigned char)(bundle & TILEGX_DATA_REGISTER);
+            rsrc = (unsigned char)((bundle >> 6) & TILEGX_DATA_REGISTER);
+            im8 = (char)((bundle >> 12) & TILEGX_DATA_IMM8);
+            if (gen_andi(dc, rdst, rsrc, im8)) {
+                /* FIXME: raise an exception for invalid instruction */
+                return -1;
+            }
+            break;
+
+        /* addi Dest, SrcA, Imm8 */
+        case 0x0000000000100000ULL:
+            rdst = (unsigned char)(bundle & TILEGX_DATA_REGISTER);
+            rsrc = (unsigned char)((bundle >> 6) & TILEGX_DATA_REGISTER);
+            im8 = (char)((bundle >> 12) & TILEGX_DATA_IMM8);
+            if (gen_addi(dc, rdst, rsrc, im8)) {
+                /* FIXME: raise an exception for invalid instruction */
+                return -1;
+            }
+            break;
+
+        default:
+            /* FIXME: raise an exception for invalid instruction */
+            qemu_log("20 bundle value: %16.16llx", TILEGX_CODE_X0_20(bundle));
+            return -1;
+        }
+        break;
+
+    case 0x0000000050000000ULL:
+        switch (TILEGX_CODE_X0_18(bundle)) {
+        /* add Dest, SrcA, SrcB */
+        case 0x00000000000c0000ULL:
+            rdst = (unsigned char)(bundle & TILEGX_DATA_REGISTER);
+            rsrc = (unsigned char)((bundle >> 6) & TILEGX_DATA_REGISTER);
+            rsrcb = (unsigned char)((bundle >> 12) & TILEGX_DATA_REGISTER);
+            if (gen_add(dc, rdst, rsrc, rsrcb)) {
+                /* FIXME: raise an exception for invalid instruction */
+                return -1;
+            }
+            break;
+
+        case 0x0000000001040000ULL:
+            rdst = (unsigned char)(bundle & TILEGX_DATA_REGISTER);
+            rsrc = (unsigned char)((bundle >> 6) & TILEGX_DATA_REGISTER);
+            rsrcb = (unsigned char)((bundle >> 12) & TILEGX_DATA_REGISTER);
+            /* move Dest, Src */
+            if (rsrcb == 0x3f) {
+                if (gen_move(dc, rdst, rsrc)) {
+                    /* FIXME: raise an exception for invalid instruction */
+                    return -1;
+                }
+            } else {
+                qemu_log("invalid rsrcb, not 0x3f for move in x0");
+            }
+            break;
+
+        default:
+            /* FIXME: raise an exception for invalid instruction */
+            qemu_log("18 bundle value: %16.16llx", TILEGX_CODE_X0_18(bundle));
+            return -1;
+        }
+        break;
+
+    /* shl16insli Dest, SrcA, Imm16 */
+    case 0x0000000070000000ULL:
+        rdst = (unsigned char)(bundle & TILEGX_DATA_REGISTER);
+        rsrc = (unsigned char)((bundle >> 6) & TILEGX_DATA_REGISTER);
+        im16 = (short)((bundle >> 12) & TILEGX_DATA_IMM16);
+        if (gen_shl16insli(dc, rdst, rsrc, im16)) {
+            /* FIXME: raise an exception for invalid instruction */
+            return -1;
+        }
+        break;
+
+    default:
+        /* FIXME: raise an exception for invalid instruction */
+        qemu_log("bundle value: %16.16llx", TILEGX_CODE_X0(bundle));
+        return -1;
+    }
+
+    return 0;
+}
+
+static int translate_x1_bundle(struct DisasContext *dc, uint64_t bundle)
+{
+    unsigned char rdst, rsrc, rsrcb;
+    char im8;
+    short im16;
+    struct {
+        int v :17;
+    } off17;
+
+    qemu_log("\nx1: %16.16lx\t", bundle);
+    if (bundle == TILEGX_OPCX1_FNOP) {
+        qemu_log("fnop");
+        return 0;
+    }
+
+    if (bundle == TILEGX_OPCX1_NOP) {
+        qemu_log("nop");
+        return 0;
+    }
+
+    switch (TILEGX_CODE_X1(bundle)) {
+    case 0x0000000000000000ULL:
+        rdst = (unsigned char)((bundle >> 31) & TILEGX_DATA_REGISTER);
+        rsrc = (unsigned char)((bundle >> 37) & TILEGX_DATA_REGISTER);
+        im16 = (short)((bundle >> 43) & TILEGX_DATA_IMM16);
+        if (rsrc == 0x3f) {
+            /* moveli Dest, Imm16 */
+            if (gen_moveli(dc, rdst, im16)) {
+                /* FIXME: raise an exception for invalid instruction */
+                return -1;
+            }
+        } else {
+            /* addli Dest, SrcA, Imm16 */
+            if (gen_addli(dc, rdst, rsrc, im16)) {
+                /* FIXME: raise an exception for invalid instruction */
+                return -1;
+            }
+        }
+        break;
+
+    case 0x1000000000000000ULL:
+        switch (TILEGX_CODE_X1_54(bundle)) {
+        /* beqz Src, Broff */
+        case 0x0440000000000000ULL:
+            off17.v = (int)((bundle >> 31) & TILEGX_DATA_IMM6);
+            rsrc = (unsigned char)((bundle >> 37) & TILEGX_DATA_REGISTER);
+            off17.v |= (int)((bundle >> 37) & (TILEGX_DATA_IMM11 << 6));
+            if (gen_beqz(dc, rsrc, (int)off17.v)) {
+                /* FIXME: raise an exception for invalid instruction */
+                return -1;
+            }
+            break;
+
+        default:
+            /* FIXME: raise an exception for invalid instruction */
+            qemu_log("54 bundle value: %llx", TILEGX_CODE_X1_51(bundle));
+            return -1;
+        }
+        break;
+
+    case 0x1800000000000000ULL:
+        switch (TILEGX_CODE_X1_51(bundle)) {
+        /* addi Dest, SrcA, Imm8 */
+        case 0x0008000000000000ULL:
+            rdst = (unsigned char)((bundle >> 31) & TILEGX_DATA_REGISTER);
+            rsrc = (unsigned char)((bundle >> 37) & TILEGX_DATA_REGISTER);
+            im8 = (char)((bundle >> 43) & TILEGX_DATA_IMM8);
+            if (rsrc == 0x3f) {
+                if (gen_movei(dc, rdst, im8)) {
+                    /* FIXME: raise an exception for invalid instruction */
+                    return -1;
+                }
+            } else {
+                if (gen_addi(dc, rdst, rsrc, im8)) {
+                    /* FIXME: raise an exception for invalid instruction */
+                    return -1;
+                }
+            }
+            break;
+        default:
+            /* FIXME: raise an exception for invalid instruction */
+            qemu_log("51 bundle value: %llx", TILEGX_CODE_X1_51(bundle));
+            return -1;
+        }
+        break;
+
+    case 0x2800000000000000ULL:
+        switch (TILEGX_CODE_X1_49(bundle)) {
+        /* add Dest, SrcA, SrcB */
+        case 0x0006000000000000ULL:
+            rdst = (unsigned char)((bundle >> 31) & TILEGX_DATA_REGISTER);
+            rsrc = (unsigned char)((bundle >> 37) & TILEGX_DATA_REGISTER);
+            rsrcb = (unsigned char)((bundle >> 43) & TILEGX_DATA_REGISTER);
+            if (gen_add(dc, rdst, rsrc, rsrcb)) {
+                /* FIXME: raise an exception for invalid instruction */
+                return -1;
+            }
+            break;
+
+        /* store Dest, Src */
+        case 0x0062000000000000ULL:
+            rdst = (unsigned char)((bundle >> 37) & TILEGX_DATA_REGISTER);
+            rsrc = (unsigned char)((bundle >> 43) & TILEGX_DATA_REGISTER);
+            if (gen_st(dc, rdst, rsrc)) {
+                /* FIXME: raise an exception for invalid instruction */
+                return -1;
+            }
+            break;
+
+        case 0x006a000000000000ULL:
+            switch (TILEGX_CODE_X1_49_43(bundle)) {
+            /* lnk Dest */
+            case 0x0000f00000000000ULL:
+                rdst = (unsigned char)((bundle >> 31)  & TILEGX_DATA_REGISTER);
+                if (gen_lnk(dc, rdst)) {
+                    /* FIXME: raise an exception for invalid instruction */
+                    return -1;
+                }
+                break;
+
+            /* jr SrcA */
+            case 0x0000700000000000ULL:
+                rsrc = (unsigned char)((bundle >> 37) & TILEGX_DATA_REGISTER);
+                if (gen_jr(dc, rsrc)) {
+                    /* FIXME: raise an exception for invalid instruction */
+                    return -1;
+                }
+                break;
+
+            default:
+                /* FIXME: raise an exception for invalid instruction */
+                qemu_log("49_43 bundle value: %16.16llx",
+                       TILEGX_CODE_X1_49_43(bundle));
+                return -1;
+            }
+            break;
+        default:
+            /* FIXME: raise an exception for invalid instruction */
+            qemu_log("49 bundle value: %16.16llx", TILEGX_CODE_X1_49(bundle));
+            return -1;
+        }
+        break;
+
+    /* shl16insli Dest, SrcA, Imm16 */
+    case 0x3800000000000000ULL:
+        rdst = (unsigned char)((bundle >> 31) & TILEGX_DATA_REGISTER);
+        rsrc = (unsigned char)((bundle >> 37) & TILEGX_DATA_REGISTER);
+        im16 = (short)((bundle >> 43) & TILEGX_DATA_IMM16);
+        if (gen_shl16insli(dc, rdst, rsrc, im16)) {
+            /* FIXME: raise an exception for invalid instruction */
+            return -1;
+        }
+        break;
+
+    default:
+        /* FIXME: raise an exception for invalid instruction */
+        qemu_log("bundle value: %16.16llx", TILEGX_CODE_X1(bundle));
+        return -1;
+    }
+
+    return 0;
+}
+
+static int translate_y0_bundle(struct DisasContext *dc, uint64_t bundle)
+{
+    unsigned char rsrc, rdst;
+    char im8;
+
+    qemu_log("\ny0: %16.16lx\t", bundle);
+    if (bundle == TILEGX_OPCY0_FNOP) {
+        qemu_log("fnop");
+        return 0;
+    }
+
+    if (bundle == TILEGX_OPCY0_NOP) {
+        qemu_log("nop");
+        return 0;
+    }
+
+    switch (TILEGX_CODE_Y0(bundle)) {
+    /* addi Dest, SrcA, Imm8 */
+    case 0x0000000000000000ULL:
+        rdst = (unsigned char)(bundle & TILEGX_DATA_REGISTER);
+        rsrc = (unsigned char)((bundle >> 6) & TILEGX_DATA_REGISTER);
+        im8 = (char)((bundle >> 12) & TILEGX_DATA_IMM8);
+        if (gen_addi(dc, rdst, rsrc, im8)) {
+            /* FIXME: raise an exception for invalid instruction */
+            return -1;
+        }
+        break;
+
+    case 0x0000000050000000ULL:
+        switch (TILEGX_CODE_Y0_E(bundle)) {
+        /* move Dest, SrcA */
+        case 0x0000000000080000ULL:
+            rdst = (unsigned char)(bundle & TILEGX_DATA_REGISTER);
+            rsrc = (unsigned char)((bundle >> 6) & TILEGX_DATA_REGISTER);
+            if (gen_move(dc, rdst, rsrc)) {
+                /* FIXME: raise an exception for invalid instruction */
+                return -1;
+            }
+            break;
+
+        default:
+            /* FIXME: raise an exception for invalid instruction */
+            return -1;
+        }
+        break;
+
+    default:
+        /* FIXME: raise an exception for invalid instruction */
+        qemu_log("bundle value: %16.16llx", TILEGX_CODE_Y0(bundle));
+        return -1;
+    }
+
+    return 0;
+}
+
+static int translate_y1_bundle(struct DisasContext *dc, uint64_t bundle)
+{
+    unsigned char rdst, rsrc, rsrcb;
+    char im8;
+
+    qemu_log("\ny1: %16.16lx\t", bundle);
+    if (bundle == TILEGX_OPCY1_FNOP) {
+        qemu_log("fnop");
+        return 0;
+    }
+
+    if (bundle == TILEGX_OPCY1_NOP) {
+        qemu_log("nop");
+        return 0;
+    }
+
+    switch (TILEGX_CODE_Y1(bundle)) {
+    /* addi Dest, SrcA, Imm8 */
+    case 0x0400000000000000ULL:
+        rdst = (unsigned char)((bundle >> 31) & TILEGX_DATA_REGISTER);
+        rsrc = (unsigned char)((bundle >> 37) & TILEGX_DATA_REGISTER);
+        im8 = (char)((bundle >> 43) & TILEGX_DATA_IMM8);
+        if (gen_addi(dc, rdst, rsrc, im8)) {
+            /* FIXME: raise an exception for invalid instruction */
+            return -1;
+        }
+        break;
+
+    case 0x2c00000000000000ULL:
+        switch(TILEGX_CODE_Y1_E(bundle)) {
+        case 0x0004000000000000ULL:
+            rdst = (unsigned char)((bundle >> 31) & TILEGX_DATA_REGISTER);
+            rsrc = (unsigned char)((bundle >> 37) & TILEGX_DATA_REGISTER);
+            rsrcb = (unsigned char)((bundle >> 43) & TILEGX_DATA_REGISTER);
+            /* move Dest, Src */
+            if (rsrcb == 0x3f) {
+                if (gen_move(dc, rdst, rsrc)) {
+                    /* FIXME: raise an exception for invalid instruction */
+                    return -1;
+                }
+            } else {
+                qemu_log("invalid rsrcb, not 0x3f for move in y1");
+            }
+            break;
+
+        default:
+            /* FIXME: raise an exception for invalid instruction */
+            qemu_log("y1 E bundle value: %16.16llx", TILEGX_CODE_Y1_E(bundle));
+            break;
+        }
+        break;
+
+    default:
+        /* FIXME: raise an exception for invalid instruction */
+        qemu_log("bundle value: %16.16llx", TILEGX_CODE_Y1(bundle));
+        return -1;
+    }
+
+    return 0;
+}
+
+static int translate_y2_bundle(struct DisasContext *dc, uint64_t bundle,
+                               uint64_t mode)
+{
+    unsigned char rsrc, rdst;
+
+    qemu_log("\ny2: %16.16lx\t", bundle);
+    switch (TILEGX_CODE_Y2(bundle)) {
+    case 0x0200000004000000ULL:
+        switch (mode) {
+        /* ld Dest, Src */
+        case 0x8000000000000000ULL:
+            rdst = (unsigned char)((bundle >> 51) & TILEGX_DATA_REGISTER);
+            rsrc = (unsigned char)((bundle >> 20) & TILEGX_DATA_REGISTER);
+            if (gen_ld(dc, rdst, rsrc)) {
+                /* FIXME: raise an exception for invalid instruction */
+                return -1;
+            }
+            break;
+        /* st Dest, Src */
+        case 0xc000000000000000ULL:
+            rdst = (unsigned char)((bundle >> 20) & TILEGX_DATA_REGISTER);
+            rsrc = (unsigned char)((bundle >> 51) & TILEGX_DATA_REGISTER);
+            if (gen_st(dc, rdst, rsrc)) {
+                /* FIXME: raise an exception for invalid instruction */
+                return -1;
+            }
+            break;
+        default:
+            /* FIXME: raise an exception for invalid instruction */
+            qemu_log("bundle value: %16.16llx,  mode: %16.16lx",
+                   TILEGX_CODE_Y2(bundle), mode);
+            return -1;
+        }
+        break;
+
+    default:
+        /* FIXME: raise an exception for invalid instruction */
+        qemu_log("bundle value: %16.16llx", TILEGX_CODE_Y2(bundle));
+        return -1;
+    }
+
+    return 0;
+}
+
+static int translate_one_bundle(struct DisasContext *dc, uint64_t bundle)
+{
+    int i, ret = 0;
+    TCGv tmp;
+
+    for (i = 0; i < TILEGX_BUNDLE_INSNS; i++) {
+        dc->tmp_regs[i].idx = TILEGX_R_NOREG;
+        TCGV_UNUSED_I64(dc->tmp_regs[i].val);
+    }
+
+    /* For can regress completely, it must be y0 -> y1 -> y2, or x0 -> x1. */
+    if (TILEGX_BUNDLE_TYPE_Y(bundle)) {
+        dc->tmp_regcur = dc->tmp_regs + 0;
+        ret = translate_y0_bundle(dc, TILEGX_PIPE_Y0(bundle));
+        if (ret) {
+            goto err;
+        }
+        dc->tmp_regcur = dc->tmp_regs + 1;
+        ret = translate_y1_bundle(dc, TILEGX_PIPE_Y1(bundle));
+        if (ret) {
+            goto err;
+        }
+        dc->tmp_regcur = dc->tmp_regs + 2;
+        ret = translate_y2_bundle(dc, TILEGX_PIPE_Y2(bundle),
+                                  bundle & TILEGX_BUNDLE_TYPE_MASK);
+        if (ret) {
+            goto err;
+        }
+    } else {
+        dc->tmp_regcur = dc->tmp_regs + 0;
+        ret = translate_x0_bundle(dc, TILEGX_PIPE_X0(bundle));
+        if (ret) {
+            goto err;
+        }
+        dc->tmp_regcur = dc->tmp_regs + 1;
+        ret = translate_x1_bundle(dc, TILEGX_PIPE_X1(bundle));
+        if (ret) {
+            goto err;
+        }
+    }
+
+    for (i = 0; i < TILEGX_BUNDLE_INSNS; i++) {
+        if (dc->tmp_regs[i].idx == TILEGX_R_NOREG) {
+            continue;
+        }
+        tcg_gen_mov_i64(cpu_regs[dc->tmp_regs[i].idx], dc->tmp_regs[i].val);
+        tcg_temp_free_i64(dc->tmp_regs[i].val);
+    }
+
+    if (dc->jmp.cond != TCG_COND_NEVER) {
+        if (dc->jmp.cond == TCG_COND_ALWAYS) {
+            tcg_gen_mov_i64(cpu_pc, dc->jmp.dest);
+        } else {
+            tmp = tcg_const_i64(dc->pc + TILEGX_BUNDLE_SIZE);
+            tcg_gen_movcond_i64(dc->jmp.cond, cpu_pc,
+                                dc->jmp.val1, dc->jmp.val2,
+                                dc->jmp.dest, tmp);
+            tcg_temp_free_i64(dc->jmp.val1);
+            tcg_temp_free_i64(dc->jmp.val2);
+            tcg_temp_free_i64(tmp);
+        }
+        tcg_temp_free_i64(dc->jmp.dest);
+        tcg_gen_exit_tb(0);
+    }
+
+err:
+    return ret;
+}
+
 static inline void gen_intermediate_code_internal(TilegxCPU *cpu,
                                                   TranslationBlock *tb,
                                                   bool search_pc)
 {
-    /*
-     * FIXME: after load elf64 tilegx binary successfully, it will quit, at
-     * present, and will implement the related features next.
-     */
-    fprintf(stderr, "\nLoad elf64 tilegx successfully\n");
-    fprintf(stderr, "reach code start position: [" TARGET_FMT_lx "] %s\n\n",
+    int ret = 0;
+    struct DisasContext ctx;
+    struct DisasContext *dc = &ctx;
+
+    CPUTLState *env = &cpu->env;
+    uint64_t pc_start = tb->pc;
+    uint64_t next_page_start = (pc_start & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE;
+    int num_insns = 0;
+    int max_insns = tb->cflags & CF_COUNT_MASK;
+
+    /* FIXME: do not consider about search_pc firstly. */
+    qemu_log("TB block start position: [" TARGET_FMT_lx "] %s\n\n",
             tb->pc, lookup_symbol(tb->pc));
-    exit(0);
+
+    dc->pc = pc_start;
+    dc->cpustate_changed = 0;
+    dc->jmp.cond = TCG_COND_NEVER;
+    TCGV_UNUSED_I64(dc->jmp.dest);
+    TCGV_UNUSED_I64(dc->jmp.val1);
+    TCGV_UNUSED_I64(dc->jmp.val2);
+
+    if (!max_insns) {
+        max_insns = CF_COUNT_MASK;
+    }
+
+    gen_tb_start(tb);
+    do {
+        ret = translate_one_bundle(dc, cpu_ldq_data(env, dc->pc));
+        if (ret) {
+            goto err;
+        }
+        num_insns++;
+        dc->pc += TILEGX_BUNDLE_SIZE;
+    } while (tcg_op_buf_count() <= OPC_MAX_SIZE - TILEGX_BUNDLE_OPCS
+             && num_insns <= max_insns - TILEGX_BUNDLE_INSNS
+             && dc->pc <= next_page_start - TILEGX_BUNDLE_SIZE
+             && dc->jmp.cond == TCG_COND_NEVER
+             && !dc->cpustate_changed);
+    gen_tb_end(tb, num_insns);
+
+    tb->size = dc->pc - pc_start;
+    tb->icount = num_insns;
+
+    return;
+err:
+    exit(-1);
 }
 
 void gen_intermediate_code(CPUTLState *env, struct TranslationBlock *tb)