diff mbox

[08/10] target-avr: adding instruction translation

Message ID 1464898022-97990-8-git-send-email-rolnik@amazon.com
State New
Headers show

Commit Message

Michael Rolnik June 2, 2016, 8:07 p.m. UTC
Signed-off-by: Michael Rolnik <mrolnik@gmail.com>
---
 target-avr/translate-inst.c | 2443 +++++++++++++++++++++++++++++++++++++++++++
 target-avr/translate.h      |  123 +++
 2 files changed, 2566 insertions(+)
 create mode 100644 target-avr/translate-inst.c
 create mode 100644 target-avr/translate.h

Comments

Richard Henderson June 5, 2016, 3:27 a.m. UTC | #1
On 06/02/2016 01:07 PM, Michael Rolnik wrote:
> Signed-off-by: Michael Rolnik <mrolnik@gmail.com>
> ---
>  target-avr/translate-inst.c | 2443 +++++++++++++++++++++++++++++++++++++++++++

Is there a reason this code isn't going into translate.c?
You wouldn't need the declarations in translate-inst.h or translate.h.

> +/*
> +    NOTE:   all registers are assumed to hold 8 bit values.
> +            so all operations done on registers should preseve this property
> +*/
> +
> +/*
> +    NOTE:   the flags C,H,V,N,V have either 0 or 1 values
> +    NOTE:   the flag Z has inverse logic, when the value of Zf is 0 the flag is assumed to be set, non zero - not set
> +*/

This documentation belongs with the declarations in cpu.h.

> +void    gen_add_CHf(TCGv R, TCGv Rd, TCGv Rr);
> +void    gen_add_Vf(TCGv R, TCGv Rd, TCGv Rr);
> +void    gen_sub_CHf(TCGv R, TCGv Rd, TCGv Rr);
> +void    gen_sub_Vf(TCGv R, TCGv Rd, TCGv Rr);
> +void    gen_ZNSf(TCGv R);
> +void    gen_push_ret(CPUAVRState *env, int    ret);
> +void    gen_pop_ret(CPUAVRState *env, TCGv    ret);
> +void    gen_jmp_ez(void);
> +void    gen_jmp_z(void);
> +
> +void    gen_set_addr(TCGv addr, TCGv H, TCGv M, TCGv l); /*  H:M:L   = addr  */
> +void    gen_set_xaddr(TCGv addr);
> +void    gen_set_yaddr(TCGv addr);
> +void    gen_set_zaddr(TCGv addr);
> +
> +TCGv    gen_get_addr(TCGv H, TCGv M, TCGv L);            /*  addr = H:M:L    */
> +TCGv    gen_get_xaddr(void);
> +TCGv    gen_get_yaddr(void);
> +TCGv    gen_get_zaddr(void);
> +int     sex(int Imm, unsigned bits);

Order these functions properly and you don't need forward declarations.

> +
> +void    gen_add_CHf(TCGv R, TCGv Rd, TCGv Rr)
> +{
> +    TCGv    t1      = tcg_temp_new_i32();
> +    TCGv    t2      = tcg_temp_new_i32();
> +    TCGv    t3      = tcg_temp_new_i32();
> +
> +    tcg_gen_and_tl(t1, Rd, Rr);    /*  t1  = Rd & Rr  */
> +    tcg_gen_not_tl(t2, R);             /*  t2  = Rd & ~R  */
> +    tcg_gen_and_tl(t2, Rd, t2);

tcg_gen_andc_tl.

> +    tcg_gen_not_tl(t3, R);             /*  t3  = Rr  *~R  */
> +    tcg_gen_and_tl(t3, Rr, t3);

Likewise.

> +    tcg_gen_or_tl(t1, t1, t2);    /*  t1  = t1 | t2 | t3  */
> +    tcg_gen_or_tl(t1, t1, t3);

While this is exactly the formula in the manual, it's also equal to

     ((Rd ^ Rr) ^ R) & 16

where we examine the difference between the non-carry addition (Rd ^ Rr) and 
the carry addition (R) to find the carry out of bit 3.  This reduces the 
operation count to 2 from 5.

> +    tcg_gen_shri_tl(cpu_Cf, t1, 7);     /*  Cf  = t1(7)  */

Of course, Cf is also bit 8 of R, at least before you masked that off.

> +    tcg_gen_shri_tl(cpu_Hf, t1, 3);     /*  Hf  = t1(3)  */
> +    tcg_gen_andi_tl(cpu_Hf, cpu_Hf, 1);

Consider leaving Hf in bit 3 (or 4, as above) instead of shifting and masking 
to bit 0.  This makes it (slightly) more complicated to read the full SREG 
value, but it saves two instructions per arithmetic instruction.  This will add 
up quickly.

One can make the same argument for most of the other flags.

Compare how this is handled in target-arm for cpu_[NZCV]F.

> +void    gen_add_Vf(TCGv    R, TCGv    Rd, TCGv    Rr)
> +{
> +    TCGv    t1      = tcg_temp_new_i32();
> +    TCGv    t2      = tcg_temp_new_i32();
> +
> +    tcg_gen_not_tl(t1, Rd);            /*  t1  = ~Rd & ~Rr & R  */
> +    tcg_gen_not_tl(t2, Rr);
> +    tcg_gen_and_tl(t1, t1, t2);
> +    tcg_gen_and_tl(t1, t1, R);
> +
> +    tcg_gen_not_tl(t2, R);             /*  t2  = Rd & Rr & ~R  */
> +    tcg_gen_and_tl(t2, t2, Rd);
> +    tcg_gen_and_tl(t2, t2, Rr);
> +
> +    tcg_gen_or_tl(t1, t1, t2);    /*  t1  = Rd & Rr & ~R | ~Rd & ~Rr & R  */

While this is indeed the formula in the manual, a better expression is

   (R ^ Rd) & ~(Rd ^ Rr)

which is 3 operations instead of 5.  As alluded above, it might be best to keep 
Vf in bit 7 instead of bit 0.

Also note that if you use bit 4 to compute Hf, as above, then one can share a 
sub-expression with the computation of Vf.

> +void    gen_sub_CHf(TCGv    R, TCGv    Rd, TCGv    Rr)
> +{
> +    TCGv    t1      = tcg_temp_new_i32();
> +    TCGv    t2      = tcg_temp_new_i32();
> +    TCGv    t3      = tcg_temp_new_i32();
> +
> +    /*  Cf & Hf  */
> +    tcg_gen_not_tl(t1, Rd);            /*  t1  = ~Rd  */
> +    tcg_gen_and_tl(t2, t1, Rr);    /*  t2  = ~Rd & Rr  */
> +    tcg_gen_or_tl(t3, t1, Rr);    /*  t3  = (~Rd | Rr) & R  */
> +    tcg_gen_and_tl(t3, t3, R);
> +    tcg_gen_or_tl(t2, t2, t3);    /*  t2  = ~Rd & Rr | ~Rd & R | R & Rr  */
> +    tcg_gen_shri_tl(cpu_Cf, t2, 7);     /*  Cf  = t2(7)  */
> +    tcg_gen_shri_tl(cpu_Hf, t2, 3);     /*  Hf  = t2(3)  */
> +    tcg_gen_andi_tl(cpu_Hf, cpu_Hf, 1);

Note that carry and borrow are related, and thus this is *also* computable via 
((Rd ^ Rr) ^ R) on bit 4.

> +void    gen_sub_Vf(TCGv    R, TCGv    Rd, TCGv    Rr)
> +{
> +    TCGv    t1      = tcg_temp_new_i32();
> +    TCGv    t2      = tcg_temp_new_i32();
> +
> +    /*  Vf  */
> +    tcg_gen_and_tl(t1, Rr, R);     /*  t1  = Rd & ~Rr & ~R  */
> +    tcg_gen_not_tl(t1, t1);
> +    tcg_gen_and_tl(t1, t1, Rd);
> +    tcg_gen_not_tl(t2, Rd);            /*  t2  = ~Rd & Rr & R  */
> +    tcg_gen_and_tl(t2, t2, Rr);
> +    tcg_gen_and_tl(t2, t2, R);
> +    tcg_gen_or_tl(t1, t1, t2);    /*  t1  = Rd & ~Rr & ~R | ~Rd & Rr & R  */
> +    tcg_gen_shri_tl(cpu_Vf, t1, 7);     /*  Vf  = t1(7)  */

This is equal to

   (R ^ Rd) & (Rd ^ Rr)

> +void    gen_ZNSf(TCGv    R)
> +{
> +    tcg_gen_mov_tl(cpu_Zf, R);             /*  Zf  = R  */
> +    tcg_gen_shri_tl(cpu_Nf, R, 7);     /*  Nf  = R(7)  */
> +    tcg_gen_and_tl(cpu_Sf, cpu_Nf, cpu_Vf);/*  Sf  = Nf & Vf  */

This should be xor.

> +void    gen_push_ret(CPUAVRState *env, int    ret)
> +{
> +    tcg_gen_qemu_st8(tcg_const_local_i32((ret & 0x0000ff)), cpu_sp, DATA_INDEX);

You don't need a local constant, and you should be freeing these 3 temps.

I'll also note that the piece-wise store is big-endian, so you could perform 
this in 1 store for 2_BYTE and 2 stores for 3_BYTE.

> +void    gen_jmp_ez()
> +{
> +    tcg_gen_mov_tl(cpu_pc, cpu_eind);
> +    tcg_gen_shli_tl(cpu_pc, cpu_pc, 8);
> +    tcg_gen_or_tl(cpu_pc, cpu_pc, cpu_r[31]);
> +    tcg_gen_shli_tl(cpu_pc, cpu_pc, 8);
> +    tcg_gen_or_tl(cpu_pc, cpu_pc, cpu_r[30]);
> +    tcg_gen_andi_tl(cpu_pc, cpu_pc, 0xffffff);
> +    tcg_gen_exit_tb(0);
> +}
> +
> +void    gen_jmp_z()
> +{
> +    tcg_gen_mov_tl(cpu_pc, cpu_r[31]);
> +    tcg_gen_shli_tl(cpu_pc, cpu_pc, 8);
> +    tcg_gen_or_tl(cpu_pc, cpu_pc, cpu_r[30]);
> +    tcg_gen_andi_tl(cpu_pc, cpu_pc, 0xffff);

Note this is

     tcg_gen_deposit_tl(cpu_pc, cpu_r[30], cpu_r[31], 8, 8);

Also consider storing EIND (and perhaps already shifted so that you can just OR 
it in.

> +void    gen_set_addr(TCGv addr, TCGv H, TCGv M, TCGv L)
> +{
> +    tcg_gen_andi_tl(L, addr, 0xff);
> +
> +    tcg_gen_shri_tl(addr, addr, 8);
> +    tcg_gen_andi_tl(M, addr, 0xff);
> +
> +    tcg_gen_shri_tl(addr, addr, 8);
> +    tcg_gen_andi_tl(H, addr, 0xff);
> +}
> +
> +void    gen_set_xaddr(TCGv addr)
> +{
> +    gen_set_addr(addr, cpu_rampX, cpu_r[27], cpu_r[26]);
> +}
> +
> +void    gen_set_yaddr(TCGv addr)
> +{
> +    gen_set_addr(addr, cpu_rampY, cpu_r[29], cpu_r[28]);
> +}
> +
> +void    gen_set_zaddr(TCGv addr)
> +{
> +    gen_set_addr(addr, cpu_rampZ, cpu_r[31], cpu_r[30]);
> +}
> +
> +TCGv    gen_get_addr(TCGv H, TCGv M, TCGv L)
> +{
> +    TCGv    addr= tcg_temp_new_i32();
> +
> +    tcg_gen_mov_tl(addr, H);                     /*  addr    = H:M:L  */
> +    tcg_gen_shli_tl(addr, addr, 8);
> +    tcg_gen_or_tl(addr, addr, M);
> +    tcg_gen_shli_tl(addr, addr, 8);
> +    tcg_gen_or_tl(addr, addr, L);
> +
> +    return  addr;
> +}
> +
> +TCGv    gen_get_xaddr()
> +{
> +    return  gen_get_addr(cpu_rampX, cpu_r[27], cpu_r[26]);
> +}
> +
> +TCGv    gen_get_yaddr()
> +{
> +    return  gen_get_addr(cpu_rampY, cpu_r[29], cpu_r[28]);
> +}
> +
> +TCGv    gen_get_zaddr()
> +{
> +    return  gen_get_addr(cpu_rampZ, cpu_r[31], cpu_r[30]);
> +}

Wow.  Um... Surely it would be better to store X and Y internally as whole 
24-bit quantities, and Z as a 16-bit quantity (to be extended with rampz, 
rampd, or eind as needed).

This would allow normal loads, stores, and autoincrement to operate 
efficiently.  When you see byte accesses to R[26-31], you extract/deposit the 
bytes as required.  I assume that happens (significantly) less often than 
operating on X/Y/Z as a unit...

One might make the same argument for R[24-25] as it pertains to adiw et al.

> +int     sex(int Imm, unsigned bits)
> +{
> +    Imm <<= 32 - bits;
> +    Imm >>= 32 - bits;
> +
> +    return  Imm;
> +}

This is sextract32(imm, 0, bits).

> +int    avr_translate_ADIW(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
> +{
> +    if (avr_feature(env, AVR_FEATURE_ADIW_SBIW) == false) {
> +        gen_helper_unsupported(cpu_env);
> +
> +        return BS_EXCP;
> +    }
> +
> +    TCGv    RdL = cpu_r[24 + 2 * ADIW_Rd(opcode)];
> +    TCGv    RdH = cpu_r[25 + 2 * ADIW_Rd(opcode)];
> +    int     Imm = (ADIW_hImm(opcode) << 4) | (ADIW_lImm(opcode));
> +    TCGv    R   = tcg_temp_new_i32();
> +    TCGv    Rd  = tcg_temp_new_i32();
> +    TCGv    t0  = tcg_temp_new_i32();
> +
> +    /*  op  */
> +    tcg_gen_shli_tl(Rd, RdH, 8);     /*  R = ((H << 8) | L) + Imm  */
> +    tcg_gen_or_tl(Rd, RdL, Rd);
> +    tcg_gen_addi_tl(R, Rd, Imm);
> +    tcg_gen_andi_tl(R, R, 0xffff);/*  make it 16 bits  */
> +
> +    /*  Cf  */
> +    tcg_gen_not_tl(t0, R);             /*  t0  = Rd & ~R  */
> +    tcg_gen_and_tl(t0, Rd, t0);
> +    tcg_gen_shri_tl(cpu_Cf, t0, 15);    /*  Cf  = t0(15)  */

Or simply bit 16.

> +    /*  Sf  */
> +    tcg_gen_and_tl(cpu_Sf, cpu_Nf, cpu_Vf);/*  Sf  = Nf & Vf  */

xor.

> +int    avr_translate_ASR(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
> +{
> +    TCGv    Rd  = cpu_r[ASR_Rd(opcode)];
> +    TCGv    t1  = tcg_temp_new_i32();
> +    TCGv    t2  = tcg_temp_new_i32();
> +
> +    /*  op  */
> +    tcg_gen_andi_tl(t1, Rd, 0x80);      /*  t1  = (Rd & 0x80) | (Rd >> 1)  */
> +    tcg_gen_shri_tl(t2, Rd, 1);
> +    tcg_gen_or_tl(t1, t1, t2);
> +
> +    /*  Cf  */
> +    tcg_gen_andi_tl(cpu_Cf, Rd, 1);         /*  Cf  = Rd(0)  */
> +
> +    /*  Vf  */
> +    tcg_gen_and_tl(cpu_Vf, cpu_Nf, cpu_Cf);/*  Vf  = Nf & Cf  */

xor.

> +/*
> +    Copies the T Flag in the SREG (Status Register) to bit b in register Rd.
> +*/
> +int    avr_translate_BLD(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
> +{
> +    TCGv    Rd  = cpu_r[BLD_Rd(opcode)];
> +    TCGv    t1  = tcg_temp_new_i32();
> +
> +    tcg_gen_shli_tl(t1, cpu_Tf, BLD_Bit(opcode));
> +    tcg_gen_or_tl(Rd, Rd, t1);
> +    tcg_gen_and_tl(Rd, Rd, t1);

Err... extra and?  I'm pretty sure this insn simply sets bit B, and doesn't 
clear all of the others.

> +/*
> +    Clears the specified bits in register Rd. Performs the logical AND between the contents of register Rd and the
> +    complement of the constant mask K. The result will be placed in register Rd.
> +*/
> +int    avr_translate_COM(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
> +{
> +    TCGv    Rd  = cpu_r[COM_Rd(opcode)];
> +    TCGv    R   = tcg_temp_new_i32();
> +
> +    tcg_gen_movi_tl(R, 0xff);
> +    tcg_gen_sub_tl(Rd, R, Rd);

Better as xor with 0xff.

> +/*
> +    This instruction performs a compare between two registers Rd and Rr, and skips the next instruction if Rd = Rr.
> +*/
> +int    avr_translate_CPSE(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
> +{
> +    TCGv        Rd  = cpu_r[CPSE_Rd(opcode)];
> +    TCGv        Rr  = cpu_r[(CPSE_hRr(opcode) << 4) | (CPSE_lRr(opcode))];
> +    TCGLabel   *skip= gen_new_label();
> +
> +    tcg_gen_movi_tl(cpu_pc, ctx->inst[1].npc);      /*  PC if next inst is skipped  */
> +    tcg_gen_brcond_i32(TCG_COND_EQ, Rd, Rr, skip);
> +    tcg_gen_movi_tl(cpu_pc, ctx->inst[0].npc);      /*  PC if next inst is not skipped  */
> +    gen_set_label(skip);

Any reason you're not using goto_tb?

> +/*
> +    Subtracts one -1- from the contents of register Rd and places the result in the destination register Rd.
> +    The C Flag in SREG is not affected by the operation, thus allowing the DEC instruction to be used on a loop
> +    counter in multiple-precision computations.
> +    When operating on unsigned values, only BREQ and BRNE branches can be expected to perform consistently.
> +    When operating on two’s complement values, all signed branches are available.
> +*/
> +int    avr_translate_DEC(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
> +{
> +    TCGv        Rd  = cpu_r[DEC_Rd(opcode)];
> +
> +    tcg_gen_subi_tl(Rd, Rd, 1);     /*  Rd  = Rd - 1  */
> +    tcg_gen_andi_tl(Rd, Rd, 0xff);  /*  make it 8 bits  */
> +
> +    tcg_gen_setcondi_tl(TCG_COND_EQ, cpu_Vf, Rd, 0x7f);  /* cpu_Vf   = Rd == 0x7f  */

This is INC overflow.

> +int    avr_translate_INC(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
> +{
> +    TCGv    Rd  = cpu_r[INC_Rd(opcode)];
> +
> +    tcg_gen_addi_tl(Rd, Rd, 1);
> +    tcg_gen_andi_tl(Rd, Rd, 0xff);
> +
> +    tcg_gen_setcondi_tl(TCG_COND_EQ, cpu_Vf, Rd, 0x80);  /* cpu_Vf   = Rd == 0x80  */

This is DEC overflow.

> +/*
> +    Load one byte indirect from data space to register and stores an clear the bits in data space specified by the
> +    register. The instruction can > +/*
only be used towards internal SRAM.
> +    The data location is pointed to by the Z (16 bits) Pointer Register in the Register File. Memory access is limited
> +    to the current data segment of 64KB. To access another data segment in devices with more than 64KB data
> +    space, the RAMPZ in register in the I/O area has to be changed.
> +    The Z-pointer Register is left unchanged by the operation. This instruction is especially suited for clearing status
> +    bits stored in SRAM.
> +*/
> +int    avr_translate_LAC(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
> +{
> +    if (avr_feature(env, AVR_FEATURE_RMW) == false) {
> +        gen_helper_unsupported(cpu_env);
> +
> +        return BS_EXCP;
> +    }
> +
> +    TCGv    Rr  = cpu_r[LAC_Rr(opcode)];
> +    TCGv    addr= gen_get_zaddr();
> +    TCGv    t0  = tcg_temp_new_i32();
> +    TCGv    t1  = tcg_temp_new_i32();
> +
> +    tcg_gen_qemu_ld8u(
> +                    t0, addr, DATA_INDEX);    /*  t0      = mem[addr]  */
> +    tcg_gen_movi_tl(t1, 0xff);                  /*  t1      = t0 & (0xff - Rr)  */
> +    tcg_gen_sub_tl(t1, t1, Rr);
> +    tcg_gen_and_tl(t1, t0, t1);

These last 3 are

   tcg_gen_andc_tl(t1, t0, Rr);

> +/*
> +    This instruction performs 8-bit x 8-bit -> 16-bit signed multiplication.
> +*/
> +int    avr_translate_MULS(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
> +{
> +    if (avr_feature(env, AVR_FEATURE_MUL) == false) {
> +        gen_helper_unsupported(cpu_env);
> +
> +        return BS_EXCP;
> +    }
> +
> +    TCGv    R0  = cpu_r[0];
> +    TCGv    R1  = cpu_r[1];
> +    TCGv    Rd  = cpu_r[16 + MULS_Rd(opcode)];
> +    TCGv    Rr  = cpu_r[16 + MULS_Rr(opcode)];
> +    TCGv    R   = tcg_temp_new_i32();
> +
> +    tcg_gen_ext8s_tl(Rd, Rd);            /*  make Rd full 32 bit signed  */
> +    tcg_gen_ext8s_tl(Rr, Rr);            /*  make Rr full 32 bit signed  */

You need to either zero-extend these again afterward, or you need to perform 
the extensions into a temporary.

> +/*
> +    Replaces the contents of register Rd with its two’s complement; the value $80 is left unchanged.
> +*/
> +int    avr_translate_NEG(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
> +{
> +    TCGv    Rd  = cpu_r[NEG_Rd(opcode)];
> +    TCGv    R   = tcg_temp_new_i32();
> +
> +    tcg_gen_neg_tl(R, Rd);
> +
> +    tcg_gen_setcondi_tl(TCG_COND_NE, cpu_Cf, R, 0x00);  /*  Cf  = R != 0x00  */
> +    tcg_gen_setcondi_tl(TCG_COND_EQ, cpu_Vf, R, 0x80);  /*  Vf  = R == 0x80  */

Correct, but surely it's better to re-use the normal subtraction 
infrastructure.  In particular, setcond is at minimum two operations.  The tcg 
optimizer should do approximately as well folding away the zero from the subtract.

> +    tcg_gen_not_tl(cpu_Hf, Rd);                                /*  Hf  = (~Rd | R)(3)  */
> +    tcg_gen_or_tl(cpu_Hf, cpu_Hf, R);
> +    tcg_gen_shri_tl(cpu_Hf, cpu_Hf, 3);
> +    tcg_gen_andi_tl(cpu_Hf, cpu_Hf, 1);
> +    gen_ZNSf(R);
> +
> +    tcg_temp_free_i32(R);

You failed to write back the result.

> +int    avr_translate_XCH(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
> +{
> +    if (avr_feature(env, AVR_FEATURE_RMW) == false) {
> +        gen_helper_unsupported(cpu_env);
> +
> +        return BS_EXCP;
> +    }
> +
> +    TCGv    Rd      = cpu_r[XCH_Rd(opcode)];
> +    TCGv    t0      = tcg_temp_new_i32();
> +    TCGv    addr    = gen_get_zaddr();
> +
> +    tcg_gen_qemu_ld8u(
> +                    addr, t0, DATA_INDEX);
> +    tcg_gen_qemu_st8(
> +                    addr, Rd, DATA_INDEX);

The address and data temporaries are swapped.


r~
Michael Rolnik June 5, 2016, 9:47 p.m. UTC | #2
please see my answer inside.

On Sun, Jun 5, 2016 at 6:27 AM, Richard Henderson <rth@twiddle.net> wrote:

> On 06/02/2016 01:07 PM, Michael Rolnik wrote:
>
>> Signed-off-by: Michael Rolnik <mrolnik@gmail.com>
>> ---
>>  target-avr/translate-inst.c | 2443
>> +++++++++++++++++++++++++++++++++++++++++++
>>
>
> Is there a reason this code isn't going into translate.c?
> You wouldn't need the declarations in translate-inst.h or translate.h.

I see here two levels of logic
a. instruction translation
b. general flow of program translation.


>
>
> +/*
>> +    NOTE:   all registers are assumed to hold 8 bit values.
>> +            so all operations done on registers should preseve this
>> property
>> +*/
>> +
>> +/*
>> +    NOTE:   the flags C,H,V,N,V have either 0 or 1 values
>> +    NOTE:   the flag Z has inverse logic, when the value of Zf is 0 the
>> flag is assumed to be set, non zero - not set
>> +*/
>>
>
> This documentation belongs with the declarations in cpu.h.

moved.

>
>
> +void    gen_add_CHf(TCGv R, TCGv Rd, TCGv Rr);
>> +void    gen_add_Vf(TCGv R, TCGv Rd, TCGv Rr);
>> +void    gen_sub_CHf(TCGv R, TCGv Rd, TCGv Rr);
>> +void    gen_sub_Vf(TCGv R, TCGv Rd, TCGv Rr);
>> +void    gen_ZNSf(TCGv R);
>> +void    gen_push_ret(CPUAVRState *env, int    ret);
>> +void    gen_pop_ret(CPUAVRState *env, TCGv    ret);
>> +void    gen_jmp_ez(void);
>> +void    gen_jmp_z(void);
>> +
>> +void    gen_set_addr(TCGv addr, TCGv H, TCGv M, TCGv l); /*  H:M:L   =
>> addr  */
>> +void    gen_set_xaddr(TCGv addr);
>> +void    gen_set_yaddr(TCGv addr);
>> +void    gen_set_zaddr(TCGv addr);
>> +
>> +TCGv    gen_get_addr(TCGv H, TCGv M, TCGv L);            /*  addr =
>> H:M:L    */
>> +TCGv    gen_get_xaddr(void);
>> +TCGv    gen_get_yaddr(void);
>> +TCGv    gen_get_zaddr(void);
>> +int     sex(int Imm, unsigned bits);
>>
>
> Order these functions properly and you don't need forward declarations.

is it a requirements? this way it look cleaner.

>
>
> +
>> +void    gen_add_CHf(TCGv R, TCGv Rd, TCGv Rr)
>> +{
>> +    TCGv    t1      = tcg_temp_new_i32();
>> +    TCGv    t2      = tcg_temp_new_i32();
>> +    TCGv    t3      = tcg_temp_new_i32();
>> +
>> +    tcg_gen_and_tl(t1, Rd, Rr);    /*  t1  = Rd & Rr  */
>> +    tcg_gen_not_tl(t2, R);             /*  t2  = Rd & ~R  */
>> +    tcg_gen_and_tl(t2, Rd, t2);
>>
>
> tcg_gen_andc_tl.

thanks

>
>
> +    tcg_gen_not_tl(t3, R);             /*  t3  = Rr  *~R  */
>> +    tcg_gen_and_tl(t3, Rr, t3);
>>
>
> Likewise.

thanks

>
>
> +    tcg_gen_or_tl(t1, t1, t2);    /*  t1  = t1 | t2 | t3  */
>> +    tcg_gen_or_tl(t1, t1, t3);
>>
>
> While this is exactly the formula in the manual, it's also equal to
>
>     ((Rd ^ Rr) ^ R) & 16
>
Please explain. I don't see it.

http://www.wolframalpha.com/input/?i=A+and+B+or+A+and+not+C+or+B+and+not+C,+A+xor+B+xor+C


>
>
> where we examine the difference between the non-carry addition (Rd ^ Rr)
> and the carry addition (R) to find the carry out of bit 3.  This reduces
> the operation count to 2 from 5.
>
> +    tcg_gen_shri_tl(cpu_Cf, t1, 7);     /*  Cf  = t1(7)  */
>>
>
> Of course, Cf is also bit 8 of R, at least before you masked that off.
>
> +    tcg_gen_shri_tl(cpu_Hf, t1, 3);     /*  Hf  = t1(3)  */
>> +    tcg_gen_andi_tl(cpu_Hf, cpu_Hf, 1);
>>
>
> Consider leaving Hf in bit 3 (or 4, as above) instead of shifting and
> masking to bit 0.  This makes it (slightly) more complicated to read the
> full SREG value, but it saves two instructions per arithmetic instruction.
> This will add up quickly.
>
yes you are right. next version.

>
> One can make the same argument for most of the other flags.
>
> Compare how this is handled in target-arm for cpu_[NZCV]F.
>
> +void    gen_add_Vf(TCGv    R, TCGv    Rd, TCGv    Rr)
>> +{
>> +    TCGv    t1      = tcg_temp_new_i32();
>> +    TCGv    t2      = tcg_temp_new_i32();
>> +
>> +    tcg_gen_not_tl(t1, Rd);            /*  t1  = ~Rd & ~Rr & R  */
>> +    tcg_gen_not_tl(t2, Rr);
>> +    tcg_gen_and_tl(t1, t1, t2);
>> +    tcg_gen_and_tl(t1, t1, R);
>> +
>> +    tcg_gen_not_tl(t2, R);             /*  t2  = Rd & Rr & ~R  */
>> +    tcg_gen_and_tl(t2, t2, Rd);
>> +    tcg_gen_and_tl(t2, t2, Rr);
>> +
>> +    tcg_gen_or_tl(t1, t1, t2);    /*  t1  = Rd & Rr & ~R | ~Rd & ~Rr &
>> R  */
>>
>
> While this is indeed the formula in the manual, a better expression is
>
>   (R ^ Rd) & ~(Rd ^ Rr)
>
right, thanks.

>
> which is 3 operations instead of 5.  As alluded above, it might be best to
> keep Vf in bit 7 instead of bit 0.
>
> Also note that if you use bit 4 to compute Hf, as above, then one can
> share a sub-expression with the computation of Vf.
>
> +void    gen_sub_CHf(TCGv    R, TCGv    Rd, TCGv    Rr)
>> +{
>> +    TCGv    t1      = tcg_temp_new_i32();
>> +    TCGv    t2      = tcg_temp_new_i32();
>> +    TCGv    t3      = tcg_temp_new_i32();
>> +
>> +    /*  Cf & Hf  */
>> +    tcg_gen_not_tl(t1, Rd);            /*  t1  = ~Rd  */
>> +    tcg_gen_and_tl(t2, t1, Rr);    /*  t2  = ~Rd & Rr  */
>> +    tcg_gen_or_tl(t3, t1, Rr);    /*  t3  = (~Rd | Rr) & R  */
>> +    tcg_gen_and_tl(t3, t3, R);
>> +    tcg_gen_or_tl(t2, t2, t3);    /*  t2  = ~Rd & Rr | ~Rd & R | R & Rr
>> */
>> +    tcg_gen_shri_tl(cpu_Cf, t2, 7);     /*  Cf  = t2(7)  */
>> +    tcg_gen_shri_tl(cpu_Hf, t2, 3);     /*  Hf  = t2(3)  */
>> +    tcg_gen_andi_tl(cpu_Hf, cpu_Hf, 1);
>>
>
> Note that carry and borrow are related, and thus this is *also* computable
> via ((Rd ^ Rr) ^ R) on bit 4.

please explain, I don't see it
http://www.wolframalpha.com/input/?i=not+A+and+B+or+not+A+and+C+or++C+and+B,+A+xor+B+xor+C

>
>
> +void    gen_sub_Vf(TCGv    R, TCGv    Rd, TCGv    Rr)
>> +{
>> +    TCGv    t1      = tcg_temp_new_i32();
>> +    TCGv    t2      = tcg_temp_new_i32();
>> +
>> +    /*  Vf  */
>> +    tcg_gen_and_tl(t1, Rr, R);     /*  t1  = Rd & ~Rr & ~R  */
>> +    tcg_gen_not_tl(t1, t1);
>> +    tcg_gen_and_tl(t1, t1, Rd);
>> +    tcg_gen_not_tl(t2, Rd);            /*  t2  = ~Rd & Rr & R  */
>> +    tcg_gen_and_tl(t2, t2, Rr);
>> +    tcg_gen_and_tl(t2, t2, R);
>> +    tcg_gen_or_tl(t1, t1, t2);    /*  t1  = Rd & ~Rr & ~R | ~Rd & Rr &
>> R  */
>> +    tcg_gen_shri_tl(cpu_Vf, t1, 7);     /*  Vf  = t1(7)  */
>>
>
> This is equal to
>
>   (R ^ Rd) & (Rd ^ Rr)
>
> +void    gen_ZNSf(TCGv    R)
>> +{
>> +    tcg_gen_mov_tl(cpu_Zf, R);             /*  Zf  = R  */
>> +    tcg_gen_shri_tl(cpu_Nf, R, 7);     /*  Nf  = R(7)  */
>> +    tcg_gen_and_tl(cpu_Sf, cpu_Nf, cpu_Vf);/*  Sf  = Nf & Vf  */
>>
>
> This should be xor.

thanks. fixed

>
>
> +void    gen_push_ret(CPUAVRState *env, int    ret)
>> +{
>> +    tcg_gen_qemu_st8(tcg_const_local_i32((ret & 0x0000ff)), cpu_sp,
>> DATA_INDEX);
>>
>
> You don't need a local constant, and you should be freeing these 3 temps.

done.


>
>
I'll also note that the piece-wise store is big-endian, so you could
> perform this in 1 store for 2_BYTE and 2 stores for 3_BYTE.

I got an expression that the platform is little endian.

>
>
> +void    gen_jmp_ez()
>> +{
>> +    tcg_gen_mov_tl(cpu_pc, cpu_eind);
>> +    tcg_gen_shli_tl(cpu_pc, cpu_pc, 8);
>> +    tcg_gen_or_tl(cpu_pc, cpu_pc, cpu_r[31]);
>> +    tcg_gen_shli_tl(cpu_pc, cpu_pc, 8);
>> +    tcg_gen_or_tl(cpu_pc, cpu_pc, cpu_r[30]);
>> +    tcg_gen_andi_tl(cpu_pc, cpu_pc, 0xffffff);
>> +    tcg_gen_exit_tb(0);
>> +}
>> +
>> +void    gen_jmp_z()
>> +{
>> +    tcg_gen_mov_tl(cpu_pc, cpu_r[31]);
>> +    tcg_gen_shli_tl(cpu_pc, cpu_pc, 8);
>> +    tcg_gen_or_tl(cpu_pc, cpu_pc, cpu_r[30]);
>> +    tcg_gen_andi_tl(cpu_pc, cpu_pc, 0xffff);
>>
>
> Note this is
>
>     tcg_gen_deposit_tl(cpu_pc, cpu_r[30], cpu_r[31], 8, 8);
>
> Also consider storing EIND (and perhaps already shifted so that you can
> just OR it in.
>
> +void    gen_set_addr(TCGv addr, TCGv H, TCGv M, TCGv L)
>> +{
>> +    tcg_gen_andi_tl(L, addr, 0xff);
>> +
>> +    tcg_gen_shri_tl(addr, addr, 8);
>> +    tcg_gen_andi_tl(M, addr, 0xff);
>> +
>> +    tcg_gen_shri_tl(addr, addr, 8);
>> +    tcg_gen_andi_tl(H, addr, 0xff);
>> +}
>> +
>> +void    gen_set_xaddr(TCGv addr)
>> +{
>> +    gen_set_addr(addr, cpu_rampX, cpu_r[27], cpu_r[26]);
>> +}
>> +
>> +void    gen_set_yaddr(TCGv addr)
>> +{
>> +    gen_set_addr(addr, cpu_rampY, cpu_r[29], cpu_r[28]);
>> +}
>> +
>> +void    gen_set_zaddr(TCGv addr)
>> +{
>> +    gen_set_addr(addr, cpu_rampZ, cpu_r[31], cpu_r[30]);
>> +}
>> +
>> +TCGv    gen_get_addr(TCGv H, TCGv M, TCGv L)
>> +{
>> +    TCGv    addr= tcg_temp_new_i32();
>> +
>> +    tcg_gen_mov_tl(addr, H);                     /*  addr    = H:M:L  */
>> +    tcg_gen_shli_tl(addr, addr, 8);
>> +    tcg_gen_or_tl(addr, addr, M);
>> +    tcg_gen_shli_tl(addr, addr, 8);
>> +    tcg_gen_or_tl(addr, addr, L);
>> +
>> +    return  addr;
>> +}
>> +
>> +TCGv    gen_get_xaddr()
>> +{
>> +    return  gen_get_addr(cpu_rampX, cpu_r[27], cpu_r[26]);
>> +}
>> +
>> +TCGv    gen_get_yaddr()
>> +{
>> +    return  gen_get_addr(cpu_rampY, cpu_r[29], cpu_r[28]);
>> +}
>> +
>> +TCGv    gen_get_zaddr()
>> +{
>> +    return  gen_get_addr(cpu_rampZ, cpu_r[31], cpu_r[30]);
>> +}
>>
>
> Wow.  Um... Surely it would be better to store X and Y internally as whole
> 24-bit quantities, and Z as a 16-bit quantity (to be extended with rampz,
> rampd, or eind as needed).
>
rampX/Y/Z are represented now as 0x00ff0000.
X/Y/Z can be represented as 16 bits registers, however I do not know if and
when r26-r31 are used as 8 bits, so if X/Y/Z are represented as 16 bits it
would be hard to use r26-r31 in arithmetics


>
> This would allow normal loads, stores, and autoincrement to operate
> efficiently.  When you see byte accesses to R[26-31], you extract/deposit
> the bytes as required.  I assume that happens (significantly) less often
> than operating on X/Y/Z as a unit...
>
> One might make the same argument for R[24-25] as it pertains to adiw et al.

I don't think so, it will make simple arithmetics complex.

>
>
> +int     sex(int Imm, unsigned bits)
>> +{
>> +    Imm <<= 32 - bits;
>> +    Imm >>= 32 - bits;
>> +
>> +    return  Imm;
>> +}
>>
>
> This is sextract32(imm, 0, bits).

done. thanks.

>
>
> +int    avr_translate_ADIW(CPUAVRState *env, DisasContext *ctx, uint32_t
>> opcode)
>> +{
>> +    if (avr_feature(env, AVR_FEATURE_ADIW_SBIW) == false) {
>> +        gen_helper_unsupported(cpu_env);
>> +
>> +        return BS_EXCP;
>> +    }
>> +
>> +    TCGv    RdL = cpu_r[24 + 2 * ADIW_Rd(opcode)];
>> +    TCGv    RdH = cpu_r[25 + 2 * ADIW_Rd(opcode)];
>> +    int     Imm = (ADIW_hImm(opcode) << 4) | (ADIW_lImm(opcode));
>> +    TCGv    R   = tcg_temp_new_i32();
>> +    TCGv    Rd  = tcg_temp_new_i32();
>> +    TCGv    t0  = tcg_temp_new_i32();
>> +
>> +    /*  op  */
>> +    tcg_gen_shli_tl(Rd, RdH, 8);     /*  R = ((H << 8) | L) + Imm  */
>> +    tcg_gen_or_tl(Rd, RdL, Rd);
>> +    tcg_gen_addi_tl(R, Rd, Imm);
>> +    tcg_gen_andi_tl(R, R, 0xffff);/*  make it 16 bits  */
>> +
>> +    /*  Cf  */
>> +    tcg_gen_not_tl(t0, R);             /*  t0  = Rd & ~R  */
>> +    tcg_gen_and_tl(t0, Rd, t0);
>> +    tcg_gen_shri_tl(cpu_Cf, t0, 15);    /*  Cf  = t0(15)  */
>>
>
> Or simply bit 16.
>
> +    /*  Sf  */
>> +    tcg_gen_and_tl(cpu_Sf, cpu_Nf, cpu_Vf);/*  Sf  = Nf & Vf  */
>>
>
> xor.

fixed.

>
>
> +int    avr_translate_ASR(CPUAVRState *env, DisasContext *ctx, uint32_t
>> opcode)
>> +{
>> +    TCGv    Rd  = cpu_r[ASR_Rd(opcode)];
>> +    TCGv    t1  = tcg_temp_new_i32();
>> +    TCGv    t2  = tcg_temp_new_i32();
>> +
>> +    /*  op  */
>> +    tcg_gen_andi_tl(t1, Rd, 0x80);      /*  t1  = (Rd & 0x80) | (Rd >>
>> 1)  */
>> +    tcg_gen_shri_tl(t2, Rd, 1);
>> +    tcg_gen_or_tl(t1, t1, t2);
>> +
>> +    /*  Cf  */
>> +    tcg_gen_andi_tl(cpu_Cf, Rd, 1);         /*  Cf  = Rd(0)  */
>> +
>> +    /*  Vf  */
>> +    tcg_gen_and_tl(cpu_Vf, cpu_Nf, cpu_Cf);/*  Vf  = Nf & Cf  */
>>
>
> xor.
>
> +/*
>> +    Copies the T Flag in the SREG (Status Register) to bit b in register
>> Rd.
>> +*/
>> +int    avr_translate_BLD(CPUAVRState *env, DisasContext *ctx, uint32_t
>> opcode)
>> +{
>> +    TCGv    Rd  = cpu_r[BLD_Rd(opcode)];
>> +    TCGv    t1  = tcg_temp_new_i32();
>> +
>> +    tcg_gen_shli_tl(t1, cpu_Tf, BLD_Bit(opcode));
>> +    tcg_gen_or_tl(Rd, Rd, t1);
>> +    tcg_gen_and_tl(Rd, Rd, t1);
>>
>
> Err... extra and?  I'm pretty sure this insn simply sets bit B, and
> doesn't clear all of the others.

it was a bug. fixed.

>
>
> +/*
>> +    Clears the specified bits in register Rd. Performs the logical AND
>> between the contents of register Rd and the
>> +    complement of the constant mask K. The result will be placed in
>> register Rd.
>> +*/
>> +int    avr_translate_COM(CPUAVRState *env, DisasContext *ctx, uint32_t
>> opcode)
>> +{
>> +    TCGv    Rd  = cpu_r[COM_Rd(opcode)];
>> +    TCGv    R   = tcg_temp_new_i32();
>> +
>> +    tcg_gen_movi_tl(R, 0xff);
>> +    tcg_gen_sub_tl(Rd, R, Rd);
>>
>
> Better as xor with 0xff.
>
done

>
> +/*
>> +    This instruction performs a compare between two registers Rd and Rr,
>> and skips the next instruction if Rd = Rr.
>> +*/
>> +int    avr_translate_CPSE(CPUAVRState *env, DisasContext *ctx, uint32_t
>> opcode)
>> +{
>> +    TCGv        Rd  = cpu_r[CPSE_Rd(opcode)];
>> +    TCGv        Rr  = cpu_r[(CPSE_hRr(opcode) << 4) |
>> (CPSE_lRr(opcode))];
>> +    TCGLabel   *skip= gen_new_label();
>> +
>> +    tcg_gen_movi_tl(cpu_pc, ctx->inst[1].npc);      /*  PC if next inst
>> is skipped  */
>> +    tcg_gen_brcond_i32(TCG_COND_EQ, Rd, Rr, skip);
>> +    tcg_gen_movi_tl(cpu_pc, ctx->inst[0].npc);      /*  PC if next inst
>> is not skipped  */
>> +    gen_set_label(skip);
>>
>
> Any reason you're not using goto_tb?

I think, this place can be optimized by not exiting tb on skip. I will do
it in next version.

>
>
> +/*
>> +    Subtracts one -1- from the contents of register Rd and places the
>> result in the destination register Rd.
>> +    The C Flag in SREG is not affected by the operation, thus allowing
>> the DEC instruction to be used on a loop
>> +    counter in multiple-precision computations.
>> +    When operating on unsigned values, only BREQ and BRNE branches can
>> be expected to perform consistently.
>> +    When operating on two’s complement values, all signed branches are
>> available.
>> +*/
>> +int    avr_translate_DEC(CPUAVRState *env, DisasContext *ctx, uint32_t
>> opcode)
>> +{
>> +    TCGv        Rd  = cpu_r[DEC_Rd(opcode)];
>> +
>> +    tcg_gen_subi_tl(Rd, Rd, 1);     /*  Rd  = Rd - 1  */
>> +    tcg_gen_andi_tl(Rd, Rd, 0xff);  /*  make it 8 bits  */
>> +
>> +    tcg_gen_setcondi_tl(TCG_COND_EQ, cpu_Vf, Rd, 0x7f);  /* cpu_Vf   =
>> Rd == 0x7f  */
>>
>
> This is INC overflow.

please explain, I don't see a problem here

>
>
> +int    avr_translate_INC(CPUAVRState *env, DisasContext *ctx, uint32_t
>> opcode)
>> +{
>> +    TCGv    Rd  = cpu_r[INC_Rd(opcode)];
>> +
>> +    tcg_gen_addi_tl(Rd, Rd, 1);
>> +    tcg_gen_andi_tl(Rd, Rd, 0xff);
>> +
>> +    tcg_gen_setcondi_tl(TCG_COND_EQ, cpu_Vf, Rd, 0x80);  /* cpu_Vf   =
>> Rd == 0x80  */
>>
>
> This is DEC overflow.
> please explain, I don't see a problem here
>
> +/*
>> +    Load one byte indirect from data space to register and stores an
>> clear the bits in data space specified by the
>> +    register. The instruction can > +/*
>>
> only be used towards internal SRAM.
>
>> +    The data location is pointed to by the Z (16 bits) Pointer Register
>> in the Register File. Memory access is limited
>> +    to the current data segment of 64KB. To access another data segment
>> in devices with more than 64KB data
>> +    space, the RAMPZ in register in the I/O area has to be changed.
>> +    The Z-pointer Register is left unchanged by the operation. This
>> instruction is especially suited for clearing status
>> +    bits stored in SRAM.
>> +*/
>> +int    avr_translate_LAC(CPUAVRState *env, DisasContext *ctx, uint32_t
>> opcode)
>> +{
>> +    if (avr_feature(env, AVR_FEATURE_RMW) == false) {
>> +        gen_helper_unsupported(cpu_env);
>> +
>> +        return BS_EXCP;
>> +    }
>> +
>> +    TCGv    Rr  = cpu_r[LAC_Rr(opcode)];
>> +    TCGv    addr= gen_get_zaddr();
>> +    TCGv    t0  = tcg_temp_new_i32();
>> +    TCGv    t1  = tcg_temp_new_i32();
>> +
>> +    tcg_gen_qemu_ld8u(
>> +                    t0, addr, DATA_INDEX);    /*  t0      = mem[addr]  */
>> +    tcg_gen_movi_tl(t1, 0xff);                  /*  t1      = t0 & (0xff
>> - Rr)  */
>> +    tcg_gen_sub_tl(t1, t1, Rr);
>> +    tcg_gen_and_tl(t1, t0, t1);
>>
>
> These last 3 are
>
fixed

>
>   tcg_gen_andc_tl(t1, t0, Rr);
>
> +/*
>> +    This instruction performs 8-bit x 8-bit -> 16-bit signed
>> multiplication.
>> +*/
>> +int    avr_translate_MULS(CPUAVRState *env, DisasContext *ctx, uint32_t
>> opcode)
>> +{
>> +    if (avr_feature(env, AVR_FEATURE_MUL) == false) {
>> +        gen_helper_unsupported(cpu_env);
>> +
>> +        return BS_EXCP;
>> +    }
>> +
>> +    TCGv    R0  = cpu_r[0];
>> +    TCGv    R1  = cpu_r[1];
>> +    TCGv    Rd  = cpu_r[16 + MULS_Rd(opcode)];
>> +    TCGv    Rr  = cpu_r[16 + MULS_Rr(opcode)];
>> +    TCGv    R   = tcg_temp_new_i32();
>> +
>> +    tcg_gen_ext8s_tl(Rd, Rd);            /*  make Rd full 32 bit signed
>> */
>> +    tcg_gen_ext8s_tl(Rr, Rr);            /*  make Rr full 32 bit signed
>> */
>>
>
> You need to either zero-extend these again afterward, or you need to
> perform the extensions into a temporary.

oops

>
>
> +/*
>> +    Replaces the contents of register Rd with its two’s complement; the
>> value $80 is left unchanged.
>> +*/
>> +int    avr_translate_NEG(CPUAVRState *env, DisasContext *ctx, uint32_t
>> opcode)
>> +{
>> +    TCGv    Rd  = cpu_r[NEG_Rd(opcode)];
>> +    TCGv    R   = tcg_temp_new_i32();
>> +
>> +    tcg_gen_neg_tl(R, Rd);
>> +
>> +    tcg_gen_setcondi_tl(TCG_COND_NE, cpu_Cf, R, 0x00);  /*  Cf  = R !=
>> 0x00  */
>> +    tcg_gen_setcondi_tl(TCG_COND_EQ, cpu_Vf, R, 0x80);  /*  Vf  = R ==
>> 0x80  */
>>
>
> Correct, but surely it's better to re-use the normal subtraction
> infrastructure.  In particular, setcond is at minimum two operations.  The
> tcg optimizer should do approximately as well folding away the zero from
> the subtract.
>
> +    tcg_gen_not_tl(cpu_Hf, Rd);                                /*  Hf  =
>> (~Rd | R)(3)  */
>> +    tcg_gen_or_tl(cpu_Hf, cpu_Hf, R);
>> +    tcg_gen_shri_tl(cpu_Hf, cpu_Hf, 3);
>> +    tcg_gen_andi_tl(cpu_Hf, cpu_Hf, 1);
>> +    gen_ZNSf(R);
>> +
>> +    tcg_temp_free_i32(R);
>>
>
> You failed to write back the result.
>
done.

>
> +int    avr_translate_XCH(CPUAVRState *env, DisasContext *ctx, uint32_t
>> opcode)
>> +{
>> +    if (avr_feature(env, AVR_FEATURE_RMW) == false) {
>> +        gen_helper_unsupported(cpu_env);
>> +
>> +        return BS_EXCP;
>> +    }
>> +
>> +    TCGv    Rd      = cpu_r[XCH_Rd(opcode)];
>> +    TCGv    t0      = tcg_temp_new_i32();
>> +    TCGv    addr    = gen_get_zaddr();
>> +
>> +    tcg_gen_qemu_ld8u(
>> +                    addr, t0, DATA_INDEX);
>> +    tcg_gen_qemu_st8(
>> +                    addr, Rd, DATA_INDEX);
>>
>
> The address and data temporaries are swapped.
>
> fixed.

>
> r~
>
Richard Henderson June 5, 2016, 11:34 p.m. UTC | #3
On 06/05/2016 02:47 PM, Michael Rolnik wrote:
>     Is there a reason this code isn't going into translate.c?
>     You wouldn't need the declarations in translate-inst.h or translate.h.
>
> I see here two levels of logic
> a. instruction translation
> b. general flow of program translation.

FWIW, static functions can be automatically inlined by the compiler, whereas 
external function calls can't.  In theory, the compiler could auto-inline the 
entire translator into a single function.

>     Order these functions properly and you don't need forward declarations.
>
> is it a requirements? this way it look cleaner.

Does it?  In my experience it just means you've got to edit two places when one 
changes things.

>     While this is exactly the formula in the manual, it's also equal to
>
>         ((Rd ^ Rr) ^ R) & 16
>
> Please explain. I don't see it.
>
> http://www.wolframalpha.com/input/?i=A+and+B+or+A+and+not+C+or+B+and+not+C,+A+xor+B+xor+C

I did explain:

>     where we examine the difference between the non-carry addition (Rd ^ Rr)
>     and the carry addition (R) to find the carry out of bit 3.  This reduces
>     the operation count to 2 from 5.

It's not a manipulation of the original expression, but a different way of 
looking at the problem.

You want to compute carry-out of bit 3.  Given the *result* of the addition, 
it's easier to examine bit 4, into which carry-in has happened, rather than 
examine bit 3 and re-compute the carry-out.

The AVR hardware probably computes it exactly as described in the manual, 
because that can be done in parallel with the addition, and has a lower total 
gate latency.  This is fairly common in the industry, where the documentation 
follows the implementation more closely than it perhaps should.

>     Note that carry and borrow are related, and thus this is *also* computable
>     via ((Rd ^ Rr) ^ R) on bit 4.
>
> please explain, I don't see it
> http://www.wolframalpha.com/input/?i=not+A+and+B+or+not+A+and+C+or++C+and+B,+A+xor+B+xor+C

As above, given the *result* of the subtraction, examining bit 4 into which 
borrow-in has happened.

Once you accept that, you'll note that the same expression can be used to 
re-create both carry-in and borrow-in.

>     I'll also note that the piece-wise store is big-endian, so you could
>     perform this in 1 store for 2_BYTE and 2 stores for 3_BYTE.
>
> I got an expression that the platform is little endian.

Then you've got the order of the stores wrong.  Your code pushes the LSB before 
pushing the MSB, which results in the MSB at the lower address, which means 
big-endian.

>     Wow.  Um... Surely it would be better to store X and Y internally as whole
>     24-bit quantities, and Z as a 16-bit quantity (to be extended with rampz,
>     rampd, or eind as needed).
>
> rampX/Y/Z are represented now as 0x00ff0000.
> X/Y/Z can be represented as 16 bits registers, however I do not know if and
> when r26-r31 are used as 8 bits, so if X/Y/Z are represented as 16 bits it
> would be hard to use r26-r31 in arithmetics

You would use a setup like the following, and use these functions instead of 
other direct accesses to the cpu registers.  This setup requires similar 
functions in cpu.h for use by e.g. gdbstub.c.


TCGv cpu_rb[24];
TCGv cpu_rw[4];

TCGv read_byte(unsigned rb)
{
     TCGv byte = tcg_temp_new();
     if (rb < 24) {
         tcg_gen_mov_tl(byte, cpu_rb[rb]);
     } else {
         unsigned rw = (rb - 24) / 2;
         if (rb & 1) {
             tcg_gen_shri_tl(byte, cpu_rw[rw]);
         } else {
             tcg_gen_ext8u_tl(byte, cpu_rw[rw]);
         }
     }
     return byte;
}

void write_byte(unsigned rb, TCGv val)
{
     if (rb < 24) {
         tcg_gen_mov_tl(cpu_rb[rb], val);
     } else {
         unsigned rw = (rb - 24) / 2;
         tcg_gen_deposit_tl(cpu_rw[rw], cpu_rw[rw], val, (rb & 1) * 8, 8);
     }
}

/* Return RB+1:RB.  */
TCGv read_word(unsigned rb)
{
     TCGv word = tcg_temp_new();
     if (rb < 24) {
         tcg_gen_deposit_tl(word, cpu_rb[rb], cpu_rb[rb + 1], 8, 8);
     } else {
         unsigned rw = (rb - 24) / 2;
         tcg_gen_mov_tl(word, cpu_rw[rw]);
     }
     return word;
}

void write_word(unsigned rb, TCGv val)
{
     if (rb < 24) {
         tcg_gen_ext8u_tl(cpu_rb[rb], val);
         tcg_gen_shri_tl(cpu_rb[rb + 1], val, 8);
     } else {
         unsigned rw = (rb - 24) / 2;
         tcg_gen_mov_tl(cpu_rw[rw], val);
     }
}

>         +int    avr_translate_DEC(CPUAVRState *env, DisasContext *ctx, uint32_t
...
>         +    tcg_gen_setcondi_tl(TCG_COND_EQ, cpu_Vf, Rd, 0x7f);  /* cpu_Vf   =
>         Rd == 0x7f  */
>
>
>     This is INC overflow.
>
> please explain, I don't see a problem here

You have swapped the overflow conditions for INC and DEC.

     127 + 1 -> -128
     -128 - 1 -> 127


r~
Michael Rolnik June 6, 2016, 6:52 a.m. UTC | #4
On Mon, Jun 6, 2016 at 2:34 AM, Richard Henderson <rth@twiddle.net> wrote:

> On 06/05/2016 02:47 PM, Michael Rolnik wrote:
>
>>     Is there a reason this code isn't going into translate.c?
>>     You wouldn't need the declarations in translate-inst.h or translate.h.
>>
>> I see here two levels of logic
>> a. instruction translation
>> b. general flow of program translation.
>>
>
> FWIW, static functions can be automatically inlined by the compiler,
> whereas external function calls can't.  In theory, the compiler could
> auto-inline the entire translator into a single function.

these functions are called through pointer so there is no way for these
functions to be inlined.

>
>
>     Order these functions properly and you don't need forward declarations.
>>
>> is it a requirements? this way it look cleaner.
>>
>
> Does it?  In my experience it just means you've got to edit two places
> when one changes things.

It still one place because it's either instruction translation or a program
translation.
from my point of view translate.c will have almost no bugs in very close
feature whereas translate-inst.c will hove bug fixes and modification.
having it in two files will isolate translate.c from erroneous
modifications.

>
>
>     While this is exactly the formula in the manual, it's also equal to
>>
>>         ((Rd ^ Rr) ^ R) & 16
>>
>> Please explain. I don't see it.
>>
>>
>> http://www.wolframalpha.com/input/?i=A+and+B+or+A+and+not+C+or+B+and+not+C,+A+xor+B+xor+C
>>
>
> I did explain:

truth table shows that these computations are different.

>
>
>     where we examine the difference between the non-carry addition (Rd ^
>> Rr)
>>     and the carry addition (R) to find the carry out of bit 3.  This
>> reduces
>>     the operation count to 2 from 5.
>>
>
> It's not a manipulation of the original expression, but a different way of
> looking at the problem.
>
> You want to compute carry-out of bit 3.  Given the *result* of the
> addition, it's easier to examine bit 4, into which carry-in has happened,
> rather than examine bit 3 and re-compute the carry-out.
>
> The AVR hardware probably computes it exactly as described in the manual,
> because that can be done in parallel with the addition, and has a lower
> total gate latency.  This is fairly common in the industry, where the
> documentation follows the implementation more closely than it perhaps
> should.

you can't look onto 4th bit because 4th bits in the input were not 0s.

>
>
>     Note that carry and borrow are related, and thus this is *also*
>> computable
>>     via ((Rd ^ Rr) ^ R) on bit 4.
>>
>> please explain, I don't see it
>>
>> http://www.wolframalpha.com/input/?i=not+A+and+B+or+not+A+and+C+or++C+and+B,+A+xor+B+xor+C
>>
>
> As above, given the *result* of the subtraction, examining bit 4 into
> which borrow-in has happened.
>
> Once you accept that, you'll note that the same expression can be used to
> re-create both carry-in and borrow-in.
>
>     I'll also note that the piece-wise store is big-endian, so you could
>>     perform this in 1 store for 2_BYTE and 2 stores for 3_BYTE.
>>
>> I got an expression that the platform is little endian.
>>
>
> Then you've got the order of the stores wrong.  Your code pushes the LSB
> before pushing the MSB, which results in the MSB at the lower address,
> which means big-endian.

this is right. However as far as I understand AVR is neither little nor big
endian because there it's 8 bit architecture (see here
http://www.avrfreaks.net/forum/endian-issue). for time being I defined the
platform to be little endian with ret address exception

>
>
>     Wow.  Um... Surely it would be better to store X and Y internally as
>> whole
>>     24-bit quantities, and Z as a 16-bit quantity (to be extended with
>> rampz,
>>     rampd, or eind as needed).
>>
>> rampX/Y/Z are represented now as 0x00ff0000.
>> X/Y/Z can be represented as 16 bits registers, however I do not know if
>> and
>> when r26-r31 are used as 8 bits, so if X/Y/Z are represented as 16 bits it
>> would be hard to use r26-r31 in arithmetics
>>
>
> You would use a setup like the following, and use these functions instead
> of other direct accesses to the cpu registers.  This setup requires similar
> functions in cpu.h for use by e.g. gdbstub.c.
>
I will have to think about it.

>
>
> TCGv cpu_rb[24];
> TCGv cpu_rw[4];
>
> TCGv read_byte(unsigned rb)
> {
>     TCGv byte = tcg_temp_new();
>     if (rb < 24) {
>         tcg_gen_mov_tl(byte, cpu_rb[rb]);
>     } else {
>         unsigned rw = (rb - 24) / 2;
>         if (rb & 1) {
>             tcg_gen_shri_tl(byte, cpu_rw[rw]);
>         } else {
>             tcg_gen_ext8u_tl(byte, cpu_rw[rw]);
>         }
>     }
>     return byte;
> }
>
> void write_byte(unsigned rb, TCGv val)
> {
>     if (rb < 24) {
>         tcg_gen_mov_tl(cpu_rb[rb], val);
>     } else {
>         unsigned rw = (rb - 24) / 2;
>         tcg_gen_deposit_tl(cpu_rw[rw], cpu_rw[rw], val, (rb & 1) * 8, 8);
>     }
> }
>
> /* Return RB+1:RB.  */
> TCGv read_word(unsigned rb)
> {
>     TCGv word = tcg_temp_new();
>     if (rb < 24) {
>         tcg_gen_deposit_tl(word, cpu_rb[rb], cpu_rb[rb + 1], 8, 8);
>     } else {
>         unsigned rw = (rb - 24) / 2;
>         tcg_gen_mov_tl(word, cpu_rw[rw]);
>     }
>     return word;
> }
>
> void write_word(unsigned rb, TCGv val)
> {
>     if (rb < 24) {
>         tcg_gen_ext8u_tl(cpu_rb[rb], val);
>         tcg_gen_shri_tl(cpu_rb[rb + 1], val, 8);
>     } else {
>         unsigned rw = (rb - 24) / 2;
>         tcg_gen_mov_tl(cpu_rw[rw], val);
>     }
> }
>
>         +int    avr_translate_DEC(CPUAVRState *env, DisasContext *ctx,
>> uint32_t
>>
> ...
>
>>         +    tcg_gen_setcondi_tl(TCG_COND_EQ, cpu_Vf, Rd, 0x7f);  /*
>> cpu_Vf   =
>>         Rd == 0x7f  */
>>
>>
>>     This is INC overflow.
>>
>> please explain, I don't see a problem here
>>
>
> You have swapped the overflow conditions for INC and DEC.
>
>     127 + 1 -> -128
>     -128 - 1 -> 127

this is how it's defined in the document.

>
>
>
> r~
>
Richard Henderson June 6, 2016, 7:17 p.m. UTC | #5
On 06/05/2016 11:52 PM, Michael Rolnik wrote:
> truth table shows that these computations are different.

You're not giving the right inputs to the truth table.

> you can't look onto 4th bit because 4th bits in the input were not 0s.

What did you think the xor's do?  They remove the non-zero input bits.

#include <stdio.h>

static int orig(int r, int d, int s)
{
   return (((d & s) | (d & ~r) | (s & ~r)) & 2) != 0;
}

static int mine(int r, int d, int s)
{
   return (((d ^ s) ^ r) & 4) != 0;
}

int main()
{
   int s, d;
   for (s = 0; s < 8; ++s)
     for (d = 0; d < 8; ++d)
       {
	int r = d + s;
	int o = orig(r, d, s);
         int m = mine(r, d, s);

	if (o != m)
	  printf("%2d = %d + %d  (o=%d, m=%d)\n", r, d, s, o, m);
       }
   return 0;
}

This performs tests on 3-bit inputs, testing for carry-out on bit 1, just like 
Hf computes carry-out on bit 3.


>     Then you've got the order of the stores wrong.  Your code pushes the LSB
>     before pushing the MSB, which results in the MSB at the lower address,
>     which means big-endian.
>
> this is right. However as far as I understand AVR is neither little nor big
> endian because there it's 8 bit architecture (see
> here http://www.avrfreaks.net/forum/endian-issue). for time being I defined the
> platform to be little endian with ret address exception

True, AVR is an 8-bit core, where endianness doesn't (normally) apply.  And you 
are right that ADIW does treat the registers as little-endian.

But the only multi-byte store to memory is in big-endian order.  So why 
wouldn't you want to take advantage of that fact?

>     You have swapped the overflow conditions for INC and DEC.
...
> this is how it's defined in the document.

No, it isn't.  Look again, you've swapped them.


r~
Michael Rolnik June 6, 2016, 7:34 p.m. UTC | #6
On Mon, Jun 6, 2016 at 10:17 PM, Richard Henderson <rth@twiddle.net> wrote:

> On 06/05/2016 11:52 PM, Michael Rolnik wrote:
>
>> truth table shows that these computations are different.
>>
>
> You're not giving the right inputs to the truth table.
>
> you can't look onto 4th bit because 4th bits in the input were not 0s.
>>
>
> What did you think the xor's do?  They remove the non-zero input bits.
>
> #include <stdio.h>
>
> static int orig(int r, int d, int s)
> {
>   return (((d & s) | (d & ~r) | (s & ~r)) & 2) != 0;
> }
>
> static int mine(int r, int d, int s)
> {
>   return (((d ^ s) ^ r) & 4) != 0;
> }
>
> int main()
> {
>   int s, d;
>   for (s = 0; s < 8; ++s)
>     for (d = 0; d < 8; ++d)
>       {
>         int r = d + s;
>         int o = orig(r, d, s);
>         int m = mine(r, d, s);
>
>         if (o != m)
>           printf("%2d = %d + %d  (o=%d, m=%d)\n", r, d, s, o, m);
>       }
>   return 0;
> }
>
> This performs tests on 3-bit inputs, testing for carry-out on bit 1, just
> like Hf computes carry-out on bit 3.

you are right I can look onto 4th bit, however I do the whole computation
of C flag already.

>
>
>
>     Then you've got the order of the stores wrong.  Your code pushes the
>> LSB
>>     before pushing the MSB, which results in the MSB at the lower address,
>>     which means big-endian.
>>
>> this is right. However as far as I understand AVR is neither little nor
>> big
>> endian because there it's 8 bit architecture (see
>> here http://www.avrfreaks.net/forum/endian-issue). for time being I
>> defined the
>> platform to be little endian with ret address exception
>>
>
> True, AVR is an 8-bit core, where endianness doesn't (normally) apply.
> And you are right that ADIW does treat the registers as little-endian.
>
> But the only multi-byte store to memory is in big-endian order.  So why
> wouldn't you want to take advantage of that fact?

I will.

>
>
>     You have swapped the overflow conditions for INC and DEC.
>>
> ...
>
>> this is how it's defined in the document.
>>
>
> No, it isn't.  Look again, you've swapped them.

checking again.

>
>
>
> r~
>
diff mbox

Patch

diff --git a/target-avr/translate-inst.c b/target-avr/translate-inst.c
new file mode 100644
index 0000000..fe9d9fc
--- /dev/null
+++ b/target-avr/translate-inst.c
@@ -0,0 +1,2443 @@ 
+/*
+ *  QEMU AVR CPU
+ *
+ *  Copyright (c) 2016 Michael Rolnik
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, see
+ *  <http://www.gnu.org/licenses/lgpl-2.1.html>
+ */
+
+#include "translate.h"
+#include "translate-inst.h"
+
+/*
+    NOTE:   all registers are assumed to hold 8 bit values.
+            so all operations done on registers should preseve this property
+*/
+
+/*
+    NOTE:   the flags C,H,V,N,V have either 0 or 1 values
+    NOTE:   the flag Z has inverse logic, when the value of Zf is 0 the flag is assumed to be set, non zero - not set
+*/
+
+void    gen_add_CHf(TCGv R, TCGv Rd, TCGv Rr);
+void    gen_add_Vf(TCGv R, TCGv Rd, TCGv Rr);
+void    gen_sub_CHf(TCGv R, TCGv Rd, TCGv Rr);
+void    gen_sub_Vf(TCGv R, TCGv Rd, TCGv Rr);
+void    gen_ZNSf(TCGv R);
+void    gen_push_ret(CPUAVRState *env, int    ret);
+void    gen_pop_ret(CPUAVRState *env, TCGv    ret);
+void    gen_jmp_ez(void);
+void    gen_jmp_z(void);
+
+void    gen_set_addr(TCGv addr, TCGv H, TCGv M, TCGv l); /*  H:M:L   = addr  */
+void    gen_set_xaddr(TCGv addr);
+void    gen_set_yaddr(TCGv addr);
+void    gen_set_zaddr(TCGv addr);
+
+TCGv    gen_get_addr(TCGv H, TCGv M, TCGv L);            /*  addr = H:M:L    */
+TCGv    gen_get_xaddr(void);
+TCGv    gen_get_yaddr(void);
+TCGv    gen_get_zaddr(void);
+int     sex(int Imm, unsigned bits);
+
+void    gen_add_CHf(TCGv R, TCGv Rd, TCGv Rr)
+{
+    TCGv    t1      = tcg_temp_new_i32();
+    TCGv    t2      = tcg_temp_new_i32();
+    TCGv    t3      = tcg_temp_new_i32();
+
+    tcg_gen_and_tl(t1, Rd, Rr);    /*  t1  = Rd & Rr  */
+    tcg_gen_not_tl(t2, R);             /*  t2  = Rd & ~R  */
+    tcg_gen_and_tl(t2, Rd, t2);
+    tcg_gen_not_tl(t3, R);             /*  t3  = Rr  *~R  */
+    tcg_gen_and_tl(t3, Rr, t3);
+    tcg_gen_or_tl(t1, t1, t2);    /*  t1  = t1 | t2 | t3  */
+    tcg_gen_or_tl(t1, t1, t3);
+
+    tcg_gen_shri_tl(cpu_Cf, t1, 7);     /*  Cf  = t1(7)  */
+    tcg_gen_shri_tl(cpu_Hf, t1, 3);     /*  Hf  = t1(3)  */
+    tcg_gen_andi_tl(cpu_Hf, cpu_Hf, 1);
+
+    tcg_temp_free_i32(t3);
+    tcg_temp_free_i32(t2);
+    tcg_temp_free_i32(t1);
+}
+
+void    gen_add_Vf(TCGv    R, TCGv    Rd, TCGv    Rr)
+{
+    TCGv    t1      = tcg_temp_new_i32();
+    TCGv    t2      = tcg_temp_new_i32();
+
+    tcg_gen_not_tl(t1, Rd);            /*  t1  = ~Rd & ~Rr & R  */
+    tcg_gen_not_tl(t2, Rr);
+    tcg_gen_and_tl(t1, t1, t2);
+    tcg_gen_and_tl(t1, t1, R);
+
+    tcg_gen_not_tl(t2, R);             /*  t2  = Rd & Rr & ~R  */
+    tcg_gen_and_tl(t2, t2, Rd);
+    tcg_gen_and_tl(t2, t2, Rr);
+
+    tcg_gen_or_tl(t1, t1, t2);    /*  t1  = Rd & Rr & ~R | ~Rd & ~Rr & R  */
+
+    tcg_gen_shri_tl(cpu_Vf, t1, 7);     /*  Vf  = t1(7)  */
+
+    tcg_temp_free_i32(t2);
+    tcg_temp_free_i32(t1);
+}
+
+void    gen_sub_CHf(TCGv    R, TCGv    Rd, TCGv    Rr)
+{
+    TCGv    t1      = tcg_temp_new_i32();
+    TCGv    t2      = tcg_temp_new_i32();
+    TCGv    t3      = tcg_temp_new_i32();
+
+    /*  Cf & Hf  */
+    tcg_gen_not_tl(t1, Rd);            /*  t1  = ~Rd  */
+    tcg_gen_and_tl(t2, t1, Rr);    /*  t2  = ~Rd & Rr  */
+    tcg_gen_or_tl(t3, t1, Rr);    /*  t3  = (~Rd | Rr) & R  */
+    tcg_gen_and_tl(t3, t3, R);
+    tcg_gen_or_tl(t2, t2, t3);    /*  t2  = ~Rd & Rr | ~Rd & R | R & Rr  */
+    tcg_gen_shri_tl(cpu_Cf, t2, 7);     /*  Cf  = t2(7)  */
+    tcg_gen_shri_tl(cpu_Hf, t2, 3);     /*  Hf  = t2(3)  */
+    tcg_gen_andi_tl(cpu_Hf, cpu_Hf, 1);
+
+    tcg_temp_free_i32(t3);
+    tcg_temp_free_i32(t2);
+    tcg_temp_free_i32(t1);
+}
+
+void    gen_sub_Vf(TCGv    R, TCGv    Rd, TCGv    Rr)
+{
+    TCGv    t1      = tcg_temp_new_i32();
+    TCGv    t2      = tcg_temp_new_i32();
+
+    /*  Vf  */
+    tcg_gen_and_tl(t1, Rr, R);     /*  t1  = Rd & ~Rr & ~R  */
+    tcg_gen_not_tl(t1, t1);
+    tcg_gen_and_tl(t1, t1, Rd);
+    tcg_gen_not_tl(t2, Rd);            /*  t2  = ~Rd & Rr & R  */
+    tcg_gen_and_tl(t2, t2, Rr);
+    tcg_gen_and_tl(t2, t2, R);
+    tcg_gen_or_tl(t1, t1, t2);    /*  t1  = Rd & ~Rr & ~R | ~Rd & Rr & R  */
+    tcg_gen_shri_tl(cpu_Vf, t1, 7);     /*  Vf  = t1(7)  */
+
+    tcg_temp_free_i32(t2);
+    tcg_temp_free_i32(t1);
+}
+
+void    gen_ZNSf(TCGv    R)
+{
+    tcg_gen_mov_tl(cpu_Zf, R);             /*  Zf  = R  */
+    tcg_gen_shri_tl(cpu_Nf, R, 7);     /*  Nf  = R(7)  */
+    tcg_gen_and_tl(cpu_Sf, cpu_Nf, cpu_Vf);/*  Sf  = Nf & Vf  */
+}
+
+void    gen_push_ret(CPUAVRState *env, int    ret)
+{
+    tcg_gen_qemu_st8(tcg_const_local_i32((ret & 0x0000ff)), cpu_sp, DATA_INDEX);
+    tcg_gen_subi_tl(cpu_sp, cpu_sp, 1);
+
+    if (avr_feature(env, AVR_FEATURE_2_BYTE_PC)
+        || avr_feature(env, AVR_FEATURE_3_BYTE_PC)) {
+        tcg_gen_qemu_st8(tcg_const_local_i32((ret & 0x00ff00) >> 8), cpu_sp, DATA_INDEX);
+        tcg_gen_subi_tl(cpu_sp, cpu_sp, 1);
+    }
+
+    if (avr_feature(env, AVR_FEATURE_3_BYTE_PC)) {
+        tcg_gen_qemu_st8(tcg_const_local_i32((ret & 0xff0000) >> 16), cpu_sp, DATA_INDEX);
+        tcg_gen_subi_tl(cpu_sp, cpu_sp, 1);
+    }
+}
+
+void    gen_pop_ret(CPUAVRState *env, TCGv    ret)
+{
+    TCGv    t0  = tcg_const_i32(0);
+    TCGv    t1  = tcg_const_i32(0);
+    TCGv    t2  = tcg_const_i32(0);
+
+    if (avr_feature(env, AVR_FEATURE_3_BYTE_PC)) {
+        tcg_gen_addi_tl(cpu_sp, cpu_sp, 1);
+        tcg_gen_qemu_ld8u(t2, cpu_sp, DATA_INDEX);
+        tcg_gen_shli_tl(t2, t2, 16);
+    }
+
+    if (avr_feature(env, AVR_FEATURE_2_BYTE_PC)
+        ||  avr_feature(env, AVR_FEATURE_3_BYTE_PC)) {
+        tcg_gen_addi_tl(cpu_sp, cpu_sp, 1);
+        tcg_gen_qemu_ld8u(t1, cpu_sp, DATA_INDEX);
+        tcg_gen_shli_tl(t1, t1, 8);
+    }
+
+    tcg_gen_addi_tl(cpu_sp, cpu_sp, 1);
+    tcg_gen_qemu_ld8u(t0, cpu_sp, DATA_INDEX);
+
+    tcg_gen_or_tl(ret, t0, t1);
+    tcg_gen_or_tl(ret, ret, t2);
+
+    tcg_temp_free_i32(t0);
+    tcg_temp_free_i32(t1);
+    tcg_temp_free_i32(t2);
+}
+
+void    gen_jmp_ez()
+{
+    tcg_gen_mov_tl(cpu_pc, cpu_eind);
+    tcg_gen_shli_tl(cpu_pc, cpu_pc, 8);
+    tcg_gen_or_tl(cpu_pc, cpu_pc, cpu_r[31]);
+    tcg_gen_shli_tl(cpu_pc, cpu_pc, 8);
+    tcg_gen_or_tl(cpu_pc, cpu_pc, cpu_r[30]);
+    tcg_gen_andi_tl(cpu_pc, cpu_pc, 0xffffff);
+    tcg_gen_exit_tb(0);
+}
+
+void    gen_jmp_z()
+{
+    tcg_gen_mov_tl(cpu_pc, cpu_r[31]);
+    tcg_gen_shli_tl(cpu_pc, cpu_pc, 8);
+    tcg_gen_or_tl(cpu_pc, cpu_pc, cpu_r[30]);
+    tcg_gen_andi_tl(cpu_pc, cpu_pc, 0xffff);
+    tcg_gen_exit_tb(0);
+}
+
+void    gen_set_addr(TCGv addr, TCGv H, TCGv M, TCGv L)
+{
+    tcg_gen_andi_tl(L, addr, 0xff);
+
+    tcg_gen_shri_tl(addr, addr, 8);
+    tcg_gen_andi_tl(M, addr, 0xff);
+
+    tcg_gen_shri_tl(addr, addr, 8);
+    tcg_gen_andi_tl(H, addr, 0xff);
+}
+
+void    gen_set_xaddr(TCGv addr)
+{
+    gen_set_addr(addr, cpu_rampX, cpu_r[27], cpu_r[26]);
+}
+
+void    gen_set_yaddr(TCGv addr)
+{
+    gen_set_addr(addr, cpu_rampY, cpu_r[29], cpu_r[28]);
+}
+
+void    gen_set_zaddr(TCGv addr)
+{
+    gen_set_addr(addr, cpu_rampZ, cpu_r[31], cpu_r[30]);
+}
+
+TCGv    gen_get_addr(TCGv H, TCGv M, TCGv L)
+{
+    TCGv    addr= tcg_temp_new_i32();
+
+    tcg_gen_mov_tl(addr, H);                     /*  addr    = H:M:L  */
+    tcg_gen_shli_tl(addr, addr, 8);
+    tcg_gen_or_tl(addr, addr, M);
+    tcg_gen_shli_tl(addr, addr, 8);
+    tcg_gen_or_tl(addr, addr, L);
+
+    return  addr;
+}
+
+TCGv    gen_get_xaddr()
+{
+    return  gen_get_addr(cpu_rampX, cpu_r[27], cpu_r[26]);
+}
+
+TCGv    gen_get_yaddr()
+{
+    return  gen_get_addr(cpu_rampY, cpu_r[29], cpu_r[28]);
+}
+
+TCGv    gen_get_zaddr()
+{
+    return  gen_get_addr(cpu_rampZ, cpu_r[31], cpu_r[30]);
+}
+
+int     sex(int Imm, unsigned bits)
+{
+    Imm <<= 32 - bits;
+    Imm >>= 32 - bits;
+
+    return  Imm;
+}
+
+/*
+    Adds two registers and the contents of the C Flag and places the result in the destination register Rd.
+*/
+int    avr_translate_ADC(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[ADC_Rd(opcode)];
+    TCGv    Rr  = cpu_r[(ADC_hRr(opcode) << 4) | (ADC_lRr(opcode))];
+    TCGv    R   = tcg_temp_new_i32();
+
+    /*  op  */
+    tcg_gen_add_tl(R, Rd, Rr);             /*  R   = Rd + Rr + Cf  */
+    tcg_gen_add_tl(R, R, cpu_Cf);
+    tcg_gen_andi_tl(R, R, 0xff);            /*  make it 8 bits  */
+
+    gen_add_CHf(R, Rd, Rr);
+    gen_add_Vf(R, Rd, Rr);
+    gen_ZNSf(R);
+
+    /*  R  */
+    tcg_gen_mov_tl(Rd, R);
+
+    tcg_temp_free_i32(R);
+
+    return  BS_NONE;
+}
+
+/*
+    Adds two registers without the C Flag and places the result in the destination register Rd.
+*/
+int    avr_translate_ADD(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[ADD_Rd(opcode)];
+    TCGv    Rr  = cpu_r[(ADD_hRr(opcode) << 4) | (ADD_lRr(opcode))];
+    TCGv    R   = tcg_temp_new_i32();
+
+    /*  op  */
+    tcg_gen_add_tl(R, Rd, Rr);             /*  Rd = Rd + Rr  */
+    tcg_gen_andi_tl(R, R, 0xff);            /*  make it 8 bits  */
+
+    gen_add_CHf(R, Rd, Rr);
+    gen_add_Vf(R, Rd, Rr);
+    gen_ZNSf(R);
+
+    /*  R  */
+    tcg_gen_mov_tl(Rd, R);
+
+    tcg_temp_free_i32(R);
+
+    return  BS_NONE;
+}
+
+/*
+    Adds an immediate value (0 - 63) to a register pair and places the result in the register pair. This instruction
+    operates on the upper four register pairs, and is well suited for operations on the pointer registers.
+    This instruction is not available in all devices. Refer to the device specific instruction set summary.
+*/
+int    avr_translate_ADIW(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_ADIW_SBIW) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    TCGv    RdL = cpu_r[24 + 2 * ADIW_Rd(opcode)];
+    TCGv    RdH = cpu_r[25 + 2 * ADIW_Rd(opcode)];
+    int     Imm = (ADIW_hImm(opcode) << 4) | (ADIW_lImm(opcode));
+    TCGv    R   = tcg_temp_new_i32();
+    TCGv    Rd  = tcg_temp_new_i32();
+    TCGv    t0  = tcg_temp_new_i32();
+
+    /*  op  */
+    tcg_gen_shli_tl(Rd, RdH, 8);     /*  R = ((H << 8) | L) + Imm  */
+    tcg_gen_or_tl(Rd, RdL, Rd);
+    tcg_gen_addi_tl(R, Rd, Imm);
+    tcg_gen_andi_tl(R, R, 0xffff);/*  make it 16 bits  */
+
+    /*  Cf  */
+    tcg_gen_not_tl(t0, R);             /*  t0  = Rd & ~R  */
+    tcg_gen_and_tl(t0, Rd, t0);
+    tcg_gen_shri_tl(cpu_Cf, t0, 15);    /*  Cf  = t0(15)  */
+
+    /*  Vf  */
+    tcg_gen_not_tl(t0, Rd);            /*  t0  = ~Rd & R  */
+    tcg_gen_and_tl(t0, R, t0);
+
+    tcg_gen_shri_tl(cpu_Vf, t0, 15);    /*  Vf  = t0(15)  */
+
+    /*  Zf  */
+    tcg_gen_mov_tl(cpu_Zf, R);             /*  Zf  = R  */
+
+    /*  Nf  */
+    tcg_gen_shri_tl(cpu_Nf, R, 15);    /*  Nf  = R(15)  */
+
+    /*  Sf  */
+    tcg_gen_and_tl(cpu_Sf, cpu_Nf, cpu_Vf);/*  Sf  = Nf & Vf  */
+
+    /*  R  */
+    tcg_gen_andi_tl(RdL, R, 0xff);
+    tcg_gen_shri_tl(RdH, R, 8);
+
+    tcg_temp_free_i32(t0);
+    tcg_temp_free_i32(Rd);
+    tcg_temp_free_i32(R);
+
+    return  BS_NONE;
+}
+
+/*
+    Performs the logical AND between the contents of register Rd and register Rr and places the result in the
+    destination register Rd.
+*/
+int    avr_translate_AND(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[AND_Rd(opcode)];
+    TCGv    Rr  = cpu_r[(AND_hRr(opcode) << 4) | (AND_lRr(opcode))];
+    TCGv    R   = tcg_temp_new_i32();
+
+    /*  op  */
+    tcg_gen_and_tl(R, Rd, Rr);             /*  Rd  = Rd and Rr  */
+
+    /*  Vf  */
+    tcg_gen_movi_tl(cpu_Vf, 0x00);          /*  Vf  = 0  */
+
+    /*  Zf  */
+    tcg_gen_mov_tl(cpu_Zf, R);             /*  Zf  = R  */
+
+    gen_ZNSf(R);
+
+    /*  R  */
+    tcg_gen_mov_tl(Rd, R);
+
+    tcg_temp_free_i32(R);
+
+    return  BS_NONE;
+}
+
+/*
+    Performs the logical AND between the contents of register Rd and a constant and places the result in the
+    destination register Rd.
+*/
+int    avr_translate_ANDI(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[ANDI_Rd(opcode)];
+    int     Imm = (ANDI_hImm(opcode) << 4) | (ANDI_lImm(opcode));
+
+    /*  op  */
+    tcg_gen_andi_tl(Rd, Rd, Imm);   /*  Rd  = Rd & Imm  */
+
+    tcg_gen_movi_tl(cpu_Vf, 0x00);          /*  Vf  = 0  */
+    gen_ZNSf(Rd);
+
+    return  BS_NONE;
+}
+
+/*
+    Shifts all bits in Rd one place to the right. Bit 7 is held constant. Bit 0 is loaded into the C Flag of the SREG. This
+    operation effectively divides a signed value by two without changing its sign. The Carry Flag can be used to
+    round the result.
+*/
+int    avr_translate_ASR(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[ASR_Rd(opcode)];
+    TCGv    t1  = tcg_temp_new_i32();
+    TCGv    t2  = tcg_temp_new_i32();
+
+    /*  op  */
+    tcg_gen_andi_tl(t1, Rd, 0x80);      /*  t1  = (Rd & 0x80) | (Rd >> 1)  */
+    tcg_gen_shri_tl(t2, Rd, 1);
+    tcg_gen_or_tl(t1, t1, t2);
+
+    /*  Cf  */
+    tcg_gen_andi_tl(cpu_Cf, Rd, 1);         /*  Cf  = Rd(0)  */
+
+    /*  Vf  */
+    tcg_gen_and_tl(cpu_Vf, cpu_Nf, cpu_Cf);/*  Vf  = Nf & Cf  */
+
+    gen_ZNSf(t1);
+
+    /*  op  */
+    tcg_gen_mov_tl(Rd, t1);
+
+    tcg_temp_free_i32(t2);
+    tcg_temp_free_i32(t1);
+
+    return  BS_NONE;
+}
+
+/*
+    Clears a single Flag in SREG.
+*/
+int    avr_translate_BCLR(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    switch(BCLR_Bit(opcode)) {
+        case    0x00:   tcg_gen_movi_tl(cpu_Cf, 0x00);  break;
+        case    0x01:   tcg_gen_movi_tl(cpu_Zf, 0x01);  break;
+        case    0x02:   tcg_gen_movi_tl(cpu_Nf, 0x00);  break;
+        case    0x03:   tcg_gen_movi_tl(cpu_Vf, 0x00);  break;
+        case    0x04:   tcg_gen_movi_tl(cpu_Sf, 0x00);  break;
+        case    0x05:   tcg_gen_movi_tl(cpu_Hf, 0x00);  break;
+        case    0x06:   tcg_gen_movi_tl(cpu_Tf, 0x00);  break;
+        case    0x07:   tcg_gen_movi_tl(cpu_If, 0x00);  break;
+    }
+
+    return  BS_NONE;
+}
+
+/*
+    Copies the T Flag in the SREG (Status Register) to bit b in register Rd.
+*/
+int    avr_translate_BLD(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[BLD_Rd(opcode)];
+    TCGv    t1  = tcg_temp_new_i32();
+
+    tcg_gen_shli_tl(t1, cpu_Tf, BLD_Bit(opcode));
+    tcg_gen_or_tl(Rd, Rd, t1);
+    tcg_gen_and_tl(Rd, Rd, t1);
+
+    tcg_temp_free_i32(t1);
+
+    return  BS_NONE;
+}
+
+/*
+    Conditional relative branch. Tests a single bit in SREG and branches relatively to PC if the bit is cleared. This
+    instruction branches relatively to PC in either direction (PC - 63 <= destination <= PC + 64). The parameter k is the
+    offset from PC and is represented in two’s complement form.
+*/
+int    avr_translate_BRBC(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGLabel   *taken   = gen_new_label();
+    int         Imm     = sex(BRBC_Imm(opcode), 7);
+
+    switch(BRBC_Bit(opcode)) {
+        case    0x00:   tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_Cf, 0, taken); break;
+        case    0x01:   tcg_gen_brcondi_i32(TCG_COND_NE, cpu_Zf, 0, taken); break;
+        case    0x02:   tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_Nf, 0, taken); break;
+        case    0x03:   tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_Vf, 0, taken); break;
+        case    0x04:   tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_Sf, 0, taken); break;
+        case    0x05:   tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_Hf, 0, taken); break;
+        case    0x06:   tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_Tf, 0, taken); break;
+        case    0x07:   tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_If, 0, taken); break;
+    }
+
+    gen_goto_tb(env, ctx, 1, ctx->inst[0].npc);
+    gen_set_label(taken);
+    gen_goto_tb(env, ctx, 0, ctx->inst[0].npc + Imm);
+
+    return BS_BRANCH;
+}
+
+/*
+    Conditional relative branch. Tests a single bit in SREG and branches relatively to PC if the bit is set. This
+    instruction branches relatively to PC in either direction (PC - 63 <= destination <= PC + 64). The parameter k is the
+    offset from PC and is represented in two’s complement form.
+*/
+int    avr_translate_BRBS(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGLabel   *taken   = gen_new_label();
+    int         Imm     = sex(BRBS_Imm(opcode), 7);
+
+    switch(BRBS_Bit(opcode)) {
+        case    0x00:   tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_Cf, 1, taken); break;
+        case    0x01:   tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_Zf, 0, taken); break;
+        case    0x02:   tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_Nf, 1, taken); break;
+        case    0x03:   tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_Vf, 1, taken); break;
+        case    0x04:   tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_Sf, 1, taken); break;
+        case    0x05:   tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_Hf, 1, taken); break;
+        case    0x06:   tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_Tf, 1, taken); break;
+        case    0x07:   tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_If, 1, taken); break;
+    }
+
+    gen_goto_tb(env, ctx, 1, ctx->inst[0].npc);
+    gen_set_label(taken);
+    gen_goto_tb(env, ctx, 0, ctx->inst[0].npc + Imm);
+
+    return BS_BRANCH;
+}
+
+/*
+    Sets a single Flag or bit in SREG.
+*/
+int    avr_translate_BSET(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    switch(BSET_Bit(opcode)) {
+        case    0x00:   tcg_gen_movi_tl(cpu_Cf, 0x01);  break;
+        case    0x01:   tcg_gen_movi_tl(cpu_Zf, 0x00);  break;
+        case    0x02:   tcg_gen_movi_tl(cpu_Nf, 0x01);  break;
+        case    0x03:   tcg_gen_movi_tl(cpu_Vf, 0x01);  break;
+        case    0x04:   tcg_gen_movi_tl(cpu_Sf, 0x01);  break;
+        case    0x05:   tcg_gen_movi_tl(cpu_Hf, 0x01);  break;
+        case    0x06:   tcg_gen_movi_tl(cpu_Tf, 0x01);  break;
+        case    0x07:   tcg_gen_movi_tl(cpu_If, 0x01);  break;
+    }
+
+    return  BS_NONE;
+}
+
+/*
+    The BREAK instruction is used by the On-chip Debug system, and is normally not used in the application
+    software. When the BREAK instruction is executed, the AVR CPU is set in the Stopped Mode. This gives the
+    On-chip Debugger access to internal resources.
+    If any Lock bits are set, or either the JTAGEN or OCDEN Fuses are unprogrammed, the CPU will treat the
+    BREAK instruction as a NOP and will not enter the Stopped mode.
+    This instruction is not available in all devices. Refer to the device specific instruction set summary.
+*/
+int    avr_translate_BREAK(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_BREAK) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    /*  TODO:   ???  */
+    return  BS_NONE;
+}
+
+/*
+    Stores bit b from Rd to the T Flag in SREG (Status Register).
+*/
+int    avr_translate_BST(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[BST_Rd(opcode)];
+
+    tcg_gen_andi_tl(cpu_Tf, Rd, 1 << BST_Bit(opcode));
+    tcg_gen_shri_tl(cpu_Tf, cpu_Tf, BST_Bit(opcode));
+
+    return  BS_NONE;
+}
+
+/*
+    Calls to a subroutine within the entire Program memory. The return address (to the instruction after the CALL)
+    will be stored onto the Stack. (See also RCALL). The Stack Pointer uses a post-decrement scheme during
+    CALL.
+    This instruction is not available in all devices. Refer to the device specific instruction set summary.
+*/
+int    avr_translate_CALL(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_JMP_CALL) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    int     Imm = (CALL_hImm(opcode) << 17) | CALL_lImm(opcode);
+    int     ret = ctx->inst[0].npc;
+
+    gen_push_ret(env, ret);
+
+    gen_goto_tb(env, ctx, 0, Imm);
+
+    return  BS_BRANCH;
+}
+
+/*
+    Clears a specified bit in an I/O Register. This instruction operates on the lower 32 I/O Registers – addresses 0-31.  */
+int    avr_translate_CBI(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    data= cpu_io[CBI_Imm(opcode)];
+    TCGv    port= tcg_const_i32(CBI_Imm(opcode));
+
+    tcg_gen_andi_tl(data, data, ~(1 << CBI_Bit(opcode)));
+    gen_helper_outb(cpu_env, port, data);
+
+    return  BS_NONE;
+}
+
+/*
+    Clears the specified bits in register Rd. Performs the logical AND between the contents of register Rd and the
+    complement of the constant mask K. The result will be placed in register Rd.
+*/
+int    avr_translate_COM(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[COM_Rd(opcode)];
+    TCGv    R   = tcg_temp_new_i32();
+
+    tcg_gen_movi_tl(R, 0xff);
+    tcg_gen_sub_tl(Rd, R, Rd);
+
+    tcg_gen_movi_tl(cpu_Cf, 1);         /*  Cf  = 1  */
+    tcg_gen_movi_tl(cpu_Vf, 0);         /*  Vf  = 0  */
+    gen_ZNSf(Rd);
+
+    tcg_temp_free_i32(R);
+
+    return  BS_NONE;
+}
+
+/*
+    This instruction performs a compare between two registers Rd and Rr. None of the registers are changed. All
+    conditional branches can be used after this instruction.
+*/
+int    avr_translate_CP(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[CP_Rd(opcode)];
+    TCGv    Rr  = cpu_r[(CP_hRr(opcode) << 4) | (CP_lRr(opcode))];
+    TCGv    R   = tcg_temp_new_i32();
+
+    /*  op  */
+    tcg_gen_sub_tl(R, Rd, Rr);    /*  R  = Rd - Rr  */
+    tcg_gen_andi_tl(R, R, 0xff);  /*  make it 8 bits  */
+
+    gen_sub_CHf(R, Rd, Rr);
+    gen_sub_Vf(R, Rd, Rr);
+    gen_ZNSf(R);
+
+    tcg_temp_free_i32(R);
+
+    return  BS_NONE;
+}
+
+/*
+    This instruction performs a compare between two registers Rd and Rr and also takes into account the previous
+    carry. None of the registers are changed. All conditional branches can be used after this instruction.
+*/
+int    avr_translate_CPC(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[CPC_Rd(opcode)];
+    TCGv    Rr  = cpu_r[(CPC_hRr(opcode) << 4) | (CPC_lRr(opcode))];
+    TCGv    R   = tcg_temp_new_i32();
+
+    /*  op  */
+    tcg_gen_sub_tl(R, Rd, Rr);    /*  R  = Rd - Rr - Cf  */
+    tcg_gen_sub_tl(R, R, cpu_Cf);
+    tcg_gen_andi_tl(R, R, 0xff);  /*  make it 8 bits  */
+
+    gen_sub_CHf(R, Rd, Rr);
+    gen_sub_Vf(R, Rd, Rr);
+    gen_ZNSf(R);
+
+    tcg_temp_free_i32(R);
+
+    return  BS_NONE;
+}
+
+/*
+    This instruction performs a compare between register Rd and a constant. The register is not changed. All
+    conditional branches can be used after this instruction.
+*/
+int    avr_translate_CPI(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[CPI_Rd(opcode)];
+    int     Imm = (CPI_hImm(opcode) << 4) | CPI_lImm(opcode);
+    TCGv    Rr  = tcg_const_i32(Imm);
+    TCGv    R   = tcg_temp_new_i32();
+
+    /*  op  */
+    tcg_gen_sub_tl(R, Rd, Rr);    /*  R  = Rd - Rr  */
+    tcg_gen_andi_tl(R, R, 0xff);  /*  make it 8 bits  */
+
+    gen_sub_CHf(R, Rd, Rr);
+    gen_sub_Vf(R, Rd, Rr);
+    gen_ZNSf(R);
+
+    tcg_temp_free_i32(R);
+
+    return  BS_NONE;
+}
+
+/*
+    This instruction performs a compare between two registers Rd and Rr, and skips the next instruction if Rd = Rr.
+*/
+int    avr_translate_CPSE(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv        Rd  = cpu_r[CPSE_Rd(opcode)];
+    TCGv        Rr  = cpu_r[(CPSE_hRr(opcode) << 4) | (CPSE_lRr(opcode))];
+    TCGLabel   *skip= gen_new_label();
+
+    tcg_gen_movi_tl(cpu_pc, ctx->inst[1].npc);      /*  PC if next inst is skipped  */
+    tcg_gen_brcond_i32(TCG_COND_EQ, Rd, Rr, skip);
+    tcg_gen_movi_tl(cpu_pc, ctx->inst[0].npc);      /*  PC if next inst is not skipped  */
+    gen_set_label(skip);
+
+    return  BS_BRANCH;
+}
+
+/*
+    Subtracts one -1- from the contents of register Rd and places the result in the destination register Rd.
+    The C Flag in SREG is not affected by the operation, thus allowing the DEC instruction to be used on a loop
+    counter in multiple-precision computations.
+    When operating on unsigned values, only BREQ and BRNE branches can be expected to perform consistently.
+    When operating on two’s complement values, all signed branches are available.
+*/
+int    avr_translate_DEC(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv        Rd  = cpu_r[DEC_Rd(opcode)];
+
+    tcg_gen_subi_tl(Rd, Rd, 1);     /*  Rd  = Rd - 1  */
+    tcg_gen_andi_tl(Rd, Rd, 0xff);  /*  make it 8 bits  */
+
+    tcg_gen_setcondi_tl(TCG_COND_EQ, cpu_Vf, Rd, 0x7f);  /* cpu_Vf   = Rd == 0x7f  */
+    gen_ZNSf(Rd);
+
+    return  BS_NONE;
+}
+
+/*
+
+    The module is an instruction set extension to the AVR CPU, performing DES iterations. The 64-bit data block
+    (plaintext or ciphertext) is placed in the CPU register file, registers R0-R7, where LSB of data is placed in LSB of
+    R0 and MSB of data is placed in MSB of R7. The full 64-bit key (including parity bits) is placed in registers R8-
+    R15, organized in the register file with LSB of key in LSB of R8 and MSB of key in MSB of R15. Executing one
+    DES instruction performs one round in the DES algorithm. Sixteen rounds must be executed in increasing order
+    to form the correct DES ciphertext or plaintext. Intermediate results are stored in the register file (R0-R15) after
+    each DES instruction. The instruction's operand (K) determines which round is executed, and the half carry flag
+    (H) determines whether encryption or decryption is performed.
+    The DES algorithm is described in “Specifications for the Data Encryption Standard” (Federal Information
+    Processing Standards Publication 46). Intermediate results in this implementation differ from the standard
+    because the initial permutation and the inverse initial permutation are performed each iteration. This does not
+    affect the result in the final ciphertext or plaintext, but reduces execution time.
+*/
+int    avr_translate_DES(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    /*  TODO:  */
+    if (avr_feature(env, AVR_FEATURE_DES) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    return  BS_NONE;
+}
+
+/*
+    Indirect call of a subroutine pointed to by the Z (16 bits) Pointer Register in the Register File and the EIND
+    Register in the I/O space. This instruction allows for indirect calls to the entire 4M (words) Program memory
+    space. See also ICALL. The Stack Pointer uses a post-decrement scheme during EICALL.
+    This instruction is not available in all devices. Refer to the device specific instruction set summary.
+*/
+int    avr_translate_EICALL(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_EIJMP_EICALL) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    int ret = ctx->inst[0].npc;
+
+    gen_push_ret(env, ret);
+
+    gen_jmp_ez();
+
+    return BS_BRANCH;
+}
+
+/*
+    Indirect jump to the address pointed to by the Z (16 bits) Pointer Register in the Register File and the EIND
+    Register in the I/O space. This instruction allows for indirect jumps to the entire 4M (words) Program memory
+    space. See also IJMP.
+    This instruction is not available in all devices. Refer to the device specific instruction set summary.
+*/
+int    avr_translate_EIJMP(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_EIJMP_EICALL) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    gen_jmp_ez();
+
+    return BS_BRANCH;
+}
+
+/*
+    Loads one byte pointed to by the Z-register and the RAMPZ Register in the I/O space, and places this byte in
+    the destination register Rd. This instruction features a 100% space effective constant initialization or constant
+    data fetch. The Program memory is organized in 16-bit words while the Z-pointer is a byte address. Thus, the
+    least significant bit of the Z-pointer selects either low byte (ZLSB = 0) or high byte (ZLSB = 1). This instruction can
+    address the entire Program memory space. The Z-pointer Register can either be left unchanged by the
+    operation, or it can be incremented. The incrementation applies to the entire 24-bit concatenation of the RAMPZ
+    and Z-pointer Registers.
+    Devices with Self-Programming capability can use the ELPM instruction to read the Fuse and Lock bit value.
+    Refer to the device documentation for a detailed description.
+    This instruction is not available in all devices. Refer to the device specific instruction set summary.
+*/
+int    avr_translate_ELPM1(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_ELPM) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    TCGv    Rd  = cpu_r[0];
+    TCGv    addr= gen_get_zaddr();
+
+    tcg_gen_qemu_ld8u(Rd, addr, CODE_INDEX);    /*  Rd      = mem[addr]  */
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+int    avr_translate_ELPM2(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_ELPM) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    TCGv    Rd  = cpu_r[ELPM2_Rd(opcode)];
+    TCGv    addr= gen_get_zaddr();
+
+    tcg_gen_qemu_ld8u(Rd, addr, CODE_INDEX);    /*  Rd      = mem[addr]  */
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+int    avr_translate_ELPMX(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_ELPMX) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    TCGv    Rd  = cpu_r[ELPMX_Rd(opcode)];
+    TCGv    addr= gen_get_zaddr();
+
+    tcg_gen_qemu_ld8u(Rd, addr, CODE_INDEX);    /*  Rd      = mem[addr]  */
+
+    tcg_gen_addi_tl(addr, addr, 1);             /*  addr    = addr + 1  */
+
+    gen_set_zaddr(addr);
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+/*
+    Performs the logical EOR between the contents of register Rd and register Rr and places the result in the
+    destination register Rd.
+*/
+int    avr_translate_EOR(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[EOR_Rd(opcode)];
+    TCGv    Rr  = cpu_r[(EOR_hRr(opcode) << 4) | (EOR_lRr(opcode))];
+    TCGv    R   = tcg_temp_new_i32();
+
+    tcg_gen_xor_tl(R, Rd, Rr);
+
+    tcg_gen_movi_tl(cpu_Vf, 0);
+    gen_ZNSf(R);
+
+    tcg_temp_free_i32(R);
+
+    return  BS_NONE;
+}
+
+/*
+    This instruction performs 8-bit x 8-bit -> 16-bit unsigned multiplication and shifts the result one bit left.
+*/
+int    avr_translate_FMUL(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_MUL) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    TCGv    R0  = cpu_r[0];
+    TCGv    R1  = cpu_r[1];
+    TCGv    Rd  = cpu_r[16 + FMUL_Rd(opcode)];
+    TCGv    Rr  = cpu_r[16 + FMUL_Rr(opcode)];
+    TCGv    R   = tcg_temp_new_i32();
+
+    tcg_gen_mul_tl(R, Rd, Rr);    /*  R   = Rd  *Rr  */
+    tcg_gen_shli_tl(R, R, 1);
+
+    tcg_gen_andi_tl(R0, R, 0xff);
+    tcg_gen_shri_tl(R, R, 8);
+    tcg_gen_andi_tl(R1, R, 0xff);
+
+    tcg_gen_shri_tl(cpu_Cf, R, 16);    /*  Cf  = R(16)  */
+    tcg_gen_andi_tl(cpu_Zf, R, 0x0000ffff);
+
+    tcg_temp_free_i32(R);
+
+    return  BS_NONE;
+}
+
+/*
+    This instruction performs 8-bit x 8-bit -> 16-bit signed multiplication and shifts the result one bit left.
+*/
+int    avr_translate_FMULS(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_MUL) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    TCGv    R0  = cpu_r[0];
+    TCGv    R1  = cpu_r[1];
+    TCGv    Rd  = cpu_r[16 + FMULS_Rd(opcode)];
+    TCGv    Rr  = cpu_r[16 + FMULS_Rr(opcode)];
+    TCGv    R   = tcg_temp_new_i32();
+
+    tcg_gen_ext8s_tl(Rd, Rd);            /*  make Rd full 32 bit signed  */
+    tcg_gen_ext8s_tl(Rr, Rr);            /*  make Rr full 32 bit signed  */
+    tcg_gen_mul_tl(R, Rd, Rr);    /*  R   = Rd  *Rr  */
+    tcg_gen_shli_tl(R, R, 1);
+
+    tcg_gen_andi_tl(R0, R, 0xff);
+    tcg_gen_shri_tl(R, R, 8);
+    tcg_gen_andi_tl(R1, R, 0xff);
+
+    tcg_gen_shri_tl(cpu_Cf, R, 16);    /*  Cf  = R(16)  */
+    tcg_gen_andi_tl(cpu_Zf, R, 0x0000ffff);
+
+    tcg_temp_free_i32(R);
+
+    return  BS_NONE;
+}
+
+/*
+    This instruction performs 8-bit x 8-bit -> 16-bit signed multiplication and shifts the result one bit left.
+*/
+int    avr_translate_FMULSU(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_MUL) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    TCGv    R0  = cpu_r[0];
+    TCGv    R1  = cpu_r[1];
+    TCGv    Rd  = cpu_r[16 + FMULSU_Rd(opcode)];
+    TCGv    Rr  = cpu_r[16 + FMULSU_Rr(opcode)];
+    TCGv    R   = tcg_temp_new_i32();
+
+    tcg_gen_ext8s_tl(Rd, Rd);            /*  make Rd full 32 bit signed  */
+    tcg_gen_mul_tl(R, Rd, Rr);    /*  R   = Rd  *Rr  */
+    tcg_gen_shli_tl(R, R, 1);
+
+    tcg_gen_andi_tl(R0, R, 0xff);
+    tcg_gen_shri_tl(R, R, 8);
+    tcg_gen_andi_tl(R1, R, 0xff);
+
+    tcg_gen_shri_tl(cpu_Cf, R, 16);    /*  Cf  = R(16)  */
+    tcg_gen_andi_tl(cpu_Zf, R, 0x0000ffff);
+
+    return  BS_NONE;
+}
+
+/*
+    Calls to a subroutine within the entire 4M (words) Program memory. The return address (to the instruction after
+    the CALL) will be stored onto the Stack. See also RCALL. The Stack Pointer uses a post-decrement scheme
+    during CALL.
+    This instruction is not available in all devices. Refer to the device specific instruction set summary.
+*/
+int    avr_translate_ICALL(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_IJMP_ICALL) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    int ret = ctx->inst[0].npc;
+
+    gen_push_ret(env, ret);
+    gen_jmp_z();
+
+    return BS_BRANCH;
+}
+
+/*
+    Indirect jump to the address pointed to by the Z (16 bits) Pointer Register in the Register File. The Z-pointer
+    Register is 16 bits wide and allows jump within the lowest 64K words (128KB) section of Program memory.
+    This instruction is not available in all devices. Refer to the device specific instruction set summary.
+*/
+int    avr_translate_IJMP(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_IJMP_ICALL) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    gen_jmp_z();
+
+    return BS_BRANCH;
+}
+
+/*
+    Loads data from the I/O Space (Ports, Timers, Configuration Registers, etc.) into register Rd in the Register
+    File.
+*/
+int    avr_translate_IN(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[IN_Rd(opcode)];
+    int     Imm = (IN_hImm(opcode) << 4) | IN_lImm(opcode);
+    TCGv    port= tcg_const_i32(Imm);
+    TCGv    data= cpu_io[Imm];
+
+    gen_helper_inb(data, cpu_env, port);
+    tcg_gen_mov_tl(Rd, data);
+
+    return  BS_NONE;
+}
+
+/*
+    Adds one -1- to the contents of register Rd and places the result in the destination register Rd.
+    The C Flag in SREG is not affected by the operation, thus allowing the INC instruction to be used on a loop
+    counter in multiple-precision computations.
+    When operating on unsigned numbers, only BREQ and BRNE branches can be expected to perform
+    consistently. When operating on two’s complement values, all signed branches are available.
+*/
+int    avr_translate_INC(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[INC_Rd(opcode)];
+
+    tcg_gen_addi_tl(Rd, Rd, 1);
+    tcg_gen_andi_tl(Rd, Rd, 0xff);
+
+    tcg_gen_setcondi_tl(TCG_COND_EQ, cpu_Vf, Rd, 0x80);  /* cpu_Vf   = Rd == 0x80  */
+    gen_ZNSf(Rd);
+    return  BS_NONE;
+}
+
+/*
+    Jump to an address within the entire 4M (words) Program memory. See also RJMP.
+    This instruction is not available in all devices. Refer to the device specific instruction set summary.0
+*/
+int    avr_translate_JMP(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_JMP_CALL) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    gen_goto_tb(env, ctx, 0, (JMP_hImm(opcode) << 17) | JMP_lImm(opcode));
+    return  BS_BRANCH;
+}
+
+/*
+    Load one byte indirect from data space to register and stores an clear the bits in data space specified by the
+    register. The instruction can only be used towards internal SRAM.
+    The data location is pointed to by the Z (16 bits) Pointer Register in the Register File. Memory access is limited
+    to the current data segment of 64KB. To access another data segment in devices with more than 64KB data
+    space, the RAMPZ in register in the I/O area has to be changed.
+    The Z-pointer Register is left unchanged by the operation. This instruction is especially suited for clearing status
+    bits stored in SRAM.
+*/
+int    avr_translate_LAC(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_RMW) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    TCGv    Rr  = cpu_r[LAC_Rr(opcode)];
+    TCGv    addr= gen_get_zaddr();
+    TCGv    t0  = tcg_temp_new_i32();
+    TCGv    t1  = tcg_temp_new_i32();
+
+    tcg_gen_qemu_ld8u(
+                    t0, addr, DATA_INDEX);    /*  t0      = mem[addr]  */
+    tcg_gen_movi_tl(t1, 0xff);                  /*  t1      = t0 & (0xff - Rr)  */
+    tcg_gen_sub_tl(t1, t1, Rr);
+    tcg_gen_and_tl(t1, t0, t1);
+
+    tcg_gen_mov_tl(Rr, t0);                    /*  Rr      = t0  */
+    tcg_gen_qemu_st8(t1, addr, DATA_INDEX);    /*  mem[addr]   = t1  */
+
+    tcg_temp_free_i32(t1);
+    tcg_temp_free_i32(t0);
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+/*
+    Load one byte indirect from data space to register and set bits in data space specified by the register. The
+    instruction can only be used towards internal SRAM.
+    The data location is pointed to by the Z (16 bits) Pointer Register in the Register File. Memory access is limited
+    to the current data segment of 64KB. To access another data segment in devices with more than 64KB data
+    space, the RAMPZ in register in the I/O area has to be changed.
+    The Z-pointer Register is left unchanged by the operation. This instruction is especially suited for setting status
+    bits stored in SRAM.
+*/
+int    avr_translate_LAS(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_RMW) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    TCGv    Rr  = cpu_r[LAS_Rr(opcode)];
+    TCGv    addr= gen_get_zaddr();
+    TCGv    t0  = tcg_temp_new_i32();
+    TCGv    t1  = tcg_temp_new_i32();
+
+    tcg_gen_qemu_ld8u(
+                    t0, addr, DATA_INDEX);    /*  t0      = mem[addr]  */
+    tcg_gen_or_tl(t1, t0, Rr);
+
+    tcg_gen_mov_tl(Rr, t0);                    /*  Rr      = t0  */
+    tcg_gen_qemu_st8(t1, addr, DATA_INDEX);    /*  mem[addr]   = t1  */
+
+    tcg_temp_free_i32(t1);
+    tcg_temp_free_i32(t0);
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+/*
+    Load one byte indirect from data space to register and toggles bits in the data space specified by the register.
+    The instruction can only be used towards SRAM.
+    The data location is pointed to by the Z (16 bits) Pointer Register in the Register File. Memory access is limited
+    to the current data segment of 64KB. To access another data segment in devices with more than 64KB data
+    space, the RAMPZ in register in the I/O area has to be changed.
+    The Z-pointer Register is left unchanged by the operation. This instruction is especially suited for changing
+    status bits stored in SRAM.
+*/
+int    avr_translate_LAT(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_RMW) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    TCGv    Rr   = cpu_r[LAT_Rr(opcode)];
+    TCGv    addr = gen_get_zaddr();
+    TCGv    t0   = tcg_temp_new_i32();
+    TCGv    t1   = tcg_temp_new_i32();
+
+    tcg_gen_qemu_ld8u(
+                    t0, addr, DATA_INDEX);    /*  t0      = mem[addr]  */
+    tcg_gen_xor_tl(t1, t0, Rr);
+
+    tcg_gen_mov_tl(Rr, t0);                    /*  Rr      = t0  */
+    tcg_gen_qemu_st8(t1, addr, DATA_INDEX);    /*  mem[addr]   = t1  */
+
+    tcg_temp_free_i32(t1);
+    tcg_temp_free_i32(t0);
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+/*
+    Loads one byte indirect from the data space to a register. For parts with SRAM, the data space consists of the
+    Register File, I/O memory and internal SRAM (and external SRAM if applicable). For parts without SRAM, the
+    data space consists of the Register File only. In some parts the Flash Memory has been mapped to the data
+    space and can be read using this command. The EEPROM has a separate address space.
+    The data location is pointed to by the X (16 bits) Pointer Register in the Register File. Memory access is limited
+    to the current data segment of 64KB. To access another data segment in devices with more than 64KB data
+    space, the RAMPX in register in the I/O area has to be changed.
+    The X-pointer Register can either be left unchanged by the operation, or it can be post-incremented or predecremented.
+    These features are especially suited for accessing arrays, tables, and Stack Pointer usage of the
+    X-pointer Register. Note that only the low byte of the X-pointer is updated in devices with no more than 256
+    bytes data space. For such devices, the high byte of the pointer is not used by this instruction and can be used
+    for other purposes. The RAMPX Register in the I/O area is updated in parts with more than 64KB data space or
+    more than 64KB Program memory, and the increment/decrement is added to the entire 24-bit address on such
+    devices.
+    Not all variants of this instruction is available in all devices. Refer to the device specific instruction set summary.
+    In the Reduced Core tinyAVR the LD instruction can be used to achieve the same operation as LPM since the
+    program memory is mapped to the data memory space.
+*/
+int    avr_translate_LDX1(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[LDX1_Rd(opcode)];
+    TCGv    addr= gen_get_xaddr();
+
+    tcg_gen_qemu_ld8u(
+                    Rd, addr, DATA_INDEX);
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+int    avr_translate_LDX2(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[LDX2_Rd(opcode)];
+    TCGv    addr= gen_get_xaddr();
+
+    tcg_gen_qemu_ld8u(
+                    Rd, addr, DATA_INDEX);
+    tcg_gen_addi_tl(addr, addr, 1);             /*  addr    = addr + 1  */
+    
+    gen_set_xaddr(addr);
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+int    avr_translate_LDX3(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[LDX3_Rd(opcode)];
+    TCGv    addr= gen_get_xaddr();
+
+    tcg_gen_subi_tl(addr, addr, 1);             /*  addr    = addr - 1  */
+    tcg_gen_qemu_ld8u(
+                    Rd, addr, DATA_INDEX);
+    gen_set_xaddr(addr);
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+/*
+    Loads one byte indirect with or without displacement from the data space to a register. For parts with SRAM, the
+    data space consists of the Register File, I/O memory and internal SRAM (and external SRAM if applicable). For
+    parts without SRAM, the data space consists of the Register File only. In some parts the Flash Memory has
+    been mapped to the data space and can be read using this command. The EEPROM has a separate address
+    space.
+    The data location is pointed to by the Y (16 bits) Pointer Register in the Register File. Memory access is limited
+    to the current data segment of 64KB. To access another data segment in devices with more than 64KB data
+    space, the RAMPY in register in the I/O area has to be changed.
+    The Y-pointer Register can either be left unchanged by the operation, or it can be post-incremented or predecremented.
+    These features are especially suited for accessing arrays, tables, and Stack Pointer usage of the
+    Y-pointer Register. Note that only the low byte of the Y-pointer is updated in devices with no more than 256
+    bytes data space. For such devices, the high byte of the pointer is not used by this instruction and can be used
+    for other purposes. The RAMPY Register in the I/O area is updated in parts with more than 64KB data space or
+    more than 64KB Program memory, and the increment/decrement/displacement is added to the entire 24-bit
+    address on such devices.
+    Not all variants of this instruction is available in all devices. Refer to the device specific instruction set summary.
+    In the Reduced Core tinyAVR the LD instruction can be used to achieve the same operation as LPM since the
+    program memory is mapped to the data memory space.
+*/
+int    avr_translate_LDY2(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[LDY2_Rd(opcode)];
+    TCGv    addr= gen_get_yaddr();
+
+    tcg_gen_qemu_ld8u(
+                    Rd, addr, DATA_INDEX);
+    tcg_gen_addi_tl(addr, addr, 1);             /*  addr    = addr + 1  */
+
+    gen_set_yaddr(addr);
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+int    avr_translate_LDY3(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[LDY3_Rd(opcode)];
+    TCGv    addr= gen_get_yaddr();
+
+    tcg_gen_subi_tl(addr, addr, 1);             /*  addr    = addr - 1  */
+    tcg_gen_qemu_ld8u(
+                    Rd, addr, DATA_INDEX);
+    gen_set_yaddr(addr);
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+int    avr_translate_LDDY(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[LDDY_Rd(opcode)];
+    TCGv    addr= gen_get_yaddr();
+
+    tcg_gen_addi_tl(addr, addr, (LDDY_hImm(opcode) << 5) | (LDDY_mImm(opcode) << 2) | LDDY_lImm(opcode));
+                                                    /*  addr    = addr + q  */
+    tcg_gen_qemu_ld8u(
+                    Rd, addr, DATA_INDEX);
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+/*
+    Loads one byte indirect with or without displacement from the data space to a register. For parts with SRAM, the
+    data space consists of the Register File, I/O memory and internal SRAM (and external SRAM if applicable). For
+    parts without SRAM, the data space consists of the Register File only. In some parts the Flash Memory has
+    been mapped to the data space and can be read using this command. The EEPROM has a separate address
+    space.
+    The data location is pointed to by the Z (16 bits) Pointer Register in the Register File. Memory access is limited
+    to the current data segment of 64KB. To access another data segment in devices with more than 64KB data
+    space, the RAMPZ in register in the I/O area has to be changed.
+    The Z-pointer Register can either be left unchanged by the operation, or it can be post-incremented or predecremented.
+    These features are especially suited for Stack Pointer usage of the Z-pointer Register, however
+    because the Z-pointer Register can be used for indirect subroutine calls, indirect jumps and table lookup, it is
+    often more convenient to use the X or Y-pointer as a dedicated Stack Pointer. Note that only the low byte of the
+    Z-pointer is updated in devices with no more than 256 bytes data space. For such devices, the high byte of the
+    pointer is not used by this instruction and can be used for other purposes. The RAMPZ Register in the I/O area
+    is updated in parts with more than 64KB data space or more than 64KB Program memory, and the
+    increment/decrement/displacement is added to the entire 24-bit address on such devices.
+    Not all variants of this instruction is available in all devices. Refer to the device specific instruction set summary.
+    In the Reduced Core tinyAVR the LD instruction can be used to achieve the same operation as LPM since the
+    program memory is mapped to the data memory space.
+    For using the Z-pointer for table lookup in Program memory see the LPM and ELPM instructions.
+*/
+int    avr_translate_LDZ2(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd      = cpu_r[LDZ2_Rd(opcode)];
+    TCGv    addr    = gen_get_zaddr();
+
+    tcg_gen_qemu_ld8u(
+                    Rd, addr, DATA_INDEX);
+    tcg_gen_addi_tl(addr, addr, 1);             /*  addr    = addr + 1  */
+
+    gen_set_zaddr(addr);
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+int    avr_translate_LDZ3(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd      = cpu_r[LDZ3_Rd(opcode)];
+    TCGv    addr    = gen_get_zaddr();
+
+    tcg_gen_subi_tl(addr, addr, 1);             /*  addr    = addr - 1  */
+    tcg_gen_qemu_ld8u(
+                    Rd, addr, DATA_INDEX);
+
+    gen_set_zaddr(addr);
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+int    avr_translate_LDDZ(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd      = cpu_r[LDDZ_Rd(opcode)];
+    TCGv    addr    = gen_get_zaddr();
+
+    tcg_gen_addi_tl(addr, addr, (LDDZ_hImm(opcode) << 5) | (LDDZ_mImm(opcode) << 2) | LDDZ_lImm(opcode));
+                                                    /*  addr    = addr + q  */
+    tcg_gen_qemu_ld8u(
+                    Rd, addr, DATA_INDEX);
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+/*
+    Loads an 8 bit constant directly to register 16 to 31.
+*/
+int    avr_translate_LDI(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[16 + LDI_Rd(opcode)];
+    int     imm = (LDI_hImm(opcode) << 4) | LDI_lImm(opcode);
+
+    tcg_gen_movi_tl(Rd, imm);
+
+    return  BS_NONE;
+}
+
+/*
+    Loads one byte from the data space to a register. For parts with SRAM, the data space consists of the Register
+    File, I/O memory and internal SRAM (and external SRAM if applicable). For parts without SRAM, the data space
+    consists of the register file only. The EEPROM has a separate address space.
+    A 16-bit address must be supplied. Memory access is limited to the current data segment of 64KB. The LDS
+    instruction uses the RAMPD Register to access memory above 64KB. To access another data segment in
+    devices with more than 64KB data space, the RAMPD in register in the I/O area has to be changed.
+    This instruction is not available in all devices. Refer to the device specific instruction set summary.
+*/
+int    avr_translate_LDS(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[LDS_Rd(opcode)];
+    TCGv    addr= tcg_temp_new_i32();
+    TCGv    H   = cpu_rampD;
+
+    tcg_gen_mov_tl(addr, H);                     /*  addr    = H:M:L  */
+    tcg_gen_shli_tl(addr, addr, 16);
+    tcg_gen_ori_tl(addr, addr, LDS_Imm(opcode));
+
+    tcg_gen_qemu_ld8u(
+                    Rd, addr, DATA_INDEX);
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+/*
+    Loads one byte pointed to by the Z-register into the destination register Rd. This instruction features a 100%
+    space effective constant initialization or constant data fetch. The Program memory is organized in 16-bit words
+    while the Z-pointer is a byte address. Thus, the least significant bit of the Z-pointer selects either low byte (ZLSB
+    = 0) or high byte (ZLSB = 1). This instruction can address the first 64KB (32K words) of Program memory. The Zpointer
+    Register can either be left unchanged by the operation, or it can be incremented. The incrementation
+    does not apply to the RAMPZ Register.
+    Devices with Self-Programming capability can use the LPM instruction to read the Fuse and Lock bit values.
+    Refer to the device documentation for a detailed description.
+    The LPM instruction is not available in all devices. Refer to the device specific instruction set summary
+*/
+int    avr_translate_LPM1(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_LPM) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    TCGv    Rd  = cpu_r[0];
+    TCGv    addr= tcg_temp_new_i32();
+    TCGv    H   = cpu_r[31];
+    TCGv    L   = cpu_r[30];
+
+    tcg_gen_shli_tl(addr, H, 8);             /*  addr    = H:L  */
+    tcg_gen_or_tl(addr, addr, L);
+
+    tcg_gen_qemu_ld8u(Rd, addr, CODE_INDEX);    /*  Rd      = mem[addr]  */
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+int    avr_translate_LPM2(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_LPM) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    TCGv    Rd  = cpu_r[LPM2_Rd(opcode)];
+    TCGv    addr= tcg_temp_new_i32();
+    TCGv    H   = cpu_r[31];
+    TCGv    L   = cpu_r[30];
+
+    tcg_gen_shli_tl(addr, H, 8);             /*  addr    = H:L  */
+    tcg_gen_or_tl(addr, addr, L);
+
+    tcg_gen_qemu_ld8u(Rd, addr, CODE_INDEX);    /*  Rd      = mem[addr]  */
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+int    avr_translate_LPMX(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_LPMX) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    TCGv    Rd  = cpu_r[LPMX_Rd(opcode)];
+    TCGv    addr= tcg_temp_new_i32();
+    TCGv    H   = cpu_r[31];
+    TCGv    L   = cpu_r[30];
+
+    tcg_gen_shli_tl(addr, H, 8);             /*  addr    = H:L  */
+    tcg_gen_or_tl(addr, addr, L);
+
+    tcg_gen_qemu_ld8u(Rd, addr, CODE_INDEX);    /*  Rd      = mem[addr]  */
+
+    tcg_gen_addi_tl(addr, addr, 1);             /*  addr    = addr + 1  */
+
+    tcg_gen_andi_tl(L, addr, 0xff);
+
+    tcg_gen_shri_tl(addr, addr, 8);
+    tcg_gen_andi_tl(H, addr, 0xff);
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+/*
+    Shifts all bits in Rd one place to the right. Bit 7 is cleared. Bit 0 is loaded into the C Flag of the SREG. This
+    operation effectively divides an unsigned value by two. The C Flag can be used to round the result.
+*/
+int    avr_translate_LSR(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[LSR_Rd(opcode)];
+
+    tcg_gen_andi_tl(cpu_Cf, Rd, 1);
+
+    tcg_gen_shri_tl(Rd, Rd, 1);
+
+    gen_ZNSf(Rd);
+    tcg_gen_xor_tl(cpu_Vf, cpu_Nf, cpu_Cf);
+    return  BS_NONE;
+}
+
+/*
+    This instruction makes a copy of one register into another. The source register Rr is left unchanged, while the
+    destination register Rd is loaded with a copy of Rr.
+*/
+int    avr_translate_MOV(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[MOV_Rd(opcode)];
+    TCGv    Rr  = cpu_r[(MOV_hRr(opcode) << 4) | (MOV_lRr(opcode))];
+
+    tcg_gen_mov_tl(Rd, Rr);
+
+    return  BS_NONE;
+}
+
+/*
+    This instruction makes a copy of one register pair into another register pair. The source register pair Rr+1:Rr is
+    left unchanged, while the destination register pair Rd+1:Rd is loaded with a copy of Rr + 1:Rr.
+    This instruction is not available in all devices. Refer to the device specific instruction set summary.
+*/
+int    avr_translate_MOVW(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_MOVW) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    TCGv    RdL = cpu_r[MOVW_Rd(opcode)  *2 + 0];
+    TCGv    RdH = cpu_r[MOVW_Rd(opcode)  *2 + 1];
+    TCGv    RrL = cpu_r[MOVW_Rr(opcode)  *2 + 0];
+    TCGv    RrH = cpu_r[MOVW_Rr(opcode)  *2 + 1];
+
+    tcg_gen_mov_tl(RdH, RrH);
+    tcg_gen_mov_tl(RdL, RrL);
+
+    return  BS_NONE;
+}
+
+/*
+    This instruction performs 8-bit x 8-bit -> 16-bit unsigned multiplication.
+*/
+int    avr_translate_MUL(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_MUL) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    TCGv    R0  = cpu_r[0];
+    TCGv    R1  = cpu_r[1];
+    TCGv    Rd  = cpu_r[MUL_Rd(opcode)];
+    TCGv    Rr  = cpu_r[(MUL_hRr(opcode) << 4) | MUL_lRr(opcode)];
+    TCGv    R   = tcg_temp_new_i32();
+
+    tcg_gen_mul_tl(R, Rd, Rr);    /*  R   = Rd  *Rr  */
+
+    tcg_gen_mov_tl(R0, R);
+    tcg_gen_andi_tl(R0, R0, 0xff);
+    tcg_gen_shri_tl(R, R, 8);
+    tcg_gen_mov_tl(R1, R);
+
+    tcg_gen_shri_tl(cpu_Cf, R, 15);    /*  Cf  = R(16)  */
+    tcg_gen_mov_tl(cpu_Zf, R);
+
+    tcg_temp_free_i32(R);
+
+    return  BS_NONE;
+}
+
+/*
+    This instruction performs 8-bit x 8-bit -> 16-bit signed multiplication.
+*/
+int    avr_translate_MULS(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_MUL) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    TCGv    R0  = cpu_r[0];
+    TCGv    R1  = cpu_r[1];
+    TCGv    Rd  = cpu_r[16 + MULS_Rd(opcode)];
+    TCGv    Rr  = cpu_r[16 + MULS_Rr(opcode)];
+    TCGv    R   = tcg_temp_new_i32();
+
+    tcg_gen_ext8s_tl(Rd, Rd);            /*  make Rd full 32 bit signed  */
+    tcg_gen_ext8s_tl(Rr, Rr);            /*  make Rr full 32 bit signed  */
+    tcg_gen_mul_tl(R, Rd, Rr);    /*  R   = Rd  *Rr  */
+
+    tcg_gen_mov_tl(R0, R);
+    tcg_gen_andi_tl(R0, R0, 0xff);
+    tcg_gen_shri_tl(R, R, 8);
+    tcg_gen_mov_tl(R1, R);
+    tcg_gen_andi_tl(R1, R0, 0xff);
+
+    tcg_gen_shri_tl(cpu_Cf, R, 15);    /*  Cf  = R(16)  */
+    tcg_gen_mov_tl(cpu_Zf, R);
+
+    tcg_temp_free_i32(R);
+
+    return  BS_NONE;
+}
+
+/*
+    This instruction performs 8-bit x 8-bit -> 16-bit multiplication of a signed and an unsigned number.
+*/
+int    avr_translate_MULSU(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_MUL) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    TCGv    R0  = cpu_r[0];
+    TCGv    R1  = cpu_r[1];
+    TCGv    Rd  = cpu_r[16 + MULSU_Rd(opcode)];
+    TCGv    Rr  = cpu_r[16 + MULSU_Rr(opcode)];
+    TCGv    R   = tcg_temp_new_i32();
+
+    tcg_gen_ext8s_tl(Rd, Rd);            /*  make Rd full 32 bit signed  */
+    tcg_gen_mul_tl(R, Rd, Rr);    /*  R   = Rd  *Rr  */
+
+    tcg_gen_mov_tl(R0, R);
+    tcg_gen_andi_tl(R0, R0, 0xff);
+    tcg_gen_shri_tl(R, R, 8);
+    tcg_gen_mov_tl(R1, R);
+    tcg_gen_andi_tl(R1, R0, 0xff);
+
+    tcg_gen_shri_tl(cpu_Cf, R, 16);    /*  Cf  = R(16)  */
+    tcg_gen_mov_tl(cpu_Zf, R);
+
+    tcg_temp_free_i32(R);
+
+    return  BS_NONE;
+}
+
+/*
+    Replaces the contents of register Rd with its two’s complement; the value $80 is left unchanged.
+*/
+int    avr_translate_NEG(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[NEG_Rd(opcode)];
+    TCGv    R   = tcg_temp_new_i32();
+
+    tcg_gen_neg_tl(R, Rd);
+
+    tcg_gen_setcondi_tl(TCG_COND_NE, cpu_Cf, R, 0x00);  /*  Cf  = R != 0x00  */
+    tcg_gen_setcondi_tl(TCG_COND_EQ, cpu_Vf, R, 0x80);  /*  Vf  = R == 0x80  */
+    tcg_gen_not_tl(cpu_Hf, Rd);                                /*  Hf  = (~Rd | R)(3)  */
+    tcg_gen_or_tl(cpu_Hf, cpu_Hf, R);
+    tcg_gen_shri_tl(cpu_Hf, cpu_Hf, 3);
+    tcg_gen_andi_tl(cpu_Hf, cpu_Hf, 1);
+    gen_ZNSf(R);
+
+    tcg_temp_free_i32(R);
+
+    return  BS_NONE;
+}
+
+/*
+    This instruction performs a single cycle No Operation.
+*/
+int    avr_translate_NOP(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+
+    /*  NOP  */
+
+    return  BS_NONE;
+}
+
+/*
+    Performs the logical OR between the contents of register Rd and register Rr and places the result in the
+    destination register Rd.
+*/
+int    avr_translate_OR(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[OR_Rd(opcode)];
+    TCGv    Rr  = cpu_r[(OR_hRr(opcode) << 4) | (OR_lRr(opcode))];
+    TCGv    R   = tcg_temp_new_i32();
+
+    tcg_gen_or_tl(R, Rd, Rr);
+
+    tcg_gen_movi_tl(cpu_Vf, 0);
+    gen_ZNSf(R);
+
+    tcg_temp_free_i32(R);
+
+    return  BS_NONE;
+}
+
+/*
+    Performs the logical OR between the contents of register Rd and a constant and places the result in the
+    destination register Rd.
+*/
+int    avr_translate_ORI(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[ORI_Rd(opcode)];
+    int     Imm = (ORI_hImm(opcode) << 4) | (ORI_lImm(opcode));
+
+    tcg_gen_ori_tl(Rd, Rd, Imm);   /*  Rd  = Rd | Imm  */
+
+    tcg_gen_movi_tl(cpu_Vf, 0x00);          /*  Vf  = 0  */
+    gen_ZNSf(Rd);
+
+    return  BS_NONE;
+}
+
+/*
+    Stores data from register Rr in the Register File to I/O Space (Ports, Timers, Configuration Registers, etc.).  */
+int    avr_translate_OUT(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[OUT_Rd(opcode)];
+    int     Imm = (OUT_hImm(opcode) << 4) | OUT_lImm(opcode);
+    TCGv    port= tcg_const_i32(Imm);
+    TCGv    data= cpu_io[Imm];
+
+    tcg_gen_mov_tl(data, Rd);
+    gen_helper_outb(cpu_env, port, data);
+
+    return  BS_NONE;
+}
+
+/*
+    This instruction loads register Rd with a byte from the STACK. The Stack Pointer is pre-incremented by 1 before
+    the POP.
+    This instruction is not available in all devices. Refer to the device specific instruction set summary.
+*/
+int    avr_translate_POP(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[POP_Rd(opcode)];
+
+    tcg_gen_addi_tl(cpu_sp, cpu_sp, 1);
+    tcg_gen_qemu_ld8u(
+                    Rd, cpu_sp, DATA_INDEX);
+
+    return  BS_NONE;
+}
+
+/*
+    This instruction stores the contents of register Rr on the STACK. The Stack Pointer is post-decremented by 1
+    after the PUSH.
+    This instruction is not available in all devices. Refer to the device specific instruction set summary.
+*/
+int    avr_translate_PUSH(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[PUSH_Rd(opcode)];
+
+    tcg_gen_qemu_st8(
+                    Rd, cpu_sp, DATA_INDEX);
+    tcg_gen_subi_tl(cpu_sp, cpu_sp, 1);
+
+    return  BS_NONE;
+}
+
+/*
+    Relative call to an address within PC - 2K + 1 and PC + 2K (words). The return address (the instruction after the
+    RCALL) is stored onto the Stack. See also CALL. For AVR microcontrollers with Program memory not
+    exceeding 4K words (8KB) this instruction can address the entire memory from every address location. The
+    Stack Pointer uses a post-decrement scheme during RCALL.
+*/
+int    avr_translate_RCALL(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    int ret = ctx->inst[0].npc;
+    int dst = ctx->inst[0].npc + sex(RCALL_Imm(opcode), 12);
+
+    gen_push_ret(env, ret);
+
+    gen_goto_tb(env, ctx, 0, dst);
+
+    return  BS_BRANCH;
+}
+
+/*
+    Returns from subroutine. The return address is loaded from the STACK. The Stack Pointer uses a preincrement
+    scheme during RET.
+*/
+int    avr_translate_RET(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    gen_pop_ret(env, cpu_pc);
+
+    tcg_gen_exit_tb(0);
+
+    return  BS_BRANCH;
+}
+
+/*
+    Returns from interrupt. The return address is loaded from the STACK and the Global Interrupt Flag is set.
+    Note that the Status Register is not automatically stored when entering an interrupt routine, and it is not restored
+    when returning from an interrupt routine. This must be handled by the application program. The Stack Pointer
+    uses a pre-increment scheme during RETI.
+*/
+int    avr_translate_RETI(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    gen_pop_ret(env, cpu_pc);
+
+    tcg_gen_movi_tl(cpu_If, 1);
+
+    tcg_gen_exit_tb(0);
+
+    return  BS_BRANCH;
+}
+
+/*
+    Relative jump to an address within PC - 2K +1 and PC + 2K (words). For AVR microcontrollers with Program
+    memory not exceeding 4K words (8KB) this instruction can address the entire memory from every address
+    location. See also JMP.
+*/
+int    avr_translate_RJMP(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    int dst = ctx->inst[0].npc + sex(RJMP_Imm(opcode), 12);
+
+    gen_goto_tb(env, ctx, 0, dst);
+
+    return  BS_BRANCH;
+}
+
+/*
+    Shifts all bits in Rd one place to the right. The C Flag is shifted into bit 7 of Rd. Bit 0 is shifted into the C Flag.
+    This operation, combined with ASR, effectively divides multi-byte signed values by two. Combined with LSR it
+    effectively divides multi-byte unsigned values by two. The Carry Flag can be used to round the result.
+*/
+int    avr_translate_ROR(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[ROR_Rd(opcode)];
+    TCGv    t0  = tcg_temp_new_i32();
+
+    tcg_gen_shli_tl(t0, cpu_Cf, 7);
+    tcg_gen_andi_tl(cpu_Cf, Rd, 0);
+    tcg_gen_shri_tl(Rd, Rd, 1);
+    tcg_gen_or_tl(Rd, Rd, t0);
+
+    gen_ZNSf(Rd);
+    tcg_gen_xor_tl(cpu_Vf, cpu_Nf, cpu_Cf);
+
+    tcg_temp_free_i32(t0);
+
+    return  BS_NONE;
+}
+
+/*
+    Subtracts two registers and subtracts with the C Flag and places the result in the destination register Rd.
+*/
+int    avr_translate_SBC(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[SBC_Rd(opcode)];
+    TCGv    Rr  = cpu_r[(SBC_hRr(opcode) << 4) | (SBC_lRr(opcode))];
+    TCGv    R   = tcg_temp_new_i32();
+
+    /*  op  */
+    tcg_gen_sub_tl(R, Rd, Rr);             /*  R   = Rd - Rr - Cf  */
+    tcg_gen_sub_tl(R, R, cpu_Cf);
+    tcg_gen_andi_tl(R, R, 0xff);            /*  make it 8 bits  */
+
+    gen_sub_CHf(R, Rd, Rr);
+    gen_sub_Vf(R, Rd, Rr);
+    gen_ZNSf(R);
+
+    /*  R  */
+    tcg_gen_mov_tl(Rd, R);
+
+    tcg_temp_free_i32(R);
+
+    return  BS_NONE;
+}
+
+/*
+    SBCI – Subtract Immediate with Carry
+*/
+int    avr_translate_SBCI(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[SBCI_Rd(opcode)];
+    TCGv    Rr  = tcg_const_i32((SBCI_hImm(opcode) << 4) | SBCI_lImm(opcode));
+    TCGv    R   = tcg_temp_new_i32();
+
+    /*  op  */
+    tcg_gen_sub_tl(R, Rd, Rr);             /*  R   = Rd - Rr - Cf  */
+    tcg_gen_sub_tl(R, R, cpu_Cf);
+    tcg_gen_andi_tl(R, R, 0xff);            /*  make it 8 bits  */
+
+    gen_sub_CHf(R, Rd, Rr);
+    gen_sub_Vf(R, Rd, Rr);
+    gen_ZNSf(R);
+
+    /*  R  */
+    tcg_gen_mov_tl(Rd, R);
+
+    tcg_temp_free_i32(R);
+
+    return  BS_NONE;
+}
+
+/*
+    Sets a specified bit in an I/O Register. This instruction operates on the lower 32 I/O Registers – addresses 0-31.  */
+int    avr_translate_SBI(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    data    = cpu_io[SBI_Imm(opcode)];
+    TCGv    port    = tcg_const_i32(SBI_Imm(opcode));
+
+    tcg_gen_ori_tl(data, data, 1 << SBI_Bit(opcode));
+    gen_helper_outb(cpu_env, port, data);
+
+    return  BS_NONE;
+}
+
+/*
+    This instruction tests a single bit in an I/O Register and skips the next instruction if the bit is cleared. This
+    instruction operates on the lower 32 I/O Registers – addresses 0-31.  */
+int    avr_translate_SBIC(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv        io  = cpu_io[SBIC_Imm(opcode)];
+    TCGv        t0  = tcg_temp_new_i32();
+    TCGLabel   *skip= gen_new_label();
+
+    tcg_gen_movi_tl(cpu_pc, ctx->inst[1].npc);      /*  PC if next inst is skipped  */
+    tcg_gen_andi_tl(t0, io, 1 << SBIC_Bit(opcode));
+    tcg_gen_brcondi_i32(TCG_COND_EQ, t0, 0, skip);
+    tcg_gen_movi_tl(cpu_pc, ctx->inst[0].npc);      /*  PC if next inst is not skipped  */
+    gen_set_label(skip);
+
+    tcg_temp_free_i32(t0);
+
+    return  BS_BRANCH;
+}
+
+/*
+    This instruction tests a single bit in an I/O Register and skips the next instruction if the bit is set. This instruction
+    operates on the lower 32 I/O Registers – addresses 0-31.  */
+int    avr_translate_SBIS(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv        io  = cpu_io[SBIS_Imm(opcode)];
+    TCGv        t0  = tcg_temp_new_i32();
+    TCGLabel   *skip= gen_new_label();
+
+    tcg_gen_movi_tl(cpu_pc, ctx->inst[1].npc);  /*  PC if next inst is skipped  */
+    tcg_gen_andi_tl(t0, io, 1 << SBIS_Bit(opcode));
+    tcg_gen_brcondi_i32(TCG_COND_NE, t0, 0, skip);
+    tcg_gen_movi_tl(cpu_pc, ctx->inst[0].npc);  /*  PC if next inst is not skipped  */
+    gen_set_label(skip);
+
+    tcg_temp_free_i32(t0);
+
+    return  BS_BRANCH;
+}
+
+/*
+    Subtracts an immediate value (0-63) from a register pair and places the result in the register pair. This
+    instruction operates on the upper four register pairs, and is well suited for operations on the Pointer Registers.
+    This instruction is not available in all devices. Refer to the device specific instruction set summary.
+*/
+int    avr_translate_SBIW(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_ADIW_SBIW) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    TCGv    RdL = cpu_r[24 + 2 * SBIW_Rd(opcode)];
+    TCGv    RdH = cpu_r[25 + 2 * SBIW_Rd(opcode)];
+    int     Imm = (SBIW_hImm(opcode) << 4) | (SBIW_lImm(opcode));
+    TCGv    R   = tcg_temp_new_i32();
+    TCGv    Rd  = tcg_temp_new_i32();
+    TCGv    t0  = tcg_temp_new_i32();
+
+    /*  op  */
+    tcg_gen_shli_tl(Rd, RdH, 8);     /*  R = ((H << 8) | L) - Imm  */
+    tcg_gen_or_tl(Rd, RdL, Rd);
+    tcg_gen_subi_tl(R, Rd, Imm);
+    tcg_gen_andi_tl(R, R, 0xffff);/*  make it 16 bits  */
+
+    /*  Cf  */
+    tcg_gen_not_tl(t0, R);             /*  t0  = Rd & ~R  */
+    tcg_gen_and_tl(t0, Rd, t0);
+    tcg_gen_shri_tl(cpu_Cf, t0, 15);    /*  Cf  = t0(15)  */
+
+    /*  Vf  */
+    tcg_gen_not_tl(t0, Rd);            /*  t0  = ~Rd & R  */
+    tcg_gen_and_tl(t0, R, t0);
+
+    tcg_gen_shri_tl(cpu_Vf, t0, 15);    /*  Vf  = t0(15)  */
+
+    /*  Zf  */
+    tcg_gen_mov_tl(cpu_Zf, R);             /*  Zf  = R  */
+
+    /*  Nf  */
+    tcg_gen_shri_tl(cpu_Nf, R, 15);    /*  Nf  = R(15)  */
+
+    /*  Sf  */
+    tcg_gen_and_tl(cpu_Sf, cpu_Nf, cpu_Vf);/*  Sf  = Nf & Vf  */
+
+    /*  R  */
+    tcg_gen_andi_tl(RdL, R, 0xff);
+    tcg_gen_shri_tl(RdH, R, 8);
+
+    tcg_temp_free_i32(t0);
+    tcg_temp_free_i32(Rd);
+    tcg_temp_free_i32(R);
+
+    return  BS_NONE;
+}
+
+/*
+    This instruction tests a single bit in a register and skips the next instruction if the bit is cleared.
+*/
+int    avr_translate_SBRC(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv        Rr  = cpu_r[SBRC_Rr(opcode)];
+    TCGv        t0  = tcg_temp_new_i32();
+    TCGLabel   *skip= gen_new_label();
+
+    tcg_gen_movi_tl(cpu_pc, ctx->inst[1].npc);      /*  PC if next inst is skipped  */
+    tcg_gen_andi_tl(t0, Rr, 1 << SBRC_Bit(opcode));
+    tcg_gen_brcondi_i32(TCG_COND_EQ, t0, 0, skip);
+    tcg_gen_movi_tl(cpu_pc, ctx->inst[0].npc);  /*  PC if next inst is not skipped  */
+    gen_set_label(skip);
+
+    tcg_temp_free_i32(t0);
+
+    return  BS_BRANCH;
+}
+
+/*
+    This instruction tests a single bit in a register and skips the next instruction if the bit is set.
+*/
+int    avr_translate_SBRS(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv        Rr  = cpu_r[SBRS_Rr(opcode)];
+    TCGv        t0  = tcg_temp_new_i32();
+    TCGLabel   *skip= gen_new_label();
+
+    tcg_gen_movi_tl(cpu_pc, ctx->inst[1].npc);  /*  PC if next inst is skipped  */
+    tcg_gen_andi_tl(t0, Rr, 1 << SBRS_Bit(opcode));
+    tcg_gen_brcondi_i32(TCG_COND_NE, t0, 0, skip);
+    tcg_gen_movi_tl(cpu_pc, ctx->inst[0].npc);  /*  PC if next inst is not skipped  */
+    gen_set_label(skip);
+
+    tcg_temp_free_i32(t0);
+
+    return  BS_BRANCH;
+}
+
+/*
+    This instruction sets the circuit in sleep mode defined by the MCU Control Register.
+*/
+int    avr_translate_SLEEP(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    gen_helper_sleep(cpu_env);
+
+    return  BS_EXCP;
+}
+
+/*
+    SPM can be used to erase a page in the Program memory, to write a page in the Program memory (that is
+    already erased), and to set Boot Loader Lock bits. In some devices, the Program memory can be written one
+    word at a time, in other devices an entire page can be programmed simultaneously after first filling a temporary
+    page buffer. In all cases, the Program memory must be erased one page at a time. When erasing the Program
+    memory, the RAMPZ and Z-register are used as page address. When writing the Program memory, the RAMPZ
+    and Z-register are used as page or word address, and the R1:R0 register pair is used as data(1). When setting
+    the Boot Loader Lock bits, the R1:R0 register pair is used as data. Refer to the device documentation for
+    detailed description of SPM usage. This instruction can address the entire Program memory.
+    The SPM instruction is not available in all devices. Refer to the device specific instruction set summary.
+    Note: 1. R1 determines the instruction high byte, and R0 determines the instruction low byte.
+*/
+int    avr_translate_SPM(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_SPM) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    /*  TODO:   ???  */
+    return  BS_NONE;
+}
+
+int    avr_translate_SPMX(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_SPMX) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    /*  TODO:   ???  */
+    return  BS_NONE;
+}
+
+int    avr_translate_STX1(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[STX1_Rr(opcode)];
+    TCGv    addr= gen_get_xaddr();
+
+    tcg_gen_qemu_st8(
+                    Rd, addr, DATA_INDEX);
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+int    avr_translate_STX2(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[STX2_Rr(opcode)];
+    TCGv    addr= gen_get_xaddr();
+
+    tcg_gen_qemu_st8(
+                    Rd, addr, DATA_INDEX);
+    tcg_gen_addi_tl(addr, addr, 1);             /*  addr    = addr + 1  */
+    gen_set_xaddr(addr);
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+int    avr_translate_STX3(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[STX3_Rr(opcode)];
+    TCGv    addr= gen_get_xaddr();
+
+    tcg_gen_subi_tl(addr, addr, 1);             /*  addr    = addr - 1  */
+    tcg_gen_qemu_st8(
+                    Rd, addr, DATA_INDEX);
+    gen_set_xaddr(addr);
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+int    avr_translate_STY2(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[STY2_Rd(opcode)];
+    TCGv    addr= gen_get_yaddr();
+
+    tcg_gen_qemu_st8(
+                    Rd, addr, DATA_INDEX);
+    tcg_gen_addi_tl(addr, addr, 1);             /*  addr    = addr + 1  */
+    gen_set_yaddr(addr);
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+int    avr_translate_STY3(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[STY3_Rd(opcode)];
+    TCGv    addr= gen_get_yaddr();
+
+    tcg_gen_subi_tl(addr, addr, 1);             /*  addr    = addr - 1  */
+    tcg_gen_qemu_st8(
+                    Rd, addr, DATA_INDEX);
+    gen_set_yaddr(addr);
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+int    avr_translate_STDY(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[STDY_Rd(opcode)];
+    TCGv    addr= gen_get_yaddr();
+
+    tcg_gen_addi_tl(addr, addr, (STDY_hImm(opcode) << 5) | (STDY_mImm(opcode) << 2) | STDY_lImm(opcode));
+                                                    /*  addr    = addr + q  */
+    tcg_gen_qemu_st8(
+                    Rd, addr, DATA_INDEX);
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+int    avr_translate_STZ2(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd      = cpu_r[STZ2_Rd(opcode)];
+    TCGv    addr    = gen_get_zaddr();
+
+    tcg_gen_qemu_st8(
+                    Rd, addr, DATA_INDEX);
+    tcg_gen_addi_tl(addr, addr, 1);             /*  addr    = addr + 1  */
+    
+    gen_set_zaddr(addr);
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+int    avr_translate_STZ3(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd      = cpu_r[STZ3_Rd(opcode)];
+    TCGv    addr    = gen_get_zaddr();
+
+    tcg_gen_subi_tl(addr, addr, 1);             /*  addr    = addr - 1  */
+    tcg_gen_qemu_st8(
+                    Rd, addr, DATA_INDEX);
+
+    gen_set_zaddr(addr);
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+int    avr_translate_STDZ(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd      = cpu_r[STDZ_Rd(opcode)];
+    TCGv    addr    = gen_get_zaddr();
+
+    tcg_gen_addi_tl(addr, addr, (STDZ_hImm(opcode) << 5) | (STDZ_mImm(opcode) << 2) | STDZ_lImm(opcode));
+                                                    /*  addr    = addr + q  */
+    tcg_gen_qemu_st8(
+                    Rd, addr, DATA_INDEX);
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+/*
+    Stores one byte from a Register to the data space. For parts with SRAM, the data space consists of the Register
+    File, I/O memory and internal SRAM (and external SRAM if applicable). For parts without SRAM, the data space
+    consists of the Register File only. The EEPROM has a separate address space.
+    A 16-bit address must be supplied. Memory access is limited to the current data segment of 64KB. The STS
+    instruction uses the RAMPD Register to access memory above 64KB. To access another data segment in
+    devices with more than 64KB data space, the RAMPD in register in the I/O area has to be changed.
+    This instruction is not available in all devices. Refer to the device specific instruction set summary.
+*/
+int    avr_translate_STS(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[STS_Rd(opcode)];
+    TCGv    addr= tcg_temp_new_i32();
+    TCGv    H   = cpu_rampD;
+
+    tcg_gen_mov_tl(addr, H);                     /*  addr    = H:M:L  */
+    tcg_gen_shli_tl(addr, addr, 16);
+    tcg_gen_ori_tl(addr, addr, STS_Imm(opcode));
+
+    tcg_gen_qemu_st8(
+                    Rd, addr, DATA_INDEX);
+
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
+/*
+    Subtracts two registers and places the result in the destination register Rd.
+*/
+int    avr_translate_SUB(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[SUB_Rd(opcode)];
+    TCGv    Rr  = cpu_r[(SUB_hRr(opcode) << 4) | (SUB_lRr(opcode))];
+    TCGv    R   = tcg_temp_new_i32();
+
+    /*  op  */
+    tcg_gen_sub_tl(R, Rd, Rr);            /*  R   = Rd - Rr  */
+    tcg_gen_andi_tl(R, R, 0xff);          /*  make it 8 bits  */
+
+    gen_sub_CHf(R, Rd, Rr);
+    gen_sub_Vf(R, Rd, Rr);
+    gen_ZNSf(R);
+
+    /*  R  */
+    tcg_gen_mov_tl(Rd, R);
+
+    tcg_temp_free_i32(R);
+
+    return  BS_NONE;
+}
+
+/*
+    Subtracts a register and a constant and places the result in the destination register Rd. This instruction is
+    working on Register R16 to R31 and is very well suited for operations on the X, Y, and Z-pointers.
+*/
+int    avr_translate_SUBI(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[SUBI_Rd(opcode)];
+    TCGv    Rr  = tcg_const_i32((SUBI_hImm(opcode) << 4) | SUBI_lImm(opcode));
+    TCGv    R   = tcg_temp_new_i32();
+
+    /*  op  */
+    tcg_gen_sub_tl(R, Rd, Rr);
+                                                    /*  R   = Rd - Imm  */
+    tcg_gen_andi_tl(R, R, 0xff);          /*  make it 8 bits  */
+
+    gen_sub_CHf(R, Rd, Rr);
+    gen_sub_Vf(R, Rd, Rr);
+    gen_ZNSf(R);
+
+    /*  R  */
+    tcg_gen_mov_tl(Rd, R);
+
+    tcg_temp_free_i32(R);
+    tcg_temp_free_i32(Rr);
+
+    return  BS_NONE;
+}
+
+/*
+    Swaps high and low nibbles in a register.
+*/
+int    avr_translate_SWAP(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[SWAP_Rd(opcode)];
+    TCGv    t0  = tcg_temp_new_i32();
+    TCGv    t1  = tcg_temp_new_i32();
+
+    tcg_gen_andi_tl(t0, Rd, 0x0f);
+    tcg_gen_shli_tl(t0, t0, 4);
+    tcg_gen_andi_tl(t1, Rd, 0xf0);
+    tcg_gen_shri_tl(t1, t1, 4);
+    tcg_gen_or_tl(Rd, t0, t1);
+
+    tcg_temp_free_i32(t1);
+    tcg_temp_free_i32(t0);
+
+    return  BS_NONE;
+}
+
+/*
+    This instruction resets the Watchdog Timer. This instruction must be executed within a limited time given by the
+    WD prescaler. See the Watchdog Timer hardware specification.
+*/
+int    avr_translate_WDR(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+
+    gen_helper_wdr(cpu_env);
+
+    return  BS_NONE;
+}
+
+/*
+    Exchanges one byte indirect between register and data space.
+    The data location is pointed to by the Z (16 bits) Pointer Register in the Register File. Memory access is limited
+    to the current data segment of 64KB. To access another data segment in devices with more than 64KB data
+    space, the RAMPZ in register in the I/O area has to be changed.
+    The Z-pointer Register is left unchanged by the operation. This instruction is especially suited for writing/reading
+    status bits stored in SRAM.
+*/
+int    avr_translate_XCH(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_RMW) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    TCGv    Rd      = cpu_r[XCH_Rd(opcode)];
+    TCGv    t0      = tcg_temp_new_i32();
+    TCGv    addr    = gen_get_zaddr();
+
+    tcg_gen_qemu_ld8u(
+                    addr, t0, DATA_INDEX);
+    tcg_gen_qemu_st8(
+                    addr, Rd, DATA_INDEX);
+    tcg_gen_mov_tl(Rd, t0);
+
+    tcg_temp_free_i32(t0);
+    tcg_temp_free_i32(addr);
+
+    return  BS_NONE;
+}
+
diff --git a/target-avr/translate.h b/target-avr/translate.h
new file mode 100644
index 0000000..fa65406
--- /dev/null
+++ b/target-avr/translate.h
@@ -0,0 +1,123 @@ 
+/*
+ *  QEMU AVR CPU
+ *
+ *  Copyright (c) 2016 Michael Rolnik
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, see
+ *  <http://www.gnu.org/licenses/lgpl-2.1.html>
+ */
+
+#ifndef AVR_TRANSLATE_H_
+#define AVR_TRANSLATE_H_
+
+
+#include "qemu/osdep.h"
+
+#include "tcg/tcg.h"
+#include "cpu.h"
+#include "exec/exec-all.h"
+#include "disas/disas.h"
+#include "tcg-op.h"
+#include "exec/cpu_ldst.h"
+
+#include "exec/helper-proto.h"
+#include "exec/helper-gen.h"
+#include "exec/log.h"
+#include "translate-inst.h"
+
+extern TCGv_env cpu_env;
+
+extern TCGv     cpu_pc;
+
+extern TCGv     cpu_Cf;
+extern TCGv     cpu_Zf;
+extern TCGv     cpu_Nf;
+extern TCGv     cpu_Vf;
+extern TCGv     cpu_Sf;
+extern TCGv     cpu_Hf;
+extern TCGv     cpu_Tf;
+extern TCGv     cpu_If;
+
+extern TCGv     cpu_rampD;
+extern TCGv     cpu_rampX;
+extern TCGv     cpu_rampY;
+extern TCGv     cpu_rampZ;
+
+extern TCGv     cpu_io[64];
+extern TCGv     cpu_r[32];
+extern TCGv     cpu_eind;
+extern TCGv     cpu_sp;
+
+enum {
+    BS_NONE     = 0,    /*  Nothing special (none of the below          */
+    BS_STOP     = 1,    /*  We want to stop translation for any reason  */
+    BS_BRANCH   = 2,    /*  A branch condition is reached               */
+    BS_EXCP     = 3,    /*  An exception condition is reached           */
+};
+
+uint32_t    get_opcode(uint8_t const *code, unsigned bitBase, unsigned bitSize);
+
+typedef struct DisasContext DisasContext;
+typedef struct InstInfo     InstInfo;
+
+typedef int (*translate_function_t)(CPUAVRState *env, DisasContext *ctx, uint32_t opcode);
+struct InstInfo {
+    target_long                 cpc;
+    target_long                 npc;
+    uint32_t                    opcode;
+    translate_function_t        translate;
+    unsigned                    length;
+};
+
+/*This is the state at translation time.  */
+struct DisasContext {
+    struct TranslationBlock    *tb;
+
+    InstInfo                    inst[2];    /*  two consequitive instructions   */
+
+    /*Routine used to access memory */
+    int                         memidx;
+    int                         bstate;
+    int                         singlestep;
+};
+
+uint32_t    avr_decode(uint32_t pc, uint32_t *length, uint32_t opcode, translate_function_t *translate);
+
+static inline void  gen_goto_tb(CPUAVRState        *env,
+                                DisasContext       *ctx,
+                                int                 n,
+                                target_ulong        dest)
+{
+    TranslationBlock   *tb;
+
+    tb      = ctx->tb;
+
+    if ((tb->pc & TARGET_PAGE_MASK) == (dest & TARGET_PAGE_MASK)
+        &&  (ctx->singlestep == 0)) {
+        tcg_gen_goto_tb(n);
+        tcg_gen_movi_i32(cpu_pc, dest);
+        tcg_gen_exit_tb((uintptr_t)tb + n);
+    } else {
+        tcg_gen_movi_i32(cpu_pc, dest);
+
+        if (ctx->singlestep) {
+            gen_helper_debug(cpu_env);
+        }
+        tcg_gen_exit_tb(0);
+    }
+}
+
+
+#endif
+