
[RFC] x86: use globals for CPU registers

Message ID 761ea48b0909131400i33efc212nce026adb75a4f5d2@mail.gmail.com
State Superseded

Commit Message

Laurent Desnogues Sept. 13, 2009, 9 p.m. UTC
Hello,

this patch is a proposal to use globals for the 8 or 16 CPU
registers on i386 and x86_64.
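
Each of the regs[] slots becomes a TCG global, so the register
allocator can keep it in a host register instead of reloading it
from CPUState on every access.  A minimal before/after sketch,
using the names introduced by the patch:

    /* Before: every register read is a load from CPUState.  */
    tcg_gen_ld_tl(t0, cpu_env, offsetof(CPUState, regs[reg]));

    /* After: regs[reg] is a TCG global (see optimize_flags_init),
       so a read is a simple move that TCG can often elide.  */
    tcg_gen_mov_tl(t0, cpu_regs[reg]);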

I measured the improvement in the following conditions:

  - Machine:  i7 920
  - Software:  Fedora 11 x86_64, gcc 4.4.1
  - Benchmark: SPEC2000 gcc with expr.i input
  - User mode
  - i386 and x86_64 hosts and targets, with and without the patch
    (8 combinations)

The results are:

qemu-i386_on-i386          15.82user 0.05system 0:15.91elapsed
qemu-i386_on-i386-reg      15.40user 0.02system 0:15.43elapsed
qemu-i386_on-x86_64        15.65user 0.05system 0:15.71elapsed
qemu-i386_on-x86_64-reg    15.11user 0.03system 0:15.15elapsed
qemu-x86_64_on-i386        mmap: No such device or address
qemu-x86_64_on-i386-reg    mmap: No such device or address
qemu-x86_64_on-x86_64      18.42user 0.07system 0:18.49elapsed
qemu-x86_64_on-x86_64-reg  13.22user 0.06system 0:13.31elapsed

Given my lack of knowledge of system QEMU, I will leave it to
someone else to measure the speedup.

A previous version of this patch, which only handled the i386
target, was tested by Malc, who got a speedup running OpenSSL on
his G4.  It was also sent to Fabrice, who asked me to send it to
the mailing list.

The usage of globals is controlled by USE_REGS so that reviewers
can quickly test the benefit (or the lack of it).

Comments are welcome (except about the obvious presence of //
comments, which are only temporary).  I need to optimize a few
things once I'm sure the temporaries (cpu_tmp0, ...) are not used
outside of the modified functions.  The x86_64 part was coded in
a hurry and may be buggy.


Laurent

Signed-off-by: Laurent Desnogues <laurent.desnogues@gmail.com>

Comments

Aurelien Jarno Sept. 26, 2009, 10:41 p.m. UTC | #1
On Sun, Sep 13, 2009 at 11:00:08PM +0200, Laurent Desnogues wrote:
> Hello,
> 
> this patch is a proposal to use globals for the 8 or 16 CPU
> registers on i386 and x86_64.
> 
> I measured the improvement in the following conditions:
> 
>   - Machine:  i7 920
>   - Software:  Fedora 11 x86_64, gcc 4.4.1
>   - Benchmark: SPEC2000 gcc with expr.i input
>   - User mode
>   - i386 and x86_64 hosts and targets, with and without the patch
>     (8 combinations)
> 
> The results are:
> 
> qemu-i386_on-i386          15.82user 0.05system 0:15.91elapsed
> qemu-i386_on-i386-reg      15.40user 0.02system 0:15.43elapsed
> qemu-i386_on-x86_64        15.65user 0.05system 0:15.71elapsed
> qemu-i386_on-x86_64-reg    15.11user 0.03system 0:15.15elapsed
> qemu-x86_64_on-i386        mmap: No such device or address
> qemu-x86_64_on-i386-reg    mmap: No such device or address
> qemu-x86_64_on-x86_64      18.42user 0.07system 0:18.49elapsed
> qemu-x86_64_on-x86_64-reg  13.22user 0.06system 0:13.31elapsed
> 
> Given my lack of knowledge of system QEMU, I will leave it to
> someone else to measure the speedup.

I'll try to provide benchmarks later.

> A previous version of this patch, which only handled the i386
> target, was tested by Malc, who got a speedup running OpenSSL on
> his G4.  It was also sent to Fabrice, who asked me to send it to
> the mailing list.
> 
> The usage of globals is controlled by USE_REGS so that reviewers
> can quickly test the benefit (or the lack of it).
> 
> Comments are welcome (except about the obvious presence of //
> comments, which are only temporary).  I need to optimize a few
> things once I'm sure the temporaries (cpu_tmp0, ...) are not used
> outside of the modified functions.  The x86_64 part was coded in
> a hurry and may be buggy.
>

It basically looks good. Please find my comments inline.

> Laurent
> 
> Signed-off-by: Laurent Desnogues <laurent.desnogues@gmail.com>

> diff --git a/target-i386/translate.c b/target-i386/translate.c
> index 335fc08..dc2fcde 100644
> --- a/target-i386/translate.c
> +++ b/target-i386/translate.c
> @@ -58,10 +58,15 @@
>  
>  //#define MACRO_TEST   1
>  
> +#define USE_REGS
> +
>  /* global register indexes */
>  static TCGv_ptr cpu_env;
>  static TCGv cpu_A0, cpu_cc_src, cpu_cc_dst, cpu_cc_tmp;
>  static TCGv_i32 cpu_cc_op;
> +#ifdef USE_REGS
> +static TCGv cpu_regs[CPU_NB_REGS];
> +#endif
>  /* local temps */
>  static TCGv cpu_T[2], cpu_T3;
>  /* local register indexes (only used inside old micro ops) */
> @@ -269,70 +274,95 @@ static inline void gen_op_andl_A0_ffff(void)
>  #define REG_LH_OFFSET 4
>  #endif
>  
> +#ifdef USE_REGS
> +#ifdef TARGET_X86_64
> +/* #warning NYI */
> +#endif
> +
>  static inline void gen_op_mov_reg_v(int ot, int reg, TCGv t0)
>  {
> +    TCGv tmp;
> +
>      switch(ot) {
>      case OT_BYTE:
> +        tmp = tcg_temp_new();
> +        tcg_gen_andi_tl(tmp, t0, 0xff);

           tcg_gen_ext8u_tl(tmp, t0); ?
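
           With that suggestion applied, the low-byte store would
           read as follows (a sketch only; tcg_gen_ext8u_tl falls
           back to an andi with 0xff on hosts without a native
           zero-extension op):

               tmp = tcg_temp_new();
               tcg_gen_ext8u_tl(tmp, t0);       /* tmp = t0 & 0xff */
               tcg_gen_andi_tl(cpu_regs[reg], cpu_regs[reg], ~0xff);
               tcg_gen_or_tl(cpu_regs[reg], cpu_regs[reg], tmp);
               tcg_temp_free(tmp);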

>          if (reg < 4 X86_64_DEF( || reg >= 8 || x86_64_hregs)) {
> -            tcg_gen_st8_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_B_OFFSET);
> +            tcg_gen_andi_tl(cpu_regs[reg], cpu_regs[reg], ~0xff);
> +            tcg_gen_or_tl(cpu_regs[reg], cpu_regs[reg], tmp);
> +            //tcg_gen_st8_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_B_OFFSET);
>          } else {
> -            tcg_gen_st8_tl(t0, cpu_env, offsetof(CPUState, regs[reg - 4]) + REG_H_OFFSET);
> +            tcg_gen_shli_tl(tmp, tmp, 8);
> +            tcg_gen_andi_tl(cpu_regs[reg - 4], cpu_regs[reg - 4], ~0xff00);
> +            tcg_gen_or_tl(cpu_regs[reg - 4], cpu_regs[reg - 4], tmp);
> +            //tcg_gen_st8_tl(t0, cpu_env, offsetof(CPUState, regs[reg - 4]) + REG_H_OFFSET);
>          }
> +        tcg_temp_free(tmp);
>          break;
>      case OT_WORD:
> -        tcg_gen_st16_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_W_OFFSET);
> +        tmp = tcg_temp_new();
> +        tcg_gen_andi_tl(tmp, t0, 0xffff);

           tcg_gen_ext16u_tl(tmp, t0); ?

> +        tcg_gen_andi_tl(cpu_regs[reg], cpu_regs[reg], ~0xffff);
> +        tcg_gen_or_tl(cpu_regs[reg], cpu_regs[reg], tmp);
> +        tcg_temp_free(tmp);
> +        //tcg_gen_st16_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_W_OFFSET);
>          break;
>  #ifdef TARGET_X86_64
>      case OT_LONG:
> -        tcg_gen_st32_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
>          /* high part of register set to zero */
> -        tcg_gen_movi_tl(cpu_tmp0, 0);
> -        tcg_gen_st32_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]) + REG_LH_OFFSET);
> +        tcg_gen_ext32u_tl(cpu_regs[reg], t0);
> +        //tcg_gen_st32_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
> +        /* high part of register set to zero */
> +        //tcg_gen_movi_tl(cpu_tmp0, 0);
> +        //tcg_gen_st32_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]) + REG_LH_OFFSET);
>          break;
>      default:
>      case OT_QUAD:
> -        tcg_gen_st_tl(t0, cpu_env, offsetof(CPUState, regs[reg]));
> +        tcg_gen_mov_tl(cpu_regs[reg], t0);
> +        //tcg_gen_st_tl(t0, cpu_env, offsetof(CPUState, regs[reg]));
>          break;
>  #else
>      default:
>      case OT_LONG:
> -        tcg_gen_st32_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
> +        tcg_gen_mov_tl(cpu_regs[reg], t0);
> +        //tcg_gen_st32_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
>          break;
>  #endif

	tcg_gen_ext32u_tl(cpu_regs[reg], t0) is equivalent to
	tcg_gen_mov_tl(cpu_regs[reg], t0) if TARGET_LONG_BITS == 32, i.e.
	if !TARGET_X86_64. This means the OT_LONG case can now be common,
	with the #ifdef only around OT_QUAD.
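
	A sketch of the merged tail of the switch under that
	observation (untested, not part of the patch as posted):

	    case OT_LONG:
	        /* a plain move when TARGET_LONG_BITS == 32 */
	        tcg_gen_ext32u_tl(cpu_regs[reg], t0);
	        break;
	    default:
	#ifdef TARGET_X86_64
	    case OT_QUAD:
	#endif
	        tcg_gen_mov_tl(cpu_regs[reg], t0);
	        break;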

>      }
>  }
>  
> -static inline void gen_op_mov_reg_T0(int ot, int reg)
> -{
> -    gen_op_mov_reg_v(ot, reg, cpu_T[0]);
> -}
> -
> -static inline void gen_op_mov_reg_T1(int ot, int reg)
> -{
> -    gen_op_mov_reg_v(ot, reg, cpu_T[1]);
> -}
> -
>  static inline void gen_op_mov_reg_A0(int size, int reg)
>  {
> +    TCGv tmp;
> +
>      switch(size) {
>      case 0:
> -        tcg_gen_st16_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]) + REG_W_OFFSET);
> +        tmp = tcg_temp_new();
> +        tcg_gen_andi_tl(tmp, cpu_A0, 0xffff);

           tcg_gen_ext16u_tl(tmp, cpu_A0); ?

> +        tcg_gen_andi_tl(cpu_regs[reg], cpu_regs[reg], ~0xffff);
> +        tcg_gen_or_tl(cpu_regs[reg], cpu_regs[reg], tmp);
> +        tcg_temp_free(tmp);
> +        //tcg_gen_st16_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]) + REG_W_OFFSET);
>          break;
>  #ifdef TARGET_X86_64
>      case 1:
> -        tcg_gen_st32_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
>          /* high part of register set to zero */
> -        tcg_gen_movi_tl(cpu_tmp0, 0);
> -        tcg_gen_st32_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]) + REG_LH_OFFSET);
> +        tcg_gen_ext32u_tl(cpu_regs[reg], cpu_A0);
> +        //tcg_gen_st32_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
> +        /* high part of register set to zero */
> +        //tcg_gen_movi_tl(cpu_tmp0, 0);
> +        //tcg_gen_st32_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]) + REG_LH_OFFSET);
>          break;
>      default:
>      case 2:
> -        tcg_gen_st_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]));
> +        tcg_gen_mov_tl(cpu_regs[reg], cpu_A0);
> +        //tcg_gen_st_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]));
>          break;
>  #else
>      default:
>      case 1:
> -        tcg_gen_st32_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
> +        tcg_gen_mov_tl(cpu_regs[reg], cpu_A0);
> +        //tcg_gen_st32_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
>          break;
>  #endif

	Same comment as above applies, to share more code between i386
	and x86_64.

>      }
> @@ -345,59 +375,213 @@ static inline void gen_op_mov_v_reg(int ot, TCGv t0, int reg)
>          if (reg < 4 X86_64_DEF( || reg >= 8 || x86_64_hregs)) {
>              goto std_case;
>          } else {
> -            tcg_gen_ld8u_tl(t0, cpu_env, offsetof(CPUState, regs[reg - 4]) + REG_H_OFFSET);
> +            tcg_gen_shri_tl(t0, cpu_regs[reg - 4], 8);
> +            tcg_gen_andi_tl(t0, t0, 0xff);

               tcg_gen_ext8u_tl(t0, t0) ?

> +            //tcg_gen_ld8u_tl(t0, cpu_env, offsetof(CPUState, regs[reg - 4]) + REG_H_OFFSET);
>          }
>          break;
>      default:
>      std_case:
> -        tcg_gen_ld_tl(t0, cpu_env, offsetof(CPUState, regs[reg]));
> +        tcg_gen_mov_tl(t0, cpu_regs[reg]);
> +        //tcg_gen_ld_tl(t0, cpu_env, offsetof(CPUState, regs[reg]));
>          break;
>      }
>  }
>  
> -static inline void gen_op_mov_TN_reg(int ot, int t_index, int reg)
> +static inline void gen_op_movl_A0_reg(int reg)
>  {
> -    gen_op_mov_v_reg(ot, cpu_T[t_index], reg);
> +    tcg_gen_mov_tl(cpu_A0, cpu_regs[reg]);
> +    //tcg_gen_ld32u_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
>  }
>  
> -static inline void gen_op_movl_A0_reg(int reg)
> +static inline void gen_op_add_reg_im(int size, int reg, int32_t val)
>  {
> -    tcg_gen_ld32u_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
> +    TCGv tmp;
> +
> +    switch(size) {
> +    case 0:
> +        // TODO optimize
> +        tmp = tcg_temp_new();
> +        tcg_gen_mov_tl(cpu_tmp0, cpu_regs[reg]);
> +        tcg_gen_addi_tl(cpu_tmp0, cpu_tmp0, val);
> +        tcg_gen_andi_tl(tmp, cpu_tmp0, 0xffff);
> +        tcg_gen_andi_tl(cpu_regs[reg], cpu_regs[reg], ~0xffff);
> +        tcg_gen_or_tl(cpu_regs[reg], cpu_regs[reg], tmp);
> +        tcg_temp_free(tmp);
> +        //tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
> +        //tcg_gen_addi_tl(cpu_tmp0, cpu_tmp0, val);
> +        //tcg_gen_st16_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]) + REG_W_OFFSET);
> +        break;
> +    case 1:
> +        // TODO optimize
> +        tcg_gen_mov_tl(cpu_tmp0, cpu_regs[reg]);
> +        tcg_gen_addi_tl(cpu_tmp0, cpu_tmp0, val);
> +#ifdef TARGET_X86_64
> +        tcg_gen_andi_tl(cpu_tmp0, cpu_tmp0, 0xffffffff);

           tcg_gen_ext32u_tl(cpu_tmp0, cpu_tmp0) would automatically be
           removed at compile time if !TARGET_X86_64
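
           The whole case 1 could then be written without an #ifdef,
           e.g. (sketch):

               tcg_gen_addi_tl(cpu_tmp0, cpu_regs[reg], val);
               /* compiles to a plain move when TARGET_LONG_BITS == 32 */
               tcg_gen_ext32u_tl(cpu_regs[reg], cpu_tmp0);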

> +#endif
> +        tcg_gen_mov_tl(cpu_regs[reg], cpu_tmp0);
> +        //tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
> +        //tcg_gen_addi_tl(cpu_tmp0, cpu_tmp0, val);
> +        //#ifdef TARGET_X86_64
> +        //tcg_gen_andi_tl(cpu_tmp0, cpu_tmp0, 0xffffffff);
> +        //#endif
> +        //tcg_gen_st_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
> +        break;
> +#ifdef TARGET_X86_64
> +    case 2:
> +        tcg_gen_addi_tl(cpu_regs[reg], cpu_regs[reg], val);
> +        //tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
> +        //tcg_gen_addi_tl(cpu_tmp0, cpu_tmp0, val);
> +        //tcg_gen_st_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
> +        break;
> +#endif
> +    }
>  }
>  
> -static inline void gen_op_addl_A0_im(int32_t val)
> +static inline void gen_op_add_reg_T0(int size, int reg)
>  {
> -    tcg_gen_addi_tl(cpu_A0, cpu_A0, val);
> +    TCGv tmp;
> +
> +    switch(size) {
> +    case 0:
> +        // TODO optimize
> +        tmp = tcg_temp_new();
> +        tcg_gen_mov_tl(cpu_tmp0, cpu_regs[reg]);
> +        tcg_gen_add_tl(cpu_tmp0, cpu_tmp0, cpu_T[0]);
> +        tcg_gen_andi_tl(tmp, cpu_tmp0, 0xffff);
> +        tcg_gen_andi_tl(cpu_regs[reg], cpu_regs[reg], ~0xffff);
> +        tcg_gen_or_tl(cpu_regs[reg], cpu_regs[reg], tmp);
> +        tcg_temp_free(tmp);
> +        //tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
> +        //tcg_gen_add_tl(cpu_tmp0, cpu_tmp0, cpu_T[0]);
> +        //tcg_gen_st16_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]) + REG_W_OFFSET);
> +        break;
> +    case 1:
> +        // TODO optimize
> +        tcg_gen_mov_tl(cpu_tmp0, cpu_regs[reg]);
> +        tcg_gen_add_tl(cpu_tmp0, cpu_tmp0, cpu_T[0]);
>  #ifdef TARGET_X86_64
> -    tcg_gen_andi_tl(cpu_A0, cpu_A0, 0xffffffff);
> +        tcg_gen_andi_tl(cpu_tmp0, cpu_tmp0, 0xffffffff);

           same here

> +#endif
> +        tcg_gen_mov_tl(cpu_regs[reg], cpu_tmp0);
> +        //tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
> +        //tcg_gen_add_tl(cpu_tmp0, cpu_tmp0, cpu_T[0]);
> +        //#ifdef TARGET_X86_64
> +        //tcg_gen_andi_tl(cpu_tmp0, cpu_tmp0, 0xffffffff);
> +        //#endif
> +        //tcg_gen_st_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
> +        break;
> +#ifdef TARGET_X86_64
> +    case 2:
> +        tcg_gen_add_tl(cpu_regs[reg], cpu_regs[reg], cpu_T[0]);
> +        //tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
> +        //tcg_gen_add_tl(cpu_tmp0, cpu_tmp0, cpu_T[0]);
> +        //tcg_gen_st_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
> +        break;
>  #endif
> +    }
>  }
>  
> -#ifdef TARGET_X86_64
> -static inline void gen_op_addq_A0_im(int64_t val)
> +static inline void gen_op_addl_A0_reg_sN(int shift, int reg)
>  {
> -    tcg_gen_addi_tl(cpu_A0, cpu_A0, val);
> +    tcg_gen_mov_tl(cpu_tmp0, cpu_regs[reg]);
> +    if (shift != 0)
> +        tcg_gen_shli_tl(cpu_tmp0, cpu_tmp0, shift);
> +    tcg_gen_add_tl(cpu_A0, cpu_A0, cpu_tmp0);
> +#ifdef TARGET_X86_64
> +    tcg_gen_andi_tl(cpu_A0, cpu_A0, 0xffffffff);
> +#endif
> +
> +    //tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
> +    //if (shift != 0)
> +    //    tcg_gen_shli_tl(cpu_tmp0, cpu_tmp0, shift);
> +    //tcg_gen_add_tl(cpu_A0, cpu_A0, cpu_tmp0);
> +    //#ifdef TARGET_X86_64
> +    //tcg_gen_andi_tl(cpu_A0, cpu_A0, 0xffffffff);
> +    //#endif
>  }
> +
> +#else
> +
> +static inline void gen_op_mov_reg_v(int ot, int reg, TCGv t0)
> +{
> +    switch(ot) {
> +    case OT_BYTE:
> +        if (reg < 4 X86_64_DEF( || reg >= 8 || x86_64_hregs)) {
> +            tcg_gen_st8_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_B_OFFSET);
> +        } else {
> +            tcg_gen_st8_tl(t0, cpu_env, offsetof(CPUState, regs[reg - 4]) + REG_H_OFFSET);
> +        }
> +        break;
> +    case OT_WORD:
> +        tcg_gen_st16_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_W_OFFSET);
> +        break;
> +#ifdef TARGET_X86_64
> +    case OT_LONG:
> +        tcg_gen_st32_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
> +        /* high part of register set to zero */
> +        tcg_gen_movi_tl(cpu_tmp0, 0);
> +        tcg_gen_st32_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]) + REG_LH_OFFSET);
> +        break;
> +    default:
> +    case OT_QUAD:
> +        tcg_gen_st_tl(t0, cpu_env, offsetof(CPUState, regs[reg]));
> +        break;
> +#else
> +    default:
> +    case OT_LONG:
> +        tcg_gen_st32_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
> +        break;
>  #endif
> -    
> -static void gen_add_A0_im(DisasContext *s, int val)
> +    }
> +}
> +
> +static inline void gen_op_mov_reg_A0(int size, int reg)
>  {
> +    switch(size) {
> +    case 0:
> +        tcg_gen_st16_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]) + REG_W_OFFSET);
> +        break;
>  #ifdef TARGET_X86_64
> -    if (CODE64(s))
> -        gen_op_addq_A0_im(val);
> -    else
> +    case 1:
> +        tcg_gen_st32_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
> +        /* high part of register set to zero */
> +        tcg_gen_movi_tl(cpu_tmp0, 0);
> +        tcg_gen_st32_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]) + REG_LH_OFFSET);
> +        break;
> +    default:
> +    case 2:
> +        tcg_gen_st_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]));
> +        break;
> +#else
> +    default:
> +    case 1:
> +        tcg_gen_st32_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
> +        break;
>  #endif
> -        gen_op_addl_A0_im(val);
> +    }
>  }
>  
> -static inline void gen_op_addl_T0_T1(void)
> +static inline void gen_op_mov_v_reg(int ot, TCGv t0, int reg)
>  {
> -    tcg_gen_add_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
> +    switch(ot) {
> +    case OT_BYTE:
> +        if (reg < 4 X86_64_DEF( || reg >= 8 || x86_64_hregs)) {
> +            goto std_case;
> +        } else {
> +            tcg_gen_ld8u_tl(t0, cpu_env, offsetof(CPUState, regs[reg - 4]) + REG_H_OFFSET);
> +        }
> +        break;
> +    default:
> +    std_case:
> +        tcg_gen_ld_tl(t0, cpu_env, offsetof(CPUState, regs[reg]));
> +        break;
> +    }
>  }
>  
> -static inline void gen_op_jmp_T0(void)
> +static inline void gen_op_movl_A0_reg(int reg)
>  {
> -    tcg_gen_st_tl(cpu_T[0], cpu_env, offsetof(CPUState, eip));
> +    tcg_gen_ld32u_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
>  }
>  
>  static inline void gen_op_add_reg_im(int size, int reg, int32_t val)
> @@ -452,15 +636,10 @@ static inline void gen_op_add_reg_T0(int size, int reg)
>      }
>  }
>  
> -static inline void gen_op_set_cc_op(int32_t val)
> -{
> -    tcg_gen_movi_i32(cpu_cc_op, val);
> -}
> -
>  static inline void gen_op_addl_A0_reg_sN(int shift, int reg)
>  {
>      tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
> -    if (shift != 0) 
> +    if (shift != 0)
>          tcg_gen_shli_tl(cpu_tmp0, cpu_tmp0, shift);
>      tcg_gen_add_tl(cpu_A0, cpu_A0, cpu_tmp0);
>  #ifdef TARGET_X86_64
> @@ -468,6 +647,63 @@ static inline void gen_op_addl_A0_reg_sN(int shift, int reg)
>  #endif
>  }
>  
> +#endif
> +
> +static inline void gen_op_mov_reg_T0(int ot, int reg)
> +{
> +    gen_op_mov_reg_v(ot, reg, cpu_T[0]);
> +}
> +
> +static inline void gen_op_mov_reg_T1(int ot, int reg)
> +{
> +    gen_op_mov_reg_v(ot, reg, cpu_T[1]);
> +}
> +
> +static inline void gen_op_mov_TN_reg(int ot, int t_index, int reg)
> +{
> +    gen_op_mov_v_reg(ot, cpu_T[t_index], reg);
> +}
> +
> +static inline void gen_op_addl_A0_im(int32_t val)
> +{
> +    tcg_gen_addi_tl(cpu_A0, cpu_A0, val);
> +#ifdef TARGET_X86_64
> +    tcg_gen_andi_tl(cpu_A0, cpu_A0, 0xffffffff);
> +#endif
> +}
> +
> +#ifdef TARGET_X86_64
> +static inline void gen_op_addq_A0_im(int64_t val)
> +{
> +    tcg_gen_addi_tl(cpu_A0, cpu_A0, val);
> +}
> +#endif
> +    
> +static void gen_add_A0_im(DisasContext *s, int val)
> +{
> +#ifdef TARGET_X86_64
> +    if (CODE64(s))
> +        gen_op_addq_A0_im(val);
> +    else
> +#endif
> +        gen_op_addl_A0_im(val);
> +}
> +
> +static inline void gen_op_addl_T0_T1(void)
> +{
> +    tcg_gen_add_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
> +}
> +
> +static inline void gen_op_jmp_T0(void)
> +{
> +    tcg_gen_st_tl(cpu_T[0], cpu_env, offsetof(CPUState, eip));
> +}
> +
> +static inline void gen_op_set_cc_op(int32_t val)
> +{
> +    tcg_gen_movi_i32(cpu_cc_op, val);
> +}
> +
>  static inline void gen_op_movl_A0_seg(int reg)
>  {
>      tcg_gen_ld32u_tl(cpu_A0, cpu_env, offsetof(CPUState, segs[reg].base) + REG_L_OFFSET);
> @@ -496,13 +732,21 @@ static inline void gen_op_addq_A0_seg(int reg)
>  
>  static inline void gen_op_movq_A0_reg(int reg)
>  {
> +#ifdef USE_REGS
> +    tcg_gen_mov_tl(cpu_A0, cpu_regs[reg]);
> +#else
>      tcg_gen_ld_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]));
> +#endif
>  }
>  
>  static inline void gen_op_addq_A0_reg_sN(int shift, int reg)
>  {
> +#ifdef USE_REGS
> +    tcg_gen_mov_tl(cpu_tmp0, cpu_regs[reg]);
> +#else
>      tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
> -    if (shift != 0) 
> +#endif
> +    if (shift != 0)
>          tcg_gen_shli_tl(cpu_tmp0, cpu_tmp0, shift);
>      tcg_gen_add_tl(cpu_A0, cpu_A0, cpu_tmp0);
>  }
> @@ -701,14 +945,22 @@ static void gen_exts(int ot, TCGv reg)
>  
>  static inline void gen_op_jnz_ecx(int size, int label1)
>  {
> +#ifdef USE_REGS
> +    tcg_gen_mov_tl(cpu_tmp0, cpu_regs[R_ECX]);
> +#else
>      tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[R_ECX]));
> +#endif
>      gen_extu(size + 1, cpu_tmp0);
>      tcg_gen_brcondi_tl(TCG_COND_NE, cpu_tmp0, 0, label1);
>  }
>  
>  static inline void gen_op_jz_ecx(int size, int label1)
>  {
> +#ifdef USE_REGS
> +    tcg_gen_mov_tl(cpu_tmp0, cpu_regs[R_ECX]);
> +#else
>      tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[R_ECX]));
> +#endif
>      gen_extu(size + 1, cpu_tmp0);
>      tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_tmp0, 0, label1);
>  }
> @@ -4834,7 +5086,11 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
>                  rm = 0; /* avoid warning */
>              }
>              label1 = gen_new_label();
> +#ifdef USE_REGS
> +            tcg_gen_mov_tl(t2, cpu_regs[R_EAX]);
> +#else
>              tcg_gen_ld_tl(t2, cpu_env, offsetof(CPUState, regs[R_EAX]));
> +#endif
>              tcg_gen_sub_tl(t2, t2, t0);
>              gen_extu(ot, t2);
>              tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, label1);
> @@ -5409,7 +5665,11 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
>              val = ldub_code(s->pc++);
>              tcg_gen_movi_tl(cpu_T3, val);
>          } else {
> +#ifdef USE_REGS
> +            tcg_gen_mov_tl(cpu_T3, cpu_regs[R_ECX]);
> +#else
>              tcg_gen_ld_tl(cpu_T3, cpu_env, offsetof(CPUState, regs[R_ECX]));
> +#endif
>          }
>          gen_shiftd_rm_T1_T3(s, ot, opreg, op);
>          break;
> @@ -6317,10 +6577,18 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
>                  /* XXX: specific Intel behaviour ? */
>                  l1 = gen_new_label();
>                  gen_jcc1(s, s->cc_op, b ^ 1, l1);
> +#ifdef USE_REGS
> +                tcg_gen_mov_tl(cpu_regs[reg], t0);
> +#else
>                  tcg_gen_st32_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
> +#endif
>                  gen_set_label(l1);
> +#ifdef USE_REGS
> +                tcg_gen_ext32u_tl(cpu_regs[reg], cpu_regs[reg]);
> +#else
>                  tcg_gen_movi_tl(cpu_tmp0, 0);
>                  tcg_gen_st32_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]) + REG_LH_OFFSET);
> +#endif
>              } else
>  #endif
>              {
> @@ -7588,6 +7856,60 @@ void optimize_flags_init(void)
>      cpu_cc_tmp = tcg_global_mem_new(TCG_AREG0, offsetof(CPUState, cc_tmp),
>                                      "cc_tmp");
>  
> +#ifdef USE_REGS
> +#ifdef TARGET_X86_64
> +    cpu_regs[R_EAX] = tcg_global_mem_new_i64(TCG_AREG0,
> +                                             offsetof(CPUState, regs[R_EAX]), "rax");
> +    cpu_regs[R_ECX] = tcg_global_mem_new_i64(TCG_AREG0,
> +                                             offsetof(CPUState, regs[R_ECX]), "rcx");
> +    cpu_regs[R_EDX] = tcg_global_mem_new_i64(TCG_AREG0,
> +                                             offsetof(CPUState, regs[R_EDX]), "rdx");
> +    cpu_regs[R_EBX] = tcg_global_mem_new_i64(TCG_AREG0,
> +                                             offsetof(CPUState, regs[R_EBX]), "rbx");
> +    cpu_regs[R_ESP] = tcg_global_mem_new_i64(TCG_AREG0,
> +                                             offsetof(CPUState, regs[R_ESP]), "rsp");
> +    cpu_regs[R_EBP] = tcg_global_mem_new_i64(TCG_AREG0,
> +                                             offsetof(CPUState, regs[R_EBP]), "rbp");
> +    cpu_regs[R_ESI] = tcg_global_mem_new_i64(TCG_AREG0,
> +                                             offsetof(CPUState, regs[R_ESI]), "rsi");
> +    cpu_regs[R_EDI] = tcg_global_mem_new_i64(TCG_AREG0,
> +                                             offsetof(CPUState, regs[R_EDI]), "rdi");
> +    cpu_regs[8] = tcg_global_mem_new_i64(TCG_AREG0,
> +                                         offsetof(CPUState, regs[8]), "r8");
> +    cpu_regs[9] = tcg_global_mem_new_i64(TCG_AREG0,
> +                                          offsetof(CPUState, regs[9]), "r9");
> +    cpu_regs[10] = tcg_global_mem_new_i64(TCG_AREG0,
> +                                          offsetof(CPUState, regs[10]), "r10");
> +    cpu_regs[11] = tcg_global_mem_new_i64(TCG_AREG0,
> +                                          offsetof(CPUState, regs[11]), "r11");
> +    cpu_regs[12] = tcg_global_mem_new_i64(TCG_AREG0,
> +                                          offsetof(CPUState, regs[12]), "r12");
> +    cpu_regs[13] = tcg_global_mem_new_i64(TCG_AREG0,
> +                                          offsetof(CPUState, regs[13]), "r13");
> +    cpu_regs[14] = tcg_global_mem_new_i64(TCG_AREG0,
> +                                          offsetof(CPUState, regs[14]), "r14");
> +    cpu_regs[15] = tcg_global_mem_new_i64(TCG_AREG0,
> +                                          offsetof(CPUState, regs[15]), "r15");
> +#else
> +    cpu_regs[R_EAX] = tcg_global_mem_new_i32(TCG_AREG0,
> +                                             offsetof(CPUState, regs[R_EAX]), "eax");
> +    cpu_regs[R_ECX] = tcg_global_mem_new_i32(TCG_AREG0,
> +                                             offsetof(CPUState, regs[R_ECX]), "ecx");
> +    cpu_regs[R_EDX] = tcg_global_mem_new_i32(TCG_AREG0,
> +                                             offsetof(CPUState, regs[R_EDX]), "edx");
> +    cpu_regs[R_EBX] = tcg_global_mem_new_i32(TCG_AREG0,
> +                                             offsetof(CPUState, regs[R_EBX]), "ebx");
> +    cpu_regs[R_ESP] = tcg_global_mem_new_i32(TCG_AREG0,
> +                                             offsetof(CPUState, regs[R_ESP]), "esp");
> +    cpu_regs[R_EBP] = tcg_global_mem_new_i32(TCG_AREG0,
> +                                             offsetof(CPUState, regs[R_EBP]), "ebp");
> +    cpu_regs[R_ESI] = tcg_global_mem_new_i32(TCG_AREG0,
> +                                             offsetof(CPUState, regs[R_ESI]), "esi");
> +    cpu_regs[R_EDI] = tcg_global_mem_new_i32(TCG_AREG0,
> +                                             offsetof(CPUState, regs[R_EDI]), "edi");
> +#endif
> +#endif
> +
>      /* register helpers */
>  #define GEN_HELPER 2
>  #include "helper.h"
Aurelien Jarno Sept. 27, 2009, 12:51 p.m. UTC | #2
On Sun, Sep 13, 2009 at 11:00:08PM +0200, Laurent Desnogues wrote:
> Hello,
> 
> this patch is a proposal to use globals for the 8 or 16 CPU
> registers on i386 and x86_64.
> 
> I measured the improvement in the following conditions:
> 
>   - Machine:  i7 920
>   - Software:  Fedora 11 x86_64, gcc 4.4.1
>   - Benchmark: SPEC2000 gcc with expr.i input
>   - User mode
>   - i386 and x86_64 hosts and targets, with and without the patch
>     (8 combinations)
> 
> The results are:
> 
> qemu-i386_on-i386          15.82user 0.05system 0:15.91elapsed
> qemu-i386_on-i386-reg      15.40user 0.02system 0:15.43elapsed
> qemu-i386_on-x86_64        15.65user 0.05system 0:15.71elapsed
> qemu-i386_on-x86_64-reg    15.11user 0.03system 0:15.15elapsed
> qemu-x86_64_on-i386        mmap: No such device or address
> qemu-x86_64_on-i386-reg    mmap: No such device or address
> qemu-x86_64_on-x86_64      18.42user 0.07system 0:18.49elapsed
> qemu-x86_64_on-x86_64-reg  13.22user 0.06system 0:13.31elapsed
> 
> Given my lack of knowledge of system QEMU, I will leave it to
> someone else to measure the speedup.
> 

Here are my benchmarks in system mode. I measured the improvement in the
following conditions:
  - Machine: Intel Core 2 Quad Q9450 with 8GB RAM
  - Intel Speedstep disabled
  - Host: Debian Sid amd64 (kernel 2.6.31, gcc 4.3, glibc 2.9)
  - Benchmark: boot time, compilation of a small C++ application
  - Guests: Debian Lenny amd64 and i386, preloaded into memory
    with readahead.

+-------------+-------+-------+-----------+------------------+
| qemu target | guest | patch | boot time | compilation time |
+-------------+-------+-------+-----------+------------------+
|        i386 |  i386 |    no |       55s |            19.9s |
|        i386 |  i386 |   yes |       54s |            19.8s |
|      x86_64 |  i386 |    no |       67s |            24.6s |
|      x86_64 |  i386 |   yes |       63s |            23.3s |
|      x86_64 | amd64 |    no |       61s |            19.9s |
|      x86_64 | amd64 |   yes |       58s |            18.7s |
+-------------+-------+-------+-----------+------------------+

There is always a measurable gain, and even a significant one for
qemu-system-x86_64. This is consistent with the user-mode benchmarks.
The gains are probably smaller in system mode due to the chosen
benchmarks.

I think this patch is clearly a step in the right direction towards
clean target-i386 code, and I will be happy to commit it once it has
been cleaned up.

Patch

diff --git a/target-i386/translate.c b/target-i386/translate.c
index 335fc08..dc2fcde 100644
--- a/target-i386/translate.c
+++ b/target-i386/translate.c
@@ -58,10 +58,15 @@ 
 
 //#define MACRO_TEST   1
 
+#define USE_REGS
+
 /* global register indexes */
 static TCGv_ptr cpu_env;
 static TCGv cpu_A0, cpu_cc_src, cpu_cc_dst, cpu_cc_tmp;
 static TCGv_i32 cpu_cc_op;
+#ifdef USE_REGS
+static TCGv cpu_regs[CPU_NB_REGS];
+#endif
 /* local temps */
 static TCGv cpu_T[2], cpu_T3;
 /* local register indexes (only used inside old micro ops) */
@@ -269,70 +274,95 @@  static inline void gen_op_andl_A0_ffff(void)
 #define REG_LH_OFFSET 4
 #endif
 
+#ifdef USE_REGS
+#ifdef TARGET_X86_64
+/* #warning NYI */
+#endif
+
 static inline void gen_op_mov_reg_v(int ot, int reg, TCGv t0)
 {
+    TCGv tmp;
+
     switch(ot) {
     case OT_BYTE:
+        tmp = tcg_temp_new();
+        tcg_gen_andi_tl(tmp, t0, 0xff);
         if (reg < 4 X86_64_DEF( || reg >= 8 || x86_64_hregs)) {
-            tcg_gen_st8_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_B_OFFSET);
+            tcg_gen_andi_tl(cpu_regs[reg], cpu_regs[reg], ~0xff);
+            tcg_gen_or_tl(cpu_regs[reg], cpu_regs[reg], tmp);
+            //tcg_gen_st8_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_B_OFFSET);
         } else {
-            tcg_gen_st8_tl(t0, cpu_env, offsetof(CPUState, regs[reg - 4]) + REG_H_OFFSET);
+            tcg_gen_shli_tl(tmp, tmp, 8);
+            tcg_gen_andi_tl(cpu_regs[reg - 4], cpu_regs[reg - 4], ~0xff00);
+            tcg_gen_or_tl(cpu_regs[reg - 4], cpu_regs[reg - 4], tmp);
+            //tcg_gen_st8_tl(t0, cpu_env, offsetof(CPUState, regs[reg - 4]) + REG_H_OFFSET);
         }
+        tcg_temp_free(tmp);
         break;
     case OT_WORD:
-        tcg_gen_st16_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_W_OFFSET);
+        tmp = tcg_temp_new();
+        tcg_gen_andi_tl(tmp, t0, 0xffff);
+        tcg_gen_andi_tl(cpu_regs[reg], cpu_regs[reg], ~0xffff);
+        tcg_gen_or_tl(cpu_regs[reg], cpu_regs[reg], tmp);
+        tcg_temp_free(tmp);
+        //tcg_gen_st16_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_W_OFFSET);
         break;
 #ifdef TARGET_X86_64
     case OT_LONG:
-        tcg_gen_st32_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
         /* high part of register set to zero */
-        tcg_gen_movi_tl(cpu_tmp0, 0);
-        tcg_gen_st32_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]) + REG_LH_OFFSET);
+        tcg_gen_ext32u_tl(cpu_regs[reg], t0);
+        //tcg_gen_st32_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
+        /* high part of register set to zero */
+        //tcg_gen_movi_tl(cpu_tmp0, 0);
+        //tcg_gen_st32_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]) + REG_LH_OFFSET);
         break;
     default:
     case OT_QUAD:
-        tcg_gen_st_tl(t0, cpu_env, offsetof(CPUState, regs[reg]));
+        tcg_gen_mov_tl(cpu_regs[reg], t0);
+        //tcg_gen_st_tl(t0, cpu_env, offsetof(CPUState, regs[reg]));
         break;
 #else
     default:
     case OT_LONG:
-        tcg_gen_st32_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
+        tcg_gen_mov_tl(cpu_regs[reg], t0);
+        //tcg_gen_st32_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
         break;
 #endif
     }
 }
 
-static inline void gen_op_mov_reg_T0(int ot, int reg)
-{
-    gen_op_mov_reg_v(ot, reg, cpu_T[0]);
-}
-
-static inline void gen_op_mov_reg_T1(int ot, int reg)
-{
-    gen_op_mov_reg_v(ot, reg, cpu_T[1]);
-}
-
 static inline void gen_op_mov_reg_A0(int size, int reg)
 {
+    TCGv tmp;
+
     switch(size) {
     case 0:
-        tcg_gen_st16_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]) + REG_W_OFFSET);
+        tmp = tcg_temp_new();
+        tcg_gen_andi_tl(tmp, cpu_A0, 0xffff);
+        tcg_gen_andi_tl(cpu_regs[reg], cpu_regs[reg], ~0xffff);
+        tcg_gen_or_tl(cpu_regs[reg], cpu_regs[reg], tmp);
+        tcg_temp_free(tmp);
+        //tcg_gen_st16_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]) + REG_W_OFFSET);
         break;
 #ifdef TARGET_X86_64
     case 1:
-        tcg_gen_st32_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
         /* high part of register set to zero */
-        tcg_gen_movi_tl(cpu_tmp0, 0);
-        tcg_gen_st32_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]) + REG_LH_OFFSET);
+        tcg_gen_ext32u_tl(cpu_regs[reg], cpu_A0);
+        //tcg_gen_st32_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
+        /* high part of register set to zero */
+        //tcg_gen_movi_tl(cpu_tmp0, 0);
+        //tcg_gen_st32_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]) + REG_LH_OFFSET);
         break;
     default:
     case 2:
-        tcg_gen_st_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]));
+        tcg_gen_mov_tl(cpu_regs[reg], cpu_A0);
+        //tcg_gen_st_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]));
         break;
 #else
     default:
     case 1:
-        tcg_gen_st32_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
+        tcg_gen_mov_tl(cpu_regs[reg], cpu_A0);
+        //tcg_gen_st32_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
         break;
 #endif
     }
@@ -345,59 +375,213 @@  static inline void gen_op_mov_v_reg(int ot, TCGv t0, int reg)
         if (reg < 4 X86_64_DEF( || reg >= 8 || x86_64_hregs)) {
             goto std_case;
         } else {
-            tcg_gen_ld8u_tl(t0, cpu_env, offsetof(CPUState, regs[reg - 4]) + REG_H_OFFSET);
+            tcg_gen_shri_tl(t0, cpu_regs[reg - 4], 8);
+            tcg_gen_andi_tl(t0, t0, 0xff);
+            //tcg_gen_ld8u_tl(t0, cpu_env, offsetof(CPUState, regs[reg - 4]) + REG_H_OFFSET);
         }
         break;
     default:
     std_case:
-        tcg_gen_ld_tl(t0, cpu_env, offsetof(CPUState, regs[reg]));
+        tcg_gen_mov_tl(t0, cpu_regs[reg]);
+        //tcg_gen_ld_tl(t0, cpu_env, offsetof(CPUState, regs[reg]));
         break;
     }
 }
 
-static inline void gen_op_mov_TN_reg(int ot, int t_index, int reg)
+static inline void gen_op_movl_A0_reg(int reg)
 {
-    gen_op_mov_v_reg(ot, cpu_T[t_index], reg);
+    tcg_gen_mov_tl(cpu_A0, cpu_regs[reg]);
+    //tcg_gen_ld32u_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
 }
 
-static inline void gen_op_movl_A0_reg(int reg)
+static inline void gen_op_add_reg_im(int size, int reg, int32_t val)
 {
-    tcg_gen_ld32u_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
+    TCGv tmp;
+
+    switch(size) {
+    case 0:
+        // TODO optimize
+        tmp = tcg_temp_new();
+        tcg_gen_mov_tl(cpu_tmp0, cpu_regs[reg]);
+        tcg_gen_addi_tl(cpu_tmp0, cpu_tmp0, val);
+        tcg_gen_andi_tl(tmp, cpu_tmp0, 0xffff);
+        tcg_gen_andi_tl(cpu_regs[reg], cpu_regs[reg], ~0xffff);
+        tcg_gen_or_tl(cpu_regs[reg], cpu_regs[reg], tmp);
+        tcg_temp_free(tmp);
+        //tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
+        //tcg_gen_addi_tl(cpu_tmp0, cpu_tmp0, val);
+        //tcg_gen_st16_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]) + REG_W_OFFSET);
+        break;
+    case 1:
+        // TODO optimize
+        tcg_gen_mov_tl(cpu_tmp0, cpu_regs[reg]);
+        tcg_gen_addi_tl(cpu_tmp0, cpu_tmp0, val);
+#ifdef TARGET_X86_64
+        tcg_gen_andi_tl(cpu_tmp0, cpu_tmp0, 0xffffffff);
+#endif
+        tcg_gen_mov_tl(cpu_regs[reg], cpu_tmp0);
+        //tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
+        //tcg_gen_addi_tl(cpu_tmp0, cpu_tmp0, val);
+        //#ifdef TARGET_X86_64
+        //tcg_gen_andi_tl(cpu_tmp0, cpu_tmp0, 0xffffffff);
+        //#endif
+        //tcg_gen_st_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
+        break;
+#ifdef TARGET_X86_64
+    case 2:
+        tcg_gen_addi_tl(cpu_regs[reg], cpu_regs[reg], val);
+        //tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
+        //tcg_gen_addi_tl(cpu_tmp0, cpu_tmp0, val);
+        //tcg_gen_st_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
+        break;
+#endif
+    }
 }
 
-static inline void gen_op_addl_A0_im(int32_t val)
+static inline void gen_op_add_reg_T0(int size, int reg)
 {
-    tcg_gen_addi_tl(cpu_A0, cpu_A0, val);
+    TCGv tmp;
+
+    switch(size) {
+    case 0:
+        // TODO optimize
+        tmp = tcg_temp_new();
+        tcg_gen_mov_tl(cpu_tmp0, cpu_regs[reg]);
+        tcg_gen_add_tl(cpu_tmp0, cpu_tmp0, cpu_T[0]);
+        tcg_gen_andi_tl(tmp, cpu_tmp0, 0xffff);
+        tcg_gen_andi_tl(cpu_regs[reg], cpu_regs[reg], ~0xffff);
+        tcg_gen_or_tl(cpu_regs[reg], cpu_regs[reg], tmp);
+        tcg_temp_free(tmp);
+        //tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
+        //tcg_gen_add_tl(cpu_tmp0, cpu_tmp0, cpu_T[0]);
+        //tcg_gen_st16_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]) + REG_W_OFFSET);
+        break;
+    case 1:
+        // TODO optimize
+        tcg_gen_mov_tl(cpu_tmp0, cpu_regs[reg]);
+        tcg_gen_add_tl(cpu_tmp0, cpu_tmp0, cpu_T[0]);
 #ifdef TARGET_X86_64
-    tcg_gen_andi_tl(cpu_A0, cpu_A0, 0xffffffff);
+        tcg_gen_andi_tl(cpu_tmp0, cpu_tmp0, 0xffffffff);
+#endif
+        tcg_gen_mov_tl(cpu_regs[reg], cpu_tmp0);
+        //tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
+        //tcg_gen_add_tl(cpu_tmp0, cpu_tmp0, cpu_T[0]);
+        //#ifdef TARGET_X86_64
+        //tcg_gen_andi_tl(cpu_tmp0, cpu_tmp0, 0xffffffff);
+        //#endif
+        //tcg_gen_st_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
+        break;
+#ifdef TARGET_X86_64
+    case 2:
+        tcg_gen_add_tl(cpu_regs[reg], cpu_regs[reg], cpu_T[0]);
+        //tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
+        //tcg_gen_add_tl(cpu_tmp0, cpu_tmp0, cpu_T[0]);
+        //tcg_gen_st_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
+        break;
 #endif
+    }
 }
 
-#ifdef TARGET_X86_64
-static inline void gen_op_addq_A0_im(int64_t val)
+static inline void gen_op_addl_A0_reg_sN(int shift, int reg)
 {
-    tcg_gen_addi_tl(cpu_A0, cpu_A0, val);
+    tcg_gen_mov_tl(cpu_tmp0, cpu_regs[reg]);
+    if (shift != 0)
+        tcg_gen_shli_tl(cpu_tmp0, cpu_tmp0, shift);
+    tcg_gen_add_tl(cpu_A0, cpu_A0, cpu_tmp0);
+#ifdef TARGET_X86_64
+    tcg_gen_andi_tl(cpu_A0, cpu_A0, 0xffffffff);
+#endif
+
+    //tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
+    //if (shift != 0)
+    //    tcg_gen_shli_tl(cpu_tmp0, cpu_tmp0, shift);
+    //tcg_gen_add_tl(cpu_A0, cpu_A0, cpu_tmp0);
+    //#ifdef TARGET_X86_64
+    //tcg_gen_andi_tl(cpu_A0, cpu_A0, 0xffffffff);
+    //#endif
 }
+
+#else
+
+static inline void gen_op_mov_reg_v(int ot, int reg, TCGv t0)
+{
+    switch(ot) {
+    case OT_BYTE:
+        if (reg < 4 X86_64_DEF( || reg >= 8 || x86_64_hregs)) {
+            tcg_gen_st8_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_B_OFFSET);
+        } else {
+            tcg_gen_st8_tl(t0, cpu_env, offsetof(CPUState, regs[reg - 4]) + REG_H_OFFSET);
+        }
+        break;
+    case OT_WORD:
+        tcg_gen_st16_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_W_OFFSET);
+        break;
+#ifdef TARGET_X86_64
+    case OT_LONG:
+        tcg_gen_st32_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
+        /* high part of register set to zero */
+        tcg_gen_movi_tl(cpu_tmp0, 0);
+        tcg_gen_st32_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]) + REG_LH_OFFSET);
+        break;
+    default:
+    case OT_QUAD:
+        tcg_gen_st_tl(t0, cpu_env, offsetof(CPUState, regs[reg]));
+        break;
+#else
+    default:
+    case OT_LONG:
+        tcg_gen_st32_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
+        break;
 #endif
-    
-static void gen_add_A0_im(DisasContext *s, int val)
+    }
+}
+
+static inline void gen_op_mov_reg_A0(int size, int reg)
 {
+    switch(size) {
+    case 0:
+        tcg_gen_st16_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]) + REG_W_OFFSET);
+        break;
 #ifdef TARGET_X86_64
-    if (CODE64(s))
-        gen_op_addq_A0_im(val);
-    else
+    case 1:
+        tcg_gen_st32_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
+        /* high part of register set to zero */
+        tcg_gen_movi_tl(cpu_tmp0, 0);
+        tcg_gen_st32_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]) + REG_LH_OFFSET);
+        break;
+    default:
+    case 2:
+        tcg_gen_st_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]));
+        break;
+#else
+    default:
+    case 1:
+        tcg_gen_st32_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
+        break;
 #endif
-        gen_op_addl_A0_im(val);
+    }
 }
 
-static inline void gen_op_addl_T0_T1(void)
+static inline void gen_op_mov_v_reg(int ot, TCGv t0, int reg)
 {
-    tcg_gen_add_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+    switch(ot) {
+    case OT_BYTE:
+        if (reg < 4 X86_64_DEF( || reg >= 8 || x86_64_hregs)) {
+            goto std_case;
+        } else {
+            tcg_gen_ld8u_tl(t0, cpu_env, offsetof(CPUState, regs[reg - 4]) + REG_H_OFFSET);
+        }
+        break;
+    default:
+    std_case:
+        tcg_gen_ld_tl(t0, cpu_env, offsetof(CPUState, regs[reg]));
+        break;
+    }
 }
 
-static inline void gen_op_jmp_T0(void)
+static inline void gen_op_movl_A0_reg(int reg)
 {
-    tcg_gen_st_tl(cpu_T[0], cpu_env, offsetof(CPUState, eip));
+    tcg_gen_ld32u_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
 }
 
 static inline void gen_op_add_reg_im(int size, int reg, int32_t val)
@@ -452,15 +636,10 @@  static inline void gen_op_add_reg_T0(int size, int reg)
     }
 }
 
-static inline void gen_op_set_cc_op(int32_t val)
-{
-    tcg_gen_movi_i32(cpu_cc_op, val);
-}
-
 static inline void gen_op_addl_A0_reg_sN(int shift, int reg)
 {
     tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
-    if (shift != 0) 
+    if (shift != 0)
         tcg_gen_shli_tl(cpu_tmp0, cpu_tmp0, shift);
     tcg_gen_add_tl(cpu_A0, cpu_A0, cpu_tmp0);
 #ifdef TARGET_X86_64
@@ -468,6 +647,63 @@  static inline void gen_op_addl_A0_reg_sN(int shift, int reg)
 #endif
 }
 
+#endif
+
+static inline void gen_op_mov_reg_T0(int ot, int reg)
+{
+    gen_op_mov_reg_v(ot, reg, cpu_T[0]);
+}
+
+static inline void gen_op_mov_reg_T1(int ot, int reg)
+{
+    gen_op_mov_reg_v(ot, reg, cpu_T[1]);
+}
+
+static inline void gen_op_mov_TN_reg(int ot, int t_index, int reg)
+{
+    gen_op_mov_v_reg(ot, cpu_T[t_index], reg);
+}
+
+static inline void gen_op_addl_A0_im(int32_t val)
+{
+    tcg_gen_addi_tl(cpu_A0, cpu_A0, val);
+#ifdef TARGET_X86_64
+    tcg_gen_andi_tl(cpu_A0, cpu_A0, 0xffffffff);
+#endif
+}
+
+#ifdef TARGET_X86_64
+static inline void gen_op_addq_A0_im(int64_t val)
+{
+    tcg_gen_addi_tl(cpu_A0, cpu_A0, val);
+}
+#endif
+    
+static void gen_add_A0_im(DisasContext *s, int val)
+{
+#ifdef TARGET_X86_64
+    if (CODE64(s))
+        gen_op_addq_A0_im(val);
+    else
+#endif
+        gen_op_addl_A0_im(val);
+}
+
+static inline void gen_op_addl_T0_T1(void)
+{
+    tcg_gen_add_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+}
+
+static inline void gen_op_jmp_T0(void)
+{
+    tcg_gen_st_tl(cpu_T[0], cpu_env, offsetof(CPUState, eip));
+}
+
+static inline void gen_op_set_cc_op(int32_t val)
+{
+    tcg_gen_movi_i32(cpu_cc_op, val);
+}
+
 static inline void gen_op_movl_A0_seg(int reg)
 {
     tcg_gen_ld32u_tl(cpu_A0, cpu_env, offsetof(CPUState, segs[reg].base) + REG_L_OFFSET);
@@ -496,13 +732,21 @@  static inline void gen_op_addq_A0_seg(int reg)
 
 static inline void gen_op_movq_A0_reg(int reg)
 {
+#ifdef USE_REGS
+    tcg_gen_mov_tl(cpu_A0, cpu_regs[reg]);
+#else
     tcg_gen_ld_tl(cpu_A0, cpu_env, offsetof(CPUState, regs[reg]));
+#endif
 }
 
 static inline void gen_op_addq_A0_reg_sN(int shift, int reg)
 {
+#ifdef USE_REGS
+    tcg_gen_mov_tl(cpu_tmp0, cpu_regs[reg]);
+#else
     tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]));
-    if (shift != 0) 
+#endif
+    if (shift != 0)
         tcg_gen_shli_tl(cpu_tmp0, cpu_tmp0, shift);
     tcg_gen_add_tl(cpu_A0, cpu_A0, cpu_tmp0);
 }
@@ -701,14 +945,22 @@  static void gen_exts(int ot, TCGv reg)
 
 static inline void gen_op_jnz_ecx(int size, int label1)
 {
+#ifdef USE_REGS
+    tcg_gen_mov_tl(cpu_tmp0, cpu_regs[R_ECX]);
+#else
     tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[R_ECX]));
+#endif
     gen_extu(size + 1, cpu_tmp0);
     tcg_gen_brcondi_tl(TCG_COND_NE, cpu_tmp0, 0, label1);
 }
 
 static inline void gen_op_jz_ecx(int size, int label1)
 {
+#ifdef USE_REGS
+    tcg_gen_mov_tl(cpu_tmp0, cpu_regs[R_ECX]);
+#else
     tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[R_ECX]));
+#endif
     gen_extu(size + 1, cpu_tmp0);
     tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_tmp0, 0, label1);
 }
@@ -4834,7 +5086,11 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
                 rm = 0; /* avoid warning */
             }
             label1 = gen_new_label();
+#ifdef USE_REGS
+            tcg_gen_mov_tl(t2, cpu_regs[R_EAX]);
+#else
             tcg_gen_ld_tl(t2, cpu_env, offsetof(CPUState, regs[R_EAX]));
+#endif
             tcg_gen_sub_tl(t2, t2, t0);
             gen_extu(ot, t2);
             tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, label1);
@@ -5409,7 +5665,11 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
             val = ldub_code(s->pc++);
             tcg_gen_movi_tl(cpu_T3, val);
         } else {
+#ifdef USE_REGS
+            tcg_gen_mov_tl(cpu_T3, cpu_regs[R_ECX]);
+#else
             tcg_gen_ld_tl(cpu_T3, cpu_env, offsetof(CPUState, regs[R_ECX]));
+#endif
         }
         gen_shiftd_rm_T1_T3(s, ot, opreg, op);
         break;
@@ -6317,10 +6577,18 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
                 /* XXX: specific Intel behaviour ? */
                 l1 = gen_new_label();
                 gen_jcc1(s, s->cc_op, b ^ 1, l1);
+#ifdef USE_REGS
+                tcg_gen_mov_tl(cpu_regs[reg], t0);
+#else
                 tcg_gen_st32_tl(t0, cpu_env, offsetof(CPUState, regs[reg]) + REG_L_OFFSET);
+#endif
                 gen_set_label(l1);
+#ifdef USE_REGS
+                tcg_gen_ext32u_tl(cpu_regs[reg], cpu_regs[reg]);
+#else
                 tcg_gen_movi_tl(cpu_tmp0, 0);
                 tcg_gen_st32_tl(cpu_tmp0, cpu_env, offsetof(CPUState, regs[reg]) + REG_LH_OFFSET);
+#endif
             } else
 #endif
             {
@@ -7588,6 +7856,60 @@  void optimize_flags_init(void)
     cpu_cc_tmp = tcg_global_mem_new(TCG_AREG0, offsetof(CPUState, cc_tmp),
                                     "cc_tmp");
 
+#ifdef USE_REGS
+#ifdef TARGET_X86_64
+    cpu_regs[R_EAX] = tcg_global_mem_new_i64(TCG_AREG0,
+                                             offsetof(CPUState, regs[R_EAX]), "rax");
+    cpu_regs[R_ECX] = tcg_global_mem_new_i64(TCG_AREG0,
+                                             offsetof(CPUState, regs[R_ECX]), "rcx");
+    cpu_regs[R_EDX] = tcg_global_mem_new_i64(TCG_AREG0,
+                                             offsetof(CPUState, regs[R_EDX]), "rdx");
+    cpu_regs[R_EBX] = tcg_global_mem_new_i64(TCG_AREG0,
+                                             offsetof(CPUState, regs[R_EBX]), "rbx");
+    cpu_regs[R_ESP] = tcg_global_mem_new_i64(TCG_AREG0,
+                                             offsetof(CPUState, regs[R_ESP]), "rsp");
+    cpu_regs[R_EBP] = tcg_global_mem_new_i64(TCG_AREG0,
+                                             offsetof(CPUState, regs[R_EBP]), "rbp");
+    cpu_regs[R_ESI] = tcg_global_mem_new_i64(TCG_AREG0,
+                                             offsetof(CPUState, regs[R_ESI]), "rsi");
+    cpu_regs[R_EDI] = tcg_global_mem_new_i64(TCG_AREG0,
+                                             offsetof(CPUState, regs[R_EDI]), "rdi");
+    cpu_regs[8] = tcg_global_mem_new_i64(TCG_AREG0,
+                                         offsetof(CPUState, regs[8]), "r8");
+    cpu_regs[9] = tcg_global_mem_new_i64(TCG_AREG0,
+                                          offsetof(CPUState, regs[9]), "r9");
+    cpu_regs[10] = tcg_global_mem_new_i64(TCG_AREG0,
+                                          offsetof(CPUState, regs[10]), "r10");
+    cpu_regs[11] = tcg_global_mem_new_i64(TCG_AREG0,
+                                          offsetof(CPUState, regs[11]), "r11");
+    cpu_regs[12] = tcg_global_mem_new_i64(TCG_AREG0,
+                                          offsetof(CPUState, regs[12]), "r12");
+    cpu_regs[13] = tcg_global_mem_new_i64(TCG_AREG0,
+                                          offsetof(CPUState, regs[13]), "r13");
+    cpu_regs[14] = tcg_global_mem_new_i64(TCG_AREG0,
+                                          offsetof(CPUState, regs[14]), "r14");
+    cpu_regs[15] = tcg_global_mem_new_i64(TCG_AREG0,
+                                          offsetof(CPUState, regs[15]), "r15");
+#else
+    cpu_regs[R_EAX] = tcg_global_mem_new_i32(TCG_AREG0,
+                                             offsetof(CPUState, regs[R_EAX]), "eax");
+    cpu_regs[R_ECX] = tcg_global_mem_new_i32(TCG_AREG0,
+                                             offsetof(CPUState, regs[R_ECX]), "ecx");
+    cpu_regs[R_EDX] = tcg_global_mem_new_i32(TCG_AREG0,
+                                             offsetof(CPUState, regs[R_EDX]), "edx");
+    cpu_regs[R_EBX] = tcg_global_mem_new_i32(TCG_AREG0,
+                                             offsetof(CPUState, regs[R_EBX]), "ebx");
+    cpu_regs[R_ESP] = tcg_global_mem_new_i32(TCG_AREG0,
+                                             offsetof(CPUState, regs[R_ESP]), "esp");
+    cpu_regs[R_EBP] = tcg_global_mem_new_i32(TCG_AREG0,
+                                             offsetof(CPUState, regs[R_EBP]), "ebp");
+    cpu_regs[R_ESI] = tcg_global_mem_new_i32(TCG_AREG0,
+                                             offsetof(CPUState, regs[R_ESI]), "esi");
+    cpu_regs[R_EDI] = tcg_global_mem_new_i32(TCG_AREG0,
+                                             offsetof(CPUState, regs[R_EDI]), "edi");
+#endif
+#endif
+
     /* register helpers */
 #define GEN_HELPER 2
 #include "helper.h"