Patchwork [5/7] target-mips: Use TCG registers for the FPU.

login
register
mail settings
Submitter Richard Henderson
Date Sept. 17, 2012, 9:35 p.m.
Message ID <1347917713-23343-6-git-send-email-rth@twiddle.net>
Download mbox | patch
Permalink /patch/184569/
State New
Headers show

Comments

Richard Henderson - Sept. 17, 2012, 9:35 p.m.
With normal FP, this doesn't have much effect on the generated code,
because most of the FP operations are not CONST/PURE, and so we spill
registers in about the same frequency as the explicit load/stores.

But with Loongson multimedia instructions, which are all integral and
whose helpers are in fact CONST+PURE, this greatly improves the code.

Rather than over-use the deposit operation, we create TCG registers for
both the 64-bit FPU register as a whole and the two 32-bit halves.  We
only ever reference the whole register or the two half registers in any
one TB, so there's no problem with aliasing.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 target-mips/translate.c | 141 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 97 insertions(+), 44 deletions(-)
Aurelien Jarno - Sept. 18, 2012, 4:39 p.m.
On Mon, Sep 17, 2012 at 02:35:11PM -0700, Richard Henderson wrote:
> With normal FP, this doesn't have much effect on the generated code,
> because most of the FP operations are not CONST/PURE, and so we spill
> registers in about the same frequency as the explicit load/stores.
> 
> But with Loongson multimedia instructions, which are all integral and
> whose helpers are in fact CONST+PURE, this greatly improves the code.
> 
> Rather than over-use the deposit operation, we create TCG registers for
> both the 64-bit FPU register as a whole and the two 32-bit halves.  We
> only ever reference the whole register or the two half registers in any
> one TB, so there's no problem with aliasing.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  target-mips/translate.c | 141 +++++++++++++++++++++++++++++++++---------------
>  1 file changed, 97 insertions(+), 44 deletions(-)
> 
> diff --git a/target-mips/translate.c b/target-mips/translate.c
> index b4301e9..df92cec 100644
> --- a/target-mips/translate.c
> +++ b/target-mips/translate.c
> @@ -479,6 +479,12 @@ static TCGv cpu_dspctrl, btarget, bcond;
>  static TCGv_i32 hflags;
>  static TCGv_i32 fpu_fcr0, fpu_fcr31;
>  
> +/* FPU registers.  These alias, but we'll only use one or the other
> +   in any one TB based on MIPS_HFLAG_F64.  */
> +static TCGv_i32 fpu_f32[32];
> +static TCGv_i32 fpu_fh32[32];
> +static TCGv_i64 fpu_f64[32];
> +
>  static uint32_t gen_opc_hflags[OPC_BUF_SIZE];
>  
>  #include "gen-icount.h"
> @@ -545,26 +551,45 @@ enum {
>      BS_EXCP     = 3, /* We reached an exception condition */
>  };
>  
> -static const char *regnames[] =
> -    { "r0", "at", "v0", "v1", "a0", "a1", "a2", "a3",
> -      "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
> -      "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
> -      "t8", "t9", "k0", "k1", "gp", "sp", "s8", "ra", };
> +static const char * const regnames[] = {
> +    "r0", "at", "v0", "v1", "a0", "a1", "a2", "a3",
> +    "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
> +    "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
> +    "t8", "t9", "k0", "k1", "gp", "sp", "s8", "ra",
> +};
> +
> +static const char * const regnames_HI[] = {
> +    "HI0", "HI1", "HI2", "HI3",
> +};
>  
> -static const char *regnames_HI[] =
> -    { "HI0", "HI1", "HI2", "HI3", };
> +static const char * const regnames_LO[] = {
> +    "LO0", "LO1", "LO2", "LO3",
> +};
>  
> -static const char *regnames_LO[] =
> -    { "LO0", "LO1", "LO2", "LO3", };
> +static const char * const regnames_ACX[] = {
> +    "ACX0", "ACX1", "ACX2", "ACX3",
> +};
>  
> -static const char *regnames_ACX[] =
> -    { "ACX0", "ACX1", "ACX2", "ACX3", };
> +static const char * const fregnames[] = {
> +    "f0",  "f1",  "f2",  "f3",  "f4",  "f5",  "f6",  "f7",
> +    "f8",  "f9",  "f10", "f11", "f12", "f13", "f14", "f15",
> +    "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23",
> +    "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31",
> +};
>  
> -static const char *fregnames[] =
> -    { "f0",  "f1",  "f2",  "f3",  "f4",  "f5",  "f6",  "f7",
> -      "f8",  "f9",  "f10", "f11", "f12", "f13", "f14", "f15",
> -      "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23",
> -      "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31", };
> +static const char * const flregnames[] = {
> +    "fl0",  "fl1",  "fl2",  "fl3",  "fl4",  "fl5",  "fl6",  "fl7",
> +    "fl8",  "fl9",  "fl10", "fl11", "fl12", "fl13", "fl14", "fl15",
> +    "fl16", "fl17", "fl18", "fl19", "fl20", "fl21", "fl22", "fl23",
> +    "fl24", "fl25", "fl26", "fl27", "fl28", "fl29", "fl30", "fl31",
> +};
> +
> +static const char * const fhregnames[] = {
> +    "fh0",  "fh1",  "fh2",  "fh3",  "fh4",  "fh5",  "fh6",  "fh7",
> +    "fh8",  "fh9",  "fh10", "fh11", "fh12", "fh13", "fh14", "fh15",
> +    "fh16", "fh17", "fh18", "fh19", "fh20", "fh21", "fh22", "fh23",
> +    "fh24", "fh25", "fh26", "fh27", "fh28", "fh29", "fh30", "fh31",
> +};
>
>  #ifdef MIPS_DEBUG_DISAS
>  #define MIPS_DEBUG(fmt, ...)                                                  \
> @@ -662,55 +687,70 @@ static inline void gen_store_srsgpr (int from, int to)
>  }
>  
>  /* Floating point register moves. */
> -static inline void gen_load_fpr32(DisasContext *ctx, TCGv_i32 t, int reg)
> +static void gen_load_fpr32(DisasContext *ctx, TCGv_i32 t, int reg)
>  {
> -    tcg_gen_ld_i32(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].w[FP_ENDIAN_IDX]));
> +    if (ctx->hflags & MIPS_HFLAG_F64) {
> +        tcg_gen_trunc_i64_i32(t, fpu_f64[reg]);
> +    } else {
> +        tcg_gen_mov_i32(t, fpu_f32[reg]);
> +    }
>  }
>  
> -static inline void gen_store_fpr32(DisasContext *ctx, TCGv_i32 t, int reg)
> +static void gen_store_fpr32(DisasContext *ctx, TCGv_i32 t, int reg)
>  {
> -    tcg_gen_st_i32(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].w[FP_ENDIAN_IDX]));
> +    if (ctx->hflags & MIPS_HFLAG_F64) {
> +        TCGv_i64 t64 = tcg_temp_new_i64();
> +        tcg_gen_ext_i32_i64(t64, t);
> +        tcg_gen_deposit_i64(fpu_f64[reg], fpu_f64[reg], t64, 0, 32);
> +        tcg_temp_free_i64(t64);
> +    } else {
> +        tcg_gen_mov_i32(fpu_f32[reg], t);
> +    }
>  }
>  
> -static inline void gen_load_fpr32h(DisasContext *ctx, TCGv_i32 t, int reg)
> +static void gen_load_fpr32h(DisasContext *ctx, TCGv_i32 t, int reg)
>  {
> -    tcg_gen_ld_i32(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].w[!FP_ENDIAN_IDX]));
> +    if (ctx->hflags & MIPS_HFLAG_F64) {
> +        TCGv_i64 t64 = tcg_temp_new_i64();
> +        tcg_gen_shri_i64(t64, fpu_f64[reg], 32);
> +        tcg_gen_trunc_i64_i32(t, t64);
> +        tcg_temp_free_i64(t64);
> +    } else {
> +        tcg_gen_mov_i32(t, fpu_fh32[reg]);
> +    }
>  }
>  
> -static inline void gen_store_fpr32h(DisasContext *ctx, TCGv_i32 t, int reg)
> +static void gen_store_fpr32h(DisasContext *ctx, TCGv_i32 t, int reg)
>  {
> -    tcg_gen_st_i32(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].w[!FP_ENDIAN_IDX]));
> +    if (ctx->hflags & MIPS_HFLAG_F64) {
> +        TCGv_i64 t64 = tcg_temp_new_i64();
> +        tcg_gen_ext_i32_i64(t64, t);
> +        tcg_gen_deposit_i64(fpu_f64[reg], fpu_f64[reg], t64, 32, 32);
> +        tcg_temp_free_i64(t64);
> +    } else {
> +        tcg_gen_mov_i32(fpu_fh32[reg], t);
> +    }
>  }
>  
> -static inline void gen_load_fpr64(DisasContext *ctx, TCGv_i64 t, int reg)
> +static void gen_load_fpr64(DisasContext *ctx, TCGv_i64 t, int reg)
>  {
>      if (ctx->hflags & MIPS_HFLAG_F64) {
> -        tcg_gen_ld_i64(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].d));
> +        tcg_gen_mov_i64(t, fpu_f64[reg]);
>      } else {
> -        TCGv_i32 t0 = tcg_temp_new_i32();
> -        TCGv_i32 t1 = tcg_temp_new_i32();
> -        gen_load_fpr32(ctx, t0, reg & ~1);
> -        gen_load_fpr32(ctx, t1, reg | 1);
> -        tcg_gen_concat_i32_i64(t, t0, t1);
> -        tcg_temp_free_i32(t0);
> -        tcg_temp_free_i32(t1);
> +        tcg_gen_concat_i32_i64(t, fpu_f32[reg & ~1], fpu_f32[reg | 1]);
>      }
>  }
>  
> -static inline void gen_store_fpr64(DisasContext *ctx, TCGv_i64 t, int reg)
> +static void gen_store_fpr64(DisasContext *ctx, TCGv_i64 t, int reg)
>  {
>      if (ctx->hflags & MIPS_HFLAG_F64) {
> -        tcg_gen_st_i64(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].d));
> +        tcg_gen_mov_i64(fpu_f64[reg], t);
>      } else {
> -        TCGv_i64 t0 = tcg_temp_new_i64();
> -        TCGv_i32 t1 = tcg_temp_new_i32();
> -        tcg_gen_trunc_i64_i32(t1, t);
> -        gen_store_fpr32(ctx, t1, reg & ~1);
> -        tcg_gen_shri_i64(t0, t, 32);
> -        tcg_gen_trunc_i64_i32(t1, t0);
> -        gen_store_fpr32(ctx, t1, reg | 1);
> -        tcg_temp_free_i32(t1);
> -        tcg_temp_free_i64(t0);
> +        TCGv_i64 t64 = tcg_temp_new_i64();
> +        tcg_gen_shri_i64(t64, t, 32);
> +        tcg_gen_trunc_i64_i32(fpu_f32[reg | 1], t64);
> +        tcg_temp_free_i64(t64);
> +        tcg_gen_trunc_i64_i32(fpu_f32[reg & ~1], t);
>      }
>  }
>  
> @@ -12694,6 +12734,19 @@ static void mips_tcg_init(void)
>                                         offsetof(CPUMIPSState, active_fpu.fcr31),
>                                         "fcr31");
>  
> +    for (i = 0; i < 32; i++) {
> +        int off = offsetof(CPUMIPSState, active_fpu.fpr[i].w[FP_ENDIAN_IDX]);
> +        fpu_f32[i] = tcg_global_mem_new_i32(TCG_AREG0, off, flregnames[i]);
> +    }
> +    for (i = 0; i < 32; i++) {
> +        int off = offsetof(CPUMIPSState, active_fpu.fpr[i].w[!FP_ENDIAN_IDX]);
> +        fpu_fh32[i] = tcg_global_mem_new_i32(TCG_AREG0, off, fhregnames[i]);
> +    }
> +    for (i = 0; i < 32; i++) {
> +        int off = offsetof(CPUMIPSState, active_fpu.fpr[i].w[FP_ENDIAN_IDX]);

This should be fpr[i].d.

> +        fpu_f64[i] = tcg_global_mem_new_i64(TCG_AREG0, off, fregnames[i]);
> +    }
> +

Adding so many globals (i.e. multiplying by 4) has a cost that is greater
than the gains. Remember the register allocator is doing a loop on all
globals at the end of a basic block or when calling a non CONST 
helper/op. While the generated code looks nicer, this slows down the
guest by roughly 12% (measured on boot time).

I am currently working on an optimization of the liveness/register
allocator which among other things, partly mitigates that (I hope to get
the patches ready for posting in a week or so). That said, the slowdown
is still around 3%. I think we should go for only mapping the fp
registers as 64-bit registers, and use trunc/shift/deposit to read/write
them. Of course the generated code doesn't look so nice, but what is
important is that the overall execution is faster, not slower.

>      /* register helpers */
>  #define GEN_HELPER 2
>  #include "helper.h"
> -- 
> 1.7.11.4
>

Patch

diff --git a/target-mips/translate.c b/target-mips/translate.c
index b4301e9..df92cec 100644
--- a/target-mips/translate.c
+++ b/target-mips/translate.c
@@ -479,6 +479,12 @@  static TCGv cpu_dspctrl, btarget, bcond;
 static TCGv_i32 hflags;
 static TCGv_i32 fpu_fcr0, fpu_fcr31;
 
+/* FPU registers.  These alias, but we'll only use one or the other
+   in any one TB based on MIPS_HFLAG_F64.  */
+static TCGv_i32 fpu_f32[32];
+static TCGv_i32 fpu_fh32[32];
+static TCGv_i64 fpu_f64[32];
+
 static uint32_t gen_opc_hflags[OPC_BUF_SIZE];
 
 #include "gen-icount.h"
@@ -545,26 +551,45 @@  enum {
     BS_EXCP     = 3, /* We reached an exception condition */
 };
 
-static const char *regnames[] =
-    { "r0", "at", "v0", "v1", "a0", "a1", "a2", "a3",
-      "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
-      "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
-      "t8", "t9", "k0", "k1", "gp", "sp", "s8", "ra", };
+static const char * const regnames[] = {
+    "r0", "at", "v0", "v1", "a0", "a1", "a2", "a3",
+    "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
+    "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
+    "t8", "t9", "k0", "k1", "gp", "sp", "s8", "ra",
+};
+
+static const char * const regnames_HI[] = {
+    "HI0", "HI1", "HI2", "HI3",
+};
 
-static const char *regnames_HI[] =
-    { "HI0", "HI1", "HI2", "HI3", };
+static const char * const regnames_LO[] = {
+    "LO0", "LO1", "LO2", "LO3",
+};
 
-static const char *regnames_LO[] =
-    { "LO0", "LO1", "LO2", "LO3", };
+static const char * const regnames_ACX[] = {
+    "ACX0", "ACX1", "ACX2", "ACX3",
+};
 
-static const char *regnames_ACX[] =
-    { "ACX0", "ACX1", "ACX2", "ACX3", };
+static const char * const fregnames[] = {
+    "f0",  "f1",  "f2",  "f3",  "f4",  "f5",  "f6",  "f7",
+    "f8",  "f9",  "f10", "f11", "f12", "f13", "f14", "f15",
+    "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23",
+    "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31",
+};
 
-static const char *fregnames[] =
-    { "f0",  "f1",  "f2",  "f3",  "f4",  "f5",  "f6",  "f7",
-      "f8",  "f9",  "f10", "f11", "f12", "f13", "f14", "f15",
-      "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23",
-      "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31", };
+static const char * const flregnames[] = {
+    "fl0",  "fl1",  "fl2",  "fl3",  "fl4",  "fl5",  "fl6",  "fl7",
+    "fl8",  "fl9",  "fl10", "fl11", "fl12", "fl13", "fl14", "fl15",
+    "fl16", "fl17", "fl18", "fl19", "fl20", "fl21", "fl22", "fl23",
+    "fl24", "fl25", "fl26", "fl27", "fl28", "fl29", "fl30", "fl31",
+};
+
+static const char * const fhregnames[] = {
+    "fh0",  "fh1",  "fh2",  "fh3",  "fh4",  "fh5",  "fh6",  "fh7",
+    "fh8",  "fh9",  "fh10", "fh11", "fh12", "fh13", "fh14", "fh15",
+    "fh16", "fh17", "fh18", "fh19", "fh20", "fh21", "fh22", "fh23",
+    "fh24", "fh25", "fh26", "fh27", "fh28", "fh29", "fh30", "fh31",
+};
 
 #ifdef MIPS_DEBUG_DISAS
 #define MIPS_DEBUG(fmt, ...)                                                  \
@@ -662,55 +687,70 @@  static inline void gen_store_srsgpr (int from, int to)
 }
 
 /* Floating point register moves. */
-static inline void gen_load_fpr32(DisasContext *ctx, TCGv_i32 t, int reg)
+static void gen_load_fpr32(DisasContext *ctx, TCGv_i32 t, int reg)
 {
-    tcg_gen_ld_i32(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].w[FP_ENDIAN_IDX]));
+    if (ctx->hflags & MIPS_HFLAG_F64) {
+        tcg_gen_trunc_i64_i32(t, fpu_f64[reg]);
+    } else {
+        tcg_gen_mov_i32(t, fpu_f32[reg]);
+    }
 }
 
-static inline void gen_store_fpr32(DisasContext *ctx, TCGv_i32 t, int reg)
+static void gen_store_fpr32(DisasContext *ctx, TCGv_i32 t, int reg)
 {
-    tcg_gen_st_i32(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].w[FP_ENDIAN_IDX]));
+    if (ctx->hflags & MIPS_HFLAG_F64) {
+        TCGv_i64 t64 = tcg_temp_new_i64();
+        tcg_gen_ext_i32_i64(t64, t);
+        tcg_gen_deposit_i64(fpu_f64[reg], fpu_f64[reg], t64, 0, 32);
+        tcg_temp_free_i64(t64);
+    } else {
+        tcg_gen_mov_i32(fpu_f32[reg], t);
+    }
 }
 
-static inline void gen_load_fpr32h(DisasContext *ctx, TCGv_i32 t, int reg)
+static void gen_load_fpr32h(DisasContext *ctx, TCGv_i32 t, int reg)
 {
-    tcg_gen_ld_i32(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].w[!FP_ENDIAN_IDX]));
+    if (ctx->hflags & MIPS_HFLAG_F64) {
+        TCGv_i64 t64 = tcg_temp_new_i64();
+        tcg_gen_shri_i64(t64, fpu_f64[reg], 32);
+        tcg_gen_trunc_i64_i32(t, t64);
+        tcg_temp_free_i64(t64);
+    } else {
+        tcg_gen_mov_i32(t, fpu_fh32[reg]);
+    }
 }
 
-static inline void gen_store_fpr32h(DisasContext *ctx, TCGv_i32 t, int reg)
+static void gen_store_fpr32h(DisasContext *ctx, TCGv_i32 t, int reg)
 {
-    tcg_gen_st_i32(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].w[!FP_ENDIAN_IDX]));
+    if (ctx->hflags & MIPS_HFLAG_F64) {
+        TCGv_i64 t64 = tcg_temp_new_i64();
+        tcg_gen_ext_i32_i64(t64, t);
+        tcg_gen_deposit_i64(fpu_f64[reg], fpu_f64[reg], t64, 32, 32);
+        tcg_temp_free_i64(t64);
+    } else {
+        tcg_gen_mov_i32(fpu_fh32[reg], t);
+    }
 }
 
-static inline void gen_load_fpr64(DisasContext *ctx, TCGv_i64 t, int reg)
+static void gen_load_fpr64(DisasContext *ctx, TCGv_i64 t, int reg)
 {
     if (ctx->hflags & MIPS_HFLAG_F64) {
-        tcg_gen_ld_i64(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].d));
+        tcg_gen_mov_i64(t, fpu_f64[reg]);
     } else {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-        TCGv_i32 t1 = tcg_temp_new_i32();
-        gen_load_fpr32(ctx, t0, reg & ~1);
-        gen_load_fpr32(ctx, t1, reg | 1);
-        tcg_gen_concat_i32_i64(t, t0, t1);
-        tcg_temp_free_i32(t0);
-        tcg_temp_free_i32(t1);
+        tcg_gen_concat_i32_i64(t, fpu_f32[reg & ~1], fpu_f32[reg | 1]);
     }
 }
 
-static inline void gen_store_fpr64(DisasContext *ctx, TCGv_i64 t, int reg)
+static void gen_store_fpr64(DisasContext *ctx, TCGv_i64 t, int reg)
 {
     if (ctx->hflags & MIPS_HFLAG_F64) {
-        tcg_gen_st_i64(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].d));
+        tcg_gen_mov_i64(fpu_f64[reg], t);
     } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        TCGv_i32 t1 = tcg_temp_new_i32();
-        tcg_gen_trunc_i64_i32(t1, t);
-        gen_store_fpr32(ctx, t1, reg & ~1);
-        tcg_gen_shri_i64(t0, t, 32);
-        tcg_gen_trunc_i64_i32(t1, t0);
-        gen_store_fpr32(ctx, t1, reg | 1);
-        tcg_temp_free_i32(t1);
-        tcg_temp_free_i64(t0);
+        TCGv_i64 t64 = tcg_temp_new_i64();
+        tcg_gen_shri_i64(t64, t, 32);
+        tcg_gen_trunc_i64_i32(fpu_f32[reg | 1], t64);
+        tcg_temp_free_i64(t64);
+        tcg_gen_trunc_i64_i32(fpu_f32[reg & ~1], t);
     }
 }
 
@@ -12694,6 +12734,19 @@  static void mips_tcg_init(void)
                                        offsetof(CPUMIPSState, active_fpu.fcr31),
                                        "fcr31");
 
+    for (i = 0; i < 32; i++) {
+        int off = offsetof(CPUMIPSState, active_fpu.fpr[i].w[FP_ENDIAN_IDX]);
+        fpu_f32[i] = tcg_global_mem_new_i32(TCG_AREG0, off, flregnames[i]);
+    }
+    for (i = 0; i < 32; i++) {
+        int off = offsetof(CPUMIPSState, active_fpu.fpr[i].w[!FP_ENDIAN_IDX]);
+        fpu_fh32[i] = tcg_global_mem_new_i32(TCG_AREG0, off, fhregnames[i]);
+    }
+    for (i = 0; i < 32; i++) {
+        int off = offsetof(CPUMIPSState, active_fpu.fpr[i].w[FP_ENDIAN_IDX]);
+        fpu_f64[i] = tcg_global_mem_new_i64(TCG_AREG0, off, fregnames[i]);
+    }
+
     /* register helpers */
 #define GEN_HELPER 2
 #include "helper.h"