Patchwork [v2,1/9] x86: avoid AREG0 for FPU helpers

login
register
mail settings
Submitter Blue Swirl
Date Aug. 7, 2012, 7:49 p.m.
Message ID <6cf20d42fb6611107b9c92f927ba1564e2789043.1344368495.git.blauwirbel@gmail.com>
Download mbox | patch
Permalink /patch/175773/
State New
Headers show

Comments

Blue Swirl - Aug. 7, 2012, 7:49 p.m.
Make FPU helpers take a parameter for CPUState instead
of relying on global env.

Introduce temporary wrappers for FPU load and store ops. Remove
wrappers for non-AREG0 code. Don't call unconverted helpers
directly.

Signed-off-by: Blue Swirl <blauwirbel@gmail.com>
---
 target-i386/Makefile.objs    |    1 -
 target-i386/cpu.h            |   11 +
 target-i386/fpu_helper.c     |  433 +++++++++++++++++++--------------------
 target-i386/helper.h         |  172 ++++++++--------
 target-i386/mem_helper.c     |   49 +++++
 target-i386/ops_sse.h        |  378 ++++++++++++++++++----------------
 target-i386/ops_sse_header.h |  334 +++++++++++++++---------------
 target-i386/translate.c      |  466 +++++++++++++++++++++++-------------------
 8 files changed, 968 insertions(+), 876 deletions(-)

Patch

diff --git a/target-i386/Makefile.objs b/target-i386/Makefile.objs
index 683fd59..af99b81 100644
--- a/target-i386/Makefile.objs
+++ b/target-i386/Makefile.objs
@@ -6,7 +6,6 @@  obj-$(CONFIG_KVM) += kvm.o hyperv.o
 obj-$(CONFIG_LINUX_USER) += ioport-user.o
 obj-$(CONFIG_BSD_USER) += ioport-user.o
 
-$(obj)/fpu_helper.o: QEMU_CFLAGS += $(HELPER_CFLAGS)
 $(obj)/cc_helper.o: QEMU_CFLAGS += $(HELPER_CFLAGS)
 $(obj)/int_helper.o: QEMU_CFLAGS += $(HELPER_CFLAGS)
 $(obj)/svm_helper.o: QEMU_CFLAGS += $(HELPER_CFLAGS)
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 2a61c81..9b2ead8 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -1137,4 +1137,15 @@  void do_smm_enter(CPUX86State *env1);
 
 void cpu_report_tpr_access(CPUX86State *env, TPRAccess access);
 
+/* temporary wrappers */
+uint32_t cpu_ldub_data(CPUX86State *env, target_ulong ptr);
+uint32_t cpu_lduw_data(CPUX86State *env, target_ulong ptr);
+uint32_t cpu_ldl_data(CPUX86State *env, target_ulong ptr);
+uint64_t cpu_ldq_data(CPUX86State *env, target_ulong ptr);
+
+void cpu_stb_data(CPUX86State *env, target_ulong ptr, uint32_t data);
+void cpu_stw_data(CPUX86State *env, target_ulong ptr, uint32_t data);
+void cpu_stl_data(CPUX86State *env, target_ulong ptr, uint32_t data);
+void cpu_stq_data(CPUX86State *env, target_ulong ptr, uint64_t data);
+
 #endif /* CPU_I386_H */
diff --git a/target-i386/fpu_helper.c b/target-i386/fpu_helper.c
index 6065c2e..a1d7ef7 100644
--- a/target-i386/fpu_helper.c
+++ b/target-i386/fpu_helper.c
@@ -19,13 +19,8 @@ 
 
 #include <math.h>
 #include "cpu.h"
-#include "dyngen-exec.h"
 #include "helper.h"
 
-#if !defined(CONFIG_USER_ONLY)
-#include "softmmu_exec.h"
-#endif /* !defined(CONFIG_USER_ONLY) */
-
 #define FPU_RC_MASK         0xc00
 #define FPU_RC_NEAR         0x000
 #define FPU_RC_DOWN         0x400
@@ -58,39 +53,39 @@ 
 #define floatx80_l2e make_floatx80(0x3fff, 0xb8aa3b295c17f0bcLL)
 #define floatx80_l2t make_floatx80(0x4000, 0xd49a784bcd1b8afeLL)
 
-static inline void fpush(void)
+static inline void fpush(CPUX86State *env)
 {
     env->fpstt = (env->fpstt - 1) & 7;
     env->fptags[env->fpstt] = 0; /* validate stack entry */
 }
 
-static inline void fpop(void)
+static inline void fpop(CPUX86State *env)
 {
     env->fptags[env->fpstt] = 1; /* invalidate stack entry */
     env->fpstt = (env->fpstt + 1) & 7;
 }
 
-static inline floatx80 helper_fldt(target_ulong ptr)
+static inline floatx80 helper_fldt(CPUX86State *env, target_ulong ptr)
 {
     CPU_LDoubleU temp;
 
-    temp.l.lower = ldq(ptr);
-    temp.l.upper = lduw(ptr + 8);
+    temp.l.lower = cpu_ldq_data(env, ptr);
+    temp.l.upper = cpu_lduw_data(env, ptr + 8);
     return temp.d;
 }
 
-static inline void helper_fstt(floatx80 f, target_ulong ptr)
+static inline void helper_fstt(CPUX86State *env, floatx80 f, target_ulong ptr)
 {
     CPU_LDoubleU temp;
 
     temp.d = f;
-    stq(ptr, temp.l.lower);
-    stw(ptr + 8, temp.l.upper);
+    cpu_stq_data(env, ptr, temp.l.lower);
+    cpu_stw_data(env, ptr + 8, temp.l.upper);
 }
 
 /* x87 FPU helpers */
 
-static inline double floatx80_to_double(floatx80 a)
+static inline double floatx80_to_double(CPUX86State *env, floatx80 a)
 {
     union {
         float64 f64;
@@ -101,7 +96,7 @@  static inline double floatx80_to_double(floatx80 a)
     return u.d;
 }
 
-static inline floatx80 double_to_floatx80(double a)
+static inline floatx80 double_to_floatx80(CPUX86State *env, double a)
 {
     union {
         float64 f64;
@@ -112,7 +107,7 @@  static inline floatx80 double_to_floatx80(double a)
     return float64_to_floatx80(u.f64, &env->fp_status);
 }
 
-static void fpu_set_exception(int mask)
+static void fpu_set_exception(CPUX86State *env, int mask)
 {
     env->fpus |= mask;
     if (env->fpus & (~env->fpuc & FPUC_EM)) {
@@ -120,15 +115,15 @@  static void fpu_set_exception(int mask)
     }
 }
 
-static inline floatx80 helper_fdiv(floatx80 a, floatx80 b)
+static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b)
 {
     if (floatx80_is_zero(b)) {
-        fpu_set_exception(FPUS_ZE);
+        fpu_set_exception(env, FPUS_ZE);
     }
     return floatx80_div(a, b, &env->fp_status);
 }
 
-static void fpu_raise_exception(void)
+static void fpu_raise_exception(CPUX86State *env)
 {
     if (env->cr[0] & CR0_NE_MASK) {
         raise_exception(env, EXCP10_COPR);
@@ -140,7 +135,7 @@  static void fpu_raise_exception(void)
 #endif
 }
 
-void helper_flds_FT0(uint32_t val)
+void helper_flds_FT0(CPUX86State *env, uint32_t val)
 {
     union {
         float32 f;
@@ -151,7 +146,7 @@  void helper_flds_FT0(uint32_t val)
     FT0 = float32_to_floatx80(u.f, &env->fp_status);
 }
 
-void helper_fldl_FT0(uint64_t val)
+void helper_fldl_FT0(CPUX86State *env, uint64_t val)
 {
     union {
         float64 f;
@@ -162,12 +157,12 @@  void helper_fldl_FT0(uint64_t val)
     FT0 = float64_to_floatx80(u.f, &env->fp_status);
 }
 
-void helper_fildl_FT0(int32_t val)
+void helper_fildl_FT0(CPUX86State *env, int32_t val)
 {
     FT0 = int32_to_floatx80(val, &env->fp_status);
 }
 
-void helper_flds_ST0(uint32_t val)
+void helper_flds_ST0(CPUX86State *env, uint32_t val)
 {
     int new_fpstt;
     union {
@@ -182,7 +177,7 @@  void helper_flds_ST0(uint32_t val)
     env->fptags[new_fpstt] = 0; /* validate stack entry */
 }
 
-void helper_fldl_ST0(uint64_t val)
+void helper_fldl_ST0(CPUX86State *env, uint64_t val)
 {
     int new_fpstt;
     union {
@@ -197,7 +192,7 @@  void helper_fldl_ST0(uint64_t val)
     env->fptags[new_fpstt] = 0; /* validate stack entry */
 }
 
-void helper_fildl_ST0(int32_t val)
+void helper_fildl_ST0(CPUX86State *env, int32_t val)
 {
     int new_fpstt;
 
@@ -207,7 +202,7 @@  void helper_fildl_ST0(int32_t val)
     env->fptags[new_fpstt] = 0; /* validate stack entry */
 }
 
-void helper_fildll_ST0(int64_t val)
+void helper_fildll_ST0(CPUX86State *env, int64_t val)
 {
     int new_fpstt;
 
@@ -217,7 +212,7 @@  void helper_fildll_ST0(int64_t val)
     env->fptags[new_fpstt] = 0; /* validate stack entry */
 }
 
-uint32_t helper_fsts_ST0(void)
+uint32_t helper_fsts_ST0(CPUX86State *env)
 {
     union {
         float32 f;
@@ -228,7 +223,7 @@  uint32_t helper_fsts_ST0(void)
     return u.i;
 }
 
-uint64_t helper_fstl_ST0(void)
+uint64_t helper_fstl_ST0(CPUX86State *env)
 {
     union {
         float64 f;
@@ -239,7 +234,7 @@  uint64_t helper_fstl_ST0(void)
     return u.i;
 }
 
-int32_t helper_fist_ST0(void)
+int32_t helper_fist_ST0(CPUX86State *env)
 {
     int32_t val;
 
@@ -250,7 +245,7 @@  int32_t helper_fist_ST0(void)
     return val;
 }
 
-int32_t helper_fistl_ST0(void)
+int32_t helper_fistl_ST0(CPUX86State *env)
 {
     int32_t val;
 
@@ -258,7 +253,7 @@  int32_t helper_fistl_ST0(void)
     return val;
 }
 
-int64_t helper_fistll_ST0(void)
+int64_t helper_fistll_ST0(CPUX86State *env)
 {
     int64_t val;
 
@@ -266,7 +261,7 @@  int64_t helper_fistll_ST0(void)
     return val;
 }
 
-int32_t helper_fistt_ST0(void)
+int32_t helper_fistt_ST0(CPUX86State *env)
 {
     int32_t val;
 
@@ -277,7 +272,7 @@  int32_t helper_fistt_ST0(void)
     return val;
 }
 
-int32_t helper_fisttl_ST0(void)
+int32_t helper_fisttl_ST0(CPUX86State *env)
 {
     int32_t val;
 
@@ -285,7 +280,7 @@  int32_t helper_fisttl_ST0(void)
     return val;
 }
 
-int64_t helper_fisttll_ST0(void)
+int64_t helper_fisttll_ST0(CPUX86State *env)
 {
     int64_t val;
 
@@ -293,38 +288,38 @@  int64_t helper_fisttll_ST0(void)
     return val;
 }
 
-void helper_fldt_ST0(target_ulong ptr)
+void helper_fldt_ST0(CPUX86State *env, target_ulong ptr)
 {
     int new_fpstt;
 
     new_fpstt = (env->fpstt - 1) & 7;
-    env->fpregs[new_fpstt].d = helper_fldt(ptr);
+    env->fpregs[new_fpstt].d = helper_fldt(env, ptr);
     env->fpstt = new_fpstt;
     env->fptags[new_fpstt] = 0; /* validate stack entry */
 }
 
-void helper_fstt_ST0(target_ulong ptr)
+void helper_fstt_ST0(CPUX86State *env, target_ulong ptr)
 {
-    helper_fstt(ST0, ptr);
+    helper_fstt(env, ST0, ptr);
 }
 
-void helper_fpush(void)
+void helper_fpush(CPUX86State *env)
 {
-    fpush();
+    fpush(env);
 }
 
-void helper_fpop(void)
+void helper_fpop(CPUX86State *env)
 {
-    fpop();
+    fpop(env);
 }
 
-void helper_fdecstp(void)
+void helper_fdecstp(CPUX86State *env)
 {
     env->fpstt = (env->fpstt - 1) & 7;
     env->fpus &= ~0x4700;
 }
 
-void helper_fincstp(void)
+void helper_fincstp(CPUX86State *env)
 {
     env->fpstt = (env->fpstt + 1) & 7;
     env->fpus &= ~0x4700;
@@ -332,32 +327,32 @@  void helper_fincstp(void)
 
 /* FPU move */
 
-void helper_ffree_STN(int st_index)
+void helper_ffree_STN(CPUX86State *env, int st_index)
 {
     env->fptags[(env->fpstt + st_index) & 7] = 1;
 }
 
-void helper_fmov_ST0_FT0(void)
+void helper_fmov_ST0_FT0(CPUX86State *env)
 {
     ST0 = FT0;
 }
 
-void helper_fmov_FT0_STN(int st_index)
+void helper_fmov_FT0_STN(CPUX86State *env, int st_index)
 {
     FT0 = ST(st_index);
 }
 
-void helper_fmov_ST0_STN(int st_index)
+void helper_fmov_ST0_STN(CPUX86State *env, int st_index)
 {
     ST0 = ST(st_index);
 }
 
-void helper_fmov_STN_ST0(int st_index)
+void helper_fmov_STN_ST0(CPUX86State *env, int st_index)
 {
     ST(st_index) = ST0;
 }
 
-void helper_fxchg_ST0_STN(int st_index)
+void helper_fxchg_ST0_STN(CPUX86State *env, int st_index)
 {
     floatx80 tmp;
 
@@ -370,7 +365,7 @@  void helper_fxchg_ST0_STN(int st_index)
 
 static const int fcom_ccval[4] = {0x0100, 0x4000, 0x0000, 0x4500};
 
-void helper_fcom_ST0_FT0(void)
+void helper_fcom_ST0_FT0(CPUX86State *env)
 {
     int ret;
 
@@ -378,7 +373,7 @@  void helper_fcom_ST0_FT0(void)
     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
 }
 
-void helper_fucom_ST0_FT0(void)
+void helper_fucom_ST0_FT0(CPUX86State *env)
 {
     int ret;
 
@@ -388,158 +383,158 @@  void helper_fucom_ST0_FT0(void)
 
 static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
 
-void helper_fcomi_ST0_FT0(void)
+void helper_fcomi_ST0_FT0(CPUX86State *env)
 {
     int eflags;
     int ret;
 
     ret = floatx80_compare(ST0, FT0, &env->fp_status);
-    eflags = helper_cc_compute_all(CC_OP);
+    eflags = cpu_cc_compute_all(env, CC_OP);
     eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
     CC_SRC = eflags;
 }
 
-void helper_fucomi_ST0_FT0(void)
+void helper_fucomi_ST0_FT0(CPUX86State *env)
 {
     int eflags;
     int ret;
 
     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
-    eflags = helper_cc_compute_all(CC_OP);
+    eflags = cpu_cc_compute_all(env, CC_OP);
     eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
     CC_SRC = eflags;
 }
 
-void helper_fadd_ST0_FT0(void)
+void helper_fadd_ST0_FT0(CPUX86State *env)
 {
     ST0 = floatx80_add(ST0, FT0, &env->fp_status);
 }
 
-void helper_fmul_ST0_FT0(void)
+void helper_fmul_ST0_FT0(CPUX86State *env)
 {
     ST0 = floatx80_mul(ST0, FT0, &env->fp_status);
 }
 
-void helper_fsub_ST0_FT0(void)
+void helper_fsub_ST0_FT0(CPUX86State *env)
 {
     ST0 = floatx80_sub(ST0, FT0, &env->fp_status);
 }
 
-void helper_fsubr_ST0_FT0(void)
+void helper_fsubr_ST0_FT0(CPUX86State *env)
 {
     ST0 = floatx80_sub(FT0, ST0, &env->fp_status);
 }
 
-void helper_fdiv_ST0_FT0(void)
+void helper_fdiv_ST0_FT0(CPUX86State *env)
 {
-    ST0 = helper_fdiv(ST0, FT0);
+    ST0 = helper_fdiv(env, ST0, FT0);
 }
 
-void helper_fdivr_ST0_FT0(void)
+void helper_fdivr_ST0_FT0(CPUX86State *env)
 {
-    ST0 = helper_fdiv(FT0, ST0);
+    ST0 = helper_fdiv(env, FT0, ST0);
 }
 
 /* fp operations between STN and ST0 */
 
-void helper_fadd_STN_ST0(int st_index)
+void helper_fadd_STN_ST0(CPUX86State *env, int st_index)
 {
     ST(st_index) = floatx80_add(ST(st_index), ST0, &env->fp_status);
 }
 
-void helper_fmul_STN_ST0(int st_index)
+void helper_fmul_STN_ST0(CPUX86State *env, int st_index)
 {
     ST(st_index) = floatx80_mul(ST(st_index), ST0, &env->fp_status);
 }
 
-void helper_fsub_STN_ST0(int st_index)
+void helper_fsub_STN_ST0(CPUX86State *env, int st_index)
 {
     ST(st_index) = floatx80_sub(ST(st_index), ST0, &env->fp_status);
 }
 
-void helper_fsubr_STN_ST0(int st_index)
+void helper_fsubr_STN_ST0(CPUX86State *env, int st_index)
 {
     ST(st_index) = floatx80_sub(ST0, ST(st_index), &env->fp_status);
 }
 
-void helper_fdiv_STN_ST0(int st_index)
+void helper_fdiv_STN_ST0(CPUX86State *env, int st_index)
 {
     floatx80 *p;
 
     p = &ST(st_index);
-    *p = helper_fdiv(*p, ST0);
+    *p = helper_fdiv(env, *p, ST0);
 }
 
-void helper_fdivr_STN_ST0(int st_index)
+void helper_fdivr_STN_ST0(CPUX86State *env, int st_index)
 {
     floatx80 *p;
 
     p = &ST(st_index);
-    *p = helper_fdiv(ST0, *p);
+    *p = helper_fdiv(env, ST0, *p);
 }
 
 /* misc FPU operations */
-void helper_fchs_ST0(void)
+void helper_fchs_ST0(CPUX86State *env)
 {
     ST0 = floatx80_chs(ST0);
 }
 
-void helper_fabs_ST0(void)
+void helper_fabs_ST0(CPUX86State *env)
 {
     ST0 = floatx80_abs(ST0);
 }
 
-void helper_fld1_ST0(void)
+void helper_fld1_ST0(CPUX86State *env)
 {
     ST0 = floatx80_one;
 }
 
-void helper_fldl2t_ST0(void)
+void helper_fldl2t_ST0(CPUX86State *env)
 {
     ST0 = floatx80_l2t;
 }
 
-void helper_fldl2e_ST0(void)
+void helper_fldl2e_ST0(CPUX86State *env)
 {
     ST0 = floatx80_l2e;
 }
 
-void helper_fldpi_ST0(void)
+void helper_fldpi_ST0(CPUX86State *env)
 {
     ST0 = floatx80_pi;
 }
 
-void helper_fldlg2_ST0(void)
+void helper_fldlg2_ST0(CPUX86State *env)
 {
     ST0 = floatx80_lg2;
 }
 
-void helper_fldln2_ST0(void)
+void helper_fldln2_ST0(CPUX86State *env)
 {
     ST0 = floatx80_ln2;
 }
 
-void helper_fldz_ST0(void)
+void helper_fldz_ST0(CPUX86State *env)
 {
     ST0 = floatx80_zero;
 }
 
-void helper_fldz_FT0(void)
+void helper_fldz_FT0(CPUX86State *env)
 {
     FT0 = floatx80_zero;
 }
 
-uint32_t helper_fnstsw(void)
+uint32_t helper_fnstsw(CPUX86State *env)
 {
     return (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
 }
 
-uint32_t helper_fnstcw(void)
+uint32_t helper_fnstcw(CPUX86State *env)
 {
     return env->fpuc;
 }
 
-static void update_fp_status(void)
+static void update_fp_status(CPUX86State *env)
 {
     int rnd_type;
 
@@ -575,25 +570,25 @@  static void update_fp_status(void)
     set_floatx80_rounding_precision(rnd_type, &env->fp_status);
 }
 
-void helper_fldcw(uint32_t val)
+void helper_fldcw(CPUX86State *env, uint32_t val)
 {
     env->fpuc = val;
-    update_fp_status();
+    update_fp_status(env);
 }
 
-void helper_fclex(void)
+void helper_fclex(CPUX86State *env)
 {
     env->fpus &= 0x7f00;
 }
 
-void helper_fwait(void)
+void helper_fwait(CPUX86State *env)
 {
     if (env->fpus & FPUS_SE) {
-        fpu_raise_exception();
+        fpu_raise_exception(env);
     }
 }
 
-void helper_fninit(void)
+void helper_fninit(CPUX86State *env)
 {
     env->fpus = 0;
     env->fpstt = 0;
@@ -610,7 +605,7 @@  void helper_fninit(void)
 
 /* BCD ops */
 
-void helper_fbld_ST0(target_ulong ptr)
+void helper_fbld_ST0(CPUX86State *env, target_ulong ptr)
 {
     floatx80 tmp;
     uint64_t val;
@@ -619,18 +614,18 @@  void helper_fbld_ST0(target_ulong ptr)
 
     val = 0;
     for (i = 8; i >= 0; i--) {
-        v = ldub(ptr + i);
+        v = cpu_ldub_data(env, ptr + i);
         val = (val * 100) + ((v >> 4) * 10) + (v & 0xf);
     }
     tmp = int64_to_floatx80(val, &env->fp_status);
-    if (ldub(ptr + 9) & 0x80) {
+    if (cpu_ldub_data(env, ptr + 9) & 0x80) {
         floatx80_chs(tmp);
     }
-    fpush();
+    fpush(env);
     ST0 = tmp;
 }
 
-void helper_fbst_ST0(target_ulong ptr)
+void helper_fbst_ST0(CPUX86State *env, target_ulong ptr)
 {
     int v;
     target_ulong mem_ref, mem_end;
@@ -640,10 +635,10 @@  void helper_fbst_ST0(target_ulong ptr)
     mem_ref = ptr;
     mem_end = mem_ref + 9;
     if (val < 0) {
-        stb(mem_end, 0x80);
+        cpu_stb_data(env, mem_end, 0x80);
         val = -val;
     } else {
-        stb(mem_end, 0x00);
+        cpu_stb_data(env, mem_end, 0x00);
     }
     while (mem_ref < mem_end) {
         if (val == 0) {
@@ -652,63 +647,63 @@  void helper_fbst_ST0(target_ulong ptr)
         v = val % 100;
         val = val / 100;
         v = ((v / 10) << 4) | (v % 10);
-        stb(mem_ref++, v);
+        cpu_stb_data(env, mem_ref++, v);
     }
     while (mem_ref < mem_end) {
-        stb(mem_ref++, 0);
+        cpu_stb_data(env, mem_ref++, 0);
     }
 }
 
-void helper_f2xm1(void)
+void helper_f2xm1(CPUX86State *env)
 {
-    double val = floatx80_to_double(ST0);
+    double val = floatx80_to_double(env, ST0);
 
     val = pow(2.0, val) - 1.0;
-    ST0 = double_to_floatx80(val);
+    ST0 = double_to_floatx80(env, val);
 }
 
-void helper_fyl2x(void)
+void helper_fyl2x(CPUX86State *env)
 {
-    double fptemp = floatx80_to_double(ST0);
+    double fptemp = floatx80_to_double(env, ST0);
 
     if (fptemp > 0.0) {
         fptemp = log(fptemp) / log(2.0); /* log2(ST) */
-        fptemp *= floatx80_to_double(ST1);
-        ST1 = double_to_floatx80(fptemp);
-        fpop();
+        fptemp *= floatx80_to_double(env, ST1);
+        ST1 = double_to_floatx80(env, fptemp);
+        fpop(env);
     } else {
         env->fpus &= ~0x4700;
         env->fpus |= 0x400;
     }
 }
 
-void helper_fptan(void)
+void helper_fptan(CPUX86State *env)
 {
-    double fptemp = floatx80_to_double(ST0);
+    double fptemp = floatx80_to_double(env, ST0);
 
     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
         env->fpus |= 0x400;
     } else {
         fptemp = tan(fptemp);
-        ST0 = double_to_floatx80(fptemp);
-        fpush();
+        ST0 = double_to_floatx80(env, fptemp);
+        fpush(env);
         ST0 = floatx80_one;
         env->fpus &= ~0x400; /* C2 <-- 0 */
         /* the above code is for |arg| < 2**52 only */
     }
 }
 
-void helper_fpatan(void)
+void helper_fpatan(CPUX86State *env)
 {
     double fptemp, fpsrcop;
 
-    fpsrcop = floatx80_to_double(ST1);
-    fptemp = floatx80_to_double(ST0);
-    ST1 = double_to_floatx80(atan2(fpsrcop, fptemp));
-    fpop();
+    fpsrcop = floatx80_to_double(env, ST1);
+    fptemp = floatx80_to_double(env, ST0);
+    ST1 = double_to_floatx80(env, atan2(fpsrcop, fptemp));
+    fpop(env);
 }
 
-void helper_fxtract(void)
+void helper_fxtract(CPUX86State *env)
 {
     CPU_LDoubleU temp;
 
@@ -718,7 +713,7 @@  void helper_fxtract(void)
         /* Easy way to generate -inf and raising division by 0 exception */
         ST0 = floatx80_div(floatx80_chs(floatx80_one), floatx80_zero,
                            &env->fp_status);
-        fpush();
+        fpush(env);
         ST0 = temp.d;
     } else {
         int expdif;
@@ -726,24 +721,24 @@  void helper_fxtract(void)
         expdif = EXPD(temp) - EXPBIAS;
         /* DP exponent bias */
         ST0 = int32_to_floatx80(expdif, &env->fp_status);
-        fpush();
+        fpush(env);
         BIASEXPONENT(temp);
         ST0 = temp.d;
     }
 }
 
-void helper_fprem1(void)
+void helper_fprem1(CPUX86State *env)
 {
     double st0, st1, dblq, fpsrcop, fptemp;
     CPU_LDoubleU fpsrcop1, fptemp1;
     int expdif;
     signed long long int q;
 
-    st0 = floatx80_to_double(ST0);
-    st1 = floatx80_to_double(ST1);
+    st0 = floatx80_to_double(env, ST0);
+    st1 = floatx80_to_double(env, ST1);
 
     if (isinf(st0) || isnan(st0) || isnan(st1) || (st1 == 0.0)) {
-        ST0 = double_to_floatx80(0.0 / 0.0); /* NaN */
+        ST0 = double_to_floatx80(env, 0.0 / 0.0); /* NaN */
         env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
         return;
     }
@@ -788,21 +783,21 @@  void helper_fprem1(void)
                   -(floor(fabs(fpsrcop))) : floor(fpsrcop);
         st0 -= (st1 * fpsrcop * fptemp);
     }
-    ST0 = double_to_floatx80(st0);
+    ST0 = double_to_floatx80(env, st0);
 }
 
-void helper_fprem(void)
+void helper_fprem(CPUX86State *env)
 {
     double st0, st1, dblq, fpsrcop, fptemp;
     CPU_LDoubleU fpsrcop1, fptemp1;
     int expdif;
     signed long long int q;
 
-    st0 = floatx80_to_double(ST0);
-    st1 = floatx80_to_double(ST1);
+    st0 = floatx80_to_double(env, ST0);
+    st1 = floatx80_to_double(env, ST1);
 
     if (isinf(st0) || isnan(st0) || isnan(st1) || (st1 == 0.0)) {
-        ST0 = double_to_floatx80(0.0 / 0.0); /* NaN */
+        ST0 = double_to_floatx80(env, 0.0 / 0.0); /* NaN */
         env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
         return;
     }
@@ -849,25 +844,25 @@  void helper_fprem(void)
                   -(floor(fabs(fpsrcop))) : floor(fpsrcop);
         st0 -= (st1 * fpsrcop * fptemp);
     }
-    ST0 = double_to_floatx80(st0);
+    ST0 = double_to_floatx80(env, st0);
 }
 
-void helper_fyl2xp1(void)
+void helper_fyl2xp1(CPUX86State *env)
 {
-    double fptemp = floatx80_to_double(ST0);
+    double fptemp = floatx80_to_double(env, ST0);
 
     if ((fptemp + 1.0) > 0.0) {
         fptemp = log(fptemp + 1.0) / log(2.0); /* log2(ST + 1.0) */
-        fptemp *= floatx80_to_double(ST1);
-        ST1 = double_to_floatx80(fptemp);
-        fpop();
+        fptemp *= floatx80_to_double(env, ST1);
+        ST1 = double_to_floatx80(env, fptemp);
+        fpop(env);
     } else {
         env->fpus &= ~0x4700;
         env->fpus |= 0x400;
     }
 }
 
-void helper_fsqrt(void)
+void helper_fsqrt(CPUX86State *env)
 {
     if (floatx80_is_neg(ST0)) {
         env->fpus &= ~0x4700;  /* (C3,C2,C1,C0) <-- 0000 */
@@ -876,27 +871,27 @@  void helper_fsqrt(void)
     ST0 = floatx80_sqrt(ST0, &env->fp_status);
 }
 
-void helper_fsincos(void)
+void helper_fsincos(CPUX86State *env)
 {
-    double fptemp = floatx80_to_double(ST0);
+    double fptemp = floatx80_to_double(env, ST0);
 
     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
         env->fpus |= 0x400;
     } else {
-        ST0 = double_to_floatx80(sin(fptemp));
-        fpush();
-        ST0 = double_to_floatx80(cos(fptemp));
+        ST0 = double_to_floatx80(env, sin(fptemp));
+        fpush(env);
+        ST0 = double_to_floatx80(env, cos(fptemp));
         env->fpus &= ~0x400;  /* C2 <-- 0 */
         /* the above code is for |arg| < 2**63 only */
     }
 }
 
-void helper_frndint(void)
+void helper_frndint(CPUX86State *env)
 {
     ST0 = floatx80_round_to_int(ST0, &env->fp_status);
 }
 
-void helper_fscale(void)
+void helper_fscale(CPUX86State *env)
 {
     if (floatx80_is_any_nan(ST1)) {
         ST0 = ST1;
@@ -906,33 +901,33 @@  void helper_fscale(void)
     }
 }
 
-void helper_fsin(void)
+void helper_fsin(CPUX86State *env)
 {
-    double fptemp = floatx80_to_double(ST0);
+    double fptemp = floatx80_to_double(env, ST0);
 
     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
         env->fpus |= 0x400;
     } else {
-        ST0 = double_to_floatx80(sin(fptemp));
+        ST0 = double_to_floatx80(env, sin(fptemp));
         env->fpus &= ~0x400;  /* C2 <-- 0 */
         /* the above code is for |arg| < 2**53 only */
     }
 }
 
-void helper_fcos(void)
+void helper_fcos(CPUX86State *env)
 {
-    double fptemp = floatx80_to_double(ST0);
+    double fptemp = floatx80_to_double(env, ST0);
 
     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
         env->fpus |= 0x400;
     } else {
-        ST0 = double_to_floatx80(cos(fptemp));
+        ST0 = double_to_floatx80(env, cos(fptemp));
         env->fpus &= ~0x400;  /* C2 <-- 0 */
         /* the above code is for |arg| < 2**63 only */
     }
 }
 
-void helper_fxam_ST0(void)
+void helper_fxam_ST0(CPUX86State *env)
 {
     CPU_LDoubleU temp;
     int expdif;
@@ -963,7 +958,7 @@  void helper_fxam_ST0(void)
     }
 }
 
-void helper_fstenv(target_ulong ptr, int data32)
+void helper_fstenv(CPUX86State *env, target_ulong ptr, int data32)
 {
     int fpus, fptag, exp, i;
     uint64_t mant;
@@ -991,37 +986,37 @@  void helper_fstenv(target_ulong ptr, int data32)
     }
     if (data32) {
         /* 32 bit */
-        stl(ptr, env->fpuc);
-        stl(ptr + 4, fpus);
-        stl(ptr + 8, fptag);
-        stl(ptr + 12, 0); /* fpip */
-        stl(ptr + 16, 0); /* fpcs */
-        stl(ptr + 20, 0); /* fpoo */
-        stl(ptr + 24, 0); /* fpos */
+        cpu_stl_data(env, ptr, env->fpuc);
+        cpu_stl_data(env, ptr + 4, fpus);
+        cpu_stl_data(env, ptr + 8, fptag);
+        cpu_stl_data(env, ptr + 12, 0); /* fpip */
+        cpu_stl_data(env, ptr + 16, 0); /* fpcs */
+        cpu_stl_data(env, ptr + 20, 0); /* fpoo */
+        cpu_stl_data(env, ptr + 24, 0); /* fpos */
     } else {
         /* 16 bit */
-        stw(ptr, env->fpuc);
-        stw(ptr + 2, fpus);
-        stw(ptr + 4, fptag);
-        stw(ptr + 6, 0);
-        stw(ptr + 8, 0);
-        stw(ptr + 10, 0);
-        stw(ptr + 12, 0);
+        cpu_stw_data(env, ptr, env->fpuc);
+        cpu_stw_data(env, ptr + 2, fpus);
+        cpu_stw_data(env, ptr + 4, fptag);
+        cpu_stw_data(env, ptr + 6, 0);
+        cpu_stw_data(env, ptr + 8, 0);
+        cpu_stw_data(env, ptr + 10, 0);
+        cpu_stw_data(env, ptr + 12, 0);
     }
 }
 
-void helper_fldenv(target_ulong ptr, int data32)
+void helper_fldenv(CPUX86State *env, target_ulong ptr, int data32)
 {
     int i, fpus, fptag;
 
     if (data32) {
-        env->fpuc = lduw(ptr);
-        fpus = lduw(ptr + 4);
-        fptag = lduw(ptr + 8);
+        env->fpuc = cpu_lduw_data(env, ptr);
+        fpus = cpu_lduw_data(env, ptr + 4);
+        fptag = cpu_lduw_data(env, ptr + 8);
     } else {
-        env->fpuc = lduw(ptr);
-        fpus = lduw(ptr + 2);
-        fptag = lduw(ptr + 4);
+        env->fpuc = cpu_lduw_data(env, ptr);
+        fpus = cpu_lduw_data(env, ptr + 2);
+        fptag = cpu_lduw_data(env, ptr + 4);
     }
     env->fpstt = (fpus >> 11) & 7;
     env->fpus = fpus & ~0x3800;
@@ -1031,17 +1026,17 @@  void helper_fldenv(target_ulong ptr, int data32)
     }
 }
 
-void helper_fsave(target_ulong ptr, int data32)
+void helper_fsave(CPUX86State *env, target_ulong ptr, int data32)
 {
     floatx80 tmp;
     int i;
 
-    helper_fstenv(ptr, data32);
+    helper_fstenv(env, ptr, data32);
 
     ptr += (14 << data32);
     for (i = 0; i < 8; i++) {
         tmp = ST(i);
-        helper_fstt(tmp, ptr);
+        helper_fstt(env, tmp, ptr);
         ptr += 10;
     }
 
@@ -1059,48 +1054,34 @@  void helper_fsave(target_ulong ptr, int data32)
     env->fptags[7] = 1;
 }
 
-void helper_frstor(target_ulong ptr, int data32)
+void helper_frstor(CPUX86State *env, target_ulong ptr, int data32)
 {
     floatx80 tmp;
     int i;
 
-    helper_fldenv(ptr, data32);
+    helper_fldenv(env, ptr, data32);
     ptr += (14 << data32);
 
     for (i = 0; i < 8; i++) {
-        tmp = helper_fldt(ptr);
+        tmp = helper_fldt(env, ptr);
         ST(i) = tmp;
         ptr += 10;
     }
 }
 
 #if defined(CONFIG_USER_ONLY)
-void cpu_x86_fsave(CPUX86State *s, target_ulong ptr, int data32)
+void cpu_x86_fsave(CPUX86State *env, target_ulong ptr, int data32)
 {
-    CPUX86State *saved_env;
-
-    saved_env = env;
-    env = s;
-
-    helper_fsave(ptr, data32);
-
-    env = saved_env;
+    helper_fsave(env, ptr, data32);
 }
 
-void cpu_x86_frstor(CPUX86State *s, target_ulong ptr, int data32)
+void cpu_x86_frstor(CPUX86State *env, target_ulong ptr, int data32)
 {
-    CPUX86State *saved_env;
-
-    saved_env = env;
-    env = s;
-
-    helper_frstor(ptr, data32);
-
-    env = saved_env;
+    helper_frstor(env, ptr, data32);
 }
 #endif
 
-void helper_fxsave(target_ulong ptr, int data64)
+void helper_fxsave(CPUX86State *env, target_ulong ptr, int data64)
 {
     int fpus, fptag, i, nb_xmm_regs;
     floatx80 tmp;
@@ -1116,33 +1097,33 @@  void helper_fxsave(target_ulong ptr, int data64)
     for (i = 0; i < 8; i++) {
         fptag |= (env->fptags[i] << i);
     }
-    stw(ptr, env->fpuc);
-    stw(ptr + 2, fpus);
-    stw(ptr + 4, fptag ^ 0xff);
+    cpu_stw_data(env, ptr, env->fpuc);
+    cpu_stw_data(env, ptr + 2, fpus);
+    cpu_stw_data(env, ptr + 4, fptag ^ 0xff);
 #ifdef TARGET_X86_64
     if (data64) {
-        stq(ptr + 0x08, 0); /* rip */
-        stq(ptr + 0x10, 0); /* rdp */
+        cpu_stq_data(env, ptr + 0x08, 0); /* rip */
+        cpu_stq_data(env, ptr + 0x10, 0); /* rdp */
     } else
 #endif
     {
-        stl(ptr + 0x08, 0); /* eip */
-        stl(ptr + 0x0c, 0); /* sel  */
-        stl(ptr + 0x10, 0); /* dp */
-        stl(ptr + 0x14, 0); /* sel  */
+        cpu_stl_data(env, ptr + 0x08, 0); /* eip */
+        cpu_stl_data(env, ptr + 0x0c, 0); /* sel  */
+        cpu_stl_data(env, ptr + 0x10, 0); /* dp */
+        cpu_stl_data(env, ptr + 0x14, 0); /* sel  */
     }
 
     addr = ptr + 0x20;
     for (i = 0; i < 8; i++) {
         tmp = ST(i);
-        helper_fstt(tmp, addr);
+        helper_fstt(env, tmp, addr);
         addr += 16;
     }
 
     if (env->cr[4] & CR4_OSFXSR_MASK) {
         /* XXX: finish it */
-        stl(ptr + 0x18, env->mxcsr); /* mxcsr */
-        stl(ptr + 0x1c, 0x0000ffff); /* mxcsr_mask */
+        cpu_stl_data(env, ptr + 0x18, env->mxcsr); /* mxcsr */
+        cpu_stl_data(env, ptr + 0x1c, 0x0000ffff); /* mxcsr_mask */
         if (env->hflags & HF_CS64_MASK) {
             nb_xmm_regs = 16;
         } else {
@@ -1154,15 +1135,15 @@  void helper_fxsave(target_ulong ptr, int data64)
             || (env->hflags & HF_CPL_MASK)
             || !(env->hflags & HF_LMA_MASK)) {
             for (i = 0; i < nb_xmm_regs; i++) {
-                stq(addr, env->xmm_regs[i].XMM_Q(0));
-                stq(addr + 8, env->xmm_regs[i].XMM_Q(1));
+                cpu_stq_data(env, addr, env->xmm_regs[i].XMM_Q(0));
+                cpu_stq_data(env, addr + 8, env->xmm_regs[i].XMM_Q(1));
                 addr += 16;
             }
         }
     }
 }
 
-void helper_fxrstor(target_ulong ptr, int data64)
+void helper_fxrstor(CPUX86State *env, target_ulong ptr, int data64)
 {
     int i, fpus, fptag, nb_xmm_regs;
     floatx80 tmp;
@@ -1173,9 +1154,9 @@  void helper_fxrstor(target_ulong ptr, int data64)
         raise_exception(env, EXCP0D_GPF);
     }
 
-    env->fpuc = lduw(ptr);
-    fpus = lduw(ptr + 2);
-    fptag = lduw(ptr + 4);
+    env->fpuc = cpu_lduw_data(env, ptr);
+    fpus = cpu_lduw_data(env, ptr + 2);
+    fptag = cpu_lduw_data(env, ptr + 4);
     env->fpstt = (fpus >> 11) & 7;
     env->fpus = fpus & ~0x3800;
     fptag ^= 0xff;
@@ -1185,15 +1166,15 @@  void helper_fxrstor(target_ulong ptr, int data64)
 
     addr = ptr + 0x20;
     for (i = 0; i < 8; i++) {
-        tmp = helper_fldt(addr);
+        tmp = helper_fldt(env, addr);
         ST(i) = tmp;
         addr += 16;
     }
 
     if (env->cr[4] & CR4_OSFXSR_MASK) {
         /* XXX: finish it */
-        env->mxcsr = ldl(ptr + 0x18);
-        /* ldl(ptr + 0x1c); */
+        env->mxcsr = cpu_ldl_data(env, ptr + 0x18);
+        /* cpu_ldl_data(env, ptr + 0x1c); */
         if (env->hflags & HF_CS64_MASK) {
             nb_xmm_regs = 16;
         } else {
@@ -1205,8 +1186,8 @@  void helper_fxrstor(target_ulong ptr, int data64)
             || (env->hflags & HF_CPL_MASK)
             || !(env->hflags & HF_LMA_MASK)) {
             for (i = 0; i < nb_xmm_regs; i++) {
-                env->xmm_regs[i].XMM_Q(0) = ldq(addr);
-                env->xmm_regs[i].XMM_Q(1) = ldq(addr + 8);
+                env->xmm_regs[i].XMM_Q(0) = cpu_ldq_data(env, addr);
+                env->xmm_regs[i].XMM_Q(1) = cpu_ldq_data(env, addr + 8);
                 addr += 16;
             }
         }
@@ -1242,7 +1223,7 @@  floatx80 cpu_set_fp80(uint64_t mant, uint16_t upper)
 #define SSE_RC_CHOP         0x6000
 #define SSE_FZ              0x8000
 
-static void update_sse_status(void)
+static void update_sse_status(CPUX86State *env)
 {
     int rnd_type;
 
@@ -1271,20 +1252,20 @@  static void update_sse_status(void)
     set_flush_to_zero((env->mxcsr & SSE_FZ) ? 1 : 0, &env->fp_status);
 }
 
-void helper_ldmxcsr(uint32_t val)
+void helper_ldmxcsr(CPUX86State *env, uint32_t val)
 {
     env->mxcsr = val;
-    update_sse_status();
+    update_sse_status(env);
 }
 
-void helper_enter_mmx(void)
+void helper_enter_mmx(CPUX86State *env)
 {
     env->fpstt = 0;
     *(uint32_t *)(env->fptags) = 0;
     *(uint32_t *)(env->fptags + 4) = 0;
 }
 
-void helper_emms(void)
+void helper_emms(CPUX86State *env)
 {
     /* set to empty state */
     *(uint32_t *)(env->fptags) = 0x01010101;
@@ -1292,7 +1273,7 @@  void helper_emms(void)
 }
 
 /* XXX: suppress */
-void helper_movq(void *d, void *s)
+void helper_movq(CPUX86State *env, void *d, void *s)
 {
     *(uint64_t *)d = *(uint64_t *)s;
 }
diff --git a/target-i386/helper.h b/target-i386/helper.h
index 99ca183..6fdee8a 100644
--- a/target-i386/helper.h
+++ b/target-i386/helper.h
@@ -109,98 +109,98 @@  DEF_HELPER_1(invlpga, void, int)
 
 /* x86 FPU */
 
-DEF_HELPER_1(flds_FT0, void, i32)
-DEF_HELPER_1(fldl_FT0, void, i64)
-DEF_HELPER_1(fildl_FT0, void, s32)
-DEF_HELPER_1(flds_ST0, void, i32)
-DEF_HELPER_1(fldl_ST0, void, i64)
-DEF_HELPER_1(fildl_ST0, void, s32)
-DEF_HELPER_1(fildll_ST0, void, s64)
-DEF_HELPER_0(fsts_ST0, i32)
-DEF_HELPER_0(fstl_ST0, i64)
-DEF_HELPER_0(fist_ST0, s32)
-DEF_HELPER_0(fistl_ST0, s32)
-DEF_HELPER_0(fistll_ST0, s64)
-DEF_HELPER_0(fistt_ST0, s32)
-DEF_HELPER_0(fisttl_ST0, s32)
-DEF_HELPER_0(fisttll_ST0, s64)
-DEF_HELPER_1(fldt_ST0, void, tl)
-DEF_HELPER_1(fstt_ST0, void, tl)
-DEF_HELPER_0(fpush, void)
-DEF_HELPER_0(fpop, void)
-DEF_HELPER_0(fdecstp, void)
-DEF_HELPER_0(fincstp, void)
-DEF_HELPER_1(ffree_STN, void, int)
-DEF_HELPER_0(fmov_ST0_FT0, void)
-DEF_HELPER_1(fmov_FT0_STN, void, int)
-DEF_HELPER_1(fmov_ST0_STN, void, int)
-DEF_HELPER_1(fmov_STN_ST0, void, int)
-DEF_HELPER_1(fxchg_ST0_STN, void, int)
-DEF_HELPER_0(fcom_ST0_FT0, void)
-DEF_HELPER_0(fucom_ST0_FT0, void)
-DEF_HELPER_0(fcomi_ST0_FT0, void)
-DEF_HELPER_0(fucomi_ST0_FT0, void)
-DEF_HELPER_0(fadd_ST0_FT0, void)
-DEF_HELPER_0(fmul_ST0_FT0, void)
-DEF_HELPER_0(fsub_ST0_FT0, void)
-DEF_HELPER_0(fsubr_ST0_FT0, void)
-DEF_HELPER_0(fdiv_ST0_FT0, void)
-DEF_HELPER_0(fdivr_ST0_FT0, void)
-DEF_HELPER_1(fadd_STN_ST0, void, int)
-DEF_HELPER_1(fmul_STN_ST0, void, int)
-DEF_HELPER_1(fsub_STN_ST0, void, int)
-DEF_HELPER_1(fsubr_STN_ST0, void, int)
-DEF_HELPER_1(fdiv_STN_ST0, void, int)
-DEF_HELPER_1(fdivr_STN_ST0, void, int)
-DEF_HELPER_0(fchs_ST0, void)
-DEF_HELPER_0(fabs_ST0, void)
-DEF_HELPER_0(fxam_ST0, void)
-DEF_HELPER_0(fld1_ST0, void)
-DEF_HELPER_0(fldl2t_ST0, void)
-DEF_HELPER_0(fldl2e_ST0, void)
-DEF_HELPER_0(fldpi_ST0, void)
-DEF_HELPER_0(fldlg2_ST0, void)
-DEF_HELPER_0(fldln2_ST0, void)
-DEF_HELPER_0(fldz_ST0, void)
-DEF_HELPER_0(fldz_FT0, void)
-DEF_HELPER_0(fnstsw, i32)
-DEF_HELPER_0(fnstcw, i32)
-DEF_HELPER_1(fldcw, void, i32)
-DEF_HELPER_0(fclex, void)
-DEF_HELPER_0(fwait, void)
-DEF_HELPER_0(fninit, void)
-DEF_HELPER_1(fbld_ST0, void, tl)
-DEF_HELPER_1(fbst_ST0, void, tl)
-DEF_HELPER_0(f2xm1, void)
-DEF_HELPER_0(fyl2x, void)
-DEF_HELPER_0(fptan, void)
-DEF_HELPER_0(fpatan, void)
-DEF_HELPER_0(fxtract, void)
-DEF_HELPER_0(fprem1, void)
-DEF_HELPER_0(fprem, void)
-DEF_HELPER_0(fyl2xp1, void)
-DEF_HELPER_0(fsqrt, void)
-DEF_HELPER_0(fsincos, void)
-DEF_HELPER_0(frndint, void)
-DEF_HELPER_0(fscale, void)
-DEF_HELPER_0(fsin, void)
-DEF_HELPER_0(fcos, void)
-DEF_HELPER_2(fstenv, void, tl, int)
-DEF_HELPER_2(fldenv, void, tl, int)
-DEF_HELPER_2(fsave, void, tl, int)
-DEF_HELPER_2(frstor, void, tl, int)
-DEF_HELPER_2(fxsave, void, tl, int)
-DEF_HELPER_2(fxrstor, void, tl, int)
+DEF_HELPER_2(flds_FT0, void, env, i32)
+DEF_HELPER_2(fldl_FT0, void, env, i64)
+DEF_HELPER_2(fildl_FT0, void, env, s32)
+DEF_HELPER_2(flds_ST0, void, env, i32)
+DEF_HELPER_2(fldl_ST0, void, env, i64)
+DEF_HELPER_2(fildl_ST0, void, env, s32)
+DEF_HELPER_2(fildll_ST0, void, env, s64)
+DEF_HELPER_1(fsts_ST0, i32, env)
+DEF_HELPER_1(fstl_ST0, i64, env)
+DEF_HELPER_1(fist_ST0, s32, env)
+DEF_HELPER_1(fistl_ST0, s32, env)
+DEF_HELPER_1(fistll_ST0, s64, env)
+DEF_HELPER_1(fistt_ST0, s32, env)
+DEF_HELPER_1(fisttl_ST0, s32, env)
+DEF_HELPER_1(fisttll_ST0, s64, env)
+DEF_HELPER_2(fldt_ST0, void, env, tl)
+DEF_HELPER_2(fstt_ST0, void, env, tl)
+DEF_HELPER_1(fpush, void, env)
+DEF_HELPER_1(fpop, void, env)
+DEF_HELPER_1(fdecstp, void, env)
+DEF_HELPER_1(fincstp, void, env)
+DEF_HELPER_2(ffree_STN, void, env, int)
+DEF_HELPER_1(fmov_ST0_FT0, void, env)
+DEF_HELPER_2(fmov_FT0_STN, void, env, int)
+DEF_HELPER_2(fmov_ST0_STN, void, env, int)
+DEF_HELPER_2(fmov_STN_ST0, void, env, int)
+DEF_HELPER_2(fxchg_ST0_STN, void, env, int)
+DEF_HELPER_1(fcom_ST0_FT0, void, env)
+DEF_HELPER_1(fucom_ST0_FT0, void, env)
+DEF_HELPER_1(fcomi_ST0_FT0, void, env)
+DEF_HELPER_1(fucomi_ST0_FT0, void, env)
+DEF_HELPER_1(fadd_ST0_FT0, void, env)
+DEF_HELPER_1(fmul_ST0_FT0, void, env)
+DEF_HELPER_1(fsub_ST0_FT0, void, env)
+DEF_HELPER_1(fsubr_ST0_FT0, void, env)
+DEF_HELPER_1(fdiv_ST0_FT0, void, env)
+DEF_HELPER_1(fdivr_ST0_FT0, void, env)
+DEF_HELPER_2(fadd_STN_ST0, void, env, int)
+DEF_HELPER_2(fmul_STN_ST0, void, env, int)
+DEF_HELPER_2(fsub_STN_ST0, void, env, int)
+DEF_HELPER_2(fsubr_STN_ST0, void, env, int)
+DEF_HELPER_2(fdiv_STN_ST0, void, env, int)
+DEF_HELPER_2(fdivr_STN_ST0, void, env, int)
+DEF_HELPER_1(fchs_ST0, void, env)
+DEF_HELPER_1(fabs_ST0, void, env)
+DEF_HELPER_1(fxam_ST0, void, env)
+DEF_HELPER_1(fld1_ST0, void, env)
+DEF_HELPER_1(fldl2t_ST0, void, env)
+DEF_HELPER_1(fldl2e_ST0, void, env)
+DEF_HELPER_1(fldpi_ST0, void, env)
+DEF_HELPER_1(fldlg2_ST0, void, env)
+DEF_HELPER_1(fldln2_ST0, void, env)
+DEF_HELPER_1(fldz_ST0, void, env)
+DEF_HELPER_1(fldz_FT0, void, env)
+DEF_HELPER_1(fnstsw, i32, env)
+DEF_HELPER_1(fnstcw, i32, env)
+DEF_HELPER_2(fldcw, void, env, i32)
+DEF_HELPER_1(fclex, void, env)
+DEF_HELPER_1(fwait, void, env)
+DEF_HELPER_1(fninit, void, env)
+DEF_HELPER_2(fbld_ST0, void, env, tl)
+DEF_HELPER_2(fbst_ST0, void, env, tl)
+DEF_HELPER_1(f2xm1, void, env)
+DEF_HELPER_1(fyl2x, void, env)
+DEF_HELPER_1(fptan, void, env)
+DEF_HELPER_1(fpatan, void, env)
+DEF_HELPER_1(fxtract, void, env)
+DEF_HELPER_1(fprem1, void, env)
+DEF_HELPER_1(fprem, void, env)
+DEF_HELPER_1(fyl2xp1, void, env)
+DEF_HELPER_1(fsqrt, void, env)
+DEF_HELPER_1(fsincos, void, env)
+DEF_HELPER_1(frndint, void, env)
+DEF_HELPER_1(fscale, void, env)
+DEF_HELPER_1(fsin, void, env)
+DEF_HELPER_1(fcos, void, env)
+DEF_HELPER_3(fstenv, void, env, tl, int)
+DEF_HELPER_3(fldenv, void, env, tl, int)
+DEF_HELPER_3(fsave, void, env, tl, int)
+DEF_HELPER_3(frstor, void, env, tl, int)
+DEF_HELPER_3(fxsave, void, env, tl, int)
+DEF_HELPER_3(fxrstor, void, env, tl, int)
 DEF_HELPER_1(bsf, tl, tl)
 DEF_HELPER_1(bsr, tl, tl)
 DEF_HELPER_2(lzcnt, tl, tl, int)
 
 /* MMX/SSE */
 
-DEF_HELPER_1(ldmxcsr, void, i32)
-DEF_HELPER_0(enter_mmx, void)
-DEF_HELPER_0(emms, void)
-DEF_HELPER_2(movq, void, ptr, ptr)
+DEF_HELPER_2(ldmxcsr, void, env, i32)
+DEF_HELPER_1(enter_mmx, void, env)
+DEF_HELPER_1(emms, void, env)
+DEF_HELPER_3(movq, void, env, ptr, ptr)
 
 #define SHIFT 0
 #include "ops_sse_header.h"
diff --git a/target-i386/mem_helper.c b/target-i386/mem_helper.c
index 91353c0..4e0af4b 100644
--- a/target-i386/mem_helper.c
+++ b/target-i386/mem_helper.c
@@ -159,3 +159,52 @@  void tlb_fill(CPUX86State *env1, target_ulong addr, int is_write, int mmu_idx,
     env = saved_env;
 }
 #endif
+
+/* temporary wrappers */
+#if defined(CONFIG_USER_ONLY)
+#define ldub_data(addr) ldub_raw(addr)
+#define lduw_data(addr) lduw_raw(addr)
+#define ldl_data(addr) ldl_raw(addr)
+#define ldq_data(addr) ldq_raw(addr)
+
+#define stb_data(addr, data) stb_raw(addr, data)
+#define stw_data(addr, data) stw_raw(addr, data)
+#define stl_data(addr, data) stl_raw(addr, data)
+#define stq_data(addr, data) stq_raw(addr, data)
+#endif
+
+#define WRAP_LD(rettype, fn)                                    \
+    rettype cpu_ ## fn(CPUX86State *env1, target_ulong addr)    \
+    {                                                           \
+        CPUX86State *saved_env;                                 \
+        rettype ret;                                            \
+                                                                \
+        saved_env = env;                                        \
+        env = env1;                                             \
+        ret = fn(addr);                                         \
+        env = saved_env;                                        \
+        return ret;                                             \
+    }
+
+WRAP_LD(uint32_t, ldub_data)
+WRAP_LD(uint32_t, lduw_data)
+WRAP_LD(uint32_t, ldl_data)
+WRAP_LD(uint64_t, ldq_data)
+#undef WRAP_LD
+
+#define WRAP_ST(datatype, fn)                                           \
+    void cpu_ ## fn(CPUX86State *env1, target_ulong addr, datatype val) \
+    {                                                                   \
+        CPUX86State *saved_env;                                         \
+                                                                        \
+        saved_env = env;                                                \
+        env = env1;                                                     \
+        fn(addr, val);                                                  \
+        env = saved_env;                                                \
+    }
+
+WRAP_ST(uint32_t, stb_data)
+WRAP_ST(uint32_t, stw_data)
+WRAP_ST(uint32_t, stl_data)
+WRAP_ST(uint64_t, stq_data)
+#undef WRAP_ST
diff --git a/target-i386/ops_sse.h b/target-i386/ops_sse.h
index d109512..cad9d75 100644
--- a/target-i386/ops_sse.h
+++ b/target-i386/ops_sse.h
@@ -35,7 +35,7 @@ 
 #define SUFFIX _xmm
 #endif
 
-void glue(helper_psrlw, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     int shift;
 
@@ -59,7 +59,7 @@  void glue(helper_psrlw, SUFFIX)(Reg *d, Reg *s)
     }
 }
 
-void glue(helper_psraw, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     int shift;
 
@@ -80,7 +80,7 @@  void glue(helper_psraw, SUFFIX)(Reg *d, Reg *s)
 #endif
 }
 
-void glue(helper_psllw, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     int shift;
 
@@ -104,7 +104,7 @@  void glue(helper_psllw, SUFFIX)(Reg *d, Reg *s)
     }
 }
 
-void glue(helper_psrld, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     int shift;
 
@@ -124,7 +124,7 @@  void glue(helper_psrld, SUFFIX)(Reg *d, Reg *s)
     }
 }
 
-void glue(helper_psrad, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     int shift;
 
@@ -141,7 +141,7 @@  void glue(helper_psrad, SUFFIX)(Reg *d, Reg *s)
 #endif
 }
 
-void glue(helper_pslld, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     int shift;
 
@@ -161,7 +161,7 @@  void glue(helper_pslld, SUFFIX)(Reg *d, Reg *s)
     }
 }
 
-void glue(helper_psrlq, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     int shift;
 
@@ -179,7 +179,7 @@  void glue(helper_psrlq, SUFFIX)(Reg *d, Reg *s)
     }
 }
 
-void glue(helper_psllq, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     int shift;
 
@@ -198,7 +198,7 @@  void glue(helper_psllq, SUFFIX)(Reg *d, Reg *s)
 }
 
 #if SHIFT == 1
-void glue(helper_psrldq, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     int shift, i;
 
@@ -214,7 +214,7 @@  void glue(helper_psrldq, SUFFIX)(Reg *d, Reg *s)
     }
 }
 
-void glue(helper_pslldq, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     int shift, i;
 
@@ -232,7 +232,7 @@  void glue(helper_pslldq, SUFFIX)(Reg *d, Reg *s)
 #endif
 
 #define SSE_HELPER_B(name, F)                                   \
-    void glue(name, SUFFIX)(Reg *d, Reg *s)                     \
+    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
     {                                                           \
         d->B(0) = F(d->B(0), s->B(0));                          \
         d->B(1) = F(d->B(1), s->B(1));                          \
@@ -255,7 +255,7 @@  void glue(helper_pslldq, SUFFIX)(Reg *d, Reg *s)
             }
 
 #define SSE_HELPER_W(name, F)                                   \
-    void glue(name, SUFFIX)(Reg *d, Reg *s)                     \
+    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
     {                                                           \
         d->W(0) = F(d->W(0), s->W(0));                          \
         d->W(1) = F(d->W(1), s->W(1));                          \
@@ -270,7 +270,7 @@  void glue(helper_pslldq, SUFFIX)(Reg *d, Reg *s)
             }
 
 #define SSE_HELPER_L(name, F)                                   \
-    void glue(name, SUFFIX)(Reg *d, Reg *s)                     \
+    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
     {                                                           \
         d->L(0) = F(d->L(0), s->L(0));                          \
         d->L(1) = F(d->L(1), s->L(1));                          \
@@ -281,7 +281,7 @@  void glue(helper_pslldq, SUFFIX)(Reg *d, Reg *s)
             }
 
 #define SSE_HELPER_Q(name, F)                                   \
-    void glue(name, SUFFIX)(Reg *d, Reg *s)                     \
+    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
     {                                                           \
         d->Q(0) = F(d->Q(0), s->Q(0));                          \
         XMM_ONLY(                                               \
@@ -417,7 +417,7 @@  SSE_HELPER_W(helper_pmulhw, FMULHW)
 SSE_HELPER_B(helper_pavgb, FAVG)
 SSE_HELPER_W(helper_pavgw, FAVG)
 
-void glue(helper_pmuludq, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_pmuludq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     d->Q(0) = (uint64_t)s->L(0) * (uint64_t)d->L(0);
 #if SHIFT == 1
@@ -425,7 +425,7 @@  void glue(helper_pmuludq, SUFFIX)(Reg *d, Reg *s)
 #endif
 }
 
-void glue(helper_pmaddwd, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_pmaddwd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     int i;
 
@@ -445,7 +445,7 @@  static inline int abs1(int a)
     }
 }
 #endif
-void glue(helper_psadbw, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     unsigned int val;
 
@@ -473,13 +473,14 @@  void glue(helper_psadbw, SUFFIX)(Reg *d, Reg *s)
 #endif
 }
 
-void glue(helper_maskmov, SUFFIX)(Reg *d, Reg *s, target_ulong a0)
+void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
+                                  target_ulong a0)
 {
     int i;
 
     for (i = 0; i < (8 << SHIFT); i++) {
         if (s->B(i) & 0x80) {
-            stb(a0 + i, d->B(i));
+            cpu_stb_data(env, a0 + i, d->B(i));
         }
     }
 }
@@ -575,29 +576,29 @@  void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
 /* FPU ops */
 /* XXX: not accurate */
 
-#define SSE_HELPER_S(name, F)                           \
-    void helper_ ## name ## ps(Reg *d, Reg *s)          \
-    {                                                   \
-        d->XMM_S(0) = F(32, d->XMM_S(0), s->XMM_S(0));  \
-        d->XMM_S(1) = F(32, d->XMM_S(1), s->XMM_S(1));  \
-        d->XMM_S(2) = F(32, d->XMM_S(2), s->XMM_S(2));  \
-        d->XMM_S(3) = F(32, d->XMM_S(3), s->XMM_S(3));  \
-    }                                                   \
-                                                        \
-    void helper_ ## name ## ss(Reg *d, Reg *s)          \
-    {                                                   \
-        d->XMM_S(0) = F(32, d->XMM_S(0), s->XMM_S(0));  \
-    }                                                   \
-                                                        \
-    void helper_ ## name ## pd(Reg *d, Reg *s)          \
-    {                                                   \
-        d->XMM_D(0) = F(64, d->XMM_D(0), s->XMM_D(0));  \
-        d->XMM_D(1) = F(64, d->XMM_D(1), s->XMM_D(1));  \
-    }                                                   \
-                                                        \
-    void helper_ ## name ## sd(Reg *d, Reg *s)          \
-    {                                                   \
-        d->XMM_D(0) = F(64, d->XMM_D(0), s->XMM_D(0));  \
+#define SSE_HELPER_S(name, F)                                           \
+    void helper_ ## name ## ps(CPUX86State *env, Reg *d, Reg *s)        \
+    {                                                                   \
+        d->XMM_S(0) = F(32, d->XMM_S(0), s->XMM_S(0));                  \
+        d->XMM_S(1) = F(32, d->XMM_S(1), s->XMM_S(1));                  \
+        d->XMM_S(2) = F(32, d->XMM_S(2), s->XMM_S(2));                  \
+        d->XMM_S(3) = F(32, d->XMM_S(3), s->XMM_S(3));                  \
+    }                                                                   \
+                                                                        \
+    void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s)        \
+    {                                                                   \
+        d->XMM_S(0) = F(32, d->XMM_S(0), s->XMM_S(0));                  \
+    }                                                                   \
+                                                                        \
+    void helper_ ## name ## pd(CPUX86State *env, Reg *d, Reg *s)        \
+    {                                                                   \
+        d->XMM_D(0) = F(64, d->XMM_D(0), s->XMM_D(0));                  \
+        d->XMM_D(1) = F(64, d->XMM_D(1), s->XMM_D(1));                  \
+    }                                                                   \
+                                                                        \
+    void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s)        \
+    {                                                                   \
+        d->XMM_D(0) = F(64, d->XMM_D(0), s->XMM_D(0));                  \
     }
 
 #define FPU_ADD(size, a, b) float ## size ## _add(a, b, &env->sse_status)
@@ -625,7 +626,7 @@  SSE_HELPER_S(sqrt, FPU_SQRT)
 
 
 /* float to float conversions */
-void helper_cvtps2pd(Reg *d, Reg *s)
+void helper_cvtps2pd(CPUX86State *env, Reg *d, Reg *s)
 {
     float32 s0, s1;
 
@@ -635,25 +636,25 @@  void helper_cvtps2pd(Reg *d, Reg *s)
     d->XMM_D(1) = float32_to_float64(s1, &env->sse_status);
 }
 
-void helper_cvtpd2ps(Reg *d, Reg *s)
+void helper_cvtpd2ps(CPUX86State *env, Reg *d, Reg *s)
 {
     d->XMM_S(0) = float64_to_float32(s->XMM_D(0), &env->sse_status);
     d->XMM_S(1) = float64_to_float32(s->XMM_D(1), &env->sse_status);
     d->Q(1) = 0;
 }
 
-void helper_cvtss2sd(Reg *d, Reg *s)
+void helper_cvtss2sd(CPUX86State *env, Reg *d, Reg *s)
 {
     d->XMM_D(0) = float32_to_float64(s->XMM_S(0), &env->sse_status);
 }
 
-void helper_cvtsd2ss(Reg *d, Reg *s)
+void helper_cvtsd2ss(CPUX86State *env, Reg *d, Reg *s)
 {
     d->XMM_S(0) = float64_to_float32(s->XMM_D(0), &env->sse_status);
 }
 
 /* integer to float */
-void helper_cvtdq2ps(Reg *d, Reg *s)
+void helper_cvtdq2ps(CPUX86State *env, Reg *d, Reg *s)
 {
     d->XMM_S(0) = int32_to_float32(s->XMM_L(0), &env->sse_status);
     d->XMM_S(1) = int32_to_float32(s->XMM_L(1), &env->sse_status);
@@ -661,7 +662,7 @@  void helper_cvtdq2ps(Reg *d, Reg *s)
     d->XMM_S(3) = int32_to_float32(s->XMM_L(3), &env->sse_status);
 }
 
-void helper_cvtdq2pd(Reg *d, Reg *s)
+void helper_cvtdq2pd(CPUX86State *env, Reg *d, Reg *s)
 {
     int32_t l0, l1;
 
@@ -671,42 +672,42 @@  void helper_cvtdq2pd(Reg *d, Reg *s)
     d->XMM_D(1) = int32_to_float64(l1, &env->sse_status);
 }
 
-void helper_cvtpi2ps(XMMReg *d, MMXReg *s)
+void helper_cvtpi2ps(CPUX86State *env, XMMReg *d, MMXReg *s)
 {
     d->XMM_S(0) = int32_to_float32(s->MMX_L(0), &env->sse_status);
     d->XMM_S(1) = int32_to_float32(s->MMX_L(1), &env->sse_status);
 }
 
-void helper_cvtpi2pd(XMMReg *d, MMXReg *s)
+void helper_cvtpi2pd(CPUX86State *env, XMMReg *d, MMXReg *s)
 {
     d->XMM_D(0) = int32_to_float64(s->MMX_L(0), &env->sse_status);
     d->XMM_D(1) = int32_to_float64(s->MMX_L(1), &env->sse_status);
 }
 
-void helper_cvtsi2ss(XMMReg *d, uint32_t val)
+void helper_cvtsi2ss(CPUX86State *env, XMMReg *d, uint32_t val)
 {
     d->XMM_S(0) = int32_to_float32(val, &env->sse_status);
 }
 
-void helper_cvtsi2sd(XMMReg *d, uint32_t val)
+void helper_cvtsi2sd(CPUX86State *env, XMMReg *d, uint32_t val)
 {
     d->XMM_D(0) = int32_to_float64(val, &env->sse_status);
 }
 
 #ifdef TARGET_X86_64
-void helper_cvtsq2ss(XMMReg *d, uint64_t val)
+void helper_cvtsq2ss(CPUX86State *env, XMMReg *d, uint64_t val)
 {
     d->XMM_S(0) = int64_to_float32(val, &env->sse_status);
 }
 
-void helper_cvtsq2sd(XMMReg *d, uint64_t val)
+void helper_cvtsq2sd(CPUX86State *env, XMMReg *d, uint64_t val)
 {
     d->XMM_D(0) = int64_to_float64(val, &env->sse_status);
 }
 #endif
 
 /* float to integer */
-void helper_cvtps2dq(XMMReg *d, XMMReg *s)
+void helper_cvtps2dq(CPUX86State *env, XMMReg *d, XMMReg *s)
 {
     d->XMM_L(0) = float32_to_int32(s->XMM_S(0), &env->sse_status);
     d->XMM_L(1) = float32_to_int32(s->XMM_S(1), &env->sse_status);
@@ -714,49 +715,49 @@  void helper_cvtps2dq(XMMReg *d, XMMReg *s)
     d->XMM_L(3) = float32_to_int32(s->XMM_S(3), &env->sse_status);
 }
 
-void helper_cvtpd2dq(XMMReg *d, XMMReg *s)
+void helper_cvtpd2dq(CPUX86State *env, XMMReg *d, XMMReg *s)
 {
     d->XMM_L(0) = float64_to_int32(s->XMM_D(0), &env->sse_status);
     d->XMM_L(1) = float64_to_int32(s->XMM_D(1), &env->sse_status);
     d->XMM_Q(1) = 0;
 }
 
-void helper_cvtps2pi(MMXReg *d, XMMReg *s)
+void helper_cvtps2pi(CPUX86State *env, MMXReg *d, XMMReg *s)
 {
     d->MMX_L(0) = float32_to_int32(s->XMM_S(0), &env->sse_status);
     d->MMX_L(1) = float32_to_int32(s->XMM_S(1), &env->sse_status);
 }
 
-void helper_cvtpd2pi(MMXReg *d, XMMReg *s)
+void helper_cvtpd2pi(CPUX86State *env, MMXReg *d, XMMReg *s)
 {
     d->MMX_L(0) = float64_to_int32(s->XMM_D(0), &env->sse_status);
     d->MMX_L(1) = float64_to_int32(s->XMM_D(1), &env->sse_status);
 }
 
-int32_t helper_cvtss2si(XMMReg *s)
+int32_t helper_cvtss2si(CPUX86State *env, XMMReg *s)
 {
     return float32_to_int32(s->XMM_S(0), &env->sse_status);
 }
 
-int32_t helper_cvtsd2si(XMMReg *s)
+int32_t helper_cvtsd2si(CPUX86State *env, XMMReg *s)
 {
     return float64_to_int32(s->XMM_D(0), &env->sse_status);
 }
 
 #ifdef TARGET_X86_64
-int64_t helper_cvtss2sq(XMMReg *s)
+int64_t helper_cvtss2sq(CPUX86State *env, XMMReg *s)
 {
     return float32_to_int64(s->XMM_S(0), &env->sse_status);
 }
 
-int64_t helper_cvtsd2sq(XMMReg *s)
+int64_t helper_cvtsd2sq(CPUX86State *env, XMMReg *s)
 {
     return float64_to_int64(s->XMM_D(0), &env->sse_status);
 }
 #endif
 
 /* float to integer truncated */
-void helper_cvttps2dq(XMMReg *d, XMMReg *s)
+void helper_cvttps2dq(CPUX86State *env, XMMReg *d, XMMReg *s)
 {
     d->XMM_L(0) = float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status);
     d->XMM_L(1) = float32_to_int32_round_to_zero(s->XMM_S(1), &env->sse_status);
@@ -764,48 +765,48 @@  void helper_cvttps2dq(XMMReg *d, XMMReg *s)
     d->XMM_L(3) = float32_to_int32_round_to_zero(s->XMM_S(3), &env->sse_status);
 }
 
-void helper_cvttpd2dq(XMMReg *d, XMMReg *s)
+void helper_cvttpd2dq(CPUX86State *env, XMMReg *d, XMMReg *s)
 {
     d->XMM_L(0) = float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status);
     d->XMM_L(1) = float64_to_int32_round_to_zero(s->XMM_D(1), &env->sse_status);
     d->XMM_Q(1) = 0;
 }
 
-void helper_cvttps2pi(MMXReg *d, XMMReg *s)
+void helper_cvttps2pi(CPUX86State *env, MMXReg *d, XMMReg *s)
 {
     d->MMX_L(0) = float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status);
     d->MMX_L(1) = float32_to_int32_round_to_zero(s->XMM_S(1), &env->sse_status);
 }
 
-void helper_cvttpd2pi(MMXReg *d, XMMReg *s)
+void helper_cvttpd2pi(CPUX86State *env, MMXReg *d, XMMReg *s)
 {
     d->MMX_L(0) = float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status);
     d->MMX_L(1) = float64_to_int32_round_to_zero(s->XMM_D(1), &env->sse_status);
 }
 
-int32_t helper_cvttss2si(XMMReg *s)
+int32_t helper_cvttss2si(CPUX86State *env, XMMReg *s)
 {
     return float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status);
 }
 
-int32_t helper_cvttsd2si(XMMReg *s)
+int32_t helper_cvttsd2si(CPUX86State *env, XMMReg *s)
 {
     return float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status);
 }
 
 #ifdef TARGET_X86_64
-int64_t helper_cvttss2sq(XMMReg *s)
+int64_t helper_cvttss2sq(CPUX86State *env, XMMReg *s)
 {
     return float32_to_int64_round_to_zero(s->XMM_S(0), &env->sse_status);
 }
 
-int64_t helper_cvttsd2sq(XMMReg *s)
+int64_t helper_cvttsd2sq(CPUX86State *env, XMMReg *s)
 {
     return float64_to_int64_round_to_zero(s->XMM_D(0), &env->sse_status);
 }
 #endif
 
-void helper_rsqrtps(XMMReg *d, XMMReg *s)
+void helper_rsqrtps(CPUX86State *env, XMMReg *d, XMMReg *s)
 {
     d->XMM_S(0) = float32_div(float32_one,
                               float32_sqrt(s->XMM_S(0), &env->sse_status),
@@ -821,14 +822,14 @@  void helper_rsqrtps(XMMReg *d, XMMReg *s)
                               &env->sse_status);
 }
 
-void helper_rsqrtss(XMMReg *d, XMMReg *s)
+void helper_rsqrtss(CPUX86State *env, XMMReg *d, XMMReg *s)
 {
     d->XMM_S(0) = float32_div(float32_one,
                               float32_sqrt(s->XMM_S(0), &env->sse_status),
                               &env->sse_status);
 }
 
-void helper_rcpps(XMMReg *d, XMMReg *s)
+void helper_rcpps(CPUX86State *env, XMMReg *d, XMMReg *s)
 {
     d->XMM_S(0) = float32_div(float32_one, s->XMM_S(0), &env->sse_status);
     d->XMM_S(1) = float32_div(float32_one, s->XMM_S(1), &env->sse_status);
@@ -836,7 +837,7 @@  void helper_rcpps(XMMReg *d, XMMReg *s)
     d->XMM_S(3) = float32_div(float32_one, s->XMM_S(3), &env->sse_status);
 }
 
-void helper_rcpss(XMMReg *d, XMMReg *s)
+void helper_rcpss(CPUX86State *env, XMMReg *d, XMMReg *s)
 {
     d->XMM_S(0) = float32_div(float32_one, s->XMM_S(0), &env->sse_status);
 }
@@ -853,12 +854,12 @@  static inline uint64_t helper_extrq(uint64_t src, int shift, int len)
     return (src >> shift) & mask;
 }
 
-void helper_extrq_r(XMMReg *d, XMMReg *s)
+void helper_extrq_r(CPUX86State *env, XMMReg *d, XMMReg *s)
 {
     d->XMM_Q(0) = helper_extrq(d->XMM_Q(0), s->XMM_B(1), s->XMM_B(0));
 }
 
-void helper_extrq_i(XMMReg *d, int index, int length)
+void helper_extrq_i(CPUX86State *env, XMMReg *d, int index, int length)
 {
     d->XMM_Q(0) = helper_extrq(d->XMM_Q(0), index, length);
 }
@@ -875,17 +876,17 @@  static inline uint64_t helper_insertq(uint64_t src, int shift, int len)
     return (src & ~(mask << shift)) | ((src & mask) << shift);
 }
 
-void helper_insertq_r(XMMReg *d, XMMReg *s)
+void helper_insertq_r(CPUX86State *env, XMMReg *d, XMMReg *s)
 {
     d->XMM_Q(0) = helper_insertq(s->XMM_Q(0), s->XMM_B(9), s->XMM_B(8));
 }
 
-void helper_insertq_i(XMMReg *d, int index, int length)
+void helper_insertq_i(CPUX86State *env, XMMReg *d, int index, int length)
 {
     d->XMM_Q(0) = helper_insertq(d->XMM_Q(0), index, length);
 }
 
-void helper_haddps(XMMReg *d, XMMReg *s)
+void helper_haddps(CPUX86State *env, XMMReg *d, XMMReg *s)
 {
     XMMReg r;
 
@@ -896,7 +897,7 @@  void helper_haddps(XMMReg *d, XMMReg *s)
     *d = r;
 }
 
-void helper_haddpd(XMMReg *d, XMMReg *s)
+void helper_haddpd(CPUX86State *env, XMMReg *d, XMMReg *s)
 {
     XMMReg r;
 
@@ -905,7 +906,7 @@  void helper_haddpd(XMMReg *d, XMMReg *s)
     *d = r;
 }
 
-void helper_hsubps(XMMReg *d, XMMReg *s)
+void helper_hsubps(CPUX86State *env, XMMReg *d, XMMReg *s)
 {
     XMMReg r;
 
@@ -916,7 +917,7 @@  void helper_hsubps(XMMReg *d, XMMReg *s)
     *d = r;
 }
 
-void helper_hsubpd(XMMReg *d, XMMReg *s)
+void helper_hsubpd(CPUX86State *env, XMMReg *d, XMMReg *s)
 {
     XMMReg r;
 
@@ -925,7 +926,7 @@  void helper_hsubpd(XMMReg *d, XMMReg *s)
     *d = r;
 }
 
-void helper_addsubps(XMMReg *d, XMMReg *s)
+void helper_addsubps(CPUX86State *env, XMMReg *d, XMMReg *s)
 {
     d->XMM_S(0) = float32_sub(d->XMM_S(0), s->XMM_S(0), &env->sse_status);
     d->XMM_S(1) = float32_add(d->XMM_S(1), s->XMM_S(1), &env->sse_status);
@@ -933,36 +934,36 @@  void helper_addsubps(XMMReg *d, XMMReg *s)
     d->XMM_S(3) = float32_add(d->XMM_S(3), s->XMM_S(3), &env->sse_status);
 }
 
-void helper_addsubpd(XMMReg *d, XMMReg *s)
+void helper_addsubpd(CPUX86State *env, XMMReg *d, XMMReg *s)
 {
     d->XMM_D(0) = float64_sub(d->XMM_D(0), s->XMM_D(0), &env->sse_status);
     d->XMM_D(1) = float64_add(d->XMM_D(1), s->XMM_D(1), &env->sse_status);
 }
 
 /* XXX: unordered */
-#define SSE_HELPER_CMP(name, F)                         \
-    void helper_ ## name ## ps(Reg *d, Reg *s)          \
-    {                                                   \
-        d->XMM_L(0) = F(32, d->XMM_S(0), s->XMM_S(0));  \
-        d->XMM_L(1) = F(32, d->XMM_S(1), s->XMM_S(1));  \
-        d->XMM_L(2) = F(32, d->XMM_S(2), s->XMM_S(2));  \
-        d->XMM_L(3) = F(32, d->XMM_S(3), s->XMM_S(3));  \
-    }                                                   \
-                                                        \
-    void helper_ ## name ## ss(Reg *d, Reg *s)          \
-    {                                                   \
-        d->XMM_L(0) = F(32, d->XMM_S(0), s->XMM_S(0));  \
-    }                                                   \
-                                                        \
-    void helper_ ## name ## pd(Reg *d, Reg *s)          \
-    {                                                   \
-        d->XMM_Q(0) = F(64, d->XMM_D(0), s->XMM_D(0));  \
-        d->XMM_Q(1) = F(64, d->XMM_D(1), s->XMM_D(1));  \
-    }                                                   \
-                                                        \
-    void helper_ ## name ## sd(Reg *d, Reg *s)          \
-    {                                                   \
-        d->XMM_Q(0) = F(64, d->XMM_D(0), s->XMM_D(0));  \
+#define SSE_HELPER_CMP(name, F)                                         \
+    void helper_ ## name ## ps(CPUX86State *env, Reg *d, Reg *s)        \
+    {                                                                   \
+        d->XMM_L(0) = F(32, d->XMM_S(0), s->XMM_S(0));                  \
+        d->XMM_L(1) = F(32, d->XMM_S(1), s->XMM_S(1));                  \
+        d->XMM_L(2) = F(32, d->XMM_S(2), s->XMM_S(2));                  \
+        d->XMM_L(3) = F(32, d->XMM_S(3), s->XMM_S(3));                  \
+    }                                                                   \
+                                                                        \
+    void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s)        \
+    {                                                                   \
+        d->XMM_L(0) = F(32, d->XMM_S(0), s->XMM_S(0));                  \
+    }                                                                   \
+                                                                        \
+    void helper_ ## name ## pd(CPUX86State *env, Reg *d, Reg *s)        \
+    {                                                                   \
+        d->XMM_Q(0) = F(64, d->XMM_D(0), s->XMM_D(0));                  \
+        d->XMM_Q(1) = F(64, d->XMM_D(1), s->XMM_D(1));                  \
+    }                                                                   \
+                                                                        \
+    void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s)        \
+    {                                                                   \
+        d->XMM_Q(0) = F(64, d->XMM_D(0), s->XMM_D(0));                  \
     }
 
 #define FPU_CMPEQ(size, a, b)                                           \
@@ -993,7 +994,7 @@  SSE_HELPER_CMP(cmpord, FPU_CMPORD)
 
 static const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
 
-void helper_ucomiss(Reg *d, Reg *s)
+void helper_ucomiss(CPUX86State *env, Reg *d, Reg *s)
 {
     int ret;
     float32 s0, s1;
@@ -1004,7 +1005,7 @@  void helper_ucomiss(Reg *d, Reg *s)
     CC_SRC = comis_eflags[ret + 1];
 }
 
-void helper_comiss(Reg *d, Reg *s)
+void helper_comiss(CPUX86State *env, Reg *d, Reg *s)
 {
     int ret;
     float32 s0, s1;
@@ -1015,7 +1016,7 @@  void helper_comiss(Reg *d, Reg *s)
     CC_SRC = comis_eflags[ret + 1];
 }
 
-void helper_ucomisd(Reg *d, Reg *s)
+void helper_ucomisd(CPUX86State *env, Reg *d, Reg *s)
 {
     int ret;
     float64 d0, d1;
@@ -1026,7 +1027,7 @@  void helper_ucomisd(Reg *d, Reg *s)
     CC_SRC = comis_eflags[ret + 1];
 }
 
-void helper_comisd(Reg *d, Reg *s)
+void helper_comisd(CPUX86State *env, Reg *d, Reg *s)
 {
     int ret;
     float64 d0, d1;
@@ -1037,7 +1038,7 @@  void helper_comisd(Reg *d, Reg *s)
     CC_SRC = comis_eflags[ret + 1];
 }
 
-uint32_t helper_movmskps(Reg *s)
+uint32_t helper_movmskps(CPUX86State *env, Reg *s)
 {
     int b0, b1, b2, b3;
 
@@ -1048,7 +1049,7 @@  uint32_t helper_movmskps(Reg *s)
     return b0 | (b1 << 1) | (b2 << 2) | (b3 << 3);
 }
 
-uint32_t helper_movmskpd(Reg *s)
+uint32_t helper_movmskpd(CPUX86State *env, Reg *s)
 {
     int b0, b1;
 
@@ -1059,7 +1060,7 @@  uint32_t helper_movmskpd(Reg *s)
 
 #endif
 
-uint32_t glue(helper_pmovmskb, SUFFIX)(Reg *s)
+uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State *env, Reg *s)
 {
     uint32_t val;
 
@@ -1085,7 +1086,7 @@  uint32_t glue(helper_pmovmskb, SUFFIX)(Reg *s)
     return val;
 }
 
-void glue(helper_packsswb, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_packsswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     Reg r;
 
@@ -1112,7 +1113,7 @@  void glue(helper_packsswb, SUFFIX)(Reg *d, Reg *s)
     *d = r;
 }
 
-void glue(helper_packuswb, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_packuswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     Reg r;
 
@@ -1139,7 +1140,7 @@  void glue(helper_packuswb, SUFFIX)(Reg *d, Reg *s)
     *d = r;
 }
 
-void glue(helper_packssdw, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     Reg r;
 
@@ -1160,7 +1161,8 @@  void glue(helper_packssdw, SUFFIX)(Reg *d, Reg *s)
 
 #define UNPCK_OP(base_name, base)                                       \
                                                                         \
-    void glue(helper_punpck ## base_name ## bw, SUFFIX)(Reg *d, Reg *s) \
+    void glue(helper_punpck ## base_name ## bw, SUFFIX)(CPUX86State *env,\
+                                                        Reg *d, Reg *s) \
     {                                                                   \
         Reg r;                                                          \
                                                                         \
@@ -1181,11 +1183,12 @@  void glue(helper_packssdw, SUFFIX)(Reg *d, Reg *s)
                  r.B(13) = s->B((base << (SHIFT + 2)) + 6);             \
                  r.B(14) = d->B((base << (SHIFT + 2)) + 7);             \
                  r.B(15) = s->B((base << (SHIFT + 2)) + 7);             \
-                                                                        ) \
+                                                                      ) \
             *d = r;                                                     \
     }                                                                   \
                                                                         \
-    void glue(helper_punpck ## base_name ## wd, SUFFIX)(Reg *d, Reg *s) \
+    void glue(helper_punpck ## base_name ## wd, SUFFIX)(CPUX86State *env,\
+                                                        Reg *d, Reg *s) \
     {                                                                   \
         Reg r;                                                          \
                                                                         \
@@ -1198,11 +1201,12 @@  void glue(helper_packssdw, SUFFIX)(Reg *d, Reg *s)
                  r.W(5) = s->W((base << (SHIFT + 1)) + 2);              \
                  r.W(6) = d->W((base << (SHIFT + 1)) + 3);              \
                  r.W(7) = s->W((base << (SHIFT + 1)) + 3);              \
-                                                                        ) \
+                                                                      ) \
             *d = r;                                                     \
     }                                                                   \
                                                                         \
-    void glue(helper_punpck ## base_name ## dq, SUFFIX)(Reg *d, Reg *s) \
+    void glue(helper_punpck ## base_name ## dq, SUFFIX)(CPUX86State *env,\
+                                                        Reg *d, Reg *s) \
     {                                                                   \
         Reg r;                                                          \
                                                                         \
@@ -1211,12 +1215,14 @@  void glue(helper_packssdw, SUFFIX)(Reg *d, Reg *s)
         XMM_ONLY(                                                       \
                  r.L(2) = d->L((base << SHIFT) + 1);                    \
                  r.L(3) = s->L((base << SHIFT) + 1);                    \
-                                                                        ) \
+                                                                      ) \
             *d = r;                                                     \
     }                                                                   \
                                                                         \
     XMM_ONLY(                                                           \
-             void glue(helper_punpck ## base_name ## qdq, SUFFIX)(Reg *d, \
+             void glue(helper_punpck ## base_name ## qdq, SUFFIX)(CPUX86State \
+                                                                  *env, \
+                                                                  Reg *d, \
                                                                   Reg *s) \
              {                                                          \
                  Reg r;                                                 \
@@ -1232,25 +1238,25 @@  UNPCK_OP(h, 1)
 
 /* 3DNow! float ops */
 #if SHIFT == 0
-void helper_pi2fd(MMXReg *d, MMXReg *s)
+void helper_pi2fd(CPUX86State *env, MMXReg *d, MMXReg *s)
 {
     d->MMX_S(0) = int32_to_float32(s->MMX_L(0), &env->mmx_status);
     d->MMX_S(1) = int32_to_float32(s->MMX_L(1), &env->mmx_status);
 }
 
-void helper_pi2fw(MMXReg *d, MMXReg *s)
+void helper_pi2fw(CPUX86State *env, MMXReg *d, MMXReg *s)
 {
     d->MMX_S(0) = int32_to_float32((int16_t)s->MMX_W(0), &env->mmx_status);
     d->MMX_S(1) = int32_to_float32((int16_t)s->MMX_W(2), &env->mmx_status);
 }
 
-void helper_pf2id(MMXReg *d, MMXReg *s)
+void helper_pf2id(CPUX86State *env, MMXReg *d, MMXReg *s)
 {
     d->MMX_L(0) = float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status);
     d->MMX_L(1) = float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status);
 }
 
-void helper_pf2iw(MMXReg *d, MMXReg *s)
+void helper_pf2iw(CPUX86State *env, MMXReg *d, MMXReg *s)
 {
     d->MMX_L(0) = satsw(float32_to_int32_round_to_zero(s->MMX_S(0),
                                                        &env->mmx_status));
@@ -1258,7 +1264,7 @@  void helper_pf2iw(MMXReg *d, MMXReg *s)
                                                        &env->mmx_status));
 }
 
-void helper_pfacc(MMXReg *d, MMXReg *s)
+void helper_pfacc(CPUX86State *env, MMXReg *d, MMXReg *s)
 {
     MMXReg r;
 
@@ -1267,13 +1273,13 @@  void helper_pfacc(MMXReg *d, MMXReg *s)
     *d = r;
 }
 
-void helper_pfadd(MMXReg *d, MMXReg *s)
+void helper_pfadd(CPUX86State *env, MMXReg *d, MMXReg *s)
 {
     d->MMX_S(0) = float32_add(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
     d->MMX_S(1) = float32_add(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
 }
 
-void helper_pfcmpeq(MMXReg *d, MMXReg *s)
+void helper_pfcmpeq(CPUX86State *env, MMXReg *d, MMXReg *s)
 {
     d->MMX_L(0) = float32_eq_quiet(d->MMX_S(0), s->MMX_S(0),
                                    &env->mmx_status) ? -1 : 0;
@@ -1281,7 +1287,7 @@  void helper_pfcmpeq(MMXReg *d, MMXReg *s)
                                    &env->mmx_status) ? -1 : 0;
 }
 
-void helper_pfcmpge(MMXReg *d, MMXReg *s)
+void helper_pfcmpge(CPUX86State *env, MMXReg *d, MMXReg *s)
 {
     d->MMX_L(0) = float32_le(s->MMX_S(0), d->MMX_S(0),
                              &env->mmx_status) ? -1 : 0;
@@ -1289,7 +1295,7 @@  void helper_pfcmpge(MMXReg *d, MMXReg *s)
                              &env->mmx_status) ? -1 : 0;
 }
 
-void helper_pfcmpgt(MMXReg *d, MMXReg *s)
+void helper_pfcmpgt(CPUX86State *env, MMXReg *d, MMXReg *s)
 {
     d->MMX_L(0) = float32_lt(s->MMX_S(0), d->MMX_S(0),
                              &env->mmx_status) ? -1 : 0;
@@ -1297,7 +1303,7 @@  void helper_pfcmpgt(MMXReg *d, MMXReg *s)
                              &env->mmx_status) ? -1 : 0;
 }
 
-void helper_pfmax(MMXReg *d, MMXReg *s)
+void helper_pfmax(CPUX86State *env, MMXReg *d, MMXReg *s)
 {
     if (float32_lt(d->MMX_S(0), s->MMX_S(0), &env->mmx_status)) {
         d->MMX_S(0) = s->MMX_S(0);
@@ -1307,7 +1313,7 @@  void helper_pfmax(MMXReg *d, MMXReg *s)
     }
 }
 
-void helper_pfmin(MMXReg *d, MMXReg *s)
+void helper_pfmin(CPUX86State *env, MMXReg *d, MMXReg *s)
 {
     if (float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status)) {
         d->MMX_S(0) = s->MMX_S(0);
@@ -1317,13 +1323,13 @@  void helper_pfmin(MMXReg *d, MMXReg *s)
     }
 }
 
-void helper_pfmul(MMXReg *d, MMXReg *s)
+void helper_pfmul(CPUX86State *env, MMXReg *d, MMXReg *s)
 {
     d->MMX_S(0) = float32_mul(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
     d->MMX_S(1) = float32_mul(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
 }
 
-void helper_pfnacc(MMXReg *d, MMXReg *s)
+void helper_pfnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
 {
     MMXReg r;
 
@@ -1332,7 +1338,7 @@  void helper_pfnacc(MMXReg *d, MMXReg *s)
     *d = r;
 }
 
-void helper_pfpnacc(MMXReg *d, MMXReg *s)
+void helper_pfpnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
 {
     MMXReg r;
 
@@ -1341,13 +1347,13 @@  void helper_pfpnacc(MMXReg *d, MMXReg *s)
     *d = r;
 }
 
-void helper_pfrcp(MMXReg *d, MMXReg *s)
+void helper_pfrcp(CPUX86State *env, MMXReg *d, MMXReg *s)
 {
     d->MMX_S(0) = float32_div(float32_one, s->MMX_S(0), &env->mmx_status);
     d->MMX_S(1) = d->MMX_S(0);
 }
 
-void helper_pfrsqrt(MMXReg *d, MMXReg *s)
+void helper_pfrsqrt(CPUX86State *env, MMXReg *d, MMXReg *s)
 {
     d->MMX_L(1) = s->MMX_L(0) & 0x7fffffff;
     d->MMX_S(1) = float32_div(float32_one,
@@ -1357,19 +1363,19 @@  void helper_pfrsqrt(MMXReg *d, MMXReg *s)
     d->MMX_L(0) = d->MMX_L(1);
 }
 
-void helper_pfsub(MMXReg *d, MMXReg *s)
+void helper_pfsub(CPUX86State *env, MMXReg *d, MMXReg *s)
 {
     d->MMX_S(0) = float32_sub(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
     d->MMX_S(1) = float32_sub(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
 }
 
-void helper_pfsubr(MMXReg *d, MMXReg *s)
+void helper_pfsubr(CPUX86State *env, MMXReg *d, MMXReg *s)
 {
     d->MMX_S(0) = float32_sub(s->MMX_S(0), d->MMX_S(0), &env->mmx_status);
     d->MMX_S(1) = float32_sub(s->MMX_S(1), d->MMX_S(1), &env->mmx_status);
 }
 
-void helper_pswapd(MMXReg *d, MMXReg *s)
+void helper_pswapd(CPUX86State *env, MMXReg *d, MMXReg *s)
 {
     MMXReg r;
 
@@ -1380,7 +1386,7 @@  void helper_pswapd(MMXReg *d, MMXReg *s)
 #endif
 
 /* SSSE3 op helpers */
-void glue(helper_pshufb, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     int i;
     Reg r;
@@ -1392,7 +1398,7 @@  void glue(helper_pshufb, SUFFIX)(Reg *d, Reg *s)
     *d = r;
 }
 
-void glue(helper_phaddw, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_phaddw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     d->W(0) = (int16_t)d->W(0) + (int16_t)d->W(1);
     d->W(1) = (int16_t)d->W(2) + (int16_t)d->W(3);
@@ -1404,7 +1410,7 @@  void glue(helper_phaddw, SUFFIX)(Reg *d, Reg *s)
     XMM_ONLY(d->W(7) = (int16_t)s->W(6) + (int16_t)s->W(7));
 }
 
-void glue(helper_phaddd, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_phaddd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     d->L(0) = (int32_t)d->L(0) + (int32_t)d->L(1);
     XMM_ONLY(d->L(1) = (int32_t)d->L(2) + (int32_t)d->L(3));
@@ -1412,7 +1418,7 @@  void glue(helper_phaddd, SUFFIX)(Reg *d, Reg *s)
     XMM_ONLY(d->L(3) = (int32_t)s->L(2) + (int32_t)s->L(3));
 }
 
-void glue(helper_phaddsw, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_phaddsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     d->W(0) = satsw((int16_t)d->W(0) + (int16_t)d->W(1));
     d->W(1) = satsw((int16_t)d->W(2) + (int16_t)d->W(3));
@@ -1424,7 +1430,7 @@  void glue(helper_phaddsw, SUFFIX)(Reg *d, Reg *s)
     XMM_ONLY(d->W(7) = satsw((int16_t)s->W(6) + (int16_t)s->W(7)));
 }
 
-void glue(helper_pmaddubsw, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     d->W(0) = satsw((int8_t)s->B(0) * (uint8_t)d->B(0) +
                     (int8_t)s->B(1) * (uint8_t)d->B(1));
@@ -1446,7 +1452,7 @@  void glue(helper_pmaddubsw, SUFFIX)(Reg *d, Reg *s)
 #endif
 }
 
-void glue(helper_phsubw, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_phsubw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     d->W(0) = (int16_t)d->W(0) - (int16_t)d->W(1);
     d->W(1) = (int16_t)d->W(2) - (int16_t)d->W(3);
@@ -1458,7 +1464,7 @@  void glue(helper_phsubw, SUFFIX)(Reg *d, Reg *s)
     XMM_ONLY(d->W(7) = (int16_t)s->W(6) - (int16_t)s->W(7));
 }
 
-void glue(helper_phsubd, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_phsubd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     d->L(0) = (int32_t)d->L(0) - (int32_t)d->L(1);
     XMM_ONLY(d->L(1) = (int32_t)d->L(2) - (int32_t)d->L(3));
@@ -1466,7 +1472,7 @@  void glue(helper_phsubd, SUFFIX)(Reg *d, Reg *s)
     XMM_ONLY(d->L(3) = (int32_t)s->L(2) - (int32_t)s->L(3));
 }
 
-void glue(helper_phsubsw, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_phsubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     d->W(0) = satsw((int16_t)d->W(0) - (int16_t)d->W(1));
     d->W(1) = satsw((int16_t)d->W(2) - (int16_t)d->W(3));
@@ -1495,7 +1501,8 @@  SSE_HELPER_B(helper_psignb, FSIGNB)
 SSE_HELPER_W(helper_psignw, FSIGNW)
 SSE_HELPER_L(helper_psignd, FSIGNL)
 
-void glue(helper_palignr, SUFFIX)(Reg *d, Reg *s, int32_t shift)
+void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
+                                  int32_t shift)
 {
     Reg r;
 
@@ -1529,7 +1536,7 @@  void glue(helper_palignr, SUFFIX)(Reg *d, Reg *s, int32_t shift)
 
 #if SHIFT == 1
 #define SSE_HELPER_V(name, elem, num, F)                                \
-    void glue(name, SUFFIX)(Reg *d, Reg *s)                             \
+    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)           \
     {                                                                   \
         d->elem(0) = F(d->elem(0), s->elem(0), XMM0.elem(0));           \
         d->elem(1) = F(d->elem(1), s->elem(1), XMM0.elem(1));           \
@@ -1556,7 +1563,7 @@  void glue(helper_palignr, SUFFIX)(Reg *d, Reg *s, int32_t shift)
     }
 
 #define SSE_HELPER_I(name, elem, num, F)                                \
-    void glue(name, SUFFIX)(Reg *d, Reg *s, uint32_t imm)               \
+    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t imm) \
     {                                                                   \
         d->elem(0) = F(d->elem(0), s->elem(0), ((imm >> 0) & 1));       \
         d->elem(1) = F(d->elem(1), s->elem(1), ((imm >> 1) & 1));       \
@@ -1596,7 +1603,7 @@  SSE_HELPER_V(helper_pblendvb, B, 16, FBLENDVB)
 SSE_HELPER_V(helper_blendvps, L, 4, FBLENDVPS)
 SSE_HELPER_V(helper_blendvpd, Q, 2, FBLENDVPD)
 
-void glue(helper_ptest, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_ptest, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     uint64_t zf = (s->Q(0) &  d->Q(0)) | (s->Q(1) &  d->Q(1));
     uint64_t cf = (s->Q(0) & ~d->Q(0)) | (s->Q(1) & ~d->Q(1));
@@ -1605,7 +1612,7 @@  void glue(helper_ptest, SUFFIX)(Reg *d, Reg *s)
 }
 
 #define SSE_HELPER_F(name, elem, num, F)        \
-    void glue(name, SUFFIX)(Reg *d, Reg *s)     \
+    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)     \
     {                                           \
         d->elem(0) = F(0);                      \
         d->elem(1) = F(1);                      \
@@ -1634,7 +1641,7 @@  SSE_HELPER_F(helper_pmovzxwd, L, 4, s->W)
 SSE_HELPER_F(helper_pmovzxwq, Q, 2, s->W)
 SSE_HELPER_F(helper_pmovzxdq, Q, 2, s->L)
 
-void glue(helper_pmuldq, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_pmuldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     d->Q(0) = (int64_t)(int32_t) d->L(0) * (int32_t) s->L(0);
     d->Q(1) = (int64_t)(int32_t) d->L(2) * (int32_t) s->L(2);
@@ -1643,7 +1650,7 @@  void glue(helper_pmuldq, SUFFIX)(Reg *d, Reg *s)
 #define FCMPEQQ(d, s) (d == s ? -1 : 0)
 SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ)
 
-void glue(helper_packusdw, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     d->W(0) = satuw((int32_t) d->L(0));
     d->W(1) = satuw((int32_t) d->L(1));
@@ -1671,7 +1678,7 @@  SSE_HELPER_L(helper_pmaxud, MAX)
 #define FMULLD(d, s) ((int32_t)d * (int32_t)s)
 SSE_HELPER_L(helper_pmulld, FMULLD)
 
-void glue(helper_phminposuw, SUFFIX)(Reg *d, Reg *s)
+void glue(helper_phminposuw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     int idx = 0;
 
@@ -1703,7 +1710,8 @@  void glue(helper_phminposuw, SUFFIX)(Reg *d, Reg *s)
     d->W(0) = s->W(idx);
 }
 
-void glue(helper_roundps, SUFFIX)(Reg *d, Reg *s, uint32_t mode)
+void glue(helper_roundps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
+                                  uint32_t mode)
 {
     signed char prev_rounding_mode;
 
@@ -1740,7 +1748,8 @@  void glue(helper_roundps, SUFFIX)(Reg *d, Reg *s, uint32_t mode)
     env->sse_status.float_rounding_mode = prev_rounding_mode;
 }
 
-void glue(helper_roundpd, SUFFIX)(Reg *d, Reg *s, uint32_t mode)
+void glue(helper_roundpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
+                                  uint32_t mode)
 {
     signed char prev_rounding_mode;
 
@@ -1775,7 +1784,8 @@  void glue(helper_roundpd, SUFFIX)(Reg *d, Reg *s, uint32_t mode)
     env->sse_status.float_rounding_mode = prev_rounding_mode;
 }
 
-void glue(helper_roundss, SUFFIX)(Reg *d, Reg *s, uint32_t mode)
+void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
+                                  uint32_t mode)
 {
     signed char prev_rounding_mode;
 
@@ -1809,7 +1819,8 @@  void glue(helper_roundss, SUFFIX)(Reg *d, Reg *s, uint32_t mode)
     env->sse_status.float_rounding_mode = prev_rounding_mode;
 }
 
-void glue(helper_roundsd, SUFFIX)(Reg *d, Reg *s, uint32_t mode)
+void glue(helper_roundsd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
+                                  uint32_t mode)
 {
     signed char prev_rounding_mode;
 
@@ -1848,7 +1859,7 @@  SSE_HELPER_I(helper_blendps, L, 4, FBLENDP)
 SSE_HELPER_I(helper_blendpd, Q, 2, FBLENDP)
 SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP)
 
-void glue(helper_dpps, SUFFIX)(Reg *d, Reg *s, uint32_t mask)
+void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask)
 {
     float32 iresult = float32_zero;
 
@@ -1882,7 +1893,7 @@  void glue(helper_dpps, SUFFIX)(Reg *d, Reg *s, uint32_t mask)
     d->XMM_S(3) = (mask & (1 << 3)) ? iresult : float32_zero;
 }
 
-void glue(helper_dppd, SUFFIX)(Reg *d, Reg *s, uint32_t mask)
+void glue(helper_dppd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask)
 {
     float64 iresult = float64_zero;
 
@@ -1902,7 +1913,8 @@  void glue(helper_dppd, SUFFIX)(Reg *d, Reg *s, uint32_t mask)
     d->XMM_D(1) = (mask & (1 << 1)) ? iresult : float64_zero;
 }
 
-void glue(helper_mpsadbw, SUFFIX)(Reg *d, Reg *s, uint32_t offset)
+void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
+                                  uint32_t offset)
 {
     int s0 = (offset & 3) << 2;
     int d0 = (offset & 4) << 0;
@@ -1925,7 +1937,7 @@  void glue(helper_mpsadbw, SUFFIX)(Reg *d, Reg *s, uint32_t offset)
 #define FCMPGTQ(d, s) (d > s ? -1 : 0)
 SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ)
 
-static inline int pcmp_elen(int reg, uint32_t ctrl)
+static inline int pcmp_elen(CPUX86State *env, int reg, uint32_t ctrl)
 {
     int val;
 
@@ -1980,7 +1992,7 @@  static inline int pcmp_val(Reg *r, uint8_t ctrl, int i)
     }
 }
 
-static inline unsigned pcmpxstrx(Reg *d, Reg *s,
+static inline unsigned pcmpxstrx(CPUX86State *env, Reg *d, Reg *s,
                                  int8_t ctrl, int valids, int validd)
 {
     unsigned int res = 0;
@@ -2080,11 +2092,12 @@  static inline int ffs1(unsigned int val)
     return ret;
 }
 
-void glue(helper_pcmpestri, SUFFIX)(Reg *d, Reg *s, uint32_t ctrl)
+void glue(helper_pcmpestri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
+                                    uint32_t ctrl)
 {
-    unsigned int res = pcmpxstrx(d, s, ctrl,
-                                 pcmp_elen(R_EDX, ctrl),
-                                 pcmp_elen(R_EAX, ctrl));
+    unsigned int res = pcmpxstrx(env, d, s, ctrl,
+                                 pcmp_elen(env, R_EDX, ctrl),
+                                 pcmp_elen(env, R_EAX, ctrl));
 
     if (res) {
         env->regs[R_ECX] = ((ctrl & (1 << 6)) ? rffs1 : ffs1)(res) - 1;
@@ -2093,12 +2106,13 @@  void glue(helper_pcmpestri, SUFFIX)(Reg *d, Reg *s, uint32_t ctrl)
     }
 }
 
-void glue(helper_pcmpestrm, SUFFIX)(Reg *d, Reg *s, uint32_t ctrl)
+void glue(helper_pcmpestrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
+                                    uint32_t ctrl)
 {
     int i;
-    unsigned int res = pcmpxstrx(d, s, ctrl,
-                                 pcmp_elen(R_EDX, ctrl),
-                                 pcmp_elen(R_EAX, ctrl));
+    unsigned int res = pcmpxstrx(env, d, s, ctrl,
+                                 pcmp_elen(env, R_EDX, ctrl),
+                                 pcmp_elen(env, R_EAX, ctrl));
 
     if ((ctrl >> 6) & 1) {
         if (ctrl & 1) {
@@ -2116,9 +2130,10 @@  void glue(helper_pcmpestrm, SUFFIX)(Reg *d, Reg *s, uint32_t ctrl)
     }
 }
 
-void glue(helper_pcmpistri, SUFFIX)(Reg *d, Reg *s, uint32_t ctrl)
+void glue(helper_pcmpistri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
+                                    uint32_t ctrl)
 {
-    unsigned int res = pcmpxstrx(d, s, ctrl,
+    unsigned int res = pcmpxstrx(env, d, s, ctrl,
                                  pcmp_ilen(s, ctrl),
                                  pcmp_ilen(d, ctrl));
 
@@ -2129,10 +2144,11 @@  void glue(helper_pcmpistri, SUFFIX)(Reg *d, Reg *s, uint32_t ctrl)
     }
 }
 
-void glue(helper_pcmpistrm, SUFFIX)(Reg *d, Reg *s, uint32_t ctrl)
+void glue(helper_pcmpistrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
+                                    uint32_t ctrl)
 {
     int i;
-    unsigned int res = pcmpxstrx(d, s, ctrl,
+    unsigned int res = pcmpxstrx(env, d, s, ctrl,
                                  pcmp_ilen(s, ctrl),
                                  pcmp_ilen(d, ctrl));
 
@@ -2168,7 +2184,7 @@  target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len)
 
 #define POPMASK(i)     ((target_ulong) -1 / ((1LL << (1 << i)) + 1))
 #define POPCOUNT(n, i) ((n & POPMASK(i)) + ((n >> (1 << i)) & POPMASK(i)))
-target_ulong helper_popcnt(target_ulong n, uint32_t type)
+target_ulong helper_popcnt(CPUX86State *env, target_ulong n, uint32_t type)
 {
     CC_SRC = n ? 0 : CC_Z;
 
diff --git a/target-i386/ops_sse_header.h b/target-i386/ops_sse_header.h
index 8d4b2b7..401eac6 100644
--- a/target-i386/ops_sse_header.h
+++ b/target-i386/ops_sse_header.h
@@ -34,31 +34,31 @@ 
 #define dh_is_signed_XMMReg dh_is_signed_ptr
 #define dh_is_signed_MMXReg dh_is_signed_ptr
 
-DEF_HELPER_2(glue(psrlw, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(psraw, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(psllw, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(psrld, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(psrad, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pslld, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(psrlq, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(psllq, SUFFIX), void, Reg, Reg)
+DEF_HELPER_3(glue(psrlw, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(psraw, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(psllw, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(psrld, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(psrad, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pslld, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(psrlq, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(psllq, SUFFIX), void, env, Reg, Reg)
 
 #if SHIFT == 1
-DEF_HELPER_2(glue(psrldq, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pslldq, SUFFIX), void, Reg, Reg)
+DEF_HELPER_3(glue(psrldq, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pslldq, SUFFIX), void, env, Reg, Reg)
 #endif
 
 #define SSE_HELPER_B(name, F)\
-    DEF_HELPER_2(glue(name, SUFFIX), void, Reg, Reg)
+    DEF_HELPER_3(glue(name, SUFFIX), void, env, Reg, Reg)
 
 #define SSE_HELPER_W(name, F)\
-    DEF_HELPER_2(glue(name, SUFFIX), void, Reg, Reg)
+    DEF_HELPER_3(glue(name, SUFFIX), void, env, Reg, Reg)
 
 #define SSE_HELPER_L(name, F)\
-    DEF_HELPER_2(glue(name, SUFFIX), void, Reg, Reg)
+    DEF_HELPER_3(glue(name, SUFFIX), void, env, Reg, Reg)
 
 #define SSE_HELPER_Q(name, F)\
-    DEF_HELPER_2(glue(name, SUFFIX), void, Reg, Reg)
+    DEF_HELPER_3(glue(name, SUFFIX), void, env, Reg, Reg)
 
 SSE_HELPER_B(paddb, FADD)
 SSE_HELPER_W(paddw, FADD)
@@ -109,11 +109,11 @@  SSE_HELPER_W(pmulhw, FMULHW)
 SSE_HELPER_B(pavgb, FAVG)
 SSE_HELPER_W(pavgw, FAVG)
 
-DEF_HELPER_2(glue(pmuludq, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pmaddwd, SUFFIX), void, Reg, Reg)
+DEF_HELPER_3(glue(pmuludq, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmaddwd, SUFFIX), void, env, Reg, Reg)
 
-DEF_HELPER_2(glue(psadbw, SUFFIX), void, Reg, Reg)
-DEF_HELPER_3(glue(maskmov, SUFFIX), void, Reg, Reg, tl)
+DEF_HELPER_3(glue(psadbw, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_4(glue(maskmov, SUFFIX), void, env, Reg, Reg, tl)
 DEF_HELPER_2(glue(movl_mm_T0, SUFFIX), void, Reg, i32)
 #ifdef TARGET_X86_64
 DEF_HELPER_2(glue(movq_mm_T0, SUFFIX), void, Reg, i64)
@@ -133,11 +133,11 @@  DEF_HELPER_3(glue(pshufhw, SUFFIX), void, Reg, Reg, int)
 /* FPU ops */
 /* XXX: not accurate */
 
-#define SSE_HELPER_S(name, F)\
-    DEF_HELPER_2(name ## ps , void, Reg, Reg)        \
-    DEF_HELPER_2(name ## ss , void, Reg, Reg)        \
-    DEF_HELPER_2(name ## pd , void, Reg, Reg)        \
-    DEF_HELPER_2(name ## sd , void, Reg, Reg)
+#define SSE_HELPER_S(name, F)                            \
+    DEF_HELPER_3(name ## ps, void, env, Reg, Reg)        \
+    DEF_HELPER_3(name ## ss, void, env, Reg, Reg)        \
+    DEF_HELPER_3(name ## pd, void, env, Reg, Reg)        \
+    DEF_HELPER_3(name ## sd, void, env, Reg, Reg)
 
 SSE_HELPER_S(add, FPU_ADD)
 SSE_HELPER_S(sub, FPU_SUB)
@@ -148,64 +148,64 @@  SSE_HELPER_S(max, FPU_MAX)
 SSE_HELPER_S(sqrt, FPU_SQRT)
 
 
-DEF_HELPER_2(cvtps2pd, void, Reg, Reg)
-DEF_HELPER_2(cvtpd2ps, void, Reg, Reg)
-DEF_HELPER_2(cvtss2sd, void, Reg, Reg)
-DEF_HELPER_2(cvtsd2ss, void, Reg, Reg)
-DEF_HELPER_2(cvtdq2ps, void, Reg, Reg)
-DEF_HELPER_2(cvtdq2pd, void, Reg, Reg)
-DEF_HELPER_2(cvtpi2ps, void, XMMReg, MMXReg)
-DEF_HELPER_2(cvtpi2pd, void, XMMReg, MMXReg)
-DEF_HELPER_2(cvtsi2ss, void, XMMReg, i32)
-DEF_HELPER_2(cvtsi2sd, void, XMMReg, i32)
+DEF_HELPER_3(cvtps2pd, void, env, Reg, Reg)
+DEF_HELPER_3(cvtpd2ps, void, env, Reg, Reg)
+DEF_HELPER_3(cvtss2sd, void, env, Reg, Reg)
+DEF_HELPER_3(cvtsd2ss, void, env, Reg, Reg)
+DEF_HELPER_3(cvtdq2ps, void, env, Reg, Reg)
+DEF_HELPER_3(cvtdq2pd, void, env, Reg, Reg)
+DEF_HELPER_3(cvtpi2ps, void, env, XMMReg, MMXReg)
+DEF_HELPER_3(cvtpi2pd, void, env, XMMReg, MMXReg)
+DEF_HELPER_3(cvtsi2ss, void, env, XMMReg, i32)
+DEF_HELPER_3(cvtsi2sd, void, env, XMMReg, i32)
 
 #ifdef TARGET_X86_64
-DEF_HELPER_2(cvtsq2ss, void, XMMReg, i64)
-DEF_HELPER_2(cvtsq2sd, void, XMMReg, i64)
+DEF_HELPER_3(cvtsq2ss, void, env, XMMReg, i64)
+DEF_HELPER_3(cvtsq2sd, void, env, XMMReg, i64)
 #endif
 
-DEF_HELPER_2(cvtps2dq, void, XMMReg, XMMReg)
-DEF_HELPER_2(cvtpd2dq, void, XMMReg, XMMReg)
-DEF_HELPER_2(cvtps2pi, void, MMXReg, XMMReg)
-DEF_HELPER_2(cvtpd2pi, void, MMXReg, XMMReg)
-DEF_HELPER_1(cvtss2si, s32, XMMReg)
-DEF_HELPER_1(cvtsd2si, s32, XMMReg)
+DEF_HELPER_3(cvtps2dq, void, env, XMMReg, XMMReg)
+DEF_HELPER_3(cvtpd2dq, void, env, XMMReg, XMMReg)
+DEF_HELPER_3(cvtps2pi, void, env, MMXReg, XMMReg)
+DEF_HELPER_3(cvtpd2pi, void, env, MMXReg, XMMReg)
+DEF_HELPER_2(cvtss2si, s32, env, XMMReg)
+DEF_HELPER_2(cvtsd2si, s32, env, XMMReg)
 #ifdef TARGET_X86_64
-DEF_HELPER_1(cvtss2sq, s64, XMMReg)
-DEF_HELPER_1(cvtsd2sq, s64, XMMReg)
+DEF_HELPER_2(cvtss2sq, s64, env, XMMReg)
+DEF_HELPER_2(cvtsd2sq, s64, env, XMMReg)
 #endif
 
-DEF_HELPER_2(cvttps2dq, void, XMMReg, XMMReg)
-DEF_HELPER_2(cvttpd2dq, void, XMMReg, XMMReg)
-DEF_HELPER_2(cvttps2pi, void, MMXReg, XMMReg)
-DEF_HELPER_2(cvttpd2pi, void, MMXReg, XMMReg)
-DEF_HELPER_1(cvttss2si, s32, XMMReg)
-DEF_HELPER_1(cvttsd2si, s32, XMMReg)
+DEF_HELPER_3(cvttps2dq, void, env, XMMReg, XMMReg)
+DEF_HELPER_3(cvttpd2dq, void, env, XMMReg, XMMReg)
+DEF_HELPER_3(cvttps2pi, void, env, MMXReg, XMMReg)
+DEF_HELPER_3(cvttpd2pi, void, env, MMXReg, XMMReg)
+DEF_HELPER_2(cvttss2si, s32, env, XMMReg)
+DEF_HELPER_2(cvttsd2si, s32, env, XMMReg)
 #ifdef TARGET_X86_64
-DEF_HELPER_1(cvttss2sq, s64, XMMReg)
-DEF_HELPER_1(cvttsd2sq, s64, XMMReg)
+DEF_HELPER_2(cvttss2sq, s64, env, XMMReg)
+DEF_HELPER_2(cvttsd2sq, s64, env, XMMReg)
 #endif
 
-DEF_HELPER_2(rsqrtps, void, XMMReg, XMMReg)
-DEF_HELPER_2(rsqrtss, void, XMMReg, XMMReg)
-DEF_HELPER_2(rcpps, void, XMMReg, XMMReg)
-DEF_HELPER_2(rcpss, void, XMMReg, XMMReg)
-DEF_HELPER_2(extrq_r, void, XMMReg, XMMReg)
-DEF_HELPER_3(extrq_i, void, XMMReg, int, int)
-DEF_HELPER_2(insertq_r, void, XMMReg, XMMReg)
-DEF_HELPER_3(insertq_i, void, XMMReg, int, int)
-DEF_HELPER_2(haddps, void, XMMReg, XMMReg)
-DEF_HELPER_2(haddpd, void, XMMReg, XMMReg)
-DEF_HELPER_2(hsubps, void, XMMReg, XMMReg)
-DEF_HELPER_2(hsubpd, void, XMMReg, XMMReg)
-DEF_HELPER_2(addsubps, void, XMMReg, XMMReg)
-DEF_HELPER_2(addsubpd, void, XMMReg, XMMReg)
-
-#define SSE_HELPER_CMP(name, F)\
-    DEF_HELPER_2( name ## ps , void, Reg, Reg)        \
-    DEF_HELPER_2( name ## ss , void, Reg, Reg)        \
-    DEF_HELPER_2( name ## pd , void, Reg, Reg)        \
-    DEF_HELPER_2( name ## sd , void, Reg, Reg)
+DEF_HELPER_3(rsqrtps, void, env, XMMReg, XMMReg)
+DEF_HELPER_3(rsqrtss, void, env, XMMReg, XMMReg)
+DEF_HELPER_3(rcpps, void, env, XMMReg, XMMReg)
+DEF_HELPER_3(rcpss, void, env, XMMReg, XMMReg)
+DEF_HELPER_3(extrq_r, void, env, XMMReg, XMMReg)
+DEF_HELPER_4(extrq_i, void, env, XMMReg, int, int)
+DEF_HELPER_3(insertq_r, void, env, XMMReg, XMMReg)
+DEF_HELPER_4(insertq_i, void, env, XMMReg, int, int)
+DEF_HELPER_3(haddps, void, env, XMMReg, XMMReg)
+DEF_HELPER_3(haddpd, void, env, XMMReg, XMMReg)
+DEF_HELPER_3(hsubps, void, env, XMMReg, XMMReg)
+DEF_HELPER_3(hsubpd, void, env, XMMReg, XMMReg)
+DEF_HELPER_3(addsubps, void, env, XMMReg, XMMReg)
+DEF_HELPER_3(addsubpd, void, env, XMMReg, XMMReg)
+
+#define SSE_HELPER_CMP(name, F)                           \
+    DEF_HELPER_3(name ## ps, void, env, Reg, Reg)         \
+    DEF_HELPER_3(name ## ss, void, env, Reg, Reg)         \
+    DEF_HELPER_3(name ## pd, void, env, Reg, Reg)         \
+    DEF_HELPER_3(name ## sd, void, env, Reg, Reg)
 
 SSE_HELPER_CMP(cmpeq, FPU_CMPEQ)
 SSE_HELPER_CMP(cmplt, FPU_CMPLT)
@@ -216,124 +216,124 @@  SSE_HELPER_CMP(cmpnlt, FPU_CMPNLT)
 SSE_HELPER_CMP(cmpnle, FPU_CMPNLE)
 SSE_HELPER_CMP(cmpord, FPU_CMPORD)
 
-DEF_HELPER_2(ucomiss, void, Reg, Reg)
-DEF_HELPER_2(comiss, void, Reg, Reg)
-DEF_HELPER_2(ucomisd, void, Reg, Reg)
-DEF_HELPER_2(comisd, void, Reg, Reg)
-DEF_HELPER_1(movmskps, i32, Reg)
-DEF_HELPER_1(movmskpd, i32, Reg)
+DEF_HELPER_3(ucomiss, void, env, Reg, Reg)
+DEF_HELPER_3(comiss, void, env, Reg, Reg)
+DEF_HELPER_3(ucomisd, void, env, Reg, Reg)
+DEF_HELPER_3(comisd, void, env, Reg, Reg)
+DEF_HELPER_2(movmskps, i32, env, Reg)
+DEF_HELPER_2(movmskpd, i32, env, Reg)
 #endif
 
-DEF_HELPER_1(glue(pmovmskb, SUFFIX), i32, Reg)
-DEF_HELPER_2(glue(packsswb, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(packuswb, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(packssdw, SUFFIX), void, Reg, Reg)
-#define UNPCK_OP(base_name, base)                               \
-    DEF_HELPER_2(glue(punpck ## base_name ## bw, SUFFIX) , void, Reg, Reg) \
-    DEF_HELPER_2(glue(punpck ## base_name ## wd, SUFFIX) , void, Reg, Reg) \
-    DEF_HELPER_2(glue(punpck ## base_name ## dq, SUFFIX) , void, Reg, Reg)
+DEF_HELPER_2(glue(pmovmskb, SUFFIX), i32, env, Reg)
+DEF_HELPER_3(glue(packsswb, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(packuswb, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(packssdw, SUFFIX), void, env, Reg, Reg)
+#define UNPCK_OP(base_name, base)                                       \
+    DEF_HELPER_3(glue(punpck ## base_name ## bw, SUFFIX), void, env, Reg, Reg) \
+    DEF_HELPER_3(glue(punpck ## base_name ## wd, SUFFIX), void, env, Reg, Reg) \
+    DEF_HELPER_3(glue(punpck ## base_name ## dq, SUFFIX), void, env, Reg, Reg)
 
 UNPCK_OP(l, 0)
 UNPCK_OP(h, 1)
 
 #if SHIFT == 1
-DEF_HELPER_2(glue(punpcklqdq, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(punpckhqdq, SUFFIX), void, Reg, Reg)
+DEF_HELPER_3(glue(punpcklqdq, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(punpckhqdq, SUFFIX), void, env, Reg, Reg)
 #endif
 
 /* 3DNow! float ops */
 #if SHIFT == 0
-DEF_HELPER_2(pi2fd, void, MMXReg, MMXReg)
-DEF_HELPER_2(pi2fw, void, MMXReg, MMXReg)
-DEF_HELPER_2(pf2id, void, MMXReg, MMXReg)
-DEF_HELPER_2(pf2iw, void, MMXReg, MMXReg)
-DEF_HELPER_2(pfacc, void, MMXReg, MMXReg)
-DEF_HELPER_2(pfadd, void, MMXReg, MMXReg)
-DEF_HELPER_2(pfcmpeq, void, MMXReg, MMXReg)
-DEF_HELPER_2(pfcmpge, void, MMXReg, MMXReg)
-DEF_HELPER_2(pfcmpgt, void, MMXReg, MMXReg)
-DEF_HELPER_2(pfmax, void, MMXReg, MMXReg)
-DEF_HELPER_2(pfmin, void, MMXReg, MMXReg)
-DEF_HELPER_2(pfmul, void, MMXReg, MMXReg)
-DEF_HELPER_2(pfnacc, void, MMXReg, MMXReg)
-DEF_HELPER_2(pfpnacc, void, MMXReg, MMXReg)
-DEF_HELPER_2(pfrcp, void, MMXReg, MMXReg)
-DEF_HELPER_2(pfrsqrt, void, MMXReg, MMXReg)
-DEF_HELPER_2(pfsub, void, MMXReg, MMXReg)
-DEF_HELPER_2(pfsubr, void, MMXReg, MMXReg)
-DEF_HELPER_2(pswapd, void, MMXReg, MMXReg)
+DEF_HELPER_3(pi2fd, void, env, MMXReg, MMXReg)
+DEF_HELPER_3(pi2fw, void, env, MMXReg, MMXReg)
+DEF_HELPER_3(pf2id, void, env, MMXReg, MMXReg)
+DEF_HELPER_3(pf2iw, void, env, MMXReg, MMXReg)
+DEF_HELPER_3(pfacc, void, env, MMXReg, MMXReg)
+DEF_HELPER_3(pfadd, void, env, MMXReg, MMXReg)
+DEF_HELPER_3(pfcmpeq, void, env, MMXReg, MMXReg)
+DEF_HELPER_3(pfcmpge, void, env, MMXReg, MMXReg)
+DEF_HELPER_3(pfcmpgt, void, env, MMXReg, MMXReg)
+DEF_HELPER_3(pfmax, void, env, MMXReg, MMXReg)
+DEF_HELPER_3(pfmin, void, env, MMXReg, MMXReg)
+DEF_HELPER_3(pfmul, void, env, MMXReg, MMXReg)
+DEF_HELPER_3(pfnacc, void, env, MMXReg, MMXReg)
+DEF_HELPER_3(pfpnacc, void, env, MMXReg, MMXReg)
+DEF_HELPER_3(pfrcp, void, env, MMXReg, MMXReg)
+DEF_HELPER_3(pfrsqrt, void, env, MMXReg, MMXReg)
+DEF_HELPER_3(pfsub, void, env, MMXReg, MMXReg)
+DEF_HELPER_3(pfsubr, void, env, MMXReg, MMXReg)
+DEF_HELPER_3(pswapd, void, env, MMXReg, MMXReg)
 #endif
 
 /* SSSE3 op helpers */
-DEF_HELPER_2(glue(phaddw, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(phaddd, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(phaddsw, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(phsubw, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(phsubd, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(phsubsw, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pabsb, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pabsw, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pabsd, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pmaddubsw, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pmulhrsw, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pshufb, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(psignb, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(psignw, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(psignd, SUFFIX), void, Reg, Reg)
-DEF_HELPER_3(glue(palignr, SUFFIX), void, Reg, Reg, s32)
+DEF_HELPER_3(glue(phaddw, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(phaddd, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(phaddsw, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(phsubw, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(phsubd, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(phsubsw, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pabsb, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pabsw, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pabsd, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmaddubsw, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmulhrsw, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pshufb, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(psignb, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(psignw, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(psignd, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_4(glue(palignr, SUFFIX), void, env, Reg, Reg, s32)
 
 /* SSE4.1 op helpers */
 #if SHIFT == 1
-DEF_HELPER_2(glue(pblendvb, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(blendvps, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(blendvpd, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(ptest, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pmovsxbw, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pmovsxbd, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pmovsxbq, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pmovsxwd, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pmovsxwq, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pmovsxdq, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pmovzxbw, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pmovzxbd, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pmovzxbq, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pmovzxwd, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pmovzxwq, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pmovzxdq, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pmuldq, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pcmpeqq, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(packusdw, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pminsb, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pminsd, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pminuw, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pminud, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pmaxsb, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pmaxsd, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pmaxuw, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pmaxud, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(pmulld, SUFFIX), void, Reg, Reg)
-DEF_HELPER_2(glue(phminposuw, SUFFIX), void, Reg, Reg)
-DEF_HELPER_3(glue(roundps, SUFFIX), void, Reg, Reg, i32)
-DEF_HELPER_3(glue(roundpd, SUFFIX), void, Reg, Reg, i32)
-DEF_HELPER_3(glue(roundss, SUFFIX), void, Reg, Reg, i32)
-DEF_HELPER_3(glue(roundsd, SUFFIX), void, Reg, Reg, i32)
-DEF_HELPER_3(glue(blendps, SUFFIX), void, Reg, Reg, i32)
-DEF_HELPER_3(glue(blendpd, SUFFIX), void, Reg, Reg, i32)
-DEF_HELPER_3(glue(pblendw, SUFFIX), void, Reg, Reg, i32)
-DEF_HELPER_3(glue(dpps, SUFFIX), void, Reg, Reg, i32)
-DEF_HELPER_3(glue(dppd, SUFFIX), void, Reg, Reg, i32)
-DEF_HELPER_3(glue(mpsadbw, SUFFIX), void, Reg, Reg, i32)
+DEF_HELPER_3(glue(pblendvb, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(blendvps, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(blendvpd, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(ptest, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmovsxbw, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmovsxbd, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmovsxbq, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmovsxwd, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmovsxwq, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmovsxdq, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmovzxbw, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmovzxbd, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmovzxbq, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmovzxwd, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmovzxwq, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmovzxdq, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmuldq, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pcmpeqq, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(packusdw, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pminsb, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pminsd, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pminuw, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pminud, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmaxsb, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmaxsd, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmaxuw, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmaxud, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmulld, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(phminposuw, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_4(glue(roundps, SUFFIX), void, env, Reg, Reg, i32)
+DEF_HELPER_4(glue(roundpd, SUFFIX), void, env, Reg, Reg, i32)
+DEF_HELPER_4(glue(roundss, SUFFIX), void, env, Reg, Reg, i32)
+DEF_HELPER_4(glue(roundsd, SUFFIX), void, env, Reg, Reg, i32)
+DEF_HELPER_4(glue(blendps, SUFFIX), void, env, Reg, Reg, i32)
+DEF_HELPER_4(glue(blendpd, SUFFIX), void, env, Reg, Reg, i32)
+DEF_HELPER_4(glue(pblendw, SUFFIX), void, env, Reg, Reg, i32)
+DEF_HELPER_4(glue(dpps, SUFFIX), void, env, Reg, Reg, i32)
+DEF_HELPER_4(glue(dppd, SUFFIX), void, env, Reg, Reg, i32)
+DEF_HELPER_4(glue(mpsadbw, SUFFIX), void, env, Reg, Reg, i32)
 #endif
 
 /* SSE4.2 op helpers */
 #if SHIFT == 1
-DEF_HELPER_2(glue(pcmpgtq, SUFFIX), void, Reg, Reg)
-DEF_HELPER_3(glue(pcmpestri, SUFFIX), void, Reg, Reg, i32)
-DEF_HELPER_3(glue(pcmpestrm, SUFFIX), void, Reg, Reg, i32)
-DEF_HELPER_3(glue(pcmpistri, SUFFIX), void, Reg, Reg, i32)
-DEF_HELPER_3(glue(pcmpistrm, SUFFIX), void, Reg, Reg, i32)
+DEF_HELPER_3(glue(pcmpgtq, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_4(glue(pcmpestri, SUFFIX), void, env, Reg, Reg, i32)
+DEF_HELPER_4(glue(pcmpestrm, SUFFIX), void, env, Reg, Reg, i32)
+DEF_HELPER_4(glue(pcmpistri, SUFFIX), void, env, Reg, Reg, i32)
+DEF_HELPER_4(glue(pcmpistrm, SUFFIX), void, env, Reg, Reg, i32)
 DEF_HELPER_3(crc32, tl, i32, tl, i32)
-DEF_HELPER_2(popcnt, tl, tl, i32)
+DEF_HELPER_3(popcnt, tl, env, tl, i32)
 #endif
 
 #undef SHIFT
diff --git a/target-i386/translate.c b/target-i386/translate.c
index 2b11333..5e9da9d 100644
--- a/target-i386/translate.c
+++ b/target-i386/translate.c
@@ -1266,14 +1266,30 @@  GEN_REPZ2(cmps)
 static void gen_helper_fp_arith_ST0_FT0(int op)
 {
     switch (op) {
-    case 0: gen_helper_fadd_ST0_FT0(); break;
-    case 1: gen_helper_fmul_ST0_FT0(); break;
-    case 2: gen_helper_fcom_ST0_FT0(); break;
-    case 3: gen_helper_fcom_ST0_FT0(); break;
-    case 4: gen_helper_fsub_ST0_FT0(); break;
-    case 5: gen_helper_fsubr_ST0_FT0(); break;
-    case 6: gen_helper_fdiv_ST0_FT0(); break;
-    case 7: gen_helper_fdivr_ST0_FT0(); break;
+    case 0:
+        gen_helper_fadd_ST0_FT0(cpu_env);
+        break;
+    case 1:
+        gen_helper_fmul_ST0_FT0(cpu_env);
+        break;
+    case 2:
+        gen_helper_fcom_ST0_FT0(cpu_env);
+        break;
+    case 3:
+        gen_helper_fcom_ST0_FT0(cpu_env);
+        break;
+    case 4:
+        gen_helper_fsub_ST0_FT0(cpu_env);
+        break;
+    case 5:
+        gen_helper_fsubr_ST0_FT0(cpu_env);
+        break;
+    case 6:
+        gen_helper_fdiv_ST0_FT0(cpu_env);
+        break;
+    case 7:
+        gen_helper_fdivr_ST0_FT0(cpu_env);
+        break;
     }
 }
 
@@ -1282,12 +1298,24 @@  static void gen_helper_fp_arith_STN_ST0(int op, int opreg)
 {
     TCGv_i32 tmp = tcg_const_i32(opreg);
     switch (op) {
-    case 0: gen_helper_fadd_STN_ST0(tmp); break;
-    case 1: gen_helper_fmul_STN_ST0(tmp); break;
-    case 4: gen_helper_fsubr_STN_ST0(tmp); break;
-    case 5: gen_helper_fsub_STN_ST0(tmp); break;
-    case 6: gen_helper_fdivr_STN_ST0(tmp); break;
-    case 7: gen_helper_fdiv_STN_ST0(tmp); break;
+    case 0:
+        gen_helper_fadd_STN_ST0(cpu_env, tmp);
+        break;
+    case 1:
+        gen_helper_fmul_STN_ST0(cpu_env, tmp);
+        break;
+    case 4:
+        gen_helper_fsubr_STN_ST0(cpu_env, tmp);
+        break;
+    case 5:
+        gen_helper_fsub_STN_ST0(cpu_env, tmp);
+        break;
+    case 6:
+        gen_helper_fdivr_STN_ST0(cpu_env, tmp);
+        break;
+    case 7:
+        gen_helper_fdiv_STN_ST0(cpu_env, tmp);
+        break;
     }
 }
 
@@ -2796,13 +2824,16 @@  static inline void gen_op_movq_env_0(int d_offset)
     tcg_gen_st_i64(cpu_tmp1_i64, cpu_env, d_offset);
 }
 
-typedef void (*SSEFunc_i_p)(TCGv_i32 val, TCGv_ptr reg);
-typedef void (*SSEFunc_l_p)(TCGv_i64 val, TCGv_ptr reg);
-typedef void (*SSEFunc_0_pi)(TCGv_ptr reg, TCGv_i32 val);
-typedef void (*SSEFunc_0_pl)(TCGv_ptr reg, TCGv_i64 val);
-typedef void (*SSEFunc_0_pp)(TCGv_ptr reg_a, TCGv_ptr reg_b);
+typedef void (*SSEFunc_i_ep)(TCGv_i32 val, TCGv_ptr env, TCGv_ptr reg);
+typedef void (*SSEFunc_l_ep)(TCGv_i64 val, TCGv_ptr env, TCGv_ptr reg);
+typedef void (*SSEFunc_0_epi)(TCGv_ptr env, TCGv_ptr reg, TCGv_i32 val);
+typedef void (*SSEFunc_0_epl)(TCGv_ptr env, TCGv_ptr reg, TCGv_i64 val);
+typedef void (*SSEFunc_0_epp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b);
+typedef void (*SSEFunc_0_eppi)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
+                               TCGv_i32 val);
 typedef void (*SSEFunc_0_ppi)(TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv_i32 val);
-typedef void (*SSEFunc_0_ppt)(TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv val);
+typedef void (*SSEFunc_0_eppt)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
+                               TCGv val);
 
 #define SSE_SPECIAL ((void *)1)
 #define SSE_DUMMY ((void *)2)
@@ -2811,7 +2842,7 @@  typedef void (*SSEFunc_0_ppt)(TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv val);
 #define SSE_FOP(x) { gen_helper_ ## x ## ps, gen_helper_ ## x ## pd, \
                      gen_helper_ ## x ## ss, gen_helper_ ## x ## sd, }
 
-static const SSEFunc_0_pp sse_op_table1[256][4] = {
+static const SSEFunc_0_epp sse_op_table1[256][4] = {
     /* 3DNow! extensions */
     [0x0e] = { SSE_DUMMY }, /* femms */
     [0x0f] = { SSE_DUMMY }, /* pf... */
@@ -2852,8 +2883,8 @@  static const SSEFunc_0_pp sse_op_table1[256][4] = {
     [0x5f] = SSE_FOP(max),
 
     [0xc2] = SSE_FOP(cmpeq),
-    [0xc6] = { (SSEFunc_0_pp)gen_helper_shufps,
-               (SSEFunc_0_pp)gen_helper_shufpd }, /* XXX: casts */
+    [0xc6] = { (SSEFunc_0_epp)gen_helper_shufps,
+               (SSEFunc_0_epp)gen_helper_shufpd }, /* XXX: casts */
 
     [0x38] = { SSE_SPECIAL, SSE_SPECIAL, NULL, SSE_SPECIAL }, /* SSSE3/SSE4 */
     [0x3a] = { SSE_SPECIAL, SSE_SPECIAL }, /* SSSE3/SSE4 */
@@ -2875,10 +2906,10 @@  static const SSEFunc_0_pp sse_op_table1[256][4] = {
     [0x6d] = { NULL, gen_helper_punpckhqdq_xmm },
     [0x6e] = { SSE_SPECIAL, SSE_SPECIAL }, /* movd mm, ea */
     [0x6f] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movq, movdqa, , movqdu */
-    [0x70] = { (SSEFunc_0_pp)gen_helper_pshufw_mmx,
-               (SSEFunc_0_pp)gen_helper_pshufd_xmm,
-               (SSEFunc_0_pp)gen_helper_pshufhw_xmm,
-               (SSEFunc_0_pp)gen_helper_pshuflw_xmm }, /* XXX: casts */
+    [0x70] = { (SSEFunc_0_epp)gen_helper_pshufw_mmx,
+               (SSEFunc_0_epp)gen_helper_pshufd_xmm,
+               (SSEFunc_0_epp)gen_helper_pshufhw_xmm,
+               (SSEFunc_0_epp)gen_helper_pshuflw_xmm }, /* XXX: casts */
     [0x71] = { SSE_SPECIAL, SSE_SPECIAL }, /* shiftw */
     [0x72] = { SSE_SPECIAL, SSE_SPECIAL }, /* shiftd */
     [0x73] = { SSE_SPECIAL, SSE_SPECIAL }, /* shiftq */
@@ -2933,8 +2964,8 @@  static const SSEFunc_0_pp sse_op_table1[256][4] = {
     [0xf4] = MMX_OP2(pmuludq),
     [0xf5] = MMX_OP2(pmaddwd),
     [0xf6] = MMX_OP2(psadbw),
-    [0xf7] = { (SSEFunc_0_pp)gen_helper_maskmov_mmx,
-               (SSEFunc_0_pp)gen_helper_maskmov_xmm }, /* XXX: casts */
+    [0xf7] = { (SSEFunc_0_epp)gen_helper_maskmov_mmx,
+               (SSEFunc_0_epp)gen_helper_maskmov_xmm }, /* XXX: casts */
     [0xf8] = MMX_OP2(psubb),
     [0xf9] = MMX_OP2(psubw),
     [0xfa] = MMX_OP2(psubl),
@@ -2944,7 +2975,7 @@  static const SSEFunc_0_pp sse_op_table1[256][4] = {
     [0xfe] = MMX_OP2(paddl),
 };
 
-static const SSEFunc_0_pp sse_op_table2[3 * 8][2] = {
+static const SSEFunc_0_epp sse_op_table2[3 * 8][2] = {
     [0 + 2] = MMX_OP2(psrlw),
     [0 + 4] = MMX_OP2(psraw),
     [0 + 6] = MMX_OP2(psllw),
@@ -2957,19 +2988,19 @@  static const SSEFunc_0_pp sse_op_table2[3 * 8][2] = {
     [16 + 7] = { NULL, gen_helper_pslldq_xmm },
 };
 
-static const SSEFunc_0_pi sse_op_table3ai[] = {
+static const SSEFunc_0_epi sse_op_table3ai[] = {
     gen_helper_cvtsi2ss,
     gen_helper_cvtsi2sd
 };
 
 #ifdef TARGET_X86_64
-static const SSEFunc_0_pl sse_op_table3aq[] = {
+static const SSEFunc_0_epl sse_op_table3aq[] = {
     gen_helper_cvtsq2ss,
     gen_helper_cvtsq2sd
 };
 #endif
 
-static const SSEFunc_i_p sse_op_table3bi[] = {
+static const SSEFunc_i_ep sse_op_table3bi[] = {
     gen_helper_cvttss2si,
     gen_helper_cvtss2si,
     gen_helper_cvttsd2si,
@@ -2977,7 +3008,7 @@  static const SSEFunc_i_p sse_op_table3bi[] = {
 };
 
 #ifdef TARGET_X86_64
-static const SSEFunc_l_p sse_op_table3bq[] = {
+static const SSEFunc_l_ep sse_op_table3bq[] = {
     gen_helper_cvttss2sq,
     gen_helper_cvtss2sq,
     gen_helper_cvttsd2sq,
@@ -2985,7 +3016,7 @@  static const SSEFunc_l_p sse_op_table3bq[] = {
 };
 #endif
 
-static const SSEFunc_0_pp sse_op_table4[8][4] = {
+static const SSEFunc_0_epp sse_op_table4[8][4] = {
     SSE_FOP(cmpeq),
     SSE_FOP(cmplt),
     SSE_FOP(cmple),
@@ -2996,7 +3027,7 @@  static const SSEFunc_0_pp sse_op_table4[8][4] = {
     SSE_FOP(cmpord),
 };
 
-static const SSEFunc_0_pp sse_op_table5[256] = {
+static const SSEFunc_0_epp sse_op_table5[256] = {
     [0x0c] = gen_helper_pi2fw,
     [0x0d] = gen_helper_pi2fd,
     [0x1c] = gen_helper_pf2iw,
@@ -3023,13 +3054,13 @@  static const SSEFunc_0_pp sse_op_table5[256] = {
     [0xbf] = gen_helper_pavgb_mmx /* pavgusb */
 };
 
-struct SSEOpHelper_pp {
-    SSEFunc_0_pp op[2];
+struct SSEOpHelper_epp {
+    SSEFunc_0_epp op[2];
     uint32_t ext_mask;
 };
 
-struct SSEOpHelper_ppi {
-    SSEFunc_0_ppi op[2];
+struct SSEOpHelper_eppi {
+    SSEFunc_0_eppi op[2];
     uint32_t ext_mask;
 };
 
@@ -3038,7 +3069,7 @@  struct SSEOpHelper_ppi {
 #define SSE42_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_SSE42 }
 #define SSE41_SPECIAL { { NULL, SSE_SPECIAL }, CPUID_EXT_SSE41 }
 
-static const struct SSEOpHelper_pp sse_op_table6[256] = {
+static const struct SSEOpHelper_epp sse_op_table6[256] = {
     [0x00] = SSSE3_OP(pshufb),
     [0x01] = SSSE3_OP(phaddw),
     [0x02] = SSSE3_OP(phaddd),
@@ -3087,7 +3118,7 @@  static const struct SSEOpHelper_pp sse_op_table6[256] = {
     [0x41] = SSE41_OP(phminposuw),
 };
 
-static const struct SSEOpHelper_ppi sse_op_table7[256] = {
+static const struct SSEOpHelper_eppi sse_op_table7[256] = {
     [0x08] = SSE41_OP(roundps),
     [0x09] = SSE41_OP(roundpd),
     [0x0a] = SSE41_OP(roundss),
@@ -3116,9 +3147,10 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
 {
     int b1, op1_offset, op2_offset, is_xmm, val, ot;
     int modrm, mod, rm, reg, reg_addr, offset_addr;
-    SSEFunc_0_pp sse_fn_pp;
+    SSEFunc_0_epp sse_fn_epp;
+    SSEFunc_0_eppi sse_fn_eppi;
     SSEFunc_0_ppi sse_fn_ppi;
-    SSEFunc_0_ppt sse_fn_ppt;
+    SSEFunc_0_eppt sse_fn_eppt;
 
     b &= 0xff;
     if (s->prefix & PREFIX_DATA)
@@ -3129,8 +3161,8 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
         b1 = 3;
     else
         b1 = 0;
-    sse_fn_pp = sse_op_table1[b][b1];
-    if (!sse_fn_pp) {
+    sse_fn_epp = sse_op_table1[b][b1];
+    if (!sse_fn_epp) {
         goto illegal_op;
     }
     if ((b <= 0x5f && b >= 0x10) || b == 0xc6 || b == 0xc2) {
@@ -3160,18 +3192,18 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
         if (!(s->cpuid_ext2_features & CPUID_EXT2_3DNOW))
             goto illegal_op;
         /* femms */
-        gen_helper_emms();
+        gen_helper_emms(cpu_env);
         return;
     }
     if (b == 0x77) {
         /* emms */
-        gen_helper_emms();
+        gen_helper_emms(cpu_env);
         return;
     }
     /* prepare MMX state (XXX: optimize by storing fptt and fptags in
        the static cpu state) */
     if (!is_xmm) {
-        gen_helper_enter_mmx();
+        gen_helper_enter_mmx(cpu_env);
     }
 
     modrm = ldub_code(s->pc++);
@@ -3179,7 +3211,7 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
     if (is_xmm)
         reg |= rex_r;
     mod = (modrm >> 6) & 3;
-    if (sse_fn_pp == SSE_SPECIAL) {
+    if (sse_fn_epp == SSE_SPECIAL) {
         b |= (b1 << 8);
         switch(b) {
         case 0x0e7: /* movntq */
@@ -3383,11 +3415,13 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
                 tcg_gen_addi_ptr(cpu_ptr0, cpu_env,
                     offsetof(CPUX86State,xmm_regs[reg]));
                 if (b1 == 1)
-                    gen_helper_extrq_i(cpu_ptr0, tcg_const_i32(bit_index),
-                        tcg_const_i32(field_length));
+                    gen_helper_extrq_i(cpu_env, cpu_ptr0,
+                                       tcg_const_i32(bit_index),
+                                       tcg_const_i32(field_length));
                 else
-                    gen_helper_insertq_i(cpu_ptr0, tcg_const_i32(bit_index),
-                        tcg_const_i32(field_length));
+                    gen_helper_insertq_i(cpu_env, cpu_ptr0,
+                                         tcg_const_i32(bit_index),
+                                         tcg_const_i32(field_length));
             }
             break;
         case 0x7e: /* movd ea, mm */
@@ -3516,8 +3550,9 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
                 tcg_gen_st32_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,mmx_t0.MMX_L(1)));
                 op1_offset = offsetof(CPUX86State,mmx_t0);
             }
-            sse_fn_pp = sse_op_table2[((b - 1) & 3) * 8 + (((modrm >> 3)) & 7)][b1];
-            if (!sse_fn_pp) {
+            sse_fn_epp = sse_op_table2[((b - 1) & 3) * 8 +
+                                       (((modrm >> 3)) & 7)][b1];
+            if (!sse_fn_epp) {
                 goto illegal_op;
             }
             if (is_xmm) {
@@ -3529,13 +3564,13 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
             }
             tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op2_offset);
             tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op1_offset);
-            sse_fn_pp(cpu_ptr0, cpu_ptr1);
+            sse_fn_epp(cpu_env, cpu_ptr0, cpu_ptr1);
             break;
         case 0x050: /* movmskps */
             rm = (modrm & 7) | REX_B(s);
             tcg_gen_addi_ptr(cpu_ptr0, cpu_env, 
                              offsetof(CPUX86State,xmm_regs[rm]));
-            gen_helper_movmskps(cpu_tmp2_i32, cpu_ptr0);
+            gen_helper_movmskps(cpu_tmp2_i32, cpu_env, cpu_ptr0);
             tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
             gen_op_mov_reg_T0(OT_LONG, reg);
             break;
@@ -3543,13 +3578,13 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
             rm = (modrm & 7) | REX_B(s);
             tcg_gen_addi_ptr(cpu_ptr0, cpu_env, 
                              offsetof(CPUX86State,xmm_regs[rm]));
-            gen_helper_movmskpd(cpu_tmp2_i32, cpu_ptr0);
+            gen_helper_movmskpd(cpu_tmp2_i32, cpu_env, cpu_ptr0);
             tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
             gen_op_mov_reg_T0(OT_LONG, reg);
             break;
         case 0x02a: /* cvtpi2ps */
         case 0x12a: /* cvtpi2pd */
-            gen_helper_enter_mmx();
+            gen_helper_enter_mmx(cpu_env);
             if (mod != 3) {
                 gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
                 op2_offset = offsetof(CPUX86State,mmx_t0);
@@ -3563,11 +3598,11 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
             tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
             switch(b >> 8) {
             case 0x0:
-                gen_helper_cvtpi2ps(cpu_ptr0, cpu_ptr1);
+                gen_helper_cvtpi2ps(cpu_env, cpu_ptr0, cpu_ptr1);
                 break;
             default:
             case 0x1:
-                gen_helper_cvtpi2pd(cpu_ptr0, cpu_ptr1);
+                gen_helper_cvtpi2pd(cpu_env, cpu_ptr0, cpu_ptr1);
                 break;
             }
             break;
@@ -3578,13 +3613,13 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
             op1_offset = offsetof(CPUX86State,xmm_regs[reg]);
             tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset);
             if (ot == OT_LONG) {
-                SSEFunc_0_pi sse_fn_pi = sse_op_table3ai[(b >> 8) & 1];
+                SSEFunc_0_epi sse_fn_epi = sse_op_table3ai[(b >> 8) & 1];
                 tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
-                sse_fn_pi(cpu_ptr0, cpu_tmp2_i32);
+                sse_fn_epi(cpu_env, cpu_ptr0, cpu_tmp2_i32);
             } else {
 #ifdef TARGET_X86_64
-                SSEFunc_0_pl sse_fn_pl = sse_op_table3aq[(b >> 8) & 1];
-                sse_fn_pl(cpu_ptr0, cpu_T[0]);
+                SSEFunc_0_epl sse_fn_epl = sse_op_table3aq[(b >> 8) & 1];
+                sse_fn_epl(cpu_env, cpu_ptr0, cpu_T[0]);
 #else
                 goto illegal_op;
 #endif
@@ -3594,7 +3629,7 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
         case 0x12c: /* cvttpd2pi */
         case 0x02d: /* cvtps2pi */
         case 0x12d: /* cvtpd2pi */
-            gen_helper_enter_mmx();
+            gen_helper_enter_mmx(cpu_env);
             if (mod != 3) {
                 gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
                 op2_offset = offsetof(CPUX86State,xmm_t0);
@@ -3608,16 +3643,16 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
             tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
             switch(b) {
             case 0x02c:
-                gen_helper_cvttps2pi(cpu_ptr0, cpu_ptr1);
+                gen_helper_cvttps2pi(cpu_env, cpu_ptr0, cpu_ptr1);
                 break;
             case 0x12c:
-                gen_helper_cvttpd2pi(cpu_ptr0, cpu_ptr1);
+                gen_helper_cvttpd2pi(cpu_env, cpu_ptr0, cpu_ptr1);
                 break;
             case 0x02d:
-                gen_helper_cvtps2pi(cpu_ptr0, cpu_ptr1);
+                gen_helper_cvtps2pi(cpu_env, cpu_ptr0, cpu_ptr1);
                 break;
             case 0x12d:
-                gen_helper_cvtpd2pi(cpu_ptr0, cpu_ptr1);
+                gen_helper_cvtpd2pi(cpu_env, cpu_ptr0, cpu_ptr1);
                 break;
             }
             break;
@@ -3641,15 +3676,15 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
             }
             tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op2_offset);
             if (ot == OT_LONG) {
-                SSEFunc_i_p sse_fn_i_p =
+                SSEFunc_i_ep sse_fn_i_ep =
                     sse_op_table3bi[((b >> 7) & 2) | (b & 1)];
-                sse_fn_i_p(cpu_tmp2_i32, cpu_ptr0);
+                sse_fn_i_ep(cpu_tmp2_i32, cpu_env, cpu_ptr0);
                 tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
             } else {
 #ifdef TARGET_X86_64
-                SSEFunc_l_p sse_fn_l_p =
+                SSEFunc_l_ep sse_fn_l_ep =
                     sse_op_table3bq[((b >> 7) & 2) | (b & 1)];
-                sse_fn_l_p(cpu_T[0], cpu_ptr0);
+                sse_fn_l_ep(cpu_T[0], cpu_env, cpu_ptr0);
 #else
                 goto illegal_op;
 #endif
@@ -3703,14 +3738,14 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
             }
             break;
         case 0x2d6: /* movq2dq */
-            gen_helper_enter_mmx();
+            gen_helper_enter_mmx(cpu_env);
             rm = (modrm & 7);
             gen_op_movq(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)),
                         offsetof(CPUX86State,fpregs[rm].mmx));
             gen_op_movq_env_0(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(1)));
             break;
         case 0x3d6: /* movdq2q */
-            gen_helper_enter_mmx();
+            gen_helper_enter_mmx(cpu_env);
             rm = (modrm & 7) | REX_B(s);
             gen_op_movq(offsetof(CPUX86State,fpregs[reg & 7].mmx),
                         offsetof(CPUX86State,xmm_regs[rm].XMM_Q(0)));
@@ -3722,11 +3757,11 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
             if (b1) {
                 rm = (modrm & 7) | REX_B(s);
                 tcg_gen_addi_ptr(cpu_ptr0, cpu_env, offsetof(CPUX86State,xmm_regs[rm]));
-                gen_helper_pmovmskb_xmm(cpu_tmp2_i32, cpu_ptr0);
+                gen_helper_pmovmskb_xmm(cpu_tmp2_i32, cpu_env, cpu_ptr0);
             } else {
                 rm = (modrm & 7);
                 tcg_gen_addi_ptr(cpu_ptr0, cpu_env, offsetof(CPUX86State,fpregs[rm].mmx));
-                gen_helper_pmovmskb_mmx(cpu_tmp2_i32, cpu_ptr0);
+                gen_helper_pmovmskb_mmx(cpu_tmp2_i32, cpu_env, cpu_ptr0);
             }
             tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
             reg = ((modrm >> 3) & 7) | rex_r;
@@ -3745,8 +3780,8 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
                 goto illegal_op;
             }
 
-            sse_fn_pp = sse_op_table6[b].op[b1];
-            if (!sse_fn_pp) {
+            sse_fn_epp = sse_op_table6[b].op[b1];
+            if (!sse_fn_epp) {
                 goto illegal_op;
             }
             if (!(s->cpuid_ext_features & sse_op_table6[b].ext_mask))
@@ -3797,13 +3832,13 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
                     gen_ldq_env_A0(s->mem_index, op2_offset);
                 }
             }
-            if (sse_fn_pp == SSE_SPECIAL) {
+            if (sse_fn_epp == SSE_SPECIAL) {
                 goto illegal_op;
             }
 
             tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset);
             tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
-            sse_fn_pp(cpu_ptr0, cpu_ptr1);
+            sse_fn_epp(cpu_env, cpu_ptr0, cpu_ptr1);
 
             if (b == 0x17)
                 s->cc_op = CC_OP_EFLAGS;
@@ -3849,14 +3884,14 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
                 goto illegal_op;
             }
 
-            sse_fn_ppi = sse_op_table7[b].op[b1];
-            if (!sse_fn_ppi) {
+            sse_fn_eppi = sse_op_table7[b].op[b1];
+            if (!sse_fn_eppi) {
                 goto illegal_op;
             }
             if (!(s->cpuid_ext_features & sse_op_table7[b].ext_mask))
                 goto illegal_op;
 
-            if (sse_fn_ppi == SSE_SPECIAL) {
+            if (sse_fn_eppi == SSE_SPECIAL) {
                 ot = (s->dflag == 2) ? OT_QUAD : OT_LONG;
                 rm = (modrm & 7) | REX_B(s);
                 if (mod != 3)
@@ -4017,7 +4052,7 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
 
             tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset);
             tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
-            sse_fn_ppi(cpu_ptr0, cpu_ptr1, tcg_const_i32(val));
+            sse_fn_eppi(cpu_env, cpu_ptr0, cpu_ptr1, tcg_const_i32(val));
             break;
         default:
             goto illegal_op;
@@ -4072,13 +4107,13 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
             if (!(s->cpuid_ext2_features & CPUID_EXT2_3DNOW))
                 goto illegal_op;
             val = ldub_code(s->pc++);
-            sse_fn_pp = sse_op_table5[val];
-            if (!sse_fn_pp) {
+            sse_fn_epp = sse_op_table5[val];
+            if (!sse_fn_epp) {
                 goto illegal_op;
             }
             tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset);
             tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
-            sse_fn_pp(cpu_ptr0, cpu_ptr1);
+            sse_fn_epp(cpu_env, cpu_ptr0, cpu_ptr1);
             break;
         case 0x70: /* pshufx insn */
         case 0xc6: /* pshufx insn */
@@ -4086,7 +4121,7 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
             tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset);
             tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
             /* XXX: introduce a new table? */
-            sse_fn_ppi = (SSEFunc_0_ppi)sse_fn_pp;
+            sse_fn_ppi = (SSEFunc_0_ppi)sse_fn_epp;
             sse_fn_ppi(cpu_ptr0, cpu_ptr1, tcg_const_i32(val));
             break;
         case 0xc2:
@@ -4094,11 +4129,11 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
             val = ldub_code(s->pc++);
             if (val >= 8)
                 goto illegal_op;
-            sse_fn_pp = sse_op_table4[val][b1];
+            sse_fn_epp = sse_op_table4[val][b1];
 
             tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset);
             tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
-            sse_fn_pp(cpu_ptr0, cpu_ptr1);
+            sse_fn_epp(cpu_env, cpu_ptr0, cpu_ptr1);
             break;
         case 0xf7:
             /* maskmov : we must prepare A0 */
@@ -4119,13 +4154,13 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
             tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset);
             tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
             /* XXX: introduce a new table? */
-            sse_fn_ppt = (SSEFunc_0_ppt)sse_fn_pp;
-            sse_fn_ppt(cpu_ptr0, cpu_ptr1, cpu_A0);
+            sse_fn_eppt = (SSEFunc_0_eppt)sse_fn_epp;
+            sse_fn_eppt(cpu_env, cpu_ptr0, cpu_ptr1, cpu_A0);
             break;
         default:
             tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset);
             tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
-            sse_fn_pp(cpu_ptr0, cpu_ptr1);
+            sse_fn_epp(cpu_env, cpu_ptr0, cpu_ptr1);
             break;
         }
         if (b == 0x2e || b == 0x2f) {
@@ -5542,30 +5577,30 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
                     case 0:
                         gen_op_ld_T0_A0(OT_LONG + s->mem_index);
                         tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
-                        gen_helper_flds_FT0(cpu_tmp2_i32);
+                        gen_helper_flds_FT0(cpu_env, cpu_tmp2_i32);
                         break;
                     case 1:
                         gen_op_ld_T0_A0(OT_LONG + s->mem_index);
                         tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
-                        gen_helper_fildl_FT0(cpu_tmp2_i32);
+                        gen_helper_fildl_FT0(cpu_env, cpu_tmp2_i32);
                         break;
                     case 2:
                         tcg_gen_qemu_ld64(cpu_tmp1_i64, cpu_A0, 
                                           (s->mem_index >> 2) - 1);
-                        gen_helper_fldl_FT0(cpu_tmp1_i64);
+                        gen_helper_fldl_FT0(cpu_env, cpu_tmp1_i64);
                         break;
                     case 3:
                     default:
                         gen_op_lds_T0_A0(OT_WORD + s->mem_index);
                         tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
-                        gen_helper_fildl_FT0(cpu_tmp2_i32);
+                        gen_helper_fildl_FT0(cpu_env, cpu_tmp2_i32);
                         break;
                     }
 
                     gen_helper_fp_arith_ST0_FT0(op1);
                     if (op1 == 3) {
                         /* fcomp needs pop */
-                        gen_helper_fpop();
+                        gen_helper_fpop(cpu_env);
                     }
                 }
                 break;
@@ -5581,23 +5616,23 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
                     case 0:
                         gen_op_ld_T0_A0(OT_LONG + s->mem_index);
                         tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
-                        gen_helper_flds_ST0(cpu_tmp2_i32);
+                        gen_helper_flds_ST0(cpu_env, cpu_tmp2_i32);
                         break;
                     case 1:
                         gen_op_ld_T0_A0(OT_LONG + s->mem_index);
                         tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
-                        gen_helper_fildl_ST0(cpu_tmp2_i32);
+                        gen_helper_fildl_ST0(cpu_env, cpu_tmp2_i32);
                         break;
                     case 2:
                         tcg_gen_qemu_ld64(cpu_tmp1_i64, cpu_A0, 
                                           (s->mem_index >> 2) - 1);
-                        gen_helper_fldl_ST0(cpu_tmp1_i64);
+                        gen_helper_fldl_ST0(cpu_env, cpu_tmp1_i64);
                         break;
                     case 3:
                     default:
                         gen_op_lds_T0_A0(OT_WORD + s->mem_index);
                         tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
-                        gen_helper_fildl_ST0(cpu_tmp2_i32);
+                        gen_helper_fildl_ST0(cpu_env, cpu_tmp2_i32);
                         break;
                     }
                     break;
@@ -5605,50 +5640,50 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
                     /* XXX: the corresponding CPUID bit must be tested ! */
                     switch(op >> 4) {
                     case 1:
-                        gen_helper_fisttl_ST0(cpu_tmp2_i32);
+                        gen_helper_fisttl_ST0(cpu_tmp2_i32, cpu_env);
                         tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
                         gen_op_st_T0_A0(OT_LONG + s->mem_index);
                         break;
                     case 2:
-                        gen_helper_fisttll_ST0(cpu_tmp1_i64);
+                        gen_helper_fisttll_ST0(cpu_tmp1_i64, cpu_env);
                         tcg_gen_qemu_st64(cpu_tmp1_i64, cpu_A0, 
                                           (s->mem_index >> 2) - 1);
                         break;
                     case 3:
                     default:
-                        gen_helper_fistt_ST0(cpu_tmp2_i32);
+                        gen_helper_fistt_ST0(cpu_tmp2_i32, cpu_env);
                         tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
                         gen_op_st_T0_A0(OT_WORD + s->mem_index);
                         break;
                     }
-                    gen_helper_fpop();
+                    gen_helper_fpop(cpu_env);
                     break;
                 default:
                     switch(op >> 4) {
                     case 0:
-                        gen_helper_fsts_ST0(cpu_tmp2_i32);
+                        gen_helper_fsts_ST0(cpu_tmp2_i32, cpu_env);
                         tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
                         gen_op_st_T0_A0(OT_LONG + s->mem_index);
                         break;
                     case 1:
-                        gen_helper_fistl_ST0(cpu_tmp2_i32);
+                        gen_helper_fistl_ST0(cpu_tmp2_i32, cpu_env);
                         tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
                         gen_op_st_T0_A0(OT_LONG + s->mem_index);
                         break;
                     case 2:
-                        gen_helper_fstl_ST0(cpu_tmp1_i64);
+                        gen_helper_fstl_ST0(cpu_tmp1_i64, cpu_env);
                         tcg_gen_qemu_st64(cpu_tmp1_i64, cpu_A0, 
                                           (s->mem_index >> 2) - 1);
                         break;
                     case 3:
                     default:
-                        gen_helper_fist_ST0(cpu_tmp2_i32);
+                        gen_helper_fist_ST0(cpu_tmp2_i32, cpu_env);
                         tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
                         gen_op_st_T0_A0(OT_WORD + s->mem_index);
                         break;
                     }
                     if ((op & 7) == 3)
-                        gen_helper_fpop();
+                        gen_helper_fpop(cpu_env);
                     break;
                 }
                 break;
@@ -5656,22 +5691,21 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
                 if (s->cc_op != CC_OP_DYNAMIC)
                     gen_op_set_cc_op(s->cc_op);
                 gen_jmp_im(pc_start - s->cs_base);
-                gen_helper_fldenv(
-                                   cpu_A0, tcg_const_i32(s->dflag));
+                gen_helper_fldenv(cpu_env, cpu_A0, tcg_const_i32(s->dflag));
                 break;
             case 0x0d: /* fldcw mem */
                 gen_op_ld_T0_A0(OT_WORD + s->mem_index);
                 tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
-                gen_helper_fldcw(cpu_tmp2_i32);
+                gen_helper_fldcw(cpu_env, cpu_tmp2_i32);
                 break;
             case 0x0e: /* fnstenv mem */
                 if (s->cc_op != CC_OP_DYNAMIC)
                     gen_op_set_cc_op(s->cc_op);
                 gen_jmp_im(pc_start - s->cs_base);
-                gen_helper_fstenv(cpu_A0, tcg_const_i32(s->dflag));
+                gen_helper_fstenv(cpu_env, cpu_A0, tcg_const_i32(s->dflag));
                 break;
             case 0x0f: /* fnstcw mem */
-                gen_helper_fnstcw(cpu_tmp2_i32);
+                gen_helper_fnstcw(cpu_tmp2_i32, cpu_env);
                 tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
                 gen_op_st_T0_A0(OT_WORD + s->mem_index);
                 break;
@@ -5679,29 +5713,29 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
                 if (s->cc_op != CC_OP_DYNAMIC)
                     gen_op_set_cc_op(s->cc_op);
                 gen_jmp_im(pc_start - s->cs_base);
-                gen_helper_fldt_ST0(cpu_A0);
+                gen_helper_fldt_ST0(cpu_env, cpu_A0);
                 break;
             case 0x1f: /* fstpt mem */
                 if (s->cc_op != CC_OP_DYNAMIC)
                     gen_op_set_cc_op(s->cc_op);
                 gen_jmp_im(pc_start - s->cs_base);
-                gen_helper_fstt_ST0(cpu_A0);
-                gen_helper_fpop();
+                gen_helper_fstt_ST0(cpu_env, cpu_A0);
+                gen_helper_fpop(cpu_env);
                 break;
             case 0x2c: /* frstor mem */
                 if (s->cc_op != CC_OP_DYNAMIC)
                     gen_op_set_cc_op(s->cc_op);
                 gen_jmp_im(pc_start - s->cs_base);
-                gen_helper_frstor(cpu_A0, tcg_const_i32(s->dflag));
+                gen_helper_frstor(cpu_env, cpu_A0, tcg_const_i32(s->dflag));
                 break;
             case 0x2e: /* fnsave mem */
                 if (s->cc_op != CC_OP_DYNAMIC)
                     gen_op_set_cc_op(s->cc_op);
                 gen_jmp_im(pc_start - s->cs_base);
-                gen_helper_fsave(cpu_A0, tcg_const_i32(s->dflag));
+                gen_helper_fsave(cpu_env, cpu_A0, tcg_const_i32(s->dflag));
                 break;
             case 0x2f: /* fnstsw mem */
-                gen_helper_fnstsw(cpu_tmp2_i32);
+                gen_helper_fnstsw(cpu_tmp2_i32, cpu_env);
                 tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
                 gen_op_st_T0_A0(OT_WORD + s->mem_index);
                 break;
@@ -5709,25 +5743,25 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
                 if (s->cc_op != CC_OP_DYNAMIC)
                     gen_op_set_cc_op(s->cc_op);
                 gen_jmp_im(pc_start - s->cs_base);
-                gen_helper_fbld_ST0(cpu_A0);
+                gen_helper_fbld_ST0(cpu_env, cpu_A0);
                 break;
             case 0x3e: /* fbstp */
                 if (s->cc_op != CC_OP_DYNAMIC)
                     gen_op_set_cc_op(s->cc_op);
                 gen_jmp_im(pc_start - s->cs_base);
-                gen_helper_fbst_ST0(cpu_A0);
-                gen_helper_fpop();
+                gen_helper_fbst_ST0(cpu_env, cpu_A0);
+                gen_helper_fpop(cpu_env);
                 break;
             case 0x3d: /* fildll */
                 tcg_gen_qemu_ld64(cpu_tmp1_i64, cpu_A0, 
                                   (s->mem_index >> 2) - 1);
-                gen_helper_fildll_ST0(cpu_tmp1_i64);
+                gen_helper_fildll_ST0(cpu_env, cpu_tmp1_i64);
                 break;
             case 0x3f: /* fistpll */
-                gen_helper_fistll_ST0(cpu_tmp1_i64);
+                gen_helper_fistll_ST0(cpu_tmp1_i64, cpu_env);
                 tcg_gen_qemu_st64(cpu_tmp1_i64, cpu_A0, 
                                   (s->mem_index >> 2) - 1);
-                gen_helper_fpop();
+                gen_helper_fpop(cpu_env);
                 break;
             default:
                 goto illegal_op;
@@ -5738,13 +5772,14 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
 
             switch(op) {
             case 0x08: /* fld sti */
-                gen_helper_fpush();
-                gen_helper_fmov_ST0_STN(tcg_const_i32((opreg + 1) & 7));
+                gen_helper_fpush(cpu_env);
+                gen_helper_fmov_ST0_STN(cpu_env,
+                                        tcg_const_i32((opreg + 1) & 7));
                 break;
             case 0x09: /* fxchg sti */
             case 0x29: /* fxchg4 sti, undocumented op */
             case 0x39: /* fxchg7 sti, undocumented op */
-                gen_helper_fxchg_ST0_STN(tcg_const_i32(opreg));
+                gen_helper_fxchg_ST0_STN(cpu_env, tcg_const_i32(opreg));
                 break;
             case 0x0a: /* grp d9/2 */
                 switch(rm) {
@@ -5753,7 +5788,7 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
                     if (s->cc_op != CC_OP_DYNAMIC)
                         gen_op_set_cc_op(s->cc_op);
                     gen_jmp_im(pc_start - s->cs_base);
-                    gen_helper_fwait();
+                    gen_helper_fwait(cpu_env);
                     break;
                 default:
                     goto illegal_op;
@@ -5762,17 +5797,17 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
             case 0x0c: /* grp d9/4 */
                 switch(rm) {
                 case 0: /* fchs */
-                    gen_helper_fchs_ST0();
+                    gen_helper_fchs_ST0(cpu_env);
                     break;
                 case 1: /* fabs */
-                    gen_helper_fabs_ST0();
+                    gen_helper_fabs_ST0(cpu_env);
                     break;
                 case 4: /* ftst */
-                    gen_helper_fldz_FT0();
-                    gen_helper_fcom_ST0_FT0();
+                    gen_helper_fldz_FT0(cpu_env);
+                    gen_helper_fcom_ST0_FT0(cpu_env);
                     break;
                 case 5: /* fxam */
-                    gen_helper_fxam_ST0();
+                    gen_helper_fxam_ST0(cpu_env);
                     break;
                 default:
                     goto illegal_op;
@@ -5782,32 +5817,32 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
                 {
                     switch(rm) {
                     case 0:
-                        gen_helper_fpush();
-                        gen_helper_fld1_ST0();
+                        gen_helper_fpush(cpu_env);
+                        gen_helper_fld1_ST0(cpu_env);
                         break;
                     case 1:
-                        gen_helper_fpush();
-                        gen_helper_fldl2t_ST0();
+                        gen_helper_fpush(cpu_env);
+                        gen_helper_fldl2t_ST0(cpu_env);
                         break;
                     case 2:
-                        gen_helper_fpush();
-                        gen_helper_fldl2e_ST0();
+                        gen_helper_fpush(cpu_env);
+                        gen_helper_fldl2e_ST0(cpu_env);
                         break;
                     case 3:
-                        gen_helper_fpush();
-                        gen_helper_fldpi_ST0();
+                        gen_helper_fpush(cpu_env);
+                        gen_helper_fldpi_ST0(cpu_env);
                         break;
                     case 4:
-                        gen_helper_fpush();
-                        gen_helper_fldlg2_ST0();
+                        gen_helper_fpush(cpu_env);
+                        gen_helper_fldlg2_ST0(cpu_env);
                         break;
                     case 5:
-                        gen_helper_fpush();
-                        gen_helper_fldln2_ST0();
+                        gen_helper_fpush(cpu_env);
+                        gen_helper_fldln2_ST0(cpu_env);
                         break;
                     case 6:
-                        gen_helper_fpush();
-                        gen_helper_fldz_ST0();
+                        gen_helper_fpush(cpu_env);
+                        gen_helper_fldz_ST0(cpu_env);
                         break;
                     default:
                         goto illegal_op;
@@ -5817,58 +5852,58 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
             case 0x0e: /* grp d9/6 */
                 switch(rm) {
                 case 0: /* f2xm1 */
-                    gen_helper_f2xm1();
+                    gen_helper_f2xm1(cpu_env);
                     break;
                 case 1: /* fyl2x */
-                    gen_helper_fyl2x();
+                    gen_helper_fyl2x(cpu_env);
                     break;
                 case 2: /* fptan */
-                    gen_helper_fptan();
+                    gen_helper_fptan(cpu_env);
                     break;
                 case 3: /* fpatan */
-                    gen_helper_fpatan();
+                    gen_helper_fpatan(cpu_env);
                     break;
                 case 4: /* fxtract */
-                    gen_helper_fxtract();
+                    gen_helper_fxtract(cpu_env);
                     break;
                 case 5: /* fprem1 */
-                    gen_helper_fprem1();
+                    gen_helper_fprem1(cpu_env);
                     break;
                 case 6: /* fdecstp */
-                    gen_helper_fdecstp();
+                    gen_helper_fdecstp(cpu_env);
                     break;
                 default:
                 case 7: /* fincstp */
-                    gen_helper_fincstp();
+                    gen_helper_fincstp(cpu_env);
                     break;
                 }
                 break;
             case 0x0f: /* grp d9/7 */
                 switch(rm) {
                 case 0: /* fprem */
-                    gen_helper_fprem();
+                    gen_helper_fprem(cpu_env);
                     break;
                 case 1: /* fyl2xp1 */
-                    gen_helper_fyl2xp1();
+                    gen_helper_fyl2xp1(cpu_env);
                     break;
                 case 2: /* fsqrt */
-                    gen_helper_fsqrt();
+                    gen_helper_fsqrt(cpu_env);
                     break;
                 case 3: /* fsincos */
-                    gen_helper_fsincos();
+                    gen_helper_fsincos(cpu_env);
                     break;
                 case 5: /* fscale */
-                    gen_helper_fscale();
+                    gen_helper_fscale(cpu_env);
                     break;
                 case 4: /* frndint */
-                    gen_helper_frndint();
+                    gen_helper_frndint(cpu_env);
                     break;
                 case 6: /* fsin */
-                    gen_helper_fsin();
+                    gen_helper_fsin(cpu_env);
                     break;
                 default:
                 case 7: /* fcos */
-                    gen_helper_fcos();
+                    gen_helper_fcos(cpu_env);
                     break;
                 }
                 break;
@@ -5882,32 +5917,32 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
                     if (op >= 0x20) {
                         gen_helper_fp_arith_STN_ST0(op1, opreg);
                         if (op >= 0x30)
-                            gen_helper_fpop();
+                            gen_helper_fpop(cpu_env);
                     } else {
-                        gen_helper_fmov_FT0_STN(tcg_const_i32(opreg));
+                        gen_helper_fmov_FT0_STN(cpu_env, tcg_const_i32(opreg));
                         gen_helper_fp_arith_ST0_FT0(op1);
                     }
                 }
                 break;
             case 0x02: /* fcom */
             case 0x22: /* fcom2, undocumented op */
-                gen_helper_fmov_FT0_STN(tcg_const_i32(opreg));
-                gen_helper_fcom_ST0_FT0();
+                gen_helper_fmov_FT0_STN(cpu_env, tcg_const_i32(opreg));
+                gen_helper_fcom_ST0_FT0(cpu_env);
                 break;
             case 0x03: /* fcomp */
             case 0x23: /* fcomp3, undocumented op */
             case 0x32: /* fcomp5, undocumented op */
-                gen_helper_fmov_FT0_STN(tcg_const_i32(opreg));
-                gen_helper_fcom_ST0_FT0();
-                gen_helper_fpop();
+                gen_helper_fmov_FT0_STN(cpu_env, tcg_const_i32(opreg));
+                gen_helper_fcom_ST0_FT0(cpu_env);
+                gen_helper_fpop(cpu_env);
                 break;
             case 0x15: /* da/5 */
                 switch(rm) {
                 case 1: /* fucompp */
-                    gen_helper_fmov_FT0_STN(tcg_const_i32(1));
-                    gen_helper_fucom_ST0_FT0();
-                    gen_helper_fpop();
-                    gen_helper_fpop();
+                    gen_helper_fmov_FT0_STN(cpu_env, tcg_const_i32(1));
+                    gen_helper_fucom_ST0_FT0(cpu_env);
+                    gen_helper_fpop(cpu_env);
+                    gen_helper_fpop(cpu_env);
                     break;
                 default:
                     goto illegal_op;
@@ -5920,10 +5955,10 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
                 case 1: /* fdisi (287 only, just do nop here) */
                     break;
                 case 2: /* fclex */
-                    gen_helper_fclex();
+                    gen_helper_fclex(cpu_env);
                     break;
                 case 3: /* fninit */
-                    gen_helper_fninit();
+                    gen_helper_fninit(cpu_env);
                     break;
                 case 4: /* fsetpm (287 only, just do nop here) */
                     break;
@@ -5934,59 +5969,59 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
             case 0x1d: /* fucomi */
                 if (s->cc_op != CC_OP_DYNAMIC)
                     gen_op_set_cc_op(s->cc_op);
-                gen_helper_fmov_FT0_STN(tcg_const_i32(opreg));
-                gen_helper_fucomi_ST0_FT0();
+                gen_helper_fmov_FT0_STN(cpu_env, tcg_const_i32(opreg));
+                gen_helper_fucomi_ST0_FT0(cpu_env);
                 s->cc_op = CC_OP_EFLAGS;
                 break;
             case 0x1e: /* fcomi */
                 if (s->cc_op != CC_OP_DYNAMIC)
                     gen_op_set_cc_op(s->cc_op);
-                gen_helper_fmov_FT0_STN(tcg_const_i32(opreg));
-                gen_helper_fcomi_ST0_FT0();
+                gen_helper_fmov_FT0_STN(cpu_env, tcg_const_i32(opreg));
+                gen_helper_fcomi_ST0_FT0(cpu_env);
                 s->cc_op = CC_OP_EFLAGS;
                 break;
             case 0x28: /* ffree sti */
-                gen_helper_ffree_STN(tcg_const_i32(opreg));
+                gen_helper_ffree_STN(cpu_env, tcg_const_i32(opreg));
                 break;
             case 0x2a: /* fst sti */
-                gen_helper_fmov_STN_ST0(tcg_const_i32(opreg));
+                gen_helper_fmov_STN_ST0(cpu_env, tcg_const_i32(opreg));
                 break;
             case 0x2b: /* fstp sti */
             case 0x0b: /* fstp1 sti, undocumented op */
             case 0x3a: /* fstp8 sti, undocumented op */
             case 0x3b: /* fstp9 sti, undocumented op */
-                gen_helper_fmov_STN_ST0(tcg_const_i32(opreg));
-                gen_helper_fpop();
+                gen_helper_fmov_STN_ST0(cpu_env, tcg_const_i32(opreg));
+                gen_helper_fpop(cpu_env);
                 break;
             case 0x2c: /* fucom st(i) */
-                gen_helper_fmov_FT0_STN(tcg_const_i32(opreg));
-                gen_helper_fucom_ST0_FT0();
+                gen_helper_fmov_FT0_STN(cpu_env, tcg_const_i32(opreg));
+                gen_helper_fucom_ST0_FT0(cpu_env);
                 break;
             case 0x2d: /* fucomp st(i) */
-                gen_helper_fmov_FT0_STN(tcg_const_i32(opreg));
-                gen_helper_fucom_ST0_FT0();
-                gen_helper_fpop();
+                gen_helper_fmov_FT0_STN(cpu_env, tcg_const_i32(opreg));
+                gen_helper_fucom_ST0_FT0(cpu_env);
+                gen_helper_fpop(cpu_env);
                 break;
             case 0x33: /* de/3 */
                 switch(rm) {
                 case 1: /* fcompp */
-                    gen_helper_fmov_FT0_STN(tcg_const_i32(1));
-                    gen_helper_fcom_ST0_FT0();
-                    gen_helper_fpop();
-                    gen_helper_fpop();
+                    gen_helper_fmov_FT0_STN(cpu_env, tcg_const_i32(1));
+                    gen_helper_fcom_ST0_FT0(cpu_env);
+                    gen_helper_fpop(cpu_env);
+                    gen_helper_fpop(cpu_env);
                     break;
                 default:
                     goto illegal_op;
                 }
                 break;
             case 0x38: /* ffreep sti, undocumented op */
-                gen_helper_ffree_STN(tcg_const_i32(opreg));
-                gen_helper_fpop();
+                gen_helper_ffree_STN(cpu_env, tcg_const_i32(opreg));
+                gen_helper_fpop(cpu_env);
                 break;
             case 0x3c: /* df/4 */
                 switch(rm) {
                 case 0:
-                    gen_helper_fnstsw(cpu_tmp2_i32);
+                    gen_helper_fnstsw(cpu_tmp2_i32, cpu_env);
                     tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
                     gen_op_mov_reg_T0(OT_WORD, R_EAX);
                     break;
@@ -5997,17 +6032,17 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
             case 0x3d: /* fucomip */
                 if (s->cc_op != CC_OP_DYNAMIC)
                     gen_op_set_cc_op(s->cc_op);
-                gen_helper_fmov_FT0_STN(tcg_const_i32(opreg));
-                gen_helper_fucomi_ST0_FT0();
-                gen_helper_fpop();
+                gen_helper_fmov_FT0_STN(cpu_env, tcg_const_i32(opreg));
+                gen_helper_fucomi_ST0_FT0(cpu_env);
+                gen_helper_fpop(cpu_env);
                 s->cc_op = CC_OP_EFLAGS;
                 break;
             case 0x3e: /* fcomip */
                 if (s->cc_op != CC_OP_DYNAMIC)
                     gen_op_set_cc_op(s->cc_op);
-                gen_helper_fmov_FT0_STN(tcg_const_i32(opreg));
-                gen_helper_fcomi_ST0_FT0();
-                gen_helper_fpop();
+                gen_helper_fmov_FT0_STN(cpu_env, tcg_const_i32(opreg));
+                gen_helper_fcomi_ST0_FT0(cpu_env);
+                gen_helper_fpop(cpu_env);
                 s->cc_op = CC_OP_EFLAGS;
                 break;
             case 0x10 ... 0x13: /* fcmovxx */
@@ -6023,7 +6058,7 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
                     op1 = fcmov_cc[op & 3] | (((op >> 3) & 1) ^ 1);
                     l1 = gen_new_label();
                     gen_jcc1(s, s->cc_op, op1, l1);
-                    gen_helper_fmov_ST0_STN(tcg_const_i32(opreg));
+                    gen_helper_fmov_ST0_STN(cpu_env, tcg_const_i32(opreg));
                     gen_set_label(l1);
                 }
                 break;
@@ -6742,7 +6777,7 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
             if (s->cc_op != CC_OP_DYNAMIC)
                 gen_op_set_cc_op(s->cc_op);
             gen_jmp_im(pc_start - s->cs_base);
-            gen_helper_fwait();
+            gen_helper_fwait(cpu_env);
         }
         break;
     case 0xcc: /* int3 */
@@ -7579,7 +7614,7 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
             if (s->cc_op != CC_OP_DYNAMIC)
                 gen_op_set_cc_op(s->cc_op);
             gen_jmp_im(pc_start - s->cs_base);
-            gen_helper_fxsave(cpu_A0, tcg_const_i32((s->dflag == 2)));
+            gen_helper_fxsave(cpu_env, cpu_A0, tcg_const_i32((s->dflag == 2)));
             break;
         case 1: /* fxrstor */
             if (mod == 3 || !(s->cpuid_features & CPUID_FXSR) ||
@@ -7593,7 +7628,8 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
             if (s->cc_op != CC_OP_DYNAMIC)
                 gen_op_set_cc_op(s->cc_op);
             gen_jmp_im(pc_start - s->cs_base);
-            gen_helper_fxrstor(cpu_A0, tcg_const_i32((s->dflag == 2)));
+            gen_helper_fxrstor(cpu_env, cpu_A0,
+                               tcg_const_i32((s->dflag == 2)));
             break;
         case 2: /* ldmxcsr */
         case 3: /* stmxcsr */
@@ -7608,7 +7644,7 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
             if (op == 2) {
                 gen_op_ld_T0_A0(OT_LONG + s->mem_index);
                 tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
-                gen_helper_ldmxcsr(cpu_tmp2_i32);
+                gen_helper_ldmxcsr(cpu_env, cpu_tmp2_i32);
             } else {
                 tcg_gen_ld32u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State, mxcsr));
                 gen_op_st_T0_A0(OT_LONG + s->mem_index);
@@ -7671,7 +7707,7 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
             ot = OT_QUAD;
 
         gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0);
-        gen_helper_popcnt(cpu_T[0], cpu_T[0], tcg_const_i32(ot));
+        gen_helper_popcnt(cpu_T[0], cpu_env, cpu_T[0], tcg_const_i32(ot));
         gen_op_mov_reg_T0(ot, reg);
 
         s->cc_op = CC_OP_EFLAGS;