Patchwork target-i386: Use mulu2 and muls2

login
register
mail settings
Submitter Richard Henderson
Date Feb. 26, 2013, 8:06 p.m.
Message ID <1361909183-14653-1-git-send-email-rth@twiddle.net>
Download mbox | patch
Permalink /patch/223387/
State New
Headers show

Comments

Richard Henderson - Feb. 26, 2013, 8:06 p.m.
The mulu2 and muls2 ops correspond very closely to the multiply
instructions that we're emulating.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 target-i386/helper.h     |   4 --
 target-i386/int_helper.c |  40 ------------
 target-i386/translate.c  | 167 ++++++++++++++++-------------------------------
 3 files changed, 56 insertions(+), 155 deletions(-)

Blue, this patch got dropped from the series that you applied.
I think it initially hit a conflict, before I reminded you of the
listed dependency, but it was never re-applied after that.


r~
Blue Swirl - Feb. 27, 2013, 7:42 p.m.
Thanks, applied.

On Tue, Feb 26, 2013 at 8:06 PM, Richard Henderson <rth@twiddle.net> wrote:
> These correspond very closely to the insns that we're emulating.
>
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  target-i386/helper.h     |   4 --
>  target-i386/int_helper.c |  40 ------------
>  target-i386/translate.c  | 167 ++++++++++++++++-------------------------------
>  3 files changed, 56 insertions(+), 155 deletions(-)
>
> Blue, this patch got dropped from the series that you applied.
> I think there was an initial conflict in this patch, before I
> reminded you of the listed dependency.  But the subsequent
> application didn't happen.
>
>
> r~
>
>
> diff --git a/target-i386/helper.h b/target-i386/helper.h
> index 26a0cc8..d6974df 100644
> --- a/target-i386/helper.h
> +++ b/target-i386/helper.h
> @@ -14,12 +14,8 @@ DEF_HELPER_2(idivw_AX, void, env, tl)
>  DEF_HELPER_2(divl_EAX, void, env, tl)
>  DEF_HELPER_2(idivl_EAX, void, env, tl)
>  #ifdef TARGET_X86_64
> -DEF_HELPER_2(mulq_EAX_T0, void, env, tl)
> -DEF_HELPER_2(imulq_EAX_T0, void, env, tl)
> -DEF_HELPER_3(imulq_T0_T1, tl, env, tl, tl)
>  DEF_HELPER_2(divq_EAX, void, env, tl)
>  DEF_HELPER_2(idivq_EAX, void, env, tl)
> -DEF_HELPER_FLAGS_2(umulh, TCG_CALL_NO_RWG_SE, tl, tl, tl)
>  #endif
>
>  DEF_HELPER_2(aam, void, env, int)
> diff --git a/target-i386/int_helper.c b/target-i386/int_helper.c
> index 3b56075..74c7c36 100644
> --- a/target-i386/int_helper.c
> +++ b/target-i386/int_helper.c
> @@ -374,46 +374,6 @@ static int idiv64(uint64_t *plow, uint64_t *phigh, int64_t b)
>      return 0;
>  }
>
> -void helper_mulq_EAX_T0(CPUX86State *env, target_ulong t0)
> -{
> -    uint64_t r0, r1;
> -
> -    mulu64(&r0, &r1, EAX, t0);
> -    EAX = r0;
> -    EDX = r1;
> -    CC_DST = r0;
> -    CC_SRC = r1;
> -}
> -
> -target_ulong helper_umulh(target_ulong t0, target_ulong t1)
> -{
> -    uint64_t h, l;
> -    mulu64(&l, &h, t0, t1);
> -    return h;
> -}
> -
> -void helper_imulq_EAX_T0(CPUX86State *env, target_ulong t0)
> -{
> -    uint64_t r0, r1;
> -
> -    muls64(&r0, &r1, EAX, t0);
> -    EAX = r0;
> -    EDX = r1;
> -    CC_DST = r0;
> -    CC_SRC = ((int64_t)r1 != ((int64_t)r0 >> 63));
> -}
> -
> -target_ulong helper_imulq_T0_T1(CPUX86State *env, target_ulong t0,
> -                                target_ulong t1)
> -{
> -    uint64_t r0, r1;
> -
> -    muls64(&r0, &r1, t0, t1);
> -    CC_DST = r0;
> -    CC_SRC = ((int64_t)r1 != ((int64_t)r0 >> 63));
> -    return r0;
> -}
> -
>  void helper_divq_EAX(CPUX86State *env, target_ulong t0)
>  {
>      uint64_t r0, r1;
> diff --git a/target-i386/translate.c b/target-i386/translate.c
> index 605cd88..3b92f3b 100644
> --- a/target-i386/translate.c
> +++ b/target-i386/translate.c
> @@ -4111,31 +4111,18 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
>                  ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
>                  gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
>                  switch (ot) {
> -                    TCGv_i64 t0, t1;
>                  default:
> -                    t0 = tcg_temp_new_i64();
> -                    t1 = tcg_temp_new_i64();
> -#ifdef TARGET_X86_64
> -                    tcg_gen_ext32u_i64(t0, cpu_T[0]);
> -                    tcg_gen_ext32u_i64(t1, cpu_regs[R_EDX]);
> -#else
> -                    tcg_gen_extu_i32_i64(t0, cpu_T[0]);
> -                    tcg_gen_extu_i32_i64(t0, cpu_regs[R_EDX]);
> -#endif
> -                    tcg_gen_mul_i64(t0, t0, t1);
> -                    tcg_gen_trunc_i64_tl(cpu_T[0], t0);
> -                    tcg_gen_shri_i64(t0, t0, 32);
> -                    tcg_gen_trunc_i64_tl(cpu_T[1], t0);
> -                    tcg_temp_free_i64(t0);
> -                    tcg_temp_free_i64(t1);
> -                    gen_op_mov_reg_T0(OT_LONG, s->vex_v);
> -                    gen_op_mov_reg_T1(OT_LONG, reg);
> +                    tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
> +                    tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_regs[R_EDX]);
> +                    tcg_gen_mulu2_i32(cpu_tmp2_i32, cpu_tmp3_i32,
> +                                      cpu_tmp2_i32, cpu_tmp3_i32);
> +                    tcg_gen_extu_i32_tl(cpu_regs[s->vex_v], cpu_tmp2_i32);
> +                    tcg_gen_extu_i32_tl(cpu_regs[reg], cpu_tmp3_i32);
>                      break;
>  #ifdef TARGET_X86_64
>                  case OT_QUAD:
> -                    tcg_gen_mov_tl(cpu_T[1], cpu_regs[R_EDX]);
> -                    tcg_gen_mul_tl(cpu_regs[s->vex_v], cpu_T[0], cpu_T[1]);
> -                    gen_helper_umulh(cpu_regs[reg], cpu_T[0], cpu_T[1]);
> +                    tcg_gen_mulu2_i64(cpu_regs[s->vex_v], cpu_regs[reg],
> +                                      cpu_T[0], cpu_regs[R_EDX]);
>                      break;
>  #endif
>                  }
> @@ -5032,39 +5019,22 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
>                  break;
>              default:
>              case OT_LONG:
> -#ifdef TARGET_X86_64
> -                gen_op_mov_TN_reg(OT_LONG, 1, R_EAX);
> -                tcg_gen_ext32u_tl(cpu_T[0], cpu_T[0]);
> -                tcg_gen_ext32u_tl(cpu_T[1], cpu_T[1]);
> -                tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
> -                gen_op_mov_reg_T0(OT_LONG, R_EAX);
> -                tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
> -                tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 32);
> -                gen_op_mov_reg_T0(OT_LONG, R_EDX);
> -                tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
> -#else
> -                {
> -                    TCGv_i64 t0, t1;
> -                    t0 = tcg_temp_new_i64();
> -                    t1 = tcg_temp_new_i64();
> -                    gen_op_mov_TN_reg(OT_LONG, 1, R_EAX);
> -                    tcg_gen_extu_i32_i64(t0, cpu_T[0]);
> -                    tcg_gen_extu_i32_i64(t1, cpu_T[1]);
> -                    tcg_gen_mul_i64(t0, t0, t1);
> -                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
> -                    gen_op_mov_reg_T0(OT_LONG, R_EAX);
> -                    tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
> -                    tcg_gen_shri_i64(t0, t0, 32);
> -                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
> -                    gen_op_mov_reg_T0(OT_LONG, R_EDX);
> -                    tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
> -                }
> -#endif
> +                tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
> +                tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_regs[R_EAX]);
> +                tcg_gen_mulu2_i32(cpu_tmp2_i32, cpu_tmp3_i32,
> +                                  cpu_tmp2_i32, cpu_tmp3_i32);
> +                tcg_gen_extu_i32_tl(cpu_regs[R_EAX], cpu_tmp2_i32);
> +                tcg_gen_extu_i32_tl(cpu_regs[R_EDX], cpu_tmp3_i32);
> +                tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[R_EAX]);
> +                tcg_gen_mov_tl(cpu_cc_src, cpu_regs[R_EDX]);
>                  set_cc_op(s, CC_OP_MULL);
>                  break;
>  #ifdef TARGET_X86_64
>              case OT_QUAD:
> -                gen_helper_mulq_EAX_T0(cpu_env, cpu_T[0]);
> +                tcg_gen_mulu2_i64(cpu_regs[R_EAX], cpu_regs[R_EDX],
> +                                  cpu_T[0], cpu_regs[R_EAX]);
> +                tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[R_EAX]);
> +                tcg_gen_mov_tl(cpu_cc_src, cpu_regs[R_EDX]);
>                  set_cc_op(s, CC_OP_MULQ);
>                  break;
>  #endif
> @@ -5100,41 +5070,25 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
>                  break;
>              default:
>              case OT_LONG:
> -#ifdef TARGET_X86_64
> -                gen_op_mov_TN_reg(OT_LONG, 1, R_EAX);
> -                tcg_gen_ext32s_tl(cpu_T[0], cpu_T[0]);
> -                tcg_gen_ext32s_tl(cpu_T[1], cpu_T[1]);
> -                tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
> -                gen_op_mov_reg_T0(OT_LONG, R_EAX);
> -                tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
> -                tcg_gen_ext32s_tl(cpu_tmp0, cpu_T[0]);
> -                tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
> -                tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 32);
> -                gen_op_mov_reg_T0(OT_LONG, R_EDX);
> -#else
> -                {
> -                    TCGv_i64 t0, t1;
> -                    t0 = tcg_temp_new_i64();
> -                    t1 = tcg_temp_new_i64();
> -                    gen_op_mov_TN_reg(OT_LONG, 1, R_EAX);
> -                    tcg_gen_ext_i32_i64(t0, cpu_T[0]);
> -                    tcg_gen_ext_i32_i64(t1, cpu_T[1]);
> -                    tcg_gen_mul_i64(t0, t0, t1);
> -                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
> -                    gen_op_mov_reg_T0(OT_LONG, R_EAX);
> -                    tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
> -                    tcg_gen_sari_tl(cpu_tmp0, cpu_T[0], 31);
> -                    tcg_gen_shri_i64(t0, t0, 32);
> -                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
> -                    gen_op_mov_reg_T0(OT_LONG, R_EDX);
> -                    tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
> -                }
> -#endif
> +                tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
> +                tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_regs[R_EAX]);
> +                tcg_gen_muls2_i32(cpu_tmp2_i32, cpu_tmp3_i32,
> +                                  cpu_tmp2_i32, cpu_tmp3_i32);
> +                tcg_gen_extu_i32_tl(cpu_regs[R_EAX], cpu_tmp2_i32);
> +                tcg_gen_extu_i32_tl(cpu_regs[R_EDX], cpu_tmp3_i32);
> +                tcg_gen_sari_i32(cpu_tmp2_i32, cpu_tmp2_i32, 31);
> +                tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[R_EAX]);
> +                tcg_gen_sub_i32(cpu_tmp2_i32, cpu_tmp2_i32, cpu_tmp3_i32);
> +                tcg_gen_extu_i32_tl(cpu_cc_src, cpu_tmp2_i32);
>                  set_cc_op(s, CC_OP_MULL);
>                  break;
>  #ifdef TARGET_X86_64
>              case OT_QUAD:
> -                gen_helper_imulq_EAX_T0(cpu_env, cpu_T[0]);
> +                tcg_gen_muls2_i64(cpu_regs[R_EAX], cpu_regs[R_EDX],
> +                                  cpu_T[0], cpu_regs[R_EAX]);
> +                tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[R_EAX]);
> +                tcg_gen_sari_tl(cpu_cc_src, cpu_regs[R_EAX], 63);
> +                tcg_gen_sub_tl(cpu_cc_src, cpu_cc_src, cpu_regs[R_EDX]);
>                  set_cc_op(s, CC_OP_MULQ);
>                  break;
>  #endif
> @@ -5389,37 +5343,27 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
>          } else {
>              gen_op_mov_TN_reg(ot, 1, reg);
>          }
> -
> -#ifdef TARGET_X86_64
> -        if (ot == OT_QUAD) {
> -            gen_helper_imulq_T0_T1(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]);
> -        } else
> -#endif
> -        if (ot == OT_LONG) {
> +        switch (ot) {
>  #ifdef TARGET_X86_64
> -                tcg_gen_ext32s_tl(cpu_T[0], cpu_T[0]);
> -                tcg_gen_ext32s_tl(cpu_T[1], cpu_T[1]);
> -                tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
> -                tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
> -                tcg_gen_ext32s_tl(cpu_tmp0, cpu_T[0]);
> -                tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
> -#else
> -                {
> -                    TCGv_i64 t0, t1;
> -                    t0 = tcg_temp_new_i64();
> -                    t1 = tcg_temp_new_i64();
> -                    tcg_gen_ext_i32_i64(t0, cpu_T[0]);
> -                    tcg_gen_ext_i32_i64(t1, cpu_T[1]);
> -                    tcg_gen_mul_i64(t0, t0, t1);
> -                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
> -                    tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
> -                    tcg_gen_sari_tl(cpu_tmp0, cpu_T[0], 31);
> -                    tcg_gen_shri_i64(t0, t0, 32);
> -                    tcg_gen_trunc_i64_i32(cpu_T[1], t0);
> -                    tcg_gen_sub_tl(cpu_cc_src, cpu_T[1], cpu_tmp0);
> -                }
> +        case OT_QUAD:
> +            tcg_gen_muls2_i64(cpu_regs[reg], cpu_T[1], cpu_T[0], cpu_T[1]);
> +            tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[reg]);
> +            tcg_gen_sari_tl(cpu_cc_src, cpu_cc_dst, 63);
> +            tcg_gen_sub_tl(cpu_cc_src, cpu_cc_src, cpu_T[1]);
> +            break;
>  #endif
> -        } else {
> +        case OT_LONG:
> +            tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
> +            tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_T[1]);
> +            tcg_gen_muls2_i32(cpu_tmp2_i32, cpu_tmp3_i32,
> +                              cpu_tmp2_i32, cpu_tmp3_i32);
> +            tcg_gen_extu_i32_tl(cpu_regs[reg], cpu_tmp2_i32);
> +            tcg_gen_sari_i32(cpu_tmp2_i32, cpu_tmp2_i32, 31);
> +            tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[reg]);
> +            tcg_gen_sub_i32(cpu_tmp2_i32, cpu_tmp2_i32, cpu_tmp3_i32);
> +            tcg_gen_extu_i32_tl(cpu_cc_src, cpu_tmp2_i32);
> +            break;
> +        default:
>              tcg_gen_ext16s_tl(cpu_T[0], cpu_T[0]);
>              tcg_gen_ext16s_tl(cpu_T[1], cpu_T[1]);
>              /* XXX: use 32 bit mul which could be faster */
> @@ -5427,8 +5371,9 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
>              tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
>              tcg_gen_ext16s_tl(cpu_tmp0, cpu_T[0]);
>              tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
> +            gen_op_mov_reg_T0(ot, reg);
> +            break;
>          }
> -        gen_op_mov_reg_T0(ot, reg);
>          set_cc_op(s, CC_OP_MULB + ot);
>          break;
>      case 0x1c0:
> --
> 1.8.1.2
>

Patch

diff --git a/target-i386/helper.h b/target-i386/helper.h
index 26a0cc8..d6974df 100644
--- a/target-i386/helper.h
+++ b/target-i386/helper.h
@@ -14,12 +14,8 @@  DEF_HELPER_2(idivw_AX, void, env, tl)
 DEF_HELPER_2(divl_EAX, void, env, tl)
 DEF_HELPER_2(idivl_EAX, void, env, tl)
 #ifdef TARGET_X86_64
-DEF_HELPER_2(mulq_EAX_T0, void, env, tl)
-DEF_HELPER_2(imulq_EAX_T0, void, env, tl)
-DEF_HELPER_3(imulq_T0_T1, tl, env, tl, tl)
 DEF_HELPER_2(divq_EAX, void, env, tl)
 DEF_HELPER_2(idivq_EAX, void, env, tl)
-DEF_HELPER_FLAGS_2(umulh, TCG_CALL_NO_RWG_SE, tl, tl, tl)
 #endif
 
 DEF_HELPER_2(aam, void, env, int)
diff --git a/target-i386/int_helper.c b/target-i386/int_helper.c
index 3b56075..74c7c36 100644
--- a/target-i386/int_helper.c
+++ b/target-i386/int_helper.c
@@ -374,46 +374,6 @@  static int idiv64(uint64_t *plow, uint64_t *phigh, int64_t b)
     return 0;
 }
 
-void helper_mulq_EAX_T0(CPUX86State *env, target_ulong t0)
-{
-    uint64_t r0, r1;
-
-    mulu64(&r0, &r1, EAX, t0);
-    EAX = r0;
-    EDX = r1;
-    CC_DST = r0;
-    CC_SRC = r1;
-}
-
-target_ulong helper_umulh(target_ulong t0, target_ulong t1)
-{
-    uint64_t h, l;
-    mulu64(&l, &h, t0, t1);
-    return h;
-}
-
-void helper_imulq_EAX_T0(CPUX86State *env, target_ulong t0)
-{
-    uint64_t r0, r1;
-
-    muls64(&r0, &r1, EAX, t0);
-    EAX = r0;
-    EDX = r1;
-    CC_DST = r0;
-    CC_SRC = ((int64_t)r1 != ((int64_t)r0 >> 63));
-}
-
-target_ulong helper_imulq_T0_T1(CPUX86State *env, target_ulong t0,
-                                target_ulong t1)
-{
-    uint64_t r0, r1;
-
-    muls64(&r0, &r1, t0, t1);
-    CC_DST = r0;
-    CC_SRC = ((int64_t)r1 != ((int64_t)r0 >> 63));
-    return r0;
-}
-
 void helper_divq_EAX(CPUX86State *env, target_ulong t0)
 {
     uint64_t r0, r1;
diff --git a/target-i386/translate.c b/target-i386/translate.c
index 605cd88..3b92f3b 100644
--- a/target-i386/translate.c
+++ b/target-i386/translate.c
@@ -4111,31 +4111,18 @@  static void gen_sse(CPUX86State *env, DisasContext *s, int b,
                 ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
                 gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
                 switch (ot) {
-                    TCGv_i64 t0, t1;
                 default:
-                    t0 = tcg_temp_new_i64();
-                    t1 = tcg_temp_new_i64();
-#ifdef TARGET_X86_64
-                    tcg_gen_ext32u_i64(t0, cpu_T[0]);
-                    tcg_gen_ext32u_i64(t1, cpu_regs[R_EDX]);
-#else
-                    tcg_gen_extu_i32_i64(t0, cpu_T[0]);
-                    tcg_gen_extu_i32_i64(t0, cpu_regs[R_EDX]);
-#endif
-                    tcg_gen_mul_i64(t0, t0, t1);
-                    tcg_gen_trunc_i64_tl(cpu_T[0], t0);
-                    tcg_gen_shri_i64(t0, t0, 32);
-                    tcg_gen_trunc_i64_tl(cpu_T[1], t0);
-                    tcg_temp_free_i64(t0);
-                    tcg_temp_free_i64(t1);
-                    gen_op_mov_reg_T0(OT_LONG, s->vex_v);
-                    gen_op_mov_reg_T1(OT_LONG, reg);
+                    tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                    tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_regs[R_EDX]);
+                    tcg_gen_mulu2_i32(cpu_tmp2_i32, cpu_tmp3_i32,
+                                      cpu_tmp2_i32, cpu_tmp3_i32);
+                    tcg_gen_extu_i32_tl(cpu_regs[s->vex_v], cpu_tmp2_i32);
+                    tcg_gen_extu_i32_tl(cpu_regs[reg], cpu_tmp3_i32);
                     break;
 #ifdef TARGET_X86_64
                 case OT_QUAD:
-                    tcg_gen_mov_tl(cpu_T[1], cpu_regs[R_EDX]);
-                    tcg_gen_mul_tl(cpu_regs[s->vex_v], cpu_T[0], cpu_T[1]);
-                    gen_helper_umulh(cpu_regs[reg], cpu_T[0], cpu_T[1]);
+                    tcg_gen_mulu2_i64(cpu_regs[s->vex_v], cpu_regs[reg],
+                                      cpu_T[0], cpu_regs[R_EDX]);
                     break;
 #endif
                 }
@@ -5032,39 +5019,22 @@  static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 break;
             default:
             case OT_LONG:
-#ifdef TARGET_X86_64
-                gen_op_mov_TN_reg(OT_LONG, 1, R_EAX);
-                tcg_gen_ext32u_tl(cpu_T[0], cpu_T[0]);
-                tcg_gen_ext32u_tl(cpu_T[1], cpu_T[1]);
-                tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
-                gen_op_mov_reg_T0(OT_LONG, R_EAX);
-                tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-                tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 32);
-                gen_op_mov_reg_T0(OT_LONG, R_EDX);
-                tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
-#else
-                {
-                    TCGv_i64 t0, t1;
-                    t0 = tcg_temp_new_i64();
-                    t1 = tcg_temp_new_i64();
-                    gen_op_mov_TN_reg(OT_LONG, 1, R_EAX);
-                    tcg_gen_extu_i32_i64(t0, cpu_T[0]);
-                    tcg_gen_extu_i32_i64(t1, cpu_T[1]);
-                    tcg_gen_mul_i64(t0, t0, t1);
-                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
-                    gen_op_mov_reg_T0(OT_LONG, R_EAX);
-                    tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-                    tcg_gen_shri_i64(t0, t0, 32);
-                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
-                    gen_op_mov_reg_T0(OT_LONG, R_EDX);
-                    tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
-                }
-#endif
+                tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_regs[R_EAX]);
+                tcg_gen_mulu2_i32(cpu_tmp2_i32, cpu_tmp3_i32,
+                                  cpu_tmp2_i32, cpu_tmp3_i32);
+                tcg_gen_extu_i32_tl(cpu_regs[R_EAX], cpu_tmp2_i32);
+                tcg_gen_extu_i32_tl(cpu_regs[R_EDX], cpu_tmp3_i32);
+                tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[R_EAX]);
+                tcg_gen_mov_tl(cpu_cc_src, cpu_regs[R_EDX]);
                 set_cc_op(s, CC_OP_MULL);
                 break;
 #ifdef TARGET_X86_64
             case OT_QUAD:
-                gen_helper_mulq_EAX_T0(cpu_env, cpu_T[0]);
+                tcg_gen_mulu2_i64(cpu_regs[R_EAX], cpu_regs[R_EDX],
+                                  cpu_T[0], cpu_regs[R_EAX]);
+                tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[R_EAX]);
+                tcg_gen_mov_tl(cpu_cc_src, cpu_regs[R_EDX]);
                 set_cc_op(s, CC_OP_MULQ);
                 break;
 #endif
@@ -5100,41 +5070,25 @@  static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 break;
             default:
             case OT_LONG:
-#ifdef TARGET_X86_64
-                gen_op_mov_TN_reg(OT_LONG, 1, R_EAX);
-                tcg_gen_ext32s_tl(cpu_T[0], cpu_T[0]);
-                tcg_gen_ext32s_tl(cpu_T[1], cpu_T[1]);
-                tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
-                gen_op_mov_reg_T0(OT_LONG, R_EAX);
-                tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-                tcg_gen_ext32s_tl(cpu_tmp0, cpu_T[0]);
-                tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
-                tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 32);
-                gen_op_mov_reg_T0(OT_LONG, R_EDX);
-#else
-                {
-                    TCGv_i64 t0, t1;
-                    t0 = tcg_temp_new_i64();
-                    t1 = tcg_temp_new_i64();
-                    gen_op_mov_TN_reg(OT_LONG, 1, R_EAX);
-                    tcg_gen_ext_i32_i64(t0, cpu_T[0]);
-                    tcg_gen_ext_i32_i64(t1, cpu_T[1]);
-                    tcg_gen_mul_i64(t0, t0, t1);
-                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
-                    gen_op_mov_reg_T0(OT_LONG, R_EAX);
-                    tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-                    tcg_gen_sari_tl(cpu_tmp0, cpu_T[0], 31);
-                    tcg_gen_shri_i64(t0, t0, 32);
-                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
-                    gen_op_mov_reg_T0(OT_LONG, R_EDX);
-                    tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
-                }
-#endif
+                tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_regs[R_EAX]);
+                tcg_gen_muls2_i32(cpu_tmp2_i32, cpu_tmp3_i32,
+                                  cpu_tmp2_i32, cpu_tmp3_i32);
+                tcg_gen_extu_i32_tl(cpu_regs[R_EAX], cpu_tmp2_i32);
+                tcg_gen_extu_i32_tl(cpu_regs[R_EDX], cpu_tmp3_i32);
+                tcg_gen_sari_i32(cpu_tmp2_i32, cpu_tmp2_i32, 31);
+                tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[R_EAX]);
+                tcg_gen_sub_i32(cpu_tmp2_i32, cpu_tmp2_i32, cpu_tmp3_i32);
+                tcg_gen_extu_i32_tl(cpu_cc_src, cpu_tmp2_i32);
                 set_cc_op(s, CC_OP_MULL);
                 break;
 #ifdef TARGET_X86_64
             case OT_QUAD:
-                gen_helper_imulq_EAX_T0(cpu_env, cpu_T[0]);
+                tcg_gen_muls2_i64(cpu_regs[R_EAX], cpu_regs[R_EDX],
+                                  cpu_T[0], cpu_regs[R_EAX]);
+                tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[R_EAX]);
+                tcg_gen_sari_tl(cpu_cc_src, cpu_regs[R_EAX], 63);
+                tcg_gen_sub_tl(cpu_cc_src, cpu_cc_src, cpu_regs[R_EDX]);
                 set_cc_op(s, CC_OP_MULQ);
                 break;
 #endif
@@ -5389,37 +5343,27 @@  static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         } else {
             gen_op_mov_TN_reg(ot, 1, reg);
         }
-
-#ifdef TARGET_X86_64
-        if (ot == OT_QUAD) {
-            gen_helper_imulq_T0_T1(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]);
-        } else
-#endif
-        if (ot == OT_LONG) {
+        switch (ot) {
 #ifdef TARGET_X86_64
-                tcg_gen_ext32s_tl(cpu_T[0], cpu_T[0]);
-                tcg_gen_ext32s_tl(cpu_T[1], cpu_T[1]);
-                tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
-                tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-                tcg_gen_ext32s_tl(cpu_tmp0, cpu_T[0]);
-                tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
-#else
-                {
-                    TCGv_i64 t0, t1;
-                    t0 = tcg_temp_new_i64();
-                    t1 = tcg_temp_new_i64();
-                    tcg_gen_ext_i32_i64(t0, cpu_T[0]);
-                    tcg_gen_ext_i32_i64(t1, cpu_T[1]);
-                    tcg_gen_mul_i64(t0, t0, t1);
-                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
-                    tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-                    tcg_gen_sari_tl(cpu_tmp0, cpu_T[0], 31);
-                    tcg_gen_shri_i64(t0, t0, 32);
-                    tcg_gen_trunc_i64_i32(cpu_T[1], t0);
-                    tcg_gen_sub_tl(cpu_cc_src, cpu_T[1], cpu_tmp0);
-                }
+        case OT_QUAD:
+            tcg_gen_muls2_i64(cpu_regs[reg], cpu_T[1], cpu_T[0], cpu_T[1]);
+            tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[reg]);
+            tcg_gen_sari_tl(cpu_cc_src, cpu_cc_dst, 63);
+            tcg_gen_sub_tl(cpu_cc_src, cpu_cc_src, cpu_T[1]);
+            break;
 #endif
-        } else {
+        case OT_LONG:
+            tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+            tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_T[1]);
+            tcg_gen_muls2_i32(cpu_tmp2_i32, cpu_tmp3_i32,
+                              cpu_tmp2_i32, cpu_tmp3_i32);
+            tcg_gen_extu_i32_tl(cpu_regs[reg], cpu_tmp2_i32);
+            tcg_gen_sari_i32(cpu_tmp2_i32, cpu_tmp2_i32, 31);
+            tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[reg]);
+            tcg_gen_sub_i32(cpu_tmp2_i32, cpu_tmp2_i32, cpu_tmp3_i32);
+            tcg_gen_extu_i32_tl(cpu_cc_src, cpu_tmp2_i32);
+            break;
+        default:
             tcg_gen_ext16s_tl(cpu_T[0], cpu_T[0]);
             tcg_gen_ext16s_tl(cpu_T[1], cpu_T[1]);
             /* XXX: use 32 bit mul which could be faster */
@@ -5427,8 +5371,9 @@  static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
             tcg_gen_ext16s_tl(cpu_tmp0, cpu_T[0]);
             tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
+            gen_op_mov_reg_T0(ot, reg);
+            break;
         }
-        gen_op_mov_reg_T0(ot, reg);
         set_cc_op(s, CC_OP_MULB + ot);
         break;
     case 0x1c0: