Patchwork [3/3] target-sparc: Inline some generation of carry for ADDX/SUBX.

login
register
mail settings
Submitter Richard Henderson
Date May 10, 2010, 10:23 p.m.
Message ID <aabfc3eb39190fad8837e3bf1abd5e0f09eda5d4.1273529974.git.rth@twiddle.net>
Download mbox | patch
Permalink /patch/52243/
State New
Headers show

Comments

Richard Henderson - May 10, 2010, 10:23 p.m.
Computing carry is trivial for some inputs.  By avoiding an
external function call, we generate near-optimal code for
the common cases of add+addx (double-word arithmetic) and
cmp+addx (a setcc pattern).

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 target-sparc/helper.h    |    2 +-
 target-sparc/op_helper.c |    2 +-
 target-sparc/translate.c |  268 +++++++++++++++++++++++++++++++++-------------
 3 files changed, 196 insertions(+), 76 deletions(-)
Blue Swirl - May 11, 2010, 7:28 p.m.
On 5/11/10, Richard Henderson <rth@twiddle.net> wrote:
> Computing carry is trivial for some inputs.  By avoiding an
>  external function call, we generate near-optimal code for
>  the common cases of add+addx (double-word arithmetic) and
>  cmp+addx (a setcc pattern).
>
>  Signed-off-by: Richard Henderson <rth@twiddle.net>
>  ---
>   target-sparc/helper.h    |    2 +-
>   target-sparc/op_helper.c |    2 +-
>   target-sparc/translate.c |  268 +++++++++++++++++++++++++++++++++-------------
>   3 files changed, 196 insertions(+), 76 deletions(-)
>
>  diff --git a/target-sparc/helper.h b/target-sparc/helper.h
>  index 04c1306..6f103e7 100644
>  --- a/target-sparc/helper.h
>  +++ b/target-sparc/helper.h
>  @@ -158,6 +158,6 @@ VIS_CMPHELPER(cmpne);
>   #undef VIS_HELPER
>   #undef VIS_CMPHELPER
>   DEF_HELPER_0(compute_psr, void);
>  -DEF_HELPER_0(compute_C_icc, tl);
>  +DEF_HELPER_0(compute_C_icc, i32);
>
>   #include "def-helper.h"
>  diff --git a/target-sparc/op_helper.c b/target-sparc/op_helper.c
>  index c36bc54..3d6177b 100644
>  --- a/target-sparc/op_helper.c
>  +++ b/target-sparc/op_helper.c
>  @@ -1314,7 +1314,7 @@ void helper_compute_psr(void)
>      CC_OP = CC_OP_FLAGS;
>   }
>
>  -target_ulong helper_compute_C_icc(void)
>  +uint32_t helper_compute_C_icc(void)
>   {
>      uint32_t ret;
>
>  diff --git a/target-sparc/translate.c b/target-sparc/translate.c
>  index ea7c71b..06f0f34 100644
>  --- a/target-sparc/translate.c
>  +++ b/target-sparc/translate.c
>  @@ -332,24 +332,130 @@ static inline void gen_op_add_cc(TCGv dst, TCGv src1, TCGv src2)
>      tcg_gen_mov_tl(dst, cpu_cc_dst);
>   }
>
>  -static inline void gen_op_addxi_cc(TCGv dst, TCGv src1, target_long src2)
>  +static TCGv_i32 gen_add32_carry32(void)
>   {
>  -    gen_helper_compute_C_icc(cpu_tmp0);
>  -    tcg_gen_mov_tl(cpu_cc_src, src1);
>  -    tcg_gen_movi_tl(cpu_cc_src2, src2);
>  -    tcg_gen_add_tl(cpu_cc_dst, cpu_cc_src, cpu_tmp0);
>  -    tcg_gen_addi_tl(cpu_cc_dst, cpu_cc_dst, src2);
>  -    tcg_gen_mov_tl(dst, cpu_cc_dst);
>  +    TCGv_i32 carry_32, cc_src1_32, cc_src2_32;
>  +
>  +    /* Carry is computed from a previous add: (dst < src)  */
>  +#if TARGET_LONG_BITS == 64
>  +    cc_src1_32 = tcg_temp_new_i32();
>  +    cc_src2_32 = tcg_temp_new_i32();
>  +    tcg_gen_trunc_i64_i32(cc_src1_32, cpu_cc_dst);
>  +    tcg_gen_trunc_i64_i32(cc_src2_32, cpu_cc_src);
>  +#else
>  +    cc_src1_32 = cpu_cc_dst;
>  +    cc_src2_32 = cpu_cc_src;
>  +#endif
>  +
>  +    carry_32 = tcg_temp_new_i32();
>  +    tcg_gen_setcond_i32(TCG_COND_LTU, carry_32, cc_src1_32, cc_src2_32);
>  +
>  +#if TARGET_LONG_BITS == 64
>  +    tcg_temp_free_i32(cc_src1_32);
>  +    tcg_temp_free_i32(cc_src2_32);
>  +#endif
>  +
>  +    return carry_32;
>   }
>
>  -static inline void gen_op_addx_cc(TCGv dst, TCGv src1, TCGv src2)
>  +static TCGv_i32 gen_sub32_carry32(void)
>   {
>  -    gen_helper_compute_C_icc(cpu_tmp0);
>  -    tcg_gen_mov_tl(cpu_cc_src, src1);
>  -    tcg_gen_mov_tl(cpu_cc_src2, src2);
>  -    tcg_gen_add_tl(cpu_cc_dst, cpu_cc_src, cpu_tmp0);
>  -    tcg_gen_add_tl(cpu_cc_dst, cpu_cc_dst, cpu_cc_src2);
>  -    tcg_gen_mov_tl(dst, cpu_cc_dst);
>  +    TCGv_i32 carry_32, cc_src1_32, cc_src2_32;
>  +
>  +    /* Carry is computed from a previous borrow: (src1 < src2)  */
>  +#if TARGET_LONG_BITS == 64
>  +    cc_src1_32 = tcg_temp_new_i32();
>  +    cc_src2_32 = tcg_temp_new_i32();
>  +    tcg_gen_trunc_i64_i32(cc_src1_32, cpu_cc_src);
>  +    tcg_gen_trunc_i64_i32(cc_src2_32, cpu_cc_src2);
>  +#else
>  +    cc_src1_32 = cpu_cc_src;
>  +    cc_src2_32 = cpu_cc_src2;
>  +#endif
>  +
>  +    carry_32 = tcg_temp_new_i32();
>  +    tcg_gen_setcond_i32(TCG_COND_LTU, carry_32, cc_src1_32, cc_src2_32);
>  +
>  +#if TARGET_LONG_BITS == 64
>  +    tcg_temp_free_i32(cc_src1_32);
>  +    tcg_temp_free_i32(cc_src2_32);
>  +#endif
>  +
>  +    return carry_32;
>  +}
>  +
>  +static void gen_op_addx_int(DisasContext *dc, TCGv dst, TCGv src1,
>  +                            TCGv src2, int update_cc)
>  +{
>  +    TCGv_i32 carry_32;
>  +    TCGv carry;
>  +
>  +    switch (dc->cc_op) {
>  +    case CC_OP_DIV:
>  +    case CC_OP_LOGIC:
>  +        /* Carry is known to be zero.  Fall back to plain ADD.  */
>  +        if (update_cc) {
>  +            gen_op_add_cc(dst, src1, src2);
>  +        } else {
>  +            tcg_gen_add_tl(dst, src1, src2);
>  +        }
>  +        return;
>  +
>  +    case CC_OP_ADD:
>  +    case CC_OP_TADD:
>  +    case CC_OP_TADDTV:
>  +#if TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 32
>  +        {
>  +            /* For 32-bit hosts, we can re-use the host's hardware carry
>  +               generation by using an ADD2 opcode.  We discard the low
>  +               part of the output.  Ideally we'd combine this operation
>  +               with the add that generated the carry in the first place.  */
>  +            TCGv dst_low = tcg_temp_new();
>  +            tcg_gen_op6_i32(INDEX_op_add2_i32, dst_low, dst,
>  +                            cpu_cc_src, src1, cpu_cc_src2, src2);

Awesome idea!

>  +            tcg_temp_free(dst_low);
>  +            goto add_done;
>  +        }
>  +#endif
>  +        carry_32 = gen_add32_carry32();
>  +        break;
>  +
>  +    case CC_OP_SUB:
>  +    case CC_OP_TSUB:
>  +    case CC_OP_TSUBTV:
>  +        carry_32 = gen_sub32_carry32();
>  +        break;
>  +
>  +    default:
>  +        /* We need external help to produce the carry.  */
>  +        carry_32 = tcg_temp_new_i32();
>  +        gen_helper_compute_C_icc(carry_32);
>  +        break;
>  +    }
>  +
>  +#if TARGET_LONG_BITS == 64
>  +    carry = tcg_temp_new();
>  +    tcg_gen_extu_i32_i64(carry, carry_32);
>  +#else
>  +    carry = carry_32;
>  +#endif
>  +
>  +    tcg_gen_add_tl(dst, src1, src2);
>  +    tcg_gen_add_tl(dst, dst, carry);
>  +
>  +    tcg_temp_free_i32(carry_32);
>  +#if TARGET_LONG_BITS == 64
>  +    tcg_temp_free(carry);
>  +#endif
>  +
>  +#if TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 32
>  + add_done:
>  +#endif
>  +    if (update_cc) {
>  +        tcg_gen_mov_tl(cpu_cc_src, src1);
>  +        tcg_gen_mov_tl(cpu_cc_src2, src2);
>  +        tcg_gen_mov_tl(cpu_cc_dst, dst);
>  +    }
>   }
>
>   static inline void gen_op_tadd_cc(TCGv dst, TCGv src1, TCGv src2)
>  @@ -415,24 +521,78 @@ static inline void gen_op_sub_cc(TCGv dst, TCGv src1, TCGv src2)
>      tcg_gen_mov_tl(dst, cpu_cc_dst);
>   }
>
>  -static inline void gen_op_subxi_cc(TCGv dst, TCGv src1, target_long src2)
>  +static void gen_op_subx_int(DisasContext *dc, TCGv dst, TCGv src1,
>  +                            TCGv src2, int update_cc)
>   {
>  -    gen_helper_compute_C_icc(cpu_tmp0);
>  -    tcg_gen_mov_tl(cpu_cc_src, src1);
>  -    tcg_gen_movi_tl(cpu_cc_src2, src2);
>  -    tcg_gen_sub_tl(cpu_cc_dst, cpu_cc_src, cpu_tmp0);
>  -    tcg_gen_subi_tl(cpu_cc_dst, cpu_cc_dst, src2);
>  -    tcg_gen_mov_tl(dst, cpu_cc_dst);
>  -}
>  +    TCGv_i32 carry_32;
>  +    TCGv carry;
>
>  -static inline void gen_op_subx_cc(TCGv dst, TCGv src1, TCGv src2)
>  -{
>  -    gen_helper_compute_C_icc(cpu_tmp0);
>  -    tcg_gen_mov_tl(cpu_cc_src, src1);
>  -    tcg_gen_mov_tl(cpu_cc_src2, src2);
>  -    tcg_gen_sub_tl(cpu_cc_dst, cpu_cc_src, cpu_tmp0);
>  -    tcg_gen_sub_tl(cpu_cc_dst, cpu_cc_dst, cpu_cc_src2);
>  -    tcg_gen_mov_tl(dst, cpu_cc_dst);
>  +    switch (dc->cc_op) {
>  +    case CC_OP_DIV:
>  +    case CC_OP_LOGIC:
>  +        /* Carry is known to be zero.  Fall back to plain SUB.  */
>  +        if (update_cc) {
>  +            gen_op_sub_cc(dst, src1, src2);
>  +        } else {
>  +            tcg_gen_sub_tl(dst, src1, src2);
>  +        }
>  +        return;
>  +
>  +    case CC_OP_ADD:
>  +    case CC_OP_TADD:
>  +    case CC_OP_TADDTV:
>  +        carry_32 = gen_add32_carry32();
>  +        break;
>  +
>  +    case CC_OP_SUB:
>  +    case CC_OP_TSUB:
>  +    case CC_OP_TSUBTV:
>  +#if TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 32
>  +        {
>  +            /* For 32-bit hosts, we can re-use the host's hardware borrow
>  +               generation by using a SUB2 opcode.  We discard the low
>  +               part of the output.  Ideally we'd combine this operation
>  +               with the subtraction that generated the borrow.  */
>  +            TCGv dst_low = tcg_temp_new();
>  +            tcg_gen_op6_i32(INDEX_op_sub2_i32, dst_low, dst,
>  +                            cpu_cc_src, src1, cpu_cc_src2, src2);
>  +            tcg_temp_free(dst_low);
>  +            goto sub_done;
>  +        }
>  +#endif
>  +        carry_32 = gen_sub32_carry32();
>  +        break;
>  +
>  +    default:
>  +        /* We need external help to produce the carry.  */
>  +        carry_32 = tcg_temp_new_i32();
>  +        gen_helper_compute_C_icc(carry_32);
>  +        break;
>  +    }
>  +
>  +#if TARGET_LONG_BITS == 64
>  +    carry = tcg_temp_new();
>  +    tcg_gen_extu_i32_i64(carry, carry_32);
>  +#else
>  +    carry = carry_32;
>  +#endif
>  +
>  +    tcg_gen_sub_tl(dst, src1, src2);
>  +    tcg_gen_sub_tl(dst, dst, carry);
>  +
>  +    tcg_temp_free_i32(carry_32);
>  +#if TARGET_LONG_BITS == 64
>  +    tcg_temp_free(carry);
>  +#endif
>  +
>  +#if TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 32
>  + sub_done:
>  +#endif
>  +    if (update_cc) {
>  +        tcg_gen_mov_tl(cpu_cc_src, src1);
>  +        tcg_gen_mov_tl(cpu_cc_src2, src2);
>  +        tcg_gen_mov_tl(cpu_cc_dst, dst);
>  +    }
>   }
>
>   static inline void gen_op_tsub_cc(TCGv dst, TCGv src1, TCGv src2)
>  @@ -2950,28 +3110,8 @@ static void disas_sparc_insn(DisasContext * dc)
>                          }
>                          break;
>                      case 0x8: /* addx, V9 addc */
>  -                        if (IS_IMM) {
>  -                            simm = GET_FIELDs(insn, 19, 31);
>  -                            if (xop & 0x10) {
>  -                                gen_op_addxi_cc(cpu_dst, cpu_src1, simm);
>  -                                tcg_gen_movi_i32(cpu_cc_op, CC_OP_ADDX);
>  -                                dc->cc_op = CC_OP_ADDX;

The new code doesn't update dc->cc_op — shouldn't that happen if the
condition codes are changed? For example, 'addx' in the sequence
'addcc; addxcc; addx;' needs the C flag from the second addxcc,
not from the first addcc.

If we don't need to update, this was the only use of CC_OP_ADDX, do we
need that anymore?

>  -                            } else {
>  -                                gen_helper_compute_C_icc(cpu_tmp0);
>  -                                tcg_gen_addi_tl(cpu_tmp0, cpu_tmp0, simm);
>  -                                tcg_gen_add_tl(cpu_dst, cpu_src1, cpu_tmp0);
>  -                            }
>  -                        } else {
>  -                            if (xop & 0x10) {
>  -                                gen_op_addx_cc(cpu_dst, cpu_src1, cpu_src2);
>  -                                tcg_gen_movi_i32(cpu_cc_op, CC_OP_ADDX);
>  -                                dc->cc_op = CC_OP_ADDX;
>  -                            } else {
>  -                                gen_helper_compute_C_icc(cpu_tmp0);
>  -                                tcg_gen_add_tl(cpu_tmp0, cpu_src2, cpu_tmp0);
>  -                                tcg_gen_add_tl(cpu_dst, cpu_src1, cpu_tmp0);
>  -                            }
>  -                        }
>  +                        gen_op_addx_int(dc, cpu_dst, cpu_src1, cpu_src2,
>  +                                        (xop & 0x10));
>                          break;
>   #ifdef TARGET_SPARC64
>                      case 0x9: /* V9 mulx */
>  @@ -3002,28 +3142,8 @@ static void disas_sparc_insn(DisasContext * dc)
>                          }
>                          break;
>                      case 0xc: /* subx, V9 subc */
>  -                        if (IS_IMM) {
>  -                            simm = GET_FIELDs(insn, 19, 31);
>  -                            if (xop & 0x10) {
>  -                                gen_op_subxi_cc(cpu_dst, cpu_src1, simm);
>  -                                tcg_gen_movi_i32(cpu_cc_op, CC_OP_SUBX);
>  -                                dc->cc_op = CC_OP_SUBX;
>  -                            } else {
>  -                                gen_helper_compute_C_icc(cpu_tmp0);
>  -                                tcg_gen_addi_tl(cpu_tmp0, cpu_tmp0, simm);
>  -                                tcg_gen_sub_tl(cpu_dst, cpu_src1, cpu_tmp0);
>  -                            }
>  -                        } else {
>  -                            if (xop & 0x10) {
>  -                                gen_op_subx_cc(cpu_dst, cpu_src1, cpu_src2);
>  -                                tcg_gen_movi_i32(cpu_cc_op, CC_OP_SUBX);
>  -                                dc->cc_op = CC_OP_SUBX;
>  -                            } else {
>  -                                gen_helper_compute_C_icc(cpu_tmp0);
>  -                                tcg_gen_add_tl(cpu_tmp0, cpu_src2, cpu_tmp0);
>  -                                tcg_gen_sub_tl(cpu_dst, cpu_src1, cpu_tmp0);
>  -                            }
>  -                        }
>  +                        gen_op_subx_int(dc, cpu_dst, cpu_src1, cpu_src2,
>  +                                        (xop & 0x10));
>                          break;
>   #ifdef TARGET_SPARC64
>                      case 0xd: /* V9 udivx */
>
> --
>  1.7.0.1
>
>
Richard Henderson - May 12, 2010, 2:48 p.m.
> The new code doesn't update dc->cc_op, shouldn't that happen if the
> condition codes are changed? For example 'addx' in the sequence
> 'addcc; addxcc; addx;' should need the C flag from the second addxcc,
> not from first addcc.

Oops, yes, that needs updating too.  Will fix.


r~

Patch

diff --git a/target-sparc/helper.h b/target-sparc/helper.h
index 04c1306..6f103e7 100644
--- a/target-sparc/helper.h
+++ b/target-sparc/helper.h
@@ -158,6 +158,6 @@  VIS_CMPHELPER(cmpne);
 #undef VIS_HELPER
 #undef VIS_CMPHELPER
 DEF_HELPER_0(compute_psr, void);
-DEF_HELPER_0(compute_C_icc, tl);
+DEF_HELPER_0(compute_C_icc, i32);
 
 #include "def-helper.h"
diff --git a/target-sparc/op_helper.c b/target-sparc/op_helper.c
index c36bc54..3d6177b 100644
--- a/target-sparc/op_helper.c
+++ b/target-sparc/op_helper.c
@@ -1314,7 +1314,7 @@  void helper_compute_psr(void)
     CC_OP = CC_OP_FLAGS;
 }
 
-target_ulong helper_compute_C_icc(void)
+uint32_t helper_compute_C_icc(void)
 {
     uint32_t ret;
 
diff --git a/target-sparc/translate.c b/target-sparc/translate.c
index ea7c71b..06f0f34 100644
--- a/target-sparc/translate.c
+++ b/target-sparc/translate.c
@@ -332,24 +332,130 @@  static inline void gen_op_add_cc(TCGv dst, TCGv src1, TCGv src2)
     tcg_gen_mov_tl(dst, cpu_cc_dst);
 }
 
-static inline void gen_op_addxi_cc(TCGv dst, TCGv src1, target_long src2)
+static TCGv_i32 gen_add32_carry32(void)
 {
-    gen_helper_compute_C_icc(cpu_tmp0);
-    tcg_gen_mov_tl(cpu_cc_src, src1);
-    tcg_gen_movi_tl(cpu_cc_src2, src2);
-    tcg_gen_add_tl(cpu_cc_dst, cpu_cc_src, cpu_tmp0);
-    tcg_gen_addi_tl(cpu_cc_dst, cpu_cc_dst, src2);
-    tcg_gen_mov_tl(dst, cpu_cc_dst);
+    TCGv_i32 carry_32, cc_src1_32, cc_src2_32;
+
+    /* Carry is computed from a previous add: (dst < src)  */
+#if TARGET_LONG_BITS == 64
+    cc_src1_32 = tcg_temp_new_i32();
+    cc_src2_32 = tcg_temp_new_i32();
+    tcg_gen_trunc_i64_i32(cc_src1_32, cpu_cc_dst);
+    tcg_gen_trunc_i64_i32(cc_src2_32, cpu_cc_src);
+#else
+    cc_src1_32 = cpu_cc_dst;
+    cc_src2_32 = cpu_cc_src;
+#endif
+
+    carry_32 = tcg_temp_new_i32();
+    tcg_gen_setcond_i32(TCG_COND_LTU, carry_32, cc_src1_32, cc_src2_32);
+
+#if TARGET_LONG_BITS == 64
+    tcg_temp_free_i32(cc_src1_32);
+    tcg_temp_free_i32(cc_src2_32);
+#endif
+
+    return carry_32;
 }
 
-static inline void gen_op_addx_cc(TCGv dst, TCGv src1, TCGv src2)
+static TCGv_i32 gen_sub32_carry32(void)
 {
-    gen_helper_compute_C_icc(cpu_tmp0);
-    tcg_gen_mov_tl(cpu_cc_src, src1);
-    tcg_gen_mov_tl(cpu_cc_src2, src2);
-    tcg_gen_add_tl(cpu_cc_dst, cpu_cc_src, cpu_tmp0);
-    tcg_gen_add_tl(cpu_cc_dst, cpu_cc_dst, cpu_cc_src2);
-    tcg_gen_mov_tl(dst, cpu_cc_dst);
+    TCGv_i32 carry_32, cc_src1_32, cc_src2_32;
+
+    /* Carry is computed from a previous borrow: (src1 < src2)  */
+#if TARGET_LONG_BITS == 64
+    cc_src1_32 = tcg_temp_new_i32();
+    cc_src2_32 = tcg_temp_new_i32();
+    tcg_gen_trunc_i64_i32(cc_src1_32, cpu_cc_src);
+    tcg_gen_trunc_i64_i32(cc_src2_32, cpu_cc_src2);
+#else
+    cc_src1_32 = cpu_cc_src;
+    cc_src2_32 = cpu_cc_src2;
+#endif
+
+    carry_32 = tcg_temp_new_i32();
+    tcg_gen_setcond_i32(TCG_COND_LTU, carry_32, cc_src1_32, cc_src2_32);
+
+#if TARGET_LONG_BITS == 64
+    tcg_temp_free_i32(cc_src1_32);
+    tcg_temp_free_i32(cc_src2_32);
+#endif
+
+    return carry_32;
+}
+
+static void gen_op_addx_int(DisasContext *dc, TCGv dst, TCGv src1,
+                            TCGv src2, int update_cc)
+{
+    TCGv_i32 carry_32;
+    TCGv carry;
+
+    switch (dc->cc_op) {
+    case CC_OP_DIV:
+    case CC_OP_LOGIC:
+        /* Carry is known to be zero.  Fall back to plain ADD.  */
+        if (update_cc) {
+            gen_op_add_cc(dst, src1, src2);
+        } else {
+            tcg_gen_add_tl(dst, src1, src2);
+        }
+        return;
+
+    case CC_OP_ADD:
+    case CC_OP_TADD:
+    case CC_OP_TADDTV:
+#if TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 32
+        {
+            /* For 32-bit hosts, we can re-use the host's hardware carry
+               generation by using an ADD2 opcode.  We discard the low
+               part of the output.  Ideally we'd combine this operation
+               with the add that generated the carry in the first place.  */
+            TCGv dst_low = tcg_temp_new();
+            tcg_gen_op6_i32(INDEX_op_add2_i32, dst_low, dst, 
+                            cpu_cc_src, src1, cpu_cc_src2, src2);
+            tcg_temp_free(dst_low);
+            goto add_done;
+        }
+#endif
+        carry_32 = gen_add32_carry32();
+        break;
+
+    case CC_OP_SUB:
+    case CC_OP_TSUB:
+    case CC_OP_TSUBTV:
+        carry_32 = gen_sub32_carry32();
+        break;
+
+    default:
+        /* We need external help to produce the carry.  */
+        carry_32 = tcg_temp_new_i32();
+        gen_helper_compute_C_icc(carry_32);
+        break;
+    }
+
+#if TARGET_LONG_BITS == 64
+    carry = tcg_temp_new();
+    tcg_gen_extu_i32_i64(carry, carry_32);
+#else
+    carry = carry_32;
+#endif
+
+    tcg_gen_add_tl(dst, src1, src2);
+    tcg_gen_add_tl(dst, dst, carry);
+
+    tcg_temp_free_i32(carry_32);
+#if TARGET_LONG_BITS == 64
+    tcg_temp_free(carry);
+#endif
+
+#if TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 32
+ add_done:
+#endif
+    if (update_cc) {
+        tcg_gen_mov_tl(cpu_cc_src, src1);
+        tcg_gen_mov_tl(cpu_cc_src2, src2);
+        tcg_gen_mov_tl(cpu_cc_dst, dst);
+    }
 }
 
 static inline void gen_op_tadd_cc(TCGv dst, TCGv src1, TCGv src2)
@@ -415,24 +521,78 @@  static inline void gen_op_sub_cc(TCGv dst, TCGv src1, TCGv src2)
     tcg_gen_mov_tl(dst, cpu_cc_dst);
 }
 
-static inline void gen_op_subxi_cc(TCGv dst, TCGv src1, target_long src2)
+static void gen_op_subx_int(DisasContext *dc, TCGv dst, TCGv src1,
+                            TCGv src2, int update_cc)
 {
-    gen_helper_compute_C_icc(cpu_tmp0);
-    tcg_gen_mov_tl(cpu_cc_src, src1);
-    tcg_gen_movi_tl(cpu_cc_src2, src2);
-    tcg_gen_sub_tl(cpu_cc_dst, cpu_cc_src, cpu_tmp0);
-    tcg_gen_subi_tl(cpu_cc_dst, cpu_cc_dst, src2);
-    tcg_gen_mov_tl(dst, cpu_cc_dst);
-}
+    TCGv_i32 carry_32;
+    TCGv carry;
 
-static inline void gen_op_subx_cc(TCGv dst, TCGv src1, TCGv src2)
-{
-    gen_helper_compute_C_icc(cpu_tmp0);
-    tcg_gen_mov_tl(cpu_cc_src, src1);
-    tcg_gen_mov_tl(cpu_cc_src2, src2);
-    tcg_gen_sub_tl(cpu_cc_dst, cpu_cc_src, cpu_tmp0);
-    tcg_gen_sub_tl(cpu_cc_dst, cpu_cc_dst, cpu_cc_src2);
-    tcg_gen_mov_tl(dst, cpu_cc_dst);
+    switch (dc->cc_op) {
+    case CC_OP_DIV:
+    case CC_OP_LOGIC:
+        /* Carry is known to be zero.  Fall back to plain SUB.  */
+        if (update_cc) {
+            gen_op_sub_cc(dst, src1, src2);
+        } else {
+            tcg_gen_sub_tl(dst, src1, src2);
+        }
+        return;
+
+    case CC_OP_ADD:
+    case CC_OP_TADD:
+    case CC_OP_TADDTV:
+        carry_32 = gen_add32_carry32();
+        break;
+
+    case CC_OP_SUB:
+    case CC_OP_TSUB:
+    case CC_OP_TSUBTV:
+#if TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 32
+        {
+            /* For 32-bit hosts, we can re-use the host's hardware borrow
+               generation by using a SUB2 opcode.  We discard the low
+               part of the output.  Ideally we'd combine this operation
+               with the subtraction that generated the borrow.  */
+            TCGv dst_low = tcg_temp_new();
+            tcg_gen_op6_i32(INDEX_op_sub2_i32, dst_low, dst, 
+                            cpu_cc_src, src1, cpu_cc_src2, src2);
+            tcg_temp_free(dst_low);
+            goto sub_done;
+        }
+#endif
+        carry_32 = gen_sub32_carry32();
+        break;
+
+    default:
+        /* We need external help to produce the carry.  */
+        carry_32 = tcg_temp_new_i32();
+        gen_helper_compute_C_icc(carry_32);
+        break;
+    }
+
+#if TARGET_LONG_BITS == 64
+    carry = tcg_temp_new();
+    tcg_gen_extu_i32_i64(carry, carry_32);
+#else
+    carry = carry_32;
+#endif
+
+    tcg_gen_sub_tl(dst, src1, src2);
+    tcg_gen_sub_tl(dst, dst, carry);
+
+    tcg_temp_free_i32(carry_32);
+#if TARGET_LONG_BITS == 64
+    tcg_temp_free(carry);
+#endif
+
+#if TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 32
+ sub_done:
+#endif
+    if (update_cc) {
+        tcg_gen_mov_tl(cpu_cc_src, src1);
+        tcg_gen_mov_tl(cpu_cc_src2, src2);
+        tcg_gen_mov_tl(cpu_cc_dst, dst);
+    }
 }
 
 static inline void gen_op_tsub_cc(TCGv dst, TCGv src1, TCGv src2)
@@ -2950,28 +3110,8 @@  static void disas_sparc_insn(DisasContext * dc)
                         }
                         break;
                     case 0x8: /* addx, V9 addc */
-                        if (IS_IMM) {
-                            simm = GET_FIELDs(insn, 19, 31);
-                            if (xop & 0x10) {
-                                gen_op_addxi_cc(cpu_dst, cpu_src1, simm);
-                                tcg_gen_movi_i32(cpu_cc_op, CC_OP_ADDX);
-                                dc->cc_op = CC_OP_ADDX;
-                            } else {
-                                gen_helper_compute_C_icc(cpu_tmp0);
-                                tcg_gen_addi_tl(cpu_tmp0, cpu_tmp0, simm);
-                                tcg_gen_add_tl(cpu_dst, cpu_src1, cpu_tmp0);
-                            }
-                        } else {
-                            if (xop & 0x10) {
-                                gen_op_addx_cc(cpu_dst, cpu_src1, cpu_src2);
-                                tcg_gen_movi_i32(cpu_cc_op, CC_OP_ADDX);
-                                dc->cc_op = CC_OP_ADDX;
-                            } else {
-                                gen_helper_compute_C_icc(cpu_tmp0);
-                                tcg_gen_add_tl(cpu_tmp0, cpu_src2, cpu_tmp0);
-                                tcg_gen_add_tl(cpu_dst, cpu_src1, cpu_tmp0);
-                            }
-                        }
+                        gen_op_addx_int(dc, cpu_dst, cpu_src1, cpu_src2,
+                                        (xop & 0x10));
                         break;
 #ifdef TARGET_SPARC64
                     case 0x9: /* V9 mulx */
@@ -3002,28 +3142,8 @@  static void disas_sparc_insn(DisasContext * dc)
                         }
                         break;
                     case 0xc: /* subx, V9 subc */
-                        if (IS_IMM) {
-                            simm = GET_FIELDs(insn, 19, 31);
-                            if (xop & 0x10) {
-                                gen_op_subxi_cc(cpu_dst, cpu_src1, simm);
-                                tcg_gen_movi_i32(cpu_cc_op, CC_OP_SUBX);
-                                dc->cc_op = CC_OP_SUBX;
-                            } else {
-                                gen_helper_compute_C_icc(cpu_tmp0);
-                                tcg_gen_addi_tl(cpu_tmp0, cpu_tmp0, simm);
-                                tcg_gen_sub_tl(cpu_dst, cpu_src1, cpu_tmp0);
-                            }
-                        } else {
-                            if (xop & 0x10) {
-                                gen_op_subx_cc(cpu_dst, cpu_src1, cpu_src2);
-                                tcg_gen_movi_i32(cpu_cc_op, CC_OP_SUBX);
-                                dc->cc_op = CC_OP_SUBX;
-                            } else {
-                                gen_helper_compute_C_icc(cpu_tmp0);
-                                tcg_gen_add_tl(cpu_tmp0, cpu_src2, cpu_tmp0);
-                                tcg_gen_sub_tl(cpu_dst, cpu_src1, cpu_tmp0);
-                            }
-                        }
+                        gen_op_subx_int(dc, cpu_dst, cpu_src1, cpu_src2,
+                                        (xop & 0x10));
                         break;
 #ifdef TARGET_SPARC64
                     case 0xd: /* V9 udivx */