From patchwork Wed May 12 18:04:27 2010 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Richard Henderson X-Patchwork-Id: 52399 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from lists.gnu.org (lists.gnu.org [199.232.76.165]) (using TLSv1 with cipher DHE-RSA-AES256-SHA (256/256 bits)) (Client did not present a certificate) by ozlabs.org (Postfix) with ESMTPS id 53524B7DB3 for ; Thu, 13 May 2010 04:15:00 +1000 (EST) Received: from localhost ([127.0.0.1]:47200 helo=lists.gnu.org) by lists.gnu.org with esmtp (Exim 4.43) id 1OCGSN-0003BO-R2 for incoming@patchwork.ozlabs.org; Wed, 12 May 2010 14:14:55 -0400 Received: from [140.186.70.92] (port=44640 helo=eggs.gnu.org) by lists.gnu.org with esmtp (Exim 4.43) id 1OCGLe-00019N-6f for qemu-devel@nongnu.org; Wed, 12 May 2010 14:07:59 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.69) (envelope-from ) id 1OCGII-0001fx-Nd for qemu-devel@nongnu.org; Wed, 12 May 2010 14:04:38 -0400 Received: from are.twiddle.net ([75.149.56.221]:53205) by eggs.gnu.org with esmtp (Exim 4.69) (envelope-from ) id 1OCGII-0001fU-8N for qemu-devel@nongnu.org; Wed, 12 May 2010 14:04:30 -0400 Received: from anchor.twiddle.home (anchor.twiddle.home [172.31.0.4]) by are.twiddle.net (Postfix) with ESMTPS id 972CEBD7; Wed, 12 May 2010 11:04:29 -0700 (PDT) Received: from anchor.twiddle.home (anchor.twiddle.home [127.0.0.1]) by anchor.twiddle.home (8.14.4/8.14.4) with ESMTP id o4CI4T43023401; Wed, 12 May 2010 11:04:29 -0700 Received: (from rth@localhost) by anchor.twiddle.home (8.14.4/8.14.4/Submit) id o4CI4T8G023400; Wed, 12 May 2010 11:04:29 -0700 From: Richard Henderson To: qemu-devel@nongnu.org Date: Wed, 12 May 2010 11:04:27 -0700 Message-Id: <62d85d2c3864d77023c9d4d0d9ad87866ff19142.1273687255.git.rth@twiddle.net> X-Mailer: git-send-email 1.7.0.1 In-Reply-To: References: In-Reply-To: 
References: X-detected-operating-system: by eggs.gnu.org: GNU/Linux 2.6 (newer, 2) Cc: blauwirbel@gmail.com, atar4qemu@googlemail.com Subject: [Qemu-devel] [PATCH 3/3] target-sparc: Inline some generation of carry for ADDX/SUBX. X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: qemu-devel.nongnu.org List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Sender: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org Errors-To: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org Computing carry is trivial for some inputs. By avoiding an external function call, we generate near-optimal code for the common cases of add+addx (double-word arithmetic) and cmp+addx (a setcc pattern). Signed-off-by: Richard Henderson --- target-sparc/helper.h | 2 +- target-sparc/op_helper.c | 2 +- target-sparc/translate.c | 272 +++++++++++++++++++++++++++++++++------------- 3 files changed, 200 insertions(+), 76 deletions(-) diff --git a/target-sparc/helper.h b/target-sparc/helper.h index 04c1306..6f103e7 100644 --- a/target-sparc/helper.h +++ b/target-sparc/helper.h @@ -158,6 +158,6 @@ VIS_CMPHELPER(cmpne); #undef VIS_HELPER #undef VIS_CMPHELPER DEF_HELPER_0(compute_psr, void); -DEF_HELPER_0(compute_C_icc, tl); +DEF_HELPER_0(compute_C_icc, i32); #include "def-helper.h" diff --git a/target-sparc/op_helper.c b/target-sparc/op_helper.c index 3783b02..125cd67 100644 --- a/target-sparc/op_helper.c +++ b/target-sparc/op_helper.c @@ -1342,7 +1342,7 @@ void helper_compute_psr(void) CC_OP = CC_OP_FLAGS; } -target_ulong helper_compute_C_icc(void) +uint32_t helper_compute_C_icc(void) { uint32_t ret; diff --git a/target-sparc/translate.c b/target-sparc/translate.c index ea7c71b..713d3e1 100644 --- a/target-sparc/translate.c +++ b/target-sparc/translate.c @@ -332,24 +332,132 @@ static inline void gen_op_add_cc(TCGv dst, TCGv src1, TCGv src2) tcg_gen_mov_tl(dst, cpu_cc_dst); } -static inline void gen_op_addxi_cc(TCGv dst, TCGv src1, 
target_long src2) +static TCGv_i32 gen_add32_carry32(void) { - gen_helper_compute_C_icc(cpu_tmp0); - tcg_gen_mov_tl(cpu_cc_src, src1); - tcg_gen_movi_tl(cpu_cc_src2, src2); - tcg_gen_add_tl(cpu_cc_dst, cpu_cc_src, cpu_tmp0); - tcg_gen_addi_tl(cpu_cc_dst, cpu_cc_dst, src2); - tcg_gen_mov_tl(dst, cpu_cc_dst); + TCGv_i32 carry_32, cc_src1_32, cc_src2_32; + + /* Carry is computed from a previous add: (dst < src) */ +#if TARGET_LONG_BITS == 64 + cc_src1_32 = tcg_temp_new_i32(); + cc_src2_32 = tcg_temp_new_i32(); + tcg_gen_trunc_i64_i32(cc_src1_32, cpu_cc_dst); + tcg_gen_trunc_i64_i32(cc_src2_32, cpu_cc_src); +#else + cc_src1_32 = cpu_cc_dst; + cc_src2_32 = cpu_cc_src; +#endif + + carry_32 = tcg_temp_new_i32(); + tcg_gen_setcond_i32(TCG_COND_LTU, carry_32, cc_src1_32, cc_src2_32); + +#if TARGET_LONG_BITS == 64 + tcg_temp_free_i32(cc_src1_32); + tcg_temp_free_i32(cc_src2_32); +#endif + + return carry_32; } -static inline void gen_op_addx_cc(TCGv dst, TCGv src1, TCGv src2) +static TCGv_i32 gen_sub32_carry32(void) { - gen_helper_compute_C_icc(cpu_tmp0); - tcg_gen_mov_tl(cpu_cc_src, src1); - tcg_gen_mov_tl(cpu_cc_src2, src2); - tcg_gen_add_tl(cpu_cc_dst, cpu_cc_src, cpu_tmp0); - tcg_gen_add_tl(cpu_cc_dst, cpu_cc_dst, cpu_cc_src2); - tcg_gen_mov_tl(dst, cpu_cc_dst); + TCGv_i32 carry_32, cc_src1_32, cc_src2_32; + + /* Carry is computed from a previous borrow: (src1 < src2) */ +#if TARGET_LONG_BITS == 64 + cc_src1_32 = tcg_temp_new_i32(); + cc_src2_32 = tcg_temp_new_i32(); + tcg_gen_trunc_i64_i32(cc_src1_32, cpu_cc_src); + tcg_gen_trunc_i64_i32(cc_src2_32, cpu_cc_src2); +#else + cc_src1_32 = cpu_cc_src; + cc_src2_32 = cpu_cc_src2; +#endif + + carry_32 = tcg_temp_new_i32(); + tcg_gen_setcond_i32(TCG_COND_LTU, carry_32, cc_src1_32, cc_src2_32); + +#if TARGET_LONG_BITS == 64 + tcg_temp_free_i32(cc_src1_32); + tcg_temp_free_i32(cc_src2_32); +#endif + + return carry_32; +} + +static void gen_op_addx_int(DisasContext *dc, TCGv dst, TCGv src1, + TCGv src2, int update_cc) +{ + TCGv_i32 
carry_32; + TCGv carry; + + switch (dc->cc_op) { + case CC_OP_DIV: + case CC_OP_LOGIC: + /* Carry is known to be zero. Fall back to plain ADD. */ + if (update_cc) { + gen_op_add_cc(dst, src1, src2); + } else { + tcg_gen_add_tl(dst, src1, src2); + } + return; + + case CC_OP_ADD: + case CC_OP_TADD: + case CC_OP_TADDTV: +#if TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 32 + { + /* For 32-bit hosts, we can re-use the host's hardware carry + generation by using an ADD2 opcode. We discard the low + part of the output. Ideally we'd combine this operation + with the add that generated the carry in the first place. */ + TCGv dst_low = tcg_temp_new(); + tcg_gen_op6_i32(INDEX_op_add2_i32, dst_low, dst, + cpu_cc_src, src1, cpu_cc_src2, src2); + tcg_temp_free(dst_low); + goto add_done; + } +#endif + carry_32 = gen_add32_carry32(); + break; + + case CC_OP_SUB: + case CC_OP_TSUB: + case CC_OP_TSUBTV: + carry_32 = gen_sub32_carry32(); + break; + + default: + /* We need external help to produce the carry. 
*/ + carry_32 = tcg_temp_new_i32(); + gen_helper_compute_C_icc(carry_32); + break; + } + +#if TARGET_LONG_BITS == 64 + carry = tcg_temp_new(); + tcg_gen_extu_i32_i64(carry, carry_32); +#else + carry = carry_32; +#endif + + tcg_gen_add_tl(dst, src1, src2); + tcg_gen_add_tl(dst, dst, carry); + + tcg_temp_free_i32(carry_32); +#if TARGET_LONG_BITS == 64 + tcg_temp_free(carry); +#endif + +#if TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 32 + add_done: +#endif + if (update_cc) { + tcg_gen_mov_tl(cpu_cc_src, src1); + tcg_gen_mov_tl(cpu_cc_src2, src2); + tcg_gen_mov_tl(cpu_cc_dst, dst); + tcg_gen_movi_i32(cpu_cc_op, CC_OP_ADDX); + dc->cc_op = CC_OP_ADDX; + } } static inline void gen_op_tadd_cc(TCGv dst, TCGv src1, TCGv src2) @@ -415,24 +523,80 @@ static inline void gen_op_sub_cc(TCGv dst, TCGv src1, TCGv src2) tcg_gen_mov_tl(dst, cpu_cc_dst); } -static inline void gen_op_subxi_cc(TCGv dst, TCGv src1, target_long src2) +static void gen_op_subx_int(DisasContext *dc, TCGv dst, TCGv src1, + TCGv src2, int update_cc) { - gen_helper_compute_C_icc(cpu_tmp0); - tcg_gen_mov_tl(cpu_cc_src, src1); - tcg_gen_movi_tl(cpu_cc_src2, src2); - tcg_gen_sub_tl(cpu_cc_dst, cpu_cc_src, cpu_tmp0); - tcg_gen_subi_tl(cpu_cc_dst, cpu_cc_dst, src2); - tcg_gen_mov_tl(dst, cpu_cc_dst); -} + TCGv_i32 carry_32; + TCGv carry; -static inline void gen_op_subx_cc(TCGv dst, TCGv src1, TCGv src2) -{ - gen_helper_compute_C_icc(cpu_tmp0); - tcg_gen_mov_tl(cpu_cc_src, src1); - tcg_gen_mov_tl(cpu_cc_src2, src2); - tcg_gen_sub_tl(cpu_cc_dst, cpu_cc_src, cpu_tmp0); - tcg_gen_sub_tl(cpu_cc_dst, cpu_cc_dst, cpu_cc_src2); - tcg_gen_mov_tl(dst, cpu_cc_dst); + switch (dc->cc_op) { + case CC_OP_DIV: + case CC_OP_LOGIC: + /* Carry is known to be zero. Fall back to plain SUB. 
*/ + if (update_cc) { + gen_op_sub_cc(dst, src1, src2); + } else { + tcg_gen_sub_tl(dst, src1, src2); + } + return; + + case CC_OP_ADD: + case CC_OP_TADD: + case CC_OP_TADDTV: + carry_32 = gen_add32_carry32(); + break; + + case CC_OP_SUB: + case CC_OP_TSUB: + case CC_OP_TSUBTV: +#if TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 32 + { + /* For 32-bit hosts, we can re-use the host's hardware carry + generation by using a SUB2 opcode. We discard the low + part of the output. Ideally we'd combine this operation + with the sub that generated the carry in the first place. */ + TCGv dst_low = tcg_temp_new(); + tcg_gen_op6_i32(INDEX_op_sub2_i32, dst_low, dst, + cpu_cc_src, src1, cpu_cc_src2, src2); + tcg_temp_free(dst_low); + goto sub_done; + } +#endif + carry_32 = gen_sub32_carry32(); + break; + + default: + /* We need external help to produce the carry. */ + carry_32 = tcg_temp_new_i32(); + gen_helper_compute_C_icc(carry_32); + break; + } + +#if TARGET_LONG_BITS == 64 + carry = tcg_temp_new(); + tcg_gen_extu_i32_i64(carry, carry_32); +#else + carry = carry_32; +#endif + + tcg_gen_sub_tl(dst, src1, src2); + tcg_gen_sub_tl(dst, dst, carry); + + tcg_temp_free_i32(carry_32); +#if TARGET_LONG_BITS == 64 + tcg_temp_free(carry); +#endif + +#if TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 32 + sub_done: +#endif + if (update_cc) { + tcg_gen_mov_tl(cpu_cc_src, src1); + tcg_gen_mov_tl(cpu_cc_src2, src2); + tcg_gen_mov_tl(cpu_cc_dst, dst); + tcg_gen_movi_i32(cpu_cc_op, CC_OP_SUBX); + dc->cc_op = CC_OP_SUBX; + } } static inline void gen_op_tsub_cc(TCGv dst, TCGv src1, TCGv src2) @@ -2950,28 +3114,8 @@ static void disas_sparc_insn(DisasContext * dc) } break; case 0x8: /* addx, V9 addc */ - if (IS_IMM) { - simm = GET_FIELDs(insn, 19, 31); - if (xop & 0x10) { - gen_op_addxi_cc(cpu_dst, cpu_src1, simm); - tcg_gen_movi_i32(cpu_cc_op, CC_OP_ADDX); - dc->cc_op = CC_OP_ADDX; - } else { - gen_helper_compute_C_icc(cpu_tmp0); - tcg_gen_addi_tl(cpu_tmp0, cpu_tmp0, simm); - 
tcg_gen_add_tl(cpu_dst, cpu_src1, cpu_tmp0); - } - } else { - if (xop & 0x10) { - gen_op_addx_cc(cpu_dst, cpu_src1, cpu_src2); - tcg_gen_movi_i32(cpu_cc_op, CC_OP_ADDX); - dc->cc_op = CC_OP_ADDX; - } else { - gen_helper_compute_C_icc(cpu_tmp0); - tcg_gen_add_tl(cpu_tmp0, cpu_src2, cpu_tmp0); - tcg_gen_add_tl(cpu_dst, cpu_src1, cpu_tmp0); - } - } + gen_op_addx_int(dc, cpu_dst, cpu_src1, cpu_src2, + (xop & 0x10)); break; #ifdef TARGET_SPARC64 case 0x9: /* V9 mulx */ @@ -3002,28 +3146,8 @@ static void disas_sparc_insn(DisasContext * dc) } break; case 0xc: /* subx, V9 subc */ - if (IS_IMM) { - simm = GET_FIELDs(insn, 19, 31); - if (xop & 0x10) { - gen_op_subxi_cc(cpu_dst, cpu_src1, simm); - tcg_gen_movi_i32(cpu_cc_op, CC_OP_SUBX); - dc->cc_op = CC_OP_SUBX; - } else { - gen_helper_compute_C_icc(cpu_tmp0); - tcg_gen_addi_tl(cpu_tmp0, cpu_tmp0, simm); - tcg_gen_sub_tl(cpu_dst, cpu_src1, cpu_tmp0); - } - } else { - if (xop & 0x10) { - gen_op_subx_cc(cpu_dst, cpu_src1, cpu_src2); - tcg_gen_movi_i32(cpu_cc_op, CC_OP_SUBX); - dc->cc_op = CC_OP_SUBX; - } else { - gen_helper_compute_C_icc(cpu_tmp0); - tcg_gen_add_tl(cpu_tmp0, cpu_src2, cpu_tmp0); - tcg_gen_sub_tl(cpu_dst, cpu_src1, cpu_tmp0); - } - } + gen_op_subx_int(dc, cpu_dst, cpu_src1, cpu_src2, + (xop & 0x10)); break; #ifdef TARGET_SPARC64 case 0xd: /* V9 udivx */