From patchwork Mon Nov  2 15:16:44 2009
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Ulrich Hecht <uli@suse.de>
X-Patchwork-Id: 37426
Return-Path: <qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>
X-Original-To: incoming@patchwork.ozlabs.org
Delivered-To: patchwork-incoming@bilbo.ozlabs.org
Received: from lists.gnu.org (lists.gnu.org [199.232.76.165])
	(using TLSv1 with cipher DHE-RSA-AES256-SHA (256/256 bits))
	(Client did not present a certificate)
	by ozlabs.org (Postfix) with ESMTPS id 6A7B9B7BFE
	for <incoming@patchwork.ozlabs.org>;
	Tue,  3 Nov 2009 02:16:36 +1100 (EST)
Received: from localhost ([127.0.0.1]:34943 helo=lists.gnu.org)
	by lists.gnu.org with esmtp (Exim 4.43) id 1N4ye1-0004Mt-Gw
	for incoming@patchwork.ozlabs.org; Mon, 02 Nov 2009 10:16:33 -0500
Received: from mailman by lists.gnu.org with tmda-scanned (Exim 4.43)
	id 1N4ydT-0004Mg-Ll
	for qemu-devel@nongnu.org; Mon, 02 Nov 2009 10:15:59 -0500
Received: from exim by lists.gnu.org with spam-scanned (Exim 4.43)
	id 1N4ydP-0004KR-Qa
	for qemu-devel@nongnu.org; Mon, 02 Nov 2009 10:15:59 -0500
Received: from [199.232.76.173] (port=44043 helo=monty-python.gnu.org)
	by lists.gnu.org with esmtp (Exim 4.43) id 1N4ydP-0004KM-NQ
	for qemu-devel@nongnu.org; Mon, 02 Nov 2009 10:15:55 -0500
Received: from cantor2.suse.de ([195.135.220.15]:43281 helo=mx2.suse.de)
	by monty-python.gnu.org with esmtp (Exim 4.60)
	(envelope-from <uli@suse.de>) id 1N4ydP-0002qw-8V
	for qemu-devel@nongnu.org; Mon, 02 Nov 2009 10:15:55 -0500
Received: from relay2.suse.de (relay-ext.suse.de [195.135.221.8])
	by mx2.suse.de (Postfix) with ESMTP id 7DDD68726A;
	Mon,  2 Nov 2009 16:15:52 +0100 (CET)
From: Ulrich Hecht <uli@suse.de>
Organization: SuSE Linux Products GmbH
To: Aurelien Jarno <aurelien@aurel32.net>
Subject: Re: [Qemu-devel] [PATCH 2/9] S/390 CPU emulation
Date: Mon, 2 Nov 2009 17:16:44 +0200
User-Agent: KMail/1.9.10
References: <1255696735-21396-1-git-send-email-uli@suse.de>
	<200910191917.18972.uli@suse.de>
	<20091022212854.GW1883@hall.aurel32.net>
In-Reply-To: <20091022212854.GW1883@hall.aurel32.net>
MIME-Version: 1.0
Message-Id: <200911021616.45102.uli@suse.de>
X-detected-operating-system: by monty-python.gnu.org: GNU/Linux 2.4-2.6
Cc: qemu-devel@nongnu.org, agraf@suse.de
X-BeenThere: qemu-devel@nongnu.org
X-Mailman-Version: 2.1.5
Precedence: list
List-Id: qemu-devel.nongnu.org
List-Unsubscribe: <http://lists.nongnu.org/mailman/listinfo/qemu-devel>,
	<mailto:qemu-devel-request@nongnu.org?subject=unsubscribe>
List-Archive: <http://lists.gnu.org/pipermail/qemu-devel>
List-Post: <mailto:qemu-devel@nongnu.org>
List-Help: <mailto:qemu-devel-request@nongnu.org?subject=help>
List-Subscribe: <http://lists.nongnu.org/mailman/listinfo/qemu-devel>,
	<mailto:qemu-devel-request@nongnu.org?subject=subscribe>
Sender: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org
Errors-To: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org

On Thursday 22 October 2009, Aurelien Jarno wrote:
> Probably the second. Changing the instruction pointer in the helper
> instead of using the proper goto_tb TCG op prevents TB chaining, and
> therefore as a huge impact on performance.
>
> It's something not difficult to implement, and that I would definitely
> want to see in the patch before getting it merged.

OK, I implemented it, and the surprising result is that performance  does 
not get any better; in fact it even suffers a little bit. (My standard 
quick test, the polarssl test suite, shows about a 2% performance impact 
when profiled with cachegrind).

Could there be anything I overlooked? I modelled my implementation after 
those in the existing targets. (See the attached patch that goes on top 
of my other S/390 patches.)

CU
Uli

diff --git a/target-s390/helpers.h b/target-s390/helpers.h
index 0d16760..6009312 100644
--- a/target-s390/helpers.h
+++ b/target-s390/helpers.h
@@ -15,7 +15,6 @@ DEF_HELPER_FLAGS_1(set_cc_comp_s64, TCG_CALL_PURE|TCG_CALL_CONST, i32, s64)
 DEF_HELPER_FLAGS_1(set_cc_nz_u32, TCG_CALL_PURE|TCG_CALL_CONST, i32, i32)
 DEF_HELPER_FLAGS_1(set_cc_nz_u64, TCG_CALL_PURE|TCG_CALL_CONST, i32, i64)
 DEF_HELPER_FLAGS_2(set_cc_icm, TCG_CALL_PURE|TCG_CALL_CONST, i32, i32, i32)
-DEF_HELPER_4(brc, void, i32, i32, i64, s32)
 DEF_HELPER_3(brctg, void, i64, i64, s32)
 DEF_HELPER_3(brct, void, i32, i64, s32)
 DEF_HELPER_4(brcl, void, i32, i32, i64, s64)
diff --git a/target-s390/op_helper.c b/target-s390/op_helper.c
index 637d22f..f7f52ba 100644
--- a/target-s390/op_helper.c
+++ b/target-s390/op_helper.c
@@ -218,17 +218,6 @@ uint32_t HELPER(set_cc_icm)(uint32_t mask, uint32_t val)
     return cc;
 }
 
-/* relative conditional branch */
-void HELPER(brc)(uint32_t cc, uint32_t mask, uint64_t pc, int32_t offset)
-{
-    if ( mask & ( 1 << (3 - cc) ) ) {
-        env->psw.addr = pc + offset;
-    }
-    else {
-        env->psw.addr = pc + 4;
-    }
-}
-
 /* branch relative on 64-bit count (condition is computed inline, this only
    does the branch */
 void HELPER(brctg)(uint64_t flag, uint64_t pc, int32_t offset)
diff --git a/target-s390/translate.c b/target-s390/translate.c
index 9ffa7bd..5a7cfe7 100644
--- a/target-s390/translate.c
+++ b/target-s390/translate.c
@@ -49,6 +49,7 @@ struct DisasContext {
     uint64_t pc;
     int is_jmp;
     CPUS390XState *env;
+    struct TranslationBlock *tb;
 };
 
 #define DISAS_EXCP 4
@@ -359,23 +360,55 @@ static void gen_bcr(uint32_t mask, int tr, uint64_t offset)
     tcg_temp_free(target);
 }
 
-static void gen_brc(uint32_t mask, uint64_t pc, int32_t offset)
+static inline void gen_goto_tb(DisasContext *s, int tb_num, target_ulong pc)
 {
-    TCGv p;
-    TCGv_i32 m, o;
+    TranslationBlock *tb;
+
+    tb = s->tb;
+    /* NOTE: we handle the case where the TB spans two pages here */
+    if ((pc & TARGET_PAGE_MASK) == (tb->pc & TARGET_PAGE_MASK) ||
+        (pc & TARGET_PAGE_MASK) == ((s->pc - 1) & TARGET_PAGE_MASK))  {
+        /* jump to same page: we can use a direct jump */
+        tcg_gen_mov_i32(global_cc, cc);
+        tcg_gen_goto_tb(tb_num);
+        tcg_gen_movi_i64(psw_addr, pc);
+        tcg_gen_exit_tb((long)tb + tb_num);
+    } else {
+        /* jump to another page: currently not optimized */
+        tcg_gen_movi_i64(psw_addr, pc);
+        tcg_gen_mov_i32(global_cc, cc);
+        tcg_gen_exit_tb(0);
+    }
+}
+
+static void gen_brc(uint32_t mask, DisasContext *s, int32_t offset)
+{
+    TCGv_i32 r;
+    TCGv_i32 tmp, tmp2;
+    int skip;
     
     if (mask == 0xf) {	/* unconditional */
-      tcg_gen_movi_i64(psw_addr, pc + offset);
+      //tcg_gen_movi_i64(psw_addr, s->pc + offset);
+      gen_goto_tb(s, 0, s->pc + offset);
     }
     else {
-      m = tcg_const_i32(mask);
-      p = tcg_const_i64(pc);
-      o = tcg_const_i32(offset);
-      gen_helper_brc(cc, m, p, o);
-      tcg_temp_free(m);
-      tcg_temp_free(p);
-      tcg_temp_free(o);
+      tmp = tcg_const_i32(3);
+      tcg_gen_sub_i32(tmp, tmp, cc);	/* 3 - cc */
+      tmp2 = tcg_const_i32(1);
+      tcg_gen_shl_i32(tmp2, tmp2, tmp);	/* 1 << (3 - cc) */
+      r = tcg_const_i32(mask);
+      tcg_gen_and_i32(r, r, tmp2);	/* mask & (1 << (3 - cc)) */
+      tcg_temp_free(tmp);
+      tcg_temp_free(tmp2);
+      skip = gen_new_label();
+      tcg_gen_brcondi_i32(TCG_COND_EQ, r, 0, skip);
+      gen_goto_tb(s, 0, s->pc + offset);
+      gen_set_label(skip);
+      gen_goto_tb(s, 1, s->pc + 4);
+      tcg_gen_mov_i32(global_cc, cc);
+      tcg_temp_free(r);
     }
+    s->is_jmp = DISAS_TB_JUMP;
 }
 
 static void gen_set_cc_add64(TCGv v1, TCGv v2, TCGv vr)
@@ -1143,9 +1176,7 @@ static void disas_a7(DisasContext *s, int op, int r1, int i2)
         tcg_temp_free(tmp2);
         break;
     case 0x4: /* brc m1, i2 */
-        /* FIXME: optimize m1 == 0xf (unconditional) case */
-        gen_brc(r1, s->pc, i2 * 2);
-        s->is_jmp = DISAS_JUMP;
+        gen_brc(r1, s, i2 * 2);
         return;
     case 0x5: /* BRAS     R1,I2     [RI] */
         tmp = tcg_const_i64(s->pc + 4);
@@ -2739,6 +2770,7 @@ static inline void gen_intermediate_code_internal (CPUState *env,
     dc.env = env;
     dc.pc = pc_start;
     dc.is_jmp = DISAS_NEXT;
+    dc.tb = tb;
     
     gen_opc_end = gen_opc_buf + OPC_MAX_SIZE;
     
@@ -2778,8 +2810,11 @@ static inline void gen_intermediate_code_internal (CPUState *env,
         num_insns++;
     } while (!dc.is_jmp && gen_opc_ptr < gen_opc_end && dc.pc < next_page_start
              && num_insns < max_insns && !env->singlestep_enabled);
-    tcg_gen_mov_i32(global_cc, cc);
-    tcg_temp_free(cc);
+
+    if (dc.is_jmp != DISAS_TB_JUMP) {
+        tcg_gen_mov_i32(global_cc, cc);
+        tcg_temp_free(cc);
+    }
     
     if (!dc.is_jmp) {
         tcg_gen_st_i64(tcg_const_i64(dc.pc), cpu_env, offsetof(CPUState, psw.addr));
@@ -2794,7 +2829,9 @@ static inline void gen_intermediate_code_internal (CPUState *env,
     if (tb->cflags & CF_LAST_IO)
         gen_io_end();
     /* Generate the return instruction */
-    tcg_gen_exit_tb(0);
+    if (dc.is_jmp != DISAS_TB_JUMP) {
+        tcg_gen_exit_tb(0);
+    }
     gen_icount_end(tb, num_insns);
     *gen_opc_ptr = INDEX_op_end;
     if (search_pc) {