Patchwork [2/9] S/390 CPU emulation

login
register
mail settings
Submitter Ulrich Hecht
Date Nov. 2, 2009, 3:16 p.m.
Message ID <200911021616.45102.uli@suse.de>
Download mbox | patch
Permalink /patch/37426/
State New
Headers show

Comments

Ulrich Hecht - Nov. 2, 2009, 3:16 p.m.
On Thursday 22 October 2009, Aurelien Jarno wrote:
> Probably the second. Changing the instruction pointer in the helper
> instead of using the proper goto_tb TCG op prevents TB chaining, and
> therefore as a huge impact on performance.
>
> It's something not difficult to implement, and that I would definitely
> want to see in the patch before getting it merged.

OK, I implemented it, and the surprising result is that performance  does 
not get any better; in fact it even suffers a little bit. (My standard 
quick test, the polarssl test suite, shows about a 2% performance impact 
when profiled with cachegrind).

Could there be anything I overlooked? I modelled my implementation after 
those in the existing targets. (See the attached patch that goes on top 
of my other S/390 patches.)

CU
Uli
Aurelien Jarno - Nov. 2, 2009, 6:42 p.m.
On Mon, Nov 02, 2009 at 05:16:44PM +0200, Ulrich Hecht wrote:
> On Thursday 22 October 2009, Aurelien Jarno wrote:
> > Probably the second. Changing the instruction pointer in the helper
> > instead of using the proper goto_tb TCG op prevents TB chaining, and
> > therefore as a huge impact on performance.
> >
> > It's something not difficult to implement, and that I would definitely
> > want to see in the patch before getting it merged.
> 
> OK, I implemented it, and the surprising result is that performance  does 
> not get any better; in fact it even suffers a little bit. (My standard 
> quick test, the polarssl test suite, shows about a 2% performance impact 
> when profiled with cachegrind).

That looks really strange, as TB chaining clearly reduce the number of
instructions to execute, by not have to lookup for the TB after each
branch. Also using a brcond instead of a helper should change nothing as
it is located at the end of the TB, where all the globals must be saved
in anyway.

Also a recent bug found on ARM host with regard to TB chaining has shown
it can gives a noticeably speed gain.

On what host are you doing your benchmarks?

> Could there be anything I overlooked? I modelled my implementation after 
> those in the existing targets. (See the attached patch that goes on top 
> of my other S/390 patches.)
> 

Your patch looks good overall, I have minor comments (see below), 
but nothing that should improve the speed noticeably.

> -- 
> SUSE LINUX Products GmbH, GF: Markus Rex, HRB 16746 (AG Nürnberg)

> diff --git a/target-s390/helpers.h b/target-s390/helpers.h
> index 0d16760..6009312 100644
> --- a/target-s390/helpers.h
> +++ b/target-s390/helpers.h
> @@ -15,7 +15,6 @@ DEF_HELPER_FLAGS_1(set_cc_comp_s64, TCG_CALL_PURE|TCG_CALL_CONST, i32, s64)
>  DEF_HELPER_FLAGS_1(set_cc_nz_u32, TCG_CALL_PURE|TCG_CALL_CONST, i32, i32)
>  DEF_HELPER_FLAGS_1(set_cc_nz_u64, TCG_CALL_PURE|TCG_CALL_CONST, i32, i64)
>  DEF_HELPER_FLAGS_2(set_cc_icm, TCG_CALL_PURE|TCG_CALL_CONST, i32, i32, i32)
> -DEF_HELPER_4(brc, void, i32, i32, i64, s32)
>  DEF_HELPER_3(brctg, void, i64, i64, s32)
>  DEF_HELPER_3(brct, void, i32, i64, s32)
>  DEF_HELPER_4(brcl, void, i32, i32, i64, s64)
> diff --git a/target-s390/op_helper.c b/target-s390/op_helper.c
> index 637d22f..f7f52ba 100644
> --- a/target-s390/op_helper.c
> +++ b/target-s390/op_helper.c
> @@ -218,17 +218,6 @@ uint32_t HELPER(set_cc_icm)(uint32_t mask, uint32_t val)
>      return cc;
>  }
>  
> -/* relative conditional branch */
> -void HELPER(brc)(uint32_t cc, uint32_t mask, uint64_t pc, int32_t offset)
> -{
> -    if ( mask & ( 1 << (3 - cc) ) ) {
> -        env->psw.addr = pc + offset;
> -    }
> -    else {
> -        env->psw.addr = pc + 4;
> -    }
> -}
> -
>  /* branch relative on 64-bit count (condition is computed inline, this only
>     does the branch */
>  void HELPER(brctg)(uint64_t flag, uint64_t pc, int32_t offset)
> diff --git a/target-s390/translate.c b/target-s390/translate.c
> index 9ffa7bd..5a7cfe7 100644
> --- a/target-s390/translate.c
> +++ b/target-s390/translate.c
> @@ -49,6 +49,7 @@ struct DisasContext {
>      uint64_t pc;
>      int is_jmp;
>      CPUS390XState *env;
> +    struct TranslationBlock *tb;
>  };
>  
>  #define DISAS_EXCP 4
> @@ -359,23 +360,55 @@ static void gen_bcr(uint32_t mask, int tr, uint64_t offset)
>      tcg_temp_free(target);
>  }
>  
> -static void gen_brc(uint32_t mask, uint64_t pc, int32_t offset)
> +static inline void gen_goto_tb(DisasContext *s, int tb_num, target_ulong pc)
>  {
> -    TCGv p;
> -    TCGv_i32 m, o;
> +    TranslationBlock *tb;
> +
> +    tb = s->tb;
> +    /* NOTE: we handle the case where the TB spans two pages here */
> +    if ((pc & TARGET_PAGE_MASK) == (tb->pc & TARGET_PAGE_MASK) ||
> +        (pc & TARGET_PAGE_MASK) == ((s->pc - 1) & TARGET_PAGE_MASK))  {

I have difficulties to figure out why the second comparison is needed. I
know it comes from target-i386, but on the other hand it is not present
in other targets.

> +        /* jump to same page: we can use a direct jump */
> +        tcg_gen_mov_i32(global_cc, cc);
> +        tcg_gen_goto_tb(tb_num);
> +        tcg_gen_movi_i64(psw_addr, pc);
> +        tcg_gen_exit_tb((long)tb + tb_num);
> +    } else {
> +        /* jump to another page: currently not optimized */
> +        tcg_gen_movi_i64(psw_addr, pc);
> +        tcg_gen_mov_i32(global_cc, cc);
> +        tcg_gen_exit_tb(0);
> +    }
> +}
> +
> +static void gen_brc(uint32_t mask, DisasContext *s, int32_t offset)
> +{
> +    TCGv_i32 r;
> +    TCGv_i32 tmp, tmp2;
> +    int skip;
>      
>      if (mask == 0xf) {	/* unconditional */
> -      tcg_gen_movi_i64(psw_addr, pc + offset);
> +      //tcg_gen_movi_i64(psw_addr, s->pc + offset);
> +      gen_goto_tb(s, 0, s->pc + offset);
>      }
>      else {
> -      m = tcg_const_i32(mask);
> -      p = tcg_const_i64(pc);
> -      o = tcg_const_i32(offset);
> -      gen_helper_brc(cc, m, p, o);
> -      tcg_temp_free(m);
> -      tcg_temp_free(p);
> -      tcg_temp_free(o);
> +      tmp = tcg_const_i32(3);
> +      tcg_gen_sub_i32(tmp, tmp, cc);	/* 3 - cc */
> +      tmp2 = tcg_const_i32(1);
> +      tcg_gen_shl_i32(tmp2, tmp2, tmp);	/* 1 << (3 - cc) */
> +      r = tcg_const_i32(mask);
> +      tcg_gen_and_i32(r, r, tmp2);	/* mask & (1 << (3 - cc)) */
> +      tcg_temp_free(tmp);
> +      tcg_temp_free(tmp2);
> +      skip = gen_new_label();
> +      tcg_gen_brcondi_i32(TCG_COND_EQ, r, 0, skip);
> +      gen_goto_tb(s, 0, s->pc + offset);
> +      gen_set_label(skip);
> +      gen_goto_tb(s, 1, s->pc + 4);
> +      tcg_gen_mov_i32(global_cc, cc);

This is probably not needed as this is already done in gen_goto_tb().

> +      tcg_temp_free(r);
>      }
> +    s->is_jmp = DISAS_TB_JUMP;
>  }
>  
>  static void gen_set_cc_add64(TCGv v1, TCGv v2, TCGv vr)
> @@ -1143,9 +1176,7 @@ static void disas_a7(DisasContext *s, int op, int r1, int i2)
>          tcg_temp_free(tmp2);
>          break;
>      case 0x4: /* brc m1, i2 */
> -        /* FIXME: optimize m1 == 0xf (unconditional) case */
> -        gen_brc(r1, s->pc, i2 * 2);
> -        s->is_jmp = DISAS_JUMP;
> +        gen_brc(r1, s, i2 * 2);
>          return;
>      case 0x5: /* BRAS     R1,I2     [RI] */
>          tmp = tcg_const_i64(s->pc + 4);
> @@ -2739,6 +2770,7 @@ static inline void gen_intermediate_code_internal (CPUState *env,
>      dc.env = env;
>      dc.pc = pc_start;
>      dc.is_jmp = DISAS_NEXT;
> +    dc.tb = tb;
>      
>      gen_opc_end = gen_opc_buf + OPC_MAX_SIZE;
>      
> @@ -2778,8 +2810,11 @@ static inline void gen_intermediate_code_internal (CPUState *env,
>          num_insns++;
>      } while (!dc.is_jmp && gen_opc_ptr < gen_opc_end && dc.pc < next_page_start
>               && num_insns < max_insns && !env->singlestep_enabled);
> -    tcg_gen_mov_i32(global_cc, cc);
> -    tcg_temp_free(cc);
> +
> +    if (dc.is_jmp != DISAS_TB_JUMP) {
> +        tcg_gen_mov_i32(global_cc, cc);
> +        tcg_temp_free(cc);
> +    }
>      
>      if (!dc.is_jmp) {
>          tcg_gen_st_i64(tcg_const_i64(dc.pc), cpu_env, offsetof(CPUState, psw.addr));
> @@ -2794,7 +2829,9 @@ static inline void gen_intermediate_code_internal (CPUState *env,
>      if (tb->cflags & CF_LAST_IO)
>          gen_io_end();
>      /* Generate the return instruction */
> -    tcg_gen_exit_tb(0);
> +    if (dc.is_jmp != DISAS_TB_JUMP) {
> +        tcg_gen_exit_tb(0);
> +    }
>      gen_icount_end(tb, num_insns);
>      *gen_opc_ptr = INDEX_op_end;
>      if (search_pc) {
Laurent Desnogues - Nov. 2, 2009, 7:03 p.m.
On Mon, Nov 2, 2009 at 7:42 PM, Aurelien Jarno <aurelien@aurel32.net> wrote:
> On Mon, Nov 02, 2009 at 05:16:44PM +0200, Ulrich Hecht wrote:
>> On Thursday 22 October 2009, Aurelien Jarno wrote:
>> > Probably the second. Changing the instruction pointer in the helper
>> > instead of using the proper goto_tb TCG op prevents TB chaining, and
>> > therefore as a huge impact on performance.
>> >
>> > It's something not difficult to implement, and that I would definitely
>> > want to see in the patch before getting it merged.
>>
>> OK, I implemented it, and the surprising result is that performance  does
>> not get any better; in fact it even suffers a little bit. (My standard
>> quick test, the polarssl test suite, shows about a 2% performance impact
>> when profiled with cachegrind).
>
> That looks really strange, as TB chaining clearly reduce the number of
> instructions to execute, by not have to lookup for the TB after each
> branch. Also using a brcond instead of a helper should change nothing as
> it is located at the end of the TB, where all the globals must be saved
> in anyway.
>
> Also a recent bug found on ARM host with regard to TB chaining has shown
> it can gives a noticeably speed gain.

That indeed looks strange:  fixing the TB chaining on ARM
made nbench i386 three times faster.  Note the gain was
less for FP parts of the benchmark due to the use of
helpers.

Ulrich,

out of curiosity could you post your tb_set_jmp_target1
function?  The only thing I can think of at the moment that
could make the code slower is that the program you ran
was not reusing blocks and/or cache flushing in
tb_set_jmp_target1 is overkill.


Laurent
Ulrich Hecht - Nov. 9, 2009, 4:55 p.m.
On Monday 02 November 2009, Laurent Desnogues wrote:
> That indeed looks strange:  fixing the TB chaining on ARM
> made nbench i386 three times faster.  Note the gain was
> less for FP parts of the benchmark due to the use of
> helpers.
>
> out of curiosity could you post your tb_set_jmp_target1
> function?

I'm on an AMD64 host, so it's the same code as in mainline.

> The only thing I can think of at the moment that 
> could make the code slower is that the program you ran
> was not reusing blocks and/or cache flushing in
> tb_set_jmp_target1 is overkill.

There is no cache flushing in the AMD64 tb_set_jmp_target1() function, 
and the polarssl test suite is by nature rather repetitive.

I did some experiments, and it seems disabling the TB chaining (by 
emptying tb_set_jmp_target()) does not have any impact on performance at 
all on AMD64. I tested it with several CPU-intensive programs (md5sum 
and the like) with AMD64 on AMD64 userspace emulation (qemu-x86_64), and 
the difference in performance with TB chaining and without is hardly 
measurable. The chaining is performed as advertised if enabled, I 
checked that, but it does not seem to help performance.

How is this possible? Could this be related to cache size? I suspect the 
Phenom 9500 of mine is better equipped in that area than the average ARM 
controller.

And does the TB chaining actually work on AMD64 at all? I checked by 
adding some debug output, and it seems to patch the jumps correctly, but 
maybe somebody can verify that.

CU
Uli
Aurelien Jarno - Nov. 10, 2009, 4:02 p.m.
On Mon, Nov 09, 2009 at 06:55:23PM +0200, Ulrich Hecht wrote:
> On Monday 02 November 2009, Laurent Desnogues wrote:
> > That indeed looks strange:  fixing the TB chaining on ARM
> > made nbench i386 three times faster.  Note the gain was
> > less for FP parts of the benchmark due to the use of
> > helpers.
> >
> > out of curiosity could you post your tb_set_jmp_target1
> > function?
> 
> I'm on an AMD64 host, so it's the same code as in mainline.
> 
> > The only thing I can think of at the moment that 
> > could make the code slower is that the program you ran
> > was not reusing blocks and/or cache flushing in
> > tb_set_jmp_target1 is overkill.
> 
> There is no cache flushing in the AMD64 tb_set_jmp_target1() function, 
> and the polarssl test suite is by nature rather repetitive.
> 
> I did some experiments, and it seems disabling the TB chaining (by 
> emptying tb_set_jmp_target()) does not have any impact on performance at 
> all on AMD64. I tested it with several CPU-intensive programs (md5sum 
> and the like) with AMD64 on AMD64 userspace emulation (qemu-x86_64), and 
> the difference in performance with TB chaining and without is hardly 
> measurable. The chaining is performed as advertised if enabled, I 
> checked that, but it does not seem to help performance.

I have tested it by removing all the block around tb_add_jump in
cpu_exec.c. I have a speed loss of about 2.5x in the boot time of an
x86_64 image.

> How is this possible? Could this be related to cache size? I suspect the 
> Phenom 9500 of mine is better equipped in that area than the average ARM 
> controller.

For me it's on a Core 2 Duo T7200, so I doubt it is related to cache
size.

> And does the TB chaining actually work on AMD64 at all? I checked by 
> adding some debug output, and it seems to patch the jumps correctly, but 
> maybe somebody can verify that.
> 

Given the gain in speed I have, I guess it works.
Ulrich Hecht - Nov. 11, 2009, 10:46 a.m.
On Tuesday 10 November 2009, Aurelien Jarno wrote:
> I have tested it by removing all the block around tb_add_jump in
> cpu_exec.c. I have a speed loss of about 2.5x in the boot time of an
> x86_64 image.

I just tried it with qemu-system-x86_64, and with that I can observe a 
noticable performance gain using TB chaining as well. Maybe it's simply 
a lot more effective in system than in userspace emulation.

Anyway, having implemented it for BRC already, I might as well leave it 
in for future generations to enjoy.

CU
Uli

Patch

diff --git a/target-s390/helpers.h b/target-s390/helpers.h
index 0d16760..6009312 100644
--- a/target-s390/helpers.h
+++ b/target-s390/helpers.h
@@ -15,7 +15,6 @@  DEF_HELPER_FLAGS_1(set_cc_comp_s64, TCG_CALL_PURE|TCG_CALL_CONST, i32, s64)
 DEF_HELPER_FLAGS_1(set_cc_nz_u32, TCG_CALL_PURE|TCG_CALL_CONST, i32, i32)
 DEF_HELPER_FLAGS_1(set_cc_nz_u64, TCG_CALL_PURE|TCG_CALL_CONST, i32, i64)
 DEF_HELPER_FLAGS_2(set_cc_icm, TCG_CALL_PURE|TCG_CALL_CONST, i32, i32, i32)
-DEF_HELPER_4(brc, void, i32, i32, i64, s32)
 DEF_HELPER_3(brctg, void, i64, i64, s32)
 DEF_HELPER_3(brct, void, i32, i64, s32)
 DEF_HELPER_4(brcl, void, i32, i32, i64, s64)
diff --git a/target-s390/op_helper.c b/target-s390/op_helper.c
index 637d22f..f7f52ba 100644
--- a/target-s390/op_helper.c
+++ b/target-s390/op_helper.c
@@ -218,17 +218,6 @@  uint32_t HELPER(set_cc_icm)(uint32_t mask, uint32_t val)
     return cc;
 }
 
-/* relative conditional branch */
-void HELPER(brc)(uint32_t cc, uint32_t mask, uint64_t pc, int32_t offset)
-{
-    if ( mask & ( 1 << (3 - cc) ) ) {
-        env->psw.addr = pc + offset;
-    }
-    else {
-        env->psw.addr = pc + 4;
-    }
-}
-
 /* branch relative on 64-bit count (condition is computed inline, this only
    does the branch */
 void HELPER(brctg)(uint64_t flag, uint64_t pc, int32_t offset)
diff --git a/target-s390/translate.c b/target-s390/translate.c
index 9ffa7bd..5a7cfe7 100644
--- a/target-s390/translate.c
+++ b/target-s390/translate.c
@@ -49,6 +49,7 @@  struct DisasContext {
     uint64_t pc;
     int is_jmp;
     CPUS390XState *env;
+    struct TranslationBlock *tb;
 };
 
 #define DISAS_EXCP 4
@@ -359,23 +360,55 @@  static void gen_bcr(uint32_t mask, int tr, uint64_t offset)
     tcg_temp_free(target);
 }
 
-static void gen_brc(uint32_t mask, uint64_t pc, int32_t offset)
+static inline void gen_goto_tb(DisasContext *s, int tb_num, target_ulong pc)
 {
-    TCGv p;
-    TCGv_i32 m, o;
+    TranslationBlock *tb;
+
+    tb = s->tb;
+    /* NOTE: we handle the case where the TB spans two pages here */
+    if ((pc & TARGET_PAGE_MASK) == (tb->pc & TARGET_PAGE_MASK) ||
+        (pc & TARGET_PAGE_MASK) == ((s->pc - 1) & TARGET_PAGE_MASK))  {
+        /* jump to same page: we can use a direct jump */
+        tcg_gen_mov_i32(global_cc, cc);
+        tcg_gen_goto_tb(tb_num);
+        tcg_gen_movi_i64(psw_addr, pc);
+        tcg_gen_exit_tb((long)tb + tb_num);
+    } else {
+        /* jump to another page: currently not optimized */
+        tcg_gen_movi_i64(psw_addr, pc);
+        tcg_gen_mov_i32(global_cc, cc);
+        tcg_gen_exit_tb(0);
+    }
+}
+
+static void gen_brc(uint32_t mask, DisasContext *s, int32_t offset)
+{
+    TCGv_i32 r;
+    TCGv_i32 tmp, tmp2;
+    int skip;
     
     if (mask == 0xf) {	/* unconditional */
-      tcg_gen_movi_i64(psw_addr, pc + offset);
+      //tcg_gen_movi_i64(psw_addr, s->pc + offset);
+      gen_goto_tb(s, 0, s->pc + offset);
     }
     else {
-      m = tcg_const_i32(mask);
-      p = tcg_const_i64(pc);
-      o = tcg_const_i32(offset);
-      gen_helper_brc(cc, m, p, o);
-      tcg_temp_free(m);
-      tcg_temp_free(p);
-      tcg_temp_free(o);
+      tmp = tcg_const_i32(3);
+      tcg_gen_sub_i32(tmp, tmp, cc);	/* 3 - cc */
+      tmp2 = tcg_const_i32(1);
+      tcg_gen_shl_i32(tmp2, tmp2, tmp);	/* 1 << (3 - cc) */
+      r = tcg_const_i32(mask);
+      tcg_gen_and_i32(r, r, tmp2);	/* mask & (1 << (3 - cc)) */
+      tcg_temp_free(tmp);
+      tcg_temp_free(tmp2);
+      skip = gen_new_label();
+      tcg_gen_brcondi_i32(TCG_COND_EQ, r, 0, skip);
+      gen_goto_tb(s, 0, s->pc + offset);
+      gen_set_label(skip);
+      gen_goto_tb(s, 1, s->pc + 4);
+      tcg_gen_mov_i32(global_cc, cc);
+      tcg_temp_free(r);
     }
+    s->is_jmp = DISAS_TB_JUMP;
 }
 
 static void gen_set_cc_add64(TCGv v1, TCGv v2, TCGv vr)
@@ -1143,9 +1176,7 @@  static void disas_a7(DisasContext *s, int op, int r1, int i2)
         tcg_temp_free(tmp2);
         break;
     case 0x4: /* brc m1, i2 */
-        /* FIXME: optimize m1 == 0xf (unconditional) case */
-        gen_brc(r1, s->pc, i2 * 2);
-        s->is_jmp = DISAS_JUMP;
+        gen_brc(r1, s, i2 * 2);
         return;
     case 0x5: /* BRAS     R1,I2     [RI] */
         tmp = tcg_const_i64(s->pc + 4);
@@ -2739,6 +2770,7 @@  static inline void gen_intermediate_code_internal (CPUState *env,
     dc.env = env;
     dc.pc = pc_start;
     dc.is_jmp = DISAS_NEXT;
+    dc.tb = tb;
     
     gen_opc_end = gen_opc_buf + OPC_MAX_SIZE;
     
@@ -2778,8 +2810,11 @@  static inline void gen_intermediate_code_internal (CPUState *env,
         num_insns++;
     } while (!dc.is_jmp && gen_opc_ptr < gen_opc_end && dc.pc < next_page_start
              && num_insns < max_insns && !env->singlestep_enabled);
-    tcg_gen_mov_i32(global_cc, cc);
-    tcg_temp_free(cc);
+
+    if (dc.is_jmp != DISAS_TB_JUMP) {
+        tcg_gen_mov_i32(global_cc, cc);
+        tcg_temp_free(cc);
+    }
     
     if (!dc.is_jmp) {
         tcg_gen_st_i64(tcg_const_i64(dc.pc), cpu_env, offsetof(CPUState, psw.addr));
@@ -2794,7 +2829,9 @@  static inline void gen_intermediate_code_internal (CPUState *env,
     if (tb->cflags & CF_LAST_IO)
         gen_io_end();
     /* Generate the return instruction */
-    tcg_gen_exit_tb(0);
+    if (dc.is_jmp != DISAS_TB_JUMP) {
+        tcg_gen_exit_tb(0);
+    }
     gen_icount_end(tb, num_insns);
     *gen_opc_ptr = INDEX_op_end;
     if (search_pc) {