Patchwork [v3,14/20] tcg-arm: Cleanup goto_tb handling

login
register
mail settings
Submitter Richard Henderson
Date March 28, 2013, 3:32 p.m.
Message ID <1364484781-15561-15-git-send-email-rth@twiddle.net>
Download mbox | patch
Permalink /patch/232075/
State New
Headers show

Comments

Richard Henderson - March 28, 2013, 3:32 p.m.
Eliminate 2 disabled code blocks.  Choose the load-to-pc method of
jumping so that we can eliminate the 16M code_gen_buffer limitation.
Remove a test in the indirect jump method that is always true.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 include/exec/exec-all.h | 23 +++--------------------
 tcg/arm/tcg-target.c    | 26 ++++++--------------------
 translate-all.c         |  2 --
 3 files changed, 9 insertions(+), 42 deletions(-)
Aurelien Jarno - March 28, 2013, 8:09 p.m.
On Thu, Mar 28, 2013 at 08:32:55AM -0700, Richard Henderson wrote:
> Eliminate 2 disabled code blocks.  Choose the load-to-pc method of
> jumping so that we can eliminate the 16M code_gen_buffer limitation.
> Remove a test in the indirect jump method that is always true.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  include/exec/exec-all.h | 23 +++--------------------
>  tcg/arm/tcg-target.c    | 26 ++++++--------------------
>  translate-all.c         |  2 --
>  3 files changed, 9 insertions(+), 42 deletions(-)

I already proposed such a patch, but it seems to improve things only on
some specific cases (kernel boot), while increasing the I/D cache and
TLB pressure.

See https://lists.gnu.org/archive/html/qemu-devel/2012-10/msg01684.html 

> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index e856191..190effa 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -233,26 +233,9 @@ static inline void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr)
>  #elif defined(__arm__)
>  static inline void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr)
>  {
> -#if !QEMU_GNUC_PREREQ(4, 1)
> -    register unsigned long _beg __asm ("a1");
> -    register unsigned long _end __asm ("a2");
> -    register unsigned long _flg __asm ("a3");
> -#endif
> -
> -    /* we could use a ldr pc, [pc, #-4] kind of branch and avoid the flush */
> -    *(uint32_t *)jmp_addr =
> -        (*(uint32_t *)jmp_addr & ~0xffffff)
> -        | (((addr - (jmp_addr + 8)) >> 2) & 0xffffff);
> -
> -#if QEMU_GNUC_PREREQ(4, 1)
> -    __builtin___clear_cache((char *) jmp_addr, (char *) jmp_addr + 4);
> -#else
> -    /* flush icache */
> -    _beg = jmp_addr;
> -    _end = jmp_addr + 4;
> -    _flg = 0;
> -    __asm __volatile__ ("swi 0x9f0002" : : "r" (_beg), "r" (_end), "r" (_flg));
> -#endif
> +    /* We're using "ldr pc, [pc,#-4]", so we can just store the raw
> +       address, without caring for flushing the icache.  */
> +    *(uint32_t *)jmp_addr = addr;
>  }
>  #elif defined(__sparc__)
>  void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr);
> diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
> index 7bcba19..6de5e90 100644
> --- a/tcg/arm/tcg-target.c
> +++ b/tcg/arm/tcg-target.c
> @@ -1595,30 +1595,16 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>          break;
>      case INDEX_op_goto_tb:
>          if (s->tb_jmp_offset) {
> -            /* Direct jump method */
> -#if defined(USE_DIRECT_JUMP)
> -            s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf;
> -            tcg_out_b_noaddr(s, COND_AL);
> -#else
> +            /* "Direct" jump method.  Rather than limit the code gen buffer
> +               to 16M, load the destination from the next word.  */
>              tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, -4);
>              s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf;
> -            tcg_out32(s, 0);
> -#endif
> +            s->code_ptr += 4;
>          } else {
> -            /* Indirect jump method */
> -#if 1
> -            c = (int) (s->tb_next + args[0]) - ((int) s->code_ptr + 8);
> -            if (c > 0xfff || c < -0xfff) {
> -                tcg_out_movi32(s, COND_AL, TCG_REG_R0,
> -                                (tcg_target_long) (s->tb_next + args[0]));
> -                tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_R0, 0);
> -            } else
> -                tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, c);
> -#else
> -            tcg_out_ld32_12(s, COND_AL, TCG_REG_R0, TCG_REG_PC, 0);
> +            /* Indirect jump method.  */
> +            tcg_out_movi32(s, COND_AL, TCG_REG_R0,
> +                           (tcg_target_long) (s->tb_next + args[0]));
>              tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_R0, 0);
> -            tcg_out32(s, (tcg_target_long) (s->tb_next + args[0]));
> -#endif
>          }
>          s->tb_next_offset[args[0]] = s->code_ptr - s->code_buf;
>          break;
> diff --git a/translate-all.c b/translate-all.c
> index a98c646..3ca839f 100644
> --- a/translate-all.c
> +++ b/translate-all.c
> @@ -460,8 +460,6 @@ static inline PageDesc *page_find(tb_page_addr_t index)
>  # define MAX_CODE_GEN_BUFFER_SIZE  (2ul * 1024 * 1024 * 1024)
>  #elif defined(__sparc__)
>  # define MAX_CODE_GEN_BUFFER_SIZE  (2ul * 1024 * 1024 * 1024)
> -#elif defined(__arm__)
> -# define MAX_CODE_GEN_BUFFER_SIZE  (16u * 1024 * 1024)
>  #elif defined(__s390x__)
>    /* We have a +- 4GB range on the branches; leave some slop.  */
>  # define MAX_CODE_GEN_BUFFER_SIZE  (3ul * 1024 * 1024 * 1024)
> -- 
> 1.8.1.4
> 
> 
>
Richard Henderson - March 28, 2013, 8:48 p.m.
On 2013-03-28 13:09, Aurelien Jarno wrote:
> I already proposed such a patch, but it seems to improve things only on
> some specific cases (kernel boot), while increasing the I/D cache and
> TLB pressure.
>
> See https://lists.gnu.org/archive/html/qemu-devel/2012-10/msg01684.html

Hmm.  I see that your previous attempt didn't also go with a removal of
the 16MB limit on code_gen_buffer.  Given the default is 32MB, I wonder
if the decrease in translation makes enough difference.

I'll admit that this sort of change needs more benchmarking.  What I'd
like to do one way or the other is get rid of the myriad ifdefs.  That's
just too confusing.

For the record, I've also been experimenting with a real constant pool.
This starts to significantly reduce the insn size, even with movw.  This
primarily due to re-use of helper address constants.  With the pool, I
also place the goto_tb constants there as well.

I've not gotten that code completely stable yet, but once I do I can
also consider the number of Dcache lines that the pool will occupy.
(With v7, I only see 2-3 entries in the pool per TB, so in most cases
aligning to a cache boundary won't matter.)


r~
Aurelien Jarno - March 29, 2013, 6:50 a.m.
On Thu, Mar 28, 2013 at 01:48:42PM -0700, Richard Henderson wrote:
> On 2013-03-28 13:09, Aurelien Jarno wrote:
> >I already proposed such a patch, but it seems to improve things only on
> >some specific cases (kernel boot), while increasing the I/D cache and
> >TLB pressure.
> >
> >See https://lists.gnu.org/archive/html/qemu-devel/2012-10/msg01684.html
> 
> Hmm.  I see that your previous attempt didn't also go with a removal of
> the 16MB limit on code_gen_buffer.  Given the default is 32MB, I wonder
> if the decrease in translation makes enough difference.
> 

My patch also removed the 16MB limit. Note also the default is 1/4th
of the emulated RAM, so it can be a lot bigger than 32MB.
Richard Henderson - March 29, 2013, 3:06 p.m.
On 2013-03-28 23:50, Aurelien Jarno wrote:
> On Thu, Mar 28, 2013 at 01:48:42PM -0700, Richard Henderson wrote:
>> On 2013-03-28 13:09, Aurelien Jarno wrote:
>>> I already proposed such a patch, but it seems to improve things only on
>>> some specific cases (kernel boot), while increasing the I/D cache and
>>> TLB pressure.
>>>
>>> See https://lists.gnu.org/archive/html/qemu-devel/2012-10/msg01684.html
>>
>> Hmm.  I see that your previous attempt didn't also go with a removal of
>> the 16MB limit on code_gen_buffer.  Given the default is 32MB, I wonder
>> if the decrease in translation makes enough difference.
>>
>
> My patch also removed the 16MB limit. Note also the default is 1/4th
> of the emulated RAM, so it can be a lot bigger than 32MB.
>
Ok, I guess consider that patch dropped for now then.

I'll post v4 today with the changes suggested so far.


r~

Patch

diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index e856191..190effa 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -233,26 +233,9 @@  static inline void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr)
 #elif defined(__arm__)
 static inline void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr)
 {
-#if !QEMU_GNUC_PREREQ(4, 1)
-    register unsigned long _beg __asm ("a1");
-    register unsigned long _end __asm ("a2");
-    register unsigned long _flg __asm ("a3");
-#endif
-
-    /* we could use a ldr pc, [pc, #-4] kind of branch and avoid the flush */
-    *(uint32_t *)jmp_addr =
-        (*(uint32_t *)jmp_addr & ~0xffffff)
-        | (((addr - (jmp_addr + 8)) >> 2) & 0xffffff);
-
-#if QEMU_GNUC_PREREQ(4, 1)
-    __builtin___clear_cache((char *) jmp_addr, (char *) jmp_addr + 4);
-#else
-    /* flush icache */
-    _beg = jmp_addr;
-    _end = jmp_addr + 4;
-    _flg = 0;
-    __asm __volatile__ ("swi 0x9f0002" : : "r" (_beg), "r" (_end), "r" (_flg));
-#endif
+    /* We're using "ldr pc, [pc,#-4]", so we can just store the raw
+       address, without caring for flushing the icache.  */
+    *(uint32_t *)jmp_addr = addr;
 }
 #elif defined(__sparc__)
 void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr);
diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index 7bcba19..6de5e90 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -1595,30 +1595,16 @@  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
     case INDEX_op_goto_tb:
         if (s->tb_jmp_offset) {
-            /* Direct jump method */
-#if defined(USE_DIRECT_JUMP)
-            s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf;
-            tcg_out_b_noaddr(s, COND_AL);
-#else
+            /* "Direct" jump method.  Rather than limit the code gen buffer
+               to 16M, load the destination from the next word.  */
             tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, -4);
             s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf;
-            tcg_out32(s, 0);
-#endif
+            s->code_ptr += 4;
         } else {
-            /* Indirect jump method */
-#if 1
-            c = (int) (s->tb_next + args[0]) - ((int) s->code_ptr + 8);
-            if (c > 0xfff || c < -0xfff) {
-                tcg_out_movi32(s, COND_AL, TCG_REG_R0,
-                                (tcg_target_long) (s->tb_next + args[0]));
-                tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_R0, 0);
-            } else
-                tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, c);
-#else
-            tcg_out_ld32_12(s, COND_AL, TCG_REG_R0, TCG_REG_PC, 0);
+            /* Indirect jump method.  */
+            tcg_out_movi32(s, COND_AL, TCG_REG_R0,
+                           (tcg_target_long) (s->tb_next + args[0]));
             tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_R0, 0);
-            tcg_out32(s, (tcg_target_long) (s->tb_next + args[0]));
-#endif
         }
         s->tb_next_offset[args[0]] = s->code_ptr - s->code_buf;
         break;
diff --git a/translate-all.c b/translate-all.c
index a98c646..3ca839f 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -460,8 +460,6 @@  static inline PageDesc *page_find(tb_page_addr_t index)
 # define MAX_CODE_GEN_BUFFER_SIZE  (2ul * 1024 * 1024 * 1024)
 #elif defined(__sparc__)
 # define MAX_CODE_GEN_BUFFER_SIZE  (2ul * 1024 * 1024 * 1024)
-#elif defined(__arm__)
-# define MAX_CODE_GEN_BUFFER_SIZE  (16u * 1024 * 1024)
 #elif defined(__s390x__)
   /* We have a +- 4GB range on the branches; leave some slop.  */
 # define MAX_CODE_GEN_BUFFER_SIZE  (3ul * 1024 * 1024 * 1024)