
[5/5] tcg/arm: improve direct jump

Message ID 1349814652-22325-6-git-send-email-aurelien@aurel32.net
State New

Commit Message

Aurelien Jarno Oct. 9, 2012, 8:30 p.m. UTC
Use a ldr pc, [pc, #-4] kind of branch for direct jumps. This removes the
need to flush the icache on TB linking and allows the limit on the code
generation buffer size to be removed.

This improves the boot-up speed of a MIPS guest by 11%.

Cc: Andrzej Zaborowski <balrogg@gmail.com>
Cc: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
---
 exec-all.h           |   24 ++++--------------------
 exec.c               |    4 ----
 tcg/arm/tcg-target.c |    7 +------
 3 files changed, 5 insertions(+), 30 deletions(-)
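
For illustration, here is a minimal C sketch of the two-word jump slot this
scheme emits (function names invented, not QEMU's; see the tcg_out_op hunk
below for the real emitter):

    #include <stdint.h>

    /* 0xe51ff004 is the A32 encoding of "ldr pc, [pc, #-4]".  Because the
       pc reads 8 bytes ahead of the current instruction, pc - 4 addresses
       the word immediately after it, so the branch target is an absolute
       address stored inline as data. */
    static void emit_jump_slot(uint32_t *slot, uint32_t target)
    {
        slot[0] = 0xe51ff004u;  /* ldr pc, [pc, #-4] */
        slot[1] = target;       /* literal: absolute jump destination */
    }

    static void patch_jump_slot(uint32_t *slot, uint32_t new_target)
    {
        /* The literal is fetched through the data path, so a plain store
           retargets the jump without any icache maintenance. */
        slot[1] = new_target;
    }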

Comments

Laurent Desnogues Oct. 10, 2012, 1:21 p.m. UTC | #1
On Tue, Oct 9, 2012 at 10:30 PM, Aurelien Jarno <aurelien@aurel32.net> wrote:
> Use a ldr pc, [pc, #-4] kind of branch for direct jumps. This removes the
> need to flush the icache on TB linking and allows the limit on the code
> generation buffer size to be removed.

I'm not sure I like it.  In general having data in the middle
of code will increase I/D cache and I/D TLB pressure.

> This improves the boot-up speed of a MIPS guest by 11%.

Boot speed is very specific.  Did you test some other code?
Also what was your host?

Testing on a quad core Cortex-A9, using all of your patches
(including TCG optimizations), I get this running nbench i386
in user mode:

TEST                : Iter/sec.  : Old Index  : New Index
                    :            : Pentium 90 : AMD K6/233
--------------------:------------:------------:-----------
NUMERIC SORT        :     119.48 :       3.06 :       1.01
STRING SORT         :     7.7907 :       3.48 :       0.54
BITFIELD            : 2.2049e+07 :       3.78 :       0.79
FP EMULATION        :      5.094 :       2.44 :       0.56
FOURIER             :     483.73 :       0.55 :       0.31
ASSIGNMENT          :      1.778 :       6.77 :       1.75
IDEA                :     341.43 :       5.22 :       1.55
HUFFMAN             :     45.942 :       1.27 :       0.41
NEURAL NET          :    0.16667 :       0.27 :       0.11
LU DECOMPOSITION    :      5.969 :       0.31 :       0.22
===================ORIGINAL BYTEMARK RESULTS==============
INTEGER INDEX       : 3.319
FLOATING-POINT INDEX: 0.357
=======================LINUX DATA BELOW===================
MEMORY INDEX        : 0.907
INTEGER INDEX       : 0.774
FLOATING-POINT INDEX: 0.198

Not using this patch, I get:

TEST                : Iter/sec.  : Old Index  : New Index
                    :            : Pentium 90 : AMD K6/233
--------------------:------------:------------:-----------
NUMERIC SORT        :     121.88 :       3.13 :       1.03
STRING SORT         :     7.8438 :       3.50 :       0.54
BITFIELD            : 2.2597e+07 :       3.88 :       0.81
FP EMULATION        :     5.1424 :       2.47 :       0.57
FOURIER             :     466.04 :       0.53 :       0.30
ASSIGNMENT          :      1.809 :       6.88 :       1.79
IDEA                :     359.28 :       5.50 :       1.63
HUFFMAN             :     46.225 :       1.28 :       0.41
NEURAL NET          :    0.16644 :       0.27 :       0.11
LU DECOMPOSITION    :       5.77 :       0.30 :       0.22
===================ORIGINAL BYTEMARK RESULTS==============
INTEGER INDEX       : 3.384
FLOATING-POINT INDEX: 0.349
=======================LINUX DATA BELOW===================
MEMORY INDEX        : 0.922
INTEGER INDEX       : 0.790
FLOATING-POINT INDEX: 0.193

This patch doesn't bring any speedup in that case.

I guess we need more testing, as a synthetic benchmark is as
specific as kernel booting :-)


Laurent

Aurelien Jarno Oct. 10, 2012, 2:28 p.m. UTC | #2
On Wed, Oct 10, 2012 at 03:21:48PM +0200, Laurent Desnogues wrote:
> On Tue, Oct 9, 2012 at 10:30 PM, Aurelien Jarno <aurelien@aurel32.net> wrote:
> > Use a ldr pc, [pc, #-4] kind of branch for direct jumps. This removes the
> > need to flush the icache on TB linking and allows the limit on the code
> > generation buffer size to be removed.
> 
> I'm not sure I like it.  In general having data in the middle
> of code will increase I/D cache and I/D TLB pressure.

Agreed. On the other hand, this patch removes the synchronization of
the instruction cache for TB linking/unlinking.

> > This improves the boot-up speed of a MIPS guest by 11%.
> 
> Boot speed is very specific.  Did you test some other code?
> Also what was your host?

I tested it on a Cortex-A8 machine. I have only tested MIPS, but I can
do more tests, like running the openssl testsuite in the emulated guest.

> Testing on a quad core Cortex-A9, using all of your patches
> (including TCG optimizations), I get this running nbench i386
> in user mode:
> 
> TEST                : Iter/sec.  : Old Index  : New Index
>                     :            : Pentium 90 : AMD K6/233
> --------------------:------------:------------:-----------
> NUMERIC SORT        :     119.48 :       3.06 :       1.01
> STRING SORT         :     7.7907 :       3.48 :       0.54
> BITFIELD            : 2.2049e+07 :       3.78 :       0.79
> FP EMULATION        :      5.094 :       2.44 :       0.56
> FOURIER             :     483.73 :       0.55 :       0.31
> ASSIGNMENT          :      1.778 :       6.77 :       1.75
> IDEA                :     341.43 :       5.22 :       1.55
> HUFFMAN             :     45.942 :       1.27 :       0.41
> NEURAL NET          :    0.16667 :       0.27 :       0.11
> LU DECOMPOSITION    :      5.969 :       0.31 :       0.22
> ===================ORIGINAL BYTEMARK RESULTS==============
> INTEGER INDEX       : 3.319
> FLOATING-POINT INDEX: 0.357
> =======================LINUX DATA BELOW===================
> MEMORY INDEX        : 0.907
> INTEGER INDEX       : 0.774
> FLOATING-POINT INDEX: 0.198
> 
> Not using this patch, I get:
> 
> TEST                : Iter/sec.  : Old Index  : New Index
>                     :            : Pentium 90 : AMD K6/233
> --------------------:------------:------------:-----------
> NUMERIC SORT        :     121.88 :       3.13 :       1.03
> STRING SORT         :     7.8438 :       3.50 :       0.54
> BITFIELD            : 2.2597e+07 :       3.88 :       0.81
> FP EMULATION        :     5.1424 :       2.47 :       0.57
> FOURIER             :     466.04 :       0.53 :       0.30
> ASSIGNMENT          :      1.809 :       6.88 :       1.79
> IDEA                :     359.28 :       5.50 :       1.63
> HUFFMAN             :     46.225 :       1.28 :       0.41
> NEURAL NET          :    0.16644 :       0.27 :       0.11
> LU DECOMPOSITION    :       5.77 :       0.30 :       0.22
> ===================ORIGINAL BYTEMARK RESULTS==============
> INTEGER INDEX       : 3.384
> FLOATING-POINT INDEX: 0.349
> =======================LINUX DATA BELOW===================
> MEMORY INDEX        : 0.922
> INTEGER INDEX       : 0.790
> FLOATING-POINT INDEX: 0.193
> 
> This patch doesn't bring any speedup in that case.
> 
> I guess we need more testing, as a synthetic benchmark is as
> specific as kernel booting :-)
> 

This doesn't really surprise me. The goal of the patch is to remove the
16MB limit on generated code. I really doubt you reach such a limit in
user mode unless you run some complex code.

On the other hand, in system mode this limit can already be reached once
the whole guest kernel is translated; cached code is then dropped and has
to be re-translated regularly. Re-translating guest code is clearly more
expensive than the increased I/D cache and I/D TLB pressure.

The other way to allow more than 16MB of generated code would be to
disable direct jumps on ARM. That adds one 32-bit constant load plus one
memory load per jump, but then you don't have the I/D cache and TLB issue.
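
For illustration, a minimal sketch of that indirect scheme (names invented;
the actual TCG code differs):

    #include <stdint.h>

    /* Sketch of the indirect-jump alternative.  On an ARMv7 host the
     * generated code would be roughly:
     *
     *     movw r0, #:lower16:(&tb_next[n])   @ one 32-bit constant load ...
     *     movt r0, #:upper16:(&tb_next[n])
     *     ldr  pc, [r0]                      @ ... plus one memory load
     *
     * tb_next lives in ordinary data memory, so relinking a TB is a
     * plain store and needs no icache flush either. */
    static uintptr_t tb_next[2];

    static void relink_tb(int n, uintptr_t new_target)
    {
        tb_next[n] = new_target;
    }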
Laurent Desnogues Oct. 10, 2012, 2:43 p.m. UTC | #3
On Wed, Oct 10, 2012 at 4:28 PM, Aurelien Jarno <aurelien@aurel32.net> wrote:
> On Wed, Oct 10, 2012 at 03:21:48PM +0200, Laurent Desnogues wrote:
>> On Tue, Oct 9, 2012 at 10:30 PM, Aurelien Jarno <aurelien@aurel32.net> wrote:
>> > Use a ldr pc, [pc, #-4] kind of branch for direct jumps. This removes the
>> > need to flush the icache on TB linking and allows the limit on the code
>> > generation buffer size to be removed.
>>
>> I'm not sure I like it.  In general having data in the middle
>> of code will increase I/D cache and I/D TLB pressure.
>
> Agreed. On the other hand, this patch removes the synchronization of
> the instruction cache for TB linking/unlinking.

TB linking/unlinking should happen less often than code execution.

>> > This improves the boot-up speed of a MIPS guest by 11%.
>>
>> Boot speed is very specific.  Did you test some other code?
>> Also what was your host?
>
> I tested it on a Cortex-A8 machine. I have only tested MIPS, but I can
> do more tests, like running the openssl testsuite in the emulated guest.

Yes, please.

[...]
> This doesn't really surprise me. The goal of the patch is to remove the
> 16MB limit on generated code. I really doubt you reach such a limit in
> user mode unless you run some complex code.
>
> On the other hand, in system mode this limit can already be reached once
> the whole guest kernel is translated; cached code is then dropped and has
> to be re-translated regularly. Re-translating guest code is clearly more
> expensive than the increased I/D cache and I/D TLB pressure.

Ah yes, that's a real problem.  What about having some define and/or
runtime flag so that QEMU keeps both the icache-sync approach and your
ldr pc change?
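
Something like this rough sketch, perhaps (USE_ARM_LDR_PC_JUMP is an
invented name; the first branch is your literal patching, the second is
the current relative-branch patching plus flush):

    #ifdef USE_ARM_LDR_PC_JUMP
    static inline void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr)
    {
        *(uint32_t *)jmp_addr = addr;   /* patch the literal, no flush */
    }
    #else
    static inline void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr)
    {
        /* Rewrite the 24-bit offset of a "b" instruction, then flush. */
        *(uint32_t *)jmp_addr = (*(uint32_t *)jmp_addr & ~0xffffff)
            | (((addr - (jmp_addr + 8)) >> 2) & 0xffffff);
        __builtin___clear_cache((char *)jmp_addr, (char *)jmp_addr + 4);
    }
    #endif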

> The other way to allow more than 16MB of generated code would be to
> disable direct jumps on ARM. That adds one 32-bit constant load plus one
> memory load per jump, but then you don't have the I/D cache and TLB issue.

The performance hit would be even worse :-)


Laurent

Patch

diff --git a/exec-all.h b/exec-all.h
index 6516da0..662b916 100644
--- a/exec-all.h
+++ b/exec-all.h
@@ -224,26 +224,10 @@  static inline void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr)
 #elif defined(__arm__)
 static inline void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr)
 {
-#if !QEMU_GNUC_PREREQ(4, 1)
-    register unsigned long _beg __asm ("a1");
-    register unsigned long _end __asm ("a2");
-    register unsigned long _flg __asm ("a3");
-#endif
-
-    /* we could use a ldr pc, [pc, #-4] kind of branch and avoid the flush */
-    *(uint32_t *)jmp_addr =
-        (*(uint32_t *)jmp_addr & ~0xffffff)
-        | (((addr - (jmp_addr + 8)) >> 2) & 0xffffff);
-
-#if QEMU_GNUC_PREREQ(4, 1)
-    __builtin___clear_cache((char *) jmp_addr, (char *) jmp_addr + 4);
-#else
-    /* flush icache */
-    _beg = jmp_addr;
-    _end = jmp_addr + 4;
-    _flg = 0;
-    __asm __volatile__ ("swi 0x9f0002" : : "r" (_beg), "r" (_end), "r" (_flg));
-#endif
+    /* Patch the branch destination. It uses a ldr pc, [pc, #-4] kind
+       of branch, so we write the absolute address and don't need to
+       flush the icache. */
+    *(uint32_t *)jmp_addr = addr;
 }
 #elif defined(__sparc__)
 void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr);
diff --git a/exec.c b/exec.c
index 7899042..8d115ac 100644
--- a/exec.c
+++ b/exec.c
@@ -546,10 +546,6 @@  static void code_gen_alloc(unsigned long tb_size)
         start = (void *) 0x40000000UL;
         if (code_gen_buffer_size > (512 * 1024 * 1024))
             code_gen_buffer_size = (512 * 1024 * 1024);
-#elif defined(__arm__)
-        /* Keep the buffer no bigger than 16MB to branch between blocks */
-        if (code_gen_buffer_size > 16 * 1024 * 1024)
-            code_gen_buffer_size = 16 * 1024 * 1024;
 #elif defined(__s390x__)
         /* Map the buffer so that we can use direct calls and branches.  */
         /* We have a +- 4GB range on the branches; leave some slop.  */
diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index fafbd5d..e04cfa7 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -1501,14 +1501,9 @@  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_goto_tb:
         if (s->tb_jmp_offset) {
             /* Direct jump method */
-#if defined(USE_DIRECT_JUMP)
-            s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf;
-            tcg_out_b_noaddr(s, COND_AL);
-#else
             tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, -4);
             s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf;
-            tcg_out32(s, 0);
-#endif
+            tcg_out32(s, (int)s->code_ptr + 4);
         } else {
             /* Indirect jump method */
 #if 1