diff mbox

[v2,2/3] tcg/aarch64: Use ADRP+ADD to compute target address

Message ID 20170629075243.26984-3-bobby.prani@gmail.com
State New
Headers show

Commit Message

Pranith Kumar June 29, 2017, 7:52 a.m. UTC
We use ADRP+ADD to compute the target address for goto_tb. This patch
introduces the NOP instruction which is used to align the above
instruction pair so that we can use one atomic instruction to patch
the destination offsets.

CC: Richard Henderson <rth@twiddle.net>
CC: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
---
 accel/tcg/translate-all.c    |  2 +-
 tcg/aarch64/tcg-target.inc.c | 26 +++++++++++++++++++++-----
 2 files changed, 22 insertions(+), 6 deletions(-)

Comments

Richard Henderson June 29, 2017, 4:36 p.m. UTC | #1
On 06/29/2017 12:52 AM, Pranith Kumar wrote:
> We use ADRP+ADD to compute the target address for goto_tb. This patch
> introduces the NOP instruction which is used to align the above
> instruction pair so that we can use one atomic instruction to patch
> the destination offsets.
> 
> CC: Richard Henderson <rth@twiddle.net>
> CC: Alex Bennée <alex.bennee@linaro.org>
> Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
> ---
>   accel/tcg/translate-all.c    |  2 +-
>   tcg/aarch64/tcg-target.inc.c | 26 +++++++++++++++++++++-----
>   2 files changed, 22 insertions(+), 6 deletions(-)
> 
> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
> index f6ad46b613..b6d122e087 100644
> --- a/accel/tcg/translate-all.c
> +++ b/accel/tcg/translate-all.c
> @@ -522,7 +522,7 @@ static inline PageDesc *page_find(tb_page_addr_t index)
>   #elif defined(__powerpc__)
>   # define MAX_CODE_GEN_BUFFER_SIZE  (32u * 1024 * 1024)
>   #elif defined(__aarch64__)
> -# define MAX_CODE_GEN_BUFFER_SIZE  (128ul * 1024 * 1024)
> +# define MAX_CODE_GEN_BUFFER_SIZE  (3ul * 1024 * 1024 * 1024)

The max is 2GB, because the 4GB range of ADRP is signed.
The end of the buffer must be able to address the beginning of the buffer.

>   void aarch64_tb_set_jmp_target(uintptr_t jmp_addr, uintptr_t addr)
>   {
>       tcg_insn_unit *code_ptr = (tcg_insn_unit *)jmp_addr;
> -    tcg_insn_unit *target = (tcg_insn_unit *)addr;
> +    tcg_insn_unit adrp_insn = *code_ptr++;
> +    tcg_insn_unit addi_insn = *code_ptr;
>   
> -    reloc_pc26_atomic(code_ptr, target);
> -    flush_icache_range(jmp_addr, jmp_addr + 4);
> +    ptrdiff_t offset = (addr >> 12) - (jmp_addr >> 12);
> +
> +    /* patch ADRP */
> +    adrp_insn = deposit32(adrp_insn, 29, 2, offset & 0x3);
> +    adrp_insn = deposit32(adrp_insn, 5, 19, offset >> 2);
> +    /* patch ADDI */
> +    addi_insn = deposit32(addi_insn, 10, 12, addr & 0xfff);
> +    atomic_set((uint64_t *)jmp_addr, adrp_insn | ((uint64_t)addi_insn << 32));
> +    flush_icache_range(jmp_addr, jmp_addr + 8);

(1) You don't need to load the ADRP and ADDI insns, because you know exactly 
what they are.  (2) You should check to see if the branch is within 128MB and 
use a direct branch in that case; it will happen quite often.  See the ppc64 
ppc_tb_set_jmp_target for an example.

>           /* actual branch destination will be patched by
> +           aarch64_tb_set_jmp_target later, beware of retranslation */

We can actually drop the retranslation comment now; that's a cleanup that 
should be applied to all of the backends...

> +        tcg_out_insn(s, 3406, ADRP, TCG_REG_TMP, 0);
> +        tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_TMP, TCG_REG_TMP, 0);
> +        tcg_out_callr(s, TCG_REG_TMP);

Don't use callr, use BR like you did for goto_long in the previous patch.


r~
diff mbox

Patch

diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index f6ad46b613..b6d122e087 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -522,7 +522,7 @@  static inline PageDesc *page_find(tb_page_addr_t index)
 #elif defined(__powerpc__)
 # define MAX_CODE_GEN_BUFFER_SIZE  (32u * 1024 * 1024)
 #elif defined(__aarch64__)
-# define MAX_CODE_GEN_BUFFER_SIZE  (128ul * 1024 * 1024)
+# define MAX_CODE_GEN_BUFFER_SIZE  (3ul * 1024 * 1024 * 1024)
 #elif defined(__s390x__)
   /* We have a +- 4GB range on the branches; leave some slop.  */
 # define MAX_CODE_GEN_BUFFER_SIZE  (3ul * 1024 * 1024 * 1024)
diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c
index 8fce11ace7..b7670ecc90 100644
--- a/tcg/aarch64/tcg-target.inc.c
+++ b/tcg/aarch64/tcg-target.inc.c
@@ -372,6 +372,7 @@  typedef enum {
     I3510_EON       = 0x4a200000,
     I3510_ANDS      = 0x6a000000,
 
+    NOP             = 0xd503201f,
     /* System instructions.  */
     DMB_ISH         = 0xd50338bf,
     DMB_LD          = 0x00000100,
@@ -866,10 +867,18 @@  static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *target)
 void aarch64_tb_set_jmp_target(uintptr_t jmp_addr, uintptr_t addr)
 {
     tcg_insn_unit *code_ptr = (tcg_insn_unit *)jmp_addr;
-    tcg_insn_unit *target = (tcg_insn_unit *)addr;
+    tcg_insn_unit adrp_insn = *code_ptr++;
+    tcg_insn_unit addi_insn = *code_ptr;
 
-    reloc_pc26_atomic(code_ptr, target);
-    flush_icache_range(jmp_addr, jmp_addr + 4);
+    ptrdiff_t offset = (addr >> 12) - (jmp_addr >> 12);
+
+    /* patch ADRP */
+    adrp_insn = deposit32(adrp_insn, 29, 2, offset & 0x3);
+    adrp_insn = deposit32(adrp_insn, 5, 19, offset >> 2);
+    /* patch ADDI */
+    addi_insn = deposit32(addi_insn, 10, 12, addr & 0xfff);
+    atomic_set((uint64_t *)jmp_addr, adrp_insn | ((uint64_t)addi_insn << 32));
+    flush_icache_range(jmp_addr, jmp_addr + 8);
 }
 
 static inline void tcg_out_goto_label(TCGContext *s, TCGLabel *l)
@@ -1388,10 +1397,17 @@  static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 #endif
         /* consistency for USE_DIRECT_JUMP */
         tcg_debug_assert(s->tb_jmp_insn_offset != NULL);
+        /* Ensure that ADRP+ADD are 8-byte aligned so that an atomic
+           write can be used to patch the target address. */
+        if ((uintptr_t)s->code_ptr & 7) {
+            tcg_out32(s, NOP);
+        }
         s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
         /* actual branch destination will be patched by
-           aarch64_tb_set_jmp_target later, beware retranslation. */
-        tcg_out_goto_noaddr(s);
+           aarch64_tb_set_jmp_target later, beware of retranslation */
+        tcg_out_insn(s, 3406, ADRP, TCG_REG_TMP, 0);
+        tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_TMP, TCG_REG_TMP, 0);
+        tcg_out_callr(s, TCG_REG_TMP);
         s->tb_jmp_reset_offset[a0] = tcg_current_code_size(s);
         break;