Patchwork [4/4] tcg/aarch64: implement tlb lookup fast path

login
register
mail settings
Submitter Jani Kokkonen
Date May 31, 2013, 6:07 p.m.
Message ID <51A8E6E2.1010704@huawei.com>
Download mbox | patch
Permalink /patch/248013/
State New
Headers show

Comments

Jani Kokkonen - May 31, 2013, 6:07 p.m.
From: Jani Kokkonen <jani.kokkonen@huawei.com>

implement the fast path for tcg_out_qemu_ld/st.

Signed-off-by: Jani Kokkonen <jani.kokkonen@huawei.com>
---
 tcg/aarch64/tcg-target.c | 161 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 157 insertions(+), 4 deletions(-)
Jani Kokkonen - June 3, 2013, 11:21 a.m.
On 5/31/2013 10:25 PM, Richard Henderson wrote:
> On 05/31/2013 11:07 AM, Jani Kokkonen wrote:
>> +/* Load and compare a TLB entry, leaving the flags set.  Leaves X2 pointing
>> +   to the tlb entry.  Clobbers X0,X1,X2,X3 and TMP.  */
>> +
>> +static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg,
>> +                             int s_bits, uint8_t **label_ptr, int tlb_offset)
>> +{
> 
> You copied the comment from ARM, and it isn't correct.  You generate branches.

I will fix the comment.
> 
>> +    TCGReg base = TCG_AREG0;
>> +
>> +    tcg_out_shr(s, 1, TCG_REG_TMP, addr_reg, TARGET_PAGE_BITS);
>> +    tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_X1, tlb_offset);
>> +    tcg_out_arith(s, ARITH_ADD, 1, TCG_REG_X2, base, TCG_REG_X1, 0);
>> +    tcg_out_andi(s, 1, TCG_REG_X0, TCG_REG_TMP, CPU_TLB_BITS, 0);
>> +    tcg_out_arith(s, ARITH_ADD, 1, TCG_REG_X2, TCG_REG_X2,
>> +                             TCG_REG_X0, -CPU_TLB_ENTRY_BITS);
>> +#if TARGET_LONG_BITS == 64
>> +    tcg_out_ldst(s, LDST_64, LDST_LD, TCG_REG_X3, TCG_REG_X2, 0);
>> +#else
>> +    tcg_out_ldst(s, LDST_32, LDST_LD, TCG_REG_X3, TCG_REG_X2, 0);
>> +#endif
>> +    /* check alignment */
>> +    if (s_bits) {
>> +        tcg_out_tst(s, 1, addr_reg, s_bits, 0);
>> +        label_ptr[0] = s->code_ptr;
>> +        tcg_out_goto_cond_noaddr(s, TCG_COND_NE);
>> +    }
>> +    tcg_out_cmp(s, 1, TCG_REG_X3, TCG_REG_TMP, -TARGET_PAGE_BITS);
>> +    label_ptr[1] = s->code_ptr;
>> +    tcg_out_goto_cond_noaddr(s, TCG_COND_NE);
> 
> I'm positive that the branch predictor would be happier with a single branch
> rather than the two you generate here.  It ought to be possible to use a
> different set of insns to do this in one go.
> 
> How about something like
> 
> 	@ extract the tlb index from the address
> 	ubfm	w0, addr_reg, TARGET_PAGE_BITS, CPU_TLB_BITS
> 
> 	@ add any "high bits" from the tlb offset
> 	@ noting that env will be much smaller than 24 bits.
> 	add	x1, env, tlb_offset & 0xfff000
> 
> 	@ zap the tlb index from the address for compare
> 	@ this is all high bits plus 0-3 low bits set, so this
> 	@ should match a logical immediate.
> 	and	w/x2, addr_reg, TARGET_PAGE_MASK | ((1 << s_bits) - 1)
> 
> 	@ merge the tlb index into the env+tlb_offset
> 	add	x1, x1, x0, lsl #3
> 
> 	@ load the tlb comparator.  the 12-bit scaled offset
> 	@ form will fit the bits remaining from above, given that
> 	@ we're loading an aligned object, and so the low 2/3 bits
> 	@ will be clear.
> 	ldr	w/x0, [x1, tlb_offset & 0xfff]
> 
> 	@ load the tlb addend.  do this early to avoid stalling.
> 	@ the addend_offset differs from tlb_offset by 1-3 words.
> 	@ given that we've got overlap between the scaled 12-bit
> 	@ value and the 12-bit shifted value above, this also ought
> 	@ to always be representable.
> 	ldr	x3, [x1, (tlb_offset & 0xfff) + (addend_offset - tlb_offset)]
> 
> 	@ perform the comparison
> 	cmp	w/x0, w/x2
> 
> 	@ generate the complete host address in parallel with the cmp.
> 	add	x3, x3, addr_reg		@ 64-bit guest
> 	add	x3, x3, addr_reg, uxtw		@ 32-bit guest
> 
> 	bne	miss_label
> 
> Note that the w/x above indicates the ext setting that ought to be used,
> depending on the address size of the guest.
> 
> This is at least 2 insns shorter than your sequence.

OK, thanks. The ubfm instruction will be added and I will modify the implementation based on your comments.
> 
> Have you looked at doing the out-of-line tlb miss sequence right from the
> very beginning?  It's not that much more difficult to accomplish than the
> inline tlb miss.

I have to look into this one.
> 
> See CONFIG_QEMU_LDST_OPTIMIZATION, and the implementation in tcg/arm.
> You won't need two nops after the call; aarch64 can do all the required
> extensions and data movement operations in a single insn.
> 
> 

I will also take this into account.

> r~
> 

-Jani
Richard Henderson - June 3, 2013, 3:52 p.m.
On 06/03/2013 04:21 AM, Jani Kokkonen wrote:
>> 	@ merge the tlb index into the env+tlb_offset
>> 	add	x1, x1, x0, lsl #3

For the record, oops.  3 should be CPU_TLB_ENTRY_BITS.


r~

Patch

diff --git a/tcg/aarch64/tcg-target.c b/tcg/aarch64/tcg-target.c
index 24b2862..47ec4a7 100644
--- a/tcg/aarch64/tcg-target.c
+++ b/tcg/aarch64/tcg-target.c
@@ -700,6 +700,36 @@  static inline void tcg_out_uxt(TCGContext *s, int s_bits,
 #ifdef CONFIG_SOFTMMU
 #include "exec/softmmu_defs.h"
 
+/* Load and compare a TLB entry, branching (via label_ptr) to the slow path
+   on miss/misalignment.  Leaves X2 pointing to the tlb entry.  Clobbers X0,X1,X2,X3 and TMP.  */
+
+static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg,
+                             int s_bits, uint8_t **label_ptr, int tlb_offset)
+{
+    TCGReg base = TCG_AREG0;
+
+    tcg_out_shr(s, 1, TCG_REG_TMP, addr_reg, TARGET_PAGE_BITS);
+    tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_X1, tlb_offset);
+    tcg_out_arith(s, ARITH_ADD, 1, TCG_REG_X2, base, TCG_REG_X1, 0);
+    tcg_out_andi(s, 1, TCG_REG_X0, TCG_REG_TMP, CPU_TLB_BITS, 0);
+    tcg_out_arith(s, ARITH_ADD, 1, TCG_REG_X2, TCG_REG_X2,
+                             TCG_REG_X0, -CPU_TLB_ENTRY_BITS);
+#if TARGET_LONG_BITS == 64
+    tcg_out_ldst(s, LDST_64, LDST_LD, TCG_REG_X3, TCG_REG_X2, 0);
+#else
+    tcg_out_ldst(s, LDST_32, LDST_LD, TCG_REG_X3, TCG_REG_X2, 0);
+#endif
+    /* check alignment; branch to slow path if any low bit below s_bits set */
+    if (s_bits) {
+        tcg_out_tst(s, 1, addr_reg, s_bits, 0);
+        label_ptr[0] = s->code_ptr;
+        tcg_out_goto_cond_noaddr(s, TCG_COND_NE);
+    }
+    tcg_out_cmp(s, 1, TCG_REG_X3, TCG_REG_TMP, -TARGET_PAGE_BITS);
+    label_ptr[1] = s->code_ptr;
+    tcg_out_goto_cond_noaddr(s, TCG_COND_NE);
+}
+
 /* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
    int mmu_idx) */
 static const void * const qemu_ld_helpers[4] = {
@@ -723,18 +753,85 @@  static const void * const qemu_st_helpers[4] = {
 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
 {
     TCGReg addr_reg, data_reg;
+    bool bswap;
 #ifdef CONFIG_SOFTMMU
     int mem_index, s_bits;
+    int i;
+    uint8_t *label_ptr[2] = { NULL };
+    uint8_t *label_ptr2;
 #endif
     data_reg = args[0];
     addr_reg = args[1];
+#ifdef TARGET_WORDS_BIGENDIAN
+    bswap = 1;
+#else
+    bswap = 0;
+#endif
 
 #ifdef CONFIG_SOFTMMU
     mem_index = args[2];
     s_bits = opc & 3;
 
-    /* TODO: insert TLB lookup here */
+    tcg_out_tlb_read(s, addr_reg, s_bits, label_ptr,
+                     offsetof(CPUArchState, tlb_table[mem_index][0].addr_read));
 
+    tcg_out_ldst(s, LDST_64, LDST_LD, TCG_REG_X1, TCG_REG_X2,
+             offsetof(CPUTLBEntry, addend) - offsetof(CPUTLBEntry, addr_read));
+    switch (opc) {
+    case 0:
+        tcg_out_ldst_r(s, LDST_8, LDST_LD, data_reg, addr_reg, TCG_REG_X1);
+        break;
+    case 0 | 4:
+        tcg_out_ldst_r(s, LDST_8, LDST_LD_S_X, data_reg, addr_reg, TCG_REG_X1);
+        break;
+    case 1:
+        tcg_out_ldst_r(s, LDST_16, LDST_LD, data_reg, addr_reg, TCG_REG_X1);
+        if (bswap) {
+            tcg_out_rev16(s, 1, data_reg, data_reg);
+        }
+        break;
+    case 1 | 4:
+        if (bswap) {
+            tcg_out_ldst_r(s, LDST_16, LDST_LD, data_reg, addr_reg, TCG_REG_X1);
+            tcg_out_rev16(s, 1, data_reg, data_reg);
+            tcg_out_sxt(s, 1, s_bits, data_reg, data_reg);
+        } else {
+            tcg_out_ldst_r(s, LDST_16, LDST_LD_S_X,
+                           data_reg, addr_reg, TCG_REG_X1);
+        }
+        break;
+    case 2:
+        tcg_out_ldst_r(s, LDST_32, LDST_LD, data_reg, addr_reg, TCG_REG_X1);
+        if (bswap) {
+            tcg_out_rev32(s, data_reg, data_reg);
+        }
+        break;
+    case 2 | 4:
+        if (bswap) {
+            tcg_out_ldst_r(s, LDST_32, LDST_LD, data_reg, addr_reg, TCG_REG_X1);
+            tcg_out_rev32(s, data_reg, data_reg);
+            tcg_out_sxt(s, 1, s_bits, data_reg, data_reg);
+        } else {
+            tcg_out_ldst_r(s, LDST_32, LDST_LD_S_X,
+                           data_reg, addr_reg, TCG_REG_X1);
+        }
+        break;
+    case 3:
+        tcg_out_ldst_r(s, LDST_64, LDST_LD, data_reg, addr_reg, TCG_REG_X1);
+         if (bswap) {
+            tcg_out_rev(s, 1, data_reg, data_reg);
+        }
+        break;
+    default:
+        tcg_abort();
+    }
+    label_ptr2 = s->code_ptr;
+    tcg_out_goto_noaddr(s);
+    for (i = 0; i < 2; i++) {
+        if (label_ptr[i]) {
+            reloc_pc19(label_ptr[i], (tcg_target_long)s->code_ptr);
+        }
+    }
     /* all arguments passed via registers */
     tcg_out_movr(s, 1, TCG_REG_X0, TCG_AREG0);
     tcg_out_movr(s, (TARGET_LONG_BITS == 64), TCG_REG_X1, addr_reg);
@@ -748,7 +845,7 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
     } else {
         tcg_out_movr(s, 1, data_reg, TCG_REG_X0);
     }
-
+    reloc_pc26(label_ptr2, (tcg_target_long)s->code_ptr);
 #else /* !CONFIG_SOFTMMU */
     tcg_abort(); /* TODO */
 #endif
@@ -757,8 +854,17 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
 {
     TCGReg addr_reg, data_reg;
+    bool bswap;
 #ifdef CONFIG_SOFTMMU
     int mem_index, s_bits;
+    int i;
+    uint8_t *label_ptr[2] = { NULL };
+    uint8_t *label_ptr2;
+#endif
+#ifdef TARGET_WORDS_BIGENDIAN
+    bswap = 1;
+#else
+    bswap = 0;
 #endif
     data_reg = args[0];
     addr_reg = args[1];
@@ -767,8 +873,55 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
     mem_index = args[2];
     s_bits = opc & 3;
 
-    /* TODO: insert TLB lookup here */
+    tcg_out_tlb_read(s, addr_reg, s_bits, label_ptr,
+                  offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
 
+    tcg_out_ldst(s, LDST_64, LDST_LD, TCG_REG_X1, TCG_REG_X2,
+           offsetof(CPUTLBEntry, addend) - offsetof(CPUTLBEntry, addr_write));
+    switch (opc) {
+    case 0:
+        tcg_out_ldst_r(s, LDST_8, LDST_ST, data_reg, addr_reg, TCG_REG_X1);
+        break;
+    case 1:
+        if (bswap) {
+            tcg_out_rev16(s, 1, TCG_REG_X0, data_reg);
+            tcg_out_ldst_r(s, LDST_16, LDST_ST, TCG_REG_X0,
+                      addr_reg, TCG_REG_X1);
+        } else {
+            tcg_out_ldst_r(s, LDST_16, LDST_ST, data_reg,
+                      addr_reg, TCG_REG_X1);
+        }
+        break;
+    case 2:
+        if (bswap) {
+            tcg_out_rev32(s, TCG_REG_X0, data_reg);
+            tcg_out_ldst_r(s, LDST_32, LDST_ST, TCG_REG_X0,
+                       addr_reg, TCG_REG_X1);
+        } else {
+            tcg_out_ldst_r(s, LDST_32, LDST_ST, data_reg,
+                       addr_reg, TCG_REG_X1);
+        }
+        break;
+    case 3:
+        if (bswap) {
+            tcg_out_rev(s, 1, TCG_REG_X0, data_reg);
+            tcg_out_ldst_r(s, LDST_64, LDST_ST, TCG_REG_X0,
+                       addr_reg, TCG_REG_X1);
+        } else {
+            tcg_out_ldst_r(s, LDST_64, LDST_ST, data_reg,
+                       addr_reg, TCG_REG_X1);
+        }
+        break;
+    default:
+        tcg_abort();
+    }
+    label_ptr2 = s->code_ptr;
+    tcg_out_goto_noaddr(s);
+    for (i = 0; i < 2; i++) {
+        if (label_ptr[i]) {
+            reloc_pc19(label_ptr[i], (tcg_target_long)s->code_ptr);
+        }
+    }
     /* all arguments passed via registers */
     tcg_out_movr(s, 1, TCG_REG_X0, TCG_AREG0);
     tcg_out_movr(s, (TARGET_LONG_BITS == 64), TCG_REG_X1, addr_reg);
@@ -777,7 +930,7 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
     tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP,
                  (tcg_target_long)qemu_st_helpers[s_bits]);
     tcg_out_callr(s, TCG_REG_TMP);
-
+    reloc_pc26(label_ptr2, (tcg_target_long)s->code_ptr);
 #else /* !CONFIG_SOFTMMU */
     tcg_abort(); /* TODO */
 #endif