Patchwork [RFC,3/4] tcg: add optimized TCG qemu_ld/st generation

login
register
mail settings
Submitter YeongKyoon Lee
Date July 4, 2012, 6:01 a.m.
Message ID <25206019.11171341381666158.JavaMail.weblogic@epml28>
Download mbox | patch
Permalink /patch/168904/
State New
Headers show

Comments

YeongKyoon Lee - July 4, 2012, 6:01 a.m.
Add optimized TCG qemu_ld/st generation which generates the code for TLB miss case handling at the end of TB after generating other IRs.

Signed-off-by: Yeongkyoon Lee 

---
tcg/i386/tcg-target.c |  328 +++++++++++++++++++++++++++++++++++++++++++++++++
tcg/tcg.c             |   12 ++
tcg/tcg.h             |   35 +++++
3 files changed, 375 insertions(+), 0 deletions(-)

__________________________________
Principal Engineer 
VM Team 
Yeongkyoon Lee 

S-Core Co., Ltd.
D.L.: +82-31-696-7249
M.P.: +82-10-9965-1265

Patch

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c

index da17bba..3f2f640 100644

--- a/tcg/i386/tcg-target.c

+++ b/tcg/i386/tcg-target.c

@@ -984,6 +984,8 @@  static const void *qemu_st_helpers[4] = {

     helper_stq_mmu,
};
#else
+

+#ifndef CONFIG_QEMU_LDST_OPTIMIZATION

/* legacy helper signature: __ld_mmu(target_ulong addr, int
    mmu_idx) */
static void *qemu_ld_helpers[4] = {
@@ -1001,6 +1003,35 @@  static void *qemu_st_helpers[4] = {

     __stl_mmu,
     __stq_mmu,
};
+#else

+/* extended legacy helper signature: __ext_ld_mmu(target_ulong addr, int

+   mmu_idx, uintptr_t raddr) */

+static void *qemu_ld_helpers[4] = {

+    __ext_ldb_mmu,

+    __ext_ldw_mmu,

+    __ext_ldl_mmu,

+    __ext_ldq_mmu,

+};

+

+/* extended legacy helper signature: __ext_st_mmu(target_ulong addr, uintxx_t val,

+   int mmu_idx, uintptr_t raddr) */

+static void *qemu_st_helpers[4] = {

+    __ext_stb_mmu,

+    __ext_stw_mmu,

+    __ext_stl_mmu,

+    __ext_stq_mmu,

+};

+

+static void add_qemu_ldst_label(TCGContext *s,

+                                int opc_ext,

+                                int data_reg,

+                                int data_reg2,

+                                int addrlo_reg,

+                                int addrhi_reg,

+                                int mem_index,

+                                uint8_t *raddr,

+                                uint8_t **label_ptr);

+#endif  /* !CONFIG_QEMU_LDST_OPTIMIZATION */

#endif

/* Perform the TLB load and compare.
@@ -1061,19 +1092,36 @@  static inline void tcg_out_tlb_load(TCGContext *s, int addrlo_idx,


     tcg_out_mov(s, type, r0, addrlo);

+#ifdef CONFIG_QEMU_LDST_OPTIMIZATION

+    /* jne slow_path */

+    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);

+    if (!label_ptr) {

+        tcg_abort();

+    }

+    label_ptr[0] = s->code_ptr;

+    s->code_ptr += 4;

+#else

     /* jne label1 */
     tcg_out8(s, OPC_JCC_short + JCC_JNE);
     label_ptr[0] = s->code_ptr;
     s->code_ptr++;
+#endif


     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
         /* cmp 4(r1), addrhi */
         tcg_out_modrm_offset(s, OPC_CMP_GvEv, args[addrlo_idx+1], r1, 4);

+#ifdef CONFIG_QEMU_LDST_OPTIMIZATION

+        /* jne slow_path */

+        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);

+        label_ptr[1] = s->code_ptr;

+        s->code_ptr += 4;

+#else

         /* jne label1 */
         tcg_out8(s, OPC_JCC_short + JCC_JNE);
         label_ptr[1] = s->code_ptr;
         s->code_ptr++;
+#endif

     }

     /* TLB Hit.  */
@@ -1171,11 +1219,13 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,

     int addrlo_idx;
#if defined(CONFIG_SOFTMMU)
     int mem_index, s_bits;
+#if !defined(CONFIG_QEMU_LDST_OPTIMIZATION)

#if TCG_TARGET_REG_BITS == 64
     int arg_idx;
#else
     int stack_adjust;
#endif
+#endif  /* !CONFIG_QEMU_LDST_OPTIMIZATION */

     uint8_t *label_ptr[3];
#endif

@@ -1197,6 +1247,18 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,

     tcg_out_qemu_ld_direct(s, data_reg, data_reg2,
                            tcg_target_call_iarg_regs[0], 0, opc);

+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)

+    /* helper stub will be jumped back here */

+    add_qemu_ldst_label(s,

+                        opc,

+                        data_reg,

+                        data_reg2,

+                        args[addrlo_idx],

+                        args[addrlo_idx + 1],

+                        mem_index,

+                        s->code_ptr,

+                        label_ptr);

+#else

     /* jmp label2 */
     tcg_out8(s, OPC_JMP_short);
     label_ptr[2] = s->code_ptr;
@@ -1292,6 +1354,7 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,


     /* label2: */
     *label_ptr[2] = s->code_ptr - label_ptr[2] - 1;
+#endif  /* CONFIG_QEMU_LDST_OPTIMIZATION */

#else
     {
         int32_t offset = GUEST_BASE;
@@ -1385,7 +1448,9 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,

     int addrlo_idx;
#if defined(CONFIG_SOFTMMU)
     int mem_index, s_bits;
+#if !defined(CONFIG_QEMU_LDST_OPTIMIZATION)

     int stack_adjust;
+#endif

     uint8_t *label_ptr[3];
#endif

@@ -1407,6 +1472,18 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,

     tcg_out_qemu_st_direct(s, data_reg, data_reg2,
                            tcg_target_call_iarg_regs[0], 0, opc);

+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)

+    /* helper stub will be jumped back here */

+    add_qemu_ldst_label(s,

+                        opc | HL_ST_MASK,

+                        data_reg,

+                        data_reg2,

+                        args[addrlo_idx],

+                        args[addrlo_idx + 1],

+                        mem_index,

+                        s->code_ptr,

+                        label_ptr);

+#else

     /* jmp label2 */
     tcg_out8(s, OPC_JMP_short);
     label_ptr[2] = s->code_ptr;
@@ -1469,6 +1546,7 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,


     /* label2: */
     *label_ptr[2] = s->code_ptr - label_ptr[2] - 1;
+#endif  /* CONFIG_QEMU_LDST_OPTIMIZATION */

#else
     {
         int32_t offset = GUEST_BASE;
@@ -1496,6 +1574,256 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,

#endif
}

+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)

+/* optimization to reduce jump overheads for qemu_ld/st IRs */

+

+/*

+ * qemu_ld/st code generators call add_qemu_ldst_label,

+ * so that slow case(TLB miss or I/O rw) is handled at the end of TB

+ */

+static void add_qemu_ldst_label(TCGContext *s,

+                                int opc_ext,

+                                int data_reg,

+                                int data_reg2,

+                                int addrlo_reg,

+                                int addrhi_reg,

+                                int mem_index,

+                                uint8_t *raddr,

+                                uint8_t **label_ptr)

+{

+    int idx;

+    TCGLabelQemuLdst *label;

+

+    if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST)

+        tcg_abort();

+

+    idx = s->nb_qemu_ldst_labels++;

+    label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[idx];

+    label->opc_ext = opc_ext;

+    label->datalo_reg = data_reg;

+    label->datahi_reg = data_reg2;

+    label->addrlo_reg = addrlo_reg;

+    label->addrhi_reg = addrhi_reg;

+    label->mem_index = mem_index;

+    label->raddr = raddr;

+    if (!label_ptr) {

+        tcg_abort();

+    }

+    label->label_ptr[0] = label_ptr[0];

+    label->label_ptr[1] = label_ptr[1];

+}

+

+/* generates slow case of qemu_ld at the end of TB */

+static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *label)

+{

+    int s_bits;

+    int opc = label->opc_ext & HL_OPC_MASK;

+    int mem_index = label->mem_index;

+#if TCG_TARGET_REG_BITS == 64

+    int arg_idx;

+#else

+    int stack_adjust;

+    int addrlo_reg = label->addrlo_reg;

+    int addrhi_reg = label->addrhi_reg;

+#endif

+    int data_reg = label->datalo_reg;

+    int data_reg2 = label->datahi_reg;

+    uint8_t *raddr = label->raddr;

+    uint8_t **label_ptr = &label->label_ptr[0];

+

+    s_bits = opc & 3;

+

+    /* resolve label address */

+    *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);

+    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {

+        *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);

+    }

+

+    /* extended helper signature: __ext_ld_mmu(target_ulong addr, int mmu_idx,

+       uintptr_t raddr) */

+#if TCG_TARGET_REG_BITS == 32

+    tcg_out_pushi(s, (uintptr_t)(raddr - 1)); /* return address */

+    stack_adjust = 4;

+    tcg_out_pushi(s, mem_index);        /* mmu index */

+    stack_adjust += 4;

+    if (TARGET_LONG_BITS == 64) {

+        tcg_out_push(s, addrhi_reg);

+        stack_adjust += 4;

+    }

+    tcg_out_push(s, addrlo_reg); /* guest addr */

+    stack_adjust += 4;

+#ifdef CONFIG_TCG_PASS_AREG0

+    tcg_out_push(s, TCG_AREG0);

+    stack_adjust += 4;

+#endif

+#else

+    /* The first argument is already loaded with addrlo.  */

+    arg_idx = 1;

+    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx++],

+                 mem_index);

+    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx++],

+                 (uintptr_t)(raddr - 1));

+#ifdef CONFIG_TCG_PASS_AREG0

+    /* XXX/FIXME: suboptimal */

+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],

+                tcg_target_call_iarg_regs[2]);

+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],

+                tcg_target_call_iarg_regs[1]);

+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1],

+                tcg_target_call_iarg_regs[0]);

+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0],

+                TCG_AREG0);

+#endif

+#endif

+

+    tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);

+

+#if TCG_TARGET_REG_BITS == 32

+    if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {

+        /* Pop and discard.  This is 2 bytes smaller than the add.  */

+        tcg_out_pop(s, TCG_REG_ECX);

+    } else if (stack_adjust != 0) {

+        tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);

+    }

+#endif

+

+    switch(opc) {

+    case 0 | 4:

+        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);

+        break;

+    case 1 | 4:

+        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);

+        break;

+    case 0:

+        tcg_out_ext8u(s, data_reg, TCG_REG_EAX);

+        break;

+    case 1:

+        tcg_out_ext16u(s, data_reg, TCG_REG_EAX);

+        break;

+    case 2:

+        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);

+        break;

+#if TCG_TARGET_REG_BITS == 64

+    case 2 | 4:

+        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);

+        break;

+#endif

+    case 3:

+        if (TCG_TARGET_REG_BITS == 64) {

+            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);

+        } else if (data_reg == TCG_REG_EDX) {

+            /* xchg %edx, %eax */

+            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);

+            tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EAX);

+        } else {

+            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);

+            tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EDX);

+        }

+        break;

+    default:

+        tcg_abort();

+    }

+

+    /* jump back to original code */

+    tcg_out_jmp(s, (tcg_target_long) raddr);

+}

+

+/* generates slow case of qemu_st at the end of TB */

+static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *label)

+{

+    int s_bits;

+    int stack_adjust;

+    int opc = label->opc_ext & HL_OPC_MASK;

+    int mem_index = label->mem_index;

+    int data_reg = label->datalo_reg;

+#if TCG_TARGET_REG_BITS == 32

+    int data_reg2 = label->datahi_reg;

+    int addrlo_reg = label->addrlo_reg;

+    int addrhi_reg = label->addrhi_reg;

+#endif

+    uint8_t *raddr = label->raddr;

+    uint8_t **label_ptr = &label->label_ptr[0];

+

+    s_bits = opc & 3;

+

+    /* resolve label address */

+    *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);

+    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {

+        *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);

+    }

+

+    /* extended helper signature: __ext_st_mmu(target_ulong addr, uintxx_t val,

+       int mmu_idx, uintptr_t raddr) */

+#if TCG_TARGET_REG_BITS == 32

+    tcg_out_pushi(s, (uintptr_t)(raddr - 1)); /* return address */

+    stack_adjust = 4;

+    tcg_out_pushi(s, mem_index); /* mmu index */

+    stack_adjust += 4;

+    if (opc == 3) {

+        tcg_out_push(s, data_reg2);

+        stack_adjust += 4;

+    }

+    tcg_out_push(s, data_reg);   /* guest data */

+    stack_adjust += 4;

+    if (TARGET_LONG_BITS == 64) {

+        tcg_out_push(s, addrhi_reg);

+        stack_adjust += 4;

+    }

+    tcg_out_push(s, addrlo_reg); /* guest addr */

+    stack_adjust += 4;

+#ifdef CONFIG_TCG_PASS_AREG0

+    tcg_out_push(s, TCG_AREG0);

+    stack_adjust += 4;

+#endif

+#else

+    tcg_out_mov(s, (opc == 3 ? TCG_TYPE_I64 : TCG_TYPE_I32),

+                tcg_target_call_iarg_regs[1], data_reg);

+    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], mem_index);

+    tcg_out_movi(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3], (uintptr_t)(raddr - 1));

+    stack_adjust = 0;

+#ifdef CONFIG_TCG_PASS_AREG0

+    /* XXX/FIXME: suboptimal */

+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],

+                tcg_target_call_iarg_regs[2]);

+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],

+                tcg_target_call_iarg_regs[1]);

+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1],

+                tcg_target_call_iarg_regs[0]);

+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0],

+                TCG_AREG0);

+#endif

+#endif

+

+    tcg_out_calli(s, (tcg_target_long)qemu_st_helpers[s_bits]);

+

+    if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {

+        /* Pop and discard.  This is 2 bytes smaller than the add.  */

+        tcg_out_pop(s, TCG_REG_ECX);

+    } else if (stack_adjust != 0) {

+        tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);

+    }

+

+    /* jump back to original code */

+    tcg_out_jmp(s, (tcg_target_long) raddr);

+}

+

+/* generates all of the slow cases of qemu_ld/st at the end of TB */

+void tcg_out_qemu_ldst_slow_path(TCGContext *s)

+{

+    int i;

+    TCGLabelQemuLdst *label;

+

+    for (i = 0; i < s->nb_qemu_ldst_labels; i++) {

+        label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[i];

+        if (IS_QEMU_LD_LABEL(label)) {

+            tcg_out_qemu_ld_slow_path(s, label);

+        } else {

+            tcg_out_qemu_st_slow_path(s, label);

+        }

+    }

+}

+#endif  /* CONFIG_QEMU_LDST_OPTIMIZATION */

+

static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                               const TCGArg *args, const int *const_args)
{
diff --git a/tcg/tcg.c b/tcg/tcg.c

index 8386b70..c33cb96 100644

--- a/tcg/tcg.c

+++ b/tcg/tcg.c

@@ -301,6 +301,14 @@  void tcg_func_start(TCGContext *s)


     gen_opc_ptr = gen_opc_buf;
     gen_opparam_ptr = gen_opparam_buf;
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)

+    /* initialize qemu_ld/st labels which help to generate TLB miss case codes at the end of TB */

+    s->qemu_ldst_labels = tcg_malloc(sizeof(TCGLabelQemuLdst) * TCG_MAX_QEMU_LDST);

+    if (!s->qemu_ldst_labels) {

+        tcg_abort();

+    }

+    s->nb_qemu_ldst_labels = 0;

+#endif

}

static inline void tcg_temp_alloc(TCGContext *s, int n)
@@ -2169,6 +2177,10 @@  static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,

#endif
     }
  the_end:
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)

+    /* Generate MMU call helpers at the end of block (currently only for qemu_ld/st) */

+    tcg_out_qemu_ldst_slow_path(s);

+#endif

     return -1;
}

diff --git a/tcg/tcg.h b/tcg/tcg.h

index d710694..e52d3a4 100644

--- a/tcg/tcg.h

+++ b/tcg/tcg.h

@@ -187,6 +187,29 @@  typedef tcg_target_ulong TCGArg;

    are aliases for target_ulong and host pointer sized values respectively.
  */

+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)

+/* Macros and structures for qemu_ld/st IR code optimization:

+   It looks good for TCG_MAX_QEMU_LDST to be half of OPC_BUF_SIZE in exec-all.h. */

+#define TCG_MAX_QEMU_LDST       320

+#define HL_LDST_SHIFT           4

+#define HL_LDST_MASK            (1 << HL_LDST_SHIFT)

+#define HL_ST_MASK              HL_LDST_MASK

+#define HL_OPC_MASK             (HL_LDST_MASK - 1)

+#define IS_QEMU_LD_LABEL(L)     (!((L)->opc_ext & HL_LDST_MASK))

+#define IS_QEMU_ST_LABEL(L)     ((L)->opc_ext & HL_LDST_MASK)

+

+typedef struct TCGLabelQemuLdst {

+    int opc_ext;                /* | 27bit (reserved) | 1bit (ld/st flag) | 4bit (opc) | */

+    int addrlo_reg;             /* reg index for the low word of guest virtual address */

+    int addrhi_reg;             /* reg index for the high word of guest virtual address */

+    int datalo_reg;             /* reg index for the low word to be loaded or to be stored */

+    int datahi_reg;             /* reg index for the high word to be loaded or to be stored */

+    int mem_index;              /* soft MMU memory index */

+    uint8_t *raddr;             /* return address (located end of TB) */

+    uint8_t *label_ptr[2];      /* label pointers to be updated */

+} TCGLabelQemuLdst;

+#endif  /* CONFIG_QEMU_LDST_OPTIMIZATION */

+

#ifdef CONFIG_DEBUG_TCG
#define DEBUG_TCGV 1
#endif
@@ -389,6 +412,13 @@  struct TCGContext {

#ifdef CONFIG_DEBUG_TCG
     int temps_in_use;
#endif
+

+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)

+    /* labels info for qemu_ld/st IRs

+       The labels help to generate TLB miss case codes at the end of TB */

+    TCGLabelQemuLdst *qemu_ldst_labels;

+    int nb_qemu_ldst_labels;

+#endif

};

extern TCGContext tcg_ctx;
@@ -588,3 +618,8 @@  extern uint8_t code_gen_prologue[];

#endif

void tcg_register_jit(void *buf, size_t buf_size);
+

+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)

+/* qemu_ld/st generation at the end of TB */

+void tcg_out_qemu_ldst_slow_path(TCGContext *s);

+#endif