diff mbox

[v3,20/25] tcg: Save insn data and use it in cpu_restore_state_from_tb

Message ID 1442953507-4074-21-git-send-email-rth@twiddle.net
State New
Headers show

Commit Message

Richard Henderson Sept. 22, 2015, 8:25 p.m. UTC
We can now restore state without retranslation.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 include/exec/exec-all.h |   1 +
 tcg/tcg.c               |  40 ++++++++-----
 tcg/tcg.h               |   4 +-
 translate-all.c         | 149 +++++++++++++++++++++++++++++++++++-------------
 4 files changed, 139 insertions(+), 55 deletions(-)

Comments

Peter Maydell Sept. 23, 2015, 7:20 p.m. UTC | #1
On 22 September 2015 at 13:25, Richard Henderson <rth@twiddle.net> wrote:
> We can now restore state without retranslation.
>
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  include/exec/exec-all.h |   1 +
>  tcg/tcg.c               |  40 ++++++++-----
>  tcg/tcg.h               |   4 +-
>  translate-all.c         | 149 +++++++++++++++++++++++++++++++++++-------------
>  4 files changed, 139 insertions(+), 55 deletions(-)

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>

thanks
-- PMM
Aurelien Jarno Sept. 25, 2015, 9:10 p.m. UTC | #2
On 2015-09-22 13:25, Richard Henderson wrote:
> We can now restore state without retranslation.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  include/exec/exec-all.h |   1 +
>  tcg/tcg.c               |  40 ++++++++-----
>  tcg/tcg.h               |   4 +-
>  translate-all.c         | 149 +++++++++++++++++++++++++++++++++++-------------
>  4 files changed, 139 insertions(+), 55 deletions(-)
> 
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index 6a69802..402dd87 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -199,6 +199,7 @@ struct TranslationBlock {
>  #define CF_USE_ICOUNT  0x20000
>  
>      void *tc_ptr;    /* pointer to the translated code */
> +    uint8_t *tc_search;  /* pointer to search data */
>      /* next matching tb for physical address. */
>      struct TranslationBlock *phys_hash_next;
>      /* original tb when cflags has CF_NOCACHE */
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index bdb83d9..a0fce5b 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -2294,7 +2294,7 @@ static inline int tcg_gen_code_common(TCGContext *s,
>                                        tcg_insn_unit *gen_code_buf,
>                                        long search_pc)
>  {
> -    int i, oi, oi_next;
> +    int i, oi, oi_next, num_insns;
>  
>  #ifdef DEBUG_DISAS
>      if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP))) {
> @@ -2338,6 +2338,7 @@ static inline int tcg_gen_code_common(TCGContext *s,
>  
>      tcg_out_tb_init(s);
>  
> +    num_insns = -1;
>      for (oi = s->gen_first_op_idx; oi >= 0; oi = oi_next) {
>          TCGOp * const op = &s->gen_op_buf[oi];
>          TCGArg * const args = &s->gen_opparam_buf[op->args];
> @@ -2361,6 +2362,10 @@ static inline int tcg_gen_code_common(TCGContext *s,
>              tcg_reg_alloc_movi(s, args, dead_args, sync_args);
>              break;
>          case INDEX_op_insn_start:
> +            if (num_insns >= 0) {
> +                s->gen_insn_end_off[num_insns] = tcg_current_code_size(s);
> +            }
> +            num_insns++;
>              for (i = 0; i < TARGET_INSN_START_WORDS; ++i) {
>                  target_ulong a;
>  #if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
> @@ -2368,7 +2373,7 @@ static inline int tcg_gen_code_common(TCGContext *s,
>  #else
>                  a = args[i];
>  #endif
> -                s->gen_opc_data[i] = a;
> +                s->gen_insn_data[num_insns][i] = a;
>              }
>              break;
>          case INDEX_op_discard:
> @@ -2400,6 +2405,8 @@ static inline int tcg_gen_code_common(TCGContext *s,
>          check_regs(s);
>  #endif
>      }
> +    tcg_debug_assert(num_insns >= 0);
> +    s->gen_insn_end_off[num_insns] = tcg_current_code_size(s);
>  
>      /* Generate TB finalization at the end of block */
>      tcg_out_tb_finalize(s);
> @@ -2448,24 +2455,26 @@ int tcg_gen_code_search_pc(TCGContext *s, tcg_insn_unit *gen_code_buf,
>  void tcg_dump_info(FILE *f, fprintf_function cpu_fprintf)
>  {
>      TCGContext *s = &tcg_ctx;
> -    int64_t tot;
> +    int64_t tb_count = s->tb_count;
> +    int64_t tb_div_count = tb_count ? tb_count : 1;
> +    int64_t tot = s->interm_time + s->code_time;
>  
> -    tot = s->interm_time + s->code_time;
>      cpu_fprintf(f, "JIT cycles          %" PRId64 " (%0.3f s at 2.4 GHz)\n",
>                  tot, tot / 2.4e9);
>      cpu_fprintf(f, "translated TBs      %" PRId64 " (aborted=%" PRId64 " %0.1f%%)\n", 
> -                s->tb_count, 
> -                s->tb_count1 - s->tb_count,
> -                s->tb_count1 ? (double)(s->tb_count1 - s->tb_count) / s->tb_count1 * 100.0 : 0);
> +                tb_count, s->tb_count1 - tb_count,
> +                (double)(s->tb_count1 - s->tb_count)
> +                / (s->tb_count1 ? s->tb_count1 : 1) * 100.0);
>      cpu_fprintf(f, "avg ops/TB          %0.1f max=%d\n", 
> -                s->tb_count ? (double)s->op_count / s->tb_count : 0, s->op_count_max);
> +                (double)s->op_count / tb_div_count, s->op_count_max);
>      cpu_fprintf(f, "deleted ops/TB      %0.2f\n",
> -                s->tb_count ? 
> -                (double)s->del_op_count / s->tb_count : 0);
> +                (double)s->del_op_count / tb_div_count);
>      cpu_fprintf(f, "avg temps/TB        %0.2f max=%d\n",
> -                s->tb_count ? 
> -                (double)s->temp_count / s->tb_count : 0,
> -                s->temp_count_max);
> +                (double)s->temp_count / tb_div_count, s->temp_count_max);
> +    cpu_fprintf(f, "avg host code/TB    %0.1f\n",
> +                (double)s->code_out_len / tb_div_count);
> +    cpu_fprintf(f, "avg search data/TB  %0.1f\n",
> +                (double)s->search_out_len / tb_div_count);
>      
>      cpu_fprintf(f, "cycles/op           %0.1f\n", 
>                  s->op_count ? (double)tot / s->op_count : 0);
> @@ -2473,8 +2482,11 @@ void tcg_dump_info(FILE *f, fprintf_function cpu_fprintf)
>                  s->code_in_len ? (double)tot / s->code_in_len : 0);
>      cpu_fprintf(f, "cycles/out byte     %0.1f\n", 
>                  s->code_out_len ? (double)tot / s->code_out_len : 0);
> -    if (tot == 0)
> +    cpu_fprintf(f, "cycles/search byte     %0.1f\n", 
> +                s->search_out_len ? (double)tot / s->search_out_len : 0);
> +    if (tot == 0) {
>          tot = 1;
> +    }
>      cpu_fprintf(f, "  gen_interm time   %0.1f%%\n", 
>                  (double)s->interm_time / tot * 100.0);
>      cpu_fprintf(f, "  gen_code time     %0.1f%%\n", 
> diff --git a/tcg/tcg.h b/tcg/tcg.h
> index 8fd1252..df499c6 100644
> --- a/tcg/tcg.h
> +++ b/tcg/tcg.h
> @@ -532,6 +532,7 @@ struct TCGContext {
>      int64_t del_op_count;
>      int64_t code_in_len;
>      int64_t code_out_len;
> +    int64_t search_out_len;
>      int64_t interm_time;
>      int64_t code_time;
>      int64_t la_time;
> @@ -581,7 +582,8 @@ struct TCGContext {
>      uint16_t gen_opc_icount[OPC_BUF_SIZE];
>      uint8_t gen_opc_instr_start[OPC_BUF_SIZE];
>  
> -    target_ulong gen_opc_data[TARGET_INSN_START_WORDS];
> +    uint16_t gen_insn_end_off[TCG_MAX_INSNS];
> +    target_ulong gen_insn_data[TCG_MAX_INSNS][TARGET_INSN_START_WORDS];
>  };
>  
>  extern TCGContext tcg_ctx;
> diff --git a/translate-all.c b/translate-all.c
> index 9f801ae..f6b8148 100644
> --- a/translate-all.c
> +++ b/translate-all.c
> @@ -168,61 +168,127 @@ void cpu_gen_init(void)
>      tcg_context_init(&tcg_ctx); 
>  }
>  
> +/* Encode VAL as a signed leb128 sequence at P.
> +   Return P incremented past the encoded value.  */
> +static uint8_t *encode_sleb128(uint8_t *p, target_long val)
> +{
> +    int more, byte;
> +
> +    do {
> +        byte = val & 0x7f;
> +        val >>= 7;
> +        more = !((val == 0 && (byte & 0x40) == 0)
> +                 || (val == -1 && (byte & 0x40) != 0));
> +        if (more)
> +          byte |= 0x80;

You are missing braces here.

> +        *p++ = byte;
> +    } while (more);
> +
> +    return p;
> +}
> +
> +/* Decode a signed leb128 sequence at *PP; increment *PP past the
> +   decoded value.  Return the decoded value.  */
> +static target_long decode_sleb128(uint8_t **pp)
> +{
> +    uint8_t *p = *pp;
> +    target_long val = 0;
> +    int byte, shift = 0;
> +
> +    do {
> +        byte = *p++;
> +        val |= (target_ulong)(byte & 0x7f) << shift;
> +        shift += 7;
> +    } while (byte & 0x80);
> +    if (shift < TARGET_LONG_BITS && (byte & 0x40)) {
> +        val |= -(target_ulong)1 << shift;
> +    }
> +
> +    *pp = p;
> +    return val;
> +}
> +
> +/* Encode the data collected about the instructions while compiling TB.
> +   Place the data at BLOCK, and return the number of bytes consumed.
> +
> +   The logical table consists of TARGET_INSN_START_WORDS target_ulong's,
> +   which come from the target's insn_start data, followed by a uintptr_t
> +   which comes from the host pc of the end of the code implementing the insn.
> +
> +   Each line of the table is encoded as sleb128 deltas from the previous
> +   line.  The seed for the first line is { tb->pc, 0..., tb->tc_ptr }.
> +   That is, the first column is seeded with the guest pc, the last column
> +   with the host pc, and the middle columns with zeros.  */
> +
> +static int encode_search(TranslationBlock *tb, uint8_t *block)
> +{
> +    uint8_t *p = block;
> +    int i, j, n;
> +
> +    tb->tc_search = block;
> +
> +    for (i = 0, n = tb->icount; i < n; ++i) {
> +        target_ulong prev;
> +
> +        for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
> +            if (i == 0) {
> +                prev = (j == 0 ? tb->pc : 0);
> +            } else {
> +                prev = tcg_ctx.gen_insn_data[i - 1][j];
> +            }
> +            p = encode_sleb128(p, tcg_ctx.gen_insn_data[i][j] - prev);
> +        }
> +        prev = (i == 0 ? 0 : tcg_ctx.gen_insn_end_off[i - 1]);
> +        p = encode_sleb128(p, tcg_ctx.gen_insn_end_off[i] - prev);
> +    }
> +
> +    return p - block;
> +}
> +

Given we save both the host and the guest PC in this structure, one
obvious optimization would be to skip saving data for host instructions
which cannot generate an exception. It means that all the TCG ops in this
instruction do not generate exceptions either. We can easily test that
for all TCG instructions except call by looking at the
TCG_OPF_SIDE_EFFECTS flag. For the call op, we have to look at the
TCG_CALL_NO_SIDE_EFFECTS flag, even if it doesn't necessarily mean the
helper might generate an exception.

That should significantly save space on load/store architectures. That
said, we can probably do that at a later time.

>  /* The cpu state corresponding to 'searched_pc' is restored.  */
>  static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
>                                       uintptr_t searched_pc)
>  {
> +    target_ulong data[TARGET_INSN_START_WORDS] = { tb->pc };
> +    uintptr_t host_pc = (uintptr_t)tb->tc_ptr;
>      CPUArchState *env = cpu->env_ptr;
> -    TCGContext *s = &tcg_ctx;
> -    int j;
> -    uintptr_t tc_ptr;
> +    uint8_t *p = tb->tc_search;
> +    int i, j, num_insns = tb->icount;
>  #ifdef CONFIG_PROFILER
> -    int64_t ti;
> +    int64_t ti = profile_getclock();
>  #endif
>  
> -#ifdef CONFIG_PROFILER
> -    ti = profile_getclock();
> -#endif
> -    tcg_func_start(s);
> +    if (searched_pc < host_pc) {
> +        return -1;
> +    }
>  
> -    gen_intermediate_code_pc(env, tb);
> +    /* Reconstruct the stored insn data while looking for the point at
> +       which the end of the insn exceeds the searched_pc.  */
> +    for (i = 0; i < num_insns; ++i) {
> +        for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
> +            data[j] += decode_sleb128(&p);
> +        }
> +        host_pc += decode_sleb128(&p);
> +        if (host_pc > searched_pc) {
> +            goto found;
> +        }
> +    }
> +    return -1;
>  
> + found:
>      if (tb->cflags & CF_USE_ICOUNT) {
>          assert(use_icount);
>          /* Reset the cycle counter to the start of the block.  */
> -        cpu->icount_decr.u16.low += tb->icount;
> +        cpu->icount_decr.u16.low += num_insns;
>          /* Clear the IO flag.  */
>          cpu->can_do_io = 0;
>      }
> -
> -    /* find opc index corresponding to search_pc */
> -    tc_ptr = (uintptr_t)tb->tc_ptr;
> -    if (searched_pc < tc_ptr)
> -        return -1;
> -
> -    s->tb_next_offset = tb->tb_next_offset;
> -#ifdef USE_DIRECT_JUMP
> -    s->tb_jmp_offset = tb->tb_jmp_offset;
> -    s->tb_next = NULL;
> -#else
> -    s->tb_jmp_offset = NULL;
> -    s->tb_next = tb->tb_next;
> -#endif
> -    j = tcg_gen_code_search_pc(s, (tcg_insn_unit *)tc_ptr,
> -                               searched_pc - tc_ptr);
> -    if (j < 0)
> -        return -1;
> -    /* now find start of instruction before */
> -    while (s->gen_opc_instr_start[j] == 0) {
> -        j--;
> -    }
> -    cpu->icount_decr.u16.low -= s->gen_opc_icount[j];
> -
> -    restore_state_to_opc(env, tb, s->gen_opc_data);
> +    cpu->icount_decr.u16.low -= i;
> +    restore_state_to_opc(env, tb, data);
>  
>  #ifdef CONFIG_PROFILER
> -    s->restore_time += profile_getclock() - ti;
> -    s->restore_count++;
> +    tcg_ctx.restore_time += profile_getclock() - ti;
> +    tcg_ctx.restore_count++;
>  #endif
>      return 0;
>  }
> @@ -969,7 +1035,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
>      tb_page_addr_t phys_pc, phys_page2;
>      target_ulong virt_page2;
>      tcg_insn_unit *gen_code_buf;
> -    int gen_code_size;
> +    int gen_code_size, search_size;
>  #ifdef CONFIG_PROFILER
>      int64_t ti;
>  #endif
> @@ -1025,11 +1091,13 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
>  #endif
>  
>      gen_code_size = tcg_gen_code(&tcg_ctx, gen_code_buf);
> +    search_size = encode_search(tb, (void *)gen_code_buf + gen_code_size);
>  
>  #ifdef CONFIG_PROFILER
>      tcg_ctx.code_time += profile_getclock();
>      tcg_ctx.code_in_len += tb->size;
>      tcg_ctx.code_out_len += gen_code_size;
> +    tcg_ctx.search_out_len += search_size;
>  #endif
>  
>  #ifdef DEBUG_DISAS
> @@ -1041,8 +1109,9 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
>      }
>  #endif
>  
> -    tcg_ctx.code_gen_ptr = (void *)(((uintptr_t)gen_code_buf +
> -            gen_code_size + CODE_GEN_ALIGN - 1) & ~(CODE_GEN_ALIGN - 1));
> +    tcg_ctx.code_gen_ptr = (void *)
> +        ROUND_UP((uintptr_t)gen_code_buf + gen_code_size + search_size,
> +                 CODE_GEN_ALIGN);
>  
>      /* check next page if needed */
>      virt_page2 = (pc + tb->size - 1) & TARGET_PAGE_MASK;

If you fix the coding style issue I mentioned above, you get:

Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
Richard Henderson Sept. 25, 2015, 11:05 p.m. UTC | #3
On 09/25/2015 02:10 PM, Aurelien Jarno wrote:
>> +        if (more)
>> +          byte |= 0x80;
> 
> You are missing braces here.

Gah.  I thought I fixed that...

> Given we save both the host and the guest PC in this structure, one
> obvious optimization would be to skip saving data for host instructions
> which cannot generate an exception. It means that all the TCG ops in this
> instruction do not generate exceptions either. We can easily test that
> for all TCG instructions except call by looking at the
> TCG_OPF_SIDE_EFFECTS flag. For the call op, we have to look at the
> TCG_CALL_NO_SIDE_EFFECTS flag, even if it doesn't necessarily mean the
> helper might generate an exception.
> 
> That should significantly save space on load/store architectures. That
> said, we can probably do that at a later time.

Yes, Alex Bennee mentioned this during round 1.  I decided to not try to do
that all at once.

When we do get there, we also have to add an additional column for icount.
It's currently inferred that each entry is 1 insn.  This will expand the size
of the table in the case where every insn might raise an exception, but I
expect the normal case to be a slight decrease.



r~
diff mbox

Patch

diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 6a69802..402dd87 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -199,6 +199,7 @@  struct TranslationBlock {
 #define CF_USE_ICOUNT  0x20000
 
     void *tc_ptr;    /* pointer to the translated code */
+    uint8_t *tc_search;  /* pointer to search data */
     /* next matching tb for physical address. */
     struct TranslationBlock *phys_hash_next;
     /* original tb when cflags has CF_NOCACHE */
diff --git a/tcg/tcg.c b/tcg/tcg.c
index bdb83d9..a0fce5b 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -2294,7 +2294,7 @@  static inline int tcg_gen_code_common(TCGContext *s,
                                       tcg_insn_unit *gen_code_buf,
                                       long search_pc)
 {
-    int i, oi, oi_next;
+    int i, oi, oi_next, num_insns;
 
 #ifdef DEBUG_DISAS
     if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP))) {
@@ -2338,6 +2338,7 @@  static inline int tcg_gen_code_common(TCGContext *s,
 
     tcg_out_tb_init(s);
 
+    num_insns = -1;
     for (oi = s->gen_first_op_idx; oi >= 0; oi = oi_next) {
         TCGOp * const op = &s->gen_op_buf[oi];
         TCGArg * const args = &s->gen_opparam_buf[op->args];
@@ -2361,6 +2362,10 @@  static inline int tcg_gen_code_common(TCGContext *s,
             tcg_reg_alloc_movi(s, args, dead_args, sync_args);
             break;
         case INDEX_op_insn_start:
+            if (num_insns >= 0) {
+                s->gen_insn_end_off[num_insns] = tcg_current_code_size(s);
+            }
+            num_insns++;
             for (i = 0; i < TARGET_INSN_START_WORDS; ++i) {
                 target_ulong a;
 #if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
@@ -2368,7 +2373,7 @@  static inline int tcg_gen_code_common(TCGContext *s,
 #else
                 a = args[i];
 #endif
-                s->gen_opc_data[i] = a;
+                s->gen_insn_data[num_insns][i] = a;
             }
             break;
         case INDEX_op_discard:
@@ -2400,6 +2405,8 @@  static inline int tcg_gen_code_common(TCGContext *s,
         check_regs(s);
 #endif
     }
+    tcg_debug_assert(num_insns >= 0);
+    s->gen_insn_end_off[num_insns] = tcg_current_code_size(s);
 
     /* Generate TB finalization at the end of block */
     tcg_out_tb_finalize(s);
@@ -2448,24 +2455,26 @@  int tcg_gen_code_search_pc(TCGContext *s, tcg_insn_unit *gen_code_buf,
 void tcg_dump_info(FILE *f, fprintf_function cpu_fprintf)
 {
     TCGContext *s = &tcg_ctx;
-    int64_t tot;
+    int64_t tb_count = s->tb_count;
+    int64_t tb_div_count = tb_count ? tb_count : 1;
+    int64_t tot = s->interm_time + s->code_time;
 
-    tot = s->interm_time + s->code_time;
     cpu_fprintf(f, "JIT cycles          %" PRId64 " (%0.3f s at 2.4 GHz)\n",
                 tot, tot / 2.4e9);
     cpu_fprintf(f, "translated TBs      %" PRId64 " (aborted=%" PRId64 " %0.1f%%)\n", 
-                s->tb_count, 
-                s->tb_count1 - s->tb_count,
-                s->tb_count1 ? (double)(s->tb_count1 - s->tb_count) / s->tb_count1 * 100.0 : 0);
+                tb_count, s->tb_count1 - tb_count,
+                (double)(s->tb_count1 - s->tb_count)
+                / (s->tb_count1 ? s->tb_count1 : 1) * 100.0);
     cpu_fprintf(f, "avg ops/TB          %0.1f max=%d\n", 
-                s->tb_count ? (double)s->op_count / s->tb_count : 0, s->op_count_max);
+                (double)s->op_count / tb_div_count, s->op_count_max);
     cpu_fprintf(f, "deleted ops/TB      %0.2f\n",
-                s->tb_count ? 
-                (double)s->del_op_count / s->tb_count : 0);
+                (double)s->del_op_count / tb_div_count);
     cpu_fprintf(f, "avg temps/TB        %0.2f max=%d\n",
-                s->tb_count ? 
-                (double)s->temp_count / s->tb_count : 0,
-                s->temp_count_max);
+                (double)s->temp_count / tb_div_count, s->temp_count_max);
+    cpu_fprintf(f, "avg host code/TB    %0.1f\n",
+                (double)s->code_out_len / tb_div_count);
+    cpu_fprintf(f, "avg search data/TB  %0.1f\n",
+                (double)s->search_out_len / tb_div_count);
     
     cpu_fprintf(f, "cycles/op           %0.1f\n", 
                 s->op_count ? (double)tot / s->op_count : 0);
@@ -2473,8 +2482,11 @@  void tcg_dump_info(FILE *f, fprintf_function cpu_fprintf)
                 s->code_in_len ? (double)tot / s->code_in_len : 0);
     cpu_fprintf(f, "cycles/out byte     %0.1f\n", 
                 s->code_out_len ? (double)tot / s->code_out_len : 0);
-    if (tot == 0)
+    cpu_fprintf(f, "cycles/search byte     %0.1f\n", 
+                s->search_out_len ? (double)tot / s->search_out_len : 0);
+    if (tot == 0) {
         tot = 1;
+    }
     cpu_fprintf(f, "  gen_interm time   %0.1f%%\n", 
                 (double)s->interm_time / tot * 100.0);
     cpu_fprintf(f, "  gen_code time     %0.1f%%\n", 
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 8fd1252..df499c6 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -532,6 +532,7 @@  struct TCGContext {
     int64_t del_op_count;
     int64_t code_in_len;
     int64_t code_out_len;
+    int64_t search_out_len;
     int64_t interm_time;
     int64_t code_time;
     int64_t la_time;
@@ -581,7 +582,8 @@  struct TCGContext {
     uint16_t gen_opc_icount[OPC_BUF_SIZE];
     uint8_t gen_opc_instr_start[OPC_BUF_SIZE];
 
-    target_ulong gen_opc_data[TARGET_INSN_START_WORDS];
+    uint16_t gen_insn_end_off[TCG_MAX_INSNS];
+    target_ulong gen_insn_data[TCG_MAX_INSNS][TARGET_INSN_START_WORDS];
 };
 
 extern TCGContext tcg_ctx;
diff --git a/translate-all.c b/translate-all.c
index 9f801ae..f6b8148 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -168,61 +168,127 @@  void cpu_gen_init(void)
     tcg_context_init(&tcg_ctx); 
 }
 
+/* Encode VAL as a signed leb128 sequence at P.
+   Return P incremented past the encoded value.  */
+static uint8_t *encode_sleb128(uint8_t *p, target_long val)
+{
+    int more, byte;
+
+    do {
+        byte = val & 0x7f;
+        val >>= 7;
+        more = !((val == 0 && (byte & 0x40) == 0)
+                 || (val == -1 && (byte & 0x40) != 0));
+        if (more)
+          byte |= 0x80;
+        *p++ = byte;
+    } while (more);
+
+    return p;
+}
+
+/* Decode a signed leb128 sequence at *PP; increment *PP past the
+   decoded value.  Return the decoded value.  */
+static target_long decode_sleb128(uint8_t **pp)
+{
+    uint8_t *p = *pp;
+    target_long val = 0;
+    int byte, shift = 0;
+
+    do {
+        byte = *p++;
+        val |= (target_ulong)(byte & 0x7f) << shift;
+        shift += 7;
+    } while (byte & 0x80);
+    if (shift < TARGET_LONG_BITS && (byte & 0x40)) {
+        val |= -(target_ulong)1 << shift;
+    }
+
+    *pp = p;
+    return val;
+}
+
+/* Encode the data collected about the instructions while compiling TB.
+   Place the data at BLOCK, and return the number of bytes consumed.
+
+   The logical table consists of TARGET_INSN_START_WORDS target_ulong's,
+   which come from the target's insn_start data, followed by a uintptr_t
+   which comes from the host pc of the end of the code implementing the insn.
+
+   Each line of the table is encoded as sleb128 deltas from the previous
+   line.  The seed for the first line is { tb->pc, 0..., tb->tc_ptr }.
+   That is, the first column is seeded with the guest pc, the last column
+   with the host pc, and the middle columns with zeros.  */
+
+static int encode_search(TranslationBlock *tb, uint8_t *block)
+{
+    uint8_t *p = block;
+    int i, j, n;
+
+    tb->tc_search = block;
+
+    for (i = 0, n = tb->icount; i < n; ++i) {
+        target_ulong prev;
+
+        for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
+            if (i == 0) {
+                prev = (j == 0 ? tb->pc : 0);
+            } else {
+                prev = tcg_ctx.gen_insn_data[i - 1][j];
+            }
+            p = encode_sleb128(p, tcg_ctx.gen_insn_data[i][j] - prev);
+        }
+        prev = (i == 0 ? 0 : tcg_ctx.gen_insn_end_off[i - 1]);
+        p = encode_sleb128(p, tcg_ctx.gen_insn_end_off[i] - prev);
+    }
+
+    return p - block;
+}
+
 /* The cpu state corresponding to 'searched_pc' is restored.  */
 static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
                                      uintptr_t searched_pc)
 {
+    target_ulong data[TARGET_INSN_START_WORDS] = { tb->pc };
+    uintptr_t host_pc = (uintptr_t)tb->tc_ptr;
     CPUArchState *env = cpu->env_ptr;
-    TCGContext *s = &tcg_ctx;
-    int j;
-    uintptr_t tc_ptr;
+    uint8_t *p = tb->tc_search;
+    int i, j, num_insns = tb->icount;
 #ifdef CONFIG_PROFILER
-    int64_t ti;
+    int64_t ti = profile_getclock();
 #endif
 
-#ifdef CONFIG_PROFILER
-    ti = profile_getclock();
-#endif
-    tcg_func_start(s);
+    if (searched_pc < host_pc) {
+        return -1;
+    }
 
-    gen_intermediate_code_pc(env, tb);
+    /* Reconstruct the stored insn data while looking for the point at
+       which the end of the insn exceeds the searched_pc.  */
+    for (i = 0; i < num_insns; ++i) {
+        for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
+            data[j] += decode_sleb128(&p);
+        }
+        host_pc += decode_sleb128(&p);
+        if (host_pc > searched_pc) {
+            goto found;
+        }
+    }
+    return -1;
 
+ found:
     if (tb->cflags & CF_USE_ICOUNT) {
         assert(use_icount);
         /* Reset the cycle counter to the start of the block.  */
-        cpu->icount_decr.u16.low += tb->icount;
+        cpu->icount_decr.u16.low += num_insns;
         /* Clear the IO flag.  */
         cpu->can_do_io = 0;
     }
-
-    /* find opc index corresponding to search_pc */
-    tc_ptr = (uintptr_t)tb->tc_ptr;
-    if (searched_pc < tc_ptr)
-        return -1;
-
-    s->tb_next_offset = tb->tb_next_offset;
-#ifdef USE_DIRECT_JUMP
-    s->tb_jmp_offset = tb->tb_jmp_offset;
-    s->tb_next = NULL;
-#else
-    s->tb_jmp_offset = NULL;
-    s->tb_next = tb->tb_next;
-#endif
-    j = tcg_gen_code_search_pc(s, (tcg_insn_unit *)tc_ptr,
-                               searched_pc - tc_ptr);
-    if (j < 0)
-        return -1;
-    /* now find start of instruction before */
-    while (s->gen_opc_instr_start[j] == 0) {
-        j--;
-    }
-    cpu->icount_decr.u16.low -= s->gen_opc_icount[j];
-
-    restore_state_to_opc(env, tb, s->gen_opc_data);
+    cpu->icount_decr.u16.low -= i;
+    restore_state_to_opc(env, tb, data);
 
 #ifdef CONFIG_PROFILER
-    s->restore_time += profile_getclock() - ti;
-    s->restore_count++;
+    tcg_ctx.restore_time += profile_getclock() - ti;
+    tcg_ctx.restore_count++;
 #endif
     return 0;
 }
@@ -969,7 +1035,7 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
     tb_page_addr_t phys_pc, phys_page2;
     target_ulong virt_page2;
     tcg_insn_unit *gen_code_buf;
-    int gen_code_size;
+    int gen_code_size, search_size;
 #ifdef CONFIG_PROFILER
     int64_t ti;
 #endif
@@ -1025,11 +1091,13 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
 #endif
 
     gen_code_size = tcg_gen_code(&tcg_ctx, gen_code_buf);
+    search_size = encode_search(tb, (void *)gen_code_buf + gen_code_size);
 
 #ifdef CONFIG_PROFILER
     tcg_ctx.code_time += profile_getclock();
     tcg_ctx.code_in_len += tb->size;
     tcg_ctx.code_out_len += gen_code_size;
+    tcg_ctx.search_out_len += search_size;
 #endif
 
 #ifdef DEBUG_DISAS
@@ -1041,8 +1109,9 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
     }
 #endif
 
-    tcg_ctx.code_gen_ptr = (void *)(((uintptr_t)gen_code_buf +
-            gen_code_size + CODE_GEN_ALIGN - 1) & ~(CODE_GEN_ALIGN - 1));
+    tcg_ctx.code_gen_ptr = (void *)
+        ROUND_UP((uintptr_t)gen_code_buf + gen_code_size + search_size,
+                 CODE_GEN_ALIGN);
 
     /* check next page if needed */
     virt_page2 = (pc + tb->size - 1) & TARGET_PAGE_MASK;