diff mbox

[18/20] tcg: Save insn data and use it in cpu_restore_state_from_tb

Message ID 1441173123-25540-19-git-send-email-rth@twiddle.net
State New
Headers show

Commit Message

Richard Henderson Sept. 2, 2015, 5:52 a.m. UTC
We can now restore state without retranslation.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 include/exec/exec-all.h |   1 +
 tcg/tcg.c               |  11 ++++-
 tcg/tcg.h               |   3 +-
 translate-all.c         | 129 +++++++++++++++++++++++++++++++++---------------
 4 files changed, 100 insertions(+), 44 deletions(-)

Comments

Peter Maydell Sept. 10, 2015, 1:49 p.m. UTC | #1
On 2 September 2015 at 06:52, Richard Henderson <rth@twiddle.net> wrote:
> We can now restore state without retranslation.
>
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  include/exec/exec-all.h |   1 +
>  tcg/tcg.c               |  11 ++++-
>  tcg/tcg.h               |   3 +-
>  translate-all.c         | 129 +++++++++++++++++++++++++++++++++---------------
>  4 files changed, 100 insertions(+), 44 deletions(-)
>
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index 8c347ba..315f20a 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -198,6 +198,7 @@ struct TranslationBlock {
>  #define CF_USE_ICOUNT  0x20000
>
>      void *tc_ptr;    /* pointer to the translated code */
> +    uint8_t *tc_search;  /* pointer to search data */
>      /* next matching tb for physical address. */
>      struct TranslationBlock *phys_hash_next;
>      /* original tb when cflags has CF_NOCACHE */
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index d956f0b..3541d4c 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -2300,7 +2300,7 @@ static inline int tcg_gen_code_common(TCGContext *s,
>                                        tcg_insn_unit *gen_code_buf,
>                                        long search_pc)
>  {
> -    int i, oi, oi_next;
> +    int i, oi, oi_next, num_insns;
>
>  #ifdef DEBUG_DISAS
>      if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP))) {
> @@ -2344,6 +2344,7 @@ static inline int tcg_gen_code_common(TCGContext *s,
>
>      tcg_out_tb_init(s);
>
> +    num_insns = -1;
>      for (oi = s->gen_first_op_idx; oi >= 0; oi = oi_next) {
>          TCGOp * const op = &s->gen_op_buf[oi];
>          TCGArg * const args = &s->gen_opparam_buf[op->args];
> @@ -2367,6 +2368,10 @@ static inline int tcg_gen_code_common(TCGContext *s,
>              tcg_reg_alloc_movi(s, args, dead_args, sync_args);
>              break;
>          case INDEX_op_insn_start:
> +            if (num_insns >= 0) {
> +                s->gen_insn_end_off[num_insns] = tcg_current_code_size(s);
> +            }
> +            num_insns++;
>              for (i = 0; i < TARGET_INSN_START_WORDS; ++i) {
>                  target_ulong a;
>  #if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
> @@ -2374,7 +2379,7 @@ static inline int tcg_gen_code_common(TCGContext *s,
>  #else
>                  a = args[i];
>  #endif
> -                s->gen_opc_data[i] = a;
> +                s->gen_insn_data[num_insns][i] = a;
>              }
>              break;
>          case INDEX_op_discard:
> @@ -2406,6 +2411,8 @@ static inline int tcg_gen_code_common(TCGContext *s,
>          check_regs(s);
>  #endif
>      }
> +    tcg_debug_assert(num_insns >= 0);

This is claiming that every TB will have at least one insn_start,
right? I think that most targets will violate that in the breakpoint
case, because the "if we have a bp for this insn then generate a
debug insn and break out of the loop" code is before the call
to tcg_gen_insn_start().

We should probably assert that num_insns < TCG_MAX_INSNS while
we're here.

> +    s->gen_insn_end_off[num_insns] = tcg_current_code_size(s);
>
>      /* Generate TB finalization at the end of block */
>      tcg_out_tb_finalize(s);
> diff --git a/tcg/tcg.h b/tcg/tcg.h
> index 794b757..11cc107 100644
> --- a/tcg/tcg.h
> +++ b/tcg/tcg.h
> @@ -581,7 +581,8 @@ struct TCGContext {
>      uint16_t gen_opc_icount[OPC_BUF_SIZE];
>      uint8_t gen_opc_instr_start[OPC_BUF_SIZE];
>
> -    target_ulong gen_opc_data[TARGET_INSN_START_WORDS];
> +    uint16_t gen_insn_end_off[TCG_MAX_INSNS];
> +    target_ulong gen_insn_data[TCG_MAX_INSNS][TARGET_INSN_START_WORDS];
>  };
>
>  extern TCGContext tcg_ctx;
> diff --git a/translate-all.c b/translate-all.c
> index 74be98a..a31f839 100644
> --- a/translate-all.c
> +++ b/translate-all.c
> @@ -138,58 +138,65 @@ void cpu_gen_init(void)
>      tcg_context_init(&tcg_ctx);
>  }
>
> -/* The cpu state corresponding to 'searched_pc' is restored.
> - */
> +static target_long decode_sleb128(uint8_t **pp)
> +{
> +    uint8_t *p = *pp;
> +    target_long val = 0;
> +    int byte, shift = 0;
> +
> +    do {
> +        byte = *p++;
> +        val |= (target_ulong)(byte & 0x7f) << shift;
> +        shift += 7;
> +    } while (byte & 0x80);
> +    if (shift < TARGET_LONG_BITS && (byte & 0x40)) {
> +        val |= -(target_ulong)1 << shift;
> +    }
> +
> +    *pp = p;
> +    return val;
> +}

Are the encode/decode sleb128 functions known-good ones
borrowed from somewhere else?

(PS: checkpatch complains about missing braces.)

> +
> +/* The cpu state corresponding to 'searched_pc' is restored.  */
>  static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
>                                       uintptr_t searched_pc)
>  {
> +    target_ulong data[TARGET_INSN_START_WORDS] = { };
> +    uintptr_t host_pc = (uintptr_t)tb->tc_ptr;
>      CPUArchState *env = cpu->env_ptr;
> -    TCGContext *s = &tcg_ctx;
> -    int j;
> -    uintptr_t tc_ptr;
> +    uint8_t *p = tb->tc_search;
> +    int i, j, num_insns = tb->icount;
>  #ifdef CONFIG_PROFILER
> -    int64_t ti;
> +    int64_t ti = profile_getclock();
>  #endif
>
> -#ifdef CONFIG_PROFILER
> -    ti = profile_getclock();
> -#endif
> -    tcg_func_start(s);
> +    if (searched_pc < host_pc) {
> +        return -1;
> +    }
>
> -    gen_intermediate_code_pc(env, tb);
> +    /* Reconstruct the stored insn data while looking for the point at
> +       which the end of the insn exceeds the searched_pc.  */
> +    for (i = 0; i < num_insns; ++i) {
> +        for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
> +            data[j] += decode_sleb128(&p);
> +        }
> +        host_pc += decode_sleb128(&p);
> +        if (host_pc > searched_pc) {
> +            goto found;
> +        }
> +    }
> +    return -1;
>
> + found:
>      if (tb->cflags & CF_USE_ICOUNT) {
>          assert(use_icount);
>          /* Reset the cycle counter to the start of the block.  */
> -        cpu->icount_decr.u16.low += tb->icount;
> +        cpu->icount_decr.u16.low += num_insns;
>          /* Clear the IO flag.  */
>          cpu->can_do_io = 0;
>      }
> -
> -    /* find opc index corresponding to search_pc */
> -    tc_ptr = (uintptr_t)tb->tc_ptr;
> -    if (searched_pc < tc_ptr)
> -        return -1;
> -
> -    s->tb_next_offset = tb->tb_next_offset;
> -#ifdef USE_DIRECT_JUMP
> -    s->tb_jmp_offset = tb->tb_jmp_offset;
> -    s->tb_next = NULL;
> -#else
> -    s->tb_jmp_offset = NULL;
> -    s->tb_next = tb->tb_next;
> -#endif
> -    j = tcg_gen_code_search_pc(s, (tcg_insn_unit *)tc_ptr,
> -                               searched_pc - tc_ptr);
> -    if (j < 0)
> -        return -1;
> -    /* now find start of instruction before */
> -    while (s->gen_opc_instr_start[j] == 0) {
> -        j--;
> -    }
> -    cpu->icount_decr.u16.low -= s->gen_opc_icount[j];
> -
> -    restore_state_to_opc(env, tb, s->gen_opc_data);
> +    cpu->icount_decr.u16.low -= i;
> +    restore_state_to_opc(env, tb, data);
>
>  #ifdef CONFIG_PROFILER
>      s->restore_time += profile_getclock() - ti;
> @@ -933,6 +940,44 @@ static void build_page_bitmap(PageDesc *p)
>      }
>  }
>
> +static uint8_t *encode_sleb128(uint8_t *p, target_long val)
> +{
> +    int more, byte;
> +
> +    do {
> +        byte = val & 0x7f;
> +        val >>= 7;
> +        more = !((val == 0 && (byte & 0x40) == 0)
> +                 || (val == -1 && (byte & 0x40) != 0));
> +        if (more)
> +          byte |= 0x80;
> +        *p++ = byte;
> +    } while (more);
> +
> +    return p;
> +}
> +
> +static int encode_search(TranslationBlock *tb, uint8_t *block)
> +{

I think this function would benefit from a brief comment
describing the compressed format we're creating here.

> +    uint8_t *p = block;
> +    int i, j, n;
> +
> +    tb->tc_search = block;
> +
> +    for (i = 0, n = tb->icount; i < n; ++i) {
> +        target_ulong prev;
> +
> +        for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
> +            prev = (i == 0 ? 0 : tcg_ctx.gen_insn_data[i - 1][j]);
> +            p = encode_sleb128(p, tcg_ctx.gen_insn_data[i][j] - prev);
> +        }
> +        prev = (i == 0 ? 0 : tcg_ctx.gen_insn_end_off[i - 1]);
> +        p = encode_sleb128(p, tcg_ctx.gen_insn_end_off[i] - prev);
> +    }
> +
> +    return p - block;
> +}
> +
>  TranslationBlock *tb_gen_code(CPUState *cpu,
>                                target_ulong pc, target_ulong cs_base,
>                                int flags, int cflags)
> @@ -942,7 +987,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
>      tb_page_addr_t phys_pc, phys_page2;
>      target_ulong virt_page2;
>      tcg_insn_unit *gen_code_buf;
> -    int gen_code_size;
> +    int gen_code_size, search_size;
>  #ifdef CONFIG_PROFILER
>      int64_t ti;
>  #endif
> @@ -998,11 +1043,12 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
>  #endif
>
>      gen_code_size = tcg_gen_code(&tcg_ctx, gen_code_buf);
> +    search_size = encode_search(tb, (void *)gen_code_buf + gen_code_size);

Now we're putting the encoded search info in the codegen buffer,
don't we need to adjust the calculation of code_gen_buffer_max_size
to avoid falling off the end if the last TB in the buffer has a very
large set of generated TCG code and also a big encoded search buffer?

It would also be nice to assert if we do fall off the end of the
buffer somehow.

>
>  #ifdef CONFIG_PROFILER
>      tcg_ctx.code_time += profile_getclock();
>      tcg_ctx.code_in_len += tb->size;
> -    tcg_ctx.code_out_len += gen_code_size;
> +    tcg_ctx.code_out_len += gen_code_size + search_size;
>  #endif

How much extra space does the encoded search typically take (as a
% of the gen_code_size, say)?

>
>  #ifdef DEBUG_DISAS
> @@ -1014,8 +1060,9 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
>      }
>  #endif
>
> -    tcg_ctx.code_gen_ptr = (void *)(((uintptr_t)gen_code_buf +
> -            gen_code_size + CODE_GEN_ALIGN - 1) & ~(CODE_GEN_ALIGN - 1));
> +    tcg_ctx.code_gen_ptr = (void *)
> +        (((uintptr_t)gen_code_buf + gen_code_size + search_size
> +          + CODE_GEN_ALIGN - 1) & -CODE_GEN_ALIGN);

If we're messing with this line anyway we might as well use ROUND_UP:

    tcg_ctx.code_gen_ptr = (void *)
         ROUND_UP((uintptr_t)gen_code_buf + gen_code_size + search_size,
                  CODE_GEN_ALIGN);

>
>      /* check next page if needed */
>      virt_page2 = (pc + tb->size - 1) & TARGET_PAGE_MASK;
> --
> 2.4.3

thanks
-- PMM
Sergey Fedorov Sept. 11, 2015, 10:29 a.m. UTC | #2
On 10.09.2015 16:49, Peter Maydell wrote:
>> @@ -2406,6 +2411,8 @@ static inline int tcg_gen_code_common(TCGContext *s,
>> >          check_regs(s);
>> >  #endif
>> >      }
>> > +    tcg_debug_assert(num_insns >= 0);
> This is claiming that every TB will have at least one insn_start,
> right? I think that most targets will violate that in the breakpoint
> case, because the "if we have a bp for this insn then generate a
> debug insn and break out of the loop" code is before the call
> to tcg_gen_insn_start().
>
> We should probably assert that num_insns < TCG_MAX_INSNS while
> we're here.
>

BTW, such skipping of instruction generation seems to be the cause of
getting a confusing "Disassembler disagrees with translator over
instruction" message in qemu log.
Peter Maydell Sept. 11, 2015, 10:32 a.m. UTC | #3
On 11 September 2015 at 11:29, Sergey Fedorov <serge.fdrv@gmail.com> wrote:
> On 10.09.2015 16:49, Peter Maydell wrote:
>>> @@ -2406,6 +2411,8 @@ static inline int tcg_gen_code_common(TCGContext *s,
>>> >          check_regs(s);
>>> >  #endif
>>> >      }
>>> > +    tcg_debug_assert(num_insns >= 0);
>> This is claiming that every TB will have at least one insn_start,
>> right? I think that most targets will violate that in the breakpoint
>> case, because the "if we have a bp for this insn then generate a
>> debug insn and break out of the loop" code is before the call
>> to tcg_gen_insn_start().
>>
>> We should probably assert that num_insns < TCG_MAX_INSNS while
>> we're here.
>>
>
> BTW, such skipping of instruction generation seems to be the cause of
> getting a confusing "Disassembler disagrees with translator over
> instruction" message in qemu log.

...I'd been meaning to try to track down what was provoking that :-)

thanks
-- PMM
Sergey Fedorov Sept. 11, 2015, 10:46 a.m. UTC | #4
On 11.09.2015 13:32, Peter Maydell wrote:
> On 11 September 2015 at 11:29, Sergey Fedorov <serge.fdrv@gmail.com> wrote:
>> On 10.09.2015 16:49, Peter Maydell wrote:
>>>> @@ -2406,6 +2411,8 @@ static inline int tcg_gen_code_common(TCGContext *s,
>>>>>          check_regs(s);
>>>>>  #endif
>>>>>      }
>>>>> +    tcg_debug_assert(num_insns >= 0);
>>> This is claiming that every TB will have at least one insn_start,
>>> right? I think that most targets will violate that in the breakpoint
>>> case, because the "if we have a bp for this insn then generate a
>>> debug insn and break out of the loop" code is before the call
>>> to tcg_gen_insn_start().
>>>
>>> We should probably assert that num_insns < TCG_MAX_INSNS while
>>> we're here.
>>>
>> BTW, such skipping of instruction generation seems to be the cause of
>> getting a confusing "Disassembler disagrees with translator over
>> instruction" message in qemu log.
> ...I'd been meaning to try to track down what was provoking that :-)

Seems it was wrong pc increment when handling a fired breakpoint.

Best,
Sergey
Richard Henderson Sept. 15, 2015, 8:08 p.m. UTC | #5
On 09/10/2015 06:49 AM, Peter Maydell wrote:
>> +    tcg_debug_assert(num_insns >= 0);
> 
> This is claiming that every TB will have at least one insn_start,
> right? I think that most targets will violate that in the breakpoint
> case, because the "if we have a bp for this insn then generate a
> debug insn and break out of the loop" code is before the call
> to tcg_gen_insn_start().
> 
> We should probably assert that num_insns < TCG_MAX_INSNS while
> we're here.

True.  I wonder if we shouldn't fix bp placement while I'm at it.  And the
assertion should really be num_insns == tb->icount.

>> +static target_long decode_sleb128(uint8_t **pp)
>> +{
>> +    uint8_t *p = *pp;
>> +    target_long val = 0;
>> +    int byte, shift = 0;
>> +
>> +    do {
>> +        byte = *p++;
>> +        val |= (target_ulong)(byte & 0x7f) << shift;
>> +        shift += 7;
>> +    } while (byte & 0x80);
>> +    if (shift < TARGET_LONG_BITS && (byte & 0x40)) {
>> +        val |= -(target_ulong)1 << shift;
>> +    }
>> +
>> +    *pp = p;
>> +    return val;
>> +}
> 
> Are the encode/decode sleb128 functions known-good ones
> borrowed from somewhere else?

Yes, from libgcc.

> (PS: checkpatch complains about missing braces.)

Ho hum...

>> +static int encode_search(TranslationBlock *tb, uint8_t *block)
>> +{
> 
> I think this function would benefit from a brief comment
> describing the compressed format we're creating here.

Yes.

>>      gen_code_size = tcg_gen_code(&tcg_ctx, gen_code_buf);
>> +    search_size = encode_search(tb, (void *)gen_code_buf + gen_code_size);
> 
> Now we're putting the encoded search info in the codegen buffer,
> don't we need to adjust the calculation of code_gen_buffer_max_size
> to avoid falling off the end if the last TB in the buffer has a very
> large set of generated TCG code and also a big encoded search buffer?

Dunno.  It's not that we've ever checked for this before; I'm not sure what
factor I would actually apply.

> It would also be nice to assert if we do fall off the end of the
> buffer somehow.

Given that we generally use a very large mmap to allocate it, perhaps simply
adding a guard page would be best.

> How much extra space does the encoded search typically take (as a
> % of the gen_code_size, say)?

Dunno; I'll have to have a look at that.  Probably easiest to just enhance info
jit...


r~
diff mbox

Patch

diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 8c347ba..315f20a 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -198,6 +198,7 @@  struct TranslationBlock {
 #define CF_USE_ICOUNT  0x20000
 
     void *tc_ptr;    /* pointer to the translated code */
+    uint8_t *tc_search;  /* pointer to search data */
     /* next matching tb for physical address. */
     struct TranslationBlock *phys_hash_next;
     /* original tb when cflags has CF_NOCACHE */
diff --git a/tcg/tcg.c b/tcg/tcg.c
index d956f0b..3541d4c 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -2300,7 +2300,7 @@  static inline int tcg_gen_code_common(TCGContext *s,
                                       tcg_insn_unit *gen_code_buf,
                                       long search_pc)
 {
-    int i, oi, oi_next;
+    int i, oi, oi_next, num_insns;
 
 #ifdef DEBUG_DISAS
     if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP))) {
@@ -2344,6 +2344,7 @@  static inline int tcg_gen_code_common(TCGContext *s,
 
     tcg_out_tb_init(s);
 
+    num_insns = -1;
     for (oi = s->gen_first_op_idx; oi >= 0; oi = oi_next) {
         TCGOp * const op = &s->gen_op_buf[oi];
         TCGArg * const args = &s->gen_opparam_buf[op->args];
@@ -2367,6 +2368,10 @@  static inline int tcg_gen_code_common(TCGContext *s,
             tcg_reg_alloc_movi(s, args, dead_args, sync_args);
             break;
         case INDEX_op_insn_start:
+            if (num_insns >= 0) {
+                s->gen_insn_end_off[num_insns] = tcg_current_code_size(s);
+            }
+            num_insns++;
             for (i = 0; i < TARGET_INSN_START_WORDS; ++i) {
                 target_ulong a;
 #if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
@@ -2374,7 +2379,7 @@  static inline int tcg_gen_code_common(TCGContext *s,
 #else
                 a = args[i];
 #endif
-                s->gen_opc_data[i] = a;
+                s->gen_insn_data[num_insns][i] = a;
             }
             break;
         case INDEX_op_discard:
@@ -2406,6 +2411,8 @@  static inline int tcg_gen_code_common(TCGContext *s,
         check_regs(s);
 #endif
     }
+    tcg_debug_assert(num_insns >= 0);
+    s->gen_insn_end_off[num_insns] = tcg_current_code_size(s);
 
     /* Generate TB finalization at the end of block */
     tcg_out_tb_finalize(s);
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 794b757..11cc107 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -581,7 +581,8 @@  struct TCGContext {
     uint16_t gen_opc_icount[OPC_BUF_SIZE];
     uint8_t gen_opc_instr_start[OPC_BUF_SIZE];
 
-    target_ulong gen_opc_data[TARGET_INSN_START_WORDS];
+    uint16_t gen_insn_end_off[TCG_MAX_INSNS];
+    target_ulong gen_insn_data[TCG_MAX_INSNS][TARGET_INSN_START_WORDS];
 };
 
 extern TCGContext tcg_ctx;
diff --git a/translate-all.c b/translate-all.c
index 74be98a..a31f839 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -138,58 +138,65 @@  void cpu_gen_init(void)
     tcg_context_init(&tcg_ctx); 
 }
 
-/* The cpu state corresponding to 'searched_pc' is restored.
- */
+static target_long decode_sleb128(uint8_t **pp)
+{
+    uint8_t *p = *pp;
+    target_long val = 0;
+    int byte, shift = 0;
+
+    do {
+        byte = *p++;
+        val |= (target_ulong)(byte & 0x7f) << shift;
+        shift += 7;
+    } while (byte & 0x80);
+    if (shift < TARGET_LONG_BITS && (byte & 0x40)) {
+        val |= -(target_ulong)1 << shift;
+    }
+
+    *pp = p;
+    return val;
+}
+
+/* The cpu state corresponding to 'searched_pc' is restored.  */
 static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
                                      uintptr_t searched_pc)
 {
+    target_ulong data[TARGET_INSN_START_WORDS] = { };
+    uintptr_t host_pc = (uintptr_t)tb->tc_ptr;
     CPUArchState *env = cpu->env_ptr;
-    TCGContext *s = &tcg_ctx;
-    int j;
-    uintptr_t tc_ptr;
+    uint8_t *p = tb->tc_search;
+    int i, j, num_insns = tb->icount;
 #ifdef CONFIG_PROFILER
-    int64_t ti;
+    int64_t ti = profile_getclock();
 #endif
 
-#ifdef CONFIG_PROFILER
-    ti = profile_getclock();
-#endif
-    tcg_func_start(s);
+    if (searched_pc < host_pc) {
+        return -1;
+    }
 
-    gen_intermediate_code_pc(env, tb);
+    /* Reconstruct the stored insn data while looking for the point at
+       which the end of the insn exceeds the searched_pc.  */
+    for (i = 0; i < num_insns; ++i) {
+        for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
+            data[j] += decode_sleb128(&p);
+        }
+        host_pc += decode_sleb128(&p);
+        if (host_pc > searched_pc) {
+            goto found;
+        }
+    }
+    return -1;
 
+ found:
     if (tb->cflags & CF_USE_ICOUNT) {
         assert(use_icount);
         /* Reset the cycle counter to the start of the block.  */
-        cpu->icount_decr.u16.low += tb->icount;
+        cpu->icount_decr.u16.low += num_insns;
         /* Clear the IO flag.  */
         cpu->can_do_io = 0;
     }
-
-    /* find opc index corresponding to search_pc */
-    tc_ptr = (uintptr_t)tb->tc_ptr;
-    if (searched_pc < tc_ptr)
-        return -1;
-
-    s->tb_next_offset = tb->tb_next_offset;
-#ifdef USE_DIRECT_JUMP
-    s->tb_jmp_offset = tb->tb_jmp_offset;
-    s->tb_next = NULL;
-#else
-    s->tb_jmp_offset = NULL;
-    s->tb_next = tb->tb_next;
-#endif
-    j = tcg_gen_code_search_pc(s, (tcg_insn_unit *)tc_ptr,
-                               searched_pc - tc_ptr);
-    if (j < 0)
-        return -1;
-    /* now find start of instruction before */
-    while (s->gen_opc_instr_start[j] == 0) {
-        j--;
-    }
-    cpu->icount_decr.u16.low -= s->gen_opc_icount[j];
-
-    restore_state_to_opc(env, tb, s->gen_opc_data);
+    cpu->icount_decr.u16.low -= i;
+    restore_state_to_opc(env, tb, data);
 
 #ifdef CONFIG_PROFILER
     s->restore_time += profile_getclock() - ti;
@@ -933,6 +940,44 @@  static void build_page_bitmap(PageDesc *p)
     }
 }
 
+static uint8_t *encode_sleb128(uint8_t *p, target_long val)
+{
+    int more, byte;
+
+    do {
+        byte = val & 0x7f;
+        val >>= 7;
+        more = !((val == 0 && (byte & 0x40) == 0)
+                 || (val == -1 && (byte & 0x40) != 0));
+        if (more)
+          byte |= 0x80;
+        *p++ = byte;
+    } while (more);
+
+    return p;
+}
+
+static int encode_search(TranslationBlock *tb, uint8_t *block)
+{
+    uint8_t *p = block;
+    int i, j, n;
+
+    tb->tc_search = block;
+
+    for (i = 0, n = tb->icount; i < n; ++i) {
+        target_ulong prev;
+
+        for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
+            prev = (i == 0 ? 0 : tcg_ctx.gen_insn_data[i - 1][j]);
+            p = encode_sleb128(p, tcg_ctx.gen_insn_data[i][j] - prev);
+        }
+        prev = (i == 0 ? 0 : tcg_ctx.gen_insn_end_off[i - 1]);
+        p = encode_sleb128(p, tcg_ctx.gen_insn_end_off[i] - prev);
+    }
+
+    return p - block;
+}
+
 TranslationBlock *tb_gen_code(CPUState *cpu,
                               target_ulong pc, target_ulong cs_base,
                               int flags, int cflags)
@@ -942,7 +987,7 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
     tb_page_addr_t phys_pc, phys_page2;
     target_ulong virt_page2;
     tcg_insn_unit *gen_code_buf;
-    int gen_code_size;
+    int gen_code_size, search_size;
 #ifdef CONFIG_PROFILER
     int64_t ti;
 #endif
@@ -998,11 +1043,12 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
 #endif
 
     gen_code_size = tcg_gen_code(&tcg_ctx, gen_code_buf);
+    search_size = encode_search(tb, (void *)gen_code_buf + gen_code_size);
 
 #ifdef CONFIG_PROFILER
     tcg_ctx.code_time += profile_getclock();
     tcg_ctx.code_in_len += tb->size;
-    tcg_ctx.code_out_len += gen_code_size;
+    tcg_ctx.code_out_len += gen_code_size + search_size;
 #endif
 
 #ifdef DEBUG_DISAS
@@ -1014,8 +1060,9 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
     }
 #endif
 
-    tcg_ctx.code_gen_ptr = (void *)(((uintptr_t)gen_code_buf +
-            gen_code_size + CODE_GEN_ALIGN - 1) & ~(CODE_GEN_ALIGN - 1));
+    tcg_ctx.code_gen_ptr = (void *)
+        (((uintptr_t)gen_code_buf + gen_code_size + search_size
+          + CODE_GEN_ALIGN - 1) & -CODE_GEN_ALIGN);
 
     /* check next page if needed */
     virt_page2 = (pc + tb->size - 1) & TARGET_PAGE_MASK;