
[RFC,v3] translate-all: protect code_gen_buffer with RCU

Message ID 1461627983-32563-1-git-send-email-cota@braap.org
State New

Commit Message

Emilio Cota April 25, 2016, 11:46 p.m. UTC
Context:
  https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
  https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html

This seems to half-work[*], although I'm uneasy about the whole idea.
I see two major hurdles:

* If the TB size is too small, this breaks badly, because we're not
  out of the RCU read critical section when another call to tb_flush
  is made. For instance, when booting ARM with this patch applied,
  I need to pass at least -tb-size 10 for it to fully boot Debian
  jessie.
* We have different tb_flush callers that should be audited:
 $ git grep '\stb_flush(' | grep -v '//' | grep -v 'CPUState'
 exec.c:            tb_flush(cpu);
 gdbstub.c:        tb_flush(cpu);
 gdbstub.c:    tb_flush(cpu);
 hw/ppc/spapr_hcall.c:            tb_flush(CPU(cpu));
 target-alpha/sys_helper.c:    tb_flush(CPU(alpha_env_get_cpu(env)));
 target-i386/translate.c:        tb_flush(CPU(x86_env_get_cpu(env)));
 translate-all.c:        tb_flush(cpu);

With two code_gen "halves", if two tb_flush calls are done in the same
RCU read critical section, we're screwed. I added a cpu_exit at the end
of tb_flush to try to mitigate this, but I haven't audited all the callers
(for instance, what does the gdbstub do?).
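
For illustration, here is a tiny standalone model (not QEMU code; it builds
as-is with gcc or clang) of the buffer-mask flip, showing why a second flush
before the first grace period has elapsed trips the assert in
replace_code_gen_buffer:

/* model.c -- a toy model of code_gen_buf_mask, not part of the patch */
#include <assert.h>
#include <stdio.h>

static int buf_mask = 1;            /* 0x1: half 1 in use, 0x2: half 2 in use */

static int flip(void)
{
    int old = buf_mask;

    /* both bits set would mean two flushes within one grace period */
    assert(buf_mask == 1 || buf_mask == 2);
    buf_mask |= (old == 1) ? 2 : 1; /* start filling the other half */
    return old;                     /* bit to be cleared by the RCU callback */
}

int main(void)
{
    int retired = flip();           /* first tb_flush: fine */

    printf("retired half %d, mask now %d\n", retired, buf_mask);
    /* flip();  -- a second flush here would abort: the mask is still 3 */
    buf_mask &= ~retired;           /* what clear_code_gen_buffer does later */
    printf("after the grace period, mask is %d\n", buf_mask);
    return 0;
}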

If we end up having a mechanism to "stop all CPUs to do something", as
I think we'll end up needing for correct LL/SC emulation, we'll probably
be better off using that mechanism for tb_flush as well -- plus, we'll avoid
wasting memory.

Other issues:
- This could be split into at least 2 patches, one that touches
  tcg/ and another to deal with translate-all.
  Note that in translate-all, the initial allocation of code_gen doesn't
  allocate extra space for the guard page; reserving guard page space is
  done instead by the added split_code_gen_buffer function.
- Windows: not even compile-tested.

[*] "Seems to half-work". At least it boots ARM OK with MTTCG and -smp 4.
    Alex's tests, however, sometimes fail with:

Unhandled exception 3 (pabt)
Exception frame registers:
pc : [<fffffb44>]    lr : [<00000001>]    psr: 20000173
sp : 4004f528  ip : 40012048  fp : 40032ca8
r10: 40032ca8  r9 : 00000000  r8 : 00000000
r7 : 0000000e  r6 : 40030000  r5 : 40032ca8  r4 : 00001ac6
r3 : 40012030  r2 : 40012030  r1 : d5ffffe7  r0 : 00000028
Flags: nzCv  IRQs on  FIQs off  Mode SVC_32
Control: 00c5107d  Table: 40060000  DAC: 00000000
IFAR: fffffb44    IFSR: 00000205

or with:

CPU0: 16986 irqs (0 races, 11 slow,  1322 ticks avg latency)
FAIL: smc: irq: 17295 IRQs sent, 16986 received

Unhandled exception 6 (irq)
Exception frame registers:
pc : [<00000020>]    lr : [<40010800>]    psr: 60000153
sp : 400b45c0  ip : 400b34e8  fp : 40032ca8
r10: 00000000  r9 : 00000000  r8 : 00000000
r7 : 00000000  r6 : 00000000  r5 : 00000000  r4 : 00000000
r3 : 00000000  r2 : 00000000  r1 : 000000ff  r0 : 00000000
Flags: nZCv  IRQs on  FIQs off  Mode SVC_32
Control: 00c5107d  Table: 40060000  DAC: 00000000

I built with --enable-tcg-debug.

Signed-off-by: Emilio G. Cota <cota@braap.org>
---
 tcg/tcg.c       |  22 +++++---
 tcg/tcg.h       |   1 +
 translate-all.c | 155 ++++++++++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 144 insertions(+), 34 deletions(-)

Comments

Richard Henderson April 26, 2016, 4:48 a.m. UTC | #1
On 04/25/2016 04:46 PM, Emilio G. Cota wrote:
> +    /*
> +     * write the prologue into buf2. This is safe because we'll later call
> +     * tcg_prologue_init on buf1, from which we'll start execution.
> +     */
> +    tcg_ctx.code_gen_buffer = code_gen_buf2;
> +    tcg_prologue_init(&tcg_ctx);
> +

Ah, no.  Write only one prologue, not one per buffer.

If they're sufficiently close (i.e. one allocation under the max size),
then the same one can be used for both halves.

The global variables that you didn't see in this revision are:

aarch64/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
arm/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
i386/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
ia64/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
ia64/tcg-target.inc.c:    tcg_insn_unit *thunks[8] = { };
mips/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
ppc/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
s390/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
sparc/tcg-target.inc.c:static tcg_insn_unit *qemu_ld_trampoline[16];
sparc/tcg-target.inc.c:static tcg_insn_unit *qemu_st_trampoline[16];
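
A minimal sketch of that layout, assuming the whole allocation stays below
the maximum buffer size so that both halves can reach the single
prologue/epilogue; split_once() is a hypothetical helper, not code from
this patch, and page-alignment handling is elided:

static void *split_once(void *buf, size_t size)
{
    size_t half;

    /* write the prologue exactly once, at the start of the allocation */
    tcg_ctx.code_gen_buffer = buf;
    tcg_prologue_init(&tcg_ctx);
    code_gen_prologue_size = (void *)tcg_ctx.code_ptr - buf;

    /* carve the rest (minus two guard pages) into two halves; TBs in
       either half return through the one epilogue above, which stays in
       range because the whole allocation is below the max buffer size */
    half = (size - code_gen_prologue_size - 2 * qemu_real_host_page_size) / 2;
    code_gen_buf1 = buf + code_gen_prologue_size;
    code_gen_buf2 = code_gen_buf1 + half + qemu_real_host_page_size;
    tcg_ctx.code_gen_buffer_size = half;

    code_gen_buf_mask = 1;          /* start executing from half 1 */
    return code_gen_buf1;
}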


r~
Alex Bennée April 26, 2016, 6:32 a.m. UTC | #2
Emilio G. Cota <cota@braap.org> writes:

> Context:
>   https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
>   https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html
>
> This seems to half-work[*] although I'm uneasy about the while idea.
> I see two major hurdles:
>
> * If the TB size is too small, this breaks badly, because we're not
>   out of the RCU read critical section when another call to tb_flush
>   is made. For instance, when booting ARM with this patch applied,
>   I need to at least pass -tb-size 10 for it to fully boot debian
>   jessie.
> * We have different tb_flush callers that should be audited:
>  $ git grep '\stb_flush(' | grep -v '//' | grep -v 'CPUState'
>  exec.c:            tb_flush(cpu);
>  gdbstub.c:        tb_flush(cpu);
>  gdbstub.c:    tb_flush(cpu);
>  hw/ppc/spapr_hcall.c:            tb_flush(CPU(cpu));
>  target-alpha/sys_helper.c:    tb_flush(CPU(alpha_env_get_cpu(env)));
>  target-i386/translate.c:        tb_flush(CPU(x86_env_get_cpu(env)));
>  translate-all.c:        tb_flush(cpu);
>
> With two code_gen "halves", if two tb_flush calls are done in the same
> RCU read critical section, we're screwed. I added a cpu_exit at the end
> of tb_flush to try to mitigate this, but I haven't audited all the callers
> (for instance, what does the gdbstub do?).

I'm not sure we are going to get much from this approach. The tb_flush
is a fairly rare occurrence; it's not like it's on the critical performance
path (although of course pathological cases are possible).

I still think there are possibilities with a smaller TranslationRegion
approach, but that is more aimed at solving problems like bulk
invalidation of a page of translations at a time and safer inter-block
patching. It doesn't do much to make tb_flush easier though.

>
> If we end up having a mechanism to "stop all  CPUs to do something", as
> I think we'll end up needing for correct LL/SC emulation, we'll probably
> be better off using that mechanism for tb_flush as well -- plus, we'll avoid
> wasting memory.

I'm fairly certain there will need to be a "stop everything" mode for
some things - I'm less certain of the best way of doing it. Did you get
a chance to look at my version of the async_safe_work mechanism?

>
> Other issues:
> - This could be split into at least 2 patches, one that touches
>   tcg/ and another to deal with translate-all.
>   Note that in translate-all, the initial allocation of code_gen doesn't
>   allocate extra space for the guard page; reserving guard page space is
>   done instead by the added split_code_gen_buffer function.
> - Windows: not even compile-tested.
>
> [*] "Seems to half-work". At least it boots ARM OK with MTTCG and -smp 4.
>     Alex' tests, however, sometimes fail with:
>
> Unhandled exception 3 (pabt)
> Exception frame registers:
> pc : [<fffffb44>]    lr : [<00000001>]    psr: 20000173
> sp : 4004f528  ip : 40012048  fp : 40032ca8
> r10: 40032ca8  r9 : 00000000  r8 : 00000000
> r7 : 0000000e  r6 : 40030000  r5 : 40032ca8  r4 : 00001ac6
> r3 : 40012030  r2 : 40012030  r1 : d5ffffe7  r0 : 00000028
> Flags: nzCv  IRQs on  FIQs off  Mode SVC_32
> Control: 00c5107d  Table: 40060000  DAC: 00000000
> IFAR: fffffb44    IFSR: 00000205
>
> or with:
>
> CPU0: 16986 irqs (0 races, 11 slow,  1322 ticks avg latency)
> FAIL: smc: irq: 17295 IRQs sent, 16986 received
>
> Unhandled exception 6 (irq)
> Exception frame registers:
> pc : [<00000020>]    lr : [<40010800>]    psr: 60000153
> sp : 400b45c0  ip : 400b34e8  fp : 40032ca8
> r10: 00000000  r9 : 00000000  r8 : 00000000
> r7 : 00000000  r6 : 00000000  r5 : 00000000  r4 : 00000000
> r3 : 00000000  r2 : 00000000  r1 : 000000ff  r0 : 00000000
> Flags: nZCv  IRQs on  FIQs off  Mode SVC_32
> Control: 00c5107d  Table: 40060000  DAC: 00000000
>
> I built with --enable-tcg-debug.

I'll have another go at reproducing. I could only get the asserts to
fire, which was odd because AFAICT the prologue generation that
triggered it was serialised with tb_lock.

>
> Signed-off-by: Emilio G. Cota <cota@braap.org>
> ---
>  tcg/tcg.c       |  22 +++++---
>  tcg/tcg.h       |   1 +
>  translate-all.c | 155 ++++++++++++++++++++++++++++++++++++++++++++++----------
>  3 files changed, 144 insertions(+), 34 deletions(-)
>
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index b46bf1a..7db8ce9 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -380,6 +380,20 @@ void tcg_context_init(TCGContext *s)
>      }
>  }
>
> +void tcg_code_gen_init(TCGContext *s, void *buf, size_t prologue_size)
> +{
> +    size_t size = s->code_gen_buffer_size - prologue_size;
> +
> +    s->code_gen_ptr = buf;
> +    s->code_gen_buffer = buf;
> +    s->code_buf = buf;
> +
> +    /* Compute a high-water mark, at which we voluntarily flush the buffer
> +       and start over.  The size here is arbitrary, significantly larger
> +       than we expect the code generation for any one opcode to require.  */
> +    s->code_gen_highwater = s->code_buf + (size - 1024);
> +}
> +
>  void tcg_prologue_init(TCGContext *s)
>  {
>      size_t prologue_size, total_size;
> @@ -398,16 +412,10 @@ void tcg_prologue_init(TCGContext *s)
>
>      /* Deduct the prologue from the buffer.  */
>      prologue_size = tcg_current_code_size(s);
> -    s->code_gen_ptr = buf1;
> -    s->code_gen_buffer = buf1;
> -    s->code_buf = buf1;
>      total_size = s->code_gen_buffer_size - prologue_size;
>      s->code_gen_buffer_size = total_size;
>
> -    /* Compute a high-water mark, at which we voluntarily flush the buffer
> -       and start over.  The size here is arbitrary, significantly larger
> -       than we expect the code generation for any one opcode to require.  */
> -    s->code_gen_highwater = s->code_gen_buffer + (total_size - 1024);
> +    tcg_code_gen_init(s, buf1, prologue_size);
>
>      tcg_register_jit(s->code_gen_buffer, total_size);
>
> diff --git a/tcg/tcg.h b/tcg/tcg.h
> index 40c8fbe..e849d3e 100644
> --- a/tcg/tcg.h
> +++ b/tcg/tcg.h
> @@ -634,6 +634,7 @@ static inline void *tcg_malloc(int size)
>
>  void tcg_context_init(TCGContext *s);
>  void tcg_prologue_init(TCGContext *s);
> +void tcg_code_gen_init(TCGContext *s, void *buf, size_t prologue_size);
>  void tcg_func_start(TCGContext *s);
>
>  int tcg_gen_code(TCGContext *s, TranslationBlock *tb);
> diff --git a/translate-all.c b/translate-all.c
> index bba9b62..7a83176 100644
> --- a/translate-all.c
> +++ b/translate-all.c
> @@ -485,6 +485,11 @@ static inline PageDesc *page_find(tb_page_addr_t index)
>    (DEFAULT_CODE_GEN_BUFFER_SIZE_1 < MAX_CODE_GEN_BUFFER_SIZE \
>     ? DEFAULT_CODE_GEN_BUFFER_SIZE_1 : MAX_CODE_GEN_BUFFER_SIZE)
>
> +static int code_gen_buf_mask;
> +static void *code_gen_buf1;
> +static void *code_gen_buf2;
> +static size_t code_gen_prologue_size;
> +
>  static inline size_t size_code_gen_buffer(size_t tb_size)
>  {
>      /* Size the buffer.  */
> @@ -508,6 +513,43 @@ static inline size_t size_code_gen_buffer(size_t tb_size)
>      return tb_size;
>  }
>
> +static void *split_code_gen_buffer(void *buf, size_t size)
> +{
> +    /* enforce alignment of the buffer to a page boundary */
> +    if (unlikely((uintptr_t)buf & ~qemu_real_host_page_mask)) {
> +        uintptr_t b;
> +
> +        b = QEMU_ALIGN_UP((uintptr_t)buf, qemu_real_host_page_size);
> +        size -= b - (uintptr_t)buf;
> +        buf = (void *)b;
> +    }
> +    /*
> +     * Make sure the size is an even number of pages so that both halves will
> +     * end on a page boundary.
> +     */
> +    size = QEMU_ALIGN_DOWN(size, 2 * qemu_real_host_page_size);
> +
> +    /* split in half, making room for 2 guard pages */
> +    size -= 2 * qemu_real_host_page_size;
> +    size /= 2;
> +    code_gen_buf1 = buf;
> +    code_gen_buf2 = buf + size + qemu_real_host_page_size;
> +
> +    /*
> +     * write the prologue into buf2. This is safe because we'll later call
> +     * tcg_prologue_init on buf1, from which we'll start execution.
> +     */
> +    tcg_ctx.code_gen_buffer = code_gen_buf2;
> +    tcg_prologue_init(&tcg_ctx);
> +    code_gen_prologue_size = (void *)tcg_ctx.code_ptr - code_gen_buf2;
> +
> +    tcg_ctx.code_gen_buffer_size = size;
> +
> +    /* start execution from buf1 */
> +    code_gen_buf_mask = 1;
> +    return code_gen_buf1;
> +}
> +
>  #ifdef __mips__
>  /* In order to use J and JAL within the code_gen_buffer, we require
>     that the buffer not cross a 256MB boundary.  */
> @@ -583,21 +625,17 @@ static inline void map_none(void *addr, long size)
>  static inline void *alloc_code_gen_buffer(void)
>  {
>      void *buf = static_code_gen_buffer;
> -    size_t full_size, size;
> +    size_t size;
>
>      /* The size of the buffer, rounded down to end on a page boundary.  */
> -    full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
> -                 & qemu_real_host_page_mask) - (uintptr_t)buf;
> -
> -    /* Reserve a guard page.  */
> -    size = full_size - qemu_real_host_page_size;
> +    size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
> +            & qemu_real_host_page_mask) - (uintptr_t)buf;
>
>      /* Honor a command-line option limiting the size of the buffer.  */
>      if (size > tcg_ctx.code_gen_buffer_size) {
>          size = (((uintptr_t)buf + tcg_ctx.code_gen_buffer_size)
>                  & qemu_real_host_page_mask) - (uintptr_t)buf;
>      }
> -    tcg_ctx.code_gen_buffer_size = size;
>
>  #ifdef __mips__
>      if (cross_256mb(buf, size)) {
> @@ -607,27 +645,40 @@ static inline void *alloc_code_gen_buffer(void)
>  #endif
>
>      map_exec(buf, size);
> -    map_none(buf + size, qemu_real_host_page_size);
>      qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
>
> +    buf = split_code_gen_buffer(buf, size);
> +    size = tcg_ctx.code_gen_buffer_size;
> +
> +    /* page guards */
> +    map_none(code_gen_buf1 + size, qemu_real_host_page_size);
> +    map_none(code_gen_buf2 + size, qemu_real_host_page_size);
> +
>      return buf;
>  }
>  #elif defined(_WIN32)
>  static inline void *alloc_code_gen_buffer(void)
>  {
>      size_t size = tcg_ctx.code_gen_buffer_size;
> -    void *buf1, *buf2;
> +    void *buf;
> +    void *ret;
>
> -    /* Perform the allocation in two steps, so that the guard page
> -       is reserved but uncommitted.  */
> -    buf1 = VirtualAlloc(NULL, size + qemu_real_host_page_size,
> -                        MEM_RESERVE, PAGE_NOACCESS);
> -    if (buf1 != NULL) {
> -        buf2 = VirtualAlloc(buf1, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
> -        assert(buf1 == buf2);
> +    /* Perform the allocation in two steps, so that the guard pages
> +       are reserved but uncommitted.  */
> +    ret = VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
> +    if (ret == NULL) {
> +        return NULL;
>      }
>
> -    return buf1;
> +    ret = split_code_gen_buffer(ret, size);
> +    size = tcg_ctx.code_gen_buffer_size;
> +
> +    buf = VirtualAlloc(code_gen_buf1, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
> +    assert(buf);
> +    buf = VirtualAlloc(code_gen_buf2, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
> +    assert(buf);
> +
> +    return ret;
>  }
>  #else
>  static inline void *alloc_code_gen_buffer(void)
> @@ -665,8 +716,7 @@ static inline void *alloc_code_gen_buffer(void)
>  #  endif
>  # endif
>
> -    buf = mmap((void *)start, size + qemu_real_host_page_size,
> -               PROT_NONE, flags, -1, 0);
> +    buf = mmap((void *)start, size, PROT_NONE, flags, -1, 0);
>      if (buf == MAP_FAILED) {
>          return NULL;
>      }
> @@ -676,24 +726,24 @@ static inline void *alloc_code_gen_buffer(void)
>          /* Try again, with the original still mapped, to avoid re-acquiring
>             that 256mb crossing.  This time don't specify an address.  */
>          size_t size2;
> -        void *buf2 = mmap(NULL, size + qemu_real_host_page_size,
> -                          PROT_NONE, flags, -1, 0);
> +        void *buf2 = mmap(NULL, size, PROT_NONE, flags, -1, 0);
> +
>          switch (buf2 != MAP_FAILED) {
>          case 1:
>              if (!cross_256mb(buf2, size)) {
>                  /* Success!  Use the new buffer.  */
> -                munmap(buf, size + qemu_real_host_page_size);
> +                munmap(buf, size);
>                  break;
>              }
>              /* Failure.  Work with what we had.  */
> -            munmap(buf2, size + qemu_real_host_page_size);
> +            munmap(buf2, size);
>              /* fallthru */
>          default:
>              /* Split the original buffer.  Free the smaller half.  */
>              buf2 = split_cross_256mb(buf, size);
>              size2 = tcg_ctx.code_gen_buffer_size;
>              if (buf == buf2) {
> -                munmap(buf + size2 + qemu_real_host_page_size, size - size2);
> +                munmap(buf + size2, size - size2);
>              } else {
>                  munmap(buf, size - size2);
>              }
> @@ -704,13 +754,19 @@ static inline void *alloc_code_gen_buffer(void)
>      }
>  #endif
>
> -    /* Make the final buffer accessible.  The guard page at the end
> -       will remain inaccessible with PROT_NONE.  */
> +    /* Make the final buffer accessible. */
>      mprotect(buf, size, PROT_WRITE | PROT_READ | PROT_EXEC);
>
>      /* Request large pages for the buffer.  */
>      qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
>
> +    buf = split_code_gen_buffer(buf + 1, size);
> +    size = tcg_ctx.code_gen_buffer_size;
> +
> +    /* page guards */
> +    mprotect(code_gen_buf1 + size, qemu_real_host_page_size, PROT_NONE);
> +    mprotect(code_gen_buf2 + size, qemu_real_host_page_size, PROT_NONE);
> +
>      return buf;
>  }
>  #endif /* USE_STATIC_CODE_GEN_BUFFER, WIN32, POSIX */
> @@ -829,10 +885,48 @@ static void page_flush_tb(void)
>      }
>  }
>
> +struct code_gen_desc {
> +    struct rcu_head rcu;
> +    int clear_bit;
> +};
> +
> +static void clear_code_gen_buffer(struct rcu_head *rcu)
> +{
> +    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
> +
> +    tb_lock();
> +    code_gen_buf_mask &= ~desc->clear_bit;
> +    tb_unlock();
> +    g_free(desc);
> +}
> +
> +static void *replace_code_gen_buffer(void)
> +{
> +    struct code_gen_desc *desc = g_malloc0(sizeof(*desc));
> +
> +    /*
> +     * If both bits are set, we're having two concurrent flushes. This
> +     * can easily happen if the buffers are heavily undersized.
> +     */
> +    assert(code_gen_buf_mask == 1 || code_gen_buf_mask == 2);
> +
> +    desc->clear_bit = code_gen_buf_mask;
> +    call_rcu1(&desc->rcu, clear_code_gen_buffer);
> +
> +    if (code_gen_buf_mask == 1) {
> +        code_gen_buf_mask |= 2;
> +        return code_gen_buf2 + code_gen_prologue_size;
> +    }
> +    code_gen_buf_mask |= 1;
> +    return code_gen_buf1 + code_gen_prologue_size;
> +}
> +
>  /* flush all the translation blocks */
>  /* XXX: tb_flush is currently not thread safe */
>  void tb_flush(CPUState *cpu)
>  {
> +    void *buf;
> +
>  #if defined(DEBUG_FLUSH)
>      printf("qemu: flush code_size=%ld nb_tbs=%d avg_tb_size=%ld\n",
>             (unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer),
> @@ -853,10 +947,17 @@ void tb_flush(CPUState *cpu)
>      qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
>      page_flush_tb();
>
> -    tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
> +    buf = replace_code_gen_buffer();
> +    tcg_code_gen_init(&tcg_ctx, buf, code_gen_prologue_size);
> +
>      /* XXX: flush processor icache at this point if cache flush is
>         expensive */
>      tcg_ctx.tb_ctx.tb_flush_count++;
> +
> +    /* exit all CPUs so that the old buffer is quickly cleared */
> +    CPU_FOREACH(cpu) {
> +        cpu_exit(cpu);
> +    }
>  }
>
>  #ifdef DEBUG_TB_CHECK


--
Alex Bennée
Alex Bennée April 26, 2016, 6:35 a.m. UTC | #3
Richard Henderson <rth@twiddle.net> writes:

> On 04/25/2016 04:46 PM, Emilio G. Cota wrote:
>> +    /*
>> +     * write the prologue into buf2. This is safe because we'll later call
>> +     * tcg_prologue_init on buf1, from which we'll start execution.
>> +     */
>> +    tcg_ctx.code_gen_buffer = code_gen_buf2;
>> +    tcg_prologue_init(&tcg_ctx);
>> +
>
> Ah, no.  Write only one prologue, not one per buffer.
>
> If they're sufficiently close (i.e. one allocation under the max size),
> then the same one can be used for both halves.
>
> The global variables that you didn't see in this revision are:
>
> aarch64/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> arm/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> i386/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> ia64/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> ia64/tcg-target.inc.c:    tcg_insn_unit *thunks[8] = { };
> mips/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> ppc/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> s390/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> sparc/tcg-target.inc.c:static tcg_insn_unit *qemu_ld_trampoline[16];
> sparc/tcg-target.inc.c:static tcg_insn_unit *qemu_st_trampoline[16];

Aside from the existing code structure, is there any reason to have only
one prologue? It doesn't seem to be a large amount of code, and in the
case of smaller translation regions I would posit that having a
"local" prologue/epilogue would make the jumps cheaper.

>
>
> r~


--
Alex Bennée
Richard Henderson April 26, 2016, 3:42 p.m. UTC | #4
On 04/25/2016 11:35 PM, Alex Bennée wrote:
> 
> Richard Henderson <rth@twiddle.net> writes:
> 
>> On 04/25/2016 04:46 PM, Emilio G. Cota wrote:
>>> +    /*
>>> +     * write the prologue into buf2. This is safe because we'll later call
>>> +     * tcg_prologue_init on buf1, from which we'll start execution.
>>> +     */
>>> +    tcg_ctx.code_gen_buffer = code_gen_buf2;
>>> +    tcg_prologue_init(&tcg_ctx);
>>> +
>>
>> Ah, no.  Write only one prologue, not one per buffer.
>>
>> If they're sufficiently close (i.e. one allocation under the max size),
>> then the same one can be used for both halves.
>>
>> The global variables that you didn't see in this revision are:
>>
>> aarch64/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> arm/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> i386/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> ia64/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> ia64/tcg-target.inc.c:    tcg_insn_unit *thunks[8] = { };
>> mips/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> ppc/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> s390/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> sparc/tcg-target.inc.c:static tcg_insn_unit *qemu_ld_trampoline[16];
>> sparc/tcg-target.inc.c:static tcg_insn_unit *qemu_st_trampoline[16];
> 
> Aside from the existing code structure is there any reason to have only
> one prologue? 

Well, there's also the gdb jit unwind info.  But aside from those, no.

> It doesn't seem to be a large amount of code and in the
> case of having smaller translation regions I would posit having a
> "local" prologue/epilogue would make the jumps cheaper.

Not really.  The jumps are generally in range already, based on the restriction
in max buffer size.

Really, arm32 (and ppc32, post direct jump atomicity patchset) are the only
ones that require a tiny (less than 64MB) buffer.  For anything bigger than
64MB, I don't see any reason to create two independent buffers.

The other consideration not yet mentioned is that you'd like to put the
entire buffer, in the case of x86_64 and some others, within 2GB of the main
executable, so that helper calls can use a direct call insn.



r~
Emilio Cota April 30, 2016, 3:40 a.m. UTC | #5
On Tue, Apr 26, 2016 at 07:32:39 +0100, Alex Bennée wrote:
> Emilio G. Cota <cota@braap.org> writes:
> > With two code_gen "halves", if two tb_flush calls are done in the same
> > RCU read critical section, we're screwed. I added a cpu_exit at the end
> > of tb_flush to try to mitigate this, but I haven't audited all the callers
> > (for instance, what does the gdbstub do?).
> 
> I'm not sure we are going to get much from this approach. The tb_flush
> is a fairly rare occurrence its not like its on the critical performance
> path (although of course pathological cases are possible).

This is what I thought from the beginning, but wanted to give this
alternative a go anyway to see if it was feasible.

On my end I won't do any more work on this approach. Will go back
to locks, despite Paolo's (justified) dislike for them =)

> > If we end up having a mechanism to "stop all  CPUs to do something", as
> > I think we'll end up needing for correct LL/SC emulation, we'll probably
> > be better off using that mechanism for tb_flush as well -- plus, we'll avoid
> > wasting memory.
> 
> I'm fairly certain there will need to be a "stop everything" mode for
> some things - I'm less certain of the best way of doing it. Did you get
> a chance to look at my version of the async_safe_work mechanism?

Not yet, but will get to it very soon.

Cheers,

		Emilio
Paolo Bonzini May 9, 2016, 11:21 a.m. UTC | #6
On 30/04/2016 05:40, Emilio G. Cota wrote:
>> The tb_flush
>> > is a fairly rare occurrence its not like its on the critical performance
>> > path (although of course pathological cases are possible).
> This is what I thought from the beginning, but wanted to give this
> alternative a go anyway to see if it was feasible.
> 
> On my end I won't do any more work on this approach. Will go back
> to locks, despite Paolo's (justified) dislike for them =)

Which locks?  tb_lock during tb_find_fast?  The problem with that was
that it slowed down everything a lot, wasn't it?

To me, the RCU idea is not really about making tb_flush (the rare case)
faster; it was more about keeping the rest simple and fast.

Paolo
Alex Bennée May 9, 2016, 11:50 a.m. UTC | #7
Paolo Bonzini <pbonzini@redhat.com> writes:

> On 30/04/2016 05:40, Emilio G. Cota wrote:
>>> The tb_flush
>>> > is a fairly rare occurrence its not like its on the critical performance
>>> > path (although of course pathological cases are possible).
>> This is what I thought from the beginning, but wanted to give this
>> alternative a go anyway to see if it was feasible.
>>
>> On my end I won't do any more work on this approach. Will go back
>> to locks, despite Paolo's (justified) dislike for them =)
>
> Which locks?  tb_lock during tb_find_fast?  The problem with that was
> that it slowed down everything a lot, wasn't it?

Very much so: in the new tree (coming soon) with QHT I was able to
remove the locks from the whole hot path, which means they were only
needed for code generation.

> To me, the RCU idea is not really about making tb_flush (the rare case)
> faster; it was more about keeping the rest simple and fast.

I'm not sure it achieved that, as there is added complexity from having
the split buffer and then ensuring you don't double-flush.

>
> Paolo


--
Alex Bennée
Paolo Bonzini May 9, 2016, 1:55 p.m. UTC | #8
On 09/05/2016 13:50, Alex Bennée wrote:
> > Which locks?  tb_lock during tb_find_fast?  The problem with that was
> > that it slowed down everything a lot, wasn't it?
> 
> Very much so, in the new tree (coming soon) with QHT I was able to
> remove the locks from the whole hot-path which means they where only
> needed for code generation.

Okay, I'm curious now. :)

> > To me, the RCU idea is not really about making tb_flush (the rare case)
> > faster; it was more about keeping the rest simple and fast.
> 
> I'm not sure it achieved that as there is added complexity from having
> the split buffer and then ensuring you don't double-flush.

Agreed.

Paolo
Alex Bennée May 9, 2016, 3:05 p.m. UTC | #9
Paolo Bonzini <pbonzini@redhat.com> writes:

> On 09/05/2016 13:50, Alex Bennée wrote:
>> > Which locks?  tb_lock during tb_find_fast?  The problem with that was
>> > that it slowed down everything a lot, wasn't it?
>>
>> Very much so, in the new tree (coming soon) with QHT I was able to
>> remove the locks from the whole hot-path which means they where only
>> needed for code generation.
>
> Okay, I'm curious now. :)

https://github.com/stsquad/qemu/commits/mttcg/base-patches-v3 is the
current WIP, with:

https://github.com/stsquad/qemu/commit/0823f1c77f12ed5958f77484d6477ea205aee220

being the commit that clears the hot-path to run without locks.

The tree is based on tcg-next, which has made things a lot cleaner now
that a bunch of Sergey's stuff has been grabbed by rth. Obviously, being
WIP, it is subject to change. Once I'm done with my current out-of-tree
diversions I'll be back onto cleaning the tree up for the next review round.

Review comments on the posted tree are always welcome, of course ;-)

>
>> > To me, the RCU idea is not really about making tb_flush (the rare case)
>> > faster; it was more about keeping the rest simple and fast.
>>
>> I'm not sure it achieved that as there is added complexity from having
>> the split buffer and then ensuring you don't double-flush.
>
> Agreed.
>
> Paolo


--
Alex Bennée
Emilio Cota May 9, 2016, 5:07 p.m. UTC | #10
On Mon, May 09, 2016 at 13:21:50 +0200, Paolo Bonzini wrote:
> On 30/04/2016 05:40, Emilio G. Cota wrote:
> >> The tb_flush
> >> > is a fairly rare occurrence its not like its on the critical performance
> >> > path (although of course pathological cases are possible).
> > This is what I thought from the beginning, but wanted to give this
> > alternative a go anyway to see if it was feasible.
> > 
> > On my end I won't do any more work on this approach. Will go back
> > to locks, despite Paolo's (justified) dislike for them =)
> 
> Which locks?  tb_lock during tb_find_fast?  The problem with that was
> that it slowed down everything a lot, wasn't it?

By "locks" I meant somehow forcing other threads/vcpus to stop, to then
perform tb_flush. This can be achieved with a bunch of primitives
such as condition variables (*gaaah*) -- these of course involve "locks" :P

The removal of tb_lock() when looking up tb's is orthogonal to this.
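
For illustration, a self-contained sketch of such a rendezvous (plain
pthreads, not QEMU's cpus.c; the function names are made up) could look
like this -- in QEMU the flushing thread would additionally kick the vcpus
(cpu_exit) so they reach the safe point promptly:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool stop_request;
static int running_vcpus;

/* each vcpu thread registers itself once at startup */
static void vcpu_thread_start(void)
{
    pthread_mutex_lock(&lock);
    running_vcpus++;
    pthread_mutex_unlock(&lock);
}

/* each vcpu thread calls this at a safe point, e.g. between TBs */
static void vcpu_safe_point(void)
{
    pthread_mutex_lock(&lock);
    if (stop_request) {
        running_vcpus--;
        pthread_cond_broadcast(&cond);      /* wake the flushing thread */
        while (stop_request) {
            pthread_cond_wait(&cond, &lock);
        }
        running_vcpus++;
    }
    pthread_mutex_unlock(&lock);
}

/* the thread that wants to flush runs this */
static void stop_all_and_run(void (*func)(void))
{
    pthread_mutex_lock(&lock);
    stop_request = true;
    while (running_vcpus > 0) {             /* wait for every vcpu to park */
        pthread_cond_wait(&cond, &lock);
    }
    func();                                 /* e.g. the body of tb_flush */
    stop_request = false;
    pthread_cond_broadcast(&cond);          /* let the vcpus resume */
    pthread_mutex_unlock(&lock);
}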

> To me, the RCU idea is not really about making tb_flush (the rare case)
> faster; it was more about keeping the rest simple and fast.

Well I agree with this idea; I wanted to see how far we could take it.

		E.

Patch

diff --git a/tcg/tcg.c b/tcg/tcg.c
index b46bf1a..7db8ce9 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -380,6 +380,20 @@  void tcg_context_init(TCGContext *s)
     }
 }
 
+void tcg_code_gen_init(TCGContext *s, void *buf, size_t prologue_size)
+{
+    size_t size = s->code_gen_buffer_size - prologue_size;
+
+    s->code_gen_ptr = buf;
+    s->code_gen_buffer = buf;
+    s->code_buf = buf;
+
+    /* Compute a high-water mark, at which we voluntarily flush the buffer
+       and start over.  The size here is arbitrary, significantly larger
+       than we expect the code generation for any one opcode to require.  */
+    s->code_gen_highwater = s->code_buf + (size - 1024);
+}
+
 void tcg_prologue_init(TCGContext *s)
 {
     size_t prologue_size, total_size;
@@ -398,16 +412,10 @@  void tcg_prologue_init(TCGContext *s)
 
     /* Deduct the prologue from the buffer.  */
     prologue_size = tcg_current_code_size(s);
-    s->code_gen_ptr = buf1;
-    s->code_gen_buffer = buf1;
-    s->code_buf = buf1;
     total_size = s->code_gen_buffer_size - prologue_size;
     s->code_gen_buffer_size = total_size;
 
-    /* Compute a high-water mark, at which we voluntarily flush the buffer
-       and start over.  The size here is arbitrary, significantly larger
-       than we expect the code generation for any one opcode to require.  */
-    s->code_gen_highwater = s->code_gen_buffer + (total_size - 1024);
+    tcg_code_gen_init(s, buf1, prologue_size);
 
     tcg_register_jit(s->code_gen_buffer, total_size);
 
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 40c8fbe..e849d3e 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -634,6 +634,7 @@  static inline void *tcg_malloc(int size)
 
 void tcg_context_init(TCGContext *s);
 void tcg_prologue_init(TCGContext *s);
+void tcg_code_gen_init(TCGContext *s, void *buf, size_t prologue_size);
 void tcg_func_start(TCGContext *s);
 
 int tcg_gen_code(TCGContext *s, TranslationBlock *tb);
diff --git a/translate-all.c b/translate-all.c
index bba9b62..7a83176 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -485,6 +485,11 @@  static inline PageDesc *page_find(tb_page_addr_t index)
   (DEFAULT_CODE_GEN_BUFFER_SIZE_1 < MAX_CODE_GEN_BUFFER_SIZE \
    ? DEFAULT_CODE_GEN_BUFFER_SIZE_1 : MAX_CODE_GEN_BUFFER_SIZE)
 
+static int code_gen_buf_mask;
+static void *code_gen_buf1;
+static void *code_gen_buf2;
+static size_t code_gen_prologue_size;
+
 static inline size_t size_code_gen_buffer(size_t tb_size)
 {
     /* Size the buffer.  */
@@ -508,6 +513,43 @@  static inline size_t size_code_gen_buffer(size_t tb_size)
     return tb_size;
 }
 
+static void *split_code_gen_buffer(void *buf, size_t size)
+{
+    /* enforce alignment of the buffer to a page boundary */
+    if (unlikely((uintptr_t)buf & ~qemu_real_host_page_mask)) {
+        uintptr_t b;
+
+        b = QEMU_ALIGN_UP((uintptr_t)buf, qemu_real_host_page_size);
+        size -= b - (uintptr_t)buf;
+        buf = (void *)b;
+    }
+    /*
+     * Make sure the size is an even number of pages so that both halves will
+     * end on a page boundary.
+     */
+    size = QEMU_ALIGN_DOWN(size, 2 * qemu_real_host_page_size);
+
+    /* split in half, making room for 2 guard pages */
+    size -= 2 * qemu_real_host_page_size;
+    size /= 2;
+    code_gen_buf1 = buf;
+    code_gen_buf2 = buf + size + qemu_real_host_page_size;
+
+    /*
+     * write the prologue into buf2. This is safe because we'll later call
+     * tcg_prologue_init on buf1, from which we'll start execution.
+     */
+    tcg_ctx.code_gen_buffer = code_gen_buf2;
+    tcg_prologue_init(&tcg_ctx);
+    code_gen_prologue_size = (void *)tcg_ctx.code_ptr - code_gen_buf2;
+
+    tcg_ctx.code_gen_buffer_size = size;
+
+    /* start execution from buf1 */
+    code_gen_buf_mask = 1;
+    return code_gen_buf1;
+}
+
 #ifdef __mips__
 /* In order to use J and JAL within the code_gen_buffer, we require
    that the buffer not cross a 256MB boundary.  */
@@ -583,21 +625,17 @@  static inline void map_none(void *addr, long size)
 static inline void *alloc_code_gen_buffer(void)
 {
     void *buf = static_code_gen_buffer;
-    size_t full_size, size;
+    size_t size;
 
     /* The size of the buffer, rounded down to end on a page boundary.  */
-    full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
-                 & qemu_real_host_page_mask) - (uintptr_t)buf;
-
-    /* Reserve a guard page.  */
-    size = full_size - qemu_real_host_page_size;
+    size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
+            & qemu_real_host_page_mask) - (uintptr_t)buf;
 
     /* Honor a command-line option limiting the size of the buffer.  */
     if (size > tcg_ctx.code_gen_buffer_size) {
         size = (((uintptr_t)buf + tcg_ctx.code_gen_buffer_size)
                 & qemu_real_host_page_mask) - (uintptr_t)buf;
     }
-    tcg_ctx.code_gen_buffer_size = size;
 
 #ifdef __mips__
     if (cross_256mb(buf, size)) {
@@ -607,27 +645,40 @@  static inline void *alloc_code_gen_buffer(void)
 #endif
 
     map_exec(buf, size);
-    map_none(buf + size, qemu_real_host_page_size);
     qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
 
+    buf = split_code_gen_buffer(buf, size);
+    size = tcg_ctx.code_gen_buffer_size;
+
+    /* page guards */
+    map_none(code_gen_buf1 + size, qemu_real_host_page_size);
+    map_none(code_gen_buf2 + size, qemu_real_host_page_size);
+
     return buf;
 }
 #elif defined(_WIN32)
 static inline void *alloc_code_gen_buffer(void)
 {
     size_t size = tcg_ctx.code_gen_buffer_size;
-    void *buf1, *buf2;
+    void *buf;
+    void *ret;
 
-    /* Perform the allocation in two steps, so that the guard page
-       is reserved but uncommitted.  */
-    buf1 = VirtualAlloc(NULL, size + qemu_real_host_page_size,
-                        MEM_RESERVE, PAGE_NOACCESS);
-    if (buf1 != NULL) {
-        buf2 = VirtualAlloc(buf1, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
-        assert(buf1 == buf2);
+    /* Perform the allocation in two steps, so that the guard pages
+       are reserved but uncommitted.  */
+    ret = VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
+    if (ret == NULL) {
+        return NULL;
     }
 
-    return buf1;
+    ret = split_code_gen_buffer(ret, size);
+    size = tcg_ctx.code_gen_buffer_size;
+
+    buf = VirtualAlloc(code_gen_buf1, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
+    assert(buf);
+    buf = VirtualAlloc(code_gen_buf2, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
+    assert(buf);
+
+    return ret;
 }
 #else
 static inline void *alloc_code_gen_buffer(void)
@@ -665,8 +716,7 @@  static inline void *alloc_code_gen_buffer(void)
 #  endif
 # endif
 
-    buf = mmap((void *)start, size + qemu_real_host_page_size,
-               PROT_NONE, flags, -1, 0);
+    buf = mmap((void *)start, size, PROT_NONE, flags, -1, 0);
     if (buf == MAP_FAILED) {
         return NULL;
     }
@@ -676,24 +726,24 @@  static inline void *alloc_code_gen_buffer(void)
         /* Try again, with the original still mapped, to avoid re-acquiring
            that 256mb crossing.  This time don't specify an address.  */
         size_t size2;
-        void *buf2 = mmap(NULL, size + qemu_real_host_page_size,
-                          PROT_NONE, flags, -1, 0);
+        void *buf2 = mmap(NULL, size, PROT_NONE, flags, -1, 0);
+
         switch (buf2 != MAP_FAILED) {
         case 1:
             if (!cross_256mb(buf2, size)) {
                 /* Success!  Use the new buffer.  */
-                munmap(buf, size + qemu_real_host_page_size);
+                munmap(buf, size);
                 break;
             }
             /* Failure.  Work with what we had.  */
-            munmap(buf2, size + qemu_real_host_page_size);
+            munmap(buf2, size);
             /* fallthru */
         default:
             /* Split the original buffer.  Free the smaller half.  */
             buf2 = split_cross_256mb(buf, size);
             size2 = tcg_ctx.code_gen_buffer_size;
             if (buf == buf2) {
-                munmap(buf + size2 + qemu_real_host_page_size, size - size2);
+                munmap(buf + size2, size - size2);
             } else {
                 munmap(buf, size - size2);
             }
@@ -704,13 +754,19 @@  static inline void *alloc_code_gen_buffer(void)
     }
 #endif
 
-    /* Make the final buffer accessible.  The guard page at the end
-       will remain inaccessible with PROT_NONE.  */
+    /* Make the final buffer accessible. */
     mprotect(buf, size, PROT_WRITE | PROT_READ | PROT_EXEC);
 
     /* Request large pages for the buffer.  */
     qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
 
+    buf = split_code_gen_buffer(buf + 1, size);
+    size = tcg_ctx.code_gen_buffer_size;
+
+    /* page guards */
+    mprotect(code_gen_buf1 + size, qemu_real_host_page_size, PROT_NONE);
+    mprotect(code_gen_buf2 + size, qemu_real_host_page_size, PROT_NONE);
+
     return buf;
 }
 #endif /* USE_STATIC_CODE_GEN_BUFFER, WIN32, POSIX */
@@ -829,10 +885,48 @@  static void page_flush_tb(void)
     }
 }
 
+struct code_gen_desc {
+    struct rcu_head rcu;
+    int clear_bit;
+};
+
+static void clear_code_gen_buffer(struct rcu_head *rcu)
+{
+    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+    tb_lock();
+    code_gen_buf_mask &= ~desc->clear_bit;
+    tb_unlock();
+    g_free(desc);
+}
+
+static void *replace_code_gen_buffer(void)
+{
+    struct code_gen_desc *desc = g_malloc0(sizeof(*desc));
+
+    /*
+     * If both bits are set, we're having two concurrent flushes. This
+     * can easily happen if the buffers are heavily undersized.
+     */
+    assert(code_gen_buf_mask == 1 || code_gen_buf_mask == 2);
+
+    desc->clear_bit = code_gen_buf_mask;
+    call_rcu1(&desc->rcu, clear_code_gen_buffer);
+
+    if (code_gen_buf_mask == 1) {
+        code_gen_buf_mask |= 2;
+        return code_gen_buf2 + code_gen_prologue_size;
+    }
+    code_gen_buf_mask |= 1;
+    return code_gen_buf1 + code_gen_prologue_size;
+}
+
 /* flush all the translation blocks */
 /* XXX: tb_flush is currently not thread safe */
 void tb_flush(CPUState *cpu)
 {
+    void *buf;
+
 #if defined(DEBUG_FLUSH)
     printf("qemu: flush code_size=%ld nb_tbs=%d avg_tb_size=%ld\n",
            (unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer),
@@ -853,10 +947,17 @@  void tb_flush(CPUState *cpu)
     qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
     page_flush_tb();
 
-    tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
+    buf = replace_code_gen_buffer();
+    tcg_code_gen_init(&tcg_ctx, buf, code_gen_prologue_size);
+
     /* XXX: flush processor icache at this point if cache flush is
        expensive */
     tcg_ctx.tb_ctx.tb_flush_count++;
+
+    /* exit all CPUs so that the old buffer is quickly cleared */
+    CPU_FOREACH(cpu) {
+        cpu_exit(cpu);
+    }
 }
 
 #ifdef DEBUG_TB_CHECK