diff mbox

[6/8] tcg: Add bytecode generator for tcg interpreter

Message ID 1316289634-18786-6-git-send-email-weil@mail.berlios.de
State Superseded
Headers show

Commit Message

Stefan Weil Sept. 17, 2011, 8 p.m. UTC
Unlike other tcg target code generators, this one does not generate
machine code for some cpu. It generates machine independent bytecode
which is interpreted later.

This allows running QEMU on any host.

Interpreted bytecode is slower than direct execution of generated
machine code.

Signed-off-by: Stefan Weil <weil@mail.berlios.de>
---
 dis-asm.h                 |    1 +
 disas.c                   |    4 +-
 dyngen-exec.h             |   13 +-
 exec-all.h                |   13 +-
 tcg/bytecode/README       |  129 ++++++
 tcg/bytecode/tcg-target.c |  955 +++++++++++++++++++++++++++++++++++++++++++++
 tcg/bytecode/tcg-target.h |  152 +++++++
 7 files changed, 1263 insertions(+), 4 deletions(-)
 create mode 100644 tcg/bytecode/README
 create mode 100644 tcg/bytecode/tcg-target.c
 create mode 100644 tcg/bytecode/tcg-target.h

Comments

Blue Swirl Sept. 18, 2011, 10:03 a.m. UTC | #1
On Sat, Sep 17, 2011 at 8:00 PM, Stefan Weil <weil@mail.berlios.de> wrote:
> Unlike other tcg target code generators, this one does not generate
> machine code for some cpu. It generates machine independent bytecode
> which is interpreted later.
>
> This allows running QEMU on any host.
>
> Interpreted bytecode is slower than direct execution of generated
> machine code.
>
> Signed-off-by: Stefan Weil <weil@mail.berlios.de>
> ---
>  dis-asm.h                 |    1 +
>  disas.c                   |    4 +-
>  dyngen-exec.h             |   13 +-
>  exec-all.h                |   13 +-
>  tcg/bytecode/README       |  129 ++++++
>  tcg/bytecode/tcg-target.c |  955 +++++++++++++++++++++++++++++++++++++++++++++
>  tcg/bytecode/tcg-target.h |  152 +++++++
>  7 files changed, 1263 insertions(+), 4 deletions(-)
>  create mode 100644 tcg/bytecode/README
>  create mode 100644 tcg/bytecode/tcg-target.c
>  create mode 100644 tcg/bytecode/tcg-target.h

It would be nice to use either 'bytecode' or TCI (or TCG interpreter)
consistently, also for the file names.

>
> diff --git a/dis-asm.h b/dis-asm.h
> index 5b07d7f..876975f 100644
> --- a/dis-asm.h
> +++ b/dis-asm.h
> @@ -365,6 +365,7 @@ typedef struct disassemble_info {
>    target address.  Return number of bytes processed.  */
>  typedef int (*disassembler_ftype) (bfd_vma, disassemble_info *);
>
> +int print_insn_bytecode(bfd_vma, disassemble_info*);
>  int print_insn_big_mips         (bfd_vma, disassemble_info*);
>  int print_insn_little_mips      (bfd_vma, disassemble_info*);
>  int print_insn_i386             (bfd_vma, disassemble_info*);
> diff --git a/disas.c b/disas.c
> index 611b30b..e2061d8 100644
> --- a/disas.c
> +++ b/disas.c
> @@ -273,7 +273,9 @@ void disas(FILE *out, void *code, unsigned long size)
>  #else
>     disasm_info.endian = BFD_ENDIAN_LITTLE;
>  #endif
> -#if defined(__i386__)
> +#if defined(CONFIG_TCG_INTERPRETER)
> +    print_insn = print_insn_bytecode;
> +#elif defined(__i386__)
>     disasm_info.mach = bfd_mach_i386_i386;
>     print_insn = print_insn_i386;
>  #elif defined(__x86_64__)
> diff --git a/dyngen-exec.h b/dyngen-exec.h
> index 8beb7f3..64f76c4 100644
> --- a/dyngen-exec.h
> +++ b/dyngen-exec.h
> @@ -19,7 +19,9 @@
>  #if !defined(__DYNGEN_EXEC_H__)
>  #define __DYNGEN_EXEC_H__
>
> -#if defined(__i386__)
> +#if defined(CONFIG_TCG_INTERPRETER)
> +/* The TCG interpreter does not use special registers. */
> +#elif defined(__i386__)
>  #define AREG0 "ebp"
>  #elif defined(__x86_64__)
>  #define AREG0 "r14"
> @@ -55,11 +57,18 @@
>  #error unsupported CPU
>  #endif
>
> +#if defined(AREG0)
>  register CPUState *env asm(AREG0);
> +#else
> +extern CPUState *env;

Maybe cpu_single_env could be used instead.

> +#endif
>
>  /* The return address may point to the start of the next instruction.
>    Subtracting one gets us the call instruction itself.  */
> -#if defined(__s390__) && !defined(__s390x__)
> +#if defined(CONFIG_TCG_INTERPRETER)
> +extern uint8_t *tci_tb_ptr;

Why is this here? Could it be somewhere in tcg/*.h?

> +# define GETPC() ((void *)tci_tb_ptr)
> +#elif defined(__s390__) && !defined(__s390x__)
>  # define GETPC() ((void*)(((unsigned long)__builtin_return_address(0) & 0x7fffffffUL) - 1))
>  #elif defined(__arm__)
>  /* Thumb return addresses have the low bit set, so we need to subtract two.
> diff --git a/exec-all.h b/exec-all.h
> index 9b8d62c..0116acd 100644
> --- a/exec-all.h
> +++ b/exec-all.h
> @@ -122,6 +122,8 @@ void tlb_set_page(CPUState *env, target_ulong vaddr,
>
>  #if defined(_ARCH_PPC) || defined(__x86_64__) || defined(__arm__) || defined(__i386__)
>  #define USE_DIRECT_JUMP
> +#elif defined(CONFIG_TCG_INTERPRETER)
> +#define USE_DIRECT_JUMP
>  #endif
>
>  struct TranslationBlock {
> @@ -189,7 +191,14 @@ extern TranslationBlock *tb_phys_hash[CODE_GEN_PHYS_HASH_SIZE];
>
>  #if defined(USE_DIRECT_JUMP)
>
> -#if defined(_ARCH_PPC)
> +#if defined(CONFIG_TCG_INTERPRETER)
> +static inline void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr)
> +{
> +    /* patch the branch destination */
> +    *(uint32_t *)jmp_addr = addr - (jmp_addr + 4);
> +    /* no need to flush icache explicitly */
> +}
> +#elif defined(_ARCH_PPC)
>  void ppc_tb_set_jmp_target(unsigned long jmp_addr, unsigned long addr);
>  #define tb_set_jmp_target1 ppc_tb_set_jmp_target
>  #elif defined(__i386__) || defined(__x86_64__)
> @@ -223,6 +232,8 @@ static inline void tb_set_jmp_target1(unsigned long jmp_addr, unsigned long addr
>     __asm __volatile__ ("swi 0x9f0002" : : "r" (_beg), "r" (_end), "r" (_flg));
>  #endif
>  }
> +#else
> +#error tb_set_jmp_target1 is missing
>  #endif
>
>  static inline void tb_set_jmp_target(TranslationBlock *tb,
> diff --git a/tcg/bytecode/README b/tcg/bytecode/README
> new file mode 100644
> index 0000000..6fe9755
> --- /dev/null
> +++ b/tcg/bytecode/README
> @@ -0,0 +1,129 @@
> +TCG Interpreter (TCI) - Copyright (c) 2011 Stefan Weil.
> +
> +This file is released under GPL 2 or later.
> +
> +1) Introduction
> +
> +TCG (Tiny Code Generator) is a code generator which translates
> +code fragments ("basic blocks") from target code (any of the
> +targets supported by QEMU) to a code representation which
> +can be run on a host.
> +
> +QEMU can create native code for some hosts (arm, hppa, i386, ia64, ppc, ppc64,
> +s390, sparc, x86_64). For others, unofficial host support was written.
> +
> +By adding a code generator for a virtual machine and using an
> +interpreter for the generated bytecode, it is possible to
> +support (almost) any host.

This sounds like there are some limitations. I'm curious what they are.

> +
> +This is what TCI (Tiny Code Interpreter) does.
> +
> +2) Implementation
> +
> +Like each TCG host frontend, TCI implements the code generator in
> +tcg-target.c, tcg-target.h. Both files are in directory tcg/bytecode.
> +
> +The additional file tcg/tci.c adds the interpreter.
> +
> +The bytecode consists of opcodes (same numeric values as those used by
> +TCG), command length and arguments of variable size and number.
> +
> +3) Usage
> +
> +For hosts without native TCG, the interpreter TCI must be enabled by
> +
> +        configure --enable-tcg-interpreter
> +
> +If configure is called without --enable-tcg-interpreter, it will
> +suggest using this option. Setting it automatically would need
> +additional code in configure which must be fixed when new native TCG
> +implementations are added.
> +
> +System emulation should work on any 32 or 64 bit host.
> +User mode emulation might work. Maybe a new loader (*.ld)
> +is needed. Byte order might be wrong (on big endian hosts)
> +and need fixes in configure.
> +
> +For hosts with native TCG, the interpreter TCI can be enabled by
> +
> +        configure --enable-tcg-interpreter
> +
> +The only difference from running qemu with TCI to running without TCI

s/qemu/QEMU/

> +should be speed. Especially during development of TCI, it was very
> +useful to compare runs with and without TCI. Create /tmp/qemu.log by
> +
> +        qemu -d in_asm,op_opt,cpu -singlestep
> +
> +once with interpreter and once without interpreter and compare the resulting
> +qemu.log files. This is also useful to see the effects of additional
> +registers or additional opcodes (it is easy to modify the virtual machine).
> +It can also be used to verify native TCGs.
> +
> +Hosts with native TCG can also enable TCI by claiming to be unsupported:
> +
> +        configure --cpu=unknown --enable-tcg-interpreter
> +
> +configure then no longer uses the native loader (*.ld) for user mode emulation.

s/loader/linker script/

> +
> +
> +4) Status
> +
> +TCI needs special implementation for 32 and 64 bit host, 32 and 64 bit target,
> +host and target with same or different endianness.
> +
> +            | host (le)                     host (be)
> +            | 32             64             32             64
> +------------+------------------------------------------------------------
> +target (le) | s0, u0         s1, u1         s?, u?         s?, u?
> +32 bit      |
> +            |
> +target (le) | sc, uc         s1, u1         s?, u?         s?, u?
> +64 bit      |
> +            |
> +target (be) | sc, u0         sc, uc         s?, u?         s?, u?
> +32 bit      |
> +            |
> +target (be) | sc, uc         sc, uc         s?, u?         s?, u?
> +64 bit      |
> +            |
> +
> +System emulation
> +s? = untested
> +sc = compiles
> +s0 = bios works
> +s1 = grub works
> +s2 = linux boots

s/linux/Linux/

> +
> +Linux user mode emulation
> +u? = untested
> +uc = compiles
> +u0 = static hello works
> +u1 = linux-user-test works
> +
> +5) Todo list
> +
> +* TCI is not widely tested. It was written and tested on a x86_64 host
> +  running i386 and x86_64 system emulation and linux user mode.
> +  A cross compiled qemu for i386 host also works with the same basic tests.
> +  A cross compiled qemu for mipsel host works, too. It is terribly slow
> +  because I run it in a mips malta emulation, so it is an interpreted
> +  emulation in an emulation.
> +  A cross compiled qemu for arm host works (tested with pc bios).
> +  A cross compiled qemu for ppc host works at least partially:
> +  i386-linux-user/qemu-i386 can run a simple hello-world program
> +  (tested in a ppc emulation).
> +
> +* Some TCG opcodes are either missing in the code generator and/or
> +  in the interpreter. These opcodes raise a runtime exception, so it is
> +  possible to see where code must be added.
> +
> +* The pseudo code is not optimized and still ugly. For hosts with special
> +  alignment requirements, it needs some fixes (maybe aligned bytecode
> +  would also improve speed for hosts which support byte alignment).
> +
> +* A better disassembler for the pseudo code would be nice (a very primitive
> +  disassembler is included in tcg-target.c).
> +
> +* It might be useful to have a runtime option which selects the native TCG
> +  or TCI, so qemu would have to include two TCGs. Today, selecting TCI
> +  is a configure option, so you need two compilations of qemu.
> diff --git a/tcg/bytecode/tcg-target.c b/tcg/bytecode/tcg-target.c
> new file mode 100644
> index 0000000..f505ff0
> --- /dev/null
> +++ b/tcg/bytecode/tcg-target.c
> @@ -0,0 +1,955 @@
> +/*
> + * Tiny Code Generator for QEMU
> + *
> + * Copyright (c) 2009, 2011 Stefan Weil
> + *
> + * This program is free software: you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation, either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program.  If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +/* TODO list:
> + * - See TODO comments in code.
> + */
> +
> +/* Marker for missing code. */
> +#define TODO() \
> +    do { \
> +        fprintf(stderr, "TODO %s:%u: %s()\n", \
> +                __FILE__, __LINE__, __func__); \
> +        tcg_abort(); \
> +    } while (0)
> +
> +/* Trace message to see program flow. */
> +#if defined(CONFIG_DEBUG_TCG_INTERPRETER)
> +#define TRACE() \
> +    loglevel \
> +    ? fprintf(stderr, "TCG %s:%u: %s()\n", __FILE__, __LINE__, __func__) \
> +    : (void)0
> +#else
> +#define TRACE() ((void)0)
> +#endif

Perhaps tracepoints could be used instead.

> +
> +/* Single bit n. */
> +#define BIT(n) (1 << (n))
> +
> +/* Bitfield n...m (in 32 bit value). */
> +#define BITS(n, m) (((0xffffffffU << (31 - n)) >> (31 - n + m)) << m)
> +
> +/* Used for function call generation. */
> +#define TCG_REG_CALL_STACK              TCG_REG_R4
> +#define TCG_TARGET_STACK_ALIGN          16
> +#define TCG_TARGET_CALL_STACK_OFFSET    0
> +
> +/* TODO: documentation. */
> +static uint8_t *tb_ret_addr;
> +
> +/* Macros used in tcg_target_op_defs. */
> +#define R       "r"
> +#define RI      "ri"
> +#if TCG_TARGET_REG_BITS == 32
> +# define R64    "r", "r"
> +#else
> +# define R64    "r"
> +#endif
> +#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
> +# define L      "L", "L"
> +# define S      "S", "S"
> +#else
> +# define L      "L"
> +# define S      "S"
> +#endif
> +
> +/* TODO: documentation. */
> +static const TCGTargetOpDef tcg_target_op_defs[] = {
> +    { INDEX_op_exit_tb, { } },
> +    { INDEX_op_goto_tb, { } },
> +    { INDEX_op_call, { RI } },
> +    { INDEX_op_jmp, { RI } },
> +    { INDEX_op_br, { } },
> +
> +    { INDEX_op_mov_i32, { R, R } },
> +    { INDEX_op_movi_i32, { R } },
> +
> +    { INDEX_op_ld8u_i32, { R, R } },
> +    { INDEX_op_ld8s_i32, { R, R } },
> +    { INDEX_op_ld16u_i32, { R, R } },
> +    { INDEX_op_ld16s_i32, { R, R } },
> +    { INDEX_op_ld_i32, { R, R } },
> +    { INDEX_op_st8_i32, { R, R } },
> +    { INDEX_op_st16_i32, { R, R } },
> +    { INDEX_op_st_i32, { R, R } },
> +
> +    { INDEX_op_add_i32, { R, RI, RI } },
> +    { INDEX_op_sub_i32, { R, RI, RI } },
> +    { INDEX_op_mul_i32, { R, RI, RI } },
> +#if TCG_TARGET_HAS_div_i32

I was wondering if this #ifdeffery is needed, since TCI would probably
give better performance than the alternative, TCG-generated
emulation sequences. But it could be useful for testing those. Maybe
there should be two options to enable and disable all non-mandatory TCI
versions.

> +    { INDEX_op_div_i32, { R, R, R } },
> +    { INDEX_op_divu_i32, { R, R, R } },
> +    { INDEX_op_rem_i32, { R, R, R } },
> +    { INDEX_op_remu_i32, { R, R, R } },
> +#elif TCG_TARGET_HAS_div2_i32
> +    { INDEX_op_div2_i32, { R, R, "0", "1", R } },
> +    { INDEX_op_divu2_i32, { R, R, "0", "1", R } },
> +#endif
> +    /* TODO: Does R, RI, RI result in faster code than R, R, RI?
> +       If both operands are constants, we can optimize. */
> +    { INDEX_op_and_i32, { R, RI, RI } },
> +#if TCG_TARGET_HAS_andc_i32
> +    { INDEX_op_andc_i32, { R, RI, RI } },
> +#endif
> +#if TCG_TARGET_HAS_eqv_i32
> +    { INDEX_op_eqv_i32, { R, RI, RI } },
> +#endif
> +#if TCG_TARGET_HAS_nand_i32
> +    { INDEX_op_nand_i32, { R, RI, RI } },
> +#endif
> +#if TCG_TARGET_HAS_nor_i32
> +    { INDEX_op_nor_i32, { R, RI, RI } },
> +#endif
> +    { INDEX_op_or_i32, { R, RI, RI } },
> +#if TCG_TARGET_HAS_orc_i32
> +    { INDEX_op_orc_i32, { R, RI, RI } },
> +#endif
> +    { INDEX_op_xor_i32, { R, RI, RI } },
> +    { INDEX_op_shl_i32, { R, RI, RI } },
> +    { INDEX_op_shr_i32, { R, RI, RI } },
> +    { INDEX_op_sar_i32, { R, RI, RI } },
> +#if TCG_TARGET_HAS_rot_i32
> +    { INDEX_op_rotl_i32, { R, RI, RI } },
> +    { INDEX_op_rotr_i32, { R, RI, RI } },
> +#endif
> +
> +    { INDEX_op_brcond_i32, { R, RI } },
> +
> +    { INDEX_op_setcond_i32, { R, R, RI } },
> +#if TCG_TARGET_REG_BITS == 64
> +    { INDEX_op_setcond_i64, { R, R, RI } },
> +#endif /* TCG_TARGET_REG_BITS == 64 */
> +
> +#if TCG_TARGET_REG_BITS == 32
> +    /* TODO: Support R, R, R, R, RI, RI? Will it be faster? */
> +    { INDEX_op_add2_i32, { R, R, R, R, R, R } },
> +    { INDEX_op_sub2_i32, { R, R, R, R, R, R } },
> +    { INDEX_op_brcond2_i32, { R, R, RI, RI } },
> +    { INDEX_op_mulu2_i32, { R, R, R, R } },
> +    { INDEX_op_setcond2_i32, { R, R, R, RI, RI } },
> +#endif
> +
> +#if TCG_TARGET_HAS_not_i32
> +    { INDEX_op_not_i32, { R, R } },
> +#endif
> +#if TCG_TARGET_HAS_neg_i32
> +    { INDEX_op_neg_i32, { R, R } },
> +#endif
> +
> +#if TCG_TARGET_REG_BITS == 64
> +    { INDEX_op_mov_i64, { R, R } },
> +    { INDEX_op_movi_i64, { R } },
> +
> +    { INDEX_op_ld8u_i64, { R, R } },
> +    { INDEX_op_ld8s_i64, { R, R } },
> +    { INDEX_op_ld16u_i64, { R, R } },
> +    { INDEX_op_ld16s_i64, { R, R } },
> +    { INDEX_op_ld32u_i64, { R, R } },
> +    { INDEX_op_ld32s_i64, { R, R } },
> +    { INDEX_op_ld_i64, { R, R } },
> +
> +    { INDEX_op_st8_i64, { R, R } },
> +    { INDEX_op_st16_i64, { R, R } },
> +    { INDEX_op_st32_i64, { R, R } },
> +    { INDEX_op_st_i64, { R, R } },
> +
> +    { INDEX_op_add_i64, { R, RI, RI } },
> +    { INDEX_op_sub_i64, { R, RI, RI } },
> +    { INDEX_op_mul_i64, { R, RI, RI } },
> +#if TCG_TARGET_HAS_div_i64
> +    { INDEX_op_div_i64, { R, R, R } },
> +    { INDEX_op_divu_i64, { R, R, R } },
> +    { INDEX_op_rem_i64, { R, R, R } },
> +    { INDEX_op_remu_i64, { R, R, R } },
> +#elif defined(TCG_TARGET_HAS_div2_i64)
> +    { INDEX_op_div2_i64, { R, R, "0", "1", R } },
> +    { INDEX_op_divu2_i64, { R, R, "0", "1", R } },
> +#endif
> +    { INDEX_op_and_i64, { R, RI, RI } },
> +#if TCG_TARGET_HAS_andc_i64
> +    { INDEX_op_andc_i64, { R, RI, RI } },
> +#endif
> +#if TCG_TARGET_HAS_eqv_i64
> +    { INDEX_op_eqv_i64, { R, RI, RI } },
> +#endif
> +#if TCG_TARGET_HAS_nand_i64
> +    { INDEX_op_nand_i64, { R, RI, RI } },
> +#endif
> +#if TCG_TARGET_HAS_nor_i64
> +    { INDEX_op_nor_i64, { R, RI, RI } },
> +#endif
> +    { INDEX_op_or_i64, { R, RI, RI } },
> +#if TCG_TARGET_HAS_orc_i64
> +    { INDEX_op_orc_i64, { R, RI, RI } },
> +#endif
> +    { INDEX_op_xor_i64, { R, RI, RI } },
> +    { INDEX_op_shl_i64, { R, RI, RI } },
> +    { INDEX_op_shr_i64, { R, RI, RI } },
> +    { INDEX_op_sar_i64, { R, RI, RI } },
> +#if TCG_TARGET_HAS_rot_i64
> +    { INDEX_op_rotl_i64, { R, RI, RI } },
> +    { INDEX_op_rotr_i64, { R, RI, RI } },
> +#endif
> +    { INDEX_op_brcond_i64, { R, RI } },
> +
> +#if TCG_TARGET_HAS_ext8s_i64
> +    { INDEX_op_ext8s_i64, { R, R } },
> +#endif
> +#if TCG_TARGET_HAS_ext16s_i64
> +    { INDEX_op_ext16s_i64, { R, R } },
> +#endif
> +#if TCG_TARGET_HAS_ext32s_i64
> +    { INDEX_op_ext32s_i64, { R, R } },
> +#endif
> +#if TCG_TARGET_HAS_ext8u_i64
> +    { INDEX_op_ext8u_i64, { R, R } },
> +#endif
> +#if TCG_TARGET_HAS_ext16u_i64
> +    { INDEX_op_ext16u_i64, { R, R } },
> +#endif
> +#if TCG_TARGET_HAS_ext32u_i64
> +    { INDEX_op_ext32u_i64, { R, R } },
> +#endif
> +#if TCG_TARGET_HAS_bswap16_i64
> +    { INDEX_op_bswap16_i64, { R, R } },
> +#endif
> +#if TCG_TARGET_HAS_bswap32_i64
> +    { INDEX_op_bswap32_i64, { R, R } },
> +#endif
> +#if TCG_TARGET_HAS_bswap64_i64
> +    { INDEX_op_bswap64_i64, { R, R } },
> +#endif
> +#if TCG_TARGET_HAS_not_i64
> +    { INDEX_op_not_i64, { R, R } },
> +#endif
> +#if TCG_TARGET_HAS_neg_i64
> +    { INDEX_op_neg_i64, { R, R } },
> +#endif
> +#endif /* TCG_TARGET_REG_BITS == 64 */
> +
> +    { INDEX_op_qemu_ld8u, { R, L } },
> +    { INDEX_op_qemu_ld8s, { R, L } },
> +    { INDEX_op_qemu_ld16u, { R, L } },
> +    { INDEX_op_qemu_ld16s, { R, L } },
> +    { INDEX_op_qemu_ld32, { R, L } },
> +#if TCG_TARGET_REG_BITS == 64
> +    { INDEX_op_qemu_ld32u, { R, L } },
> +    { INDEX_op_qemu_ld32s, { R, L } },
> +#endif
> +    { INDEX_op_qemu_ld64, { R64, L } },
> +
> +    { INDEX_op_qemu_st8, { R, S } },
> +    { INDEX_op_qemu_st16, { R, S } },
> +    { INDEX_op_qemu_st32, { R, S } },
> +    { INDEX_op_qemu_st64, { R64, S } },
> +
> +#if TCG_TARGET_HAS_ext8s_i32
> +    { INDEX_op_ext8s_i32, { R, R } },
> +#endif
> +#if TCG_TARGET_HAS_ext16s_i32
> +    { INDEX_op_ext16s_i32, { R, R } },
> +#endif
> +#if TCG_TARGET_HAS_ext8u_i32
> +    { INDEX_op_ext8u_i32, { R, R } },
> +#endif
> +#if TCG_TARGET_HAS_ext16u_i32
> +    { INDEX_op_ext16u_i32, { R, R } },
> +#endif
> +
> +#if TCG_TARGET_HAS_bswap16_i32
> +    { INDEX_op_bswap16_i32, { R, R } },
> +#endif
> +#if TCG_TARGET_HAS_bswap32_i32
> +    { INDEX_op_bswap32_i32, { R, R } },
> +#endif
> +
> +    { -1 },
> +};
> +
> +static const int tcg_target_reg_alloc_order[] = {
> +    TCG_REG_R0,
> +    TCG_REG_R1,
> +    TCG_REG_R2,
> +    TCG_REG_R3,
> +#if 0 /* used for TCG_REG_CALL_STACK */
> +    TCG_REG_R4,
> +#endif
> +    TCG_REG_R5,
> +    TCG_REG_R6,
> +    TCG_REG_R7,
> +#if TCG_TARGET_NB_REGS >= 16
> +    TCG_REG_R8,
> +    TCG_REG_R9,
> +    TCG_REG_R10,
> +    TCG_REG_R11,
> +    TCG_REG_R12,
> +    TCG_REG_R13,
> +    TCG_REG_R14,
> +    TCG_REG_R15,
> +#endif
> +};
> +
> +#if MAX_OPC_PARAM_IARGS != 4
> +# error Fix needed, number of supported input arguments changed!
> +#endif
> +
> +static const int tcg_target_call_iarg_regs[] = {
> +    TCG_REG_R0,
> +    TCG_REG_R1,
> +    TCG_REG_R2,
> +    TCG_REG_R3,
> +#if TCG_TARGET_REG_BITS == 32
> +    /* 32 bit hosts need 2 * MAX_OPC_PARAM_IARGS registers. */
> +#if 0 /* used for TCG_REG_CALL_STACK */
> +    TCG_REG_R4,
> +#endif
> +    TCG_REG_R5,
> +    TCG_REG_R6,
> +    TCG_REG_R7,
> +#if TCG_TARGET_NB_REGS >= 16
> +    TCG_REG_R8,
> +#else
> +# error Too few input registers available
> +#endif
> +#endif
> +};
> +
> +static const int tcg_target_call_oarg_regs[] = {
> +    TCG_REG_R0,
> +#if TCG_TARGET_REG_BITS == 32
> +    TCG_REG_R1
> +#endif
> +};
> +
> +#ifndef NDEBUG
> +static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
> +    "r00",
> +    "r01",
> +    "r02",
> +    "r03",
> +    "r04",
> +    "r05",
> +    "r06",
> +    "r07",
> +#if TCG_TARGET_NB_REGS >= 16
> +    "r08",
> +    "r09",
> +    "r10",
> +    "r11",
> +    "r12",
> +    "r13",
> +    "r14",
> +    "r15",
> +#if TCG_TARGET_NB_REGS >= 32
> +    "r16",
> +    "r17",
> +    "r18",
> +    "r19",
> +    "r20",
> +    "r21",
> +    "r22",
> +    "r23",
> +    "r24",
> +    "r25",
> +    "r26",
> +    "r27",
> +    "r28",
> +    "r29",
> +    "r30",
> +    "r31"
> +#endif
> +#endif
> +};
> +#endif
> +
> +static void flush_icache_range(unsigned long start, unsigned long stop)
> +{
> +    TRACE();
> +}
> +
> +static void patch_reloc(uint8_t *code_ptr, int type,
> +                        tcg_target_long value, tcg_target_long addend)
> +{
> +    /* tcg_out_reloc always uses the same type, addend. */
> +    assert(type == sizeof(tcg_target_long));
> +    assert(addend == 0);
> +    assert(value != 0);
> +    *(tcg_target_long *)code_ptr = value;
> +}
> +
> +/* Parse target specific constraints. */
> +static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
> +{
> +    const char *ct_str = *pct_str;
> +    switch (ct_str[0]) {
> +    case 'r':
> +    case 'L':                   /* qemu_ld constraint */
> +    case 'S':                   /* qemu_st constraint */
> +        ct->ct |= TCG_CT_REG;
> +        tcg_regset_set32(ct->u.regs, 0, BIT(TCG_TARGET_NB_REGS) - 1);
> +        break;
> +    default:
> +        return -1;
> +    }
> +    ct_str++;
> +    *pct_str = ct_str;
> +    return 0;
> +}
> +
> +#include "dis-asm.h"
> +
> +/* Disassemble bytecode. */
> +int print_insn_bytecode(bfd_vma addr, disassemble_info *info)
> +{
> +    int length;
> +    uint8_t byte;
> +    int status;
> +    TCGOpcode op;
> +
> +    status = info->read_memory_func(addr, &byte, 1, info);
> +    if (status != 0) {
> +        info->memory_error_func(status, addr, info);
> +        return -1;
> +    }
> +    op = byte;
> +
> +    addr++;
> +    status = info->read_memory_func(addr, &byte, 1, info);
> +    if (status != 0) {
> +        info->memory_error_func(status, addr, info);
> +        return -1;
> +    }
> +    length = byte;
> +
> +    if (op >= ARRAY_SIZE(tcg_op_defs)) {
> +        return length;
> +    }
> +
> +    const TCGOpDef *def = &tcg_op_defs[op];
> +    int nb_oargs = def->nb_oargs;
> +    int nb_iargs = def->nb_iargs;
> +    int nb_cargs = def->nb_cargs;
> +    FILE *f = info->stream;
> +    /* TODO: Improve disassembler output. */
> +    info->fprintf_func(f, "%s\to=%d i=%d c=%d",
> +                       def->name, nb_oargs, nb_iargs, nb_cargs);
> +
> +    return length;
> +}
> +
> +#if defined(CONFIG_DEBUG_TCG_INTERPRETER)
> +/* Show current bytecode. Used by tcg interpreter. */
> +void tci_disas(uint8_t opc)
> +{
> +    const TCGOpDef *def = &tcg_op_defs[opc];
> +    fprintf(stderr, "TCG %s %u, %u, %u\n",
> +            def->name, def->nb_oargs, def->nb_iargs, def->nb_cargs);
> +}
> +#endif
> +
> +/* Write value (native size). */
> +static void tcg_out_i(TCGContext *s, tcg_target_ulong v)
> +{
> +    *(tcg_target_ulong *)s->code_ptr = v;
> +    s->code_ptr += sizeof(tcg_target_ulong);
> +}
> +
> +/* Write 64 bit value. */
> +static void tcg_out64(TCGContext *s, uint64_t v)
> +{
> +    *(uint64_t *)s->code_ptr = v;
> +    s->code_ptr += sizeof(v);
> +}
> +
> +/* Write opcode. */
> +static void tcg_out_op_t(TCGContext *s, TCGOpcode op)
> +{
> +    tcg_out8(s, op);
> +    tcg_out8(s, 0);
> +}
> +
> +/* Write register. */
> +static void tcg_out_r(TCGContext *s, TCGArg t0)
> +{
> +    assert(t0 < TCG_TARGET_NB_REGS);
> +    tcg_out8(s, t0);
> +}
> +
> +/* Write register or constant (native size). */
> +static void tcg_out_ri(TCGContext *s, int const_arg, TCGArg arg)
> +{
> +    if (const_arg) {
> +        assert(const_arg == 1);
> +        tcg_out8(s, TCG_CONST);
> +        tcg_out_i(s, arg);
> +    } else {
> +        tcg_out_r(s, arg);
> +    }
> +}
> +
> +/* Write register or constant (32 bit). */
> +static void tcg_out_ri32(TCGContext *s, int const_arg, TCGArg arg)
> +{
> +    if (const_arg) {
> +        assert(const_arg == 1);
> +        tcg_out8(s, TCG_CONST);
> +        tcg_out32(s, arg);
> +    } else {
> +        tcg_out_r(s, arg);
> +    }
> +}
> +
> +#if TCG_TARGET_REG_BITS == 64
> +/* Write register or constant (64 bit). */
> +static void tcg_out_ri64(TCGContext *s, int const_arg, TCGArg arg)
> +{
> +    if (const_arg) {
> +        assert(const_arg == 1);
> +        tcg_out8(s, TCG_CONST);
> +        tcg_out64(s, arg);
> +    } else {
> +        tcg_out_r(s, arg);
> +    }
> +}
> +#endif
> +
> +/* Write label. */
> +static void tci_out_label(TCGContext *s, TCGArg arg)
> +{
> +    TCGLabel *label = &s->labels[arg];
> +    if (label->has_value) {
> +        tcg_out_i(s, label->u.value);
> +        assert(label->u.value);
> +    } else {
> +        tcg_out_reloc(s, s->code_ptr, sizeof(tcg_target_ulong), arg, 0);
> +        tcg_out_i(s, 0);
> +    }
> +}
> +
> +static void tcg_out_ld(TCGContext *s, TCGType type, int ret, int arg1,
> +                       tcg_target_long arg2)
> +{
> +    uint8_t *old_code_ptr = s->code_ptr;
> +    if (type == TCG_TYPE_I32) {
> +        tcg_out_op_t(s, INDEX_op_ld_i32);
> +        tcg_out_r(s, ret);
> +        tcg_out_r(s, arg1);
> +        tcg_out32(s, arg2);
> +    } else {
> +        assert(type == TCG_TYPE_I64);
> +#if TCG_TARGET_REG_BITS == 64
> +        tcg_out_op_t(s, INDEX_op_ld_i64);
> +        tcg_out_r(s, ret);
> +        tcg_out_r(s, arg1);
> +        assert(arg2 == (uint32_t)arg2);
> +        tcg_out32(s, arg2);
> +#else
> +        TODO();
> +#endif
> +    }
> +    old_code_ptr[1] = s->code_ptr - old_code_ptr;
> +}
> +
> +static void tcg_out_mov(TCGContext *s, TCGType type, int ret, int arg)
> +{
> +    uint8_t *old_code_ptr = s->code_ptr;
> +    assert(ret != arg);
> +#if TCG_TARGET_REG_BITS == 32
> +    tcg_out_op_t(s, INDEX_op_mov_i32);
> +#else
> +    tcg_out_op_t(s, INDEX_op_mov_i64);
> +#endif
> +    tcg_out_r(s, ret);
> +    tcg_out_r(s, arg);
> +    old_code_ptr[1] = s->code_ptr - old_code_ptr;
> +}
> +
> +static void tcg_out_movi(TCGContext *s, TCGType type,
> +                         int t0, tcg_target_long arg)
> +{
> +    uint8_t *old_code_ptr = s->code_ptr;
> +    uint32_t arg32 = arg;
> +    if (type == TCG_TYPE_I32 || arg == arg32) {
> +        tcg_out_op_t(s, INDEX_op_movi_i32);
> +        tcg_out_r(s, t0);
> +        tcg_out32(s, arg32);
> +    } else {
> +        assert(type == TCG_TYPE_I64);
> +#if TCG_TARGET_REG_BITS == 64
> +        tcg_out_op_t(s, INDEX_op_movi_i64);
> +        tcg_out_r(s, t0);
> +        tcg_out64(s, arg);
> +#else
> +        TODO();
> +#endif
> +    }
> +    old_code_ptr[1] = s->code_ptr - old_code_ptr;
> +}
> +
> +static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
> +                       const int *const_args)
> +{
> +    uint8_t *old_code_ptr = s->code_ptr;
> +
> +    tcg_out_op_t(s, opc);
> +
> +    switch (opc) {
> +    case INDEX_op_exit_tb:
> +        tcg_out64(s, args[0]);
> +        break;
> +    case INDEX_op_goto_tb:
> +        if (s->tb_jmp_offset) {
> +            /* Direct jump method. */
> +            assert(args[0] < ARRAY_SIZE(s->tb_jmp_offset));
> +            s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf;
> +            tcg_out32(s, 0);
> +        } else {
> +            /* Indirect jump method. */
> +            TODO();
> +        }
> +        assert(args[0] < ARRAY_SIZE(s->tb_next_offset));
> +        s->tb_next_offset[args[0]] = s->code_ptr - s->code_buf;
> +        break;
> +    case INDEX_op_br:
> +        tci_out_label(s, args[0]);
> +        break;
> +    case INDEX_op_call:
> +        tcg_out_ri(s, const_args[0], args[0]);
> +        break;
> +    case INDEX_op_jmp:
> +        TODO();
> +        break;
> +    case INDEX_op_setcond_i32:
> +        tcg_out_r(s, args[0]);
> +        tcg_out_r(s, args[1]);
> +        tcg_out_ri32(s, const_args[2], args[2]);
> +        tcg_out8(s, args[3]);   /* condition */
> +        break;
> +#if TCG_TARGET_REG_BITS == 32
> +    case INDEX_op_setcond2_i32:
> +        /* setcond2_i32 cond, t0, t1_low, t1_high, t2_low, t2_high */
> +        tcg_out_r(s, args[0]);
> +        tcg_out_r(s, args[1]);
> +        tcg_out_r(s, args[2]);
> +        tcg_out_ri32(s, const_args[3], args[3]);
> +        tcg_out_ri32(s, const_args[4], args[4]);
> +        tcg_out8(s, args[5]);   /* condition */
> +        break;
> +#elif TCG_TARGET_REG_BITS == 64
> +    case INDEX_op_setcond_i64:
> +        tcg_out_r(s, args[0]);
> +        tcg_out_r(s, args[1]);
> +        tcg_out_ri64(s, const_args[2], args[2]);
> +        tcg_out8(s, args[3]);   /* condition */
> +        break;
> +#endif
> +    case INDEX_op_movi_i32:
> +        TODO(); /* Handled by tcg_out_movi? */
> +        break;
> +    case INDEX_op_ld8u_i32:
> +    case INDEX_op_ld8s_i32:
> +    case INDEX_op_ld16u_i32:
> +    case INDEX_op_ld16s_i32:
> +    case INDEX_op_ld_i32:
> +    case INDEX_op_st8_i32:
> +    case INDEX_op_st16_i32:
> +    case INDEX_op_st_i32:
> +    case INDEX_op_ld8u_i64:
> +    case INDEX_op_ld8s_i64:
> +    case INDEX_op_ld16u_i64:
> +    case INDEX_op_ld16s_i64:
> +    case INDEX_op_ld32u_i64:
> +    case INDEX_op_ld32s_i64:
> +    case INDEX_op_ld_i64:
> +    case INDEX_op_st8_i64:
> +    case INDEX_op_st16_i64:
> +    case INDEX_op_st32_i64:
> +    case INDEX_op_st_i64:
> +        tcg_out_r(s, args[0]);
> +        tcg_out_r(s, args[1]);
> +        assert(args[2] == (uint32_t)args[2]);
> +        tcg_out32(s, args[2]);
> +        break;
> +    case INDEX_op_add_i32:
> +    case INDEX_op_sub_i32:
> +    case INDEX_op_mul_i32:
> +    case INDEX_op_and_i32:
> +    case INDEX_op_andc_i32:     /* Optional (TCG_TARGET_HAS_andc_i32). */
> +    case INDEX_op_eqv_i32:      /* Optional (TCG_TARGET_HAS_eqv_i32). */
> +    case INDEX_op_nand_i32:     /* Optional (TCG_TARGET_HAS_nand_i32). */
> +    case INDEX_op_nor_i32:      /* Optional (TCG_TARGET_HAS_nor_i32). */
> +    case INDEX_op_or_i32:
> +    case INDEX_op_orc_i32:      /* Optional (TCG_TARGET_HAS_orc_i32). */
> +    case INDEX_op_xor_i32:
> +    case INDEX_op_shl_i32:
> +    case INDEX_op_shr_i32:
> +    case INDEX_op_sar_i32:
> +    case INDEX_op_rotl_i32:     /* Optional (TCG_TARGET_HAS_rot_i32). */
> +    case INDEX_op_rotr_i32:     /* Optional (TCG_TARGET_HAS_rot_i32). */
> +        tcg_out_r(s, args[0]);
> +        tcg_out_ri32(s, const_args[1], args[1]);
> +        tcg_out_ri32(s, const_args[2], args[2]);
> +        break;
> +
> +#if TCG_TARGET_REG_BITS == 64
> +    case INDEX_op_mov_i64:
> +    case INDEX_op_movi_i64:
> +        TODO();
> +        break;
> +    case INDEX_op_add_i64:
> +    case INDEX_op_sub_i64:
> +    case INDEX_op_mul_i64:
> +    case INDEX_op_and_i64:
> +    case INDEX_op_andc_i64:     /* Optional (TCG_TARGET_HAS_andc_i64). */
> +    case INDEX_op_eqv_i64:      /* Optional (TCG_TARGET_HAS_eqv_i64). */
> +    case INDEX_op_nand_i64:     /* Optional (TCG_TARGET_HAS_nand_i64). */
> +    case INDEX_op_nor_i64:      /* Optional (TCG_TARGET_HAS_nor_i64). */
> +    case INDEX_op_or_i64:
> +    case INDEX_op_orc_i64:      /* Optional (TCG_TARGET_HAS_orc_i64). */
> +    case INDEX_op_xor_i64:
> +    case INDEX_op_shl_i64:
> +    case INDEX_op_shr_i64:
> +    case INDEX_op_sar_i64:
> +    /* TODO: Implementation of rotl_i64, rotr_i64 missing in tci.c. */
> +    case INDEX_op_rotl_i64:     /* Optional (TCG_TARGET_HAS_rot_i64). */
> +    case INDEX_op_rotr_i64:     /* Optional (TCG_TARGET_HAS_rot_i64). */
> +        tcg_out_r(s, args[0]);
> +        tcg_out_ri64(s, const_args[1], args[1]);
> +        tcg_out_ri64(s, const_args[2], args[2]);
> +        break;
> +    case INDEX_op_div_i64:      /* Optional (TCG_TARGET_HAS_div_i64). */
> +    case INDEX_op_divu_i64:     /* Optional (TCG_TARGET_HAS_div_i64). */
> +    case INDEX_op_rem_i64:      /* Optional (TCG_TARGET_HAS_div_i64). */
> +    case INDEX_op_remu_i64:     /* Optional (TCG_TARGET_HAS_div_i64). */
> +        TODO();
> +        break;
> +    case INDEX_op_div2_i64:     /* Optional (TCG_TARGET_HAS_div2_i64). */
> +    case INDEX_op_divu2_i64:    /* Optional (TCG_TARGET_HAS_div2_i64). */
> +        TODO();
> +        break;
> +    case INDEX_op_brcond_i64:
> +        tcg_out_r(s, args[0]);
> +        tcg_out_ri64(s, const_args[1], args[1]);
> +        tcg_out8(s, args[2]);           /* condition */
> +        tci_out_label(s, args[3]);
> +        break;
> +    case INDEX_op_bswap16_i64:  /* Optional (TCG_TARGET_HAS_bswap16_i64). */
> +    case INDEX_op_bswap32_i64:  /* Optional (TCG_TARGET_HAS_bswap32_i64). */
> +    case INDEX_op_bswap64_i64:  /* Optional (TCG_TARGET_HAS_bswap64_i64). */
> +    case INDEX_op_not_i64:      /* Optional (TCG_TARGET_HAS_not_i64). */
> +    case INDEX_op_neg_i64:      /* Optional (TCG_TARGET_HAS_neg_i64). */
> +    case INDEX_op_ext8s_i64:    /* Optional (TCG_TARGET_HAS_ext8s_i64). */
> +    case INDEX_op_ext8u_i64:    /* Optional (TCG_TARGET_HAS_ext8u_i64). */
> +    case INDEX_op_ext16s_i64:   /* Optional (TCG_TARGET_HAS_ext16s_i64). */
> +    case INDEX_op_ext16u_i64:   /* Optional (TCG_TARGET_HAS_ext16u_i64). */
> +    case INDEX_op_ext32s_i64:   /* Optional (TCG_TARGET_HAS_ext32s_i64). */
> +    case INDEX_op_ext32u_i64:   /* Optional (TCG_TARGET_HAS_ext32u_i64). */
> +#endif /* TCG_TARGET_REG_BITS == 64 */
> +    case INDEX_op_neg_i32:      /* Optional (TCG_TARGET_HAS_neg_i32). */
> +    case INDEX_op_not_i32:      /* Optional (TCG_TARGET_HAS_not_i32). */
> +    case INDEX_op_ext8s_i32:    /* Optional (TCG_TARGET_HAS_ext8s_i32). */
> +    case INDEX_op_ext16s_i32:   /* Optional (TCG_TARGET_HAS_ext16s_i32). */
> +    case INDEX_op_ext8u_i32:    /* Optional (TCG_TARGET_HAS_ext8u_i32). */
> +    case INDEX_op_ext16u_i32:   /* Optional (TCG_TARGET_HAS_ext16u_i32). */
> +    case INDEX_op_bswap16_i32:  /* Optional (TCG_TARGET_HAS_bswap16_i32). */
> +    case INDEX_op_bswap32_i32:  /* Optional (TCG_TARGET_HAS_bswap32_i32). */
> +        tcg_out_r(s, args[0]);
> +        tcg_out_r(s, args[1]);
> +        break;
> +    case INDEX_op_div_i32:      /* Optional (TCG_TARGET_HAS_div_i32). */
> +    case INDEX_op_divu_i32:     /* Optional (TCG_TARGET_HAS_div_i32). */
> +    case INDEX_op_rem_i32:      /* Optional (TCG_TARGET_HAS_div_i32). */
> +    case INDEX_op_remu_i32:     /* Optional (TCG_TARGET_HAS_div_i32). */
> +        tcg_out_r(s, args[0]);
> +        tcg_out_ri32(s, const_args[1], args[1]);
> +        tcg_out_ri32(s, const_args[2], args[2]);
> +        break;
> +    case INDEX_op_div2_i32:     /* Optional (TCG_TARGET_HAS_div2_i32). */
> +    case INDEX_op_divu2_i32:    /* Optional (TCG_TARGET_HAS_div2_i32). */
> +        TODO();
> +        break;
> +#if TCG_TARGET_REG_BITS == 32
> +    case INDEX_op_add2_i32:
> +    case INDEX_op_sub2_i32:
> +        tcg_out_r(s, args[0]);
> +        tcg_out_r(s, args[1]);
> +        tcg_out_r(s, args[2]);
> +        tcg_out_r(s, args[3]);
> +        tcg_out_r(s, args[4]);
> +        tcg_out_r(s, args[5]);
> +        break;
> +    case INDEX_op_brcond2_i32:
> +        tcg_out_r(s, args[0]);
> +        tcg_out_r(s, args[1]);
> +        tcg_out_ri32(s, const_args[2], args[2]);
> +        tcg_out_ri32(s, const_args[3], args[3]);
> +        tcg_out8(s, args[4]);           /* condition */
> +        tci_out_label(s, args[5]);
> +        break;
> +    case INDEX_op_mulu2_i32:
> +        tcg_out_r(s, args[0]);
> +        tcg_out_r(s, args[1]);
> +        tcg_out_r(s, args[2]);
> +        tcg_out_r(s, args[3]);
> +        break;
> +#endif
> +    case INDEX_op_brcond_i32:
> +        tcg_out_r(s, args[0]);
> +        tcg_out_ri32(s, const_args[1], args[1]);
> +        tcg_out8(s, args[2]);           /* condition */
> +        tci_out_label(s, args[3]);
> +        break;
> +    case INDEX_op_qemu_ld8u:
> +    case INDEX_op_qemu_ld8s:
> +    case INDEX_op_qemu_ld16u:
> +    case INDEX_op_qemu_ld16s:
> +    case INDEX_op_qemu_ld32:
> +#if TCG_TARGET_REG_BITS == 64
> +    case INDEX_op_qemu_ld32s:
> +    case INDEX_op_qemu_ld32u:
> +#endif
> +        tcg_out_r(s, *args++);
> +        tcg_out_r(s, *args++);
> +#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
> +        tcg_out_r(s, *args++);
> +#endif
> +#ifdef CONFIG_SOFTMMU
> +        tcg_out_i(s, *args);
> +#endif
> +        break;
> +    case INDEX_op_qemu_ld64:
> +        tcg_out_r(s, *args++);
> +#if TCG_TARGET_REG_BITS == 32
> +        tcg_out_r(s, *args++);
> +#endif
> +        tcg_out_r(s, *args++);
> +#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
> +        tcg_out_r(s, *args++);
> +#endif
> +#ifdef CONFIG_SOFTMMU
> +        tcg_out_i(s, *args);
> +#endif
> +        break;
> +    case INDEX_op_qemu_st8:
> +    case INDEX_op_qemu_st16:
> +    case INDEX_op_qemu_st32:
> +        tcg_out_r(s, *args++);
> +        tcg_out_r(s, *args++);
> +#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
> +        tcg_out_r(s, *args++);
> +#endif
> +#ifdef CONFIG_SOFTMMU
> +        tcg_out_i(s, *args);
> +#endif
> +        break;
> +    case INDEX_op_qemu_st64:
> +        tcg_out_r(s, *args++);
> +#if TCG_TARGET_REG_BITS == 32
> +        tcg_out_r(s, *args++);
> +#endif
> +        tcg_out_r(s, *args++);
> +#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
> +        tcg_out_r(s, *args++);
> +#endif
> +#ifdef CONFIG_SOFTMMU
> +        tcg_out_i(s, *args);
> +#endif
> +        break;
> +    case INDEX_op_end:
> +        TODO();
> +        break;
> +    default:
> +        fprintf(stderr, "Missing: %s\n", tcg_op_defs[opc].name);
> +        tcg_abort();
> +    }
> +    old_code_ptr[1] = s->code_ptr - old_code_ptr;
> +}
> +
> +static void tcg_out_st(TCGContext *s, TCGType type, int arg, int arg1,
> +                       tcg_target_long arg2)
> +{
> +    uint8_t *old_code_ptr = s->code_ptr;
> +    if (type == TCG_TYPE_I32) {
> +        tcg_out_op_t(s, INDEX_op_st_i32);
> +        tcg_out_r(s, arg);
> +        tcg_out_r(s, arg1);
> +        tcg_out32(s, arg2);
> +    } else {
> +        assert(type == TCG_TYPE_I64);
> +#if TCG_TARGET_REG_BITS == 64
> +        tcg_out_op_t(s, INDEX_op_st_i64);
> +        tcg_out_r(s, arg);
> +        tcg_out_r(s, arg1);
> +        tcg_out32(s, arg2);
> +#else
> +        TODO();
> +#endif
> +    }
> +    old_code_ptr[1] = s->code_ptr - old_code_ptr;
> +}
> +
> +/* Test if a constant matches the constraint. */
> +static int tcg_target_const_match(tcg_target_long val,
> +                                  const TCGArgConstraint *arg_ct)
> +{
> +    /* No need to return 0 or 1, 0 or != 0 is good enough. */
> +    return arg_ct->ct & TCG_CT_CONST;
> +}
> +
> +/* Maximum number of register used for input function arguments. */
> +static int tcg_target_get_call_iarg_regs_count(int flags)
> +{
> +    return ARRAY_SIZE(tcg_target_call_iarg_regs);
> +}
> +
> +static void tcg_target_init(TCGContext *s)
> +{
> +#if defined(CONFIG_DEBUG_TCG_INTERPRETER)
> +    const char *envval = getenv("DEBUG_TCG");
> +    if (envval) {
> +        loglevel = strtol(envval, NULL, 0);
> +    }
> +#endif
> +    TRACE();
> +
> +    /* The current code uses uint8_t for tcg operations. */
> +    assert(ARRAY_SIZE(tcg_op_defs) <= UINT8_MAX);
> +
> +    /* Registers available for 32 bit operations. */
> +    tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0,
> +                     BIT(TCG_TARGET_NB_REGS) - 1);
> +    /* Registers available for 64 bit operations. */
> +    tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I64], 0,
> +                     BIT(TCG_TARGET_NB_REGS) - 1);
> +    /* TODO: Which registers should be set here? */
> +    tcg_regset_set32(tcg_target_call_clobber_regs, 0,
> +                     BIT(TCG_TARGET_NB_REGS) - 1);
> +    tcg_regset_clear(s->reserved_regs);
> +    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
> +    tcg_add_target_add_op_defs(tcg_target_op_defs);
> +    tcg_set_frame(s, TCG_AREG0, offsetof(CPUState, temp_buf),
> +                  CPU_TEMP_BUF_NLONGS * sizeof(long));
> +}
> +
> +/* Generate global QEMU prologue and epilogue code. */
> +static void tcg_target_qemu_prologue(TCGContext *s)
> +{
> +    TRACE();
> +    tb_ret_addr = s->code_ptr;
> +}
> diff --git a/tcg/bytecode/tcg-target.h b/tcg/bytecode/tcg-target.h
> new file mode 100644
> index 0000000..05aaaf2
> --- /dev/null
> +++ b/tcg/bytecode/tcg-target.h
> @@ -0,0 +1,152 @@
> +/*
> + * Tiny Code Generator for QEMU
> + *
> + * Copyright (c) 2009, 2011 Stefan Weil
> + *
> + * This program is free software: you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation, either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program.  If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +/*
> + * This code implements a TCG which does not generate machine code for some
> + * real target machine but which generates virtual machine code for an
> + * interpreter. Interpreted pseudo code is slow, but it works on any host.
> + *
> + * Some remarks might help in understanding the code:
> + *
> + * "target" or "TCG target" is the machine which runs the generated code.
> + * This is different to the usual meaning in QEMU where "target" is the
> + * emulated machine. So normally QEMU host is identical to TCG target.
> + * Here the TCG target is a virtual machine, but this virtual machine must
> + * use the same word size like the real machine.

Why, for performance? Allowing a different word size could be useful for
testing TCG; perhaps we could even use non-native endianness?

> + * Therefore, we need both 32 and 64 bit virtual machines (interpreter).
> + */
> +
> +#if !defined(TCG_TARGET_H)
> +#define TCG_TARGET_H
> +
> +#include "config-host.h"
> +
> +#define TCG_TARGET_INTERPRETER 1
> +
> +#ifdef CONFIG_DEBUG_TCG
> +/* Enable debug output. */
> +#define CONFIG_DEBUG_TCG_INTERPRETER
> +#endif
> +
> +#if 0 /* TCI tries to emulate a little endian host. */
> +#if defined(HOST_WORDS_BIGENDIAN)
> +# define TCG_TARGET_WORDS_BIGENDIAN
> +#endif
> +#endif
> +
> +/* Optional instructions. */
> +
> +#define TCG_TARGET_HAS_bswap16_i32      1
> +#define TCG_TARGET_HAS_bswap32_i32      1
> +/* Not more than one of the next two defines must be 1. */
> +#define TCG_TARGET_HAS_div_i32          1
> +#define TCG_TARGET_HAS_div2_i32         0
> +#define TCG_TARGET_HAS_ext8s_i32        1
> +#define TCG_TARGET_HAS_ext16s_i32       1
> +#define TCG_TARGET_HAS_ext8u_i32        1
> +#define TCG_TARGET_HAS_ext16u_i32       1
> +#define TCG_TARGET_HAS_andc_i32         0
> +#define TCG_TARGET_HAS_deposit_i32      0
> +#define TCG_TARGET_HAS_eqv_i32          0
> +#define TCG_TARGET_HAS_nand_i32         0
> +#define TCG_TARGET_HAS_nor_i32          0
> +#define TCG_TARGET_HAS_neg_i32          1
> +#define TCG_TARGET_HAS_not_i32          1
> +#define TCG_TARGET_HAS_orc_i32          0
> +#define TCG_TARGET_HAS_rot_i32          1
> +
> +#if TCG_TARGET_REG_BITS == 64
> +#define TCG_TARGET_HAS_bswap16_i64      1
> +#define TCG_TARGET_HAS_bswap32_i64      1
> +#define TCG_TARGET_HAS_bswap64_i64      1
> +#define TCG_TARGET_HAS_deposit_i64      0
> +/* Not more than one of the next two defines must be 1. */
> +#define TCG_TARGET_HAS_div_i64          0
> +#define TCG_TARGET_HAS_div2_i64         0
> +#define TCG_TARGET_HAS_ext8s_i64        1
> +#define TCG_TARGET_HAS_ext16s_i64       1
> +#define TCG_TARGET_HAS_ext32s_i64       1
> +#define TCG_TARGET_HAS_ext8u_i64        1
> +#define TCG_TARGET_HAS_ext16u_i64       1
> +#define TCG_TARGET_HAS_ext32u_i64       1
> +#define TCG_TARGET_HAS_andc_i64         0
> +#define TCG_TARGET_HAS_eqv_i64          0
> +#define TCG_TARGET_HAS_nand_i64         0
> +#define TCG_TARGET_HAS_nor_i64          0
> +#define TCG_TARGET_HAS_neg_i64          1
> +#define TCG_TARGET_HAS_not_i64          1
> +#define TCG_TARGET_HAS_orc_i64          0
> +#define TCG_TARGET_HAS_rot_i64          1
> +#endif /* TCG_TARGET_REG_BITS == 64 */
> +
> +/* Offset to user memory in user mode. */
> +#define TCG_TARGET_HAS_GUEST_BASE
> +
> +/* Number of registers available.
> +   For 32 bit hosts, we need more than 8 registers (call arguments). */

On i386 there certainly aren't 8 registers, where does 8 come from?

> +/* #define TCG_TARGET_NB_REGS 8 */
> +#define TCG_TARGET_NB_REGS 16

Again, one way to test TCG would be to minimize and maximize the
number of registers.

> +/* #define TCG_TARGET_NB_REGS 32 */
> +
> +/* List of registers which are used by TCG. */
> +typedef enum {
> +    TCG_REG_R0 = 0,
> +    TCG_REG_R1,
> +    TCG_REG_R2,
> +    TCG_REG_R3,
> +    TCG_REG_R4,
> +    TCG_REG_R5,
> +    TCG_REG_R6,
> +    TCG_REG_R7,
> +    TCG_AREG0 = TCG_REG_R7,
> +#if TCG_TARGET_NB_REGS >= 16
> +    TCG_REG_R8,
> +    TCG_REG_R9,
> +    TCG_REG_R10,
> +    TCG_REG_R11,
> +    TCG_REG_R12,
> +    TCG_REG_R13,
> +    TCG_REG_R14,
> +    TCG_REG_R15,
> +#if TCG_TARGET_NB_REGS >= 32
> +    TCG_REG_R16,
> +    TCG_REG_R17,
> +    TCG_REG_R18,
> +    TCG_REG_R19,
> +    TCG_REG_R20,
> +    TCG_REG_R21,
> +    TCG_REG_R22,
> +    TCG_REG_R23,
> +    TCG_REG_R24,
> +    TCG_REG_R25,
> +    TCG_REG_R26,
> +    TCG_REG_R27,
> +    TCG_REG_R28,
> +    TCG_REG_R29,
> +    TCG_REG_R30,
> +    TCG_REG_R31,
> +#endif
> +#endif
> +    /* Special value UINT8_MAX is used by TCI to encode constant values. */
> +    TCG_CONST = UINT8_MAX
> +} TCGRegister;
> +
> +void tci_disas(uint8_t opc);
> +
> +#endif /* TCG_TARGET_H */
> --
> 1.7.2.5
>
>
>
Stuart Brady Sept. 19, 2011, 10:28 p.m. UTC | #2
On Sun, Sep 18, 2011 at 10:03:07AM +0000, Blue Swirl wrote:

> I was wondering if this #ifdeffery is needed since TCI would probably
> give more performance compared to the alternative, TCG generated
> emulation sequences. But it could be useful for testing those. Maybe
> there should be two options to enable and disable all non-mandatory TCI
> versions.

We could perhaps even allow enabling/disabling of optional ops from the
command line, although I think this would complicate tcg-op.h pretty badly.

> > +/*
> > + * This code implements a TCG which does not generate machine code for some
> > + * real target machine but which generates virtual machine code for an
> > + * interpreter. Interpreted pseudo code is slow, but it works on any host.
> > + *
> > + * Some remarks might help in understanding the code:
> > + *
> > + * "target" or "TCG target" is the machine which runs the generated code.
> > + * This is different to the usual meaning in QEMU where "target" is the
> > + * emulated machine. So normally QEMU host is identical to TCG target.
> > + * Here the TCG target is a virtual machine, but this virtual machine must
> > + * use the same word size like the real machine.
> 
> Why, for performance? Allowing that could be useful for testing TCG,
> perhaps we could even use non-native endianness?

I suppose any mismatch between TCGv_ptr and the host pointer size must
be avoided.  Perhaps it would be worth adding a TCG_TARGET_PTR_BITS and
converting users of TCG_TARGET_REG_BITS appropriately.  I'm surprised
at just how few places I've found that test TCG_TARGET_REG_BITS to
determine the width of a TCGv_ptr.

> > + * Therefore, we need both 32 and 64 bit virtual machines (interpreter).
> > + */
> > +
> > +#if !defined(TCG_TARGET_H)
> > +#define TCG_TARGET_H
> > +
> > +#include "config-host.h"
> > +
> > +#define TCG_TARGET_INTERPRETER 1
> > +
> > +#ifdef CONFIG_DEBUG_TCG
> > +/* Enable debug output. */
> > +#define CONFIG_DEBUG_TCG_INTERPRETER
> > +#endif
> > +
> > +#if 0 /* TCI tries to emulate a little endian host. */
> > +#if defined(HOST_WORDS_BIGENDIAN)
> > +# define TCG_TARGET_WORDS_BIGENDIAN
> > +#endif
> > +#endif
> > +
> > +/* Optional instructions. */
> > +
> > +#define TCG_TARGET_HAS_bswap16_i32      1
> > +#define TCG_TARGET_HAS_bswap32_i32      1
> > +/* Not more than one of the next two defines must be 1. */
> > +#define TCG_TARGET_HAS_div_i32          1
> > +#define TCG_TARGET_HAS_div2_i32         0
> > +#define TCG_TARGET_HAS_ext8s_i32        1
> > +#define TCG_TARGET_HAS_ext16s_i32       1
> > +#define TCG_TARGET_HAS_ext8u_i32        1
> > +#define TCG_TARGET_HAS_ext16u_i32       1
> > +#define TCG_TARGET_HAS_andc_i32         0
> > +#define TCG_TARGET_HAS_deposit_i32      0
> > +#define TCG_TARGET_HAS_eqv_i32          0
> > +#define TCG_TARGET_HAS_nand_i32         0
> > +#define TCG_TARGET_HAS_nor_i32          0
> > +#define TCG_TARGET_HAS_neg_i32          1
> > +#define TCG_TARGET_HAS_not_i32          1
> > +#define TCG_TARGET_HAS_orc_i32          0
> > +#define TCG_TARGET_HAS_rot_i32          1
> > +
> > +#if TCG_TARGET_REG_BITS == 64
> > +#define TCG_TARGET_HAS_bswap16_i64      1
> > +#define TCG_TARGET_HAS_bswap32_i64      1
> > +#define TCG_TARGET_HAS_bswap64_i64      1
> > +#define TCG_TARGET_HAS_deposit_i64      0
> > +/* Not more than one of the next two defines must be 1. */
> > +#define TCG_TARGET_HAS_div_i64          0
> > +#define TCG_TARGET_HAS_div2_i64         0
> > +#define TCG_TARGET_HAS_ext8s_i64        1
> > +#define TCG_TARGET_HAS_ext16s_i64       1
> > +#define TCG_TARGET_HAS_ext32s_i64       1
> > +#define TCG_TARGET_HAS_ext8u_i64        1
> > +#define TCG_TARGET_HAS_ext16u_i64       1
> > +#define TCG_TARGET_HAS_ext32u_i64       1
> > +#define TCG_TARGET_HAS_andc_i64         0
> > +#define TCG_TARGET_HAS_eqv_i64          0
> > +#define TCG_TARGET_HAS_nand_i64         0
> > +#define TCG_TARGET_HAS_nor_i64          0
> > +#define TCG_TARGET_HAS_neg_i64          1
> > +#define TCG_TARGET_HAS_not_i64          1
> > +#define TCG_TARGET_HAS_orc_i64          0
> > +#define TCG_TARGET_HAS_rot_i64          1
> > +#endif /* TCG_TARGET_REG_BITS == 64 */
> > +
> > +/* Offset to user memory in user mode. */
> > +#define TCG_TARGET_HAS_GUEST_BASE
> > +
> > +/* Number of registers available.
> > +   For 32 bit hosts, we need more than 8 registers (call arguments). */
> 
> On i386 there certainly aren't 8 registers, where does 8 come from?

We need eight registers to allow passing of four 32-bit arguments using
the registers.

Alternatively, we could use a stack to pass arguments.  For this, we'd
need to point our stack register (tci_reg[TCG_REG_CALL_STACK]) at some
memory that we use as a stack.  It wouldn't need to be much, just
enough to accommodate the arguments, AFAICT.

Unless we use the stack pointer register, we should not define
TCG_REG_CALL_STACK at all, and we should #ifdef out the parts of
tcg_reg_alloc_call() that use it.  If there's no stack available, the
code should abort in the case where there aren't enough TCI registers
for all of the parameters being passed, although in this case, there
should be a compile time check to ensure that:

    ARRAY_SIZE(tcg_target_call_iarg_regs) ==
        (MAX_OPC_PARAM_PER_ARG * MAX_OPC_PARAM_IARGS)

We might want to relax that check to allow tcg_target_call_iarg_regs to
list more TCI registers than required, but those registers would
just be wasted.

BTW, note that TCG limits us to 64 registers (due to TCGRegSet).
I expect this could be changed (for TCI only) if needed, but if
we allow a user-supplied TCG_TARGET_NB_REGS, then we should check that
it is no more than 64, at least for the time being.

> > +/* #define TCG_TARGET_NB_REGS 32 */
> > +
> > +/* List of registers which are used by TCG. */
> > +typedef enum {
> > +    TCG_REG_R0 = 0,
> > +    TCG_REG_R1,
> > +    TCG_REG_R2,
> > +    TCG_REG_R3,
> > +    TCG_REG_R4,
> > +    TCG_REG_R5,
> > +    TCG_REG_R6,
> > +    TCG_REG_R7,
> > +    TCG_AREG0 = TCG_REG_R7,
> > +#if TCG_TARGET_NB_REGS >= 16
> > +    TCG_REG_R8,
> > +    TCG_REG_R9,
> > +    TCG_REG_R10,
> > +    TCG_REG_R11,
> > +    TCG_REG_R12,
> > +    TCG_REG_R13,
> > +    TCG_REG_R14,
> > +    TCG_REG_R15,
> > +#if TCG_TARGET_NB_REGS >= 32
> > +    TCG_REG_R16,
> > +    TCG_REG_R17,
> > +    TCG_REG_R18,
> > +    TCG_REG_R19,
> > +    TCG_REG_R20,
> > +    TCG_REG_R21,
> > +    TCG_REG_R22,
> > +    TCG_REG_R23,
> > +    TCG_REG_R24,
> > +    TCG_REG_R25,
> > +    TCG_REG_R26,
> > +    TCG_REG_R27,
> > +    TCG_REG_R28,
> > +    TCG_REG_R29,
> > +    TCG_REG_R30,
> > +    TCG_REG_R31,
> > +#endif

This seems unfortunate to me...

I wonder whether some sort of chain of defines would be better:

   /* already defined in osdep.h */

   #define xglue(x, y) x ## y
   #define glue(x, y) xglue(x, y)
   #define stringify(s)    tostring(s)
   #define tostring(s)     #s

   /* common definitions */

   #define NUM_DEF_1(n)                n(0)
   #define NUM_DEF_2(n)  NUM_DEF_1(n)  n(1)
   #define NUM_DEF_3(n)  NUM_DEF_2(n)  n(2)
   #define NUM_DEF_4(n)  NUM_DEF_3(n)  n(3)
   #define NUM_DEF_5(n)  NUM_DEF_4(n)  n(4)
   #define NUM_DEF_6(n)  NUM_DEF_5(n)  n(5)
   #define NUM_DEF_7(n)  NUM_DEF_6(n)  n(6)
   #define NUM_DEF_8(n)  NUM_DEF_7(n)  n(7)
   #define NUM_DEF_9(n)  NUM_DEF_8(n)  n(8)
   #define NUM_DEF_10(n) NUM_DEF_9(n)  n(9)
   #define NUM_DEF_11(n) NUM_DEF_10(n) n(10)
   #define NUM_DEF_12(n) NUM_DEF_11(n) n(11)
   #define NUM_DEF_13(n) NUM_DEF_12(n) n(12)
   #define NUM_DEF_14(n) NUM_DEF_13(n) n(13)
   #define NUM_DEF_15(n) NUM_DEF_14(n) n(14)
   #define NUM_DEF_16(n) NUM_DEF_15(n) n(15)

   #define DEF_TCG_REGS glue(NUM_DEF_,TCG_TARGET_NB_REGS)

   /* tcg-target.h */

   #define DEF_TCG_REG_NUM(x) TCG_REG_R##x,

   typedef enum {
      DEF_TCG_REGS(DEF_TCG_REG_NUM)
   };

   /* tcg-target.c */

   #define DEF_TCG_REG_NAME(x) tostring(r##x),

   static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
      DEF_TCG_REGS(DEF_TCG_REG_NAME)
   };

Okay, so I accept that this is rather horrible, but it does allow us
to define the right number of entries based on TCG_TARGET_NB_REGS
without masses of #ifdefs or relying on the compiler for the host.

It might be better to allow the number of registers to be defined at
run-time -- although TCG would have to be modified not to rely upon
TCG_TARGET_NB_REGS when compiled for TCI.

Cheers,
Andreas Färber Oct. 1, 2011, 4:54 p.m. UTC | #3
Am 17.09.2011 22:00, schrieb Stefan Weil:
> Unlike other tcg target code generators, this one does not generate
> machine code for some cpu. It generates machine independent bytecode
> which is interpreted later.
>
> This allows running QEMU on any host.
>
> Interpreted bytecode is slower than direct execution of generated
> machine code.
>
> Signed-off-by: Stefan Weil <weil@mail.berlios.de>
[...]
> diff --git a/tcg/bytecode/README b/tcg/bytecode/README
> new file mode 100644
> index 0000000..6fe9755
> --- /dev/null
> +++ b/tcg/bytecode/README
> @@ -0,0 +1,129 @@
> +TCG Interpreter (TCI) - Copyright (c) 2011 Stefan Weil.
> +
> +This file is released under GPL 2 or later.
> +
> +1) Introduction
> +
> +TCG (Tiny Code Generator) is a code generator which translates
> +code fragments ("basic blocks") from target code (any of the
> +targets supported by QEMU) to a code representation which
> +can be run on a host.
> +
> +QEMU can create native code for some hosts (arm, hppa, i386, ia64, ppc, ppc64,
> +s390, sparc, x86_64). For others, unofficial host support was written.
> +
> +By adding a code generator for a virtual machine and using an
> +interpreter for the generated bytecode, it is possible to
> +support (almost) any host.
> +
> +This is what TCI (Tiny Code Interpreter) does.
> +
> +2) Implementation
> +
> +Like each TCG host frontend, TCI implements the code generator in
> +tcg-target.c, tcg-target.h. Both files are in directory tcg/bytecode.
> +
> +The additional file tcg/tci.c adds the interpreter.
> +
> +The bytecode consists of opcodes (same numeric values as those used by
> +TCG), command length and arguments of variable size and number.

While reusing TCG opcode values certainly makes things easy to
implement, have you evaluated using LLVM bitcode as alternative to a
fully custom intermediate code format?

Andreas
Stefan Weil Oct. 1, 2011, 9:25 p.m. UTC | #4
Am 01.10.2011 18:54, schrieb Andreas Färber:
> Am 17.09.2011 22:00, schrieb Stefan Weil:
>> Unlike other tcg target code generators, this one does not generate
>> machine code for some cpu. It generates machine independent bytecode
>> which is interpreted later.
>>
>> This allows running QEMU on any host.
>>
>> Interpreted bytecode is slower than direct execution of generated
>> machine code.
>>
>> Signed-off-by: Stefan Weil <weil@mail.berlios.de>
> [...]
>> diff --git a/tcg/bytecode/README b/tcg/bytecode/README
>> new file mode 100644
>> index 0000000..6fe9755
>> --- /dev/null
>> +++ b/tcg/bytecode/README
>> @@ -0,0 +1,129 @@
>> +TCG Interpreter (TCI) - Copyright (c) 2011 Stefan Weil.
>> +
>> +This file is released under GPL 2 or later.
>> +
>> +1) Introduction
>> +
>> +TCG (Tiny Code Generator) is a code generator which translates
>> +code fragments ("basic blocks") from target code (any of the
>> +targets supported by QEMU) to a code representation which
>> +can be run on a host.
>> +
>> +QEMU can create native code for some hosts (arm, hppa, i386, ia64, 
>> ppc, ppc64,
>> +s390, sparc, x86_64). For others, unofficial host support was written.
>> +
>> +By adding a code generator for a virtual machine and using an
>> +interpreter for the generated bytecode, it is possible to
>> +support (almost) any host.
>> +
>> +This is what TCI (Tiny Code Interpreter) does.
>> +
>> +2) Implementation
>> +
>> +Like each TCG host frontend, TCI implements the code generator in
>> +tcg-target.c, tcg-target.h. Both files are in directory tcg/bytecode.
>> +
>> +The additional file tcg/tci.c adds the interpreter.
>> +
>> +The bytecode consists of opcodes (same numeric values as those used by
>> +TCG), command length and arguments of variable size and number.
>
> While reusing TCG opcode values certainly makes things easy to
> implement, have you evaluated using LLVM bitcode as alternative to a
> fully custom intermediate code format?
>
> Andreas

I had a look at several bytecode representations - initially I thought
of using Java. LLVM was on my list, too, but I cannot say that I really
evaluated any of these alternatives. My primary goal was to learn more
about TCG and to get something working, and as you said, reusing the
TCG opcodes made things easier.

LLVM might also be used as a replacement for TCG.
It would be really interesting to see how both compare.

Stefan
Andreas Färber Oct. 9, 2011, 4:19 p.m. UTC | #5
Am 01.10.2011 23:25, schrieb Stefan Weil:
> Am 01.10.2011 18:54, schrieb Andreas Färber:
>> Am 17.09.2011 22:00, schrieb Stefan Weil:
>>> +The bytecode consists of opcodes (same numeric values as those used by
>>> +TCG), command length and arguments of variable size and number.
>>
>> While reusing TCG opcode values certainly makes things easy to
>> implement, have you evaluated using LLVM bitcode as alternative to a
>> fully custom intermediate code format?
> 
> I had a look at several bytecode representations - initially I thought
> of using Java. LLVM was on my list, too, but I cannot say that I really
> evaluated any of these alternatives. My primary goal was to learn more
> about TCG and to get something working, and as you said, reusing the
> TCG opcodes made things easier.

Okay, just thought I'd ask the blunt question. :)

We should be careful not to expose it to outside processes as discussed
elsewhere in the thread or we will have to start caring about ABI
versioning.

> LLVM might also be used as a replacement for TCG.
> It would be really interesting to see how both compare.

Maybe suited for a GSoC project?

Andreas
diff mbox

Patch

diff --git a/dis-asm.h b/dis-asm.h
index 5b07d7f..876975f 100644
--- a/dis-asm.h
+++ b/dis-asm.h
@@ -365,6 +365,7 @@  typedef struct disassemble_info {
    target address.  Return number of bytes processed.  */
 typedef int (*disassembler_ftype) (bfd_vma, disassemble_info *);
 
+int print_insn_bytecode(bfd_vma, disassemble_info*);
 int print_insn_big_mips         (bfd_vma, disassemble_info*);
 int print_insn_little_mips      (bfd_vma, disassemble_info*);
 int print_insn_i386             (bfd_vma, disassemble_info*);
diff --git a/disas.c b/disas.c
index 611b30b..e2061d8 100644
--- a/disas.c
+++ b/disas.c
@@ -273,7 +273,9 @@  void disas(FILE *out, void *code, unsigned long size)
 #else
     disasm_info.endian = BFD_ENDIAN_LITTLE;
 #endif
-#if defined(__i386__)
+#if defined(CONFIG_TCG_INTERPRETER)
+    print_insn = print_insn_bytecode;
+#elif defined(__i386__)
     disasm_info.mach = bfd_mach_i386_i386;
     print_insn = print_insn_i386;
 #elif defined(__x86_64__)
diff --git a/dyngen-exec.h b/dyngen-exec.h
index 8beb7f3..64f76c4 100644
--- a/dyngen-exec.h
+++ b/dyngen-exec.h
@@ -19,7 +19,9 @@ 
 #if !defined(__DYNGEN_EXEC_H__)
 #define __DYNGEN_EXEC_H__
 
-#if defined(__i386__)
+#if defined(CONFIG_TCG_INTERPRETER)
+/* The TCG interpreter does not use special registers. */
+#elif defined(__i386__)
 #define AREG0 "ebp"
 #elif defined(__x86_64__)
 #define AREG0 "r14"
@@ -55,11 +57,18 @@ 
 #error unsupported CPU
 #endif
 
+#if defined(AREG0)
 register CPUState *env asm(AREG0);
+#else
+extern CPUState *env;
+#endif
 
 /* The return address may point to the start of the next instruction.
    Subtracting one gets us the call instruction itself.  */
-#if defined(__s390__) && !defined(__s390x__)
+#if defined(CONFIG_TCG_INTERPRETER)
+extern uint8_t *tci_tb_ptr;
+# define GETPC() ((void *)tci_tb_ptr)
+#elif defined(__s390__) && !defined(__s390x__)
 # define GETPC() ((void*)(((unsigned long)__builtin_return_address(0) & 0x7fffffffUL) - 1))
 #elif defined(__arm__)
 /* Thumb return addresses have the low bit set, so we need to subtract two.
diff --git a/exec-all.h b/exec-all.h
index 9b8d62c..0116acd 100644
--- a/exec-all.h
+++ b/exec-all.h
@@ -122,6 +122,8 @@  void tlb_set_page(CPUState *env, target_ulong vaddr,
 
 #if defined(_ARCH_PPC) || defined(__x86_64__) || defined(__arm__) || defined(__i386__)
 #define USE_DIRECT_JUMP
+#elif defined(CONFIG_TCG_INTERPRETER)
+#define USE_DIRECT_JUMP
 #endif
 
 struct TranslationBlock {
@@ -189,7 +191,14 @@  extern TranslationBlock *tb_phys_hash[CODE_GEN_PHYS_HASH_SIZE];
 
 #if defined(USE_DIRECT_JUMP)
 
-#if defined(_ARCH_PPC)
+#if defined(CONFIG_TCG_INTERPRETER)
+static inline void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr)
+{
+    /* patch the branch destination */
+    *(uint32_t *)jmp_addr = addr - (jmp_addr + 4);
+    /* no need to flush icache explicitly */
+}
+#elif defined(_ARCH_PPC)
 void ppc_tb_set_jmp_target(unsigned long jmp_addr, unsigned long addr);
 #define tb_set_jmp_target1 ppc_tb_set_jmp_target
 #elif defined(__i386__) || defined(__x86_64__)
@@ -223,6 +232,8 @@  static inline void tb_set_jmp_target1(unsigned long jmp_addr, unsigned long addr
     __asm __volatile__ ("swi 0x9f0002" : : "r" (_beg), "r" (_end), "r" (_flg));
 #endif
 }
+#else
+#error tb_set_jmp_target1 is missing
 #endif
 
 static inline void tb_set_jmp_target(TranslationBlock *tb,
diff --git a/tcg/bytecode/README b/tcg/bytecode/README
new file mode 100644
index 0000000..6fe9755
--- /dev/null
+++ b/tcg/bytecode/README
@@ -0,0 +1,129 @@ 
+TCG Interpreter (TCI) - Copyright (c) 2011 Stefan Weil.
+
+This file is released under GPL 2 or later.
+
+1) Introduction
+
+TCG (Tiny Code Generator) is a code generator which translates
+code fragments ("basic blocks") from target code (any of the
+targets supported by QEMU) to a code representation which
+can be run on a host.
+
+QEMU can create native code for some hosts (arm, hppa, i386, ia64, ppc, ppc64,
+s390, sparc, x86_64). For others, unofficial host support was written.
+
+By adding a code generator for a virtual machine and using an
+interpreter for the generated bytecode, it is possible to
+support (almost) any host.
+
+This is what TCI (Tiny Code Interpreter) does.
+
+2) Implementation
+
+Like each TCG host backend, TCI implements the code generator in
+tcg-target.c, tcg-target.h. Both files are in directory tcg/bytecode.
+
+The additional file tcg/tci.c adds the interpreter.
+
+The bytecode consists of opcodes (same numeric values as those used by
+TCG), command length and arguments of variable size and number.
+
+3) Usage
+
+For hosts without native TCG, the interpreter TCI must be enabled by
+
+        configure --enable-tcg-interpreter
+
+If configure is called without --enable-tcg-interpreter, it will
+suggest using this option. Setting it automatically would need
+additional code in configure which must be fixed when new native TCG
+implementations are added.
+
+System emulation should work on any 32 or 64 bit host.
+User mode emulation might work. Maybe a new loader (*.ld)
+is needed. Byte order might be wrong (on big endian hosts)
+and need fixes in configure.
+
+For hosts with native TCG, the interpreter TCI can be enabled by
+
+        configure --enable-tcg-interpreter
+
+The only difference between running qemu with TCI and running without TCI
+should be speed. Especially during development of TCI, it was very
+useful to compare runs with and without TCI. Create /tmp/qemu.log by
+
+        qemu -d in_asm,op_opt,cpu -singlestep
+
+once with interpreter and once without interpreter and compare the resulting
+qemu.log files. This is also useful to see the effects of additional
+registers or additional opcodes (it is easy to modify the virtual machine).
+It can also be used to verify native TCGs.
+
+Hosts with native TCG can also enable TCI by claiming to be unsupported:
+
+        configure --cpu=unknown --enable-tcg-interpreter
+
+configure then no longer uses the native loader (*.ld) for user mode emulation.
+
+
+4) Status
+
+TCI needs special implementation for 32 and 64 bit host, 32 and 64 bit target,
+host and target with same or different endianness.
+
+            | host (le)                     host (be)
+            | 32             64             32             64
+------------+------------------------------------------------------------
+target (le) | s0, u0         s1, u1         s?, u?         s?, u?
+32 bit      |
+            |
+target (le) | sc, uc         s1, u1         s?, u?         s?, u?
+64 bit      |
+            |
+target (be) | sc, u0         sc, uc         s?, u?         s?, u?
+32 bit      |
+            |
+target (be) | sc, uc         sc, uc         s?, u?         s?, u?
+64 bit      |
+            |
+
+System emulation
+s? = untested
+sc = compiles
+s0 = bios works
+s1 = grub works
+s2 = linux boots
+
+Linux user mode emulation
+u? = untested
+uc = compiles
+u0 = static hello works
+u1 = linux-user-test works
+
+5) Todo list
+
+* TCI is not widely tested. It was written and tested on an x86_64 host
+  running i386 and x86_64 system emulation and linux user mode.
+  A cross compiled qemu for i386 host also works with the same basic tests.
+  A cross compiled qemu for mipsel host works, too. It is terribly slow
+  because I run it in a mips malta emulation, so it is an interpreted
+  emulation in an emulation.
+  A cross compiled qemu for arm host works (tested with pc bios).
+  A cross compiled qemu for ppc host works at least partially:
+  i386-linux-user/qemu-i386 can run a simple hello-world program
+  (tested in a ppc emulation).
+
+* Some TCG opcodes are missing in the code generator and/or
+  in the interpreter. These opcodes raise a runtime exception, so it is
+  possible to see where code must be added.
+
+* The pseudo code is not optimized and still ugly. For hosts with special
+  alignment requirements, it needs some fixes (maybe aligned bytecode
+  would also improve speed for hosts which support byte alignment).
+
+* A better disassembler for the pseudo code would be nice (a very primitive
+  disassembler is included in tcg-target.c).
+
+* It might be useful to have a runtime option which selects the native TCG
+  or TCI, so qemu would have to include two TCGs. Today, selecting TCI
+  is a configure option, so you need two compilations of qemu.
diff --git a/tcg/bytecode/tcg-target.c b/tcg/bytecode/tcg-target.c
new file mode 100644
index 0000000..f505ff0
--- /dev/null
+++ b/tcg/bytecode/tcg-target.c
@@ -0,0 +1,955 @@ 
+/*
+ * Tiny Code Generator for QEMU
+ *
+ * Copyright (c) 2009, 2011 Stefan Weil
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* TODO list:
+ * - See TODO comments in code.
+ */
+
+/* Marker for missing code. */
+#define TODO() \
+    do { \
+        fprintf(stderr, "TODO %s:%u: %s()\n", \
+                __FILE__, __LINE__, __func__); \
+        tcg_abort(); \
+    } while (0)
+
+/*
+ * Trace message to see program flow.
+ * Prints file, line and function name to stderr when loglevel is non-zero.
+ * The expansion is a single parenthesized void expression: both arms of
+ * the conditional have type int (mixing int and void arms is not valid
+ * ISO C), and the result is discarded by the outer cast.
+ */
+#if defined(CONFIG_DEBUG_TCG_INTERPRETER)
+#define TRACE() \
+    ((void)(loglevel \
+            ? fprintf(stderr, "TCG %s:%u: %s()\n", \
+                      __FILE__, __LINE__, __func__) \
+            : 0))
+#else
+#define TRACE() ((void)0)
+#endif
+
+/*
+ * Single bit n.
+ * The constant is a 64 bit unsigned value so that the expression
+ * BIT(TCG_TARGET_NB_REGS) - 1 used for the register masks stays well
+ * defined for up to 32 registers (1 << 31 overflows a signed int and
+ * 1 << 32 exceeds the width of a 32 bit int, both are undefined).
+ */
+#define BIT(n) (1ULL << (n))
+
+/*
+ * Bitfield n...m (in 32 bit value): mask with bits m up to n set,
+ * for 31 >= n >= m >= 0.
+ * Both arguments are parenthesized so that expressions with low
+ * precedence operators, e.g. BITS(a | b, c), expand as intended.
+ */
+#define BITS(n, m) \
+    (((0xffffffffU << (31 - (n))) >> (31 - (n) + (m))) << (m))
+
+/* Used for function call generation. */
+#define TCG_REG_CALL_STACK              TCG_REG_R4
+#define TCG_TARGET_STACK_ALIGN          16
+#define TCG_TARGET_CALL_STACK_OFFSET    0
+
+/* Code pointer recorded by tcg_target_qemu_prologue(). It is only
+   written there and not read anywhere else in tcg-target.c. */
+static uint8_t *tb_ret_addr;
+
+/* Macros used in tcg_target_op_defs. */
+#define R       "r"
+#define RI      "ri"
+#if TCG_TARGET_REG_BITS == 32
+# define R64    "r", "r"
+#else
+# define R64    "r"
+#endif
+#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
+# define L      "L", "L"
+# define S      "S", "S"
+#else
+# define L      "L"
+# define S      "S"
+#endif
+
+/*
+ * Operand constraints for the opcodes handled by this code generator.
+ * The table is registered in tcg_target_init() and is terminated by
+ * the entry with opcode -1.
+ *
+ * Each entry gives one constraint string per operand:
+ *   R  ("r")  - operand must be in a register
+ *   RI ("ri") - 'r' plus 'i'; 'i' is not parsed by
+ *               target_parse_constraint(), presumably it is the generic
+ *               TCG constraint which also allows a constant
+ *   L, S      - qemu_ld / qemu_st constraints, treated exactly like 'r'
+ *               by target_parse_constraint(); they expand to two
+ *               operands if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
+ *   R64       - one register on 64 bit hosts, two on 32 bit hosts
+ *
+ * Optional opcodes are listed only if the matching TCG_TARGET_HAS_*
+ * macro evaluates to non-zero. The macros are tested by value (not
+ * with defined()), so a macro which is defined as 0 disables its opcode.
+ */
+static const TCGTargetOpDef tcg_target_op_defs[] = {
+    { INDEX_op_exit_tb, { } },
+    { INDEX_op_goto_tb, { } },
+    { INDEX_op_call, { RI } },
+    { INDEX_op_jmp, { RI } },
+    { INDEX_op_br, { } },
+
+    { INDEX_op_mov_i32, { R, R } },
+    { INDEX_op_movi_i32, { R } },
+
+    { INDEX_op_ld8u_i32, { R, R } },
+    { INDEX_op_ld8s_i32, { R, R } },
+    { INDEX_op_ld16u_i32, { R, R } },
+    { INDEX_op_ld16s_i32, { R, R } },
+    { INDEX_op_ld_i32, { R, R } },
+    { INDEX_op_st8_i32, { R, R } },
+    { INDEX_op_st16_i32, { R, R } },
+    { INDEX_op_st_i32, { R, R } },
+
+    { INDEX_op_add_i32, { R, RI, RI } },
+    { INDEX_op_sub_i32, { R, RI, RI } },
+    { INDEX_op_mul_i32, { R, RI, RI } },
+#if TCG_TARGET_HAS_div_i32
+    { INDEX_op_div_i32, { R, R, R } },
+    { INDEX_op_divu_i32, { R, R, R } },
+    { INDEX_op_rem_i32, { R, R, R } },
+    { INDEX_op_remu_i32, { R, R, R } },
+#elif TCG_TARGET_HAS_div2_i32
+    { INDEX_op_div2_i32, { R, R, "0", "1", R } },
+    { INDEX_op_divu2_i32, { R, R, "0", "1", R } },
+#endif
+    /* TODO: Does R, RI, RI result in faster code than R, R, RI?
+       If both operands are constants, we can optimize. */
+    { INDEX_op_and_i32, { R, RI, RI } },
+#if TCG_TARGET_HAS_andc_i32
+    { INDEX_op_andc_i32, { R, RI, RI } },
+#endif
+#if TCG_TARGET_HAS_eqv_i32
+    { INDEX_op_eqv_i32, { R, RI, RI } },
+#endif
+#if TCG_TARGET_HAS_nand_i32
+    { INDEX_op_nand_i32, { R, RI, RI } },
+#endif
+#if TCG_TARGET_HAS_nor_i32
+    { INDEX_op_nor_i32, { R, RI, RI } },
+#endif
+    { INDEX_op_or_i32, { R, RI, RI } },
+#if TCG_TARGET_HAS_orc_i32
+    { INDEX_op_orc_i32, { R, RI, RI } },
+#endif
+    { INDEX_op_xor_i32, { R, RI, RI } },
+    { INDEX_op_shl_i32, { R, RI, RI } },
+    { INDEX_op_shr_i32, { R, RI, RI } },
+    { INDEX_op_sar_i32, { R, RI, RI } },
+#if TCG_TARGET_HAS_rot_i32
+    { INDEX_op_rotl_i32, { R, RI, RI } },
+    { INDEX_op_rotr_i32, { R, RI, RI } },
+#endif
+
+    { INDEX_op_brcond_i32, { R, RI } },
+
+    { INDEX_op_setcond_i32, { R, R, RI } },
+#if TCG_TARGET_REG_BITS == 64
+    { INDEX_op_setcond_i64, { R, R, RI } },
+#endif /* TCG_TARGET_REG_BITS == 64 */
+
+#if TCG_TARGET_REG_BITS == 32
+    /* TODO: Support R, R, R, R, RI, RI? Will it be faster? */
+    { INDEX_op_add2_i32, { R, R, R, R, R, R } },
+    { INDEX_op_sub2_i32, { R, R, R, R, R, R } },
+    { INDEX_op_brcond2_i32, { R, R, RI, RI } },
+    { INDEX_op_mulu2_i32, { R, R, R, R } },
+    { INDEX_op_setcond2_i32, { R, R, R, RI, RI } },
+#endif
+
+#if TCG_TARGET_HAS_not_i32
+    { INDEX_op_not_i32, { R, R } },
+#endif
+#if TCG_TARGET_HAS_neg_i32
+    { INDEX_op_neg_i32, { R, R } },
+#endif
+
+#if TCG_TARGET_REG_BITS == 64
+    { INDEX_op_mov_i64, { R, R } },
+    { INDEX_op_movi_i64, { R } },
+
+    { INDEX_op_ld8u_i64, { R, R } },
+    { INDEX_op_ld8s_i64, { R, R } },
+    { INDEX_op_ld16u_i64, { R, R } },
+    { INDEX_op_ld16s_i64, { R, R } },
+    { INDEX_op_ld32u_i64, { R, R } },
+    { INDEX_op_ld32s_i64, { R, R } },
+    { INDEX_op_ld_i64, { R, R } },
+
+    { INDEX_op_st8_i64, { R, R } },
+    { INDEX_op_st16_i64, { R, R } },
+    { INDEX_op_st32_i64, { R, R } },
+    { INDEX_op_st_i64, { R, R } },
+
+    { INDEX_op_add_i64, { R, RI, RI } },
+    { INDEX_op_sub_i64, { R, RI, RI } },
+    { INDEX_op_mul_i64, { R, RI, RI } },
+#if TCG_TARGET_HAS_div_i64
+    { INDEX_op_div_i64, { R, R, R } },
+    { INDEX_op_divu_i64, { R, R, R } },
+    { INDEX_op_rem_i64, { R, R, R } },
+    { INDEX_op_remu_i64, { R, R, R } },
+#elif TCG_TARGET_HAS_div2_i64
+    { INDEX_op_div2_i64, { R, R, "0", "1", R } },
+    { INDEX_op_divu2_i64, { R, R, "0", "1", R } },
+#endif
+    { INDEX_op_and_i64, { R, RI, RI } },
+#if TCG_TARGET_HAS_andc_i64
+    { INDEX_op_andc_i64, { R, RI, RI } },
+#endif
+#if TCG_TARGET_HAS_eqv_i64
+    { INDEX_op_eqv_i64, { R, RI, RI } },
+#endif
+#if TCG_TARGET_HAS_nand_i64
+    { INDEX_op_nand_i64, { R, RI, RI } },
+#endif
+#if TCG_TARGET_HAS_nor_i64
+    { INDEX_op_nor_i64, { R, RI, RI } },
+#endif
+    { INDEX_op_or_i64, { R, RI, RI } },
+#if TCG_TARGET_HAS_orc_i64
+    { INDEX_op_orc_i64, { R, RI, RI } },
+#endif
+    { INDEX_op_xor_i64, { R, RI, RI } },
+    { INDEX_op_shl_i64, { R, RI, RI } },
+    { INDEX_op_shr_i64, { R, RI, RI } },
+    { INDEX_op_sar_i64, { R, RI, RI } },
+#if TCG_TARGET_HAS_rot_i64
+    { INDEX_op_rotl_i64, { R, RI, RI } },
+    { INDEX_op_rotr_i64, { R, RI, RI } },
+#endif
+    { INDEX_op_brcond_i64, { R, RI } },
+
+#if TCG_TARGET_HAS_ext8s_i64
+    { INDEX_op_ext8s_i64, { R, R } },
+#endif
+#if TCG_TARGET_HAS_ext16s_i64
+    { INDEX_op_ext16s_i64, { R, R } },
+#endif
+#if TCG_TARGET_HAS_ext32s_i64
+    { INDEX_op_ext32s_i64, { R, R } },
+#endif
+#if TCG_TARGET_HAS_ext8u_i64
+    { INDEX_op_ext8u_i64, { R, R } },
+#endif
+#if TCG_TARGET_HAS_ext16u_i64
+    { INDEX_op_ext16u_i64, { R, R } },
+#endif
+#if TCG_TARGET_HAS_ext32u_i64
+    { INDEX_op_ext32u_i64, { R, R } },
+#endif
+#if TCG_TARGET_HAS_bswap16_i64
+    { INDEX_op_bswap16_i64, { R, R } },
+#endif
+#if TCG_TARGET_HAS_bswap32_i64
+    { INDEX_op_bswap32_i64, { R, R } },
+#endif
+#if TCG_TARGET_HAS_bswap64_i64
+    { INDEX_op_bswap64_i64, { R, R } },
+#endif
+#if TCG_TARGET_HAS_not_i64
+    { INDEX_op_not_i64, { R, R } },
+#endif
+#if TCG_TARGET_HAS_neg_i64
+    { INDEX_op_neg_i64, { R, R } },
+#endif
+#endif /* TCG_TARGET_REG_BITS == 64 */
+
+    { INDEX_op_qemu_ld8u, { R, L } },
+    { INDEX_op_qemu_ld8s, { R, L } },
+    { INDEX_op_qemu_ld16u, { R, L } },
+    { INDEX_op_qemu_ld16s, { R, L } },
+    { INDEX_op_qemu_ld32, { R, L } },
+#if TCG_TARGET_REG_BITS == 64
+    { INDEX_op_qemu_ld32u, { R, L } },
+    { INDEX_op_qemu_ld32s, { R, L } },
+#endif
+    { INDEX_op_qemu_ld64, { R64, L } },
+
+    { INDEX_op_qemu_st8, { R, S } },
+    { INDEX_op_qemu_st16, { R, S } },
+    { INDEX_op_qemu_st32, { R, S } },
+    { INDEX_op_qemu_st64, { R64, S } },
+
+#if TCG_TARGET_HAS_ext8s_i32
+    { INDEX_op_ext8s_i32, { R, R } },
+#endif
+#if TCG_TARGET_HAS_ext16s_i32
+    { INDEX_op_ext16s_i32, { R, R } },
+#endif
+#if TCG_TARGET_HAS_ext8u_i32
+    { INDEX_op_ext8u_i32, { R, R } },
+#endif
+#if TCG_TARGET_HAS_ext16u_i32
+    { INDEX_op_ext16u_i32, { R, R } },
+#endif
+
+#if TCG_TARGET_HAS_bswap16_i32
+    { INDEX_op_bswap16_i32, { R, R } },
+#endif
+#if TCG_TARGET_HAS_bswap32_i32
+    { INDEX_op_bswap32_i32, { R, R } },
+#endif
+
+    { -1 },
+};
+
+static const int tcg_target_reg_alloc_order[] = {
+    TCG_REG_R0,
+    TCG_REG_R1,
+    TCG_REG_R2,
+    TCG_REG_R3,
+#if 0 /* used for TCG_REG_CALL_STACK */
+    TCG_REG_R4,
+#endif
+    TCG_REG_R5,
+    TCG_REG_R6,
+    TCG_REG_R7,
+#if TCG_TARGET_NB_REGS >= 16
+    TCG_REG_R8,
+    TCG_REG_R9,
+    TCG_REG_R10,
+    TCG_REG_R11,
+    TCG_REG_R12,
+    TCG_REG_R13,
+    TCG_REG_R14,
+    TCG_REG_R15,
+#endif
+};
+
+#if MAX_OPC_PARAM_IARGS != 4
+# error Fix needed, number of supported input arguments changed!
+#endif
+
+static const int tcg_target_call_iarg_regs[] = {
+    TCG_REG_R0,
+    TCG_REG_R1,
+    TCG_REG_R2,
+    TCG_REG_R3,
+#if TCG_TARGET_REG_BITS == 32
+    /* 32 bit hosts need 2 * MAX_OPC_PARAM_IARGS registers. */
+#if 0 /* used for TCG_REG_CALL_STACK */
+    TCG_REG_R4,
+#endif
+    TCG_REG_R5,
+    TCG_REG_R6,
+    TCG_REG_R7,
+#if TCG_TARGET_NB_REGS >= 16
+    TCG_REG_R8,
+#else
+# error Too few input registers available
+#endif
+#endif
+};
+
+static const int tcg_target_call_oarg_regs[] = {
+    TCG_REG_R0,
+#if TCG_TARGET_REG_BITS == 32
+    TCG_REG_R1
+#endif
+};
+
+#ifndef NDEBUG
+static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
+    "r00",
+    "r01",
+    "r02",
+    "r03",
+    "r04",
+    "r05",
+    "r06",
+    "r07",
+#if TCG_TARGET_NB_REGS >= 16
+    "r08",
+    "r09",
+    "r10",
+    "r11",
+    "r12",
+    "r13",
+    "r14",
+    "r15",
+#if TCG_TARGET_NB_REGS >= 32
+    "r16",
+    "r17",
+    "r18",
+    "r19",
+    "r20",
+    "r21",
+    "r22",
+    "r23",
+    "r24",
+    "r25",
+    "r26",
+    "r27",
+    "r28",
+    "r29",
+    "r30",
+    "r31"
+#endif
+#endif
+};
+#endif
+
+static void flush_icache_range(unsigned long start, unsigned long stop)
+{
+    TRACE();
+}
+
+static void patch_reloc(uint8_t *code_ptr, int type,
+                        tcg_target_long value, tcg_target_long addend)
+{
+    /* tcg_out_reloc always uses the same type, addend. */
+    assert(type == sizeof(tcg_target_long));
+    assert(addend == 0);
+    assert(value != 0);
+    *(tcg_target_long *)code_ptr = value;
+}
+
+/* Parse target specific constraints. */
+static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
+{
+    const char *ct_str = *pct_str;
+    switch (ct_str[0]) {
+    case 'r':
+    case 'L':                   /* qemu_ld constraint */
+    case 'S':                   /* qemu_st constraint */
+        ct->ct |= TCG_CT_REG;
+        tcg_regset_set32(ct->u.regs, 0, BIT(TCG_TARGET_NB_REGS) - 1);
+        break;
+    default:
+        return -1;
+    }
+    ct_str++;
+    *pct_str = ct_str;
+    return 0;
+}
+
+#include "dis-asm.h"
+
+/* Disassemble bytecode. */
+int print_insn_bytecode(bfd_vma addr, disassemble_info *info)
+{
+    int length;
+    uint8_t byte;
+    int status;
+    TCGOpcode op;
+
+    status = info->read_memory_func(addr, &byte, 1, info);
+    if (status != 0) {
+        info->memory_error_func(status, addr, info);
+        return -1;
+    }
+    op = byte;
+
+    addr++;
+    status = info->read_memory_func(addr, &byte, 1, info);
+    if (status != 0) {
+        info->memory_error_func(status, addr, info);
+        return -1;
+    }
+    length = byte;
+
+    if (op >= ARRAY_SIZE(tcg_op_defs)) {
+        return length;
+    }
+
+    const TCGOpDef *def = &tcg_op_defs[op];
+    int nb_oargs = def->nb_oargs;
+    int nb_iargs = def->nb_iargs;
+    int nb_cargs = def->nb_cargs;
+    FILE *f = info->stream;
+    /* TODO: Improve disassembler output. */
+    info->fprintf_func(f, "%s\to=%d i=%d c=%d",
+                       def->name, nb_oargs, nb_iargs, nb_cargs);
+
+    return length;
+}
+
+#if defined(CONFIG_DEBUG_TCG_INTERPRETER)
+/*
+ * Show current bytecode. Used by tcg interpreter.
+ *
+ * Prints the name of opcode opc and its number of output, input and
+ * constant arguments to stderr.
+ *
+ * opc can hold any value up to 255, but tcg_op_defs may have fewer
+ * entries, so it is range checked (like in print_insn_bytecode) before
+ * it is used as an index. Values without an entry are reported instead
+ * of reading beyond the end of the table.
+ */
+void tci_disas(uint8_t opc)
+{
+    const TCGOpDef *def;
+
+    if (opc >= ARRAY_SIZE(tcg_op_defs)) {
+        fprintf(stderr, "TCG invalid opcode %u\n", opc);
+        return;
+    }
+    def = &tcg_op_defs[opc];
+    fprintf(stderr, "TCG %s %u, %u, %u\n",
+            def->name, def->nb_oargs, def->nb_iargs, def->nb_cargs);
+}
+#endif
+
+/* Write value (native size). */
+static void tcg_out_i(TCGContext *s, tcg_target_ulong v)
+{
+    *(tcg_target_ulong *)s->code_ptr = v;
+    s->code_ptr += sizeof(tcg_target_ulong);
+}
+
+/* Write 64 bit value. */
+static void tcg_out64(TCGContext *s, uint64_t v)
+{
+    *(uint64_t *)s->code_ptr = v;
+    s->code_ptr += sizeof(v);
+}
+
+/* Write opcode. */
+static void tcg_out_op_t(TCGContext *s, TCGOpcode op)
+{
+    tcg_out8(s, op);
+    tcg_out8(s, 0);
+}
+
+/* Write register. */
+static void tcg_out_r(TCGContext *s, TCGArg t0)
+{
+    assert(t0 < TCG_TARGET_NB_REGS);
+    tcg_out8(s, t0);
+}
+
+/* Write register or constant (native size). */
+static void tcg_out_ri(TCGContext *s, int const_arg, TCGArg arg)
+{
+    if (const_arg) {
+        assert(const_arg == 1);
+        tcg_out8(s, TCG_CONST);
+        tcg_out_i(s, arg);
+    } else {
+        tcg_out_r(s, arg);
+    }
+}
+
+/* Write register or constant (32 bit). */
+static void tcg_out_ri32(TCGContext *s, int const_arg, TCGArg arg)
+{
+    if (const_arg) {
+        assert(const_arg == 1);
+        tcg_out8(s, TCG_CONST);
+        tcg_out32(s, arg);
+    } else {
+        tcg_out_r(s, arg);
+    }
+}
+
+#if TCG_TARGET_REG_BITS == 64
+/* Write register or constant (64 bit). */
+static void tcg_out_ri64(TCGContext *s, int const_arg, TCGArg arg)
+{
+    if (const_arg) {
+        assert(const_arg == 1);
+        tcg_out8(s, TCG_CONST);
+        tcg_out64(s, arg);
+    } else {
+        tcg_out_r(s, arg);
+    }
+}
+#endif
+
+/* Write label. */
+static void tci_out_label(TCGContext *s, TCGArg arg)
+{
+    TCGLabel *label = &s->labels[arg];
+    if (label->has_value) {
+        tcg_out_i(s, label->u.value);
+        assert(label->u.value);
+    } else {
+        tcg_out_reloc(s, s->code_ptr, sizeof(tcg_target_ulong), arg, 0);
+        tcg_out_i(s, 0);
+    }
+}
+
+static void tcg_out_ld(TCGContext *s, TCGType type, int ret, int arg1,
+                       tcg_target_long arg2)
+{
+    uint8_t *old_code_ptr = s->code_ptr;
+    if (type == TCG_TYPE_I32) {
+        tcg_out_op_t(s, INDEX_op_ld_i32);
+        tcg_out_r(s, ret);
+        tcg_out_r(s, arg1);
+        tcg_out32(s, arg2);
+    } else {
+        assert(type == TCG_TYPE_I64);
+#if TCG_TARGET_REG_BITS == 64
+        tcg_out_op_t(s, INDEX_op_ld_i64);
+        tcg_out_r(s, ret);
+        tcg_out_r(s, arg1);
+        assert(arg2 == (uint32_t)arg2);
+        tcg_out32(s, arg2);
+#else
+        TODO();
+#endif
+    }
+    old_code_ptr[1] = s->code_ptr - old_code_ptr;
+}
+
+static void tcg_out_mov(TCGContext *s, TCGType type, int ret, int arg)
+{
+    uint8_t *old_code_ptr = s->code_ptr;
+    assert(ret != arg);
+#if TCG_TARGET_REG_BITS == 32
+    tcg_out_op_t(s, INDEX_op_mov_i32);
+#else
+    tcg_out_op_t(s, INDEX_op_mov_i64);
+#endif
+    tcg_out_r(s, ret);
+    tcg_out_r(s, arg);
+    old_code_ptr[1] = s->code_ptr - old_code_ptr;
+}
+
+static void tcg_out_movi(TCGContext *s, TCGType type,
+                         int t0, tcg_target_long arg)
+{
+    uint8_t *old_code_ptr = s->code_ptr;
+    uint32_t arg32 = arg;
+    if (type == TCG_TYPE_I32 || arg == arg32) {
+        tcg_out_op_t(s, INDEX_op_movi_i32);
+        tcg_out_r(s, t0);
+        tcg_out32(s, arg32);
+    } else {
+        assert(type == TCG_TYPE_I64);
+#if TCG_TARGET_REG_BITS == 64
+        tcg_out_op_t(s, INDEX_op_movi_i64);
+        tcg_out_r(s, t0);
+        tcg_out64(s, arg);
+#else
+        TODO();
+#endif
+    }
+    old_code_ptr[1] = s->code_ptr - old_code_ptr;
+}
+
+static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
+                       const int *const_args)
+{
+    uint8_t *old_code_ptr = s->code_ptr;
+
+    tcg_out_op_t(s, opc);
+
+    switch (opc) {
+    case INDEX_op_exit_tb:
+        tcg_out64(s, args[0]);
+        break;
+    case INDEX_op_goto_tb:
+        if (s->tb_jmp_offset) {
+            /* Direct jump method. */
+            assert(args[0] < ARRAY_SIZE(s->tb_jmp_offset));
+            s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf;
+            tcg_out32(s, 0);
+        } else {
+            /* Indirect jump method. */
+            TODO();
+        }
+        assert(args[0] < ARRAY_SIZE(s->tb_next_offset));
+        s->tb_next_offset[args[0]] = s->code_ptr - s->code_buf;
+        break;
+    case INDEX_op_br:
+        tci_out_label(s, args[0]);
+        break;
+    case INDEX_op_call:
+        tcg_out_ri(s, const_args[0], args[0]);
+        break;
+    case INDEX_op_jmp:
+        TODO();
+        break;
+    case INDEX_op_setcond_i32:
+        tcg_out_r(s, args[0]);
+        tcg_out_r(s, args[1]);
+        tcg_out_ri32(s, const_args[2], args[2]);
+        tcg_out8(s, args[3]);   /* condition */
+        break;
+#if TCG_TARGET_REG_BITS == 32
+    case INDEX_op_setcond2_i32:
+        /* setcond2_i32 cond, t0, t1_low, t1_high, t2_low, t2_high */
+        tcg_out_r(s, args[0]);
+        tcg_out_r(s, args[1]);
+        tcg_out_r(s, args[2]);
+        tcg_out_ri32(s, const_args[3], args[3]);
+        tcg_out_ri32(s, const_args[4], args[4]);
+        tcg_out8(s, args[5]);   /* condition */
+        break;
+#elif TCG_TARGET_REG_BITS == 64
+    case INDEX_op_setcond_i64:
+        tcg_out_r(s, args[0]);
+        tcg_out_r(s, args[1]);
+        tcg_out_ri64(s, const_args[2], args[2]);
+        tcg_out8(s, args[3]);   /* condition */
+        break;
+#endif
+    case INDEX_op_movi_i32:
+        TODO(); /* Handled by tcg_out_movi? */
+        break;
+    case INDEX_op_ld8u_i32:
+    case INDEX_op_ld8s_i32:
+    case INDEX_op_ld16u_i32:
+    case INDEX_op_ld16s_i32:
+    case INDEX_op_ld_i32:
+    case INDEX_op_st8_i32:
+    case INDEX_op_st16_i32:
+    case INDEX_op_st_i32:
+    case INDEX_op_ld8u_i64:
+    case INDEX_op_ld8s_i64:
+    case INDEX_op_ld16u_i64:
+    case INDEX_op_ld16s_i64:
+    case INDEX_op_ld32u_i64:
+    case INDEX_op_ld32s_i64:
+    case INDEX_op_ld_i64:
+    case INDEX_op_st8_i64:
+    case INDEX_op_st16_i64:
+    case INDEX_op_st32_i64:
+    case INDEX_op_st_i64:
+        tcg_out_r(s, args[0]);
+        tcg_out_r(s, args[1]);
+        assert(args[2] == (uint32_t)args[2]);
+        tcg_out32(s, args[2]);
+        break;
+    case INDEX_op_add_i32:
+    case INDEX_op_sub_i32:
+    case INDEX_op_mul_i32:
+    case INDEX_op_and_i32:
+    case INDEX_op_andc_i32:     /* Optional (TCG_TARGET_HAS_andc_i32). */
+    case INDEX_op_eqv_i32:      /* Optional (TCG_TARGET_HAS_eqv_i32). */
+    case INDEX_op_nand_i32:     /* Optional (TCG_TARGET_HAS_nand_i32). */
+    case INDEX_op_nor_i32:      /* Optional (TCG_TARGET_HAS_nor_i32). */
+    case INDEX_op_or_i32:
+    case INDEX_op_orc_i32:      /* Optional (TCG_TARGET_HAS_orc_i32). */
+    case INDEX_op_xor_i32:
+    case INDEX_op_shl_i32:
+    case INDEX_op_shr_i32:
+    case INDEX_op_sar_i32:
+    case INDEX_op_rotl_i32:     /* Optional (TCG_TARGET_HAS_rot_i32). */
+    case INDEX_op_rotr_i32:     /* Optional (TCG_TARGET_HAS_rot_i32). */
+        tcg_out_r(s, args[0]);
+        tcg_out_ri32(s, const_args[1], args[1]);
+        tcg_out_ri32(s, const_args[2], args[2]);
+        break;
+
+#if TCG_TARGET_REG_BITS == 64
+    case INDEX_op_mov_i64:
+    case INDEX_op_movi_i64:
+        TODO();
+        break;
+    case INDEX_op_add_i64:
+    case INDEX_op_sub_i64:
+    case INDEX_op_mul_i64:
+    case INDEX_op_and_i64:
+    case INDEX_op_andc_i64:     /* Optional (TCG_TARGET_HAS_andc_i64). */
+    case INDEX_op_eqv_i64:      /* Optional (TCG_TARGET_HAS_eqv_i64). */
+    case INDEX_op_nand_i64:     /* Optional (TCG_TARGET_HAS_nand_i64). */
+    case INDEX_op_nor_i64:      /* Optional (TCG_TARGET_HAS_nor_i64). */
+    case INDEX_op_or_i64:
+    case INDEX_op_orc_i64:      /* Optional (TCG_TARGET_HAS_orc_i64). */
+    case INDEX_op_xor_i64:
+    case INDEX_op_shl_i64:
+    case INDEX_op_shr_i64:
+    case INDEX_op_sar_i64:
+    /* TODO: Implementation of rotl_i64, rotr_i64 missing in tci.c. */
+    case INDEX_op_rotl_i64:     /* Optional (TCG_TARGET_HAS_rot_i64). */
+    case INDEX_op_rotr_i64:     /* Optional (TCG_TARGET_HAS_rot_i64). */
+        tcg_out_r(s, args[0]);
+        tcg_out_ri64(s, const_args[1], args[1]);
+        tcg_out_ri64(s, const_args[2], args[2]);
+        break;
+    case INDEX_op_div_i64:      /* Optional (TCG_TARGET_HAS_div_i64). */
+    case INDEX_op_divu_i64:     /* Optional (TCG_TARGET_HAS_div_i64). */
+    case INDEX_op_rem_i64:      /* Optional (TCG_TARGET_HAS_div_i64). */
+    case INDEX_op_remu_i64:     /* Optional (TCG_TARGET_HAS_div_i64). */
+        TODO();
+        break;
+    case INDEX_op_div2_i64:     /* Optional (TCG_TARGET_HAS_div2_i64). */
+    case INDEX_op_divu2_i64:    /* Optional (TCG_TARGET_HAS_div2_i64). */
+        TODO();
+        break;
+    case INDEX_op_brcond_i64:
+        tcg_out_r(s, args[0]);
+        tcg_out_ri64(s, const_args[1], args[1]);
+        tcg_out8(s, args[2]);           /* condition */
+        tci_out_label(s, args[3]);
+        break;
+    case INDEX_op_bswap16_i64:  /* Optional (TCG_TARGET_HAS_bswap16_i64). */
+    case INDEX_op_bswap32_i64:  /* Optional (TCG_TARGET_HAS_bswap32_i64). */
+    case INDEX_op_bswap64_i64:  /* Optional (TCG_TARGET_HAS_bswap64_i64). */
+    case INDEX_op_not_i64:      /* Optional (TCG_TARGET_HAS_not_i64). */
+    case INDEX_op_neg_i64:      /* Optional (TCG_TARGET_HAS_neg_i64). */
+    case INDEX_op_ext8s_i64:    /* Optional (TCG_TARGET_HAS_ext8s_i64). */
+    case INDEX_op_ext8u_i64:    /* Optional (TCG_TARGET_HAS_ext8u_i64). */
+    case INDEX_op_ext16s_i64:   /* Optional (TCG_TARGET_HAS_ext16s_i64). */
+    case INDEX_op_ext16u_i64:   /* Optional (TCG_TARGET_HAS_ext16u_i64). */
+    case INDEX_op_ext32s_i64:   /* Optional (TCG_TARGET_HAS_ext32s_i64). */
+    case INDEX_op_ext32u_i64:   /* Optional (TCG_TARGET_HAS_ext32u_i64). */
+#endif /* TCG_TARGET_REG_BITS == 64 */
+    case INDEX_op_neg_i32:      /* Optional (TCG_TARGET_HAS_neg_i32). */
+    case INDEX_op_not_i32:      /* Optional (TCG_TARGET_HAS_not_i32). */
+    case INDEX_op_ext8s_i32:    /* Optional (TCG_TARGET_HAS_ext8s_i32). */
+    case INDEX_op_ext16s_i32:   /* Optional (TCG_TARGET_HAS_ext16s_i32). */
+    case INDEX_op_ext8u_i32:    /* Optional (TCG_TARGET_HAS_ext8u_i32). */
+    case INDEX_op_ext16u_i32:   /* Optional (TCG_TARGET_HAS_ext16u_i32). */
+    case INDEX_op_bswap16_i32:  /* Optional (TCG_TARGET_HAS_bswap16_i32). */
+    case INDEX_op_bswap32_i32:  /* Optional (TCG_TARGET_HAS_bswap32_i32). */
+        tcg_out_r(s, args[0]);
+        tcg_out_r(s, args[1]);
+        break;
+    case INDEX_op_div_i32:      /* Optional (TCG_TARGET_HAS_div_i32). */
+    case INDEX_op_divu_i32:     /* Optional (TCG_TARGET_HAS_div_i32). */
+    case INDEX_op_rem_i32:      /* Optional (TCG_TARGET_HAS_div_i32). */
+    case INDEX_op_remu_i32:     /* Optional (TCG_TARGET_HAS_div_i32). */
+        tcg_out_r(s, args[0]);
+        tcg_out_ri32(s, const_args[1], args[1]);
+        tcg_out_ri32(s, const_args[2], args[2]);
+        break;
+    case INDEX_op_div2_i32:     /* Optional (TCG_TARGET_HAS_div2_i32). */
+    case INDEX_op_divu2_i32:    /* Optional (TCG_TARGET_HAS_div2_i32). */
+        TODO();
+        break;
+#if TCG_TARGET_REG_BITS == 32
+    case INDEX_op_add2_i32:
+    case INDEX_op_sub2_i32:
+        tcg_out_r(s, args[0]);
+        tcg_out_r(s, args[1]);
+        tcg_out_r(s, args[2]);
+        tcg_out_r(s, args[3]);
+        tcg_out_r(s, args[4]);
+        tcg_out_r(s, args[5]);
+        break;
+    case INDEX_op_brcond2_i32:
+        tcg_out_r(s, args[0]);
+        tcg_out_r(s, args[1]);
+        tcg_out_ri32(s, const_args[2], args[2]);
+        tcg_out_ri32(s, const_args[3], args[3]);
+        tcg_out8(s, args[4]);           /* condition */
+        tci_out_label(s, args[5]);
+        break;
+    case INDEX_op_mulu2_i32:
+        tcg_out_r(s, args[0]);
+        tcg_out_r(s, args[1]);
+        tcg_out_r(s, args[2]);
+        tcg_out_r(s, args[3]);
+        break;
+#endif
+    case INDEX_op_brcond_i32:
+        tcg_out_r(s, args[0]);
+        tcg_out_ri32(s, const_args[1], args[1]);
+        tcg_out8(s, args[2]);           /* condition */
+        tci_out_label(s, args[3]);
+        break;
+    case INDEX_op_qemu_ld8u:
+    case INDEX_op_qemu_ld8s:
+    case INDEX_op_qemu_ld16u:
+    case INDEX_op_qemu_ld16s:
+    case INDEX_op_qemu_ld32:
+#if TCG_TARGET_REG_BITS == 64
+    case INDEX_op_qemu_ld32s:
+    case INDEX_op_qemu_ld32u:
+#endif
+        tcg_out_r(s, *args++);
+        tcg_out_r(s, *args++);
+#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
+        tcg_out_r(s, *args++);
+#endif
+#ifdef CONFIG_SOFTMMU
+        tcg_out_i(s, *args);
+#endif
+        break;
+    case INDEX_op_qemu_ld64:
+        tcg_out_r(s, *args++);
+#if TCG_TARGET_REG_BITS == 32
+        tcg_out_r(s, *args++);
+#endif
+        tcg_out_r(s, *args++);
+#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
+        tcg_out_r(s, *args++);
+#endif
+#ifdef CONFIG_SOFTMMU
+        tcg_out_i(s, *args);
+#endif
+        break;
+    case INDEX_op_qemu_st8:
+    case INDEX_op_qemu_st16:
+    case INDEX_op_qemu_st32:
+        tcg_out_r(s, *args++);
+        tcg_out_r(s, *args++);
+#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
+        tcg_out_r(s, *args++);
+#endif
+#ifdef CONFIG_SOFTMMU
+        tcg_out_i(s, *args);
+#endif
+        break;
+    case INDEX_op_qemu_st64:
+        tcg_out_r(s, *args++);
+#if TCG_TARGET_REG_BITS == 32
+        tcg_out_r(s, *args++);
+#endif
+        tcg_out_r(s, *args++);
+#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
+        tcg_out_r(s, *args++);
+#endif
+#ifdef CONFIG_SOFTMMU
+        tcg_out_i(s, *args);
+#endif
+        break;
+    case INDEX_op_end:
+        TODO();
+        break;
+    default:
+        fprintf(stderr, "Missing: %s\n", tcg_op_defs[opc].name);
+        tcg_abort();
+    }
+    old_code_ptr[1] = s->code_ptr - old_code_ptr;
+}
+
+/*
+ * Emit a store instruction (st_i32 or st_i64, selected by type).
+ *
+ * The instruction consists of the opcode, a length byte, the register
+ * operands arg and arg1 and finally arg2 written as a 32 bit value.
+ * tcg_out_op_t() writes the length byte as 0; it is patched with the
+ * total size of the instruction at the end of this function.
+ *
+ * 64 bit stores are only implemented for 64 bit hosts (TODO() aborts
+ * otherwise).
+ */
+static void tcg_out_st(TCGContext *s, TCGType type, int arg, int arg1,
+                       tcg_target_long arg2)
+{
+    uint8_t *old_code_ptr = s->code_ptr;
+    if (type == TCG_TYPE_I32) {
+        tcg_out_op_t(s, INDEX_op_st_i32);
+        tcg_out_r(s, arg);
+        tcg_out_r(s, arg1);
+        tcg_out32(s, arg2);
+    } else {
+        assert(type == TCG_TYPE_I64);
+#if TCG_TARGET_REG_BITS == 64
+        tcg_out_op_t(s, INDEX_op_st_i64);
+        tcg_out_r(s, arg);
+        tcg_out_r(s, arg1);
+        /* Only 32 bits are encoded, same range check as in tcg_out_ld. */
+        assert(arg2 == (uint32_t)arg2);
+        tcg_out32(s, arg2);
+#else
+        TODO();
+#endif
+    }
+    old_code_ptr[1] = s->code_ptr - old_code_ptr;
+}
+
+/* Test if a constant matches the constraint. */
+static int tcg_target_const_match(tcg_target_long val,
+                                  const TCGArgConstraint *arg_ct)
+{
+    /* No need to return 0 or 1, 0 or != 0 is good enough. */
+    return arg_ct->ct & TCG_CT_CONST;
+}
+
+/* Maximum number of registers used for input function arguments. */
+static int tcg_target_get_call_iarg_regs_count(int flags)
+{
+    return ARRAY_SIZE(tcg_target_call_iarg_regs);
+}
+
+static void tcg_target_init(TCGContext *s)
+{
+#if defined(CONFIG_DEBUG_TCG_INTERPRETER)
+    const char *envval = getenv("DEBUG_TCG");
+    if (envval) {
+        loglevel = strtol(envval, NULL, 0);
+    }
+#endif
+    TRACE();
+
+    /* The current code uses uint8_t for tcg operations. */
+    assert(ARRAY_SIZE(tcg_op_defs) <= UINT8_MAX);
+
+    /* Registers available for 32 bit operations. */
+    tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0,
+                     BIT(TCG_TARGET_NB_REGS) - 1);
+    /* Registers available for 64 bit operations. */
+    tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I64], 0,
+                     BIT(TCG_TARGET_NB_REGS) - 1);
+    /* TODO: Which registers should be set here? */
+    tcg_regset_set32(tcg_target_call_clobber_regs, 0,
+                     BIT(TCG_TARGET_NB_REGS) - 1);
+    tcg_regset_clear(s->reserved_regs);
+    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
+    tcg_add_target_add_op_defs(tcg_target_op_defs);
+    tcg_set_frame(s, TCG_AREG0, offsetof(CPUState, temp_buf),
+                  CPU_TEMP_BUF_NLONGS * sizeof(long));
+}
+
+/* Generate global QEMU prologue and epilogue code. */
+static void tcg_target_qemu_prologue(TCGContext *s)
+{
+    TRACE();
+    tb_ret_addr = s->code_ptr;
+}
diff --git a/tcg/bytecode/tcg-target.h b/tcg/bytecode/tcg-target.h
new file mode 100644
index 0000000..05aaaf2
--- /dev/null
+++ b/tcg/bytecode/tcg-target.h
@@ -0,0 +1,152 @@ 
+/*
+ * Tiny Code Generator for QEMU
+ *
+ * Copyright (c) 2009, 2011 Stefan Weil
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * This code implements a TCG which does not generate machine code for some
+ * real target machine but which generates virtual machine code for an
+ * interpreter. Interpreted pseudo code is slow, but it works on any host.
+ *
+ * Some remarks might help in understanding the code:
+ *
+ * "target" or "TCG target" is the machine which runs the generated code.
+ * This is different to the usual meaning in QEMU where "target" is the
+ * emulated machine. So normally QEMU host is identical to TCG target.
+ * Here the TCG target is a virtual machine, but this virtual machine must
+ * use the same word size as the real machine.
+ * Therefore, we need both 32 and 64 bit virtual machines (interpreters).
+ */
+
+#if !defined(TCG_TARGET_H)
+#define TCG_TARGET_H
+
+#include "config-host.h"
+
+#define TCG_TARGET_INTERPRETER 1 /* code for this target is interpreted */
+
+#ifdef CONFIG_DEBUG_TCG
+/* Enable debug output of the interpreter whenever CONFIG_DEBUG_TCG is
+   defined. */
+#define CONFIG_DEBUG_TCG_INTERPRETER
+#endif
+
+#if 0 /* Disabled: TCI tries to emulate a little endian host. */
+#if defined(HOST_WORDS_BIGENDIAN)
+# define TCG_TARGET_WORDS_BIGENDIAN
+#endif
+#endif /* 0: TCG_TARGET_WORDS_BIGENDIAN is never defined here */
+
+/* Optional instructions.
+   A value of 1 declares the optional operation as provided by this
+   target, a value of 0 as not provided. */
+
+#define TCG_TARGET_HAS_bswap16_i32      1
+#define TCG_TARGET_HAS_bswap32_i32      1
+/* At most one of the next two defines may be 1. */
+#define TCG_TARGET_HAS_div_i32          1
+#define TCG_TARGET_HAS_div2_i32         0
+#define TCG_TARGET_HAS_ext8s_i32        1
+#define TCG_TARGET_HAS_ext16s_i32       1
+#define TCG_TARGET_HAS_ext8u_i32        1
+#define TCG_TARGET_HAS_ext16u_i32       1
+#define TCG_TARGET_HAS_andc_i32         0
+#define TCG_TARGET_HAS_deposit_i32      0
+#define TCG_TARGET_HAS_eqv_i32          0
+#define TCG_TARGET_HAS_nand_i32         0
+#define TCG_TARGET_HAS_nor_i32          0
+#define TCG_TARGET_HAS_neg_i32          1
+#define TCG_TARGET_HAS_not_i32          1
+#define TCG_TARGET_HAS_orc_i32          0
+#define TCG_TARGET_HAS_rot_i32          1
+
+#if TCG_TARGET_REG_BITS == 64
+#define TCG_TARGET_HAS_bswap16_i64      1
+#define TCG_TARGET_HAS_bswap32_i64      1
+#define TCG_TARGET_HAS_bswap64_i64      1
+#define TCG_TARGET_HAS_deposit_i64      0
+/* At most one of the next two defines may be 1.
+   NOTE(review): both are 0 here although TCG_TARGET_HAS_div_i32 is 1;
+   confirm that this difference is intended. */
+#define TCG_TARGET_HAS_div_i64          0
+#define TCG_TARGET_HAS_div2_i64         0
+#define TCG_TARGET_HAS_ext8s_i64        1
+#define TCG_TARGET_HAS_ext16s_i64       1
+#define TCG_TARGET_HAS_ext32s_i64       1
+#define TCG_TARGET_HAS_ext8u_i64        1
+#define TCG_TARGET_HAS_ext16u_i64       1
+#define TCG_TARGET_HAS_ext32u_i64       1
+#define TCG_TARGET_HAS_andc_i64         0
+#define TCG_TARGET_HAS_eqv_i64          0
+#define TCG_TARGET_HAS_nand_i64         0
+#define TCG_TARGET_HAS_nor_i64          0
+#define TCG_TARGET_HAS_neg_i64          1
+#define TCG_TARGET_HAS_not_i64          1
+#define TCG_TARGET_HAS_orc_i64          0
+#define TCG_TARGET_HAS_rot_i64          1
+#endif /* TCG_TARGET_REG_BITS == 64 */
+
+/* Offset to user memory in user mode. */
+#define TCG_TARGET_HAS_GUEST_BASE
+
+/* Number of registers available.
+   For 32 bit hosts, we need more than 8 registers (call arguments).
+   The TCGRegister enumeration only defines R8...R15 for values >= 16
+   and R16...R31 for values >= 32. The alternatives 8 and 32 are kept
+   here as comments. */
+/* #define TCG_TARGET_NB_REGS 8 */
+#define TCG_TARGET_NB_REGS 16
+/* #define TCG_TARGET_NB_REGS 32 */
+
+/* List of registers which are used by TCG.
+   R0...R7 always exist, R8...R15 and R16...R31 only exist if
+   TCG_TARGET_NB_REGS is large enough. Register numbers start at 0 and
+   are consecutive. */
+typedef enum {
+    TCG_REG_R0 = 0,
+    TCG_REG_R1,
+    TCG_REG_R2,
+    TCG_REG_R3,
+    TCG_REG_R4,
+    TCG_REG_R5,
+    TCG_REG_R6,
+    TCG_REG_R7,
+    TCG_AREG0 = TCG_REG_R7, /* alias of R7, no additional register */
+#if TCG_TARGET_NB_REGS >= 16
+    TCG_REG_R8,
+    TCG_REG_R9,
+    TCG_REG_R10,
+    TCG_REG_R11,
+    TCG_REG_R12,
+    TCG_REG_R13,
+    TCG_REG_R14,
+    TCG_REG_R15,
+#if TCG_TARGET_NB_REGS >= 32
+    TCG_REG_R16,
+    TCG_REG_R17,
+    TCG_REG_R18,
+    TCG_REG_R19,
+    TCG_REG_R20,
+    TCG_REG_R21,
+    TCG_REG_R22,
+    TCG_REG_R23,
+    TCG_REG_R24,
+    TCG_REG_R25,
+    TCG_REG_R26,
+    TCG_REG_R27,
+    TCG_REG_R28,
+    TCG_REG_R29,
+    TCG_REG_R30,
+    TCG_REG_R31,
+#endif /* TCG_TARGET_NB_REGS >= 32 */
+#endif /* TCG_TARGET_NB_REGS >= 16 */
+    /* Special value UINT8_MAX is used by TCI to encode constant values.
+       It lies outside the range of the register numbers defined above. */
+    TCG_CONST = UINT8_MAX
+} TCGRegister;
+
+void tci_disas(uint8_t opc); /* Not defined in this header.
+                                NOTE(review): presumably a disassembly
+                                helper for opcode 'opc' - confirm. */
+
+#endif /* TCG_TARGET_H */