diff mbox

[2/4] tcg-hppa: Finish the port.

Message ID 3f11350da2fbc94c0095c66883bc27fa4858c883.1270682952.git.rth@twiddle.net
State New
Headers show

Commit Message

Richard Henderson April 7, 2010, 11:56 a.m. UTC
Delete inline functions from tcg-target.h that don't need to be there,
move the others to tcg-target.c.  Add 'Z', 'I', 'J' constraints for
0, signed 11-bit, and signed 5-bit respectively.  Add GUEST_BASE support
similar to ppc64, with the value stored in a register.  Add missing
registers to reg_alloc_order.  Add support for 12-bit branch relocations.
Add functions for synthetic operations: addi, mtctl, dep, shd, vshd, ori,
andi, shifts, rotates, multiply, branches, setcond.  Split out TLB reads
from qemu_ld and qemu_st; fix argument loading for tlb external calls.
Generate the prologue.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 configure             |    5 +-
 tcg/hppa/tcg-target.c | 1758 ++++++++++++++++++++++++++++++++++---------------
 tcg/hppa/tcg-target.h |  142 +----
 3 files changed, 1258 insertions(+), 647 deletions(-)

Comments

Aurelien Jarno April 8, 2010, 9:56 a.m. UTC | #1
On Wed, Apr 07, 2010 at 04:56:43AM -0700, Richard Henderson wrote:
> Delete inline functions from tcg-target.h that don't need to be there,
> move the others to tcg-target.c.  Add 'Z', 'I', 'J' constraints for
> 0, signed 11-bit, and signed 5-bit respectively.  Add GUEST_BASE support
> similar to ppc64, with the value stored in a register.  Add missing

Doing so actually don't work in a lot of cases. See below for more
explanations.

> registers to reg_alloc_order.  Add support for 12-bit branch relocations.
> Add functions for synthetic operations: addi, mtctl, dep, shd, vshd, ori,
> andi, shifts, rotates, multiply, branches, setcond.  Split out TLB reads
> from qemu_ld and qemu_st; fix argument loading for tlb external calls.
> Generate the prologue.

I have applied the patch. I have some comments though, it would be nice
if you can address them with additional patches.

> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  configure             |    5 +-
>  tcg/hppa/tcg-target.c | 1758 ++++++++++++++++++++++++++++++++++---------------
>  tcg/hppa/tcg-target.h |  142 +----
>  3 files changed, 1258 insertions(+), 647 deletions(-)
> 
> diff --git a/configure b/configure
> index 1d5fb17..966cd7d 100755
> --- a/configure
> +++ b/configure
> @@ -722,6 +722,9 @@ case "$cpu" in
>      ia64*)
>             host_guest_base="yes"
>             ;;
> +    hppa*)
> +           host_guest_base="yes"
> +           ;;
>  esac
>  
>  [ -z "$guest_base" ] && guest_base="$host_guest_base"
> @@ -2744,7 +2747,7 @@ if test "$target_linux_user" = "yes" -o "$target_bsd_user" = "yes" ; then
>      # -static is used to avoid g1/g3 usage by the dynamic linker
>      ldflags="$linker_script -static $ldflags"
>      ;;
> -  i386|x86_64|ppc|ppc64|s390|sparc64|alpha|arm|m68k|mips|mips64|ia64)
> +  *)
>      ldflags="$linker_script $ldflags"
>      ;;
>    esac
> diff --git a/tcg/hppa/tcg-target.c b/tcg/hppa/tcg-target.c
> index f9ae898..4e15256 100644
> --- a/tcg/hppa/tcg-target.c
> +++ b/tcg/hppa/tcg-target.c
> @@ -24,41 +24,26 @@
>  
>  #ifndef NDEBUG
>  static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
> -    "%r0",
> -    "%r1",
> -    "%rp",
> -    "%r3",
> -    "%r4",
> -    "%r5",
> -    "%r6",
> -    "%r7",
> -    "%r8",
> -    "%r9",
> -    "%r10",
> -    "%r11",
> -    "%r12",
> -    "%r13",
> -    "%r14",
> -    "%r15",
> -    "%r16",
> -    "%r17",
> -    "%r18",
> -    "%r19",
> -    "%r20",
> -    "%r21",
> -    "%r22",
> -    "%r23",
> -    "%r24",
> -    "%r25",
> -    "%r26",
> -    "%dp",
> -    "%ret0",
> -    "%ret1",
> -    "%sp",
> -    "%r31",
> +    "%r0", "%r1", "%rp", "%r3", "%r4", "%r5", "%r6", "%r7",
> +    "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
> +    "%r16", "%r17", "%r18", "%r19", "%r20", "%r21", "%r22", "%r23",
> +    "%r24", "%r25", "%r26", "%dp", "%ret0", "%ret1", "%sp", "%r31",
>  };
>  #endif
>  
> +/* This is an 8 byte temp slot in the stack frame.  */
> +#define STACK_TEMP_OFS -16
> +
> +#ifndef GUEST_BASE
> +#define GUEST_BASE 0
> +#endif
> +
> +#ifdef CONFIG_USE_GUEST_BASE
> +#define TCG_GUEST_BASE_REG TCG_REG_R16
> +#else
> +#define TCG_GUEST_BASE_REG TCG_REG_R0
> +#endif
> +
>  static const int tcg_target_reg_alloc_order[] = {
>      TCG_REG_R4,
>      TCG_REG_R5,
> @@ -75,6 +60,14 @@ static const int tcg_target_reg_alloc_order[] = {
>      TCG_REG_R14,
>      TCG_REG_R15,
>      TCG_REG_R16,
> +
> +    TCG_REG_R26,
> +    TCG_REG_R25,
> +    TCG_REG_R24,
> +    TCG_REG_R23,
> +
> +    TCG_REG_RET0,
> +    TCG_REG_RET1,
>  };
>  
>  static const int tcg_target_call_iarg_regs[4] = {
> @@ -89,16 +82,98 @@ static const int tcg_target_call_oarg_regs[2] = {
>      TCG_REG_RET1,
>  };
>  
> +/* True iff val fits a signed field of width BITS.  */
> +static inline int check_fit_tl(tcg_target_long val, unsigned int bits)
> +{
> +    return (val << ((sizeof(tcg_target_long) * 8 - bits))
> +            >> (sizeof(tcg_target_long) * 8 - bits)) == val;
> +}
> +
> +/* True iff depi can be used to compute (reg | MASK).
> +   Accept a bit pattern like:
> +      0....01....1
> +      1....10....0
> +      0..01..10..0
> +   Copied from gcc sources.  */
> +static inline int or_mask_p(tcg_target_ulong mask)
> +{
> +    mask += mask & -mask;
> +    return (mask & (mask - 1)) == 0;
> +}
> +
> +/* True iff depi or extru can be used to compute (reg & mask).
> +   Accept a bit pattern like these:
> +      0....01....1
> +      1....10....0
> +      1..10..01..1 
> +   Copied from gcc sources.  */
> +static inline int and_mask_p(tcg_target_ulong mask)
> +{
> +    return or_mask_p(~mask);
> +}
> +
> +static int low_sign_ext(int val, int len)
> +{
> +    return (((val << 1) & ~(-1u << len)) | ((val >> (len - 1)) & 1));
> +}
> +
> +static int reassemble_12(int as12)
> +{
> +    return (((as12 & 0x800) >> 11) |
> +            ((as12 & 0x400) >> 8) |
> +            ((as12 & 0x3ff) << 3));
> +}
> +
> +static int reassemble_17(int as17)
> +{
> +    return (((as17 & 0x10000) >> 16) |
> +            ((as17 & 0x0f800) << 5) |
> +            ((as17 & 0x00400) >> 8) |
> +            ((as17 & 0x003ff) << 3));
> +}
> +
> +static int reassemble_21(int as21)
> +{
> +    return (((as21 & 0x100000) >> 20) |
> +            ((as21 & 0x0ffe00) >> 8) |
> +            ((as21 & 0x000180) << 7) |
> +            ((as21 & 0x00007c) << 14) |
> +            ((as21 & 0x000003) << 12));
> +}
> +
> +/* ??? Bizzarely, there is no PCREL12F relocation type.  I guess all
> +   such relocations are simply fully handled by the assembler.  */
> +#define R_PARISC_PCREL12F  R_PARISC_NONE
> +
>  static void patch_reloc(uint8_t *code_ptr, int type,
>                          tcg_target_long value, tcg_target_long addend)
>  {
> +    uint32_t *insn_ptr = (uint32_t *)code_ptr;
> +    uint32_t insn = *insn_ptr;
> +    tcg_target_long pcrel;
> +
> +    value += addend;
> +    pcrel = (value - ((tcg_target_long)code_ptr + 8)) >> 2;
> +
>      switch (type) {
> +    case R_PARISC_PCREL12F:
> +        assert(check_fit_tl(pcrel, 12));
> +        /* ??? We assume all patches are forward.  See tcg_out_brcond
> +           re setting the NUL bit on the branch and eliding the nop.  */
> +        assert(pcrel >= 0);
> +        insn &= ~0x1ffdu;
> +        insn |= reassemble_12(pcrel);
> +        break;
>      case R_PARISC_PCREL17F:
> -        hppa_patch17f((uint32_t *)code_ptr, value, addend);
> +        assert(check_fit_tl(pcrel, 17));
> +        insn &= ~0x1f1ffdu;
> +        insn |= reassemble_17(pcrel);
>          break;
>      default:
>          tcg_abort();
>      }
> +
> +    *insn_ptr = insn;
>  }
>  
>  /* maximum number of register used for input function arguments */
> @@ -126,6 +201,15 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
>          tcg_regset_reset_reg(ct->u.regs, TCG_REG_R24);
>          tcg_regset_reset_reg(ct->u.regs, TCG_REG_R23);
>          break;
> +    case 'Z':
> +        ct->ct |= TCG_CT_CONST_0;
> +        break;
> +    case 'I':
> +        ct->ct |= TCG_CT_CONST_S11;
> +        break;
> +    case 'J':
> +        ct->ct |= TCG_CT_CONST_S5;
> +	break;
>      default:
>          return -1;
>      }
> @@ -135,15 +219,19 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
>  }
>  
>  /* test if a constant matches the constraint */
> -static inline int tcg_target_const_match(tcg_target_long val,
> -                                         const TCGArgConstraint *arg_ct)
> +static int tcg_target_const_match(tcg_target_long val,
> +                                  const TCGArgConstraint *arg_ct)
>  {
> -    int ct;
> -
> -    ct = arg_ct->ct;
> -
> -    /* TODO */
> -
> +    int ct = arg_ct->ct;
> +    if (ct & TCG_CT_CONST) {
> +        return 1;
> +    } else if (ct & TCG_CT_CONST_0) {
> +        return val == 0;
> +    } else if (ct & TCG_CT_CONST_S5) {
> +        return check_fit_tl(val, 5);
> +    } else if (ct & TCG_CT_CONST_S11) {
> +        return check_fit_tl(val, 11);
> +    }
>      return 0;
>  }
>  
> @@ -163,191 +251,588 @@ static inline int tcg_target_const_match(tcg_target_long val,
>  #define INSN_SHDEP_CP(x) ((31 - (x)) << 5)
>  #define INSN_SHDEP_P(x)  ((x) << 5)
>  #define INSN_COND(x)     ((x) << 13)
> +#define INSN_IM11(x)     low_sign_ext(x, 11)
> +#define INSN_IM14(x)     low_sign_ext(x, 14)
> +#define INSN_IM5(x)      (low_sign_ext(x, 5) << 16)
> +
> +#define COND_NEVER   0
> +#define COND_EQ      1
> +#define COND_LT      2
> +#define COND_LE      3
> +#define COND_LTU     4
> +#define COND_LEU     5
> +#define COND_SV      6
> +#define COND_OD      7
> +#define COND_FALSE   8
> +
> +#define INSN_ADD	(INSN_OP(0x02) | INSN_EXT6(0x18))
> +#define INSN_ADDC	(INSN_OP(0x02) | INSN_EXT6(0x1c))
> +#define INSN_ADDI	(INSN_OP(0x2d))
> +#define INSN_ADDIL	(INSN_OP(0x0a))
> +#define INSN_ADDL	(INSN_OP(0x02) | INSN_EXT6(0x28))
> +#define INSN_AND	(INSN_OP(0x02) | INSN_EXT6(0x08))
> +#define INSN_ANDCM	(INSN_OP(0x02) | INSN_EXT6(0x00))
> +#define INSN_COMCLR	(INSN_OP(0x02) | INSN_EXT6(0x22))
> +#define INSN_COMICLR	(INSN_OP(0x24))
> +#define INSN_DEP	(INSN_OP(0x35) | INSN_EXT3SH(3))
> +#define INSN_DEPI	(INSN_OP(0x35) | INSN_EXT3SH(7))
> +#define INSN_EXTRS	(INSN_OP(0x34) | INSN_EXT3SH(7))
> +#define INSN_EXTRU	(INSN_OP(0x34) | INSN_EXT3SH(6))
> +#define INSN_LDIL	(INSN_OP(0x08))
> +#define INSN_LDO	(INSN_OP(0x0d))
> +#define INSN_MTCTL	(INSN_OP(0x00) | INSN_EXT8B(0xc2))
> +#define INSN_OR		(INSN_OP(0x02) | INSN_EXT6(0x09))
> +#define INSN_SHD	(INSN_OP(0x34) | INSN_EXT3SH(2))
> +#define INSN_SUB	(INSN_OP(0x02) | INSN_EXT6(0x10))
> +#define INSN_SUBB	(INSN_OP(0x02) | INSN_EXT6(0x14))
> +#define INSN_SUBI	(INSN_OP(0x25))
> +#define INSN_VEXTRS	(INSN_OP(0x34) | INSN_EXT3SH(5))
> +#define INSN_VEXTRU	(INSN_OP(0x34) | INSN_EXT3SH(4))
> +#define INSN_VSHD	(INSN_OP(0x34) | INSN_EXT3SH(0))
> +#define INSN_XOR	(INSN_OP(0x02) | INSN_EXT6(0x0a))
> +#define INSN_ZDEP	(INSN_OP(0x35) | INSN_EXT3SH(2))
> +#define INSN_ZVDEP	(INSN_OP(0x35) | INSN_EXT3SH(0))
> +
> +#define INSN_BL         (INSN_OP(0x3a) | INSN_EXT3BR(0))
> +#define INSN_BL_N       (INSN_OP(0x3a) | INSN_EXT3BR(0) | 2)
> +#define INSN_BLR        (INSN_OP(0x3a) | INSN_EXT3BR(2))
> +#define INSN_BV         (INSN_OP(0x3a) | INSN_EXT3BR(6))
> +#define INSN_BV_N       (INSN_OP(0x3a) | INSN_EXT3BR(6) | 2)
> +#define INSN_BLE_SR4    (INSN_OP(0x39) | (1 << 13))
> +
> +#define INSN_LDB        (INSN_OP(0x10))
> +#define INSN_LDH        (INSN_OP(0x11))
> +#define INSN_LDW        (INSN_OP(0x12))
> +#define INSN_LDWM       (INSN_OP(0x13))
> +#define INSN_FLDDS      (INSN_OP(0x0b) | INSN_EXT4(0) | (1 << 12))
> +
> +#define INSN_LDBX	(INSN_OP(0x03) | INSN_EXT4(0))
> +#define INSN_LDHX	(INSN_OP(0x03) | INSN_EXT4(1))
> +#define INSN_LDWX       (INSN_OP(0x03) | INSN_EXT4(2))
> +
> +#define INSN_STB        (INSN_OP(0x18))
> +#define INSN_STH        (INSN_OP(0x19))
> +#define INSN_STW        (INSN_OP(0x1a))
> +#define INSN_STWM       (INSN_OP(0x1b))
> +#define INSN_FSTDS      (INSN_OP(0x0b) | INSN_EXT4(8) | (1 << 12))
> +
> +#define INSN_COMBT      (INSN_OP(0x20))
> +#define INSN_COMBF      (INSN_OP(0x22))
> +#define INSN_COMIBT     (INSN_OP(0x21))
> +#define INSN_COMIBF     (INSN_OP(0x23))
> +
> +/* supplied by libgcc */
> +extern void *__canonicalize_funcptr_for_compare(void *);
> +
> +static void tcg_out_mov(TCGContext *s, int ret, int arg)
> +{
> +    /* PA1.1 defines COPY as OR r,0,t; PA2.0 defines COPY as LDO 0(r),t
> +       but hppa-dis.c is unaware of this definition */
> +    if (ret != arg) {
> +        tcg_out32(s, INSN_OR | INSN_T(ret) | INSN_R1(arg)
> +                  | INSN_R2(TCG_REG_R0));
> +    }
> +}
>  
> -#define COND_NEVER 0
> -#define COND_EQUAL 1
> -#define COND_LT    2
> -#define COND_LTEQ  3
> -#define COND_LTU   4
> -#define COND_LTUEQ 5
> -#define COND_SV    6
> -#define COND_OD    7
> +static void tcg_out_movi(TCGContext *s, TCGType type,
> +                         int ret, tcg_target_long arg)
> +{
> +    if (check_fit_tl(arg, 14)) {
> +        tcg_out32(s, INSN_LDO | INSN_R1(ret)
> +                  | INSN_R2(TCG_REG_R0) | INSN_IM14(arg));
> +    } else {
> +        uint32_t hi, lo;
> +        hi = arg >> 11;
> +        lo = arg & 0x7ff;
> +
> +        tcg_out32(s, INSN_LDIL | INSN_R2(ret) | reassemble_21(hi));
> +        if (lo) {
> +            tcg_out32(s, INSN_LDO | INSN_R1(ret)
> +                      | INSN_R2(ret) | INSN_IM14(lo));
> +        }
> +    }
> +}
>  
> +static void tcg_out_ldst(TCGContext *s, int ret, int addr,
> +                         tcg_target_long offset, int op)
> +{
> +    if (!check_fit_tl(offset, 14)) {
> +        uint32_t hi, lo, op;
>  
> -/* Logical ADD */
> -#define ARITH_ADD  (INSN_OP(0x02) | INSN_EXT6(0x28))
> -#define ARITH_AND  (INSN_OP(0x02) | INSN_EXT6(0x08))
> -#define ARITH_OR   (INSN_OP(0x02) | INSN_EXT6(0x09))
> -#define ARITH_XOR  (INSN_OP(0x02) | INSN_EXT6(0x0a))
> -#define ARITH_SUB  (INSN_OP(0x02) | INSN_EXT6(0x10))
> +        hi = offset >> 11;
> +        lo = offset & 0x7ff;
>  
> -#define SHD        (INSN_OP(0x34) | INSN_EXT3SH(2))
> -#define VSHD       (INSN_OP(0x34) | INSN_EXT3SH(0))
> -#define DEP        (INSN_OP(0x35) | INSN_EXT3SH(3))
> -#define ZDEP       (INSN_OP(0x35) | INSN_EXT3SH(2))
> -#define ZVDEP      (INSN_OP(0x35) | INSN_EXT3SH(0))
> -#define EXTRU      (INSN_OP(0x34) | INSN_EXT3SH(6))
> -#define EXTRS      (INSN_OP(0x34) | INSN_EXT3SH(7))
> -#define VEXTRS     (INSN_OP(0x34) | INSN_EXT3SH(5))
> +        if (addr == TCG_REG_R0) {
> +            op = INSN_LDIL | INSN_R2(TCG_REG_R1);
> +        } else {
> +            op = INSN_ADDIL | INSN_R2(addr);
> +        }
> +        tcg_out32(s, op | reassemble_21(hi));
>  
> -#define SUBI       (INSN_OP(0x25))
> -#define MTCTL      (INSN_OP(0x00) | INSN_EXT8B(0xc2))
> +        addr = TCG_REG_R1;
> +	offset = lo;
> +    }
>  
> -#define BL         (INSN_OP(0x3a) | INSN_EXT3BR(0))
> -#define BLE_SR4    (INSN_OP(0x39) | (1 << 13))
> -#define BV         (INSN_OP(0x3a) | INSN_EXT3BR(6))
> -#define BV_N       (INSN_OP(0x3a) | INSN_EXT3BR(6) | 2)
> -#define LDIL       (INSN_OP(0x08))
> -#define LDO        (INSN_OP(0x0d))
> +    if (ret != addr || offset != 0 || op != INSN_LDO) {
> +        tcg_out32(s, op | INSN_R1(ret) | INSN_R2(addr) | INSN_IM14(offset));
> +    }
> +}
>  
> -#define LDB        (INSN_OP(0x10))
> -#define LDH        (INSN_OP(0x11))
> -#define LDW        (INSN_OP(0x12))
> -#define LDWM       (INSN_OP(0x13))
> +/* This function is required by tcg.c.  */
> +static inline void tcg_out_ld(TCGContext *s, TCGType type, int ret,
> +                              int arg1, tcg_target_long arg2)
> +{
> +    tcg_out_ldst(s, ret, arg1, arg2, INSN_LDW);
> +}
> +
> +/* This function is required by tcg.c.  */
> +static inline void tcg_out_st(TCGContext *s, TCGType type, int ret,
> +                              int arg1, tcg_target_long arg2)
> +{
> +    tcg_out_ldst(s, ret, arg1, arg2, INSN_STW);
> +}
> +
> +static void tcg_out_ldst_index(TCGContext *s, int data,
> +                               int base, int index, int op)
> +{
> +    tcg_out32(s, op | INSN_T(data) | INSN_R1(index) | INSN_R2(base));
> +}
> +
> +static inline void tcg_out_addi2(TCGContext *s, int ret, int arg1,
> +                                 tcg_target_long val)
> +{
> +    tcg_out_ldst(s, ret, arg1, val, INSN_LDO);
> +}
>  
> -#define STB        (INSN_OP(0x18))
> -#define STH        (INSN_OP(0x19))
> -#define STW        (INSN_OP(0x1a))
> -#define STWM       (INSN_OP(0x1b))
> +/* This function is required by tcg.c.  */
> +static inline void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
> +{
> +    tcg_out_addi2(s, reg, reg, val);
> +}
>  
> -#define COMBT      (INSN_OP(0x20))
> -#define COMBF      (INSN_OP(0x22))
> +static inline void tcg_out_arith(TCGContext *s, int t, int r1, int r2, int op)
> +{
> +    tcg_out32(s, op | INSN_T(t) | INSN_R1(r1) | INSN_R2(r2));
> +}
>  
> -static int lowsignext(uint32_t val, int start, int length)
> +static inline void tcg_out_arithi(TCGContext *s, int t, int r1,
> +                                  tcg_target_long val, int op)
>  {
> -    return (((val << 1) & ~(~0 << length)) |
> -            ((val >> (length - 1)) & 1)) << start;
> +    assert(check_fit_tl(val, 11));
> +    tcg_out32(s, op | INSN_R1(t) | INSN_R2(r1) | INSN_IM11(val));
>  }
>  
> -static inline void tcg_out_mov(TCGContext *s, int ret, int arg)
> +static inline void tcg_out_nop(TCGContext *s)
>  {
> -    /* PA1.1 defines COPY as OR r,0,t */
> -    tcg_out32(s, ARITH_OR | INSN_T(ret) | INSN_R1(arg) | INSN_R2(TCG_REG_R0));
> +    tcg_out_arith(s, TCG_REG_R0, TCG_REG_R0, TCG_REG_R0, INSN_OR);
> +}
>  
> -    /* PA2.0 defines COPY as LDO 0(r),t
> -     * but hppa-dis.c is unaware of this definition */
> -    /* tcg_out32(s, LDO | INSN_R1(ret) | INSN_R2(arg) | reassemble_14(0)); */
> +static inline void tcg_out_mtctl_sar(TCGContext *s, int arg)
> +{
> +    tcg_out32(s, INSN_MTCTL | INSN_R2(11) | INSN_R1(arg));
> +}
> +
> +/* Extract LEN bits at position OFS from ARG and place in RET.
> +   Note that here the bit ordering is reversed from the PA-RISC
> +   standard, such that the right-most bit is 0.  */
> +static inline void tcg_out_extr(TCGContext *s, int ret, int arg,
> +                                unsigned ofs, unsigned len, int sign)
> +{
> +    assert(ofs < 32 && len <= 32 - ofs);
> +    tcg_out32(s, (sign ? INSN_EXTRS : INSN_EXTRU)
> +              | INSN_R1(ret) | INSN_R2(arg)
> +              | INSN_SHDEP_P(31 - ofs) | INSN_DEP_LEN(len));
>  }
>  
> -static inline void tcg_out_movi(TCGContext *s, TCGType type,
> -                                int ret, tcg_target_long arg)
> +/* Likewise with OFS interpreted little-endian.  */
> +static inline void tcg_out_dep(TCGContext *s, int ret, int arg,
> +                               unsigned ofs, unsigned len)
>  {
> -    if (arg == (arg & 0x1fff)) {
> -        tcg_out32(s, LDO | INSN_R1(ret) | INSN_R2(TCG_REG_R0) |
> -                     reassemble_14(arg));
> +    assert(ofs < 32 && len <= 32 - ofs);
> +    tcg_out32(s, INSN_DEP | INSN_R2(ret) | INSN_R1(arg)
> +              | INSN_SHDEP_CP(31 - ofs) | INSN_DEP_LEN(len));
> +}
> +
> +static inline void tcg_out_shd(TCGContext *s, int ret, int hi, int lo,
> +                               unsigned count)
> +{
> +    assert(count < 32);
> +    tcg_out32(s, INSN_SHD | INSN_R1(hi) | INSN_R2(lo) | INSN_T(ret)
> +              | INSN_SHDEP_CP(count));
> +}
> +
> +static void tcg_out_vshd(TCGContext *s, int ret, int hi, int lo, int creg)
> +{
> +    tcg_out_mtctl_sar(s, creg);
> +    tcg_out32(s, INSN_VSHD | INSN_T(ret) | INSN_R1(hi) | INSN_R2(lo));
> +}
> +
> +static void tcg_out_ori(TCGContext *s, int ret, int arg, tcg_target_ulong m)
> +{
> +    if (m == 0) {
> +        tcg_out_mov(s, ret, arg);
> +    } else if (m == -1) {
> +        tcg_out_movi(s, TCG_TYPE_I32, ret, -1);

Those cases are already eliminated in tcg/tcg-op.h. This code looks
redundant.

> +    } else if (or_mask_p(m)) {
> +        int bs0, bs1;
> +
> +        for (bs0 = 0; bs0 < 32; bs0++) {
> +            if ((m & (1u << bs0)) != 0) {
> +                break;
> +            }
> +        }
> +        for (bs1 = bs0; bs1 < 32; bs1++) {
> +            if ((m & (1u << bs1)) == 0) {
> +                break;
> +            }
> +        }
> +        assert(bs1 == 32 || (1ul << bs1) > m);
> +
> +        tcg_out_mov(s, ret, arg);
> +        tcg_out32(s, INSN_DEPI | INSN_R2(ret) | INSN_IM5(-1)
> +                  | INSN_SHDEP_CP(31 - bs0) | INSN_DEP_LEN(bs1 - bs0));
> +    } else {
> +        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R1, m);
> +        tcg_out_arith(s, ret, arg, TCG_REG_R1, INSN_OR);

Do we really want a movi here? It would be better to leave the tcg code
load the constant itself, so that if the same constant is used twice, it
is only loaded once.

> +    }
> +}
> +
> +static void tcg_out_andi(TCGContext *s, int ret, int arg, tcg_target_ulong m)
> +{
> +    if (m == 0) {
> +        tcg_out_mov(s, ret, TCG_REG_R0);
> +    } else if (m == -1) {
> +        tcg_out_mov(s, ret, arg);

Same.

> +    } else if (and_mask_p(m)) {
> +        int ls0, ls1, ms0;
> +
> +        for (ls0 = 0; ls0 < 32; ls0++) {
> +            if ((m & (1u << ls0)) == 0) {
> +                break;
> +            }
> +        }
> +        for (ls1 = ls0; ls1 < 32; ls1++) {
> +            if ((m & (1u << ls1)) != 0) {
> +                break;
> +            }
> +        }
> +        for (ms0 = ls1; ms0 < 32; ms0++) {
> +            if ((m & (1u << ms0)) == 0) {
> +                break;
> +            }
> +        }
> +        assert (ms0 == 32);
> +
> +        if (ls1 == 32) {
> +            tcg_out_extr(s, ret, arg, 0, ls0, 0);
> +        } else {
> +            tcg_out_mov(s, ret, arg);
> +            tcg_out32(s, INSN_DEPI | INSN_R2(ret) | INSN_IM5(0)
> +                      | INSN_SHDEP_CP(31 - ls0) | INSN_DEP_LEN(ls1 - ls0));
> +        }
>      } else {
> -        tcg_out32(s, LDIL | INSN_R2(ret) |
> -                     reassemble_21(lrsel((uint32_t)arg, 0)));
> -        if (arg & 0x7ff)
> -            tcg_out32(s, LDO | INSN_R1(ret) | INSN_R2(ret) |
> -                         reassemble_14(rrsel((uint32_t)arg, 0)));
> +        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R1, m);
> +        tcg_out_arith(s, ret, arg, TCG_REG_R1, INSN_AND);

Same.

>      }
>  }
>  
> -static inline void tcg_out_ld_raw(TCGContext *s, int ret,
> -                                  tcg_target_long arg)
> +static inline void tcg_out_ext8s(TCGContext *s, int ret, int arg)
>  {
> -    tcg_out32(s, LDIL | INSN_R2(ret) |
> -                 reassemble_21(lrsel((uint32_t)arg, 0)));
> -    tcg_out32(s, LDW | INSN_R1(ret) | INSN_R2(ret) |
> -                 reassemble_14(rrsel((uint32_t)arg, 0)));
> +    tcg_out_extr(s, ret, arg, 0, 8, 1);
>  }
>  
> -static inline void tcg_out_ld_ptr(TCGContext *s, int ret,
> -                                  tcg_target_long arg)
> +static inline void tcg_out_ext16s(TCGContext *s, int ret, int arg)
>  {
> -    tcg_out_ld_raw(s, ret, arg);
> +    tcg_out_extr(s, ret, arg, 0, 16, 1);
>  }
>  
> -static inline void tcg_out_ldst(TCGContext *s, int ret, int addr, int offset,
> -                                int op)
> +static void tcg_out_shli(TCGContext *s, int ret, int arg, int count)
>  {
> -    if (offset == (offset & 0xfff))
> -        tcg_out32(s, op | INSN_R1(ret) | INSN_R2(addr) |
> -                 reassemble_14(offset));
> -    else {
> -        fprintf(stderr, "unimplemented %s with offset %d\n", __func__, offset);
> -        tcg_abort();
> -    }
> +    count &= 31;
> +    tcg_out32(s, INSN_ZDEP | INSN_R2(ret) | INSN_R1(arg)
> +              | INSN_SHDEP_CP(31 - count) | INSN_DEP_LEN(32 - count));
>  }
>  
> -static inline void tcg_out_ld(TCGContext *s, TCGType type, int ret,
> -                              int arg1, tcg_target_long arg2)
> +static void tcg_out_shl(TCGContext *s, int ret, int arg, int creg)
>  {
> -    fprintf(stderr, "unimplemented %s\n", __func__);
> -    tcg_abort();
> +    tcg_out_arithi(s, TCG_REG_R20, creg, 31, INSN_SUBI);
> +    tcg_out_mtctl_sar(s, TCG_REG_R20);
> +    tcg_out32(s, INSN_ZVDEP | INSN_R2(ret) | INSN_R1(arg) | INSN_DEP_LEN(32));
>  }
>  
> -static inline void tcg_out_st(TCGContext *s, TCGType type, int ret,
> -                              int arg1, tcg_target_long arg2)
> +static void tcg_out_shri(TCGContext *s, int ret, int arg, int count)
>  {
> -    fprintf(stderr, "unimplemented %s\n", __func__);
> -    tcg_abort();
> +    count &= 31;
> +    tcg_out_extr(s, ret, arg, count, 32 - count, 0);
>  }
>  
> -static inline void tcg_out_arith(TCGContext *s, int t, int r1, int r2, int op)
> +static void tcg_out_shr(TCGContext *s, int ret, int arg, int creg)
>  {
> -    tcg_out32(s, op | INSN_T(t) | INSN_R1(r1) | INSN_R2(r2));
> +    tcg_out_vshd(s, ret, TCG_REG_R0, arg, creg);
>  }
>  
> -static inline void tcg_out_arithi(TCGContext *s, int t, int r1,
> -                                  tcg_target_long val, int op)
> +static void tcg_out_sari(TCGContext *s, int ret, int arg, int count)
>  {
> -    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R20, val);
> -    tcg_out_arith(s, t, r1, TCG_REG_R20, op);
> +    count &= 31;
> +    tcg_out_extr(s, ret, arg, count, 32 - count, 1);
>  }
>  
> -static inline void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
> +static void tcg_out_sar(TCGContext *s, int ret, int arg, int creg)
>  {
> -    tcg_out_arithi(s, reg, reg, val, ARITH_ADD);
> +    tcg_out_arithi(s, TCG_REG_R20, creg, 31, INSN_SUBI);
> +    tcg_out_mtctl_sar(s, TCG_REG_R20);
> +    tcg_out32(s, INSN_VEXTRS | INSN_R1(ret) | INSN_R2(arg) | INSN_DEP_LEN(32));
>  }
>  
> -static inline void tcg_out_nop(TCGContext *s)
> +static void tcg_out_rotli(TCGContext *s, int ret, int arg, int count)
>  {
> -    tcg_out32(s, ARITH_OR | INSN_T(TCG_REG_R0) | INSN_R1(TCG_REG_R0) |
> -                 INSN_R2(TCG_REG_R0));
> +    count &= 31;
> +    tcg_out_shd(s, ret, arg, arg, 32 - count);
>  }
>  
> -static inline void tcg_out_ext8s(TCGContext *s, int ret, int arg) {
> -    tcg_out32(s, EXTRS | INSN_R1(ret) | INSN_R2(arg) |
> -                 INSN_SHDEP_P(31) | INSN_DEP_LEN(8));
> +static void tcg_out_rotl(TCGContext *s, int ret, int arg, int creg)
> +{
> +    tcg_out_arithi(s, TCG_REG_R20, creg, 32, INSN_SUBI);
> +    tcg_out_vshd(s, ret, arg, arg, TCG_REG_R20);
>  }
>  
> -static inline void tcg_out_ext16s(TCGContext *s, int ret, int arg) {
> -    tcg_out32(s, EXTRS | INSN_R1(ret) | INSN_R2(arg) |
> -                 INSN_SHDEP_P(31) | INSN_DEP_LEN(16));
> +static void tcg_out_rotri(TCGContext *s, int ret, int arg, int count)
> +{
> +    count &= 31;
> +    tcg_out_shd(s, ret, arg, arg, count);
>  }
>  
> -static inline void tcg_out_bswap16(TCGContext *s, int ret, int arg) {
> -    if(ret != arg)
> -        tcg_out_mov(s, ret, arg);
> -    tcg_out32(s, DEP | INSN_R2(ret) | INSN_R1(ret) |
> -                 INSN_SHDEP_CP(15) | INSN_DEP_LEN(8));
> -    tcg_out32(s, SHD | INSN_T(ret) | INSN_R1(TCG_REG_R0) |
> -                 INSN_R2(ret) | INSN_SHDEP_CP(8));
> +static void tcg_out_rotr(TCGContext *s, int ret, int arg, int creg)
> +{
> +    tcg_out_vshd(s, ret, arg, arg, creg);
>  }
>  
> -static inline void tcg_out_bswap32(TCGContext *s, int ret, int arg, int temp) {
> -    tcg_out32(s, SHD | INSN_T(temp) | INSN_R1(arg) |
> -                 INSN_R2(arg) | INSN_SHDEP_CP(16));
> -    tcg_out32(s, DEP | INSN_R2(temp) | INSN_R1(temp) |
> -                 INSN_SHDEP_CP(15) | INSN_DEP_LEN(8));
> -    tcg_out32(s, SHD | INSN_T(ret) | INSN_R1(arg) |
> -                 INSN_R2(temp) | INSN_SHDEP_CP(8));
> +static void tcg_out_bswap16(TCGContext *s, int ret, int arg, int sign)
> +{
> +    if (ret != arg) {
> +        tcg_out_mov(s, ret, arg);             /* arg =  xxAB */
> +    }
> +    tcg_out_dep(s, ret, ret, 16, 8);          /* ret =  xBAB */
> +    tcg_out_extr(s, ret, ret, 8, 16, sign);   /* ret =  ..BA */
>  }
>  
> -static inline void tcg_out_call(TCGContext *s, void *func)
> +static void tcg_out_bswap32(TCGContext *s, int ret, int arg, int temp)
>  {
> -    uint32_t val = (uint32_t)__canonicalize_funcptr_for_compare(func);
> -    tcg_out32(s, LDIL | INSN_R2(TCG_REG_R20) |
> -                 reassemble_21(lrsel(val, 0)));
> -    tcg_out32(s, BLE_SR4 | INSN_R2(TCG_REG_R20) |
> -                 reassemble_17(rrsel(val, 0) >> 2));
> -    tcg_out_mov(s, TCG_REG_RP, TCG_REG_R31);
> +                                          /* arg =  ABCD */
> +    tcg_out_rotri(s, temp, arg, 16);      /* temp = CDAB */
> +    tcg_out_dep(s, temp, temp, 16, 8);    /* temp = CBAB */
> +    tcg_out_shd(s, ret, arg, temp, 8);    /* ret =  DCBA */
>  }
>  
> -#if defined(CONFIG_SOFTMMU)
> +static void tcg_out_call(TCGContext *s, void *func)
> +{
> +    tcg_target_long val, hi, lo, disp;
> +
> +    val = (uint32_t)__canonicalize_funcptr_for_compare(func);
> +    disp = (val - ((tcg_target_long)s->code_ptr + 8)) >> 2;
> +
> +    if (check_fit_tl(disp, 17)) {
> +        tcg_out32(s, INSN_BL_N | INSN_R2(TCG_REG_RP) | reassemble_17(disp));
> +    } else {
> +        hi = val >> 11;
> +        lo = val & 0x7ff;
> +
> +        tcg_out32(s, INSN_LDIL | INSN_R2(TCG_REG_R20) | reassemble_21(hi));
> +        tcg_out32(s, INSN_BLE_SR4 | INSN_R2(TCG_REG_R20)
> +                  | reassemble_17(lo >> 2));
> +        tcg_out_mov(s, TCG_REG_RP, TCG_REG_R31);
> +    }
> +}
>  
> +static void tcg_out_xmpyu(TCGContext *s, int retl, int reth,
> +                          int arg1, int arg2)
> +{
> +    /* Store both words into the stack for copy to the FPU.  */
> +    tcg_out_ldst(s, arg1, TCG_REG_SP, STACK_TEMP_OFS, INSN_STW);
> +    tcg_out_ldst(s, arg2, TCG_REG_SP, STACK_TEMP_OFS + 4, INSN_STW);
> +
> +    /* Load both words into the FPU at the same time.  We get away
> +       with this because we can address the left and right half of the
> +       FPU registers individually once loaded.  */
> +    /* fldds stack_temp(sp),fr22 */
> +    tcg_out32(s, INSN_FLDDS | INSN_R2(TCG_REG_SP)
> +              | INSN_IM5(STACK_TEMP_OFS) | INSN_T(22));
> +
> +    /* xmpyu fr22r,fr22,fr22 */
> +    tcg_out32(s, 0x3ad64796);
> +
> +    /* Store the 64-bit result back into the stack.  */
> +    /* fstds stack_temp(sp),fr22 */
> +    tcg_out32(s, INSN_FSTDS | INSN_R2(TCG_REG_SP)
> +              | INSN_IM5(STACK_TEMP_OFS) | INSN_T(22));
> +
> +    /* Load the pieces of the result that the caller requested.  */
> +    if (reth) {
> +        tcg_out_ldst(s, reth, TCG_REG_SP, STACK_TEMP_OFS, INSN_LDW);
> +    }
> +    if (retl) {
> +        tcg_out_ldst(s, retl, TCG_REG_SP, STACK_TEMP_OFS + 4, INSN_LDW);
> +    }
> +}
> +
> +static void tcg_out_branch(TCGContext *s, int label_index, int nul)
> +{
> +    TCGLabel *l = &s->labels[label_index];
> +    uint32_t op = nul ? INSN_BL_N : INSN_BL;
> +
> +    if (l->has_value) {
> +        tcg_target_long val = l->u.value;
> +
> +        val -= (tcg_target_long)s->code_ptr + 8;
> +        val >>= 2;
> +        assert(check_fit_tl(val, 17));
> +
> +        tcg_out32(s, op | reassemble_17(val));
> +    } else {
> +        tcg_out_reloc(s, s->code_ptr, R_PARISC_PCREL17F, label_index, 0);
> +        tcg_out32(s, op);

This breaks partial retranslation. The bits corresponding to the offset
should be preserved.

> +    }
> +}
> +
> +static const uint8_t tcg_cond_to_cmp_cond[10] =
> +{
> +    [TCG_COND_EQ] = COND_EQ,
> +    [TCG_COND_NE] = COND_EQ | COND_FALSE,
> +    [TCG_COND_LT] = COND_LT,
> +    [TCG_COND_GE] = COND_LT | COND_FALSE,
> +    [TCG_COND_LE] = COND_LE,
> +    [TCG_COND_GT] = COND_LE | COND_FALSE,
> +    [TCG_COND_LTU] = COND_LTU,
> +    [TCG_COND_GEU] = COND_LTU | COND_FALSE,
> +    [TCG_COND_LEU] = COND_LEU,
> +    [TCG_COND_GTU] = COND_LEU | COND_FALSE,
> +};
> +
> +static void tcg_out_brcond(TCGContext *s, int cond, TCGArg c1,
> +                           TCGArg c2, int c2const, int label_index)
> +{
> +    TCGLabel *l = &s->labels[label_index];
> +    int op, pacond;
> +
> +    /* Note that COMIB operates as if the immediate is the first
> +       operand.  We model brcond with the immediate in the second
> +       to better match what targets are likely to give us.  For
> +       consistency, model COMB with reversed operands as well.  */
> +    pacond = tcg_cond_to_cmp_cond[tcg_swap_cond(cond)];
> +
> +    if (c2const) {
> +        op = (pacond & COND_FALSE ? INSN_COMIBF : INSN_COMIBT);
> +        op |= INSN_IM5(c2);
> +    } else {
> +        op = (pacond & COND_FALSE ? INSN_COMBF : INSN_COMBT);
> +        op |= INSN_R1(c2);
> +    }
> +    op |= INSN_R2(c1);
> +    op |= INSN_COND(pacond & 7);
> +
> +    if (l->has_value) {
> +        tcg_target_long val = l->u.value;
> +
> +        val -= (tcg_target_long)s->code_ptr + 8;
> +        val >>= 2;
> +        assert(check_fit_tl(val, 12));
> +
> +        /* ??? Assume that all branches to defined labels are backward.
> +           Which means that if the nul bit is set, the delay slot is
> +           executed if the branch is taken, and not executed in fallthru.  */
> +        tcg_out32(s, op | reassemble_12(val));
> +        tcg_out_nop(s);
> +    } else {
> +        tcg_out_reloc(s, s->code_ptr, R_PARISC_PCREL12F, label_index, 0);
> +        /* ??? Assume that all branches to undefined labels are forward.
> +           Which means that if the nul bit is set, the delay slot is
> +           not executed if the branch is taken, which is what we want.  */
> +        tcg_out32(s, op | 2);

Same problem about partial retranslation here.

> +    }
> +}
> +
> +static void tcg_out_comclr(TCGContext *s, int cond, TCGArg ret,
> +                           TCGArg c1, TCGArg c2, int c2const)
> +{
> +    int op, pacond;
> +
> +    /* Note that COMICLR operates as if the immediate is the first
> +       operand.  We model setcond with the immediate in the second
> +       to better match what targets are likely to give us.  For
> +       consistency, model COMCLR with reversed operands as well.  */
> +    pacond = tcg_cond_to_cmp_cond[tcg_swap_cond(cond)];
> +
> +    if (c2const) {
> +        op = INSN_COMICLR | INSN_R2(c1) | INSN_R1(ret) | INSN_IM11(c2);
> +    } else {
> +        op = INSN_COMCLR | INSN_R2(c1) | INSN_R1(c2) | INSN_T(ret);
> +    }
> +    op |= INSN_COND(pacond & 7);
> +    op |= pacond & COND_FALSE ? 1 << 12 : 0;
> +
> +    tcg_out32(s, op);
> +}
> +
> +static void tcg_out_brcond2(TCGContext *s, int cond, TCGArg al, TCGArg ah,
> +                            TCGArg bl, int blconst, TCGArg bh, int bhconst,
> +                            int label_index)
> +{
> +    switch (cond) {
> +    case TCG_COND_EQ:
> +    case TCG_COND_NE:
> +        tcg_out_comclr(s, tcg_invert_cond(cond), TCG_REG_R0, al, bl, blconst);
> +        tcg_out_brcond(s, cond, ah, bh, bhconst, label_index);
> +        break;
> +
> +    default:
> +        tcg_out_brcond(s, cond, ah, bh, bhconst, label_index);
> +        tcg_out_comclr(s, TCG_COND_NE, TCG_REG_R0, ah, bh, bhconst);
> +        tcg_out_brcond(s, tcg_unsigned_cond(cond),
> +                       al, bl, blconst, label_index);
> +        break;
> +    }
> +}
> +
> +static void tcg_out_setcond(TCGContext *s, int cond, TCGArg ret,
> +                            TCGArg c1, TCGArg c2, int c2const)
> +{
> +    tcg_out_comclr(s, tcg_invert_cond(cond), ret, c1, c2, c2const);
> +    tcg_out_movi(s, TCG_TYPE_I32, ret, 1);
> +}
> +
> +static void tcg_out_setcond2(TCGContext *s, int cond, TCGArg ret,
> +                             TCGArg al, TCGArg ah, TCGArg bl, int blconst,
> +                             TCGArg bh, int bhconst)
> +{
> +    int scratch = TCG_REG_R20;
> +
> +    if (ret != al && ret != ah
> +        && (blconst || ret != bl)
> +        && (bhconst || ret != bh)) {
> +        scratch = ret;
> +    }
> +
> +    switch (cond) {
> +    case TCG_COND_EQ:
> +    case TCG_COND_NE:
> +        tcg_out_setcond(s, cond, scratch, al, bl, blconst);
> +        tcg_out_comclr(s, TCG_COND_EQ, TCG_REG_R0, ah, bh, bhconst);
> +        tcg_out_movi(s, TCG_TYPE_I32, scratch, cond == TCG_COND_NE);
> +        break;
> +
> +    default:
> +        tcg_out_setcond(s, tcg_unsigned_cond(cond), scratch, al, bl, blconst);
> +        tcg_out_comclr(s, TCG_COND_EQ, TCG_REG_R0, ah, bh, bhconst);
> +        tcg_out_movi(s, TCG_TYPE_I32, scratch, 0);
> +        tcg_out_comclr(s, cond, TCG_REG_R0, ah, bh, bhconst);
> +        tcg_out_movi(s, TCG_TYPE_I32, scratch, 1);
> +        break;
> +    }
> +
> +    tcg_out_mov(s, ret, scratch);
> +}
> +
> +#if defined(CONFIG_SOFTMMU)
>  #include "../../softmmu_defs.h"
>  
>  static void *qemu_ld_helpers[4] = {
> @@ -363,30 +848,77 @@ static void *qemu_st_helpers[4] = {
>      __stl_mmu,
>      __stq_mmu,
>  };
> +
> +/* Load and compare a TLB entry, and branch if TLB miss.  OFFSET is set to
> +   the offset of the first ADDR_READ or ADDR_WRITE member of the appropriate
> +   TLB for the memory index.  The return value is the offset from ENV 
> +   contained in R1 afterward (to be used when loading ADDEND); if the
> +   return value is 0, R1 is not used.  */
> +
> +static int tcg_out_tlb_read(TCGContext *s, int r0, int r1, int addrlo,
> +                            int addrhi, int s_bits, int lab_miss, int offset)
> +{
> +    int ret;
> +
> +    /* Extracting the index into the TLB.  The "normal C operation" is
> +          r1 = addr_reg >> TARGET_PAGE_BITS;
> +          r1 &= CPU_TLB_SIZE - 1;
> +          r1 <<= CPU_TLB_ENTRY_BITS;
> +       What this does is extract CPU_TLB_BITS beginning at TARGET_PAGE_BITS
> +       and place them at CPU_TLB_ENTRY_BITS.  We can combine the first two
> +       operations with an EXTRU.  Unfortunately, the current value of
> +       CPU_TLB_ENTRY_BITS is > 3, so we can't merge that shift with the
> +       add that follows.  */
> +    tcg_out_extr(s, r1, addrlo, TARGET_PAGE_BITS, CPU_TLB_BITS, 0);
> +    tcg_out_andi(s, r0, addrlo, TARGET_PAGE_MASK | ((1 << s_bits) - 1));
> +    tcg_out_shli(s, r1, r1, CPU_TLB_ENTRY_BITS);
> +    tcg_out_arith(s, r1, r1, TCG_AREG0, INSN_ADDL);
> +
> +    /* Make sure that both the addr_{read,write} and addend can be
> +       read with a 14-bit offset from the same base register.  */
> +    if (check_fit_tl(offset + CPU_TLB_SIZE, 14)) {
> +        ret = 0;
> +    } else {
> +        ret = (offset + 0x400) & ~0x7ff;
> +        offset = ret - offset;
> +        tcg_out_addi2(s, TCG_REG_R1, r1, ret);
> +        r1 = TCG_REG_R1;
> +    }
> +
> +    /* Load the entry from the computed slot.  */
> +    if (TARGET_LONG_BITS == 64) {
> +        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R23, r1, offset);
> +        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20, r1, offset + 4);
> +    } else {
> +        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20, r1, offset);
> +    }
> +
> +    /* If not equal, jump to lab_miss. */
> +    if (TARGET_LONG_BITS == 64) {
> +        tcg_out_brcond2(s, TCG_COND_NE, TCG_REG_R20, TCG_REG_R23,
> +                        r0, 0, addrhi, 0, lab_miss);
> +    } else {
> +        tcg_out_brcond(s, TCG_COND_NE, TCG_REG_R20, r0, 0, lab_miss);
> +    }
> +
> +    return ret;
> +}
>  #endif
>  
>  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
>  {
> -    int addr_reg, data_reg, data_reg2, r0, r1, mem_index, s_bits, bswap;
> +    int addr_reg, addr_reg2;
> +    int data_reg, data_reg2;
> +    int r0, r1, mem_index, s_bits, bswap;
> +    tcg_target_long offset;
>  #if defined(CONFIG_SOFTMMU)
> -    uint32_t *label1_ptr, *label2_ptr;
> -#endif
> -#if TARGET_LONG_BITS == 64
> -#if defined(CONFIG_SOFTMMU)
> -    uint32_t *label3_ptr;
> -#endif
> -    int addr_reg2;
> +    int lab1, lab2, argreg;
>  #endif
>  
>      data_reg = *args++;
> -    if (opc == 3)
> -        data_reg2 = *args++;
> -    else
> -        data_reg2 = 0; /* suppress warning */
> +    data_reg2 = (opc == 3 ? *args++ : TCG_REG_R0);

I am not sure TCG_REG_R0 is really correct here, and I find it confusing.
While it's value is zero, the assignment there is just to make GCC
happy, it won't be used after

>      addr_reg = *args++;
> -#if TARGET_LONG_BITS == 64
> -    addr_reg2 = *args++;
> -#endif
> +    addr_reg2 = (TARGET_LONG_BITS == 64 ? *args++ : TCG_REG_R0);

Same here.

>      mem_index = *args;
>      s_bits = opc & 3;
>  
> @@ -394,96 +926,22 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
>      r1 = TCG_REG_R25;
>  
>  #if defined(CONFIG_SOFTMMU)
> -    tcg_out_mov(s, r1, addr_reg);
> +    lab1 = gen_new_label();
> +    lab2 = gen_new_label();

Do you really want to use label here? load/store are the most common
instructions, I am not really sure of the resulting performance.

> -    tcg_out_mov(s, r0, addr_reg);
> +    offset = tcg_out_tlb_read(s, r0, r1, addr_reg, addr_reg2, s_bits, lab1,
> +                              offsetof(CPUState,
> +                                       tlb_table[mem_index][0].addr_read));
>  
> -    tcg_out32(s, SHD | INSN_T(r1) | INSN_R1(TCG_REG_R0) | INSN_R2(r1) |
> -                 INSN_SHDEP_CP(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS));
> +    /* TLB Hit.  */
> +    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20, (offset ? TCG_REG_R1 : r1),
> +               offsetof(CPUState, tlb_table[mem_index][0].addend) - offset);
>  
> -    tcg_out_arithi(s, r0, r0, TARGET_PAGE_MASK | ((1 << s_bits) - 1),
> -                   ARITH_AND);
> -
> -    tcg_out_arithi(s, r1, r1, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS,
> -                   ARITH_AND);
> -
> -    tcg_out_arith(s, r1, r1, TCG_AREG0, ARITH_ADD);
> -    tcg_out_arithi(s, r1, r1,
> -                   offsetof(CPUState, tlb_table[mem_index][0].addr_read),
> -                   ARITH_ADD);
> -
> -    tcg_out_ldst(s, TCG_REG_R20, r1, 0, LDW);
> -
> -#if TARGET_LONG_BITS == 32
> -    /* if equal, jump to label1 */
> -    label1_ptr = (uint32_t *)s->code_ptr;
> -    tcg_out32(s, COMBT | INSN_R1(TCG_REG_R20) | INSN_R2(r0) |
> -                 INSN_COND(COND_EQUAL));
> -    tcg_out_mov(s, r0, addr_reg); /* delay slot */
> -#else
> -    /* if not equal, jump to label3 */
> -    label3_ptr = (uint32_t *)s->code_ptr;
> -    tcg_out32(s, COMBF | INSN_R1(TCG_REG_R20) | INSN_R2(r0) |
> -                 INSN_COND(COND_EQUAL));
> -    tcg_out_mov(s, r0, addr_reg); /* delay slot */
> -
> -    tcg_out_ldst(s, TCG_REG_R20, r1, 4, LDW);
> -
> -    /* if equal, jump to label1 */
> -    label1_ptr = (uint32_t *)s->code_ptr;
> -    tcg_out32(s, COMBT | INSN_R1(TCG_REG_R20) | INSN_R2(addr_reg2) |
> -                 INSN_COND(COND_EQUAL));
> -    tcg_out_nop(s); /* delay slot */
> -
> -    /* label3: */
> -    *label3_ptr |= reassemble_12((uint32_t *)s->code_ptr - label3_ptr - 2);
> -#endif
> -
> -#if TARGET_LONG_BITS == 32
> -    tcg_out_mov(s, TCG_REG_R26, addr_reg);
> -    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R25, mem_index);
> -#else
> -    tcg_out_mov(s, TCG_REG_R26, addr_reg);
> -    tcg_out_mov(s, TCG_REG_R25, addr_reg2);
> -    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R24, mem_index);
> -#endif
> -
> -    tcg_out_call(s, qemu_ld_helpers[s_bits]);
> -
> -    switch(opc) {
> -        case 0 | 4:
> -            tcg_out_ext8s(s, data_reg, TCG_REG_RET0);
> -            break;
> -        case 1 | 4:
> -            tcg_out_ext16s(s, data_reg, TCG_REG_RET0);
> -            break;
> -        case 0:
> -        case 1:
> -        case 2:
> -        default:
> -            tcg_out_mov(s, data_reg, TCG_REG_RET0);
> -            break;
> -        case 3:
> -            tcg_abort();
> -            tcg_out_mov(s, data_reg, TCG_REG_RET0);
> -            tcg_out_mov(s, data_reg2, TCG_REG_RET1);
> -            break;
> -    }
> -
> -    /* jump to label2 */
> -    label2_ptr = (uint32_t *)s->code_ptr;
> -    tcg_out32(s, BL | INSN_R2(TCG_REG_R0) | 2);
> -
> -    /* label1: */
> -    *label1_ptr |= reassemble_12((uint32_t *)s->code_ptr - label1_ptr - 2);
> -
> -    tcg_out_arithi(s, TCG_REG_R20, r1,
> -                   offsetof(CPUTLBEntry, addend) - offsetof(CPUTLBEntry, addr_read),
> -                   ARITH_ADD);
> -    tcg_out_ldst(s, TCG_REG_R20, TCG_REG_R20, 0, LDW);
> -    tcg_out_arith(s, r0, r0, TCG_REG_R20, ARITH_ADD);
> +    tcg_out_arith(s, r0, addr_reg, TCG_REG_R20, INSN_ADDL);
> +    offset = TCG_REG_R0;
>  #else
>      r0 = addr_reg;
> +    offset = GUEST_BASE ? TCG_GUEST_BASE_REG : TCG_REG_R0;
>  #endif
>  
>  #ifdef TARGET_WORDS_BIGENDIAN
> @@ -492,190 +950,151 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
>      bswap = 1;
>  #endif
>      switch (opc) {
> -        case 0:
> -            tcg_out_ldst(s, data_reg, r0, 0, LDB);
> -            break;
> -        case 0 | 4:
> -            tcg_out_ldst(s, data_reg, r0, 0, LDB);
> -            tcg_out_ext8s(s, data_reg, data_reg);
> -            break;
> -        case 1:
> -            tcg_out_ldst(s, data_reg, r0, 0, LDH);
> -            if (bswap)
> -                tcg_out_bswap16(s, data_reg, data_reg);
> -            break;
> -        case 1 | 4:
> -            tcg_out_ldst(s, data_reg, r0, 0, LDH);
> -            if (bswap)
> -                tcg_out_bswap16(s, data_reg, data_reg);
> +    case 0:
> +        tcg_out_ldst_index(s, data_reg, r0, offset, INSN_LDBX);
> +        break;
> +    case 0 | 4:
> +        tcg_out_ldst_index(s, data_reg, r0, offset, INSN_LDBX);
> +        tcg_out_ext8s(s, data_reg, data_reg);
> +        break;
> +    case 1:
> +        tcg_out_ldst_index(s, data_reg, r0, offset, INSN_LDHX);
> +        if (bswap) {
> +            tcg_out_bswap16(s, data_reg, data_reg, 0);
> +        }
> +        break;
> +    case 1 | 4:
> +        tcg_out_ldst_index(s, data_reg, r0, offset, INSN_LDHX);
> +        if (bswap) {
> +            tcg_out_bswap16(s, data_reg, data_reg, 1);
> +        } else {
>              tcg_out_ext16s(s, data_reg, data_reg);
> -            break;
> -        case 2:
> -            tcg_out_ldst(s, data_reg, r0, 0, LDW);
> -            if (bswap)
> -                tcg_out_bswap32(s, data_reg, data_reg, TCG_REG_R20);
> -            break;
> -        case 3:
> -            tcg_abort();
> -            if (!bswap) {
> -                tcg_out_ldst(s, data_reg, r0, 0, LDW);
> -                tcg_out_ldst(s, data_reg2, r0, 4, LDW);
> +        }
> +        break;
> +    case 2:
> +        tcg_out_ldst_index(s, data_reg, r0, offset, INSN_LDWX);
> +        if (bswap) {
> +            tcg_out_bswap32(s, data_reg, data_reg, TCG_REG_R20);
> +        }
> +        break;
> +    case 3:
> +        if (bswap) {
> +            int t = data_reg2;
> +            data_reg2 = data_reg;
> +            data_reg = t;
> +        }
> +        if (offset == TCG_REG_R0) {
> +            /* Make sure not to clobber the base register.  */
> +            if (data_reg2 == r0) {
> +                tcg_out_ldst(s, data_reg, r0, 4, INSN_LDW);
> +                tcg_out_ldst(s, data_reg2, r0, 0, INSN_LDW);
>              } else {
> -                tcg_out_ldst(s, data_reg, r0, 4, LDW);
> -                tcg_out_bswap32(s, data_reg, data_reg, TCG_REG_R20);
> -                tcg_out_ldst(s, data_reg2, r0, 0, LDW);
> -                tcg_out_bswap32(s, data_reg2, data_reg2, TCG_REG_R20);
> +                tcg_out_ldst(s, data_reg2, r0, 0, INSN_LDW);
> +                tcg_out_ldst(s, data_reg, r0, 4, INSN_LDW);
>              }
> -            break;
> -        default:
> -            tcg_abort();
> +        } else {
> +            tcg_out_addi2(s, TCG_REG_R20, r0, 4);
> +            tcg_out_ldst_index(s, data_reg2, r0, offset, INSN_LDWX);
> +            tcg_out_ldst_index(s, data_reg, TCG_REG_R20, offset, INSN_LDWX);
> +        }
> +        if (bswap) {
> +            tcg_out_bswap32(s, data_reg, data_reg, TCG_REG_R20);
> +            tcg_out_bswap32(s, data_reg2, data_reg2, TCG_REG_R20);
> +        }
> +        break;
> +    default:
> +        tcg_abort();
>      }
>  
>  #if defined(CONFIG_SOFTMMU)
> +    tcg_out_branch(s, lab2, 1);
> +
> +    /* TLB Miss.  */
> +    /* label1: */
> +    tcg_out_label(s, lab1, (tcg_target_long)s->code_ptr);
> +
> +    argreg = TCG_REG_R26;
> +    tcg_out_mov(s, argreg--, addr_reg);
> +    if (TARGET_LONG_BITS == 64) {
> +        tcg_out_mov(s, argreg--, addr_reg2);
> +    }
> +    tcg_out_movi(s, TCG_TYPE_I32, argreg, mem_index);
> +
> +    tcg_out_call(s, qemu_ld_helpers[s_bits]);
> +
> +    switch (opc) {
> +    case 0:
> +        tcg_out_andi(s, data_reg, TCG_REG_RET0, 0xff);
> +        break;
> +    case 0 | 4:
> +        tcg_out_ext8s(s, data_reg, TCG_REG_RET0);
> +        break;
> +    case 1:
> +        tcg_out_andi(s, data_reg, TCG_REG_RET0, 0xffff);
> +        break;
> +    case 1 | 4:
> +        tcg_out_ext16s(s, data_reg, TCG_REG_RET0);
> +        break;
> +    case 2:
> +    case 2 | 4:
> +        tcg_out_mov(s, data_reg, TCG_REG_RET0);
> +        break;
> +    case 3:
> +        tcg_out_mov(s, data_reg, TCG_REG_RET0);
> +        tcg_out_mov(s, data_reg2, TCG_REG_RET1);
> +        break;
> +    default:
> +        tcg_abort();
> +    }
> +
>      /* label2: */
> -    *label2_ptr |= reassemble_17((uint32_t *)s->code_ptr - label2_ptr - 2);
> +    tcg_out_label(s, lab2, (tcg_target_long)s->code_ptr);
>  #endif
>  }
>  
>  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
>  {
> -    int addr_reg, data_reg, data_reg2, r0, r1, mem_index, s_bits, bswap;
> -#if defined(CONFIG_SOFTMMU)
> -    uint32_t *label1_ptr, *label2_ptr;
> -#endif
> -#if TARGET_LONG_BITS == 64
> +    int addr_reg, addr_reg2;
> +    int data_reg, data_reg2;
> +    int r0, r1, mem_index, s_bits, bswap;
>  #if defined(CONFIG_SOFTMMU)
> -    uint32_t *label3_ptr;
> -#endif
> -    int addr_reg2;
> +    tcg_target_long offset;
> +    int lab1, lab2, argreg;
>  #endif
>  
>      data_reg = *args++;
> -    if (opc == 3)
> -        data_reg2 = *args++;
> -    else
> -        data_reg2 = 0; /* suppress warning */
> +    data_reg2 = (opc == 3 ? *args++ : 0);
>      addr_reg = *args++;
> -#if TARGET_LONG_BITS == 64
> -    addr_reg2 = *args++;
> -#endif
> +    addr_reg2 = (TARGET_LONG_BITS == 64 ? *args++ : 0);

Here it makes more sense ;-)

>      mem_index = *args;
> -
>      s_bits = opc;
>  
>      r0 = TCG_REG_R26;
>      r1 = TCG_REG_R25;
>  
>  #if defined(CONFIG_SOFTMMU)
> -    tcg_out_mov(s, r1, addr_reg);
> -
> -    tcg_out_mov(s, r0, addr_reg);
> -
> -    tcg_out32(s, SHD | INSN_T(r1) | INSN_R1(TCG_REG_R0) | INSN_R2(r1) |
> -                 INSN_SHDEP_CP(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS));
> -
> -    tcg_out_arithi(s, r0, r0, TARGET_PAGE_MASK | ((1 << s_bits) - 1),
> -                   ARITH_AND);
> -
> -    tcg_out_arithi(s, r1, r1, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS,
> -                   ARITH_AND);
> +    lab1 = gen_new_label();
> +    lab2 = gen_new_label();

Same here.

> -    tcg_out_arith(s, r1, r1, TCG_AREG0, ARITH_ADD);
> -    tcg_out_arithi(s, r1, r1,
> -                   offsetof(CPUState, tlb_table[mem_index][0].addr_write),
> -                   ARITH_ADD);
> +    offset = tcg_out_tlb_read(s, r0, r1, addr_reg, addr_reg2, s_bits, lab1,
> +                              offsetof(CPUState,
> +                                       tlb_table[mem_index][0].addr_write));
>  
> -    tcg_out_ldst(s, TCG_REG_R20, r1, 0, LDW);
> +    /* TLB Hit.  */
> +    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20, (offset ? TCG_REG_R1 : r1),
> +               offsetof(CPUState, tlb_table[mem_index][0].addend) - offset);
>  
> -#if TARGET_LONG_BITS == 32
> -    /* if equal, jump to label1 */
> -    label1_ptr = (uint32_t *)s->code_ptr;
> -    tcg_out32(s, COMBT | INSN_R1(TCG_REG_R20) | INSN_R2(r0) |
> -                 INSN_COND(COND_EQUAL));
> -    tcg_out_mov(s, r0, addr_reg); /* delay slot */
> +    tcg_out_arith(s, r0, addr_reg, TCG_REG_R20, INSN_ADDL);
>  #else
> -    /* if not equal, jump to label3 */
> -    label3_ptr = (uint32_t *)s->code_ptr;
> -    tcg_out32(s, COMBF | INSN_R1(TCG_REG_R20) | INSN_R2(r0) |
> -                 INSN_COND(COND_EQUAL));
> -    tcg_out_mov(s, r0, addr_reg); /* delay slot */
> -
> -    tcg_out_ldst(s, TCG_REG_R20, r1, 4, LDW);
> -
> -    /* if equal, jump to label1 */
> -    label1_ptr = (uint32_t *)s->code_ptr;
> -    tcg_out32(s, COMBT | INSN_R1(TCG_REG_R20) | INSN_R2(addr_reg2) |
> -                 INSN_COND(COND_EQUAL));
> -    tcg_out_nop(s); /* delay slot */
> -
> -    /* label3: */
> -    *label3_ptr |= reassemble_12((uint32_t *)s->code_ptr - label3_ptr - 2);
> -#endif
> -
> -    tcg_out_mov(s, TCG_REG_R26, addr_reg);
> -#if TARGET_LONG_BITS == 64
> -    tcg_out_mov(s, TCG_REG_R25, addr_reg2);
> -    if (opc == 3) {
> -        tcg_abort();
> -        tcg_out_mov(s, TCG_REG_R24, data_reg);
> -        tcg_out_mov(s, TCG_REG_R23, data_reg2);
> -        /* TODO: push mem_index */
> -        tcg_abort();
> +    /* There are no indexed stores, so if GUEST_BASE is set
> +       we must do the add explicitly.  Careful to avoid R20,
> +       which is used for the bswaps to follow.  */
> +    if (GUEST_BASE == 0) {
> +        r0 = addr_reg;
>      } else {
> -        switch(opc) {
> -        case 0:
> -            tcg_out32(s, EXTRU | INSN_R1(TCG_REG_R24) | INSN_R2(data_reg) |
> -                         INSN_SHDEP_P(31) | INSN_DEP_LEN(8));
> -            break;
> -        case 1:
> -            tcg_out32(s, EXTRU | INSN_R1(TCG_REG_R24) | INSN_R2(data_reg) |
> -                         INSN_SHDEP_P(31) | INSN_DEP_LEN(16));
> -            break;
> -        case 2:
> -            tcg_out_mov(s, TCG_REG_R24, data_reg);
> -            break;
> -        }
> -        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R23, mem_index);
> +        tcg_out_arith(s, TCG_REG_R31, addr_reg, TCG_GUEST_BASE_REG, INSN_ADDL);
> +        r0 = TCG_REG_R31;
>      }
> -#else
> -    if (opc == 3) {
> -        tcg_abort();
> -        tcg_out_mov(s, TCG_REG_R25, data_reg);
> -        tcg_out_mov(s, TCG_REG_R24, data_reg2);
> -        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R23, mem_index);
> -    } else {
> -        switch(opc) {
> -        case 0:
> -            tcg_out32(s, EXTRU | INSN_R1(TCG_REG_R25) | INSN_R2(data_reg) |
> -                         INSN_SHDEP_P(31) | INSN_DEP_LEN(8));
> -            break;
> -        case 1:
> -            tcg_out32(s, EXTRU | INSN_R1(TCG_REG_R25) | INSN_R2(data_reg) |
> -                         INSN_SHDEP_P(31) | INSN_DEP_LEN(16));
> -            break;
> -        case 2:
> -            tcg_out_mov(s, TCG_REG_R25, data_reg);
> -            break;
> -        }
> -        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R24, mem_index);
> -    }
> -#endif
> -    tcg_out_call(s, qemu_st_helpers[s_bits]);
> -
> -    /* jump to label2 */
> -    label2_ptr = (uint32_t *)s->code_ptr;
> -    tcg_out32(s, BL | INSN_R2(TCG_REG_R0) | 2);
> -
> -    /* label1: */
> -    *label1_ptr |= reassemble_12((uint32_t *)s->code_ptr - label1_ptr - 2);
> -
> -    tcg_out_arithi(s, TCG_REG_R20, r1,
> -                   offsetof(CPUTLBEntry, addend) - offsetof(CPUTLBEntry, addr_write),
> -                   ARITH_ADD);
> -    tcg_out_ldst(s, TCG_REG_R20, TCG_REG_R20, 0, LDW);
> -    tcg_out_arith(s, r0, r0, TCG_REG_R20, ARITH_ADD);
> -#else
> -    r0 = addr_reg;
>  #endif
>  
>  #ifdef TARGET_WORDS_BIGENDIAN
> @@ -685,170 +1104,345 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
>  #endif
>      switch (opc) {
>      case 0:
> -        tcg_out_ldst(s, data_reg, r0, 0, STB);
> +        tcg_out_ldst(s, data_reg, r0, 0, INSN_STB);
>          break;
>      case 1:
>          if (bswap) {
> -            tcg_out_bswap16(s, TCG_REG_R20, data_reg);
> +            tcg_out_bswap16(s, TCG_REG_R20, data_reg, 0);
>              data_reg = TCG_REG_R20;
>          }
> -        tcg_out_ldst(s, data_reg, r0, 0, STH);
> +        tcg_out_ldst(s, data_reg, r0, 0, INSN_STH);
>          break;
>      case 2:
>          if (bswap) {
>              tcg_out_bswap32(s, TCG_REG_R20, data_reg, TCG_REG_R20);
>              data_reg = TCG_REG_R20;
>          }
> -        tcg_out_ldst(s, data_reg, r0, 0, STW);
> +        tcg_out_ldst(s, data_reg, r0, 0, INSN_STW);
>          break;
>      case 3:
> -        tcg_abort();
> -        if (!bswap) {
> -            tcg_out_ldst(s, data_reg, r0, 0, STW);
> -            tcg_out_ldst(s, data_reg2, r0, 4, STW);
> -        } else {
> +        if (bswap) {
>              tcg_out_bswap32(s, TCG_REG_R20, data_reg, TCG_REG_R20);
> -            tcg_out_ldst(s, TCG_REG_R20, r0, 4, STW);
> -            tcg_out_bswap32(s, TCG_REG_R20, data_reg2, TCG_REG_R20);
> -            tcg_out_ldst(s, TCG_REG_R20, r0, 0, STW);
> +            tcg_out_bswap32(s, TCG_REG_R23, data_reg2, TCG_REG_R23);
> +            data_reg2 = TCG_REG_R20;
> +            data_reg = TCG_REG_R23;
>          }
> +        tcg_out_ldst(s, data_reg2, r0, 0, INSN_STW);
> +        tcg_out_ldst(s, data_reg, r0, 4, INSN_STW);
>          break;
>      default:
>          tcg_abort();
>      }
>  
>  #if defined(CONFIG_SOFTMMU)
> +    tcg_out_branch(s, lab2, 1);
> +
> +    /* TLB Miss.  */
> +    /* label1: */
> +    tcg_out_label(s, lab1, (tcg_target_long)s->code_ptr);
> +
> +    argreg = TCG_REG_R26;
> +    tcg_out_mov(s, argreg--, addr_reg);
> +    if (TARGET_LONG_BITS == 64) {
> +        tcg_out_mov(s, argreg--, addr_reg2);
> +    }
> +
> +    switch(opc) {
> +    case 0:
> +        tcg_out_andi(s, argreg--, data_reg, 0xff);
> +        tcg_out_movi(s, TCG_TYPE_I32, argreg, mem_index);
> +        break;
> +    case 1:
> +        tcg_out_andi(s, argreg--, data_reg, 0xffff);
> +        tcg_out_movi(s, TCG_TYPE_I32, argreg, mem_index);
> +        break;
> +    case 2:
> +        tcg_out_mov(s, argreg--, data_reg);
> +        tcg_out_movi(s, TCG_TYPE_I32, argreg, mem_index);
> +        break;
> +    case 3:
> +        /* Because of the alignment required by the 64-bit data argument,
> +           we will always use R23/R24.  Also, we will always run out of
> +           argument registers for storing mem_index, so that will have 
> +           to go on the stack.  */
> +        if (mem_index == 0) {
> +            argreg = TCG_REG_R0;
> +        } else {
> +            argreg = TCG_REG_R20;
> +            tcg_out_movi(s, TCG_TYPE_I32, argreg, mem_index);
> +        }
> +        tcg_out_mov(s, TCG_REG_R23, data_reg2);
> +        tcg_out_mov(s, TCG_REG_R24, data_reg);
> +        tcg_out_st(s, TCG_TYPE_I32, argreg, TCG_REG_SP,
> +                   TCG_TARGET_CALL_STACK_OFFSET - 4);
> +        break;
> +    default:
> +        tcg_abort();
> +    }
> +
> +    tcg_out_call(s, qemu_st_helpers[s_bits]);
> +
>      /* label2: */
> -    *label2_ptr |= reassemble_17((uint32_t *)s->code_ptr - label2_ptr - 2);
> +    tcg_out_label(s, lab2, (tcg_target_long)s->code_ptr);
>  #endif
>  }
>  
> +static void tcg_out_exit_tb(TCGContext *s, TCGArg arg)
> +{
> +    if (!check_fit_tl(arg, 14)) {
> +        uint32_t hi, lo;
> +        hi = arg & ~0x7ff;
> +        lo = arg & 0x7ff;
> +        if (lo) {
> +            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RET0, hi);
> +            tcg_out32(s, INSN_BV | INSN_R2(TCG_REG_R18));
> +            tcg_out_addi(s, TCG_REG_RET0, lo);
> +            return;
> +        }
> +        arg = hi;
> +    }
> +    tcg_out32(s, INSN_BV | INSN_R2(TCG_REG_R18));
> +    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RET0, arg);
> +}
> +
> +static void tcg_out_goto_tb(TCGContext *s, TCGArg arg)
> +{
> +    if (s->tb_jmp_offset) {
> +        /* direct jump method */
> +        fprintf(stderr, "goto_tb direct\n");
> +        tcg_abort();
> +    } else {
> +        /* indirect jump method */
> +        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20, TCG_REG_R0,
> +                   (tcg_target_long)(s->tb_next + arg));
> +        tcg_out32(s, INSN_BV_N | INSN_R2(TCG_REG_R20));
> +    }
> +    s->tb_next_offset[arg] = s->code_ptr - s->code_buf;
> +}
> +
>  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
>                                const int *const_args)
>  {
> -    int c;
> -
>      switch (opc) {
>      case INDEX_op_exit_tb:
> -        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RET0, args[0]);
> -        tcg_out32(s, BV_N | INSN_R2(TCG_REG_R18));
> +        tcg_out_exit_tb(s, args[0]);
>          break;
>      case INDEX_op_goto_tb:
> -        if (s->tb_jmp_offset) {
> -            /* direct jump method */
> -            fprintf(stderr, "goto_tb direct\n");
> -            tcg_abort();
> -            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R20, args[0]);
> -            tcg_out32(s, BV_N | INSN_R2(TCG_REG_R20));
> -            s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf;
> -        } else {
> -            /* indirect jump method */
> -            tcg_out_ld_ptr(s, TCG_REG_R20,
> -                           (tcg_target_long)(s->tb_next + args[0]));
> -            tcg_out32(s, BV_N | INSN_R2(TCG_REG_R20));
> -        }
> -        s->tb_next_offset[args[0]] = s->code_ptr - s->code_buf;
> +        tcg_out_goto_tb(s, args[0]);
>          break;
> +
>      case INDEX_op_call:
> -        tcg_out32(s, BLE_SR4 | INSN_R2(args[0]));
> -        tcg_out_mov(s, TCG_REG_RP, TCG_REG_R31);
> +        if (const_args[0]) {
> +            tcg_out_call(s, (void *)args[0]);
> +        } else {
> +            tcg_out32(s, INSN_BLE_SR4 | INSN_R2(args[0]));
> +            tcg_out_mov(s, TCG_REG_RP, TCG_REG_R31);
> +        }
>          break;
> +
>      case INDEX_op_jmp:
>          fprintf(stderr, "unimplemented jmp\n");
>          tcg_abort();
>          break;
> +
>      case INDEX_op_br:
> -        fprintf(stderr, "unimplemented br\n");
> -        tcg_abort();
> +        tcg_out_branch(s, args[0], 1);
>          break;
> +
>      case INDEX_op_movi_i32:
>          tcg_out_movi(s, TCG_TYPE_I32, args[0], (uint32_t)args[1]);
>          break;
>  
>      case INDEX_op_ld8u_i32:
> -        tcg_out_ldst(s, args[0], args[1], args[2], LDB);
> +        tcg_out_ldst(s, args[0], args[1], args[2], INSN_LDB);
>          break;
>      case INDEX_op_ld8s_i32:
> -        tcg_out_ldst(s, args[0], args[1], args[2], LDB);
> +        tcg_out_ldst(s, args[0], args[1], args[2], INSN_LDB);
>          tcg_out_ext8s(s, args[0], args[0]);
>          break;
>      case INDEX_op_ld16u_i32:
> -        tcg_out_ldst(s, args[0], args[1], args[2], LDH);
> +        tcg_out_ldst(s, args[0], args[1], args[2], INSN_LDH);
>          break;
>      case INDEX_op_ld16s_i32:
> -        tcg_out_ldst(s, args[0], args[1], args[2], LDH);
> +        tcg_out_ldst(s, args[0], args[1], args[2], INSN_LDH);
>          tcg_out_ext16s(s, args[0], args[0]);
>          break;
>      case INDEX_op_ld_i32:
> -        tcg_out_ldst(s, args[0], args[1], args[2], LDW);
> +        tcg_out_ldst(s, args[0], args[1], args[2], INSN_LDW);
>          break;
>  
>      case INDEX_op_st8_i32:
> -        tcg_out_ldst(s, args[0], args[1], args[2], STB);
> +        tcg_out_ldst(s, args[0], args[1], args[2], INSN_STB);
>          break;
>      case INDEX_op_st16_i32:
> -        tcg_out_ldst(s, args[0], args[1], args[2], STH);
> +        tcg_out_ldst(s, args[0], args[1], args[2], INSN_STH);
>          break;
>      case INDEX_op_st_i32:
> -        tcg_out_ldst(s, args[0], args[1], args[2], STW);
> +        tcg_out_ldst(s, args[0], args[1], args[2], INSN_STW);
> +        break;
> +
> +    case INDEX_op_add_i32:
> +        if (const_args[2]) {
> +            tcg_out_addi2(s, args[0], args[1], args[2]);
> +        } else {
> +            tcg_out_arith(s, args[0], args[1], args[2], INSN_ADDL);
> +        }
>          break;
>  
>      case INDEX_op_sub_i32:
> -        c = ARITH_SUB;
> -        goto gen_arith;
> +        if (const_args[1]) {
> +            if (const_args[2]) {
> +                tcg_out_movi(s, TCG_TYPE_I32, args[0], args[1] - args[2]);
> +            } else {
> +                /* Recall that SUBI is a reversed subtract.  */
> +                tcg_out_arithi(s, args[0], args[2], args[1], INSN_SUBI);
> +            }
> +        } else if (const_args[2]) {
> +            tcg_out_addi2(s, args[0], args[1], -args[2]);
> +        } else {
> +            tcg_out_arith(s, args[0], args[1], args[2], INSN_SUB);
> +        }
> +        break;
> +
>      case INDEX_op_and_i32:
> -        c = ARITH_AND;
> -        goto gen_arith;
> +        if (const_args[2]) {
> +            tcg_out_andi(s, args[0], args[1], args[2]);
> +        } else {
> +            tcg_out_arith(s, args[0], args[1], args[2], INSN_AND);
> +        }
> +        break;
> +
>      case INDEX_op_or_i32:
> -        c = ARITH_OR;
> -        goto gen_arith;
> +        if (const_args[2]) {
> +            tcg_out_ori(s, args[0], args[1], args[2]);
> +        } else {
> +            tcg_out_arith(s, args[0], args[1], args[2], INSN_OR);
> +        }
> +        break;
> +
>      case INDEX_op_xor_i32:
> -        c = ARITH_XOR;
> -        goto gen_arith;
> -    case INDEX_op_add_i32:
> -        c = ARITH_ADD;
> -        goto gen_arith;
> +        tcg_out_arith(s, args[0], args[1], args[2], INSN_XOR);
> +        break;
> +
> +    case INDEX_op_andc_i32:
> +        if (const_args[2]) {
> +            tcg_out_andi(s, args[0], args[1], ~args[2]);
> +        } else {
> +            tcg_out_arith(s, args[0], args[1], args[2], INSN_ANDCM);
> +        }
> +        break;
>  
>      case INDEX_op_shl_i32:
> -        tcg_out32(s, SUBI | INSN_R1(TCG_REG_R20) | INSN_R2(args[2]) |
> -                     lowsignext(0x1f, 0, 11));
> -        tcg_out32(s, MTCTL | INSN_R2(11) | INSN_R1(TCG_REG_R20));
> -        tcg_out32(s, ZVDEP | INSN_R2(args[0]) | INSN_R1(args[1]) |
> -                     INSN_DEP_LEN(32));
> +        if (const_args[2]) {
> +            tcg_out_shli(s, args[0], args[1], args[2]);
> +        } else {
> +            tcg_out_shl(s, args[0], args[1], args[2]);
> +        }
>          break;
> +
>      case INDEX_op_shr_i32:
> -        tcg_out32(s, MTCTL | INSN_R2(11) | INSN_R1(args[2]));
> -        tcg_out32(s, VSHD | INSN_T(args[0]) | INSN_R1(TCG_REG_R0) |
> -                     INSN_R2(args[1]));
> +        if (const_args[2]) {
> +            tcg_out_shri(s, args[0], args[1], args[2]);
> +        } else {
> +            tcg_out_shr(s, args[0], args[1], args[2]);
> +        }
>          break;
> +
>      case INDEX_op_sar_i32:
> -        tcg_out32(s, SUBI | INSN_R1(TCG_REG_R20) | INSN_R2(args[2]) |
> -                     lowsignext(0x1f, 0, 11));
> -        tcg_out32(s, MTCTL | INSN_R2(11) | INSN_R1(TCG_REG_R20));
> -        tcg_out32(s, VEXTRS | INSN_R1(args[0]) | INSN_R2(args[1]) |
> -                     INSN_DEP_LEN(32));
> +        if (const_args[2]) {
> +            tcg_out_sari(s, args[0], args[1], args[2]);
> +        } else {
> +            tcg_out_sar(s, args[0], args[1], args[2]);
> +        }
> +        break;
> +
> +    case INDEX_op_rotl_i32:
> +        if (const_args[2]) {
> +            tcg_out_rotli(s, args[0], args[1], args[2]);
> +        } else {
> +            tcg_out_rotl(s, args[0], args[1], args[2]);
> +        }
> +        break;
> +
> +    case INDEX_op_rotr_i32:
> +        if (const_args[2]) {
> +            tcg_out_rotri(s, args[0], args[1], args[2]);
> +        } else {
> +            tcg_out_rotr(s, args[0], args[1], args[2]);
> +        }
>          break;
>  
>      case INDEX_op_mul_i32:
> -        fprintf(stderr, "unimplemented mul\n");
> -        tcg_abort();
> +        tcg_out_xmpyu(s, args[0], TCG_REG_R0, args[1], args[2]);
>          break;
>      case INDEX_op_mulu2_i32:
> -        fprintf(stderr, "unimplemented mulu2\n");
> -        tcg_abort();
> +        tcg_out_xmpyu(s, args[0], args[1], args[2], args[3]);
>          break;
> -    case INDEX_op_div2_i32:
> -        fprintf(stderr, "unimplemented div2\n");
> -        tcg_abort();
> +
> +    case INDEX_op_bswap16_i32:
> +        tcg_out_bswap16(s, args[0], args[1], 0);
>          break;
> -    case INDEX_op_divu2_i32:
> -        fprintf(stderr, "unimplemented divu2\n");
> -        tcg_abort();
> +    case INDEX_op_bswap32_i32:
> +        tcg_out_bswap32(s, args[0], args[1], TCG_REG_R20);
> +        break;
> +
> +    case INDEX_op_not_i32:
> +        tcg_out_arithi(s, args[0], args[1], -1, INSN_SUBI);
> +        break;
> +    case INDEX_op_ext8s_i32:
> +        tcg_out_ext8s(s, args[0], args[1]);
> +        break;
> +    case INDEX_op_ext16s_i32:
> +        tcg_out_ext16s(s, args[0], args[1]);
> +        break;
> +
> +    /* These three correspond exactly to the fallback implementation.
> +       But by including them we reduce the number of TCG ops that 
> +       need to be generated, and these opcodes are fairly common.  */

Are you sure it really makes a difference?

> +    case INDEX_op_neg_i32:
> +        tcg_out_arith(s, args[0], TCG_REG_R0, args[1], INSN_SUB);
> +        break;
> +    case INDEX_op_ext8u_i32:
> +        tcg_out_andi(s, args[0], args[1], 0xff);
> +        break;
> +    case INDEX_op_ext16u_i32:
> +        tcg_out_andi(s, args[0], args[1], 0xffff);
>          break;
>  
>      case INDEX_op_brcond_i32:
> -        fprintf(stderr, "unimplemented brcond\n");
> -        tcg_abort();
> +        tcg_out_brcond(s, args[2], args[0], args[1], const_args[1], args[3]);
> +        break;
> +    case INDEX_op_brcond2_i32:
> +        tcg_out_brcond2(s, args[4], args[0], args[1],
> +                        args[2], const_args[2],
> +                        args[3], const_args[3], args[5]);
> +        break;
> +
> +    case INDEX_op_setcond_i32:
> +        tcg_out_setcond(s, args[3], args[0], args[1], args[2], const_args[2]);
> +        break;
> +    case INDEX_op_setcond2_i32:
> +        tcg_out_setcond2(s, args[5], args[0], args[1], args[2],
> +                         args[3], const_args[3], args[4], const_args[4]);
> +        break;
> +
> +    case INDEX_op_add2_i32:
> +        if (const_args[4]) {
> +            tcg_out_arithi(s, args[0], args[2], args[4], INSN_ADDI);
> +        } else {
> +            tcg_out_arith(s, args[0], args[2], args[4], INSN_ADD);
> +        }
> +        tcg_out_arith(s, args[1], args[3], args[5], INSN_ADDC);
> +        break;
> +
> +    case INDEX_op_sub2_i32:
> +        if (const_args[2]) {
> +            /* Recall that SUBI is a reversed subtract.  */
> +            tcg_out_arithi(s, args[0], args[4], args[2], INSN_SUBI);
> +        } else {
> +            tcg_out_arith(s, args[0], args[2], args[4], INSN_SUB);
> +        }
> +        tcg_out_arith(s, args[1], args[3], args[5], INSN_SUBB);
>          break;
>  
>      case INDEX_op_qemu_ld8u:
> @@ -866,6 +1460,9 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
>      case INDEX_op_qemu_ld32:
>          tcg_out_qemu_ld(s, args, 2);
>          break;
> +    case INDEX_op_qemu_ld64:
> +        tcg_out_qemu_ld(s, args, 3);
> +        break;
>  
>      case INDEX_op_qemu_st8:
>          tcg_out_qemu_st(s, args, 0);
> @@ -876,47 +1473,70 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
>      case INDEX_op_qemu_st32:
>          tcg_out_qemu_st(s, args, 2);
>          break;
> +    case INDEX_op_qemu_st64:
> +        tcg_out_qemu_st(s, args, 3);
> +        break;
>  
>      default:
>          fprintf(stderr, "unknown opcode 0x%x\n", opc);
>          tcg_abort();
>      }
> -    return;
> -
> -gen_arith:
> -    tcg_out_arith(s, args[0], args[1], args[2], c);
>  }
>  
>  static const TCGTargetOpDef hppa_op_defs[] = {
>      { INDEX_op_exit_tb, { } },
>      { INDEX_op_goto_tb, { } },
>  
> -    { INDEX_op_call, { "r" } },
> +    { INDEX_op_call, { "ri" } },
>      { INDEX_op_jmp, { "r" } },
>      { INDEX_op_br, { } },
>  
>      { INDEX_op_mov_i32, { "r", "r" } },
>      { INDEX_op_movi_i32, { "r" } },
> +
>      { INDEX_op_ld8u_i32, { "r", "r" } },
>      { INDEX_op_ld8s_i32, { "r", "r" } },
>      { INDEX_op_ld16u_i32, { "r", "r" } },
>      { INDEX_op_ld16s_i32, { "r", "r" } },
>      { INDEX_op_ld_i32, { "r", "r" } },
> -    { INDEX_op_st8_i32, { "r", "r" } },
> -    { INDEX_op_st16_i32, { "r", "r" } },
> -    { INDEX_op_st_i32, { "r", "r" } },
> +    { INDEX_op_st8_i32, { "rZ", "r" } },
> +    { INDEX_op_st16_i32, { "rZ", "r" } },
> +    { INDEX_op_st_i32, { "rZ", "r" } },
> +
> +    { INDEX_op_add_i32, { "r", "rZ", "ri" } },
> +    { INDEX_op_sub_i32, { "r", "rI", "ri" } },
> +    { INDEX_op_and_i32, { "r", "rZ", "ri" } },
> +    { INDEX_op_or_i32, { "r", "rZ", "ri" } },

Already commented for "and" and "or", but the same apply for add and 
sub. Do we really need a "i" contraints here if the constant is going 
to be loaded with a movi.

> +    { INDEX_op_xor_i32, { "r", "rZ", "rZ" } },
> +    { INDEX_op_andc_i32, { "r", "rZ", "ri" } },

same here.

> +
> +    { INDEX_op_mul_i32, { "r", "r", "r" } },
> +    { INDEX_op_mulu2_i32, { "r", "r", "r", "r" } },
>  
> -    { INDEX_op_add_i32, { "r", "r", "r" } },
> -    { INDEX_op_sub_i32, { "r", "r", "r" } },
> -    { INDEX_op_and_i32, { "r", "r", "r" } },
> -    { INDEX_op_or_i32, { "r", "r", "r" } },
> -    { INDEX_op_xor_i32, { "r", "r", "r" } },
> +    { INDEX_op_shl_i32, { "r", "r", "ri" } },
> +    { INDEX_op_shr_i32, { "r", "r", "ri" } },
> +    { INDEX_op_sar_i32, { "r", "r", "ri" } },
> +    { INDEX_op_rotl_i32, { "r", "r", "ri" } },
> +    { INDEX_op_rotr_i32, { "r", "r", "ri" } },
>  
> -    { INDEX_op_shl_i32, { "r", "r", "r" } },
> -    { INDEX_op_shr_i32, { "r", "r", "r" } },
> -    { INDEX_op_sar_i32, { "r", "r", "r" } },
> +    { INDEX_op_bswap16_i32, { "r", "r" } },
> +    { INDEX_op_bswap32_i32, { "r", "r" } },
> +    { INDEX_op_neg_i32, { "r", "r" } },
> +    { INDEX_op_not_i32, { "r", "r" } },
>  
> -    { INDEX_op_brcond_i32, { "r", "r" } },
> +    { INDEX_op_ext8s_i32, { "r", "r" } },
> +    { INDEX_op_ext8u_i32, { "r", "r" } },
> +    { INDEX_op_ext16s_i32, { "r", "r" } },
> +    { INDEX_op_ext16u_i32, { "r", "r" } },
> +
> +    { INDEX_op_brcond_i32, { "rZ", "rJ" } },
> +    { INDEX_op_brcond2_i32,  { "rZ", "rZ", "rJ", "rJ" } },
> +
> +    { INDEX_op_setcond_i32, { "r", "rZ", "rI" } },
> +    { INDEX_op_setcond2_i32, { "r", "rZ", "rZ", "rI", "rI" } },
> +
> +    { INDEX_op_add2_i32, { "r", "r", "rZ", "rZ", "rI", "rZ" } },
> +    { INDEX_op_sub2_i32, { "r", "r", "rI", "rZ", "rZ", "rZ" } },
>  
>  #if TARGET_LONG_BITS == 32
>      { INDEX_op_qemu_ld8u, { "r", "L" } },
> @@ -926,10 +1546,10 @@ static const TCGTargetOpDef hppa_op_defs[] = {
>      { INDEX_op_qemu_ld32, { "r", "L" } },
>      { INDEX_op_qemu_ld64, { "r", "r", "L" } },
>  
> -    { INDEX_op_qemu_st8, { "L", "L" } },
> -    { INDEX_op_qemu_st16, { "L", "L" } },
> -    { INDEX_op_qemu_st32, { "L", "L" } },
> -    { INDEX_op_qemu_st64, { "L", "L", "L" } },
> +    { INDEX_op_qemu_st8, { "LZ", "L" } },
> +    { INDEX_op_qemu_st16, { "LZ", "L" } },
> +    { INDEX_op_qemu_st32, { "LZ", "L" } },
> +    { INDEX_op_qemu_st64, { "LZ", "LZ", "L" } },
>  #else
>      { INDEX_op_qemu_ld8u, { "r", "L", "L" } },
>      { INDEX_op_qemu_ld8s, { "r", "L", "L" } },
> @@ -938,25 +1558,98 @@ static const TCGTargetOpDef hppa_op_defs[] = {
>      { INDEX_op_qemu_ld32, { "r", "L", "L" } },
>      { INDEX_op_qemu_ld64, { "r", "r", "L", "L" } },
>  
> -    { INDEX_op_qemu_st8, { "L", "L", "L" } },
> -    { INDEX_op_qemu_st16, { "L", "L", "L" } },
> -    { INDEX_op_qemu_st32, { "L", "L", "L" } },
> -    { INDEX_op_qemu_st64, { "L", "L", "L", "L" } },
> +    { INDEX_op_qemu_st8, { "LZ", "L", "L" } },
> +    { INDEX_op_qemu_st16, { "LZ", "L", "L" } },
> +    { INDEX_op_qemu_st32, { "LZ", "L", "L" } },
> +    { INDEX_op_qemu_st64, { "LZ", "LZ", "L", "L" } },
>  #endif
>      { -1 },
>  };
>  
> +static int tcg_target_callee_save_regs[] = {
> +    /* R2, the return address register, is saved specially
> +       in the caller's frame.  */
> +    /* R3, the frame pointer, is not currently modified.  */
> +    TCG_REG_R4,
> +    TCG_REG_R5,
> +    TCG_REG_R6,
> +    TCG_REG_R7,
> +    TCG_REG_R8,
> +    TCG_REG_R9,
> +    TCG_REG_R10,
> +    TCG_REG_R11,
> +    TCG_REG_R12,
> +    TCG_REG_R13,
> +    TCG_REG_R14,
> +    TCG_REG_R15,
> +    TCG_REG_R16,
> +    /* R17 is the global env, so no need to save.  */
> +    TCG_REG_R18
> +};
> +
> +void tcg_target_qemu_prologue(TCGContext *s)
> +{
> +    int frame_size, i;
> +
> +    /* Allocate space for the fixed frame marker.  */
> +    frame_size = -TCG_TARGET_CALL_STACK_OFFSET;
> +    frame_size += TCG_TARGET_STATIC_CALL_ARGS_SIZE;
> +
> +    /* Allocate space for the saved registers.  */
> +    frame_size += ARRAY_SIZE(tcg_target_callee_save_regs) * 4;
> +
> +    /* Align the allocated space.  */
> +    frame_size = ((frame_size + TCG_TARGET_STACK_ALIGN - 1)
> +                  & -TCG_TARGET_STACK_ALIGN);
> +
> +    /* The return address is stored in the caller's frame.  */
> +    tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_RP, TCG_REG_SP, -20);
> +
> +    /* Allocate stack frame, saving the first register at the same time.  */
> +    tcg_out_ldst(s, tcg_target_callee_save_regs[0],
> +                 TCG_REG_SP, frame_size, INSN_STWM);
> +
> +    /* Save all callee saved registers.  */
> +    for (i = 1; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
> +        tcg_out_st(s, TCG_TYPE_PTR, tcg_target_callee_save_regs[i],
> +                   TCG_REG_SP, -frame_size + i * 4);
> +    }
> +
> +    if (GUEST_BASE != 0) {
> +        tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, GUEST_BASE);
> +    }

The final GUEST_BASE value is computed after the prologue has been
generated. The value is modified in two cases:
- The user specify a non-aligned base address.
- /proc/sys/vm/mmap_min_addr is different than 0, which is now the
  in default configuration for more than one year.

When it happens, the guest crashes almost immediately.

> +    /* Jump to TB, and adjust R18 to be the return address.  */
> +    tcg_out32(s, INSN_BLE_SR4 | INSN_R2(TCG_REG_R26));
> +    tcg_out_mov(s, TCG_REG_R18, TCG_REG_R31);
> +
> +    /* Restore callee saved registers.  */
> +    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_RP, TCG_REG_SP, -frame_size - 20);
> +    for (i = 1; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
> +        tcg_out_ld(s, TCG_TYPE_PTR, tcg_target_callee_save_regs[i],
> +                   TCG_REG_SP, -frame_size + i * 4);
> +    }
> +
> +    /* Deallocate stack frame and return.  */
> +    tcg_out32(s, INSN_BV | INSN_R2(TCG_REG_RP));
> +    tcg_out_ldst(s, tcg_target_callee_save_regs[0],
> +                 TCG_REG_SP, -frame_size, INSN_LDWM);
> +}
> +
>  void tcg_target_init(TCGContext *s)
>  {
>      tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffffffff);
> -    tcg_regset_set32(tcg_target_call_clobber_regs, 0,
> -                     (1 << TCG_REG_R20) |
> -                     (1 << TCG_REG_R21) |
> -                     (1 << TCG_REG_R22) |
> -                     (1 << TCG_REG_R23) |
> -                     (1 << TCG_REG_R24) |
> -                     (1 << TCG_REG_R25) |
> -                     (1 << TCG_REG_R26));
> +
> +    tcg_regset_clear(tcg_target_call_clobber_regs);
> +    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R20);
> +    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R21);
> +    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R22);
> +    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R23);
> +    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R24);
> +    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R25);
> +    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R26);
> +    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RET0);
> +    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RET1);
>  
>      tcg_regset_clear(s->reserved_regs);
>      tcg_regset_set_reg(s->reserved_regs, TCG_REG_R0);  /* hardwired to zero */
> @@ -969,6 +1662,9 @@ void tcg_target_init(TCGContext *s)
>      tcg_regset_set_reg(s->reserved_regs, TCG_REG_DP);  /* data pointer */
>      tcg_regset_set_reg(s->reserved_regs, TCG_REG_SP);  /* stack pointer */
>      tcg_regset_set_reg(s->reserved_regs, TCG_REG_R31); /* ble link reg */
> +    if (GUEST_BASE != 0) {
> +        tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG);
> +    }
>  
>      tcg_add_target_add_op_defs(hppa_op_defs);
>  }
> diff --git a/tcg/hppa/tcg-target.h b/tcg/hppa/tcg-target.h
> index e956e71..36b6949 100644
> --- a/tcg/hppa/tcg-target.h
> +++ b/tcg/hppa/tcg-target.h
> @@ -69,17 +69,33 @@ enum {
>      TCG_REG_R31,
>  };
>  
> +#define TCG_CT_CONST_0    0x0100
> +#define TCG_CT_CONST_S5   0x0200
> +#define TCG_CT_CONST_S11  0x0400
> +
>  /* used for function call generation */
>  #define TCG_REG_CALL_STACK TCG_REG_SP
> -#define TCG_TARGET_STACK_ALIGN 16
> +#define TCG_TARGET_STACK_ALIGN 64
> +#define TCG_TARGET_CALL_STACK_OFFSET -48
> +#define TCG_TARGET_STATIC_CALL_ARGS_SIZE 8*4
> +#define TCG_TARGET_CALL_ALIGN_ARGS 1
>  #define TCG_TARGET_STACK_GROWSUP
>  
>  /* optional instructions */
> -#define TCG_TARGET_HAS_div2_i32
> -//#define TCG_TARGET_HAS_ext8s_i32
> -//#define TCG_TARGET_HAS_ext16s_i32
> -//#define TCG_TARGET_HAS_bswap16_i32
> -//#define TCG_TARGET_HAS_bswap32_i32
> +// #define TCG_TARGET_HAS_div_i32
> +#define TCG_TARGET_HAS_rot_i32
> +#define TCG_TARGET_HAS_ext8s_i32
> +#define TCG_TARGET_HAS_ext16s_i32
> +#define TCG_TARGET_HAS_ext8u_i32
> +#define TCG_TARGET_HAS_ext16u_i32
> +#define TCG_TARGET_HAS_bswap16_i32
> +#define TCG_TARGET_HAS_bswap32_i32
> +#define TCG_TARGET_HAS_not_i32
> +#define TCG_TARGET_HAS_neg_i32
> +#define TCG_TARGET_HAS_andc_i32
> +// #define TCG_TARGET_HAS_orc_i32
> +
> +#define TCG_TARGET_HAS_GUEST_BASE
>  
>  /* Note: must be synced with dyngen-exec.h */
>  #define TCG_AREG0 TCG_REG_R17
> @@ -87,116 +103,12 @@ enum {
>  static inline void flush_icache_range(unsigned long start, unsigned long stop)
>  {
>      start &= ~31;
> -    while (start <= stop)
> -    {
> -        asm volatile ("fdc 0(%0)\n"
> -                      "sync\n"
> -                      "fic 0(%%sr4, %0)\n"
> -                      "sync\n"
> +    while (start <= stop) {
> +        asm volatile ("fdc 0(%0)\n\t"
> +                      "sync\n\t"
> +                      "fic 0(%%sr4, %0)\n\t"
> +                      "sync"
>                        : : "r"(start) : "memory");
>          start += 32;
>      }
>  }
> -
> -/* supplied by libgcc */
> -extern void *__canonicalize_funcptr_for_compare(void *);
> -
> -/* Field selection types defined by hppa */
> -#define rnd(x)                  (((x)+0x1000)&~0x1fff)
> -/* lsel: select left 21 bits */
> -#define lsel(v,a)               (((v)+(a))>>11)
> -/* rsel: select right 11 bits */
> -#define rsel(v,a)               (((v)+(a))&0x7ff)
> -/* lrsel with rounding of addend to nearest 8k */
> -#define lrsel(v,a)              (((v)+rnd(a))>>11)
> -/* rrsel with rounding of addend to nearest 8k */
> -#define rrsel(v,a)              ((((v)+rnd(a))&0x7ff)+((a)-rnd(a)))
> -
> -#define mask(x,sz)              ((x) & ~((1<<(sz))-1))
> -
> -static inline int reassemble_12(int as12)
> -{
> -    return (((as12 & 0x800) >> 11) |
> -            ((as12 & 0x400) >> 8) |
> -            ((as12 & 0x3ff) << 3));
> -}
> -
> -static inline int reassemble_14(int as14)
> -{
> -    return (((as14 & 0x1fff) << 1) |
> -            ((as14 & 0x2000) >> 13));
> -}
> -
> -static inline int reassemble_17(int as17)
> -{
> -    return (((as17 & 0x10000) >> 16) |
> -            ((as17 & 0x0f800) << 5) |
> -            ((as17 & 0x00400) >> 8) |
> -            ((as17 & 0x003ff) << 3));
> -}
> -
> -static inline int reassemble_21(int as21)
> -{
> -    return (((as21 & 0x100000) >> 20) |
> -            ((as21 & 0x0ffe00) >> 8) |
> -            ((as21 & 0x000180) << 7) |
> -            ((as21 & 0x00007c) << 14) |
> -            ((as21 & 0x000003) << 12));
> -}
> -
> -static inline void hppa_patch21l(uint32_t *insn, int val, int addend)
> -{
> -    val = lrsel(val, addend);
> -    *insn = mask(*insn, 21) | reassemble_21(val);
> -}
> -
> -static inline void hppa_patch14r(uint32_t *insn, int val, int addend)
> -{
> -    val = rrsel(val, addend);
> -    *insn = mask(*insn, 14) | reassemble_14(val);
> -}
> -
> -static inline void hppa_patch17r(uint32_t *insn, int val, int addend)
> -{
> -    val = rrsel(val, addend);
> -    *insn = (*insn & ~0x1f1ffd) | reassemble_17(val);
> -}
> -
> -
> -static inline void hppa_patch21l_dprel(uint32_t *insn, int val, int addend)
> -{
> -    register unsigned int dp asm("r27");
> -    hppa_patch21l(insn, val - dp, addend);
> -}
> -
> -static inline void hppa_patch14r_dprel(uint32_t *insn, int val, int addend)
> -{
> -    register unsigned int dp asm("r27");
> -    hppa_patch14r(insn, val - dp, addend);
> -}
> -
> -static inline void hppa_patch17f(uint32_t *insn, int val, int addend)
> -{
> -    int dot = (int)insn & ~0x3;
> -    int v = ((val + addend) - dot - 8) / 4;
> -    if (v > (1 << 16) || v < -(1 << 16)) {
> -        printf("cannot fit branch to offset %d [%08x->%08x]\n", v, dot, val);
> -        abort();
> -    }
> -    *insn = (*insn & ~0x1f1ffd) | reassemble_17(v);
> -}
> -
> -static inline void hppa_load_imm21l(uint32_t *insn, int val, int addend)
> -{
> -    /* Transform addil L'sym(%dp) to ldil L'val, %r1 */
> -    *insn = 0x20200000 | reassemble_21(lrsel(val, 0));
> -}
> -
> -static inline void hppa_load_imm14r(uint32_t *insn, int val, int addend)
> -{
> -    /* Transform ldw R'sym(%r1), %rN to ldo R'sym(%r1), %rN */
> -    hppa_patch14r(insn, val, addend);
> -    /* HACK */
> -    if (addend == 0)
> -        *insn = (*insn & ~0xfc000000) | (0x0d << 26);
> -}
> -- 
> 1.6.2.5
> 
> 
> 
>
Richard Henderson April 8, 2010, 4:32 p.m. UTC | #2
On 04/08/2010 02:56 AM, Aurelien Jarno wrote:
> I have applied the patch. I have some comments though, it would be nice
> if you can address them with additional patches.

Sure.

>> +static void tcg_out_ori(TCGContext *s, int ret, int arg, tcg_target_ulong m)
>> +{
>> +    if (m == 0) {
>> +        tcg_out_mov(s, ret, arg);
>> +    } else if (m == -1) {
>> +        tcg_out_movi(s, TCG_TYPE_I32, ret, -1);
> 
> Those cases are already eliminated in tcg/tcg-op.h. This code looks
> redundant.

The cases eliminated in tcg-op.h are with immediate constants.
There is no generic code in tcg.c to eliminate these cases 
after constant propagation.  However, I can remove them with...

>> +    } else {
>> +        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R1, m);
>> +        tcg_out_arith(s, ret, arg, TCG_REG_R1, INSN_OR);
> 
> Do we really want a movi here? It would be better to leave the tcg code
> load the constant itself, so that if the same constant is used twice, it
> is only loaded once.

I've never caught TCG properly re-using constants, but I take your
point -- there's no reason why tcg.c can't be improved, and this 
port would miss out on that improvement.  I'll invent a constraint
that matches or_mask_p.

>> +static void tcg_out_andi(TCGContext *s, int ret, int arg, tcg_target_ulong m)
...
>> +        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R1, m);
>> +        tcg_out_arith(s, ret, arg, TCG_REG_R1, INSN_AND);
> 
> Same.

ANDI slightly different case.  This function is used by tcg_out_tlb_read
with constants that may or may not satisfy and_mask_p.  I think it's cleaner
to handle the arbitrary case here, rather than open code the same test in
the tlb read function.

I will of course add a constraint to match and_mask_p, for ANDs that 
originate within the opcode stream.

>> +        tcg_out_reloc(s, s->code_ptr, R_PARISC_PCREL17F, label_index, 0);
>> +        tcg_out32(s, op);
> 
> This breaks partial retranslation. The bits corresponding to the offset
> should be preserved.

I don't recall ever hearing about re-translation.  Can you point me
at the bits that do it, so I can figure out what's going on?  This
sounds like something that ought to be documented properly...

I rather assumed that the "addend" parameter to patch_reloc would
hold whatever is really needed to be preserved.  What else is that
field for, anyway? 

>> -    if (opc == 3)
>> -        data_reg2 = *args++;
>> -    else
>> -        data_reg2 = 0; /* suppress warning */
>> +    data_reg2 = (opc == 3 ? *args++ : TCG_REG_R0);
> 
> I am not sure TCG_REG_R0 is really correct here, and I find it confusing.
> While it's value is zero, the assignment there is just to make GCC
> happy, it won't be used after

Correct.  I don't see what else I can really do though.  I think it's
worse to mix types: integer-as-register-number (i.e. *args) and 
integer-as-filler (i.e. 0).  Better to at least have them be the same
type as it clarifies that *args must be a register number.

Perhaps just a comment here?

>>  #if defined(CONFIG_SOFTMMU)
>> -    tcg_out_mov(s, r1, addr_reg);
>> +    lab1 = gen_new_label();
>> +    lab2 = gen_new_label();
> 
> Do you really want to use label here? load/store are the most common
> instructions, I am not really sure of the resulting performance.

I think the code is *so* much more readable re-using the usual branch
and relocate code.  I'd almost rather spend the time speeding up the
use of temporary labels than uglifying the code here.

>> +    /* These three correspond exactly to the fallback implementation.
>> +       But by including them we reduce the number of TCG ops that 
>> +       need to be generated, and these opcodes are fairly common.  */
> 
> Are you sure it really makes a difference?

Not quantifiably, but the reasoning is sound.  I can remove them if you insist.

>> +    { INDEX_op_add_i32, { "r", "rZ", "ri" } },
>> +    { INDEX_op_sub_i32, { "r", "rI", "ri" } },
>> +    { INDEX_op_and_i32, { "r", "rZ", "ri" } },
>> +    { INDEX_op_or_i32, { "r", "rZ", "ri" } },
> 
> Already commented for "and" and "or", but the same apply for add and 
> sub. Do we really need a "i" contraints here if the constant is going 
> to be loaded with a movi.

ADD and SUB are not going to use movi.  They will use one or both of
ADDIL (21-bit constant << 11) and LDO (14-bit constant).  As a pair
these insns can perform a full 32-bit constant addition.

I suppose technically there's a subset of 32-bit constants that could
benefit from generic code loading constants into registers.  The only
valid output register for ADDIL is R1.  So at the moment for

	R3 = R4 + 0x10000;

we generate

	addil	0x10000, r4, r1
	copy	r1, r3

where we could equivalently generate

	ldil	0x10000, r5
	add	r4, r5, r3

However I don't think this is worth worrying about in the short term.

>> +    if (GUEST_BASE != 0) {
>> +        tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, GUEST_BASE);
>> +    }
> 
> The final GUEST_BASE value is computed after the prologue has been
> generated. The value is modified in two cases:
> - The user specify a non-aligned base address.
> - /proc/sys/vm/mmap_min_addr is different than 0, which is now the
>   in default configuration for more than one year.
> 
> When it happens, the guest crashes almost immediately.

To be fair, mmap_min_addr only affects GUEST_BASE if the executable
image we've loaded overlaps.  Which is uncommon, but certainly possible.

Hmm.  I wonder which is better: one extra instruction needed per qemu_ld
vs having one more call-saved register available.  At the moment we don't
even come close to using all of the call-saved registers, and it would be
easy enough to have the prologue read the actual guest_base variable rather
than embed the constant.


r~
Richard Henderson April 8, 2010, 9:48 p.m. UTC | #3
On 04/08/2010 09:32 AM, Richard Henderson wrote:
>>> +static void tcg_out_ori(TCGContext *s, int ret, int arg, tcg_target_ulong m)
>>> +{
>>> +    if (m == 0) {
>>> +        tcg_out_mov(s, ret, arg);
>>> +    } else if (m == -1) {
>>> +        tcg_out_movi(s, TCG_TYPE_I32, ret, -1);
>>
>> Those cases are already eliminated in tcg/tcg-op.h. This code looks
>> redundant.
> 
> The cases eliminated in tcg-op.h are with immediate constants.
> There is no generic code in tcg.c to eliminate these cases 
> after constant propagation.  However, I can remove them with...

For the record, a real case that appears in linux-test-0.3 sparc:

0x435e8ac4:  andcc  %g0, %g0, %o4

 ---- 0x435e8ac4
 movi_i32 tmp19,$0x0
 movi_i32 tmp20,$0x0
 and_i32 loc4,tmp19,tmp20
 mov_i32 cc_dst,loc4
 movi_i32 cc_op,$0xb
 st_i32 loc4,regwptr,$0x10

The and_i32 there is "loc4 = 0 & 0".

I've no idea why the original sparc code uses this instruction.


r~
Aurelien Jarno April 8, 2010, 11:01 p.m. UTC | #4
On Thu, Apr 08, 2010 at 09:32:41AM -0700, Richard Henderson wrote:
> On 04/08/2010 02:56 AM, Aurelien Jarno wrote:
> > I have applied the patch. I have some comments though, it would be nice
> > if you can address them with additional patches.
> 
> Sure.
> 
> >> +static void tcg_out_ori(TCGContext *s, int ret, int arg, tcg_target_ulong m)
> >> +{
> >> +    if (m == 0) {
> >> +        tcg_out_mov(s, ret, arg);
> >> +    } else if (m == -1) {
> >> +        tcg_out_movi(s, TCG_TYPE_I32, ret, -1);
> > 
> > Those cases are already eliminated in tcg/tcg-op.h. This code looks
> > redundant.
> 
> The cases eliminated in tcg-op.h are with immediate constants.
> There is no generic code in tcg.c to eliminate these cases 
> after constant propagation.  However, I can remove them with...

Ok, fine.

> >> +    } else {
> >> +        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R1, m);
> >> +        tcg_out_arith(s, ret, arg, TCG_REG_R1, INSN_OR);
> > 
> > Do we really want a movi here? It would be better to leave the tcg code
> > load the constant itself, so that if the same constant is used twice, it
> > is only loaded once.
> 
> I've never caught TCG properly re-using constants, but I take your
> point -- there's no reason why tcg.c can't be improved, and this 
> port would miss out on that improvement.  I'll invent a constraint
> that matches or_mask_p.
> 
> >> +static void tcg_out_andi(TCGContext *s, int ret, int arg, tcg_target_ulong m)
> ...
> >> +        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R1, m);
> >> +        tcg_out_arith(s, ret, arg, TCG_REG_R1, INSN_AND);
> > 
> > Same.
> 
> ANDI slightly different case.  This function is used by tcg_out_tlb_read
> with constants that may or may not satisfy and_mask_p.  I think it's cleaner
> to handle the arbitrary case here, rather than open code the same test in
> the tlb read function.
> 
> I will of course add a constraint to match and_mask_p, for ANDs that 
> originate within the opcode stream.

Ok, fine.

> >> +        tcg_out_reloc(s, s->code_ptr, R_PARISC_PCREL17F, label_index, 0);
> >> +        tcg_out32(s, op);
> > 
> > This breaks partial retranslation. The bits corresponding to the offset
> > should be preserved.
> 
> I don't recall ever hearing about re-translation.  Can you point me
> at the bits that do it, so I can figure out what's going on?  This
> sounds like something that ought to be documented properly...
> 
> I rather assumed that the "addend" parameter to patch_reloc would
> hold whatever is really needed to be preserved.  What else is that
> field for, anyway? 

The problem is that in case of an exception, the code is retranslated to
get the address (on the guest side) of the exception. The retranslation
is done using the same buffer, and stops as soon as the address is
found.

It means that the branch instruction is rewritten with a new address,
why the relocation is not retranslated. In short the jump address is
then pointing to the wrong address, which causes either an endless loop
or a crash.

This is something visible in system mode, usually it starts to appear
when the guest switches to userland.

To prevent that, the code should change only the bits defining the
jump instruction, leaving the others defining the address unchanged.

> >> -    if (opc == 3)
> >> -        data_reg2 = *args++;
> >> -    else
> >> -        data_reg2 = 0; /* suppress warning */
> >> +    data_reg2 = (opc == 3 ? *args++ : TCG_REG_R0);
> > 
> > I am not sure TCG_REG_R0 is really correct here, and I find it confusing.
> > While it's value is zero, the assignment there is just to make GCC
> > happy, it won't be used after
> 
> Correct.  I don't see what else I can really do though.  I think it's
> worse to mix types: integer-as-register-number (i.e. *args) and 
> integer-as-filler (i.e. 0).  Better to at least have them be the same
> type as it clarifies that *args must be a register number.
> 
> Perhaps just a comment here?

I think the old code was actually pretty fine, that is the '0' value,
plus a comment. I don't really see why it was necessary to change this
code. 

> >>  #if defined(CONFIG_SOFTMMU)
> >> -    tcg_out_mov(s, r1, addr_reg);
> >> +    lab1 = gen_new_label();
> >> +    lab2 = gen_new_label();
> > 
> > Do you really want to use label here? load/store are the most common
> > instructions, I am not really sure of the resulting performance.
> 
> I think the code is *so* much more readable re-using the usual branch
> and relocate code.  I'd almost rather spend the time speeding up the
> use of temporary labels than uglifying the code here.
> 
> >> +    /* These three correspond exactly to the fallback implementation.
> >> +       But by including them we reduce the number of TCG ops that 
> >> +       need to be generated, and these opcodes are fairly common.  */
> > 
> > Are you sure it really makes a difference?
> 
> Not quantifiably, but the reasoning is sound.  I can remove them if you insist.

My point is on one side you seems to look for performance, while on the
others (labels just above), you don't really care about performance.

> >> +    { INDEX_op_add_i32, { "r", "rZ", "ri" } },
> >> +    { INDEX_op_sub_i32, { "r", "rI", "ri" } },
> >> +    { INDEX_op_and_i32, { "r", "rZ", "ri" } },
> >> +    { INDEX_op_or_i32, { "r", "rZ", "ri" } },
> > 
> > Already commented for "and" and "or", but the same apply for add and 
> > sub. Do we really need a "i" contraints here if the constant is going 
> > to be loaded with a movi.
> 
> ADD and SUB are not going to use movi.  They will use one or both of
> ADDIL (21-bit constant << 11) and LDO (14-bit constant).  As a pair
> these insns can perform a full 32-bit constant addition.
> 
> I suppose technically there's a subset of 32-bit constants that could
> benefit from generic code loading constants into registers.  The only
> valid output register for ADDIL is R1.  So at the moment for
> 
> 	R3 = R4 + 0x10000;
> 
> we generate
> 
> 	addil	0x10000, r4, r1
> 	copy	r1, r3
> 
> where we could equivalently generate
> 
> 	ldil	0x10000, r5
> 	add	r4, r5, r3
> 
> However I don't think this is worth worrying about in the short term.

Ok.

> >> +    if (GUEST_BASE != 0) {
> >> +        tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, GUEST_BASE);
> >> +    }
> > 
> > The final GUEST_BASE value is computed after the prologue has been
> > generated. The value is modified in two cases:
> > - The user specify a non-aligned base address.
> > - /proc/sys/vm/mmap_min_addr is different than 0, which is now the
> >   in default configuration for more than one year.
> > 
> > When it happens, the guest crashes almost immediately.
> 
> To be fair, mmap_min_addr only affects GUEST_BASE if the executable
> image we've loaded overlaps.  Which is uncommon, but certainly possible.

When I found the problem on the tcg ia64, it was crashing for all
binaries I tried.

> Hmm.  I wonder which is better: one extra instruction needed per qemu_ld
> vs having one more call-saved register available.  At the moment we don't
> even come close to using all of the call-saved registers, and it would be
> easy enough to have the prologue read the actual guest_base variable rather
> than embed the constant.
> 

The other option is to reorganize the order in which the prologue is
generated and the guest base value computed. The work is probably more
important though.
diff mbox

Patch

diff --git a/configure b/configure
index 1d5fb17..966cd7d 100755
--- a/configure
+++ b/configure
@@ -722,6 +722,9 @@  case "$cpu" in
     ia64*)
            host_guest_base="yes"
            ;;
+    hppa*)
+           host_guest_base="yes"
+           ;;
 esac
 
 [ -z "$guest_base" ] && guest_base="$host_guest_base"
@@ -2744,7 +2747,7 @@  if test "$target_linux_user" = "yes" -o "$target_bsd_user" = "yes" ; then
     # -static is used to avoid g1/g3 usage by the dynamic linker
     ldflags="$linker_script -static $ldflags"
     ;;
-  i386|x86_64|ppc|ppc64|s390|sparc64|alpha|arm|m68k|mips|mips64|ia64)
+  *)
     ldflags="$linker_script $ldflags"
     ;;
   esac
diff --git a/tcg/hppa/tcg-target.c b/tcg/hppa/tcg-target.c
index f9ae898..4e15256 100644
--- a/tcg/hppa/tcg-target.c
+++ b/tcg/hppa/tcg-target.c
@@ -24,41 +24,26 @@ 
 
 #ifndef NDEBUG
 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
-    "%r0",
-    "%r1",
-    "%rp",
-    "%r3",
-    "%r4",
-    "%r5",
-    "%r6",
-    "%r7",
-    "%r8",
-    "%r9",
-    "%r10",
-    "%r11",
-    "%r12",
-    "%r13",
-    "%r14",
-    "%r15",
-    "%r16",
-    "%r17",
-    "%r18",
-    "%r19",
-    "%r20",
-    "%r21",
-    "%r22",
-    "%r23",
-    "%r24",
-    "%r25",
-    "%r26",
-    "%dp",
-    "%ret0",
-    "%ret1",
-    "%sp",
-    "%r31",
+    "%r0", "%r1", "%rp", "%r3", "%r4", "%r5", "%r6", "%r7",
+    "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
+    "%r16", "%r17", "%r18", "%r19", "%r20", "%r21", "%r22", "%r23",
+    "%r24", "%r25", "%r26", "%dp", "%ret0", "%ret1", "%sp", "%r31",
 };
 #endif
 
+/* This is an 8 byte temp slot in the stack frame.  */
+#define STACK_TEMP_OFS -16
+
+#ifndef GUEST_BASE
+#define GUEST_BASE 0
+#endif
+
+#ifdef CONFIG_USE_GUEST_BASE
+#define TCG_GUEST_BASE_REG TCG_REG_R16
+#else
+#define TCG_GUEST_BASE_REG TCG_REG_R0
+#endif
+
 static const int tcg_target_reg_alloc_order[] = {
     TCG_REG_R4,
     TCG_REG_R5,
@@ -75,6 +60,14 @@  static const int tcg_target_reg_alloc_order[] = {
     TCG_REG_R14,
     TCG_REG_R15,
     TCG_REG_R16,
+
+    TCG_REG_R26,
+    TCG_REG_R25,
+    TCG_REG_R24,
+    TCG_REG_R23,
+
+    TCG_REG_RET0,
+    TCG_REG_RET1,
 };
 
 static const int tcg_target_call_iarg_regs[4] = {
@@ -89,16 +82,98 @@  static const int tcg_target_call_oarg_regs[2] = {
     TCG_REG_RET1,
 };
 
+/* True iff val fits a signed field of width BITS.  */
+static inline int check_fit_tl(tcg_target_long val, unsigned int bits)
+{
+    return (val << ((sizeof(tcg_target_long) * 8 - bits))
+            >> (sizeof(tcg_target_long) * 8 - bits)) == val;
+}
+
+/* True iff depi can be used to compute (reg | MASK).
+   Accept a bit pattern like:
+      0....01....1
+      1....10....0
+      0..01..10..0
+   Copied from gcc sources.  */
+static inline int or_mask_p(tcg_target_ulong mask)
+{
+    mask += mask & -mask;
+    return (mask & (mask - 1)) == 0;
+}
+
+/* True iff depi or extru can be used to compute (reg & mask).
+   Accept a bit pattern like these:
+      0....01....1
+      1....10....0
+      1..10..01..1 
+   Copied from gcc sources.  */
+static inline int and_mask_p(tcg_target_ulong mask)
+{
+    return or_mask_p(~mask);
+}
+
+static int low_sign_ext(int val, int len)
+{
+    return (((val << 1) & ~(-1u << len)) | ((val >> (len - 1)) & 1));
+}
+
+static int reassemble_12(int as12)
+{
+    return (((as12 & 0x800) >> 11) |
+            ((as12 & 0x400) >> 8) |
+            ((as12 & 0x3ff) << 3));
+}
+
+static int reassemble_17(int as17)
+{
+    return (((as17 & 0x10000) >> 16) |
+            ((as17 & 0x0f800) << 5) |
+            ((as17 & 0x00400) >> 8) |
+            ((as17 & 0x003ff) << 3));
+}
+
+static int reassemble_21(int as21)
+{
+    return (((as21 & 0x100000) >> 20) |
+            ((as21 & 0x0ffe00) >> 8) |
+            ((as21 & 0x000180) << 7) |
+            ((as21 & 0x00007c) << 14) |
+            ((as21 & 0x000003) << 12));
+}
+
+/* ??? Bizzarely, there is no PCREL12F relocation type.  I guess all
+   such relocations are simply fully handled by the assembler.  */
+#define R_PARISC_PCREL12F  R_PARISC_NONE
+
 static void patch_reloc(uint8_t *code_ptr, int type,
                         tcg_target_long value, tcg_target_long addend)
 {
+    uint32_t *insn_ptr = (uint32_t *)code_ptr;
+    uint32_t insn = *insn_ptr;
+    tcg_target_long pcrel;
+
+    value += addend;
+    pcrel = (value - ((tcg_target_long)code_ptr + 8)) >> 2;
+
     switch (type) {
+    case R_PARISC_PCREL12F:
+        assert(check_fit_tl(pcrel, 12));
+        /* ??? We assume all patches are forward.  See tcg_out_brcond
+           re setting the NUL bit on the branch and eliding the nop.  */
+        assert(pcrel >= 0);
+        insn &= ~0x1ffdu;
+        insn |= reassemble_12(pcrel);
+        break;
     case R_PARISC_PCREL17F:
-        hppa_patch17f((uint32_t *)code_ptr, value, addend);
+        assert(check_fit_tl(pcrel, 17));
+        insn &= ~0x1f1ffdu;
+        insn |= reassemble_17(pcrel);
         break;
     default:
         tcg_abort();
     }
+
+    *insn_ptr = insn;
 }
 
 /* maximum number of register used for input function arguments */
@@ -126,6 +201,15 @@  static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_R24);
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_R23);
         break;
+    case 'Z':
+        ct->ct |= TCG_CT_CONST_0;
+        break;
+    case 'I':
+        ct->ct |= TCG_CT_CONST_S11;
+        break;
+    case 'J':
+        ct->ct |= TCG_CT_CONST_S5;
+	break;
     default:
         return -1;
     }
@@ -135,15 +219,19 @@  static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
 }
 
 /* test if a constant matches the constraint */
-static inline int tcg_target_const_match(tcg_target_long val,
-                                         const TCGArgConstraint *arg_ct)
+static int tcg_target_const_match(tcg_target_long val,
+                                  const TCGArgConstraint *arg_ct)
 {
-    int ct;
-
-    ct = arg_ct->ct;
-
-    /* TODO */
-
+    int ct = arg_ct->ct;
+    if (ct & TCG_CT_CONST) {
+        return 1;
+    } else if (ct & TCG_CT_CONST_0) {
+        return val == 0;
+    } else if (ct & TCG_CT_CONST_S5) {
+        return check_fit_tl(val, 5);
+    } else if (ct & TCG_CT_CONST_S11) {
+        return check_fit_tl(val, 11);
+    }
     return 0;
 }
 
@@ -163,191 +251,588 @@  static inline int tcg_target_const_match(tcg_target_long val,
 #define INSN_SHDEP_CP(x) ((31 - (x)) << 5)
 #define INSN_SHDEP_P(x)  ((x) << 5)
 #define INSN_COND(x)     ((x) << 13)
+#define INSN_IM11(x)     low_sign_ext(x, 11)
+#define INSN_IM14(x)     low_sign_ext(x, 14)
+#define INSN_IM5(x)      (low_sign_ext(x, 5) << 16)
+
+#define COND_NEVER   0
+#define COND_EQ      1
+#define COND_LT      2
+#define COND_LE      3
+#define COND_LTU     4
+#define COND_LEU     5
+#define COND_SV      6
+#define COND_OD      7
+#define COND_FALSE   8
+
+#define INSN_ADD	(INSN_OP(0x02) | INSN_EXT6(0x18))
+#define INSN_ADDC	(INSN_OP(0x02) | INSN_EXT6(0x1c))
+#define INSN_ADDI	(INSN_OP(0x2d))
+#define INSN_ADDIL	(INSN_OP(0x0a))
+#define INSN_ADDL	(INSN_OP(0x02) | INSN_EXT6(0x28))
+#define INSN_AND	(INSN_OP(0x02) | INSN_EXT6(0x08))
+#define INSN_ANDCM	(INSN_OP(0x02) | INSN_EXT6(0x00))
+#define INSN_COMCLR	(INSN_OP(0x02) | INSN_EXT6(0x22))
+#define INSN_COMICLR	(INSN_OP(0x24))
+#define INSN_DEP	(INSN_OP(0x35) | INSN_EXT3SH(3))
+#define INSN_DEPI	(INSN_OP(0x35) | INSN_EXT3SH(7))
+#define INSN_EXTRS	(INSN_OP(0x34) | INSN_EXT3SH(7))
+#define INSN_EXTRU	(INSN_OP(0x34) | INSN_EXT3SH(6))
+#define INSN_LDIL	(INSN_OP(0x08))
+#define INSN_LDO	(INSN_OP(0x0d))
+#define INSN_MTCTL	(INSN_OP(0x00) | INSN_EXT8B(0xc2))
+#define INSN_OR		(INSN_OP(0x02) | INSN_EXT6(0x09))
+#define INSN_SHD	(INSN_OP(0x34) | INSN_EXT3SH(2))
+#define INSN_SUB	(INSN_OP(0x02) | INSN_EXT6(0x10))
+#define INSN_SUBB	(INSN_OP(0x02) | INSN_EXT6(0x14))
+#define INSN_SUBI	(INSN_OP(0x25))
+#define INSN_VEXTRS	(INSN_OP(0x34) | INSN_EXT3SH(5))
+#define INSN_VEXTRU	(INSN_OP(0x34) | INSN_EXT3SH(4))
+#define INSN_VSHD	(INSN_OP(0x34) | INSN_EXT3SH(0))
+#define INSN_XOR	(INSN_OP(0x02) | INSN_EXT6(0x0a))
+#define INSN_ZDEP	(INSN_OP(0x35) | INSN_EXT3SH(2))
+#define INSN_ZVDEP	(INSN_OP(0x35) | INSN_EXT3SH(0))
+
+#define INSN_BL         (INSN_OP(0x3a) | INSN_EXT3BR(0))
+#define INSN_BL_N       (INSN_OP(0x3a) | INSN_EXT3BR(0) | 2)
+#define INSN_BLR        (INSN_OP(0x3a) | INSN_EXT3BR(2))
+#define INSN_BV         (INSN_OP(0x3a) | INSN_EXT3BR(6))
+#define INSN_BV_N       (INSN_OP(0x3a) | INSN_EXT3BR(6) | 2)
+#define INSN_BLE_SR4    (INSN_OP(0x39) | (1 << 13))
+
+#define INSN_LDB        (INSN_OP(0x10))
+#define INSN_LDH        (INSN_OP(0x11))
+#define INSN_LDW        (INSN_OP(0x12))
+#define INSN_LDWM       (INSN_OP(0x13))
+#define INSN_FLDDS      (INSN_OP(0x0b) | INSN_EXT4(0) | (1 << 12))
+
+#define INSN_LDBX	(INSN_OP(0x03) | INSN_EXT4(0))
+#define INSN_LDHX	(INSN_OP(0x03) | INSN_EXT4(1))
+#define INSN_LDWX       (INSN_OP(0x03) | INSN_EXT4(2))
+
+#define INSN_STB        (INSN_OP(0x18))
+#define INSN_STH        (INSN_OP(0x19))
+#define INSN_STW        (INSN_OP(0x1a))
+#define INSN_STWM       (INSN_OP(0x1b))
+#define INSN_FSTDS      (INSN_OP(0x0b) | INSN_EXT4(8) | (1 << 12))
+
+#define INSN_COMBT      (INSN_OP(0x20))
+#define INSN_COMBF      (INSN_OP(0x22))
+#define INSN_COMIBT     (INSN_OP(0x21))
+#define INSN_COMIBF     (INSN_OP(0x23))
+
+/* supplied by libgcc */
+extern void *__canonicalize_funcptr_for_compare(void *);
+
+static void tcg_out_mov(TCGContext *s, int ret, int arg)
+{
+    /* PA1.1 defines COPY as OR r,0,t; PA2.0 defines COPY as LDO 0(r),t
+       but hppa-dis.c is unaware of this definition */
+    if (ret != arg) {
+        tcg_out32(s, INSN_OR | INSN_T(ret) | INSN_R1(arg)
+                  | INSN_R2(TCG_REG_R0));
+    }
+}
 
-#define COND_NEVER 0
-#define COND_EQUAL 1
-#define COND_LT    2
-#define COND_LTEQ  3
-#define COND_LTU   4
-#define COND_LTUEQ 5
-#define COND_SV    6
-#define COND_OD    7
+static void tcg_out_movi(TCGContext *s, TCGType type,
+                         int ret, tcg_target_long arg)
+{
+    if (check_fit_tl(arg, 14)) {
+        tcg_out32(s, INSN_LDO | INSN_R1(ret)
+                  | INSN_R2(TCG_REG_R0) | INSN_IM14(arg));
+    } else {
+        uint32_t hi, lo;
+        hi = arg >> 11;
+        lo = arg & 0x7ff;
+
+        tcg_out32(s, INSN_LDIL | INSN_R2(ret) | reassemble_21(hi));
+        if (lo) {
+            tcg_out32(s, INSN_LDO | INSN_R1(ret)
+                      | INSN_R2(ret) | INSN_IM14(lo));
+        }
+    }
+}
 
+static void tcg_out_ldst(TCGContext *s, int ret, int addr,
+                         tcg_target_long offset, int op)
+{
+    if (!check_fit_tl(offset, 14)) {
+        uint32_t hi, lo, op;
 
-/* Logical ADD */
-#define ARITH_ADD  (INSN_OP(0x02) | INSN_EXT6(0x28))
-#define ARITH_AND  (INSN_OP(0x02) | INSN_EXT6(0x08))
-#define ARITH_OR   (INSN_OP(0x02) | INSN_EXT6(0x09))
-#define ARITH_XOR  (INSN_OP(0x02) | INSN_EXT6(0x0a))
-#define ARITH_SUB  (INSN_OP(0x02) | INSN_EXT6(0x10))
+        hi = offset >> 11;
+        lo = offset & 0x7ff;
 
-#define SHD        (INSN_OP(0x34) | INSN_EXT3SH(2))
-#define VSHD       (INSN_OP(0x34) | INSN_EXT3SH(0))
-#define DEP        (INSN_OP(0x35) | INSN_EXT3SH(3))
-#define ZDEP       (INSN_OP(0x35) | INSN_EXT3SH(2))
-#define ZVDEP      (INSN_OP(0x35) | INSN_EXT3SH(0))
-#define EXTRU      (INSN_OP(0x34) | INSN_EXT3SH(6))
-#define EXTRS      (INSN_OP(0x34) | INSN_EXT3SH(7))
-#define VEXTRS     (INSN_OP(0x34) | INSN_EXT3SH(5))
+        if (addr == TCG_REG_R0) {
+            op = INSN_LDIL | INSN_R2(TCG_REG_R1);
+        } else {
+            op = INSN_ADDIL | INSN_R2(addr);
+        }
+        tcg_out32(s, op | reassemble_21(hi));
 
-#define SUBI       (INSN_OP(0x25))
-#define MTCTL      (INSN_OP(0x00) | INSN_EXT8B(0xc2))
+        addr = TCG_REG_R1;
+	offset = lo;
+    }
 
-#define BL         (INSN_OP(0x3a) | INSN_EXT3BR(0))
-#define BLE_SR4    (INSN_OP(0x39) | (1 << 13))
-#define BV         (INSN_OP(0x3a) | INSN_EXT3BR(6))
-#define BV_N       (INSN_OP(0x3a) | INSN_EXT3BR(6) | 2)
-#define LDIL       (INSN_OP(0x08))
-#define LDO        (INSN_OP(0x0d))
+    if (ret != addr || offset != 0 || op != INSN_LDO) {
+        tcg_out32(s, op | INSN_R1(ret) | INSN_R2(addr) | INSN_IM14(offset));
+    }
+}
 
-#define LDB        (INSN_OP(0x10))
-#define LDH        (INSN_OP(0x11))
-#define LDW        (INSN_OP(0x12))
-#define LDWM       (INSN_OP(0x13))
+/* This function is required by tcg.c.  */
+static inline void tcg_out_ld(TCGContext *s, TCGType type, int ret,
+                              int arg1, tcg_target_long arg2)
+{
+    tcg_out_ldst(s, ret, arg1, arg2, INSN_LDW);
+}
+
+/* This function is required by tcg.c.  */
+static inline void tcg_out_st(TCGContext *s, TCGType type, int ret,
+                              int arg1, tcg_target_long arg2)
+{
+    tcg_out_ldst(s, ret, arg1, arg2, INSN_STW);
+}
+
+static void tcg_out_ldst_index(TCGContext *s, int data,
+                               int base, int index, int op)
+{
+    tcg_out32(s, op | INSN_T(data) | INSN_R1(index) | INSN_R2(base));
+}
+
+static inline void tcg_out_addi2(TCGContext *s, int ret, int arg1,
+                                 tcg_target_long val)
+{
+    tcg_out_ldst(s, ret, arg1, val, INSN_LDO);
+}
 
-#define STB        (INSN_OP(0x18))
-#define STH        (INSN_OP(0x19))
-#define STW        (INSN_OP(0x1a))
-#define STWM       (INSN_OP(0x1b))
+/* This function is required by tcg.c.  */
+static inline void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
+{
+    tcg_out_addi2(s, reg, reg, val);
+}
 
-#define COMBT      (INSN_OP(0x20))
-#define COMBF      (INSN_OP(0x22))
+static inline void tcg_out_arith(TCGContext *s, int t, int r1, int r2, int op)
+{
+    tcg_out32(s, op | INSN_T(t) | INSN_R1(r1) | INSN_R2(r2));
+}
 
-static int lowsignext(uint32_t val, int start, int length)
+static inline void tcg_out_arithi(TCGContext *s, int t, int r1,
+                                  tcg_target_long val, int op)
 {
-    return (((val << 1) & ~(~0 << length)) |
-            ((val >> (length - 1)) & 1)) << start;
+    assert(check_fit_tl(val, 11));
+    tcg_out32(s, op | INSN_R1(t) | INSN_R2(r1) | INSN_IM11(val));
 }
 
-static inline void tcg_out_mov(TCGContext *s, int ret, int arg)
+static inline void tcg_out_nop(TCGContext *s)
 {
-    /* PA1.1 defines COPY as OR r,0,t */
-    tcg_out32(s, ARITH_OR | INSN_T(ret) | INSN_R1(arg) | INSN_R2(TCG_REG_R0));
+    tcg_out_arith(s, TCG_REG_R0, TCG_REG_R0, TCG_REG_R0, INSN_OR);
+}
 
-    /* PA2.0 defines COPY as LDO 0(r),t
-     * but hppa-dis.c is unaware of this definition */
-    /* tcg_out32(s, LDO | INSN_R1(ret) | INSN_R2(arg) | reassemble_14(0)); */
+static inline void tcg_out_mtctl_sar(TCGContext *s, int arg)
+{
+    tcg_out32(s, INSN_MTCTL | INSN_R2(11) | INSN_R1(arg));
+}
+
+/* Extract LEN bits at position OFS from ARG and place in RET.
+   Note that here the bit ordering is reversed from the PA-RISC
+   standard, such that the right-most bit is 0.  */
+static inline void tcg_out_extr(TCGContext *s, int ret, int arg,
+                                unsigned ofs, unsigned len, int sign)
+{
+    assert(ofs < 32 && len <= 32 - ofs);
+    tcg_out32(s, (sign ? INSN_EXTRS : INSN_EXTRU)
+              | INSN_R1(ret) | INSN_R2(arg)
+              | INSN_SHDEP_P(31 - ofs) | INSN_DEP_LEN(len));
 }
 
-static inline void tcg_out_movi(TCGContext *s, TCGType type,
-                                int ret, tcg_target_long arg)
+/* Likewise with OFS interpreted little-endian.  */
+static inline void tcg_out_dep(TCGContext *s, int ret, int arg,
+                               unsigned ofs, unsigned len)
 {
-    if (arg == (arg & 0x1fff)) {
-        tcg_out32(s, LDO | INSN_R1(ret) | INSN_R2(TCG_REG_R0) |
-                     reassemble_14(arg));
+    assert(ofs < 32 && len <= 32 - ofs);
+    tcg_out32(s, INSN_DEP | INSN_R2(ret) | INSN_R1(arg)
+              | INSN_SHDEP_CP(31 - ofs) | INSN_DEP_LEN(len));
+}
+
+static inline void tcg_out_shd(TCGContext *s, int ret, int hi, int lo,
+                               unsigned count)
+{
+    assert(count < 32);
+    tcg_out32(s, INSN_SHD | INSN_R1(hi) | INSN_R2(lo) | INSN_T(ret)
+              | INSN_SHDEP_CP(count));
+}
+
+static void tcg_out_vshd(TCGContext *s, int ret, int hi, int lo, int creg)
+{
+    tcg_out_mtctl_sar(s, creg);
+    tcg_out32(s, INSN_VSHD | INSN_T(ret) | INSN_R1(hi) | INSN_R2(lo));
+}
+
+static void tcg_out_ori(TCGContext *s, int ret, int arg, tcg_target_ulong m)
+{
+    if (m == 0) {
+        tcg_out_mov(s, ret, arg);
+    } else if (m == -1) {
+        tcg_out_movi(s, TCG_TYPE_I32, ret, -1);
+    } else if (or_mask_p(m)) {
+        int bs0, bs1;
+
+        for (bs0 = 0; bs0 < 32; bs0++) {
+            if ((m & (1u << bs0)) != 0) {
+                break;
+            }
+        }
+        for (bs1 = bs0; bs1 < 32; bs1++) {
+            if ((m & (1u << bs1)) == 0) {
+                break;
+            }
+        }
+        assert(bs1 == 32 || (1ul << bs1) > m);
+
+        tcg_out_mov(s, ret, arg);
+        tcg_out32(s, INSN_DEPI | INSN_R2(ret) | INSN_IM5(-1)
+                  | INSN_SHDEP_CP(31 - bs0) | INSN_DEP_LEN(bs1 - bs0));
+    } else {
+        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R1, m);
+        tcg_out_arith(s, ret, arg, TCG_REG_R1, INSN_OR);
+    }
+}
+
+static void tcg_out_andi(TCGContext *s, int ret, int arg, tcg_target_ulong m)
+{
+    if (m == 0) {
+        tcg_out_mov(s, ret, TCG_REG_R0);
+    } else if (m == -1) {
+        tcg_out_mov(s, ret, arg);
+    } else if (and_mask_p(m)) {
+        int ls0, ls1, ms0;
+
+        for (ls0 = 0; ls0 < 32; ls0++) {
+            if ((m & (1u << ls0)) == 0) {
+                break;
+            }
+        }
+        for (ls1 = ls0; ls1 < 32; ls1++) {
+            if ((m & (1u << ls1)) != 0) {
+                break;
+            }
+        }
+        for (ms0 = ls1; ms0 < 32; ms0++) {
+            if ((m & (1u << ms0)) == 0) {
+                break;
+            }
+        }
+        assert (ms0 == 32);
+
+        if (ls1 == 32) {
+            tcg_out_extr(s, ret, arg, 0, ls0, 0);
+        } else {
+            tcg_out_mov(s, ret, arg);
+            tcg_out32(s, INSN_DEPI | INSN_R2(ret) | INSN_IM5(0)
+                      | INSN_SHDEP_CP(31 - ls0) | INSN_DEP_LEN(ls1 - ls0));
+        }
     } else {
-        tcg_out32(s, LDIL | INSN_R2(ret) |
-                     reassemble_21(lrsel((uint32_t)arg, 0)));
-        if (arg & 0x7ff)
-            tcg_out32(s, LDO | INSN_R1(ret) | INSN_R2(ret) |
-                         reassemble_14(rrsel((uint32_t)arg, 0)));
+        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R1, m);
+        tcg_out_arith(s, ret, arg, TCG_REG_R1, INSN_AND);
     }
 }
 
-static inline void tcg_out_ld_raw(TCGContext *s, int ret,
-                                  tcg_target_long arg)
+static inline void tcg_out_ext8s(TCGContext *s, int ret, int arg)
 {
-    tcg_out32(s, LDIL | INSN_R2(ret) |
-                 reassemble_21(lrsel((uint32_t)arg, 0)));
-    tcg_out32(s, LDW | INSN_R1(ret) | INSN_R2(ret) |
-                 reassemble_14(rrsel((uint32_t)arg, 0)));
+    tcg_out_extr(s, ret, arg, 0, 8, 1);
 }
 
-static inline void tcg_out_ld_ptr(TCGContext *s, int ret,
-                                  tcg_target_long arg)
+static inline void tcg_out_ext16s(TCGContext *s, int ret, int arg)
 {
-    tcg_out_ld_raw(s, ret, arg);
+    tcg_out_extr(s, ret, arg, 0, 16, 1);
 }
 
-static inline void tcg_out_ldst(TCGContext *s, int ret, int addr, int offset,
-                                int op)
+static void tcg_out_shli(TCGContext *s, int ret, int arg, int count)
 {
-    if (offset == (offset & 0xfff))
-        tcg_out32(s, op | INSN_R1(ret) | INSN_R2(addr) |
-                 reassemble_14(offset));
-    else {
-        fprintf(stderr, "unimplemented %s with offset %d\n", __func__, offset);
-        tcg_abort();
-    }
+    count &= 31;
+    tcg_out32(s, INSN_ZDEP | INSN_R2(ret) | INSN_R1(arg)
+              | INSN_SHDEP_CP(31 - count) | INSN_DEP_LEN(32 - count));
 }
 
-static inline void tcg_out_ld(TCGContext *s, TCGType type, int ret,
-                              int arg1, tcg_target_long arg2)
+static void tcg_out_shl(TCGContext *s, int ret, int arg, int creg)
 {
-    fprintf(stderr, "unimplemented %s\n", __func__);
-    tcg_abort();
+    tcg_out_arithi(s, TCG_REG_R20, creg, 31, INSN_SUBI);
+    tcg_out_mtctl_sar(s, TCG_REG_R20);
+    tcg_out32(s, INSN_ZVDEP | INSN_R2(ret) | INSN_R1(arg) | INSN_DEP_LEN(32));
 }
 
-static inline void tcg_out_st(TCGContext *s, TCGType type, int ret,
-                              int arg1, tcg_target_long arg2)
+static void tcg_out_shri(TCGContext *s, int ret, int arg, int count)
 {
-    fprintf(stderr, "unimplemented %s\n", __func__);
-    tcg_abort();
+    count &= 31;
+    tcg_out_extr(s, ret, arg, count, 32 - count, 0);
 }
 
-static inline void tcg_out_arith(TCGContext *s, int t, int r1, int r2, int op)
+static void tcg_out_shr(TCGContext *s, int ret, int arg, int creg)
 {
-    tcg_out32(s, op | INSN_T(t) | INSN_R1(r1) | INSN_R2(r2));
+    tcg_out_vshd(s, ret, TCG_REG_R0, arg, creg);
 }
 
-static inline void tcg_out_arithi(TCGContext *s, int t, int r1,
-                                  tcg_target_long val, int op)
+static void tcg_out_sari(TCGContext *s, int ret, int arg, int count)
 {
-    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R20, val);
-    tcg_out_arith(s, t, r1, TCG_REG_R20, op);
+    count &= 31;
+    tcg_out_extr(s, ret, arg, count, 32 - count, 1);
 }
 
-static inline void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
+static void tcg_out_sar(TCGContext *s, int ret, int arg, int creg)
 {
-    tcg_out_arithi(s, reg, reg, val, ARITH_ADD);
+    tcg_out_arithi(s, TCG_REG_R20, creg, 31, INSN_SUBI);
+    tcg_out_mtctl_sar(s, TCG_REG_R20);
+    tcg_out32(s, INSN_VEXTRS | INSN_R1(ret) | INSN_R2(arg) | INSN_DEP_LEN(32));
 }
 
-static inline void tcg_out_nop(TCGContext *s)
+static void tcg_out_rotli(TCGContext *s, int ret, int arg, int count)
 {
-    tcg_out32(s, ARITH_OR | INSN_T(TCG_REG_R0) | INSN_R1(TCG_REG_R0) |
-                 INSN_R2(TCG_REG_R0));
+    count &= 31;
+    tcg_out_shd(s, ret, arg, arg, 32 - count);
 }
 
-static inline void tcg_out_ext8s(TCGContext *s, int ret, int arg) {
-    tcg_out32(s, EXTRS | INSN_R1(ret) | INSN_R2(arg) |
-                 INSN_SHDEP_P(31) | INSN_DEP_LEN(8));
+static void tcg_out_rotl(TCGContext *s, int ret, int arg, int creg)
+{
+    tcg_out_arithi(s, TCG_REG_R20, creg, 32, INSN_SUBI);
+    tcg_out_vshd(s, ret, arg, arg, TCG_REG_R20);
 }
 
-static inline void tcg_out_ext16s(TCGContext *s, int ret, int arg) {
-    tcg_out32(s, EXTRS | INSN_R1(ret) | INSN_R2(arg) |
-                 INSN_SHDEP_P(31) | INSN_DEP_LEN(16));
+static void tcg_out_rotri(TCGContext *s, int ret, int arg, int count)
+{
+    count &= 31;
+    tcg_out_shd(s, ret, arg, arg, count);
 }
 
-static inline void tcg_out_bswap16(TCGContext *s, int ret, int arg) {
-    if(ret != arg)
-        tcg_out_mov(s, ret, arg);
-    tcg_out32(s, DEP | INSN_R2(ret) | INSN_R1(ret) |
-                 INSN_SHDEP_CP(15) | INSN_DEP_LEN(8));
-    tcg_out32(s, SHD | INSN_T(ret) | INSN_R1(TCG_REG_R0) |
-                 INSN_R2(ret) | INSN_SHDEP_CP(8));
+static void tcg_out_rotr(TCGContext *s, int ret, int arg, int creg)
+{
+    tcg_out_vshd(s, ret, arg, arg, creg);
 }
 
-static inline void tcg_out_bswap32(TCGContext *s, int ret, int arg, int temp) {
-    tcg_out32(s, SHD | INSN_T(temp) | INSN_R1(arg) |
-                 INSN_R2(arg) | INSN_SHDEP_CP(16));
-    tcg_out32(s, DEP | INSN_R2(temp) | INSN_R1(temp) |
-                 INSN_SHDEP_CP(15) | INSN_DEP_LEN(8));
-    tcg_out32(s, SHD | INSN_T(ret) | INSN_R1(arg) |
-                 INSN_R2(temp) | INSN_SHDEP_CP(8));
+static void tcg_out_bswap16(TCGContext *s, int ret, int arg, int sign)
+{
+    if (ret != arg) {
+        tcg_out_mov(s, ret, arg);             /* arg =  xxAB */
+    }
+    tcg_out_dep(s, ret, ret, 16, 8);          /* ret =  xBAB */
+    tcg_out_extr(s, ret, ret, 8, 16, sign);   /* ret =  ..BA */
 }
 
-static inline void tcg_out_call(TCGContext *s, void *func)
+static void tcg_out_bswap32(TCGContext *s, int ret, int arg, int temp)
 {
-    uint32_t val = (uint32_t)__canonicalize_funcptr_for_compare(func);
-    tcg_out32(s, LDIL | INSN_R2(TCG_REG_R20) |
-                 reassemble_21(lrsel(val, 0)));
-    tcg_out32(s, BLE_SR4 | INSN_R2(TCG_REG_R20) |
-                 reassemble_17(rrsel(val, 0) >> 2));
-    tcg_out_mov(s, TCG_REG_RP, TCG_REG_R31);
+                                          /* arg =  ABCD */
+    tcg_out_rotri(s, temp, arg, 16);      /* temp = CDAB */
+    tcg_out_dep(s, temp, temp, 16, 8);    /* temp = CBAB */
+    tcg_out_shd(s, ret, arg, temp, 8);    /* ret =  DCBA */
 }
 
-#if defined(CONFIG_SOFTMMU)
+static void tcg_out_call(TCGContext *s, void *func)
+{
+    tcg_target_long val, hi, lo, disp;
+
+    val = (uint32_t)__canonicalize_funcptr_for_compare(func);
+    disp = (val - ((tcg_target_long)s->code_ptr + 8)) >> 2;
+
+    if (check_fit_tl(disp, 17)) {
+        tcg_out32(s, INSN_BL_N | INSN_R2(TCG_REG_RP) | reassemble_17(disp));
+    } else {
+        hi = val >> 11;
+        lo = val & 0x7ff;
+
+        tcg_out32(s, INSN_LDIL | INSN_R2(TCG_REG_R20) | reassemble_21(hi));
+        tcg_out32(s, INSN_BLE_SR4 | INSN_R2(TCG_REG_R20)
+                  | reassemble_17(lo >> 2));
+        tcg_out_mov(s, TCG_REG_RP, TCG_REG_R31);
+    }
+}
 
+static void tcg_out_xmpyu(TCGContext *s, int retl, int reth,
+                          int arg1, int arg2)
+{
+    /* Store both words into the stack for copy to the FPU.  */
+    tcg_out_ldst(s, arg1, TCG_REG_SP, STACK_TEMP_OFS, INSN_STW);
+    tcg_out_ldst(s, arg2, TCG_REG_SP, STACK_TEMP_OFS + 4, INSN_STW);
+
+    /* Load both words into the FPU at the same time.  We get away
+       with this because we can address the left and right half of the
+       FPU registers individually once loaded.  */
+    /* fldds stack_temp(sp),fr22 */
+    tcg_out32(s, INSN_FLDDS | INSN_R2(TCG_REG_SP)
+              | INSN_IM5(STACK_TEMP_OFS) | INSN_T(22));
+
+    /* xmpyu fr22r,fr22,fr22 */
+    tcg_out32(s, 0x3ad64796);
+
+    /* Store the 64-bit result back into the stack.  */
+    /* fstds stack_temp(sp),fr22 */
+    tcg_out32(s, INSN_FSTDS | INSN_R2(TCG_REG_SP)
+              | INSN_IM5(STACK_TEMP_OFS) | INSN_T(22));
+
+    /* Load the pieces of the result that the caller requested.  */
+    if (reth) {
+        tcg_out_ldst(s, reth, TCG_REG_SP, STACK_TEMP_OFS, INSN_LDW);
+    }
+    if (retl) {
+        tcg_out_ldst(s, retl, TCG_REG_SP, STACK_TEMP_OFS + 4, INSN_LDW);
+    }
+}
+
+static void tcg_out_branch(TCGContext *s, int label_index, int nul)
+{
+    TCGLabel *l = &s->labels[label_index];
+    uint32_t op = nul ? INSN_BL_N : INSN_BL;
+
+    if (l->has_value) {
+        tcg_target_long val = l->u.value;
+
+        val -= (tcg_target_long)s->code_ptr + 8;
+        val >>= 2;
+        assert(check_fit_tl(val, 17));
+
+        tcg_out32(s, op | reassemble_17(val));
+    } else {
+        tcg_out_reloc(s, s->code_ptr, R_PARISC_PCREL17F, label_index, 0);
+        tcg_out32(s, op);
+    }
+}
+
+static const uint8_t tcg_cond_to_cmp_cond[10] =
+{
+    [TCG_COND_EQ] = COND_EQ,
+    [TCG_COND_NE] = COND_EQ | COND_FALSE,
+    [TCG_COND_LT] = COND_LT,
+    [TCG_COND_GE] = COND_LT | COND_FALSE,
+    [TCG_COND_LE] = COND_LE,
+    [TCG_COND_GT] = COND_LE | COND_FALSE,
+    [TCG_COND_LTU] = COND_LTU,
+    [TCG_COND_GEU] = COND_LTU | COND_FALSE,
+    [TCG_COND_LEU] = COND_LEU,
+    [TCG_COND_GTU] = COND_LEU | COND_FALSE,
+};
+
+static void tcg_out_brcond(TCGContext *s, int cond, TCGArg c1,
+                           TCGArg c2, int c2const, int label_index)
+{
+    TCGLabel *l = &s->labels[label_index];
+    int op, pacond;
+
+    /* Note that COMIB operates as if the immediate is the first
+       operand.  We model brcond with the immediate in the second
+       to better match what targets are likely to give us.  For
+       consistency, model COMB with reversed operands as well.  */
+    pacond = tcg_cond_to_cmp_cond[tcg_swap_cond(cond)];
+
+    if (c2const) {
+        op = (pacond & COND_FALSE ? INSN_COMIBF : INSN_COMIBT);
+        op |= INSN_IM5(c2);
+    } else {
+        op = (pacond & COND_FALSE ? INSN_COMBF : INSN_COMBT);
+        op |= INSN_R1(c2);
+    }
+    op |= INSN_R2(c1);
+    op |= INSN_COND(pacond & 7);
+
+    if (l->has_value) {
+        tcg_target_long val = l->u.value;
+
+        val -= (tcg_target_long)s->code_ptr + 8;
+        val >>= 2;
+        assert(check_fit_tl(val, 12));
+
+        /* ??? Assume that all branches to defined labels are backward.
+           Which means that if the nul bit is set, the delay slot is
+           executed if the branch is taken, and not executed in fallthru.  */
+        tcg_out32(s, op | reassemble_12(val));
+        tcg_out_nop(s);
+    } else {
+        tcg_out_reloc(s, s->code_ptr, R_PARISC_PCREL12F, label_index, 0);
+        /* ??? Assume that all branches to undefined labels are forward.
+           Which means that if the nul bit is set, the delay slot is
+           not executed if the branch is taken, which is what we want.  */
+        tcg_out32(s, op | 2);
+    }
+}
+
+static void tcg_out_comclr(TCGContext *s, int cond, TCGArg ret,
+                           TCGArg c1, TCGArg c2, int c2const)
+{
+    int op, pacond;
+
+    /* Note that COMICLR operates as if the immediate is the first
+       operand.  We model setcond with the immediate in the second
+       to better match what targets are likely to give us.  For
+       consistency, model COMCLR with reversed operands as well.  */
+    pacond = tcg_cond_to_cmp_cond[tcg_swap_cond(cond)];
+
+    if (c2const) {
+        op = INSN_COMICLR | INSN_R2(c1) | INSN_R1(ret) | INSN_IM11(c2);
+    } else {
+        op = INSN_COMCLR | INSN_R2(c1) | INSN_R1(c2) | INSN_T(ret);
+    }
+    op |= INSN_COND(pacond & 7);
+    op |= pacond & COND_FALSE ? 1 << 12 : 0;
+
+    tcg_out32(s, op);
+}
+
+static void tcg_out_brcond2(TCGContext *s, int cond, TCGArg al, TCGArg ah,
+                            TCGArg bl, int blconst, TCGArg bh, int bhconst,
+                            int label_index)
+{
+    switch (cond) {
+    case TCG_COND_EQ:
+    case TCG_COND_NE:
+        tcg_out_comclr(s, tcg_invert_cond(cond), TCG_REG_R0, al, bl, blconst);
+        tcg_out_brcond(s, cond, ah, bh, bhconst, label_index);
+        break;
+
+    default:
+        tcg_out_brcond(s, cond, ah, bh, bhconst, label_index);
+        tcg_out_comclr(s, TCG_COND_NE, TCG_REG_R0, ah, bh, bhconst);
+        tcg_out_brcond(s, tcg_unsigned_cond(cond),
+                       al, bl, blconst, label_index);
+        break;
+    }
+}
+
+static void tcg_out_setcond(TCGContext *s, int cond, TCGArg ret,
+                            TCGArg c1, TCGArg c2, int c2const)
+{
+    tcg_out_comclr(s, tcg_invert_cond(cond), ret, c1, c2, c2const);
+    tcg_out_movi(s, TCG_TYPE_I32, ret, 1);
+}
+
+static void tcg_out_setcond2(TCGContext *s, int cond, TCGArg ret,
+                             TCGArg al, TCGArg ah, TCGArg bl, int blconst,
+                             TCGArg bh, int bhconst)
+{
+    int scratch = TCG_REG_R20;
+
+    if (ret != al && ret != ah
+        && (blconst || ret != bl)
+        && (bhconst || ret != bh)) {
+        scratch = ret;
+    }
+
+    switch (cond) {
+    case TCG_COND_EQ:
+    case TCG_COND_NE:
+        tcg_out_setcond(s, cond, scratch, al, bl, blconst);
+        tcg_out_comclr(s, TCG_COND_EQ, TCG_REG_R0, ah, bh, bhconst);
+        tcg_out_movi(s, TCG_TYPE_I32, scratch, cond == TCG_COND_NE);
+        break;
+
+    default:
+        tcg_out_setcond(s, tcg_unsigned_cond(cond), scratch, al, bl, blconst);
+        tcg_out_comclr(s, TCG_COND_EQ, TCG_REG_R0, ah, bh, bhconst);
+        tcg_out_movi(s, TCG_TYPE_I32, scratch, 0);
+        tcg_out_comclr(s, cond, TCG_REG_R0, ah, bh, bhconst);
+        tcg_out_movi(s, TCG_TYPE_I32, scratch, 1);
+        break;
+    }
+
+    tcg_out_mov(s, ret, scratch);
+}
+
+#if defined(CONFIG_SOFTMMU)
 #include "../../softmmu_defs.h"
 
 static void *qemu_ld_helpers[4] = {
@@ -363,30 +848,77 @@  static void *qemu_st_helpers[4] = {
     __stl_mmu,
     __stq_mmu,
 };
+
+/* Load and compare a TLB entry, and branch if TLB miss.  OFFSET is set to
+   the offset of the first ADDR_READ or ADDR_WRITE member of the appropriate
+   TLB for the memory index.  The return value is the offset from ENV 
+   contained in R1 afterward (to be used when loading ADDEND); if the
+   return value is 0, R1 is not used.  */
+
+static int tcg_out_tlb_read(TCGContext *s, int r0, int r1, int addrlo,
+                            int addrhi, int s_bits, int lab_miss, int offset)
+{
+    int ret;
+
+    /* Extracting the index into the TLB.  The "normal C operation" is
+          r1 = addr_reg >> TARGET_PAGE_BITS;
+          r1 &= CPU_TLB_SIZE - 1;
+          r1 <<= CPU_TLB_ENTRY_BITS;
+       What this does is extract CPU_TLB_BITS beginning at TARGET_PAGE_BITS
+       and place them at CPU_TLB_ENTRY_BITS.  We can combine the first two
+       operations with an EXTRU.  Unfortunately, the current value of
+       CPU_TLB_ENTRY_BITS is > 3, so we can't merge that shift with the
+       add that follows.  */
+    tcg_out_extr(s, r1, addrlo, TARGET_PAGE_BITS, CPU_TLB_BITS, 0);
+    tcg_out_andi(s, r0, addrlo, TARGET_PAGE_MASK | ((1 << s_bits) - 1));
+    tcg_out_shli(s, r1, r1, CPU_TLB_ENTRY_BITS);
+    tcg_out_arith(s, r1, r1, TCG_AREG0, INSN_ADDL);
+
+    /* Make sure that both the addr_{read,write} and addend can be
+       read with a 14-bit offset from the same base register.  */
+    if (check_fit_tl(offset + CPU_TLB_SIZE, 14)) {
+        ret = 0;
+    } else {
+        ret = (offset + 0x400) & ~0x7ff;
+        offset = ret - offset;
+        tcg_out_addi2(s, TCG_REG_R1, r1, ret);
+        r1 = TCG_REG_R1;
+    }
+
+    /* Load the entry from the computed slot.  */
+    if (TARGET_LONG_BITS == 64) {
+        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R23, r1, offset);
+        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20, r1, offset + 4);
+    } else {
+        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20, r1, offset);
+    }
+
+    /* If not equal, jump to lab_miss. */
+    if (TARGET_LONG_BITS == 64) {
+        tcg_out_brcond2(s, TCG_COND_NE, TCG_REG_R20, TCG_REG_R23,
+                        r0, 0, addrhi, 0, lab_miss);
+    } else {
+        tcg_out_brcond(s, TCG_COND_NE, TCG_REG_R20, r0, 0, lab_miss);
+    }
+
+    return ret;
+}
 #endif
 
 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
 {
-    int addr_reg, data_reg, data_reg2, r0, r1, mem_index, s_bits, bswap;
+    int addr_reg, addr_reg2;
+    int data_reg, data_reg2;
+    int r0, r1, mem_index, s_bits, bswap;
+    tcg_target_long offset;
 #if defined(CONFIG_SOFTMMU)
-    uint32_t *label1_ptr, *label2_ptr;
-#endif
-#if TARGET_LONG_BITS == 64
-#if defined(CONFIG_SOFTMMU)
-    uint32_t *label3_ptr;
-#endif
-    int addr_reg2;
+    int lab1, lab2, argreg;
 #endif
 
     data_reg = *args++;
-    if (opc == 3)
-        data_reg2 = *args++;
-    else
-        data_reg2 = 0; /* suppress warning */
+    data_reg2 = (opc == 3 ? *args++ : TCG_REG_R0);
     addr_reg = *args++;
-#if TARGET_LONG_BITS == 64
-    addr_reg2 = *args++;
-#endif
+    addr_reg2 = (TARGET_LONG_BITS == 64 ? *args++ : TCG_REG_R0);
     mem_index = *args;
     s_bits = opc & 3;
 
@@ -394,96 +926,22 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
     r1 = TCG_REG_R25;
 
 #if defined(CONFIG_SOFTMMU)
-    tcg_out_mov(s, r1, addr_reg);
+    lab1 = gen_new_label();
+    lab2 = gen_new_label();
 
-    tcg_out_mov(s, r0, addr_reg);
+    offset = tcg_out_tlb_read(s, r0, r1, addr_reg, addr_reg2, s_bits, lab1,
+                              offsetof(CPUState,
+                                       tlb_table[mem_index][0].addr_read));
 
-    tcg_out32(s, SHD | INSN_T(r1) | INSN_R1(TCG_REG_R0) | INSN_R2(r1) |
-                 INSN_SHDEP_CP(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS));
+    /* TLB Hit.  */
+    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20, (offset ? TCG_REG_R1 : r1),
+               offsetof(CPUState, tlb_table[mem_index][0].addend) - offset);
 
-    tcg_out_arithi(s, r0, r0, TARGET_PAGE_MASK | ((1 << s_bits) - 1),
-                   ARITH_AND);
-
-    tcg_out_arithi(s, r1, r1, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS,
-                   ARITH_AND);
-
-    tcg_out_arith(s, r1, r1, TCG_AREG0, ARITH_ADD);
-    tcg_out_arithi(s, r1, r1,
-                   offsetof(CPUState, tlb_table[mem_index][0].addr_read),
-                   ARITH_ADD);
-
-    tcg_out_ldst(s, TCG_REG_R20, r1, 0, LDW);
-
-#if TARGET_LONG_BITS == 32
-    /* if equal, jump to label1 */
-    label1_ptr = (uint32_t *)s->code_ptr;
-    tcg_out32(s, COMBT | INSN_R1(TCG_REG_R20) | INSN_R2(r0) |
-                 INSN_COND(COND_EQUAL));
-    tcg_out_mov(s, r0, addr_reg); /* delay slot */
-#else
-    /* if not equal, jump to label3 */
-    label3_ptr = (uint32_t *)s->code_ptr;
-    tcg_out32(s, COMBF | INSN_R1(TCG_REG_R20) | INSN_R2(r0) |
-                 INSN_COND(COND_EQUAL));
-    tcg_out_mov(s, r0, addr_reg); /* delay slot */
-
-    tcg_out_ldst(s, TCG_REG_R20, r1, 4, LDW);
-
-    /* if equal, jump to label1 */
-    label1_ptr = (uint32_t *)s->code_ptr;
-    tcg_out32(s, COMBT | INSN_R1(TCG_REG_R20) | INSN_R2(addr_reg2) |
-                 INSN_COND(COND_EQUAL));
-    tcg_out_nop(s); /* delay slot */
-
-    /* label3: */
-    *label3_ptr |= reassemble_12((uint32_t *)s->code_ptr - label3_ptr - 2);
-#endif
-
-#if TARGET_LONG_BITS == 32
-    tcg_out_mov(s, TCG_REG_R26, addr_reg);
-    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R25, mem_index);
-#else
-    tcg_out_mov(s, TCG_REG_R26, addr_reg);
-    tcg_out_mov(s, TCG_REG_R25, addr_reg2);
-    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R24, mem_index);
-#endif
-
-    tcg_out_call(s, qemu_ld_helpers[s_bits]);
-
-    switch(opc) {
-        case 0 | 4:
-            tcg_out_ext8s(s, data_reg, TCG_REG_RET0);
-            break;
-        case 1 | 4:
-            tcg_out_ext16s(s, data_reg, TCG_REG_RET0);
-            break;
-        case 0:
-        case 1:
-        case 2:
-        default:
-            tcg_out_mov(s, data_reg, TCG_REG_RET0);
-            break;
-        case 3:
-            tcg_abort();
-            tcg_out_mov(s, data_reg, TCG_REG_RET0);
-            tcg_out_mov(s, data_reg2, TCG_REG_RET1);
-            break;
-    }
-
-    /* jump to label2 */
-    label2_ptr = (uint32_t *)s->code_ptr;
-    tcg_out32(s, BL | INSN_R2(TCG_REG_R0) | 2);
-
-    /* label1: */
-    *label1_ptr |= reassemble_12((uint32_t *)s->code_ptr - label1_ptr - 2);
-
-    tcg_out_arithi(s, TCG_REG_R20, r1,
-                   offsetof(CPUTLBEntry, addend) - offsetof(CPUTLBEntry, addr_read),
-                   ARITH_ADD);
-    tcg_out_ldst(s, TCG_REG_R20, TCG_REG_R20, 0, LDW);
-    tcg_out_arith(s, r0, r0, TCG_REG_R20, ARITH_ADD);
+    tcg_out_arith(s, r0, addr_reg, TCG_REG_R20, INSN_ADDL);
+    offset = TCG_REG_R0;
 #else
     r0 = addr_reg;
+    offset = GUEST_BASE ? TCG_GUEST_BASE_REG : TCG_REG_R0;
 #endif
 
 #ifdef TARGET_WORDS_BIGENDIAN
@@ -492,190 +950,151 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
     bswap = 1;
 #endif
     switch (opc) {
-        case 0:
-            tcg_out_ldst(s, data_reg, r0, 0, LDB);
-            break;
-        case 0 | 4:
-            tcg_out_ldst(s, data_reg, r0, 0, LDB);
-            tcg_out_ext8s(s, data_reg, data_reg);
-            break;
-        case 1:
-            tcg_out_ldst(s, data_reg, r0, 0, LDH);
-            if (bswap)
-                tcg_out_bswap16(s, data_reg, data_reg);
-            break;
-        case 1 | 4:
-            tcg_out_ldst(s, data_reg, r0, 0, LDH);
-            if (bswap)
-                tcg_out_bswap16(s, data_reg, data_reg);
+    case 0:
+        tcg_out_ldst_index(s, data_reg, r0, offset, INSN_LDBX);
+        break;
+    case 0 | 4:
+        tcg_out_ldst_index(s, data_reg, r0, offset, INSN_LDBX);
+        tcg_out_ext8s(s, data_reg, data_reg);
+        break;
+    case 1:
+        tcg_out_ldst_index(s, data_reg, r0, offset, INSN_LDHX);
+        if (bswap) {
+            tcg_out_bswap16(s, data_reg, data_reg, 0);
+        }
+        break;
+    case 1 | 4:
+        tcg_out_ldst_index(s, data_reg, r0, offset, INSN_LDHX);
+        if (bswap) {
+            tcg_out_bswap16(s, data_reg, data_reg, 1);
+        } else {
             tcg_out_ext16s(s, data_reg, data_reg);
-            break;
-        case 2:
-            tcg_out_ldst(s, data_reg, r0, 0, LDW);
-            if (bswap)
-                tcg_out_bswap32(s, data_reg, data_reg, TCG_REG_R20);
-            break;
-        case 3:
-            tcg_abort();
-            if (!bswap) {
-                tcg_out_ldst(s, data_reg, r0, 0, LDW);
-                tcg_out_ldst(s, data_reg2, r0, 4, LDW);
+        }
+        break;
+    case 2:
+        tcg_out_ldst_index(s, data_reg, r0, offset, INSN_LDWX);
+        if (bswap) {
+            tcg_out_bswap32(s, data_reg, data_reg, TCG_REG_R20);
+        }
+        break;
+    case 3:
+        if (bswap) {
+            int t = data_reg2;
+            data_reg2 = data_reg;
+            data_reg = t;
+        }
+        if (offset == TCG_REG_R0) {
+            /* Make sure not to clobber the base register.  */
+            if (data_reg2 == r0) {
+                tcg_out_ldst(s, data_reg, r0, 4, INSN_LDW);
+                tcg_out_ldst(s, data_reg2, r0, 0, INSN_LDW);
             } else {
-                tcg_out_ldst(s, data_reg, r0, 4, LDW);
-                tcg_out_bswap32(s, data_reg, data_reg, TCG_REG_R20);
-                tcg_out_ldst(s, data_reg2, r0, 0, LDW);
-                tcg_out_bswap32(s, data_reg2, data_reg2, TCG_REG_R20);
+                tcg_out_ldst(s, data_reg2, r0, 0, INSN_LDW);
+                tcg_out_ldst(s, data_reg, r0, 4, INSN_LDW);
             }
-            break;
-        default:
-            tcg_abort();
+        } else {
+            tcg_out_addi2(s, TCG_REG_R20, r0, 4);
+            tcg_out_ldst_index(s, data_reg2, r0, offset, INSN_LDWX);
+            tcg_out_ldst_index(s, data_reg, TCG_REG_R20, offset, INSN_LDWX);
+        }
+        if (bswap) {
+            tcg_out_bswap32(s, data_reg, data_reg, TCG_REG_R20);
+            tcg_out_bswap32(s, data_reg2, data_reg2, TCG_REG_R20);
+        }
+        break;
+    default:
+        tcg_abort();
     }
 
 #if defined(CONFIG_SOFTMMU)
+    tcg_out_branch(s, lab2, 1);
+
+    /* TLB Miss.  */
+    /* label1: */
+    tcg_out_label(s, lab1, (tcg_target_long)s->code_ptr);
+
+    argreg = TCG_REG_R26;
+    tcg_out_mov(s, argreg--, addr_reg);
+    if (TARGET_LONG_BITS == 64) {
+        tcg_out_mov(s, argreg--, addr_reg2);
+    }
+    tcg_out_movi(s, TCG_TYPE_I32, argreg, mem_index);
+
+    tcg_out_call(s, qemu_ld_helpers[s_bits]);
+
+    switch (opc) {
+    case 0:
+        tcg_out_andi(s, data_reg, TCG_REG_RET0, 0xff);
+        break;
+    case 0 | 4:
+        tcg_out_ext8s(s, data_reg, TCG_REG_RET0);
+        break;
+    case 1:
+        tcg_out_andi(s, data_reg, TCG_REG_RET0, 0xffff);
+        break;
+    case 1 | 4:
+        tcg_out_ext16s(s, data_reg, TCG_REG_RET0);
+        break;
+    case 2:
+    case 2 | 4:
+        tcg_out_mov(s, data_reg, TCG_REG_RET0);
+        break;
+    case 3:
+        tcg_out_mov(s, data_reg, TCG_REG_RET0);
+        tcg_out_mov(s, data_reg2, TCG_REG_RET1);
+        break;
+    default:
+        tcg_abort();
+    }
+
     /* label2: */
-    *label2_ptr |= reassemble_17((uint32_t *)s->code_ptr - label2_ptr - 2);
+    tcg_out_label(s, lab2, (tcg_target_long)s->code_ptr);
 #endif
 }
 
 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
 {
-    int addr_reg, data_reg, data_reg2, r0, r1, mem_index, s_bits, bswap;
-#if defined(CONFIG_SOFTMMU)
-    uint32_t *label1_ptr, *label2_ptr;
-#endif
-#if TARGET_LONG_BITS == 64
+    int addr_reg, addr_reg2;
+    int data_reg, data_reg2;
+    int r0, r1, mem_index, s_bits, bswap;
 #if defined(CONFIG_SOFTMMU)
-    uint32_t *label3_ptr;
-#endif
-    int addr_reg2;
+    tcg_target_long offset;
+    int lab1, lab2, argreg;
 #endif
 
     data_reg = *args++;
-    if (opc == 3)
-        data_reg2 = *args++;
-    else
-        data_reg2 = 0; /* suppress warning */
+    data_reg2 = (opc == 3 ? *args++ : 0);
     addr_reg = *args++;
-#if TARGET_LONG_BITS == 64
-    addr_reg2 = *args++;
-#endif
+    addr_reg2 = (TARGET_LONG_BITS == 64 ? *args++ : 0);
     mem_index = *args;
-
     s_bits = opc;
 
     r0 = TCG_REG_R26;
     r1 = TCG_REG_R25;
 
 #if defined(CONFIG_SOFTMMU)
-    tcg_out_mov(s, r1, addr_reg);
-
-    tcg_out_mov(s, r0, addr_reg);
-
-    tcg_out32(s, SHD | INSN_T(r1) | INSN_R1(TCG_REG_R0) | INSN_R2(r1) |
-                 INSN_SHDEP_CP(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS));
-
-    tcg_out_arithi(s, r0, r0, TARGET_PAGE_MASK | ((1 << s_bits) - 1),
-                   ARITH_AND);
-
-    tcg_out_arithi(s, r1, r1, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS,
-                   ARITH_AND);
+    lab1 = gen_new_label();
+    lab2 = gen_new_label();
 
-    tcg_out_arith(s, r1, r1, TCG_AREG0, ARITH_ADD);
-    tcg_out_arithi(s, r1, r1,
-                   offsetof(CPUState, tlb_table[mem_index][0].addr_write),
-                   ARITH_ADD);
+    offset = tcg_out_tlb_read(s, r0, r1, addr_reg, addr_reg2, s_bits, lab1,
+                              offsetof(CPUState,
+                                       tlb_table[mem_index][0].addr_write));
 
-    tcg_out_ldst(s, TCG_REG_R20, r1, 0, LDW);
+    /* TLB Hit.  */
+    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20, (offset ? TCG_REG_R1 : r1),
+               offsetof(CPUState, tlb_table[mem_index][0].addend) - offset);
 
-#if TARGET_LONG_BITS == 32
-    /* if equal, jump to label1 */
-    label1_ptr = (uint32_t *)s->code_ptr;
-    tcg_out32(s, COMBT | INSN_R1(TCG_REG_R20) | INSN_R2(r0) |
-                 INSN_COND(COND_EQUAL));
-    tcg_out_mov(s, r0, addr_reg); /* delay slot */
+    tcg_out_arith(s, r0, addr_reg, TCG_REG_R20, INSN_ADDL);
 #else
-    /* if not equal, jump to label3 */
-    label3_ptr = (uint32_t *)s->code_ptr;
-    tcg_out32(s, COMBF | INSN_R1(TCG_REG_R20) | INSN_R2(r0) |
-                 INSN_COND(COND_EQUAL));
-    tcg_out_mov(s, r0, addr_reg); /* delay slot */
-
-    tcg_out_ldst(s, TCG_REG_R20, r1, 4, LDW);
-
-    /* if equal, jump to label1 */
-    label1_ptr = (uint32_t *)s->code_ptr;
-    tcg_out32(s, COMBT | INSN_R1(TCG_REG_R20) | INSN_R2(addr_reg2) |
-                 INSN_COND(COND_EQUAL));
-    tcg_out_nop(s); /* delay slot */
-
-    /* label3: */
-    *label3_ptr |= reassemble_12((uint32_t *)s->code_ptr - label3_ptr - 2);
-#endif
-
-    tcg_out_mov(s, TCG_REG_R26, addr_reg);
-#if TARGET_LONG_BITS == 64
-    tcg_out_mov(s, TCG_REG_R25, addr_reg2);
-    if (opc == 3) {
-        tcg_abort();
-        tcg_out_mov(s, TCG_REG_R24, data_reg);
-        tcg_out_mov(s, TCG_REG_R23, data_reg2);
-        /* TODO: push mem_index */
-        tcg_abort();
+    /* There are no indexed stores, so if GUEST_BASE is set
+       we must do the add explicitly.  Careful to avoid R20,
+       which is used for the bswaps to follow.  */
+    if (GUEST_BASE == 0) {
+        r0 = addr_reg;
     } else {
-        switch(opc) {
-        case 0:
-            tcg_out32(s, EXTRU | INSN_R1(TCG_REG_R24) | INSN_R2(data_reg) |
-                         INSN_SHDEP_P(31) | INSN_DEP_LEN(8));
-            break;
-        case 1:
-            tcg_out32(s, EXTRU | INSN_R1(TCG_REG_R24) | INSN_R2(data_reg) |
-                         INSN_SHDEP_P(31) | INSN_DEP_LEN(16));
-            break;
-        case 2:
-            tcg_out_mov(s, TCG_REG_R24, data_reg);
-            break;
-        }
-        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R23, mem_index);
+        tcg_out_arith(s, TCG_REG_R31, addr_reg, TCG_GUEST_BASE_REG, INSN_ADDL);
+        r0 = TCG_REG_R31;
     }
-#else
-    if (opc == 3) {
-        tcg_abort();
-        tcg_out_mov(s, TCG_REG_R25, data_reg);
-        tcg_out_mov(s, TCG_REG_R24, data_reg2);
-        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R23, mem_index);
-    } else {
-        switch(opc) {
-        case 0:
-            tcg_out32(s, EXTRU | INSN_R1(TCG_REG_R25) | INSN_R2(data_reg) |
-                         INSN_SHDEP_P(31) | INSN_DEP_LEN(8));
-            break;
-        case 1:
-            tcg_out32(s, EXTRU | INSN_R1(TCG_REG_R25) | INSN_R2(data_reg) |
-                         INSN_SHDEP_P(31) | INSN_DEP_LEN(16));
-            break;
-        case 2:
-            tcg_out_mov(s, TCG_REG_R25, data_reg);
-            break;
-        }
-        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R24, mem_index);
-    }
-#endif
-    tcg_out_call(s, qemu_st_helpers[s_bits]);
-
-    /* jump to label2 */
-    label2_ptr = (uint32_t *)s->code_ptr;
-    tcg_out32(s, BL | INSN_R2(TCG_REG_R0) | 2);
-
-    /* label1: */
-    *label1_ptr |= reassemble_12((uint32_t *)s->code_ptr - label1_ptr - 2);
-
-    tcg_out_arithi(s, TCG_REG_R20, r1,
-                   offsetof(CPUTLBEntry, addend) - offsetof(CPUTLBEntry, addr_write),
-                   ARITH_ADD);
-    tcg_out_ldst(s, TCG_REG_R20, TCG_REG_R20, 0, LDW);
-    tcg_out_arith(s, r0, r0, TCG_REG_R20, ARITH_ADD);
-#else
-    r0 = addr_reg;
 #endif
 
 #ifdef TARGET_WORDS_BIGENDIAN
@@ -685,170 +1104,345 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
 #endif
     switch (opc) {
     case 0:
-        tcg_out_ldst(s, data_reg, r0, 0, STB);
+        tcg_out_ldst(s, data_reg, r0, 0, INSN_STB);
         break;
     case 1:
         if (bswap) {
-            tcg_out_bswap16(s, TCG_REG_R20, data_reg);
+            tcg_out_bswap16(s, TCG_REG_R20, data_reg, 0);
             data_reg = TCG_REG_R20;
         }
-        tcg_out_ldst(s, data_reg, r0, 0, STH);
+        tcg_out_ldst(s, data_reg, r0, 0, INSN_STH);
         break;
     case 2:
         if (bswap) {
             tcg_out_bswap32(s, TCG_REG_R20, data_reg, TCG_REG_R20);
             data_reg = TCG_REG_R20;
         }
-        tcg_out_ldst(s, data_reg, r0, 0, STW);
+        tcg_out_ldst(s, data_reg, r0, 0, INSN_STW);
         break;
     case 3:
-        tcg_abort();
-        if (!bswap) {
-            tcg_out_ldst(s, data_reg, r0, 0, STW);
-            tcg_out_ldst(s, data_reg2, r0, 4, STW);
-        } else {
+        if (bswap) {
             tcg_out_bswap32(s, TCG_REG_R20, data_reg, TCG_REG_R20);
-            tcg_out_ldst(s, TCG_REG_R20, r0, 4, STW);
-            tcg_out_bswap32(s, TCG_REG_R20, data_reg2, TCG_REG_R20);
-            tcg_out_ldst(s, TCG_REG_R20, r0, 0, STW);
+            tcg_out_bswap32(s, TCG_REG_R23, data_reg2, TCG_REG_R23);
+            data_reg2 = TCG_REG_R20;
+            data_reg = TCG_REG_R23;
         }
+        tcg_out_ldst(s, data_reg2, r0, 0, INSN_STW);
+        tcg_out_ldst(s, data_reg, r0, 4, INSN_STW);
         break;
     default:
         tcg_abort();
     }
 
 #if defined(CONFIG_SOFTMMU)
+    tcg_out_branch(s, lab2, 1);
+
+    /* TLB Miss.  */
+    /* label1: */
+    tcg_out_label(s, lab1, (tcg_target_long)s->code_ptr);
+
+    argreg = TCG_REG_R26;
+    tcg_out_mov(s, argreg--, addr_reg);
+    if (TARGET_LONG_BITS == 64) {
+        tcg_out_mov(s, argreg--, addr_reg2);
+    }
+
+    switch(opc) {
+    case 0:
+        tcg_out_andi(s, argreg--, data_reg, 0xff);
+        tcg_out_movi(s, TCG_TYPE_I32, argreg, mem_index);
+        break;
+    case 1:
+        tcg_out_andi(s, argreg--, data_reg, 0xffff);
+        tcg_out_movi(s, TCG_TYPE_I32, argreg, mem_index);
+        break;
+    case 2:
+        tcg_out_mov(s, argreg--, data_reg);
+        tcg_out_movi(s, TCG_TYPE_I32, argreg, mem_index);
+        break;
+    case 3:
+        /* Because of the alignment required by the 64-bit data argument,
+           we will always use R23/R24.  Also, we will always run out of
+           argument registers for storing mem_index, so that will have 
+           to go on the stack.  */
+        if (mem_index == 0) {
+            argreg = TCG_REG_R0;
+        } else {
+            argreg = TCG_REG_R20;
+            tcg_out_movi(s, TCG_TYPE_I32, argreg, mem_index);
+        }
+        tcg_out_mov(s, TCG_REG_R23, data_reg2);
+        tcg_out_mov(s, TCG_REG_R24, data_reg);
+        tcg_out_st(s, TCG_TYPE_I32, argreg, TCG_REG_SP,
+                   TCG_TARGET_CALL_STACK_OFFSET - 4);
+        break;
+    default:
+        tcg_abort();
+    }
+
+    tcg_out_call(s, qemu_st_helpers[s_bits]);
+
     /* label2: */
-    *label2_ptr |= reassemble_17((uint32_t *)s->code_ptr - label2_ptr - 2);
+    tcg_out_label(s, lab2, (tcg_target_long)s->code_ptr);
 #endif
 }
 
+static void tcg_out_exit_tb(TCGContext *s, TCGArg arg)
+{
+    if (!check_fit_tl(arg, 14)) {
+        uint32_t hi, lo;
+        hi = arg & ~0x7ff;
+        lo = arg & 0x7ff;
+        if (lo) {
+            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RET0, hi);
+            tcg_out32(s, INSN_BV | INSN_R2(TCG_REG_R18));
+            tcg_out_addi(s, TCG_REG_RET0, lo);
+            return;
+        }
+        arg = hi;
+    }
+    tcg_out32(s, INSN_BV | INSN_R2(TCG_REG_R18));
+    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RET0, arg);
+}
+
+static void tcg_out_goto_tb(TCGContext *s, TCGArg arg)
+{
+    if (s->tb_jmp_offset) {
+        /* direct jump method */
+        fprintf(stderr, "goto_tb direct\n");
+        tcg_abort();
+    } else {
+        /* indirect jump method */
+        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20, TCG_REG_R0,
+                   (tcg_target_long)(s->tb_next + arg));
+        tcg_out32(s, INSN_BV_N | INSN_R2(TCG_REG_R20));
+    }
+    s->tb_next_offset[arg] = s->code_ptr - s->code_buf;
+}
+
 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
                               const int *const_args)
 {
-    int c;
-
     switch (opc) {
     case INDEX_op_exit_tb:
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RET0, args[0]);
-        tcg_out32(s, BV_N | INSN_R2(TCG_REG_R18));
+        tcg_out_exit_tb(s, args[0]);
         break;
     case INDEX_op_goto_tb:
-        if (s->tb_jmp_offset) {
-            /* direct jump method */
-            fprintf(stderr, "goto_tb direct\n");
-            tcg_abort();
-            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R20, args[0]);
-            tcg_out32(s, BV_N | INSN_R2(TCG_REG_R20));
-            s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf;
-        } else {
-            /* indirect jump method */
-            tcg_out_ld_ptr(s, TCG_REG_R20,
-                           (tcg_target_long)(s->tb_next + args[0]));
-            tcg_out32(s, BV_N | INSN_R2(TCG_REG_R20));
-        }
-        s->tb_next_offset[args[0]] = s->code_ptr - s->code_buf;
+        tcg_out_goto_tb(s, args[0]);
         break;
+
     case INDEX_op_call:
-        tcg_out32(s, BLE_SR4 | INSN_R2(args[0]));
-        tcg_out_mov(s, TCG_REG_RP, TCG_REG_R31);
+        if (const_args[0]) {
+            tcg_out_call(s, (void *)args[0]);
+        } else {
+            tcg_out32(s, INSN_BLE_SR4 | INSN_R2(args[0]));
+            tcg_out_mov(s, TCG_REG_RP, TCG_REG_R31);
+        }
         break;
+
     case INDEX_op_jmp:
         fprintf(stderr, "unimplemented jmp\n");
         tcg_abort();
         break;
+
     case INDEX_op_br:
-        fprintf(stderr, "unimplemented br\n");
-        tcg_abort();
+        tcg_out_branch(s, args[0], 1);
         break;
+
     case INDEX_op_movi_i32:
         tcg_out_movi(s, TCG_TYPE_I32, args[0], (uint32_t)args[1]);
         break;
 
     case INDEX_op_ld8u_i32:
-        tcg_out_ldst(s, args[0], args[1], args[2], LDB);
+        tcg_out_ldst(s, args[0], args[1], args[2], INSN_LDB);
         break;
     case INDEX_op_ld8s_i32:
-        tcg_out_ldst(s, args[0], args[1], args[2], LDB);
+        tcg_out_ldst(s, args[0], args[1], args[2], INSN_LDB);
         tcg_out_ext8s(s, args[0], args[0]);
         break;
     case INDEX_op_ld16u_i32:
-        tcg_out_ldst(s, args[0], args[1], args[2], LDH);
+        tcg_out_ldst(s, args[0], args[1], args[2], INSN_LDH);
         break;
     case INDEX_op_ld16s_i32:
-        tcg_out_ldst(s, args[0], args[1], args[2], LDH);
+        tcg_out_ldst(s, args[0], args[1], args[2], INSN_LDH);
         tcg_out_ext16s(s, args[0], args[0]);
         break;
     case INDEX_op_ld_i32:
-        tcg_out_ldst(s, args[0], args[1], args[2], LDW);
+        tcg_out_ldst(s, args[0], args[1], args[2], INSN_LDW);
         break;
 
     case INDEX_op_st8_i32:
-        tcg_out_ldst(s, args[0], args[1], args[2], STB);
+        tcg_out_ldst(s, args[0], args[1], args[2], INSN_STB);
         break;
     case INDEX_op_st16_i32:
-        tcg_out_ldst(s, args[0], args[1], args[2], STH);
+        tcg_out_ldst(s, args[0], args[1], args[2], INSN_STH);
         break;
     case INDEX_op_st_i32:
-        tcg_out_ldst(s, args[0], args[1], args[2], STW);
+        tcg_out_ldst(s, args[0], args[1], args[2], INSN_STW);
+        break;
+
+    case INDEX_op_add_i32:
+        if (const_args[2]) {
+            tcg_out_addi2(s, args[0], args[1], args[2]);
+        } else {
+            tcg_out_arith(s, args[0], args[1], args[2], INSN_ADDL);
+        }
         break;
 
     case INDEX_op_sub_i32:
-        c = ARITH_SUB;
-        goto gen_arith;
+        if (const_args[1]) {
+            if (const_args[2]) {
+                tcg_out_movi(s, TCG_TYPE_I32, args[0], args[1] - args[2]);
+            } else {
+                /* Recall that SUBI is a reversed subtract.  */
+                tcg_out_arithi(s, args[0], args[2], args[1], INSN_SUBI);
+            }
+        } else if (const_args[2]) {
+            tcg_out_addi2(s, args[0], args[1], -args[2]);
+        } else {
+            tcg_out_arith(s, args[0], args[1], args[2], INSN_SUB);
+        }
+        break;
+
     case INDEX_op_and_i32:
-        c = ARITH_AND;
-        goto gen_arith;
+        if (const_args[2]) {
+            tcg_out_andi(s, args[0], args[1], args[2]);
+        } else {
+            tcg_out_arith(s, args[0], args[1], args[2], INSN_AND);
+        }
+        break;
+
     case INDEX_op_or_i32:
-        c = ARITH_OR;
-        goto gen_arith;
+        if (const_args[2]) {
+            tcg_out_ori(s, args[0], args[1], args[2]);
+        } else {
+            tcg_out_arith(s, args[0], args[1], args[2], INSN_OR);
+        }
+        break;
+
     case INDEX_op_xor_i32:
-        c = ARITH_XOR;
-        goto gen_arith;
-    case INDEX_op_add_i32:
-        c = ARITH_ADD;
-        goto gen_arith;
+        tcg_out_arith(s, args[0], args[1], args[2], INSN_XOR);
+        break;
+
+    case INDEX_op_andc_i32:
+        if (const_args[2]) {
+            tcg_out_andi(s, args[0], args[1], ~args[2]);
+        } else {
+            tcg_out_arith(s, args[0], args[1], args[2], INSN_ANDCM);
+        }
+        break;
 
     case INDEX_op_shl_i32:
-        tcg_out32(s, SUBI | INSN_R1(TCG_REG_R20) | INSN_R2(args[2]) |
-                     lowsignext(0x1f, 0, 11));
-        tcg_out32(s, MTCTL | INSN_R2(11) | INSN_R1(TCG_REG_R20));
-        tcg_out32(s, ZVDEP | INSN_R2(args[0]) | INSN_R1(args[1]) |
-                     INSN_DEP_LEN(32));
+        if (const_args[2]) {
+            tcg_out_shli(s, args[0], args[1], args[2]);
+        } else {
+            tcg_out_shl(s, args[0], args[1], args[2]);
+        }
         break;
+
     case INDEX_op_shr_i32:
-        tcg_out32(s, MTCTL | INSN_R2(11) | INSN_R1(args[2]));
-        tcg_out32(s, VSHD | INSN_T(args[0]) | INSN_R1(TCG_REG_R0) |
-                     INSN_R2(args[1]));
+        if (const_args[2]) {
+            tcg_out_shri(s, args[0], args[1], args[2]);
+        } else {
+            tcg_out_shr(s, args[0], args[1], args[2]);
+        }
         break;
+
     case INDEX_op_sar_i32:
-        tcg_out32(s, SUBI | INSN_R1(TCG_REG_R20) | INSN_R2(args[2]) |
-                     lowsignext(0x1f, 0, 11));
-        tcg_out32(s, MTCTL | INSN_R2(11) | INSN_R1(TCG_REG_R20));
-        tcg_out32(s, VEXTRS | INSN_R1(args[0]) | INSN_R2(args[1]) |
-                     INSN_DEP_LEN(32));
+        if (const_args[2]) {
+            tcg_out_sari(s, args[0], args[1], args[2]);
+        } else {
+            tcg_out_sar(s, args[0], args[1], args[2]);
+        }
+        break;
+
+    case INDEX_op_rotl_i32:
+        if (const_args[2]) {
+            tcg_out_rotli(s, args[0], args[1], args[2]);
+        } else {
+            tcg_out_rotl(s, args[0], args[1], args[2]);
+        }
+        break;
+
+    case INDEX_op_rotr_i32:
+        if (const_args[2]) {
+            tcg_out_rotri(s, args[0], args[1], args[2]);
+        } else {
+            tcg_out_rotr(s, args[0], args[1], args[2]);
+        }
         break;
 
     case INDEX_op_mul_i32:
-        fprintf(stderr, "unimplemented mul\n");
-        tcg_abort();
+        tcg_out_xmpyu(s, args[0], TCG_REG_R0, args[1], args[2]);
         break;
     case INDEX_op_mulu2_i32:
-        fprintf(stderr, "unimplemented mulu2\n");
-        tcg_abort();
+        tcg_out_xmpyu(s, args[0], args[1], args[2], args[3]);
         break;
-    case INDEX_op_div2_i32:
-        fprintf(stderr, "unimplemented div2\n");
-        tcg_abort();
+
+    case INDEX_op_bswap16_i32:
+        tcg_out_bswap16(s, args[0], args[1], 0);
         break;
-    case INDEX_op_divu2_i32:
-        fprintf(stderr, "unimplemented divu2\n");
-        tcg_abort();
+    case INDEX_op_bswap32_i32:
+        tcg_out_bswap32(s, args[0], args[1], TCG_REG_R20);
+        break;
+
+    case INDEX_op_not_i32:
+        tcg_out_arithi(s, args[0], args[1], -1, INSN_SUBI);
+        break;
+    case INDEX_op_ext8s_i32:
+        tcg_out_ext8s(s, args[0], args[1]);
+        break;
+    case INDEX_op_ext16s_i32:
+        tcg_out_ext16s(s, args[0], args[1]);
+        break;
+
+    /* These three correspond exactly to the fallback implementation.
+       But by including them we reduce the number of TCG ops that 
+       need to be generated, and these opcodes are fairly common.  */
+    case INDEX_op_neg_i32:
+        tcg_out_arith(s, args[0], TCG_REG_R0, args[1], INSN_SUB);
+        break;
+    case INDEX_op_ext8u_i32:
+        tcg_out_andi(s, args[0], args[1], 0xff);
+        break;
+    case INDEX_op_ext16u_i32:
+        tcg_out_andi(s, args[0], args[1], 0xffff);
         break;
 
     case INDEX_op_brcond_i32:
-        fprintf(stderr, "unimplemented brcond\n");
-        tcg_abort();
+        tcg_out_brcond(s, args[2], args[0], args[1], const_args[1], args[3]);
+        break;
+    case INDEX_op_brcond2_i32:
+        tcg_out_brcond2(s, args[4], args[0], args[1],
+                        args[2], const_args[2],
+                        args[3], const_args[3], args[5]);
+        break;
+
+    case INDEX_op_setcond_i32:
+        tcg_out_setcond(s, args[3], args[0], args[1], args[2], const_args[2]);
+        break;
+    case INDEX_op_setcond2_i32:
+        tcg_out_setcond2(s, args[5], args[0], args[1], args[2],
+                         args[3], const_args[3], args[4], const_args[4]);
+        break;
+
+    case INDEX_op_add2_i32:
+        if (const_args[4]) {
+            tcg_out_arithi(s, args[0], args[2], args[4], INSN_ADDI);
+        } else {
+            tcg_out_arith(s, args[0], args[2], args[4], INSN_ADD);
+        }
+        tcg_out_arith(s, args[1], args[3], args[5], INSN_ADDC);
+        break;
+
+    case INDEX_op_sub2_i32:
+        if (const_args[2]) {
+            /* Recall that SUBI is a reversed subtract.  */
+            tcg_out_arithi(s, args[0], args[4], args[2], INSN_SUBI);
+        } else {
+            tcg_out_arith(s, args[0], args[2], args[4], INSN_SUB);
+        }
+        tcg_out_arith(s, args[1], args[3], args[5], INSN_SUBB);
         break;
 
     case INDEX_op_qemu_ld8u:
@@ -866,6 +1460,9 @@  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
     case INDEX_op_qemu_ld32:
         tcg_out_qemu_ld(s, args, 2);
         break;
+    case INDEX_op_qemu_ld64:
+        tcg_out_qemu_ld(s, args, 3);
+        break;
 
     case INDEX_op_qemu_st8:
         tcg_out_qemu_st(s, args, 0);
@@ -876,47 +1473,70 @@  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
     case INDEX_op_qemu_st32:
         tcg_out_qemu_st(s, args, 2);
         break;
+    case INDEX_op_qemu_st64:
+        tcg_out_qemu_st(s, args, 3);
+        break;
 
     default:
         fprintf(stderr, "unknown opcode 0x%x\n", opc);
         tcg_abort();
     }
-    return;
-
-gen_arith:
-    tcg_out_arith(s, args[0], args[1], args[2], c);
 }
 
 static const TCGTargetOpDef hppa_op_defs[] = {
     { INDEX_op_exit_tb, { } },
     { INDEX_op_goto_tb, { } },
 
-    { INDEX_op_call, { "r" } },
+    { INDEX_op_call, { "ri" } },
     { INDEX_op_jmp, { "r" } },
     { INDEX_op_br, { } },
 
     { INDEX_op_mov_i32, { "r", "r" } },
     { INDEX_op_movi_i32, { "r" } },
+
     { INDEX_op_ld8u_i32, { "r", "r" } },
     { INDEX_op_ld8s_i32, { "r", "r" } },
     { INDEX_op_ld16u_i32, { "r", "r" } },
     { INDEX_op_ld16s_i32, { "r", "r" } },
     { INDEX_op_ld_i32, { "r", "r" } },
-    { INDEX_op_st8_i32, { "r", "r" } },
-    { INDEX_op_st16_i32, { "r", "r" } },
-    { INDEX_op_st_i32, { "r", "r" } },
+    { INDEX_op_st8_i32, { "rZ", "r" } },
+    { INDEX_op_st16_i32, { "rZ", "r" } },
+    { INDEX_op_st_i32, { "rZ", "r" } },
+
+    { INDEX_op_add_i32, { "r", "rZ", "ri" } },
+    { INDEX_op_sub_i32, { "r", "rI", "ri" } },
+    { INDEX_op_and_i32, { "r", "rZ", "ri" } },
+    { INDEX_op_or_i32, { "r", "rZ", "ri" } },
+    { INDEX_op_xor_i32, { "r", "rZ", "rZ" } },
+    { INDEX_op_andc_i32, { "r", "rZ", "ri" } },
+
+    { INDEX_op_mul_i32, { "r", "r", "r" } },
+    { INDEX_op_mulu2_i32, { "r", "r", "r", "r" } },
 
-    { INDEX_op_add_i32, { "r", "r", "r" } },
-    { INDEX_op_sub_i32, { "r", "r", "r" } },
-    { INDEX_op_and_i32, { "r", "r", "r" } },
-    { INDEX_op_or_i32, { "r", "r", "r" } },
-    { INDEX_op_xor_i32, { "r", "r", "r" } },
+    { INDEX_op_shl_i32, { "r", "r", "ri" } },
+    { INDEX_op_shr_i32, { "r", "r", "ri" } },
+    { INDEX_op_sar_i32, { "r", "r", "ri" } },
+    { INDEX_op_rotl_i32, { "r", "r", "ri" } },
+    { INDEX_op_rotr_i32, { "r", "r", "ri" } },
 
-    { INDEX_op_shl_i32, { "r", "r", "r" } },
-    { INDEX_op_shr_i32, { "r", "r", "r" } },
-    { INDEX_op_sar_i32, { "r", "r", "r" } },
+    { INDEX_op_bswap16_i32, { "r", "r" } },
+    { INDEX_op_bswap32_i32, { "r", "r" } },
+    { INDEX_op_neg_i32, { "r", "r" } },
+    { INDEX_op_not_i32, { "r", "r" } },
 
-    { INDEX_op_brcond_i32, { "r", "r" } },
+    { INDEX_op_ext8s_i32, { "r", "r" } },
+    { INDEX_op_ext8u_i32, { "r", "r" } },
+    { INDEX_op_ext16s_i32, { "r", "r" } },
+    { INDEX_op_ext16u_i32, { "r", "r" } },
+
+    { INDEX_op_brcond_i32, { "rZ", "rJ" } },
+    { INDEX_op_brcond2_i32,  { "rZ", "rZ", "rJ", "rJ" } },
+
+    { INDEX_op_setcond_i32, { "r", "rZ", "rI" } },
+    { INDEX_op_setcond2_i32, { "r", "rZ", "rZ", "rI", "rI" } },
+
+    { INDEX_op_add2_i32, { "r", "r", "rZ", "rZ", "rI", "rZ" } },
+    { INDEX_op_sub2_i32, { "r", "r", "rI", "rZ", "rZ", "rZ" } },
 
 #if TARGET_LONG_BITS == 32
     { INDEX_op_qemu_ld8u, { "r", "L" } },
@@ -926,10 +1546,10 @@  static const TCGTargetOpDef hppa_op_defs[] = {
     { INDEX_op_qemu_ld32, { "r", "L" } },
     { INDEX_op_qemu_ld64, { "r", "r", "L" } },
 
-    { INDEX_op_qemu_st8, { "L", "L" } },
-    { INDEX_op_qemu_st16, { "L", "L" } },
-    { INDEX_op_qemu_st32, { "L", "L" } },
-    { INDEX_op_qemu_st64, { "L", "L", "L" } },
+    { INDEX_op_qemu_st8, { "LZ", "L" } },
+    { INDEX_op_qemu_st16, { "LZ", "L" } },
+    { INDEX_op_qemu_st32, { "LZ", "L" } },
+    { INDEX_op_qemu_st64, { "LZ", "LZ", "L" } },
 #else
     { INDEX_op_qemu_ld8u, { "r", "L", "L" } },
     { INDEX_op_qemu_ld8s, { "r", "L", "L" } },
@@ -938,25 +1558,98 @@  static const TCGTargetOpDef hppa_op_defs[] = {
     { INDEX_op_qemu_ld32, { "r", "L", "L" } },
     { INDEX_op_qemu_ld64, { "r", "r", "L", "L" } },
 
-    { INDEX_op_qemu_st8, { "L", "L", "L" } },
-    { INDEX_op_qemu_st16, { "L", "L", "L" } },
-    { INDEX_op_qemu_st32, { "L", "L", "L" } },
-    { INDEX_op_qemu_st64, { "L", "L", "L", "L" } },
+    { INDEX_op_qemu_st8, { "LZ", "L", "L" } },
+    { INDEX_op_qemu_st16, { "LZ", "L", "L" } },
+    { INDEX_op_qemu_st32, { "LZ", "L", "L" } },
+    { INDEX_op_qemu_st64, { "LZ", "LZ", "L", "L" } },
 #endif
     { -1 },
 };
 
+static int tcg_target_callee_save_regs[] = {
+    /* R2, the return address register, is saved specially
+       in the caller's frame.  */
+    /* R3, the frame pointer, is not currently modified.  */
+    TCG_REG_R4,
+    TCG_REG_R5,
+    TCG_REG_R6,
+    TCG_REG_R7,
+    TCG_REG_R8,
+    TCG_REG_R9,
+    TCG_REG_R10,
+    TCG_REG_R11,
+    TCG_REG_R12,
+    TCG_REG_R13,
+    TCG_REG_R14,
+    TCG_REG_R15,
+    TCG_REG_R16,
+    /* R17 is the global env, so no need to save.  */
+    TCG_REG_R18
+};
+
+void tcg_target_qemu_prologue(TCGContext *s)
+{
+    int frame_size, i;
+
+    /* Allocate space for the fixed frame marker.  */
+    frame_size = -TCG_TARGET_CALL_STACK_OFFSET;
+    frame_size += TCG_TARGET_STATIC_CALL_ARGS_SIZE;
+
+    /* Allocate space for the saved registers.  */
+    frame_size += ARRAY_SIZE(tcg_target_callee_save_regs) * 4;
+
+    /* Align the allocated space.  */
+    frame_size = ((frame_size + TCG_TARGET_STACK_ALIGN - 1)
+                  & -TCG_TARGET_STACK_ALIGN);
+
+    /* The return address is stored in the caller's frame.  */
+    tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_RP, TCG_REG_SP, -20);
+
+    /* Allocate stack frame, saving the first register at the same time.  */
+    tcg_out_ldst(s, tcg_target_callee_save_regs[0],
+                 TCG_REG_SP, frame_size, INSN_STWM);
+
+    /* Save all callee saved registers.  */
+    for (i = 1; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
+        tcg_out_st(s, TCG_TYPE_PTR, tcg_target_callee_save_regs[i],
+                   TCG_REG_SP, -frame_size + i * 4);
+    }
+
+    if (GUEST_BASE != 0) {
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, GUEST_BASE);
+    }
+
+    /* Jump to TB, and adjust R18 to be the return address.  */
+    tcg_out32(s, INSN_BLE_SR4 | INSN_R2(TCG_REG_R26));
+    tcg_out_mov(s, TCG_REG_R18, TCG_REG_R31);
+
+    /* Restore callee saved registers.  */
+    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_RP, TCG_REG_SP, -frame_size - 20);
+    for (i = 1; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
+        tcg_out_ld(s, TCG_TYPE_PTR, tcg_target_callee_save_regs[i],
+                   TCG_REG_SP, -frame_size + i * 4);
+    }
+
+    /* Deallocate stack frame and return.  */
+    tcg_out32(s, INSN_BV | INSN_R2(TCG_REG_RP));
+    tcg_out_ldst(s, tcg_target_callee_save_regs[0],
+                 TCG_REG_SP, -frame_size, INSN_LDWM);
+}
+
 void tcg_target_init(TCGContext *s)
 {
     tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffffffff);
-    tcg_regset_set32(tcg_target_call_clobber_regs, 0,
-                     (1 << TCG_REG_R20) |
-                     (1 << TCG_REG_R21) |
-                     (1 << TCG_REG_R22) |
-                     (1 << TCG_REG_R23) |
-                     (1 << TCG_REG_R24) |
-                     (1 << TCG_REG_R25) |
-                     (1 << TCG_REG_R26));
+
+    tcg_regset_clear(tcg_target_call_clobber_regs);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R20);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R21);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R22);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R23);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R24);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R25);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R26);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RET0);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RET1);
 
     tcg_regset_clear(s->reserved_regs);
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_R0);  /* hardwired to zero */
@@ -969,6 +1662,9 @@  void tcg_target_init(TCGContext *s)
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_DP);  /* data pointer */
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_SP);  /* stack pointer */
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_R31); /* ble link reg */
+    if (GUEST_BASE != 0) {
+        tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG);
+    }
 
     tcg_add_target_add_op_defs(hppa_op_defs);
 }
diff --git a/tcg/hppa/tcg-target.h b/tcg/hppa/tcg-target.h
index e956e71..36b6949 100644
--- a/tcg/hppa/tcg-target.h
+++ b/tcg/hppa/tcg-target.h
@@ -69,17 +69,33 @@  enum {
     TCG_REG_R31,
 };
 
+#define TCG_CT_CONST_0    0x0100
+#define TCG_CT_CONST_S5   0x0200
+#define TCG_CT_CONST_S11  0x0400
+
 /* used for function call generation */
 #define TCG_REG_CALL_STACK TCG_REG_SP
-#define TCG_TARGET_STACK_ALIGN 16
+#define TCG_TARGET_STACK_ALIGN 64
+#define TCG_TARGET_CALL_STACK_OFFSET -48
+#define TCG_TARGET_STATIC_CALL_ARGS_SIZE 8*4
+#define TCG_TARGET_CALL_ALIGN_ARGS 1
 #define TCG_TARGET_STACK_GROWSUP
 
 /* optional instructions */
-#define TCG_TARGET_HAS_div2_i32
-//#define TCG_TARGET_HAS_ext8s_i32
-//#define TCG_TARGET_HAS_ext16s_i32
-//#define TCG_TARGET_HAS_bswap16_i32
-//#define TCG_TARGET_HAS_bswap32_i32
+// #define TCG_TARGET_HAS_div_i32
+#define TCG_TARGET_HAS_rot_i32
+#define TCG_TARGET_HAS_ext8s_i32
+#define TCG_TARGET_HAS_ext16s_i32
+#define TCG_TARGET_HAS_ext8u_i32
+#define TCG_TARGET_HAS_ext16u_i32
+#define TCG_TARGET_HAS_bswap16_i32
+#define TCG_TARGET_HAS_bswap32_i32
+#define TCG_TARGET_HAS_not_i32
+#define TCG_TARGET_HAS_neg_i32
+#define TCG_TARGET_HAS_andc_i32
+// #define TCG_TARGET_HAS_orc_i32
+
+#define TCG_TARGET_HAS_GUEST_BASE
 
 /* Note: must be synced with dyngen-exec.h */
 #define TCG_AREG0 TCG_REG_R17
@@ -87,116 +103,12 @@  enum {
 static inline void flush_icache_range(unsigned long start, unsigned long stop)
 {
     start &= ~31;
-    while (start <= stop)
-    {
-        asm volatile ("fdc 0(%0)\n"
-                      "sync\n"
-                      "fic 0(%%sr4, %0)\n"
-                      "sync\n"
+    while (start <= stop) {
+        asm volatile ("fdc 0(%0)\n\t"
+                      "sync\n\t"
+                      "fic 0(%%sr4, %0)\n\t"
+                      "sync"
                       : : "r"(start) : "memory");
         start += 32;
     }
 }
-
-/* supplied by libgcc */
-extern void *__canonicalize_funcptr_for_compare(void *);
-
-/* Field selection types defined by hppa */
-#define rnd(x)                  (((x)+0x1000)&~0x1fff)
-/* lsel: select left 21 bits */
-#define lsel(v,a)               (((v)+(a))>>11)
-/* rsel: select right 11 bits */
-#define rsel(v,a)               (((v)+(a))&0x7ff)
-/* lrsel with rounding of addend to nearest 8k */
-#define lrsel(v,a)              (((v)+rnd(a))>>11)
-/* rrsel with rounding of addend to nearest 8k */
-#define rrsel(v,a)              ((((v)+rnd(a))&0x7ff)+((a)-rnd(a)))
-
-#define mask(x,sz)              ((x) & ~((1<<(sz))-1))
-
-static inline int reassemble_12(int as12)
-{
-    return (((as12 & 0x800) >> 11) |
-            ((as12 & 0x400) >> 8) |
-            ((as12 & 0x3ff) << 3));
-}
-
-static inline int reassemble_14(int as14)
-{
-    return (((as14 & 0x1fff) << 1) |
-            ((as14 & 0x2000) >> 13));
-}
-
-static inline int reassemble_17(int as17)
-{
-    return (((as17 & 0x10000) >> 16) |
-            ((as17 & 0x0f800) << 5) |
-            ((as17 & 0x00400) >> 8) |
-            ((as17 & 0x003ff) << 3));
-}
-
-static inline int reassemble_21(int as21)
-{
-    return (((as21 & 0x100000) >> 20) |
-            ((as21 & 0x0ffe00) >> 8) |
-            ((as21 & 0x000180) << 7) |
-            ((as21 & 0x00007c) << 14) |
-            ((as21 & 0x000003) << 12));
-}
-
-static inline void hppa_patch21l(uint32_t *insn, int val, int addend)
-{
-    val = lrsel(val, addend);
-    *insn = mask(*insn, 21) | reassemble_21(val);
-}
-
-static inline void hppa_patch14r(uint32_t *insn, int val, int addend)
-{
-    val = rrsel(val, addend);
-    *insn = mask(*insn, 14) | reassemble_14(val);
-}
-
-static inline void hppa_patch17r(uint32_t *insn, int val, int addend)
-{
-    val = rrsel(val, addend);
-    *insn = (*insn & ~0x1f1ffd) | reassemble_17(val);
-}
-
-
-static inline void hppa_patch21l_dprel(uint32_t *insn, int val, int addend)
-{
-    register unsigned int dp asm("r27");
-    hppa_patch21l(insn, val - dp, addend);
-}
-
-static inline void hppa_patch14r_dprel(uint32_t *insn, int val, int addend)
-{
-    register unsigned int dp asm("r27");
-    hppa_patch14r(insn, val - dp, addend);
-}
-
-static inline void hppa_patch17f(uint32_t *insn, int val, int addend)
-{
-    int dot = (int)insn & ~0x3;
-    int v = ((val + addend) - dot - 8) / 4;
-    if (v > (1 << 16) || v < -(1 << 16)) {
-        printf("cannot fit branch to offset %d [%08x->%08x]\n", v, dot, val);
-        abort();
-    }
-    *insn = (*insn & ~0x1f1ffd) | reassemble_17(v);
-}
-
-static inline void hppa_load_imm21l(uint32_t *insn, int val, int addend)
-{
-    /* Transform addil L'sym(%dp) to ldil L'val, %r1 */
-    *insn = 0x20200000 | reassemble_21(lrsel(val, 0));
-}
-
-static inline void hppa_load_imm14r(uint32_t *insn, int val, int addend)
-{
-    /* Transform ldw R'sym(%r1), %rN to ldo R'sym(%r1), %rN */
-    hppa_patch14r(insn, val, addend);
-    /* HACK */
-    if (addend == 0)
-        *insn = (*insn & ~0xfc000000) | (0x0d << 26);
-}