
[07/15] tcg-mips: Adjust qemu_ld/st for mips64

Message ID 1455014403-10742-8-git-send-email-rth@twiddle.net
State New

Commit Message

Richard Henderson Feb. 9, 2016, 10:39 a.m. UTC
At the same time, use extract in the tlb_load for mips32r2.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/mips/tcg-target.c | 239 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 163 insertions(+), 76 deletions(-)
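
For reference, EXT and DEXT are the MIPS32r2/MIPS64r2 bitfield-extract
instructions the commit message refers to.  A minimal C model of their
architectural effect (a sketch only; it assumes 0 < size and pos + size
within the register width, as the architecture requires):

    #include <stdint.h>

    /* ext rt, rs, pos, size: take 'size' bits of rs starting at bit
       'pos' and deposit them at bit 0 of rt.  dext is the 64-bit
       counterpart. */
    static uint32_t mips_ext(uint32_t rs, unsigned pos, unsigned size)
    {
        return (uint32_t)((rs >> pos) & ((UINT64_C(1) << size) - 1));
    }

    static uint64_t mips_dext(uint64_t rs, unsigned pos, unsigned size)
    {
        uint64_t mask = size < 64 ? (UINT64_C(1) << size) - 1 : ~UINT64_C(0);
        return (rs >> pos) & mask;
    }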

Comments

James Hogan Feb. 10, 2016, 4:34 p.m. UTC | #1
Hi Richard,

On Tue, Feb 09, 2016 at 09:39:55PM +1100, Richard Henderson wrote:
> @@ -1212,11 +1237,24 @@ static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
>             : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
>      int add_off = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
>  
> -    tcg_out_opc_sa(s, OPC_SRL, TCG_REG_A0, addrl,
> -                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
> -    tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_A0, TCG_REG_A0,
> -                    (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
> -    tcg_out_opc_reg(s, OPC_ADDU, TCG_REG_A0, TCG_REG_A0, TCG_AREG0);
> +    if (use_mips32r2_instructions) {
> +        if (TCG_TARGET_REG_BITS == 32 || TARGET_LONG_BITS == 32) {
> +            tcg_out_opc_bf(s, OPC_EXT, TCG_REG_A0, addrl,
> +                           TARGET_PAGE_BITS + CPU_TLB_ENTRY_BITS - 1,
> +                           CPU_TLB_ENTRY_BITS);
> +        } else {
> +            tcg_out_opc_bf64(s, OPC_DEXT, OPC_DEXTM, OPC_DEXTU,
> +                             TCG_REG_A0, addrl,
> +                             TARGET_PAGE_BITS + CPU_TLB_ENTRY_BITS - 1,
> +                             CPU_TLB_ENTRY_BITS);
> +        }

The ext/dext here will end up with bits below bit CPU_TLB_ENTRY_BITS
set, which will result in the addend being loaded from a slightly
offset address, so things go badly wrong. You still need to either
ANDI off the low bits, or trim them off with the ext/dext and shift
left again.

So I don't think there's any benefit to using these instructions
unless CPU_TLB_BITS + CPU_TLB_ENTRY_BITS exceeds the 16 bits available
in the ANDI immediate field for the non-r2 case.

Cheers
James

> +    } else {
> +        tcg_out_opc_sa(s, ALIAS_TSRL, TCG_REG_A0, addrl,
> +                       TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
> +        tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_A0, TCG_REG_A0,
> +                        (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
> +    }
> +    tcg_out_opc_reg(s, ALIAS_PADD, TCG_REG_A0, TCG_REG_A0, TCG_AREG0);
>  
>      /* Compensate for very large offsets.  */
>      if (add_off >= 0x8000) {
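
To make the failure mode concrete: the lookup wants
((addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1)) << CPU_TLB_ENTRY_BITS,
whose low CPU_TLB_ENTRY_BITS are zero by construction, but ext/dext
deposit their field at bit 0, so address bits land in those positions
instead.  A hypothetical standalone sketch of the difference, using
made-up but plausible constants (not the values from the patch) and
reading the posted tcg_out_opc_bf arguments as a field of
TARGET_PAGE_BITS + CPU_TLB_ENTRY_BITS bits starting at bit
CPU_TLB_ENTRY_BITS:

    #include <stdint.h>
    #include <stdio.h>

    #define TARGET_PAGE_BITS    12   /* illustrative values only */
    #define CPU_TLB_ENTRY_BITS  5
    #define CPU_TLB_SIZE        256

    int main(void)
    {
        uint32_t addr = 0x12345678;

        /* The SRL+ANDI sequence: index scaled by the entry size,
           low CPU_TLB_ENTRY_BITS guaranteed zero. */
        uint32_t srl_andi = (addr >> (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS))
                            & ((CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);

        /* A single extract: the field lands at bit 0, leaving address
           bits in the low CPU_TLB_ENTRY_BITS, which skews the later
           addend load. */
        uint32_t ext_only = (addr >> CPU_TLB_ENTRY_BITS)
                            & ((1u << (TARGET_PAGE_BITS + CPU_TLB_ENTRY_BITS)) - 1);

        printf("srl+andi: %#x (low bits clear)\n", srl_andi);
        printf("ext only: %#x (low bits set)\n", ext_only);
        return 0;
    }
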
Richard Henderson Feb. 10, 2016, 5:35 p.m. UTC | #2
On 02/11/2016 03:34 AM, James Hogan wrote:
> Hi Richard,
>
> On Tue, Feb 09, 2016 at 09:39:55PM +1100, Richard Henderson wrote:
>> @@ -1212,11 +1237,24 @@ static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
>>              : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
>>       int add_off = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
>>
>> -    tcg_out_opc_sa(s, OPC_SRL, TCG_REG_A0, addrl,
>> -                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
>> -    tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_A0, TCG_REG_A0,
>> -                    (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
>> -    tcg_out_opc_reg(s, OPC_ADDU, TCG_REG_A0, TCG_REG_A0, TCG_AREG0);
>> +    if (use_mips32r2_instructions) {
>> +        if (TCG_TARGET_REG_BITS == 32 || TARGET_LONG_BITS == 32) {
>> +            tcg_out_opc_bf(s, OPC_EXT, TCG_REG_A0, addrl,
>> +                           TARGET_PAGE_BITS + CPU_TLB_ENTRY_BITS - 1,
>> +                           CPU_TLB_ENTRY_BITS);
>> +        } else {
>> +            tcg_out_opc_bf64(s, OPC_DEXT, OPC_DEXTM, OPC_DEXTU,
>> +                             TCG_REG_A0, addrl,
>> +                             TARGET_PAGE_BITS + CPU_TLB_ENTRY_BITS - 1,
>> +                             CPU_TLB_ENTRY_BITS);
>> +        }
>
> The ext/dext here will end up with bits below bit CPU_TLB_ENTRY_BITS
> set, which will result in the addend being loaded from a slightly
> offset address, so things go badly wrong. You still need to either
> ANDI off the low bits, or trim them off with the ext/dext and shift
> left again.
>
> So I don't think there's any benefit to using these instructions
> unless CPU_TLB_BITS + CPU_TLB_ENTRY_BITS exceeds the 16 bits available
> in the ANDI immediate field for the non-r2 case.

Hmm.  I thought I'd deleted this code back out.  I must have messed up copying 
trees between machines and overwritten this.


r~

Patch

diff --git a/tcg/mips/tcg-target.c b/tcg/mips/tcg-target.c
index e986437..242db14 100644
--- a/tcg/mips/tcg-target.c
+++ b/tcg/mips/tcg-target.c
@@ -33,8 +33,14 @@ 
 # define MIPS_BE  0
 #endif
 
-#define LO_OFF    (MIPS_BE * 4)
-#define HI_OFF    (4 - LO_OFF)
+#if TCG_TARGET_REG_BITS == 32
+# define LO_OFF  (MIPS_BE * 4)
+# define HI_OFF  (4 - LO_OFF)
+#else
+extern int link_error(void);
+# define LO_OFF  link_error()
+# define HI_OFF  link_error()
+#endif
 
 #ifndef NDEBUG
 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
@@ -188,7 +194,7 @@  static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
         tcg_regset_set(ct->u.regs, 0xffffffff);
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_A0);
 #if defined(CONFIG_SOFTMMU)
-        if (TARGET_LONG_BITS == 64) {
+        if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
             tcg_regset_reset_reg(ct->u.regs, TCG_REG_A2);
         }
 #endif
@@ -198,11 +204,11 @@  static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
         tcg_regset_set(ct->u.regs, 0xffffffff);
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_A0);
 #if defined(CONFIG_SOFTMMU)
-        if (TARGET_LONG_BITS == 32) {
-            tcg_regset_reset_reg(ct->u.regs, TCG_REG_A1);
-        } else {
+        if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
             tcg_regset_reset_reg(ct->u.regs, TCG_REG_A2);
             tcg_regset_reset_reg(ct->u.regs, TCG_REG_A3);
+        } else {
+            tcg_regset_reset_reg(ct->u.regs, TCG_REG_A1);
         }
 #endif
         break;
@@ -726,6 +732,16 @@  static inline void tcg_out_ext16s(TCGContext *s, TCGReg ret, TCGReg arg)
     }
 }
 
+static inline void tcg_out_ext32u(TCGContext *s, TCGReg ret, TCGReg arg)
+{
+    if (use_mips32r2_instructions) {
+        tcg_out_opc_bf(s, OPC_DEXT, ret, arg, 31, 0);
+    } else {
+        tcg_out_dsll(s, ret, arg, 32);
+        tcg_out_dsrl(s, ret, ret, 32);
+    }
+}
+
 static void tcg_out_ldst(TCGContext *s, MIPSInsn opc, TCGReg data,
                          TCGReg addr, intptr_t ofs)
 {
@@ -1124,6 +1140,10 @@  static void * const qemu_ld_helpers[16] = {
     [MO_BESW] = helper_be_ldsw_mmu,
     [MO_BEUL] = helper_be_ldul_mmu,
     [MO_BEQ]  = helper_be_ldq_mmu,
+#if TCG_TARGET_REG_BITS == 64
+    [MO_LESL] = helper_le_ldsl_mmu,
+    [MO_BESL] = helper_be_ldsl_mmu,
+#endif
 };
 
 static void * const qemu_st_helpers[16] = {
@@ -1151,6 +1171,9 @@  static int tcg_out_call_iarg_reg(TCGContext *s, int i, TCGReg arg)
     if (i < ARRAY_SIZE(tcg_target_call_iarg_regs)) {
         tcg_out_mov(s, TCG_TYPE_REG, tcg_target_call_iarg_regs[i], arg);
     } else {
+        /* For N32 and N64, the initial offset is different.  But there
+           we also have 8 argument registers, so we don't run out here.  */
+        assert(TCG_TARGET_REG_BITS == 32);
         tcg_out_st(s, TCG_TYPE_REG, arg, TCG_REG_SP, 4 * i);
     }
     return i + 1;
@@ -1192,6 +1215,7 @@  static int tcg_out_call_iarg_imm(TCGContext *s, int i, TCGArg arg)
 
 static int tcg_out_call_iarg_reg2(TCGContext *s, int i, TCGReg al, TCGReg ah)
 {
+    assert(TCG_TARGET_REG_BITS == 32);
     i = (i + 1) & ~1;
     i = tcg_out_call_iarg_reg(s, i, (MIPS_BE ? ah : al));
     i = tcg_out_call_iarg_reg(s, i, (MIPS_BE ? al : ah));
@@ -1205,6 +1229,7 @@  static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
                              tcg_insn_unit *label_ptr[2], bool is_load)
 {
     TCGMemOp s_bits = get_memop(oi) & MO_SIZE;
+    target_ulong mask = TARGET_PAGE_MASK | ((1 << s_bits) - 1);
     int mem_index = get_mmuidx(oi);
     int cmp_off
         = (is_load
@@ -1212,11 +1237,24 @@  static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
            : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
     int add_off = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
 
-    tcg_out_opc_sa(s, OPC_SRL, TCG_REG_A0, addrl,
-                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
-    tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_A0, TCG_REG_A0,
-                    (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
-    tcg_out_opc_reg(s, OPC_ADDU, TCG_REG_A0, TCG_REG_A0, TCG_AREG0);
+    if (use_mips32r2_instructions) {
+        if (TCG_TARGET_REG_BITS == 32 || TARGET_LONG_BITS == 32) {
+            tcg_out_opc_bf(s, OPC_EXT, TCG_REG_A0, addrl,
+                           TARGET_PAGE_BITS + CPU_TLB_ENTRY_BITS - 1,
+                           CPU_TLB_ENTRY_BITS);
+        } else {
+            tcg_out_opc_bf64(s, OPC_DEXT, OPC_DEXTM, OPC_DEXTU,
+                             TCG_REG_A0, addrl,
+                             TARGET_PAGE_BITS + CPU_TLB_ENTRY_BITS - 1,
+                             CPU_TLB_ENTRY_BITS);
+        }
+    } else {
+        tcg_out_opc_sa(s, ALIAS_TSRL, TCG_REG_A0, addrl,
+                       TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
+        tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_A0, TCG_REG_A0,
+                        (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
+    }
+    tcg_out_opc_reg(s, ALIAS_PADD, TCG_REG_A0, TCG_REG_A0, TCG_AREG0);
 
     /* Compensate for very large offsets.  */
     if (add_off >= 0x8000) {
@@ -1226,43 +1264,48 @@  static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
         QEMU_BUILD_BUG_ON(offsetof(CPUArchState,
                                    tlb_table[NB_MMU_MODES - 1][1])
                           > 0x7ff0 + 0x7fff);
-        tcg_out_opc_imm(s, OPC_ADDIU, TCG_REG_A0, TCG_REG_A0, 0x7ff0);
+        tcg_out_opc_imm(s, ALIAS_PADDI, TCG_REG_A0, TCG_REG_A0, 0x7ff0);
         cmp_off -= 0x7ff0;
         add_off -= 0x7ff0;
     }
 
-    /* Load the (low half) tlb comparator.  */
-    tcg_out_opc_imm(s, OPC_LW, TCG_TMP0, TCG_REG_A0,
-                    cmp_off + (TARGET_LONG_BITS == 64 ? LO_OFF : 0));
-
-    /* Mask the page bits, keeping the alignment bits to compare against.
-       In between on 32-bit targets, load the tlb addend for the fast path.  */
-    tcg_out_movi(s, TCG_TYPE_I32, TCG_TMP1,
-                 TARGET_PAGE_MASK | ((1 << s_bits) - 1));
-    if (TARGET_LONG_BITS == 32) {
-        tcg_out_opc_imm(s, OPC_LW, TCG_REG_A0, TCG_REG_A0, add_off);
+    /* Load the (low half) tlb comparator.  Mask the page bits, keeping the
+       alignment bits to compare against.  */
+    if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
+        tcg_out_ld(s, TCG_TYPE_I32, TCG_TMP0, TCG_REG_A0, cmp_off + LO_OFF);
+        tcg_out_movi(s, TCG_TYPE_I32, TCG_TMP1, mask);
+    } else {
+        tcg_out_ld(s, TCG_TYPE_TL, TCG_TMP0, TCG_REG_A0, cmp_off);
+        tcg_out_movi(s, TCG_TYPE_TL, TCG_TMP1, mask);
+        /* No second compare is required here;
+           load the tlb addend for the fast path.  */
+        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_A0, TCG_REG_A0, add_off);
     }
     tcg_out_opc_reg(s, OPC_AND, TCG_TMP1, TCG_TMP1, addrl);
 
+    /* Zero extend a 32-bit guest address for a 64-bit host.  */
+    if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
+        tcg_out_ext32u(s, base, addrl);
+        addrl = base;
+    }
+
     label_ptr[0] = s->code_ptr;
     tcg_out_opc_br(s, OPC_BNE, TCG_TMP1, TCG_TMP0);
 
     /* Load and test the high half tlb comparator.  */
-    if (TARGET_LONG_BITS == 64) {
+    if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
         /* delay slot */
-        tcg_out_opc_imm(s, OPC_LW, TCG_TMP0, TCG_REG_A0, cmp_off + HI_OFF);
+        tcg_out_ld(s, TCG_TYPE_I32, TCG_TMP0, TCG_REG_A0, cmp_off + HI_OFF);
 
-        /* Load the tlb addend for the fast path. We can't do it earlier with
-           64-bit targets or we'll clobber a0 before reading the high half tlb
-           comparator.  */
-        tcg_out_opc_imm(s, OPC_LW, TCG_REG_A0, TCG_REG_A0, add_off);
+        /* Load the tlb addend for the fast path.  */
+        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_A0, TCG_REG_A0, add_off);
 
         label_ptr[1] = s->code_ptr;
         tcg_out_opc_br(s, OPC_BNE, addrh, TCG_TMP0);
     }
 
     /* delay slot */
-    tcg_out_opc_reg(s, OPC_ADDU, base, TCG_REG_A0, addrl);
+    tcg_out_opc_reg(s, ALIAS_PADD, base, TCG_REG_A0, addrl);
 }
 
 static void add_qemu_ldst_label(TCGContext *s, int is_ld, TCGMemOpIdx oi,
@@ -1280,7 +1323,7 @@  static void add_qemu_ldst_label(TCGContext *s, int is_ld, TCGMemOpIdx oi,
     label->addrhi_reg = addrhi;
     label->raddr = raddr;
     label->label_ptr[0] = label_ptr[0];
-    if (TARGET_LONG_BITS == 64) {
+    if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
         label->label_ptr[1] = label_ptr[1];
     }
 }
@@ -1294,12 +1337,12 @@  static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 
     /* resolve label address */
     reloc_pc16(l->label_ptr[0], s->code_ptr);
-    if (TARGET_LONG_BITS == 64) {
+    if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
         reloc_pc16(l->label_ptr[1], s->code_ptr);
     }
 
     i = 1;
-    if (TARGET_LONG_BITS == 64) {
+    if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
         i = tcg_out_call_iarg_reg2(s, i, l->addrlo_reg, l->addrhi_reg);
     } else {
         i = tcg_out_call_iarg_reg(s, i, l->addrlo_reg);
@@ -1311,7 +1354,7 @@  static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
     tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
 
     v0 = l->datalo_reg;
-    if ((opc & MO_SIZE) == MO_64) {
+    if (TCG_TARGET_REG_BITS == 32 && (opc & MO_SIZE) == MO_64) {
         /* We eliminated V0 from the possible output registers, so it
            cannot be clobbered here.  So we must move V1 first.  */
         if (MIPS_BE) {
@@ -1337,12 +1380,12 @@  static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 
     /* resolve label address */
     reloc_pc16(l->label_ptr[0], s->code_ptr);
-    if (TARGET_LONG_BITS == 64) {
+    if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
         reloc_pc16(l->label_ptr[1], s->code_ptr);
     }
 
     i = 1;
-    if (TARGET_LONG_BITS == 64) {
+    if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
         i = tcg_out_call_iarg_reg2(s, i, l->addrlo_reg, l->addrhi_reg);
     } else {
         i = tcg_out_call_iarg_reg(s, i, l->addrlo_reg);
@@ -1354,14 +1397,15 @@  static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
     case MO_16:
         i = tcg_out_call_iarg_reg16(s, i, l->datalo_reg);
         break;
-    case MO_32:
-        i = tcg_out_call_iarg_reg(s, i, l->datalo_reg);
-        break;
     case MO_64:
-        i = tcg_out_call_iarg_reg2(s, i, l->datalo_reg, l->datahi_reg);
-        break;
+        if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
+            i = tcg_out_call_iarg_reg2(s, i, l->datalo_reg, l->datahi_reg);
+            break;
+        }
+        /* FALLTHRU */
     default:
-        tcg_abort();
+        i = tcg_out_call_iarg_reg(s, i, l->datalo_reg);
+        break;
     }
     i = tcg_out_call_iarg_imm(s, i, oi);
 
@@ -1376,7 +1420,7 @@  static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 #endif
 
 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
-                                   TCGReg base, TCGMemOp opc)
+                                   TCGReg base, TCGMemOp opc, bool is_64)
 {
     switch (opc & (MO_SSIZE | MO_BSWAP)) {
     case MO_UB:
@@ -1385,6 +1429,7 @@  static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
     case MO_SB:
         tcg_out_opc_imm(s, OPC_LB, datalo, base, 0);
         break;
+
     case MO_UW | MO_BSWAP:
         tcg_out_opc_imm(s, OPC_LHU, TCG_TMP1, base, 0);
         tcg_out_bswap16(s, datalo, TCG_TMP1);
@@ -1392,6 +1437,7 @@  static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
     case MO_UW:
         tcg_out_opc_imm(s, OPC_LHU, datalo, base, 0);
         break;
+
     case MO_SW | MO_BSWAP:
         tcg_out_opc_imm(s, OPC_LHU, TCG_TMP1, base, 0);
         tcg_out_bswap16s(s, datalo, TCG_TMP1);
@@ -1399,22 +1445,47 @@  static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
     case MO_SW:
         tcg_out_opc_imm(s, OPC_LH, datalo, base, 0);
         break;
+
     case MO_UL | MO_BSWAP:
+        if (TCG_TARGET_REG_BITS == 64 && is_64) {
+            tcg_out_opc_imm(s, OPC_LWU, TCG_TMP1, base, 0);
+            tcg_out_bswap32u(s, datalo, TCG_TMP1);
+            break;
+        }
+        /* FALLTHRU */
+    case MO_SL | MO_BSWAP:
         tcg_out_opc_imm(s, OPC_LW, TCG_TMP1, base, 0);
         tcg_out_bswap32(s, datalo, TCG_TMP1);
         break;
+
     case MO_UL:
+        if (TCG_TARGET_REG_BITS == 64 && is_64) {
+            tcg_out_opc_imm(s, OPC_LWU, datalo, base, 0);
+            break;
+        }
+        /* FALLTHRU */
+    case MO_SL:
         tcg_out_opc_imm(s, OPC_LW, datalo, base, 0);
         break;
+
     case MO_Q | MO_BSWAP:
-        tcg_out_opc_imm(s, OPC_LW, TCG_TMP1, base, HI_OFF);
-        tcg_out_bswap32(s, datalo, TCG_TMP1);
-        tcg_out_opc_imm(s, OPC_LW, TCG_TMP1, base, LO_OFF);
-        tcg_out_bswap32(s, datahi, TCG_TMP1);
+        if (TCG_TARGET_REG_BITS == 32) {
+            tcg_out_opc_imm(s, OPC_LW, TCG_TMP1, base, HI_OFF);
+            tcg_out_bswap32(s, datalo, TCG_TMP1);
+            tcg_out_opc_imm(s, OPC_LW, TCG_TMP1, base, LO_OFF);
+            tcg_out_bswap32(s, datahi, TCG_TMP1);
+        } else {
+            tcg_out_opc_imm(s, OPC_LD, TCG_REG_V0, base, 0);
+            tcg_out_bswap64(s, datalo, TCG_REG_V0);
+        }
         break;
     case MO_Q:
-        tcg_out_opc_imm(s, OPC_LW, datalo, base, LO_OFF);
-        tcg_out_opc_imm(s, OPC_LW, datahi, base, HI_OFF);
+        if (TCG_TARGET_REG_BITS == 32) {
+            tcg_out_opc_imm(s, OPC_LW, datalo, base, LO_OFF);
+            tcg_out_opc_imm(s, OPC_LW, datahi, base, HI_OFF);
+        } else {
+            tcg_out_opc_imm(s, OPC_LD, datalo, base, 0);
+        }
         break;
     default:
         tcg_abort();
@@ -1435,33 +1506,41 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
     TCGReg base = TCG_REG_V0;
 
     data_regl = *args++;
-    data_regh = (is_64 ? *args++ : 0);
+    data_regh = (TCG_TARGET_REG_BITS == 32 && is_64 ? *args++ : 0);
     addr_regl = *args++;
-    addr_regh = (TARGET_LONG_BITS == 64 ? *args++ : 0);
+    addr_regh = (TCG_TARGET_REG_BITS < TARGET_LONG_BITS ? *args++ : 0);
     oi = *args++;
     opc = get_memop(oi);
 
 #if defined(CONFIG_SOFTMMU)
     tcg_out_tlb_load(s, base, addr_regl, addr_regh, oi, label_ptr, 1);
-    tcg_out_qemu_ld_direct(s, data_regl, data_regh, base, opc);
+    tcg_out_qemu_ld_direct(s, data_regl, data_regh, base, opc, is_64);
     add_qemu_ldst_label(s, 1, oi, data_regl, data_regh, addr_regl, addr_regh,
                         s->code_ptr, label_ptr);
 #else
+    if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
+        tcg_out_ext32u(s, base, addr_regl);
+        addr_regl = base;
+    }
     if (guest_base == 0 && data_regl != addr_regl) {
         base = addr_regl;
     } else if (guest_base == (int16_t)guest_base) {
-        tcg_out_opc_imm(s, OPC_ADDIU, base, addr_regl, guest_base);
+        tcg_out_opc_imm(s, ALIAS_PADDI, base, addr_regl, guest_base);
     } else {
         tcg_out_movi(s, TCG_TYPE_PTR, base, guest_base);
-        tcg_out_opc_reg(s, OPC_ADDU, base, base, addr_regl);
+        tcg_out_opc_reg(s, ALIAS_PADD, base, base, addr_regl);
     }
-    tcg_out_qemu_ld_direct(s, data_regl, data_regh, base, opc);
+    tcg_out_qemu_ld_direct(s, data_regl, data_regh, base, opc, is_64);
 #endif
 }
 
 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                    TCGReg base, TCGMemOp opc)
 {
+    if ((datalo | datahi) == 0) {
+        opc &= ~MO_BSWAP;
+    }
+
     switch (opc & (MO_SIZE | MO_BSWAP)) {
     case MO_8:
         tcg_out_opc_imm(s, OPC_SB, datalo, base, 0);
@@ -1485,14 +1564,25 @@  static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
         break;
 
     case MO_64 | MO_BSWAP:
-        tcg_out_bswap32(s, TCG_TMP1, datalo);
-        tcg_out_opc_imm(s, OPC_SW, TCG_TMP1, base, HI_OFF);
-        tcg_out_bswap32(s, TCG_TMP1, datahi);
-        tcg_out_opc_imm(s, OPC_SW, TCG_TMP1, base, LO_OFF);
-        break;
+        if (TCG_TARGET_REG_BITS == 32) {
+            tcg_out_bswap32(s, TCG_TMP1, datalo);
+            datalo = TCG_TMP1;
+            tcg_out_opc_imm(s, OPC_SW, datalo, base, HI_OFF);
+            tcg_out_bswap32(s, TCG_TMP1, datahi);
+            datahi = TCG_TMP1;
+            tcg_out_opc_imm(s, OPC_SW, datahi, base, LO_OFF);
+            break;
+        }
+        tcg_out_bswap64(s, TCG_REG_A1, datalo);
+        datalo = TCG_REG_A1;
+        /* FALLTHRU */
     case MO_64:
-        tcg_out_opc_imm(s, OPC_SW, datalo, base, LO_OFF);
-        tcg_out_opc_imm(s, OPC_SW, datahi, base, HI_OFF);
+        if (TCG_TARGET_REG_BITS == 32) {
+            tcg_out_opc_imm(s, OPC_SW, datalo, base, LO_OFF);
+            tcg_out_opc_imm(s, OPC_SW, datahi, base, HI_OFF);
+        } else {
+            tcg_out_opc_imm(s, OPC_SD, datalo, base, 0);
+        }
         break;
 
     default:
@@ -1511,9 +1601,9 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
 #endif
 
     data_regl = *args++;
-    data_regh = (is_64 ? *args++ : 0);
+    data_regh = (TCG_TARGET_REG_BITS == 32 && is_64 ? *args++ : 0);
     addr_regl = *args++;
-    addr_regh = (TARGET_LONG_BITS == 64 ? *args++ : 0);
+    addr_regh = (TCG_TARGET_REG_BITS < TARGET_LONG_BITS ? *args++ : 0);
     oi = *args++;
     opc = get_memop(oi);
 
@@ -1526,16 +1616,18 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
     add_qemu_ldst_label(s, 0, oi, data_regl, data_regh, addr_regl, addr_regh,
                         s->code_ptr, label_ptr);
 #else
+    base = TCG_REG_A0;
+    if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
+        tcg_out_ext32u(s, base, addr_regl);
+        addr_regl = base;
+    }
     if (guest_base == 0) {
         base = addr_regl;
+    } else if (guest_base == (int16_t)guest_base) {
+        tcg_out_opc_imm(s, ALIAS_PADDI, base, addr_regl, guest_base);
     } else {
-        base = TCG_REG_A0;
-        if (guest_base == (int16_t)guest_base) {
-            tcg_out_opc_imm(s, OPC_ADDIU, base, addr_regl, guest_base);
-        } else {
-            tcg_out_movi(s, TCG_TYPE_PTR, base, guest_base);
-            tcg_out_opc_reg(s, OPC_ADDU, base, base, addr_regl);
-        }
+        tcg_out_movi(s, TCG_TYPE_PTR, base, guest_base);
+        tcg_out_opc_reg(s, ALIAS_PADD, base, base, addr_regl);
     }
     tcg_out_qemu_st_direct(s, data_regl, data_regh, base, opc);
 #endif
@@ -1849,12 +1941,7 @@  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
     case INDEX_op_ext32u_i64:
     case INDEX_op_extu_i32_i64:
-        if (use_mips32r2_instructions) {
-            tcg_out_opc_bf(s, OPC_DEXT, a0, a1, 31, 0);
-        } else {
-            tcg_out_dsll(s, a0, a1, 32);
-            tcg_out_dsrl(s, a0, a0, 32);
-        }
+        tcg_out_ext32u(s, a0, a1);
         break;
 
     case INDEX_op_sar_i32:
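
A footnote on the tcg_out_ext32u helper the patch introduces: on
pre-r2 hosts it falls back to a DSLL/DSRL pair.  A one-line C model of
that fallback (a sketch of the effect, not the emitted code):

    #include <stdint.h>

    /* dsll ret, arg, 32 ; dsrl ret, ret, 32 -- zero-extend the low
       32 bits when DEXT is unavailable. */
    static uint64_t ext32u_fallback(uint64_t arg)
    {
        return (arg << 32) >> 32;
    }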