
tcg/mips: Add support for mips64el backend

Message ID 1478848833-25002-1-git-send-email-jinguojie@loongson.cn
State New

Commit Message

Jin Guojie Nov. 11, 2016, 7:20 a.m. UTC
From: Jin Guojie <jinguojie@loongson.cn>

This patch implements a TCG mips64r2 (little-endian) translation backend.
Tested on a Loongson 3A2000 (a MIPS64-compatible CPU) with Fedora Linux 21 Remix.
linux-0.2.img.bz2 runs well.
Performance is nearly 10 times higher than in TCI mode.

https://en.wikipedia.org/wiki/Loongson
http://www.loongnix.org/index.php/Loongnix

Cc: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Jin Guojie <jinguojie@loongson.cn>
---
 tcg/mips/tcg-target.h     |  59 +++++++++++++
 tcg/mips/tcg-target.inc.c | 215 +++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 264 insertions(+), 10 deletions(-)

Comments

Richard Henderson Nov. 13, 2016, 7:56 a.m. UTC | #1
On 11/11/2016 08:20 AM, jinguojie@loongson.cn wrote:
> From: Jin Guojie <jinguojie@loongson.cn>
>
> This patch implements a TCG mips64r2 (little-endian) translation backend.
> Tested on a Loongson 3A2000 (a MIPS64-compatible CPU) with Fedora Linux 21 Remix.
> linux-0.2.img.bz2 runs well.
> Performance is nearly 10 times higher than in TCI mode.
>
> https://en.wikipedia.org/wiki/Loongson
> http://www.loongnix.org/index.php/Loongnix
>
> Cc: Aurelien Jarno <aurelien@aurel32.net>
> Signed-off-by: Jin Guojie <jinguojie@loongson.cn>

Have you seen

https://lists.nongnu.org/archive/html/qemu-devel/2016-02/msg01910.html

?  I know there are bugs in that patch set, but I would like any mips64 support 
to look like that.  In particular, reduce the use of #if to an absolute minimum.

> +#if UINTPTR_MAX == UINT32_MAX
> +# define TCG_TARGET_REG_BITS 32
> +#elif UINTPTR_MAX == UINT64_MAX
> +# define TCG_TARGET_REG_BITS 64
> +#endif

There are two MIPS64 ABIs.  You're only allowing 64-bit code to be generated 
for n64, and not n32.
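
For the record, a minimal sketch of how both 64-bit ABIs could be selected,
using the _MIPS_SIM / _ABIO32 / _ABIN32 / _ABI64 macros that GCC predefines
for MIPS targets (the exact form here is a sketch, not code from either
patch):

   #if _MIPS_SIM == _ABIO32
   # define TCG_TARGET_REG_BITS 32   /* o32: 32-bit registers */
   #elif _MIPS_SIM == _ABIN32 || _MIPS_SIM == _ABI64
   # define TCG_TARGET_REG_BITS 64   /* n32 and n64: 64-bit registers */
   #else
   # error "Unknown MIPS ABI"
   #endif

Under n32, pointers are 32-bit but the registers are 64-bit, which is
exactly why a UINTPTR_MAX test misclassifies it.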

> +#undef use_movnz_instructions
> +#undef use_mips32_instructions
> +#undef use_mips32r6_instructions
> +
> +#define use_movnz_instructions  0
> +#define use_mips32_instructions  0
> +#define use_mips32r6_instructions  0

Why?  Certainly we should be able to generate code for mips64r2 and mips64r6.

> +#if TCG_TARGET_REG_BITS == 64
> +static const TCGReg tcg_target_call_oarg_regs[1] = {
> +    TCG_REG_V0,
> +};
> +#else
>  static const TCGReg tcg_target_call_oarg_regs[2] = {
>      TCG_REG_V0,
>      TCG_REG_V1
>  };
> +#endif

This change would be incorrect if we ever enhance tcg to handle __int128_t.  In 
the meantime it doesn't matter, and can be left unchanged.

> @@ -459,7 +502,15 @@ static inline void tcg_out_mov(TCGContext *s, TCGType type,
>  {
>      /* Simple reg-reg move, optimising out the 'do nothing' case */
>      if (ret != arg) {
> +#if TCG_TARGET_REG_BITS == 64
> +        if (type == TCG_TYPE_I32) {
> +            tcg_out_opc_reg(s, OPC_ADDU, ret, arg, TCG_REG_ZERO);
> +        } else {
> +            tcg_out_opc_reg(s, OPC_DADDU, ret, arg, TCG_REG_ZERO);
> +        }
> +#else
>          tcg_out_opc_reg(s, OPC_ADDU, ret, arg, TCG_REG_ZERO);
> +#endif
>      }

This is why a proper MIPS assembler uses OPC_OR for register moves.
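
As a sketch of that suggestion (not code from either patch): a move via
OPC_OR copies the full register width on both MIPS32 and MIPS64, so the
whole #if block above collapses to

   static inline void tcg_out_mov(TCGContext *s, TCGType type,
                                  TCGReg ret, TCGReg arg)
   {
       /* "or ret, arg, $zero" is a full-width copy on every MIPS ISA,
          so one opcode serves every TCGType with no ifdefs.  */
       if (ret != arg) {
           tcg_out_opc_reg(s, OPC_OR, ret, arg, TCG_REG_ZERO);
       }
   }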

>  }
>
> @@ -470,12 +521,21 @@ static inline void tcg_out_movi(TCGContext *s, TCGType type,
>          tcg_out_opc_imm(s, OPC_ADDIU, reg, TCG_REG_ZERO, arg);
>      } else if (arg == (uint16_t)arg) {
>          tcg_out_opc_imm(s, OPC_ORI, reg, TCG_REG_ZERO, arg);
> -    } else {
> +    } else if (arg == (int32_t)arg) {
>          tcg_out_opc_imm(s, OPC_LUI, reg, TCG_REG_ZERO, arg >> 16);
>          if (arg & 0xffff) {
>              tcg_out_opc_imm(s, OPC_ORI, reg, reg, arg & 0xffff);
>          }
>      }
> +#if TCG_TARGET_REG_BITS == 64
> +    /* 64-bit imm */
> +    else {
> +        tcg_out_opc_imm(s, OPC_LUI, reg, 0, (arg >> 32) & 0xffff);
> +        tcg_out_opc_imm(s, OPC_ORI, reg, reg, (arg >> 16) & 0xffff);
> +        tcg_out_opc_imm_64(s, OPC_DSLL, reg, reg, 16);
> +        tcg_out_opc_imm(s, OPC_ORI, reg, reg, arg & 0xffff);
> +    }
> +#endif

This is only a 48-bit immediate.
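
To make that concrete: building an arbitrary 64-bit constant needs all four
16-bit chunks plus two shifts, along the lines of the following sketch
(helper names taken from the patch; a production version would also skip
leading chunks that are zero):

   tcg_out_opc_imm(s, OPC_LUI, reg, TCG_REG_ZERO, (arg >> 48) & 0xffff);
   tcg_out_opc_imm(s, OPC_ORI, reg, reg, (arg >> 32) & 0xffff); /* bits 47..32 */
   tcg_out_opc_imm_64(s, OPC_DSLL, reg, reg, 16);
   tcg_out_opc_imm(s, OPC_ORI, reg, reg, (arg >> 16) & 0xffff); /* bits 31..16 */
   tcg_out_opc_imm_64(s, OPC_DSLL, reg, reg, 16);
   tcg_out_opc_imm(s, OPC_ORI, reg, reg, arg & 0xffff);         /* bits 15..0  */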

>  }
>
>  static inline void tcg_out_bswap16(TCGContext *s, TCGReg ret, TCGReg arg)
> @@ -566,7 +626,11 @@ static void tcg_out_ldst(TCGContext *s, MIPSInsn opc, TCGReg data,
>      if (ofs != lo) {
>          tcg_out_movi(s, TCG_TYPE_PTR, TCG_TMP0, ofs - lo);
>          if (addr != TCG_REG_ZERO) {
> +#if TCG_TARGET_REG_BITS == 64
> +            tcg_out_opc_reg(s, OPC_DADDU, TCG_TMP0, TCG_TMP0, addr);
> +#else
>              tcg_out_opc_reg(s, OPC_ADDU, TCG_TMP0, TCG_TMP0, addr);
> +#endif

See my patchset where I introduce OPC_PADDU to avoid this and other similar ifdefs.
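
The shape of that idea (the referenced patchset is authoritative; this is
only a sketch): define a pointer-sized add once, so call sites need no #if
at all.

   /* Pointer-sized add: DADDU on 64-bit hosts, ADDU on 32-bit hosts.  */
   #if TCG_TARGET_REG_BITS == 64
   # define OPC_PADDU  OPC_DADDU
   #else
   # define OPC_PADDU  OPC_ADDU
   #endif

   /* A call site like the one quoted above then becomes: */
   tcg_out_opc_reg(s, OPC_PADDU, TCG_TMP0, TCG_TMP0, addr);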

> @@ -1163,6 +1276,7 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
>      tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
>
>      v0 = l->datalo_reg;
> +#if TCG_TARGET_REG_BITS == 32
>      if ((opc & MO_SIZE) == MO_64) {
>          /* We eliminated V0 from the possible output registers, so it
>             cannot be clobbered here.  So we must move V1 first.  */
> @@ -1173,11 +1287,21 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
>              tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_V1);
>          }
>      }
> +#endif
>
>      reloc_pc16(s->code_ptr, l->raddr);
>      tcg_out_opc_br(s, OPC_BEQ, TCG_REG_ZERO, TCG_REG_ZERO);
>      /* delay slot */
> +#if TCG_TARGET_REG_BITS == 32
>      tcg_out_mov(s, TCG_TYPE_REG, v0, TCG_REG_V0);
> +#else
> +    /* ext unsigned long(32) -> 64-bit */
> +    if ((opc & MO_SIZE) == MO_32) {
> +        tcg_out_mov(s, TCG_TYPE_I32, v0, TCG_REG_V0);
> +    } else {
> +        tcg_out_mov(s, TCG_TYPE_REG, v0, TCG_REG_V0);
> +    }
> +#endif

This is incorrect, as you're not passing down whether the operation is a 32-bit 
load into a 32-bit temporary, or a 32-bit load into a 64-bit temporary.  I.e. 
the difference between

   unsigned int x = *(unsigned int *)ptr;
and
   unsigned long long x = *(unsigned int *)ptr;
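
One way to plumb that distinction through, assuming the TCGLabelQemuLdst
record carries the destination type of the load (the field name here is an
assumption), would be:

   /* Select the move width from the recorded destination type, not from
      MO_SIZE alone (sketch only).  */
   if (TCG_TARGET_REG_BITS == 64 && l->type == TCG_TYPE_I32) {
       /* 32-bit load into a 32-bit temp.  */
       tcg_out_mov(s, TCG_TYPE_I32, v0, TCG_REG_V0);
   } else {
       /* 64-bit destination: the helper's extended 64-bit result
          must be copied in full.  */
       tcg_out_mov(s, TCG_TYPE_REG, v0, TCG_REG_V0);
   }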


r~
Jin Guojie Nov. 14, 2016, 9:33 a.m. UTC | #2
Richard,

I have studied your v2 patch:

https://lists.nongnu.org/archive/html/qemu-devel/2016-02/msg02969.html

Though I have not tested it on a Loongson machine, I feel this patch
implements the MIPS64 ISA very completely, including big/little endian
and both the N32 and N64 ABIs. The use of #if is much cleaner, and many
corner cases are well handled. My patch is only a subset of yours.

I wonder why your patch has not been merged into mainline.
If I had seen it before, I would not have needed to waste time reinventing it.

Since a TCG target for MIPS64 is of great use to developers, I
really hope this feature can be merged into mainline.

Is your v2 patch still in the review process? Is there a chance for it
to be merged in the near term? Or does other work need to be done
before it can be merged?

I would like to hear your advice. Should I test your v2 patch on Loongson
and use it? Or is it worth modifying my patch and resubmitting it
according to your review comments?

Jin Guojie

Aurelien Jarno Nov. 14, 2016, 4:24 p.m. UTC | #3
Hi,

On 2016-11-14 17:33, Jin Guojie wrote:
> Is your v2 patch still in the review process? Is there a chance for it
> to be merged in the near term? Or does other work need to be done
> before it can be merged?

Please see:
 https://lists.nongnu.org/archive/html/qemu-devel/2016-02/msg06444.html

In short, this patch set looks overall good, but it breaks support for
existing big-endian 32-bit hosts. It also doesn't fully work on 64-bit
hosts for 32-bit guests, but I guess that's acceptable, as it's not a
regression.

Aurelien
Richard Henderson Nov. 15, 2016, 9:37 p.m. UTC | #4
On 11/14/2016 10:33 AM, Jin Guojie wrote:
> I would like to hear your advice. Should I test your v2 patch on Loongson
> and use it? Or is it worth modifying my patch and resubmitting it
> according to your review comments?

I would very much like you to test my patch on Loongson (or a
re-submission of it; I could perhaps prepare that against master in the
next few days).

If possible, I would like you to help fix the problems that Aurelien
discovered with my patch.  I have no access to MIPS hardware myself, so
all of the development I was doing was from within qemu itself.  As you
can imagine, qemu-in-qemu is very, very slow.

At the time I was hoping that people from imgtec would be able to help, but 
that never came to pass.  Oh well.


r~
James Hogan Nov. 16, 2016, 10:52 a.m. UTC | #5
Hi Richard,

On Tue, Nov 15, 2016 at 10:37:41PM +0100, Richard Henderson wrote:
> I would very much like you to test my patch on Loongson (or a
> re-submission of it; I could perhaps prepare that against master in the
> next few days).
> 
> If possible, I would like you to help fix the problems that Aurelien
> discovered with my patch.  I have no access to MIPS hardware myself, so
> all of the development I was doing was from within qemu itself.  As you
> can imagine, qemu-in-qemu is very, very slow.
> 
> At the time I was hoping that people from imgtec would be able to help,
> but that never came to pass.  Oh well.

I'm up for helping a bit with this (testing & debugging), though I admit
it fell off my radar a bit. We could try and run it up on our kernel
test farm too. Please keep me Cc'd on any future patches :)

Cheers
James

Patch

diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
index 3aeac87..4665e0b 100644
--- a/tcg/mips/tcg-target.h
+++ b/tcg/mips/tcg-target.h
@@ -31,6 +31,12 @@ 
 #define TCG_TARGET_TLB_DISPLACEMENT_BITS 16
 #define TCG_TARGET_NB_REGS 32
 
+#if UINTPTR_MAX == UINT32_MAX
+# define TCG_TARGET_REG_BITS 32
+#elif UINTPTR_MAX == UINT64_MAX
+# define TCG_TARGET_REG_BITS 64
+#endif
+
 typedef enum {
     TCG_REG_ZERO = 0,
     TCG_REG_AT,
@@ -40,6 +46,7 @@  typedef enum {
     TCG_REG_A1,
     TCG_REG_A2,
     TCG_REG_A3,
+#if TCG_TARGET_REG_BITS == 32
     TCG_REG_T0,
     TCG_REG_T1,
     TCG_REG_T2,
@@ -48,6 +55,16 @@  typedef enum {
     TCG_REG_T5,
     TCG_REG_T6,
     TCG_REG_T7,
+#else
+    TCG_REG_A4,
+    TCG_REG_A5,
+    TCG_REG_A6,
+    TCG_REG_A7,
+    TCG_REG_T0,
+    TCG_REG_T1,
+    TCG_REG_T2,
+    TCG_REG_T3,
+#endif
     TCG_REG_S0,
     TCG_REG_S1,
     TCG_REG_S2,
@@ -132,6 +149,48 @@  extern bool use_mips32r2_instructions;
 #define TCG_TARGET_HAS_ext8u_i32        0 /* andi rt, rs, 0xff   */
 #define TCG_TARGET_HAS_ext16u_i32       0 /* andi rt, rs, 0xffff */
 
+#if TCG_TARGET_REG_BITS == 64
+#define TCG_TARGET_HAS_extrl_i64_i32    0
+#define TCG_TARGET_HAS_extrh_i64_i32    0
+#define TCG_TARGET_HAS_bswap16_i64      1
+#define TCG_TARGET_HAS_bswap32_i64      1
+#define TCG_TARGET_HAS_bswap64_i64      1
+#define TCG_TARGET_HAS_deposit_i64      1
+#define TCG_TARGET_HAS_div_i64          0
+#define TCG_TARGET_HAS_rem_i64          0
+#define TCG_TARGET_HAS_ext8s_i64        1
+#define TCG_TARGET_HAS_ext16s_i64       1
+#define TCG_TARGET_HAS_ext32s_i64       1
+#define TCG_TARGET_HAS_ext8u_i64        1
+#define TCG_TARGET_HAS_ext16u_i64       1
+#define TCG_TARGET_HAS_ext32u_i64       1
+#define TCG_TARGET_HAS_andc_i64         0
+#define TCG_TARGET_HAS_eqv_i64          0
+#define TCG_TARGET_HAS_nand_i64         0
+#define TCG_TARGET_HAS_nor_i64          0
+#define TCG_TARGET_HAS_neg_i64          1
+#define TCG_TARGET_HAS_not_i64          1
+#define TCG_TARGET_HAS_orc_i64          0
+#define TCG_TARGET_HAS_rot_i64          1
+#define TCG_TARGET_HAS_movcond_i64      0
+#define TCG_TARGET_HAS_muls2_i64        0
+#define TCG_TARGET_HAS_add2_i32         0
+#define TCG_TARGET_HAS_sub2_i32         0
+#define TCG_TARGET_HAS_add2_i64         0
+#define TCG_TARGET_HAS_sub2_i64         0
+#define TCG_TARGET_HAS_mulu2_i64        0
+#define TCG_TARGET_HAS_muluh_i64        0
+#define TCG_TARGET_HAS_mulsh_i64        0
+
+#undef use_movnz_instructions
+#undef use_mips32_instructions
+#undef use_mips32r6_instructions
+
+#define use_movnz_instructions  0
+#define use_mips32_instructions  0
+#define use_mips32r6_instructions  0
+#endif /* TCG_TARGET_REG_BITS == 64 */
+
 #ifdef __OpenBSD__
 #include <machine/sysarch.h>
 #else
diff --git a/tcg/mips/tcg-target.inc.c b/tcg/mips/tcg-target.inc.c
index abce602..153a49a 100644
--- a/tcg/mips/tcg-target.inc.c
+++ b/tcg/mips/tcg-target.inc.c
@@ -93,33 +93,53 @@  static const int tcg_target_reg_alloc_order[] = {
     TCG_REG_T1,
     TCG_REG_T2,
     TCG_REG_T3,
+#if TCG_TARGET_REG_BITS == 32
     TCG_REG_T4,
     TCG_REG_T5,
     TCG_REG_T6,
     TCG_REG_T7,
+#endif
     TCG_REG_T8,
     TCG_REG_T9,
     TCG_REG_V1,
     TCG_REG_V0,
 
     /* Argument registers, opposite order of allocation.  */
+#if TCG_TARGET_REG_BITS == 64
+    TCG_REG_A7,
+    TCG_REG_A6,
+    TCG_REG_A5,
+    TCG_REG_A4,
+#endif
     TCG_REG_A3,
     TCG_REG_A2,
     TCG_REG_A1,
     TCG_REG_A0,
 };
 
-static const TCGReg tcg_target_call_iarg_regs[4] = {
+static const TCGReg tcg_target_call_iarg_regs[] = {
     TCG_REG_A0,
     TCG_REG_A1,
     TCG_REG_A2,
-    TCG_REG_A3
+    TCG_REG_A3,
+#if TCG_TARGET_REG_BITS == 64
+    TCG_REG_A4,
+    TCG_REG_A5,
+    TCG_REG_A6,
+    TCG_REG_A7,
+#endif
 };
 
+#if TCG_TARGET_REG_BITS == 64
+static const TCGReg tcg_target_call_oarg_regs[1] = {
+    TCG_REG_V0,
+};
+#else
 static const TCGReg tcg_target_call_oarg_regs[2] = {
     TCG_REG_V0,
     TCG_REG_V1
 };
+#endif
 
 static tcg_insn_unit *tb_ret_addr;
 
@@ -348,6 +368,14 @@  typedef enum {
     OPC_SYNC_ACQUIRE = OPC_SYNC | 0x11 << 5,
     OPC_SYNC_RELEASE = OPC_SYNC | 0x12 << 5,
     OPC_SYNC_RMB     = OPC_SYNC | 0x13 << 5,
+
+#if TCG_TARGET_REG_BITS == 64
+    OPC_DADDIU   = 0x19 << 26,
+    OPC_DADDU    = OPC_SPECIAL | 0x2D,
+    OPC_DSLL     = OPC_SPECIAL | 0x38,
+    OPC_LD       = 0x37 << 26,
+    OPC_SD       = 0x3F << 26,
+#endif
 } MIPSInsn;
 
 /*
@@ -380,6 +408,21 @@  static inline void tcg_out_opc_imm(TCGContext *s, MIPSInsn opc,
     tcg_out32(s, inst);
 }
 
+#if TCG_TARGET_REG_BITS == 64
+static inline void tcg_out_opc_imm_64(TCGContext *s, int opc,
+                                      int rd, int rt, int sa)
+{
+    int32_t inst;
+
+    inst = opc;
+    inst |= (rt & 0x1F) << 16;
+    inst |= (rd & 0x1F) << 11;
+    inst |= (sa & 0x1F) << 6;
+    tcg_out32(s, inst);
+}
+#endif
+
+
 /*
  * Type bitfield
  */
@@ -459,7 +502,15 @@  static inline void tcg_out_mov(TCGContext *s, TCGType type,
 {
     /* Simple reg-reg move, optimising out the 'do nothing' case */
     if (ret != arg) {
+#if TCG_TARGET_REG_BITS == 64
+        if (type == TCG_TYPE_I32) {
+            tcg_out_opc_reg(s, OPC_ADDU, ret, arg, TCG_REG_ZERO);
+        } else {
+            tcg_out_opc_reg(s, OPC_DADDU, ret, arg, TCG_REG_ZERO);
+        }
+#else
         tcg_out_opc_reg(s, OPC_ADDU, ret, arg, TCG_REG_ZERO);
+#endif
     }
 }
 
@@ -470,12 +521,21 @@  static inline void tcg_out_movi(TCGContext *s, TCGType type,
         tcg_out_opc_imm(s, OPC_ADDIU, reg, TCG_REG_ZERO, arg);
     } else if (arg == (uint16_t)arg) {
         tcg_out_opc_imm(s, OPC_ORI, reg, TCG_REG_ZERO, arg);
-    } else {
+    } else if (arg == (int32_t)arg) {
         tcg_out_opc_imm(s, OPC_LUI, reg, TCG_REG_ZERO, arg >> 16);
         if (arg & 0xffff) {
             tcg_out_opc_imm(s, OPC_ORI, reg, reg, arg & 0xffff);
         }
     }
+#if TCG_TARGET_REG_BITS == 64
+    /* 64-bit imm */
+    else {
+        tcg_out_opc_imm(s, OPC_LUI, reg, 0, (arg >> 32) & 0xffff);
+        tcg_out_opc_imm(s, OPC_ORI, reg, reg, (arg >> 16) & 0xffff);
+        tcg_out_opc_imm_64(s, OPC_DSLL, reg, reg, 16);
+        tcg_out_opc_imm(s, OPC_ORI, reg, reg, arg & 0xffff);
+    }
+#endif
 }
 
 static inline void tcg_out_bswap16(TCGContext *s, TCGReg ret, TCGReg arg)
@@ -566,7 +626,11 @@  static void tcg_out_ldst(TCGContext *s, MIPSInsn opc, TCGReg data,
     if (ofs != lo) {
         tcg_out_movi(s, TCG_TYPE_PTR, TCG_TMP0, ofs - lo);
         if (addr != TCG_REG_ZERO) {
+#if TCG_TARGET_REG_BITS == 64
+            tcg_out_opc_reg(s, OPC_DADDU, TCG_TMP0, TCG_TMP0, addr);
+#else
             tcg_out_opc_reg(s, OPC_ADDU, TCG_TMP0, TCG_TMP0, addr);
+#endif
         }
         addr = TCG_TMP0;
     }
@@ -576,13 +640,29 @@  static void tcg_out_ldst(TCGContext *s, MIPSInsn opc, TCGReg data,
 static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg,
                               TCGReg arg1, intptr_t arg2)
 {
+#if TCG_TARGET_REG_BITS == 64
+    if (type == TCG_TYPE_I32) {
+        tcg_out_ldst(s, OPC_LW, arg, arg1, arg2);
+    } else if (type == TCG_TYPE_I64) {
+        tcg_out_ldst(s, OPC_LD, arg, arg1, arg2);
+    }
+#else
     tcg_out_ldst(s, OPC_LW, arg, arg1, arg2);
+#endif
 }
 
 static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                               TCGReg arg1, intptr_t arg2)
 {
+#if TCG_TARGET_REG_BITS == 64
+    if (type == TCG_TYPE_I32) {
+        tcg_out_ldst(s, OPC_SW, arg, arg1, arg2);
+    } else if (type == TCG_TYPE_I64) {
+        tcg_out_ldst(s, OPC_SD, arg, arg1, arg2);
+    }
+#else
     tcg_out_ldst(s, OPC_SW, arg, arg1, arg2);
+#endif
 }
 
 static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
@@ -605,6 +685,18 @@  static inline void tcg_out_addi(TCGContext *s, TCGReg reg, TCGArg val)
     }
 }
 
+#if TCG_TARGET_REG_BITS == 64
+static inline void tcg_out_daddi(TCGContext *s, int reg, tcg_target_long val)
+{
+    if (val == (int16_t)val) {
+        tcg_out_opc_imm(s, OPC_DADDIU, reg, reg, val);
+    } else {
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_TMP0, val);
+        tcg_out_opc_reg(s, OPC_DADDU, reg, reg, TCG_REG_AT);
+    }
+}
+#endif
+
 static void tcg_out_addsub2(TCGContext *s, TCGReg rl, TCGReg rh, TCGReg al,
                             TCGReg ah, TCGArg bl, TCGArg bh, bool cbl,
                             bool cbh, bool is_sub)
@@ -1063,7 +1155,11 @@  static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
                    TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
     tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_A0, TCG_REG_A0,
                     (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
+#if TCG_TARGET_REG_BITS == 64
+    tcg_out_opc_reg(s, OPC_DADDU, TCG_REG_A0, TCG_REG_A0, TCG_AREG0);
+#else
     tcg_out_opc_reg(s, OPC_ADDU, TCG_REG_A0, TCG_REG_A0, TCG_AREG0);
+#endif
 
     /* Compensate for very large offsets.  */
     if (add_off >= 0x8000) {
@@ -1073,14 +1169,23 @@  static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
         QEMU_BUILD_BUG_ON(offsetof(CPUArchState,
                                    tlb_table[NB_MMU_MODES - 1][1])
                           > 0x7ff0 + 0x7fff);
+#if TCG_TARGET_REG_BITS == 64
+        tcg_out_opc_imm(s, OPC_DADDIU, TCG_REG_A0, TCG_REG_A0, 0x7ff0);
+#else
         tcg_out_opc_imm(s, OPC_ADDIU, TCG_REG_A0, TCG_REG_A0, 0x7ff0);
+#endif
         cmp_off -= 0x7ff0;
         add_off -= 0x7ff0;
     }
 
     /* Load the (low half) tlb comparator.  */
+#if TCG_TARGET_REG_BITS == 64
+    tcg_out_opc_imm(s, OPC_LWU, TCG_TMP0, TCG_REG_A0,
+                    cmp_off + (TARGET_LONG_BITS == 64 ? LO_OFF : 0));
+#else
     tcg_out_opc_imm(s, OPC_LW, TCG_TMP0, TCG_REG_A0,
                     cmp_off + (TARGET_LONG_BITS == 64 ? LO_OFF : 0));
+#endif
 
     /* We don't currently support unaligned accesses.
        We could do so with mips32r6.  */
@@ -1092,7 +1197,11 @@  static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
     tcg_out_movi(s, TCG_TYPE_I32, TCG_TMP1,
                  TARGET_PAGE_MASK | ((1 << a_bits) - 1));
     if (TARGET_LONG_BITS == 32) {
+#if TCG_TARGET_REG_BITS == 64
+        tcg_out_opc_imm(s, OPC_LD, TCG_REG_A0, TCG_REG_A0, add_off);
+#else
         tcg_out_opc_imm(s, OPC_LW, TCG_REG_A0, TCG_REG_A0, add_off);
+#endif
     }
     tcg_out_opc_reg(s, OPC_AND, TCG_TMP1, TCG_TMP1, addrl);
 
@@ -1114,7 +1223,11 @@  static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
     }
 
     /* delay slot */
+#if TCG_TARGET_REG_BITS == 64
+    tcg_out_opc_reg(s, OPC_DADDU, base, TCG_REG_A0, addrl);
+#else
     tcg_out_opc_reg(s, OPC_ADDU, base, TCG_REG_A0, addrl);
+#endif
 }
 
 static void add_qemu_ldst_label(TCGContext *s, int is_ld, TCGMemOpIdx oi,
@@ -1163,6 +1276,7 @@  static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
     tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
 
     v0 = l->datalo_reg;
+#if TCG_TARGET_REG_BITS == 32
     if ((opc & MO_SIZE) == MO_64) {
         /* We eliminated V0 from the possible output registers, so it
            cannot be clobbered here.  So we must move V1 first.  */
@@ -1173,11 +1287,21 @@  static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
             tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_V1);
         }
     }
+#endif
 
     reloc_pc16(s->code_ptr, l->raddr);
     tcg_out_opc_br(s, OPC_BEQ, TCG_REG_ZERO, TCG_REG_ZERO);
     /* delay slot */
+#if TCG_TARGET_REG_BITS == 32
     tcg_out_mov(s, TCG_TYPE_REG, v0, TCG_REG_V0);
+#else
+    /* ext unsigned long(32) -> 64-bit */
+    if ((opc & MO_SIZE) == MO_32) {
+        tcg_out_mov(s, TCG_TYPE_I32, v0, TCG_REG_V0);
+    } else {
+        tcg_out_mov(s, TCG_TYPE_REG, v0, TCG_REG_V0);
+    }
+#endif
 }
 
 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
@@ -1210,7 +1334,11 @@  static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
         i = tcg_out_call_iarg_reg(s, i, l->datalo_reg);
         break;
     case MO_64:
+#if TCG_TARGET_REG_BITS == 64
+        i = tcg_out_call_iarg_reg(s, i, l->datalo_reg);
+#else
         i = tcg_out_call_iarg_reg2(s, i, l->datalo_reg, l->datahi_reg);
+#endif
         break;
     default:
         tcg_abort();
@@ -1265,8 +1393,12 @@  static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
         tcg_out_bswap32(s, datahi, TCG_TMP1);
         break;
     case MO_Q:
+#if TCG_TARGET_REG_BITS == 64
+        tcg_out_opc_imm(s, OPC_LD, datalo, base, 0);
+#else
         tcg_out_opc_imm(s, OPC_LW, datalo, base, LO_OFF);
         tcg_out_opc_imm(s, OPC_LW, datahi, base, HI_OFF);
+#endif
         break;
     default:
         tcg_abort();
@@ -1287,7 +1419,11 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
     TCGReg base = TCG_REG_V0;
 
     data_regl = *args++;
+#if TCG_TARGET_REG_BITS == 64
+    data_regh = 0;
+#else
     data_regh = (is_64 ? *args++ : 0);
+#endif
     addr_regl = *args++;
     addr_regh = (TARGET_LONG_BITS == 64 ? *args++ : 0);
     oi = *args++;
@@ -1343,8 +1479,12 @@  static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
         tcg_out_opc_imm(s, OPC_SW, TCG_TMP1, base, LO_OFF);
         break;
     case MO_64:
+#if TCG_TARGET_REG_BITS == 64
+        tcg_out_opc_imm(s, OPC_SD, datalo, base, 0);
+#else
         tcg_out_opc_imm(s, OPC_SW, datalo, base, LO_OFF);
         tcg_out_opc_imm(s, OPC_SW, datahi, base, HI_OFF);
+#endif
         break;
 
     default:
@@ -1363,7 +1503,11 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
 #endif
 
     data_regl = *args++;
+#if TCG_TARGET_REG_BITS == 64
+    data_regh = 0;
+#else
     data_regh = (is_64 ? *args++ : 0);
+#endif
     addr_regl = *args++;
     addr_regh = (TARGET_LONG_BITS == 64 ? *args++ : 0);
     oi = *args++;
@@ -1481,6 +1625,15 @@  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         goto do_ldst;
     case INDEX_op_st_i32:
         i1 = OPC_SW;
+        goto do_ldst;
+#if TCG_TARGET_REG_BITS == 64
+    case INDEX_op_ld_i64:
+        i1 = OPC_LD;
+        goto do_ldst;
+    case INDEX_op_st_i64:
+        i1 = OPC_SD;
+        goto do_ldst;
+#endif
     do_ldst:
         tcg_out_ldst(s, i1, a0, a1, a2);
         break;
@@ -1488,6 +1641,11 @@  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_add_i32:
         i1 = OPC_ADDU, i2 = OPC_ADDIU;
         goto do_binary;
+#if TCG_TARGET_REG_BITS == 64
+    case INDEX_op_add_i64:
+        i1 = OPC_DADDU, i2 = OPC_DADDIU;
+        goto do_binary;
+#endif
     case INDEX_op_or_i32:
         i1 = OPC_OR, i2 = OPC_ORI;
         goto do_binary;
@@ -1683,6 +1841,9 @@  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
+#if TCG_TARGET_REG_BITS == 64
+    case INDEX_op_movi_i64:
+#endif
     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
     default:
         tcg_abort();
@@ -1702,8 +1863,14 @@  static const TCGTargetOpDef mips_op_defs[] = {
     { INDEX_op_st8_i32, { "rZ", "r" } },
     { INDEX_op_st16_i32, { "rZ", "r" } },
     { INDEX_op_st_i32, { "rZ", "r" } },
-
+#if TCG_TARGET_REG_BITS == 64
+    { INDEX_op_ld_i64, { "r", "r" } },
+    { INDEX_op_st_i64, { "rZ", "r" } },
+#endif
     { INDEX_op_add_i32, { "r", "rZ", "rJ" } },
+#if TCG_TARGET_REG_BITS == 64
+    { INDEX_op_add_i64, { "r", "rZ", "rJ" } },
+#endif
     { INDEX_op_mul_i32, { "r", "rZ", "rZ" } },
 #if !use_mips32r6_instructions
     { INDEX_op_muls2_i32, { "r", "r", "rZ", "rZ" } },
@@ -1753,8 +1920,13 @@  static const TCGTargetOpDef mips_op_defs[] = {
 #if TARGET_LONG_BITS == 32
     { INDEX_op_qemu_ld_i32, { "L", "lZ" } },
     { INDEX_op_qemu_st_i32, { "SZ", "SZ" } },
+  #if TCG_TARGET_REG_BITS == 64
+    { INDEX_op_qemu_ld_i64, { "L", "lZ" } },
+    { INDEX_op_qemu_st_i64, { "SZ", "SZ" } },
+  #else
     { INDEX_op_qemu_ld_i64, { "L", "L", "lZ" } },
     { INDEX_op_qemu_st_i64, { "SZ", "SZ", "SZ" } },
+  #endif
 #else
     { INDEX_op_qemu_ld_i32, { "L", "lZ", "lZ" } },
     { INDEX_op_qemu_st_i32, { "SZ", "SZ", "SZ" } },
@@ -1862,22 +2034,34 @@  static void tcg_target_detect_isa(void)
 static void tcg_target_qemu_prologue(TCGContext *s)
 {
     int i, frame_size;
+#if TCG_TARGET_REG_BITS == 64
+  #define SAVE_REG_SIZE 8
+  #define SAVE_REG_TYPE TCG_TYPE_I64
+#else
+  #define SAVE_REG_SIZE 4
+  #define SAVE_REG_TYPE TCG_TYPE_I32
+#endif
 
     /* reserve some stack space, also for TCG temps. */
-    frame_size = ARRAY_SIZE(tcg_target_callee_save_regs) * 4
+    frame_size = ARRAY_SIZE(tcg_target_callee_save_regs) * SAVE_REG_SIZE
                  + TCG_STATIC_CALL_ARGS_SIZE
                  + CPU_TEMP_BUF_NLONGS * sizeof(long);
     frame_size = (frame_size + TCG_TARGET_STACK_ALIGN - 1) &
                  ~(TCG_TARGET_STACK_ALIGN - 1);
-    tcg_set_frame(s, TCG_REG_SP, ARRAY_SIZE(tcg_target_callee_save_regs) * 4
+    tcg_set_frame(s, TCG_REG_SP,
+                  ARRAY_SIZE(tcg_target_callee_save_regs) * SAVE_REG_SIZE
                   + TCG_STATIC_CALL_ARGS_SIZE,
                   CPU_TEMP_BUF_NLONGS * sizeof(long));
 
     /* TB prologue */
+#if TCG_TARGET_REG_BITS == 64
+    tcg_out_daddi(s, TCG_REG_SP, -frame_size);
+#else
     tcg_out_addi(s, TCG_REG_SP, -frame_size);
+#endif
     for(i = 0 ; i < ARRAY_SIZE(tcg_target_callee_save_regs) ; i++) {
-        tcg_out_st(s, TCG_TYPE_I32, tcg_target_callee_save_regs[i],
-                   TCG_REG_SP, TCG_STATIC_CALL_ARGS_SIZE + i * 4);
+        tcg_out_st(s, SAVE_REG_TYPE, tcg_target_callee_save_regs[i],
+                   TCG_REG_SP, TCG_STATIC_CALL_ARGS_SIZE + i * SAVE_REG_SIZE);
     }
 
     /* Call generated code */
@@ -1887,12 +2071,16 @@  static void tcg_target_qemu_prologue(TCGContext *s)
 
     /* TB epilogue */
     for(i = 0 ; i < ARRAY_SIZE(tcg_target_callee_save_regs) ; i++) {
-        tcg_out_ld(s, TCG_TYPE_I32, tcg_target_callee_save_regs[i],
-                   TCG_REG_SP, TCG_STATIC_CALL_ARGS_SIZE + i * 4);
+        tcg_out_ld(s, SAVE_REG_TYPE, tcg_target_callee_save_regs[i],
+                   TCG_REG_SP, TCG_STATIC_CALL_ARGS_SIZE + i * SAVE_REG_SIZE);
     }
 
     tcg_out_opc_reg(s, OPC_JR, 0, TCG_REG_RA, 0);
+#if TCG_TARGET_REG_BITS == 64
+    tcg_out_daddi(s, TCG_REG_SP, frame_size);
+#else
     tcg_out_addi(s, TCG_REG_SP, frame_size);
+#endif
 }
 
 static void tcg_target_init(TCGContext *s)
@@ -1910,10 +2098,17 @@  static void tcg_target_init(TCGContext *s)
                    (1 << TCG_REG_T1) |
                    (1 << TCG_REG_T2) |
                    (1 << TCG_REG_T3) |
+#if TCG_TARGET_REG_BITS == 32
                    (1 << TCG_REG_T4) |
                    (1 << TCG_REG_T5) |
                    (1 << TCG_REG_T6) |
                    (1 << TCG_REG_T7) |
+#else
+                   (1 << TCG_REG_A4) |
+                   (1 << TCG_REG_A5) |
+                   (1 << TCG_REG_A6) |
+                   (1 << TCG_REG_A7) |
+#endif
                    (1 << TCG_REG_T8) |
                    (1 << TCG_REG_T9));