diff mbox

[v2,12/22] target-mips: add ALIGN, DALIGN, BITSWAP and DBITSWAP instructions

Message ID 1402499992-64851-13-git-send-email-leon.alrae@imgtec.com
State New
Headers show

Commit Message

Leon Alrae June 11, 2014, 3:19 p.m. UTC
From: Yongbok Kim <yongbok.kim@imgtec.com>

Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
Signed-off-by: Leon Alrae <leon.alrae@imgtec.com>
---
v2:
* have separate bitswap and dbitswap helpers and use common function
* use TCG_CALL_NO_RWG_SE flag for bitswap and dbitswap helpers
* remove useless shift in ALIGN and DALIGN
* improve ALIGN implementation by merging rt and rs into 64-bit register
* add missing zero register case
---
 disas/mips.c            |    4 ++
 target-mips/helper.h    |    5 ++
 target-mips/op_helper.c |   24 +++++++++
 target-mips/translate.c |  122 ++++++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 143 insertions(+), 12 deletions(-)

Comments

Richard Henderson June 11, 2014, 4:39 p.m. UTC | #1
On 06/11/2014 08:19 AM, Leon Alrae wrote:
> * add missing zero register case

What missing zero register case?

> +        if (rd == 0) {
> +            /* Treat as NOP. */
> +            break;
> +        }

This is normally handled by gen_store_gpr...

> +            if (rt == 0) {
> +                tcg_gen_movi_tl(cpu_gpr[rd], 0);
> +            } else {
> +                gen_helper_dbitswap(cpu_gpr[rd], cpu_gpr[rt]);
> +            }
> +            break;

... and this is normally handed by gen_load_gpr.

Open-coding these tests just clutters the code, making it harder to read.  C.f.
the 1500 lines removed during a cleanup of target-alpha for exactly this sort
of thing.


r~
Leon Alrae June 12, 2014, 8:35 a.m. UTC | #2
On 11/06/2014 17:39, Richard Henderson wrote:
> On 06/11/2014 08:19 AM, Leon Alrae wrote:
>> * add missing zero register case
> 
> What missing zero register case?
> 
>> +        if (rd == 0) {
>> +            /* Treat as NOP. */
>> +            break;
>> +        }
> 
> This is normally handled by gen_store_gpr...
> 
>> +            if (rt == 0) {
>> +                tcg_gen_movi_tl(cpu_gpr[rd], 0);
>> +            } else {
>> +                gen_helper_dbitswap(cpu_gpr[rd], cpu_gpr[rt]);
>> +            }
>> +            break;
> 
> ... and this is normally handed by gen_load_gpr.
> 
> Open-coding these tests just clutters the code, making it harder to read.  C.f.
> the 1500 lines removed during a cleanup of target-alpha for exactly this sort
> of thing.

Yes, it clutters the code a bit. However, by looking at the other
instructions I got the impression that preferable approach is to avoid
generating unnecessary code if we can. This seems to be sensible so I'm
trying to stick to it. So in bitswap example I think there is no point
in calling the helper if we know that the result will be 0.

Regards,
Leon
Richard Henderson June 12, 2014, 2:34 p.m. UTC | #3
On 06/12/2014 01:35 AM, Leon Alrae wrote:
> Yes, it clutters the code a bit. However, by looking at the other
> instructions I got the impression that preferable approach is to avoid
> generating unnecessary code if we can. This seems to be sensible so I'm
> trying to stick to it. So in bitswap example I think there is no point
> in calling the helper if we know that the result will be 0.

On the other hand, for certain values of never that exclude conformance testing
software, one will never see r0 as input to bswap.  So it's still a useless
optimization.


r~
Aurelien Jarno June 19, 2014, 9:06 p.m. UTC | #4
On Wed, Jun 11, 2014 at 04:19:42PM +0100, Leon Alrae wrote:
> From: Yongbok Kim <yongbok.kim@imgtec.com>
> 
> Signed-off-by: Yongbok Kim <yongbok.kim@imgtec.com>
> Signed-off-by: Leon Alrae <leon.alrae@imgtec.com>
> ---
> v2:
> * have separate bitswap and dbitswap helpers and use common function
> * use TCG_CALL_NO_RWG_SE flag for bitswap and dbitswap helpers
> * remove useless shift in ALIGN and DALIGN
> * improve ALIGN implementation by merging rt and rs into 64-bit register
> * add missing zero register case
> ---
>  disas/mips.c            |    4 ++
>  target-mips/helper.h    |    5 ++
>  target-mips/op_helper.c |   24 +++++++++
>  target-mips/translate.c |  122 ++++++++++++++++++++++++++++++++++++++++++-----
>  4 files changed, 143 insertions(+), 12 deletions(-)
> 
> diff --git a/disas/mips.c b/disas/mips.c
> index 8cb9032..127a7d5 100644
> --- a/disas/mips.c
> +++ b/disas/mips.c
> @@ -1246,6 +1246,10 @@ const struct mips_opcode mips_builtin_opcodes[] =
>  {"cache",   "k,o(b)",   0x7c000025, 0xfc00003f, RD_b,                 0, I32R6},
>  {"seleqz",  "d,v,t",    0x00000035, 0xfc0007ff, WR_d|RD_s|RD_t,       0, I32R6},
>  {"selnez",  "d,v,t",    0x00000037, 0xfc0007ff, WR_d|RD_s|RD_t,       0, I32R6},
> +{"align",   "d,v,t",    0x7c000220, 0xfc00073f, WR_d|RD_s|RD_t,       0, I32R6},
> +{"dalign",  "d,v,t",    0x7c000224, 0xfc00063f, WR_d|RD_s|RD_t,       0, I64R6},
> +{"bitswap", "d,w",      0x7c000020, 0xffe007ff, WR_d|RD_t,            0, I32R6},
> +{"dbitswap","d,w",      0x7c000024, 0xffe007ff, WR_d|RD_t,            0, I64R6},
>  {"pref",    "k,o(b)",   0xcc000000, 0xfc000000, RD_b,           	0,		I4|I32|G3	},
>  {"prefx",   "h,t(b)",	0x4c00000f, 0xfc0007ff, RD_b|RD_t,		0,		I4|I33	},
>  {"nop",     "",         0x00000000, 0xffffffff, 0,              	INSN2_ALIAS,	I1      }, /* sll */
> diff --git a/target-mips/helper.h b/target-mips/helper.h
> index 74ef094..5511dfc 100644
> --- a/target-mips/helper.h
> +++ b/target-mips/helper.h
> @@ -39,6 +39,11 @@ DEF_HELPER_3(macchiu, tl, env, tl, tl)
>  DEF_HELPER_3(msachi, tl, env, tl, tl)
>  DEF_HELPER_3(msachiu, tl, env, tl, tl)
>  
> +DEF_HELPER_FLAGS_1(bitswap, TCG_CALL_NO_RWG_SE, tl, tl)
> +#ifdef TARGET_MIPS64
> +DEF_HELPER_FLAGS_1(dbitswap, TCG_CALL_NO_RWG_SE, tl, tl)
> +#endif
> +
>  #ifndef CONFIG_USER_ONLY
>  /* CP0 helpers */
>  DEF_HELPER_1(mfc0_mvpcontrol, tl, env)
> diff --git a/target-mips/op_helper.c b/target-mips/op_helper.c
> index 4704216..34e9823 100644
> --- a/target-mips/op_helper.c
> +++ b/target-mips/op_helper.c
> @@ -265,6 +265,30 @@ target_ulong helper_mulshiu(CPUMIPSState *env, target_ulong arg1,
>                         (uint64_t)(uint32_t)arg2);
>  }
>  
> +static inline target_ulong bitswap(target_ulong v)
> +{
> +    v = ((v >> 1) & (target_ulong)0x5555555555555555) |
> +              ((v & (target_ulong)0x5555555555555555) << 1);
> +    v = ((v >> 2) & (target_ulong)0x3333333333333333) |
> +              ((v & (target_ulong)0x3333333333333333) << 2);
> +    v = ((v >> 4) & (target_ulong)0x0F0F0F0F0F0F0F0F) |
> +              ((v & (target_ulong)0x0F0F0F0F0F0F0F0F) << 4);
> +    return v;
> +}
> +
> +
> +#ifdef TARGET_MIPS64
> +target_ulong helper_dbitswap(target_ulong rt)
> +{
> +    return bitswap(rt);
> +}
> +#endif
> +
> +target_ulong helper_bitswap(target_ulong rt)
> +{
> +    return (int32_t)bitswap(rt);
> +}
> +
>  #ifndef CONFIG_USER_ONLY
>  
>  static inline hwaddr do_translate_address(CPUMIPSState *env,
> diff --git a/target-mips/translate.c b/target-mips/translate.c
> index 363b178..270e7d3 100644
> --- a/target-mips/translate.c
> +++ b/target-mips/translate.c
> @@ -390,17 +390,23 @@ enum {
>  #define MASK_BSHFL(op)     MASK_SPECIAL3(op) | (op & (0x1F << 6))
>  
>  enum {
> -    OPC_WSBH     = (0x02 << 6) | OPC_BSHFL,
> -    OPC_SEB      = (0x10 << 6) | OPC_BSHFL,
> -    OPC_SEH      = (0x18 << 6) | OPC_BSHFL,
> +    OPC_WSBH      = (0x02 << 6) | OPC_BSHFL,
> +    OPC_SEB       = (0x10 << 6) | OPC_BSHFL,
> +    OPC_SEH       = (0x18 << 6) | OPC_BSHFL,
> +    OPC_ALIGN     = (0x08 << 6) | OPC_BSHFL, /* 010.bp */
> +    OPC_ALIGN_END = (0x0B << 6) | OPC_BSHFL, /* 010.00 to 010.11 */
> +    OPC_BITSWAP   = (0x00 << 6) | OPC_BSHFL  /* 00000 */
>  };
>  
>  /* DBSHFL opcodes */
>  #define MASK_DBSHFL(op)    MASK_SPECIAL3(op) | (op & (0x1F << 6))
>  
>  enum {
> -    OPC_DSBH     = (0x02 << 6) | OPC_DBSHFL,
> -    OPC_DSHD     = (0x05 << 6) | OPC_DBSHFL,
> +    OPC_DSBH       = (0x02 << 6) | OPC_DBSHFL,
> +    OPC_DSHD       = (0x05 << 6) | OPC_DBSHFL,
> +    OPC_DALIGN     = (0x08 << 6) | OPC_DBSHFL, /* 01.bp */
> +    OPC_DALIGN_END = (0x0F << 6) | OPC_DBSHFL, /* 01.000 to 01.111 */
> +    OPC_DBITSWAP   = (0x00 << 6) | OPC_DBSHFL, /* 00000 */
>  };
>  
>  /* MIPS DSP REGIMM opcodes */
> @@ -15111,12 +15117,14 @@ static void decode_opc_special2_legacy(CPUMIPSState *env, DisasContext *ctx)
>  
>  static void decode_opc_special3_r6(CPUMIPSState *env, DisasContext *ctx)
>  {
> -    int rs, rt;
> -    uint32_t op1;
> +    int rs, rt, rd, sa;
> +    uint32_t op1, op2;
>      int16_t imm;
>  
>      rs = (ctx->opcode >> 21) & 0x1f;
>      rt = (ctx->opcode >> 16) & 0x1f;
> +    rd = (ctx->opcode >> 11) & 0x1f;
> +    sa = (ctx->opcode >> 6) & 0x1f;
>      imm = (int16_t)ctx->opcode >> 7;
>  
>      op1 = MASK_SPECIAL3(ctx->opcode);
> @@ -15137,6 +15145,44 @@ static void decode_opc_special3_r6(CPUMIPSState *env, DisasContext *ctx)
>      case R6_OPC_LL:
>          gen_ld(ctx, op1, rt, rs, imm);
>          break;
> +    case OPC_BSHFL:
> +        if (rd == 0) {
> +            /* Treat as NOP. */
> +            break;
> +        }
> +        op2 = MASK_BSHFL(ctx->opcode);
> +        switch (op2) {
> +        case OPC_ALIGN ... OPC_ALIGN_END:
> +            sa &= 3;
> +            TCGv t0 = tcg_temp_new();
> +            gen_load_gpr(t0, rt);
> +            if (sa == 0) {
> +                tcg_gen_mov_tl(cpu_gpr[rd], t0);
> +            } else {
> +                TCGv t1 = tcg_temp_new();
> +                TCGv_i64 t2 = tcg_temp_new_i64();
> +                gen_load_gpr(t1, rs);
> +                tcg_gen_concat_tl_i64(t2, t1, t0);
> +                tcg_gen_shri_i64(t2, t2, 8 * (4 - sa));
> +#if defined(TARGET_MIPS64)
> +                tcg_gen_ext32s_i64(cpu_gpr[rd], t2);
> +#else
> +                tcg_gen_trunc_i64_i32(cpu_gpr[rd], t2);
> +#endif
> +                tcg_temp_free_i64(t2);
> +                tcg_temp_free(t1);
> +            }
> +            tcg_temp_free(t0);
> +            break;
> +        case OPC_BITSWAP:
> +            if (rt == 0) {
> +                tcg_gen_movi_tl(cpu_gpr[rd], 0);
> +            } else {
> +                gen_helper_bitswap(cpu_gpr[rd], cpu_gpr[rt]);
> +            }
> +            break;
> +        }
> +        break;
>  #if defined(TARGET_MIPS64)
>      case R6_OPC_SCD:
>          gen_st_cond(ctx, op1, rt, rs, imm);
> @@ -15144,6 +15190,39 @@ static void decode_opc_special3_r6(CPUMIPSState *env, DisasContext *ctx)
>      case R6_OPC_LLD:
>          gen_ld(ctx, op1, rt, rs, imm);
>          break;
> +    case OPC_DBSHFL:
> +        if (rd == 0) {
> +            /* Treat as NOP. */
> +            break;
> +        }

This optimisation is fine, otherwise we'll end up writing in the
wrong register.

> +        op2 = MASK_DBSHFL(ctx->opcode);
> +        switch (op2) {
> +        case OPC_DALIGN ... OPC_DALIGN_END:
> +            sa &= 7;
> +            check_mips_64(ctx);
> +            TCGv t0 = tcg_temp_new();
> +            gen_load_gpr(t0, rt);
> +            if (sa == 0) {
> +                tcg_gen_mov_tl(cpu_gpr[rd], t0);
> +            } else {
> +                TCGv t1 = tcg_temp_new();
> +                gen_load_gpr(t1, rs);
> +                tcg_gen_shli_tl(t0, t0, 8 * sa);
> +                tcg_gen_shri_tl(t1, t1, 8 * (8 - sa));
> +                tcg_gen_or_tl(cpu_gpr[rd], t1, t0);
> +                tcg_temp_free(t1);
> +            }
> +            tcg_temp_free(t0);
> +            break;
> +        case OPC_DBITSWAP:
> +            if (rt == 0) {
> +                tcg_gen_movi_tl(cpu_gpr[rd], 0);
> +            } else {
> +                gen_helper_dbitswap(cpu_gpr[rd], cpu_gpr[rt]);
> +            }
> +            break;

As Richard said, this is not really useful if it is not common to call
the instruction that way.

It's true we have existing code like that, but it is usually either
needed for correctness (in the shift cases), or for performances (to
avoid doing additions for MOV/MOVI, or to skip the usual forms of NOP).

> +        }
> +        break;
>  #endif
>      default:            /* Invalid */
>          MIPS_INVAL("special3_r6");
> @@ -15689,9 +15768,18 @@ static void decode_opc_special3(CPUMIPSState *env, DisasContext *ctx)
>          gen_bitops(ctx, op1, rt, rs, sa, rd);
>          break;
>      case OPC_BSHFL:
> -        check_insn(ctx, ISA_MIPS32R2);
>          op2 = MASK_BSHFL(ctx->opcode);
> -        gen_bshfl(ctx, op2, rt, rd);
> +        switch (op2) {
> +        case OPC_ALIGN ... OPC_ALIGN_END:
> +        case OPC_BITSWAP:
> +            check_insn(ctx, ISA_MIPS32R6);
> +            decode_opc_special3_r6(env, ctx);
> +            break;
> +        default:
> +            check_insn(ctx, ISA_MIPS32R2);
> +            gen_bshfl(ctx, op2, rt, rd);
> +            break;
> +        }
>          break;
>  #if defined(TARGET_MIPS64)
>      case OPC_DEXTM ... OPC_DEXT:
> @@ -15701,10 +15789,20 @@ static void decode_opc_special3(CPUMIPSState *env, DisasContext *ctx)
>          gen_bitops(ctx, op1, rt, rs, sa, rd);
>          break;
>      case OPC_DBSHFL:
> -        check_insn(ctx, ISA_MIPS64R2);
> -        check_mips_64(ctx);
>          op2 = MASK_DBSHFL(ctx->opcode);
> -        gen_bshfl(ctx, op2, rt, rd);
> +        switch (op2) {
> +        case OPC_DALIGN ... OPC_DALIGN_END:
> +        case OPC_DBITSWAP:
> +            check_insn(ctx, ISA_MIPS32R6);
> +            decode_opc_special3_r6(env, ctx);
> +            break;
> +        default:
> +            check_insn(ctx, ISA_MIPS64R2);
> +            check_mips_64(ctx);
> +            op2 = MASK_DBSHFL(ctx->opcode);
> +            gen_bshfl(ctx, op2, rt, rd);
> +            break;
> +        }
>          break;
>  #endif
>      case OPC_RDHWR:

Besides the above nitpicks:

Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
diff mbox

Patch

diff --git a/disas/mips.c b/disas/mips.c
index 8cb9032..127a7d5 100644
--- a/disas/mips.c
+++ b/disas/mips.c
@@ -1246,6 +1246,10 @@  const struct mips_opcode mips_builtin_opcodes[] =
 {"cache",   "k,o(b)",   0x7c000025, 0xfc00003f, RD_b,                 0, I32R6},
 {"seleqz",  "d,v,t",    0x00000035, 0xfc0007ff, WR_d|RD_s|RD_t,       0, I32R6},
 {"selnez",  "d,v,t",    0x00000037, 0xfc0007ff, WR_d|RD_s|RD_t,       0, I32R6},
+{"align",   "d,v,t",    0x7c000220, 0xfc00073f, WR_d|RD_s|RD_t,       0, I32R6},
+{"dalign",  "d,v,t",    0x7c000224, 0xfc00063f, WR_d|RD_s|RD_t,       0, I64R6},
+{"bitswap", "d,w",      0x7c000020, 0xffe007ff, WR_d|RD_t,            0, I32R6},
+{"dbitswap","d,w",      0x7c000024, 0xffe007ff, WR_d|RD_t,            0, I64R6},
 {"pref",    "k,o(b)",   0xcc000000, 0xfc000000, RD_b,           	0,		I4|I32|G3	},
 {"prefx",   "h,t(b)",	0x4c00000f, 0xfc0007ff, RD_b|RD_t,		0,		I4|I33	},
 {"nop",     "",         0x00000000, 0xffffffff, 0,              	INSN2_ALIAS,	I1      }, /* sll */
diff --git a/target-mips/helper.h b/target-mips/helper.h
index 74ef094..5511dfc 100644
--- a/target-mips/helper.h
+++ b/target-mips/helper.h
@@ -39,6 +39,11 @@  DEF_HELPER_3(macchiu, tl, env, tl, tl)
 DEF_HELPER_3(msachi, tl, env, tl, tl)
 DEF_HELPER_3(msachiu, tl, env, tl, tl)
 
+DEF_HELPER_FLAGS_1(bitswap, TCG_CALL_NO_RWG_SE, tl, tl)
+#ifdef TARGET_MIPS64
+DEF_HELPER_FLAGS_1(dbitswap, TCG_CALL_NO_RWG_SE, tl, tl)
+#endif
+
 #ifndef CONFIG_USER_ONLY
 /* CP0 helpers */
 DEF_HELPER_1(mfc0_mvpcontrol, tl, env)
diff --git a/target-mips/op_helper.c b/target-mips/op_helper.c
index 4704216..34e9823 100644
--- a/target-mips/op_helper.c
+++ b/target-mips/op_helper.c
@@ -265,6 +265,30 @@  target_ulong helper_mulshiu(CPUMIPSState *env, target_ulong arg1,
                        (uint64_t)(uint32_t)arg2);
 }
 
+static inline target_ulong bitswap(target_ulong v)
+{
+    v = ((v >> 1) & (target_ulong)0x5555555555555555) |
+              ((v & (target_ulong)0x5555555555555555) << 1);
+    v = ((v >> 2) & (target_ulong)0x3333333333333333) |
+              ((v & (target_ulong)0x3333333333333333) << 2);
+    v = ((v >> 4) & (target_ulong)0x0F0F0F0F0F0F0F0F) |
+              ((v & (target_ulong)0x0F0F0F0F0F0F0F0F) << 4);
+    return v;
+}
+
+
+#ifdef TARGET_MIPS64
+target_ulong helper_dbitswap(target_ulong rt)
+{
+    return bitswap(rt);
+}
+#endif
+
+target_ulong helper_bitswap(target_ulong rt)
+{
+    return (int32_t)bitswap(rt);
+}
+
 #ifndef CONFIG_USER_ONLY
 
 static inline hwaddr do_translate_address(CPUMIPSState *env,
diff --git a/target-mips/translate.c b/target-mips/translate.c
index 363b178..270e7d3 100644
--- a/target-mips/translate.c
+++ b/target-mips/translate.c
@@ -390,17 +390,23 @@  enum {
 #define MASK_BSHFL(op)     MASK_SPECIAL3(op) | (op & (0x1F << 6))
 
 enum {
-    OPC_WSBH     = (0x02 << 6) | OPC_BSHFL,
-    OPC_SEB      = (0x10 << 6) | OPC_BSHFL,
-    OPC_SEH      = (0x18 << 6) | OPC_BSHFL,
+    OPC_WSBH      = (0x02 << 6) | OPC_BSHFL,
+    OPC_SEB       = (0x10 << 6) | OPC_BSHFL,
+    OPC_SEH       = (0x18 << 6) | OPC_BSHFL,
+    OPC_ALIGN     = (0x08 << 6) | OPC_BSHFL, /* 010.bp */
+    OPC_ALIGN_END = (0x0B << 6) | OPC_BSHFL, /* 010.00 to 010.11 */
+    OPC_BITSWAP   = (0x00 << 6) | OPC_BSHFL  /* 00000 */
 };
 
 /* DBSHFL opcodes */
 #define MASK_DBSHFL(op)    MASK_SPECIAL3(op) | (op & (0x1F << 6))
 
 enum {
-    OPC_DSBH     = (0x02 << 6) | OPC_DBSHFL,
-    OPC_DSHD     = (0x05 << 6) | OPC_DBSHFL,
+    OPC_DSBH       = (0x02 << 6) | OPC_DBSHFL,
+    OPC_DSHD       = (0x05 << 6) | OPC_DBSHFL,
+    OPC_DALIGN     = (0x08 << 6) | OPC_DBSHFL, /* 01.bp */
+    OPC_DALIGN_END = (0x0F << 6) | OPC_DBSHFL, /* 01.000 to 01.111 */
+    OPC_DBITSWAP   = (0x00 << 6) | OPC_DBSHFL, /* 00000 */
 };
 
 /* MIPS DSP REGIMM opcodes */
@@ -15111,12 +15117,14 @@  static void decode_opc_special2_legacy(CPUMIPSState *env, DisasContext *ctx)
 
 static void decode_opc_special3_r6(CPUMIPSState *env, DisasContext *ctx)
 {
-    int rs, rt;
-    uint32_t op1;
+    int rs, rt, rd, sa;
+    uint32_t op1, op2;
     int16_t imm;
 
     rs = (ctx->opcode >> 21) & 0x1f;
     rt = (ctx->opcode >> 16) & 0x1f;
+    rd = (ctx->opcode >> 11) & 0x1f;
+    sa = (ctx->opcode >> 6) & 0x1f;
     imm = (int16_t)ctx->opcode >> 7;
 
     op1 = MASK_SPECIAL3(ctx->opcode);
@@ -15137,6 +15145,44 @@  static void decode_opc_special3_r6(CPUMIPSState *env, DisasContext *ctx)
     case R6_OPC_LL:
         gen_ld(ctx, op1, rt, rs, imm);
         break;
+    case OPC_BSHFL:
+        if (rd == 0) {
+            /* Treat as NOP. */
+            break;
+        }
+        op2 = MASK_BSHFL(ctx->opcode);
+        switch (op2) {
+        case OPC_ALIGN ... OPC_ALIGN_END:
+            sa &= 3;
+            TCGv t0 = tcg_temp_new();
+            gen_load_gpr(t0, rt);
+            if (sa == 0) {
+                tcg_gen_mov_tl(cpu_gpr[rd], t0);
+            } else {
+                TCGv t1 = tcg_temp_new();
+                TCGv_i64 t2 = tcg_temp_new_i64();
+                gen_load_gpr(t1, rs);
+                tcg_gen_concat_tl_i64(t2, t1, t0);
+                tcg_gen_shri_i64(t2, t2, 8 * (4 - sa));
+#if defined(TARGET_MIPS64)
+                tcg_gen_ext32s_i64(cpu_gpr[rd], t2);
+#else
+                tcg_gen_trunc_i64_i32(cpu_gpr[rd], t2);
+#endif
+                tcg_temp_free_i64(t2);
+                tcg_temp_free(t1);
+            }
+            tcg_temp_free(t0);
+            break;
+        case OPC_BITSWAP:
+            if (rt == 0) {
+                tcg_gen_movi_tl(cpu_gpr[rd], 0);
+            } else {
+                gen_helper_bitswap(cpu_gpr[rd], cpu_gpr[rt]);
+            }
+            break;
+        }
+        break;
 #if defined(TARGET_MIPS64)
     case R6_OPC_SCD:
         gen_st_cond(ctx, op1, rt, rs, imm);
@@ -15144,6 +15190,39 @@  static void decode_opc_special3_r6(CPUMIPSState *env, DisasContext *ctx)
     case R6_OPC_LLD:
         gen_ld(ctx, op1, rt, rs, imm);
         break;
+    case OPC_DBSHFL:
+        if (rd == 0) {
+            /* Treat as NOP. */
+            break;
+        }
+        op2 = MASK_DBSHFL(ctx->opcode);
+        switch (op2) {
+        case OPC_DALIGN ... OPC_DALIGN_END:
+            sa &= 7;
+            check_mips_64(ctx);
+            TCGv t0 = tcg_temp_new();
+            gen_load_gpr(t0, rt);
+            if (sa == 0) {
+                tcg_gen_mov_tl(cpu_gpr[rd], t0);
+            } else {
+                TCGv t1 = tcg_temp_new();
+                gen_load_gpr(t1, rs);
+                tcg_gen_shli_tl(t0, t0, 8 * sa);
+                tcg_gen_shri_tl(t1, t1, 8 * (8 - sa));
+                tcg_gen_or_tl(cpu_gpr[rd], t1, t0);
+                tcg_temp_free(t1);
+            }
+            tcg_temp_free(t0);
+            break;
+        case OPC_DBITSWAP:
+            if (rt == 0) {
+                tcg_gen_movi_tl(cpu_gpr[rd], 0);
+            } else {
+                gen_helper_dbitswap(cpu_gpr[rd], cpu_gpr[rt]);
+            }
+            break;
+        }
+        break;
 #endif
     default:            /* Invalid */
         MIPS_INVAL("special3_r6");
@@ -15689,9 +15768,18 @@  static void decode_opc_special3(CPUMIPSState *env, DisasContext *ctx)
         gen_bitops(ctx, op1, rt, rs, sa, rd);
         break;
     case OPC_BSHFL:
-        check_insn(ctx, ISA_MIPS32R2);
         op2 = MASK_BSHFL(ctx->opcode);
-        gen_bshfl(ctx, op2, rt, rd);
+        switch (op2) {
+        case OPC_ALIGN ... OPC_ALIGN_END:
+        case OPC_BITSWAP:
+            check_insn(ctx, ISA_MIPS32R6);
+            decode_opc_special3_r6(env, ctx);
+            break;
+        default:
+            check_insn(ctx, ISA_MIPS32R2);
+            gen_bshfl(ctx, op2, rt, rd);
+            break;
+        }
         break;
 #if defined(TARGET_MIPS64)
     case OPC_DEXTM ... OPC_DEXT:
@@ -15701,10 +15789,20 @@  static void decode_opc_special3(CPUMIPSState *env, DisasContext *ctx)
         gen_bitops(ctx, op1, rt, rs, sa, rd);
         break;
     case OPC_DBSHFL:
-        check_insn(ctx, ISA_MIPS64R2);
-        check_mips_64(ctx);
         op2 = MASK_DBSHFL(ctx->opcode);
-        gen_bshfl(ctx, op2, rt, rd);
+        switch (op2) {
+        case OPC_DALIGN ... OPC_DALIGN_END:
+        case OPC_DBITSWAP:
+            check_insn(ctx, ISA_MIPS32R6);
+            decode_opc_special3_r6(env, ctx);
+            break;
+        default:
+            check_insn(ctx, ISA_MIPS64R2);
+            check_mips_64(ctx);
+            op2 = MASK_DBSHFL(ctx->opcode);
+            gen_bshfl(ctx, op2, rt, rd);
+            break;
+        }
         break;
 #endif
     case OPC_RDHWR: