Patchwork [v2,13/19] target-mips: implement unaligned loads using TCG

login
register
mail settings
Submitter Aurelien Jarno
Date Oct. 30, 2012, 12:12 a.m.
Message ID <1351555932-19695-14-git-send-email-aurelien@aurel32.net>
Download mbox | patch
Permalink /patch/195218/
State New
Headers show

Comments

Aurelien Jarno - Oct. 30, 2012, 12:12 a.m.
Load/store from helpers should be avoided as they are quite
inefficient. Rewrite the unaligned load instructions using TCG and
aligned loads. The number of actual load operations needed to implement
an unaligned load instruction is reduced from up to 8 to 1.

Note: As we can't rely on a shift by 32 or 64 (undefined behaviour),
the code loads constants that are already shifted by one.

Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
---
 target-mips/helper.h    |    4 --
 target-mips/op_helper.c |  142 -----------------------------------------------
 target-mips/translate.c |   75 ++++++++++++++++++++-----
 3 files changed, 62 insertions(+), 159 deletions(-)
Blue Swirl - Oct. 30, 2012, 6:59 p.m.
On Tue, Oct 30, 2012 at 12:12 AM, Aurelien Jarno <aurelien@aurel32.net> wrote:
> Load/store from helpers should be avoided as they are quite
> inefficient. Rewrite unaligned loads instructions using TCG and
> aligned loads. The number of actual loads operations to implement
> an unaligned load instruction is reduced from up to 8 to 1.

There are still other ops around the load operation. How about
implementing unaligned accesses at TCG level, then targets like x86
which don't care about alignment can implement them with normal
accesses more efficiently?

>
> Note: As we can't rely on shift by 32 or 64 undefined behaviour,
> the code loads already shift by one constants.
>
> Reviewed-by: Richard Henderson <rth@twiddle.net>
> Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
> ---
>  target-mips/helper.h    |    4 --
>  target-mips/op_helper.c |  142 -----------------------------------------------
>  target-mips/translate.c |   75 ++++++++++++++++++++-----
>  3 files changed, 62 insertions(+), 159 deletions(-)
>
> diff --git a/target-mips/helper.h b/target-mips/helper.h
> index 210960f..0e38cdd 100644
> --- a/target-mips/helper.h
> +++ b/target-mips/helper.h
> @@ -4,13 +4,9 @@ DEF_HELPER_3(raise_exception_err, noreturn, env, i32, int)
>  DEF_HELPER_2(raise_exception, noreturn, env, i32)
>
>  #ifdef TARGET_MIPS64
> -DEF_HELPER_4(ldl, tl, env, tl, tl, int)
> -DEF_HELPER_4(ldr, tl, env, tl, tl, int)
>  DEF_HELPER_4(sdl, void, env, tl, tl, int)
>  DEF_HELPER_4(sdr, void, env, tl, tl, int)
>  #endif
> -DEF_HELPER_4(lwl, tl, env, tl, tl, int)
> -DEF_HELPER_4(lwr, tl, env, tl, tl, int)
>  DEF_HELPER_4(swl, void, env, tl, tl, int)
>  DEF_HELPER_4(swr, void, env, tl, tl, int)
>
> diff --git a/target-mips/op_helper.c b/target-mips/op_helper.c
> index 78497d9..773c710 100644
> --- a/target-mips/op_helper.c
> +++ b/target-mips/op_helper.c
> @@ -350,56 +350,6 @@ HELPER_ST_ATOMIC(scd, ld, sd, 0x7)
>  #define GET_OFFSET(addr, offset) (addr - (offset))
>  #endif
>
> -target_ulong helper_lwl(CPUMIPSState *env, target_ulong arg1,
> -                        target_ulong arg2, int mem_idx)
> -{
> -    target_ulong tmp;
> -
> -    tmp = do_lbu(env, arg2, mem_idx);
> -    arg1 = (arg1 & 0x00FFFFFF) | (tmp << 24);
> -
> -    if (GET_LMASK(arg2) <= 2) {
> -        tmp = do_lbu(env, GET_OFFSET(arg2, 1), mem_idx);
> -        arg1 = (arg1 & 0xFF00FFFF) | (tmp << 16);
> -    }
> -
> -    if (GET_LMASK(arg2) <= 1) {
> -        tmp = do_lbu(env, GET_OFFSET(arg2, 2), mem_idx);
> -        arg1 = (arg1 & 0xFFFF00FF) | (tmp << 8);
> -    }
> -
> -    if (GET_LMASK(arg2) == 0) {
> -        tmp = do_lbu(env, GET_OFFSET(arg2, 3), mem_idx);
> -        arg1 = (arg1 & 0xFFFFFF00) | tmp;
> -    }
> -    return (int32_t)arg1;
> -}
> -
> -target_ulong helper_lwr(CPUMIPSState *env, target_ulong arg1,
> -                        target_ulong arg2, int mem_idx)
> -{
> -    target_ulong tmp;
> -
> -    tmp = do_lbu(env, arg2, mem_idx);
> -    arg1 = (arg1 & 0xFFFFFF00) | tmp;
> -
> -    if (GET_LMASK(arg2) >= 1) {
> -        tmp = do_lbu(env, GET_OFFSET(arg2, -1), mem_idx);
> -        arg1 = (arg1 & 0xFFFF00FF) | (tmp << 8);
> -    }
> -
> -    if (GET_LMASK(arg2) >= 2) {
> -        tmp = do_lbu(env, GET_OFFSET(arg2, -2), mem_idx);
> -        arg1 = (arg1 & 0xFF00FFFF) | (tmp << 16);
> -    }
> -
> -    if (GET_LMASK(arg2) == 3) {
> -        tmp = do_lbu(env, GET_OFFSET(arg2, -3), mem_idx);
> -        arg1 = (arg1 & 0x00FFFFFF) | (tmp << 24);
> -    }
> -    return (int32_t)arg1;
> -}
> -
>  void helper_swl(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
>                  int mem_idx)
>  {
> @@ -440,98 +390,6 @@ void helper_swr(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
>  #define GET_LMASK64(v) (((v) & 7) ^ 7)
>  #endif
>
> -target_ulong helper_ldl(CPUMIPSState *env, target_ulong arg1,
> -                        target_ulong arg2, int mem_idx)
> -{
> -    uint64_t tmp;
> -
> -    tmp = do_lbu(env, arg2, mem_idx);
> -    arg1 = (arg1 & 0x00FFFFFFFFFFFFFFULL) | (tmp << 56);
> -
> -    if (GET_LMASK64(arg2) <= 6) {
> -        tmp = do_lbu(env, GET_OFFSET(arg2, 1), mem_idx);
> -        arg1 = (arg1 & 0xFF00FFFFFFFFFFFFULL) | (tmp << 48);
> -    }
> -
> -    if (GET_LMASK64(arg2) <= 5) {
> -        tmp = do_lbu(env, GET_OFFSET(arg2, 2), mem_idx);
> -        arg1 = (arg1 & 0xFFFF00FFFFFFFFFFULL) | (tmp << 40);
> -    }
> -
> -    if (GET_LMASK64(arg2) <= 4) {
> -        tmp = do_lbu(env, GET_OFFSET(arg2, 3), mem_idx);
> -        arg1 = (arg1 & 0xFFFFFF00FFFFFFFFULL) | (tmp << 32);
> -    }
> -
> -    if (GET_LMASK64(arg2) <= 3) {
> -        tmp = do_lbu(env, GET_OFFSET(arg2, 4), mem_idx);
> -        arg1 = (arg1 & 0xFFFFFFFF00FFFFFFULL) | (tmp << 24);
> -    }
> -
> -    if (GET_LMASK64(arg2) <= 2) {
> -        tmp = do_lbu(env, GET_OFFSET(arg2, 5), mem_idx);
> -        arg1 = (arg1 & 0xFFFFFFFFFF00FFFFULL) | (tmp << 16);
> -    }
> -
> -    if (GET_LMASK64(arg2) <= 1) {
> -        tmp = do_lbu(env, GET_OFFSET(arg2, 6), mem_idx);
> -        arg1 = (arg1 & 0xFFFFFFFFFFFF00FFULL) | (tmp << 8);
> -    }
> -
> -    if (GET_LMASK64(arg2) == 0) {
> -        tmp = do_lbu(env, GET_OFFSET(arg2, 7), mem_idx);
> -        arg1 = (arg1 & 0xFFFFFFFFFFFFFF00ULL) | tmp;
> -    }
> -
> -    return arg1;
> -}
> -
> -target_ulong helper_ldr(CPUMIPSState *env, target_ulong arg1,
> -                        target_ulong arg2, int mem_idx)
> -{
> -    uint64_t tmp;
> -
> -    tmp = do_lbu(env, arg2, mem_idx);
> -    arg1 = (arg1 & 0xFFFFFFFFFFFFFF00ULL) | tmp;
> -
> -    if (GET_LMASK64(arg2) >= 1) {
> -        tmp = do_lbu(env, GET_OFFSET(arg2, -1), mem_idx);
> -        arg1 = (arg1 & 0xFFFFFFFFFFFF00FFULL) | (tmp  << 8);
> -    }
> -
> -    if (GET_LMASK64(arg2) >= 2) {
> -        tmp = do_lbu(env, GET_OFFSET(arg2, -2), mem_idx);
> -        arg1 = (arg1 & 0xFFFFFFFFFF00FFFFULL) | (tmp << 16);
> -    }
> -
> -    if (GET_LMASK64(arg2) >= 3) {
> -        tmp = do_lbu(env, GET_OFFSET(arg2, -3), mem_idx);
> -        arg1 = (arg1 & 0xFFFFFFFF00FFFFFFULL) | (tmp << 24);
> -    }
> -
> -    if (GET_LMASK64(arg2) >= 4) {
> -        tmp = do_lbu(env, GET_OFFSET(arg2, -4), mem_idx);
> -        arg1 = (arg1 & 0xFFFFFF00FFFFFFFFULL) | (tmp << 32);
> -    }
> -
> -    if (GET_LMASK64(arg2) >= 5) {
> -        tmp = do_lbu(env, GET_OFFSET(arg2, -5), mem_idx);
> -        arg1 = (arg1 & 0xFFFF00FFFFFFFFFFULL) | (tmp << 40);
> -    }
> -
> -    if (GET_LMASK64(arg2) >= 6) {
> -        tmp = do_lbu(env, GET_OFFSET(arg2, -6), mem_idx);
> -        arg1 = (arg1 & 0xFF00FFFFFFFFFFFFULL) | (tmp << 48);
> -    }
> -
> -    if (GET_LMASK64(arg2) == 7) {
> -        tmp = do_lbu(env, GET_OFFSET(arg2, -7), mem_idx);
> -        arg1 = (arg1 & 0x00FFFFFFFFFFFFFFULL) | (tmp << 56);
> -    }
> -
> -    return arg1;
> -}
> -
>  void helper_sdl(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
>                  int mem_idx)
>  {
> diff --git a/target-mips/translate.c b/target-mips/translate.c
> index c46129d..b385923 100644
> --- a/target-mips/translate.c
> +++ b/target-mips/translate.c
> @@ -1125,7 +1125,7 @@ static void gen_ld (CPUMIPSState *env, DisasContext *ctx, uint32_t opc,
>                      int rt, int base, int16_t offset)
>  {
>      const char *opn = "ld";
> -    TCGv t0, t1;
> +    TCGv t0, t1, t2;
>
>      if (rt == 0 && env->insn_flags & (INSN_LOONGSON2E | INSN_LOONGSON2F)) {
>          /* Loongson CPU uses a load to zero register for prefetch.
> @@ -1157,21 +1157,45 @@ static void gen_ld (CPUMIPSState *env, DisasContext *ctx, uint32_t opc,
>          opn = "lld";
>          break;
>      case OPC_LDL:
> -        save_cpu_state(ctx, 1);
>          t1 = tcg_temp_new();
> +        tcg_gen_andi_tl(t1, t0, 7);
> +#ifndef TARGET_WORDS_BIGENDIAN
> +        tcg_gen_xori_tl(t1, t1, 7);
> +#endif
> +        tcg_gen_shli_tl(t1, t1, 3);
> +        tcg_gen_andi_tl(t0, t0, ~7);
> +        tcg_gen_qemu_ld64(t0, t0, ctx->mem_idx);
> +        tcg_gen_shl_tl(t0, t0, t1);
> +        tcg_gen_xori_tl(t1, t1, 63);
> +        t2 = tcg_const_tl(0x7fffffffffffffffull);
> +        tcg_gen_shr_tl(t2, t2, t1);
>          gen_load_gpr(t1, rt);
> -        gen_helper_1e2i(ldl, t1, t1, t0, ctx->mem_idx);
> -        gen_store_gpr(t1, rt);
> +        tcg_gen_and_tl(t1, t1, t2);
> +        tcg_temp_free(t2);
> +        tcg_gen_or_tl(t0, t0, t1);
>          tcg_temp_free(t1);
> +        gen_store_gpr(t0, rt);
>          opn = "ldl";
>          break;
>      case OPC_LDR:
> -        save_cpu_state(ctx, 1);
>          t1 = tcg_temp_new();
> +        tcg_gen_andi_tl(t1, t0, 7);
> +#ifdef TARGET_WORDS_BIGENDIAN
> +        tcg_gen_xori_tl(t1, t1, 7);
> +#endif
> +        tcg_gen_shli_tl(t1, t1, 3);
> +        tcg_gen_andi_tl(t0, t0, ~7);
> +        tcg_gen_qemu_ld64(t0, t0, ctx->mem_idx);
> +        tcg_gen_shr_tl(t0, t0, t1);
> +        tcg_gen_xori_tl(t1, t1, 63);
> +        t2 = tcg_const_tl(0xfffffffffffffffeull);
> +        tcg_gen_shl_tl(t2, t2, t1);
>          gen_load_gpr(t1, rt);
> -        gen_helper_1e2i(ldr, t1, t1, t0, ctx->mem_idx);
> -        gen_store_gpr(t1, rt);
> +        tcg_gen_and_tl(t1, t1, t2);
> +        tcg_temp_free(t2);
> +        tcg_gen_or_tl(t0, t0, t1);
>          tcg_temp_free(t1);
> +        gen_store_gpr(t0, rt);
>          opn = "ldr";
>          break;
>      case OPC_LDPC:
> @@ -1217,21 +1241,46 @@ static void gen_ld (CPUMIPSState *env, DisasContext *ctx, uint32_t opc,
>          opn = "lbu";
>          break;
>      case OPC_LWL:
> -        save_cpu_state(ctx, 1);
>          t1 = tcg_temp_new();
> +        tcg_gen_andi_tl(t1, t0, 3);
> +#ifndef TARGET_WORDS_BIGENDIAN
> +        tcg_gen_xori_tl(t1, t1, 3);
> +#endif
> +        tcg_gen_shli_tl(t1, t1, 3);
> +        tcg_gen_andi_tl(t0, t0, ~3);
> +        tcg_gen_qemu_ld32u(t0, t0, ctx->mem_idx);
> +        tcg_gen_shl_tl(t0, t0, t1);
> +        tcg_gen_xori_tl(t1, t1, 31);
> +        t2 = tcg_const_tl(0x7fffffffull);
> +        tcg_gen_shr_tl(t2, t2, t1);
>          gen_load_gpr(t1, rt);
> -        gen_helper_1e2i(lwl, t1, t1, t0, ctx->mem_idx);
> -        gen_store_gpr(t1, rt);
> +        tcg_gen_and_tl(t1, t1, t2);
> +        tcg_temp_free(t2);
> +        tcg_gen_or_tl(t0, t0, t1);
>          tcg_temp_free(t1);
> +        tcg_gen_ext32s_tl(t0, t0);
> +        gen_store_gpr(t0, rt);
>          opn = "lwl";
>          break;
>      case OPC_LWR:
> -        save_cpu_state(ctx, 1);
>          t1 = tcg_temp_new();
> +        tcg_gen_andi_tl(t1, t0, 3);
> +#ifdef TARGET_WORDS_BIGENDIAN
> +        tcg_gen_xori_tl(t1, t1, 3);
> +#endif
> +        tcg_gen_shli_tl(t1, t1, 3);
> +        tcg_gen_andi_tl(t0, t0, ~3);
> +        tcg_gen_qemu_ld32u(t0, t0, ctx->mem_idx);
> +        tcg_gen_shr_tl(t0, t0, t1);
> +        tcg_gen_xori_tl(t1, t1, 31);
> +        t2 = tcg_const_tl(0xfffffffeull);
> +        tcg_gen_shl_tl(t2, t2, t1);
>          gen_load_gpr(t1, rt);
> -        gen_helper_1e2i(lwr, t1, t1, t0, ctx->mem_idx);
> -        gen_store_gpr(t1, rt);
> +        tcg_gen_and_tl(t1, t1, t2);
> +        tcg_temp_free(t2);
> +        tcg_gen_or_tl(t0, t0, t1);
>          tcg_temp_free(t1);
> +        gen_store_gpr(t0, rt);
>          opn = "lwr";
>          break;
>      case OPC_LL:
> --
> 1.7.10.4
>
>
Aurelien Jarno - Oct. 30, 2012, 8 p.m.
On Tue, Oct 30, 2012 at 06:59:36PM +0000, Blue Swirl wrote:
> On Tue, Oct 30, 2012 at 12:12 AM, Aurelien Jarno <aurelien@aurel32.net> wrote:
> > Load/store from helpers should be avoided as they are quite
> > inefficient. Rewrite unaligned loads instructions using TCG and
> > aligned loads. The number of actual loads operations to implement
> > an unaligned load instruction is reduced from up to 8 to 1.
> 
> There are still other ops around the load operation. How about
> implementing unaligned accesses at TCG level, then targets like x86
> which don't care about alignment can implement them with normal
> accesses more efficiently?

Well, maybe the name "unaligned load instructions" is misleading. These
instructions actually do not do any unaligned access; instead they
merge the value from memory into the left (LWL, LDL) or right (LWR, LDR)
part of the register, and the number of merged bytes depends on the actual
alignment of the address. That way a combination of LWL + LWR or LDL +
LDR instructions provides an effective unaligned access. That's why there
are still ops around the actual load, for merging the value into the
register.

If you want to reduce the number of ops around the load, the way to go
is to add a deposit op that takes registers for ofs and len. Not sure
it's worthwhile here.

> >
> > Note: As we can't rely on shift by 32 or 64 undefined behaviour,
> > the code loads already shift by one constants.
> >
> > Reviewed-by: Richard Henderson <rth@twiddle.net>
> > Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
> > ---
> >  target-mips/helper.h    |    4 --
> >  target-mips/op_helper.c |  142 -----------------------------------------------
> >  target-mips/translate.c |   75 ++++++++++++++++++++-----
> >  3 files changed, 62 insertions(+), 159 deletions(-)
> >
> > diff --git a/target-mips/helper.h b/target-mips/helper.h
> > index 210960f..0e38cdd 100644
> > --- a/target-mips/helper.h
> > +++ b/target-mips/helper.h
> > @@ -4,13 +4,9 @@ DEF_HELPER_3(raise_exception_err, noreturn, env, i32, int)
> >  DEF_HELPER_2(raise_exception, noreturn, env, i32)
> >
> >  #ifdef TARGET_MIPS64
> > -DEF_HELPER_4(ldl, tl, env, tl, tl, int)
> > -DEF_HELPER_4(ldr, tl, env, tl, tl, int)
> >  DEF_HELPER_4(sdl, void, env, tl, tl, int)
> >  DEF_HELPER_4(sdr, void, env, tl, tl, int)
> >  #endif
> > -DEF_HELPER_4(lwl, tl, env, tl, tl, int)
> > -DEF_HELPER_4(lwr, tl, env, tl, tl, int)
> >  DEF_HELPER_4(swl, void, env, tl, tl, int)
> >  DEF_HELPER_4(swr, void, env, tl, tl, int)
> >
> > diff --git a/target-mips/op_helper.c b/target-mips/op_helper.c
> > index 78497d9..773c710 100644
> > --- a/target-mips/op_helper.c
> > +++ b/target-mips/op_helper.c
> > @@ -350,56 +350,6 @@ HELPER_ST_ATOMIC(scd, ld, sd, 0x7)
> >  #define GET_OFFSET(addr, offset) (addr - (offset))
> >  #endif
> >
> > -target_ulong helper_lwl(CPUMIPSState *env, target_ulong arg1,
> > -                        target_ulong arg2, int mem_idx)
> > -{
> > -    target_ulong tmp;
> > -
> > -    tmp = do_lbu(env, arg2, mem_idx);
> > -    arg1 = (arg1 & 0x00FFFFFF) | (tmp << 24);
> > -
> > -    if (GET_LMASK(arg2) <= 2) {
> > -        tmp = do_lbu(env, GET_OFFSET(arg2, 1), mem_idx);
> > -        arg1 = (arg1 & 0xFF00FFFF) | (tmp << 16);
> > -    }
> > -
> > -    if (GET_LMASK(arg2) <= 1) {
> > -        tmp = do_lbu(env, GET_OFFSET(arg2, 2), mem_idx);
> > -        arg1 = (arg1 & 0xFFFF00FF) | (tmp << 8);
> > -    }
> > -
> > -    if (GET_LMASK(arg2) == 0) {
> > -        tmp = do_lbu(env, GET_OFFSET(arg2, 3), mem_idx);
> > -        arg1 = (arg1 & 0xFFFFFF00) | tmp;
> > -    }
> > -    return (int32_t)arg1;
> > -}
> > -
> > -target_ulong helper_lwr(CPUMIPSState *env, target_ulong arg1,
> > -                        target_ulong arg2, int mem_idx)
> > -{
> > -    target_ulong tmp;
> > -
> > -    tmp = do_lbu(env, arg2, mem_idx);
> > -    arg1 = (arg1 & 0xFFFFFF00) | tmp;
> > -
> > -    if (GET_LMASK(arg2) >= 1) {
> > -        tmp = do_lbu(env, GET_OFFSET(arg2, -1), mem_idx);
> > -        arg1 = (arg1 & 0xFFFF00FF) | (tmp << 8);
> > -    }
> > -
> > -    if (GET_LMASK(arg2) >= 2) {
> > -        tmp = do_lbu(env, GET_OFFSET(arg2, -2), mem_idx);
> > -        arg1 = (arg1 & 0xFF00FFFF) | (tmp << 16);
> > -    }
> > -
> > -    if (GET_LMASK(arg2) == 3) {
> > -        tmp = do_lbu(env, GET_OFFSET(arg2, -3), mem_idx);
> > -        arg1 = (arg1 & 0x00FFFFFF) | (tmp << 24);
> > -    }
> > -    return (int32_t)arg1;
> > -}
> > -
> >  void helper_swl(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
> >                  int mem_idx)
> >  {
> > @@ -440,98 +390,6 @@ void helper_swr(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
> >  #define GET_LMASK64(v) (((v) & 7) ^ 7)
> >  #endif
> >
> > -target_ulong helper_ldl(CPUMIPSState *env, target_ulong arg1,
> > -                        target_ulong arg2, int mem_idx)
> > -{
> > -    uint64_t tmp;
> > -
> > -    tmp = do_lbu(env, arg2, mem_idx);
> > -    arg1 = (arg1 & 0x00FFFFFFFFFFFFFFULL) | (tmp << 56);
> > -
> > -    if (GET_LMASK64(arg2) <= 6) {
> > -        tmp = do_lbu(env, GET_OFFSET(arg2, 1), mem_idx);
> > -        arg1 = (arg1 & 0xFF00FFFFFFFFFFFFULL) | (tmp << 48);
> > -    }
> > -
> > -    if (GET_LMASK64(arg2) <= 5) {
> > -        tmp = do_lbu(env, GET_OFFSET(arg2, 2), mem_idx);
> > -        arg1 = (arg1 & 0xFFFF00FFFFFFFFFFULL) | (tmp << 40);
> > -    }
> > -
> > -    if (GET_LMASK64(arg2) <= 4) {
> > -        tmp = do_lbu(env, GET_OFFSET(arg2, 3), mem_idx);
> > -        arg1 = (arg1 & 0xFFFFFF00FFFFFFFFULL) | (tmp << 32);
> > -    }
> > -
> > -    if (GET_LMASK64(arg2) <= 3) {
> > -        tmp = do_lbu(env, GET_OFFSET(arg2, 4), mem_idx);
> > -        arg1 = (arg1 & 0xFFFFFFFF00FFFFFFULL) | (tmp << 24);
> > -    }
> > -
> > -    if (GET_LMASK64(arg2) <= 2) {
> > -        tmp = do_lbu(env, GET_OFFSET(arg2, 5), mem_idx);
> > -        arg1 = (arg1 & 0xFFFFFFFFFF00FFFFULL) | (tmp << 16);
> > -    }
> > -
> > -    if (GET_LMASK64(arg2) <= 1) {
> > -        tmp = do_lbu(env, GET_OFFSET(arg2, 6), mem_idx);
> > -        arg1 = (arg1 & 0xFFFFFFFFFFFF00FFULL) | (tmp << 8);
> > -    }
> > -
> > -    if (GET_LMASK64(arg2) == 0) {
> > -        tmp = do_lbu(env, GET_OFFSET(arg2, 7), mem_idx);
> > -        arg1 = (arg1 & 0xFFFFFFFFFFFFFF00ULL) | tmp;
> > -    }
> > -
> > -    return arg1;
> > -}
> > -
> > -target_ulong helper_ldr(CPUMIPSState *env, target_ulong arg1,
> > -                        target_ulong arg2, int mem_idx)
> > -{
> > -    uint64_t tmp;
> > -
> > -    tmp = do_lbu(env, arg2, mem_idx);
> > -    arg1 = (arg1 & 0xFFFFFFFFFFFFFF00ULL) | tmp;
> > -
> > -    if (GET_LMASK64(arg2) >= 1) {
> > -        tmp = do_lbu(env, GET_OFFSET(arg2, -1), mem_idx);
> > -        arg1 = (arg1 & 0xFFFFFFFFFFFF00FFULL) | (tmp  << 8);
> > -    }
> > -
> > -    if (GET_LMASK64(arg2) >= 2) {
> > -        tmp = do_lbu(env, GET_OFFSET(arg2, -2), mem_idx);
> > -        arg1 = (arg1 & 0xFFFFFFFFFF00FFFFULL) | (tmp << 16);
> > -    }
> > -
> > -    if (GET_LMASK64(arg2) >= 3) {
> > -        tmp = do_lbu(env, GET_OFFSET(arg2, -3), mem_idx);
> > -        arg1 = (arg1 & 0xFFFFFFFF00FFFFFFULL) | (tmp << 24);
> > -    }
> > -
> > -    if (GET_LMASK64(arg2) >= 4) {
> > -        tmp = do_lbu(env, GET_OFFSET(arg2, -4), mem_idx);
> > -        arg1 = (arg1 & 0xFFFFFF00FFFFFFFFULL) | (tmp << 32);
> > -    }
> > -
> > -    if (GET_LMASK64(arg2) >= 5) {
> > -        tmp = do_lbu(env, GET_OFFSET(arg2, -5), mem_idx);
> > -        arg1 = (arg1 & 0xFFFF00FFFFFFFFFFULL) | (tmp << 40);
> > -    }
> > -
> > -    if (GET_LMASK64(arg2) >= 6) {
> > -        tmp = do_lbu(env, GET_OFFSET(arg2, -6), mem_idx);
> > -        arg1 = (arg1 & 0xFF00FFFFFFFFFFFFULL) | (tmp << 48);
> > -    }
> > -
> > -    if (GET_LMASK64(arg2) == 7) {
> > -        tmp = do_lbu(env, GET_OFFSET(arg2, -7), mem_idx);
> > -        arg1 = (arg1 & 0x00FFFFFFFFFFFFFFULL) | (tmp << 56);
> > -    }
> > -
> > -    return arg1;
> > -}
> > -
> >  void helper_sdl(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
> >                  int mem_idx)
> >  {
> > diff --git a/target-mips/translate.c b/target-mips/translate.c
> > index c46129d..b385923 100644
> > --- a/target-mips/translate.c
> > +++ b/target-mips/translate.c
> > @@ -1125,7 +1125,7 @@ static void gen_ld (CPUMIPSState *env, DisasContext *ctx, uint32_t opc,
> >                      int rt, int base, int16_t offset)
> >  {
> >      const char *opn = "ld";
> > -    TCGv t0, t1;
> > +    TCGv t0, t1, t2;
> >
> >      if (rt == 0 && env->insn_flags & (INSN_LOONGSON2E | INSN_LOONGSON2F)) {
> >          /* Loongson CPU uses a load to zero register for prefetch.
> > @@ -1157,21 +1157,45 @@ static void gen_ld (CPUMIPSState *env, DisasContext *ctx, uint32_t opc,
> >          opn = "lld";
> >          break;
> >      case OPC_LDL:
> > -        save_cpu_state(ctx, 1);
> >          t1 = tcg_temp_new();
> > +        tcg_gen_andi_tl(t1, t0, 7);
> > +#ifndef TARGET_WORDS_BIGENDIAN
> > +        tcg_gen_xori_tl(t1, t1, 7);
> > +#endif
> > +        tcg_gen_shli_tl(t1, t1, 3);
> > +        tcg_gen_andi_tl(t0, t0, ~7);
> > +        tcg_gen_qemu_ld64(t0, t0, ctx->mem_idx);
> > +        tcg_gen_shl_tl(t0, t0, t1);
> > +        tcg_gen_xori_tl(t1, t1, 63);
> > +        t2 = tcg_const_tl(0x7fffffffffffffffull);
> > +        tcg_gen_shr_tl(t2, t2, t1);
> >          gen_load_gpr(t1, rt);
> > -        gen_helper_1e2i(ldl, t1, t1, t0, ctx->mem_idx);
> > -        gen_store_gpr(t1, rt);
> > +        tcg_gen_and_tl(t1, t1, t2);
> > +        tcg_temp_free(t2);
> > +        tcg_gen_or_tl(t0, t0, t1);
> >          tcg_temp_free(t1);
> > +        gen_store_gpr(t0, rt);
> >          opn = "ldl";
> >          break;
> >      case OPC_LDR:
> > -        save_cpu_state(ctx, 1);
> >          t1 = tcg_temp_new();
> > +        tcg_gen_andi_tl(t1, t0, 7);
> > +#ifdef TARGET_WORDS_BIGENDIAN
> > +        tcg_gen_xori_tl(t1, t1, 7);
> > +#endif
> > +        tcg_gen_shli_tl(t1, t1, 3);
> > +        tcg_gen_andi_tl(t0, t0, ~7);
> > +        tcg_gen_qemu_ld64(t0, t0, ctx->mem_idx);
> > +        tcg_gen_shr_tl(t0, t0, t1);
> > +        tcg_gen_xori_tl(t1, t1, 63);
> > +        t2 = tcg_const_tl(0xfffffffffffffffeull);
> > +        tcg_gen_shl_tl(t2, t2, t1);
> >          gen_load_gpr(t1, rt);
> > -        gen_helper_1e2i(ldr, t1, t1, t0, ctx->mem_idx);
> > -        gen_store_gpr(t1, rt);
> > +        tcg_gen_and_tl(t1, t1, t2);
> > +        tcg_temp_free(t2);
> > +        tcg_gen_or_tl(t0, t0, t1);
> >          tcg_temp_free(t1);
> > +        gen_store_gpr(t0, rt);
> >          opn = "ldr";
> >          break;
> >      case OPC_LDPC:
> > @@ -1217,21 +1241,46 @@ static void gen_ld (CPUMIPSState *env, DisasContext *ctx, uint32_t opc,
> >          opn = "lbu";
> >          break;
> >      case OPC_LWL:
> > -        save_cpu_state(ctx, 1);
> >          t1 = tcg_temp_new();
> > +        tcg_gen_andi_tl(t1, t0, 3);
> > +#ifndef TARGET_WORDS_BIGENDIAN
> > +        tcg_gen_xori_tl(t1, t1, 3);
> > +#endif
> > +        tcg_gen_shli_tl(t1, t1, 3);
> > +        tcg_gen_andi_tl(t0, t0, ~3);
> > +        tcg_gen_qemu_ld32u(t0, t0, ctx->mem_idx);
> > +        tcg_gen_shl_tl(t0, t0, t1);
> > +        tcg_gen_xori_tl(t1, t1, 31);
> > +        t2 = tcg_const_tl(0x7fffffffull);
> > +        tcg_gen_shr_tl(t2, t2, t1);
> >          gen_load_gpr(t1, rt);
> > -        gen_helper_1e2i(lwl, t1, t1, t0, ctx->mem_idx);
> > -        gen_store_gpr(t1, rt);
> > +        tcg_gen_and_tl(t1, t1, t2);
> > +        tcg_temp_free(t2);
> > +        tcg_gen_or_tl(t0, t0, t1);
> >          tcg_temp_free(t1);
> > +        tcg_gen_ext32s_tl(t0, t0);
> > +        gen_store_gpr(t0, rt);
> >          opn = "lwl";
> >          break;
> >      case OPC_LWR:
> > -        save_cpu_state(ctx, 1);
> >          t1 = tcg_temp_new();
> > +        tcg_gen_andi_tl(t1, t0, 3);
> > +#ifdef TARGET_WORDS_BIGENDIAN
> > +        tcg_gen_xori_tl(t1, t1, 3);
> > +#endif
> > +        tcg_gen_shli_tl(t1, t1, 3);
> > +        tcg_gen_andi_tl(t0, t0, ~3);
> > +        tcg_gen_qemu_ld32u(t0, t0, ctx->mem_idx);
> > +        tcg_gen_shr_tl(t0, t0, t1);
> > +        tcg_gen_xori_tl(t1, t1, 31);
> > +        t2 = tcg_const_tl(0xfffffffeull);
> > +        tcg_gen_shl_tl(t2, t2, t1);
> >          gen_load_gpr(t1, rt);
> > -        gen_helper_1e2i(lwr, t1, t1, t0, ctx->mem_idx);
> > -        gen_store_gpr(t1, rt);
> > +        tcg_gen_and_tl(t1, t1, t2);
> > +        tcg_temp_free(t2);
> > +        tcg_gen_or_tl(t0, t0, t1);
> >          tcg_temp_free(t1);
> > +        gen_store_gpr(t0, rt);
> >          opn = "lwr";
> >          break;
> >      case OPC_LL:
> > --
> > 1.7.10.4
> >
> >
>

Patch

diff --git a/target-mips/helper.h b/target-mips/helper.h
index 210960f..0e38cdd 100644
--- a/target-mips/helper.h
+++ b/target-mips/helper.h
@@ -4,13 +4,9 @@  DEF_HELPER_3(raise_exception_err, noreturn, env, i32, int)
 DEF_HELPER_2(raise_exception, noreturn, env, i32)
 
 #ifdef TARGET_MIPS64
-DEF_HELPER_4(ldl, tl, env, tl, tl, int)
-DEF_HELPER_4(ldr, tl, env, tl, tl, int)
 DEF_HELPER_4(sdl, void, env, tl, tl, int)
 DEF_HELPER_4(sdr, void, env, tl, tl, int)
 #endif
-DEF_HELPER_4(lwl, tl, env, tl, tl, int)
-DEF_HELPER_4(lwr, tl, env, tl, tl, int)
 DEF_HELPER_4(swl, void, env, tl, tl, int)
 DEF_HELPER_4(swr, void, env, tl, tl, int)
 
diff --git a/target-mips/op_helper.c b/target-mips/op_helper.c
index 78497d9..773c710 100644
--- a/target-mips/op_helper.c
+++ b/target-mips/op_helper.c
@@ -350,56 +350,6 @@  HELPER_ST_ATOMIC(scd, ld, sd, 0x7)
 #define GET_OFFSET(addr, offset) (addr - (offset))
 #endif
 
-target_ulong helper_lwl(CPUMIPSState *env, target_ulong arg1,
-                        target_ulong arg2, int mem_idx)
-{
-    target_ulong tmp;
-
-    tmp = do_lbu(env, arg2, mem_idx);
-    arg1 = (arg1 & 0x00FFFFFF) | (tmp << 24);
-
-    if (GET_LMASK(arg2) <= 2) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, 1), mem_idx);
-        arg1 = (arg1 & 0xFF00FFFF) | (tmp << 16);
-    }
-
-    if (GET_LMASK(arg2) <= 1) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, 2), mem_idx);
-        arg1 = (arg1 & 0xFFFF00FF) | (tmp << 8);
-    }
-
-    if (GET_LMASK(arg2) == 0) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, 3), mem_idx);
-        arg1 = (arg1 & 0xFFFFFF00) | tmp;
-    }
-    return (int32_t)arg1;
-}
-
-target_ulong helper_lwr(CPUMIPSState *env, target_ulong arg1,
-                        target_ulong arg2, int mem_idx)
-{
-    target_ulong tmp;
-
-    tmp = do_lbu(env, arg2, mem_idx);
-    arg1 = (arg1 & 0xFFFFFF00) | tmp;
-
-    if (GET_LMASK(arg2) >= 1) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, -1), mem_idx);
-        arg1 = (arg1 & 0xFFFF00FF) | (tmp << 8);
-    }
-
-    if (GET_LMASK(arg2) >= 2) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, -2), mem_idx);
-        arg1 = (arg1 & 0xFF00FFFF) | (tmp << 16);
-    }
-
-    if (GET_LMASK(arg2) == 3) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, -3), mem_idx);
-        arg1 = (arg1 & 0x00FFFFFF) | (tmp << 24);
-    }
-    return (int32_t)arg1;
-}
-
 void helper_swl(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
                 int mem_idx)
 {
@@ -440,98 +390,6 @@  void helper_swr(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
 #define GET_LMASK64(v) (((v) & 7) ^ 7)
 #endif
 
-target_ulong helper_ldl(CPUMIPSState *env, target_ulong arg1,
-                        target_ulong arg2, int mem_idx)
-{
-    uint64_t tmp;
-
-    tmp = do_lbu(env, arg2, mem_idx);
-    arg1 = (arg1 & 0x00FFFFFFFFFFFFFFULL) | (tmp << 56);
-
-    if (GET_LMASK64(arg2) <= 6) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, 1), mem_idx);
-        arg1 = (arg1 & 0xFF00FFFFFFFFFFFFULL) | (tmp << 48);
-    }
-
-    if (GET_LMASK64(arg2) <= 5) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, 2), mem_idx);
-        arg1 = (arg1 & 0xFFFF00FFFFFFFFFFULL) | (tmp << 40);
-    }
-
-    if (GET_LMASK64(arg2) <= 4) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, 3), mem_idx);
-        arg1 = (arg1 & 0xFFFFFF00FFFFFFFFULL) | (tmp << 32);
-    }
-
-    if (GET_LMASK64(arg2) <= 3) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, 4), mem_idx);
-        arg1 = (arg1 & 0xFFFFFFFF00FFFFFFULL) | (tmp << 24);
-    }
-
-    if (GET_LMASK64(arg2) <= 2) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, 5), mem_idx);
-        arg1 = (arg1 & 0xFFFFFFFFFF00FFFFULL) | (tmp << 16);
-    }
-
-    if (GET_LMASK64(arg2) <= 1) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, 6), mem_idx);
-        arg1 = (arg1 & 0xFFFFFFFFFFFF00FFULL) | (tmp << 8);
-    }
-
-    if (GET_LMASK64(arg2) == 0) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, 7), mem_idx);
-        arg1 = (arg1 & 0xFFFFFFFFFFFFFF00ULL) | tmp;
-    }
-
-    return arg1;
-}
-
-target_ulong helper_ldr(CPUMIPSState *env, target_ulong arg1,
-                        target_ulong arg2, int mem_idx)
-{
-    uint64_t tmp;
-
-    tmp = do_lbu(env, arg2, mem_idx);
-    arg1 = (arg1 & 0xFFFFFFFFFFFFFF00ULL) | tmp;
-
-    if (GET_LMASK64(arg2) >= 1) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, -1), mem_idx);
-        arg1 = (arg1 & 0xFFFFFFFFFFFF00FFULL) | (tmp  << 8);
-    }
-
-    if (GET_LMASK64(arg2) >= 2) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, -2), mem_idx);
-        arg1 = (arg1 & 0xFFFFFFFFFF00FFFFULL) | (tmp << 16);
-    }
-
-    if (GET_LMASK64(arg2) >= 3) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, -3), mem_idx);
-        arg1 = (arg1 & 0xFFFFFFFF00FFFFFFULL) | (tmp << 24);
-    }
-
-    if (GET_LMASK64(arg2) >= 4) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, -4), mem_idx);
-        arg1 = (arg1 & 0xFFFFFF00FFFFFFFFULL) | (tmp << 32);
-    }
-
-    if (GET_LMASK64(arg2) >= 5) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, -5), mem_idx);
-        arg1 = (arg1 & 0xFFFF00FFFFFFFFFFULL) | (tmp << 40);
-    }
-
-    if (GET_LMASK64(arg2) >= 6) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, -6), mem_idx);
-        arg1 = (arg1 & 0xFF00FFFFFFFFFFFFULL) | (tmp << 48);
-    }
-
-    if (GET_LMASK64(arg2) == 7) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, -7), mem_idx);
-        arg1 = (arg1 & 0x00FFFFFFFFFFFFFFULL) | (tmp << 56);
-    }
-
-    return arg1;
-}
-
 void helper_sdl(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
                 int mem_idx)
 {
diff --git a/target-mips/translate.c b/target-mips/translate.c
index c46129d..b385923 100644
--- a/target-mips/translate.c
+++ b/target-mips/translate.c
@@ -1125,7 +1125,7 @@  static void gen_ld (CPUMIPSState *env, DisasContext *ctx, uint32_t opc,
                     int rt, int base, int16_t offset)
 {
     const char *opn = "ld";
-    TCGv t0, t1;
+    TCGv t0, t1, t2;
 
     if (rt == 0 && env->insn_flags & (INSN_LOONGSON2E | INSN_LOONGSON2F)) {
         /* Loongson CPU uses a load to zero register for prefetch.
@@ -1157,21 +1157,45 @@  static void gen_ld (CPUMIPSState *env, DisasContext *ctx, uint32_t opc,
         opn = "lld";
         break;
     case OPC_LDL:
-        save_cpu_state(ctx, 1);
         t1 = tcg_temp_new();
+        tcg_gen_andi_tl(t1, t0, 7);
+#ifndef TARGET_WORDS_BIGENDIAN
+        tcg_gen_xori_tl(t1, t1, 7);
+#endif
+        tcg_gen_shli_tl(t1, t1, 3);
+        tcg_gen_andi_tl(t0, t0, ~7);
+        tcg_gen_qemu_ld64(t0, t0, ctx->mem_idx);
+        tcg_gen_shl_tl(t0, t0, t1);
+        tcg_gen_xori_tl(t1, t1, 63);
+        t2 = tcg_const_tl(0x7fffffffffffffffull);
+        tcg_gen_shr_tl(t2, t2, t1);
         gen_load_gpr(t1, rt);
-        gen_helper_1e2i(ldl, t1, t1, t0, ctx->mem_idx);
-        gen_store_gpr(t1, rt);
+        tcg_gen_and_tl(t1, t1, t2);
+        tcg_temp_free(t2);
+        tcg_gen_or_tl(t0, t0, t1);
         tcg_temp_free(t1);
+        gen_store_gpr(t0, rt);
         opn = "ldl";
         break;
     case OPC_LDR:
-        save_cpu_state(ctx, 1);
         t1 = tcg_temp_new();
+        tcg_gen_andi_tl(t1, t0, 7);
+#ifdef TARGET_WORDS_BIGENDIAN
+        tcg_gen_xori_tl(t1, t1, 7);
+#endif
+        tcg_gen_shli_tl(t1, t1, 3);
+        tcg_gen_andi_tl(t0, t0, ~7);
+        tcg_gen_qemu_ld64(t0, t0, ctx->mem_idx);
+        tcg_gen_shr_tl(t0, t0, t1);
+        tcg_gen_xori_tl(t1, t1, 63);
+        t2 = tcg_const_tl(0xfffffffffffffffeull);
+        tcg_gen_shl_tl(t2, t2, t1);
         gen_load_gpr(t1, rt);
-        gen_helper_1e2i(ldr, t1, t1, t0, ctx->mem_idx);
-        gen_store_gpr(t1, rt);
+        tcg_gen_and_tl(t1, t1, t2);
+        tcg_temp_free(t2);
+        tcg_gen_or_tl(t0, t0, t1);
         tcg_temp_free(t1);
+        gen_store_gpr(t0, rt);
         opn = "ldr";
         break;
     case OPC_LDPC:
@@ -1217,21 +1241,46 @@  static void gen_ld (CPUMIPSState *env, DisasContext *ctx, uint32_t opc,
         opn = "lbu";
         break;
     case OPC_LWL:
-        save_cpu_state(ctx, 1);
         t1 = tcg_temp_new();
+        tcg_gen_andi_tl(t1, t0, 3);
+#ifndef TARGET_WORDS_BIGENDIAN
+        tcg_gen_xori_tl(t1, t1, 3);
+#endif
+        tcg_gen_shli_tl(t1, t1, 3);
+        tcg_gen_andi_tl(t0, t0, ~3);
+        tcg_gen_qemu_ld32u(t0, t0, ctx->mem_idx);
+        tcg_gen_shl_tl(t0, t0, t1);
+        tcg_gen_xori_tl(t1, t1, 31);
+        t2 = tcg_const_tl(0x7fffffffull);
+        tcg_gen_shr_tl(t2, t2, t1);
         gen_load_gpr(t1, rt);
-        gen_helper_1e2i(lwl, t1, t1, t0, ctx->mem_idx);
-        gen_store_gpr(t1, rt);
+        tcg_gen_and_tl(t1, t1, t2);
+        tcg_temp_free(t2);
+        tcg_gen_or_tl(t0, t0, t1);
         tcg_temp_free(t1);
+        tcg_gen_ext32s_tl(t0, t0);
+        gen_store_gpr(t0, rt);
         opn = "lwl";
         break;
     case OPC_LWR:
-        save_cpu_state(ctx, 1);
         t1 = tcg_temp_new();
+        tcg_gen_andi_tl(t1, t0, 3);
+#ifdef TARGET_WORDS_BIGENDIAN
+        tcg_gen_xori_tl(t1, t1, 3);
+#endif
+        tcg_gen_shli_tl(t1, t1, 3);
+        tcg_gen_andi_tl(t0, t0, ~3);
+        tcg_gen_qemu_ld32u(t0, t0, ctx->mem_idx);
+        tcg_gen_shr_tl(t0, t0, t1);
+        tcg_gen_xori_tl(t1, t1, 31);
+        t2 = tcg_const_tl(0xfffffffeull);
+        tcg_gen_shl_tl(t2, t2, t1);
         gen_load_gpr(t1, rt);
-        gen_helper_1e2i(lwr, t1, t1, t0, ctx->mem_idx);
-        gen_store_gpr(t1, rt);
+        tcg_gen_and_tl(t1, t1, t2);
+        tcg_temp_free(t2);
+        tcg_gen_or_tl(t0, t0, t1);
         tcg_temp_free(t1);
+        gen_store_gpr(t0, rt);
         opn = "lwr";
         break;
     case OPC_LL: