
[2/2] target-mips: simplify LWL/LDL mask generation

Message ID 1436888717-8122-3-git-send-email-aurelien@aurel32.net

Commit Message

Aurelien Jarno July 14, 2015, 3:45 p.m. UTC
The LWL/LDL instructions mask the GPR with a mask that depends on the
address alignment. It is currently computed by doing:

    mask = 0x7fffffffffffffffull >> (t1 ^ 63)

It's simpler to generate it by doing:

    mask = (1 << t1) - 1

It uses the same number of TCG instructions, but it avoids loading a
32/64-bit constant, which can take a few instructions on RISC hosts.

Cc: Leon Alrae <leon.alrae@imgtec.com>
Tested-by: Hervé Poussineau <hpoussin@reactos.org>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
---
 target-mips/translate.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)
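
As a sanity check, here is a minimal standalone C sketch (plain C, not QEMU
code) showing that the old and new mask expressions agree for every shift
amount the LDL path can produce; the 32-bit LWL case works the same way with
0x7fffffffull and t1 ^ 31:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* t1 is the shift amount in bits; LDL only produces multiples of 8
         * in the range 0..56, but the identity holds for any 0..63. */
        for (unsigned t1 = 0; t1 < 64; t1++) {
            uint64_t old_mask = 0x7fffffffffffffffull >> (t1 ^ 63);
            uint64_t new_mask = ((uint64_t)1 << t1) - 1;
            assert(old_mask == new_mask);
        }
        return 0;
    }

Both expressions evaluate to a mask with the low t1 bits set, because
t1 ^ 63 equals 63 - t1 for any t1 in 0..63.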

Comments

Paolo Bonzini July 14, 2015, 4:17 p.m. UTC | #1
On 14/07/2015 17:45, Aurelien Jarno wrote:
> The LWL/LDL instructions mask the GPR with a mask that depends on the
> address alignment. It is currently computed by doing:
> 
>     mask = 0x7fffffffffffffffull >> (t1 ^ 63)
> 
> It's simpler to generate it by doing:
> 
>     mask = (1 << t1) - 1

Using ~(-1 << t1) may let you use an ANDN instruction, and is also the
same number of instructions on x86.
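
A small plain-C illustration of the equivalence (the helper names mask_sub
and mask_andn are made up for illustration; this is not TCG code): the second
form needs only a shift of -1 plus an AND-NOT, which hosts with an ANDN
instruction can do in a single op.

    #include <assert.h>
    #include <stdint.h>

    static uint64_t mask_sub(uint64_t gpr, unsigned t1)
    {
        return gpr & (((uint64_t)1 << t1) - 1);   /* shift 1, subtract, and */
    }

    static uint64_t mask_andn(uint64_t gpr, unsigned t1)
    {
        return gpr & ~((uint64_t)-1 << t1);       /* shift -1, and-not */
    }

    int main(void)
    {
        for (unsigned t1 = 0; t1 < 64; t1++) {
            assert(mask_sub(0x0123456789abcdefull, t1) ==
                   mask_andn(0x0123456789abcdefull, t1));
        }
        return 0;
    }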

Paolo

> It uses the same number of TCG instructions, but it avoids loading a
> 32/64-bit constant, which can take a few instructions on RISC hosts.
> 
> Cc: Leon Alrae <leon.alrae@imgtec.com>
> Tested-by: Hervé Poussineau <hpoussin@reactos.org>
> Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
> ---
>  target-mips/translate.c | 12 ++++++------
>  1 file changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/target-mips/translate.c b/target-mips/translate.c
> index 0ac3bd8..9891209 100644
> --- a/target-mips/translate.c
> +++ b/target-mips/translate.c
> @@ -2153,9 +2153,9 @@ static void gen_ld(DisasContext *ctx, uint32_t opc,
>          tcg_gen_andi_tl(t0, t0, ~7);
>          tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_TEQ);
>          tcg_gen_shl_tl(t0, t0, t1);
> -        tcg_gen_xori_tl(t1, t1, 63);
> -        t2 = tcg_const_tl(0x7fffffffffffffffull);
> -        tcg_gen_shr_tl(t2, t2, t1);
> +        t2 = tcg_const_tl(1);
> +        tcg_gen_shl_tl(t2, t2, t1);
> +        tcg_gen_subi_tl(t2, t2, 1);
>          gen_load_gpr(t1, rt);
>          tcg_gen_and_tl(t1, t1, t2);
>          tcg_temp_free(t2);
> @@ -2246,9 +2246,9 @@ static void gen_ld(DisasContext *ctx, uint32_t opc,
>          tcg_gen_andi_tl(t0, t0, ~3);
>          tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_TEUL);
>          tcg_gen_shl_tl(t0, t0, t1);
> -        tcg_gen_xori_tl(t1, t1, 31);
> -        t2 = tcg_const_tl(0x7fffffffull);
> -        tcg_gen_shr_tl(t2, t2, t1);
> +        t2 = tcg_const_tl(1);
> +        tcg_gen_shl_tl(t2, t2, t1);
> +        tcg_gen_subi_tl(t2, t2, 1);
>          gen_load_gpr(t1, rt);
>          tcg_gen_and_tl(t1, t1, t2);
>          tcg_temp_free(t2);
>
Aurelien Jarno July 14, 2015, 4:45 p.m. UTC | #2
On 2015-07-14 18:17, Paolo Bonzini wrote:
> 
> 
> On 14/07/2015 17:45, Aurelien Jarno wrote:
> > The LWL/LDL instructions mask the GPR with a mask that depends on the
> > address alignment. It is currently computed by doing:
> > 
> >     mask = 0x7fffffffffffffffull >> (t1 ^ 63)
> > 
> > It's simpler to generate it by doing:
> > 
> >     mask = (1 << t1) - 1
> 
> Using ~(-1 << t1) may let you use an ANDN instruction, and is also the
> same number of instructions on x86.
> 

Indeed, thanks for the hint. The generated code is the same size, but
one instruction shorter:

   mov    0x88(%rsp),%r10
   shlx   %r10,%rbx,%rbx
-  mov    $0x1,%r11d
+  mov    $0xffffffffffffffff,%r11
   shlx   %r10,%r11,%r11
-  dec    %r11
   mov    0x18(%r14),%r10
-  and    %r11,%r10
+  andn   %r10,%r11,%r10
   or     %r10,%rbx
   movslq %ebx,%rbx

I'll send a new version of the patch.
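
For reference, one way the revised mask generation could look in TCG with
this suggestion folded in (a sketch of the LDL hunk, not necessarily the
exact v2 that gets posted):

    tcg_gen_shl_tl(t0, t0, t1);
    t2 = tcg_const_tl(-1);
    tcg_gen_shl_tl(t2, t2, t1);
    gen_load_gpr(t1, rt);
    tcg_gen_andc_tl(t1, t1, t2);   /* t1 &= ~(-1 << shift) */
    tcg_temp_free(t2);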
Paolo Bonzini July 14, 2015, 5:11 p.m. UTC | #3
On 14/07/2015 18:45, Aurelien Jarno wrote:
>>> > > 
>>> > >     mask = 0x7fffffffffffffffull >> (t1 ^ 63)
>>> > > 
>>> > > It's simpler to generate it by doing:
>>> > > 
>>> > >     mask = (1 << t1) - 1
>> > 
>> > Using ~(-1 << t1) may let you use an ANDN instruction, and is also the
>> > same number of instructions on x86.
>> > 
> Indeed, thanks for the hint. The generated code is the same size, but
> one instruction shorter:
> 
>    mov    0x88(%rsp),%r10
>    shlx   %r10,%rbx,%rbx
> -  mov    $0x1,%r11d
> +  mov    $0xffffffffffffffff,%r11
>    shlx   %r10,%r11,%r11
> -  dec    %r11
>    mov    0x18(%r14),%r10
> -  and    %r11,%r10
> +  andn   %r10,%r11,%r10
>    or     %r10,%rbx
>    movslq %ebx,%rbx

Oh, indeed I forgot about the fancy new x86 bit manipulation
instructions!  Even better. :)

Paolo

Patch

diff --git a/target-mips/translate.c b/target-mips/translate.c
index 0ac3bd8..9891209 100644
--- a/target-mips/translate.c
+++ b/target-mips/translate.c
@@ -2153,9 +2153,9 @@  static void gen_ld(DisasContext *ctx, uint32_t opc,
         tcg_gen_andi_tl(t0, t0, ~7);
         tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_TEQ);
         tcg_gen_shl_tl(t0, t0, t1);
-        tcg_gen_xori_tl(t1, t1, 63);
-        t2 = tcg_const_tl(0x7fffffffffffffffull);
-        tcg_gen_shr_tl(t2, t2, t1);
+        t2 = tcg_const_tl(1);
+        tcg_gen_shl_tl(t2, t2, t1);
+        tcg_gen_subi_tl(t2, t2, 1);
         gen_load_gpr(t1, rt);
         tcg_gen_and_tl(t1, t1, t2);
         tcg_temp_free(t2);
@@ -2246,9 +2246,9 @@  static void gen_ld(DisasContext *ctx, uint32_t opc,
         tcg_gen_andi_tl(t0, t0, ~3);
         tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_TEUL);
         tcg_gen_shl_tl(t0, t0, t1);
-        tcg_gen_xori_tl(t1, t1, 31);
-        t2 = tcg_const_tl(0x7fffffffull);
-        tcg_gen_shr_tl(t2, t2, t1);
+        t2 = tcg_const_tl(1);
+        tcg_gen_shl_tl(t2, t2, t1);
+        tcg_gen_subi_tl(t2, t2, 1);
         gen_load_gpr(t1, rt);
         tcg_gen_and_tl(t1, t1, t2);
         tcg_temp_free(t2);