diff mbox

[v3,2/5] target/arm: optimize rev16() using extract op

Message ID 20170512182132.jdw4g2pd5gvf2dti@aurel32.net
State New
Headers show

Commit Message

Aurelien Jarno May 12, 2017, 6:21 p.m. UTC
On 2017-05-12 09:50, Richard Henderson wrote:
> On 05/11/2017 08:35 PM, Philippe Mathieu-Daudé wrote:
> > -    tcg_gen_shri_i64(tcg_tmp, tcg_rn, 16);
> > -    tcg_gen_andi_i64(tcg_tmp, tcg_tmp, 0xffff);
> > +    tcg_gen_extract_i64(tcg_tmp, tcg_rn, 16, 0xffff);
> 
> So your new script didn't work then?  This should be "..., 16, 16);".

Indeed that should be ..., 16, 16). That said looking a bit at the 
actual code, it looks like rev16 is not implemented efficiently. Instead
of byteswapping individual 16-bit words one by one, it would be better
to work on the whole register at the same time using shifts and masks.
This is actually how rev16 is implemented for aarch32 (and a few other
targets). Something like that (I can send a proper patch later):


This makes the generated x86-64 code much shorter, especially with sf=1:


* rev16 with sf = 0

before:
    0x5631ecfda582:  movzwl %bx,%r12d
    0x5631ecfda586:  rol    $0x8,%r12w
    0x5631ecfda58b:  shr    $0x10,%rbx
    0x5631ecfda58f:  rol    $0x8,%bx
    0x5631ecfda593:  movzwl %bx,%ebx
    0x5631ecfda596:  shl    $0x10,%rbx
    0x5631ecfda59a:  mov    $0xffffffff0000ffff,%r13
    0x5631ecfda5a4:  and    %r13,%r12
    0x5631ecfda5a7:  or     %rbx,%r12
    
after:
    0x559f7aeae5f2:  mov    %rbx,%r12
    0x559f7aeae5f5:  shr    $0x8,%r12
    0x559f7aeae5f9:  and    $0xff00ff,%r12d
    0x559f7aeae600:  shl    $0x8,%rbx
    0x559f7aeae604:  and    $0xff00ff00,%ebx
    0x559f7aeae60a:  or     %r12,%rbx
    
    
* rev16 with sf = 1
    
before:
    0x5631ecfe5380:  mov    %rbx,%r12
    0x5631ecfe5383:  movzwl %bx,%ebx
    0x5631ecfe5386:  rol    $0x8,%bx
    0x5631ecfe538a:  mov    %r12,%r13
    0x5631ecfe538d:  shr    $0x10,%r13
    0x5631ecfe5391:  movzwl %r13w,%r13d
    0x5631ecfe5395:  rol    $0x8,%r13w
    0x5631ecfe539a:  movzwl %r13w,%r13d
    0x5631ecfe539e:  shl    $0x10,%r13
    0x5631ecfe53a2:  mov    $0xffffffff0000ffff,%r15
    0x5631ecfe53ac:  and    %r15,%rbx
    0x5631ecfe53af:  or     %r13,%rbx
    0x5631ecfe53b2:  mov    %r12,%r13
    0x5631ecfe53b5:  shr    $0x20,%r13
    0x5631ecfe53b9:  movzwl %r13w,%r13d
    0x5631ecfe53bd:  rol    $0x8,%r13w
    0x5631ecfe53c2:  movzwl %r13w,%r13d
    0x5631ecfe53c6:  shl    $0x20,%r13
    0x5631ecfe53ca:  mov    $0xffff0000ffffffff,%r15
    0x5631ecfe53d4:  and    %r15,%rbx
    0x5631ecfe53d7:  or     %r13,%rbx
    0x5631ecfe53da:  shr    $0x30,%r12
    0x5631ecfe53de:  rol    $0x8,%r12w
    0x5631ecfe53e3:  shl    $0x30,%r12
    0x5631ecfe53e7:  mov    $0xffffffffffff,%r13
    0x5631ecfe53f1:  and    %r13,%rbx
    0x5631ecfe53f4:  or     %r12,%rbx
    
after:
    0x559f7aeb93e0:  mov    %rbx,%r12
    0x559f7aeb93e3:  shr    $0x8,%r12
    0x559f7aeb93e7:  mov    $0xff00ff00ff00ff,%r13
    0x559f7aeb93f1:  and    %r13,%r12
    0x559f7aeb93f4:  shl    $0x8,%rbx
    0x559f7aeb93f8:  mov    $0xff00ff00ff00ff00,%r13
    0x559f7aeb9402:  and    %r13,%rbx
    0x559f7aeb9405:  or     %r12,%rbx

Aurelien

Comments

Richard Henderson May 12, 2017, 7:05 p.m. UTC | #1
On 05/12/2017 11:21 AM, Aurelien Jarno wrote:
> +    uint64_t mask1 = sf ? 0x00ff00ff00ff00ffull : 0x00ff00ff;
> +    uint64_t mask2 = sf ? 0xff00ff00ff00ff00ull : 0xff00ff00;
> +
> +    tcg_gen_shri_i64(tcg_tmp, tcg_rn, 8);
> +    tcg_gen_andi_i64(tcg_tmp, tcg_tmp, mask1);
> +    tcg_gen_shli_i64(tcg_rd, tcg_rn, 8);
> +    tcg_gen_andi_i64(tcg_rd, tcg_rd, mask2);

It would probably be better to use a single mask, since they're not free to 
instantiate in a register.  So e.g.

   TCGv mask = tcg_const_i64(sf ? 0x00ff00ff00ff00ffull : 0x00ff00ff);
   tcg_gen_shri_i64(tcg_tmp, tcg_rn, 8);
   tcg_gen_and_i64(tcg_rd, tcg_rn, mask);
   tcg_gen_and_i64(tcg_tmp, tcg_tmp, mask);
   tcg_gen_shli_i64(tcg_rd, tcg_rd, 8);


r~
Aurelien Jarno May 12, 2017, 7:22 p.m. UTC | #2
On 2017-05-12 12:05, Richard Henderson wrote:
> On 05/12/2017 11:21 AM, Aurelien Jarno wrote:
> > +    uint64_t mask1 = sf ? 0x00ff00ff00ff00ffull : 0x00ff00ff;
> > +    uint64_t mask2 = sf ? 0xff00ff00ff00ff00ull : 0xff00ff00;
> > +
> > +    tcg_gen_shri_i64(tcg_tmp, tcg_rn, 8);
> > +    tcg_gen_andi_i64(tcg_tmp, tcg_tmp, mask1);
> > +    tcg_gen_shli_i64(tcg_rd, tcg_rn, 8);
> > +    tcg_gen_andi_i64(tcg_rd, tcg_rd, mask2);
> 
> It would probably be better to use a single mask, since they're not free to
> instantiate in a register.  So e.g.
> 
>   TCGv mask = tcg_const_i64(sf ? 0x00ff00ff00ff00ffull : 0x00ff00ff);
>   tcg_gen_shri_i64(tcg_tmp, tcg_rn, 8);
>   tcg_gen_and_i64(tcg_rd, tcg_rn, mask);
>   tcg_gen_and_i64(tcg_tmp, tcg_tmp, mask);
>   tcg_gen_shli_i64(tcg_rd, tcg_rd, 8);

Indeed that improves things a bit for sf=1. For sf=0 though the
constant is never loaded into a register, it is passed to the and
instructions as an immediate.
Richard Henderson May 12, 2017, 7:38 p.m. UTC | #3
On 05/12/2017 12:22 PM, Aurelien Jarno wrote:
> On 2017-05-12 12:05, Richard Henderson wrote:
>> On 05/12/2017 11:21 AM, Aurelien Jarno wrote:
>>> +    uint64_t mask1 = sf ? 0x00ff00ff00ff00ffull : 0x00ff00ff;
>>> +    uint64_t mask2 = sf ? 0xff00ff00ff00ff00ull : 0xff00ff00;
>>> +
>>> +    tcg_gen_shri_i64(tcg_tmp, tcg_rn, 8);
>>> +    tcg_gen_andi_i64(tcg_tmp, tcg_tmp, mask1);
>>> +    tcg_gen_shli_i64(tcg_rd, tcg_rn, 8);
>>> +    tcg_gen_andi_i64(tcg_rd, tcg_rd, mask2);
>>
>> It would probably be better to use a single mask, since they're not free to
>> instantiate in a register.  So e.g.
>>
>>    TCGv mask = tcg_const_i64(sf ? 0x00ff00ff00ff00ffull : 0x00ff00ff);
>>    tcg_gen_shri_i64(tcg_tmp, tcg_rn, 8);
>>    tcg_gen_and_i64(tcg_rd, tcg_rn, mask);
>>    tcg_gen_and_i64(tcg_tmp, tcg_tmp, mask);
>>    tcg_gen_shli_i64(tcg_rd, tcg_rd, 8);
> 
> Indeed that improves things a bit for sf=1. For sf=0 though the
> constant is never loaded into a register, it is passed to the and
> instructions as an immediate.
> 
For x86 (and sometimes s390) it isn't, but it certainly would be for all other 
hosts.


r~
diff mbox

Patch

diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 24de30d92c..ccb276417b 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -4034,25 +4034,14 @@  static void handle_rev16(DisasContext *s, unsigned int sf,
     TCGv_i64 tcg_rd = cpu_reg(s, rd);
     TCGv_i64 tcg_tmp = tcg_temp_new_i64();
     TCGv_i64 tcg_rn = read_cpu_reg(s, rn, sf);
-
-    tcg_gen_andi_i64(tcg_tmp, tcg_rn, 0xffff);
-    tcg_gen_bswap16_i64(tcg_rd, tcg_tmp);
-
-    tcg_gen_shri_i64(tcg_tmp, tcg_rn, 16);
-    tcg_gen_andi_i64(tcg_tmp, tcg_tmp, 0xffff);
-    tcg_gen_bswap16_i64(tcg_tmp, tcg_tmp);
-    tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_tmp, 16, 16);
-
-    if (sf) {
-        tcg_gen_shri_i64(tcg_tmp, tcg_rn, 32);
-        tcg_gen_andi_i64(tcg_tmp, tcg_tmp, 0xffff);
-        tcg_gen_bswap16_i64(tcg_tmp, tcg_tmp);
-        tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_tmp, 32, 16);
-
-        tcg_gen_shri_i64(tcg_tmp, tcg_rn, 48);
-        tcg_gen_bswap16_i64(tcg_tmp, tcg_tmp);
-        tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_tmp, 48, 16);
-    }
+    uint64_t mask1 = sf ? 0x00ff00ff00ff00ffull : 0x00ff00ff;
+    uint64_t mask2 = sf ? 0xff00ff00ff00ff00ull : 0xff00ff00;
+
+    tcg_gen_shri_i64(tcg_tmp, tcg_rn, 8);
+    tcg_gen_andi_i64(tcg_tmp, tcg_tmp, mask1);
+    tcg_gen_shli_i64(tcg_rd, tcg_rn, 8);
+    tcg_gen_andi_i64(tcg_rd, tcg_rd, mask2);
+    tcg_gen_or_i64(tcg_rd, tcg_rd, tcg_tmp);
 
     tcg_temp_free_i64(tcg_tmp);
 }