[v4,22/33] tcg-aarch64: Use MOVN in tcg_out_movi

Message ID: 1379195690-6509-23-git-send-email-rth@twiddle.net
State: New

Commit Message

Richard Henderson Sept. 14, 2013, 9:54 p.m. UTC
When profitable, initialize the register with MOVN instead of MOVZ,
before setting the remaining lanes with MOVK.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/aarch64/tcg-target.c | 88 +++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 75 insertions(+), 13 deletions(-)
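
To get a feel for the algorithm, here is a standalone C sketch of the same
selection logic.  It is an illustration, not the QEMU code: movi_sequence and
lane_shift are made-up names, instructions are printed instead of emitted via
tcg_fmt_Rd_uimm, and __builtin_ctzll (GCC/Clang) stands in for QEMU's ctz64,
whose ctz64(0) == 64 case the helper handles explicitly.

#include <stdint.h>
#include <stdio.h>

/* Trailing-zero count rounded down to a 16-bit lane boundary; mirrors
   "ctz64(value) & (63 & -16)".  ctz64(0) returns 64 in QEMU, which maps
   to shift 0, so handle v == 0 explicitly here.  */
static int lane_shift(uint64_t v)
{
    return v ? __builtin_ctzll(v) & (63 & -16) : 0;
}

/* Print the MOVZ/MOVN + MOVK sequence the patch would emit.  */
static void movi_sequence(uint64_t value)
{
    uint64_t svalue = value, ivalue, imask = 0;
    const char *insn = "movz";
    int i, wantinv = 0, shift;

    /* Values that fit in 32 bits are treated as 32-bit quantities;
       svalue is the sign-extended view used for the MOVN heuristic.  */
    if ((value & ~0xffffffffull) == 0) {
        svalue = (uint64_t)(int64_t)(int32_t)value;
        value = (uint32_t)value;
    }
    ivalue = ~svalue;

    /* Count zero 16-bit lanes in the value and in its inverse.  */
    for (i = 0; i < 64; i += 16) {
        uint64_t mask = 0xffffull << i;
        if ((value & mask) == 0) {
            wantinv -= 1;
        }
        if ((ivalue & mask) == 0) {
            wantinv += 1;
            imask |= mask;
        }
    }

    /* More 0xffff lanes than 0x0000 lanes: start with MOVN instead.  */
    if (wantinv > 0) {
        value = ivalue;
        insn = "movn";
    }

    shift = lane_shift(value);
    printf("%s ret, #0x%04x, lsl #%d\n", insn,
           (unsigned)((value >> shift) & 0xffff), shift);

    if (wantinv > 0) {
        value = ~value;   /* MOVK must see non-inverted bits  */
        value ^= imask;   /* drop the all-ones lanes          */
    }
    value &= ~(0xffffull << shift);

    while (value) {
        shift = lane_shift(value);
        printf("movk ret, #0x%04x, lsl #%d\n",
               (unsigned)((value >> shift) & 0xffff), shift);
        value &= ~(0xffffull << shift);
    }
}

int main(void)
{
    movi_sequence(0x00000000f100ffffull);  /* movn ret, #0x0eff, lsl #16 */
    movi_sequence(0x1234000000005678ull);  /* movz + movk                */
    return 0;
}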

Comments

Claudio Fontana Sept. 16, 2013, 9:16 a.m. UTC | #1
On 14.09.2013 23:54, Richard Henderson wrote:
> When profitable, initialize the register with MOVN instead of MOVZ,
> before setting the remaining lanes with MOVK.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/aarch64/tcg-target.c | 88 +++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 75 insertions(+), 13 deletions(-)
> 
> diff --git a/tcg/aarch64/tcg-target.c b/tcg/aarch64/tcg-target.c
> index f9319ed..cecda05 100644
> --- a/tcg/aarch64/tcg-target.c
> +++ b/tcg/aarch64/tcg-target.c
> @@ -559,24 +559,86 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
>                           tcg_target_long value)
>  {
>      AArch64Insn insn;
> -
> -    if (type == TCG_TYPE_I32) {
> +    int i, wantinv, shift;
> +    tcg_target_long svalue = value;
> +    tcg_target_long ivalue, imask;
> +
> +    /* For 32-bit values, discard potential garbage in value.  For 64-bit
> +       values within [2**31, 2**32-1], we can create smaller sequences by
> +       interpreting this as a negative 32-bit number, while ensuring that
> +       the high 32 bits are cleared by setting SF=0.  */
> +    if (type == TCG_TYPE_I32 || (value & ~0xffffffffull) == 0) {
> +        svalue = (int32_t)value;
>          value = (uint32_t)value;
> +        type = TCG_TYPE_I32;
> +    }
> +
> +    /* Would it take fewer insns to begin with MOVN?  For the value and its
> +       inverse, count the number of 16-bit lanes that are 0.  For the benefit
> +       of 32-bit quantities, compare the zero-extended normal value vs the
> +       sign-extended inverted value.  For example,
> +            v = 0x00000000f100ffff, zeros = 2
> +           ~v = 0xffffffff0eff0000, zeros = 1
> +          ~sv = 0x000000000eff0000, zeros = 3
> +       By using ~sv we see that 3 > 2, leading us to emit just a single insn
> +       "movn ret, 0x0eff, lsl #16".  */
> +
> +    ivalue = ~svalue;
> +    imask = 0;
> +    wantinv = 0;
> +
> +    /* ??? This can be done in the simd unit without a loop:
> +        // Move value and ivalue into V0 and V1 respectively.
> +        mov     v0.d[0], value
> +        mov     v1.d[0], ivalue
> +        // Compare each 16-bit lane vs 0, producing -1 for true.
> +        cmeq    v0.4h, v0.4h, #0
> +        cmeq    v1.4h, v1.4h, #0
> +        mov     imask, v1.d[0]
> +        // Sum the comparisons, producing 0 to -4.
> +        addv    h0, v0.4h
> +        addv    h1, v1.4h
> +        // Subtract the two, forming a positive wantinv result.
> +        sub     v0.4h, v0.4h, v1.4h
> +        smov    wantinv, v0.h[0]
> +     */
> +    for (i = 0; i < 64; i += 16) {
> +        tcg_target_long mask = 0xffffull << i;
> +        if ((value & mask) == 0) {
> +            wantinv -= 1;
> +        }
> +        if ((ivalue & mask) == 0) {
> +            wantinv += 1;
> +            imask |= mask;
> +        }
>      }
>  
> -    /* count trailing zeros in 16 bit steps, mapping 64 to 0. Emit the
> -       first MOVZ with the half-word immediate skipping the zeros, with a shift
> -       (LSL) equal to this number. Then all next instructions use MOVKs.
> -       Zero the processed half-word in the value, continue until empty.
> -       We build the final result 16bits at a time with up to 4 instructions,
> -       but do not emit instructions for 16bit zero holes. */
> +    /* If we had more 0xffff than 0x0000, invert VALUE and use MOVN.  */
>      insn = INSN_MOVZ;
> -    do {
> -        unsigned shift = ctz64(value) & (63 & -16);
> -        tcg_fmt_Rd_uimm(s, insn, shift >= 32, rd, value >> shift, shift);
> +    if (wantinv > 0) {
> +        value = ivalue;
> +        insn = INSN_MOVN;
> +    }
> +
> +    /* Find the lowest lane that is not 0x0000.  */
> +    shift = ctz64(value) & (63 & -16);
> +    tcg_fmt_Rd_uimm(s, insn, type, rd, value >> shift, shift);
> +
> +    if (wantinv > 0) {
> +        /* Re-invert the value, so MOVK sees non-inverted bits.  */
> +        value = ~value;
> +        /* Clear out all the 0xffff lanes.  */
> +        value ^= imask;
> +    }
> +    /* Clear out the lane that we just set.  */
> +    value &= ~(0xffffUL << shift);
> +
> +    /* Iterate until all lanes have been set, and thus cleared from VALUE.  */
> +    while (value) {
> +        shift = ctz64(value) & (63 & -16);
> +        tcg_fmt_Rd_uimm(s, INSN_MOVK, type, rd, value >> shift, shift);
>          value &= ~(0xffffUL << shift);
> -        insn = INSN_MOVK;
> -    } while (value);
> +    }
>  }
>  
>  static inline void tcg_out_ldst_r(TCGContext *s,
> 

I agree in general with the approach "lets see if it is more convenient to start with MOVN".
The existing implementation is, although not easy, leaner.
Can we make it a little this one a little bit leaner?
I'll think myself about it as well.

C.
Richard Henderson Sept. 16, 2013, 3:50 p.m. UTC | #2
On 09/16/2013 02:16 AM, Claudio Fontana wrote:
> I agree in general with the approach "lets see if it is more convenient to start with MOVN".
> The existing implementation is, although not easy, leaner.
> Can we make it a little this one a little bit leaner?

This sentence is not well formed.  What did you mean?

In what sense did you mean "lean"?  Smaller or faster?
If faster, see the comment about using neon insns.
If smaller... um... why?



r~
Claudio Fontana Sept. 17, 2013, 7:55 a.m. UTC | #3
On 16.09.2013 17:50, Richard Henderson wrote:
> On 09/16/2013 02:16 AM, Claudio Fontana wrote:
>> I agree in general with the approach "lets see if it is more convenient to start with MOVN".
>> The existing implementation is, although not easy, leaner.
>> Can we make it a little this one a little bit leaner?
> 
> This sentence is not well formed.  What did you mean?
> 
> In what sense did you mean "lean"?  Smaller or faster?
> If faster, see the comment about using neon insns.
> If smaller... um... why?

I am not suggesting implementing the neon insns based thing.
I am suggesting looking at ways to reduce the size and complexity of the code needed to implement the same thing you just posted.
If you don't see the why, there is probably little I can say to change that.
Richard Henderson Sept. 17, 2013, 3:56 p.m. UTC | #4
On 09/17/2013 12:55 AM, Claudio Fontana wrote:
> On 16.09.2013 17:50, Richard Henderson wrote:
>> On 09/16/2013 02:16 AM, Claudio Fontana wrote:
>>> I agree in general with the approach "lets see if it is more convenient to start with MOVN".
>>> The existing implementation is, although not easy, leaner.
>>> Can we make it a little this one a little bit leaner?
>>
>> This sentence is not well formed.  What did you mean?
>>
>> In what sense did you mean "lean"?  Smaller or faster?
>> If faster, see the comment about using neon insns.
>> If smaller... um... why?
> 
> I am not suggesting implementing the neon insns based thing.
> I am suggesting looking at ways to reduce the size and complexity of the code needed to implement the same thing you just posted.
> If you don't see the why, there is probably little I can say to change that.

I interpreted your meaning as akin to -Os, producing a smaller binary.

I obviously wrote it in the least complex way I could think of to do the job.  If
you can find a simpler way, feel free.


r~

Patch

diff --git a/tcg/aarch64/tcg-target.c b/tcg/aarch64/tcg-target.c
index f9319ed..cecda05 100644
--- a/tcg/aarch64/tcg-target.c
+++ b/tcg/aarch64/tcg-target.c
@@ -559,24 +559,86 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
                          tcg_target_long value)
 {
     AArch64Insn insn;
-
-    if (type == TCG_TYPE_I32) {
+    int i, wantinv, shift;
+    tcg_target_long svalue = value;
+    tcg_target_long ivalue, imask;
+
+    /* For 32-bit values, discard potential garbage in value.  For 64-bit
+       values within [2**31, 2**32-1], we can create smaller sequences by
+       interpreting this as a negative 32-bit number, while ensuring that
+       the high 32 bits are cleared by setting SF=0.  */
+    if (type == TCG_TYPE_I32 || (value & ~0xffffffffull) == 0) {
+        svalue = (int32_t)value;
         value = (uint32_t)value;
+        type = TCG_TYPE_I32;
+    }
+
+    /* Would it take fewer insns to begin with MOVN?  For the value and its
+       inverse, count the number of 16-bit lanes that are 0.  For the benefit
+       of 32-bit quantities, compare the zero-extended normal value vs the
+       sign-extended inverted value.  For example,
+            v = 0x00000000f100ffff, zeros = 2
+           ~v = 0xffffffff0eff0000, zeros = 1
+          ~sv = 0x000000000eff0000, zeros = 3
+       By using ~sv we see that 3 > 2, leading us to emit just a single insn
+       "movn ret, 0x0eff, lsl #16".  */
+
+    ivalue = ~svalue;
+    imask = 0;
+    wantinv = 0;
+
+    /* ??? This can be done in the simd unit without a loop:
+        // Move value and ivalue into V0 and V1 respectively.
+        mov     v0.d[0], value
+        mov     v1.d[0], ivalue
+        // Compare each 16-bit lane vs 0, producing -1 for true.
+        cmeq    v0.4h, v0.4h, #0
+        cmeq    v1.4h, v1.4h, #0
+        mov     imask, v1.d[0]
+        // Sum the comparisons, producing 0 to -4.
+        addv    h0, v0.4h
+        addv    h1, v1.4h
+        // Subtract the two, forming a positive wantinv result.
+        sub     v0.4h, v0.4h, v1.4h
+        smov    wantinv, v0.h[0]
+     */
+    for (i = 0; i < 64; i += 16) {
+        tcg_target_long mask = 0xffffull << i;
+        if ((value & mask) == 0) {
+            wantinv -= 1;
+        }
+        if ((ivalue & mask) == 0) {
+            wantinv += 1;
+            imask |= mask;
+        }
     }
 
-    /* count trailing zeros in 16 bit steps, mapping 64 to 0. Emit the
-       first MOVZ with the half-word immediate skipping the zeros, with a shift
-       (LSL) equal to this number. Then all next instructions use MOVKs.
-       Zero the processed half-word in the value, continue until empty.
-       We build the final result 16bits at a time with up to 4 instructions,
-       but do not emit instructions for 16bit zero holes. */
+    /* If we had more 0xffff than 0x0000, invert VALUE and use MOVN.  */
     insn = INSN_MOVZ;
-    do {
-        unsigned shift = ctz64(value) & (63 & -16);
-        tcg_fmt_Rd_uimm(s, insn, shift >= 32, rd, value >> shift, shift);
+    if (wantinv > 0) {
+        value = ivalue;
+        insn = INSN_MOVN;
+    }
+
+    /* Find the lowest lane that is not 0x0000.  */
+    shift = ctz64(value) & (63 & -16);
+    tcg_fmt_Rd_uimm(s, insn, type, rd, value >> shift, shift);
+
+    if (wantinv > 0) {
+        /* Re-invert the value, so MOVK sees non-inverted bits.  */
+        value = ~value;
+        /* Clear out all the 0xffff lanes.  */
+        value ^= imask;
+    }
+    /* Clear out the lane that we just set.  */
+    value &= ~(0xffffUL << shift);
+
+    /* Iterate until all lanes have been set, and thus cleared from VALUE.  */
+    while (value) {
+        shift = ctz64(value) & (63 & -16);
+        tcg_fmt_Rd_uimm(s, INSN_MOVK, type, rd, value >> shift, shift);
         value &= ~(0xffffUL << shift);
-        insn = INSN_MOVK;
-    } while (value);
+    }
 }
 
 static inline void tcg_out_ldst_r(TCGContext *s,
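
On the "???" comment in the patch: a hedged guess at how that branch-free
lane count might look with A64 ACLE NEON intrinsics (vceqz_s16 and vaddv_s16
are ARMv8/A64-only; count_lanes is a made-up name and this is untested):

#include <arm_neon.h>
#include <stdint.h>

/* Branch-free version of the lane-counting loop, following the "???"
   comment in the patch.  */
static void count_lanes(uint64_t value, uint64_t ivalue,
                        int *wantinv, uint64_t *imask)
{
    int16x4_t v0 = vreinterpret_s16_u64(vdup_n_u64(value));
    int16x4_t v1 = vreinterpret_s16_u64(vdup_n_u64(ivalue));

    /* Compare each 16-bit lane against 0: 0xffff (-1) where true.  */
    uint16x4_t z0 = vceqz_s16(v0);
    uint16x4_t z1 = vceqz_s16(v1);

    /* The zero lanes of ivalue are exactly the 0xffff lanes of imask.  */
    *imask = vget_lane_u64(vreinterpret_u64_u16(z1), 0);

    /* Horizontal add of the -1s: each sum is in [-4, 0].  */
    int sum0 = vaddv_s16(vreinterpret_s16_u16(z0));
    int sum1 = vaddv_s16(vreinterpret_s16_u16(z1));

    /* wantinv > 0 when ivalue has more zero lanes than value.  */
    *wantinv = sum0 - sum1;
}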