[AArch64] Optimized memcmp

Submitted by Wilco Dijkstra on Aug. 8, 2017, 4:03 p.m.

Details

Message ID DB6PR0801MB205393A98B4357550962759B838A0@DB6PR0801MB2053.eurprd08.prod.outlook.com
State New
Headers show

Commit Message

Wilco Dijkstra Aug. 8, 2017, 4:03 p.m.
ping


From: Wilco Dijkstra
Sent: 07 July 2017 16:11
To: libc-alpha@sourceware.org
Cc: nd; Szabolcs Nagy
Subject: [PATCH][AArch64] Optimized memcmp
    
This is an optimized memcmp for AArch64.  This is a complete rewrite
using a different algorithm.  The previous version split into cases
where both inputs were aligned, the inputs were mutually aligned and
unaligned using a byte loop.  The new version combines all these cases,
while small inputs of less than 8 bytes are handled separately.

This allows the main code to be sped up using unaligned loads since
there are now at least 8 bytes to be compared.  After the first 8 bytes,
align the first input.  This ensures each iteration does at most one
unaligned access and mutually aligned inputs behave as aligned.
After the main loop, process the last 8 bytes using unaligned accesses.

This improves performance of (mutually) aligned cases by 25% and 
unaligned by >500% (yes >6 times faster) on large inputs.

ChangeLog:
2017-07-07  Wilco Dijkstra  <wdijkstr@arm.com>

        * sysdeps/aarch64/memcmp.S (memcmp): 
        Rewrite of optimized memcmp.


GLIBC benchtests/bench-memcmp.c performance comparison for Cortex-A53:

Length    1, alignment  1/ 1:           153%
Length    1, alignment  1/ 1:           119%
Length    1, alignment  1/ 1:           154%
Length    2, alignment  2/ 2:           121%
Length    2, alignment  2/ 2:           140%
Length    2, alignment  2/ 2:           121%
Length    3, alignment  3/ 3:           105%
Length    3, alignment  3/ 3:           105%
Length    3, alignment  3/ 3:           105%
Length    4, alignment  4/ 4:           155%
Length    4, alignment  4/ 4:           154%
Length    4, alignment  4/ 4:           161%
Length    5, alignment  5/ 5:           173%
Length    5, alignment  5/ 5:           173%
Length    5, alignment  5/ 5:           173%
Length    6, alignment  6/ 6:           145%
Length    6, alignment  6/ 6:           145%
Length    6, alignment  6/ 6:           145%
Length    7, alignment  7/ 7:           125%
Length    7, alignment  7/ 7:           125%
Length    7, alignment  7/ 7:           125%
Length    8, alignment  8/ 8:           111%
Length    8, alignment  8/ 8:           130%
Length    8, alignment  8/ 8:           124%
Length    9, alignment  9/ 9:           160%
Length    9, alignment  9/ 9:           160%
Length    9, alignment  9/ 9:           150%
Length   10, alignment 10/10:           170%
Length   10, alignment 10/10:           137%
Length   10, alignment 10/10:           150%
Length   11, alignment 11/11:           160%
Length   11, alignment 11/11:           160%
Length   11, alignment 11/11:           160%
Length   12, alignment 12/12:           146%
Length   12, alignment 12/12:           168%
Length   12, alignment 12/12:           156%
Length   13, alignment 13/13:           167%
Length   13, alignment 13/13:           167%
Length   13, alignment 13/13:           173%
Length   14, alignment 14/14:           167%
Length   14, alignment 14/14:           168%
Length   14, alignment 14/14:           168%
Length   15, alignment 15/15:           168%
Length   15, alignment 15/15:           173%
Length   15, alignment 15/15:           173%
Length    1, alignment  0/ 0:           134%
Length    1, alignment  0/ 0:           127%
Length    1, alignment  0/ 0:           119%
Length    2, alignment  0/ 0:           94%
Length    2, alignment  0/ 0:           94%
Length    2, alignment  0/ 0:           106%
Length    3, alignment  0/ 0:           82%
Length    3, alignment  0/ 0:           87%
Length    3, alignment  0/ 0:           82%
Length    4, alignment  0/ 0:           115%
Length    4, alignment  0/ 0:           115%
Length    4, alignment  0/ 0:           122%
Length    5, alignment  0/ 0:           127%
Length    5, alignment  0/ 0:           119%
Length    5, alignment  0/ 0:           127%
Length    6, alignment  0/ 0:           103%
Length    6, alignment  0/ 0:           100%
Length    6, alignment  0/ 0:           100%
Length    7, alignment  0/ 0:           82%
Length    7, alignment  0/ 0:           91%
Length    7, alignment  0/ 0:           87%
Length    8, alignment  0/ 0:           111%
Length    8, alignment  0/ 0:           124%
Length    8, alignment  0/ 0:           124%
Length    9, alignment  0/ 0:           136%
Length    9, alignment  0/ 0:           136%
Length    9, alignment  0/ 0:           136%
Length   10, alignment  0/ 0:           136%
Length   10, alignment  0/ 0:           135%
Length   10, alignment  0/ 0:           136%
Length   11, alignment  0/ 0:           136%
Length   11, alignment  0/ 0:           136%
Length   11, alignment  0/ 0:           135%
Length   12, alignment  0/ 0:           136%
Length   12, alignment  0/ 0:           136%
Length   12, alignment  0/ 0:           136%
Length   13, alignment  0/ 0:           135%
Length   13, alignment  0/ 0:           136%
Length   13, alignment  0/ 0:           136%
Length   14, alignment  0/ 0:           136%
Length   14, alignment  0/ 0:           136%
Length   14, alignment  0/ 0:           136%
Length   15, alignment  0/ 0:           136%
Length   15, alignment  0/ 0:           136%
Length   15, alignment  0/ 0:           136%
Length    4, alignment  0/ 0:           115%
Length    4, alignment  0/ 0:           115%
Length    4, alignment  0/ 0:           115%
Length   32, alignment  0/ 0:           127%
Length   32, alignment  7/ 2:           395%
Length   32, alignment  0/ 0:           127%
Length   32, alignment  0/ 0:           127%
Length    8, alignment  0/ 0:           111%
Length    8, alignment  0/ 0:           124%
Length    8, alignment  0/ 0:           124%
Length   64, alignment  0/ 0:           128%
Length   64, alignment  6/ 4:           475%
Length   64, alignment  0/ 0:           131%
Length   64, alignment  0/ 0:           134%
Length   16, alignment  0/ 0:           128%
Length   16, alignment  0/ 0:           119%
Length   16, alignment  0/ 0:           128%
Length  128, alignment  0/ 0:           129%
Length  128, alignment  5/ 6:           475%
Length  128, alignment  0/ 0:           130%
Length  128, alignment  0/ 0:           129%
Length   32, alignment  0/ 0:           126%
Length   32, alignment  0/ 0:           126%
Length   32, alignment  0/ 0:           126%
Length  256, alignment  0/ 0:           127%
Length  256, alignment  4/ 8:           545%
Length  256, alignment  0/ 0:           126%
Length  256, alignment  0/ 0:           128%
Length   64, alignment  0/ 0:           171%
Length   64, alignment  0/ 0:           171%
Length   64, alignment  0/ 0:           174%
Length  512, alignment  0/ 0:           126%
Length  512, alignment  3/10:           585%
Length  512, alignment  0/ 0:           126%
Length  512, alignment  0/ 0:           127%
Length  128, alignment  0/ 0:           129%
Length  128, alignment  0/ 0:           128%
Length  128, alignment  0/ 0:           129%
Length 1024, alignment  0/ 0:           125%
Length 1024, alignment  2/12:           611%
Length 1024, alignment  0/ 0:           126%
Length 1024, alignment  0/ 0:           126%
Length  256, alignment  0/ 0:           128%
Length  256, alignment  0/ 0:           127%
Length  256, alignment  0/ 0:           128%
Length 2048, alignment  0/ 0:           125%
Length 2048, alignment  1/14:           625%
Length 2048, alignment  0/ 0:           125%
Length 2048, alignment  0/ 0:           125%
Length  512, alignment  0/ 0:           126%
Length  512, alignment  0/ 0:           127%
Length  512, alignment  0/ 0:           127%
Length 4096, alignment  0/ 0:           125%
Length 4096, alignment  0/16:           125%
Length 4096, alignment  0/ 0:           125%
Length 4096, alignment  0/ 0:           125%
Length 1024, alignment  0/ 0:           126%
Length 1024, alignment  0/ 0:           126%
Length 1024, alignment  0/ 0:           126%
Length 8192, alignment  0/ 0:           125%
Length 8192, alignment 63/18:           636%
Length 8192, alignment  0/ 0:           125%
Length 8192, alignment  0/ 0:           125%
Length   16, alignment  1/ 2:           317%
Length   16, alignment  1/ 2:           317%
Length   16, alignment  1/ 2:           317%
Length   32, alignment  2/ 4:           395%
Length   32, alignment  2/ 4:           395%
Length   32, alignment  2/ 4:           398%
Length   64, alignment  3/ 6:           475%
Length   64, alignment  3/ 6:           475%
Length   64, alignment  3/ 6:           477%
Length  128, alignment  4/ 8:           479%
Length  128, alignment  4/ 8:           479%
Length  128, alignment  4/ 8:           479%
Length  256, alignment  5/10:           543%
Length  256, alignment  5/10:           539%
Length  256, alignment  5/10:           543%
Length  512, alignment  6/12:           585%
Length  512, alignment  6/12:           585%
Length  512, alignment  6/12:           585%
Length 1024, alignment  7/14:           611%
Length 1024, alignment  7/14:           611%
Length 1024, alignment  7/14:           611%

---

Comments

Szabolcs Nagy Aug. 8, 2017, 4:09 p.m.
On 08/08/17 17:03, Wilco Dijkstra wrote:
> 
> ping
> 
> 
> From: Wilco Dijkstra
> Sent: 07 July 2017 16:11
> To: libc-alpha@sourceware.org
> Cc: nd; Szabolcs Nagy
> Subject: [PATCH][AArch64] Optimized memcmp
>     
> This is an optimized memcmp for AArch64.  This is a complete rewrite
> using a different algorithm.  The previous version split into cases
> where both inputs were aligned, the inputs were mutually aligned and
> unaligned using a byte loop.  The new version combines all these cases,
> while small inputs of less than 8 bytes are handled separately.
> 
> This allows the main code to be sped up using unaligned loads since
> there are now at least 8 bytes to be compared.  After the first 8 bytes,
> align the first input.  This ensures each iteration does at most one
> unaligned access and mutually aligned inputs behave as aligned.
> After the main loop, process the last 8 bytes using unaligned accesses.
> 
> This improves performance of (mutually) aligned cases by 25% and 
> unaligned by >500% (yes >6 times faster) on large inputs.
> 
> ChangeLog:
> 2017-07-07  Wilco Dijkstra  <wdijkstr@arm.com>
> 
>         * sysdeps/aarch64/memcmp.S (memcmp): 
>         Rewrite of optimized memcmp.
> 

ok to commit.

Patch hide | download patch | download mbox

diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index 4cfcb89297551a07fcfa055827564b833e86f8fb..b99c081bba2c7f26b5b53315d6b806ae22eaaafc 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -22,132 +22,98 @@ 
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64, unaligned accesses.
  */
 
 /* Parameters and result.  */
 #define src1            x0
 #define src2            x1
 #define limit           x2
-#define result         x0
+#define result         w0
 
 /* Internal variables.  */
 #define data1           x3
 #define data1w          w3
 #define data2           x4
 #define data2w          w4
-#define has_nul                x5
-#define diff           x6
-#define endloop                x7
-#define tmp1           x8
-#define tmp2           x9
-#define tmp3           x10
-#define pos            x11
-#define limit_wd       x12
-#define mask           x13
+#define tmp1           x5
 
 ENTRY_ALIGN (memcmp, 6)
         DELOUSE (0)
         DELOUSE (1)
         DELOUSE (2)
-       cbz     limit, L(ret0)
-       eor     tmp1, src1, src2
-       tst     tmp1, #7
-       b.ne    L(misaligned8)
-       ands    tmp1, src1, #7
-       b.ne    L(mutual_align)
-       add     limit_wd, limit, #7
-       lsr     limit_wd, limit_wd, #3
-       /* Start of performance-critical section  -- one 64B cache line.  */
-L(loop_aligned):
-       ldr     data1, [src1], #8
-       ldr     data2, [src2], #8
-L(start_realigned):
-       subs    limit_wd, limit_wd, #1
-       eor     diff, data1, data2      /* Non-zero if differences found.  */
-       csinv   endloop, diff, xzr, ne  /* Last Dword or differences.  */
-       cbz     endloop, L(loop_aligned)
-       /* End of performance-critical section  -- one 64B cache line.  */
-
-       /* Not reached the limit, must have found a diff.  */
-       cbnz    limit_wd, L(not_limit)
-
-       /* Limit % 8 == 0 => all bytes significant.  */
-       ands    limit, limit, #7
-       b.eq    L(not_limit)
-
-       lsl     limit, limit, #3        /* Bits -> bytes.  */
-       mov     mask, #~0
-#ifdef __AARCH64EB__
-       lsr     mask, mask, limit
-#else
-       lsl     mask, mask, limit
-#endif
-       bic     data1, data1, mask
-       bic     data2, data2, mask
-
-       orr     diff, diff, mask
-L(not_limit):
 
-#ifndef        __AARCH64EB__
-       rev     diff, diff
+       subs    limit, limit, 8
+       b.lo    .Lless8
+
+       /* Limit >= 8, so check first 8 bytes using unaligned loads.  */
+       ldr     data1, [src1], 8
+       ldr     data2, [src2], 8
+       and     tmp1, src1, 7
+       add     limit, limit, tmp1
+       cmp     data1, data2
+       bne     .Lreturn
+
+       /* Align src1 and adjust src2 with bytes not yet done.  */
+       sub     src1, src1, tmp1
+       sub     src2, src2, tmp1
+
+       subs    limit, limit, 8
+       b.ls    .Llast_bytes
+
+       /* Loop performing 8 bytes per iteration using aligned src1.
+          Limit is pre-decremented by 8 and must be larger than zero.
+          Exit if <= 8 bytes left to do or if the data is not equal.  */
+       .p2align 4
+.Lloop8:
+       ldr     data1, [src1], 8
+       ldr     data2, [src2], 8
+       subs    limit, limit, 8
+       ccmp    data1, data2, 0, hi  /* NZCV = 0b0000.  */
+       b.eq    .Lloop8
+
+       cmp     data1, data2
+       bne     .Lreturn
+
+       /* Compare last 1-8 bytes using unaligned access.  */
+.Llast_bytes:
+       ldr     data1, [src1, limit]
+       ldr     data2, [src2, limit]
+
+       /* Compare data bytes and set return value to 0, -1 or 1.  */
+.Lreturn:
+#ifndef __AARCH64EB__
         rev     data1, data1
         rev     data2, data2
 #endif
-       /* The MS-non-zero bit of DIFF marks either the first bit
-          that is different, or the end of the significant data.
-          Shifting left now will bring the critical information into the
-          top bits.  */
-       clz     pos, diff
-       lsl     data1, data1, pos
-       lsl     data2, data2, pos
-       /* But we need to zero-extend (char is unsigned) the value and then
-          perform a signed 32-bit subtraction.  */
-       lsr     data1, data1, #56
-       sub     result, data1, data2, lsr #56
-       RET
-
-L(mutual_align):
-       /* Sources are mutually aligned, but are not currently at an
-          alignment boundary.  Round down the addresses and then mask off
-          the bytes that precede the start point.  */
-       bic     src1, src1, #7
-       bic     src2, src2, #7
-       add     limit, limit, tmp1      /* Adjust the limit for the extra.  */
-       lsl     tmp1, tmp1, #3          /* Bytes beyond alignment -> bits.  */
-       ldr     data1, [src1], #8
-       neg     tmp1, tmp1              /* Bits to alignment -64.  */
-       ldr     data2, [src2], #8
-       mov     tmp2, #~0
-#ifdef __AARCH64EB__
-       /* Big-endian.  Early bytes are at MSB.  */
-       lsl     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
-#else
-       /* Little-endian.  Early bytes are at LSB.  */
-       lsr     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
-#endif
-       add     limit_wd, limit, #7
-       orr     data1, data1, tmp2
-       orr     data2, data2, tmp2
-       lsr     limit_wd, limit_wd, #3
-       b       L(start_realigned)
-
-L(ret0):
-       mov     result, #0
-       RET
-
-       .p2align 6
-L(misaligned8):
-       sub     limit, limit, #1
-1:
-       /* Perhaps we can do better than this.  */
-       ldrb    data1w, [src1], #1
-       ldrb    data2w, [src2], #1
-       subs    limit, limit, #1
-       ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
-       b.eq    1b
-       sub     result, data1, data2
-       RET
+       cmp     data1, data2
+.Lret_eq:
+       cset    result, ne
+       cneg    result, result, lo
+       ret
+
+       .p2align 4
+       /* Compare up to 8 bytes.  Limit is [-8..-1].  */
+.Lless8:
+       adds    limit, limit, 4
+       b.lo    .Lless4
+       ldr     data1w, [src1], 4
+       ldr     data2w, [src2], 4
+       cmp     data1w, data2w
+       b.ne    .Lreturn
+       sub     limit, limit, 4
+.Lless4:
+       adds    limit, limit, 4
+       beq     .Lret_eq
+.Lbyte_loop:
+       ldrb    data1w, [src1], 1
+       ldrb    data2w, [src2], 1
+       subs    limit, limit, 1
+       ccmp    data1w, data2w, 0, ne   /* NZCV = 0b0000.  */
+       b.eq    .Lbyte_loop
+       sub     result, data1w, data2w
+       ret
+
 END (memcmp)
 #undef bcmp
 weak_alias (memcmp, bcmp)