
AArch64: Improve strlen_asimd

Message ID PAWPR08MB8982BE5A8C1635DBCB89700D83FD9@PAWPR08MB8982.eurprd08.prod.outlook.com
State New
Series AArch64: Improve strlen_asimd

Commit Message

Wilco Dijkstra Jan. 12, 2023, 3:51 p.m. UTC
Use shrn for the mask, merge tst+bne into cbnz, and tweak code alignment.
Performance improves slightly as a result.  Passes regress.

---
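
(As background for the change above: the shrn instruction replaces the bic+umaxp pair that previously compressed the 16-byte comparison mask into a 64-bit syndrome.  A minimal sketch of the idea using NEON intrinsics follows, assuming little-endian; the nul_index16 helper and the test driver are illustrative only, not glibc code.)

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

/* Return the offset of the first NUL in a 16-byte chunk, or 16 if none.  */
static inline unsigned nul_index16 (const uint8_t *p)
{
  uint8x16_t data = vld1q_u8 (p);
  /* cmeq: 0xff in every byte that equals NUL, 0x00 elsewhere.  */
  uint8x16_t cmp = vceqq_u8 (data, vdupq_n_u8 (0));
  /* shrn: shift each 16-bit lane right by 4 and keep the low 8 bits,
     squeezing the 128-bit mask into a 64-bit syndrome that carries
     4 bits per input byte.  */
  uint8x8_t nib = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
  uint64_t synd = vget_lane_u64 (vreinterpret_u64_u8 (nib), 0);
  if (synd == 0)
    return 16;
  /* Equivalent of the asm's rbit+clz followed by "lsr 2": index of the
     first set bit, divided by 4 to convert nibbles back to bytes.  */
  return (unsigned) (__builtin_ctzll (synd) >> 2);
}

int main (void)
{
  uint8_t buf[16] = "abcdef";          /* first NUL at offset 6 */
  printf ("%u\n", nul_index16 (buf));  /* prints 6 */
  return 0;
}

The patch applies the same idea per 32-byte iteration in L(loop), where the unchanged rbit/clz/lsr 2 tail extracts the byte offset.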

Comments

Szabolcs Nagy Jan. 13, 2023, 12:25 p.m. UTC | #1
The 01/12/2023 15:51, Wilco Dijkstra wrote:
> Use shrn for the mask, merge tst+bne into cbnz, and tweak code alignment.
> Performance improves slightly as a result.  Passes regress.
> 

I prefer to commit this and the other string function optimization
patches now rather than delay them to the next release, so we start
using and widely testing them sooner (we can fix and backport any
performance regressions).

please commit it, thanks.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>


> ---
> 
> diff --git a/sysdeps/aarch64/multiarch/strlen_asimd.S b/sysdeps/aarch64/multiarch/strlen_asimd.S
> index ca6ab96ecf2de45def79539facd8e0b86f4edc95..490439491d19c3f14b0228f42248bc8aa6e9e8bd 100644
> --- a/sysdeps/aarch64/multiarch/strlen_asimd.S
> +++ b/sysdeps/aarch64/multiarch/strlen_asimd.S
> @@ -48,6 +48,7 @@
>  #define tmp    x2
>  #define tmpw   w2
>  #define synd   x3
> +#define syndw  w3
>  #define shift  x4
> 
>  /* For the first 32 bytes, NUL detection works on the principle that
> @@ -87,7 +88,6 @@
> 
>  ENTRY (__strlen_asimd)
>         PTR_ARG (0)
> -
>         and     tmp1, srcin, MIN_PAGE_SIZE - 1
>         cmp     tmp1, MIN_PAGE_SIZE - 32
>         b.hi    L(page_cross)
> @@ -123,7 +123,6 @@ ENTRY (__strlen_asimd)
>         add     len, len, tmp1, lsr 3
>         ret
> 
> -       .p2align 3
>         /* Look for a NUL byte at offset 16..31 in the string.  */
>  L(bytes16_31):
>         ldp     data1, data2, [srcin, 16]
> @@ -151,6 +150,7 @@ L(bytes16_31):
>         add     len, len, tmp1, lsr 3
>         ret
> 
> +       nop
>  L(loop_entry):
>         bic     src, srcin, 31
> 
> @@ -166,18 +166,12 @@ L(loop):
>         /* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
>         cmeq    maskv.16b, datav1.16b, 0
>         sub     len, src, srcin
> -       tst     synd, 0xffffffff
> -       b.ne    1f
> +       cbnz    syndw, 1f
>         cmeq    maskv.16b, datav2.16b, 0
>         add     len, len, 16
>  1:
>         /* Generate a bitmask and compute correct byte offset.  */
> -#ifdef __AARCH64EB__
> -       bic     maskv.8h, 0xf0
> -#else
> -       bic     maskv.8h, 0x0f, lsl 8
> -#endif
> -       umaxp   maskv.16b, maskv.16b, maskv.16b
> +       shrn    maskv.8b, maskv.8h, 4
>         fmov    synd, maskd
>  #ifndef __AARCH64EB__
>         rbit    synd, synd
> @@ -186,8 +180,6 @@ L(loop):
>         add     len, len, tmp, lsr 2
>         ret
> 
> -        .p2align 4
> -
>  L(page_cross):
>         bic     src, srcin, 31
>         mov     tmpw, 0x0c03
Carlos O'Donell Jan. 16, 2023, 10:16 p.m. UTC | #2
On 1/13/23 07:25, Szabolcs Nagy via Libc-alpha wrote:
> The 01/12/2023 15:51, Wilco Dijkstra wrote:
>> Use shrn for the mask, merge tst+bne into cbnz, and tweak code alignment.
>> Performance improves slightly as a result.  Passes regress.
>>
> 
> I prefer to commit this and the other string function optimization
> patches now rather than delay them to the next release, so we start
> using and widely testing them sooner (we can fix and backport any
> performance regressions).
> 
> please commit it, thanks.
> 
> Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>

Yes, please commit this ASAP.

As RM I would like to see machine testing start next week and so I'd like to
see such changes committed early this week.
Wilco Dijkstra Jan. 17, 2023, 4:37 p.m. UTC | #3
Hi Carlos,

>> I prefer to commit this and the other string function optimization
>> patches now rather than delay them to the next release, so we start
>> using and widely testing them sooner (we can fix and backport any
>> performance regressions).
>> 
>> please commit it, thanks.
>> 
>> Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
>
> Yes, please commit this ASAP.
> 
> As RM I would like to see machine testing start next week and so I'd like to
> see such changes committed early this week.

Thanks, I've pushed it.

Cheers,
Wilco

Patch

diff --git a/sysdeps/aarch64/multiarch/strlen_asimd.S b/sysdeps/aarch64/multiarch/strlen_asimd.S
index ca6ab96ecf2de45def79539facd8e0b86f4edc95..490439491d19c3f14b0228f42248bc8aa6e9e8bd 100644
--- a/sysdeps/aarch64/multiarch/strlen_asimd.S
+++ b/sysdeps/aarch64/multiarch/strlen_asimd.S
@@ -48,6 +48,7 @@ 
 #define tmp	x2
 #define tmpw	w2
 #define synd	x3
+#define syndw	w3
 #define shift	x4
 
 /* For the first 32 bytes, NUL detection works on the principle that
@@ -87,7 +88,6 @@ 
 
 ENTRY (__strlen_asimd)
 	PTR_ARG (0)
-
 	and	tmp1, srcin, MIN_PAGE_SIZE - 1
 	cmp	tmp1, MIN_PAGE_SIZE - 32
 	b.hi	L(page_cross)
@@ -123,7 +123,6 @@  ENTRY (__strlen_asimd)
 	add	len, len, tmp1, lsr 3
 	ret
 
-	.p2align 3
 	/* Look for a NUL byte at offset 16..31 in the string.  */
 L(bytes16_31):
 	ldp	data1, data2, [srcin, 16]
@@ -151,6 +150,7 @@  L(bytes16_31):
 	add	len, len, tmp1, lsr 3
 	ret
 
+	nop
 L(loop_entry):
 	bic	src, srcin, 31
 
@@ -166,18 +166,12 @@  L(loop):
 	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
 	cmeq	maskv.16b, datav1.16b, 0
 	sub	len, src, srcin
-	tst	synd, 0xffffffff
-	b.ne	1f
+	cbnz	syndw, 1f
 	cmeq	maskv.16b, datav2.16b, 0
 	add	len, len, 16
 1:
 	/* Generate a bitmask and compute correct byte offset.  */
-#ifdef __AARCH64EB__
-	bic	maskv.8h, 0xf0
-#else
-	bic	maskv.8h, 0x0f, lsl 8
-#endif
-	umaxp	maskv.16b, maskv.16b, maskv.16b
+	shrn	maskv.8b, maskv.8h, 4
 	fmov	synd, maskd
 #ifndef __AARCH64EB__
 	rbit	synd, synd
@@ -186,8 +180,6 @@  L(loop):
 	add	len, len, tmp, lsr 2
 	ret
 
-        .p2align 4
-
 L(page_cross):
 	bic	src, srcin, 31
 	mov	tmpw, 0x0c03
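
(For contrast, the sequence the patch removes can be sketched the same way.  This is illustrative only and assumes little-endian; nul_syndrome_old is a made-up name, and the immediate-form bic is written as an AND with the inverted constant because ACLE has no immediate bic intrinsic.  The resulting syndrome is not bit-identical to the shrn one, but its first set bit still marks the first NUL, so the same rbit/clz extraction applies; shrn does the whole job in one instruction with no endian-specific immediate.)

#include <arm_neon.h>
#include <stdint.h>

/* Old-style syndrome from a cmeq result: clear the low nibble of every
   odd byte ("bic maskv.8h, 0x0f, lsl 8"), then fold adjacent bytes with
   a pairwise max ("umaxp") so each result byte covers two input bytes.  */
uint64_t nul_syndrome_old (uint8x16_t cmp)
{
  uint16x8_t masked = vandq_u16 (vreinterpretq_u16_u8 (cmp),
                                 vdupq_n_u16 (0xf0ff));
  uint8x16_t packed = vpmaxq_u8 (vreinterpretq_u8_u16 (masked),
                                 vreinterpretq_u8_u16 (masked));
  return vgetq_lane_u64 (vreinterpretq_u64_u8 (packed), 0);
}

Substituting synd = nul_syndrome_old (cmp); for the shrn lines in the earlier nul_index16 sketch yields the same result, just via two extra vector operations.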