diff mbox series

[v2] aarch64: Optimized implementation of strnlen

Message ID 20191022094003.9612-1-zhangxuelei4@huawei.com
State New
Headers show
Series [v2] aarch64: Optimized implementation of strnlen | expand

Commit Message

Xuelei Zhang Oct. 22, 2019, 9:40 a.m. UTC
Optimize the strlen implementation by using vector operations and
loop unrooling in main loop. Compared to aarch64/strnlen.S, it
reduces latency of cases in bench-strnlen by 11%~24% when the length
of src is greater than 64 bytes, with gains throughout the benchmark.
---
 sysdeps/aarch64/strnlen.S | 52 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

Comments

Wilco Dijkstra Oct. 22, 2019, 5:57 p.m. UTC | #1
Hi Xuelei,

> Optimize the strlen implementation by using vector operations and
> loop unrooling in main loop. Compared to aarch64/strnlen.S, it
> reduces latency of cases in bench-strnlen by 11%~24% when the length
> of src is greater than 64 bytes, with gains throughout the benchmark.

Like with strcpy, this improves performance on various microarchitectures,
so this is OK.

Wilco
Adhemerval Zanella Netto Dec. 19, 2019, 7:44 p.m. UTC | #2
On 22/10/2019 14:57, Wilco Dijkstra wrote:
> Hi Xuelei,
> 
>> Optimize the strlen implementation by using vector operations and
>> loop unrooling in main loop. Compared to aarch64/strnlen.S, it
>> reduces latency of cases in bench-strnlen by 11%~24% when the length
>> of src is greater than 64 bytes, with gains throughout the benchmark.
> 
> Like with strcpy, this improves performance on various microarchitectures,
> so this is OK.
> 
> Wilco

I pushed it upstream.
diff mbox series

Patch

diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
index 70283c80749..a57753b0a28 100644
--- a/sysdeps/aarch64/strnlen.S
+++ b/sysdeps/aarch64/strnlen.S
@@ -45,6 +45,11 @@ 
 #define pos		x13
 #define limit_wd	x14
 
+#define dataq		q2
+#define datav		v2
+#define datab2		b3
+#define dataq2		q3
+#define datav2		v3
 #define REP8_01 0x0101010101010101
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
 #define REP8_80 0x8080808080808080
@@ -71,7 +76,7 @@  ENTRY_ALIGN_AND_PAD (__strnlen, 6, 9)
 	   cycle, as we get much better parallelism out of the operations.  */
 
 	/* Start of critial section -- keep to one 64Byte cache line.  */
-L(loop):
+
 	ldp	data1, data2, [src], #16
 L(realigned):
 	sub	tmp1, data1, zeroones
@@ -119,6 +124,51 @@  L(nul_in_data2):
 	csel	len, len, limit, ls		/* Return the lower value.  */
 	RET
 
+L(loop):
+	ldr	dataq, [src], #16
+	uminv	datab2, datav.16b
+	mov	tmp1, datav2.d[0]
+	subs	limit_wd, limit_wd, #1
+	ccmp	tmp1, #0, #4, pl	/* NZCV = 0000  */
+	b.eq	L(loop_end)
+	ldr	dataq, [src], #16
+	uminv	datab2, datav.16b
+	mov	tmp1, datav2.d[0]
+	subs	limit_wd, limit_wd, #1
+	ccmp	tmp1, #0, #4, pl	/* NZCV = 0000  */
+	b.ne	L(loop)
+L(loop_end):
+	/* End of critical section -- keep to one 64Byte cache line.  */
+
+	cbnz	tmp1, L(hit_limit)	/* No null in final Qword.  */
+
+	/* We know there's a null in the final Qword.  The easiest thing
+	   to do now is work out the length of the string and return
+	   MIN (len, limit).  */
+
+#ifdef __AARCH64EB__
+	rev64	datav.16b, datav.16b
+#endif
+	/* Set te NULL byte as 0xff and the rest as 0x00, move the data into a
+	   pair of scalars and then compute the length from the earliest NULL
+	   byte.  */
+
+	cmeq	datav.16b, datav.16b, #0
+	mov	data1, datav.d[0]
+	mov	data2, datav.d[1]
+	cmp	data1, 0
+	csel	data1, data1, data2, ne
+	sub	len, src, srcin
+	sub	len, len, #16
+	rev	data1, data1
+	add	tmp2, len, 8
+	clz	tmp1, data1
+	csel	len, len, tmp2, ne
+	add	len, len, tmp1, lsr 3
+	cmp	len, limit
+	csel	len, len, limit, ls		/* Return the lower value.  */
+	RET
+
 L(misaligned):
 	/* Deal with a partial first word.
 	   We're doing two things in parallel here;