[v2] aarch64: Optimized implementation of strcpy

Message ID 20191022093930.10588-1-zhangxuelei4@huawei.com
State New
Series
  • [v2] aarch64: Optimized implementation of strcpy

Commit Message

Xuelei Zhang Oct. 22, 2019, 9:39 a.m. UTC
Optimize the strcpy implementation by using vector loads and operations
in the main loop. Compared to the current aarch64/strcpy.S, it reduces the
latency of bench-strcpy cases by 5% to 18% when the length of src is
greater than 64 bytes, with gains throughout the benchmark.
---
 sysdeps/aarch64/strcpy.S | 59 ++++++++++++++++++++++--------------------------
 1 file changed, 27 insertions(+), 32 deletions(-)
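
The core idea of the new main loop can be sketched in C with the ACLE NEON
intrinsics. This is purely illustrative, not part of the patch: bulk_copy is
a hypothetical helper, and the sketch assumes that reading src in 16-byte
chunks cannot fault and that the bulk path is taken at all; the real
strcpy.S establishes both before reaching L(bulk_entry).

#include <arm_neon.h>
#include <stdint.h>

/* Illustrative sketch only, not part of the patch.  vminvq_u8 is the
   intrinsic form of the uminv instruction: its result is zero iff the
   16-byte chunk contains a NUL byte.  */
static char *
bulk_copy (char *dst, const char *src)
{
  for (;;)
    {
      uint8x16_t data = vld1q_u8 ((const uint8_t *) src);
      if (vminvq_u8 (data) == 0)	/* Chunk contains a NUL.  */
	break;
      vst1q_u8 ((uint8_t *) dst, data);	/* str dataq, [dst], #16  */
      src += 16;
      dst += 16;
    }
  /* Tail: byte copy for clarity; the assembly instead locates the NUL
     with cmeq/rev/clz and re-copies the 16 bytes that end at it.  */
  while ((*dst++ = *src++) != '\0')
    ;
  return dst - 1;	/* stpcpy-style result; strcpy returns dstin.  */
}

Relative to the old ldp/zeroones/REP8_7f syndrome, the per-chunk NUL check
shrinks to one uminv reduction, a move to a general register and a cbnz,
which is presumably where the gain on long strings comes from.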

Comments

Wilco Dijkstra Oct. 22, 2019, 5:54 p.m. UTC | #1
Hi Xuelei,

> Optimize the strcpy implementation by using vector loads and operations
> in the main loop. Compared to the current aarch64/strcpy.S, it reduces the
> latency of bench-strcpy cases by 5% to 18% when the length of src is
> greater than 64 bytes, with gains throughout the benchmark.

This is OK. I tried it on a few microarchitectures, and it's either as fast or
faster on long strings.

Wilco

Patch

diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
index edc16252f68..290bcf8d236 100644
--- a/sysdeps/aarch64/strcpy.S
+++ b/sysdeps/aarch64/strcpy.S
@@ -53,6 +53,12 @@ 
 #define len		x16
 #define to_align	x17
 
+/* NEON registers.  */
+#define dataq		q2
+#define datav		v2
+#define datab2		b3
+#define datav2		v3
+
 #ifdef BUILD_STPCPY
 #define STRCPY __stpcpy
 #else
@@ -199,7 +205,6 @@  L(fp_lt2):
 #endif
 	ret
 
-	.p2align 6
 	/* Aligning here ensures that the entry code and main loop all lies
 	   within one 64-byte cache line.  */
 L(bulk_entry):
@@ -214,46 +219,36 @@  L(bulk_entry):
 	   especially on cores with a high number of issue slots per
 	   cycle, as we get much better parallelism out of the operations.  */
 L(main_loop):
-	stp	data1, data2, [dst], #16
+	str	dataq, [dst], #16
 L(entry_no_page_cross):
-	ldp	data1, data2, [src], #16
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, #REP8_7f
-	bic	has_nul1, tmp1, tmp2
-	bics	has_nul2, tmp3, tmp4
-	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
-	b.eq	L(main_loop)
+	ldr	dataq, [src], #16
+	uminv	datab2, datav.16b
+	mov	tmp3, datav2.d[0]
+	cbnz	tmp3, L(main_loop)
 
 	/* Since we know we are copying at least 16 bytes, the fastest way
 	   to deal with the tail is to determine the location of the
 	   trailing NUL, then (re)copy the 16 bytes leading up to that.  */
-	cmp	has_nul1, #0
 #ifdef __AARCH64EB__
-	/* For big-endian, carry propagation (if the final byte in the
-	   string is 0x01) means we cannot use has_nul directly.  The
-	   easiest way to get the correct byte is to byte-swap the data
-	   and calculate the syndrome a second time.  */
+	rev64	datav.16b, datav.16b
+#endif
+	/* Calculate the location of the trailing NUL.  */
+	cmeq	datav.16b, datav.16b, #0
+	mov	data1, datav.d[0]
+	mov	data2, datav.d[1]
+	cmp	data1, 0
 	csel	data1, data1, data2, ne
+	mov	pos, 8
 	rev	data1, data1
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	bic	has_nul1, tmp1, tmp2
-#else
-	csel	has_nul1, has_nul1, has_nul2, ne
-#endif
-	rev	has_nul1, has_nul1
-	clz	pos, has_nul1
-	add	tmp1, pos, #72
-	add	pos, pos, #8
-	csel	pos, pos, tmp1, ne
-	add	src, src, pos, lsr #3
-	add	dst, dst, pos, lsr #3
-	ldp	data1, data2, [src, #-32]
-	stp	data1, data2, [dst, #-16]
+	clz	tmp1, data1
+	csel	pos, xzr, pos, ne
+	add	pos, pos, tmp1, lsr 3
+	add	src, src, pos
+	add	dst, dst, pos
+	ldr	dataq, [src, #-31]
+	str	dataq, [dst, #-15]
 #ifdef BUILD_STPCPY
-	sub	dstin, dst, #1
+	mov	dstin, dst
 #endif
 	ret
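
For readers following the new tail sequence, here is a hedged C model of how
the cmeq/rev/clz arithmetic derives the byte offset of the trailing NUL
within the final 16-byte chunk (nul_offset is a hypothetical name, not part
of the patch, and the data is assumed to already be in little-endian byte
order, i.e. after the __AARCH64EB__ rev64):

#include <arm_neon.h>
#include <stdint.h>

/* Illustrative sketch only, not part of the patch.  Return the offset
   (0..15) of the first NUL byte in a chunk known to contain one.  */
static unsigned int
nul_offset (uint8x16_t datav)
{
  /* cmeq #0: bytes equal to zero become 0xff, all others 0x00.  */
  uint8x16_t eq = vceqzq_u8 (datav);
  uint64_t data1 = vgetq_lane_u64 (vreinterpretq_u64_u8 (eq), 0);
  uint64_t data2 = vgetq_lane_u64 (vreinterpretq_u64_u8 (eq), 1);

  /* csel: use the low half unless it is all-zero, in which case the NUL
     is in the high half and the offset starts at 8.  */
  unsigned int pos = data1 != 0 ? 0 : 8;
  uint64_t half = data1 != 0 ? data1 : data2;

  /* rev + clz: byte-reverse so the lowest byte becomes the most
     significant, count leading zero bits, then divide by 8 to get the
     byte index within the chosen half.  */
  return pos + (unsigned int) (__builtin_clzll (__builtin_bswap64 (half)) >> 3);
}

With pos in hand, src and dst are both advanced by it.  Because src has
already been post-incremented 16 bytes past the chunk while dst has not, the
16 bytes ending at the NUL are re-read from [src, #-31] and stored to
[dst, #-15], leaving dst pointing at the copied NUL terminator, which is
exactly what the mov dstin, dst needs for stpcpy.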