aarch64: Optimized strnlen for Kunpeng processor
diff mbox series

Message ID 20191017145301.6008-1-zhangxuelei4@huawei.com
State New
Headers show
Series
  • aarch64: Optimized strnlen for Kunpeng processor
Related show

Commit Message

Xuelei Zhang Oct. 17, 2019, 2:53 p.m. UTC
Optimize the strnlen implementation by using vector operations and
loop unrolling in the main loop. Compared to aarch64/strnlen.S, it
reduces latency of cases in bench-strnlen by 11%~24% when the length
of src is greater than 64 bytes, with gains throughout the benchmark.

Here is the result:
                    	simple_strnlen	__strnlen_kunpeng	__strnlen_generic
Length    1, alignment  0:	16.0938	12.3438	11.0938
Length    1, alignment  0:	11.25	12.0312	12.1875
Length    1, alignment  0:	12.9688	12.1875	12.0312
Length    2, alignment  0:	10.9375	12.3438	12.3438
Length    2, alignment  0:	12.5	12.6562	12.8125
Length    2, alignment  0:	14.5312	12.3438	12.1875
Length    3, alignment  0:	12.8125	12.3438	12.1875
Length    3, alignment  0:	13.5938	12.5	12.6562
Length    3, alignment  0:	14.8438	12.3438	12.1875
Length    4, alignment  0:	13.75	12.3438	12.3438
Length    4, alignment  0:	15	12.3438	12.3438
Length    4, alignment  0:	15.7812	12.3438	12.1875
Length    5, alignment  0:	14.2188	12.5	12.0312
Length    5, alignment  0:	15	12.3438	12.3438
Length    5, alignment  0:	16.7188	12.3438	12.5
Length    6, alignment  0:	14.6875	12.1875	12.0312
Length    6, alignment  0:	16.4062	12.3438	12.5
Length    6, alignment  0:	17.3438	12.3438	12.3438
Length    7, alignment  0:	15.4688	12.3438	12.3438
Length    7, alignment  0:	16.5625	12.3438	12.3438
Length    7, alignment  0:	17.5	12.3438	12.3438
Length    1, alignment  1:	10.7812	11.5625	11.0938
Length    1, alignment  1:	10.9375	13.4375	13.5938
Length    1, alignment  1:	12.5	13.4375	13.75
Length    2, alignment  2:	10.9375	13.5938	13.4375
Length    2, alignment  2:	12.6562	13.5938	13.2812
Length    2, alignment  2:	14.375	13.5938	13.9062
Length    3, alignment  3:	12.3438	13.5938	13.4375
Length    3, alignment  3:	24.0625	13.5938	13.5938
Length    3, alignment  3:	15.3125	14.0625	13.2812
Length    4, alignment  4:	23.2812	12.9688	12.6562
Length    4, alignment  4:	15.3125	13.4375	12.9688
Length    4, alignment  4:	16.25	13.125	13.125
Length    5, alignment  5:	14.6875	13.2812	13.125
Length    5, alignment  5:	15.625	13.2812	12.9688
Length    5, alignment  5:	17.3438	13.2812	13.125
Length    6, alignment  6:	14.8438	13.2812	12.8125
Length    6, alignment  6:	16.5625	13.125	12.6562
Length    6, alignment  6:	17.3438	13.125	12.9688
Length    7, alignment  7:	15.4688	13.2812	13.125
Length    7, alignment  7:	17.1875	13.125	13.125
Length    7, alignment  7:	18.5938	13.2812	12.9688
Length    4, alignment  0:	16.0938	12.3438	12.0312
Length    4, alignment  1:	15.3125	13.2812	13.125
Length    8, alignment  0:	19.0625	12.5	12.1875
Length    8, alignment  1:	18.125	13.4375	12.9688
Length   16, alignment  0:	25.3125	13.5938	14.2188
Length   16, alignment  1:	24.5312	14.5312	15.1562
Length   32, alignment  0:	37.3438	14.0625	16.875
Length   32, alignment  1:	36.5625	15.3125	17.5
Length   64, alignment  0:	67.5	17.1875	20.7812
Length   64, alignment  1:	67.6562	17.5	19.8438
Length  128, alignment  0:	117.031	20.4688	23.9062
Length  128, alignment  1:	117.344	22.3438	27.8125
Length  256, alignment  0:	215.312	30	33.9062
Length  256, alignment  1:	215.312	31.25	36.0938
Length  512, alignment  0:	412.031	44.0625	57.8125
Length  512, alignment  1:	412.656	46.5625	58.5938
Length 1024, alignment  0:	806.25	79.8438	102.031
Length 1024, alignment  1:	806.094	79.2188	101.875
Length    1, alignment  0:	12.6562	12.3438	12.3438
Length    2, alignment  0:	14.0625	11.7188	12.3438
Length    3, alignment  0:	14.6875	12.0312	12.0312
Length    4, alignment  0:	15.625	12.1875	11.875
Length    5, alignment  0:	16.25	12.3438	11.875
Length    6, alignment  0:	17.0312	12.3438	12.8125
Length    7, alignment  0:	17.5	12.0312	12.3438
Length    1, alignment  1:	12.5	13.5938	13.9062
Length    2, alignment  2:	13.75	13.4375	13.5938
Length    3, alignment  3:	14.375	13.75	13.4375
Length    4, alignment  4:	15.3125	13.2812	12.8125
Length    5, alignment  5:	16.25	13.125	12.8125
Length    6, alignment  6:	16.7188	13.5938	13.4375
Length    7, alignment  7:	17.6562	13.2812	12.9688
Length    4, alignment  0:	15.3125	12.6562	12.5
Length    4, alignment  1:	15.1562	13.2812	13.2812
Length    8, alignment  0:	18.4375	12.3438	12.6562
Length    8, alignment  1:	18.4375	13.2812	13.125
Length   16, alignment  0:	25	13.4375	14.0625
Length   16, alignment  1:	24.6875	14.0625	15
Length   32, alignment  0:	37.5	13.9062	14.5312
Length   32, alignment  1:	37.0312	14.8438	17.3438
Length   64, alignment  0:	67.8125	17.1875	18.2812
Length   64, alignment  1:	67.8125	17.3438	19.8438
Length  128, alignment  0:	117.031	21.25	23.9062
Length  128, alignment  1:	116.562	21.25	25
Length  256, alignment  0:	215.156	30.3125	34.0625
Length  256, alignment  1:	215.312	31.875	35.1562
Length  512, alignment  0:	411.719	44.2188	59.0625
Length  512, alignment  1:	412.031	46.0938	57.8125
Length 1024, alignment  0:	805.938	77.5	102.344
Length 1024, alignment  1:	805.625	79.5312	102.5
---
 sysdeps/aarch64/multiarch/Makefile          |   1 +
 sysdeps/aarch64/multiarch/ifunc-impl-list.c |   4 +
 sysdeps/aarch64/multiarch/strnlen.c         |  37 +++++
 sysdeps/aarch64/multiarch/strnlen_generic.S |  40 ++++++
 sysdeps/aarch64/multiarch/strnlen_kunpeng.S | 215 ++++++++++++++++++++++++++++
 sysdeps/aarch64/strnlen.S                   |  12 +-
 6 files changed, 305 insertions(+), 4 deletions(-)
 create mode 100644 sysdeps/aarch64/multiarch/strnlen.c
 create mode 100644 sysdeps/aarch64/multiarch/strnlen_generic.S
 create mode 100644 sysdeps/aarch64/multiarch/strnlen_kunpeng.S

Patch
diff mbox series

diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 4150b89a90..a9d163d20f 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -3,5 +3,6 @@  sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
 		   memcpy_falkor memmove_falkor \
 		   memset_generic memset_falkor memset_emag \
 		   memchr_generic memchr_nosimd \
+		   strnlen_generic strnlen_kunpeng \
 		   strlen_generic strlen_asimd
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index be13b916e5..1e253799a5 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -62,5 +62,9 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd)
 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_generic))
 
+  IFUNC_IMPL (i, name, strnlen,
+	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_kunpeng)
+	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_generic))
+
   return i;
 }
diff --git a/sysdeps/aarch64/multiarch/strnlen.c b/sysdeps/aarch64/multiarch/strnlen.c
new file mode 100644
index 0000000000..3c832de847
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/strnlen.c
@@ -0,0 +1,37 @@ 
+/* Multiple versions of strnlen. AARCH64 version.
+   Copyright (C) 2019-2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+/* Redefine strnlen so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in weak_alias, below.  */
+# define strnlen __redirect_strnlen
+# define __strnlen __redirect___strnlen
+# include <string.h>
+# include <init-arch.h>
+
+/* The two candidate implementations; both are assembly routines built
+   from strnlen_generic.S and strnlen_kunpeng.S.  */
+extern __typeof (__strnlen) __strnlen_generic attribute_hidden;
+extern __typeof (__strnlen) __strnlen_kunpeng attribute_hidden;
+# undef strnlen
+# undef __strnlen
+
+/* Resolve __strnlen to the Kunpeng routine when IS_KUNPENG matches the
+   CPU identification value, and to the generic routine otherwise.
+   NOTE(review): midr is presumably brought into scope by the IFUNC
+   machinery from init-arch.h -- confirm.  */
+libc_ifunc_redirected (__redirect___strnlen, __strnlen,
+           (IS_KUNPENG(midr) ? __strnlen_kunpeng : __strnlen_generic));
+
+/* Export the public name as a weak alias of the IFUNC symbol.
+   NOTE(review): ../strnlen.S, as included by strnlen_generic.S, also ends
+   with weak_alias (STRNLEN, strnlen); check that only one definition of
+   strnlen ends up in libc.  */
+weak_alias (__strnlen, strnlen);
+#endif
diff --git a/sysdeps/aarch64/multiarch/strnlen_generic.S b/sysdeps/aarch64/multiarch/strnlen_generic.S
new file mode 100644
index 0000000000..4b562bc3dd
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/strnlen_generic.S
@@ -0,0 +1,40 @@ 
+/* A Generic Optimized strnlen implementation for AARCH64.
+   Copyright (C) 2018-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* The actual strnlen code is in ../strnlen.S.  If we are building libc this
+   file defines __strnlen_generic.  Otherwise the include of ../strnlen.S
+   will define the normal __strnlen entry points.  */
+
+#include <sysdep.h>
+
+#if IS_IN (libc)
+
+# define STRNLEN __strnlen_generic
+
+/* Do not hide the generic version of strnlen, we use it internally.  */
+# undef libc_hidden_def
+# define libc_hidden_def(name)
+
+/* ../strnlen.S ends with weak_alias (STRNLEN, strnlen).  Inside libc the
+   public strnlen symbol is the IFUNC defined in strnlen.c, so suppress the
+   alias here; otherwise libc would contain two weak definitions of strnlen
+   and the linker could bind the public name directly to the generic code,
+   bypassing the IFUNC selection.  */
+# undef weak_alias
+# define weak_alias(name, alias)
+
+# ifdef SHARED
+/* Route libc-internal calls straight to the generic code, not via a PLT.  */
+	.globl __GI_strnlen; __GI_strnlen = STRNLEN
+	.globl __GI___strnlen; __GI___strnlen = STRNLEN
+# endif
+#endif
+
+#include "../strnlen.S"
+
diff --git a/sysdeps/aarch64/multiarch/strnlen_kunpeng.S b/sysdeps/aarch64/multiarch/strnlen_kunpeng.S
new file mode 100644
index 0000000000..a2be5fd1ec
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/strnlen_kunpeng.S
@@ -0,0 +1,215 @@ 
+/* Optimized strnlen for Huawei Kunpeng processor.
+
+   Copyright (C) 2013-2019 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+/* Arguments and results.  srcin and len both name x0: the incoming
+   pointer is read for the last time by the instruction that writes the
+   length into the same register.  */
+#define srcin		x0
+#define len		x0
+#define limit		x1
+
+/* Locals and temporaries.  src is the running (16-byte aligned) load
+   pointer; data2a holds the masked copy of data2 built in L(misaligned);
+   limit_wd counts the 16-byte blocks still allowed by limit.  */
+#define src		x2
+#define data1		x3
+#define data2		x4
+#define data2a		x5
+#define has_nul1	x6
+#define has_nul2	x7
+#define tmp1		x8
+#define tmp2		x9
+#define tmp3		x10
+#define tmp4		x11
+#define zeroones	x12
+#define pos		x13
+#define limit_wd	x14
+
+/* NEON register views: dataq/datav are the q/v views of vector register 2
+   (the 16 bytes just loaded); datab2/dataq2/datav2 are the b/q/v views of
+   vector register 3 (receives the per-block minimum byte).  */
+#define dataq		q2
+#define datav		v2
+#define datab2		b3
+#define dataq2		q3
+#define datav2		v3
+
+/* Byte-replicated constants for the scalar NUL-detection trick.  */
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* __strnlen_kunpeng: x0 = srcin (string pointer), x1 = limit.
+   Returns in x0 the offset of the first NUL byte found from srcin, or
+   limit when no NUL byte is found before the limit is reached.
+   The first 16-byte block is checked with general-purpose registers
+   (after masking any bytes that precede srcin); the remaining blocks are
+   scanned 16 bytes at a time with NEON in L(loop), unrolled twice.  */
+ENTRY_ALIGN_AND_PAD (__strnlen_kunpeng, 6, 9)
+	DELOUSE (0)
+	DELOUSE (1)
+	DELOUSE (2)
+	cbz	limit, L(hit_limit)
+	mov	zeroones, #REP8_01
+	bic	src, srcin, #15
+	ands	tmp1, srcin, #15
+	b.ne	L(misaligned)
+	/* Calculate the number of full and partial words -1.  */
+	sub	limit_wd, limit, #1	/* Limit != 0, so no underflow.  */
+	lsr	limit_wd, limit_wd, #4	/* Convert to Qwords.  */
+
+	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	   can be done in parallel across the entire word.  */
+	/* The first Qword is checked as two Dwords in general-purpose
+	   registers; this is also where L(misaligned) re-enters with the
+	   bytes before srcin masked out.  Every following Qword is checked
+	   with NEON in L(loop).  */
+
+	/* Start of critical section -- keep to one 64Byte cache line.  */
+	ldp	data1, data2, [src], #16
+L(realigned):
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, #REP8_7f
+	bic	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	subs	limit_wd, limit_wd, #1
+	orr	tmp1, has_nul1, has_nul2
+	/* If blocks remain (pl), Z = (no NUL seen); otherwise force Z clear
+	   so that we fall through to the exit code.  */
+	ccmp	tmp1, #0, #0, pl	/* NZCV = 0000  */
+	b.eq	L(loop)
+	/* End of critical section -- keep to one 64Byte cache line.  */
+
+	orr	tmp1, has_nul1, has_nul2
+	cbz	tmp1, L(hit_limit)	/* No null in final Qword.  */
+
+	/* We know there's a null in the final Qword.  The easiest thing
+	   to do now is work out the length of the string and return
+	   MIN (len, limit).  */
+
+	sub	len, src, srcin
+	cbz	has_nul1, L(nul_in_data2)
+#ifdef __AARCH64EB__
+	mov	data2, data1
+#endif
+	sub	len, len, #8
+	mov	has_nul2, has_nul1
+L(nul_in_data2):
+#ifdef __AARCH64EB__
+	/* For big-endian, carry propagation (if the final byte in the
+	   string is 0x01) means we cannot use has_nul directly.  The
+	   easiest way to get the correct byte is to byte-swap the data
+	   and calculate the syndrome a second time.  */
+	rev	data2, data2
+	sub	tmp1, data2, zeroones
+	orr	tmp2, data2, #REP8_7f
+	bic	has_nul2, tmp1, tmp2
+#endif
+	sub	len, len, #8
+	rev	has_nul2, has_nul2
+	clz	pos, has_nul2
+	add	len, len, pos, lsr #3		/* Bits to bytes.  */
+	cmp	len, limit
+	csel	len, len, limit, ls		/* Return the lower value.  */
+	RET
+
+	/* Main loop: two Qwords per iteration.  uminv yields the smallest
+	   byte of the Qword, which is zero iff the Qword contains a NUL.
+	   The loop exits with tmp1 == 0 when a NUL was found, or with
+	   tmp1 != 0 when the block count ran out first.  */
+L(loop):
+	ldr	dataq, [src], #16
+	uminv	datab2, datav.16b
+	mov	tmp1, datav2.d[0]
+	subs	limit_wd, limit_wd, #1
+	/* If blocks remain (pl), Z = (NUL found); otherwise force Z set so
+	   that we leave the loop.  */
+	ccmp	tmp1, #0, #4, pl	/* NZCV = 0100  */
+	b.eq	L(loop_end)
+	ldr	dataq, [src], #16
+	uminv	datab2, datav.16b
+	mov	tmp1, datav2.d[0]
+	subs	limit_wd, limit_wd, #1
+	ccmp	tmp1, #0, #4, pl	/* NZCV = 0100  */
+	b.ne	L(loop)
+L(loop_end):
+	/* End of critical section -- keep to one 64Byte cache line.  */
+
+	cbnz	tmp1, L(hit_limit)	/* No null in final Qword.  */
+
+	/* We know there's a null in the final Qword.  The easiest thing
+	   to do now is work out the length of the string and return
+	   MIN (len, limit).  */
+
+#ifdef __AARCH64EB__
+	rev64	datav.16b, datav.16b
+#endif
+	/* Set the NUL bytes to 0xff and the rest to 0x00, move the data into
+	   a pair of scalars and then compute the length from the earliest
+	   NUL byte.  */
+
+	cmeq	datav.16b, datav.16b, #0
+	mov	data1, datav.d[0]
+	mov	data2, datav.d[1]
+	cmp	data1, 0
+	/* Use the low half if it holds a NUL, else the high half.  The
+	   flags from this cmp stay live until the second csel below; none
+	   of the instructions in between set flags.  */
+	csel	data1, data1, data2, ne
+	sub	len, src, srcin
+	sub	len, len, #16		/* src was post-incremented past this Qword.  */
+	rev	data1, data1
+	add	tmp2, len, 8
+	clz	tmp1, data1
+	csel	len, len, tmp2, ne	/* NUL in high half: skip 8 more bytes.  */
+	add	len, len, tmp1, lsr 3
+	cmp	len, limit
+	csel	len, len, limit, ls		/* Return the lower value.  */
+	RET
+
+L(misaligned):
+	/* Deal with a partial first word.
+	   We're doing two things in parallel here;
+	   1) Calculate the number of words (but avoiding overflow if
+	      limit is near ULONG_MAX) - to do this we need to work out
+	      limit + tmp1 - 1 as a 65-bit value before shifting it;
+	   2) Load and mask the initial data words - we force the bytes
+	      before the ones we are interested in to 0xff - this ensures
+	      early bytes will not hit any zero detection.  */
+	sub	limit_wd, limit, #1
+	neg	tmp4, tmp1
+	/* Flags from this cmp are consumed by the csinv/csel at the end of
+	   the block; nothing in between sets flags.  */
+	cmp	tmp1, #8
+
+	and	tmp3, limit_wd, #15
+	lsr	limit_wd, limit_wd, #4
+	mov	tmp2, #~0
+
+	ldp	data1, data2, [src], #16
+	lsl	tmp4, tmp4, #3		/* Bytes beyond alignment -> bits.  */
+	add	tmp3, tmp3, tmp1
+
+#ifdef __AARCH64EB__
+	/* Big-endian.  Early bytes are at MSB.  */
+	lsl	tmp2, tmp2, tmp4	/* Shift by (tmp4 & 63) bits.  */
+#else
+	/* Little-endian.  Early bytes are at LSB.  */
+	lsr	tmp2, tmp2, tmp4	/* Shift by (tmp4 & 63) bits.  */
+#endif
+	add	limit_wd, limit_wd, tmp3, lsr #4
+
+	orr	data1, data1, tmp2
+	orr	data2a, data2, tmp2
+
+	/* Misalignment <= 8: keep the masked data1 and the untouched data2.
+	   Misalignment > 8: data1 becomes all ones and data2 is replaced by
+	   its masked copy.  */
+	csinv	data1, data1, xzr, le
+	csel	data2, data2, data2a, le
+	b	L(realigned)
+
+L(hit_limit):
+	mov	len, limit
+	RET
+END (__strnlen_kunpeng)
+weak_alias (__strnlen_kunpeng, strnlen_kunpeng)
+libc_hidden_builtin_def (strnlen_kunpeng)
diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
index 70283c8074..9a4dfbda15 100644
--- a/sysdeps/aarch64/strnlen.S
+++ b/sysdeps/aarch64/strnlen.S
@@ -25,6 +25,10 @@ 
  * ARMv8-a, AArch64
  */
 
+#ifndef STRNLEN
+# define STRNLEN __strnlen
+#endif
+
 /* Arguments and results.  */
 #define srcin		x0
 #define len		x0
@@ -49,7 +53,7 @@ 
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
 #define REP8_80 0x8080808080808080
 
-ENTRY_ALIGN_AND_PAD (__strnlen, 6, 9)
+ENTRY_ALIGN_AND_PAD (STRNLEN, 6, 9)
 	DELOUSE (0)
 	DELOUSE (1)
 	DELOUSE (2)
@@ -159,7 +163,7 @@  L(misaligned):
 L(hit_limit):
 	mov	len, limit
 	RET
-END (__strnlen)
-libc_hidden_def (__strnlen)
-weak_alias (__strnlen, strnlen)
+END (STRNLEN)
+libc_hidden_def (STRNLEN)
+weak_alias (STRNLEN, strnlen)
 libc_hidden_def (strnlen)