[v2] aarch64: Optimized strlen for strlen_asimd

Message ID 20191022094118.11468-1-zhangxuelei4@huawei.com
State: New
Series: [v2] aarch64: Optimized strlen for strlen_asimd

Commit Message

Xuelei Zhang Oct. 22, 2019, 9:41 a.m. UTC
Optimize the strlen implementation by using vector operations and
loop unrolling in the main loop. Compared to __strlen_generic, it
reduces the latency of cases in bench-strlen by 7% to 18% when the
length of src is greater than 128 bytes, with gains throughout the
benchmark.
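
For illustration only (not the committed code), the structure of the
unrolled main loop corresponds roughly to the C sketch below using NEON
intrinsics. The function name is made up, and page-cross handling plus
the integer check of the first 16 bytes are omitted:

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical sketch of the unrolled main loop.  Assumes the first
   16 bytes from srcin were already checked with integer arithmetic
   and contain no NUL.  Scans 32 bytes per iteration; vminvq_u8
   returns 0 iff a 16-byte chunk contains a NUL byte.  Aligned 16-byte
   loads cannot cross a page boundary, which is what makes the
   over-read safe in the assembly; in portable C this read past the
   end of the object is not strictly valid.  */
static size_t
strlen_main_loop_sketch (const char *srcin)
{
  const uint8_t *src =
    (const uint8_t *) (((uintptr_t) srcin & ~(uintptr_t) 15) + 16);
  for (;;)
    {
      if (vminvq_u8 (vld1q_u8 (src)) == 0)
	break;			/* NUL in this 16-byte chunk.  */
      if (vminvq_u8 (vld1q_u8 (src + 16)) == 0)
	{
	  src += 16;
	  break;
	}
      src += 32;		/* No NUL in either half.  */
    }
  /* A byte loop stands in for the SIMD tail of the real code.  */
  while (*src != '\0')
    src++;
  return (size_t) ((const char *) src - srcin);
}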

Here is the result:

Function: strlen
Variant:
                                    builtin_strlen	generic_strlen	memchr_strlen	__strlen_asimd(old)	__strlen_asimd(new)	__strlen_generic
=====================================================================================================================================================
               length=1, alignment=1:        20.00 (-64.10%)	       14.38 (-17.95%)	       16.25 (-33.33%)	       11.56 (  5.13%)	       11.72 (  3.85%)	       12.19
               length=1, alignment=0:        15.00 (-26.32%)	       12.66 ( -6.58%)	       16.09 (-35.53%)	       12.19 ( -2.63%)	       12.03 ( -1.32%)	       11.88
               length=2, alignment=2:        15.16 (-25.97%)	       14.06 (-16.88%)	       15.62 (-29.87%)	       12.03 (  0.00%)	       11.72 (  2.60%)	       12.03
               length=2, alignment=0:        14.53 (-20.78%)	       12.81 ( -6.49%)	       16.25 (-35.07%)	       12.66 ( -5.19%)	       12.03 (  0.00%)	       12.03
               length=3, alignment=3:        15.00 (-21.52%)	       14.38 (-16.46%)	       15.78 (-27.85%)	       12.03 (  2.53%)	       12.03 (  2.53%)	       12.34
               length=3, alignment=0:        14.53 (-24.00%)	       12.66 ( -8.00%)	       16.88 (-44.00%)	       12.19 ( -4.00%)	       12.03 ( -2.67%)	       11.72
               length=4, alignment=4:        14.69 (-23.68%)	       15.62 (-31.58%)	       16.25 (-36.84%)	       11.88 (  0.00%)	       11.72 (  1.32%)	       11.88
               length=4, alignment=0:        14.84 (-20.25%)	       12.66 ( -2.53%)	       16.72 (-35.44%)	       11.88 (  3.80%)	       12.19 (  1.27%)	       12.34
               length=5, alignment=5:        14.38 (-21.05%)	       14.84 (-25.00%)	       15.62 (-31.58%)	       11.88 (  0.00%)	       11.72 (  1.32%)	       11.88
               length=5, alignment=0:        14.84 (-21.80%)	       13.12 ( -7.69%)	       16.41 (-34.61%)	       12.03 (  1.28%)	       11.88 (  2.56%)	       12.19
               length=6, alignment=6:        14.69 (-25.33%)	       14.69 (-25.33%)	       15.78 (-34.67%)	       11.88 ( -1.33%)	       11.88 ( -1.33%)	       11.72
               length=6, alignment=0:        14.69 (-23.68%)	       13.28 (-11.84%)	       16.41 (-38.16%)	       12.66 ( -6.58%)	       12.34 ( -3.95%)	       11.88
               length=7, alignment=7:        14.84 (-23.38%)	       13.28 (-10.39%)	       15.78 (-31.17%)	       12.19 ( -1.30%)	       12.03 (  0.00%)	       12.03
               length=7, alignment=0:        14.53 (-19.23%)	       12.81 ( -5.13%)	       16.25 (-33.33%)	       12.03 (  1.28%)	       12.03 (  1.28%)	       12.19
               length=4, alignment=0:        14.69 (-25.33%)	       12.81 ( -9.33%)	       15.94 (-36.00%)	       11.72 (  0.00%)	       11.88 ( -1.33%)	       11.72
               length=4, alignment=7:        14.69 (-22.08%)	       13.28 (-10.39%)	       15.94 (-32.47%)	       12.03 (  0.00%)	       12.03 (  0.00%)	       12.03
               length=4, alignment=2:        15.00 (-28.00%)	       15.31 (-30.67%)	       16.09 (-37.33%)	       11.88 ( -1.33%)	       12.03 ( -2.67%)	       11.72
               length=2, alignment=2:        14.69 (-23.68%)	       14.06 (-18.42%)	       15.78 (-32.89%)	       12.03 ( -1.32%)	       12.03 ( -1.32%)	       11.88
               length=8, alignment=0:        14.84 (-26.67%)	       14.53 (-24.00%)	       16.09 (-37.33%)	       12.03 ( -2.67%)	       11.72 (  0.00%)	       11.72
               length=8, alignment=7:        14.22 (-19.74%)	       12.97 ( -9.21%)	       15.94 (-34.21%)	       12.03 ( -1.32%)	       11.72 (  1.32%)	       11.88
               length=8, alignment=3:        14.84 (-25.00%)	       17.19 (-44.74%)	       15.78 (-32.89%)	       11.88 (  0.00%)	       11.72 (  1.32%)	       11.88
               length=5, alignment=3:        15.00 (-24.68%)	       15.16 (-25.97%)	       15.94 (-32.47%)	       11.88 (  1.30%)	       12.03 (  0.00%)	       12.03
              length=16, alignment=0:        16.41 (-17.98%)	       15.47 (-11.24%)	       16.09 (-15.73%)	       12.19 ( 12.36%)	       13.59 (  2.25%)	       13.91
              length=16, alignment=7:        16.25 (-14.29%)	       15.62 ( -9.89%)	       16.09 (-13.19%)	       12.34 ( 13.19%)	       13.44 (  5.49%)	       14.22
              length=16, alignment=4:        16.09 (-17.05%)	       17.19 (-25.00%)	       15.62 (-13.64%)	       12.03 ( 12.50%)	       13.59 (  1.14%)	       13.75
              length=10, alignment=4:        15.31 (-27.27%)	       16.41 (-36.36%)	       15.78 (-31.17%)	       11.88 (  1.30%)	       12.50 ( -3.90%)	       12.03
              length=32, alignment=0:        15.94 ( -5.15%)	       18.28 (-20.62%)	       18.59 (-22.68%)	       14.22 (  6.18%)	       13.44 ( 11.34%)	       15.16
              length=32, alignment=7:        15.16 ( -4.30%)	       18.44 (-26.88%)	       17.19 (-18.28%)	       12.81 ( 11.83%)	       13.12 (  9.68%)	       14.53
              length=32, alignment=5:        15.31 ( -7.69%)	       20.94 (-47.25%)	       16.41 (-15.38%)	       12.34 ( 13.19%)	       12.81 (  9.89%)	       14.22
              length=21, alignment=5:        16.09 (-17.05%)	       18.28 (-32.95%)	       15.94 (-15.91%)	       12.03 ( 12.50%)	       13.12 (  4.55%)	       13.75
              length=64, alignment=0:        18.59 ( -4.39%)	       23.12 (-29.82%)	       19.22 ( -7.90%)	       15.62 ( 12.28%)	       15.94 ( 10.53%)	       17.81
              length=64, alignment=7:        18.12 (-10.48%)	       23.91 (-45.71%)	       19.69 (-20.00%)	       14.69 ( 10.48%)	       14.53 ( 11.43%)	       16.41
              length=64, alignment=6:        17.19 ( -1.85%)	       23.12 (-37.04%)	       24.06 (-42.59%)	       14.69 ( 12.96%)	       14.53 ( 13.89%)	       16.88
              length=42, alignment=6:        18.91 (-16.35%)	       20.16 (-24.04%)	       17.19 ( -5.77%)	       14.06 ( 13.46%)	       15.94 (  1.92%)	       16.25
             length=128, alignment=0:        21.09 (  4.25%)	       32.81 (-48.94%)	       21.72 (  1.42%)	       19.22 ( 12.77%)	       19.22 ( 12.77%)	       22.03
             length=128, alignment=7:        19.38 ( 10.14%)	       32.66 (-51.45%)	       21.72 ( -0.72%)	       19.22 ( 10.87%)	       18.44 ( 14.49%)	       21.56
             length=128, alignment=7:        18.75 ( 12.41%)	       31.09 (-45.26%)	       19.69 (  8.03%)	       19.22 ( 10.22%)	       18.44 ( 13.87%)	       21.41
              length=85, alignment=7:        21.72 (-17.80%)	       26.56 (-44.07%)	       24.22 (-31.36%)	       17.03 (  7.63%)	       16.56 ( 10.17%)	       18.44
             length=256, alignment=0:        30.16 (  3.50%)	       64.22 (-105.50%)	       25.94 ( 17.00%)	       26.88 ( 14.00%)	       26.56 ( 15.00%)	       31.25
             length=256, alignment=7:        28.75 (  7.07%)	       51.25 (-65.66%)	       28.75 (  7.07%)	       27.19 ( 12.12%)	       27.66 ( 10.61%)	       30.94
             length=256, alignment=8:        29.06 (  5.58%)	       65.47 (-112.69%)	       25.62 ( 16.75%)	       27.03 ( 12.18%)	       27.81 (  9.64%)	       30.78
             length=170, alignment=8:        24.53 (  4.85%)	       38.28 (-48.48%)	       22.66 ( 12.12%)	       23.59 (  8.48%)	       22.19 ( 13.94%)	       25.78
             length=512, alignment=0:        45.47 (  9.91%)	       94.22 (-86.69%)	       37.50 ( 25.70%)	       43.75 ( 13.31%)	       43.44 ( 13.93%)	       50.47
             length=512, alignment=7:        44.84 ( 10.03%)	       94.22 (-89.03%)	       38.28 ( 23.20%)	       43.91 ( 11.91%)	       44.06 ( 11.60%)	       49.84
             length=512, alignment=9:        44.53 ( 11.49%)	       97.03 (-92.86%)	       37.97 ( 24.53%)	       43.44 ( 13.66%)	       43.91 ( 12.73%)	       50.31
             length=341, alignment=9:        35.94 (  8.37%)	       71.72 (-82.87%)	       30.62 ( 21.91%)	       32.19 ( 17.93%)	       34.38 ( 12.35%)	       39.22
            length=1024, alignment=0:        78.75 ( 11.27%)	      168.28 (-89.61%)	       61.09 ( 31.16%)	      103.12 (-16.20%)	       76.41 ( 13.91%)	       88.75
            length=1024, alignment=7:        76.88 ( 11.83%)	      168.28 (-93.01%)	       62.03 ( 28.85%)	      105.94 (-21.51%)	       77.50 ( 11.11%)	       87.19
           length=1024, alignment=10:        77.81 ( 11.23%)	      170.78 (-94.83%)	       61.88 ( 29.41%)	      102.66 (-17.11%)	       77.66 ( 11.41%)	       87.66
            length=682, alignment=10:        60.31 (  9.18%)	      125.94 (-89.65%)	       45.31 ( 31.76%)	       55.16 ( 16.94%)	       58.44 ( 12.00%)	       66.41
            length=2048, alignment=0:       145.94 ( 13.84%)	      316.09 (-86.62%)	      110.78 ( 34.59%)	      143.59 ( 15.22%)	      144.69 ( 14.58%)	      169.38
            length=2048, alignment=7:       145.31 ( 16.44%)	      316.09 (-81.76%)	      111.09 ( 36.12%)	      144.53 ( 16.89%)	      143.28 ( 17.61%)	      173.91
           length=2048, alignment=11:       144.84 ( 16.86%)	      319.38 (-83.32%)	      111.25 ( 36.14%)	      144.38 ( 17.13%)	      143.59 ( 17.58%)	      174.22
           length=1365, alignment=11:       101.41 ( 17.01%)	      221.41 (-81.20%)	       78.59 ( 35.68%)	      100.94 ( 17.39%)	      100.78 ( 17.52%)	      122.19
            length=4096, alignment=0:       280.00 ( 10.62%)	      617.19 (-97.01%)	      221.88 ( 29.18%)	      301.41 (  3.79%)	      278.44 ( 11.12%)	      313.28
            length=4096, alignment=7:       283.75 ( 12.61%)	      618.44 (-90.47%)	      208.12 ( 35.90%)	      292.34 (  9.96%)	      277.81 ( 14.44%)	      324.69
           length=4096, alignment=12:       283.59 ( 12.87%)	      621.25 (-90.88%)	      208.12 ( 36.05%)	      293.75 (  9.75%)	      277.34 ( 14.79%)	      325.47
           length=2730, alignment=12:       202.66 (  8.85%)	      424.06 (-90.72%)	      142.34 ( 35.98%)	      203.91 (  8.29%)	      201.88 (  9.21%)	      222.34
---
 sysdeps/aarch64/multiarch/strlen.c       |  4 ++-
 sysdeps/aarch64/multiarch/strlen_asimd.S | 42 ++++++++++++++++++++------------
 2 files changed, 29 insertions(+), 17 deletions(-)

Comments

Wilco Dijkstra Oct. 22, 2019, 4:33 p.m. UTC | #1
Hi Xuelei,

> Optimize the strlen implementation by using vector operations and
> loop unrolling in the main loop. Compared to __strlen_generic, it
> reduces the latency of cases in bench-strlen by 7% to 18% when the
> length of src is greater than 128 bytes, with gains throughout the
> benchmark.

This is a good improvement, OK to commit. Also, given that it uses
integer arithmetic for the first 16 bytes, it can never be worse than
the generic variant for small inputs.

Wilco

diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c
index 1db01babeec..abf6513eeea 100644
--- a/sysdeps/aarch64/multiarch/strlen.c
+++ b/sysdeps/aarch64/multiarch/strlen.c
@@ -34,7 +34,9 @@ extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden;
 
 libc_ifunc (__strlen,
-	    (USE_ASIMD_STRLEN () ? __strlen_asimd : __strlen_generic));
+	    (USE_ASIMD_STRLEN () || IS_KUNPENG(midr)
+	    ? __strlen_asimd
+	    :__strlen_generic));
 
 # undef strlen
 strong_alias (__strlen, strlen);
diff --git a/sysdeps/aarch64/multiarch/strlen_asimd.S b/sysdeps/aarch64/multiarch/strlen_asimd.S
index 1d1c6abb825..1de6cd3a173 100644
--- a/sysdeps/aarch64/multiarch/strlen_asimd.S
+++ b/sysdeps/aarch64/multiarch/strlen_asimd.S
@@ -48,6 +48,9 @@
 #define dataq2		q3
 #define datav2		v3
 
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
 #ifdef TEST_PAGE_CROSS
 # define MIN_PAGE_SIZE 16
 #else
@@ -82,40 +85,47 @@ ENTRY_ALIGN (__strlen_asimd, 6)
 	DELOUSE (0)
 	DELOUSE (1)
 	and	tmp1, srcin, MIN_PAGE_SIZE - 1
+	mov	zeroones, REP8_01
 	cmp	tmp1, MIN_PAGE_SIZE - 16
 	b.gt	L(page_cross)
-	ldr	dataq, [srcin]
+	ldp	data1, data2, [srcin]
 #ifdef __AARCH64EB__
-	rev64	datav.16b, datav.16b
+	rev	data1, data1
+	rev	data2, data2
 #endif
 
-	/* Get the minimum value and keep going if it is not zero.  */
-	uminv	datab2, datav.16b
-	mov	tmp1, datav2.d[0]
-	cbnz	tmp1, L(main_loop_entry)
-
-	cmeq	datav.16b, datav.16b, #0
-	mov	data1, datav.d[0]
-	mov	data2, datav.d[1]
-	cmp	data1, 0
-	csel	data1, data1, data2, ne
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(main_loop_entry)
+	csel	has_nul1, has_nul1, has_nul2, cc
 	mov	len, 8
-	rev	data1, data1
-	clz	tmp1, data1
-	csel	len, xzr, len, ne
+	rev	has_nul1, has_nul1
+	clz	tmp1, has_nul1
+	csel	len, xzr, len, cc
 	add	len, len, tmp1, lsr 3
 	ret
 
 L(main_loop_entry):
 	bic	src, srcin, 15
+	sub	src, src, 16
 
 L(main_loop):
-	ldr	dataq, [src, 16]!
+	ldr	dataq, [src, 32]!
 L(page_cross_entry):
 	/* Get the minimum value and keep going if it is not zero.  */
 	uminv	datab2, datav.16b
 	mov	tmp1, datav2.d[0]
+	cbz	tmp1, L(tail)
+	ldr	dataq, [src, 16]
+	uminv	datab2, datav.16b
+	mov	tmp1, datav2.d[0]
 	cbnz	tmp1, L(main_loop)
+	add	src, src, 16
 
 L(tail):
 #ifdef __AARCH64EB__

OK
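
The integer check referred to above is the classic zero-byte bit trick
that the patch encodes with REP8_01/REP8_7f and the sub/orr/bics
sequence; a minimal C rendering (the helper name is illustrative):

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* Nonzero iff x contains a zero byte.  Subtracting REP8_01 borrows
   through the 0x80 bit of a byte only when that byte is zero (or sits
   above a zero byte), and ~(x | REP8_7f) keeps just the 0x80 bits of
   bytes that were <= 0x7f.  Bits above the first zero byte may be
   spurious, but the lowest set bit always marks the first NUL, which
   is all the rev/clz sequence in the patch needs.  */
static inline uint64_t
has_nul (uint64_t x)
{
  return (x - REP8_01) & ~(x | REP8_7f);
}
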
Adhemerval Zanella Dec. 19, 2019, 7:44 p.m. UTC | #2
On 22/10/2019 13:33, Wilco Dijkstra wrote:
> Hi Xuelei,
> 
>> Optimize the strlen implementation by using vector operations and
>> loop unrolling in the main loop. Compared to __strlen_generic, it
>> reduces the latency of cases in bench-strlen by 7% to 18% when the
>> length of src is greater than 128 bytes, with gains throughout the
>> benchmark.
> 
> This is a good improvement, OK to commit. Also, given that it uses
> integer arithmetic for the first 16 bytes, it can never be worse than
> the generic variant for small inputs.
> 
> Wilco

I pushed it upstream.

Patch

diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c
index 1db01babeec..abf6513eeea 100644
--- a/sysdeps/aarch64/multiarch/strlen.c
+++ b/sysdeps/aarch64/multiarch/strlen.c
@@ -34,7 +34,9 @@  extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden;
 
 libc_ifunc (__strlen,
-	    (USE_ASIMD_STRLEN () ? __strlen_asimd : __strlen_generic));
+	    (USE_ASIMD_STRLEN () || IS_KUNPENG(midr)
+	    ? __strlen_asimd
+	    :__strlen_generic));
 
 # undef strlen
 strong_alias (__strlen, strlen);
diff --git a/sysdeps/aarch64/multiarch/strlen_asimd.S b/sysdeps/aarch64/multiarch/strlen_asimd.S
index 1d1c6abb825..1de6cd3a173 100644
--- a/sysdeps/aarch64/multiarch/strlen_asimd.S
+++ b/sysdeps/aarch64/multiarch/strlen_asimd.S
@@ -48,6 +48,9 @@ 
 #define dataq2		q3
 #define datav2		v3
 
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
 #ifdef TEST_PAGE_CROSS
 # define MIN_PAGE_SIZE 16
 #else
@@ -82,40 +85,47 @@  ENTRY_ALIGN (__strlen_asimd, 6)
 	DELOUSE (0)
 	DELOUSE (1)
 	and	tmp1, srcin, MIN_PAGE_SIZE - 1
+	mov	zeroones, REP8_01
 	cmp	tmp1, MIN_PAGE_SIZE - 16
 	b.gt	L(page_cross)
-	ldr	dataq, [srcin]
+	ldp	data1, data2, [srcin]
 #ifdef __AARCH64EB__
-	rev64	datav.16b, datav.16b
+	rev	data1, data1
+	rev	data2, data2
 #endif
 
-	/* Get the minimum value and keep going if it is not zero.  */
-	uminv	datab2, datav.16b
-	mov	tmp1, datav2.d[0]
-	cbnz	tmp1, L(main_loop_entry)
-
-	cmeq	datav.16b, datav.16b, #0
-	mov	data1, datav.d[0]
-	mov	data2, datav.d[1]
-	cmp	data1, 0
-	csel	data1, data1, data2, ne
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(main_loop_entry)
+	csel	has_nul1, has_nul1, has_nul2, cc
 	mov	len, 8
-	rev	data1, data1
-	clz	tmp1, data1
-	csel	len, xzr, len, ne
+	rev	has_nul1, has_nul1
+	clz	tmp1, has_nul1
+	csel	len, xzr, len, cc
 	add	len, len, tmp1, lsr 3
 	ret
 
 L(main_loop_entry):
 	bic	src, srcin, 15
+	sub	src, src, 16
 
 L(main_loop):
-	ldr	dataq, [src, 16]!
+	ldr	dataq, [src, 32]!
 L(page_cross_entry):
 	/* Get the minimum value and keep going if it is not zero.  */
 	uminv	datab2, datav.16b
 	mov	tmp1, datav2.d[0]
+	cbz	tmp1, L(tail)
+	ldr	dataq, [src, 16]
+	uminv	datab2, datav.16b
+	mov	tmp1, datav2.d[0]
 	cbnz	tmp1, L(main_loop)
+	add	src, src, 16
 
 L(tail):
 #ifdef __AARCH64EB__
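
For reference, the rev/clz/lsr 3 sequence in the first hunk converts
the has_nul mask into a byte index; a hypothetical C equivalent of that
step (little-endian):

#include <stddef.h>
#include <stdint.h>

/* Pick the half containing the NUL (the csel), byte-reverse the mask
   so the first flagged byte becomes the most significant (rev), count
   leading zeros (clz), and divide by 8 (lsr 3).  AArch64 has no
   count-trailing-zeros instruction on general registers, hence the
   reverse-then-clz idiom.  At least one mask must be nonzero.  */
static size_t
first_nul_index (uint64_t has_nul1, uint64_t has_nul2)
{
  uint64_t m = has_nul1 ? has_nul1 : has_nul2;
  size_t len = has_nul1 ? 0 : 8;	/* Which 8-byte half.  */
  return len + (__builtin_clzll (__builtin_bswap64 (m)) >> 3);
}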