diff mbox

powerpc: Improve strcmp performance for shorter strings

Message ID 1486360733-32462-1-git-send-email-raji@linux.vnet.ibm.com
State New
Headers show

Commit Message

Rajalakshmi Srinivasaraghavan Feb. 6, 2017, 5:58 a.m. UTC
For strings >16B and <32B, the existing algorithm takes more time than the
default implementation when strings are placed close to the end of a page.
This is due to byte-by-byte access for handling the page cross. This is
improved by following the >32B code path, where the address is adjusted to
aligned memory before doing the load doubleword operation instead of loading bytes.

Tested on powerpc64 and powerpc64le.

2017-02-04  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>

	* sysdeps/powerpc/powerpc64/power8/strcmp.S: Adjust address for
	unaligned load for shorter strings.
	* sysdeps/powerpc/powerpc64/power9/strcmp.S: Likewise.
---
 sysdeps/powerpc/powerpc64/power8/strcmp.S | 30 ++++++++----------------------
 sysdeps/powerpc/powerpc64/power9/strcmp.S | 30 ++++++++----------------------
 2 files changed, 16 insertions(+), 44 deletions(-)
diff mbox

Patch

diff --git a/sysdeps/powerpc/powerpc64/power8/strcmp.S b/sysdeps/powerpc/powerpc64/power8/strcmp.S
index c34ff4a..d46bff8 100644
--- a/sysdeps/powerpc/powerpc64/power8/strcmp.S
+++ b/sysdeps/powerpc/powerpc64/power8/strcmp.S
@@ -30,21 +30,21 @@ 
 EALIGN (strcmp, 4, 0)
 	li	r0,0
 
-	/* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using
+	/* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
 	   the code:
 
 	    (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
 
-	   with PAGE_SIZE being 4096 and ITER_SIZE begin 32.  */
+	   with PAGE_SIZE being 4096 and ITER_SIZE being 16.  */
 
 	rldicl	r7,r3,0,52
 	rldicl	r9,r4,0,52
-	cmpldi	cr7,r7,4096-32
+	cmpldi	cr7,r7,4096-16
 	bgt	cr7,L(pagecross_check)
-	cmpldi	cr5,r9,4096-32
+	cmpldi	cr5,r9,4096-16
 	bgt	cr5,L(pagecross_check)
 
-	/* For short string up to 32 bytes, load both s1 and s2 using
+	/* For short string up to 16 bytes, load both s1 and s2 using
 	   unaligned dwords and compare.  */
 	ld	r8,0(r3)
 	ld	r10,0(r4)
@@ -60,25 +60,11 @@  EALIGN (strcmp, 4, 0)
 	orc.	r9,r12,r11
 	bne	cr0,L(different_nocmpb)
 
-	ld	r8,16(r3)
-	ld	r10,16(r4)
-	cmpb	r12,r8,r0
-	cmpb	r11,r8,r10
-	orc.	r9,r12,r11
-	bne	cr0,L(different_nocmpb)
-
-	ld	r8,24(r3)
-	ld	r10,24(r4)
-	cmpb	r12,r8,r0
-	cmpb	r11,r8,r10
-	orc.	r9,r12,r11
-	bne	cr0,L(different_nocmpb)
-
-	addi	r7,r3,32
-	addi	r4,r4,32
+	addi	r7,r3,16
+	addi	r4,r4,16
 
 L(align_8b):
-	/* Now it has checked for first 32 bytes, align source1 to doubleword
+	/* Now it has checked for first 16 bytes, align source1 to doubleword
 	   and adjust source2 address.  */
 	rldicl	r9,r7,0,61	/* source1 alignment to doubleword  */
 	subf	r4,r9,r4	/* Adjust source2 address based on source1
diff --git a/sysdeps/powerpc/powerpc64/power9/strcmp.S b/sysdeps/powerpc/powerpc64/power9/strcmp.S
index 3e32396..17ec8c2 100644
--- a/sysdeps/powerpc/powerpc64/power9/strcmp.S
+++ b/sysdeps/powerpc/powerpc64/power9/strcmp.S
@@ -65,21 +65,21 @@ 
 EALIGN (strcmp, 4, 0)
 	li	r0, 0
 
-	/* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using
+	/* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
 	   the code:
 
 	    (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
 
-	   with PAGE_SIZE being 4096 and ITER_SIZE begin 32.  */
+	   with PAGE_SIZE being 4096 and ITER_SIZE being 16.  */
 
 	rldicl	r7, r3, 0, 52
 	rldicl	r9, r4, 0, 52
-	cmpldi	cr7, r7, 4096-32
+	cmpldi	cr7, r7, 4096-16
 	bgt	cr7, L(pagecross_check)
-	cmpldi	cr5, r9, 4096-32
+	cmpldi	cr5, r9, 4096-16
 	bgt	cr5, L(pagecross_check)
 
-	/* For short strings up to 32 bytes,  load both s1 and s2 using
+	/* For short strings up to 16 bytes,  load both s1 and s2 using
 	   unaligned dwords and compare.  */
 	ld	r8, 0(r3)
 	ld	r10, 0(r4)
@@ -95,25 +95,11 @@  EALIGN (strcmp, 4, 0)
 	orc.	r9, r12, r11
 	bne	cr0, L(different_nocmpb)
 
-	ld	r8, 16(r3)
-	ld	r10, 16(r4)
-	cmpb	r12, r8, r0
-	cmpb	r11, r8, r10
-	orc.	r9, r12, r11
-	bne	cr0, L(different_nocmpb)
-
-	ld	r8, 24(r3)
-	ld	r10, 24(r4)
-	cmpb	r12, r8, r0
-	cmpb	r11, r8, r10
-	orc.	r9, r12, r11
-	bne	cr0, L(different_nocmpb)
-
-	addi	r7, r3, 32
-	addi	r4, r4, 32
+	addi	r7, r3, 16
+	addi	r4, r4, 16
 
 L(align):
-	/* Now it has checked for first 32 bytes.  */
+	/* Now it has checked for first 16 bytes.  */
 	vspltisb	v0, 0
 	vspltisb	v2, -1
 	lvsr	v6, 0, r4   /* Compute mask.  */