
[1/2] powerpc: Add 64bit optimised memcmp

Message ID 1420768591-6831-1-git-send-email-anton@samba.org (mailing list archive)
State Superseded

Commit Message

Anton Blanchard Jan. 9, 2015, 1:56 a.m. UTC
I noticed ksm spending quite a lot of time in memcmp on a large
KVM box. The current memcmp loop is very unoptimised - byte at a
time compares with no loop unrolling. We can do much much better.

Optimise the loop in a few ways:

- Unroll the byte at a time loop

- For large (at least 32 byte) comparisons that are also 8 byte
  aligned, use an unrolled modulo scheduled loop using 8 byte
  loads. This is similar to our glibc memcmp.

A simple microbenchmark testing 10000000 iterations of an 8192 byte
memcmp was used to measure the performance:

baseline:	29.93 s

modified:	 1.70 s

Just over 17x faster.

Signed-off-by: Anton Blanchard <anton@samba.org>
---
 arch/powerpc/lib/Makefile    |   3 +-
 arch/powerpc/lib/memcmp_64.S | 233 +++++++++++++++++++++++++++++++++++++++++++
 arch/powerpc/lib/string.S    |   2 +
 3 files changed, 237 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/lib/memcmp_64.S
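
For reference, a minimal userspace sketch of the kind of microbenchmark described in the commit message (10,000,000 iterations of an 8192 byte memcmp). The buffer contents, the timing method, and the fact that it exercises the libc memcmp rather than the in-kernel routine are assumptions, not details taken from the patch:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <time.h>

	#define SIZE  8192
	#define ITERS 10000000UL

	int main(void)
	{
		/* Two identical buffers so memcmp walks the full length. */
		char *a = malloc(SIZE);
		char *b = malloc(SIZE);
		memset(a, 0x5a, SIZE);
		memset(b, 0x5a, SIZE);

		struct timespec start, end;
		clock_gettime(CLOCK_MONOTONIC, &start);

		/* volatile sink so the compares are not optimised away. */
		volatile int result = 0;
		for (unsigned long i = 0; i < ITERS; i++)
			result |= memcmp(a, b, SIZE);

		clock_gettime(CLOCK_MONOTONIC, &end);
		printf("%.2f s (result %d)\n",
		       (end.tv_sec - start.tv_sec) +
		       (end.tv_nsec - start.tv_nsec) / 1e9, result);

		free(a);
		free(b);
		return 0;
	}

The absolute numbers from a userspace run will of course differ from the in-kernel figures quoted above.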

Comments

David Laight Jan. 9, 2015, 10:06 a.m. UTC | #1
From: Anton Blanchard
> I noticed ksm spending quite a lot of time in memcmp on a large
> KVM box. The current memcmp loop is very unoptimised - byte at a
> time compares with no loop unrolling. We can do much much better.
>
> Optimise the loop in a few ways:
>
> - Unroll the byte at a time loop
>
> - For large (at least 32 byte) comparisons that are also 8 byte
>   aligned, use an unrolled modulo scheduled loop using 8 byte
>   loads. This is similar to our glibc memcmp.
>
> A simple microbenchmark testing 10000000 iterations of an 8192 byte
> memcmp was used to measure the performance:
>
> baseline:	29.93 s
>
> modified:	 1.70 s
>
> Just over 17x faster.

The unrolled loop (deleted) looks excessive.
On a modern cpu with multiple execution units you can usually
manage to get the loop overhead to execute in parallel to the
actual 'work'.
So I suspect that a much simpler 'word at a time' loop will be
almost as fast - especially in the case where the code isn't
already in the cache and the compare is relatively short.
Try something based on:
	a1 = *a++;
	b1 = *b++;
	while {
		a2 = *a++;
		b2 = *b++;
		if (a1 != a2)
			break;
		a1 = *a++;
		b1 = *b++;
	} while (a2 != a1);

	David
Adhemerval Zanella Jan. 9, 2015, 11:01 a.m. UTC | #2
On 08-01-2015 23:56, Anton Blanchard wrote:
> I noticed ksm spending quite a lot of time in memcmp on a large
> KVM box. The current memcmp loop is very unoptimised - byte at a
> time compares with no loop unrolling. We can do much much better.
>
> Optimise the loop in a few ways:
>
> - Unroll the byte at a time loop
>
> - For large (at least 32 byte) comparisons that are also 8 byte
>   aligned, use an unrolled modulo scheduled loop using 8 byte
>   loads. This is similar to our glibc memcmp.
>
> A simple microbenchmark testing 10000000 iterations of an 8192 byte
> memcmp was used to measure the performance:
>
> baseline:	29.93 s
>
> modified:	 1.70 s
>
> Just over 17x faster.
>
> Signed-off-by: Anton Blanchard <anton@samba.org>
>
Why not use the glibc implementations instead? All of them (ppc64, power4, and
power7) avoid byte-at-a-time compares for unaligned inputs, while showing the
same performance as this new implementation for the aligned case.
To give you an example, an 8192 byte compare with input alignment of 63/18
shows:

__memcmp_power7:  320 cycles
__memcmp_power4:  320 cycles
__memcmp_ppc64:   340 cycles
this memcmp:     3185 cycles
Anton Blanchard Jan. 12, 2015, 12:55 a.m. UTC | #3
Hi David,

> The unrolled loop (deleted) looks excessive.
> On a modern cpu with multiple execution units you can usually
> manage to get the loop overhead to execute in parallel to the
> actual 'work'.
> So I suspect that a much simpler 'word at a time' loop will be
> almost as fast - especially in the case where the code isn't
> already in the cache and the compare is relatively short.

I'm always keen to keep things as simple as possible, but your loop is
over 50% slower. Once the loop hits a steady state you are going to run
into front end issues with instruction fetch on POWER8.

Anton

> Try something based on:
> 	a1 = *a++;
> 	b1 = *b++;
> 	while {
> 		a2 = *a++;
> 		b2 = *b++;
> 		if (a1 != a2)
> 			break;
> 		a1 = *a++;
> 		b1 = *b++;
> 	} while (a2 != a1);
> 
> 	David
>
Joakim Tjernlund Jan. 12, 2015, 6:55 a.m. UTC | #4
On Mon, 2015-01-12 at 11:55 +1100, Anton Blanchard wrote:
> Hi David,
> 
> > The unrolled loop (deleted) looks excessive.
> > On a modern cpu with multiple execution units you can usually
> > manage to get the loop overhead to execute in parallel to the
> > actual 'work'.
> > So I suspect that a much simpler 'word at a time' loop will be almost as fast - especially in the case where the code isn't
> > already in the cache and the compare is relatively short.
> 
> I'm always keen to keep things as simple as possible, but your loop is over 50% slower. Once the loop hits a steady state you are going to run into front end issues with instruction fetch on POWER8.
> 

Out of curiosity, does preincrement make any difference (or can gcc do that for you nowadays)?

         a1 = *a;
         b1 = *b;
         while {
                 a2 = *++a;
                 b2 = *++b;
                 if (a1 != a2)
                 break;
                 a1 = *++a;
                 b1 = *++b;
         } while (a2 != a1);

 Jocke

> Anton
> 
> > Try something based on:
> >         a1 = *a++;
> >         b1 = *b++;
> >         while {
> >                 a2 = *a++;
> >                 b2 = *b++;
> >                 if (a1 != a2)
> >                 break;
> >                 a1 = *a++;
> >                 b1 = *b++;
> >         } while (a2 != a1);
> > 
> >         David
> > 
> 
David Laight Jan. 12, 2015, 9:45 a.m. UTC | #5
From: Joakim Tjernlund 
> On Mon, 2015-01-12 at 11:55 +1100, Anton Blanchard wrote:
> > Hi David,
> >
> > > The unrolled loop (deleted) looks excessive.
> > > On a modern cpu with multiple execution units you can usually
> > > manage to get the loop overhead to execute in parallel to the
> > > actual 'work'.
> > > So I suspect that a much simpler 'word at a time' loop will be
> > > almost as fast - especially in the case where the code isn't
> > > already in the cache and the compare is relatively short.
> >
> > I'm always keen to keep things as simple as possible, but your loop
> > is over 50% slower. Once the loop hits a steady state you are going
> > to run into front end issues with instruction fetch on POWER8.

Interesting, I'm not an expert on ppc scheduling, but on my old x86 Athlon 700 (I think
it was that one) a similar loop ran as fast as 'rep movsw'.

> Out of curiosity, does preincrement make any difference(or can gcc do that for you nowadays)?

It will only change register pressure slightly, and might allow any execution
delays to be filled - but that is very processor dependent.
Actually you probably want to do 'a += 2' somewhere to reduce the instruction count.
Similarly the end condition needs to compare one of the pointers.

Elsewhere (not ppc) I've used (++p)[-1] instead of *p++ to move the increment
before the load to get better scheduling.

>          a1 = *a;
>          b1 = *b;
>          while {
>                  a2 = *++a;
>                  b2 = *++b;
>                  if (a1 != a2)
     That should have been a1 != b1
>                  	break;
>                  a1 = *++a;
>                  b1 = *++b;
>          } while (a2 != a1);
     and a2 != b2

	David
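
For reference, a compilable sketch of the word-at-a-time loop being discussed, with the loop structure made explicit and the compares fixed as per the corrections above. The function name, the whole-word/8-byte-aligned input assumption, and the simplified return value (equal/not-equal rather than memcmp's ordering) are illustrative choices, and the unaligned and remainder handling of the real routine are omitted:

	#include <stddef.h>
	#include <stdint.h>

	/*
	 * Word-at-a-time compare of two 8-byte aligned buffers that are
	 * 'words' 64-bit words long (words >= 1).  Returns 0 if equal,
	 * nonzero otherwise; unlike memcmp it does not report which buffer
	 * is greater - that would need a byte-order aware fixup on the
	 * mismatching word.
	 */
	static int wordcmp(const uint64_t *a, const uint64_t *b, size_t words)
	{
		const uint64_t *end = a + words;
		uint64_t a1, b1, a2, b2;

		/* Prime the first pair so each later load is issued before
		 * the previous pair's compare/branch, letting the loads and
		 * the compares overlap on a superscalar core. */
		a1 = *a++;
		b1 = *b++;

		while (a < end) {
			a2 = *a++;
			b2 = *b++;
			if (a1 != b1)
				return 1;
			if (a == end)
				return a2 != b2;
			a1 = *a++;
			b1 = *b++;
			if (a2 != b2)
				return 1;
		}
		return a1 != b1;
	}

The mid-loop end check keeps the sketch from reading past the buffers; the patch's assembly instead peels the sub-32-byte remainder off into its byte loop.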

Patch

diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 1b01159..5526156 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -15,7 +15,8 @@  obj-$(CONFIG_PPC32)	+= div64.o copy_32.o
 
 obj-$(CONFIG_PPC64)	+= copypage_64.o copyuser_64.o \
 			   usercopy_64.o mem_64.o hweight_64.o \
-			   copyuser_power7.o string_64.o copypage_power7.o
+			   copyuser_power7.o string_64.o copypage_power7.o \
+			   memcmp_64.o
 ifeq ($(CONFIG_GENERIC_CSUM),)
 obj-y			+= checksum_$(CONFIG_WORD_SIZE).o
 obj-$(CONFIG_PPC64)	+= checksum_wrappers_64.o
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
new file mode 100644
index 0000000..d71880b
--- /dev/null
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -0,0 +1,233 @@ 
+/*
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ * Copyright 2015 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/ppc_asm.h>
+
+#define off8	r6
+#define off16	r7
+#define off24	r8
+
+#define rA	r9
+#define rB	r10
+#define rC	r11
+#define rD	r27
+#define rE	r28
+#define rF	r29
+#define rG	r30
+#define rH	r31
+
+#ifdef __LITTLE_ENDIAN__
+#define LD	ldbrx
+#else
+#define LD	ldx
+#endif
+
+_GLOBAL(memcmp)
+	cmpdi	cr1,r5,0
+
+	/* Use the short loop unless both strings are 8B aligned */
+	or	r6,r3,r4
+	rldicl.	r6,r6,0,(64-3)
+
+	/* Use the short loop if length is less than 32B */
+	cmpdi	cr5,r5,31
+
+	beq	cr1,.Lzero
+	bne	.Lshort
+	bgt	cr5,.Llong
+
+.Lshort:
+	mtctr	r5
+
+1:	lbz	rA,0(r3)
+	lbz	rB,0(r4)
+	subf.	rC,rB,rA
+	bne	.Lnon_zero
+	bdz	.Lzero
+
+	lbz	rA,1(r3)
+	lbz	rB,1(r4)
+	subf.	rC,rB,rA
+	bne	.Lnon_zero
+	bdz	.Lzero
+
+	lbz	rA,2(r3)
+	lbz	rB,2(r4)
+	subf.	rC,rB,rA
+	bne	.Lnon_zero
+	bdz	.Lzero
+
+	lbz	rA,3(r3)
+	lbz	rB,3(r4)
+	subf.	rC,rB,rA
+	bne	.Lnon_zero
+
+	addi	r3,r3,4
+	addi	r4,r4,4
+
+	bdnzt	eq,1b
+
+.Lzero:
+	li	r3,0
+	blr
+
+.Lnon_zero:
+	mr	r3,rC
+	blr
+
+.Llong:
+	li	off8,8
+	li	off16,16
+	li	off24,24
+
+	std	r31,-8(r1)
+	std	r30,-16(r1)
+	std	r29,-24(r1)
+	std	r28,-32(r1)
+	std	r27,-40(r1)
+
+	srdi	r0,r5,5
+	mtctr	r0
+	andi.	r5,r5,31
+
+	LD	rA,0,r3
+	LD	rB,0,r4
+
+	LD	rC,off8,r3
+	LD	rD,off8,r4
+
+	LD	rE,off16,r3
+	LD	rF,off16,r4
+
+	LD	rG,off24,r3
+	LD	rH,off24,r4
+	cmpld	cr0,rA,rB
+
+	addi	r3,r3,32
+	addi	r4,r4,32
+
+	bdz	.Lfirst32
+
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr1,rC,rD
+
+	LD	rC,off8,r3
+	LD	rD,off8,r4
+	cmpld	cr5,rE,rF
+
+	LD	rE,off16,r3
+	LD	rF,off16,r4
+	cmpld	cr6,rG,rH
+	bne	cr0,.LcmpAB
+
+	LD	rG,off24,r3
+	LD	rH,off24,r4
+	cmpld	cr0,rA,rB
+	bne	cr1,.LcmpCD
+
+	addi	r3,r3,32
+	addi	r4,r4,32
+
+	bdz	.Lsecond32
+
+	.balign	16
+
+1:	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr1,rC,rD
+	bne	cr5,.LcmpEF
+
+	LD	rC,off8,r3
+	LD	rD,off8,r4
+	cmpld	cr5,rE,rF
+	bne	cr6,.LcmpGH
+
+	LD	rE,off16,r3
+	LD	rF,off16,r4
+	cmpld	cr6,rG,rH
+	bne	cr0,.LcmpAB
+
+	LD	rG,off24,r3
+	LD	rH,off24,r4
+	cmpld	cr0,rA,rB
+	bne	cr1,.LcmpCD
+
+	addi	r3,r3,32
+	addi	r4,r4,32
+
+	bdnz	1b
+
+.Lsecond32:
+	cmpld	cr1,rC,rD
+	bne	cr5,.LcmpEF
+
+	cmpld	cr5,rE,rF
+	bne	cr6,.LcmpGH
+
+	cmpld	cr6,rG,rH
+	bne	cr0,.LcmpAB
+
+	bne	cr1,.LcmpCD
+	bne	cr5,.LcmpEF
+	bne	cr6,.LcmpGH
+
+.Ltail:
+	ld	r31,-8(r1)
+	ld	r30,-16(r1)
+	ld	r29,-24(r1)
+	ld	r28,-32(r1)
+	ld	r27,-40(r1)
+
+	cmpdi	r5,0
+	beq	.Lzero
+	b	.Lshort
+
+.Lfirst32:
+	cmpld	cr1,rC,rD
+	cmpld	cr5,rE,rF
+	cmpld	cr6,rG,rH
+
+	bne	cr0,.LcmpAB
+	bne	cr1,.LcmpCD
+	bne	cr5,.LcmpEF
+	bne	cr6,.LcmpGH
+
+	b	.Ltail
+
+.LcmpAB:
+	li	r3,1
+	bgt	cr0,.Lout
+	li	r3,-1
+	b	.Lout
+
+.LcmpCD:
+	li	r3,1
+	bgt	cr1,.Lout
+	li	r3,-1
+	b	.Lout
+
+.LcmpEF:
+	li	r3,1
+	bgt	cr5,.Lout
+	li	r3,-1
+	b	.Lout
+
+.LcmpGH:
+	li	r3,1
+	bgt	cr6,.Lout
+	li	r3,-1
+
+.Lout:
+	ld	r31,-8(r1)
+	ld	r30,-16(r1)
+	ld	r29,-24(r1)
+	ld	r28,-32(r1)
+	ld	r27,-40(r1)
+	blr
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index 1b5a0a0..c80fb49 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -93,6 +93,7 @@  _GLOBAL(strlen)
 	subf	r3,r3,r4
 	blr
 
+#ifdef CONFIG_PPC32
 _GLOBAL(memcmp)
 	PPC_LCMPI 0,r5,0
 	beq-	2f
@@ -106,6 +107,7 @@  _GLOBAL(memcmp)
 	blr
 2:	li	r3,0
 	blr
+#endif
 
 _GLOBAL(memchr)
 	PPC_LCMPI 0,r5,0