powerpc: Add POWER9 copy_page() loop

Message ID 20170320234046.32718-1-anton@ozlabs.org
State New
Headers show

Commit Message

Anton Blanchard March 20, 2017, 11:40 p.m.
From: Anton Blanchard <anton@samba.org>

Add a POWER9 optimised copy_page() loop. This loop uses the new D form
vector loads and stores, and uses dcbz to pre zero the destination.

A few questions:

- I'm using a nested feature section, but that is going to get unwieldy
  at some stage. It would be nice to update the call site for copy_page
  directly.

- I'm using CPU_FTR_ARCH_300, but as our functions grow perhaps we want
  the cputable entry to contain a pointer to optimised functions.

Signed-off-by: Anton Blanchard <anton@samba.org>
---
 arch/powerpc/lib/Makefile          |   2 +-
 arch/powerpc/lib/copypage_64.S     |   4 +
 arch/powerpc/lib/copypage_power9.S | 224 +++++++++++++++++++++++++++++++++++++
 3 files changed, 229 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/lib/copypage_power9.S

Comments

Nicholas Piggin March 21, 2017, 3:01 a.m. | #1
On Tue, 21 Mar 2017 10:40:46 +1100
Anton Blanchard <anton@ozlabs.org> wrote:

> From: Anton Blanchard <anton@samba.org>
> 
> Add a POWER9 optimised copy_page() loop. This loop uses the new D form
> vector loads and stores, and uses dcbz to pre zero the destination.
> 
> A few questions:
> 
> - I'm using a nested feature section, but that is going to get unwieldy
>   at some stage. It would be nice to update the call site for copy_page
>   directly.

I've got a patch that makes alternate feature patching a bit
more flexible and not hit relocation limits when using big "else"
parts. I was thinking of doing something like

_GLOBAL_TOC(copy_page)
BEGIN_FTR_SECTION_NESTED(50)
#include "copypage_power9.S"
FTR_SECTION_ELSE_NESTED(50)
#include "copypage_power7.S"
ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_300, 50)

Patching callers directly is another option though. I'll bug mpe
about it again when he's least expecting it.

> - I'm using CPU_FTR_ARCH_300, but as our functions grow perhaps we want
>   the cputable entry to contain a pointer to optimised functions.

We might be able to do some nested alternatives macros to hide the
details and allow an IFSET / ELSEIFSET / etc / ELSE.

> 
> Signed-off-by: Anton Blanchard <anton@samba.org>
> ---
>  arch/powerpc/lib/Makefile          |   2 +-
>  arch/powerpc/lib/copypage_64.S     |   4 +
>  arch/powerpc/lib/copypage_power9.S | 224 +++++++++++++++++++++++++++++++++++++
>  3 files changed, 229 insertions(+), 1 deletion(-)
>  create mode 100644 arch/powerpc/lib/copypage_power9.S
> 
> diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
> index 2b5e090..d3667b5 100644
> --- a/arch/powerpc/lib/Makefile
> +++ b/arch/powerpc/lib/Makefile
> @@ -16,7 +16,7 @@ obj-$(CONFIG_PPC32)	+= div64.o copy_32.o
>  
>  obj64-y	+= copypage_64.o copyuser_64.o usercopy_64.o mem_64.o hweight_64.o \
>  	   copyuser_power7.o string_64.o copypage_power7.o memcpy_power7.o \
> -	   memcpy_64.o memcmp_64.o
> +	   memcpy_64.o memcmp_64.o copypage_power9.o
>  
>  obj64-$(CONFIG_SMP)	+= locks.o
>  obj64-$(CONFIG_ALTIVEC)	+= vmx-helper.o
> diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S
> index 4bcc9e7..051423e 100644
> --- a/arch/powerpc/lib/copypage_64.S
> +++ b/arch/powerpc/lib/copypage_64.S
> @@ -21,7 +21,11 @@ _GLOBAL_TOC(copy_page)
>  BEGIN_FTR_SECTION
>  	lis	r5,PAGE_SIZE@h
>  FTR_SECTION_ELSE
> +  BEGIN_FTR_SECTION_NESTED(50)
> +	b	copypage_power9
> +  FTR_SECTION_ELSE_NESTED(50)
>  	b	copypage_power7
> +  ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_300, 50)
>  ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
>  	ori	r5,r5,PAGE_SIZE@l
>  BEGIN_FTR_SECTION
> diff --git a/arch/powerpc/lib/copypage_power9.S b/arch/powerpc/lib/copypage_power9.S
> new file mode 100644
> index 0000000..2493f94
> --- /dev/null
> +++ b/arch/powerpc/lib/copypage_power9.S
> @@ -0,0 +1,224 @@
> +/*
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + *
> + * Copyright (C) IBM Corporation, 2017
> + *
> + * Author: Anton Blanchard <anton@au.ibm.com>
> + */
> +#include <asm/page.h>
> +#include <asm/ppc_asm.h>
> +
> +_GLOBAL(copypage_power9)
> +	/*
> +	 * We prefetch the source using enhanced touch instructions. We use
> +	 * a stream ID of 0 for this. Since the source is page aligned we
> +	 * don't need to clear the bottom 7 bits of the address.
> +	 */
> +#ifdef CONFIG_PPC_64K_PAGES
> +	lis	r7,0x0E01	/* depth=7
> +				 * units/cachelines=512 */
> +#else
> +	lis	r7,0x0E00	/* depth=7 */
> +	ori	r7,r7,0x1000	/* units/cachelines=32 */
> +#endif
> +
> +	lis	r8,0x8000	/* GO=1 */
> +	clrldi	r8,r8,32
> +
> +.machine push
> +.machine "power4"
> +	/* setup read stream 0 */
> +	dcbt	r0,r4,0b01000	/* addr from */
> +	dcbt	r0,r7,0b01010	/* length and depth from */
> +	eieio
> +	dcbt	r0,r8,0b01010	/* all streams GO */
> +	eieio
> +.machine pop

I guess POWER asm doesn't need this but it's good practice to prevent
copy paste errors? It would be nice to have some macros to hide all these
constants, but that's for another patch. The commenting is good.

I don't suppose the stream setup is costly enough to consider touching a
cacheline or two ahead before starting it?

> +
> +	/*
> +	 * To reduce memory bandwidth on the store side we send dcbzs ahead.
> +	 * Experimental testing shows 2 cachelines as good enough.
> +	 */
> +	li	r6,128
> +	dcbz	0,r3
> +	dcbz	r6,r3
> +
> +#ifdef CONFIG_ALTIVEC
> +	mflr	r0
> +	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
> +	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
> +	std	r0,16(r1)
> +	stdu	r1,-STACKFRAMESIZE(r1)
> +	bl	enter_vmx_copy
> +	cmpwi	r3,0
> +	ld	r0,STACKFRAMESIZE+16(r1)
> +	ld	r3,STK_REG(R31)(r1)
> +	ld	r4,STK_REG(R30)(r1)
> +	addi	r1,r1,STACKFRAMESIZE
> +	mtlr	r0

(Also for another day) We might be able to avoid the stack and call
for some common cases. Pretty small overcall cost I guess, but it could
be beneficial for memcpy if not copy_page.

Thanks,
Nick
Anton Blanchard March 21, 2017, 4:01 a.m. | #2
Hi Nick,

> I've got a patch that makes alternate feature patching a bit
> more flexible and not hit relocation limits when using big "else"
> parts. I was thinking of doing something like
> 
> _GLOBAL_TOC(copy_page)
> BEGIN_FTR_SECTION_NESTED(50)
> #include "copypage_power9.S"
> FTR_SECTION_ELSE_NESTED(50)
> #include "copypage_power7.S"
> ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_300, 50)

Good idea, I hadn't thought of embedding it all in a feature section.

> I guess POWER asm doesn't need this but it's good practice to prevent
> copy paste errors? It would be nice to have some macros to hide all
> these constants, but that's for another patch. The commenting is good.

The .machine X macros? Unfortunately the format of dcbt is different
for recent server chips. This wasn't a great idea in retrospect because
if you do get the instruction layout wrong, you won't get a fault to warn
you.

> I don't suppose the stream setup is costly enough to consider
> touching a cacheline or two ahead before starting it?

Starting up software streams is a bit of an art - if the demand loads
get ahead then a hardware stream gets started before the software one.
Note all the eieios to try and avoid this happening.

I've struggled with software prefetch on previous chips and sometimes I
wonder if it is worth the pain.

> (Also for another day) We might be able to avoid the stack and call
> for some common cases. Pretty small overcall cost I guess, but it
> could be beneficial for memcpy if not copy_page.

Definitely. Also the breakpoint for using vector should be much
lower if we have already saved the user state in a previous call.

Anton
Nicholas Piggin March 21, 2017, 4:21 a.m. | #3
On Tue, 21 Mar 2017 15:01:03 +1100
Anton Blanchard <anton@samba.org> wrote:

> Hi Nick,
> 
> > I've got a patch that makes alternate feature patching a bit
> > more flexible and not hit relocation limits when using big "else"
> > parts. I was thinking of doing something like
> > 
> > _GLOBAL_TOC(copy_page)
> > BEGIN_FTR_SECTION_NESTED(50)
> > #include "copypage_power9.S"
> > FTR_SECTION_ELSE_NESTED(50)
> > #include "copypage_power7.S"
> > ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_300, 50)  
> 
> Good idea, I hadn't thought of embedding it all in a feature section.

It may not work currently because you get those ftr_alt_97 relocation
errors with the "else" parts because relative branches to other code
need to be direct and I think reachable from both places.


> > I guess POWER asm doesn't need this but it's good practice to prevent
> > copy paste errors? It would be nice to have some macros to hide all
> > these constants, but that's for another patch. The commenting is good.  
> 
> The .machine X macros? Unfortunately the format of dcbt is different
> for recent server chips. This wasn't a great idea in retrospect because
> if you do get the instruction layout wrong, you won't get a fault to warn
> you.

Is that embedded vs server, or pre-POWER4 vs POWER4 and up? Anyway no
big deal.

> > I don't suppose the stream setup is costly enough to consider
> > touching a cacheline or two ahead before starting it?  
> 
> Starting up software streams is a bit of an art - if the demand loads
> get ahead then a hardware stream gets started before the software one.
> Note all the eieios to try and avoid this happening.
> 
> I've struggled with software prefetch on previous chips and sometimes I
> wonder if it is worth the pain.

Oh I see. Makes sense.

> > (Also for another day) We might be able to avoid the stack and call
> > for some common cases. Pretty small overcall cost I guess, but it
> > could be beneficial for memcpy if not copy_page.  
> 
> Definitely. Also the breakpoint for using vector should be much
> lower if we have already saved the user state in a previous call.

Yes agreed.

Another problem is multiple small mem/string/crypto operations may
never trip the limit even if it would make sense. Difficult to improve
that (kernel could provide a hint to the arch maybe).
Anton Blanchard April 3, 2017, 12:54 a.m. | #4
Hi Nick,

> > Good idea, I hadn't thought of embedding it all in a feature
> > section.  
> 
> It may not work currently because you get those ftr_alt_97 relocation
> errors with the "else" parts because relative branches to other code
> need to be direct and I think reachable from both places.

I thought about this a bit more. One potential issue will be
profiling - perf annotate will match the samples against the unpatched
code which could be very confusing.

Anton
Benjamin Herrenschmidt April 3, 2017, 1:01 a.m. | #5
On Mon, 2017-04-03 at 10:54 +1000, Anton Blanchard wrote:
> > > Good idea, I hadn't thought of embedding it all in a feature
> > > section.  
> > 
> > It may not work currently because you get those ftr_alt_97 relocation
> > errors with the "else" parts because relative branches to other code
> > need to be direct and I think reachable from both places.
> 
> I thought about this a bit more. One potential issue will be
> profiling - perf annotate will match the samples against the unpatched
> code which could be very confusing.

Could we make all those functions a dynamic-linker style stub ?

IE, they "find" the right target function and call a helper to patch
the calling site to call directly into the right one on the first
call.

Cheers,
Ben.

Patch

diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 2b5e090..d3667b5 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -16,7 +16,7 @@  obj-$(CONFIG_PPC32)	+= div64.o copy_32.o
 
 obj64-y	+= copypage_64.o copyuser_64.o usercopy_64.o mem_64.o hweight_64.o \
 	   copyuser_power7.o string_64.o copypage_power7.o memcpy_power7.o \
-	   memcpy_64.o memcmp_64.o
+	   memcpy_64.o memcmp_64.o copypage_power9.o
 
 obj64-$(CONFIG_SMP)	+= locks.o
 obj64-$(CONFIG_ALTIVEC)	+= vmx-helper.o
diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S
index 4bcc9e7..051423e 100644
--- a/arch/powerpc/lib/copypage_64.S
+++ b/arch/powerpc/lib/copypage_64.S
@@ -21,7 +21,11 @@  _GLOBAL_TOC(copy_page)
 BEGIN_FTR_SECTION
 	lis	r5,PAGE_SIZE@h
 FTR_SECTION_ELSE
+  BEGIN_FTR_SECTION_NESTED(50)
+	b	copypage_power9
+  FTR_SECTION_ELSE_NESTED(50)
 	b	copypage_power7
+  ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_300, 50)
 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
 	ori	r5,r5,PAGE_SIZE@l
 BEGIN_FTR_SECTION
diff --git a/arch/powerpc/lib/copypage_power9.S b/arch/powerpc/lib/copypage_power9.S
new file mode 100644
index 0000000..2493f94
--- /dev/null
+++ b/arch/powerpc/lib/copypage_power9.S
@@ -0,0 +1,224 @@ 
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2017
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <asm/page.h>
+#include <asm/ppc_asm.h>
+
+_GLOBAL(copypage_power9)
+	/*
+	 * We prefetch the source using enhanced touch instructions. We use
+	 * a stream ID of 0 for this. Since the source is page aligned we
+	 * don't need to clear the bottom 7 bits of the address.
+	 */
+#ifdef CONFIG_PPC_64K_PAGES
+	lis	r7,0x0E01	/* depth=7
+				 * units/cachelines=512 */
+#else
+	lis	r7,0x0E00	/* depth=7 */
+	ori	r7,r7,0x1000	/* units/cachelines=32 */
+#endif
+
+	lis	r8,0x8000	/* GO=1 */
+	clrldi	r8,r8,32
+
+.machine push
+.machine "power4"
+	/* setup read stream 0 */
+	dcbt	r0,r4,0b01000	/* addr from */
+	dcbt	r0,r7,0b01010	/* length and depth from */
+	eieio
+	dcbt	r0,r8,0b01010	/* all streams GO */
+	eieio
+.machine pop
+
+	/*
+	 * To reduce memory bandwidth on the store side we send dcbzs ahead.
+	 * Experimental testing shows 2 cachelines as good enough.
+	 */
+	li	r6,128
+	dcbz	0,r3
+	dcbz	r6,r3
+
+#ifdef CONFIG_ALTIVEC
+	mflr	r0
+	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
+	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
+	std	r0,16(r1)
+	stdu	r1,-STACKFRAMESIZE(r1)
+	bl	enter_vmx_copy
+	cmpwi	r3,0
+	ld	r0,STACKFRAMESIZE+16(r1)
+	ld	r3,STK_REG(R31)(r1)
+	ld	r4,STK_REG(R30)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+	mtlr	r0
+
+	li	r0,((PAGE_SIZE/128)-2)
+	mtctr	r0
+
+	li	r8,256
+
+	beq	.Lnonvmx_copy
+
+	.balign 16
+1:	dcbz	r8,r3
+	lxv	vs32,0(r4)
+	lxv	vs33,16(r4)
+	stxv	vs32,0(r3)
+	stxv	vs33,16(r3)
+
+	lxv	vs34,32(r4)
+	lxv	vs35,48(r4)
+	stxv	vs34,32(r3)
+	stxv	vs35,48(r3)
+
+	lxv	vs36,64(r4)
+	lxv	vs37,80(r4)
+	stxv	vs36,64(r3)
+	stxv	vs37,80(r3)
+
+	lxv	vs38,96(r4)
+	lxv	vs39,112(r4)
+	stxv	vs38,96(r3)
+	stxv	vs39,112(r3)
+
+	addi	r4,r4,128
+	addi	r3,r3,128
+	bdnz	1b
+
+	li	r0,2
+	mtctr	r0
+
+1:	lxv	vs32,0(r4)
+	lxv	vs33,16(r4)
+	stxv	vs32,0(r3)
+	stxv	vs33,16(r3)
+
+	lxv	vs34,32(r4)
+	lxv	vs35,48(r4)
+	stxv	vs34,32(r3)
+	stxv	vs35,48(r3)
+
+	lxv	vs36,64(r4)
+	lxv	vs37,80(r4)
+	stxv	vs36,64(r3)
+	stxv	vs37,80(r3)
+
+	lxv	vs38,96(r4)
+	lxv	vs39,112(r4)
+	stxv	vs38,96(r3)
+	stxv	vs39,112(r3)
+
+	addi	r4,r4,128
+	addi	r3,r3,128
+	bdnz	1b
+
+	b	exit_vmx_copy		/* tail call optimise */
+#else
+	li	r0,((PAGE_SIZE/128)-2)
+	mtctr	r0
+
+	li	r8,256
+#endif
+
+	.balign 16
+.Lnonvmx_copy:
+1:	dcbz	r8,r3
+	ld	r0,0(r4)
+	ld	r5,8(r4)
+	ld	r6,16(r4)
+	ld	r7,24(r4)
+	std	r0,0(r3)
+	std	r5,8(r3)
+	std	r6,16(r3)
+	std	r7,24(r3)
+
+	ld	r0,32(r4)
+	ld	r5,40(r4)
+	ld	r6,48(r4)
+	ld	r7,56(r4)
+	std	r0,32(r3)
+	std	r5,40(r3)
+	std	r6,48(r3)
+	std	r7,56(r3)
+
+	ld	r0,64(r4)
+	ld	r5,72(r4)
+	ld	r6,80(r4)
+	ld	r7,88(r4)
+	std	r0,64(r3)
+	std	r5,72(r3)
+	std	r6,80(r3)
+	std	r7,88(r3)
+
+	ld	r0,96(r4)
+	ld	r5,104(r4)
+	ld	r6,112(r4)
+	ld	r7,120(r4)
+	addi	r4,r4,128
+	std	r0,96(r3)
+	std	r5,104(r3)
+	std	r6,112(r3)
+	std	r7,120(r3)
+	addi	r3,r3,128
+	bdnz	1b
+
+	li	r0,2
+	mtctr	r0
+
+1:	ld	r0,0(r4)
+	ld	r5,8(r4)
+	ld	r6,16(r4)
+	ld	r7,24(r4)
+	std	r0,0(r3)
+	std	r5,8(r3)
+	std	r6,16(r3)
+	std	r7,24(r3)
+
+	ld	r0,32(r4)
+	ld	r5,40(r4)
+	ld	r6,48(r4)
+	ld	r7,56(r4)
+	std	r0,32(r3)
+	std	r5,40(r3)
+	std	r6,48(r3)
+	std	r7,56(r3)
+
+	ld	r0,64(r4)
+	ld	r5,72(r4)
+	ld	r6,80(r4)
+	ld	r7,88(r4)
+	std	r0,64(r3)
+	std	r5,72(r3)
+	std	r6,80(r3)
+	std	r7,88(r3)
+
+	ld	r0,96(r4)
+	ld	r5,104(r4)
+	ld	r6,112(r4)
+	ld	r7,120(r4)
+	addi	r4,r4,128
+	std	r0,96(r3)
+	std	r5,104(r3)
+	std	r6,112(r3)
+	std	r7,120(r3)
+	addi	r3,r3,128
+	bdnz	1b
+
+	blr