diff mbox series

powerpc: Optimized strcpy for POWER9

Message ID 20200501145209.1dea304c@kryten.localdomain
State New
Headers show
Series powerpc: Optimized strcpy for POWER9 | expand

Commit Message

Anton Blanchard May 1, 2020, 4:52 a.m. UTC
This version is significantly faster on small strings and relatively
unaligned large strings.
---
 sysdeps/powerpc/powerpc64/le/power9/strcpy.S  | 144 ++++++++++++++++++
 sysdeps/powerpc/powerpc64/multiarch/Makefile  |   2 +-
 .../powerpc64/multiarch/ifunc-impl-list.c     |   4 +
 .../powerpc64/multiarch/strcpy-power9.S       |  26 ++++
 sysdeps/powerpc/powerpc64/multiarch/strcpy.c  |   7 +
 5 files changed, 182 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strcpy.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S

Comments

Adhemerval Zanella Netto May 4, 2020, 12:58 p.m. UTC | #1
On 01/05/2020 01:52, Anton Blanchard via Libc-alpha wrote:
> This version is significantly faster on small strings and relatively
> unaligned large strings.

So it seems that it uses the ISA 3.0 partial stores to optimize vector
instruction usage; could you mention that in the commit message?

The power8 version also adds an stpcpy version (which I am not sure is
really a gain for short strings) based on its strcpy.  It might be
worth checking whether this could be adapted to be built as
stpcpy as well.

Usually for such optimizations we try to get baseline benchmark results
using the glibc benchtests.  Could you post the results for before and after?

(Some of the glibc benchtests do have shortcomings, but at least they
document some of the expected gains and performance differences with the
new implementation.)

> ---
>  sysdeps/powerpc/powerpc64/le/power9/strcpy.S  | 144 ++++++++++++++++++
>  sysdeps/powerpc/powerpc64/multiarch/Makefile  |   2 +-
>  .../powerpc64/multiarch/ifunc-impl-list.c     |   4 +
>  .../powerpc64/multiarch/strcpy-power9.S       |  26 ++++
>  sysdeps/powerpc/powerpc64/multiarch/strcpy.c  |   7 +
>  5 files changed, 182 insertions(+), 1 deletion(-)
>  create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strcpy.S
>  create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
> 
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
> new file mode 100644
> index 0000000000..5749228054
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
> @@ -0,0 +1,144 @@
> +/* Optimized strcpy implementation for PowerPC64/POWER9.
> +   Copyright (C) 2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +#ifndef STRCPY
> +# define STRCPY strcpy
> +#endif
> +
> +/* Implements the function
> +
> +   char * [r3] strcpy (char *dest [r3], const char *src [r4])
> +
> +   The implementation can load bytes past a null terminator, but only
> +   up to the next 16B boundary, so it never crosses a page.  */
> +
> +.machine power9
> +ENTRY_TOCLESS (STRCPY, 4)
> +	CALL_MCOUNT 2
> +
> +	/* NULL string optimisation  */
> +	lbz	r0,0(r4)
> +	stb	r0,0(r3)
> +	cmpwi	r0,0
> +	beqlr

Is this a worthwhile optimization? None of the other arch-specific strcpy
implementations does it.

> +
> +	addi	r4,r4,1
> +	addi	r11,r3,1
> +
> +	vspltisb v18,0		/* Zeroes in v18  */
> +
> +	neg	r5,r4
> +	rldicl	r9,r5,0,60	/* How many bytes to get source 16B aligned?  */
> +
> +	/* Get source 16B aligned  */
> +	lvx	v0,0,r4
> +	lvsr	v1,0,r4
> +	vperm	v0,v18,v0,v1
> +
> +	vcmpequb v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	addi	r8,r8,1		/* Add null terminator  */
> +
> +	/* r8 = bytes including null
> +	   r9 = bytes to get source 16B aligned
> +	   if r8 > r9
> +	      no null, copy r9 bytes
> +	   else
> +	      there is a null, copy r8 bytes and return.  */
> +	cmpd	r8,r9
> +	bgt	L(no_null)
> +
> +	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +
> +	blr
> +
> +L(no_null):
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +
> +	add	r4,r4,r9
> +	add	r11,r11,r9
> +
> +L(loop):

Should we enforce alignment here?

> +	lxv	32+v0,0(r4)
> +	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
> +	bne	cr6,L(tail1)
> +
> +	lxv	32+v1,16(r4)
> +	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
> +	bne	cr6,L(tail2)
> +
> +	lxv	32+v2,32(r4)
> +	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
> +	bne	cr6,L(tail3)
> +
> +	lxv	32+v3,48(r4)
> +	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
> +	bne	cr6,L(tail4)
> +
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	stxv	32+v2,32(r11)
> +	stxv	32+v3,48(r11)

Out of curiosity, why unroll 4x here?

> +
> +	addi	r4,r4,64
> +	addi	r11,r11,64
> +
> +	b	L(loop)
> +
> +L(tail1):
> +	vctzlsbb r8,v6
> +	addi	r8,r8,1
> +	sldi	r9,r8,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r9
> +	blr
> +
> +L(tail2):
> +	stxv	32+v0,0(r11)
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	addi	r8,r8,1		/* Add null terminator  */
> +	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,16
> +	stxvl	32+v1,r11,r10	/* Partial store  */
> +	blr
> +
> +L(tail3):
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	addi	r8,r8,1		/* Add null terminator  */
> +	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,32
> +	stxvl	32+v2,r11,r10	/* Partial store  */
> +	blr
> +
> +L(tail4):
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	stxv	32+v2,32(r11)
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	addi	r8,r8,1		/* Add null terminator  */
> +	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,48
> +	stxvl	32+v3,r11,r10	/* Partial store  */
> +	blr
> +END (STRCPY)
> +libc_hidden_builtin_def (strcpy)
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index ea936bf9ed..db11345053 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> @@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
>  		   strncase-power8
>  
>  ifneq (,$(filter %le,$(config-machine)))
> -sysdep_routines += strcmp-power9 strncmp-power9
> +sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9
>  endif
>  CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
>  CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops

Ok.

> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index b9fef3f43c..ad11ede20e 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> @@ -85,6 +85,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  
>    /* Support sysdeps/powerpc/powerpc64/multiarch/strcpy.c.  */
>    IFUNC_IMPL (i, name, strcpy,
> +#ifdef __LITTLE_ENDIAN__
> +	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
> +			      __strcpy_power9)
> +#endif
>  	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
>  			      __strcpy_power8)
>  	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap & PPC_FEATURE_HAS_VSX,

Ok.

> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
> new file mode 100644
> index 0000000000..d22aa0a8d6
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
> @@ -0,0 +1,26 @@
> +/* Optimized strcpy implementation for POWER9/PPC64.
> +   Copyright (C) 2016-2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
> +#define STRCPY __strcpy_power9
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
> +#include <sysdeps/powerpc/powerpc64/le/power9/strcpy.S>
> +#endif

Ok.

> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
> index 66ea93b8f4..dcdee5181f 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
> @@ -25,9 +25,16 @@
>  extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
>  extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
>  extern __typeof (strcpy) __strcpy_power8 attribute_hidden;
> +# ifdef __LITTLE_ENDIAN__
> +extern __typeof (strcpy) __strcpy_power9 attribute_hidden;
> +# endif
>  #undef strcpy
>  
>  libc_ifunc_redirected (__redirect_strcpy, strcpy,
> +# ifdef __LITTLE_ENDIAN__
> +			(hwcap2 & PPC_FEATURE2_ARCH_3_00)
> +			? __strcpy_power9 :
> +# endif
>  		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
>  		       ? __strcpy_power8
>  		       : (hwcap & PPC_FEATURE_HAS_VSX)
> 

Ok.
Anton Blanchard May 5, 2020, 2:40 a.m. UTC | #2
Hi Adhemerval,

> > This version is significantly faster on small strings and relatively
> > unaligned large strings.  
> 
> So it seems that it uses the ISA 3.0 partial stores to optimize vector
> instructions usage, could you add it on the commit message?

Will do.

> The power8 version also adds an stpcpy version (which I am not sure is
> really a gain for short strings) based on its strcpy.  Maybe it could
> be a good thing to check if this could be adapted to be build as
> stpcpy as well.

Good idea. I modified it to also implement stpcpy, and it shows decent
gains - 40% faster on small strings and 90% faster on long relatively
unaligned strings.

> Usually for such optimizations we try to get a baseline benchmark
> results using glibc benchtests.  Could you post the results for
> before and after?

How do you want the results? Should I just mail the output of the
relevant glibc benchtest?

> (Some of the glibc benchtests does have some shortcomings, but at
> least it documents some of the expected gains and performance
> differences with the new implementation.)

I wonder if an option to do more thorough testing would be useful. For
example, I have a local modification to do an exhaustive test of all
alignments and lengths up to a certain point (eg 128 bytes) to make
sure there aren't any corner cases.

More random argument benchmarks would be useful too.

Thanks,
Anton

> 
> > ---
> >  sysdeps/powerpc/powerpc64/le/power9/strcpy.S  | 144
> > ++++++++++++++++++ sysdeps/powerpc/powerpc64/multiarch/Makefile  |
> >  2 +- .../powerpc64/multiarch/ifunc-impl-list.c     |   4 +
> >  .../powerpc64/multiarch/strcpy-power9.S       |  26 ++++
> >  sysdeps/powerpc/powerpc64/multiarch/strcpy.c  |   7 +
> >  5 files changed, 182 insertions(+), 1 deletion(-)
> >  create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strcpy.S
> >  create mode 100644
> > sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
> > 
> > diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
> > b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S new file mode 100644
> > index 0000000000..5749228054
> > --- /dev/null
> > +++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
> > @@ -0,0 +1,144 @@
> > +/* Optimized strcpy implementation for PowerPC64/POWER9.
> > +   Copyright (C) 2020 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it
> > and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later
> > version. +
> > +   The GNU C Library is distributed in the hope that it will be
> > useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <sysdep.h>
> > +
> > +#ifndef STRCPY
> > +# define STRCPY strcpy
> > +#endif
> > +
> > +/* Implements the function
> > +
> > +   char * [r3] strcpy (char *dest [r3], const char *src [r4])
> > +
> > +   The implementation can load bytes past a null terminator, but
> > only
> > +   up to the next 16B boundary, so it never crosses a page.  */
> > +
> > +.machine power9
> > +ENTRY_TOCLESS (STRCPY, 4)
> > +	CALL_MCOUNT 2
> > +
> > +	/* NULL string optimisation  */
> > +	lbz	r0,0(r4)
> > +	stb	r0,0(r3)
> > +	cmpwi	r0,0
> > +	beqlr  
> 
> Is it a worth optimization? None of other strcpy arch optimizations
> does it.
> 
> > +
> > +	addi	r4,r4,1
> > +	addi	r11,r3,1
> > +
> > +	vspltisb v18,0		/* Zeroes in v18  */
> > +
> > +	neg	r5,r4
> > +	rldicl	r9,r5,0,60	/* How many bytes to get
> > source 16B aligned?  */ +
> > +	/* Get source 16B aligned  */
> > +	lvx	v0,0,r4
> > +	lvsr	v1,0,r4
> > +	vperm	v0,v18,v0,v1
> > +
> > +	vcmpequb v6,v0,v18	/* 0xff if byte is NULL, 0x00
> > otherwise  */
> > +	vctzlsbb r8,v6		/* Number of trailing zeroes
> >  */
> > +	addi	r8,r8,1		/* Add null terminator
> > */ +
> > +	/* r8 = bytes including null
> > +	   r9 = bytes to get source 16B aligned
> > +	   if r8 > r9
> > +	      no null, copy r9 bytes
> > +	   else
> > +	      there is a null, copy r8 bytes and return.  */
> > +	cmpd	r8,r9
> > +	bgt	L(no_null)
> > +
> > +	sldi	r10,r8,56	/* stxvl wants size in top 8
> > bits  */
> > +	stxvl	32+v0,r11,r10	/* Partial store  */
> > +
> > +	blr
> > +
> > +L(no_null):
> > +	sldi	r10,r9,56	/* stxvl wants size in top 8
> > bits  */
> > +	stxvl	32+v0,r11,r10	/* Partial store  */
> > +
> > +	add	r4,r4,r9
> > +	add	r11,r11,r9
> > +
> > +L(loop):  
> 
> Should we eforce alignment here?
> 
> > +	lxv	32+v0,0(r4)
> > +	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
> > +	bne	cr6,L(tail1)
> > +
> > +	lxv	32+v1,16(r4)
> > +	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
> > +	bne	cr6,L(tail2)
> > +
> > +	lxv	32+v2,32(r4)
> > +	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
> > +	bne	cr6,L(tail3)
> > +
> > +	lxv	32+v3,48(r4)
> > +	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
> > +	bne	cr6,L(tail4)
> > +
> > +	stxv	32+v0,0(r11)
> > +	stxv	32+v1,16(r11)
> > +	stxv	32+v2,32(r11)
> > +	stxv	32+v3,48(r11)  
> 
> Out of curiosity, why unroll 4x time here?
> 
> > +
> > +	addi	r4,r4,64
> > +	addi	r11,r11,64
> > +
> > +	b	L(loop)
> > +
> > +L(tail1):
> > +	vctzlsbb r8,v6
> > +	addi	r8,r8,1
> > +	sldi	r9,r8,56	/* stxvl wants size in top 8
> > bits  */
> > +	stxvl	32+v0,r11,r9
> > +	blr
> > +
> > +L(tail2):
> > +	stxv	32+v0,0(r11)
> > +	vctzlsbb r8,v6		/* Number of trailing zeroes
> >  */
> > +	addi	r8,r8,1		/* Add null terminator
> > */
> > +	sldi	r10,r8,56	/* stxvl wants size in top 8
> > bits  */
> > +	addi	r11,r11,16
> > +	stxvl	32+v1,r11,r10	/* Partial store  */
> > +	blr
> > +
> > +L(tail3):
> > +	stxv	32+v0,0(r11)
> > +	stxv	32+v1,16(r11)
> > +	vctzlsbb r8,v6		/* Number of trailing zeroes
> >  */
> > +	addi	r8,r8,1		/* Add null terminator
> > */
> > +	sldi	r10,r8,56	/* stxvl wants size in top 8
> > bits  */
> > +	addi	r11,r11,32
> > +	stxvl	32+v2,r11,r10	/* Partial store  */
> > +	blr
> > +
> > +L(tail4):
> > +	stxv	32+v0,0(r11)
> > +	stxv	32+v1,16(r11)
> > +	stxv	32+v2,32(r11)
> > +	vctzlsbb r8,v6		/* Number of trailing zeroes
> >  */
> > +	addi	r8,r8,1		/* Add null terminator
> > */
> > +	sldi	r10,r8,56	/* stxvl wants size in top 8
> > bits  */
> > +	addi	r11,r11,48
> > +	stxvl	32+v3,r11,r10	/* Partial store  */
> > +	blr
> > +END (STRCPY)
> > +libc_hidden_builtin_def (strcpy)
> > diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> > b/sysdeps/powerpc/powerpc64/multiarch/Makefile index
> > ea936bf9ed..db11345053 100644 ---
> > a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++
> > b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -32,7 +32,7 @@
> > sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2
> > memcpy-power6 \ strncase-power8 
> >  ifneq (,$(filter %le,$(config-machine)))
> > -sysdep_routines += strcmp-power9 strncmp-power9
> > +sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9
> >  endif
> >  CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
> >  CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops  
> 
> Ok.
> 
> > diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> > b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index
> > b9fef3f43c..ad11ede20e 100644 ---
> > a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++
> > b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -85,6
> > +85,10 @@ __libc_ifunc_impl_list (const char *name, struct
> > libc_ifunc_impl *array, /* Support
> > sysdeps/powerpc/powerpc64/multiarch/strcpy.c.  */ IFUNC_IMPL (i,
> > name, strcpy, +#ifdef __LITTLE_ENDIAN__
> > +	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 &
> > PPC_FEATURE2_ARCH_3_00,
> > +			      __strcpy_power9)
> > +#endif
> >  	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 &
> > PPC_FEATURE2_ARCH_2_07, __strcpy_power8)
> >  	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap &
> > PPC_FEATURE_HAS_VSX,  
> 
> Ok.
> 
> > diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
> > b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S new file mode
> > 100644 index 0000000000..d22aa0a8d6
> > --- /dev/null
> > +++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
> > @@ -0,0 +1,26 @@
> > +/* Optimized strcpy implementation for POWER9/PPC64.
> > +   Copyright (C) 2016-2020 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it
> > and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later
> > version. +
> > +   The GNU C Library is distributed in the hope that it will be
> > useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
> > +#define STRCPY __strcpy_power9
> > +
> > +#undef libc_hidden_builtin_def
> > +#define libc_hidden_builtin_def(name)
> > +
> > +#include <sysdeps/powerpc/powerpc64/le/power9/strcpy.S>
> > +#endif  
> 
> Ok.
> 
> > diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
> > b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c index
> > 66ea93b8f4..dcdee5181f 100644 ---
> > a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c +++
> > b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c @@ -25,9 +25,16 @@
> >  extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
> >  extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
> >  extern __typeof (strcpy) __strcpy_power8 attribute_hidden;
> > +# ifdef __LITTLE_ENDIAN__
> > +extern __typeof (strcpy) __strcpy_power9 attribute_hidden;
> > +# endif
> >  #undef strcpy
> >  
> >  libc_ifunc_redirected (__redirect_strcpy, strcpy,
> > +# ifdef __LITTLE_ENDIAN__
> > +			(hwcap2 & PPC_FEATURE2_ARCH_3_00)
> > +			? __strcpy_power9 :
> > +# endif
> >  		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
> >  		       ? __strcpy_power8
> >  		       : (hwcap & PPC_FEATURE_HAS_VSX)
> >   
> 
> Ok.
>
Adhemerval Zanella Netto May 5, 2020, 7:01 p.m. UTC | #3
On 04/05/2020 23:40, Anton Blanchard wrote:
> Hi Adhemerval,
> 
>>> This version is significantly faster on small strings and relatively
>>> unaligned large strings.  
>>
>> So it seems that it uses the ISA 3.0 partial stores to optimize vector
>> instructions usage, could you add it on the commit message?
> 
> Will do.
> 
>> The power8 version also adds an stpcpy version (which I am not sure is
>> really a gain for short strings) based on its strcpy.  Maybe it could
>> be a good thing to check if this could be adapted to be build as
>> stpcpy as well.
> 
> Good idea. I modified it to also implement stpcpy, and it shows decent
> gains - 40% faster on small strings and 90% faster on long relatively
> unaligned strings.

Nice.

> 
>> Usually for such optimizations we try to get a baseline benchmark
>> results using glibc benchtests.  Could you post the results for
>> before and after?
> 
> How do you want the results? Should I just mail the output of the
> relevant glibc benchtest?

Yes, just attach the bench-st{r,p}cpy.out files from before and after the patch.

> 
>> (Some of the glibc benchtests does have some shortcomings, but at
>> least it documents some of the expected gains and performance
>> differences with the new implementation.)
> 
> I wonder if an option to do more thorough testing would be useful. For
> example, I have a local modification to do an exhaustive test of all
> alignments and lengths up to a certain point (eg 128 bytes) to make
> sure there aren't any corner cases.

Sure, testing improvements are always welcome!

> 
> More random argument benchmarks would be useful too.

And benchtests improvements as well.

> 
> Thanks,
> Anton
> 
>>
>>> ---
>>>  sysdeps/powerpc/powerpc64/le/power9/strcpy.S  | 144
>>> ++++++++++++++++++ sysdeps/powerpc/powerpc64/multiarch/Makefile  |
>>>  2 +- .../powerpc64/multiarch/ifunc-impl-list.c     |   4 +
>>>  .../powerpc64/multiarch/strcpy-power9.S       |  26 ++++
>>>  sysdeps/powerpc/powerpc64/multiarch/strcpy.c  |   7 +
>>>  5 files changed, 182 insertions(+), 1 deletion(-)
>>>  create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strcpy.S
>>>  create mode 100644
>>> sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
>>>
>>> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
>>> b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S new file mode 100644
>>> index 0000000000..5749228054
>>> --- /dev/null
>>> +++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
>>> @@ -0,0 +1,144 @@
>>> +/* Optimized strcpy implementation for PowerPC64/POWER9.
>>> +   Copyright (C) 2020 Free Software Foundation, Inc.
>>> +   This file is part of the GNU C Library.
>>> +
>>> +   The GNU C Library is free software; you can redistribute it
>>> and/or
>>> +   modify it under the terms of the GNU Lesser General Public
>>> +   License as published by the Free Software Foundation; either
>>> +   version 2.1 of the License, or (at your option) any later
>>> version. +
>>> +   The GNU C Library is distributed in the hope that it will be
>>> useful,
>>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>> GNU
>>> +   Lesser General Public License for more details.
>>> +
>>> +   You should have received a copy of the GNU Lesser General Public
>>> +   License along with the GNU C Library; if not, see
>>> +   <https://www.gnu.org/licenses/>.  */
>>> +
>>> +#include <sysdep.h>
>>> +
>>> +#ifndef STRCPY
>>> +# define STRCPY strcpy
>>> +#endif
>>> +
>>> +/* Implements the function
>>> +
>>> +   char * [r3] strcpy (char *dest [r3], const char *src [r4])
>>> +
>>> +   The implementation can load bytes past a null terminator, but
>>> only
>>> +   up to the next 16B boundary, so it never crosses a page.  */
>>> +
>>> +.machine power9
>>> +ENTRY_TOCLESS (STRCPY, 4)
>>> +	CALL_MCOUNT 2
>>> +
>>> +	/* NULL string optimisation  */
>>> +	lbz	r0,0(r4)
>>> +	stb	r0,0(r3)
>>> +	cmpwi	r0,0
>>> +	beqlr  
>>
>> Is it a worth optimization? None of other strcpy arch optimizations
>> does it.
>>
>>> +
>>> +	addi	r4,r4,1
>>> +	addi	r11,r3,1
>>> +
>>> +	vspltisb v18,0		/* Zeroes in v18  */
>>> +
>>> +	neg	r5,r4
>>> +	rldicl	r9,r5,0,60	/* How many bytes to get
>>> source 16B aligned?  */ +
>>> +	/* Get source 16B aligned  */
>>> +	lvx	v0,0,r4
>>> +	lvsr	v1,0,r4
>>> +	vperm	v0,v18,v0,v1
>>> +
>>> +	vcmpequb v6,v0,v18	/* 0xff if byte is NULL, 0x00
>>> otherwise  */
>>> +	vctzlsbb r8,v6		/* Number of trailing zeroes
>>>  */
>>> +	addi	r8,r8,1		/* Add null terminator
>>> */ +
>>> +	/* r8 = bytes including null
>>> +	   r9 = bytes to get source 16B aligned
>>> +	   if r8 > r9
>>> +	      no null, copy r9 bytes
>>> +	   else
>>> +	      there is a null, copy r8 bytes and return.  */
>>> +	cmpd	r8,r9
>>> +	bgt	L(no_null)
>>> +
>>> +	sldi	r10,r8,56	/* stxvl wants size in top 8
>>> bits  */
>>> +	stxvl	32+v0,r11,r10	/* Partial store  */
>>> +
>>> +	blr
>>> +
>>> +L(no_null):
>>> +	sldi	r10,r9,56	/* stxvl wants size in top 8
>>> bits  */
>>> +	stxvl	32+v0,r11,r10	/* Partial store  */
>>> +
>>> +	add	r4,r4,r9
>>> +	add	r11,r11,r9
>>> +
>>> +L(loop):  
>>
>> Should we eforce alignment here?
>>
>>> +	lxv	32+v0,0(r4)
>>> +	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
>>> +	bne	cr6,L(tail1)
>>> +
>>> +	lxv	32+v1,16(r4)
>>> +	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
>>> +	bne	cr6,L(tail2)
>>> +
>>> +	lxv	32+v2,32(r4)
>>> +	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
>>> +	bne	cr6,L(tail3)
>>> +
>>> +	lxv	32+v3,48(r4)
>>> +	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
>>> +	bne	cr6,L(tail4)
>>> +
>>> +	stxv	32+v0,0(r11)
>>> +	stxv	32+v1,16(r11)
>>> +	stxv	32+v2,32(r11)
>>> +	stxv	32+v3,48(r11)  
>>
>> Out of curiosity, why unroll 4x time here?
>>
>>> +
>>> +	addi	r4,r4,64
>>> +	addi	r11,r11,64
>>> +
>>> +	b	L(loop)
>>> +
>>> +L(tail1):
>>> +	vctzlsbb r8,v6
>>> +	addi	r8,r8,1
>>> +	sldi	r9,r8,56	/* stxvl wants size in top 8
>>> bits  */
>>> +	stxvl	32+v0,r11,r9
>>> +	blr
>>> +
>>> +L(tail2):
>>> +	stxv	32+v0,0(r11)
>>> +	vctzlsbb r8,v6		/* Number of trailing zeroes
>>>  */
>>> +	addi	r8,r8,1		/* Add null terminator
>>> */
>>> +	sldi	r10,r8,56	/* stxvl wants size in top 8
>>> bits  */
>>> +	addi	r11,r11,16
>>> +	stxvl	32+v1,r11,r10	/* Partial store  */
>>> +	blr
>>> +
>>> +L(tail3):
>>> +	stxv	32+v0,0(r11)
>>> +	stxv	32+v1,16(r11)
>>> +	vctzlsbb r8,v6		/* Number of trailing zeroes
>>>  */
>>> +	addi	r8,r8,1		/* Add null terminator
>>> */
>>> +	sldi	r10,r8,56	/* stxvl wants size in top 8
>>> bits  */
>>> +	addi	r11,r11,32
>>> +	stxvl	32+v2,r11,r10	/* Partial store  */
>>> +	blr
>>> +
>>> +L(tail4):
>>> +	stxv	32+v0,0(r11)
>>> +	stxv	32+v1,16(r11)
>>> +	stxv	32+v2,32(r11)
>>> +	vctzlsbb r8,v6		/* Number of trailing zeroes
>>>  */
>>> +	addi	r8,r8,1		/* Add null terminator
>>> */
>>> +	sldi	r10,r8,56	/* stxvl wants size in top 8
>>> bits  */
>>> +	addi	r11,r11,48
>>> +	stxvl	32+v3,r11,r10	/* Partial store  */
>>> +	blr
>>> +END (STRCPY)
>>> +libc_hidden_builtin_def (strcpy)
>>> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile
>>> b/sysdeps/powerpc/powerpc64/multiarch/Makefile index
>>> ea936bf9ed..db11345053 100644 ---
>>> a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++
>>> b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -32,7 +32,7 @@
>>> sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2
>>> memcpy-power6 \ strncase-power8 
>>>  ifneq (,$(filter %le,$(config-machine)))
>>> -sysdep_routines += strcmp-power9 strncmp-power9
>>> +sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9
>>>  endif
>>>  CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
>>>  CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops  
>>
>> Ok.
>>
>>> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
>>> b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index
>>> b9fef3f43c..ad11ede20e 100644 ---
>>> a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++
>>> b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -85,6
>>> +85,10 @@ __libc_ifunc_impl_list (const char *name, struct
>>> libc_ifunc_impl *array, /* Support
>>> sysdeps/powerpc/powerpc64/multiarch/strcpy.c.  */ IFUNC_IMPL (i,
>>> name, strcpy, +#ifdef __LITTLE_ENDIAN__
>>> +	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 &
>>> PPC_FEATURE2_ARCH_3_00,
>>> +			      __strcpy_power9)
>>> +#endif
>>>  	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 &
>>> PPC_FEATURE2_ARCH_2_07, __strcpy_power8)
>>>  	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap &
>>> PPC_FEATURE_HAS_VSX,  
>>
>> Ok.
>>
>>> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
>>> b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S new file mode
>>> 100644 index 0000000000..d22aa0a8d6
>>> --- /dev/null
>>> +++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
>>> @@ -0,0 +1,26 @@
>>> +/* Optimized strcpy implementation for POWER9/PPC64.
>>> +   Copyright (C) 2016-2020 Free Software Foundation, Inc.
>>> +   This file is part of the GNU C Library.
>>> +
>>> +   The GNU C Library is free software; you can redistribute it
>>> and/or
>>> +   modify it under the terms of the GNU Lesser General Public
>>> +   License as published by the Free Software Foundation; either
>>> +   version 2.1 of the License, or (at your option) any later
>>> version. +
>>> +   The GNU C Library is distributed in the hope that it will be
>>> useful,
>>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>> GNU
>>> +   Lesser General Public License for more details.
>>> +
>>> +   You should have received a copy of the GNU Lesser General Public
>>> +   License along with the GNU C Library; if not, see
>>> +   <https://www.gnu.org/licenses/>.  */
>>> +
>>> +#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
>>> +#define STRCPY __strcpy_power9
>>> +
>>> +#undef libc_hidden_builtin_def
>>> +#define libc_hidden_builtin_def(name)
>>> +
>>> +#include <sysdeps/powerpc/powerpc64/le/power9/strcpy.S>
>>> +#endif  
>>
>> Ok.
>>
>>> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
>>> b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c index
>>> 66ea93b8f4..dcdee5181f 100644 ---
>>> a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c +++
>>> b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c @@ -25,9 +25,16 @@
>>>  extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
>>>  extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
>>>  extern __typeof (strcpy) __strcpy_power8 attribute_hidden;
>>> +# ifdef __LITTLE_ENDIAN__
>>> +extern __typeof (strcpy) __strcpy_power9 attribute_hidden;
>>> +# endif
>>>  #undef strcpy
>>>  
>>>  libc_ifunc_redirected (__redirect_strcpy, strcpy,
>>> +# ifdef __LITTLE_ENDIAN__
>>> +			(hwcap2 & PPC_FEATURE2_ARCH_3_00)
>>> +			? __strcpy_power9 :
>>> +# endif
>>>  		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
>>>  		       ? __strcpy_power8
>>>  		       : (hwcap & PPC_FEATURE_HAS_VSX)
>>>   
>>
>> Ok.
>>
>
diff mbox series

Patch

diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
new file mode 100644
index 0000000000..5749228054
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
@@ -0,0 +1,144 @@ 
+/* Optimized strcpy implementation for PowerPC64/POWER9.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifndef STRCPY
+# define STRCPY strcpy
+#endif
+
+/* Implements the function
+
+   char * [r3] strcpy (char *dest [r3], const char *src [r4])
+
+   The implementation can load bytes past a null terminator, but only
+   up to the next 16B boundary, so it never crosses a page.  */
+
+.machine power9
+ENTRY_TOCLESS (STRCPY, 4)
+	CALL_MCOUNT 2
+
+	/* Empty string fast path: copy byte 0, return if it is the null  */
+	lbz	r0,0(r4)	/* r0 = src[0]  */
+	stb	r0,0(r3)	/* dest[0] = src[0]  */
+	cmpwi	r0,0
+	beqlr			/* Empty string: done, r3 is still dest  */
+
+	addi	r4,r4,1		/* r4 = src + 1  */
+	addi	r11,r3,1	/* r11 = dest + 1; r3 is kept as return value  */
+
+	vspltisb v18,0		/* Zeroes in v18  */
+
+	neg	r5,r4
+	rldicl	r9,r5,0,60	/* r9 = -r4 & 15: bytes to get source 16B aligned  */
+
+	/* Load the 16B block holding r4, then permute it with the v18 zeroes  */
+	lvx	v0,0,r4
+	lvsr	v1,0,r4
+	vperm	v0,v18,v0,v1
+
+	vcmpequb v6,v0,v18	/* 0xff if byte is null, 0x00 otherwise  */
+	vctzlsbb r8,v6		/* Bytes before the first zero byte  */
+	addi	r8,r8,1		/* Add null terminator  */
+
+	/* r8 = bytes including null
+	   r9 = bytes to get source 16B aligned
+	   if r8 > r9
+	      no null, copy r9 bytes
+	   else
+	      there is a null, copy r8 bytes and return.  */
+	cmpd	r8,r9
+	bgt	L(no_null)
+
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+
+	blr
+
+L(no_null):
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+
+	add	r4,r4,r9	/* Source is now 16B aligned  */
+	add	r11,r11,r9	/* Advance dest by the same amount  */
+
+L(loop):
+	lxv	32+v0,0(r4)
+	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
+	bne	cr6,L(tail1)
+
+	lxv	32+v1,16(r4)
+	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
+	bne	cr6,L(tail2)
+
+	lxv	32+v2,32(r4)
+	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
+	bne	cr6,L(tail3)
+
+	lxv	32+v3,48(r4)
+	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
+	bne	cr6,L(tail4)
+
+	stxv	32+v0,0(r11)	/* No null in these 64B: store all four  */
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	stxv	32+v3,48(r11)
+
+	addi	r4,r4,64
+	addi	r11,r11,64
+
+	b	L(loop)
+
+L(tail1):
+	vctzlsbb r8,v6		/* Bytes before the first null in v0  */
+	addi	r8,r8,1		/* Add null terminator  */
+	sldi	r9,r8,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r9	/* Partial store  */
+	blr
+
+L(tail2):
+	stxv	32+v0,0(r11)	/* v0 had no null: store all 16B  */
+	vctzlsbb r8,v6		/* Bytes before the first null in v1  */
+	addi	r8,r8,1		/* Add null terminator  */
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,16
+	stxvl	32+v1,r11,r10	/* Partial store  */
+	blr
+
+L(tail3):
+	stxv	32+v0,0(r11)	/* v0 and v1 had no null: store 32B  */
+	stxv	32+v1,16(r11)
+	vctzlsbb r8,v6		/* Bytes before the first null in v2  */
+	addi	r8,r8,1		/* Add null terminator  */
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,32
+	stxvl	32+v2,r11,r10	/* Partial store  */
+	blr
+
+L(tail4):
+	stxv	32+v0,0(r11)	/* v0, v1 and v2 had no null: store 48B  */
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	vctzlsbb r8,v6		/* Bytes before the first null in v3  */
+	addi	r8,r8,1		/* Add null terminator  */
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,48
+	stxvl	32+v3,r11,r10	/* Partial store  */
+	blr
+END (STRCPY)
+libc_hidden_builtin_def (strcpy)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index ea936bf9ed..db11345053 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -32,7 +32,7 @@  sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 		   strncase-power8
 
 ifneq (,$(filter %le,$(config-machine)))
-sysdep_routines += strcmp-power9 strncmp-power9
+sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index b9fef3f43c..ad11ede20e 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -85,6 +85,10 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcpy.c.  */
   IFUNC_IMPL (i, name, strcpy,
+#ifdef __LITTLE_ENDIAN__
+	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
+			      __strcpy_power9)
+#endif
 	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __strcpy_power8)
 	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap & PPC_FEATURE_HAS_VSX,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
new file mode 100644
index 0000000000..d22aa0a8d6
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
@@ -0,0 +1,26 @@ 
+/* Optimized strcpy implementation for POWER9/PPC64.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
+#define STRCPY __strcpy_power9	/* Entry point name used by the included file  */
+
+#undef libc_hidden_builtin_def	/* Make the included file's hidden def of strcpy expand to nothing  */
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/le/power9/strcpy.S>
+#endif /* __LITTLE_ENDIAN__ && IS_IN (libc)  */
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
index 66ea93b8f4..dcdee5181f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
@@ -25,9 +25,16 @@ 
 extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
 extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
 extern __typeof (strcpy) __strcpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (strcpy) __strcpy_power9 attribute_hidden;
+# endif
 #undef strcpy
 
 libc_ifunc_redirected (__redirect_strcpy, strcpy,
+# ifdef __LITTLE_ENDIAN__
+			(hwcap2 & PPC_FEATURE2_ARCH_3_00)
+			? __strcpy_power9 :
+# endif
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 		       ? __strcpy_power8
 		       : (hwcap & PPC_FEATURE_HAS_VSX)