Message ID | 20200501145209.1dea304c@kryten.localdomain |
---|---|
State | New |
Headers | show |
Series | powerpc: Optimized strcpy for POWER9 | expand |
On 01/05/2020 01:52, Anton Blanchard via Libc-alpha wrote: > This version is significantly faster on small strings and relatively > unaligned large strings. So it seems that it uses the ISA 3.0 partial stores to optimize vector instruction usage; could you add that to the commit message? The power8 version also adds an stpcpy version (which I am not sure is really a gain for short strings) based on its strcpy. Maybe it would be a good thing to check whether this could be adapted to be built as stpcpy as well. Usually for such optimizations we try to get baseline benchmark results using the glibc benchtests. Could you post the results for before and after? (Some of the glibc benchtests do have shortcomings, but at least they document some of the expected gains and performance differences with the new implementation.) > --- > sysdeps/powerpc/powerpc64/le/power9/strcpy.S | 144 ++++++++++++++++++ > sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +- > .../powerpc64/multiarch/ifunc-impl-list.c | 4 + > .../powerpc64/multiarch/strcpy-power9.S | 26 ++++ > sysdeps/powerpc/powerpc64/multiarch/strcpy.c | 7 + > 5 files changed, 182 insertions(+), 1 deletion(-) > create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strcpy.S > create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S > > diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S > new file mode 100644 > index 0000000000..5749228054 > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S > @@ -0,0 +1,144 @@ > +/* Optimized strcpy implementation for PowerPC64/POWER9. > + Copyright (C) 2020 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. 
> + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <sysdep.h> > + > +#ifndef STRCPY > +# define STRCPY strcpy > +#endif > + > +/* Implements the function > + > + char * [r3] strcpy (char *dest [r3], const char *src [r4]) > + > + The implementation can load bytes past a null terminator, but only > + up to the next 16B boundary, so it never crosses a page. */ > + > +.machine power9 > +ENTRY_TOCLESS (STRCPY, 4) > + CALL_MCOUNT 2 > + > + /* NULL string optimisation */ > + lbz r0,0(r4) > + stb r0,0(r3) > + cmpwi r0,0 > + beqlr Is this a worthwhile optimization? None of the other strcpy arch optimizations do it. > + > + addi r4,r4,1 > + addi r11,r3,1 > + > + vspltisb v18,0 /* Zeroes in v18 */ > + > + neg r5,r4 > + rldicl r9,r5,0,60 /* How many bytes to get source 16B aligned? */ > + > + /* Get source 16B aligned */ > + lvx v0,0,r4 > + lvsr v1,0,r4 > + vperm v0,v18,v0,v1 > + > + vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */ > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > + addi r8,r8,1 /* Add null terminator */ > + > + /* r8 = bytes including null > + r9 = bytes to get source 16B aligned > + if r8 > r9 > + no null, copy r9 bytes > + else > + there is a null, copy r8 bytes and return. */ > + cmpd r8,r9 > + bgt L(no_null) > + > + sldi r10,r8,56 /* stxvl wants size in top 8 bits */ > + stxvl 32+v0,r11,r10 /* Partial store */ > + > + blr > + > +L(no_null): > + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ > + stxvl 32+v0,r11,r10 /* Partial store */ > + > + add r4,r4,r9 > + add r11,r11,r9 > + > +L(loop): Should we enforce alignment here? > + lxv 32+v0,0(r4) > + vcmpequb. 
v6,v0,v18 /* Any zero bytes? */ > + bne cr6,L(tail1) > + > + lxv 32+v1,16(r4) > + vcmpequb. v6,v1,v18 /* Any zero bytes? */ > + bne cr6,L(tail2) > + > + lxv 32+v2,32(r4) > + vcmpequb. v6,v2,v18 /* Any zero bytes? */ > + bne cr6,L(tail3) > + > + lxv 32+v3,48(r4) > + vcmpequb. v6,v3,v18 /* Any zero bytes? */ > + bne cr6,L(tail4) > + > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + stxv 32+v3,48(r11) Out of curiosity, why unroll 4x time here? > + > + addi r4,r4,64 > + addi r11,r11,64 > + > + b L(loop) > + > +L(tail1): > + vctzlsbb r8,v6 > + addi r8,r8,1 > + sldi r9,r8,56 /* stxvl wants size in top 8 bits */ > + stxvl 32+v0,r11,r9 > + blr > + > +L(tail2): > + stxv 32+v0,0(r11) > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > + addi r8,r8,1 /* Add null terminator */ > + sldi r10,r8,56 /* stxvl wants size in top 8 bits */ > + addi r11,r11,16 > + stxvl 32+v1,r11,r10 /* Partial store */ > + blr > + > +L(tail3): > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > + addi r8,r8,1 /* Add null terminator */ > + sldi r10,r8,56 /* stxvl wants size in top 8 bits */ > + addi r11,r11,32 > + stxvl 32+v2,r11,r10 /* Partial store */ > + blr > + > +L(tail4): > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > + addi r8,r8,1 /* Add null terminator */ > + sldi r10,r8,56 /* stxvl wants size in top 8 bits */ > + addi r11,r11,48 > + stxvl 32+v3,r11,r10 /* Partial store */ > + blr > +END (STRCPY) > +libc_hidden_builtin_def (strcpy) > diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile > index ea936bf9ed..db11345053 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile > +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile > @@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ > strncase-power8 > > ifneq (,$(filter %le,$(config-machine))) > 
-sysdep_routines += strcmp-power9 strncmp-power9 > +sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 > endif > CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops > CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops Ok. > diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > index b9fef3f43c..ad11ede20e 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > @@ -85,6 +85,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/powerpc/powerpc64/multiarch/strcpy.c. */ > IFUNC_IMPL (i, name, strcpy, > +#ifdef __LITTLE_ENDIAN__ > + IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00, > + __strcpy_power9) > +#endif > IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07, > __strcpy_power8) > IFUNC_IMPL_ADD (array, i, strcpy, hwcap & PPC_FEATURE_HAS_VSX, Ok. > diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S > new file mode 100644 > index 0000000000..d22aa0a8d6 > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S > @@ -0,0 +1,26 @@ > +/* Optimized strcpy implementation for POWER9/PPC64. > + Copyright (C) 2016-2020 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. 
> + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#if defined __LITTLE_ENDIAN__ && IS_IN (libc) > +#define STRCPY __strcpy_power9 > + > +#undef libc_hidden_builtin_def > +#define libc_hidden_builtin_def(name) > + > +#include <sysdeps/powerpc/powerpc64/le/power9/strcpy.S> > +#endif Ok. > diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c > index 66ea93b8f4..dcdee5181f 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c > +++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c > @@ -25,9 +25,16 @@ > extern __typeof (strcpy) __strcpy_ppc attribute_hidden; > extern __typeof (strcpy) __strcpy_power7 attribute_hidden; > extern __typeof (strcpy) __strcpy_power8 attribute_hidden; > +# ifdef __LITTLE_ENDIAN__ > +extern __typeof (strcpy) __strcpy_power9 attribute_hidden; > +# endif > #undef strcpy > > libc_ifunc_redirected (__redirect_strcpy, strcpy, > +# ifdef __LITTLE_ENDIAN__ > + (hwcap2 & PPC_FEATURE2_ARCH_3_00) > + ? __strcpy_power9 : > +# endif > (hwcap2 & PPC_FEATURE2_ARCH_2_07) > ? __strcpy_power8 > : (hwcap & PPC_FEATURE_HAS_VSX) > Ok.
Hi Adhemerval, > > This version is significantly faster on small strings and relatively > > unaligned large strings. > > So it seems that it uses the ISA 3.0 partial stores to optimize vector > instructions usage, could you add it on the commit message? Will do. > The power8 version also adds an stpcpy version (which I am not sure is > really a gain for short strings) based on its strcpy. Maybe it could > be a good thing to check if this could be adapted to be build as > stpcpy as well. Good idea. I modified it to also implement stpcpy, and it shows decent gains - 40% faster on small strings and 90% faster on long relatively unaligned strings. > Usually for such optimizations we try to get a baseline benchmark > results using glibc benchtests. Could you post the results for > before and after? How do you want the results? Should I just mail the output of the relevant glibc benchtest? > (Some of the glibc benchtests does have some shortcomings, but at > least it documents some of the expected gains and performance > differences with the new implementation.) I wonder if an option to do more thorough testing would be useful. For example, I have a local modification to do an exhaustive test of all alignments and lengths up to a certain point (eg 128 bytes) to make sure there aren't any corner cases. More random argument benchmarks would be useful too. 
Thanks, Anton > > > --- > > sysdeps/powerpc/powerpc64/le/power9/strcpy.S | 144 > > ++++++++++++++++++ sysdeps/powerpc/powerpc64/multiarch/Makefile | > > 2 +- .../powerpc64/multiarch/ifunc-impl-list.c | 4 + > > .../powerpc64/multiarch/strcpy-power9.S | 26 ++++ > > sysdeps/powerpc/powerpc64/multiarch/strcpy.c | 7 + > > 5 files changed, 182 insertions(+), 1 deletion(-) > > create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strcpy.S > > create mode 100644 > > sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S > > > > diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S > > b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S new file mode 100644 > > index 0000000000..5749228054 > > --- /dev/null > > +++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S > > @@ -0,0 +1,144 @@ > > +/* Optimized strcpy implementation for PowerPC64/POWER9. > > + Copyright (C) 2020 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it > > and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later > > version. + > > + The GNU C Library is distributed in the hope that it will be > > useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > > GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. 
*/ > > + > > +#include <sysdep.h> > > + > > +#ifndef STRCPY > > +# define STRCPY strcpy > > +#endif > > + > > +/* Implements the function > > + > > + char * [r3] strcpy (char *dest [r3], const char *src [r4]) > > + > > + The implementation can load bytes past a null terminator, but > > only > > + up to the next 16B boundary, so it never crosses a page. */ > > + > > +.machine power9 > > +ENTRY_TOCLESS (STRCPY, 4) > > + CALL_MCOUNT 2 > > + > > + /* NULL string optimisation */ > > + lbz r0,0(r4) > > + stb r0,0(r3) > > + cmpwi r0,0 > > + beqlr > > Is it a worth optimization? None of other strcpy arch optimizations > does it. > > > + > > + addi r4,r4,1 > > + addi r11,r3,1 > > + > > + vspltisb v18,0 /* Zeroes in v18 */ > > + > > + neg r5,r4 > > + rldicl r9,r5,0,60 /* How many bytes to get > > source 16B aligned? */ + > > + /* Get source 16B aligned */ > > + lvx v0,0,r4 > > + lvsr v1,0,r4 > > + vperm v0,v18,v0,v1 > > + > > + vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 > > otherwise */ > > + vctzlsbb r8,v6 /* Number of trailing zeroes > > */ > > + addi r8,r8,1 /* Add null terminator > > */ + > > + /* r8 = bytes including null > > + r9 = bytes to get source 16B aligned > > + if r8 > r9 > > + no null, copy r9 bytes > > + else > > + there is a null, copy r8 bytes and return. */ > > + cmpd r8,r9 > > + bgt L(no_null) > > + > > + sldi r10,r8,56 /* stxvl wants size in top 8 > > bits */ > > + stxvl 32+v0,r11,r10 /* Partial store */ > > + > > + blr > > + > > +L(no_null): > > + sldi r10,r9,56 /* stxvl wants size in top 8 > > bits */ > > + stxvl 32+v0,r11,r10 /* Partial store */ > > + > > + add r4,r4,r9 > > + add r11,r11,r9 > > + > > +L(loop): > > Should we eforce alignment here? > > > + lxv 32+v0,0(r4) > > + vcmpequb. v6,v0,v18 /* Any zero bytes? */ > > + bne cr6,L(tail1) > > + > > + lxv 32+v1,16(r4) > > + vcmpequb. v6,v1,v18 /* Any zero bytes? */ > > + bne cr6,L(tail2) > > + > > + lxv 32+v2,32(r4) > > + vcmpequb. v6,v2,v18 /* Any zero bytes? 
*/ > > + bne cr6,L(tail3) > > + > > + lxv 32+v3,48(r4) > > + vcmpequb. v6,v3,v18 /* Any zero bytes? */ > > + bne cr6,L(tail4) > > + > > + stxv 32+v0,0(r11) > > + stxv 32+v1,16(r11) > > + stxv 32+v2,32(r11) > > + stxv 32+v3,48(r11) > > Out of curiosity, why unroll 4x time here? > > > + > > + addi r4,r4,64 > > + addi r11,r11,64 > > + > > + b L(loop) > > + > > +L(tail1): > > + vctzlsbb r8,v6 > > + addi r8,r8,1 > > + sldi r9,r8,56 /* stxvl wants size in top 8 > > bits */ > > + stxvl 32+v0,r11,r9 > > + blr > > + > > +L(tail2): > > + stxv 32+v0,0(r11) > > + vctzlsbb r8,v6 /* Number of trailing zeroes > > */ > > + addi r8,r8,1 /* Add null terminator > > */ > > + sldi r10,r8,56 /* stxvl wants size in top 8 > > bits */ > > + addi r11,r11,16 > > + stxvl 32+v1,r11,r10 /* Partial store */ > > + blr > > + > > +L(tail3): > > + stxv 32+v0,0(r11) > > + stxv 32+v1,16(r11) > > + vctzlsbb r8,v6 /* Number of trailing zeroes > > */ > > + addi r8,r8,1 /* Add null terminator > > */ > > + sldi r10,r8,56 /* stxvl wants size in top 8 > > bits */ > > + addi r11,r11,32 > > + stxvl 32+v2,r11,r10 /* Partial store */ > > + blr > > + > > +L(tail4): > > + stxv 32+v0,0(r11) > > + stxv 32+v1,16(r11) > > + stxv 32+v2,32(r11) > > + vctzlsbb r8,v6 /* Number of trailing zeroes > > */ > > + addi r8,r8,1 /* Add null terminator > > */ > > + sldi r10,r8,56 /* stxvl wants size in top 8 > > bits */ > > + addi r11,r11,48 > > + stxvl 32+v3,r11,r10 /* Partial store */ > > + blr > > +END (STRCPY) > > +libc_hidden_builtin_def (strcpy) > > diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile > > b/sysdeps/powerpc/powerpc64/multiarch/Makefile index > > ea936bf9ed..db11345053 100644 --- > > a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ > > b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -32,7 +32,7 @@ > > sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 > > memcpy-power6 \ strncase-power8 > > ifneq (,$(filter %le,$(config-machine))) > > -sysdep_routines += strcmp-power9 strncmp-power9 > > 
+sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 > > endif > > CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops > > CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops > > Ok. > > > diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > > b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index > > b9fef3f43c..ad11ede20e 100644 --- > > a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ > > b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -85,6 > > +85,10 @@ __libc_ifunc_impl_list (const char *name, struct > > libc_ifunc_impl *array, /* Support > > sysdeps/powerpc/powerpc64/multiarch/strcpy.c. */ IFUNC_IMPL (i, > > name, strcpy, +#ifdef __LITTLE_ENDIAN__ > > + IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & > > PPC_FEATURE2_ARCH_3_00, > > + __strcpy_power9) > > +#endif > > IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & > > PPC_FEATURE2_ARCH_2_07, __strcpy_power8) > > IFUNC_IMPL_ADD (array, i, strcpy, hwcap & > > PPC_FEATURE_HAS_VSX, > > Ok. > > > diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S > > b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S new file mode > > 100644 index 0000000000..d22aa0a8d6 > > --- /dev/null > > +++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S > > @@ -0,0 +1,26 @@ > > +/* Optimized strcpy implementation for POWER9/PPC64. > > + Copyright (C) 2016-2020 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it > > and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later > > version. + > > + The GNU C Library is distributed in the hope that it will be > > useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the > > GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#if defined __LITTLE_ENDIAN__ && IS_IN (libc) > > +#define STRCPY __strcpy_power9 > > + > > +#undef libc_hidden_builtin_def > > +#define libc_hidden_builtin_def(name) > > + > > +#include <sysdeps/powerpc/powerpc64/le/power9/strcpy.S> > > +#endif > > Ok. > > > diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c > > b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c index > > 66ea93b8f4..dcdee5181f 100644 --- > > a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c +++ > > b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c @@ -25,9 +25,16 @@ > > extern __typeof (strcpy) __strcpy_ppc attribute_hidden; > > extern __typeof (strcpy) __strcpy_power7 attribute_hidden; > > extern __typeof (strcpy) __strcpy_power8 attribute_hidden; > > +# ifdef __LITTLE_ENDIAN__ > > +extern __typeof (strcpy) __strcpy_power9 attribute_hidden; > > +# endif > > #undef strcpy > > > > libc_ifunc_redirected (__redirect_strcpy, strcpy, > > +# ifdef __LITTLE_ENDIAN__ > > + (hwcap2 & PPC_FEATURE2_ARCH_3_00) > > + ? __strcpy_power9 : > > +# endif > > (hwcap2 & PPC_FEATURE2_ARCH_2_07) > > ? __strcpy_power8 > > : (hwcap & PPC_FEATURE_HAS_VSX) > > > > Ok. >
On 04/05/2020 23:40, Anton Blanchard wrote: > Hi Adhemerval, > >>> This version is significantly faster on small strings and relatively >>> unaligned large strings. >> >> So it seems that it uses the ISA 3.0 partial stores to optimize vector >> instructions usage, could you add it on the commit message? > > Will do. > >> The power8 version also adds an stpcpy version (which I am not sure is >> really a gain for short strings) based on its strcpy. Maybe it could >> be a good thing to check if this could be adapted to be build as >> stpcpy as well. > > Good idea. I modified it to also implement stpcpy, and it shows decent > gains - 40% faster on small strings and 90% faster on long relatively > unaligned strings. Nice. > >> Usually for such optimizations we try to get a baseline benchmark >> results using glibc benchtests. Could you post the results for >> before and after? > > How do you want the results? Should I just mail the output of the > relevant glibc benchtest? Yes, just attach the bench-st{r,p}cpy.out files from before and after the patch. > >> (Some of the glibc benchtests does have some shortcomings, but at >> least it documents some of the expected gains and performance >> differences with the new implementation.) > > I wonder if an option to do more thorough testing would be useful. For > example, I have a local modification to do an exhaustive test of all > alignments and lengths up to a certain point (eg 128 bytes) to make > sure there aren't any corner cases. Sure, testing improvements are always welcome! > > More random argument benchmarks would be useful too. And benchtest improvements as well. 
> > Thanks, > Anton > >> >>> --- >>> sysdeps/powerpc/powerpc64/le/power9/strcpy.S | 144 >>> ++++++++++++++++++ sysdeps/powerpc/powerpc64/multiarch/Makefile | >>> 2 +- .../powerpc64/multiarch/ifunc-impl-list.c | 4 + >>> .../powerpc64/multiarch/strcpy-power9.S | 26 ++++ >>> sysdeps/powerpc/powerpc64/multiarch/strcpy.c | 7 + >>> 5 files changed, 182 insertions(+), 1 deletion(-) >>> create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strcpy.S >>> create mode 100644 >>> sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S >>> >>> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S >>> b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S new file mode 100644 >>> index 0000000000..5749228054 >>> --- /dev/null >>> +++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S >>> @@ -0,0 +1,144 @@ >>> +/* Optimized strcpy implementation for PowerPC64/POWER9. >>> + Copyright (C) 2020 Free Software Foundation, Inc. >>> + This file is part of the GNU C Library. >>> + >>> + The GNU C Library is free software; you can redistribute it >>> and/or >>> + modify it under the terms of the GNU Lesser General Public >>> + License as published by the Free Software Foundation; either >>> + version 2.1 of the License, or (at your option) any later >>> version. + >>> + The GNU C Library is distributed in the hope that it will be >>> useful, >>> + but WITHOUT ANY WARRANTY; without even the implied warranty of >>> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >>> GNU >>> + Lesser General Public License for more details. >>> + >>> + You should have received a copy of the GNU Lesser General Public >>> + License along with the GNU C Library; if not, see >>> + <https://www.gnu.org/licenses/>. 
*/ >>> + >>> +#include <sysdep.h> >>> + >>> +#ifndef STRCPY >>> +# define STRCPY strcpy >>> +#endif >>> + >>> +/* Implements the function >>> + >>> + char * [r3] strcpy (char *dest [r3], const char *src [r4]) >>> + >>> + The implementation can load bytes past a null terminator, but >>> only >>> + up to the next 16B boundary, so it never crosses a page. */ >>> + >>> +.machine power9 >>> +ENTRY_TOCLESS (STRCPY, 4) >>> + CALL_MCOUNT 2 >>> + >>> + /* NULL string optimisation */ >>> + lbz r0,0(r4) >>> + stb r0,0(r3) >>> + cmpwi r0,0 >>> + beqlr >> >> Is it a worth optimization? None of other strcpy arch optimizations >> does it. >> >>> + >>> + addi r4,r4,1 >>> + addi r11,r3,1 >>> + >>> + vspltisb v18,0 /* Zeroes in v18 */ >>> + >>> + neg r5,r4 >>> + rldicl r9,r5,0,60 /* How many bytes to get >>> source 16B aligned? */ + >>> + /* Get source 16B aligned */ >>> + lvx v0,0,r4 >>> + lvsr v1,0,r4 >>> + vperm v0,v18,v0,v1 >>> + >>> + vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 >>> otherwise */ >>> + vctzlsbb r8,v6 /* Number of trailing zeroes >>> */ >>> + addi r8,r8,1 /* Add null terminator >>> */ + >>> + /* r8 = bytes including null >>> + r9 = bytes to get source 16B aligned >>> + if r8 > r9 >>> + no null, copy r9 bytes >>> + else >>> + there is a null, copy r8 bytes and return. */ >>> + cmpd r8,r9 >>> + bgt L(no_null) >>> + >>> + sldi r10,r8,56 /* stxvl wants size in top 8 >>> bits */ >>> + stxvl 32+v0,r11,r10 /* Partial store */ >>> + >>> + blr >>> + >>> +L(no_null): >>> + sldi r10,r9,56 /* stxvl wants size in top 8 >>> bits */ >>> + stxvl 32+v0,r11,r10 /* Partial store */ >>> + >>> + add r4,r4,r9 >>> + add r11,r11,r9 >>> + >>> +L(loop): >> >> Should we eforce alignment here? >> >>> + lxv 32+v0,0(r4) >>> + vcmpequb. v6,v0,v18 /* Any zero bytes? */ >>> + bne cr6,L(tail1) >>> + >>> + lxv 32+v1,16(r4) >>> + vcmpequb. v6,v1,v18 /* Any zero bytes? */ >>> + bne cr6,L(tail2) >>> + >>> + lxv 32+v2,32(r4) >>> + vcmpequb. v6,v2,v18 /* Any zero bytes? 
*/ >>> + bne cr6,L(tail3) >>> + >>> + lxv 32+v3,48(r4) >>> + vcmpequb. v6,v3,v18 /* Any zero bytes? */ >>> + bne cr6,L(tail4) >>> + >>> + stxv 32+v0,0(r11) >>> + stxv 32+v1,16(r11) >>> + stxv 32+v2,32(r11) >>> + stxv 32+v3,48(r11) >> >> Out of curiosity, why unroll 4x time here? >> >>> + >>> + addi r4,r4,64 >>> + addi r11,r11,64 >>> + >>> + b L(loop) >>> + >>> +L(tail1): >>> + vctzlsbb r8,v6 >>> + addi r8,r8,1 >>> + sldi r9,r8,56 /* stxvl wants size in top 8 >>> bits */ >>> + stxvl 32+v0,r11,r9 >>> + blr >>> + >>> +L(tail2): >>> + stxv 32+v0,0(r11) >>> + vctzlsbb r8,v6 /* Number of trailing zeroes >>> */ >>> + addi r8,r8,1 /* Add null terminator >>> */ >>> + sldi r10,r8,56 /* stxvl wants size in top 8 >>> bits */ >>> + addi r11,r11,16 >>> + stxvl 32+v1,r11,r10 /* Partial store */ >>> + blr >>> + >>> +L(tail3): >>> + stxv 32+v0,0(r11) >>> + stxv 32+v1,16(r11) >>> + vctzlsbb r8,v6 /* Number of trailing zeroes >>> */ >>> + addi r8,r8,1 /* Add null terminator >>> */ >>> + sldi r10,r8,56 /* stxvl wants size in top 8 >>> bits */ >>> + addi r11,r11,32 >>> + stxvl 32+v2,r11,r10 /* Partial store */ >>> + blr >>> + >>> +L(tail4): >>> + stxv 32+v0,0(r11) >>> + stxv 32+v1,16(r11) >>> + stxv 32+v2,32(r11) >>> + vctzlsbb r8,v6 /* Number of trailing zeroes >>> */ >>> + addi r8,r8,1 /* Add null terminator >>> */ >>> + sldi r10,r8,56 /* stxvl wants size in top 8 >>> bits */ >>> + addi r11,r11,48 >>> + stxvl 32+v3,r11,r10 /* Partial store */ >>> + blr >>> +END (STRCPY) >>> +libc_hidden_builtin_def (strcpy) >>> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile >>> b/sysdeps/powerpc/powerpc64/multiarch/Makefile index >>> ea936bf9ed..db11345053 100644 --- >>> a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ >>> b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -32,7 +32,7 @@ >>> sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 >>> memcpy-power6 \ strncase-power8 >>> ifneq (,$(filter %le,$(config-machine))) >>> -sysdep_routines += strcmp-power9 strncmp-power9 
>>> +sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 >>> endif >>> CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops >>> CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops >> >> Ok. >> >>> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c >>> b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index >>> b9fef3f43c..ad11ede20e 100644 --- >>> a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ >>> b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -85,6 >>> +85,10 @@ __libc_ifunc_impl_list (const char *name, struct >>> libc_ifunc_impl *array, /* Support >>> sysdeps/powerpc/powerpc64/multiarch/strcpy.c. */ IFUNC_IMPL (i, >>> name, strcpy, +#ifdef __LITTLE_ENDIAN__ >>> + IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & >>> PPC_FEATURE2_ARCH_3_00, >>> + __strcpy_power9) >>> +#endif >>> IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & >>> PPC_FEATURE2_ARCH_2_07, __strcpy_power8) >>> IFUNC_IMPL_ADD (array, i, strcpy, hwcap & >>> PPC_FEATURE_HAS_VSX, >> >> Ok. >> >>> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S >>> b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S new file mode >>> 100644 index 0000000000..d22aa0a8d6 >>> --- /dev/null >>> +++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S >>> @@ -0,0 +1,26 @@ >>> +/* Optimized strcpy implementation for POWER9/PPC64. >>> + Copyright (C) 2016-2020 Free Software Foundation, Inc. >>> + This file is part of the GNU C Library. >>> + >>> + The GNU C Library is free software; you can redistribute it >>> and/or >>> + modify it under the terms of the GNU Lesser General Public >>> + License as published by the Free Software Foundation; either >>> + version 2.1 of the License, or (at your option) any later >>> version. + >>> + The GNU C Library is distributed in the hope that it will be >>> useful, >>> + but WITHOUT ANY WARRANTY; without even the implied warranty of >>> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the >>> GNU >>> + Lesser General Public License for more details. >>> + >>> + You should have received a copy of the GNU Lesser General Public >>> + License along with the GNU C Library; if not, see >>> + <https://www.gnu.org/licenses/>. */ >>> + >>> +#if defined __LITTLE_ENDIAN__ && IS_IN (libc) >>> +#define STRCPY __strcpy_power9 >>> + >>> +#undef libc_hidden_builtin_def >>> +#define libc_hidden_builtin_def(name) >>> + >>> +#include <sysdeps/powerpc/powerpc64/le/power9/strcpy.S> >>> +#endif >> >> Ok. >> >>> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c >>> b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c index >>> 66ea93b8f4..dcdee5181f 100644 --- >>> a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c +++ >>> b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c @@ -25,9 +25,16 @@ >>> extern __typeof (strcpy) __strcpy_ppc attribute_hidden; >>> extern __typeof (strcpy) __strcpy_power7 attribute_hidden; >>> extern __typeof (strcpy) __strcpy_power8 attribute_hidden; >>> +# ifdef __LITTLE_ENDIAN__ >>> +extern __typeof (strcpy) __strcpy_power9 attribute_hidden; >>> +# endif >>> #undef strcpy >>> >>> libc_ifunc_redirected (__redirect_strcpy, strcpy, >>> +# ifdef __LITTLE_ENDIAN__ >>> + (hwcap2 & PPC_FEATURE2_ARCH_3_00) >>> + ? __strcpy_power9 : >>> +# endif >>> (hwcap2 & PPC_FEATURE2_ARCH_2_07) >>> ? __strcpy_power8 >>> : (hwcap & PPC_FEATURE_HAS_VSX) >>> >> >> Ok. >> >
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S new file mode 100644 index 0000000000..5749228054 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S @@ -0,0 +1,144 @@ +/* Optimized strcpy implementation for PowerPC64/POWER9. + Copyright (C) 2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#ifndef STRCPY +# define STRCPY strcpy +#endif + +/* Implements the function + + char * [r3] strcpy (char *dest [r3], const char *src [r4]) + + The implementation can load bytes past a null terminator, but only + up to the next 16B boundary, so it never crosses a page. */ + +.machine power9 +ENTRY_TOCLESS (STRCPY, 4) + CALL_MCOUNT 2 + + /* NULL string optimisation */ + lbz r0,0(r4) + stb r0,0(r3) + cmpwi r0,0 + beqlr + + addi r4,r4,1 + addi r11,r3,1 + + vspltisb v18,0 /* Zeroes in v18 */ + + neg r5,r4 + rldicl r9,r5,0,60 /* How many bytes to get source 16B aligned? 
*/ + + /* Get source 16B aligned */ + lvx v0,0,r4 + lvsr v1,0,r4 + vperm v0,v18,v0,v1 + + vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */ + vctzlsbb r8,v6 /* Number of trailing zeroes */ + addi r8,r8,1 /* Add null terminator */ + + /* r8 = bytes including null + r9 = bytes to get source 16B aligned + if r8 > r9 + no null, copy r9 bytes + else + there is a null, copy r8 bytes and return. */ + cmpd r8,r9 + bgt L(no_null) + + sldi r10,r8,56 /* stxvl wants size in top 8 bits */ + stxvl 32+v0,r11,r10 /* Partial store */ + + blr + +L(no_null): + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ + stxvl 32+v0,r11,r10 /* Partial store */ + + add r4,r4,r9 + add r11,r11,r9 + +L(loop): + lxv 32+v0,0(r4) + vcmpequb. v6,v0,v18 /* Any zero bytes? */ + bne cr6,L(tail1) + + lxv 32+v1,16(r4) + vcmpequb. v6,v1,v18 /* Any zero bytes? */ + bne cr6,L(tail2) + + lxv 32+v2,32(r4) + vcmpequb. v6,v2,v18 /* Any zero bytes? */ + bne cr6,L(tail3) + + lxv 32+v3,48(r4) + vcmpequb. v6,v3,v18 /* Any zero bytes? 
*/ + bne cr6,L(tail4) + + stxv 32+v0,0(r11) + stxv 32+v1,16(r11) + stxv 32+v2,32(r11) + stxv 32+v3,48(r11) + + addi r4,r4,64 + addi r11,r11,64 + + b L(loop) + +L(tail1): + vctzlsbb r8,v6 + addi r8,r8,1 + sldi r9,r8,56 /* stxvl wants size in top 8 bits */ + stxvl 32+v0,r11,r9 + blr + +L(tail2): + stxv 32+v0,0(r11) + vctzlsbb r8,v6 /* Number of trailing zeroes */ + addi r8,r8,1 /* Add null terminator */ + sldi r10,r8,56 /* stxvl wants size in top 8 bits */ + addi r11,r11,16 + stxvl 32+v1,r11,r10 /* Partial store */ + blr + +L(tail3): + stxv 32+v0,0(r11) + stxv 32+v1,16(r11) + vctzlsbb r8,v6 /* Number of trailing zeroes */ + addi r8,r8,1 /* Add null terminator */ + sldi r10,r8,56 /* stxvl wants size in top 8 bits */ + addi r11,r11,32 + stxvl 32+v2,r11,r10 /* Partial store */ + blr + +L(tail4): + stxv 32+v0,0(r11) + stxv 32+v1,16(r11) + stxv 32+v2,32(r11) + vctzlsbb r8,v6 /* Number of trailing zeroes */ + addi r8,r8,1 /* Add null terminator */ + sldi r10,r8,56 /* stxvl wants size in top 8 bits */ + addi r11,r11,48 + stxvl 32+v3,r11,r10 /* Partial store */ + blr +END (STRCPY) +libc_hidden_builtin_def (strcpy) diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index ea936bf9ed..db11345053 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ strncase-power8 ifneq (,$(filter %le,$(config-machine))) -sysdep_routines += strcmp-power9 strncmp-power9 +sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 endif CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index b9fef3f43c..ad11ede20e 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ 
b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -85,6 +85,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/strcpy.c. */ IFUNC_IMPL (i, name, strcpy, +#ifdef __LITTLE_ENDIAN__ + IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00, + __strcpy_power9) +#endif IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07, __strcpy_power8) IFUNC_IMPL_ADD (array, i, strcpy, hwcap & PPC_FEATURE_HAS_VSX, diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S new file mode 100644 index 0000000000..d22aa0a8d6 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S @@ -0,0 +1,26 @@ +/* Optimized strcpy implementation for POWER9/PPC64. + Copyright (C) 2016-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#if defined __LITTLE_ENDIAN__ && IS_IN (libc) +#define STRCPY __strcpy_power9 + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) + +#include <sysdeps/powerpc/powerpc64/le/power9/strcpy.S> +#endif diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c index 66ea93b8f4..dcdee5181f 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c @@ -25,9 +25,16 @@ extern __typeof (strcpy) __strcpy_ppc attribute_hidden; extern __typeof (strcpy) __strcpy_power7 attribute_hidden; extern __typeof (strcpy) __strcpy_power8 attribute_hidden; +# ifdef __LITTLE_ENDIAN__ +extern __typeof (strcpy) __strcpy_power9 attribute_hidden; +# endif #undef strcpy libc_ifunc_redirected (__redirect_strcpy, strcpy, +# ifdef __LITTLE_ENDIAN__ + (hwcap2 & PPC_FEATURE2_ARCH_3_00) + ? __strcpy_power9 : +# endif (hwcap2 & PPC_FEATURE2_ARCH_2_07) ? __strcpy_power8 : (hwcap & PPC_FEATURE_HAS_VSX)