Message ID | 20200904165653.16202-2-rzinsly@linux.ibm.com |
---|---|
State | New |
Headers | show |
Series | [v2,1/2] powerpc: Add optimized strncpy for POWER9 | expand |
Benchtest output: generic_stpncpy __stpncpy_power9 __stpncpy_power8 __stpncpy_power7 __stpncpy_ppc Length 16, n 16, alignment 1/ 1: 6.55566 2.5481 2.74063 5.28665 9.96288 Length 16, n 16, alignment 1/ 1: 6.70016 2.54137 2.7108 4.77502 9.91703 Length 16, n 16, alignment 1/ 2: 6.55975 2.56295 2.70641 5.49298 9.59591 Length 16, n 16, alignment 2/ 1: 6.90759 2.52713 2.854 5.48949 9.37664 Length 2, n 4, alignment 7/ 2: 7.90969 2.22698 3.90151 4.6461 8.4503 Length 4, n 2, alignment 2/ 7: 6.14855 1.73403 2.67338 3.05675 6.86316 Length 2, n 4, alignment 7/ 2: 8.40868 2.22338 4.50838 4.51078 9.28489 Length 4, n 2, alignment 2/ 7: 6.14849 1.73402 2.67225 2.85349 6.34342 Length 16, n 16, alignment 2/ 2: 6.963 2.54442 2.87779 5.63547 9.85162 Length 16, n 16, alignment 2/ 2: 6.59452 2.54121 2.84662 5.57178 9.51406 Length 16, n 16, alignment 2/ 4: 6.79115 2.55835 2.84836 5.50427 9.67999 Length 16, n 16, alignment 4/ 2: 6.78419 2.54132 3.54229 5.52563 8.50938 Length 4, n 8, alignment 6/ 4: 8.45703 2.17266 4.80507 3.8714 9.04725 Length 8, n 4, alignment 4/ 6: 6.01753 1.73761 2.8185 2.41527 8.00051 Length 4, n 8, alignment 6/ 4: 7.82081 2.22612 4.80057 3.76103 8.99812 Length 8, n 4, alignment 4/ 6: 6.01752 1.73474 2.82089 2.41524 7.82703 Length 16, n 16, alignment 3/ 3: 6.78194 2.54143 3.21392 5.46447 8.90749 Length 16, n 16, alignment 3/ 3: 6.76324 2.54088 3.22883 5.39689 9.14749 Length 16, n 16, alignment 3/ 6: 7.05278 2.55795 3.22243 5.53422 9.11315 Length 16, n 16, alignment 6/ 3: 6.72881 2.54183 4.58459 5.51658 7.85006 Length 8, n 16, alignment 5/ 6: 7.67184 2.23969 4.13269 4.90728 10.2248 Length 16, n 8, alignment 6/ 5: 5.73672 1.88048 2.6693 4.35579 6.11674 Length 8, n 16, alignment 5/ 6: 7.51707 2.2284 3.67276 4.90637 10.2411 Length 16, n 8, alignment 6/ 5: 5.73665 1.88119 2.57514 3.96351 6.16253 Length 16, n 16, alignment 4/ 4: 7.03577 2.5415 3.66445 4.94157 8.98371 Length 16, n 16, alignment 4/ 4: 6.93549 2.53033 3.65577 5.53815 8.48335 Length 16, n 16, alignment 4/ 0: 
6.95106 2.53483 3.48744 5.43759 8.45425 Length 16, n 16, alignment 0/ 4: 6.44601 1.87936 2.41984 5.49488 6.92169 Length 16, n 32, alignment 4/ 0: 9.2036 3.04122 5.78685 6.66434 10.9065 Length 32, n 16, alignment 0/ 4: 6.65504 1.87934 2.41817 6.08706 6.98513 Length 16, n 32, alignment 4/ 0: 9.17461 3.04153 5.77758 6.66444 10.8015 Length 32, n 16, alignment 0/ 4: 6.44123 1.87936 2.41847 5.55207 6.86039 Length 16, n 16, alignment 5/ 5: 6.56005 2.53132 4.22362 5.43527 9.25109 Length 16, n 16, alignment 5/ 5: 6.55552 2.53088 4.22655 5.59271 9.61369 Length 16, n 16, alignment 5/ 2: 6.55553 2.54559 4.31135 5.47438 8.83103 Length 16, n 16, alignment 2/ 5: 6.88992 2.56255 2.84059 5.23185 9.51441 Length 32, n 64, alignment 3/ 2: 12.5054 3.75138 6.42457 10.4719 15.0663 Length 64, n 32, alignment 2/ 3: 9.87185 2.78283 3.17042 7.66624 11.503 Length 32, n 64, alignment 3/ 2: 12.4999 3.74537 6.38161 10.4578 15.1104 Length 64, n 32, alignment 2/ 3: 9.86495 2.77889 3.19171 7.63272 13.9799 Length 16, n 16, alignment 6/ 6: 6.41353 2.5453 4.50915 5.30382 8.45391 Length 16, n 16, alignment 6/ 6: 6.49495 2.54119 4.54493 5.55909 8.1629 Length 16, n 16, alignment 6/ 4: 6.41743 2.54487 4.57202 4.98659 7.53033 Length 16, n 16, alignment 4/ 6: 6.91724 2.54649 3.67868 5.36838 8.45677 Length 64, n 128, alignment 2/ 4: 14.0687 4.93151 8.11667 11.4411 16.9533 Length 128, n 64, alignment 4/ 2: 11.7134 3.58948 4.90121 10.3018 11.6692 Length 64, n 128, alignment 2/ 4: 14.0677 4.93413 7.28129 11.439 22.2186 Length 128, n 64, alignment 4/ 2: 11.7149 3.59312 4.85286 10.3403 19.4651 Length 16, n 16, alignment 7/ 7: 6.76501 2.52563 5.55792 5.44155 8.39997 Length 16, n 16, alignment 7/ 7: 7.16923 2.5265 5.55148 5.60184 7.98311 Length 16, n 16, alignment 7/ 6: 6.76252 2.52629 5.48067 5.51161 7.61026 Length 16, n 16, alignment 6/ 7: 6.65772 2.5521 4.55758 5.48893 7.7301 Length 128, n 256, alignment 1/ 6: 16.2494 7.62034 9.3616 16.2888 19.7029 Length 256, n 128, alignment 6/ 1: 13.4311 4.94455 8.10802 
12.2681 15.6941 Length 128, n 256, alignment 1/ 6: 16.2608 7.6209 9.35509 16.2856 38.0277 Length 256, n 128, alignment 6/ 1: 13.4327 4.89474 8.35934 12.2646 34.3268 Length 8, n 16, alignment 0/ 0: 7.20671 2.23256 3.75778 5.63555 7.36414 Length 32, n 16, alignment 0/ 0: 6.4449 1.88 2.41577 2.89598 6.42537 Length 8, n 16, alignment 7/ 2: 7.45976 2.21832 3.91671 4.6524 8.45825 Length 32, n 16, alignment 7/ 2: 6.78267 2.34296 5.59161 5.58598 6.88842 Length 16, n 32, alignment 0/ 0: 9.47971 3.10847 4.74758 4.75377 10.2238 Length 64, n 32, alignment 0/ 0: 8.45634 2.34747 2.59248 2.82356 9.42305 Length 16, n 32, alignment 6/ 4: 9.37784 3.05067 6.92384 9.47727 10.1826 Length 64, n 32, alignment 6/ 4: 9.89233 2.77968 4.63672 7.09838 10.2804 Length 32, n 64, alignment 0/ 0: 11.0813 3.71086 4.43777 5.3549 12.2048 Length 128, n 64, alignment 0/ 0: 9.25192 3.20123 3.53388 4.50794 10.1934 Length 32, n 64, alignment 5/ 6: 12.5099 3.75871 7.29613 9.64902 13.5821 Length 128, n 64, alignment 5/ 6: 11.6115 3.60165 5.71818 9.07288 12.7929 Length 64, n 128, alignment 0/ 0: 12.3671 4.80754 5.46926 6.84492 14.9238 Length 256, n 128, alignment 0/ 0: 8.08427 4.52607 6.47996 5.92086 11.701 Length 64, n 128, alignment 4/ 0: 12.5692 4.89717 7.11058 10.472 15.875 Length 256, n 128, alignment 4/ 0: 12.2945 4.94163 7.11645 12.3831 16.6219 Length 128, n 256, alignment 0/ 0: 13.8948 7.28911 7.78784 9.30215 17.0358 Length 512, n 256, alignment 0/ 0: 10.5266 6.56481 9.14202 9.31096 20.0531 Length 128, n 256, alignment 3/ 2: 16.3534 7.46332 9.90009 18.5282 19.5969 Length 512, n 256, alignment 3/ 2: 17.0519 7.09947 10.1635 23.5411 25.0043 Length 256, n 512, alignment 0/ 0: 15.8935 12.6195 14.0756 14.7553 28.5299 Length 1024, n 512, alignment 0/ 0: 16.3758 10.8028 16.5447 16.8966 37.8653 Length 256, n 512, alignment 2/ 4: 21.16 13.2779 14.3088 26.4475 30.1647 Length 1024, n 512, alignment 2/ 4: 25.3364 12.0899 17.5443 42.7216 47.5803 Length 512, n 1024, alignment 0/ 0: 20.5111 22.9782 19.6648 21.3857 
42.4801 Length 2048, n 1024, alignment 0/ 0: 28.4023 19.1577 36.9065 35.4799 68.3555 Length 512, n 1024, alignment 1/ 6: 29.9694 24.3087 22.0513 46.7436 51.5908 Length 2048, n 1024, alignment 1/ 6: 42.9897 21.5402 38.739 78.3266 84.3956
On 9/4/20 1:59 PM, Raphael M Zinsly via Libc-alpha wrote: > Benchtest output: > generic_stpncpy __stpncpy_power9 __stpncpy_power8 __stpncpy_power7 __stpncpy_ppc <snip> > Length 512, n 1024, alignment 0/ 0: 20.5111 22.9782 19.6648 21.3857 42.4801 <snip> > Length 512, n 1024, alignment 1/ 6: 29.9694 24.3087 22.0513 46.7436 51.5908 These two seem to be the only cases in which the power9 version loses to the power8 one. Have you investigated what happens in these two specific cases?
On 9/4/20 1:56 PM, Raphael Moreira Zinsly via Libc-alpha wrote: > Add stpncpy support into the POWER9 strncpy. > --- > sysdeps/powerpc/powerpc64/le/power9/stpncpy.S | 24 +++++++ > sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 65 +++++++++++++++++++ > sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +- > .../powerpc64/multiarch/ifunc-impl-list.c | 5 ++ > .../powerpc64/multiarch/stpncpy-power9.S | 24 +++++++ > sysdeps/powerpc/powerpc64/multiarch/stpncpy.c | 7 ++ > 6 files changed, 126 insertions(+), 1 deletion(-) > create mode 100644 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S > create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S > > diff --git a/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S > new file mode 100644 > index 0000000000..81d9673d8b > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S > @@ -0,0 +1,24 @@ > +/* Optimized stpncpy implementation for POWER9 LE. > + Copyright (C) 2020 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
*/ > + > +#define USE_AS_STPNCPY > +#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S> > + > +weak_alias (__stpncpy, stpncpy) > +libc_hidden_def (__stpncpy) > +libc_hidden_builtin_def (stpncpy) > diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S > index 34fcdee913..f7265b11ec 100644 > --- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S > +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S > @@ -18,16 +18,30 @@ > > #include <sysdep.h> > > +#ifdef USE_AS_STPNCPY > +# ifndef STPNCPY > +# define FUNC_NAME __stpncpy > +# else > +# define FUNC_NAME STPNCPY > +# endif > +#else > # ifndef STRNCPY > # define FUNC_NAME strncpy > # else > # define FUNC_NAME STRNCPY > # endif > +#endif /* !USE_AS_STPNCPY */ > > /* Implements the function > > char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5]) > > + or > + > + char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5]) > + > + if USE_AS_STPNCPY is defined. > + > The implementation can load bytes past a null terminator, but only > up to the next 16-byte aligned address, so it never crosses a page. */ > > @@ -49,7 +63,15 @@ ENTRY_TOCLESS (FUNC_NAME, 4) > > /* Empty/1-byte string optimization */ > cmpdi r5,0 > +#ifdef USE_AS_STPNCPY > + bgt L(cont) > + /* Compute pointer to last byte copied into dest. */ > + addi r3,r3,1 > + blr > +L(cont): > +#else > beqlr > +#endif > > addi r4,r4,1 > neg r7,r4 > @@ -79,12 +101,20 @@ ENTRY_TOCLESS (FUNC_NAME, 4) > sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > stxvl 32+v0,r11,r10 /* Partial store */ > > +#ifdef USE_AS_STPNCPY > + /* Compute pointer to last byte copied into dest. */ > + add r3,r11,r5 > +#endif > blr > > L(null): > sldi r10,r8,56 /* stxvl wants size in top 8 bits */ > stxvl 32+v0,r11,r10 /* Partial store */ > > +#ifdef USE_AS_STPNCPY > + /* Compute pointer to last byte copied into dest. 
*/ > + add r3,r11,r7 > +#endif > add r11,r11,r8 > sub r5,r5,r8 > b L(zero_padding_loop) > @@ -168,6 +198,10 @@ L(n_tail4): > sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > addi r11,r11,48 /* Offset */ > stxvl 32+v3,r11,r10 /* Partial store */ > +#ifdef USE_AS_STPNCPY > + /* Compute pointer to last byte copied into dest. */ > + add r3,r11,r5 > +#endif > blr > > L(prep_n_tail1): > @@ -179,6 +213,10 @@ L(prep_n_tail1): > L(n_tail1): > sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > stxvl 32+v0,r11,r10 /* Partial store */ > +#ifdef USE_AS_STPNCPY > + /* Compute pointer to last byte copied into dest. */ > + add r3,r11,r5 > +#endif > blr > > L(prep_n_tail2): > @@ -192,6 +230,10 @@ L(n_tail2): > sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > addi r11,r11,16 /* offset */ > stxvl 32+v1,r11,r10 /* Partial store */ > +#ifdef USE_AS_STPNCPY > + /* Compute pointer to last byte copied into dest. */ > + add r3,r11,r5 > +#endif > blr > > L(prep_n_tail3): > @@ -206,6 +248,10 @@ L(n_tail3): > sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > addi r11,r11,32 /* Offset */ > stxvl 32+v2,r11,r10 /* Partial store */ > +#ifdef USE_AS_STPNCPY > + /* Compute pointer to last byte copied into dest. */ > + add r3,r11,r5 > +#endif > blr > > L(prep_tail1): > @@ -215,6 +261,10 @@ L(tail1): > addi r9,r8,1 /* Add null terminator */ > sldi r10,r9,56 /* stxvl wants size in top 8 bits */ > stxvl 32+v0,r11,r10 /* Partial store */ > +#ifdef USE_AS_STPNCPY > + /* Compute pointer to last byte copied into dest. */ > + add r3,r11,r8 > +#endif > add r11,r11,r9 > sub r5,r5,r9 > b L(zero_padding_loop) > @@ -229,6 +279,10 @@ L(tail2): > sldi r10,r9,56 /* stxvl wants size in top 8 bits */ > addi r11,r11,16 /* offset */ > stxvl 32+v1,r11,r10 /* Partial store */ > +#ifdef USE_AS_STPNCPY > + /* Compute pointer to last byte copied into dest. 
*/ > + add r3,r11,r8 > +#endif > add r11,r11,r9 > sub r5,r5,r9 > b L(zero_padding_loop) > @@ -244,6 +298,10 @@ L(tail3): > sldi r10,r9,56 /* stxvl wants size in top 8 bits */ > addi r11,r11,32 /* offset */ > stxvl 32+v2,r11,r10 /* Partial store */ > +#ifdef USE_AS_STPNCPY > + /* Compute pointer to last byte copied into dest. */ > + add r3,r11,r8 > +#endif > add r11,r11,r9 > sub r5,r5,r9 > b L(zero_padding_loop) > @@ -259,6 +317,10 @@ L(tail4): > sldi r10,r9,56 /* stxvl wants size in top 8 bits */ > addi r11,r11,48 /* offset */ > stxvl 32+v3,r11,r10 /* Partial store */ > +#ifdef USE_AS_STPNCPY > + /* Compute pointer to last byte copied into dest. */ > + add r3,r11,r8 > +#endif > add r11,r11,r9 > sub r5,r5,r9 > > @@ -279,3 +341,6 @@ L(zero_padding_end): > blr > > END (FUNC_NAME) > +#ifndef USE_AS_STPNCPY > +libc_hidden_builtin_def (strncpy) > +#endif > diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile > index cd2b47b403..f46bf50732 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile > +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile > @@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ > > ifneq (,$(filter %le,$(config-machine))) > sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \ > - rawmemchr-power9 strlen-power9 strncpy-power9 > + rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 > endif > CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops > CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops > diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > index aa63e1c23f..56790bcfe3 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > @@ -317,6 +317,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support 
sysdeps/powerpc/powerpc64/multiarch/stpncpy.c. */ > IFUNC_IMPL (i, name, stpncpy, > +#ifdef __LITTLE_ENDIAN__ > + IFUNC_IMPL_ADD (array, i, stpncpy, > + hwcap2 & PPC_FEATURE2_ARCH_3_00, > + __stpncpy_power9) > +#endif > IFUNC_IMPL_ADD (array, i, stpncpy, > hwcap2 & PPC_FEATURE2_ARCH_2_07, > __stpncpy_power8) > diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S > new file mode 100644 > index 0000000000..ccbab55c31 > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S > @@ -0,0 +1,24 @@ > +/* Optimized stpncpy implementation for POWER9 LE. > + Copyright (C) 2020 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
*/ > + > +#define STPNCPY __stpncpy_power9 > + > +#undef libc_hidden_builtin_def > +#define libc_hidden_builtin_def(name) > + > +#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S> > diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c > index 17df886431..ac17b26650 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c > +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c > @@ -26,10 +26,17 @@ > extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden; > extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden; > extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden; > +# ifdef __LITTLE_ENDIAN__ > +extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden; > +# endif > # undef stpncpy > # undef __stpncpy > > libc_ifunc_redirected (__redirect___stpncpy, __stpncpy, > +# ifdef __LITTLE_ENDIAN__ > + (hwcap2 & PPC_FEATURE2_ARCH_3_00) > + ? __stpncpy_power9 : > +# endif > (hwcap2 & PPC_FEATURE2_ARCH_2_07) > ? __stpncpy_power8 > : (hwcap & PPC_FEATURE_HAS_VSX) > LGTM. Reviewed-by: Matheus Castanho <msc@linux.ibm.com> -- Matheus Castanho
Hi Matheus, On 16/09/2020 09:32, Matheus Castanho wrote: > On 9/4/20 1:59 PM, Raphael M Zinsly via Libc-alpha wrote: >> Benchtest output: >> generic_stpncpy __stpncpy_power9 __stpncpy_power8 __stpncpy_power7 __stpncpy_ppc > <snip> >> Length 512, n 1024, alignment 0/ 0: 20.5111 22.9782 19.6648 21.3857 42.4801 > <snip> >> Length 512, n 1024, alignment 1/ 6: 29.9694 24.3087 22.0513 46.7436 51.5908 > > These two seem to be the only cases in which the power9 version loses to > the power8 one. Have you investigated what happens in these two specific > cases? > Yes, the power8 optimization calls memset to do the zero padding at the end if n > length. In these cases, where n is much larger than the string length, memset is faster than the loop used in my implementation. Thanks for the review! Regards,
On Wed, Sep 16, 2020 at 09:56:59AM -0300, Raphael M Zinsly via Libc-alpha wrote: > On 16/09/2020 09:32, Matheus Castanho wrote: > > On 9/4/20 1:59 PM, Raphael M Zinsly via Libc-alpha wrote: > > > Benchtest output: > > > generic_stpncpy __stpncpy_power9 __stpncpy_power8 __stpncpy_power7 __stpncpy_ppc > > <snip> > > > Length 512, n 1024, alignment 0/ 0: 20.5111 22.9782 19.6648 21.3857 42.4801 > > <snip> > > > Length 512, n 1024, alignment 1/ 6: 29.9694 24.3087 22.0513 46.7436 51.5908 > > > > These two seem to be the only cases in which the power9 version loses to > > the power8 one. Have you investigated what happens in these two specific > > cases? > > > Yes the power8 optimization calls memset to do the zero padding at the end > if n > length. In this case where n is way higher, memset is faster than the > loop used in my implementation. Is there some sort of threshold that would help these cases by transitioning to memset (or replicating the relevant part of that code here)? PC
diff --git a/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S new file mode 100644 index 0000000000..81d9673d8b --- /dev/null +++ b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S @@ -0,0 +1,24 @@ +/* Optimized stpncpy implementation for POWER9 LE. + Copyright (C) 2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define USE_AS_STPNCPY +#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S> + +weak_alias (__stpncpy, stpncpy) +libc_hidden_def (__stpncpy) +libc_hidden_builtin_def (stpncpy) diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S index 34fcdee913..f7265b11ec 100644 --- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S @@ -18,16 +18,30 @@ #include <sysdep.h> +#ifdef USE_AS_STPNCPY +# ifndef STPNCPY +# define FUNC_NAME __stpncpy +# else +# define FUNC_NAME STPNCPY +# endif +#else # ifndef STRNCPY # define FUNC_NAME strncpy # else # define FUNC_NAME STRNCPY # endif +#endif /* !USE_AS_STPNCPY */ /* Implements the function char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5]) + or + + char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5]) + + if USE_AS_STPNCPY is defined. 
+ The implementation can load bytes past a null terminator, but only up to the next 16-byte aligned address, so it never crosses a page. */ @@ -49,7 +63,15 @@ ENTRY_TOCLESS (FUNC_NAME, 4) /* Empty/1-byte string optimization */ cmpdi r5,0 +#ifdef USE_AS_STPNCPY + bgt L(cont) + /* Compute pointer to last byte copied into dest. */ + addi r3,r3,1 + blr +L(cont): +#else beqlr +#endif addi r4,r4,1 neg r7,r4 @@ -79,12 +101,20 @@ ENTRY_TOCLESS (FUNC_NAME, 4) sldi r10,r5,56 /* stxvl wants size in top 8 bits */ stxvl 32+v0,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* Compute pointer to last byte copied into dest. */ + add r3,r11,r5 +#endif blr L(null): sldi r10,r8,56 /* stxvl wants size in top 8 bits */ stxvl 32+v0,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* Compute pointer to last byte copied into dest. */ + add r3,r11,r7 +#endif add r11,r11,r8 sub r5,r5,r8 b L(zero_padding_loop) @@ -168,6 +198,10 @@ L(n_tail4): sldi r10,r5,56 /* stxvl wants size in top 8 bits */ addi r11,r11,48 /* Offset */ stxvl 32+v3,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* Compute pointer to last byte copied into dest. */ + add r3,r11,r5 +#endif blr L(prep_n_tail1): @@ -179,6 +213,10 @@ L(prep_n_tail1): L(n_tail1): sldi r10,r5,56 /* stxvl wants size in top 8 bits */ stxvl 32+v0,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* Compute pointer to last byte copied into dest. */ + add r3,r11,r5 +#endif blr L(prep_n_tail2): @@ -192,6 +230,10 @@ L(n_tail2): sldi r10,r5,56 /* stxvl wants size in top 8 bits */ addi r11,r11,16 /* offset */ stxvl 32+v1,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* Compute pointer to last byte copied into dest. */ + add r3,r11,r5 +#endif blr L(prep_n_tail3): @@ -206,6 +248,10 @@ L(n_tail3): sldi r10,r5,56 /* stxvl wants size in top 8 bits */ addi r11,r11,32 /* Offset */ stxvl 32+v2,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* Compute pointer to last byte copied into dest. 
*/ + add r3,r11,r5 +#endif blr L(prep_tail1): @@ -215,6 +261,10 @@ L(tail1): addi r9,r8,1 /* Add null terminator */ sldi r10,r9,56 /* stxvl wants size in top 8 bits */ stxvl 32+v0,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* Compute pointer to last byte copied into dest. */ + add r3,r11,r8 +#endif add r11,r11,r9 sub r5,r5,r9 b L(zero_padding_loop) @@ -229,6 +279,10 @@ L(tail2): sldi r10,r9,56 /* stxvl wants size in top 8 bits */ addi r11,r11,16 /* offset */ stxvl 32+v1,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* Compute pointer to last byte copied into dest. */ + add r3,r11,r8 +#endif add r11,r11,r9 sub r5,r5,r9 b L(zero_padding_loop) @@ -244,6 +298,10 @@ L(tail3): sldi r10,r9,56 /* stxvl wants size in top 8 bits */ addi r11,r11,32 /* offset */ stxvl 32+v2,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* Compute pointer to last byte copied into dest. */ + add r3,r11,r8 +#endif add r11,r11,r9 sub r5,r5,r9 b L(zero_padding_loop) @@ -259,6 +317,10 @@ L(tail4): sldi r10,r9,56 /* stxvl wants size in top 8 bits */ addi r11,r11,48 /* offset */ stxvl 32+v3,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* Compute pointer to last byte copied into dest. 
*/ + add r3,r11,r8 +#endif add r11,r11,r9 sub r5,r5,r9 @@ -279,3 +341,6 @@ L(zero_padding_end): blr END (FUNC_NAME) +#ifndef USE_AS_STPNCPY +libc_hidden_builtin_def (strncpy) +#endif diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index cd2b47b403..f46bf50732 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ ifneq (,$(filter %le,$(config-machine))) sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \ - rawmemchr-power9 strlen-power9 strncpy-power9 + rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 endif CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index aa63e1c23f..56790bcfe3 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -317,6 +317,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c. */ IFUNC_IMPL (i, name, stpncpy, +#ifdef __LITTLE_ENDIAN__ + IFUNC_IMPL_ADD (array, i, stpncpy, + hwcap2 & PPC_FEATURE2_ARCH_3_00, + __stpncpy_power9) +#endif IFUNC_IMPL_ADD (array, i, stpncpy, hwcap2 & PPC_FEATURE2_ARCH_2_07, __stpncpy_power8) diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S new file mode 100644 index 0000000000..ccbab55c31 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S @@ -0,0 +1,24 @@ +/* Optimized stpncpy implementation for POWER9 LE. + Copyright (C) 2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define STPNCPY __stpncpy_power9 + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) + +#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S> diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c index 17df886431..ac17b26650 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c @@ -26,10 +26,17 @@ extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden; extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden; extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden; +# ifdef __LITTLE_ENDIAN__ +extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden; +# endif # undef stpncpy # undef __stpncpy libc_ifunc_redirected (__redirect___stpncpy, __stpncpy, +# ifdef __LITTLE_ENDIAN__ + (hwcap2 & PPC_FEATURE2_ARCH_3_00) + ? __stpncpy_power9 : +# endif (hwcap2 & PPC_FEATURE2_ARCH_2_07) ? __stpncpy_power8 : (hwcap & PPC_FEATURE_HAS_VSX)