Message ID | 20200820182917.12602-2-rzinsly@linux.ibm.com |
---|---|
State | New |
Headers | show |
Series | [1/2] powerpc: Optimized strncpy for POWER9 | expand |
Here is the make bench output: generic_stpncpy __stpncpy_power9 __stpncpy_power8 __stpncpy_power7 __stpncpy_ppc Length 16, n 16, alignment 1/ 1: 7.31792 2.79249 2.98207 6.20964 11.2262 Length 16, n 16, alignment 1/ 1: 7.26441 2.79883 2.97986 6.09795 11.1118 Length 16, n 16, alignment 1/ 2: 7.22475 2.82518 2.98169 6.18967 10.9933 Length 16, n 16, alignment 2/ 1: 7.28211 2.78851 3.1079 6.06067 10.4232 Length 2, n 4, alignment 7/ 2: 9.30193 2.4733 4.30086 4.74387 9.25328 Length 4, n 2, alignment 2/ 7: 6.7756 1.91031 2.93946 3.24475 7.76389 Length 2, n 4, alignment 7/ 2: 8.81319 2.4726 4.57341 4.74421 9.44667 Length 4, n 2, alignment 2/ 7: 6.77806 1.9118 2.93637 3.1857 7.00171 Length 16, n 16, alignment 2/ 2: 7.35335 2.80104 3.10653 5.85492 10.5689 Length 16, n 16, alignment 2/ 2: 7.14308 2.78571 3.10889 6.10044 10.4816 Length 16, n 16, alignment 2/ 4: 7.21628 2.81563 3.10724 6.14674 10.6005 Length 16, n 16, alignment 4/ 2: 7.47713 2.80531 3.80081 5.86977 9.43599 Length 4, n 8, alignment 6/ 4: 8.63537 2.4676 5.53825 4.1877 9.88309 Length 8, n 4, alignment 4/ 6: 6.63429 1.91051 3.10751 2.76472 8.4156 Length 4, n 8, alignment 6/ 4: 8.59304 2.43152 5.30288 4.16475 9.77498 Length 8, n 4, alignment 4/ 6: 6.63843 1.91047 3.19713 2.69566 8.67023 Length 16, n 16, alignment 3/ 3: 7.45277 2.80045 3.42433 6.06204 9.92282 Length 16, n 16, alignment 3/ 3: 8.04191 2.78645 3.43317 5.99773 10.0662 Length 16, n 16, alignment 3/ 6: 7.5816 2.81606 3.44168 6.0801 9.94673 Length 16, n 16, alignment 6/ 3: 7.10582 2.80176 5.03947 6.06942 8.40249 Length 8, n 16, alignment 5/ 6: 8.19747 2.42028 4.30043 5.0752 11.3093 Length 16, n 8, alignment 6/ 5: 6.37287 2.07239 2.56322 4.36972 6.52164 Length 8, n 16, alignment 5/ 6: 8.25022 2.45124 4.05051 5.02258 10.8683 Length 16, n 8, alignment 6/ 5: 6.31868 2.07215 2.83061 4.44584 7.14464 Length 16, n 16, alignment 4/ 4: 7.54408 2.80105 3.82846 5.71392 9.91359 Length 16, n 16, alignment 4/ 4: 7.66265 2.79063 3.86233 6.06489 9.31705 Length 16, n 16, 
alignment 4/ 0: 7.84286 2.79896 3.83148 6.08954 9.55253 Length 16, n 16, alignment 0/ 4: 7.36697 2.07019 2.66533 6.13894 7.75685 Length 16, n 32, alignment 4/ 0: 10.3819 3.33088 6.32994 7.24949 12.3827 Length 32, n 16, alignment 0/ 4: 7.15586 2.07172 2.66097 6.11743 7.56448 Length 16, n 32, alignment 4/ 0: 10.3262 3.35225 6.34556 7.3211 12.2527 Length 32, n 16, alignment 0/ 4: 7.13287 2.07265 2.6613 6.17878 7.61901 Length 16, n 16, alignment 5/ 5: 7.22471 2.80128 4.65776 6.15455 9.93333 Length 16, n 16, alignment 5/ 5: 7.22458 2.78586 4.65874 6.06763 9.87968 Length 16, n 16, alignment 5/ 2: 7.22718 2.79127 4.65999 6.025 10.3775 Length 16, n 16, alignment 2/ 5: 7.73485 2.8025 3.10754 6.08303 10.3871 Length 32, n 64, alignment 3/ 2: 13.7685 4.1256 7.04965 11.5105 15.3903 Length 64, n 32, alignment 2/ 3: 10.526 3.05149 3.59497 8.45078 13.7462 Length 32, n 64, alignment 3/ 2: 13.7681 4.11611 7.08236 11.5129 16.6004 Length 64, n 32, alignment 2/ 3: 10.962 3.05712 3.60447 8.43981 15.4906 Length 16, n 16, alignment 6/ 6: 7.30916 2.80056 5.03985 6.16331 8.43692 Length 16, n 16, alignment 6/ 6: 7.31688 2.7914 5.02931 6.12345 8.42848 Length 16, n 16, alignment 6/ 4: 7.7402 2.7993 5.04435 6.02685 8.28199 Length 16, n 16, alignment 4/ 6: 7.79103 2.82496 3.82464 6.0778 9.31532 Length 64, n 128, alignment 2/ 4: 15.4969 5.3714 8.09812 12.6067 18.7831 Length 128, n 64, alignment 4/ 2: 12.9023 3.93138 5.46487 10.7071 13.3253 Length 64, n 128, alignment 2/ 4: 15.4998 5.42611 7.88843 12.6007 24.0491 Length 128, n 64, alignment 4/ 2: 12.8971 3.94646 5.49689 11.1747 21.5779 Length 16, n 16, alignment 7/ 7: 7.68992 2.78151 6.14775 6.19397 8.38412 Length 16, n 16, alignment 7/ 7: 7.90811 2.7803 6.11502 6.17383 8.78371 Length 16, n 16, alignment 7/ 6: 7.45456 2.80173 5.93657 6.15191 8.38489 Length 16, n 16, alignment 6/ 7: 7.44846 2.80238 5.03654 6.1154 8.41589 Length 128, n 256, alignment 1/ 6: 17.9114 8.39532 10.3246 17.9457 21.9452 Length 256, n 128, alignment 6/ 1: 14.8346 5.41104 
8.89047 13.5379 17.1437 Length 128, n 256, alignment 1/ 6: 17.9118 8.39985 10.3271 17.9503 42.0831 Length 256, n 128, alignment 6/ 1: 14.8306 5.40714 9.04492 13.5227 37.819 Length 8, n 16, alignment 0/ 0: 8.19945 2.46752 4.04264 4.62897 8.22975 Length 32, n 16, alignment 0/ 0: 7.23617 2.07229 2.66504 2.66683 7.93411 Length 8, n 16, alignment 7/ 2: 8.26373 2.41779 4.18003 5.31418 9.0473 Length 32, n 16, alignment 7/ 2: 7.46119 2.63992 6.16424 6.14534 7.28237 Length 16, n 32, alignment 0/ 0: 10.1282 3.42401 5.00287 5.02318 11.4985 Length 64, n 32, alignment 0/ 0: 9.29452 2.57779 2.79807 3.1362 10.9532 Length 16, n 32, alignment 6/ 4: 10.2194 3.30297 7.48371 10.4067 11.2264 Length 64, n 32, alignment 6/ 4: 10.6887 3.04976 5.13062 8.10511 11.1225 Length 32, n 64, alignment 0/ 0: 12.1806 4.09924 5.12341 6.14159 14.0965 Length 128, n 64, alignment 0/ 0: 10.1569 3.52625 3.88528 4.65782 11.3018 Length 32, n 64, alignment 5/ 6: 13.7795 4.13456 8.53476 10.2846 15.1556 Length 128, n 64, alignment 5/ 6: 12.8171 3.92765 5.82505 10.3559 15.0831 Length 64, n 128, alignment 0/ 0: 13.6328 5.33523 6.43324 7.92213 16.4658 Length 256, n 128, alignment 0/ 0: 8.92495 4.97169 7.13044 6.30158 12.9039 Length 64, n 128, alignment 4/ 0: 13.8393 5.36588 7.52682 11.5294 17.5523 Length 256, n 128, alignment 4/ 0: 13.5309 5.36019 7.56527 13.3503 17.8202 Length 128, n 256, alignment 0/ 0: 15.2956 8.14449 8.79678 9.69352 21.2463 Length 512, n 256, alignment 0/ 0: 11.5667 7.22974 10.1355 10.2592 21.5805 Length 128, n 256, alignment 3/ 2: 18.0152 8.21506 10.9175 20.4131 22.3927 Length 512, n 256, alignment 3/ 2: 18.7328 7.81909 11.251 25.0633 29.2378 Length 256, n 512, alignment 0/ 0: 17.5135 13.9768 15.6849 16.1219 30.9344 Length 1024, n 512, alignment 0/ 0: 17.988 11.8498 18.4388 18.7385 41.5762 Length 256, n 512, alignment 2/ 4: 23.3724 14.8026 15.9182 28.6762 33.9031 Length 1024, n 512, alignment 2/ 4: 27.9562 13.2785 19.5893 46.9671 52.4943 Length 512, n 1024, alignment 0/ 0: 23.3637 25.283 
21.2536 23.4228 55.6501 Length 2048, n 1024, alignment 0/ 0: 31.303 21.2731 40.7001 38.8365 75.1105 Length 512, n 1024, alignment 1/ 6: 33.0535 26.873 24.8167 51.5917 56.236 Length 2048, n 1024, alignment 1/ 6: 47.5444 24.0206 42.5163 86.0245 92.5819
Thank you for your contributions; I have a few minor comments/suggestions below. On 8/20/20 1:29 PM, Raphael Moreira Zinsly via Libc-alpha wrote: > Adds stpncpy support into the POWER9 strncpy. s/Adds/Add/ s/into the/to/. Likewise, s/Optimized/Add optimized/ in the title. > --- > sysdeps/powerpc/powerpc64/le/power9/stpncpy.S | 24 ++++++ > sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 74 +++++++++++++++++++ > sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +- > .../powerpc64/multiarch/ifunc-impl-list.c | 5 ++ > .../powerpc64/multiarch/stpncpy-power9.S | 24 ++++++ > sysdeps/powerpc/powerpc64/multiarch/stpncpy.c | 7 ++ > 6 files changed, 135 insertions(+), 1 deletion(-) > create mode 100644 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S > create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S > > diff --git a/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S > new file mode 100644 > index 0000000000..a96840bb6f > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S > @@ -0,0 +1,24 @@ > +/* Optimized stpncpy implementation for PowerPC64/POWER9. > + Copyright (C) 2015-2020 Free Software Foundation, Inc. Should this date be exclusively 2020? > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
*/ > + > +#define USE_AS_STPNCPY > +#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S> > + > +weak_alias (__stpncpy, stpncpy) > +libc_hidden_def (__stpncpy) > +libc_hidden_builtin_def (stpncpy) OK. > diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S > index cde68384d4..64b06a9040 100644 > --- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S > +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S > @@ -18,16 +18,30 @@ > > #include <sysdep.h> > > +#ifdef USE_AS_STPNCPY > +# ifndef STPNCPY > +# define FUNC_NAME __stpncpy > +# else > +# define FUNC_NAME STPNCPY > +# endif > +#else > # ifndef STRNCPY > # define FUNC_NAME strncpy > # else > # define FUNC_NAME STRNCPY > # endif > +#endif /* !USE_AS_STPNCPY */ > > /* Implements the function > > char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5]) > > + or > + > + char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5]) > + > + if USE_AS_STPNCPY is defined. > + > The implementation can load bytes past a null terminator, but only > up to the next 16B boundary, so it never crosses a page. */ > > @@ -47,6 +61,13 @@ ENTRY_TOCLESS (FUNC_NAME, 4) > beq L(zero_padding_loop) > > cmpwi r5,0 > +#ifdef USE_AS_STPNCPY > + bgt L(cont) > + /* stpncpy returns the dest address plus the size not counting the > + final '\0'. */ "Compute pointer to last byte copied into dest." Likewise for the other copied instances. > + addi r3,r3,1 > + blr > +#endif OK. > beqlr This is unreachable in stpncpy; can this be conditionally included in the !stpncpy configuration? > > L(cont): > @@ -77,12 +98,22 @@ L(cont): > sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > stxvl 32+v0,r11,r10 /* Partial store */ > > +#ifdef USE_AS_STPNCPY > + /* stpncpy returns the dest address plus the size not counting the > + final '\0'. 
*/ > + add r3,r11,r5 > +#endif > blr > > L(null): > sldi r10,r8,56 /* stxvl wants size in top 8 bits */ > stxvl 32+v0,r11,r10 /* Partial store */ > > +#ifdef USE_AS_STPNCPY > + /* stpncpy returns the dest address plus the size not counting the > + final '\0'. */ > + add r3,r11,r7 > +#endif > add r11,r11,r8 > sub r5,r5,r8 > b L(zero_padding_loop) > @@ -164,6 +195,11 @@ L(n_tail4): > sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > addi r11,r11,48 /* Offset */ > stxvl 32+v3,r11,r10 /* Partial store */ > +#ifdef USE_AS_STPNCPY > + /* stpncpy returns the dest address plus the size not counting the > + final '\0'. */ > + add r3,r11,r5 > +#endif > blr > > L(prep_n_tail1): > @@ -174,6 +210,11 @@ L(prep_n_tail1): > L(n_tail1): > sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > stxvl 32+v0,r11,r10 /* Partial store */ > +#ifdef USE_AS_STPNCPY > + /* stpncpy returns the dest address plus the size not counting the > + final '\0'. */ > + add r3,r11,r5 > +#endif > blr > > L(prep_n_tail2): > @@ -186,6 +227,11 @@ L(n_tail2): > sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > addi r11,r11,16 /* offset */ > stxvl 32+v1,r11,r10 /* Partial store */ > +#ifdef USE_AS_STPNCPY > + /* stpncpy returns the dest address plus the size not counting the > + final '\0'. */ > + add r3,r11,r5 > +#endif > blr > > L(prep_n_tail3): > @@ -199,6 +245,11 @@ L(n_tail3): > sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > addi r11,r11,32 /* Offset */ > stxvl 32+v2,r11,r10 /* Partial store */ > +#ifdef USE_AS_STPNCPY > + /* stpncpy returns the dest address plus the size not counting the > + final '\0'. */ > + add r3,r11,r5 > +#endif > blr > > L(prep_tail1): > @@ -208,6 +259,11 @@ L(tail1): > addi r9,r8,1 /* Add null terminator */ > sldi r10,r9,56 /* stxvl wants size in top 8 bits */ > stxvl 32+v0,r11,r10 /* Partial store */ > +#ifdef USE_AS_STPNCPY > + /* stpncpy returns the dest address plus the size not counting the > + final '\0'. 
*/ > + add r3,r11,r8 > +#endif > add r11,r11,r9 > sub r5,r5,r9 > b L(zero_padding_loop) > @@ -222,6 +278,11 @@ L(tail2): > sldi r10,r9,56 /* stxvl wants size in top 8 bits */ > addi r11,r11,16 /* offset */ > stxvl 32+v1,r11,r10 /* Partial store */ > +#ifdef USE_AS_STPNCPY > + /* stpncpy returns the dest address plus the size not counting the > + final '\0'. */ > + add r3,r11,r8 > +#endif > add r11,r11,r9 > sub r5,r5,r9 > b L(zero_padding_loop) > @@ -237,6 +298,11 @@ L(tail3): > sldi r10,r9,56 /* stxvl wants size in top 8 bits */ > addi r11,r11,32 /* offset */ > stxvl 32+v2,r11,r10 /* Partial store */ > +#ifdef USE_AS_STPNCPY > + /* stpncpy returns the dest address plus the size not counting the > + final '\0'. */ > + add r3,r11,r8 > +#endif > add r11,r11,r9 > sub r5,r5,r9 > b L(zero_padding_loop) > @@ -252,6 +318,11 @@ L(tail4): > sldi r10,r9,56 /* stxvl wants size in top 8 bits */ > addi r11,r11,48 /* offset */ > stxvl 32+v3,r11,r10 /* Partial store */ > +#ifdef USE_AS_STPNCPY > + /* stpncpy returns the dest address plus the size not counting the > + final '\0'. */ > + add r3,r11,r8 > +#endif > add r11,r11,r9 > sub r5,r5,r9 > > @@ -274,3 +345,6 @@ L(zero_padding_end): > L(n_tail): > > END (FUNC_NAME) > +#ifndef USE_AS_STPNCPY > +libc_hidden_builtin_def (strncpy) > +#endif OK. > diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile > index cd2b47b403..f46bf50732 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile > +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile OK. > diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > index aa63e1c23f..56790bcfe3 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c OK. 
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S > new file mode 100644 > index 0000000000..ecbbb5c8e9 > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S > @@ -0,0 +1,24 @@ > +/* Optimized stpncpy implementation for POWER9/PPC64. > + Copyright (C) 2015-2020 Free Software Foundation, Inc. Minor nit, I suspect that date should only include 2020. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#define STPNCPY __stpncpy_power9 > + > +#undef libc_hidden_builtin_def > +#define libc_hidden_builtin_def(name) > + OK. 
> +#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S> > diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c > index 17df886431..21702716a3 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c > +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c > @@ -26,10 +26,17 @@ > extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden; > extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden; > extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden; > +# ifdef __LITTLE_ENDIAN__ > +extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden; > +# endif > # undef stpncpy > # undef __stpncpy > > libc_ifunc_redirected (__redirect___stpncpy, __stpncpy, > +# ifdef __LITTLE_ENDIAN__ > + (hwcap2 & PPC_FEATURE2_ARCH_3_00) > + ? __stpncpy_power9 : > +# endif > (hwcap2 & PPC_FEATURE2_ARCH_2_07) > ? __stpncpy_power8 > : (hwcap & PPC_FEATURE_HAS_VSX) > I think the spacing is off by two here.
diff --git a/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S new file mode 100644 index 0000000000..a96840bb6f --- /dev/null +++ b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S @@ -0,0 +1,24 @@ +/* Optimized stpncpy implementation for PowerPC64/POWER9. + Copyright (C) 2015-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#define USE_AS_STPNCPY +#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S> + +weak_alias (__stpncpy, stpncpy) +libc_hidden_def (__stpncpy) +libc_hidden_builtin_def (stpncpy) diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S index cde68384d4..64b06a9040 100644 --- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S @@ -18,16 +18,30 @@ #include <sysdep.h> +#ifdef USE_AS_STPNCPY +# ifndef STPNCPY +# define FUNC_NAME __stpncpy +# else +# define FUNC_NAME STPNCPY +# endif +#else # ifndef STRNCPY # define FUNC_NAME strncpy # else # define FUNC_NAME STRNCPY # endif +#endif /* !USE_AS_STPNCPY */ /* Implements the function char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5]) + or + + char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5]) + + if USE_AS_STPNCPY is defined. + The implementation can load bytes past a null terminator, but only up to the next 16B boundary, so it never crosses a page. */ @@ -47,6 +61,13 @@ ENTRY_TOCLESS (FUNC_NAME, 4) beq L(zero_padding_loop) cmpwi r5,0 +#ifdef USE_AS_STPNCPY + bgt L(cont) + /* stpncpy returns the dest address plus the size not counting the + final '\0'. */ + addi r3,r3,1 + blr +#endif beqlr L(cont): @@ -77,12 +98,22 @@ L(cont): sldi r10,r5,56 /* stxvl wants size in top 8 bits */ stxvl 32+v0,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* stpncpy returns the dest address plus the size not counting the + final '\0'. */ + add r3,r11,r5 +#endif blr L(null): sldi r10,r8,56 /* stxvl wants size in top 8 bits */ stxvl 32+v0,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* stpncpy returns the dest address plus the size not counting the + final '\0'. 
*/ + add r3,r11,r7 +#endif add r11,r11,r8 sub r5,r5,r8 b L(zero_padding_loop) @@ -164,6 +195,11 @@ L(n_tail4): sldi r10,r5,56 /* stxvl wants size in top 8 bits */ addi r11,r11,48 /* Offset */ stxvl 32+v3,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* stpncpy returns the dest address plus the size not counting the + final '\0'. */ + add r3,r11,r5 +#endif blr L(prep_n_tail1): @@ -174,6 +210,11 @@ L(prep_n_tail1): L(n_tail1): sldi r10,r5,56 /* stxvl wants size in top 8 bits */ stxvl 32+v0,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* stpncpy returns the dest address plus the size not counting the + final '\0'. */ + add r3,r11,r5 +#endif blr L(prep_n_tail2): @@ -186,6 +227,11 @@ L(n_tail2): sldi r10,r5,56 /* stxvl wants size in top 8 bits */ addi r11,r11,16 /* offset */ stxvl 32+v1,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* stpncpy returns the dest address plus the size not counting the + final '\0'. */ + add r3,r11,r5 +#endif blr L(prep_n_tail3): @@ -199,6 +245,11 @@ L(n_tail3): sldi r10,r5,56 /* stxvl wants size in top 8 bits */ addi r11,r11,32 /* Offset */ stxvl 32+v2,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* stpncpy returns the dest address plus the size not counting the + final '\0'. */ + add r3,r11,r5 +#endif blr L(prep_tail1): @@ -208,6 +259,11 @@ L(tail1): addi r9,r8,1 /* Add null terminator */ sldi r10,r9,56 /* stxvl wants size in top 8 bits */ stxvl 32+v0,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* stpncpy returns the dest address plus the size not counting the + final '\0'. */ + add r3,r11,r8 +#endif add r11,r11,r9 sub r5,r5,r9 b L(zero_padding_loop) @@ -222,6 +278,11 @@ L(tail2): sldi r10,r9,56 /* stxvl wants size in top 8 bits */ addi r11,r11,16 /* offset */ stxvl 32+v1,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* stpncpy returns the dest address plus the size not counting the + final '\0'. 
*/ + add r3,r11,r8 +#endif add r11,r11,r9 sub r5,r5,r9 b L(zero_padding_loop) @@ -237,6 +298,11 @@ L(tail3): sldi r10,r9,56 /* stxvl wants size in top 8 bits */ addi r11,r11,32 /* offset */ stxvl 32+v2,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* stpncpy returns the dest address plus the size not counting the + final '\0'. */ + add r3,r11,r8 +#endif add r11,r11,r9 sub r5,r5,r9 b L(zero_padding_loop) @@ -252,6 +318,11 @@ L(tail4): sldi r10,r9,56 /* stxvl wants size in top 8 bits */ addi r11,r11,48 /* offset */ stxvl 32+v3,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* stpncpy returns the dest address plus the size not counting the + final '\0'. */ + add r3,r11,r8 +#endif add r11,r11,r9 sub r5,r5,r9 @@ -274,3 +345,6 @@ L(zero_padding_end): L(n_tail): END (FUNC_NAME) +#ifndef USE_AS_STPNCPY +libc_hidden_builtin_def (strncpy) +#endif diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index cd2b47b403..f46bf50732 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ ifneq (,$(filter %le,$(config-machine))) sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \ - rawmemchr-power9 strlen-power9 strncpy-power9 + rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 endif CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index aa63e1c23f..56790bcfe3 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -317,6 +317,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c. 
*/ IFUNC_IMPL (i, name, stpncpy, +#ifdef __LITTLE_ENDIAN__ + IFUNC_IMPL_ADD (array, i, stpncpy, + hwcap2 & PPC_FEATURE2_ARCH_3_00, + __stpncpy_power9) +#endif IFUNC_IMPL_ADD (array, i, stpncpy, hwcap2 & PPC_FEATURE2_ARCH_2_07, __stpncpy_power8) diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S new file mode 100644 index 0000000000..ecbbb5c8e9 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S @@ -0,0 +1,24 @@ +/* Optimized stpncpy implementation for POWER9/PPC64. + Copyright (C) 2015-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#define STPNCPY __stpncpy_power9 + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) + +#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S> diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c index 17df886431..21702716a3 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c @@ -26,10 +26,17 @@ extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden; extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden; extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden; +# ifdef __LITTLE_ENDIAN__ +extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden; +# endif # undef stpncpy # undef __stpncpy libc_ifunc_redirected (__redirect___stpncpy, __stpncpy, +# ifdef __LITTLE_ENDIAN__ + (hwcap2 & PPC_FEATURE2_ARCH_3_00) + ? __stpncpy_power9 : +# endif (hwcap2 & PPC_FEATURE2_ARCH_2_07) ? __stpncpy_power8 : (hwcap & PPC_FEATURE_HAS_VSX)