Message ID | 20200904165653.16202-1-rzinsly@linux.ibm.com |
---|---|
State | New |
Headers | show |
Series | [v2,1/2] powerpc: Add optimized strncpy for POWER9 | expand |
Benchtest output: generic_strncpy __strncpy_power9 __strncpy_power8 __strncpy_power7 __strncpy_ppc Length 16, n 16, alignment 1/ 1: 6.44861 2.51617 2.54878 5.94753 9.41467 Length 16, n 16, alignment 1/ 1: 6.4448 2.51688 2.56978 5.86275 9.52956 Length 16, n 16, alignment 1/ 2: 6.51392 2.53026 2.55617 5.96487 9.51182 Length 16, n 16, alignment 2/ 1: 6.5421 2.5026 2.82458 5.95353 9.36524 Length 2, n 4, alignment 7/ 2: 8.02857 2.19272 4.35397 4.97347 8.60923 Length 4, n 2, alignment 2/ 7: 6.04262 1.66226 2.31865 3.27123 6.23803 Length 2, n 4, alignment 7/ 2: 8.15691 2.21924 4.48871 4.97328 8.3591 Length 4, n 2, alignment 2/ 7: 6.0428 1.66435 2.31671 3.2874 6.23902 Length 16, n 16, alignment 2/ 2: 6.75511 2.51667 2.82529 5.65252 9.32002 Length 16, n 16, alignment 2/ 2: 6.53469 2.51982 2.82678 5.93257 9.25613 Length 16, n 16, alignment 2/ 4: 6.3502 2.53333 2.82267 5.66948 9.35942 Length 16, n 16, alignment 4/ 2: 6.71533 2.51217 3.47278 5.95821 8.3249 Length 4, n 8, alignment 6/ 4: 7.85332 2.21708 5.68665 4.83111 9.07271 Length 8, n 4, alignment 4/ 6: 5.93863 1.67938 2.67249 3.07391 7.90751 Length 4, n 8, alignment 6/ 4: 8.24352 2.16644 5.22268 5.04674 9.10352 Length 8, n 4, alignment 4/ 6: 5.88514 1.67966 2.67286 3.29382 7.66757 Length 16, n 16, alignment 3/ 3: 6.55525 2.52511 3.06709 5.95625 9.23173 Length 16, n 16, alignment 3/ 3: 6.66344 2.50855 3.11771 5.96121 8.99767 Length 16, n 16, alignment 3/ 6: 6.82163 2.53355 3.0638 5.96451 9.09031 Length 16, n 16, alignment 6/ 3: 6.35636 2.51634 4.17868 5.95112 7.82576 Length 8, n 16, alignment 5/ 6: 7.46873 2.23953 4.33782 5.76124 10.2851 Length 16, n 8, alignment 6/ 5: 5.63643 1.88233 2.32899 4.72233 5.79268 Length 8, n 16, alignment 5/ 6: 7.47291 2.65201 3.9103 5.40334 10.3902 Length 16, n 8, alignment 6/ 5: 5.73738 1.8787 2.32749 4.69061 6.03053 Length 16, n 16, alignment 4/ 4: 6.63998 2.5166 3.5133 5.83764 8.17814 Length 16, n 16, alignment 4/ 4: 6.6866 2.51915 3.5831 5.96121 8.32436 Length 16, n 16, alignment 4/ 0: 
6.58543 2.51529 3.38441 5.96909 8.03797 Length 16, n 16, alignment 0/ 4: 6.6541 1.87852 2.45328 5.96068 7.32961 Length 16, n 32, alignment 4/ 0: 9.37236 3.00744 5.92214 7.25884 11.1515 Length 32, n 16, alignment 0/ 4: 6.2795 1.87939 2.45688 5.96206 7.03327 Length 16, n 32, alignment 4/ 0: 9.24513 3.00344 5.97977 6.94778 11.0213 Length 32, n 16, alignment 0/ 4: 6.45422 1.87851 2.45698 5.96172 7.32939 Length 16, n 16, alignment 5/ 5: 6.53949 2.51619 3.88095 5.96091 9.05987 Length 16, n 16, alignment 5/ 5: 6.47371 2.51703 3.91695 5.96417 9.24674 Length 16, n 16, alignment 5/ 2: 6.5493 2.5163 3.78779 5.95898 9.44104 Length 16, n 16, alignment 2/ 5: 6.70967 2.52226 2.82034 5.96365 9.37646 Length 32, n 64, alignment 3/ 2: 14.0298 3.74521 6.80923 11.2825 12.8659 Length 64, n 32, alignment 2/ 3: 9.53123 2.75624 3.21242 8.51653 12.6887 Length 32, n 64, alignment 3/ 2: 14.179 3.83256 6.56898 11.3584 15.2479 Length 64, n 32, alignment 2/ 3: 9.53184 2.75305 3.21245 8.37087 14.1081 Length 16, n 16, alignment 6/ 6: 6.42159 2.51726 4.38574 5.9562 7.12266 Length 16, n 16, alignment 6/ 6: 6.67028 2.51692 4.2448 5.9544 7.81439 Length 16, n 16, alignment 6/ 4: 6.42402 2.51636 4.23817 5.96162 7.23351 Length 16, n 16, alignment 4/ 6: 6.60107 2.53036 3.54038 5.95837 8.32176 Length 64, n 128, alignment 2/ 4: 15.5573 4.80414 7.45917 11.5659 16.9298 Length 128, n 64, alignment 4/ 2: 11.6195 3.53279 4.80585 10.1583 11.6096 Length 64, n 128, alignment 2/ 4: 15.5233 4.7997 7.34679 11.6628 22.0123 Length 128, n 64, alignment 4/ 2: 11.6078 3.5492 4.77929 10.027 19.504 Length 16, n 16, alignment 7/ 7: 6.54515 2.5141 5.04928 5.95083 7.57587 Length 16, n 16, alignment 7/ 7: 7.00425 2.51299 5.06765 5.92888 8.25286 Length 16, n 16, alignment 7/ 6: 6.62954 2.51922 5.07189 6.02372 7.72968 Length 16, n 16, alignment 6/ 7: 6.34475 2.51841 4.36954 5.95968 7.78498 Length 128, n 256, alignment 1/ 6: 17.9386 7.60767 9.40348 16.5301 20.6134 Length 256, n 128, alignment 6/ 1: 13.373 4.84375 7.34616 12.3919 
15.1296 Length 128, n 256, alignment 1/ 6: 17.9186 7.6077 9.37853 16.686 39.2821 Length 256, n 128, alignment 6/ 1: 13.3632 4.91799 8.06183 12.4174 34.1655 Length 8, n 16, alignment 0/ 0: 7.36981 2.22579 4.22739 4.9063 7.24636 Length 32, n 16, alignment 0/ 0: 6.43465 1.87932 2.45308 2.41526 7.1679 Length 8, n 16, alignment 7/ 2: 7.48861 2.21639 3.75708 5.35882 8.45777 Length 32, n 16, alignment 7/ 2: 7.03412 2.3535 5.04692 5.95484 7.25068 Length 16, n 32, alignment 0/ 0: 9.10177 3.06646 4.81682 4.41358 9.89656 Length 64, n 32, alignment 0/ 0: 8.57287 2.53847 2.94869 2.70506 8.2629 Length 16, n 32, alignment 6/ 4: 9.20906 3.04216 6.37553 9.46301 10.2489 Length 64, n 32, alignment 6/ 4: 9.73117 2.75023 4.49311 7.7856 9.59261 Length 32, n 64, alignment 0/ 0: 10.9253 3.80104 4.83111 4.97682 12.1086 Length 128, n 64, alignment 0/ 0: 9.26987 3.15895 3.49112 4.31372 10.1329 Length 32, n 64, alignment 5/ 6: 14.1856 3.78089 7.1768 9.63551 13.9944 Length 128, n 64, alignment 5/ 6: 11.5298 3.5249 5.07847 9.96481 12.8245 Length 64, n 128, alignment 0/ 0: 12.0142 4.73085 5.98759 7.1613 15.0462 Length 256, n 128, alignment 0/ 0: 7.96029 4.50244 6.44433 5.38248 11.6022 Length 64, n 128, alignment 4/ 0: 12.4223 4.80085 7.79294 11.0101 15.5277 Length 256, n 128, alignment 4/ 0: 12.2371 4.79242 6.83902 13.2758 16.0479 Length 128, n 256, alignment 0/ 0: 13.9165 7.28703 8.13319 8.79111 16.9101 Length 512, n 256, alignment 0/ 0: 10.5083 6.49881 9.05173 9.03139 19.6212 Length 128, n 256, alignment 3/ 2: 18.025 7.45493 9.86636 18.7234 20.5106 Length 512, n 256, alignment 3/ 2: 16.9588 7.07807 9.97969 23.4911 25.4407 Length 256, n 512, alignment 0/ 0: 17.6801 12.5811 15.3595 13.9989 28.5549 Length 1024, n 512, alignment 0/ 0: 16.379 10.7794 16.4748 16.7344 37.8286 Length 256, n 512, alignment 2/ 4: 23.2012 13.2761 14.3776 26.3752 31.6336 Length 1024, n 512, alignment 2/ 4: 25.4264 12.1716 17.2608 42.2122 47.425 Length 512, n 1024, alignment 0/ 0: 21.0239 23.0736 19.8285 21.0169 48.0091 
Length 2048, n 1024, alignment 0/ 0: 28.424 19.323 36.917 35.4247 68.1661 Length 512, n 1024, alignment 1/ 6: 32.3159 24.2617 21.4919 46.5936 55.163 Length 2048, n 1024, alignment 1/ 6: 43.0359 21.6207 37.7643 77.5705 83.2998
On 9/4/20 1:56 PM, Raphael Moreira Zinsly via Libc-alpha wrote: > Changes since v1: > - Fixed comment indentation and added some spaces to improve > readability. > - Use "POWER 9 LE" instead of "PowerPC64/POWER9". > - Fixed copyright dates. > - Replaced cmpwi with cmpdi. > > ---8<--- > > Similar to the strcpy P9 optimization, this version uses VSX to improve > performance. > --- > sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 281 ++++++++++++++++++ > sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +- > .../powerpc64/multiarch/ifunc-impl-list.c | 5 + > .../powerpc64/multiarch/strncpy-power9.S | 26 ++ > sysdeps/powerpc/powerpc64/multiarch/strncpy.c | 7 + > 5 files changed, 320 insertions(+), 1 deletion(-) > create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strncpy.S > create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S > > diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S > new file mode 100644 > index 0000000000..34fcdee913 > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S > @@ -0,0 +1,281 @@ > +/* Optimized strncpy implementation for POWER9 LE. > + Copyright (C) 2020 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
*/ > + > +#include <sysdep.h> > + > +# ifndef STRNCPY > +# define FUNC_NAME strncpy > +# else > +# define FUNC_NAME STRNCPY > +# endif > + > +/* Implements the function > + > + char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5]) > + > + The implementation can load bytes past a null terminator, but only > + up to the next 16-byte aligned address, so it never crosses a page. */ > + > +.machine power9 > +ENTRY_TOCLESS (FUNC_NAME, 4) > + CALL_MCOUNT 2 > + > + /* NULL string optimizations */ > + cmpdi r5, 0 > + beqlr > + > + lbz r0,0(r4) > + stb r0,0(r3) > + addi r11,r3,1 > + addi r5,r5,-1 > + vspltisb v18,0 /* Zeroes in v18 */ > + cmpdi r0,0 > + beq L(zero_padding_loop) > + > + /* Empty/1-byte string optimization */ > + cmpdi r5,0 > + beqlr > + > + addi r4,r4,1 > + neg r7,r4 > + rldicl r9,r7,0,60 /* How many bytes to get source 16B aligned? */ > + > + /* Get source 16B aligned */ > + lvx v0,0,r4 > + lvsr v1,0,r4 > + vperm v0,v18,v0,v1 > + > + vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */ > + vctzlsbb r7,v6 /* Number of trailing zeroes */ > + addi r8,r7,1 /* Add null terminator */ > + > + /* r8 = bytes including null > + r9 = bytes to get source 16B aligned > + if r8 > r9 > + no null, copy r9 bytes > + else > + there is a null, copy r8 bytes and return. */ > + cmpld r8,r9 > + bgt L(no_null) > + > + cmpld cr6,r8,r5 /* r8 <= n? */ > + ble cr6,L(null) > + > + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > + stxvl 32+v0,r11,r10 /* Partial store */ At first I was confused by this 32+vX syntax. Maybe we could consider adding defines for VSX registers to sysdeps/powerpc/sysdep.h in the future? This way we could refer to v0+32 as vs32, for example. But I don't think this needs to be part of this patchset. 
> + > + blr > + > +L(null): > + sldi r10,r8,56 /* stxvl wants size in top 8 bits */ > + stxvl 32+v0,r11,r10 /* Partial store */ > + > + add r11,r11,r8 > + sub r5,r5,r8 > + b L(zero_padding_loop) > + > +L(no_null): > + cmpld r9,r5 /* Check if length was reached. */ > + bge L(n_tail1) > + > + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ > + stxvl 32+v0,r11,r10 /* Partial store */ > + > + add r4,r4,r9 > + add r11,r11,r9 > + sub r5,r5,r9 > + > +L(loop): > + cmpldi cr6,r5,64 /* Check if length was reached. */ > + ble cr6,L(final_loop) > + > + lxv 32+v0,0(r4) > + vcmpequb. v6,v0,v18 /* Any zero bytes? */ > + bne cr6,L(prep_tail1) > + > + lxv 32+v1,16(r4) > + vcmpequb. v6,v1,v18 /* Any zero bytes? */ > + bne cr6,L(prep_tail2) > + > + lxv 32+v2,32(r4) > + vcmpequb. v6,v2,v18 /* Any zero bytes? */ > + bne cr6,L(prep_tail3) > + > + lxv 32+v3,48(r4) > + vcmpequb. v6,v3,v18 /* Any zero bytes? */ > + bne cr6,L(prep_tail4) > + > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + stxv 32+v3,48(r11) > + > + addi r4,r4,64 > + addi r11,r11,64 > + addi r5,r5,-64 > + > + b L(loop) > + > +L(final_loop): > + cmpldi cr5,r5,16 > + lxv 32+v0,0(r4) > + vcmpequb. v6,v0,v18 /* Any zero bytes? */ > + ble cr5,L(prep_n_tail1) > + bne cr6,L(count_tail1) > + addi r5,r5,-16 > + > + cmpldi cr5,r5,16 > + lxv 32+v1,16(r4) > + vcmpequb. v6,v1,v18 /* Any zero bytes? */ > + ble cr5,L(prep_n_tail2) > + bne cr6,L(count_tail2) > + addi r5,r5,-16 > + > + cmpldi cr5,r5,16 > + lxv 32+v2,32(r4) > + vcmpequb. v6,v2,v18 /* Any zero bytes? */ > + ble cr5,L(prep_n_tail3) > + bne cr6,L(count_tail3) > + addi r5,r5,-16 > + > + lxv 32+v3,48(r4) > + vcmpequb. v6,v3,v18 /* Any zero bytes? */ > + beq cr6,L(n_tail4) > + > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > + cmpld r8,r5 /* r8 < n? 
*/ > + blt L(tail4) > + > +L(n_tail4): > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > + addi r11,r11,48 /* Offset */ > + stxvl 32+v3,r11,r10 /* Partial store */ > + blr > + > +L(prep_n_tail1): > + beq cr6,L(n_tail1) /* Any zero bytes? */ > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > + cmpld r8,r5 /* r8 < n? */ > + blt L(tail1) > + > +L(n_tail1): > + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > + stxvl 32+v0,r11,r10 /* Partial store */ > + blr > + > +L(prep_n_tail2): > + beq cr6,L(n_tail2) /* Any zero bytes? */ > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > + cmpld r8,r5 /* r8 < n? */ > + blt L(tail2) > + > +L(n_tail2): > + stxv 32+v0,0(r11) > + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > + addi r11,r11,16 /* offset */ > + stxvl 32+v1,r11,r10 /* Partial store */ > + blr > + > +L(prep_n_tail3): > + beq cr6,L(n_tail3) /* Any zero bytes? */ > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > + cmpld r8,r5 /* r8 < n? 
*/ > + blt L(tail3) > + > +L(n_tail3): > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > + addi r11,r11,32 /* Offset */ > + stxvl 32+v2,r11,r10 /* Partial store */ > + blr > + > +L(prep_tail1): > +L(count_tail1): > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > +L(tail1): > + addi r9,r8,1 /* Add null terminator */ > + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ > + stxvl 32+v0,r11,r10 /* Partial store */ > + add r11,r11,r9 > + sub r5,r5,r9 > + b L(zero_padding_loop) > + > +L(prep_tail2): > + addi r5,r5,-16 > +L(count_tail2): > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > +L(tail2): > + addi r9,r8,1 /* Add null terminator */ > + stxv 32+v0,0(r11) > + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ > + addi r11,r11,16 /* offset */ > + stxvl 32+v1,r11,r10 /* Partial store */ > + add r11,r11,r9 > + sub r5,r5,r9 > + b L(zero_padding_loop) > + > +L(prep_tail3): > + addi r5,r5,-32 > +L(count_tail3): > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > +L(tail3): > + addi r9,r8,1 /* Add null terminator */ > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ > + addi r11,r11,32 /* offset */ > + stxvl 32+v2,r11,r10 /* Partial store */ > + add r11,r11,r9 > + sub r5,r5,r9 > + b L(zero_padding_loop) > + > +L(prep_tail4): > + addi r5,r5,-48 > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > +L(tail4): > + addi r9,r8,1 /* Add null terminator */ > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ > + addi r11,r11,48 /* offset */ > + stxvl 32+v3,r11,r10 /* Partial store */ > + add r11,r11,r9 > + sub r5,r5,r9 > + > +/* This code pads the remainder of dest with NULL bytes. */ > +L(zero_padding_loop): > + cmpldi cr6,r5,16 /* Check if length was reached. 
*/ > + ble cr6,L(zero_padding_end) > + > + stxv v18,0(r11) > + addi r11,r11,16 > + addi r5,r5,-16 > + > + b L(zero_padding_loop) > + > +L(zero_padding_end): > + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > + stxvl v18,r11,r10 /* Partial store */ > + blr > + The logic looks good. I tried to find a way to reuse some code, as there are many similar blocks (e.g. tail* blocks). But their slight differences make it hard to reuse anything. > +END (FUNC_NAME) > diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile > index 19acb6c64a..cd2b47b403 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile > +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile > @@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ > > ifneq (,$(filter %le,$(config-machine))) > sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \ > - rawmemchr-power9 strlen-power9 > + rawmemchr-power9 strlen-power9 strncpy-power9 > endif > CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops > CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops > diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > index ea10b00417..aa63e1c23f 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > @@ -301,6 +301,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c. 
*/ > IFUNC_IMPL (i, name, strncpy, > +#ifdef __LITTLE_ENDIAN__ > + IFUNC_IMPL_ADD (array, i, strncpy, > + hwcap2 & PPC_FEATURE2_ARCH_3_00, > + __strncpy_power9) > +#endif > IFUNC_IMPL_ADD (array, i, strncpy, > hwcap2 & PPC_FEATURE2_ARCH_2_07, > __strncpy_power8) > diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S > new file mode 100644 > index 0000000000..ab7c570d54 > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S > @@ -0,0 +1,26 @@ > +/* Optimized strncpy implementation for POWER9 LE. > + Copyright (C) 2020 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
*/ > + > +#if defined __LITTLE_ENDIAN__ && IS_IN (libc) > +#define STRNCPY __strncpy_power9 > + > +#undef libc_hidden_builtin_def > +#define libc_hidden_builtin_def(name) > + > +#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S> > +#endif > diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c > index 7bacf28aca..8ef0a99cb5 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c > +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c > @@ -28,11 +28,18 @@ > extern __typeof (strncpy) __strncpy_ppc attribute_hidden; > extern __typeof (strncpy) __strncpy_power7 attribute_hidden; > extern __typeof (strncpy) __strncpy_power8 attribute_hidden; > +# ifdef __LITTLE_ENDIAN__ > +extern __typeof (strncpy) __strncpy_power9 attribute_hidden; > +# endif > # undef strncpy > > /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle > ifunc symbol properly. */ > libc_ifunc_redirected (__redirect_strncpy, strncpy, > +# ifdef __LITTLE_ENDIAN__ > + (hwcap2 & PPC_FEATURE2_ARCH_3_00) > + ? __strncpy_power9 : > +# endif > (hwcap2 & PPC_FEATURE2_ARCH_2_07) > ? __strncpy_power8 > : (hwcap & PPC_FEATURE_HAS_VSX) > -- The only thing missing now seems to be the .machine power9 issue that was pointed out in v1. Otherwise, LGTM. Reviewed-by: Matheus Castanho <msc@linux.ibm.com> -- Matheus Castanho
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S new file mode 100644 index 0000000000..34fcdee913 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S @@ -0,0 +1,281 @@ +/* Optimized strncpy implementation for POWER9 LE. + Copyright (C) 2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +# ifndef STRNCPY +# define FUNC_NAME strncpy +# else +# define FUNC_NAME STRNCPY +# endif + +/* Implements the function + + char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5]) + + The implementation can load bytes past a null terminator, but only + up to the next 16-byte aligned address, so it never crosses a page. */ + +.machine power9 +ENTRY_TOCLESS (FUNC_NAME, 4) + CALL_MCOUNT 2 + + /* NULL string optimizations */ + cmpdi r5, 0 + beqlr + + lbz r0,0(r4) + stb r0,0(r3) + addi r11,r3,1 + addi r5,r5,-1 + vspltisb v18,0 /* Zeroes in v18 */ + cmpdi r0,0 + beq L(zero_padding_loop) + + /* Empty/1-byte string optimization */ + cmpdi r5,0 + beqlr + + addi r4,r4,1 + neg r7,r4 + rldicl r9,r7,0,60 /* How many bytes to get source 16B aligned? 
*/ + + /* Get source 16B aligned */ + lvx v0,0,r4 + lvsr v1,0,r4 + vperm v0,v18,v0,v1 + + vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */ + vctzlsbb r7,v6 /* Number of trailing zeroes */ + addi r8,r7,1 /* Add null terminator */ + + /* r8 = bytes including null + r9 = bytes to get source 16B aligned + if r8 > r9 + no null, copy r9 bytes + else + there is a null, copy r8 bytes and return. */ + cmpld r8,r9 + bgt L(no_null) + + cmpld cr6,r8,r5 /* r8 <= n? */ + ble cr6,L(null) + + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ + stxvl 32+v0,r11,r10 /* Partial store */ + + blr + +L(null): + sldi r10,r8,56 /* stxvl wants size in top 8 bits */ + stxvl 32+v0,r11,r10 /* Partial store */ + + add r11,r11,r8 + sub r5,r5,r8 + b L(zero_padding_loop) + +L(no_null): + cmpld r9,r5 /* Check if length was reached. */ + bge L(n_tail1) + + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ + stxvl 32+v0,r11,r10 /* Partial store */ + + add r4,r4,r9 + add r11,r11,r9 + sub r5,r5,r9 + +L(loop): + cmpldi cr6,r5,64 /* Check if length was reached. */ + ble cr6,L(final_loop) + + lxv 32+v0,0(r4) + vcmpequb. v6,v0,v18 /* Any zero bytes? */ + bne cr6,L(prep_tail1) + + lxv 32+v1,16(r4) + vcmpequb. v6,v1,v18 /* Any zero bytes? */ + bne cr6,L(prep_tail2) + + lxv 32+v2,32(r4) + vcmpequb. v6,v2,v18 /* Any zero bytes? */ + bne cr6,L(prep_tail3) + + lxv 32+v3,48(r4) + vcmpequb. v6,v3,v18 /* Any zero bytes? */ + bne cr6,L(prep_tail4) + + stxv 32+v0,0(r11) + stxv 32+v1,16(r11) + stxv 32+v2,32(r11) + stxv 32+v3,48(r11) + + addi r4,r4,64 + addi r11,r11,64 + addi r5,r5,-64 + + b L(loop) + +L(final_loop): + cmpldi cr5,r5,16 + lxv 32+v0,0(r4) + vcmpequb. v6,v0,v18 /* Any zero bytes? */ + ble cr5,L(prep_n_tail1) + bne cr6,L(count_tail1) + addi r5,r5,-16 + + cmpldi cr5,r5,16 + lxv 32+v1,16(r4) + vcmpequb. v6,v1,v18 /* Any zero bytes? */ + ble cr5,L(prep_n_tail2) + bne cr6,L(count_tail2) + addi r5,r5,-16 + + cmpldi cr5,r5,16 + lxv 32+v2,32(r4) + vcmpequb. v6,v2,v18 /* Any zero bytes? 
*/ + ble cr5,L(prep_n_tail3) + bne cr6,L(count_tail3) + addi r5,r5,-16 + + lxv 32+v3,48(r4) + vcmpequb. v6,v3,v18 /* Any zero bytes? */ + beq cr6,L(n_tail4) + + vctzlsbb r8,v6 /* Number of trailing zeroes */ + cmpld r8,r5 /* r8 < n? */ + blt L(tail4) + +L(n_tail4): + stxv 32+v0,0(r11) + stxv 32+v1,16(r11) + stxv 32+v2,32(r11) + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ + addi r11,r11,48 /* Offset */ + stxvl 32+v3,r11,r10 /* Partial store */ + blr + +L(prep_n_tail1): + beq cr6,L(n_tail1) /* Any zero bytes? */ + vctzlsbb r8,v6 /* Number of trailing zeroes */ + cmpld r8,r5 /* r8 < n? */ + blt L(tail1) + +L(n_tail1): + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ + stxvl 32+v0,r11,r10 /* Partial store */ + blr + +L(prep_n_tail2): + beq cr6,L(n_tail2) /* Any zero bytes? */ + vctzlsbb r8,v6 /* Number of trailing zeroes */ + cmpld r8,r5 /* r8 < n? */ + blt L(tail2) + +L(n_tail2): + stxv 32+v0,0(r11) + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ + addi r11,r11,16 /* offset */ + stxvl 32+v1,r11,r10 /* Partial store */ + blr + +L(prep_n_tail3): + beq cr6,L(n_tail3) /* Any zero bytes? */ + vctzlsbb r8,v6 /* Number of trailing zeroes */ + cmpld r8,r5 /* r8 < n? 
*/ + blt L(tail3) + +L(n_tail3): + stxv 32+v0,0(r11) + stxv 32+v1,16(r11) + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ + addi r11,r11,32 /* Offset */ + stxvl 32+v2,r11,r10 /* Partial store */ + blr + +L(prep_tail1): +L(count_tail1): + vctzlsbb r8,v6 /* Number of trailing zeroes */ +L(tail1): + addi r9,r8,1 /* Add null terminator */ + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ + stxvl 32+v0,r11,r10 /* Partial store */ + add r11,r11,r9 + sub r5,r5,r9 + b L(zero_padding_loop) + +L(prep_tail2): + addi r5,r5,-16 +L(count_tail2): + vctzlsbb r8,v6 /* Number of trailing zeroes */ +L(tail2): + addi r9,r8,1 /* Add null terminator */ + stxv 32+v0,0(r11) + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ + addi r11,r11,16 /* offset */ + stxvl 32+v1,r11,r10 /* Partial store */ + add r11,r11,r9 + sub r5,r5,r9 + b L(zero_padding_loop) + +L(prep_tail3): + addi r5,r5,-32 +L(count_tail3): + vctzlsbb r8,v6 /* Number of trailing zeroes */ +L(tail3): + addi r9,r8,1 /* Add null terminator */ + stxv 32+v0,0(r11) + stxv 32+v1,16(r11) + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ + addi r11,r11,32 /* offset */ + stxvl 32+v2,r11,r10 /* Partial store */ + add r11,r11,r9 + sub r5,r5,r9 + b L(zero_padding_loop) + +L(prep_tail4): + addi r5,r5,-48 + vctzlsbb r8,v6 /* Number of trailing zeroes */ +L(tail4): + addi r9,r8,1 /* Add null terminator */ + stxv 32+v0,0(r11) + stxv 32+v1,16(r11) + stxv 32+v2,32(r11) + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ + addi r11,r11,48 /* offset */ + stxvl 32+v3,r11,r10 /* Partial store */ + add r11,r11,r9 + sub r5,r5,r9 + +/* This code pads the remainder of dest with NULL bytes. */ +L(zero_padding_loop): + cmpldi cr6,r5,16 /* Check if length was reached. 
*/ + ble cr6,L(zero_padding_end) + + stxv v18,0(r11) + addi r11,r11,16 + addi r5,r5,-16 + + b L(zero_padding_loop) + +L(zero_padding_end): + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ + stxvl v18,r11,r10 /* Partial store */ + blr + +END (FUNC_NAME) diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index 19acb6c64a..cd2b47b403 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ ifneq (,$(filter %le,$(config-machine))) sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \ - rawmemchr-power9 strlen-power9 + rawmemchr-power9 strlen-power9 strncpy-power9 endif CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index ea10b00417..aa63e1c23f 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -301,6 +301,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c. */ IFUNC_IMPL (i, name, strncpy, +#ifdef __LITTLE_ENDIAN__ + IFUNC_IMPL_ADD (array, i, strncpy, + hwcap2 & PPC_FEATURE2_ARCH_3_00, + __strncpy_power9) +#endif IFUNC_IMPL_ADD (array, i, strncpy, hwcap2 & PPC_FEATURE2_ARCH_2_07, __strncpy_power8) diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S new file mode 100644 index 0000000000..ab7c570d54 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S @@ -0,0 +1,26 @@ +/* Optimized strncpy implementation for POWER9 LE. + Copyright (C) 2020 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#if defined __LITTLE_ENDIAN__ && IS_IN (libc) +#define STRNCPY __strncpy_power9 + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) + +#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S> +#endif diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c index 7bacf28aca..8ef0a99cb5 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c @@ -28,11 +28,18 @@ extern __typeof (strncpy) __strncpy_ppc attribute_hidden; extern __typeof (strncpy) __strncpy_power7 attribute_hidden; extern __typeof (strncpy) __strncpy_power8 attribute_hidden; +# ifdef __LITTLE_ENDIAN__ +extern __typeof (strncpy) __strncpy_power9 attribute_hidden; +# endif # undef strncpy /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle ifunc symbol properly. */ libc_ifunc_redirected (__redirect_strncpy, strncpy, +# ifdef __LITTLE_ENDIAN__ + (hwcap2 & PPC_FEATURE2_ARCH_3_00) + ? __strncpy_power9 : +# endif (hwcap2 & PPC_FEATURE2_ARCH_2_07) ? __strncpy_power8 : (hwcap & PPC_FEATURE_HAS_VSX)