Message ID | 20200820182917.12602-1-rzinsly@linux.ibm.com |
---|---|
State | New |
Headers | show |
Series | [1/2] powerpc: Optimized strncpy for POWER9 | expand |
Here is the make bench output: generic_strncpy __strncpy_power9 __strncpy_power8 __strncpy_power7 __strncpy_ppc Length 16, n 16, alignment 1/ 1: 7.11694 2.77348 2.80296 6.5724 10.4471 Length 16, n 16, alignment 1/ 1: 7.1557 2.75968 2.805 6.5748 10.5064 Length 16, n 16, alignment 1/ 2: 7.17956 2.79127 2.79964 6.57323 10.3281 Length 16, n 16, alignment 2/ 1: 7.15841 2.77364 3.10582 6.2332 10.331 Length 2, n 4, alignment 7/ 2: 8.90911 2.4623 5.38449 5.64873 9.36348 Length 4, n 2, alignment 2/ 7: 6.65395 1.84558 2.58298 3.10566 7.46376 Length 2, n 4, alignment 7/ 2: 8.70625 2.41166 5.38131 5.73421 9.64285 Length 4, n 2, alignment 2/ 7: 6.65458 1.84354 2.58382 3.64721 6.96163 Length 16, n 16, alignment 2/ 2: 7.01778 2.77373 3.10668 6.58047 10.6006 Length 16, n 16, alignment 2/ 2: 7.53778 2.75789 3.10591 6.2277 10.2613 Length 16, n 16, alignment 2/ 4: 7.13828 2.79132 3.10567 6.56847 10.619 Length 16, n 16, alignment 4/ 2: 7.38659 2.77668 3.70851 6.54537 9.17368 Length 4, n 8, alignment 6/ 4: 8.71748 2.45183 5.76669 4.65782 10.014 Length 8, n 4, alignment 4/ 6: 6.5504 1.83463 2.96574 2.66227 8.49964 Length 4, n 8, alignment 6/ 4: 8.96461 2.4499 5.78384 5.32287 9.79641 Length 8, n 4, alignment 4/ 6: 6.48083 1.83265 2.9783 3.38632 8.51888 Length 16, n 16, alignment 3/ 3: 7.7538 2.77353 3.29008 6.55912 9.94143 Length 16, n 16, alignment 3/ 3: 7.75279 2.76148 3.30616 6.5445 9.98866 Length 16, n 16, alignment 3/ 6: 7.21486 2.79444 3.33712 6.24747 10.113 Length 16, n 16, alignment 6/ 3: 6.99138 2.77778 4.50777 6.22522 8.53482 Length 8, n 16, alignment 5/ 6: 8.26994 2.77966 4.60681 6.10938 10.5975 Length 16, n 8, alignment 6/ 5: 6.28062 2.07193 2.57761 4.95636 6.48035 Length 8, n 16, alignment 5/ 6: 8.17113 2.43559 4.27753 5.95453 11.1796 Length 16, n 8, alignment 6/ 5: 6.21214 2.07239 2.57714 4.96762 6.76041 Length 16, n 16, alignment 4/ 4: 7.31373 2.77573 3.78349 6.19349 8.91432 Length 16, n 16, alignment 4/ 4: 7.32226 2.75658 3.70319 6.60792 9.17307 Length 16, n 16, alignment 
4/ 0: 7.58812 2.76841 3.71554 6.54282 8.90051 Length 16, n 16, alignment 0/ 4: 6.92871 2.06944 2.66876 6.63947 8.08171 Length 16, n 32, alignment 4/ 0: 10.2972 3.3192 6.53695 7.77295 12.332 Length 32, n 16, alignment 0/ 4: 6.98056 2.06954 2.66445 6.54976 7.65286 Length 16, n 32, alignment 4/ 0: 10.5356 3.31343 6.53813 7.72029 12.2915 Length 32, n 16, alignment 0/ 4: 7.36068 2.06945 2.66424 6.21052 8.07614 Length 16, n 16, alignment 5/ 5: 7.2122 2.77732 4.17451 6.55383 10.4887 Length 16, n 16, alignment 5/ 5: 7.34438 2.77512 4.17191 6.56873 10.5664 Length 16, n 16, alignment 5/ 2: 7.15746 2.76198 4.14481 6.56235 10.7391 Length 16, n 16, alignment 2/ 5: 7.19372 2.79273 3.10693 6.56984 10.2697 Length 32, n 64, alignment 3/ 2: 15.3918 4.22964 7.0146 12.5809 13.8661 Length 64, n 32, alignment 2/ 3: 10.5331 3.02942 3.54253 9.19106 12.9356 Length 32, n 64, alignment 3/ 2: 15.369 4.17282 7.36163 12.5759 16.8501 Length 64, n 32, alignment 2/ 3: 10.5585 3.01971 3.52885 9.03369 15.6663 Length 16, n 16, alignment 6/ 6: 7.0405 2.77527 4.53842 6.54733 7.99437 Length 16, n 16, alignment 6/ 6: 7.02801 2.76059 4.52873 6.53536 8.45713 Length 16, n 16, alignment 6/ 4: 7.42011 2.77669 4.52223 6.57756 7.9899 Length 16, n 16, alignment 4/ 6: 7.37787 2.77507 3.77821 6.57058 9.17396 Length 64, n 128, alignment 2/ 4: 17.188 5.33493 8.00394 12.6196 19.1784 Length 128, n 64, alignment 4/ 2: 12.7962 3.91004 5.42994 11.294 12.5273 Length 64, n 128, alignment 2/ 4: 17.2298 5.2748 8.15392 12.6039 24.3802 Length 128, n 64, alignment 4/ 2: 12.7866 3.87534 5.3334 11.8516 21.6528 Length 16, n 16, alignment 7/ 7: 7.75015 2.76775 5.59024 6.57976 8.42318 Length 16, n 16, alignment 7/ 7: 7.81681 2.75691 5.56801 6.55397 10.0378 Length 16, n 16, alignment 7/ 6: 7.75225 2.77446 5.56813 6.57349 8.49645 Length 16, n 16, alignment 6/ 7: 7.23237 2.79186 4.51528 6.55304 8.63443 Length 128, n 256, alignment 1/ 6: 19.8414 8.37691 10.3445 18.4838 22.8314 Length 256, n 128, alignment 6/ 1: 14.7972 5.38498 8.83611 
13.8521 16.6154 Length 128, n 256, alignment 1/ 6: 19.8497 8.37754 10.3469 18.2655 43.3568 Length 256, n 128, alignment 6/ 1: 14.7542 5.31075 8.75314 13.7759 37.6351 Length 8, n 16, alignment 0/ 0: 8.19872 2.45818 4.27602 4.6578 7.98513 Length 32, n 16, alignment 0/ 0: 6.92066 2.07115 2.66465 2.66381 7.75655 Length 8, n 16, alignment 7/ 2: 8.18253 2.42685 4.70317 6.01808 9.35743 Length 32, n 16, alignment 7/ 2: 7.79714 2.60074 5.58717 6.64181 6.98583 Length 16, n 32, alignment 0/ 0: 10.4715 3.40184 6.28388 4.86146 11.0819 Length 64, n 32, alignment 0/ 0: 10.4403 2.54135 3.07109 3.38791 9.35196 Length 16, n 32, alignment 6/ 4: 10.7077 3.34867 7.01321 10.4278 11.2951 Length 64, n 32, alignment 6/ 4: 10.9215 3.03041 5.04324 8.30023 11.2648 Length 32, n 64, alignment 0/ 0: 12.0062 4.09428 5.32372 5.48319 14.1455 Length 128, n 64, alignment 0/ 0: 10.1803 3.47282 3.83134 4.21557 10.6674 Length 32, n 64, alignment 5/ 6: 15.4165 4.16297 7.78876 10.8762 15.4308 Length 128, n 64, alignment 5/ 6: 12.7332 3.91667 5.8014 10.5869 14.0961 Length 64, n 128, alignment 0/ 0: 13.238 5.24242 6.90661 8.05566 15.9848 Length 256, n 128, alignment 0/ 0: 8.759 4.9483 6.98675 6.11489 12.6755 Length 64, n 128, alignment 4/ 0: 13.6593 5.27931 8.60925 12.5916 17.5016 Length 256, n 128, alignment 4/ 0: 13.4801 5.37114 7.47485 14.0585 17.4517 Length 128, n 256, alignment 0/ 0: 15.3147 8.02462 8.92006 9.67769 20.3757 Length 512, n 256, alignment 0/ 0: 11.5638 7.22535 9.80468 9.93597 21.3421 Length 128, n 256, alignment 3/ 2: 19.8948 8.15967 10.9435 20.6146 22.4146 Length 512, n 256, alignment 3/ 2: 18.681 7.77864 10.9269 25.9269 28.0105 Length 256, n 512, alignment 0/ 0: 19.4894 13.7363 14.8394 15.4064 31.6341 Length 1024, n 512, alignment 0/ 0: 18.0108 11.8737 18.1779 18.5072 41.5425 Length 256, n 512, alignment 2/ 4: 25.5662 14.5189 16.1872 29.5395 33.7587 Length 1024, n 512, alignment 2/ 4: 28.0079 13.2347 19.067 48.1998 52.3078 Length 512, n 1024, alignment 0/ 0: 23.1385 25.4237 21.2303 
23.632 47.4502 Length 2048, n 1024, alignment 0/ 0: 31.201 21.308 40.6351 39.04 75.0329 Length 512, n 1024, alignment 1/ 6: 35.6234 27.0042 24.4711 51.3364 60.6277 Length 2048, n 1024, alignment 1/ 6: 47.442 24.0381 41.6616 85.4832 91.8897
On 8/20/20 1:29 PM, Raphael Moreira Zinsly via Libc-alpha wrote: > Similar to the strcpy P9 optimization, this version uses VSX to improve > performance. > --- > sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 276 ++++++++++++++++++ > sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +- > .../powerpc64/multiarch/ifunc-impl-list.c | 5 + > .../powerpc64/multiarch/strncpy-power9.S | 26 ++ > sysdeps/powerpc/powerpc64/multiarch/strncpy.c | 7 + > 5 files changed, 315 insertions(+), 1 deletion(-) > create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strncpy.S > create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S > > diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S > new file mode 100644 > index 0000000000..cde68384d4 > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S > @@ -0,0 +1,276 @@ > +/* Optimized strncpy implementation for PowerPC64/POWER9. > + Copyright (C) 2020 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
*/ > + > +#include <sysdep.h> > + > +# ifndef STRNCPY > +# define FUNC_NAME strncpy > +# else > +# define FUNC_NAME STRNCPY > +# endif > + > +/* Implements the function > + > + char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5]) > + > + The implementation can load bytes past a null terminator, but only > + up to the next 16B boundary, so it never crosses a page. */ > + > +.machine power9 > +ENTRY_TOCLESS (FUNC_NAME, 4) > + CALL_MCOUNT 2 > + > + cmpwi r5, 0 > + beqlr Trivial nit, an newline after branches helps readability for me. > + /* NULL string optimisation */ > + lbz r0,0(r4) > + stb r0,0(r3) > + addi r11,r3,1 > + addi r5,r5,-1 > + vspltisb v18,0 /* Zeroes in v18 */ > + cmpwi r0,0 > + beq L(zero_padding_loop) > + > + cmpwi r5,0 > + beqlr OK. > + > +L(cont): I think this label can be removed or replaced with a comment. > + addi r4,r4,1 > + neg r7,r4 > + rldicl r9,r7,0,60 /* How many bytes to get source 16B aligned? */ > + > + /* Get source 16B aligned */ > + lvx v0,0,r4 > + lvsr v1,0,r4 > + vperm v0,v18,v0,v1 > + > + vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */ > + vctzlsbb r7,v6 /* Number of trailing zeroes */ > + addi r8,r7,1 /* Add null terminator */ Minor nit, can you align the comment with previous comments? > + > + /* r8 = bytes including null > + r9 = bytes to get source 16B aligned > + if r8 > r9 > + no null, copy r9 bytes > + else > + there is a null, copy r8 bytes and return. */ > + cmpd r8,r9 > + bgt L(no_null) > + > + cmpd r8,r5 /* r8 <= n? */ Minor, you could use another CR and run this in parallel with the previous check. > + ble L(null) > + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > + stxvl 32+v0,r11,r10 /* Partial store */ > + > + blr OK. > + > +L(null): > + sldi r10,r8,56 /* stxvl wants size in top 8 bits */ > + stxvl 32+v0,r11,r10 /* Partial store */ > + > + add r11,r11,r8 > + sub r5,r5,r8 > + b L(zero_padding_loop) OK. > + > +L(no_null): > + cmpd r9,r5 /* Check if length was reached. 
*/ > + bge L(n_tail1) An extra newline would help here. > + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ > + stxvl 32+v0,r11,r10 /* Partial store */ > + > + add r4,r4,r9 > + add r11,r11,r9 > + sub r5,r5,r9 OK. > + > +L(loop): > + cmpldi cr6,r5,64 /* Check if length was reached. */ > + ble cr6,L(final_loop) > + > + lxv 32+v0,0(r4) > + vcmpequb. v6,v0,v18 /* Any zero bytes? */ > + bne cr6,L(prep_tail1) > + > + lxv 32+v1,16(r4) > + vcmpequb. v6,v1,v18 /* Any zero bytes? */ > + bne cr6,L(prep_tail2) > + > + lxv 32+v2,32(r4) > + vcmpequb. v6,v2,v18 /* Any zero bytes? */ > + bne cr6,L(prep_tail3) > + > + lxv 32+v3,48(r4) > + vcmpequb. v6,v3,v18 /* Any zero bytes? */ > + bne cr6,L(prep_tail4) > + > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + stxv 32+v3,48(r11) > + > + addi r4,r4,64 > + addi r11,r11,64 > + addi r5,r5,-64 > + > + b L(loop) OK. > + > +L(final_loop): > + cmpldi cr5,r5,16 > + lxv 32+v0,0(r4) > + vcmpequb. v6,v0,v18 /* Any zero bytes? */ > + ble cr5,L(prep_n_tail1) > + bne cr6,L(count_tail1) > + addi r5,r5,-16 > + > + cmpldi cr5,r5,16 > + lxv 32+v1,16(r4) > + vcmpequb. v6,v1,v18 /* Any zero bytes? */ > + ble cr5,L(prep_n_tail2) > + bne cr6,L(count_tail2) > + addi r5,r5,-16 > + > + cmpldi cr5,r5,16 > + lxv 32+v2,32(r4) > + vcmpequb. v6,v2,v18 /* Any zero bytes? */ > + ble cr5,L(prep_n_tail3) > + bne cr6,L(count_tail3) > + addi r5,r5,-16 OK. > + > + lxv 32+v3,48(r4) > + vcmpequb. v6,v3,v18 /* Any zero bytes? */ > + beq cr6,L(n_tail4) > + > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > + cmpd r8,r5 /* r8 < n? */ > + blt L(tail4) OK. Newline here (and for the other similar cases below too please). > +L(n_tail4): > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > + addi r11,r11,48 /* Offset */ > + stxvl 32+v3,r11,r10 /* Partial store */ > + blr OK. > + > +L(prep_n_tail1): > + beq cr6,L(n_tail1) /* Any zero bytes? 
*/ > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > + cmpd r8,r5 /* r8 < n? */ > + blt L(tail1) > +L(n_tail1): > + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > + stxvl 32+v0,r11,r10 /* Partial store */ > + blr OK. > + > +L(prep_n_tail2): > + beq cr6,L(n_tail2) /* Any zero bytes? */ > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > + cmpd r8,r5 /* r8 < n? */ > + blt L(tail2) > +L(n_tail2): > + stxv 32+v0,0(r11) > + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > + addi r11,r11,16 /* offset */ > + stxvl 32+v1,r11,r10 /* Partial store */ > + blr OK. > + > +L(prep_n_tail3): > + beq cr6,L(n_tail3) /* Any zero bytes? */ > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > + cmpd r8,r5 /* r8 < n? */ > + blt L(tail3) > +L(n_tail3): > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > + addi r11,r11,32 /* Offset */ > + stxvl 32+v2,r11,r10 /* Partial store */ > + blr OK. > + > +L(prep_tail1): > +L(count_tail1): > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > +L(tail1): > + addi r9,r8,1 /* Add null terminator */ Please align this comment (and the 3 other similar cases). > + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ > + stxvl 32+v0,r11,r10 /* Partial store */ > + add r11,r11,r9 > + sub r5,r5,r9 > + b L(zero_padding_loop) OK. 
> + > +L(prep_tail2): > + addi r5,r5,-16 > +L(count_tail2): > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > +L(tail2): > + addi r9,r8,1 /* Add null terminator */ > + stxv 32+v0,0(r11) > + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ > + addi r11,r11,16 /* offset */ > + stxvl 32+v1,r11,r10 /* Partial store */ > + add r11,r11,r9 > + sub r5,r5,r9 > + b L(zero_padding_loop) > + > +L(prep_tail3): > + addi r5,r5,-32 > +L(count_tail3): > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > +L(tail3): > + addi r9,r8,1 /* Add null terminator */ > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ > + addi r11,r11,32 /* offset */ > + stxvl 32+v2,r11,r10 /* Partial store */ > + add r11,r11,r9 > + sub r5,r5,r9 > + b L(zero_padding_loop) > + > +L(prep_tail4): > + addi r5,r5,-48 > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > +L(tail4): > + addi r9,r8,1 /* Add null terminator */ > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ > + addi r11,r11,48 /* offset */ > + stxvl 32+v3,r11,r10 /* Partial store */ > + add r11,r11,r9 > + sub r5,r5,r9 OK. > + > +/* This code pads the remainder of dest with NULL bytes. */ > +L(zero_padding_loop): > + cmpldi cr6,r5,16 /* Check if length was reached. */ > + ble cr6,L(zero_padding_end) > + > + stxv v18,0(r11) > + addi r11,r11,16 > + addi r5,r5,-16 > + > + b L(zero_padding_loop) > + OK. > +L(zero_padding_end): > + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > + stxvl v18,r11,r10 /* Partial store */ > + blr OK. > + > +L(n_tail):Is this label used? > + > +END (FUNC_NAME) > diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile > index 19acb6c64a..cd2b47b403 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile > +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile OK. 
> index ea10b00417..aa63e1c23f 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c OK. > diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S > new file mode 100644 > index 0000000000..b9b6092f7b > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S OK. > diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c > index 7bacf28aca..822ceb2003 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c > +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c > @@ -28,11 +28,18 @@ > extern __typeof (strncpy) __strncpy_ppc attribute_hidden; > extern __typeof (strncpy) __strncpy_power7 attribute_hidden; > extern __typeof (strncpy) __strncpy_power8 attribute_hidden; > +# ifdef __LITTLE_ENDIAN__ > +extern __typeof (strncpy) __strncpy_power9 attribute_hidden; > +# endif > # undef strncpy > > /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle > ifunc symbol properly. */ > libc_ifunc_redirected (__redirect_strncpy, strncpy, > +# ifdef __LITTLE_ENDIAN__ > + (hwcap2 & PPC_FEATURE2_ARCH_3_00) > + ? __strncpy_power9 : Trivial nit, I think the above two lines need two extra spaces.
On Thu, Aug 20, 2020 at 03:29:16PM -0300, Raphael Moreira Zinsly via Libc-alpha wrote: > Similar to the strcpy P9 optimization, this version uses VSX to improve > performance. > --- > diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S > new file mode 100644 > index 0000000000..cde68384d4 > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S > @@ -0,0 +1,276 @@ > +/* Optimized strncpy implementation for PowerPC64/POWER9. sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S below, has "POWER9/PPC64". Can we make these consistent? Can we just say "POWER9"? Do we need to indicate little-endian only? > + Copyright (C) 2020 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <sysdep.h> > + > +# ifndef STRNCPY > +# define FUNC_NAME strncpy > +# else > +# define FUNC_NAME STRNCPY > +# endif > + > +/* Implements the function > + > + char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5]) > + > + The implementation can load bytes past a null terminator, but only > + up to the next 16B boundary, so it never crosses a page. 
*/ nit, subjective: "up to the next 16-byte aligned address" > + > +.machine power9 > +ENTRY_TOCLESS (FUNC_NAME, 4) > + CALL_MCOUNT 2 > + > + cmpwi r5, 0 This should be "cmpdi". > + beqlr > + /* NULL string optimisation */ This comment would make more sense above the "cmpdi", above. > + lbz r0,0(r4) > + stb r0,0(r3) > + addi r11,r3,1 > + addi r5,r5,-1 > + vspltisb v18,0 /* Zeroes in v18 */ > + cmpwi r0,0 This should be "cmpdi". > + beq L(zero_padding_loop) > + Given the above "NULL string" comment, you could put an "empty string optimization" comment here. > + cmpwi r5,0 This should be "cmpdi". > + beqlr The "addi r11,r3,1" and "vspltisb v18,0" above aren't needed until a bit later, which penalizes the empty string case. I think you can move the empty string test up. Some experiments seemed to move the lbz and dependent stb apart. Something like this: /* NULL string optimisation */ cmpdi r5,0 beqlr lbz r0,0(r4) /* empty/1-byte string optimisation */ cmpdi r5,1 stb r0,0(r3) beqlr cmpdi r0,0 addi r11,r3,1 addi r5,r5,-1 vspltisb v18,0 /* Zeroes in v18 */ beq L(zero_padding_loop) (But, I didn't see significant performance difference in some light experimentation. It might be worth another look.) > + > +L(cont): This label isn't used. > + addi r4,r4,1 > + neg r7,r4 > + rldicl r9,r7,0,60 /* How many bytes to get source 16B aligned? */ > + > + /* Get source 16B aligned */ > + lvx v0,0,r4 > + lvsr v1,0,r4 > + vperm v0,v18,v0,v1 > + > + vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */ > + vctzlsbb r7,v6 /* Number of trailing zeroes */ > + addi r8,r7,1 /* Add null terminator */ > + > + /* r8 = bytes including null > + r9 = bytes to get source 16B aligned > + if r8 > r9 > + no null, copy r9 bytes > + else > + there is a null, copy r8 bytes and return. */ > + cmpd r8,r9 This should probably be "cmpld". > + bgt L(no_null) > + > + cmpd r8,r5 /* r8 <= n? */ This should probably be "cmpld". 
> + ble L(null) > + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > + stxvl 32+v0,r11,r10 /* Partial store */ Do we still need this "32+v0" syntax? Is that due to a minimum supported level of binutils which isn't VSX-aware? > + > + blr > + > +L(null): > + sldi r10,r8,56 /* stxvl wants size in top 8 bits */ > + stxvl 32+v0,r11,r10 /* Partial store */ > + > + add r11,r11,r8 > + sub r5,r5,r8 > + b L(zero_padding_loop) > + > +L(no_null): > + cmpd r9,r5 /* Check if length was reached. */ This should probably be "cmpld". > + bge L(n_tail1) > + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ > + stxvl 32+v0,r11,r10 /* Partial store */ > + > + add r4,r4,r9 > + add r11,r11,r9 > + sub r5,r5,r9 > + > +L(loop): > + cmpldi cr6,r5,64 /* Check if length was reached. */ > + ble cr6,L(final_loop) > + > + lxv 32+v0,0(r4) > + vcmpequb. v6,v0,v18 /* Any zero bytes? */ > + bne cr6,L(prep_tail1) > + > + lxv 32+v1,16(r4) > + vcmpequb. v6,v1,v18 /* Any zero bytes? */ > + bne cr6,L(prep_tail2) > + > + lxv 32+v2,32(r4) > + vcmpequb. v6,v2,v18 /* Any zero bytes? */ > + bne cr6,L(prep_tail3) > + > + lxv 32+v3,48(r4) > + vcmpequb. v6,v3,v18 /* Any zero bytes? */ > + bne cr6,L(prep_tail4) > + > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + stxv 32+v3,48(r11) > + > + addi r4,r4,64 > + addi r11,r11,64 > + addi r5,r5,-64 > + > + b L(loop) > + > +L(final_loop): > + cmpldi cr5,r5,16 > + lxv 32+v0,0(r4) > + vcmpequb. v6,v0,v18 /* Any zero bytes? */ > + ble cr5,L(prep_n_tail1) > + bne cr6,L(count_tail1) > + addi r5,r5,-16 > + > + cmpldi cr5,r5,16 > + lxv 32+v1,16(r4) > + vcmpequb. v6,v1,v18 /* Any zero bytes? */ > + ble cr5,L(prep_n_tail2) > + bne cr6,L(count_tail2) > + addi r5,r5,-16 > + > + cmpldi cr5,r5,16 > + lxv 32+v2,32(r4) > + vcmpequb. v6,v2,v18 /* Any zero bytes? */ > + ble cr5,L(prep_n_tail3) > + bne cr6,L(count_tail3) > + addi r5,r5,-16 > + > + lxv 32+v3,48(r4) > + vcmpequb. v6,v3,v18 /* Any zero bytes? 
*/ > + beq cr6,L(n_tail4) > + > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > + cmpd r8,r5 /* r8 < n? */ This should probably be "cmpld". > + blt L(tail4) > +L(n_tail4): > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > + addi r11,r11,48 /* Offset */ > + stxvl 32+v3,r11,r10 /* Partial store */ > + blr > + > +L(prep_n_tail1): > + beq cr6,L(n_tail1) /* Any zero bytes? */ > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > + cmpd r8,r5 /* r8 < n? */ This should probably be "cmpld". > + blt L(tail1) > +L(n_tail1): > + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > + stxvl 32+v0,r11,r10 /* Partial store */ > + blr > + > +L(prep_n_tail2): > + beq cr6,L(n_tail2) /* Any zero bytes? */ > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > + cmpd r8,r5 /* r8 < n? */ This should probably be "cmpld". > + blt L(tail2) > +L(n_tail2): > + stxv 32+v0,0(r11) > + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > + addi r11,r11,16 /* offset */ > + stxvl 32+v1,r11,r10 /* Partial store */ > + blr > + > +L(prep_n_tail3): > + beq cr6,L(n_tail3) /* Any zero bytes? */ > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > + cmpd r8,r5 /* r8 < n? */ This should probably be "cmpld". 
> + blt L(tail3) > +L(n_tail3): > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > + addi r11,r11,32 /* Offset */ > + stxvl 32+v2,r11,r10 /* Partial store */ > + blr > + > +L(prep_tail1): > +L(count_tail1): > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > +L(tail1): > + addi r9,r8,1 /* Add null terminator */ > + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ > + stxvl 32+v0,r11,r10 /* Partial store */ > + add r11,r11,r9 > + sub r5,r5,r9 > + b L(zero_padding_loop) > + > +L(prep_tail2): > + addi r5,r5,-16 > +L(count_tail2): > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > +L(tail2): > + addi r9,r8,1 /* Add null terminator */ > + stxv 32+v0,0(r11) > + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ > + addi r11,r11,16 /* offset */ > + stxvl 32+v1,r11,r10 /* Partial store */ > + add r11,r11,r9 > + sub r5,r5,r9 > + b L(zero_padding_loop) > + > +L(prep_tail3): > + addi r5,r5,-32 > +L(count_tail3): > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > +L(tail3): > + addi r9,r8,1 /* Add null terminator */ > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ > + addi r11,r11,32 /* offset */ > + stxvl 32+v2,r11,r10 /* Partial store */ > + add r11,r11,r9 > + sub r5,r5,r9 > + b L(zero_padding_loop) > + > +L(prep_tail4): > + addi r5,r5,-48 > + vctzlsbb r8,v6 /* Number of trailing zeroes */ > +L(tail4): > + addi r9,r8,1 /* Add null terminator */ > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ > + addi r11,r11,48 /* offset */ > + stxvl 32+v3,r11,r10 /* Partial store */ > + add r11,r11,r9 > + sub r5,r5,r9 > + > +/* This code pads the remainder of dest with NULL bytes. */ > +L(zero_padding_loop): > + cmpldi cr6,r5,16 /* Check if length was reached. 
*/ > + ble cr6,L(zero_padding_end) > + > + stxv v18,0(r11) > + addi r11,r11,16 > + addi r5,r5,-16 > + > + b L(zero_padding_loop) > + > +L(zero_padding_end): > + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ > + stxvl v18,r11,r10 /* Partial store */ > + blr > + > +L(n_tail): > + > +END (FUNC_NAME) PC
Raphael Moreira Zinsly via Libc-alpha <libc-alpha@sourceware.org> writes: > diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S > new file mode 100644 > index 0000000000..cde68384d4 > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S > @@ -0,0 +1,276 @@ > ... > +/* Implements the function > + > + char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5]) > + > + The implementation can load bytes past a null terminator, but only > + up to the next 16B boundary, so it never crosses a page. */ > + > +.machine power9 I don't think Binutils 2.26 supports .machine power9. Likewise for all P9 instructions. However, current glibc is expected to work with Binutils 2.26 (ppc64le), i.e. builds with Binutils 2.26 should not fail. So, we either need to change this code (e.g. similar to strcmp) or we need to bump the Binutils requirements. The last time the Binutils requirements were bumped was in 2017, so I think it's safe to do this now. Let me prepare a patch proposing this.
On 9/2/20 8:20 AM, Tulio Magno Quites Machado Filho wrote: > Raphael Moreira Zinsly via Libc-alpha <libc-alpha@sourceware.org> writes: > >> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S >> new file mode 100644 >> index 0000000000..cde68384d4 >> --- /dev/null >> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S >> @@ -0,0 +1,276 @@ >> ... >> +/* Implements the function >> + >> + char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5]) >> + >> + The implementation can load bytes past a null terminator, but only >> + up to the next 16B boundary, so it never crosses a page. */ >> + >> +.machine power9 > > I don't think Binutils 2.26 supports .machine power9. Likewise for all P9 > instructions. However, current glibc is expected to work with Binutils 2.26 > (ppc64le), i.e. builds with Binutils 2.26 should not fail. > > So, we either need to change this code (e.g. similar to strcmp) or we need > to bump the Binutils requirements. > The last time Binutils requirements was bumped was in 2017, so I think it's safe > to do this now. > > Let me prepare a patch proposing this. There are at least 5 uses of .machine power9 throughout glibc today. I agree with bumping at least the ppc64le requirements to match.
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S new file mode 100644 index 0000000000..cde68384d4 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S @@ -0,0 +1,276 @@ +/* Optimized strncpy implementation for PowerPC64/POWER9. + Copyright (C) 2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +# ifndef STRNCPY +# define FUNC_NAME strncpy +# else +# define FUNC_NAME STRNCPY +# endif + +/* Implements the function + + char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5]) + + The implementation can load bytes past a null terminator, but only + up to the next 16B boundary, so it never crosses a page. */ + +.machine power9 +ENTRY_TOCLESS (FUNC_NAME, 4) + CALL_MCOUNT 2 + + cmpwi r5, 0 + beqlr + /* NULL string optimisation */ + lbz r0,0(r4) + stb r0,0(r3) + addi r11,r3,1 + addi r5,r5,-1 + vspltisb v18,0 /* Zeroes in v18 */ + cmpwi r0,0 + beq L(zero_padding_loop) + + cmpwi r5,0 + beqlr + +L(cont): + addi r4,r4,1 + neg r7,r4 + rldicl r9,r7,0,60 /* How many bytes to get source 16B aligned? 
*/ + + /* Get source 16B aligned */ + lvx v0,0,r4 + lvsr v1,0,r4 + vperm v0,v18,v0,v1 + + vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */ + vctzlsbb r7,v6 /* Number of trailing zeroes */ + addi r8,r7,1 /* Add null terminator */ + + /* r8 = bytes including null + r9 = bytes to get source 16B aligned + if r8 > r9 + no null, copy r9 bytes + else + there is a null, copy r8 bytes and return. */ + cmpd r8,r9 + bgt L(no_null) + + cmpd r8,r5 /* r8 <= n? */ + ble L(null) + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ + stxvl 32+v0,r11,r10 /* Partial store */ + + blr + +L(null): + sldi r10,r8,56 /* stxvl wants size in top 8 bits */ + stxvl 32+v0,r11,r10 /* Partial store */ + + add r11,r11,r8 + sub r5,r5,r8 + b L(zero_padding_loop) + +L(no_null): + cmpd r9,r5 /* Check if length was reached. */ + bge L(n_tail1) + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ + stxvl 32+v0,r11,r10 /* Partial store */ + + add r4,r4,r9 + add r11,r11,r9 + sub r5,r5,r9 + +L(loop): + cmpldi cr6,r5,64 /* Check if length was reached. */ + ble cr6,L(final_loop) + + lxv 32+v0,0(r4) + vcmpequb. v6,v0,v18 /* Any zero bytes? */ + bne cr6,L(prep_tail1) + + lxv 32+v1,16(r4) + vcmpequb. v6,v1,v18 /* Any zero bytes? */ + bne cr6,L(prep_tail2) + + lxv 32+v2,32(r4) + vcmpequb. v6,v2,v18 /* Any zero bytes? */ + bne cr6,L(prep_tail3) + + lxv 32+v3,48(r4) + vcmpequb. v6,v3,v18 /* Any zero bytes? */ + bne cr6,L(prep_tail4) + + stxv 32+v0,0(r11) + stxv 32+v1,16(r11) + stxv 32+v2,32(r11) + stxv 32+v3,48(r11) + + addi r4,r4,64 + addi r11,r11,64 + addi r5,r5,-64 + + b L(loop) + +L(final_loop): + cmpldi cr5,r5,16 + lxv 32+v0,0(r4) + vcmpequb. v6,v0,v18 /* Any zero bytes? */ + ble cr5,L(prep_n_tail1) + bne cr6,L(count_tail1) + addi r5,r5,-16 + + cmpldi cr5,r5,16 + lxv 32+v1,16(r4) + vcmpequb. v6,v1,v18 /* Any zero bytes? */ + ble cr5,L(prep_n_tail2) + bne cr6,L(count_tail2) + addi r5,r5,-16 + + cmpldi cr5,r5,16 + lxv 32+v2,32(r4) + vcmpequb. v6,v2,v18 /* Any zero bytes? 
*/ + ble cr5,L(prep_n_tail3) + bne cr6,L(count_tail3) + addi r5,r5,-16 + + lxv 32+v3,48(r4) + vcmpequb. v6,v3,v18 /* Any zero bytes? */ + beq cr6,L(n_tail4) + + vctzlsbb r8,v6 /* Number of trailing zeroes */ + cmpd r8,r5 /* r8 < n? */ + blt L(tail4) +L(n_tail4): + stxv 32+v0,0(r11) + stxv 32+v1,16(r11) + stxv 32+v2,32(r11) + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ + addi r11,r11,48 /* Offset */ + stxvl 32+v3,r11,r10 /* Partial store */ + blr + +L(prep_n_tail1): + beq cr6,L(n_tail1) /* Any zero bytes? */ + vctzlsbb r8,v6 /* Number of trailing zeroes */ + cmpd r8,r5 /* r8 < n? */ + blt L(tail1) +L(n_tail1): + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ + stxvl 32+v0,r11,r10 /* Partial store */ + blr + +L(prep_n_tail2): + beq cr6,L(n_tail2) /* Any zero bytes? */ + vctzlsbb r8,v6 /* Number of trailing zeroes */ + cmpd r8,r5 /* r8 < n? */ + blt L(tail2) +L(n_tail2): + stxv 32+v0,0(r11) + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ + addi r11,r11,16 /* offset */ + stxvl 32+v1,r11,r10 /* Partial store */ + blr + +L(prep_n_tail3): + beq cr6,L(n_tail3) /* Any zero bytes? */ + vctzlsbb r8,v6 /* Number of trailing zeroes */ + cmpd r8,r5 /* r8 < n? 
*/ + blt L(tail3) +L(n_tail3): + stxv 32+v0,0(r11) + stxv 32+v1,16(r11) + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ + addi r11,r11,32 /* Offset */ + stxvl 32+v2,r11,r10 /* Partial store */ + blr + +L(prep_tail1): +L(count_tail1): + vctzlsbb r8,v6 /* Number of trailing zeroes */ +L(tail1): + addi r9,r8,1 /* Add null terminator */ + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ + stxvl 32+v0,r11,r10 /* Partial store */ + add r11,r11,r9 + sub r5,r5,r9 + b L(zero_padding_loop) + +L(prep_tail2): + addi r5,r5,-16 +L(count_tail2): + vctzlsbb r8,v6 /* Number of trailing zeroes */ +L(tail2): + addi r9,r8,1 /* Add null terminator */ + stxv 32+v0,0(r11) + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ + addi r11,r11,16 /* offset */ + stxvl 32+v1,r11,r10 /* Partial store */ + add r11,r11,r9 + sub r5,r5,r9 + b L(zero_padding_loop) + +L(prep_tail3): + addi r5,r5,-32 +L(count_tail3): + vctzlsbb r8,v6 /* Number of trailing zeroes */ +L(tail3): + addi r9,r8,1 /* Add null terminator */ + stxv 32+v0,0(r11) + stxv 32+v1,16(r11) + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ + addi r11,r11,32 /* offset */ + stxvl 32+v2,r11,r10 /* Partial store */ + add r11,r11,r9 + sub r5,r5,r9 + b L(zero_padding_loop) + +L(prep_tail4): + addi r5,r5,-48 + vctzlsbb r8,v6 /* Number of trailing zeroes */ +L(tail4): + addi r9,r8,1 /* Add null terminator */ + stxv 32+v0,0(r11) + stxv 32+v1,16(r11) + stxv 32+v2,32(r11) + sldi r10,r9,56 /* stxvl wants size in top 8 bits */ + addi r11,r11,48 /* offset */ + stxvl 32+v3,r11,r10 /* Partial store */ + add r11,r11,r9 + sub r5,r5,r9 + +/* This code pads the remainder of dest with NULL bytes. */ +L(zero_padding_loop): + cmpldi cr6,r5,16 /* Check if length was reached. 
*/ + ble cr6,L(zero_padding_end) + + stxv v18,0(r11) + addi r11,r11,16 + addi r5,r5,-16 + + b L(zero_padding_loop) + +L(zero_padding_end): + sldi r10,r5,56 /* stxvl wants size in top 8 bits */ + stxvl v18,r11,r10 /* Partial store */ + blr + +L(n_tail): + +END (FUNC_NAME) diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index 19acb6c64a..cd2b47b403 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ ifneq (,$(filter %le,$(config-machine))) sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \ - rawmemchr-power9 strlen-power9 + rawmemchr-power9 strlen-power9 strncpy-power9 endif CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index ea10b00417..aa63e1c23f 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -301,6 +301,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c. */ IFUNC_IMPL (i, name, strncpy, +#ifdef __LITTLE_ENDIAN__ + IFUNC_IMPL_ADD (array, i, strncpy, + hwcap2 & PPC_FEATURE2_ARCH_3_00, + __strncpy_power9) +#endif IFUNC_IMPL_ADD (array, i, strncpy, hwcap2 & PPC_FEATURE2_ARCH_2_07, __strncpy_power8) diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S new file mode 100644 index 0000000000..b9b6092f7b --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S @@ -0,0 +1,26 @@ +/* Optimized strncpy implementation for POWER9/PPC64. + Copyright (C) 2020 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#if defined __LITTLE_ENDIAN__ && IS_IN (libc) +#define STRNCPY __strncpy_power9 + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) + +#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S> +#endif diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c index 7bacf28aca..822ceb2003 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c @@ -28,11 +28,18 @@ extern __typeof (strncpy) __strncpy_ppc attribute_hidden; extern __typeof (strncpy) __strncpy_power7 attribute_hidden; extern __typeof (strncpy) __strncpy_power8 attribute_hidden; +# ifdef __LITTLE_ENDIAN__ +extern __typeof (strncpy) __strncpy_power9 attribute_hidden; +# endif # undef strncpy /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle ifunc symbol properly. */ libc_ifunc_redirected (__redirect_strncpy, strncpy, +# ifdef __LITTLE_ENDIAN__ + (hwcap2 & PPC_FEATURE2_ARCH_3_00) + ? __strncpy_power9 : +# endif (hwcap2 & PPC_FEATURE2_ARCH_2_07) ? __strncpy_power8 : (hwcap & PPC_FEATURE_HAS_VSX)