Message ID | 20200929152103.18564-2-rzinsly@linux.ibm.com |
---|---|
State | New |
Headers | show |
Series | [v3,1/2] powerpc: Add optimized strncpy for POWER9 | expand |
generic_stpncpy __stpncpy_power9 __stpncpy_power8 __stpncpy_power7 __stpncpy_ppc Length 16, n 16, alignment 1/ 1: 7.04141 2.66905 2.71071 5.33257 9.45193 Length 16, n 16, alignment 1/ 1: 7.01728 2.54349 2.70763 5.35555 9.40601 Length 16, n 16, alignment 1/ 2: 6.76331 2.56894 2.70649 5.28715 9.19534 Length 16, n 16, alignment 2/ 1: 6.41285 2.52953 2.86392 5.25868 9.24343 Length 2, n 4, alignment 7/ 2: 7.76627 2.36037 4.34749 4.05757 8.45648 Length 4, n 2, alignment 2/ 7: 6.15257 1.734 2.66932 2.81884 6.61486 Length 2, n 4, alignment 7/ 2: 7.69004 2.34779 3.90224 4.08693 8.51617 Length 4, n 2, alignment 2/ 7: 6.14888 1.73738 2.66929 2.81777 6.39066 Length 16, n 16, alignment 2/ 2: 7.25765 2.5434 2.8759 4.7084 9.43171 Length 16, n 16, alignment 2/ 2: 6.41274 2.52681 2.87939 5.2894 9.2505 Length 16, n 16, alignment 2/ 4: 6.74797 2.6683 2.82869 5.27608 9.43391 Length 16, n 16, alignment 4/ 2: 7.6281 2.54368 3.52982 5.26862 8.7369 Length 4, n 8, alignment 6/ 4: 7.79233 2.33099 5.64785 4.21131 9.03 Length 8, n 4, alignment 4/ 6: 6.01824 1.73782 2.81779 2.81777 7.90004 Length 4, n 8, alignment 6/ 4: 7.94851 2.33098 4.90456 3.75698 8.89379 Length 8, n 4, alignment 4/ 6: 6.0183 1.73715 2.81777 2.41521 7.83867 Length 16, n 16, alignment 3/ 3: 6.93178 2.66854 3.22004 5.31673 9.09542 Length 16, n 16, alignment 3/ 3: 6.99998 2.67084 3.22862 5.48294 9.2366 Length 16, n 16, alignment 3/ 6: 7.14689 2.6615 3.21888 5.25964 9.1277 Length 16, n 16, alignment 6/ 3: 6.46654 2.65885 4.57873 5.25391 7.75507 Length 8, n 16, alignment 5/ 6: 7.37286 2.33316 3.92971 4.50331 10.1496 Length 16, n 8, alignment 6/ 5: 5.73663 1.87991 2.633 4.09291 5.91732 Length 8, n 16, alignment 5/ 6: 7.77512 2.33361 3.67636 4.50091 10.147 Length 16, n 8, alignment 6/ 5: 5.73662 1.88001 2.57119 4.10496 6.15016 Length 16, n 16, alignment 4/ 4: 7.55115 2.65827 3.5838 5.25628 8.81586 Length 16, n 16, alignment 4/ 4: 7.61232 2.66851 3.62508 5.32044 8.73914 Length 16, n 16, alignment 4/ 0: 7.54588 2.54345 3.48987 
5.27812 8.77989 Length 16, n 16, alignment 0/ 4: 6.82387 1.88425 2.41569 5.27746 7.19847 Length 16, n 32, alignment 4/ 0: 10.1135 3.10868 6.01894 6.66693 11.7681 Length 32, n 16, alignment 0/ 4: 6.93527 1.8793 2.4162 5.29155 6.50752 Length 16, n 32, alignment 4/ 0: 10.1565 3.16134 5.78062 6.81425 11.2226 Length 32, n 16, alignment 0/ 4: 6.76758 1.87928 2.41649 5.30161 7.22291 Length 16, n 16, alignment 5/ 5: 7.22753 2.56593 4.22659 5.30415 9.86703 Length 16, n 16, alignment 5/ 5: 6.76256 2.54348 4.23108 5.43866 9.53557 Length 16, n 16, alignment 5/ 2: 7.23702 2.52833 4.23011 5.26711 9.52126 Length 16, n 16, alignment 2/ 5: 6.68084 2.66311 2.84314 5.2709 9.24495 Length 32, n 64, alignment 3/ 2: 12.4989 3.84198 6.40671 10.4545 14.317 Length 64, n 32, alignment 2/ 3: 10.1464 2.78457 3.17933 7.67569 12.4356 Length 32, n 64, alignment 3/ 2: 12.4991 3.83968 7.08471 10.451 15.8984 Length 64, n 32, alignment 2/ 3: 9.61285 2.78401 3.18834 7.66606 13.9602 Length 16, n 16, alignment 6/ 6: 7.24557 2.66839 4.55951 5.25563 7.7369 Length 16, n 16, alignment 6/ 6: 6.76327 2.65836 4.5127 5.63264 7.80333 Length 16, n 16, alignment 6/ 4: 7.15127 2.54397 4.57355 5.32957 7.51005 Length 16, n 16, alignment 4/ 6: 7.51733 2.5615 3.67299 5.31244 8.73893 Length 64, n 128, alignment 2/ 4: 14.0745 4.98021 7.33878 11.4384 17.1572 Length 128, n 64, alignment 4/ 2: 11.7179 3.59088 4.89414 10.2021 11.6637 Length 64, n 128, alignment 2/ 4: 14.0841 4.99105 7.28507 11.4365 21.7537 Length 128, n 64, alignment 4/ 2: 11.7142 3.59211 4.83864 9.87632 19.4664 Length 16, n 16, alignment 7/ 7: 7.12738 2.53533 5.62213 5.30017 7.90888 Length 16, n 16, alignment 7/ 7: 6.82635 2.53529 5.60694 5.27111 8.88482 Length 16, n 16, alignment 7/ 6: 6.9193 2.54376 5.48117 5.24785 8.04263 Length 16, n 16, alignment 6/ 7: 6.89261 2.55078 4.51003 5.32471 7.81768 Length 128, n 256, alignment 1/ 6: 16.2686 7.68983 9.35727 16.2843 19.8458 Length 256, n 128, alignment 6/ 1: 13.4356 4.94899 7.94404 15.0122 15.0231 Length 128, n 
256, alignment 1/ 6: 16.2511 7.69025 9.35528 16.2859 37.8453 Length 256, n 128, alignment 6/ 1: 13.4332 4.94446 8.02757 12.2879 34.1949 Length 8, n 16, alignment 0/ 0: 7.26102 2.33285 3.75702 3.85762 7.72869 Length 32, n 16, alignment 0/ 0: 7.049 1.88689 2.42187 2.41537 6.58192 Length 8, n 16, alignment 7/ 2: 8.09344 2.31269 3.67403 4.31612 8.21018 Length 32, n 16, alignment 7/ 2: 6.822 2.45733 5.59593 5.33252 6.53496 Length 16, n 32, alignment 0/ 0: 9.99648 3.36432 4.70547 4.55746 10.148 Length 64, n 32, alignment 0/ 0: 7.89408 2.4309 2.58854 2.70519 8.89171 Length 16, n 32, alignment 6/ 4: 9.31969 3.15547 7.24937 9.47362 10.0091 Length 64, n 32, alignment 6/ 4: 9.91687 2.78234 4.64259 7.00062 10.5972 Length 32, n 64, alignment 0/ 0: 11.0651 3.81484 4.4379 4.91663 11.8363 Length 128, n 64, alignment 0/ 0: 9.25821 3.20129 3.55296 4.22664 9.63556 Length 32, n 64, alignment 5/ 6: 12.5097 3.83422 7.29892 9.09849 13.2517 Length 128, n 64, alignment 5/ 6: 11.6165 3.60246 5.35542 8.90704 13.3207 Length 64, n 128, alignment 0/ 0: 12.372 4.91681 5.41951 6.91629 15.0813 Length 256, n 128, alignment 0/ 0: 7.93075 4.5247 6.29502 5.58357 12.5963 Length 64, n 128, alignment 4/ 0: 12.569 5.00092 7.25225 10.4764 15.9366 Length 256, n 128, alignment 4/ 0: 12.2963 4.90654 7.57109 12.0953 16.7672 Length 128, n 256, alignment 0/ 0: 13.9015 7.34814 7.88738 9.15353 19.4141 Length 512, n 256, alignment 0/ 0: 10.6865 6.52749 9.15011 9.71701 20.9021 Length 128, n 256, alignment 3/ 2: 16.3681 7.53318 9.89911 18.5309 20.8335 Length 512, n 256, alignment 3/ 2: 17.0249 7.10063 10.1568 22.6063 25.1262 Length 256, n 512, alignment 0/ 0: 16.5169 12.3406 13.6056 14.5875 29.2826 Length 1024, n 512, alignment 0/ 0: 16.3619 10.8422 16.7061 17.1025 37.7908 Length 256, n 512, alignment 2/ 4: 21.162 12.9621 14.3306 26.0856 30.0397 Length 1024, n 512, alignment 2/ 4: 25.5543 11.9978 17.7424 42.4293 47.7581 Length 512, n 1024, alignment 0/ 0: 20.5504 17.3132 19.5751 21.3633 42.7215 Length 2048, n 1024, 
alignment 0/ 0: 28.5197 19.3708 37.1801 35.3122 67.9792 Length 512, n 1024, alignment 1/ 6: 29.9875 17.7823 22.3228 47.3516 51.3697 Length 2048, n 1024, alignment 1/ 6: 42.9443 21.6004 38.7767 78.1732 83.9784
On 29/09/2020 12:21, Raphael Moreira Zinsly via Libc-alpha wrote:
> Add stpncpy support into the POWER9 strncpy.
The benchmark numbers you provided [1] seem to show it is slightly worse than
the generic_strncpy which uses the same strategy as string/strncpy.c
(which would use VSX instruction through memset/memcpy). Did you compare this
optimization against an implementation that just call power8/9 memset/memcpy
instead?
It should result in a smaller implementation which reduces i-cache size and
the code is much more simpler and maintainable. The same applies for stpncpy.
I tried to dissuade Intel developers from such micro-optimizations, which are not
really a real gain and instead we should optimize only a handful of string
operations (memcpy/memset/etc.) and use composable implementation instead
(as generic strncpy). It still resulted in 1a153e47fcc, but I think we
might do better for powerpc.
[1] https://sourceware.org/pipermail/libc-alpha/2020-September/118049.html
Hi Adhemerval, On 30/09/2020 10:42, Adhemerval Zanella wrote: > > > On 29/09/2020 12:21, Raphael Moreira Zinsly via Libc-alpha wrote: >> Add stpncpy support into the POWER9 strncpy. > > The benchmark numbers you provided [1] seems to show it is slight worse than > the generic_strncpy which uses the same strategy as string/strncpy.c > (which would use VSX instruction through memset/memcpy). My implementation is always better than the generic_strncpy, almost three times better in average. And it calls memset as well. Are you talking about __strncpy_ppc? For some reason it is using strnlen_ppc instead of the strnlen_power8, but I didn't touch it. > Did you compare this > optimization against an implementation that just call power8/9 memset/memcpy > instead? > Not sure if I understand, isn't that generic_strncpy and strncpy_ppc? > It should resulting a smaller implementation which reduces i-cache size and > the code is much more simpler and maintainable. The same applies for stpncpy. > > I tried to dissuade Intel developers that such micro-optimization are not > really a real gain and instead we should optimize only a handful of string > operations (memcpy/memset/etc.) and use composable implementation instead > (as generic strncpy). It still resulted on 1a153e47fcc, but I think we > might do better for powerpc. > > [1] https://sourceware.org/pipermail/libc-alpha/2020-September/118049.html > Best Regards,
On 30/09/2020 11:21, Raphael M Zinsly wrote: > Hi Adhemerval, > > On 30/09/2020 10:42, Adhemerval Zanella wrote: >> >> >> On 29/09/2020 12:21, Raphael Moreira Zinsly via Libc-alpha wrote: >>> Add stpncpy support into the POWER9 strncpy. >> >> The benchmark numbers you provided [1] seems to show it is slight worse than >> the generic_strncpy which uses the same strategy as string/strncpy.c >> (which would use VSX instruction through memset/memcpy). > > My implementation is always better than the generic_strncpy, almost three times better in average. And it calls memset as well. > > Are you talking about __strncpy_ppc? For some reason it is using strnlen_ppc instead of the strnlen_power8, but I didn't touch it. > >> Did you compare this >> optimization against an implementation that just call power8/9 memset/memcpy >> instead? >> > > Not sure if I understand, isn't that generic_strncpy and strncpy_ppc? Right, I misread the benchmark. And I tested my own suggestion on the power9 from gcc farm and it seems that although it is slight faster than power7 variant it does not really beat power8 (as expected since it calls strnlen and then memcpy/memset and access the input twice). I do not really oppose it and it is up to the arch maintainer, but I still think these micro-optimizations tends to just add extra maintainability and icache pressure where the microbenchmark does not really catch. > > >> It should resulting a smaller implementation which reduces i-cache size and >> the code is much more simpler and maintainable. The same applies for stpncpy. >> >> I tried to dissuade Intel developers that such micro-optimization are not >> really a real gain and instead we should optimize only a handful of string >> operations (memcpy/memset/etc.) and use composable implementation instead >> (as generic strncpy). It still resulted on 1a153e47fcc, but I think we >> might do better for powerpc. 
>> >> [1] https://sourceware.org/pipermail/libc-alpha/2020-September/118049.html >> > > Best Regards,
Raphael Moreira Zinsly <rzinsly@linux.ibm.com> writes: > Add stpncpy support into the POWER9 strncpy. Same reminder for Reviewed-by. > +#define MEMSET __memset_power8 > +#ifdef SHARED > +#define MEMSET_is_local Wrong indentation here. Fixed. Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com> Pushed as 7beee7b39ade. Thanks!
diff --git a/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S new file mode 100644 index 0000000000..81d9673d8b --- /dev/null +++ b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S @@ -0,0 +1,24 @@ +/* Optimized stpncpy implementation for POWER9 LE. + Copyright (C) 2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#define USE_AS_STPNCPY +#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S> + +weak_alias (__stpncpy, stpncpy) +libc_hidden_def (__stpncpy) +libc_hidden_builtin_def (stpncpy) diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S index 67cb648c65..b7d308c984 100644 --- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S @@ -18,11 +18,19 @@ #include <sysdep.h> +#ifdef USE_AS_STPNCPY +# ifndef STPNCPY +# define FUNC_NAME __stpncpy +# else +# define FUNC_NAME STPNCPY +# endif +#else # ifndef STRNCPY # define FUNC_NAME strncpy # else # define FUNC_NAME STRNCPY # endif +#endif /* !USE_AS_STPNCPY */ #ifndef MEMSET /* For builds without IFUNC support, local calls should be made to internal @@ -41,6 +49,12 @@ char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5]) + or + + char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5]) + + if USE_AS_STPNCPY is defined. + The implementation can load bytes past a null terminator, but only up to the next 16-byte aligned address, so it never crosses a page. */ @@ -66,7 +80,15 @@ ENTRY (FUNC_NAME, 4) /* Empty/1-byte string optimization */ cmpdi r5,0 +#ifdef USE_AS_STPNCPY + bgt L(cont) + /* Compute pointer to last byte copied into dest. */ + addi r3,r3,1 + blr +L(cont): +#else beqlr +#endif addi r4,r4,1 neg r7,r4 @@ -96,12 +118,20 @@ ENTRY (FUNC_NAME, 4) sldi r10,r5,56 /* stxvl wants size in top 8 bits */ stxvl 32+v0,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* Compute pointer to last byte copied into dest. */ + add r3,r11,r5 +#endif blr L(null): sldi r10,r8,56 /* stxvl wants size in top 8 bits */ stxvl 32+v0,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* Compute pointer to last byte copied into dest. 
*/ + add r3,r11,r7 +#endif add r11,r11,r8 sub r5,r5,r8 b L(zero_padding) @@ -185,6 +215,10 @@ L(n_tail4): sldi r10,r5,56 /* stxvl wants size in top 8 bits */ addi r11,r11,48 /* Offset */ stxvl 32+v3,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* Compute pointer to last byte copied into dest. */ + add r3,r11,r5 +#endif blr L(prep_n_tail1): @@ -196,6 +230,10 @@ L(prep_n_tail1): L(n_tail1): sldi r10,r5,56 /* stxvl wants size in top 8 bits */ stxvl 32+v0,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* Compute pointer to last byte copied into dest. */ + add r3,r11,r5 +#endif blr L(prep_n_tail2): @@ -209,6 +247,10 @@ L(n_tail2): sldi r10,r5,56 /* stxvl wants size in top 8 bits */ addi r11,r11,16 /* offset */ stxvl 32+v1,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* Compute pointer to last byte copied into dest. */ + add r3,r11,r5 +#endif blr L(prep_n_tail3): @@ -223,6 +265,10 @@ L(n_tail3): sldi r10,r5,56 /* stxvl wants size in top 8 bits */ addi r11,r11,32 /* Offset */ stxvl 32+v2,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* Compute pointer to last byte copied into dest. */ + add r3,r11,r5 +#endif blr L(prep_tail1): @@ -232,6 +278,10 @@ L(tail1): addi r9,r8,1 /* Add null terminator */ sldi r10,r9,56 /* stxvl wants size in top 8 bits */ stxvl 32+v0,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* Compute pointer to last byte copied into dest. */ + add r3,r11,r8 +#endif add r11,r11,r9 sub r5,r5,r9 b L(zero_padding) @@ -246,6 +296,10 @@ L(tail2): sldi r10,r9,56 /* stxvl wants size in top 8 bits */ addi r11,r11,16 /* offset */ stxvl 32+v1,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* Compute pointer to last byte copied into dest. 
*/ + add r3,r11,r8 +#endif add r11,r11,r9 sub r5,r5,r9 b L(zero_padding) @@ -261,6 +315,10 @@ L(tail3): sldi r10,r9,56 /* stxvl wants size in top 8 bits */ addi r11,r11,32 /* offset */ stxvl 32+v2,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* Compute pointer to last byte copied into dest. */ + add r3,r11,r8 +#endif add r11,r11,r9 sub r5,r5,r9 b L(zero_padding) @@ -276,6 +334,10 @@ L(tail4): sldi r10,r9,56 /* stxvl wants size in top 8 bits */ addi r11,r11,48 /* offset */ stxvl 32+v3,r11,r10 /* Partial store */ +#ifdef USE_AS_STPNCPY + /* Compute pointer to last byte copied into dest. */ + add r3,r11,r8 +#endif add r11,r11,r9 sub r5,r5,r9 @@ -330,7 +392,8 @@ L(zero_padding_memset): ld r0,FRAMESIZE+16(r1) mr r3,r30 /* Restore the return value of strncpy, i.e.: - dest. */ + dest. For stpncpy, the return value is the + same as return value of memset. */ ld r30,FRAMESIZE-8(r1) /* Restore r30. */ /* Restore the stack frame. */ addi r1,r1,FRAMESIZE @@ -341,3 +404,6 @@ L(zero_padding_memset): blr END (FUNC_NAME) +#ifndef USE_AS_STPNCPY +libc_hidden_builtin_def (strncpy) +#endif diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index cd2b47b403..f46bf50732 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ ifneq (,$(filter %le,$(config-machine))) sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \ - rawmemchr-power9 strlen-power9 strncpy-power9 + rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 endif CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index fb55b07e53..d0f20cc97f 100644 --- 
a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -318,6 +318,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c. */ IFUNC_IMPL (i, name, stpncpy, +#ifdef __LITTLE_ENDIAN__ + IFUNC_IMPL_ADD (array, i, stpncpy, + (hwcap2 & PPC_FEATURE2_ARCH_3_00) + && (hwcap & PPC_FEATURE_HAS_VSX), + __stpncpy_power9) +#endif IFUNC_IMPL_ADD (array, i, stpncpy, hwcap2 & PPC_FEATURE2_ARCH_2_07, __stpncpy_power8) diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S new file mode 100644 index 0000000000..55daa3455f --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S @@ -0,0 +1,29 @@ +/* Optimized stpncpy implementation for POWER9 LE. + Copyright (C) 2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#define STPNCPY __stpncpy_power9 + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) + +#define MEMSET __memset_power8 +#ifdef SHARED +#define MEMSET_is_local +#endif + +#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S> diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c index 17df886431..3758f29ad1 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c @@ -26,10 +26,18 @@ extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden; extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden; extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden; +# ifdef __LITTLE_ENDIAN__ +extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden; +# endif # undef stpncpy # undef __stpncpy libc_ifunc_redirected (__redirect___stpncpy, __stpncpy, +# ifdef __LITTLE_ENDIAN__ + (hwcap2 & PPC_FEATURE2_ARCH_3_00) && + (hwcap & PPC_FEATURE_HAS_VSX) + ? __stpncpy_power9 : +# endif (hwcap2 & PPC_FEATURE2_ARCH_2_07) ? __stpncpy_power8 : (hwcap & PPC_FEATURE_HAS_VSX)