Message ID | 20210512142717.225429-2-msc@linux.ibm.com |
---|---|
State | New |
Headers | show |
Series | [1/2] benchtests: Use JSON for bench-rawmemchr output | expand |
Hi Matheus, adding the commentary bellow the patch LGTM, thanks! On 12/05/2021 11:27, Matheus Castanho via Libc-alpha wrote: > ... > @@ -50,19 +90,13 @@ > li r6,offset; \ > LXVP(v4+32,offset,addr); \ > LXVP(v6+32,offset+32,addr); \ > + RAWMEMCHR_SUBTRACT_VECTORS; \ > vminub v14,v4,v5; \ > vminub v15,v6,v7; \ > vminub v16,v14,v15; \ > - vcmpequb. v0,v16,v18; \ > + vcmpequb. v0,v16,VREG_ZERO; \ > bne cr6,L(label) > > -#define TAIL(vreg,increment) \ > - vctzlsbb r4,vreg; \ > - subf r3,r3,r5; \ > - addi r4,r4,increment; \ > - add r3,r3,r4; \ > - blr > - > /* Implements the function > > int [r3] strlen (const void *s [r3]) Missing the rawmemchar "definition" here. Regards,
LGTM. Tests pass on POWER10. Reviewed-by: Lucas A. M. Magalhaes <lamm@linux.ibm.com> Quoting Matheus Castanho via Libc-alpha (2021-05-12 11:27:17) > Reuse code for optimized strlen to implement a faster version of rawmemchr. > This takes advantage of the same benefits provided by the strlen implementation, > but needs some extra steps. The functionality and performance of __strlen_power10 > is unchanged after this commit. > > rawmemchr returns a pointer to the char found, while strlen returns only the > length, so we have to take that into account when preparing the return value. > > To quickly check 64B, the loop on __strlen_power10 merges the whole block into > 16B by using unsigned minimum vector operations (vminub) and checks if there are > any \0 on the resulting vector. The same code is used by rawmemchr if the char c > is 0. However, this approach does not work when c != 0. We first need to > subtract each byte by c, so that the value we are looking for is converted to a > 0, then taking the minimum and checking for nulls works again. > > The new code branches after it has compared ~256 bytes and chooses which of the > two strategies above will be used in the main loop, based on the char c. This > extra branch adds some overhead (~5%) for length ~256, but is quickly amortized > by the faster loop for larger sizes. > > Compared to __rawmemchr_power9, this version is ~20% faster for length < 256. > Because of the optimized main loop, the improvement becomes ~35% for c != 0 > and ~50% for c = 0 for strings longer than 256. > --- > .../powerpc/powerpc64/le/power10/rawmemchr.S | 22 +++ > sysdeps/powerpc/powerpc64/le/power10/strlen.S | 156 +++++++++++++++--- > sysdeps/powerpc/powerpc64/multiarch/Makefile | 4 +- > .../powerpc64/multiarch/ifunc-impl-list.c | 4 + > .../powerpc64/multiarch/rawmemchr-power10.S | 21 +++ > .../powerpc/powerpc64/multiarch/rawmemchr.c | 4 + > 6 files changed, 184 insertions(+), 27 deletions(-) > create mode 100644 sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S > create mode 100644 sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S > > diff --git a/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S b/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S > new file mode 100644 > index 0000000000..5351c2634f > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S > @@ -0,0 +1,22 @@ > +/* Optimized rawmemchr implementation for POWER10 LE. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <sysdep.h> > + > +#define USE_AS_RAWMEMCHR 1 > +#include <sysdeps/powerpc/powerpc64/le/power10/strlen.S> > diff --git a/sysdeps/powerpc/powerpc64/le/power10/strlen.S b/sysdeps/powerpc/powerpc64/le/power10/strlen.S > index ca7e9eb3d8..18222ebeb8 100644 > --- a/sysdeps/powerpc/powerpc64/le/power10/strlen.S > +++ b/sysdeps/powerpc/powerpc64/le/power10/strlen.S > @@ -18,10 +18,50 @@ > > #include <sysdep.h> > > -#ifndef STRLEN > -# define STRLEN __strlen > -# define DEFINE_STRLEN_HIDDEN_DEF 1 > -#endif > +/* To reuse the code for rawmemchr, we have some extra steps compared to the > + strlen implementation: > + - Sum the initial value of r3 with the position at which the char was > + found, to guarantee we return a pointer and not the length. > + - In the main loop, subtract each byte by the char we are looking for, > + so we can keep using vminub to quickly check 64B at once. */ > +#ifdef USE_AS_RAWMEMCHR > +# ifndef RAWMEMCHR > +# define FUNCNAME __rawmemchr > +# else > +# define FUNCNAME RAWMEMCHR > +# endif > +# define MCOUNT_NARGS 2 > +# define VREG_ZERO v20 > +# define OFF_START_LOOP 256 > +# define RAWMEMCHR_SUBTRACT_VECTORS \ > + vsububm v4,v4,v18; \ > + vsububm v5,v5,v18; \ > + vsububm v6,v6,v18; \ > + vsububm v7,v7,v18; > +# define TAIL(vreg,increment) \ > + vctzlsbb r4,vreg; \ > + addi r4,r4,increment; \ > + add r3,r5,r4; \ > + blr > + > +#else /* strlen */ > + > +# ifndef STRLEN > +# define FUNCNAME __strlen > +# define DEFINE_STRLEN_HIDDEN_DEF 1 > +# else > +# define FUNCNAME STRLEN > +# endif > +# define MCOUNT_NARGS 1 > +# define VREG_ZERO v18 > +# define OFF_START_LOOP 192 > +# define TAIL(vreg,increment) \ > + vctzlsbb r4,vreg; \ > + subf r3,r3,r5; \ > + addi r4,r4,increment; \ > + add r3,r3,r4; \ > + blr > +#endif /* USE_AS_RAWMEMCHR */ > OK. Separate tail code, add subtraction for rawmemchr and loop values. > /* TODO: Replace macros by the actual instructions when minimum binutils becomes > >= 2.35. This is used to keep compatibility with older versions. */ > @@ -50,19 +90,13 @@ > li r6,offset; \ > LXVP(v4+32,offset,addr); \ > LXVP(v6+32,offset+32,addr); \ > + RAWMEMCHR_SUBTRACT_VECTORS; \ Ok. Add subtractions for rawmemchr. > vminub v14,v4,v5; \ > vminub v15,v6,v7; \ > vminub v16,v14,v15; \ > - vcmpequb. v0,v16,v18; \ > + vcmpequb. v0,v16,VREG_ZERO; \ Ok. Use specific register. > bne cr6,L(label) > > -#define TAIL(vreg,increment) \ > - vctzlsbb r4,vreg; \ > - subf r3,r3,r5; \ > - addi r4,r4,increment; \ > - add r3,r3,r4; \ > - blr > - Ok. Remove old tail code. > /* Implements the function > > int [r3] strlen (const void *s [r3]) > @@ -72,11 +106,21 @@ > > .machine power9 > > -ENTRY_TOCLESS (STRLEN, 4) > - CALL_MCOUNT 1 > +ENTRY_TOCLESS (FUNCNAME, 4) > + CALL_MCOUNT MCOUNT_NARGS > + > +#ifdef USE_AS_RAWMEMCHR > + xori r5,r4,0xff > > - vspltisb v18,0 > + mtvsrd v18+32,r4 /* matching char in v18 */ > + mtvsrd v19+32,r5 /* non matching char in v19 */ > + > + vspltb v18,v18,7 /* replicate */ > + vspltb v19,v19,7 /* replicate */ > +#else Ok. Load v18 and 19 with maching and non matching chars. > vspltisb v19,-1 > +#endif > + vspltisb VREG_ZERO,0 > > /* Next 16B-aligned address. Prepare address for L(aligned). */ > addi r5,r3,16 > @@ -90,16 +134,25 @@ ENTRY_TOCLESS (STRLEN, 4) > vcmpequb. v6,v0,v18 > beq cr6,L(aligned) > > +#ifdef USE_AS_RAWMEMCHR > + vctzlsbb r6,v6 > + add r3,r3,r6 > +#else > vctzlsbb r3,v6 > +#endif Ok. Add address to return the pointer on r3. > blr > > - /* Test next 176B, 16B at a time. The main loop is optimized for longer > - strings, so checking the first bytes in 16B chunks benefits a lot > - small strings. */ > + /* Test up to OFF_START_LOOP-16 bytes in 16B chunks. The main loop is > + optimized for longer strings, so checking the first bytes in 16B > + chunks benefits a lot small strings. */ Ok. > .p2align 5 > L(aligned): > +#ifdef USE_AS_RAWMEMCHR > + cmpdi cr5,r4,0 /* Check if c == 0. This will be useful to > + choose how we will perform the main loop. */ > +#endif Ok. rawmemchr optimization if c is zero. > /* Prepare address for the loop. */ > - addi r4,r3,192 > + addi r4,r3,OFF_START_LOOP > clrrdi r4,r4,6 > > CHECK16(v0,0,r5,tail1) > @@ -113,15 +166,43 @@ L(aligned): > CHECK16(v8,128,r5,tail9) > CHECK16(v9,144,r5,tail10) > CHECK16(v10,160,r5,tail11) > +#ifdef USE_AS_RAWMEMCHR > + CHECK16(v0,176,r5,tail12) > + CHECK16(v1,192,r5,tail13) > + CHECK16(v2,208,r5,tail14) > + CHECK16(v3,224,r5,tail15) > +#endif Ok. More tails for the longer rawmemchr loop. > > addi r5,r4,128 > > +#ifdef USE_AS_RAWMEMCHR > + /* If c == 0, use the same loop as strlen, without the vsububm. */ > + beq cr5,L(loop) > + > + /* This is very similar to the block after L(loop), the difference is > + that here RAWMEMCHR_SUBTRACT_VECTORS is not empty, and we subtract > + each byte loaded by the char we are looking for, this way we can keep > + using vminub to merge the results and checking for nulls. */ > + .p2align 5 > +L(rawmemchr_loop): > + CHECK64(0,r4,pre_tail_64b) > + CHECK64(64,r4,pre_tail_64b) > + addi r4,r4,256 > + > + CHECK64(0,r5,tail_64b) > + CHECK64(64,r5,tail_64b) > + addi r5,r5,256 > + > + b L(rawmemchr_loop) > +#endif Ok. Optimization for c = 0. > /* Switch to a more aggressive approach checking 64B each time. Use 2 > pointers 128B apart and unroll the loop once to make the pointer > updates and usages separated enough to avoid stalls waiting for > address calculation. */ > .p2align 5 > L(loop): > +#undef RAWMEMCHR_SUBTRACT_VECTORS > +#define RAWMEMCHR_SUBTRACT_VECTORS /* nothing */ > CHECK64(0,r4,pre_tail_64b) > CHECK64(64,r4,pre_tail_64b) > addi r4,r4,256 > @@ -140,10 +221,10 @@ L(tail_64b): > block and mark it in its corresponding VR. lxvp vx,0(ry) puts the > low 16B bytes into vx+1, and the high into vx, so the order here is > v5, v4, v7, v6. */ > - vcmpequb v1,v5,v18 > - vcmpequb v2,v4,v18 > - vcmpequb v3,v7,v18 > - vcmpequb v4,v6,v18 > + vcmpequb v1,v5,VREG_ZERO > + vcmpequb v2,v4,VREG_ZERO > + vcmpequb v3,v7,VREG_ZERO > + vcmpequb v4,v6,VREG_ZERO Ok. > > /* Take into account the other 64B blocks we had already checked. */ > add r5,r5,r6 > @@ -165,7 +246,9 @@ L(tail_64b): > or r10,r8,r7 > > cnttzd r0,r10 /* Count trailing zeros before the match. */ > +#ifndef USE_AS_RAWMEMCHR > subf r5,r3,r5 > +#endif Ok. > add r3,r5,r0 /* Compute final length. */ > blr > > @@ -213,9 +296,32 @@ L(tail10): > L(tail11): > TAIL(v10,160) > > -END (STRLEN) > +#ifdef USE_AS_RAWMEMCHR > + .p2align 5 > +L(tail12): > + TAIL(v0,176) > + > + .p2align 5 > +L(tail13): > + TAIL(v1,192) > + > + .p2align 5 > +L(tail14): > + TAIL(v2,208) > + > + .p2align 5 > +L(tail15): > + TAIL(v3,224) > +#endif > + > +END (FUNCNAME) Ok. > > -#ifdef DEFINE_STRLEN_HIDDEN_DEF > +#ifdef USE_AS_RAWMEMCHR > +weak_alias (__rawmemchr,rawmemchr) > +libc_hidden_builtin_def (__rawmemchr) > +#else > +# ifdef DEFINE_STRLEN_HIDDEN_DEF > weak_alias (__strlen, strlen) > libc_hidden_builtin_def (strlen) > +# endif > #endif > diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile > index ea50b61674..e6e013db17 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile > +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile > @@ -33,9 +33,9 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ > > ifneq (,$(filter %le,$(config-machine))) > sysdep_routines += memcpy-power10 memmove-power10 memset-power10 \ > + rawmemchr-power9 rawmemchr-power10 \ > strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \ > - rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \ > - strlen-power10 > + strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10 > endif > CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops > CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops > diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > index b123c6a3d3..a92b67448e 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > @@ -257,6 +257,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > /* Support sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c. */ > IFUNC_IMPL (i, name, rawmemchr, > #ifdef __LITTLE_ENDIAN__ > + IFUNC_IMPL_ADD (array, i, rawmemchr, > + (hwcap2 & PPC_FEATURE2_ARCH_3_1) > + && (hwcap & PPC_FEATURE_HAS_VSX), > + __rawmemchr_power10) > IFUNC_IMPL_ADD (array, i, rawmemchr, > hwcap2 & PPC_FEATURE2_ARCH_3_00, > __rawmemchr_power9) > diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S > new file mode 100644 > index 0000000000..bf1ed7e194 > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S > @@ -0,0 +1,21 @@ > +/* Optimized rawmemchr implementation for PowerPC64/POWER10. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#define RAWMEMCHR __rawmemchr_power10 > + > +#include <sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S> > diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c > index d906a4ea50..c0ffea2b93 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c > +++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c > @@ -26,6 +26,7 @@ extern __typeof (__rawmemchr) __rawmemchr_ppc attribute_hidden; > extern __typeof (__rawmemchr) __rawmemchr_power7 attribute_hidden; > # ifdef __LITTLE_ENDIAN__ > extern __typeof (__rawmemchr) __rawmemchr_power9 attribute_hidden; > +extern __typeof (__rawmemchr) __rawmemchr_power10 attribute_hidden; > # endif > > # undef __rawmemchr > @@ -34,6 +35,9 @@ extern __typeof (__rawmemchr) __rawmemchr_power9 attribute_hidden; > ifunc symbol properly. */ > libc_ifunc_redirected (__redirect___rawmemchr, __rawmemchr, > # ifdef __LITTLE_ENDIAN__ > + (hwcap2 & PPC_FEATURE2_ARCH_3_1) > + && (hwcap & PPC_FEATURE_HAS_VSX) > + ? __rawmemchr_power10 : > (hwcap2 & PPC_FEATURE2_ARCH_3_00) > ? __rawmemchr_power9 : > # endif > -- > 2.31.1 >
Raphael M Zinsly <rzinsly@linux.ibm.com> writes: > Hi Matheus, adding the commentary bellow the patch LGTM, thanks! > > On 12/05/2021 11:27, Matheus Castanho via Libc-alpha wrote: >> ... >> @@ -50,19 +90,13 @@ >> li r6,offset; \ >> LXVP(v4+32,offset,addr); \ >> LXVP(v6+32,offset+32,addr); \ >> + RAWMEMCHR_SUBTRACT_VECTORS; \ >> vminub v14,v4,v5; \ >> vminub v15,v6,v7; \ >> vminub v16,v14,v15; \ >> - vcmpequb. v0,v16,v18; \ >> + vcmpequb. v0,v16,VREG_ZERO; \ >> bne cr6,L(label) >> -#define TAIL(vreg,increment) \ >> - vctzlsbb r4,vreg; \ >> - subf r3,r3,r5; \ >> - addi r4,r4,increment; \ >> - add r3,r3,r4; \ >> - blr >> - >> /* Implements the function >> int [r3] strlen (const void *s [r3]) > > Missing the rawmemchar "definition" here. > > > Regards, What about this? /* Implements the function int [r3] strlen (const void *s [r3]) but when USE_AS_RAWMEMCHR is set, implements the function void* [r3] rawmemchr (const void *s [r3], int c [r4]) The implementation can load bytes past a matching byte, but only up to the next 64B boundary, so it never crosses a page. */ -- Matheus Castanho
On 14/05/2021 10:30, Matheus Castanho wrote: > What about this? > > /* Implements the function > > int [r3] strlen (const void *s [r3]) > > but when USE_AS_RAWMEMCHR is set, implements the function > > void* [r3] rawmemchr (const void *s [r3], int c [r4]) > > The implementation can load bytes past a matching byte, but only > up to the next 64B boundary, so it never crosses a page. */ Seems good to me. Thanks,
(looks like my previous message didn't get through, trying again) Merged as 1a594aa986ffe28657a03baa5c53c0a0e7dc2ecd Thanks! -- Matheus Castanho
diff --git a/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S b/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S new file mode 100644 index 0000000000..5351c2634f --- /dev/null +++ b/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S @@ -0,0 +1,22 @@ +/* Optimized rawmemchr implementation for POWER10 LE. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#define USE_AS_RAWMEMCHR 1 +#include <sysdeps/powerpc/powerpc64/le/power10/strlen.S> diff --git a/sysdeps/powerpc/powerpc64/le/power10/strlen.S b/sysdeps/powerpc/powerpc64/le/power10/strlen.S index ca7e9eb3d8..18222ebeb8 100644 --- a/sysdeps/powerpc/powerpc64/le/power10/strlen.S +++ b/sysdeps/powerpc/powerpc64/le/power10/strlen.S @@ -18,10 +18,50 @@ #include <sysdep.h> -#ifndef STRLEN -# define STRLEN __strlen -# define DEFINE_STRLEN_HIDDEN_DEF 1 -#endif +/* To reuse the code for rawmemchr, we have some extra steps compared to the + strlen implementation: + - Sum the initial value of r3 with the position at which the char was + found, to guarantee we return a pointer and not the length. + - In the main loop, subtract each byte by the char we are looking for, + so we can keep using vminub to quickly check 64B at once. */ +#ifdef USE_AS_RAWMEMCHR +# ifndef RAWMEMCHR +# define FUNCNAME __rawmemchr +# else +# define FUNCNAME RAWMEMCHR +# endif +# define MCOUNT_NARGS 2 +# define VREG_ZERO v20 +# define OFF_START_LOOP 256 +# define RAWMEMCHR_SUBTRACT_VECTORS \ + vsububm v4,v4,v18; \ + vsububm v5,v5,v18; \ + vsububm v6,v6,v18; \ + vsububm v7,v7,v18; +# define TAIL(vreg,increment) \ + vctzlsbb r4,vreg; \ + addi r4,r4,increment; \ + add r3,r5,r4; \ + blr + +#else /* strlen */ + +# ifndef STRLEN +# define FUNCNAME __strlen +# define DEFINE_STRLEN_HIDDEN_DEF 1 +# else +# define FUNCNAME STRLEN +# endif +# define MCOUNT_NARGS 1 +# define VREG_ZERO v18 +# define OFF_START_LOOP 192 +# define TAIL(vreg,increment) \ + vctzlsbb r4,vreg; \ + subf r3,r3,r5; \ + addi r4,r4,increment; \ + add r3,r3,r4; \ + blr +#endif /* USE_AS_RAWMEMCHR */ /* TODO: Replace macros by the actual instructions when minimum binutils becomes >= 2.35. This is used to keep compatibility with older versions. */ @@ -50,19 +90,13 @@ li r6,offset; \ LXVP(v4+32,offset,addr); \ LXVP(v6+32,offset+32,addr); \ + RAWMEMCHR_SUBTRACT_VECTORS; \ vminub v14,v4,v5; \ vminub v15,v6,v7; \ vminub v16,v14,v15; \ - vcmpequb. v0,v16,v18; \ + vcmpequb. v0,v16,VREG_ZERO; \ bne cr6,L(label) -#define TAIL(vreg,increment) \ - vctzlsbb r4,vreg; \ - subf r3,r3,r5; \ - addi r4,r4,increment; \ - add r3,r3,r4; \ - blr - /* Implements the function int [r3] strlen (const void *s [r3]) @@ -72,11 +106,21 @@ .machine power9 -ENTRY_TOCLESS (STRLEN, 4) - CALL_MCOUNT 1 +ENTRY_TOCLESS (FUNCNAME, 4) + CALL_MCOUNT MCOUNT_NARGS + +#ifdef USE_AS_RAWMEMCHR + xori r5,r4,0xff - vspltisb v18,0 + mtvsrd v18+32,r4 /* matching char in v18 */ + mtvsrd v19+32,r5 /* non matching char in v19 */ + + vspltb v18,v18,7 /* replicate */ + vspltb v19,v19,7 /* replicate */ +#else vspltisb v19,-1 +#endif + vspltisb VREG_ZERO,0 /* Next 16B-aligned address. Prepare address for L(aligned). */ addi r5,r3,16 @@ -90,16 +134,25 @@ ENTRY_TOCLESS (STRLEN, 4) vcmpequb. v6,v0,v18 beq cr6,L(aligned) +#ifdef USE_AS_RAWMEMCHR + vctzlsbb r6,v6 + add r3,r3,r6 +#else vctzlsbb r3,v6 +#endif blr - /* Test next 176B, 16B at a time. The main loop is optimized for longer - strings, so checking the first bytes in 16B chunks benefits a lot - small strings. */ + /* Test up to OFF_START_LOOP-16 bytes in 16B chunks. The main loop is + optimized for longer strings, so checking the first bytes in 16B + chunks benefits a lot small strings. */ .p2align 5 L(aligned): +#ifdef USE_AS_RAWMEMCHR + cmpdi cr5,r4,0 /* Check if c == 0. This will be useful to + choose how we will perform the main loop. */ +#endif /* Prepare address for the loop. */ - addi r4,r3,192 + addi r4,r3,OFF_START_LOOP clrrdi r4,r4,6 CHECK16(v0,0,r5,tail1) @@ -113,15 +166,43 @@ L(aligned): CHECK16(v8,128,r5,tail9) CHECK16(v9,144,r5,tail10) CHECK16(v10,160,r5,tail11) +#ifdef USE_AS_RAWMEMCHR + CHECK16(v0,176,r5,tail12) + CHECK16(v1,192,r5,tail13) + CHECK16(v2,208,r5,tail14) + CHECK16(v3,224,r5,tail15) +#endif addi r5,r4,128 +#ifdef USE_AS_RAWMEMCHR + /* If c == 0, use the same loop as strlen, without the vsububm. */ + beq cr5,L(loop) + + /* This is very similar to the block after L(loop), the difference is + that here RAWMEMCHR_SUBTRACT_VECTORS is not empty, and we subtract + each byte loaded by the char we are looking for, this way we can keep + using vminub to merge the results and checking for nulls. */ + .p2align 5 +L(rawmemchr_loop): + CHECK64(0,r4,pre_tail_64b) + CHECK64(64,r4,pre_tail_64b) + addi r4,r4,256 + + CHECK64(0,r5,tail_64b) + CHECK64(64,r5,tail_64b) + addi r5,r5,256 + + b L(rawmemchr_loop) +#endif /* Switch to a more aggressive approach checking 64B each time. Use 2 pointers 128B apart and unroll the loop once to make the pointer updates and usages separated enough to avoid stalls waiting for address calculation. */ .p2align 5 L(loop): +#undef RAWMEMCHR_SUBTRACT_VECTORS +#define RAWMEMCHR_SUBTRACT_VECTORS /* nothing */ CHECK64(0,r4,pre_tail_64b) CHECK64(64,r4,pre_tail_64b) addi r4,r4,256 @@ -140,10 +221,10 @@ L(tail_64b): block and mark it in its corresponding VR. lxvp vx,0(ry) puts the low 16B bytes into vx+1, and the high into vx, so the order here is v5, v4, v7, v6. */ - vcmpequb v1,v5,v18 - vcmpequb v2,v4,v18 - vcmpequb v3,v7,v18 - vcmpequb v4,v6,v18 + vcmpequb v1,v5,VREG_ZERO + vcmpequb v2,v4,VREG_ZERO + vcmpequb v3,v7,VREG_ZERO + vcmpequb v4,v6,VREG_ZERO /* Take into account the other 64B blocks we had already checked. */ add r5,r5,r6 @@ -165,7 +246,9 @@ L(tail_64b): or r10,r8,r7 cnttzd r0,r10 /* Count trailing zeros before the match. */ +#ifndef USE_AS_RAWMEMCHR subf r5,r3,r5 +#endif add r3,r5,r0 /* Compute final length. */ blr @@ -213,9 +296,32 @@ L(tail10): L(tail11): TAIL(v10,160) -END (STRLEN) +#ifdef USE_AS_RAWMEMCHR + .p2align 5 +L(tail12): + TAIL(v0,176) + + .p2align 5 +L(tail13): + TAIL(v1,192) + + .p2align 5 +L(tail14): + TAIL(v2,208) + + .p2align 5 +L(tail15): + TAIL(v3,224) +#endif + +END (FUNCNAME) -#ifdef DEFINE_STRLEN_HIDDEN_DEF +#ifdef USE_AS_RAWMEMCHR +weak_alias (__rawmemchr,rawmemchr) +libc_hidden_builtin_def (__rawmemchr) +#else +# ifdef DEFINE_STRLEN_HIDDEN_DEF weak_alias (__strlen, strlen) libc_hidden_builtin_def (strlen) +# endif #endif diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index ea50b61674..e6e013db17 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -33,9 +33,9 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ ifneq (,$(filter %le,$(config-machine))) sysdep_routines += memcpy-power10 memmove-power10 memset-power10 \ + rawmemchr-power9 rawmemchr-power10 \ strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \ - rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \ - strlen-power10 + strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10 endif CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index b123c6a3d3..a92b67448e 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -257,6 +257,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c. */ IFUNC_IMPL (i, name, rawmemchr, #ifdef __LITTLE_ENDIAN__ + IFUNC_IMPL_ADD (array, i, rawmemchr, + (hwcap2 & PPC_FEATURE2_ARCH_3_1) + && (hwcap & PPC_FEATURE_HAS_VSX), + __rawmemchr_power10) IFUNC_IMPL_ADD (array, i, rawmemchr, hwcap2 & PPC_FEATURE2_ARCH_3_00, __rawmemchr_power9) diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S new file mode 100644 index 0000000000..bf1ed7e194 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S @@ -0,0 +1,21 @@ +/* Optimized rawmemchr implementation for PowerPC64/POWER10. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define RAWMEMCHR __rawmemchr_power10 + +#include <sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S> diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c index d906a4ea50..c0ffea2b93 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c +++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c @@ -26,6 +26,7 @@ extern __typeof (__rawmemchr) __rawmemchr_ppc attribute_hidden; extern __typeof (__rawmemchr) __rawmemchr_power7 attribute_hidden; # ifdef __LITTLE_ENDIAN__ extern __typeof (__rawmemchr) __rawmemchr_power9 attribute_hidden; +extern __typeof (__rawmemchr) __rawmemchr_power10 attribute_hidden; # endif # undef __rawmemchr @@ -34,6 +35,9 @@ extern __typeof (__rawmemchr) __rawmemchr_power9 attribute_hidden; ifunc symbol properly. */ libc_ifunc_redirected (__redirect___rawmemchr, __rawmemchr, # ifdef __LITTLE_ENDIAN__ + (hwcap2 & PPC_FEATURE2_ARCH_3_1) + && (hwcap & PPC_FEATURE_HAS_VSX) + ? __rawmemchr_power10 : (hwcap2 & PPC_FEATURE2_ARCH_3_00) ? __rawmemchr_power9 : # endif