Message ID | 1435319512-22245-28-git-send-email-stli@linux.vnet.ibm.com
---|---
State | New |
On Fri, Jun 26, 2015 at 01:51:52PM +0200, Stefan Liebler wrote:
> This patch provides optimized version of memrchr with the z13 vector
> instructions.
>
> +
> +.Lfound_g16:
> +	vlgvb	%r1,%v17,7	/* Index of c or 16 if not found. */
> +	lgr	%r2,%r5
> +	lghi	%r4,15		/* Highest index of a full vr. */
> +	clr	%r1,%r4
> +.Lfound_lt16:
> +	la	%r2,0(%r1,%r2)	/* Store current pointer to found c. */
> +	ber	%r14		/* Return if found c is last loaded byte. */
> +
> +	/* Shift vector elements left and start search again. */
> +	aghi	%r1,1		/* Start search after current index. */
> +	slr	%r4,%r1		/* New highest index. */
> +	sll	%r1,3		/* Calculate byte count for vector shift
> +				   left. */
> +	vlvgg	%v17,%r1,0
> +	vslb	%v16,%v16,%v17	/* Vector shift left by byte by number of bytes
> +				   specified in bits 1-4 of byte 7 in v17. */
> +	vfeeb	%v17,%v16,%v18	/* Find c. */
> +	la	%r5,1(%r2)	/* Save start-address of shifted v16. */
> +	vlgvb	%r1,%v17,7	/* Index of c or 16 if not found */
> +	clr	%r1,%r4
> +	locgrle	%r2,%r5		/* Use stored address as base if c found. */
> +	jle	.Lfound_lt16	/* Found c within loaded bytes. */
> +	br	%r14		/* No further c found, return last stored c. */
> +
> +.Lfound_end:
> +	la	%r2,0(%r4,%r2)	/* Return pointer to found c */
> +	br	%r14
> +

This sequence looks slow. Both here and in strrchr you should use
something better than a loop. One possibility is to use a shuffle to do
a 16-byte byteswap, but that needs to load a constant.

A second way is to use the same trick as in calculating clz with ctz.
If the instruction produces a nonzero byte for a match and 0 for a
nonmatch, then to find the highest nonzero byte you first need to OR it
into all lower bytes, for example by

x |= x >> 8;
x |= x >> 4;
x |= x >> 2;
x |= x >> 1;

Then you look for the index of the first zero byte, which is one more
than the highest nonzero byte. When the roles of 1 and 0 are reversed
and all nonzero bytes have a bit in common, you could use & instead.

For strrchr the idea is the same. When handling the trailing zero you
could do a similar trick with the zero mask,

x |= x << 8;
x |= x << 4;
x |= x << 2;
x |= x << 1;
x = ~x;

to construct a vector that is 255 before and at the first zero byte and
0 after it. It's important that the first zero gets 255, to handle
strrchr(x,0) without a separate check for whether c is zero.
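In scalar C the two tricks look as follows - a minimal sketch, assuming
a little-endian uint64_t stands in for the 16-byte vector register and
that matching (or zero) bytes are marked with 0xff. first_zero_byte,
last_nonzero_byte and mask_through_first_zero are made-up names that
model what a forward-only find-element instruction such as vfenezb
provides. Note the extra one-byte shift before inverting in the last
helper: inverting the smeared mask directly would clear the zero byte
itself, while the requirement above is that it stays selected.

#include <stdint.h>

/* Index of the lowest zero byte of x (0..7), or 8 if there is none;
   models a forward-only scan like vfenezb.  */
static unsigned
first_zero_byte (uint64_t x)
{
  for (unsigned i = 0; i < 8; i++)
    if (((x >> (8 * i)) & 0xff) == 0)
      return i;
  return 8;
}

/* Highest nonzero byte of x (x != 0) via the smearing trick: OR every
   nonzero byte into all lower-indexed bytes; afterwards the first zero
   byte is one past the highest match.  */
static unsigned
last_nonzero_byte (uint64_t x)
{
  x |= x >> 32;
  x |= x >> 16;
  x |= x >> 8;
  return first_zero_byte (x) - 1;
}

/* z marks the zero bytes of a string chunk with 0xff.  The result is
   0xff up to and including the first zero byte and 0x00 after it, so
   strrchr (s, 0) needs no separate case.  */
static uint64_t
mask_through_first_zero (uint64_t z)
{
  z |= z << 32;
  z |= z << 16;
  z |= z << 8;
  return ~(z << 8);	/* The extra byte shift keeps 0xff on the zero.  */
}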
On 07/02/2015 01:11 AM, Ondřej Bílka wrote:
> On Fri, Jun 26, 2015 at 01:51:52PM +0200, Stefan Liebler wrote:
>> This patch provides optimized version of memrchr with the z13 vector
>> instructions.
> [...]
> This sequence looks slow. Both here and in strrchr you should use
> something better than a loop.
> [...]
> For strrchr the idea is the same. When handling the trailing zero you
> could do a similar trick with the zero mask [...] to construct a
> vector that is 255 before and at the first zero byte and 0 after it.
> It's important that the first zero gets 255, to handle strrchr(x,0)
> without a separate check for whether c is zero.

Currently I'm rewriting the optimized functions so that the first 16
bytes are loaded unaligned, or only up to the page boundary.

For strrchr I'm now searching for c in 16-byte blocks and marking the
last block with a found c, as you recommended. After reaching the end
of the string, I check once for further occurrences of c in the marked
block: the bytes in the marked vector register are shifted right to
mask out the garbage after a possible zero, the bytes in the vector
register are reversed, and "the first" c is searched for again. The
result is the last occurrence of c in the marked 16-byte block, and
thus in the whole string. And you are right, the mentioned worst case -
a string consisting only of c's - is now much faster.

I'll post the new version of the patch series after wcsrchr and memrchr
are changed, too.
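A scalar model of that final reverse-and-search step (plain C with a
hypothetical helper instead of the z13 vector instructions; block is
the marked 16-byte chunk, valid counts the bytes up to and including
the terminator):

#include <stddef.h>

/* Everything behind block[valid - 1] is garbage.  Reversing the bytes
   turns "find the last c" into "find the first c", which is what the
   vector find-element instruction delivers.  */
static const char *
last_c_in_block (const char *block, size_t valid, int c)
{
  unsigned char rev[16];
  for (size_t i = 0; i < 16; i++)
    rev[i] = (unsigned char) block[15 - i];	/* Byte-reverse.  */
  /* The garbage block[valid..15] is now rev[0..15 - valid]; skip it,
     as the vector code does by shifting right beforehand.  */
  for (size_t i = 16 - valid; i < 16; i++)
    if (rev[i] == (unsigned char) c)
      return block + (15 - i);	/* First hit here = last hit in block.  */
  return NULL;
}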
diff --git a/sysdeps/s390/multiarch/Makefile b/sysdeps/s390/multiarch/Makefile
index 929a545..0805b07 100644
--- a/sysdeps/s390/multiarch/Makefile
+++ b/sysdeps/s390/multiarch/Makefile
@@ -17,7 +17,8 @@ sysdep_routines += strlen strlen-vx strlen-c \
 		   strcspn strcspn-vx strcspn-c \
 		   memchr memchr-vx \
 		   rawmemchr rawmemchr-vx rawmemchr-c \
-		   memccpy memccpy-vx memccpy-c
+		   memccpy memccpy-vx memccpy-c \
+		   memrchr memrchr-vx memrchr-c
 endif
 
 ifeq ($(subdir),wcsmbs)
diff --git a/sysdeps/s390/multiarch/ifunc-impl-list.c b/sysdeps/s390/multiarch/ifunc-impl-list.c
index 5ea258b..c235bdc 100644
--- a/sysdeps/s390/multiarch/ifunc-impl-list.c
+++ b/sysdeps/s390/multiarch/ifunc-impl-list.c
@@ -137,6 +137,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   IFUNC_VX_IMPL (wmemcmp);
 
+  IFUNC_VX_IMPL (memrchr);
+
 #endif /* HAVE_S390_VX_ASM_SUPPORT */
 
   return i;
diff --git a/sysdeps/s390/multiarch/memrchr-c.c b/sysdeps/s390/multiarch/memrchr-c.c
new file mode 100644
index 0000000..ac481fd
--- /dev/null
+++ b/sysdeps/s390/multiarch/memrchr-c.c
@@ -0,0 +1,25 @@
+/* Default memrchr implementation for S/390.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc)
+# define MEMRCHR  __memrchr_c
+
+# include <string.h>
+extern __typeof (__memrchr) __memrchr_c;
+# include <string/memrchr.c>
+#endif
diff --git a/sysdeps/s390/multiarch/memrchr-vx.S b/sysdeps/s390/multiarch/memrchr-vx.S
new file mode 100644
index 0000000..53c44ef
--- /dev/null
+++ b/sysdeps/s390/multiarch/memrchr-vx.S
@@ -0,0 +1,116 @@
+/* Vector optimized 32/64 bit S/390 version of memrchr.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc)
+
+# include "sysdep.h"
+# include "asm-syntax.h"
+
+	.text
+
+/* void *memrchr (const void *s, int c, size_t n)
+   Scans memory for character c backwards
+   and returns pointer to first c.
+
+   Register usage:
+   -r1=tmp
+   -r2=s
+   -r3=c
+   -r4=n
+   -r5=s in loop
+
+   -v16=part of s
+   -v17=index of found c
+   -v18=c replicated
+*/
+ENTRY(__memrchr_vx)
+	.machine "z13"
+	.machinemode "zarch_nohighgprs"
+
+# if !defined __s390x__
+	llgfr	%r4,%r4
+# endif /* !defined __s390x__ */
+
+	vlvgb	%v18,%r3,0	/* Generate vector which elements are all c.
+				   If c > 255, c will be truncated. */
+	vrepb	%v18,%v18,0
+
+	clgijh	%r4,16,.Lg16	/* Jump away if n > 16. */
+
+.Ll16:
+	/* Process n <= 16 bytes; r2 points to begin of n-bytes. */
+	clgije	%r4,0,.Lnf_end
+
+	aghi	%r4,-1		/* vll needs highest index. */
+	vll	%v16,%r4,0(%r2)
+	vfeeb	%v17,%v16,%v18	/* Find c. */
+	vlgvb	%r1,%v17,7	/* Index of c or 16 if not found. */
+	clr	%r1,%r4
+	jle	.Lfound_lt16	/* Found c within loaded bytes. */
+.Lnf_end:
+	lghi	%r2,0		/* Return null. */
+	br	%r14
+
+.Lfound_g16:
+	vlgvb	%r1,%v17,7	/* Index of c or 16 if not found. */
+	lgr	%r2,%r5
+	lghi	%r4,15		/* Highest index of a full vr. */
+	clr	%r1,%r4
+.Lfound_lt16:
+	la	%r2,0(%r1,%r2)	/* Store current pointer to found c. */
+	ber	%r14		/* Return if found c is last loaded byte. */
+
+	/* Shift vector elements left and start search again. */
+	aghi	%r1,1		/* Start search after current index. */
+	slr	%r4,%r1		/* New highest index. */
+	sll	%r1,3		/* Calculate byte count for vector shift
+				   left. */
+	vlvgg	%v17,%r1,0
+	vslb	%v16,%v16,%v17	/* Vector shift left by byte by number of bytes
+				   specified in bits 1-4 of byte 7 in v17. */
+	vfeeb	%v17,%v16,%v18	/* Find c. */
+	la	%r5,1(%r2)	/* Save start-address of shifted v16. */
+	vlgvb	%r1,%v17,7	/* Index of c or 16 if not found */
+	clr	%r1,%r4
+	locgrle	%r2,%r5		/* Use stored address as base if c found. */
+	jle	.Lfound_lt16	/* Found c within loaded bytes. */
+	br	%r14		/* No further c found, return last stored c. */
+
+.Lfound_end:
+	la	%r2,0(%r4,%r2)	/* Return pointer to found c */
+	br	%r14
+
+.Lg16:
+	/* Process 16byte blocks - n > 16. */
+.Lloop:
+	aghi	%r4,-16
+	la	%r5,0(%r4,%r2)	/* Get address n -16. */
+	vl	%v16,0(%r5)
+	vfeebs	%v17,%v16,%v18	/* Find c. */
+	jno	.Lfound_g16	/* Jump away if c was found. */
+	clgijle	%r4,16,.Ll16	/* Process remaining. */
+
+	aghi	%r4,-16
+	la	%r5,0(%r4,%r2)	/* Get address n -16. */
+	vl	%v16,0(%r5)
+	vfeebs	%v17,%v16,%v18	/* Find c. */
+	jno	.Lfound_g16	/* Jump away if c was found. */
+	clgijh	%r4,16,.Lloop	/* Loop until n > 16. */
+	j	.Ll16
+END(__memrchr_vx)
+#endif /* HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc) */
diff --git a/sysdeps/s390/multiarch/memrchr.c b/sysdeps/s390/multiarch/memrchr.c
new file mode 100644
index 0000000..8ac2f52
--- /dev/null
+++ b/sysdeps/s390/multiarch/memrchr.c
@@ -0,0 +1,28 @@
+/* Multiple versions of memrchr.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc)
+# include <string.h>
+# include <ifunc-resolve.h>
+
+s390_vx_libc_ifunc (__memrchr)
+weak_alias (__memrchr, memrchr)
+
+#else
+# include <string/memrchr.c>
+#endif /* !(defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc)) */
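For reference, a minimal standalone check of the semantics every
variant above has to keep (memrchr is a GNU extension, hence
_GNU_SOURCE; this little test program is not part of the patch):

#define _GNU_SOURCE		/* memrchr is a GNU extension.  */
#include <string.h>
#include <assert.h>

int
main (void)
{
  const char buf[] = "abcabc";

  /* The last occurrence is returned, not the first.  */
  assert (memrchr (buf, 'b', 6) == buf + 4);
  /* NULL when c does not occur in the first n bytes.  */
  assert (memrchr (buf, 'b', 1) == NULL);
  /* c is converted to unsigned char, as the truncation comment in
     memrchr-vx.S relies on.  */
  assert (memrchr (buf, 'a' + 256, 6) == buf + 3);
  return 0;
}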