Message ID | 56F56EEB.5020500@linux.vnet.ibm.com |
---|---|
State | New |
Headers | show |
LGTM with some comments below: On 25-03-2016 14:01, Paul E. Murphy wrote: > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/multiarch/strspn-power8.S > @@ -0,0 +1,44 @@ > +/* Optimized strspn implementation for POWER8. > + Copyright (C) 2016 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <http://www.gnu.org/licenses/>. */ > + > +#include <sysdep.h> > + > +#if IS_IN (libc) AFAIK strspn is not built either for the loader or for another library, so I think there is no need for this preprocessor check. 
> + > +#undef EALIGN > +#define EALIGN(name, alignt, words) \ > + .section ".text"; \ > + ENTRY_2(__strspn_power8) \ > + .align ALIGNARG(alignt); \ > + EALIGN_W_##words; \ > + BODY_LABEL(__strspn_power8): \ > + cfi_startproc; \ > + LOCALENTRY(__strspn_power8) > + > +#undef END > +#define END(name) \ > + cfi_endproc; \ > + TRACEBACK(__strspn_power8) \ > + END_2(__strspn_power8) > + > +#undef libc_hidden_builtin_def > +#define libc_hidden_builtin_def(name) > + > +#endif > + > +#include <sysdeps/powerpc/powerpc64/power8/strspn.S> > diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.S b/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.S > new file mode 100644 > index 0000000..4e870a9 > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.S > @@ -0,0 +1,44 @@ > +/* Optimized strspn implementation for POWER8. > + Copyright (C) 2016 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <http://www.gnu.org/licenses/>. */ > + > +#include <sysdep.h> > + > +#if IS_IN (libc) Same as before. 
> + > +#undef EALIGN > +#define EALIGN(name, alignt, words) \ > + .section ".text"; \ > + ENTRY_2(__strspn_ppc) \ > + .align ALIGNARG(alignt); \ > + EALIGN_W_##words; \ > + BODY_LABEL(__strspn_ppc): \ > + cfi_startproc; \ > + LOCALENTRY(__strspn_ppc) > + > +#undef END > +#define END(name) \ > + cfi_endproc; \ > + TRACEBACK(__strspn_ppc) \ > + END_2(__strspn_ppc) > + > +#undef libc_hidden_builtin_def > +# define libc_hidden_builtin_def(name) > + > +#endif > + > +#include <sysdeps/powerpc/powerpc64/strspn.S> > diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn.c b/sysdeps/powerpc/powerpc64/multiarch/strspn.c > new file mode 100644 > index 0000000..8769de3 > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/multiarch/strspn.c > @@ -0,0 +1,37 @@ > +/* Multiple versions of strspn. PowerPC64 version. > + Copyright (C) 2016 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <http://www.gnu.org/licenses/>. */ > + > +#if IS_IN (libc) Same as before. > +# include <string.h> > +# include <shlib-compat.h> > +# include "init-arch.h" > + > +#undef strspn > +extern __typeof (strspn) __libc_strspn; > + > +extern __typeof (strspn) __strspn_ppc attribute_hidden; > +extern __typeof (strspn) __strspn_power8 attribute_hidden; > + > +libc_ifunc (__libc_strspn, > + (hwcap2 & PPC_FEATURE2_ARCH_2_07) > + ? 
__strspn_power8 > + : __strspn_ppc); > + > +weak_alias (__libc_strspn, strspn) > +libc_hidden_builtin_def (strspn) > +#endif > diff --git a/sysdeps/powerpc/powerpc64/power8/strspn.S b/sysdeps/powerpc/powerpc64/power8/strspn.S > new file mode 100644 > index 0000000..dd1838e > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/power8/strspn.S > @@ -0,0 +1,179 @@ > +/* Optimized strspn implementation for Power8. > + > + Copyright (C) 2016 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <http://www.gnu.org/licenses/>. */ > + > +/* size_t [r3] strspn (const char *string [r3], > + const char *needleAccept [r4] */ > + > +/* This takes a novel approach by computing a 256 bit mask whereby > + each set bit implies the byte is "accepted". P8 vector hardware > + has extremely efficient hardware for selecting bits from a mask. > + > + One might ask "why not use bpermd for short strings"? It is > + so slow that its performance about matches the generic PPC64 > + variant without any fancy masking, with the added expense of > + making the mask. That was the first variant of this. */ > + > + > + > +#include "sysdep.h" > + > +/* Simple macro to use VSX instructions in overlapping VR's. 
*/ > +#define XXVR(insn, vrt, vra, vrb) \ > + insn 32+vrt, 32+vra, 32+vrb > + > +/* ISA 2.07B instructions are not all defined for older binutils. > + Macros are defined below for these newer instructions in order > + to maintain compatibility. */ > + > +/* Note, TX/SX is always set as VMX regs are the high 32 VSX regs. */ > +#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16))) > +#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16))) > + > +#define VBPERMQ(t,a,b) .long (0x1000054c \ > + | ((t)<<(32-11)) \ > + | ((a)<<(32-16)) \ > + | ((b)<<(32-21)) ) > + > + /* This can be updated to power8 once the minimum version of > + binutils supports power8 and the above instructions. */ > + .machine power7 > +EALIGN(strspn, 4, 0) > + CALL_MCOUNT 2 > + > + /* Generate useful constants for later on. */ > + vspltisb v1, 7 > + vspltisb v2, -1 > + vslb v1, v1, v1 /* 0x80 to swap high bit for vbpermq. */ > + vspltisb v10, 0 > + vsldoi v4, v10, v2, 2 /* 0xFFFF into vr4. */ > + XXVR(xxmrgld, v4, v4, v10) /* Mask for checking matches. */ > + > + /* Prepare to compute 256b mask. */ > + addi r4, r4, -1 > + li r5, 0 > + li r6, 0 > + li r7, 0 > + li r8, 0 > + li r11, 1 > + sldi r11, r11, 63 > + > + /* Start interleaved Mask computation. > + This will eventually or 1's into ignored bits from vbpermq. */ > + lvsr v11, r0, r3 > + vspltb v11, v11, 0 /* Splat shift constant. */ > + > + /* Build a 256b mask in r5-r8. */ > + .align 4 > +L(next_needle): > + lbzu r9, 1(r4) > + > + cmpldi cr0, r9, 0 > + cmpldi cr1, r9, 128 > + > + /* This is a little tricky. srd only uses the first 7 bits, > + and if bit 7 is set, value is always 0. So, we can > + effectively shift 128b in this case. */ > + xori r12, r9, 0x40 /* Invert bit 6. */ > + srd r10, r11, r9 /* Mask for bits 0-63. */ > + srd r12, r11, r12 /* Mask for bits 64-127. */ > + > + beq cr0, L(start_cmp) > + > + /* Now, or the value into the correct GPR. 
*/ > + bge cr1,L(needle_gt128) > + or r5, r5, r10 /* 0 - 63. */ > + or r6, r6, r12 /* 64 - 127. */ > + b L(next_needle) > + > + .align 4 > +L(needle_gt128): > + or r7, r7, r10 /* 128 - 191. */ > + or r8, r8, r12 /* 192 - 255. */ > + b L(next_needle) > + > + > + .align 4 > +L(start_cmp): > + /* Move and merge bitmap into 2 VRs. bpermd is slower on P8. */ > + mr r0, r3 /* Save r3 for final length computation. */ > + MTVRD (v5, r5) > + MTVRD (v6, r6) > + MTVRD (v7, r7) > + MTVRD (v8, r8) > + > + /* Continue interleaved mask generation. */ > +#ifdef __LITTLE_ENDIAN__ > + vsrw v11, v2, v11 /* Note, shift ignores higher order bits. */ > + vsplth v11, v11, 0 /* Only care about the high 16 bits of v10. */ > +#else > + vslw v11, v2, v11 /* Note, shift ignores higher order bits. */ > + vsplth v11, v11, 1 /* Only care about the low 16 bits of v10. */ > +#endif > + lvx v0, r0, r3 /* Note, unaligned load ignores lower bits. */ > + > + /* Do the merging of the bitmask. */ > + XXVR(xxmrghd, v5, v5, v6) > + XXVR(xxmrghd, v6, v7, v8) > + > + /* Finish mask generation. */ > + vand v11, v11, v4 /* Throwaway bits not in the mask. */ > + > + /* Compare the first 1-16B, while masking unwanted bytes. */ > + clrrdi r3, r3, 4 /* Note, counts from qw boundaries. */ > + vxor v9, v0, v1 /* Swap high bit. */ > + VBPERMQ (v8, v5, v0) > + VBPERMQ (v7, v6, v9) > + vor v7, v7, v8 > + vor v7, v7, v11 /* Ignore non-participating bytes. */ > + vcmpequh. v8, v7, v4 > + bnl cr6, L(done) > + > + addi r3, r3, 16 > + > + .align 4 > +L(vec): > + lvx v0, r0, r3 > + addi r3, r3, 16 > + vxor v9, v0, v1 /* Swap high bit. */ > + VBPERMQ (v8, v5, v0) > + VBPERMQ (v7, v6, v9) > + vor v7, v7, v8 > + vcmpequh. v8, v7, v4 > + blt cr6, L(vec) > + > + addi r3, r3, -16 > +L(done): > + subf r3, r0, r3 > + MFVRD (r10, v7) > + > +#ifdef __LITTLE_ENDIAN__ > + addi r0, r10, 1 /* Count the trailing 1's. */ > + andc r10, r10, r0 > + popcntd r10, r10 > +#else > + xori r10, r10, 0xffff /* Count leading 1's by inverting. 
*/ > + addi r3, r3, -48 /* Account for the extra leading zeros. */ > + cntlzd r10, r10 > +#endif > + > + add r3, r3, r10 > + blr > + > +END(strspn) > +libc_hidden_builtin_def (strspn) >
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index 3b0e3a0..7ed56bf 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -19,7 +19,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \ strcmp-power8 strcmp-power7 strcmp-ppc64 \ strcat-power8 strcat-power7 strcat-ppc64 \ memmove-power7 memmove-ppc64 wordcopy-ppc64 bcopy-ppc64 \ - strncpy-power8 strstr-power7 strstr-ppc64 + strncpy-power8 strstr-power7 strstr-ppc64 \ + strspn-power8 strspn-ppc64 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index 11a8215..3e1f099 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -322,6 +322,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_ppc)) + /* Support sysdeps/powerpc/powerpc64/multiarch/strspn.c. */ + IFUNC_IMPL (i, name, strspn, + IFUNC_IMPL_ADD (array, i, strspn, + hwcap2 & PPC_FEATURE2_ARCH_2_07, + __strspn_power8) + IFUNC_IMPL_ADD (array, i, strspn, 1, + __strspn_ppc)) + /* Support sysdeps/powerpc/powerpc64/multiarch/strstr.c. */ IFUNC_IMPL (i, name, strstr, IFUNC_IMPL_ADD (array, i, strstr, diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strspn-power8.S new file mode 100644 index 0000000..0beff3c --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strspn-power8.S @@ -0,0 +1,44 @@ +/* Optimized strspn implementation for POWER8. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#if IS_IN (libc) + +#undef EALIGN +#define EALIGN(name, alignt, words) \ + .section ".text"; \ + ENTRY_2(__strspn_power8) \ + .align ALIGNARG(alignt); \ + EALIGN_W_##words; \ + BODY_LABEL(__strspn_power8): \ + cfi_startproc; \ + LOCALENTRY(__strspn_power8) + +#undef END +#define END(name) \ + cfi_endproc; \ + TRACEBACK(__strspn_power8) \ + END_2(__strspn_power8) + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) + +#endif + +#include <sysdeps/powerpc/powerpc64/power8/strspn.S> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.S b/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.S new file mode 100644 index 0000000..4e870a9 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.S @@ -0,0 +1,44 @@ +/* Optimized strspn implementation for POWER8. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#if IS_IN (libc) + +#undef EALIGN +#define EALIGN(name, alignt, words) \ + .section ".text"; \ + ENTRY_2(__strspn_ppc) \ + .align ALIGNARG(alignt); \ + EALIGN_W_##words; \ + BODY_LABEL(__strspn_ppc): \ + cfi_startproc; \ + LOCALENTRY(__strspn_ppc) + +#undef END +#define END(name) \ + cfi_endproc; \ + TRACEBACK(__strspn_ppc) \ + END_2(__strspn_ppc) + +#undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) + +#endif + +#include <sysdeps/powerpc/powerpc64/strspn.S> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn.c b/sysdeps/powerpc/powerpc64/multiarch/strspn.c new file mode 100644 index 0000000..8769de3 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strspn.c @@ -0,0 +1,37 @@ +/* Multiple versions of strspn. PowerPC64 version. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if IS_IN (libc) +# include <string.h> +# include <shlib-compat.h> +# include "init-arch.h" + +#undef strspn +extern __typeof (strspn) __libc_strspn; + +extern __typeof (strspn) __strspn_ppc attribute_hidden; +extern __typeof (strspn) __strspn_power8 attribute_hidden; + +libc_ifunc (__libc_strspn, + (hwcap2 & PPC_FEATURE2_ARCH_2_07) + ? __strspn_power8 + : __strspn_ppc); + +weak_alias (__libc_strspn, strspn) +libc_hidden_builtin_def (strspn) +#endif diff --git a/sysdeps/powerpc/powerpc64/power8/strspn.S b/sysdeps/powerpc/powerpc64/power8/strspn.S new file mode 100644 index 0000000..dd1838e --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power8/strspn.S @@ -0,0 +1,179 @@ +/* Optimized strspn implementation for Power8. + + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* size_t [r3] strspn (const char *string [r3], + const char *needleAccept [r4] */ + +/* This takes a novel approach by computing a 256 bit mask whereby + each set bit implies the byte is "accepted". P8 vector hardware + has extremely efficient hardware for selecting bits from a mask. + + One might ask "why not use bpermd for short strings"? 
It is + so slow that its performance about matches the generic PPC64 + variant without any fancy masking, with the added expense of + making the mask. That was the first variant of this. */ + + + +#include "sysdep.h" + +/* Simple macro to use VSX instructions in overlapping VR's. */ +#define XXVR(insn, vrt, vra, vrb) \ + insn 32+vrt, 32+vra, 32+vrb + +/* ISA 2.07B instructions are not all defined for older binutils. + Macros are defined below for these newer instructions in order + to maintain compatibility. */ + +/* Note, TX/SX is always set as VMX regs are the high 32 VSX regs. */ +#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16))) +#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16))) + +#define VBPERMQ(t,a,b) .long (0x1000054c \ + | ((t)<<(32-11)) \ + | ((a)<<(32-16)) \ + | ((b)<<(32-21)) ) + + /* This can be updated to power8 once the minimum version of + binutils supports power8 and the above instructions. */ + .machine power7 +EALIGN(strspn, 4, 0) + CALL_MCOUNT 2 + + /* Generate useful constants for later on. */ + vspltisb v1, 7 + vspltisb v2, -1 + vslb v1, v1, v1 /* 0x80 to swap high bit for vbpermq. */ + vspltisb v10, 0 + vsldoi v4, v10, v2, 2 /* 0xFFFF into vr4. */ + XXVR(xxmrgld, v4, v4, v10) /* Mask for checking matches. */ + + /* Prepare to compute 256b mask. */ + addi r4, r4, -1 + li r5, 0 + li r6, 0 + li r7, 0 + li r8, 0 + li r11, 1 + sldi r11, r11, 63 + + /* Start interleaved Mask computation. + This will eventually or 1's into ignored bits from vbpermq. */ + lvsr v11, r0, r3 + vspltb v11, v11, 0 /* Splat shift constant. */ + + /* Build a 256b mask in r5-r8. */ + .align 4 +L(next_needle): + lbzu r9, 1(r4) + + cmpldi cr0, r9, 0 + cmpldi cr1, r9, 128 + + /* This is a little tricky. srd only uses the first 7 bits, + and if bit 7 is set, value is always 0. So, we can + effectively shift 128b in this case. */ + xori r12, r9, 0x40 /* Invert bit 6. */ + srd r10, r11, r9 /* Mask for bits 0-63. 
*/ + srd r12, r11, r12 /* Mask for bits 64-127. */ + + beq cr0, L(start_cmp) + + /* Now, or the value into the correct GPR. */ + bge cr1,L(needle_gt128) + or r5, r5, r10 /* 0 - 63. */ + or r6, r6, r12 /* 64 - 127. */ + b L(next_needle) + + .align 4 +L(needle_gt128): + or r7, r7, r10 /* 128 - 191. */ + or r8, r8, r12 /* 192 - 255. */ + b L(next_needle) + + + .align 4 +L(start_cmp): + /* Move and merge bitmap into 2 VRs. bpermd is slower on P8. */ + mr r0, r3 /* Save r3 for final length computation. */ + MTVRD (v5, r5) + MTVRD (v6, r6) + MTVRD (v7, r7) + MTVRD (v8, r8) + + /* Continue interleaved mask generation. */ +#ifdef __LITTLE_ENDIAN__ + vsrw v11, v2, v11 /* Note, shift ignores higher order bits. */ + vsplth v11, v11, 0 /* Only care about the high 16 bits of v10. */ +#else + vslw v11, v2, v11 /* Note, shift ignores higher order bits. */ + vsplth v11, v11, 1 /* Only care about the low 16 bits of v10. */ +#endif + lvx v0, r0, r3 /* Note, unaligned load ignores lower bits. */ + + /* Do the merging of the bitmask. */ + XXVR(xxmrghd, v5, v5, v6) + XXVR(xxmrghd, v6, v7, v8) + + /* Finish mask generation. */ + vand v11, v11, v4 /* Throwaway bits not in the mask. */ + + /* Compare the first 1-16B, while masking unwanted bytes. */ + clrrdi r3, r3, 4 /* Note, counts from qw boundaries. */ + vxor v9, v0, v1 /* Swap high bit. */ + VBPERMQ (v8, v5, v0) + VBPERMQ (v7, v6, v9) + vor v7, v7, v8 + vor v7, v7, v11 /* Ignore non-participating bytes. */ + vcmpequh. v8, v7, v4 + bnl cr6, L(done) + + addi r3, r3, 16 + + .align 4 +L(vec): + lvx v0, r0, r3 + addi r3, r3, 16 + vxor v9, v0, v1 /* Swap high bit. */ + VBPERMQ (v8, v5, v0) + VBPERMQ (v7, v6, v9) + vor v7, v7, v8 + vcmpequh. v8, v7, v4 + blt cr6, L(vec) + + addi r3, r3, -16 +L(done): + subf r3, r0, r3 + MFVRD (r10, v7) + +#ifdef __LITTLE_ENDIAN__ + addi r0, r10, 1 /* Count the trailing 1's. */ + andc r10, r10, r0 + popcntd r10, r10 +#else + xori r10, r10, 0xffff /* Count leading 1's by inverting. 
*/ + addi r3, r3, -48 /* Account for the extra leading zeros. */ + cntlzd r10, r10 +#endif + + add r3, r3, r10 + blr + +END(strspn) +libc_hidden_builtin_def (strspn)