Message ID | 20220919195920.956393-9-adhemerval.zanella@linaro.org |
---|---|
State | New |
Headers | show |
Series | Improve generic string routines | expand |
On Mon, Sep 19, 2022 at 1:04 PM Adhemerval Zanella via Libc-alpha <libc-alpha@sourceware.org> wrote: > > New algorithm have the following key differences: > > - Reads first word unaligned and use string-maskoff function to > remove unwanted data. This strategy follow arch-specific > optimization used on aarch64 and powerpc. > > - Use string-fz{b,i} functions. > > Checked on x86_64-linux-gnu, i686-linux-gnu, powerpc64-linux-gnu, > and powerpc-linux-gnu by removing the arch-specific assembly > implementation and disabling multi-arch (it covers both LE and BE > for 64 and 32 bits). > > Co-authored-by: Richard Henderson <rth@twiddle.net> > --- > string/strchrnul.c | 156 +++--------------- > .../power4/multiarch/strchrnul-ppc32.c | 4 - > sysdeps/s390/strchrnul-c.c | 2 - > 3 files changed, 24 insertions(+), 138 deletions(-) > > diff --git a/string/strchrnul.c b/string/strchrnul.c > index 0cc1fc6bb0..67defa3dab 100644 > --- a/string/strchrnul.c > +++ b/string/strchrnul.c > @@ -1,10 +1,5 @@ > /* Copyright (C) 1991-2022 Free Software Foundation, Inc. > This file is part of the GNU C Library. > - Based on strlen implementation by Torbjorn Granlund (tege@sics.se), > - with help from Dan Sahlin (dan@sics.se) and > - bug fix and commentary by Jim Blandy (jimb@ai.mit.edu); > - adaptation to strchr suggested by Dick Karpinski (dick@cca.ucsf.edu), > - and implemented by Roland McGrath (roland@ai.mit.edu). > > The GNU C Library is free software; you can redistribute it and/or > modify it under the terms of the GNU Lesser General Public > @@ -21,146 +16,43 @@ > <https://www.gnu.org/licenses/>. */ > > #include <string.h> > -#include <memcopy.h> > #include <stdlib.h> > +#include <stdint.h> > +#include <string-fza.h> > +#include <string-fzb.h> > +#include <string-fzi.h> > +#include <string-maskoff.h> > > #undef __strchrnul > #undef strchrnul > > -#ifndef STRCHRNUL > -# define STRCHRNUL __strchrnul > +#ifdef STRCHRNUL > +# define __strchrnul STRCHRNUL > #endif > > /* Find the first occurrence of C in S or the final NUL byte. */ > char * > -STRCHRNUL (const char *s, int c_in) > +__strchrnul (const char *str, int c_in) > { > - const unsigned char *char_ptr; > - const unsigned long int *longword_ptr; > - unsigned long int longword, magic_bits, charmask; > - unsigned char c; > - > - c = (unsigned char) c_in; > - > - /* Handle the first few characters by reading one character at a time. > - Do this until CHAR_PTR is aligned on a longword boundary. */ > - for (char_ptr = (const unsigned char *) s; > - ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0; > - ++char_ptr) > - if (*char_ptr == c || *char_ptr == '\0') > - return (void *) char_ptr; > - > - /* All these elucidatory comments refer to 4-byte longwords, > - but the theory applies equally well to 8-byte longwords. */ > - > - longword_ptr = (unsigned long int *) char_ptr; > - > - /* Bits 31, 24, 16, and 8 of this number are zero. Call these bits > - the "holes." Note that there is a hole just to the left of > - each byte, with an extra at the end: > - > - bits: 01111110 11111110 11111110 11111111 > - bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD > - > - The 1-bits make sure that carries propagate to the next 0-bit. > - The 0-bits provide holes for carries to fall into. */ > - magic_bits = -1; > - magic_bits = magic_bits / 0xff * 0xfe << 1 >> 1 | 1; > - > - /* Set up a longword, each of whose bytes is C. */ > - charmask = c | (c << 8); > - charmask |= charmask << 16; > - if (sizeof (longword) > 4) > - /* Do the shift in two steps to avoid a warning if long has 32 bits. */ > - charmask |= (charmask << 16) << 16; > - if (sizeof (longword) > 8) > - abort (); > - > - /* Instead of the traditional loop which tests each character, > - we will test a longword at a time. The tricky part is testing > - if *any of the four* bytes in the longword in question are zero. */ > - for (;;) > - { > - /* We tentatively exit the loop if adding MAGIC_BITS to > - LONGWORD fails to change any of the hole bits of LONGWORD. > - > - 1) Is this safe? Will it catch all the zero bytes? > - Suppose there is a byte with all zeros. Any carry bits > - propagating from its left will fall into the hole at its > - least significant bit and stop. Since there will be no > - carry from its most significant bit, the LSB of the > - byte to the left will be unchanged, and the zero will be > - detected. > + /* Set up a word, each of whose bytes is C. */ > + op_t repeated_c = repeat_bytes (c_in); > > - 2) Is this worthwhile? Will it ignore everything except > - zero bytes? Suppose every byte of LONGWORD has a bit set > - somewhere. There will be a carry into bit 8. If bit 8 > - is set, this will carry into bit 16. If bit 8 is clear, > - one of bits 9-15 must be set, so there will be a carry > - into bit 16. Similarly, there will be a carry into bit > - 24. If one of bits 24-30 is set, there will be a carry > - into bit 31, so all of the hole bits will be changed. > + /* Align the input address to op_t. */ > + uintptr_t s_int = (uintptr_t) str; > + const op_t *word_ptr = word_containing (str); > > - The one misfire occurs when bits 24-30 are clear and bit > - 31 is set; in this case, the hole at bit 31 is not > - changed. If we had access to the processor carry flag, > - we could close this loophole by putting the fourth hole > - at bit 32! > + /* Read the first aligned word, but force bytes before the string to > + match neither zero nor goal (we make sure the high bit of each byte > + is 1, and the low 7 bits are all the opposite of the goal byte). */ > + op_t bmask = create_mask (s_int); > + op_t word = (*word_ptr | bmask) ^ (repeated_c & highbit_mask (bmask)); Think much clearer (and probably better codegen) is: find_zero_eq_low/all(word, repeated) >> (s_int * CHAR_BIT) > > - So it ignores everything except 128's, when they're aligned > - properly. > + while (! has_zero_eq (word, repeated_c)) > + word = *++word_ptr; > > - 3) But wait! Aren't we looking for C as well as zero? > - Good point. So what we do is XOR LONGWORD with a longword, > - each of whose bytes is C. This turns each byte that is C > - into a zero. */ > - > - longword = *longword_ptr++; > - > - /* Add MAGIC_BITS to LONGWORD. */ > - if ((((longword + magic_bits) > - > - /* Set those bits that were unchanged by the addition. */ > - ^ ~longword) > - > - /* Look at only the hole bits. If any of the hole bits > - are unchanged, most likely one of the bytes was a > - zero. */ > - & ~magic_bits) != 0 > - > - /* That caught zeroes. Now test for C. */ > - || ((((longword ^ charmask) + magic_bits) ^ ~(longword ^ charmask)) > - & ~magic_bits) != 0) > - { > - /* Which of the bytes was C or zero? > - If none of them were, it was a misfire; continue the search. */ > - > - const unsigned char *cp = (const unsigned char *) (longword_ptr - 1); > - > - if (*cp == c || *cp == '\0') > - return (char *) cp; > - if (*++cp == c || *cp == '\0') > - return (char *) cp; > - if (*++cp == c || *cp == '\0') > - return (char *) cp; > - if (*++cp == c || *cp == '\0') > - return (char *) cp; > - if (sizeof (longword) > 4) > - { > - if (*++cp == c || *cp == '\0') > - return (char *) cp; > - if (*++cp == c || *cp == '\0') > - return (char *) cp; > - if (*++cp == c || *cp == '\0') > - return (char *) cp; > - if (*++cp == c || *cp == '\0') > - return (char *) cp; > - } > - } > - } > - > - /* This should never happen. */ > - return NULL; > + op_t found = index_first_zero_eq (word, repeated_c); > + return (char *) (word_ptr) + found; > } > - > +#ifndef STRCHRNUL > weak_alias (__strchrnul, strchrnul) > +#endif > diff --git a/sysdeps/powerpc/powerpc32/power4/multiarch/strchrnul-ppc32.c b/sysdeps/powerpc/powerpc32/power4/multiarch/strchrnul-ppc32.c > index ed86b5e671..9c85e269f7 100644 > --- a/sysdeps/powerpc/powerpc32/power4/multiarch/strchrnul-ppc32.c > +++ b/sysdeps/powerpc/powerpc32/power4/multiarch/strchrnul-ppc32.c > @@ -19,10 +19,6 @@ > #include <string.h> > > #define STRCHRNUL __strchrnul_ppc > - > -#undef weak_alias > -#define weak_alias(a,b ) > - > extern __typeof (strchrnul) __strchrnul_ppc attribute_hidden; > > #include <string/strchrnul.c> > diff --git a/sysdeps/s390/strchrnul-c.c b/sysdeps/s390/strchrnul-c.c > index 4ffac54edd..2ebbcc62f7 100644 > --- a/sysdeps/s390/strchrnul-c.c > +++ b/sysdeps/s390/strchrnul-c.c > @@ -22,8 +22,6 @@ > # if HAVE_STRCHRNUL_IFUNC > # define STRCHRNUL STRCHRNUL_C > # define __strchrnul STRCHRNUL > -# undef weak_alias > -# define weak_alias(name, alias) > # endif > > # include <string/strchrnul.c> > -- > 2.34.1 >
On 05/01/23 20:17, Noah Goldstein wrote: > On Mon, Sep 19, 2022 at 1:04 PM Adhemerval Zanella via Libc-alpha > <libc-alpha@sourceware.org> wrote: >> >> New algorithm have the following key differences: >> >> - Reads first word unaligned and use string-maskoff function to >> remove unwanted data. This strategy follow arch-specific >> optimization used on aarch64 and powerpc. >> >> - Use string-fz{b,i} functions. >> >> Checked on x86_64-linux-gnu, i686-linux-gnu, powerpc64-linux-gnu, >> and powerpc-linux-gnu by removing the arch-specific assembly >> implementation and disabling multi-arch (it covers both LE and BE >> for 64 and 32 bits). >> >> Co-authored-by: Richard Henderson <rth@twiddle.net> >> --- >> string/strchrnul.c | 156 +++--------------- >> .../power4/multiarch/strchrnul-ppc32.c | 4 - >> sysdeps/s390/strchrnul-c.c | 2 - >> 3 files changed, 24 insertions(+), 138 deletions(-) >> >> diff --git a/string/strchrnul.c b/string/strchrnul.c >> index 0cc1fc6bb0..67defa3dab 100644 >> --- a/string/strchrnul.c >> +++ b/string/strchrnul.c >> @@ -1,10 +1,5 @@ >> /* Copyright (C) 1991-2022 Free Software Foundation, Inc. >> This file is part of the GNU C Library. >> - Based on strlen implementation by Torbjorn Granlund (tege@sics.se), >> - with help from Dan Sahlin (dan@sics.se) and >> - bug fix and commentary by Jim Blandy (jimb@ai.mit.edu); >> - adaptation to strchr suggested by Dick Karpinski (dick@cca.ucsf.edu), >> - and implemented by Roland McGrath (roland@ai.mit.edu). >> >> The GNU C Library is free software; you can redistribute it and/or >> modify it under the terms of the GNU Lesser General Public >> @@ -21,146 +16,43 @@ >> <https://www.gnu.org/licenses/>. */ >> >> #include <string.h> >> -#include <memcopy.h> >> #include <stdlib.h> >> +#include <stdint.h> >> +#include <string-fza.h> >> +#include <string-fzb.h> >> +#include <string-fzi.h> >> +#include <string-maskoff.h> >> >> #undef __strchrnul >> #undef strchrnul >> >> -#ifndef STRCHRNUL >> -# define STRCHRNUL __strchrnul >> +#ifdef STRCHRNUL >> +# define __strchrnul STRCHRNUL >> #endif >> >> /* Find the first occurrence of C in S or the final NUL byte. */ >> char * >> -STRCHRNUL (const char *s, int c_in) >> +__strchrnul (const char *str, int c_in) >> { >> - const unsigned char *char_ptr; >> - const unsigned long int *longword_ptr; >> - unsigned long int longword, magic_bits, charmask; >> - unsigned char c; >> - >> - c = (unsigned char) c_in; >> - >> - /* Handle the first few characters by reading one character at a time. >> - Do this until CHAR_PTR is aligned on a longword boundary. */ >> - for (char_ptr = (const unsigned char *) s; >> - ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0; >> - ++char_ptr) >> - if (*char_ptr == c || *char_ptr == '\0') >> - return (void *) char_ptr; >> - >> - /* All these elucidatory comments refer to 4-byte longwords, >> - but the theory applies equally well to 8-byte longwords. */ >> - >> - longword_ptr = (unsigned long int *) char_ptr; >> - >> - /* Bits 31, 24, 16, and 8 of this number are zero. Call these bits >> - the "holes." Note that there is a hole just to the left of >> - each byte, with an extra at the end: >> - >> - bits: 01111110 11111110 11111110 11111111 >> - bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD >> - >> - The 1-bits make sure that carries propagate to the next 0-bit. >> - The 0-bits provide holes for carries to fall into. */ >> - magic_bits = -1; >> - magic_bits = magic_bits / 0xff * 0xfe << 1 >> 1 | 1; >> - >> - /* Set up a longword, each of whose bytes is C. */ >> - charmask = c | (c << 8); >> - charmask |= charmask << 16; >> - if (sizeof (longword) > 4) >> - /* Do the shift in two steps to avoid a warning if long has 32 bits. */ >> - charmask |= (charmask << 16) << 16; >> - if (sizeof (longword) > 8) >> - abort (); >> - >> - /* Instead of the traditional loop which tests each character, >> - we will test a longword at a time. The tricky part is testing >> - if *any of the four* bytes in the longword in question are zero. */ >> - for (;;) >> - { >> - /* We tentatively exit the loop if adding MAGIC_BITS to >> - LONGWORD fails to change any of the hole bits of LONGWORD. >> - >> - 1) Is this safe? Will it catch all the zero bytes? >> - Suppose there is a byte with all zeros. Any carry bits >> - propagating from its left will fall into the hole at its >> - least significant bit and stop. Since there will be no >> - carry from its most significant bit, the LSB of the >> - byte to the left will be unchanged, and the zero will be >> - detected. >> + /* Set up a word, each of whose bytes is C. */ >> + op_t repeated_c = repeat_bytes (c_in); >> >> - 2) Is this worthwhile? Will it ignore everything except >> - zero bytes? Suppose every byte of LONGWORD has a bit set >> - somewhere. There will be a carry into bit 8. If bit 8 >> - is set, this will carry into bit 16. If bit 8 is clear, >> - one of bits 9-15 must be set, so there will be a carry >> - into bit 16. Similarly, there will be a carry into bit >> - 24. If one of bits 24-30 is set, there will be a carry >> - into bit 31, so all of the hole bits will be changed. >> + /* Align the input address to op_t. */ >> + uintptr_t s_int = (uintptr_t) str; >> + const op_t *word_ptr = word_containing (str); >> >> - The one misfire occurs when bits 24-30 are clear and bit >> - 31 is set; in this case, the hole at bit 31 is not >> - changed. If we had access to the processor carry flag, >> - we could close this loophole by putting the fourth hole >> - at bit 32! >> + /* Read the first aligned word, but force bytes before the string to >> + match neither zero nor goal (we make sure the high bit of each byte >> + is 1, and the low 7 bits are all the opposite of the goal byte). */ >> + op_t bmask = create_mask (s_int); >> + op_t word = (*word_ptr | bmask) ^ (repeated_c & highbit_mask (bmask)); > > Think much clearer (and probably better codegen) is: > find_zero_eq_low/all(word, repeated) >> (s_int * CHAR_BIT) It does not seem to work, at least not replacing the two lines with: op_t word = find_zero_eq_all/low (*word_ptr, repeated_c) >> (s_int * CHAR_BIT); The loader itself can not loader anything (which means strchr is failing somewhere).
On 1/9/23 12:35, Adhemerval Zanella Netto wrote: > > > On 05/01/23 20:17, Noah Goldstein wrote: >> On Mon, Sep 19, 2022 at 1:04 PM Adhemerval Zanella via Libc-alpha >> <libc-alpha@sourceware.org> wrote: >>> >>> New algorithm have the following key differences: >>> >>> - Reads first word unaligned and use string-maskoff function to >>> remove unwanted data. This strategy follow arch-specific >>> optimization used on aarch64 and powerpc. >>> >>> - Use string-fz{b,i} functions. >>> >>> Checked on x86_64-linux-gnu, i686-linux-gnu, powerpc64-linux-gnu, >>> and powerpc-linux-gnu by removing the arch-specific assembly >>> implementation and disabling multi-arch (it covers both LE and BE >>> for 64 and 32 bits). >>> >>> Co-authored-by: Richard Henderson <rth@twiddle.net> >>> --- >>> string/strchrnul.c | 156 +++--------------- >>> .../power4/multiarch/strchrnul-ppc32.c | 4 - >>> sysdeps/s390/strchrnul-c.c | 2 - >>> 3 files changed, 24 insertions(+), 138 deletions(-) >>> >>> diff --git a/string/strchrnul.c b/string/strchrnul.c >>> index 0cc1fc6bb0..67defa3dab 100644 >>> --- a/string/strchrnul.c >>> +++ b/string/strchrnul.c >>> @@ -1,10 +1,5 @@ >>> /* Copyright (C) 1991-2022 Free Software Foundation, Inc. >>> This file is part of the GNU C Library. >>> - Based on strlen implementation by Torbjorn Granlund (tege@sics.se), >>> - with help from Dan Sahlin (dan@sics.se) and >>> - bug fix and commentary by Jim Blandy (jimb@ai.mit.edu); >>> - adaptation to strchr suggested by Dick Karpinski (dick@cca.ucsf.edu), >>> - and implemented by Roland McGrath (roland@ai.mit.edu). >>> >>> The GNU C Library is free software; you can redistribute it and/or >>> modify it under the terms of the GNU Lesser General Public >>> @@ -21,146 +16,43 @@ >>> <https://www.gnu.org/licenses/>. */ >>> >>> #include <string.h> >>> -#include <memcopy.h> >>> #include <stdlib.h> >>> +#include <stdint.h> >>> +#include <string-fza.h> >>> +#include <string-fzb.h> >>> +#include <string-fzi.h> >>> +#include <string-maskoff.h> >>> >>> #undef __strchrnul >>> #undef strchrnul >>> >>> -#ifndef STRCHRNUL >>> -# define STRCHRNUL __strchrnul >>> +#ifdef STRCHRNUL >>> +# define __strchrnul STRCHRNUL >>> #endif >>> >>> /* Find the first occurrence of C in S or the final NUL byte. */ >>> char * >>> -STRCHRNUL (const char *s, int c_in) >>> +__strchrnul (const char *str, int c_in) >>> { >>> - const unsigned char *char_ptr; >>> - const unsigned long int *longword_ptr; >>> - unsigned long int longword, magic_bits, charmask; >>> - unsigned char c; >>> - >>> - c = (unsigned char) c_in; >>> - >>> - /* Handle the first few characters by reading one character at a time. >>> - Do this until CHAR_PTR is aligned on a longword boundary. */ >>> - for (char_ptr = (const unsigned char *) s; >>> - ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0; >>> - ++char_ptr) >>> - if (*char_ptr == c || *char_ptr == '\0') >>> - return (void *) char_ptr; >>> - >>> - /* All these elucidatory comments refer to 4-byte longwords, >>> - but the theory applies equally well to 8-byte longwords. */ >>> - >>> - longword_ptr = (unsigned long int *) char_ptr; >>> - >>> - /* Bits 31, 24, 16, and 8 of this number are zero. Call these bits >>> - the "holes." Note that there is a hole just to the left of >>> - each byte, with an extra at the end: >>> - >>> - bits: 01111110 11111110 11111110 11111111 >>> - bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD >>> - >>> - The 1-bits make sure that carries propagate to the next 0-bit. >>> - The 0-bits provide holes for carries to fall into. */ >>> - magic_bits = -1; >>> - magic_bits = magic_bits / 0xff * 0xfe << 1 >> 1 | 1; >>> - >>> - /* Set up a longword, each of whose bytes is C. */ >>> - charmask = c | (c << 8); >>> - charmask |= charmask << 16; >>> - if (sizeof (longword) > 4) >>> - /* Do the shift in two steps to avoid a warning if long has 32 bits. */ >>> - charmask |= (charmask << 16) << 16; >>> - if (sizeof (longword) > 8) >>> - abort (); >>> - >>> - /* Instead of the traditional loop which tests each character, >>> - we will test a longword at a time. The tricky part is testing >>> - if *any of the four* bytes in the longword in question are zero. */ >>> - for (;;) >>> - { >>> - /* We tentatively exit the loop if adding MAGIC_BITS to >>> - LONGWORD fails to change any of the hole bits of LONGWORD. >>> - >>> - 1) Is this safe? Will it catch all the zero bytes? >>> - Suppose there is a byte with all zeros. Any carry bits >>> - propagating from its left will fall into the hole at its >>> - least significant bit and stop. Since there will be no >>> - carry from its most significant bit, the LSB of the >>> - byte to the left will be unchanged, and the zero will be >>> - detected. >>> + /* Set up a word, each of whose bytes is C. */ >>> + op_t repeated_c = repeat_bytes (c_in); >>> >>> - 2) Is this worthwhile? Will it ignore everything except >>> - zero bytes? Suppose every byte of LONGWORD has a bit set >>> - somewhere. There will be a carry into bit 8. If bit 8 >>> - is set, this will carry into bit 16. If bit 8 is clear, >>> - one of bits 9-15 must be set, so there will be a carry >>> - into bit 16. Similarly, there will be a carry into bit >>> - 24. If one of bits 24-30 is set, there will be a carry >>> - into bit 31, so all of the hole bits will be changed. >>> + /* Align the input address to op_t. */ >>> + uintptr_t s_int = (uintptr_t) str; >>> + const op_t *word_ptr = word_containing (str); >>> >>> - The one misfire occurs when bits 24-30 are clear and bit >>> - 31 is set; in this case, the hole at bit 31 is not >>> - changed. If we had access to the processor carry flag, >>> - we could close this loophole by putting the fourth hole >>> - at bit 32! >>> + /* Read the first aligned word, but force bytes before the string to >>> + match neither zero nor goal (we make sure the high bit of each byte >>> + is 1, and the low 7 bits are all the opposite of the goal byte). */ >>> + op_t bmask = create_mask (s_int); >>> + op_t word = (*word_ptr | bmask) ^ (repeated_c & highbit_mask (bmask)); >> >> Think much clearer (and probably better codegen) is: >> find_zero_eq_low/all(word, repeated) >> (s_int * CHAR_BIT) > > It does not seem to work, at least not replacing the two lines with: > > op_t word = find_zero_eq_all/low (*word_ptr, repeated_c) >> (s_int * CHAR_BIT); > > The loader itself can not loader anything (which means strchr is failing > somewhere). You'd need to update the extract arithmetic too. I'm sure it's still using ctz and the aligned pointer. Which might be why I used the slightly more complex arithmetic here, so that the tail of the function didn't need changing... r~
On Mon, Jan 9, 2023 at 12:35 PM Adhemerval Zanella Netto <adhemerval.zanella@linaro.org> wrote: > > > > On 05/01/23 20:17, Noah Goldstein wrote: > > On Mon, Sep 19, 2022 at 1:04 PM Adhemerval Zanella via Libc-alpha > > <libc-alpha@sourceware.org> wrote: > >> > >> New algorithm have the following key differences: > >> > >> - Reads first word unaligned and use string-maskoff function to > >> remove unwanted data. This strategy follow arch-specific > >> optimization used on aarch64 and powerpc. > >> > >> - Use string-fz{b,i} functions. > >> > >> Checked on x86_64-linux-gnu, i686-linux-gnu, powerpc64-linux-gnu, > >> and powerpc-linux-gnu by removing the arch-specific assembly > >> implementation and disabling multi-arch (it covers both LE and BE > >> for 64 and 32 bits). > >> > >> Co-authored-by: Richard Henderson <rth@twiddle.net> > >> --- > >> string/strchrnul.c | 156 +++--------------- > >> .../power4/multiarch/strchrnul-ppc32.c | 4 - > >> sysdeps/s390/strchrnul-c.c | 2 - > >> 3 files changed, 24 insertions(+), 138 deletions(-) > >> > >> diff --git a/string/strchrnul.c b/string/strchrnul.c > >> index 0cc1fc6bb0..67defa3dab 100644 > >> --- a/string/strchrnul.c > >> +++ b/string/strchrnul.c > >> @@ -1,10 +1,5 @@ > >> /* Copyright (C) 1991-2022 Free Software Foundation, Inc. > >> This file is part of the GNU C Library. > >> - Based on strlen implementation by Torbjorn Granlund (tege@sics.se), > >> - with help from Dan Sahlin (dan@sics.se) and > >> - bug fix and commentary by Jim Blandy (jimb@ai.mit.edu); > >> - adaptation to strchr suggested by Dick Karpinski (dick@cca.ucsf.edu), > >> - and implemented by Roland McGrath (roland@ai.mit.edu). > >> > >> The GNU C Library is free software; you can redistribute it and/or > >> modify it under the terms of the GNU Lesser General Public > >> @@ -21,146 +16,43 @@ > >> <https://www.gnu.org/licenses/>. */ > >> > >> #include <string.h> > >> -#include <memcopy.h> > >> #include <stdlib.h> > >> +#include <stdint.h> > >> +#include <string-fza.h> > >> +#include <string-fzb.h> > >> +#include <string-fzi.h> > >> +#include <string-maskoff.h> > >> > >> #undef __strchrnul > >> #undef strchrnul > >> > >> -#ifndef STRCHRNUL > >> -# define STRCHRNUL __strchrnul > >> +#ifdef STRCHRNUL > >> +# define __strchrnul STRCHRNUL > >> #endif > >> > >> /* Find the first occurrence of C in S or the final NUL byte. */ > >> char * > >> -STRCHRNUL (const char *s, int c_in) > >> +__strchrnul (const char *str, int c_in) > >> { > >> - const unsigned char *char_ptr; > >> - const unsigned long int *longword_ptr; > >> - unsigned long int longword, magic_bits, charmask; > >> - unsigned char c; > >> - > >> - c = (unsigned char) c_in; > >> - > >> - /* Handle the first few characters by reading one character at a time. > >> - Do this until CHAR_PTR is aligned on a longword boundary. */ > >> - for (char_ptr = (const unsigned char *) s; > >> - ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0; > >> - ++char_ptr) > >> - if (*char_ptr == c || *char_ptr == '\0') > >> - return (void *) char_ptr; > >> - > >> - /* All these elucidatory comments refer to 4-byte longwords, > >> - but the theory applies equally well to 8-byte longwords. */ > >> - > >> - longword_ptr = (unsigned long int *) char_ptr; > >> - > >> - /* Bits 31, 24, 16, and 8 of this number are zero. Call these bits > >> - the "holes." Note that there is a hole just to the left of > >> - each byte, with an extra at the end: > >> - > >> - bits: 01111110 11111110 11111110 11111111 > >> - bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD > >> - > >> - The 1-bits make sure that carries propagate to the next 0-bit. > >> - The 0-bits provide holes for carries to fall into. */ > >> - magic_bits = -1; > >> - magic_bits = magic_bits / 0xff * 0xfe << 1 >> 1 | 1; > >> - > >> - /* Set up a longword, each of whose bytes is C. */ > >> - charmask = c | (c << 8); > >> - charmask |= charmask << 16; > >> - if (sizeof (longword) > 4) > >> - /* Do the shift in two steps to avoid a warning if long has 32 bits. */ > >> - charmask |= (charmask << 16) << 16; > >> - if (sizeof (longword) > 8) > >> - abort (); > >> - > >> - /* Instead of the traditional loop which tests each character, > >> - we will test a longword at a time. The tricky part is testing > >> - if *any of the four* bytes in the longword in question are zero. */ > >> - for (;;) > >> - { > >> - /* We tentatively exit the loop if adding MAGIC_BITS to > >> - LONGWORD fails to change any of the hole bits of LONGWORD. > >> - > >> - 1) Is this safe? Will it catch all the zero bytes? > >> - Suppose there is a byte with all zeros. Any carry bits > >> - propagating from its left will fall into the hole at its > >> - least significant bit and stop. Since there will be no > >> - carry from its most significant bit, the LSB of the > >> - byte to the left will be unchanged, and the zero will be > >> - detected. > >> + /* Set up a word, each of whose bytes is C. */ > >> + op_t repeated_c = repeat_bytes (c_in); > >> > >> - 2) Is this worthwhile? Will it ignore everything except > >> - zero bytes? Suppose every byte of LONGWORD has a bit set > >> - somewhere. There will be a carry into bit 8. If bit 8 > >> - is set, this will carry into bit 16. If bit 8 is clear, > >> - one of bits 9-15 must be set, so there will be a carry > >> - into bit 16. Similarly, there will be a carry into bit > >> - 24. If one of bits 24-30 is set, there will be a carry > >> - into bit 31, so all of the hole bits will be changed. > >> + /* Align the input address to op_t. */ > >> + uintptr_t s_int = (uintptr_t) str; > >> + const op_t *word_ptr = word_containing (str); > >> > >> - The one misfire occurs when bits 24-30 are clear and bit > >> - 31 is set; in this case, the hole at bit 31 is not > >> - changed. If we had access to the processor carry flag, > >> - we could close this loophole by putting the fourth hole > >> - at bit 32! > >> + /* Read the first aligned word, but force bytes before the string to > >> + match neither zero nor goal (we make sure the high bit of each byte > >> + is 1, and the low 7 bits are all the opposite of the goal byte). */ > >> + op_t bmask = create_mask (s_int); > >> + op_t word = (*word_ptr | bmask) ^ (repeated_c & highbit_mask (bmask)); > > > > Think much clearer (and probably better codegen) is: > > find_zero_eq_low/all(word, repeated) >> (s_int * CHAR_BIT) > > It does not seem to work, at least not replacing the two lines with: > > op_t word = find_zero_eq_all/low (*word_ptr, repeated_c) >> (s_int * CHAR_BIT); > > The loader itself can not loader anything (which means strchr is failing > somewhere). The following works for me (only checking test-strchrnul, so maybe its missing a case), also as I have it here its only little endian. (Would need an ifdef here or API for shifting out out-of-bounds bits for cross platform). ``` op_t word = *word_ptr; op_t mask = find_zero_eq_low(word, repeated_c) >> (CHAR_BIT * (s_int % sizeof(uintptr_t))); if(mask) { return (char *) str + index_first_(mask); } while (! has_zero_eq (word, repeated_c)) word = *++word_ptr; op_t found = index_first_zero_eq (word, repeated_c); return (char *) (word_ptr) + found; ```
On Mon, Jan 9, 2023 at 12:59 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Mon, Jan 9, 2023 at 12:35 PM Adhemerval Zanella Netto > <adhemerval.zanella@linaro.org> wrote: > > > > > > > > On 05/01/23 20:17, Noah Goldstein wrote: > > > On Mon, Sep 19, 2022 at 1:04 PM Adhemerval Zanella via Libc-alpha > > > <libc-alpha@sourceware.org> wrote: > > >> > > >> New algorithm have the following key differences: > > >> > > >> - Reads first word unaligned and use string-maskoff function to > > >> remove unwanted data. This strategy follow arch-specific > > >> optimization used on aarch64 and powerpc. > > >> > > >> - Use string-fz{b,i} functions. > > >> > > >> Checked on x86_64-linux-gnu, i686-linux-gnu, powerpc64-linux-gnu, > > >> and powerpc-linux-gnu by removing the arch-specific assembly > > >> implementation and disabling multi-arch (it covers both LE and BE > > >> for 64 and 32 bits). > > >> > > >> Co-authored-by: Richard Henderson <rth@twiddle.net> > > >> --- > > >> string/strchrnul.c | 156 +++--------------- > > >> .../power4/multiarch/strchrnul-ppc32.c | 4 - > > >> sysdeps/s390/strchrnul-c.c | 2 - > > >> 3 files changed, 24 insertions(+), 138 deletions(-) > > >> > > >> diff --git a/string/strchrnul.c b/string/strchrnul.c > > >> index 0cc1fc6bb0..67defa3dab 100644 > > >> --- a/string/strchrnul.c > > >> +++ b/string/strchrnul.c > > >> @@ -1,10 +1,5 @@ > > >> /* Copyright (C) 1991-2022 Free Software Foundation, Inc. > > >> This file is part of the GNU C Library. > > >> - Based on strlen implementation by Torbjorn Granlund (tege@sics.se), > > >> - with help from Dan Sahlin (dan@sics.se) and > > >> - bug fix and commentary by Jim Blandy (jimb@ai.mit.edu); > > >> - adaptation to strchr suggested by Dick Karpinski (dick@cca.ucsf.edu), > > >> - and implemented by Roland McGrath (roland@ai.mit.edu). > > >> > > >> The GNU C Library is free software; you can redistribute it and/or > > >> modify it under the terms of the GNU Lesser General Public > > >> @@ -21,146 +16,43 @@ > > >> <https://www.gnu.org/licenses/>. */ > > >> > > >> #include <string.h> > > >> -#include <memcopy.h> > > >> #include <stdlib.h> > > >> +#include <stdint.h> > > >> +#include <string-fza.h> > > >> +#include <string-fzb.h> > > >> +#include <string-fzi.h> > > >> +#include <string-maskoff.h> > > >> > > >> #undef __strchrnul > > >> #undef strchrnul > > >> > > >> -#ifndef STRCHRNUL > > >> -# define STRCHRNUL __strchrnul > > >> +#ifdef STRCHRNUL > > >> +# define __strchrnul STRCHRNUL > > >> #endif > > >> > > >> /* Find the first occurrence of C in S or the final NUL byte. */ > > >> char * > > >> -STRCHRNUL (const char *s, int c_in) > > >> +__strchrnul (const char *str, int c_in) > > >> { > > >> - const unsigned char *char_ptr; > > >> - const unsigned long int *longword_ptr; > > >> - unsigned long int longword, magic_bits, charmask; > > >> - unsigned char c; > > >> - > > >> - c = (unsigned char) c_in; > > >> - > > >> - /* Handle the first few characters by reading one character at a time. > > >> - Do this until CHAR_PTR is aligned on a longword boundary. */ > > >> - for (char_ptr = (const unsigned char *) s; > > >> - ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0; > > >> - ++char_ptr) > > >> - if (*char_ptr == c || *char_ptr == '\0') > > >> - return (void *) char_ptr; > > >> - > > >> - /* All these elucidatory comments refer to 4-byte longwords, > > >> - but the theory applies equally well to 8-byte longwords. */ > > >> - > > >> - longword_ptr = (unsigned long int *) char_ptr; > > >> - > > >> - /* Bits 31, 24, 16, and 8 of this number are zero. Call these bits > > >> - the "holes." Note that there is a hole just to the left of > > >> - each byte, with an extra at the end: > > >> - > > >> - bits: 01111110 11111110 11111110 11111111 > > >> - bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD > > >> - > > >> - The 1-bits make sure that carries propagate to the next 0-bit. > > >> - The 0-bits provide holes for carries to fall into. */ > > >> - magic_bits = -1; > > >> - magic_bits = magic_bits / 0xff * 0xfe << 1 >> 1 | 1; > > >> - > > >> - /* Set up a longword, each of whose bytes is C. */ > > >> - charmask = c | (c << 8); > > >> - charmask |= charmask << 16; > > >> - if (sizeof (longword) > 4) > > >> - /* Do the shift in two steps to avoid a warning if long has 32 bits. */ > > >> - charmask |= (charmask << 16) << 16; > > >> - if (sizeof (longword) > 8) > > >> - abort (); > > >> - > > >> - /* Instead of the traditional loop which tests each character, > > >> - we will test a longword at a time. The tricky part is testing > > >> - if *any of the four* bytes in the longword in question are zero. */ > > >> - for (;;) > > >> - { > > >> - /* We tentatively exit the loop if adding MAGIC_BITS to > > >> - LONGWORD fails to change any of the hole bits of LONGWORD. > > >> - > > >> - 1) Is this safe? Will it catch all the zero bytes? > > >> - Suppose there is a byte with all zeros. Any carry bits > > >> - propagating from its left will fall into the hole at its > > >> - least significant bit and stop. Since there will be no > > >> - carry from its most significant bit, the LSB of the > > >> - byte to the left will be unchanged, and the zero will be > > >> - detected. > > >> + /* Set up a word, each of whose bytes is C. */ > > >> + op_t repeated_c = repeat_bytes (c_in); > > >> > > >> - 2) Is this worthwhile? Will it ignore everything except > > >> - zero bytes? Suppose every byte of LONGWORD has a bit set > > >> - somewhere. There will be a carry into bit 8. If bit 8 > > >> - is set, this will carry into bit 16. If bit 8 is clear, > > >> - one of bits 9-15 must be set, so there will be a carry > > >> - into bit 16. Similarly, there will be a carry into bit > > >> - 24. If one of bits 24-30 is set, there will be a carry > > >> - into bit 31, so all of the hole bits will be changed. > > >> + /* Align the input address to op_t. */ > > >> + uintptr_t s_int = (uintptr_t) str; > > >> + const op_t *word_ptr = word_containing (str); > > >> > > >> - The one misfire occurs when bits 24-30 are clear and bit > > >> - 31 is set; in this case, the hole at bit 31 is not > > >> - changed. If we had access to the processor carry flag, > > >> - we could close this loophole by putting the fourth hole > > >> - at bit 32! > > >> + /* Read the first aligned word, but force bytes before the string to > > >> + match neither zero nor goal (we make sure the high bit of each byte > > >> + is 1, and the low 7 bits are all the opposite of the goal byte). */ > > >> + op_t bmask = create_mask (s_int); > > >> + op_t word = (*word_ptr | bmask) ^ (repeated_c & highbit_mask (bmask)); > > > > > > Think much clearer (and probably better codegen) is: > > > find_zero_eq_low/all(word, repeated) >> (s_int * CHAR_BIT) > > > > It does not seem to work, at least not replacing the two lines with: > > > > op_t word = find_zero_eq_all/low (*word_ptr, repeated_c) >> (s_int * CHAR_BIT); > > > > The loader itself can not loader anything (which means strchr is failing > > somewhere). > > The following works for me (only checking test-strchrnul, so maybe its missing > a case), also as I have it here its only little endian. (Would need an > ifdef here > or API for shifting out out-of-bounds bits for cross platform). > ``` > op_t word = *word_ptr; > op_t mask = find_zero_eq_low(word, repeated_c) >> (CHAR_BIT * (s_int > % sizeof(uintptr_t))); > if(mask) { > return (char *) str + index_first_(mask); > } > word = *++word_ptr; > while (! has_zero_eq (word, repeated_c)) > word = *++word_ptr; > > op_t found = index_first_zero_eq (word, repeated_c); > return (char *) (word_ptr) + found; > ```
On 1/9/23 12:35, Adhemerval Zanella Netto wrote: > > > On 05/01/23 20:17, Noah Goldstein wrote: >> On Mon, Sep 19, 2022 at 1:04 PM Adhemerval Zanella via Libc-alpha >> <libc-alpha@sourceware.org> wrote: >>> >>> New algorithm have the following key differences: >>> >>> - Reads first word unaligned and use string-maskoff function to >>> remove unwanted data. This strategy follow arch-specific >>> optimization used on aarch64 and powerpc. >>> >>> - Use string-fz{b,i} functions. >>> >>> Checked on x86_64-linux-gnu, i686-linux-gnu, powerpc64-linux-gnu, >>> and powerpc-linux-gnu by removing the arch-specific assembly >>> implementation and disabling multi-arch (it covers both LE and BE >>> for 64 and 32 bits). >>> >>> Co-authored-by: Richard Henderson <rth@twiddle.net> >>> --- >>> string/strchrnul.c | 156 +++--------------- >>> .../power4/multiarch/strchrnul-ppc32.c | 4 - >>> sysdeps/s390/strchrnul-c.c | 2 - >>> 3 files changed, 24 insertions(+), 138 deletions(-) >>> >>> diff --git a/string/strchrnul.c b/string/strchrnul.c >>> index 0cc1fc6bb0..67defa3dab 100644 >>> --- a/string/strchrnul.c >>> +++ b/string/strchrnul.c >>> @@ -1,10 +1,5 @@ >>> /* Copyright (C) 1991-2022 Free Software Foundation, Inc. >>> This file is part of the GNU C Library. >>> - Based on strlen implementation by Torbjorn Granlund (tege@sics.se), >>> - with help from Dan Sahlin (dan@sics.se) and >>> - bug fix and commentary by Jim Blandy (jimb@ai.mit.edu); >>> - adaptation to strchr suggested by Dick Karpinski (dick@cca.ucsf.edu), >>> - and implemented by Roland McGrath (roland@ai.mit.edu). >>> >>> The GNU C Library is free software; you can redistribute it and/or >>> modify it under the terms of the GNU Lesser General Public >>> @@ -21,146 +16,43 @@ >>> <https://www.gnu.org/licenses/>. */ >>> >>> #include <string.h> >>> -#include <memcopy.h> >>> #include <stdlib.h> >>> +#include <stdint.h> >>> +#include <string-fza.h> >>> +#include <string-fzb.h> >>> +#include <string-fzi.h> >>> +#include <string-maskoff.h> >>> >>> #undef __strchrnul >>> #undef strchrnul >>> >>> -#ifndef STRCHRNUL >>> -# define STRCHRNUL __strchrnul >>> +#ifdef STRCHRNUL >>> +# define __strchrnul STRCHRNUL >>> #endif >>> >>> /* Find the first occurrence of C in S or the final NUL byte. */ >>> char * >>> -STRCHRNUL (const char *s, int c_in) >>> +__strchrnul (const char *str, int c_in) >>> { >>> - const unsigned char *char_ptr; >>> - const unsigned long int *longword_ptr; >>> - unsigned long int longword, magic_bits, charmask; >>> - unsigned char c; >>> - >>> - c = (unsigned char) c_in; >>> - >>> - /* Handle the first few characters by reading one character at a time. >>> - Do this until CHAR_PTR is aligned on a longword boundary. */ >>> - for (char_ptr = (const unsigned char *) s; >>> - ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0; >>> - ++char_ptr) >>> - if (*char_ptr == c || *char_ptr == '\0') >>> - return (void *) char_ptr; >>> - >>> - /* All these elucidatory comments refer to 4-byte longwords, >>> - but the theory applies equally well to 8-byte longwords. */ >>> - >>> - longword_ptr = (unsigned long int *) char_ptr; >>> - >>> - /* Bits 31, 24, 16, and 8 of this number are zero. Call these bits >>> - the "holes." Note that there is a hole just to the left of >>> - each byte, with an extra at the end: >>> - >>> - bits: 01111110 11111110 11111110 11111111 >>> - bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD >>> - >>> - The 1-bits make sure that carries propagate to the next 0-bit. >>> - The 0-bits provide holes for carries to fall into. */ >>> - magic_bits = -1; >>> - magic_bits = magic_bits / 0xff * 0xfe << 1 >> 1 | 1; >>> - >>> - /* Set up a longword, each of whose bytes is C. */ >>> - charmask = c | (c << 8); >>> - charmask |= charmask << 16; >>> - if (sizeof (longword) > 4) >>> - /* Do the shift in two steps to avoid a warning if long has 32 bits. */ >>> - charmask |= (charmask << 16) << 16; >>> - if (sizeof (longword) > 8) >>> - abort (); >>> - >>> - /* Instead of the traditional loop which tests each character, >>> - we will test a longword at a time. The tricky part is testing >>> - if *any of the four* bytes in the longword in question are zero. */ >>> - for (;;) >>> - { >>> - /* We tentatively exit the loop if adding MAGIC_BITS to >>> - LONGWORD fails to change any of the hole bits of LONGWORD. >>> - >>> - 1) Is this safe? Will it catch all the zero bytes? >>> - Suppose there is a byte with all zeros. Any carry bits >>> - propagating from its left will fall into the hole at its >>> - least significant bit and stop. Since there will be no >>> - carry from its most significant bit, the LSB of the >>> - byte to the left will be unchanged, and the zero will be >>> - detected. >>> + /* Set up a word, each of whose bytes is C. */ >>> + op_t repeated_c = repeat_bytes (c_in); >>> >>> - 2) Is this worthwhile? Will it ignore everything except >>> - zero bytes? Suppose every byte of LONGWORD has a bit set >>> - somewhere. There will be a carry into bit 8. If bit 8 >>> - is set, this will carry into bit 16. If bit 8 is clear, >>> - one of bits 9-15 must be set, so there will be a carry >>> - into bit 16. Similarly, there will be a carry into bit >>> - 24. If one of bits 24-30 is set, there will be a carry >>> - into bit 31, so all of the hole bits will be changed. >>> + /* Align the input address to op_t. */ >>> + uintptr_t s_int = (uintptr_t) str; >>> + const op_t *word_ptr = word_containing (str); >>> >>> - The one misfire occurs when bits 24-30 are clear and bit >>> - 31 is set; in this case, the hole at bit 31 is not >>> - changed. If we had access to the processor carry flag, >>> - we could close this loophole by putting the fourth hole >>> - at bit 32! >>> + /* Read the first aligned word, but force bytes before the string to >>> + match neither zero nor goal (we make sure the high bit of each byte >>> + is 1, and the low 7 bits are all the opposite of the goal byte). */ >>> + op_t bmask = create_mask (s_int); >>> + op_t word = (*word_ptr | bmask) ^ (repeated_c & highbit_mask (bmask)); >> >> Think much clearer (and probably better codegen) is: >> find_zero_eq_low/all(word, repeated) >> (s_int * CHAR_BIT) > > It does not seem to work, at least not replacing the two lines with: > > op_t word = find_zero_eq_all/low (*word_ptr, repeated_c) >> (s_int * CHAR_BIT); Oh, two fine points: (1) big-endian would want shifting left, (2) alpha would want shifting by bits not bytes, because the cmpbge insn produces an 8-bit mask. so you'd need to hide this shift in the headers like create_mask(). r~
On 09/01/23 20:33, Richard Henderson wrote: > On 1/9/23 12:35, Adhemerval Zanella Netto wrote: >> >> >> On 05/01/23 20:17, Noah Goldstein wrote: >>> On Mon, Sep 19, 2022 at 1:04 PM Adhemerval Zanella via Libc-alpha >>> <libc-alpha@sourceware.org> wrote: >>>> >>>> New algorithm have the following key differences: >>>> >>>> - Reads first word unaligned and use string-maskoff function to >>>> remove unwanted data. This strategy follow arch-specific >>>> optimization used on aarch64 and powerpc. >>>> >>>> - Use string-fz{b,i} functions. >>>> >>>> Checked on x86_64-linux-gnu, i686-linux-gnu, powerpc64-linux-gnu, >>>> and powerpc-linux-gnu by removing the arch-specific assembly >>>> implementation and disabling multi-arch (it covers both LE and BE >>>> for 64 and 32 bits). >>>> >>>> Co-authored-by: Richard Henderson <rth@twiddle.net> >>>> --- >>>> string/strchrnul.c | 156 +++--------------- >>>> .../power4/multiarch/strchrnul-ppc32.c | 4 - >>>> sysdeps/s390/strchrnul-c.c | 2 - >>>> 3 files changed, 24 insertions(+), 138 deletions(-) >>>> >>>> diff --git a/string/strchrnul.c b/string/strchrnul.c >>>> index 0cc1fc6bb0..67defa3dab 100644 >>>> --- a/string/strchrnul.c >>>> +++ b/string/strchrnul.c >>>> @@ -1,10 +1,5 @@ >>>> /* Copyright (C) 1991-2022 Free Software Foundation, Inc. >>>> This file is part of the GNU C Library. >>>> - Based on strlen implementation by Torbjorn Granlund (tege@sics.se), >>>> - with help from Dan Sahlin (dan@sics.se) and >>>> - bug fix and commentary by Jim Blandy (jimb@ai.mit.edu); >>>> - adaptation to strchr suggested by Dick Karpinski (dick@cca.ucsf.edu), >>>> - and implemented by Roland McGrath (roland@ai.mit.edu). >>>> >>>> The GNU C Library is free software; you can redistribute it and/or >>>> modify it under the terms of the GNU Lesser General Public >>>> @@ -21,146 +16,43 @@ >>>> <https://www.gnu.org/licenses/>. */ >>>> >>>> #include <string.h> >>>> -#include <memcopy.h> >>>> #include <stdlib.h> >>>> +#include <stdint.h> >>>> +#include <string-fza.h> >>>> +#include <string-fzb.h> >>>> +#include <string-fzi.h> >>>> +#include <string-maskoff.h> >>>> >>>> #undef __strchrnul >>>> #undef strchrnul >>>> >>>> -#ifndef STRCHRNUL >>>> -# define STRCHRNUL __strchrnul >>>> +#ifdef STRCHRNUL >>>> +# define __strchrnul STRCHRNUL >>>> #endif >>>> >>>> /* Find the first occurrence of C in S or the final NUL byte. */ >>>> char * >>>> -STRCHRNUL (const char *s, int c_in) >>>> +__strchrnul (const char *str, int c_in) >>>> { >>>> - const unsigned char *char_ptr; >>>> - const unsigned long int *longword_ptr; >>>> - unsigned long int longword, magic_bits, charmask; >>>> - unsigned char c; >>>> - >>>> - c = (unsigned char) c_in; >>>> - >>>> - /* Handle the first few characters by reading one character at a time. >>>> - Do this until CHAR_PTR is aligned on a longword boundary. */ >>>> - for (char_ptr = (const unsigned char *) s; >>>> - ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0; >>>> - ++char_ptr) >>>> - if (*char_ptr == c || *char_ptr == '\0') >>>> - return (void *) char_ptr; >>>> - >>>> - /* All these elucidatory comments refer to 4-byte longwords, >>>> - but the theory applies equally well to 8-byte longwords. */ >>>> - >>>> - longword_ptr = (unsigned long int *) char_ptr; >>>> - >>>> - /* Bits 31, 24, 16, and 8 of this number are zero. Call these bits >>>> - the "holes." Note that there is a hole just to the left of >>>> - each byte, with an extra at the end: >>>> - >>>> - bits: 01111110 11111110 11111110 11111111 >>>> - bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD >>>> - >>>> - The 1-bits make sure that carries propagate to the next 0-bit. >>>> - The 0-bits provide holes for carries to fall into. */ >>>> - magic_bits = -1; >>>> - magic_bits = magic_bits / 0xff * 0xfe << 1 >> 1 | 1; >>>> - >>>> - /* Set up a longword, each of whose bytes is C. */ >>>> - charmask = c | (c << 8); >>>> - charmask |= charmask << 16; >>>> - if (sizeof (longword) > 4) >>>> - /* Do the shift in two steps to avoid a warning if long has 32 bits. */ >>>> - charmask |= (charmask << 16) << 16; >>>> - if (sizeof (longword) > 8) >>>> - abort (); >>>> - >>>> - /* Instead of the traditional loop which tests each character, >>>> - we will test a longword at a time. The tricky part is testing >>>> - if *any of the four* bytes in the longword in question are zero. */ >>>> - for (;;) >>>> - { >>>> - /* We tentatively exit the loop if adding MAGIC_BITS to >>>> - LONGWORD fails to change any of the hole bits of LONGWORD. >>>> - >>>> - 1) Is this safe? Will it catch all the zero bytes? >>>> - Suppose there is a byte with all zeros. Any carry bits >>>> - propagating from its left will fall into the hole at its >>>> - least significant bit and stop. Since there will be no >>>> - carry from its most significant bit, the LSB of the >>>> - byte to the left will be unchanged, and the zero will be >>>> - detected. >>>> + /* Set up a word, each of whose bytes is C. */ >>>> + op_t repeated_c = repeat_bytes (c_in); >>>> >>>> - 2) Is this worthwhile? Will it ignore everything except >>>> - zero bytes? Suppose every byte of LONGWORD has a bit set >>>> - somewhere. There will be a carry into bit 8. If bit 8 >>>> - is set, this will carry into bit 16. If bit 8 is clear, >>>> - one of bits 9-15 must be set, so there will be a carry >>>> - into bit 16. Similarly, there will be a carry into bit >>>> - 24. If one of bits 24-30 is set, there will be a carry >>>> - into bit 31, so all of the hole bits will be changed. >>>> + /* Align the input address to op_t. */ >>>> + uintptr_t s_int = (uintptr_t) str; >>>> + const op_t *word_ptr = word_containing (str); >>>> >>>> - The one misfire occurs when bits 24-30 are clear and bit >>>> - 31 is set; in this case, the hole at bit 31 is not >>>> - changed. If we had access to the processor carry flag, >>>> - we could close this loophole by putting the fourth hole >>>> - at bit 32! >>>> + /* Read the first aligned word, but force bytes before the string to >>>> + match neither zero nor goal (we make sure the high bit of each byte >>>> + is 1, and the low 7 bits are all the opposite of the goal byte). */ >>>> + op_t bmask = create_mask (s_int); >>>> + op_t word = (*word_ptr | bmask) ^ (repeated_c & highbit_mask (bmask)); >>> >>> Think much clearer (and probably better codegen) is: >>> find_zero_eq_low/all(word, repeated) >> (s_int * CHAR_BIT) >> >> It does not seem to work, at least not replacing the two lines with: >> >> op_t word = find_zero_eq_all/low (*word_ptr, repeated_c) >> (s_int * CHAR_BIT); > > Oh, two fine points: > > (1) big-endian would want shifting left, > (2) alpha would want shifting by bits not bytes, > because the cmpbge insn produces an 8-bit mask. > > so you'd need to hide this shift in the headers like create_mask(). Alright, the following works: static __always_inline op_t check_mask (op_t word, uintptr_t s_int) { if (__BYTE_ORDER == __LITTLE_ENDIAN) return word >> (CHAR_BIT * (s_int % sizeof (s_int))); else return word << (CHAR_BIT * (s_int % sizeof (s_int))); } char * __strchrnul (const char *str, int c_in) { op_t repeated_c = repeat_bytes (c_in); uintptr_t s_int = (uintptr_t) str; const op_t *word_ptr = word_containing (str); op_t word = *word_ptr; op_t mask = check_mask (find_zero_eq_all (word, repeated_c), s_int); if (mask != 0) return (char *) str + index_first_(mask); do word = *++word_ptr; while (! has_zero_eq (word, repeated_c)); op_t found = index_first_zero_eq (word, repeated_c); return (char *) word_ptr + found; } I had to use find_zero_eq_all to avoid uninitialized bytes, that triggered some regression on tests that use strchr (for instance test-strpbrk). I will update the patch based on this version.
On 1/10/23 06:18, Adhemerval Zanella Netto via Libc-alpha wrote: > static __always_inline op_t > check_mask (op_t word, uintptr_t s_int) > { > if (__BYTE_ORDER == __LITTLE_ENDIAN) > return word >> (CHAR_BIT * (s_int % sizeof (s_int))); > else > return word << (CHAR_BIT * (s_int % sizeof (s_int))); > } sizeof(op_t), which is usually the same size, but doesn't have to be. r~
On Tue, Jan 10, 2023 at 8:24 AM Richard Henderson <richard.henderson@linaro.org> wrote: > > On 1/10/23 06:18, Adhemerval Zanella Netto via Libc-alpha wrote: > > static __always_inline op_t > > check_mask (op_t word, uintptr_t s_int) > > { > > if (__BYTE_ORDER == __LITTLE_ENDIAN) > > return word >> (CHAR_BIT * (s_int % sizeof (s_int))); > > else > > return word << (CHAR_BIT * (s_int % sizeof (s_int))); > > } > > sizeof(op_t), which is usually the same size, but doesn't have to be. > Are we aligning by sizeof(op_t) or sizeof(void *)? The former then `word_containing` will also need to be changed. If the latter then sizeof(s_int) is correct. > > r~
On Tue, Jan 10, 2023 at 6:18 AM Adhemerval Zanella Netto <adhemerval.zanella@linaro.org> wrote: > > > > On 09/01/23 20:33, Richard Henderson wrote: > > On 1/9/23 12:35, Adhemerval Zanella Netto wrote: > >> > >> > >> On 05/01/23 20:17, Noah Goldstein wrote: > >>> On Mon, Sep 19, 2022 at 1:04 PM Adhemerval Zanella via Libc-alpha > >>> <libc-alpha@sourceware.org> wrote: > >>>> > >>>> New algorithm have the following key differences: > >>>> > >>>> - Reads first word unaligned and use string-maskoff function to > >>>> remove unwanted data. This strategy follow arch-specific > >>>> optimization used on aarch64 and powerpc. > >>>> > >>>> - Use string-fz{b,i} functions. > >>>> > >>>> Checked on x86_64-linux-gnu, i686-linux-gnu, powerpc64-linux-gnu, > >>>> and powerpc-linux-gnu by removing the arch-specific assembly > >>>> implementation and disabling multi-arch (it covers both LE and BE > >>>> for 64 and 32 bits). > >>>> > >>>> Co-authored-by: Richard Henderson <rth@twiddle.net> > >>>> --- > >>>> string/strchrnul.c | 156 +++--------------- > >>>> .../power4/multiarch/strchrnul-ppc32.c | 4 - > >>>> sysdeps/s390/strchrnul-c.c | 2 - > >>>> 3 files changed, 24 insertions(+), 138 deletions(-) > >>>> > >>>> diff --git a/string/strchrnul.c b/string/strchrnul.c > >>>> index 0cc1fc6bb0..67defa3dab 100644 > >>>> --- a/string/strchrnul.c > >>>> +++ b/string/strchrnul.c > >>>> @@ -1,10 +1,5 @@ > >>>> /* Copyright (C) 1991-2022 Free Software Foundation, Inc. > >>>> This file is part of the GNU C Library. > >>>> - Based on strlen implementation by Torbjorn Granlund (tege@sics.se), > >>>> - with help from Dan Sahlin (dan@sics.se) and > >>>> - bug fix and commentary by Jim Blandy (jimb@ai.mit.edu); > >>>> - adaptation to strchr suggested by Dick Karpinski (dick@cca.ucsf.edu), > >>>> - and implemented by Roland McGrath (roland@ai.mit.edu). > >>>> > >>>> The GNU C Library is free software; you can redistribute it and/or > >>>> modify it under the terms of the GNU Lesser General Public > >>>> @@ -21,146 +16,43 @@ > >>>> <https://www.gnu.org/licenses/>. */ > >>>> > >>>> #include <string.h> > >>>> -#include <memcopy.h> > >>>> #include <stdlib.h> > >>>> +#include <stdint.h> > >>>> +#include <string-fza.h> > >>>> +#include <string-fzb.h> > >>>> +#include <string-fzi.h> > >>>> +#include <string-maskoff.h> > >>>> > >>>> #undef __strchrnul > >>>> #undef strchrnul > >>>> > >>>> -#ifndef STRCHRNUL > >>>> -# define STRCHRNUL __strchrnul > >>>> +#ifdef STRCHRNUL > >>>> +# define __strchrnul STRCHRNUL > >>>> #endif > >>>> > >>>> /* Find the first occurrence of C in S or the final NUL byte. */ > >>>> char * > >>>> -STRCHRNUL (const char *s, int c_in) > >>>> +__strchrnul (const char *str, int c_in) > >>>> { > >>>> - const unsigned char *char_ptr; > >>>> - const unsigned long int *longword_ptr; > >>>> - unsigned long int longword, magic_bits, charmask; > >>>> - unsigned char c; > >>>> - > >>>> - c = (unsigned char) c_in; > >>>> - > >>>> - /* Handle the first few characters by reading one character at a time. > >>>> - Do this until CHAR_PTR is aligned on a longword boundary. */ > >>>> - for (char_ptr = (const unsigned char *) s; > >>>> - ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0; > >>>> - ++char_ptr) > >>>> - if (*char_ptr == c || *char_ptr == '\0') > >>>> - return (void *) char_ptr; > >>>> - > >>>> - /* All these elucidatory comments refer to 4-byte longwords, > >>>> - but the theory applies equally well to 8-byte longwords. */ > >>>> - > >>>> - longword_ptr = (unsigned long int *) char_ptr; > >>>> - > >>>> - /* Bits 31, 24, 16, and 8 of this number are zero. Call these bits > >>>> - the "holes." Note that there is a hole just to the left of > >>>> - each byte, with an extra at the end: > >>>> - > >>>> - bits: 01111110 11111110 11111110 11111111 > >>>> - bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD > >>>> - > >>>> - The 1-bits make sure that carries propagate to the next 0-bit. > >>>> - The 0-bits provide holes for carries to fall into. */ > >>>> - magic_bits = -1; > >>>> - magic_bits = magic_bits / 0xff * 0xfe << 1 >> 1 | 1; > >>>> - > >>>> - /* Set up a longword, each of whose bytes is C. */ > >>>> - charmask = c | (c << 8); > >>>> - charmask |= charmask << 16; > >>>> - if (sizeof (longword) > 4) > >>>> - /* Do the shift in two steps to avoid a warning if long has 32 bits. */ > >>>> - charmask |= (charmask << 16) << 16; > >>>> - if (sizeof (longword) > 8) > >>>> - abort (); > >>>> - > >>>> - /* Instead of the traditional loop which tests each character, > >>>> - we will test a longword at a time. The tricky part is testing > >>>> - if *any of the four* bytes in the longword in question are zero. */ > >>>> - for (;;) > >>>> - { > >>>> - /* We tentatively exit the loop if adding MAGIC_BITS to > >>>> - LONGWORD fails to change any of the hole bits of LONGWORD. > >>>> - > >>>> - 1) Is this safe? Will it catch all the zero bytes? > >>>> - Suppose there is a byte with all zeros. Any carry bits > >>>> - propagating from its left will fall into the hole at its > >>>> - least significant bit and stop. Since there will be no > >>>> - carry from its most significant bit, the LSB of the > >>>> - byte to the left will be unchanged, and the zero will be > >>>> - detected. > >>>> + /* Set up a word, each of whose bytes is C. */ > >>>> + op_t repeated_c = repeat_bytes (c_in); > >>>> > >>>> - 2) Is this worthwhile? Will it ignore everything except > >>>> - zero bytes? Suppose every byte of LONGWORD has a bit set > >>>> - somewhere. There will be a carry into bit 8. If bit 8 > >>>> - is set, this will carry into bit 16. If bit 8 is clear, > >>>> - one of bits 9-15 must be set, so there will be a carry > >>>> - into bit 16. Similarly, there will be a carry into bit > >>>> - 24. If one of bits 24-30 is set, there will be a carry > >>>> - into bit 31, so all of the hole bits will be changed. > >>>> + /* Align the input address to op_t. */ > >>>> + uintptr_t s_int = (uintptr_t) str; > >>>> + const op_t *word_ptr = word_containing (str); > >>>> > >>>> - The one misfire occurs when bits 24-30 are clear and bit > >>>> - 31 is set; in this case, the hole at bit 31 is not > >>>> - changed. If we had access to the processor carry flag, > >>>> - we could close this loophole by putting the fourth hole > >>>> - at bit 32! > >>>> + /* Read the first aligned word, but force bytes before the string to > >>>> + match neither zero nor goal (we make sure the high bit of each byte > >>>> + is 1, and the low 7 bits are all the opposite of the goal byte). */ > >>>> + op_t bmask = create_mask (s_int); > >>>> + op_t word = (*word_ptr | bmask) ^ (repeated_c & highbit_mask (bmask)); > >>> > >>> Think much clearer (and probably better codegen) is: > >>> find_zero_eq_low/all(word, repeated) >> (s_int * CHAR_BIT) > >> > >> It does not seem to work, at least not replacing the two lines with: > >> > >> op_t word = find_zero_eq_all/low (*word_ptr, repeated_c) >> (s_int * CHAR_BIT); > > > > Oh, two fine points: > > > > (1) big-endian would want shifting left, > > (2) alpha would want shifting by bits not bytes, > > because the cmpbge insn produces an 8-bit mask. > > > > so you'd need to hide this shift in the headers like create_mask(). > > Alright, the following works: > > > static __always_inline op_t > check_mask (op_t word, uintptr_t s_int) > { > if (__BYTE_ORDER == __LITTLE_ENDIAN) > return word >> (CHAR_BIT * (s_int % sizeof (s_int))); > else > return word << (CHAR_BIT * (s_int % sizeof (s_int))); > } Imo put this in with "[PATCH v5 03/17] Add string-maskoff.h generic header" think may also be needed for memchr. > > char * > __strchrnul (const char *str, int c_in) > { > op_t repeated_c = repeat_bytes (c_in); > > uintptr_t s_int = (uintptr_t) str; > const op_t *word_ptr = word_containing (str); > > op_t word = *word_ptr; > > op_t mask = check_mask (find_zero_eq_all (word, repeated_c), s_int); > if (mask != 0) > return (char *) str + index_first_(mask); > > do > word = *++word_ptr; > while (! has_zero_eq (word, repeated_c)); > > op_t found = index_first_zero_eq (word, repeated_c); > return (char *) word_ptr + found; > } > > > I had to use find_zero_eq_all to avoid uninitialized bytes, that triggered > some regression on tests that use strchr (for instance test-strpbrk). > > I will update the patch based on this version.
On 10/01/23 14:17, Noah Goldstein wrote: > On Tue, Jan 10, 2023 at 6:18 AM Adhemerval Zanella Netto > <adhemerval.zanella@linaro.org> wrote: >> >> >> >> On 09/01/23 20:33, Richard Henderson wrote: >>> On 1/9/23 12:35, Adhemerval Zanella Netto wrote: >>>> >>>> >>>> On 05/01/23 20:17, Noah Goldstein wrote: >>>>> On Mon, Sep 19, 2022 at 1:04 PM Adhemerval Zanella via Libc-alpha >>>>> <libc-alpha@sourceware.org> wrote: >>>>>> >>>>>> New algorithm have the following key differences: >>>>>> >>>>>> - Reads first word unaligned and use string-maskoff function to >>>>>> remove unwanted data. This strategy follow arch-specific >>>>>> optimization used on aarch64 and powerpc. >>>>>> >>>>>> - Use string-fz{b,i} functions. >>>>>> >>>>>> Checked on x86_64-linux-gnu, i686-linux-gnu, powerpc64-linux-gnu, >>>>>> and powerpc-linux-gnu by removing the arch-specific assembly >>>>>> implementation and disabling multi-arch (it covers both LE and BE >>>>>> for 64 and 32 bits). >>>>>> >>>>>> Co-authored-by: Richard Henderson <rth@twiddle.net> >>>>>> --- >>>>>> string/strchrnul.c | 156 +++--------------- >>>>>> .../power4/multiarch/strchrnul-ppc32.c | 4 - >>>>>> sysdeps/s390/strchrnul-c.c | 2 - >>>>>> 3 files changed, 24 insertions(+), 138 deletions(-) >>>>>> >>>>>> diff --git a/string/strchrnul.c b/string/strchrnul.c >>>>>> index 0cc1fc6bb0..67defa3dab 100644 >>>>>> --- a/string/strchrnul.c >>>>>> +++ b/string/strchrnul.c >>>>>> @@ -1,10 +1,5 @@ >>>>>> /* Copyright (C) 1991-2022 Free Software Foundation, Inc. >>>>>> This file is part of the GNU C Library. >>>>>> - Based on strlen implementation by Torbjorn Granlund (tege@sics.se), >>>>>> - with help from Dan Sahlin (dan@sics.se) and >>>>>> - bug fix and commentary by Jim Blandy (jimb@ai.mit.edu); >>>>>> - adaptation to strchr suggested by Dick Karpinski (dick@cca.ucsf.edu), >>>>>> - and implemented by Roland McGrath (roland@ai.mit.edu). >>>>>> >>>>>> The GNU C Library is free software; you can redistribute it and/or >>>>>> modify it under the terms of the GNU Lesser General Public >>>>>> @@ -21,146 +16,43 @@ >>>>>> <https://www.gnu.org/licenses/>. */ >>>>>> >>>>>> #include <string.h> >>>>>> -#include <memcopy.h> >>>>>> #include <stdlib.h> >>>>>> +#include <stdint.h> >>>>>> +#include <string-fza.h> >>>>>> +#include <string-fzb.h> >>>>>> +#include <string-fzi.h> >>>>>> +#include <string-maskoff.h> >>>>>> >>>>>> #undef __strchrnul >>>>>> #undef strchrnul >>>>>> >>>>>> -#ifndef STRCHRNUL >>>>>> -# define STRCHRNUL __strchrnul >>>>>> +#ifdef STRCHRNUL >>>>>> +# define __strchrnul STRCHRNUL >>>>>> #endif >>>>>> >>>>>> /* Find the first occurrence of C in S or the final NUL byte. */ >>>>>> char * >>>>>> -STRCHRNUL (const char *s, int c_in) >>>>>> +__strchrnul (const char *str, int c_in) >>>>>> { >>>>>> - const unsigned char *char_ptr; >>>>>> - const unsigned long int *longword_ptr; >>>>>> - unsigned long int longword, magic_bits, charmask; >>>>>> - unsigned char c; >>>>>> - >>>>>> - c = (unsigned char) c_in; >>>>>> - >>>>>> - /* Handle the first few characters by reading one character at a time. >>>>>> - Do this until CHAR_PTR is aligned on a longword boundary. */ >>>>>> - for (char_ptr = (const unsigned char *) s; >>>>>> - ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0; >>>>>> - ++char_ptr) >>>>>> - if (*char_ptr == c || *char_ptr == '\0') >>>>>> - return (void *) char_ptr; >>>>>> - >>>>>> - /* All these elucidatory comments refer to 4-byte longwords, >>>>>> - but the theory applies equally well to 8-byte longwords. */ >>>>>> - >>>>>> - longword_ptr = (unsigned long int *) char_ptr; >>>>>> - >>>>>> - /* Bits 31, 24, 16, and 8 of this number are zero. Call these bits >>>>>> - the "holes." Note that there is a hole just to the left of >>>>>> - each byte, with an extra at the end: >>>>>> - >>>>>> - bits: 01111110 11111110 11111110 11111111 >>>>>> - bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD >>>>>> - >>>>>> - The 1-bits make sure that carries propagate to the next 0-bit. >>>>>> - The 0-bits provide holes for carries to fall into. */ >>>>>> - magic_bits = -1; >>>>>> - magic_bits = magic_bits / 0xff * 0xfe << 1 >> 1 | 1; >>>>>> - >>>>>> - /* Set up a longword, each of whose bytes is C. */ >>>>>> - charmask = c | (c << 8); >>>>>> - charmask |= charmask << 16; >>>>>> - if (sizeof (longword) > 4) >>>>>> - /* Do the shift in two steps to avoid a warning if long has 32 bits. */ >>>>>> - charmask |= (charmask << 16) << 16; >>>>>> - if (sizeof (longword) > 8) >>>>>> - abort (); >>>>>> - >>>>>> - /* Instead of the traditional loop which tests each character, >>>>>> - we will test a longword at a time. The tricky part is testing >>>>>> - if *any of the four* bytes in the longword in question are zero. */ >>>>>> - for (;;) >>>>>> - { >>>>>> - /* We tentatively exit the loop if adding MAGIC_BITS to >>>>>> - LONGWORD fails to change any of the hole bits of LONGWORD. >>>>>> - >>>>>> - 1) Is this safe? Will it catch all the zero bytes? >>>>>> - Suppose there is a byte with all zeros. Any carry bits >>>>>> - propagating from its left will fall into the hole at its >>>>>> - least significant bit and stop. Since there will be no >>>>>> - carry from its most significant bit, the LSB of the >>>>>> - byte to the left will be unchanged, and the zero will be >>>>>> - detected. >>>>>> + /* Set up a word, each of whose bytes is C. */ >>>>>> + op_t repeated_c = repeat_bytes (c_in); >>>>>> >>>>>> - 2) Is this worthwhile? Will it ignore everything except >>>>>> - zero bytes? Suppose every byte of LONGWORD has a bit set >>>>>> - somewhere. There will be a carry into bit 8. If bit 8 >>>>>> - is set, this will carry into bit 16. If bit 8 is clear, >>>>>> - one of bits 9-15 must be set, so there will be a carry >>>>>> - into bit 16. Similarly, there will be a carry into bit >>>>>> - 24. If one of bits 24-30 is set, there will be a carry >>>>>> - into bit 31, so all of the hole bits will be changed. >>>>>> + /* Align the input address to op_t. */ >>>>>> + uintptr_t s_int = (uintptr_t) str; >>>>>> + const op_t *word_ptr = word_containing (str); >>>>>> >>>>>> - The one misfire occurs when bits 24-30 are clear and bit >>>>>> - 31 is set; in this case, the hole at bit 31 is not >>>>>> - changed. If we had access to the processor carry flag, >>>>>> - we could close this loophole by putting the fourth hole >>>>>> - at bit 32! >>>>>> + /* Read the first aligned word, but force bytes before the string to >>>>>> + match neither zero nor goal (we make sure the high bit of each byte >>>>>> + is 1, and the low 7 bits are all the opposite of the goal byte). */ >>>>>> + op_t bmask = create_mask (s_int); >>>>>> + op_t word = (*word_ptr | bmask) ^ (repeated_c & highbit_mask (bmask)); >>>>> >>>>> Think much clearer (and probably better codegen) is: >>>>> find_zero_eq_low/all(word, repeated) >> (s_int * CHAR_BIT) >>>> >>>> It does not seem to work, at least not replacing the two lines with: >>>> >>>> op_t word = find_zero_eq_all/low (*word_ptr, repeated_c) >> (s_int * CHAR_BIT); >>> >>> Oh, two fine points: >>> >>> (1) big-endian would want shifting left, >>> (2) alpha would want shifting by bits not bytes, >>> because the cmpbge insn produces an 8-bit mask. >>> >>> so you'd need to hide this shift in the headers like create_mask(). >> >> Alright, the following works: >> >> >> static __always_inline op_t >> check_mask (op_t word, uintptr_t s_int) >> { >> if (__BYTE_ORDER == __LITTLE_ENDIAN) >> return word >> (CHAR_BIT * (s_int % sizeof (s_int))); >> else >> return word << (CHAR_BIT * (s_int % sizeof (s_int))); >> } > > Imo put this in with "[PATCH v5 03/17] Add string-maskoff.h generic header" > think may also be needed for memchr. Yeap, this is what I have done.
On 10/01/23 14:16, Noah Goldstein wrote: > On Tue, Jan 10, 2023 at 8:24 AM Richard Henderson > <richard.henderson@linaro.org> wrote: >> >> On 1/10/23 06:18, Adhemerval Zanella Netto via Libc-alpha wrote: >>> static __always_inline op_t >>> check_mask (op_t word, uintptr_t s_int) >>> { >>> if (__BYTE_ORDER == __LITTLE_ENDIAN) >>> return word >> (CHAR_BIT * (s_int % sizeof (s_int))); >>> else >>> return word << (CHAR_BIT * (s_int % sizeof (s_int))); >>> } >> >> sizeof(op_t), which is usually the same size, but doesn't have to be. >> > Are we aligning by sizeof(op_t) or sizeof(void *)? > The former then `word_containing` will also need to be changed. > > If the latter then sizeof(s_int) is correct. The read/write operation are being done by op_t, so it makes sense to use op_t on the sizeof. It would really matter on ABIs with op_t different than uintptr_t (mips64n32 and x32).
diff --git a/string/strchrnul.c b/string/strchrnul.c index 0cc1fc6bb0..67defa3dab 100644 --- a/string/strchrnul.c +++ b/string/strchrnul.c @@ -1,10 +1,5 @@ /* Copyright (C) 1991-2022 Free Software Foundation, Inc. This file is part of the GNU C Library. - Based on strlen implementation by Torbjorn Granlund (tege@sics.se), - with help from Dan Sahlin (dan@sics.se) and - bug fix and commentary by Jim Blandy (jimb@ai.mit.edu); - adaptation to strchr suggested by Dick Karpinski (dick@cca.ucsf.edu), - and implemented by Roland McGrath (roland@ai.mit.edu). The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -21,146 +16,43 @@ <https://www.gnu.org/licenses/>. */ #include <string.h> -#include <memcopy.h> #include <stdlib.h> +#include <stdint.h> +#include <string-fza.h> +#include <string-fzb.h> +#include <string-fzi.h> +#include <string-maskoff.h> #undef __strchrnul #undef strchrnul -#ifndef STRCHRNUL -# define STRCHRNUL __strchrnul +#ifdef STRCHRNUL +# define __strchrnul STRCHRNUL #endif /* Find the first occurrence of C in S or the final NUL byte. */ char * -STRCHRNUL (const char *s, int c_in) +__strchrnul (const char *str, int c_in) { - const unsigned char *char_ptr; - const unsigned long int *longword_ptr; - unsigned long int longword, magic_bits, charmask; - unsigned char c; - - c = (unsigned char) c_in; - - /* Handle the first few characters by reading one character at a time. - Do this until CHAR_PTR is aligned on a longword boundary. */ - for (char_ptr = (const unsigned char *) s; - ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0; - ++char_ptr) - if (*char_ptr == c || *char_ptr == '\0') - return (void *) char_ptr; - - /* All these elucidatory comments refer to 4-byte longwords, - but the theory applies equally well to 8-byte longwords. */ - - longword_ptr = (unsigned long int *) char_ptr; - - /* Bits 31, 24, 16, and 8 of this number are zero. Call these bits - the "holes." Note that there is a hole just to the left of - each byte, with an extra at the end: - - bits: 01111110 11111110 11111110 11111111 - bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD - - The 1-bits make sure that carries propagate to the next 0-bit. - The 0-bits provide holes for carries to fall into. */ - magic_bits = -1; - magic_bits = magic_bits / 0xff * 0xfe << 1 >> 1 | 1; - - /* Set up a longword, each of whose bytes is C. */ - charmask = c | (c << 8); - charmask |= charmask << 16; - if (sizeof (longword) > 4) - /* Do the shift in two steps to avoid a warning if long has 32 bits. */ - charmask |= (charmask << 16) << 16; - if (sizeof (longword) > 8) - abort (); - - /* Instead of the traditional loop which tests each character, - we will test a longword at a time. The tricky part is testing - if *any of the four* bytes in the longword in question are zero. */ - for (;;) - { - /* We tentatively exit the loop if adding MAGIC_BITS to - LONGWORD fails to change any of the hole bits of LONGWORD. - - 1) Is this safe? Will it catch all the zero bytes? - Suppose there is a byte with all zeros. Any carry bits - propagating from its left will fall into the hole at its - least significant bit and stop. Since there will be no - carry from its most significant bit, the LSB of the - byte to the left will be unchanged, and the zero will be - detected. + /* Set up a word, each of whose bytes is C. */ + op_t repeated_c = repeat_bytes (c_in); - 2) Is this worthwhile? Will it ignore everything except - zero bytes? Suppose every byte of LONGWORD has a bit set - somewhere. There will be a carry into bit 8. If bit 8 - is set, this will carry into bit 16. If bit 8 is clear, - one of bits 9-15 must be set, so there will be a carry - into bit 16. Similarly, there will be a carry into bit - 24. If one of bits 24-30 is set, there will be a carry - into bit 31, so all of the hole bits will be changed. + /* Align the input address to op_t. */ + uintptr_t s_int = (uintptr_t) str; + const op_t *word_ptr = word_containing (str); - The one misfire occurs when bits 24-30 are clear and bit - 31 is set; in this case, the hole at bit 31 is not - changed. If we had access to the processor carry flag, - we could close this loophole by putting the fourth hole - at bit 32! + /* Read the first aligned word, but force bytes before the string to + match neither zero nor goal (we make sure the high bit of each byte + is 1, and the low 7 bits are all the opposite of the goal byte). */ + op_t bmask = create_mask (s_int); + op_t word = (*word_ptr | bmask) ^ (repeated_c & highbit_mask (bmask)); - So it ignores everything except 128's, when they're aligned - properly. + while (! has_zero_eq (word, repeated_c)) + word = *++word_ptr; - 3) But wait! Aren't we looking for C as well as zero? - Good point. So what we do is XOR LONGWORD with a longword, - each of whose bytes is C. This turns each byte that is C - into a zero. */ - - longword = *longword_ptr++; - - /* Add MAGIC_BITS to LONGWORD. */ - if ((((longword + magic_bits) - - /* Set those bits that were unchanged by the addition. */ - ^ ~longword) - - /* Look at only the hole bits. If any of the hole bits - are unchanged, most likely one of the bytes was a - zero. */ - & ~magic_bits) != 0 - - /* That caught zeroes. Now test for C. */ - || ((((longword ^ charmask) + magic_bits) ^ ~(longword ^ charmask)) - & ~magic_bits) != 0) - { - /* Which of the bytes was C or zero? - If none of them were, it was a misfire; continue the search. */ - - const unsigned char *cp = (const unsigned char *) (longword_ptr - 1); - - if (*cp == c || *cp == '\0') - return (char *) cp; - if (*++cp == c || *cp == '\0') - return (char *) cp; - if (*++cp == c || *cp == '\0') - return (char *) cp; - if (*++cp == c || *cp == '\0') - return (char *) cp; - if (sizeof (longword) > 4) - { - if (*++cp == c || *cp == '\0') - return (char *) cp; - if (*++cp == c || *cp == '\0') - return (char *) cp; - if (*++cp == c || *cp == '\0') - return (char *) cp; - if (*++cp == c || *cp == '\0') - return (char *) cp; - } - } - } - - /* This should never happen. */ - return NULL; + op_t found = index_first_zero_eq (word, repeated_c); + return (char *) (word_ptr) + found; } - +#ifndef STRCHRNUL weak_alias (__strchrnul, strchrnul) +#endif diff --git a/sysdeps/powerpc/powerpc32/power4/multiarch/strchrnul-ppc32.c b/sysdeps/powerpc/powerpc32/power4/multiarch/strchrnul-ppc32.c index ed86b5e671..9c85e269f7 100644 --- a/sysdeps/powerpc/powerpc32/power4/multiarch/strchrnul-ppc32.c +++ b/sysdeps/powerpc/powerpc32/power4/multiarch/strchrnul-ppc32.c @@ -19,10 +19,6 @@ #include <string.h> #define STRCHRNUL __strchrnul_ppc - -#undef weak_alias -#define weak_alias(a,b ) - extern __typeof (strchrnul) __strchrnul_ppc attribute_hidden; #include <string/strchrnul.c> diff --git a/sysdeps/s390/strchrnul-c.c b/sysdeps/s390/strchrnul-c.c index 4ffac54edd..2ebbcc62f7 100644 --- a/sysdeps/s390/strchrnul-c.c +++ b/sysdeps/s390/strchrnul-c.c @@ -22,8 +22,6 @@ # if HAVE_STRCHRNUL_IFUNC # define STRCHRNUL STRCHRNUL_C # define __strchrnul STRCHRNUL -# undef weak_alias -# define weak_alias(name, alias) # endif # include <string/strchrnul.c>