Message ID | 20220607041134.2369903-7-goldstein.w.n@gmail.com
---|---
State | New |
Series | [v6,1/8] x86: Create header for VEC classes in x86 strings library
On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This is not meant as a performance optimization. The previous code was
> far too liberal in aligning targets and wasted code size unnecessarily.
>
> The total code size saving is: 59 bytes
>
> There are no major changes in the benchmarks.
> Geometric Mean of all benchmarks New / Old: 0.967
>
> Full xcheck passes on x86_64.
> ---
>  sysdeps/x86_64/multiarch/memchr-avx2-rtm.S |   1 +
>  sysdeps/x86_64/multiarch/memchr-avx2.S     | 109 +++++++++++----------
>  2 files changed, 60 insertions(+), 50 deletions(-)
>
> [diff snipped; it is reproduced in full at the bottom of the page]

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.
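One change worth calling out from the review is the switch in L(first_vec_x0) from cmovle to a plain branch to L(null). The comment added by the patch argues that this branch mirrors the NULL check that nearly every memchr caller performs anyway (a caller that knows CHAR is present would use rawmemchr), so the two branches predict together. A minimal C sketch of that typical call pattern follows; the scan helper, the buffer contents and the ';' separator are purely illustrative.

#include <stdio.h>
#include <string.h>

/* Typical memchr call site: the caller almost always branches on the
   result being NULL.  An in-memchr branch of the form "match offset
   past length -> return NULL" is therefore strongly correlated with
   this caller-side branch, which is the argument for preferring
   jle L(null) over cmovle in L(first_vec_x0).  */
static void
scan (const char *buf, size_t len)
{
  const char *p = memchr (buf, ';', len);
  if (p == NULL)	/* The user branch the patch comment refers to.  */
    puts ("no separator found");
  else
    printf ("separator at offset %td\n", p - buf);
}

int
main (void)
{
  scan ("key;value", 9);	/* Found: offset 3.  */
  scan ("plain", 5);		/* Not found: NULL.  */
  return 0;
}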
On Tue, Jun 7, 2022 at 11:19 AM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote:
> On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > [quoted patch snipped]
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil
On Wed, Jul 13, 2022 at 7:32 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> On Tue, Jun 7, 2022 at 11:19 AM H.J. Lu via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> > On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > [quoted patch snipped]
> >
> > LGTM.
> >
> > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> >
> > Thanks.
> >
> > --
> > H.J.
>
> I would like to backport this patch to release branches.
> Any comments or objections?

Probably best to squash with:
https://sourceware.org/git/?p=glibc.git;a=commit;h=2c9af8421d2b4a7fcce163e7bc81a118d22fd346

>
> --Sunil
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S index 87b076c7c4..c4d71938c5 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S @@ -2,6 +2,7 @@ # define MEMCHR __memchr_avx2_rtm #endif +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S index 75bd7262e0..28a01280ec 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S @@ -57,7 +57,7 @@ # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) .section SECTION(.text),"ax",@progbits -ENTRY (MEMCHR) +ENTRY_P2ALIGN (MEMCHR, 5) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ # ifdef __ILP32__ @@ -87,12 +87,14 @@ ENTRY (MEMCHR) # endif testl %eax, %eax jz L(aligned_more) - tzcntl %eax, %eax + bsfl %eax, %eax addq %rdi, %rax - VZEROUPPER_RETURN +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + # ifndef USE_AS_RAWMEMCHR - .p2align 5 + .p2align 4 L(first_vec_x0): /* Check if first match was before length. */ tzcntl %eax, %eax @@ -100,58 +102,31 @@ L(first_vec_x0): /* NB: Multiply length by 4 to get byte count. */ sall $2, %edx # endif - xorl %ecx, %ecx + COND_VZEROUPPER + /* Use branch instead of cmovcc so L(first_vec_x0) fits in one fetch + block. branch here as opposed to cmovcc is not that costly. Common + usage of memchr is to check if the return was NULL (if string was + known to contain CHAR user would use rawmemchr). This branch will be + highly correlated with the user branch and can be used by most + modern branch predictors to predict the user branch. */ cmpl %eax, %edx - leaq (%rdi, %rax), %rax - cmovle %rcx, %rax - VZEROUPPER_RETURN - -L(null): - xorl %eax, %eax - ret -# endif - .p2align 4 -L(cross_page_boundary): - /* Save pointer before aligning as its original value is - necessary for computer return address if byte is found or - adjusting length if it is not and this is memchr. */ - movq %rdi, %rcx - /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr - and rdi for rawmemchr. */ - orq $(VEC_SIZE - 1), %ALGN_PTR_REG - VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax -# ifndef USE_AS_RAWMEMCHR - /* Calculate length until end of page (length checked for a - match). */ - leaq 1(%ALGN_PTR_REG), %rsi - subq %RRAW_PTR_REG, %rsi -# ifdef USE_AS_WMEMCHR - /* NB: Divide bytes by 4 to get wchar_t count. */ - shrl $2, %esi -# endif -# endif - /* Remove the leading bytes. */ - sarxl %ERAW_PTR_REG, %eax, %eax -# ifndef USE_AS_RAWMEMCHR - /* Check the end of data. */ - cmpq %rsi, %rdx - jbe L(first_vec_x0) + jle L(null) + addq %rdi, %rax + ret # endif - testl %eax, %eax - jz L(cross_page_continue) - tzcntl %eax, %eax - addq %RRAW_PTR_REG, %rax -L(return_vzeroupper): - ZERO_UPPER_VEC_REGISTERS_RETURN - .p2align 4 + .p2align 4,, 10 L(first_vec_x1): - tzcntl %eax, %eax + bsfl %eax, %eax incq %rdi addq %rdi, %rax VZEROUPPER_RETURN - +# ifndef USE_AS_RAWMEMCHR + /* First in aligning bytes here. 
*/ +L(null): + xorl %eax, %eax + ret +# endif .p2align 4 L(first_vec_x2): tzcntl %eax, %eax @@ -340,7 +315,7 @@ L(first_vec_x1_check): incq %rdi addq %rdi, %rax VZEROUPPER_RETURN - .p2align 4 + .p2align 4,, 6 L(set_zero_end): xorl %eax, %eax VZEROUPPER_RETURN @@ -428,5 +403,39 @@ L(last_vec_x3): VZEROUPPER_RETURN # endif + .p2align 4 +L(cross_page_boundary): + /* Save pointer before aligning as its original value is necessary for + computer return address if byte is found or adjusting length if it + is not and this is memchr. */ + movq %rdi, %rcx + /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi for + rawmemchr. */ + andq $-VEC_SIZE, %ALGN_PTR_REG + VPCMPEQ (%ALGN_PTR_REG), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax +# ifndef USE_AS_RAWMEMCHR + /* Calculate length until end of page (length checked for a match). */ + leal VEC_SIZE(%ALGN_PTR_REG), %esi + subl %ERAW_PTR_REG, %esi +# ifdef USE_AS_WMEMCHR + /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %esi +# endif +# endif + /* Remove the leading bytes. */ + sarxl %ERAW_PTR_REG, %eax, %eax +# ifndef USE_AS_RAWMEMCHR + /* Check the end of data. */ + cmpq %rsi, %rdx + jbe L(first_vec_x0) +# endif + testl %eax, %eax + jz L(cross_page_continue) + bsfl %eax, %eax + addq %RRAW_PTR_REG, %rax + VZEROUPPER_RETURN + + END (MEMCHR) #endif
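For readers who find the flattened diff above hard to follow, here is a rough scalar sketch of what the relocated L(cross_page_boundary) block computes: align the pointer down to VEC_SIZE so the 32-byte load cannot cross a page, build a match mask for the whole aligned vector, shift off the bits that belong to bytes before the original pointer, and only then choose between the length-checked return, a found match, or falling through to the main loop. cross_page_model, continue_scan and the bit-building loop are illustrative stand-ins for the single VPCMPEQ/vpmovmskb/sarxl sequence in the real code, and only the byte (non-wmemchr) variant is modelled.

#include <stddef.h>
#include <stdint.h>

#define VEC_SIZE 32	/* AVX2 vector width assumed by memchr-avx2.S.  */

/* Scalar model of the first, possibly page-crossing vector.
   *CONTINUE_SCAN is set when the aligned main loop would take over
   (the asm's jz L(cross_page_continue)).  */
static const char *
cross_page_model (const char *s, int c, size_t len, int *continue_scan)
{
  uintptr_t addr = (uintptr_t) s;
  /* andq $-VEC_SIZE, %ALGN_PTR_REG: the aligned load cannot cross a page.  */
  const char *base = (const char *) (addr & -(uintptr_t) VEC_SIZE);
  unsigned int misalign = addr & (VEC_SIZE - 1);

  *continue_scan = 0;

  /* VPCMPEQ (%ALGN_PTR_REG), %ymm0, %ymm1 + vpmovmskb: one match bit
     per byte of the aligned 32-byte chunk.  */
  uint32_t mask = 0;
  for (unsigned int i = 0; i < VEC_SIZE; i++)
    if ((unsigned char) base[i] == (unsigned char) c)
      mask |= (uint32_t) 1 << i;

  /* "Remove the leading bytes": sarxl shifts by the low bits of the raw
     pointer so bit 0 now corresponds to s[0].  */
  mask >>= misalign;

  /* leal VEC_SIZE(%ALGN_PTR_REG), %esi; subl %ERAW_PTR_REG, %esi:
     number of bytes of S covered by this first vector.  */
  size_t covered = VEC_SIZE - misalign;

  if (len <= covered)
    {
      /* cmpq %rsi, %rdx; jbe L(first_vec_x0): a match only counts if it
	 lies before LEN.  */
      if (mask != 0 && (size_t) __builtin_ctz (mask) < len)
	return s + __builtin_ctz (mask);
      return NULL;
    }

  if (mask == 0)
    {
      /* jz L(cross_page_continue): no match yet, keep scanning.  */
      *continue_scan = 1;
      return NULL;
    }

  /* bsfl %eax, %eax; addq %RRAW_PTR_REG, %rax.  */
  return s + __builtin_ctz (mask);
}

The over-read of bytes before s is safe for the same reason it is safe in the assembly: the load starts at a 32-byte-aligned address, so it stays within the same page as s.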