Message ID | 20210202093956.2613423-1-goldstein.w.n@gmail.com
State      | New
Series     | [v3,1/2] x86: Refactor and improve performance of strchr-avx2.S
On Tue, Feb 2, 2021 at 1:40 AM <goldstein.w.n@gmail.com> wrote:
>
> From: noah <goldstein.w.n@gmail.com>
>
> No bug. Just seemed the performance could be improved a bit. Observed
> and expected behavior are unchanged. Optimized body of main
> loop. Updated page cross logic and optimized accordingly. Made a few
> minor instruction selection modifications. No regressions in test
> suite. Both test-strchrnul and test-strchr passed.
>
> Signed-off-by: noah <goldstein.w.n@gmail.com>
> ---
>  sysdeps/x86_64/multiarch/strchr-avx2.S | 235 ++++++++++++-------------
>  sysdeps/x86_64/multiarch/strchr.c | 1 +
>  2 files changed, 118 insertions(+), 118 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> index d416558d04..806ca66a9b 100644
> --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> @@ -27,10 +27,12 @@
> # ifdef USE_AS_WCSCHR
> # define VPBROADCAST vpbroadcastd
> # define VPCMPEQ vpcmpeqd
> +# define VPMINU vpminud
> # define CHAR_REG esi
> # else
> # define VPBROADCAST vpbroadcastb
> # define VPCMPEQ vpcmpeqb
> +# define VPMINU vpminub
> # define CHAR_REG sil
> # endif
>
> @@ -39,20 +41,26 @@
> # endif
>
> # define VEC_SIZE 32
> +# define PAGE_SIZE 4096
>
> .section .text.avx,"ax",@progbits
> ENTRY (STRCHR)
> movl %edi, %ecx
> - /* Broadcast CHAR to YMM0. */
> +# ifndef USE_AS_STRCHRNUL
> + xorl %edx, %edx
> +# endif
> +
> + /* Broadcast CHAR to YMM0. */
> vmovd %esi, %xmm0
> vpxor %xmm9, %xmm9, %xmm9
> VPBROADCAST %xmm0, %ymm0
> - /* Check if we may cross page boundary with one vector load. */
> - andl $(2 * VEC_SIZE - 1), %ecx
> - cmpl $VEC_SIZE, %ecx
> - ja L(cros_page_boundary)
> -
> - /* Check the first VEC_SIZE bytes. Search for both CHAR and the
> +
> + /* Check if we cross page boundary with one vector load. */
> + andl $(PAGE_SIZE - 1), %ecx
> + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
> + ja L(cross_page_boundary)
> +
> + /* Check the first VEC_SIZE bytes. Search for both CHAR and the
> null byte. */
> vmovdqu (%rdi), %ymm8
> VPCMPEQ %ymm8, %ymm0, %ymm1
> @@ -60,50 +68,27 @@ ENTRY (STRCHR)
> vpor %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> - jnz L(first_vec_x0)
> -
> - /* Align data for aligned loads in the loop. */
> - addq $VEC_SIZE, %rdi
> - andl $(VEC_SIZE - 1), %ecx
> - andq $-VEC_SIZE, %rdi
> -
> - jmp L(more_4x_vec)
> -
> - .p2align 4
> -L(cros_page_boundary):
> - andl $(VEC_SIZE - 1), %ecx
> - andq $-VEC_SIZE, %rdi
> - vmovdqu (%rdi), %ymm8
> - VPCMPEQ %ymm8, %ymm0, %ymm1
> - VPCMPEQ %ymm8, %ymm9, %ymm2
> - vpor %ymm1, %ymm2, %ymm1
> - vpmovmskb %ymm1, %eax
> - /* Remove the leading bytes. */
> - sarl %cl, %eax
> - testl %eax, %eax
> - jz L(aligned_more)
> - /* Found CHAR or the null byte. */
> + jz L(more_vecs)
> tzcntl %eax, %eax
> - addq %rcx, %rax
> -# ifdef USE_AS_STRCHRNUL
> + /* Found CHAR or the null byte. */
> addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> - leaq (%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG

Remove the extra tab.

> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> ret
>
> .p2align 4
> +L(more_vecs):
> + /* Align data for aligned loads in the loop. */
> + andq $-VEC_SIZE, %rdi
> L(aligned_more):
> - addq $VEC_SIZE, %rdi
>
> -L(more_4x_vec):
> - /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> - since data is only aligned to VEC_SIZE. */
> - vmovdqa (%rdi), %ymm8
> + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
> + since data is only aligned to VEC_SIZE. */
> + vmovdqa VEC_SIZE(%rdi), %ymm8
> + addq $VEC_SIZE, %rdi
> VPCMPEQ %ymm8, %ymm0, %ymm1
> VPCMPEQ %ymm8, %ymm9, %ymm2
> vpor %ymm1, %ymm2, %ymm1
> @@ -125,7 +110,7 @@ L(more_4x_vec):
> vpor %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> - jnz L(first_vec_x2)
> + jnz L(first_vec_x2)
>
> vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> VPCMPEQ %ymm8, %ymm0, %ymm1
> @@ -133,122 +118,136 @@ L(more_4x_vec):
> vpor %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> - jnz L(first_vec_x3)
> -
> - addq $(VEC_SIZE * 4), %rdi
> -
> - /* Align data to 4 * VEC_SIZE. */
> - movq %rdi, %rcx
> - andl $(4 * VEC_SIZE - 1), %ecx
> - andq $-(4 * VEC_SIZE), %rdi
> -
> - .p2align 4
> -L(loop_4x_vec):
> - /* Compare 4 * VEC at a time forward. */
> - vmovdqa (%rdi), %ymm5
> - vmovdqa VEC_SIZE(%rdi), %ymm6
> - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> -
> - VPCMPEQ %ymm5, %ymm0, %ymm1
> - VPCMPEQ %ymm6, %ymm0, %ymm2
> - VPCMPEQ %ymm7, %ymm0, %ymm3
> - VPCMPEQ %ymm8, %ymm0, %ymm4
> -
> - VPCMPEQ %ymm5, %ymm9, %ymm5
> - VPCMPEQ %ymm6, %ymm9, %ymm6
> - VPCMPEQ %ymm7, %ymm9, %ymm7
> - VPCMPEQ %ymm8, %ymm9, %ymm8
> -
> - vpor %ymm1, %ymm5, %ymm1
> - vpor %ymm2, %ymm6, %ymm2
> - vpor %ymm3, %ymm7, %ymm3
> - vpor %ymm4, %ymm8, %ymm4
> -
> - vpor %ymm1, %ymm2, %ymm5
> - vpor %ymm3, %ymm4, %ymm6
> -
> - vpor %ymm5, %ymm6, %ymm5
> -
> - vpmovmskb %ymm5, %eax
> - testl %eax, %eax
> - jnz L(4x_vec_end)
> -
> - addq $(VEC_SIZE * 4), %rdi
> + jz L(prep_loop_4x)
>
> - jmp L(loop_4x_vec)
> + tzcntl %eax, %eax
> + leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG

Remove the extra tab.

> + cmovne %rdx, %rax
> +# endif
> + VZEROUPPER
> + ret
>
> .p2align 4
> L(first_vec_x0):
> - /* Found CHAR or the null byte. */
> tzcntl %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> + /* Found CHAR or the null byte. */
> addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> - leaq (%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG

Remove the extra tab.

> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> ret
> -
> +
> .p2align 4
> L(first_vec_x1):
> tzcntl %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> - addq $VEC_SIZE, %rax
> - addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> leaq VEC_SIZE(%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG

Remove the extra tab.

> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> - ret
> -
> + ret
> +
> .p2align 4
> L(first_vec_x2):
> tzcntl %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> - addq $(VEC_SIZE * 2), %rax
> - addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> + /* Found CHAR or the null byte. */
> leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG

Remove the extra tab.

> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> ret
> +
> +L(prep_loop_4x):
> + /* Align data to 4 * VEC_SIZE. */
> + andq $-(VEC_SIZE * 4), %rdi
>
> .p2align 4
> -L(4x_vec_end):
> +L(loop_4x_vec):
> + /* Compare 4 * VEC at a time forward. */
> + vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
> + vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
> + vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
> + vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
> +
> + /* Leaves only CHARS matching esi as 0. */
> + vpxor %ymm5, %ymm0, %ymm1
> + vpxor %ymm6, %ymm0, %ymm2
> + vpxor %ymm7, %ymm0, %ymm3
> + vpxor %ymm8, %ymm0, %ymm4
> +
> + VPMINU %ymm1, %ymm5, %ymm1
> + VPMINU %ymm2, %ymm6, %ymm2
> + VPMINU %ymm3, %ymm7, %ymm3
> + VPMINU %ymm4, %ymm8, %ymm4
> +
> + VPMINU %ymm1, %ymm2, %ymm5
> + VPMINU %ymm3, %ymm4, %ymm6
> +
> + VPMINU %ymm5, %ymm6, %ymm5
> +
> + VPCMPEQ %ymm5, %ymm9, %ymm5
> + vpmovmskb %ymm5, %eax
> +
> + addq $(VEC_SIZE * 4), %rdi
> + testl %eax, %eax
> + jz L(loop_4x_vec)
> +
> + VPCMPEQ %ymm1, %ymm9, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> jnz L(first_vec_x0)
> +
> + VPCMPEQ %ymm2, %ymm9, %ymm2
> vpmovmskb %ymm2, %eax
> testl %eax, %eax
> jnz L(first_vec_x1)
> - vpmovmskb %ymm3, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x2)
> +
> + VPCMPEQ %ymm3, %ymm9, %ymm3
> + VPCMPEQ %ymm4, %ymm9, %ymm4
> + vpmovmskb %ymm3, %ecx
> vpmovmskb %ymm4, %eax
> + salq $32, %rax
> + orq %rcx, %rax

Remove the extra tab.

> + tzcntq %rax, %rax
> + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG

Remove the extra tab.

> + cmovne %rdx, %rax
> +# endif
> + VZEROUPPER
> + ret
> +
> + /* Cold case for crossing page with first load. */
> + .p2align 4
> +L(cross_page_boundary):
> + andq $-VEC_SIZE, %rdi
> + andl $(VEC_SIZE - 1), %ecx
> +
> + vmovdqa (%rdi), %ymm8
> + VPCMPEQ %ymm8, %ymm0, %ymm1
> + VPCMPEQ %ymm8, %ymm9, %ymm2
> + vpor %ymm1, %ymm2, %ymm1
> + vpmovmskb %ymm1, %eax
> + /* Remove the leading bits. */
> + sarxl %ecx, %eax, %eax
> testl %eax, %eax
> -L(first_vec_x3):
> + jz L(aligned_more)
> tzcntl %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> - addq $(VEC_SIZE * 3), %rax
> + addq %rcx, %rdi
> addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> - leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG

Remove the extra tab.

> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> ret
>
> END (STRCHR)
> -#endif
> +# endif
> diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
> index 583a152794..4dfbe3b58b 100644
> --- a/sysdeps/x86_64/multiarch/strchr.c
> +++ b/sysdeps/x86_64/multiarch/strchr.c
> @@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
>
> if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
> && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> return OPTIMIZE (avx2);
>
> --
> 2.29.2
>
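For readers following the main-loop rewrite quoted above: the old loop needed two VPCMPEQ compares plus a vpor per vector to detect either CHAR or the null terminator before combining results, while the new loop relies on the fact that, for unsigned values, min(x ^ c, x) is zero exactly when x == c or x == 0. Each vector then costs only a vpxor and a VPMINU, and the per-vector compares against zero are redone only after the combined minimum reports a hit. A minimal scalar sketch of that identity (an illustration only, not code from the patch):

#include <stdint.h>

/* Byte-wise version of the vpxor + VPMINU trick in the new loop body:
   returns non-zero exactly when x is the searched character c or the
   null terminator.  */
static inline int
matches_char_or_null (uint8_t x, uint8_t c)
{
  uint8_t t = (uint8_t) (x ^ c);   /* 0 iff x == c */
  uint8_t m = t < x ? t : x;       /* unsigned min: 0 iff x == c or x == 0 */
  return m == 0;
}

The wide-character variant works the same way on 32-bit elements, which is why the patch defines VPMINU as vpminud for USE_AS_WCSCHR and vpminub otherwise.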
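When the hit lands in the third or fourth vector, the new tail packs the two 32-bit vpmovmskb masks into a single 64-bit word (salq $32 / orq) and locates the first set bit with one tzcntq. Roughly, with a hypothetical helper name and assuming at least one bit is set, which is guaranteed on that path:

#include <stdint.h>

/* Byte offset of the first match, counted from the start of the third
   vector; mask3 and mask4 are the movemask results for vectors 3 and 4.
   Undefined if both masks are zero, which cannot happen on this path.  */
static inline unsigned
first_hit_offset (uint32_t mask3, uint32_t mask4)
{
  uint64_t combined = ((uint64_t) mask4 << 32) | mask3;
  return (unsigned) __builtin_ctzll (combined);
}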
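The updated page-cross logic is also worth spelling out. The old prologue took the slow path whenever the pointer's offset within a 2 * VEC_SIZE block exceeded VEC_SIZE; the new code does so only when an unaligned VEC_SIZE load would actually run past a 4096-byte page, i.e. when fewer than VEC_SIZE bytes remain in the current page. A sketch of the new condition (again just an illustration):

#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE 32

/* True when a VEC_SIZE-byte load starting at s would cross into the
   next page: the offset within the page exceeds PAGE_SIZE - VEC_SIZE.  */
static inline int
load_crosses_page (const void *s)
{
  return ((uintptr_t) s & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
}

The cross-page path also switches the shift that strips leading bits from sarl %cl to the BMI2 sarxl, which is what the added CPU_FEATURE_USABLE_P (cpu_features, BMI2) check in strchr.c accounts for.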