Message ID | 20220323215734.3927131-4-goldstein.w.n@gmail.com |
---|---|
State | New |
Headers | show |
Series | [v1,01/23] benchtests: Use json-lib in bench-strchr.c | expand |
On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > Small code cleanup for size: -81 bytes. > > Add comment justifying using a branch to do NULL/non-null return. > > All string/memory tests pass and no regressions in benchtests. > > geometric_mean(N=20) of all benchmarks New / Original: .985 > --- > Geomtric Mean N=20 runs; All functions page aligned > length, alignment, pos, rand, seek_char/branch, max_char/perc-zero, New Time / Old Time > 2048, 0, 32, 0, 23, 127, 0.878 > 2048, 1, 32, 0, 23, 127, 0.88 > 2048, 0, 64, 0, 23, 127, 0.997 > 2048, 2, 64, 0, 23, 127, 1.001 > 2048, 0, 128, 0, 23, 127, 0.973 > 2048, 3, 128, 0, 23, 127, 0.971 > 2048, 0, 256, 0, 23, 127, 0.976 > 2048, 4, 256, 0, 23, 127, 0.973 > 2048, 0, 512, 0, 23, 127, 1.001 > 2048, 5, 512, 0, 23, 127, 1.004 > 2048, 0, 1024, 0, 23, 127, 1.005 > 2048, 6, 1024, 0, 23, 127, 1.007 > 2048, 0, 2048, 0, 23, 127, 1.035 > 2048, 7, 2048, 0, 23, 127, 1.03 > 4096, 0, 32, 0, 23, 127, 0.889 > 4096, 1, 32, 0, 23, 127, 0.891 > 4096, 0, 64, 0, 23, 127, 1.012 > 4096, 2, 64, 0, 23, 127, 1.017 > 4096, 0, 128, 0, 23, 127, 0.975 > 4096, 3, 128, 0, 23, 127, 0.974 > 4096, 0, 256, 0, 23, 127, 0.974 > 4096, 4, 256, 0, 23, 127, 0.972 > 4096, 0, 512, 0, 23, 127, 1.002 > 4096, 5, 512, 0, 23, 127, 1.016 > 4096, 0, 1024, 0, 23, 127, 1.009 > 4096, 6, 1024, 0, 23, 127, 1.008 > 4096, 0, 2048, 0, 23, 127, 1.003 > 4096, 7, 2048, 0, 23, 127, 1.004 > 256, 1, 64, 0, 23, 127, 0.993 > 256, 2, 64, 0, 23, 127, 0.999 > 256, 3, 64, 0, 23, 127, 0.992 > 256, 4, 64, 0, 23, 127, 0.99 > 256, 5, 64, 0, 23, 127, 0.99 > 256, 6, 64, 0, 23, 127, 0.994 > 256, 7, 64, 0, 23, 127, 0.991 > 512, 0, 256, 0, 23, 127, 0.971 > 512, 16, 256, 0, 23, 127, 0.971 > 512, 32, 256, 0, 23, 127, 1.005 > 512, 48, 256, 0, 23, 127, 0.998 > 512, 64, 256, 0, 23, 127, 1.001 > 512, 80, 256, 0, 23, 127, 1.002 > 512, 96, 256, 0, 23, 127, 1.005 > 512, 112, 256, 0, 23, 127, 1.012 > 1, 0, 0, 0, 23, 127, 1.024 > 2, 0, 1, 0, 23, 127, 0.991 > 3, 0, 2, 0, 23, 127, 
0.997 > 4, 0, 3, 0, 23, 127, 0.984 > 5, 0, 4, 0, 23, 127, 0.993 > 6, 0, 5, 0, 23, 127, 0.985 > 7, 0, 6, 0, 23, 127, 0.979 > 8, 0, 7, 0, 23, 127, 0.975 > 9, 0, 8, 0, 23, 127, 0.965 > 10, 0, 9, 0, 23, 127, 0.957 > 11, 0, 10, 0, 23, 127, 0.979 > 12, 0, 11, 0, 23, 127, 0.987 > 13, 0, 12, 0, 23, 127, 1.023 > 14, 0, 13, 0, 23, 127, 0.997 > 15, 0, 14, 0, 23, 127, 0.983 > 16, 0, 15, 0, 23, 127, 0.987 > 17, 0, 16, 0, 23, 127, 0.993 > 18, 0, 17, 0, 23, 127, 0.985 > 19, 0, 18, 0, 23, 127, 0.999 > 20, 0, 19, 0, 23, 127, 0.998 > 21, 0, 20, 0, 23, 127, 0.983 > 22, 0, 21, 0, 23, 127, 0.983 > 23, 0, 22, 0, 23, 127, 1.002 > 24, 0, 23, 0, 23, 127, 1.0 > 25, 0, 24, 0, 23, 127, 1.002 > 26, 0, 25, 0, 23, 127, 0.984 > 27, 0, 26, 0, 23, 127, 0.994 > 28, 0, 27, 0, 23, 127, 0.995 > 29, 0, 28, 0, 23, 127, 1.017 > 30, 0, 29, 0, 23, 127, 1.009 > 31, 0, 30, 0, 23, 127, 1.001 > 32, 0, 31, 0, 23, 127, 1.021 > 2048, 0, 32, 0, 0, 127, 0.899 > 2048, 1, 32, 0, 0, 127, 0.93 > 2048, 0, 64, 0, 0, 127, 1.009 > 2048, 2, 64, 0, 0, 127, 1.023 > 2048, 0, 128, 0, 0, 127, 0.973 > 2048, 3, 128, 0, 0, 127, 0.975 > 2048, 0, 256, 0, 0, 127, 0.974 > 2048, 4, 256, 0, 0, 127, 0.97 > 2048, 0, 512, 0, 0, 127, 0.999 > 2048, 5, 512, 0, 0, 127, 1.004 > 2048, 0, 1024, 0, 0, 127, 1.008 > 2048, 6, 1024, 0, 0, 127, 1.008 > 2048, 0, 2048, 0, 0, 127, 0.996 > 2048, 7, 2048, 0, 0, 127, 1.002 > 4096, 0, 32, 0, 0, 127, 0.872 > 4096, 1, 32, 0, 0, 127, 0.881 > 4096, 0, 64, 0, 0, 127, 1.006 > 4096, 2, 64, 0, 0, 127, 1.005 > 4096, 0, 128, 0, 0, 127, 0.973 > 4096, 3, 128, 0, 0, 127, 0.974 > 4096, 0, 256, 0, 0, 127, 0.969 > 4096, 4, 256, 0, 0, 127, 0.971 > 4096, 0, 512, 0, 0, 127, 1.0 > 4096, 5, 512, 0, 0, 127, 1.005 > 4096, 0, 1024, 0, 0, 127, 1.007 > 4096, 6, 1024, 0, 0, 127, 1.009 > 4096, 0, 2048, 0, 0, 127, 1.005 > 4096, 7, 2048, 0, 0, 127, 1.007 > 256, 1, 64, 0, 0, 127, 0.994 > 256, 2, 64, 0, 0, 127, 1.008 > 256, 3, 64, 0, 0, 127, 1.019 > 256, 4, 64, 0, 0, 127, 0.991 > 256, 5, 64, 0, 0, 127, 0.992 > 256, 6, 64, 0, 0, 127, 0.991 > 
256, 7, 64, 0, 0, 127, 0.988 > 512, 0, 256, 0, 0, 127, 0.971 > 512, 16, 256, 0, 0, 127, 0.967 > 512, 32, 256, 0, 0, 127, 1.005 > 512, 48, 256, 0, 0, 127, 1.001 > 512, 64, 256, 0, 0, 127, 1.009 > 512, 80, 256, 0, 0, 127, 1.008 > 512, 96, 256, 0, 0, 127, 1.009 > 512, 112, 256, 0, 0, 127, 1.016 > 1, 0, 0, 0, 0, 127, 1.038 > 2, 0, 1, 0, 0, 127, 1.009 > 3, 0, 2, 0, 0, 127, 0.992 > 4, 0, 3, 0, 0, 127, 1.004 > 5, 0, 4, 0, 0, 127, 0.966 > 6, 0, 5, 0, 0, 127, 0.968 > 7, 0, 6, 0, 0, 127, 1.004 > 8, 0, 7, 0, 0, 127, 0.99 > 9, 0, 8, 0, 0, 127, 0.958 > 10, 0, 9, 0, 0, 127, 0.96 > 11, 0, 10, 0, 0, 127, 0.948 > 12, 0, 11, 0, 0, 127, 0.984 > 13, 0, 12, 0, 0, 127, 0.967 > 14, 0, 13, 0, 0, 127, 0.993 > 15, 0, 14, 0, 0, 127, 0.991 > 16, 0, 15, 0, 0, 127, 1.0 > 17, 0, 16, 0, 0, 127, 0.982 > 18, 0, 17, 0, 0, 127, 0.977 > 19, 0, 18, 0, 0, 127, 0.987 > 20, 0, 19, 0, 0, 127, 0.978 > 21, 0, 20, 0, 0, 127, 1.0 > 22, 0, 21, 0, 0, 127, 0.99 > 23, 0, 22, 0, 0, 127, 0.988 > 24, 0, 23, 0, 0, 127, 0.997 > 25, 0, 24, 0, 0, 127, 1.003 > 26, 0, 25, 0, 0, 127, 1.004 > 27, 0, 26, 0, 0, 127, 0.982 > 28, 0, 27, 0, 0, 127, 0.972 > 29, 0, 28, 0, 0, 127, 0.978 > 30, 0, 29, 0, 0, 127, 0.992 > 31, 0, 30, 0, 0, 127, 0.986 > 32, 0, 31, 0, 0, 127, 1.0 > > 16, 0, 15, 1, 1, 0, 0.997 > 16, 0, 15, 1, 0, 0, 1.001 > 16, 0, 15, 1, 1, 0.1, 0.984 > 16, 0, 15, 1, 0, 0.1, 0.999 > 16, 0, 15, 1, 1, 0.25, 0.929 > 16, 0, 15, 1, 0, 0.25, 1.001 > 16, 0, 15, 1, 1, 0.33, 0.892 > 16, 0, 15, 1, 0, 0.33, 0.996 > 16, 0, 15, 1, 1, 0.5, 0.897 > 16, 0, 15, 1, 0, 0.5, 1.009 > 16, 0, 15, 1, 1, 0.66, 0.882 > 16, 0, 15, 1, 0, 0.66, 0.967 > 16, 0, 15, 1, 1, 0.75, 0.919 > 16, 0, 15, 1, 0, 0.75, 1.027 > 16, 0, 15, 1, 1, 0.9, 0.949 > 16, 0, 15, 1, 0, 0.9, 1.021 > 16, 0, 15, 1, 1, 1, 0.998 > 16, 0, 15, 1, 0, 1, 0.999 > > sysdeps/x86_64/multiarch/strchr-evex.S | 146 ++++++++++++++----------- > 1 file changed, 80 insertions(+), 66 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S > index 
f62cd9d144..ec739fb8f9 100644 > --- a/sysdeps/x86_64/multiarch/strchr-evex.S > +++ b/sysdeps/x86_64/multiarch/strchr-evex.S > @@ -30,6 +30,7 @@ > # ifdef USE_AS_WCSCHR > # define VPBROADCAST vpbroadcastd > # define VPCMP vpcmpd > +# define VPTESTN vptestnmd > # define VPMINU vpminud > # define CHAR_REG esi > # define SHIFT_REG ecx > @@ -37,6 +38,7 @@ > # else > # define VPBROADCAST vpbroadcastb > # define VPCMP vpcmpb > +# define VPTESTN vptestnmb > # define VPMINU vpminub > # define CHAR_REG sil > # define SHIFT_REG edx > @@ -61,13 +63,11 @@ > # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > .section .text.evex,"ax",@progbits > -ENTRY (STRCHR) > +ENTRY_P2ALIGN (STRCHR, 5) > /* Broadcast CHAR to YMM0. */ > VPBROADCAST %esi, %YMM0 > movl %edi, %eax > andl $(PAGE_SIZE - 1), %eax > - vpxorq %XMMZERO, %XMMZERO, %XMMZERO > - > /* Check if we cross page boundary with one vector load. > Otherwise it is safe to use an unaligned load. */ > cmpl $(PAGE_SIZE - VEC_SIZE), %eax > @@ -81,49 +81,35 @@ ENTRY (STRCHR) > vpxorq %YMM1, %YMM0, %YMM2 > VPMINU %YMM2, %YMM1, %YMM2 > /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > - VPCMP $0, %YMMZERO, %YMM2, %k0 > + VPTESTN %YMM2, %YMM2, %k0 > kmovd %k0, %eax > testl %eax, %eax > jz L(aligned_more) > tzcntl %eax, %eax > +# ifndef USE_AS_STRCHRNUL > + /* Found CHAR or the null byte. */ > + cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG > + /* NB: Use a branch instead of cmovcc here. The expectation is > + that with strchr the user will branch based on input being > + null. Since this branch will be 100% predictive of the user > + branch a branch miss here should save what otherwise would > + be branch miss in the user code. Otherwise using a branch 1) > + saves code size and 2) is faster in highly predictable > + environments. */ > + jne L(zero) > +# endif > # ifdef USE_AS_WCSCHR > /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
> */ > leaq (%rdi, %rax, CHAR_SIZE), %rax > # else > addq %rdi, %rax > -# endif > -# ifndef USE_AS_STRCHRNUL > - /* Found CHAR or the null byte. */ > - cmp (%rax), %CHAR_REG > - jne L(zero) > # endif > ret > > - /* .p2align 5 helps keep performance more consistent if ENTRY() > - alignment % 32 was either 16 or 0. As well this makes the > - alignment % 32 of the loop_4x_vec fixed which makes tuning it > - easier. */ > - .p2align 5 > -L(first_vec_x3): > - tzcntl %eax, %eax > -# ifndef USE_AS_STRCHRNUL > - /* Found CHAR or the null byte. */ > - cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG > - jne L(zero) > -# endif > - /* NB: Multiply sizeof char type (1 or 4) to get the number of > - bytes. */ > - leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > - ret > > -# ifndef USE_AS_STRCHRNUL > -L(zero): > - xorl %eax, %eax > - ret > -# endif > > - .p2align 4 > + .p2align 4,, 10 > L(first_vec_x4): > # ifndef USE_AS_STRCHRNUL > /* Check to see if first match was CHAR (k0) or null (k1). */ > @@ -144,9 +130,18 @@ L(first_vec_x4): > leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax > ret > > +# ifndef USE_AS_STRCHRNUL > +L(zero): > + xorl %eax, %eax > + ret > +# endif > + > + > .p2align 4 > L(first_vec_x1): > - tzcntl %eax, %eax > + /* Use bsf here to save 1-byte keeping the block in 1x > + fetch block. eax guaranteed non-zero. */ > + bsfl %eax, %eax > # ifndef USE_AS_STRCHRNUL > /* Found CHAR or the null byte. */ > cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG > @@ -158,7 +153,7 @@ L(first_vec_x1): > leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax > ret > > - .p2align 4 > + .p2align 4,, 10 > L(first_vec_x2): > # ifndef USE_AS_STRCHRNUL > /* Check to see if first match was CHAR (k0) or null (k1). */ > @@ -179,6 +174,21 @@ L(first_vec_x2): > leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > ret > > + .p2align 4,, 10 > +L(first_vec_x3): > + /* Use bsf here to save 1-byte keeping the block in 1x > + fetch block. eax guaranteed non-zero. 
*/ > + bsfl %eax, %eax > +# ifndef USE_AS_STRCHRNUL > + /* Found CHAR or the null byte. */ > + cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG > + jne L(zero) > +# endif > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > + bytes. */ > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > + ret > + > .p2align 4 > L(aligned_more): > /* Align data to VEC_SIZE. */ > @@ -195,7 +205,7 @@ L(cross_page_continue): > vpxorq %YMM1, %YMM0, %YMM2 > VPMINU %YMM2, %YMM1, %YMM2 > /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > - VPCMP $0, %YMMZERO, %YMM2, %k0 > + VPTESTN %YMM2, %YMM2, %k0 > kmovd %k0, %eax > testl %eax, %eax > jnz L(first_vec_x1) > @@ -206,7 +216,7 @@ L(cross_page_continue): > /* Each bit in K0 represents a CHAR in YMM1. */ > VPCMP $0, %YMM1, %YMM0, %k0 > /* Each bit in K1 represents a CHAR in YMM1. */ > - VPCMP $0, %YMM1, %YMMZERO, %k1 > + VPTESTN %YMM1, %YMM1, %k1 > kortestd %k0, %k1 > jnz L(first_vec_x2) > > @@ -215,7 +225,7 @@ L(cross_page_continue): > vpxorq %YMM1, %YMM0, %YMM2 > VPMINU %YMM2, %YMM1, %YMM2 > /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > - VPCMP $0, %YMMZERO, %YMM2, %k0 > + VPTESTN %YMM2, %YMM2, %k0 > kmovd %k0, %eax > testl %eax, %eax > jnz L(first_vec_x3) > @@ -224,7 +234,7 @@ L(cross_page_continue): > /* Each bit in K0 represents a CHAR in YMM1. */ > VPCMP $0, %YMM1, %YMM0, %k0 > /* Each bit in K1 represents a CHAR in YMM1. 
*/ > - VPCMP $0, %YMM1, %YMMZERO, %k1 > + VPTESTN %YMM1, %YMM1, %k1 > kortestd %k0, %k1 > jnz L(first_vec_x4) > > @@ -265,33 +275,33 @@ L(loop_4x_vec): > VPMINU %YMM3, %YMM4, %YMM4 > VPMINU %YMM2, %YMM4, %YMM4{%k4}{z} > > - VPCMP $0, %YMMZERO, %YMM4, %k1 > + VPTESTN %YMM4, %YMM4, %k1 > kmovd %k1, %ecx > subq $-(VEC_SIZE * 4), %rdi > testl %ecx, %ecx > jz L(loop_4x_vec) > > - VPCMP $0, %YMMZERO, %YMM1, %k0 > + VPTESTN %YMM1, %YMM1, %k0 > kmovd %k0, %eax > testl %eax, %eax > jnz L(last_vec_x1) > > - VPCMP $0, %YMMZERO, %YMM2, %k0 > + VPTESTN %YMM2, %YMM2, %k0 > kmovd %k0, %eax > testl %eax, %eax > jnz L(last_vec_x2) > > - VPCMP $0, %YMMZERO, %YMM3, %k0 > + VPTESTN %YMM3, %YMM3, %k0 > kmovd %k0, %eax > /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */ > # ifdef USE_AS_WCSCHR > sall $8, %ecx > orl %ecx, %eax > - tzcntl %eax, %eax > + bsfl %eax, %eax > # else > salq $32, %rcx > orq %rcx, %rax > - tzcntq %rax, %rax > + bsfq %rax, %rax > # endif > # ifndef USE_AS_STRCHRNUL > /* Check if match was CHAR or null. */ > @@ -303,28 +313,28 @@ L(loop_4x_vec): > leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > ret > > -# ifndef USE_AS_STRCHRNUL > -L(zero_end): > - xorl %eax, %eax > - ret > + .p2align 4,, 8 > +L(last_vec_x1): > + bsfl %eax, %eax > +# ifdef USE_AS_WCSCHR > + /* NB: Multiply wchar_t count by 4 to get the number of bytes. > + */ > + leaq (%rdi, %rax, CHAR_SIZE), %rax > +# else > + addq %rdi, %rax > # endif > > - .p2align 4 > -L(last_vec_x1): > - tzcntl %eax, %eax > # ifndef USE_AS_STRCHRNUL > /* Check if match was null. */ > - cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG > + cmp (%rax), %CHAR_REG > jne L(zero_end) > # endif > - /* NB: Multiply sizeof char type (1 or 4) to get the number of > - bytes. */ > - leaq (%rdi, %rax, CHAR_SIZE), %rax > + > ret > > - .p2align 4 > + .p2align 4,, 8 > L(last_vec_x2): > - tzcntl %eax, %eax > + bsfl %eax, %eax > # ifndef USE_AS_STRCHRNUL > /* Check if match was null. 
*/ > cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG > @@ -336,7 +346,7 @@ L(last_vec_x2): > ret > > /* Cold case for crossing page with first load. */ > - .p2align 4 > + .p2align 4,, 8 > L(cross_page_boundary): > movq %rdi, %rdx > /* Align rdi. */ > @@ -346,9 +356,9 @@ L(cross_page_boundary): > vpxorq %YMM1, %YMM0, %YMM2 > VPMINU %YMM2, %YMM1, %YMM2 > /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > - VPCMP $0, %YMMZERO, %YMM2, %k0 > + VPTESTN %YMM2, %YMM2, %k0 > kmovd %k0, %eax > - /* Remove the leading bits. */ > + /* Remove the leading bits. */ > # ifdef USE_AS_WCSCHR > movl %edx, %SHIFT_REG > /* NB: Divide shift count by 4 since each bit in K1 represent 4 > @@ -360,20 +370,24 @@ L(cross_page_boundary): > /* If eax is zero continue. */ > testl %eax, %eax > jz L(cross_page_continue) > - tzcntl %eax, %eax > -# ifndef USE_AS_STRCHRNUL > - /* Check to see if match was CHAR or null. */ > - cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG > - jne L(zero_end) > -# endif > + bsfl %eax, %eax > + > # ifdef USE_AS_WCSCHR > /* NB: Multiply wchar_t count by 4 to get the number of > bytes. */ > leaq (%rdx, %rax, CHAR_SIZE), %rax > # else > addq %rdx, %rax > +# endif > +# ifndef USE_AS_STRCHRNUL > + /* Check to see if match was CHAR or null. */ > + cmp (%rax), %CHAR_REG > + je L(cross_page_ret) > +L(zero_end): > + xorl %eax, %eax > +L(cross_page_ret): > # endif > ret > > END (STRCHR) > -# endif > +#endif > -- > 2.25.1 > LGTM. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks.
On Thu, Mar 24, 2022 at 11:55 AM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote: > > On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > Small code cleanup for size: -81 bytes. > > > > Add comment justifying using a branch to do NULL/non-null return. > > > > All string/memory tests pass and no regressions in benchtests. > > > > geometric_mean(N=20) of all benchmarks New / Original: .985 > > --- > > Geomtric Mean N=20 runs; All functions page aligned > > length, alignment, pos, rand, seek_char/branch, max_char/perc-zero, New Time / Old Time > > 2048, 0, 32, 0, 23, 127, 0.878 > > 2048, 1, 32, 0, 23, 127, 0.88 > > 2048, 0, 64, 0, 23, 127, 0.997 > > 2048, 2, 64, 0, 23, 127, 1.001 > > 2048, 0, 128, 0, 23, 127, 0.973 > > 2048, 3, 128, 0, 23, 127, 0.971 > > 2048, 0, 256, 0, 23, 127, 0.976 > > 2048, 4, 256, 0, 23, 127, 0.973 > > 2048, 0, 512, 0, 23, 127, 1.001 > > 2048, 5, 512, 0, 23, 127, 1.004 > > 2048, 0, 1024, 0, 23, 127, 1.005 > > 2048, 6, 1024, 0, 23, 127, 1.007 > > 2048, 0, 2048, 0, 23, 127, 1.035 > > 2048, 7, 2048, 0, 23, 127, 1.03 > > 4096, 0, 32, 0, 23, 127, 0.889 > > 4096, 1, 32, 0, 23, 127, 0.891 > > 4096, 0, 64, 0, 23, 127, 1.012 > > 4096, 2, 64, 0, 23, 127, 1.017 > > 4096, 0, 128, 0, 23, 127, 0.975 > > 4096, 3, 128, 0, 23, 127, 0.974 > > 4096, 0, 256, 0, 23, 127, 0.974 > > 4096, 4, 256, 0, 23, 127, 0.972 > > 4096, 0, 512, 0, 23, 127, 1.002 > > 4096, 5, 512, 0, 23, 127, 1.016 > > 4096, 0, 1024, 0, 23, 127, 1.009 > > 4096, 6, 1024, 0, 23, 127, 1.008 > > 4096, 0, 2048, 0, 23, 127, 1.003 > > 4096, 7, 2048, 0, 23, 127, 1.004 > > 256, 1, 64, 0, 23, 127, 0.993 > > 256, 2, 64, 0, 23, 127, 0.999 > > 256, 3, 64, 0, 23, 127, 0.992 > > 256, 4, 64, 0, 23, 127, 0.99 > > 256, 5, 64, 0, 23, 127, 0.99 > > 256, 6, 64, 0, 23, 127, 0.994 > > 256, 7, 64, 0, 23, 127, 0.991 > > 512, 0, 256, 0, 23, 127, 0.971 > > 512, 16, 256, 0, 23, 127, 0.971 > > 512, 32, 256, 0, 23, 127, 1.005 > > 512, 48, 256, 0, 23, 127, 0.998 > > 512, 64, 256, 0, 
23, 127, 1.001 > > 512, 80, 256, 0, 23, 127, 1.002 > > 512, 96, 256, 0, 23, 127, 1.005 > > 512, 112, 256, 0, 23, 127, 1.012 > > 1, 0, 0, 0, 23, 127, 1.024 > > 2, 0, 1, 0, 23, 127, 0.991 > > 3, 0, 2, 0, 23, 127, 0.997 > > 4, 0, 3, 0, 23, 127, 0.984 > > 5, 0, 4, 0, 23, 127, 0.993 > > 6, 0, 5, 0, 23, 127, 0.985 > > 7, 0, 6, 0, 23, 127, 0.979 > > 8, 0, 7, 0, 23, 127, 0.975 > > 9, 0, 8, 0, 23, 127, 0.965 > > 10, 0, 9, 0, 23, 127, 0.957 > > 11, 0, 10, 0, 23, 127, 0.979 > > 12, 0, 11, 0, 23, 127, 0.987 > > 13, 0, 12, 0, 23, 127, 1.023 > > 14, 0, 13, 0, 23, 127, 0.997 > > 15, 0, 14, 0, 23, 127, 0.983 > > 16, 0, 15, 0, 23, 127, 0.987 > > 17, 0, 16, 0, 23, 127, 0.993 > > 18, 0, 17, 0, 23, 127, 0.985 > > 19, 0, 18, 0, 23, 127, 0.999 > > 20, 0, 19, 0, 23, 127, 0.998 > > 21, 0, 20, 0, 23, 127, 0.983 > > 22, 0, 21, 0, 23, 127, 0.983 > > 23, 0, 22, 0, 23, 127, 1.002 > > 24, 0, 23, 0, 23, 127, 1.0 > > 25, 0, 24, 0, 23, 127, 1.002 > > 26, 0, 25, 0, 23, 127, 0.984 > > 27, 0, 26, 0, 23, 127, 0.994 > > 28, 0, 27, 0, 23, 127, 0.995 > > 29, 0, 28, 0, 23, 127, 1.017 > > 30, 0, 29, 0, 23, 127, 1.009 > > 31, 0, 30, 0, 23, 127, 1.001 > > 32, 0, 31, 0, 23, 127, 1.021 > > 2048, 0, 32, 0, 0, 127, 0.899 > > 2048, 1, 32, 0, 0, 127, 0.93 > > 2048, 0, 64, 0, 0, 127, 1.009 > > 2048, 2, 64, 0, 0, 127, 1.023 > > 2048, 0, 128, 0, 0, 127, 0.973 > > 2048, 3, 128, 0, 0, 127, 0.975 > > 2048, 0, 256, 0, 0, 127, 0.974 > > 2048, 4, 256, 0, 0, 127, 0.97 > > 2048, 0, 512, 0, 0, 127, 0.999 > > 2048, 5, 512, 0, 0, 127, 1.004 > > 2048, 0, 1024, 0, 0, 127, 1.008 > > 2048, 6, 1024, 0, 0, 127, 1.008 > > 2048, 0, 2048, 0, 0, 127, 0.996 > > 2048, 7, 2048, 0, 0, 127, 1.002 > > 4096, 0, 32, 0, 0, 127, 0.872 > > 4096, 1, 32, 0, 0, 127, 0.881 > > 4096, 0, 64, 0, 0, 127, 1.006 > > 4096, 2, 64, 0, 0, 127, 1.005 > > 4096, 0, 128, 0, 0, 127, 0.973 > > 4096, 3, 128, 0, 0, 127, 0.974 > > 4096, 0, 256, 0, 0, 127, 0.969 > > 4096, 4, 256, 0, 0, 127, 0.971 > > 4096, 0, 512, 0, 0, 127, 1.0 > > 4096, 5, 512, 0, 0, 127, 1.005 > > 
4096, 0, 1024, 0, 0, 127, 1.007 > > 4096, 6, 1024, 0, 0, 127, 1.009 > > 4096, 0, 2048, 0, 0, 127, 1.005 > > 4096, 7, 2048, 0, 0, 127, 1.007 > > 256, 1, 64, 0, 0, 127, 0.994 > > 256, 2, 64, 0, 0, 127, 1.008 > > 256, 3, 64, 0, 0, 127, 1.019 > > 256, 4, 64, 0, 0, 127, 0.991 > > 256, 5, 64, 0, 0, 127, 0.992 > > 256, 6, 64, 0, 0, 127, 0.991 > > 256, 7, 64, 0, 0, 127, 0.988 > > 512, 0, 256, 0, 0, 127, 0.971 > > 512, 16, 256, 0, 0, 127, 0.967 > > 512, 32, 256, 0, 0, 127, 1.005 > > 512, 48, 256, 0, 0, 127, 1.001 > > 512, 64, 256, 0, 0, 127, 1.009 > > 512, 80, 256, 0, 0, 127, 1.008 > > 512, 96, 256, 0, 0, 127, 1.009 > > 512, 112, 256, 0, 0, 127, 1.016 > > 1, 0, 0, 0, 0, 127, 1.038 > > 2, 0, 1, 0, 0, 127, 1.009 > > 3, 0, 2, 0, 0, 127, 0.992 > > 4, 0, 3, 0, 0, 127, 1.004 > > 5, 0, 4, 0, 0, 127, 0.966 > > 6, 0, 5, 0, 0, 127, 0.968 > > 7, 0, 6, 0, 0, 127, 1.004 > > 8, 0, 7, 0, 0, 127, 0.99 > > 9, 0, 8, 0, 0, 127, 0.958 > > 10, 0, 9, 0, 0, 127, 0.96 > > 11, 0, 10, 0, 0, 127, 0.948 > > 12, 0, 11, 0, 0, 127, 0.984 > > 13, 0, 12, 0, 0, 127, 0.967 > > 14, 0, 13, 0, 0, 127, 0.993 > > 15, 0, 14, 0, 0, 127, 0.991 > > 16, 0, 15, 0, 0, 127, 1.0 > > 17, 0, 16, 0, 0, 127, 0.982 > > 18, 0, 17, 0, 0, 127, 0.977 > > 19, 0, 18, 0, 0, 127, 0.987 > > 20, 0, 19, 0, 0, 127, 0.978 > > 21, 0, 20, 0, 0, 127, 1.0 > > 22, 0, 21, 0, 0, 127, 0.99 > > 23, 0, 22, 0, 0, 127, 0.988 > > 24, 0, 23, 0, 0, 127, 0.997 > > 25, 0, 24, 0, 0, 127, 1.003 > > 26, 0, 25, 0, 0, 127, 1.004 > > 27, 0, 26, 0, 0, 127, 0.982 > > 28, 0, 27, 0, 0, 127, 0.972 > > 29, 0, 28, 0, 0, 127, 0.978 > > 30, 0, 29, 0, 0, 127, 0.992 > > 31, 0, 30, 0, 0, 127, 0.986 > > 32, 0, 31, 0, 0, 127, 1.0 > > > > 16, 0, 15, 1, 1, 0, 0.997 > > 16, 0, 15, 1, 0, 0, 1.001 > > 16, 0, 15, 1, 1, 0.1, 0.984 > > 16, 0, 15, 1, 0, 0.1, 0.999 > > 16, 0, 15, 1, 1, 0.25, 0.929 > > 16, 0, 15, 1, 0, 0.25, 1.001 > > 16, 0, 15, 1, 1, 0.33, 0.892 > > 16, 0, 15, 1, 0, 0.33, 0.996 > > 16, 0, 15, 1, 1, 0.5, 0.897 > > 16, 0, 15, 1, 0, 0.5, 1.009 > > 16, 0, 15, 1, 1, 0.66, 
0.882 > > 16, 0, 15, 1, 0, 0.66, 0.967 > > 16, 0, 15, 1, 1, 0.75, 0.919 > > 16, 0, 15, 1, 0, 0.75, 1.027 > > 16, 0, 15, 1, 1, 0.9, 0.949 > > 16, 0, 15, 1, 0, 0.9, 1.021 > > 16, 0, 15, 1, 1, 1, 0.998 > > 16, 0, 15, 1, 0, 1, 0.999 > > > > sysdeps/x86_64/multiarch/strchr-evex.S | 146 ++++++++++++++----------- > > 1 file changed, 80 insertions(+), 66 deletions(-) > > > > diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S > > index f62cd9d144..ec739fb8f9 100644 > > --- a/sysdeps/x86_64/multiarch/strchr-evex.S > > +++ b/sysdeps/x86_64/multiarch/strchr-evex.S > > @@ -30,6 +30,7 @@ > > # ifdef USE_AS_WCSCHR > > # define VPBROADCAST vpbroadcastd > > # define VPCMP vpcmpd > > +# define VPTESTN vptestnmd > > # define VPMINU vpminud > > # define CHAR_REG esi > > # define SHIFT_REG ecx > > @@ -37,6 +38,7 @@ > > # else > > # define VPBROADCAST vpbroadcastb > > # define VPCMP vpcmpb > > +# define VPTESTN vptestnmb > > # define VPMINU vpminub > > # define CHAR_REG sil > > # define SHIFT_REG edx > > @@ -61,13 +63,11 @@ > > # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > > > .section .text.evex,"ax",@progbits > > -ENTRY (STRCHR) > > +ENTRY_P2ALIGN (STRCHR, 5) > > /* Broadcast CHAR to YMM0. */ > > VPBROADCAST %esi, %YMM0 > > movl %edi, %eax > > andl $(PAGE_SIZE - 1), %eax > > - vpxorq %XMMZERO, %XMMZERO, %XMMZERO > > - > > /* Check if we cross page boundary with one vector load. > > Otherwise it is safe to use an unaligned load. */ > > cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > @@ -81,49 +81,35 @@ ENTRY (STRCHR) > > vpxorq %YMM1, %YMM0, %YMM2 > > VPMINU %YMM2, %YMM1, %YMM2 > > /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > > - VPCMP $0, %YMMZERO, %YMM2, %k0 > > + VPTESTN %YMM2, %YMM2, %k0 > > kmovd %k0, %eax > > testl %eax, %eax > > jz L(aligned_more) > > tzcntl %eax, %eax > > +# ifndef USE_AS_STRCHRNUL > > + /* Found CHAR or the null byte. 
*/ > > + cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG > > + /* NB: Use a branch instead of cmovcc here. The expectation is > > + that with strchr the user will branch based on input being > > + null. Since this branch will be 100% predictive of the user > > + branch a branch miss here should save what otherwise would > > + be branch miss in the user code. Otherwise using a branch 1) > > + saves code size and 2) is faster in highly predictable > > + environments. */ > > + jne L(zero) > > +# endif > > # ifdef USE_AS_WCSCHR > > /* NB: Multiply wchar_t count by 4 to get the number of bytes. > > */ > > leaq (%rdi, %rax, CHAR_SIZE), %rax > > # else > > addq %rdi, %rax > > -# endif > > -# ifndef USE_AS_STRCHRNUL > > - /* Found CHAR or the null byte. */ > > - cmp (%rax), %CHAR_REG > > - jne L(zero) > > # endif > > ret > > > > - /* .p2align 5 helps keep performance more consistent if ENTRY() > > - alignment % 32 was either 16 or 0. As well this makes the > > - alignment % 32 of the loop_4x_vec fixed which makes tuning it > > - easier. */ > > - .p2align 5 > > -L(first_vec_x3): > > - tzcntl %eax, %eax > > -# ifndef USE_AS_STRCHRNUL > > - /* Found CHAR or the null byte. */ > > - cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG > > - jne L(zero) > > -# endif > > - /* NB: Multiply sizeof char type (1 or 4) to get the number of > > - bytes. */ > > - leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > > - ret > > > > -# ifndef USE_AS_STRCHRNUL > > -L(zero): > > - xorl %eax, %eax > > - ret > > -# endif > > > > - .p2align 4 > > + .p2align 4,, 10 > > L(first_vec_x4): > > # ifndef USE_AS_STRCHRNUL > > /* Check to see if first match was CHAR (k0) or null (k1). 
*/ > > @@ -144,9 +130,18 @@ L(first_vec_x4): > > leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax > > ret > > > > +# ifndef USE_AS_STRCHRNUL > > +L(zero): > > + xorl %eax, %eax > > + ret > > +# endif > > + > > + > > .p2align 4 > > L(first_vec_x1): > > - tzcntl %eax, %eax > > + /* Use bsf here to save 1-byte keeping the block in 1x > > + fetch block. eax guaranteed non-zero. */ > > + bsfl %eax, %eax > > # ifndef USE_AS_STRCHRNUL > > /* Found CHAR or the null byte. */ > > cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG > > @@ -158,7 +153,7 @@ L(first_vec_x1): > > leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax > > ret > > > > - .p2align 4 > > + .p2align 4,, 10 > > L(first_vec_x2): > > # ifndef USE_AS_STRCHRNUL > > /* Check to see if first match was CHAR (k0) or null (k1). */ > > @@ -179,6 +174,21 @@ L(first_vec_x2): > > leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > > ret > > > > + .p2align 4,, 10 > > +L(first_vec_x3): > > + /* Use bsf here to save 1-byte keeping the block in 1x > > + fetch block. eax guaranteed non-zero. */ > > + bsfl %eax, %eax > > +# ifndef USE_AS_STRCHRNUL > > + /* Found CHAR or the null byte. */ > > + cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG > > + jne L(zero) > > +# endif > > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > > + bytes. */ > > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > > + ret > > + > > .p2align 4 > > L(aligned_more): > > /* Align data to VEC_SIZE. */ > > @@ -195,7 +205,7 @@ L(cross_page_continue): > > vpxorq %YMM1, %YMM0, %YMM2 > > VPMINU %YMM2, %YMM1, %YMM2 > > /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > > - VPCMP $0, %YMMZERO, %YMM2, %k0 > > + VPTESTN %YMM2, %YMM2, %k0 > > kmovd %k0, %eax > > testl %eax, %eax > > jnz L(first_vec_x1) > > @@ -206,7 +216,7 @@ L(cross_page_continue): > > /* Each bit in K0 represents a CHAR in YMM1. */ > > VPCMP $0, %YMM1, %YMM0, %k0 > > /* Each bit in K1 represents a CHAR in YMM1. 
*/ > > - VPCMP $0, %YMM1, %YMMZERO, %k1 > > + VPTESTN %YMM1, %YMM1, %k1 > > kortestd %k0, %k1 > > jnz L(first_vec_x2) > > > > @@ -215,7 +225,7 @@ L(cross_page_continue): > > vpxorq %YMM1, %YMM0, %YMM2 > > VPMINU %YMM2, %YMM1, %YMM2 > > /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > > - VPCMP $0, %YMMZERO, %YMM2, %k0 > > + VPTESTN %YMM2, %YMM2, %k0 > > kmovd %k0, %eax > > testl %eax, %eax > > jnz L(first_vec_x3) > > @@ -224,7 +234,7 @@ L(cross_page_continue): > > /* Each bit in K0 represents a CHAR in YMM1. */ > > VPCMP $0, %YMM1, %YMM0, %k0 > > /* Each bit in K1 represents a CHAR in YMM1. */ > > - VPCMP $0, %YMM1, %YMMZERO, %k1 > > + VPTESTN %YMM1, %YMM1, %k1 > > kortestd %k0, %k1 > > jnz L(first_vec_x4) > > > > @@ -265,33 +275,33 @@ L(loop_4x_vec): > > VPMINU %YMM3, %YMM4, %YMM4 > > VPMINU %YMM2, %YMM4, %YMM4{%k4}{z} > > > > - VPCMP $0, %YMMZERO, %YMM4, %k1 > > + VPTESTN %YMM4, %YMM4, %k1 > > kmovd %k1, %ecx > > subq $-(VEC_SIZE * 4), %rdi > > testl %ecx, %ecx > > jz L(loop_4x_vec) > > > > - VPCMP $0, %YMMZERO, %YMM1, %k0 > > + VPTESTN %YMM1, %YMM1, %k0 > > kmovd %k0, %eax > > testl %eax, %eax > > jnz L(last_vec_x1) > > > > - VPCMP $0, %YMMZERO, %YMM2, %k0 > > + VPTESTN %YMM2, %YMM2, %k0 > > kmovd %k0, %eax > > testl %eax, %eax > > jnz L(last_vec_x2) > > > > - VPCMP $0, %YMMZERO, %YMM3, %k0 > > + VPTESTN %YMM3, %YMM3, %k0 > > kmovd %k0, %eax > > /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */ > > # ifdef USE_AS_WCSCHR > > sall $8, %ecx > > orl %ecx, %eax > > - tzcntl %eax, %eax > > + bsfl %eax, %eax > > # else > > salq $32, %rcx > > orq %rcx, %rax > > - tzcntq %rax, %rax > > + bsfq %rax, %rax > > # endif > > # ifndef USE_AS_STRCHRNUL > > /* Check if match was CHAR or null. 
*/ > > @@ -303,28 +313,28 @@ L(loop_4x_vec): > > leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > > ret > > > > -# ifndef USE_AS_STRCHRNUL > > -L(zero_end): > > - xorl %eax, %eax > > - ret > > + .p2align 4,, 8 > > +L(last_vec_x1): > > + bsfl %eax, %eax > > +# ifdef USE_AS_WCSCHR > > + /* NB: Multiply wchar_t count by 4 to get the number of bytes. > > + */ > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > +# else > > + addq %rdi, %rax > > # endif > > > > - .p2align 4 > > -L(last_vec_x1): > > - tzcntl %eax, %eax > > # ifndef USE_AS_STRCHRNUL > > /* Check if match was null. */ > > - cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG > > + cmp (%rax), %CHAR_REG > > jne L(zero_end) > > # endif > > - /* NB: Multiply sizeof char type (1 or 4) to get the number of > > - bytes. */ > > - leaq (%rdi, %rax, CHAR_SIZE), %rax > > + > > ret > > > > - .p2align 4 > > + .p2align 4,, 8 > > L(last_vec_x2): > > - tzcntl %eax, %eax > > + bsfl %eax, %eax > > # ifndef USE_AS_STRCHRNUL > > /* Check if match was null. */ > > cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG > > @@ -336,7 +346,7 @@ L(last_vec_x2): > > ret > > > > /* Cold case for crossing page with first load. */ > > - .p2align 4 > > + .p2align 4,, 8 > > L(cross_page_boundary): > > movq %rdi, %rdx > > /* Align rdi. */ > > @@ -346,9 +356,9 @@ L(cross_page_boundary): > > vpxorq %YMM1, %YMM0, %YMM2 > > VPMINU %YMM2, %YMM1, %YMM2 > > /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > > - VPCMP $0, %YMMZERO, %YMM2, %k0 > > + VPTESTN %YMM2, %YMM2, %k0 > > kmovd %k0, %eax > > - /* Remove the leading bits. */ > > + /* Remove the leading bits. */ > > # ifdef USE_AS_WCSCHR > > movl %edx, %SHIFT_REG > > /* NB: Divide shift count by 4 since each bit in K1 represent 4 > > @@ -360,20 +370,24 @@ L(cross_page_boundary): > > /* If eax is zero continue. */ > > testl %eax, %eax > > jz L(cross_page_continue) > > - tzcntl %eax, %eax > > -# ifndef USE_AS_STRCHRNUL > > - /* Check to see if match was CHAR or null. 
*/ > > - cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG > > - jne L(zero_end) > > -# endif > > + bsfl %eax, %eax > > + > > # ifdef USE_AS_WCSCHR > > /* NB: Multiply wchar_t count by 4 to get the number of > > bytes. */ > > leaq (%rdx, %rax, CHAR_SIZE), %rax > > # else > > addq %rdx, %rax > > +# endif > > +# ifndef USE_AS_STRCHRNUL > > + /* Check to see if match was CHAR or null. */ > > + cmp (%rax), %CHAR_REG > > + je L(cross_page_ret) > > +L(zero_end): > > + xorl %eax, %eax > > +L(cross_page_ret): > > # endif > > ret > > > > END (STRCHR) > > -# endif > > +#endif > > -- > > 2.25.1 > > > > LGTM. > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com> > > Thanks. > > -- > H.J. I would like to backport this patch to release branches. Any comments or objections? --Sunil
diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S index f62cd9d144..ec739fb8f9 100644 --- a/sysdeps/x86_64/multiarch/strchr-evex.S +++ b/sysdeps/x86_64/multiarch/strchr-evex.S @@ -30,6 +30,7 @@ # ifdef USE_AS_WCSCHR # define VPBROADCAST vpbroadcastd # define VPCMP vpcmpd +# define VPTESTN vptestnmd # define VPMINU vpminud # define CHAR_REG esi # define SHIFT_REG ecx @@ -37,6 +38,7 @@ # else # define VPBROADCAST vpbroadcastb # define VPCMP vpcmpb +# define VPTESTN vptestnmb # define VPMINU vpminub # define CHAR_REG sil # define SHIFT_REG edx @@ -61,13 +63,11 @@ # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) .section .text.evex,"ax",@progbits -ENTRY (STRCHR) +ENTRY_P2ALIGN (STRCHR, 5) /* Broadcast CHAR to YMM0. */ VPBROADCAST %esi, %YMM0 movl %edi, %eax andl $(PAGE_SIZE - 1), %eax - vpxorq %XMMZERO, %XMMZERO, %XMMZERO - /* Check if we cross page boundary with one vector load. Otherwise it is safe to use an unaligned load. */ cmpl $(PAGE_SIZE - VEC_SIZE), %eax @@ -81,49 +81,35 @@ ENTRY (STRCHR) vpxorq %YMM1, %YMM0, %YMM2 VPMINU %YMM2, %YMM1, %YMM2 /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM2, %k0 + VPTESTN %YMM2, %YMM2, %k0 kmovd %k0, %eax testl %eax, %eax jz L(aligned_more) tzcntl %eax, %eax +# ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG + /* NB: Use a branch instead of cmovcc here. The expectation is + that with strchr the user will branch based on input being + null. Since this branch will be 100% predictive of the user + branch a branch miss here should save what otherwise would + be branch miss in the user code. Otherwise using a branch 1) + saves code size and 2) is faster in highly predictable + environments. */ + jne L(zero) +# endif # ifdef USE_AS_WCSCHR /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ leaq (%rdi, %rax, CHAR_SIZE), %rax # else addq %rdi, %rax -# endif -# ifndef USE_AS_STRCHRNUL - /* Found CHAR or the null byte. */ - cmp (%rax), %CHAR_REG - jne L(zero) # endif ret - /* .p2align 5 helps keep performance more consistent if ENTRY() - alignment % 32 was either 16 or 0. As well this makes the - alignment % 32 of the loop_4x_vec fixed which makes tuning it - easier. */ - .p2align 5 -L(first_vec_x3): - tzcntl %eax, %eax -# ifndef USE_AS_STRCHRNUL - /* Found CHAR or the null byte. */ - cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG - jne L(zero) -# endif - /* NB: Multiply sizeof char type (1 or 4) to get the number of - bytes. */ - leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax - ret -# ifndef USE_AS_STRCHRNUL -L(zero): - xorl %eax, %eax - ret -# endif - .p2align 4 + .p2align 4,, 10 L(first_vec_x4): # ifndef USE_AS_STRCHRNUL /* Check to see if first match was CHAR (k0) or null (k1). */ @@ -144,9 +130,18 @@ L(first_vec_x4): leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax ret +# ifndef USE_AS_STRCHRNUL +L(zero): + xorl %eax, %eax + ret +# endif + + .p2align 4 L(first_vec_x1): - tzcntl %eax, %eax + /* Use bsf here to save 1-byte keeping the block in 1x + fetch block. eax guaranteed non-zero. */ + bsfl %eax, %eax # ifndef USE_AS_STRCHRNUL /* Found CHAR or the null byte. */ cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG @@ -158,7 +153,7 @@ L(first_vec_x1): leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax ret - .p2align 4 + .p2align 4,, 10 L(first_vec_x2): # ifndef USE_AS_STRCHRNUL /* Check to see if first match was CHAR (k0) or null (k1). */ @@ -179,6 +174,21 @@ L(first_vec_x2): leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ret + .p2align 4,, 10 +L(first_vec_x3): + /* Use bsf here to save 1-byte keeping the block in 1x + fetch block. eax guaranteed non-zero. */ + bsfl %eax, %eax +# ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte.
*/ + cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG + jne L(zero) +# endif + /* NB: Multiply sizeof char type (1 or 4) to get the number of + bytes. */ + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + ret + .p2align 4 L(aligned_more): /* Align data to VEC_SIZE. */ @@ -195,7 +205,7 @@ L(cross_page_continue): vpxorq %YMM1, %YMM0, %YMM2 VPMINU %YMM2, %YMM1, %YMM2 /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM2, %k0 + VPTESTN %YMM2, %YMM2, %k0 kmovd %k0, %eax testl %eax, %eax jnz L(first_vec_x1) @@ -206,7 +216,7 @@ L(cross_page_continue): /* Each bit in K0 represents a CHAR in YMM1. */ VPCMP $0, %YMM1, %YMM0, %k0 /* Each bit in K1 represents a CHAR in YMM1. */ - VPCMP $0, %YMM1, %YMMZERO, %k1 + VPTESTN %YMM1, %YMM1, %k1 kortestd %k0, %k1 jnz L(first_vec_x2) @@ -215,7 +225,7 @@ L(cross_page_continue): vpxorq %YMM1, %YMM0, %YMM2 VPMINU %YMM2, %YMM1, %YMM2 /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM2, %k0 + VPTESTN %YMM2, %YMM2, %k0 kmovd %k0, %eax testl %eax, %eax jnz L(first_vec_x3) @@ -224,7 +234,7 @@ L(cross_page_continue): /* Each bit in K0 represents a CHAR in YMM1. */ VPCMP $0, %YMM1, %YMM0, %k0 /* Each bit in K1 represents a CHAR in YMM1. */ - VPCMP $0, %YMM1, %YMMZERO, %k1 + VPTESTN %YMM1, %YMM1, %k1 kortestd %k0, %k1 jnz L(first_vec_x4) @@ -265,33 +275,33 @@ L(loop_4x_vec): VPMINU %YMM3, %YMM4, %YMM4 VPMINU %YMM2, %YMM4, %YMM4{%k4}{z} - VPCMP $0, %YMMZERO, %YMM4, %k1 + VPTESTN %YMM4, %YMM4, %k1 kmovd %k1, %ecx subq $-(VEC_SIZE * 4), %rdi testl %ecx, %ecx jz L(loop_4x_vec) - VPCMP $0, %YMMZERO, %YMM1, %k0 + VPTESTN %YMM1, %YMM1, %k0 kmovd %k0, %eax testl %eax, %eax jnz L(last_vec_x1) - VPCMP $0, %YMMZERO, %YMM2, %k0 + VPTESTN %YMM2, %YMM2, %k0 kmovd %k0, %eax testl %eax, %eax jnz L(last_vec_x2) - VPCMP $0, %YMMZERO, %YMM3, %k0 + VPTESTN %YMM3, %YMM3, %k0 kmovd %k0, %eax /* Combine YMM3 matches (eax) with YMM4 matches (ecx). 
*/ # ifdef USE_AS_WCSCHR sall $8, %ecx orl %ecx, %eax - tzcntl %eax, %eax + bsfl %eax, %eax # else salq $32, %rcx orq %rcx, %rax - tzcntq %rax, %rax + bsfq %rax, %rax # endif # ifndef USE_AS_STRCHRNUL /* Check if match was CHAR or null. */ @@ -303,28 +313,28 @@ L(loop_4x_vec): leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ret -# ifndef USE_AS_STRCHRNUL -L(zero_end): - xorl %eax, %eax - ret + .p2align 4,, 8 +L(last_vec_x1): + bsfl %eax, %eax +# ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. + */ + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + addq %rdi, %rax # endif - .p2align 4 -L(last_vec_x1): - tzcntl %eax, %eax # ifndef USE_AS_STRCHRNUL /* Check if match was null. */ - cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG + cmp (%rax), %CHAR_REG jne L(zero_end) # endif - /* NB: Multiply sizeof char type (1 or 4) to get the number of - bytes. */ - leaq (%rdi, %rax, CHAR_SIZE), %rax + ret - .p2align 4 + .p2align 4,, 8 L(last_vec_x2): - tzcntl %eax, %eax + bsfl %eax, %eax # ifndef USE_AS_STRCHRNUL /* Check if match was null. */ cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG @@ -336,7 +346,7 @@ L(last_vec_x2): ret /* Cold case for crossing page with first load. */ - .p2align 4 + .p2align 4,, 8 L(cross_page_boundary): movq %rdi, %rdx /* Align rdi. */ @@ -346,9 +356,9 @@ L(cross_page_boundary): vpxorq %YMM1, %YMM0, %YMM2 VPMINU %YMM2, %YMM1, %YMM2 /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM2, %k0 + VPTESTN %YMM2, %YMM2, %k0 kmovd %k0, %eax - /* Remove the leading bits. */ + /* Remove the leading bits. */ # ifdef USE_AS_WCSCHR movl %edx, %SHIFT_REG /* NB: Divide shift count by 4 since each bit in K1 represent 4 @@ -360,20 +370,24 @@ L(cross_page_boundary): /* If eax is zero continue. */ testl %eax, %eax jz L(cross_page_continue) - tzcntl %eax, %eax -# ifndef USE_AS_STRCHRNUL - /* Check to see if match was CHAR or null. 
*/ - cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG - jne L(zero_end) -# endif + bsfl %eax, %eax + # ifdef USE_AS_WCSCHR /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ leaq (%rdx, %rax, CHAR_SIZE), %rax # else addq %rdx, %rax +# endif +# ifndef USE_AS_STRCHRNUL + /* Check to see if match was CHAR or null. */ + cmp (%rax), %CHAR_REG + je L(cross_page_ret) +L(zero_end): + xorl %eax, %eax +L(cross_page_ret): # endif ret END (STRCHR) -# endif +#endif