Message ID | 20220607041134.2369903-6-goldstein.w.n@gmail.com
---|---
State | New
Series | [v6,1/8] x86: Create header for VEC classes in x86 strings library
On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The new code:
>     1. prioritizes smaller user-arg lengths more.
>     2. optimizes target placement more carefully
>     3. reuses logic more
>     4. fixes up various inefficiencies in the logic. The biggest
>        case here is the `lzcnt` logic for checking returns which
>        saves either a branch or multiple instructions.
>
> The total code size saving is: 306 bytes
> Geometric Mean of all benchmarks New / Old: 0.760
>
> Regressions:
> There are some regressions, particularly where the length (user arg
> length) is large but the position of the match char is near the
> beginning of the string (in the first VEC). This case has roughly a
> 10-20% regression.
>
> This is because the new logic gives the hot path for immediate matches
> to shorter lengths (the more common input); that shorter-length case
> has roughly a 15-45% speedup.
>
> Full xcheck passes on x86_64.
> ---
>  sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S |   1 +
>  sysdeps/x86_64/multiarch/memrchr-avx2.S     | 534 ++++++++++----------
>  2 files changed, 257 insertions(+), 278 deletions(-)
>
> [...]
>
> +	/* Inexpensive place to put this regarding code size / target alignments
> +	   / ICache NLP. Necessary for 2-byte encoding of jump to page cross
> +	   case which in turn in necessary for hot path (len <= VEC_SIZE) to fit

is necessary?

> +	   in first cache line. */
> +L(page_cross):
>
> [...]

OK with the updated comments.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

--
H.J.
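
The `lzcnt` trick that item 4 of the commit message refers to is easiest to see outside of assembly. Below is a minimal C sketch of the idea (illustrative only, not the glibc implementation; the function name and parameters are invented). Here `mask` stands for the `vpmovmskb` result over the 32 bytes ending at `end` (bit 31 corresponds to `*end`), and `len <= 32` is the remaining user length. Because `lzcnt` is defined to return 32 on a zero input, a single compare against `len` handles both "no match in the vector" and "match lies before the start of the buffer", which is what lets the hot path drop a separate branch.

#include <stddef.h>
#include <stdint.h>

static inline char *
last_match_lzcnt (char *end, uint32_t mask, unsigned int len)
{
  /* Model lzcnt's defined zero-input result (32) explicitly, since
     __builtin_clz is undefined for 0.  */
  unsigned int lz = mask == 0 ? 32 : (unsigned int) __builtin_clz (mask);

  /* Covers both mask == 0 (lz == 32 >= len) and a match that falls
     before the user buffer (lz >= len).  */
  if (len <= lz)
    return NULL;

  /* The highest set bit is bit 31 - lz, i.e. the byte at end - lz.  */
  return end - lz;
}

This also shows why the entry code sets up the end pointer as `start + len - 1`: with that choice the result is a plain `end - lz` subtraction, matching the `subq %rcx, %rax` in the new return paths.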
On Tue, Jun 7, 2022 at 11:18 AM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote:
>
> On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > [...]
>
> OK with the updated comments.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil
On Wed, Jul 13, 2022 at 7:26 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> [...]
>
> I would like to backport this patch to release branches.
> Any comments or objections?

Probably should also do:
https://sourceware.org/git/?p=glibc.git;a=commit;h=227afaa67213efcdce6a870ef5086200f1076438

>
> --Sunil
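
One subtlety worth spelling out before the raw diff below: the new entry sequence decides with a single `test` whether the unaligned 32-byte load ending at the last byte can be issued without faulting. A rough C model of that check (illustrative only; not glibc code, and `may_cross_page` is an invented name):

#include <stdbool.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32

/* Models `testl $(PAGE_SIZE - VEC_SIZE), %eax; jz L(page_cross)`:
   endp is the address of the last byte (end pointer minus one).  If any
   of bits 5..11 is set, the 32-byte load ending at endp stays within one
   4096-byte page.  If they are all clear, the load *might* start in the
   previous page, so the slow L(page_cross) path is taken (an in-page
   offset of exactly 31 is a harmless false positive that the slow path
   also handles correctly).  */
static inline bool
may_cross_page (uintptr_t endp)
{
  return (endp & (PAGE_SIZE - VEC_SIZE)) == 0;
}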
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S index cea2d2a72d..5e9beeeef2 100644 --- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S @@ -2,6 +2,7 @@ # define MEMRCHR __memrchr_avx2_rtm #endif +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S index ba2ce7cb03..bea4528068 100644 --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S @@ -21,340 +21,318 @@ # include <sysdep.h> # ifndef MEMRCHR -# define MEMRCHR __memrchr_avx2 +# define MEMRCHR __memrchr_avx2 # endif # ifndef VZEROUPPER -# define VZEROUPPER vzeroupper +# define VZEROUPPER vzeroupper # endif # ifndef SECTION # define SECTION(p) p##.avx # endif -# define VEC_SIZE 32 +# define VEC_SIZE 32 +# define PAGE_SIZE 4096 + .section SECTION(.text), "ax", @progbits +ENTRY(MEMRCHR) +# ifdef __ILP32__ + /* Clear upper bits. */ + and %RDX_LP, %RDX_LP +# else + test %RDX_LP, %RDX_LP +# endif + jz L(zero_0) - .section SECTION(.text),"ax",@progbits -ENTRY (MEMRCHR) - /* Broadcast CHAR to YMM0. */ vmovd %esi, %xmm0 - vpbroadcastb %xmm0, %ymm0 - - sub $VEC_SIZE, %RDX_LP - jbe L(last_vec_or_less) - - add %RDX_LP, %RDI_LP - - /* Check the last VEC_SIZE bytes. */ - vpcmpeqb (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x0) + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a + correct page cross check and 2) it correctly sets up end ptr to be + subtract by lzcnt aligned. */ + leaq -1(%rdx, %rdi), %rax - subq $(VEC_SIZE * 4), %rdi - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(aligned_more) + vpbroadcastb %xmm0, %ymm0 - /* Align data for aligned loads in the loop. */ - addq $VEC_SIZE, %rdi - addq $VEC_SIZE, %rdx - andq $-VEC_SIZE, %rdi - subq %rcx, %rdx + /* Check if we can load 1x VEC without cross a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %eax + jz L(page_cross) + + vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + cmpq $VEC_SIZE, %rdx + ja L(more_1x_vec) + +L(ret_vec_x0_test): + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which + will gurantee edx (len) is less than it. */ + lzcntl %ecx, %ecx + + /* Hoist vzeroupper (not great for RTM) to save code size. This allows + all logic for edx (len) <= VEC_SIZE to fit in first cache line. */ + COND_VZEROUPPER + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret - .p2align 4 -L(aligned_more): - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time - since data is only aligned to VEC_SIZE. */ - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - vpcmpeqb (%rdi), %ymm0, %ymm4 - vpmovmskb %ymm4, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - /* Align data to 4 * VEC_SIZE for loop with fewer branches. - There are some overlaps with above if data isn't aligned - to 4 * VEC_SIZE. 
*/ - movl %edi, %ecx - andl $(VEC_SIZE * 4 - 1), %ecx - jz L(loop_4x_vec) - - addq $(VEC_SIZE * 4), %rdi - addq $(VEC_SIZE * 4), %rdx - andq $-(VEC_SIZE * 4), %rdi - subq %rcx, %rdx + /* Fits in aligning bytes of first cache line. */ +L(zero_0): + xorl %eax, %eax + ret - .p2align 4 -L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - subq $(VEC_SIZE * 4), %rdi - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - vmovdqa (%rdi), %ymm1 - vmovdqa VEC_SIZE(%rdi), %ymm2 - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 - - vpcmpeqb %ymm1, %ymm0, %ymm1 - vpcmpeqb %ymm2, %ymm0, %ymm2 - vpcmpeqb %ymm3, %ymm0, %ymm3 - vpcmpeqb %ymm4, %ymm0, %ymm4 - - vpor %ymm1, %ymm2, %ymm5 - vpor %ymm3, %ymm4, %ymm6 - vpor %ymm5, %ymm6, %ymm5 - - vpmovmskb %ymm5, %eax - testl %eax, %eax - jz L(loop_4x_vec) - - /* There is a match. */ - vpmovmskb %ymm4, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - vpmovmskb %ymm1, %eax - bsrl %eax, %eax - addq %rdi, %rax + .p2align 4,, 9 +L(ret_vec_x0): + lzcntl %ecx, %ecx + subq %rcx, %rax L(return_vzeroupper): ZERO_UPPER_VEC_REGISTERS_RETURN - .p2align 4 -L(last_4x_vec_or_less): - addl $(VEC_SIZE * 4), %edx - cmpl $(VEC_SIZE * 2), %edx - jbe L(last_2x_vec) - - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(last_vec_x1_check) - cmpl $(VEC_SIZE * 3), %edx - jbe L(zero) - - vpcmpeqb (%rdi), %ymm0, %ymm4 - vpmovmskb %ymm4, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 4), %rdx - addq %rax, %rdx - jl L(zero) - addq %rdi, %rax - VZEROUPPER_RETURN - - .p2align 4 + .p2align 4,, 10 +L(more_1x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0) + + /* Align rax (string pointer). */ + andq $-VEC_SIZE, %rax + + /* Recompute remaining length after aligning. */ + movq %rax, %rdx + /* Need this comparison next no matter what. */ + vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1 + subq %rdi, %rdx + decq %rax + vpmovmskb %ymm1, %ecx + /* Fall through for short (hotter than length). */ + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) L(last_2x_vec): - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x3_check) cmpl $VEC_SIZE, %edx - jbe L(zero) - - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 2), %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax - VZEROUPPER_RETURN - - .p2align 4 -L(last_vec_x0): - bsrl %eax, %eax - addq %rdi, %rax - VZEROUPPER_RETURN + jbe L(ret_vec_x0_test) + + testl %ecx, %ecx + jnz L(ret_vec_x0) + + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + /* 64-bit lzcnt. This will naturally add 32 to position. */ + lzcntq %rcx, %rcx + COND_VZEROUPPER + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret - .p2align 4 -L(last_vec_x1): - bsrl %eax, %eax - addl $VEC_SIZE, %eax - addq %rdi, %rax - VZEROUPPER_RETURN - .p2align 4 -L(last_vec_x2): - bsrl %eax, %eax - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax + /* Inexpensive place to put this regarding code size / target alignments + / ICache NLP. 
+L(page_cross):
+	movq	%rax, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vpcmpeqb (%rsi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	/* Shift out negative alignment (because we are starting from endptr
+	   and working backwards).  */
+	movl	%eax, %r8d
+	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
+	notl	%r8d
+	shlxl	%r8d, %ecx, %ecx
+	cmpq	%rdi, %rsi
+	ja	L(more_1x_vec)
+	lzcntl	%ecx, %ecx
+	COND_VZEROUPPER
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
+	.p2align 4,, 11
+L(ret_vec_x1):
+	/* This will naturally add 32 to position.  */
+	lzcntq	%rcx, %rcx
+	subq	%rcx, %rax
 	VZEROUPPER_RETURN
+	.p2align 4,, 10
+L(more_2x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
 
-	.p2align 4
-L(last_vec_x3):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	ret
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
 
-	.p2align 4
-L(last_vec_x1_check):
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 3), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
-	.p2align 4
-L(last_vec_x3_check):
-	bsrl	%eax, %eax
-	subq	$VEC_SIZE, %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	/* Needed no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	VZEROUPPER_RETURN
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
+
+	cmpl	$(VEC_SIZE * -1), %edx
+	jle	L(ret_vec_x2_test)
+
+L(last_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
+	/* Needed no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 3), %rax
+	COND_VZEROUPPER
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_2)
+	ret
 
-	.p2align 4
-L(null):
+	/* Fits in aligning bytes.  */
+L(zero_2):
 	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(last_vec_or_less_aligned):
-	movl	%edx, %ecx
+	.p2align 4,, 4
+L(ret_vec_x2_test):
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2), %rax
+	COND_VZEROUPPER
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_2)
+	ret
 
-	vpcmpeqb (%rdi), %ymm0, %ymm1
-	movl	$1, %edx
-	/* Support rdx << 32.  */
-	salq	%cl, %rdx
-	subq	$1, %rdx
+	.p2align 4,, 11
+L(ret_vec_x2):
+	/* ecx must be non-zero.  */
+	bsrl	%ecx, %ecx
+	leaq	(VEC_SIZE * -3 + 1)(%rcx, %rax), %rax
+	VZEROUPPER_RETURN
 
-	vpmovmskb %ymm1, %eax
+	.p2align 4,, 14
+L(ret_vec_x3):
+	/* ecx must be non-zero.  */
+	bsrl	%ecx, %ecx
+	leaq	(VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
+	VZEROUPPER_RETURN
 
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
 	.p2align 4
-L(last_vec_or_less):
-	addl	$VEC_SIZE, %edx
+L(more_4x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
 
-	/* Check for zero length.  */
-	testl	%edx, %edx
-	jz	L(null)
+	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(last_vec_or_less_aligned)
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
 
-	movl	%ecx, %esi
-	movl	%ecx, %r8d
-	addl	%edx, %esi
-	andq	$-VEC_SIZE, %rdi
+	/* Check if near end before re-aligning (otherwise might do an
+	   unnecessary loop iteration).  */
+	addq	$-(VEC_SIZE * 4), %rax
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
 
-	subl	$VEC_SIZE, %esi
-	ja	L(last_vec_2x_aligned)
+	/* Align rax to (VEC_SIZE * 4 - 1).  */
+	orq	$(VEC_SIZE * 4 - 1), %rax
+	movq	%rdi, %rdx
+	/* Get endptr for loop in rdx.  NB: Can't just do `while (rax > rdi)`
+	   because lengths that overflow can be valid and would break the
+	   comparison.  */
+	orq	$(VEC_SIZE * 4 - 1), %rdx
 
-	/* Check the last VEC.  */
-	vpcmpeqb (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-
-	/* Remove the leading and trailing bytes.  */
-	sarl	%cl, %eax
-	movl	%edx, %ecx
 
 	.p2align 4
+L(loop_4x_vec):
+	/* Need this comparison next no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2
+	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3
+	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	vpor	%ymm1, %ymm2, %ymm2
+	vpor	%ymm3, %ymm4, %ymm4
+	vpor	%ymm2, %ymm4, %ymm4
+	vpmovmskb %ymm4, %esi
 
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	testl	%esi, %esi
+	jnz	L(loop_end)
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
-	VZEROUPPER_RETURN
+	addq	$(VEC_SIZE * -4), %rax
+	cmpq	%rdx, %rax
+	jne	L(loop_4x_vec)
 
-	.p2align 4
-L(last_vec_2x_aligned):
-	movl	%esi, %ecx
+	subl	%edi, %edx
+	incl	%edx
 
-	/* Check the last VEC.  */
-	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1
+L(last_4x_vec):
+	/* Used no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
 
-	vpmovmskb %ymm1, %eax
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_end)
 
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1_end)
 
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
+	/* Used no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	/* Check the second last VEC.  */
-	vpcmpeqb (%rdi), %ymm0, %ymm1
+	cmpl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)
+
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2), %rax
+	COND_VZEROUPPER
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	jbe	L(ret0)
+	xorl	%eax, %eax
+L(ret0):
+	ret
 
-	movl	%r8d, %ecx
-	vpmovmskb %ymm1, %eax
 	.p2align 4
+L(loop_end):
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_end)
+
+	vpmovmskb %ymm2, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1_end)
+
+	vpmovmskb %ymm3, %ecx
+	/* Combine last 2 VEC matches.  If ecx (VEC3) is zero (no CHAR in
+	   VEC3) then it won't affect the result in esi (VEC4).  If ecx is
+	   non-zero then CHAR is in VEC3 and bsrq will use that position.  */
+	salq	$32, %rcx
+	orq	%rsi, %rcx
+	bsrq	%rcx, %rcx
+	leaq	(VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
+	VZEROUPPER_RETURN
 
-	/* Remove the leading bytes.  Must use unsigned right shift for
-	   bsrl below.  */
-	shrl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	.p2align 4,, 4
+L(ret_vec_x1_end):
+	/* 64-bit version will automatically add 32 (VEC_SIZE).  */
+	lzcntq	%rcx, %rcx
+	subq	%rcx, %rax
+	VZEROUPPER_RETURN
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
+	.p2align 4,, 4
+L(ret_vec_x0_end):
+	lzcntl	%ecx, %ecx
+	subq	%rcx, %rax
 	VZEROUPPER_RETURN
-END (MEMRCHR)
+
+	/* 2 bytes until next cache line.  */
+END(MEMRCHR)
 #endif
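
For reviewers tracing the new return paths, a minimal C sketch of the end-pointer/lzcnt trick (function name and shape are mine, not part of the patch). Keeping `end = s + len - 1` in %rax means every hit returns as `end - lzcnt(mask)`, and lzcnt's defined result of 32 for a zero input folds the no-match case into the same length check:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical helper mirroring L(ret_vec_x0_test): `mask` stands for
   the vpmovmskb result of comparing the 32-byte vector that ends at
   `end` (bit i corresponds to byte end - 31 + i).  */
static const char *
ret_vec_x0_test (const char *end, uint32_t mask, size_t len)
{
  /* Hardware lzcnt returns 32 for a zero input, so the no-match case
     falls out of the same bounds check (len <= 32 on this path).  */
  unsigned int lz = mask == 0 ? 32 : (unsigned int) __builtin_clz (mask);
  /* cmpl %ecx, %edx; jle L(zero_0).  */
  if (len <= lz)
    return NULL;
  /* subq %rcx, %rax; ret.  */
  return end - lz;
}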
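The entry page-cross test works on that same end pointer; a sketch of the predicate under the patch's PAGE_SIZE/VEC_SIZE values (again illustrative, not the patch's code):

#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE 32

/* The 32-byte load covers [end - 31, end], so it can only touch the
   previous page when `end` sits in the first VEC_SIZE bytes of a page,
   i.e. when page-offset bits 5..11 of `end` are all zero.  Slightly
   conservative: offset 31 is also flagged although that load fits.  */
static int
may_cross_page (uintptr_t end)
{
  /* testl $(PAGE_SIZE - VEC_SIZE), %eax; jz L(page_cross).  */
  return (end & (PAGE_SIZE - VEC_SIZE)) == 0;
}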
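In L(page_cross) the backward over-read is then repaired in the mask rather than the pointer; a sketch of the notl/shlx fixup:

#include <stdint.h>

/* After the aligned load at (end & -32), bit i of `mask` is byte
   ((end & -32) + i).  Bytes above `end` must not count, so shift the
   bit for `end` up to bit 31, discarding everything higher.  The shift
   count 31 - (end % 32) equals (~end) % 32, which is why the assembly
   can use notl + shlxl on the end pointer directly.  lzcnt of the
   fixed-up mask then yields `end - lzcnt` exactly as in the unaligned
   case.  */
static uint32_t
page_cross_fixup (uint32_t mask, uintptr_t end)
{
  return mask << (~end & 31);
}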
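Finally, the L(loop_end) combine that lets a single bsrq resolve the last two vectors; the result is a byte offset from the lowest address of the 4-VEC block, matching the `leaq (VEC_SIZE * -4 + 1)(%rcx, %rax)` above (sketch under the same caveats):

#include <stdint.h>

/* VEC3 covers the higher 32 addresses of the final pair, so its mask
   goes in the high half.  (In the patch the low half is really
   mask3 | mask4, which is harmless: whenever VEC3 has a match the high
   half wins the bsr anyway.)  Caller must guarantee at least one
   match; clz/bsr of 0 is undefined.  */
static unsigned int
loop_end_offset (uint32_t mask_vec3, uint32_t mask_vec4)
{
  uint64_t combined = ((uint64_t) mask_vec3 << 32) | mask_vec4;
  /* salq $32, %rcx; orq %rsi, %rcx; bsrq %rcx, %rcx.  */
  return 63 - (unsigned int) __builtin_clzll (combined);
}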