Message ID | 20220607041134.2369903-4-goldstein.w.n@gmail.com |
---|---|
State | New |
Headers | show |
Series | [v6,1/8] x86: Create header for VEC classes in x86 strings library | expand |
On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > The new code: > 1. prioritizes smaller lengths more. > 2. optimizes target placement more carefully. > 3. reuses logic more. > 4. fixes up various inefficiencies in the logic. > > The total code size saving is: 394 bytes > Geometric Mean of all benchmarks New / Old: 0.874 > > Regressions: > 1. The page cross case is now colder, especially re-entry from the > page cross case if a match is not found in the first VEC > (roughly 50%). My general opinion with this patch is this is > acceptable given the "coldness" of this case (less than 4%) and > generally performance improvement in the other far more common > cases. > > 2. There are some regressions 5-15% for medium/large user-arg > lengths that have a match in the first VEC. This is because the > logic was rewritten to optimize finds in the first VEC if the > user-arg length is shorter (where we see roughly 20-50% > performance improvements). It is not always the case this is a > regression. My intuition is some frontend quirk is partially > explaining the data although I haven't been able to find the > root cause. > > Full xcheck passes on x86_64. > --- > sysdeps/x86_64/memrchr.S | 613 +++++++++++++++++++-------------------- > 1 file changed, 292 insertions(+), 321 deletions(-) > > diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S > index d1a9f47911..b0dffd2ae2 100644 > --- a/sysdeps/x86_64/memrchr.S > +++ b/sysdeps/x86_64/memrchr.S > @@ -18,362 +18,333 @@ > <https://www.gnu.org/licenses/>. */ > > #include <sysdep.h> > +#define VEC_SIZE 16 > +#define PAGE_SIZE 4096 > > .text > -ENTRY (__memrchr) > - movd %esi, %xmm1 > - > - sub $16, %RDX_LP > - jbe L(length_less16) > - > - punpcklbw %xmm1, %xmm1 > - punpcklbw %xmm1, %xmm1 > - > - add %RDX_LP, %RDI_LP > - pshufd $0, %xmm1, %xmm1 > - > - movdqu (%rdi), %xmm0 > - pcmpeqb %xmm1, %xmm0 > - > -/* Check if there is a match. */ > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches0) > - > - sub $64, %rdi > - mov %edi, %ecx > - and $15, %ecx > - jz L(loop_prolog) > - > - add $16, %rdi > - add $16, %rdx > - and $-16, %rdi > - sub %rcx, %rdx > - > - .p2align 4 > -L(loop_prolog): > - sub $64, %rdx > - jbe L(exit_loop) > - > - movdqa 48(%rdi), %xmm0 > - pcmpeqb %xmm1, %xmm0 > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches48) > - > - movdqa 32(%rdi), %xmm2 > - pcmpeqb %xmm1, %xmm2 > - pmovmskb %xmm2, %eax > - test %eax, %eax > - jnz L(matches32) > - > - movdqa 16(%rdi), %xmm3 > - pcmpeqb %xmm1, %xmm3 > - pmovmskb %xmm3, %eax > - test %eax, %eax > - jnz L(matches16) > - > - movdqa (%rdi), %xmm4 > - pcmpeqb %xmm1, %xmm4 > - pmovmskb %xmm4, %eax > - test %eax, %eax > - jnz L(matches0) > - > - sub $64, %rdi > - sub $64, %rdx > - jbe L(exit_loop) > - > - movdqa 48(%rdi), %xmm0 > - pcmpeqb %xmm1, %xmm0 > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches48) > - > - movdqa 32(%rdi), %xmm2 > - pcmpeqb %xmm1, %xmm2 > - pmovmskb %xmm2, %eax > - test %eax, %eax > - jnz L(matches32) > - > - movdqa 16(%rdi), %xmm3 > - pcmpeqb %xmm1, %xmm3 > - pmovmskb %xmm3, %eax > - test %eax, %eax > - jnz L(matches16) > - > - movdqa (%rdi), %xmm3 > - pcmpeqb %xmm1, %xmm3 > - pmovmskb %xmm3, %eax > - test %eax, %eax > - jnz L(matches0) > - > - mov %edi, %ecx > - and $63, %ecx > - jz L(align64_loop) > - > - add $64, %rdi > - add $64, %rdx > - and $-64, %rdi > - sub %rcx, %rdx > - > - .p2align 4 > -L(align64_loop): > - sub $64, %rdi > - sub $64, %rdx > - jbe L(exit_loop) > - > - movdqa (%rdi), %xmm0 > - movdqa 16(%rdi), %xmm2 > - movdqa 32(%rdi), %xmm3 > - movdqa 48(%rdi), %xmm4 > - > - pcmpeqb %xmm1, %xmm0 > - pcmpeqb %xmm1, %xmm2 > - pcmpeqb %xmm1, %xmm3 > - pcmpeqb %xmm1, %xmm4 > - > - pmaxub %xmm3, %xmm0 > - pmaxub %xmm4, %xmm2 > - pmaxub %xmm0, %xmm2 > - pmovmskb %xmm2, %eax > - > - test %eax, %eax > - jz L(align64_loop) > - > - pmovmskb %xmm4, %eax > - test %eax, %eax > - jnz L(matches48) > - > - pmovmskb %xmm3, %eax > - test %eax, %eax > - jnz L(matches32) > - > - movdqa 16(%rdi), %xmm2 > - > - pcmpeqb %xmm1, %xmm2 > - pcmpeqb (%rdi), %xmm1 > - > - pmovmskb %xmm2, %eax > - test %eax, %eax > - jnz L(matches16) > - > - pmovmskb %xmm1, %eax > - bsr %eax, %eax > - > - add %rdi, %rax > +ENTRY_P2ALIGN(__memrchr, 6) > +#ifdef __ILP32__ > + /* Clear upper bits. */ > + mov %RDX_LP, %RDX_LP > +#endif > + movd %esi, %xmm0 > + > + /* Get end pointer. */ > + leaq (%rdx, %rdi), %rcx > + > + punpcklbw %xmm0, %xmm0 > + punpcklwd %xmm0, %xmm0 > + pshufd $0, %xmm0, %xmm0 > + > + /* Check if we can load 1x VEC without cross a page. */ > + testl $(PAGE_SIZE - VEC_SIZE), %ecx > + jz L(page_cross) > + > + /* NB: This load happens regardless of whether rdx (len) is zero. Since > + it doesn't cross a page and the standard gurantees any pointer have > + at least one-valid byte this load must be safe. For the entire > + history of the x86 memrchr implementation this has been possible so > + no code "should" be relying on a zero-length check before this load. > + The zero-length check is moved to the page cross case because it is > + 1) pretty cold and including it pushes the hot case len <= VEC_SIZE > + into 2-cache lines. */ > + movups -(VEC_SIZE)(%rcx), %xmm1 > + pcmpeqb %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > + > + subq $VEC_SIZE, %rdx > + ja L(more_1x_vec) > +L(ret_vec_x0_test): > + /* Zero-flag set if eax (src) is zero. Destination unchanged if src is > + zero. */ > + bsrl %eax, %eax > + jz L(ret_0) > + /* Check if the CHAR match is in bounds. Need to truly zero `eax` here > + if out of bounds. */ > + addl %edx, %eax > + jl L(zero_0) > + /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base > + ptr. */ > + addq %rdi, %rax > +L(ret_0): > ret > > - .p2align 4 > -L(exit_loop): > - add $64, %edx > - cmp $32, %edx > - jbe L(exit_loop_32) > - > - movdqa 48(%rdi), %xmm0 > - pcmpeqb %xmm1, %xmm0 > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches48) > - > - movdqa 32(%rdi), %xmm2 > - pcmpeqb %xmm1, %xmm2 > - pmovmskb %xmm2, %eax > - test %eax, %eax > - jnz L(matches32) > - > - movdqa 16(%rdi), %xmm3 > - pcmpeqb %xmm1, %xmm3 > - pmovmskb %xmm3, %eax > - test %eax, %eax > - jnz L(matches16_1) > - cmp $48, %edx > - jbe L(return_null) > - > - pcmpeqb (%rdi), %xmm1 > - pmovmskb %xmm1, %eax > - test %eax, %eax > - jnz L(matches0_1) > - xor %eax, %eax > + .p2align 4,, 5 > +L(ret_vec_x0): > + bsrl %eax, %eax > + leaq -(VEC_SIZE)(%rcx, %rax), %rax > ret > > - .p2align 4 > -L(exit_loop_32): > - movdqa 48(%rdi), %xmm0 > - pcmpeqb %xmm1, %xmm0 > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches48_1) > - cmp $16, %edx > - jbe L(return_null) > - > - pcmpeqb 32(%rdi), %xmm1 > - pmovmskb %xmm1, %eax > - test %eax, %eax > - jnz L(matches32_1) > - xor %eax, %eax > + .p2align 4,, 2 > +L(zero_0): > + xorl %eax, %eax > ret > > - .p2align 4 > -L(matches0): > - bsr %eax, %eax > - add %rdi, %rax > - ret > - > - .p2align 4 > -L(matches16): > - bsr %eax, %eax > - lea 16(%rax, %rdi), %rax > - ret > > - .p2align 4 > -L(matches32): > - bsr %eax, %eax > - lea 32(%rax, %rdi), %rax > + .p2align 4,, 8 > +L(more_1x_vec): > + testl %eax, %eax > + jnz L(ret_vec_x0) > + > + /* Align rcx (pointer to string). */ > + decq %rcx > + andq $-VEC_SIZE, %rcx > + > + movq %rcx, %rdx > + /* NB: We could consistenyl save 1-byte in this pattern with `movaps > + %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is > + it adds more frontend uops (even if the moves can be eliminated) and > + some percentage of the time actual backend uops. */ > + movaps -(VEC_SIZE)(%rcx), %xmm1 > + pcmpeqb %xmm0, %xmm1 > + subq %rdi, %rdx > + pmovmskb %xmm1, %eax > + > + cmpq $(VEC_SIZE * 2), %rdx > + ja L(more_2x_vec) > +L(last_2x_vec): > + subl $VEC_SIZE, %edx > + jbe L(ret_vec_x0_test) > + > + testl %eax, %eax > + jnz L(ret_vec_x0) > + > + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 > + pcmpeqb %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > + > + subl $VEC_SIZE, %edx > + bsrl %eax, %eax > + jz L(ret_1) > + addl %edx, %eax > + jl L(zero_0) > + addq %rdi, %rax > +L(ret_1): > ret > > - .p2align 4 > -L(matches48): > - bsr %eax, %eax > - lea 48(%rax, %rdi), %rax > + /* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross) > + causes the hot pause (length <= VEC_SIZE) to span multiple cache > + lines. Naturally aligned % 16 to 8-bytes. */ > +L(page_cross): > + /* Zero length check. */ > + testq %rdx, %rdx > + jz L(zero_0) > + > + leaq -1(%rcx), %r8 > + andq $-(VEC_SIZE), %r8 > + > + movaps (%r8), %xmm1 > + pcmpeqb %xmm0, %xmm1 > + pmovmskb %xmm1, %esi > + /* Shift out negative alignment (because we are starting from endptr and > + working backwards). */ > + negl %ecx > + /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count > + explicitly. */ > + andl $(VEC_SIZE - 1), %ecx > + shl %cl, %esi > + movzwl %si, %eax > + leaq (%rdi, %rdx), %rcx > + cmpq %rdi, %r8 > + ja L(more_1x_vec) > + subl $VEC_SIZE, %edx > + bsrl %eax, %eax > + jz L(ret_2) > + addl %edx, %eax > + jl L(zero_1) > + addq %rdi, %rax > +L(ret_2): > ret > > - .p2align 4 > -L(matches0_1): > - bsr %eax, %eax > - sub $64, %rdx > - add %rax, %rdx > - jl L(return_null) > - add %rdi, %rax > + /* Fits in aliging bytes. */ > +L(zero_1): > + xorl %eax, %eax > ret > > - .p2align 4 > -L(matches16_1): > - bsr %eax, %eax > - sub $48, %rdx > - add %rax, %rdx > - jl L(return_null) > - lea 16(%rdi, %rax), %rax > + .p2align 4,, 5 > +L(ret_vec_x1): > + bsrl %eax, %eax > + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax > ret > > - .p2align 4 > -L(matches32_1): > - bsr %eax, %eax > - sub $32, %rdx > - add %rax, %rdx > - jl L(return_null) > - lea 32(%rdi, %rax), %rax > - ret > + .p2align 4,, 8 > +L(more_2x_vec): > + testl %eax, %eax > + jnz L(ret_vec_x0) > > - .p2align 4 > -L(matches48_1): > - bsr %eax, %eax > - sub $16, %rdx > - add %rax, %rdx > - jl L(return_null) > - lea 48(%rdi, %rax), %rax > - ret > + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 > + pcmpeqb %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > + testl %eax, %eax > + jnz L(ret_vec_x1) > > - .p2align 4 > -L(return_null): > - xor %eax, %eax > - ret > > - .p2align 4 > -L(length_less16_offset0): > - test %edx, %edx > - jz L(return_null) > + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 > + pcmpeqb %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > > - mov %dl, %cl > - pcmpeqb (%rdi), %xmm1 > + subq $(VEC_SIZE * 4), %rdx > + ja L(more_4x_vec) > > - mov $1, %edx > - sal %cl, %edx > - sub $1, %edx > + addl $(VEC_SIZE), %edx > + jle L(ret_vec_x2_test) > > - pmovmskb %xmm1, %eax > +L(last_vec): > + testl %eax, %eax > + jnz L(ret_vec_x2) > > - and %edx, %eax > - test %eax, %eax > - jz L(return_null) > + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 > + pcmpeqb %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > > - bsr %eax, %eax > - add %rdi, %rax > + subl $(VEC_SIZE), %edx > + bsrl %eax, %eax > + jz L(ret_3) > + addl %edx, %eax > + jl L(zero_2) > + addq %rdi, %rax > +L(ret_3): > ret > > - .p2align 4 > -L(length_less16): > - punpcklbw %xmm1, %xmm1 > - punpcklbw %xmm1, %xmm1 > - > - add $16, %edx > - > - pshufd $0, %xmm1, %xmm1 > - > - mov %edi, %ecx > - and $15, %ecx > - jz L(length_less16_offset0) > - > - mov %cl, %dh > - mov %ecx, %esi > - add %dl, %dh > - and $-16, %rdi > - > - sub $16, %dh > - ja L(length_less16_part2) > - > - pcmpeqb (%rdi), %xmm1 > - pmovmskb %xmm1, %eax > - > - sar %cl, %eax > - mov %dl, %cl > - > - mov $1, %edx > - sal %cl, %edx > - sub $1, %edx > - > - and %edx, %eax > - test %eax, %eax > - jz L(return_null) > - > - bsr %eax, %eax > - add %rdi, %rax > - add %rsi, %rax > + .p2align 4,, 6 > +L(ret_vec_x2_test): > + bsrl %eax, %eax > + jz L(zero_2) > + addl %edx, %eax > + jl L(zero_2) > + addq %rdi, %rax > ret > > - .p2align 4 > -L(length_less16_part2): > - movdqa 16(%rdi), %xmm2 > - pcmpeqb %xmm1, %xmm2 > - pmovmskb %xmm2, %eax > - > - mov %dh, %cl > - mov $1, %edx > - sal %cl, %edx > - sub $1, %edx > - > - and %edx, %eax > +L(zero_2): > + xorl %eax, %eax > + ret > > - test %eax, %eax > - jnz L(length_less16_part2_return) > > - pcmpeqb (%rdi), %xmm1 > - pmovmskb %xmm1, %eax > + .p2align 4,, 5 > +L(ret_vec_x2): > + bsrl %eax, %eax > + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax > + ret > > - mov %esi, %ecx > - sar %cl, %eax > - test %eax, %eax > - jz L(return_null) > + .p2align 4,, 5 > +L(ret_vec_x3): > + bsrl %eax, %eax > + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax > + ret > > - bsr %eax, %eax > - add %rdi, %rax > - add %rsi, %rax > + .p2align 4,, 8 > +L(more_4x_vec): > + testl %eax, %eax > + jnz L(ret_vec_x2) > + > + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 > + pcmpeqb %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > + > + testl %eax, %eax > + jnz L(ret_vec_x3) > + > + addq $-(VEC_SIZE * 4), %rcx > + cmpq $(VEC_SIZE * 4), %rdx > + jbe L(last_4x_vec) > + > + /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end > + keeping the code from spilling to the next cache line. */ > + addq $(VEC_SIZE * 4 - 1), %rcx > + andq $-(VEC_SIZE * 4), %rcx > + leaq (VEC_SIZE * 4)(%rdi), %rdx > + andq $-(VEC_SIZE * 4), %rdx > + > + .p2align 4,, 11 > +L(loop_4x_vec): > + movaps (VEC_SIZE * -1)(%rcx), %xmm1 > + movaps (VEC_SIZE * -2)(%rcx), %xmm2 > + movaps (VEC_SIZE * -3)(%rcx), %xmm3 > + movaps (VEC_SIZE * -4)(%rcx), %xmm4 > + pcmpeqb %xmm0, %xmm1 > + pcmpeqb %xmm0, %xmm2 > + pcmpeqb %xmm0, %xmm3 > + pcmpeqb %xmm0, %xmm4 > + > + por %xmm1, %xmm2 > + por %xmm3, %xmm4 > + por %xmm2, %xmm4 > + > + pmovmskb %xmm4, %esi > + testl %esi, %esi > + jnz L(loop_end) > + > + addq $-(VEC_SIZE * 4), %rcx > + cmpq %rdx, %rcx > + jne L(loop_4x_vec) > + > + subl %edi, %edx > + > + /* Ends up being 1-byte nop. */ > + .p2align 4,, 2 > +L(last_4x_vec): > + movaps -(VEC_SIZE)(%rcx), %xmm1 > + pcmpeqb %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > + > + cmpl $(VEC_SIZE * 2), %edx > + jbe L(last_2x_vec) > + > + testl %eax, %eax > + jnz L(ret_vec_x0) > + > + > + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 > + pcmpeqb %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > + > + testl %eax, %eax > + jnz L(ret_vec_end) > + > + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 > + pcmpeqb %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > + > + subl $(VEC_SIZE * 3), %edx > + ja L(last_vec) > + bsrl %eax, %eax > + jz L(ret_4) > + addl %edx, %eax > + jl L(zero_3) > + addq %rdi, %rax > +L(ret_4): > ret > > - .p2align 4 > -L(length_less16_part2_return): > - bsr %eax, %eax > - lea 16(%rax, %rdi), %rax > + /* Ends up being 1-byte nop. */ > + .p2align 4,, 3 > +L(loop_end): > + pmovmskb %xmm1, %eax > + sall $16, %eax > + jnz L(ret_vec_end) > + > + pmovmskb %xmm2, %eax > + testl %eax, %eax > + jnz L(ret_vec_end) > + > + pmovmskb %xmm3, %eax > + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) > + then it won't affect the result in esi (VEC4). If ecx is non-zero > + then CHAR in VEC3 and bsrq will use that position. */ > + sall $16, %eax > + orl %esi, %eax > + bsrl %eax, %eax > + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax > ret > > -END (__memrchr) > +L(ret_vec_end): > + bsrl %eax, %eax > + leaq (VEC_SIZE * -2)(%rax, %rcx), %rax > + ret > + /* Use in L(last_4x_vec). In the same cache line. This is just a spare > + aligning bytes. */ > +L(zero_3): > + xorl %eax, %eax > + ret > + /* 2-bytes from next cache line. */ > +END(__memrchr) > weak_alias (__memrchr, memrchr) > -- > 2.34.1 > LGTM. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks.
On Tue, Jun 7, 2022 at 11:07 AM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote: > > On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > The new code: > > 1. prioritizes smaller lengths more. > > 2. optimizes target placement more carefully. > > 3. reuses logic more. > > 4. fixes up various inefficiencies in the logic. > > > > The total code size saving is: 394 bytes > > Geometric Mean of all benchmarks New / Old: 0.874 > > > > Regressions: > > 1. The page cross case is now colder, especially re-entry from the > > page cross case if a match is not found in the first VEC > > (roughly 50%). My general opinion with this patch is this is > > acceptable given the "coldness" of this case (less than 4%) and > > generally performance improvement in the other far more common > > cases. > > > > 2. There are some regressions 5-15% for medium/large user-arg > > lengths that have a match in the first VEC. This is because the > > logic was rewritten to optimize finds in the first VEC if the > > user-arg length is shorter (where we see roughly 20-50% > > performance improvements). It is not always the case this is a > > regression. My intuition is some frontend quirk is partially > > explaining the data although I haven't been able to find the > > root cause. > > > > Full xcheck passes on x86_64. > > --- > > sysdeps/x86_64/memrchr.S | 613 +++++++++++++++++++-------------------- > > 1 file changed, 292 insertions(+), 321 deletions(-) > > > > diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S > > index d1a9f47911..b0dffd2ae2 100644 > > --- a/sysdeps/x86_64/memrchr.S > > +++ b/sysdeps/x86_64/memrchr.S > > @@ -18,362 +18,333 @@ > > <https://www.gnu.org/licenses/>. */ > > > > #include <sysdep.h> > > +#define VEC_SIZE 16 > > +#define PAGE_SIZE 4096 > > > > .text > > -ENTRY (__memrchr) > > - movd %esi, %xmm1 > > - > > - sub $16, %RDX_LP > > - jbe L(length_less16) > > - > > - punpcklbw %xmm1, %xmm1 > > - punpcklbw %xmm1, %xmm1 > > - > > - add %RDX_LP, %RDI_LP > > - pshufd $0, %xmm1, %xmm1 > > - > > - movdqu (%rdi), %xmm0 > > - pcmpeqb %xmm1, %xmm0 > > - > > -/* Check if there is a match. */ > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches0) > > - > > - sub $64, %rdi > > - mov %edi, %ecx > > - and $15, %ecx > > - jz L(loop_prolog) > > - > > - add $16, %rdi > > - add $16, %rdx > > - and $-16, %rdi > > - sub %rcx, %rdx > > - > > - .p2align 4 > > -L(loop_prolog): > > - sub $64, %rdx > > - jbe L(exit_loop) > > - > > - movdqa 48(%rdi), %xmm0 > > - pcmpeqb %xmm1, %xmm0 > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches48) > > - > > - movdqa 32(%rdi), %xmm2 > > - pcmpeqb %xmm1, %xmm2 > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches32) > > - > > - movdqa 16(%rdi), %xmm3 > > - pcmpeqb %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches16) > > - > > - movdqa (%rdi), %xmm4 > > - pcmpeqb %xmm1, %xmm4 > > - pmovmskb %xmm4, %eax > > - test %eax, %eax > > - jnz L(matches0) > > - > > - sub $64, %rdi > > - sub $64, %rdx > > - jbe L(exit_loop) > > - > > - movdqa 48(%rdi), %xmm0 > > - pcmpeqb %xmm1, %xmm0 > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches48) > > - > > - movdqa 32(%rdi), %xmm2 > > - pcmpeqb %xmm1, %xmm2 > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches32) > > - > > - movdqa 16(%rdi), %xmm3 > > - pcmpeqb %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches16) > > - > > - movdqa (%rdi), %xmm3 > > - pcmpeqb %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches0) > > - > > - mov %edi, %ecx > > - and $63, %ecx > > - jz L(align64_loop) > > - > > - add $64, %rdi > > - add $64, %rdx > > - and $-64, %rdi > > - sub %rcx, %rdx > > - > > - .p2align 4 > > -L(align64_loop): > > - sub $64, %rdi > > - sub $64, %rdx > > - jbe L(exit_loop) > > - > > - movdqa (%rdi), %xmm0 > > - movdqa 16(%rdi), %xmm2 > > - movdqa 32(%rdi), %xmm3 > > - movdqa 48(%rdi), %xmm4 > > - > > - pcmpeqb %xmm1, %xmm0 > > - pcmpeqb %xmm1, %xmm2 > > - pcmpeqb %xmm1, %xmm3 > > - pcmpeqb %xmm1, %xmm4 > > - > > - pmaxub %xmm3, %xmm0 > > - pmaxub %xmm4, %xmm2 > > - pmaxub %xmm0, %xmm2 > > - pmovmskb %xmm2, %eax > > - > > - test %eax, %eax > > - jz L(align64_loop) > > - > > - pmovmskb %xmm4, %eax > > - test %eax, %eax > > - jnz L(matches48) > > - > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches32) > > - > > - movdqa 16(%rdi), %xmm2 > > - > > - pcmpeqb %xmm1, %xmm2 > > - pcmpeqb (%rdi), %xmm1 > > - > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches16) > > - > > - pmovmskb %xmm1, %eax > > - bsr %eax, %eax > > - > > - add %rdi, %rax > > +ENTRY_P2ALIGN(__memrchr, 6) > > +#ifdef __ILP32__ > > + /* Clear upper bits. */ > > + mov %RDX_LP, %RDX_LP > > +#endif > > + movd %esi, %xmm0 > > + > > + /* Get end pointer. */ > > + leaq (%rdx, %rdi), %rcx > > + > > + punpcklbw %xmm0, %xmm0 > > + punpcklwd %xmm0, %xmm0 > > + pshufd $0, %xmm0, %xmm0 > > + > > + /* Check if we can load 1x VEC without cross a page. */ > > + testl $(PAGE_SIZE - VEC_SIZE), %ecx > > + jz L(page_cross) > > + > > + /* NB: This load happens regardless of whether rdx (len) is zero. Since > > + it doesn't cross a page and the standard gurantees any pointer have > > + at least one-valid byte this load must be safe. For the entire > > + history of the x86 memrchr implementation this has been possible so > > + no code "should" be relying on a zero-length check before this load. > > + The zero-length check is moved to the page cross case because it is > > + 1) pretty cold and including it pushes the hot case len <= VEC_SIZE > > + into 2-cache lines. */ > > + movups -(VEC_SIZE)(%rcx), %xmm1 > > + pcmpeqb %xmm0, %xmm1 > > + pmovmskb %xmm1, %eax > > + > > + subq $VEC_SIZE, %rdx > > + ja L(more_1x_vec) > > +L(ret_vec_x0_test): > > + /* Zero-flag set if eax (src) is zero. Destination unchanged if src is > > + zero. */ > > + bsrl %eax, %eax > > + jz L(ret_0) > > + /* Check if the CHAR match is in bounds. Need to truly zero `eax` here > > + if out of bounds. */ > > + addl %edx, %eax > > + jl L(zero_0) > > + /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base > > + ptr. */ > > + addq %rdi, %rax > > +L(ret_0): > > ret > > > > - .p2align 4 > > -L(exit_loop): > > - add $64, %edx > > - cmp $32, %edx > > - jbe L(exit_loop_32) > > - > > - movdqa 48(%rdi), %xmm0 > > - pcmpeqb %xmm1, %xmm0 > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches48) > > - > > - movdqa 32(%rdi), %xmm2 > > - pcmpeqb %xmm1, %xmm2 > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches32) > > - > > - movdqa 16(%rdi), %xmm3 > > - pcmpeqb %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches16_1) > > - cmp $48, %edx > > - jbe L(return_null) > > - > > - pcmpeqb (%rdi), %xmm1 > > - pmovmskb %xmm1, %eax > > - test %eax, %eax > > - jnz L(matches0_1) > > - xor %eax, %eax > > + .p2align 4,, 5 > > +L(ret_vec_x0): > > + bsrl %eax, %eax > > + leaq -(VEC_SIZE)(%rcx, %rax), %rax > > ret > > > > - .p2align 4 > > -L(exit_loop_32): > > - movdqa 48(%rdi), %xmm0 > > - pcmpeqb %xmm1, %xmm0 > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches48_1) > > - cmp $16, %edx > > - jbe L(return_null) > > - > > - pcmpeqb 32(%rdi), %xmm1 > > - pmovmskb %xmm1, %eax > > - test %eax, %eax > > - jnz L(matches32_1) > > - xor %eax, %eax > > + .p2align 4,, 2 > > +L(zero_0): > > + xorl %eax, %eax > > ret > > > > - .p2align 4 > > -L(matches0): > > - bsr %eax, %eax > > - add %rdi, %rax > > - ret > > - > > - .p2align 4 > > -L(matches16): > > - bsr %eax, %eax > > - lea 16(%rax, %rdi), %rax > > - ret > > > > - .p2align 4 > > -L(matches32): > > - bsr %eax, %eax > > - lea 32(%rax, %rdi), %rax > > + .p2align 4,, 8 > > +L(more_1x_vec): > > + testl %eax, %eax > > + jnz L(ret_vec_x0) > > + > > + /* Align rcx (pointer to string). */ > > + decq %rcx > > + andq $-VEC_SIZE, %rcx > > + > > + movq %rcx, %rdx > > + /* NB: We could consistenyl save 1-byte in this pattern with `movaps > > + %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is > > + it adds more frontend uops (even if the moves can be eliminated) and > > + some percentage of the time actual backend uops. */ > > + movaps -(VEC_SIZE)(%rcx), %xmm1 > > + pcmpeqb %xmm0, %xmm1 > > + subq %rdi, %rdx > > + pmovmskb %xmm1, %eax > > + > > + cmpq $(VEC_SIZE * 2), %rdx > > + ja L(more_2x_vec) > > +L(last_2x_vec): > > + subl $VEC_SIZE, %edx > > + jbe L(ret_vec_x0_test) > > + > > + testl %eax, %eax > > + jnz L(ret_vec_x0) > > + > > + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 > > + pcmpeqb %xmm0, %xmm1 > > + pmovmskb %xmm1, %eax > > + > > + subl $VEC_SIZE, %edx > > + bsrl %eax, %eax > > + jz L(ret_1) > > + addl %edx, %eax > > + jl L(zero_0) > > + addq %rdi, %rax > > +L(ret_1): > > ret > > > > - .p2align 4 > > -L(matches48): > > - bsr %eax, %eax > > - lea 48(%rax, %rdi), %rax > > + /* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross) > > + causes the hot pause (length <= VEC_SIZE) to span multiple cache > > + lines. Naturally aligned % 16 to 8-bytes. */ > > +L(page_cross): > > + /* Zero length check. */ > > + testq %rdx, %rdx > > + jz L(zero_0) > > + > > + leaq -1(%rcx), %r8 > > + andq $-(VEC_SIZE), %r8 > > + > > + movaps (%r8), %xmm1 > > + pcmpeqb %xmm0, %xmm1 > > + pmovmskb %xmm1, %esi > > + /* Shift out negative alignment (because we are starting from endptr and > > + working backwards). */ > > + negl %ecx > > + /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count > > + explicitly. */ > > + andl $(VEC_SIZE - 1), %ecx > > + shl %cl, %esi > > + movzwl %si, %eax > > + leaq (%rdi, %rdx), %rcx > > + cmpq %rdi, %r8 > > + ja L(more_1x_vec) > > + subl $VEC_SIZE, %edx > > + bsrl %eax, %eax > > + jz L(ret_2) > > + addl %edx, %eax > > + jl L(zero_1) > > + addq %rdi, %rax > > +L(ret_2): > > ret > > > > - .p2align 4 > > -L(matches0_1): > > - bsr %eax, %eax > > - sub $64, %rdx > > - add %rax, %rdx > > - jl L(return_null) > > - add %rdi, %rax > > + /* Fits in aliging bytes. */ > > +L(zero_1): > > + xorl %eax, %eax > > ret > > > > - .p2align 4 > > -L(matches16_1): > > - bsr %eax, %eax > > - sub $48, %rdx > > - add %rax, %rdx > > - jl L(return_null) > > - lea 16(%rdi, %rax), %rax > > + .p2align 4,, 5 > > +L(ret_vec_x1): > > + bsrl %eax, %eax > > + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax > > ret > > > > - .p2align 4 > > -L(matches32_1): > > - bsr %eax, %eax > > - sub $32, %rdx > > - add %rax, %rdx > > - jl L(return_null) > > - lea 32(%rdi, %rax), %rax > > - ret > > + .p2align 4,, 8 > > +L(more_2x_vec): > > + testl %eax, %eax > > + jnz L(ret_vec_x0) > > > > - .p2align 4 > > -L(matches48_1): > > - bsr %eax, %eax > > - sub $16, %rdx > > - add %rax, %rdx > > - jl L(return_null) > > - lea 48(%rdi, %rax), %rax > > - ret > > + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 > > + pcmpeqb %xmm0, %xmm1 > > + pmovmskb %xmm1, %eax > > + testl %eax, %eax > > + jnz L(ret_vec_x1) > > > > - .p2align 4 > > -L(return_null): > > - xor %eax, %eax > > - ret > > > > - .p2align 4 > > -L(length_less16_offset0): > > - test %edx, %edx > > - jz L(return_null) > > + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 > > + pcmpeqb %xmm0, %xmm1 > > + pmovmskb %xmm1, %eax > > > > - mov %dl, %cl > > - pcmpeqb (%rdi), %xmm1 > > + subq $(VEC_SIZE * 4), %rdx > > + ja L(more_4x_vec) > > > > - mov $1, %edx > > - sal %cl, %edx > > - sub $1, %edx > > + addl $(VEC_SIZE), %edx > > + jle L(ret_vec_x2_test) > > > > - pmovmskb %xmm1, %eax > > +L(last_vec): > > + testl %eax, %eax > > + jnz L(ret_vec_x2) > > > > - and %edx, %eax > > - test %eax, %eax > > - jz L(return_null) > > + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 > > + pcmpeqb %xmm0, %xmm1 > > + pmovmskb %xmm1, %eax > > > > - bsr %eax, %eax > > - add %rdi, %rax > > + subl $(VEC_SIZE), %edx > > + bsrl %eax, %eax > > + jz L(ret_3) > > + addl %edx, %eax > > + jl L(zero_2) > > + addq %rdi, %rax > > +L(ret_3): > > ret > > > > - .p2align 4 > > -L(length_less16): > > - punpcklbw %xmm1, %xmm1 > > - punpcklbw %xmm1, %xmm1 > > - > > - add $16, %edx > > - > > - pshufd $0, %xmm1, %xmm1 > > - > > - mov %edi, %ecx > > - and $15, %ecx > > - jz L(length_less16_offset0) > > - > > - mov %cl, %dh > > - mov %ecx, %esi > > - add %dl, %dh > > - and $-16, %rdi > > - > > - sub $16, %dh > > - ja L(length_less16_part2) > > - > > - pcmpeqb (%rdi), %xmm1 > > - pmovmskb %xmm1, %eax > > - > > - sar %cl, %eax > > - mov %dl, %cl > > - > > - mov $1, %edx > > - sal %cl, %edx > > - sub $1, %edx > > - > > - and %edx, %eax > > - test %eax, %eax > > - jz L(return_null) > > - > > - bsr %eax, %eax > > - add %rdi, %rax > > - add %rsi, %rax > > + .p2align 4,, 6 > > +L(ret_vec_x2_test): > > + bsrl %eax, %eax > > + jz L(zero_2) > > + addl %edx, %eax > > + jl L(zero_2) > > + addq %rdi, %rax > > ret > > > > - .p2align 4 > > -L(length_less16_part2): > > - movdqa 16(%rdi), %xmm2 > > - pcmpeqb %xmm1, %xmm2 > > - pmovmskb %xmm2, %eax > > - > > - mov %dh, %cl > > - mov $1, %edx > > - sal %cl, %edx > > - sub $1, %edx > > - > > - and %edx, %eax > > +L(zero_2): > > + xorl %eax, %eax > > + ret > > > > - test %eax, %eax > > - jnz L(length_less16_part2_return) > > > > - pcmpeqb (%rdi), %xmm1 > > - pmovmskb %xmm1, %eax > > + .p2align 4,, 5 > > +L(ret_vec_x2): > > + bsrl %eax, %eax > > + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax > > + ret > > > > - mov %esi, %ecx > > - sar %cl, %eax > > - test %eax, %eax > > - jz L(return_null) > > + .p2align 4,, 5 > > +L(ret_vec_x3): > > + bsrl %eax, %eax > > + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax > > + ret > > > > - bsr %eax, %eax > > - add %rdi, %rax > > - add %rsi, %rax > > + .p2align 4,, 8 > > +L(more_4x_vec): > > + testl %eax, %eax > > + jnz L(ret_vec_x2) > > + > > + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 > > + pcmpeqb %xmm0, %xmm1 > > + pmovmskb %xmm1, %eax > > + > > + testl %eax, %eax > > + jnz L(ret_vec_x3) > > + > > + addq $-(VEC_SIZE * 4), %rcx > > + cmpq $(VEC_SIZE * 4), %rdx > > + jbe L(last_4x_vec) > > + > > + /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end > > + keeping the code from spilling to the next cache line. */ > > + addq $(VEC_SIZE * 4 - 1), %rcx > > + andq $-(VEC_SIZE * 4), %rcx > > + leaq (VEC_SIZE * 4)(%rdi), %rdx > > + andq $-(VEC_SIZE * 4), %rdx > > + > > + .p2align 4,, 11 > > +L(loop_4x_vec): > > + movaps (VEC_SIZE * -1)(%rcx), %xmm1 > > + movaps (VEC_SIZE * -2)(%rcx), %xmm2 > > + movaps (VEC_SIZE * -3)(%rcx), %xmm3 > > + movaps (VEC_SIZE * -4)(%rcx), %xmm4 > > + pcmpeqb %xmm0, %xmm1 > > + pcmpeqb %xmm0, %xmm2 > > + pcmpeqb %xmm0, %xmm3 > > + pcmpeqb %xmm0, %xmm4 > > + > > + por %xmm1, %xmm2 > > + por %xmm3, %xmm4 > > + por %xmm2, %xmm4 > > + > > + pmovmskb %xmm4, %esi > > + testl %esi, %esi > > + jnz L(loop_end) > > + > > + addq $-(VEC_SIZE * 4), %rcx > > + cmpq %rdx, %rcx > > + jne L(loop_4x_vec) > > + > > + subl %edi, %edx > > + > > + /* Ends up being 1-byte nop. */ > > + .p2align 4,, 2 > > +L(last_4x_vec): > > + movaps -(VEC_SIZE)(%rcx), %xmm1 > > + pcmpeqb %xmm0, %xmm1 > > + pmovmskb %xmm1, %eax > > + > > + cmpl $(VEC_SIZE * 2), %edx > > + jbe L(last_2x_vec) > > + > > + testl %eax, %eax > > + jnz L(ret_vec_x0) > > + > > + > > + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 > > + pcmpeqb %xmm0, %xmm1 > > + pmovmskb %xmm1, %eax > > + > > + testl %eax, %eax > > + jnz L(ret_vec_end) > > + > > + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 > > + pcmpeqb %xmm0, %xmm1 > > + pmovmskb %xmm1, %eax > > + > > + subl $(VEC_SIZE * 3), %edx > > + ja L(last_vec) > > + bsrl %eax, %eax > > + jz L(ret_4) > > + addl %edx, %eax > > + jl L(zero_3) > > + addq %rdi, %rax > > +L(ret_4): > > ret > > > > - .p2align 4 > > -L(length_less16_part2_return): > > - bsr %eax, %eax > > - lea 16(%rax, %rdi), %rax > > + /* Ends up being 1-byte nop. */ > > + .p2align 4,, 3 > > +L(loop_end): > > + pmovmskb %xmm1, %eax > > + sall $16, %eax > > + jnz L(ret_vec_end) > > + > > + pmovmskb %xmm2, %eax > > + testl %eax, %eax > > + jnz L(ret_vec_end) > > + > > + pmovmskb %xmm3, %eax > > + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) > > + then it won't affect the result in esi (VEC4). If ecx is non-zero > > + then CHAR in VEC3 and bsrq will use that position. */ > > + sall $16, %eax > > + orl %esi, %eax > > + bsrl %eax, %eax > > + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax > > ret > > > > -END (__memrchr) > > +L(ret_vec_end): > > + bsrl %eax, %eax > > + leaq (VEC_SIZE * -2)(%rax, %rcx), %rax > > + ret > > + /* Use in L(last_4x_vec). In the same cache line. This is just a spare > > + aligning bytes. */ > > +L(zero_3): > > + xorl %eax, %eax > > + ret > > + /* 2-bytes from next cache line. */ > > +END(__memrchr) > > weak_alias (__memrchr, memrchr) > > -- > > 2.34.1 > > > > LGTM. > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com> > > Thanks. > > -- > H.J. I would like to backport this patch to release branches. Any comments or objections? --Sunil
diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S index d1a9f47911..b0dffd2ae2 100644 --- a/sysdeps/x86_64/memrchr.S +++ b/sysdeps/x86_64/memrchr.S @@ -18,362 +18,333 @@ <https://www.gnu.org/licenses/>. */ #include <sysdep.h> +#define VEC_SIZE 16 +#define PAGE_SIZE 4096 .text -ENTRY (__memrchr) - movd %esi, %xmm1 - - sub $16, %RDX_LP - jbe L(length_less16) - - punpcklbw %xmm1, %xmm1 - punpcklbw %xmm1, %xmm1 - - add %RDX_LP, %RDI_LP - pshufd $0, %xmm1, %xmm1 - - movdqu (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - -/* Check if there is a match. */ - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches0) - - sub $64, %rdi - mov %edi, %ecx - and $15, %ecx - jz L(loop_prolog) - - add $16, %rdi - add $16, %rdx - and $-16, %rdi - sub %rcx, %rdx - - .p2align 4 -L(loop_prolog): - sub $64, %rdx - jbe L(exit_loop) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16) - - movdqa (%rdi), %xmm4 - pcmpeqb %xmm1, %xmm4 - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches0) - - sub $64, %rdi - sub $64, %rdx - jbe L(exit_loop) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16) - - movdqa (%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches0) - - mov %edi, %ecx - and $63, %ecx - jz L(align64_loop) - - add $64, %rdi - add $64, %rdx - and $-64, %rdi - sub %rcx, %rdx - - .p2align 4 -L(align64_loop): - sub $64, %rdi - sub $64, %rdx - jbe L(exit_loop) - - movdqa (%rdi), %xmm0 - movdqa 16(%rdi), %xmm2 - movdqa 32(%rdi), %xmm3 - movdqa 48(%rdi), %xmm4 - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm1, %xmm2 - pcmpeqb %xmm1, %xmm3 - pcmpeqb %xmm1, %xmm4 - - pmaxub %xmm3, %xmm0 - pmaxub %xmm4, %xmm2 - pmaxub %xmm0, %xmm2 - pmovmskb %xmm2, %eax - - test %eax, %eax - jz L(align64_loop) - - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches48) - - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm2 - - pcmpeqb %xmm1, %xmm2 - pcmpeqb (%rdi), %xmm1 - - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - pmovmskb %xmm1, %eax - bsr %eax, %eax - - add %rdi, %rax +ENTRY_P2ALIGN(__memrchr, 6) +#ifdef __ILP32__ + /* Clear upper bits. */ + mov %RDX_LP, %RDX_LP +#endif + movd %esi, %xmm0 + + /* Get end pointer. */ + leaq (%rdx, %rdi), %rcx + + punpcklbw %xmm0, %xmm0 + punpcklwd %xmm0, %xmm0 + pshufd $0, %xmm0, %xmm0 + + /* Check if we can load 1x VEC without cross a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %ecx + jz L(page_cross) + + /* NB: This load happens regardless of whether rdx (len) is zero. Since + it doesn't cross a page and the standard gurantees any pointer have + at least one-valid byte this load must be safe. For the entire + history of the x86 memrchr implementation this has been possible so + no code "should" be relying on a zero-length check before this load. + The zero-length check is moved to the page cross case because it is + 1) pretty cold and including it pushes the hot case len <= VEC_SIZE + into 2-cache lines. */ + movups -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subq $VEC_SIZE, %rdx + ja L(more_1x_vec) +L(ret_vec_x0_test): + /* Zero-flag set if eax (src) is zero. Destination unchanged if src is + zero. */ + bsrl %eax, %eax + jz L(ret_0) + /* Check if the CHAR match is in bounds. Need to truly zero `eax` here + if out of bounds. */ + addl %edx, %eax + jl L(zero_0) + /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base + ptr. */ + addq %rdi, %rax +L(ret_0): ret - .p2align 4 -L(exit_loop): - add $64, %edx - cmp $32, %edx - jbe L(exit_loop_32) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16_1) - cmp $48, %edx - jbe L(return_null) - - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches0_1) - xor %eax, %eax + .p2align 4,, 5 +L(ret_vec_x0): + bsrl %eax, %eax + leaq -(VEC_SIZE)(%rcx, %rax), %rax ret - .p2align 4 -L(exit_loop_32): - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48_1) - cmp $16, %edx - jbe L(return_null) - - pcmpeqb 32(%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches32_1) - xor %eax, %eax + .p2align 4,, 2 +L(zero_0): + xorl %eax, %eax ret - .p2align 4 -L(matches0): - bsr %eax, %eax - add %rdi, %rax - ret - - .p2align 4 -L(matches16): - bsr %eax, %eax - lea 16(%rax, %rdi), %rax - ret - .p2align 4 -L(matches32): - bsr %eax, %eax - lea 32(%rax, %rdi), %rax + .p2align 4,, 8 +L(more_1x_vec): + testl %eax, %eax + jnz L(ret_vec_x0) + + /* Align rcx (pointer to string). */ + decq %rcx + andq $-VEC_SIZE, %rcx + + movq %rcx, %rdx + /* NB: We could consistenyl save 1-byte in this pattern with `movaps + %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is + it adds more frontend uops (even if the moves can be eliminated) and + some percentage of the time actual backend uops. */ + movaps -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + subq %rdi, %rdx + pmovmskb %xmm1, %eax + + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) +L(last_2x_vec): + subl $VEC_SIZE, %edx + jbe L(ret_vec_x0_test) + + testl %eax, %eax + jnz L(ret_vec_x0) + + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subl $VEC_SIZE, %edx + bsrl %eax, %eax + jz L(ret_1) + addl %edx, %eax + jl L(zero_0) + addq %rdi, %rax +L(ret_1): ret - .p2align 4 -L(matches48): - bsr %eax, %eax - lea 48(%rax, %rdi), %rax + /* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross) + causes the hot pause (length <= VEC_SIZE) to span multiple cache + lines. Naturally aligned % 16 to 8-bytes. */ +L(page_cross): + /* Zero length check. */ + testq %rdx, %rdx + jz L(zero_0) + + leaq -1(%rcx), %r8 + andq $-(VEC_SIZE), %r8 + + movaps (%r8), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + /* Shift out negative alignment (because we are starting from endptr and + working backwards). */ + negl %ecx + /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count + explicitly. */ + andl $(VEC_SIZE - 1), %ecx + shl %cl, %esi + movzwl %si, %eax + leaq (%rdi, %rdx), %rcx + cmpq %rdi, %r8 + ja L(more_1x_vec) + subl $VEC_SIZE, %edx + bsrl %eax, %eax + jz L(ret_2) + addl %edx, %eax + jl L(zero_1) + addq %rdi, %rax +L(ret_2): ret - .p2align 4 -L(matches0_1): - bsr %eax, %eax - sub $64, %rdx - add %rax, %rdx - jl L(return_null) - add %rdi, %rax + /* Fits in aliging bytes. */ +L(zero_1): + xorl %eax, %eax ret - .p2align 4 -L(matches16_1): - bsr %eax, %eax - sub $48, %rdx - add %rax, %rdx - jl L(return_null) - lea 16(%rdi, %rax), %rax + .p2align 4,, 5 +L(ret_vec_x1): + bsrl %eax, %eax + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax ret - .p2align 4 -L(matches32_1): - bsr %eax, %eax - sub $32, %rdx - add %rax, %rdx - jl L(return_null) - lea 32(%rdi, %rax), %rax - ret + .p2align 4,, 8 +L(more_2x_vec): + testl %eax, %eax + jnz L(ret_vec_x0) - .p2align 4 -L(matches48_1): - bsr %eax, %eax - sub $16, %rdx - add %rax, %rdx - jl L(return_null) - lea 48(%rdi, %rax), %rax - ret + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + testl %eax, %eax + jnz L(ret_vec_x1) - .p2align 4 -L(return_null): - xor %eax, %eax - ret - .p2align 4 -L(length_less16_offset0): - test %edx, %edx - jz L(return_null) + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax - mov %dl, %cl - pcmpeqb (%rdi), %xmm1 + subq $(VEC_SIZE * 4), %rdx + ja L(more_4x_vec) - mov $1, %edx - sal %cl, %edx - sub $1, %edx + addl $(VEC_SIZE), %edx + jle L(ret_vec_x2_test) - pmovmskb %xmm1, %eax +L(last_vec): + testl %eax, %eax + jnz L(ret_vec_x2) - and %edx, %eax - test %eax, %eax - jz L(return_null) + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax - bsr %eax, %eax - add %rdi, %rax + subl $(VEC_SIZE), %edx + bsrl %eax, %eax + jz L(ret_3) + addl %edx, %eax + jl L(zero_2) + addq %rdi, %rax +L(ret_3): ret - .p2align 4 -L(length_less16): - punpcklbw %xmm1, %xmm1 - punpcklbw %xmm1, %xmm1 - - add $16, %edx - - pshufd $0, %xmm1, %xmm1 - - mov %edi, %ecx - and $15, %ecx - jz L(length_less16_offset0) - - mov %cl, %dh - mov %ecx, %esi - add %dl, %dh - and $-16, %rdi - - sub $16, %dh - ja L(length_less16_part2) - - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax - - sar %cl, %eax - mov %dl, %cl - - mov $1, %edx - sal %cl, %edx - sub $1, %edx - - and %edx, %eax - test %eax, %eax - jz L(return_null) - - bsr %eax, %eax - add %rdi, %rax - add %rsi, %rax + .p2align 4,, 6 +L(ret_vec_x2_test): + bsrl %eax, %eax + jz L(zero_2) + addl %edx, %eax + jl L(zero_2) + addq %rdi, %rax ret - .p2align 4 -L(length_less16_part2): - movdqa 16(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - - mov %dh, %cl - mov $1, %edx - sal %cl, %edx - sub $1, %edx - - and %edx, %eax +L(zero_2): + xorl %eax, %eax + ret - test %eax, %eax - jnz L(length_less16_part2_return) - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax + .p2align 4,, 5 +L(ret_vec_x2): + bsrl %eax, %eax + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax + ret - mov %esi, %ecx - sar %cl, %eax - test %eax, %eax - jz L(return_null) + .p2align 4,, 5 +L(ret_vec_x3): + bsrl %eax, %eax + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax + ret - bsr %eax, %eax - add %rdi, %rax - add %rsi, %rax + .p2align 4,, 8 +L(more_4x_vec): + testl %eax, %eax + jnz L(ret_vec_x2) + + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + testl %eax, %eax + jnz L(ret_vec_x3) + + addq $-(VEC_SIZE * 4), %rcx + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) + + /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end + keeping the code from spilling to the next cache line. */ + addq $(VEC_SIZE * 4 - 1), %rcx + andq $-(VEC_SIZE * 4), %rcx + leaq (VEC_SIZE * 4)(%rdi), %rdx + andq $-(VEC_SIZE * 4), %rdx + + .p2align 4,, 11 +L(loop_4x_vec): + movaps (VEC_SIZE * -1)(%rcx), %xmm1 + movaps (VEC_SIZE * -2)(%rcx), %xmm2 + movaps (VEC_SIZE * -3)(%rcx), %xmm3 + movaps (VEC_SIZE * -4)(%rcx), %xmm4 + pcmpeqb %xmm0, %xmm1 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm0, %xmm3 + pcmpeqb %xmm0, %xmm4 + + por %xmm1, %xmm2 + por %xmm3, %xmm4 + por %xmm2, %xmm4 + + pmovmskb %xmm4, %esi + testl %esi, %esi + jnz L(loop_end) + + addq $-(VEC_SIZE * 4), %rcx + cmpq %rdx, %rcx + jne L(loop_4x_vec) + + subl %edi, %edx + + /* Ends up being 1-byte nop. */ + .p2align 4,, 2 +L(last_4x_vec): + movaps -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) + + testl %eax, %eax + jnz L(ret_vec_x0) + + + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + testl %eax, %eax + jnz L(ret_vec_end) + + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subl $(VEC_SIZE * 3), %edx + ja L(last_vec) + bsrl %eax, %eax + jz L(ret_4) + addl %edx, %eax + jl L(zero_3) + addq %rdi, %rax +L(ret_4): ret - .p2align 4 -L(length_less16_part2_return): - bsr %eax, %eax - lea 16(%rax, %rdi), %rax + /* Ends up being 1-byte nop. */ + .p2align 4,, 3 +L(loop_end): + pmovmskb %xmm1, %eax + sall $16, %eax + jnz L(ret_vec_end) + + pmovmskb %xmm2, %eax + testl %eax, %eax + jnz L(ret_vec_end) + + pmovmskb %xmm3, %eax + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) + then it won't affect the result in esi (VEC4). If ecx is non-zero + then CHAR in VEC3 and bsrq will use that position. */ + sall $16, %eax + orl %esi, %eax + bsrl %eax, %eax + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax ret -END (__memrchr) +L(ret_vec_end): + bsrl %eax, %eax + leaq (VEC_SIZE * -2)(%rax, %rcx), %rax + ret + /* Use in L(last_4x_vec). In the same cache line. This is just a spare + aligning bytes. */ +L(zero_3): + xorl %eax, %eax + ret + /* 2-bytes from next cache line. */ +END(__memrchr) weak_alias (__memrchr, memrchr)