Message ID | 20170601154519.GB14526@lucon.org |
---|---|
State | New |
Headers | show |
On 06/01/2017 05:45 PM, H.J. Lu wrote: > +L(between_4_7): > + vmovd (%rdi), %xmm1 > + vmovd (%rsi), %xmm2 > + VPCMPEQ %xmm1, %xmm2, %xmm2 > + vpmovmskb %xmm2, %eax > + subl $0xffff, %eax > + jnz L(first_vec) Is this really faster than two 32-bit bswaps followed by a sub? > + leaq -4(%rdi, %rdx), %rdi > + leaq -4(%rsi, %rdx), %rsi > + vmovd (%rdi), %xmm1 > + vmovd (%rsi), %xmm2 > + VPCMPEQ %xmm1, %xmm2, %xmm2 > + vpmovmskb %xmm2, %eax > + subl $0xffff, %eax > + jnz L(first_vec) > + ret What is ensuring alignment, so that the vmovd instructions cannot fault? > + .p2align 4 > +L(between_2_3): > + /* Load 2 bytes into registers. */ > + movzwl (%rdi), %eax > + movzwl (%rsi), %ecx > + /* Compare the lowest byte. */ > + cmpb %cl, %al > + jne L(1byte_reg) > + /* Load the difference of 2 bytes into EAX. */ > + subl %ecx, %eax > + /* Return if 2 bytes differ. */ > + jnz L(exit) > + cmpb $2, %dl > + /* Return if these are the last 2 bytes. */ > + je L(exit) > + movzbl 2(%rdi), %eax > + movzbl 2(%rsi), %ecx > + subl %ecx, %eax > + ret Again, bswap should be faster, and if we assume that the ordering of the inputs is more difficult to predict than the length, it would be better to construct the full 24-bit value before comparing it. Thanks, Florian
On Thu, Jun 1, 2017 at 9:41 AM, Florian Weimer <fweimer@redhat.com> wrote: > On 06/01/2017 05:45 PM, H.J. Lu wrote: >> +L(between_4_7): >> + vmovd (%rdi), %xmm1 >> + vmovd (%rsi), %xmm2 >> + VPCMPEQ %xmm1, %xmm2, %xmm2 >> + vpmovmskb %xmm2, %eax >> + subl $0xffff, %eax >> + jnz L(first_vec) > > Is this really faster than two 32-bit bswaps followed by a sub? Can you elaborate how to use bswap here? >> + leaq -4(%rdi, %rdx), %rdi >> + leaq -4(%rsi, %rdx), %rsi >> + vmovd (%rdi), %xmm1 >> + vmovd (%rsi), %xmm2 >> + VPCMPEQ %xmm1, %xmm2, %xmm2 >> + vpmovmskb %xmm2, %eax >> + subl $0xffff, %eax >> + jnz L(first_vec) >> + ret > > What is ensuring alignment, so that the vmovd instructions cannot fault? What do you mean? This sequence compares the last 4 bytes with vmovd, which loads 4 bytes and zeroes out the high 12 bytes, and VPCMPEQ. If they aren't the same, go to L(first_vec). >> + .p2align 4 >> +L(between_2_3): >> + /* Load 2 bytes into registers. */ >> + movzwl (%rdi), %eax >> + movzwl (%rsi), %ecx >> + /* Compare the lowest byte. */ >> + cmpb %cl, %al >> + jne L(1byte_reg) >> + /* Load the difference of 2 bytes into EAX. */ >> + subl %ecx, %eax >> + /* Return if 2 bytes differ. */ >> + jnz L(exit) >> + cmpb $2, %dl >> + /* Return if these are the last 2 bytes. */ >> + je L(exit) >> + movzbl 2(%rdi), %eax >> + movzbl 2(%rsi), %ecx >> + subl %ecx, %eax >> + ret > > Again, bswap should be faster, and if we assume that the ordering of the > inputs is more difficult to predict than the length, it would be better > to construct the full 24-bit value before comparing it. > Can you elaborate it here? Thanks.
On 06/01/2017 07:19 PM, H.J. Lu wrote: > On Thu, Jun 1, 2017 at 9:41 AM, Florian Weimer <fweimer@redhat.com> wrote: >> On 06/01/2017 05:45 PM, H.J. Lu wrote: >>> +L(between_4_7): >>> + vmovd (%rdi), %xmm1 >>> + vmovd (%rsi), %xmm2 >>> + VPCMPEQ %xmm1, %xmm2, %xmm2 >>> + vpmovmskb %xmm2, %eax >>> + subl $0xffff, %eax >>> + jnz L(first_vec) >> >> Is this really faster than two 32-bit bswaps followed by a sub? > > Can you elaborate how to use bswap here? Something like this: /* Load 4 to 7 bytes into an 8-byte word. ABCDEFG turns into GFEDDCBA. ABCDEF turns into FEDCDCBA. ABCDE turns into EDCBDCBA. ABCD turns into DCBADCBA. bswapq below reverses the order of bytes. The duplicated bytes do not affect the comparison result. */ movl -4(%rdi, %rdx), R1 shrq $32, R1 movl -4(%rsi, %rdx), R2 shrq $32, R2 movl ($rdi), R3 orq R3, R1 /* Variant below starts after this point. */ cmpq R1, R2 jne L(diffin8bytes) xor %eax, %eax ret L(diffin8bytes): bswapq R1 bswapq R2 cmpq R1, R2 sbbl %eax, %eax /* Set to -1 if R1 < R2, otherwise 0. */ orl $1, %eax /* Turn 0 into 1, but preserve -1. */ ret (Not sure about the right ordering for R1 and R2 here.) There's a way to avoid the conditional jump completely, but whether that's worthwhile depends on the cost of the bswapq and the cmove: bswapq R1 bswapq R2 xorl R3, R3 cmpq R1, R2 sbbl %eax, %eax orl $1, %eax cmpq R1, R2 cmove R3, %eax ret See this patch and the related discussion: <https://sourceware.org/ml/libc-alpha/2014-02/msg00139.html> >> What is ensuring alignment, so that the vmovd instructions cannot fault? > > What do you mean? This sequence compares the last 4 bytes with > vmovd, which loads 4 bytes and zeroes out the high 12 bytes, and > VPCMPEQ. If they aren't the same, go to L(first_vec). Ah, I see now. The loads overlap. Maybe add a comment to that effect? Thanks, Florian
On Thu, Jun 1, 2017 at 11:39 AM, Florian Weimer <fweimer@redhat.com> wrote: > On 06/01/2017 07:19 PM, H.J. Lu wrote: >> On Thu, Jun 1, 2017 at 9:41 AM, Florian Weimer <fweimer@redhat.com> wrote: >>> On 06/01/2017 05:45 PM, H.J. Lu wrote: >>>> +L(between_4_7): >>>> + vmovd (%rdi), %xmm1 >>>> + vmovd (%rsi), %xmm2 >>>> + VPCMPEQ %xmm1, %xmm2, %xmm2 >>>> + vpmovmskb %xmm2, %eax >>>> + subl $0xffff, %eax >>>> + jnz L(first_vec) >>> >>> Is this really faster than two 32-bit bswaps followed by a sub? >> >> Can you elaborate how to use bswap here? > > Something like this: > > /* Load 4 to 7 bytes into an 8-byte word. > ABCDEFG turns into GFEDDCBA. > ABCDEF turns into FEDCDCBA. > ABCDE turns into EDCBDCBA. > ABCD turns into DCBADCBA. > bswapq below reverses the order of bytes. > The duplicated bytes do not affect the comparison result. */ > movl -4(%rdi, %rdx), R1 > shrq $32, R1 > movl -4(%rsi, %rdx), R2 > shrq $32, R2 > movl ($rdi), R3 > orq R3, R1 > /* Variant below starts after this point. */ > cmpq R1, R2 > jne L(diffin8bytes) > xor %eax, %eax > ret > > L(diffin8bytes): > bswapq R1 > bswapq R2 > cmpq R1, R2 > sbbl %eax, %eax /* Set to -1 if R1 < R2, otherwise 0. */ > orl $1, %eax /* Turn 0 into 1, but preserve -1. */ > ret I don't think it works with memcmp since return value depends on the first bytes which differs. Say ABCDE turns into EDCBDCBA If all bytes differs, we should only compare A, not EDCBDCBA. > (Not sure about the right ordering for R1 and R2 here.) > > There's a way to avoid the conditional jump completely, but whether > that's worthwhile depends on the cost of the bswapq and the cmove: > > bswapq R1 > bswapq R2 > xorl R3, R3 > cmpq R1, R2 > sbbl %eax, %eax > orl $1, %eax > cmpq R1, R2 > cmove R3, %eax > ret > > See this patch and the related discussion: > > <https://sourceware.org/ml/libc-alpha/2014-02/msg00139.html> > >>> What is ensuring alignment, so that the vmovd instructions cannot fault? >> >> What do you mean? 
This sequence compares the last 4 bytes with >> vmovd, which loads 4 bytes and zeroes out the high 12 bytes, and >> VPCMPEQ. If they aren't the same, go to L(first_vec). > > Ah, I see now. The loads overlap. Maybe add a comment to that effect? I will add /* Use overlapping loads to avoid branches. */
On 06/01/2017 10:57 PM, H.J. Lu wrote: > I don't think it works with memcmp since return value depends on > the first bytes which differs. Say > > ABCDE turns into EDCBDCBA > > If all bytes differs, we should only compare A, not EDCBDCBA. That's what the bswapq is for, it reverses the order of bytes. Florian
On Thu, Jun 1, 2017 at 2:00 PM, Florian Weimer <fweimer@redhat.com> wrote: > On 06/01/2017 10:57 PM, H.J. Lu wrote: >> I don't think it works with memcmp since return value depends on >> the first bytes which differs. Say >> >> ABCDE turns into EDCBDCBA >> >> If all bytes differs, we should only compare A, not EDCBDCBA. > > That's what the bswapq is for, it reverses the order of bytes. > bswapq doesn't help since cmpq compares 8 bytes but only the last byte matters. Comparing the highest byte gives you the wrong result, like 0x36775382d1367753 0x7b8d14025b7b8d14
On 06/01/2017 11:17 PM, H.J. Lu wrote: > On Thu, Jun 1, 2017 at 2:00 PM, Florian Weimer <fweimer@redhat.com> wrote: >> On 06/01/2017 10:57 PM, H.J. Lu wrote: >>> I don't think it works with memcmp since return value depends on >>> the first bytes which differs. Say >>> >>> ABCDE turns into EDCBDCBA >>> >>> If all bytes differs, we should only compare A, not EDCBDCBA. >> >> That's what the bswapq is for, it reverses the order of bytes. >> > > bswapq doesn't help since cmpq compares 8 bytes but only > the last byte matters. Comparing the highest byte give you the > wrong result, like > > 0x36775382d1367753 > 0x7b8d14025b7b8d14 I don't understand. On big-endian, to compare two 8-byte arrays as if by memcmp, you can certainly do a uint64_t load, compute the difference as a 65-bit value, and return the integer sign of that. The code I posted does that (modulo bugs, but you can get a working patch from the old message I referenced). bswapq is needed to get an equivalent to that big-endian load. Thanks, Florian
On Thu, Jun 1, 2017 at 2:20 PM, Florian Weimer <fweimer@redhat.com> wrote: > On 06/01/2017 11:17 PM, H.J. Lu wrote: >> On Thu, Jun 1, 2017 at 2:00 PM, Florian Weimer <fweimer@redhat.com> wrote: >>> On 06/01/2017 10:57 PM, H.J. Lu wrote: >>>> I don't think it works with memcmp since return value depends on >>>> the first bytes which differs. Say >>>> >>>> ABCDE turns into EDCBDCBA >>>> >>>> If all bytes differs, we should only compare A, not EDCBDCBA. >>> >>> That's what the bswapq is for, it reverses the order of bytes. >>> >> >> bswapq doesn't help since cmpq compares 8 bytes but only >> the last byte matters. Comparing the highest byte give you the >> wrong result, like >> >> 0x36775382d1367753 >> 0x7b8d14025b7b8d14 > > I don't understand. On big-endian, to compare two 8-byte arrays as if > by memcmp, you can certainly do a uint64_t load, compute the difference > as a 65-bit value, and return the integer sign of that. > > The code I posted does that (modulo bugs, but you can get a working > patch from the old message I referenced). bswapq is needed to get an > equivalent to that big-endian load. > I put memcmp-avx2.S on hjl/avx2/master branch and changed it to L(between_4_7): movl (%rdi), %r8d movl (%rsi), %ecx shlq $32, %r8 shlq $32, %rcx movl -4(%rdi, %rdx), %edi movl -4(%rsi, %rdx), %esi orq %rdi, %r8 orq %rsi, %rcx bswap %r8 bswap %rcx cmpq %rcx, %r8 je L(zero) sbbl %eax, %eax orl $1, %eax ret and got Iteration 70485 - wrong result in function __memcmp_avx2 (18, 26, 5, 0) -1 != 1, p1 0x7ffff7ff0e00 p2 0x7ffff7fece00 Where did I do wrong?
On 06/01/2017 11:29 PM, H.J. Lu wrote: > L(between_4_7): > movl (%rdi), %r8d > movl (%rsi), %ecx > shlq $32, %r8 > shlq $32, %rcx > movl -4(%rdi, %rdx), %edi > movl -4(%rsi, %rdx), %esi > orq %rdi, %r8 > orq %rsi, %rcx > bswap %r8 > bswap %rcx > cmpq %rcx, %r8 > je L(zero) > sbbl %eax, %eax > orl $1, %eax > ret > > and got > > Iteration 70485 - wrong result in function __memcmp_avx2 (18, 26, 5, > 0) -1 != 1, p1 0x7ffff7ff0e00 p2 0x7ffff7fece00 > > Where did I do wrong? I think you created some PDP-endian thing there. The 4 bytes at (%rdi) need to remain in the lower part of %r8, up until the bswap. In other words, you need to shift the 4 bytes loaded from -4(%rdi, %rdx). Thanks, Florian
On 06/01/2017 02:20 PM, Florian Weimer wrote: > bswapq is needed to get an > equivalent to that big-endian load. Don't forget about movbe, which will also be available on these avx2 machines. r~
On Thu, Jun 01, 2017 at 08:45:19AM -0700, H.J. Lu wrote: > Optimize x86-64 memcmp/wmemcmp with AVX2. It uses vector compare as > much as possible. It is as fast as SSE4 memcmp for size <= 16 bytes > and up to 2X faster for size > 16 bytes on Haswell and Skylake. Select > AVX2 memcmp/wmemcmp on AVX2 machines where vzeroupper is preferred and > AVX unaligned load is fast. > > Key features: > > 1. Use overlapping compare to avoid branch. > 2. Use vector compare when size >= 4 bytes for memcmp or size >= 8 > bytes for wmemcmp. > 3. If size is 8 * VEC_SIZE or less, unroll the loop. > 4. Compare 4 * VEC_SIZE at a time with the aligned first memory area. > 5. Use 2 vector compares when size is 2 * VEC_SIZE or less. > 6. Use 4 vector compares when size is 4 * VEC_SIZE or less. > 7. Use 8 vector compares when size is 8 * VEC_SIZE or less. > > Any comments? > I have some comments, its similar to one of my previous patches > + cmpq $(VEC_SIZE * 2), %rdx > + ja L(more_2x_vec) > + This is unnecessary branch, its likely that there is difference in first 16 bytes regardless of size. Move test about sizes... > +L(last_2x_vec): > + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ > + vmovdqu (%rsi), %ymm2 > + VPCMPEQ (%rdi), %ymm2, %ymm2 > + vpmovmskb %ymm2, %eax > + subl $VEC_MASK, %eax > + jnz L(first_vec) here. > +L(first_vec): > + /* A byte or int32 is different within 16 or 32 bytes. */ > + bsfl %eax, %ecx > +# ifdef USE_AS_WMEMCMP > + xorl %eax, %eax > + movl (%rdi, %rcx), %edx > + cmpl (%rsi, %rcx), %edx > +L(wmemcmp_return): > + setl %al > + negl %eax > + orl $1, %eax > +# else > + movzbl (%rdi, %rcx), %eax > + movzbl (%rsi, %rcx), %edx > + sub %edx, %eax > +# endif > + VZEROUPPER > + ret > + Loading bytes depending on result of bsf is slow, alternative is to find that from vector tests. I could avoid it using tests like this but I didn't measure performance/test it yet. 
vmovdqu (%rdi), %ymm3 VPCMPGTQ %ymm2, %ymm3, %ymm4 VPCMPGTQ %ymm3, %ymm2, %ymm5 vpmovmskb %ymm4, %eax vpmovmskb %ymm5, %edx neg %eax neg %edx lzcnt %eax, %eax lzcnt %edx, %edx sub %edx, %eax ret > + .p2align 4 > +L(less_vec): > +# ifdef USE_AS_WMEMCMP > + /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ > + cmpb $4, %dl > + je L(4) > + jb L(zero) > +# else > + cmpb $1, %dl > + je L(1) > + jb L(zero) > + cmpb $4, %dl > + jb L(between_2_3) > + cmpb $8, %dl > + jb L(between_4_7) > +# endif > + cmpb $16, %dl > + jae L(between_16_31) I am not entirely sure about this as it depends on if one calls memcmp with fixed sizes in loop or not. If size is unpredictable first test if loads cross page boundary for special case. if not do 32-byte comparison and if first different byte is bigger than size return 0.
On Thu, Jun 15, 2017 at 5:34 AM, Ondřej Bílka <neleai@seznam.cz> wrote: > On Thu, Jun 01, 2017 at 08:45:19AM -0700, H.J. Lu wrote: >> Optimize x86-64 memcmp/wmemcmp with AVX2. It uses vector compare as >> much as possible. It is as fast as SSE4 memcmp for size <= 16 bytes >> and up to 2X faster for size > 16 bytes on Haswell and Skylake. Select >> AVX2 memcmp/wmemcmp on AVX2 machines where vzeroupper is preferred and >> AVX unaligned load is fast. >> >> Key features: >> >> 1. Use overlapping compare to avoid branch. >> 2. Use vector compare when size >= 4 bytes for memcmp or size >= 8 >> bytes for wmemcmp. >> 3. If size is 8 * VEC_SIZE or less, unroll the loop. >> 4. Compare 4 * VEC_SIZE at a time with the aligned first memory area. >> 5. Use 2 vector compares when size is 2 * VEC_SIZE or less. >> 6. Use 4 vector compares when size is 4 * VEC_SIZE or less. >> 7. Use 8 vector compares when size is 8 * VEC_SIZE or less. >> >> Any comments? >> > I have some comments, its similar to one of my previous patches > >> + cmpq $(VEC_SIZE * 2), %rdx >> + ja L(more_2x_vec) >> + > This is unnecessary branch, its likely that there is difference in first > 16 bytes regardless of size. Move test about sizes... >> +L(last_2x_vec): >> + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ >> + vmovdqu (%rsi), %ymm2 >> + VPCMPEQ (%rdi), %ymm2, %ymm2 >> + vpmovmskb %ymm2, %eax >> + subl $VEC_MASK, %eax >> + jnz L(first_vec) > here. > If we do that, the size check will be redundant from /* Less than 4 * VEC. */ cmpq $VEC_SIZE, %rdx jbe L(last_vec) cmpq $(VEC_SIZE * 2), %rdx jbe L(last_2x_vec) L(last_4x_vec): Of cause, we can duplicate these blocks to avoid size. > >> +L(first_vec): >> + /* A byte or int32 is different within 16 or 32 bytes. 
*/ >> + bsfl %eax, %ecx >> +# ifdef USE_AS_WMEMCMP >> + xorl %eax, %eax >> + movl (%rdi, %rcx), %edx >> + cmpl (%rsi, %rcx), %edx >> +L(wmemcmp_return): >> + setl %al >> + negl %eax >> + orl $1, %eax >> +# else >> + movzbl (%rdi, %rcx), %eax >> + movzbl (%rsi, %rcx), %edx >> + sub %edx, %eax >> +# endif >> + VZEROUPPER >> + ret >> + > > Loading bytes depending on result of bsf is slow, alternative is to find > that from vector tests. I could avoid it using tests like this but I > didn't measure performance/test it yet. > > vmovdqu (%rdi), %ymm3 > > VPCMPGTQ %ymm2, %ymm3, %ymm4 > VPCMPGTQ %ymm3, %ymm2, %ymm5 > vpmovmskb %ymm4, %eax > vpmovmskb %ymm5, %edx > neg %eax > neg %edx > lzcnt %eax, %eax > lzcnt %edx, %edx > sub %edx, %eax > ret Andrew, can you give it a try? > > >> + .p2align 4 >> +L(less_vec): >> +# ifdef USE_AS_WMEMCMP >> + /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ >> + cmpb $4, %dl >> + je L(4) >> + jb L(zero) >> +# else >> + cmpb $1, %dl >> + je L(1) >> + jb L(zero) >> + cmpb $4, %dl >> + jb L(between_2_3) >> + cmpb $8, %dl >> + jb L(between_4_7) >> +# endif >> + cmpb $16, %dl >> + jae L(between_16_31) > > I am net entirely sure about this as it depends on if one calls memcmp > with fixed sizes in loop or not. If size is unpredictable first test if > loads cross page boudary for special case. if not do 32-byte comparison > and if first different byte is bigger than size return 0. There are 2 loads from 2 different sources. We need to do 2 address checks before using 32-byte vector comparison, I don't know if it will be faster.
2017-06-16 4:15 GMT+02:00 H.J. Lu <hjl.tools@gmail.com>: > On Thu, Jun 15, 2017 at 5:34 AM, Ondřej Bílka <neleai@seznam.cz> wrote: >> On Thu, Jun 01, 2017 at 08:45:19AM -0700, H.J. Lu wrote: >>> Optimize x86-64 memcmp/wmemcmp with AVX2. It uses vector compare as >>> much as possible. It is as fast as SSE4 memcmp for size <= 16 bytes >>> and up to 2X faster for size > 16 bytes on Haswell and Skylake. Select >>> AVX2 memcmp/wmemcmp on AVX2 machines where vzeroupper is preferred and >>> AVX unaligned load is fast. >>> >>> Key features: >>> >>> 1. Use overlapping compare to avoid branch. >>> 2. Use vector compare when size >= 4 bytes for memcmp or size >= 8 >>> bytes for wmemcmp. >>> 3. If size is 8 * VEC_SIZE or less, unroll the loop. >>> 4. Compare 4 * VEC_SIZE at a time with the aligned first memory area. >>> 5. Use 2 vector compares when size is 2 * VEC_SIZE or less. >>> 6. Use 4 vector compares when size is 4 * VEC_SIZE or less. >>> 7. Use 8 vector compares when size is 8 * VEC_SIZE or less. >>> >>> Any comments? >>> >> I have some comments, its similar to one of my previous patches >> >>> + cmpq $(VEC_SIZE * 2), %rdx >>> + ja L(more_2x_vec) >>> + >> This is unnecessary branch, its likely that there is difference in first >> 16 bytes regardless of size. Move test about sizes... >>> +L(last_2x_vec): >>> + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ >>> + vmovdqu (%rsi), %ymm2 >>> + VPCMPEQ (%rdi), %ymm2, %ymm2 >>> + vpmovmskb %ymm2, %eax >>> + subl $VEC_MASK, %eax >>> + jnz L(first_vec) >> here. >> > > If we do that, the size check will be redundant from > > /* Less than 4 * VEC. */ > cmpq $VEC_SIZE, %rdx > jbe L(last_vec) > cmpq $(VEC_SIZE * 2), %rdx > jbe L(last_2x_vec) > > L(last_4x_vec): > > Of cause, we can duplicate these blocks to avoid size. > >> >>> +L(first_vec): >>> + /* A byte or int32 is different within 16 or 32 bytes. 
*/ >>> + bsfl %eax, %ecx >>> +# ifdef USE_AS_WMEMCMP >>> + xorl %eax, %eax >>> + movl (%rdi, %rcx), %edx >>> + cmpl (%rsi, %rcx), %edx >>> +L(wmemcmp_return): >>> + setl %al >>> + negl %eax >>> + orl $1, %eax >>> +# else >>> + movzbl (%rdi, %rcx), %eax >>> + movzbl (%rsi, %rcx), %edx >>> + sub %edx, %eax >>> +# endif >>> + VZEROUPPER >>> + ret >>> + >> >> Loading bytes depending on result of bsf is slow, alternative is to find >> that from vector tests. I could avoid it using tests like this but I >> didn't measure performance/test it yet. >> >> vmovdqu (%rdi), %ymm3 >> >> VPCMPGTQ %ymm2, %ymm3, %ymm4 >> VPCMPGTQ %ymm3, %ymm2, %ymm5 >> vpmovmskb %ymm4, %eax >> vpmovmskb %ymm5, %edx >> neg %eax >> neg %edx >> lzcnt %eax, %eax >> lzcnt %edx, %edx >> sub %edx, %eax >> ret > > Andrew, can you give it a try? Hi Ondrej, could you send patch with you proposal? I have tried with the following change and got many test-memcmp wrong results: < leaq -VEC_SIZE(%rdi, %rdx), %rdi < leaq -VEC_SIZE(%rsi, %rdx), %rsi < vmovdqu (%rsi), %ymm2 < VPCMPEQ (%rdi), %ymm2, %ymm2 --- > leaq -VEC_SIZE(%rdi, %rdx), %r8 > leaq -VEC_SIZE(%rsi, %rdx), %r9 > vmovdqu (%r9), %ymm2 > VPCMPEQ (%r8), %ymm2, %ymm2 91,104c91,103 < tzcntl %eax, %ecx < # ifdef USE_AS_WMEMCMP < xorl %eax, %eax < movl (%rdi, %rcx), %edx < cmpl (%rsi, %rcx), %edx < L(wmemcmp_return): < setl %al < negl %eax < orl $1, %eax < # else < movzbl (%rdi, %rcx), %eax < movzbl (%rsi, %rcx), %edx < sub %edx, %eax < # endif --- > vmovdqu (%rsi), %ymm2 > vmovdqu (%rdi), %ymm3 > > VPCMPGTQ %ymm2, %ymm3, %ymm4 > VPCMPGTQ %ymm3, %ymm2, %ymm5 > vpmovmskb %ymm4, %eax > vpmovmskb %ymm5, %edx > neg %eax > neg %edx > lzcnt %eax, %eax > lzcnt %edx, %edx > sub %edx, %eax > Thanks. -- WBR, Andrew
On Sat, Jun 17, 2017 at 3:44 AM, Andrew Senkevich <andrew.n.senkevich@gmail.com> wrote: > 2017-06-16 4:15 GMT+02:00 H.J. Lu <hjl.tools@gmail.com>: >> On Thu, Jun 15, 2017 at 5:34 AM, Ondřej Bílka <neleai@seznam.cz> wrote: >>> On Thu, Jun 01, 2017 at 08:45:19AM -0700, H.J. Lu wrote: >>>> Optimize x86-64 memcmp/wmemcmp with AVX2. It uses vector compare as >>>> much as possible. It is as fast as SSE4 memcmp for size <= 16 bytes >>>> and up to 2X faster for size > 16 bytes on Haswell and Skylake. Select >>>> AVX2 memcmp/wmemcmp on AVX2 machines where vzeroupper is preferred and >>>> AVX unaligned load is fast. >>>> >>>> Key features: >>>> >>>> 1. Use overlapping compare to avoid branch. >>>> 2. Use vector compare when size >= 4 bytes for memcmp or size >= 8 >>>> bytes for wmemcmp. >>>> 3. If size is 8 * VEC_SIZE or less, unroll the loop. >>>> 4. Compare 4 * VEC_SIZE at a time with the aligned first memory area. >>>> 5. Use 2 vector compares when size is 2 * VEC_SIZE or less. >>>> 6. Use 4 vector compares when size is 4 * VEC_SIZE or less. >>>> 7. Use 8 vector compares when size is 8 * VEC_SIZE or less. >>>> >>>> Any comments? >>>> >>> I have some comments, its similar to one of my previous patches >>> >>>> + cmpq $(VEC_SIZE * 2), %rdx >>>> + ja L(more_2x_vec) >>>> + >>> This is unnecessary branch, its likely that there is difference in first >>> 16 bytes regardless of size. Move test about sizes... >>>> +L(last_2x_vec): >>>> + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ >>>> + vmovdqu (%rsi), %ymm2 >>>> + VPCMPEQ (%rdi), %ymm2, %ymm2 >>>> + vpmovmskb %ymm2, %eax >>>> + subl $VEC_MASK, %eax >>>> + jnz L(first_vec) >>> here. >>> >> >> If we do that, the size check will be redundant from >> >> /* Less than 4 * VEC. */ >> cmpq $VEC_SIZE, %rdx >> jbe L(last_vec) >> cmpq $(VEC_SIZE * 2), %rdx >> jbe L(last_2x_vec) >> >> L(last_4x_vec): >> >> Of cause, we can duplicate these blocks to avoid size. 
>> >>> >>>> +L(first_vec): >>>> + /* A byte or int32 is different within 16 or 32 bytes. */ >>>> + bsfl %eax, %ecx >>>> +# ifdef USE_AS_WMEMCMP >>>> + xorl %eax, %eax >>>> + movl (%rdi, %rcx), %edx >>>> + cmpl (%rsi, %rcx), %edx >>>> +L(wmemcmp_return): >>>> + setl %al >>>> + negl %eax >>>> + orl $1, %eax >>>> +# else >>>> + movzbl (%rdi, %rcx), %eax >>>> + movzbl (%rsi, %rcx), %edx >>>> + sub %edx, %eax >>>> +# endif >>>> + VZEROUPPER >>>> + ret >>>> + >>> >>> Loading bytes depending on result of bsf is slow, alternative is to find >>> that from vector tests. I could avoid it using tests like this but I >>> didn't measure performance/test it yet. >>> >>> vmovdqu (%rdi), %ymm3 >>> >>> VPCMPGTQ %ymm2, %ymm3, %ymm4 >>> VPCMPGTQ %ymm3, %ymm2, %ymm5 >>> vpmovmskb %ymm4, %eax >>> vpmovmskb %ymm5, %edx >>> neg %eax >>> neg %edx >>> lzcnt %eax, %eax >>> lzcnt %edx, %edx >>> sub %edx, %eax >>> ret >> >> Andrew, can you give it a try? > > Hi Ondrej, could you send patch with you proposal? > I have tried with the following change and got many test-memcmp wrong results: We can't use VPCMPGT for memcmp since it performs signed comparison, but memcmp requires unsigned comparison. H.J.
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 3736f54..a62def3 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -6,6 +6,7 @@ ifeq ($(subdir),string) sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ strcmp-sse2-unaligned strncmp-ssse3 \ + memcmp-avx2 \ memcmp-sse4 memcpy-ssse3 \ memmove-ssse3 \ memcpy-ssse3-back \ @@ -30,5 +31,7 @@ CFLAGS-strspn-c.c += -msse4 endif ifeq ($(subdir),wcsmbs) -sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c +sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ + wmemcmp-avx2 \ + wcscpy-ssse3 wcscpy-c endif diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index a91d2f9..35f1960 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -40,6 +40,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/memcmp.S. */ IFUNC_IMPL (i, name, memcmp, + IFUNC_IMPL_ADD (array, i, memcmp, + HAS_ARCH_FEATURE (AVX2_Usable), + __memcmp_avx2) IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_1), __memcmp_sse4_1) IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3), @@ -294,6 +297,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/wmemcmp.S. */ IFUNC_IMPL (i, name, wmemcmp, + IFUNC_IMPL_ADD (array, i, wmemcmp, + HAS_ARCH_FEATURE (AVX2_Usable), + __wmemcmp_avx2) IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_1), __wmemcmp_sse4_1) IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3), diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2.S b/sysdeps/x86_64/multiarch/memcmp-avx2.S new file mode 100644 index 0000000..8e3872a --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmp-avx2.S @@ -0,0 +1,430 @@ +/* memcmp/wmemcmp optimized with AVX2. 
+ Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +/* memcmp/wmemcmp is implemented as: + 1. Use overlapping compare to avoid branch. + 2. Use vector compare when size >= 4 bytes for memcmp or size >= 8 + bytes for wmemcmp. + 3. If size is 8 * VEC_SIZE or less, unroll the loop. + 4. Compare 4 * VEC_SIZE at a time with the aligned first memory + area. + 5. Use 2 vector compares when size is 2 * VEC_SIZE or less. + 6. Use 4 vector compares when size is 4 * VEC_SIZE or less. + 7. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ + +# include <sysdep.h> + +# ifndef MEMCMP +# define MEMCMP __memcmp_avx2 +# endif + +# ifdef USE_AS_WMEMCMP +# define VPCMPEQ vpcmpeqd +# else +# define VPCMPEQ vpcmpeqb +# endif + +# ifndef VZEROUPPER +# define VZEROUPPER vzeroupper +# endif + +# define VEC_SIZE 32 +# define VEC_MASK ((1 << VEC_SIZE) - 1) + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +*/ + + .section .text.avx,"ax",@progbits +ENTRY (MEMCMP) +# ifdef USE_AS_WMEMCMP + shl $2, %rdx +# endif + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) + +L(last_2x_vec): + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. 
*/ + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + +L(last_vec): + leaq -VEC_SIZE(%rdi, %rdx), %rdi + leaq -VEC_SIZE(%rsi, %rdx), %rsi + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + VZEROUPPER + ret + + .p2align 4 +L(first_vec): + /* A byte or int32 is different within 16 or 32 bytes. */ + bsfl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl (%rdi, %rcx), %edx + cmpl (%rsi, %rcx), %edx +L(wmemcmp_return): + setl %al + negl %eax + orl $1, %eax +# else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %edx + sub %edx, %eax +# endif + VZEROUPPER + ret + +# ifdef USE_AS_WMEMCMP + .p2align 4 +L(4): + xorl %eax, %eax + movl (%rdi), %edx + cmpl (%rsi), %edx + jne L(wmemcmp_return) + ret +# else + .p2align 4 +L(between_4_7): + vmovd (%rdi), %xmm1 + vmovd (%rsi), %xmm2 + VPCMPEQ %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax + jnz L(first_vec) + leaq -4(%rdi, %rdx), %rdi + leaq -4(%rsi, %rdx), %rsi + vmovd (%rdi), %xmm1 + vmovd (%rsi), %xmm2 + VPCMPEQ %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(between_2_3): + /* Load 2 bytes into registers. */ + movzwl (%rdi), %eax + movzwl (%rsi), %ecx + /* Compare the lowest byte. */ + cmpb %cl, %al + jne L(1byte_reg) + /* Load the difference of 2 bytes into EAX. */ + subl %ecx, %eax + /* Return if 2 bytes differ. */ + jnz L(exit) + cmpb $2, %dl + /* Return if these are the last 2 bytes. 
*/ + je L(exit) + movzbl 2(%rdi), %eax + movzbl 2(%rsi), %ecx + subl %ecx, %eax + ret + + .p2align 4 +L(exit): + ret + + .p2align 4 +L(1byte_reg): + movzbl %al, %eax + movzbl %cl, %ecx + sub %ecx, %eax + ret + + .p2align 4 +L(1): + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + subl %ecx, %eax + ret +# endif + + .p2align 4 +L(zero): + xorl %eax, %eax + ret + + .p2align 4 +L(less_vec): +# ifdef USE_AS_WMEMCMP + /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ + cmpb $4, %dl + je L(4) + jb L(zero) +# else + cmpb $1, %dl + je L(1) + jb L(zero) + cmpb $4, %dl + jb L(between_2_3) + cmpb $8, %dl + jb L(between_4_7) +# endif + cmpb $16, %dl + jae L(between_16_31) + /* It is between 8 and 15 bytes. */ + vmovq (%rdi), %xmm1 + vmovq (%rsi), %xmm2 + VPCMPEQ %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax + jnz L(first_vec) + leaq -8(%rdi, %rdx), %rdi + leaq -8(%rsi, %rdx), %rsi + vmovq (%rdi), %xmm1 + vmovq (%rsi), %xmm2 + VPCMPEQ %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + vmovdqu (%rsi), %xmm2 + VPCMPEQ (%rdi), %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax + jnz L(first_vec) + + leaq -16(%rdi, %rdx), %rdi + leaq -16(%rsi, %rdx), %rsi + vmovdqu (%rsi), %xmm2 + VPCMPEQ (%rdi), %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(more_2x_vec): + /* More than 2 * VEC. */ + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) + cmpq $(VEC_SIZE * 4), %rdx + jb L(last_4x_vec) + + /* From 4 * VEC to 8 * VEC, inclusively. 
*/ + vmovdqu (%rsi), %ymm1 + VPCMPEQ (%rdi), %ymm1, %ymm1 + + vmovdqu VEC_SIZE(%rsi), %ymm2 + VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 + + vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 + + vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 + + vpand %ymm1, %ymm2, %ymm5 + vpand %ymm3, %ymm4, %ymm6 + vpand %ymm5, %ymm6, %ymm5 + + vpmovmskb %ymm5, %eax + subl $VEC_MASK, %eax + jnz L(4x_vec_end) + + leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi + leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi + vmovdqu (%rsi), %ymm1 + VPCMPEQ (%rdi), %ymm1, %ymm1 + + vmovdqu VEC_SIZE(%rsi), %ymm2 + VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 + vpand %ymm2, %ymm1, %ymm5 + + vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 + vpand %ymm3, %ymm5, %ymm5 + + vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 + vpand %ymm4, %ymm5, %ymm5 + + vpmovmskb %ymm5, %eax + subl $VEC_MASK, %eax + jnz L(4x_vec_end) + VZEROUPPER + ret + + .p2align 4 +L(more_8x_vec): + /* More than 8 * VEC. Check the first VEC. */ + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + /* Align the first memory area for aligned loads in the loop. + Compute how much the first memory area is misaligned. */ + movq %rdi, %rcx + andl $(VEC_SIZE - 1), %ecx + /* Get the negative of offset for alignment. */ + subq $VEC_SIZE, %rcx + /* Adjust the second memory area. */ + subq %rcx, %rsi + /* Adjust the first memory area which should be aligned now. */ + subq %rcx, %rdi + /* Adjust length. */ + addq %rcx, %rdx + +L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. 
*/ + vmovdqu (%rsi), %ymm1 + VPCMPEQ (%rdi), %ymm1, %ymm1 + + vmovdqu VEC_SIZE(%rsi), %ymm2 + VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 + vpand %ymm2, %ymm1, %ymm5 + + vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 + vpand %ymm3, %ymm5, %ymm5 + + vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 + vpand %ymm4, %ymm5, %ymm5 + + vpmovmskb %ymm5, %eax + subl $VEC_MASK, %eax + jnz L(4x_vec_end) + + addq $(VEC_SIZE * 4), %rdi + addq $(VEC_SIZE * 4), %rsi + + subq $(VEC_SIZE * 4), %rdx + cmpq $(VEC_SIZE * 4), %rdx + jae L(loop_4x_vec) + + /* Less than 4 * VEC. */ + cmpq $VEC_SIZE, %rdx + jbe L(last_vec) + cmpq $(VEC_SIZE * 2), %rdx + jbe L(last_2x_vec) + +L(last_4x_vec): + /* From 2 * VEC to 4 * VEC. */ + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + addq $VEC_SIZE, %rdi + addq $VEC_SIZE, %rsi + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi + leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + addq $VEC_SIZE, %rdi + addq $VEC_SIZE, %rsi + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + VZEROUPPER + ret + + .p2align 4 +L(4x_vec_end): + vpmovmskb %ymm1, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec_x1) + vpmovmskb %ymm3, %eax + subl $VEC_MASK, %eax + jnz L(first_vec_x2) + vpmovmskb %ymm4, %eax + subl $VEC_MASK, %eax + bsfl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl (VEC_SIZE * 3)(%rdi, %rcx), %edx + cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx + jmp L(wmemcmp_return) +# else + movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx + sub %edx, 
%eax +# endif + VZEROUPPER + ret + + .p2align 4 +L(first_vec_x1): + bsfl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl VEC_SIZE(%rdi, %rcx), %edx + cmpl VEC_SIZE(%rsi, %rcx), %edx + jmp L(wmemcmp_return) +# else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %edx + sub %edx, %eax +# endif + VZEROUPPER + ret + + .p2align 4 +L(first_vec_x2): + bsfl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl (VEC_SIZE * 2)(%rdi, %rcx), %edx + cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx + jmp L(wmemcmp_return) +# else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx + sub %edx, %eax +# endif + VZEROUPPER + ret +END (MEMCMP) +#endif diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S index 6129820..08acacb 100644 --- a/sysdeps/x86_64/multiarch/memcmp.S +++ b/sysdeps/x86_64/multiarch/memcmp.S @@ -27,7 +27,16 @@ ENTRY(memcmp) .type memcmp, @gnu_indirect_function LOAD_RTLD_GLOBAL_RO_RDX - HAS_CPU_FEATURE (SSSE3) + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 1f + HAS_ARCH_FEATURE (AVX2_Usable) + jz 1f + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz 1f + leaq __memcmp_avx2(%rip), %rax + ret + +1: HAS_CPU_FEATURE (SSSE3) jnz 2f leaq __memcmp_sse2(%rip), %rax ret diff --git a/sysdeps/x86_64/multiarch/wmemcmp-avx2.S b/sysdeps/x86_64/multiarch/wmemcmp-avx2.S new file mode 100644 index 0000000..aa2190b --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp-avx2.S @@ -0,0 +1,4 @@ +#define MEMCMP __wmemcmp_avx2 +#define USE_AS_WMEMCMP 1 + +#include "memcmp-avx2.S" diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S index 5dc54d7..46ee8f5 100644 --- a/sysdeps/x86_64/multiarch/wmemcmp.S +++ b/sysdeps/x86_64/multiarch/wmemcmp.S @@ -27,7 +27,16 @@ ENTRY(wmemcmp) .type wmemcmp, @gnu_indirect_function LOAD_RTLD_GLOBAL_RO_RDX - HAS_CPU_FEATURE (SSSE3) + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 1f + HAS_ARCH_FEATURE (AVX2_Usable) + jz 1f + HAS_ARCH_FEATURE 
(AVX_Fast_Unaligned_Load) + jz 1f + leaq __wmemcmp_avx2(%rip), %rax + ret + +1: HAS_CPU_FEATURE (SSSE3) jnz 2f leaq __wmemcmp_sse2(%rip), %rax ret