Message ID: CAMe9rOr9r5s8mTThYDomrJsBEk_zQLpMQYeQ6rJ41S3VXS-Bmg@mail.gmail.com
State: New
On 03/29/2016 03:23 PM, H.J. Lu wrote:
> The goal of this patch is to replace SSE2 and AVX2 memset.S
> with faster and smaller alternatives, and also to support a 64-byte
> vector register size.  bench-memset data on various Intel and AMD
> processors is at
>
> https://sourceware.org/bugzilla/show_bug.cgi?id=19881
>
> Any comments or feedback?

The caveats about Penryn being slower apply here as well, and I expect
your answer is the same: the selection of the ifunc will not change,
and so Penryn will not use the newer versions.

This looks good to me.  Again, same question about thresholding below.

> --
> H.J.
>
> 0001-Add-x86-64-memset-with-unaligned-store-and-rep-stosb.patch
>
> From d0d3495951be16568656971dd2c825da68c2660c Mon Sep 17 00:00:00 2001
> From: "H.J. Lu" <hjl.tools@gmail.com>
> Date: Fri, 25 Mar 2016 08:20:17 -0700
> Subject: [PATCH] Add x86-64 memset with unaligned store and rep stosb
>
> Implement x86-64 memset with unaligned stores and rep stosb.  Support
> 16-byte, 32-byte and 64-byte vector register sizes.  A single file
> provides two implementations of memset, one with rep stosb and the
> other without.  The two share the same code when the size is between
> 2 times the vector register size and REP_STOSB_THRESHOLD, which is
> 1KB for the 16-byte vector register size and is scaled up for larger
> vector register sizes.
>
> Key features:
>
> 1. Use overlapping stores to avoid branches.
> 2. For size <= 4 times the vector register size, fully unroll the loop.
> 3. For size > 4 times the vector register size, store 4 times the
>    vector register size at a time.
>
> 	[BZ #19881]
> 	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
> 	memset-sse2-unaligned-erms, memset-avx2-unaligned-erms and
> 	memset-avx512-unaligned-erms.
> 	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
> 	(__libc_ifunc_impl_list): Test __memset_chk_sse2_unaligned,
> 	__memset_chk_sse2_unaligned_erms, __memset_chk_avx2_unaligned,
> 	__memset_chk_avx2_unaligned_erms, __memset_chk_avx512_unaligned,
> 	__memset_chk_avx512_unaligned_erms, __memset_sse2_unaligned,
> 	__memset_sse2_unaligned_erms, __memset_erms,
> 	__memset_avx2_unaligned, __memset_avx2_unaligned_erms,
> 	__memset_avx512_unaligned_erms and __memset_avx512_unaligned.
> 	* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S: New
> 	file.
> 	* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S:
> 	Likewise.
> 	* sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S:
> 	Likewise.
> 	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S:
> 	Likewise.
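For readers following the "overlapping stores" point: the trick is to
cover a whole size range with two fixed-width stores whose ranges may
overlap, instead of branching on the exact length.  A minimal C sketch
of the idea (a hypothetical helper, not code from this patch; the patch
does the same thing with 16/32/64-byte vector stores):

  #include <stdint.h>
  #include <string.h>

  /* Fill 8 <= n <= 16 bytes with two possibly-overlapping 8-byte
     stores; no branch on the exact value of n.  */
  static void
  set_8_to_16 (void *dst, int c, size_t n)
  {
    uint64_t v = 0x0101010101010101ULL * (unsigned char) c;
    memcpy (dst, &v, 8);                   /* first 8 bytes */
    memcpy ((char *) dst + n - 8, &v, 8);  /* last 8 bytes */
  }

When n == 8 the two stores coincide, and for 8 < n < 16 they overlap in
the middle, so the whole range is handled branch-free.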
>
> Memset
> ---
>  sysdeps/x86_64/multiarch/Makefile                  |   5 +-
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c         |  33 +++
>  .../x86_64/multiarch/memset-avx2-unaligned-erms.S  |  14 ++
>  .../multiarch/memset-avx512-unaligned-erms.S       |  17 ++
>  .../x86_64/multiarch/memset-sse2-unaligned-erms.S  |  16 ++
>  .../x86_64/multiarch/memset-vec-unaligned-erms.S   | 246 +++++++++++++++++++++
>  6 files changed, 330 insertions(+), 1 deletion(-)
>  create mode 100644 sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
>  create mode 100644 sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
>  create mode 100644 sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
>  create mode 100644 sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index ef4dbc0..8878efb 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -23,7 +23,10 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
>  		   memset-avx512-no-vzeroupper \
>  		   memmove-sse2-unaligned-erms \
>  		   memmove-avx-unaligned-erms \
> -		   memmove-avx512-unaligned-erms
> +		   memmove-avx512-unaligned-erms \
> +		   memset-sse2-unaligned-erms \
> +		   memset-avx2-unaligned-erms \
> +		   memset-avx512-unaligned-erms

OK.

>  CFLAGS-varshift.c += -msse4
>  CFLAGS-strcspn-c.c += -msse4
>  CFLAGS-strpbrk-c.c += -msse4
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 9204da4..1e880f6 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -118,12 +118,28 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>    IFUNC_IMPL (i, name, __memset_chk,
>  	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
>  			      __memset_chk_sse2)
> +	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
> +			      __memset_chk_sse2_unaligned)
> +	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
> +			      __memset_chk_sse2_unaligned_erms)
>  	      IFUNC_IMPL_ADD (array, i, __memset_chk,
>  			      HAS_ARCH_FEATURE (AVX2_Usable),
>  			      __memset_chk_avx2)
> +	      IFUNC_IMPL_ADD (array, i, __memset_chk,
> +			      HAS_ARCH_FEATURE (AVX2_Usable),
> +			      __memset_chk_avx2_unaligned)
> +	      IFUNC_IMPL_ADD (array, i, __memset_chk,
> +			      HAS_ARCH_FEATURE (AVX2_Usable),
> +			      __memset_chk_avx2_unaligned_erms)
>  #ifdef HAVE_AVX512_ASM_SUPPORT
>  	      IFUNC_IMPL_ADD (array, i, __memset_chk,
>  			      HAS_ARCH_FEATURE (AVX512F_Usable),
> +			      __memset_chk_avx512_unaligned_erms)
> +	      IFUNC_IMPL_ADD (array, i, __memset_chk,
> +			      HAS_ARCH_FEATURE (AVX512F_Usable),
> +			      __memset_chk_avx512_unaligned)
> +	      IFUNC_IMPL_ADD (array, i, __memset_chk,
> +			      HAS_ARCH_FEATURE (AVX512F_Usable),
>  			      __memset_chk_avx512_no_vzeroupper)
>  #endif
>  	      )
> @@ -131,12 +147,29 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>    /* Support sysdeps/x86_64/multiarch/memset.S.  */
>    IFUNC_IMPL (i, name, memset,
>  	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
> +	      IFUNC_IMPL_ADD (array, i, memset, 1,
> +			      __memset_sse2_unaligned)
> +	      IFUNC_IMPL_ADD (array, i, memset, 1,
> +			      __memset_sse2_unaligned_erms)
> +	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
>  	      IFUNC_IMPL_ADD (array, i, memset,
>  			      HAS_ARCH_FEATURE (AVX2_Usable),
>  			      __memset_avx2)
> +	      IFUNC_IMPL_ADD (array, i, memset,
> +			      HAS_ARCH_FEATURE (AVX2_Usable),
> +			      __memset_avx2_unaligned)
> +	      IFUNC_IMPL_ADD (array, i, memset,
> +			      HAS_ARCH_FEATURE (AVX2_Usable),
> +			      __memset_avx2_unaligned_erms)
>  #ifdef HAVE_AVX512_ASM_SUPPORT
>  	      IFUNC_IMPL_ADD (array, i, memset,
>  			      HAS_ARCH_FEATURE (AVX512F_Usable),
> +			      __memset_avx512_unaligned_erms)
> +	      IFUNC_IMPL_ADD (array, i, memset,
> +			      HAS_ARCH_FEATURE (AVX512F_Usable),
> +			      __memset_avx512_unaligned)
> +	      IFUNC_IMPL_ADD (array, i, memset,
> +			      HAS_ARCH_FEATURE (AVX512F_Usable),
>  			      __memset_avx512_no_vzeroupper)
>  #endif
>  	      )
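On the Penryn point above: this list only registers every variant so
the test and benchmark harnesses can exercise them; which variant a
process actually gets is decided once by the memset ifunc selector, so
older chips keep their current implementation until that selector is
changed.  For readers unfamiliar with the mechanism, a hypothetical
C-style sketch of such a selector; the real one for memset in this tree
is assembly in sysdeps/x86_64/multiarch/memset.S, and the names below
are illustrative:

  #include <stddef.h>

  typedef void *(*memset_fn) (void *, int, size_t);

  extern void *__memset_sse2_unaligned (void *, int, size_t);
  extern void *__memset_avx2_unaligned (void *, int, size_t);

  /* Hypothetical resolver: returns the variant this CPU should use.
     A CPU without AVX2 (e.g. Penryn) keeps the SSE2 path.  */
  static memset_fn
  resolve_memset (int avx2_usable)
  {
    return avx2_usable ? __memset_avx2_unaligned : __memset_sse2_unaligned;
  }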
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> new file mode 100644
> index 0000000..e0dc565
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> @@ -0,0 +1,14 @@
> +#define VEC_SIZE	32
> +#define VEC(i)		ymm##i
> +#define VMOVU		vmovdqu
> +#define VMOVA		vmovdqa
> +
> +#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> +  vmovd d, %xmm0; \
> +  movq r, %rax; \
> +  vpbroadcastb %xmm0, %ymm0
> +
> +#define SECTION(p)		p##.avx
> +#define MEMSET_SYMBOL(p,s)	p##_avx2_##s

OK.

> +
> +#include "memset-vec-unaligned-erms.S"
> diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> new file mode 100644
> index 0000000..72f4095
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> @@ -0,0 +1,17 @@
> +#ifdef HAVE_AVX512_ASM_SUPPORT
> +# define VEC_SIZE	64
> +# define VEC(i)		zmm##i
> +# define VMOVU		vmovdqu64
> +# define VMOVA		vmovdqa64
> +
> +# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> +  vmovd d, %xmm0; \
> +  movq r, %rax; \
> +  vpbroadcastb %xmm0, %xmm0; \
> +  vpbroadcastq %xmm0, %zmm0
> +
> +# define SECTION(p)		p##.avx512
> +# define MEMSET_SYMBOL(p,s)	p##_avx512_##s

OK.

> +
> +# include "memset-vec-unaligned-erms.S"
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> new file mode 100644
> index 0000000..437a858
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> @@ -0,0 +1,16 @@
> +#define VEC_SIZE	16
> +#define VEC(i)		xmm##i
> +#define VMOVU		movdqu
> +#define VMOVA		movdqa
> +
> +#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> +  movd d, %xmm0; \
> +  movq r, %rax; \
> +  punpcklbw %xmm0, %xmm0; \
> +  punpcklwd %xmm0, %xmm0; \
> +  pshufd $0, %xmm0, %xmm0
> +
> +#define SECTION(p)		p
> +#define MEMSET_SYMBOL(p,s)	p##_sse2_##s

OK.

> +
> +#include "memset-vec-unaligned-erms.S"
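For anyone new to this template scheme: each wrapper sets the macros
and then includes the shared body, so one source file expands into
per-ISA symbols.  A tiny standalone illustration of the token pasting
involved (hypothetical, fixed to the avx2 case):

  #include <stdio.h>

  #define MEMSET_SYMBOL(p, s) p##_avx2_##s
  #define STR2(x) #x
  #define STR(x) STR2 (x)

  int
  main (void)
  {
    /* MEMSET_SYMBOL (__memset, unaligned) pastes to
       __memset_avx2_unaligned, matching a name registered in
       ifunc-impl-list.c above.  */
    puts (STR (MEMSET_SYMBOL (__memset, unaligned)));
    return 0;
  }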
> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> new file mode 100644
> index 0000000..dd04789
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> @@ -0,0 +1,246 @@
> +/* memset/bzero with unaligned store and rep stosb
> +   Copyright (C) 2016 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +/* memset is implemented as:
> +   1. Use overlapping store to avoid branch.
> +   2. Force 32-bit displacement for branches to avoid long nop between
> +      instructions.
> +   3. If size is less than VEC, use integer register stores.
> +   4. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
> +   5. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
> +   6. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
> +      4 VEC stores and store 4 * VEC at a time until done.
> +   */

Use GNU formatting please, e.g.

  /* foo.  */

not

  /* foo.
   */

> +#include <sysdep.h>
> +
> +#ifndef VZEROUPPER
> +# if VEC_SIZE > 16
> +#  define VZEROUPPER	vzeroupper
> +# else
> +#  define VZEROUPPER
> +# endif
> +#endif
> +
> +#ifndef VZEROUPPER_SHORT_RETURN
> +# if VEC_SIZE > 16
> +#  define VZEROUPPER_SHORT_RETURN	vzeroupper
> +# else
> +#  define VZEROUPPER_SHORT_RETURN	rep
> +# endif
> +#endif
> +
> +#ifndef MOVQ
> +# if VEC_SIZE > 16
> +#  define MOVQ	vmovq
> +# else
> +#  define MOVQ	movq
> +# endif
> +#endif
> +
> +/* Threshold to use Enhanced REP STOSB.  */
> +#ifndef REP_STOSB_THRESHOLD
> +# define REP_STOSB_THRESHOLD	(1024 * (VEC_SIZE / 16))

Same question as your other patch: how are we selecting this threshold?

> +#endif
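To spell out the arithmetic: as written, REP_STOSB_THRESHOLD works out
to 1KB for VEC_SIZE 16, 2KB for 32 and 4KB for 64, so it scales with
the vector width; it would be good to document the measurements behind
that scaling.  For anyone unfamiliar with the ERMS path it gates, the
operation is a plain rep stosb with the usual register contract; a
sketch in C with GCC inline asm (illustrative only, not this patch's
code):

  #include <stddef.h>

  /* Sketch of the L(stosb) path below: al holds the fill byte, rcx
     the count, rdi the destination; rep stosb advances rdi as it
     stores, so the original pointer is kept for the return value.  */
  static void *
  memset_erms (void *dst, int c, size_t n)
  {
    void *d = dst;
    __asm__ __volatile__ ("rep stosb"
                          : "+D" (d), "+c" (n)
                          : "a" ((unsigned char) c)
                          : "memory");
    return dst;
  }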
> +
> +#ifndef SECTION
> +# error SECTION is not defined!
> +#endif
> +
> +#if !defined USE_MULTIARCH && IS_IN (libc)
> +	.section SECTION(.text),"ax",@progbits
> +ENTRY (__bzero)
> +	movq	%rdi, %rax	/* Set return value.  */
> +	movq	%rsi, %rdx	/* Set n.  */
> +	pxor	%xmm0, %xmm0
> +	jmp	L(entry_from_bzero)
> +END (__bzero)
> +weak_alias (__bzero, bzero)
> +#endif
> +
> +#if defined SHARED && IS_IN (libc)
> +ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned))
> +	cmpq	%rdx, %rcx
> +	jb	HIDDEN_JUMPTARGET (__chk_fail)
> +END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned))
> +#endif
> +
> +ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> +L(memset_entry):
> +	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> +L(entry_from_bzero):
> +	cmpq	$VEC_SIZE, %rdx
> +	jb	L(less_vec)
> +	cmpq	$(VEC_SIZE * 2), %rdx
> +	ja	L(more_2x_vec)
> +	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> +	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
> +	VMOVU	%VEC(0), (%rdi)
> +	VZEROUPPER
> +	ret
> +END (MEMSET_SYMBOL (__memset, unaligned))
> +
> +#if VEC_SIZE == 16
> +/* Only used to measure performance of REP STOSB.  */
> +ENTRY (__memset_erms)
> +#else
> +/* Provide a symbol to debugger.  */
> +ENTRY (MEMSET_SYMBOL (__memset, erms))
> +#endif
> +L(stosb):
> +	movq	%rdx, %rcx
> +	movzbl	%sil, %eax
> +	movq	%rdi, %rdx
> +	rep stosb
> +	movq	%rdx, %rax
> +	ret
> +#if VEC_SIZE == 16
> +END (__memset_erms)
> +#else
> +END (MEMSET_SYMBOL (__memset, erms))
> +#endif
> +
> +#if defined SHARED && IS_IN (libc)
> +ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms))
> +	cmpq	%rdx, %rcx
> +	jb	HIDDEN_JUMPTARGET (__chk_fail)
> +END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms))
> +#endif
> +
> +ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
> +	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> +	cmpq	$VEC_SIZE, %rdx
> +	jb	L(less_vec)
> +	cmpq	$(VEC_SIZE * 2), %rdx
> +	ja	L(stosb_more_2x_vec)
> +	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> +	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
> +	VMOVU	%VEC(0), (%rdi)
> +	VZEROUPPER
> +	ret
> +
> +	.p2align 4
> +L(stosb_more_2x_vec):
> +	cmpq	$REP_STOSB_THRESHOLD, %rdx
> +	/* Force 32-bit displacement to avoid long nop between
> +	   instructions.  */
> +	ja.d32	L(stosb)
> +	.p2align 4
> +L(more_2x_vec):
> +	cmpq	$(VEC_SIZE * 4), %rdx
> +	ja	L(loop_start)
> +	VMOVU	%VEC(0), (%rdi)
> +	VMOVU	%VEC(0), VEC_SIZE(%rdi)
> +	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
> +	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
> +L(return):
> +	VZEROUPPER
> +	ret
> +
> +	.p2align 4
> +L(loop_start):
> +	leaq	(VEC_SIZE * 4)(%rdi), %rcx
> +	VMOVU	%VEC(0), (%rdi)
> +	andq	$-(VEC_SIZE * 4), %rcx
> +	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
> +	VMOVU	%VEC(0), VEC_SIZE(%rdi)
> +	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
> +	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
> +	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
> +	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
> +	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
> +	addq	%rdi, %rdx
> +	andq	$-(VEC_SIZE * 4), %rdx
> +	cmpq	%rdx, %rcx
> +# if VEC_SIZE == 32 || VEC_SIZE == 64
> +	/* Force 32-bit displacement to avoid long nop between
> +	   instructions.  */
> +	je.d32	L(return)
> +# else
> +	je	L(return)
> +# endif
> +	.p2align 4
> +L(loop):
> +	VMOVA	%VEC(0), (%rcx)
> +	VMOVA	%VEC(0), VEC_SIZE(%rcx)
> +	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
> +	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
> +	addq	$(VEC_SIZE * 4), %rcx
> +	cmpq	%rcx, %rdx
> +	jne	L(loop)
> +	VZEROUPPER_SHORT_RETURN
> +	ret
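The L(loop_start) setup is the subtle part: head and tail are written
with unaligned VEC stores first, and the inner loop then runs only over
the 4 * VEC-aligned middle.  A C sketch of the same bounds computation
(illustrative only; VEC here stands for VEC_SIZE, vstore is played by
memcpy, and the head/tail stores are omitted for brevity):

  #include <stddef.h>
  #include <stdint.h>
  #include <string.h>

  #define VEC 32  /* stand-in for VEC_SIZE */

  /* Sketch of L(loop_start)/L(loop): compute the first and last
     4*VEC-aligned addresses inside [dst, dst + n) and fill the middle
     4 * VEC bytes at a time with aligned stores.  */
  static void
  fill_aligned_middle (char *dst, const char *vec /* VEC fill bytes */,
                       size_t n /* assumed > 4 * VEC */)
  {
    uintptr_t mask = ~(uintptr_t) (4 * VEC - 1);
    char *p = (char *) (((uintptr_t) dst + 4 * VEC) & mask);
    char *stop = (char *) (((uintptr_t) dst + n) & mask);
    /* The real code first covers dst..p and stop..dst+n with
       unaligned VEC stores, so nothing is missed.  */
    for (; p != stop; p += 4 * VEC)
      for (int i = 0; i < 4; i++)
        memcpy (p + i * VEC, vec, VEC);  /* stands in for 4x VMOVA */
  }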
> +L(less_vec):
> +	/* Less than 1 VEC.  */
> +# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
> +#  error Unsupported VEC_SIZE!
> +# endif
> +# if VEC_SIZE > 32
> +	cmpb	$32, %dl
> +	jae	L(between_32_63)
> +# endif
> +# if VEC_SIZE > 16
> +	cmpb	$16, %dl
> +	jae	L(between_16_31)
> +# endif
> +	MOVQ	%xmm0, %rcx
> +	cmpb	$8, %dl
> +	jae	L(between_8_15)
> +	cmpb	$4, %dl
> +	jae	L(between_4_7)
> +	cmpb	$1, %dl
> +	ja	L(between_2_3)
> +	jb	1f
> +	movb	%cl, (%rdi)
> +1:
> +	VZEROUPPER
> +	ret
> +# if VEC_SIZE > 32
> +	/* From 32 to 63.  No branch when size == 32.  */
> +L(between_32_63):
> +	vmovdqu	%ymm0, -32(%rdi,%rdx)
> +	vmovdqu	%ymm0, (%rdi)
> +	VZEROUPPER
> +	ret
> +# endif
> +# if VEC_SIZE > 16
> +	/* From 16 to 31.  No branch when size == 16.  */
> +L(between_16_31):
> +	vmovdqu	%xmm0, -16(%rdi,%rdx)
> +	vmovdqu	%xmm0, (%rdi)
> +	VZEROUPPER
> +	ret
> +# endif
> +	/* From 8 to 15.  No branch when size == 8.  */
> +L(between_8_15):
> +	movq	%rcx, -8(%rdi,%rdx)
> +	movq	%rcx, (%rdi)
> +	VZEROUPPER
> +	ret
> +L(between_4_7):
> +	/* From 4 to 7.  No branch when size == 4.  */
> +	movl	%ecx, -4(%rdi,%rdx)
> +	movl	%ecx, (%rdi)
> +	VZEROUPPER
> +	ret
> +L(between_2_3):
> +	/* From 2 to 3.  No branch when size == 2.  */
> +	movw	%cx, -2(%rdi,%rdx)
> +	movw	%cx, (%rdi)
> +	VZEROUPPER
> +	ret
> +END (MEMSET_SYMBOL (__memset, unaligned_erms))
> --
> 2.5.5

OK.
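As an aside, for anyone wanting to sanity-check the bench-memset
numbers from the bug without the glibc harness, a throwaway timing
sketch along these lines is enough to see the small/medium/large
regimes (hypothetical and simplified; not bench-memset itself):

  #include <stdio.h>
  #include <string.h>
  #include <time.h>

  int
  main (void)
  {
    static char buf[1 << 20];
    size_t sizes[] = { 16, 256, 1024, 4096, 1 << 20 };
    for (size_t i = 0; i < sizeof sizes / sizeof sizes[0]; i++)
      {
        size_t n = sizes[i];
        struct timespec t0, t1;
        clock_gettime (CLOCK_MONOTONIC, &t0);
        for (int r = 0; r < 100000; r++)
          {
            memset (buf, r & 0xff, n);
            /* Compiler barrier so the memset is not optimized away.  */
            __asm__ __volatile__ ("" : : "r" (buf) : "memory");
          }
        clock_gettime (CLOCK_MONOTONIC, &t1);
        double ns = (t1.tv_sec - t0.tv_sec) * 1e9
                    + (t1.tv_nsec - t0.tv_nsec);
        printf ("%8zu bytes: %.1f ns/call\n", n, ns / 100000);
      }
    return 0;
  }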