Message ID | 20160518205420.GA28859@intel.com |
---|---|
State | New |
On Wed, May 18, 2016 at 1:54 PM, H.J. Lu <hongjiu.lu@intel.com> wrote:
> X86-64 memset-vec-unaligned-erms.S aligns many jump targets, which
> increases code size but does not necessarily improve performance.  The
> memset benchtest data comparing align vs. no align on various Intel and
> AMD processors
>
> https://sourceware.org/bugzilla/attachment.cgi?id=9277
>
> shows that aligning jump targets isn't necessary.
>
> Any comments or feedback?
>
>
> H.J.
> ---
> 	[BZ #20115]
> 	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S (__memset):
> 	Remove alignments on jump targets.
> ---
>  .../x86_64/multiarch/memset-vec-unaligned-erms.S  | 37 +++-------------------
>  1 file changed, 5 insertions(+), 32 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> index 578a5ae..b1df228 100644
> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> @@ -18,12 +18,10 @@
>
>  /* memset is implemented as:
>     1. Use overlapping store to avoid branch.
> -   2. Force 32-bit displacement for branches to avoid long nop between
> -      instructions.
> -   3. If size is less than VEC, use integer register stores.
> -   4. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
> -   5. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
> -   6. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
> +   2. If size is less than VEC, use integer register stores.
> +   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
> +   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
> +   5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
>        4 VEC stores and store 4 * VEC at a time until done.  */
>
>  #include <sysdep.h>
> @@ -143,14 +141,10 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
>  	VZEROUPPER
>  	ret
>
> -	.p2align 4
>  L(stosb_more_2x_vec):
>  	cmpq	$REP_STOSB_THRESHOLD, %rdx
> -	/* Force 32-bit displacement to avoid long nop between
> -	   instructions.  */
> -	ja.d32	L(stosb)
> +	ja	L(stosb)
>  #endif
> -	.p2align 4
>  L(more_2x_vec):
>  	cmpq	$(VEC_SIZE * 4), %rdx
>  	ja	L(loop_start)
> @@ -162,26 +156,12 @@ L(return):
>  	VZEROUPPER
>  	ret
>
> -	.p2align 4
>  L(loop_start):
>  	leaq	(VEC_SIZE * 4)(%rdi), %rcx
> -# if VEC_SIZE == 32 || VEC_SIZE == 64
> -	/* Force 32-bit displacement to avoid long nop between
> -	   instructions.  */
> -	VMOVU.d32 %VEC(0), (%rdi)
> -# else
>  	VMOVU	%VEC(0), (%rdi)
> -# endif
>  	andq	$-(VEC_SIZE * 4), %rcx
> -# if VEC_SIZE == 32
> -	/* Force 32-bit displacement to avoid long nop between
> -	   instructions.  */
> -	VMOVU.d32 %VEC(0), -VEC_SIZE(%rdi,%rdx)
> -	VMOVU.d32 %VEC(0), VEC_SIZE(%rdi)
> -# else
>  	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
>  	VMOVU	%VEC(0), VEC_SIZE(%rdi)
> -# endif
>  	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
>  	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
>  	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
> @@ -190,14 +170,7 @@ L(loop_start):
>  	addq	%rdi, %rdx
>  	andq	$-(VEC_SIZE * 4), %rdx
>  	cmpq	%rdx, %rcx
> -# if VEC_SIZE == 32 || VEC_SIZE == 64
> -	/* Force 32-bit displacement to avoid long nop between
> -	   instructions.  */
> -	je.d32	L(return)
> -# else
>  	je	L(return)
> -# endif
> -	.p2align 4
>  L(loop):
>  	VMOVA	%VEC(0), (%rcx)
>  	VMOVA	%VEC(0), VEC_SIZE(%rcx)
> --
> 2.5.5
>

I am checking in this.
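For readers unfamiliar with the two constructs being removed: `.p2align 4` tells the assembler to pad with NOPs up to the next 16-byte boundary before a label, and the `.d32` branch/store forms forced the longer 32-bit-displacement encodings so that, per the removed comments, less NOP padding would be needed between instructions. A minimal sketch of the padding effect, assuming GNU as on x86-64; the labels here are invented for illustration and are not part of the patch:

```asm
	.text
	.globl	demo
demo:
	xorl	%eax, %eax	/* a short instruction before the boundary */
	ret
	/* .p2align 4 pads with multi-byte NOPs up to the next 16-byte
	   boundary -- anywhere from 0 to 15 bytes per aligned label.
	   This is the directive the patch removes before each label.  */
	.p2align 4
aligned_target:
	incl	%eax
	ret
```

Assembling this and disassembling with `objdump -d` shows the NOP padding inserted before `aligned_target`. Dropping the `.p2align` directives removes that padding, and without it the forced long forms are no longer needed either, so the assembler can pick the short 2-byte `ja rel8` encoding instead of the 6-byte `ja rel32` encoding (opcode 0F 87) that `ja.d32` produced.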
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 578a5ae..b1df228 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -18,12 +18,10 @@

 /* memset is implemented as:
    1. Use overlapping store to avoid branch.
-   2. Force 32-bit displacement for branches to avoid long nop between
-      instructions.
-   3. If size is less than VEC, use integer register stores.
-   4. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
-   5. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
-   6. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
+   2. If size is less than VEC, use integer register stores.
+   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
+   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
+   5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
       4 VEC stores and store 4 * VEC at a time until done.  */

 #include <sysdep.h>
@@ -143,14 +141,10 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
 	VZEROUPPER
 	ret

-	.p2align 4
 L(stosb_more_2x_vec):
 	cmpq	$REP_STOSB_THRESHOLD, %rdx
-	/* Force 32-bit displacement to avoid long nop between
-	   instructions.  */
-	ja.d32	L(stosb)
+	ja	L(stosb)
 #endif
-	.p2align 4
 L(more_2x_vec):
 	cmpq	$(VEC_SIZE * 4), %rdx
 	ja	L(loop_start)
@@ -162,26 +156,12 @@ L(return):
 	VZEROUPPER
 	ret

-	.p2align 4
 L(loop_start):
 	leaq	(VEC_SIZE * 4)(%rdi), %rcx
-# if VEC_SIZE == 32 || VEC_SIZE == 64
-	/* Force 32-bit displacement to avoid long nop between
-	   instructions.  */
-	VMOVU.d32 %VEC(0), (%rdi)
-# else
 	VMOVU	%VEC(0), (%rdi)
-# endif
 	andq	$-(VEC_SIZE * 4), %rcx
-# if VEC_SIZE == 32
-	/* Force 32-bit displacement to avoid long nop between
-	   instructions.  */
-	VMOVU.d32 %VEC(0), -VEC_SIZE(%rdi,%rdx)
-	VMOVU.d32 %VEC(0), VEC_SIZE(%rdi)
-# else
 	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 	VMOVU	%VEC(0), VEC_SIZE(%rdi)
-# endif
 	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
 	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
 	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
@@ -190,14 +170,7 @@ L(loop_start):
 	addq	%rdi, %rdx
 	andq	$-(VEC_SIZE * 4), %rdx
 	cmpq	%rdx, %rcx
-# if VEC_SIZE == 32 || VEC_SIZE == 64
-	/* Force 32-bit displacement to avoid long nop between
-	   instructions.  */
-	je.d32	L(return)
-# else
 	je	L(return)
-# endif
-	.p2align 4
 L(loop):
 	VMOVA	%VEC(0), (%rcx)
 	VMOVA	%VEC(0), VEC_SIZE(%rcx)
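For context on step 1 of the strategy comment above ("Use overlapping store to avoid branch"), which this patch leaves untouched: the stores from the front of the buffer and the stores addressed from its end overlap, so any length in a given range is covered without branching on the exact size. A minimal sketch of the idea, assuming VEC_SIZE == 32 (AVX2) with the destination in %rdi, the length in %rdx, and the splatted byte pattern in %ymm0, as in the real code:

```asm
	/* Fill n bytes at dst for 32 <= n <= 64 with no length branch.
	   dst = %rdi, n = %rdx, %ymm0 = the fill byte repeated 32 times.  */
	vmovdqu	%ymm0, (%rdi)		/* bytes [0, 32) of the buffer      */
	vmovdqu	%ymm0, -32(%rdi,%rdx)	/* bytes [n-32, n); overlaps the
					   first store whenever n < 64      */
```

The L(loop_start) path in the hunk above applies the same idea with several VEC stores from each end before falling into the aligned VMOVA loop.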